path: root/roms/skiboot/hw
author    Angelos Mouzakitis <a.mouzakitis@virtualopensystems.com>    2023-10-10 14:33:42 +0000
committer Angelos Mouzakitis <a.mouzakitis@virtualopensystems.com>    2023-10-10 14:33:42 +0000
commit    af1a266670d040d2f4083ff309d732d648afba2a (patch)
tree      2fc46203448ddcc6f81546d379abfaeb323575e9 /roms/skiboot/hw
parent    e02cda008591317b1625707ff8e115a4841aa889 (diff)
Add submodule dependency files (HEAD, master)
Change-Id: Iaf8d18082d3991dec7c0ebbea540f092188eb4ec
Diffstat (limited to 'roms/skiboot/hw')
-rw-r--r--  roms/skiboot/hw/Makefile.inc | 19
-rw-r--r--  roms/skiboot/hw/ast-bmc/Makefile.inc | 6
-rw-r--r--  roms/skiboot/hw/ast-bmc/ast-io.c | 498
-rw-r--r--  roms/skiboot/hw/ast-bmc/ast-sf-ctrl.c | 1020
-rw-r--r--  roms/skiboot/hw/bt.c | 720
-rw-r--r--  roms/skiboot/hw/cache-p9.c | 162
-rw-r--r--  roms/skiboot/hw/capp.c | 243
-rw-r--r--  roms/skiboot/hw/centaur.c | 555
-rw-r--r--  roms/skiboot/hw/chiptod.c | 2067
-rw-r--r--  roms/skiboot/hw/dio-p9.c | 132
-rw-r--r--  roms/skiboot/hw/dts.c | 416
-rw-r--r--  roms/skiboot/hw/fake-nvram.c | 49
-rw-r--r--  roms/skiboot/hw/fake-rtc.c | 100
-rw-r--r--  roms/skiboot/hw/fsi-master.c | 675
-rw-r--r--  roms/skiboot/hw/fsp/Makefile.inc | 13
-rw-r--r--  roms/skiboot/hw/fsp/fsp-attn.c | 143
-rw-r--r--  roms/skiboot/hw/fsp/fsp-chiptod.c | 69
-rw-r--r--  roms/skiboot/hw/fsp/fsp-codeupdate.c | 1315
-rw-r--r--  roms/skiboot/hw/fsp/fsp-codeupdate.h | 222
-rw-r--r--  roms/skiboot/hw/fsp/fsp-console.c | 1062
-rw-r--r--  roms/skiboot/hw/fsp/fsp-diag.c | 46
-rw-r--r--  roms/skiboot/hw/fsp/fsp-dpo.c | 154
-rw-r--r--  roms/skiboot/hw/fsp/fsp-dump.c | 916
-rw-r--r--  roms/skiboot/hw/fsp/fsp-elog-read.c | 608
-rw-r--r--  roms/skiboot/hw/fsp/fsp-elog-write.c | 441
-rw-r--r--  roms/skiboot/hw/fsp/fsp-epow.c | 192
-rw-r--r--  roms/skiboot/hw/fsp/fsp-epow.h | 21
-rw-r--r--  roms/skiboot/hw/fsp/fsp-ipmi.c | 400
-rw-r--r--  roms/skiboot/hw/fsp/fsp-leds.c | 1939
-rw-r--r--  roms/skiboot/hw/fsp/fsp-mem-err.c | 401
-rw-r--r--  roms/skiboot/hw/fsp/fsp-nvram.c | 424
-rw-r--r--  roms/skiboot/hw/fsp/fsp-occ.c | 417
-rw-r--r--  roms/skiboot/hw/fsp/fsp-op-panel.c | 266
-rw-r--r--  roms/skiboot/hw/fsp/fsp-psi.c | 75
-rw-r--r--  roms/skiboot/hw/fsp/fsp-rtc.c | 567
-rw-r--r--  roms/skiboot/hw/fsp/fsp-sensor.c | 860
-rw-r--r--  roms/skiboot/hw/fsp/fsp-surveillance.c | 226
-rw-r--r--  roms/skiboot/hw/fsp/fsp-sysdump.c | 407
-rw-r--r--  roms/skiboot/hw/fsp/fsp-sysparam.c | 508
-rw-r--r--  roms/skiboot/hw/fsp/fsp.c | 2709
-rw-r--r--  roms/skiboot/hw/homer.c | 252
-rw-r--r--  roms/skiboot/hw/imc.c | 1075
-rw-r--r--  roms/skiboot/hw/ipmi/Makefile.inc | 9
-rw-r--r--  roms/skiboot/hw/ipmi/ipmi-attn.c | 100
-rw-r--r--  roms/skiboot/hw/ipmi/ipmi-fru.c | 231
-rw-r--r--  roms/skiboot/hw/ipmi/ipmi-info.c | 206
-rw-r--r--  roms/skiboot/hw/ipmi/ipmi-power.c | 85
-rw-r--r--  roms/skiboot/hw/ipmi/ipmi-rtc.c | 127
-rw-r--r--  roms/skiboot/hw/ipmi/ipmi-sel.c | 701
-rw-r--r--  roms/skiboot/hw/ipmi/ipmi-sensor.c | 160
-rw-r--r--  roms/skiboot/hw/ipmi/ipmi-watchdog.c | 218
-rw-r--r--  roms/skiboot/hw/ipmi/test/Makefile.check | 34
-rw-r--r--  roms/skiboot/hw/ipmi/test/run-fru.c | 116
-rw-r--r--  roms/skiboot/hw/lpc-mbox.c | 346
-rw-r--r--  roms/skiboot/hw/lpc-port80h.c | 173
-rw-r--r--  roms/skiboot/hw/lpc-rtc.c | 235
-rw-r--r--  roms/skiboot/hw/lpc-uart.c | 738
-rw-r--r--  roms/skiboot/hw/lpc.c | 1407
-rw-r--r--  roms/skiboot/hw/npu-hw-procedures.c | 608
-rw-r--r--  roms/skiboot/hw/npu-opal.c | 176
-rw-r--r--  roms/skiboot/hw/npu.c | 1693
-rw-r--r--  roms/skiboot/hw/npu2-common.c | 681
-rw-r--r--  roms/skiboot/hw/npu2-hw-procedures.c | 1079
-rw-r--r--  roms/skiboot/hw/npu2-opencapi.c | 2370
-rw-r--r--  roms/skiboot/hw/npu2.c | 2323
-rw-r--r--  roms/skiboot/hw/npu3-hw-procedures.c | 792
-rw-r--r--  roms/skiboot/hw/npu3-nvlink.c | 1828
-rw-r--r--  roms/skiboot/hw/npu3.c | 549
-rw-r--r--  roms/skiboot/hw/nx-842.c | 231
-rw-r--r--  roms/skiboot/hw/nx-compress.c | 340
-rw-r--r--  roms/skiboot/hw/nx-crypto.c | 298
-rw-r--r--  roms/skiboot/hw/nx-gzip.c | 118
-rw-r--r--  roms/skiboot/hw/nx-rng.c | 121
-rw-r--r--  roms/skiboot/hw/nx.c | 138
-rw-r--r--  roms/skiboot/hw/occ-sensor.c | 640
-rw-r--r--  roms/skiboot/hw/occ.c | 2339
-rw-r--r--  roms/skiboot/hw/ocmb.c | 167
-rw-r--r--  roms/skiboot/hw/p8-i2c.c | 1688
-rw-r--r--  roms/skiboot/hw/phb3.c | 5052
-rw-r--r--  roms/skiboot/hw/phb4.c | 6400
-rw-r--r--  roms/skiboot/hw/phys-map.c | 445
-rw-r--r--  roms/skiboot/hw/prd.c | 789
-rw-r--r--  roms/skiboot/hw/psi.c | 1079
-rw-r--r--  roms/skiboot/hw/sbe-p8.c | 195
-rw-r--r--  roms/skiboot/hw/sbe-p9.c | 1040
-rw-r--r--  roms/skiboot/hw/sfc-ctrl.c | 510
-rw-r--r--  roms/skiboot/hw/slw.c | 1731
-rw-r--r--  roms/skiboot/hw/test/Makefile.check | 29
-rw-r--r--  roms/skiboot/hw/test/phys-map-test.c | 203
-rw-r--r--  roms/skiboot/hw/test/run-port80h.c | 99
-rw-r--r--  roms/skiboot/hw/vas.c | 639
-rw-r--r--  roms/skiboot/hw/xive.c | 5234
-rw-r--r--  roms/skiboot/hw/xive2.c | 4666
-rw-r--r--  roms/skiboot/hw/xscom.c | 1019
94 files changed, 74585 insertions, 0 deletions
diff --git a/roms/skiboot/hw/Makefile.inc b/roms/skiboot/hw/Makefile.inc
new file mode 100644
index 000000000..37256d3cc
--- /dev/null
+++ b/roms/skiboot/hw/Makefile.inc
@@ -0,0 +1,19 @@
+# -*-Makefile-*-
+SUBDIRS += hw
+HW_OBJS = xscom.o chiptod.o lpc.o lpc-uart.o psi.o
+HW_OBJS += homer.o slw.o occ.o fsi-master.o centaur.o imc.o
+HW_OBJS += nx.o nx-rng.o nx-crypto.o nx-compress.o nx-842.o nx-gzip.o
+HW_OBJS += phb3.o sfc-ctrl.o fake-rtc.o bt.o p8-i2c.o prd.o
+HW_OBJS += dts.o lpc-rtc.o npu.o npu-hw-procedures.o xive.o phb4.o
+HW_OBJS += fake-nvram.o lpc-mbox.o npu2.o npu2-hw-procedures.o
+HW_OBJS += npu2-common.o npu2-opencapi.o phys-map.o sbe-p9.o capp.o
+HW_OBJS += occ-sensor.o vas.o sbe-p8.o dio-p9.o lpc-port80h.o cache-p9.o
+HW_OBJS += npu-opal.o npu3.o npu3-nvlink.o npu3-hw-procedures.o
+HW_OBJS += ocmb.o xive2.o
+HW=hw/built-in.a
+
+include $(SRC)/hw/fsp/Makefile.inc
+include $(SRC)/hw/ast-bmc/Makefile.inc
+include $(SRC)/hw/ipmi/Makefile.inc
+
+$(HW): $(HW_OBJS:%=hw/%) $(FSP) $(EC) $(AST_BMC) $(IPMI)
diff --git a/roms/skiboot/hw/ast-bmc/Makefile.inc b/roms/skiboot/hw/ast-bmc/Makefile.inc
new file mode 100644
index 000000000..e7ded0e88
--- /dev/null
+++ b/roms/skiboot/hw/ast-bmc/Makefile.inc
@@ -0,0 +1,6 @@
+# SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+SUBDIRS += hw/ast-bmc
+
+AST_BMC_OBJS = ast-io.o ast-sf-ctrl.o
+AST_BMC = hw/ast-bmc/built-in.a
+$(AST_BMC): $(AST_BMC_OBJS:%=hw/ast-bmc/%)
diff --git a/roms/skiboot/hw/ast-bmc/ast-io.c b/roms/skiboot/hw/ast-bmc/ast-io.c
new file mode 100644
index 000000000..f0f8c4c4d
--- /dev/null
+++ b/roms/skiboot/hw/ast-bmc/ast-io.c
@@ -0,0 +1,498 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Note about accesses to the AST2400 internal memory map:
+ *
+ * There are two ways to generate accesses to the AHB bus of the AST2400
+ * from the host. The LPC->AHB bridge and the iLPC->AHB bridge.
+ *
+ * LPC->AHB bridge
+ * ---------------
+ *
+ * This bridge directly converts memory or firmware accesses using
+ * a set of registers for establishing a remapping window. We prefer
+ * using FW space as normal memory space is limited to byte accesses
+ * to a fixed 256M window, while FW space allows us to use different
+ * access sizes and to control the IDSEL bits which essentially enable
+ * a full 4G address space.
+ *
+ * The way FW accesses map onto AHB is controlled via two registers
+ * in the BMC's LPC host controller:
+ *
+ * HICR7 at 0x1e789088 [31:16] : ADRBASE
+ * [15:00] : HWMBASE
+ *
+ * HICR8 at 0x1e78908c [31:16] : ADRMASK
+ * [15:00] : HWNCARE
+ *
+ * All decoding/remapping happens on the top 16 bits of the LPC address
+ * named LPC_ADDR, as follows:
+ *
+ * - For decoding, LPC_ADDR bits are compared with HWMBASE if the
+ * corresponding bit in HWNCARE is 0.
+ *
+ * - For remapping, the AHB address is constructed by taking bits
+ * from LPC_ADDR if the corresponding bit in ADRMASK is 0 or in
+ * ADRBASE if the corresponding bit in ADRMASK is 1
+ *
+ * Example of 2MB SPI flash, LPC 0xFCE00000~0xFCFFFFFF onto
+ * AHB 0x30000000~0x301FFFFF (SPI flash)
+ *
+ * ADRBASE=0x3000 HWMBASE=0xFCE0
+ * ADRMASK=0xFFE0 HWNCARE=0x001F
+ *
+ * This comes pre-configured by the BMC or Hostboot to access the PNOR
+ * flash from IDSEL 0 as follows:
+ *
+ * ADRBASE=0x3000 HWMBASE=0x0e00 for 32MB
+ * ADRMASK=0xfe00 HWNCARE=0x01ff
+ *
+ * Which means mapping of LPC 0x0e000000..0x0fffffff onto
+ * AHB 0x30000000..0x31ffffff
+ *
+ * iLPC->AHB bridge
+ * ---------------
+ *
+ * This bridge is hosted in the SuperIO part of the BMC and is
+ * controlled by a series of byte-sized registers accessed indirectly
+ * via IO ports 0x2e and 0x2f.
+ *
+ * Via these, byte by byte, we can construct an AHB address and
+ * fill a data buffer to trigger a write cycle, or we can do a
+ * read cycle and read back the data, byte after byte.
+ *
+ * This is fairly convoluted and slow but works regardless of what
+ * mapping was established in the LPC->AHB bridge.
+ *
+ * For the time being, we use the iLPC->AHB for everything except
+ * pnor accesses. In the long run, we will reconfigure the LPC->AHB
+ * to provide more direct access to all of the BMC address space but
+ * we'll only do that after the boot script/program on the BMC is
+ * updated to restore the bridge to a state compatible with the SBE
+ * expectations on boot.
+ *
+ * Copyright 2013-2019 IBM Corp.
+ */
+
+#include <skiboot.h>
+#include <lpc.h>
+#include <lock.h>
+#include <device.h>
+
+#include "ast.h"
+
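+/*
+ * Worked example of the HICR7/HICR8 decode/remap rules described in the
+ * comment at the top of this file. The helper below is purely an
+ * illustration: its name and parameters are made up for the example and
+ * nothing in the driver calls it.
+ */
+static inline bool ast_fw_remap_example(uint32_t lpc_addr,
+ uint16_t adrbase, uint16_t hwmbase,
+ uint16_t adrmask, uint16_t hwncare,
+ uint32_t *ahb_addr)
+{
+ uint16_t lpc_top = lpc_addr >> 16;
+
+ /* Decode: compare the LPC top bits with HWMBASE wherever HWNCARE is 0 */
+ if ((lpc_top & ~hwncare) != (hwmbase & ~hwncare))
+ return false;
+
+ /* Remap: take bits from ADRBASE where ADRMASK is 1, else from LPC_ADDR */
+ *ahb_addr = ((uint32_t)((adrbase & adrmask) | (lpc_top & ~adrmask)) << 16) |
+ (lpc_addr & 0xffff);
+ return true;
+}
+
+/*
+ * With the pre-configured PNOR window mentioned above (ADRBASE=0x3000,
+ * HWMBASE=0x0e00, ADRMASK=0xfe00, HWNCARE=0x01ff), an LPC FW access at
+ * 0x0e012345 decodes and remaps to AHB 0x30012345.
+ */
+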
+#define BMC_SIO_SCR28 0x28
+#define BOOT_FLAGS_VERSION 0x42
+
+/*
+ * SIO Register 0x29: Boot Flags (normal bit ordering)
+ *
+ * [7:6] Hostboot Boot mode:
+ * 00 : Normal
+ * 01 : Terminate on first error
+ * 10 : istep mode
+ * 11 : reserved
+ * [5:4] Boot options
+ * 00 : reserved
+ * 01 : Memboot
+ * 10 : Clear gard
+ * 11 : reserved
+ * [ 3 ] BMC mbox PNOR driver
+ * [2:0] Hostboot Log level:
+ * 000 : Normal
+ * 001 : Enable Scan trace
+ * xxx : reserved
+ */
+
+#define BMC_SIO_SCR29 0x29
+#define BMC_SIO_SCR29_MBOX 0x08
+#define BMC_SIO_SCR29_MEMBOOT 0x10
+
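+/*
+ * Example decode using the layout above: a SCR29 value of 0x10 means
+ * normal boot mode ([7:6] = 00), the Memboot option ([5:4] = 01), no
+ * BMC mbox PNOR driver (bit 3 clear) and normal log level ([2:0] =
+ * 000), which is exactly what BMC_SIO_SCR29_MEMBOOT tests for.
+ */
+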
+/*
+ * SIO Register 0x2d: Platform Flags (normal bit ordering)
+ *
+ * [ 7 ] Hostboot configures SUART
+ * [ 6 ] Hostboot configures VUART
+ * [5:1] Reserved
+ * [ 0 ] Isolate Service Processor
+ */
+#define BMC_SIO_PLAT_FLAGS 0x2d
+#define BMC_SIO_PLAT_ISOLATE_SP 0x01
+
+enum {
+ BMC_SIO_DEV_NONE = -1,
+ BMC_SIO_DEV_UART1 = 2,
+ BMC_SIO_DEV_UART2 = 3,
+ BMC_SIO_DEV_SWC = 4,
+ BMC_SIO_DEV_KBC = 5,
+ BMC_SIO_DEV_P80 = 7,
+ BMC_SIO_DEV_UART3 = 0xb,
+ BMC_SIO_DEV_UART4 = 0xc,
+ BMC_SIO_DEV_LPC2AHB = 0xd,
+ BMC_SIO_DEV_MBOX = 0xe,
+};
+
+static struct lock bmc_sio_lock = LOCK_UNLOCKED;
+static int bmc_sio_cur_dev = BMC_SIO_DEV_NONE;
+
+/*
+ * SuperIO indirect accesses
+ */
+static void bmc_sio_outb(uint8_t val, uint8_t reg)
+{
+ lpc_outb(reg, 0x2e);
+ lpc_outb(val, 0x2f);
+}
+
+static uint8_t bmc_sio_inb(uint8_t reg)
+{
+ lpc_outb(reg, 0x2e);
+ return lpc_inb(0x2f);
+}
+
+static void bmc_sio_get(int dev)
+{
+ lock(&bmc_sio_lock);
+
+ if (bmc_sio_cur_dev == dev || dev < 0)
+ return;
+
+ if (bmc_sio_cur_dev == BMC_SIO_DEV_NONE) {
+ /* Send SuperIO password */
+ lpc_outb(0xa5, 0x2e);
+ lpc_outb(0xa5, 0x2e);
+ }
+
+ /* Select logical dev */
+ bmc_sio_outb(dev, 0x07);
+
+ bmc_sio_cur_dev = dev;
+}
+
+static void bmc_sio_put(bool lock_sio)
+{
+ if (lock_sio) {
+ /* Re-lock SuperIO */
+ lpc_outb(0xaa, 0x2e);
+
+ bmc_sio_cur_dev = BMC_SIO_DEV_NONE;
+ }
+ unlock(&bmc_sio_lock);
+}
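+
+/*
+ * Typical use of the accessors above (illustrative only, this exact
+ * sequence does not appear in the driver): select a logical device,
+ * access its registers through the 0x2e/0x2f index/data pair, then
+ * release, optionally re-locking the SuperIO:
+ *
+ * bmc_sio_get(BMC_SIO_DEV_LPC2AHB);
+ * enabled = bmc_sio_inb(0x30); // 0x30 is the device enable register
+ * bmc_sio_put(false);
+ */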
+
+/*
+ * AHB accesses via iLPC->AHB in SuperIO. Works on byteswapped
+ * values (ie. Little Endian registers)
+ */
+static void bmc_sio_ahb_prep(uint32_t reg, uint8_t type)
+{
+ /* Enable iLPC->AHB */
+ bmc_sio_outb(0x01, 0x30);
+
+ /* Address */
+ bmc_sio_outb((reg >> 24) & 0xff, 0xf0);
+ bmc_sio_outb((reg >> 16) & 0xff, 0xf1);
+ bmc_sio_outb((reg >> 8) & 0xff, 0xf2);
+ bmc_sio_outb((reg ) & 0xff, 0xf3);
+
+ /* bytes cycle type */
+ bmc_sio_outb(type, 0xf8);
+}
+
+static void bmc_sio_ahb_writel(uint32_t val, uint32_t reg)
+{
+ bmc_sio_get(BMC_SIO_DEV_LPC2AHB);
+
+ bmc_sio_ahb_prep(reg, 2);
+
+ /* Write data */
+ bmc_sio_outb(val >> 24, 0xf4);
+ bmc_sio_outb(val >> 16, 0xf5);
+ bmc_sio_outb(val >> 8, 0xf6);
+ bmc_sio_outb(val , 0xf7);
+
+ /* Trigger */
+ bmc_sio_outb(0xcf, 0xfe);
+
+ bmc_sio_put(false);
+}
+
+static uint32_t bmc_sio_ahb_readl(uint32_t reg)
+{
+ uint32_t val = 0;
+
+ bmc_sio_get(BMC_SIO_DEV_LPC2AHB);
+
+ bmc_sio_ahb_prep(reg, 2);
+
+ /* Trigger */
+ bmc_sio_inb(0xfe);
+
+ /* Read results */
+ val = (val << 8) | bmc_sio_inb(0xf4);
+ val = (val << 8) | bmc_sio_inb(0xf5);
+ val = (val << 8) | bmc_sio_inb(0xf6);
+ val = (val << 8) | bmc_sio_inb(0xf7);
+
+ bmc_sio_put(false);
+
+ return val;
+}
+
+/*
+ * External API
+ *
+ * We only support 4-byte accesses to all of AHB. We additionally
+ * support 1-byte accesses to the flash area only.
+ *
+ * We could support all access sizes via iLPC but we don't need
+ * that for now.
+ */
+
+void ast_ahb_writel(uint32_t val, uint32_t reg)
+{
+ /* For now, always use iLPC->AHB, it will byteswap */
+ bmc_sio_ahb_writel(val, reg);
+}
+
+uint32_t ast_ahb_readl(uint32_t reg)
+{
+ /* For now, always use iLPC->AHB, it will byteswap */
+ return bmc_sio_ahb_readl(reg);
+}
+
+static void ast_setup_sio_irq_polarity(void)
+{
+ /* Select logical dev 2 */
+ bmc_sio_get(BMC_SIO_DEV_UART1);
+ bmc_sio_outb(0x01, 0x71); /* level low */
+ bmc_sio_put(false);
+
+ /* Select logical dev 3 */
+ bmc_sio_get(BMC_SIO_DEV_UART2);
+ bmc_sio_outb(0x01, 0x71); /* irq level low */
+ bmc_sio_put(false);
+
+ /* Select logical dev 4 */
+ bmc_sio_get(BMC_SIO_DEV_SWC);
+ bmc_sio_outb(0x01, 0x71); /* irq level low */
+ bmc_sio_put(false);
+
+ /* Select logical dev 5 */
+ bmc_sio_get(BMC_SIO_DEV_KBC);
+ bmc_sio_outb(0x01, 0x71); /* irq level low */
+ bmc_sio_outb(0x01, 0x73); /* irq level low */
+ bmc_sio_put(false);
+
+ /* Select logical dev 7 */
+ bmc_sio_get(BMC_SIO_DEV_P80);
+ bmc_sio_outb(0x01, 0x71); /* irq level low */
+ bmc_sio_put(false);
+
+ /* Select logical dev b */
+ bmc_sio_get(BMC_SIO_DEV_UART3);
+ bmc_sio_outb(0x01, 0x71); /* irq level low */
+ bmc_sio_put(false);
+
+ /* Select logical dev c */
+ bmc_sio_get(BMC_SIO_DEV_UART4);
+ bmc_sio_outb(0x01, 0x71); /* irq level low */
+ bmc_sio_put(false);
+
+ /* Select logical dev d */
+ bmc_sio_get(BMC_SIO_DEV_LPC2AHB);
+ bmc_sio_outb(0x01, 0x71); /* irq level low */
+ bmc_sio_put(false);
+
+ /* Select logical dev e */
+ bmc_sio_get(BMC_SIO_DEV_MBOX);
+ bmc_sio_outb(0x01, 0x71); /* irq level low */
+ bmc_sio_put(true);
+}
+
+bool ast_sio_is_enabled(void)
+{
+ bool enabled;
+ int64_t rc;
+
+ lock(&bmc_sio_lock);
+ /*
+ * Probe by attempting to lock the SIO device, this way the
+ * post-condition is that the SIO device is locked or not able to be
+ * unlocked. This turns out neater than trying to use the unlock code.
+ */
+ rc = lpc_probe_write(OPAL_LPC_IO, 0x2e, 0xaa, 1);
+ if (rc) {
+ enabled = false;
+ /* If we can't lock it, then we can't unlock it either */
+ goto out;
+ }
+
+ /*
+ * Now that we know it is locked and can be unlocked, unlock it
+ * if skiboot's recorded device state indicates it was previously
+ * unlocked.
+ */
+ if (bmc_sio_cur_dev != BMC_SIO_DEV_NONE) {
+ /* Send SuperIO password */
+ lpc_outb(0xa5, 0x2e);
+ lpc_outb(0xa5, 0x2e);
+
+ /* Ensure the previously selected logical dev is selected */
+ bmc_sio_outb(bmc_sio_cur_dev, 0x07);
+ }
+
+ enabled = true;
+out:
+ unlock(&bmc_sio_lock);
+
+ return enabled;
+}
+
+bool ast_sio_init(void)
+{
+ bool enabled = ast_sio_is_enabled();
+
+ /* Configure all AIO interrupts to level low */
+ if (enabled)
+ ast_setup_sio_irq_polarity();
+
+ return enabled;
+}
+
+bool ast_io_is_rw(void)
+{
+ return !(ast_ahb_readl(LPC_HICRB) & LPC_HICRB_ILPC_DISABLE);
+}
+
+bool ast_io_init(void)
+{
+ return ast_io_is_rw();
+}
+
+bool ast_lpc_fw_ipmi_hiomap(void)
+{
+ return platform.bmc->sw->ipmi_oem_hiomap_cmd != 0;
+}
+
+bool ast_lpc_fw_mbox_hiomap(void)
+{
+ struct dt_node *n;
+
+ n = dt_find_compatible_node(dt_root, NULL, "mbox");
+
+ return n != NULL;
+}
+
+bool ast_lpc_fw_maps_flash(void)
+{
+ uint8_t boot_version;
+ uint8_t boot_flags;
+
+ boot_version = bmc_sio_inb(BMC_SIO_SCR28);
+ if (boot_version != BOOT_FLAGS_VERSION)
+ return true;
+
+ boot_flags = bmc_sio_inb(BMC_SIO_SCR29);
+ return !(boot_flags & BMC_SIO_SCR29_MEMBOOT);
+}
+
+bool ast_scratch_reg_is_mbox(void)
+{
+ uint8_t boot_version;
+ uint8_t boot_flags;
+
+ boot_version = bmc_sio_inb(BMC_SIO_SCR28);
+ if (boot_version != BOOT_FLAGS_VERSION)
+ return false;
+
+ boot_flags = bmc_sio_inb(BMC_SIO_SCR29);
+ return boot_flags & BMC_SIO_SCR29_MBOX;
+}
+
+void ast_setup_ibt(uint16_t io_base, uint8_t irq)
+{
+ uint32_t v;
+
+ v = bmc_sio_ahb_readl(LPC_iBTCR0);
+ v = v & ~(0xfffffc00u);
+ v = v | (((uint32_t)io_base) << 16);
+ v = v | (((uint32_t)irq) << 12);
+ bmc_sio_ahb_writel(v, LPC_iBTCR0);
+}
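+
+/*
+ * For example, io_base 0xe4 and irq 10 yield field bits 0x00e4a000:
+ * the I/O address goes into iBTCR0[31:16] and the IRQ number into
+ * [15:12], while the low ten bits (not covered by the 0xfffffc00
+ * mask) keep their previous value.
+ */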
+
+bool ast_is_vuart1_enabled(void)
+{
+ uint32_t v;
+
+ v = bmc_sio_ahb_readl(VUART1_GCTRLA);
+ return !!(v & 1);
+}
+
+void ast_setup_vuart1(uint16_t io_base, uint8_t irq)
+{
+ uint32_t v;
+
+ /* IRQ level low */
+ v = bmc_sio_ahb_readl(VUART1_GCTRLA);
+ v = v & ~2u;
+ bmc_sio_ahb_writel(v, VUART1_GCTRLA);
+ v = bmc_sio_ahb_readl(VUART1_GCTRLA);
+
+ /* IRQ number */
+ v = bmc_sio_ahb_readl(VUART1_GCTRLB);
+ v = (v & ~0xf0u) | (irq << 4);
+ bmc_sio_ahb_writel(v, VUART1_GCTRLB);
+
+ /* Address */
+ bmc_sio_ahb_writel(io_base & 0xff, VUART1_ADDRL);
+ bmc_sio_ahb_writel(io_base >> 8, VUART1_ADDRH);
+}
+
+/* Setup SuperIO UART 1 */
+void ast_setup_sio_uart1(uint16_t io_base, uint8_t irq)
+{
+ bmc_sio_get(BMC_SIO_DEV_UART1);
+
+ /* Disable UART1 for configuration */
+ bmc_sio_outb(0x00, 0x30);
+
+ /* Configure base and interrupt */
+ bmc_sio_outb(io_base >> 8, 0x60);
+ bmc_sio_outb(io_base & 0xff, 0x61);
+ bmc_sio_outb(irq, 0x70);
+ bmc_sio_outb(0x01, 0x71); /* level low */
+
+ /* Enable UART1 */
+ bmc_sio_outb(0x01, 0x30);
+
+ bmc_sio_put(true);
+}
+
+void ast_disable_sio_uart1(void)
+{
+ bmc_sio_get(BMC_SIO_DEV_UART1);
+
+ /* Disable UART1 */
+ bmc_sio_outb(0x00, 0x30);
+
+ bmc_sio_put(true);
+}
+
+void ast_setup_sio_mbox(uint16_t io_base, uint8_t irq)
+{
+ bmc_sio_get(BMC_SIO_DEV_MBOX);
+
+ /* Disable for configuration */
+ bmc_sio_outb(0x00, 0x30);
+
+ bmc_sio_outb(io_base >> 8, 0x60);
+ bmc_sio_outb(io_base & 0xff, 0x61);
+ bmc_sio_outb(irq, 0x70);
+ bmc_sio_outb(0x01, 0x71); /* level low */
+
+ /* Enable MailBox */
+ bmc_sio_outb(0x01, 0x30);
+
+ bmc_sio_put(true);
+}
+
diff --git a/roms/skiboot/hw/ast-bmc/ast-sf-ctrl.c b/roms/skiboot/hw/ast-bmc/ast-sf-ctrl.c
new file mode 100644
index 000000000..03cc44318
--- /dev/null
+++ b/roms/skiboot/hw/ast-bmc/ast-sf-ctrl.c
@@ -0,0 +1,1020 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/* Copyright 2013-2018 IBM Corp. */
+
+#include <stdint.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <stdio.h>
+#include <string.h>
+
+#include <libflash/libflash.h>
+#include <libflash/libflash-priv.h>
+#ifdef __SKIBOOT__
+#include "lpc.h"
+#endif
+
+#include "ast.h"
+
+#ifndef __unused
+#define __unused __attribute__((unused))
+#endif
+
+#define CALIBRATE_BUF_SIZE 16384
+
+struct ast_sf_ctrl {
+ /* We have 2 controllers, one for the BMC flash, one for the PNOR */
+ uint8_t type;
+
+ /* Address and previous value of the ctrl register */
+ uint32_t ctl_reg;
+
+ /* Control register value for normal commands */
+ uint32_t ctl_val;
+
+ /* Control register value for (fast) reads */
+ uint32_t ctl_read_val;
+
+ /* Flash read timing register */
+ uint32_t fread_timing_reg;
+ uint32_t fread_timing_val;
+
+ /* Address of the flash mapping */
+ uint32_t flash;
+
+ /* Current 4b mode */
+ bool mode_4b;
+
+ /* Callbacks */
+ struct spi_flash_ctrl ops;
+};
+
+static uint32_t ast_ahb_freq;
+
+static const uint32_t ast_ct_hclk_divs[] = {
+ 0xf, /* HCLK */
+ 0x7, /* HCLK/2 */
+ 0xe, /* HCLK/3 */
+ 0x6, /* HCLK/4 */
+ 0xd, /* HCLK/5 */
+};
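+
+/*
+ * Rough layout of the SPI flash control register as assembled
+ * throughout this file, reconstructed from the driver's own inline
+ * comments (not transcribed from a datasheet):
+ *
+ *  [31:28] IO mode (0 = single bit, 2 = dual data, 3 = dual IO)
+ *  [27:24] CE# inactive width (apparently encoded as (16 - value) T)
+ *  [23:16] command byte used for fast reads
+ *  [   13] 4-byte address mode, toggled by ast_sf_set_4b()
+ *  [11: 8] HCLK divider select, see ast_ct_hclk_divs[] above
+ *  [ 7: 6] number of dummy-cycle bytes
+ *  [ 1: 0] cycle type: 0 = normal read, 1 = fast read, 3 = user mode
+ *          (in user mode, bit 2 appears to deassert CE#, see
+ *          ast_sf_start_cmd() / ast_sf_end_cmd())
+ */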
+
+#ifdef __SKIBOOT__
+#define PNOR_AHB_ADDR 0x30000000
+static uint32_t pnor_lpc_offset;
+
+static int ast_copy_to_ahb(uint32_t reg, const void *src, uint32_t len)
+{
+ /* Check we don't cross IDSEL segments */
+ if ((reg ^ (reg + len - 1)) >> 28)
+ return -EINVAL;
+
+ /* SPI flash, use LPC->AHB bridge */
+ if ((reg >> 28) == (PNOR_AHB_ADDR >> 28)) {
+ uint32_t chunk, off = reg - PNOR_AHB_ADDR + pnor_lpc_offset;
+ int64_t rc;
+
+ while(len) {
+ /* Choose access size */
+ if (len > 3 && !(off & 3)) {
+ rc = lpc_write(OPAL_LPC_FW, off,
+ *(uint32_t *)src, 4);
+ chunk = 4;
+ } else {
+ rc = lpc_write(OPAL_LPC_FW, off,
+ *(uint8_t *)src, 1);
+ chunk = 1;
+ }
+ if (rc) {
+ prerror("AST_IO: lpc_write.sb failure %lld"
+ " to FW 0x%08x\n", rc, off);
+ return rc;
+ }
+ len -= chunk;
+ off += chunk;
+ src += chunk;
+ }
+ return 0;
+ }
+
+ /* Otherwise we don't do byte access (... yet) */
+ prerror("AST_IO: Attempted write bytes access to %08x\n", reg);
+ return -EINVAL;
+}
+
+static int ast_copy_from_ahb(void *dst, uint32_t reg, uint32_t len)
+{
+ /* Check we don't cross IDSEL segments */
+ if ((reg ^ (reg + len - 1)) >> 28)
+ return -EINVAL;
+
+ /* SPI flash, use LPC->AHB bridge */
+ if ((reg >> 28) == (PNOR_AHB_ADDR >> 28)) {
+ uint32_t chunk, off = reg - PNOR_AHB_ADDR + pnor_lpc_offset;
+ int64_t rc;
+
+ while(len) {
+ uint32_t dat;
+
+ /* Choose access size */
+ if (len > 3 && !(off & 3)) {
+ rc = lpc_read(OPAL_LPC_FW, off, &dat, 4);
+ if (!rc)
+ *(uint32_t *)dst = dat;
+ chunk = 4;
+ } else {
+ rc = lpc_read(OPAL_LPC_FW, off, &dat, 1);
+ if (!rc)
+ *(uint8_t *)dst = dat;
+ chunk = 1;
+ }
+ if (rc) {
+ prerror("AST_IO: lpc_read.sb failure %lld"
+ " to FW 0x%08x\n", rc, off);
+ return rc;
+ }
+ len -= chunk;
+ off += chunk;
+ dst += chunk;
+ }
+ return 0;
+ }
+ /* Otherwise we don't do byte access (... yet) */
+ prerror("AST_IO: Attempted read bytes access to %08x\n", reg);
+ return -EINVAL;
+}
+#endif /* __SKIBOOT__ */
+
+static int ast_sf_start_cmd(struct ast_sf_ctrl *ct, uint8_t cmd)
+{
+ /* Switch to user mode, CE# dropped */
+ ast_ahb_writel(ct->ctl_val | 7, ct->ctl_reg);
+
+ /* user mode, CE# active */
+ ast_ahb_writel(ct->ctl_val | 3, ct->ctl_reg);
+
+ /* write cmd */
+ return ast_copy_to_ahb(ct->flash, &cmd, 1);
+}
+
+static void ast_sf_end_cmd(struct ast_sf_ctrl *ct)
+{
+ /* clear CE# */
+ ast_ahb_writel(ct->ctl_val | 7, ct->ctl_reg);
+
+ /* Switch back to read mode */
+ ast_ahb_writel(ct->ctl_read_val, ct->ctl_reg);
+}
+
+static int ast_sf_send_addr(struct ast_sf_ctrl *ct, uint32_t addr)
+{
+ const void *ap;
+ beint32_t tmp;
+
+ /* Layout address MSB first in memory */
+ tmp = cpu_to_be32(addr);
+
+ /* Send the right amount of bytes */
+ ap = (char *)&tmp;
+
+ if (ct->mode_4b)
+ return ast_copy_to_ahb(ct->flash, ap, 4);
+ else
+ return ast_copy_to_ahb(ct->flash, ap + 1, 3);
+}
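+
+/*
+ * For example, for address 0x00123456 the bytes 0x12 0x34 0x56 are
+ * sent (MSB first) in 3-byte mode, and 0x00 0x12 0x34 0x56 in 4-byte
+ * mode.
+ */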
+
+static int ast_sf_cmd_rd(struct spi_flash_ctrl *ctrl, uint8_t cmd,
+ bool has_addr, uint32_t addr, void *buffer,
+ uint32_t size)
+{
+ struct ast_sf_ctrl *ct = container_of(ctrl, struct ast_sf_ctrl, ops);
+ int rc;
+
+ rc = ast_sf_start_cmd(ct, cmd);
+ if (rc)
+ goto bail;
+ if (has_addr) {
+ rc = ast_sf_send_addr(ct, addr);
+ if (rc)
+ goto bail;
+ }
+ if (buffer && size)
+ rc = ast_copy_from_ahb(buffer, ct->flash, size);
+ bail:
+ ast_sf_end_cmd(ct);
+ return rc;
+}
+
+static int ast_sf_cmd_wr(struct spi_flash_ctrl *ctrl, uint8_t cmd,
+ bool has_addr, uint32_t addr, const void *buffer,
+ uint32_t size)
+{
+ struct ast_sf_ctrl *ct = container_of(ctrl, struct ast_sf_ctrl, ops);
+ int rc;
+
+ rc = ast_sf_start_cmd(ct, cmd);
+ if (rc)
+ goto bail;
+ if (has_addr) {
+ rc = ast_sf_send_addr(ct, addr);
+ if (rc)
+ goto bail;
+ }
+ if (buffer && size)
+ rc = ast_copy_to_ahb(ct->flash, buffer, size);
+ bail:
+ ast_sf_end_cmd(ct);
+ return rc;
+}
+
+static int ast_sf_set_4b(struct spi_flash_ctrl *ctrl, bool enable)
+{
+ struct ast_sf_ctrl *ct = container_of(ctrl, struct ast_sf_ctrl, ops);
+ uint32_t ce_ctrl = 0;
+
+ if (ct->type == AST_SF_TYPE_BMC && ct->ops.finfo->size > 0x1000000)
+ ce_ctrl = ast_ahb_readl(BMC_SPI_FCTL_CE_CTRL);
+ else if (ct->type != AST_SF_TYPE_PNOR)
+ return enable ? FLASH_ERR_4B_NOT_SUPPORTED : 0;
+
+ /*
+ * We update the "old" value as well since when quitting
+ * we don't restore the mode of the flash itself so we need
+ * to leave the controller in a compatible setup
+ */
+ if (enable) {
+ ct->ctl_val |= 0x2000;
+ ct->ctl_read_val |= 0x2000;
+ ce_ctrl |= 0x1;
+ } else {
+ ct->ctl_val &= ~0x2000;
+ ct->ctl_read_val &= ~0x2000;
+ ce_ctrl &= ~0x1;
+ }
+ ct->mode_4b = enable;
+
+ /* Update read mode */
+ ast_ahb_writel(ct->ctl_read_val, ct->ctl_reg);
+
+ if (ce_ctrl && ct->type == AST_SF_TYPE_BMC)
+ ast_ahb_writel(ce_ctrl, BMC_SPI_FCTL_CE_CTRL);
+
+ return 0;
+}
+
+static int ast_sf_read(struct spi_flash_ctrl *ctrl, uint32_t pos,
+ void *buf, uint32_t len)
+{
+ struct ast_sf_ctrl *ct = container_of(ctrl, struct ast_sf_ctrl, ops);
+
+ /*
+ * We are in read mode by default. We don't yet support fancy
+ * things like fast read or X2 mode
+ */
+ return ast_copy_from_ahb(buf, ct->flash + pos, len);
+}
+
+static void ast_get_ahb_freq(void)
+{
+ static const uint32_t cpu_freqs_24_48[] = {
+ 384000000,
+ 360000000,
+ 336000000,
+ 408000000
+ };
+ static const uint32_t cpu_freqs_25[] = {
+ 400000000,
+ 375000000,
+ 350000000,
+ 425000000
+ };
+ static const uint32_t ahb_div[] = { 1, 2, 4, 3 };
+ uint32_t strap, cpu_clk, div;
+
+ if (ast_ahb_freq)
+ return;
+
+ /* HW strapping gives us the CPU freq and AHB divisor */
+ strap = ast_ahb_readl(SCU_HW_STRAPPING);
+ if (strap & 0x00800000) {
+ FL_DBG("AST: CLKIN 25Mhz\n");
+ cpu_clk = cpu_freqs_25[(strap >> 8) & 3];
+ } else {
+ FL_DBG("AST: CLKIN 24/48Mhz\n");
+ cpu_clk = cpu_freqs_24_48[(strap >> 8) & 3];
+ }
+ FL_DBG("AST: CPU frequency: %d Mhz\n", cpu_clk / 1000000);
+ div = ahb_div[(strap >> 10) & 3];
+ ast_ahb_freq = cpu_clk / div;
+ FL_DBG("AST: AHB frequency: %d Mhz\n", ast_ahb_freq / 1000000);
+}
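+
+/*
+ * Example: a strap indicating a 24/48Mhz CLKIN with CPU frequency
+ * field 0 (384Mhz) and AHB divisor field 1 (divide by 2) gives
+ * ast_ahb_freq = 192Mhz; the HCLK/n dividers used below are applied
+ * to that value.
+ */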
+
+static int ast_sf_check_reads(struct ast_sf_ctrl *ct,
+ const uint8_t *golden_buf, uint8_t *test_buf)
+{
+ int i, rc;
+
+ for (i = 0; i < 10; i++) {
+ rc = ast_copy_from_ahb(test_buf, ct->flash, CALIBRATE_BUF_SIZE);
+ if (rc)
+ return rc;
+ if (memcmp(test_buf, golden_buf, CALIBRATE_BUF_SIZE) != 0)
+ return FLASH_ERR_VERIFY_FAILURE;
+ }
+ return 0;
+}
+
+static int ast_sf_calibrate_reads(struct ast_sf_ctrl *ct, uint32_t hdiv,
+ const uint8_t *golden_buf, uint8_t *test_buf)
+{
+ int i, rc;
+ int good_pass = -1, pass_count = 0;
+ uint32_t shift = (hdiv - 1) << 2;
+ uint32_t mask = ~(0xfu << shift);
+
+#define FREAD_TPASS(i) (((i) / 2) | (((i) & 1) ? 0 : 8))
+
+ /* Try HCLK delay 0..5, each one with/without delay and look for a
+ * good pair.
+ */
+ for (i = 0; i < 12; i++) {
+ bool pass;
+
+ ct->fread_timing_val &= mask;
+ ct->fread_timing_val |= FREAD_TPASS(i) << shift;
+ ast_ahb_writel(ct->fread_timing_val, ct->fread_timing_reg);
+ rc = ast_sf_check_reads(ct, golden_buf, test_buf);
+ if (rc && rc != FLASH_ERR_VERIFY_FAILURE)
+ return rc;
+ pass = (rc == 0);
+ FL_DBG(" * [%08x] %d HCLK delay, %dns DI delay : %s\n",
+ ct->fread_timing_val, i/2, (i & 1) ? 0 : 4, pass ? "PASS" : "FAIL");
+ if (pass) {
+ pass_count++;
+ if (pass_count == 3) {
+ good_pass = i - 1;
+ break;
+ }
+ } else
+ pass_count = 0;
+ }
+
+ /* No good setting for this frequency */
+ if (good_pass < 0)
+ return FLASH_ERR_VERIFY_FAILURE;
+
+ /* We have at least one pass of margin, let's use first pass */
+ ct->fread_timing_val &= mask;
+ ct->fread_timing_val |= FREAD_TPASS(good_pass) << shift;
+ ast_ahb_writel(ct->fread_timing_val, ct->fread_timing_reg);
+ FL_DBG("AST: * -> good is pass %d [0x%08x]\n",
+ good_pass, ct->fread_timing_val);
+ return 0;
+}
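+
+/*
+ * The FREAD_TPASS() encoding above packs both knobs into one nibble
+ * per HCLK divider: bits [2:0] hold the HCLK delay count (i / 2) and
+ * bit 3 appears to select the extra ~4ns input-data delay (set for
+ * even i, matching the debug output). So i = 0, 1, 2, 3 encode as
+ * 0x8, 0x0, 0x9, 0x1 and so on up to i = 11.
+ */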
+
+static bool ast_calib_data_usable(const uint8_t *test_buf, uint32_t size)
+{
+ const uint32_t *tb32 = (const uint32_t *)test_buf;
+ uint32_t i, cnt = 0;
+
+ /* We check if we have enough words that are neither all 0
+ * nor all 1's so the calibration can be considered valid.
+ *
+ * I use an arbitrary threshold of 64 for now
+ */
+ size >>= 2;
+ for (i = 0; i < size; i++) {
+ if (tb32[i] != 0 && tb32[i] != 0xffffffff)
+ cnt++;
+ }
+ return cnt >= 64;
+}
+
+static int ast_sf_optimize_reads(struct ast_sf_ctrl *ct,
+ struct flash_info *info __unused,
+ uint32_t max_freq)
+{
+ uint8_t *golden_buf, *test_buf;
+ int i, rc, best_div = -1;
+ uint32_t save_read_val = ct->ctl_read_val;
+
+ test_buf = malloc(CALIBRATE_BUF_SIZE * 2);
+ golden_buf = test_buf + CALIBRATE_BUF_SIZE;
+
+ /* We start with the dumbest setting and read some data */
+ ct->ctl_read_val = (ct->ctl_read_val & 0x2000) |
+ (0x00 << 28) | /* Single bit */
+ (0x00 << 24) | /* CE# max */
+ (0x03 << 16) | /* use normal reads */
+ (0x00 << 8) | /* HCLK/16 */
+ (0x00 << 6) | /* no dummy cycle */
+ (0x00); /* normal read */
+ ast_ahb_writel(ct->ctl_read_val, ct->ctl_reg);
+
+ rc = ast_copy_from_ahb(golden_buf, ct->flash, CALIBRATE_BUF_SIZE);
+ if (rc) {
+ free(test_buf);
+ return rc;
+ }
+
+ /* Establish our read mode with freq field set to 0 */
+ ct->ctl_read_val = save_read_val & 0xfffff0ff;
+
+ /* Check if calibration data is suitable */
+ if (!ast_calib_data_usable(golden_buf, CALIBRATE_BUF_SIZE)) {
+ FL_INF("AST: Calibration area too uniform, "
+ "using low speed\n");
+ ast_ahb_writel(ct->ctl_read_val, ct->ctl_reg);
+ free(test_buf);
+ return 0;
+ }
+
+ /* Now we iterate the HCLK dividers until we find our breaking point */
+ for (i = 5; i > 0; i--) {
+ uint32_t tv, freq;
+
+ /* Compare timing to max */
+ freq = ast_ahb_freq / i;
+ if (freq >= max_freq)
+ continue;
+
+ /* Set the timing */
+ tv = ct->ctl_read_val | (ast_ct_hclk_divs[i - 1] << 8);
+ ast_ahb_writel(tv, ct->ctl_reg);
+ FL_DBG("AST: Trying HCLK/%d...\n", i);
+ rc = ast_sf_calibrate_reads(ct, i, golden_buf, test_buf);
+
+ /* Some other error occurred, bail out */
+ if (rc && rc != FLASH_ERR_VERIFY_FAILURE) {
+ free(test_buf);
+ return rc;
+ }
+ if (rc == 0)
+ best_div = i;
+ }
+ free(test_buf);
+
+ /* Nothing found ? */
+ if (best_div < 0)
+ FL_ERR("AST: No good frequency, using dumb slow\n");
+ else {
+ FL_DBG("AST: Found good read timings at HCLK/%d\n", best_div);
+ ct->ctl_read_val |= (ast_ct_hclk_divs[best_div - 1] << 8);
+ }
+ ast_ahb_writel(ct->ctl_read_val, ct->ctl_reg);
+
+ return 0;
+}
+
+static int ast_sf_get_hclk(uint32_t *ctl_val, uint32_t max_freq)
+{
+ int i;
+
+ /* It appears that running commands at HCLK/2 on some Micron
+ * chips results in occasional reads of bogus status (that,
+ * or unrelated chip hangs).
+ *
+ * Since we cannot calibrate properly the reads for commands,
+ * instead, let's limit our SPI frequency to HCLK/4 to stay
+ * on the safe side of things
+ */
+#define MIN_CMD_FREQ 4
+ for (i = MIN_CMD_FREQ; i <= 5; i++) {
+ uint32_t freq = ast_ahb_freq / i;
+ if (freq >= max_freq)
+ continue;
+ *ctl_val |= (ast_ct_hclk_divs[i - 1] << 8);
+ return i;
+ }
+ return 0;
+}
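+
+/*
+ * For example, with a 192Mhz AHB clock and a max_freq of 106000000
+ * the loop starts at HCLK/4 (48Mhz) because of MIN_CMD_FREQ, finds it
+ * below the limit, writes the matching ast_ct_hclk_divs[] value into
+ * bits [11:8] of *ctl_val and returns 4.
+ */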
+
+static int ast_sf_setup_macronix(struct ast_sf_ctrl *ct, struct flash_info *info)
+{
+ int rc, div __unused;
+ uint8_t srcr[2];
+
+ /*
+ * Those Macronix chips support dual reads at 104Mhz
+ * and dual IO at 84Mhz with 4 dummies.
+ *
+ * Our calibration algo should give us something along
+ * the lines of HCLK/3 (HCLK/2 seems to work sometimes
+ * but appears to be fairly unreliable) which is 64Mhz
+ *
+ * So we choose dual IO mode.
+ *
+ * The CE# inactive width for reads must be 7ns; we set it
+ * to 3T, which is about 15ns at the fastest speed we support
+ * (HCLK/2), as I've had issues with smaller values.
+ *
+ * For write and program it's 30ns so let's set the value
+ * for normal ops to 6T.
+ *
+ * Preserve the current 4b mode.
+ */
+ FL_DBG("AST: Setting up Macronix...\n");
+
+ /*
+ * Read the status and config registers
+ */
+ rc = ast_sf_cmd_rd(&ct->ops, CMD_RDSR, false, 0, &srcr[0], 1);
+ if (rc != 0) {
+ FL_ERR("AST: Failed to read status\n");
+ return rc;
+ }
+ rc = ast_sf_cmd_rd(&ct->ops, CMD_RDCR, false, 0, &srcr[1], 1);
+ if (rc != 0) {
+ FL_ERR("AST: Failed to read configuration\n");
+ return rc;
+ }
+
+ FL_DBG("AST: Macronix SR:CR: 0x%02x:%02x\n", srcr[0], srcr[1]);
+
+ /* Switch to 8 dummy cycles to enable 104Mhz operations */
+ srcr[1] = (srcr[1] & 0x3f) | 0x80;
+
+ rc = fl_wren(&ct->ops);
+ if (rc) {
+ FL_ERR("AST: Failed to WREN for Macronix config\n");
+ return rc;
+ }
+
+ rc = ast_sf_cmd_wr(&ct->ops, CMD_WRSR, false, 0, srcr, 2);
+ if (rc != 0) {
+ FL_ERR("AST: Failed to write Macronix config\n");
+ return rc;
+ }
+ rc = fl_sync_wait_idle(&ct->ops);
+ if (rc != 0) {
+ FL_ERR("AST: Failed waiting for config write\n");
+ return rc;
+ }
+
+ FL_DBG("AST: Macronix SR:CR: 0x%02x:%02x\n", srcr[0], srcr[1]);
+
+ /* Use 2READ */
+ ct->ctl_read_val = (ct->ctl_read_val & 0x2000) |
+ (0x03 << 28) | /* Dual IO */
+ (0x0d << 24) | /* CE# width 3T */
+ (0xbb << 16) | /* 2READ command */
+ (0x00 << 8) | /* HCLK/16 (optimize later) */
+ (0x02 << 6) | /* 2 bytes dummy cycle (8 clocks) */
+ (0x01); /* fast read */
+
+ /* Configure SPI flash read timing */
+ rc = ast_sf_optimize_reads(ct, info, 104000000);
+ if (rc) {
+ FL_ERR("AST: Failed to setup proper read timings, rc=%d\n", rc);
+ return rc;
+ }
+
+ /*
+ * For other commands and writes also increase the SPI clock
+ * to HCLK/2 since the chip supports up to 133Mhz and set
+ * CE# inactive to 6T. We request a timing that is 20% below
+ * the limit of the chip, so about 106Mhz which should fit.
+ */
+ ct->ctl_val = (ct->ctl_val & 0x2000) |
+ (0x00 << 28) | /* Single bit */
+ (0x0a << 24) | /* CE# width 6T (b1010) */
+ (0x00 << 16) | /* no command */
+ (0x00 << 8) | /* HCLK/16 (done later) */
+ (0x00 << 6) | /* no dummy cycle */
+ (0x00); /* normal read */
+
+ div = ast_sf_get_hclk(&ct->ctl_val, 106000000);
+ FL_DBG("AST: Command timing set to HCLK/%d\n", div);
+
+ /* Update chip with current read config */
+ ast_ahb_writel(ct->ctl_read_val, ct->ctl_reg);
+ return 0;
+}
+
+static int ast_sf_setup_winbond(struct ast_sf_ctrl *ct, struct flash_info *info)
+{
+ int rc, div __unused;
+
+ FL_DBG("AST: Setting up Winbond...\n");
+
+ /*
+ * This Winbond chip supports dual reads at 104Mhz
+ * with 8 dummy cycles.
+ *
+ * The CE# inactive width for reads must be 10ns, we set it
+ * to 3T which is about 15.6ns.
+ */
+ ct->ctl_read_val = (ct->ctl_read_val & 0x2000) |
+ (0x02 << 28) | /* Dual bit data only */
+ (0x0e << 24) | /* CE# width 2T (b1110) */
+ (0x3b << 16) | /* DREAD command */
+ (0x00 << 8) | /* HCLK/16 */
+ (0x01 << 6) | /* 1-byte dummy cycle */
+ (0x01); /* fast read */
+
+ /* Configure SPI flash read timing */
+ rc = ast_sf_optimize_reads(ct, info, 104000000);
+ if (rc) {
+ FL_ERR("AST: Failed to setup proper read timings, rc=%d\n", rc);
+ return rc;
+ }
+
+ /*
+ * For other commands and writes also increase the SPI clock
+ * to HCLK/2 since the chip supports up to 133Mhz. CE# inactive
+ * for write and erase is 50ns so let's set it to 10T.
+ */
+ ct->ctl_val = (ct->ctl_read_val & 0x2000) |
+ (0x00 << 28) | /* Single bit */
+ (0x06 << 24) | /* CE# width 10T (b0110) */
+ (0x00 << 16) | /* no command */
+ (0x00 << 8) | /* HCLK/16 */
+ (0x00 << 6) | /* no dummy cycle */
+ (0x01); /* fast read */
+
+ div = ast_sf_get_hclk(&ct->ctl_val, 106000000);
+ FL_DBG("AST: Command timing set to HCLK/%d\n", div);
+
+ /* Update chip with current read config */
+ ast_ahb_writel(ct->ctl_read_val, ct->ctl_reg);
+ return 0;
+}
+
+static int ast_sf_setup_micron(struct ast_sf_ctrl *ct, struct flash_info *info)
+{
+ uint8_t vconf, ext_id[6];
+ int rc, div __unused;
+
+ FL_DBG("AST: Setting up Micron...\n");
+
+ /*
+ * Read the extended chip ID to try to detect old vs. new
+ * flashes since old Micron flashes have a lot of issues
+ */
+ rc = ast_sf_cmd_rd(&ct->ops, CMD_RDID, false, 0, ext_id, 6);
+ if (rc != 0) {
+ FL_ERR("AST: Failed to read Micron ext ID, sticking to dumb speed\n");
+ return 0;
+ }
+ /* Check ID matches expectations */
+ if (ext_id[0] != ((info->id >> 16) & 0xff) ||
+ ext_id[1] != ((info->id >> 8) & 0xff) ||
+ ext_id[2] != ((info->id ) & 0xff)) {
+ FL_ERR("AST: Micron ext ID mismatch, sticking to dumb speed\n");
+ return 0;
+ }
+ FL_DBG("AST: Micron ext ID byte: 0x%02x\n", ext_id[4]);
+
+ /* Check for old (<45nm) chips, don't try to be fancy on those */
+ if (!(ext_id[4] & 0x40)) {
+ FL_DBG("AST: Old chip, using dumb timings\n");
+ goto dumb;
+ }
+
+ /*
+ * Read the micron specific volatile configuration reg
+ */
+ rc = ast_sf_cmd_rd(&ct->ops, CMD_MIC_RDVCONF, false, 0, &vconf, 1);
+ if (rc != 0) {
+ FL_ERR("AST: Failed to read Micron vconf, sticking to dumb speed\n");
+ goto dumb;
+ }
+ FL_DBG("AST: Micron VCONF: 0x%02x\n", vconf);
+
+ /* Switch to 8 dummy cycles (we might be able to operate with 4
+ * but let's keep some margin)
+ */
+ vconf = (vconf & 0x0f) | 0x80;
+
+ rc = ast_sf_cmd_wr(&ct->ops, CMD_MIC_WRVCONF, false, 0, &vconf, 1);
+ if (rc != 0) {
+ FL_ERR("AST: Failed to write Micron vconf, "
+ " sticking to dumb speed\n");
+ goto dumb;
+ }
+ rc = fl_sync_wait_idle(&ct->ops);
+ if (rc != 0) {
+ FL_ERR("AST: Failed waiting for config write\n");
+ return rc;
+ }
+ FL_DBG("AST: Updated to : 0x%02x\n", vconf);
+
+ /*
+ * Try to do full dual IO, with 8 dummy cycles it supports 133Mhz
+ *
+ * The CE# inactive width for reads must be 20ns, we set it
+ * to 4T which is about 20.8ns.
+ */
+ ct->ctl_read_val = (ct->ctl_read_val & 0x2000) |
+ (0x03 << 28) | /* Dual IO */
+ (0x0c << 24) | /* CE# 4T */
+ (0xbb << 16) | /* 2READ command */
+ (0x00 << 8) | /* HCLK/16 (optimize later) */
+ (0x02 << 6) | /* 8 dummy cycles (2 bytes) */
+ (0x01); /* fast read */
+
+ /* Configure SPI flash read timing */
+ rc = ast_sf_optimize_reads(ct, info, 133000000);
+ if (rc) {
+ FL_ERR("AST: Failed to setup proper read timings, rc=%d\n", rc);
+ return rc;
+ }
+
+ /*
+ * For other commands and writes also increase the SPI clock
+ * to HCLK/2 since the chip supports up to 133Mhz. CE# inactive
+ * for write and erase is 50ns so let's set it to 10T.
+ */
+ ct->ctl_val = (ct->ctl_read_val & 0x2000) |
+ (0x00 << 28) | /* Single bit */
+ (0x06 << 24) | /* CE# width 10T (b0110) */
+ (0x00 << 16) | /* no command */
+ (0x00 << 8) | /* HCLK/16 */
+ (0x00 << 6) | /* no dummy cycle */
+ (0x00); /* norm read */
+
+ div = ast_sf_get_hclk(&ct->ctl_val, 133000000);
+ FL_DBG("AST: Command timing set to HCLK/%d\n", div);
+
+ /* Update chip with current read config */
+ ast_ahb_writel(ct->ctl_read_val, ct->ctl_reg);
+
+ return 0;
+
+ dumb:
+ ct->ctl_val = ct->ctl_read_val = (ct->ctl_read_val & 0x2000) |
+ (0x00 << 28) | /* Single bit */
+ (0x00 << 24) | /* CE# max */
+ (0x03 << 16) | /* use normal reads */
+ (0x06 << 8) | /* HCLK/4 */
+ (0x00 << 6) | /* no dummy cycle */
+ (0x00); /* normal read */
+
+ /* Update chip with current read config */
+ ast_ahb_writel(ct->ctl_read_val, ct->ctl_reg);
+
+ return 0;
+}
+
+static int ast_sf_setup(struct spi_flash_ctrl *ctrl, uint32_t *tsize)
+{
+ struct ast_sf_ctrl *ct = container_of(ctrl, struct ast_sf_ctrl, ops);
+ struct flash_info *info = ctrl->finfo;
+
+ (void)tsize;
+
+ /*
+ * Configure better timings and read mode for known
+ * flash chips
+ */
+ switch(info->id) {
+ case 0xc22018: /* MX25L12835F */
+ case 0xc22019: /* MX25L25635F */
+ case 0xc2201a: /* MX66L51235F */
+ case 0xc2201b: /* MX66L1G45G */
+ return ast_sf_setup_macronix(ct, info);
+ case 0xef4018: /* W25Q128BV */
+ return ast_sf_setup_winbond(ct, info);
+ case 0x20ba20: /* MT25Qx512xx */
+ return ast_sf_setup_micron(ct, info);
+ }
+ /* No special tuning */
+ return 0;
+}
+
+static bool ast_sf_init_pnor(struct ast_sf_ctrl *ct)
+{
+ uint32_t reg;
+
+ ct->ctl_reg = PNOR_SPI_FCTL_CTRL;
+ ct->fread_timing_reg = PNOR_SPI_FREAD_TIMING;
+ ct->flash = PNOR_FLASH_BASE;
+
+ /* Enable writing to the controller */
+ reg = ast_ahb_readl(PNOR_SPI_FCTL_CONF);
+ if (reg == 0xffffffff) {
+ FL_ERR("AST_SF: Failed read from controller config\n");
+ return false;
+ }
+ ast_ahb_writel(reg | 1, PNOR_SPI_FCTL_CONF);
+
+ /*
+ * Snapshot control reg and sanitize it for our
+ * use, switching to 1-bit mode, clearing user
+ * mode if set, etc...
+ *
+ * Also configure SPI clock to something safe
+ * like HCLK/8 (24Mhz)
+ */
+ ct->ctl_val = ast_ahb_readl(ct->ctl_reg);
+ if (ct->ctl_val == 0xffffffff) {
+ FL_ERR("AST_SF: Failed read from controller control\n");
+ return false;
+ }
+
+ ct->ctl_val = (ct->ctl_val & 0x2000) |
+ (0x00 << 28) | /* Single bit */
+ (0x00 << 24) | /* CE# width 16T */
+ (0x00 << 16) | /* no command */
+ (0x04 << 8) | /* HCLK/8 */
+ (0x00 << 6) | /* no dummy cycle */
+ (0x00); /* normal read */
+
+ /* Initial read mode is default */
+ ct->ctl_read_val = ct->ctl_val;
+
+ /* Initial read timings all 0 */
+ ct->fread_timing_val = 0;
+
+ /* Configure for read */
+ ast_ahb_writel(ct->ctl_read_val, ct->ctl_reg);
+ ast_ahb_writel(ct->fread_timing_val, ct->fread_timing_reg);
+
+ if (ct->ctl_val & 0x2000)
+ ct->mode_4b = true;
+ else
+ ct->mode_4b = false;
+
+ return true;
+}
+
+static bool ast_sf_init_bmc(struct ast_sf_ctrl *ct)
+{
+ ct->ctl_reg = BMC_SPI_FCTL_CTRL;
+ ct->fread_timing_reg = BMC_SPI_FREAD_TIMING;
+ ct->flash = BMC_FLASH_BASE;
+
+ /*
+ * Snapshot control reg and sanitize it for our
+ * use, switching to 1-bit mode, clearing user
+ * mode if set, etc...
+ *
+ * Also configure SPI clock to something safe
+ * like HCLK/8 (24Mhz)
+ */
+ ct->ctl_val =
+ (0x00 << 28) | /* Single bit */
+ (0x00 << 24) | /* CE# width 16T */
+ (0x00 << 16) | /* no command */
+ (0x04 << 8) | /* HCLK/8 */
+ (0x00 << 6) | /* no dummy cycle */
+ (0x00); /* normal read */
+
+ /* Initial read mode is default */
+ ct->ctl_read_val = ct->ctl_val;
+
+ /* Initial read timings all 0 */
+ ct->fread_timing_val = 0;
+
+ /* Configure for read */
+ ast_ahb_writel(ct->ctl_read_val, ct->ctl_reg);
+ ast_ahb_writel(ct->fread_timing_val, ct->fread_timing_reg);
+
+ ct->mode_4b = false;
+
+ return true;
+}
+
+static int ast_mem_set4b(struct spi_flash_ctrl *ctrl __unused,
+ bool enable __unused)
+{
+ return 0;
+}
+
+static int ast_mem_setup(struct spi_flash_ctrl *ctrl __unused,
+ uint32_t *tsize __unused)
+{
+ return 0;
+}
+
+static int ast_mem_chipid(struct spi_flash_ctrl *ctrl __unused, uint8_t *id_buf,
+ uint32_t *id_size)
+{
+ if (*id_size < 3)
+ return -1;
+
+ id_buf[0] = 0xaa;
+ id_buf[1] = 0x55;
+ id_buf[2] = 0xaa;
+ *id_size = 3;
+ return 0;
+}
+
+static int ast_mem_write(struct spi_flash_ctrl *ctrl, uint32_t pos,
+ const void *buf, uint32_t len)
+{
+ struct ast_sf_ctrl *ct = container_of(ctrl, struct ast_sf_ctrl, ops);
+
+ /*
+ * This only works when the ahb is pointed at system memory.
+ */
+ return ast_copy_to_ahb(ct->flash + pos, buf, len);
+}
+
+static int ast_mem_erase(struct spi_flash_ctrl *ctrl, uint32_t addr, uint32_t size)
+{
+ struct ast_sf_ctrl *ct = container_of(ctrl, struct ast_sf_ctrl, ops);
+ uint32_t pos, len, end = addr + size;
+ uint64_t zero = 0;
+ int ret;
+
+ for (pos = addr; pos < end; pos += sizeof(zero)) {
+ if (pos + sizeof(zero) > end)
+ len = end - pos;
+ else
+ len = sizeof(zero);
+
+ ret = ast_copy_to_ahb(ct->flash + pos, &zero, len);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+int ast_sf_open(uint8_t type, struct spi_flash_ctrl **ctrl)
+{
+ struct ast_sf_ctrl *ct;
+#ifdef __SKIBOOT__
+ uint32_t hicr7;
+
+ if (!ast_sio_is_enabled())
+ return -ENODEV;
+#endif /* __SKIBOOT__ */
+
+ if (type != AST_SF_TYPE_PNOR && type != AST_SF_TYPE_BMC
+ && type != AST_SF_TYPE_MEM)
+ return -EINVAL;
+
+ *ctrl = NULL;
+ ct = malloc(sizeof(*ct));
+ if (!ct) {
+ FL_ERR("AST_SF: Failed to allocate\n");
+ return -ENOMEM;
+ }
+ memset(ct, 0, sizeof(*ct));
+ ct->type = type;
+
+ if (type == AST_SF_TYPE_MEM) {
+ ct->ops.cmd_wr = NULL;
+ ct->ops.cmd_rd = NULL;
+ ct->ops.read = ast_sf_read;
+ ct->ops.set_4b = ast_mem_set4b;
+ ct->ops.write = ast_mem_write;
+ ct->ops.erase = ast_mem_erase;
+ ct->ops.setup = ast_mem_setup;
+ ct->ops.chip_id = ast_mem_chipid;
+ ct->flash = PNOR_FLASH_BASE;
+ } else {
+ ct->ops.cmd_wr = ast_sf_cmd_wr;
+ ct->ops.cmd_rd = ast_sf_cmd_rd;
+ ct->ops.set_4b = ast_sf_set_4b;
+ ct->ops.read = ast_sf_read;
+ ct->ops.setup = ast_sf_setup;
+ }
+
+ ast_get_ahb_freq();
+
+ if (type == AST_SF_TYPE_PNOR) {
+ if (!ast_sf_init_pnor(ct))
+ goto fail;
+ } else if (type == AST_SF_TYPE_BMC) {
+ if (!ast_sf_init_bmc(ct))
+ goto fail;
+ }
+
+#ifdef __SKIBOOT__
+ /* Read the configuration of the LPC->AHB bridge for PNOR
+ * to extract the PNOR LPC offset which can be different
+ * depending on flash size
+ */
+ hicr7 = ast_ahb_readl(LPC_HICR7);
+ pnor_lpc_offset = (hicr7 & 0xffffu) << 16;
+ prlog(PR_DEBUG, "AST: PNOR LPC offset: 0x%08x\n", pnor_lpc_offset);
+#endif /* __SKIBOOT__ */
+
+ *ctrl = &ct->ops;
+
+ return 0;
+ fail:
+ free(ct);
+ return -EIO;
+}
+
+void ast_sf_close(struct spi_flash_ctrl *ctrl)
+{
+ struct ast_sf_ctrl *ct = container_of(ctrl, struct ast_sf_ctrl, ops);
+
+ /* Restore control reg to read */
+ ast_ahb_writel(ct->ctl_read_val, ct->ctl_reg);
+
+ /* Additional cleanup */
+ if (ct->type == AST_SF_TYPE_PNOR) {
+ uint32_t reg = ast_ahb_readl(PNOR_SPI_FCTL_CONF);
+ if (reg != 0xffffffff)
+ ast_ahb_writel(reg & ~1, PNOR_SPI_FCTL_CONF);
+ }
+
+ /* Free the whole lot */
+ free(ct);
+}
diff --git a/roms/skiboot/hw/bt.c b/roms/skiboot/hw/bt.c
new file mode 100644
index 000000000..5016feab6
--- /dev/null
+++ b/roms/skiboot/hw/bt.c
@@ -0,0 +1,720 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Block Transfer, typically what IPMI goes over
+ *
+ * Copyright 2013-2019 IBM Corp.
+ */
+
+#define pr_fmt(fmt) "BT: " fmt
+
+#include <skiboot.h>
+#include <lpc.h>
+#include <lock.h>
+#include <device.h>
+#include <timebase.h>
+#include <ipmi.h>
+#include <bt.h>
+#include <timer.h>
+#include <ipmi.h>
+#include <timebase.h>
+#include <chip.h>
+#include <interrupts.h>
+
+/* BT registers */
+#define BT_CTRL 0
+#define BT_CTRL_B_BUSY 0x80
+#define BT_CTRL_H_BUSY 0x40
+#define BT_CTRL_OEM0 0x20
+#define BT_CTRL_SMS_ATN 0x10
+#define BT_CTRL_B2H_ATN 0x08
+#define BT_CTRL_H2B_ATN 0x04
+#define BT_CTRL_CLR_RD_PTR 0x02
+#define BT_CTRL_CLR_WR_PTR 0x01
+#define BT_HOST2BMC 1
+#define BT_INTMASK 2
+#define BT_INTMASK_B2H_IRQEN 0x01
+#define BT_INTMASK_B2H_IRQ 0x02
+#define BT_INTMASK_BMC_HWRST 0x80
+
+/* Maximum size of the HW FIFO */
+#define BT_FIFO_LEN 64
+
+/* Default poll interval before interrupts are working */
+#define BT_DEFAULT_POLL_MS 200
+
+/*
+ * Minimum size of an IPMI request/response including
+ * mandatory headers.
+ */
+#define BT_MIN_REQ_LEN 3
+#define BT_MIN_RESP_LEN 4
+
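+/*
+ * On-the-wire framing used by bt_send_msg() and bt_get_resp() below:
+ * requests are [length] [netfn/lun] [seq] [cmd] [data...] and
+ * responses are [length] [netfn/lun] [seq] [cmd] [cc] [data...],
+ * where the length byte counts everything that follows it, hence the
+ * BT_MIN_REQ_LEN/BT_MIN_RESP_LEN adjustments above.
+ */
+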
+/* How long (in uS) to poll for new ipmi data. */
+#define POLL_TIMEOUT 10000
+
+/* Maximum number of outstanding messages to allow in the queue. */
+#define BT_MAX_QUEUE_LEN 10
+
+/* How long (in seconds) before a message is timed out. */
+#define BT_MSG_TIMEOUT 3
+
+/* Maximum number of times to attempt sending a message before giving up. */
+#define BT_MAX_RETRIES 1
+
+/* Macro to enable printing BT message queue for debug */
+#define BT_QUEUE_DEBUG 0
+
+/* BT message logging macros */
+#define _BT_Q_LOG(level, msg, fmt, args...) \
+ do { if (msg) \
+ prlog(level, "seq 0x%02x netfn 0x%02x cmd 0x%02x: " fmt "\n", \
+ (msg)->seq, ((msg)->ipmi_msg.netfn >> 2), (msg)->ipmi_msg.cmd, ##args); \
+ else \
+ prlog(level, "seq 0x?? netfn 0x?? cmd 0x??: " fmt "\n", ##args); \
+ } while (0)
+
+#define BT_Q_ERR(msg, fmt, args...) \
+ _BT_Q_LOG(PR_ERR, msg, fmt, ##args)
+
+#define BT_Q_DBG(msg, fmt, args...) \
+ _BT_Q_LOG(PR_DEBUG, msg, fmt, ##args)
+
+#define BT_Q_TRACE(msg, fmt, args...) \
+ _BT_Q_LOG(PR_TRACE, msg, fmt, ##args)
+
+struct bt_msg {
+ struct list_node link;
+ unsigned long tb;
+ uint8_t seq;
+ uint8_t send_count;
+ bool disable_retry;
+ struct ipmi_msg ipmi_msg;
+};
+
+struct bt_caps {
+ uint8_t num_requests;
+ uint16_t input_buf_len;
+ uint16_t output_buf_len;
+ uint8_t msg_timeout;
+ uint8_t max_retries;
+};
+
+struct bt {
+ uint32_t base_addr;
+ struct lock lock;
+ struct list_head msgq;
+ struct list_head msgq_sync; /* separate list for synchronous messages */
+ struct timer poller;
+ bool irq_ok;
+ int queue_len;
+ struct bt_caps caps;
+};
+
+static struct bt bt;
+static struct bt_msg *inflight_bt_msg; /* Holds in flight message */
+
+static int ipmi_seq;
+
+static inline uint8_t bt_inb(uint32_t reg)
+{
+ return lpc_inb(bt.base_addr + reg);
+}
+
+static inline void bt_outb(uint8_t data, uint32_t reg)
+{
+ lpc_outb(data, bt.base_addr + reg);
+}
+
+static inline void bt_set_h_busy(bool value)
+{
+ uint8_t rval;
+
+ rval = bt_inb(BT_CTRL);
+ if (value != !!(rval & BT_CTRL_H_BUSY))
+ bt_outb(BT_CTRL_H_BUSY, BT_CTRL);
+}
+
+static inline void bt_assert_h_busy(void)
+{
+ uint8_t rval;
+ rval = bt_inb(BT_CTRL);
+ assert(rval & BT_CTRL_H_BUSY);
+}
+
+static void get_bt_caps_complete(struct ipmi_msg *msg)
+{
+ /* Ignore errors, we'll fall back to using the defaults, no big deal */
+ if (msg->data[0] == 0) {
+ prlog(PR_DEBUG, "Got illegal BMC BT capability\n");
+ goto out;
+ }
+
+ if (msg->data[1] != BT_FIFO_LEN) {
+ prlog(PR_DEBUG, "Got an input buffer len (%u) cap which differs from the default\n",
+ msg->data[1]);
+ }
+
+ if (msg->data[2] != BT_FIFO_LEN) {
+ prlog(PR_DEBUG, "Got an output buffer len (%u) cap which differs from the default\n",
+ msg->data[2]);
+ }
+
+ /*
+ * The IPMI spec says that the values for buffer sizes are:
+ * "the largest value allowed in first byte"
+ * Therefore we want to add one to what we get
+ */
+ bt.caps.num_requests = msg->data[0];
+ bt.caps.input_buf_len = msg->data[1] + 1;
+ bt.caps.output_buf_len = msg->data[2] + 1;
+ bt.caps.msg_timeout = msg->data[3];
+ bt.caps.max_retries = msg->data[4];
+ prlog(PR_DEBUG, "BMC BT capabilities received:\n");
+ prlog(PR_DEBUG, "buffer sizes: %d input %d output\n",
+ bt.caps.input_buf_len, bt.caps.output_buf_len);
+ prlog(PR_DEBUG, "number of requests: %d\n", bt.caps.num_requests);
+ prlog(PR_DEBUG, "msg timeout: %d max retries: %d\n",
+ bt.caps.msg_timeout, bt.caps.max_retries);
+
+out:
+ ipmi_free_msg(msg);
+}
+
+static void get_bt_caps(void)
+{
+
+ struct ipmi_msg *bmc_caps;
+ /*
+ * We haven't sent a message yet, so now is a good time to ask the BMC for its
+ * capabilities.
+ */
+ bmc_caps = ipmi_mkmsg(IPMI_DEFAULT_INTERFACE, IPMI_GET_BT_CAPS,
+ get_bt_caps_complete, NULL, NULL, 0, sizeof(struct bt_caps));
+ if (!bmc_caps)
+ prerror("Couldn't create BMC BT capabilities msg\n");
+
+ if (bmc_caps && ipmi_queue_msg(bmc_caps))
+ prerror("Couldn't enqueue request for BMC BT capabilities\n");
+
+ /* Ignore errors, we'll fall back to using the defaults, no big deal */
+}
+
+static inline bool bt_idle(void)
+{
+ uint8_t bt_ctrl = bt_inb(BT_CTRL);
+
+ return !(bt_ctrl & BT_CTRL_B_BUSY) && !(bt_ctrl & BT_CTRL_H2B_ATN);
+}
+
+/* Must be called with bt.lock held */
+static void bt_msg_del(struct bt_msg *bt_msg)
+{
+ list_del(&bt_msg->link);
+ bt.queue_len--;
+
+ /* Once inflight_bt_msg is removed from the list, clear it */
+ if (bt_msg == inflight_bt_msg)
+ inflight_bt_msg = NULL;
+
+ unlock(&bt.lock);
+ ipmi_cmd_done(bt_msg->ipmi_msg.cmd,
+ IPMI_NETFN_RETURN_CODE(bt_msg->ipmi_msg.netfn),
+ IPMI_TIMEOUT_ERR, &bt_msg->ipmi_msg);
+ lock(&bt.lock);
+}
+
+static void bt_init_interface(void)
+{
+ /* Clear interrupt condition & enable irq */
+ bt_outb(BT_INTMASK_B2H_IRQ | BT_INTMASK_B2H_IRQEN, BT_INTMASK);
+
+ /* Take care of a stale H_BUSY if any */
+ bt_set_h_busy(false);
+}
+
+static void bt_reset_interface(void)
+{
+ bt_outb(BT_INTMASK_BMC_HWRST, BT_INTMASK);
+ bt_init_interface();
+}
+
+/*
+ * Try to send a message from the message queue. The caller must
+ * hold bt.lock and ensure the message queue is not empty.
+ */
+static void bt_send_msg(struct bt_msg *bt_msg)
+{
+ int i;
+ struct ipmi_msg *ipmi_msg;
+
+ ipmi_msg = &bt_msg->ipmi_msg;
+
+ /* Send the message */
+ bt_outb(BT_CTRL_CLR_WR_PTR, BT_CTRL);
+
+ /* Byte 1 - Length */
+ bt_outb(ipmi_msg->req_size + BT_MIN_REQ_LEN, BT_HOST2BMC);
+
+ /* Byte 2 - NetFn/LUN */
+ bt_outb(ipmi_msg->netfn, BT_HOST2BMC);
+
+ /* Byte 3 - Seq */
+ bt_outb(bt_msg->seq, BT_HOST2BMC);
+
+ /* Byte 4 - Cmd */
+ bt_outb(ipmi_msg->cmd, BT_HOST2BMC);
+
+ /* Byte 5:N - Data */
+ for (i = 0; i < ipmi_msg->req_size; i++)
+ bt_outb(ipmi_msg->data[i], BT_HOST2BMC);
+
+ BT_Q_TRACE(bt_msg, "Message sent to host");
+ bt_msg->send_count++;
+
+ bt_outb(BT_CTRL_H2B_ATN, BT_CTRL);
+
+ return;
+}
+
+static void bt_clear_fifo(void)
+{
+ int i;
+
+ for (i = 0; i < bt.caps.input_buf_len; i++)
+ bt_outb(0xff, BT_HOST2BMC);
+}
+
+static void bt_flush_msg(void)
+{
+ bt_assert_h_busy();
+ bt_outb(BT_CTRL_B2H_ATN | BT_CTRL_CLR_RD_PTR | BT_CTRL_CLR_WR_PTR, BT_CTRL);
+ bt_clear_fifo();
+ /* Can't hurt to clear the write pointer again, just to be sure */
+ bt_outb(BT_CTRL_CLR_WR_PTR, BT_CTRL);
+ bt_set_h_busy(false);
+}
+
+static void bt_get_resp(void)
+{
+ int i;
+ struct ipmi_msg *ipmi_msg;
+ uint8_t resp_len, netfn, seq, cmd;
+ uint8_t cc = IPMI_CC_NO_ERROR;
+
+ /* Indicate to the BMC that we are busy */
+ bt_set_h_busy(true);
+
+ /* Clear B2H_ATN and read pointer */
+ bt_outb(BT_CTRL_B2H_ATN, BT_CTRL);
+ bt_outb(BT_CTRL_CLR_RD_PTR, BT_CTRL);
+
+ /* Read the response */
+ /* Byte 1 - Length (includes header size) */
+ resp_len = bt_inb(BT_HOST2BMC) - BT_MIN_RESP_LEN;
+
+ /* Byte 2 - NetFn/LUN */
+ netfn = bt_inb(BT_HOST2BMC);
+
+ /* Byte 3 - Seq */
+ seq = bt_inb(BT_HOST2BMC);
+
+ /* Byte 4 - Cmd */
+ cmd = bt_inb(BT_HOST2BMC);
+
+ /* Byte 5 - Completion Code */
+ cc = bt_inb(BT_HOST2BMC);
+
+ /* Find the corresponding message */
+ if (inflight_bt_msg == NULL || inflight_bt_msg->seq != seq) {
+ /* A response to a message we no longer care about. */
+ prlog(PR_INFO, "Nobody cared about a response to a BT/IPMI message "
+ "(seq 0x%02x netfn 0x%02x cmd 0x%02x)\n", seq, (netfn >> 2), cmd);
+ bt_flush_msg();
+ return;
+ }
+
+ ipmi_msg = &inflight_bt_msg->ipmi_msg;
+
+ /*
+ * Make sure we have enough room to store the response. As all values
+ * are unsigned we will also trigger this error if
+ * bt_inb(BT_HOST2BMC) < BT_MIN_RESP_LEN (which should never occur).
+ */
+ if (resp_len > ipmi_msg->resp_size) {
+ BT_Q_ERR(inflight_bt_msg, "Invalid resp_len %d", resp_len);
+ resp_len = ipmi_msg->resp_size;
+ cc = IPMI_ERR_MSG_TRUNCATED;
+ }
+ ipmi_msg->resp_size = resp_len;
+
+ /* Byte 6:N - Data */
+ for (i = 0; i < resp_len; i++)
+ ipmi_msg->data[i] = bt_inb(BT_HOST2BMC);
+ bt_set_h_busy(false);
+
+ BT_Q_TRACE(inflight_bt_msg, "IPMI MSG done");
+
+ list_del(&inflight_bt_msg->link);
+ /* Ready to send next message */
+ inflight_bt_msg = NULL;
+ bt.queue_len--;
+ unlock(&bt.lock);
+
+ /* Call IPMI layer to finish processing the message. */
+ ipmi_cmd_done(cmd, netfn, cc, ipmi_msg);
+ lock(&bt.lock);
+
+ return;
+}
+
+static void bt_expire_old_msg(uint64_t tb)
+{
+ struct bt_msg *bt_msg = inflight_bt_msg;
+
+ if (bt_msg && bt_msg->tb > 0 && !chip_quirk(QUIRK_SIMICS) &&
+ (tb_compare(tb, bt_msg->tb +
+ secs_to_tb(bt.caps.msg_timeout)) == TB_AAFTERB)) {
+ if (bt_msg->send_count <= bt.caps.max_retries &&
+ !bt_msg->disable_retry) {
+ /* A message timeout is usually due to the BMC
+ * clearing the H2B_ATN flag without actually
+ * doing anything. The data will still be in the
+ * FIFO so just reset the flag. */
+ BT_Q_ERR(bt_msg, "Retry sending message");
+
+ /* This means we have started message timeout, but not
+ * yet sent message to BMC as driver was not free to
+ * send message. Lets resend message.
+ */
+ if (bt_msg->send_count == 0)
+ bt_send_msg(bt_msg);
+ else
+ bt_outb(BT_CTRL_H2B_ATN, BT_CTRL);
+
+ bt_msg->send_count++;
+ bt_msg->tb = tb;
+ } else {
+ BT_Q_ERR(bt_msg, "Timeout sending message");
+ bt_msg_del(bt_msg);
+
+ /*
+ * Timing out a message is inherently racy as the BMC
+ * may start writing just as we decide to kill the
+ * message. Hopefully resetting the interface is
+ * sufficient to guard against such things.
+ */
+ bt_reset_interface();
+ }
+ }
+}
+
+#if BT_QUEUE_DEBUG
+static void print_debug_queue_info(void)
+{
+ struct bt_msg *msg;
+ static bool printed;
+
+ if (!list_empty(&bt.msgq_sync) || !list_empty(&bt.msgq)) {
+ printed = false;
+ prlog(PR_DEBUG, "-------- BT Sync Msg Queue -------\n");
+ list_for_each(&bt.msgq_sync, msg, link) {
+ BT_Q_DBG(msg, "[ sent %d ]", msg->send_count);
+ }
+ prlog(PR_DEBUG, "---------- BT Msg Queue ----------\n");
+ list_for_each(&bt.msgq, msg, link) {
+ BT_Q_DBG(msg, "[ sent %d ]", msg->send_count);
+ }
+ prlog(PR_DEBUG, "----------------------------------\n");
+ } else if (!printed) {
+ printed = true;
+ prlog(PR_DEBUG, "------- BT Msg Queue Empty -------\n");
+ }
+}
+#endif
+
+static void bt_send_and_unlock(void)
+{
+ /* Busy? */
+ if (inflight_bt_msg)
+ goto out_unlock;
+
+ if (!lpc_ok())
+ goto out_unlock;
+
+ /* Synchronous messages get priority over normal messages */
+ if (!list_empty(&bt.msgq_sync))
+ inflight_bt_msg = list_top(&bt.msgq_sync, struct bt_msg, link);
+ else if (!list_empty(&bt.msgq))
+ inflight_bt_msg = list_top(&bt.msgq, struct bt_msg, link);
+ else
+ goto out_unlock;
+
+ assert(inflight_bt_msg);
+ /*
+ * Start the message timeout once it gets to the top
+ * of the queue. This will ensure we timeout messages
+ * in the case of a broken bt interface as occurs when
+ * the BMC is not responding to any IPMI messages.
+ */
+ if (inflight_bt_msg->tb == 0)
+ inflight_bt_msg->tb = mftb();
+
+ /*
+ * Only send it if we haven't already.
+ * Timeouts and retries happen in bt_expire_old_msg()
+ * called from bt_poll()
+ */
+ if (bt_idle() && inflight_bt_msg->send_count == 0)
+ bt_send_msg(inflight_bt_msg);
+
+out_unlock:
+ unlock(&bt.lock);
+}
+
+static void bt_poll(struct timer *t __unused, void *data __unused,
+ uint64_t now)
+{
+ uint8_t bt_ctrl;
+
+ /* Don't do anything if the LPC bus is offline */
+ if (!lpc_ok())
+ return;
+
+ /*
+	 * Take the lock; it is dropped again by bt_send_and_unlock()
+	 * at the end of this function.
+ */
+ lock(&bt.lock);
+
+#if BT_QUEUE_DEBUG
+ print_debug_queue_info();
+#endif
+
+ bt_ctrl = bt_inb(BT_CTRL);
+
+ /* Is there a response waiting for us? */
+ if (bt_ctrl & BT_CTRL_B2H_ATN)
+ bt_get_resp();
+
+ bt_expire_old_msg(now);
+
+ /* Check for sms_atn */
+ if (bt_inb(BT_CTRL) & BT_CTRL_SMS_ATN) {
+ bt_outb(BT_CTRL_SMS_ATN, BT_CTRL);
+ unlock(&bt.lock);
+ ipmi_sms_attention();
+ lock(&bt.lock);
+ }
+
+ /*
+ * Send messages if we can. If the BMC was really quick we
+ * could loop back to the start and check for a response
+ * instead of unlocking, but testing shows the BMC isn't that
+ * fast so we will wait for the IRQ or a call to the pollers instead.
+ */
+ bt_send_and_unlock();
+
+ schedule_timer(&bt.poller,
+ bt.irq_ok ? TIMER_POLL : msecs_to_tb(BT_DEFAULT_POLL_MS));
+}
+
+static void bt_ipmi_poll(void)
+{
+ bt_poll(NULL, NULL, mftb());
+}
+
+static void bt_add_msg(struct bt_msg *bt_msg)
+{
+ bt_msg->tb = 0;
+ bt_msg->seq = ipmi_seq++;
+ bt_msg->send_count = 0;
+ bt.queue_len++;
+ if (bt.queue_len > BT_MAX_QUEUE_LEN) {
+		/* Maximum queue length exceeded, drop the oldest message. */
+		BT_Q_ERR(bt_msg, "Maximum queue length exceeded");
+		/* Prefer dropping from the normal queue over the sync queue */
+ if (!list_empty(&bt.msgq))
+ bt_msg = list_tail(&bt.msgq, struct bt_msg, link);
+ else if (!list_empty(&bt.msgq_sync))
+ bt_msg = list_tail(&bt.msgq_sync, struct bt_msg, link);
+ assert(bt_msg);
+ BT_Q_ERR(bt_msg, "Removed from queue");
+ bt_msg_del(bt_msg);
+ }
+}
+
+/* Add message to synchronous message list */
+static int bt_add_ipmi_msg_head(struct ipmi_msg *ipmi_msg)
+{
+ struct bt_msg *bt_msg = container_of(ipmi_msg, struct bt_msg, ipmi_msg);
+
+ lock(&bt.lock);
+ bt_add_msg(bt_msg);
+ list_add_tail(&bt.msgq_sync, &bt_msg->link);
+ bt_send_and_unlock();
+
+ return 0;
+}
+
+static int bt_add_ipmi_msg(struct ipmi_msg *ipmi_msg)
+{
+ struct bt_msg *bt_msg = container_of(ipmi_msg, struct bt_msg, ipmi_msg);
+
+ lock(&bt.lock);
+ bt_add_msg(bt_msg);
+ list_add_tail(&bt.msgq, &bt_msg->link);
+ bt_send_and_unlock();
+
+ return 0;
+}
+
+static void bt_irq(uint32_t chip_id __unused, uint32_t irq_mask __unused)
+{
+ uint8_t ireg;
+
+ ireg = bt_inb(BT_INTMASK);
+
+ bt.irq_ok = true;
+ if (ireg & BT_INTMASK_B2H_IRQ) {
+ bt_outb(BT_INTMASK_B2H_IRQ | BT_INTMASK_B2H_IRQEN, BT_INTMASK);
+ bt_poll(NULL, NULL, mftb());
+ }
+}
+
+/*
+ * Allocate an ipmi message and bt container and return the ipmi
+ * message struct. Allocates enough space for the request and response
+ * data.
+ */
+static struct ipmi_msg *bt_alloc_ipmi_msg(size_t request_size, size_t response_size)
+{
+ struct bt_msg *bt_msg;
+
+ bt_msg = zalloc(sizeof(struct bt_msg) + MAX(request_size, response_size));
+ if (!bt_msg)
+ return NULL;
+
+ bt_msg->ipmi_msg.req_size = request_size;
+ bt_msg->ipmi_msg.resp_size = response_size;
+ bt_msg->ipmi_msg.data = (uint8_t *) (bt_msg + 1);
+
+ return &bt_msg->ipmi_msg;
+}
+
+/*
+ * Free a previously allocated ipmi message.
+ */
+static void bt_free_ipmi_msg(struct ipmi_msg *ipmi_msg)
+{
+ struct bt_msg *bt_msg = container_of(ipmi_msg, struct bt_msg, ipmi_msg);
+
+ free(bt_msg);
+}
+
+/*
+ * Do not resend IPMI messages to BMC.
+ */
+static void bt_disable_ipmi_msg_retry(struct ipmi_msg *ipmi_msg)
+{
+ struct bt_msg *bt_msg = container_of(ipmi_msg, struct bt_msg, ipmi_msg);
+
+ bt_msg->disable_retry = true;
+}
+
+/*
+ * Remove a message from the queue. The memory allocated for the ipmi message
+ * will need to be freed by the caller with bt_free_ipmi_msg() as it will no
+ * longer be in the queue of messages.
+ */
+static int bt_del_ipmi_msg(struct ipmi_msg *ipmi_msg)
+{
+ struct bt_msg *bt_msg = container_of(ipmi_msg, struct bt_msg, ipmi_msg);
+
+ lock(&bt.lock);
+ list_del(&bt_msg->link);
+ bt.queue_len--;
+ bt_send_and_unlock();
+ return 0;
+}
+
+static struct ipmi_backend bt_backend = {
+ .alloc_msg = bt_alloc_ipmi_msg,
+ .free_msg = bt_free_ipmi_msg,
+ .queue_msg = bt_add_ipmi_msg,
+ .queue_msg_head = bt_add_ipmi_msg_head,
+ .dequeue_msg = bt_del_ipmi_msg,
+ .disable_retry = bt_disable_ipmi_msg_retry,
+ .poll = bt_ipmi_poll,
+};
+
+static struct lpc_client bt_lpc_client = {
+ .interrupt = bt_irq,
+};
+
+void bt_init(void)
+{
+ struct dt_node *n;
+ const struct dt_property *prop;
+ uint32_t irq;
+
+ /* Set sane capability defaults */
+ bt.caps.num_requests = 1;
+ bt.caps.input_buf_len = BT_FIFO_LEN;
+ bt.caps.output_buf_len = BT_FIFO_LEN;
+ bt.caps.msg_timeout = BT_MSG_TIMEOUT;
+ bt.caps.max_retries = BT_MAX_RETRIES;
+
+ /* We support only one */
+ n = dt_find_compatible_node(dt_root, NULL, "ipmi-bt");
+ if (!n) {
+ prerror("No BT device\n");
+ return;
+ }
+
+ /* Get IO base */
+ prop = dt_find_property(n, "reg");
+ if (!prop) {
+ prerror("Can't find reg property\n");
+ return;
+ }
+ if (dt_property_get_cell(prop, 0) != OPAL_LPC_IO) {
+ prerror("Only supports IO addresses\n");
+ return;
+ }
+ bt.base_addr = dt_property_get_cell(prop, 1);
+ init_timer(&bt.poller, bt_poll, NULL);
+
+ bt_init_interface();
+ init_lock(&bt.lock);
+
+ /*
+ * The iBT interface comes up in the busy state until the daemon has
+ * initialised it.
+ */
+ list_head_init(&bt.msgq);
+ list_head_init(&bt.msgq_sync);
+ inflight_bt_msg = NULL;
+ bt.queue_len = 0;
+
+ prlog(PR_INFO, "Interface initialized, IO 0x%04x\n", bt.base_addr);
+
+ ipmi_register_backend(&bt_backend);
+
+ /*
+	 * We initially schedule the poller as a relatively fast timer until
+	 * we have seen at least one interrupt, at which point we turn it
+	 * into a background poller.
+ */
+ schedule_timer(&bt.poller, msecs_to_tb(BT_DEFAULT_POLL_MS));
+
+ irq = dt_prop_get_u32(n, "interrupts");
+ bt_lpc_client.interrupts = LPC_IRQ(irq);
+ lpc_register_client(dt_get_chip_id(n), &bt_lpc_client,
+ IRQ_ATTR_TARGET_OPAL);
+
+ /* Enqueue an IPMI message to ask the BMC about its BT capabilities */
+ get_bt_caps();
+
+ prlog(PR_DEBUG, "Using LPC IRQ %d\n", irq);
+}
diff --git a/roms/skiboot/hw/cache-p9.c b/roms/skiboot/hw/cache-p9.c
new file mode 100644
index 000000000..fb5ce3087
--- /dev/null
+++ b/roms/skiboot/hw/cache-p9.c
@@ -0,0 +1,162 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Copyright 2019 IBM Corp.
+ */
+
+#include <skiboot.h>
+#include <chip.h>
+#include <xscom.h>
+#include <timebase.h>
+#include <xscom-p9-regs.h>
+#include <cache-p9.h>
+
+/* Registers and bits used to clear the L2 and L3 cache */
+#define L2_PRD_PURGE_CMD_REG 0x1080e
+#define L2_PRD_PURGE_CMD_TRIGGER PPC_BIT(0)
+#define L2_PRD_PURGE_CMD_TYPE_MASK PPC_BITMASK(1, 4)
+#define L2CAC_FLUSH 0x0
+#define L2_PRD_PURGE_CMD_REG_BUSY PPC_BIT(9)
+#define L3_PRD_PURGE_REG 0x1180e
+#define L3_PRD_PURGE_REQ PPC_BIT(0)
+#define L3_PRD_PURGE_TTYPE_MASK PPC_BITMASK(1, 4)
+#define L3_FULL_PURGE 0x0
+
+#define L2_L3_PRD_PURGE_TIMEOUT_MS 20
+
+static int start_l2_purge(uint32_t chip_id, uint32_t core_id)
+{
+ uint64_t addr = XSCOM_ADDR_P9_EX(core_id, L2_PRD_PURGE_CMD_REG);
+ int rc;
+
+ rc = xscom_write_mask(chip_id, addr, L2CAC_FLUSH,
+ L2_PRD_PURGE_CMD_TYPE_MASK);
+ if (!rc)
+ rc = xscom_write_mask(chip_id, addr, L2_PRD_PURGE_CMD_TRIGGER,
+ L2_PRD_PURGE_CMD_TRIGGER);
+ if (rc)
+ prlog(PR_ERR, "PURGE L2 on core 0x%x: XSCOM write_mask "
+ "failed %i\n", core_id, rc);
+ return rc;
+}
+
+static int wait_l2_purge(uint32_t chip_id, uint32_t core_id)
+{
+ uint64_t val;
+ uint64_t addr = XSCOM_ADDR_P9_EX(core_id, L2_PRD_PURGE_CMD_REG);
+ unsigned long now = mftb();
+ unsigned long end = now + msecs_to_tb(L2_L3_PRD_PURGE_TIMEOUT_MS);
+ int rc;
+
+ while (1) {
+ rc = xscom_read(chip_id, addr, &val);
+ if (rc) {
+ prlog(PR_ERR, "PURGE L2 on core 0x%x: XSCOM read "
+ "failed %i\n", core_id, rc);
+ break;
+ }
+ if (!(val & L2_PRD_PURGE_CMD_REG_BUSY))
+ break;
+ now = mftb();
+ if (tb_compare(now, end) == TB_AAFTERB) {
+ prlog(PR_ERR, "PURGE L2 on core 0x%x timed out %i\n",
+ core_id, rc);
+ return OPAL_BUSY;
+ }
+ }
+
+ /* We have to clear the trigger bit ourselves */
+ val &= ~L2_PRD_PURGE_CMD_TRIGGER;
+ rc = xscom_write(chip_id, addr, val);
+ if (rc)
+ prlog(PR_ERR, "PURGE L2 on core 0x%x: XSCOM write failed %i\n",
+ core_id, rc);
+ return rc;
+}
+
+static int start_l3_purge(uint32_t chip_id, uint32_t core_id)
+{
+ uint64_t addr = XSCOM_ADDR_P9_EX(core_id, L3_PRD_PURGE_REG);
+ int rc;
+
+ rc = xscom_write_mask(chip_id, addr, L3_FULL_PURGE,
+ L3_PRD_PURGE_TTYPE_MASK);
+ if (!rc)
+ rc = xscom_write_mask(chip_id, addr, L3_PRD_PURGE_REQ,
+ L3_PRD_PURGE_REQ);
+ if (rc)
+ prlog(PR_ERR, "PURGE L3 on core 0x%x: XSCOM write_mask "
+ "failed %i\n", core_id, rc);
+ return rc;
+}
+
+static int wait_l3_purge(uint32_t chip_id, uint32_t core_id)
+{
+ uint64_t val;
+ uint64_t addr = XSCOM_ADDR_P9_EX(core_id, L3_PRD_PURGE_REG);
+ unsigned long now = mftb();
+ unsigned long end = now + msecs_to_tb(L2_L3_PRD_PURGE_TIMEOUT_MS);
+ int rc;
+
+ /* Trigger bit is automatically set to zero when flushing is done */
+ while (1) {
+ rc = xscom_read(chip_id, addr, &val);
+ if (rc) {
+ prlog(PR_ERR, "PURGE L3 on core 0x%x: XSCOM read "
+ "failed %i\n", core_id, rc);
+ break;
+ }
+ if (!(val & L3_PRD_PURGE_REQ))
+ break;
+ now = mftb();
+ if (tb_compare(now, end) == TB_AAFTERB) {
+ prlog(PR_ERR, "PURGE L3 on core 0x%x timed out %i\n",
+ core_id, rc);
+ return OPAL_BUSY;
+ }
+ }
+ return rc;
+}
+
+int64_t purge_l2_l3_caches(void)
+{
+ struct cpu_thread *t;
+ uint64_t core_id, prev_core_id = (uint64_t)-1;
+ int rc;
+ unsigned long now = mftb();
+
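+	/*
+	 * Kick off the L2 and L3 purges on every core first, then loop
+	 * again below to wait for each of them, so the purges can proceed
+	 * in parallel across cores rather than being waited on serially.
+	 */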
+ for_each_ungarded_cpu(t) {
+ /* Only need to do it once per core chiplet */
+ core_id = pir_to_core_id(t->pir);
+ if (prev_core_id == core_id)
+ continue;
+ prev_core_id = core_id;
+ rc = start_l2_purge(t->chip_id, core_id);
+ if (rc)
+ goto trace_exit;
+ rc = start_l3_purge(t->chip_id, core_id);
+ if (rc)
+ goto trace_exit;
+ }
+
+ prev_core_id = (uint64_t)-1;
+ for_each_ungarded_cpu(t) {
+ /* Only need to do it once per core chiplet */
+ core_id = pir_to_core_id(t->pir);
+ if (prev_core_id == core_id)
+ continue;
+ prev_core_id = core_id;
+
+ rc = wait_l2_purge(t->chip_id, core_id);
+ if (rc)
+ goto trace_exit;
+ rc = wait_l3_purge(t->chip_id, core_id);
+ if (rc)
+ goto trace_exit;
+ }
+
+trace_exit:
+ prlog(PR_TRACE, "L2/L3 purging took %ldus\n",
+ tb_to_usecs(mftb() - now));
+
+ return rc;
+}
diff --git a/roms/skiboot/hw/capp.c b/roms/skiboot/hw/capp.c
new file mode 100644
index 000000000..a1aa1caa9
--- /dev/null
+++ b/roms/skiboot/hw/capp.c
@@ -0,0 +1,243 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * CAPP unit (i.e. CAPI)
+ *
+ * Copyright 2013-2019 IBM Corp.
+ */
+
+#include <skiboot.h>
+#include <io.h>
+#include <opal.h>
+#include <chip.h>
+#include <xscom.h>
+#include <capp.h>
+
+#define PHBERR(opal_id, chip_id, index, fmt, a...) \
+ prlog(PR_ERR, "PHB#%04x[%d:%d]: " fmt, \
+ opal_id, chip_id, \
+ index, ## a)
+
+static struct {
+ uint32_t ec_level;
+ struct capp_lid_hdr *lid;
+ size_t size;
+ int load_result;
+} capp_ucode_info = { 0, NULL, 0, false };
+
+#define CAPP_UCODE_MAX_SIZE 0x20000
+
+struct lock capi_lock = LOCK_UNLOCKED;
+struct capp_ops capi_ops = { NULL };
+
+bool capp_ucode_loaded(struct proc_chip *chip, unsigned int index)
+{
+ return (chip->capp_ucode_loaded & (1 << index));
+}
+
+int preload_capp_ucode(void)
+{
+ struct dt_node *p;
+ struct proc_chip *chip;
+ uint32_t index;
+ uint64_t rc;
+ int ret;
+
+ /* CAPI is supported on P8 and P9 only */
+ p = dt_find_compatible_node(dt_root, NULL, "ibm,power8-pbcq");
+ if (!p)
+ p = dt_find_compatible_node(dt_root, NULL, "ibm,power9-pbcq");
+ if (!p)
+ return OPAL_SUCCESS;
+
+ chip = get_chip(dt_get_chip_id(p));
+
+ rc = xscom_read_cfam_chipid(chip->id, &index);
+ if (rc) {
+ prerror("CAPP: Error reading cfam chip-id\n");
+ ret = OPAL_HARDWARE;
+ return ret;
+ }
+ /* Keep ChipID and Major/Minor EC. Mask out the Location Code. */
+ index = index & 0xf0fff;
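+	/* e.g. a hypothetical raw CFAM ID of 0x120d1 becomes 0x100d1 here */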
+
+ /* Assert that we're preloading */
+ assert(capp_ucode_info.lid == NULL);
+ capp_ucode_info.load_result = OPAL_EMPTY;
+
+ capp_ucode_info.ec_level = index;
+
+ /* Is the ucode preloaded like for BML? */
+ if (dt_has_node_property(p, "ibm,capp-ucode", NULL)) {
+ capp_ucode_info.lid = (struct capp_lid_hdr *)(u64)
+ dt_prop_get_u32(p, "ibm,capp-ucode");
+ capp_ucode_info.load_result = OPAL_SUCCESS;
+ ret = OPAL_SUCCESS;
+ goto end;
+ }
+ /* If we successfully download the ucode, we leave it around forever */
+ capp_ucode_info.size = CAPP_UCODE_MAX_SIZE;
+ capp_ucode_info.lid = malloc(CAPP_UCODE_MAX_SIZE);
+ if (!capp_ucode_info.lid) {
+ prerror("CAPP: Can't allocate space for ucode lid\n");
+ ret = OPAL_NO_MEM;
+ goto end;
+ }
+
+ prlog(PR_INFO, "CAPI: Preloading ucode %x\n", capp_ucode_info.ec_level);
+
+ ret = start_preload_resource(RESOURCE_ID_CAPP, index,
+ capp_ucode_info.lid,
+ &capp_ucode_info.size);
+
+ if (ret != OPAL_SUCCESS) {
+ prerror("CAPI: Failed to preload resource %d\n", ret);
+ capp_ucode_info.load_result = ret;
+ }
+
+end:
+ return ret;
+}
+
+static int64_t capp_lid_download(void)
+{
+ int64_t ret;
+
+ if (capp_ucode_info.load_result != OPAL_EMPTY)
+ return capp_ucode_info.load_result;
+
+ capp_ucode_info.load_result = wait_for_resource_loaded(
+ RESOURCE_ID_CAPP,
+ capp_ucode_info.ec_level);
+
+ if (capp_ucode_info.load_result != OPAL_SUCCESS) {
+ prerror("CAPP: Error loading ucode lid. index=%x\n",
+ capp_ucode_info.ec_level);
+ ret = OPAL_RESOURCE;
+ free(capp_ucode_info.lid);
+ capp_ucode_info.lid = NULL;
+ goto end;
+ }
+
+ ret = OPAL_SUCCESS;
+end:
+ return ret;
+}
+
+int64_t capp_load_ucode(unsigned int chip_id, uint32_t opal_id,
+ unsigned int index, u64 lid_eyecatcher,
+ uint32_t reg_offset,
+ uint64_t apc_master_addr, uint64_t apc_master_write,
+ uint64_t snp_array_addr, uint64_t snp_array_write)
+{
+ struct proc_chip *chip = get_chip(chip_id);
+ struct capp_ucode_lid *ucode;
+ struct capp_ucode_data *data;
+ struct capp_lid_hdr *lid;
+ uint64_t rc, val, addr;
+ uint32_t chunk_count, offset;
+ int i;
+
+ if (capp_ucode_loaded(chip, index))
+ return OPAL_SUCCESS;
+
+ rc = capp_lid_download();
+ if (rc)
+ return rc;
+
+ prlog(PR_INFO, "CHIP%i: CAPP ucode lid loaded at %p\n",
+ chip_id, capp_ucode_info.lid);
+
+ lid = capp_ucode_info.lid;
+ /*
+ * If lid header is present (on FSP machines), it'll tell us where to
+ * find the ucode. Otherwise this is the ucode.
+ */
+ ucode = (struct capp_ucode_lid *)lid;
+ if (be64_to_cpu(lid->eyecatcher) == lid_eyecatcher) {
+ if (be64_to_cpu(lid->version) != 0x1) {
+ PHBERR(opal_id, chip_id, index,
+ "capi ucode lid header invalid\n");
+ return OPAL_HARDWARE;
+ }
+ ucode = (struct capp_ucode_lid *)
+ ((char *)ucode + be64_to_cpu(lid->ucode_offset));
+ }
+
+ /* 'CAPPULID' in ASCII */
+ if ((be64_to_cpu(ucode->eyecatcher) != 0x43415050554C4944UL) ||
+ (be64_to_cpu(ucode->version) != 1)) {
+ PHBERR(opal_id, chip_id, index,
+ "CAPP: ucode header invalid\n");
+ return OPAL_HARDWARE;
+ }
+
+ offset = 0;
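+	/*
+	 * Walk the ucode data blocks: each block is a capp_ucode_data_hdr
+	 * followed by chunk_count 64-bit doublewords, hence the
+	 * "chunk_count * 8" when advancing the offset below.
+	 */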
+ while (offset < be64_to_cpu(ucode->data_size)) {
+ data = (struct capp_ucode_data *)
+ ((char *)&ucode->data + offset);
+ chunk_count = be32_to_cpu(data->hdr.chunk_count);
+ offset += sizeof(struct capp_ucode_data_hdr) + chunk_count * 8;
+
+ /* 'CAPPUCOD' in ASCII */
+ if (be64_to_cpu(data->hdr.eyecatcher) != 0x4341505055434F44UL) {
+ PHBERR(opal_id, chip_id, index,
+ "CAPP: ucode data header invalid:%i\n",
+ offset);
+ return OPAL_HARDWARE;
+ }
+
+ switch (data->hdr.reg) {
+ case apc_master_cresp:
+ xscom_write(chip_id, apc_master_addr + reg_offset,
+ 0);
+ addr = apc_master_write;
+ break;
+ case apc_master_uop_table:
+ xscom_write(chip_id, apc_master_addr + reg_offset,
+ 0x180ULL << 52);
+ addr = apc_master_write;
+ break;
+ case snp_ttype:
+ xscom_write(chip_id, snp_array_addr + reg_offset,
+ 0x5000ULL << 48);
+ addr = snp_array_write;
+ break;
+ case snp_uop_table:
+ xscom_write(chip_id, snp_array_addr + reg_offset,
+ 0x4000ULL << 48);
+ addr = snp_array_write;
+ break;
+ default:
+ continue;
+ }
+
+ for (i = 0; i < chunk_count; i++) {
+ val = be64_to_cpu(data->data[i]);
+ xscom_write(chip_id, addr + reg_offset, val);
+ }
+ }
+
+ chip->capp_ucode_loaded |= (1 << index);
+
+ return OPAL_SUCCESS;
+}
+
+int64_t capp_get_info(int chip_id, struct phb *phb, struct capp_info *info)
+{
+ if (capi_ops.get_capp_info)
+ return capi_ops.get_capp_info(chip_id, phb, info);
+
+ return OPAL_PARAMETER;
+}
+
+int64_t capp_xscom_read(struct capp *capp, int64_t off, uint64_t *val)
+{
+ return capp == NULL ? OPAL_PARAMETER :
+ xscom_read(capp->chip_id, off + capp->capp_xscom_offset, val);
+}
+
+int64_t capp_xscom_write(struct capp *capp, int64_t off, uint64_t val)
+{
+ return capp == NULL ? OPAL_PARAMETER :
+ xscom_write(capp->chip_id, off + capp->capp_xscom_offset, val);
+}
diff --git a/roms/skiboot/hw/centaur.c b/roms/skiboot/hw/centaur.c
new file mode 100644
index 000000000..e9ff4197f
--- /dev/null
+++ b/roms/skiboot/hw/centaur.c
@@ -0,0 +1,555 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Centaur memory buffer chip
+ *
+ * Copyright 2013-2017 IBM Corp.
+ */
+
+#include <skiboot.h>
+#include <xscom.h>
+#include <processor.h>
+#include <device.h>
+#include <chip.h>
+#include <centaur.h>
+#include <lock.h>
+#include <fsi-master.h>
+#include <timebase.h>
+
+/*
+ * Centaur chip IDs are using the XSCOM "partID" encoding
+ * described in xscom.h. recap:
+ *
+ * 0b1000.0000.0000.0000.0000.00NN.NCCC.MMMM
+ * N=Node, C=Chip, M=Memory Channel
+ *
+ * We currently use FSI exclusively for centaur access. We can
+ * start using MMIO on Centaur DD2.x when we have a way to handle
+ * machine checks happening inside Sapphire which we don't at the
+ * moment.
+ */
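+/*
+ * As a worked example, a hypothetical part ID of 0x80000023 decodes, with
+ * the masks used below, to host chip ID (0x80000023 & 0x0fffffff) >> 4 = 2
+ * and memory channel 0x80000023 & 0xf = 3.
+ */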
+
+/* Is that correct ? */
+#define MAX_CENTAURS_PER_CHIP 8
+
+/* Mark the centaur offline after this many consecutive errors */
+#define CENTAUR_ERR_OFFLINE_THRESHOLD 10
+
+/*
+ * FSI2PIB register definitions (this could be moved out if we were to
+ * support FSI master to other chips.
+ */
+#define FSI_DATA0_REG 0x1000
+#define FSI_DATA1_REG 0x1004
+#define FSI_CMD_REG 0x1008
+#define FSI_CMD_WR 0x80000000
+#define FSI_CMD_RD 0x00000000
+#define FSI_ENG_RESET_REG 0x1018
+#define FSI_STATUS_REG 0x101c
+#define FSI_STATUS_ABORT 0x00100000
+#define FSI_STATUS_ERRORS 0x00007000
+
+/* Some Centaur XSCOMs we care about */
+#define SCAC_CONFIG_REG 0x020115ce
+#define SCAC_CONFIG_SET 0x020115cf
+#define SCAC_CONFIG_CLR 0x020115d0
+#define SCAC_ENABLE_MSK PPC_BIT(0)
+
+#define cent_log(__lev, __c, __fmt, ...) \
+ prlog(__lev, "CENTAUR %x: " __fmt, __c->part_id, ##__VA_ARGS__)
+
+static int64_t centaur_fsiscom_complete(struct centaur_chip *centaur)
+{
+ int64_t rc;
+ uint32_t stat;
+
+ rc = mfsi_read(centaur->fsi_master_chip_id, centaur->fsi_master_engine,
+ centaur->fsi_master_port, FSI_STATUS_REG, &stat);
+ if (rc) {
+ cent_log(PR_ERR, centaur, "MFSI read error %lld reading STAT\n", rc);
+ return rc;
+ }
+ if ((stat & (FSI_STATUS_ABORT | FSI_STATUS_ERRORS)) == 0)
+ return OPAL_SUCCESS;
+
+ cent_log(PR_ERR, centaur, "Remote FSI SCOM error, status=0x%08x\n", stat);
+
+ /* All 1's ? Assume it's gone */
+ if (stat == 0xffffffffu) {
+ cent_log(PR_ERR, centaur, "Chip appears to be dead !\n");
+ centaur->valid = false;
+
+ /* Here, hostboot grabs a pile of FFDC from the FSI layer,
+ * we could do that too ...
+ */
+ return OPAL_HARDWARE;
+ }
+
+ /* Here HB prints the GPx registers which I believe are only
+	 * in the host (FSI master). We skip that for now as we don't have
+	 * a good API for them.
+ */
+
+ /* Recovery sequence from HostBoot fsiscom.C
+ * if SCOM fails and FSI Master displays "MasterTimeOut"
+ * then 7,6 <covered by FSI driver>
+ * else if SCOM fails and FSI2PIB Status shows PIB abort
+ * then just perform unit reset (6) and wait 1 ms
+	 *      else (PIB_abort = '0' but PIB error is non-zero)
+ * then just perform unit reset (6) (wait not needed).
+ *
+ * Note: Waiting 1ms inside OPAL is a BIG NO NO !!! We have
+ * no choice but doing it at the moment but that will have
+ * to be fixed one way or another, possibly by returning some
+ * kind of busy status until the delay is expired.
+ */
+ rc = mfsi_write(centaur->fsi_master_chip_id, centaur->fsi_master_engine,
+ centaur->fsi_master_port, FSI_ENG_RESET_REG, 0);
+ if (rc) {
+ cent_log(PR_ERR, centaur, "MFSI write error %lld resetting SCOM engine\n",
+ rc);
+ }
+ return OPAL_HARDWARE;
+}
+
+static int64_t centaur_fsiscom_read(struct centaur_chip *centaur, uint32_t pcb_addr,
+ uint64_t *val)
+{
+ int64_t rc;
+ uint32_t data0, data1;
+
+ rc = mfsi_write(centaur->fsi_master_chip_id, centaur->fsi_master_engine,
+ centaur->fsi_master_port, FSI_CMD_REG, pcb_addr | FSI_CMD_RD);
+ if (rc) {
+ cent_log(PR_ERR, centaur, "MFSI write error %lld writing CMD\n", rc);
+ return rc;
+ }
+
+ rc = centaur_fsiscom_complete(centaur);
+ if (rc)
+ return rc;
+
+ rc = mfsi_read(centaur->fsi_master_chip_id, centaur->fsi_master_engine,
+ centaur->fsi_master_port, FSI_DATA0_REG, &data0);
+ if (rc) {
+ cent_log(PR_ERR, centaur, "MFSI read error %lld reading DATA0\n", rc);
+ return rc;
+ }
+ rc = mfsi_read(centaur->fsi_master_chip_id, centaur->fsi_master_engine,
+ centaur->fsi_master_port, FSI_DATA1_REG, &data1);
+ if (rc) {
+		cent_log(PR_ERR, centaur, "MFSI read error %lld reading DATA1\n", rc);
+ return rc;
+ }
+
+ *val = (((uint64_t)data0) << 32) | data1;
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t centaur_fsiscom_write(struct centaur_chip *centaur, uint32_t pcb_addr,
+ uint64_t val)
+{
+ int64_t rc;
+
+ rc = mfsi_write(centaur->fsi_master_chip_id, centaur->fsi_master_engine,
+ centaur->fsi_master_port, FSI_DATA0_REG, hi32(val));
+ if (rc) {
+ cent_log(PR_ERR, centaur, "MFSI write error %lld writing DATA0\n", rc);
+ return rc;
+ }
+ rc = mfsi_write(centaur->fsi_master_chip_id, centaur->fsi_master_engine,
+ centaur->fsi_master_port, FSI_DATA1_REG, lo32(val));
+ if (rc) {
+ cent_log(PR_ERR, centaur, "MFSI write error %lld writing DATA1\n", rc);
+ return rc;
+ }
+ rc = mfsi_write(centaur->fsi_master_chip_id, centaur->fsi_master_engine,
+ centaur->fsi_master_port, FSI_CMD_REG, pcb_addr | FSI_CMD_WR);
+ if (rc) {
+ cent_log(PR_ERR, centaur, "MFSI write error %lld writing CMD\n", rc);
+ return rc;
+ }
+
+ return centaur_fsiscom_complete(centaur);
+}
+
+struct centaur_chip *get_centaur(uint32_t part_id)
+{
+ uint32_t hchip_id, mchan;
+ struct proc_chip *hchip;
+ struct centaur_chip *centaur;
+
+ if ((part_id >> 28) != 8) {
+ prerror("CENTAUR: Invalid part ID 0x%x\n", part_id);
+ return NULL;
+ }
+ hchip_id = (part_id & 0x0fffffff) >> 4;
+ mchan = part_id & 0xf;
+
+ hchip = get_chip(hchip_id);
+ if (!hchip) {
+		prerror("CENTAUR: Centaur 0x%x not found on non-existent chip 0x%x\n",
+ part_id, hchip_id);
+ return NULL;
+ }
+ if (mchan >= MAX_CENTAURS_PER_CHIP) {
+ prerror("CENTAUR: Centaur 0x%x channel out of bounds !\n", part_id);
+ return NULL;
+ }
+ if (!hchip->centaurs) {
+		prerror("CENTAUR: Centaur 0x%x not found on chip 0x%x (no centaurs)\n",
+ part_id, hchip_id);
+ return NULL;
+ }
+ centaur = &hchip->centaurs[mchan];
+ if (!centaur->valid) {
+		prerror("CENTAUR: Centaur 0x%x not valid on chip 0x%x\n",
+ part_id, hchip_id);
+ return NULL;
+ }
+ return centaur;
+}
+
+/*
+ * Indirect XSCOM access functions. Copied from xscom.c; at a
+ * later date we should merge these properly.
+ */
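+/*
+ * As implemented below, an indirect access writes a request word holding
+ * the indirect address (and, for writes, the data) to the base SCOM
+ * register, then polls that same register until XSCOM_DATA_IND_COMPLETE
+ * is set with no XSCOM_DATA_IND_ERR bits, giving up after
+ * XSCOM_IND_MAX_RETRIES reads. For reads, the result is then taken from
+ * the XSCOM_DATA_IND_DATA field.
+ */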
+static void centaur_xscom_handle_ind_error(struct centaur_chip *centaur,
+ uint64_t data, uint64_t pcb_addr,
+ bool is_write)
+{
+ unsigned int stat = GETFIELD(XSCOM_DATA_IND_ERR, data);
+ bool timeout = !(data & XSCOM_DATA_IND_COMPLETE);
+
+ /* XXX: Create error log entry ? */
+ if (timeout)
+ cent_log(PR_ERR, centaur,
+			 "indirect %s timeout, pcb_addr=0x%llx stat=0x%x\n",
+ is_write ? "write" : "read", pcb_addr, stat);
+ else
+ cent_log(PR_ERR, centaur,
+ "indirect %s error, pcb_addr=0x%llx stat=0x%x\n",
+ is_write ? "write" : "read", pcb_addr, stat);
+}
+
+static int centaur_xscom_ind_read(struct centaur_chip *centaur,
+ uint64_t pcb_addr, uint64_t *val)
+{
+ uint32_t addr;
+ uint64_t data;
+ int rc, retries;
+
+ /* Write indirect address */
+ addr = pcb_addr & 0x7fffffff;
+ data = XSCOM_DATA_IND_READ |
+ (pcb_addr & XSCOM_ADDR_IND_ADDR);
+ rc = centaur_fsiscom_write(centaur, addr, data);
+ if (rc)
+ goto bail;
+
+ /* Wait for completion */
+ for (retries = 0; retries < XSCOM_IND_MAX_RETRIES; retries++) {
+ rc = centaur_fsiscom_read(centaur, addr, &data);
+ if (rc)
+ goto bail;
+ if ((data & XSCOM_DATA_IND_COMPLETE) &&
+ ((data & XSCOM_DATA_IND_ERR) == 0)) {
+ *val = data & XSCOM_DATA_IND_DATA;
+ break;
+ }
+ if ((data & XSCOM_DATA_IND_COMPLETE) ||
+ (retries >= XSCOM_IND_MAX_RETRIES)) {
+ centaur_xscom_handle_ind_error(centaur, data, pcb_addr,
+ false);
+ rc = OPAL_HARDWARE;
+ goto bail;
+ }
+ }
+ bail:
+ if (rc)
+ *val = (uint64_t)-1;
+ return rc;
+}
+
+static int centaur_xscom_ind_write(struct centaur_chip *centaur,
+ uint64_t pcb_addr, uint64_t val)
+{
+ uint32_t addr;
+ uint64_t data;
+ int rc, retries;
+
+ /* Write indirect address & data */
+ addr = pcb_addr & 0x7fffffff;
+ data = pcb_addr & XSCOM_ADDR_IND_ADDR;
+ data |= val & XSCOM_ADDR_IND_DATA;
+
+ rc = centaur_fsiscom_write(centaur, addr, data);
+ if (rc)
+ goto bail;
+
+ /* Wait for completion */
+ for (retries = 0; retries < XSCOM_IND_MAX_RETRIES; retries++) {
+ rc = centaur_fsiscom_read(centaur, addr, &data);
+ if (rc)
+ goto bail;
+ if ((data & XSCOM_DATA_IND_COMPLETE) &&
+ ((data & XSCOM_DATA_IND_ERR) == 0))
+ break;
+ if ((data & XSCOM_DATA_IND_COMPLETE) ||
+ (retries >= XSCOM_IND_MAX_RETRIES)) {
+ centaur_xscom_handle_ind_error(centaur, data, pcb_addr,
+ true);
+ rc = OPAL_HARDWARE;
+ goto bail;
+ }
+ }
+ bail:
+ return rc;
+}
+
+static int64_t centaur_xscom_read(struct scom_controller *scom,
+ uint32_t id __unused, uint64_t pcb_addr,
+ uint64_t *val)
+{
+ struct centaur_chip *centaur = scom->private;
+ int64_t rc;
+
+ if (!centaur)
+ return OPAL_PARAMETER;
+ if (!centaur->online)
+ return OPAL_XSCOM_CTR_OFFLINED;
+
+ lock(&centaur->lock);
+ if (pcb_addr & XSCOM_ADDR_IND_FLAG)
+ rc = centaur_xscom_ind_read(centaur, pcb_addr, val);
+ else
+ rc = centaur_fsiscom_read(centaur, pcb_addr, val);
+
+ /* We mark the centaur offline if we get too many errors on
+ * consecutive accesses
+ */
+ if (rc) {
+ centaur->error_count++;
+ if (centaur->error_count > CENTAUR_ERR_OFFLINE_THRESHOLD) {
+ centaur->online = false;
+ /**
+ * @fwts-label CentaurOfflinedTooManyErrors
+ * @fwts-advice OPAL marked a Centaur (memory buffer)
+ * as offline due to CENTAUR_ERR_OFFLINE_THRESHOLD (10)
+ * consecutive errors on XSCOMs to this centaur.
+ * OPAL will now return OPAL_XSCOM_CTR_OFFLINED and not
+ * try any further XSCOMs. This is likely caused by
+ * some hardware issue or PRD recovery issue.
+ */
+ prlog(PR_ERR, "CENTAUR: Offlined %x due to > %d consecutive XSCOM errors. No more XSCOMs to this centaur.\n",
+ id, CENTAUR_ERR_OFFLINE_THRESHOLD);
+ }
+ } else
+ centaur->error_count = 0;
+ unlock(&centaur->lock);
+
+ return rc;
+}
+
+static int64_t centaur_xscom_write(struct scom_controller *scom,
+ uint32_t id __unused, uint64_t pcb_addr,
+ uint64_t val)
+{
+ struct centaur_chip *centaur = scom->private;
+ int64_t rc;
+
+ if (!centaur)
+ return OPAL_PARAMETER;
+ if (!centaur->online)
+ return OPAL_XSCOM_CTR_OFFLINED;
+
+ lock(&centaur->lock);
+ if (pcb_addr & XSCOM_ADDR_IND_FLAG)
+ rc = centaur_xscom_ind_write(centaur, pcb_addr, val);
+ else
+ rc = centaur_fsiscom_write(centaur, pcb_addr, val);
+
+ /* We mark the centaur offline if we get too many errors on
+ * consecutive accesses
+ */
+ if (rc) {
+ centaur->error_count++;
+ if (centaur->error_count > CENTAUR_ERR_OFFLINE_THRESHOLD)
+ centaur->online = false;
+ } else
+ centaur->error_count = 0;
+ unlock(&centaur->lock);
+
+ return rc;
+}
+
+static bool centaur_check_id(struct centaur_chip *centaur)
+{
+ int64_t rc;
+ uint64_t val;
+
+ rc = centaur_fsiscom_read(centaur, 0xf000f, &val);
+ if (rc) {
+ cent_log(PR_ERR, centaur,
+ " FSISCOM error %lld reading ID register\n",
+ rc);
+ return false;
+ }
+
+ /* Extract CFAM id */
+ val >>= 44;
+
+ /* Identify chip */
+ if ((val & 0xff) != 0xe9) {
+ cent_log(PR_ERR, centaur,
+ " CFAM ID 0x%02x is not a Centaur !\n",
+ (unsigned int)(val & 0xff));
+ return false;
+ }
+
+ /* Get EC level from CFAM ID */
+ centaur->ec_level = ((val >> 16) & 0xf) << 4;
+ centaur->ec_level |= (val >> 8) & 0xf;
+
+ return true;
+}
+
+static bool centaur_add(uint32_t part_id, uint32_t mchip, uint32_t meng,
+ uint32_t mport)
+{
+ uint32_t hchip_id, mchan;
+ struct proc_chip *hchip;
+ struct centaur_chip *centaur;
+
+ if ((part_id >> 28) != 8) {
+ prerror("CENTAUR: Invalid part ID 0x%x\n", part_id);
+ return false;
+ }
+ hchip_id = (part_id & 0x0fffffff) >> 4;
+ mchan = part_id & 0xf;
+
+ printf("CENTAUR: Found centaur for chip 0x%x channel %d\n",
+ hchip_id, mchan);
+ printf("CENTAUR: FSI host: 0x%x cMFSI%d port %d\n",
+ mchip, meng, mport);
+
+ hchip = get_chip(hchip_id);
+ if (!hchip) {
+ prerror("CENTAUR: No such chip !!!\n");
+ return false;
+ }
+
+ if (mchan >= MAX_CENTAURS_PER_CHIP) {
+ prerror("CENTAUR: Channel out of bounds !\n");
+ return false;
+ }
+
+ if (!hchip->centaurs) {
+ hchip->centaurs =
+ zalloc(sizeof(struct centaur_chip) *
+ MAX_CENTAURS_PER_CHIP);
+ assert(hchip->centaurs);
+ }
+
+ centaur = &hchip->centaurs[mchan];
+ if (centaur->valid) {
+ prerror("CENTAUR: Duplicate centaur !\n");
+ return false;
+ }
+ centaur->part_id = part_id;
+ centaur->fsi_master_chip_id = mchip;
+ centaur->fsi_master_port = mport;
+ centaur->fsi_master_engine = meng ? MFSI_cMFSI1 : MFSI_cMFSI0;
+ centaur->online = true;
+ init_lock(&centaur->lock);
+ list_head_init(&centaur->i2cms);
+
+ if (!centaur_check_id(centaur))
+ return false;
+
+ centaur->scom.part_id = part_id;
+ centaur->scom.private = centaur;
+ centaur->scom.read = centaur_xscom_read;
+ centaur->scom.write = centaur_xscom_write;
+ scom_register(&centaur->scom);
+
+ cent_log(PR_INFO, centaur, "Found DD%x.%x chip\n",
+ centaur->ec_level >> 4,
+ centaur->ec_level & 0xf);
+
+ centaur->valid = true;
+ return true;
+}
+
+/* Returns how long to wait for logic to stop in TB ticks or a negative
+ * value on error
+ */
+int64_t centaur_disable_sensor_cache(uint32_t part_id)
+{
+ struct centaur_chip *centaur = get_centaur(part_id);
+ int64_t rc = 0;
+ uint64_t ctrl;
+
+ if (!centaur)
+ return false;
+
+ lock(&centaur->lock);
+ centaur->scache_disable_count++;
+ if (centaur->scache_disable_count == 1) {
+ centaur->scache_was_enabled = false;
+ rc = centaur_fsiscom_read(centaur, SCAC_CONFIG_REG, &ctrl);
+ if (rc)
+ goto bail;
+ centaur->scache_was_enabled = !!(ctrl & SCAC_ENABLE_MSK);
+ rc = centaur_fsiscom_write(centaur, SCAC_CONFIG_CLR, SCAC_ENABLE_MSK);
+ if (rc)
+ goto bail;
+ rc = msecs_to_tb(30);
+ }
+ bail:
+ unlock(&centaur->lock);
+ return rc;
+}
+
+int64_t centaur_enable_sensor_cache(uint32_t part_id)
+{
+ struct centaur_chip *centaur = get_centaur(part_id);
+ int64_t rc = 0;
+
+ if (!centaur)
+ return false;
+
+ lock(&centaur->lock);
+ if (centaur->scache_disable_count == 0) {
+ cent_log(PR_ERR, centaur, "Cache count going negative !\n");
+ backtrace();
+ goto bail;
+ }
+ centaur->scache_disable_count--;
+ if (centaur->scache_disable_count == 0 && centaur->scache_was_enabled)
+ rc = centaur_fsiscom_write(centaur, SCAC_CONFIG_SET, SCAC_ENABLE_MSK);
+ bail:
+ unlock(&centaur->lock);
+ return rc;
+}
+
+void centaur_init(void)
+{
+ struct dt_node *cn;
+
+ dt_for_each_compatible(dt_root, cn, "ibm,centaur") {
+ uint32_t chip_id, mchip, meng, mport;
+
+ chip_id = dt_prop_get_u32(cn, "ibm,chip-id");
+ mchip = dt_prop_get_u32(cn, "ibm,fsi-master-chip-id");
+ meng = dt_prop_get_cell(cn, "ibm,fsi-master-port", 0);
+ mport = dt_prop_get_cell(cn, "ibm,fsi-master-port", 1);
+
+ /*
+ * If adding the centaur succeeds, we expose it to
+ * Linux as a scom-controller
+ */
+ if (centaur_add(chip_id, mchip, meng, mport))
+ dt_add_property(cn, "scom-controller", NULL, 0);
+ }
+}
diff --git a/roms/skiboot/hw/chiptod.c b/roms/skiboot/hw/chiptod.c
new file mode 100644
index 000000000..7c0a1ffc7
--- /dev/null
+++ b/roms/skiboot/hw/chiptod.c
@@ -0,0 +1,2067 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Handle ChipTOD chip & configure core and CAPP timebases
+ *
+ * Copyright 2013-2019 IBM Corp.
+ */
+
+#define pr_fmt(fmt) "CHIPTOD: " fmt
+
+#include <skiboot.h>
+#include <xscom.h>
+#include <pci.h>
+#include <chiptod.h>
+#include <chip.h>
+#include <io.h>
+#include <cpu.h>
+#include <timebase.h>
+#include <opal-api.h>
+
+/* TOD chip XSCOM addresses */
+#define TOD_MASTER_PATH_CTRL 0x00040000 /* Master Path ctrl reg */
+#define TOD_PRI_PORT0_CTRL 0x00040001 /* Primary port0 ctrl reg */
+#define TOD_PRI_PORT1_CTRL 0x00040002 /* Primary port1 ctrl reg */
+#define TOD_SEC_PORT0_CTRL 0x00040003 /* Secondary p0 ctrl reg */
+#define TOD_SEC_PORT1_CTRL 0x00040004 /* Secondary p1 ctrl reg */
+#define TOD_SLAVE_PATH_CTRL 0x00040005 /* Slave Path ctrl reg */
+#define TOD_INTERNAL_PATH_CTRL 0x00040006 /* Internal Path ctrl reg */
+
+/* -- TOD primary/secondary master/slave control register -- */
+#define TOD_PSMS_CTRL 0x00040007
+#define TOD_PSMSC_PM_TOD_SELECT PPC_BIT(1) /* Primary Master TOD */
+#define TOD_PSMSC_PM_DRAW_SELECT PPC_BIT(2) /* Primary Master Drawer */
+#define TOD_PSMSC_SM_TOD_SELECT PPC_BIT(9) /* Secondary Master TOD */
+#define TOD_PSMSC_SM_DRAW_SELECT PPC_BIT(10) /* Secondary Master Draw */
+
+/* -- TOD primary/secondary master/slave status register -- */
+#define TOD_STATUS 0x00040008
+#define TOD_ST_TOPOLOGY_SELECT PPC_BITMASK(0, 2)
+#define TOD_ST_MPATH0_STEP_VALID PPC_BIT(6) /* MasterPath0 step valid */
+#define TOD_ST_MPATH1_STEP_VALID PPC_BIT(7) /* MasterPath1 step valid */
+#define TOD_ST_SPATH0_STEP_VALID PPC_BIT(8) /* SlavePath0 step valid */
+#define TOD_ST_SPATH1_STEP_VALID PPC_BIT(10) /* SlavePath1 step valid */
+/* Primary master/slave path select (0 = PATH_0, 1 = PATH_1) */
+#define TOD_ST_PRI_MPATH_SELECT PPC_BIT(12) /* Primary MPath Select */
+#define TOD_ST_PRI_SPATH_SELECT PPC_BIT(15) /* Primary SPath Select */
+/* Secondary master/slave path select (0 = PATH_0, 1 = PATH_1) */
+#define TOD_ST_SEC_MPATH_SELECT PPC_BIT(16) /* Secondary MPath Select */
+#define TOD_ST_SEC_SPATH_SELECT PPC_BIT(19) /* Secondary SPath Select */
+#define TOD_ST_ACTIVE_MASTER PPC_BIT(23)
+#define TOD_ST_BACKUP_MASTER PPC_BIT(24)
+
+/* TOD chip XSCOM addresses */
+#define TOD_CHIP_CTRL 0x00040010 /* Chip control register */
+#define TOD_TTYPE_0 0x00040011
+#define TOD_TTYPE_1 0x00040012 /* PSS switch */
+#define TOD_TTYPE_2 0x00040013 /* Enable step checkers */
+#define TOD_TTYPE_3 0x00040014 /* Request TOD */
+#define TOD_TTYPE_4 0x00040015 /* Send TOD */
+#define TOD_TTYPE_5 0x00040016 /* Invalidate TOD */
+#define TOD_CHIPTOD_TO_TB 0x00040017
+#define TOD_LOAD_TOD_MOD 0x00040018
+#define TOD_CHIPTOD_VALUE 0x00040020
+#define TOD_CHIPTOD_LOAD_TB 0x00040021
+#define TOD_CHIPTOD_FSM 0x00040024
+
+/* -- TOD PIB Master reg -- */
+#define TOD_PIB_MASTER 0x00040027
+#define TOD_PIBM_ADDR_CFG_MCAST PPC_BIT(25)
+#define TOD_PIBM_ADDR_CFG_SLADDR PPC_BITMASK(26, 31)
+#define TOD_PIBM_TTYPE4_SEND_MODE PPC_BIT(32)
+#define TOD_PIBM_TTYPE4_SEND_ENBL PPC_BIT(33)
+
+/* -- TOD Error interrupt register -- */
+#define TOD_ERROR 0x00040030
+/* SYNC errors */
+#define TOD_ERR_CRMO_PARITY PPC_BIT(0)
+#define TOD_ERR_OSC0_PARITY PPC_BIT(1)
+#define TOD_ERR_OSC1_PARITY PPC_BIT(2)
+#define TOD_ERR_PPORT0_CREG_PARITY PPC_BIT(3)
+#define TOD_ERR_PPORT1_CREG_PARITY PPC_BIT(4)
+#define TOD_ERR_SPORT0_CREG_PARITY PPC_BIT(5)
+#define TOD_ERR_SPORT1_CREG_PARITY PPC_BIT(6)
+#define TOD_ERR_SPATH_CREG_PARITY PPC_BIT(7)
+#define TOD_ERR_IPATH_CREG_PARITY PPC_BIT(8)
+#define TOD_ERR_PSMS_CREG_PARITY PPC_BIT(9)
+#define TOD_ERR_CRITC_PARITY PPC_BIT(13)
+#define TOD_ERR_MP0_STEP_CHECK PPC_BIT(14)
+#define TOD_ERR_MP1_STEP_CHECK PPC_BIT(15)
+#define TOD_ERR_PSS_HAMMING_DISTANCE PPC_BIT(18)
+#define TOD_ERR_DELAY_COMPL_PARITY PPC_BIT(22)
+/* CNTR errors */
+#define TOD_ERR_CTCR_PARITY PPC_BIT(32)
+#define TOD_ERR_TOD_SYNC_CHECK PPC_BIT(33)
+#define TOD_ERR_TOD_FSM_PARITY PPC_BIT(34)
+#define TOD_ERR_TOD_REGISTER_PARITY PPC_BIT(35)
+#define TOD_ERR_OVERFLOW_YR2042 PPC_BIT(36)
+#define TOD_ERR_TOD_WOF_LSTEP_PARITY PPC_BIT(37)
+#define TOD_ERR_TTYPE0_RECVD PPC_BIT(38)
+#define TOD_ERR_TTYPE1_RECVD PPC_BIT(39)
+#define TOD_ERR_TTYPE2_RECVD PPC_BIT(40)
+#define TOD_ERR_TTYPE3_RECVD PPC_BIT(41)
+#define TOD_ERR_TTYPE4_RECVD PPC_BIT(42)
+#define TOD_ERR_TTYPE5_RECVD PPC_BIT(43)
+
+/* -- TOD Error interrupt register -- */
+#define TOD_ERROR_INJECT 0x00040031
+
+/* PC unit PIB address which receives the timebase transfer from TOD */
+#define PC_TOD 0x4A3
+
+/* Local FIR EH.TPCHIP.TPC.LOCAL_FIR */
+#define LOCAL_CORE_FIR 0x0104000C
+#define LFIR_SWITCH_COMPLETE PPC_BIT(18)
+
+/* Number of iterations for the various timeouts */
+#define TIMEOUT_LOOPS 20000000
+
+/* TOD active Primary/secondary configuration */
+#define TOD_PRI_CONF_IN_USE	0	/* TOD using primary topology */
+#define TOD_SEC_CONF_IN_USE	7	/* TOD using secondary topology */
+
+/* Timebase State Machine error state */
+#define TBST_STATE_ERROR 9
+
+static enum chiptod_type {
+ chiptod_unknown,
+ chiptod_p8,
+ chiptod_p9,
+ chiptod_p10,
+} chiptod_type;
+
+enum chiptod_chip_role {
+ chiptod_chip_role_UNKNOWN = -1,
+ chiptod_chip_role_MDMT = 0, /* Master Drawer Master TOD */
+ chiptod_chip_role_MDST, /* Master Drawer Slave TOD */
+ chiptod_chip_role_SDMT, /* Slave Drawer Master TOD */
+ chiptod_chip_role_SDST, /* Slave Drawer Slave TOD */
+};
+
+enum chiptod_chip_status {
+ chiptod_active_master = 0, /* Chip TOD is Active master */
+ chiptod_backup_master = 1, /* Chip TOD is backup master */
+ chiptod_backup_disabled, /* Chip TOD is backup but disabled */
+};
+
+struct chiptod_chip_config_info {
+ int32_t id; /* chip id */
+ enum chiptod_chip_role role; /* Chip role */
+ enum chiptod_chip_status status; /* active/backup/disabled */
+};
+
+static int32_t chiptod_primary = -1;
+static int32_t chiptod_secondary = -1;
+static enum chiptod_topology current_topology = chiptod_topo_unknown;
+
+/*
+ * chiptod_topology_info holds primary/secondary chip configuration info.
+ * This info is initialized during chiptod_init(). This is an array of two:
+ * [0] = [chiptod_topo_primary] = Primary topology config info
+ * [1] = [chiptod_topo_secondary] = Secondary topology config info
+ */
+static struct chiptod_chip_config_info chiptod_topology_info[2];
+
+/*
+ * Array of TOD control registers that holds last known valid values.
+ *
+ * Cache chiptod control register values at following instances:
+ * 1. Chiptod initialization
+ * 2. After topology switch is complete.
+ * 3. Upon receiving enable/disable topology request from FSP.
+ *
+ * Cache following chip TOD control registers:
+ * - Master Path control register (0x00040000)
+ * - Primary Port-0 control register (0x00040001)
+ * - Primary Port-1 control register (0x00040002)
+ * - Secondary Port-0 control register (0x00040003)
+ * - Secondary Port-1 control register (0x00040004)
+ * - Slave Path control register (0x00040005)
+ * - Internal Path control register (0x00040006)
+ * - Primary/secondary master/slave control register (0x00040007)
+ * - Chip control register (0x00040010)
+ *
+ * This data is used for restoring respective TOD registers to sane values
+ * whenever parity errors are reported on these registers (through HMI).
+ * The error_bit maps to corresponding bit from TOD error register that
+ * reports parity error on respective TOD registers.
+ */
+static struct chiptod_tod_regs {
+ /* error bit from TOD Error reg */
+ const uint64_t error_bit;
+
+ /* xscom address of TOD register to be restored. */
+ const uint64_t xscom_addr;
+ /* per chip cached value of TOD control registers to be restored. */
+ struct {
+ uint64_t data;
+ bool valid;
+ } val[MAX_CHIPS];
+} chiptod_tod_regs[] = {
+ { TOD_ERR_CRMO_PARITY, TOD_MASTER_PATH_CTRL, { } },
+ { TOD_ERR_PPORT0_CREG_PARITY, TOD_PRI_PORT0_CTRL, { } },
+ { TOD_ERR_PPORT1_CREG_PARITY, TOD_PRI_PORT1_CTRL, { } },
+ { TOD_ERR_SPORT0_CREG_PARITY, TOD_SEC_PORT0_CTRL, { } },
+ { TOD_ERR_SPORT1_CREG_PARITY, TOD_SEC_PORT1_CTRL, { } },
+ { TOD_ERR_SPATH_CREG_PARITY, TOD_SLAVE_PATH_CTRL, { } },
+ { TOD_ERR_IPATH_CREG_PARITY, TOD_INTERNAL_PATH_CTRL, { } },
+ { TOD_ERR_PSMS_CREG_PARITY, TOD_PSMS_CTRL, { } },
+ { TOD_ERR_CTCR_PARITY, TOD_CHIP_CTRL, { } },
+};
+
+/* The base TFMR value is the same for the whole machine
+ * for now as far as I can tell
+ */
+static uint64_t base_tfmr;
+
+/*
+ * For now, we use a global lock for runtime chiptod operations,
+ * eventually make this a per-core lock for wakeup rsync and
+ * take all of them for RAS cases.
+ */
+static struct lock chiptod_lock = LOCK_UNLOCKED;
+static bool chiptod_unrecoverable;
+
+#define NUM_SYNC_RETRIES 10
+
+static void _chiptod_cache_tod_regs(int32_t chip_id)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(chiptod_tod_regs); i++) {
+ if (xscom_read(chip_id, chiptod_tod_regs[i].xscom_addr,
+ &(chiptod_tod_regs[i].val[chip_id].data))) {
+ prerror("XSCOM error reading 0x%08llx reg.\n",
+ chiptod_tod_regs[i].xscom_addr);
+ /* Invalidate this record and continue */
+ chiptod_tod_regs[i].val[chip_id].valid = 0;
+ continue;
+ }
+ chiptod_tod_regs[i].val[chip_id].valid = 1;
+ }
+}
+
+static void chiptod_cache_tod_registers(void)
+{
+ struct proc_chip *chip;
+
+ for_each_chip(chip)
+ _chiptod_cache_tod_regs(chip->id);
+}
+
+static void print_topo_info(enum chiptod_topology topo)
+{
+ const char *role[] = { "Unknown", "MDMT", "MDST", "SDMT", "SDST" };
+ const char *status[] = { "Unknown",
+ "Active Master", "Backup Master", "Backup Master Disabled" };
+
+ prlog(PR_DEBUG, " Chip id: %d, Role: %s, Status: %s\n",
+ chiptod_topology_info[topo].id,
+ role[chiptod_topology_info[topo].role + 1],
+ status[chiptod_topology_info[topo].status + 1]);
+}
+
+static void print_topology_info(void)
+{
+ const char *topo[] = { "Unknown", "Primary", "Secondary" };
+
+ if (current_topology < 0)
+ return;
+
+ prlog(PR_DEBUG, "TOD Topology in Use: %s\n",
+ topo[current_topology+1]);
+ prlog(PR_DEBUG, " Primary configuration:\n");
+ print_topo_info(chiptod_topo_primary);
+ prlog(PR_DEBUG, " Secondary configuration:\n");
+ print_topo_info(chiptod_topo_secondary);
+}
+
+static enum chiptod_topology query_current_topology(void)
+{
+ uint64_t tod_status;
+
+ if (xscom_readme(TOD_STATUS, &tod_status)) {
+ prerror("XSCOM error reading TOD_STATUS reg\n");
+ return chiptod_topo_unknown;
+ }
+
+ /*
+	 * TOD status register bits [0-2] indicate the configuration in use:
+	 *   000 = primary configuration
+	 *   111 = secondary configuration
+ */
+ if ((tod_status & TOD_ST_TOPOLOGY_SELECT) == TOD_PRI_CONF_IN_USE)
+ return chiptod_topo_primary;
+ else
+ return chiptod_topo_secondary;
+}
+
+static enum chiptod_chip_role
+chiptod_get_chip_role(enum chiptod_topology topology, int32_t chip_id)
+{
+ uint64_t tod_ctrl;
+ enum chiptod_chip_role role = chiptod_chip_role_UNKNOWN;
+
+ if (chip_id < 0)
+ return role;
+
+ if (xscom_read(chip_id, TOD_PSMS_CTRL, &tod_ctrl)) {
+ prerror("XSCOM error reading TOD_PSMS_CTRL\n");
+ return chiptod_chip_role_UNKNOWN;
+ }
+
+ switch (topology) {
+ case chiptod_topo_primary:
+ if (tod_ctrl & TOD_PSMSC_PM_DRAW_SELECT) {
+ if (tod_ctrl & TOD_PSMSC_PM_TOD_SELECT)
+ role = chiptod_chip_role_MDMT;
+ else
+ role = chiptod_chip_role_MDST;
+ } else {
+ if (tod_ctrl & TOD_PSMSC_PM_TOD_SELECT)
+ role = chiptod_chip_role_SDMT;
+ else
+ role = chiptod_chip_role_SDST;
+ }
+ break;
+ case chiptod_topo_secondary:
+ if (tod_ctrl & TOD_PSMSC_SM_DRAW_SELECT) {
+ if (tod_ctrl & TOD_PSMSC_SM_TOD_SELECT)
+ role = chiptod_chip_role_MDMT;
+ else
+ role = chiptod_chip_role_MDST;
+ } else {
+ if (tod_ctrl & TOD_PSMSC_SM_TOD_SELECT)
+ role = chiptod_chip_role_SDMT;
+ else
+ role = chiptod_chip_role_SDST;
+ }
+ break;
+ case chiptod_topo_unknown:
+ default:
+ break;
+ }
+ return role;
+}
+
+/*
+ * Check and return the status of sync step network for a given
+ * topology configuration.
+ * Return values:
+ * true: Sync Step network is running
+ * false: Sync Step network is not running
+ */
+static bool chiptod_sync_step_check_running(enum chiptod_topology topology)
+{
+ uint64_t tod_status;
+ enum chiptod_chip_role role;
+ bool running = false;
+ int32_t chip_id = chiptod_topology_info[topology].id;
+
+ /* Sanity check */
+ if (chip_id < 0)
+ return false;
+
+ if (xscom_read(chip_id, TOD_STATUS, &tod_status)) {
+ prerror("XSCOM error reading TOD_STATUS reg\n");
+ return false;
+ }
+
+ switch (topology) {
+ case chiptod_topo_primary:
+ /* Primary configuration */
+ role = chiptod_topology_info[topology].role;
+ if (role == chiptod_chip_role_MDMT) {
+ /*
+ * Chip is using Master path.
+ * Check if it is using path_0/path_1 and then
+ * validity of that path.
+ *
+ * TOD_STATUS[12]: 0 = PATH_0, 1 = PATH_1
+ */
+ if (tod_status & TOD_ST_PRI_MPATH_SELECT) {
+ if (tod_status & TOD_ST_MPATH1_STEP_VALID)
+ running = true;
+ } else {
+ if (tod_status & TOD_ST_MPATH0_STEP_VALID)
+ running = true;
+ }
+ } else {
+ /*
+ * Chip is using Slave path.
+ *
+ * TOD_STATUS[15]: 0 = PATH_0, 1 = PATH_1
+ */
+ if (tod_status & TOD_ST_PRI_SPATH_SELECT) {
+ if (tod_status & TOD_ST_SPATH1_STEP_VALID)
+ running = true;
+ } else {
+ if (tod_status & TOD_ST_SPATH0_STEP_VALID)
+ running = true;
+ }
+ }
+ break;
+ case chiptod_topo_secondary:
+ /* Secondary configuration */
+ role = chiptod_topology_info[topology].role;
+ if (role == chiptod_chip_role_MDMT) {
+ /*
+ * Chip is using Master path.
+ * Check if it is using path_0/path_1 and then
+ * validity of that path.
+ *
+ * TOD_STATUS[12]: 0 = PATH_0, 1 = PATH_1
+ */
+ if (tod_status & TOD_ST_SEC_MPATH_SELECT) {
+ if (tod_status & TOD_ST_MPATH1_STEP_VALID)
+ running = true;
+ } else {
+ if (tod_status & TOD_ST_MPATH0_STEP_VALID)
+ running = true;
+ }
+ } else {
+ /*
+ * Chip is using Slave path.
+ *
+ * TOD_STATUS[15]: 0 = PATH_0, 1 = PATH_1
+ */
+ if (tod_status & TOD_ST_SEC_SPATH_SELECT) {
+ if (tod_status & TOD_ST_SPATH1_STEP_VALID)
+ running = true;
+ } else {
+ if (tod_status & TOD_ST_SPATH0_STEP_VALID)
+ running = true;
+ }
+ }
+ break;
+ default:
+ break;
+ }
+ return running;
+}
+
+static enum chiptod_chip_status _chiptod_get_chip_status(int32_t chip_id)
+{
+ uint64_t tod_status;
+ enum chiptod_chip_status status = -1;
+
+ if (chip_id < 0)
+ return chiptod_backup_disabled;
+
+ if (xscom_read(chip_id, TOD_STATUS, &tod_status)) {
+ prerror("XSCOM error reading TOD_STATUS reg\n");
+ return status;
+ }
+
+ if (tod_status & TOD_ST_ACTIVE_MASTER)
+ status = chiptod_active_master;
+ else if (tod_status & TOD_ST_BACKUP_MASTER)
+ status = chiptod_backup_master;
+
+ return status;
+}
+
+static enum chiptod_chip_status
+chiptod_get_chip_status(enum chiptod_topology topology)
+{
+ return _chiptod_get_chip_status(chiptod_topology_info[topology].id);
+}
+
+static void chiptod_update_topology(enum chiptod_topology topo)
+{
+ int32_t chip_id = chiptod_topology_info[topo].id;
+
+ if (chip_id < 0)
+ return;
+
+ chiptod_topology_info[topo].role = chiptod_get_chip_role(topo, chip_id);
+ chiptod_topology_info[topo].status = chiptod_get_chip_status(topo);
+
+ /*
+ * If chip TOD on this topology is a backup master then check if
+ * sync/step network is running on this topology. If not,
+ * then mark status as backup not valid.
+ */
+ if ((chiptod_topology_info[topo].status == chiptod_backup_master) &&
+ !chiptod_sync_step_check_running(topo))
+ chiptod_topology_info[topo].status = chiptod_backup_disabled;
+}
+
+static void chiptod_setup_base_tfmr(void)
+{
+ struct dt_node *cpu = this_cpu()->node;
+ uint64_t core_freq, tod_freq;
+ uint64_t mcbs;
+
+ base_tfmr = SPR_TFMR_TB_ECLIPZ;
+
+ /* Get CPU and TOD freqs in Hz */
+ if (dt_has_node_property(cpu, "ibm,extended-clock-frequency", NULL))
+ core_freq = dt_prop_get_u64(cpu, "ibm,extended-clock-frequency");
+ else
+ core_freq = dt_prop_get_u32(cpu, "clock-frequency");
+
+ if (!core_freq) {
+ prlog(PR_ERR, "CPU clock frequency is not set\n");
+ abort();
+ }
+
+ tod_freq = 32000000;
+
+ /* Calculate the "Max Cycles Between Steps" value according
+ * to the magic formula:
+ *
+ * mcbs = (core_freq * max_jitter_factor) / (4 * tod_freq) / 100;
+ *
+ * The max jitter factor is set to 240 based on what pHyp uses.
+ */
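+	/*
+	 * For instance, assuming a hypothetical 4 GHz core clock and the
+	 * fixed 32 MHz TOD clock, this works out to
+	 * (4000000000 * 240) / (4 * 32000000) / 100 = 75 = 0x4b.
+	 */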
+ mcbs = (core_freq * 240) / (4 * tod_freq) / 100;
+ prlog(PR_INFO, "Calculated MCBS is 0x%llx"
+ " (Cfreq=%lld Tfreq=%lld)\n",
+ mcbs, core_freq, tod_freq);
+
+ /* Bake that all into TFMR */
+ base_tfmr = SETFIELD(SPR_TFMR_MAX_CYC_BET_STEPS, base_tfmr, mcbs);
+ base_tfmr = SETFIELD(SPR_TFMR_N_CLKS_PER_STEP, base_tfmr, 0);
+ base_tfmr = SETFIELD(SPR_TFMR_SYNC_BIT_SEL, base_tfmr, 4);
+}
+
+static bool chiptod_mod_tb(void)
+{
+ uint64_t tfmr = base_tfmr;
+ uint64_t timeout = 0;
+
+ /* Switch timebase to "Not Set" state */
+ mtspr(SPR_TFMR, tfmr | SPR_TFMR_LOAD_TOD_MOD);
+ do {
+ if (++timeout >= (TIMEOUT_LOOPS*2)) {
+ prerror("TB \"Not Set\" timeout\n");
+ return false;
+ }
+ tfmr = mfspr(SPR_TFMR);
+ if (tfmr & SPR_TFMR_TFMR_CORRUPT) {
+ prerror("TB \"Not Set\" TFMR corrupt\n");
+ return false;
+ }
+ if (GETFIELD(SPR_TFMR_TBST_ENCODED, tfmr) == 9) {
+ prerror("TB \"Not Set\" TOD in error state\n");
+ return false;
+ }
+ } while (tfmr & SPR_TFMR_LOAD_TOD_MOD);
+
+ return true;
+}
+
+static bool chiptod_interrupt_check(void)
+{
+ uint64_t tfmr;
+ uint64_t timeout = 0;
+
+ do {
+ if (++timeout >= TIMEOUT_LOOPS) {
+ prerror("Interrupt check fail\n");
+ return false;
+ }
+ tfmr = mfspr(SPR_TFMR);
+ if (tfmr & SPR_TFMR_TFMR_CORRUPT) {
+ prerror("Interrupt check TFMR corrupt !\n");
+ return false;
+ }
+ } while (tfmr & SPR_TFMR_CHIP_TOD_INTERRUPT);
+
+ return true;
+}
+
+static bool chiptod_running_check(uint32_t chip_id)
+{
+ uint64_t tval;
+
+ if (xscom_read(chip_id, TOD_CHIPTOD_FSM, &tval)) {
+ prerror("XSCOM error polling run\n");
+ return false;
+ }
+ if (tval & 0x0800000000000000UL)
+ return true;
+ else
+ return false;
+}
+
+static bool chiptod_poll_running(void)
+{
+ uint64_t timeout = 0;
+ uint64_t tval;
+
+ /* Chip TOD running check */
+ do {
+ if (++timeout >= TIMEOUT_LOOPS) {
+ prerror("Running check fail timeout\n");
+ return false;
+ }
+ if (xscom_readme(TOD_CHIPTOD_FSM, &tval)) {
+ prerror("XSCOM error polling run\n");
+ return false;
+ }
+ } while (!(tval & 0x0800000000000000UL));
+
+ return true;
+}
+
+static bool chiptod_to_tb(void)
+{
+ uint32_t pir = this_cpu()->pir;
+ uint64_t tval, tfmr;
+ uint64_t timeout = 0;
+
+ /* Tell the ChipTOD about our fabric address
+ *
+ * The pib_master value is calculated from the CPU core ID, given in
+ * the PIR. Because we have different core/thread arrangements in the
+ * PIR between p7 and p8, we need to do the calculation differently.
+ *
+ * p7: 0b00001 || 3-bit core id
+ * p8: 0b0001 || 4-bit core id
+ * p9: 0b001 || 5-bit core id
+ * p10: 0b001 || 5-bit core id
+ *
+ * However in P10 we don't use the core ID addressing, but rather core
+ * scom addressing mode, which appears to work better.
+ */
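+	/*
+	 * For example, a hypothetical P9 thread with PIR 0x0010 yields
+	 * ((0x0010 >> 2) & 0x1f) | 0x20 = 0x24, i.e. 0b001 followed by
+	 * the 5-bit core id 0b00100.
+	 */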
+
+ if (xscom_readme(TOD_PIB_MASTER, &tval)) {
+ prerror("XSCOM error reading PIB_MASTER\n");
+ return false;
+ }
+
+ if (chiptod_type == chiptod_p10) {
+ uint32_t core_id = pir_to_core_id(pir);
+
+ if (this_cpu()->is_fused_core &&
+ PVR_VERS_MAJ(mfspr(SPR_PVR)) == 2) {
+ /* Workaround: must address the even small core. */
+ core_id &= ~1;
+ }
+
+ tval = XSCOM_ADDR_P10_EC(core_id, PC_TOD);
+
+ tval <<= 32; /* PIB slave address goes in PPC bits [0:31] */
+
+ tval |= PPC_BIT(35); /* Enable SCOM addressing. */
+
+ } else {
+ uint64_t tvbits;
+
+ if (chiptod_type == chiptod_p9) {
+ tvbits = (pir >> 2) & 0x1f;
+ tvbits |= 0x20;
+ } else if (chiptod_type == chiptod_p8) {
+ tvbits = (pir >> 3) & 0xf;
+ tvbits |= 0x10;
+ } else {
+ tvbits = (pir >> 2) & 0x7;
+ tvbits |= 0x08;
+ }
+ tval &= ~TOD_PIBM_ADDR_CFG_MCAST;
+ tval = SETFIELD(TOD_PIBM_ADDR_CFG_SLADDR, tval, tvbits);
+ }
+
+ if (xscom_writeme(TOD_PIB_MASTER, tval)) {
+ prerror("XSCOM error writing PIB_MASTER\n");
+ return false;
+ }
+
+ /* Make us ready to get the TB from the chipTOD */
+ mtspr(SPR_TFMR, base_tfmr | SPR_TFMR_MOVE_CHIP_TOD_TO_TB);
+
+ /* Tell the ChipTOD to send it */
+ if (xscom_writeme(TOD_CHIPTOD_TO_TB, PPC_BIT(0))) {
+ prerror("XSCOM error writing CHIPTOD_TO_TB\n");
+ return false;
+ }
+
+ /* Wait for it to complete */
+ timeout = 0;
+ do {
+ if (++timeout >= TIMEOUT_LOOPS) {
+ prerror("Chip to TB timeout\n");
+ return false;
+ }
+ tfmr = mfspr(SPR_TFMR);
+ if (tfmr & SPR_TFMR_TFMR_CORRUPT) {
+ prerror("MoveToTB: corrupt TFMR !\n");
+ return false;
+ }
+ } while (tfmr & SPR_TFMR_MOVE_CHIP_TOD_TO_TB);
+
+ return true;
+}
+
+static bool chiptod_check_tb_running(void)
+{
+ /* We used to wait for two SYNC pulses in TFMR but that
+ * doesn't seem to occur in sim, so instead we use a
+ * method similar to what pHyp does which is to check for
+ * TFMR SPR_TFMR_TB_VALID and not SPR_TFMR_TFMR_CORRUPT
+ */
+#if 0
+ uint64_t tfmr, timeout;
+ unsigned int i;
+
+ for (i = 0; i < 2; i++) {
+ tfmr = mfspr(SPR_TFMR);
+ tfmr &= ~SPR_TFMR_TB_SYNC_OCCURED;
+ mtspr(SPR_TFMR, tfmr);
+ timeout = 0;
+ do {
+ if (++timeout >= TIMEOUT_LOOPS) {
+ prerror("CHIPTOD: No sync pulses\n");
+ return false;
+ }
+ tfmr = mfspr(SPR_TFMR);
+ } while (!(tfmr & SPR_TFMR_TB_SYNC_OCCURED));
+ }
+#else
+ uint64_t tfmr = mfspr(SPR_TFMR);
+
+ return (tfmr & SPR_TFMR_TB_VALID) &&
+ !(tfmr & SPR_TFMR_TFMR_CORRUPT);
+#endif
+ return true;
+}
+
+static bool chiptod_reset_tb_errors(void)
+{
+ uint64_t tfmr;
+ unsigned long timeout = 0;
+
+ /* Ask for automatic clear of errors */
+ tfmr = base_tfmr | SPR_TFMR_CLEAR_TB_ERRORS;
+
+ /* Additionally pHyp sets these (write-1-to-clear ?) */
+ tfmr |= SPR_TFMR_TB_MISSING_SYNC;
+ tfmr |= SPR_TFMR_TB_MISSING_STEP;
+ tfmr |= SPR_TFMR_TB_RESIDUE_ERR;
+ mtspr(SPR_TFMR, tfmr);
+
+ /* We have to write "Clear TB Errors" again */
+ tfmr = base_tfmr | SPR_TFMR_CLEAR_TB_ERRORS;
+ mtspr(SPR_TFMR, tfmr);
+
+ do {
+ if (++timeout >= TIMEOUT_LOOPS) {
+ /* Don't actually do anything on error for
+ * now ... not much we can do, panic maybe ?
+ */
+ prerror("TB error reset timeout !\n");
+ return false;
+ }
+ tfmr = mfspr(SPR_TFMR);
+ if (tfmr & SPR_TFMR_TFMR_CORRUPT) {
+ prerror("TB error reset: corrupt TFMR !\n");
+ return false;
+ }
+ } while (tfmr & SPR_TFMR_CLEAR_TB_ERRORS);
+ return true;
+}
+
+static void chiptod_cleanup_thread_tfmr(void)
+{
+ uint64_t tfmr = base_tfmr;
+
+ tfmr |= SPR_TFMR_PURR_PARITY_ERR;
+ tfmr |= SPR_TFMR_SPURR_PARITY_ERR;
+ tfmr |= SPR_TFMR_DEC_PARITY_ERR;
+ tfmr |= SPR_TFMR_TFMR_CORRUPT;
+ tfmr |= SPR_TFMR_PURR_OVERFLOW;
+ tfmr |= SPR_TFMR_SPURR_OVERFLOW;
+ mtspr(SPR_TFMR, tfmr);
+}
+
+static void chiptod_reset_tod_errors(void)
+{
+ uint64_t terr;
+
+ /*
+ * At boot, we clear the errors that the firmware is
+ * supposed to handle. List provided by the pHyp folks.
+ */
+
+ terr = TOD_ERR_CRITC_PARITY;
+ terr |= TOD_ERR_PSS_HAMMING_DISTANCE;
+ terr |= TOD_ERR_DELAY_COMPL_PARITY;
+ terr |= TOD_ERR_CTCR_PARITY;
+ terr |= TOD_ERR_TOD_SYNC_CHECK;
+ terr |= TOD_ERR_TOD_FSM_PARITY;
+ terr |= TOD_ERR_TOD_REGISTER_PARITY;
+
+ if (xscom_writeme(TOD_ERROR, terr)) {
+ prerror("XSCOM error writing TOD_ERROR !\n");
+ /* Not much we can do here ... abort ? */
+ }
+}
+
+static void chiptod_sync_master(void *data)
+{
+ uint64_t initial_tb_value;
+ bool *result = data;
+
+ prlog(PR_DEBUG, "Master sync on CPU PIR 0x%04x...\n",
+ this_cpu()->pir);
+
+ /* Apply base tfmr */
+ mtspr(SPR_TFMR, base_tfmr);
+
+ /* From recipe provided by pHyp folks, reset various errors
+ * before attempting the sync
+ */
+ chiptod_reset_tb_errors();
+
+ /* Cleanup thread tfmr bits */
+ chiptod_cleanup_thread_tfmr();
+
+ /* Reset errors in the chiptod itself */
+ chiptod_reset_tod_errors();
+
+ /* Switch timebase to "Not Set" state */
+ if (!chiptod_mod_tb())
+ goto error;
+ prlog(PR_INSANE, "SYNC MASTER Step 2 TFMR=0x%016lx\n", mfspr(SPR_TFMR));
+
+ /* Chip TOD step checkers enable */
+ if (xscom_writeme(TOD_TTYPE_2, PPC_BIT(0))) {
+ prerror("XSCOM error enabling steppers\n");
+ goto error;
+ }
+
+ prlog(PR_INSANE, "SYNC MASTER Step 3 TFMR=0x%016lx\n", mfspr(SPR_TFMR));
+
+ /* Chip TOD interrupt check */
+ if (!chiptod_interrupt_check())
+ goto error;
+ prlog(PR_INSANE, "SYNC MASTER Step 4 TFMR=0x%016lx\n", mfspr(SPR_TFMR));
+
+ /* Switch local chiptod to "Not Set" state */
+ if (xscom_writeme(TOD_LOAD_TOD_MOD, PPC_BIT(0))) {
+ prerror("XSCOM error sending LOAD_TOD_MOD\n");
+ goto error;
+ }
+
+ /* Switch all remote chiptod to "Not Set" state */
+ if (xscom_writeme(TOD_TTYPE_5, PPC_BIT(0))) {
+ prerror("XSCOM error sending TTYPE_5\n");
+ goto error;
+ }
+
+ /*
+ * Load the master's current timebase value into the Chip TOD
+ * network. This is so we have sane timestamps across the whole
+ * IPL process. The Chip TOD documentation says that the loaded
+ * value needs to be one STEP before a SYNC. In other words,
+ * set the low bits to 0x1ff0.
+ */
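+	/*
+	 * E.g. a hypothetical current timebase of 0x12345678 would be
+	 * loaded as (0x12345678 & ~0x1fff) | 0x1ff0 = 0x12345ff0.
+	 */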
+ initial_tb_value = (mftb() & ~0x1fff) | 0x1ff0;
+
+ /* Chip TOD load initial value */
+ if (xscom_writeme(TOD_CHIPTOD_LOAD_TB, initial_tb_value)) {
+ prerror("XSCOM error setting init TB\n");
+ goto error;
+ }
+
+ prlog(PR_INSANE, "SYNC MASTER Step 5 TFMR=0x%016lx\n", mfspr(SPR_TFMR));
+
+ if (!chiptod_poll_running())
+ goto error;
+ prlog(PR_INSANE, "SYNC MASTER Step 6 TFMR=0x%016lx\n", mfspr(SPR_TFMR));
+
+ /* Move chiptod value to core TB */
+ if (!chiptod_to_tb())
+ goto error;
+ prlog(PR_INSANE, "SYNC MASTER Step 7 TFMR=0x%016lx\n", mfspr(SPR_TFMR));
+
+ /* Send local chip TOD to all chips TOD */
+ if (xscom_writeme(TOD_TTYPE_4, PPC_BIT(0))) {
+ prerror("XSCOM error sending TTYPE_4\n");
+ goto error;
+ }
+
+ /* Check if TB is running */
+ if (!chiptod_check_tb_running())
+ goto error;
+
+ prlog(PR_INSANE, "Master sync completed, TB=%lx\n", mfspr(SPR_TBRL));
+
+ /*
+ * A little delay to make sure the remote chips get up to
+ * speed before we start syncing them.
+ *
+ * We have to do it here because we know our TB is running
+ * while the boot thread TB might not yet.
+ */
+ time_wait_ms(1);
+
+ *result = true;
+ return;
+ error:
+ prerror("Master sync failed! TFMR=0x%016lx, retrying...\n", mfspr(SPR_TFMR));
+ *result = false;
+}
+
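+/*
+ * Slave sync follows the same recipe as the master but skips the TOD
+ * network setup: it resets errors, puts the core TB into "Not Set",
+ * waits for the (already running) chip TOD, checks for TOD interrupts,
+ * then moves the TOD value into the core TB.
+ */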
+static void chiptod_sync_slave(void *data)
+{
+ bool *result = data;
+ bool do_sync = false;
+
+ /* Only get primaries, not threads */
+ if (!this_cpu()->is_secondary)
+ do_sync = true;
+
+ if (chiptod_type == chiptod_p10 && this_cpu()->is_fused_core &&
+ PVR_VERS_MAJ(mfspr(SPR_PVR)) == 2) {
+ /* P10 DD2 fused core workaround, must sync on small cores */
+ if (this_cpu() == this_cpu()->ec_primary)
+ do_sync = true;
+ }
+
+ if (!do_sync) {
+ /* Just cleanup the TFMR */
+ chiptod_cleanup_thread_tfmr();
+ *result = true;
+ return;
+ }
+
+ prlog(PR_DEBUG, "Slave sync on CPU PIR 0x%04x...\n",
+ this_cpu()->pir);
+
+ /* Apply base tfmr */
+ mtspr(SPR_TFMR, base_tfmr);
+
+ /* From recipe provided by pHyp folks, reset various errors
+ * before attempting the sync
+ */
+ chiptod_reset_tb_errors();
+
+ /* Cleanup thread tfmr bits */
+ chiptod_cleanup_thread_tfmr();
+
+ /* Switch timebase to "Not Set" state */
+ if (!chiptod_mod_tb())
+ goto error;
+ prlog(PR_INSANE, "SYNC SLAVE Step 2 TFMR=0x%016lx\n", mfspr(SPR_TFMR));
+
+ /* Chip TOD running check */
+ if (!chiptod_poll_running())
+ goto error;
+ prlog(PR_INSANE, "SYNC SLAVE Step 3 TFMR=0x%016lx\n", mfspr(SPR_TFMR));
+
+ /* Chip TOD interrupt check */
+ if (!chiptod_interrupt_check())
+ goto error;
+ prlog(PR_INSANE, "SYNC SLAVE Step 4 TFMR=0x%016lx\n", mfspr(SPR_TFMR));
+
+ /* Move chiptod value to core TB */
+ if (!chiptod_to_tb())
+ goto error;
+ prlog(PR_INSANE, "SYNC SLAVE Step 5 TFMR=0x%016lx\n", mfspr(SPR_TFMR));
+
+ /* Check if TB is running */
+ if (!chiptod_check_tb_running())
+ goto error;
+
+ prlog(PR_INSANE, "Slave sync completed, TB=%lx\n", mfspr(SPR_TBRL));
+
+ *result = true;
+ return;
+ error:
+ prerror("Slave sync failed ! TFMR=0x%016lx, retrying...\n", mfspr(SPR_TFMR));
+ *result = false;
+}
+
+bool chiptod_wakeup_resync(void)
+{
+ if (chiptod_primary < 0)
+ return false;
+
+ lock(&chiptod_lock);
+
+ /* Apply base tfmr */
+ mtspr(SPR_TFMR, base_tfmr);
+
+ /* From recipe provided by pHyp folks, reset various errors
+ * before attempting the sync
+ */
+ chiptod_reset_tb_errors();
+
+ /* Cleanup thread tfmr bits */
+ chiptod_cleanup_thread_tfmr();
+
+ /* Switch timebase to "Not Set" state */
+ if (!chiptod_mod_tb())
+ goto error;
+
+ /* Move chiptod value to core TB */
+ if (!chiptod_to_tb())
+ goto error;
+
+ unlock(&chiptod_lock);
+
+ return true;
+ error:
+ prerror("Resync failed ! TFMR=0x%016lx\n", mfspr(SPR_TFMR));
+ unlock(&chiptod_lock);
+ return false;
+}
+
+/*
+ * Fixup for p10 TOD bug workaround.
+ *
+ * The TOD may fail to start if all clocks in the system are derived from
+ * the same reference oscillator.
+ *
+ * Avoiding this is pretty easy: Whenever we clear/reset the TOD registers,
+ * make sure to init bits 26:31 of TOD_SLAVE_PATH_CTRL (0x40005) to 0b111111
+ * instead of 0b000000. The value 0 in TOD_S_PATH_CTRL_REG(26:31) must be
+ * avoided, and if it does get written it must be followed up by writing a
+ * value of all ones to clean up the resulting bad state before the (nonzero)
+ * final value can be written.
+ */
+static void fixup_tod_reg_value(struct chiptod_tod_regs *treg_entry)
+{
+ int32_t chip_id = this_cpu()->chip_id;
+
+ if (proc_gen != proc_gen_p10)
+ return;
+
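+ /* Note (illustrative): with IBM bit numbering, PPC_BITMASK(26, 31)
+ * below is 0x0000003F00000000ull. */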
+ if (treg_entry->xscom_addr == TOD_SLAVE_PATH_CTRL)
+ treg_entry->val[chip_id].data |= PPC_BITMASK(26,31);
+}
+
+static int __chiptod_recover_tod_errors(void)
+{
+ uint64_t terr;
+ uint64_t treset = 0;
+ int i, rc = -1;
+ int32_t chip_id = this_cpu()->chip_id;
+
+ /* Read TOD error register */
+ if (xscom_readme(TOD_ERROR, &terr)) {
+ prerror("XSCOM error reading TOD_ERROR reg\n");
+ return 0;
+ }
+ /* Check for sync check error and recover */
+ if ((terr & TOD_ERR_TOD_SYNC_CHECK) ||
+ (terr & TOD_ERR_TOD_FSM_PARITY) ||
+ (terr & TOD_ERR_CTCR_PARITY) ||
+ (terr & TOD_ERR_PSS_HAMMING_DISTANCE) ||
+ (terr & TOD_ERR_DELAY_COMPL_PARITY) ||
+ (terr & TOD_ERR_TOD_REGISTER_PARITY)) {
+ chiptod_reset_tod_errors();
+ rc = 1;
+ }
+
+ /*
+ * Check for TOD control register parity errors and restore those
+ * registers with last saved valid values.
+ */
+ for (i = 0; i < ARRAY_SIZE(chiptod_tod_regs); i++) {
+ if (!(terr & chiptod_tod_regs[i].error_bit))
+ continue;
+
+ /* Check if we have valid last saved register value. */
+ if (!chiptod_tod_regs[i].val[chip_id].valid) {
+ prerror("Failed to restore TOD register: %08llx\n",
+ chiptod_tod_regs[i].xscom_addr);
+ return 0;
+ }
+
+ fixup_tod_reg_value(&chiptod_tod_regs[i]);
+
+ prlog(PR_DEBUG, "Parity error, Restoring TOD register: "
+ "%08llx = %016llx\n",
+ chiptod_tod_regs[i].xscom_addr,
+ chiptod_tod_regs[i].val[chip_id].data);
+ if (xscom_writeme(chiptod_tod_regs[i].xscom_addr,
+ chiptod_tod_regs[i].val[chip_id].data)) {
+ prerror("XSCOM error writing 0x%08llx reg.\n",
+ chiptod_tod_regs[i].xscom_addr);
+ return 0;
+ }
+ treset |= chiptod_tod_regs[i].error_bit;
+ }
+
+ if (treset && (xscom_writeme(TOD_ERROR, treset))) {
+ prerror("XSCOM error writing TOD_ERROR !\n");
+ return 0;
+ }
+ /* We have handled all the TOD errors routed to hypervisor */
+ if (treset)
+ rc = 1;
+ return rc;
+}
+
+int chiptod_recover_tod_errors(void)
+{
+ int rc;
+
+ lock(&chiptod_lock);
+ rc = __chiptod_recover_tod_errors();
+ unlock(&chiptod_lock);
+ return rc;
+}
+
+static int32_t chiptod_get_active_master(void)
+{
+ if (current_topology < 0)
+ return -1;
+
+ if (chiptod_topology_info[current_topology].status ==
+ chiptod_active_master)
+ return chiptod_topology_info[current_topology].id;
+ return -1;
+}
+
+/* Return true if Active master TOD is running. */
+static bool chiptod_master_running(void)
+{
+ int32_t active_master_chip;
+
+ active_master_chip = chiptod_get_active_master();
+ if (active_master_chip != -1) {
+ if (chiptod_running_check(active_master_chip))
+ return true;
+ }
+ return false;
+}
+
+static bool chiptod_set_ttype4_mode(struct proc_chip *chip, bool enable)
+{
+ uint64_t tval;
+
+ /* Sanity check */
+ if (!chip)
+ return false;
+
+ if (xscom_read(chip->id, TOD_PIB_MASTER, &tval)) {
+ prerror("XSCOM error reading PIB_MASTER\n");
+ return false;
+ }
+
+ if (enable) {
+ /*
+ * Enable TTYPE4 send mode. This allows TOD to respond to
+ * TTYPE3 request.
+ */
+ tval |= TOD_PIBM_TTYPE4_SEND_MODE;
+ tval |= TOD_PIBM_TTYPE4_SEND_ENBL;
+ } else {
+ /* Disable TTYPE4 send mode. */
+ tval &= ~TOD_PIBM_TTYPE4_SEND_MODE;
+ tval &= ~TOD_PIBM_TTYPE4_SEND_ENBL;
+ }
+
+ if (xscom_write(chip->id, TOD_PIB_MASTER, tval)) {
+ prerror("XSCOM error writing PIB_MASTER\n");
+ return false;
+ }
+ return true;
+}
+
+/* Stop TODs on slave chips in backup topology. */
+static void chiptod_stop_slave_tods(void)
+{
+ struct proc_chip *chip = NULL;
+ enum chiptod_topology backup_topo;
+ uint64_t terr = 0;
+
+ /* Inject a TOD sync check error on slave TODs to stop them. */
+ terr |= TOD_ERR_TOD_SYNC_CHECK;
+
+ if (current_topology == chiptod_topo_primary)
+ backup_topo = chiptod_topo_secondary;
+ else
+ backup_topo = chiptod_topo_primary;
+
+ for_each_chip(chip) {
+ enum chiptod_chip_role role;
+
+ /* The current chip's TOD is already in the stopped state */
+ if (chip->id == this_cpu()->chip_id)
+ continue;
+
+ role = chiptod_get_chip_role(backup_topo, chip->id);
+
+ /* Skip backup master chip TOD. */
+ if (role == chiptod_chip_role_MDMT)
+ continue;
+
+ if (xscom_write(chip->id, TOD_ERROR_INJECT, terr))
+ prerror("XSCOM error writing TOD_ERROR_INJ\n");
+
+ if (chiptod_running_check(chip->id)) {
+ prlog(PR_DEBUG,
+ "Failed to stop TOD on slave CHIP [%d]\n",
+ chip->id);
+ }
+ }
+}
+
+static bool is_topology_switch_required(void)
+{
+ int32_t active_master_chip;
+ uint64_t tod_error;
+
+ active_master_chip = chiptod_get_active_master();
+
+ /* Check if TOD is running on Active master. */
+ if (chiptod_master_running())
+ return false;
+
+ /*
+ * Check if sync/step network is running.
+ *
+ * If sync/step network is not running on current active topology
+ * then we need switch topology to recover from TOD error.
+ */
+ if (!chiptod_sync_step_check_running(current_topology)) {
+ prlog(PR_DEBUG, "Sync/Step network not running\n");
+ return true;
+ }
+
+ /*
+ * Check if there is a step check error reported on
+ * Active master.
+ */
+ if (xscom_read(active_master_chip, TOD_ERROR, &tod_error)) {
+ prerror("XSCOM error reading TOD_ERROR reg\n");
+ /*
+ * Can't do anything here. But we already found that
+ * sync/step network is running. Hence return false.
+ */
+ return false;
+ }
+
+ if (tod_error & TOD_ERR_MP0_STEP_CHECK) {
+ prlog(PR_DEBUG, "TOD step check error\n");
+ return true;
+ }
+
+ return false;
+}
+
+static bool chiptod_backup_valid(void)
+{
+ enum chiptod_topology backup_topo;
+
+ if (current_topology < 0)
+ return false;
+
+ if (current_topology == chiptod_topo_primary)
+ backup_topo = chiptod_topo_secondary;
+ else
+ backup_topo = chiptod_topo_primary;
+
+ if (chiptod_topology_info[backup_topo].status == chiptod_backup_master)
+ return chiptod_sync_step_check_running(backup_topo);
+
+ return false;
+}
+
+static void chiptod_topology_switch_complete(void)
+{
+ /*
+ * After the topology switch, we may have a non-functional backup
+ * topology, and we won't be able to recover from future TOD errors
+ * that require a topology switch. Someone needs to either fix it or
+ * configure a new functional backup topology.
+ *
+ * Bit 18 of the Pervasive FIR is used to signal that TOD error
+ * analysis needs to be performed. This allows FSP/PRD to
+ * investigate and re-configure new backup topology if required.
+ * Once new backup topology is configured and ready, FSP sends a
+ * mailbox command xE6, s/c 0x06, mod 0, to enable the backup
+ * topology.
+ *
+ * This isn't documented anywhere. This info is provided by FSP
+ * folks.
+ */
+ if (xscom_writeme(LOCAL_CORE_FIR, LFIR_SWITCH_COMPLETE)) {
+ prerror("XSCOM error writing LOCAL_CORE_FIR\n");
+ return;
+ }
+
+ /* Save TOD control registers values. */
+ chiptod_cache_tod_registers();
+
+ prlog(PR_DEBUG, "Topology switch complete\n");
+ print_topology_info();
+}
+
+/*
+ * Sync up TOD with other chips and get TOD in running state.
+ * Check if current topology is active and running. If not, then
+ * trigger a topology switch.
+ */
+static int chiptod_start_tod(void)
+{
+ struct proc_chip *chip = NULL;
+
+ /* Do a topology switch if required. */
+ if (is_topology_switch_required()) {
+ int32_t mchip = chiptod_get_active_master();
+
+ prlog(PR_DEBUG, "Need topology switch to recover\n");
+ /*
+ * There is a failure in StepSync network in current
+ * active topology. TOD is not running on active master chip.
+ * We need to sync with backup master chip TOD.
+ * But before we do that we need to switch topology to make
+ * backup master as the new active master. Once we switch the
+ * topology we can then request TOD value from new active
+ * master. But make sure we move local chiptod to Not Set
+ * before requesting TOD value.
+ *
+ * Before triggering a topology switch, check if backup
+ * is valid and stop all slave TODs in backup topology.
+ */
+ if (!chiptod_backup_valid()) {
+ prerror("Backup master is not enabled. "
+ "Cannot do a topology switch.\n");
+ goto error_out;
+ }
+
+ chiptod_stop_slave_tods();
+
+ if (xscom_write(mchip, TOD_TTYPE_1, PPC_BIT(0))) {
+ prerror("XSCOM error switching primary/secondary\n");
+ goto error_out;
+ }
+
+ /* Update topology info. */
+ current_topology = query_current_topology();
+ chiptod_update_topology(chiptod_topo_primary);
+ chiptod_update_topology(chiptod_topo_secondary);
+
+ /*
+ * We just switched topologies to recover.
+ * Check if new master TOD is running.
+ */
+ if (!chiptod_master_running()) {
+ prerror("TOD is not running on new master.\n");
+ goto error_out;
+ }
+
+ /*
+ * Enable step checkers on all Chip TODs
+ *
+ * During topology switch, step checkers are disabled
+ * on all Chip TODs by default. Enable them.
+ */
+ if (xscom_writeme(TOD_TTYPE_2, PPC_BIT(0))) {
+ prerror("XSCOM error enabling steppers\n");
+ goto error_out;
+ }
+
+ chiptod_topology_switch_complete();
+ }
+
+ if (!chiptod_master_running()) {
+ /*
+ * Active Master TOD is not running, which means it won't
+ * respond to TTYPE_3 request.
+ *
+ * Find a chip that has TOD in running state and configure
+ * it to respond to TTYPE_3 request.
+ */
+ for_each_chip(chip) {
+ if (chiptod_running_check(chip->id)) {
+ if (chiptod_set_ttype4_mode(chip, true))
+ break;
+ }
+ }
+ }
+
+ /* Switch local chiptod to "Not Set" state */
+ if (xscom_writeme(TOD_LOAD_TOD_MOD, PPC_BIT(0))) {
+ prerror("XSCOM error sending LOAD_TOD_MOD\n");
+ goto error_out;
+ }
+
+ /*
+ * Request the current TOD value from another chip.
+ * This will move TOD in running state
+ */
+ if (xscom_writeme(TOD_TTYPE_3, PPC_BIT(0))) {
+ prerror("XSCOM error sending TTYPE_3\n");
+ goto error_out;
+ }
+
+ /* Check if chip TOD is running. */
+ if (!chiptod_poll_running())
+ goto error_out;
+
+ /* Restore the ttype4_mode. */
+ chiptod_set_ttype4_mode(chip, false);
+ return 1;
+
+error_out:
+ chiptod_unrecoverable = true;
+ return 0;
+}
+
+static bool tfmr_recover_tb_errors(uint64_t tfmr)
+{
+ uint64_t tfmr_reset_error;
+ unsigned long timeout = 0;
+
+ /* Ask for automatic clear of errors */
+ tfmr_reset_error = base_tfmr | SPR_TFMR_CLEAR_TB_ERRORS;
+
+ /* Additionally pHyp sets these (write-1-to-clear ?) */
+ if (tfmr & SPR_TFMR_TB_MISSING_SYNC)
+ tfmr_reset_error |= SPR_TFMR_TB_MISSING_SYNC;
+
+ if (tfmr & SPR_TFMR_TB_MISSING_STEP)
+ tfmr_reset_error |= SPR_TFMR_TB_MISSING_STEP;
+
+ /*
+ * Write 1 to bit 45 to clear the TB residue error. The TB register
+ * has already been reset to zero as part of pre-recovery.
+ */
+ if (tfmr & SPR_TFMR_TB_RESIDUE_ERR)
+ tfmr_reset_error |= SPR_TFMR_TB_RESIDUE_ERR;
+
+ if (tfmr & SPR_TFMR_FW_CONTROL_ERR)
+ tfmr_reset_error |= SPR_TFMR_FW_CONTROL_ERR;
+
+ if (tfmr & SPR_TFMR_TBST_CORRUPT)
+ tfmr_reset_error |= SPR_TFMR_TBST_CORRUPT;
+
+ mtspr(SPR_TFMR, tfmr_reset_error);
+
+ /* We have to write "Clear TB Errors" again */
+ tfmr_reset_error = base_tfmr | SPR_TFMR_CLEAR_TB_ERRORS;
+ mtspr(SPR_TFMR, tfmr_reset_error);
+
+ do {
+ if (++timeout >= TIMEOUT_LOOPS) {
+ prerror("TB error reset timeout !\n");
+ return false;
+ }
+ tfmr = mfspr(SPR_TFMR);
+ if (tfmr & SPR_TFMR_TFMR_CORRUPT) {
+ prerror("TB error reset: corrupt TFMR !\n");
+ return false;
+ }
+ } while (tfmr & SPR_TFMR_CLEAR_TB_ERRORS);
+ return true;
+}
+
+bool tfmr_recover_local_errors(uint64_t tfmr)
+{
+ uint64_t tfmr_reset_errors = 0;
+
+ if (tfmr & SPR_TFMR_DEC_PARITY_ERR) {
+ /* Set DEC with all ones */
+ mtspr(SPR_DEC, ~0);
+
+ /* set bit 59 to clear TFMR DEC parity error. */
+ tfmr_reset_errors |= SPR_TFMR_DEC_PARITY_ERR;
+ }
+
+ /*
+ * Reset PURR/SPURR to recover. We also need help from KVM
+ * layer to handle this change in PURR/SPURR. That needs
+ * to be handled in kernel KVM layer. For now, to recover just
+ * reset it.
+ */
+ if (tfmr & SPR_TFMR_PURR_PARITY_ERR) {
+ /* set PURR register with sane value or reset it. */
+ mtspr(SPR_PURR, 0);
+
+ /* set bit 57 to clear TFMR PURR parity error. */
+ tfmr_reset_errors |= SPR_TFMR_PURR_PARITY_ERR;
+ }
+
+ if (tfmr & SPR_TFMR_SPURR_PARITY_ERR) {
+ /* set SPURR register with sane value or reset it. */
+ mtspr(SPR_SPURR, 0);
+
+ /* set bit 58 to clear TFMR SPURR parity error. */
+ tfmr_reset_errors |= SPR_TFMR_SPURR_PARITY_ERR;
+ }
+
+ /* Write TFMR twice to clear the error */
+ mtspr(SPR_TFMR, base_tfmr | tfmr_reset_errors);
+ mtspr(SPR_TFMR, base_tfmr | tfmr_reset_errors);
+
+ /* Get fresh copy of TFMR */
+ tfmr = mfspr(SPR_TFMR);
+
+ /* Check if TFMR non-TB errors still present. */
+ if (tfmr & tfmr_reset_errors) {
+ prerror("TFMR non-TB error recovery failed! "
+ "TFMR=0x%016lx\n", mfspr(SPR_TFMR));
+ return false;
+ }
+ return true;
+}
+
+/*
+ * TFMR parity error recovery as per pc_workbook:
+ * MT(TFMR) bits 11 and 60 are b'1'
+ * MT(HMER) all bits 1 except for bits 4,5
+ */
+bool recover_corrupt_tfmr(void)
+{
+ uint64_t tfmr;
+
+ /* Get the base TFMR */
+ tfmr = base_tfmr;
+
+ /* Set bit 60 to clear TFMR parity error. */
+ tfmr |= SPR_TFMR_TFMR_CORRUPT;
+ mtspr(SPR_TFMR, tfmr);
+
+ /* Write twice to clear the error */
+ mtspr(SPR_TFMR, tfmr);
+
+ /* Get fresh copy of TFMR */
+ tfmr = mfspr(SPR_TFMR);
+
+ /* Check if TFMR parity error still present. */
+ if (tfmr & SPR_TFMR_TFMR_CORRUPT) {
+ prerror("TFMR error recovery: corrupt TFMR !\n");
+ return false;
+ }
+
+ /*
+ * Now that we have sane value in TFMR, check if Timebase machine
+ * state is in ERROR state. If yes, clear TB errors so that
+ * Timebase machine state changes to RESET state. Once in RESET state
+ * then we can then load TB with TOD value.
+ */
+ if (GETFIELD(SPR_TFMR_TBST_ENCODED, tfmr) == TBST_STATE_ERROR) {
+ if (!chiptod_reset_tb_errors())
+ return false;
+ }
+ return true;
+}
+
+void tfmr_cleanup_core_errors(uint64_t tfmr)
+{
+ /* If HDEC is bad, clean it on all threads before we clear the
+ * error condition.
+ */
+ if (tfmr & SPR_TFMR_HDEC_PARITY_ERROR)
+ mtspr(SPR_HDEC, 0);
+
+ /* If TB is invalid, clean it on all threads as well, it will be
+ * restored after the next rendez-vous
+ */
+ if (!(tfmr & SPR_TFMR_TB_VALID)) {
+ mtspr(SPR_TBWU, 0);
+ mtspr(SPR_TBWL, 0);
+ }
+}
+
+int tfmr_clear_core_errors(uint64_t tfmr)
+{
+ uint64_t tfmr_reset_errors = 0;
+
+ /* return -1 if there is nothing to be fixed. */
+ if (!(tfmr & SPR_TFMR_HDEC_PARITY_ERROR))
+ return -1;
+
+ tfmr_reset_errors |= SPR_TFMR_HDEC_PARITY_ERROR;
+
+ /* Write TFMR twice to clear the error */
+ mtspr(SPR_TFMR, base_tfmr | tfmr_reset_errors);
+ mtspr(SPR_TFMR, base_tfmr | tfmr_reset_errors);
+
+ return 1;
+}
+
+/*
+ * Recover from TB and TOD errors.
+ * The timebase register is per core, so the first thread that gets a
+ * chance to handle the interrupt fixes the actual TFAC errors and the
+ * remaining threads of the same core see no errors. Return -1 if no
+ * errors are found. The caller (handle_hmi_exception) will not send
+ * an HMI event to the host if the return value is -1.
+ *
+ * Return values:
+ * 0 <= Failed to recover from errors
+ * 1 <= Successfully recovered from errors
+ * -1 <= No errors found. Errors have already been fixed.
+ */
+int chiptod_recover_tb_errors(bool *out_resynced)
+{
+ uint64_t tfmr;
+ int rc = -1;
+
+ *out_resynced = false;
+
+ if (chiptod_primary < 0)
+ return 0;
+
+ lock(&chiptod_lock);
+
+ /*
+ * Return if TOD is unrecoverable.
+ * The previous attempt to recover the TOD failed.
+ */
+ if (chiptod_unrecoverable) {
+ rc = 0;
+ goto error_out;
+ }
+
+ /* Get fresh copy of TFMR */
+ tfmr = mfspr(SPR_TFMR);
+
+ /*
+ * Check for TB errors.
+ * On Sync check error, bit 44 of TFMR is set. Check for it and
+ * clear it.
+ *
+ * In some rare situations all TB errors may already be cleared, but
+ * the TB is stuck waiting for a new value from the TOD, with TFMR
+ * bit 18 set to '1'. In this uncertain state the TB cannot be brought
+ * back into the running state. Get the TB into a clean initial state
+ * by clearing TB errors if TFMR[18] is set.
+ */
+ if ((tfmr & SPR_TFMR_TB_MISSING_STEP) ||
+ (tfmr & SPR_TFMR_TB_RESIDUE_ERR) ||
+ (tfmr & SPR_TFMR_FW_CONTROL_ERR) ||
+ (tfmr & SPR_TFMR_TBST_CORRUPT) ||
+ (tfmr & SPR_TFMR_MOVE_CHIP_TOD_TO_TB) ||
+ (tfmr & SPR_TFMR_TB_MISSING_SYNC)) {
+ if (!tfmr_recover_tb_errors(tfmr)) {
+ rc = 0;
+ goto error_out;
+ }
+ }
+
+ /*
+ * Check for TOD sync check error.
+ * On TOD errors, bit 51 of TFMR is set. If this bit is on then we
+ * need to fetch TOD error register and recover from TOD errors.
+ * Bit 33 of TOD error register indicates sync check error.
+ */
+ if (tfmr & SPR_TFMR_CHIP_TOD_INTERRUPT)
+ rc = __chiptod_recover_tod_errors();
+
+ /* Check if TB is running. If not then we need to get it running. */
+ if (!(tfmr & SPR_TFMR_TB_VALID)) {
+ rc = 0;
+
+ /* Place TB in Notset state. */
+ if (!chiptod_mod_tb())
+ goto error_out;
+
+ /*
+ * Before we move TOD to core TB check if TOD is running.
+ * If not, then get TOD in running state.
+ */
+ if (!chiptod_running_check(this_cpu()->chip_id))
+ if (!chiptod_start_tod())
+ goto error_out;
+
+ /* Move chiptod value to core TB */
+ if (!chiptod_to_tb())
+ goto error_out;
+
+ *out_resynced = true;
+
+ /* We successfully got the TB running. */
+ rc = 1;
+ }
+
+error_out:
+ unlock(&chiptod_lock);
+ return rc;
+}
+
+static int64_t opal_resync_timebase(void)
+{
+ /* Mambo and qemu don't simulate the chiptod */
+ if (chip_quirk(QUIRK_NO_CHIPTOD))
+ return OPAL_SUCCESS;
+
+ if (!chiptod_wakeup_resync()) {
+ prerror("OPAL: Resync timebase failed on CPU 0x%04x\n",
+ this_cpu()->pir);
+ return OPAL_HARDWARE;
+ }
+ return OPAL_SUCCESS;
+}
+opal_call(OPAL_RESYNC_TIMEBASE, opal_resync_timebase, 0);
+
+static void chiptod_print_tb(void *data __unused)
+{
+ prlog(PR_DEBUG, "PIR 0x%04x TB=%lx\n", this_cpu()->pir,
+ mfspr(SPR_TBRL));
+}
+
+static bool chiptod_probe(void)
+{
+ struct dt_node *np;
+
+ dt_for_each_compatible(dt_root, np, "ibm,power-chiptod") {
+ uint32_t chip;
+
+ /* Old DT has chip-id in chiptod node, newer only in the
+ * parent xscom bridge
+ */
+ chip = dt_get_chip_id(np);
+
+ if (dt_has_node_property(np, "primary", NULL)) {
+ chiptod_primary = chip;
+ if (dt_node_is_compatible(np, "ibm,power8-chiptod"))
+ chiptod_type = chiptod_p8;
+ if (dt_node_is_compatible(np, "ibm,power9-chiptod"))
+ chiptod_type = chiptod_p9;
+ if (dt_node_is_compatible(np, "ibm,power10-chiptod"))
+ chiptod_type = chiptod_p10;
+ }
+
+ if (dt_has_node_property(np, "secondary", NULL))
+ chiptod_secondary = chip;
+
+ }
+
+ if (chiptod_type == chiptod_unknown) {
+ prerror("Unknown TOD type !\n");
+ return false;
+ }
+
+ return true;
+}
+
+static void chiptod_discover_new_backup(enum chiptod_topology topo)
+{
+ struct proc_chip *chip = NULL;
+
+ /* Scan through available chips to find new backup master chip */
+ for_each_chip(chip) {
+ if (_chiptod_get_chip_status(chip->id) == chiptod_backup_master)
+ break;
+ }
+
+ /* Found new backup master chip. Update the topology info */
+ if (chip) {
+ prlog(PR_DEBUG, "New backup master: CHIP [%d]\n",
+ chip->id);
+
+ if (topo == chiptod_topo_primary)
+ chiptod_primary = chip->id;
+ else
+ chiptod_secondary = chip->id;
+ chiptod_topology_info[topo].id = chip->id;
+ chiptod_update_topology(topo);
+
+ prlog(PR_DEBUG,
+ "Backup topology configuration changed.\n");
+ print_topology_info();
+ }
+
+ /*
+ * Topology configuration has changed. Save TOD control registers
+ * values.
+ */
+ chiptod_cache_tod_registers();
+}
+
+/*
+ * Enable/disable backup topology.
+ * If request is to enable topology, then discover new backup master
+ * chip and update the topology configuration info. If the request is
+ * to disable topology, then mark the current backup topology as disabled.
+ * Return error (-1) if the action is requested on the currently active
+ * topology.
+ *
+ * Return values:
+ * true <= Success
+ * false <= Topology is active and in use.
+ */
+bool chiptod_adjust_topology(enum chiptod_topology topo, bool enable)
+{
+ uint8_t rc = true;
+ /*
+ * The FSP can only request that the currently inactive topology
+ * be disabled or enabled. If the requested topology is currently
+ * the active topology, then fail this request with a -1 (TOD
+ * topology in use) status as return code.
+ */
+ lock(&chiptod_lock);
+ if (topo == current_topology) {
+ rc = false;
+ goto out;
+ }
+
+ if (enable)
+ chiptod_discover_new_backup(topo);
+ else
+ chiptod_topology_info[topo].status = chiptod_backup_disabled;
+out:
+ unlock(&chiptod_lock);
+ return rc;
+}
+
+static void chiptod_init_topology_info(void)
+{
+ /* Find and update current topology in use. */
+ current_topology = query_current_topology();
+
+ /* Initialized primary topology chip config info */
+ chiptod_topology_info[chiptod_topo_primary].id = chiptod_primary;
+ chiptod_update_topology(chiptod_topo_primary);
+
+ /* Initialized secondary topology chip config info */
+ chiptod_topology_info[chiptod_topo_secondary].id = chiptod_secondary;
+ chiptod_update_topology(chiptod_topo_secondary);
+
+ /* Cache TOD control registers values. */
+ chiptod_cache_tod_registers();
+ print_topology_info();
+}
+
+void chiptod_init(void)
+{
+ struct cpu_thread *cpu0, *cpu;
+ bool sres;
+ int i;
+
+ /* Mambo and qemu don't simulate the chiptod */
+ if (chip_quirk(QUIRK_NO_CHIPTOD))
+ return;
+
+ op_display(OP_LOG, OP_MOD_CHIPTOD, 0);
+
+ if (!chiptod_probe()) {
+ prerror("Failed ChipTOD detection !\n");
+ op_display(OP_FATAL, OP_MOD_CHIPTOD, 0);
+ abort();
+ }
+
+ op_display(OP_LOG, OP_MOD_CHIPTOD, 1);
+
+ /* Pick somebody on the primary */
+ cpu0 = find_cpu_by_chip_id(chiptod_primary);
+
+ /* Calculate the base TFMR value used for everybody */
+ chiptod_setup_base_tfmr();
+
+ prlog(PR_DEBUG, "Base TFMR=0x%016llx\n", base_tfmr);
+
+ i = NUM_SYNC_RETRIES;
+ do {
+ /* Schedule master sync */
+ sres = false;
+ cpu_wait_job(cpu_queue_job(cpu0, "chiptod_sync_master",
+ chiptod_sync_master, &sres), true);
+ } while (!sres && i--);
+
+ if (!sres) {
+ op_display(OP_FATAL, OP_MOD_CHIPTOD, 2);
+ abort();
+ }
+
+ op_display(OP_LOG, OP_MOD_CHIPTOD, 2);
+
+ /* Schedule slave sync */
+ for_each_available_cpu(cpu) {
+ /* Skip master */
+ if (cpu == cpu0)
+ continue;
+
+ i = NUM_SYNC_RETRIES;
+ do {
+ /* Queue job */
+ sres = false;
+ cpu_wait_job(cpu_queue_job(cpu, "chiptod_sync_slave",
+ chiptod_sync_slave, &sres),
+ true);
+ } while (!sres && i--);
+
+ if (!sres) {
+ op_display(OP_WARN, OP_MOD_CHIPTOD, 3|(cpu->pir << 8));
+ prerror("CHIPTOD: Failed to sync PIR 0x%04x\n",
+ cpu->pir);
+
+ /* Disable threads */
+ cpu_disable_all_threads(cpu);
+ }
+ op_display(OP_LOG, OP_MOD_CHIPTOD, 3|(cpu->pir << 8));
+ }
+
+ /* Display TBs */
+ for_each_available_cpu(cpu) {
+ /* Only do primaries, not threads */
+ if (cpu->is_secondary)
+ continue;
+ cpu_wait_job(cpu_queue_job(cpu, "chiptod_print_tb",
+ chiptod_print_tb, NULL), true);
+ }
+
+ chiptod_init_topology_info();
+ op_display(OP_LOG, OP_MOD_CHIPTOD, 4);
+}
+
+/* CAPP timebase sync */
+
+static bool chiptod_capp_reset_tb_errors(uint32_t chip_id,
+ uint32_t tfmr_addr,
+ uint32_t offset)
+{
+ uint64_t tfmr;
+ unsigned long timeout = 0;
+
+ /* Ask for automatic clear of errors */
+ tfmr = base_tfmr | SPR_TFMR_CLEAR_TB_ERRORS;
+
+ /* Additionally pHyp sets these (write-1-to-clear ?) */
+ tfmr |= SPR_TFMR_TB_MISSING_SYNC;
+ tfmr |= SPR_TFMR_TB_MISSING_STEP;
+ tfmr |= SPR_TFMR_TB_RESIDUE_ERR;
+ tfmr |= SPR_TFMR_TBST_CORRUPT;
+ tfmr |= SPR_TFMR_TFMR_CORRUPT;
+
+ /* Write CAPP TFMR */
+ xscom_write(chip_id, tfmr_addr + offset, tfmr);
+
+ /* We have to write "Clear TB Errors" again */
+ tfmr = base_tfmr | SPR_TFMR_CLEAR_TB_ERRORS;
+ /* Write CAPP TFMR */
+ xscom_write(chip_id, tfmr_addr + offset, tfmr);
+
+ do {
+ if (++timeout >= TIMEOUT_LOOPS) {
+ prerror("CAPP: TB error reset timeout !\n");
+ return false;
+ }
+ /* Read CAPP TFMR */
+ xscom_read(chip_id, tfmr_addr + offset, &tfmr);
+ if (tfmr & SPR_TFMR_TFMR_CORRUPT) {
+ prerror("CAPP: TB error reset: corrupt TFMR!\n");
+ return false;
+ }
+ } while (tfmr & SPR_TFMR_CLEAR_TB_ERRORS);
+ return true;
+}
+
+static bool chiptod_capp_mod_tb(uint32_t chip_id, uint32_t tfmr_addr,
+ uint32_t offset)
+{
+ uint64_t timeout = 0;
+ uint64_t tfmr;
+
+ /* Switch CAPP timebase to "Not Set" state */
+ tfmr = base_tfmr | SPR_TFMR_LOAD_TOD_MOD;
+ xscom_write(chip_id, tfmr_addr + offset, tfmr);
+ do {
+ if (++timeout >= (TIMEOUT_LOOPS*2)) {
+ prerror("CAPP: TB \"Not Set\" timeout\n");
+ return false;
+ }
+ xscom_read(chip_id, tfmr_addr + offset, &tfmr);
+ if (tfmr & SPR_TFMR_TFMR_CORRUPT) {
+ prerror("CAPP: TB \"Not Set\" TFMR corrupt\n");
+ return false;
+ }
+ if (GETFIELD(SPR_TFMR_TBST_ENCODED, tfmr) == 9) {
+ prerror("CAPP: TB \"Not Set\" TOD in error state\n");
+ return false;
+ }
+ } while (tfmr & SPR_TFMR_LOAD_TOD_MOD);
+
+ return true;
+}
+
+static bool chiptod_wait_for_chip_sync(void)
+{
+ uint64_t tfmr;
+ uint64_t timeout = 0;
+
+ /* Read core TFMR, mask bit 42, write core TFMR back */
+ tfmr = mfspr(SPR_TFMR);
+ tfmr &= ~SPR_TFMR_TB_SYNC_OCCURED;
+ mtspr(SPR_TFMR, tfmr);
+
+ /* Read core TFMR until the TB sync occurred */
+ do {
+ if (++timeout >= TIMEOUT_LOOPS) {
+ prerror("No sync pulses\n");
+ return false;
+ }
+ tfmr = mfspr(SPR_TFMR);
+ } while (!(tfmr & SPR_TFMR_TB_SYNC_OCCURED));
+ return true;
+}
+
+static bool chiptod_capp_check_tb_running(uint32_t chip_id,
+ uint32_t tfmr_addr,
+ uint32_t offset)
+{
+ uint64_t tfmr;
+ uint64_t timeout = 0;
+
+ /* Read CAPP TFMR until TB becomes valid */
+ do {
+ if (++timeout >= (TIMEOUT_LOOPS*2)) {
+ prerror("CAPP: TB Invalid!\n");
+ return false;
+ }
+ xscom_read(chip_id, tfmr_addr + offset, &tfmr);
+ if (tfmr & SPR_TFMR_TFMR_CORRUPT) {
+ prerror("CAPP: TFMR corrupt!\n");
+ return false;
+ }
+ } while (!(tfmr & SPR_TFMR_TB_VALID));
+ return true;
+}
+
+bool chiptod_capp_timebase_sync(unsigned int chip_id, uint32_t tfmr_addr,
+ uint32_t tb_addr, uint32_t offset)
+{
+ uint64_t tfmr;
+ uint64_t capp_tb;
+ int64_t delta;
+ unsigned int retry = 0;
+
+ /* Set CAPP TFMR to base tfmr value */
+ xscom_write(chip_id, tfmr_addr + offset, base_tfmr);
+
+ /* Reset CAPP TB errors before attempting the sync */
+ if (!chiptod_capp_reset_tb_errors(chip_id, tfmr_addr, offset))
+ return false;
+
+ /* Switch CAPP TB to "Not Set" state */
+ if (!chiptod_capp_mod_tb(chip_id, tfmr_addr, offset))
+ return false;
+
+ /* Sync CAPP TB with core TB, retry while difference > 16usecs */
+ do {
+ if (retry++ > 5) {
+ prerror("CAPP: TB sync: giving up!\n");
+ return false;
+ }
+
+ /* Make CAPP ready to get the TB, wait for chip sync */
+ tfmr = base_tfmr | SPR_TFMR_MOVE_CHIP_TOD_TO_TB;
+ xscom_write(chip_id, tfmr_addr + offset, tfmr);
+ if (!chiptod_wait_for_chip_sync())
+ return false;
+
+ /* Set CAPP TB from core TB */
+ xscom_write(chip_id, tb_addr + offset, mftb());
+
+ /* Wait for CAPP TFMR tb_valid bit */
+ if (!chiptod_capp_check_tb_running(chip_id, tfmr_addr, offset))
+ return false;
+
+ /* Read CAPP TB, read core TB, compare */
+ xscom_read(chip_id, tb_addr + offset, &capp_tb);
+ delta = mftb() - capp_tb;
+ if (delta < 0)
+ delta = -delta;
+ } while (tb_to_usecs(delta) > 16);
+
+ return true;
+}
diff --git a/roms/skiboot/hw/dio-p9.c b/roms/skiboot/hw/dio-p9.c
new file mode 100644
index 000000000..5153f6eeb
--- /dev/null
+++ b/roms/skiboot/hw/dio-p9.c
@@ -0,0 +1,132 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/* Copyright 2019 IBM Corp. */
+
+#define pr_fmt(fmt) "DIO: " fmt
+
+#include <chip.h>
+#include <dio-p9.h>
+#include <opal.h>
+#include <xscom.h>
+#include <xscom-p9-regs.h>
+
+void p9_dio_init(void)
+{
+ struct dt_node *xn;
+ struct proc_chip *chip;
+ struct p9_dio *dio;
+
+ if (proc_gen < proc_gen_p9)
+ return;
+
+ dt_for_each_compatible(dt_root, xn, "ibm,xscom") {
+ dio = zalloc(sizeof(struct p9_dio));
+ assert(dio);
+ chip = get_chip(dt_get_chip_id(xn));
+ assert(chip);
+ chip->dio = dio;
+ }
+}
+
+int dio_interrupt_register(struct proc_chip *chip,
+ int port, dio_interrupt_callback callback)
+{
+ u64 val;
+ int rc;
+
+ assert(chip);
+ assert(chip->dio);
+
+ if (port < 0 || port >= NUM_OF_P9_DIO_PORTS)
+ return OPAL_PARAMETER;
+
+ if (chip->dio->callbacks[port]) /* This port already has a callback */
+ return OPAL_PARAMETER;
+
+ rc = xscom_read(chip->id, P9_GPIO_INTERRUPT_ENABLE, &val);
+ if (rc != OPAL_SUCCESS) {
+ prlog(PR_ERR, "XSCOM error %d reading reg 0x%llx\n",
+ rc, P9_GPIO_INTERRUPT_ENABLE);
+ return OPAL_HARDWARE;
+ }
+
+ val |= PPC_BIT(port);
+ rc = xscom_write(chip->id, P9_GPIO_INTERRUPT_ENABLE, val);
+ if (rc != OPAL_SUCCESS) {
+ prlog(PR_ERR, "XSCOM error %d writing reg 0x%llx\n",
+ rc, P9_GPIO_INTERRUPT_ENABLE);
+ return OPAL_HARDWARE;
+ }
+
+ chip->dio->callbacks[port] = callback;
+
+ return OPAL_SUCCESS;
+}
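+/*
+ * Usage sketch (illustrative, handler name hypothetical): a driver
+ * calls dio_interrupt_register(chip, port, my_handler) to arm a GPIO
+ * port and dio_interrupt_deregister(chip, port, my_handler) to
+ * disarm it again.
+ */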
+
+int dio_interrupt_deregister(struct proc_chip* chip,
+ int port, dio_interrupt_callback callback)
+{
+ u64 val;
+ int rc;
+
+ assert(chip);
+ assert(chip->dio);
+
+ if (port < 0 || port >= NUM_OF_P9_DIO_PORTS)
+ return OPAL_PARAMETER;
+
+ if (chip->dio->callbacks[port] != callback)
+ return OPAL_PARAMETER;
+
+ rc = xscom_read(chip->id, P9_GPIO_INTERRUPT_ENABLE, &val);
+ if (rc != OPAL_SUCCESS) {
+ prlog(PR_ERR, "XSCOM error %d reading reg 0x%llx\n",
+ rc, P9_GPIO_INTERRUPT_ENABLE);
+ return OPAL_HARDWARE;
+ }
+
+ val &= ~PPC_BIT(port);
+ rc = xscom_write(chip->id, P9_GPIO_INTERRUPT_ENABLE, val);
+ if (rc != OPAL_SUCCESS) {
+ prlog(PR_ERR, "XSCOM error %d writing reg 0x%llx\n",
+ rc, P9_GPIO_INTERRUPT_ENABLE);
+ return OPAL_HARDWARE;
+ }
+
+ chip->dio->callbacks[port] = NULL;
+
+ return OPAL_SUCCESS;
+}
+
+void dio_interrupt_handler(uint32_t chip_id)
+{
+ struct proc_chip *chip;
+ u64 val;
+ int rc;
+ int i;
+
+ chip = get_chip(chip_id);
+ if (chip == NULL || chip->dio == NULL)
+ return;
+
+ rc = xscom_read(chip->id, P9_GPIO_INTERRUPT_STATUS, &val);
+ if (rc != OPAL_SUCCESS) {
+ prlog(PR_ERR, "XSCOM error %d reading reg 0x%llx\n",
+ rc, P9_GPIO_INTERRUPT_STATUS);
+ return;
+ }
+
+ for (i = 0; i < NUM_OF_P9_DIO_PORTS; ++i) {
+ if (val & PPC_BIT(i)) {
+ if (chip->dio->callbacks[i])
+ chip->dio->callbacks[i](chip);
+ else
+ prlog(PR_ERR,
+ "DIO interrupt triggered on chip 0x%x"
+ " port %d but no handler\n",
+ chip->id, i);
+ /* Write 1 to clear the interrupt status */
+ xscom_write(chip->id, P9_GPIO_INTERRUPT_CONDITION,
+ val & PPC_BIT(i));
+ }
+ }
+}
diff --git a/roms/skiboot/hw/dts.c b/roms/skiboot/hw/dts.c
new file mode 100644
index 000000000..d8831e4d3
--- /dev/null
+++ b/roms/skiboot/hw/dts.c
@@ -0,0 +1,416 @@
+// SPDX-License-Identifier: Apache-2.0
+/* Copyright 2013-2019 IBM Corp. */
+
+#include <xscom.h>
+#include <chip.h>
+#include <sensor.h>
+#include <dts.h>
+#include <skiboot.h>
+#include <opal-api.h>
+#include <opal-msg.h>
+#include <timer.h>
+#include <timebase.h>
+
+struct dts {
+ uint8_t valid;
+ uint8_t trip;
+ int16_t temp;
+};
+
+/*
+ * Attributes for the core temperature sensor
+ */
+enum {
+ SENSOR_DTS_ATTR_TEMP_MAX,
+ SENSOR_DTS_ATTR_TEMP_TRIP
+};
+
+
+/* Therm mac result masking for DTS (result(0:15)
+ * 0:3 - 0x0
+ * 4:11 - Temperature in degrees C
+ * 12:13 - trip bits: 00 - no trip; 01 - warning; 10 - critical; 11 - fatal
+ * 14 - spare
+ * 15 - valid
+ */
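+/* Example (illustrative): raw 0x0291 decodes to valid = 1, trip = 0,
+ * temp = 0x29 (41 degrees C). */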
+static void dts_decode_one_dts(uint16_t raw, struct dts *dts)
+{
+ /*
+ * The value is both signed and unsigned :-) 0xff could be
+ * either 255C or -1C, so for now we treat this as unsigned
+ * which is sufficient for our purpose. We could try to be
+ * a bit smarter and treat it as signed for values between
+ * -10 and 0 and unsigned to 239 or something like that...
+ */
+ dts->valid = raw & 1;
+ if (dts->valid) {
+ dts->temp = (raw >> 4) & 0xff;
+ dts->trip = (raw >> 2) & 0x3;
+ } else {
+ dts->temp = 0;
+ dts->trip = 0;
+ }
+}
+
+static void dts_keep_max(struct dts *temps, int n, struct dts *dts)
+{
+ int i;
+
+ for (i = 0; i < n; i++) {
+ int16_t t = temps[i].temp;
+
+ if (!temps[i].valid)
+ continue;
+
+ if (t > dts->temp)
+ dts->temp = t;
+
+ dts->valid++;
+ dts->trip |= temps[i].trip;
+ }
+}
+
+/* Per core Digital Thermal Sensors */
+#define EX_THERM_DTS_RESULT0 0x10050000
+#define EX_THERM_DTS_RESULT1 0x10050001
+
+/* Different sensor locations */
+#define P8_CT_ZONE_LSU 0
+#define P8_CT_ZONE_ISU 1
+#define P8_CT_ZONE_FXU 2
+#define P8_CT_ZONE_L3C 3
+#define P8_CT_ZONES 4
+
+/*
+ * Returns the temperature as the max of all 4 zones and a global trip
+ * attribute.
+ */
+static int dts_read_core_temp_p8(uint32_t pir, struct dts *dts)
+{
+ int32_t chip_id = pir_to_chip_id(pir);
+ int32_t core = pir_to_core_id(pir);
+ uint64_t dts0, dts1;
+ struct dts temps[P8_CT_ZONES];
+ int rc;
+
+ rc = xscom_read(chip_id, XSCOM_ADDR_P8_EX(core, EX_THERM_DTS_RESULT0),
+ &dts0);
+ if (rc)
+ return rc;
+
+ rc = xscom_read(chip_id, XSCOM_ADDR_P8_EX(core, EX_THERM_DTS_RESULT1),
+ &dts1);
+ if (rc)
+ return rc;
+
+ dts_decode_one_dts(dts0 >> 48, &temps[P8_CT_ZONE_LSU]);
+ dts_decode_one_dts(dts0 >> 32, &temps[P8_CT_ZONE_ISU]);
+ dts_decode_one_dts(dts0 >> 16, &temps[P8_CT_ZONE_FXU]);
+ dts_decode_one_dts(dts1 >> 48, &temps[P8_CT_ZONE_L3C]);
+
+ dts_keep_max(temps, P8_CT_ZONES, dts);
+
+ prlog(PR_TRACE, "DTS: Chip %x Core %x temp:%dC trip:%x\n",
+ chip_id, core, dts->temp, dts->trip);
+
+ /*
+ * FIXME: The trip bits are always set ?! Just discard
+ * them for the moment until we understand why.
+ */
+ dts->trip = 0;
+ return 0;
+}
+
+/* Per core Digital Thermal Sensors */
+#define EC_THERM_P9_DTS_RESULT0 0x050000
+
+/* Different sensor locations */
+#define P9_CORE_DTS0 0
+#define P9_CORE_DTS1 1
+#define P9_CORE_ZONES 2
+
+/*
+ * Returns the temperature as the max of all zones and a global trip
+ * attribute.
+ */
+static int dts_read_core_temp_p9(uint32_t pir, struct dts *dts)
+{
+ int32_t chip_id = pir_to_chip_id(pir);
+ int32_t core = pir_to_core_id(pir);
+ uint64_t dts0;
+ struct dts temps[P9_CORE_ZONES];
+ int rc;
+
+ rc = xscom_read(chip_id, XSCOM_ADDR_P9_EC(core, EC_THERM_P9_DTS_RESULT0),
+ &dts0);
+ if (rc)
+ return rc;
+
+ dts_decode_one_dts(dts0 >> 48, &temps[P9_CORE_DTS0]);
+ dts_decode_one_dts(dts0 >> 32, &temps[P9_CORE_DTS1]);
+
+ dts_keep_max(temps, P9_CORE_ZONES, dts);
+
+ prlog(PR_TRACE, "DTS: Chip %x Core %x temp:%dC trip:%x\n",
+ chip_id, core, dts->temp, dts->trip);
+
+ /*
+ * FIXME: The trip bits are always set ?! Just discard
+ * them for the moment until we understand why.
+ */
+ dts->trip = 0;
+ return 0;
+}
+
+static void dts_async_read_temp(struct timer *t __unused, void *data,
+ u64 now __unused)
+{
+ struct dts dts = {0};
+ int rc, swkup_rc;
+ struct cpu_thread *cpu = data;
+
+ swkup_rc = dctl_set_special_wakeup(cpu);
+
+ if (proc_gen == proc_gen_p9)
+ rc = dts_read_core_temp_p9(cpu->pir, &dts);
+ else /* (proc_gen == proc_gen_p10) */
+ rc = OPAL_UNSUPPORTED; /* XXX P10 */
+
+ if (!rc) {
+ if (cpu->sensor_attr == SENSOR_DTS_ATTR_TEMP_MAX)
+ *cpu->sensor_data = cpu_to_be64(dts.temp);
+ else if (cpu->sensor_attr == SENSOR_DTS_ATTR_TEMP_TRIP)
+ *cpu->sensor_data = cpu_to_be64(dts.trip);
+ }
+
+ if (!swkup_rc)
+ dctl_clear_special_wakeup(cpu);
+
+ check_sensor_read(cpu->token);
+ rc = opal_queue_msg(OPAL_MSG_ASYNC_COMP, NULL, NULL,
+ cpu_to_be64(cpu->token),
+ cpu_to_be64(rc));
+ if (rc)
+ prerror("Failed to queue async message\n");
+
+ cpu->dts_read_in_progress = false;
+}
+
+static int dts_read_core_temp(u32 pir, struct dts *dts, u8 attr,
+ int token, __be64 *sensor_data)
+{
+ struct cpu_thread *cpu;
+ int rc;
+
+ switch (proc_gen) {
+ case proc_gen_p8:
+ rc = dts_read_core_temp_p8(pir, dts);
+ break;
+ case proc_gen_p9: /* Asynchronous read */
+ cpu = find_cpu_by_pir(pir);
+ if (!cpu)
+ return OPAL_PARAMETER;
+ lock(&cpu->dts_lock);
+ if (cpu->dts_read_in_progress) {
+ unlock(&cpu->dts_lock);
+ return OPAL_BUSY;
+ }
+ cpu->dts_read_in_progress = true;
+ cpu->sensor_attr = attr;
+ cpu->sensor_data = sensor_data;
+ cpu->token = token;
+ schedule_timer(&cpu->dts_timer, 0);
+ rc = OPAL_ASYNC_COMPLETION;
+ unlock(&cpu->dts_lock);
+ break;
+ case proc_gen_p10: /* XXX P10 */
+ default:
+ rc = OPAL_UNSUPPORTED;
+ }
+ return rc;
+}
+
+/* Per memory controller Digital Thermal Sensors */
+#define THERM_MEM_DTS_RESULT0 0x2050000
+
+/* Different sensor locations */
+#define P8_MEM_DTS0 0
+#define P8_MEM_DTS1 1
+#define P8_MEM_ZONES 2
+
+static int dts_read_mem_temp(uint32_t chip_id, struct dts *dts)
+{
+ uint64_t dts0;
+ struct dts temps[P8_MEM_ZONES];
+ int i;
+ int rc;
+
+ rc = xscom_read(chip_id, THERM_MEM_DTS_RESULT0, &dts0);
+ if (rc)
+ return rc;
+
+ dts_decode_one_dts(dts0 >> 48, &temps[P8_MEM_DTS0]);
+ dts_decode_one_dts(dts0 >> 32, &temps[P8_MEM_DTS1]);
+
+ for (i = 0; i < P8_MEM_ZONES; i++) {
+ int16_t t = temps[i].temp;
+
+ if (!temps[i].valid)
+ continue;
+
+ /* keep the max temperature across the sensors */
+ if (t > dts->temp)
+ dts->temp = t;
+
+ dts->valid++;
+ dts->trip |= temps[i].trip;
+ }
+
+ prlog(PR_TRACE, "DTS: Chip %x temp:%dC trip:%x\n",
+ chip_id, dts->temp, dts->trip);
+
+ /*
+ * FIXME: The trip bits are always set ?! Just discard
+ * them for the moment until we understand why.
+ */
+ dts->trip = 0;
+ return 0;
+}
+
+/*
+ * DTS sensor class ids. Only one for the moment: the core
+ * temperature.
+ */
+enum sensor_dts_class {
+ SENSOR_DTS_CORE_TEMP,
+ SENSOR_DTS_MEM_TEMP,
+ /* To be continued */
+};
+
+/*
+ * Extract the centaur chip id which was truncated to fit in the
+ * resource identifier field of the sensor handler
+ */
+#define centaur_get_id(rid) (0x80000000 | ((rid) & 0x3ff))
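+/* e.g. (illustrative) rid 0x005 maps back to centaur chip id 0x80000005. */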
+
+int64_t dts_sensor_read(u32 sensor_hndl, int token, __be64 *sensor_data)
+{
+ uint8_t attr = sensor_get_attr(sensor_hndl);
+ uint32_t rid = sensor_get_rid(sensor_hndl);
+ struct dts dts = {0};
+ int64_t rc;
+
+ if (attr > SENSOR_DTS_ATTR_TEMP_TRIP)
+ return OPAL_PARAMETER;
+
+ memset(&dts, 0, sizeof(struct dts));
+
+ switch (sensor_get_frc(sensor_hndl)) {
+ case SENSOR_DTS_CORE_TEMP:
+ rc = dts_read_core_temp(rid, &dts, attr, token, sensor_data);
+ break;
+ case SENSOR_DTS_MEM_TEMP:
+ rc = dts_read_mem_temp(centaur_get_id(rid), &dts);
+ break;
+ default:
+ rc = OPAL_PARAMETER;
+ break;
+ }
+ if (rc)
+ return rc;
+
+ if (attr == SENSOR_DTS_ATTR_TEMP_MAX)
+ *sensor_data = cpu_to_be64(dts.temp);
+ else if (attr == SENSOR_DTS_ATTR_TEMP_TRIP)
+ *sensor_data = cpu_to_be64(dts.trip);
+
+ return 0;
+}
+
+/*
+ * We only have two bytes for the resource identifier in the sensor
+ * handler. Let's truncate the centaur chip id to squeeze it in.
+ *
+ * Centaur chip IDs are using the XSCOM "partID" encoding described in
+ * xscom.h. recap:
+ *
+ * 0b1000.0000.0000.0000.0000.00NN.NCCC.MMMM
+ * N=Node, C=Chip, M=Memory Channel
+ */
+#define centaur_make_id(cen_id, dimm_id) \
+ (((chip_id) & 0x3ff) | ((dimm_id) << 10))
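+/* e.g. (illustrative) centaur chip id 0x80000005 with dimm_id 0
+ * truncates to resource id 0x005. */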
+
+#define core_handler(core_id, attr_id) \
+ sensor_make_handler(SENSOR_DTS, SENSOR_DTS_CORE_TEMP, \
+ core_id, attr_id)
+
+#define cen_handler(cen_id, attr_id) \
+ sensor_make_handler(SENSOR_DTS, SENSOR_DTS_MEM_TEMP, \
+ centaur_make_id(chip_id, 0), attr_id)
+
+bool dts_sensor_create_nodes(struct dt_node *sensors)
+{
+ struct proc_chip *chip;
+ struct dt_node *cn;
+ char name[64];
+
+ /* build the device tree nodes :
+ *
+ * sensors/core-temp@pir
+ *
+ * The core is identified by its PIR, which is stored in the resource
+ * number of the sensor handler.
+ */
+ for_each_chip(chip) {
+ struct cpu_thread *c;
+
+ for_each_available_core_in_chip(c, chip->id) {
+ struct dt_node *node;
+ uint32_t handler;
+
+ snprintf(name, sizeof(name), "core-temp@%x", c->pir);
+
+ handler = core_handler(c->pir, SENSOR_DTS_ATTR_TEMP_MAX);
+ node = dt_new(sensors, name);
+ dt_add_property_string(node, "compatible",
+ "ibm,opal-sensor");
+ dt_add_property_cells(node, "sensor-data", handler);
+ handler = core_handler(c->pir, SENSOR_DTS_ATTR_TEMP_TRIP);
+ dt_add_property_cells(node, "sensor-status", handler);
+ dt_add_property_string(node, "sensor-type", "temp");
+ dt_add_property_cells(node, "ibm,pir", c->pir);
+ dt_add_property_cells(node, "reg", handler);
+ dt_add_property_string(node, "label", "Core");
+ init_timer(&c->dts_timer, dts_async_read_temp, c);
+ c->dts_read_in_progress = false;
+ }
+ }
+
+ /*
+ * sensors/mem-temp@chip for Centaurs
+ */
+ dt_for_each_compatible(dt_root, cn, "ibm,centaur") {
+ uint32_t chip_id;
+ struct dt_node *node;
+ uint32_t handler;
+
+ chip_id = dt_prop_get_u32(cn, "ibm,chip-id");
+
+ snprintf(name, sizeof(name), "mem-temp@%x", chip_id);
+
+ handler = cen_handler(chip_id, SENSOR_DTS_ATTR_TEMP_MAX);
+ node = dt_new(sensors, name);
+ dt_add_property_string(node, "compatible",
+ "ibm,opal-sensor");
+ dt_add_property_cells(node, "sensor-data", handler);
+
+ handler = cen_handler(chip_id, SENSOR_DTS_ATTR_TEMP_TRIP);
+ dt_add_property_cells(node, "sensor-status", handler);
+ dt_add_property_string(node, "sensor-type", "temp");
+ dt_add_property_cells(node, "ibm,chip-id", chip_id);
+ dt_add_property_cells(node, "reg", handler);
+ dt_add_property_string(node, "label", "Centaur");
+ }
+
+ return true;
+}
diff --git a/roms/skiboot/hw/fake-nvram.c b/roms/skiboot/hw/fake-nvram.c
new file mode 100644
index 000000000..44adde4a3
--- /dev/null
+++ b/roms/skiboot/hw/fake-nvram.c
@@ -0,0 +1,49 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/* Copyright 2013-2017 IBM Corp. */
+
+#include <skiboot.h>
+#include <opal.h>
+#include <mem_region.h>
+#include <lock.h>
+
+static struct mem_region *nvram_region;
+static struct lock fake_nvram_lock = LOCK_UNLOCKED;
+
+int fake_nvram_info(uint32_t *total_size)
+{
+ nvram_region = find_mem_region("ibm,fake-nvram");
+
+ if (!nvram_region)
+ return OPAL_HARDWARE;
+
+ *total_size = nvram_region->len;
+
+ return OPAL_SUCCESS;
+}
+
+int fake_nvram_start_read(void *dst, uint32_t src, uint32_t len)
+{
+ if (!nvram_region)
+ return -ENODEV;
+
+ lock(&fake_nvram_lock);
+ memcpy(dst, (void *) (nvram_region->start + src), len);
+ unlock(&fake_nvram_lock);
+
+ nvram_read_complete(true);
+
+ return 0;
+}
+
+int fake_nvram_write(uint32_t offset, void *src, uint32_t size)
+{
+ if (!nvram_region)
+ return OPAL_HARDWARE;
+
+ lock(&fake_nvram_lock);
+ memcpy((void *) (nvram_region->start + offset), src, size);
+ unlock(&fake_nvram_lock);
+
+ return 0;
+}
+
diff --git a/roms/skiboot/hw/fake-rtc.c b/roms/skiboot/hw/fake-rtc.c
new file mode 100644
index 000000000..3f083050c
--- /dev/null
+++ b/roms/skiboot/hw/fake-rtc.c
@@ -0,0 +1,100 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/* Copyright 2013-2017 IBM Corp. */
+
+#include <skiboot.h>
+#include <opal.h>
+#include <mem_region.h>
+#include <device.h>
+#include <timebase.h>
+#include <time-utils.h>
+#include <lock.h>
+
+/* timebase when tm_offset was assigned */
+static unsigned long tb_synctime;
+
+/*
+ * Absolute time that was last assigned.
+ * Current rtc value is calculated from this.
+ */
+static struct tm tm_offset;
+
+/* protects tm_offset & tb_synctime */
+static struct lock emulation_lock;
+
+static int64_t fake_rtc_write(uint32_t ymd, uint64_t hmsm)
+{
+
+ lock(&emulation_lock);
+
+ datetime_to_tm(ymd, hmsm, &tm_offset);
+ tb_synctime = mftb();
+
+ unlock(&emulation_lock);
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t fake_rtc_read(__be32 *__ymd, __be64 *__hmsm)
+{
+
+ time_t sec;
+ struct tm tm_calculated;
+ uint32_t ymd;
+ uint64_t hmsm;
+
+ if (!__ymd || !__hmsm)
+ return OPAL_PARAMETER;
+
+ /* Compute the emulated clock value */
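+ /* Emulated time = last written offset + timebase ticks elapsed since
+ * that write, converted to seconds. */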
+ lock(&emulation_lock);
+
+ sec = tb_to_secs(mftb() - tb_synctime) + mktime(&tm_offset);
+ gmtime_r(&sec, &tm_calculated);
+ tm_to_datetime(&tm_calculated, &ymd, &hmsm);
+
+ unlock(&emulation_lock);
+
+ *__ymd = cpu_to_be32(ymd);
+ *__hmsm = cpu_to_be64(hmsm);
+
+ return OPAL_SUCCESS;
+}
+
+void fake_rtc_init(void)
+{
+ struct mem_region *rtc_region = NULL;
+ uint32_t *rtc = NULL, *fake_ymd;
+ uint64_t *fake_hmsm;
+ struct dt_node *np;
+
+ /* Read initial values from reserved memory */
+ rtc_region = find_mem_region("ibm,fake-rtc");
+
+ /* Should we register anyway? */
+ if (!rtc_region) {
+ prlog(PR_TRACE, "No initial RTC value found\n");
+ return;
+ }
+
+ init_lock(&emulation_lock);
+
+ /* Fetch the initial rtc values */
+ rtc = (uint32_t *) rtc_region->start;
+
+ fake_ymd = rtc;
+ fake_hmsm = ((uint64_t *) &rtc[1]);
+
+ fake_rtc_write(*fake_ymd, *fake_hmsm);
+
+ /* Register opal calls */
+ opal_register(OPAL_RTC_READ, fake_rtc_read, 2);
+ opal_register(OPAL_RTC_WRITE, fake_rtc_write, 2);
+
+ /* add the fake rtc dt node */
+ np = dt_new(opal_node, "rtc");
+ dt_add_property_strings(np, "compatible", "ibm,opal-rtc");
+
+ prlog(PR_TRACE, "Init fake RTC to Date:%d-%d-%d Time:%d-%d-%d\n",
+ tm_offset.tm_mon, tm_offset.tm_mday, tm_offset.tm_year,
+ tm_offset.tm_hour, tm_offset.tm_min, tm_offset.tm_sec);
+}
diff --git a/roms/skiboot/hw/fsi-master.c b/roms/skiboot/hw/fsi-master.c
new file mode 100644
index 000000000..410542a19
--- /dev/null
+++ b/roms/skiboot/hw/fsi-master.c
@@ -0,0 +1,675 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/* Copyright 2013-2017 IBM Corp. */
+
+#include <skiboot.h>
+#include <xscom.h>
+#include <lock.h>
+#include <timebase.h>
+#include <chip.h>
+#include <fsi-master.h>
+
+/*
+ * FSI Masters sit on OPB busses behind PIB2OPB bridges
+ *
+ * There are two cMFSI behind two different bridges at
+ * different XSCOM addresses. For now we don't have them in
+ * the device-tree so we hard code the address
+ */
+#define PIB2OPB_MFSI0_ADDR 0x20000
+#define PIB2OPB_MFSI1_ADDR 0x30000
+
+/*
+ * Bridge registers on XSCOM that allow generation
+ * of OPB cycles
+ */
+#define PIB2OPB_REG_CMD 0x0
+#define OPB_CMD_WRITE 0x80000000
+#define OPB_CMD_READ 0x00000000
+#define OPB_CMD_8BIT 0x00000000
+#define OPB_CMD_16BIT 0x20000000
+#define OPB_CMD_32BIT 0x60000000
+#define PIB2OPB_REG_STAT 0x1
+#define OPB_STAT_ANY_ERR 0x80000000
+#define OPB_STAT_ERR_OPB 0x7FEC0000
+#define OPB_STAT_ERRACK 0x00100000
+#define OPB_STAT_BUSY 0x00010000
+#define OPB_STAT_READ_VALID 0x00020000
+#define OPB_STAT_ERR_CMFSI 0x0000FC00
+#define OPB_STAT_ERR_HMFSI 0x000000FC
+#define OPB_STAT_ERR_BASE (OPB_STAT_ANY_ERR | \
+ OPB_STAT_ERR_OPB | \
+ OPB_STAT_ERRACK)
+#define PIB2OPB_REG_LSTAT 0x2
+#define PIB2OPB_REG_RESET 0x4
+#define PIB2OPB_REG_cRSIC 0x5
+#define PIB2OPB_REG_cRSIM 0x6
+#define PIB2OPB_REG_cRSIS 0x7
+#define PIB2OPB_REG_hRSIC 0x8
+#define PIB2OPB_REG_hRSIM 0x9
+#define PIB2OPB_REG_hRSIS 0xA
+
+/* Low level errors from OPB contain the status in the bottom 32-bit
+ * and one of these in the top 32-bit
+ */
+#define OPB_ERR_XSCOM_ERR 0x100000000ull
+#define OPB_ERR_TIMEOUT_ERR 0x200000000ull
+#define OPB_ERR_BAD_OPB_ADDR 0x400000000ull
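+/*
+ * e.g. (illustrative) a poll timeout is reported as OPB_ERR_TIMEOUT_ERR
+ * ORed with whatever error bits were set in the low 32-bit OPB status.
+ */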
+
+/*
+ * PIB2OPB 0 has 2 MFSIs, cMFSI and hMFSI, PIB2OPB 1 only
+ * has cMFSI
+ */
+#define cMFSI_OPB_PORTS_BASE 0x40000
+#define cMFSI_OPB_REG_BASE 0x03000
+#define hMFSI_OPB_PORTS_BASE 0x80000
+#define hMFSI_OPB_REG_BASE 0x03400
+#define MFSI_OPB_PORT_STRIDE 0x08000
+
+/* MFSI control registers */
+#define MFSI_REG_MSTAP(__n) (0x0D0 + (__n) * 4)
+#define MFSI_REG_MATRB0 0x1D8
+#define MFSI_REG_MDTRB0 0x1DC
+#define MFSI_REG_MESRB0 0x1D0
+#define MFSI_REG_MAESP0 0x050
+#define MFSI_REG_MAEB 0x070
+#define MFSI_REG_MSCSB0 0x1D4
+
+/* FSI Slave registers */
+#define FSI_SLAVE_REGS 0x000800 /**< FSI Slave Register */
+#define FSI_SMODE (FSI_SLAVE_REGS | 0x00)
+#define FSI_SLBUS (FSI_SLAVE_REGS | 0x30)
+#define FSI_SLRES (FSI_SLAVE_REGS | 0x34)
+
+#define FSI2PIB_ENGINE 0x001000 /**< FSI2PIB Engine (SCOM) */
+#define FSI2PIB_RESET (FSI2PIB_ENGINE | 0x18)
+#define FSI2PIB_STATUS (FSI2PIB_ENGINE | 0x1C)
+#define FSI2PIB_COMPMASK (FSI2PIB_ENGINE | 0x30)
+#define FSI2PIB_TRUEMASK (FSI2PIB_ENGINE | 0x34)
+
+struct mfsi {
+ uint32_t chip_id;
+ uint32_t unit;
+ uint32_t xscom_base;
+ uint32_t ports_base;
+ uint32_t reg_base;
+ uint32_t err_bits;
+};
+
+#define mfsi_log(__lev, __m, __fmt, ...) \
+ prlog(__lev, "MFSI %x:%x: " __fmt, __m->chip_id, __m->unit, ##__VA_ARGS__)
+/*
+ * Use a global FSI lock for now. Beware of re-entrancy
+ * if we ever add support for normal chip XSCOM via FSI, in
+ * which case we'll probably have to consider either per chip
+ * lock (which can have AB->BA deadlock issues) or a re-entrant
+ * global lock or something else. ...
+ */
+static struct lock fsi_lock = LOCK_UNLOCKED;
+
+/*
+ * OPB accessors
+ */
+
+/* We try up to 1.2ms for an OPB access */
+#define MFSI_OPB_MAX_TRIES 1200
+
+static uint64_t mfsi_opb_poll(struct mfsi *mfsi, uint32_t *read_data)
+{
+ unsigned long retries = MFSI_OPB_MAX_TRIES;
+ uint64_t sval;
+ uint32_t stat;
+ int64_t rc;
+
+ /* We retry every 1us, for a bit more than 1ms in total */
+ for (;;) {
+ /* Read OPB status register */
+ rc = xscom_read(mfsi->chip_id, mfsi->xscom_base + PIB2OPB_REG_STAT, &sval);
+ if (rc) {
+ /* Do something here ? */
+ mfsi_log(PR_ERR, mfsi, "XSCOM error %lld read OPB STAT\n", rc);
+ return OPB_ERR_XSCOM_ERR;
+ }
+ mfsi_log(PR_INSANE, mfsi, " STAT=0x%16llx...\n", sval);
+
+ stat = sval >> 32;
+
+ /* Complete */
+ if (!(stat & OPB_STAT_BUSY))
+ break;
+ if (retries-- == 0) {
+ /* This isn't supposed to happen (HW timeout) */
+ mfsi_log(PR_ERR, mfsi, "OPB POLL timeout !\n");
+ return OPB_ERR_TIMEOUT_ERR | (stat & mfsi->err_bits);
+ }
+ time_wait_us(1);
+ }
+
+ /* Did we have an error ? */
+ if (stat & mfsi->err_bits)
+ return stat & mfsi->err_bits;
+
+ if (read_data) {
+ if (!(stat & OPB_STAT_READ_VALID)) {
+ mfsi_log(PR_ERR, mfsi, "Read successful but no data !\n");
+
+ /* What to do here? Can it actually happen? */
+ sval = 0xffffffff;
+ }
+ *read_data = sval & 0xffffffff;
+ }
+
+ return 0;
+}
+
+static uint64_t mfsi_opb_read(struct mfsi *mfsi, uint32_t opb_addr, uint32_t *data)
+{
+ uint64_t opb_cmd = OPB_CMD_READ | OPB_CMD_32BIT;
+ int64_t rc;
+
+ if (opb_addr > 0x00ffffff)
+ return OPB_ERR_BAD_OPB_ADDR;
+
+ opb_cmd |= opb_addr;
+ opb_cmd <<= 32;
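+ /* e.g. (illustrative) a 32-bit read of OPB address 0x40000 yields the
+ * command word 0x6004000000000000. */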
+
+ mfsi_log(PR_INSANE, mfsi, "MFSI_OPB_READ: Writing 0x%16llx to XSCOM %x\n",
+ opb_cmd, mfsi->xscom_base);
+
+ rc = xscom_write(mfsi->chip_id, mfsi->xscom_base + PIB2OPB_REG_CMD, opb_cmd);
+ if (rc) {
+ mfsi_log(PR_ERR, mfsi, "XSCOM error %lld writing OPB CMD\n", rc);
+ return OPB_ERR_XSCOM_ERR;
+ }
+ return mfsi_opb_poll(mfsi, data);
+}
+
+static uint64_t mfsi_opb_write(struct mfsi *mfsi, uint32_t opb_addr, uint32_t data)
+{
+ uint64_t opb_cmd = OPB_CMD_WRITE | OPB_CMD_32BIT;
+ int64_t rc;
+
+ if (opb_addr > 0x00ffffff)
+ return OPB_ERR_BAD_OPB_ADDR;
+
+ opb_cmd |= opb_addr;
+ opb_cmd <<= 32;
+ opb_cmd |= data;
+
+ mfsi_log(PR_INSANE, mfsi, "MFSI_OPB_WRITE: Writing 0x%16llx to XSCOM %x\n",
+ opb_cmd, mfsi->xscom_base);
+
+ rc = xscom_write(mfsi->chip_id, mfsi->xscom_base + PIB2OPB_REG_CMD, opb_cmd);
+ if (rc) {
+ mfsi_log(PR_ERR, mfsi, "XSCOM error %lld writing OPB CMD\n", rc);
+ return OPB_ERR_XSCOM_ERR;
+ }
+ return mfsi_opb_poll(mfsi, NULL);
+}
+
+static struct mfsi *mfsi_get(uint32_t chip_id, uint32_t unit)
+{
+ struct proc_chip *chip = get_chip(chip_id);
+ struct mfsi *mfsi;
+
+ if (!chip || unit > MFSI_hMFSI0)
+ return NULL;
+ mfsi = &chip->fsi_masters[unit];
+ if (mfsi->xscom_base == 0)
+ return NULL;
+ return mfsi;
+}
+
+static int64_t mfsi_reset_pib2opb(struct mfsi *mfsi)
+{
+ uint64_t stat;
+ int64_t rc;
+
+ rc = xscom_write(mfsi->chip_id,
+ mfsi->xscom_base + PIB2OPB_REG_RESET, (1ul << 63));
+ if (rc) {
+ mfsi_log(PR_ERR, mfsi, "XSCOM error %lld resetting PIB2OPB\n", rc);
+ return rc;
+ }
+ rc = xscom_write(mfsi->chip_id,
+ mfsi->xscom_base + PIB2OPB_REG_STAT, (1ul << 63));
+ if (rc) {
+ mfsi_log(PR_ERR, mfsi, "XSCOM error %lld resetting status\n", rc);
+ return rc;
+ }
+ rc = xscom_read(mfsi->chip_id,
+ mfsi->xscom_base + PIB2OPB_REG_STAT, &stat);
+ if (rc) {
+ mfsi_log(PR_ERR, mfsi, "XSCOM error %lld reading status\n", rc);
+ return rc;
+ }
+ return 0;
+}
+
+
+static void mfsi_dump_pib2opb_state(struct mfsi *mfsi)
+{
+ uint64_t val;
+
+ /* Dump a bunch of registers */
+ if (xscom_read(mfsi->chip_id, mfsi->xscom_base + PIB2OPB_REG_CMD, &val))
+ goto xscom_error;
+ mfsi_log(PR_ERR, mfsi, " PIB2OPB CMD = %016llx\n", val);
+ if (xscom_read(mfsi->chip_id, mfsi->xscom_base + PIB2OPB_REG_STAT, &val))
+ goto xscom_error;
+ mfsi_log(PR_ERR, mfsi, " PIB2OPB STAT = %016llx\n", val);
+ if (xscom_read(mfsi->chip_id, mfsi->xscom_base + PIB2OPB_REG_LSTAT, &val))
+ goto xscom_error;
+ mfsi_log(PR_ERR, mfsi, " PIB2OPB LSTAT = %016llx\n", val);
+
+ if (mfsi->unit == MFSI_cMFSI0 || mfsi->unit == MFSI_cMFSI1) {
+ if (xscom_read(mfsi->chip_id, mfsi->xscom_base + PIB2OPB_REG_cRSIC, &val))
+ goto xscom_error;
+ mfsi_log(PR_ERR, mfsi, " PIB2OPB cRSIC = %016llx\n", val);
+ if (xscom_read(mfsi->chip_id, mfsi->xscom_base + PIB2OPB_REG_cRSIM, &val))
+ goto xscom_error;
+ mfsi_log(PR_ERR, mfsi, " PIB2OPB cRSIM = %016llx\n", val);
+ if (xscom_read(mfsi->chip_id, mfsi->xscom_base + PIB2OPB_REG_cRSIS, &val))
+ goto xscom_error;
+ mfsi_log(PR_ERR, mfsi, " PIB2OPB cRSIS = %016llx\n", val);
+ } else if (mfsi->unit == MFSI_hMFSI0) {
+ if (xscom_read(mfsi->chip_id, mfsi->xscom_base + PIB2OPB_REG_hRSIC, &val))
+ goto xscom_error;
+ mfsi_log(PR_ERR, mfsi, " PIB2OPB hRSIC = %016llx\n", val);
+ if (xscom_read(mfsi->chip_id, mfsi->xscom_base + PIB2OPB_REG_hRSIM, &val))
+ goto xscom_error;
+ mfsi_log(PR_ERR, mfsi, " PIB2OPB hRSIM = %016llx\n", val);
+ if (xscom_read(mfsi->chip_id, mfsi->xscom_base + PIB2OPB_REG_hRSIS, &val))
+ goto xscom_error;
+ mfsi_log(PR_ERR, mfsi, " PIB2OPB hRSIS = %016llx\n", val);
+ }
+ return;
+ xscom_error:
+ mfsi_log(PR_ERR, mfsi, "XSCOM error reading PIB2OPB registers\n");
+}
+
+static int64_t mfsi_dump_ctrl_regs(struct mfsi *mfsi)
+{
+ uint64_t opb_stat;
+ uint32_t i;
+
+ /* List of registers to dump (from HB) */
+ static uint32_t dump_regs[] = {
+ MFSI_REG_MATRB0,
+ MFSI_REG_MDTRB0,
+ MFSI_REG_MESRB0,
+ MFSI_REG_MAESP0,
+ MFSI_REG_MAEB,
+ MFSI_REG_MSCSB0,
+ };
+ static const char *dump_regs_names[] = {
+ "MFSI_REG_MATRB0",
+ "MFSI_REG_MDTRB0",
+ "MFSI_REG_MESRB0",
+ "MFSI_REG_MAESP0",
+ "MFSI_REG_MAEB ",
+ "MFSI_REG_MSCSB0",
+ };
+ for (i = 0; i < ARRAY_SIZE(dump_regs); i++) {
+ uint32_t val;
+
+ opb_stat = mfsi_opb_read(mfsi, mfsi->reg_base + dump_regs[i], &val);
+ if (opb_stat) {
+ /* Error on dump, give up */
+ mfsi_log(PR_ERR, mfsi, " OPB stat 0x%016llx dumping reg %x\n",
+ opb_stat, dump_regs[i]);
+ return OPAL_HARDWARE;
+ }
+ mfsi_log(PR_ERR, mfsi, " %s = %08x\n", dump_regs_names[i], val);
+ }
+ for (i = 0; i < 8; i++) {
+ uint32_t val;
+
+ opb_stat = mfsi_opb_read(mfsi, mfsi->reg_base + MFSI_REG_MSTAP(i), &val);
+ if (opb_stat) {
+ /* Error on dump, give up */
+ mfsi_log(PR_ERR, mfsi, " OPB stat 0x%016llx dumping reg %x\n",
+ opb_stat, MFSI_REG_MSTAP(i));
+ return OPAL_HARDWARE;
+ }
+ mfsi_log(PR_ERR, mfsi, " MFSI_REG_MSTAP%d = %08x\n", i, val);
+ }
+ return OPAL_SUCCESS;
+}
+
+static int64_t mfsi_master_cleanup(struct mfsi *mfsi, uint32_t port)
+{
+ uint64_t opb_stat;
+ uint32_t port_base, compmask, truemask;
+
+ /* Reset the bridge to clear up the residual errors */
+
+ /* bit0 = Bridge: General reset */
+ opb_stat = mfsi_opb_write(mfsi, mfsi->reg_base + MFSI_REG_MESRB0, 0x80000000u);
+ if (opb_stat) {
+ mfsi_log(PR_ERR, mfsi, " OPB stat 0x%016llx writing reset to MESRB0\n",
+ opb_stat);
+ return OPAL_HARDWARE;
+ }
+
+ /* Calculate base address of port */
+ port_base = mfsi->ports_base + port * MFSI_OPB_PORT_STRIDE;
+
+ /* Perform error reset on Centaur fsi slave: */
+ /* write 0x4000000 to addr=834 */
+ opb_stat = mfsi_opb_write(mfsi, port_base + FSI_SLRES, 0x04000000);
+ if (opb_stat) {
+ mfsi_log(PR_ERR, mfsi,
+ " OPB stat 0x%016llx writing reset to FSI slave\n",
+ opb_stat);
+ return OPAL_HARDWARE;
+ }
+
+ /* A further step is to issue a PIB reset to the FSI2PIB engine
+ * that is in the busy state, i.e. write arbitrary data to the 101c
+ * (putcfam 1007) register of the previously failed FSI2PIB
+ * engine on Centaur.
+ *
+ * XXX BenH: Should that be done by the upper FSI XSCOM layer ?
+ */
+ opb_stat = mfsi_opb_write(mfsi, port_base + FSI2PIB_STATUS, 0xFFFFFFFF);
+ if (opb_stat) {
+ mfsi_log(PR_ERR, mfsi,
+ " OPB stat 0x%016llx clearing FSI2PIB_STATUS\n",
+ opb_stat);
+ return OPAL_HARDWARE;
+ }
+
+ /* Need to save/restore the true/comp masks or the FSP (PRD ?) will
+ * get annoyed
+ */
+ opb_stat = mfsi_opb_read(mfsi, port_base + FSI2PIB_COMPMASK, &compmask);
+ if (opb_stat) {
+ mfsi_log(PR_ERR, mfsi,
+ " OPB stat 0x%016llx reading FSI2PIB_COMPMASK\n",
+ opb_stat);
+ return OPAL_HARDWARE;
+ }
+ opb_stat = mfsi_opb_read(mfsi, port_base + FSI2PIB_TRUEMASK, &truemask);
+ if (opb_stat) {
+ mfsi_log(PR_ERR, mfsi,
+ " OPB stat 0x%016llx reading FSI2PIB_TRUEMASK\n",
+ opb_stat);
+ return OPAL_HARDWARE;
+ }
+
+ /* Then, write arbitrary data to 1018 (putcfam 1006) to
+ * reset any pending FSI2PIB errors.
+ */
+ opb_stat = mfsi_opb_write(mfsi, port_base + FSI2PIB_RESET, 0xFFFFFFFF);
+ if (opb_stat) {
+ mfsi_log(PR_ERR, mfsi,
+ " OPB stat 0x%016llx writing FSI2PIB_RESET\n",
+ opb_stat);
+ return OPAL_HARDWARE;
+ }
+
+ /* Restore the true/comp masks */
+ opb_stat = mfsi_opb_write(mfsi, port_base + FSI2PIB_COMPMASK, compmask);
+ if (opb_stat) {
+ mfsi_log(PR_ERR, mfsi,
+ " OPB stat 0x%016llx writing FSI2PIB_COMPMASK\n",
+ opb_stat);
+ return OPAL_HARDWARE;
+ }
+ opb_stat = mfsi_opb_write(mfsi, port_base + FSI2PIB_TRUEMASK, truemask);
+ if (opb_stat) {
+ mfsi_log(PR_ERR, mfsi,
+ " OPB stat 0x%016llx writing FSI2PIB_TRUEMASK\n",
+ opb_stat);
+ return OPAL_HARDWARE;
+ }
+ return OPAL_SUCCESS;
+}
+
+static int64_t mfsi_analyse_fsi_error(struct mfsi *mfsi)
+{
+ uint64_t opb_stat;
+ uint32_t mesrb0;
+
+ /* Most of the code below is adapted from HB. The main difference is
+ * that we don't gard
+ */
+
+ /* Read MESRB0 */
+ opb_stat = mfsi_opb_read(mfsi, mfsi->reg_base + MFSI_REG_MESRB0, &mesrb0);
+ if (opb_stat) {
+ mfsi_log(PR_ERR, mfsi, " OPB stat 0x%016llx reading MESRB0\n", opb_stat);
+ return OPAL_HARDWARE;
+ }
+ mfsi_log(PR_ERR, mfsi, " MESRB0=%08x\n", mesrb0);
+
+ /* bits 8:15 are internal parity errors in the master */
+ if (mesrb0 & 0x00FF0000) {
+ mfsi_log(PR_ERR, mfsi, " Master parity error !\n");
+ } else {
+ /* bits 0:3 are a specific error code */
+ switch ((mesrb0 & 0xF0000000) >> 28) {
+ case 0x1: /* OPB error */
+ case 0x2: /* Invalid state of OPB state machine */
+ /* error is inside the OPB logic */
+ mfsi_log(PR_ERR, mfsi, " OPB logic error !\n");
+ break;
+ case 0x3: /* Port access error */
+ /* probably some kind of code collision */
+ /* could also be something weird in the chip */
+ mfsi_log(PR_ERR, mfsi, " Port access error !\n");
+ break;
+ case 0x4: /* ID mismatch */
+ mfsi_log(PR_ERR, mfsi, " Port ID mismatch !\n");
+ break;
+ case 0x6: /* port timeout error */
+ mfsi_log(PR_ERR, mfsi, " Port timeout !\n");
+ break;
+ case 0x7: /* master timeout error */
+ mfsi_log(PR_ERR, mfsi, " Master timeout !\n");
+ break;
+ case 0x9: /* Any error response from Slave */
+ mfsi_log(PR_ERR, mfsi, " Slave error response !\n");
+ break;
+ case 0xC: /* bridge parity error */
+ mfsi_log(PR_ERR, mfsi, " Bridge parity error !\n");
+ break;
+ case 0xB: /* protocol error */
+ mfsi_log(PR_ERR, mfsi, " Protocol error !\n");
+ break;
+ case 0x8: /* master CRC error */
+ mfsi_log(PR_ERR, mfsi, " Master CRC error !\n");
+ break;
+ case 0xA: /* Slave CRC error */
+ mfsi_log(PR_ERR, mfsi, " Slave CRC error !\n");
+ break;
+ default:
+ mfsi_log(PR_ERR, mfsi, " Unknown error !\n");
+ break;
+ }
+ }
+ return OPAL_SUCCESS;
+}
+
+static int64_t mfsi_handle_error(struct mfsi *mfsi, uint32_t port,
+ uint64_t opb_stat, uint32_t fsi_addr)
+{
+ int rc;
+ bool found_root_cause = false;
+
+ mfsi_log(PR_ERR, mfsi, "Access error on port %d, stat=%012llx\n",
+ port, opb_stat);
+
+ /* First handle stat codes we synthesized */
+ if (opb_stat & OPB_ERR_XSCOM_ERR)
+ return OPAL_HARDWARE;
+ if (opb_stat & OPB_ERR_BAD_OPB_ADDR)
+ return OPAL_PARAMETER;
+
+ /* Dump a bunch of registers from PIB2OPB and reset it */
+ mfsi_dump_pib2opb_state(mfsi);
+
+ /* Reset PIB2OPB */
+ mfsi_reset_pib2opb(mfsi);
+
+ /* This one is not supposed to happen but ... */
+ if (opb_stat & OPB_ERR_TIMEOUT_ERR)
+ return OPAL_HARDWARE;
+
+ /* Dump some FSI control registers */
+ rc = mfsi_dump_ctrl_regs(mfsi);
+
+ /* If that failed, reset PIB2OPB again and return */
+ if (rc) {
+ mfsi_dump_pib2opb_state(mfsi);
+ mfsi_reset_pib2opb(mfsi);
+ return OPAL_HARDWARE;
+ }
+
+ /* Now check for known root causes (from HB) */
+
+ /* First check if it's a ctrl register access error and we got an OPB NACK,
+ * which means an out of bounds control reg
+ */
+ if ((opb_stat & OPB_STAT_ERRACK) &&
+ ((fsi_addr & ~0x2ffu) == mfsi->reg_base)) {
+ mfsi_log(PR_ERR, mfsi, " Error appears to be out of bounds reg %08x\n",
+ fsi_addr);
+ found_root_cause = true;
+ }
+ /* Else check for other OPB errors */
+ else if (opb_stat & OPB_STAT_ERR_OPB) {
+ mfsi_log(PR_ERR, mfsi, " Error appears to be an OPB error\n");
+ found_root_cause = true;
+ }
+
+ /* Root cause not found, dig into FSI logic */
+ if (!found_root_cause) {
+ rc = mfsi_analyse_fsi_error(mfsi);
+ if (rc) {
+ /* If that failed too, reset the PIB2OPB again */
+ mfsi_reset_pib2opb(mfsi);
+ }
+ }
+
+ /* Cleanup MFSI master */
+ mfsi_master_cleanup(mfsi, port);
+
+ return OPAL_HARDWARE;
+}
+
+int64_t mfsi_read(uint32_t chip, uint32_t unit, uint32_t port,
+ uint32_t fsi_addr, uint32_t *data)
+{
+ struct mfsi *mfsi = mfsi_get(chip, unit);
+ uint32_t port_addr;
+ uint64_t opb_stat;
+ int64_t rc = OPAL_SUCCESS;
+
+ if (!mfsi || port > 7)
+ return OPAL_PARAMETER;
+
+ lock(&fsi_lock);
+
+ /* Calculate port address */
+ port_addr = mfsi->ports_base + port * MFSI_OPB_PORT_STRIDE;
+ port_addr += fsi_addr;
+
+ /* Perform OPB access */
+ opb_stat = mfsi_opb_read(mfsi, port_addr, data);
+ if (opb_stat)
+ rc = mfsi_handle_error(mfsi, port, opb_stat, port_addr);
+
+ unlock(&fsi_lock);
+
+ return rc;
+}
+
+int64_t mfsi_write(uint32_t chip, uint32_t unit, uint32_t port,
+ uint32_t fsi_addr, uint32_t data)
+{
+ struct mfsi *mfsi = mfsi_get(chip, unit);
+ uint32_t port_addr;
+ uint64_t opb_stat;
+ int64_t rc = OPAL_SUCCESS;
+
+ if (!mfsi || port > 7)
+ return OPAL_PARAMETER;
+
+ lock(&fsi_lock);
+
+ /* Calculate port address */
+ port_addr = mfsi->ports_base + port * MFSI_OPB_PORT_STRIDE;
+ port_addr += fsi_addr;
+
+ /* Perform OPB access */
+ opb_stat = mfsi_opb_write(mfsi, port_addr, data);
+ if (opb_stat)
+ rc = mfsi_handle_error(mfsi, port, opb_stat, port_addr);
+
+ unlock(&fsi_lock);
+
+ return rc;
+}
+
+static void mfsi_add(struct proc_chip *chip, struct mfsi *mfsi, uint32_t unit)
+{
+ mfsi->chip_id = chip->id;
+ mfsi->unit = unit;
+
+ /* We hard code everything for now */
+ switch (unit) {
+ case MFSI_cMFSI0:
+ mfsi->xscom_base = PIB2OPB_MFSI0_ADDR;
+ mfsi->ports_base = cMFSI_OPB_PORTS_BASE;
+ mfsi->reg_base = cMFSI_OPB_REG_BASE;
+ mfsi->err_bits = OPB_STAT_ERR_BASE | OPB_STAT_ERR_CMFSI;
+ break;
+ case MFSI_cMFSI1:
+ mfsi->xscom_base = PIB2OPB_MFSI1_ADDR;
+ mfsi->ports_base = cMFSI_OPB_PORTS_BASE;
+ mfsi->reg_base = cMFSI_OPB_REG_BASE;
+ mfsi->err_bits = OPB_STAT_ERR_BASE | OPB_STAT_ERR_CMFSI;
+ break;
+ case MFSI_hMFSI0:
+ mfsi->xscom_base = PIB2OPB_MFSI0_ADDR;
+ mfsi->ports_base = hMFSI_OPB_PORTS_BASE;
+ mfsi->reg_base = hMFSI_OPB_REG_BASE;
+ mfsi->err_bits = OPB_STAT_ERR_BASE | OPB_STAT_ERR_HMFSI;
+ break;
+ default:
+ /* ??? */
+ return;
+ }
+
+ /* Hardware Bug HW222712 on Murano DD1.0 causes the
+ * any_error bit to be un-clearable so we just
+ * have to ignore it. Additionally, HostBoot applies
+ * this to Venice too, though the comment there claims
+ * this is a Simics workaround.
+ *
+ * The doc says that bit can be safely ignored, so let's
+ * just not bother and always take it out.
+ */
+
+ /* 16: cMFSI any-master-error */
+ /* 24: hMFSI any-master-error */
+ mfsi->err_bits &= 0xFFFF7F7F;
+
+ mfsi_log(PR_INFO, mfsi, "Initialized\n");
+}
+
+void mfsi_init(void)
+{
+ struct proc_chip *chip;
+
+ for_each_chip(chip) {
+ chip->fsi_masters = zalloc(sizeof(struct mfsi) * 3);
+ assert(chip->fsi_masters);
+ mfsi_add(chip, &chip->fsi_masters[MFSI_cMFSI0], MFSI_cMFSI0);
+ mfsi_add(chip, &chip->fsi_masters[MFSI_hMFSI0], MFSI_hMFSI0);
+ mfsi_add(chip, &chip->fsi_masters[MFSI_cMFSI1], MFSI_cMFSI1);
+
+ }
+}
+
diff --git a/roms/skiboot/hw/fsp/Makefile.inc b/roms/skiboot/hw/fsp/Makefile.inc
new file mode 100644
index 000000000..21dc52a9f
--- /dev/null
+++ b/roms/skiboot/hw/fsp/Makefile.inc
@@ -0,0 +1,13 @@
+SUBDIRS += hw/fsp
+
+FSP_OBJS = fsp.o fsp-console.o fsp-rtc.o fsp-nvram.o fsp-sysparam.o
+FSP_OBJS += fsp-surveillance.o fsp-codeupdate.o fsp-sensor.o
+FSP_OBJS += fsp-diag.o fsp-leds.o fsp-mem-err.o fsp-op-panel.o
+FSP_OBJS += fsp-elog-read.o fsp-elog-write.o fsp-epow.o fsp-dpo.o
+FSP_OBJS += fsp-dump.o fsp-sysdump.o fsp-chiptod.o fsp-ipmi.o
+FSP_OBJS += fsp-attn.o fsp-occ.o fsp-psi.o
+FSP = hw/fsp/built-in.a
+
+ifeq ($(CONFIG_FSP),1)
+$(FSP): $(FSP_OBJS:%=hw/fsp/%)
+endif
diff --git a/roms/skiboot/hw/fsp/fsp-attn.c b/roms/skiboot/hw/fsp/fsp-attn.c
new file mode 100644
index 000000000..6e358e0d4
--- /dev/null
+++ b/roms/skiboot/hw/fsp/fsp-attn.c
@@ -0,0 +1,143 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * FSP ATTentioN support
+ *
+ * FSP can grab a bunch of things on host firmware dying,
+ * let's set that up.
+ *
+ * Copyright 2013-2019 IBM Corp.
+ */
+#include <fsp.h>
+#include <skiboot.h>
+#include <fsp-elog.h>
+#include <fsp-attn.h>
+#include <hdata/spira.h>
+#include <stack.h>
+#include <processor.h>
+#include <opal-dump.h>
+
+#define TI_CMD_VALID 0x1 /* Command valid */
+#define TI_CMD 0xA1 /* Terminate Immediate command */
+#define TI_DATA_LEN 0x0400 /* Data length */
+/* Controls dump actions
+ * - Non-destructive hardware dump (bit 0)
+ * - memory dump (bit 1)
+ * - Destructive hardware dump (bit 2)
+ */
+#define TI_DMP_CTL 0x6
+/* Dump type
+ * 0 - Abbreviated hardware dump
+ * 1 - Complete hardware dump
+ * 2 - No hardware dump
+ */
+#define TI_DUMP_TYPE 0x1
+#define TI_FORMAT 0x02 /* SRC format */
+#define TI_SRC_FLAGS 0x0 /* SRC flags */
+#define TI_ASCII_WORDS 0x0 /* Number of ASCII words */
+
+/* HEX words: Number of hex words of data added, up to 8 total;
+ * this value is one more than the actual count.
+ */
+#define TI_HEX_WORDS 0x02
+/* SRC length : 8 byte header, 8 hex words of data and
+ * 32 byte ASCII SRC
+ */
+#define TI_SRC_LEN 0x48
+
+static struct ti_attn *ti_attn;
+
+/* Initialises SP attention area with default values */
+static void init_sp_attn_area(void)
+{
+ /* Already done */
+ if (ti_attn)
+ return;
+
+ /* We are just enabling attention area 1 */
+ ti_attn = (struct ti_attn *)&cpu_ctl_sp_attn_area1;
+
+ /* The attention component checks Attn area 2 first; if it is NULL,
+ * it will check Attn area 1.
+ */
+ memset(&cpu_ctl_sp_attn_area1, 0, sizeof(struct sp_attn_area));
+ memset(&cpu_ctl_sp_attn_area2, 0, sizeof(struct sp_attn_area));
+
+ ti_attn->cmd_valid = TI_CMD_VALID;
+ ti_attn->attn_cmd = TI_CMD;
+ ti_attn->data_len = CPU_TO_BE16(TI_DATA_LEN);
+ /* Dump control byte not used as of now */
+ ti_attn->dump_ctrl = TI_DMP_CTL;
+ ti_attn->dump_type = CPU_TO_BE16(TI_DUMP_TYPE);
+
+ /* SRC format */
+ ti_attn->src_fmt = TI_FORMAT;
+ /* SRC flags */
+ ti_attn->src_flags = TI_SRC_FLAGS;
+ /* #ASCII words */
+ ti_attn->ascii_cnt = TI_ASCII_WORDS;
+ /* #HEX words */
+ ti_attn->hex_cnt = TI_HEX_WORDS;
+ ti_attn->src_len = CPU_TO_BE16(TI_SRC_LEN);
+ snprintf(ti_attn->src, SRC_LEN, "%X", generate_src_from_comp(OPAL_RC_ATTN));
+}
+
+/* Update the SRC in the SP attention area */
+static void update_sp_attn_area(const char *msg)
+{
+#define STACK_BUF_ENTRIES 20
+ struct bt_entry bt_buf[STACK_BUF_ENTRIES];
+ struct bt_metadata metadata;
+ unsigned int len;
+
+ if (!fsp_present())
+ return;
+
+ /* This can be called early */
+ if (!ti_attn)
+ init_sp_attn_area();
+
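+ /* First SRC hex word: low 32 bits of the caller's return address */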
+ ti_attn->src_word[0] =
+ cpu_to_be32((uint32_t)((uint64_t)__builtin_return_address(0) & 0xffffffff));
+
+ snprintf(ti_attn->msg.version, VERSION_LEN, "%s", version);
+ backtrace_create(bt_buf, STACK_BUF_ENTRIES, &metadata);
+ metadata.token = OPAL_LAST + 1;
+ len = BT_FRAME_LEN;
+ backtrace_print(bt_buf, &metadata, ti_attn->msg.bt_buf, &len, false);
+ snprintf(ti_attn->msg.file_info, FILE_INFO_LEN, "%s", msg);
+
+ ti_attn->msg_len = cpu_to_be32(VERSION_LEN + BT_FRAME_LEN +
+ strlen(ti_attn->msg.file_info));
+}
+
+void __attribute__((noreturn)) ibm_fsp_terminate(const char *msg)
+{
+ /* Update SP attention area */
+ update_sp_attn_area(msg);
+
+ /* Update op panel op_display */
+ op_display(OP_FATAL, OP_MOD_CORE, 0x6666);
+
+ /* Save crashing CPU details */
+ opal_mpipl_save_crashing_pir();
+
+ /* XXX FIXME: We should fsp_poll for a while to ensure any pending
+ * console writes have made it out, but until we have decent PSI
+ * link handling we must not do it forever. Polling can prevent the
+ * FSP from bringing the PSI link up and it can get stuck in a
+ * reboot loop.
+ */
+
+ trigger_attn();
+ for (;;) ;
+}
+
+/* Initialises SP attention area */
+void fsp_attn_init(void)
+{
+ if (!fsp_present())
+ return;
+
+ init_sp_attn_area();
+}
diff --git a/roms/skiboot/hw/fsp/fsp-chiptod.c b/roms/skiboot/hw/fsp/fsp-chiptod.c
new file mode 100644
index 000000000..e4ede3c1c
--- /dev/null
+++ b/roms/skiboot/hw/fsp/fsp-chiptod.c
@@ -0,0 +1,69 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * On some chiptod errors, ask the FSP for a new topology
+ *
+ * Copyright 2013-2017 IBM Corp.
+ */
+
+#define pr_fmt(fmt) "CHIPTOD: " fmt
+
+#include <skiboot.h>
+#include <chiptod.h>
+#include <fsp.h>
+
+/* Response status for fsp command 0xE6, s/c 0x06 (Enable/Disable Topology) */
+#define FSP_STATUS_TOPO_IN_USE 0xb8 /* topology is in use */
+
+static bool fsp_chiptod_update_topology(uint32_t cmd_sub_mod,
+ struct fsp_msg *msg)
+{
+ struct fsp_msg *resp;
+ enum chiptod_topology topo;
+ bool action;
+ uint8_t status = 0;
+
+ switch (cmd_sub_mod) {
+ case FSP_CMD_TOPO_ENABLE_DISABLE:
+ /*
+ * Action Values: 0x00 = Disable, 0x01 = Enable
+ * Topology Values: 0x00 = Primary, 0x01 = Secondary
+ */
+ action = !!msg->data.bytes[2];
+ topo = msg->data.bytes[3];
+ prlog(PR_DEBUG, "Topology update event:\n");
+ prlog(PR_DEBUG, " Action = %s, Topology = %s\n",
+ action ? "Enable" : "Disable",
+ topo ? "Secondary" : "Primary");
+
+ if (!chiptod_adjust_topology(topo, action))
+ status = FSP_STATUS_TOPO_IN_USE;
+ else
+ status = 0x00;
+
+ resp = fsp_mkmsg(FSP_RSP_TOPO_ENABLE_DISABLE | status, 0);
+ if (!resp) {
+ prerror("Response allocation failed\n");
+ return false;
+ }
+ if (fsp_queue_msg(resp, fsp_freemsg)) {
+ fsp_freemsg(resp);
+ prerror("Failed to queue response msg\n");
+ return false;
+ }
+ return true;
+ default:
+ prlog(PR_DEBUG, "Unhandled sub cmd: %06x\n", cmd_sub_mod);
+ break;
+ }
+ return false;
+}
+
+static struct fsp_client fsp_chiptod_client = {
+ .message = fsp_chiptod_update_topology,
+};
+
+void fsp_chiptod_init(void)
+{
+ /* Register for Class E6 (HW maintenance) */
+ fsp_register_client(&fsp_chiptod_client, FSP_MCLASS_HW_MAINT);
+}
diff --git a/roms/skiboot/hw/fsp/fsp-codeupdate.c b/roms/skiboot/hw/fsp/fsp-codeupdate.c
new file mode 100644
index 000000000..3cd5b2bc9
--- /dev/null
+++ b/roms/skiboot/hw/fsp/fsp-codeupdate.c
@@ -0,0 +1,1315 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Firmware code update for FSP systems
+ *
+ * Copyright 2013-2018 IBM Corp.
+ */
+
+#include <skiboot.h>
+#include <fsp.h>
+#include <fsp-sysparam.h>
+#include <lock.h>
+#include <device.h>
+#include <ccan/endian/endian.h>
+#include <errorlog.h>
+#include <opal-api.h>
+#include <timebase.h>
+
+#include "fsp-codeupdate.h"
+
+enum flash_state {
+ FLASH_STATE_ABSENT,
+ FLASH_STATE_INVALID, /* IPL side marker lid is invalid */
+ FLASH_STATE_READING,
+ FLASH_STATE_READ,
+ FLASH_STATE_ABORT,
+};
+
+enum lid_fetch_side {
+ FETCH_T_SIDE_ONLY,
+ FETCH_P_SIDE_ONLY,
+ FETCH_BOTH_SIDE,
+};
+
+static enum flash_state flash_state = FLASH_STATE_INVALID;
+static enum lid_fetch_side lid_fetch_side = FETCH_BOTH_SIDE;
+
+/* Image buffers */
+static struct opal_sg_list *image_data;
+static uint32_t tce_start;
+static void *lid_data;
+static char validate_buf[VALIDATE_BUF_SIZE];
+
+/* TCE buffer lock */
+static struct lock flash_lock = LOCK_UNLOCKED;
+
+/* FW VPD data */
+static struct fw_image_vpd fw_vpd[2];
+
+/* Code update related sys parameters */
+static uint32_t ipl_side;
+static uint32_t hmc_managed;
+static uint32_t update_policy;
+static uint32_t in_flight_params;
+
+/* If non-NULL, this gets called just before rebooting */
+int (*fsp_flash_term_hook)(void);
+
+DEFINE_LOG_ENTRY(OPAL_RC_CU_INIT, OPAL_PLATFORM_ERR_EVT, OPAL_CODEUPDATE,
+ OPAL_PLATFORM_FIRMWARE,
+ OPAL_PREDICTIVE_ERR_FAULT_RECTIFY_REBOOT, OPAL_NA);
+
+DEFINE_LOG_ENTRY(OPAL_RC_CU_FLASH, OPAL_PLATFORM_ERR_EVT, OPAL_CODEUPDATE,
+ OPAL_PLATFORM_FIRMWARE,
+ OPAL_PREDICTIVE_ERR_FAULT_RECTIFY_REBOOT, OPAL_NA);
+
+DEFINE_LOG_ENTRY(OPAL_RC_CU_SG_LIST, OPAL_PLATFORM_ERR_EVT, OPAL_CODEUPDATE,
+ OPAL_PLATFORM_FIRMWARE,
+ OPAL_PREDICTIVE_ERR_FAULT_RECTIFY_REBOOT, OPAL_NA);
+
+DEFINE_LOG_ENTRY(OPAL_RC_CU_COMMIT, OPAL_PLATFORM_ERR_EVT, OPAL_CODEUPDATE,
+ OPAL_PLATFORM_FIRMWARE,
+ OPAL_PREDICTIVE_ERR_FAULT_RECTIFY_REBOOT, OPAL_NA);
+
+DEFINE_LOG_ENTRY(OPAL_RC_CU_MSG, OPAL_PLATFORM_ERR_EVT, OPAL_CODEUPDATE,
+ OPAL_PLATFORM_FIRMWARE,
+ OPAL_PREDICTIVE_ERR_FAULT_RECTIFY_REBOOT, OPAL_NA);
+
+DEFINE_LOG_ENTRY(OPAL_RC_CU_NOTIFY, OPAL_PLATFORM_ERR_EVT, OPAL_CODEUPDATE,
+ OPAL_PLATFORM_FIRMWARE,
+ OPAL_PREDICTIVE_ERR_FAULT_RECTIFY_REBOOT, OPAL_NA);
+
+DEFINE_LOG_ENTRY(OPAL_RC_CU_MARKER_LID, OPAL_PLATFORM_ERR_EVT, OPAL_CODEUPDATE,
+ OPAL_PLATFORM_FIRMWARE,
+ OPAL_PREDICTIVE_ERR_FAULT_RECTIFY_REBOOT, OPAL_NA);
+
+static inline void code_update_tce_map(uint32_t tce_offset,
+ void *buffer, uint32_t size)
+{
+ uint32_t tlen = ALIGN_UP(size, TCE_PSIZE);
+
+ fsp_tce_map(PSI_DMA_CODE_UPD + tce_offset, buffer, tlen);
+}
+
+static inline void code_update_tce_unmap(uint32_t size)
+{
+ fsp_tce_unmap(PSI_DMA_CODE_UPD, size);
+}
+
+static inline void set_def_fw_version(uint32_t side)
+{
+ strncpy(fw_vpd[side].mi_keyword, FW_VERSION_UNKNOWN, MI_KEYWORD_SIZE);
+ strncpy(fw_vpd[side].ext_fw_id, FW_VERSION_UNKNOWN, ML_KEYWORD_SIZE);
+}
+
+/*
+ * Get IPL side
+ */
+static void get_ipl_side(void)
+{
+ struct dt_node *iplp;
+ const char *side = NULL;
+
+ iplp = dt_find_by_path(dt_root, "ipl-params/ipl-params");
+ if (iplp)
+ side = dt_prop_get_def(iplp, "cec-ipl-side", NULL);
+ prlog(PR_NOTICE, "CUPD: IPL SIDE = %s\n", side);
+
+ if (!side || !strcmp(side, "temp"))
+ ipl_side = FW_IPL_SIDE_TEMP;
+ else
+ ipl_side = FW_IPL_SIDE_PERM;
+}
+
+
+/*
+ * Helper routines to retrieve code update related
+ * system parameters from FSP.
+ */
+
+static void inc_in_flight_param(void)
+{
+ lock(&flash_lock);
+ in_flight_params++;
+ unlock(&flash_lock);
+}
+
+static void dec_in_flight_param(void)
+{
+ lock(&flash_lock);
+ assert(in_flight_params > 0);
+ in_flight_params--;
+ unlock(&flash_lock);
+}
+
+static void got_code_update_policy(uint32_t param_id __unused, int err_len,
+ void *data __unused)
+{
+ if (err_len != 4) {
+ log_simple_error(&e_info(OPAL_RC_CU_INIT), "CUPD: Error "
+ "retrieving code update policy: %d\n", err_len);
+ } else {
+ update_policy = be32_to_cpu((__be32)update_policy);
+ prlog(PR_NOTICE, "CUPD: Code update policy from FSP: %d\n",
+ update_policy);
+ }
+
+ dec_in_flight_param();
+}
+
+static void get_code_update_policy(void)
+{
+ int rc;
+
+ inc_in_flight_param();
+ rc = fsp_get_sys_param(SYS_PARAM_FLASH_POLICY, &update_policy, 4,
+ got_code_update_policy, NULL);
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_CU_INIT),
+ "CUPD: Error %d queueing param request\n", rc);
+ dec_in_flight_param();
+ }
+}
+
+static void got_platform_hmc_managed(uint32_t param_id __unused, int err_len,
+ void *data __unused)
+{
+ if (err_len != 4) {
+ log_simple_error(&e_info(OPAL_RC_CU_INIT), "CUPD: Error "
+ "retrieving hmc managed status: %d\n", err_len);
+ } else {
+ hmc_managed = be32_to_cpu((__be32)hmc_managed);
+ prlog(PR_NOTICE, "CUPD: HMC managed status from FSP: %d\n",
+ hmc_managed);
+ }
+
+ dec_in_flight_param();
+}
+
+static void get_platform_hmc_managed(void)
+{
+ int rc;
+
+ inc_in_flight_param();
+ rc = fsp_get_sys_param(SYS_PARAM_HMC_MANAGED, &hmc_managed, 4,
+ got_platform_hmc_managed, NULL);
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_CU_INIT),
+ "CUPD: Error %d queueing param request\n", rc);
+ dec_in_flight_param();
+ }
+}
+
+static bool fw_ipl_side_update_notify(struct fsp_msg *msg)
+{
+ u32 param_id = fsp_msg_get_data_word(msg, 0);
+ int dlen = fsp_msg_get_data_word(msg, 1) & 0xffff;
+ uint32_t state = fsp_msg_get_data_word(msg, 2);
+
+ if (param_id != SYS_PARAM_FW_IPL_SIDE)
+ return false;
+
+ if (dlen != 4) {
+ prlog(PR_DEBUG,
+ "CUPD: Invalid sysparams notify len : 0x%x\n", dlen);
+ return false;
+ }
+
+ prlog(PR_NOTICE, "CUPD: FW IPL side changed. Disable fast reboot\n");
+ prlog(PR_NOTICE, "CUPD: Next IPL side : %s\n",
+ state == FW_IPL_SIDE_TEMP ? "temp" : "perm");
+
+ disable_fast_reboot("FSP IPL Side Change");
+ return true;
+}
+
+static int64_t code_update_check_state(void)
+{
+ switch(flash_state) {
+ case FLASH_STATE_ABSENT:
+ return OPAL_HARDWARE;
+ case FLASH_STATE_INVALID:
+ case FLASH_STATE_ABORT:
+ return OPAL_INTERNAL_ERROR;
+ case FLASH_STATE_READING:
+ return OPAL_BUSY;
+ default:
+ break;
+ }
+ return OPAL_SUCCESS;
+}
+
+/*
+ * Get common marker LID additional data section
+ */
+static void *get_adf_sec_data(struct com_marker_adf_sec *adf_sec,
+ uint32_t name)
+{
+ struct com_marker_adf_header *adf_header;
+ int i;
+
+ adf_header = (void *)adf_sec->adf_data;
+ for (i = 0; i < be32_to_cpu(adf_sec->adf_cnt); i++) {
+ if (be32_to_cpu(adf_header->name) == name)
+ return adf_header;
+
+ adf_header = (void *)adf_header + be32_to_cpu(adf_header->size);
+ }
+ return NULL;
+}
+
+/*
+ * Parse common marker LID to get FW version details
+ *
+ * Note:
+ * At present we parse only the "Service Pack Nomenclature ADF"
+ * section. If we add FW IP support, then we will have to parse
+ * the "Firmware IP Protection ADF" as well.
+ */
+static void parse_marker_lid(uint32_t side)
+{
+ struct com_marker_header *header;
+ struct com_marker_mi_section *mi_sec;
+ struct com_marker_adf_sec *adf_sec;
+ struct com_marker_adf_sp *adf_sp;
+
+ header = (void *)lid_data;
+
+ /* Get MI details */
+ mi_sec = (void *)header + be32_to_cpu(header->MI_offset);
+ /*
+ * If Marker LID is invalid, then FSP will return a Marker
+ * LID with ASCII zeros for the entire MI keyword.
+ */
+ if (mi_sec->mi_keyword[0] == '0')
+ return;
+
+ strncpy(fw_vpd[side].mi_keyword, mi_sec->mi_keyword, MI_KEYWORD_SIZE);
+ fw_vpd[side].mi_keyword[MI_KEYWORD_SIZE - 1] = '\0';
+ prlog(PR_NOTICE, "CUPD: %s side MI Keyword = %s\n",
+ side == 0x00 ? "P" : "T", fw_vpd[side].mi_keyword);
+
+ /* Get ML details */
+ adf_sec = (void *)header + be32_to_cpu(mi_sec->adf_offset);
+ adf_sp = get_adf_sec_data(adf_sec, ADF_NAME_SP);
+ if (!adf_sp)
+ return;
+
+ strncpy(fw_vpd[side].ext_fw_id,
+ (void *)adf_sp + be32_to_cpu(adf_sp->sp_name_offset),
+ ML_KEYWORD_SIZE);
+ fw_vpd[side].ext_fw_id[ML_KEYWORD_SIZE - 1] = '\0';
+ prlog(PR_NOTICE, "CUPD: %s side ML Keyword = %s\n",
+ side == 0x00 ? "P" : "T", fw_vpd[side].ext_fw_id);
+}
+
+static void validate_com_marker_lid(void)
+{
+ if (!strncmp(fw_vpd[ipl_side].mi_keyword, FW_VERSION_UNKNOWN,
+ sizeof(FW_VERSION_UNKNOWN))) {
+ log_simple_error(&e_info(OPAL_RC_CU_MARKER_LID),
+ "CUPD: IPL side Marker LID is not valid\n");
+ flash_state = FLASH_STATE_INVALID;
+ return;
+ }
+
+ flash_state = FLASH_STATE_READ;
+}
+
+static void fetch_lid_data_complete(struct fsp_msg *msg)
+{
+ void *buffer;
+ size_t length, chunk;
+ uint32_t lid_id, offset;
+ uint16_t id;
+ uint8_t flags, status;
+ int rc;
+
+ status = (msg->resp->word1 >> 8) & 0xff;
+ flags = (fsp_msg_get_data_word(msg, 0) >> 16) & 0xff;
+ id = fsp_msg_get_data_word(msg, 0) & 0xffff;
+ lid_id = fsp_msg_get_data_word(msg, 1);
+ offset = fsp_msg_get_data_word(msg->resp, 1);
+ length = fsp_msg_get_data_word(msg->resp, 2);
+
+ prlog(PR_NOTICE, "CUPD: Marker LID id : size : status = "
+ "0x%x : 0x%x : 0x%x\n",
+ fsp_msg_get_data_word(msg, 1), fsp_msg_get_data_word(msg->resp, 2), status);
+
+ fsp_freemsg(msg);
+
+ switch (status) {
+ case FSP_STATUS_SUCCESS: /* Read complete, parse VPD */
+ parse_marker_lid(lid_id == P_COM_MARKER_LID_ID ? 0 : 1);
+ break;
+ case FSP_STATUS_MORE_DATA: /* More data left */
+ offset += length;
+ chunk = MARKER_LID_SIZE - offset;
+ if (chunk > 0) {
+ buffer = (void *)PSI_DMA_CODE_UPD + offset;
+ rc = fsp_fetch_data_queue(flags, id, lid_id,
+ offset, buffer, &chunk,
+ fetch_lid_data_complete);
+
+ /* If queue msg fails, then continue with marker LID
+ * validation hoping that we have at least boot side
+ * information.
+ */
+ if (rc == OPAL_SUCCESS)
+ return;
+ }
+ break;
+ default: /* Fetch LID call failed */
+ break;
+ }
+
+ /* If required, fetch T side marker LID */
+ if (lid_id == P_COM_MARKER_LID_ID &&
+ lid_fetch_side == FETCH_BOTH_SIDE) {
+ length = MARKER_LID_SIZE;
+ rc = fsp_fetch_data_queue(flags, id, T_COM_MARKER_LID_ID,
+ 0, (void *)PSI_DMA_CODE_UPD,
+ &length, fetch_lid_data_complete);
+
+ /* If queue msg fails, then continue with marker LID
+ * validation hoping that we have at least boot side
+ * information.
+ */
+ if (rc == OPAL_SUCCESS)
+ return;
+ }
+
+ lock(&flash_lock);
+
+ /* Validate marker LID data */
+ validate_com_marker_lid();
+ /* TCE unmap */
+ code_update_tce_unmap(MARKER_LID_SIZE);
+
+ unlock(&flash_lock);
+}
+
+static void fetch_com_marker_lid(void)
+{
+ size_t length = MARKER_LID_SIZE;
+ uint32_t lid_id;
+ int rc;
+
+ /* Read in progress? */
+ rc = code_update_check_state();
+ if (rc == OPAL_HARDWARE || rc == OPAL_BUSY)
+ return;
+
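+ /* When both sides are requested, fetch the P side marker LID first;
+ * fetch_lid_data_complete() chains the T side fetch.
+ */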
+ if (lid_fetch_side == FETCH_T_SIDE_ONLY) {
+ lid_id = T_COM_MARKER_LID_ID;
+ set_def_fw_version(FW_IPL_SIDE_TEMP);
+ } else if (lid_fetch_side == FETCH_P_SIDE_ONLY) {
+ lid_id = P_COM_MARKER_LID_ID;
+ set_def_fw_version(FW_IPL_SIDE_PERM);
+ } else {
+ lid_id = P_COM_MARKER_LID_ID;
+ set_def_fw_version(FW_IPL_SIDE_PERM);
+ set_def_fw_version(FW_IPL_SIDE_TEMP);
+ }
+
+ code_update_tce_map(0, lid_data, length);
+ rc = fsp_fetch_data_queue(0x00, 0x05, lid_id, 0,
+ (void *)PSI_DMA_CODE_UPD, &length,
+ fetch_lid_data_complete);
+ if (!rc)
+ flash_state = FLASH_STATE_READING;
+ else
+ flash_state = FLASH_STATE_INVALID;
+}
+
+/*
+ * Add MI and ML keyword details into DT
+ */
+#define FW_VER_SIZE 64
+static void add_opal_firmware_version(void)
+{
+ struct dt_node *dt_fw;
+ char buffer[FW_VER_SIZE];
+ int offset;
+
+ dt_fw = dt_find_by_path(dt_root, "ibm,opal/firmware");
+ if (!dt_fw)
+ return;
+
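+ /* Both properties are formatted "MI|ML <T-side> <P-side> <booted-side>" */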
+ /* MI version */
+ offset = snprintf(buffer, FW_VER_SIZE, "MI %s %s",
+ fw_vpd[FW_IPL_SIDE_TEMP].mi_keyword,
+ fw_vpd[FW_IPL_SIDE_PERM].mi_keyword);
+ if (ipl_side == FW_IPL_SIDE_TEMP)
+ snprintf(buffer + offset, FW_VER_SIZE - offset,
+ " %s", fw_vpd[FW_IPL_SIDE_TEMP].mi_keyword);
+ else
+ snprintf(buffer + offset, FW_VER_SIZE - offset,
+ " %s", fw_vpd[FW_IPL_SIDE_PERM].mi_keyword);
+
+ dt_add_property(dt_fw, "mi-version", buffer, strlen(buffer));
+
+ /* ML version */
+ offset = snprintf(buffer, FW_VER_SIZE, "ML %s %s",
+ fw_vpd[FW_IPL_SIDE_TEMP].ext_fw_id,
+ fw_vpd[FW_IPL_SIDE_PERM].ext_fw_id);
+ if (ipl_side == FW_IPL_SIDE_TEMP)
+ snprintf(buffer + offset, FW_VER_SIZE - offset,
+ " %s", fw_vpd[FW_IPL_SIDE_TEMP].ext_fw_id);
+ else
+ snprintf(buffer + offset, FW_VER_SIZE - offset,
+ " %s", fw_vpd[FW_IPL_SIDE_PERM].ext_fw_id);
+
+ dt_add_property(dt_fw, "ml-version", buffer, strlen(buffer));
+}
+
+/*
+ * This is called right before starting the payload (Linux) to
+ * ensure the common marker LID read and parsing has happened
+ * before we transfer control.
+ */
+void fsp_code_update_wait_vpd(bool is_boot)
+{
+ int waited = 0;
+
+ if (!fsp_present())
+ return;
+
+ prlog(PR_NOTICE, "CUPD: Waiting read marker LID"
+ " and in flight parsm completion...\n");
+
+ lock(&flash_lock);
+ while (true) {
+ if (!(flash_state == FLASH_STATE_READING || in_flight_params))
+ break;
+ unlock(&flash_lock);
+ time_wait_ms(5);
+ waited += 5;
+ lock(&flash_lock);
+ }
+ unlock(&flash_lock);
+
+ if (waited)
+ prlog(PR_DEBUG, "CUPD: fsp_code_update_wait_vpd %d\n", waited);
+
+ if (is_boot)
+ add_opal_firmware_version();
+}
+
+static int code_update_start(void)
+{
+ struct fsp_msg *msg;
+ int rc;
+ uint16_t comp = 0x00; /* All components */
+ uint8_t side = OPAL_COMMIT_TMP_SIDE; /* Temporary side */
+
+ msg = fsp_mkmsg(FSP_CMD_FLASH_START, 1, side << 16 | comp);
+ if (!msg) {
+ log_simple_error(&e_info(OPAL_RC_CU_MSG),
+ "CUPD: CMD_FLASH_START message allocation failed !\n");
+ return OPAL_INTERNAL_ERROR;
+ }
+ if (fsp_sync_msg(msg, false)) {
+ fsp_freemsg(msg);
+ return OPAL_INTERNAL_ERROR;
+ }
+ rc = (msg->resp->word1 >> 8) & 0xff;
+ fsp_freemsg(msg);
+ return rc;
+}
+
+static int code_update_write_lid(uint32_t lid_id, uint32_t size)
+{
+ struct fsp_msg *msg;
+ int rc, n_pairs = 1;
+
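+ /* Single (address, size) pair: the LID data was TCE-mapped at
+ * tce_start by get_lid_data().
+ */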
+ msg = fsp_mkmsg(FSP_CMD_FLASH_WRITE, 5, lid_id,
+ n_pairs, 0, tce_start, size);
+ if (!msg) {
+ log_simple_error(&e_info(OPAL_RC_CU_MSG),
+ "CUPD: CMD_FLASH_WRITE message allocation failed !\n");
+ return OPAL_INTERNAL_ERROR;
+ }
+ if (fsp_sync_msg(msg, false)) {
+ fsp_freemsg(msg);
+ return OPAL_INTERNAL_ERROR;
+ }
+ rc = (msg->resp->word1 >> 8) & 0xff;
+ fsp_freemsg(msg);
+ return rc;
+}
+
+static int code_update_del_lid(uint32_t lid_id)
+{
+ struct fsp_msg *msg;
+ int rc;
+
+ msg = fsp_mkmsg(FSP_CMD_FLASH_DEL, 1, lid_id);
+ if (!msg) {
+ log_simple_error(&e_info(OPAL_RC_CU_MSG),
+ "CUPD: CMD_FLASH_DEL message allocation failed !\n");
+ return OPAL_INTERNAL_ERROR;
+ }
+ if (fsp_sync_msg(msg, false)) {
+ fsp_freemsg(msg);
+ return OPAL_INTERNAL_ERROR;
+ }
+ rc = (msg->resp->word1 >> 8) & 0xff;
+ fsp_freemsg(msg);
+ return rc;
+}
+
+static int code_update_complete(uint32_t cmd)
+{
+ struct fsp_msg *msg;
+ int rc;
+
+ msg = fsp_mkmsg(cmd, 0);
+ if (!msg) {
+ log_simple_error(&e_info(OPAL_RC_CU_MSG),
+ "CUPD: CUPD COMPLETE message allocation failed !\n");
+ return OPAL_INTERNAL_ERROR;
+ }
+ if (fsp_sync_msg(msg, false)) {
+ fsp_freemsg(msg);
+ return OPAL_INTERNAL_ERROR;
+ }
+ rc = (msg->resp->word1 >> 8) & 0xff;
+ fsp_freemsg(msg);
+ return rc;
+}
+
+static int code_update_swap_side(void)
+{
+ struct fsp_msg *msg;
+ int rc;
+
+ msg = fsp_mkmsg(FSP_CMD_FLASH_SWAP, 0);
+ if (!msg) {
+ log_simple_error(&e_info(OPAL_RC_CU_MSG),
+ "CUPD: CMD_FLASH_SWAP message allocation failed !\n");
+ return OPAL_INTERNAL_ERROR;
+ }
+
+ if (fsp_sync_msg(msg, false)) {
+ fsp_freemsg(msg);
+ return OPAL_INTERNAL_ERROR;
+ }
+ rc = (msg->resp->word1 >> 8) & 0xff;
+ fsp_freemsg(msg);
+ return rc;
+}
+
+static int code_update_set_ipl_side(void)
+{
+ struct fsp_msg *msg;
+ uint8_t side = FW_IPL_SIDE_TEMP; /* Next IPL side */
+ int rc;
+
+ msg = fsp_mkmsg(FSP_CMD_SET_IPL_SIDE, 1, side << 16);
+ if (!msg) {
+ log_simple_error(&e_info(OPAL_RC_CU_MSG),
+ "CUPD: CMD_SET_IPL_SIDE message allocation failed!\n");
+ return OPAL_INTERNAL_ERROR;
+ }
+ if (fsp_sync_msg(msg, false)) {
+ fsp_freemsg(msg);
+ log_simple_error(&e_info(OPAL_RC_CU_MSG),
+ "CUPD: Setting next IPL side failed!\n");
+ return OPAL_INTERNAL_ERROR;
+ }
+ rc = (msg->resp->word1 >> 8) & 0xff;
+ fsp_freemsg(msg);
+ return rc;
+}
+
+static void code_update_commit_complete(struct fsp_msg *msg)
+{
+ int rc;
+ uint8_t type;
+
+ rc = (msg->resp->word1 >> 8) & 0xff;
+ type = (msg->word1 >> 8) & 0xff;
+ fsp_freemsg(msg);
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_CU_COMMIT),
+ "CUPD: Code update commit failed, err 0x%x\n", rc);
+ return;
+ }
+
+ /* Reset cached VPD data */
+ lock(&flash_lock);
+
+ /* Find commit type */
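+ /* 0x01: re-fetch only the P side marker LID, 0x02: only the T side,
+ * anything else: both sides.
+ */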
+ if (type == 0x01) {
+ lid_fetch_side = FETCH_P_SIDE_ONLY;
+ } else if (type == 0x02)
+ lid_fetch_side = FETCH_T_SIDE_ONLY;
+ else
+ lid_fetch_side = FETCH_BOTH_SIDE;
+
+ fetch_com_marker_lid();
+
+ unlock(&flash_lock);
+}
+
+static int code_update_commit(uint32_t cmd)
+{
+ struct fsp_msg *msg;
+
+ msg = fsp_mkmsg(cmd, 0);
+ if (!msg) {
+ log_simple_error(&e_info(OPAL_RC_CU_MSG),
+ "CUPD: COMMIT message allocation failed !\n");
+ return OPAL_INTERNAL_ERROR;
+ }
+ if (fsp_queue_msg(msg, code_update_commit_complete)) {
+ log_simple_error(&e_info(OPAL_RC_CU_COMMIT),
+ "CUPD: Failed to queue code update commit message\n");
+ fsp_freemsg(msg);
+ return OPAL_INTERNAL_ERROR;
+ }
+ return OPAL_SUCCESS;
+}
+
+/*
+ * Is inband code update allowed?
+ */
+static int64_t validate_inband_policy(void)
+{
+ /* Quirk:
+ * If the code update policy is out-of-band, but the system
+ * is not HMC-managed, then inband update is allowed.
+ */
+ if (hmc_managed != PLATFORM_HMC_MANAGED)
+ return 0;
+ if (update_policy == INBAND_UPDATE_ALLOWED)
+ return 0;
+
+ return -1;
+}
+
+/*
+ * Validate magic number
+ */
+static int64_t validate_magic_num(uint16_t magic)
+{
+ if (magic != IMAGE_MAGIC_NUMBER)
+ return -1;
+ return 0;
+}
+
+/*
+ * Compare MI keyword to make sure candidate image
+ * is valid for this platform.
+ */
+static int64_t validate_image_version(struct update_image_header *header,
+ uint32_t *result)
+{
+ struct fw_image_vpd vpd;
+ int t_valid = 0, p_valid = 0, cton_ver = -1, ptot_ver = -1;
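+ /*
+ * cton_ver: current (IPL side) to new (candidate) MI level comparison
+ * ptot_ver: P side to T side MI level comparison (needs both sides valid)
+ */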
+
+ /* Valid flash image level? */
+ if (strncmp(fw_vpd[0].mi_keyword, FW_VERSION_UNKNOWN,
+ sizeof(FW_VERSION_UNKNOWN)) != 0)
+ p_valid = 1;
+
+ if (strncmp(fw_vpd[1].mi_keyword, FW_VERSION_UNKNOWN,
+ sizeof(FW_VERSION_UNKNOWN)) != 0)
+ t_valid = 1;
+
+ /* Validate with IPL side image */
+ vpd = fw_vpd[ipl_side];
+
+ /* Validate platform identifier (first two char of MI keyword) */
+ if (strncmp(vpd.mi_keyword, header->mi_keyword_data, 2) != 0) {
+ *result = VALIDATE_INVALID_IMG;
+ return OPAL_SUCCESS;
+ }
+
+ /* Don't flash different FW series (like P7 image on P8) */
+ if (vpd.mi_keyword[2] != header->mi_keyword_data[2]) {
+ *result = VALIDATE_INVALID_IMG;
+ return OPAL_SUCCESS;
+ }
+
+ /* Get current to new version difference */
+ cton_ver = strncmp(vpd.mi_keyword + 3, header->mi_keyword_data + 3, 6);
+
+ /* Get P to T version difference */
+ if (t_valid && p_valid)
+ ptot_ver = strncmp(fw_vpd[0].mi_keyword + 3,
+ fw_vpd[1].mi_keyword + 3, 6);
+
+ /* Update validation result */
+ if (ipl_side == FW_IPL_SIDE_TEMP) {
+ if (!ptot_ver && cton_ver > 0) /* downgrade T side */
+ *result = VALIDATE_TMP_UPDATE_DL;
+ else if (!ptot_ver && cton_ver <= 0) /* upgrade T side */
+ *result = VALIDATE_TMP_UPDATE;
+ else if (cton_ver > 0) /* Implied commit & downgrade T side */
+ *result = VALIDATE_TMP_COMMIT_DL;
+ else /* Implied commit & upgrade T side */
+ *result = VALIDATE_TMP_COMMIT;
+ } else {
+ if (!t_valid) /* Current unknown */
+ *result = VALIDATE_CUR_UNKNOWN;
+ else if (cton_ver > 0) /* downgrade FW version */
+ *result = VALIDATE_TMP_UPDATE_DL;
+ else /* upgrade FW version */
+ *result = VALIDATE_TMP_UPDATE;
+ }
+ return OPAL_SUCCESS;
+}
+
+/*
+ * Validate candidate image
+ */
+static int validate_candidate_image(uint64_t buffer,
+ uint32_t size, uint32_t *result)
+{
+ struct update_image_header *header;
+ int rc = OPAL_PARAMETER;
+
+ if (size < VALIDATE_BUF_SIZE)
+ goto out;
+
+ rc = code_update_check_state();
+ if (rc != OPAL_SUCCESS)
+ goto out;
+
+ if (validate_inband_policy() != 0) {
+ *result = VALIDATE_FLASH_AUTH;
+ rc = OPAL_SUCCESS;
+ goto out;
+ }
+
+ memcpy(validate_buf, (void *)buffer, VALIDATE_BUF_SIZE);
+ header = (struct update_image_header *)validate_buf;
+
+ if (validate_magic_num(be16_to_cpu(header->magic)) != 0) {
+ *result = VALIDATE_INVALID_IMG;
+ rc = OPAL_SUCCESS;
+ goto out;
+ }
+ rc = validate_image_version(header, result);
+out:
+ return rc;
+}
+
+static int validate_out_buf_mi_data(void *buffer, int offset, uint32_t result)
+{
+ struct update_image_header *header = (void *)validate_buf;
+
+ /* Current T & P side MI data */
+ offset += snprintf(buffer + offset, VALIDATE_BUF_SIZE - offset,
+ "MI %s %s\n",
+ fw_vpd[1].mi_keyword, fw_vpd[0].mi_keyword);
+
+ /* New T & P side MI data */
+ offset += snprintf(buffer + offset, VALIDATE_BUF_SIZE - offset,
+ "MI %s", header->mi_keyword_data);
+ if (result == VALIDATE_TMP_COMMIT_DL ||
+ result == VALIDATE_TMP_COMMIT)
+ offset += snprintf(buffer + offset,
+ VALIDATE_BUF_SIZE - offset,
+ " %s\n", fw_vpd[1].mi_keyword);
+ else
+ offset += snprintf(buffer + offset,
+ VALIDATE_BUF_SIZE - offset,
+ " %s\n", fw_vpd[0].mi_keyword);
+ return offset;
+}
+
+static int validate_out_buf_ml_data(void *buffer, int offset, uint32_t result)
+{
+ struct update_image_header *header = (void *)validate_buf;
+ /* Candidate image ML data */
+ char *ext_fw_id = (void *)header->data;
+
+ /* Current T & P side ML data */
+ offset += snprintf(buffer + offset, VALIDATE_BUF_SIZE - offset,
+ "ML %s %s\n",
+ fw_vpd[1].ext_fw_id, fw_vpd[0].ext_fw_id);
+
+ /* New T & P side ML data */
+ offset += snprintf(buffer + offset, VALIDATE_BUF_SIZE - offset,
+ "ML %s", ext_fw_id);
+ if (result == VALIDATE_TMP_COMMIT_DL ||
+ result == VALIDATE_TMP_COMMIT)
+ offset += snprintf(buffer + offset,
+ VALIDATE_BUF_SIZE - offset,
+ " %s\n", fw_vpd[1].ext_fw_id);
+ else
+ offset += snprintf(buffer + offset,
+ VALIDATE_BUF_SIZE - offset,
+ " %s\n", fw_vpd[0].ext_fw_id);
+
+ return offset;
+}
+
+/*
+ * Copy LID data to TCE buffer
+ */
+static int get_lid_data(struct opal_sg_list *list,
+ int lid_size, int lid_offset)
+{
+ struct opal_sg_list *sg;
+ struct opal_sg_entry *entry;
+ int length, num_entries, i, buf_pos = 0;
+ int map_act, map_size;
+ bool last = false;
+
+ /* Reset TCE start address */
+ tce_start = 0;
+
+ for (sg = list; sg; sg = (struct opal_sg_list*)be64_to_cpu(sg->next)) {
+ length = (be64_to_cpu(sg->length) & ~(SG_LIST_VERSION << 56)) - 16;
+ num_entries = length / sizeof(struct opal_sg_entry);
+ if (num_entries <= 0)
+ return -1;
+
+ for (i = 0; i < num_entries; i++) {
+ entry = &sg->entry[i];
+
+ /*
+ * Continue until we get data block which
+ * contains LID data
+ */
+ if (lid_offset > be64_to_cpu(entry->length)) {
+ lid_offset -= be64_to_cpu(entry->length);
+ continue;
+ }
+
+ /*
+ * SG list entry size can be more than 4k.
+ * Map only required pages, instead of
+ * mapping entire entry.
+ */
+ map_act = be64_to_cpu(entry->length);
+ map_size = be64_to_cpu(entry->length);
+
+ /* First TCE mapping */
+ if (!tce_start) {
+ tce_start = PSI_DMA_CODE_UPD +
+ (lid_offset & 0xfff);
+ map_act = be64_to_cpu(entry->length) - lid_offset;
+ lid_offset &= ~0xfff;
+ map_size = be64_to_cpu(entry->length) - lid_offset;
+ }
+
+ /* Check pending LID size to map */
+ if (lid_size <= map_act) {
+ /* (map_size - map_act) gives the difference between
+ * the page start and the TCE offset. This is required
+ * when the LID size is <= 4k.
+ */
+ map_size = (map_size - map_act) + lid_size;
+ last = true;
+ }
+
+ /* Adjust remaining size to map */
+ lid_size -= map_act;
+
+ /* TCE mapping */
+ code_update_tce_map(buf_pos,
+ (void*)(be64_to_cpu(entry->data)
+ + lid_offset),
+ map_size);
+ buf_pos += map_size;
+ /* Reset LID offset count */
+ lid_offset = 0;
+
+ if (last)
+ return OPAL_SUCCESS;
+ }
+ } /* outer loop */
+ return -1;
+}
+
+/*
+ * If IPL side is T, then swap P & T sides to add
+ * new fix to T side.
+ */
+static int validate_ipl_side(void)
+{
+ if (ipl_side == FW_IPL_SIDE_PERM)
+ return 0;
+ return code_update_swap_side();
+}
+
+static int64_t fsp_opal_validate_flash(uint64_t buffer,
+ __be32 *size, __be32 *result)
+{
+ int64_t rc = 0;
+ int offset;
+ uint32_t r;
+
+ lock(&flash_lock);
+
+ rc = validate_candidate_image(buffer, be32_to_cpu(*size), &r);
+ /* Fill output buffer
+ *
+ * Format:
+ * MI<sp>current-T-image<sp>current-P-image<0x0A>
+ * MI<sp>new-T-image<sp>new-P-image<0x0A>
+ * ML<sp>current-T-image<sp>current-P-image<0x0A>
+ * ML<sp>new-T-image<sp>new-P-image<0x0A>
+ */
+ if (!rc && (r != VALIDATE_FLASH_AUTH && r != VALIDATE_INVALID_IMG)) {
+ /* Clear output buffer */
+ memset((void *)buffer, 0, VALIDATE_BUF_SIZE);
+
+ offset = validate_out_buf_mi_data((void *)buffer, 0, r);
+ offset += validate_out_buf_ml_data((void *)buffer, offset, r);
+ *size = cpu_to_be32(offset);
+ }
+ *result = cpu_to_be32(r);
+
+ unlock(&flash_lock);
+ return rc;
+}
+
+/* Commit/Reject T side image */
+static int64_t fsp_opal_manage_flash(uint8_t op)
+{
+ uint32_t cmd;
+ int rc;
+
+ lock(&flash_lock);
+ rc = code_update_check_state();
+ unlock(&flash_lock);
+
+ if (rc != OPAL_SUCCESS)
+ return rc;
+
+ if (op != OPAL_REJECT_TMP_SIDE && op != OPAL_COMMIT_TMP_SIDE)
+ return OPAL_PARAMETER;
+
+ if ((op == OPAL_COMMIT_TMP_SIDE && ipl_side == FW_IPL_SIDE_PERM) ||
+ (op == OPAL_REJECT_TMP_SIDE && ipl_side == FW_IPL_SIDE_TEMP))
+ return OPAL_ACTIVE_SIDE_ERR;
+
+ if (op == OPAL_COMMIT_TMP_SIDE)
+ cmd = FSP_CMD_FLASH_NORMAL;
+ else
+ cmd = FSP_CMD_FLASH_REMOVE;
+
+ return code_update_commit(cmd);
+}
+
+static int fsp_flash_firmware(void)
+{
+ struct update_image_header *header;
+ struct lid_index_entry *idx_entry;
+ struct opal_sg_list *list;
+ struct opal_sg_entry *entry;
+ int rc, i;
+
+ /* Make sure no outstanding LID read is in progress */
+ rc = code_update_check_state();
+ if (rc == OPAL_BUSY)
+ fsp_code_update_wait_vpd(false);
+
+ /* Get LID Index */
+ list = image_data;
+ if (!list)
+ goto out;
+ entry = &list->entry[0];
+ header = (struct update_image_header *)be64_to_cpu(entry->data);
+ idx_entry = (void *)header + be16_to_cpu(header->lid_index_offset);
+
+ /* FIXME:
+ * At present we depend on FSP to validate CRC for
+ * individual LIDs. Calculate and validate individual
+ * LID CRC here.
+ */
+
+ if (validate_ipl_side() != 0) {
+ log_simple_error(&e_info(OPAL_RC_CU_FLASH), "CUPD: "
+ "Rename (Swap T and P) failed!\n");
+ goto out;
+ }
+
+ /* Set next IPL side */
+ if (code_update_set_ipl_side() != 0) {
+ log_simple_error(&e_info(OPAL_RC_CU_FLASH), "CUPD: "
+ "Setting next IPL side failed!\n");
+ goto out;
+ }
+
+ /* Start code update process */
+ if (code_update_start() != 0) {
+ log_simple_error(&e_info(OPAL_RC_CU_FLASH), "CUPD: "
+ "Code update start failed!\n");
+ goto out;
+ }
+
+ /*
+ * Delete T side LIDs before writing.
+ *
+ * Note:
+ * - Applicable for FWv >= 760.
+ * - Current Code Update design is to ignore
+ * any delete lid failure, and continue with
+ * the update.
+ */
+ rc = code_update_del_lid(DEL_UPD_SIDE_LIDS);
+
+ if (rc)
+ prlog(PR_TRACE, "CUPD: Failed to delete LIDs (%d). This is okay, continuing..", rc);
+
+ for (i = 0; i < be16_to_cpu(header->number_lids); i++) {
+ if (be32_to_cpu(idx_entry->size) > LID_MAX_SIZE) {
+ log_simple_error(&e_info(OPAL_RC_CU_FLASH), "CUPD: LID"
+ " (0x%x) size 0x%x is > max LID size (0x%x).\n",
+ be32_to_cpu(idx_entry->id),
+ be32_to_cpu(idx_entry->size), LID_MAX_SIZE);
+ goto abort_update;
+ }
+
+ rc = get_lid_data(list, be32_to_cpu(idx_entry->size),
+ be32_to_cpu(idx_entry->offset));
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_CU_FLASH), "CUPD: "
+ "Failed to parse LID from firmware image."
+ " (rc : %d).\n", rc);
+ goto abort_update;
+ }
+
+ rc = code_update_write_lid(be32_to_cpu(idx_entry->id),
+ be32_to_cpu(idx_entry->size));
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_CU_FLASH), "CUPD: "
+ "Failed to write LID to FSP. (rc : %d).\n", rc);
+ goto abort_update;
+ }
+
+ /* Unmap TCE */
+ code_update_tce_unmap(PSI_DMA_CODE_UPD_SIZE);
+
+ /* Next LID index */
+ idx_entry = (void *)idx_entry + sizeof(struct lid_index_entry);
+ }
+
+ /* Code update completed */
+ rc = code_update_complete(FSP_CMD_FLASH_COMPLETE);
+
+ return rc;
+
+abort_update:
+ rc = code_update_complete(FSP_CMD_FLASH_ABORT);
+ if (rc)
+ log_simple_error(&e_info(OPAL_RC_CU_FLASH), "CUPD: "
+ "Code update abort command failed. (rc : %d).", rc);
+
+out:
+ return -1;
+}
+
+static int64_t validate_sglist(struct opal_sg_list *list)
+{
+ struct opal_sg_list *sg;
+ struct opal_sg_entry *prev_entry, *entry;
+ int length, num_entries, i;
+
+ prev_entry = NULL;
+ for (sg = list; sg; sg = (struct opal_sg_list*)be64_to_cpu(sg->next)) {
+ length = (be64_to_cpu(sg->length) & ~(SG_LIST_VERSION << 56)) - 16;
+ num_entries = length / sizeof(struct opal_sg_entry);
+ if (num_entries <= 0)
+ return -1;
+
+ for (i = 0; i < num_entries; i++) {
+ entry = &sg->entry[i];
+
+ /* All entries must be aligned */
+ if (((uint64_t)be64_to_cpu(entry->data)) & 0xfff)
+ return OPAL_PARAMETER;
+
+ /* All non-terminal entries size must be aligned */
+ if (prev_entry && (be64_to_cpu(prev_entry->length) & 0xfff))
+ return OPAL_PARAMETER;
+
+ prev_entry = entry;
+ }
+ }
+ return OPAL_SUCCESS;
+}
+
+static int64_t fsp_opal_update_flash(struct opal_sg_list *list)
+{
+ struct opal_sg_entry *entry;
+ int length, num_entries, result = 0, rc = OPAL_PARAMETER;
+
+ /* Ensure that the sg list honors our alignment requirements */
+ rc = validate_sglist(list);
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_CU_SG_LIST),
+ "CUPD: sglist fails alignment requirements\n");
+ return rc;
+ }
+
+ lock(&flash_lock);
+ if (!list) { /* Cancel update request */
+ fsp_flash_term_hook = NULL;
+ image_data = NULL;
+ rc = OPAL_SUCCESS;
+ goto out;
+ }
+
+ disable_fast_reboot("FSP Code Update");
+
+ length = (be64_to_cpu(list->length) & ~(SG_LIST_VERSION << 56)) - 16;
+ num_entries = length / sizeof(struct opal_sg_entry);
+ if (num_entries <= 0)
+ goto out;
+
+ /* Validate image header */
+ entry = &list->entry[0];
+ rc = validate_candidate_image((uint64_t)be64_to_cpu(entry->data),
+ VALIDATE_BUF_SIZE, &result);
+ if (!rc && (result != VALIDATE_FLASH_AUTH &&
+ result != VALIDATE_INVALID_IMG)) {
+ image_data = list;
+ fsp_flash_term_hook = fsp_flash_firmware;
+ goto out;
+ }
+
+ /* Adjust return code */
+ if (result == VALIDATE_FLASH_AUTH)
+ rc = OPAL_FLASH_NO_AUTH;
+ else if (result == VALIDATE_INVALID_IMG)
+ rc = OPAL_INVALID_IMAGE;
+
+out:
+ unlock(&flash_lock);
+ return rc;
+}
+
+/*
+ * Code Update notifications
+ *
+ * Note: At present we just ACK these notifications.
+ * Cached VPD data will need to be reset if we add support for
+ * concurrent image maintenance in the future.
+ */
+static bool code_update_notify(uint32_t cmd_sub_mod, struct fsp_msg *msg)
+{
+ int rc;
+ uint32_t cmd;
+
+ switch(cmd_sub_mod) {
+ case FSP_CMD_FLASH_CACHE:
+ cmd = FSP_CMD_FLASH_CACHE_RSP;
+ prlog(PR_NOTICE, "CUPD: Update LID cache event [data = 0x%x]\n",
+ fsp_msg_get_data_word(msg, 0));
+ break;
+ case FSP_CMD_FLASH_OUTC:
+ case FSP_CMD_FLASH_OUTR:
+ case FSP_CMD_FLASH_OUTS:
+ cmd = FSP_CMD_FLASH_OUT_RSP;
+ prlog(PR_NOTICE, "CUPD: Out of band commit notify "
+ "[Type = 0x%x]\n", (msg->word1 >> 8) & 0xff);
+ break;
+ default:
+ log_simple_error(&e_info(OPAL_RC_CU_NOTIFY), "CUPD: Unknown "
+ "notification [cmd = 0x%x]\n", cmd_sub_mod);
+ return false;
+ }
+
+ rc = fsp_queue_msg(fsp_mkmsg(cmd, 0), fsp_freemsg);
+ if (rc)
+ log_simple_error(&e_info(OPAL_RC_CU_NOTIFY), "CUPD: Failed to "
+ "queue code update notification response :%d\n", rc);
+
+ return true;
+}
+
+/*
+ * Handle FSP R/R event.
+ *
+ * Note:
+ * If FSP R/R happens during code update, then entire system reboots
+ * and comes up with P side image (and T side image will be invalid).
+ * Hence we don't need to handle R/R during code update.
+ *
+ * Also if FSP R/R happens in init path (while retrieving in_flight_params)
+ * then system fails to continue booting (because we have not yet loaded
+ * all required data/LID from FSP). Hence we don't need to handle R/R
+ * for system params.
+ */
+static bool fsp_code_update_rr(uint32_t cmd_sub_mod,
+ struct fsp_msg *msg __unused)
+{
+ switch (cmd_sub_mod) {
+ case FSP_RESET_START:
+ lock(&flash_lock);
+
+ if (code_update_check_state() == OPAL_BUSY)
+ flash_state = FLASH_STATE_ABORT;
+
+ unlock(&flash_lock);
+ return true;
+ case FSP_RELOAD_COMPLETE:
+ lock(&flash_lock);
+
+ /* Let's try to parse the marker LID again if we failed
+ * to parse it last time.
+ */
+ if (code_update_check_state() == OPAL_INTERNAL_ERROR)
+ fetch_com_marker_lid();
+
+ unlock(&flash_lock);
+ return true;
+ }
+ return false;
+}
+
+static struct fsp_client fsp_cupd_client_rr = {
+ .message = fsp_code_update_rr,
+};
+
+static struct fsp_client fsp_get_notify = {
+ .message = code_update_notify,
+};
+
+void fsp_code_update_init(void)
+{
+ if (!fsp_present()) {
+ flash_state = FLASH_STATE_ABSENT;
+ return;
+ }
+
+ /* OPAL interface */
+ opal_register(OPAL_FLASH_VALIDATE, fsp_opal_validate_flash, 3);
+ opal_register(OPAL_FLASH_MANAGE, fsp_opal_manage_flash, 1);
+ opal_register(OPAL_FLASH_UPDATE, fsp_opal_update_flash, 1);
+
+ /* register Code Update Class D3 */
+ fsp_register_client(&fsp_get_notify, FSP_MCLASS_CODE_UPDATE);
+ /* Register for Class AA (FSP R/R) */
+ fsp_register_client(&fsp_cupd_client_rr, FSP_MCLASS_RR_EVENT);
+
+ /* Register for firmware IPL side update notification */
+ sysparam_add_update_notifier(fw_ipl_side_update_notify);
+
+ /* Flash hook */
+ fsp_flash_term_hook = NULL;
+
+ /* Fetch various code update related sys parameters */
+ get_ipl_side();
+ get_code_update_policy();
+ get_platform_hmc_managed();
+
+ /* Fetch common marker LID */
+ lid_data = memalign(TCE_PSIZE, MARKER_LID_SIZE);
+ if (!lid_data) {
+ log_simple_error(&e_info(OPAL_RC_CU_INIT),
+ "CUPD: Failed to allocate memory for marker LID\n");
+ flash_state = FLASH_STATE_ABSENT;
+ return;
+ }
+ fetch_com_marker_lid();
+}
diff --git a/roms/skiboot/hw/fsp/fsp-codeupdate.h b/roms/skiboot/hw/fsp/fsp-codeupdate.h
new file mode 100644
index 000000000..2b86619ef
--- /dev/null
+++ b/roms/skiboot/hw/fsp/fsp-codeupdate.h
@@ -0,0 +1,222 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/* Copyright 2013-2015 IBM Corp. */
+
+#ifndef __CODEUPDATE_H
+#define __CODEUPDATE_H
+
+/* Flash SG list version */
+#define SG_LIST_VERSION (1UL)
+
+/* LID size <= 16M */
+#define LID_MAX_SIZE 0x1000000
+
+/* Delete all LIDs on the update (T) side */
+#define DEL_UPD_SIDE_LIDS 0xFFFFFFFF
+
+/* System parameter values used in code update validation */
+#define INBAND_UPDATE_ALLOWED 0x01
+#define PLATFORM_HMC_MANAGED 0x01
+#define FW_LICENSE_ACCEPT 0x01
+
+/* Running image side */
+#define FW_IPL_SIDE_TEMP 0x01
+#define FW_IPL_SIDE_PERM 0x00
+
+/* Manage operations */
+#define OPAL_REJECT_TMP_SIDE 0
+#define OPAL_COMMIT_TMP_SIDE 1
+
+/* Validate image size */
+#define VALIDATE_BUF_SIZE 4096
+
+/* Code update operation status */
+#define OPAL_INVALID_IMAGE -1003 /* Unacceptable image */
+#define OPAL_ACTIVE_SIDE_ERR -9001
+#define OPAL_FLASH_NO_AUTH -9002
+
+/* Validate image update result tokens */
+#define VALIDATE_TMP_UPDATE 0 /* T side will be updated */
+#define VALIDATE_FLASH_AUTH 1 /* Partition does not have authority */
+#define VALIDATE_INVALID_IMG 2 /* Candidate image is not valid */
+#define VALIDATE_CUR_UNKNOWN 3 /* Current fixpack level is unknown */
+/*
+ * Current T side will be committed to P side before being replaced with the
+ * new image, and the new image is downlevel from the current image
+ */
+#define VALIDATE_TMP_COMMIT_DL 4
+/*
+ * Current T side will be committed to P side before being replaced with new
+ * image
+ */
+#define VALIDATE_TMP_COMMIT 5
+/*
+ * T side will be updated with a downlevel image
+ */
+#define VALIDATE_TMP_UPDATE_DL 6
+/*
+ * The candidate image's release date is later than the system's firmware
+ * service entitlement date - service warranty period has expired
+ */
+#define VALIDATE_OUT_OF_WRNTY 7
+
+/* default version */
+#define FW_VERSION_UNKNOWN "UNKNOWN"
+
+/* Actual size of MI & ML keyword including NULL */
+#define MI_KEYWORD_SIZE 10
+#define ML_KEYWORD_SIZE 9
+
+/* Firmware image VPD data */
+struct fw_image_vpd {
+ char mi_keyword[MI_KEYWORD_SIZE]; /* NNSSS_FFF */
+ char ext_fw_id[ML_KEYWORD_SIZE]; /* FWxxx.yy */
+};
+
+/* Master LID header */
+struct master_lid_header {
+ char key[3]; /* "MLH" */
+ uint8_t version; /* 0x02 */
+ __be16 header_size;
+ __be16 entry_size;
+ uint8_t reserved[56];
+};
+
+/* LID index entry */
+struct lid_index_entry {
+ __be32 id;
+ __be32 size;
+ __be32 offset;
+ __be32 crc;
+};
+
+/* SP flags */
+#define FW_ONE_OFF_SP 0x80000000
+#define FW_EMERGENCY_SP 0x40000000
+
+/*
+ * SP GA date
+ *
+ * sp_flag addr = header->data + header->ext_fw_id_size
+ */
+struct update_image_ga_date {
+ __be32 sp_flag;
+ char sp_ga_date[8]; /* YYYYMMDD */
+};
+
+/* Image magic number */
+#define IMAGE_MAGIC_NUMBER 0x5549
+
+/* Image header structure */
+struct update_image_header {
+ __be16 magic;
+ __be16 version;
+ __be32 package_size;
+ __be32 crc;
+ __be16 lid_index_offset;
+ __be16 number_lids;
+ __be16 package_flags;
+ __be16 mi_keyword_size;
+ char mi_keyword_data[40];
+ __be16 ext_fw_id_size;
+ /* Rest of the image data including ext fw id, sp flags */
+ char data[];
+};
+
+/* FipS header */
+struct fips_header {
+ __be16 magic;
+ __be16 version;
+ __be32 lid_id;
+ __be32 lid_date; /* YYYYMMDD */
+ __be16 lid_time; /* HHMM */
+ __be16 lid_class;
+ __be32 crc;
+ __be32 lid_size; /* Number of bytes below header */
+ __be32 header_size;
+ uint8_t mtd_number;
+ uint8_t valid; /* 1 = valid, 0 = invalid */
+ uint8_t reserved;
+ uint8_t lid_info_size;
+ char lid_info[64]; /* code level */
+ __be32 update_date; /* YYYYMMDD */
+ __be16 update_time; /* HHMM */
+ __be16 phylum_len;
+ uint8_t lid_phylum[];
+};
+
+/* Approximate LID size */
+#define MASTER_LID_SIZE 0x5000
+/*
+ * Note:
+ * The doc indicates non-SP LID sizes are 0-8MB. However,
+ * in reality the marker LID is less than 4k. Allocate
+ * 8k to give some breathing space.
+ */
+#define MARKER_LID_SIZE 0x00002000
+
+/* Common marker LID numbers */
+#define P_COM_MARKER_LID_ID 0x80A00001
+#define T_COM_MARKER_LID_ID (P_COM_MARKER_LID_ID | ADJUST_T_SIDE_LID_NO)
+
+/*
+ * Common marker LID structure
+ *
+ * Note that we are populating only required sections,
+ * not all ADF sections in common marker LID.
+ */
+struct com_marker_header {
+ __be32 version;
+ __be32 MI_offset; /* Offset to MI section */
+ __be32 iseries_offset;
+};
+
+/* MI Keyword section */
+struct com_marker_mi_section {
+ __be32 MI_size;
+ char mi_keyword[40]; /* MI Keyword */
+ char lst_disrupt_fix_lvl[3];
+ char skip[21]; /* Skip fields we are not interested in */
+ __be32 adf_offset; /* Offset to ADF section */
+};
+
+/* Additional Data Fields */
+struct com_marker_adf_sec {
+ __be32 adf_cnt; /* ADF count */
+ char adf_data[]; /* ADF data */
+};
+
+/* ADF common header */
+struct com_marker_adf_header {
+ __be32 size; /* Section size */
+ __be32 name; /* Section name */
+};
+
+/*
+ * Service Pack Nomenclature ADF
+ *
+ * Service pack release name.
+ */
+#define ADF_NAME_SP 0x53504E4D /* SPNM */
+struct com_marker_adf_sp
+{
+ struct com_marker_adf_header header;
+ __be32 sp_name_offset; /* Offset from start of ADF */
+ __be32 sp_name_size;
+ __be32 skip[4]; /* Skip rest of fields */
+};
+
+/*
+ * Firmware IP Protection ADF
+ *
+ * Service Pack flags and GA date.
+ */
+#define ADF_NAME_FW_IP 0x46495050 /* FIPP */
+struct com_marker_fw_ip {
+ struct com_marker_adf_header header;
+ __be32 sp_flag_offset; /* Offset from start of ADF */
+ __be32 sp_flag_size;
+ __be32 sp_ga_offset; /* Offset from start of ADF*/
+ __be32 sp_ga_size;
+};
+
+#endif /* __CODEUPDATE_H */
diff --git a/roms/skiboot/hw/fsp/fsp-console.c b/roms/skiboot/hw/fsp/fsp-console.c
new file mode 100644
index 000000000..dc23ac46f
--- /dev/null
+++ b/roms/skiboot/hw/fsp/fsp-console.c
@@ -0,0 +1,1062 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Flexible Service Processor (FSP) serial console handling code
+ *
+ * Copyright 2013-2018 IBM Corp.
+ */
+
+#include <skiboot.h>
+#include <processor.h>
+#include <io.h>
+#include <fsp.h>
+#include <console.h>
+#include <opal.h>
+#include <timebase.h>
+#include <device.h>
+#include <fsp-sysparam.h>
+#include <errorlog.h>
+#include <lock.h>
+
+DEFINE_LOG_ENTRY(OPAL_RC_CONSOLE_HANG, OPAL_PLATFORM_ERR_EVT, OPAL_CONSOLE,
+ OPAL_PLATFORM_FIRMWARE,
+ OPAL_PREDICTIVE_ERR_GENERAL, OPAL_NA);
+
+struct fsp_serbuf_hdr {
+ __be16 partition_id;
+ u8 session_id;
+ u8 hmc_id;
+ __be16 data_offset;
+ __be16 last_valid;
+ __be16 ovf_count;
+ __be16 next_in;
+ u8 flags;
+ u8 reserved;
+ __be16 next_out;
+ u8 data[];
+};
+#define SER_BUF_DATA_SIZE (0x10000 - sizeof(struct fsp_serbuf_hdr))
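+
+/*
+ * The in/out buffers are ring buffers over data[]: next_in is the
+ * producer index, next_out the consumer index, and one byte is always
+ * left unused so that next_in == next_out unambiguously means "empty".
+ * A minimal sketch (hypothetical helper, not used below) of the
+ * free-space computation repeated throughout this file:
+ */
+static inline u16 fsp_serbuf_space(const struct fsp_serbuf_hdr *sb)
+{
+ return (be16_to_cpu(sb->next_out) + SER_BUF_DATA_SIZE
+ - be16_to_cpu(sb->next_in) - 1) % SER_BUF_DATA_SIZE;
+}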
+
+struct fsp_serial {
+ bool available;
+ bool open;
+ bool has_part0;
+ bool has_part1;
+ bool log_port;
+ bool out_poke;
+ char loc_code[LOC_CODE_SIZE];
+ u16 rsrc_id;
+ struct fsp_serbuf_hdr *in_buf;
+ struct fsp_serbuf_hdr *out_buf;
+ struct fsp_msg *poke_msg;
+ u8 waiting;
+ u64 irq;
+ u16 out_buf_prev_len;
+ u64 out_buf_timeout;
+};
+
+#define SER_BUFFER_SIZE 0x00040000UL
+#define MAX_SERIAL 4
+
+#define SER_BUFFER_OUT_TIMEOUT 10
+
+static struct fsp_serial fsp_serials[MAX_SERIAL];
+static bool got_intf_query;
+static struct lock fsp_con_lock = LOCK_UNLOCKED;
+static void* ser_buffer = NULL;
+
+static void fsp_console_reinit(void)
+{
+ int i;
+ void *base;
+ struct fsp_msg *msg;
+
+ /* Initialize our data structure pointers & TCE maps */
+ base = ser_buffer;
+ for (i = 0; i < MAX_SERIAL; i++) {
+ struct fsp_serial *ser = &fsp_serials[i];
+
+ ser->in_buf = base;
+ ser->out_buf = base + SER_BUFFER_SIZE/2;
+ base += SER_BUFFER_SIZE;
+ }
+ fsp_tce_map(PSI_DMA_SER0_BASE, ser_buffer,
+ 4 * PSI_DMA_SER0_SIZE);
+
+ for (i = 0; i < MAX_SERIAL; i++) {
+ struct fsp_serial *fs = &fsp_serials[i];
+
+ if (!fs->available)
+ continue;
+
+ if (fs->rsrc_id == 0xffff)
+ continue;
+ prlog(PR_DEBUG, "FSP: Reassociating HVSI console %d\n", i);
+ msg = fsp_mkmsg(FSP_CMD_ASSOC_SERIAL, 2,
+ (fs->rsrc_id << 16) | 1, i);
+ if (!msg) {
+ prerror("FSPCON: Failed to allocate associate msg\n");
+ return;
+ }
+ if (fsp_queue_msg(msg, fsp_freemsg)) {
+ fsp_freemsg(msg);
+ prerror("FSPCON: Failed to queue associate msg\n");
+ return;
+ }
+ }
+}
+
+static void fsp_close_consoles(void)
+{
+ unsigned int i;
+
+ for (i = 0; i < MAX_SERIAL; i++) {
+ struct fsp_serial *fs = &fsp_serials[i];
+
+ if (!fs->available)
+ continue;
+
+ lock(&fsp_con_lock);
+ if (fs->open) {
+ fs->open = false;
+ fs->out_poke = false;
+ if (fs->poke_msg->state != fsp_msg_unused)
+ fsp_cancelmsg(fs->poke_msg);
+ fsp_freemsg(fs->poke_msg);
+ fs->poke_msg = NULL;
+ }
+ unlock(&fsp_con_lock);
+ }
+ prlog(PR_DEBUG, "FSPCON: Closed consoles due to FSP reset/reload\n");
+}
+
+static void fsp_pokemsg_reclaim(struct fsp_msg *msg)
+{
+ struct fsp_serial *fs = msg->user_data;
+
+ /*
+ * The poke_msg might have been "detached" from the console
+ * in vserial_close, so we need to check whether it's current
+ * before touching the state, otherwise, just free it
+ */
+ lock(&fsp_con_lock);
+ if (fs->open && fs->poke_msg == msg) {
+ if (fs->out_poke) {
+ if (fsp_queue_msg(fs->poke_msg, fsp_pokemsg_reclaim)) {
+ prerror("FSPCON: failed to queue poke msg\n");
+ } else {
+ fs->out_poke = false;
+ }
+ } else
+ fs->poke_msg->state = fsp_msg_unused;
+ } else
+ fsp_freemsg(msg);
+ unlock(&fsp_con_lock);
+}
+
+/* Called with the fsp_con_lock held */
+static size_t fsp_write_vserial(struct fsp_serial *fs, const char *buf,
+ size_t len)
+{
+ struct fsp_serbuf_hdr *sb = fs->out_buf;
+ u16 old_nin = be16_to_cpu(sb->next_in);
+ u16 space, chunk;
+
+ if (!fs->open)
+ return 0;
+
+ space = (be16_to_cpu(sb->next_out) + SER_BUF_DATA_SIZE - old_nin - 1)
+ % SER_BUF_DATA_SIZE;
+ if (space < len)
+ len = space;
+ if (!len)
+ return 0;
+
+ chunk = SER_BUF_DATA_SIZE - old_nin;
+ if (chunk > len)
+ chunk = len;
+ memcpy(&sb->data[old_nin], buf, chunk);
+ if (chunk < len)
+ memcpy(&sb->data[0], buf + chunk, len - chunk);
+ lwsync();
+ sb->next_in = cpu_to_be16((old_nin + len) % SER_BUF_DATA_SIZE);
+ sync();
+
+ if (be16_to_cpu(sb->next_out) == old_nin && fs->poke_msg) {
+ if (fs->poke_msg->state == fsp_msg_unused) {
+ if (fsp_queue_msg(fs->poke_msg, fsp_pokemsg_reclaim))
+ prerror("FSPCON: poke msg queuing failed\n");
+ } else
+ fs->out_poke = true;
+ }
+#ifndef DISABLE_CON_PENDING_EVT
+ opal_update_pending_evt(OPAL_EVENT_CONSOLE_OUTPUT,
+ OPAL_EVENT_CONSOLE_OUTPUT);
+#endif
+ return len;
+}
+
+#ifdef DVS_CONSOLE
+static int fsp_con_port = -1;
+static bool fsp_con_full;
+
+/*
+ * This is called by the code in console.c without the con_lock
+ * held. However, it can be called as the result of any printf,
+ * so any other lock might be held, including possibly the
+ * FSP lock.
+ */
+static size_t fsp_con_write(const char *buf, size_t len)
+{
+ size_t written;
+
+ if (fsp_con_port < 0)
+ return 0;
+
+ lock(&fsp_con_lock);
+ written = fsp_write_vserial(&fsp_serials[fsp_con_port], buf, len);
+ fsp_con_full = (written < len);
+ unlock(&fsp_con_lock);
+
+ return written;
+}
+
+static struct con_ops fsp_con_ops = {
+ .write = fsp_con_write,
+};
+#endif /* DVS_CONSOLE */
+
+static void fsp_open_vserial(struct fsp_msg *msg)
+{
+ struct fsp_msg *resp;
+
+ u16 part_id = fsp_msg_get_data_word(msg, 0) & 0xffff;
+ u16 sess_id = fsp_msg_get_data_word(msg, 1) & 0xffff;
+ u8 hmc_sess = msg->data.bytes[0];
+ u8 hmc_indx = msg->data.bytes[1];
+ u8 authority = msg->data.bytes[4];
+ u32 tce_in, tce_out;
+ struct fsp_serial *fs;
+
+ prlog(PR_INFO, "FSPCON: Got VSerial Open\n");
+ prlog(PR_DEBUG, " part_id = 0x%04x\n", part_id);
+ prlog(PR_DEBUG, " sess_id = 0x%04x\n", sess_id);
+ prlog(PR_DEBUG, " hmc_sess = 0x%02x\n", hmc_sess);
+ prlog(PR_DEBUG, " hmc_indx = 0x%02x\n", hmc_indx);
+ prlog(PR_DEBUG, " authority = 0x%02x\n", authority);
+
+ if (sess_id >= MAX_SERIAL || !fsp_serials[sess_id].available) {
+ prlog(PR_WARNING, "FSPCON: 0x%04x NOT AVAILABLE!\n", sess_id);
+ resp = fsp_mkmsg(FSP_RSP_OPEN_VSERIAL | 0x2f, 0);
+ if (!resp) {
+ prerror("FSPCON: Response allocation failed\n");
+ return;
+ }
+ if (fsp_queue_msg(resp, fsp_freemsg)) {
+ fsp_freemsg(resp);
+ prerror("FSPCON: Failed to queue response msg\n");
+ }
+ return;
+ }
+
+ fs = &fsp_serials[sess_id];
+
+ /* Hack ! On blades, the console opened via the mm has partition 1
+ * while the debug DVS generally has partition 0 (though you can
+ * use what you want really).
+ * We don't want a DVS open/close to crap on the blademm console,
+ * so if a raw console gets an open with partID 1, we set a flag
+ * that makes us ignore a later close of partid 0.
+ */
+ if (fs->rsrc_id == 0xffff) {
+ if (part_id == 0)
+ fs->has_part0 = true;
+ if (part_id == 1)
+ fs->has_part1 = true;
+ }
+
+ tce_in = PSI_DMA_SER0_BASE + PSI_DMA_SER0_SIZE * sess_id;
+ tce_out = tce_in + SER_BUFFER_SIZE/2;
+
+ lock(&fsp_con_lock);
+ if (fs->open) {
+ prlog(PR_DEBUG, " already open, skipping init !\n");
+ unlock(&fsp_con_lock);
+ goto already_open;
+ }
+
+ fs->poke_msg = fsp_mkmsg(FSP_CMD_VSERIAL_OUT, 2,
+ fsp_msg_get_data_word(msg, 0),
+ fsp_msg_get_data_word(msg, 1) & 0xffff);
+ if (fs->poke_msg == NULL) {
+ prerror("FSPCON: Failed to allocate poke_msg\n");
+ unlock(&fsp_con_lock);
+ return;
+ }
+
+ fs->open = true;
+ fs->poke_msg->user_data = fs;
+
+ fs->in_buf->partition_id = fs->out_buf->partition_id = cpu_to_be16(part_id);
+ fs->in_buf->session_id = fs->out_buf->session_id = sess_id;
+ fs->in_buf->hmc_id = fs->out_buf->hmc_id = hmc_indx;
+ fs->in_buf->data_offset = fs->out_buf->data_offset =
+ cpu_to_be16(sizeof(struct fsp_serbuf_hdr));
+ fs->in_buf->last_valid = fs->out_buf->last_valid =
+ cpu_to_be16(SER_BUF_DATA_SIZE - 1);
+ fs->in_buf->ovf_count = fs->out_buf->ovf_count = 0;
+ fs->in_buf->next_in = fs->out_buf->next_in = 0;
+ fs->in_buf->flags = fs->out_buf->flags = 0;
+ fs->in_buf->reserved = fs->out_buf->reserved = 0;
+ fs->in_buf->next_out = fs->out_buf->next_out = 0;
+ fs->out_buf_prev_len = 0;
+ fs->out_buf_timeout = 0;
+ unlock(&fsp_con_lock);
+
+ already_open:
+ resp = fsp_mkmsg(FSP_RSP_OPEN_VSERIAL, 6, fsp_msg_get_data_word(msg, 0),
+ fsp_msg_get_data_word(msg, 1) & 0xffff, 0, tce_in, 0, tce_out);
+ if (!resp) {
+ prerror("FSPCON: Failed to allocate open msg response\n");
+ return;
+ }
+ if (fsp_queue_msg(resp, fsp_freemsg)) {
+ fsp_freemsg(resp);
+ prerror("FSPCON: Failed to queue open msg response\n");
+ return;
+ }
+
+#ifdef DVS_CONSOLE
+ prlog(PR_DEBUG, " log_port = %d\n", fs->log_port);
+ if (fs->log_port) {
+ fsp_con_port = sess_id;
+ sync();
+ /*
+ * We mark the FSP lock as being in the console
+ * path. We do that only once and never unmark it
+ * (there is really not much point).
+ */
+ fsp_used_by_console();
+ fsp_con_lock.in_con_path = true;
+ /* See comment in fsp_used_by_console */
+ lock(&fsp_con_lock);
+ unlock(&fsp_con_lock);
+ set_console(&fsp_con_ops);
+ }
+#endif
+}
+
+static void fsp_close_vserial(struct fsp_msg *msg)
+{
+ u16 part_id = fsp_msg_get_data_word(msg, 0) & 0xffff;
+ u16 sess_id = fsp_msg_get_data_word(msg, 1) & 0xffff;
+ u8 hmc_sess = msg->data.bytes[0];
+ u8 hmc_indx = msg->data.bytes[1];
+ u8 authority = msg->data.bytes[4];
+ struct fsp_serial *fs;
+ struct fsp_msg *resp;
+
+ prlog(PR_INFO, "FSPCON: Got VSerial Close\n");
+ prlog(PR_DEBUG, " part_id = 0x%04x\n", part_id);
+ prlog(PR_DEBUG, " sess_id = 0x%04x\n", sess_id);
+ prlog(PR_DEBUG, " hmc_sess = 0x%02x\n", hmc_sess);
+ prlog(PR_DEBUG, " hmc_indx = 0x%02x\n", hmc_indx);
+ prlog(PR_DEBUG, " authority = 0x%02x\n", authority);
+
+ if (sess_id >= MAX_SERIAL || !fsp_serials[sess_id].available) {
+ prlog(PR_WARNING, "FSPCON: 0x%04x NOT AVAILABLE!\n", sess_id);
+ goto skip_close;
+ }
+
+ fs = &fsp_serials[sess_id];
+
+ /* See "HACK" comment in open */
+ if (fs->rsrc_id == 0xffff) {
+ if (part_id == 0)
+ fs->has_part0 = false;
+ if (part_id == 1)
+ fs->has_part1 = false;
+ if (fs->has_part0 || fs->has_part1) {
+ prlog(PR_DEBUG, " skipping close !\n");
+ goto skip_close;
+ }
+ }
+
+#ifdef DVS_CONSOLE
+ if (fs->log_port) {
+ fsp_con_port = -1;
+ set_console(NULL);
+ }
+#endif
+
+ lock(&fsp_con_lock);
+ if (fs->open) {
+ fs->open = false;
+ fs->out_poke = false;
+ if (fs->poke_msg && fs->poke_msg->state == fsp_msg_unused) {
+ fsp_freemsg(fs->poke_msg);
+ fs->poke_msg = NULL;
+ }
+ }
+ unlock(&fsp_con_lock);
+ skip_close:
+ resp = fsp_mkmsg(FSP_RSP_CLOSE_VSERIAL, 2, fsp_msg_get_data_word(msg, 0),
+ fsp_msg_get_data_word(msg, 1) & 0xffff);
+ if (!resp) {
+ prerror("FSPCON: Failed to allocate close msg response\n");
+ return;
+ }
+ if (fsp_queue_msg(resp, fsp_freemsg)) {
+ fsp_freemsg(resp);
+ prerror("FSPCON: Failed to queue close msg response\n");
+ }
+}
+
+static bool fsp_con_msg_hmc(u32 cmd_sub_mod, struct fsp_msg *msg)
+{
+ struct fsp_msg *resp;
+
+ /* Associate response */
+ if ((cmd_sub_mod >> 8) == 0xe08a) {
+ prlog(PR_TRACE, "FSPCON: Got associate response, status"
+ " 0x%02x\n", cmd_sub_mod & 0xff);
+ return true;
+ }
+ if ((cmd_sub_mod >> 8) == 0xe08b) {
+ prlog(PR_TRACE, "Got unassociate response, status 0x%02x\n",
+ cmd_sub_mod & 0xff);
+ return true;
+ }
+ switch(cmd_sub_mod) {
+ case FSP_CMD_OPEN_VSERIAL:
+ fsp_open_vserial(msg);
+ return true;
+ case FSP_CMD_CLOSE_VSERIAL:
+ fsp_close_vserial(msg);
+ return true;
+ case FSP_CMD_HMC_INTF_QUERY:
+ prlog(PR_DEBUG, "FSPCON: Got HMC interface query\n");
+ got_intf_query = true;
+ resp = fsp_mkmsg(FSP_RSP_HMC_INTF_QUERY, 1,
+ fsp_msg_get_data_word(msg, 0) & 0x00ffffff);
+ if (!resp) {
+ prerror("FSPCON: Failed to allocate hmc intf response\n");
+ return true;
+ }
+ if (fsp_queue_msg(resp, fsp_freemsg)) {
+ fsp_freemsg(resp);
+ prerror("FSPCON: Failed to queue hmc intf response\n");
+ }
+ return true;
+ }
+ return false;
+}
+
+static bool fsp_con_msg_vt(u32 cmd_sub_mod, struct fsp_msg *msg)
+{
+ u16 sess_id = fsp_msg_get_data_word(msg, 1) & 0xffff;
+
+ if (cmd_sub_mod == FSP_CMD_VSERIAL_IN && sess_id < MAX_SERIAL) {
+ struct fsp_serial *fs = &fsp_serials[sess_id];
+
+ if (!fs->open)
+ return true;
+
+ /* FSP is signaling some incoming data. We take the console
+ * lock to avoid racing with a simultaneous read, though we
+ * might want to consider simplifying all that locking into
+ * one single lock that covers the console and the pending
+ * events.
+ */
+ lock(&fsp_con_lock);
+ opal_update_pending_evt(OPAL_EVENT_CONSOLE_INPUT,
+ OPAL_EVENT_CONSOLE_INPUT);
+ opal_update_pending_evt(fs->irq, fs->irq);
+ unlock(&fsp_con_lock);
+ }
+ return true;
+}
+
+static bool fsp_con_msg_rr(u32 cmd_sub_mod, struct fsp_msg *msg)
+{
+ assert(msg == NULL);
+
+ switch (cmd_sub_mod) {
+ case FSP_RESET_START:
+ fsp_close_consoles();
+ return true;
+ case FSP_RELOAD_COMPLETE:
+ fsp_console_reinit();
+ return true;
+ }
+ return false;
+}
+
+static struct fsp_client fsp_con_client_hmc = {
+ .message = fsp_con_msg_hmc,
+};
+
+static struct fsp_client fsp_con_client_vt = {
+ .message = fsp_con_msg_vt,
+};
+
+static struct fsp_client fsp_con_client_rr = {
+ .message = fsp_con_msg_rr,
+};
+
+static void fsp_serial_add(int index, u16 rsrc_id, const char *loc_code,
+ bool log_port)
+{
+ struct fsp_serial *ser;
+ struct fsp_msg *msg;
+
+ lock(&fsp_con_lock);
+ ser = &fsp_serials[index];
+
+ if (ser->available) {
+ unlock(&fsp_con_lock);
+ return;
+ }
+
+ ser->rsrc_id = rsrc_id;
+ memset(ser->loc_code, 0x00, LOC_CODE_SIZE);
+ strncpy(ser->loc_code, loc_code, LOC_CODE_SIZE - 1);
+ ser->available = true;
+ ser->log_port = log_port;
+ unlock(&fsp_con_lock);
+
+ /* DVS doesn't have that */
+ if (rsrc_id != 0xffff) {
+ msg = fsp_mkmsg(FSP_CMD_ASSOC_SERIAL, 2,
+ (rsrc_id << 16) | 1, index);
+ if (!msg) {
+ prerror("FSPCON: Assoc serial alloc failed\n");
+ return;
+ }
+ if (fsp_queue_msg(msg, fsp_freemsg)) {
+ fsp_freemsg(msg);
+ prerror("FSPCON: Assoc serial queue failed\n");
+ return;
+ }
+ }
+}
+
+void fsp_console_preinit(void)
+{
+ int i;
+ void *base;
+
+ if (!fsp_present())
+ return;
+
+ ser_buffer = memalign(TCE_PSIZE, SER_BUFFER_SIZE * MAX_SERIAL);
+
+ /* Initialize our data structure pointers & TCE maps */
+ base = ser_buffer;
+ for (i = 0; i < MAX_SERIAL; i++) {
+ struct fsp_serial *ser = &fsp_serials[i];
+
+ ser->in_buf = base;
+ ser->out_buf = base + SER_BUFFER_SIZE/2;
+ base += SER_BUFFER_SIZE;
+ }
+ fsp_tce_map(PSI_DMA_SER0_BASE, ser_buffer,
+ 4 * PSI_DMA_SER0_SIZE);
+
+ /* Register for class E0 and E1 */
+ fsp_register_client(&fsp_con_client_hmc, FSP_MCLASS_HMC_INTFMSG);
+ fsp_register_client(&fsp_con_client_vt, FSP_MCLASS_HMC_VT);
+ fsp_register_client(&fsp_con_client_rr, FSP_MCLASS_RR_EVENT);
+
+ /* Add DVS ports. We currently have sessions 0 and 3: 0 is for
+ * OS use, 3 is our debug port. We need to add those before
+ * we complete the OPL or we'll potentially miss the
+ * console setup on Firebird blades.
+ */
+ fsp_serial_add(0, 0xffff, "DVS_OS", false);
+ op_display(OP_LOG, OP_MOD_FSPCON, 0x0001);
+ fsp_serial_add(3, 0xffff, "DVS_FW", true);
+ op_display(OP_LOG, OP_MOD_FSPCON, 0x0002);
+
+}
+
+static int64_t fsp_console_write(int64_t term_number, __be64 *__length,
+ const uint8_t *buffer)
+{
+ struct fsp_serial *fs;
+ size_t written, requested;
+
+ if (term_number < 0 || term_number >= MAX_SERIAL)
+ return OPAL_PARAMETER;
+ fs = &fsp_serials[term_number];
+ if (!fs->available || fs->log_port)
+ return OPAL_PARAMETER;
+ lock(&fsp_con_lock);
+ if (!fs->open) {
+ unlock(&fsp_con_lock);
+ return OPAL_CLOSED;
+ }
+ /* Clamp to a reasonable size */
+ requested = be64_to_cpu(*__length);
+ if (requested > 0x1000)
+ requested = 0x1000;
+ written = fsp_write_vserial(fs, buffer, requested);
+
+ if (written) {
+ /* If we wrote anything, reset timeout */
+ fs->out_buf_prev_len = 0;
+ fs->out_buf_timeout = 0;
+ }
+
+#ifdef OPAL_DEBUG_CONSOLE_IO
+ prlog(PR_TRACE, "OPAL: console write req=%ld written=%ld"
+ " ni=%d no=%d\n",
+ requested, written, be16_to_cpu(fs->out_buf->next_in),
+ be16_to_cpu(fs->out_buf->next_out));
+ prlog(PR_TRACE, " %02x %02x %02x %02x "
+ "%02x \'%c\' %02x \'%c\' %02x \'%c\'.%02x \'%c\'..\n",
+ buffer[0], buffer[1], buffer[2], buffer[3],
+ buffer[4], buffer[4], buffer[5], buffer[5],
+ buffer[6], buffer[6], buffer[7], buffer[7]);
+#endif /* OPAL_DEBUG_CONSOLE_IO */
+
+ *__length = cpu_to_be64(written);
+ unlock(&fsp_con_lock);
+
+ if (written)
+ return OPAL_SUCCESS;
+
+ return OPAL_HARDWARE;
+}
+
+static int64_t fsp_console_write_buffer_space(int64_t term_number,
+ __be64 *__length)
+{
+ static bool elog_generated = false;
+ struct fsp_serial *fs;
+ struct fsp_serbuf_hdr *sb;
+ int64_t length;
+
+ if (term_number < 0 || term_number >= MAX_SERIAL)
+ return OPAL_PARAMETER;
+ fs = &fsp_serials[term_number];
+ if (!fs->available || fs->log_port)
+ return OPAL_PARAMETER;
+ lock(&fsp_con_lock);
+ if (!fs->open) {
+ unlock(&fsp_con_lock);
+ return OPAL_CLOSED;
+ }
+ sb = fs->out_buf;
+ length = (be16_to_cpu(sb->next_out) + SER_BUF_DATA_SIZE
+ - be16_to_cpu(sb->next_in) - 1)
+ % SER_BUF_DATA_SIZE;
+ unlock(&fsp_con_lock);
+
+ /* Console buffer has enough space to write incoming data */
+ if (length != fs->out_buf_prev_len) {
+ fs->out_buf_prev_len = length;
+ fs->out_buf_timeout = 0;
+
+ *__length = cpu_to_be64(length);
+ return OPAL_SUCCESS;
+ }
+
+ /*
+ * The buffer is full, so start an internal timer. We will continue
+ * returning SUCCESS until the timeout expires, hoping the FSP will
+ * consume the data within that period.
+ */
+ if (fs->out_buf_timeout == 0) {
+ fs->out_buf_timeout = mftb() +
+ secs_to_tb(SER_BUFFER_OUT_TIMEOUT);
+ }
+
+ if (tb_compare(mftb(), fs->out_buf_timeout) != TB_AAFTERB) {
+ *__length = cpu_to_be64(length);
+ return OPAL_SUCCESS;
+ }
+
+ /*
+ * The FSP is still active but not reading console data, hence
+ * our console buffer has filled up. Most likely the IPMI daemon
+ * on the FSP is buggy. Log an error and return OPAL_RESOURCE
+ * to the payload (Linux).
+ */
+ if (!elog_generated) {
+ elog_generated = true;
+ log_simple_error(&e_info(OPAL_RC_CONSOLE_HANG), "FSPCON: Console "
+ "buffer is full, dropping console data\n");
+ }
+
+ /* The timeout expired. Drop the incoming data */
+ return OPAL_RESOURCE;
+}
+
+static int64_t fsp_console_read(int64_t term_number, __be64 *__length,
+ uint8_t *buffer)
+{
+ struct fsp_serial *fs;
+ struct fsp_serbuf_hdr *sb;
+ bool pending = false;
+ uint32_t old_nin, n, i, chunk, req = be64_to_cpu(*__length);
+ int rc = OPAL_SUCCESS;
+
+ if (term_number < 0 || term_number >= MAX_SERIAL)
+ return OPAL_PARAMETER;
+ fs = &fsp_serials[term_number];
+ if (!fs->available || fs->log_port)
+ return OPAL_PARAMETER;
+ lock(&fsp_con_lock);
+ if (!fs->open) {
+ rc = OPAL_CLOSED;
+ goto clr_flag;
+ }
+ if (fs->waiting)
+ fs->waiting = 0;
+ sb = fs->in_buf;
+ old_nin = be16_to_cpu(sb->next_in);
+ lwsync();
+ n = (old_nin + SER_BUF_DATA_SIZE - be16_to_cpu(sb->next_out))
+ % SER_BUF_DATA_SIZE;
+ if (n > req) {
+ pending = true;
+ n = req;
+ }
+ *__length = cpu_to_be64(n);
+
+ chunk = SER_BUF_DATA_SIZE - be16_to_cpu(sb->next_out);
+ if (chunk > n)
+ chunk = n;
+ memcpy(buffer, &sb->data[be16_to_cpu(sb->next_out)], chunk);
+ if (chunk < n)
+ memcpy(buffer + chunk, &sb->data[0], n - chunk);
+ sb->next_out = cpu_to_be16(((be16_to_cpu(sb->next_out)) + n) % SER_BUF_DATA_SIZE);
+
+#ifdef OPAL_DEBUG_CONSOLE_IO
+ prlog(PR_TRACE, "OPAL: console read req=%d read=%d ni=%d no=%d\n",
+ req, n, be16_to_cpu(sb->next_in), be16_to_cpu(sb->next_out));
+ prlog(PR_TRACE, " %02x %02x %02x %02x %02x %02x %02x %02x ...\n",
+ buffer[0], buffer[1], buffer[2], buffer[3],
+ buffer[4], buffer[5], buffer[6], buffer[7]);
+#endif /* OPAL_DEBUG_CONSOLE_IO */
+
+clr_flag:
+ /* Might clear the input pending flag */
+ for (i = 0; i < MAX_SERIAL && !pending; i++) {
+ struct fsp_serial *fs = &fsp_serials[i];
+ struct fsp_serbuf_hdr *sb = fs->in_buf;
+
+ if (fs->log_port || !fs->open)
+ continue;
+ if (sb->next_out != sb->next_in) {
+ /*
+ * HACK: Some kernels (4.1+) may fail to properly
+ * register hvc1 and will never read it. This can lead
+ * to RCU stalls, so if we notice this console is not
+ * being read, do not set OPAL_EVENT_CONSOLE_INPUT even
+ * if it has data
+ */
+ if (fs->waiting < 5) {
+ pending = true;
+ fs->waiting++;
+ }
+ }
+ }
+ if (!pending) {
+ opal_update_pending_evt(fs->irq, 0);
+ opal_update_pending_evt(OPAL_EVENT_CONSOLE_INPUT, 0);
+ }
+
+ unlock(&fsp_con_lock);
+
+ return rc;
+}
+
+void fsp_console_poll(void *data __unused)
+{
+#ifdef OPAL_DEBUG_CONSOLE_POLL
+ static int debug;
+#endif
+
+ /*
+ * We don't get messages when the out buffer is consumed, so we
+ * need to poll. We also defer sending poke messages from
+ * the sapphire console to avoid a locking nightmare when
+ * being called from printf() deep inside an existing stack
+ * of nested locks.
+ */
+ if (fsp_con_full ||
+ (opal_pending_events & OPAL_EVENT_CONSOLE_OUTPUT)) {
+ unsigned int i;
+ bool pending = false;
+
+ /* We take the console lock. This is somewhat inefficient
+ * but it guarantees we aren't racing with a write, and
+ * thus clearing an event improperly
+ */
+ lock(&fsp_con_lock);
+ for (i = 0; i < MAX_SERIAL && !pending; i++) {
+ struct fsp_serial *fs = &fsp_serials[i];
+ struct fsp_serbuf_hdr *sb = fs->out_buf;
+
+ if (!fs->open)
+ continue;
+ if (sb->next_out == sb->next_in) {
+ continue;
+ }
+ if (fs->log_port) {
+ flush_console();
+ } else {
+#ifdef OPAL_DEBUG_CONSOLE_POLL
+ if (debug < 5) {
+ prlog(PR_DEBUG,"OPAL: %d still pending"
+ " ni=%d no=%d\n",
+ i, be16_to_cpu(sb->next_in),
+ be16_to_cpu(sb->next_out));
+ debug++;
+ }
+#endif /* OPAL_DEBUG_CONSOLE_POLL */
+ pending = true;
+ }
+ }
+ if (!pending) {
+ opal_update_pending_evt(OPAL_EVENT_CONSOLE_OUTPUT, 0);
+#ifdef OPAL_DEBUG_CONSOLE_POLL
+ debug = 0;
+#endif
+ }
+ unlock(&fsp_con_lock);
+ }
+}
+
+void fsp_console_init(void)
+{
+ struct dt_node *serials, *ser;
+ int i;
+
+ if (!fsp_present())
+ return;
+
+ /* Wait until we have received the interface query before moving on */
+ while (!got_intf_query)
+ opal_run_pollers();
+
+ op_display(OP_LOG, OP_MOD_FSPCON, 0x0000);
+
+ /* Register poller */
+ opal_add_poller(fsp_console_poll, NULL);
+
+ /* Register OPAL console backend */
+ set_opal_console(&fsp_opal_con);
+
+ /* Parse serial port data */
+ serials = dt_find_by_path(dt_root, "ipl-params/fsp-serial");
+ if (!serials) {
+ prerror("FSPCON: No FSP serial ports in device-tree\n");
+ return;
+ }
+
+ i = 1;
+ dt_for_each_child(serials, ser) {
+ u32 rsrc_id = dt_prop_get_u32(ser, "reg");
+ const void *lc = dt_prop_get(ser, "ibm,loc-code");
+
+ prlog(PR_NOTICE, "FSPCON: Serial %d rsrc: %04x loc: %s\n",
+ i, rsrc_id, (const char *)lc);
+ fsp_serial_add(i++, rsrc_id, lc, false);
+ op_display(OP_LOG, OP_MOD_FSPCON, 0x0010 + i);
+ }
+
+ op_display(OP_LOG, OP_MOD_FSPCON, 0x0005);
+}
+
+static int64_t fsp_console_flush(int64_t terminal __unused)
+{
+ /* FIXME: There's probably something we can do here... */
+ return OPAL_PARAMETER;
+}
+
+struct opal_con_ops fsp_opal_con = {
+ .name = "FSP OPAL console",
+ .init = NULL, /* all the required setup is done in fsp_console_init() */
+ .read = fsp_console_read,
+ .write = fsp_console_write,
+ .space = fsp_console_write_buffer_space,
+ .flush = fsp_console_flush,
+};
+
+static void flush_all_input(void)
+{
+ unsigned int i;
+
+ lock(&fsp_con_lock);
+ for (i = 0; i < MAX_SERIAL; i++) {
+ struct fsp_serial *fs = &fsp_serials[i];
+ struct fsp_serbuf_hdr *sb = fs->in_buf;
+
+ if (fs->log_port)
+ continue;
+
+ sb->next_out = sb->next_in;
+ }
+ unlock(&fsp_con_lock);
+}
+
+static bool send_all_hvsi_close(void)
+{
+ unsigned int i;
+ bool has_hvsi = false;
+ static const uint8_t close_packet[] = { 0xfe, 6, 0, 1, 0, 3 };
+
+ for (i = 0; i < MAX_SERIAL; i++) {
+ struct fsp_serial *fs = &fsp_serials[i];
+ struct fsp_serbuf_hdr *sb = fs->out_buf;
+ unsigned int space, timeout = 10;
+
+ if (fs->log_port)
+ continue;
+ if (fs->rsrc_id == 0xffff)
+ continue;
+ has_hvsi = true;
+
+ /* Do we have room ? Wait a bit if not */
+ while(timeout--) {
+ space = (be16_to_cpu(sb->next_out) + SER_BUF_DATA_SIZE -
+ be16_to_cpu(sb->next_in) - 1) % SER_BUF_DATA_SIZE;
+ if (space >= 6)
+ break;
+ time_wait_ms(500);
+ }
+ lock(&fsp_con_lock);
+ fsp_write_vserial(fs, close_packet, 6);
+ unlock(&fsp_con_lock);
+ }
+
+ return has_hvsi;
+}
+
+static void reopen_all_hvsi(void)
+{
+ unsigned int i;
+
+ for (i = 0; i < MAX_SERIAL; i++) {
+ struct fsp_serial *fs = &fsp_serials[i];
+
+ if (!fs->available)
+ continue;
+
+ if (fs->rsrc_id == 0xffff)
+ continue;
+ prlog(PR_NOTICE, "FSP: Deassociating HVSI console %d\n", i);
+ fsp_sync_msg(fsp_mkmsg(FSP_CMD_UNASSOC_SERIAL, 1,
+ (i << 16) | 1), true);
+ }
+ for (i = 0; i < MAX_SERIAL; i++) {
+ struct fsp_serial *fs = &fsp_serials[i];
+
+ if (!fs->available)
+ continue;
+
+ if (fs->rsrc_id == 0xffff)
+ continue;
+ prlog(PR_NOTICE, "FSP: Reassociating HVSI console %d\n", i);
+ fsp_sync_msg(fsp_mkmsg(FSP_CMD_ASSOC_SERIAL, 2,
+ (fs->rsrc_id << 16) | 1, i), true);
+ }
+}
+
+void fsp_console_reset(void)
+{
+ if (!fsp_present())
+ return;
+
+ prlog(PR_NOTICE, "FSP: Console reset !\n");
+
+ /* This is called on a fast-reset. To work around issues with HVSI
+ * initial negotiation, before we reboot the kernel, we flush all
+ * input and send an HVSI close packet.
+ */
+ flush_all_input();
+
+ /* Returns false if there is no HVSI console */
+ if (!send_all_hvsi_close())
+ return;
+
+ time_wait_ms(500);
+
+ reopen_all_hvsi();
+
+}
+
+void fsp_console_add_nodes(void)
+{
+ struct dt_node *opal_event;
+ unsigned int i;
+
+ opal_event = dt_find_by_name(opal_node, "event");
+
+ for (i = 0; i < MAX_SERIAL; i++) {
+ struct fsp_serial *fs = &fsp_serials[i];
+ struct dt_node *fs_node;
+ const char *type;
+
+ if (fs->log_port || !fs->available)
+ continue;
+
+ if (fs->rsrc_id == 0xffff)
+ type = "raw";
+ else
+ type = "hvsi";
+
+ fs_node = add_opal_console_node(i, type, SER_BUF_DATA_SIZE);
+
+ fs->irq = opal_dynamic_event_alloc();
+ dt_add_property_cells(fs_node, "interrupts", ilog2(fs->irq));
+
+ if (opal_event)
+ dt_add_property_cells(fs_node, "interrupt-parent",
+ opal_event->phandle);
+ }
+}
+
+void fsp_console_select_stdout(void)
+{
+ bool use_serial = false;
+ int rc;
+ u8 param;
+
+ if (!fsp_present())
+ return;
+
+ rc = fsp_get_sys_param(SYS_PARAM_CONSOLE_SELECT,
+ &param, 1, NULL, NULL);
+ if (rc != 1) {
+ prerror("FSPCON: Failed to get console"
+ " sysparam rc %d\n", rc);
+ } else {
+ switch(param) {
+ case 0:
+ use_serial = false;
+ break;
+ case 1:
+ use_serial = true;
+ break;
+ default:
+ prerror("FSPCON: Unknown console"
+ " sysparam %d\n", param);
+ }
+ }
+
+ dt_check_del_prop(dt_chosen, "linux,stdout-path");
+
+ if (fsp_serials[1].open && use_serial) {
+ dt_add_property_string(dt_chosen, "linux,stdout-path",
+ "/ibm,opal/consoles/serial@1");
+ prlog(PR_NOTICE, "FSPCON: default console set to serial A\n");
+ } else {
+ dt_add_property_string(dt_chosen, "linux,stdout-path",
+ "/ibm,opal/consoles/serial@0");
+ prlog(PR_NOTICE, "FSPCON: default console set to SOL/DVS\n");
+ }
+}
+
diff --git a/roms/skiboot/hw/fsp/fsp-diag.c b/roms/skiboot/hw/fsp/fsp-diag.c
new file mode 100644
index 000000000..d9101f31b
--- /dev/null
+++ b/roms/skiboot/hw/fsp/fsp-diag.c
@@ -0,0 +1,46 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Code for handling FSP_MCLASS_DIAG messages (cmd 0xee)
+ * Receiving a high level ack timeout is likely indicative of a firmware bug
+ *
+ * Copyright 2013-2014 IBM Corp.
+ */
+
+#include <skiboot.h>
+#include <fsp.h>
+#include <lock.h>
+#include <processor.h>
+#include <timebase.h>
+#include <opal.h>
+#include <fsp-sysparam.h>
+
+static bool fsp_diag_msg(u32 cmd_sub_mod, struct fsp_msg *msg)
+{
+
+ if (cmd_sub_mod == FSP_RSP_DIAG_LINK_ERROR) {
+ printf("FIXME: Unhandled FSP_MCLASS_DIAG Link Error Report\n");
+ return false;
+ }
+
+ if (cmd_sub_mod != FSP_RSP_DIAG_ACK_TIMEOUT) {
+ printf("BUG: Unhandled subcommand: 0x%x (New FSP spec?)\n",
+ cmd_sub_mod);
+ return false;
+ }
+
+ printf("BUG: High Level ACK timeout (FSP_MCLASS_DIAG) for 0x%x\n",
+ fsp_msg_get_data_word(msg, 0) & 0xffff0000);
+
+ return true;
+}
+
+static struct fsp_client fsp_diag = {
+ .message = fsp_diag_msg,
+};
+
+/* This is called at boot time */
+void fsp_init_diag(void)
+{
+ /* Register for the diag event */
+ fsp_register_client(&fsp_diag, FSP_MCLASS_DIAG);
+}
diff --git a/roms/skiboot/hw/fsp/fsp-dpo.c b/roms/skiboot/hw/fsp/fsp-dpo.c
new file mode 100644
index 000000000..91919f915
--- /dev/null
+++ b/roms/skiboot/hw/fsp/fsp-dpo.c
@@ -0,0 +1,154 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * FSP DPO (Delayed Power Off) event support
+ *
+ * Copyright 2013-2017 IBM Corp.
+ */
+
+#define pr_fmt(fmt) "FSP-DPO: " fmt
+
+#include <skiboot.h>
+#include <fsp.h>
+#include <stdio.h>
+#include <timebase.h>
+#include <opal.h>
+#include <opal-msg.h>
+
+#define DPO_CMD_SGN_BYTE0 0xf4 /* Byte[0] signature */
+#define DPO_CMD_SGN_BYTE1 0x20 /* Byte[1] signature */
+#define DPO_TIMEOUT 2700 /* 45 minutes in seconds */
+
+bool fsp_dpo_pending;
+static unsigned long fsp_dpo_init_tb;
+
+/*
+ * OPAL DPO interface
+ *
+ * Returns OPAL_WRONG_STATE (with a zero timeout) if DPO is not active,
+ * otherwise OPAL_SUCCESS with the number of seconds remaining before a
+ * forced system shutdown. This enables the host to schedule a voluntary
+ * shutdown before the timeout occurs.
+ */
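+/*
+ * Worked example (illustrative): if the host queries the status ten
+ * minutes after the DPO init message arrived, *dpo_timeout is set to
+ * 2700 - 600 = 2100 seconds left before the forced power off.
+ */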
+static int64_t fsp_opal_get_dpo_status(__be64 *dpo_timeout)
+{
+ if (!fsp_dpo_pending) {
+ *dpo_timeout = 0;
+ return OPAL_WRONG_STATE;
+ }
+
+ *dpo_timeout = cpu_to_be64(DPO_TIMEOUT - tb_to_secs(mftb() - fsp_dpo_init_tb));
+ return OPAL_SUCCESS;
+}
+
+/* Process FSP DPO init message */
+static void fsp_process_dpo(struct fsp_msg *msg)
+{
+ struct fsp_msg *resp;
+ u32 cmd = FSP_RSP_INIT_DPO;
+ int rc;
+
+ /* DPO message does not have the correct signatures */
+ if ((msg->data.bytes[0] != DPO_CMD_SGN_BYTE0)
+ || (msg->data.bytes[1] != DPO_CMD_SGN_BYTE1)) {
+ prerror("Message signatures did not match\n");
+ cmd |= FSP_STATUS_INVALID_CMD;
+ resp = fsp_mkmsg(cmd, 0);
+ if (resp == NULL) {
+ prerror("%s : Message allocation failed\n", __func__);
+ return;
+ }
+ if (fsp_queue_msg(resp, fsp_freemsg)) {
+ fsp_freemsg(resp);
+ prerror("%s : Failed to queue response "
+ "message\n", __func__);
+ }
+ return;
+ }
+
+ /* OPAL is already in "DPO pending" state */
+ if (fsp_dpo_pending) {
+ prlog(PR_INFO, "OPAL already in DPO pending state\n");
+ cmd |= FSP_STATUS_INVALID_DPOSTATE;
+ resp = fsp_mkmsg(cmd, 0);
+ if (resp == NULL) {
+ prerror("%s : Message allocation failed\n", __func__);
+ return;
+ }
+ if (fsp_queue_msg(resp, fsp_freemsg)) {
+ fsp_freemsg(resp);
+ prerror("%s : Failed to queue response "
+ "message\n", __func__);
+ }
+ return;
+ }
+
+
+ /* Inform the host about DPO */
+ rc = opal_queue_msg(OPAL_MSG_DPO, NULL, NULL);
+ if (rc) {
+ prerror("OPAL message queuing failed\n");
+ cmd |= FSP_STATUS_GENERIC_ERROR;
+ resp = fsp_mkmsg(cmd, 0);
+ if (resp == NULL) {
+ prerror("%s : Message allocation failed\n", __func__);
+ return;
+ }
+ if (fsp_queue_msg(resp, fsp_freemsg)) {
+ fsp_freemsg(resp);
+ prerror("%s : Failed to queue response "
+ "message\n", __func__);
+ }
+ return;
+ } else
+ prlog(PR_INFO, "Notified host about DPO event\n");
+
+ /* Acknowledge the FSP on DPO */
+ resp = fsp_mkmsg(cmd, 0);
+ if (resp == NULL) {
+ prerror("%s : Message allocation failed\n", __func__);
+ return;
+ }
+ if (fsp_queue_msg(resp, fsp_freemsg)) {
+ fsp_freemsg(resp);
+ prerror("%s : Failed to queue response message\n", __func__);
+ return;
+ }
+
+ /* Record DPO init time and set DPO pending flag */
+ fsp_dpo_init_tb = mftb();
+ fsp_dpo_pending = true;
+
+ /*
+ * OPAL is now in the DPO pending state. After first detecting the DPO
+ * condition from OPAL, the host has 45 minutes to prepare
+ * the system for shutdown. The host must take all necessary actions
+ * in that regard and, at the end, shut itself down. The host
+ * shutdown sequence eventually makes the OPAL_CEC_POWER_DOWN call,
+ * which in turn asks the FSP to shut down the CEC. If the FSP does not
+ * receive the CEC power down command from OPAL within 45 minutes,
+ * it assumes that the host and OPAL have successfully processed the
+ * DPO sequence and hence forcibly powers off the system.
+ */
+}
+
+/* Handle DPO sub-command from FSP */
+static bool fsp_dpo_message(u32 cmd_sub_mod, struct fsp_msg *msg)
+{
+ if (cmd_sub_mod == FSP_CMD_INIT_DPO) {
+ prlog(PR_INFO, "Delayed Power Off (DPO) notification received\n");
+ fsp_process_dpo(msg);
+ return true;
+ }
+
+ return false;
+}
+
+static struct fsp_client fsp_dpo_client = {
+ .message = fsp_dpo_message,
+};
+
+void fsp_dpo_init(void)
+{
+ fsp_register_client(&fsp_dpo_client, FSP_MCLASS_SERVICE);
+ opal_register(OPAL_GET_DPO_STATUS, fsp_opal_get_dpo_status, 1);
+ prlog(PR_INFO, "FSP DPO support initialized\n");
+}
diff --git a/roms/skiboot/hw/fsp/fsp-dump.c b/roms/skiboot/hw/fsp/fsp-dump.c
new file mode 100644
index 000000000..96cb45e6f
--- /dev/null
+++ b/roms/skiboot/hw/fsp/fsp-dump.c
@@ -0,0 +1,916 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Dump support:
+ * We get dump notifications from different sources:
+ * - During system initialization via HDAT
+ * - During FSP reset/reload (FipS dump)
+ * - Dump available notification MBOX command (0xCE, 0x78, 0x00)
+ *
+ * To avoid complications, we keep the dumps in a list and fetch
+ * them serially.
+ *
+ * Dump retrieval process:
+ * - Once we get a notification from the FSP we enqueue the dump ID and
+ * notify Linux via an OPAL event notification.
+ * - Linux reads the dump info, allocates the memory required to fetch
+ * the dump and makes a dump read call.
+ * - Sapphire fetches dump data from FSP.
+ * - Linux writes dump to disk and sends acknowledgement.
+ * - Sapphire acknowledges FSP.
+ *
+ * Copyright 2013-2015 IBM Corp.
+ */
+
+#include <fsp.h>
+#include <psi.h>
+#include <lock.h>
+#include <device.h>
+#include <skiboot.h>
+#include <errorlog.h>
+#include <opal-api.h>
+
+/*
+ * Max outstanding dumps to retrieve
+ *
+ * Note:
+ * Dumps are serialized. We don't get a notification for a second
+ * dump of a given type until we acknowledge the first one, but we
+ * may get a notification for a different dump type. Our dump
+ * retrieval code is also serialized, hence we use a list to keep
+ * track of outstanding dumps to be retrieved.
+ */
+#define MAX_DUMP_RECORD 0x04
+
+/* Max retry */
+#define FIPS_DUMP_MAX_RETRY 0x03
+
+/* Dump type */
+#define DUMP_TYPE_FSP 0x01
+#define DUMP_TYPE_SYS 0x02
+#define DUMP_TYPE_SMA 0x03
+
+/* Dump fetch size */
+#define DUMP_FETCH_SIZE_FSP 0x500000
+#define DUMP_FETCH_SIZE_SYS 0x400000
+#define DUMP_FETCH_SIZE_RES 0x200000
+
+/* Params for Fips dump */
+#define FSP_DUMP_TOOL_TYPE "SYS "
+#define FSP_DUMP_CLIENT_ID "SAPPHIRE_CLIENT"
+
+enum dump_state {
+ DUMP_STATE_ABSENT, /* No FSP dump */
+ DUMP_STATE_NONE, /* No dump to retrieve */
+ DUMP_STATE_NOTIFY, /* Notified Linux */
+ DUMP_STATE_FETCHING, /* Dump retrieval is in progress */
+ DUMP_STATE_FETCH, /* Dump retrieve complete */
+ DUMP_STATE_PARTIAL, /* Partial read */
+ DUMP_STATE_ABORTING, /* Aborting due to kexec */
+};
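+
+/*
+ * Typical flow through the states above (a sketch of the process
+ * described in the file header):
+ *
+ * NONE -> NOTIFY (Linux told a dump is available)
+ * -> FETCHING (OPAL_DUMP_READ in progress)
+ * -> FETCH or PARTIAL (Linux copies the data and acks)
+ * -> back to NONE
+ *
+ * ABORTING is entered if the host kexecs or the FSP resets while a
+ * fetch is in flight; ABSENT means there is no FSP dump support.
+ */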
+
+/* Pending dump list */
+struct dump_record {
+ uint8_t type;
+ uint32_t id;
+ uint32_t size;
+ struct list_node link;
+};
+
+/* List definitions */
+static LIST_HEAD(dump_pending);
+static LIST_HEAD(dump_free);
+
+/* Dump retrieve state */
+static enum dump_state dump_state = DUMP_STATE_NONE;
+
+/* Dump buffer SG list */
+static struct opal_sg_list *dump_data;
+static struct dump_record *dump_entry;
+static int64_t dump_offset;
+static size_t fetch_remain;
+
+/* FipS dump retry count */
+static int retry_cnt;
+
+/* Protect list and dump retrieve state */
+static struct lock dump_lock = LOCK_UNLOCKED;
+
+/* Forward declaration */
+static int64_t fsp_opal_dump_init(uint8_t dump_type);
+static int64_t fsp_dump_read(void);
+
+DEFINE_LOG_ENTRY(OPAL_RC_DUMP_INIT, OPAL_PLATFORM_ERR_EVT, OPAL_DUMP,
+ OPAL_PLATFORM_FIRMWARE,
+ OPAL_PREDICTIVE_ERR_FAULT_RECTIFY_REBOOT,
+ OPAL_NA);
+
+DEFINE_LOG_ENTRY(OPAL_RC_DUMP_LIST, OPAL_PLATFORM_ERR_EVT, OPAL_DUMP,
+ OPAL_PLATFORM_FIRMWARE,
+ OPAL_INFO,
+ OPAL_NA);
+
+DEFINE_LOG_ENTRY(OPAL_RC_DUMP_ACK, OPAL_PLATFORM_ERR_EVT, OPAL_DUMP,
+ OPAL_PLATFORM_FIRMWARE, OPAL_INFO,
+ OPAL_NA);
+
+/*
+ * Helper functions
+ */
+static inline void update_dump_state(enum dump_state state)
+{
+ dump_state = state;
+}
+
+static int64_t check_dump_state(void)
+{
+ switch (dump_state) {
+ case DUMP_STATE_ABSENT:
+ return OPAL_HARDWARE;
+ case DUMP_STATE_NONE:
+ case DUMP_STATE_NOTIFY:
+ /* During dump fetch, notify is wrong state */
+ return OPAL_WRONG_STATE;
+ case DUMP_STATE_FETCHING:
+ case DUMP_STATE_ABORTING:
+ return OPAL_BUSY_EVENT;
+ case DUMP_STATE_FETCH:
+ return OPAL_SUCCESS;
+ case DUMP_STATE_PARTIAL:
+ return OPAL_PARTIAL;
+ }
+ return OPAL_SUCCESS;
+}
+
+static inline void dump_tce_map(uint32_t tce_offset,
+ void *buffer, uint32_t size)
+{
+ uint32_t tlen = ALIGN_UP(size, TCE_PSIZE);
+ fsp_tce_map(PSI_DMA_DUMP_DATA + tce_offset, buffer, tlen);
+}
+
+static inline void dump_tce_unmap(uint32_t size)
+{
+ fsp_tce_unmap(PSI_DMA_DUMP_DATA, size);
+}
+
+/*
+ * Returns Data set ID for the given dump type
+ */
+static inline uint16_t get_dump_data_set_id(uint8_t type)
+{
+ switch (type) {
+ case DUMP_TYPE_FSP:
+ return FSP_DATASET_SP_DUMP;
+ case DUMP_TYPE_SYS:
+ return FSP_DATASET_HW_DUMP;
+ default:
+ break;
+ }
+ return OPAL_INTERNAL_ERROR;
+}
+
+/*
+ * Returns max data we can fetch from FSP fetch data call
+ */
+static inline int64_t get_dump_fetch_max_size(uint8_t type)
+{
+ switch (type) {
+ case DUMP_TYPE_FSP:
+ return DUMP_FETCH_SIZE_FSP;
+ case DUMP_TYPE_SYS:
+ return DUMP_FETCH_SIZE_SYS;
+ default:
+ break;
+ }
+ return OPAL_INTERNAL_ERROR;
+}
+
+/*
+ * Get dump record from pending list
+ */
+static inline struct dump_record *get_dump_rec_from_list(uint32_t id)
+{
+ struct dump_record *record;
+
+ list_for_each(&dump_pending, record, link) {
+ if (record->id == id)
+ return record;
+ }
+ return NULL;
+}
+
+/*
+ * New dump available notification to Linux
+ */
+static void update_opal_dump_notify(void)
+{
+ /*
+ * Wait until the current dump retrieval completes
+ * before notifying again.
+ */
+ if (dump_state != DUMP_STATE_NONE)
+ return;
+
+ /* More dumps to retrieve */
+ if (!list_empty(&dump_pending)) {
+ update_dump_state(DUMP_STATE_NOTIFY);
+ opal_update_pending_evt(OPAL_EVENT_DUMP_AVAIL,
+ OPAL_EVENT_DUMP_AVAIL);
+ }
+}
+
+static int64_t remove_dump_id_from_list(uint32_t dump_id)
+{
+ struct dump_record *record, *nxt_record;
+ int rc = OPAL_SUCCESS;
+ bool found = false;
+
+ /* Remove record from pending list */
+ list_for_each_safe(&dump_pending, record, nxt_record, link) {
+ if (record->id != dump_id)
+ continue;
+
+ found = true;
+ list_del(&record->link);
+ list_add(&dump_free, &record->link);
+ break;
+ }
+
+ /*
+ * Continue with update_opal_dump_notify() even if we fail
+ * to remove the ID, so that we can resend the notification
+ * for the same dump ID to Linux.
+ */
+ if (!found) { /* List corrupted? */
+ log_simple_error(&e_info(OPAL_RC_DUMP_LIST),
+ "DUMP: ID 0x%x not found in list!\n",
+ dump_id);
+ rc = OPAL_PARAMETER;
+ }
+
+ /* Update state */
+ update_dump_state(DUMP_STATE_NONE);
+ /* Notify next available dump to retrieve */
+ update_opal_dump_notify();
+
+ return rc;
+}
+
+static int64_t add_dump_id_to_list(uint8_t dump_type,
+ uint32_t dump_id, uint32_t dump_size)
+{
+ struct dump_record *record;
+ int rc = OPAL_SUCCESS;
+
+ lock(&dump_lock);
+
+ rc = check_dump_state();
+ if (rc == OPAL_HARDWARE)
+ goto out;
+
+ /* List is full ? */
+ if (list_empty(&dump_free)) {
+ printf("DUMP: Dump ID 0x%x is not queued.\n", dump_id);
+ rc = OPAL_RESOURCE;
+ goto out;
+ }
+
+ /* Already queued? */
+ record = get_dump_rec_from_list(dump_id);
+ if (record) {
+ rc = OPAL_SUCCESS;
+ goto out;
+ }
+
+ /* Add to list */
+ record = list_pop(&dump_free, struct dump_record, link);
+ record->type = dump_type;
+ record->id = dump_id;
+ record->size = dump_size;
+ list_add_tail(&dump_pending, &record->link);
+
+ /* OPAL notification */
+ update_opal_dump_notify();
+ rc = OPAL_SUCCESS;
+
+out:
+ unlock(&dump_lock);
+ return rc;
+}
+
+static void dump_init_complete(struct fsp_msg *msg)
+{
+ uint8_t status = (msg->resp->word1 >> 8) & 0xff;
+
+ printf("DUMP: FipS dump init status = 0x%x\n", status);
+ fsp_freemsg(msg);
+
+ switch (status) {
+ case FSP_STATUS_SUCCESS:
+ printf("DUMP: Initiated FipS dump.\n");
+ break;
+ case FSP_STATUS_BUSY: /* Retry, if FSP is busy */
+ if (retry_cnt++ < FIPS_DUMP_MAX_RETRY)
+ if (fsp_opal_dump_init(DUMP_TYPE_FSP) == OPAL_SUCCESS)
+ return;
+ break;
+ default:
+ break;
+ }
+ /* Reset max retry count */
+ retry_cnt = 0;
+}
+
+/*
+ * Initiate new FipS dump
+ */
+static int64_t fsp_opal_dump_init(uint8_t dump_type)
+{
+ struct fsp_msg *msg;
+ int rc = OPAL_SUCCESS;
+ uint32_t *tool_type = (void *)FSP_DUMP_TOOL_TYPE;
+ uint32_t *client_id = (void *)FSP_DUMP_CLIENT_ID;
+
+ /* Only FipS dump generate request is supported */
+ if (dump_type != DUMP_TYPE_FSP)
+ return OPAL_PARAMETER;
+
+ msg = fsp_mkmsg(FSP_CMD_FSP_DUMP_INIT, 6, *tool_type,
+ sizeof(FSP_DUMP_CLIENT_ID), *client_id,
+ *(client_id + 1), *(client_id + 2), *(client_id + 3));
+
+ if (!msg) {
+ log_simple_error(&e_info(OPAL_RC_DUMP_INIT),
+ "DUMP: Message allocation failed.\n");
+ rc = OPAL_INTERNAL_ERROR;
+ } else if (fsp_queue_msg(msg, dump_init_complete)) {
+ log_simple_error(&e_info(OPAL_RC_DUMP_INIT),
+ "DUMP: Failed to queue FipS dump init request.\n");
+ fsp_freemsg(msg);
+ rc = OPAL_INTERNAL_ERROR;
+ }
+
+ return rc;
+}
+
+/*
+ * OPAL interface to send dump information to Linux.
+ */
+static int64_t fsp_opal_dump_info2(__be32 *dump_id, __be32 *dump_size,
+ __be32 *dump_type)
+{
+ struct dump_record *record;
+ int rc = OPAL_SUCCESS;
+
+ lock(&dump_lock);
+
+ /* Clear notification */
+ opal_update_pending_evt(OPAL_EVENT_DUMP_AVAIL, 0);
+
+ record = list_top(&dump_pending, struct dump_record, link);
+ if (!record) { /* List corrupted? */
+ update_dump_state(DUMP_STATE_NONE);
+ rc = OPAL_INTERNAL_ERROR;
+ goto out;
+ }
+ *dump_id = cpu_to_be32(record->id);
+ *dump_size = cpu_to_be32(record->size);
+ *dump_type = cpu_to_be32(record->type);
+
+out:
+ unlock(&dump_lock);
+ return rc;
+}
+
+static int64_t fsp_opal_dump_info(__be32 *dump_id, __be32 *dump_size)
+{
+ __be32 dump_type;
+ return fsp_opal_dump_info2(dump_id, dump_size, &dump_type);
+}
+
+static int64_t validate_dump_sglist(struct opal_sg_list *list,
+ int64_t *size)
+{
+ struct opal_sg_list *sg;
+ struct opal_sg_entry *prev_entry, *entry;
+ int length, num_entries, i;
+
+ prev_entry = NULL;
+ *size = 0;
+ for (sg = list; sg; sg = (struct opal_sg_list*)be64_to_cpu(sg->next)) {
+ length = be64_to_cpu(sg->length) - 16;
+ num_entries = length / sizeof(struct opal_sg_entry);
+ if (num_entries <= 0)
+ return OPAL_PARAMETER;
+
+ for (i = 0; i < num_entries; i++) {
+ entry = &sg->entry[i];
+ *size += be64_to_cpu(entry->length);
+
+ /* All entries must be aligned */
+ if (((uint64_t)be64_to_cpu(entry->data)) & 0xfff)
+ return OPAL_PARAMETER;
+
+ /* All non-terminal entries size must be aligned */
+ if (prev_entry && (be64_to_cpu(prev_entry->length) & 0xfff))
+ return OPAL_PARAMETER;
+
+ prev_entry = entry;
+ }
+ }
+ return OPAL_SUCCESS;
+}
+
+/*
+ * Map dump buffer to TCE buffer
+ */
+static int64_t map_dump_buffer(void)
+{
+ struct opal_sg_list *sg;
+ struct opal_sg_entry *entry;
+ int64_t fetch_max;
+ int length, num_entries, i;
+ int buf_off, fetch_off, tce_off, sg_off;
+ bool last = false;
+
+ /* FSP fetch max size */
+ fetch_max = get_dump_fetch_max_size(dump_entry->type);
+ if (fetch_max > (dump_entry->size - dump_offset))
+ fetch_remain = dump_entry->size - dump_offset;
+ else
+ fetch_remain = fetch_max;
+
+ /* offsets */
+ fetch_off = fetch_remain;
+ tce_off = sg_off = 0;
+
+ for (sg = dump_data; sg; sg = (struct opal_sg_list*)be64_to_cpu(sg->next)) {
+ num_entries = (be64_to_cpu(sg->length) - 16) /
+ sizeof(struct opal_sg_entry);
+ if (num_entries <= 0)
+ return OPAL_PARAMETER;
+
+ for (i = 0; i < num_entries; i++) {
+ entry = &sg->entry[i];
+
+ /* Continue until we get offset */
+ if ((sg_off + be64_to_cpu(entry->length)) < dump_offset) {
+ sg_off += be64_to_cpu(entry->length);
+ continue;
+ }
+
+ /*
+ * An SG list entry can be larger than 4k.
+ * Map only the required pages instead of
+ * mapping the entire entry.
+ */
+ if (!tce_off) {
+ buf_off = (dump_offset - sg_off) & ~0xfff;
+ length = be64_to_cpu(entry->length) - buf_off;
+ } else {
+ buf_off = 0;
+ length = be64_to_cpu(entry->length);
+ }
+
+ /* Adjust length for last mapping */
+ if (fetch_off <= length) {
+ length = fetch_off;
+ last = true;
+ }
+
+ /* Adjust offset */
+ sg_off += be64_to_cpu(entry->length);
+ fetch_off -= length;
+
+ /* TCE mapping */
+ dump_tce_map(tce_off, (void*)(be64_to_cpu(entry->data) + buf_off), length);
+ tce_off += length;
+
+ /* TCE mapping complete */
+ if (last)
+ return OPAL_SUCCESS;
+ }
+ } /* outer loop */
+ return OPAL_PARAMETER;
+}
+
+static void dump_read_complete(struct fsp_msg *msg)
+{
+ void *buffer;
+ size_t length, offset;
+ int rc;
+ uint32_t dump_id;
+ uint16_t id;
+ uint8_t flags, status;
+ bool compl = false;
+
+ status = (msg->resp->word1 >> 8) & 0xff;
+ flags = (fsp_msg_get_data_word(msg, 0) >> 16) & 0xff;
+ id = fsp_msg_get_data_word(msg, 0) & 0xffff;
+ dump_id = fsp_msg_get_data_word(msg, 1);
+ offset = fsp_msg_get_data_word(msg->resp, 1);
+ length = fsp_msg_get_data_word(msg->resp, 2);
+
+ fsp_freemsg(msg);
+
+ lock(&dump_lock);
+
+ if (dump_state == DUMP_STATE_ABORTING) {
+ printf("DUMP: Fetch dump aborted, ID = 0x%x\n", dump_id);
+ dump_tce_unmap(PSI_DMA_DUMP_DATA_SIZE);
+ update_dump_state(DUMP_STATE_NONE);
+ goto bail;
+ }
+
+ switch (status) {
+ case FSP_STATUS_SUCCESS: /* Fetch next dump block */
+ if (dump_offset < dump_entry->size) {
+ dump_tce_unmap(PSI_DMA_DUMP_DATA_SIZE);
+ rc = fsp_dump_read();
+ if (rc == OPAL_SUCCESS)
+ goto bail;
+ } else { /* Dump read complete */
+ compl = true;
+ }
+ break;
+ case FSP_STATUS_MORE_DATA: /* More data to read */
+ offset += length;
+ buffer = (void *)PSI_DMA_DUMP_DATA + offset;
+ fetch_remain -= length;
+
+ rc = fsp_fetch_data_queue(flags, id, dump_id, offset, buffer,
+ &fetch_remain, dump_read_complete);
+ if (rc == OPAL_SUCCESS)
+ goto bail;
+ break;
+ default:
+ break;
+ }
+
+ dump_tce_unmap(PSI_DMA_DUMP_DATA_SIZE);
+
+ /* Update state */
+ if (compl) {
+ printf("DUMP: Fetch dump success. ID = 0x%x\n", dump_id);
+ update_dump_state(DUMP_STATE_FETCH);
+ } else {
+ printf("DUMP: Fetch dump partial. ID = 0x%x\n", dump_id);
+ update_dump_state(DUMP_STATE_PARTIAL);
+ }
+ bail:
+ unlock(&dump_lock);
+}
+
+/*
+ * Fetch dump data from FSP
+ */
+static int64_t fsp_dump_read(void)
+{
+ int64_t rc;
+ uint16_t data_set;
+ uint8_t flags = 0x00;
+
+ /* Get data set ID */
+ data_set = get_dump_data_set_id(dump_entry->type);
+
+ /* Map TCE buffer */
+ rc = map_dump_buffer();
+ if (rc != OPAL_SUCCESS) {
+ printf("DUMP: TCE mapping failed\n");
+ return rc;
+ }
+
+ printf("DUMP: Fetch Dump. ID = %02x, sub ID = %08x, len = %ld\n",
+ data_set, dump_entry->id, fetch_remain);
+
+ /* Fetch data */
+ rc = fsp_fetch_data_queue(flags, data_set, dump_entry->id,
+ dump_offset, (void *)PSI_DMA_DUMP_DATA,
+ &fetch_remain, dump_read_complete);
+
+ /* Adjust dump fetch offset */
+ dump_offset += fetch_remain;
+
+ return rc;
+}
+
+static int64_t fsp_opal_dump_read(uint32_t dump_id,
+ struct opal_sg_list *list)
+{
+ struct dump_record *record;
+ int64_t rc, size;
+
+ lock(&dump_lock);
+
+ /* Check state */
+ if (dump_state != DUMP_STATE_NOTIFY) {
+ rc = check_dump_state();
+ goto out;
+ }
+
+ /* Validate dump ID */
+ record = get_dump_rec_from_list(dump_id);
+ if (!record) { /* List corrupted? */
+ rc = OPAL_INTERNAL_ERROR;
+ goto out;
+ }
+
+ /* Validate dump buffer and size */
+ rc = validate_dump_sglist(list, &size);
+ if (rc != OPAL_SUCCESS) {
+ printf("DUMP: SG list validation failed\n");
+ goto out;
+ }
+
+ if (size < record->size) { /* Insufficient buffer */
+ printf("DUMP: Insufficient buffer\n");
+ rc = OPAL_PARAMETER;
+ goto out;
+ }
+
+ /* Update state */
+ update_dump_state(DUMP_STATE_FETCHING);
+
+ /* Fetch dump data */
+ dump_entry = record;
+ dump_data = list;
+ dump_offset = 0;
+ rc = fsp_dump_read();
+ if (rc != OPAL_SUCCESS)
+ goto out;
+
+ /* Check status after initiating fetch data */
+ rc = check_dump_state();
+
+out:
+ unlock(&dump_lock);
+ return rc;
+}
+
+static void dump_ack_complete(struct fsp_msg *msg)
+{
+ uint8_t status = (msg->resp->word1 >> 8) & 0xff;
+
+ if (status)
+ log_simple_error(&e_info(OPAL_RC_DUMP_ACK),
+ "DUMP: ACK failed for ID: 0x%x\n",
+ fsp_msg_get_data_word(msg, 0));
+ else
+ printf("DUMP: ACKed dump ID: 0x%x\n", fsp_msg_get_data_word(msg, 0));
+
+ fsp_freemsg(msg);
+}
+
+/*
+ * Acknowledge dump
+ */
+static int64_t fsp_opal_dump_ack(uint32_t dump_id)
+{
+ struct dump_record *record;
+ struct fsp_msg *msg;
+ int rc;
+ uint32_t cmd;
+ uint8_t dump_type = 0;
+
+ /* Get dump type */
+ lock(&dump_lock);
+ record = get_dump_rec_from_list(dump_id);
+ if (record)
+ dump_type = record->type;
+
+ /*
+ * The next available dump in the pending list will be of a different
+ * type, hence we don't need to wait for the ack to complete.
+ *
+ * Note:
+ * This allows us to proceed even if we fail to ACK.
+ * In the worst case we may get notification for the
+ * same dump again, which is probably better than
+ * looping forever.
+ */
+ rc = remove_dump_id_from_list(dump_id);
+ if (rc != OPAL_SUCCESS) /* Invalid dump id */
+ goto out;
+
+ /* Adjust mod value */
+ cmd = FSP_CMD_ACK_DUMP | (dump_type & 0xff);
+ msg = fsp_mkmsg(cmd, 1, dump_id);
+ if (!msg) {
+ log_simple_error(&e_info(OPAL_RC_DUMP_ACK),
+ "DUMP: Message allocation failed.!\n");
+ rc = OPAL_INTERNAL_ERROR;
+ } else if (fsp_queue_msg(msg, dump_ack_complete)) {
+ log_simple_error(&e_info(OPAL_RC_DUMP_ACK),
+ "DUMP: Failed to queue dump ack message.\n");
+ fsp_freemsg(msg);
+ rc = OPAL_INTERNAL_ERROR;
+ }
+out:
+ unlock(&dump_lock);
+ return rc;
+}
+
+/* Resend dump available notification */
+static int64_t fsp_opal_dump_resend_notification(void)
+{
+ lock(&dump_lock);
+
+ if (dump_state != DUMP_STATE_ABSENT)
+ update_dump_state(DUMP_STATE_NONE);
+
+ update_opal_dump_notify();
+
+ unlock(&dump_lock);
+
+ return OPAL_SUCCESS;
+}
+
+/*
+ * Handle FSP R/R event.
+ */
+static bool fsp_dump_retrieve_rr(uint32_t cmd_sub_mod,
+ struct fsp_msg *msg __unused)
+{
+ switch (cmd_sub_mod) {
+ case FSP_RESET_START:
+ lock(&dump_lock);
+ /* Reset dump state */
+ if (dump_state == DUMP_STATE_FETCHING)
+ update_dump_state(DUMP_STATE_ABORTING);
+ unlock(&dump_lock);
+ return true;
+ case FSP_RELOAD_COMPLETE:
+ lock(&dump_lock);
+
+ /* Reset TCE mapping */
+ dump_tce_unmap(PSI_DMA_DUMP_DATA_SIZE);
+
+ /* Reset dump state */
+ update_dump_state(DUMP_STATE_NONE);
+
+ /*
+ * For now, keep the R/R handler simple. In the worst case
+ * we may end up resending the dump available notification for
+ * the same dump ID twice to Linux.
+ */
+ update_opal_dump_notify();
+ unlock(&dump_lock);
+ return true;
+ }
+ return false;
+}
+
+/*
+ * Handle host kexec'ing scenarios
+ */
+static bool opal_kexec_dump_notify(void *data __unused)
+{
+ bool ready = true;
+
+ lock(&dump_lock);
+
+ /* Dump retrieve is in progress? */
+ if (dump_state == DUMP_STATE_FETCHING)
+ dump_state = DUMP_STATE_ABORTING;
+
+ /* Not yet safe to kexec */
+ if (dump_state == DUMP_STATE_ABORTING)
+ ready = false;
+
+ unlock(&dump_lock);
+
+ return ready;
+}
+
+/*
+ * FipS dump notification
+ */
+void fsp_fips_dump_notify(uint32_t dump_id, uint32_t dump_size)
+{
+ printf("DUMP: FipS dump available. ID = 0x%x [size: %d bytes]\n",
+ dump_id, dump_size);
+ add_dump_id_to_list(DUMP_TYPE_FSP, dump_id, dump_size);
+}
+
+/*
+ * System/Platform dump notification
+ */
+static bool fsp_sys_dump_notify(uint32_t cmd_sub_mod, struct fsp_msg *msg)
+{
+ /*
+ * Though the spec says mod 00 is deprecated, we still
+ * seem to get mod 00 notifications (at least on
+ * P7 machines).
+ */
+ if (cmd_sub_mod != FSP_RSP_SYS_DUMP &&
+ cmd_sub_mod != FSP_RSP_SYS_DUMP_OLD)
+ return false;
+
+ printf("DUMP: Platform dump available. ID = 0x%x [size: %d bytes]\n",
+ fsp_msg_get_data_word(msg, 0), fsp_msg_get_data_word(msg, 1));
+
+ add_dump_id_to_list(DUMP_TYPE_SYS,
+ fsp_msg_get_data_word(msg, 0),
+ fsp_msg_get_data_word(msg, 1));
+ return true;
+}
+
+/*
+ * If a platform dump is available at IPL time, we get the
+ * notification via HDAT. Check the device tree for the
+ * dump's presence.
+ */
+static void check_ipl_sys_dump(void)
+{
+ struct dt_node *dump_node, *opal_node;
+ uint32_t dump_id, dump_size;
+
+ if (proc_gen >= proc_gen_p9) {
+ opal_node = dt_find_by_path(dt_root, "ibm,opal");
+ if (!opal_node)
+ return;
+ dump_node = dt_find_by_path(opal_node, "dump");
+ if (dump_node) {
+ if (dt_find_property(dump_node, "mpipl-boot"))
+ return;
+ }
+ }
+
+ dump_node = dt_find_by_path(dt_root, "ipl-params/platform-dump");
+ if (!dump_node)
+ return;
+
+ if (!dt_find_property(dump_node, "dump-id"))
+ return;
+
+ dump_id = dt_prop_get_u32(dump_node, "dump-id");
+ dump_size = (uint32_t)dt_prop_get_u64(dump_node, "total-size");
+
+ printf("DUMP: Platform dump present during IPL.\n");
+ printf(" ID = 0x%x [size: %d bytes]\n", dump_id, dump_size);
+
+ add_dump_id_to_list(DUMP_TYPE_SYS, dump_id, dump_size);
+}
+
+/*
+ * Allocate and initialize dump list
+ */
+static int init_dump_free_list(void)
+{
+ struct dump_record *entry;
+ int i;
+
+ entry = zalloc(sizeof(struct dump_record) * MAX_DUMP_RECORD);
+ if (!entry) {
+ log_simple_error(&e_info(OPAL_RC_DUMP_INIT),
+ "DUMP: Out of memory\n");
+ return -ENOMEM;
+ }
+
+ for (i = 0; i < MAX_DUMP_RECORD; i++) {
+ list_add_tail(&dump_free, &entry->link);
+ entry++;
+ }
+ return 0;
+}
+
+static struct fsp_client fsp_sys_dump_client = {
+ .message = fsp_sys_dump_notify,
+};
+
+static struct fsp_client fsp_dump_client_rr = {
+ .message = fsp_dump_retrieve_rr,
+};
+
+void fsp_dump_init(void)
+{
+ if (!fsp_present()) {
+ update_dump_state(DUMP_STATE_ABSENT);
+ return;
+ }
+
+ /* Initialize list */
+ if (init_dump_free_list() != 0) {
+ update_dump_state(DUMP_STATE_ABSENT);
+ return;
+ }
+
+ /* Register for Class CE */
+ fsp_register_client(&fsp_sys_dump_client, FSP_MCLASS_SERVICE);
+ /* Register for Class AA (FSP R/R) */
+ fsp_register_client(&fsp_dump_client_rr, FSP_MCLASS_RR_EVENT);
+
+ /* Register for sync on host reboot call */
+ opal_add_host_sync_notifier(opal_kexec_dump_notify, NULL);
+
+ /* OPAL interface */
+ opal_register(OPAL_DUMP_INIT, fsp_opal_dump_init, 1);
+ opal_register(OPAL_DUMP_INFO, fsp_opal_dump_info, 2);
+ opal_register(OPAL_DUMP_INFO2, fsp_opal_dump_info2, 3);
+ opal_register(OPAL_DUMP_READ, fsp_opal_dump_read, 2);
+ opal_register(OPAL_DUMP_ACK, fsp_opal_dump_ack, 1);
+ opal_register(OPAL_DUMP_RESEND, fsp_opal_dump_resend_notification, 0);
+
+ /* Check for platform dump presence during IPL time */
+ check_ipl_sys_dump();
+}
diff --git a/roms/skiboot/hw/fsp/fsp-elog-read.c b/roms/skiboot/hw/fsp/fsp-elog-read.c
new file mode 100644
index 000000000..bd23ffbe8
--- /dev/null
+++ b/roms/skiboot/hw/fsp/fsp-elog-read.c
@@ -0,0 +1,608 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * This code enables retrieving error logs from the FSP into Sapphire in
+ * sequence.
+ * The FSP sends the next log only when Sapphire sends a new log notification
+ * response to the FSP. On completion of reading the log from the FSP,
+ * OPAL_EVENT_ERROR_LOG_AVAIL is signaled. This remains raised until a call
+ * to opal_elog_read() is made and OPAL_SUCCESS is returned, at which point
+ * the operation is complete and the event is cleared. This is the READ
+ * action from the FSP.
+ *
+ * Copyright 2013-2017 IBM Corp.
+ */
+
+/*
+ * Design of READ error log:
+ * When we receive a new error log entry notification from the FSP, we queue it
+ * into the "pending" list. If the "pending" list is not empty, we start
+ * fetching the log from the FSP.
+ *
+ * When Linux reads a log entry, we dequeue it from the "pending" list and
+ * enqueue it to another "processed" list. At this point, if the "pending"
+ * list is not empty, we continue to fetch the next log.
+ *
+ * When Linux calls opal_resend_pending_logs(), we fetch the log corresponding
+ * to the head of the pending list and move it to the processed list, and
+ * continue this process until the pending list is empty. If the pending list
+ * was empty earlier and is currently non-empty, we initiate an error log fetch.
+ *
+ * When Linux acks an error log, we remove it from processed list.
+ */
+
+#include <errno.h>
+#include <fsp.h>
+#include <fsp-elog.h>
+#include <lock.h>
+#include <opal-api.h>
+#include <psi.h>
+#include <skiboot.h>
+
+/*
+ * Maximum number of entries that are pre-allocated
+ * to keep track of pending elogs to be fetched.
+ */
+#define ELOG_READ_MAX_RECORD 128
+
+/* Structure to maintain log-id, log-size, pending and processed list. */
+struct fsp_log_entry {
+ uint32_t log_id;
+ size_t log_size;
+ struct list_node link;
+};
+
+static LIST_HEAD(elog_read_pending);
+static LIST_HEAD(elog_read_processed);
+static LIST_HEAD(elog_read_free);
+/*
+ * This lock protects concurrent updates of the processed and pending lists
+ * and is also held while updating the state of each log.
+ */
+static struct lock elog_read_lock = LOCK_UNLOCKED;
+
+#define ELOG_READ_BUFFER_SIZE 0x00004000
+/* Log buffer to copy FSP log for read */
+static void *elog_read_buffer;
+static uint32_t elog_head_id; /* FSP entry ID */
+static size_t elog_head_size; /* Actual FSP log size */
+static uint32_t elog_read_retries; /* Bad response status count */
+
+/* Initialize the state of the log */
+static enum elog_head_state elog_read_from_fsp_head_state = ELOG_STATE_NONE;
+
+static bool elog_enabled = false;
+
+/* Need forward declaration because of circular dependency. */
+static void fsp_elog_queue_fetch(void);
+
+/*
+ * Check the response message for the mbox acknowledgement
+ * command sent to the FSP.
+ */
+static void fsp_elog_ack_complete(struct fsp_msg *msg)
+{
+ uint8_t val;
+
+ val = (msg->resp->word1 >> 8) & 0xff;
+ if (val != 0)
+ prerror("ELOG: Acknowledgement error\n");
+
+ fsp_freemsg(msg);
+}
+
+/* Send error log PHYP acknowledgement to FSP with entry ID. */
+static int64_t fsp_send_elog_ack(uint32_t log_id)
+{
+ struct fsp_msg *ack_msg;
+
+ ack_msg = fsp_mkmsg(FSP_CMD_ERRLOG_PHYP_ACK, 1, log_id);
+ if (!ack_msg) {
+ prerror("ELOG: Failed to allocate ack message\n");
+ return OPAL_INTERNAL_ERROR;
+ }
+
+ if (fsp_queue_msg(ack_msg, fsp_elog_ack_complete)) {
+ fsp_freemsg(ack_msg);
+ ack_msg = NULL;
+ prerror("ELOG: Error queueing elog ack complete\n");
+ return OPAL_INTERNAL_ERROR;
+ }
+
+ return OPAL_SUCCESS;
+}
+
+/* Retrieve error log from FSP with TCE for the data transfer. */
+static void fsp_elog_check_and_fetch_head(void)
+{
+ lock(&elog_read_lock);
+ if (elog_read_from_fsp_head_state != ELOG_STATE_NONE ||
+ list_empty(&elog_read_pending)) {
+ unlock(&elog_read_lock);
+ return;
+ }
+
+ elog_read_retries = 0;
+ /* Start fetching first entry from the pending list */
+ fsp_elog_queue_fetch();
+ unlock(&elog_read_lock);
+}
+
+void elog_set_head_state(bool opal_logs, enum elog_head_state state)
+{
+ static enum elog_head_state opal_logs_state = ELOG_STATE_NONE;
+ static enum elog_head_state fsp_logs_state = ELOG_STATE_NONE;
+
+ /* ELOG disabled */
+ if (!elog_enabled)
+ return;
+
+ if (opal_logs)
+ opal_logs_state = state;
+ else
+ fsp_logs_state = state;
+
+ if (fsp_logs_state == ELOG_STATE_FETCHED_DATA ||
+ opal_logs_state == ELOG_STATE_FETCHED_DATA)
+ opal_update_pending_evt(OPAL_EVENT_ERROR_LOG_AVAIL,
+ OPAL_EVENT_ERROR_LOG_AVAIL);
+ else
+ opal_update_pending_evt(OPAL_EVENT_ERROR_LOG_AVAIL, 0);
+}
+
+/* This function should be called with the lock held. */
+static inline void fsp_elog_set_head_state(enum elog_head_state state)
+{
+ elog_set_head_state(false, state);
+ elog_read_from_fsp_head_state = state;
+}
+
+/*
+ * Once we have retried fetching a log from the FSP the maximum
+ * number of times, this function deletes the log from the
+ * pending list and resets the state so the next log can be fetched.
+ *
+ * This function should be called with the lock held.
+ */
+static void fsp_elog_fetch_failure(uint8_t fsp_status)
+{
+ struct fsp_log_entry *log_data;
+
+ /* Read top list and delete the node */
+ log_data = list_top(&elog_read_pending, struct fsp_log_entry, link);
+ if (!log_data) {
+ /**
+ * @fwts-label ElogFetchFailureInconsistent
+ * @fwts-advice Inconsistent state between OPAL and FSP
+ * in code path for handling failure of fetching error log
+ * from FSP. Likely a bug in interaction between FSP and OPAL.
+ */
+ prlog(PR_ERR, "%s: Inconsistent internal list state !\n",
+ __func__);
+ } else {
+ list_del(&log_data->link);
+ list_add(&elog_read_free, &log_data->link);
+ prerror("ELOG: received invalid data: %x FSP status: 0x%x\n",
+ log_data->log_id, fsp_status);
+ }
+
+ fsp_elog_set_head_state(ELOG_STATE_NONE);
+}
+
+/* Read response value from FSP for fetch sp data mbox command */
+static void fsp_elog_read_complete(struct fsp_msg *read_msg)
+{
+ uint8_t val;
+
+ lock(&elog_read_lock);
+ val = (read_msg->resp->word1 >> 8) & 0xff;
+ fsp_freemsg(read_msg);
+ if (elog_read_from_fsp_head_state == ELOG_STATE_REJECTED) {
+ fsp_elog_set_head_state(ELOG_STATE_NONE);
+ goto elog_read_out;
+ }
+
+ switch (val) {
+ case FSP_STATUS_SUCCESS:
+ fsp_elog_set_head_state(ELOG_STATE_FETCHED_DATA);
+ break;
+
+ case FSP_STATUS_DMA_ERROR:
+ if (elog_read_retries++ < MAX_RETRIES) {
+ /*
+ * On an error response from the FSP we retry the
+ * fetch sp data mbox command up to MAX_RETRIES
+ * times. If the response is still not valid after
+ * that, we give up and record the fetch failure.
+ */
+ fsp_elog_queue_fetch();
+ break;
+ }
+
+ fsp_elog_fetch_failure(val);
+ break;
+
+ default:
+ fsp_elog_fetch_failure(val);
+ }
+
+elog_read_out:
+ unlock(&elog_read_lock);
+
+ /* Check if a new log needs fetching */
+ fsp_elog_check_and_fetch_head();
+}
+
+/* Read error log from FSP through mbox commands */
+static void fsp_elog_queue_fetch(void)
+{
+ int rc;
+ uint8_t flags = 0;
+ struct fsp_log_entry *entry;
+
+ entry = list_top(&elog_read_pending, struct fsp_log_entry, link);
+ if (!entry) {
+ /**
+ * @fwts-label ElogQueueInconsistent
+ * @fwts-advice Bug in interaction between FSP and OPAL. We
+ * expected there to be a pending read from FSP but the list
+ * was empty.
+ */
+ prlog(PR_ERR, "%s: Inconsistent internal list state !\n",
+ __func__);
+ fsp_elog_set_head_state(ELOG_STATE_NONE);
+ return;
+ }
+
+ fsp_elog_set_head_state(ELOG_STATE_FETCHING);
+ elog_head_id = entry->log_id;
+ elog_head_size = entry->log_size;
+ rc = fsp_fetch_data_queue(flags, FSP_DATASET_ERRLOG, elog_head_id,
+ 0, (void *)PSI_DMA_ERRLOG_READ_BUF,
+ &elog_head_size, fsp_elog_read_complete);
+ if (rc) {
+ prerror("ELOG: failed to queue read message: %d\n", rc);
+ fsp_elog_set_head_state(ELOG_STATE_NONE);
+ }
+}
+
+/* OPAL interface for PowerNV to read log size and log ID from Sapphire. */
+static int64_t fsp_opal_elog_info(__be64 *opal_elog_id,
+ __be64 *opal_elog_size, __be64 *elog_type)
+{
+ struct fsp_log_entry *log_data;
+
+ /* Copy type of the error log */
+ *elog_type = cpu_to_be64(ELOG_TYPE_PEL);
+
+ /* Check if any OPAL log needs to be reported to the host */
+ if (opal_elog_info(opal_elog_id, opal_elog_size))
+ return OPAL_SUCCESS;
+
+ lock(&elog_read_lock);
+ if (elog_read_from_fsp_head_state != ELOG_STATE_FETCHED_DATA) {
+ unlock(&elog_read_lock);
+ return OPAL_WRONG_STATE;
+ }
+
+ log_data = list_top(&elog_read_pending, struct fsp_log_entry, link);
+ if (!log_data) {
+ /**
+ * @fwts-label ElogInfoInconsistentState
+ * @fwts-advice We expected there to be an entry in the list
+ * of error logs for the error log we're fetching information
+ * for. There wasn't. This means there's a bug.
+ */
+ prlog(PR_ERR, "%s: Inconsistent internal list state !\n",
+ __func__);
+ fsp_elog_set_head_state(ELOG_STATE_NONE);
+ unlock(&elog_read_lock);
+ return OPAL_WRONG_STATE;
+ }
+
+ *opal_elog_id = cpu_to_be64(log_data->log_id);
+ *opal_elog_size = cpu_to_be64(log_data->log_size);
+ fsp_elog_set_head_state(ELOG_STATE_HOST_INFO);
+ unlock(&elog_read_lock);
+ return OPAL_SUCCESS;
+}
+
+/* OPAL interface for PowerNV to read log from Sapphire. */
+static int64_t fsp_opal_elog_read(void *buffer, uint64_t opal_elog_size,
+ uint64_t opal_elog_id)
+{
+ int size = opal_elog_size;
+ struct fsp_log_entry *log_data;
+
+ /* Check if any OPAL log needs to be reported to the PowerNV */
+ if (opal_elog_read(buffer, opal_elog_size, opal_elog_id))
+ return OPAL_SUCCESS;
+
+ /*
+ * Read the top entry from the list; the head of the pending
+ * list is always the record that has been fetched from the FSP.
+ */
+ lock(&elog_read_lock);
+ if (elog_read_from_fsp_head_state != ELOG_STATE_HOST_INFO) {
+ unlock(&elog_read_lock);
+ return OPAL_WRONG_STATE;
+ }
+
+ log_data = list_top(&elog_read_pending, struct fsp_log_entry, link);
+ if (!log_data) {
+ /**
+ * @fwts-label ElogReadInconsistentState
+ * @fwts-advice Inconsistent state while reading error log
+ * from FSP. Bug in OPAL and FSP interaction.
+ */
+ prlog(PR_ERR, "%s: Inconsistent internal list state !\n",
+ __func__);
+ fsp_elog_set_head_state(ELOG_STATE_NONE);
+ unlock(&elog_read_lock);
+ return OPAL_WRONG_STATE;
+ }
+
+ /* Check log ID and then read log from buffer */
+ if (opal_elog_id != log_data->log_id) {
+ unlock(&elog_read_lock);
+ return OPAL_PARAMETER;
+ }
+
+ /* Do not copy more than actual log size */
+ if (opal_elog_size > log_data->log_size)
+ size = log_data->log_size;
+
+ memset(buffer, 0, opal_elog_size);
+ memcpy(buffer, elog_read_buffer, size);
+
+ /*
+ * Once the log has been read by Linux, move the record from the
+ * pending list to the processed list and reset the state so that
+ * the next record can be fetched.
+ */
+ list_del(&log_data->link);
+ list_add(&elog_read_processed, &log_data->link);
+ fsp_elog_set_head_state(ELOG_STATE_NONE);
+ unlock(&elog_read_lock);
+
+ /* Read error log from FSP */
+ fsp_elog_check_and_fetch_head();
+
+ return OPAL_SUCCESS;
+}
+
+/* Reset the head state before re-fetching; reject any fetch in flight. */
+static void elog_reject_head(void)
+{
+ if (elog_read_from_fsp_head_state == ELOG_STATE_FETCHING)
+ fsp_elog_set_head_state(ELOG_STATE_REJECTED);
+ else
+ fsp_elog_set_head_state(ELOG_STATE_NONE);
+}
+
+/* OPAL interface for PowerNV to send ack to FSP with log ID */
+static int64_t fsp_opal_elog_ack(uint64_t ack_id)
+{
+ int rc = 0;
+ struct fsp_log_entry *record, *next_record;
+
+ if (opal_elog_ack(ack_id))
+ return rc;
+
+ /* Send acknowledgement to FSP */
+ rc = fsp_send_elog_ack(ack_id);
+ if (rc != OPAL_SUCCESS) {
+ prerror("ELOG: failed to send acknowledgement: %d\n", rc);
+ return rc;
+ }
+
+ lock(&elog_read_lock);
+ list_for_each_safe(&elog_read_processed, record, next_record, link) {
+ if (record->log_id != ack_id)
+ continue;
+
+ list_del(&record->link);
+ list_add(&elog_read_free, &record->link);
+ unlock(&elog_read_lock);
+ return rc;
+ }
+
+ list_for_each_safe(&elog_read_pending, record, next_record, link) {
+ if (record->log_id != ack_id)
+ continue;
+ /*
+ * PowerNV has sent an ACK without reading the actual data.
+ * Because of this, elog_read_from_fsp_head_state may be stuck
+ * in the wrong state (ELOG_STATE_HOST_INFO) and unable to send
+ * the remaining elogs to PowerNV. Reset the ELOG state and
+ * resume sending the remaining elogs.
+ */
+ list_del(&record->link);
+ list_add(&elog_read_free, &record->link);
+ elog_reject_head();
+ unlock(&elog_read_lock);
+ fsp_elog_check_and_fetch_head();
+ return rc;
+ }
+
+ unlock(&elog_read_lock);
+ return OPAL_PARAMETER;
+}
+
+/*
+ * After Linux kexec's, it asks us to resend all logs that
+ * it has not yet acknowledged.
+ */
+static void fsp_opal_resend_pending_logs(void)
+{
+ struct fsp_log_entry *entry;
+
+ lock(&elog_read_lock);
+ elog_enabled = true;
+ unlock(&elog_read_lock);
+
+ /* Check if any Sapphire logs are pending. */
+ opal_resend_pending_logs();
+
+ lock(&elog_read_lock);
+ /*
+ * If the processed list is not empty, move every record from
+ * the processed list back to the head of the pending list.
+ */
+ while (!list_empty(&elog_read_processed)) {
+ entry = list_pop(&elog_read_processed,
+ struct fsp_log_entry, link);
+ list_add(&elog_read_pending, &entry->link);
+ }
+
+ unlock(&elog_read_lock);
+
+ /* Read error log from FSP */
+ elog_reject_head();
+ fsp_elog_check_and_fetch_head();
+}
+
+/* Disable ELOG event flag until PowerNV is ready to receive event */
+static bool opal_kexec_elog_notify(void *data __unused)
+{
+ lock(&elog_read_lock);
+ elog_enabled = false;
+ opal_update_pending_evt(OPAL_EVENT_ERROR_LOG_AVAIL, 0);
+ unlock(&elog_read_lock);
+
+ return true;
+}
+
+/* FSP elog notify function */
+static bool fsp_elog_msg(uint32_t cmd_sub_mod, struct fsp_msg *msg)
+{
+ int rc = 0;
+ struct fsp_log_entry *record;
+ uint32_t log_id;
+ uint32_t log_size;
+
+ if (cmd_sub_mod != FSP_CMD_ERRLOG_NOTIFICATION)
+ return false;
+
+ log_id = fsp_msg_get_data_word(msg, 0);
+ log_size = fsp_msg_get_data_word(msg, 1);
+
+ prlog(PR_TRACE, "ELOG: Notified of log 0x%08x (size: %d)\n",
+ log_id, log_size);
+
+ /* Make sure we don't cross read buffer size */
+ if (log_size > ELOG_READ_BUFFER_SIZE) {
+ log_size = ELOG_READ_BUFFER_SIZE;
+ printf("ELOG: Truncated log (0x%08x) to 0x%x\n",
+ log_id, log_size);
+ }
+
+ /* Take a lock until we take out the node from elog_read_free */
+ lock(&elog_read_lock);
+ if (!list_empty(&elog_read_free)) {
+ /* Create a new entry in the pending list. */
+ record = list_pop(&elog_read_free, struct fsp_log_entry, link);
+ record->log_id = log_id;
+ record->log_size = log_size;
+ list_add_tail(&elog_read_pending, &record->link);
+ unlock(&elog_read_lock);
+
+ /* Send response back to FSP for a new elog notify message. */
+ rc = fsp_queue_msg(fsp_mkmsg(FSP_RSP_ERRLOG_NOTIFICATION,
+ 1, log_id), fsp_freemsg);
+ if (rc)
+ prerror("ELOG: Failed to queue errlog notification"
+ " response: %d\n", rc);
+
+ /* Read error log from FSP */
+ fsp_elog_check_and_fetch_head();
+
+ } else {
+ prlog(PR_TRACE, "ELOG: Log entry 0x%08x discarded\n", log_id);
+
+ /* Free list is empty: drop the lock before queueing responses. */
+ unlock(&elog_read_lock);
+
+ rc = fsp_queue_msg(fsp_mkmsg(FSP_RSP_ERRLOG_NOTIFICATION,
+ 1, log_id), fsp_freemsg);
+ if (rc)
+ prerror("ELOG: Failed to queue errlog notification"
+ " response: %d\n", rc);
+
+ /*
+ * The list already holds the maximum number of records, so we
+ * send a "discarded by PHYP (condition full)" ack to the FSP.
+ *
+ * At some point in the future we'll get notified again; it is
+ * largely up to the FSP when it tells us about this log again.
+ */
+ rc = fsp_queue_msg(fsp_mkmsg(FSP_CMD_ERRLOG_PHYP_ACK | 0x02,
+ 1, log_id), fsp_freemsg);
+ if (rc)
+ prerror("ELOG: Failed to queue errlog ack"
+ " response: %d\n", rc);
+ }
+
+ return true;
+}
+
+static struct fsp_client fsp_get_elog_notify = {
+ .message = fsp_elog_msg,
+};
+
+/* Pre-allocate memory for reading error log from FSP */
+static int init_elog_read_free_list(uint32_t num_entries)
+{
+ struct fsp_log_entry *entry;
+ int i;
+
+ entry = zalloc(sizeof(struct fsp_log_entry) * num_entries);
+ if (!entry)
+ goto out_err;
+
+ for (i = 0; i < num_entries; ++i) {
+ list_add_tail(&elog_read_free, &entry->link);
+ entry++;
+ }
+
+ return 0;
+
+out_err:
+ return -ENOMEM;
+}
+
+/* FSP elog read init function */
+void fsp_elog_read_init(void)
+{
+ int val = 0;
+
+ if (!fsp_present())
+ return;
+
+ elog_read_buffer = memalign(TCE_PSIZE, ELOG_READ_BUFFER_SIZE);
+ if (!elog_read_buffer) {
+ prerror("FSP: could not allocate FSP ELOG_READ_BUFFER!\n");
+ return;
+ }
+
+ /* Map TCEs */
+ fsp_tce_map(PSI_DMA_ERRLOG_READ_BUF, elog_read_buffer,
+ PSI_DMA_ERRLOG_READ_BUF_SZ);
+
+ /* Pre-allocate memory for 128 records */
+ val = init_elog_read_free_list(ELOG_READ_MAX_RECORD);
+ if (val != 0)
+ return;
+
+ /* Register error log class D2 */
+ fsp_register_client(&fsp_get_elog_notify, FSP_MCLASS_ERR_LOG);
+
+ /* Register for sync on PowerNV reboot call */
+ opal_add_host_sync_notifier(opal_kexec_elog_notify, NULL);
+
+ /* Register OPAL interface */
+ opal_register(OPAL_ELOG_READ, fsp_opal_elog_read, 3);
+ opal_register(OPAL_ELOG_ACK, fsp_opal_elog_ack, 1);
+ opal_register(OPAL_ELOG_RESEND, fsp_opal_resend_pending_logs, 0);
+ opal_register(OPAL_ELOG_SIZE, fsp_opal_elog_info, 3);
+}
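
For illustration only: a minimal, self-contained sketch of the record lifecycle that the design comment at the top of fsp-elog-read.c describes (free -> pending on an FSP notification, pending -> processed when the host reads, processed -> free when the host acks). Every name below is hypothetical and a flat array stands in for skiboot's linked lists; only the state transitions mirror the driver above.

#include <stdio.h>

#define N_RECORDS 4

enum rec_state { REC_FREE, REC_PENDING, REC_PROCESSED };

struct rec {
	unsigned int log_id;
	enum rec_state state;
};

static struct rec records[N_RECORDS];	/* stand-in for the three lists */

/* FSP notifies us of a new log: claim a free record and mark it pending. */
static int notify(unsigned int log_id)
{
	int i;

	for (i = 0; i < N_RECORDS; i++) {
		if (records[i].state == REC_FREE) {
			records[i].log_id = log_id;
			records[i].state = REC_PENDING;
			return 0;
		}
	}
	return -1;	/* full: the real driver acks "condition full" */
}

/* Host reads the log: pending -> processed. */
static void host_read(unsigned int log_id)
{
	int i;

	for (i = 0; i < N_RECORDS; i++)
		if (records[i].state == REC_PENDING &&
		    records[i].log_id == log_id)
			records[i].state = REC_PROCESSED;
}

/* Host acks the log: processed -> free (the FSP is acked as well). */
static void host_ack(unsigned int log_id)
{
	int i;

	for (i = 0; i < N_RECORDS; i++)
		if (records[i].state == REC_PROCESSED &&
		    records[i].log_id == log_id)
			records[i].state = REC_FREE;
}

int main(void)
{
	notify(0x1001);
	host_read(0x1001);
	host_ack(0x1001);
	printf("record 0 is %s\n",
	       records[0].state == REC_FREE ? "free again" : "still in use");
	return 0;
}

Built with any hosted C compiler, one notify/read/ack cycle returns the record to the free pool, which is exactly why the driver pre-allocates a fixed number of records and discards notifications once they are all in use.
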
diff --git a/roms/skiboot/hw/fsp/fsp-elog-write.c b/roms/skiboot/hw/fsp/fsp-elog-write.c
new file mode 100644
index 000000000..7b26a1867
--- /dev/null
+++ b/roms/skiboot/hw/fsp/fsp-elog-write.c
@@ -0,0 +1,441 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * This code enables generating error logs in Sapphire and pushing them to
+ * the FSP. Critical events from Sapphire that need to be reported are
+ * pushed to the FSP after converting the error log to Platform Error Log
+ * (PEL) format. This is the WRITE path to the FSP.
+ * (A minimal routing sketch follows this file's diff.)
+ *
+ * Copyright 2013-2016 IBM Corp.
+ */
+
+#include <cpu.h>
+#include <errno.h>
+#include <fsp.h>
+#include <fsp-elog.h>
+#include <lock.h>
+#include <opal-api.h>
+#include <pel.h>
+#include <pool.h>
+#include <skiboot.h>
+#include <timebase.h>
+
+static LIST_HEAD(elog_write_to_fsp_pending);
+static LIST_HEAD(elog_write_to_host_pending);
+static LIST_HEAD(elog_write_to_host_processed);
+
+static struct lock elog_write_lock = LOCK_UNLOCKED;
+static struct lock elog_panic_write_lock = LOCK_UNLOCKED;
+static struct lock elog_write_to_host_lock = LOCK_UNLOCKED;
+
+#define ELOG_WRITE_TO_FSP_BUFFER_SIZE 0x00004000
+/* Log buffer to copy OPAL log for write to FSP. */
+static void *elog_write_to_fsp_buffer;
+
+#define ELOG_PANIC_WRITE_BUFFER_SIZE 0x00004000
+static void *elog_panic_write_buffer;
+
+#define ELOG_WRITE_TO_HOST_BUFFER_SIZE 0x00004000
+static void *elog_write_to_host_buffer;
+
+static uint32_t elog_write_retries;
+
+/* Manipulate this only with elog_write_lock held */
+static uint32_t elog_plid_fsp_commit = -1;
+static enum elog_head_state elog_write_to_host_head_state = ELOG_STATE_NONE;
+
+/* Need forward declaration because of circular dependency */
+static int opal_send_elog_to_fsp(void);
+
+static void remove_elog_head_entry(void)
+{
+ struct errorlog *head, *entry;
+
+ lock(&elog_write_lock);
+ if (!list_empty(&elog_write_to_fsp_pending)) {
+ head = list_top(&elog_write_to_fsp_pending,
+ struct errorlog, link);
+ if (head->plid == elog_plid_fsp_commit) {
+ entry = list_pop(&elog_write_to_fsp_pending,
+ struct errorlog, link);
+ opal_elog_complete(entry,
+ elog_write_retries < MAX_RETRIES);
+ /* Reset the counter */
+ elog_plid_fsp_commit = -1;
+ }
+ }
+
+ elog_write_retries = 0;
+ unlock(&elog_write_lock);
+}
+
+static void opal_fsp_write_complete(struct fsp_msg *read_msg)
+{
+ uint8_t val;
+
+ val = (read_msg->resp->word1 >> 8) & 0xff;
+ fsp_freemsg(read_msg);
+
+ switch (val) {
+ case FSP_STATUS_SUCCESS:
+ remove_elog_head_entry();
+ break;
+ default:
+ if (elog_write_retries++ >= MAX_RETRIES) {
+ remove_elog_head_entry();
+ prerror("ELOG: Error in writing to FSP (0x%x)!\n", val);
+ }
+
+ break;
+ }
+
+ if (opal_send_elog_to_fsp() != OPAL_SUCCESS)
+ prerror("ELOG: Error sending elog to FSP !\n");
+}
+
+/* Write PEL format hex dump of the log to FSP */
+static int64_t fsp_opal_elog_write(size_t opal_elog_size)
+{
+ struct fsp_msg *elog_msg;
+
+ elog_msg = fsp_mkmsg(FSP_CMD_CREATE_ERRLOG, 3, opal_elog_size,
+ 0, PSI_DMA_ERRLOG_WRITE_BUF);
+ if (!elog_msg) {
+ prerror("ELOG: Failed to create message for WRITE to FSP\n");
+ return OPAL_INTERNAL_ERROR;
+ }
+
+ if (fsp_queue_msg(elog_msg, opal_fsp_write_complete)) {
+ fsp_freemsg(elog_msg);
+ elog_msg = NULL;
+ prerror("FSP: Error queueing elog update\n");
+ return OPAL_INTERNAL_ERROR;
+ }
+
+ return OPAL_SUCCESS;
+}
+
+/* This should be called with elog_write_to_host_lock held. */
+static inline void fsp_elog_write_set_head_state(enum elog_head_state state)
+{
+ elog_set_head_state(true, state);
+ elog_write_to_host_head_state = state;
+}
+
+bool opal_elog_info(__be64 *opal_elog_id, __be64 *opal_elog_size)
+{
+ struct errorlog *head;
+ bool rc = false;
+
+ lock(&elog_write_to_host_lock);
+ if (elog_write_to_host_head_state == ELOG_STATE_FETCHED_DATA) {
+ head = list_top(&elog_write_to_host_pending,
+ struct errorlog, link);
+ if (!head) {
+ /**
+ * @fwts-label ElogListInconsistent
+ * @fwts-advice Bug in interaction between FSP and
+ * OPAL. The state maintained by OPAL didn't match
+ * what the FSP sent.
+ */
+ prlog(PR_ERR,
+ "%s: Inconsistent internal list state !\n",
+ __func__);
+ fsp_elog_write_set_head_state(ELOG_STATE_NONE);
+ } else {
+ *opal_elog_id = cpu_to_be64(head->plid);
+ *opal_elog_size = cpu_to_be64(head->log_size);
+ fsp_elog_write_set_head_state(ELOG_STATE_HOST_INFO);
+ rc = true;
+ }
+ }
+
+ unlock(&elog_write_to_host_lock);
+ return rc;
+}
+
+static void opal_commit_elog_in_host(void)
+{
+ struct errorlog *buf;
+
+ lock(&elog_write_to_host_lock);
+ if (!list_empty(&elog_write_to_host_pending) &&
+ (elog_write_to_host_head_state == ELOG_STATE_NONE)) {
+ buf = list_top(&elog_write_to_host_pending,
+ struct errorlog, link);
+ buf->log_size = create_pel_log(buf,
+ (char *)elog_write_to_host_buffer,
+ ELOG_WRITE_TO_HOST_BUFFER_SIZE);
+ fsp_elog_write_set_head_state(ELOG_STATE_FETCHED_DATA);
+ }
+
+ unlock(&elog_write_to_host_lock);
+}
+
+bool opal_elog_read(void *buffer, uint64_t opal_elog_size,
+ uint64_t opal_elog_id)
+{
+ struct errorlog *log_data;
+ bool rc = false;
+
+ lock(&elog_write_to_host_lock);
+ if (elog_write_to_host_head_state == ELOG_STATE_HOST_INFO) {
+ log_data = list_top(&elog_write_to_host_pending,
+ struct errorlog, link);
+ if (!log_data) {
+ fsp_elog_write_set_head_state(ELOG_STATE_NONE);
+ unlock(&elog_write_to_host_lock);
+ return rc;
+ }
+
+ if ((opal_elog_id != log_data->plid) &&
+ (opal_elog_size != log_data->log_size)) {
+ unlock(&elog_write_to_host_lock);
+ return rc;
+ }
+
+ memcpy(buffer, elog_write_to_host_buffer, opal_elog_size);
+ list_del(&log_data->link);
+ list_add(&elog_write_to_host_processed, &log_data->link);
+ fsp_elog_write_set_head_state(ELOG_STATE_NONE);
+ rc = true;
+ }
+
+ unlock(&elog_write_to_host_lock);
+ opal_commit_elog_in_host();
+ return rc;
+}
+
+bool opal_elog_ack(uint64_t ack_id)
+{
+ bool rc = false;
+ struct errorlog *log_data;
+ struct errorlog *record, *next_record;
+
+ lock(&elog_write_to_host_lock);
+ if (!list_empty(&elog_write_to_host_processed)) {
+ list_for_each_safe(&elog_write_to_host_processed, record,
+ next_record, link) {
+ if (record->plid != ack_id)
+ continue;
+
+ list_del(&record->link);
+ opal_elog_complete(record, true);
+ rc = true;
+ }
+ }
+
+ if ((!rc) && (!list_empty(&elog_write_to_host_pending))) {
+ log_data = list_top(&elog_write_to_host_pending,
+ struct errorlog, link);
+ if (ack_id == log_data->plid)
+ fsp_elog_write_set_head_state(ELOG_STATE_NONE);
+
+ list_for_each_safe(&elog_write_to_host_pending, record,
+ next_record, link) {
+ if (record->plid != ack_id)
+ continue;
+
+ list_del(&record->link);
+ opal_elog_complete(record, true);
+ rc = true;
+ unlock(&elog_write_to_host_lock);
+ opal_commit_elog_in_host();
+ return rc;
+ }
+ }
+
+ unlock(&elog_write_to_host_lock);
+ return rc;
+}
+
+void opal_resend_pending_logs(void)
+{
+ struct errorlog *record;
+
+ lock(&elog_write_to_host_lock);
+ while (!list_empty(&elog_write_to_host_processed)) {
+ record = list_pop(&elog_write_to_host_processed,
+ struct errorlog, link);
+ list_add_tail(&elog_write_to_host_pending, &record->link);
+ }
+
+ fsp_elog_write_set_head_state(ELOG_STATE_NONE);
+ unlock(&elog_write_to_host_lock);
+ opal_commit_elog_in_host();
+}
+
+static inline u64 get_elog_timeout(void)
+{
+ return (mftb() + secs_to_tb(ERRORLOG_TIMEOUT_INTERVAL));
+}
+
+static int opal_send_elog_to_fsp(void)
+{
+ struct errorlog *head;
+ int rc = OPAL_SUCCESS;
+
+ /*
+ * Convert entry to PEL and push it down to FSP.
+ * Then we wait for the ack from FSP.
+ */
+ lock(&elog_write_lock);
+ if (!list_empty(&elog_write_to_fsp_pending)) {
+ head = list_top(&elog_write_to_fsp_pending,
+ struct errorlog, link);
+ /* Error needs to be committed; update the timeout value */
+ head->elog_timeout = get_elog_timeout();
+
+ elog_plid_fsp_commit = head->plid;
+ head->log_size = create_pel_log(head,
+ (char *)elog_write_to_fsp_buffer,
+ ELOG_WRITE_TO_FSP_BUFFER_SIZE);
+ rc = fsp_opal_elog_write(head->log_size);
+ unlock(&elog_write_lock);
+ return rc;
+ }
+
+ unlock(&elog_write_lock);
+ return rc;
+}
+
+static int opal_push_logs_sync_to_fsp(struct errorlog *buf)
+{
+ struct fsp_msg *elog_msg;
+ int opal_elog_size = 0;
+ int rc = OPAL_SUCCESS;
+
+ lock(&elog_panic_write_lock);
+
+ /* Error needs to be committed; update the timeout value */
+ buf->elog_timeout = get_elog_timeout();
+
+ opal_elog_size = create_pel_log(buf,
+ (char *)elog_panic_write_buffer,
+ ELOG_PANIC_WRITE_BUFFER_SIZE);
+
+ elog_msg = fsp_mkmsg(FSP_CMD_CREATE_ERRLOG, 3, opal_elog_size,
+ 0, PSI_DMA_ELOG_PANIC_WRITE_BUF);
+ if (!elog_msg) {
+ prerror("ELOG: PLID: 0x%x Failed to create message for WRITE "
+ "to FSP\n", buf->plid);
+ unlock(&elog_panic_write_lock);
+ opal_elog_complete(buf, false);
+ return OPAL_INTERNAL_ERROR;
+ }
+
+ if (fsp_sync_msg(elog_msg, false)) {
+ fsp_freemsg(elog_msg);
+ rc = OPAL_INTERNAL_ERROR;
+ } else {
+ rc = (elog_msg->resp->word1 >> 8) & 0xff;
+ fsp_freemsg(elog_msg);
+ }
+
+ unlock(&elog_panic_write_lock);
+ if (rc != OPAL_SUCCESS)
+ opal_elog_complete(buf, false);
+ else
+ opal_elog_complete(buf, true);
+
+ return rc;
+}
+
+int elog_fsp_commit(struct errorlog *buf)
+{
+ int rc = OPAL_SUCCESS;
+
+ if (buf->event_severity == OPAL_ERROR_PANIC) {
+ rc = opal_push_logs_sync_to_fsp(buf);
+ return rc;
+ }
+
+ lock(&elog_write_lock);
+ if (list_empty(&elog_write_to_fsp_pending)) {
+ list_add_tail(&elog_write_to_fsp_pending, &buf->link);
+ unlock(&elog_write_lock);
+ rc = opal_send_elog_to_fsp();
+ return rc;
+ }
+
+ list_add_tail(&elog_write_to_fsp_pending, &buf->link);
+ unlock(&elog_write_lock);
+ return rc;
+}
+
+static void elog_append_write_to_host(struct errorlog *buf)
+{
+ lock(&elog_write_to_host_lock);
+ if (list_empty(&elog_write_to_host_pending)) {
+ list_add(&elog_write_to_host_pending, &buf->link);
+ unlock(&elog_write_to_host_lock);
+ opal_commit_elog_in_host();
+ } else {
+ list_add_tail(&elog_write_to_host_pending, &buf->link);
+ unlock(&elog_write_to_host_lock);
+ }
+}
+
+static void elog_timeout_poll(void *data __unused)
+{
+ uint64_t now;
+ struct errorlog *head, *entry;
+
+ lock(&elog_write_lock);
+ if (list_empty(&elog_write_to_fsp_pending)) {
+ unlock(&elog_write_lock);
+ return;
+ }
+
+ head = list_top(&elog_write_to_fsp_pending, struct errorlog, link);
+ now = mftb();
+ if ((tb_compare(now, head->elog_timeout) == TB_AAFTERB) ||
+ (tb_compare(now, head->elog_timeout) == TB_AEQUALB)) {
+ entry = list_pop(&elog_write_to_fsp_pending,
+ struct errorlog, link);
+ unlock(&elog_write_lock);
+ elog_append_write_to_host(entry);
+ } else {
+ unlock(&elog_write_lock);
+ }
+}
+
+/* FSP elog init function */
+void fsp_elog_write_init(void)
+{
+ if (!fsp_present())
+ return;
+
+ elog_panic_write_buffer = memalign(TCE_PSIZE,
+ ELOG_PANIC_WRITE_BUFFER_SIZE);
+ if (!elog_panic_write_buffer) {
+ prerror("FSP: could not allocate ELOG_PANIC_WRITE_BUFFER!\n");
+ return;
+ }
+
+ elog_write_to_fsp_buffer = memalign(TCE_PSIZE,
+ ELOG_WRITE_TO_FSP_BUFFER_SIZE);
+ if (!elog_write_to_fsp_buffer) {
+ prerror("FSP: could not allocate ELOG_WRITE_BUFFER!\n");
+ return;
+ }
+
+ elog_write_to_host_buffer = memalign(TCE_PSIZE,
+ ELOG_WRITE_TO_HOST_BUFFER_SIZE);
+ if (!elog_write_to_host_buffer) {
+ prerror("FSP: could not allocate ELOG_WRITE_TO_HOST_BUFFER!\n");
+ return;
+ }
+
+ /* Map TCEs */
+ fsp_tce_map(PSI_DMA_ELOG_PANIC_WRITE_BUF, elog_panic_write_buffer,
+ PSI_DMA_ELOG_PANIC_WRITE_BUF_SZ);
+
+ fsp_tce_map(PSI_DMA_ERRLOG_WRITE_BUF, elog_write_to_fsp_buffer,
+ PSI_DMA_ERRLOG_WRITE_BUF_SZ);
+
+ elog_init();
+
+ /* Add a poller */
+ opal_add_poller(elog_timeout_poll, NULL);
+}
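
For illustration only: a minimal sketch of the commit routing described in the header comment of fsp-elog-write.c, where panic-severity logs are pushed to the FSP synchronously and everything else is queued for the asynchronous write path. The types and function names below are invented stand-ins, not skiboot APIs.

#include <stdio.h>

enum severity { SEV_INFO, SEV_ERROR, SEV_PANIC };

struct log_entry {
	enum severity sev;
	unsigned int plid;
};

/* Stand-in for opal_push_logs_sync_to_fsp(): blocks until the FSP answers. */
static void push_sync(const struct log_entry *l)
{
	printf("PLID 0x%x pushed synchronously\n", l->plid);
}

/* Stand-in for queueing on elog_write_to_fsp_pending and kicking the sender. */
static void queue_async(const struct log_entry *l)
{
	printf("PLID 0x%x queued; the poller and timeout drive it from here\n",
	       l->plid);
}

/* Mirrors the severity decision made in elog_fsp_commit(). */
static void commit(const struct log_entry *l)
{
	if (l->sev == SEV_PANIC)
		push_sync(l);
	else
		queue_async(l);
}

int main(void)
{
	struct log_entry panic_log = { SEV_PANIC, 0x10 };
	struct log_entry info_log = { SEV_INFO, 0x11 };

	commit(&panic_log);
	commit(&info_log);
	return 0;
}
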
diff --git a/roms/skiboot/hw/fsp/fsp-epow.c b/roms/skiboot/hw/fsp/fsp-epow.c
new file mode 100644
index 000000000..8869e91e6
--- /dev/null
+++ b/roms/skiboot/hw/fsp/fsp-epow.c
@@ -0,0 +1,192 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * FSP Environmental and Power Warnings (EPOW) support
+ *
+ * Copyright 2013-2016 IBM Corp.
+ */
+
+#define pr_fmt(fmt) "FSP-EPOW: " fmt
+
+#include <fsp.h>
+#include <device.h>
+#include <lock.h>
+#include <opal-msg.h>
+#include <opal-api.h>
+
+#include "fsp-epow.h"
+
+/*
+ * System EPOW status
+ *
+ * This array is exported to the host. Each element in
+ * [0...(OPAL_SYSEPOW_MAX-1)] contains the bitwise EPOW event info for a
+ * particular EPOW sub class. For example,
+ * epow_status[OPAL_SYSEPOW_POWER] reflects power-related EPOW events.
+ */
+static int16_t epow_status[OPAL_SYSEPOW_MAX];
+
+/* EPOW lock */
+static struct lock epow_lock = LOCK_UNLOCKED;
+
+/* Process the EPOW information sent by the FSP */
+static void epow_process_ex1_event(u8 *epow)
+{
+ memset(epow_status, 0, sizeof(epow_status));
+
+ if (epow[4] == EPOW_TMP_INT) {
+ prlog(PR_INFO, "Internal temp above normal\n");
+ epow_status[OPAL_SYSEPOW_TEMP] = OPAL_SYSTEMP_INT;
+
+ } else if (epow[4] == EPOW_TMP_AMB) {
+ prlog(PR_INFO, "Ambient temp above normal\n");
+ epow_status[OPAL_SYSEPOW_TEMP] = OPAL_SYSTEMP_AMB;
+
+ } else if (epow[4] == EPOW_ON_UPS) {
+ prlog(PR_INFO, "System running on UPS power\n");
+ epow_status[OPAL_SYSEPOW_POWER] = OPAL_SYSPOWER_UPS;
+
+ }
+}
+
+/* Process EPOW event */
+static void fsp_process_epow(struct fsp_msg *msg, int epow_type)
+{
+ int rc;
+ u8 epow[8];
+ bool epow_changed = false;
+ int16_t old_epow_status[OPAL_SYSEPOW_MAX];
+
+ /* Basic EPOW signature */
+ if (msg->data.bytes[0] != 0xF2) {
+ /**
+ * @fwts-label EPOWSignatureMismatch
+ * @fwts-advice Bug in skiboot/FSP code for EPOW event handling
+ */
+ prlog(PR_ERR, "Signature mismatch\n");
+ return;
+ }
+
+ lock(&epow_lock);
+
+ /* Copy over and clear system EPOW status */
+ memcpy(old_epow_status, epow_status, sizeof(old_epow_status));
+
+ switch(epow_type) {
+ case EPOW_NORMAL:
+ case EPOW_EX2:
+ break;
+ case EPOW_EX1:
+ epow[0] = msg->data.bytes[0];
+ epow[1] = msg->data.bytes[1];
+ epow[2] = msg->data.bytes[2];
+ epow[3] = msg->data.bytes[3];
+ epow[4] = msg->data.bytes[4];
+
+ epow_process_ex1_event(epow);
+ break;
+ default:
+ prlog(PR_WARNING, "Unknown EPOW event notification\n");
+ break;
+ }
+
+ if (memcmp(epow_status, old_epow_status, sizeof(epow_status)))
+ epow_changed = true;
+
+ unlock(&epow_lock);
+
+ /* Send OPAL message notification */
+ if (epow_changed) {
+ rc = opal_queue_msg(OPAL_MSG_EPOW, NULL, NULL);
+ if (rc) {
+ /**
+ * @fwts-label EPOWMessageQueueFailed
+ * @fwts-advice Queueing a message from OPAL to FSP
+ * failed. This is likely due to either an OPAL bug
+ * or the FSP going away.
+ */
+ prlog(PR_ERR, "OPAL EPOW message queuing failed\n");
+ return;
+ }
+ prlog(PR_INFO, "Notified host about EPOW event\n");
+ }
+}
+
+/*
+ * EPOW OPAL interface
+ *
+ * The host requests the system EPOW status through this
+ * OPAL call, passing a buffer and its length.
+ * Sapphire fills the buffer with the current system EPOW status
+ * and then writes back into the length variable the number of
+ * EPOW sub classes it actually copied into the buffer.
+ * (A caller-side sketch follows this file's diff.)
+ */
+static int64_t fsp_opal_get_epow_status(__be16 *out_epow, __be16 *length)
+{
+ int i;
+ int n_epow_class;
+ int l = be16_to_cpu(*length);
+
+ /*
+ * The host and Sapphire versions may not match each other, and
+ * hence may disagree on the expected system EPOW status details.
+ * Newer hosts might expect status for more EPOW sub classes than
+ * Sapphire knows about, and older hosts might expect status for
+ * only a subset of the sub classes Sapphire knows about. Both
+ * situations are handled here.
+ *
+ * (A) Host version >= Sapphire version
+ *
+ * Sapphire sends out EPOW status for the sub classes it knows
+ * about and updates the length variable for the host.
+ *
+ * (B) Host version < Sapphire version
+ *
+ * Sapphire sends out EPOW status only for the sub classes the
+ * host knows about and can interpret correctly.
+ */
+ if (l >= OPAL_SYSEPOW_MAX) {
+ n_epow_class = OPAL_SYSEPOW_MAX;
+ *length = cpu_to_be16(OPAL_SYSEPOW_MAX);
+ } else {
+ n_epow_class = l;
+ }
+
+ /* Transfer EPOW Status */
+ for (i = 0; i < n_epow_class; i++)
+ out_epow[i] = cpu_to_be16(epow_status[i]);
+
+ return OPAL_SUCCESS;
+}
+
+/* Handle EPOW sub-commands from FSP */
+static bool fsp_epow_message(u32 cmd_sub_mod, struct fsp_msg *msg)
+{
+ switch(cmd_sub_mod) {
+ case FSP_CMD_PANELSTATUS:
+ fsp_process_epow(msg, EPOW_NORMAL);
+ return true;
+ case FSP_CMD_PANELSTATUS_EX1:
+ fsp_process_epow(msg, EPOW_EX1);
+ return true;
+ case FSP_CMD_PANELSTATUS_EX2:
+ fsp_process_epow(msg, EPOW_EX2);
+ return true;
+ }
+ return false;
+}
+
+static struct fsp_client fsp_epow_client = {
+ .message = fsp_epow_message,
+};
+
+void fsp_epow_init(void)
+{
+ struct dt_node *np;
+
+ fsp_register_client(&fsp_epow_client, FSP_MCLASS_SERVICE);
+ opal_register(OPAL_GET_EPOW_STATUS, fsp_opal_get_epow_status, 2);
+ np = dt_new(opal_node, "epow");
+ dt_add_property_strings(np, "compatible", "ibm,opal-v3-epow");
+ dt_add_property_strings(np, "epow-classes", "power", "temperature", "cooling");
+ prlog(PR_INFO, "FSP EPOW support initialized\n");
+}
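
For illustration only: a caller-side sketch of the length negotiation that the EPOW OPAL interface comment above describes, with the firmware side reduced to a plain function. FW_SYSEPOW_MAX and the canned status values are hypothetical; only the clamp-and-write-back behaviour mirrors fsp_opal_get_epow_status().

#include <stdint.h>
#include <stdio.h>

#define FW_SYSEPOW_MAX 3	/* classes this firmware build knows about */

static const int16_t fw_status[FW_SYSEPOW_MAX] = { 0x0001, 0x0000, 0x0000 };

/* Mirrors the clamp-and-write-back contract of fsp_opal_get_epow_status(). */
static void get_epow_status(int16_t *out, uint16_t *length)
{
	uint16_t n = *length;
	int i;

	if (n >= FW_SYSEPOW_MAX) {
		n = FW_SYSEPOW_MAX;	/* newer host: clamp ... */
		*length = n;		/* ... and tell it how much it got */
	}
	for (i = 0; i < n; i++)
		out[i] = fw_status[i];
}

int main(void)
{
	int16_t buf[8] = { 0 };
	uint16_t len = 8;	/* host claims to understand 8 classes */

	get_epow_status(buf, &len);
	printf("firmware filled %d classes, power class = 0x%04x\n",
	       (int)len, (unsigned int)(uint16_t)buf[0]);
	return 0;
}
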
diff --git a/roms/skiboot/hw/fsp/fsp-epow.h b/roms/skiboot/hw/fsp/fsp-epow.h
new file mode 100644
index 000000000..bc1df258e
--- /dev/null
+++ b/roms/skiboot/hw/fsp/fsp-epow.h
@@ -0,0 +1,21 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Handle FSP EPOW event notifications
+ *
+ * Copyright 2013-2015 IBM Corp.
+ */
+
+#ifndef __FSP_EPOW_H
+#define __FSP_EPOW_H
+
+/* FSP based EPOW event notifications */
+#define EPOW_NORMAL 0x00 /* Panel status normal */
+#define EPOW_EX1 0x01 /* Panel status extended 1 */
+#define EPOW_EX2 0x02 /* Panel status extended 2 */
+
+/* EPOW reason code notifications */
+#define EPOW_ON_UPS 1 /* System on UPS */
+#define EPOW_TMP_AMB 2 /* Over ambient temperature */
+#define EPOW_TMP_INT 3 /* Over internal temperature */
+
+#endif
diff --git a/roms/skiboot/hw/fsp/fsp-ipmi.c b/roms/skiboot/hw/fsp/fsp-ipmi.c
new file mode 100644
index 000000000..e368c2828
--- /dev/null
+++ b/roms/skiboot/hw/fsp/fsp-ipmi.c
@@ -0,0 +1,400 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Conduit for IPMI messages to/from FSP
+ *
+ * Copyright 2014-2019 IBM Corp.
+ */
+
+#include <errorlog.h>
+#include <fsp.h>
+#include <ipmi.h>
+#include <lock.h>
+#include <opal-api.h>
+
+/*
+ * Under the hood, the FSP IPMI component implements the KCS (Keyboard
+ * Controller Style) interface.
+ *
+ * KCS interface request message format
+ *
+ * BYTE 1 BYTE 2 BYTE 3:N
+ * -------------------------------------
+ * | NetFn/LUN | Cmd | Data |
+ * -------------------------------------
+ *
+ * KCS interface response message format
+ *
+ * BYTE 1 BYTE 2 BYTE 3 BYTE 4:N
+ * ------------------------------------------------
+ * | NetFn/LUN | Cmd | CompCode | Data |
+ * ------------------------------------------------
+ *
+ */
+
+#define FSP_IPMI_REQ_MIN_LEN 2 /* NetFn + Cmd */
+#define FSP_IPMI_RESP_MIN_LEN 3 /* NetFn + Cmd + Completion code */
+
+DEFINE_LOG_ENTRY(OPAL_RC_IPMI_REQ, OPAL_PLATFORM_ERR_EVT, OPAL_IPMI,
+ OPAL_PLATFORM_FIRMWARE, OPAL_PREDICTIVE_ERR_GENERAL,
+ OPAL_NA);
+DEFINE_LOG_ENTRY(OPAL_RC_IPMI_RESP, OPAL_PLATFORM_ERR_EVT, OPAL_IPMI,
+ OPAL_PLATFORM_FIRMWARE, OPAL_PREDICTIVE_ERR_GENERAL,
+ OPAL_NA);
+
+DEFINE_LOG_ENTRY(OPAL_RC_IPMI_DMA_ERROR_RESP, OPAL_PLATFORM_ERR_EVT, OPAL_IPMI,
+ OPAL_PLATFORM_FIRMWARE, OPAL_INFO,
+ OPAL_NA);
+
+struct fsp_ipmi_msg {
+ struct list_node link;
+ struct ipmi_msg ipmi_msg;
+};
+
+static struct fsp_ipmi {
+ struct list_head msg_queue;
+ void *ipmi_req_buf;
+ void *ipmi_resp_buf;
+ /* There can only be one outstanding request; its reference is stored
+ * in 'cur_msg'. The 'lock' protects against concurrent updates of it
+ * from the request and response paths, and also protects the list
+ * manipulation.
+ */
+ struct fsp_ipmi_msg *cur_msg;
+ struct lock lock;
+} fsp_ipmi;
+
+static int fsp_ipmi_send_request(void);
+
+static void fsp_ipmi_cmd_done(uint8_t cmd, uint8_t netfn, uint8_t cc)
+{
+ struct fsp_ipmi_msg *fsp_ipmi_msg = fsp_ipmi.cur_msg;
+
+ lock(&fsp_ipmi.lock);
+ if (fsp_ipmi.cur_msg == NULL) {
+ unlock(&fsp_ipmi.lock);
+ return;
+ }
+ list_del(&fsp_ipmi_msg->link);
+ fsp_ipmi.cur_msg = NULL;
+ unlock(&fsp_ipmi.lock);
+
+ ipmi_cmd_done(cmd, netfn, cc, &fsp_ipmi_msg->ipmi_msg);
+}
+
+
+static void fsp_ipmi_req_complete(struct fsp_msg *msg)
+{
+ uint8_t status = (msg->resp->word1 >> 8) & 0xff;
+ uint32_t length = fsp_msg_get_data_word(msg->resp, 0);
+ struct fsp_ipmi_msg *fsp_ipmi_msg = msg->user_data;
+ struct ipmi_msg *ipmi_msg;
+
+ fsp_freemsg(msg);
+
+ if (status != FSP_STATUS_SUCCESS) {
+ assert(fsp_ipmi_msg == fsp_ipmi.cur_msg);
+
+ ipmi_msg = &fsp_ipmi_msg->ipmi_msg;
+
+ if (length != (ipmi_msg->req_size + FSP_IPMI_REQ_MIN_LEN))
+ prlog(PR_DEBUG, "IPMI: Length mismatch in req completion "
+ "(%d, %d)\n", ipmi_msg->req_size, length);
+
+ log_simple_error(&e_info(OPAL_RC_IPMI_REQ), "IPMI: Request "
+ "failed with status:0x%02x\n", status);
+ /* FSP will not send the response now, so clear the current
+ * outstanding request
+ */
+ fsp_ipmi_cmd_done(ipmi_msg->cmd,
+ IPMI_NETFN_RETURN_CODE(ipmi_msg->netfn),
+ IPMI_ERR_UNSPECIFIED);
+
+ /* Send the next request in the queue */
+ fsp_ipmi_send_request();
+ }
+}
+
+static int fsp_ipmi_send_request(void)
+{
+ uint8_t *req_buf = fsp_ipmi.ipmi_req_buf;
+ struct ipmi_msg *ipmi_msg;
+ struct fsp_msg *msg;
+ int rc;
+
+ if (fsp_in_rr())
+ return OPAL_BUSY;
+
+ lock(&fsp_ipmi.lock);
+ /* An outstanding request is still pending */
+ if (fsp_ipmi.cur_msg) {
+ unlock(&fsp_ipmi.lock);
+ return OPAL_SUCCESS;
+ }
+
+ fsp_ipmi.cur_msg = list_top(&fsp_ipmi.msg_queue, struct fsp_ipmi_msg,
+ link);
+ unlock(&fsp_ipmi.lock);
+
+ if (!fsp_ipmi.cur_msg)
+ return OPAL_SUCCESS;
+
+ ipmi_msg = &fsp_ipmi.cur_msg->ipmi_msg;
+ prlog(PR_TRACE, "IPMI: Send request, netfn:0x%02x, cmd:0x%02x, "
+ "req_len:%d\n", ipmi_msg->netfn, ipmi_msg->cmd, ipmi_msg->req_size);
+
+ /* KCS request message format */
+ *req_buf++ = ipmi_msg->netfn; /* BYTE 1 */
+ *req_buf++ = ipmi_msg->cmd; /* BYTE 2 */
+ if (ipmi_msg->req_size)
+ memcpy(req_buf, ipmi_msg->data, ipmi_msg->req_size);
+
+ msg = fsp_mkmsg(FSP_CMD_FETCH_PLAT_DATA, 5, 0, PSI_DMA_PLAT_REQ_BUF,
+ 0, PSI_DMA_PLAT_RESP_BUF,
+ ipmi_msg->req_size + FSP_IPMI_REQ_MIN_LEN);
+ if (!msg) {
+ log_simple_error(&e_info(OPAL_RC_IPMI_REQ), "IPMI: Failed to "
+ "allocate request message\n");
+ fsp_ipmi_cmd_done(ipmi_msg->cmd,
+ IPMI_NETFN_RETURN_CODE(ipmi_msg->netfn),
+ IPMI_ERR_UNSPECIFIED);
+ return OPAL_NO_MEM;
+ }
+
+ msg->user_data = fsp_ipmi.cur_msg;
+ rc = fsp_queue_msg(msg, fsp_ipmi_req_complete);
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_IPMI_REQ), "IPMI: Failed to "
+ "queue request message (%d)\n", rc);
+ fsp_freemsg(msg);
+ fsp_ipmi_cmd_done(ipmi_msg->cmd,
+ IPMI_NETFN_RETURN_CODE(ipmi_msg->netfn),
+ IPMI_ERR_UNSPECIFIED);
+ return OPAL_INTERNAL_ERROR;
+ }
+
+ return OPAL_SUCCESS;
+}
+
+static struct ipmi_msg *fsp_ipmi_alloc_msg(size_t req_size, size_t resp_size)
+{
+ struct fsp_ipmi_msg *fsp_ipmi_msg;
+ struct ipmi_msg *ipmi_msg;
+
+ fsp_ipmi_msg = zalloc(sizeof(*fsp_ipmi_msg) + MAX(req_size, resp_size));
+ if (!fsp_ipmi_msg)
+ return NULL;
+
+ ipmi_msg = &fsp_ipmi_msg->ipmi_msg;
+
+ ipmi_msg->req_size = req_size;
+ ipmi_msg->resp_size = resp_size;
+ ipmi_msg->data = (uint8_t *)(fsp_ipmi_msg + 1);
+
+ return ipmi_msg;
+}
+
+static void fsp_ipmi_free_msg(struct ipmi_msg *ipmi_msg)
+{
+ struct fsp_ipmi_msg *fsp_ipmi_msg = container_of(ipmi_msg,
+ struct fsp_ipmi_msg, ipmi_msg);
+
+ free(fsp_ipmi_msg);
+}
+
+static int fsp_ipmi_queue_msg(struct ipmi_msg *ipmi_msg)
+{
+ struct fsp_ipmi_msg *fsp_ipmi_msg = container_of(ipmi_msg,
+ struct fsp_ipmi_msg, ipmi_msg);
+
+ if (fsp_in_rr())
+ return OPAL_BUSY;
+
+ lock(&fsp_ipmi.lock);
+ list_add_tail(&fsp_ipmi.msg_queue, &fsp_ipmi_msg->link);
+ unlock(&fsp_ipmi.lock);
+
+ return fsp_ipmi_send_request();
+}
+
+static int fsp_ipmi_queue_msg_head(struct ipmi_msg *ipmi_msg)
+{
+ struct fsp_ipmi_msg *fsp_ipmi_msg = container_of(ipmi_msg,
+ struct fsp_ipmi_msg, ipmi_msg);
+
+ if (fsp_in_rr())
+ return OPAL_BUSY;
+
+ lock(&fsp_ipmi.lock);
+ list_add(&fsp_ipmi.msg_queue, &fsp_ipmi_msg->link);
+ unlock(&fsp_ipmi.lock);
+
+ return fsp_ipmi_send_request();
+}
+
+static int fsp_ipmi_dequeue_msg(struct ipmi_msg *ipmi_msg)
+{
+ struct fsp_ipmi_msg *fsp_ipmi_msg = container_of(ipmi_msg,
+ struct fsp_ipmi_msg, ipmi_msg);
+
+ lock(&fsp_ipmi.lock);
+ list_del_from(&fsp_ipmi.msg_queue, &fsp_ipmi_msg->link);
+ unlock(&fsp_ipmi.lock);
+
+ return 0;
+}
+
+static struct ipmi_backend fsp_ipmi_backend = {
+ .alloc_msg = fsp_ipmi_alloc_msg,
+ .free_msg = fsp_ipmi_free_msg,
+ .queue_msg = fsp_ipmi_queue_msg,
+ .queue_msg_head = fsp_ipmi_queue_msg_head,
+ .dequeue_msg = fsp_ipmi_dequeue_msg,
+ /* FIXME: implement if ipmi_queue_msg_sync() is ever used on FSP */
+ .poll = NULL,
+};
+
+static bool fsp_ipmi_rr_notify(uint32_t cmd_sub_mod,
+ struct fsp_msg *msg __unused)
+{
+ struct ipmi_msg *ipmi_msg;
+
+ switch (cmd_sub_mod) {
+ case FSP_RESET_START:
+ return true;
+ case FSP_RELOAD_COMPLETE:
+ /*
+ * We will not get a response for the outstanding request. Report
+ * an error to the caller and resume sending new IPMI messages.
+ */
+ if (fsp_ipmi.cur_msg) {
+ ipmi_msg = &fsp_ipmi.cur_msg->ipmi_msg;
+ fsp_ipmi_cmd_done(ipmi_msg->cmd,
+ IPMI_NETFN_RETURN_CODE(ipmi_msg->netfn),
+ IPMI_ERR_UNSPECIFIED);
+ }
+ fsp_ipmi_send_request();
+ return true;
+ }
+ return false;
+}
+
+static struct fsp_client fsp_ipmi_client_rr = {
+ .message = fsp_ipmi_rr_notify,
+};
+
+static bool fsp_ipmi_send_response(uint32_t cmd)
+{
+ struct fsp_msg *resp;
+ int rc;
+
+ resp = fsp_mkmsg(cmd, 0);
+ if (!resp) {
+ log_simple_error(&e_info(OPAL_RC_IPMI_RESP), "IPMI: Failed to "
+ "allocate response message\n");
+ return false;
+ }
+
+ rc = fsp_queue_msg(resp, fsp_freemsg);
+ if (rc) {
+ fsp_freemsg(resp);
+ log_simple_error(&e_info(OPAL_RC_IPMI_RESP), "IPMI: Failed to "
+ "queue response message\n");
+ return false;
+ }
+
+ return true;
+}
+
+static bool fsp_ipmi_read_response(struct fsp_msg *msg)
+{
+ uint8_t *resp_buf = fsp_ipmi.ipmi_resp_buf;
+ uint32_t status = fsp_msg_get_data_word(msg, 3);
+ uint32_t length = fsp_msg_get_data_word(msg, 2);
+ struct ipmi_msg *ipmi_msg;
+ uint8_t netfn, cmd, cc;
+
+ assert(fsp_ipmi.cur_msg);
+ ipmi_msg = &fsp_ipmi.cur_msg->ipmi_msg;
+
+ /* Response TCE token */
+ assert(fsp_msg_get_data_word(msg, 1) == PSI_DMA_PLAT_RESP_BUF);
+
+ if (status != FSP_STATUS_SUCCESS) {
+ if (status == FSP_STATUS_DMA_ERROR)
+ log_simple_error(&e_info(OPAL_RC_IPMI_DMA_ERROR_RESP), "IPMI: Received "
+ "DMA ERROR response from FSP; this may be because the FSP "
+ "is in termination state: 0x%02x\n", status);
+ else
+ log_simple_error(&e_info(OPAL_RC_IPMI_RESP), "IPMI: FSP response "
+ "received with bad status:0x%02x\n", status);
+
+ fsp_ipmi_cmd_done(ipmi_msg->cmd,
+ IPMI_NETFN_RETURN_CODE(ipmi_msg->netfn),
+ IPMI_ERR_UNSPECIFIED);
+ return fsp_ipmi_send_response(FSP_RSP_PLAT_DATA |
+ FSP_STATUS_SUCCESS);
+ }
+
+ /* KCS response message format */
+ netfn = *resp_buf++;
+ cmd = *resp_buf++;
+ cc = *resp_buf++;
+ length -= FSP_IPMI_RESP_MIN_LEN;
+
+ prlog(PR_TRACE, "IPMI: fsp response received, netfn:0x%02x, cmd:0x%02x,"
+ " cc:0x%02x, length:%d\n", netfn, cmd, cc, length);
+
+ if (length > ipmi_msg->resp_size) {
+ prlog(PR_DEBUG, "IPMI: Length mismatch in response (%d, %d)\n",
+ length, ipmi_msg->resp_size);
+ length = ipmi_msg->resp_size; /* Truncate */
+ cc = IPMI_ERR_MSG_TRUNCATED;
+ }
+
+ ipmi_msg->resp_size = length;
+ if (length)
+ memcpy(ipmi_msg->data, resp_buf, length);
+
+ fsp_ipmi_cmd_done(cmd, netfn, cc);
+
+ return fsp_ipmi_send_response(FSP_RSP_PLAT_DATA);
+}
+
+static bool fsp_ipmi_response(uint32_t cmd_sub_mod, struct fsp_msg *msg)
+{
+ bool rc;
+
+ switch (cmd_sub_mod) {
+ case FSP_CMD_SEND_PLAT_DATA:
+ prlog(PR_TRACE, "FSP_CMD_SEND_PLAT_DATA command received\n");
+ rc = fsp_ipmi_read_response(msg);
+ break;
+ default:
+ return false;
+ };
+
+ /* If response sent successfully, pick the next request */
+ if (rc == true)
+ fsp_ipmi_send_request();
+
+ return rc;
+}
+
+static struct fsp_client fsp_ipmi_client = {
+ .message = fsp_ipmi_response,
+};
+
+void fsp_ipmi_init(void)
+{
+ fsp_tce_map(PSI_DMA_PLAT_REQ_BUF, fsp_ipmi.ipmi_req_buf,
+ PSI_DMA_PLAT_REQ_BUF_SIZE);
+ fsp_tce_map(PSI_DMA_PLAT_RESP_BUF, fsp_ipmi.ipmi_resp_buf,
+ PSI_DMA_PLAT_RESP_BUF_SIZE);
+
+ list_head_init(&fsp_ipmi.msg_queue);
+ init_lock(&fsp_ipmi.lock);
+
+ fsp_register_client(&fsp_ipmi_client, FSP_MCLASS_FETCH_SPDATA);
+ fsp_register_client(&fsp_ipmi_client_rr, FSP_MCLASS_RR_EVENT);
+ ipmi_register_backend(&fsp_ipmi_backend);
+}
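
For illustration only: a stand-alone sketch of the KCS framing documented at the top of fsp-ipmi.c, packing a request as NetFn/LUN, Cmd, Data and decoding a response as NetFn/LUN, Cmd, CompCode, Data. The helper names and sample bytes are hypothetical; only the byte layout follows the comment in the file above.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Request: BYTE 1 NetFn/LUN, BYTE 2 Cmd, BYTE 3..N Data. */
static size_t kcs_pack_req(uint8_t *buf, uint8_t netfn, uint8_t cmd,
			   const uint8_t *data, size_t len)
{
	buf[0] = netfn;
	buf[1] = cmd;
	if (len)
		memcpy(buf + 2, data, len);
	return len + 2;
}

/* Response: BYTE 1 NetFn/LUN, BYTE 2 Cmd, BYTE 3 CompCode, BYTE 4..N Data. */
static void kcs_print_resp(const uint8_t *buf, size_t len)
{
	printf("netfn 0x%02x cmd 0x%02x cc 0x%02x data bytes %zu\n",
	       buf[0], buf[1], buf[2], len - 3);
}

int main(void)
{
	uint8_t req[32];
	uint8_t data[1] = { 0x01 };
	uint8_t resp[4] = { 0x07, 0x01, 0x00, 0xaa };	/* sample reply */
	size_t n;

	n = kcs_pack_req(req, 0x06, 0x01, data, sizeof(data));
	printf("request is %zu bytes\n", n);
	kcs_print_resp(resp, sizeof(resp));
	return 0;
}
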
diff --git a/roms/skiboot/hw/fsp/fsp-leds.c b/roms/skiboot/hw/fsp/fsp-leds.c
new file mode 100644
index 000000000..5a552ab3e
--- /dev/null
+++ b/roms/skiboot/hw/fsp/fsp-leds.c
@@ -0,0 +1,1939 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * LED location code and indicator handling
+ *
+ * Copyright 2013-2019 IBM Corp.
+ */
+
+#define pr_fmt(fmt) "FSPLED: " fmt
+#include <skiboot.h>
+#include <fsp.h>
+#include <device.h>
+#include <spcn.h>
+#include <lock.h>
+#include <errorlog.h>
+#include <opal.h>
+#include <opal-msg.h>
+#include <fsp-leds.h>
+#include <fsp-sysparam.h>
+
+#define buf_write(p, type, val) do { *(type *)(p) = val;\
+ p += sizeof(type); } while(0)
+#define buf_read(p, type, addr) do { *addr = *(type *)(p);\
+ p += sizeof(type); } while(0)
+
+/* SPCN replay threshold */
+#define SPCN_REPLAY_THRESHOLD 2
+
+/* LED support status */
+enum led_support_state {
+ LED_STATE_ABSENT,
+ LED_STATE_READING,
+ LED_STATE_PRESENT,
+};
+
+static enum led_support_state led_support = LED_STATE_ABSENT;
+
+/*
+ * PSI mapped buffer for LED data
+ *
+ * Mapped once and never unmapped. Used for fetching all
+ * available LED information and creating the list. Also
+ * used for setting individual LED state.
+ */
+static void *led_buffer;
+static u8 *loc_code_list_buffer = NULL;
+
+/* Maintain list of all LEDs
+ *
+ * The contents here are used to serve requests from FSP
+ * async commands and HV-initiated OPAL calls.
+ */
+static struct list_head cec_ledq; /* CEC LED list */
+static struct list_head encl_ledq; /* Enclosure LED list */
+static struct list_head spcn_cmdq; /* SPCN command queue */
+
+/* LED lock */
+static struct lock led_lock = LOCK_UNLOCKED;
+static struct lock spcn_cmd_lock = LOCK_UNLOCKED;
+static struct lock sai_lock = LOCK_UNLOCKED;
+
+static bool spcn_cmd_complete = true; /* SPCN command complete */
+
+/* Last SPCN command */
+static u32 last_spcn_cmd;
+static int replay = 0;
+
+/*
+ * The FSP controls the System Attention Indicator, but it expects the
+ * hypervisor to keep track of its status and to serve get-LED-state
+ * requests (from both Linux and the FSP itself)!
+ */
+static struct sai_data sai_data;
+
+/* Forward declaration */
+static void fsp_read_leds_data_complete(struct fsp_msg *msg);
+static int process_led_state_change(void);
+
+
+DEFINE_LOG_ENTRY(OPAL_RC_LED_SPCN, OPAL_PLATFORM_ERR_EVT, OPAL_LED,
+ OPAL_PLATFORM_FIRMWARE, OPAL_PREDICTIVE_ERR_GENERAL,
+ OPAL_NA);
+
+DEFINE_LOG_ENTRY(OPAL_RC_LED_BUFF, OPAL_PLATFORM_ERR_EVT, OPAL_LED,
+ OPAL_PLATFORM_FIRMWARE, OPAL_PREDICTIVE_ERR_GENERAL,
+ OPAL_NA);
+
+DEFINE_LOG_ENTRY(OPAL_RC_LED_LC, OPAL_PLATFORM_ERR_EVT, OPAL_LED,
+ OPAL_PLATFORM_FIRMWARE, OPAL_INFO, OPAL_NA);
+
+DEFINE_LOG_ENTRY(OPAL_RC_LED_STATE, OPAL_PLATFORM_ERR_EVT, OPAL_LED,
+ OPAL_PLATFORM_FIRMWARE, OPAL_PREDICTIVE_ERR_GENERAL,
+ OPAL_NA);
+
+DEFINE_LOG_ENTRY(OPAL_RC_LED_SUPPORT, OPAL_PLATFORM_ERR_EVT, OPAL_LED,
+ OPAL_PLATFORM_FIRMWARE, OPAL_INFO, OPAL_NA);
+
+
+/* Find descendent LED record with CEC location code in CEC list */
+static struct fsp_led_data *fsp_find_cec_led(char *loc_code)
+{
+ struct fsp_led_data *led, *next;
+
+ list_for_each_safe(&cec_ledq, led, next, link) {
+ if (strcmp(led->loc_code, loc_code))
+ continue;
+ return led;
+ }
+ return NULL;
+}
+
+/* Find encl LED record with ENCL location code in ENCL list */
+static struct fsp_led_data *fsp_find_encl_led(char *loc_code)
+{
+ struct fsp_led_data *led, *next;
+
+ list_for_each_safe(&encl_ledq, led, next, link) {
+ if (strcmp(led->loc_code, loc_code))
+ continue;
+ return led;
+ }
+ return NULL;
+}
+
+/* Find encl LED record with CEC location code in CEC list */
+static struct fsp_led_data *fsp_find_encl_cec_led(char *loc_code)
+{
+ struct fsp_led_data *led, *next;
+
+ list_for_each_safe(&cec_ledq, led, next, link) {
+ if (strstr(led->loc_code, "-"))
+ continue;
+ if (!strstr(loc_code, led->loc_code))
+ continue;
+ return led;
+ }
+ return NULL;
+}
+
+/* Find encl LED record with CEC location code in ENCL list */
+static struct fsp_led_data *fsp_find_encl_encl_led(char *loc_code)
+{
+ struct fsp_led_data *led, *next;
+
+ list_for_each_safe(&encl_ledq, led, next, link) {
+ if (!strstr(loc_code, led->loc_code))
+ continue;
+ return led;
+ }
+ return NULL;
+}
+
+/* Compute the ENCL LED status in CEC list */
+static void compute_encl_status_cec(struct fsp_led_data *encl_led)
+{
+ struct fsp_led_data *led, *next;
+
+ encl_led->status &= ~SPCN_LED_IDENTIFY_MASK;
+ encl_led->status &= ~SPCN_LED_FAULT_MASK;
+
+ list_for_each_safe(&cec_ledq, led, next, link) {
+ if (!strstr(led->loc_code, encl_led->loc_code))
+ continue;
+
+ /* Don't count the enclosure LED itself */
+ if (!strcmp(led->loc_code, encl_led->loc_code))
+ continue;
+
+ if (led->status & SPCN_LED_IDENTIFY_MASK)
+ encl_led->status |= SPCN_LED_IDENTIFY_MASK;
+
+ if (led->status & SPCN_LED_FAULT_MASK)
+ encl_led->status |= SPCN_LED_FAULT_MASK;
+ }
+}
+
+/* Is this an enclosure LED? */
+static bool is_enclosure_led(char *loc_code)
+{
+ if (strstr(loc_code, "-"))
+ return false;
+ if (!fsp_find_cec_led(loc_code) || !fsp_find_encl_led(loc_code))
+ return false;
+ return true;
+}
+
+static inline void opal_led_update_complete(u64 async_token, u64 result)
+{
+ opal_queue_msg(OPAL_MSG_ASYNC_COMP, NULL, NULL,
+ cpu_to_be64(async_token),
+ cpu_to_be64(result));
+}
+
+static inline bool is_sai_loc_code(const char *loc_code)
+{
+ if (!loc_code)
+ return false;
+
+ if (!strncmp(sai_data.loc_code, loc_code, strlen(sai_data.loc_code)))
+ return true;
+
+ return false;
+}
+
+/* Set/Reset System attention indicator */
+static void fsp_set_sai_complete(struct fsp_msg *msg)
+{
+ int ret = OPAL_SUCCESS;
+ int rc = msg->resp->word1 & 0xff00;
+ struct led_set_cmd *spcn_cmd = (struct led_set_cmd *)msg->user_data;
+
+ if (rc) {
+ /**
+ * @fwts-label FSPSAIFailed
+ * @fwts-advice Failed to update System Attention Indicator.
+ * Likely means some bug with OPAL interacting with FSP.
+ */
+ prlog(PR_ERR, "Update SAI cmd failed [rc=%d].\n", rc);
+ ret = OPAL_INTERNAL_ERROR;
+
+ /* Roll back */
+ lock(&sai_lock);
+ sai_data.state = spcn_cmd->ckpt_status;
+ unlock(&sai_lock);
+ }
+
+ if (spcn_cmd->cmd_src == SPCN_SRC_OPAL)
+ opal_led_update_complete(spcn_cmd->async_token, ret);
+
+ /* free msg and spcn command */
+ free(spcn_cmd);
+ fsp_freemsg(msg);
+
+ /* Process pending LED update request */
+ process_led_state_change();
+}
+
+static int fsp_set_sai(struct led_set_cmd *spcn_cmd)
+{
+ int rc = -ENOMEM;
+ uint32_t cmd = FSP_CMD_SA_INDICATOR;
+ struct fsp_msg *msg;
+
+ /*
+ * The FSP does not allow the hypervisor to set the real SAI, but we
+ * can reset it. Also, in our case only the host can control LEDs,
+ * not guests. Hence we set the platform virtual SAI and reset the
+ * real SAI.
+ */
+ if (spcn_cmd->state == LED_STATE_ON)
+ cmd |= FSP_LED_SET_PLAT_SAI;
+ else
+ cmd |= FSP_LED_RESET_REAL_SAI;
+
+ prlog(PR_TRACE, "Update SAI Indicator [cur : 0x%x, new : 0x%x].\n",
+ sai_data.state, spcn_cmd->state);
+
+ msg = fsp_mkmsg(cmd, 0);
+ if (!msg) {
+ /**
+ * @fwts-label SAIMallocFail
+ * @fwts-advice OPAL ran out of memory while trying to
+ * allocate an FSP message in SAI code path. This indicates
+ * an OPAL bug that caused OPAL to run out of memory.
+ */
+ prlog(PR_ERR, "%s: Memory allocation failed.\n", __func__);
+ goto sai_fail;
+ }
+
+ spcn_cmd->ckpt_status = sai_data.state;
+ msg->user_data = spcn_cmd;
+ rc = fsp_queue_msg(msg, fsp_set_sai_complete);
+ if (rc) {
+ fsp_freemsg(msg);
+ /**
+ * @fwts-label SAIQueueFail
+ * @fwts-advice Error in queueing message to FSP in SAI code
+ * path. Likely an OPAL bug.
+ */
+ prlog(PR_ERR, "%s: Failed to queue the message\n", __func__);
+ goto sai_fail;
+ }
+
+ lock(&sai_lock);
+ sai_data.state = spcn_cmd->state;
+ unlock(&sai_lock);
+
+ return OPAL_SUCCESS;
+
+sai_fail:
+ if (spcn_cmd->cmd_src == SPCN_SRC_OPAL)
+ opal_led_update_complete(spcn_cmd->async_token,
+ OPAL_INTERNAL_ERROR);
+
+ return OPAL_INTERNAL_ERROR;
+}
+
+static void fsp_get_sai_complete(struct fsp_msg *msg)
+{
+ int rc = msg->resp->word1 & 0xff00;
+
+ if (rc) {
+ /**
+ * @fwts-label FSPSAIGetFailed
+ * @fwts-advice Possibly an error on FSP side, OPAL failed
+ * to read state from FSP.
+ */
+ prlog(PR_ERR, "Read real SAI cmd failed [rc = 0x%x].\n", rc);
+ } else { /* Update SAI state */
+ lock(&sai_lock);
+ sai_data.state = fsp_msg_get_data_word(msg->resp, 0) & 0xff;
+ unlock(&sai_lock);
+
+ prlog(PR_TRACE, "SAI initial state = 0x%x\n", sai_data.state);
+ }
+
+ fsp_freemsg(msg);
+}
+
+/* Read initial SAI state. */
+static void fsp_get_sai(void)
+{
+ int rc;
+ uint32_t cmd = FSP_CMD_SA_INDICATOR | FSP_LED_READ_REAL_SAI;
+ struct fsp_msg *msg;
+
+ msg = fsp_mkmsg(cmd, 0);
+ if (!msg) {
+ /**
+ * @fwts-label FSPGetSAIMallocFail
+ * @fwts-advice OPAL ran out of memory: OPAL bug.
+ */
+ prlog(PR_ERR, "%s: Memory allocation failed.\n", __func__);
+ return;
+ }
+ rc = fsp_queue_msg(msg, fsp_get_sai_complete);
+ if (rc) {
+ fsp_freemsg(msg);
+ /**
+ * @fwts-label FSPGetSAIQueueFail
+ * @fwts-advice Failed to queue message to FSP: OPAL bug
+ */
+ prlog(PR_ERR, "%s: Failed to queue the message\n", __func__);
+ }
+}
+
+static bool sai_update_notification(struct fsp_msg *msg)
+{
+ uint32_t state = fsp_msg_get_data_word(msg, 2);
+ uint32_t param_id = fsp_msg_get_data_word(msg, 0);
+ int len = fsp_msg_get_data_word(msg, 1) & 0xffff;
+
+ if (param_id != SYS_PARAM_REAL_SAI && param_id != SYS_PARAM_PLAT_SAI)
+ return false;
+
+ if (len != 4)
+ return false;
+
+ if (state != LED_STATE_ON && state != LED_STATE_OFF)
+ return false;
+
+ /* Update SAI state */
+ lock(&sai_lock);
+ sai_data.state = state;
+ unlock(&sai_lock);
+
+ prlog(PR_TRACE, "SAI updated. New SAI state = 0x%x\n", state);
+ return true;
+}
+
+
+/*
+ * Update both local LED lists to reflect the LED state changes made
+ * by the most recent SPCN command. Subsequent LED requests will be
+ * served from these updated lists.
+ */
+static void update_led_list(char *loc_code, u32 led_state, u32 excl_bit)
+{
+ struct fsp_led_data *led = NULL, *encl_led = NULL, *encl_cec_led = NULL;
+ bool is_encl_led = is_enclosure_led(loc_code);
+
+ /* Enclosure LED in CEC list */
+ encl_cec_led = fsp_find_encl_cec_led(loc_code);
+ if (!encl_cec_led) {
+ log_simple_error(&e_info(OPAL_RC_LED_LC),
+ "Could not find enclosure LED in CEC LC=%s\n",
+ loc_code);
+ return;
+ }
+
+ /* Update state */
+ if (is_encl_led) {
+ /* Enclosure exclusive bit */
+ encl_cec_led->excl_bit = excl_bit;
+ } else { /* Descendant LED in CEC list */
+ led = fsp_find_cec_led(loc_code);
+ if (!led) {
+ log_simple_error(&e_info(OPAL_RC_LED_LC),
+ "Could not find descendent LED in \
+ CEC LC=%s\n", loc_code);
+ return;
+ }
+ led->status = led_state;
+ }
+
+ /* Enclosure LED in ENCL list */
+ encl_led = fsp_find_encl_encl_led(loc_code);
+ if (!encl_led) {
+ log_simple_error(&e_info(OPAL_RC_LED_LC),
+ "Could not find enclosure LED in ENCL LC=%s\n",
+ loc_code);
+ return;
+ }
+
+ /* Compute descendent rolled up status */
+ compute_encl_status_cec(encl_cec_led);
+
+ /* Check whether exclusive bits are set */
+ if (encl_cec_led->excl_bit & FSP_LED_EXCL_FAULT)
+ encl_cec_led->status |= SPCN_LED_FAULT_MASK;
+
+ if (encl_cec_led->excl_bit & FSP_LED_EXCL_IDENTIFY)
+ encl_cec_led->status |= SPCN_LED_IDENTIFY_MASK;
+
+ /* Copy over */
+ encl_led->status = encl_cec_led->status;
+ encl_led->excl_bit = encl_cec_led->excl_bit;
+}
+
+static int fsp_set_led_response(uint32_t cmd)
+{
+ struct fsp_msg *msg;
+ int rc = -1;
+
+ msg = fsp_mkmsg(cmd, 0);
+ if (!msg) {
+ prerror("Failed to allocate FSP_RSP_SET_LED_STATE [cmd=%x])\n",
+ cmd);
+ } else {
+ rc = fsp_queue_msg(msg, fsp_freemsg);
+ if (rc != OPAL_SUCCESS) {
+ fsp_freemsg(msg);
+ prerror("Failed to queue FSP_RSP_SET_LED_STATE"
+ " [cmd=%x]\n", cmd);
+ }
+ }
+ return rc;
+}
+
+static void fsp_spcn_set_led_completion(struct fsp_msg *msg)
+{
+ struct fsp_msg *resp = msg->resp;
+ u32 cmd = FSP_RSP_SET_LED_STATE;
+ u8 status = resp->word1 & 0xff00;
+ struct led_set_cmd *spcn_cmd = (struct led_set_cmd *)msg->user_data;
+
+ lock(&led_lock);
+
+ /*
+ * LED state update request came as part of FSP async message
+ * FSP_CMD_SET_LED_STATE, we need to send response message.
+ *
+ * Also if SPCN command failed, then roll back changes.
+ */
+ if (status != FSP_STATUS_SUCCESS) {
+ log_simple_error(&e_info(OPAL_RC_LED_SPCN),
+ "Last SPCN command failed, status=%02x\n",
+ status);
+ cmd |= FSP_STATUS_GENERIC_ERROR;
+
+ /* Rollback the changes */
+ update_led_list(spcn_cmd->loc_code,
+ spcn_cmd->ckpt_status, spcn_cmd->ckpt_excl_bit);
+ }
+
+ /* FSP initiated SPCN command */
+ if (spcn_cmd->cmd_src == SPCN_SRC_FSP)
+ fsp_set_led_response(cmd);
+
+ /* OPAL initiated SPCN command */
+ if (spcn_cmd->cmd_src == SPCN_SRC_OPAL) {
+ if (status != FSP_STATUS_SUCCESS)
+ opal_led_update_complete(spcn_cmd->async_token,
+ OPAL_INTERNAL_ERROR);
+ else
+ opal_led_update_complete(spcn_cmd->async_token,
+ OPAL_SUCCESS);
+ }
+
+ unlock(&led_lock);
+
+ /* free msg and spcn command */
+ free(spcn_cmd);
+ fsp_freemsg(msg);
+
+ /* Process pending LED update request */
+ process_led_state_change();
+}
+
+/*
+ * Set the state of the LED pointed by the location code
+ *
+ * LED command: FAULT state or IDENTIFY state
+ * LED state : OFF (reset) or ON (set)
+ *
+ * SPCN TCE mapped buffer entries for setting LED state
+ *
+ * struct spcn_led_data {
+ * u8 lc_len;
+ * u16 state;
+ * char lc_code[LOC_CODE_SIZE];
+ *};
+ */
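+/*
+ * On the wire the entry is packed as one length byte, the raw location
+ * code bytes (no NUL terminator) and then the 16-bit big-endian state.
+ * For example, a fault-on request for a (hypothetical) location code
+ * "U78C9.001.ABC" carries lc_len = 13, those 13 bytes, and a state word
+ * with SPCN_LED_FAULT_MASK set.
+ */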
+static int fsp_msg_set_led_state(struct led_set_cmd *spcn_cmd)
+{
+ struct spcn_led_data sled;
+ struct fsp_msg *msg = NULL;
+ struct fsp_led_data *led = NULL;
+ void *buf = led_buffer;
+ u16 data_len = 0;
+ u32 cmd_hdr = 0;
+ u32 cmd = FSP_RSP_SET_LED_STATE;
+ int rc = -1;
+
+ memset(sled.lc_code, 0, LOC_CODE_SIZE);
+ sled.lc_len = strlen(spcn_cmd->loc_code);
+ if (sled.lc_len >= LOC_CODE_SIZE)
+ sled.lc_len = LOC_CODE_SIZE - 1;
+ strncpy(sled.lc_code, spcn_cmd->loc_code, LOC_CODE_SIZE - 1);
+
+ lock(&led_lock);
+
+ /* Location code length + Location code + LED control */
+ data_len = LOC_CODE_LEN + sled.lc_len + LED_CONTROL_LEN;
+ cmd_hdr = SPCN_MOD_SET_LED_CTL_LOC_CODE << 24 | SPCN_CMD_SET << 16 |
+ data_len;
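+ /*
+ * cmd_hdr layout: byte 0 is the SPCN modifier, byte 1 the SPCN command
+ * and bytes 2-3 the 16-bit length of the data written to the TCE buffer.
+ */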
+
+ /* Fetch the current state of LED */
+ led = fsp_find_cec_led(spcn_cmd->loc_code);
+
+ /* LED not present */
+ if (led == NULL) {
+ if (spcn_cmd->cmd_src == SPCN_SRC_FSP) {
+ cmd |= FSP_STATUS_INVALID_LC;
+ fsp_set_led_response(cmd);
+ }
+
+ if (spcn_cmd->cmd_src == SPCN_SRC_OPAL)
+ opal_led_update_complete(spcn_cmd->async_token,
+ OPAL_INTERNAL_ERROR);
+
+ unlock(&led_lock);
+ return rc;
+ }
+
+ /*
+ * Checkpoint the status here, will use it if the SPCN
+ * command eventually fails.
+ */
+ spcn_cmd->ckpt_status = led->status;
+ spcn_cmd->ckpt_excl_bit = led->excl_bit;
+ sled.state = cpu_to_be16(led->status);
+
+ /* Update the exclusive LED bits */
+ if (is_enclosure_led(spcn_cmd->loc_code)) {
+ if (spcn_cmd->command == LED_COMMAND_FAULT) {
+ if (spcn_cmd->state == LED_STATE_ON)
+ led->excl_bit |= FSP_LED_EXCL_FAULT;
+ if (spcn_cmd->state == LED_STATE_OFF)
+ led->excl_bit &= ~FSP_LED_EXCL_FAULT;
+ }
+
+ if (spcn_cmd->command == LED_COMMAND_IDENTIFY) {
+ if (spcn_cmd->state == LED_STATE_ON)
+ led->excl_bit |= FSP_LED_EXCL_IDENTIFY;
+ if (spcn_cmd->state == LED_STATE_OFF)
+ led->excl_bit &= ~FSP_LED_EXCL_IDENTIFY;
+ }
+ }
+
+ /* LED FAULT command */
+ if (spcn_cmd->command == LED_COMMAND_FAULT) {
+ if (spcn_cmd->state == LED_STATE_ON)
+ sled.state |= cpu_to_be16(SPCN_LED_FAULT_MASK);
+ if (spcn_cmd->state == LED_STATE_OFF)
+ sled.state &= cpu_to_be16(~SPCN_LED_FAULT_MASK);
+ }
+
+ /* LED IDENTIFY command */
+ if (spcn_cmd->command == LED_COMMAND_IDENTIFY) {
+ if (spcn_cmd->state == LED_STATE_ON)
+ sled.state |= cpu_to_be16(SPCN_LED_IDENTIFY_MASK);
+ if (spcn_cmd->state == LED_STATE_OFF)
+ sled.state &= cpu_to_be16(~SPCN_LED_IDENTIFY_MASK);
+ }
+
+ /* Write into SPCN TCE buffer */
+ buf_write(buf, u8, sled.lc_len); /* Location code length */
+ memcpy(buf, sled.lc_code, sled.lc_len); /* Location code */
+ buf += sled.lc_len;
+ buf_write(buf, __be16, sled.state); /* LED state */
+
+ msg = fsp_mkmsg(FSP_CMD_SPCN_PASSTHRU, 4,
+ SPCN_ADDR_MODE_CEC_NODE, cmd_hdr, 0, PSI_DMA_LED_BUF);
+ if (!msg) {
+ cmd |= FSP_STATUS_GENERIC_ERROR;
+ rc = -1;
+ goto update_fail;
+ }
+
+ /*
+ * Update the local lists based on the attempted SPCN command to
+ * set/reset an individual led (CEC or ENCL).
+ */
+ update_led_list(spcn_cmd->loc_code, be16_to_cpu(sled.state), led->excl_bit);
+ msg->user_data = spcn_cmd;
+
+ rc = fsp_queue_msg(msg, fsp_spcn_set_led_completion);
+ if (rc != OPAL_SUCCESS) {
+ cmd |= FSP_STATUS_GENERIC_ERROR;
+ fsp_freemsg(msg);
+ /* Revert LED state update */
+ update_led_list(spcn_cmd->loc_code, spcn_cmd->ckpt_status,
+ spcn_cmd->ckpt_excl_bit);
+ }
+
+update_fail:
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_LED_STATE),
+ "Set led state failed at LC=%s\n",
+ spcn_cmd->loc_code);
+
+ if (spcn_cmd->cmd_src == SPCN_SRC_FSP)
+ fsp_set_led_response(cmd);
+
+ if (spcn_cmd->cmd_src == SPCN_SRC_OPAL)
+ opal_led_update_complete(spcn_cmd->async_token,
+ OPAL_INTERNAL_ERROR);
+ }
+
+ unlock(&led_lock);
+ return rc;
+}
+
+/*
+ * process_led_state_change
+ *
+ * If the command queue is empty, set 'spcn_cmd_complete' to true and
+ * return. Otherwise pop one element from the command queue and process
+ * the requested LED state change.
+ */
+static int process_led_state_change(void)
+{
+ struct led_set_cmd *spcn_cmd;
+ int rc = 0;
+
+ /*
+ * If the command queue is empty (this will only
+ * happen during the SPCN command callback path),
+ * set 'spcn_cmd_complete' to true.
+ */
+ lock(&spcn_cmd_lock);
+ if (list_empty(&spcn_cmdq)) {
+ spcn_cmd_complete = true;
+ unlock(&spcn_cmd_lock);
+ return rc;
+ }
+
+ spcn_cmd = list_pop(&spcn_cmdq, struct led_set_cmd, link);
+ unlock(&spcn_cmd_lock);
+
+ if (is_sai_loc_code(spcn_cmd->loc_code))
+ rc = fsp_set_sai(spcn_cmd);
+ else
+ rc = fsp_msg_set_led_state(spcn_cmd);
+
+ if (rc) {
+ free(spcn_cmd);
+ process_led_state_change();
+ }
+
+ return rc;
+}
+
+/*
+ * queue_led_state_change
+ *
+ * An FSP async command or OPAL based request for an LED state change gets
+ * queued up in the command queue. If no previous SPCN command is pending,
+ * one element is immediately popped from the list and processed. If previous
+ * SPCN commands are still pending, the request is simply queued and we return.
+ * When the SPCN command callback gets to execute, it processes one element
+ * from the list and keeps the chain going. Finally, when there are no elements
+ * left in the command queue, 'spcn_cmd_complete' is set to true again.
+ */
+static int queue_led_state_change(char *loc_code, u8 command,
+ u8 state, int cmd_src, uint64_t async_token)
+{
+ struct led_set_cmd *cmd;
+ int rc = 0;
+
+ /* New request node */
+ cmd = zalloc(sizeof(struct led_set_cmd));
+ if (!cmd) {
+ /**
+ * @fwts-label FSPLEDRequestMallocFail
+ * @fwts-advice OPAL failed to allocate memory for FSP LED
+ * command. Likely an OPAL bug led to out of memory.
+ */
+ prlog(PR_ERR, "SPCN set command node allocation failed\n");
+ return -1;
+ }
+
+ /* Save the request */
+ strncpy(cmd->loc_code, loc_code, LOC_CODE_SIZE - 1);
+ cmd->command = command;
+ cmd->state = state;
+ cmd->cmd_src = cmd_src;
+ cmd->async_token = async_token;
+
+ /* Add to the queue */
+ lock(&spcn_cmd_lock);
+ list_add_tail(&spcn_cmdq, &cmd->link);
+
+ /* No previous SPCN command pending */
+ if (spcn_cmd_complete) {
+ spcn_cmd_complete = false;
+ unlock(&spcn_cmd_lock);
+ rc = process_led_state_change();
+ return rc;
+ }
+
+ unlock(&spcn_cmd_lock);
+ return rc;
+}
+
+/*
+ * Write single location code information into the TCE outbound buffer
+ *
+ * Data layout
+ *
+ * 2 bytes - Length of location code structure
+ * 4 bytes - CCIN in ASCII
+ * 1 byte - Resource status flag
+ * 1 byte - Indicator state
+ * 1 byte - Raw loc code length
+ * 1 byte - Loc code field size
+ * `Field size` bytes - NULL terminated ASCII string padded to a 4 byte boundary
+ *
+ */
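+/*
+ * Each record is written as a full struct fsp_loc_code_data; records are
+ * appended back to back and the caller passes the running 'total_size' so
+ * that the copy is skipped once a record would overflow the outbound
+ * buffer (PSI_DMA_LOC_COD_BUF_SZ).
+ */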
+static u32 fsp_push_data_to_tce(struct fsp_led_data *led, u8 *out_data,
+ u32 total_size)
+{
+ struct fsp_loc_code_data lcode;
+
+ /* CCIN value is irrelevant */
+ lcode.ccin = 0x0;
+
+ lcode.status = FSP_IND_NOT_IMPLMNTD;
+
+ if (led->parms & SPCN_LED_IDENTIFY_MASK)
+ lcode.status = FSP_IND_IMPLMNTD;
+
+ /* LED indicator status */
+ lcode.ind_state = FSP_IND_INACTIVE;
+ if (led->status & SPCN_LED_IDENTIFY_MASK)
+ lcode.ind_state |= FSP_IND_IDENTIFY_ACTV;
+ if (led->status & SPCN_LED_FAULT_MASK)
+ lcode.ind_state |= FSP_IND_FAULT_ACTV;
+
+ /* Location code */
+ memset(lcode.loc_code, 0, LOC_CODE_SIZE);
+ lcode.raw_len = strlen(led->loc_code);
+ strncpy(lcode.loc_code, led->loc_code, LOC_CODE_SIZE - 1);
+ lcode.fld_sz = sizeof(lcode.loc_code);
+
+ /* Rest of the structure */
+ lcode.size = cpu_to_be16(sizeof(lcode));
+ lcode.status &= 0x0f;
+
+ /*
+ * Check for outbound buffer overflow. If this record would not
+ * fit, don't send it across to the FSP; just ignore it.
+ */
+ if ((total_size + be16_to_cpu(lcode.size)) > PSI_DMA_LOC_COD_BUF_SZ)
+ return 0;
+
+ /* Copy over to the buffer */
+ memcpy(out_data, &lcode, sizeof(lcode));
+
+ return be16_to_cpu(lcode.size);
+}
+
+/*
+ * Send out the LED information selected by "req_type"/"loc_code"
+ * to the FSP through the PSI DMA mapping. The buffer layout
+ * described above must be followed.
+ */
+static void fsp_ret_loc_code_list(u16 req_type, char *loc_code)
+{
+ struct fsp_led_data *led, *next;
+ struct fsp_msg *msg;
+
+ u8 *data; /* Start of TCE mapped buffer */
+ u8 *out_data; /* Start of location code data */
+ u32 bytes_sent = 0, total_size = 0;
+ u16 header_size = 0, flags = 0;
+
+ if (loc_code_list_buffer == NULL) {
+ prerror("No loc_code_list_buffer\n");
+ return;
+ }
+
+ /* Init the addresses */
+ data = loc_code_list_buffer;
+ out_data = NULL;
+
+ /* Unmapped later via the FSP_CMD_RET_LED_BUFFER command */
+ fsp_tce_map(PSI_DMA_LOC_COD_BUF, (void *)data, PSI_DMA_LOC_COD_BUF_SZ);
+ out_data = data + 8;
+
+ /* CEC LED list */
+ list_for_each_safe(&cec_ledq, led, next, link) {
+ /*
+ * When the request type is the system wide LED list,
+ * i.e. GET_LC_CMPLT_SYS, send the entire contents
+ * of the CEC list, including all descendants
+ * and all of their enclosures.
+ */
+
+ if (req_type == GET_LC_ENCLOSURES)
+ break;
+
+ if (req_type == GET_LC_ENCL_DESCENDANTS) {
+ if (strstr(led->loc_code, loc_code) == NULL)
+ continue;
+ }
+
+ if (req_type == GET_LC_SINGLE_LOC_CODE) {
+ if (strcmp(led->loc_code, loc_code))
+ continue;
+ }
+
+ /* Push the data into TCE buffer */
+ bytes_sent = fsp_push_data_to_tce(led, out_data, total_size);
+
+ /* Advance the TCE pointer */
+ out_data += bytes_sent;
+ total_size += bytes_sent;
+ }
+
+ /* Enclosure LED list */
+ if (req_type == GET_LC_ENCLOSURES) {
+ list_for_each_safe(&encl_ledq, led, next, link) {
+
+ /* Push the data into TCE buffer */
+ bytes_sent = fsp_push_data_to_tce(led,
+ out_data, total_size);
+
+ /* Advance the TCE pointer */
+ out_data += bytes_sent;
+ total_size += bytes_sent;
+ }
+ }
+
+ /* Count from 'data' instead of 'out_data' */
+ total_size += 8;
+ memcpy(data, &total_size, sizeof(total_size));
+
+ header_size = OUTBUF_HEADER_SIZE;
+ memcpy(data + sizeof(total_size), &header_size, sizeof(header_size));
+
+ if (req_type == GET_LC_ENCL_DESCENDANTS)
+ flags = 0x8000;
+
+ memcpy(data + sizeof(total_size) + sizeof(header_size), &flags,
+ sizeof(flags));
+ msg = fsp_mkmsg(FSP_RSP_GET_LED_LIST, 3, 0,
+ PSI_DMA_LOC_COD_BUF, total_size);
+ if (!msg) {
+ prerror("Failed to allocate FSP_RSP_GET_LED_LIST.\n");
+ } else {
+ if (fsp_queue_msg(msg, fsp_freemsg)) {
+ fsp_freemsg(msg);
+ prerror("Failed to queue FSP_RSP_GET_LED_LIST\n");
+ }
+ }
+}
+
+/*
+ * FSP async command: FSP_CMD_GET_LED_LIST
+ *
+ * (1) FSP sends the list of location codes through inbound buffer
+ * (2) HV sends the status of those location codes through outbound buffer
+ *
+ * Inbound buffer data layout (loc code request structure)
+ *
+ * 2 bytes - Length of entire structure
+ * 2 bytes - Request type
+ * 1 byte - Raw length of location code
+ * 1 byte - Location code field size
+ * `Field size` bytes - NULL terminated ASCII location code string
+ */
+static void fsp_get_led_list(struct fsp_msg *msg)
+{
+ struct fsp_loc_code_req req;
+ u32 tce_token = fsp_msg_get_data_word(msg, 1);
+ void *buf;
+
+ /* Parse inbound buffer */
+ buf = fsp_inbound_buf_from_tce(tce_token);
+ if (!buf) {
+ struct fsp_msg *msg;
+ msg = fsp_mkmsg(FSP_RSP_GET_LED_LIST | FSP_STATUS_INVALID_DATA,
+ 0);
+ if (!msg) {
+ prerror("Failed to allocate FSP_RSP_GET_LED_LIST"
+ " | FSP_STATUS_INVALID_DATA\n");
+ } else {
+ if (fsp_queue_msg(msg, fsp_freemsg)) {
+ fsp_freemsg(msg);
+ prerror("Failed to queue "
+ "FSP_RSP_GET_LED_LIST |"
+ " FSP_STATUS_INVALID_DATA\n");
+ }
+ }
+ return;
+ }
+ memcpy(&req, buf, sizeof(req));
+
+ prlog(PR_TRACE, "Request for loc code list type 0x%04x LC=%s\n",
+ be16_to_cpu(req.req_type), req.loc_code);
+
+ fsp_ret_loc_code_list(be16_to_cpu(req.req_type), req.loc_code);
+}
+
+/*
+ * FSP async command: FSP_CMD_RET_LED_BUFFER
+ *
+ * With this command the FSP returns ownership of the outbound buffer
+ * that Sapphire used to pass the indicator list the previous time. That
+ * way the FSP tells Sapphire that it has consumed all the data present
+ * in the outbound buffer and Sapphire can reuse it for the next request.
+ */
+static void fsp_free_led_list_buf(struct fsp_msg *msg)
+{
+ u32 tce_token = fsp_msg_get_data_word(msg, 1);
+ u32 cmd = FSP_RSP_RET_LED_BUFFER;
+ struct fsp_msg *resp;
+
+ /* Token does not point to outbound buffer */
+ if (tce_token != PSI_DMA_LOC_COD_BUF) {
+ log_simple_error(&e_info(OPAL_RC_LED_BUFF),
+ "Invalid tce token from FSP\n");
+ cmd |= FSP_STATUS_GENERIC_ERROR;
+ resp = fsp_mkmsg(cmd, 0);
+ if (!resp) {
+ prerror("Failed to allocate FSP_RSP_RET_LED_BUFFER"
+ "| FSP_STATUS_GENERIC_ERROR\n");
+ return;
+ }
+
+ if (fsp_queue_msg(resp, fsp_freemsg)) {
+ fsp_freemsg(resp);
+ prerror("Failed to queue "
+ "RET_LED_BUFFER|ERROR\n");
+ }
+ return;
+ }
+
+ /* Unmap the location code DMA buffer */
+ fsp_tce_unmap(PSI_DMA_LOC_COD_BUF, PSI_DMA_LOC_COD_BUF_SZ);
+
+ resp = fsp_mkmsg(cmd, 0);
+ if (!resp) {
+ prerror("Failed to allocate FSP_RSP_RET_LED_BUFFER\n");
+ return;
+ }
+ if (fsp_queue_msg(resp, fsp_freemsg)) {
+ fsp_freemsg(resp);
+ prerror("Failed to queue FSP_RSP_RET_LED_BUFFER\n");
+ }
+}
+
+static void fsp_ret_led_state(char *loc_code)
+{
+ bool found = false;
+ u8 ind_state = 0;
+ u32 cmd = FSP_RSP_GET_LED_STATE;
+ struct fsp_led_data *led, *next;
+ struct fsp_msg *msg;
+
+ if (is_sai_loc_code(loc_code)) {
+ if (sai_data.state & OPAL_SLOT_LED_STATE_ON)
+ ind_state = FSP_IND_FAULT_ACTV;
+ found = true;
+ } else {
+ list_for_each_safe(&cec_ledq, led, next, link) {
+ if (strcmp(loc_code, led->loc_code))
+ continue;
+
+ /* Found the location code */
+ if (led->status & SPCN_LED_IDENTIFY_MASK)
+ ind_state |= FSP_IND_IDENTIFY_ACTV;
+ if (led->status & SPCN_LED_FAULT_MASK)
+ ind_state |= FSP_IND_FAULT_ACTV;
+
+ found = true;
+ break;
+ }
+ }
+
+ /* Location code not found */
+ if (!found) {
+ log_simple_error(&e_info(OPAL_RC_LED_LC),
+ "Could not find the location code LC=%s\n",
+ loc_code);
+ cmd |= FSP_STATUS_INVALID_LC;
+ ind_state = 0xff;
+ }
+
+ msg = fsp_mkmsg(cmd, 1, ind_state);
+ if (!msg) {
+ prerror("Couldn't alloc FSP_RSP_GET_LED_STATE\n");
+ return;
+ }
+
+ if (fsp_queue_msg(msg, fsp_freemsg)) {
+ fsp_freemsg(msg);
+ prerror("Couldn't queue FSP_RSP_GET_LED_STATE\n");
+ }
+}
+
+/*
+ * FSP async command: FSP_CMD_GET_LED_STATE
+ *
+ * With this command the FSP queries the state of any given LED
+ */
+static void fsp_get_led_state(struct fsp_msg *msg)
+{
+ struct fsp_get_ind_state_req req;
+ u32 tce_token = fsp_msg_get_data_word(msg, 1);
+ void *buf;
+
+ /* Parse the inbound buffer */
+ buf = fsp_inbound_buf_from_tce(tce_token);
+ if (!buf) {
+ struct fsp_msg *msg;
+ msg = fsp_mkmsg(FSP_RSP_GET_LED_STATE |
+ FSP_STATUS_INVALID_DATA, 0);
+ if (!msg) {
+ prerror("Failed to allocate FSP_RSP_GET_LED_STATE"
+ " | FSP_STATUS_INVALID_DATA\n");
+ return;
+ }
+ if (fsp_queue_msg(msg, fsp_freemsg)) {
+ fsp_freemsg(msg);
+ prerror("Failed to queue FSP_RSP_GET_LED_STATE"
+ " | FSP_STATUS_INVALID_DATA\n");
+ }
+ return;
+ }
+ memcpy(&req, buf, sizeof(req));
+
+ prlog(PR_TRACE, "%s: tce=0x%08x buf=%p rq.sz=%d rq.lc_len=%d"
+ " rq.fld_sz=%d LC: %02x %02x %02x %02x....\n", __func__,
+ tce_token, buf, req.size, req.lc_len, req.fld_sz,
+ req.loc_code[0], req.loc_code[1],
+ req.loc_code[2], req.loc_code[3]);
+
+ /* Bound check */
+ if (req.lc_len >= LOC_CODE_SIZE) {
+ log_simple_error(&e_info(OPAL_RC_LED_LC),
+ "Loc code too large in %s: %d bytes\n",
+ __func__, req.lc_len);
+ req.lc_len = LOC_CODE_SIZE - 1;
+ }
+ /* Ensure NULL termination */
+ req.loc_code[req.lc_len] = 0;
+
+ /* Do the deed */
+ fsp_ret_led_state(req.loc_code);
+}
+
+/*
+ * FSP async command: FSP_CMD_SET_LED_STATE
+ *
+ * With this command the FSP sets/resets the state of any given LED
+ */
+static void fsp_set_led_state(struct fsp_msg *msg)
+{
+ struct fsp_set_ind_state_req req;
+ struct fsp_led_data *led, *next;
+ u32 tce_token = fsp_msg_get_data_word(msg, 1);
+ bool command, state;
+ void *buf;
+ int rc;
+
+ /* Parse the inbound buffer */
+ buf = fsp_inbound_buf_from_tce(tce_token);
+ if (!buf) {
+ fsp_set_led_response(FSP_RSP_SET_LED_STATE |
+ FSP_STATUS_INVALID_DATA);
+ return;
+ }
+ memcpy(&req, buf, sizeof(req));
+
+ prlog(PR_TRACE, "%s: tce=0x%08x buf=%p rq.sz=%d rq.typ=0x%04x"
+ " rq.lc_len=%d rq.fld_sz=%d LC: %02x %02x %02x %02x....\n",
+ __func__, tce_token, buf, be16_to_cpu(req.size),
+ be16_to_cpu(req.req_type), req.lc_len, req.fld_sz,
+ req.loc_code[0], req.loc_code[1],
+ req.loc_code[2], req.loc_code[3]);
+
+ /* Bound check */
+ if (req.lc_len >= LOC_CODE_SIZE) {
+ log_simple_error(&e_info(OPAL_RC_LED_LC),
+ "Loc code too large in %s: %d bytes\n",
+ __func__, req.lc_len);
+ req.lc_len = LOC_CODE_SIZE - 1;
+ }
+ /* Ensure NULL termination */
+ req.loc_code[req.lc_len] = 0;
+
+ /* Decode command */
+ command = (req.ind_state & LOGICAL_IND_STATE_MASK) ?
+ LED_COMMAND_FAULT : LED_COMMAND_IDENTIFY;
+ state = (req.ind_state & ACTIVE_LED_STATE_MASK) ?
+ LED_STATE_ON : LED_STATE_OFF;
+
+ /* Handle requests */
+ switch (be16_to_cpu(req.req_type)) {
+ case SET_IND_ENCLOSURE:
+ list_for_each_safe(&cec_ledq, led, next, link) {
+ /* Only descendants of the same enclosure */
+ if (!strstr(led->loc_code, req.loc_code))
+ continue;
+
+ /* Skip the enclosure */
+ if (!strcmp(led->loc_code, req.loc_code))
+ continue;
+
+ rc = queue_led_state_change(led->loc_code, command,
+ state, SPCN_SRC_FSP, 0);
+ if (rc != 0)
+ fsp_set_led_response(FSP_RSP_SET_LED_STATE |
+ FSP_STATUS_GENERIC_ERROR);
+ }
+ break;
+ case SET_IND_SINGLE_LOC_CODE:
+ /* Set LED state for a single descendant LED */
+ rc = queue_led_state_change(req.loc_code,
+ command, state, SPCN_SRC_FSP, 0);
+ if (rc != 0)
+ fsp_set_led_response(FSP_RSP_SET_LED_STATE |
+ FSP_STATUS_GENERIC_ERROR);
+ break;
+ default:
+ fsp_set_led_response(FSP_RSP_SET_LED_STATE |
+ FSP_STATUS_NOT_SUPPORTED);
+ break;
+ }
+}
+
+/* Handle received indicator message from FSP */
+static bool fsp_indicator_message(u32 cmd_sub_mod, struct fsp_msg *msg)
+{
+ u32 cmd;
+ struct fsp_msg *resp;
+
+ /* LED support not available yet */
+ if (led_support != LED_STATE_PRESENT) {
+ log_simple_error(&e_info(OPAL_RC_LED_SUPPORT),
+ "Indicator message while LED support not"
+ " available yet\n");
+ return false;
+ }
+
+ switch (cmd_sub_mod) {
+ case FSP_CMD_GET_LED_LIST:
+ prlog(PR_TRACE, "FSP_CMD_GET_LED_LIST command received\n");
+ fsp_get_led_list(msg);
+ return true;
+ case FSP_CMD_RET_LED_BUFFER:
+ prlog(PR_TRACE, "FSP_CMD_RET_LED_BUFFER command received\n");
+ fsp_free_led_list_buf(msg);
+ return true;
+ case FSP_CMD_GET_LED_STATE:
+ prlog(PR_TRACE, "FSP_CMD_GET_LED_STATE command received\n");
+ fsp_get_led_state(msg);
+ return true;
+ case FSP_CMD_SET_LED_STATE:
+ prlog(PR_TRACE, "FSP_CMD_SET_LED_STATE command received\n");
+ fsp_set_led_state(msg);
+ return true;
+ /*
+ * FSP async sub commands which have not been implemented.
+ * For these async sub commands, log a trace message and ack
+ * the FSP with a generic error.
+ */
+ case FSP_CMD_GET_MTMS_LIST:
+ prlog(PR_TRACE, "FSP_CMD_GET_MTMS_LIST command received\n");
+ cmd = FSP_RSP_GET_MTMS_LIST;
+ break;
+ case FSP_CMD_RET_MTMS_BUFFER:
+ prlog(PR_TRACE, "FSP_CMD_RET_MTMS_BUFFER command received\n");
+ cmd = FSP_RSP_RET_MTMS_BUFFER;
+ break;
+ case FSP_CMD_SET_ENCL_MTMS:
+ prlog(PR_TRACE, "FSP_CMD_SET_MTMS command received\n");
+ cmd = FSP_RSP_SET_ENCL_MTMS;
+ break;
+ case FSP_CMD_CLR_INCT_ENCL:
+ prlog(PR_TRACE, "FSP_CMD_CLR_INCT_ENCL command received\n");
+ cmd = FSP_RSP_CLR_INCT_ENCL;
+ break;
+ case FSP_CMD_ENCL_MCODE_INIT:
+ prlog(PR_TRACE, "FSP_CMD_ENCL_MCODE_INIT command received\n");
+ cmd = FSP_RSP_ENCL_MCODE_INIT;
+ break;
+ case FSP_CMD_ENCL_MCODE_INTR:
+ prlog(PR_TRACE, "FSP_CMD_ENCL_MCODE_INTR command received\n");
+ cmd = FSP_RSP_ENCL_MCODE_INTR;
+ break;
+ case FSP_CMD_ENCL_POWR_TRACE:
+ prlog(PR_TRACE, "FSP_CMD_ENCL_POWR_TRACE command received\n");
+ cmd = FSP_RSP_ENCL_POWR_TRACE;
+ break;
+ case FSP_CMD_RET_ENCL_TRACE_BUFFER:
+ prlog(PR_TRACE, "FSP_CMD_RET_ENCL_TRACE_BUFFER command received\n");
+ cmd = FSP_RSP_RET_ENCL_TRACE_BUFFER;
+ break;
+ case FSP_CMD_GET_SPCN_LOOP_STATUS:
+ prlog(PR_TRACE, "FSP_CMD_GET_SPCN_LOOP_STATUS command received\n");
+ cmd = FSP_RSP_GET_SPCN_LOOP_STATUS;
+ break;
+ case FSP_CMD_INITIATE_LAMP_TEST:
+ /* XXX: FSP ACK not required for this sub command */
+ prlog(PR_TRACE, "FSP_CMD_INITIATE_LAMP_TEST command received\n");
+ return true;
+ default:
+ return false;
+ }
+ cmd |= FSP_STATUS_GENERIC_ERROR;
+ resp = fsp_mkmsg(cmd, 0);
+ if (!resp) {
+ prerror("Failed to allocate FSP_STATUS_GENERIC_ERROR\n");
+ return false;
+ }
+ if (fsp_queue_msg(resp, fsp_freemsg)) {
+ fsp_freemsg(resp);
+ prerror("Failed to queue FSP_STATUS_GENERIC_ERROR\n");
+ return false;
+ }
+ return true;
+}
+
+/* Indicator class client */
+static struct fsp_client fsp_indicator_client = {
+ .message = fsp_indicator_message,
+};
+
+
+static int fsp_opal_get_sai(__be64 *led_mask, __be64 *led_value)
+{
+ *led_mask |= cpu_to_be64(OPAL_SLOT_LED_STATE_ON << OPAL_SLOT_LED_TYPE_ATTN);
+ if (sai_data.state & OPAL_SLOT_LED_STATE_ON)
+ *led_value |= cpu_to_be64(OPAL_SLOT_LED_STATE_ON << OPAL_SLOT_LED_TYPE_ATTN);
+
+ return OPAL_SUCCESS;
+}
+
+static int fsp_opal_set_sai(uint64_t async_token, char *loc_code,
+ const u64 led_mask, const u64 led_value)
+{
+ int state = LED_STATE_OFF;
+
+ if (!((led_mask >> OPAL_SLOT_LED_TYPE_ATTN) & OPAL_SLOT_LED_STATE_ON))
+ return OPAL_PARAMETER;
+
+ if ((led_value >> OPAL_SLOT_LED_TYPE_ATTN) & OPAL_SLOT_LED_STATE_ON)
+ state = LED_STATE_ON;
+
+ return queue_led_state_change(loc_code, 0,
+ state, SPCN_SRC_OPAL, async_token);
+}
+
+/*
+ * fsp_opal_leds_get_ind (OPAL_LEDS_GET_INDICATOR)
+ *
+ * Argument Description Updated By
+ * -------- ----------- ----------
+ * loc_code Location code of the LEDs (Host)
+ * led_mask LED types whose status is available (OPAL)
+ * led_value Status of the available LED types (OPAL)
+ * max_led_type Maximum number of supported LED types (Host/OPAL)
+ *
+ * The host will pass the location code of the LED types (loc_code) and
+ * maximum number of LED types it understands (max_led_type). OPAL will
+ * update the 'led_mask' with set bits pointing to LED types whose status
+ * is available and updates the 'led_value' with actual status. OPAL checks
+ * the 'max_led_type' to understand whether the host is newer or older
+ * compared to itself. In the case where the OPAL is newer compared
+ * to host (OPAL's max_led_type > host's max_led_type), it will update
+ * led_mask and led_value according to max_led_type requested by the host.
+ * When the host is newer compared to the OPAL (host's max_led_type >
+ * OPAL's max_led_type), OPAL updates 'max_led_type' to the maximum
+ * number of LED type it understands and updates 'led_mask', 'led_value'
+ * based on that maximum value of LED types.
+ */
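+/*
+ * For example, a host that understands both supported LED types passes
+ * *max_led_type = 2 and, for a regular (non-SAI) location code, gets back
+ * led_mask with OPAL_SLOT_LED_STATE_ON set at the OPAL_SLOT_LED_TYPE_ID
+ * and OPAL_SLOT_LED_TYPE_FAULT positions, with led_value mirroring the
+ * LED's current SPCN status.
+ */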
+static int64_t fsp_opal_leds_get_ind(char *loc_code, __be64 *led_mask,
+ __be64 *led_value, __be64 *max_led_type)
+{
+ bool supported = true;
+ int64_t max;
+ int rc;
+ struct fsp_led_data *led;
+
+ /* FSP not present */
+ if (!fsp_present())
+ return OPAL_HARDWARE;
+
+ /* LED support not available */
+ if (led_support != LED_STATE_PRESENT)
+ return OPAL_HARDWARE;
+
+ max = be64_to_cpu(*max_led_type);
+
+ /* Adjust max LED type */
+ if (max > OPAL_SLOT_LED_TYPE_MAX) {
+ supported = false;
+ max = OPAL_SLOT_LED_TYPE_MAX;
+ *max_led_type = cpu_to_be64(max);
+ }
+
+ /* Invalid parameter */
+ if (max <= 0)
+ return OPAL_PARAMETER;
+
+ /* Get System attention indicator state */
+ if (is_sai_loc_code(loc_code)) {
+ rc = fsp_opal_get_sai(led_mask, led_value);
+ return rc;
+ }
+
+ /* LED not found */
+ led = fsp_find_cec_led(loc_code);
+ if (!led)
+ return OPAL_PARAMETER;
+
+ *led_mask = 0;
+ *led_value = 0;
+
+ /* Identify LED */
+ --max;
+ *led_mask |= cpu_to_be64(OPAL_SLOT_LED_STATE_ON << OPAL_SLOT_LED_TYPE_ID);
+ if (led->status & SPCN_LED_IDENTIFY_MASK)
+ *led_value |= cpu_to_be64(OPAL_SLOT_LED_STATE_ON << OPAL_SLOT_LED_TYPE_ID);
+
+ /* Fault LED */
+ if (!max)
+ return OPAL_SUCCESS;
+
+ --max;
+ *led_mask |= cpu_to_be64(OPAL_SLOT_LED_STATE_ON << OPAL_SLOT_LED_TYPE_FAULT);
+ if (led->status & SPCN_LED_FAULT_MASK)
+ *led_value |= cpu_to_be64(OPAL_SLOT_LED_STATE_ON << OPAL_SLOT_LED_TYPE_FAULT);
+
+ /* OPAL doesn't support all the LED type requested by payload */
+ if (!supported)
+ return OPAL_PARTIAL;
+
+ return OPAL_SUCCESS;
+}
+
+/*
+ * fsp_opal_leds_set_ind (OPAL_LEDS_SET_INDICATOR)
+ *
+ * Argument Description Updated By
+ * -------- ----------- ----------
+ * loc_code Location code of the LEDs (Host)
+ * led_mask LED types whose status will be updated (Host)
+ * led_value Requested status of various LED types (Host)
+ * max_led_type Maximum number of supported LED types (Host/OPAL)
+ *
+ * The host will pass the location code of the LED types, mask, value
+ * and maximum number of LED types it understands. OPAL will update
+ * LED status for all the LED types mentioned in the mask with their
+ * value mentioned. OPAL checks the 'max_led_type' to understand
+ * whether the host is newer or older compared to itself. In case where
+ * the OPAL is newer compared to the host (OPAL's max_led_type >
+ * host's max_led_type), it updates LED status based on max_led_type
+ * requested from the host. When the host is newer compared to the OPAL
+ * (host's max_led_type > OPAL's max_led_type), OPAL updates
+ * 'max_led_type' to the maximum number of LED type it understands and
+ * then it updates LED status based on that updated maximum value of LED
+ * types. The host needs to check the returned, updated value of max_led_type
+ * to figure out which part of its request got served and which parts got
+ * ignored.
+ */
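+/*
+ * For example, to request the identify LED on, the host passes led_mask
+ * and led_value both with OPAL_SLOT_LED_STATE_ON at the
+ * OPAL_SLOT_LED_TYPE_ID position; the call queues an SPCN command and
+ * returns OPAL_ASYNC_COMPLETION, with the final status delivered through
+ * opal_led_update_complete() against the supplied async_token.
+ */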
+static int64_t fsp_opal_leds_set_ind(uint64_t async_token,
+ char *loc_code, const u64 led_mask,
+ const u64 led_value, __be64 *max_led_type)
+{
+ bool supported = true;
+ int command, state, rc = OPAL_SUCCESS;
+ int64_t max;
+ struct fsp_led_data *led;
+
+ /* FSP not present */
+ if (!fsp_present())
+ return OPAL_HARDWARE;
+
+ /* LED support not available */
+ if (led_support != LED_STATE_PRESENT)
+ return OPAL_HARDWARE;
+
+ max = be64_to_cpu(*max_led_type);
+
+ /* Adjust max LED type */
+ if (max > OPAL_SLOT_LED_TYPE_MAX) {
+ supported = false;
+ max = OPAL_SLOT_LED_TYPE_MAX;
+ *max_led_type = cpu_to_be64(max);
+ }
+
+ /* Invalid parameter */
+ if (max <= 0)
+ return OPAL_PARAMETER;
+
+ /* Set System attention indicator state */
+ if (is_sai_loc_code(loc_code)) {
+ supported = true;
+ rc = fsp_opal_set_sai(async_token,
+ loc_code, led_mask, led_value);
+ goto success;
+ }
+
+ /* LED not found */
+ led = fsp_find_cec_led(loc_code);
+ if (!led)
+ return OPAL_PARAMETER;
+
+ /* Identify LED mask */
+ --max;
+
+ if ((led_mask >> OPAL_SLOT_LED_TYPE_ID) & OPAL_SLOT_LED_STATE_ON) {
+ supported = true;
+
+ command = LED_COMMAND_IDENTIFY;
+ state = LED_STATE_OFF;
+ if ((led_value >> OPAL_SLOT_LED_TYPE_ID)
+ & OPAL_SLOT_LED_STATE_ON)
+ state = LED_STATE_ON;
+
+ rc = queue_led_state_change(loc_code, command,
+ state, SPCN_SRC_OPAL, async_token);
+ }
+
+ if (!max)
+ goto success;
+
+ /* Fault LED mask */
+ --max;
+ if ((led_mask >> OPAL_SLOT_LED_TYPE_FAULT) & OPAL_SLOT_LED_STATE_ON) {
+ supported = true;
+
+ command = LED_COMMAND_FAULT;
+ state = LED_STATE_OFF;
+ if ((led_value >> OPAL_SLOT_LED_TYPE_FAULT)
+ & OPAL_SLOT_LED_STATE_ON)
+ state = LED_STATE_ON;
+
+ rc = queue_led_state_change(loc_code, command,
+ state, SPCN_SRC_OPAL, async_token);
+ }
+
+success:
+ /* Unsupported LED type */
+ if (!supported)
+ return OPAL_UNSUPPORTED;
+
+ if (rc == OPAL_SUCCESS)
+ rc = OPAL_ASYNC_COMPLETION;
+ else
+ rc = OPAL_INTERNAL_ERROR;
+
+ return rc;
+}
+
+/* Get LED node from device tree */
+static struct dt_node *dt_get_led_node(void)
+{
+ struct dt_node *pled;
+
+ if (!opal_node) {
+ prlog(PR_WARNING, "OPAL parent device node not available\n");
+ return NULL;
+ }
+
+ pled = dt_find_by_path(opal_node, DT_PROPERTY_LED_NODE);
+ if (!pled)
+ prlog(PR_WARNING, "Parent device node not available\n");
+
+ return pled;
+}
+
+/* Get System attention indicator location code from device tree */
+static void dt_get_sai_loc_code(void)
+{
+ struct dt_node *pled, *child;
+ const char *led_type = NULL;
+
+ memset(sai_data.loc_code, 0, LOC_CODE_SIZE);
+
+ pled = dt_get_led_node();
+ if (!pled)
+ return;
+
+ list_for_each(&pled->children, child, list) {
+ led_type = dt_prop_get(child, DT_PROPERTY_LED_TYPES);
+ if (!led_type)
+ continue;
+
+ if (strcmp(led_type, LED_TYPE_ATTENTION))
+ continue;
+
+ memcpy(sai_data.loc_code, child->name, LOC_CODE_SIZE - 1);
+
+ prlog(PR_TRACE, "SAI Location code = %s\n", sai_data.loc_code);
+ return;
+ }
+}
+
+/*
+ * create_led_device_nodes
+ *
+ * Creates the system parent LED device node and all individual
+ * child LED device nodes under it. This is called right before
+ * starting the payload (Linux) to ensure that the SPCN command
+ * sequence to fetch the LED location code list has finished, giving us
+ * a better chance of creating the device nodes.
+ */
+void create_led_device_nodes(void)
+{
+ const char *led_mode = NULL;
+ struct fsp_led_data *led, *next;
+ struct dt_node *pled, *cled;
+
+ if (!fsp_present())
+ return;
+
+ /* Make sure LED list read is completed */
+ while (led_support == LED_STATE_READING)
+ opal_run_pollers();
+
+ if (led_support == LED_STATE_ABSENT) {
+ prlog(PR_WARNING, "LED support not available, \
+ hence device tree nodes will not be created\n");
+ return;
+ }
+
+ /* Get LED node */
+ pled = dt_get_led_node();
+ if (!pled)
+ return;
+
+ /* Check if already populated (fast-reboot) */
+ if (dt_has_node_property(pled, "compatible", NULL))
+ return;
+ dt_add_property_strings(pled, "compatible", DT_PROPERTY_LED_COMPATIBLE);
+
+ led_mode = dt_prop_get(pled, DT_PROPERTY_LED_MODE);
+ if (!led_mode) {
+ prlog(PR_WARNING, "Unknown LED operating mode\n");
+ return;
+ }
+
+ /* LED child nodes */
+ list_for_each_safe(&cec_ledq, led, next, link) {
+ /* Duplicate LED location code */
+ if (dt_find_by_path(pled, led->loc_code)) {
+ prlog(PR_WARNING, "duplicate location code %s\n",
+ led->loc_code);
+ continue;
+ }
+
+ cled = dt_new(pled, led->loc_code);
+ if (!cled) {
+ prlog(PR_WARNING, "Child device node creation "
+ "failed\n");
+ continue;
+ }
+
+ if (!strcmp(led_mode, LED_MODE_LIGHT_PATH))
+ dt_add_property_strings(cled, DT_PROPERTY_LED_TYPES,
+ LED_TYPE_IDENTIFY,
+ LED_TYPE_FAULT);
+ else
+ dt_add_property_strings(cled, DT_PROPERTY_LED_TYPES,
+ LED_TYPE_IDENTIFY);
+ }
+}
+
+/*
+ * Process the received LED data from SPCN
+ *
+ * Every LED state record is added to the CEC list. If the location
+ * code is an enclosure type, it is added to the enclosure list as well.
+ *
+ */
+static void fsp_process_leds_data(u16 len)
+{
+ struct fsp_led_data *led_data = NULL;
+ void *buf = NULL;
+
+ /*
+ * Process the entire captured data from the last command
+ *
+ * The TCE mapped 'led_buffer' contains fsp_led_data structures
+ * one after the other, up to the total length 'len'.
+ *
+ */
+ buf = led_buffer;
+ while (len) {
+ size_t lc_len;
+ __be16 tmp;
+
+ /* Prepare */
+ led_data = zalloc(sizeof(struct fsp_led_data));
+ assert(led_data);
+
+ /* Resource ID */
+ buf_read(buf, __be16, &tmp);
+ led_data->rid = be16_to_cpu(tmp);
+ len -= sizeof(led_data->rid);
+
+ /* Location code length */
+ buf_read(buf, u8, &led_data->lc_len);
+ len -= sizeof(led_data->lc_len);
+
+ lc_len = led_data->lc_len;
+ if (lc_len == 0) {
+ free(led_data);
+ break;
+ }
+
+ if (lc_len >= LOC_CODE_SIZE)
+ lc_len = LOC_CODE_SIZE - 1;
+
+ /* Location code */
+ strncpy(led_data->loc_code, buf, lc_len);
+ led_data->loc_code[lc_len] = '\0';
+
+ buf += led_data->lc_len;
+ len -= led_data->lc_len;
+
+ /* Parameters */
+ buf_read(buf, __be16, &tmp);
+ led_data->parms = be16_to_cpu(tmp);
+ len -= sizeof(led_data->parms);
+
+ /* Status */
+ buf_read(buf, __be16, &tmp);
+ led_data->status = be16_to_cpu(tmp);
+ len -= sizeof(led_data->status);
+
+ /*
+ * This is an enclosure LED's location code; it needs to go
+ * into the enclosure LED list as well.
+ */
+ if (!strstr(led_data->loc_code, "-")) {
+ struct fsp_led_data *encl_led_data = NULL;
+ encl_led_data = zalloc(sizeof(struct fsp_led_data));
+ assert(encl_led_data);
+
+ /* copy over the original */
+ memcpy(encl_led_data, led_data, sizeof(struct fsp_led_data));
+
+ /* Add to the list of enclosure LEDs */
+ list_add_tail(&encl_ledq, &encl_led_data->link);
+ }
+
+ /* Push this onto the list */
+ list_add_tail(&cec_ledq, &led_data->link);
+ }
+}
+
+/* Replay the SPCN command */
+static void replay_spcn_cmd(u32 last_spcn_cmd)
+{
+ u32 cmd_hdr = 0;
+ int rc = -1;
+
+ /* Reached threshold */
+ if (replay == SPCN_REPLAY_THRESHOLD) {
+ replay = 0;
+ led_support = LED_STATE_ABSENT;
+ return;
+ }
+
+ replay++;
+ if (last_spcn_cmd == SPCN_MOD_PRS_LED_DATA_FIRST) {
+ cmd_hdr = SPCN_MOD_PRS_LED_DATA_FIRST << 24 |
+ SPCN_CMD_PRS << 16;
+ rc = fsp_queue_msg(fsp_mkmsg(FSP_CMD_SPCN_PASSTHRU, 4,
+ SPCN_ADDR_MODE_CEC_NODE,
+ cmd_hdr, 0,
+ PSI_DMA_LED_BUF),
+ fsp_read_leds_data_complete);
+ if (rc)
+ prlog(PR_ERR, "Replay SPCN_MOD_PRS_LED_DATA_FIRST"
+ " command could not be queued\n");
+ }
+
+ if (last_spcn_cmd == SPCN_MOD_PRS_LED_DATA_SUB) {
+ cmd_hdr = SPCN_MOD_PRS_LED_DATA_SUB << 24 | SPCN_CMD_PRS << 16;
+ rc = fsp_queue_msg(fsp_mkmsg(FSP_CMD_SPCN_PASSTHRU, 4,
+ SPCN_ADDR_MODE_CEC_NODE, cmd_hdr,
+ 0, PSI_DMA_LED_BUF),
+ fsp_read_leds_data_complete);
+ if (rc)
+ prlog(PR_ERR, "Replay SPCN_MOD_PRS_LED_DATA_SUB"
+ " command could not be queued\n");
+ }
+
+ /* Failed to queue MBOX message */
+ if (rc)
+ led_support = LED_STATE_ABSENT;
+}
+
+/*
+ * FSP message response handler for following SPCN LED commands
+ * which are used to fetch all of the LED data from SPCN
+ *
+ * 1. SPCN_MOD_PRS_LED_DATA_FIRST --> First 1KB of LED data
+ * 2. SPCN_MOD_PRS_LED_DATA_SUB --> Subsequent 1KB of LED data
+ *
+ * Once the SPCN_RSP_STATUS_SUCCESS response code has been received
+ * indicating the last batch of 1KB LED data is here, the list addition
+ * process is now complete and we enable LED support for FSP async commands
+ * and for OPAL interface.
+ */
+static void fsp_read_leds_data_complete(struct fsp_msg *msg)
+{
+ struct fsp_led_data *led, *next;
+ struct fsp_msg *resp = msg->resp;
+ u32 cmd_hdr = 0;
+ int rc = 0;
+
+ u32 msg_status = resp->word1 & 0xff00;
+ u32 led_status = (fsp_msg_get_data_word(resp, 1) >> 24) & 0xff;
+ u16 data_len = (u16)(fsp_msg_get_data_word(resp, 1) & 0xffff);
+
+ if (msg_status != FSP_STATUS_SUCCESS) {
+ log_simple_error(&e_info(OPAL_RC_LED_SUPPORT),
+ "FSP returned error %x LED not supported\n",
+ msg_status);
+ /* LED support not available */
+ led_support = LED_STATE_ABSENT;
+
+ fsp_freemsg(msg);
+ return;
+ }
+
+ /* SPCN command status */
+ switch (led_status) {
+ /* Last 1KB of LED data */
+ case SPCN_RSP_STATUS_SUCCESS:
+ prlog(PR_DEBUG, "SPCN_RSP_STATUS_SUCCESS: %d bytes received\n",
+ data_len);
+
+ led_support = LED_STATE_PRESENT;
+
+ /* Copy data to the local list */
+ fsp_process_leds_data(data_len);
+
+ /* LEDs captured on the system */
+ prlog(PR_DEBUG, "CEC LEDs captured on the system:\n");
+ list_for_each_safe(&cec_ledq, led, next, link) {
+ prlog(PR_DEBUG,
+ "rid: %x\t"
+ "len: %x "
+ "lcode: %-30s\t"
+ "parms: %04x\t"
+ "status: %04x\n",
+ led->rid,
+ led->lc_len,
+ led->loc_code,
+ led->parms,
+ led->status);
+ }
+
+ prlog(PR_DEBUG, "ENCL LEDs captured on the system:\n");
+ list_for_each_safe(&encl_ledq, led, next, link) {
+ prlog(PR_DEBUG,
+ "rid: %x\t"
+ "len: %x "
+ "lcode: %-30s\t"
+ "parms: %04x\t"
+ "status: %04x\n",
+ led->rid,
+ led->lc_len,
+ led->loc_code,
+ led->parms,
+ led->status);
+ }
+
+ break;
+
+ /* If more 1KB of LED data present */
+ case SPCN_RSP_STATUS_COND_SUCCESS:
+ prlog(PR_DEBUG, "SPCN_RSP_STATUS_COND_SUCCESS: %d bytes "
+ " received\n", data_len);
+
+ /* Copy data to the local list */
+ fsp_process_leds_data(data_len);
+
+ /* Fetch the remaining data from SPCN */
+ last_spcn_cmd = SPCN_MOD_PRS_LED_DATA_SUB;
+ cmd_hdr = SPCN_MOD_PRS_LED_DATA_SUB << 24 | SPCN_CMD_PRS << 16;
+ rc = fsp_queue_msg(fsp_mkmsg(FSP_CMD_SPCN_PASSTHRU, 4,
+ SPCN_ADDR_MODE_CEC_NODE,
+ cmd_hdr, 0, PSI_DMA_LED_BUF),
+ fsp_read_leds_data_complete);
+ if (rc) {
+ prlog(PR_ERR, "SPCN_MOD_PRS_LED_DATA_SUB command"
+ " could not be queued\n");
+
+ led_support = LED_STATE_ABSENT;
+ }
+ break;
+
+ /* Other expected error codes*/
+ case SPCN_RSP_STATUS_INVALID_RACK:
+ case SPCN_RSP_STATUS_INVALID_SLAVE:
+ case SPCN_RSP_STATUS_INVALID_MOD:
+ case SPCN_RSP_STATUS_STATE_PROHIBIT:
+ case SPCN_RSP_STATUS_UNKNOWN:
+ default:
+ /* Replay the previous SPCN command */
+ replay_spcn_cmd(last_spcn_cmd);
+ }
+ fsp_freemsg(msg);
+}
+
+/*
+ * Init the LED state
+ *
+ * This is called during the host boot process. This is the place where
+ * we figure out all the LEDs present on the system and their state, then
+ * create structures out of that information and populate two master lists:
+ * one for all the LEDs on the CEC and one for all the LEDs on the enclosures.
+ * The LED information contained in the lists caters both to various
+ * FSP initiated async commands and to POWERNV initiated OPAL calls. We need
+ * to make sure this initialization process is complete before allowing any
+ * requests on LEDs. This also needs to be called to re-fetch data from SPCN
+ * after any LED state has been updated.
+ */
+static void fsp_leds_query_spcn(void)
+{
+ struct fsp_led_data *led = NULL;
+ int rc = 0;
+
+ u32 cmd_hdr = SPCN_MOD_PRS_LED_DATA_FIRST << 24 | SPCN_CMD_PRS << 16;
+
+ /* Till the last batch of LED data */
+ last_spcn_cmd = 0;
+
+ /* Empty the lists */
+ while (!list_empty(&cec_ledq)) {
+ led = list_pop(&cec_ledq, struct fsp_led_data, link);
+ free(led);
+ }
+
+ while (!list_empty(&encl_ledq)) {
+ led = list_pop(&encl_ledq, struct fsp_led_data, link);
+ free(led);
+ }
+
+ /* Allocate buffer with alignment requirements */
+ if (led_buffer == NULL) {
+ led_buffer = memalign(TCE_PSIZE, PSI_DMA_LED_BUF_SZ);
+ if (!led_buffer)
+ return;
+ }
+
+ /* TCE mapping - will not unmap */
+ fsp_tce_map(PSI_DMA_LED_BUF, led_buffer, PSI_DMA_LED_BUF_SZ);
+
+ /* Request the first 1KB of LED data */
+ last_spcn_cmd = SPCN_MOD_PRS_LED_DATA_FIRST;
+ rc = fsp_queue_msg(fsp_mkmsg(FSP_CMD_SPCN_PASSTHRU, 4,
+ SPCN_ADDR_MODE_CEC_NODE, cmd_hdr, 0,
+ PSI_DMA_LED_BUF), fsp_read_leds_data_complete);
+ if (rc)
+ prlog(PR_ERR,
+ "SPCN_MOD_PRS_LED_DATA_FIRST command could"
+ " not be queued\n");
+ else /* Initiated LED list fetch MBOX command */
+ led_support = LED_STATE_READING;
+}
+
+/* Init the LED subsystem at boot time */
+void fsp_led_init(void)
+{
+ led_buffer = NULL;
+
+ if (!fsp_present())
+ return;
+
+ /* Init the master lists */
+ list_head_init(&cec_ledq);
+ list_head_init(&encl_ledq);
+ list_head_init(&spcn_cmdq);
+
+ fsp_leds_query_spcn();
+
+ loc_code_list_buffer = memalign(TCE_PSIZE, PSI_DMA_LOC_COD_BUF_SZ);
+ if (loc_code_list_buffer == NULL)
+ prerror("ERROR: Unable to allocate loc_code_list_buffer!\n");
+
+ prlog(PR_TRACE, "Init completed\n");
+
+ /* Get System attention indicator state */
+ dt_get_sai_loc_code();
+ fsp_get_sai();
+
+ /* Handle FSP initiated async LED commands */
+ fsp_register_client(&fsp_indicator_client, FSP_MCLASS_INDICATOR);
+ prlog(PR_TRACE, "FSP async command client registered\n");
+
+ /* Register for SAI update notification */
+ sysparam_add_update_notifier(sai_update_notification);
+
+ opal_register(OPAL_LEDS_GET_INDICATOR, fsp_opal_leds_get_ind, 4);
+ opal_register(OPAL_LEDS_SET_INDICATOR, fsp_opal_leds_set_ind, 5);
+ prlog(PR_TRACE, "LED OPAL interface registered\n");
+}
diff --git a/roms/skiboot/hw/fsp/fsp-mem-err.c b/roms/skiboot/hw/fsp/fsp-mem-err.c
new file mode 100644
index 000000000..2e3e65401
--- /dev/null
+++ b/roms/skiboot/hw/fsp/fsp-mem-err.c
@@ -0,0 +1,401 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Sometimes some memory needs to go and sit in the naughty corner
+ *
+ * Copyright 2013-2019 IBM Corp.
+ */
+
+#define pr_fmt(fmt) "FSPMEMERR: " fmt
+#include <skiboot.h>
+#include <opal.h>
+#include <opal-msg.h>
+#include <lock.h>
+#include <fsp.h>
+#include <errorlog.h>
+
+/* FSP sends real address of 4K memory page. */
+#define MEM_ERR_PAGE_SIZE_4K (1UL << 12)
+
+/* Maximum number of error events to hold until Linux consumes them. */
+#define MERR_MAX_RECORD 1024
+
+struct fsp_mem_err_node {
+ struct list_node list;
+ struct OpalMemoryErrorData data;
+};
+
+static LIST_HEAD(merr_free_list);
+static LIST_HEAD(mem_error_list);
+/*
+ * This lock protects concurrent updates of the merr_free_list and
+ * mem_error_list lists.
+ */
+static struct lock mem_err_lock = LOCK_UNLOCKED;
+
+DEFINE_LOG_ENTRY(OPAL_RC_MEM_ERR_RES, OPAL_PLATFORM_ERR_EVT, OPAL_MEM_ERR,
+ OPAL_MISC_SUBSYSTEM, OPAL_PREDICTIVE_ERR_GENERAL,
+ OPAL_NA);
+
+DEFINE_LOG_ENTRY(OPAL_RC_MEM_ERR_DEALLOC, OPAL_PLATFORM_ERR_EVT, OPAL_MEM_ERR,
+ OPAL_MISC_SUBSYSTEM, OPAL_PREDICTIVE_ERR_GENERAL,
+ OPAL_NA);
+
+static bool send_response_to_fsp(u32 cmd_sub_mod)
+{
+ struct fsp_msg *rsp;
+ int rc = -ENOMEM;
+
+ rsp = fsp_mkmsg(cmd_sub_mod, 0);
+ if (rsp)
+ rc = fsp_queue_msg(rsp, fsp_freemsg);
+ if (rc) {
+ fsp_freemsg(rsp);
+ /* XXX Generate error logs */
+ prerror("Error %d queueing FSP memory error reply\n", rc);
+ return false;
+ }
+ return true;
+}
+
+/*
+ * Queue up the memory error message for delivery.
+ *
+ * queue_event_for_delivery gets called from two places:
+ * 1) from queue_mem_err_node when a new FSP memory error is available and
+ * 2) from the completion callback indicating that Linux has consumed a message.
+ *
+ * TODO:
+ * There is a chance that we may not get a free slot to queue our event
+ * for delivery to Linux during either of the above invocations. In that
+ * case we end up holding events until the next FSP memory error comes in.
+ * We need to address this case either here OR fix up the messaging
+ * infrastructure to make sure at least one slot is always available per
+ * message type.
+ *
+ * XXX: BenH: I changed the msg infrastructure to attempt an allocation
+ * in that case, at least until we clarify a bit better how
+ * we want to handle things.
+ */
+static void queue_event_for_delivery(void *data __unused, int status __unused)
+{
+ struct fsp_mem_err_node *entry;
+ uint64_t *merr_data;
+ int rc;
+
+ lock(&mem_err_lock);
+ entry = list_pop(&mem_error_list, struct fsp_mem_err_node, list);
+ unlock(&mem_err_lock);
+
+ if (!entry)
+ return;
+
+ /*
+ * struct OpalMemoryErrorData is of (4 * 64 bits) size and well packed
+ * structure. Hence use uint64_t pointer to pass entire structure
+ * using 4 params in generic message format.
+ */
+ merr_data = (uint64_t *)&entry->data;
+
+ /* queue up for delivery */
+ rc = opal_queue_msg(OPAL_MSG_MEM_ERR, NULL, queue_event_for_delivery,
+ cpu_to_be64(merr_data[0]),
+ cpu_to_be64(merr_data[1]),
+ cpu_to_be64(merr_data[2]),
+ cpu_to_be64(merr_data[3]));
+ lock(&mem_err_lock);
+ if (rc) {
+ /*
+ * Failed to queue up the event for delivery. No free slot
+ * available. There is a chance that we are trying to queue
+ * up multiple event at the same time. We may already have
+ * at least one event queued up, in that case we will be
+ * called again through completion callback and we should
+ * be able to grab empty slot then.
+ *
+ * For now, put this node back on mem_error_list.
+ */
+ list_add(&mem_error_list, &entry->list);
+ } else
+ list_add(&merr_free_list, &entry->list);
+ unlock(&mem_err_lock);
+}
+
+static int queue_mem_err_node(struct OpalMemoryErrorData *merr_evt)
+{
+ struct fsp_mem_err_node *entry;
+
+ lock(&mem_err_lock);
+ entry = list_pop(&merr_free_list, struct fsp_mem_err_node, list);
+ if (!entry) {
+ printf("Failed to queue up memory error event.\n");
+ unlock(&mem_err_lock);
+ return -ENOMEM;
+ }
+
+ entry->data = *merr_evt;
+ list_add(&mem_error_list, &entry->list);
+ unlock(&mem_err_lock);
+
+ /* Queue up the event for delivery to OS. */
+ queue_event_for_delivery(NULL, OPAL_SUCCESS);
+ return 0;
+}
+
+/* Check if memory resilience event for same address already exists. */
+static bool is_resilience_event_exist(u64 paddr)
+{
+ struct fsp_mem_err_node *entry;
+ struct OpalMemoryErrorData *merr_evt;
+ int found = 0;
+
+ lock(&mem_err_lock);
+ list_for_each(&mem_error_list, entry, list) {
+ merr_evt = &entry->data;
+ if ((merr_evt->type == OPAL_MEM_ERR_TYPE_RESILIENCE) &&
+ (be64_to_cpu(merr_evt->u.resilience.physical_address_start)
+ == paddr)) {
+ found = 1;
+ break;
+ }
+ }
+ unlock(&mem_err_lock);
+ return !!found;
+}
+
+/*
+ * Handle a Memory Resilience error message.
+ * Section 28.2 of Hypervisor to FSP Mailbox Interface Specification.
+ *
+ * The flow for a Memory Resilience Event is:
+ * 1. The PRD component in the FSP gets a recoverable attention from hardware
+ * when there is a correctable/uncorrectable memory error and a page needs
+ * to be freed up.
+ * 2. PRD sends a Memory Resilience Command to the hypervisor with the real
+ * address of the 4K memory page in which the error occurred.
+ * 3. The hypervisor acknowledges with a status immediately. Immediate
+ * acknowledgment doesn't require the freeing of the page to be completed.
+ */
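+/*
+ * The immediate acknowledgement is the FSP_RSP_MEM_RES response queued
+ * below; the affected page is dealt with later by the OS once it consumes
+ * the OPAL_MSG_MEM_ERR message carrying this event.
+ */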
+static bool handle_memory_resilience(u32 cmd_sub_mod, u64 paddr)
+{
+ int rc = 0;
+ struct OpalMemoryErrorData mem_err_evt;
+ struct errorlog *buf;
+
+ memset(&mem_err_evt, 0, sizeof(struct OpalMemoryErrorData));
+ /* Check arguments */
+ if (paddr == 0) {
+ prerror("memory resilience: Invalid real address.\n");
+ return send_response_to_fsp(FSP_RSP_MEM_RES |
+ FSP_STATUS_GENERIC_ERROR);
+ }
+
+ /* Check if event already exist for same address. */
+ if (is_resilience_event_exist(paddr))
+ goto send_response;
+
+ /* Populate an event. */
+ mem_err_evt.version = OpalMemErr_V1;
+ mem_err_evt.type = OPAL_MEM_ERR_TYPE_RESILIENCE;
+
+ switch (cmd_sub_mod) {
+ case FSP_CMD_MEM_RES_CE:
+ /*
+ * Should we keep a counter for corrected errors in
+ * sapphire OR let Linux (PowerNV) handle it?
+ *
+ * For now, send corrected errors to Linux and let
+ * Linux handle corrected error thresholding.
+ */
+ mem_err_evt.flags |= cpu_to_be16(OPAL_MEM_CORRECTED_ERROR);
+ mem_err_evt.u.resilience.resil_err_type =
+ OPAL_MEM_RESILIENCE_CE;
+ break;
+ case FSP_CMD_MEM_RES_UE:
+ mem_err_evt.u.resilience.resil_err_type =
+ OPAL_MEM_RESILIENCE_UE;
+ break;
+ case FSP_CMD_MEM_RES_UE_SCRB:
+ mem_err_evt.u.resilience.resil_err_type =
+ OPAL_MEM_RESILIENCE_UE_SCRUB;
+ break;
+ }
+ mem_err_evt.u.resilience.physical_address_start = cpu_to_be64(paddr);
+ mem_err_evt.u.resilience.physical_address_end =
+ cpu_to_be64(paddr + MEM_ERR_PAGE_SIZE_4K);
+
+ /* Queue up the event and inform OS about it. */
+ rc = queue_mem_err_node(&mem_err_evt);
+
+send_response:
+ /* Queue up an OK response to the resilience message itself */
+ if (!rc)
+ return send_response_to_fsp(FSP_RSP_MEM_RES);
+ else {
+ buf = opal_elog_create(&e_info(OPAL_RC_MEM_ERR_RES), 0);
+ log_append_msg(buf,
+ "OPAL_MEM_ERR: Cannot queue up memory "
+ "resilience error event to the OS");
+ log_add_section(buf, OPAL_ELOG_SEC_DESC);
+ log_append_data(buf, (char *) &mem_err_evt,
+ sizeof(struct OpalMemoryErrorData));
+ log_commit(buf);
+ return false;
+ }
+}
+
+/* Update the existing event entry if a match is found. */
+static bool update_memory_deallocation_event(u64 paddr_start, u64 paddr_end)
+{
+ struct fsp_mem_err_node *entry;
+ struct OpalMemoryErrorData *merr_evt;
+ int found = 0;
+
+ lock(&mem_err_lock);
+ list_for_each(&mem_error_list, entry, list) {
+ merr_evt = &entry->data;
+ if ((merr_evt->type == OPAL_MEM_ERR_TYPE_DYN_DALLOC) &&
+ (be64_to_cpu(merr_evt->u.dyn_dealloc.physical_address_start)
+ == paddr_start)) {
+ found = 1;
+ if (be64_to_cpu(merr_evt->u.dyn_dealloc.physical_address_end)
+ < paddr_end)
+ merr_evt->u.dyn_dealloc.physical_address_end =
+ cpu_to_be64(paddr_end);
+ break;
+ }
+ }
+ unlock(&mem_err_lock);
+ return !!found;
+}
+
+/*
+ * Handle dynamic memory deallocation message.
+ *
+ * When a condition occurs in which we need to do a large scale memory
+ * deallocation, PRD will send a starting and ending address of an area of
+ * memory to the hypervisor. The hypervisor then needs to use this to
+ * deallocate all pages between and including those addresses.
+ *
+ */
+static bool handle_memory_deallocation(u64 paddr_start, u64 paddr_end)
+{
+ int rc = 0;
+ u8 err = 0;
+ struct OpalMemoryErrorData mem_err_evt;
+ struct errorlog *buf;
+
+ memset(&mem_err_evt, 0, sizeof(struct OpalMemoryErrorData));
+ /* Check arguments */
+ if ((paddr_start == 0) || (paddr_end == 0)) {
+ prerror("memory deallocation: Invalid "
+ "starting/ending real address.\n");
+ err = FSP_STATUS_GENERIC_ERROR;
+ }
+
+ /* If we had an error, send response to fsp and return */
+ if (err)
+ return send_response_to_fsp(FSP_RSP_MEM_DYN_DEALLOC | err);
+
+ /*
+ * The FSP can send dynamic memory deallocation messages multiple times
+ * for the same address/address ranges. Hence check and update if we
+ * already have the same event queued.
+ */
+ if (update_memory_deallocation_event(paddr_start, paddr_end))
+ goto send_response;
+
+ /* Populate a new event. */
+ mem_err_evt.version = OpalMemErr_V1;
+ mem_err_evt.type = OPAL_MEM_ERR_TYPE_DYN_DALLOC;
+ mem_err_evt.u.dyn_dealloc.dyn_err_type =
+ OPAL_MEM_DYNAMIC_DEALLOC;
+ mem_err_evt.u.dyn_dealloc.physical_address_start = cpu_to_be64(paddr_start);
+ mem_err_evt.u.dyn_dealloc.physical_address_end = cpu_to_be64(paddr_end);
+
+ /* Queue up the event and inform OS about it. */
+ rc = queue_mem_err_node(&mem_err_evt);
+
+send_response:
+ /* Queue up an OK response to the memory deallocation message itself */
+ if (!rc)
+ return send_response_to_fsp(FSP_RSP_MEM_DYN_DEALLOC);
+ else {
+ buf = opal_elog_create(&e_info(OPAL_RC_MEM_ERR_DEALLOC), 0);
+ log_append_msg(buf,
+ "OPAL_MEM_ERR: Cannot queue up memory "
+ "deallocation error event to the OS");
+ log_add_section(buf, OPAL_ELOG_SEC_DESC);
+ log_append_data(buf, (char *)&mem_err_evt,
+ sizeof(struct OpalMemoryErrorData));
+ log_commit(buf);
+ return false;
+ }
+}
+
+/* Receive memory error messages and handle them. */
+static bool fsp_mem_err_msg(u32 cmd_sub_mod, struct fsp_msg *msg)
+{
+ u64 paddr_start, paddr_end;
+
+ printf("Received 0x%08ux command\n", cmd_sub_mod);
+ switch (cmd_sub_mod) {
+ case FSP_CMD_MEM_RES_CE:
+ case FSP_CMD_MEM_RES_UE:
+ case FSP_CMD_MEM_RES_UE_SCRB:
+ /*
+ * We get the memory resilience command from the FSP for
+ * correctable/uncorrectable/scrub UE errors with the real
+ * address of the 4K memory page in which the error occurred.
+ */
+ paddr_start = be64_to_cpu(*((__be64 *)&msg->data.bytes[0]));
+ printf("Got memory resilience error message for "
+ "paddr=0x%016llux\n", paddr_start);
+ return handle_memory_resilience(cmd_sub_mod, paddr_start);
+ case FSP_CMD_MEM_DYN_DEALLOC:
+ paddr_start = be64_to_cpu(*((__be64 *)&msg->data.bytes[0]));
+ paddr_end = be64_to_cpu(*((__be64 *)&msg->data.bytes[8]));
+ printf("Got dynamic memory deallocation message: "
+ "paddr_start=0x%016llux, paddr_end=0x%016llux\n",
+ paddr_start, paddr_end);
+ return handle_memory_deallocation(paddr_start, paddr_end);
+ }
+ return false;
+}
+
+/*
+ * Pre-allocate memory to hold a maximum of MERR_MAX_RECORD memory error
+ * events until Linux consumes them.
+ */
+static int init_merr_free_list(uint32_t num_entries)
+{
+ struct fsp_mem_err_node *entry;
+ int i;
+
+ entry = zalloc(sizeof(struct fsp_mem_err_node) * num_entries);
+ if (!entry)
+ return -ENOMEM;
+
+ for (i = 0; i < num_entries; ++i, entry++)
+ list_add_tail(&merr_free_list, &entry->list);
+
+ return 0;
+}
+
+static struct fsp_client fsp_mem_err_client = {
+ .message = fsp_mem_err_msg,
+};
+
+void fsp_memory_err_init(void)
+{
+ int rc;
+
+ printf("Intializing fsp memory handling.\n");
+ /* If we have an FSP, register for notifications */
+ if (!fsp_present())
+ return;
+
+ /* Pre-allocate memory for MERR_MAX_RECORD records */
+ rc = init_merr_free_list(MERR_MAX_RECORD);
+ if (rc < 0)
+ return;
+
+ fsp_register_client(&fsp_mem_err_client, FSP_MCLASS_MEMORY_ERR);
+}
diff --git a/roms/skiboot/hw/fsp/fsp-nvram.c b/roms/skiboot/hw/fsp/fsp-nvram.c
new file mode 100644
index 000000000..aa17cb5e7
--- /dev/null
+++ b/roms/skiboot/hw/fsp/fsp-nvram.c
@@ -0,0 +1,424 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Read/Write NVRAM from/to FSP
+ *
+ * Copyright 2013-2017 IBM Corp.
+ */
+
+#include <skiboot.h>
+#include <fsp.h>
+#include <opal.h>
+#include <lock.h>
+#include <device.h>
+#include <errorlog.h>
+
+/*
+ * The FSP NVRAM API operates in "blocks" of 4K. It is entirely exposed
+ * to the OS via the OPAL APIs.
+ *
+ * In order to avoid dealing with complicated read/modify/write state
+ * machines (and added issues related to FSP failover in the middle)
+ * we keep a memory copy of the entire nvram which we load at boot
+ * time. We save only modified blocks.
+ *
+ * To limit the amount of memory used by the nvram image, we limit
+ * how much nvram we support to NVRAM_SIZE. Additionally, this limit
+ * of 1M is the maximum that the CHRP/PAPR nvram partition format
+ * supports for a partition entry.
+ *
+ * (Q: should we save the whole thing in case of FSP failover ?)
+ *
+ * The nvram is expected to comply with the CHRP/PAPR defined format,
+ * and specifically contain a System partition (ID 0x70) named "common"
+ * with configuration variables for the bootloader and a FW private
+ * partition for future use by skiboot.
+ *
+ * If the partition layout appears broken or lacks one of the above
+ * partitions, we reformat the entire nvram at boot time.
+ *
+ * We do not exploit the ability of the FSP to store a checksum. This
+ * is documented as possibly going away. The CHRP format for nvram
+ * that Linux uses has its own (though weak) checksum mechanism already.
+ *
+ */
+
+#define NVRAM_BLKSIZE 0x1000
+
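+/*
+ * Write descriptor handed to the FSP through PSI_DMA_NVRAM_TRIPL: the DMA
+ * address of the data, the starting 4K block offset within the nvram image
+ * and the number of 4K blocks to write (see fsp_nvram_send_write()).
+ */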
+struct nvram_triplet {
+ __be64 dma_addr;
+ __be32 blk_offset;
+ __be32 blk_count;
+} __packed;
+
+#define NVRAM_FLAG_CLEAR_WPEND 0x80000000
+
+enum nvram_state {
+ NVRAM_STATE_CLOSED,
+ NVRAM_STATE_OPENING,
+ NVRAM_STATE_BROKEN,
+ NVRAM_STATE_OPEN,
+ NVRAM_STATE_ABSENT,
+};
+
+static void *fsp_nvram_image;
+static uint32_t fsp_nvram_size;
+static struct lock fsp_nvram_lock = LOCK_UNLOCKED;
+static struct fsp_msg *fsp_nvram_msg;
+static uint32_t fsp_nvram_dirty_start;
+static uint32_t fsp_nvram_dirty_end;
+static bool fsp_nvram_was_read;
+static struct nvram_triplet fsp_nvram_triplet __align(0x1000);
+static enum nvram_state fsp_nvram_state = NVRAM_STATE_CLOSED;
+
+DEFINE_LOG_ENTRY(OPAL_RC_NVRAM_INIT, OPAL_PLATFORM_ERR_EVT , OPAL_NVRAM,
+ OPAL_MISC_SUBSYSTEM, OPAL_PREDICTIVE_ERR_GENERAL,
+ OPAL_NA);
+
+DEFINE_LOG_ENTRY(OPAL_RC_NVRAM_OPEN, OPAL_PLATFORM_ERR_EVT, OPAL_NVRAM,
+ OPAL_MISC_SUBSYSTEM, OPAL_PREDICTIVE_ERR_GENERAL,
+ OPAL_NA);
+
+DEFINE_LOG_ENTRY(OPAL_RC_NVRAM_SIZE, OPAL_PLATFORM_ERR_EVT, OPAL_NVRAM,
+ OPAL_MISC_SUBSYSTEM, OPAL_PREDICTIVE_ERR_GENERAL,
+ OPAL_NA);
+
+DEFINE_LOG_ENTRY(OPAL_RC_NVRAM_READ, OPAL_PLATFORM_ERR_EVT, OPAL_NVRAM,
+ OPAL_MISC_SUBSYSTEM, OPAL_PREDICTIVE_ERR_GENERAL,
+ OPAL_NA);
+
+DEFINE_LOG_ENTRY(OPAL_RC_NVRAM_WRITE, OPAL_PLATFORM_ERR_EVT, OPAL_NVRAM,
+ OPAL_MISC_SUBSYSTEM, OPAL_PREDICTIVE_ERR_GENERAL,
+ OPAL_NA);
+
+static void fsp_nvram_send_write(void);
+
+static void fsp_nvram_wr_complete(struct fsp_msg *msg)
+{
+ struct fsp_msg *resp = msg->resp;
+ uint8_t rc;
+
+ lock(&fsp_nvram_lock);
+ fsp_nvram_msg = NULL;
+
+ /* Check for various errors. If an error occurred,
+ * we generally assume the nvram is completely dirty
+ * but we won't trigger a new write until we get
+ * either a new attempt at writing, or an FSP reset
+ * reload (TODO)
+ */
+ if (!resp || resp->state != fsp_msg_response)
+ goto fail_dirty;
+ rc = (msg->word1 >> 8) & 0xff;
+ switch(rc) {
+ case 0:
+ case 0x44:
+ /* Sync to secondary required... XXX */
+ case 0x45:
+ break;
+ case 0xef:
+ /* Sync to secondary failed, let's ignore that for now,
+ * maybe when (if) we handle redundant FSPs ...
+ */
+ prerror("FSP: NVRAM sync to secondary failed\n");
+ break;
+ default:
+ log_simple_error(&e_info(OPAL_RC_NVRAM_WRITE),
+ "FSP: NVRAM write return error 0x%02x\n", rc);
+ goto fail_dirty;
+ }
+ fsp_freemsg(msg);
+ if (fsp_nvram_dirty_start <= fsp_nvram_dirty_end)
+ fsp_nvram_send_write();
+ unlock(&fsp_nvram_lock);
+ return;
+ fail_dirty:
+ fsp_nvram_dirty_start = 0;
+ fsp_nvram_dirty_end = fsp_nvram_size - 1;
+ fsp_freemsg(msg);
+ unlock(&fsp_nvram_lock);
+}
+
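+/*
+ * Send a single write covering the current dirty range and reset the
+ * dirty markers to "nothing dirty" (start > end). Must be called with
+ * fsp_nvram_lock held; if the write later fails, the completion
+ * handler marks the whole nvram dirty again.
+ */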
+static void fsp_nvram_send_write(void)
+{
+ uint32_t start = fsp_nvram_dirty_start;
+ uint32_t end = fsp_nvram_dirty_end;
+ uint32_t count;
+
+ if (start > end || fsp_nvram_state != NVRAM_STATE_OPEN)
+ return;
+ count = (end - start) / NVRAM_BLKSIZE + 1;
+ fsp_nvram_triplet.dma_addr = cpu_to_be64(PSI_DMA_NVRAM_BODY + start);
+ fsp_nvram_triplet.blk_offset = cpu_to_be32(start / NVRAM_BLKSIZE);
+ fsp_nvram_triplet.blk_count = cpu_to_be32(count);
+ fsp_nvram_msg = fsp_mkmsg(FSP_CMD_WRITE_VNVRAM, 6,
+ 0, PSI_DMA_NVRAM_TRIPL, 1,
+ NVRAM_FLAG_CLEAR_WPEND, 0, 0);
+ if (fsp_queue_msg(fsp_nvram_msg, fsp_nvram_wr_complete)) {
+ fsp_freemsg(fsp_nvram_msg);
+ fsp_nvram_msg = NULL;
+ log_simple_error(&e_info(OPAL_RC_NVRAM_WRITE),
+ "FSP: Error queueing nvram update\n");
+ return;
+ }
+ fsp_nvram_dirty_start = fsp_nvram_size;
+ fsp_nvram_dirty_end = 0;
+}
+
+static void fsp_nvram_rd_complete(struct fsp_msg *msg)
+{
+ int64_t rc;
+
+ lock(&fsp_nvram_lock);
+
+ /* Read complete, check status. What to do if the read fails?
+ *
+ * Well, there could be various reasons, such as an FSP reboot
+ * at the wrong time, but there is really not much we can do,
+ * so for now we just mark the nvram as closed and attempt a
+ * re-open and re-read whenever the OS tries to access it.
+ */
+ rc = (msg->resp->word1 >> 8) & 0xff;
+ fsp_nvram_msg = NULL;
+ fsp_freemsg(msg);
+ if (rc) {
+ prerror("FSP: NVRAM read failed, will try again later\n");
+ fsp_nvram_state = NVRAM_STATE_CLOSED;
+ } else {
+ /* nvram was read once, no need to do it ever again */
+ fsp_nvram_was_read = true;
+ fsp_nvram_state = NVRAM_STATE_OPEN;
+
+ /* XXX Here we should look for nvram settings that concern
+ * us such as guest kernel arguments etc...
+ */
+ }
+ unlock(&fsp_nvram_lock);
+ nvram_read_complete(fsp_nvram_state == NVRAM_STATE_OPEN);
+ if (fsp_nvram_state != NVRAM_STATE_OPEN)
+ log_simple_error(&e_info(OPAL_RC_NVRAM_INIT),
+ "FSP: NVRAM not read, skipping init\n");
+}
+
+static void fsp_nvram_send_read(void)
+{
+ fsp_nvram_msg = fsp_mkmsg(FSP_CMD_READ_VNVRAM, 4,
+ 0, PSI_DMA_NVRAM_BODY, 0,
+ fsp_nvram_size / NVRAM_BLKSIZE);
+ if (fsp_queue_msg(fsp_nvram_msg, fsp_nvram_rd_complete)) {
+ /* If the nvram read fails to queue, we mark ourselves
+ * closed. Shouldn't have happened anyway. Not much else
+ * we can do.
+ */
+ fsp_nvram_state = NVRAM_STATE_CLOSED;
+ fsp_freemsg(fsp_nvram_msg);
+ fsp_nvram_msg = NULL;
+ log_simple_error(&e_info(OPAL_RC_NVRAM_READ),
+ "FSP: Error queueing nvram read\n");
+ return;
+ }
+}
+
+static void fsp_nvram_open_complete(struct fsp_msg *msg)
+{
+ int8_t rc;
+
+ lock(&fsp_nvram_lock);
+
+ /* Open complete, check status */
+ rc = (msg->resp->word1 >> 8) & 0xff;
+ fsp_nvram_msg = NULL;
+ fsp_freemsg(msg);
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_NVRAM_OPEN),
+ "FSP: NVRAM open failed, FSP error 0x%02x\n", rc);
+ goto failed;
+ }
+ if (fsp_nvram_was_read)
+ fsp_nvram_state = NVRAM_STATE_OPEN;
+ else
+ fsp_nvram_send_read();
+ unlock(&fsp_nvram_lock);
+ return;
+ failed:
+ fsp_nvram_state = NVRAM_STATE_CLOSED;
+ unlock(&fsp_nvram_lock);
+}
+
+static void fsp_nvram_send_open(void)
+{
+ printf("FSP NVRAM: Opening nvram...\n");
+ fsp_nvram_msg = fsp_mkmsg(FSP_CMD_OPEN_VNVRAM, 1, fsp_nvram_size);
+ assert(fsp_nvram_msg);
+ fsp_nvram_state = NVRAM_STATE_OPENING;
+ if (!fsp_queue_msg(fsp_nvram_msg, fsp_nvram_open_complete))
+ return;
+
+ prerror("FSP NVRAM: Failed to queue nvram open message\n");
+ fsp_freemsg(fsp_nvram_msg);
+ fsp_nvram_msg = NULL;
+ fsp_nvram_state = NVRAM_STATE_CLOSED;
+}
+
+static bool fsp_nvram_get_size(uint32_t *out_size)
+{
+ struct fsp_msg *msg;
+ int rc, size;
+
+ msg = fsp_mkmsg(FSP_CMD_GET_VNVRAM_SIZE, 0);
+ assert(msg);
+
+ rc = fsp_sync_msg(msg, false);
+ size = msg->resp ? fsp_msg_get_data_word(msg->resp, 0) : 0;
+ fsp_freemsg(msg);
+ if (rc || size == 0) {
+ log_simple_error(&e_info(OPAL_RC_NVRAM_SIZE),
+ "FSP: Error %d nvram size reported is %d\n", rc, size);
+ fsp_nvram_state = NVRAM_STATE_BROKEN;
+ return false;
+ }
+ printf("FSP: NVRAM file size from FSP is %d bytes\n", size);
+ *out_size = size;
+ return true;
+}
+
+static bool fsp_nvram_msg_rr(u32 cmd_sub_mod, struct fsp_msg *msg)
+{
+ assert(msg == NULL);
+
+ switch (cmd_sub_mod) {
+ case FSP_RESET_START:
+ printf("FSP: Closing NVRAM on account of FSP Reset\n");
+ fsp_nvram_state = NVRAM_STATE_CLOSED;
+ return true;
+ case FSP_RELOAD_COMPLETE:
+ printf("FSP: Reopening NVRAM of FSP Reload complete\n");
+ lock(&fsp_nvram_lock);
+ fsp_nvram_send_open();
+ unlock(&fsp_nvram_lock);
+ return true;
+ }
+ return false;
+}
+
+static struct fsp_client fsp_nvram_client_rr = {
+ .message = fsp_nvram_msg_rr,
+};
+
+static bool fsp_vnvram_msg(u32 cmd_sub_mod, struct fsp_msg *msg __unused)
+{
+ u32 cmd;
+ struct fsp_msg *resp;
+
+ switch (cmd_sub_mod) {
+ case FSP_CMD_GET_VNV_STATS:
+ prlog(PR_DEBUG,
+ "FSP NVRAM: Get vNVRAM statistics not supported\n");
+ cmd = FSP_RSP_GET_VNV_STATS | FSP_STATUS_INVALID_SUBCMD;
+ break;
+ case FSP_CMD_FREE_VNV_STATS:
+ prlog(PR_DEBUG,
+ "FSP NVRAM: Free vNVRAM statistics buffer not supported\n");
+ cmd = FSP_RSP_FREE_VNV_STATS | FSP_STATUS_INVALID_SUBCMD;
+ break;
+ default:
+ return false;
+ }
+
+ resp = fsp_mkmsg(cmd, 0);
+ if (!resp) {
+ prerror("FSP NVRAM: Failed to allocate resp message\n");
+ return false;
+ }
+ if (fsp_queue_msg(resp, fsp_freemsg)) {
+ prerror("FSP NVRAM: Failed to queue resp message\n");
+ fsp_freemsg(resp);
+ return false;
+ }
+ return true;
+}
+
+static struct fsp_client fsp_vnvram_client = {
+ .message = fsp_vnvram_msg,
+};
+
+int fsp_nvram_info(uint32_t *total_size)
+{
+ if (!fsp_present()) {
+ fsp_nvram_state = NVRAM_STATE_ABSENT;
+ return OPAL_HARDWARE;
+ }
+
+ if (!fsp_nvram_get_size(total_size))
+ return OPAL_HARDWARE;
+ return OPAL_SUCCESS;
+}
+
+int fsp_nvram_start_read(void *dst, uint32_t src, uint32_t len)
+{
+ /* We are currently limited to fully aligned transfers */
+ assert((((uint64_t)dst) & 0xfff) == 0);
+ assert(dst);
+
+ /* Currently don't support src!=0 */
+ assert(src == 0);
+
+ if (!fsp_present())
+ return -ENODEV;
+
+ op_display(OP_LOG, OP_MOD_INIT, 0x0007);
+
+ lock(&fsp_nvram_lock);
+
+ /* Store image info */
+ fsp_nvram_image = dst;
+ fsp_nvram_size = len;
+
+ /* Mark nvram as not dirty */
+ fsp_nvram_dirty_start = len;
+ fsp_nvram_dirty_end = 0;
+
+ /* Map TCEs */
+ fsp_tce_map(PSI_DMA_NVRAM_TRIPL, &fsp_nvram_triplet,
+ PSI_DMA_NVRAM_TRIPL_SZ);
+ fsp_tce_map(PSI_DMA_NVRAM_BODY, dst, PSI_DMA_NVRAM_BODY_SZ);
+
+ /* Register for the reset/reload event */
+ fsp_register_client(&fsp_nvram_client_rr, FSP_MCLASS_RR_EVENT);
+
+ /* Register for virtual NVRAM interface events */
+ fsp_register_client(&fsp_vnvram_client, FSP_MCLASS_VIRTUAL_NVRAM);
+
+ /* Open and load the nvram from the FSP */
+ fsp_nvram_send_open();
+
+ unlock(&fsp_nvram_lock);
+
+ return 0;
+}
+
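+/*
+ * Called when a range of the in-memory nvram image has been modified.
+ * We track a single dirty extent, rounded to block boundaries, and
+ * kick off a write immediately if the nvram is open and no other
+ * message is in flight; otherwise the dirty range just accumulates
+ * until a write can be sent.
+ */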
+int fsp_nvram_write(uint32_t offset, void *src, uint32_t size)
+{
+ uint64_t end = offset + size - 1;
+
+ /* We only support writing from the original image */
+ if (src != fsp_nvram_image + offset)
+ return OPAL_HARDWARE;
+
+ offset &= ~(NVRAM_BLKSIZE - 1);
+ end &= ~(NVRAM_BLKSIZE - 1);
+
+ lock(&fsp_nvram_lock);
+ /* If the nvram is closed, try re-opening */
+ if (fsp_nvram_state == NVRAM_STATE_CLOSED)
+ fsp_nvram_send_open();
+ if (fsp_nvram_dirty_start > offset)
+ fsp_nvram_dirty_start = offset;
+ if (fsp_nvram_dirty_end < end)
+ fsp_nvram_dirty_end = end;
+ if (!fsp_nvram_msg && fsp_nvram_state == NVRAM_STATE_OPEN)
+ fsp_nvram_send_write();
+ unlock(&fsp_nvram_lock);
+
+ return 0;
+}
diff --git a/roms/skiboot/hw/fsp/fsp-occ.c b/roms/skiboot/hw/fsp/fsp-occ.c
new file mode 100644
index 000000000..58926f408
--- /dev/null
+++ b/roms/skiboot/hw/fsp/fsp-occ.c
@@ -0,0 +1,417 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * FSP/OCC interactions
+ *
+ * Unlike OpenPOWER machines, FSP machines are much more tightly coupled
+ * between FSP, host, and OCC. On P8 we have to do a dance to start the
+ * OCC, but on P9 Hostboot does that, consistent with what we do on
+ * OpenPOWER.
+ *
+ * Copyright 2013-2019 IBM Corp.
+ */
+
+#include <skiboot.h>
+#include <xscom.h>
+#include <xscom-p8-regs.h>
+#include <io.h>
+#include <cpu.h>
+#include <chip.h>
+#include <mem_region.h>
+#include <fsp.h>
+#include <timebase.h>
+#include <hostservices.h>
+#include <errorlog.h>
+#include <opal-api.h>
+#include <opal-msg.h>
+#include <timer.h>
+#include <i2c.h>
+#include <powercap.h>
+#include <psr.h>
+#include <sensor.h>
+#include <occ.h>
+
+DEFINE_LOG_ENTRY(OPAL_RC_OCC_LOAD, OPAL_PLATFORM_ERR_EVT, OPAL_OCC,
+ OPAL_CEC_HARDWARE, OPAL_PREDICTIVE_ERR_GENERAL,
+ OPAL_NA);
+
+DEFINE_LOG_ENTRY(OPAL_RC_OCC_RESET, OPAL_PLATFORM_ERR_EVT, OPAL_OCC,
+ OPAL_CEC_HARDWARE, OPAL_PREDICTIVE_ERR_GENERAL,
+ OPAL_NA);
+
+struct occ_load_req {
+ u8 scope;
+ u32 dbob_id;
+ u32 seq_id;
+ struct list_node link;
+};
+static LIST_HEAD(occ_load_req_list);
+
+
+static void occ_queue_load(u8 scope, u32 dbob_id, u32 seq_id)
+{
+ struct occ_load_req *occ_req;
+
+ occ_req = zalloc(sizeof(struct occ_load_req));
+ if (!occ_req) {
+ /**
+ * @fwts-label OCCload_reqENOMEM
+ * @fwts-advice ENOMEM while allocating OCC load message.
+ * OCCs not started, consequently no power/frequency scaling
+ * will be functional.
+ */
+ prlog(PR_ERR, "OCC: Could not allocate occ_load_req\n");
+ return;
+ }
+
+ occ_req->scope = scope;
+ occ_req->dbob_id = dbob_id;
+ occ_req->seq_id = seq_id;
+ list_add_tail(&occ_load_req_list, &occ_req->link);
+}
+
+static void __occ_do_load(u8 scope, u32 dbob_id __unused, u32 seq_id)
+{
+ struct fsp_msg *stat;
+ int rc = -ENOMEM;
+ int status_word = 0;
+ struct proc_chip *chip = next_chip(NULL);
+
+ /* Call HBRT... */
+ rc = host_services_occ_load();
+
+ /* Handle fallback to preload */
+ if (rc == -ENOENT && chip->homer_base) {
+ prlog(PR_INFO, "OCC: Load: Fallback to preloaded image\n");
+ rc = 0;
+ } else if (!rc) {
+ struct opal_occ_msg occ_msg = { CPU_TO_BE64(OCC_LOAD), 0, 0 };
+
+ rc = _opal_queue_msg(OPAL_MSG_OCC, NULL, NULL,
+ sizeof(struct opal_occ_msg), &occ_msg);
+ if (rc)
+ prlog(PR_INFO, "OCC: Failed to queue message %d\n",
+ OCC_LOAD);
+
+ /* Success, start OCC */
+ rc = host_services_occ_start();
+ }
+ if (rc) {
+ /* If either hostservices call fails, send a failure status to the FSP */
+ /* Find a chip ID to send failure */
+ for_each_chip(chip) {
+ if (scope == 0x01 && dbob_id != chip->dbob_id)
+ continue;
+ status_word = 0xB500 | (chip->pcid & 0xff);
+ break;
+ }
+ log_simple_error(&e_info(OPAL_RC_OCC_LOAD),
+ "OCC: Error %d in load/start OCC\n", rc);
+ }
+
+ /* Send a single response for all chips */
+ stat = fsp_mkmsg(FSP_CMD_LOAD_OCC_STAT, 2, status_word, seq_id);
+ if (stat)
+ rc = fsp_queue_msg(stat, fsp_freemsg);
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_OCC_LOAD),
+ "OCC: Error %d queueing FSP OCC LOAD STATUS msg", rc);
+ fsp_freemsg(stat);
+ }
+}
+
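+/*
+ * Drain any OCC load requests that were queued while the hostservices
+ * LID preload was still in progress (see occ_do_load()).
+ */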
+void occ_poke_load_queue(void)
+{
+ struct occ_load_req *occ_req, *next;
+
+ if (list_empty(&occ_load_req_list))
+ return;
+
+ list_for_each_safe(&occ_load_req_list, occ_req, next, link) {
+ __occ_do_load(occ_req->scope, occ_req->dbob_id,
+ occ_req->seq_id);
+ list_del(&occ_req->link);
+ free(occ_req);
+ }
+}
+
+static u32 last_seq_id;
+static bool in_ipl = true;
+static void occ_do_load(u8 scope, u32 dbob_id __unused, u32 seq_id)
+{
+ struct fsp_msg *rsp;
+ int rc = -ENOMEM;
+ u8 err = 0;
+
+ if (scope != 0x01 && scope != 0x02) {
+ /**
+ * @fwts-label OCCLoadInvalidScope
+ * @fwts-advice Invalid request for loading OCCs. Power and
+ * frequency management not functional
+ */
+ prlog(PR_ERR, "OCC: Load message with invalid scope 0x%x\n",
+ scope);
+ err = 0x22;
+ }
+
+ /* First queue up an OK response to the load message itself */
+ rsp = fsp_mkmsg(FSP_RSP_LOAD_OCC | err, 0);
+ if (rsp)
+ rc = fsp_queue_msg(rsp, fsp_freemsg);
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_OCC_LOAD),
+ "OCC: Error %d queueing FSP OCC LOAD reply\n", rc);
+ fsp_freemsg(rsp);
+ return;
+ }
+
+ if (err)
+ return;
+
+ if (proc_gen >= proc_gen_p9) {
+ if (in_ipl) {
+ /* OCC is pre-loaded in P9, so send SUCCESS to FSP */
+ rsp = fsp_mkmsg(FSP_CMD_LOAD_OCC_STAT, 2, 0, seq_id);
+ if (!rsp)
+ return;
+
+ rc = fsp_queue_msg(rsp, fsp_freemsg);
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_OCC_LOAD),
+ "OCC: Error %d queueing OCC LOAD STATUS msg",
+ rc);
+ fsp_freemsg(rsp);
+ }
+ in_ipl = false;
+ } else {
+ struct proc_chip *chip = next_chip(NULL);
+
+ last_seq_id = seq_id;
+ prd_fsp_occ_load_start(chip->id);
+ }
+ return;
+ }
+
+ /*
+ * Check if hostservices lid caching is complete. If not, queue
+ * the load request.
+ */
+ if (!hservices_lid_preload_complete()) {
+ occ_queue_load(scope, dbob_id, seq_id);
+ return;
+ }
+
+ __occ_do_load(scope, dbob_id, seq_id);
+}
+
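+/*
+ * Completion callback for an OCC reset initiated through
+ * prd_fsp_occ_reset(); reports the outcome to the FSP using the
+ * sequence id saved from the original reset request.
+ */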
+int fsp_occ_reset_status(u64 chipid, s64 status)
+{
+ struct fsp_msg *stat;
+ int rc = OPAL_NO_MEM;
+ int status_word = 0;
+
+ prlog(PR_INFO, "HBRT: OCC stop() completed with %lld\n", status);
+
+ if (status) {
+ struct proc_chip *chip = get_chip(chipid);
+
+ if (!chip)
+ return OPAL_PARAMETER;
+
+ status_word = 0xfe00 | (chip->pcid & 0xff);
+ log_simple_error(&e_info(OPAL_RC_OCC_RESET),
+ "OCC: Error %lld in OCC reset of chip %lld\n",
+ status, chipid);
+ } else {
+ occ_msg_queue_occ_reset();
+ }
+
+ stat = fsp_mkmsg(FSP_CMD_RESET_OCC_STAT, 2, status_word, last_seq_id);
+ if (!stat)
+ return rc;
+
+ rc = fsp_queue_msg(stat, fsp_freemsg);
+ if (rc) {
+ fsp_freemsg(stat);
+ log_simple_error(&e_info(OPAL_RC_OCC_RESET),
+ "OCC: Error %d queueing FSP OCC RESET STATUS message\n",
+ rc);
+ }
+ return rc;
+}
+
+int fsp_occ_load_start_status(u64 chipid, s64 status)
+{
+ struct fsp_msg *stat;
+ int rc = OPAL_NO_MEM;
+ int status_word = 0;
+
+ if (status) {
+ struct proc_chip *chip = get_chip(chipid);
+
+ if (!chip)
+ return OPAL_PARAMETER;
+
+ status_word = 0xB500 | (chip->pcid & 0xff);
+ log_simple_error(&e_info(OPAL_RC_OCC_LOAD),
+ "OCC: Error %d in load/start OCC %lld\n", rc,
+ chipid);
+ }
+
+ stat = fsp_mkmsg(FSP_CMD_LOAD_OCC_STAT, 2, status_word, last_seq_id);
+ if (!stat)
+ return rc;
+
+ rc = fsp_queue_msg(stat, fsp_freemsg);
+ if (rc) {
+ fsp_freemsg(stat);
+ log_simple_error(&e_info(OPAL_RC_OCC_LOAD),
+ "OCC: Error %d queueing FSP OCC LOAD STATUS msg", rc);
+ }
+
+ return rc;
+}
+
+static void occ_do_reset(u8 scope, u32 dbob_id, u32 seq_id)
+{
+ struct fsp_msg *rsp, *stat;
+ struct proc_chip *chip = next_chip(NULL);
+ int rc = -ENOMEM;
+ u8 err = 0;
+
+ /* Check arguments */
+ if (scope != 0x01 && scope != 0x02) {
+ /**
+ * @fwts-label OCCResetInvalidScope
+ * @fwts-advice Invalid request for resetting OCCs. Power and
+ * frequency management not functional
+ */
+ prlog(PR_ERR, "OCC: Reset message with invalid scope 0x%x\n",
+ scope);
+ err = 0x22;
+ }
+
+ /* First queue up an OK response to the reset message itself */
+ rsp = fsp_mkmsg(FSP_RSP_RESET_OCC | err, 0);
+ if (rsp)
+ rc = fsp_queue_msg(rsp, fsp_freemsg);
+ if (rc) {
+ fsp_freemsg(rsp);
+ log_simple_error(&e_info(OPAL_RC_OCC_RESET),
+ "OCC: Error %d queueing FSP OCC RESET reply\n", rc);
+ return;
+ }
+
+ /* If we had an error, return */
+ if (err)
+ return;
+
+ /*
+ * Call HBRT to stop the OCC and leave it stopped. The FSP will send a
+ * load/start request subsequently. Also, after a few runtime restarts
+ * (currently 3), the FSP will request that the OCC be left in the
+ * stopped state.
+ */
+
+ switch (proc_gen) {
+ case proc_gen_p8:
+ rc = host_services_occ_stop();
+ break;
+ case proc_gen_p9:
+ case proc_gen_p10:
+ last_seq_id = seq_id;
+ chip = next_chip(NULL);
+ prd_fsp_occ_reset(chip->id);
+ return;
+ default:
+ return;
+ }
+
+ /* Handle fallback to preload */
+ if (rc == -ENOENT && chip->homer_base) {
+ prlog(PR_INFO, "OCC: Reset: Fallback to preloaded image\n");
+ rc = 0;
+ }
+ if (!rc) {
+ /* Send a single success response for all chips */
+ stat = fsp_mkmsg(FSP_CMD_RESET_OCC_STAT, 2, 0, seq_id);
+ if (stat)
+ rc = fsp_queue_msg(stat, fsp_freemsg);
+ if (rc) {
+ fsp_freemsg(stat);
+ log_simple_error(&e_info(OPAL_RC_OCC_RESET),
+ "OCC: Error %d queueing FSP OCC RESET"
+ " STATUS message\n", rc);
+ }
+ occ_msg_queue_occ_reset();
+ } else {
+
+ /*
+ * Then send a matching OCC Reset Status message with an 0xFE
+ * (fail) response code as well to the first matching chip
+ */
+ for_each_chip(chip) {
+ if (scope == 0x01 && dbob_id != chip->dbob_id)
+ continue;
+ rc = -ENOMEM;
+ stat = fsp_mkmsg(FSP_CMD_RESET_OCC_STAT, 2,
+ 0xfe00 | (chip->pcid & 0xff), seq_id);
+ if (stat)
+ rc = fsp_queue_msg(stat, fsp_freemsg);
+ if (rc) {
+ fsp_freemsg(stat);
+ log_simple_error(&e_info(OPAL_RC_OCC_RESET),
+ "OCC: Error %d queueing FSP OCC RESET"
+ " STATUS message\n", rc);
+ }
+ break;
+ }
+ }
+}
+
+static bool fsp_occ_msg(u32 cmd_sub_mod, struct fsp_msg *msg)
+{
+ u32 dbob_id, seq_id;
+ u8 scope;
+
+ switch (cmd_sub_mod) {
+ case FSP_CMD_LOAD_OCC:
+ /*
+ * We get the "Load OCC" command at boot. We don't currently
+ * support loading it ourselves (we don't have the procedures,
+ * they will come with Host Services). For now HostBoot will
+ * have loaded an OCC firmware for us, but we still need to
+ * be nice and respond to the FSP.
+ */
+ scope = msg->data.bytes[3];
+ dbob_id = fsp_msg_get_data_word(msg, 1);
+ seq_id = fsp_msg_get_data_word(msg, 2);
+ prlog(PR_INFO, "OCC: Got OCC Load message, scope=0x%x"
+ " dbob=0x%x seq=0x%x\n", scope, dbob_id, seq_id);
+ occ_do_load(scope, dbob_id, seq_id);
+ return true;
+
+ case FSP_CMD_RESET_OCC:
+ /*
+ * We shouldn't be getting this one, but if we do, we have
+ * to reply something sensible or the FSP will get upset
+ */
+ scope = msg->data.bytes[3];
+ dbob_id = fsp_msg_get_data_word(msg, 1);
+ seq_id = fsp_msg_get_data_word(msg, 2);
+ prlog(PR_INFO, "OCC: Got OCC Reset message, scope=0x%x"
+ " dbob=0x%x seq=0x%x\n", scope, dbob_id, seq_id);
+ occ_do_reset(scope, dbob_id, seq_id);
+ return true;
+ }
+ return false;
+}
+
+static struct fsp_client fsp_occ_client = {
+ .message = fsp_occ_msg,
+};
+
+void occ_fsp_init(void)
+{
+ /* If we have an FSP, register for notifications */
+ if (fsp_present())
+ fsp_register_client(&fsp_occ_client, FSP_MCLASS_OCC);
+}
diff --git a/roms/skiboot/hw/fsp/fsp-op-panel.c b/roms/skiboot/hw/fsp/fsp-op-panel.c
new file mode 100644
index 000000000..a8ac00b7a
--- /dev/null
+++ b/roms/skiboot/hw/fsp/fsp-op-panel.c
@@ -0,0 +1,266 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Small LCD screen on the front of FSP machines
+ *
+ * Copyright 2013-2019 IBM Corp.
+ */
+
+#include <skiboot.h>
+#include <fsp.h>
+#include <lock.h>
+#include <opal.h>
+#include <device.h>
+#include <processor.h>
+#include <opal-msg.h>
+#include <errorlog.h>
+
+DEFINE_LOG_ENTRY(OPAL_RC_PANEL_WRITE, OPAL_PLATFORM_ERR_EVT, OPAL_OP_PANEL,
+ OPAL_MISC_SUBSYSTEM, OPAL_PREDICTIVE_ERR_GENERAL, OPAL_NA);
+
+/* For OPAL OP_PANEL API we can only have one in flight due to TCEs */
+static struct fsp_msg *op_req;
+static uint64_t op_async_token;
+static struct lock op_lock = LOCK_UNLOCKED;
+
+static void fsp_op_display_fatal(uint32_t w0, uint32_t w1)
+{
+ static struct fsp_msg op_msg_resp;
+ static struct fsp_msg op_msg = {
+ .resp = &op_msg_resp,
+ };
+
+ fsp_fillmsg(&op_msg, FSP_CMD_DISP_SRC_DIRECT, 3, 1, w0, w1);
+
+ /*
+ * A special way to send a message: it doesn't run pollers.
+ * This means we can call it while in a poller, which we may
+ * well be in when we're terminating (and thus displaying a *fatal*
+ * message on the op-panel).
+ */
+ fsp_fatal_msg(&op_msg);
+}
+
+void fsp_op_display(enum op_severity sev, enum op_module mod, uint16_t code)
+{
+ struct fsp_msg *op_msg;
+ uint32_t w0;
+ uint32_t w1;
+
+ if (!fsp_present())
+ return;
+
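+ /*
+ * Word 0 carries the severity in its top half and the module in
+ * its bottom half; word 1 packs the 16-bit code as four ASCII
+ * hex digits, one per byte.
+ */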
+ w0 = sev << 16 | mod;
+
+ w1 = tohex((code >> 12) & 0xf) << 24;
+ w1 |= tohex((code >> 8) & 0xf) << 16;
+ w1 |= tohex((code >> 4) & 0xf) << 8;
+ w1 |= tohex((code ) & 0xf);
+
+ if (sev == OP_FATAL) {
+ fsp_op_display_fatal(w0, w1);
+ } else {
+ op_msg = fsp_allocmsg(true);
+ if (!op_msg) {
+ prerror("Failed to allocate FSP message for PANEL\n");
+ return;
+ }
+
+ fsp_fillmsg(op_msg, FSP_CMD_DISP_SRC_DIRECT, 3, 1, w0, w1);
+
+ if(fsp_queue_msg(op_msg, fsp_freemsg))
+ prerror("Failed to queue FSP message for OP PANEL\n");
+ }
+}
+
+void op_panel_disable_src_echo(void)
+{
+ struct fsp_msg op_msg_resp;
+ struct fsp_msg op_msg = {
+ .resp = &op_msg_resp,
+ };
+
+ if (!fsp_present())
+ return;
+
+ fsp_fillmsg(&op_msg, FSP_CMD_DIS_SRC_ECHO, 0);
+ fsp_sync_msg(&op_msg, false);
+}
+
+void op_panel_clear_src(void)
+{
+ struct fsp_msg op_msg_resp;
+ struct fsp_msg op_msg = {
+ .resp = &op_msg_resp,
+ };
+
+ if (!fsp_present())
+ return;
+
+ fsp_fillmsg(&op_msg, FSP_CMD_CLEAR_SRC, 0);
+ fsp_sync_msg(&op_msg, false);
+}
+
+/* opal_write_oppanel - Write to the physical op panel.
+ *
+ * Pass in an array of oppanel_line_t structs defining the ASCII characters
+ * to display on each line of the oppanel. If there are two lines on the
+ * physical panel, and you only want to write to the first line, you only
+ * need to pass in one line. If you only want to write to the second line,
+ * you need to pass in both lines, and set the line_len of the first line
+ * to zero.
+ *
+ * This command is asynchronous: OPAL_ASYNC_COMPLETION is returned when
+ * the operation has been initiated successfully. Subsequent calls will
+ * return OPAL_BUSY_EVENT until the current operation is complete.
+ */
+struct op_src {
+ uint8_t version;
+#define OP_SRC_VERSION 2
+ uint8_t flags;
+ uint8_t reserved;
+ uint8_t hex_word_cnt;
+ __be16 reserved2;
+ __be16 total_size;
+ __be32 word2; /* SRC format in low byte */
+ __be32 word3;
+ __be32 word4;
+ __be32 word5;
+ __be32 word6;
+ __be32 word7;
+ __be32 word8;
+ __be32 word9;
+ uint8_t ascii[OP_PANEL_NUM_LINES * OP_PANEL_LINE_LEN]; /* Word 11 */
+} __packed __align(4);
+
+/* Page align for the sake of TCE mapping */
+static struct op_src op_src __align(0x1000);
+
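+/*
+ * Tear down the temporary TCE mapping and release the in-flight
+ * request slot; used on both the success and the error paths.
+ */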
+static void __op_panel_write_complete(struct fsp_msg *msg)
+{
+ fsp_tce_unmap(PSI_DMA_OP_PANEL_MISC, 0x1000);
+
+ lock(&op_lock);
+ op_req = NULL;
+ unlock(&op_lock);
+
+ fsp_freemsg(msg);
+}
+
+static void op_panel_write_complete(struct fsp_msg *msg)
+{
+ uint8_t rc = (msg->resp->word1 >> 8) & 0xff;
+
+ if (rc)
+ prerror("OPPANEL: Error 0x%02x in display command\n", rc);
+
+ __op_panel_write_complete(msg);
+
+ opal_queue_msg(OPAL_MSG_ASYNC_COMP, NULL, NULL,
+ cpu_to_be64(1),
+ cpu_to_be64(op_async_token));
+}
+
+static int64_t __opal_write_oppanel(oppanel_line_t *lines, uint64_t num_lines,
+ uint64_t async_token)
+{
+ int64_t rc = OPAL_ASYNC_COMPLETION;
+ int len;
+ int i;
+
+ if (num_lines < 1 || num_lines > OP_PANEL_NUM_LINES)
+ return OPAL_PARAMETER;
+
+ /* Only one in flight */
+ lock(&op_lock);
+ if (op_req) {
+ rc = OPAL_BUSY_EVENT;
+ unlock(&op_lock);
+ goto bail;
+ }
+
+ op_req = fsp_allocmsg(true);
+ if (!op_req) {
+ rc = OPAL_NO_MEM;
+ unlock(&op_lock);
+ goto bail;
+ }
+ unlock(&op_lock);
+
+ op_async_token = async_token;
+
+ memset(&op_src, 0, sizeof(op_src));
+
+ op_src.version = OP_SRC_VERSION;
+ op_src.flags = 0;
+ op_src.reserved = 0;
+ op_src.hex_word_cnt = 1; /* header word only */
+ op_src.reserved2 = 0;
+ op_src.total_size = cpu_to_be16(sizeof(op_src));
+ op_src.word2 = 0; /* should be unneeded */
+
+ for (i = 0; i < num_lines; i++) {
+ uint8_t *current_line = op_src.ascii + (i * OP_PANEL_LINE_LEN);
+
+ len = be64_to_cpu(lines[i].line_len);
+ if (len < OP_PANEL_LINE_LEN)
+ memset(current_line + len, ' ', OP_PANEL_LINE_LEN-len);
+ else
+ len = OP_PANEL_LINE_LEN;
+ memcpy(current_line, (void *) be64_to_cpu(lines[i].line), len);
+ }
+
+ for (i = 0; i < sizeof(op_src.ascii); i++) {
+ /*
+ * So, there's this interesting thing if you send
+ * HTML/Javascript through the Operator Panel.
+ * You get to inject it into the ASM web ui!
+ * So we filter out anything suspect here,
+ * at least for the time being.
+ *
+ * Allowed characters:
+ * . / 0-9 : a-z A-Z SPACE
+ */
+ if (! ((op_src.ascii[i] >= '.' && op_src.ascii[i] <= ':') ||
+ (op_src.ascii[i] >= 'a' && op_src.ascii[i] <= 'z') ||
+ (op_src.ascii[i] >= 'A' && op_src.ascii[i] <= 'Z') ||
+ op_src.ascii[i] == ' ')) {
+ op_src.ascii[i] = '.';
+ }
+ }
+
+ fsp_tce_map(PSI_DMA_OP_PANEL_MISC, &op_src, 0x1000);
+
+ fsp_fillmsg(op_req, FSP_CMD_DISP_SRC_INDIR, 3, 0,
+ PSI_DMA_OP_PANEL_MISC, sizeof(struct op_src));
+ rc = fsp_queue_msg(op_req, op_panel_write_complete);
+ if (rc) {
+ __op_panel_write_complete(op_req);
+ rc = OPAL_INTERNAL_ERROR;
+ }
+ bail:
+ log_simple_error(&e_info(OPAL_RC_PANEL_WRITE),
+ "FSP: Error updating Op Panel: %lld\n", rc);
+ return rc;
+}
+
+static int64_t opal_write_oppanel_async(uint64_t async_token,
+ oppanel_line_t *lines,
+ uint64_t num_lines)
+{
+ return __opal_write_oppanel(lines, num_lines, async_token);
+}
+
+void fsp_oppanel_init(void)
+{
+ struct dt_node *oppanel;
+
+ if (!fsp_present())
+ return;
+
+ opal_register(OPAL_WRITE_OPPANEL_ASYNC, opal_write_oppanel_async, 3);
+
+ oppanel = dt_new(opal_node, "oppanel");
+ dt_add_property_cells(oppanel, "#length", OP_PANEL_LINE_LEN);
+ dt_add_property_cells(oppanel, "#lines", OP_PANEL_NUM_LINES);
+ dt_add_property_string(oppanel, "compatible", "ibm,opal-oppanel");
+}
diff --git a/roms/skiboot/hw/fsp/fsp-psi.c b/roms/skiboot/hw/fsp/fsp-psi.c
new file mode 100644
index 000000000..38f130dd7
--- /dev/null
+++ b/roms/skiboot/hw/fsp/fsp-psi.c
@@ -0,0 +1,75 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/* Copyright 2013-2019 IBM Corp. */
+
+#include <io.h>
+#include <psi.h>
+#include <lock.h>
+#include <fsp.h>
+
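+/* Set or clear the TCE enable bit in the PSIHB_PHBSCR register */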
+static void psi_tce_enable(struct psi *psi, bool enable)
+{
+ void *addr = psi->regs + PSIHB_PHBSCR;
+ u64 val;
+
+ val = in_be64(addr);
+ if (enable)
+ val |= PSIHB_PHBSCR_TCE_ENABLE;
+ else
+ val &= ~PSIHB_PHBSCR_TCE_ENABLE;
+ out_be64(addr, val);
+}
+
+/*
+ * Configure the PSI interface for communicating with
+ * an FSP, such as enabling the TCEs, FSP commands,
+ * etc...
+ */
+void psi_init_for_fsp(struct psi *psi)
+{
+ uint64_t reg;
+ bool enable_tce = true;
+
+ lock(&psi_lock);
+
+ /* Disable and setup TCE base address */
+ psi_tce_enable(psi, false);
+
+ switch (proc_gen) {
+ case proc_gen_p8:
+ case proc_gen_p9:
+ case proc_gen_p10:
+ out_be64(psi->regs + PSIHB_TAR, PSI_TCE_TABLE_BASE |
+ PSIHB_TAR_256K_ENTRIES);
+ break;
+ default:
+ enable_tce = false;
+ };
+
+ /* Enable various other configuration register bits based
+ * on what pHyp does. We keep interrupts disabled until
+ * after the mailbox has been properly configured. We assume
+ * basic stuff such as PSI link enable is already there.
+ *
+ * - FSP CMD Enable
+ * - FSP MMIO Enable
+ * - TCE Enable
+ * - Error response enable
+ *
+ * Clear all other error bits
+ */
+ if (!psi->active) {
+ prerror("PSI: psi_init_for_fsp() called on inactive link!\n");
+ unlock(&psi_lock);
+ return;
+ }
+
+ reg = in_be64(psi->regs + PSIHB_CR);
+ reg |= PSIHB_CR_FSP_CMD_ENABLE;
+ reg |= PSIHB_CR_FSP_MMIO_ENABLE;
+ reg |= PSIHB_CR_FSP_ERR_RSP_ENABLE;
+ reg &= ~0x00000000ffffffffull;
+ out_be64(psi->regs + PSIHB_CR, reg);
+ psi_tce_enable(psi, enable_tce);
+
+ unlock(&psi_lock);
+}
diff --git a/roms/skiboot/hw/fsp/fsp-rtc.c b/roms/skiboot/hw/fsp/fsp-rtc.c
new file mode 100644
index 000000000..237560a8d
--- /dev/null
+++ b/roms/skiboot/hw/fsp/fsp-rtc.c
@@ -0,0 +1,567 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Real Time Clock (RTC) attached to FSP
+ *
+ * Copyright 2013-2017 IBM Corp.
+ */
+
+#include <skiboot.h>
+#include <fsp.h>
+#include <lock.h>
+#include <timebase.h>
+#include <time.h>
+#include <time-utils.h>
+#include <opal-api.h>
+#include <opal-msg.h>
+#include <errorlog.h>
+#include <device.h>
+
+/*
+ * Note on how those operate:
+ *
+ * Because the RTC calls can be pretty slow, these functions will shoot
+ * an asynchronous request to the FSP (if none is already pending)
+ *
+ * The requests will return OPAL_BUSY_EVENT as long as the event has
+ * not been completed.
+ *
+ * WARNING: An attempt at doing an RTC write while one is already pending
+ * will simply ignore the new arguments and continue returning
+ * OPAL_BUSY_EVENT. This is to be compatible with existing Linux code.
+ *
+ * Completion of the request will result in an event OPAL_EVENT_RTC
+ * being signaled, which will remain raised until a corresponding call
+ * to opal_rtc_read() or opal_rtc_write() finally returns OPAL_SUCCESS,
+ * at which point the operation is complete and the event cleared.
+ *
+ * If we end up taking longer than rtc_read_timeout_ms milliseconds waiting
+ * for the response from a read request, we simply return a cached value (plus
+ * an offset calculated from the timebase). When the read request finally
+ * returns, we update our cached value accordingly.
+ *
+ * There are two separate sets of state for reads and writes. If both are
+ * attempted at the same time, the event bit will remain set as long as either
+ * of the two has a pending event to signal.
+ */
+
+#include <rtc.h>
+
+/* All of the below state is protected by rtc_lock.
+ * It should be held for the shortest amount of time possible.
+ * Certainly not across calls to FSP.
+ */
+static struct lock rtc_lock;
+
+static enum {
+ RTC_TOD_VALID,
+ RTC_TOD_INVALID,
+ RTC_TOD_PERMANENT_ERROR,
+} rtc_tod_state = RTC_TOD_INVALID;
+
+/* State machine for getting an RTC request.
+ * RTC_{READ/WRITE}_NO_REQUEST -> RTC_{READ/WRITE}_PENDING_REQUEST (one in flight)
+ * RTC_{READ/WRITE}_PENDING_REQUEST -> RTC_{READ/WRITE}_REQUEST_AVAILABLE,
+ * when FSP responds
+ * RTC_{READ/WRITE}_REQUEST_AVAILABLE -> RTC_{READ/WRITE}_NO_REQUEST,
+ * when OS retrieves it
+ */
+static enum {
+ RTC_READ_NO_REQUEST,
+ RTC_READ_PENDING_REQUEST,
+ RTC_READ_REQUEST_AVAILABLE,
+} rtc_read_request_state = RTC_READ_NO_REQUEST;
+
+static enum {
+ RTC_WRITE_NO_REQUEST,
+ RTC_WRITE_PENDING_REQUEST,
+ RTC_WRITE_REQUEST_AVAILABLE,
+} rtc_write_request_state = RTC_WRITE_NO_REQUEST;
+
+static bool rtc_tod_cache_dirty = false;
+
+struct opal_tpo_data {
+ uint64_t tpo_async_token;
+ __be32 *year_month_day;
+ __be32 *hour_min;
+};
+
+/* Timebase value when we last initiated a RTC read request */
+static unsigned long read_req_tb;
+
+/* If an RTC read takes longer than this, we return a value generated
+ * from the cache + timebase */
+static const int rtc_read_timeout_ms = 1500;
+
+DEFINE_LOG_ENTRY(OPAL_RC_RTC_TOD, OPAL_PLATFORM_ERR_EVT, OPAL_RTC,
+ OPAL_PLATFORM_FIRMWARE, OPAL_INFO, OPAL_NA);
+
+DEFINE_LOG_ENTRY(OPAL_RC_RTC_READ, OPAL_PLATFORM_ERR_EVT, OPAL_RTC,
+ OPAL_PLATFORM_FIRMWARE, OPAL_INFO, OPAL_NA);
+
+static void fsp_tpo_req_complete(struct fsp_msg *read_resp)
+{
+ struct opal_tpo_data *attr = read_resp->user_data;
+ int val;
+ int rc;
+
+ val = (read_resp->resp->word1 >> 8) & 0xff;
+ switch (val) {
+ case FSP_STATUS_TOD_RESET:
+ log_simple_error(&e_info(OPAL_RC_RTC_TOD),
+ "RTC TPO in invalid state\n");
+ rc = OPAL_INTERNAL_ERROR;
+ break;
+
+ case FSP_STATUS_TOD_PERMANENT_ERROR:
+ log_simple_error(&e_info(OPAL_RC_RTC_TOD),
+ "RTC TPO in permanent error state\n");
+ rc = OPAL_INTERNAL_ERROR;
+ break;
+ case FSP_STATUS_INVALID_DATA:
+ log_simple_error(&e_info(OPAL_RC_RTC_TOD),
+ "RTC TPO: Invalid data\n");
+ rc = OPAL_PARAMETER;
+ break;
+ case FSP_STATUS_SUCCESS:
+ /* Save the read TPO value in our cache */
+ if (attr->year_month_day)
+ *attr->year_month_day = cpu_to_be32(fsp_msg_get_data_word(read_resp->resp, 0));
+ if (attr->hour_min)
+ *attr->hour_min = cpu_to_be32(fsp_msg_get_data_word(read_resp->resp, 1));
+ rc = OPAL_SUCCESS;
+ break;
+
+ default:
+ log_simple_error(&e_info(OPAL_RC_RTC_TOD),
+ "TPO read failed: %d\n", val);
+ rc = OPAL_INTERNAL_ERROR;
+ break;
+ }
+ opal_queue_msg(OPAL_MSG_ASYNC_COMP, NULL, NULL,
+ cpu_to_be64(attr->tpo_async_token),
+ cpu_to_be64(rc));
+ free(attr);
+ fsp_freemsg(read_resp);
+}
+
+static void fsp_rtc_process_read(struct fsp_msg *read_resp)
+{
+ int val = (read_resp->word1 >> 8) & 0xff;
+ struct tm tm;
+
+ assert(lock_held_by_me(&rtc_lock));
+
+ assert(rtc_read_request_state == RTC_READ_PENDING_REQUEST);
+
+ switch (val) {
+ case FSP_STATUS_TOD_RESET:
+ log_simple_error(&e_info(OPAL_RC_RTC_TOD),
+ "RTC TOD in invalid state\n");
+ rtc_tod_state = RTC_TOD_INVALID;
+ break;
+
+ case FSP_STATUS_TOD_PERMANENT_ERROR:
+ log_simple_error(&e_info(OPAL_RC_RTC_TOD),
+ "RTC TOD in permanent error state\n");
+ rtc_tod_state = RTC_TOD_PERMANENT_ERROR;
+ break;
+
+ case FSP_STATUS_SUCCESS:
+ /* Save the read RTC value in our cache */
+ rtc_tod_state = RTC_TOD_VALID;
+ datetime_to_tm(fsp_msg_get_data_word(read_resp, 0),
+ (u64)fsp_msg_get_data_word(read_resp, 1) << 32, &tm);
+ rtc_cache_update(&tm);
+ prlog(PR_TRACE, "FSP-RTC Got time: %d-%d-%d %d:%d:%d\n",
+ tm.tm_year, tm.tm_mon, tm.tm_mday,
+ tm.tm_hour, tm.tm_min, tm.tm_sec);
+ break;
+
+ default:
+ log_simple_error(&e_info(OPAL_RC_RTC_TOD),
+ "RTC TOD read failed: %d\n", val);
+ rtc_tod_state = RTC_TOD_INVALID;
+ }
+ rtc_read_request_state = RTC_READ_REQUEST_AVAILABLE;
+}
+
+static void opal_rtc_eval_events(bool read_write)
+{
+ bool request_available;
+
+ if (read_write)
+ request_available = (rtc_read_request_state ==
+ RTC_READ_REQUEST_AVAILABLE);
+ else
+ request_available = (rtc_write_request_state ==
+ RTC_WRITE_REQUEST_AVAILABLE);
+
+ assert(lock_held_by_me(&rtc_lock));
+ opal_update_pending_evt(OPAL_EVENT_RTC,
+ request_available ? OPAL_EVENT_RTC : 0);
+}
+
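+/*
+ * Common completion handler for TOD reads and writes: update the
+ * relevant request state machine and (re)evaluate the OPAL_EVENT_RTC
+ * event bit so the OS knows a result is available.
+ */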
+static void fsp_rtc_req_complete(struct fsp_msg *msg)
+{
+ lock(&rtc_lock);
+ prlog(PR_TRACE, "RTC completion %p\n", msg);
+
+ if (fsp_msg_cmd(msg) == (FSP_CMD_READ_TOD & 0xffffff)) {
+ fsp_rtc_process_read(msg->resp);
+ opal_rtc_eval_events(true);
+ } else {
+ assert(rtc_write_request_state == RTC_WRITE_PENDING_REQUEST);
+ rtc_write_request_state = RTC_WRITE_REQUEST_AVAILABLE;
+ opal_rtc_eval_events(false);
+ }
+
+ unlock(&rtc_lock);
+ fsp_freemsg(msg);
+}
+
+static int64_t fsp_rtc_send_read_request(void)
+{
+ struct fsp_msg *msg;
+ int rc;
+
+ assert(lock_held_by_me(&rtc_lock));
+ assert(rtc_read_request_state == RTC_READ_NO_REQUEST);
+
+ msg = fsp_mkmsg(FSP_CMD_READ_TOD, 0);
+ if (!msg) {
+ log_simple_error(&e_info(OPAL_RC_RTC_READ),
+ "RTC: failed to allocate read message\n");
+ return OPAL_INTERNAL_ERROR;
+ }
+
+ rc = fsp_queue_msg(msg, fsp_rtc_req_complete);
+ if (rc) {
+ fsp_freemsg(msg);
+ log_simple_error(&e_info(OPAL_RC_RTC_READ),
+ "RTC: failed to queue read message: %d\n", rc);
+ return OPAL_INTERNAL_ERROR;
+ }
+
+ rtc_read_request_state = RTC_READ_PENDING_REQUEST;
+
+ read_req_tb = mftb();
+
+ return OPAL_BUSY_EVENT;
+}
+
+static int64_t fsp_opal_rtc_read(__be32 *__ymd, __be64 *__hmsm)
+{
+ int64_t rc;
+ uint32_t ymd;
+ uint64_t hmsm;
+
+ if (!__ymd || !__hmsm)
+ return OPAL_PARAMETER;
+
+ lock(&rtc_lock);
+
+ if (rtc_tod_state == RTC_TOD_PERMANENT_ERROR) {
+ rc = OPAL_HARDWARE;
+ goto out;
+ }
+
+ /* During R/R of FSP, read cached TOD */
+ if (fsp_in_rr()) {
+ if (rtc_tod_state == RTC_TOD_VALID) {
+ rtc_cache_get_datetime(&ymd, &hmsm);
+ rc = OPAL_SUCCESS;
+ } else {
+ rc = OPAL_INTERNAL_ERROR;
+ }
+ goto out;
+ }
+
+ /* If we don't have a read pending already, fire off a request and
+ * return */
+ if (rtc_read_request_state == RTC_READ_NO_REQUEST) {
+ prlog(PR_TRACE, "Sending new RTC read request\n");
+ rc = fsp_rtc_send_read_request();
+ /* If our pending read is done, clear events and return the time
+ * from the cache */
+ } else if (rtc_read_request_state == RTC_READ_REQUEST_AVAILABLE) {
+ prlog(PR_TRACE, "RTC read complete, state %d\n", rtc_tod_state);
+ rtc_read_request_state = RTC_READ_NO_REQUEST;
+
+ opal_rtc_eval_events(true);
+
+ if (rtc_tod_state == RTC_TOD_VALID) {
+ rtc_cache_get_datetime(&ymd, &hmsm);
+ prlog(PR_TRACE,"FSP-RTC Cached datetime: %x %llx\n",
+ ymd, hmsm);
+ rc = OPAL_SUCCESS;
+ } else {
+ rc = OPAL_INTERNAL_ERROR;
+ }
+
+ /* Timeout: return our cached value (updated from tb), but leave the
+ * read request pending so it will update the cache later */
+ } else if (mftb() > read_req_tb + msecs_to_tb(rtc_read_timeout_ms)) {
+ prlog(PR_TRACE, "RTC read timed out\n");
+
+ if (rtc_tod_state == RTC_TOD_VALID) {
+ rtc_cache_get_datetime(&ymd, &hmsm);
+ rc = OPAL_SUCCESS;
+ } else {
+ rc = OPAL_INTERNAL_ERROR;
+ }
+ /* Otherwise, we're still waiting on the read to complete */
+ } else {
+ assert(rtc_read_request_state == RTC_READ_PENDING_REQUEST);
+ rc = OPAL_BUSY_EVENT;
+ }
+out:
+ unlock(&rtc_lock);
+
+ if (rc == OPAL_SUCCESS) {
+ *__ymd = cpu_to_be32(ymd);
+ *__hmsm = cpu_to_be64(hmsm);
+ }
+
+ return rc;
+}
+
+static int64_t fsp_rtc_send_write_request(uint32_t year_month_day,
+ uint64_t hour_minute_second_millisecond)
+{
+ struct fsp_msg *msg;
+ uint32_t w0, w1, w2;
+
+ assert(lock_held_by_me(&rtc_lock));
+ assert(rtc_write_request_state == RTC_WRITE_NO_REQUEST);
+
+ /* Create a request and send it. Just like for read, we ignore
+ * the "millisecond" field which is probably supposed to be
+ * microseconds and which Linux ignores as well anyway
+ */
+ w0 = year_month_day;
+ w1 = (hour_minute_second_millisecond >> 32) & 0xffffff00;
+ w2 = 0;
+
+ msg = fsp_mkmsg(FSP_CMD_WRITE_TOD, 3, w0, w1, w2);
+ if (!msg) {
+ prlog(PR_TRACE, " -> allocation failed !\n");
+ return OPAL_INTERNAL_ERROR;
+ }
+ prlog(PR_TRACE, " -> req at %p\n", msg);
+
+ if (fsp_queue_msg(msg, fsp_rtc_req_complete)) {
+ prlog(PR_TRACE, " -> queueing failed !\n");
+ fsp_freemsg(msg);
+ return OPAL_INTERNAL_ERROR;
+ }
+
+ rtc_write_request_state = RTC_WRITE_PENDING_REQUEST;
+
+ return OPAL_BUSY_EVENT;
+}
+
+static int64_t fsp_opal_rtc_write(uint32_t year_month_day,
+ uint64_t hour_minute_second_millisecond)
+{
+ int rc;
+ struct tm tm;
+
+ lock(&rtc_lock);
+ if (rtc_tod_state == RTC_TOD_PERMANENT_ERROR) {
+ rc = OPAL_HARDWARE;
+ goto out;
+ }
+
+ if (fsp_in_rr()) {
+ datetime_to_tm(year_month_day,
+ hour_minute_second_millisecond, &tm);
+ rtc_cache_update(&tm);
+ rtc_tod_cache_dirty = true;
+ rc = OPAL_SUCCESS;
+ goto out;
+ }
+
+ if (rtc_write_request_state == RTC_WRITE_NO_REQUEST) {
+ prlog(PR_TRACE, "Sending new RTC write request\n");
+ rc = fsp_rtc_send_write_request(year_month_day,
+ hour_minute_second_millisecond);
+ } else if (rtc_write_request_state == RTC_WRITE_PENDING_REQUEST) {
+ rc = OPAL_BUSY_EVENT;
+ } else {
+ assert(rtc_write_request_state == RTC_WRITE_REQUEST_AVAILABLE);
+ rtc_write_request_state = RTC_WRITE_NO_REQUEST;
+
+ opal_rtc_eval_events(false);
+ rc = OPAL_SUCCESS;
+ }
+
+out:
+ unlock(&rtc_lock);
+ return rc;
+}
+
+/* Set timed power on values to fsp */
+static int64_t fsp_opal_tpo_write(uint64_t async_token, uint32_t y_m_d,
+ uint32_t hr_min)
+{
+ static struct opal_tpo_data *attr;
+ struct fsp_msg *msg;
+
+ if (!fsp_present())
+ return OPAL_HARDWARE;
+
+ attr = zalloc(sizeof(struct opal_tpo_data));
+ if (!attr)
+ return OPAL_NO_MEM;
+
+ /* Create a request and send it.*/
+ attr->tpo_async_token = async_token;
+
+ /* check if this is a disable tpo request */
+ if (y_m_d == 0 && hr_min == 0) {
+ prlog(PR_TRACE, "Sending TPO disable request...\n");
+ msg = fsp_mkmsg(FSP_CMD_TPO_DISABLE, 0);
+ } else {
+ prlog(PR_TRACE, "Sending TPO write request...\n");
+ msg = fsp_mkmsg(FSP_CMD_TPO_WRITE, 2, y_m_d, hr_min);
+ }
+
+ if (!msg) {
+ prerror("TPO: Failed to create message for WRITE to FSP\n");
+ free(attr);
+ return OPAL_INTERNAL_ERROR;
+ }
+ msg->user_data = attr;
+ if (fsp_queue_msg(msg, fsp_tpo_req_complete)) {
+ free(attr);
+ fsp_freemsg(msg);
+ return OPAL_INTERNAL_ERROR;
+ }
+ return OPAL_ASYNC_COMPLETION;
+}
+
+/* Read Timed power on (TPO) from FSP */
+static int64_t fsp_opal_tpo_read(uint64_t async_token, __be32 *y_m_d,
+ __be32 *hr_min)
+{
+ static struct opal_tpo_data *attr;
+ struct fsp_msg *msg;
+ int64_t rc;
+
+ if (!fsp_present())
+ return OPAL_HARDWARE;
+
+ if (!y_m_d || !hr_min)
+ return OPAL_PARAMETER;
+
+ attr = zalloc(sizeof(*attr));
+ if (!attr)
+ return OPAL_NO_MEM;
+
+ /* Send read request to FSP */
+ attr->tpo_async_token = async_token;
+ attr->year_month_day = y_m_d;
+ attr->hour_min = hr_min;
+
+ prlog(PR_TRACE, "Sending new TPO read request\n");
+ msg = fsp_mkmsg(FSP_CMD_TPO_READ, 0);
+ if (!msg) {
+ log_simple_error(&e_info(OPAL_RC_RTC_READ),
+ "TPO: failed to allocate read message\n");
+ free(attr);
+ return OPAL_INTERNAL_ERROR;
+ }
+ msg->user_data = attr;
+ rc = fsp_queue_msg(msg, fsp_tpo_req_complete);
+ if (rc) {
+ free(attr);
+ fsp_freemsg(msg);
+ log_simple_error(&e_info(OPAL_RC_RTC_READ),
+ "TPO: failed to queue read message: %lld\n", rc);
+ return OPAL_INTERNAL_ERROR;
+ }
+ return OPAL_ASYNC_COMPLETION;
+}
+
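+/*
+ * Push a TOD value that was cached while the FSP was going through a
+ * reset/reload (see fsp_opal_rtc_write()) back out to the FSP once it
+ * has come back.
+ */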
+static void rtc_flush_cached_tod(void)
+{
+ struct fsp_msg *msg;
+ uint64_t h_m_s_m;
+ uint32_t y_m_d;
+
+ if (rtc_cache_get_datetime(&y_m_d, &h_m_s_m))
+ return;
+ msg = fsp_mkmsg(FSP_CMD_WRITE_TOD, 3, y_m_d,
+ (h_m_s_m >> 32) & 0xffffff00, 0);
+ if (!msg) {
+ prerror("TPO: %s : Failed to allocate write TOD message\n",
+ __func__);
+ return;
+ }
+ if (fsp_queue_msg(msg, fsp_freemsg)) {
+ fsp_freemsg(msg);
+ prerror("TPO: %s : Failed to queue WRITE_TOD command\n",
+ __func__);
+ return;
+ }
+}
+
+static bool fsp_rtc_msg_rr(u32 cmd_sub_mod, struct fsp_msg *msg)
+{
+
+ int rc = false;
+ assert(msg == NULL);
+
+ switch (cmd_sub_mod) {
+ case FSP_RESET_START:
+ rc = true;
+ break;
+ case FSP_RELOAD_COMPLETE:
+ lock(&rtc_lock);
+ if (rtc_tod_cache_dirty) {
+ rtc_flush_cached_tod();
+ rtc_tod_cache_dirty = false;
+ }
+ unlock(&rtc_lock);
+ rc = true;
+ break;
+ }
+
+ return rc;
+}
+
+static struct fsp_client fsp_rtc_client_rr = {
+ .message = fsp_rtc_msg_rr,
+};
+
+void fsp_rtc_init(void)
+{
+ struct dt_node *np;
+
+ if (!fsp_present()) {
+ rtc_tod_state = RTC_TOD_PERMANENT_ERROR;
+ return;
+ }
+
+ opal_register(OPAL_RTC_READ, fsp_opal_rtc_read, 2);
+ opal_register(OPAL_RTC_WRITE, fsp_opal_rtc_write, 2);
+ opal_register(OPAL_WRITE_TPO, fsp_opal_tpo_write, 3);
+ opal_register(OPAL_READ_TPO, fsp_opal_tpo_read, 3);
+
+ np = dt_new(opal_node, "rtc");
+ dt_add_property_strings(np, "compatible", "ibm,opal-rtc");
+ dt_add_property(np, "has-tpo", NULL, 0);
+
+ /* Register for the reset/reload event */
+ fsp_register_client(&fsp_rtc_client_rr, FSP_MCLASS_RR_EVENT);
+
+ prlog(PR_TRACE, "Getting initial RTC TOD\n");
+
+ /* We don't wait for the RTC response, and this is actually okay:
+ * any OPAL callers will wait correctly, and if we ever have
+ * internal users then they should check the state properly.
+ */
+ lock(&rtc_lock);
+ fsp_rtc_send_read_request();
+ unlock(&rtc_lock);
+}
diff --git a/roms/skiboot/hw/fsp/fsp-sensor.c b/roms/skiboot/hw/fsp/fsp-sensor.c
new file mode 100644
index 000000000..ffcd004f3
--- /dev/null
+++ b/roms/skiboot/hw/fsp/fsp-sensor.c
@@ -0,0 +1,860 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * This code enables the 'powernv' platform to retrieve sensor-related data
+ * from the FSP using SPCN passthru mailbox commands.
+ *
+ * The OPAL read sensor API in Sapphire is implemented as an 'asynchronous'
+ * read call that returns after queuing the read request. A unique sensor-id,
+ * which has already been exported to the device tree during FSP init, is
+ * expected as an argument of the OPAL read call. The Sapphire code decodes
+ * this id to determine the requested attribute and sensor.
+ *
+ * Copyright 2013-2017 IBM Corp.
+ */
+
+#include <skiboot.h>
+#include <fsp.h>
+#include <lock.h>
+#include <device.h>
+#include <spcn.h>
+#include <opal-api.h>
+#include <opal-msg.h>
+#include <errorlog.h>
+#include <sensor.h>
+
+#define INVALID_DATA ((uint32_t)-1)
+
+/* Entry size of PRS command modifiers */
+#define PRS_STATUS_ENTRY_SZ 0x08
+#define SENSOR_PARAM_ENTRY_SZ 0x10
+#define SENSOR_DATA_ENTRY_SZ 0x08
+#define PROC_JUNC_ENTRY_SZ 0x04
+
+DEFINE_LOG_ENTRY(OPAL_RC_SENSOR_INIT, OPAL_PLATFORM_ERR_EVT, OPAL_SENSOR,
+ OPAL_MISC_SUBSYSTEM,
+ OPAL_PREDICTIVE_ERR_FAULT_RECTIFY_REBOOT,
+ OPAL_NA);
+
+DEFINE_LOG_ENTRY(OPAL_RC_SENSOR_READ, OPAL_PLATFORM_ERR_EVT, OPAL_SENSOR,
+ OPAL_MISC_SUBSYSTEM, OPAL_INFO,
+ OPAL_NA);
+
+DEFINE_LOG_ENTRY(OPAL_RC_SENSOR_ASYNC_COMPLETE, OPAL_PLATFORM_ERR_EVT,
+ OPAL_SENSOR, OPAL_MISC_SUBSYSTEM, OPAL_INFO,
+ OPAL_NA);
+
+/* FSP response status codes */
+enum {
+ SP_RSP_STATUS_VALID_DATA = 0x00,
+ SP_RSP_STATUS_INVALID_DATA = 0x22,
+ SP_RSP_STATUS_SPCN_ERR = 0xA8,
+ SP_RSP_STATUS_DMA_ERR = 0x24,
+};
+
+enum sensor_state {
+ SENSOR_VALID_DATA,
+ SENSOR_INVALID_DATA,
+ SENSOR_SPCN_ERROR,
+ SENSOR_DMA_ERROR,
+ SENSOR_PERMANENT_ERROR,
+ SENSOR_OPAL_ERROR,
+};
+
+enum spcn_attr {
+ SENSOR_STATUS,
+ SENSOR_THRS,
+ SENSOR_DATA,
+ SENSOR_MAX,
+};
+
+/* Parsed sensor attributes, passed through OPAL */
+struct opal_sensor_data {
+ uint64_t async_token; /* Asynchronous token */
+ __be64 *sensor_data; /* Kernel pointer to copy data */
+ enum spcn_attr spcn_attr; /* Modifier attribute */
+ uint16_t rid; /* Sensor RID */
+ uint8_t frc; /* Sensor resource class */
+ uint32_t mod_index; /* Modifier index*/
+ uint32_t offset; /* Offset in sensor buffer */
+};
+
+struct spcn_mod {
+ uint8_t mod; /* Modifier code */
+ uint8_t entry_size; /* Size of each entry in response buffer */
+ uint16_t entry_count; /* Number of entries */
+};
+
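+/*
+ * Table of PRS command modifiers. The _FIRST/_SUBS pairs exist because
+ * a response can be larger than a single transfer: when the FSP returns
+ * SPCN_RSP_STATUS_COND_SUCCESS, the remaining entries are fetched with
+ * the matching _SUBS modifier (see fsp_sensor_read_complete()).
+ */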
+static struct spcn_mod spcn_mod_data[] = {
+ {SPCN_MOD_PRS_STATUS_FIRST, PRS_STATUS_ENTRY_SZ, 0 },
+ {SPCN_MOD_PRS_STATUS_SUBS, PRS_STATUS_ENTRY_SZ, 0 },
+ {SPCN_MOD_SENSOR_PARAM_FIRST, SENSOR_PARAM_ENTRY_SZ, 0 },
+ {SPCN_MOD_SENSOR_PARAM_SUBS, SENSOR_PARAM_ENTRY_SZ, 0 },
+ {SPCN_MOD_SENSOR_DATA_FIRST, SENSOR_DATA_ENTRY_SZ, 0 },
+ {SPCN_MOD_SENSOR_DATA_SUBS, SENSOR_DATA_ENTRY_SZ, 0 },
+ /* TODO Support this modifier '0x14', if required */
+ /* {SPCN_MOD_PROC_JUNC_TEMP, PROC_JUNC_ENTRY_SZ, 0, NULL}, */
+ {SPCN_MOD_SENSOR_POWER, SENSOR_DATA_ENTRY_SZ, 0 },
+ {SPCN_MOD_LAST, 0xff, 0xffff}
+};
+
+/* Frame resource class (FRC) names */
+static const char *frc_names[] = {
+ /* 0x00 and 0x01 are reserved */
+ NULL,
+ NULL,
+ "power-controller",
+ "power",
+ "regulator",
+ "cooling-fan",
+ "cooling-controller",
+ "battery-charger",
+ "battery-pack",
+ "amb-temp",
+ "temp",
+ "vrm",
+ "riser-card",
+ "io-backplane"
+};
+
+#define SENSOR_MAX_SIZE 0x00100000
+static void *sensor_buffer = NULL;
+static enum sensor_state sensor_state;
+static bool prev_msg_consumed = true;
+static struct lock sensor_lock;
+
+/* Function prototypes */
+static int64_t fsp_sensor_send_read_request(struct opal_sensor_data *attr);
+static void queue_msg_for_delivery(int rc, struct opal_sensor_data *attr);
+
+
+/*
+ * Power Resource Status (PRS)
+ * Command: 0x42
+ *
+ * Modifier: 0x01
+ * --------------------------------------------------------------------------
+ * | 0 1 2 3 4 5 6 7 |
+ * --------------------------------------------------------------------------
+ * |Frame resrc class| PRID | SRC | Status |
+ * --------------------------------------------------------------------------
+ *
+ *
+ * Modifier: 0x10
+ * --------------------------------------------------------------------------
+ * | 0 1 2 3 4 5 6 7 |
+ * --------------------------------------------------------------------------
+ * |Frame resrc class| PRID | Sensor location |
+ * --------------------------------------------------------------------------
+ * --------------------------------------------------------------------------
+ * | 8 9 10 11 12 13 14 15 |
+ * --------------------------------------------------------------------------
+ * | Reserved | Reserved | Threshold | Status |
+ * --------------------------------------------------------------------------
+ *
+ *
+ * Modifier: 0x12
+ * --------------------------------------------------------------------------
+ * | 0 1 2 3 4 5 6 7 |
+ * --------------------------------------------------------------------------
+ * |Frame resrc class| PRID | Sensor data | Status |
+ * --------------------------------------------------------------------------
+ *
+ *
+ * Modifier: 0x14
+ * --------------------------------------------------------------------------
+ * | 0 1 2 3 |
+ * --------------------------------------------------------------------------
+ * |Enclosure Tj Avg | Chip Tj Avg | Reserved | Reserved |
+ * --------------------------------------------------------------------------
+ */
+
+
+/*
+ * When coming from a SENSOR_POWER modifier command, the resource id
+ * of a power supply is only one byte and is missing the "subclass"
+ * byte (0x10). This macro adds it, to be consistent with the
+ * PRS_STATUS modifier command.
+ */
+#define normalize_power_rid(rid) (0x1000|(rid))
+
+static uint32_t sensor_power_process_data(uint16_t rid,
+ struct sensor_power *power)
+{
+ int i;
+
+ if (!sensor_power_is_valid(power)) {
+ prlog(PR_TRACE, "Power Sensor data not valid\n");
+ return INVALID_DATA;
+ }
+
+ for (i = 0; i < sensor_power_count(power); i++) {
+ prlog(PR_TRACE, "Power[%d]: %d mW\n", i,
+ power->supplies[i].milliwatts);
+ if (rid == normalize_power_rid(power->supplies[i].rid))
+ return be32_to_cpu(power->supplies[i].milliwatts) / 1000;
+ }
+
+ return 0;
+}
+
+static inline uint16_t convert_status_to_fault(uint16_t status)
+{
+ return status & 0x06;
+}
+
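+/*
+ * Walk the response buffer for the modifier we used, pick out the entry
+ * matching the requested resource class/id, extract the attribute we
+ * were asked for and hand the result back to the OS.
+ */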
+static void fsp_sensor_process_data(struct opal_sensor_data *attr)
+{
+ uint8_t *sensor_buf_ptr = (uint8_t *)sensor_buffer;
+ uint32_t sensor_data = INVALID_DATA;
+ __be16 sensor_mod_data[8];
+ int count;
+
+ for (count = 0; count < spcn_mod_data[attr->mod_index].entry_count;
+ count++) {
+ memcpy((void *)sensor_mod_data, sensor_buf_ptr,
+ spcn_mod_data[attr->mod_index].entry_size);
+ if (spcn_mod_data[attr->mod_index].mod == SPCN_MOD_PROC_JUNC_TEMP) {
+ /* TODO Support this modifier '0x14', if required */
+
+ } else if (spcn_mod_data[attr->mod_index].mod == SPCN_MOD_SENSOR_POWER) {
+ sensor_data = sensor_power_process_data(attr->rid,
+ (struct sensor_power *) sensor_buf_ptr);
+ break;
+ } else if (be16_to_cpu(sensor_mod_data[0]) == attr->frc &&
+ be16_to_cpu(sensor_mod_data[1]) == attr->rid) {
+ switch (attr->spcn_attr) {
+ case SENSOR_STATUS:
+ sensor_data =
+ convert_status_to_fault(be16_to_cpu(sensor_mod_data[3]));
+ break;
+ case SENSOR_THRS:
+ sensor_data = be16_to_cpu(sensor_mod_data[6]);
+ break;
+ case SENSOR_DATA:
+ sensor_data = be16_to_cpu(sensor_mod_data[2]);
+ break;
+ default:
+ break;
+ }
+
+ break;
+ }
+
+ sensor_buf_ptr += spcn_mod_data[attr->mod_index].entry_size;
+ }
+
+ *attr->sensor_data = cpu_to_be64(sensor_data);
+ if (sensor_data == INVALID_DATA)
+ queue_msg_for_delivery(OPAL_PARTIAL, attr);
+ else
+ queue_msg_for_delivery(OPAL_SUCCESS, attr);
+}
+
+static int fsp_sensor_process_read(struct fsp_msg *resp_msg)
+{
+ uint8_t mbx_rsp_status;
+ uint32_t size = 0;
+
+ mbx_rsp_status = (resp_msg->word1 >> 8) & 0xff;
+ switch (mbx_rsp_status) {
+ case SP_RSP_STATUS_VALID_DATA:
+ sensor_state = SENSOR_VALID_DATA;
+ size = fsp_msg_get_data_word(resp_msg, 1) & 0xffff;
+ break;
+ case SP_RSP_STATUS_INVALID_DATA:
+ log_simple_error(&e_info(OPAL_RC_SENSOR_READ),
+ "SENSOR: %s: Received invalid data\n", __func__);
+ sensor_state = SENSOR_INVALID_DATA;
+ break;
+ case SP_RSP_STATUS_SPCN_ERR:
+ log_simple_error(&e_info(OPAL_RC_SENSOR_READ),
+ "SENSOR: %s: Failure due to SPCN error\n", __func__);
+ sensor_state = SENSOR_SPCN_ERROR;
+ break;
+ case SP_RSP_STATUS_DMA_ERR:
+ log_simple_error(&e_info(OPAL_RC_SENSOR_READ),
+ "SENSOR: %s: Failure due to DMA error\n", __func__);
+ sensor_state = SENSOR_DMA_ERROR;
+ break;
+ default:
+ log_simple_error(&e_info(OPAL_RC_SENSOR_READ),
+ "SENSOR %s: Read failed, status:0x%02X\n",
+ __func__, mbx_rsp_status);
+ sensor_state = SENSOR_INVALID_DATA;
+ break;
+ }
+
+ return size;
+}
+
+static void queue_msg_for_delivery(int rc, struct opal_sensor_data *attr)
+{
+ prlog(PR_INSANE, "%s: rc:%d, data:%lld\n",
+ __func__, rc, *(attr->sensor_data));
+ check_sensor_read(attr->async_token);
+ opal_queue_msg(OPAL_MSG_ASYNC_COMP, NULL, NULL,
+ cpu_to_be64(attr->async_token),
+ cpu_to_be64(rc));
+ spcn_mod_data[attr->mod_index].entry_count = 0;
+ free(attr);
+ prev_msg_consumed = true;
+}
+
+static void fsp_sensor_read_complete(struct fsp_msg *msg)
+{
+ struct opal_sensor_data *attr = msg->user_data;
+ enum spcn_rsp_status status;
+ int rc, size;
+
+ prlog(PR_INSANE, "%s()\n", __func__);
+
+ status = (fsp_msg_get_data_word(msg->resp, 1) >> 24) & 0xff;
+ size = fsp_sensor_process_read(msg->resp);
+ fsp_freemsg(msg);
+
+ lock(&sensor_lock);
+ if (sensor_state == SENSOR_VALID_DATA) {
+ spcn_mod_data[attr->mod_index].entry_count += (size /
+ spcn_mod_data[attr->mod_index].entry_size);
+ attr->offset += size;
+ /* Fetch the subsequent entries of the same modifier type */
+ if (status == SPCN_RSP_STATUS_COND_SUCCESS) {
+ switch (spcn_mod_data[attr->mod_index].mod) {
+ case SPCN_MOD_PRS_STATUS_FIRST:
+ case SPCN_MOD_SENSOR_PARAM_FIRST:
+ case SPCN_MOD_SENSOR_DATA_FIRST:
+ attr->mod_index++;
+ spcn_mod_data[attr->mod_index].entry_count =
+ spcn_mod_data[attr->mod_index - 1].
+ entry_count;
+ spcn_mod_data[attr->mod_index - 1].entry_count = 0;
+ break;
+ default:
+ break;
+ }
+
+ rc = fsp_sensor_send_read_request(attr);
+ if (rc != OPAL_ASYNC_COMPLETION)
+ goto err;
+ } else { /* Notify 'powernv' of read completion */
+ fsp_sensor_process_data(attr);
+ }
+ } else {
+ rc = OPAL_INTERNAL_ERROR;
+ goto err;
+ }
+ unlock(&sensor_lock);
+ return;
+err:
+ *attr->sensor_data = cpu_to_be64(INVALID_DATA);
+ queue_msg_for_delivery(rc, attr);
+ unlock(&sensor_lock);
+ log_simple_error(&e_info(OPAL_RC_SENSOR_ASYNC_COMPLETE),
+ "SENSOR: %s: Failed to queue the "
+ "read request to fsp\n", __func__);
+}
+
+static int64_t fsp_sensor_send_read_request(struct opal_sensor_data *attr)
+{
+ int rc;
+ struct fsp_msg *msg;
+ uint32_t align;
+ uint32_t cmd_header;
+
+ if (fsp_in_rr())
+ return OPAL_BUSY;
+
+ prlog(PR_INSANE, "Get the data for modifier [%x]\n",
+ spcn_mod_data[attr->mod_index].mod);
+
+ if (spcn_mod_data[attr->mod_index].mod == SPCN_MOD_PROC_JUNC_TEMP) {
+ /* TODO Support this modifier '0x14', if required */
+ align = attr->offset % sizeof(uint32_t);
+ if (align)
+ attr->offset += (sizeof(uint32_t) - align);
+
+ /* TODO Add 8 byte command data required for mod 0x14 */
+
+ attr->offset += 8;
+
+ cmd_header = spcn_mod_data[attr->mod_index].mod << 24 |
+ SPCN_CMD_PRS << 16 | 0x0008;
+ } else {
+ cmd_header = spcn_mod_data[attr->mod_index].mod << 24 |
+ SPCN_CMD_PRS << 16;
+ }
+
+ msg = fsp_mkmsg(FSP_CMD_SPCN_PASSTHRU, 4,
+ SPCN_ADDR_MODE_CEC_NODE, cmd_header, 0,
+ PSI_DMA_SENSOR_BUF + attr->offset);
+
+ if (!msg) {
+ log_simple_error(&e_info(OPAL_RC_SENSOR_READ), "SENSOR: Failed "
+ "to allocate read message\n");
+ return OPAL_INTERNAL_ERROR;
+ }
+
+ msg->user_data = attr;
+ rc = fsp_queue_msg(msg, fsp_sensor_read_complete);
+ if (rc) {
+ fsp_freemsg(msg);
+ msg = NULL;
+ log_simple_error(&e_info(OPAL_RC_SENSOR_READ), "SENSOR: Failed "
+ "to queue read message (%d)\n", rc);
+ return OPAL_INTERNAL_ERROR;
+ }
+
+ return OPAL_ASYNC_COMPLETION;
+}
+
+/*
+ * These are the resources we know about and for which we provide a
+ * mapping in the device tree so their data can be read by the OS.
+ * Just discard the other ones for the moment.
+ */
+static inline bool sensor_frc_is_valid(uint16_t frc)
+{
+ switch (frc) {
+ case SENSOR_FRC_POWER_SUPPLY:
+ case SENSOR_FRC_COOLING_FAN:
+ case SENSOR_FRC_AMB_TEMP:
+ return true;
+ default:
+ return false;
+ }
+}
+
+/*
+ * Each attribute of a resource needs a request to the FSP to capture
+ * its data. The routine below provides the mapping between the
+ * attribute and the PRS command modifier to use.
+ *
+ * resource | data | thrs | status |
+ * ----------------+--------+--------+-----------+
+ * power_supply | POWER | | |
+ * | | | PRS |
+ * ----------------+--------+--------+-----------+
+ * amb-temp | DATA | | DATA |
+ * | | PARAM | PARAM (*) |
+ * ----------------+--------+--------+-----------+
+ * fan | DATA | | DATA (*) |
+ * | | PARAM | PARAM (*) |
+ * | | | PRS |
+ *
+ * (*) don't use the attribute given by this command modifier
+ */
+static int64_t parse_sensor_id(uint32_t handler, struct opal_sensor_data *attr)
+{
+ uint32_t mod, index;
+
+ attr->frc = sensor_get_frc(handler);
+ attr->rid = sensor_get_rid(handler);
+ attr->spcn_attr = sensor_get_attr(handler);
+
+ if (!sensor_frc_is_valid(attr->frc))
+ return OPAL_PARAMETER;
+
+ /* now compute the PRS command modifier which will be used to
+ * request a resource attribute from the FSP */
+ switch (attr->spcn_attr) {
+ case SENSOR_DATA:
+ if (attr->frc == SENSOR_FRC_POWER_SUPPLY)
+ mod = SPCN_MOD_SENSOR_POWER;
+ else
+ mod = SPCN_MOD_SENSOR_DATA_FIRST;
+ break;
+
+ case SENSOR_THRS:
+ mod = SPCN_MOD_SENSOR_PARAM_FIRST;
+ break;
+
+ case SENSOR_STATUS:
+ switch (attr->frc) {
+ case SENSOR_FRC_AMB_TEMP:
+ mod = SPCN_MOD_SENSOR_DATA_FIRST;
+ break;
+ case SENSOR_FRC_POWER_SUPPLY:
+ case SENSOR_FRC_COOLING_FAN:
+ mod = SPCN_MOD_PRS_STATUS_FIRST;
+ break;
+ default:
+ return OPAL_PARAMETER;
+ }
+ break;
+
+ default:
+ return OPAL_PARAMETER;
+ }
+
+ for (index = 0; spcn_mod_data[index].mod != SPCN_MOD_LAST; index++) {
+ if (spcn_mod_data[index].mod == mod)
+ break;
+ }
+
+ attr->mod_index = index;
+ return 0;
+}
+
+
+int64_t fsp_opal_read_sensor(uint32_t sensor_hndl, int token,
+ __be64 *sensor_data)
+{
+ struct opal_sensor_data *attr;
+ int64_t rc;
+
+ prlog(PR_INSANE, "fsp_opal_read_sensor [%08x]\n", sensor_hndl);
+
+ if (fsp_in_rr())
+ return OPAL_BUSY;
+
+ if (sensor_state == SENSOR_PERMANENT_ERROR) {
+ rc = OPAL_HARDWARE;
+ goto out;
+ }
+
+ if (!sensor_hndl) {
+ rc = OPAL_PARAMETER;
+ goto out;
+ }
+
+ lock(&sensor_lock);
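+ /* Only one sensor read is in flight at a time; prev_msg_consumed
+ * gates new requests until the previous one has been consumed. */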
+ if (prev_msg_consumed) {
+ attr = zalloc(sizeof(*attr));
+ if (!attr) {
+ log_simple_error(&e_info(OPAL_RC_SENSOR_READ),
+ "SENSOR: Failed to allocate memory\n");
+ rc = OPAL_NO_MEM;
+ goto out_lock;
+ }
+
+ /* Parse the sensor id and store them to the local structure */
+ rc = parse_sensor_id(sensor_hndl, attr);
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_SENSOR_READ),
+ "SENSOR: %s: Failed to parse the sensor "
+ "handle[0x%08x]\n", __func__, sensor_hndl);
+ goto out_free;
+ }
+ /* Kernel buffer pointer to copy the data later when ready */
+ attr->sensor_data = sensor_data;
+ attr->async_token = token;
+
+ rc = fsp_sensor_send_read_request(attr);
+ if (rc != OPAL_ASYNC_COMPLETION) {
+ log_simple_error(&e_info(OPAL_RC_SENSOR_READ),
+ "SENSOR: %s: Failed to queue the read "
+ "request to fsp\n", __func__);
+ goto out_free;
+ }
+
+ prev_msg_consumed = false;
+ } else {
+ rc = OPAL_BUSY_EVENT;
+ }
+
+ unlock(&sensor_lock);
+ return rc;
+
+out_free:
+ free(attr);
+out_lock:
+ unlock(&sensor_lock);
+out:
+ return rc;
+}
+
+
+#define MAX_NAME 64
+
+static struct dt_node *sensor_get_node(struct dt_node *sensors,
+ struct sensor_header *header, const char* attrname)
+{
+ char name[MAX_NAME];
+ struct dt_node *node;
+
+ /*
+ * Just use the resource class name and resource id. This
+ * should be obvious enough for a node name.
+ */
+ snprintf(name, sizeof(name), "%s#%d-%s", frc_names[be16_to_cpu(header->frc)], be16_to_cpu(header->rid), attrname);
+
+ /*
+ * The same resources are reported by the different PRS
+ * subcommands (PRS_STATUS, SENSOR_PARAM, SENSOR_DATA). So we
+ * need to check that we did not already create the device
+ * node.
+ */
+ node = dt_find_by_path(sensors, name);
+ if (!node) {
+ prlog(PR_INFO, "SENSOR: creating node %s\n", name);
+
+ node = dt_new(sensors, name);
+
+ snprintf(name, sizeof(name), "ibm,opal-sensor-%s",
+ frc_names[be16_to_cpu(header->frc)]);
+ dt_add_property_string(node, "compatible", name);
+ } else {
+ /**
+ * @fwts-label OPALSensorNodeExists
+ * @fwts-advice OPAL had trouble creating the sensor
+ * nodes in the device tree as there was already one there.
+ * This indicates either the device tree from Hostboot
+ * already filled in sensors or an OPAL bug.
+ */
+ prlog(PR_ERR, "SENSOR: node %s exists\n", name);
+ }
+ return node;
+}
+
+#define sensor_handler(header, attr_num) \
+ sensor_make_handler(SENSOR_FSP, be16_to_cpu((header).frc), be16_to_cpu((header).rid), attr_num)
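+/*
+ * Illustration (hypothetical values): a cooling fan entry with
+ * frc = SENSOR_FRC_COOLING_FAN and rid = 3 gets a handle from this
+ * macro that parse_sensor_id() later decomposes again when the OS
+ * passes the "sensor-id" value back through fsp_opal_read_sensor().
+ */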
+
+static int add_sensor_prs(struct dt_node *sensors, struct sensor_prs *prs)
+{
+ struct dt_node *node;
+
+ node = sensor_get_node(sensors, &prs->header, "faulted");
+ if (!node)
+ return -1;
+
+ dt_add_property_cells(node, "sensor-id",
+ sensor_handler(prs->header, SENSOR_STATUS));
+ return 0;
+}
+
+static int add_sensor_param(struct dt_node *sensors, struct sensor_param *param)
+{
+ struct dt_node *node;
+
+ node = sensor_get_node(sensors, &param->header, "thrs");
+ if (!node)
+ return -1;
+
+ dt_add_property_string(node, "ibm,loc-code", param->location);
+ dt_add_property_cells(node, "sensor-id",
+ sensor_handler(param->header, SENSOR_THRS));
+ /* don't use the status coming from the response of the
+ * SENSOR_PARAM subcommand */
+ return 0;
+}
+
+static int add_sensor_data(struct dt_node *sensors,
+ struct sensor_data *data)
+{
+ struct dt_node *node;
+
+ node = sensor_get_node(sensors, &data->header, "data");
+ if (!node)
+ return -1;
+
+ dt_add_property_cells(node, "sensor-id",
+ sensor_handler(data->header, SENSOR_DATA));
+
+ /* Let's make sure we are not adding a duplicate device node.
+ * Some resources, like fans, get their status attribute from
+ * three different commands ...
+ */
+ if (be16_to_cpu(data->header.frc) == SENSOR_FRC_AMB_TEMP) {
+ node = sensor_get_node(sensors, &data->header, "faulted");
+ if (!node)
+ return -1;
+
+ dt_add_property_cells(node, "sensor-id",
+ sensor_handler(data->header, SENSOR_STATUS));
+ }
+
+ return 0;
+}
+
+static int add_sensor_power(struct dt_node *sensors, struct sensor_power *power)
+{
+ int i;
+ struct dt_node *node;
+
+ if (!sensor_power_is_valid(power))
+ return -1;
+
+ for (i = 0; i < sensor_power_count(power); i++) {
+ struct sensor_header header = {
+ cpu_to_be16(SENSOR_FRC_POWER_SUPPLY),
+ cpu_to_be16(normalize_power_rid(power->supplies[i].rid))
+ };
+
+ node = sensor_get_node(sensors, &header, "data");
+
+ prlog(PR_TRACE, "SENSOR: Power[%d] : %d mW\n",
+ power->supplies[i].rid,
+ be32_to_cpu(power->supplies[i].milliwatts));
+
+ dt_add_property_cells(node, "sensor-id",
+ sensor_handler(header, SENSOR_DATA));
+ }
+ return 0;
+}
+
+static void add_sensor_ids(struct dt_node *sensors)
+{
+ uint8_t *sensor_buf_ptr = (uint8_t *)sensor_buffer;
+ struct spcn_mod *smod;
+ int i;
+
+ for (smod = spcn_mod_data; smod->mod != SPCN_MOD_LAST; smod++) {
+ /*
+ * SPCN_MOD_SENSOR_POWER (0x1C) has a different layout.
+ */
+ if (smod->mod == SPCN_MOD_SENSOR_POWER) {
+ add_sensor_power(sensors,
+ (struct sensor_power *) sensor_buf_ptr);
+
+ sensor_buf_ptr += smod->entry_size * smod->entry_count;
+ continue;
+ }
+
+ for (i = 0; i < smod->entry_count; i++) {
+ struct sensor_header *header =
+ (struct sensor_header *) sensor_buf_ptr;
+
+ if (!sensor_frc_is_valid(be16_to_cpu(header->frc)))
+ goto out_sensor;
+
+ switch (smod->mod) {
+ case SPCN_MOD_PROC_JUNC_TEMP:
+ /* TODO Support this modifier '0x14',
+ if required */
+ break;
+
+ case SPCN_MOD_PRS_STATUS_FIRST:
+ case SPCN_MOD_PRS_STATUS_SUBS:
+ add_sensor_prs(sensors,
+ (struct sensor_prs *) header);
+ break;
+
+ case SPCN_MOD_SENSOR_PARAM_FIRST:
+ case SPCN_MOD_SENSOR_PARAM_SUBS:
+ add_sensor_param(sensors,
+ (struct sensor_param *) header);
+ break;
+
+ case SPCN_MOD_SENSOR_DATA_FIRST:
+ case SPCN_MOD_SENSOR_DATA_SUBS:
+ add_sensor_data(sensors,
+ (struct sensor_data *) header);
+
+ break;
+
+ default:
+ prerror("SENSOR: unknown modifier : %x\n",
+ smod->mod);
+ }
+
+out_sensor:
+ sensor_buf_ptr += smod->entry_size;
+ }
+ }
+}
+
+static void add_opal_sensor_node(void)
+{
+ int index;
+
+ if (!fsp_present())
+ return;
+
+ add_sensor_ids(sensor_node);
+
+ /* Reset the entry count of each modifier */
+ for (index = 0; spcn_mod_data[index].mod != SPCN_MOD_LAST;
+ index++)
+ spcn_mod_data[index].entry_count = 0;
+}
+
+void fsp_init_sensor(void)
+{
+ uint32_t cmd_header, align, size, psi_dma_offset = 0;
+ enum spcn_rsp_status status;
+ struct fsp_msg msg, resp;
+ int index, rc;
+
+ if (!fsp_present()) {
+ sensor_state = SENSOR_PERMANENT_ERROR;
+ return;
+ }
+
+ sensor_buffer = memalign(TCE_PSIZE, SENSOR_MAX_SIZE);
+ if (!sensor_buffer) {
+ log_simple_error(&e_info(OPAL_RC_SENSOR_INIT), "SENSOR: could "
+ "not allocate sensor_buffer!\n");
+ return;
+ }
+
+ /* Map TCE */
+ fsp_tce_map(PSI_DMA_SENSOR_BUF, sensor_buffer, PSI_DMA_SENSOR_BUF_SZ);
+
+ msg.resp = &resp;
+
+ /* Traverse using all the modifiers to know all the sensors available
+ * in the system */
+ for (index = 0; spcn_mod_data[index].mod != SPCN_MOD_LAST &&
+ sensor_state == SENSOR_VALID_DATA;) {
+ prlog(PR_TRACE, "Get the data for modifier [%d]\n",
+ spcn_mod_data[index].mod);
+ if (spcn_mod_data[index].mod == SPCN_MOD_PROC_JUNC_TEMP) {
+ /* TODO Support this modifier 0x14, if required */
+ align = psi_dma_offset % sizeof(uint32_t);
+ if (align)
+ psi_dma_offset += (sizeof(uint32_t) - align);
+
+ /* TODO Add 8 byte command data required for mod 0x14 */
+ psi_dma_offset += 8;
+
+ cmd_header = spcn_mod_data[index].mod << 24 |
+ SPCN_CMD_PRS << 16 | 0x0008;
+ } else {
+ cmd_header = spcn_mod_data[index].mod << 24 |
+ SPCN_CMD_PRS << 16;
+ }
+
+ fsp_fillmsg(&msg, FSP_CMD_SPCN_PASSTHRU, 4,
+ SPCN_ADDR_MODE_CEC_NODE, cmd_header, 0,
+ PSI_DMA_SENSOR_BUF + psi_dma_offset);
+
+ rc = fsp_sync_msg(&msg, false);
+ if (rc >= 0) {
+ status = (fsp_msg_get_data_word(&resp, 1) >> 24) & 0xff;
+ size = fsp_sensor_process_read(&resp);
+ psi_dma_offset += size;
+ spcn_mod_data[index].entry_count += (size /
+ spcn_mod_data[index].entry_size);
+ } else {
+ sensor_state = SENSOR_PERMANENT_ERROR;
+ break;
+ }
+
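+ /*
+ * Conditional success on a *_FIRST modifier means the FSP has
+ * more entries to return, so move on to the matching *_SUBS
+ * modifier; otherwise skip it. A *_SUBS modifier is repeated
+ * until the FSP stops returning conditional success.
+ */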
+ switch (spcn_mod_data[index].mod) {
+ case SPCN_MOD_PRS_STATUS_FIRST:
+ case SPCN_MOD_SENSOR_PARAM_FIRST:
+ case SPCN_MOD_SENSOR_DATA_FIRST:
+ if (status == SPCN_RSP_STATUS_COND_SUCCESS)
+ index++;
+ else
+ index += 2;
+
+ break;
+ case SPCN_MOD_PRS_STATUS_SUBS:
+ case SPCN_MOD_SENSOR_PARAM_SUBS:
+ case SPCN_MOD_SENSOR_DATA_SUBS:
+ if (status != SPCN_RSP_STATUS_COND_SUCCESS)
+ index++;
+ break;
+ case SPCN_MOD_SENSOR_POWER:
+ index++;
+ default:
+ break;
+ }
+ }
+
+ if (sensor_state != SENSOR_VALID_DATA)
+ sensor_state = SENSOR_PERMANENT_ERROR;
+ else
+ add_opal_sensor_node();
+}
diff --git a/roms/skiboot/hw/fsp/fsp-surveillance.c b/roms/skiboot/hw/fsp/fsp-surveillance.c
new file mode 100644
index 000000000..84e6878f3
--- /dev/null
+++ b/roms/skiboot/hw/fsp/fsp-surveillance.c
@@ -0,0 +1,226 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * We don't want to go on the cart!
+ *
+ * Copyright 2013-2018 IBM Corp.
+ */
+
+#include <skiboot.h>
+#include <fsp.h>
+#include <lock.h>
+#include <processor.h>
+#include <timebase.h>
+#include <fsp-sysparam.h>
+#include <errorlog.h>
+#include <opal-api.h>
+
+static bool fsp_surv_state = false;
+static bool fsp_surv_ack_pending = false;
+static u64 surv_timer;
+static u64 surv_ack_timer;
+static u32 surv_state_param;
+static struct lock surv_lock = LOCK_UNLOCKED;
+
+#define FSP_SURV_ACK_TIMEOUT 120 /* surv ack timeout in seconds */
+
+DEFINE_LOG_ENTRY(OPAL_RC_SURVE_INIT, OPAL_MISC_ERR_EVT, OPAL_SURVEILLANCE,
+ OPAL_SURVEILLANCE_ERR, OPAL_PREDICTIVE_ERR_GENERAL,
+ OPAL_MISCELLANEOUS_INFO_ONLY);
+
+DEFINE_LOG_ENTRY(OPAL_RC_SURVE_STATUS, OPAL_MISC_ERR_EVT, OPAL_SURVEILLANCE,
+ OPAL_SURVEILLANCE_ERR, OPAL_PREDICTIVE_ERR_GENERAL,
+ OPAL_MISCELLANEOUS_INFO_ONLY);
+
+DEFINE_LOG_ENTRY(OPAL_RC_SURVE_ACK, OPAL_MISC_ERR_EVT, OPAL_SURVEILLANCE,
+ OPAL_SURVEILLANCE_ERR, OPAL_PREDICTIVE_ERR_GENERAL,
+ OPAL_MISCELLANEOUS_INFO_ONLY);
+
+static void fsp_surv_ack(struct fsp_msg *msg)
+{
+ uint8_t val;
+
+ if (!msg->resp)
+ return;
+
+ val = (msg->resp->word1 >> 8) & 0xff;
+ if (val == 0) {
+ /* reset the pending flag */
+ prlog(PR_TRACE,
+ "SURV: Received heartbeat acknowledge from FSP\n");
+ lock(&surv_lock);
+ fsp_surv_ack_pending = false;
+ unlock(&surv_lock);
+ } else {
+ /**
+ * @fwts-label FSPHeartbeatAckError
+ * @fwts-advice Error in acknowledging heartbeat to FSP.
+ * This could mean the FSP has gone away or it may mean
+ * the FSP may kill us for missing too many heartbeats.
+ */
+ prlog(PR_ERR,
+ "SURV: Heartbeat Acknowledgment error from FSP\n");
+ }
+
+ fsp_freemsg(msg);
+}
+
+static void fsp_surv_check_timeout(void)
+{
+ u64 now = mftb();
+
+ /*
+ * We just checked fsp_surv_ack_pending to be true in fsp_surv_hbeat
+ * and we haven't dropped the surv_lock between then and now. So, we
+ * just go ahead and check timeouts.
+ */
+ if (tb_compare(now, surv_ack_timer) == TB_AAFTERB) {
+ uint32_t plid = log_simple_error(&e_info(OPAL_RC_SURVE_ACK),
+ "SURV: Surv ACK timed out; initiating R/R\n");
+
+ /* Reset the pending trigger too */
+ fsp_surv_ack_pending = false;
+ fsp_trigger_reset(plid);
+ }
+
+ return;
+}
+
+/* Send surveillance heartbeat based on a timebase trigger */
+static void fsp_surv_hbeat(void)
+{
+ u64 now = mftb();
+ struct fsp_msg *msg;
+
+ /* Check if an ack is pending... if so, don't send the ping just yet */
+ if (fsp_surv_ack_pending) {
+ fsp_surv_check_timeout();
+ return;
+ }
+
+ /* add timebase callbacks */
+ /*
+ * XXX This packet needs to be pushed to the FSP at an interval
+ * shorter than the 120s advertised to the FSP.
+ *
+ * Verify that the command building format and call are correct.
+ */
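+ /*
+ * Timing sketch from the code below: the heartbeat advertises a
+ * 120s interval to the FSP, is re-sent every 60s, and the ack is
+ * given FSP_SURV_ACK_TIMEOUT seconds before fsp_surv_check_timeout()
+ * triggers a host initiated reset.
+ */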
+ if (surv_timer == 0 ||
+ (tb_compare(now, surv_timer) == TB_AAFTERB) ||
+ (tb_compare(now, surv_timer) == TB_AEQUALB)) {
+ prlog(PR_TRACE,
+ "SURV: Sending the heartbeat command to FSP\n");
+ msg = fsp_mkmsg(FSP_CMD_SURV_HBEAT, 1, 120);
+ if (!msg) {
+ prerror("SURV: Failed to allocate heartbeat msg\n");
+ return;
+ }
+ if (fsp_queue_msg(msg, fsp_surv_ack)) {
+ fsp_freemsg(msg);
+ prerror("SURV: Failed to queue heartbeat msg\n");
+ } else {
+ fsp_surv_ack_pending = true;
+ surv_timer = now + secs_to_tb(60);
+ surv_ack_timer = now + secs_to_tb(FSP_SURV_ACK_TIMEOUT);
+ }
+ }
+}
+
+static void fsp_surv_poll(void *data __unused)
+{
+ if (!fsp_surv_state)
+ return;
+ lock(&surv_lock);
+ fsp_surv_hbeat();
+ unlock(&surv_lock);
+}
+
+static void fsp_surv_got_param(uint32_t param_id __unused, int err_len,
+ void *data __unused)
+{
+ if (err_len != 4) {
+ uint32_t plid = log_simple_error(&e_info(OPAL_RC_SURVE_STATUS),
+ "SURV: Error (%d) retrieving surv status; initiating R/R\n",
+ err_len);
+ fsp_trigger_reset(plid);
+ return;
+ }
+
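+ /* The FSP returns the parameter big-endian; convert it in place
+ * before testing the enable bit (bit 0). */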
+ surv_state_param = be32_to_cpu((__be32)surv_state_param);
+ if (!(surv_state_param & 0x01)) {
+ prlog(PR_NOTICE, "SURV: Status from FSP: disabled\n");
+ return;
+ }
+ prlog(PR_NOTICE, "SURV: Status from FSP: enabled\n");
+
+ lock(&surv_lock);
+ fsp_surv_state = true;
+
+ /* Also send one heartbeat now. The next one will not happen
+ * until we hit the OS.
+ */
+ fsp_surv_hbeat();
+ unlock(&surv_lock);
+}
+
+void fsp_surv_query(void)
+{
+ int rc;
+
+ printf("SURV: Querying FSP's surveillance status\n");
+
+ /* Reset surveillance settings */
+ lock(&surv_lock);
+ fsp_surv_state = false;
+ surv_timer = 0;
+ surv_ack_timer = 0;
+ unlock(&surv_lock);
+
+ /* Query FSP for surveillance state */
+ rc = fsp_get_sys_param(SYS_PARAM_SURV, &surv_state_param, 4,
+ fsp_surv_got_param, NULL);
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_SURVE_INIT),
+ "SURV: Error %d queueing param request\n", rc);
+ }
+}
+
+static bool fsp_surv_msg_rr(u32 cmd_sub_mod, struct fsp_msg *msg)
+{
+ assert(msg == NULL);
+
+ switch (cmd_sub_mod) {
+ case FSP_RESET_START:
+ printf("SURV: Disabling surveillance\n");
+ lock(&surv_lock);
+ fsp_surv_state = false;
+ fsp_surv_ack_pending = false;
+ unlock(&surv_lock);
+ return true;
+ case FSP_RELOAD_COMPLETE:
+ fsp_surv_query();
+ return true;
+ }
+ return false;
+}
+
+static struct fsp_client fsp_surv_client_rr = {
+ .message = fsp_surv_msg_rr,
+};
+
+/* This is called at boot time */
+void fsp_init_surveillance(void)
+{
+ /* Always register the poller, so we don't have to add/remove
+ * it on reset-reload or change of surveillance state. Also the
+ * poller list has no locking so we don't want to play with it
+ * at runtime.
+ */
+ opal_add_poller(fsp_surv_poll, NULL);
+
+ /* Register for the reset/reload event */
+ fsp_register_client(&fsp_surv_client_rr, FSP_MCLASS_RR_EVENT);
+
+ /* Send query to FSP */
+ fsp_surv_query();
+}
+
diff --git a/roms/skiboot/hw/fsp/fsp-sysdump.c b/roms/skiboot/hw/fsp/fsp-sysdump.c
new file mode 100644
index 000000000..cd8744062
--- /dev/null
+++ b/roms/skiboot/hw/fsp/fsp-sysdump.c
@@ -0,0 +1,407 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Sapphire dump design:
+ * - During initialization we setup Memory Dump Source Table (MDST) table
+ * which contains address, size pair.
+ * - We send MDST table update notification to FSP via MBOX command.
+ * - During Sapphire checkstop:
+ * - FSP retrieves HWDUMP.
+ * - FSP retrieves CEC memory based on MDST table.
+ * - Once Sapphire reboots, FSP sends a new dump available notification via HDAT
+ *
+ * Copyright 2013-2016 IBM Corp.
+ */
+
+#include <fsp.h>
+#include <psi.h>
+#include <opal.h>
+#include <lock.h>
+#include <skiboot.h>
+#include <errorlog.h>
+#include <opal-dump.h>
+
+/*
+ * Sapphire dump size
+ * This is the maximum memory that FSP can retrieve during checkstop.
+ *
+ * Note:
+ * Presently we are hardcoding this parameter. Eventually we need
+ * new System parameter so that we can get max size dynamically.
+ */
+#define MAX_SAPPHIRE_DUMP_SIZE 0x1000000
+
+DEFINE_LOG_ENTRY(OPAL_RC_DUMP_MDST_INIT, OPAL_PLATFORM_ERR_EVT, OPAL_DUMP,
+ OPAL_PLATFORM_FIRMWARE, OPAL_PREDICTIVE_ERR_FAULT_RECTIFY_REBOOT,
+ OPAL_NA);
+
+DEFINE_LOG_ENTRY(OPAL_RC_DUMP_MDST_UPDATE, OPAL_PLATFORM_ERR_EVT, OPAL_DUMP,
+ OPAL_PLATFORM_FIRMWARE,
+ OPAL_PREDICTIVE_ERR_FAULT_RECTIFY_REBOOT,
+ OPAL_NA);
+
+DEFINE_LOG_ENTRY(OPAL_RC_DUMP_MDST_ADD, OPAL_PLATFORM_ERR_EVT, OPAL_DUMP,
+ OPAL_PLATFORM_FIRMWARE, OPAL_INFO, OPAL_NA);
+
+DEFINE_LOG_ENTRY(OPAL_RC_DUMP_MDST_REMOVE, OPAL_PLATFORM_ERR_EVT, OPAL_DUMP,
+ OPAL_PLATFORM_FIRMWARE, OPAL_INFO, OPAL_NA);
+
+
+static struct mdst_table *mdst_table;
+static struct mdst_table *dump_mem_region;
+
+static int cur_mdst_entry;
+static int max_mdst_entry;
+static int cur_dump_size;
+/*
+ * Presently both sizes are the same, but if someday the FSP gives us
+ * more space than our TCE mapping we will need this validation.
+ *
+ * Also, once the FSP implements a MAX_SAPPHIRE_DUMP_SIZE system
+ * parameter, we can move this validation to a separate function.
+ */
+static int max_dump_size = MIN(MAX_SAPPHIRE_DUMP_SIZE, PSI_DMA_HYP_DUMP_SIZE);
+
+/* Protect MDST table entries */
+static struct lock mdst_lock = LOCK_UNLOCKED;
+
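+/*
+ * The mapped size covers whole TCE pages. For illustration (assuming a
+ * 4KB TCE_PSIZE): addr = 0x1234, size = 0x100 covers 0x1000..0x2000,
+ * so get_dump_region_map_size() returns 0x1000.
+ */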
+static inline uint32_t get_dump_region_map_size(uint64_t addr, uint32_t size)
+{
+ uint64_t start, end;
+
+ start = addr & ~TCE_MASK;
+ end = addr + size;
+ end = ALIGN_UP(end, TCE_PSIZE);
+
+ return (end - start);
+}
+
+static int dump_region_tce_map(void)
+{
+ int i;
+ uint32_t t_size = 0, size;
+ uint64_t addr;
+
+ for (i = 0; i < cur_mdst_entry; i++) {
+
+ addr = be64_to_cpu(dump_mem_region[i].addr) & ~TCE_MASK;
+ size = get_dump_region_map_size(be64_to_cpu(dump_mem_region[i].addr),
+ be32_to_cpu(dump_mem_region[i].size));
+
+ if (t_size + size > max_dump_size)
+ break;
+
+ /* TCE mapping */
+ fsp_tce_map(PSI_DMA_HYP_DUMP + t_size, (void *)addr, size);
+
+ /* Add entry to MDST table */
+ mdst_table[i].data_region = dump_mem_region[i].data_region;
+ mdst_table[i].size = dump_mem_region[i].size;
+ mdst_table[i].addr = cpu_to_be64(PSI_DMA_HYP_DUMP + t_size);
+
+ /* TCE alignment adjustment */
+ mdst_table[i].addr = cpu_to_be64(be64_to_cpu(mdst_table[i].addr) +
+ (be64_to_cpu(dump_mem_region[i].addr) & 0xfff));
+
+ t_size += size;
+ }
+
+ return i;
+}
+
+static inline void dump_region_tce_unmap(void)
+{
+ fsp_tce_unmap(PSI_DMA_HYP_DUMP, PSI_DMA_HYP_DUMP_SIZE);
+}
+
+static void update_mdst_table_complete(struct fsp_msg *msg)
+{
+ uint8_t status = (msg->resp->word1 >> 8) & 0xff;
+
+ if (status)
+ log_simple_error(&e_info(OPAL_RC_DUMP_MDST_UPDATE),
+ "MDST: Update table MBOX command failed: "
+ "0x%x\n", status);
+ else
+ printf("MDST: Table updated.\n");
+
+ fsp_freemsg(msg);
+}
+
+/* Send MDST table to FSP */
+static int64_t fsp_update_mdst_table(void)
+{
+ struct fsp_msg *msg;
+ int count;
+ int rc = OPAL_SUCCESS;
+
+ if (cur_mdst_entry <= 0) {
+ printf("MDST: Table is empty\n");
+ return OPAL_INTERNAL_ERROR;
+ }
+
+ lock(&mdst_lock);
+
+ /* Unmap previous mapping */
+ dump_region_tce_unmap();
+ count = dump_region_tce_map();
+
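+ /* Data words sent to the FSP: 0, the TCE address of the MDST
+ * table, the total table size in bytes, and the size of one
+ * entry. */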
+ msg = fsp_mkmsg(FSP_CMD_HYP_MDST_TABLE, 4, 0,
+ PSI_DMA_MDST_TABLE,
+ sizeof(*mdst_table) * count,
+ sizeof(*mdst_table));
+ unlock(&mdst_lock);
+
+ if (!msg) {
+ log_simple_error(&e_info(OPAL_RC_DUMP_MDST_UPDATE),
+ "MDST: Message allocation failed.!\n");
+ rc = OPAL_INTERNAL_ERROR;
+ } else if (fsp_queue_msg(msg, update_mdst_table_complete)) {
+ log_simple_error(&e_info(OPAL_RC_DUMP_MDST_UPDATE),
+ "MDST: Failed to queue MDST table message.\n");
+ fsp_freemsg(msg);
+ rc = OPAL_INTERNAL_ERROR;
+ }
+ return rc;
+}
+
+static int dump_region_del_entry(uint32_t id)
+{
+ int i;
+ uint32_t size;
+ bool found = false;
+ int rc = OPAL_SUCCESS;
+
+ lock(&mdst_lock);
+
+ for (i = 0; i < cur_mdst_entry; i++) {
+ if (dump_mem_region[i].data_region != id)
+ continue;
+
+ found = true;
+ break;
+ }
+
+ if (!found) {
+ rc = OPAL_PARAMETER;
+ goto del_out;
+ }
+
+ /* Adjust current dump size */
+ size = get_dump_region_map_size(be64_to_cpu(dump_mem_region[i].addr),
+ be32_to_cpu(dump_mem_region[i].size));
+ cur_dump_size -= size;
+
+ for ( ; i < cur_mdst_entry - 1; i++)
+ dump_mem_region[i] = dump_mem_region[i + 1];
+
+ dump_mem_region[i].data_region = 0;
+ cur_mdst_entry--;
+
+del_out:
+ unlock(&mdst_lock);
+ return rc;
+}
+
+/* Add entry to MDST table */
+static int __dump_region_add_entry(uint32_t id, uint64_t addr, uint32_t size)
+{
+ int rc = OPAL_INTERNAL_ERROR;
+ uint32_t act_size;
+
+ /* Delete function takes lock before modifying table */
+ dump_region_del_entry(id);
+
+ lock(&mdst_lock);
+
+ if (cur_mdst_entry >= max_mdst_entry) {
+ log_simple_error(&e_info(OPAL_RC_DUMP_MDST_ADD),
+ "MDST: Table is full.\n");
+ goto out;
+ }
+
+ /* TCE alignment adjustment */
+ act_size = get_dump_region_map_size(addr, size);
+
+ /* Make sure we don't cross dump size limit */
+ if (cur_dump_size + act_size > max_dump_size) {
+ log_simple_error(&e_info(OPAL_RC_DUMP_MDST_ADD),
+ "MDST: 0x%x is crossing max dump size (0x%x) limit.\n",
+ cur_dump_size + act_size, max_dump_size);
+ goto out;
+ }
+
+ /* Add entry to dump memory region table */
+ dump_mem_region[cur_mdst_entry].data_region = (u8)id;
+ dump_mem_region[cur_mdst_entry].addr = cpu_to_be64(addr);
+ dump_mem_region[cur_mdst_entry].size = cpu_to_be32(size);
+
+ /* Update dump region count and dump size */
+ cur_mdst_entry++;
+ cur_dump_size += act_size;
+
+ printf("MDST: Addr = 0x%llx [size : 0x%x bytes] added to MDST table.\n",
+ (uint64_t)addr, size);
+
+ rc = OPAL_SUCCESS;
+
+out:
+ unlock(&mdst_lock);
+ return rc;
+}
+
+static int dump_region_add_entries(void)
+{
+ int rc;
+
+ /* Add console buffer */
+ rc = __dump_region_add_entry(DUMP_REGION_CONSOLE,
+ INMEM_CON_START, INMEM_CON_LEN);
+ if (rc)
+ return rc;
+
+ /* Add HBRT buffer */
+ rc = __dump_region_add_entry(DUMP_REGION_HBRT_LOG,
+ HBRT_CON_START, HBRT_CON_LEN);
+
+ return rc;
+}
+
+static int64_t fsp_opal_register_dump_region(uint32_t id,
+ uint64_t addr, uint64_t size)
+{
+ int rc = OPAL_SUCCESS;
+
+ if (!fsp_present())
+ return OPAL_UNSUPPORTED;
+
+ /* Validate memory region id */
+ if (id < DUMP_REGION_HOST_START || id > DUMP_REGION_HOST_END) {
+ log_simple_error(&e_info(OPAL_RC_DUMP_MDST_ADD),
+ "MDST: Invalid dump region id : 0x%x\n", id);
+ return OPAL_PARAMETER;
+ }
+
+ if (size <= 0) {
+ log_simple_error(&e_info(OPAL_RC_DUMP_MDST_ADD),
+ "MDST: Invalid size : 0x%llx\n", size);
+ return OPAL_PARAMETER;
+ }
+
+ rc = __dump_region_add_entry(id, addr, size);
+ if (rc)
+ return rc;
+
+ /* Send updated MDST to FSP */
+ rc = fsp_update_mdst_table();
+
+ return rc;
+}
+
+static int64_t fsp_opal_unregister_dump_region(uint32_t id)
+{
+ int rc = OPAL_SUCCESS;
+
+ if (!fsp_present())
+ return OPAL_UNSUPPORTED;
+
+ /* Validate memory region id */
+ if (id < DUMP_REGION_HOST_START || id > DUMP_REGION_HOST_END) {
+ log_simple_error(&e_info(OPAL_RC_DUMP_MDST_REMOVE),
+ "MDST: Invalid dump region id : 0x%x\n", id);
+ return OPAL_PARAMETER;
+ }
+
+ rc = dump_region_del_entry(id);
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_DUMP_MDST_REMOVE),
+ "MDST: dump region id : 0x%x not found\n", id);
+ return OPAL_PARAMETER;
+ }
+
+ /* Send updated MDST to FSP */
+ rc = fsp_update_mdst_table();
+
+ return rc;
+}
+
+/* TCE mapping */
+static inline void mdst_table_tce_map(void)
+{
+ fsp_tce_map(PSI_DMA_MDST_TABLE, mdst_table, PSI_DMA_MDST_TABLE_SIZE);
+}
+
+/* Initialize MDST table */
+static int mdst_table_init(void)
+{
+ dump_mem_region = memalign(TCE_PSIZE, PSI_DMA_MDST_TABLE_SIZE);
+ if (!dump_mem_region) {
+ log_simple_error(&e_info(OPAL_RC_DUMP_MDST_INIT),
+ "MDST: Failed to allocate memory for dump "
+ "memory region table.\n");
+ return -ENOMEM;
+ }
+
+ memset(dump_mem_region, 0, PSI_DMA_MDST_TABLE_SIZE);
+
+ mdst_table = memalign(TCE_PSIZE, PSI_DMA_MDST_TABLE_SIZE);
+ if (!mdst_table) {
+ log_simple_error(&e_info(OPAL_RC_DUMP_MDST_INIT),
+ "MDST: Failed to allocate memory for MDST table.\n");
+ return -ENOMEM;
+ }
+
+ memset(mdst_table, 0, PSI_DMA_MDST_TABLE_SIZE);
+ mdst_table_tce_map();
+
+ max_mdst_entry = PSI_DMA_MDST_TABLE_SIZE / sizeof(*mdst_table);
+ printf("MDST: Max entries in MDST table : %d\n", max_mdst_entry);
+
+ return OPAL_SUCCESS;
+}
+
+/*
+ * Handle FSP R/R event.
+ */
+static bool fsp_mdst_update_rr(uint32_t cmd_sub_mod,
+ struct fsp_msg *msg __unused)
+{
+ switch (cmd_sub_mod) {
+ case FSP_RESET_START:
+ return true;
+ case FSP_RELOAD_COMPLETE: /* Send MDST to FSP */
+ fsp_update_mdst_table();
+ return true;
+ }
+ return false;
+}
+
+static struct fsp_client fsp_mdst_client_rr = {
+ .message = fsp_mdst_update_rr,
+};
+
+/* Initialize MDST table and send notification to FSP */
+void fsp_mdst_table_init(void)
+{
+ if (!fsp_present())
+ return;
+
+ /* OPAL interface */
+ opal_register(OPAL_REGISTER_DUMP_REGION,
+ fsp_opal_register_dump_region, 3);
+ opal_register(OPAL_UNREGISTER_DUMP_REGION,
+ fsp_opal_unregister_dump_region, 1);
+
+ /* Initiate MDST */
+ if (mdst_table_init() != OPAL_SUCCESS)
+ return;
+
+ /*
+ * Ignore the return code from dump_region_add_entries() so that
+ * we can at least capture a partial dump.
+ */
+ dump_region_add_entries();
+ fsp_update_mdst_table();
+
+ /* Register for Class AA (FSP R/R) */
+ fsp_register_client(&fsp_mdst_client_rr, FSP_MCLASS_RR_EVENT);
+}
diff --git a/roms/skiboot/hw/fsp/fsp-sysparam.c b/roms/skiboot/hw/fsp/fsp-sysparam.c
new file mode 100644
index 000000000..adb424e5e
--- /dev/null
+++ b/roms/skiboot/hw/fsp/fsp-sysparam.c
@@ -0,0 +1,508 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * There are some system-level parameters that aren't available over
+ * IPMI or NVRAM but that the FSP exposes through this interface.
+ *
+ * We expose these through an OPAL API as there really isn't any other/better
+ * way of doing so.
+ *
+ * Copyright 2013-2017 IBM Corp.
+ */
+
+#include <skiboot.h>
+#include <fsp.h>
+#include <opal.h>
+#include <device.h>
+#include <lock.h>
+#include <processor.h>
+#include <psi.h>
+#include <opal-msg.h>
+#include <fsp-sysparam.h>
+
+struct sysparam_comp_data {
+ uint32_t param_len;
+ uint64_t async_token;
+};
+
+struct sysparam_req {
+ sysparam_compl_t completion;
+ void *comp_data;
+ void *ubuf;
+ uint32_t ulen;
+ struct fsp_msg msg;
+ struct fsp_msg resp;
+ bool done;
+};
+
+static struct sysparam_attr {
+ const char *name;
+ uint32_t id;
+ uint32_t length;
+ uint8_t perm;
+} sysparam_attrs[] = {
+#define _R OPAL_SYSPARAM_READ
+#define _W OPAL_SYSPARAM_WRITE
+#define _RW OPAL_SYSPARAM_RW
+ {"surveillance", SYS_PARAM_SURV, 4, _RW},
+ {"hmc-management", SYS_PARAM_HMC_MANAGED, 4, _R},
+ {"cupd-policy", SYS_PARAM_FLASH_POLICY, 4, _RW},
+ {"plat-hmc-managed", SYS_PARAM_NEED_HMC, 4, _RW},
+ {"fw-license-policy", SYS_PARAM_FW_LICENSE, 4, _RW},
+ {"world-wide-port-num", SYS_PARAM_WWPN, 12, _W},
+ {"default-boot-device", SYS_PARAM_DEF_BOOT_DEV, 1, _RW},
+ {"next-boot-device", SYS_PARAM_NEXT_BOOT_DEV,1, _RW},
+ {"console-select", SYS_PARAM_CONSOLE_SELECT,1, _RW},
+ {"boot-device-path", SYS_PARAM_BOOT_DEV_PATH,48, _RW}
+#undef _R
+#undef _W
+#undef _RW
+};
+
+static int fsp_sysparam_process(struct sysparam_req *r)
+{
+ u32 param_id, len;
+ int stlen = 0;
+ u8 fstat;
+ /* Snapshot completion before we set the "done" flag */
+ sysparam_compl_t comp = r->completion;
+ void *cdata = r->comp_data;
+
+ if (r->msg.state != fsp_msg_done) {
+ prerror("FSP: Request for sysparam 0x%x got FSP failure!\n",
+ fsp_msg_get_data_word(&r->msg, 0));
+ stlen = -1; /* XXX Find saner error codes */
+ goto complete;
+ }
+
+ param_id = fsp_msg_get_data_word(&r->resp, 0);
+ len = fsp_msg_get_data_word(&r->resp, 1) & 0xffff;
+
+ /* Check params validity */
+ if (param_id != fsp_msg_get_data_word(&r->msg, 0)) {
+ prerror("FSP: Request for sysparam 0x%x got resp. for 0x%x!\n",
+ fsp_msg_get_data_word(&r->msg, 0), param_id);
+ stlen = -2; /* XXX Sane error codes */
+ goto complete;
+ }
+ if (len > r->ulen) {
+ prerror("FSP: Request for sysparam 0x%x truncated!\n",
+ param_id);
+ len = r->ulen;
+ }
+
+ /* Decode the request status */
+ fstat = (r->msg.resp->word1 >> 8) & 0xff;
+ switch(fstat) {
+ case 0x00: /* XXX Is that even possible ? */
+ case 0x11: /* Data in request */
+ memcpy(r->ubuf, &r->resp.data.bytes[8], len);
+ /* fallthrough */
+ case 0x12: /* Data in TCE */
+ stlen = len;
+ break;
+ default:
+ stlen = -fstat;
+ }
+ complete:
+ /* Call completion if any */
+ if (comp)
+ comp(fsp_msg_get_data_word(&r->msg, 0), stlen, cdata);
+
+ free(r);
+
+ return stlen;
+}
+
+static void fsp_sysparam_get_complete(struct fsp_msg *msg)
+{
+ struct sysparam_req *r = container_of(msg, struct sysparam_req, msg);
+
+ /* If it's an asynchronous request, process it now */
+ if (r->completion) {
+ fsp_sysparam_process(r);
+ return;
+ }
+
+ /* Else just set the done flag */
+
+ /* Another CPU can be polling on the "done" flag without the
+ * lock held, so let's order the updates to the structure
+ */
+ lwsync();
+ r->done = true;
+}
+
+int fsp_get_sys_param(uint32_t param_id, void *buffer, uint32_t length,
+ sysparam_compl_t async_complete, void *comp_data)
+{
+ struct sysparam_req *r;
+ uint64_t baddr, tce_token;
+ int rc;
+
+ if (!fsp_present())
+ return -ENODEV;
+ /*
+ * XXX FIXME: We currently always allocate the sysparam_req here
+ * however, we want to avoid runtime allocations as much as
+ * possible, so if this is going to be used a lot at runtime,
+ * we probably want to pre-allocate a pool of these
+ */
+ if (length > 4096)
+ return -EINVAL;
+ r = zalloc(sizeof(struct sysparam_req));
+ if (!r)
+ return -ENOMEM;
+ r->completion = async_complete;
+ r->comp_data = comp_data;
+ r->done = false;
+ r->ubuf = buffer;
+ r->ulen = length;
+ r->msg.resp = &r->resp;
+
+ /* Always map 1 page ... it's easier that way and none of this
+ * is performance critical
+ */
+ baddr = (uint64_t)buffer;
+ fsp_tce_map(PSI_DMA_GET_SYSPARAM, (void *)(baddr & ~0xffful), 0x1000);
+ tce_token = PSI_DMA_GET_SYSPARAM | (baddr & 0xfff);
+ fsp_fillmsg(&r->msg, FSP_CMD_QUERY_SPARM, 3,
+ param_id, length, tce_token);
+ rc = fsp_queue_msg(&r->msg, fsp_sysparam_get_complete);
+
+ if (rc)
+ free(r);
+
+ /* Asynchronous operation or queueing failure, return */
+ if (rc || async_complete)
+ return rc;
+
+ /* Synchronous operation requested, spin and process */
+ while(!r->done)
+ opal_run_pollers();
+
+ /* Will free the request */
+ return fsp_sysparam_process(r);
+}
+
+static void fsp_opal_getparam_complete(uint32_t param_id __unused, int err_len,
+ void *data)
+{
+ struct sysparam_comp_data *comp_data = data;
+ int rc = OPAL_SUCCESS;
+
+ if (comp_data->param_len != err_len)
+ rc = OPAL_INTERNAL_ERROR;
+
+ opal_queue_msg(OPAL_MSG_ASYNC_COMP, NULL, NULL,
+ cpu_to_be64(comp_data->async_token),
+ cpu_to_be64(rc));
+ free(comp_data);
+}
+
+static void fsp_opal_setparam_complete(struct fsp_msg *msg)
+{
+ struct sysparam_comp_data *comp_data = msg->user_data;
+ u8 fstat;
+ uint32_t param_id;
+ int rc = OPAL_SUCCESS;
+
+ if (msg->state != fsp_msg_done) {
+ prerror("FSP: Request for set sysparam 0x%x got FSP failure!\n",
+ fsp_msg_get_data_word(msg, 0));
+ rc = OPAL_INTERNAL_ERROR;
+ goto out;
+ }
+
+ param_id = fsp_msg_get_data_word(msg->resp, 0);
+ if (param_id != fsp_msg_get_data_word(msg, 0)) {
+ prerror("FSP: Request for set sysparam 0x%x got resp. for 0x%x!"
+ "\n", fsp_msg_get_data_word(msg, 0), param_id);
+ rc = OPAL_INTERNAL_ERROR;
+ goto out;
+ }
+
+ fstat = (msg->resp->word1 >> 8) & 0xff;
+ switch (fstat) {
+ case 0x00:
+ rc = OPAL_SUCCESS;
+ break;
+ case 0x22:
+ prerror("%s: Response status 0x%x, invalid data\n", __func__,
+ fstat);
+ rc = OPAL_INTERNAL_ERROR;
+ break;
+ case 0x24:
+ prerror("%s: Response status 0x%x, DMA error\n", __func__,
+ fstat);
+ rc = OPAL_INTERNAL_ERROR;
+ break;
+ default:
+ rc = OPAL_INTERNAL_ERROR;
+ break;
+ }
+
+out:
+ opal_queue_msg(OPAL_MSG_ASYNC_COMP, NULL, NULL,
+ cpu_to_be64(comp_data->async_token),
+ cpu_to_be64(rc));
+ free(comp_data);
+ fsp_freemsg(msg);
+}
+
+/* OPAL interface for PowerNV to read the system parameter from FSP */
+static int64_t fsp_opal_get_param(uint64_t async_token, uint32_t param_id,
+ uint64_t buffer, uint64_t length)
+{
+ struct sysparam_comp_data *comp_data;
+ int count, rc, i;
+
+ if (!fsp_present())
+ return OPAL_HARDWARE;
+
+ count = ARRAY_SIZE(sysparam_attrs);
+ for (i = 0; i < count; i++)
+ if (sysparam_attrs[i].id == param_id)
+ break;
+ if (i == count)
+ return OPAL_PARAMETER;
+
+ if (length < sysparam_attrs[i].length)
+ return OPAL_PARAMETER;
+ if (!(sysparam_attrs[i].perm & OPAL_SYSPARAM_READ))
+ return OPAL_PERMISSION;
+
+ comp_data = zalloc(sizeof(struct sysparam_comp_data));
+ if (!comp_data)
+ return OPAL_NO_MEM;
+
+ comp_data->param_len = sysparam_attrs[i].length;
+ comp_data->async_token = async_token;
+ rc = fsp_get_sys_param(param_id, (void *)buffer,
+ sysparam_attrs[i].length, fsp_opal_getparam_complete,
+ comp_data);
+ if (rc) {
+ free(comp_data);
+ prerror("%s: Error %d queuing param request\n", __func__, rc);
+ return OPAL_INTERNAL_ERROR;
+ }
+
+ return OPAL_ASYNC_COMPLETION;
+}
+
+/* OPAL interface for PowerNV to update the system parameter to FSP */
+static int64_t fsp_opal_set_param(uint64_t async_token, uint32_t param_id,
+ uint64_t buffer, uint64_t length)
+{
+ struct sysparam_comp_data *comp_data;
+ struct fsp_msg *msg;
+ uint64_t tce_token;
+ int count, rc, i;
+
+ if (!fsp_present())
+ return OPAL_HARDWARE;
+
+ count = ARRAY_SIZE(sysparam_attrs);
+ for (i = 0; i < count; i++)
+ if (sysparam_attrs[i].id == param_id)
+ break;
+ if (i == count)
+ return OPAL_PARAMETER;
+
+ if (length < sysparam_attrs[i].length)
+ return OPAL_PARAMETER;
+ if (!(sysparam_attrs[i].perm & OPAL_SYSPARAM_WRITE))
+ return OPAL_PERMISSION;
+
+ fsp_tce_map(PSI_DMA_SET_SYSPARAM, (void *)(buffer & ~0xffful), 0x1000);
+ tce_token = PSI_DMA_SET_SYSPARAM | (buffer & 0xfff);
+
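+ /* The 64-bit TCE token is passed to the FSP as two 32-bit data
+ * words (high then low). */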
+ msg = fsp_mkmsg(FSP_CMD_SET_SPARM_2, 4, param_id, length,
+ tce_token >> 32, tce_token);
+ if (!msg) {
+ prerror("%s: Failed to allocate the message\n", __func__);
+ return OPAL_INTERNAL_ERROR;
+ }
+
+ comp_data = zalloc(sizeof(struct sysparam_comp_data));
+ if (!comp_data) {
+ fsp_freemsg(msg);
+ return OPAL_NO_MEM;
+ }
+
+ comp_data->param_len = length;
+ comp_data->async_token = async_token;
+ msg->user_data = comp_data;
+
+ rc = fsp_queue_msg(msg, fsp_opal_setparam_complete);
+ if (rc) {
+ free(comp_data);
+ fsp_freemsg(msg);
+ prerror("%s: Failed to queue the message\n", __func__);
+ return OPAL_INTERNAL_ERROR;
+ }
+
+ return OPAL_ASYNC_COMPLETION;
+}
+
+struct sysparam_notify_entry {
+ struct list_node link;
+ sysparam_update_notify notify;
+};
+
+static LIST_HEAD(sysparam_update_notifiers);
+
+/* Add client to notifier chain */
+void sysparam_add_update_notifier(sysparam_update_notify notify)
+{
+ struct sysparam_notify_entry *entry;
+
+ entry = zalloc(sizeof(struct sysparam_notify_entry));
+ assert(entry);
+
+ entry->notify = notify;
+ list_add_tail(&sysparam_update_notifiers, &entry->link);
+}
+
+/* Remove client from notifier chain */
+void sysparam_del_update_notifier(sysparam_update_notify notify)
+{
+ struct sysparam_notify_entry *entry;
+
+ list_for_each(&sysparam_update_notifiers, entry, link) {
+ if (entry->notify == notify) {
+ list_del(&entry->link);
+ free(entry);
+ return;
+ }
+ }
+}
+
+/* Update notification chain */
+static void sysparam_run_update_notifier(struct fsp_msg *msg)
+{
+ bool ret;
+ struct sysparam_notify_entry *entry;
+
+ list_for_each(&sysparam_update_notifiers, entry, link) {
+ ret = entry->notify(msg);
+ if (ret == true)
+ break;
+ }
+}
+
+static bool fsp_sysparam_msg(u32 cmd_sub_mod, struct fsp_msg *msg)
+{
+ struct fsp_msg *rsp;
+ int rc = -ENOMEM;
+
+ switch(cmd_sub_mod) {
+ case FSP_CMD_SP_SPARM_UPD_0:
+ case FSP_CMD_SP_SPARM_UPD_1:
+ printf("FSP: Got sysparam update, param ID 0x%x\n",
+ fsp_msg_get_data_word(msg, 0));
+
+ sysparam_run_update_notifier(msg);
+
+ rsp = fsp_mkmsg((cmd_sub_mod & 0xffff00) | 0x008000, 0);
+ if (rsp)
+ rc = fsp_queue_msg(rsp, fsp_freemsg);
+ if (rc) {
+ prerror("FSP: Error %d queuing sysparam reply\n", rc);
+ /* What to do here ? R/R ? */
+ fsp_freemsg(rsp);
+ }
+ return true;
+ }
+ return false;
+}
+
+static struct fsp_client fsp_sysparam_client = {
+ .message = fsp_sysparam_msg,
+};
+
+static void add_opal_sysparam_node(void)
+{
+ struct dt_node *sysparams;
+ char *names, *s;
+ __be32 *ids, *lens;
+ uint8_t *perms;
+ unsigned int i, count, size = 0;
+
+ if (!fsp_present())
+ return;
+
+ sysparams = dt_new(opal_node, "sysparams");
+ dt_add_property_string(sysparams, "compatible", "ibm,opal-sysparams");
+
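+ /*
+ * "param-name" is built as NUL-terminated strings packed back to
+ * back; "param-id", "param-len" and "param-perm" are parallel
+ * arrays indexed the same way.
+ */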
+ count = ARRAY_SIZE(sysparam_attrs);
+ for (i = 0; i < count; i++)
+ size = size + strlen(sysparam_attrs[i].name) + 1;
+
+ names = zalloc(size);
+ if (!names) {
+ prerror("%s: Failed to allocate memory for parameter names\n",
+ __func__);
+ return;
+ }
+
+ ids = zalloc(count * sizeof(*ids));
+ if (!ids) {
+ prerror("%s: Failed to allocate memory for parameter ids\n",
+ __func__);
+ goto out_free_name;
+ }
+
+ lens = zalloc(count * sizeof(*lens));
+ if (!lens) {
+ prerror("%s: Failed to allocate memory for parameter length\n",
+ __func__);
+ goto out_free_id;
+ }
+
+ perms = zalloc(count * sizeof(*perms));
+ if (!perms) {
+ prerror("%s: Failed to allocate memory for parameter length\n",
+ __func__);
+ goto out_free_len;
+ }
+
+ s = names;
+ for (i = 0; i < count; i++) {
+ strcpy(s, sysparam_attrs[i].name);
+ s = s + strlen(sysparam_attrs[i].name) + 1;
+
+ ids[i] = cpu_to_be32(sysparam_attrs[i].id);
+ lens[i] = cpu_to_be32(sysparam_attrs[i].length);
+ perms[i] = sysparam_attrs[i].perm;
+ }
+
+ dt_add_property(sysparams, "param-name", names, size);
+ dt_add_property(sysparams, "param-id", ids, count * sizeof(*ids));
+ dt_add_property(sysparams, "param-len", lens, count * sizeof(*lens));
+ dt_add_property(sysparams, "param-perm", perms, count * sizeof(*perms));
+
+ free(perms);
+
+out_free_len:
+ free(lens);
+out_free_id:
+ free(ids);
+out_free_name:
+ free(names);
+}
+
+void fsp_sysparam_init(void)
+{
+ if (!fsp_present())
+ return;
+
+ /* Register change notifications */
+ fsp_register_client(&fsp_sysparam_client, FSP_MCLASS_SERVICE);
+
+ /* Register OPAL interfaces */
+ opal_register(OPAL_GET_PARAM, fsp_opal_get_param, 4);
+ opal_register(OPAL_SET_PARAM, fsp_opal_set_param, 4);
+
+ /* Add device-tree nodes */
+ add_opal_sysparam_node();
+}
diff --git a/roms/skiboot/hw/fsp/fsp.c b/roms/skiboot/hw/fsp/fsp.c
new file mode 100644
index 000000000..2c5f9d71b
--- /dev/null
+++ b/roms/skiboot/hw/fsp/fsp.c
@@ -0,0 +1,2709 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Base FSP (Flexible Service Processor) Support
+ *
+ * FSP is the BMC-like thing in some IBM POWER servers
+ *
+ * Copyright 2013-2019 IBM Corp.
+ */
+
+#include <stdarg.h>
+#include <processor.h>
+#include <io.h>
+#include <fsp.h>
+#include <lock.h>
+#include <interrupts.h>
+#include <device.h>
+#include <trace.h>
+#include <timebase.h>
+#include <cpu.h>
+#include <errorlog.h>
+#include <opal.h>
+#include <opal-msg.h>
+#include <ccan/list/list.h>
+
+extern uint32_t hir_trigger;
+
+DEFINE_LOG_ENTRY(OPAL_RC_FSP_POLL_TIMEOUT, OPAL_PLATFORM_ERR_EVT, OPAL_FSP,
+ OPAL_PLATFORM_FIRMWARE, OPAL_RECOVERED_ERR_GENERAL, OPAL_NA);
+
+DEFINE_LOG_ENTRY(OPAL_RC_FSP_MBOX_ERR, OPAL_PLATFORM_ERR_EVT, OPAL_FSP,
+ OPAL_PLATFORM_FIRMWARE, OPAL_RECOVERED_ERR_GENERAL, OPAL_NA);
+
+DEFINE_LOG_ENTRY(OPAL_RC_FSP_DISR_HIR_MASK, OPAL_PLATFORM_ERR_EVT, OPAL_FSP,
+ OPAL_PLATFORM_FIRMWARE, OPAL_RECOVERED_ERR_GENERAL, OPAL_NA);
+
+/* We make this look like a Surveillance error, even though it really
+ * isn't one.
+ */
+DEFINE_LOG_ENTRY(OPAL_INJECTED_HIR, OPAL_MISC_ERR_EVT, OPAL_SURVEILLANCE,
+ OPAL_SURVEILLANCE_ERR, OPAL_PREDICTIVE_ERR_GENERAL,
+ OPAL_MISCELLANEOUS_INFO_ONLY);
+
+#define FSP_TRACE_MSG
+#define FSP_TRACE_EVENT
+
+#define FSP_MAX_IOPATH 4
+
+enum fsp_path_state {
+ fsp_path_bad,
+ fsp_path_backup,
+ fsp_path_active,
+};
+
+struct fsp_iopath {
+ enum fsp_path_state state;
+ void *fsp_regs;
+ struct psi *psi;
+};
+
+enum fsp_mbx_state {
+ fsp_mbx_idle, /* Mailbox ready to send */
+ fsp_mbx_send, /* Mailbox sent, waiting for ack */
+ fsp_mbx_crit_op, /* Critical operation in progress */
+ fsp_mbx_prep_for_reset, /* Prepare for reset sent */
+ fsp_mbx_hir_seq_done, /* HIR sequence done, link forced down */
+ fsp_mbx_err, /* Mailbox in error state, waiting for r&r */
+ fsp_mbx_rr, /* Mailbox in r&r */
+};
+
+struct fsp {
+ struct fsp *link;
+ unsigned int index;
+ enum fsp_mbx_state state;
+ struct fsp_msg *pending;
+
+ unsigned int iopath_count;
+ int active_iopath; /* -1: no active IO path */
+ struct fsp_iopath iopath[FSP_MAX_IOPATH];
+};
+
+enum ipl_state {
+ ipl_initial = 0x00000000,
+ ipl_opl_sent = 0x00000001,
+ ipl_got_continue = 0x00000002,
+ ipl_got_new_role = 0x00000004,
+ ipl_got_caps = 0x00000008,
+ ipl_got_fsp_functional = 0x00000010
+};
+static enum ipl_state ipl_state = ipl_initial;
+
+static struct fsp *first_fsp;
+static struct fsp *active_fsp;
+static u16 fsp_curseq = 0x8000;
+static __be64 *fsp_tce_table;
+
+#define FSP_INBOUND_SIZE 0x00100000UL
+static void *fsp_inbound_buf = NULL;
+static u32 fsp_inbound_off;
+
+static struct lock fsp_lock = LOCK_UNLOCKED;
+static struct lock fsp_poll_lock = LOCK_UNLOCKED;
+
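+/* Bitmap of command classes (see fsp_get_class_bit()) that have a
+ * response outstanding; cleared wholesale on FSP reset. */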
+static u64 fsp_cmdclass_resp_bitmask;
+static u64 timeout_timer;
+
+static u64 fsp_hir_timeout;
+
+#define FSP_CRITICAL_OP_TIMEOUT 128
+#define FSP_DRCR_CLEAR_TIMEOUT 128
+
+/* LID numbers. For now we hijack some of pHyp's own until I figure
+ * out the whole business with the MasterLID
+ */
+#define KERNEL_LID_PHYP 0x80a00701
+#define KERNEL_LID_OPAL 0x80f00101
+#define INITRAMFS_LID_OPAL 0x80f00102
+
+/*
+ * We keep track of the last logged values for some things so we print
+ * only on value changes, but also to relieve pressure on the tracer,
+ * which doesn't do a very good job at detecting repeats when called
+ * from many different CPUs.
+ */
+static u32 disr_last_print;
+static u32 drcr_last_print;
+static u32 hstate_last_print;
+
+void fsp_handle_resp(struct fsp_msg *msg);
+
+struct fsp_cmdclass {
+ int timeout;
+ bool busy;
+ struct list_head msgq;
+ struct list_head clientq;
+ struct list_head rr_queue; /* To queue up msgs during R/R */
+ u64 timesent;
+};
+
+static struct fsp_cmdclass fsp_cmdclass_rr;
+
+static struct fsp_cmdclass fsp_cmdclass[FSP_MCLASS_LAST - FSP_MCLASS_FIRST + 1]
+= {
+#define DEF_CLASS(_cl, _to) [_cl - FSP_MCLASS_FIRST] = { .timeout = _to }
+ DEF_CLASS(FSP_MCLASS_SERVICE, 16),
+ DEF_CLASS(FSP_MCLASS_PCTRL_MSG, 16),
+ DEF_CLASS(FSP_MCLASS_PCTRL_ABORTS, 16),
+ DEF_CLASS(FSP_MCLASS_ERR_LOG, 16),
+ DEF_CLASS(FSP_MCLASS_CODE_UPDATE, 40),
+ DEF_CLASS(FSP_MCLASS_FETCH_SPDATA, 16),
+ DEF_CLASS(FSP_MCLASS_FETCH_HVDATA, 16),
+ DEF_CLASS(FSP_MCLASS_NVRAM, 16),
+ DEF_CLASS(FSP_MCLASS_MBOX_SURV, 2),
+ DEF_CLASS(FSP_MCLASS_RTC, 16),
+ DEF_CLASS(FSP_MCLASS_SMART_CHIP, 20),
+ DEF_CLASS(FSP_MCLASS_INDICATOR, 180),
+ DEF_CLASS(FSP_MCLASS_HMC_INTFMSG, 16),
+ DEF_CLASS(FSP_MCLASS_HMC_VT, 16),
+ DEF_CLASS(FSP_MCLASS_HMC_BUFFERS, 16),
+ DEF_CLASS(FSP_MCLASS_SHARK, 16),
+ DEF_CLASS(FSP_MCLASS_MEMORY_ERR, 16),
+ DEF_CLASS(FSP_MCLASS_CUOD_EVENT, 16),
+ DEF_CLASS(FSP_MCLASS_HW_MAINT, 16),
+ DEF_CLASS(FSP_MCLASS_VIO, 16),
+ DEF_CLASS(FSP_MCLASS_SRC_MSG, 16),
+ DEF_CLASS(FSP_MCLASS_DATA_COPY, 16),
+ DEF_CLASS(FSP_MCLASS_TONE, 16),
+ DEF_CLASS(FSP_MCLASS_VIRTUAL_NVRAM, 16),
+ DEF_CLASS(FSP_MCLASS_TORRENT, 16),
+ DEF_CLASS(FSP_MCLASS_NODE_PDOWN, 16),
+ DEF_CLASS(FSP_MCLASS_DIAG, 16),
+ DEF_CLASS(FSP_MCLASS_PCIE_LINK_TOPO, 16),
+ DEF_CLASS(FSP_MCLASS_OCC, 16),
+ DEF_CLASS(FSP_MCLASS_TRUSTED_BOOT, 2),
+ DEF_CLASS(FSP_MCLASS_HBRT, 2),
+};
+
+static void fsp_trace_msg(struct fsp_msg *msg, u8 dir __unused)
+{
+ union trace fsp __unused;
+#ifdef FSP_TRACE_MSG
+ size_t len = offsetof(struct trace_fsp_msg, data[msg->dlen]);
+
+ fsp.fsp_msg.dlen = msg->dlen;
+ fsp.fsp_msg.word0 = cpu_to_be32(msg->word0);
+ fsp.fsp_msg.word1 = cpu_to_be32(msg->word1);
+ fsp.fsp_msg.dir = dir;
+ memcpy(fsp.fsp_msg.data, msg->data.bytes, msg->dlen);
+ trace_add(&fsp, TRACE_FSP_MSG, len);
+#endif /* FSP_TRACE_MSG */
+ assert(msg->dlen <= sizeof(fsp.fsp_msg.data));
+}
+
+static struct fsp *fsp_get_active(void)
+{
+ /* XXX Handle transition between FSPs */
+ return active_fsp;
+}
+
+static u64 fsp_get_class_bit(u8 class)
+{
+ /* Alias classes CE and CF as the FSP has a single queue */
+ if (class == FSP_MCLASS_IPL)
+ class = FSP_MCLASS_SERVICE;
+
+ return 1ul << (class - FSP_MCLASS_FIRST);
+}
+
+static struct fsp_cmdclass *__fsp_get_cmdclass(u8 class)
+{
+ struct fsp_cmdclass *ret;
+
+ /* RR class is special */
+ if (class == FSP_MCLASS_RR_EVENT)
+ return &fsp_cmdclass_rr;
+
+ /* Bound check */
+ if (class < FSP_MCLASS_FIRST || class > FSP_MCLASS_LAST)
+ return NULL;
+
+ /* Alias classes CE and CF as the FSP has a single queue */
+ if (class == FSP_MCLASS_IPL)
+ class = FSP_MCLASS_SERVICE;
+
+ ret = &fsp_cmdclass[class - FSP_MCLASS_FIRST];
+
+ /* Unknown class */
+ if (ret->timeout == 0)
+ return NULL;
+
+ return ret;
+}
+
+static struct fsp_cmdclass *fsp_get_cmdclass(struct fsp_msg *msg)
+{
+ u8 c = msg->word0 & 0xff;
+
+ return __fsp_get_cmdclass(c);
+}
+
+static struct fsp_msg *__fsp_allocmsg(void)
+{
+ return zalloc(sizeof(struct fsp_msg));
+}
+
+struct fsp_msg *fsp_allocmsg(bool alloc_response)
+{
+ struct fsp_msg *msg;
+
+ msg = __fsp_allocmsg();
+ if (!msg)
+ return NULL;
+ if (alloc_response) {
+ msg->resp = __fsp_allocmsg();
+ if (!msg->resp) {
+ free(msg);
+ return NULL;
+ }
+ }
+
+ return msg;
+}
+
+void __fsp_freemsg(struct fsp_msg *msg)
+{
+ free(msg);
+}
+
+void fsp_freemsg(struct fsp_msg *msg)
+{
+ if (msg && msg->resp)
+ __fsp_freemsg(msg->resp);
+ __fsp_freemsg(msg);
+}
+
+void fsp_cancelmsg(struct fsp_msg *msg)
+{
+ bool need_unlock = false;
+ struct fsp_cmdclass* cmdclass = fsp_get_cmdclass(msg);
+
+ if (!fsp_in_rr()) {
+ prerror("FSP: Message cancel allowed only when"
+ "FSP is in reset\n");
+ return;
+ }
+
+ if (!cmdclass)
+ return;
+
+ /* Recursive locking */
+ need_unlock = lock_recursive(&fsp_lock);
+
+ list_del(&msg->link);
+ msg->state = fsp_msg_cancelled;
+
+ if (need_unlock)
+ unlock(&fsp_lock);
+}
+
+static void fsp_wreg(struct fsp *fsp, u32 reg, u32 val)
+{
+ struct fsp_iopath *iop;
+
+ if (fsp->active_iopath < 0)
+ return;
+ iop = &fsp->iopath[fsp->active_iopath];
+ if (iop->state == fsp_path_bad)
+ return;
+ out_be32(iop->fsp_regs + reg, val);
+}
+
+static u32 fsp_rreg(struct fsp *fsp, u32 reg)
+{
+ struct fsp_iopath *iop;
+
+ if (fsp->active_iopath < 0)
+ return 0xffffffff;
+ iop = &fsp->iopath[fsp->active_iopath];
+ if (iop->state == fsp_path_bad)
+ return 0xffffffff;
+ return in_be32(iop->fsp_regs + reg);
+}
+
+static void fsp_reg_dump(void)
+{
+#define FSP_DUMP_ONE(x) \
+ prlog(PR_DEBUG, " %20s: %x\n", #x, fsp_rreg(fsp, x));
+
+ struct fsp *fsp = fsp_get_active();
+
+ if (!fsp)
+ return;
+
+ prlog(PR_DEBUG, "FSP #%d: Register dump (state=%d)\n",
+ fsp->index, fsp->state);
+ FSP_DUMP_ONE(FSP_DRCR_REG);
+ FSP_DUMP_ONE(FSP_DISR_REG);
+ FSP_DUMP_ONE(FSP_MBX1_HCTL_REG);
+ FSP_DUMP_ONE(FSP_MBX1_FCTL_REG);
+ FSP_DUMP_ONE(FSP_MBX2_HCTL_REG);
+ FSP_DUMP_ONE(FSP_MBX2_FCTL_REG);
+ FSP_DUMP_ONE(FSP_SDES_REG);
+ FSP_DUMP_ONE(FSP_HDES_REG);
+ FSP_DUMP_ONE(FSP_HDIR_REG);
+ FSP_DUMP_ONE(FSP_HDIM_SET_REG);
+ FSP_DUMP_ONE(FSP_PDIR_REG);
+ FSP_DUMP_ONE(FSP_PDIM_SET_REG);
+ FSP_DUMP_ONE(FSP_SCRATCH0_REG);
+ FSP_DUMP_ONE(FSP_SCRATCH1_REG);
+ FSP_DUMP_ONE(FSP_SCRATCH2_REG);
+ FSP_DUMP_ONE(FSP_SCRATCH3_REG);
+}
+
+static void fsp_notify_rr_state(u32 state)
+{
+ struct fsp_client *client, *next;
+ struct fsp_cmdclass *cmdclass = __fsp_get_cmdclass(FSP_MCLASS_RR_EVENT);
+
+ assert(cmdclass);
+ list_for_each_safe(&cmdclass->clientq, client, next, link)
+ client->message(state, NULL);
+}
+
+static void fsp_reset_cmdclass(void)
+{
+ int i;
+ struct fsp_msg *msg;
+
+ /*
+ * The FSP is in reset and hence we can't expect any response
+ * to outstanding messages that we've already sent. Clear the
+ * bitmap to reflect that.
+ */
+ fsp_cmdclass_resp_bitmask = 0;
+ for (i = 0; i <= (FSP_MCLASS_LAST - FSP_MCLASS_FIRST); i++) {
+ struct fsp_cmdclass *cmdclass = &fsp_cmdclass[i];
+ cmdclass->busy = false;
+ cmdclass->timesent = 0;
+
+ /* Make sure the message queue is empty */
+ while(!list_empty(&cmdclass->msgq)) {
+ msg = list_pop(&cmdclass->msgq, struct fsp_msg,
+ link);
+ list_add_tail(&cmdclass->rr_queue, &msg->link);
+ }
+ }
+}
+
+static bool fsp_in_hir(struct fsp *fsp)
+{
+ switch (fsp->state) {
+ case fsp_mbx_crit_op:
+ case fsp_mbx_prep_for_reset:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static bool fsp_in_reset(struct fsp *fsp)
+{
+ switch (fsp->state) {
+ case fsp_mbx_hir_seq_done: /* FSP reset triggered */
+ case fsp_mbx_err: /* Will be reset soon */
+ case fsp_mbx_rr: /* Mbx activity stopped pending reset */
+ return true;
+ default:
+ return false;
+ }
+}
+
+bool fsp_in_rr(void)
+{
+ struct fsp *fsp = fsp_get_active();
+ struct fsp_iopath *iop;
+
+ if (fsp->active_iopath < 0)
+ return true;
+
+ iop = &fsp->iopath[fsp->active_iopath];
+
+ if (fsp_in_reset(fsp) || fsp_in_hir(fsp) || !(psi_check_link_active(iop->psi)))
+ return true;
+
+ return false;
+}
+
+static bool fsp_hir_state_timeout(void)
+{
+ u64 now = mftb();
+
+ if (tb_compare(now, fsp_hir_timeout) == TB_AAFTERB)
+ return true;
+
+ return false;
+}
+
+static void fsp_set_hir_timeout(u32 seconds)
+{
+ u64 now = mftb();
+ fsp_hir_timeout = now + secs_to_tb(seconds);
+}
+
+static bool fsp_crit_op_in_progress(struct fsp *fsp)
+{
+ u32 disr = fsp_rreg(fsp, FSP_DISR_REG);
+
+ if (disr & FSP_DISR_CRIT_OP_IN_PROGRESS)
+ return true;
+
+ return false;
+}
+
+/* Notify the FSP that it will be reset soon by writing to the DRCR */
+static void fsp_prep_for_reset(struct fsp *fsp)
+{
+ u32 drcr;
+
+ /*
+ * It's possible that the FSP went into reset by itself between the
+ * time the HIR is triggered and we get here. Check and bail out if so.
+ */
+ if (fsp_in_rr())
+ return;
+
+ drcr = fsp_rreg(fsp, FSP_DRCR_REG);
+
+ prlog(PR_TRACE, "FSP: Writing reset to DRCR\n");
+ drcr_last_print = drcr;
+ fsp_wreg(fsp, FSP_DRCR_REG, (drcr | FSP_PREP_FOR_RESET_CMD));
+ fsp->state = fsp_mbx_prep_for_reset;
+ fsp_set_hir_timeout(FSP_DRCR_CLEAR_TIMEOUT);
+}
+
+static void fsp_hir_poll(struct fsp *fsp, struct psi *psi)
+{
+ u32 drcr;
+
+ if (fsp_in_reset(fsp) || !(psi_check_link_active(psi)))
+ return;
+
+ switch (fsp->state) {
+ case fsp_mbx_crit_op:
+ if (fsp_crit_op_in_progress(fsp)) {
+ if (fsp_hir_state_timeout())
+ prerror("FSP: Critical operation timeout\n");
+ /* XXX What to do next? Check with FSP folks */
+ } else {
+ fsp_prep_for_reset(fsp);
+ }
+ break;
+ case fsp_mbx_prep_for_reset:
+ drcr = fsp_rreg(fsp, FSP_DRCR_REG);
+
+ if (drcr != drcr_last_print) {
+ prlog(PR_TRACE, "FSP: DRCR changed, old = %x,"
+ " new = %x\n",
+ drcr_last_print, drcr);
+ drcr_last_print = drcr;
+ }
+
+ if (drcr & FSP_DRCR_ACK_MASK) {
+ if (fsp_hir_state_timeout()) {
+ prerror("FSP: Ack timeout. Triggering reset\n");
+ psi_reset_fsp(psi);
+ fsp->state = fsp_mbx_hir_seq_done;
+ }
+ } else {
+ prlog(PR_TRACE, "FSP: DRCR ack received."
+ " Triggering reset\n");
+ psi_reset_fsp(psi);
+ fsp->state = fsp_mbx_hir_seq_done;
+ }
+ break;
+ default:
+ break;
+ }
+}
+
+/*
+ * This is the main entry for the host initiated reset case.
+ * This gets called when:
+ * a. Surveillance ack is not received in 120 seconds
+ * b. A mailbox command doesn't get a response within the stipulated time.
+ */
+static void __fsp_trigger_reset(void)
+{
+ struct fsp *fsp = fsp_get_active();
+ u32 disr;
+
+ /* Already in one of the error processing states */
+ if (fsp_in_hir(fsp) || fsp_in_reset(fsp))
+ return;
+
+ prerror("FSP: fsp_trigger_reset() entry\n");
+
+ drcr_last_print = 0;
+ /*
+ * Check if we are allowed to reset the FSP. We aren't allowed to
+ * reset the FSP if the FSP_DISR_DBG_IN_PROGRESS is set.
+ */
+ disr = fsp_rreg(fsp, FSP_DISR_REG);
+ if (disr & FSP_DISR_DBG_IN_PROGRESS) {
+ prerror("FSP: Host initiated reset disabled\n");
+ return;
+ }
+
+ /*
+ * Check if some critical operation is in progress as indicated
+ * by FSP_DISR_CRIT_OP_IN_PROGRESS. Timeout is 128 seconds
+ */
+ if (fsp_crit_op_in_progress(fsp)) {
+ prlog(PR_NOTICE, "FSP: Critical operation in progress\n");
+ fsp->state = fsp_mbx_crit_op;
+ fsp_set_hir_timeout(FSP_CRITICAL_OP_TIMEOUT);
+ } else
+ fsp_prep_for_reset(fsp);
+}
+
+static uint32_t fsp_hir_reason_plid;
+
+void fsp_trigger_reset(uint32_t plid)
+{
+ lock(&fsp_lock);
+ fsp_hir_reason_plid = plid;
+ __fsp_trigger_reset();
+ unlock(&fsp_lock);
+}
+
+/*
+ * Called when we trigger a HIR or when the FSP tells us via the DISR's
+ * RR bit that one is impending. We should therefore stop all mbox activity.
+ */
+static void fsp_start_rr(struct fsp *fsp)
+{
+ struct fsp_iopath *iop;
+
+ if (fsp->state == fsp_mbx_rr)
+ return;
+
+ /* We no longer have an active path on that FSP */
+ if (fsp->active_iopath >= 0) {
+ iop = &fsp->iopath[fsp->active_iopath];
+ iop->state = fsp_path_bad;
+ fsp->active_iopath = -1;
+ }
+ fsp->state = fsp_mbx_rr;
+ disr_last_print = 0;
+ hstate_last_print = 0;
+
+ /*
+ * Mark all command classes as non-busy and clear their
+ * timeout, then flush all messages in our staging queue
+ */
+ fsp_reset_cmdclass();
+
+ /* Notify clients. We have to drop the lock here */
+ unlock(&fsp_lock);
+ fsp_notify_rr_state(FSP_RESET_START);
+ lock(&fsp_lock);
+
+ /*
+ * Unlike earlier, we don't trigger the PSI link polling
+ * from this point. We wait for the PSI interrupt to tell
+ * us the FSP is really down and then start the polling there.
+ */
+}
+
+/*
+ * Called on normal/quick shutdown to give up the PSI link
+ */
+void fsp_reset_links(void)
+{
+ struct fsp *fsp = fsp_get_active();
+ struct fsp_iopath *iop;
+
+ if (!fsp)
+ return;
+
+ /* Already in one of the error states? */
+ if (fsp_in_hir(fsp) || fsp_in_reset(fsp))
+ return;
+
+ iop = &fsp->iopath[fsp->active_iopath];
+ prlog(PR_NOTICE, "FSP #%d: Host initiated shutdown."
+ " Giving up the PSI link\n", fsp->index);
+ psi_disable_link(iop->psi);
+ return;
+}
+
+static void fsp_trace_event(struct fsp *fsp, u32 evt,
+ u32 data0, u32 data1, u32 data2, u32 data3)
+{
+ union trace tfsp __unused;
+#ifdef FSP_TRACE_EVENT
+ size_t len = sizeof(struct trace_fsp_event);
+
+ tfsp.fsp_evt.event = cpu_to_be16(evt);
+ tfsp.fsp_evt.fsp_state = cpu_to_be16(fsp->state);
+ tfsp.fsp_evt.data[0] = cpu_to_be32(data0);
+ tfsp.fsp_evt.data[1] = cpu_to_be32(data1);
+ tfsp.fsp_evt.data[2] = cpu_to_be32(data2);
+ tfsp.fsp_evt.data[3] = cpu_to_be32(data3);
+ trace_add(&tfsp, TRACE_FSP_EVENT, len);
+#endif /* FSP_TRACE_EVENT */
+}
+
+static void fsp_handle_errors(struct fsp *fsp)
+{
+ u32 hstate;
+ struct fsp_iopath *iop;
+ struct psi *psi;
+ u32 disr;
+
+ if (fsp->active_iopath < 0) {
+ prerror("FSP #%d: fsp_handle_errors() with no active IOP\n",
+ fsp->index);
+ return;
+ }
+
+ iop = &fsp->iopath[fsp->active_iopath];
+ if (!iop->psi) {
+ prerror("FSP: Active IOP with no PSI link !\n");
+ return;
+ }
+ psi = iop->psi;
+
+ /*
+ * If the link is not up, start R&R immediately, we do call
+ * psi_disable_link() in this case as while the link might
+ * not be up, it might still be enabled and the PSI layer
+ * "active" bit still set
+ */
+ if (!psi_check_link_active(psi)) {
+ /* Start R&R process */
+ fsp_trace_event(fsp, TRACE_FSP_EVT_LINK_DOWN, 0, 0, 0, 0);
+ prerror("FSP #%d: Link down, starting R&R\n", fsp->index);
+
+ fsp_start_rr(fsp);
+ return;
+ }
+
+ /* Link is up, check for other conditions */
+ disr = fsp_rreg(fsp, FSP_DISR_REG);
+
+ /* If in R&R, log values */
+ if (disr != disr_last_print) {
+ fsp_trace_event(fsp, TRACE_FSP_EVT_DISR_CHG, disr, 0, 0, 0);
+
+ prlog(PR_TRACE, "FSP #%d: DISR stat change = 0x%08x\n",
+ fsp->index, disr);
+ disr_last_print = disr;
+ }
+
+ /* On a deferred mbox error, trigger a HIR
+ * Note: We may never get here since the link inactive case is handled
+ * above and the other case is when the iop->psi is NULL, which is
+ * quite rare.
+ */
+ if (fsp->state == fsp_mbx_err) {
+ uint32_t plid;
+ plid = log_simple_error(&e_info(OPAL_RC_FSP_MBOX_ERR),
+ "FSP #%d: Triggering HIR on mbx_err\n",
+ fsp->index);
+ fsp_trigger_reset(plid);
+ return;
+ }
+
+ /*
+ * If we get here as part of normal flow, the FSP is telling
+ * us that there will be an impending R&R, so we stop all mbox
+ * activity. The actual link down trigger is via a PSI
+ * interrupt that may arrive in due course.
+ */
+ if (disr & FSP_DISR_FSP_IN_RR) {
+ /*
+ * If we get here with DEBUG_IN_PROGRESS also set, the
+ * FSP is in debug and we should *not* reset it now
+ */
+ if (disr & FSP_DISR_DBG_IN_PROGRESS)
+ return;
+
+ /*
+ * When Linux comes back up, we still see that bit set for a
+ * while, so just move on, nothing to see here
+ */
+ if (fsp->state == fsp_mbx_rr)
+ return;
+
+ if (fsp_dpo_pending) {
+ /*
+ * If we are about to process a reset when DPO
+ * is pending, it's possible that the host has
+ * gone down, and OPAL is on its way down and
+ * hence will not see the subsequent PSI interrupt.
+ * So, just give up the link here.
+ */
+ prlog(PR_NOTICE, "FSP #%d: FSP reset with DPO pending."
+ " Giving up PSI link\n",
+ fsp->index);
+ psi_disable_link(psi);
+ } else {
+ prlog(PR_NOTICE, "FSP #%d: FSP in Reset."
+ " Waiting for PSI interrupt\n",
+ fsp->index);
+ }
+ fsp_start_rr(fsp);
+ }
+
+ /*
+ * However, if any of the Unit Check, Runtime Terminated or
+ * Flash Terminated bits is also set, the FSP is asking us
+ * to trigger a HIR so it can try to recover via the DRCR route.
+ */
+ if (disr & FSP_DISR_HIR_TRIGGER_MASK) {
+ const char *reason = "Unknown FSP_DISR_HIR_TRIGGER";
+ uint32_t plid;
+ fsp_trace_event(fsp, TRACE_FSP_EVT_SOFT_RR, disr, 0, 0, 0);
+
+ if (disr & FSP_DISR_FSP_UNIT_CHECK)
+ reason = "DISR Unit Check set";
+ else if (disr & FSP_DISR_FSP_RUNTIME_TERM)
+ reason = "DISR Runtime Terminate set";
+ else if (disr & FSP_DISR_FSP_FLASH_TERM)
+ reason = "DISR Flash Terminate set";
+
+ plid = log_simple_error(&e_info(OPAL_RC_FSP_DISR_HIR_MASK),
+ "FSP: %s. Triggering host initiated "
+ "reset.", reason);
+
+ /* Clear all interrupt conditions */
+ fsp_wreg(fsp, FSP_HDIR_REG, FSP_DBIRQ_ALL);
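+ /* If a response was received, the return code is the status byte carried in its word1 */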
+
+ /* Make sure this happened */
+ fsp_rreg(fsp, FSP_HDIR_REG);
+
+ fsp_trigger_reset(plid);
+ return;
+ }
+
+ /*
+ * If we detect an R&R complete indication, acknowledge it
+ */
+ if (disr & FSP_DISR_FSP_RR_COMPLETE) {
+ /*
+ * Acking this bit doesn't make it go away immediately, so
+ * only do it while still in R&R state
+ */
+ if (fsp->state == fsp_mbx_rr) {
+ fsp_trace_event(fsp, TRACE_FSP_EVT_RR_COMPL, 0,0,0,0);
+
+ prlog(PR_NOTICE, "FSP #%d: Detected R&R complete,"
+ " acking\n", fsp->index);
+
+ /* Clear HDATA area */
+ fsp_wreg(fsp, FSP_MBX1_HDATA_AREA, 0xff);
+
+ /* Ack it (XDN) and clear HPEND & counts */
+ fsp_wreg(fsp, FSP_MBX1_HCTL_REG,
+ FSP_MBX_CTL_PTS |
+ FSP_MBX_CTL_XDN |
+ FSP_MBX_CTL_HPEND |
+ FSP_MBX_CTL_HCSP_MASK |
+ FSP_MBX_CTL_DCSP_MASK);
+
+ /*
+ * Mark the mbox as usable again so we can process
+ * incoming messages
+ */
+ fsp->state = fsp_mbx_idle;
+
+ /* Also clear R&R complete bit in DISR */
+ fsp_wreg(fsp, FSP_DISR_REG, FSP_DISR_FSP_RR_COMPLETE);
+
+ psi_enable_fsp_interrupt(psi);
+ }
+ }
+
+ /*
+ * XXX
+ *
+ * Here we detect a number of errors, should we initiate
+ * an R&R ?
+ */
+
+ hstate = fsp_rreg(fsp, FSP_HDES_REG);
+ if (hstate != hstate_last_print) {
+ fsp_trace_event(fsp, TRACE_FSP_EVT_HDES_CHG, hstate, 0, 0, 0);
+
+ prlog(PR_DEBUG, "FSP #%d: HDES stat change = 0x%08x\n",
+ fsp->index, hstate);
+ hstate_last_print = hstate;
+ }
+
+ if (hstate == 0xffffffff)
+ return;
+
+ /* Clear errors */
+ fsp_wreg(fsp, FSP_HDES_REG, FSP_DBERRSTAT_CLR1);
+
+ /*
+ * Most of those errors shouldn't have happened, we just clear
+ * the error state and return. In the long run, we might want
+ * to start retrying commands, switching FSPs or links, etc...
+ *
+ * We currently don't set our mailbox to a permanent error state.
+ */
+ if (hstate & FSP_DBERRSTAT_ILLEGAL1)
+ prerror("FSP #%d: Illegal command error !\n", fsp->index);
+
+ if (hstate & FSP_DBERRSTAT_WFULL1)
+ prerror("FSP #%d: Write to a full mbox !\n", fsp->index);
+
+ if (hstate & FSP_DBERRSTAT_REMPTY1)
+ prerror("FSP #%d: Read from an empty mbox !\n", fsp->index);
+
+ if (hstate & FSP_DBERRSTAT_PAR1)
+ prerror("FSP #%d: Parity error !\n", fsp->index);
+}
+
+/*
+ * This is called by fsp_post_msg() to check if the mbox
+ * is in a state that allows sending of a message
+ *
+ * Due to the various "interesting" contexts fsp_post_msg()
+ * can be called from, including recursive locks from lock
+ * error messages or console code, this should avoid doing
+ * anything more complex than checking a bit of state.
+ *
+ * Specifically, we cannot initiate an R&R and call back into
+ * clients etc... from this function.
+ *
+ * The best we can do is to set the mbox in an error state and
+ * handle it later during a poll or an interrupt.
+ */
+static bool fsp_check_can_send(struct fsp *fsp)
+{
+ struct fsp_iopath *iop;
+ struct psi *psi;
+
+ /* Look for FSP in non-idle state */
+ if (fsp->state != fsp_mbx_idle)
+ return false;
+
+ /* Look for an active IO path */
+ if (fsp->active_iopath < 0)
+ goto mbox_error;
+ iop = &fsp->iopath[fsp->active_iopath];
+ if (!iop->psi) {
+ prerror("FSP: Active IOP with no PSI link !\n");
+ goto mbox_error;
+ }
+ psi = iop->psi;
+
+ /* Check if link has gone down. This will be handled later */
+ if (!psi_check_link_active(psi)) {
+ prerror("FSP #%d: Link seems to be down on send\n", fsp->index);
+ goto mbox_error;
+ }
+
+ /* XXX Do we want to check for other error conditions ? */
+ return true;
+
+ /*
+ * An error of some kind occurred, we'll handle it later
+ * from a more normal "poll" context
+ */
+ mbox_error:
+ fsp->state = fsp_mbx_err;
+ return false;
+}
+
+static bool fsp_post_msg(struct fsp *fsp, struct fsp_msg *msg)
+{
+ u32 ctl, reg;
+ int i, wlen;
+
+ prlog(PR_INSANE, "FSP #%d: fsp_post_msg (w0: 0x%08x w1: 0x%08x)\n",
+ fsp->index, msg->word0, msg->word1);
+
+ /* Note: We used to read HCTL here and only modify some of
+ * the bits in it. This was bogus, because we would write back
+ * the incoming bits as '1' and clear them, causing fsp_poll()
+ * to then miss them. Let's just start with 0, which is how
+ * I suppose the HW intends us to use it.
+ */
+
+ /* Set ourselves as busy */
+ fsp->pending = msg;
+ fsp->state = fsp_mbx_send;
+ msg->state = fsp_msg_sent;
+
+ /* We trace after setting the mailbox state so that if the
+ * tracing recurses, it ends up just queuing the message up
+ */
+ fsp_trace_msg(msg, TRACE_FSP_MSG_OUT);
+
+ /* Build the message in the mailbox */
+ reg = FSP_MBX1_HDATA_AREA;
+ fsp_wreg(fsp, reg, msg->word0); reg += 4;
+ fsp_wreg(fsp, reg, msg->word1); reg += 4;
+ wlen = (msg->dlen + 3) >> 2;
+ for (i = 0; i < wlen; i++) {
+ fsp_wreg(fsp, reg, fsp_msg_get_data_word(msg, i));
+ reg += 4;
+ }
+
+ /* Write the header */
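+ /* The +8 accounts for the two header words (word0/word1) preceding the data */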
+ fsp_wreg(fsp, FSP_MBX1_HHDR0_REG, (msg->dlen + 8) << 16);
+
+ /* Write the control register */
+ ctl = 4 << FSP_MBX_CTL_HCHOST_SHIFT;
+ ctl |= (msg->dlen + 8) << FSP_MBX_CTL_DCHOST_SHIFT;
+ ctl |= FSP_MBX_CTL_PTS | FSP_MBX_CTL_SPPEND;
+ prlog(PR_INSANE, " new ctl: %08x\n", ctl);
+ fsp_wreg(fsp, FSP_MBX1_HCTL_REG, ctl);
+
+ return true;
+}
+
+static void fsp_poke_queue(struct fsp_cmdclass *cmdclass)
+{
+ struct fsp *fsp = fsp_get_active();
+ struct fsp_msg *msg;
+
+ if (!fsp)
+ return;
+ if (!fsp_check_can_send(fsp))
+ return;
+
+ /* From here to the point where fsp_post_msg() sets fsp->state
+ * to !idle we must not cause any re-entrancy (no debug or trace)
+ * in a code path that may hit fsp_post_msg() (it's ok to do so
+ * if we are going to bail out), as we are committed to calling
+ * fsp_post_msg() and so a re-entrancy could cause us to do a
+ * double-send into the mailbox.
+ */
+ if (cmdclass->busy || list_empty(&cmdclass->msgq))
+ return;
+
+ msg = list_top(&cmdclass->msgq, struct fsp_msg, link);
+ assert(msg);
+ cmdclass->busy = true;
+
+ if (!fsp_post_msg(fsp, msg)) {
+ prerror("FSP #%d: Failed to send message\n", fsp->index);
+ cmdclass->busy = false;
+ return;
+ }
+}
+
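+/*
+ * cmd_sub_mod packs the command class in bits 23:16, the sub-command in
+ * bits 15:8 and the modifier in bits 7:0; bit 24 flags that a response
+ * is expected for the message.
+ */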
+static void __fsp_fillmsg(struct fsp_msg *msg, u32 cmd_sub_mod,
+ u8 add_words, va_list list)
+{
+ bool response = !!(cmd_sub_mod & 0x1000000);
+ u8 cmd = (cmd_sub_mod >> 16) & 0xff;
+ u8 sub = (cmd_sub_mod >> 8) & 0xff;
+ u8 mod = cmd_sub_mod & 0xff;
+ int i;
+
+ msg->word0 = cmd & 0xff;
+ msg->word1 = mod << 8 | sub;
+ msg->response = response;
+ msg->dlen = add_words << 2;
+
+ for (i = 0; i < add_words; i++)
+ fsp_msg_set_data_word(msg, i, va_arg(list, unsigned int));
+}
+
+void fsp_fillmsg(struct fsp_msg *msg, u32 cmd_sub_mod, u32 add_words, ...)
+{
+ va_list list;
+
+ va_start(list, add_words);
+ __fsp_fillmsg(msg, cmd_sub_mod, add_words, list);
+ va_end(list);
+}
+
+struct fsp_msg *fsp_mkmsg(u32 cmd_sub_mod, u32 add_words, ...)
+{
+ struct fsp_msg *msg = fsp_allocmsg(!!(cmd_sub_mod & 0x1000000));
+ va_list list;
+
+ if (!msg) {
+ prerror("FSP: Failed to allocate struct fsp_msg\n");
+ return NULL;
+ }
+
+ va_start(list, add_words);
+ __fsp_fillmsg(msg, cmd_sub_mod, add_words, list);
+ va_end(list);
+
+ return msg;
+}
+
+/*
+ * IMPORTANT NOTE: This is *guaranteed* to not call the completion
+ * routine recursively for *any* fsp message, either the
+ * queued one or a previous one. Thus it is *ok* to call
+ * this function with a lock held which will itself be
+ * taken by the completion function.
+ *
+ * Any change to this implementation must respect this
+ * rule. This will be especially true of things like
+ * reset/reload and error handling, if we fail to queue
+ * we must just return an error, not call any completion
+ * from the scope of fsp_queue_msg().
+ */
+int fsp_queue_msg(struct fsp_msg *msg, void (*comp)(struct fsp_msg *msg))
+{
+ struct fsp_cmdclass *cmdclass;
+ struct fsp *fsp = fsp_get_active();
+ bool need_unlock;
+ u16 seq;
+ int rc = 0;
+
+ if (!fsp || !msg)
+ return -1;
+
+ /* Recursive locking */
+ need_unlock = lock_recursive(&fsp_lock);
+
+ /* Grab a new sequence number */
+ seq = fsp_curseq;
+ fsp_curseq = fsp_curseq + 1;
+ if (fsp_curseq == 0)
+ fsp_curseq = 0x8000;
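+ /* When the counter wraps to 0 it is reset to 0x8000, keeping the sequence number non-zero */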
+ msg->word0 = (msg->word0 & 0xffff) | seq << 16;
+
+ /* Set completion */
+ msg->complete = comp;
+
+ /* Clear response state */
+ if (msg->resp)
+ msg->resp->state = fsp_msg_unused;
+
+ /* Queue the message in the appropriate queue */
+ cmdclass = fsp_get_cmdclass(msg);
+ if (!cmdclass) {
+ prerror("FSP: Invalid msg in fsp_queue_msg w0/1=0x%08x/%08x\n",
+ msg->word0, msg->word1);
+ rc = -1;
+ goto unlock;
+ }
+
+ msg->state = fsp_msg_queued;
+
+ /*
+ * If we have initiated or are about to initiate a reset/reload operation,
+ * we stash the message on the R&R backup queue. Otherwise, queue it
+ * normally and poke the HW
+ */
+ if (fsp_in_hir(fsp) || fsp_in_reset(fsp))
+ list_add_tail(&cmdclass->rr_queue, &msg->link);
+ else {
+ list_add_tail(&cmdclass->msgq, &msg->link);
+ fsp_poke_queue(cmdclass);
+ }
+
+ unlock:
+ if (need_unlock)
+ unlock(&fsp_lock);
+
+ return rc;
+}
+
+/* WARNING: This will drop the FSP lock !!! */
+static void fsp_complete_msg(struct fsp_msg *msg)
+{
+ struct fsp_cmdclass *cmdclass = fsp_get_cmdclass(msg);
+ void (*comp)(struct fsp_msg *msg);
+
+ assert(cmdclass);
+
+ prlog(PR_INSANE, " completing msg, word0: 0x%08x\n", msg->word0);
+
+ comp = msg->complete;
+ list_del_from(&cmdclass->msgq, &msg->link);
+ cmdclass->busy = false;
+ msg->state = fsp_msg_done;
+
+ unlock(&fsp_lock);
+ if (comp)
+ (*comp)(msg);
+ lock(&fsp_lock);
+}
+
+/* WARNING: This will drop the FSP lock !!! */
+static void fsp_complete_send(struct fsp *fsp)
+{
+ struct fsp_msg *msg = fsp->pending;
+ struct fsp_cmdclass *cmdclass = fsp_get_cmdclass(msg);
+
+ assert(msg);
+ assert(cmdclass);
+
+ fsp->pending = NULL;
+
+ prlog(PR_INSANE, " completing send, word0: 0x%08x, resp: %d\n",
+ msg->word0, msg->response);
+
+ if (msg->response) {
+ u64 setbit = fsp_get_class_bit(msg->word0 & 0xff);
+ msg->state = fsp_msg_wresp;
+ fsp_cmdclass_resp_bitmask |= setbit;
+ cmdclass->timesent = mftb();
+ } else
+ fsp_complete_msg(msg);
+}
+
+static void fsp_alloc_inbound(struct fsp_msg *msg)
+{
+ u16 func_id = fsp_msg_get_data_word(msg, 0) & 0xffff;
+ u32 len = fsp_msg_get_data_word(msg, 1);
+ u32 tce_token = 0, act_len = 0;
+ u8 rc = 0;
+ void *buf;
+ struct fsp_msg *resp;
+
+ prlog(PR_DEBUG, "FSP: Allocate inbound buffer func: %04x len: %d\n",
+ func_id, len);
+
+ lock(&fsp_lock);
+ if ((fsp_inbound_off + len) > FSP_INBOUND_SIZE) {
+ prerror("FSP: Out of space in buffer area !\n");
+ rc = 0xeb;
+ goto reply;
+ }
+
+ if (!fsp_inbound_buf) {
+ fsp_inbound_buf = memalign(TCE_PSIZE, FSP_INBOUND_SIZE);
+ if (!fsp_inbound_buf) {
+ prerror("FSP: could not allocate fsp_inbound_buf!\n");
+ rc = 0xeb;
+ goto reply;
+ }
+ }
+
+ buf = fsp_inbound_buf + fsp_inbound_off;
+ tce_token = PSI_DMA_INBOUND_BUF + fsp_inbound_off;
+ len = (len + TCE_MASK) & ~TCE_MASK;
+ fsp_inbound_off += len;
+ fsp_tce_map(tce_token, buf, len);
+ prlog(PR_DEBUG, "FSP: -> buffer at 0x%p, TCE: 0x%08x, alen: 0x%x\n",
+ buf, tce_token, len);
+ act_len = len;
+
+ reply:
+ unlock(&fsp_lock);
+
+ resp = fsp_mkmsg(FSP_RSP_ALLOC_INBOUND | rc, 3, 0, tce_token, act_len);
+ if (!resp) {
+ prerror("FSP: response message allocation failed\n");
+ return;
+ }
+ if (fsp_queue_msg(resp, fsp_freemsg)) {
+ fsp_freemsg(resp);
+ prerror("FSP: Failed to queue response message\n");
+ return;
+ }
+}
+
+void *fsp_inbound_buf_from_tce(u32 tce_token)
+{
+ u32 offset = tce_token - PSI_DMA_INBOUND_BUF;
+
+ if (tce_token < PSI_DMA_INBOUND_BUF || offset >= fsp_inbound_off) {
+ prerror("FSP: TCE token 0x%x out of bounds\n", tce_token);
+ return NULL;
+ }
+ return fsp_inbound_buf + offset;
+}
+
+static void fsp_repost_queued_msgs_post_rr(void)
+{
+ struct fsp_msg *msg;
+ int i;
+
+ for (i = 0; i <= (FSP_MCLASS_LAST - FSP_MCLASS_FIRST); i++) {
+ struct fsp_cmdclass *cmdclass = &fsp_cmdclass[i];
+ bool poke = false;
+
+ while(!list_empty(&cmdclass->rr_queue)) {
+ msg = list_pop(&cmdclass->rr_queue,
+ struct fsp_msg, link);
+ list_add_tail(&cmdclass->msgq, &msg->link);
+ poke = true;
+ }
+ if (poke)
+ fsp_poke_queue(cmdclass);
+ }
+}
+
+static bool fsp_local_command(u32 cmd_sub_mod, struct fsp_msg *msg)
+{
+ u32 cmd = 0;
+ u32 rsp_data = 0;
+ struct fsp_msg *resp;
+
+ switch(cmd_sub_mod) {
+ case FSP_CMD_CONTINUE_IPL:
+ /* We get a CONTINUE_IPL as a response to OPL */
+ prlog(PR_NOTICE, "FSP: Got CONTINUE_IPL !\n");
+ ipl_state |= ipl_got_continue;
+ return true;
+
+ case FSP_CMD_HV_STATE_CHG:
+ prlog(PR_NOTICE, "FSP: Got HV state change request to %d\n",
+ msg->data.bytes[0]);
+
+ /* Send response synchronously for now, we might want to
+ * deal with that sort of stuff asynchronously if/when
+ * we add support for auto-freeing of messages
+ */
+ resp = fsp_mkmsg(FSP_RSP_HV_STATE_CHG, 0);
+ if (!resp)
+ prerror("FSP: Failed to allocate HV state response\n");
+ else {
+ if (fsp_queue_msg(resp, fsp_freemsg)) {
+ fsp_freemsg(resp);
+ prerror("FSP: Failed to queue HV state resp\n");
+ }
+ }
+ return true;
+
+ case FSP_CMD_SP_NEW_ROLE:
+ /* FSP is assuming a new role */
+ prlog(PR_INFO, "FSP: FSP assuming new role\n");
+ resp = fsp_mkmsg(FSP_RSP_SP_NEW_ROLE, 0);
+ if (!resp)
+ prerror("FSP: Failed to allocate SP role response\n");
+ else {
+ if (fsp_queue_msg(resp, fsp_freemsg)) {
+ fsp_freemsg(resp);
+ prerror("FSP: Failed to queue SP role resp\n");
+ }
+ }
+ ipl_state |= ipl_got_new_role;
+ return true;
+
+ case FSP_CMD_SP_QUERY_CAPS:
+ prlog(PR_INFO, "FSP: FSP query capabilities\n");
+ /* XXX Do something saner. For now do a synchronous
+ * response and hard code our capabilities
+ */
+ resp = fsp_mkmsg(FSP_RSP_SP_QUERY_CAPS, 4, 0x3ff80000, 0, 0, 0);
+ if (!resp)
+ prerror("FSP: Failed to allocate CAPS response\n");
+ else {
+ if (fsp_queue_msg(resp, fsp_freemsg)) {
+ fsp_freemsg(resp);
+ prerror("FSP: Failed to queue CAPS resp\n");
+ }
+ }
+ ipl_state |= ipl_got_caps;
+ return true;
+ case FSP_CMD_FSP_FUNCTNAL:
+ prlog(PR_INFO, "FSP: Got FSP Functional\n");
+ ipl_state |= ipl_got_fsp_functional;
+ return true;
+ case FSP_CMD_ALLOC_INBOUND:
+ fsp_alloc_inbound(msg);
+ return true;
+ case FSP_CMD_SP_RELOAD_COMP:
+ if (msg->data.bytes[3] & PPC_BIT8(0)) {
+ fsp_fips_dump_notify(fsp_msg_get_data_word(msg, 1),
+ fsp_msg_get_data_word(msg, 2));
+
+ if (msg->data.bytes[3] & PPC_BIT8(1))
+ prlog(PR_DEBUG, " PLID is %x\n",
+ fsp_msg_get_data_word(msg, 3));
+ }
+ if (msg->data.bytes[3] & PPC_BIT8(2)) {
+ prlog(PR_INFO, "FSP: SP Reset/Reload was NOT done\n");
+ } else {
+ prlog(PR_INFO, "FSP: SP says Reset/Reload complete\n");
+ /* Notify clients that the FSP is back up */
+ fsp_notify_rr_state(FSP_RELOAD_COMPLETE);
+ fsp_repost_queued_msgs_post_rr();
+ }
+ return true;
+ case FSP_CMD_CLOSE_HMC_INTF:
+ /* Close the HMC interface */
+ /* Though Sapphire does not support an HMC connection, the FSP
+ * sends this message when it is trying to open any new
+ * hypervisor session, so we return error 0x51.
+ */
+ cmd = FSP_RSP_CLOSE_HMC_INTF | FSP_STAUS_INVALID_HMC_ID;
+ rsp_data = msg->data.bytes[0] << 24 | msg->data.bytes[1] << 16;
+ rsp_data &= 0xffff0000;
+ resp = fsp_mkmsg(cmd, 1, rsp_data);
+ if (!resp)
+ prerror("FSP: Failed to allocate HMC close response\n");
+ else {
+ if (fsp_queue_msg(resp, fsp_freemsg)) {
+ fsp_freemsg(resp);
+ prerror("FSP: Failed to queue HMC close resp\n");
+ }
+ }
+ return true;
+ case FSP_CMD_GET_HIR_PLID:
+ /* Get Platform Log Id with reason for Host Initiated Reset */
+ prlog(PR_DEBUG, "FSP: Sending PLID 0x%x as HIR reason\n",
+ fsp_hir_reason_plid);
+ resp = fsp_mkmsg(FSP_RSP_GET_HIR_PLID, 1, fsp_hir_reason_plid);
+ if (!resp)
+ prerror("FSP: Failed to allocate GET_HIR_PLID response\n");
+ else {
+ if (fsp_queue_msg(resp, fsp_freemsg)) {
+ fsp_freemsg(resp);
+ prerror("FSP: Failed to queue GET_HIR_PLID resp\n");
+ }
+ }
+ fsp_hir_reason_plid = 0;
+ return true;
+ }
+ return false;
+}
+
+
+/* This is called without the FSP lock */
+static void fsp_handle_command(struct fsp_msg *msg)
+{
+ struct fsp_cmdclass *cmdclass = fsp_get_cmdclass(msg);
+ struct fsp_client *client, *next;
+ struct fsp_msg *resp;
+ u32 cmd_sub_mod;
+
+ if (!cmdclass) {
+ prerror("FSP: Got message for unknown class %x\n",
+ msg->word0 & 0xff);
+ goto free;
+ }
+
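+ /* Rebuild cmd_sub_mod: command class from word0, sub-command from the
+ * low byte of word1 and modifier from its high byte
+ */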
+ cmd_sub_mod = (msg->word0 & 0xff) << 16;
+ cmd_sub_mod |= (msg->word1 & 0xff) << 8;
+ cmd_sub_mod |= (msg->word1 >> 8) & 0xff;
+
+ /* Some commands are handled locally */
+ if (fsp_local_command(cmd_sub_mod, msg))
+ goto free;
+
+ /* The rest go to clients */
+ list_for_each_safe(&cmdclass->clientq, client, next, link) {
+ if (client->message(cmd_sub_mod, msg))
+ goto free;
+ }
+
+ prerror("FSP: Unhandled message %06x\n", cmd_sub_mod);
+
+ /* We don't know whether the message expected some kind of
+ * response, so we send one anyway
+ */
+ resp = fsp_mkmsg((cmd_sub_mod & 0xffff00) | 0x008020, 0);
+ if (!resp)
+ prerror("FSP: Failed to allocate default response\n");
+ else {
+ if (fsp_queue_msg(resp, fsp_freemsg)) {
+ fsp_freemsg(resp);
+ prerror("FSP: Failed to queue default response\n");
+ }
+ }
+
+ free:
+ fsp_freemsg(msg);
+}
+
+static void __fsp_fill_incoming(struct fsp *fsp, struct fsp_msg *msg,
+ int dlen, u32 w0, u32 w1)
+{
+ unsigned int wlen, i, reg;
+
+ msg->dlen = dlen - 8;
+ msg->word0 = w0;
+ msg->word1 = w1;
+ wlen = (dlen + 3) >> 2;
+ reg = FSP_MBX1_FDATA_AREA + 8;
+ for (i = 0; i < wlen; i++) {
+ fsp_msg_set_data_word(msg, i, fsp_rreg(fsp, reg));
+ reg += 4;
+ }
+
+ /* Ack it (XDN) and clear HPEND & counts */
+ fsp_wreg(fsp, FSP_MBX1_HCTL_REG,
+ FSP_MBX_CTL_PTS |
+ FSP_MBX_CTL_XDN |
+ FSP_MBX_CTL_HPEND |
+ FSP_MBX_CTL_HCSP_MASK |
+ FSP_MBX_CTL_DCSP_MASK);
+
+ fsp_trace_msg(msg, TRACE_FSP_MSG_IN);
+}
+
+static void __fsp_drop_incoming(struct fsp *fsp)
+{
+ /* Ack it (XDN) and clear HPEND & counts */
+ fsp_wreg(fsp, FSP_MBX1_HCTL_REG,
+ FSP_MBX_CTL_PTS |
+ FSP_MBX_CTL_XDN |
+ FSP_MBX_CTL_HPEND |
+ FSP_MBX_CTL_HCSP_MASK |
+ FSP_MBX_CTL_DCSP_MASK);
+}
+
+/* WARNING: This will drop the FSP lock */
+static void fsp_handle_incoming(struct fsp *fsp)
+{
+ struct fsp_msg *msg;
+ u32 h0, w0, w1;
+ unsigned int dlen;
+ bool special_response = false;
+
+ h0 = fsp_rreg(fsp, FSP_MBX1_FHDR0_REG);
+ dlen = (h0 >> 16) & 0xff;
+
+ w0 = fsp_rreg(fsp, FSP_MBX1_FDATA_AREA);
+ w1 = fsp_rreg(fsp, FSP_MBX1_FDATA_AREA + 4);
+
+ prlog(PR_INSANE, " Incoming: w0: 0x%08x, w1: 0x%08x, dlen: %d\n",
+ w0, w1, dlen);
+
+ /* Some responses are expected out of band */
+ if ((w0 & 0xff) == FSP_MCLASS_HMC_INTFMSG &&
+ ((w1 & 0xff) == 0x8a || ((w1 & 0xff) == 0x8b)))
+ special_response = true;
+
+ /* Check for response bit */
+ if (w1 & 0x80 && !special_response) {
+ struct fsp_cmdclass *cmdclass = __fsp_get_cmdclass(w0 & 0xff);
+ struct fsp_msg *req;
+
+ if (!cmdclass) {
+ prerror("FSP: Got response for unknown class %x\n",
+ w0 & 0xff);
+ __fsp_drop_incoming(fsp);
+ return;
+ }
+
+ if (!cmdclass->busy || list_empty(&cmdclass->msgq)) {
+ prerror("FSP #%d: Got orphan response! w0 = 0x%08x w1 = 0x%08x\n",
+ fsp->index, w0, w1);
+ __fsp_drop_incoming(fsp);
+ return;
+ }
+ req = list_top(&cmdclass->msgq, struct fsp_msg, link);
+
+ /* Check if the response seems to match the message */
+ if (req->state != fsp_msg_wresp ||
+ (req->word0 & 0xff) != (w0 & 0xff) ||
+ (req->word1 & 0xff) != (w1 & 0x7f)) {
+ __fsp_drop_incoming(fsp);
+ prerror("FSP #%d: Response doesn't match pending msg. w0 = 0x%08x w1 = 0x%08x\n",
+ fsp->index, w0, w1);
+ return;
+ } else {
+ u64 resetbit = ~fsp_get_class_bit(req->word0 & 0xff);
+ fsp_cmdclass_resp_bitmask &= resetbit;
+ cmdclass->timesent = 0;
+ }
+
+ /* Allocate the response if needed. XXX Should we complete
+ * the original message with some kind of error here ?
+ */
+ if (!req->resp) {
+ req->resp = __fsp_allocmsg();
+ if (!req->resp) {
+ __fsp_drop_incoming(fsp);
+ prerror("FSP #%d: Failed to allocate response\n",
+ fsp->index);
+ return;
+ }
+ }
+
+ /* Populate and complete (will drop the lock) */
+ req->resp->state = fsp_msg_response;
+ __fsp_fill_incoming(fsp, req->resp, dlen, w0, w1);
+ fsp_complete_msg(req);
+ return;
+ }
+
+ /* Allocate an incoming message */
+ msg = __fsp_allocmsg();
+ if (!msg) {
+ __fsp_drop_incoming(fsp);
+ prerror("FSP #%d: Failed to allocate incoming msg\n",
+ fsp->index);
+ return;
+ }
+ msg->state = fsp_msg_incoming;
+ __fsp_fill_incoming(fsp, msg, dlen, w0, w1);
+
+ /* Handle FSP commands. This can recurse into fsp_queue_msg etc.. */
+ unlock(&fsp_lock);
+ fsp_handle_command(msg);
+ lock(&fsp_lock);
+}
+
+static void fsp_check_queues(struct fsp *fsp)
+{
+ int i;
+
+ /* XXX In the long run, we might want to have a queue of
+ * classes waiting to be serviced to speed this up, either
+ * that or a bitmap.
+ */
+ for (i = 0; i <= (FSP_MCLASS_LAST - FSP_MCLASS_FIRST); i++) {
+ struct fsp_cmdclass *cmdclass = &fsp_cmdclass[i];
+
+ if (fsp->state != fsp_mbx_idle)
+ break;
+ if (cmdclass->busy || list_empty(&cmdclass->msgq))
+ continue;
+ fsp_poke_queue(cmdclass);
+ }
+}
+
+static void __fsp_poll(bool interrupt)
+{
+ struct fsp_iopath *iop;
+ struct fsp *fsp = fsp_get_active();
+ u32 ctl, hdir = 0;
+ bool psi_irq;
+
+ /*
+ * The tracer isn't terribly efficient at detecting dups
+ * especially when coming from multiple CPUs so we do our
+ * own change-detection locally
+ */
+ static u32 hdir_last_trace;
+ static u32 ctl_last_trace;
+ static bool psi_irq_last_trace;
+ static bool irq_last_trace;
+
+ if (!fsp)
+ return;
+
+ /* Crazy interrupt handling scheme:
+ *
+ * In order to avoid "losing" interrupts when polling the mbox
+ * we only clear interrupt conditions when called as a result of
+ * an interrupt.
+ *
+ * That way, if a poll clears, for example, the HPEND condition,
+ * the interrupt remains, causing a dummy interrupt later on
+ * thus allowing the OS to be notified of a state change (ie it
+ * doesn't need every poll site to monitor every state change).
+ *
+ * However, this scheme is complicated by the fact that we need
+ * to clear the interrupt condition after we have cleared the
+ * original condition in HCTL, and we might have long stale
+ * interrupts which we do need to eventually get rid of. However
+ * clearing interrupts in such a way is racy, so we need to loop
+ * and re-poll HCTL after having done so or we might miss an
+ * event. It's a latency risk, but unlikely and probably worth it.
+ */
+
+ again:
+ if (fsp->active_iopath < 0) {
+ /* That should never happen */
+ if (interrupt && (fsp->state != fsp_mbx_rr))
+ prerror("FSP: Interrupt with no working IO path\n");
+ return;
+ }
+ iop = &fsp->iopath[fsp->active_iopath];
+
+ /* Check for error state and handle R&R completion */
+ fsp_handle_errors(fsp);
+
+ /* Handle host initiated resets */
+ if (fsp_in_hir(fsp)) {
+ fsp_hir_poll(fsp, iop->psi);
+ return;
+ }
+
+ /*
+ * The above might have triggered an R&R, check that we
+ * are still functional
+ */
+ if ((fsp->active_iopath < 0) || fsp_in_hir(fsp))
+ return;
+ iop = &fsp->iopath[fsp->active_iopath];
+
+ /* Read interrupt status (we may or may not use it) */
+ hdir = fsp_rreg(fsp, FSP_HDIR_REG);
+
+ /* Read control now as well so we can trace them */
+ ctl = fsp_rreg(fsp, FSP_MBX1_HCTL_REG);
+
+ /* Ditto with PSI irq state */
+ psi_irq = psi_poll_fsp_interrupt(iop->psi);
+
+ /* Trace it if anything changes */
+ if (hdir != hdir_last_trace || ctl != ctl_last_trace ||
+ interrupt != irq_last_trace || psi_irq != psi_irq_last_trace) {
+ fsp_trace_event(fsp, TRACE_FSP_EVT_POLL_IRQ,
+ interrupt, hdir, ctl, psi_irq);
+
+ hdir_last_trace = hdir;
+ ctl_last_trace = ctl;
+ irq_last_trace = interrupt;
+ psi_irq_last_trace = psi_irq;
+ }
+
+ /*
+ * an interrupt, it might still latch some bits here (and we found cases
+ * interrupt, it might still latch some bits here (and we found cases
+ * where the MBOX2 XUP would be set). If that happens, clearing HDIR
+ * never works (the bit gets set again immediately) because we don't
+ * clear the condition in HCTL2 and thus we loop forever.
+ */
+ hdir &= FSP_DBIRQ_MBOX1;
+
+ /*
+ * Sanity check: If an interrupt is pending and we are in polling
+ * mode, check that the PSI side is also pending. If some bit is
+ * set, just clear and move on.
+ */
+ if (hdir && !interrupt && !psi_irq) {
+ prerror("FSP: WARNING ! HDIR 0x%08x but no PSI irq !\n", hdir);
+ fsp_wreg(fsp, FSP_HDIR_REG, hdir);
+ }
+
+ /*
+ * We should never have the mbox in error state here unless it
+ * was fine until some printf inside fsp_handle_errors() caused
+ * the console to poke the FSP which detected a brand new error
+ * in the process. Let's be safe rather than sorry and handle that
+ * here
+ */
+ if (fsp_in_hir(fsp) || fsp->state == fsp_mbx_err) {
+ prerror("FSP: Late error state detection\n");
+ goto again;
+ }
+
+ /*
+ * If we are in an R&R state with an active IO path, we
+ * shouldn't be getting interrupts. If we do, just clear
+ * the condition and print a message
+ */
+ if (fsp->state == fsp_mbx_rr) {
+ if (interrupt) {
+ prerror("FSP: Interrupt in RR state [HDIR=0x%08x]\n",
+ hdir);
+ fsp_wreg(fsp, FSP_HDIR_REG, hdir);
+ }
+ return;
+ }
+
+ /* Poll FSP CTL */
+ if (ctl & (FSP_MBX_CTL_XUP | FSP_MBX_CTL_HPEND))
+ prlog(PR_INSANE, "FSP #%d: poll, ctl: %x\n", fsp->index, ctl);
+
+ /* Do we have a pending message waiting to complete ? */
+ if (ctl & FSP_MBX_CTL_XUP) {
+ fsp_wreg(fsp, FSP_MBX1_HCTL_REG, FSP_MBX_CTL_XUP);
+ if (fsp->state == fsp_mbx_send) {
+ /* mbox is free */
+ fsp->state = fsp_mbx_idle;
+
+ /* Complete message (will break the lock) */
+ fsp_complete_send(fsp);
+
+ /* Lock can have been broken, so ctl is now
+ * potentially invalid, let's recheck
+ */
+ goto again;
+ } else {
+ prerror("FSP #%d: Got XUP with no pending message !\n",
+ fsp->index);
+ }
+ }
+
+ if (fsp->state == fsp_mbx_send) {
+ /* XXX Handle send timeouts!!! */
+ }
+
+ /* Is there an incoming message ? This will break the lock as well */
+ if (ctl & FSP_MBX_CTL_HPEND)
+ fsp_handle_incoming(fsp);
+
+ /* Note: Lock may have been broken above, thus ctl might be invalid
+ * now, don't use it any further.
+ */
+
+ /* Check for something else to send */
+ if (fsp->state == fsp_mbx_idle)
+ fsp_check_queues(fsp);
+
+ /* Clear interrupts, and recheck HCTL if any occurred */
+ if (interrupt && hdir) {
+ fsp_wreg(fsp, FSP_HDIR_REG, hdir);
+ goto again;
+ }
+}
+
+void fsp_interrupt(void)
+{
+ lock(&fsp_lock);
+ __fsp_poll(true);
+ unlock(&fsp_lock);
+}
+
+
+int fsp_sync_msg(struct fsp_msg *msg, bool autofree)
+{
+ int rc;
+
+ rc = fsp_queue_msg(msg, NULL);
+ if (rc)
+ goto bail;
+
+ while(fsp_msg_busy(msg)) {
+ if (fsp_in_rr()) {
+ fsp_cancelmsg(msg);
+ rc = -1;
+ goto bail;
+ }
+ cpu_relax();
+ opal_run_pollers();
+ }
+
+ switch(msg->state) {
+ case fsp_msg_done:
+ rc = 0;
+ break;
+ case fsp_msg_timeout:
+ rc = -1; /* XXX to improve */
+ break;
+ default:
+ rc = -1; /* Should not happen... (assert ?) */
+ }
+
+ if (msg->resp)
+ rc = (msg->resp->word1 >> 8) & 0xff;
+ bail:
+ if (autofree)
+ fsp_freemsg(msg);
+ return rc;
+}
+
+void fsp_register_client(struct fsp_client *client, u8 msgclass)
+{
+ struct fsp_cmdclass *cmdclass = __fsp_get_cmdclass(msgclass);
+
+ if (!fsp_present())
+ return;
+ assert(cmdclass);
+ list_add_tail(&cmdclass->clientq, &client->link);
+}
+
+void fsp_unregister_client(struct fsp_client *client, u8 msgclass)
+{
+ struct fsp_cmdclass *cmdclass = __fsp_get_cmdclass(msgclass);
+
+ if (!fsp_present())
+ return;
+ assert(cmdclass);
+ list_del_from(&cmdclass->clientq, &client->link);
+}
+
+static int fsp_init_mbox(struct fsp *fsp)
+{
+ unsigned int i;
+ u32 reg;
+
+ /*
+ * Note: The documentation contradicts itself as to
+ * whether the HDIM bits should be set or cleared to
+ * enable interrupts
+ *
+ * This seems to work...
+ */
+
+ /* Mask all interrupts */
+ fsp_wreg(fsp, FSP_HDIM_CLR_REG, FSP_DBIRQ_ALL);
+
+ /* Clear all errors */
+ fsp_wreg(fsp, FSP_HDES_REG, FSP_DBERRSTAT_CLR1 | FSP_DBERRSTAT_CLR2);
+
+ /* Initialize data area as the doco says */
+ for (i = 0; i < 0x40; i += 4)
+ fsp_wreg(fsp, FSP_MBX1_HDATA_AREA + i, 0);
+
+ /*
+ * Clear whatever crap may remain in HDCR. Do not write XDN as that
+ * would be interpreted incorrectly as an R&R completion which
+ * we aren't ready to send yet !
+ */
+ fsp_wreg(fsp, FSP_MBX1_HCTL_REG, FSP_MBX_CTL_XUP | FSP_MBX_CTL_HPEND |
+ FSP_MBX_CTL_HCSP_MASK | FSP_MBX_CTL_DCSP_MASK |
+ FSP_MBX_CTL_PTS);
+
+ /* Clear all pending interrupts */
+ fsp_wreg(fsp, FSP_HDIR_REG, FSP_DBIRQ_ALL);
+
+ /* Enable all mbox1 interrupts */
+ fsp_wreg(fsp, FSP_HDIM_SET_REG, FSP_DBIRQ_MBOX1);
+
+ /* Decode what FSP we are connected to */
+ reg = fsp_rreg(fsp, FSP_SCRATCH0_REG);
+ if (reg & PPC_BIT32(0)) { /* Is it a valid connection */
+ if (reg & PPC_BIT32(3))
+ prlog(PR_INFO, "FSP: Connected to FSP-B\n");
+ else
+ prlog(PR_INFO, "FSP: Connected to FSP-A\n");
+ }
+
+ return 0;
+}
+
+/* We use a single fixed TCE table for all PSI interfaces */
+static void fsp_init_tce_table(void)
+{
+ fsp_tce_table = (__be64 *)PSI_TCE_TABLE_BASE;
+
+ memset(fsp_tce_table, 0, PSI_TCE_TABLE_SIZE);
+}
+
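+/*
+ * Each 64-bit TCE entry maps one TCE_PSIZE page into the PSI DMA window;
+ * the low bits (0x3) mark the entry as valid for read and write.
+ */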
+void fsp_tce_map(u32 offset, void *addr, u32 size)
+{
+ u64 raddr = (u64)addr;
+
+ assert(!(offset & TCE_MASK));
+ assert(!(raddr & TCE_MASK));
+ assert(!(size & TCE_MASK));
+
+ size >>= TCE_SHIFT;
+ offset >>= TCE_SHIFT;
+
+ while(size--) {
+ fsp_tce_table[offset++] = cpu_to_be64(raddr | 0x3);
+ raddr += TCE_PSIZE;
+ }
+}
+
+void fsp_tce_unmap(u32 offset, u32 size)
+{
+ assert(!(offset & TCE_MASK));
+ assert(!(size & TCE_MASK));
+
+ size >>= TCE_SHIFT;
+ offset >>= TCE_SHIFT;
+
+ while(size--)
+ fsp_tce_table[offset++] = 0;
+}
+
+static struct fsp *fsp_find_by_index(int index)
+{
+ struct fsp *fsp = first_fsp;
+
+ do {
+ if (fsp->index == index)
+ return fsp;
+ } while (fsp->link != first_fsp);
+
+ return NULL;
+}
+
+static void fsp_init_links(struct dt_node *fsp_node)
+{
+ const struct dt_property *linksprop;
+ int i, index;
+ struct fsp *fsp;
+ struct fsp_iopath *fiop;
+
+ linksprop = dt_find_property(fsp_node, "ibm,psi-links");
+ assert(linksprop);
+
+ index = dt_prop_get_u32(fsp_node, "reg");
+ fsp = fsp_find_by_index(index);
+ if (!fsp) {
+ prerror("FSP: FSP with index %d not found\n", index);
+ return;
+ }
+
+ fsp->state = fsp_mbx_idle;
+
+ /* Iterate all links */
+ for (i = 0; i < fsp->iopath_count; i++) {
+ u64 reg;
+ u32 link;
+
+ link = dt_property_get_cell(linksprop, i);
+ fiop = &fsp->iopath[i];
+ fiop->psi = psi_find_link(link);
+ if (fiop->psi == NULL) {
+ prerror("FSP #%d: Couldn't find PSI link\n",
+ fsp->index);
+ continue;
+ }
+
+ prlog(PR_DEBUG, "FSP #%d: Found PSI HB link to chip %d\n",
+ fsp->index, link);
+
+ psi_fsp_link_in_use(fiop->psi);
+
+ /* Get the FSP register window */
+ reg = in_be64(fiop->psi->regs + PSIHB_FSPBAR);
+ fiop->fsp_regs = (void *)(reg | (1ULL << 63) |
+ dt_prop_get_u32(fsp_node, "reg-offset"));
+ }
+}
+
+static void fsp_update_links_states(struct fsp *fsp)
+{
+ struct fsp_iopath *fiop;
+ unsigned int i;
+
+ /* Iterate all links */
+ for (i = 0; i < fsp->iopath_count; i++) {
+ fiop = &fsp->iopath[i];
+ if (!fiop->psi)
+ fiop->state = fsp_path_bad;
+ else if (fiop->psi->active) {
+ fsp->active_iopath = i;
+ fiop->state = fsp_path_active;
+ } else
+ fiop->state = fsp_path_backup;
+ }
+
+ if (fsp->active_iopath >= 0) {
+ if (!active_fsp || (active_fsp != fsp))
+ active_fsp = fsp;
+
+ fsp_inbound_off = 0;
+ fiop = &fsp->iopath[fsp->active_iopath];
+ psi_init_for_fsp(fiop->psi);
+ fsp_init_mbox(fsp);
+ }
+}
+
+void fsp_reinit_fsp(void)
+{
+ struct fsp *fsp;
+
+ /* Notify all FSPs to check for an updated link state */
+ for (fsp = first_fsp; fsp; fsp = fsp->link)
+ fsp_update_links_states(fsp);
+}
+
+static void fsp_create_fsp(struct dt_node *fsp_node)
+{
+ const struct dt_property *linksprop;
+ struct fsp *fsp;
+ int count, index;
+
+ index = dt_prop_get_u32(fsp_node, "reg");
+ prlog(PR_INFO, "FSP #%d: Found in device-tree, setting up...\n",
+ index);
+
+ linksprop = dt_find_property(fsp_node, "ibm,psi-links");
+ if (!linksprop || linksprop->len < 4) {
+ prerror("FSP #%d: No links !\n", index);
+ return;
+ }
+
+ fsp = zalloc(sizeof(struct fsp));
+ if (!fsp) {
+ prerror("FSP #%d: Can't allocate memory !\n", index);
+ return;
+ }
+
+ fsp->index = index;
+ fsp->active_iopath = -1;
+
+ count = linksprop->len / 4;
+ prlog(PR_DEBUG, "FSP #%d: Found %d IO PATH\n", index, count);
+ if (count > FSP_MAX_IOPATH) {
+ prerror("FSP #%d: WARNING, limited to %d IO PATH\n",
+ index, FSP_MAX_IOPATH);
+ count = FSP_MAX_IOPATH;
+ }
+ fsp->iopath_count = count;
+
+ fsp->link = first_fsp;
+ first_fsp = fsp;
+
+ fsp_init_links(fsp_node);
+ fsp_update_links_states(fsp);
+
+ if (fsp->active_iopath >= 0)
+ psi_enable_fsp_interrupt(fsp->iopath[fsp->active_iopath].psi);
+}
+
+static void fsp_opal_poll(void *data __unused)
+{
+ /* Test the host initiated reset */
+ if (hir_trigger == 0xdeadbeef) {
+ uint32_t plid = log_simple_error(&e_info(OPAL_INJECTED_HIR),
+ "SURV: Injected HIR, initiating FSP R/R\n");
+ fsp_trigger_reset(plid);
+ hir_trigger = 0;
+ }
+
+ if (try_lock(&fsp_lock)) {
+ __fsp_poll(false);
+ unlock(&fsp_lock);
+ }
+}
+
+int fsp_fatal_msg(struct fsp_msg *msg)
+{
+ int rc = 0;
+
+ rc = fsp_queue_msg(msg, NULL);
+ if (rc)
+ return rc;
+
+ while(fsp_msg_busy(msg)) {
+ if (fsp_in_rr()) {
+ fsp_cancelmsg(msg);
+ return -1;
+ }
+
+ cpu_relax();
+ fsp_opal_poll(NULL);
+ }
+
+ switch(msg->state) {
+ case fsp_msg_done:
+ rc = 0;
+ break;
+ case fsp_msg_timeout:
+ rc = -1; /* XXX to improve */
+ break;
+ default:
+ rc = -1; /* Should not happen... (assert ?) */
+ }
+
+ if (msg->resp)
+ rc = (msg->resp->word1 >> 8) & 0xff;
+
+ return rc;
+}
+
+static bool fsp_init_one(const char *compat)
+{
+ struct dt_node *fsp_node;
+ bool inited = false;
+
+ dt_for_each_compatible(dt_root, fsp_node, compat) {
+ if (!inited) {
+ int i;
+
+ /* Initialize the per-class msg queues */
+ for (i = 0;
+ i <= (FSP_MCLASS_LAST - FSP_MCLASS_FIRST); i++) {
+ list_head_init(&fsp_cmdclass[i].msgq);
+ list_head_init(&fsp_cmdclass[i].clientq);
+ list_head_init(&fsp_cmdclass[i].rr_queue);
+ }
+
+ /* Init the queues for RR notifier cmdclass */
+ list_head_init(&fsp_cmdclass_rr.msgq);
+ list_head_init(&fsp_cmdclass_rr.clientq);
+ list_head_init(&fsp_cmdclass_rr.rr_queue);
+
+ /* Register poller */
+ opal_add_poller(fsp_opal_poll, NULL);
+
+ inited = true;
+ }
+
+ /* Create the FSP data structure */
+ fsp_create_fsp(fsp_node);
+ }
+
+ return inited;
+}
+
+void fsp_init(void)
+{
+ prlog(PR_DEBUG, "FSP: Looking for FSP...\n");
+
+ fsp_init_tce_table();
+
+ if (!fsp_init_one("ibm,fsp1") && !fsp_init_one("ibm,fsp2")) {
+ prlog(PR_DEBUG, "FSP: No FSP on this machine\n");
+ return;
+ }
+}
+
+bool fsp_present(void)
+{
+ return first_fsp != NULL;
+}
+
+static void fsp_timeout_poll(void *data __unused)
+{
+ u64 now = mftb();
+ u64 timeout_val = 0;
+ u64 cmdclass_resp_bitmask = fsp_cmdclass_resp_bitmask;
+ struct fsp_cmdclass *cmdclass = NULL;
+ struct fsp_msg *req = NULL;
+ u32 index = 0;
+
+ if (timeout_timer == 0)
+ timeout_timer = now + secs_to_tb(30);
+
+ /* The lowest granularity for a message timeout is 30 secs.
+ * So every 30 secs, check if there is any message
+ * waiting for a response from the FSP
+ */
+ if (tb_compare(now, timeout_timer) == TB_ABEFOREB)
+ return;
+ if (!try_lock(&fsp_poll_lock))
+ return;
+ if (tb_compare(now, timeout_timer) == TB_ABEFOREB) {
+ unlock(&fsp_poll_lock);
+ return;
+ }
+
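+ /* Walk the per-class "response pending" bitmask, one bit per command class */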
+ while (cmdclass_resp_bitmask) {
+ u64 time_sent = 0;
+ u64 time_to_comp = 0;
+
+ if (!(cmdclass_resp_bitmask & 0x1))
+ goto next_bit;
+
+ cmdclass = &fsp_cmdclass[index];
+ timeout_val = secs_to_tb((cmdclass->timeout) * 60);
+ time_sent = cmdclass->timesent;
+ time_to_comp = now - cmdclass->timesent;
+
+ /* Now check if the response has timed out */
+ if (tb_compare(time_to_comp, timeout_val) == TB_AAFTERB) {
+ u32 w0, w1;
+ enum fsp_msg_state mstate;
+
+ /* Take the FSP lock now and re-check */
+ lock(&fsp_lock);
+ if (!(fsp_cmdclass_resp_bitmask & (1ull << index)) ||
+ time_sent != cmdclass->timesent) {
+ unlock(&fsp_lock);
+ goto next_bit;
+ }
+ req = list_top(&cmdclass->msgq, struct fsp_msg, link);
+ if (!req) {
+ printf("FSP: Timeout state mismatch on class %d\n",
+ index);
+ fsp_cmdclass_resp_bitmask &= ~(1ull << index);
+ cmdclass->timesent = 0;
+ unlock(&fsp_lock);
+ goto next_bit;
+ }
+ w0 = req->word0;
+ w1 = req->word1;
+ mstate = req->state;
+ prlog(PR_WARNING, "FSP: Response from FSP timed out,"
+ " cmd = %x subcmd = %x mod = %x state: %d\n",
+ w0 & 0xff, w1 & 0xff, (w1 >> 8) & 0xff, mstate);
+ fsp_reg_dump();
+ fsp_cmdclass_resp_bitmask &= ~(1ull << index);
+ cmdclass->timesent = 0;
+ if (req->resp) {
+ req->resp->state = fsp_msg_timeout;
+ req->resp->word1 = (FSP_STATUS_BUSY << 8) |
+ (req->resp->word1 & 0xff);
+ }
+ fsp_complete_msg(req);
+ __fsp_trigger_reset();
+ unlock(&fsp_lock);
+ fsp_hir_reason_plid = log_simple_error(
+ &e_info(OPAL_RC_FSP_POLL_TIMEOUT),
+ "FSP: Response from FSP timed out,"
+ " cmd = %x subcmd = %x mod = %x state: %d\n",
+ w0 & 0xff, w1 & 0xff, (w1 >> 8) & 0xff, mstate);
+ }
+ next_bit:
+ cmdclass_resp_bitmask = cmdclass_resp_bitmask >> 1;
+ index++;
+ }
+ unlock(&fsp_poll_lock);
+}
+
+void fsp_opl(void)
+{
+ struct dt_node *iplp;
+
+ if (!fsp_present())
+ return;
+
+ /* Send OPL */
+ ipl_state |= ipl_opl_sent;
+ fsp_sync_msg(fsp_mkmsg(FSP_CMD_OPL, 0), true);
+ while(!(ipl_state & ipl_got_continue)) {
+ opal_run_pollers();
+ cpu_relax();
+ }
+
+ /* Send continue ACK */
+ fsp_sync_msg(fsp_mkmsg(FSP_CMD_CONTINUE_ACK, 0), true);
+
+ /* Wait for various FSP messages */
+ prlog(PR_INFO, "INIT: Waiting for FSP to advertise new role...\n");
+ while(!(ipl_state & ipl_got_new_role)) {
+ cpu_relax();
+ opal_run_pollers();
+ }
+ prlog(PR_INFO, "INIT: Waiting for FSP to request capabilities...\n");
+ while(!(ipl_state & ipl_got_caps)) {
+ cpu_relax();
+ opal_run_pollers();
+ }
+
+ /* Initiate the timeout poller */
+ opal_add_poller(fsp_timeout_poll, NULL);
+
+ /* Tell FSP we are in standby */
+ prlog(PR_INFO, "INIT: Sending HV Functional: Standby...\n");
+ fsp_sync_msg(fsp_mkmsg(FSP_CMD_HV_FUNCTNAL, 1, 0x01000000), true);
+
+ /* Wait for FSP functional */
+ prlog(PR_INFO, "INIT: Waiting for FSP functional\n");
+ while(!(ipl_state & ipl_got_fsp_functional)) {
+ cpu_relax();
+ opal_run_pollers();
+ }
+
+ /* Tell FSP we are in running state */
+ prlog(PR_INFO, "INIT: Sending HV Functional: Runtime...\n");
+ fsp_sync_msg(fsp_mkmsg(FSP_CMD_HV_FUNCTNAL, 1, 0x02000000), true);
+
+ /*
+ * For the factory reset case, FSP sends us the PCI Bus
+ * Reset request. We don't have to do anything special with
+ * PCI bus numbers here; just send the Power Down message
+ * with modifier 0x02 to FSP.
+ */
+ iplp = dt_find_by_path(dt_root, "ipl-params/ipl-params");
+ if (iplp && dt_find_property(iplp, "pci-busno-reset-ipl")) {
+ prlog(PR_DEBUG, "INIT: PCI Bus Reset requested."
+ " Sending Power Down\n");
+ fsp_sync_msg(fsp_mkmsg(FSP_CMD_POWERDOWN_PCIRS, 0), true);
+ }
+
+ /*
+ * Tell FSP we are in running state with all partitions.
+ *
+ * This is needed, otherwise the FSP will not reset its reboot count
+ * on failures. Ideally we should send that when we know the
+ * OS is up but we don't currently have a very good way to do
+ * that so this will do as a stop-gap
+ */
+ prlog(PR_NOTICE, "INIT: Sending HV Functional: Runtime all partitions\n");
+ fsp_sync_msg(fsp_mkmsg(FSP_CMD_HV_FUNCTNAL, 1, 0x04000000), true);
+}
+
+uint32_t fsp_adjust_lid_side(uint32_t lid_no)
+{
+ struct dt_node *iplp;
+ const char *side = NULL;
+
+ iplp = dt_find_by_path(dt_root, "ipl-params/ipl-params");
+ if (iplp)
+ side = dt_prop_get_def(iplp, "cec-ipl-side", NULL);
+ if (!side || !strcmp(side, "temp"))
+ lid_no |= ADJUST_T_SIDE_LID_NO;
+ return lid_no;
+}
+
+struct fsp_fetch_lid_item {
+ enum resource_id id;
+ uint32_t idx;
+
+ uint32_t lid;
+ uint32_t lid_no;
+ uint64_t bsize;
+ uint32_t offset;
+ void *buffer;
+ size_t *length;
+ size_t remaining;
+ size_t chunk_requested;
+ struct list_node link;
+ int result;
+};
+
+/*
+ * We have a queue of things to fetch; once an item has been fetched,
+ * it moves to fsp_fetched_lid until we're asked whether it has been
+ * fetched, at which point it's free()d.
+ *
+ * Everything is protected with fsp_fetch_lock.
+ *
+ * We use PSI_DMA_FETCH TCE entry for this fetching queue. If something
+ * is in the fsp_fetch_lid_queue, it means we're using this TCE entry!
+ *
+ * If we add the first entry to fsp_fetch_lid_queue, we trigger fetching!
+ */
+static LIST_HEAD(fsp_fetch_lid_queue);
+static LIST_HEAD(fsp_fetched_lid);
+static struct lock fsp_fetch_lock = LOCK_UNLOCKED;
+
+/*
+ * Asynchronous fsp fetch data call
+ *
+ * Note:
+ * buffer = PSI DMA address space
+ */
+int fsp_fetch_data_queue(uint8_t flags, uint16_t id, uint32_t sub_id,
+ uint32_t offset, void *buffer, size_t *length,
+ void (*comp)(struct fsp_msg *msg))
+{
+ struct fsp_msg *msg;
+ uint32_t chunk = *length;
+
+ if (!comp)
+ return OPAL_PARAMETER;
+
+ msg = fsp_mkmsg(FSP_CMD_FETCH_SP_DATA, 0x6, flags << 16 | id,
+ sub_id, offset, 0, buffer, chunk);
+ if (!msg) {
+ prerror("FSP: allocation failed!\n");
+ return OPAL_INTERNAL_ERROR;
+ }
+ if (fsp_queue_msg(msg, comp)) {
+ fsp_freemsg(msg);
+ prerror("FSP: Failed to queue fetch data message\n");
+ return OPAL_INTERNAL_ERROR;
+ }
+ return OPAL_SUCCESS;
+}
+
+#define CAPP_IDX_VENICE_DD10 0x100ea
+#define CAPP_IDX_VENICE_DD20 0x200ea
+#define CAPP_IDX_MURANO_DD20 0x200ef
+#define CAPP_IDX_MURANO_DD21 0x201ef
+#define CAPP_IDX_NAPLES_DD10 0x100d3
+#define CAPP_IDX_NIMBUS_DD10 0x100d1
+#define CAPP_IDX_NIMBUS_DD20 0x200d1
+#define CAPP_IDX_NIMBUS_DD21 0x201d1
+#define CAPP_IDX_NIMBUS_DD22 0x202d1
+#define CAPP_IDX_NIMBUS_DD23 0x203d1
+
+#define IMA_CATALOG_NIMBUS 0x4e0200
+#define IMA_CATALOG_P10_DD1 0x800100
+#define IMA_CATALOG_P10_DD2 0x800200
+
+
+static struct {
+ enum resource_id id;
+ uint32_t idx;
+ uint32_t lid_no;
+} fsp_lid_map[] = {
+ { RESOURCE_ID_KERNEL, RESOURCE_SUBID_NONE, KERNEL_LID_OPAL },
+ { RESOURCE_ID_INITRAMFS,RESOURCE_SUBID_NONE, INITRAMFS_LID_OPAL },
+ { RESOURCE_ID_IMA_CATALOG,IMA_CATALOG_NIMBUS, 0x80f00103 },
+ { RESOURCE_ID_CAPP, CAPP_IDX_MURANO_DD20, 0x80a02002 },
+ { RESOURCE_ID_CAPP, CAPP_IDX_MURANO_DD21, 0x80a02001 },
+ { RESOURCE_ID_CAPP, CAPP_IDX_VENICE_DD10, 0x80a02003 },
+ { RESOURCE_ID_CAPP, CAPP_IDX_VENICE_DD20, 0x80a02004 },
+ { RESOURCE_ID_CAPP, CAPP_IDX_NAPLES_DD10, 0x80a02005 },
+ { RESOURCE_ID_CAPP, CAPP_IDX_NIMBUS_DD10, 0x80a02006 },
+ { RESOURCE_ID_CAPP, CAPP_IDX_NIMBUS_DD20, 0x80a02007 },
+ { RESOURCE_ID_CAPP, CAPP_IDX_NIMBUS_DD21, 0x80a02007 },
+ { RESOURCE_ID_CAPP, CAPP_IDX_NIMBUS_DD22, 0x80a02007 },
+ { RESOURCE_ID_CAPP, CAPP_IDX_NIMBUS_DD23, 0x80a02007 },
+ { RESOURCE_ID_IMA_CATALOG,IMA_CATALOG_P10_DD1, 0x80f00103 },
+ { RESOURCE_ID_IMA_CATALOG,IMA_CATALOG_P10_DD2, 0x80f00103 },
+};
+
+static void fsp_start_fetching_next_lid(void);
+static void fsp_fetch_lid_next_chunk(struct fsp_fetch_lid_item *last);
+
+static void fsp_fetch_lid_complete(struct fsp_msg *msg)
+{
+ struct fsp_fetch_lid_item *last;
+ uint32_t woffset, wlen;
+ uint8_t rc;
+
+ lock(&fsp_fetch_lock);
+ last = list_top(&fsp_fetch_lid_queue, struct fsp_fetch_lid_item, link);
+ fsp_tce_unmap(PSI_DMA_FETCH, last->bsize);
+
+ woffset = fsp_msg_get_data_word(msg->resp, 1);
+ wlen = fsp_msg_get_data_word(msg->resp, 2);
+ rc = (msg->resp->word1 >> 8) & 0xff;
+
+ /* Fall back to a PHYP LID for kernel loads */
+ if (rc && last->lid_no == KERNEL_LID_OPAL) {
+ const char *ltype = dt_prop_get_def(dt_root, "lid-type", NULL);
+ if (!ltype || strcmp(ltype, "opal")) {
+ prerror("Failed to load in OPAL mode...\n");
+ last->result = OPAL_PARAMETER;
+ last = list_pop(&fsp_fetch_lid_queue,
+ struct fsp_fetch_lid_item, link);
+ list_add_tail(&fsp_fetched_lid, &last->link);
+ fsp_start_fetching_next_lid();
+ unlock(&fsp_fetch_lock);
+ return;
+ }
+ printf("Trying to load as PHYP LID...\n");
+ last->lid = KERNEL_LID_PHYP;
+ /* Retry with different LID */
+ fsp_fetch_lid_next_chunk(last);
+ }
+
+ if (rc != 0 && rc != 2) {
+ last->result = -EIO;
+ last = list_pop(&fsp_fetch_lid_queue, struct fsp_fetch_lid_item, link);
+ prerror("FSP LID %08x load ERROR %d\n", last->lid_no, rc);
+ list_add_tail(&fsp_fetched_lid, &last->link);
+ fsp_start_fetching_next_lid();
+ unlock(&fsp_fetch_lock);
+ return;
+ }
+
+ /*
+ * As per documentation, rc=2 means end of file not reached and
+ * rc=1 means we reached end of file. But it looks like we always
+ * get rc=0 irrespective of whether end of file is reached or not.
+ * The old implementation (fsp_sync_msg) used to rely on
+ * (wlen < chunk) to decide whether we reached end of file.
+ *
+ * Ideally the FSP folks should fix their code as per the documentation,
+ * but until they do, we add the old check (hack) back here.
+ *
+ * Without this hack some systems would load a partial LID and would
+ * not be able to boot into the petitboot kernel.
+ */
+ if (rc == 0 && (wlen < last->chunk_requested))
+ last->result = OPAL_SUCCESS;
+
+ fsp_freemsg(msg);
+
+ last->remaining -= wlen;
+ *(last->length) += wlen;
+ last->buffer += wlen;
+ last->offset += wlen;
+
+ prlog(PR_DEBUG, "FSP: LID %x Chunk read -> rc=0x%02x off: %08x"
+ " twritten: %08x\n", last->lid, rc, woffset, wlen);
+
+ fsp_fetch_lid_next_chunk(last);
+
+ unlock(&fsp_fetch_lock);
+}
+
+static void fsp_fetch_lid_next_chunk(struct fsp_fetch_lid_item *last)
+{
+ uint64_t baddr;
+ uint64_t balign, boff;
+ uint32_t chunk;
+ uint32_t taddr;
+ struct fsp_msg *msg;
+ uint8_t flags = 0;
+ uint16_t id = FSP_DATASET_NONSP_LID;
+ uint32_t sub_id;
+
+ assert(lock_held_by_me(&fsp_fetch_lock));
+
+ if (last->remaining == 0 || last->result == OPAL_SUCCESS) {
+ last->result = OPAL_SUCCESS;
+ last = list_pop(&fsp_fetch_lid_queue,
+ struct fsp_fetch_lid_item, link);
+ list_add_tail(&fsp_fetched_lid, &last->link);
+ fsp_start_fetching_next_lid();
+ return;
+ }
+
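+ /*
+ * The destination buffer is mapped through the fixed PSI_DMA_FETCH
+ * TCE window, so each chunk is limited to what fits in the window
+ * after accounting for the buffer's offset within its first TCE page.
+ */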
+ baddr = (uint64_t)last->buffer;
+ balign = baddr & ~TCE_MASK;
+ boff = baddr & TCE_MASK;
+
+ chunk = last->remaining;
+ if (chunk > (PSI_DMA_FETCH_SIZE - boff))
+ chunk = PSI_DMA_FETCH_SIZE - boff;
+ last->bsize = ((boff + chunk) + TCE_MASK) & ~TCE_MASK;
+ last->chunk_requested = chunk;
+
+ prlog(PR_DEBUG, "FSP: LID %08x chunk 0x%08x bytes balign=%llx"
+ " boff=%llx bsize=%llx\n",
+ last->lid_no, chunk, balign, boff, last->bsize);
+
+ fsp_tce_map(PSI_DMA_FETCH, (void *)balign, last->bsize);
+ taddr = PSI_DMA_FETCH + boff;
+
+ sub_id = last->lid;
+
+ msg = fsp_mkmsg(FSP_CMD_FETCH_SP_DATA, 6,
+ flags << 16 | id, sub_id, last->offset,
+ 0, taddr, chunk);
+
+ if (fsp_queue_msg(msg, fsp_fetch_lid_complete)) {
+ fsp_freemsg(msg);
+ prerror("FSP: Failed to queue fetch data message\n");
+ last->result = OPAL_INTERNAL_ERROR;
+ last = list_pop(&fsp_fetch_lid_queue,
+ struct fsp_fetch_lid_item, link);
+ list_add_tail(&fsp_fetched_lid, &last->link);
+ }
+ last->result = OPAL_BUSY;
+}
+
+static void fsp_start_fetching_next_lid(void)
+{
+ struct fsp_fetch_lid_item *last;
+
+ assert(lock_held_by_me(&fsp_fetch_lock));
+
+ last = list_top(&fsp_fetch_lid_queue, struct fsp_fetch_lid_item, link);
+
+ if (last == NULL)
+ return;
+
+ /* If we're not already fetching */
+ if (last->result == OPAL_EMPTY)
+ fsp_fetch_lid_next_chunk(last);
+}
+
+int fsp_start_preload_resource(enum resource_id id, uint32_t idx,
+ void *buf, size_t *size)
+{
+ struct fsp_fetch_lid_item *resource;
+ uint32_t lid_no = 0;
+ int i;
+
+ resource = malloc(sizeof(struct fsp_fetch_lid_item));
+ assert(resource != NULL);
+
+ resource->id = id;
+ resource->idx = idx;
+
+ resource->offset = 0;
+ resource->buffer = buf;
+ resource->remaining = *size;
+ *size = 0;
+ resource->length = size;
+ resource->result = OPAL_EMPTY;
+
+ for (i = 0; i < ARRAY_SIZE(fsp_lid_map); i++) {
+ if (id != fsp_lid_map[i].id)
+ continue;
+
+ if (fsp_lid_map[i].idx == idx) {
+ lid_no = fsp_lid_map[i].lid_no;
+ break;
+ }
+ }
+ if (lid_no == 0)
+ return OPAL_PARAMETER;
+
+ printf("Trying to load OPAL LID %08x...\n", lid_no);
+ resource->lid_no = lid_no;
+ resource->lid = fsp_adjust_lid_side(lid_no);
+
+ lock(&fsp_fetch_lock);
+ list_add_tail(&fsp_fetch_lid_queue, &resource->link);
+ fsp_start_fetching_next_lid();
+ unlock(&fsp_fetch_lock);
+
+ return OPAL_SUCCESS;
+}
+
+int fsp_resource_loaded(enum resource_id id, uint32_t idx)
+{
+ struct fsp_fetch_lid_item *resource = NULL;
+ struct fsp_fetch_lid_item *r;
+ int rc = OPAL_BUSY;
+
+ lock(&fsp_fetch_lock);
+ list_for_each(&fsp_fetched_lid, r, link) {
+ if (r->id == id && r->idx == idx) {
+ resource = r;
+ break;
+ }
+ }
+
+ if (resource) {
+ rc = resource->result;
+ list_del(&resource->link);
+ free(resource);
+ }
+ unlock(&fsp_fetch_lock);
+
+ return rc;
+}
+
+static int fsp_lid_loaded(uint32_t lid_no)
+{
+ struct fsp_fetch_lid_item *resource = NULL;
+ struct fsp_fetch_lid_item *r;
+ int rc = OPAL_BUSY;
+
+ lock(&fsp_fetch_lock);
+ list_for_each(&fsp_fetched_lid, r, link) {
+ if (r->lid_no == lid_no) {
+ resource = r;
+ break;
+ }
+ }
+
+ if (resource) {
+ rc = resource->result;
+ if (rc == OPAL_SUCCESS) {
+ list_del(&resource->link);
+ free(resource);
+ }
+ }
+ unlock(&fsp_fetch_lock);
+
+ return rc;
+}
+
+int fsp_preload_lid(uint32_t lid_no, char *buf, size_t *size)
+{
+ struct fsp_fetch_lid_item *resource;
+ int r = OPAL_SUCCESS;
+
+ resource = malloc(sizeof(struct fsp_fetch_lid_item));
+ assert(resource != NULL);
+
+ resource->id = -1;
+ resource->idx = -1;
+
+ resource->offset = 0;
+ resource->buffer = buf;
+ resource->remaining = *size;
+ *size = 0;
+ resource->length = size;
+ resource->result = OPAL_EMPTY;
+
+ if (lid_no == 0)
+ return OPAL_PARAMETER;
+
+ printf("Trying to load LID %08x from FSP\n", lid_no);
+ resource->lid_no = lid_no;
+ resource->lid = fsp_adjust_lid_side(lid_no);
+
+ lock(&fsp_fetch_lock);
+ list_add_tail(&fsp_fetch_lid_queue, &resource->link);
+ fsp_start_fetching_next_lid();
+ unlock(&fsp_fetch_lock);
+
+ return r;
+}
+
+int fsp_wait_lid_loaded(uint32_t lid_no)
+{
+ int r;
+ int waited = 0;
+
+ r = fsp_lid_loaded(lid_no);
+
+ while(r == OPAL_BUSY) {
+ opal_run_pollers();
+ time_wait_nopoll(msecs_to_tb(5));
+ waited+=5;
+ cpu_relax();
+ r = fsp_lid_loaded(lid_no);
+ }
+
+ prlog(PR_DEBUG, "FSP: fsp_wait_lid_loaded %x %u ms\n", lid_no, waited);
+
+ return r;
+}
+
+void fsp_used_by_console(void)
+{
+ fsp_lock.in_con_path = true;
+
+ /*
+ * Some other processor might hold it without having
+ * disabled the console locally so let's make sure that
+ * is over by taking/releasing the lock ourselves
+ */
+ lock(&fsp_lock);
+ unlock(&fsp_lock);
+}
diff --git a/roms/skiboot/hw/homer.c b/roms/skiboot/hw/homer.c
new file mode 100644
index 000000000..3ff6ed1ae
--- /dev/null
+++ b/roms/skiboot/hw/homer.c
@@ -0,0 +1,252 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/* Copyright 2013-2019 IBM Corp. */
+
+#include <skiboot.h>
+#include <xscom.h>
+#include <io.h>
+#include <cpu.h>
+#include <chip.h>
+#include <mem_region.h>
+#include <hostservices.h>
+
+#define P8_PBA_BAR0 0x2013f00
+#define P8_PBA_BARMASK0 0x2013f04
+
+#define P9_PBA_BAR0 0x5012B00
+#define P9_PBA_BARMASK0 0x5012B04
+
+#define P10_PBA_BAR0 0x01010CDA
+#define P10_PBA_BARMASK0 0x01010CDE
+
+#define PBA_MASK_ALL_BITS 0x000001FFFFF00000ULL /* Bits 23:43 */
+
+enum P8_BAR {
+ P8_BAR_HOMER = 0,
+ P8_BAR_CENTAUR = 1,
+ P8_BAR_SLW = 2,
+ P8_BAR_OCC_COMMON = 3,
+};
+
+enum P9_BAR {
+ P9_BAR_HOMER = 0,
+ P9_BAR_CENTAUR = 1,
+ P9_BAR_OCC_COMMON = 2,
+ P9_BAR_SBE = 3,
+};
+
+enum P10_BAR {
+ P10_BAR_HOMER = 0,
+ P10_BAR_OCMB_THERMAL = 1,
+ P10_BAR_OCC_COMMON = 2,
+ P10_BAR_SBE = 3,
+};
+
+static u64 pba_bar0, pba_barmask0;
+static u8 bar_homer, bar_slw, bar_occ_common;
+
+static bool read_pba_bar(struct proc_chip *chip, unsigned int bar_no,
+ uint64_t *base, uint64_t *size)
+{
+ uint64_t bar, mask;
+ int rc;
+
+ rc = xscom_read(chip->id, pba_bar0 + bar_no, &bar);
+ if (rc) {
+ prerror("SLW: Error %d reading PBA BAR%d on chip %d\n",
+ rc, bar_no, chip->id);
+ return false;
+ }
+ rc = xscom_read(chip->id, pba_barmask0 + bar_no, &mask);
+ if (rc) {
+ prerror("SLW: Error %d reading PBA BAR MASK%d on chip %d\n",
+ rc, bar_no, chip->id);
+ return false;
+ }
+ prlog(PR_DEBUG, " PBA BAR%d : 0x%016llx\n", bar_no, bar);
+ prlog(PR_DEBUG, " PBA MASK%d: 0x%016llx\n", bar_no, mask);
+
+ if (mask == PBA_MASK_ALL_BITS) {
+ /*
+ * This could happen if all HOMER users are not enabled during
+ * early system bringup. Skip using the PBA BAR.
+ */
+ mask = 0;
+ bar = 0;
+ prerror(" PBA MASK%d uninitialized, skipping BAR\n", bar_no);
+ }
+
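+ /* The low 20 bits of the mask are the 1MB granularity: OR them in and add 1 to get the region size */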
+ *base = bar & 0x0ffffffffffffffful;
+ *size = (mask | 0xfffff) + 1;
+
+ return (*base) != 0;
+}
+
+static void homer_init_chip(struct proc_chip *chip)
+{
+ uint64_t hbase = 0, hsize = 0;
+ uint64_t sbase, ssize, obase, osize;
+
+ /*
+ * PBA BARs assigned by HB:
+ *
+ * P8:
+ * 0 : Entire HOMER
+ * 1 : OCC to Centaur path (we don't care)
+ * 2 : SLW image
+ * 3 : OCC Common area
+ *
+ * We need to reserve the memory covered by BAR 0 and BAR 3, however
+ * on earlier HBs, BAR0 isn't set so we need BAR 2 instead in that
+ * case to cover SLW (OCC not running).
+ *
+ * P9:
+ * 0 : Entire HOMER
+ * 1 : OCC to Centaur path (Cumulus only)
+ * 2 : OCC Common area
+ * 3 : SBE communication
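+ *
+ * P10 (per the P10_BAR enum above):
+ * 0 : Entire HOMER
+ * 1 : OCMB thermal area
+ * 2 : OCC Common area
+ * 3 : SBE communication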
+ *
+ */
+ if (read_pba_bar(chip, bar_homer, &hbase, &hsize)) {
+ prlog(PR_DEBUG, " HOMER Image at 0x%llx size %lldMB\n",
+ hbase, hsize / 0x100000);
+
+ if (!mem_range_is_reserved(hbase, hsize)) {
+ prlog(PR_WARNING,
+ "HOMER image is not reserved! Reserving\n");
+ mem_reserve_fw("ibm,homer-image", hbase, hsize);
+ }
+
+ chip->homer_base = hbase;
+ chip->homer_size = hsize;
+ }
+
+ /*
+ * We always read the SLW BAR since we need to grab info about the
+ * SLW image in the struct proc_chip for use by the slw.c code
+ */
+ if (proc_gen == proc_gen_p8 &&
+ read_pba_bar(chip, bar_slw, &sbase, &ssize)) {
+ prlog(PR_DEBUG, " SLW Image at 0x%llx size %lldMB\n",
+ sbase, ssize / 0x100000);
+
+ /*
+ * Only reserve it if we have no homer image or if it
+ * doesn't fit in it (only check the base).
+ */
+ if ((sbase < hbase || sbase > (hbase + hsize) ||
+ (hbase == 0 && sbase > 0)) &&
+ !mem_range_is_reserved(sbase, ssize)) {
+ prlog(PR_WARNING,
+ "SLW image is not reserved! Reserving\n");
+ mem_reserve_fw("ibm,slw-image", sbase, ssize);
+ }
+
+ chip->slw_base = sbase;
+ chip->slw_bar_size = ssize;
+ chip->slw_image_size = ssize; /* will be adjusted later */
+ }
+
+ if (read_pba_bar(chip, bar_occ_common, &obase, &osize)) {
+ prlog(PR_DEBUG, " OCC Common Area at 0x%llx size %lldMB\n",
+ obase, osize / 0x100000);
+ chip->occ_common_base = obase;
+ chip->occ_common_size = osize;
+ }
+}
+
+
+static void host_services_occ_base_setup(void)
+{
+ struct proc_chip *chip;
+ uint64_t occ_common;
+
+ chip = next_chip(NULL); /* First chip */
+ occ_common = (uint64_t) local_alloc(chip->id, OCC_COMMON_SIZE, OCC_COMMON_SIZE);
+
+ for_each_chip(chip) {
+ chip->occ_common_base = occ_common;
+ chip->occ_common_size = OCC_COMMON_SIZE;
+
+ chip->homer_base = (uint64_t) local_alloc(chip->id, HOMER_IMAGE_SIZE,
+ HOMER_IMAGE_SIZE);
+ chip->homer_size = HOMER_IMAGE_SIZE;
+ memset((void *)chip->homer_base, 0, chip->homer_size);
+
+ prlog(PR_DEBUG, "HBRT: Chip %d HOMER base %016llx : %08llx\n",
+ chip->id, chip->homer_base, chip->homer_size);
+ prlog(PR_DEBUG, "HBRT: OCC common base %016llx : %08llx\n",
+ chip->occ_common_base, chip->occ_common_size);
+ }
+}
+
+void homer_init(void)
+{
+ struct proc_chip *chip;
+
+ if (chip_quirk(QUIRK_NO_PBA))
+ return;
+
+ switch (proc_gen) {
+ case proc_gen_p8:
+ pba_bar0 = P8_PBA_BAR0;
+ pba_barmask0 = P8_PBA_BARMASK0;
+ bar_homer = P8_BAR_HOMER;
+ bar_slw = P8_BAR_SLW;
+ bar_occ_common = P8_BAR_OCC_COMMON;
+ break;
+ case proc_gen_p9:
+ pba_bar0 = P9_PBA_BAR0;
+ pba_barmask0 = P9_PBA_BARMASK0;
+ bar_homer = P9_BAR_HOMER;
+ bar_occ_common = P9_BAR_OCC_COMMON;
+ break;
+ case proc_gen_p10:
+ pba_bar0 = P10_PBA_BAR0;
+ pba_barmask0 = P10_PBA_BARMASK0;
+ bar_homer = P10_BAR_HOMER;
+ bar_occ_common = P10_BAR_OCC_COMMON;
+ break;
+ default:
+ return;
+ };
+
+ /*
+ * XXX This is temporary, on P8 we look for any configured
+ * SLW/OCC BAR and reserve the memory. Eventually, this will be
+ * done via HostBoot using the device-tree "reserved-ranges"
+ * or we'll load the SLW & OCC images ourselves using Host Services.
+ */
+ for_each_chip(chip) {
+ prlog(PR_DEBUG, "HOMER: Init chip %d\n", chip->id);
+ homer_init_chip(chip);
+ }
+
+ /*
+ * Check if the PBA BARs are already loaded with HOMER and
+ * skip host services.
+ */
+
+ chip = next_chip(NULL);
+ /* Both HOMER images and OCC areas are setup */
+ if (chip->homer_base && chip->occ_common_base) {
+ /* Reserve OCC common area from BAR */
+ if (!mem_range_is_reserved(chip->occ_common_base,
+ chip->occ_common_size)) {
+ prlog(PR_WARNING,
+ "OCC common area is not reserved! Reserving\n");
+ mem_reserve_fw("ibm,occ-common-area",
+ chip->occ_common_base,
+ chip->occ_common_size);
+ }
+ } else if (chip->homer_base) {
+ /*
+ * HOMER is set up but the OCC is not! Do not allocate HOMER
+ * regions. This case is possible during early system
+ * bringup, where the OCC images are not yet operational.
+ */
+ } else {
+ /* Allocate memory for HOMER and OCC common area */
+ host_services_occ_base_setup();
+ }
+}
+
diff --git a/roms/skiboot/hw/imc.c b/roms/skiboot/hw/imc.c
new file mode 100644
index 000000000..cbd68edc4
--- /dev/null
+++ b/roms/skiboot/hw/imc.c
@@ -0,0 +1,1075 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * In-Memory Counters (IMC)
+ * Sometimes called IMA, but that's also a different thing.
+ *
+ * Copyright 2016-2019 IBM Corp.
+ */
+
+#define pr_fmt(fmt) "IMC: " fmt
+#include <skiboot.h>
+#include <xscom.h>
+#include <imc.h>
+#include <chip.h>
+#include <libxz/xz.h>
+#include <device.h>
+#include <p9_stop_api.H>
+
+/*
+ * IMC trace scom values
+ */
+#define IMC_TRACE_CPMC1 0 /* select cpmc1 */
+#define IMC_TRACE_CPMC2 1 /* select cpmc2 */
+#define IMC_TRACE_CPMCLOAD_VAL 0xfa /*
+ * Value to be loaded into cpmc2
+ * at sampling start
+ */
+
+/* Event: CPM_32MHZ_CYC */
+#define IMC_TRACE_CPMC2SEL_VAL 2
+#define IMC_TRACE_CPMC1SEL_VAL 4
+
+#define IMC_TRACE_BUFF_SIZE 0 /*
+ * b'000' - 4K entries * 64 bytes per
+ * entry = 256K buffer size
+ */
+static uint64_t TRACE_IMC_ADDR;
+static uint64_t CORE_IMC_EVENT_MASK_ADDR;
+static uint64_t trace_scom_val;
+/*
+ * Initialise these with the pdbar and htm scom port address array
+ * at run time, based on the processor version.
+ */
+static unsigned int *pdbar_scom_index;
+static unsigned int *htm_scom_index;
+
+/*
+ * Nest IMC PMU names along with their bit values as represented in the
+ * imc_chip_avl_vector (see struct imc_chip_cb in include/imc.h).
+ * nest_pmus[] is an array containing all the possible nest IMC PMU node names.
+ */
+static char const *nest_pmus[] = {
+ "powerbus0",
+ "mcs0",
+ "mcs1",
+ "mcs2",
+ "mcs3",
+ "mcs4",
+ "mcs5",
+ "mcs6",
+ "mcs7",
+ "mba0",
+ "mba1",
+ "mba2",
+ "mba3",
+ "mba4",
+ "mba5",
+ "mba6",
+ "mba7",
+ "cen0",
+ "cen1",
+ "cen2",
+ "cen3",
+ "cen4",
+ "cen5",
+ "cen6",
+ "cen7",
+ "xlink0",
+ "xlink1",
+ "xlink2",
+ "mcd0",
+ "mcd1",
+ "phb0",
+ "phb1",
+ "phb2",
+ "phb3",
+ "phb4",
+ "phb5",
+ "nx",
+ "capp0",
+ "capp1",
+ "vas",
+ "int",
+ "alink0",
+ "alink1",
+ "alink2",
+ "alink3",
+ "nvlink0",
+ "nvlink1",
+ "nvlink2",
+ "nvlink3",
+ "nvlink4",
+ "nvlink5",
+ /* reserved bits : 51 - 63 */
+};
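+
+/*
+ * A minimal sketch of how this table is meant to be read: unit
+ * nest_pmus[i] is advertised only when bit i of the availability
+ * vector is set (big-endian PPC bit numbering), i.e.
+ *
+ * bool available = (avl_vec & PPC_BIT(i)) != 0;
+ *
+ * which mirrors the PPC_BITMASK(i, i) test in
+ * disable_unavailable_units() below.
+ */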
+
+/*
+ * Due to a Nest HW/OCC restriction, the microcode does not support
+ * individual unit events for the nest units mcs0, mcs1 ... mcs7 in
+ * accumulation mode. Events to monitor each mcs unit individually will
+ * be supported only in debug mode (to be added to the microcode in the
+ * future), and will be advertised only once OPAL provides an interface for it.
+ */
+static char const *debug_mode_units[] = {
+ "mcs0",
+ "mcs1",
+ "mcs2",
+ "mcs3",
+ "mcs4",
+ "mcs5",
+ "mcs6",
+ "mcs7",
+};
+
+/*
+ * Combined unit node events are counted when any of the individual
+ * unit is enabled in the availability vector. That is,
+ * ex, mcs01 unit node should be enabled only when mcs0 or mcs1 enabled.
+ * mcs23 unit node should be enabled only when mcs2 or mcs3 is enabled
+ */
+static struct combined_units_node cu_node[] = {
+ { .name = "mcs01", .unit1 = PPC_BIT(1), .unit2 = PPC_BIT(2) },
+ { .name = "mcs23", .unit1 = PPC_BIT(3), .unit2 = PPC_BIT(4) },
+ { .name = "mcs45", .unit1 = PPC_BIT(5), .unit2 = PPC_BIT(6) },
+ { .name = "mcs67", .unit1 = PPC_BIT(7), .unit2 = PPC_BIT(8) },
+};
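+
+/*
+ * Sketch of the combined-unit rule applied later in
+ * disable_unavailable_units(): a node such as "mcs01" is kept when
+ * either of its units is available and removed only when neither is:
+ *
+ * bool keep = (cu_node[i].unit1 & avl_vec) || (cu_node[i].unit2 & avl_vec);
+ */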
+
+static char *compress_buf;
+static size_t compress_buf_size;
+const char **prop_to_fix(struct dt_node *node);
+static const char *props_to_fix[] = {"events", NULL};
+
+static bool is_nest_mem_initialized(struct imc_chip_cb *ptr)
+{
+ /*
+ * A non-zero value in the "Status" field indicates the memory is initialized.
+ */
+ if (!ptr->imc_chip_run_status)
+ return false;
+
+ return true;
+}
+
+/*
+ * A Quad contains 4 cores in Power 9, and there are 4 addresses for
+ * the Core Hardware Trace Macro (CHTM) attached to each core.
+ * So, for core index 0 to core index 3, we have a sequential range of
+ * SCOM port addresses in the arrays below, each for Hardware Trace Macro (HTM)
+ * mode and PDBAR.
+ */
+static unsigned int pdbar_scom_index_p9[] = {
+ 0x1001220B,
+ 0x1001230B,
+ 0x1001260B,
+ 0x1001270B
+};
+static unsigned int htm_scom_index_p9[] = {
+ 0x10012200,
+ 0x10012300,
+ 0x10012600,
+ 0x10012700
+};
+
+static unsigned int pdbar_scom_index_p10[] = {
+ 0x2001868B,
+ 0x2001468B,
+ 0x2001268B,
+ 0x2001168B
+};
+
+static unsigned int htm_scom_index_p10[] = {
+ 0x20018680,
+ 0x20014680,
+ 0x20012680,
+ 0x20011680
+};
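+
+/*
+ * Sketch of how these tables are indexed by the callers below: the
+ * SCOM port for a core is chosen by its position within the quad,
+ *
+ * port_id = phys_core_id % 4;
+ * pdbar_addr = get_imc_scom_addr_for_quad(phys_core_id, pdbar_scom_index[port_id]);
+ *
+ * and likewise for htm_scom_index[].
+ */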
+
+static struct imc_chip_cb *get_imc_cb(uint32_t chip_id)
+{
+ struct proc_chip *chip = get_chip(chip_id);
+ struct imc_chip_cb *cb;
+
+ if (!chip->homer_base)
+ return NULL; /* The No Homers Club */
+
+ cb = (struct imc_chip_cb *)(chip->homer_base + P9_CB_STRUCT_OFFSET);
+ if (!is_nest_mem_initialized(cb))
+ return NULL;
+
+ return cb;
+}
+
+static int pause_microcode_at_boot(void)
+{
+ struct proc_chip *chip;
+ struct imc_chip_cb *cb;
+
+ for_each_chip(chip) {
+ cb = get_imc_cb(chip->id);
+ if (cb)
+ cb->imc_chip_command = cpu_to_be64(NEST_IMC_DISABLE);
+ else
+ return -1; /* ucode is not init-ed */
+ }
+
+ return 0;
+}
+
+/*
+ * Returns the list of property names for the fixup
+ */
+const char **prop_to_fix(struct dt_node *node)
+{
+ if (dt_node_is_compatible(node, "ibm,imc-counters"))
+ return props_to_fix;
+
+ return NULL;
+}
+
+/* Helper to get the IMC device type for a device node */
+static int get_imc_device_type(struct dt_node *node)
+{
+ const struct dt_property *type;
+ u32 val = 0;
+
+ if (!node)
+ return -1;
+
+ type = dt_find_property(node, "type");
+ if (!type)
+ return -1;
+
+ val = dt_prop_get_u32(node, "type");
+ switch (val) {
+ case IMC_COUNTER_CHIP:
+ return IMC_COUNTER_CHIP;
+ case IMC_COUNTER_CORE:
+ return IMC_COUNTER_CORE;
+ case IMC_COUNTER_THREAD:
+ return IMC_COUNTER_THREAD;
+ case IMC_COUNTER_TRACE:
+ return IMC_COUNTER_TRACE;
+ default:
+ break;
+ }
+
+ /* Unknown/Unsupported IMC device type */
+ return -1;
+}
+
+static bool is_nest_node(struct dt_node *node)
+{
+ if (get_imc_device_type(node) == IMC_COUNTER_CHIP)
+ return true;
+
+ return false;
+}
+
+static bool is_imc_device_type_supported(struct dt_node *node)
+{
+ u32 val = get_imc_device_type(node);
+ struct proc_chip *chip = get_chip(this_cpu()->chip_id);
+ uint64_t pvr;
+
+ if ((val == IMC_COUNTER_CHIP) || (val == IMC_COUNTER_CORE) ||
+ (val == IMC_COUNTER_THREAD))
+ return true;
+
+ if (val == IMC_COUNTER_TRACE) {
+ pvr = mfspr(SPR_PVR);
+
+ switch (chip->type) {
+ case PROC_CHIP_P9_NIMBUS:
+ /*
+ * Trace mode is supported in Nimbus DD2.2
+ * and later versions.
+ */
+ if ((PVR_VERS_MAJ(pvr) == 2) &&
+ (PVR_VERS_MIN(pvr) >= 2))
+ return true;
+ break;
+ case PROC_CHIP_P10:
+ return true;
+ default:
+ return false;
+ }
+
+ }
+ return false;
+}
+
+/*
+ * Helper to check for the imc device type in the incoming device tree.
+ * Remove unsupported device node.
+ */
+static void check_imc_device_type(struct dt_node *dev)
+{
+ struct dt_node *node;
+
+ dt_for_each_compatible(dev, node, "ibm,imc-counters") {
+ if (!is_imc_device_type_supported(node)) {
+ /*
+ * ah nice, found a device type which I didn't know.
+ * Remove it and also mark node as NULL, since dt_next
+ * will try to fetch info for "prev" which is removed
+ * by dt_free.
+ */
+ dt_free(node);
+ node = NULL;
+ }
+ }
+
+ return;
+}
+
+static void imc_dt_exports_prop_add(struct dt_node *dev)
+{
+ struct dt_node *node;
+ struct proc_chip *chip;
+ const struct dt_property *type;
+ uint32_t offset = 0, size = 0;
+ uint64_t baddr;
+ char namebuf[32];
+
+
+ dt_for_each_compatible(dev, node, "ibm,imc-counters") {
+ type = dt_find_property(node, "type");
+ if (type && is_nest_node(node)) {
+ offset = dt_prop_get_u32(node, "offset");
+ size = dt_prop_get_u32(node, "size");
+ }
+ }
+
+ /*
+ * Enable only if we have valid values.
+ */
+ if (!size && !offset)
+ return;
+
+ node = dt_find_by_name(opal_node, "exports");
+ if (!node)
+ return;
+
+ for_each_chip(chip) {
+ snprintf(namebuf, sizeof(namebuf), "imc_nest_chip_%x", chip->id);
+ baddr = chip->homer_base;
+ baddr += offset;
+ dt_add_property_u64s(node, namebuf, baddr, size);
+ }
+}
+
+/*
+ * Remove the PMU device nodes from the incoming new subtree, if they are not
+ * available in the hardware. The availability is described by the
+ * control block's imc_chip_avl_vector.
+ * Each bit represents a device unit. If the device is available, then
+ * the bit is set else its unset.
+ */
+static void disable_unavailable_units(struct dt_node *dev)
+{
+ uint64_t avl_vec;
+ struct imc_chip_cb *cb;
+ struct dt_node *target;
+ int i;
+ bool disable_all_nests = false;
+ struct proc_chip *chip;
+
+ /*
+ * Check the state of the ucode on all the chips.
+ * Disable the nest units if the ucode is not initialized
+ * on any of the chips.
+ */
+ for_each_chip(chip) {
+ cb = get_imc_cb(chip->id);
+ if (!cb) {
+ /*
+ * At least currently, if one chip isn't functioning,
+ * none of the IMC Nest units will be functional.
+ * So while you may *think* this should be per chip,
+ * it isn't.
+ */
+ disable_all_nests = true;
+ break;
+ }
+ }
+
+ /* Add a property to "exports" node in opal_node */
+ imc_dt_exports_prop_add(dev);
+
+ /* Fetch the IMC control block structure */
+ cb = get_imc_cb(this_cpu()->chip_id);
+ if (cb && !disable_all_nests)
+ avl_vec = be64_to_cpu(cb->imc_chip_avl_vector);
+ else {
+ avl_vec = 0; /* Remove only nest imc device nodes */
+
+ /* In case of mambo, just fake it */
+ if (proc_chip_quirks & QUIRK_MAMBO_CALLOUTS)
+ avl_vec = (0xffULL) << 56;
+ }
+
+ for (i = 0; i < ARRAY_SIZE(nest_pmus); i++) {
+ if (!(PPC_BITMASK(i, i) & avl_vec)) {
+ /* Check if the device node exists */
+ target = dt_find_by_name(dev, nest_pmus[i]);
+ if (!target)
+ continue;
+ /* Remove the device node */
+ dt_free(target);
+ }
+ }
+
+ /*
+ * Loop to detect debug mode units and remove them,
+ * since the microcode does not support debug mode functionality yet.
+ */
+ for (i = 0; i < ARRAY_SIZE(debug_mode_units); i++) {
+ target = dt_find_by_name(dev, debug_mode_units[i]);
+ if (!target)
+ continue;
+ /* Remove the device node */
+ dt_free(target);
+ }
+
+ /*
+ * Based on the unit availability vector from the control block,
+ * check and enable combined unit nodes in the device tree.
+ */
+ for (i = 0; i < MAX_NEST_COMBINED_UNITS; i++) {
+ if (!(cu_node[i].unit1 & avl_vec) &&
+ !(cu_node[i].unit2 & avl_vec)) {
+ target = dt_find_by_name(dev, cu_node[i].name);
+ if (!target)
+ continue;
+
+ /* Remove the device node */
+ dt_free(target);
+ }
+ }
+
+ return;
+}
+
+static void disable_imc_type_from_dt(struct dt_node *dev, int imc_type)
+{
+ struct dt_node *node;
+
+ dt_for_each_compatible(dev, node, "ibm,imc-counters") {
+ if (get_imc_device_type(node) == imc_type) {
+ dt_free(node);
+ node = NULL;
+ }
+ }
+
+ return;
+}
+
+/*
+ * Function to queue the loading of imc catalog data
+ * from the IMC pnor partition.
+ */
+void imc_catalog_preload(void)
+{
+ uint32_t pvr = (mfspr(SPR_PVR) & ~(0xf0ff));
+ int ret = OPAL_SUCCESS;
+ compress_buf_size = MAX_COMPRESSED_IMC_DTB_SIZE;
+
+ if (proc_chip_quirks & QUIRK_MAMBO_CALLOUTS)
+ return;
+
+ /* Enable only for power 9/10 */
+ if (proc_gen < proc_gen_p9)
+ return;
+
+ compress_buf = malloc(MAX_COMPRESSED_IMC_DTB_SIZE);
+ if (!compress_buf) {
+ prerror("Memory allocation for catalog failed\n");
+ return;
+ }
+
+ ret = start_preload_resource(RESOURCE_ID_IMA_CATALOG,
+ pvr, compress_buf, &compress_buf_size);
+ if (ret != OPAL_SUCCESS) {
+ prerror("Failed to load IMA_CATALOG: %d\n", ret);
+ free(compress_buf);
+ compress_buf = NULL;
+ }
+
+ return;
+}
+
+static void imc_dt_update_nest_node(struct dt_node *dev)
+{
+ struct proc_chip *chip;
+ __be64 *base_addr = NULL;
+ __be32 *chipids = NULL;
+ int i = 0, nr_chip = nr_chips();
+ struct dt_node *node;
+ const struct dt_property *type;
+
+ /* Add the base_addr and chip-id properties for the nest node */
+ base_addr = malloc(sizeof(u64) * nr_chip);
+ chipids = malloc(sizeof(u32) * nr_chip);
+ for_each_chip(chip) {
+ base_addr[i] = cpu_to_be64(chip->homer_base);
+ chipids[i] = cpu_to_be32(chip->id);
+ i++;
+ }
+
+ dt_for_each_compatible(dev, node, "ibm,imc-counters") {
+ type = dt_find_property(node, "type");
+ if (type && is_nest_node(node)) {
+ dt_add_property(node, "base-addr", base_addr, (i * sizeof(u64)));
+ dt_add_property(node, "chip-id", chipids, (i * sizeof(u32)));
+ }
+ }
+}
+
+static struct xz_decompress *imc_xz;
+
+void imc_decompress_catalog(void)
+{
+ void *decompress_buf = NULL;
+ uint32_t pvr = (mfspr(SPR_PVR) & ~(0xf0ff));
+ int ret;
+
+ /* Check we succeeded in starting the preload */
+ if (compress_buf == NULL)
+ return;
+
+ ret = wait_for_resource_loaded(RESOURCE_ID_IMA_CATALOG, pvr);
+ if (ret != OPAL_SUCCESS) {
+ prerror("IMC Catalog load failed\n");
+ return;
+ }
+
+ /*
+ * Memory for decompression.
+ */
+ decompress_buf = malloc(MAX_DECOMPRESSED_IMC_DTB_SIZE);
+ if (!decompress_buf) {
+ prerror("No memory for decompress_buf \n");
+ return;
+ }
+
+ /*
+ * Decompress the compressed buffer
+ */
+ imc_xz = malloc(sizeof(struct xz_decompress));
+ if (!imc_xz) {
+ prerror("No memory to decompress IMC catalog\n");
+ free(decompress_buf);
+ return;
+ }
+
+ imc_xz->dst = decompress_buf;
+ imc_xz->src = compress_buf;
+ imc_xz->dst_size = MAX_DECOMPRESSED_IMC_DTB_SIZE;
+ imc_xz->src_size = compress_buf_size;
+ xz_start_decompress(imc_xz);
+}
+
+static int setup_imc_scoms(void)
+{
+ switch (proc_gen) {
+ case proc_gen_p9:
+ CORE_IMC_EVENT_MASK_ADDR = CORE_IMC_EVENT_MASK_ADDR_P9;
+ TRACE_IMC_ADDR = TRACE_IMC_ADDR_P9;
+ pdbar_scom_index = pdbar_scom_index_p9;
+ htm_scom_index = htm_scom_index_p9;
+ trace_scom_val = TRACE_IMC_SCOM(IMC_TRACE_CPMC2,
+ IMC_TRACE_CPMCLOAD_VAL,
+ IMC_TRACE_CPMC1SEL_VAL,
+ IMC_TRACE_CPMC2SEL_VAL,
+ IMC_TRACE_BUFF_SIZE);
+ return 0;
+ case proc_gen_p10:
+ CORE_IMC_EVENT_MASK_ADDR = CORE_IMC_EVENT_MASK_ADDR_P10;
+ TRACE_IMC_ADDR = TRACE_IMC_ADDR_P10;
+ pdbar_scom_index = pdbar_scom_index_p10;
+ htm_scom_index = htm_scom_index_p10;
+ trace_scom_val = TRACE_IMC_SCOM(IMC_TRACE_CPMC1,
+ IMC_TRACE_CPMCLOAD_VAL,
+ IMC_TRACE_CPMC1SEL_VAL,
+ IMC_TRACE_CPMC2SEL_VAL,
+ IMC_TRACE_BUFF_SIZE);
+ return 0;
+ default:
+ prerror("%s: Unknown cpu type\n", __func__);
+ break;
+ }
+ return -1;
+}
+
+/*
+ * Load the IMC pnor partition and find the appropriate sub-partition
+ * based on the platform's PVR.
+ * Decompress the sub-partition and link the imc device tree to the
+ * existing device tree.
+ */
+void imc_init(void)
+{
+ struct dt_node *dev;
+ int err_flag = -1;
+
+ if (proc_chip_quirks & QUIRK_MAMBO_CALLOUTS) {
+ dev = dt_find_compatible_node(dt_root, NULL,
+ "ibm,opal-in-memory-counters");
+ if (!dev)
+ return;
+
+ goto imc_mambo;
+ }
+
+ /* Enable only for power 9/10 */
+ if (proc_gen < proc_gen_p9)
+ return;
+
+ if (!imc_xz)
+ return;
+
+ wait_xz_decompress(imc_xz);
+ if (imc_xz->status != OPAL_SUCCESS) {
+ prerror("IMC: xz_decompress failed\n");
+ goto err;
+ }
+
+ /*
+ * Flow of the data from PNOR to main device tree:
+ *
+ * PNOR -> compressed local buffer (compress_buf)
+ * compressed local buffer -> decompressed local buffer (decompress_buf)
+ * decompressed local buffer -> main device tree
+ * free compressed local buffer
+ */
+
+
+ /* Create a device tree entry for imc counters */
+ dev = dt_new_root("imc-counters");
+ if (!dev) {
+ prerror("IMC: Failed to add an imc-counters root node\n");
+ goto err;
+ }
+
+ /*
+ * Attach the new decompress_buf to the imc-counters node.
+ * dt_expand_node() does sanity checks for fdt_header, piggyback
+ */
+ if (dt_expand_node(dev, imc_xz->dst, 0) < 0) {
+ dt_free(dev);
+ prerror("IMC: dt_expand_node failed\n");
+ goto err;
+ }
+
+imc_mambo:
+ if (setup_imc_scoms()) {
+ prerror("IMC: Failed to setup the scoms\n");
+ goto err;
+ }
+
+ /* Check and remove unsupported imc device types */
+ check_imc_device_type(dev);
+
+ /*
+ * Check and remove unsupported nest unit nodes by the microcode,
+ * from the incoming device tree.
+ */
+ disable_unavailable_units(dev);
+
+ /* Fix the phandle in the incoming device tree */
+ dt_adjust_subtree_phandle(dev, prop_to_fix);
+
+ /* Update the base_addr and chip-id for nest nodes */
+ imc_dt_update_nest_node(dev);
+
+ if (proc_chip_quirks & QUIRK_MAMBO_CALLOUTS)
+ return;
+
+ /*
+ * IMC nest counters have both in-band (ucode) and out-of-band access.
+ * Since not all nest counter configurations are supported by the ucode,
+ * out-of-band tools are used to characterize the other configurations.
+ *
+ * If the ucode is not paused and the OS does not have IMC driver
+ * support, then out-of-band tools will race with the ucode and end up
+ * reading undesirable values. Hence pause the ucode if it is already
+ * running.
+ */
+ if (pause_microcode_at_boot()) {
+ prerror("IMC: Pausing ucode failed, disabling nest imc\n");
+ disable_imc_type_from_dt(dev, IMC_COUNTER_CHIP);
+ }
+
+ /*
+ * If the dt_attach_root() fails, "imc-counters" node will not be
+ * seen in the device-tree and hence OS should not make any
+ * OPAL_IMC_* calls.
+ */
+ if (!dt_attach_root(dt_root, dev)) {
+ dt_free(dev);
+ prerror("IMC: Failed to attach imc-counter node to dt root\n");
+ goto err;
+ }
+
+ err_flag = OPAL_SUCCESS;
+
+err:
+ if (err_flag != OPAL_SUCCESS)
+ prerror("IMC Devices not added\n");
+
+ free(compress_buf);
+ free(imc_xz->dst);
+ free(imc_xz);
+}
+
+static int stop_api_init(struct proc_chip *chip, int phys_core_id,
+ uint32_t scoms, uint64_t data,
+ const ScomOperation_t operation,
+ const ScomSection_t section,
+ const char *type)
+{
+ int ret;
+
+ prlog(PR_DEBUG, "Configuring stopapi for IMC\n");
+ ret = p9_stop_save_scom((void *)chip->homer_base, scoms,
+ data, operation, section);
+ if (ret) {
+ prerror("IMC %s stopapi ret = %d, scoms = %x (core id = %x)\n",\
+ type, ret, scoms, phys_core_id);
+ if (ret != STOP_SAVE_SCOM_ENTRY_UPDATE_FAILED)
+ wakeup_engine_state = WAKEUP_ENGINE_FAILED;
+ else
+ prerror("SCOM entries are full\n");
+ return OPAL_HARDWARE;
+ }
+
+ return ret;
+}
+
+/* Function to return the scom address for the specified core */
+static uint32_t get_imc_scom_addr_for_core(int core, uint64_t addr)
+{
+ uint32_t scom_addr;
+
+ switch (proc_gen) {
+ case proc_gen_p9:
+ scom_addr = XSCOM_ADDR_P9_EC(core, addr);
+ return scom_addr;
+ case proc_gen_p10:
+ scom_addr = XSCOM_ADDR_P10_EC(core, addr);
+ return scom_addr;
+ default:
+ return 0;
+ }
+}
+
+/* Function to return the scom address for the specified core in the quad */
+static uint32_t get_imc_scom_addr_for_quad(int core, uint64_t addr)
+{
+ uint32_t scom_addr;
+
+ switch (proc_gen) {
+ case proc_gen_p9:
+ scom_addr = XSCOM_ADDR_P9_EQ(core, addr);
+ return scom_addr;
+ case proc_gen_p10:
+ scom_addr = XSCOM_ADDR_P10_EQ(core, addr);
+ return scom_addr;
+ default:
+ return 0;
+ }
+}
+
+static int64_t core_imc_counters_init(uint64_t addr, int port_id,
+ int phys_core_id, struct cpu_thread *c)
+{
+ uint32_t pdbar_addr, event_mask_addr, htm_addr;
+ int ret;
+
+ /* Get the scom address for this core, based on the platform */
+ pdbar_addr = get_imc_scom_addr_for_quad(phys_core_id,
+ pdbar_scom_index[port_id]);
+ event_mask_addr = get_imc_scom_addr_for_core(phys_core_id,
+ CORE_IMC_EVENT_MASK_ADDR);
+
+ /*
+ * The Core IMC hardware mandates initialization of three scoms
+ * to enable or disable the Core IMC engine.
+ *
+ * PDBAR: Scom contains the real address to store per-core
+ * counter data in memory along with other bits.
+ *
+ * EventMask: Scom contains bits to denote the event to multiplex
+ * at different MSR[HV PR] values, along with bits for
+ * sampling duration.
+ *
+ * HTM Scom: scom to enable counter data movement to memory.
+ */
+
+
+ if (xscom_write(c->chip_id, pdbar_addr,
+ (u64)(CORE_IMC_PDBAR_MASK & addr))) {
+ prerror("error in xscom_write for pdbar\n");
+ return OPAL_HARDWARE;
+ }
+
+ if (has_deep_states) {
+ if (wakeup_engine_state == WAKEUP_ENGINE_PRESENT) {
+ struct proc_chip *chip = get_chip(c->chip_id);
+
+ ret = stop_api_init(chip, phys_core_id, pdbar_addr,
+ (u64)(CORE_IMC_PDBAR_MASK & addr),
+ P9_STOP_SCOM_REPLACE,
+ P9_STOP_SECTION_EQ_SCOM,
+ "pdbar");
+ if (ret)
+ return ret;
+ ret = stop_api_init(chip, phys_core_id,
+ event_mask_addr,
+ (u64)CORE_IMC_EVENT_MASK,
+ P9_STOP_SCOM_REPLACE,
+ P9_STOP_SECTION_CORE_SCOM,
+ "event_mask");
+ if (ret)
+ return ret;
+ } else {
+ prerror("IMC: Wakeup engine not present!");
+ return OPAL_HARDWARE;
+ }
+ }
+
+ if (xscom_write(c->chip_id, event_mask_addr,
+ (u64)CORE_IMC_EVENT_MASK)) {
+ prerror("error in xscom_write for event mask\n");
+ return OPAL_HARDWARE;
+ }
+
+ /* Get the scom address for htm_mode scom based on the platform */
+ htm_addr = get_imc_scom_addr_for_quad(phys_core_id,
+ htm_scom_index[port_id]);
+ if (xscom_write(c->chip_id, htm_addr,
+ (u64)CORE_IMC_HTM_MODE_DISABLE)) {
+ prerror("error in xscom_write for htm mode\n");
+ return OPAL_HARDWARE;
+ }
+ return OPAL_SUCCESS;
+}
+
+/*
+ * opal_imc_counters_init : This call initializes the IMC engine.
+ *
+ * For Nest IMC, this is a no-op and returns OPAL_SUCCESS at this point.
+ * For Core IMC, this initializes the core IMC engine by setting up the
+ * "PDBAR", "HTM_MODE" and "EVENT_MASK" scoms on a given cpu.
+ */
+static int64_t opal_imc_counters_init(uint32_t type, uint64_t addr, uint64_t cpu_pir)
+{
+ struct cpu_thread *c = find_cpu_by_pir(cpu_pir);
+ int port_id, phys_core_id;
+ int ret;
+ uint32_t htm_addr, trace_addr;
+
+ switch (type) {
+ case OPAL_IMC_COUNTERS_NEST:
+ return OPAL_SUCCESS;
+ case OPAL_IMC_COUNTERS_CORE:
+ if (!c)
+ return OPAL_PARAMETER;
+
+ /*
+ * Core IMC hardware mandates setting of htm_mode and
+ * pdbar in specific scom ports. The port indices are in
+ * pdbar_scom_index[] and htm_scom_index[].
+ */
+ phys_core_id = pir_to_core_id(c->pir);
+ port_id = phys_core_id % 4;
+
+ if (proc_chip_quirks & QUIRK_MAMBO_CALLOUTS)
+ return OPAL_SUCCESS;
+
+ ret = core_imc_counters_init(addr, port_id, phys_core_id, c);
+ if (ret < 0)
+ return ret;
+ /*
+ * If fused cores are supported, do the scoms for the
+ * secondary core as well.
+ */
+ if (this_cpu()->is_fused_core) {
+ struct cpu_thread *c1 = find_cpu_by_pir(cpu_pir ^ 1);
+
+ phys_core_id = pir_to_core_id(c1->pir);
+ port_id = phys_core_id % 4;
+
+ ret = core_imc_counters_init(addr, port_id, phys_core_id, c1);
+ if (ret < 0)
+ return ret;
+ }
+ return ret;
+ case OPAL_IMC_COUNTERS_TRACE:
+ if (!c)
+ return OPAL_PARAMETER;
+
+ phys_core_id = pir_to_core_id(c->pir);
+ port_id = phys_core_id % 4;
+
+ if (proc_chip_quirks & QUIRK_MAMBO_CALLOUTS)
+ return OPAL_SUCCESS;
+
+ trace_addr = get_imc_scom_addr_for_core(phys_core_id,
+ TRACE_IMC_ADDR);
+ htm_addr = get_imc_scom_addr_for_quad(phys_core_id,
+ htm_scom_index[port_id]);
+
+ if (has_deep_states) {
+ if (wakeup_engine_state == WAKEUP_ENGINE_PRESENT) {
+ struct proc_chip *chip = get_chip(c->chip_id);
+
+ ret = stop_api_init(chip, phys_core_id,
+ trace_addr,
+ trace_scom_val,
+ P9_STOP_SCOM_REPLACE,
+ P9_STOP_SECTION_CORE_SCOM,
+ "trace_imc");
+ if (ret)
+ return ret;
+ } else {
+ prerror("IMC-trace:Wakeup engine not present!");
+ return OPAL_HARDWARE;
+ }
+ }
+ if (xscom_write(c->chip_id, htm_addr, (u64)CORE_IMC_HTM_MODE_DISABLE)) {
+ prerror("IMC-trace: error in xscom_write for htm mode\n");
+ return OPAL_HARDWARE;
+ }
+ if (xscom_write(c->chip_id, trace_addr, trace_scom_val)) {
+ prerror("IMC-trace: error in xscom_write for trace mode\n");
+ return OPAL_HARDWARE;
+ }
+ return OPAL_SUCCESS;
+
+ }
+
+ return OPAL_SUCCESS;
+}
+opal_call(OPAL_IMC_COUNTERS_INIT, opal_imc_counters_init, 3);
+
+/* opal_imc_counters_control_start: This call starts the nest/core imc engine. */
+static int64_t opal_imc_counters_start(uint32_t type, uint64_t cpu_pir)
+{
+ u64 op;
+ struct cpu_thread *c = find_cpu_by_pir(cpu_pir);
+ struct imc_chip_cb *cb;
+ int port_id, phys_core_id;
+ uint32_t htm_addr;
+
+ if (!c)
+ return OPAL_PARAMETER;
+
+ switch (type) {
+ case OPAL_IMC_COUNTERS_NEST:
+ /* Fetch the IMC control block structure */
+ cb = get_imc_cb(c->chip_id);
+ if (!cb)
+ return OPAL_HARDWARE;
+
+ /* Set the run command */
+ op = NEST_IMC_ENABLE;
+
+ if (proc_chip_quirks & QUIRK_MAMBO_CALLOUTS)
+ return OPAL_SUCCESS;
+
+ /* Write the command to the control block now */
+ cb->imc_chip_command = cpu_to_be64(op);
+
+ return OPAL_SUCCESS;
+ case OPAL_IMC_COUNTERS_CORE:
+ case OPAL_IMC_COUNTERS_TRACE:
+ /*
+ * Core IMC hardware mandates setting of htm_mode in specific
+ * scom ports (port_id are in htm_scom_index[])
+ */
+ phys_core_id = pir_to_core_id(c->pir);
+ port_id = phys_core_id % 4;
+
+ if (proc_chip_quirks & QUIRK_MAMBO_CALLOUTS)
+ return OPAL_SUCCESS;
+
+ htm_addr = get_imc_scom_addr_for_quad(phys_core_id,
+ htm_scom_index[port_id]);
+ /*
+ * Enables the core imc engine by appropriately setting
+ * bits 4-9 of the HTM_MODE scom port. No initialization
+ * is done in this call. This just enables the counters
+ * to count using the previous initialization.
+ */
+ if (xscom_write(c->chip_id, htm_addr, (u64)CORE_IMC_HTM_MODE_ENABLE)) {
+ prerror("IMC OPAL_start: error in xscom_write for htm_mode\n");
+ return OPAL_HARDWARE;
+ }
+
+ return OPAL_SUCCESS;
+ }
+
+ return OPAL_SUCCESS;
+}
+opal_call(OPAL_IMC_COUNTERS_START, opal_imc_counters_start, 2);
+
+/* opal_imc_counters_control_stop: This call stops the nest imc engine. */
+static int64_t opal_imc_counters_stop(uint32_t type, uint64_t cpu_pir)
+{
+ u64 op;
+ struct imc_chip_cb *cb;
+ struct cpu_thread *c = find_cpu_by_pir(cpu_pir);
+ int port_id, phys_core_id;
+ uint32_t htm_addr;
+
+ if (!c)
+ return OPAL_PARAMETER;
+
+ switch (type) {
+ case OPAL_IMC_COUNTERS_NEST:
+ /* Fetch the IMC control block structure */
+ cb = get_imc_cb(c->chip_id);
+ if (!cb)
+ return OPAL_HARDWARE;
+
+ /* Set the run command */
+ op = NEST_IMC_DISABLE;
+
+ if (proc_chip_quirks & QUIRK_MAMBO_CALLOUTS)
+ return OPAL_SUCCESS;
+
+ /* Write the command to the control block */
+ cb->imc_chip_command = cpu_to_be64(op);
+
+ return OPAL_SUCCESS;
+
+ case OPAL_IMC_COUNTERS_CORE:
+ case OPAL_IMC_COUNTERS_TRACE:
+ /*
+ * Core IMC hardware mandates setting of htm_mode in specific
+ * scom ports (port_id are in htm_scom_index[])
+ */
+ phys_core_id = pir_to_core_id(c->pir);
+ port_id = phys_core_id % 4;
+
+ if (proc_chip_quirks & QUIRK_MAMBO_CALLOUTS)
+ return OPAL_SUCCESS;
+
+ htm_addr = get_imc_scom_addr_for_quad(phys_core_id,
+ htm_scom_index[port_id]);
+ /*
+ * Disables the core imc engine by clearing
+ * bits 4-9 of the HTM_MODE scom port.
+ */
+ if (xscom_write(c->chip_id, htm_addr, (u64) CORE_IMC_HTM_MODE_DISABLE)) {
+ prerror("error in xscom_write for htm_mode\n");
+ return OPAL_HARDWARE;
+ }
+
+ return OPAL_SUCCESS;
+ }
+
+ return OPAL_SUCCESS;
+}
+opal_call(OPAL_IMC_COUNTERS_STOP, opal_imc_counters_stop, 2);
diff --git a/roms/skiboot/hw/ipmi/Makefile.inc b/roms/skiboot/hw/ipmi/Makefile.inc
new file mode 100644
index 000000000..c6b36a2b3
--- /dev/null
+++ b/roms/skiboot/hw/ipmi/Makefile.inc
@@ -0,0 +1,9 @@
+# SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+
+SUBDIRS += hw/ipmi
+
+IPMI_OBJS = ipmi-rtc.o ipmi-power.o ipmi-fru.o ipmi-sel.o
+IPMI_OBJS += ipmi-watchdog.o ipmi-sensor.o ipmi-attn.o ipmi-info.o
+
+IPMI = hw/ipmi/built-in.a
+$(IPMI): $(IPMI_OBJS:%=hw/ipmi/%)
diff --git a/roms/skiboot/hw/ipmi/ipmi-attn.c b/roms/skiboot/hw/ipmi/ipmi-attn.c
new file mode 100644
index 000000000..280b2525f
--- /dev/null
+++ b/roms/skiboot/hw/ipmi/ipmi-attn.c
@@ -0,0 +1,100 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * When everything is terrible, tell the FSP as much as possible as to why
+ *
+ * Copyright 2013-2019 IBM Corp.
+ */
+
+#include <errorlog.h>
+#include <ipmi.h>
+#include <pel.h>
+#include <platform.h>
+#include <processor.h>
+#include <sbe-p9.h>
+#include <skiboot.h>
+#include <stack.h>
+#include <timebase.h>
+#include <xscom.h>
+
+/* Use same attention SRC for BMC based machine */
+DEFINE_LOG_ENTRY(OPAL_RC_ATTN, OPAL_PLATFORM_ERR_EVT,
+ OPAL_ATTN, OPAL_PLATFORM_FIRMWARE,
+ OPAL_ERROR_PANIC, OPAL_ABNORMAL_POWER_OFF);
+
+/* Maximum buffer size to capture backtrace and other useful information */
+#define IPMI_TI_BUFFER_SIZE (IPMI_MAX_PEL_SIZE - PEL_MIN_SIZE)
+static char ti_buffer[IPMI_TI_BUFFER_SIZE];
+
+#define STACK_BUF_ENTRIES 20
+static struct bt_entry bt_buf[STACK_BUF_ENTRIES];
+
+/* Log eSEL event with OPAL backtrace */
+static void ipmi_log_terminate_event(const char *msg)
+{
+ struct bt_metadata metadata;
+ unsigned int ti_len;
+ unsigned int ti_size;
+ struct errorlog *elog_buf;
+
+ /* Fill OPAL version */
+ ti_len = snprintf(ti_buffer, IPMI_TI_BUFFER_SIZE,
+ "OPAL version : %s\n", version);
+
+ /* File information */
+ ti_len += snprintf(ti_buffer + ti_len, IPMI_TI_BUFFER_SIZE - ti_len,
+ "File info : %s\n", msg);
+ ti_size = IPMI_TI_BUFFER_SIZE - ti_len;
+
+ /* Backtrace */
+ backtrace_create(bt_buf, STACK_BUF_ENTRIES, &metadata);
+ metadata.token = OPAL_LAST + 1;
+ backtrace_print(bt_buf, &metadata, ti_buffer + ti_len, &ti_size, true);
+
+ /* Create eSEL event and commit */
+ elog_buf = opal_elog_create(&e_info(OPAL_RC_ATTN), 0);
+ log_append_data(elog_buf, (char *)&ti_buffer, ti_len + ti_size);
+ log_commit(elog_buf);
+}
+
+void __attribute__((noreturn)) ipmi_terminate(const char *msg)
+{
+ /* Log eSEL event */
+ if (ipmi_present())
+ ipmi_log_terminate_event(msg);
+
+ /*
+ * If mpipl is supported then trigger SBE interrupt
+ * to initiate mpipl
+ */
+ p9_sbe_terminate();
+
+ /*
+ * Trigger a software xstop (OPAL TI). It will stop all the CPU threads,
+ * moving them into the quiesced state, and the OCC will collect all FIR
+ * data. Upon the checkstop signal, the BMC then decides whether to
+ * reboot/IPL or not, depending on its AutoReboot policy, if any. This
+ * helps in cases where OPAL is crashing/terminating before the host
+ * reaches runtime. With the OpenBMC AutoReboot policy, in such cases,
+ * it will make sure the system is moved to the Quiesced state after 3
+ * or so IPL attempts. Without OPAL TI, OpenBMC would never know that
+ * OPAL is terminating and the system would go into a never-ending IPL
+ * loop.
+ *
+ * Once the system reaches runtime, OpenBMC resets the boot counter.
+ * Hence the next time the BMC receives an OPAL TI, it will IPL the
+ * system if AutoReboot is enabled. We don't need to worry about self
+ * rebooting.
+ */
+
+ xscom_trigger_xstop();
+ /*
+ * Control will not reach here if software xstop is supported and
+ * enabled. If not supported, fall back to the cec reboot path below.
+ */
+
+ /* Reboot call */
+ if (platform.cec_reboot)
+ platform.cec_reboot();
+
+ while (1)
+ time_wait_ms(100);
+}
diff --git a/roms/skiboot/hw/ipmi/ipmi-fru.c b/roms/skiboot/hw/ipmi/ipmi-fru.c
new file mode 100644
index 000000000..86c9ca0ce
--- /dev/null
+++ b/roms/skiboot/hw/ipmi/ipmi-fru.c
@@ -0,0 +1,231 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Fill out firmware related FRUs (Field Replaceable Units)
+ *
+ * Copyright 2013-2019 IBM Corp.
+ */
+
+#include <skiboot.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ipmi.h>
+#include <lock.h>
+#include <opal.h>
+#include <device.h>
+
+struct product_info {
+ char *manufacturer;
+ char *product;
+ char *part_no;
+ char *version;
+ char *serial_no;
+ char *asset_tag;
+};
+
+struct common_header {
+ u8 version;
+ u8 internal_offset;
+ u8 chassis_offset;
+ u8 board_offset;
+ u8 product_offset;
+ u8 multirecord_offset;
+ u8 pad;
+ u8 checksum;
+} __packed;
+
+/* The maximum amount of FRU data we can store. */
+#define FRU_DATA_SIZE 256
+
+/* We allocate two bytes at these locations in the data array to track
+ * state. */
+#define WRITE_INDEX 256
+#define REMAINING 257
+
+/* The ASCII string encoding used only has 5 bits to encode length
+ * hence the maximum is 31 characters. */
+#define MAX_STR_LEN 31
+
+static u8 fru_dev_id = 0;
+
+static int fru_insert_string(u8 *buf, char *str)
+{
+ int len = strlen(str);
+
+ /* The ASCII type/length format only supports a string length
+ * between 2 and 31 characters. Zero characters is ok though
+ * as it indicates no data present. */
+ if (len == 1 || len > MAX_STR_LEN)
+ return OPAL_PARAMETER;
+
+ buf[0] = 0xc0 | len;
+ memcpy(&buf[1], str, len);
+
+ return len + 1;
+}
+
+static u8 fru_checksum(u8 *buf, int len)
+{
+ int i;
+ u8 checksum = 0;
+
+ for(i = 0; i < len; i++) {
+ checksum += buf[i];
+ }
+ checksum = ~checksum + 1;
+ return checksum;
+}
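+
+/*
+ * The checksum above is the two's complement of the byte sum, so a
+ * record verifies when all its bytes, including the checksum, sum to
+ * zero modulo 256. A minimal check for illustration:
+ *
+ * u8 sum = 0;
+ * for (i = 0; i < len_including_checksum; i++)
+ * sum += buf[i];
+ * valid = (sum == 0);
+ */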
+
+#define FRU_INSERT_STRING(x, y) \
+ ({ rc = fru_insert_string(x, y); \
+ { if (rc < 1) return OPAL_PARAMETER; } rc; })
+
+static int fru_fill_product_info(u8 *buf, struct product_info *info, size_t size)
+{
+ size_t total_size = 11;
+ int index = 0;
+ int rc;
+
+ total_size += strlen(info->manufacturer);
+ total_size += strlen(info->product);
+ total_size += strlen(info->part_no);
+ total_size += strlen(info->version);
+ total_size += strlen(info->serial_no);
+ total_size += strlen(info->asset_tag);
+ total_size += (8 - (total_size % 8)) % 8;
+ if (total_size > size)
+ return OPAL_PARAMETER;
+
+ buf[index++] = 0x1; /* Version */
+ buf[index++] = total_size / 8; /* Size */
+ buf[index++] = 0; /* Language code (English) */
+
+ index += FRU_INSERT_STRING(&buf[index], info->manufacturer);
+ index += FRU_INSERT_STRING(&buf[index], info->product);
+ index += FRU_INSERT_STRING(&buf[index], info->part_no);
+ index += FRU_INSERT_STRING(&buf[index], info->version);
+ index += FRU_INSERT_STRING(&buf[index], info->serial_no);
+ index += FRU_INSERT_STRING(&buf[index], info->asset_tag);
+
+ buf[index++] = 0xc1; /* End of data marker */
+ memset(&buf[index], 0, total_size - index - 1);
+ index += total_size - index - 1;
+ buf[index] = fru_checksum(buf, index);
+ assert(index == total_size - 1);
+
+ return total_size;
+}
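+
+/*
+ * Worked example of the sizing above: with "IBM" (3), "skiboot" (7),
+ * empty part number, serial number and asset tag strings and a
+ * 9-character version string, total_size = 11 + 3 + 7 + 9 = 30,
+ * padded up to 32, so the area length byte written at buf[1] is
+ * 32 / 8 = 4.
+ */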
+
+static int fru_add(u8 *buf, int size)
+{
+ int len;
+ struct common_header common_hdr;
+ char *short_version;
+ struct product_info info = {
+ .manufacturer = (char *) "IBM",
+ .product = (char *) "skiboot",
+ .part_no = (char *) "",
+ .serial_no = (char *) "",
+ .asset_tag = (char *) "",
+ };
+
+ if (size < sizeof(common_hdr))
+ return OPAL_PARAMETER;
+
+ /* We currently only support adding the version number at the
+ * product information offset. We choose an offset of 64 bytes
+ * because that's what the standard recommends. */
+ common_hdr.version = 1;
+ common_hdr.internal_offset = 0;
+ common_hdr.chassis_offset = 0;
+ common_hdr.board_offset = 0;
+ common_hdr.product_offset = 64/8;
+ common_hdr.multirecord_offset = 0;
+ common_hdr.pad = 0;
+ common_hdr.checksum = fru_checksum((u8 *) &common_hdr, sizeof(common_hdr) - 1);
+ memcpy(buf, &common_hdr, sizeof(common_hdr));
+
+ short_version = strdup(version);
+ info.version = short_version;
+ if (!strncmp(version, "skiboot-", 8))
+ info.version = &short_version[8];
+
+ if (strlen(info.version) >= MAX_STR_LEN) {
+ if (info.version[MAX_STR_LEN] != '\0')
+ info.version[MAX_STR_LEN - 1] = '+';
+ info.version[MAX_STR_LEN] = '\0';
+ }
+
+ len = fru_fill_product_info(&buf[64], &info, size - 64);
+ free(short_version);
+ if (len < 0)
+ return OPAL_PARAMETER;
+
+ return len + 64;
+}
+
+static void fru_write_complete(struct ipmi_msg *msg)
+{
+ u8 write_count = msg->data[0];
+ u16 offset;
+
+ msg->data[WRITE_INDEX] += write_count;
+ msg->data[REMAINING] -= write_count;
+ if (msg->data[REMAINING] == 0)
+ goto out;
+
+ offset = msg->data[WRITE_INDEX];
+ ipmi_init_msg(msg, IPMI_DEFAULT_INTERFACE, IPMI_WRITE_FRU,
+ fru_write_complete, NULL,
+ MIN(msg->data[REMAINING] + 3, IPMI_MAX_REQ_SIZE), 2);
+
+ memmove(&msg->data[3], &msg->data[offset + 3], msg->req_size - 3);
+
+ msg->data[0] = fru_dev_id; /* FRU Device ID */
+ msg->data[1] = offset & 0xff; /* Offset LSB */
+ msg->data[2] = (offset >> 8) & 0xff; /* Offset MSB */
+
+ ipmi_queue_msg(msg);
+
+ return;
+
+out:
+ ipmi_free_msg(msg);
+}
+
+static int fru_write(void)
+{
+ struct ipmi_msg *msg;
+ int len;
+
+ /* We allocate FRU_DATA_SIZE + 5 bytes for the message:
+ * - 3 bytes for the write FRU command header
+ * - FRU_DATA_SIZE bytes for FRU data
+ * - 2 bytes for offset & bytes remaining count
+ */
+ msg = ipmi_mkmsg(IPMI_DEFAULT_INTERFACE, IPMI_WRITE_FRU,
+ fru_write_complete, NULL, NULL, FRU_DATA_SIZE + 5, 2);
+ if (!msg)
+ return OPAL_RESOURCE;
+
+ msg->data[0] = fru_dev_id; /* FRU Device ID */
+ msg->data[1] = 0x0; /* Offset LSB (we always write a new common header) */
+ msg->data[2] = 0x0; /* Offset MSB */
+ len = fru_add(&msg->data[3], FRU_DATA_SIZE);
+
+ if (len < 0)
+ return len;
+
+ /* Three bytes for the actual FRU Data Command */
+ msg->data[WRITE_INDEX] = 0;
+ msg->data[REMAINING] = len;
+ msg->req_size = MIN(len + 3, IPMI_MAX_REQ_SIZE);
+ return ipmi_queue_msg(msg);
+}
+
+void ipmi_fru_init(u8 dev_id)
+{
+ fru_dev_id = dev_id;
+ fru_write();
+
+ return;
+}
diff --git a/roms/skiboot/hw/ipmi/ipmi-info.c b/roms/skiboot/hw/ipmi/ipmi-info.c
new file mode 100644
index 000000000..d93b59d7d
--- /dev/null
+++ b/roms/skiboot/hw/ipmi/ipmi-info.c
@@ -0,0 +1,206 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Various bits of info retrieved over IPMI
+ *
+ * Copyright 2018-2019 IBM Corp.
+ */
+
+#include <device.h>
+#include <skiboot.h>
+#include <stdlib.h>
+#include <ipmi.h>
+#include <mem_region-malloc.h>
+#include <opal.h>
+#include <timebase.h>
+
+/*
+ * Response data from IPMI Get device ID command (As defined in
+ * Section 20.1 Get Device ID Command - IPMI standard spec).
+ */
+struct ipmi_dev_id {
+ uint8_t dev_id;
+ uint8_t dev_revision;
+ uint8_t fw_rev1;
+ uint8_t fw_rev2;
+ uint8_t ipmi_ver;
+ uint8_t add_dev_support;
+ uint8_t manufactur_id[3];
+ uint8_t product_id[2];
+ uint8_t aux_fw_rev[4];
+};
+static struct ipmi_dev_id *ipmi_dev_id;
+
+/*
+ * Response data from IPMI Chassis Get System Boot Option (As defined in
+ * Section 28.13 Get System Boot Options Command - IPMI standard spec).
+ */
+struct ipmi_sys_boot_opt {
+ uint8_t param_version;
+ uint8_t param_valid;
+ /*
+ * Fields for OEM parameter 0x62. This parameter does not follow
+ * the normal layout and just has a single byte to signal if it
+ * is active or not.
+ */
+ uint8_t flag_set;
+};
+static struct ipmi_sys_boot_opt *ipmi_sys_boot_opt;
+
+/* Got response from BMC? */
+static bool bmc_info_waiting = false;
+static bool bmc_info_valid = false;
+static bool bmc_boot_opt_waiting = false;
+static bool bmc_boot_opt_valid = false;
+
+/* This will free ipmi_dev_id structure */
+void ipmi_dt_add_bmc_info(void)
+{
+ char buf[8];
+ struct dt_node *dt_fw_version;
+
+ while (bmc_info_waiting)
+ time_wait_ms(5);
+
+ if (!bmc_info_valid)
+ return;
+
+ dt_fw_version = dt_find_by_name(dt_root, "ibm,firmware-versions");
+ if (!dt_fw_version) {
+ free(ipmi_dev_id);
+ return;
+ }
+
+ memset(buf, 0, sizeof(buf));
+ snprintf(buf, sizeof(buf), "%x.%02x",
+ ipmi_dev_id->fw_rev1, ipmi_dev_id->fw_rev2);
+ dt_add_property_string(dt_fw_version, "bmc-firmware-version", buf);
+
+ free(ipmi_dev_id);
+}
+
+static void ipmi_get_bmc_info_resp(struct ipmi_msg *msg)
+{
+ bmc_info_waiting = false;
+
+ if (msg->cc != IPMI_CC_NO_ERROR) {
+ prlog(PR_ERR, "IPMI: IPMI_BMC_GET_DEVICE_ID cmd returned error"
+ " [rc : 0x%x]\n", msg->data[0]);
+ return;
+ }
+
+ /* ipmi_dev_id has optional fields */
+ if (msg->resp_size <= sizeof(struct ipmi_dev_id)) {
+ bmc_info_valid = true;
+ memcpy(ipmi_dev_id, msg->data, msg->resp_size);
+ } else {
+ prlog(PR_WARNING, "IPMI: IPMI_BMC_GET_DEVICE_ID unexpected response size\n");
+ }
+
+ ipmi_free_msg(msg);
+}
+
+int ipmi_get_bmc_info_request(void)
+{
+ int rc;
+ struct ipmi_msg *msg;
+
+ ipmi_dev_id = zalloc(sizeof(struct ipmi_dev_id));
+ assert(ipmi_dev_id);
+
+ msg = ipmi_mkmsg(IPMI_DEFAULT_INTERFACE, IPMI_BMC_GET_DEVICE_ID,
+ ipmi_get_bmc_info_resp, NULL, NULL,
+ 0, sizeof(struct ipmi_dev_id));
+ if (!msg)
+ return OPAL_NO_MEM;
+
+ msg->error = ipmi_get_bmc_info_resp;
+ prlog(PR_INFO, "IPMI: Requesting IPMI_BMC_GET_DEVICE_ID\n");
+ rc = ipmi_queue_msg(msg);
+ if (rc) {
+ prlog(PR_ERR, "IPMI: Failed to queue IPMI_BMC_GET_DEVICE_ID\n");
+ ipmi_free_msg(msg);
+ return rc;
+ }
+
+ bmc_info_waiting = true;
+ return rc;
+}
+
+/* This will free ipmi_sys_boot_opt structure */
+int ipmi_chassis_check_sbe_validation(void)
+{
+ int rc = -1;
+
+ while (bmc_boot_opt_waiting)
+ time_wait_ms(10);
+
+ if (!bmc_boot_opt_valid)
+ goto out;
+
+ if ((ipmi_sys_boot_opt->param_valid & 0x8) != 0)
+ goto out;
+ if (ipmi_sys_boot_opt->param_valid != 0x62)
+ goto out;
+
+ rc = ipmi_sys_boot_opt->flag_set;
+
+out:
+ free(ipmi_sys_boot_opt);
+ return rc;
+}
+
+static void ipmi_get_chassis_boot_opt_resp(struct ipmi_msg *msg)
+{
+ bmc_boot_opt_waiting = false;
+
+ if (msg->cc != IPMI_CC_NO_ERROR) {
+ prlog(PR_INFO, "IPMI: IPMI_CHASSIS_GET_BOOT_OPT cmd returned error"
+ " [rc : 0x%x]\n", msg->data[0]);
+ ipmi_free_msg(msg);
+ return;
+ }
+
+ if (msg->resp_size == sizeof(struct ipmi_sys_boot_opt)) {
+ bmc_boot_opt_valid = true;
+ memcpy(ipmi_sys_boot_opt, msg->data, msg->resp_size);
+ } else {
+ prlog(PR_WARNING, "IPMI: IPMI_CHASSIS_GET_BOOT_OPT unexpected response size\n");
+ }
+
+ ipmi_free_msg(msg);
+}
+
+int ipmi_get_chassis_boot_opt_request(void)
+{
+ int rc;
+ struct ipmi_msg *msg;
+ uint8_t req[] = {
+ 0x62, /* OEM parameter (SBE Validation on astbmc) */
+ 0x00, /* no set selector */
+ 0x00, /* no block selector */
+ };
+
+ ipmi_sys_boot_opt = zalloc(sizeof(struct ipmi_sys_boot_opt));
+ assert(ipmi_sys_boot_opt);
+
+ msg = ipmi_mkmsg(IPMI_DEFAULT_INTERFACE, IPMI_CHASSIS_GET_BOOT_OPT,
+ ipmi_get_chassis_boot_opt_resp, NULL, req,
+ sizeof(req), sizeof(struct ipmi_sys_boot_opt));
+ if (!msg) {
+ free(ipmi_sys_boot_opt);
+ return OPAL_NO_MEM;
+ }
+
+ msg->error = ipmi_get_chassis_boot_opt_resp;
+ prlog(PR_INFO, "IPMI: Requesting IPMI_CHASSIS_GET_BOOT_OPT\n");
+ rc = ipmi_queue_msg(msg);
+ if (rc) {
+ prlog(PR_ERR, "IPMI: Failed to queue IPMI_CHASSIS_GET_BOOT_OPT\n");
+ free(ipmi_sys_boot_opt);
+ ipmi_free_msg(msg);
+ return rc;
+ }
+
+ bmc_boot_opt_waiting = true;
+ return rc;
+}
diff --git a/roms/skiboot/hw/ipmi/ipmi-power.c b/roms/skiboot/hw/ipmi/ipmi-power.c
new file mode 100644
index 000000000..8101a8524
--- /dev/null
+++ b/roms/skiboot/hw/ipmi/ipmi-power.c
@@ -0,0 +1,85 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Power as in electricity, not POWER as in POWER
+ *
+ * Copyright 2013-2019 IBM Corp.
+ */
+
+#include <skiboot.h>
+#include <stdlib.h>
+#include <ipmi.h>
+#include <opal.h>
+#include <timebase.h>
+
+static void ipmi_chassis_control_complete(struct ipmi_msg *msg)
+{
+ uint8_t request = msg->data[0];
+ uint8_t cc = msg->cc;
+
+ ipmi_free_msg(msg);
+ if (cc == IPMI_CC_NO_ERROR)
+ return;
+
+ prlog(PR_INFO, "IPMI: Chassis control request failed. "
+ "request=0x%02x, rc=0x%02x\n", request, cc);
+
+ if (ipmi_chassis_control(request)) {
+ prlog(PR_INFO, "IPMI: Failed to resend chassis control "
+ "request [0x%02x]\n", request);
+ }
+}
+
+int ipmi_chassis_control(uint8_t request)
+{
+ struct ipmi_msg *msg;
+
+ if (!ipmi_present())
+ return OPAL_CLOSED;
+
+ if (request > IPMI_CHASSIS_SOFT_SHUTDOWN)
+ return OPAL_PARAMETER;
+
+ msg = ipmi_mkmsg(IPMI_DEFAULT_INTERFACE, IPMI_CHASSIS_CONTROL,
+ ipmi_chassis_control_complete, NULL,
+ &request, sizeof(request), 0);
+ if (!msg)
+ return OPAL_HARDWARE;
+ /* Set msg->error callback function */
+ msg->error = ipmi_chassis_control_complete;
+
+ prlog(PR_INFO, "IPMI: sending chassis control request 0x%02x\n",
+ request);
+
+ return ipmi_queue_msg(msg);
+}
+
+int ipmi_set_power_state(uint8_t system, uint8_t device)
+{
+ struct ipmi_msg *msg;
+ struct {
+ uint8_t system;
+ uint8_t device;
+ } power_state;
+
+ if (!ipmi_present())
+ return OPAL_CLOSED;
+
+ power_state.system = system;
+ power_state.device = device;
+
+ if (system != IPMI_PWR_NOCHANGE)
+ power_state.system |= 0x80;
+ if (device != IPMI_PWR_NOCHANGE)
+ power_state.device |= 0x80;
+
+ msg = ipmi_mkmsg_simple(IPMI_SET_POWER_STATE, &power_state,
+ sizeof(power_state));
+
+ if (!msg)
+ return OPAL_HARDWARE;
+
+ prlog(PR_INFO, "IPMI: setting power state: sys %02x, dev %02x\n",
+ power_state.system, power_state.device);
+
+ return ipmi_queue_msg(msg);
+}
diff --git a/roms/skiboot/hw/ipmi/ipmi-rtc.c b/roms/skiboot/hw/ipmi/ipmi-rtc.c
new file mode 100644
index 000000000..52da2946c
--- /dev/null
+++ b/roms/skiboot/hw/ipmi/ipmi-rtc.c
@@ -0,0 +1,127 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Talk to a Real Time Clock (RTC) over IPMI
+ *
+ * Copyright 2013-2015 IBM Corp.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+#include <ipmi.h>
+#include <time.h>
+#include <time-utils.h>
+#include <device.h>
+#include <opal.h>
+#include <rtc.h>
+
+static enum {idle, waiting, updated, error} time_status;
+
+static void get_sel_time_error(struct ipmi_msg *msg)
+{
+ time_status = error;
+ ipmi_free_msg(msg);
+}
+
+static void get_sel_time_complete(struct ipmi_msg *msg)
+{
+ struct tm tm;
+ le32 result;
+ time_t time;
+
+ memcpy(&result, msg->data, 4);
+ time = le32_to_cpu(result);
+ gmtime_r(&time, &tm);
+ rtc_cache_update(&tm);
+ time_status = updated;
+ ipmi_free_msg(msg);
+}
+
+static int64_t ipmi_get_sel_time(void)
+{
+ struct ipmi_msg *msg;
+
+ msg = ipmi_mkmsg(IPMI_DEFAULT_INTERFACE, IPMI_GET_SEL_TIME,
+ get_sel_time_complete, NULL, NULL, 0, 4);
+ if (!msg)
+ return OPAL_HARDWARE;
+
+ msg->error = get_sel_time_error;
+
+ return ipmi_queue_msg(msg);
+}
+
+static int64_t ipmi_set_sel_time(uint32_t _tv)
+{
+ struct ipmi_msg *msg;
+ const le32 tv = cpu_to_le32(_tv);
+
+ msg = ipmi_mkmsg_simple(IPMI_SET_SEL_TIME, (void*)&tv, sizeof(tv));
+ if (!msg)
+ return OPAL_HARDWARE;
+
+ return ipmi_queue_msg(msg);
+}
+
+static int64_t ipmi_opal_rtc_read(__be32 *__ymd, __be64 *__hmsm)
+{
+ int ret = 0;
+ uint32_t ymd;
+ uint64_t hmsm;
+
+ if (!__ymd || !__hmsm)
+ return OPAL_PARAMETER;
+
+ switch(time_status) {
+ case idle:
+ if (ipmi_get_sel_time() < 0)
+ return OPAL_HARDWARE;
+ time_status = waiting;
+ ret = OPAL_BUSY_EVENT;
+ break;
+
+ case waiting:
+ ret = OPAL_BUSY_EVENT;
+ break;
+
+ case updated:
+ rtc_cache_get_datetime(&ymd, &hmsm);
+ *__ymd = cpu_to_be32(ymd);
+ *__hmsm = cpu_to_be64(hmsm);
+ time_status = idle;
+ ret = OPAL_SUCCESS;
+ break;
+
+ case error:
+ time_status = idle;
+ ret = OPAL_HARDWARE;
+ break;
+ }
+
+ return ret;
+}
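+
+/*
+ * Usage note: callers are expected to retry on OPAL_BUSY_EVENT. The
+ * first call (idle state) queues ipmi_get_sel_time(); a later call
+ * returns the cached date/time once the BMC response has updated the
+ * RTC cache via get_sel_time_complete().
+ */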
+
+static int64_t ipmi_opal_rtc_write(uint32_t year_month_day,
+ uint64_t hour_minute_second_millisecond)
+{
+ time_t t;
+ struct tm tm;
+
+ datetime_to_tm(year_month_day, hour_minute_second_millisecond, &tm);
+ t = mktime(&tm);
+ if (ipmi_set_sel_time(t))
+ return OPAL_HARDWARE;
+
+ return OPAL_SUCCESS;
+}
+
+void ipmi_rtc_init(void)
+{
+ struct dt_node *np = dt_new(opal_node, "rtc");
+ dt_add_property_strings(np, "compatible", "ibm,opal-rtc");
+
+ opal_register(OPAL_RTC_READ, ipmi_opal_rtc_read, 2);
+ opal_register(OPAL_RTC_WRITE, ipmi_opal_rtc_write, 2);
+
+ /* Initialise the rtc cache */
+ ipmi_get_sel_time();
+}
diff --git a/roms/skiboot/hw/ipmi/ipmi-sel.c b/roms/skiboot/hw/ipmi/ipmi-sel.c
new file mode 100644
index 000000000..215b8ba7d
--- /dev/null
+++ b/roms/skiboot/hw/ipmi/ipmi-sel.c
@@ -0,0 +1,701 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/* Copyright 2013-2018 IBM Corp. */
+
+#define pr_fmt(fmt) "IPMI: " fmt
+#include <ccan/list/list.h>
+#include <ccan/str/str.h>
+#include <compiler.h>
+#include <errno.h>
+#include <skiboot.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ipmi.h>
+#include <device.h>
+#include <opal.h>
+#include <lock.h>
+#include <errorlog.h>
+#include <pel.h>
+#include <opal-msg.h>
+#include <debug_descriptor.h>
+#include <occ.h>
+#include <timebase.h>
+
+/* OEM SEL fields */
+#define SEL_OEM_ID_0 0x55
+#define SEL_OEM_ID_1 0x55
+#define SEL_RECORD_TYPE_OEM 0xC0
+#define SEL_RECORD_TYPE_EVENT 0x02
+
+#define SEL_NETFN_IBM 0x3a
+
+/* OEM SEL Commands */
+/* TODO: Move these to their respective source files */
+#define CMD_AMI_POWER 0x04
+#define CMD_AMI_PNOR_ACCESS 0x07
+#define CMD_AMI_OCC_RESET 0x0e
+#define CMD_HEARTBEAT 0xff
+
+/* XXX: Listed here for completeness, registered in libflash/ipmi-flash.c */
+#define CMD_OP_HIOMAP_EVENT 0x0f
+
+#define SOFT_OFF 0x00
+#define SOFT_REBOOT 0x01
+
+#define RELEASE_PNOR 0x00
+#define REQUEST_PNOR 0x01
+
+/* 32.1 SEL Event Records type */
+#define SEL_REC_TYPE_SYS_EVENT 0x02
+#define SEL_REC_TYPE_AMI_ESEL 0xDF
+
+/* OEM SEL generator ID for AMI */
+#define SEL_GENERATOR_ID_AMI 0x0020
+
+/* IPMI SEL version */
+#define SEL_EVM_VER_1 0x03
+#define SEL_EVM_VER_2 0x04
+
+/*
+ * Sensor type for System events
+ *
+ * Sensor information (type, number, etc) is passed to us via the
+ * device tree. Currently we are using the System Event type to
+ * log OPAL events.
+ */
+#define SENSOR_TYPE_SYS_EVENT 0x12
+
+/*
+ * 42.1 Event/Reading Type Codes
+ *
+ * Note that device hotplug and availability related events
+ * are not defined, as we are not using those event types.
+ */
+#define SEL_EVENT_DIR_TYPE_UNSPECIFIED 0x00
+#define SEL_EVENT_DIR_TYPE_THRESHOLD 0x01
+#define SEL_EVENT_DIR_TYPE_STATE 0x03
+#define SEL_EVENT_DIR_TYPE_PREDICTIVE 0x04
+#define SEL_EVENT_DIR_TYPE_LIMIT 0x05
+#define SEL_EVENT_DIR_TYPE_PERFORMANCE 0x06
+#define SEL_EVENT_DIR_TYPE_TRANSITION 0x07
+#define SEL_EVENT_DIR_TYPE_OEM 0x70
+
+/*
+ * 42.1 Event/Reading Type Codes
+ */
+#define SEL_DATA1_AMI 0xAA
+#define SEL_DATA1_DEASSERTED 0x00
+#define SEL_DATA1_ASSERTED 0x01
+#define SEL_DATA1_OK 0x00
+#define SEL_DATA1_NON_CRIT_FROM_OK 0x01
+#define SEL_DATA1_CRIT_FROM_LESS_SEV 0x02
+#define SEL_DATA1_NON_REC_FROM_LESS_SEV 0x03
+#define SEL_DATA1_NON_CRIT 0x04
+#define SEL_DATA1_CRITICAL 0x05
+#define SEL_DATA1_NON_RECOVERABLE 0X06
+#define SEL_DATA1_MONITOR 0x07
+#define SEL_DATA1_INFORMATIONAL 0x08
+
+/* SEL Record Entry */
+struct sel_record {
+ le16 record_id;
+ uint8_t record_type;
+ le32 timestamp;
+ le16 generator_id;
+ uint8_t evm_ver;
+ uint8_t sensor_type;
+ uint8_t sensor_number;
+ uint8_t event_dir_type;
+ uint8_t event_data1;
+ uint8_t event_data2;
+ uint8_t event_data3;
+} __packed;
+
+static struct sel_record sel_record;
+
+struct oem_sel {
+ /* SEL header */
+ uint8_t id[2];
+ uint8_t type;
+ uint8_t timestamp[4];
+ uint8_t manuf_id[3];
+ /* OEM SEL data (6 bytes) follows */
+ uint8_t netfun;
+ uint8_t cmd;
+ uint8_t data[4];
+};
+
+#define ESEL_HDR_SIZE 7
+
+/* Used for sending PANIC events like abort() path */
+struct ipmi_sel_panic_msg {
+ bool busy;
+ struct ipmi_msg *msg;
+ struct lock lock;
+};
+static struct ipmi_sel_panic_msg ipmi_sel_panic_msg;
+
+static LIST_HEAD(sel_handlers);
+
+/* Forward declaration */
+static void ipmi_elog_poll(struct ipmi_msg *msg);
+
+/*
+ * Allocate IPMI message:
+ * For a normal event, allocate memory using ipmi_mkmsg and for a PANIC
+ * event, use the pre-allocated buffer.
+ */
+static struct ipmi_msg *ipmi_sel_alloc_msg(struct errorlog *elog_buf)
+{
+ struct ipmi_msg *msg = NULL;
+
+ if (elog_buf->event_severity == OPAL_ERROR_PANIC) {
+ /* Called before initialization completes */
+ if (ipmi_sel_panic_msg.msg == NULL) {
+ ipmi_sel_init(); /* Try to allocate IPMI message */
+ if (ipmi_sel_panic_msg.msg == NULL)
+ return NULL;
+ }
+
+ if (ipmi_sel_panic_msg.busy == true)
+ return NULL;
+
+ lock(&ipmi_sel_panic_msg.lock);
+ msg = ipmi_sel_panic_msg.msg;
+ ipmi_sel_panic_msg.busy = true;
+ unlock(&ipmi_sel_panic_msg.lock);
+
+ ipmi_init_msg(msg, IPMI_DEFAULT_INTERFACE, IPMI_RESERVE_SEL,
+ ipmi_elog_poll, elog_buf, IPMI_MAX_REQ_SIZE, 2);
+ } else {
+ msg = ipmi_mkmsg(IPMI_DEFAULT_INTERFACE, IPMI_RESERVE_SEL,
+ ipmi_elog_poll, elog_buf, NULL,
+ IPMI_MAX_REQ_SIZE, 2);
+ }
+
+ return msg;
+}
+
+static void ipmi_sel_free_msg(struct ipmi_msg *msg)
+{
+ if (msg == ipmi_sel_panic_msg.msg) {
+ lock(&ipmi_sel_panic_msg.lock);
+ ipmi_sel_panic_msg.busy = false;
+ unlock(&ipmi_sel_panic_msg.lock);
+ } else {
+ ipmi_free_msg(msg);
+ }
+
+ msg = NULL;
+}
+
+/* Initialize eSEL record */
+static void ipmi_init_esel_record(void)
+{
+ memset(&sel_record, 0, sizeof(struct sel_record));
+ sel_record.record_type = SEL_REC_TYPE_AMI_ESEL;
+ sel_record.generator_id = cpu_to_le16(SEL_GENERATOR_ID_AMI);
+ sel_record.evm_ver = SEL_EVM_VER_2;
+ sel_record.sensor_type = SENSOR_TYPE_SYS_EVENT;
+ sel_record.sensor_number =
+ ipmi_get_sensor_number(SENSOR_TYPE_SYS_EVENT);
+ sel_record.event_dir_type = SEL_EVENT_DIR_TYPE_OEM;
+ sel_record.event_data1 = SEL_DATA1_AMI;
+}
+
+/* Update required fields in SEL record */
+static void ipmi_update_sel_record(uint8_t event_severity, uint16_t esel_record_id)
+{
+ sel_record.record_type = SEL_REC_TYPE_SYS_EVENT;
+ sel_record.event_data2 = (esel_record_id >> 8) & 0xff;
+ sel_record.event_data3 = esel_record_id & 0xff;
+
+ switch (event_severity) {
+ case OPAL_ERROR_PANIC:
+ sel_record.event_dir_type = SEL_EVENT_DIR_TYPE_TRANSITION;
+ sel_record.event_data1 = SEL_DATA1_CRITICAL;
+ break;
+ case OPAL_UNRECOVERABLE_ERR_GENERAL: /* Fall through */
+ case OPAL_UNRECOVERABLE_ERR_DEGRADE_PERF:
+ case OPAL_UNRECOVERABLE_ERR_LOSS_REDUNDANCY:
+ case OPAL_UNRECOVERABLE_ERR_LOSS_REDUNDANCY_PERF:
+ case OPAL_UNRECOVERABLE_ERR_LOSS_OF_FUNCTION:
+ sel_record.event_dir_type = SEL_EVENT_DIR_TYPE_TRANSITION;
+ sel_record.event_data1 = SEL_DATA1_NON_RECOVERABLE;
+ break;
+ case OPAL_PREDICTIVE_ERR_GENERAL: /* Fall through */
+ case OPAL_PREDICTIVE_ERR_DEGRADED_PERF:
+ case OPAL_PREDICTIVE_ERR_FAULT_RECTIFY_REBOOT:
+ case OPAL_PREDICTIVE_ERR_FAULT_RECTIFY_BOOT_DEGRADE_PERF:
+ case OPAL_PREDICTIVE_ERR_LOSS_OF_REDUNDANCY:
+ sel_record.event_dir_type = SEL_EVENT_DIR_TYPE_PREDICTIVE;
+ sel_record.event_data1 = SEL_DATA1_NON_CRIT_FROM_OK;
+ break;
+ case OPAL_RECOVERED_ERR_GENERAL:
+ sel_record.event_dir_type = SEL_EVENT_DIR_TYPE_TRANSITION;
+ sel_record.event_data1 = SEL_DATA1_OK;
+ break;
+ case OPAL_INFO:
+ sel_record.event_dir_type = SEL_EVENT_DIR_TYPE_TRANSITION;
+ sel_record.event_data1 = SEL_DATA1_INFORMATIONAL;
+ break;
+ default:
+ sel_record.event_dir_type = SEL_EVENT_DIR_TYPE_STATE;
+ sel_record.event_data1 = SEL_DATA1_ASSERTED;
+ break;
+ }
+}
+
+static void ipmi_elog_error(struct ipmi_msg *msg)
+{
+ if (msg->cc == IPMI_LOST_ARBITRATION_ERR)
+ /* Retry due to SEL erase */
+ ipmi_queue_msg(msg);
+ else {
+ opal_elog_complete(msg->user_data, false);
+ ipmi_sel_free_msg(msg);
+ }
+}
+
+static void ipmi_log_sel_event_error(struct ipmi_msg *msg)
+{
+ if (msg->cc != IPMI_CC_NO_ERROR)
+ prlog(PR_INFO, "SEL: Failed to log SEL event\n");
+
+ ipmi_sel_free_msg(msg);
+}
+
+static void ipmi_log_sel_event_complete(struct ipmi_msg *msg)
+{
+ prlog(PR_INFO, "SEL: New event logged [ID : %x%x]\n", msg->data[1],
+ msg->data[0]);
+
+ ipmi_sel_free_msg(msg);
+}
+
+/* Log SEL event with eSEL record ID */
+static void ipmi_log_sel_event(struct ipmi_msg *msg, uint8_t event_severity,
+ uint16_t esel_record_id)
+{
+ /* Fill required SEL event fields */
+ ipmi_update_sel_record(event_severity, esel_record_id);
+
+ /* Fill IPMI message */
+ ipmi_init_msg(msg, IPMI_DEFAULT_INTERFACE, IPMI_ADD_SEL_EVENT,
+ ipmi_log_sel_event_complete, NULL,
+ sizeof(struct sel_record), 2);
+
+ /* Copy SEL data */
+ memcpy(msg->data, &sel_record, sizeof(struct sel_record));
+
+ msg->error = ipmi_log_sel_event_error;
+ ipmi_queue_msg_head(msg);
+}
+
+/* Goes through the required steps to add a complete eSEL:
+ *
+ * 1. Get a reservation
+ * 2. Add eSEL header
+ * 3. Partially add data to the SEL
+ *
+ * Because a reservation is needed we need to ensure eSELs are added
+ * as a single transaction as concurrent/interleaved adds would cancel
+ * the reservation. We guarantee this by always adding our messages to
+ * the head of the transmission queue, blocking any other messages
+ * being sent until we have completed sending this message.
+ *
+ * There is still a very small chance that we will accidentally
+ * interleave a message if there is another one waiting at the head of
+ * the ipmi queue and another cpu calls the ipmi poller before we
+ * complete. However this should just cause a reservation cancelled
+ * error which we have to deal with anyway (eg. because there may be a
+ * SEL erase in progress) so it shouldn't cause any problems.
+ */
+static void ipmi_elog_poll(struct ipmi_msg *msg)
+{
+ static bool first = false;
+ static char pel_buf[IPMI_MAX_PEL_SIZE];
+ static size_t pel_size;
+ static size_t esel_size;
+ static int esel_index = 0;
+ int pel_index;
+ static unsigned int reservation_id = 0;
+ static unsigned int record_id = 0;
+ struct errorlog *elog_buf = (struct errorlog *) msg->user_data;
+ size_t req_size;
+
+ if (bmc_platform->sw->ipmi_oem_partial_add_esel == 0) {
+ prlog(PR_WARNING, "Dropped eSEL: BMC code is buggy/missing\n");
+ ipmi_sel_free_msg(msg);
+ return;
+ }
+
+ ipmi_init_esel_record();
+ if (msg->cmd == IPMI_CMD(IPMI_RESERVE_SEL)) {
+ first = true;
+ reservation_id = msg->data[0];
+ reservation_id |= msg->data[1] << 8;
+ if (!reservation_id) {
+ /*
+			 * According to the specification we should never
+			 * get here, but just in case we do, we cancel
+			 * sending the message.
+ */
+ prerror("Invalid reservation id");
+ opal_elog_complete(elog_buf, false);
+ ipmi_sel_free_msg(msg);
+ return;
+ }
+
+ pel_size = create_pel_log(elog_buf, pel_buf, IPMI_MAX_PEL_SIZE);
+ esel_size = pel_size + sizeof(struct sel_record);
+ esel_index = 0;
+ record_id = 0;
+ } else {
+ record_id = msg->data[0];
+ record_id |= msg->data[1] << 8;
+ }
+
+ /* Start or continue the IPMI_PARTIAL_ADD_SEL */
+ if (esel_index >= esel_size) {
+ /*
+		 * We're all done. Invalidate the reservation id to
+ * ensure we get an error if we cut in on another eSEL
+ * message.
+ */
+ reservation_id = 0;
+ esel_index = 0;
+
+ /* Log SEL event and free ipmi message */
+ ipmi_log_sel_event(msg, elog_buf->event_severity, record_id);
+
+ opal_elog_complete(elog_buf, true);
+ return;
+ }
+
+ if ((esel_size - esel_index) <= (IPMI_MAX_REQ_SIZE - ESEL_HDR_SIZE)) {
+ /* Last data to send */
+ msg->data[6] = 1;
+ req_size = esel_size - esel_index + ESEL_HDR_SIZE;
+ } else {
+ msg->data[6] = 0;
+ req_size = IPMI_MAX_REQ_SIZE;
+ }
+
+ ipmi_init_msg(msg, IPMI_DEFAULT_INTERFACE,
+ bmc_platform->sw->ipmi_oem_partial_add_esel,
+ ipmi_elog_poll, elog_buf, req_size, 2);
+
+ msg->data[0] = reservation_id & 0xff;
+ msg->data[1] = (reservation_id >> 8) & 0xff;
+ msg->data[2] = record_id & 0xff;
+ msg->data[3] = (record_id >> 8) & 0xff;
+ msg->data[4] = esel_index & 0xff;
+ msg->data[5] = (esel_index >> 8) & 0xff;
+
+ if (first) {
+ first = false;
+ memcpy(&msg->data[ESEL_HDR_SIZE], &sel_record,
+ sizeof(struct sel_record));
+ esel_index = sizeof(struct sel_record);
+ msg->req_size = esel_index + ESEL_HDR_SIZE;
+ } else {
+ pel_index = esel_index - sizeof(struct sel_record);
+ memcpy(&msg->data[ESEL_HDR_SIZE], &pel_buf[pel_index],
+ msg->req_size - ESEL_HDR_SIZE);
+ esel_index += msg->req_size - ESEL_HDR_SIZE;
+ }
+
+ ipmi_queue_msg_head(msg);
+ return;
+}
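+
+/*
+ * For illustration, a single partial-add request as built above is laid
+ * out as follows (the reservation/record id values shown are hypothetical):
+ *
+ *   data[0..1]            reservation id, LSB first (e.g. 0x1234 -> 0x34, 0x12)
+ *   data[2..3]            record id, LSB first (0 on the first chunk)
+ *   data[4..5]            byte offset of this chunk within the eSEL
+ *   data[6]               progress flag, 1 only on the final chunk
+ *   data[ESEL_HDR_SIZE..] payload: the sel_record first, then PEL data
+ */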
+
+int ipmi_elog_commit(struct errorlog *elog_buf)
+{
+ struct ipmi_msg *msg;
+
+	/* Only log events that need attention */
+ if (elog_buf->event_severity <
+ OPAL_PREDICTIVE_ERR_FAULT_RECTIFY_REBOOT ||
+ elog_buf->elog_origin != ORG_SAPPHIRE) {
+ prlog(PR_INFO, "dropping non severe PEL event\n");
+ opal_elog_complete(elog_buf, true);
+ return 0;
+ }
+
+ /*
+ * We pass a large request size in to mkmsg so that we have a
+ * large enough allocation to reuse the message to pass the
+ * PEL data via a series of partial add commands.
+ */
+ msg = ipmi_sel_alloc_msg(elog_buf);
+ if (!msg) {
+ opal_elog_complete(elog_buf, false);
+ return OPAL_RESOURCE;
+ }
+
+ msg->error = ipmi_elog_error;
+ msg->req_size = 0;
+ if (elog_buf->event_severity == OPAL_ERROR_PANIC) {
+ ipmi_queue_msg_sync(msg);
+
+ /*
+		 * eSEL logs are split into multiple smaller chunks and sent
+		 * to the BMC. Let's wait until we have finished sending all
+		 * the chunks to the BMC.
+ */
+ while (ipmi_sel_panic_msg.busy != false) {
+ if (msg->backend->poll)
+ msg->backend->poll();
+ time_wait_ms(10);
+ }
+ } else {
+ ipmi_queue_msg(msg);
+ }
+
+ return 0;
+}
+
+#define ACCESS_DENIED 0x00
+#define ACCESS_GRANTED 0x01
+
+static void sel_pnor(uint8_t access, void *context __unused)
+{
+ struct ipmi_msg *msg;
+ uint8_t granted = ACCESS_GRANTED;
+
+ switch (access) {
+ case REQUEST_PNOR:
+ prlog(PR_NOTICE, "PNOR access requested\n");
+ if (bmc_platform->sw->ipmi_oem_pnor_access_status == 0) {
+ /**
+ * @fwts-label PNORAccessYeahButNoBut
+ * @fwts-advice OPAL doesn't know that the BMC supports
+ * PNOR access commands. This will be a bug in the OPAL
+ * support for this BMC.
+ */
+ prlog(PR_ERR, "PNOR BUG: access requested but BMC doesn't support request\n");
+ break;
+ }
+
+ granted = flash_reserve();
+ if (granted)
+ occ_pnor_set_owner(PNOR_OWNER_EXTERNAL);
+ /* Ack the request */
+ msg = ipmi_mkmsg_simple(bmc_platform->sw->ipmi_oem_pnor_access_status, &granted, 1);
+ ipmi_queue_msg(msg);
+ break;
+ case RELEASE_PNOR:
+ prlog(PR_NOTICE, "PNOR access released\n");
+ flash_release();
+ occ_pnor_set_owner(PNOR_OWNER_HOST);
+ break;
+ default:
+ /**
+ * @fwts-label InvalidPNORAccessRequest
+ * @fwts-advice In negotiating PNOR access with BMC, we
+ * got an odd/invalid request from the BMC. Likely a bug
+ * in OPAL/BMC interaction.
+ */
+ prlog(PR_ERR, "invalid PNOR access requested: %02x\n",
+ access);
+ }
+}
+
+static void sel_power(uint8_t power, void *context __unused)
+{
+ switch (power) {
+ case SOFT_OFF:
+ prlog(PR_NOTICE, "Soft shutdown requested\n");
+ if (opal_booting() && platform.cec_power_down) {
+ prlog(PR_NOTICE, "Host not up, shutting down now\n");
+ platform.cec_power_down(IPMI_CHASSIS_PWR_DOWN);
+ } else {
+ opal_queue_msg(OPAL_MSG_SHUTDOWN, NULL, NULL,
+ cpu_to_be64(SOFT_OFF));
+ }
+
+ break;
+ case SOFT_REBOOT:
+ prlog(PR_NOTICE, "Soft reboot requested\n");
+ if (opal_booting() && platform.cec_reboot) {
+ prlog(PR_NOTICE, "Host not up, rebooting now\n");
+ platform.cec_reboot();
+ } else {
+ opal_queue_msg(OPAL_MSG_SHUTDOWN, NULL, NULL,
+ cpu_to_be64(SOFT_REBOOT));
+ }
+
+ break;
+ default:
+ prlog(PR_WARNING, "requested bad power state: %02x\n",
+ power);
+ }
+}
+
+static void sel_heartbeat(uint8_t heartbeat, void *context __unused)
+{
+ /* There is only one sub-command so no processing needed */
+ prlog(PR_DEBUG, "BMC issued heartbeat command: %02x\n",
+ heartbeat);
+}
+
+static uint32_t occ_sensor_id_to_chip(uint8_t sensor, uint32_t *chip)
+{
+ struct dt_node *node, *bmc_node, *sensors_node;
+
+ /* Default chip id */
+ *chip = 0;
+
+ bmc_node = dt_find_by_name(dt_root, "bmc");
+ if (!bmc_node)
+ return 0;
+
+ sensors_node = dt_find_by_name(bmc_node, "sensors");
+ if (!sensors_node)
+ return 0;
+
+ node = dt_find_by_name_addr(sensors_node, "sensor", sensor);
+ if (!node) {
+ prlog(PR_DEBUG, "Could not find OCC sensor node. Id : %d\n",
+ (u32)sensor);
+ return 0;
+ }
+
+ if (!dt_has_node_property(node, "ibm,chip-id", NULL)) {
+ prlog(PR_DEBUG, "Could not find chip-id for OCC sensor : %d\n",
+ (u32)sensor);
+ return 0;
+ }
+
+ *chip = dt_get_chip_id(node);
+ return 0;
+}
+
+static void sel_occ_reset(uint8_t sensor, void *context __unused)
+{
+ uint32_t chip;
+ int rc;
+
+ rc = occ_sensor_id_to_chip(sensor, &chip);
+ if (rc) {
+ /**
+ * @fwts-label: SELUnknownOCCReset
+ * @fwts-advice: Likely bug in what sent us the OCC reset.
+ */
+ prlog(PR_ERR, "SEL message to reset an unknown OCC "
+ "(sensor ID 0x%02x)\n", sensor);
+ return;
+ }
+
+ prd_occ_reset(chip);
+}
+
+struct ipmi_sel_handler {
+ uint8_t oem_cmd;
+ void (*fn)(uint8_t data, void *context);
+ void *context;
+ struct list_node node;
+};
+
+int ipmi_sel_register(uint8_t oem_cmd,
+ void (*fn)(uint8_t data, void *context),
+ void *context)
+{
+ struct ipmi_sel_handler *handler;
+
+ list_for_each(&sel_handlers, handler, node) {
+ if (handler->oem_cmd == oem_cmd) {
+ prerror("Handler for SEL command 0x%02x already registered\n",
+ oem_cmd);
+ return -EINVAL;
+ }
+ }
+
+ handler = malloc(sizeof(*handler));
+ if (!handler)
+ return -ENOMEM;
+
+ handler->oem_cmd = oem_cmd;
+ handler->fn = fn;
+ handler->context = context;
+
+ list_add(&sel_handlers, &handler->node);
+
+ return 0;
+}
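+
+/*
+ * A minimal usage sketch. The command number 0x42 and the handler below
+ * are hypothetical, shown only to illustrate how a driver would hook an
+ * OEM SEL command into ipmi_parse_sel().
+ */
+#if 0
+static void my_oem_handler(uint8_t data, void *context __unused)
+{
+	prlog(PR_INFO, "OEM SEL data byte: 0x%02x\n", data);
+}
+
+static void my_driver_init(void)
+{
+	if (ipmi_sel_register(0x42, my_oem_handler, NULL) < 0)
+		prerror("Failed to register OEM SEL handler\n");
+}
+#endif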
+
+void ipmi_sel_init(void)
+{
+ int rc;
+
+ /* Already done */
+ if (ipmi_sel_panic_msg.msg != NULL)
+ return;
+
+ memset(&ipmi_sel_panic_msg, 0, sizeof(struct ipmi_sel_panic_msg));
+ ipmi_sel_panic_msg.msg = ipmi_mkmsg(IPMI_DEFAULT_INTERFACE,
+ IPMI_RESERVE_SEL, ipmi_elog_poll,
+ NULL, NULL, IPMI_MAX_REQ_SIZE, 2);
+
+ /* Hackishly register these old-style handlers here for now */
+ /* TODO: Move them to their appropriate source files */
+ rc = ipmi_sel_register(CMD_AMI_POWER, sel_power, NULL);
+ if (rc < 0) {
+ prerror("Failed to register SEL handler for %s",
+ stringify(CMD_AMI_POWER));
+ }
+
+ rc = ipmi_sel_register(CMD_AMI_OCC_RESET, sel_occ_reset, NULL);
+ if (rc < 0) {
+ prerror("Failed to register SEL handler for %s",
+ stringify(CMD_AMI_OCC_RESET));
+ }
+
+ rc = ipmi_sel_register(CMD_AMI_PNOR_ACCESS, sel_pnor, NULL);
+ if (rc < 0) {
+ prerror("Failed to register SEL handler for %s",
+ stringify(CMD_AMI_PNOR_ACCESS));
+ }
+
+ rc = ipmi_sel_register(CMD_HEARTBEAT, sel_heartbeat, NULL);
+ if (rc < 0) {
+ prerror("Failed to register SEL handler for %s",
+ stringify(CMD_HEARTBEAT));
+ }
+}
+
+void ipmi_parse_sel(struct ipmi_msg *msg)
+{
+ struct ipmi_sel_handler *handler;
+ struct oem_sel sel;
+
+ assert(msg->resp_size <= 16);
+
+ memcpy(&sel, msg->data, msg->resp_size);
+
+ /* We do not process system event records */
+ if (sel.type == SEL_RECORD_TYPE_EVENT) {
+ prlog(PR_INFO, "dropping System Event Record SEL\n");
+ return;
+ }
+
+ prlog(PR_DEBUG, "SEL received (%d bytes, netfn %d, cmd %d)\n",
+ msg->resp_size, sel.netfun, sel.cmd);
+
+ /* Only accept OEM SEL messages */
+ if (sel.id[0] != SEL_OEM_ID_0 || sel.id[1] != SEL_OEM_ID_1 ||
+ sel.type != SEL_RECORD_TYPE_OEM) {
+ prlog(PR_WARNING, "unknown SEL %02x%02x (type %02x)\n",
+ sel.id[0], sel.id[1], sel.type);
+ return;
+ }
+
+ list_for_each(&sel_handlers, handler, node) {
+ if (handler->oem_cmd == sel.cmd) {
+ handler->fn(sel.data[0], handler->context);
+ return;
+ }
+ }
+
+ prlog(PR_WARNING, "unknown OEM SEL command %02x received\n", sel.cmd);
+}
diff --git a/roms/skiboot/hw/ipmi/ipmi-sensor.c b/roms/skiboot/hw/ipmi/ipmi-sensor.c
new file mode 100644
index 000000000..857b789e4
--- /dev/null
+++ b/roms/skiboot/hw/ipmi/ipmi-sensor.c
@@ -0,0 +1,160 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/* Copyright 2013-2017 IBM Corp. */
+
+#include <device.h>
+#include <ipmi.h>
+#include <opal.h>
+#include <skiboot.h>
+#include <string.h>
+#include <stdbool.h>
+
+#define IPMI_WRITE_SENSOR (1 << 0)
+
+#define FW_PROGRESS_SENSOR_TYPE 0x0F
+#define BOOT_COUNT_SENSOR_TYPE 0xC3
+
+static int16_t sensors[MAX_IPMI_SENSORS];
+
+static bool sensors_present = false;
+
+struct set_sensor_req {
+ u8 sensor_number;
+ u8 operation;
+ u8 sensor_reading;
+ u8 assertion_mask[2];
+ u8 deassertion_mask[2];
+ u8 event_data[3];
+};
+
+static bool ipmi_sensor_type_present(uint8_t sensor_type)
+{
+ const struct dt_property *type_prop;
+ uint8_t type;
+ struct dt_node *node;
+
+ dt_for_each_compatible(dt_root, node, "ibm,ipmi-sensor") {
+ type_prop = dt_find_property(node, "ipmi-sensor-type");
+ if (!type_prop) {
+ prlog(PR_ERR, "IPMI: sensor doesn't have ipmi-sensor-type\n");
+ continue;
+ }
+
+ type = (uint8_t)dt_property_get_cell(type_prop, 0);
+ if (type == sensor_type)
+ return true;
+ }
+ return false;
+}
+
+uint8_t ipmi_get_sensor_number(uint8_t sensor_type)
+{
+ assert(sensor_type < MAX_IPMI_SENSORS);
+ return sensors[sensor_type];
+}
+
+int ipmi_set_boot_count(void)
+{
+ struct set_sensor_req req;
+ struct ipmi_msg *msg;
+ int boot_count_sensor;
+
+ if (!sensors_present)
+ return OPAL_UNSUPPORTED;
+
+ if (!ipmi_present())
+ return OPAL_CLOSED;
+
+ if (!ipmi_sensor_type_present(BOOT_COUNT_SENSOR_TYPE))
+ return OPAL_HARDWARE;
+
+ boot_count_sensor = sensors[BOOT_COUNT_SENSOR_TYPE];
+
+ if (boot_count_sensor < 0) {
+ prlog(PR_DEBUG, "IPMI: boot count set but not present\n");
+ return OPAL_HARDWARE;
+ }
+
+ memset(&req, 0, sizeof(req));
+
+ req.sensor_number = boot_count_sensor;
+ req.operation = IPMI_WRITE_SENSOR;
+ req.sensor_reading = 0x00;
+ req.assertion_mask[0] = 0x02;
+
+ msg = ipmi_mkmsg_simple(IPMI_SET_SENSOR_READING, &req, sizeof(req));
+ if (!msg)
+ return OPAL_HARDWARE;
+
+ printf("IPMI: Resetting boot count on successful boot\n");
+
+ return ipmi_queue_msg(msg);
+}
+
+int ipmi_set_fw_progress_sensor(uint8_t state)
+{
+ struct ipmi_msg *msg;
+ struct set_sensor_req request;
+ int fw_sensor_num;
+
+ if (!sensors_present)
+ return OPAL_UNSUPPORTED;
+
+ if (!ipmi_present())
+ return OPAL_CLOSED;
+
+ if (!ipmi_sensor_type_present(FW_PROGRESS_SENSOR_TYPE))
+ return OPAL_HARDWARE;
+
+ fw_sensor_num = sensors[FW_PROGRESS_SENSOR_TYPE];
+
+ if (fw_sensor_num < 0) {
+ prlog(PR_DEBUG, "IPMI: fw progress set but not present\n");
+ return OPAL_HARDWARE;
+ }
+
+ memset(&request, 0, sizeof(request));
+
+ request.sensor_number = fw_sensor_num;
+ request.operation = 0xa0; /* Set event data bytes, assertion bits */
+ request.assertion_mask[0] = 0x04; /* Firmware progress offset */
+ request.event_data[0] = 0xc2;
+ request.event_data[1] = state;
+
+ prlog(PR_INFO, "IPMI: setting fw progress sensor %02x to %02x\n",
+ request.sensor_number, request.event_data[1]);
+
+ msg = ipmi_mkmsg_simple(IPMI_SET_SENSOR_READING, &request,
+ sizeof(request));
+ if (!msg)
+ return OPAL_HARDWARE;
+
+ return ipmi_queue_msg(msg);
+}
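+
+/*
+ * For example, with a (hypothetical) firmware progress sensor number of
+ * 0x05 and state 0x13, the Set Sensor Reading request built above is,
+ * byte for byte:
+ *
+ *   05 a0 00 04 00 00 00 c2 13 00
+ *
+ * i.e. "set event data bytes and assertion bits", asserting offset 2
+ * (firmware progress) with the progress code carried in event data 2.
+ */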
+
+void ipmi_sensor_init(void)
+{
+ const struct dt_property *type_prop, *num_prop;
+ uint8_t num, type;
+ struct dt_node *n;
+
+ memset(sensors, -1, sizeof(sensors));
+
+ dt_for_each_compatible(dt_root, n, "ibm,ipmi-sensor") {
+ type_prop = dt_find_property(n, "ipmi-sensor-type");
+ if (!type_prop) {
+ prerror("IPMI: sensor doesn't have ipmi-sensor-type\n");
+ continue;
+ }
+
+ num_prop = dt_find_property(n, "reg");
+ if (!num_prop) {
+ prerror("IPMI: sensor doesn't have reg property\n");
+ continue;
+ }
+ num = (uint8_t)dt_property_get_cell(num_prop, 0);
+ type = (uint8_t)dt_property_get_cell(type_prop, 0);
+ assert(type < MAX_IPMI_SENSORS);
+ sensors[type] = num;
+ }
+ sensors_present = true;
+}
diff --git a/roms/skiboot/hw/ipmi/ipmi-watchdog.c b/roms/skiboot/hw/ipmi/ipmi-watchdog.c
new file mode 100644
index 000000000..dc0a9e5b4
--- /dev/null
+++ b/roms/skiboot/hw/ipmi/ipmi-watchdog.c
@@ -0,0 +1,218 @@
+// SPDX-License-Identifier: Apache-2.0
+/*
+ * Copyright 2013-2018 IBM Corp.
+ * Copyright 2018 Google Corp.
+ */
+
+#include <stdlib.h>
+#include <ipmi.h>
+#include <lock.h>
+#include <opal.h>
+#include <device.h>
+#include <timer.h>
+#include <timebase.h>
+#include <pool.h>
+#include <skiboot.h>
+
+#define TIMER_USE_DONT_LOG 0x80
+#define TIMER_USE_DONT_STOP 0x40
+#define TIMER_USE_POST 0x02
+
+/* WDT expiration actions */
+#define WDT_PRETIMEOUT_SMI 0x10
+#define WDT_RESET_ACTION 0x01
+#define WDT_NO_ACTION 0x00
+
+/* IPMI defined custom completion codes for the watchdog */
+#define WDT_CC_OK 0x00
+#define WDT_CC_NOT_INITIALIZED 0x80
+
+/* Flags used for IPMI callbacks */
+#define WDT_SET_DO_RESET 0x01
+#define WDT_RESET_NO_REINIT 0x01
+
+/* How long to set the overall watchdog timeout for. In units of
+ * 100ms. If the timer is not reset within this time the watchdog
+ * expiration action will occur. */
+#define WDT_TIMEOUT 600
+
+/* How often to reset the timer using schedule_timer(). Too long and
+we risk the BMC resetting the system due to opal_run_pollers() not
+being called in time, too short and we waste time resetting the wdt
+more frequently than necessary. */
+#define WDT_MARGIN 300
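+
+/* With the defaults above (both values are in the 100ms units used by
+ * the Set Watchdog Timer command) the overall timeout is 600 * 100ms =
+ * 60s and reset_wdt_complete() reschedules a reset every
+ * (WDT_TIMEOUT - WDT_MARGIN) * 100ms = 30s, leaving roughly 30s of
+ * slack for opal_run_pollers() to run before the BMC would act. */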
+
+static struct timer wdt_timer;
+static bool wdt_stopped;
+static bool wdt_ticking;
+
+/* Saved values from the last watchdog set action */
+static uint8_t last_action;
+static uint16_t last_count;
+static uint8_t last_pretimeout;
+
+static void reset_wdt(struct timer *t, void *data, uint64_t now);
+
+static void set_wdt_complete(struct ipmi_msg *msg)
+{
+ const uintptr_t flags = (uintptr_t)msg->user_data;
+
+ if (flags & WDT_SET_DO_RESET) {
+		/* Perform a reset, but make sure it does not create a
+		 * re-init loop in the case where the BMC sends an
+		 * uninitialized error. */
+ reset_wdt(NULL, (void *)WDT_RESET_NO_REINIT, 0);
+ }
+
+ ipmi_free_msg(msg);
+}
+
+static void set_wdt(uint8_t action, uint16_t count, uint8_t pretimeout,
+ bool dont_stop, bool do_reset)
+{
+ struct ipmi_msg *ipmi_msg;
+ uintptr_t completion_flags = 0;
+
+ if (do_reset)
+ completion_flags |= WDT_SET_DO_RESET;
+
+ /* Save the values prior to issuing the set operation so that we can
+ * re-initialize the watchdog in error cases. */
+ last_action = action;
+ last_count = count;
+ last_pretimeout = pretimeout;
+
+ ipmi_msg = ipmi_mkmsg(IPMI_DEFAULT_INTERFACE, IPMI_SET_WDT,
+ set_wdt_complete, NULL, NULL, 6, 0);
+ if (!ipmi_msg) {
+ prerror("Unable to allocate set wdt message\n");
+ return;
+ }
+ ipmi_msg->error = set_wdt_complete;
+ ipmi_msg->user_data = (void *)completion_flags;
+ ipmi_msg->data[0] = TIMER_USE_POST |
+ TIMER_USE_DONT_LOG |
+ (dont_stop ? TIMER_USE_DONT_STOP : 0);
+ ipmi_msg->data[1] = action; /* Timer Actions */
+ ipmi_msg->data[2] = pretimeout; /* Pre-timeout Interval */
+ ipmi_msg->data[3] = 0; /* Timer Use Flags */
+ ipmi_msg->data[4] = count & 0xff; /* Initial countdown (lsb) */
+ ipmi_msg->data[5] = (count >> 8) & 0xff; /* Initial countdown (msb) */
+ ipmi_queue_msg(ipmi_msg);
+}
+
+static void reset_wdt_complete(struct ipmi_msg *msg)
+{
+ const uintptr_t flags = (uintptr_t)msg->user_data;
+ uint64_t reset_delay_ms = (WDT_TIMEOUT - WDT_MARGIN) * 100;
+
+ if (msg->cc == WDT_CC_NOT_INITIALIZED &&
+ !(flags & WDT_RESET_NO_REINIT)) {
+ /* If our timer was not initialized on the BMC side, we should
+ * perform a single attempt to set it up again. */
+ set_wdt(last_action, last_count, last_pretimeout, true, true);
+ } else if (msg->cc != WDT_CC_OK) {
+ /* Use a short (10s) timeout before performing the next reset
+ * if we encounter an unknown error. This makes sure that we
+ * are able to reset and re-initialize the timer since it might
+ * expire. */
+ reset_delay_ms = 10 * 1000;
+ }
+
+ /* If we are inside of skiboot we need to periodically restart the
+ * timer. Reschedule a reset so it happens before the timeout. */
+ if (wdt_ticking)
+ schedule_timer(&wdt_timer, msecs_to_tb(reset_delay_ms));
+
+ ipmi_free_msg(msg);
+}
+
+static struct ipmi_msg *wdt_reset_mkmsg(void)
+{
+ struct ipmi_msg *ipmi_msg;
+
+ ipmi_msg = ipmi_mkmsg(IPMI_DEFAULT_INTERFACE, IPMI_RESET_WDT,
+ reset_wdt_complete, NULL, NULL, 0, 0);
+ if (!ipmi_msg) {
+ prerror("Unable to allocate reset wdt message\n");
+ return NULL;
+ }
+ ipmi_msg->error = reset_wdt_complete;
+
+ return ipmi_msg;
+}
+
+static void sync_reset_wdt(void)
+{
+ struct ipmi_msg *ipmi_msg;
+
+ if ((ipmi_msg = wdt_reset_mkmsg()))
+ ipmi_queue_msg_sync(ipmi_msg);
+}
+
+static void reset_wdt(struct timer *t __unused, void *data,
+ uint64_t now __unused)
+{
+ struct ipmi_msg *ipmi_msg;
+
+ if ((ipmi_msg = wdt_reset_mkmsg())) {
+ ipmi_msg->user_data = data;
+ ipmi_queue_msg_head(ipmi_msg);
+ }
+}
+
+void ipmi_wdt_stop(void)
+{
+ if (!wdt_stopped) {
+ /* Make sure the background reset timer is disabled before
+ * stopping the watchdog. If we issue a reset after disabling
+ * the timer, it will be re-enabled. */
+ wdt_ticking = false;
+ cancel_timer(&wdt_timer);
+
+ /* Configure the watchdog to be disabled and do no action
+ * in case the underlying implementation is buggy and times
+ * out anyway. */
+ wdt_stopped = true;
+ set_wdt(WDT_NO_ACTION, 100, 0, false, false);
+ }
+}
+
+void ipmi_wdt_final_reset(void)
+{
+ /* We can safely stop the timer prior to setting up our final
+ * watchdog timeout since we have enough margin before the
+ * timeout. */
+ wdt_ticking = false;
+ cancel_timer(&wdt_timer);
+
+ /*
+ * We're going to wait a little while before requiring
+ * BOOTKERNEL to have IPMI watchdog support so that people
+ * can catch up in their development environments.
+ * If you still read this after 2018, send a patch!
+ */
+#if 0
+ /* Configure the watchdog and make sure it is still enabled */
+ set_wdt(WDT_RESET_ACTION | WDT_PRETIMEOUT_SMI, WDT_TIMEOUT,
+ WDT_MARGIN/10, true, true);
+ sync_reset_wdt();
+#else
+ set_wdt(WDT_NO_ACTION, 100, 0, false, false);
+#endif
+ ipmi_set_boot_count();
+}
+
+void ipmi_wdt_init(void)
+{
+ init_timer(&wdt_timer, reset_wdt, NULL);
+ set_wdt(WDT_RESET_ACTION, WDT_TIMEOUT, 0, true, false);
+
+ /* Start the WDT. We do it synchronously to make sure it has
+ * started before skiboot continues booting. Otherwise we
+ * could crash before the wdt has actually been started. */
+ wdt_ticking = true;
+ sync_reset_wdt();
+
+ return;
+}
diff --git a/roms/skiboot/hw/ipmi/test/Makefile.check b/roms/skiboot/hw/ipmi/test/Makefile.check
new file mode 100644
index 000000000..ceed1ed39
--- /dev/null
+++ b/roms/skiboot/hw/ipmi/test/Makefile.check
@@ -0,0 +1,34 @@
+# -*-Makefile-*-
+IPMI_TEST := hw/ipmi/test/run-fru
+
+LCOV_EXCLUDE += $(IPMI_TEST:%=%.c)
+
+.PHONY : hw-ipmi-check hw-ipmi-coverage
+hw-ipmi-check: $(IPMI_TEST:%=%-check)
+hw-ipmi-coverage: $(IPMI_TEST:%=%-gcov-run)
+
+check: hw-ipmi-check
+coverage: hw-ipmi-coverage
+
+$(IPMI_TEST:%=%-gcov-run) : %-run: %
+ $(call Q, TEST-COVERAGE ,$< , $<)
+
+$(IPMI_TEST:%=%-check) : %-check: %
+ $(call Q, RUN-TEST ,$(VALGRIND) $<, $<)
+
+$(IPMI_TEST) : % : %.c
+ $(call Q, HOSTCC ,$(HOSTCC) $(HOSTCFLAGS) -O0 -g -I include -I . -o $@ $<, $<)
+
+$(IPMI_TEST:%=%-gcov): %-gcov : %.c %
+ $(call Q, HOSTCC ,$(HOSTCC) $(HOSTCFLAGS) $(HOSTGCOVCFLAGS) -I include -I . -I libfdt -lgcov -o $@ $<, $<)
+
+$(IPMI_TEST:%=%-gcov): % : $(%.d:-gcov=)
+
+-include $(wildcard hw/ipmi/test/*.d)
+
+clean: ipmi-test-clean
+
+ipmi-test-clean:
+ $(RM) -f hw/ipmi/test/*.[od] $(IPMI_TEST) $(IPMI_TEST:%=%-gcov)
+ $(RM) -f *.gcda *.gcno skiboot.info
+ $(RM) -rf coverage-report
diff --git a/roms/skiboot/hw/ipmi/test/run-fru.c b/roms/skiboot/hw/ipmi/test/run-fru.c
new file mode 100644
index 000000000..fa79c98a1
--- /dev/null
+++ b/roms/skiboot/hw/ipmi/test/run-fru.c
@@ -0,0 +1,116 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/* Copyright 2013-2019 IBM Corp. */
+
+#include <unistd.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+
+#define __TEST__
+
+#include "../ipmi-fru.c"
+
+#include <string.h>
+
+int error = 0;
+
+const char version[] = "a-too-long-version-test-string-is-here";
+
+void ipmi_free_msg(struct ipmi_msg __unused *msg)
+{
+}
+
+void ipmi_init_msg(struct ipmi_msg __unused *msg, int __unused interface,
+ uint32_t __unused code,
+ void __unused (*complete)(struct ipmi_msg *),
+ void __unused *user_data, size_t __unused req_size,
+ size_t __unused resp_size)
+{
+}
+
+struct ipmi_msg *ipmi_mkmsg(int __unused interface, uint32_t __unused code,
+ void __unused (*complete)(struct ipmi_msg *),
+ void __unused *user_data, void __unused *req_data, size_t __unused req_size,
+ size_t __unused resp_size)
+{
+ return NULL;
+}
+
+int ipmi_queue_msg(struct ipmi_msg __unused *msg)
+{
+ return 0;
+}
+
+void _prlog(int __unused log_level, const __unused char* fmt, ...)
+{
+ return;
+}
+
+int main(void)
+{
+ u8 *buf;
+ int len;
+ struct product_info info = {
+ .manufacturer = (char *) "IBM",
+ .product = (char *) "skiboot",
+ .part_no = (char *) "hello",
+ .version = (char *) "12345",
+ .serial_no = (char *) "12345",
+ .asset_tag = (char *) "abcd",
+ };
+ struct product_info invalid_info = {
+ .manufacturer = (char *) "I",
+ .product = (char *) "skiboot",
+ .part_no = (char *) "hello",
+ .version = (char *) "12345",
+ .serial_no = (char *) "12345",
+ .asset_tag = (char *) "abcd",
+ };
+ struct product_info invalid_info2 = {
+ .manufacturer = (char *) "IBM",
+ .product = (char *) "skiboot",
+ .part_no = (char *) "this is a really long string that's more"
+ "than 32 characters, because it turns out that's invalid.",
+ .version = (char *) "12345",
+ .serial_no = (char *) "12345",
+ .asset_tag = (char *) "abcd",
+ };
+
+ buf = malloc(256);
+
+ len = fru_fill_product_info(buf, &info, 40);
+ assert(len == 40);
+ assert(memcmp(buf, "\001\005\000\303IBM\307skiboot\305hello"
+ "\30512345\30512345\304abcd\301-",len) == 0);
+
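+	/*
+	 * Decoding the expected bytes above: 01 05 00 is the product area
+	 * header (format 1, length 5 * 8 = 40 bytes, language code 0);
+	 * each field is then a FRU type/length byte followed by its data
+	 * (0xC3 "IBM", 0xC7 "skiboot", 0xC5 "hello", 0xC5 "12345",
+	 * 0xC5 "12345", 0xC4 "abcd"), terminated by the 0xC1 end-of-fields
+	 * marker and a trailing checksum byte.
+	 */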
+
+ /* Make sure the checksum is right */
+ assert(!fru_checksum(buf, len));
+
+ /* This should fail (not enough space) */
+ assert(fru_fill_product_info(buf, &info, 39) < 0);
+
+ memset(buf, 0, 256);
+ len = fru_fill_product_info(buf, &invalid_info, 40);
+ assert(len == OPAL_PARAMETER);
+
+ memset(buf, 0, 256);
+ len = fru_fill_product_info(buf, &invalid_info2, 256);
+ assert(len == OPAL_PARAMETER);
+
+ memset(buf, 0, 256);
+ assert(fru_add(buf, 256) > 0);
+ assert(0 == memcmp(&buf[64], "\001\a\000\303IBM\307skiboot\300"
+ "\337a-too-long-version-test-string+\300\300\301"
+ "\0\0\0",54));
+
+
+ memset(buf, 0, 256);
+ assert(fru_add(buf, 1) == OPAL_PARAMETER);
+
+ memset(buf, 0, 256);
+ assert(fru_add(buf, 65) == OPAL_PARAMETER);
+
+ free(buf);
+
+ return 0;
+}
diff --git a/roms/skiboot/hw/lpc-mbox.c b/roms/skiboot/hw/lpc-mbox.c
new file mode 100644
index 000000000..f5bb97ea4
--- /dev/null
+++ b/roms/skiboot/hw/lpc-mbox.c
@@ -0,0 +1,346 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * LPC MBOX
+ *
+ * Copyright 2017-2018 IBM Corp.
+ */
+
+#define pr_fmt(fmt) "LPC-MBOX: " fmt
+
+#include <skiboot.h>
+#include <lpc.h>
+#include <console.h>
+#include <opal.h>
+#include <device.h>
+#include <interrupts.h>
+#include <processor.h>
+#include <errorlog.h>
+#include <trace.h>
+#include <timebase.h>
+#include <timer.h>
+#include <cpu.h>
+#include <chip.h>
+#include <io.h>
+
+#include <lpc-mbox.h>
+
+#define MBOX_FLAG_REG 0x0f
+#define MBOX_STATUS_0 0x10
+#define MBOX_STATUS_1 0x11
+#define MBOX_STATUS_1_ATTN (1 << 7)
+#define MBOX_STATUS_1_RESP (1 << 5)
+#define MBOX_BMC_CTRL 0x12
+#define MBOX_CTRL_INT_STATUS (1 << 7)
+#define MBOX_CTRL_INT_MASK (1 << 1)
+#define MBOX_CTRL_INT_PING (1 << 0)
+#define MBOX_CTRL_INT_SEND (MBOX_CTRL_INT_PING | MBOX_CTRL_INT_MASK)
+#define MBOX_HOST_CTRL 0x13
+#define MBOX_BMC_INT_EN_0 0x14
+#define MBOX_BMC_INT_EN_1 0x15
+#define MBOX_HOST_INT_EN_0 0x16
+#define MBOX_HOST_INT_EN_1 0x17
+
+#define MBOX_MAX_QUEUE_LEN 5
+
+struct mbox {
+ uint32_t base;
+ int queue_len;
+ bool irq_ok;
+ uint8_t seq;
+ struct timer poller;
+ void (*callback)(struct bmc_mbox_msg *msg, void *priv);
+ void *drv_data;
+ void (*attn)(uint8_t bits, void *priv);
+ void *attn_data;
+ struct lock lock;
+ uint8_t sequence;
+ unsigned long timeout;
+};
+
+static struct mbox mbox;
+
+/*
+ * MBOX accesses
+ */
+
+static void bmc_mbox_outb(uint8_t val, uint8_t reg)
+{
+ lpc_outb(val, mbox.base + reg);
+}
+
+static uint8_t bmc_mbox_inb(uint8_t reg)
+{
+ return lpc_inb(mbox.base + reg);
+}
+
+static void bmc_mbox_recv_message(struct bmc_mbox_msg *msg)
+{
+ uint8_t *msg_data = (uint8_t *)msg;
+ int i;
+
+ for (i = 0; i < BMC_MBOX_READ_REGS; i++)
+ msg_data[i] = bmc_mbox_inb(i);
+}
+
+/* This needs work, don't write the data bytes that aren't needed */
+static void bmc_mbox_send_message(struct bmc_mbox_msg *msg)
+{
+ uint8_t *msg_data = (uint8_t *)msg;
+ int i;
+
+ if (!lpc_ok())
+ /* We're going to have to handle this better */
+ prlog(PR_ERR, "LPC isn't ok\n");
+
+ for (i = 0; i < BMC_MBOX_WRITE_REGS; i++)
+ bmc_mbox_outb(msg_data[i], i);
+
+ /*
+ * Don't touch the response byte - it's setup to generate an interrupt
+ * to the host (us) when written to, or the host status reg - we don't
+ * currently use it, or the BMC status reg - we're not allowed to.
+ */
+
+ /* Ping */
+ prlog(PR_TRACE, "Sending BMC interrupt\n");
+ bmc_mbox_outb(MBOX_CTRL_INT_SEND, MBOX_HOST_CTRL);
+}
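+
+/*
+ * The overall handshake, as implemented here and in mbox_poll() below:
+ *
+ *  1. the host writes the request into data regs 0..BMC_MBOX_WRITE_REGS-1
+ *  2. the host pings the BMC via MBOX_HOST_CTRL (MBOX_CTRL_INT_SEND)
+ *  3. the BMC writes its response and touches the response byte, which
+ *     latches MBOX_STATUS_1_RESP
+ *  4. mbox_poll(), driven by the LPC interrupt or the poller timer,
+ *     clears the status bit (W1C), reads the message back and hands it
+ *     to the callback registered via bmc_mbox_register_callback()
+ */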
+
+int bmc_mbox_enqueue(struct bmc_mbox_msg *msg, unsigned int timeout_sec)
+{
+ if (!mbox.base) {
+ prlog(PR_CRIT, "Using MBOX without init!\n");
+ return OPAL_WRONG_STATE;
+ }
+
+ lock(&mbox.lock);
+ if (mbox.timeout) {
+ prlog(PR_DEBUG, "MBOX message already in flight\n");
+ if (mftb() > mbox.timeout) {
+ prlog(PR_ERR, "In flight message dropped on the floor\n");
+ } else {
+ unlock(&mbox.lock);
+ return OPAL_BUSY;
+ }
+ }
+
+ mbox.timeout = mftb() + secs_to_tb(timeout_sec);
+ msg->seq = ++mbox.sequence;
+
+ bmc_mbox_send_message(msg);
+ unlock(&mbox.lock);
+
+ schedule_timer(&mbox.poller, mbox.irq_ok ?
+ TIMER_POLL : msecs_to_tb(MBOX_DEFAULT_POLL_MS));
+
+ return 0;
+}
+
+static void mbox_poll(struct timer *t __unused, void *data __unused,
+ uint64_t now __unused)
+{
+ struct bmc_mbox_msg msg;
+
+ if (!lpc_ok())
+ return;
+
+ /*
+ * This status bit being high means that someone touched the
+ * response byte (byte 13).
+	 * There is probably a response for the previously sent command.
+ */
+ lock(&mbox.lock);
+ if (bmc_mbox_inb(MBOX_STATUS_1) & MBOX_STATUS_1_RESP) {
+ /* W1C on that reg */
+ bmc_mbox_outb(MBOX_STATUS_1_RESP, MBOX_STATUS_1);
+
+ prlog(PR_INSANE, "Got a regular interrupt\n");
+
+ bmc_mbox_recv_message(&msg);
+ if (mbox.sequence != msg.seq) {
+ prlog(PR_ERR, "Got a response to a message we no longer care about\n");
+ goto out_response;
+ }
+
+ mbox.timeout = 0;
+ if (mbox.callback)
+ mbox.callback(&msg, mbox.drv_data);
+ else
+ prlog(PR_ERR, "Detected NULL callback for mbox message\n");
+ }
+
+out_response:
+
+ /*
+ * The BMC has touched byte 15 to get our attention as it has
+ * something to tell us.
+ */
+ if (bmc_mbox_inb(MBOX_STATUS_1) & MBOX_STATUS_1_ATTN) {
+ uint8_t action, all;
+
+ /* W1C on that reg */
+ bmc_mbox_outb(MBOX_STATUS_1_ATTN, MBOX_STATUS_1);
+
+ all = action = bmc_mbox_inb(MBOX_FLAG_REG);
+ prlog(PR_TRACE, "Got a status register interrupt with action 0x%02x\n",
+ action);
+ if (action & MBOX_ATTN_BMC_REBOOT) {
+ /*
+ * It's unlikely that something needs to be done at the
+ * driver level. Let libflash deal with it.
+			 * Print something just in case; it is quite a
+			 * significant event.
+ */
+ prlog(PR_WARNING, "BMC reset detected\n");
+ action &= ~MBOX_ATTN_BMC_REBOOT;
+ }
+
+ if (action & MBOX_ATTN_BMC_WINDOW_RESET)
+ action &= ~MBOX_ATTN_BMC_WINDOW_RESET;
+
+ if (action & MBOX_ATTN_BMC_FLASH_LOST)
+ action &= ~MBOX_ATTN_BMC_FLASH_LOST;
+
+ if (action & MBOX_ATTN_BMC_DAEMON_READY)
+ action &= ~MBOX_ATTN_BMC_DAEMON_READY;
+
+ if (action)
+			prlog(PR_ERR, "Got a status bit set that we don't know about: 0x%02x\n",
+ action);
+
+ mbox.attn(all, mbox.attn_data);
+ }
+
+ unlock(&mbox.lock);
+
+ schedule_timer(&mbox.poller,
+ mbox.irq_ok ? TIMER_POLL : msecs_to_tb(MBOX_DEFAULT_POLL_MS));
+}
+
+static void mbox_irq(uint32_t chip_id __unused, uint32_t irq_mask __unused)
+{
+ mbox.irq_ok = true;
+ mbox_poll(NULL, NULL, 0);
+}
+
+static struct lpc_client mbox_lpc_client = {
+ .interrupt = mbox_irq,
+};
+
+static bool mbox_init_hw(void)
+{
+ /* Disable all status interrupts except attentions */
+ bmc_mbox_outb(0x00, MBOX_HOST_INT_EN_0);
+ bmc_mbox_outb(MBOX_STATUS_1_ATTN, MBOX_HOST_INT_EN_1);
+
+ /* Cleanup host interrupt and status */
+ bmc_mbox_outb(MBOX_CTRL_INT_STATUS, MBOX_HOST_CTRL);
+
+ /* Disable host control interrupt for now (will be
+ * re-enabled when needed). Clear BMC interrupts
+ */
+ bmc_mbox_outb(MBOX_CTRL_INT_MASK, MBOX_BMC_CTRL);
+
+ return true;
+}
+
+int bmc_mbox_register_callback(void (*callback)(struct bmc_mbox_msg *msg, void *priv),
+ void *drv_data)
+{
+ mbox.callback = callback;
+ mbox.drv_data = drv_data;
+ return 0;
+}
+
+int bmc_mbox_register_attn(void (*callback)(uint8_t bits, void *priv),
+ void *drv_data)
+{
+ mbox.attn = callback;
+ mbox.attn_data = drv_data;
+ return 0;
+}
+
+uint8_t bmc_mbox_get_attn_reg(void)
+{
+ return bmc_mbox_inb(MBOX_FLAG_REG);
+}
+
+void mbox_init(void)
+{
+ const struct dt_property *prop;
+ struct dt_node *np;
+ uint32_t irq, chip_id;
+
+ if (mbox.base) {
+ prlog(PR_ERR, "Duplicate call to mbox_init()\n");
+ return;
+ }
+
+ prlog(PR_DEBUG, "Attempting mbox init\n");
+ np = dt_find_compatible_node(dt_root, NULL, "mbox");
+ if (!np) {
+		/* Only an ERROR on P9 and above, otherwise just
+		 * a debug message for someone doing development
+ */
+ prlog((proc_gen <= proc_gen_p8) ? PR_DEBUG : PR_ERR,
+ "No device tree entry\n");
+ return;
+ }
+
+ /* Read the interrupts property if any */
+ irq = dt_prop_get_u32_def(np, "interrupts", 0);
+ if (!irq) {
+ prlog(PR_ERR, "No interrupts property\n");
+ return;
+ }
+
+ if (!lpc_present()) {
+ prlog(PR_ERR, "LPC not present\n");
+ return;
+ }
+
+ /* Get IO base */
+ prop = dt_find_property(np, "reg");
+ if (!prop) {
+ prlog(PR_ERR, "Can't find reg property\n");
+ return;
+ }
+ if (dt_property_get_cell(prop, 0) != OPAL_LPC_IO) {
+ prlog(PR_ERR, "Only supports IO addresses\n");
+ return;
+ }
+ mbox.base = dt_property_get_cell(prop, 1);
+
+ if (!mbox_init_hw()) {
+ prlog(PR_DEBUG, "Couldn't init HW\n");
+ return;
+ }
+
+	/* Disable the standard interrupt, we don't care about it */
+ bmc_mbox_outb(MBOX_CTRL_INT_MASK, MBOX_HOST_CTRL);
+
+ /* Clear the status reg bits that we intend to use for interrupts */
+ /* W1C */
+ bmc_mbox_outb(MBOX_STATUS_1_RESP | MBOX_STATUS_1_ATTN, MBOX_STATUS_1);
+
+ mbox.queue_len = 0;
+ mbox.callback = NULL;
+ mbox.drv_data = NULL;
+ mbox.timeout = 0;
+ mbox.sequence = 0;
+ init_lock(&mbox.lock);
+
+ init_timer(&mbox.poller, mbox_poll, NULL);
+
+ chip_id = dt_get_chip_id(np);
+ mbox_lpc_client.interrupts = LPC_IRQ(irq);
+ lpc_register_client(chip_id, &mbox_lpc_client, IRQ_ATTR_TARGET_OPAL);
+
+ /* Enable interrupts */
+ bmc_mbox_outb(MBOX_STATUS_1_ATTN | MBOX_STATUS_1_RESP, MBOX_HOST_INT_EN_1);
+
+ prlog(PR_DEBUG, "Enabled on chip %d, IO port 0x%x, IRQ %d\n",
+ chip_id, mbox.base, irq);
+}
+
+
diff --git a/roms/skiboot/hw/lpc-port80h.c b/roms/skiboot/hw/lpc-port80h.c
new file mode 100644
index 000000000..0d1fee99e
--- /dev/null
+++ b/roms/skiboot/hw/lpc-port80h.c
@@ -0,0 +1,173 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * op_display() but over the 1 byte LPC port 80h just like an original IBM PC
+ *
+ * Copyright 2018-2019 IBM Corp.
+ */
+
+#define pr_fmt(fmt) "Port80h: " fmt
+
+#include <lpc.h>
+#include <op-panel.h>
+#include <chip.h>
+
+/*
+ * Convert our detailed op_display() call into 1 byte for LPC port 80h
+ *
+ * Our layout looks like this:
+ * MSB (bit 7): 1 = Comes from OPAL
+ * bit 6 : 0 = OP_MOD_INIT (the main one), 1 = (see bit 5)
+ * bit 5432 : (if bit 6=0, low nibble of op-panel code)
+ * bit 5432 : (if bit 6=1, other OP_MOD_ values in bits 54:
+ * 00b=OP_MOD_CPU, 01b=OP_MOD_LOCK,
+ * 10b=OP_MOD_MEM, 11b=OP_MOD_CHIPTOD
+ * bits 0,1 from code in bits 32)
+ *
+ * bit 1,0: 00b=OP_LOG, 10b=OP_WARN, 01b=OP_ERROR, 11b=OP_FATAL
+ * i.e. bit 0 indicates ERROR or FATAL.
+ *
+ * If port 80h number has the MSB and LSB set, then you died in OPAL.
+ * Any *odd* number with the MSB set (i.e. > 0x80) indicates error.
+ */
+static inline uint8_t op_display_to_port80(uint8_t last_value, enum op_severity s, enum op_module m, uint16_t c)
+{
+ uint8_t r = 0x80; /* Start with top bit set indicating in OPAL */
+
+ switch(m) {
+ case OP_MOD_INIT:
+ /* bit 6 is zero */
+ /* bits 5432 have low nibble of c */
+ r |= (c & 0x0f) << 2;
+ break;
+ case OP_MOD_CPU:
+ r |= 0x40 | (c & 0x03) << 2;
+ break;
+ case OP_MOD_LOCK:
+ r |= 0x50 | (c & 0x03) << 2;
+ break;
+ case OP_MOD_MEM:
+ r |= 0x60 | (c & 0x03) << 2;
+ break;
+ case OP_MOD_CHIPTOD:
+ r |= 0x70 | (c & 0x03) << 2;
+ break;
+ case OP_MOD_CORE:
+ /*
+		 * The only current user of OP_MOD_CORE is OP_FATAL,
+		 * so take the last value set and tweak the bits for
+		 * OP_FATAL.
+ */
+ r = last_value & 0xFC;
+ break;
+ case OP_MOD_FSP:
+ case OP_MOD_FSPCON:
+ /* Should never be hit, port80h only used on non-FSP! */
+ break;
+ }
+
+ switch(s) {
+ case OP_LOG:
+ break;
+ case OP_WARN:
+ r |= 0x02;
+ break;
+ case OP_ERROR:
+ r |= 0x01;
+ break;
+ case OP_FATAL:
+ r |= 0x03;
+ }
+
+ return r;
+}
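+
+/*
+ * Worked examples (values follow directly from the layout above):
+ *
+ *   op_display_to_port80(last, OP_LOG, OP_MOD_INIT, 0x5)
+ *     = 0x80 | (0x5 << 2)               = 0x94
+ *   op_display_to_port80(last, OP_ERROR, OP_MOD_MEM, 0x2)
+ *     = 0x80 | 0x60 | (0x2 << 2) | 0x01 = 0xe9
+ *
+ * and an OP_FATAL from OP_MOD_CORE keeps the previously displayed
+ * progress bits, only forcing the low two bits to 11b.
+ */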
+
+/*
+ * Convert our detailed op_display() call into 2 bytes for LPC port 81h and 82h
+ *
+ * This looks pretty similar to our port80 code.
+ * Notably we now have more bits to throw progress into.
+ *
+ * Our layout looks like this:
+ * MSB (bit 15): 1 = Comes from OPAL
+ * bit 14 : 0 = OP_MOD_INIT (the main one), 1 = (see bit 13)
+ * bits 13-2 : (if bit 14=0, low 12 bits of op-panel code)
+ * bit 13,12 : (if bit 14=1, other OP_MOD_ values in bits 13 and 12:
+ * 00b=OP_MOD_CPU, 01b=OP_MOD_LOCK,
+ * 10b=OP_MOD_MEM, 11b=OP_MOD_CHIPTOD)
+ * and bits 11-2 are low 10 bits of op-panel code)
+ *
+ * bit 1,0: 00b=OP_LOG, 10b=OP_WARN, 01b=OP_ERROR, 11b=OP_FATAL
+ * i.e. bit 0 indicates ERROR or FATAL.
+ *
+ * If the port 81h/82h number has the MSB and LSB set, then you died in OPAL.
+ * Any *odd* number with the MSB set (i.e. > 0x8000) indicates error.
+ */
+static inline uint16_t op_display_to_port8x(uint16_t last_value, enum op_severity s, enum op_module m, uint16_t c)
+{
+ uint16_t r = 0x8000; /* Start with top bit set indicating in OPAL */
+
+ switch(m) {
+ case OP_MOD_INIT:
+		/* bit 14 is zero */
+ /* bits 13 through 2 have low 12 bits of c */
+ r |= (c & 0xFFF) << 2;
+ break;
+ case OP_MOD_CPU:
+ r |= 0x4000 | (c & 0x03FF) << 2;
+ break;
+ case OP_MOD_LOCK:
+ r |= 0x5000 | (c & 0x03FF) << 2;
+ break;
+ case OP_MOD_MEM:
+ r |= 0x6000 | (c & 0x03FF) << 2;
+ break;
+ case OP_MOD_CHIPTOD:
+ r |= 0x7000 | (c & 0x03FF) << 2;
+ break;
+ case OP_MOD_CORE:
+ /*
+		 * The only current user of OP_MOD_CORE is OP_FATAL,
+		 * so take the last value set and tweak the bits for
+		 * OP_FATAL.
+ */
+ r = last_value & 0xFFFC;
+ break;
+ case OP_MOD_FSP:
+ case OP_MOD_FSPCON:
+ /* Should never be hit, port80h only used on non-FSP! */
+ break;
+ }
+
+ switch(s) {
+ case OP_LOG:
+ break;
+ case OP_WARN:
+ r |= 0x02;
+ break;
+ case OP_ERROR:
+ r |= 0x01;
+ break;
+ case OP_FATAL:
+ r |= 0x03;
+ }
+
+ return r;
+}
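+
+/*
+ * Example (16-bit variant): OP_ERROR from OP_MOD_MEM with code 0x2 gives
+ * 0x8000 | 0x6000 | (0x2 << 2) | 0x01 = 0xe009, which op_display_lpc()
+ * below emits as 0xe0 to port 81h and 0x09 to port 82h.
+ */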
+
+
+void op_display_lpc(enum op_severity s, enum op_module m, uint16_t c)
+{
+ static uint8_t port80_val = 0x80;
+ static uint16_t port8x_val = 0x8000;
+
+ if (chip_quirk(QUIRK_SIMICS))
+ return;
+
+ port80_val = op_display_to_port80(port80_val, s, m, c);
+ port8x_val = op_display_to_port8x(port8x_val, s, m, c);
+
+ lpc_probe_write(OPAL_LPC_IO, 0x80, port80_val, 1);
+ lpc_probe_write(OPAL_LPC_IO, 0x81, port8x_val >> 8, 1);
+ lpc_probe_write(OPAL_LPC_IO, 0x82, port8x_val & 0xff, 1);
+}
+
diff --git a/roms/skiboot/hw/lpc-rtc.c b/roms/skiboot/hw/lpc-rtc.c
new file mode 100644
index 000000000..dc4a484b3
--- /dev/null
+++ b/roms/skiboot/hw/lpc-rtc.c
@@ -0,0 +1,235 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Real Time Clock hanging off LPC
+ *
+ * Copyright 2015 IBM Corp.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+#include <ipmi.h>
+#include <time.h>
+#include <time-utils.h>
+#include <device.h>
+#include <opal.h>
+#include <rtc.h>
+#include <lpc.h>
+#include <lock.h>
+#include <timebase.h>
+
+/* Legacy RTC registers */
+#define RTC_REG_SECONDS 0
+#define RTC_REG_MINUTES 2
+#define RTC_REG_HOURS 4
+#define RTC_REG_DAY_OF_WEEK 6
+#define RTC_REG_DAY_OF_MONTH 7
+#define RTC_REG_MONTH 8
+#define RTC_REG_YEAR 9
+#define RTC_REG_A 10
+#define RTC_REG_A_UIP 0x80
+#define RTC_REG_B 11
+#define RTC_REG_B_DIS_UPD 0x80
+#define RTC_REG_B_PIE 0x40
+#define RTC_REG_B_AIE 0x20
+#define RTC_REG_B_UIE 0x10
+#define RTC_REG_B_SQWE 0x08
+#define RTC_REG_B_DM_BINARY 0x04
+#define RTC_REG_B_24H 0x02
+#define RTC_REG_B_DST_EN 0x01
+#define RTC_REG_C 12
+#define RTC_REG_D 13
+#define RTC_REG_D_VALID 0x80
+
+/* Init value is no interrupts, 24H mode, updates enabled */
+#define RTC_REG_B_INIT (RTC_REG_B_24H)
+
+static u32 rtc_port;
+static struct lock rtc_lock = LOCK_UNLOCKED;
+
+static uint8_t rtc_read(uint8_t reg)
+{
+ lpc_outb(reg, rtc_port);
+ return lpc_inb(rtc_port + 1);
+}
+
+static void rtc_write(uint8_t reg, uint8_t val)
+{
+ lpc_outb(reg, rtc_port);
+ lpc_outb(val, rtc_port + 1);
+}
+
+static bool lpc_rtc_read_tm(struct tm *tm)
+{
+ struct tm tm2;
+ unsigned int loops = 0;
+
+	/* Read until two consecutive passes return identical values; this
+ * should deal with update races in all practical cases
+ */
+ for (;;) {
+ tm2 = *tm;
+ tm->tm_sec = rtc_read(RTC_REG_SECONDS);
+ tm->tm_min = rtc_read(RTC_REG_MINUTES);
+ tm->tm_hour = rtc_read(RTC_REG_HOURS);
+ tm->tm_mday = rtc_read(RTC_REG_DAY_OF_MONTH);
+ tm->tm_mon = rtc_read(RTC_REG_MONTH);
+ tm->tm_year = rtc_read(RTC_REG_YEAR);
+ if (loops > 0 && memcmp(&tm2, tm, sizeof(struct tm)) == 0)
+ break;
+ loops++;
+ if (loops > 10) {
+ prerror("RTC: Failed to obtain stable values\n");
+ return false;
+ }
+ }
+ tm->tm_sec = bcd_byte(tm->tm_sec, 0);
+ tm->tm_min = bcd_byte(tm->tm_min, 0);
+ tm->tm_hour = bcd_byte(tm->tm_hour, 0);
+ tm->tm_mday = bcd_byte(tm->tm_mday, 0);
+ tm->tm_mon = bcd_byte(tm->tm_mon, 0) - 1;
+ tm->tm_year = bcd_byte(tm->tm_year, 0);
+
+ /* 2000 wrap */
+ if (tm->tm_year < 69)
+ tm->tm_year += 100;
+
+ /* Base */
+ tm->tm_year += 1900;
+
+ return true;
+}
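+
+/*
+ * Example: a stable snapshot of 23:59:30 on 5 December '31 reads back in
+ * BCD as sec=0x30, min=0x59, hour=0x23, mday=0x05, mon=0x12, year=0x31;
+ * after bcd_byte() and the adjustments above this yields tm_sec=30,
+ * tm_min=59, tm_hour=23, tm_mday=5, tm_mon=11 (0-based) and tm_year=2031
+ * (31 < 69, so the 2000 wrap plus the 1900 base both apply).
+ */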
+
+static void lpc_rtc_write_tm(struct tm *tm __unused)
+{
+ /* XXX */
+}
+
+static void lpc_init_time(void)
+{
+ uint8_t val;
+ struct tm tm;
+ bool valid;
+
+ memset(&tm, 0, sizeof(tm));
+
+ lock(&rtc_lock);
+
+ /* If update is in progress, wait a bit */
+ val = rtc_read(RTC_REG_A);
+ if (val & RTC_REG_A_UIP)
+ time_wait_ms(10);
+
+ /* Read from RTC */
+ valid = lpc_rtc_read_tm(&tm);
+
+ unlock(&rtc_lock);
+
+ /* Update cache */
+ if (valid)
+ rtc_cache_update(&tm);
+}
+
+static void lpc_init_hw(void)
+{
+ lock(&rtc_lock);
+
+ /* Set REG B to a suitable default */
+ rtc_write(RTC_REG_B, RTC_REG_B_INIT);
+
+ unlock(&rtc_lock);
+}
+
+static int64_t lpc_opal_rtc_read(__be32 *__ymd, __be64 *__hmsm)
+{
+ uint8_t val;
+ int64_t rc = OPAL_SUCCESS;
+ struct tm tm;
+ uint32_t ymd;
+ uint64_t hmsm;
+
+ if (!__ymd || !__hmsm)
+ return OPAL_PARAMETER;
+
+ /* Return busy if updating. This is somewhat racy, but will
+	 * do for now; most RTCs nowadays are smart enough to atomically
+ * update. Alternatively we could just read from the cache...
+ */
+ lock(&rtc_lock);
+ val = rtc_read(RTC_REG_A);
+ if (val & RTC_REG_A_UIP) {
+ unlock(&rtc_lock);
+ return OPAL_BUSY_EVENT;
+ }
+
+ /* Read from RTC */
+ if (lpc_rtc_read_tm(&tm))
+ rc = OPAL_SUCCESS;
+ else
+ rc = OPAL_HARDWARE;
+ unlock(&rtc_lock);
+
+ if (rc == OPAL_SUCCESS) {
+ /* Update cache */
+ rtc_cache_update(&tm);
+
+ /* Convert to OPAL time */
+ tm_to_datetime(&tm, &ymd, &hmsm);
+ *__ymd = cpu_to_be32(ymd);
+ *__hmsm = cpu_to_be64(hmsm);
+ }
+
+ return rc;
+}
+
+static int64_t lpc_opal_rtc_write(uint32_t year_month_day,
+ uint64_t hour_minute_second_millisecond)
+{
+ struct tm tm;
+
+ /* Convert to struct tm */
+ datetime_to_tm(year_month_day, hour_minute_second_millisecond, &tm);
+
+ /* Write it out */
+ lock(&rtc_lock);
+ lpc_rtc_write_tm(&tm);
+ unlock(&rtc_lock);
+
+ return OPAL_SUCCESS;
+}
+
+void lpc_rtc_init(void)
+{
+ struct dt_node *rtc_node, *np;
+
+ if (!lpc_present())
+ return;
+
+ /* We support only one */
+ rtc_node = dt_find_compatible_node(dt_root, NULL, "pnpPNP,b00");
+ if (!rtc_node)
+ return;
+
+ /* Get IO base */
+ rtc_port = dt_prop_get_cell_def(rtc_node, "reg", 1, 0);
+ if (!rtc_port) {
+ prerror("RTC: Can't find reg property\n");
+ return;
+ }
+ if (dt_prop_get_cell_def(rtc_node, "reg", 0, 0) != OPAL_LPC_IO) {
+ prerror("RTC: Unsupported address type\n");
+ return;
+ }
+
+ /* Init the HW */
+ lpc_init_hw();
+
+ /* Create OPAL API node and register OPAL calls */
+ np = dt_new(opal_node, "rtc");
+ dt_add_property_strings(np, "compatible", "ibm,opal-rtc");
+
+ opal_register(OPAL_RTC_READ, lpc_opal_rtc_read, 2);
+ opal_register(OPAL_RTC_WRITE, lpc_opal_rtc_write, 2);
+
+ /* Initialise the rtc cache */
+ lpc_init_time();
+}
diff --git a/roms/skiboot/hw/lpc-uart.c b/roms/skiboot/hw/lpc-uart.c
new file mode 100644
index 000000000..834011b37
--- /dev/null
+++ b/roms/skiboot/hw/lpc-uart.c
@@ -0,0 +1,738 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Serial port hanging off LPC
+ *
+ * Copyright 2013-2019 IBM Corp.
+ */
+
+#include <skiboot.h>
+#include <lpc.h>
+#include <console.h>
+#include <opal.h>
+#include <device.h>
+#include <interrupts.h>
+#include <processor.h>
+#include <errorlog.h>
+#include <trace.h>
+#include <timebase.h>
+#include <cpu.h>
+#include <chip.h>
+#include <io.h>
+#include <nvram.h>
+
+DEFINE_LOG_ENTRY(OPAL_RC_UART_INIT, OPAL_PLATFORM_ERR_EVT, OPAL_UART,
+ OPAL_CEC_HARDWARE, OPAL_PREDICTIVE_ERR_GENERAL,
+ OPAL_NA);
+
+/* UART reg defs */
+#define REG_RBR 0
+#define REG_THR 0
+#define REG_DLL 0
+#define REG_IER 1
+#define REG_DLM 1
+#define REG_FCR 2
+#define REG_IIR 2
+#define REG_LCR 3
+#define REG_MCR 4
+#define REG_LSR 5
+#define REG_MSR 6
+#define REG_SCR 7
+
+#define LSR_DR 0x01 /* Data ready */
+#define LSR_OE 0x02 /* Overrun */
+#define LSR_PE 0x04 /* Parity error */
+#define LSR_FE 0x08 /* Framing error */
+#define LSR_BI 0x10 /* Break */
+#define LSR_THRE 0x20 /* Xmit holding register empty */
+#define LSR_TEMT 0x40 /* Xmitter empty */
+#define LSR_ERR 0x80 /* Error */
+
+#define LCR_DLAB 0x80 /* DLL access */
+
+#define IER_RX 0x01
+#define IER_THRE 0x02
+#define IER_ALL 0x0f
+
+static struct lock uart_lock = LOCK_UNLOCKED;
+static struct dt_node *uart_node;
+static uint32_t uart_base;
+static uint64_t uart_tx_full_time;
+static bool has_irq = false, irq_ok, rx_full, tx_full;
+static uint8_t tx_room;
+static uint8_t cached_ier;
+static void *mmio_uart_base;
+static int uart_console_policy = UART_CONSOLE_OPAL;
+static int lpc_irq = -1;
+
+void uart_set_console_policy(int policy)
+{
+ uart_console_policy = policy;
+}
+
+static void uart_trace(u8 ctx, u8 cnt, u8 irq_state, u8 in_count)
+{
+ union trace t;
+
+ t.uart.ctx = ctx;
+ t.uart.cnt = cnt;
+ t.uart.irq_state = irq_state;
+ t.uart.in_count = cpu_to_be16(in_count);
+ trace_add(&t, TRACE_UART, sizeof(struct trace_uart));
+}
+
+static inline uint8_t uart_read(unsigned int reg)
+{
+ if (mmio_uart_base)
+ return in_8(mmio_uart_base + reg);
+ else
+ return lpc_inb(uart_base + reg);
+}
+
+static inline void uart_write(unsigned int reg, uint8_t val)
+{
+ if (mmio_uart_base)
+ out_8(mmio_uart_base + reg, val);
+ else
+ lpc_outb(val, uart_base + reg);
+}
+
+static bool uart_check_tx_room(void)
+{
+ if (tx_room)
+ return true;
+
+ if (uart_read(REG_LSR) & LSR_THRE) {
+ /* FIFO is 16 entries */
+ tx_room = 16;
+ tx_full = false;
+ return true;
+ }
+
+ return false;
+}
+
+/* Must be called with UART lock held */
+static void uart_write_thr(uint8_t val)
+{
+ uart_write(REG_THR, val);
+
+ tx_room--;
+ if (tx_room == 0) {
+ if (!uart_check_tx_room())
+ uart_tx_full_time = mftb();
+ }
+}
+
+static bool uart_timed_out(unsigned long msecs)
+{
+ if (uart_check_tx_room())
+ return false;
+
+ if (chip_quirk(QUIRK_SLOW_SIM))
+ msecs *= 5;
+
+ if (tb_compare(mftb(), uart_tx_full_time + msecs_to_tb(msecs)) == TB_AAFTERB)
+ return true;
+
+ return false;
+}
+
+static bool uart_wait_tx_room(void)
+{
+ if (uart_check_tx_room())
+ return true;
+
+ smt_lowest();
+ while (!uart_check_tx_room()) {
+ if (uart_timed_out(100)) {
+ smt_medium();
+ return false;
+ }
+ }
+ smt_medium();
+
+ return true;
+}
+
+static void uart_update_ier(void)
+{
+ uint8_t ier = 0;
+
+ if (!has_irq)
+ return;
+
+ /* If we have never got an interrupt, enable them all,
+ * the first interrupt received will tell us if interrupts
+ * are functional (some boards are missing an EC or FPGA
+ * programming causing LPC interrupts not to work).
+ */
+ if (!irq_ok)
+ ier = IER_ALL;
+ if (!rx_full)
+ ier |= IER_RX;
+ if (tx_full)
+ ier |= IER_THRE;
+ if (ier != cached_ier) {
+ uart_write(REG_IER, ier);
+ cached_ier = ier;
+ }
+}
+
+bool uart_enabled(void)
+{
+ return mmio_uart_base || uart_base;
+}
+
+/*
+ * Internal console driver (output only)
+ */
+static size_t uart_con_write(const char *buf, size_t len)
+{
+ size_t written = 0;
+
+ /* If LPC bus is bad, we just swallow data */
+ if (!lpc_ok() && !mmio_uart_base)
+ return len;
+
+ lock(&uart_lock);
+ while (written < len) {
+ if (!uart_wait_tx_room())
+ break;
+
+ uart_write_thr(buf[written++]);
+ }
+
+ if (!written && uart_timed_out(1000)) {
+ unlock(&uart_lock);
+ return len; /* swallow data */
+ }
+
+ unlock(&uart_lock);
+
+ return written;
+}
+
+static struct con_ops uart_con_driver = {
+ .write = uart_con_write,
+};
+
+/*
+ * OPAL console driver
+ */
+
+/*
+ * We implement a simple buffer to buffer input data as some bugs in
+ * Linux make it fail to read fast enough after we get an interrupt.
+ *
+ * We use it on non-interrupt operations as well while at it because
+ * it doesn't cost us much and might help in a few cases where Linux
+ * is calling opal_poll_events() but not actually reading.
+ *
+ * Most of the time I expect we'll flush it completely to Linux into
+ * its tty flip buffers so I don't bother with a ring buffer.
+ */
+#define IN_BUF_SIZE 0x1000
+static uint8_t *in_buf;
+static uint32_t in_count;
+
+/*
+ * We implement a ring buffer for output data as well to speed things
+ * up a bit. This allows us to have interrupt driven sends. This is only
+ * for the output data coming from the OPAL API, not the internal one
+ * which is already bufferred.
+ */
+#define OUT_BUF_SIZE 0x1000
+static uint8_t *out_buf;
+static uint32_t out_buf_prod;
+static uint32_t out_buf_cons;
+
+/* Asynchronous flush, uart_lock must be held */
+static int64_t uart_con_flush(void)
+{
+ bool tx_was_full = tx_full;
+ uint32_t out_buf_cons_initial = out_buf_cons;
+
+ while(out_buf_prod != out_buf_cons) {
+ if (tx_room == 0) {
+ /*
+ * If the interrupt is not functional,
+ * we force a full synchronous flush,
+ * otherwise the Linux console isn't
+ * usable (too slow).
+ */
+ if (irq_ok)
+ uart_check_tx_room();
+ else
+ uart_wait_tx_room();
+ }
+ if (tx_room == 0) {
+ tx_full = true;
+ break;
+ }
+
+ uart_write_thr(out_buf[out_buf_cons++]);
+ out_buf_cons %= OUT_BUF_SIZE;
+ }
+ if (tx_full != tx_was_full)
+ uart_update_ier();
+ if (out_buf_prod != out_buf_cons) {
+ /* Return busy if nothing was flushed this call */
+ if (out_buf_cons == out_buf_cons_initial) {
+ if (uart_timed_out(1000))
+ return OPAL_TIMEOUT;
+ return OPAL_BUSY;
+ }
+ /* Return partial if there's more to flush */
+ return OPAL_PARTIAL;
+ }
+
+ return OPAL_SUCCESS;
+}
+
+static uint32_t uart_tx_buf_space(void)
+{
+ return OUT_BUF_SIZE - 1 -
+ (out_buf_prod + OUT_BUF_SIZE - out_buf_cons) % OUT_BUF_SIZE;
+}
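+
+/*
+ * For example, with OUT_BUF_SIZE = 0x1000, out_buf_prod = 10 and
+ * out_buf_cons = 4090 the ring holds (10 + 0x1000 - 4090) % 0x1000 = 16
+ * bytes, so uart_tx_buf_space() returns 0x1000 - 1 - 16 = 4079. The
+ * "- 1" keeps prod == cons unambiguous as the empty state.
+ */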
+
+static int64_t uart_opal_write(int64_t term_number, __be64 *__length,
+ const uint8_t *buffer)
+{
+ size_t written = 0, len = be64_to_cpu(*__length);
+ int64_t ret = OPAL_SUCCESS;
+
+ if (term_number != 0)
+ return OPAL_PARAMETER;
+
+ lock(&uart_lock);
+
+ /* Copy data to out buffer */
+ while (uart_tx_buf_space() && len--) {
+ out_buf[out_buf_prod++] = *(buffer++);
+ out_buf_prod %= OUT_BUF_SIZE;
+ written++;
+ }
+
+ /* Flush out buffer again */
+ uart_con_flush();
+
+ if (!written && uart_timed_out(1000))
+ ret = OPAL_TIMEOUT;
+ unlock(&uart_lock);
+
+ *__length = cpu_to_be64(written);
+
+ return ret;
+}
+
+static int64_t uart_opal_write_buffer_space(int64_t term_number,
+ __be64 *__length)
+{
+ int64_t ret = OPAL_SUCCESS;
+ int64_t tx_buf_len;
+
+ if (term_number != 0)
+ return OPAL_PARAMETER;
+
+ lock(&uart_lock);
+ tx_buf_len = uart_tx_buf_space();
+
+ if ((tx_buf_len < be64_to_cpu(*__length)) && uart_timed_out(1000))
+ ret = OPAL_TIMEOUT;
+
+ *__length = cpu_to_be64(tx_buf_len);
+ unlock(&uart_lock);
+
+ return ret;
+}
+
+/* Must be called with UART lock held */
+static void uart_read_to_buffer(void)
+{
+ /* As long as there is room in the buffer */
+ while(in_count < IN_BUF_SIZE) {
+ /* Read status register */
+ uint8_t lsr = uart_read(REG_LSR);
+
+ /* Nothing to read ... */
+ if ((lsr & LSR_DR) == 0)
+ break;
+
+ /* Read and add to buffer */
+ in_buf[in_count++] = uart_read(REG_RBR);
+ }
+
+ /* If the buffer is full disable the interrupt */
+ rx_full = (in_count == IN_BUF_SIZE);
+ uart_update_ier();
+}
+
+static void uart_adjust_opal_event(void)
+{
+ if (in_count)
+ opal_update_pending_evt(OPAL_EVENT_CONSOLE_INPUT,
+ OPAL_EVENT_CONSOLE_INPUT);
+ else
+ opal_update_pending_evt(OPAL_EVENT_CONSOLE_INPUT, 0);
+}
+
+/* This is called with the console lock held */
+static int64_t uart_opal_read(int64_t term_number, __be64 *__length,
+ uint8_t *buffer)
+{
+ size_t req_count = be64_to_cpu(*__length), read_cnt = 0;
+ uint8_t lsr = 0;
+
+ if (term_number != 0)
+ return OPAL_PARAMETER;
+ if (!in_buf)
+ return OPAL_INTERNAL_ERROR;
+
+ lock(&uart_lock);
+
+ /* Read from buffer first */
+ if (in_count) {
+ read_cnt = in_count;
+ if (req_count < read_cnt)
+ read_cnt = req_count;
+ memcpy(buffer, in_buf, read_cnt);
+ req_count -= read_cnt;
+ if (in_count != read_cnt)
+ memmove(in_buf, in_buf + read_cnt, in_count - read_cnt);
+ in_count -= read_cnt;
+ }
+
+ /*
+ * If there's still room in the user buffer, read from the UART
+ * directly
+ */
+ while(req_count) {
+ lsr = uart_read(REG_LSR);
+ if ((lsr & LSR_DR) == 0)
+ break;
+ buffer[read_cnt++] = uart_read(REG_RBR);
+ req_count--;
+ }
+
+ /* Finally, flush whatever's left in the UART into our buffer */
+ uart_read_to_buffer();
+
+ uart_trace(TRACE_UART_CTX_READ, read_cnt, tx_full, in_count);
+
+ unlock(&uart_lock);
+
+ /* Adjust the OPAL event */
+ uart_adjust_opal_event();
+
+ *__length = cpu_to_be64(read_cnt);
+ return OPAL_SUCCESS;
+}
+
+static int64_t uart_opal_flush(int64_t term_number)
+{
+ int64_t rc;
+
+ if (term_number != 0)
+ return OPAL_PARAMETER;
+
+ lock(&uart_lock);
+ rc = uart_con_flush();
+ unlock(&uart_lock);
+
+ return rc;
+}
+
+static void __uart_do_poll(u8 trace_ctx)
+{
+ if (!in_buf)
+ return;
+
+ lock(&uart_lock);
+ uart_read_to_buffer();
+ uart_con_flush();
+ uart_trace(trace_ctx, 0, tx_full, in_count);
+ unlock(&uart_lock);
+
+ uart_adjust_opal_event();
+}
+
+static void uart_console_poll(void *data __unused)
+{
+ __uart_do_poll(TRACE_UART_CTX_POLL);
+}
+
+static void uart_irq(uint32_t chip_id __unused, uint32_t irq_mask __unused)
+{
+ if (!irq_ok) {
+ prlog(PR_DEBUG, "UART: IRQ functional !\n");
+ irq_ok = true;
+ }
+ __uart_do_poll(TRACE_UART_CTX_IRQ);
+}
+
+/*
+ * Common setup/inits
+ */
+
+static void uart_setup_os_passthrough(void)
+{
+ char *path;
+
+ static struct lpc_client uart_lpc_os_client = {
+ .reset = NULL,
+ .interrupt = NULL,
+ .interrupts = 0
+ };
+
+ dt_add_property_strings(uart_node, "status", "ok");
+ path = dt_get_path(uart_node);
+ dt_add_property_string(dt_chosen, "linux,stdout-path", path);
+ free(path);
+
+ /* Setup LPC client for OS interrupts */
+ if (lpc_irq >= 0) {
+ uint32_t chip_id = dt_get_chip_id(uart_node);
+ uart_lpc_os_client.interrupts = LPC_IRQ(lpc_irq);
+ lpc_register_client(chip_id, &uart_lpc_os_client,
+ IRQ_ATTR_TARGET_LINUX);
+ }
+ prlog(PR_DEBUG, "UART: Enabled as OS pass-through\n");
+}
+
+static void uart_setup_opal_console(void)
+{
+ static struct lpc_client uart_lpc_opal_client = {
+ .interrupt = uart_irq,
+ };
+
+ /* Add the opal console node */
+ add_opal_console_node(0, "raw", OUT_BUF_SIZE);
+
+ dt_add_property_string(dt_chosen, "linux,stdout-path",
+ "/ibm,opal/consoles/serial@0");
+
+ /*
+ * We mark the UART as reserved since we don't want the
+ * kernel to start using it with its own 8250 driver
+ */
+ dt_add_property_strings(uart_node, "status", "reserved");
+
+ /* Allocate an input buffer */
+ in_buf = zalloc(IN_BUF_SIZE);
+ out_buf = zalloc(OUT_BUF_SIZE);
+
+ /* Setup LPC client for OPAL interrupts */
+ if (lpc_irq >= 0) {
+ uint32_t chip_id = dt_get_chip_id(uart_node);
+ uart_lpc_opal_client.interrupts = LPC_IRQ(lpc_irq);
+ lpc_register_client(chip_id, &uart_lpc_opal_client,
+ IRQ_ATTR_TARGET_OPAL);
+ has_irq = true;
+ }
+
+ /*
+ * If the interrupt is enabled, turn on RX interrupts (and
+	 * only these for now)
+ */
+ tx_full = rx_full = false;
+ uart_update_ier();
+
+ /* Start console poller */
+ opal_add_poller(uart_console_poll, NULL);
+}
+
+static void uart_init_opal_console(void)
+{
+ const char *nv_policy;
+
+ /* Update the policy if the corresponding nvram variable
+ * is present
+ */
+ nv_policy = nvram_query_dangerous("uart-con-policy");
+ if (nv_policy) {
+ if (!strcmp(nv_policy, "opal"))
+ uart_console_policy = UART_CONSOLE_OPAL;
+ else if (!strcmp(nv_policy, "os"))
+ uart_console_policy = UART_CONSOLE_OS;
+ else
+ prlog(PR_WARNING,
+ "UART: Unknown console policy in NVRAM: %s\n",
+ nv_policy);
+ }
+ if (uart_console_policy == UART_CONSOLE_OPAL)
+ uart_setup_opal_console();
+ else
+ uart_setup_os_passthrough();
+}
+
+struct opal_con_ops uart_opal_con = {
+ .name = "OPAL UART console",
+ .init = uart_init_opal_console,
+ .read = uart_opal_read,
+ .write = uart_opal_write,
+ .space = uart_opal_write_buffer_space,
+ .flush = uart_opal_flush,
+};
+
+static bool uart_init_hw(unsigned int speed, unsigned int clock)
+{
+ unsigned int dll = (clock / 16) / speed;
+
+ /* Clear line control */
+ uart_write(REG_LCR, 0x00);
+
+ /* Check if the UART responds */
+ uart_write(REG_IER, 0x01);
+ if (uart_read(REG_IER) != 0x01)
+ goto detect_fail;
+ uart_write(REG_IER, 0x00);
+ if (uart_read(REG_IER) != 0x00)
+ goto detect_fail;
+
+ uart_write(REG_LCR, LCR_DLAB);
+ uart_write(REG_DLL, dll & 0xff);
+ uart_write(REG_DLM, dll >> 8);
+ uart_write(REG_LCR, 0x03); /* 8N1 */
+ uart_write(REG_MCR, 0x03); /* RTS/DTR */
+ uart_write(REG_FCR, 0x07); /* clear & en. fifos */
+
+ /*
+ * On some UART implementations[1], we have observed that characters
+ * written to the UART during early boot (where no RX path is used,
+ * so we don't read from RBR) can cause a character timeout interrupt
+ * once we eventually enable interrupts through the IER. This
+ * interrupt can only be cleared by reading from RBR (even though we've
+ * cleared the RX FIFO!).
+ *
+ * Unfortunately though, the LSR[DR] bit does *not* indicate that there
+ * are characters to be read from RBR, so we may never read it, so the
+ * interrupt continuously fires.
+ *
+ * So, manually clear the timeout interrupt by reading the RBR here.
+ * We discard the read data, but that shouldn't matter as we've just
+ * reset the FIFO anyway.
+ *
+ * 1: seen on the AST2500 SUART. I assume this applies to 2400 too.
+ */
+ uart_read(REG_RBR);
+
+ return true;
+
+ detect_fail:
+ prerror("UART: Presence detect failed !\n");
+ return false;
+}
+
+/*
+ * early_uart_init() is similar to uart_init() in that it configures the skiboot
+ * console log to output via a UART. The main differences are that the early
+ * version only works with MMIO UARTs and does not set up interrupts or locks.
+ */
+void early_uart_init(void)
+{
+ struct dt_node *uart_node;
+ u32 clk, baud;
+
+ uart_node = dt_find_compatible_node(dt_root, NULL, "ns16550");
+ if (!uart_node)
+ return;
+
+ /* Try translate the address, if this fails then it's not a MMIO UART */
+ mmio_uart_base = (void *) dt_translate_address(uart_node, 0, NULL);
+ if (!mmio_uart_base)
+ return;
+
+ clk = dt_prop_get_u32(uart_node, "clock-frequency");
+ baud = dt_prop_get_u32(uart_node, "current-speed");
+
+ if (uart_init_hw(baud, clk)) {
+ set_console(&uart_con_driver);
+ prlog(PR_DEBUG, "UART: Using UART at %p\n", mmio_uart_base);
+ } else {
+ prerror("UART: Early init failed!\n");
+ mmio_uart_base = NULL;
+ }
+}
+
+void uart_init(void)
+{
+ const struct dt_property *prop;
+ struct dt_node *n;
+ char *path __unused;
+ const be32 *irqp;
+
+ /* Clean up after early_uart_init() */
+ mmio_uart_base = NULL;
+
+ /* UART lock is in the console path and thus must block
+ * printf re-entrancy
+ */
+ uart_lock.in_con_path = true;
+
+ /* We support only one */
+ uart_node = n = dt_find_compatible_node(dt_root, NULL, "ns16550");
+ if (!n)
+ return;
+
+ /* Read the interrupts property if any */
+ irqp = dt_prop_get_def(n, "interrupts", NULL);
+
+ /* Now check if the UART is on the root bus. This is the case for
+ * directly mapped UARTs in simulation environments
+ */
+ if (n->parent == dt_root) {
+ printf("UART: Found at root !\n");
+ mmio_uart_base = (void *)dt_translate_address(n, 0, NULL);
+ if (!mmio_uart_base) {
+ printf("UART: Failed to translate address !\n");
+ return;
+ }
+
+ /* If it has an interrupts property, we consider this to be
+ * a direct XICS/XIVE interrupt
+ */
+ if (irqp)
+ has_irq = true;
+
+ } else {
+ if (!lpc_present())
+ return;
+
+ /* Get IO base */
+ prop = dt_find_property(n, "reg");
+ if (!prop) {
+ log_simple_error(&e_info(OPAL_RC_UART_INIT),
+ "UART: Can't find reg property\n");
+ return;
+ }
+ if (dt_property_get_cell(prop, 0) != OPAL_LPC_IO) {
+ log_simple_error(&e_info(OPAL_RC_UART_INIT),
+ "UART: Only supports IO addresses\n");
+ return;
+ }
+ uart_base = dt_property_get_cell(prop, 1);
+
+ if (irqp) {
+ lpc_irq = be32_to_cpu(*irqp);
+ prlog(PR_DEBUG, "UART: Using LPC IRQ %d\n", lpc_irq);
+ }
+ }
+
+
+ if (!uart_init_hw(dt_prop_get_u32(n, "current-speed"),
+ dt_prop_get_u32(n, "clock-frequency"))) {
+ prerror("UART: Initialization failed\n");
+ dt_add_property_strings(n, "status", "bad");
+ return;
+ }
+
+ /*
+ * Mark LPC used by the console (will mark the relevant
+ * locks to avoid deadlocks when flushing the console)
+ */
+ lpc_used_by_console();
+
+ /* Install console backend for printf() */
+ set_console(&uart_con_driver);
+}
+
diff --git a/roms/skiboot/hw/lpc.c b/roms/skiboot/hw/lpc.c
new file mode 100644
index 000000000..bf3ab1fae
--- /dev/null
+++ b/roms/skiboot/hw/lpc.c
@@ -0,0 +1,1407 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Low Pin Count (LPC) Bus.
+ *
+ * Copyright 2013-2019 IBM Corp.
+ */
+
+#define pr_fmt(fmt) "LPC: " fmt
+
+#include <skiboot.h>
+#include <xscom.h>
+#include <io.h>
+#include <lock.h>
+#include <chip.h>
+#include <lpc.h>
+#include <timebase.h>
+#include <errorlog.h>
+#include <opal-api.h>
+#include <platform.h>
+#include <psi.h>
+#include <interrupts.h>
+
+//#define DBG_IRQ(fmt...) prerror(fmt)
+#define DBG_IRQ(fmt...) do { } while(0)
+
+DEFINE_LOG_ENTRY(OPAL_RC_LPC_READ, OPAL_PLATFORM_ERR_EVT, OPAL_LPC,
+ OPAL_MISC_SUBSYSTEM, OPAL_PREDICTIVE_ERR_GENERAL,
+ OPAL_NA);
+
+DEFINE_LOG_ENTRY(OPAL_RC_LPC_WRITE, OPAL_PLATFORM_ERR_EVT, OPAL_LPC,
+ OPAL_MISC_SUBSYSTEM, OPAL_PREDICTIVE_ERR_GENERAL,
+ OPAL_NA);
+
+DEFINE_LOG_ENTRY(OPAL_RC_LPC_SYNC, OPAL_PLATFORM_ERR_EVT, OPAL_LPC,
+ OPAL_MISC_SUBSYSTEM, OPAL_PREDICTIVE_ERR_GENERAL,
+ OPAL_NA);
+
+/* Used exclusively in manufacturing mode */
+DEFINE_LOG_ENTRY(OPAL_RC_LPC_SYNC_PERF, OPAL_PLATFORM_ERR_EVT, OPAL_LPC,
+ OPAL_MISC_SUBSYSTEM, OPAL_UNRECOVERABLE_ERR_DEGRADE_PERF,
+ OPAL_NA);
+
+#define ECCB_CTL 0 /* b0020 -> b00200 */
+#define ECCB_STAT 2 /* b0022 -> b00210 */
+#define ECCB_DATA 3 /* b0023 -> b00218 */
+
+#define ECCB_CTL_MAGIC 0xd000000000000000ul
+#define ECCB_CTL_DATASZ PPC_BITMASK(4,7)
+#define ECCB_CTL_READ PPC_BIT(15)
+#define ECCB_CTL_ADDRLEN PPC_BITMASK(23,25)
+#define ECCB_ADDRLEN_4B 0x4
+#define ECCB_CTL_ADDR PPC_BITMASK(32,63)
+
+#define ECCB_STAT_PIB_ERR PPC_BITMASK(0,5)
+#define ECCB_STAT_RD_DATA PPC_BITMASK(6,37)
+#define ECCB_STAT_BUSY PPC_BIT(44)
+#define ECCB_STAT_ERRORS1 PPC_BITMASK(45,51)
+#define ECCB_STAT_OP_DONE PPC_BIT(52)
+#define ECCB_STAT_ERRORS2 PPC_BITMASK(53,55)
+
+#define ECCB_STAT_ERR_MASK (ECCB_STAT_PIB_ERR | \
+ ECCB_STAT_ERRORS1 | \
+ ECCB_STAT_ERRORS2)
+
+#define ECCB_TIMEOUT 1000000
+
+/* OPB Master LS registers */
+#define OPB_MASTER_LS_IRQ_STAT 0x50
+#define OPB_MASTER_LS_IRQ_MASK 0x54
+#define OPB_MASTER_LS_IRQ_POL 0x58
+#define OPB_MASTER_IRQ_LPC 0x00000800
+
+/* LPC HC registers */
+#define LPC_HC_FW_SEG_IDSEL 0x24
+#define LPC_HC_FW_RD_ACC_SIZE 0x28
+#define LPC_HC_FW_RD_1B 0x00000000
+#define LPC_HC_FW_RD_2B 0x01000000
+#define LPC_HC_FW_RD_4B 0x02000000
+#define LPC_HC_FW_RD_16B 0x04000000
+#define LPC_HC_FW_RD_128B 0x07000000
+#define LPC_HC_IRQSER_CTRL 0x30
+#define LPC_HC_IRQSER_EN 0x80000000
+#define LPC_HC_IRQSER_QMODE 0x40000000
+#define LPC_HC_IRQSER_START_MASK 0x03000000
+#define LPC_HC_IRQSER_START_4CLK 0x00000000
+#define LPC_HC_IRQSER_START_6CLK 0x01000000
+#define LPC_HC_IRQSER_START_8CLK 0x02000000
+#define LPC_HC_IRQSER_AUTO_CLEAR 0x00800000
+#define LPC_HC_IRQMASK 0x34 /* same bit defs as LPC_HC_IRQSTAT */
+#define LPC_HC_IRQSTAT 0x38
+#define LPC_HC_IRQ_SERIRQ0 0x80000000u /* all bits down to ... */
+#define LPC_HC_IRQ_SERIRQ16 0x00008000 /* IRQ16=IOCHK#, IRQ2=SMI# */
+#define LPC_HC_IRQ_SERIRQ_ALL 0xffff8000
+#define LPC_HC_IRQ_LRESET 0x00000400
+#define LPC_HC_IRQ_SYNC_ABNORM_ERR 0x00000080
+#define LPC_HC_IRQ_SYNC_NORESP_ERR 0x00000040
+#define LPC_HC_IRQ_SYNC_NORM_ERR 0x00000020
+#define LPC_HC_IRQ_SYNC_TIMEOUT_ERR 0x00000010
+#define LPC_HC_IRQ_TARG_TAR_ERR 0x00000008
+#define LPC_HC_IRQ_BM_TAR_ERR 0x00000004
+#define LPC_HC_IRQ_BM0_REQ 0x00000002
+#define LPC_HC_IRQ_BM1_REQ 0x00000001
+#define LPC_HC_IRQ_BASE_IRQS ( \
+ LPC_HC_IRQ_LRESET | \
+ LPC_HC_IRQ_SYNC_ABNORM_ERR | \
+ LPC_HC_IRQ_SYNC_NORESP_ERR | \
+ LPC_HC_IRQ_SYNC_NORM_ERR | \
+ LPC_HC_IRQ_SYNC_TIMEOUT_ERR | \
+ LPC_HC_IRQ_TARG_TAR_ERR | \
+ LPC_HC_IRQ_BM_TAR_ERR)
+#define LPC_HC_ERROR_ADDRESS 0x40
+
+#define LPC_NUM_SERIRQ 17
+
+enum {
+ LPC_ROUTE_FREE = 0,
+ LPC_ROUTE_OPAL,
+ LPC_ROUTE_LINUX
+};
+
+struct lpc_error_entry {
+ int64_t rc;
+ const char *description;
+};
+
+struct lpcm {
+ uint32_t chip_id;
+ uint32_t xbase;
+ void *mbase;
+ struct lock lock;
+ uint8_t fw_idsel;
+ uint8_t fw_rdsz;
+ struct list_head clients;
+ bool has_serirq;
+ uint8_t sirq_routes[LPC_NUM_SERIRQ];
+ bool sirq_routed[LPC_NUM_SERIRQ];
+ uint32_t sirq_rmasks[4];
+ uint8_t sirq_ralloc[4];
+ struct dt_node *node;
+};
+
+
+#define LPC_BUS_DEGRADED_PERF_THRESHOLD 5
+
+struct lpc_client_entry {
+ struct list_node node;
+ const struct lpc_client *clt;
+ uint32_t policy;
+};
+
+/* Default LPC bus */
+static int32_t lpc_default_chip_id = -1;
+static bool lpc_irqs_ready;
+
+/*
+ * These are expected to be the same on all chips and should probably
+ * be read (or configured) dynamically. This is how things are configured
+ * today on Tuletta.
+ */
+static uint32_t lpc_io_opb_base = 0xd0010000;
+static uint32_t lpc_mem_opb_base = 0xe0000000;
+static uint32_t lpc_fw_opb_base = 0xf0000000;
+static uint32_t lpc_reg_opb_base = 0xc0012000;
+static uint32_t opb_master_reg_base = 0xc0010000;
+
+static int64_t opb_mmio_write(struct lpcm *lpc, uint32_t addr, uint32_t data,
+ uint32_t sz)
+{
+ switch (sz) {
+ case 1:
+ out_8(lpc->mbase + addr, data);
+ return OPAL_SUCCESS;
+ case 2:
+ out_be16(lpc->mbase + addr, data);
+ return OPAL_SUCCESS;
+ case 4:
+ out_be32(lpc->mbase + addr, data);
+ return OPAL_SUCCESS;
+ }
+ prerror("Invalid data size %d\n", sz);
+ return OPAL_PARAMETER;
+}
+
+static int64_t opb_write(struct lpcm *lpc, uint32_t addr, uint32_t data,
+ uint32_t sz)
+{
+ uint64_t ctl = ECCB_CTL_MAGIC, stat;
+ int64_t rc, tout;
+ uint64_t data_reg;
+
+ if (lpc->mbase)
+ return opb_mmio_write(lpc, addr, data, sz);
+
+ switch(sz) {
+ case 1:
+ data_reg = ((uint64_t)data) << 56;
+ break;
+ case 2:
+ data_reg = ((uint64_t)data) << 48;
+ break;
+ case 4:
+ data_reg = ((uint64_t)data) << 32;
+ break;
+ default:
+ prerror("Invalid data size %d\n", sz);
+ return OPAL_PARAMETER;
+ }
+
+ rc = xscom_write(lpc->chip_id, lpc->xbase + ECCB_DATA, data_reg);
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_LPC_WRITE),
+ "LPC: XSCOM write to ECCB DATA error %lld\n", rc);
+ return rc;
+ }
+
+ ctl = SETFIELD(ECCB_CTL_DATASZ, ctl, sz);
+ ctl = SETFIELD(ECCB_CTL_ADDRLEN, ctl, ECCB_ADDRLEN_4B);
+ ctl = SETFIELD(ECCB_CTL_ADDR, ctl, addr);
+ rc = xscom_write(lpc->chip_id, lpc->xbase + ECCB_CTL, ctl);
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_LPC_WRITE),
+ "LPC: XSCOM write to ECCB CTL error %lld\n", rc);
+ return rc;
+ }
+
+ for (tout = 0; tout < ECCB_TIMEOUT; tout++) {
+ rc = xscom_read(lpc->chip_id, lpc->xbase + ECCB_STAT,
+ &stat);
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_LPC_WRITE),
+ "LPC: XSCOM read from ECCB STAT err %lld\n",
+ rc);
+ return rc;
+ }
+ if (stat & ECCB_STAT_OP_DONE) {
+ if (stat & ECCB_STAT_ERR_MASK) {
+ log_simple_error(&e_info(OPAL_RC_LPC_WRITE),
+ "LPC: Error status: 0x%llx\n", stat);
+ return OPAL_HARDWARE;
+ }
+ return OPAL_SUCCESS;
+ }
+ time_wait_nopoll(100);
+ }
+ log_simple_error(&e_info(OPAL_RC_LPC_WRITE), "LPC: Write timeout !\n");
+ return OPAL_HARDWARE;
+}
+
+static int64_t opb_mmio_read(struct lpcm *lpc, uint32_t addr, uint32_t *data,
+ uint32_t sz)
+{
+ switch (sz) {
+ case 1:
+ *data = in_8(lpc->mbase + addr);
+ return OPAL_SUCCESS;
+ case 2:
+ *data = in_be16(lpc->mbase + addr);
+ return OPAL_SUCCESS;
+ case 4:
+ *data = in_be32(lpc->mbase + addr);
+ return OPAL_SUCCESS;
+ }
+ prerror("Invalid data size %d\n", sz);
+ return OPAL_PARAMETER;
+}
+
+static int64_t opb_read(struct lpcm *lpc, uint32_t addr, uint32_t *data,
+ uint32_t sz)
+{
+ uint64_t ctl = ECCB_CTL_MAGIC | ECCB_CTL_READ, stat;
+ int64_t rc, tout;
+
+ if (lpc->mbase)
+ return opb_mmio_read(lpc, addr, data, sz);
+
+ if (sz != 1 && sz != 2 && sz != 4) {
+ prerror("Invalid data size %d\n", sz);
+ return OPAL_PARAMETER;
+ }
+
+ ctl = SETFIELD(ECCB_CTL_DATASZ, ctl, sz);
+ ctl = SETFIELD(ECCB_CTL_ADDRLEN, ctl, ECCB_ADDRLEN_4B);
+ ctl = SETFIELD(ECCB_CTL_ADDR, ctl, addr);
+ rc = xscom_write(lpc->chip_id, lpc->xbase + ECCB_CTL, ctl);
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_LPC_READ),
+ "LPC: XSCOM write to ECCB CTL error %lld\n", rc);
+ return rc;
+ }
+
+ for (tout = 0; tout < ECCB_TIMEOUT; tout++) {
+ rc = xscom_read(lpc->chip_id, lpc->xbase + ECCB_STAT,
+ &stat);
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_LPC_READ),
+ "LPC: XSCOM read from ECCB STAT err %lld\n",
+ rc);
+ return rc;
+ }
+ if (stat & ECCB_STAT_OP_DONE) {
+ uint32_t rdata = GETFIELD(ECCB_STAT_RD_DATA, stat);
+ if (stat & ECCB_STAT_ERR_MASK) {
+ log_simple_error(&e_info(OPAL_RC_LPC_READ),
+ "LPC: Error status: 0x%llx\n", stat);
+ return OPAL_HARDWARE;
+ }
+ switch(sz) {
+ case 1:
+ *data = rdata >> 24;
+ break;
+ case 2:
+ *data = rdata >> 16;
+ break;
+ default:
+ *data = rdata;
+ break;
+ }
+ return 0;
+ }
+ time_wait_nopoll(100);
+ }
+ log_simple_error(&e_info(OPAL_RC_LPC_READ), "LPC: Read timeout !\n");
+ return OPAL_HARDWARE;
+}
+
+static int64_t lpc_set_fw_idsel(struct lpcm *lpc, uint8_t idsel)
+{
+ uint32_t val;
+ int64_t rc;
+
+ if (idsel == lpc->fw_idsel)
+ return OPAL_SUCCESS;
+ if (idsel > 0xf)
+ return OPAL_PARAMETER;
+
+ rc = opb_read(lpc, lpc_reg_opb_base + LPC_HC_FW_SEG_IDSEL,
+ &val, 4);
+ if (rc) {
+ prerror("Failed to read HC_FW_SEG_IDSEL register !\n");
+ return rc;
+ }
+ val = (val & 0xfffffff0) | idsel;
+ rc = opb_write(lpc, lpc_reg_opb_base + LPC_HC_FW_SEG_IDSEL,
+ val, 4);
+ if (rc) {
+ prerror("Failed to write HC_FW_SEG_IDSEL register !\n");
+ return rc;
+ }
+ lpc->fw_idsel = idsel;
+ return OPAL_SUCCESS;
+}
+
+static int64_t lpc_set_fw_rdsz(struct lpcm *lpc, uint8_t rdsz)
+{
+ uint32_t val;
+ int64_t rc;
+
+ if (rdsz == lpc->fw_rdsz)
+ return OPAL_SUCCESS;
+ switch(rdsz) {
+ case 1:
+ val = LPC_HC_FW_RD_1B;
+ break;
+ case 2:
+ val = LPC_HC_FW_RD_2B;
+ break;
+ case 4:
+ val = LPC_HC_FW_RD_4B;
+ break;
+ default:
+ /*
+ * The HW supports 16 and 128 byte reads via a buffer/cache,
+ * but I have never experimented with it and am not
+ * sure it works the way we expect, so let's leave it
+ * at that for now
+ */
+ return OPAL_PARAMETER;
+ }
+ rc = opb_write(lpc, lpc_reg_opb_base + LPC_HC_FW_RD_ACC_SIZE,
+ val, 4);
+ if (rc) {
+ prerror("Failed to write LPC_HC_FW_RD_ACC_SIZE !\n");
+ return rc;
+ }
+ lpc->fw_rdsz = rdsz;
+ return OPAL_SUCCESS;
+}
+
+static int64_t lpc_opb_prepare(struct lpcm *lpc,
+ enum OpalLPCAddressType addr_type,
+ uint32_t addr, uint32_t sz,
+ uint32_t *opb_base, bool is_write)
+{
+ uint32_t top = addr + sz;
+ uint8_t fw_idsel;
+ int64_t rc;
+
+ /* Address wraparound */
+ if (top < addr)
+ return OPAL_PARAMETER;
+
+ /*
+ * Bound check access and get the OPB base address for
+ * the window corresponding to the access type
+ */
+ switch(addr_type) {
+ case OPAL_LPC_IO:
+ /* IO space is 64K */
+ if (top > 0x10000)
+ return OPAL_PARAMETER;
+ /* And only supports byte accesses */
+ if (sz != 1)
+ return OPAL_PARAMETER;
+ *opb_base = lpc_io_opb_base;
+ break;
+ case OPAL_LPC_MEM:
+ /* MEM space is 256M */
+ if (top > 0x10000000)
+ return OPAL_PARAMETER;
+ /* And only supports byte accesses */
+ if (sz != 1)
+ return OPAL_PARAMETER;
+ *opb_base = lpc_mem_opb_base;
+ break;
+ case OPAL_LPC_FW:
+ /*
+ * FW space is in segments of 256M controlled
+ * by IDSEL, make sure we don't cross segments
+ */
+ *opb_base = lpc_fw_opb_base;
+ fw_idsel = (addr >> 28);
+ if (((top - 1) >> 28) != fw_idsel)
+ return OPAL_PARAMETER;
+
+ /* Set segment */
+ rc = lpc_set_fw_idsel(lpc, fw_idsel);
+ if (rc)
+ return rc;
+ /* Set read access size */
+ if (!is_write) {
+ rc = lpc_set_fw_rdsz(lpc, sz);
+ if (rc)
+ return rc;
+ }
+ break;
+ default:
+ return OPAL_PARAMETER;
+ }
+ return OPAL_SUCCESS;
+}
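+
+/*
+ * Worked example (illustrative): an OPAL_LPC_IO byte access to port
+ * 0x3f8 passes the 64K bound check, selects lpc_io_opb_base and ends
+ * up as an OPB access at 0xd0010000 + 0x3f8 = 0xd00103f8. FW-space
+ * accesses additionally program IDSEL (and the read access size for
+ * reads) before the OPB access is issued.
+ */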
+
+#define LPC_ERROR_IDX(x) (__builtin_ffs(x) - 1 - 2)
+#define LPC_ERROR(_sts, _rc, _description) \
+ [LPC_ERROR_IDX(_sts)] = { _rc, _description }
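+/*
+ * Index math, illustrated: LPC_HC_IRQ_SYNC_TIMEOUT_ERR is 0x00000010,
+ * __builtin_ffs() returns 5, so LPC_ERROR_IDX() yields 5 - 1 - 2 = 2
+ * and that error lands in slot 2 of the table below.
+ */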
+static const struct lpc_error_entry lpc_error_table[] = {
+ LPC_ERROR(LPC_HC_IRQ_BM_TAR_ERR, OPAL_WRONG_STATE, "Got bus master TAR error."),
+ LPC_ERROR(LPC_HC_IRQ_TARG_TAR_ERR, OPAL_WRONG_STATE, "Got abnormal TAR error."),
+ LPC_ERROR(LPC_HC_IRQ_SYNC_TIMEOUT_ERR, OPAL_TIMEOUT, "Got SYNC timeout error."),
+ LPC_ERROR(LPC_HC_IRQ_SYNC_NORM_ERR, OPAL_WRONG_STATE, "Got SYNC normal error."),
+ LPC_ERROR(LPC_HC_IRQ_SYNC_NORESP_ERR, OPAL_HARDWARE, "Got SYNC no-response error."),
+ LPC_ERROR(LPC_HC_IRQ_SYNC_ABNORM_ERR, OPAL_WRONG_STATE, "Got SYNC abnormal error."),
+};
+
+static int64_t lpc_probe_prepare(struct lpcm *lpc)
+{
+ const uint32_t irqmask_addr = lpc_reg_opb_base + LPC_HC_IRQMASK;
+ const uint32_t irqstat_addr = lpc_reg_opb_base + LPC_HC_IRQSTAT;
+ uint32_t irqmask;
+ int rc;
+
+ rc = opb_read(lpc, irqmask_addr, &irqmask, 4);
+ if (rc)
+ return rc;
+
+ irqmask &= ~LPC_HC_IRQ_SYNC_NORESP_ERR;
+ rc = opb_write(lpc, irqmask_addr, irqmask, 4);
+ if (rc)
+ return rc;
+
+ return opb_write(lpc, irqstat_addr, LPC_HC_IRQ_SYNC_NORESP_ERR, 4);
+}
+
+static int64_t lpc_probe_test(struct lpcm *lpc)
+{
+ const uint32_t irqmask_addr = lpc_reg_opb_base + LPC_HC_IRQMASK;
+ const uint32_t irqstat_addr = lpc_reg_opb_base + LPC_HC_IRQSTAT;
+ uint32_t irqmask, irqstat;
+ int64_t idx;
+ int rc;
+
+ rc = opb_read(lpc, irqstat_addr, &irqstat, 4);
+ if (rc)
+ return rc;
+
+ rc = opb_write(lpc, irqstat_addr, LPC_HC_IRQ_SYNC_NORESP_ERR, 4);
+ if (rc)
+ return rc;
+
+ rc = opb_read(lpc, irqmask_addr, &irqmask, 4);
+ if (rc)
+ return rc;
+
+ irqmask |= LPC_HC_IRQ_SYNC_NORESP_ERR;
+ rc = opb_write(lpc, irqmask_addr, irqmask, 4);
+ if (rc)
+ return rc;
+
+ if (!(irqstat & LPC_HC_IRQ_BASE_IRQS))
+ return OPAL_SUCCESS;
+
+ /* Ensure we can perform a valid lookup in the error table */
+ idx = LPC_ERROR_IDX(irqstat);
+ if (idx < 0 || idx >= ARRAY_SIZE(lpc_error_table)) {
+ prerror("LPC bus error translation failed with status 0x%x\n",
+ irqstat);
+ return OPAL_PARAMETER;
+ }
+
+ rc = lpc_error_table[idx].rc;
+ return rc;
+}
+
+static int64_t __lpc_write(struct lpcm *lpc, enum OpalLPCAddressType addr_type,
+ uint32_t addr, uint32_t data, uint32_t sz,
+ bool probe)
+{
+ uint32_t opb_base;
+ int64_t rc;
+
+ lock(&lpc->lock);
+ if (probe) {
+ rc = lpc_probe_prepare(lpc);
+ if (rc)
+ goto bail;
+ }
+
+ /*
+ * Convert to an OPB access and handle LPC HC configuration
+ * for FW accesses (IDSEL)
+ */
+ rc = lpc_opb_prepare(lpc, addr_type, addr, sz, &opb_base, true);
+ if (rc)
+ goto bail;
+
+ /* Perform OPB access */
+ rc = opb_write(lpc, opb_base + addr, data, sz);
+ if (rc)
+ goto bail;
+
+ if (probe)
+ rc = lpc_probe_test(lpc);
+ bail:
+ unlock(&lpc->lock);
+ return rc;
+}
+
+static int64_t __lpc_write_sanity(enum OpalLPCAddressType addr_type,
+ uint32_t addr, uint32_t data, uint32_t sz,
+ bool probe)
+{
+ struct proc_chip *chip;
+
+ if (lpc_default_chip_id < 0)
+ return OPAL_PARAMETER;
+ chip = get_chip(lpc_default_chip_id);
+ if (!chip || !chip->lpc)
+ return OPAL_PARAMETER;
+ return __lpc_write(chip->lpc, addr_type, addr, data, sz, probe);
+}
+
+int64_t lpc_write(enum OpalLPCAddressType addr_type, uint32_t addr,
+ uint32_t data, uint32_t sz)
+{
+ return __lpc_write_sanity(addr_type, addr, data, sz, false);
+}
+
+int64_t lpc_probe_write(enum OpalLPCAddressType addr_type, uint32_t addr,
+ uint32_t data, uint32_t sz)
+{
+ return __lpc_write_sanity(addr_type, addr, data, sz, true);
+}
+
+/*
+ * The "OPAL" variant adds the emulation of 2 and 4 byte accesses using
+ * byte accesses for IO and MEM space in order to be compatible with
+ * existing Linux expectations
+ */
+static int64_t opal_lpc_write(uint32_t chip_id, enum OpalLPCAddressType addr_type,
+ uint32_t addr, uint32_t data, uint32_t sz)
+{
+ struct proc_chip *chip;
+ int64_t rc;
+
+ chip = get_chip(chip_id);
+ if (!chip || !chip->lpc)
+ return OPAL_PARAMETER;
+
+ if (addr_type == OPAL_LPC_FW || sz == 1)
+ return __lpc_write(chip->lpc, addr_type, addr, data, sz, false);
+ while(sz--) {
+ rc = __lpc_write(chip->lpc, addr_type, addr, data & 0xff, 1, false);
+ if (rc)
+ return rc;
+ addr++;
+ data >>= 8;
+ }
+ return OPAL_SUCCESS;
+}
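+
+/*
+ * Illustration of the emulation above: a 4-byte OPAL_LPC_IO write of
+ * 0x11223344 to address 0x80 is issued as four byte writes, 0x44 to
+ * 0x80, 0x33 to 0x81, 0x22 to 0x82 and 0x11 to 0x83, i.e. the least
+ * significant byte goes to the lowest LPC address.
+ */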
+
+static int64_t __lpc_read(struct lpcm *lpc, enum OpalLPCAddressType addr_type,
+ uint32_t addr, uint32_t *data, uint32_t sz,
+ bool probe)
+{
+ uint32_t opb_base;
+ int64_t rc;
+
+ lock(&lpc->lock);
+ if (probe) {
+ rc = lpc_probe_prepare(lpc);
+ if (rc)
+ goto bail;
+ }
+
+ /*
+ * Convert to an OPB access and handle LPC HC configuration
+ * for FW accesses (IDSEL and read size)
+ */
+ rc = lpc_opb_prepare(lpc, addr_type, addr, sz, &opb_base, false);
+ if (rc)
+ goto bail;
+
+ /* Perform OPB access */
+ rc = opb_read(lpc, opb_base + addr, data, sz);
+ if (rc)
+ goto bail;
+
+ if (probe)
+ rc = lpc_probe_test(lpc);
+ bail:
+ unlock(&lpc->lock);
+ return rc;
+}
+
+static int64_t __lpc_read_sanity(enum OpalLPCAddressType addr_type,
+ uint32_t addr, uint32_t *data, uint32_t sz,
+ bool probe)
+{
+ struct proc_chip *chip;
+
+ if (lpc_default_chip_id < 0)
+ return OPAL_PARAMETER;
+ chip = get_chip(lpc_default_chip_id);
+ if (!chip || !chip->lpc)
+ return OPAL_PARAMETER;
+ return __lpc_read(chip->lpc, addr_type, addr, data, sz, probe);
+}
+
+int64_t lpc_read(enum OpalLPCAddressType addr_type, uint32_t addr,
+ uint32_t *data, uint32_t sz)
+{
+ return __lpc_read_sanity(addr_type, addr, data, sz, false);
+}
+
+int64_t lpc_probe_read(enum OpalLPCAddressType addr_type, uint32_t addr,
+ uint32_t *data, uint32_t sz)
+{
+ return __lpc_read_sanity(addr_type, addr, data, sz, true);
+}
+
+/*
+ * The "OPAL" variant adds the emulation of 2 and 4 byte accesses using
+ * byte accesses for IO and MEM space in order to be compatible with
+ * existing Linux expectations
+ */
+static int64_t opal_lpc_read(uint32_t chip_id, enum OpalLPCAddressType addr_type,
+ uint32_t addr, __be32 *data, uint32_t sz)
+{
+ struct proc_chip *chip;
+ int64_t rc;
+ uint32_t tmp;
+
+ chip = get_chip(chip_id);
+ if (!chip || !chip->lpc)
+ return OPAL_PARAMETER;
+
+ if (addr_type == OPAL_LPC_FW) {
+ rc = __lpc_read(chip->lpc, addr_type, addr, &tmp, sz, false);
+ if (rc)
+ return rc;
+
+ } else {
+ tmp = 0;
+ while (sz--) {
+ uint32_t byte;
+
+ rc = __lpc_read(chip->lpc, addr_type, addr, &byte, 1, false);
+ if (rc)
+ return rc;
+ tmp = tmp | (byte << (8 * sz));
+ addr++;
+ }
+ }
+
+ *data = cpu_to_be32(tmp);
+
+ return OPAL_SUCCESS;
+}
+
+bool lpc_present(void)
+{
+ return lpc_default_chip_id >= 0;
+}
+
+/* Called with LPC lock held */
+static void lpc_setup_serirq(struct lpcm *lpc)
+{
+ struct lpc_client_entry *ent;
+ uint32_t mask = LPC_HC_IRQ_BASE_IRQS;
+ int rc;
+
+ if (!lpc_irqs_ready)
+ return;
+
+ /* Collect serirq enable bits */
+ list_for_each(&lpc->clients, ent, node)
+ mask |= ent->clt->interrupts & LPC_HC_IRQ_SERIRQ_ALL;
+
+ rc = opb_write(lpc, lpc_reg_opb_base + LPC_HC_IRQMASK, mask, 4);
+ if (rc) {
+ prerror("Failed to update irq mask\n");
+ return;
+ }
+ DBG_IRQ("IRQ mask set to 0x%08x\n", mask);
+
+ /* Enable the LPC interrupt in the OPB Master */
+ opb_write(lpc, opb_master_reg_base + OPB_MASTER_LS_IRQ_POL, 0, 4);
+ rc = opb_write(lpc, opb_master_reg_base + OPB_MASTER_LS_IRQ_MASK,
+ OPB_MASTER_IRQ_LPC, 4);
+ if (rc)
+ prerror("Failed to enable IRQs in OPB\n");
+
+ /* Check whether we should enable serirq */
+ if (mask & LPC_HC_IRQ_SERIRQ_ALL) {
+ rc = opb_write(lpc, lpc_reg_opb_base + LPC_HC_IRQSER_CTRL,
+ LPC_HC_IRQSER_EN |
+ LPC_HC_IRQSER_START_4CLK |
+ /*
+ * New mode bit for P9N DD2.0 (ignored otherwise)
+ * when set we no longer have to manually clear
+ * the SerIRQs on EOI.
+ */
+ LPC_HC_IRQSER_AUTO_CLEAR, 4);
+ DBG_IRQ("SerIRQ enabled\n");
+ } else {
+ rc = opb_write(lpc, lpc_reg_opb_base + LPC_HC_IRQSER_CTRL,
+ 0, 4);
+ DBG_IRQ("SerIRQ disabled\n");
+ }
+ if (rc)
+ prerror("Failed to configure SerIRQ\n");
+ {
+ u32 val;
+ rc = opb_read(lpc, lpc_reg_opb_base + LPC_HC_IRQMASK, &val, 4);
+ if (rc)
+ prerror("Failed to read back mask\n");
+ else
+ DBG_IRQ("MASK READBACK=%x\n", val);
+
+ rc = opb_read(lpc, lpc_reg_opb_base + LPC_HC_IRQSER_CTRL,
+ &val, 4);
+ if (rc)
+ prerror("Failed to read back ctrl\n");
+ else
+ DBG_IRQ("CTRL READBACK=%x\n", val);
+ }
+}
+
+static void lpc_route_serirq(struct lpcm *lpc, uint32_t sirq,
+ uint32_t psi_idx)
+{
+ uint32_t reg, shift, val, psi_old;
+ int64_t rc;
+
+ psi_old = lpc->sirq_routes[sirq];
+ lpc->sirq_rmasks[psi_old] &= ~(LPC_HC_IRQ_SERIRQ0 >> sirq);
+ lpc->sirq_rmasks[psi_idx] |= (LPC_HC_IRQ_SERIRQ0 >> sirq);
+ lpc->sirq_routes[sirq] = psi_idx;
+ lpc->sirq_routed[sirq] = true;
+
+ /* We may not be ready yet ... */
+ if (!lpc->has_serirq)
+ return;
+
+ if (sirq < 14) {
+ reg = 0xc;
+ shift = 4 + (sirq << 1);
+ } else {
+ reg = 0x8;
+ shift = 8 + ((sirq - 14) << 1);
+ }
+ shift = 30-shift;
+ rc = opb_read(lpc, opb_master_reg_base + reg, &val, 4);
+ if (rc)
+ return;
+ val = val & ~(3 << shift);
+ val |= (psi_idx & 3) << shift;
+ opb_write(lpc, opb_master_reg_base + reg, val, 4);
+}
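+
+/*
+ * Worked example of the register/shift math above (illustrative):
+ * SerIRQ 5 lives in OPB master register 0xc with shift = 30 - (4 + 10)
+ * = 16, so its 2-bit PSI route occupies bits 16-17; SerIRQ 15 lives in
+ * register 0x8 with shift = 30 - (8 + 2) = 20, i.e. bits 20-21.
+ */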
+
+static void lpc_alloc_route(struct lpcm *lpc, unsigned int irq,
+ unsigned int policy)
+{
+ unsigned int i, r, c;
+ int route = -1;
+
+ if (policy == IRQ_ATTR_TARGET_OPAL)
+ r = LPC_ROUTE_OPAL;
+ else
+ r = LPC_ROUTE_LINUX;
+
+ prlog(PR_DEBUG, "Routing irq %d, policy: %d (r=%d)\n",
+ irq, policy, r);
+
+ /* Are we already routed ? */
+ if (lpc->sirq_routed[irq] &&
+ r != lpc->sirq_ralloc[lpc->sirq_routes[irq]]) {
+ prerror("irq %d has conflicting policies\n", irq);
+ return;
+ }
+
+ /* First try to find a free route. Leave one for another
+ * policy though
+ */
+ for (i = 0, c = 0; i < 4; i++) {
+ /* Count routes with identical policy */
+ if (lpc->sirq_ralloc[i] == r)
+ c++;
+
+ /* Use the route if it's free and there are no more
+ * than 3 existing routes with that policy
+ */
+ if (lpc->sirq_ralloc[i] == LPC_ROUTE_FREE && c < 4) {
+ lpc->sirq_ralloc[i] = r;
+ route = i;
+ break;
+ }
+ }
+
+ /* If we couldn't get a free one, try to find an existing one
+ * with a matching policy
+ */
+ for (i = 0; route < 0 && i < 4; i++) {
+ if (lpc->sirq_ralloc[i] == r)
+ route = i;
+ }
+
+ /* Still no route ? bail. That should never happen */
+ if (route < 0) {
+ prerror("Can't find a route for irq %d\n", irq);
+ return;
+ }
+
+ /* Program route */
+ lpc_route_serirq(lpc, irq, route);
+
+ prlog(PR_DEBUG, "SerIRQ %d using route %d targeted at %s\n",
+ irq, route, r == LPC_ROUTE_LINUX ? "OS" : "OPAL");
+}
+
+unsigned int lpc_get_irq_policy(uint32_t chip_id, uint32_t psi_idx)
+{
+ struct proc_chip *c = get_chip(chip_id);
+
+ if (!c || !c->lpc)
+ return IRQ_ATTR_TARGET_LINUX;
+
+ if (c->lpc->sirq_ralloc[psi_idx] == LPC_ROUTE_LINUX)
+ return IRQ_ATTR_TARGET_LINUX;
+ else
+ return IRQ_ATTR_TARGET_OPAL | IRQ_ATTR_TYPE_LSI;
+}
+
+static void lpc_create_int_map(struct lpcm *lpc, struct dt_node *psi_node)
+{
+ __be32 map[LPC_NUM_SERIRQ * 5], *pmap;
+ uint32_t i;
+
+ if (!psi_node)
+ return;
+ pmap = map;
+ for (i = 0; i < LPC_NUM_SERIRQ; i++) {
+ if (!lpc->sirq_routed[i])
+ continue;
+ *(pmap++) = 0;
+ *(pmap++) = 0;
+ *(pmap++) = cpu_to_be32(i);
+ *(pmap++) = cpu_to_be32(psi_node->phandle);
+ *(pmap++) = cpu_to_be32(lpc->sirq_routes[i] + P9_PSI_IRQ_LPC_SIRQ0);
+ }
+ if (pmap == map)
+ return;
+ dt_add_property(lpc->node, "interrupt-map", map,
+ (pmap - map) * sizeof(uint32_t));
+ dt_add_property_cells(lpc->node, "interrupt-map-mask", 0, 0, 0xff);
+ dt_add_property_cells(lpc->node, "#interrupt-cells", 1);
+}
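+
+/*
+ * Illustrative shape of one interrupt-map entry built above: SerIRQ 4
+ * routed through PSI route 1 becomes the five cells
+ * <0 0 4 psi_phandle (P9_PSI_IRQ_LPC_SIRQ0 + 1)>, and the
+ * interrupt-map-mask of <0 0 0xff> means only the SerIRQ number is
+ * matched when the OS looks up the parent interrupt.
+ */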
+
+void lpc_finalize_interrupts(void)
+{
+ struct proc_chip *chip;
+
+ lpc_irqs_ready = true;
+
+ for_each_chip(chip) {
+ if (chip->lpc && chip->psi &&
+ (chip->type == PROC_CHIP_P9_NIMBUS ||
+ chip->type == PROC_CHIP_P9_CUMULUS ||
+ chip->type == PROC_CHIP_P9P ||
+ chip->type == PROC_CHIP_P10))
+ lpc_create_int_map(chip->lpc, chip->psi->node);
+ }
+}
+
+static void lpc_init_interrupts_one(struct proc_chip *chip)
+{
+ struct lpcm *lpc = chip->lpc;
+ int i, rc;
+
+ lock(&lpc->lock);
+
+ /* First mask them all */
+ rc = opb_write(lpc, lpc_reg_opb_base + LPC_HC_IRQMASK, 0, 4);
+ if (rc) {
+ prerror("Failed to init interrupts\n");
+ goto bail;
+ }
+
+ switch(chip->type) {
+ case PROC_CHIP_P8_MURANO:
+ case PROC_CHIP_P8_VENICE:
+ /* On Murano/Venice, there is no SerIRQ, so only enable error
+ * interrupts
+ */
+ rc = opb_write(lpc, lpc_reg_opb_base + LPC_HC_IRQMASK,
+ LPC_HC_IRQ_BASE_IRQS, 4);
+ if (rc) {
+ prerror("Failed to set interrupt mask\n");
+ goto bail;
+ }
+ opb_write(lpc, lpc_reg_opb_base + LPC_HC_IRQSER_CTRL, 0, 4);
+ break;
+ case PROC_CHIP_P8_NAPLES:
+ /* On Naples, we support LPC interrupts, enable them based
+ * on what clients request. This will set up the mask and
+ * enable processing
+ */
+ lpc->has_serirq = true;
+ lpc_setup_serirq(lpc);
+ break;
+ case PROC_CHIP_P9_NIMBUS:
+ case PROC_CHIP_P9_CUMULUS:
+ case PROC_CHIP_P9P:
+ case PROC_CHIP_P10:
+ /* On P9, we additionally setup the routing. */
+ lpc->has_serirq = true;
+ for (i = 0; i < LPC_NUM_SERIRQ; i++) {
+ if (lpc->sirq_routed[i])
+ lpc_route_serirq(lpc, i, lpc->sirq_routes[i]);
+ }
+ lpc_setup_serirq(lpc);
+ break;
+ default:
+ ;
+ }
+ bail:
+ unlock(&lpc->lock);
+}
+
+void lpc_init_interrupts(void)
+{
+ struct proc_chip *chip;
+
+ lpc_irqs_ready = true;
+
+ for_each_chip(chip) {
+ if (chip->lpc)
+ lpc_init_interrupts_one(chip);
+ }
+}
+
+static void lpc_dispatch_reset(struct lpcm *lpc)
+{
+ struct lpc_client_entry *ent;
+
+ /* XXX We are going to hit this repeatedly while reset is
+ * asserted which might be sub-optimal. We should instead
+ * detect assertion and start a poller that will wait for
+ * de-assertion. We could notify clients of LPC being
+ * on/off rather than just reset
+ */
+
+ prerror("Got LPC reset on chip 0x%x !\n", lpc->chip_id);
+
+ /* Collect serirq enable bits */
+ list_for_each(&lpc->clients, ent, node) {
+ if (!ent->clt->reset)
+ continue;
+ unlock(&lpc->lock);
+ ent->clt->reset(lpc->chip_id);
+ lock(&lpc->lock);
+ }
+
+ /* Reconfigure serial interrupts */
+ if (lpc->has_serirq)
+ lpc_setup_serirq(lpc);
+}
+
+static void lpc_dispatch_err_irqs(struct lpcm *lpc, uint32_t irqs)
+{
+ const struct lpc_error_entry *err;
+ static int lpc_bus_err_count;
+ struct opal_err_info *info;
+ uint32_t addr;
+ int64_t idx;
+ int rc;
+
+ /* Write back to clear error interrupts; we clear SerIRQ later
+ * as they are handled as level interrupts
+ */
+ rc = opb_write(lpc, lpc_reg_opb_base + LPC_HC_IRQSTAT,
+ LPC_HC_IRQ_BASE_IRQS, 4);
+ if (rc)
+ prerror("Failed to clear IRQ error latches !\n");
+
+ if (irqs & LPC_HC_IRQ_LRESET) {
+ lpc_dispatch_reset(lpc);
+ return;
+ }
+
+ /* Ensure we can perform a valid lookup in the error table */
+ idx = LPC_ERROR_IDX(irqs);
+ if (idx < 0 || idx >= ARRAY_SIZE(lpc_error_table)) {
+ prerror("LPC bus error translation failed with status 0x%x\n",
+ irqs);
+ return;
+ }
+
+ /* Find and report the error */
+ err = &lpc_error_table[idx];
+ lpc_bus_err_count++;
+ if (manufacturing_mode && (lpc_bus_err_count > LPC_BUS_DEGRADED_PERF_THRESHOLD))
+ info = &e_info(OPAL_RC_LPC_SYNC_PERF);
+ else
+ info = &e_info(OPAL_RC_LPC_SYNC);
+
+ rc = opb_read(lpc, lpc_reg_opb_base + LPC_HC_ERROR_ADDRESS, &addr, 4);
+ if (rc)
+ log_simple_error(info, "LPC[%03x]: %s "
+ "Error reading error address register\n",
+ lpc->chip_id, err->description);
+ else
+ log_simple_error(info, "LPC[%03x]: %s Error address reg: "
+ "0x%08x\n",
+ lpc->chip_id, err->description, addr);
+}
+
+static void lpc_dispatch_ser_irqs(struct lpcm *lpc, uint32_t irqs,
+ bool clear_latch)
+{
+ struct lpc_client_entry *ent;
+ uint32_t cirqs;
+ int rc;
+
+ irqs &= LPC_HC_IRQ_SERIRQ_ALL;
+
+ /* Collect serirq enable bits */
+ list_for_each(&lpc->clients, ent, node) {
+ if (!ent->clt->interrupt)
+ continue;
+ cirqs = ent->clt->interrupts & irqs;
+ if (cirqs) {
+ unlock(&lpc->lock);
+ ent->clt->interrupt(lpc->chip_id, cirqs);
+ lock(&lpc->lock);
+ }
+ }
+
+ /* Our SerIRQs are level sensitive; we clear the latch after
+ * we call the handler.
+ */
+ if (!clear_latch)
+ return;
+
+ rc = opb_write(lpc, lpc_reg_opb_base + LPC_HC_IRQSTAT, irqs, 4);
+ if (rc)
+ prerror("Failed to clear SerIRQ latches !\n");
+}
+
+void lpc_interrupt(uint32_t chip_id)
+{
+ struct proc_chip *chip = get_chip(chip_id);
+ struct lpcm *lpc;
+ uint32_t irqs, opb_irqs;
+ int rc;
+
+ /* No initialized LPC controller on that chip */
+ if (!chip || !chip->lpc)
+ return;
+ lpc = chip->lpc;
+
+ lock(&lpc->lock);
+
+ /* Grab OPB Master LS interrupt status */
+ rc = opb_read(lpc, opb_master_reg_base + OPB_MASTER_LS_IRQ_STAT,
+ &opb_irqs, 4);
+ if (rc) {
+ prerror("Failed to read OPB IRQ state\n");
+ unlock(&lpc->lock);
+ return;
+ }
+
+ DBG_IRQ("OPB IRQ on chip 0x%x, oirqs=0x%08x\n", chip_id, opb_irqs);
+
+ /* Check if it's an LPC interrupt */
+ if (!(opb_irqs & OPB_MASTER_IRQ_LPC)) {
+ /* Something we don't support ? Ack it anyway... */
+ goto bail;
+ }
+
+ /* Handle the lpc interrupt source (errors etc...) */
+ rc = opb_read(lpc, lpc_reg_opb_base + LPC_HC_IRQSTAT, &irqs, 4);
+ if (rc) {
+ prerror("Failed to read LPC IRQ state\n");
+ goto bail;
+ }
+
+ DBG_IRQ("LPC IRQ on chip 0x%x, irqs=0x%08x\n", chip_id, irqs);
+
+ /* Handle error interrupts */
+ if (irqs & LPC_HC_IRQ_BASE_IRQS)
+ lpc_dispatch_err_irqs(lpc, irqs);
+
+ /* Handle SerIRQ interrupts */
+ if (irqs & LPC_HC_IRQ_SERIRQ_ALL)
+ lpc_dispatch_ser_irqs(lpc, irqs, true);
+ bail:
+ /* Ack it at the OPB level */
+ opb_write(lpc, opb_master_reg_base + OPB_MASTER_LS_IRQ_STAT,
+ opb_irqs, 4);
+ unlock(&lpc->lock);
+}
+
+void lpc_serirq(uint32_t chip_id, uint32_t index)
+{
+ struct proc_chip *chip = get_chip(chip_id);
+ struct lpcm *lpc;
+ uint32_t irqs, rmask;
+ int rc;
+
+ /* No initialized LPC controller on that chip */
+ if (!chip || !chip->lpc)
+ return;
+ lpc = chip->lpc;
+
+ lock(&lpc->lock);
+
+ /* Handle the lpc interrupt source (errors etc...) */
+ rc = opb_read(lpc, lpc_reg_opb_base + LPC_HC_IRQSTAT, &irqs, 4);
+ if (rc) {
+ prerror("Failed to read LPC IRQ state\n");
+ goto bail;
+ }
+ rmask = lpc->sirq_rmasks[index];
+
+ DBG_IRQ("IRQ on chip 0x%x, irqs=0x%08x rmask=0x%08x\n",
+ chip_id, irqs, rmask);
+ irqs &= rmask;
+
+ /*
+ * Handle SerIRQ interrupts. Don't clear the latch,
+ * it will be done in our special EOI callback if
+ * necessary on DD1
+ */
+ if (irqs)
+ lpc_dispatch_ser_irqs(lpc, irqs, false);
+
+ bail:
+ unlock(&lpc->lock);
+}
+
+void lpc_all_interrupts(uint32_t chip_id)
+{
+ struct proc_chip *chip = get_chip(chip_id);
+ struct lpcm *lpc;
+
+ /* No initialized LPC controller on that chip */
+ if (!chip || !chip->lpc)
+ return;
+ lpc = chip->lpc;
+
+ /* Dispatch all */
+ lock(&lpc->lock);
+ lpc_dispatch_ser_irqs(lpc, LPC_HC_IRQ_SERIRQ_ALL, false);
+ unlock(&lpc->lock);
+}
+
+static void lpc_init_chip_p8(struct dt_node *xn)
+{
+ uint32_t gcid = dt_get_chip_id(xn);
+ struct proc_chip *chip;
+ struct lpcm *lpc;
+
+ chip = get_chip(gcid);
+ assert(chip);
+
+ lpc = zalloc(sizeof(struct lpcm));
+ assert(lpc);
+ lpc->chip_id = gcid;
+ lpc->xbase = dt_get_address(xn, 0, NULL);
+ lpc->fw_idsel = 0xff;
+ lpc->fw_rdsz = 0xff;
+ lpc->node = xn;
+ list_head_init(&lpc->clients);
+ init_lock(&lpc->lock);
+
+ if (lpc_default_chip_id < 0 ||
+ dt_has_node_property(xn, "primary", NULL)) {
+ lpc_default_chip_id = gcid;
+ }
+
+ /* Mask all interrupts for now */
+ opb_write(lpc, lpc_reg_opb_base + LPC_HC_IRQMASK, 0, 4);
+
+ printf("LPC[%03x]: Initialized, access via XSCOM @0x%x\n",
+ gcid, lpc->xbase);
+
+ dt_add_property(xn, "interrupt-controller", NULL, 0);
+ dt_add_property_cells(xn, "#interrupt-cells", 1);
+ assert(dt_prop_get_u32(xn, "#address-cells") == 2);
+
+ chip->lpc = lpc;
+}
+
+static void lpc_init_chip_p9(struct dt_node *opb_node)
+{
+ uint32_t gcid = dt_get_chip_id(opb_node);
+ struct dt_node *lpc_node;
+ struct proc_chip *chip;
+ struct lpcm *lpc;
+ u64 addr;
+ u32 val;
+
+ chip = get_chip(gcid);
+ assert(chip);
+
+ /* Grab OPB base address */
+ addr = dt_prop_get_cell(opb_node, "ranges", 1);
+ addr <<= 32;
+ addr |= dt_prop_get_cell(opb_node, "ranges", 2);
+
+ /* Find the "lpc" child node */
+ lpc_node = dt_find_compatible_node(opb_node, NULL, "ibm,power9-lpc");
+ if (!lpc_node)
+ return;
+
+ lpc = zalloc(sizeof(struct lpcm));
+ assert(lpc);
+ lpc->chip_id = gcid;
+ lpc->mbase = (void *)addr;
+ lpc->fw_idsel = 0xff;
+ lpc->fw_rdsz = 0xff;
+ lpc->node = lpc_node;
+ list_head_init(&lpc->clients);
+ init_lock(&lpc->lock);
+
+ if (lpc_default_chip_id < 0 ||
+ dt_has_node_property(opb_node, "primary", NULL)) {
+ lpc_default_chip_id = gcid;
+ }
+
+ /* Mask all interrupts for now */
+ opb_write(lpc, lpc_reg_opb_base + LPC_HC_IRQMASK, 0, 4);
+
+ /* Clear any stale LPC bus errors */
+ opb_write(lpc, lpc_reg_opb_base + LPC_HC_IRQSTAT,
+ LPC_HC_IRQ_BASE_IRQS, 4);
+
+ /* Default the routing to PSI SerIRQ 0; this will be updated
+ * later when interrupts are initialized.
+ */
+ opb_read(lpc, opb_master_reg_base + 8, &val, 4);
+ val &= 0xff03ffff;
+ opb_write(lpc, opb_master_reg_base + 8, val, 4);
+ opb_read(lpc, opb_master_reg_base + 0xc, &val, 4);
+ val &= 0xf0000000;
+ opb_write(lpc, opb_master_reg_base + 0xc, val, 4);
+
+ prlog(PR_INFO, "LPC[%03x]: Initialized\n", gcid);
+ prlog(PR_DEBUG, "access via MMIO @%p\n", lpc->mbase);
+
+ chip->lpc = lpc;
+}
+
+void lpc_init(void)
+{
+ struct dt_node *xn;
+ bool has_lpc = false;
+
+ /* Look for P9 first as the DT is compatible for both P8 and P9 */
+ dt_for_each_compatible(dt_root, xn, "ibm,power9-lpcm-opb") {
+ lpc_init_chip_p9(xn);
+ has_lpc = true;
+ }
+
+ if (!has_lpc) {
+ dt_for_each_compatible(dt_root, xn, "ibm,power8-lpc") {
+ lpc_init_chip_p8(xn);
+ has_lpc = true;
+ }
+ }
+ if (lpc_default_chip_id >= 0)
+ prlog(PR_DEBUG, "Default bus on chip 0x%x\n",
+ lpc_default_chip_id);
+
+ if (has_lpc) {
+ opal_register(OPAL_LPC_WRITE, opal_lpc_write, 5);
+ opal_register(OPAL_LPC_READ, opal_lpc_read, 5);
+ }
+}
+
+void lpc_used_by_console(void)
+{
+ struct proc_chip *chip;
+
+ xscom_used_by_console();
+
+ for_each_chip(chip) {
+ struct lpcm *lpc = chip->lpc;
+ if (lpc) {
+ lpc->lock.in_con_path = true;
+ lock(&lpc->lock);
+ unlock(&lpc->lock);
+ }
+ }
+}
+
+bool lpc_ok(void)
+{
+ struct proc_chip *chip;
+
+ if (lpc_default_chip_id < 0)
+ return false;
+ if (!xscom_ok())
+ return false;
+ chip = get_chip(lpc_default_chip_id);
+ if (!chip->lpc)
+ return false;
+ return !lock_held_by_me(&chip->lpc->lock);
+}
+
+void lpc_register_client(uint32_t chip_id,
+ const struct lpc_client *clt,
+ uint32_t policy)
+{
+ struct lpc_client_entry *ent;
+ struct proc_chip *chip;
+ struct lpcm *lpc;
+ bool has_routes;
+
+ chip = get_chip(chip_id);
+ assert(chip);
+ lpc = chip->lpc;
+ if (!lpc) {
+ prerror("Attempt to register client on bad chip 0x%x\n",
+ chip_id);
+ return;
+ }
+
+ has_routes =
+ chip->type == PROC_CHIP_P9_NIMBUS ||
+ chip->type == PROC_CHIP_P9_CUMULUS ||
+ chip->type == PROC_CHIP_P9P ||
+ chip->type == PROC_CHIP_P10;
+
+ if (policy != IRQ_ATTR_TARGET_OPAL && !has_routes) {
+ prerror("Chip doesn't support OS interrupt policy\n");
+ return;
+ }
+
+ ent = malloc(sizeof(*ent));
+ assert(ent);
+ ent->clt = clt;
+ ent->policy = policy;
+ lock(&lpc->lock);
+ list_add(&lpc->clients, &ent->node);
+
+ if (has_routes) {
+ unsigned int i;
+ for (i = 0; i < LPC_NUM_SERIRQ; i++)
+ if (clt->interrupts & LPC_IRQ(i))
+ lpc_alloc_route(lpc, i, policy);
+ }
+
+ if (lpc->has_serirq)
+ lpc_setup_serirq(lpc);
+ unlock(&lpc->lock);
+}
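+
+/*
+ * Usage sketch (illustrative): the UART code earlier in this series
+ * registers a client with interrupts = LPC_IRQ(lpc_irq) and a policy
+ * of IRQ_ATTR_TARGET_OPAL or IRQ_ATTR_TARGET_LINUX; on chips with
+ * SerIRQ routing this goes through lpc_alloc_route() above so the
+ * interrupt is steered to OPAL or the OS via one of the four PSI
+ * routes.
+ */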
diff --git a/roms/skiboot/hw/npu-hw-procedures.c b/roms/skiboot/hw/npu-hw-procedures.c
new file mode 100644
index 000000000..91bbb0f15
--- /dev/null
+++ b/roms/skiboot/hw/npu-hw-procedures.c
@@ -0,0 +1,608 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * NPU (NVLink1, POWER8NVL) Hardware Procedures
+ *
+ * Copyright 2013-2019 IBM Corp.
+ */
+
+#include <skiboot.h>
+#include <io.h>
+#include <timebase.h>
+#include <pci.h>
+#include <pci-virt.h>
+#include <interrupts.h>
+#include <npu-regs.h>
+#include <npu.h>
+#include <xscom.h>
+
+typedef uint32_t (*step)(struct npu_dev *);
+
+struct procedure {
+ const char *name;
+ step steps[];
+};
+
+#define DEFINE_PROCEDURE(NAME, STEPS...) \
+ static struct procedure procedure_##NAME = \
+ {.name = #NAME, .steps = {NAME, ##STEPS}}
+
+#define PROCEDURE_INPROGRESS (1 << 31)
+#define PROCEDURE_COMPLETE (1 << 30)
+#define PROCEDURE_NEXT (1 << 29)
+#define PROCEDURE_FAILED 2
+#define PROCEDURE_ABORTED 3
+#define PROCEDURE_UNSUPPORTED 4
+
+/* Mask defining which status bits we want to expose */
+#define PROCEDURE_STATUS_MASK 0xc000000f
+
+/* Accessors for PHY registers. These can go either via MMIO or SCOM. */
+static bool pl_use_scom = true;
+static void phy_write(struct npu_dev *npu_dev, uint64_t addr, uint32_t val)
+{
+ if (pl_use_scom)
+ xscom_write(npu_dev->npu->chip_id, npu_dev->pl_xscom_base | addr, val);
+ else
+ out_be16((void *) npu_dev->pl_base + PL_MMIO_ADDR(addr), val);
+}
+
+static uint16_t phy_read(struct npu_dev *npu_dev, uint64_t addr)
+{
+ uint64_t val;
+
+ if (pl_use_scom)
+ xscom_read(npu_dev->npu->chip_id, npu_dev->pl_xscom_base + addr, &val);
+ else
+ val = in_be16((void *) npu_dev->pl_base + PL_MMIO_ADDR(addr));
+
+ return val & 0xffff;
+}
+
+/* The DL registers can be accessed indirectly via the NTL */
+static void dl_write(struct npu_dev *npu_dev, uint32_t addr, uint32_t val)
+{
+ xscom_write(npu_dev->npu->chip_id,
+ npu_dev->xscom + NX_DL_REG_ADDR, addr);
+ xscom_write(npu_dev->npu->chip_id,
+ npu_dev->xscom + NX_DL_REG_DATA, val);
+}
+
+static uint64_t __unused dl_read(struct npu_dev *npu_dev, uint32_t addr)
+{
+ uint64_t val;
+
+ xscom_write(npu_dev->npu->chip_id,
+ npu_dev->xscom + NX_DL_REG_ADDR, addr);
+ xscom_read(npu_dev->npu->chip_id,
+ npu_dev->xscom + NX_DL_REG_DATA, &val);
+ return val;
+}
+
+/* Our hardware bits are backwards here. The lane vectors are 16-bit
+ * values represented in IBM bit ordering. This means lane 0 is
+ * represented by bit 15 in most of the registers. Internally we keep
+ * this sane (ie. npu_dev->lane_mask[0] == lane 0) as we need sane
+ * numbering for set_lane_reg() anyway. */
+static uint32_t phy_lane_mask(struct npu_dev *npu_dev)
+{
+ /* We only train 8 lanes at a time so we don't do a full
+ * bit-swap */
+ assert(npu_dev->lane_mask == 0xff00 || npu_dev->lane_mask == 0xff);
+
+ return ~npu_dev->lane_mask & 0xffff;
+}
+
+static void set_lane_reg(struct npu_dev *npu_dev, uint64_t base_reg,
+ uint64_t data, uint64_t mask)
+{
+ uint64_t val, i;
+ uint32_t lane_mask = npu_dev->lane_mask;
+
+ for (i = 0; i <= 23; i++) {
+ if (lane_mask & (1ul << i)) {
+ uint64_t tx_rxcal_reg = base_reg + (i << 32);
+ val = phy_read(npu_dev, tx_rxcal_reg);
+ val = (val & ~mask) | data;
+ phy_write(npu_dev, tx_rxcal_reg, val);
+ }
+ }
+}
+
+static uint32_t stop(struct npu_dev *npu_dev __unused)
+{
+ return PROCEDURE_COMPLETE | PROCEDURE_ABORTED;
+}
+DEFINE_PROCEDURE(stop);
+
+static uint32_t nop(struct npu_dev *npu_dev __unused)
+{
+ return PROCEDURE_COMPLETE;
+}
+DEFINE_PROCEDURE(nop);
+
+/* Procedure 1.2.1 (RESET_NPU_DL) from opt_programmerguide.odt. Also
+ * incorporates AT reset. */
+static uint32_t reset_npu_dl(struct npu_dev *npu_dev)
+{
+ uint64_t val;
+
+ /* Assert NPU reset */
+ xscom_read(npu_dev->npu->chip_id, npu_dev->xscom + NX_NTL_CONTROL, &val);
+ val |= NTL_CONTROL_RESET;
+ xscom_write(npu_dev->npu->chip_id, npu_dev->xscom + NX_NTL_CONTROL, val);
+
+ /* Put the Nvidia logic in reset */
+ dl_write(npu_dev, NDL_CONTROL, 0xe8000000);
+
+ /* Release Nvidia logic from reset */
+ dl_write(npu_dev, NDL_CONTROL, 0);
+
+ /* Release NPU from reset */
+ val &= ~NTL_CONTROL_RESET;
+ xscom_write(npu_dev->npu->chip_id, npu_dev->xscom + NX_NTL_CONTROL, val);
+
+ /* Set up TL credits */
+ xscom_write(npu_dev->npu->chip_id, npu_dev->xscom + NX_TL_CMD_CR, PPC_BIT(0));
+ xscom_write(npu_dev->npu->chip_id, npu_dev->xscom + NX_TL_CMD_D_CR, PPC_BIT(0));
+ xscom_write(npu_dev->npu->chip_id, npu_dev->xscom + NX_TL_RSP_CR, PPC_BIT(15));
+ xscom_write(npu_dev->npu->chip_id, npu_dev->xscom + NX_TL_RSP_D_CR, PPC_BIT(15));
+
+ /* Reset error registers. TODO: are there more we should clear here? */
+ npu_ioda_sel(npu_dev->npu, NPU_IODA_TBL_PESTB, 0, true);
+ for (val = 0; val < NPU_NUM_OF_PES; val++)
+ out_be64(npu_dev->npu->at_regs + NPU_IODA_DATA0, 0);
+
+ return PROCEDURE_COMPLETE;
+}
+DEFINE_PROCEDURE(reset_npu_dl);
+
+/* Procedures 1.2.3 (reset_lanes) & 1.2.4
+ * (io_register_write_reset_values) */
+static uint32_t phy_reset(struct npu_dev *npu_dev)
+{
+ uint16_t val;
+
+ /* Lower run_lane inputs for lanes to be reset */
+ val = phy_read(npu_dev, RX_RUN_LANE_VEC_0_15);
+ val &= ~phy_lane_mask(npu_dev);
+ phy_write(npu_dev, RX_RUN_LANE_VEC_0_15, val);
+
+ return PROCEDURE_NEXT;
+}
+
+static uint32_t phy_reset_wait(struct npu_dev *npu_dev)
+{
+ uint16_t val;
+
+ /* Wait for lane busy outputs to go to zero for lanes to be
+ * reset */
+ val = phy_read(npu_dev, RX_LANE_BUSY_VEC_0_15);
+ if (val & phy_lane_mask(npu_dev))
+ return PROCEDURE_INPROGRESS;
+
+ return PROCEDURE_NEXT;
+}
+
+static uint32_t phy_reset_complete(struct npu_dev *npu_dev)
+{
+ uint16_t val;
+ uint32_t lane_mask = phy_lane_mask(npu_dev);
+
+ /* Set ioreset_vec for the desired lanes bit positions */
+ val = phy_read(npu_dev, RX_IORESET_VEC_0_15);
+ phy_write(npu_dev, RX_IORESET_VEC_0_15, val | lane_mask);
+
+ val = phy_read(npu_dev, TX_IORESET_VEC_0_15);
+ phy_write(npu_dev, TX_IORESET_VEC_0_15, val | lane_mask);
+
+ /* Clear ioreset_vec */
+ val = phy_read(npu_dev, RX_IORESET_VEC_0_15);
+ phy_write(npu_dev, RX_IORESET_VEC_0_15, val & ~lane_mask);
+
+ val = phy_read(npu_dev, TX_IORESET_VEC_0_15);
+ phy_write(npu_dev, TX_IORESET_VEC_0_15, val & ~lane_mask);
+
+ /* Reset RX phase rotators */
+ set_lane_reg(npu_dev, RX_PR_CNTL_PL, RX_PR_RESET, RX_PR_RESET);
+ set_lane_reg(npu_dev, RX_PR_CNTL_PL, 0, RX_PR_RESET);
+
+ /* Restore registers from scominit that may have changed */
+ set_lane_reg(npu_dev, RX_PR_MODE, 0x8, RX_PR_PHASE_STEP);
+ set_lane_reg(npu_dev, RX_A_DAC_CNTL,
+ 0x7 << MASK_TO_LSH(RX_PR_IQ_RES_SEL),
+ RX_PR_IQ_RES_SEL);
+ set_lane_reg(npu_dev, TX_MODE1_PL, 0, TX_LANE_PDWN);
+ set_lane_reg(npu_dev, RX_BANK_CONTROLS, 0, RX_LANE_ANA_PDWN);
+ set_lane_reg(npu_dev, RX_MODE, 0, RX_LANE_DIG_PDWN);
+
+ return PROCEDURE_COMPLETE;
+}
+DEFINE_PROCEDURE(phy_reset, phy_reset_wait, phy_reset_complete);
+
+/* Round a fixed-point number. frac is the number of fractional
+ * bits */
+static uint32_t round(uint32_t val, int frac)
+{
+ if (val >> (frac - 1) & 0x1)
+ return (val >> frac) + 1;
+ else
+ return val >> frac;
+}
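+
+/*
+ * Example (illustrative): round(768, 9) tests bit 8 of 768
+ * (0b11_0000_0000), finds it set and returns (768 >> 9) + 1 = 2,
+ * i.e. 1.5 in this fixed-point format rounds up to 2.
+ */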
+
+#define ZCAL_MIN (10 << 3)
+#define ZCAL_MAX (40 << 3)
+#define ZCAL_K0 0x0
+#define ZCAL_M 128
+/* TODO: add a test case for the following values:
+
+ Initial values:
+ zcal_n = 0xda;
+ zcal_p = 0xc7;
+
+ Results:
+ pre_p = 0x0
+ pre_n = 0x0
+ margin_p = 0x0
+ margin_n = 0x0
+ total_en_p = 0x32
+ total_en_n = 0x37
+ */
+
+static uint32_t phy_tx_zcal(struct npu_dev *npu_dev)
+{
+ uint64_t val;
+
+ if (npu_dev->index < 2 && npu_dev->npu->tx_zcal_complete[0])
+ return PROCEDURE_COMPLETE;
+
+ if (npu_dev->index >= 2 && npu_dev->npu->tx_zcal_complete[1])
+ return PROCEDURE_COMPLETE;
+
+ /* Start calibration */
+ val = phy_read(npu_dev, TX_IMPCAL_SWO1_PB);
+ val &= TX_ZCAL_SWO_EN;
+ phy_write(npu_dev, TX_IMPCAL_SWO1_PB, val);
+ phy_write(npu_dev, TX_IMPCAL_SWO2_PB, 0x50 << 2);
+ val = phy_read(npu_dev, TX_IMPCAL_PB);
+ val |= TX_ZCAL_REQ;
+ phy_write(npu_dev, TX_IMPCAL_PB, val);
+
+ return PROCEDURE_NEXT;
+}
+
+static uint32_t phy_tx_zcal_wait(struct npu_dev *npu_dev)
+{
+ uint64_t val;
+
+ val = phy_read(npu_dev, TX_IMPCAL_PB);
+ if (!(val & TX_ZCAL_DONE))
+ return PROCEDURE_INPROGRESS;
+
+ if (val & TX_ZCAL_ERROR)
+ return PROCEDURE_COMPLETE | PROCEDURE_FAILED;
+
+ return PROCEDURE_NEXT;
+}
+
+static uint32_t phy_tx_zcal_calculate(struct npu_dev *npu_dev)
+{
+ uint64_t val;
+ uint64_t zcal_n;
+ uint64_t zcal_p;
+ uint64_t margin_n;
+ uint64_t margin_p;
+ uint64_t pre_n;
+ uint64_t pre_p;
+ uint64_t total_en_n;
+ uint64_t total_en_p;
+
+ val = phy_read(npu_dev, TX_IMPCAL_NVAL_PB);
+ zcal_n = GETFIELD(TX_ZCAL_N, val);
+ val = phy_read(npu_dev, TX_IMPCAL_PVAL_PB);
+ zcal_p = GETFIELD(TX_ZCAL_P, val);
+
+ if ((zcal_n < ZCAL_MIN) || (zcal_n > ZCAL_MAX) ||
+ (zcal_p < ZCAL_MIN) || (zcal_p > ZCAL_MAX))
+ return PROCEDURE_COMPLETE | PROCEDURE_FAILED;
+
+ margin_n = (0x80 - ZCAL_M) * zcal_n / 2;
+ margin_p = (0x80 - ZCAL_M) * zcal_p / 2;
+ pre_n = (((0x80 * zcal_n) - (2 * margin_n)) * ZCAL_K0) / 0x80;
+ pre_p = (((0x80 * zcal_p) - (2 * margin_p)) * ZCAL_K0) / 0x80;
+
+ total_en_n = 0x80 * zcal_n - (2 * margin_n) - (pre_n & 1023);
+ total_en_p = 0x80 * zcal_p - (2 * margin_p) - (pre_p & 1023);
+
+ pre_p = round(pre_p, 9);
+ pre_n = round(pre_n, 9);
+ margin_p = round(margin_p, 9);
+ margin_n = round(margin_n, 9);
+ total_en_p = round(total_en_p, 9);
+ total_en_n = round(total_en_n, 9);
+
+ val = SETFIELD(TX_FFE_TOTAL_ENABLE_N_ENC, 0, total_en_n);
+ val = SETFIELD(TX_FFE_TOTAL_ENABLE_P_ENC, val, total_en_p);
+ phy_write(npu_dev, TX_FFE_TOTAL_2RSTEP_EN, val);
+
+ val = SETFIELD(TX_FFE_PRE_N_SEL_ENC, 0, pre_n);
+ val = SETFIELD(TX_FFE_PRE_P_SEL_ENC, val, pre_p);
+ phy_write(npu_dev, TX_FFE_PRE_2RSTEP_SEL, val);
+
+ val = SETFIELD(TX_FFE_MARGIN_PD_N_SEL_ENC, 0, margin_n);
+ val = SETFIELD(TX_FFE_MARGIN_PU_P_SEL_ENC, val, margin_p);
+ phy_write(npu_dev, TX_FFE_MARGIN_2RSTEP_SEL, val);
+
+ if (npu_dev->index < 2)
+ npu_dev->npu->tx_zcal_complete[0] = true;
+ else
+ npu_dev->npu->tx_zcal_complete[1] = true;
+
+ return PROCEDURE_COMPLETE;
+}
+DEFINE_PROCEDURE(phy_tx_zcal, phy_tx_zcal_wait, phy_tx_zcal_calculate);
+
+static uint32_t phy_enable_tx_rxcal(struct npu_dev *npu_dev)
+{
+ /* Turn common mode on */
+ set_lane_reg(npu_dev, TX_MODE2_PL, TX_RXCAL, TX_RXCAL);
+
+ return PROCEDURE_COMPLETE;
+}
+DEFINE_PROCEDURE(phy_enable_tx_rxcal);
+
+static uint32_t phy_disable_tx_rxcal(struct npu_dev *npu_dev)
+{
+ /* Turn common mode off */
+ set_lane_reg(npu_dev, TX_MODE2_PL, 0, TX_RXCAL);
+
+ return PROCEDURE_COMPLETE;
+}
+DEFINE_PROCEDURE(phy_disable_tx_rxcal);
+
+static uint32_t phy_rx_dccal(struct npu_dev *npu_dev)
+{
+ if (phy_read(npu_dev, RX_LANE_BUSY_VEC_0_15)
+ & ~phy_read(npu_dev, RX_INIT_DONE_VEC_0_15))
+ return PROCEDURE_INPROGRESS;
+
+ return PROCEDURE_NEXT;
+}
+
+static uint32_t phy_rx_dccal_start(struct npu_dev *npu_dev)
+{
+ uint64_t val;
+
+ /* Save EO step control */
+ val = phy_read(npu_dev, RX_EO_STEP_CNTL_PG);
+ npu_dev->procedure_data = val;
+
+ phy_write(npu_dev, RX_EO_STEP_CNTL_PG,
+ RX_EO_ENABLE_LATCH_OFFSET_CAL
+ | RX_EO_ENABLE_CM_COARSE_CAL);
+
+ val = phy_read(npu_dev, RX_RECAL_ABORT_VEC_0_15);
+ val |= phy_lane_mask(npu_dev);
+ phy_write(npu_dev, RX_RECAL_ABORT_VEC_0_15, val);
+
+ val = phy_read(npu_dev, RX_RUN_LANE_VEC_0_15);
+ val |= phy_lane_mask(npu_dev);
+ phy_write(npu_dev, RX_RUN_LANE_VEC_0_15, val);
+
+ return PROCEDURE_NEXT;
+}
+
+static uint32_t phy_rx_dccal_complete(struct npu_dev *npu_dev)
+{
+ /* Poll for completion on relevant lanes */
+ if ((phy_read(npu_dev, RX_INIT_DONE_VEC_0_15) & phy_lane_mask(npu_dev))
+ != phy_lane_mask(npu_dev))
+ return PROCEDURE_INPROGRESS;
+
+ return PROCEDURE_NEXT;
+}
+
+static uint32_t phy_rx_dccal_fifo_init(struct npu_dev *npu_dev)
+{
+ uint64_t val;
+
+ val = phy_read(npu_dev, RX_RUN_LANE_VEC_0_15);
+ val &= ~phy_lane_mask(npu_dev);
+ phy_write(npu_dev, RX_RUN_LANE_VEC_0_15, val);
+
+ /* Turn off recal abort */
+ val = phy_read(npu_dev, RX_RECAL_ABORT_VEC_0_15);
+ val &= ~phy_lane_mask(npu_dev);
+ phy_write(npu_dev, RX_RECAL_ABORT_VEC_0_15, val);
+
+ /* Restore original settings */
+ phy_write(npu_dev, RX_EO_STEP_CNTL_PG, npu_dev->procedure_data);
+
+ /* FIFO Init */
+ set_lane_reg(npu_dev, TX_MODE2_PL, 0, TX_UNLOAD_CLK_DISABLE);
+ set_lane_reg(npu_dev, TX_CNTL_STAT2, TX_FIFO_INIT, TX_FIFO_INIT);
+ set_lane_reg(npu_dev, TX_MODE2_PL, TX_UNLOAD_CLK_DISABLE,
+ TX_UNLOAD_CLK_DISABLE);
+
+ return PROCEDURE_COMPLETE;
+}
+DEFINE_PROCEDURE(phy_rx_dccal, phy_rx_dccal_start, phy_rx_dccal_complete,
+ phy_rx_dccal_fifo_init);
+
+static uint32_t phy_rx_training(struct npu_dev *npu_dev)
+{
+ uint16_t val;
+
+ if (!npu_dev->procedure_data) {
+ val = phy_read(npu_dev, RX_RUN_LANE_VEC_0_15);
+ val |= phy_lane_mask(npu_dev);
+ phy_write(npu_dev, RX_RUN_LANE_VEC_0_15, val);
+ }
+
+ npu_dev->procedure_data++;
+ if (npu_dev->procedure_data >= 1000000)
+ return PROCEDURE_COMPLETE | PROCEDURE_FAILED;
+
+ val = phy_read(npu_dev, RX_RUN_LANE_VEC_0_15);
+ if ((val & phy_lane_mask(npu_dev)) != phy_lane_mask(npu_dev))
+ return PROCEDURE_INPROGRESS;
+
+ return PROCEDURE_COMPLETE;
+}
+DEFINE_PROCEDURE(phy_rx_training);
+
+static struct procedure *npu_procedures[] = {
+ &procedure_stop,
+ &procedure_nop,
+ NULL,
+ NULL,
+ &procedure_phy_reset,
+ &procedure_phy_tx_zcal,
+ &procedure_phy_rx_dccal,
+ &procedure_phy_enable_tx_rxcal,
+ &procedure_phy_disable_tx_rxcal,
+ &procedure_phy_rx_training,
+ &procedure_reset_npu_dl,
+
+ /* Placeholders for pre-terminate and terminate procedures */
+ &procedure_nop,
+ &procedure_nop};
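+
+/*
+ * Driver-visible protocol, sketched from the handlers below: the host
+ * writes a procedure index to offset 4 of the vendor-specific config
+ * space area to (re)start it, then polls offset 0 until
+ * PROCEDURE_COMPLETE is set, checking the low status bits for
+ * PROCEDURE_FAILED, PROCEDURE_ABORTED or PROCEDURE_UNSUPPORTED.
+ */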
+
+/* Run a procedure step(s) and return status */
+static uint32_t get_procedure_status(struct npu_dev *dev)
+{
+ uint32_t result;
+ uint16_t procedure = dev->procedure_number;
+ uint16_t step = dev->procedure_step;
+ const char *name = npu_procedures[procedure]->name;
+
+ do {
+ result = npu_procedures[procedure]->steps[step](dev);
+
+ if (result & PROCEDURE_NEXT) {
+ step++;
+ NPUDEVINF(dev, "Running procedure %s step %d\n", name, step);
+ }
+ } while (result & PROCEDURE_NEXT);
+
+ dev->procedure_step = step;
+
+ if (result & PROCEDURE_COMPLETE)
+ NPUDEVINF(dev, "Procedure %s complete\n", name);
+ else if (mftb() > dev->procedure_tb + msecs_to_tb(100)) {
+ NPUDEVINF(dev, "Procedure %s timed out\n", name);
+ result = PROCEDURE_COMPLETE | PROCEDURE_FAILED;
+ }
+
+ /* Mask off internal state bits */
+ dev->procedure_status = result & PROCEDURE_STATUS_MASK;
+
+ return dev->procedure_status;
+}
+
+static int64_t npu_dev_procedure_read(struct npu_dev *dev, uint32_t offset,
+ uint32_t size, uint32_t *data)
+{
+ int64_t rc = OPAL_SUCCESS;
+
+ if (size != 4) {
+ /* Short config reads are not supported */
+ prlog(PR_ERR, "NPU%d: Short read of procedure register\n", dev->npu->phb.opal_id);
+ return OPAL_PARAMETER;
+ }
+
+ *data = 0;
+
+ switch (offset) {
+ case 0:
+ /* Only run the procedure if not already complete */
+ if (dev->procedure_status & PROCEDURE_COMPLETE)
+ *data = dev->procedure_status;
+ else
+ *data = get_procedure_status(dev);
+
+ break;
+
+ case 4:
+ *data = dev->procedure_number;
+ break;
+
+ default:
+ prlog(PR_ERR, "NPU%d: Invalid vendor specific offset 0x%08x\n",
+ dev->npu->phb.opal_id, offset);
+ rc = OPAL_PARAMETER;
+ }
+
+ return rc;
+}
+
+static int64_t npu_dev_procedure_write(struct npu_dev *dev, uint32_t offset,
+ uint32_t size, uint32_t data)
+{
+ const char *name;
+ int64_t rc = OPAL_SUCCESS;
+
+ if (size != 4) {
+ /* Short config writes are not supported */
+ prlog(PR_ERR, "NPU%d: Short write of procedure register\n",
+ dev->npu->phb.opal_id);
+ return OPAL_PARAMETER;
+ }
+
+ switch (offset) {
+ case 0:
+ /* We ignore writes to the status register */
+ NPUDEVINF(dev, "Ignoring writes to status register\n");
+ break;
+
+ case 4:
+ if (data >= ARRAY_SIZE(npu_procedures) ||
+ !npu_procedures[data]) {
+ NPUDEVINF(dev, "Unsupported procedure number %d\n", data);
+ dev->procedure_status = PROCEDURE_COMPLETE
+ | PROCEDURE_UNSUPPORTED;
+ break;
+ }
+
+ name = npu_procedures[data]->name;
+ if (dev->procedure_number == data
+ && !(dev->procedure_status & PROCEDURE_COMPLETE))
+ NPUDEVINF(dev, "Restarting procedure %s\n", name);
+ else
+ NPUDEVINF(dev, "Starting procedure %s\n", name);
+
+ dev->procedure_status = PROCEDURE_INPROGRESS;
+ dev->procedure_number = data;
+ dev->procedure_step = 0;
+ dev->procedure_data = 0;
+ dev->procedure_tb = mftb();
+ break;
+
+ default:
+ NPUDEVINF(dev, "Invalid vendor specific offset 0x%08x\n", offset);
+ rc = OPAL_PARAMETER;
+ }
+
+ return rc;
+}
+
+int64_t npu_dev_procedure(void *dev, struct pci_cfg_reg_filter *pcrf,
+ uint32_t offset, uint32_t len, uint32_t *data,
+ bool write)
+{
+ struct pci_virt_device *pvd = dev;
+ struct npu_dev *ndev = pvd->data;
+
+ if (write)
+ return npu_dev_procedure_write(ndev, offset - pcrf->start,
+ len, *data);
+
+ return npu_dev_procedure_read(ndev, offset - pcrf->start, len, data);
+}
+
+void npu_dev_procedure_reset(struct npu_dev *dev)
+{
+ dev->procedure_status = 0;
+ dev->procedure_number = 0;
+ dev->procedure_step = 0;
+ dev->procedure_data = 0;
+}
diff --git a/roms/skiboot/hw/npu-opal.c b/roms/skiboot/hw/npu-opal.c
new file mode 100644
index 000000000..412ea460e
--- /dev/null
+++ b/roms/skiboot/hw/npu-opal.c
@@ -0,0 +1,176 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Copyright 2019 IBM Corp.
+ */
+
+#include <skiboot.h>
+#include <pci.h>
+#include <phb4.h>
+#include <npu2.h>
+#include <npu3.h>
+
+static int64_t opal_npu_init_context(uint64_t phb_id, int pid __unused,
+ uint64_t msr, uint64_t bdf)
+{
+ struct phb *phb = pci_get_phb(phb_id);
+
+ if (!phb)
+ return OPAL_PARAMETER;
+
+ if (phb->phb_type == phb_type_npu_v2)
+ return npu2_init_context(phb, msr, bdf);
+
+ if (phb->phb_type == phb_type_npu_v3)
+ return npu3_init_context(phb, msr, bdf);
+
+ return OPAL_PARAMETER;
+}
+opal_call(OPAL_NPU_INIT_CONTEXT, opal_npu_init_context, 4);
+
+static int64_t opal_npu_destroy_context(uint64_t phb_id, uint64_t pid __unused,
+ uint64_t bdf)
+{
+ struct phb *phb = pci_get_phb(phb_id);
+
+ if (!phb)
+ return OPAL_PARAMETER;
+
+ if (phb->phb_type == phb_type_npu_v2)
+ return npu2_destroy_context(phb, bdf);
+
+ if (phb->phb_type == phb_type_npu_v3)
+ return npu3_destroy_context(phb, bdf);
+
+ return OPAL_PARAMETER;
+}
+opal_call(OPAL_NPU_DESTROY_CONTEXT, opal_npu_destroy_context, 3);
+
+static int64_t opal_npu_map_lpar(uint64_t phb_id, uint64_t bdf, uint64_t lparid,
+ uint64_t lpcr)
+{
+ struct phb *phb = pci_get_phb(phb_id);
+
+ if (!phb)
+ return OPAL_PARAMETER;
+
+ if (phb->phb_type == phb_type_npu_v2)
+ return npu2_map_lpar(phb, bdf, lparid, lpcr);
+
+ if (phb->phb_type == phb_type_npu_v3)
+ return npu3_map_lpar(phb, bdf, lparid, lpcr);
+
+ return OPAL_PARAMETER;
+}
+opal_call(OPAL_NPU_MAP_LPAR, opal_npu_map_lpar, 4);
+
+static int npu_check_relaxed_ordering(struct phb *phb, struct pci_device *pd,
+ void *enable)
+{
+ /*
+	 * IBM PCIe bridge devices (i.e. the root ports) can always allow
+	 * relaxed ordering.
+ */
+ if (pd->vdid == 0x04c11014)
+ pd->allow_relaxed_ordering = true;
+
+ PCIDBG(phb, pd->bdfn, "Checking relaxed ordering config\n");
+ if (pd->allow_relaxed_ordering)
+ return 0;
+
+ PCIDBG(phb, pd->bdfn, "Relaxed ordering not allowed\n");
+ *(bool *)enable = false;
+
+ return 1;
+}
+
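+/*
+ * Apply the relaxed-ordering setting to every NPU PHB (NVLink2 or
+ * NVLink3) for the given chip/PEC, stopping at the first failure so
+ * the caller can roll the setting back.
+ */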
+static int64_t npu_set_relaxed_order(uint32_t gcid, int pec, bool enable)
+{
+ struct phb *phb;
+ int64_t rc;
+
+ for_each_phb(phb) {
+ if (phb->phb_type == phb_type_npu_v2)
+ rc = npu2_set_relaxed_order(phb, gcid, pec, enable);
+ else if (phb->phb_type == phb_type_npu_v3)
+ rc = npu3_set_relaxed_order(phb, gcid, pec, enable);
+ else
+ continue;
+
+ if (rc)
+ return rc;
+ }
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t opal_npu_set_relaxed_order(uint64_t phb_id, uint16_t bdfn,
+ bool request_enabled)
+{
+ struct phb *phb = pci_get_phb(phb_id);
+ struct phb4 *phb4;
+ uint32_t chip_id, pec;
+ struct pci_device *pd;
+ bool enable = true;
+
+ if (!phb || phb->phb_type != phb_type_pcie_v4)
+ return OPAL_PARAMETER;
+
+ phb4 = phb_to_phb4(phb);
+ pec = phb4->pec;
+ chip_id = phb4->chip_id;
+
+ if (chip_id & ~0x1b)
+ return OPAL_PARAMETER;
+
+ pd = pci_find_dev(phb, bdfn);
+ if (!pd)
+ return OPAL_PARAMETER;
+
+ /*
+ * Not changing state, so no need to rescan PHB devices to determine if
+ * we need to enable/disable it
+ */
+ if (pd->allow_relaxed_ordering == request_enabled)
+ return OPAL_SUCCESS;
+
+ pd->allow_relaxed_ordering = request_enabled;
+
+ /*
+ * Walk all devices on this PHB to ensure they all support relaxed
+ * ordering
+ */
+ pci_walk_dev(phb, NULL, npu_check_relaxed_ordering, &enable);
+
+ if (request_enabled && !enable) {
+ /*
+ * Not all devices on this PHB support relaxed-ordering
+ * mode so we can't enable it as requested
+ */
+ prlog(PR_INFO, "Cannot set relaxed ordering for PEC %d on chip %d\n",
+ pec, chip_id);
+ return OPAL_CONSTRAINED;
+ }
+
+ if (npu_set_relaxed_order(chip_id, pec, request_enabled)) {
+ npu_set_relaxed_order(chip_id, pec, false);
+ return OPAL_RESOURCE;
+ }
+
+ phb4->ro_state = request_enabled;
+ return OPAL_SUCCESS;
+}
+opal_call(OPAL_NPU_SET_RELAXED_ORDER, opal_npu_set_relaxed_order, 3);
+
+static int64_t opal_npu_get_relaxed_order(uint64_t phb_id,
+ uint16_t bdfn __unused)
+{
+ struct phb *phb = pci_get_phb(phb_id);
+ struct phb4 *phb4;
+
+ if (!phb || phb->phb_type != phb_type_pcie_v4)
+ return OPAL_PARAMETER;
+
+ phb4 = phb_to_phb4(phb);
+ return phb4->ro_state;
+}
+opal_call(OPAL_NPU_GET_RELAXED_ORDER, opal_npu_get_relaxed_order, 2);
diff --git a/roms/skiboot/hw/npu.c b/roms/skiboot/hw/npu.c
new file mode 100644
index 000000000..dba7ee50f
--- /dev/null
+++ b/roms/skiboot/hw/npu.c
@@ -0,0 +1,1693 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * NVLink1, supported by the NPU (POWER8)
+ *
+ * Copyright 2013-2019 IBM Corp.
+ */
+
+#include <skiboot.h>
+#include <io.h>
+#include <timebase.h>
+#include <pci.h>
+#include <pci-cfg.h>
+#include <pci-virt.h>
+#include <pci-slot.h>
+#include <interrupts.h>
+#include <opal.h>
+#include <opal-api.h>
+#include <cpu.h>
+#include <device.h>
+#include <ccan/str/str.h>
+#include <ccan/array_size/array_size.h>
+#include <ccan/build_assert/build_assert.h>
+#include <affinity.h>
+#include <npu-regs.h>
+#include <npu.h>
+#include <xscom.h>
+#include <string.h>
+
+/*
+ * Terminology:
+ *
+ * Brick - A group of either 8 TX or 8 RX lanes
+ * Link - A group of 8 TX and 8 RX lanes
+ *
+ * Each link is represented in system software as an emulated PCI
+ * device. Garrison has two chips each with 4 links, therefore there
+ * are 8 emulated PCI devices in total.
+ *
+ * +----------------------------------------------------------------+
+ * | PBCQ3 (SCOM Base Address 0x2012c00) |
+ * | PHB3 (SCOM Base Address 0x9012c00) |
+ * +----------------------------------------------------------------+
+ * |||||||| ||||||||
+ * |||||||| ||||||||
+ * |||||||| ||||||||
+ * |||||||| ||||||||
+ * +----------------------------------------------------------------+
+ * | PCIe x8 |
+ * +----------------------------------------------------------------+
+ * | GPU0 |
+ * +--------------------------------+-------------------------------+
+ * | NV Link 1 | NV Link 0 |
+ * +---------------+----------------+---------------+---------------+
+ * | RX | TX | RX | TX |
+ * +---------------+----------------+---------------+---------------+
+ * |||||||| |||||||| |||||||| ||||||||
+ * |||||||| |||||||| |||||||| ||||||||
+ * |||||||| |||||||| |||||||| ||||||||
+ * |||||||| |||||||| |||||||| ||||||||
+ * +---------------+----------------+---------------+---------------+
+ * | TX | RX | TX | RX |
+ * +---------------+----------------+---------------+---------------+
+ * | Lanes [0:7] PHY 0 Lanes [8:15] |
+ * | SCOM Base Address 0x8000080008010c3f |
+ * +--------------------------------+-------------------------------+
+ * | Link 0 NDL/NTL | Link 1 NTL/NDL |
+ * | SCOM Base Address 0x8013c00 | SCOM Base Address 0x8013c40 |
+ * +--------------------------------+-------------------------------+
+ * | |
+ * | Address Translation/AT (shared for all links) |
+ * | SCOM Base Address 0x8013d80 |
+ * | |
+ * +--------------------------------+-------------------------------+
+ * | Link 3 NDL/NTL | Link 4 NTL/NDL |
+ * | SCOM Base Address 0x8013d00 | SCOM Base Address 0x8013d40 |
+ * +--------------------------------+-------------------------------+
+ * | Lanes [8:15] PHY 1 Lanes [0:7] |
+ * | SCOM Base Address 0x8000080008010c7f |
+ * +---------------+----------------+---------------+---------------+
+ * | TX | RX | TX | RX |
+ * +---------------+----------------+---------------+---------------+
+ * |||||||| |||||||| |||||||| ||||||||
+ * |||||||| |||||||| |||||||| ||||||||
+ * |||||||| |||||||| |||||||| ||||||||
+ * |||||||| |||||||| |||||||| ||||||||
+ * +---------------+----------------+---------------+---------------+
+ * | RX | TX | RX | TX |
+ * +---------------+----------------+---------------+---------------+
+ * | NV Link 2 | NV Link 3 |
+ * +--------------------------------+-------------------------------+
+ * | GPU1 |
+ * +----------------------------------------------------------------+
+ * | PCIe x8 |
+ * +----------------------------------------------------------------+
+ * |||||||| ||||||||
+ * |||||||| ||||||||
+ * |||||||| ||||||||
+ * |||||||| ||||||||
+ * +----------------------------------------------------------------+
+ * | PHB2 (SCOM Base Address 0x9012800) |
+ * | PBCQ2 (SCOM Base Address 0x2012800) |
+ * +----------------------------------------------------------------+
+ *
+ */
+
+static struct npu_dev_cap *npu_dev_find_capability(struct npu_dev *dev,
+ uint16_t id);
+
+#define OPAL_NPU_VERSION 0x02
+
+#define PCIE_CAP_START 0x40
+#define PCIE_CAP_END 0x80
+#define VENDOR_CAP_START 0x80
+#define VENDOR_CAP_END 0x90
+
+#define VENDOR_CAP_PCI_DEV_OFFSET 0x0d
+
+/* Returns the scom base for the given link index */
+static uint64_t npu_link_scom_base(struct dt_node *dn, uint32_t scom_base,
+ int index)
+{
+ struct dt_node *link;
+ uint32_t link_index;
+ char namebuf[32];
+
+ snprintf(namebuf, sizeof(namebuf), "link@%x", index);
+ link = dt_find_by_name(dn, namebuf);
+ assert(link);
+ link_index = dt_prop_get_u32(link, "ibm,npu-link-index");
+ return scom_base + (link_index * NPU_LINK_SIZE);
+}
+
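+/*
+ * Decode the size encoded in a link BAR register: the size field gives
+ * a power-of-two multiple of 64KB. For example, an encoded value of 5
+ * yields (1 << 5) * 64KB = 2MB, the PL BAR size in the layout described
+ * in assign_mmio_bars() below.
+ */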
+static uint64_t get_bar_size(uint64_t bar)
+{
+ return (1 << GETFIELD(NX_MMIO_BAR_SIZE, bar)) * 0x10000;
+}
+
+/* Propagate the device BAR settings to the link BAR in hardware */
+static void npu_dev_bar_update(uint32_t gcid, struct npu_dev_bar *bar,
+ bool enable)
+{
+ uint64_t val;
+
+ if (!bar->xscom)
+ return;
+
+ val = bar->base;
+ val = SETFIELD(NX_MMIO_BAR_SIZE, val, ilog2(bar->size / 0x10000));
+ if (enable)
+ val |= NX_MMIO_BAR_ENABLE;
+ xscom_write(gcid, bar->xscom, val);
+}
+
+/* Trap for PCI command (0x4) to enable or disable device's BARs */
+static int64_t npu_dev_cfg_write_cmd(void *dev,
+ struct pci_cfg_reg_filter *pcrf __unused,
+ uint32_t offset, uint32_t size,
+ uint32_t *data, bool write)
+{
+ struct pci_virt_device *pvd = dev;
+ struct npu_dev *ndev = pvd->data;
+ bool enable;
+
+ if (!write)
+ return OPAL_PARTIAL;
+
+ if (offset != PCI_CFG_CMD)
+ return OPAL_PARAMETER;
+ if (size != 1 && size != 2 && size != 4)
+ return OPAL_PARAMETER;
+
+	/* Update the device BAR; the link BAR will be synchronized
+	 * with hardware automatically.
+ */
+ enable = !!(*data & PCI_CFG_CMD_MEM_EN);
+ npu_dev_bar_update(ndev->npu->chip_id, &ndev->bar, enable);
+
+ /* Normal path to update PCI config buffer */
+ return OPAL_PARTIAL;
+}
+
+/*
+ * Trap for memory BARs: 0xFFs are written to the BAR register
+ * prior to reading back its size.
+ */
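+/*
+ * This mirrors the usual PCI sizing sequence: the OS writes 0xffffffff
+ * to the BAR and then reads it back. The write handler below latches
+ * the value to return in bar->bar_sz; the read handler hands it back
+ * once and then reverts to the normal config path.
+ */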
+static int64_t npu_dev_cfg_bar_read(struct npu_dev *dev __unused,
+ struct pci_cfg_reg_filter *pcrf,
+ uint32_t offset, uint32_t size,
+ uint32_t *data)
+{
+ struct npu_dev_bar *bar = (struct npu_dev_bar *)(pcrf->data);
+
+ /* Revert to normal path if we weren't trapped for BAR size */
+ if (!bar->trapped)
+ return OPAL_PARTIAL;
+
+ if (offset != pcrf->start &&
+ offset != pcrf->start + 4)
+ return OPAL_PARAMETER;
+ if (size != 4)
+ return OPAL_PARAMETER;
+
+ bar->trapped = false;
+ *data = bar->bar_sz;
+ return OPAL_SUCCESS;
+}
+
+static int64_t npu_dev_cfg_bar_write(struct npu_dev *dev,
+ struct pci_cfg_reg_filter *pcrf,
+ uint32_t offset, uint32_t size,
+ uint32_t data)
+{
+ struct pci_virt_device *pvd = dev->pvd;
+ struct npu_dev_bar *bar = (struct npu_dev_bar *)(pcrf->data);
+ uint32_t pci_cmd;
+
+ if (offset != pcrf->start &&
+ offset != pcrf->start + 4)
+ return OPAL_PARAMETER;
+ if (size != 4)
+ return OPAL_PARAMETER;
+
+ /* Return BAR size on next read */
+ if (data == 0xffffffff) {
+ bar->trapped = true;
+ if (offset == pcrf->start)
+ bar->bar_sz = (bar->size & 0xffffffff);
+ else
+ bar->bar_sz = (bar->size >> 32);
+
+ return OPAL_SUCCESS;
+ }
+
+ /* Update BAR base address */
+ if (offset == pcrf->start) {
+ bar->base &= 0xffffffff00000000UL;
+ bar->base |= (data & 0xfffffff0);
+ } else {
+ bar->base &= 0x00000000ffffffffUL;
+ bar->base |= ((uint64_t)data << 32);
+
+ PCI_VIRT_CFG_NORMAL_RD(pvd, PCI_CFG_CMD, 4, &pci_cmd);
+ npu_dev_bar_update(dev->npu->chip_id, bar,
+ !!(pci_cmd & PCI_CFG_CMD_MEM_EN));
+ }
+
+ /* We still depend on the normal path to update the
+ * cached config buffer.
+ */
+ return OPAL_PARAMETER;
+}
+
+static int64_t npu_dev_cfg_bar(void *dev, struct pci_cfg_reg_filter *pcrf,
+ uint32_t offset, uint32_t len, uint32_t *data,
+ bool write)
+{
+ struct pci_virt_device *pvd = dev;
+ struct npu_dev *ndev = pvd->data;
+
+ if (write)
+ return npu_dev_cfg_bar_write(ndev, pcrf, offset, len, *data);
+
+ return npu_dev_cfg_bar_read(ndev, pcrf, offset, len, data);
+}
+
+static int64_t npu_dev_cfg_exp_devcap(void *dev,
+ struct pci_cfg_reg_filter *pcrf __unused,
+ uint32_t offset, uint32_t size,
+ uint32_t *data, bool write)
+{
+ struct pci_virt_device *pvd = dev;
+ struct npu_dev *ndev = pvd->data;
+
+ assert(write);
+
+ if ((size != 2) || (offset & 1)) {
+ /* Short config writes are not supported */
+ prlog(PR_ERR, "NPU%d: Unsupported write to pcie control register\n",
+ ndev->phb->opal_id);
+ return OPAL_PARAMETER;
+ }
+
+ if (*data & PCICAP_EXP_DEVCTL_FUNC_RESET)
+ npu_dev_procedure_reset(ndev);
+
+ return OPAL_PARTIAL;
+}
+
+static struct npu_dev *bdfn_to_npu_dev(struct npu *p, uint32_t bdfn)
+{
+ struct pci_virt_device *pvd;
+
+ /* Sanity check */
+ if (bdfn & ~0xff)
+ return NULL;
+
+ pvd = pci_virt_find_device(&p->phb, bdfn);
+ if (pvd)
+ return pvd->data;
+
+ return NULL;
+}
+
+#define NPU_CFG_READ(size, type) \
+static int64_t npu_cfg_read##size(struct phb *phb, uint32_t bdfn, \
+ uint32_t offset, type *data) \
+{ \
+ uint32_t val; \
+ int64_t ret; \
+ \
+ ret = pci_virt_cfg_read(phb, bdfn, offset, sizeof(*data), &val); \
+ *data = (type)val; \
+ return ret; \
+}
+#define NPU_CFG_WRITE(size, type) \
+static int64_t npu_cfg_write##size(struct phb *phb, uint32_t bdfn, \
+ uint32_t offset, type data) \
+{ \
+ uint32_t val = data; \
+ \
+ return pci_virt_cfg_write(phb, bdfn, offset, sizeof(data), val); \
+}
+
+NPU_CFG_READ(8, u8);
+NPU_CFG_READ(16, u16);
+NPU_CFG_READ(32, u32);
+NPU_CFG_WRITE(8, u8);
+NPU_CFG_WRITE(16, u16);
+NPU_CFG_WRITE(32, u32);
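+
+/*
+ * For reference, NPU_CFG_READ(8, u8) expands to roughly:
+ *
+ *   static int64_t npu_cfg_read8(struct phb *phb, uint32_t bdfn,
+ *                                uint32_t offset, u8 *data)
+ *   {
+ *       uint32_t val;
+ *       int64_t ret;
+ *
+ *       ret = pci_virt_cfg_read(phb, bdfn, offset, sizeof(*data), &val);
+ *       *data = (u8)val;
+ *       return ret;
+ *   }
+ *
+ * The six accessors generated here are wired into npu_ops below.
+ */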
+
+static int __npu_dev_bind_pci_dev(struct phb *phb __unused,
+ struct pci_device *pd,
+ void *data)
+{
+ struct npu_dev *dev = data;
+ struct dt_node *pci_dt_node;
+ char *pcislot;
+
+ /* Ignore non-nvidia PCI devices */
+ if ((pd->vdid & 0xffff) != 0x10de)
+ return 0;
+
+ /* Find the PCI device's slot location */
+ for (pci_dt_node = pd->dn;
+ pci_dt_node && !dt_find_property(pci_dt_node, "ibm,slot-label");
+ pci_dt_node = pci_dt_node->parent);
+
+ if (!pci_dt_node)
+ return 0;
+
+ pcislot = (char *)dt_prop_get(pci_dt_node, "ibm,slot-label");
+
+ prlog(PR_DEBUG, "NPU: comparing GPU %s and NPU %s\n",
+ pcislot, dev->slot_label);
+
+ if (streq(pcislot, dev->slot_label))
+ return 1;
+
+ return 0;
+}
+
+static void npu_dev_bind_pci_dev(struct npu_dev *dev)
+{
+ struct phb *phb;
+ uint32_t i;
+
+ if (dev->pd)
+ return;
+
+ for (i = 0; i < 64; i++) {
+ if (dev->npu->phb.opal_id == i)
+ continue;
+
+ phb = pci_get_phb(i);
+ if (!phb)
+ continue;
+
+ dev->pd = pci_walk_dev(phb, NULL, __npu_dev_bind_pci_dev, dev);
+ if (dev->pd) {
+ dev->phb = phb;
+ /* Found the device, set the bit in config space */
+ PCI_VIRT_CFG_INIT_RO(dev->pvd, VENDOR_CAP_START +
+ VENDOR_CAP_PCI_DEV_OFFSET, 1, 0x01);
+ return;
+ }
+ }
+
+ prlog(PR_INFO, "%s: No PCI device for NPU device %04x:%02x:%02x.%x to bind to. If you expect a GPU to be there, this is a problem.\n",
+ __func__, dev->npu->phb.opal_id,
+ dev->pvd->bdfn >> 8 & 0xff,
+ dev->pvd->bdfn >> 3 & 0x1f,
+ dev->pvd->bdfn & 0x7);
+
+}
+
+static struct lock pci_npu_phandle_lock = LOCK_UNLOCKED;
+
+/* Appends an NPU phandle to the given PCI device node ibm,npu
+ * property */
+static void npu_append_pci_phandle(struct dt_node *dn, u32 phandle)
+{
+ uint32_t *npu_phandles;
+ struct dt_property *pci_npu_phandle_prop;
+ size_t prop_len;
+
+ /* Use a lock to make sure no one else has a reference to an
+ * ibm,npu property (this assumes this is the only function
+ * that holds a reference to it). */
+ lock(&pci_npu_phandle_lock);
+
+ /* This function shouldn't be called unless ibm,npu exists */
+ pci_npu_phandle_prop = (struct dt_property *)
+ dt_require_property(dn, "ibm,npu", -1);
+
+ /* Need to append to the properties */
+ prop_len = pci_npu_phandle_prop->len;
+ prop_len += sizeof(*npu_phandles);
+ dt_resize_property(&pci_npu_phandle_prop, prop_len);
+
+ npu_phandles = (uint32_t *) pci_npu_phandle_prop->prop;
+ npu_phandles[prop_len/sizeof(*npu_phandles) - 1] = phandle;
+ unlock(&pci_npu_phandle_lock);
+}
+
+static int npu_dn_fixup(struct phb *phb,
+ struct pci_device *pd,
+ void *data __unused)
+{
+ struct npu *p = phb_to_npu(phb);
+ struct npu_dev *dev;
+
+ dev = bdfn_to_npu_dev(p, pd->bdfn);
+ assert(dev);
+
+ if (dev->phb || dev->pd)
+ return 0;
+
+ /* NPU devices require a slot location to associate with GPUs */
+ dev->slot_label = dt_prop_get(pd->dn, "ibm,slot-label");
+
+ /* Bind the emulated PCI device with the real one, which can't
+ * be done until the PCI devices are populated. Once the real
+	 * PCI device is identified, we also need to fix the device-tree
+ * for it
+ */
+ npu_dev_bind_pci_dev(dev);
+ if (dev->phb && dev->pd && dev->pd->dn) {
+ if (dt_find_property(dev->pd->dn, "ibm,npu"))
+ npu_append_pci_phandle(dev->pd->dn, pd->dn->phandle);
+ else
+ dt_add_property_cells(dev->pd->dn, "ibm,npu", pd->dn->phandle);
+
+ dt_add_property_cells(pd->dn, "ibm,gpu", dev->pd->dn->phandle);
+ }
+
+ return 0;
+}
+
+static void npu_phb_final_fixup(struct phb *phb)
+{
+ pci_walk_dev(phb, NULL, npu_dn_fixup, NULL);
+}
+
+static void npu_ioda_init(struct npu *p)
+{
+ uint64_t *data64;
+ uint32_t i;
+
+ /* LXIVT - Disable all LSIs */
+ for (i = 0; i < ARRAY_SIZE(p->lxive_cache); i++) {
+ data64 = &p->lxive_cache[i];
+ *data64 = SETFIELD(NPU_IODA_LXIVT_PRIORITY, 0ul, 0xff);
+ *data64 = SETFIELD(NPU_IODA_LXIVT_SERVER, *data64, 0);
+ }
+
+ /* PCT - Reset to reserved PE# */
+ for (i = 0; i < ARRAY_SIZE(p->pce_cache); i++) {
+ data64 = &p->pce_cache[i];
+ *data64 = SETFIELD(NPU_IODA_PCT_PE, 0ul, 0ul);
+ *data64 |= NPU_IODA_PCT_LINK_ENABLED;
+ }
+
+ /* Clear TVT */
+ memset(p->tve_cache, 0, sizeof(p->tve_cache));
+}
+
+static int64_t npu_ioda_reset(struct phb *phb, bool purge)
+{
+ struct npu *p = phb_to_npu(phb);
+ uint32_t i;
+
+ if (purge) {
+ NPUDBG(p, "Purging all IODA tables...\n");
+ npu_ioda_init(p);
+ }
+
+ /* LIST */
+ npu_ioda_sel(p, NPU_IODA_TBL_LIST, 0, true);
+ for (i = 0; i < 8; i++)
+ out_be64(p->at_regs + NPU_IODA_DATA0, 0x1);
+
+ /* LIXVT */
+ npu_ioda_sel(p, NPU_IODA_TBL_LXIVT, 0, true);
+ for (i = 0; i < ARRAY_SIZE(p->lxive_cache); i++)
+ out_be64(p->at_regs + NPU_IODA_DATA0, p->lxive_cache[i]);
+
+ /* PCT */
+ npu_ioda_sel(p, NPU_IODA_TBL_PCT, 0, true);
+ for (i = 0; i < ARRAY_SIZE(p->pce_cache); i++)
+ out_be64(p->at_regs + NPU_IODA_DATA0, p->pce_cache[i]);
+
+ /* TVT */
+ npu_ioda_sel(p, NPU_IODA_TBL_TVT, 0, true);
+ for (i = 0; i < ARRAY_SIZE(p->tve_cache); i++)
+ out_be64(p->at_regs + NPU_IODA_DATA0, p->tve_cache[i]);
+
+ return OPAL_SUCCESS;
+}
+
+static int npu_isn_valid(struct npu *p, uint32_t isn)
+{
+ if (p->chip_id != p8_irq_to_chip(isn) || p->index != 0 ||
+ NPU_IRQ_NUM(isn) < NPU_LSI_IRQ_MIN ||
+ NPU_IRQ_NUM(isn) > NPU_LSI_IRQ_MAX) {
+ /**
+ * @fwts-label NPUisnInvalid
+ * @fwts-advice NVLink not functional
+ */
+ prlog(PR_ERR, "NPU%d: isn 0x%x not valid for this NPU\n",
+ p->phb.opal_id, isn);
+ return false;
+ }
+
+ return true;
+}
+
+static int64_t npu_lsi_get_xive(struct irq_source *is, uint32_t isn,
+ uint16_t *server, uint8_t *prio)
+{
+ struct npu *p = is->data;
+ uint32_t irq = NPU_IRQ_NUM(isn);
+ uint64_t lxive;
+
+ if (!npu_isn_valid(p, isn))
+ return OPAL_PARAMETER;
+
+ /* The content is fetched from the cache, which requires
+	 * that the cache be initialized with the default values
+	 * beforehand.
+ */
+ irq -= NPU_LSI_IRQ_MIN;
+ lxive = p->lxive_cache[irq];
+ *server = GETFIELD(NPU_IODA_LXIVT_SERVER, lxive);
+ *prio = GETFIELD(NPU_IODA_LXIVT_PRIORITY, lxive);
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t npu_lsi_set_xive(struct irq_source *is, uint32_t isn,
+ uint16_t server, uint8_t prio)
+{
+ struct npu *p = is->data;
+ uint32_t irq = NPU_IRQ_NUM(isn);
+ uint64_t lxive;
+
+ if (!npu_isn_valid(p, isn))
+ return OPAL_PARAMETER;
+
+ /* Figure out LXIVT entry */
+ lxive = SETFIELD(NPU_IODA_LXIVT_SERVER, 0ul, server);
+ lxive = SETFIELD(NPU_IODA_LXIVT_PRIORITY, lxive, prio);
+
+ /* Cache LXIVT entry */
+ irq -= NPU_LSI_IRQ_MIN;
+ p->lxive_cache[irq] = lxive;
+
+ /* Update to LXIVT entry */
+ npu_ioda_sel(p, NPU_IODA_TBL_LXIVT, irq, false);
+ lxive = in_be64(p->at_regs + NPU_IODA_DATA0);
+ lxive = SETFIELD(NPU_IODA_LXIVT_SERVER, lxive, server);
+ lxive = SETFIELD(NPU_IODA_LXIVT_PRIORITY, lxive, prio);
+ out_be64(p->at_regs + NPU_IODA_DATA0, lxive);
+
+ return OPAL_SUCCESS;
+}
+
+static void npu_err_interrupt(struct irq_source *is, uint32_t isn)
+{
+ struct npu *p = is->data;
+ uint32_t irq = NPU_IRQ_NUM(isn);
+
+ if (!npu_isn_valid(p, isn))
+ return;
+
+	/* There are 4 LSIs used for error reporting: 4/5 for data
+	 * link errors, while 6/7 signal frozen PE detection.
+ */
+ irq -= NPU_LSI_IRQ_MIN;
+ switch (irq) {
+ case 4 ... 5:
+ prerror("Invalid NPU error interrupt received\n");
+ break;
+ case 6 ... 7:
+ opal_update_pending_evt(OPAL_EVENT_PCI_ERROR,
+ OPAL_EVENT_PCI_ERROR);
+ }
+}
+
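+/*
+ * LSI routing: the first four LSIs are the per-link interrupts handed
+ * to Linux, while LSIs 4-7 are the error/frozen-PE interrupts kept by
+ * skiboot (see npu_err_interrupt() above).
+ */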
+static uint64_t npu_lsi_attributes(struct irq_source *is, uint32_t isn)
+{
+ struct npu *p = is->data;
+ uint32_t idx = isn - p->base_lsi;
+
+ if (idx >= 4)
+ return IRQ_ATTR_TARGET_OPAL | IRQ_ATTR_TARGET_RARE | IRQ_ATTR_TYPE_LSI;
+ return IRQ_ATTR_TARGET_LINUX;
+}
+
+/* Error LSIs (skiboot owned) */
+static const struct irq_source_ops npu_lsi_irq_ops = {
+ .get_xive = npu_lsi_get_xive,
+ .set_xive = npu_lsi_set_xive,
+ .attributes = npu_lsi_attributes,
+ .interrupt = npu_err_interrupt,
+};
+
+static void npu_register_irq(struct npu *p)
+{
+ register_irq_source(&npu_lsi_irq_ops, p, p->base_lsi, 8);
+}
+
+static void npu_hw_init(struct npu *p)
+{
+ /* 3 MMIO setup for AT */
+ out_be64(p->at_regs + NPU_LSI_SOURCE_ID,
+ SETFIELD(NPU_LSI_SRC_ID_BASE, 0ul, NPU_LSI_IRQ_MIN >> 4));
+ BUILD_ASSERT((NPU_LSI_IRQ_MIN & 0x07F0) == NPU_LSI_IRQ_MIN);
+ out_be64(p->at_regs + NPU_INTREP_TIMER, 0x0ul);
+ npu_ioda_reset(&p->phb, false);
+}
+
+static int64_t npu_map_pe_dma_window_real(struct phb *phb,
+ uint64_t pe_number,
+ uint16_t window_id,
+ uint64_t pci_start_addr,
+ uint64_t pci_mem_size)
+{
+ struct npu *p = phb_to_npu(phb);
+ uint64_t end;
+ uint64_t tve;
+
+ /* Sanity check. Each PE has one corresponding TVE */
+ if (pe_number >= NPU_NUM_OF_PES ||
+ window_id != pe_number)
+ return OPAL_PARAMETER;
+
+ if (pci_mem_size) {
+ /* Enable */
+
+ end = pci_start_addr + pci_mem_size;
+
+ /* We have to be 16M aligned */
+ if ((pci_start_addr & 0x00ffffff) ||
+ (pci_mem_size & 0x00ffffff))
+ return OPAL_PARAMETER;
+
+ /*
+ * It *looks* like this is the max we can support (we need
+		 * to verify this). Also we are not checking for rollover,
+		 * but then we aren't trying too hard to protect ourselves
+		 * against a completely broken OS.
+ */
+ if (end > 0x0003ffffffffffffull)
+ return OPAL_PARAMETER;
+
+ /*
+ * Put start address bits 49:24 into TVE[52:53]||[0:23]
+ * and end address bits 49:24 into TVE[54:55]||[24:47]
+ * and set TVE[51]
+ */
+ tve = (pci_start_addr << 16) & (0xffffffull << 48);
+ tve |= (pci_start_addr >> 38) & (3ull << 10);
+ tve |= (end >> 8) & (0xfffffful << 16);
+ tve |= (end >> 40) & (3ull << 8);
+ tve |= PPC_BIT(51);
+ } else {
+ /* Disable */
+ tve = 0;
+ }
+
+ npu_ioda_sel(p, NPU_IODA_TBL_TVT, window_id, false);
+ out_be64(p->at_regs + NPU_IODA_DATA0, tve);
+ p->tve_cache[window_id] = tve;
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t npu_map_pe_dma_window(struct phb *phb,
+ uint64_t pe_number,
+ uint16_t window_id,
+ uint16_t tce_levels,
+ uint64_t tce_table_addr,
+ uint64_t tce_table_size,
+ uint64_t tce_page_size)
+{
+ struct npu *p = phb_to_npu(phb);
+ uint64_t tts_encoded;
+ uint64_t data64 = 0;
+
+ /* Sanity check. Each PE has one corresponding TVE */
+ if (pe_number >= NPU_NUM_OF_PES ||
+ window_id != pe_number)
+ return OPAL_PARAMETER;
+
+	/* Special case: a zero TCE table size is used to disable
+ * the TVE.
+ */
+ if (!tce_table_size) {
+ npu_ioda_sel(p, NPU_IODA_TBL_TVT, window_id, false);
+ out_be64(p->at_regs + NPU_IODA_DATA0, 0ul);
+ p->tve_cache[window_id] = 0ul;
+ return OPAL_SUCCESS;
+ }
+
+ /* Additional arguments validation */
+ if (tce_levels < 1 ||
+ tce_levels > 4 ||
+ !is_pow2(tce_table_size) ||
+ tce_table_size < 0x1000)
+ return OPAL_PARAMETER;
+
+ /* TCE table size */
+ data64 = SETFIELD(NPU_IODA_TVT_TTA, 0ul, tce_table_addr >> 12);
+ tts_encoded = ilog2(tce_table_size) - 11;
+ if (tts_encoded > 39)
+ return OPAL_PARAMETER;
+ data64 = SETFIELD(NPU_IODA_TVT_SIZE, data64, tts_encoded);
+
+ /* TCE page size */
+ switch (tce_page_size) {
+ case 0x10000: /* 64K */
+ data64 = SETFIELD(NPU_IODA_TVT_PSIZE, data64, 5);
+ break;
+ case 0x1000000: /* 16M */
+ data64 = SETFIELD(NPU_IODA_TVT_PSIZE, data64, 13);
+ break;
+ case 0x10000000: /* 256M */
+ data64 = SETFIELD(NPU_IODA_TVT_PSIZE, data64, 17);
+ break;
+ case 0x1000: /* 4K */
+ default:
+ data64 = SETFIELD(NPU_IODA_TVT_PSIZE, data64, 1);
+ }
+
+ /* Number of levels */
+ data64 = SETFIELD(NPU_IODA_TVT_LEVELS, data64, tce_levels - 1);
+
+ /* Update to hardware */
+ npu_ioda_sel(p, NPU_IODA_TBL_TVT, window_id, false);
+ out_be64(p->at_regs + NPU_IODA_DATA0, data64);
+ p->tve_cache[window_id] = data64;
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t npu_set_pe(struct phb *phb,
+ uint64_t pe_number,
+ uint64_t bdfn,
+ uint8_t bcompare,
+ uint8_t dcompare,
+ uint8_t fcompare,
+ uint8_t action)
+{
+ struct npu *p = phb_to_npu(phb);
+ struct npu_dev *dev;
+ uint32_t link_idx;
+ uint64_t *data64;
+
+ /* Sanity check */
+ if (action != OPAL_MAP_PE &&
+ action != OPAL_UNMAP_PE)
+ return OPAL_PARAMETER;
+ if (pe_number >= NPU_NUM_OF_PES)
+ return OPAL_PARAMETER;
+
+	/* All emulated PCI devices are hooked to the root bus, whose
+ * bus number is zero.
+ */
+ dev = bdfn_to_npu_dev(p, bdfn);
+ if (PCI_BUS_NUM(bdfn) || !dev)
+ return OPAL_PARAMETER;
+
+ link_idx = dev->index;
+ dev->pe_number = pe_number;
+
+ /* Separate links will be mapped to different PEs */
+ if (bcompare != OpalPciBusAll ||
+ dcompare != OPAL_COMPARE_RID_DEVICE_NUMBER ||
+ fcompare != OPAL_COMPARE_RID_FUNCTION_NUMBER)
+ return OPAL_UNSUPPORTED;
+
+ /* Map the link to the corresponding PE */
+ data64 = &p->pce_cache[link_idx];
+ if (action == OPAL_MAP_PE)
+ *data64 = SETFIELD(NPU_IODA_PCT_PE, *data64,
+ pe_number);
+ else
+ *data64 = SETFIELD(NPU_IODA_PCT_PE, *data64,
+ NPU_NUM_OF_PES);
+
+ *data64 |= NPU_IODA_PCT_LINK_ENABLED;
+
+ npu_ioda_sel(p, NPU_IODA_TBL_PCT, link_idx, false);
+ out_be64(p->at_regs + NPU_IODA_DATA0, *data64);
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t npu_get_link_state(struct pci_slot *slot __unused, uint8_t *val)
+{
+ /* As we're emulating all PCI stuff, the link bandwidth
+	 * isn't a big deal anyway.
+ */
+ *val = OPAL_SHPC_LINK_UP_x1;
+ return OPAL_SUCCESS;
+}
+
+static int64_t npu_get_power_state(struct pci_slot *slot __unused, uint8_t *val)
+{
+ *val = PCI_SLOT_POWER_ON;
+ return OPAL_SUCCESS;
+}
+
+static int64_t npu_hreset(struct pci_slot *slot __unused)
+{
+ prlog(PR_DEBUG, "NPU: driver should call reset procedure here\n");
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t npu_freset(struct pci_slot *slot __unused)
+{
+	/* FIXME: PHB fundamental reset, which needs to be
+ * figured out later. It's used by EEH recovery
+ * upon fenced AT.
+ */
+ return OPAL_SUCCESS;
+}
+
+static struct pci_slot *npu_slot_create(struct phb *phb)
+{
+ struct pci_slot *slot;
+
+ slot = pci_slot_alloc(phb, NULL);
+ if (!slot)
+ return slot;
+
+ /* Elementary functions */
+ slot->ops.get_presence_state = NULL;
+ slot->ops.get_link_state = npu_get_link_state;
+ slot->ops.get_power_state = npu_get_power_state;
+ slot->ops.get_attention_state = NULL;
+ slot->ops.get_latch_state = NULL;
+ slot->ops.set_power_state = NULL;
+ slot->ops.set_attention_state = NULL;
+
+ slot->ops.prepare_link_change = NULL;
+ slot->ops.poll_link = NULL;
+ slot->ops.hreset = npu_hreset;
+ slot->ops.freset = npu_freset;
+ slot->ops.creset = NULL;
+
+ return slot;
+}
+
+static int64_t npu_freeze_status(struct phb *phb,
+ uint64_t pe_number __unused,
+ uint8_t *freeze_state,
+ uint16_t *pci_error_type __unused,
+ uint16_t *severity __unused)
+{
+	/* FIXME: When it's called by the skiboot PCI config accessor,
+	 * the PE number is fixed to 0, which is incorrect. We need to
+	 * introduce another PHB callback to translate it. For now,
+ * it keeps the skiboot PCI enumeration going.
+ */
+ struct npu *p = phb_to_npu(phb);
+ if (p->fenced)
+ *freeze_state = OPAL_EEH_STOPPED_MMIO_DMA_FREEZE;
+ else
+ *freeze_state = OPAL_EEH_STOPPED_NOT_FROZEN;
+ return OPAL_SUCCESS;
+}
+
+static int64_t npu_eeh_next_error(struct phb *phb,
+ uint64_t *first_frozen_pe,
+ uint16_t *pci_error_type,
+ uint16_t *severity)
+{
+ struct npu *p = phb_to_npu(phb);
+ int i;
+ uint64_t result = 0;
+ *first_frozen_pe = -1;
+ *pci_error_type = OPAL_EEH_NO_ERROR;
+ *severity = OPAL_EEH_SEV_NO_ERROR;
+
+ if (p->fenced) {
+ *pci_error_type = OPAL_EEH_PHB_ERROR;
+ *severity = OPAL_EEH_SEV_PHB_FENCED;
+ return OPAL_SUCCESS;
+ }
+
+ npu_ioda_sel(p, NPU_IODA_TBL_PESTB, 0, true);
+ for (i = 0; i < NPU_NUM_OF_PES; i++) {
+ result = in_be64(p->at_regs + NPU_IODA_DATA0);
+ if (result > 0) {
+ *first_frozen_pe = i;
+ *pci_error_type = OPAL_EEH_PE_ERROR;
+ *severity = OPAL_EEH_SEV_PE_ER;
+ break;
+ }
+ }
+
+ return OPAL_SUCCESS;
+}
+
+/* For use in error injection and handling. */
+void npu_set_fence_state(struct npu *p, bool fence)
+{
+ p->fenced = fence;
+
+ if (fence)
+ prlog(PR_ERR, "NPU: Chip %x is fenced, reboot required.\n",
+ p->chip_id);
+ else
+		prlog(PR_WARNING, "NPU: un-fencing is dangerous and should "
+		      "only be used for development purposes.\n");
+}
+
+/* Sets the NPU to trigger an error when a DMA occurs */
+static int64_t npu_err_inject(struct phb *phb, uint64_t pe_number,
+ uint32_t type, uint32_t func __unused,
+ uint64_t addr __unused, uint64_t mask __unused)
+{
+ struct npu *p = phb_to_npu(phb);
+ struct npu_dev *dev = NULL;
+ int i;
+
+ if (pe_number >= NPU_NUM_OF_PES) {
+ prlog(PR_ERR, "NPU: error injection failed, bad PE given\n");
+ return OPAL_PARAMETER;
+ }
+
+ for (i = 0; i < p->total_devices; i++) {
+ if (p->devices[i].pe_number == pe_number) {
+ dev = &p->devices[i];
+ break;
+ }
+ }
+
+ if (!dev) {
+ prlog(PR_ERR, "NPU: couldn't find device with PE%llx\n", pe_number);
+ return OPAL_PARAMETER;
+ }
+
+ /* TODO: extend this to conform to OPAL injection standards */
+ if (type > 1) {
+ prlog(PR_ERR, "NPU: invalid error injection type\n");
+ return OPAL_PARAMETER;
+ } else if (type == 1) {
+ /* Emulate fence mode. */
+ npu_set_fence_state(p, true);
+ } else {
+ /* Cause a freeze with an invalid MMIO read. If the BAR is not
+ * enabled, this will checkstop the machine.
+ */
+ npu_dev_bar_update(p->chip_id, &dev->bar, true);
+ in_be64((void *)dev->bar.base);
+ }
+
+ return OPAL_SUCCESS;
+}
+
+static const struct phb_ops npu_ops = {
+ .cfg_read8 = npu_cfg_read8,
+ .cfg_read16 = npu_cfg_read16,
+ .cfg_read32 = npu_cfg_read32,
+ .cfg_write8 = npu_cfg_write8,
+ .cfg_write16 = npu_cfg_write16,
+ .cfg_write32 = npu_cfg_write32,
+ .get_reserved_pe_number = NULL,
+ .device_init = NULL,
+ .phb_final_fixup = npu_phb_final_fixup,
+ .ioda_reset = npu_ioda_reset,
+ .papr_errinjct_reset = NULL,
+ .pci_reinit = NULL,
+ .set_phb_mem_window = NULL,
+ .phb_mmio_enable = NULL,
+ .map_pe_mmio_window = NULL,
+ .map_pe_dma_window = npu_map_pe_dma_window,
+ .map_pe_dma_window_real = npu_map_pe_dma_window_real,
+ .pci_msi_eoi = NULL,
+ .set_xive_pe = NULL,
+ .get_msi_32 = NULL,
+ .get_msi_64 = NULL,
+ .set_pe = npu_set_pe,
+ .set_peltv = NULL,
+ .eeh_freeze_status = npu_freeze_status,
+ .eeh_freeze_clear = NULL,
+ .eeh_freeze_set = NULL,
+ .next_error = npu_eeh_next_error,
+ .err_inject = npu_err_inject,
+ .get_diag_data2 = NULL,
+ .set_capi_mode = NULL,
+ .set_capp_recovery = NULL,
+};
+
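+/*
+ * Carve the NPU MMIO region into per-link BARs. mm_win returns the
+ * range exposed to the kernel (the DLTL BARs) and at_bar returns the
+ * AT region, which skiboot keeps for itself.
+ */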
+static void assign_mmio_bars(uint32_t gcid, uint32_t xscom,
+ struct dt_node *npu_dn, uint64_t mm_win[2],
+ uint64_t at_bar[2])
+{
+ uint64_t mem_start, mem_end;
+ struct npu_dev_bar bar;
+ struct dt_node *link;
+
+ /* Configure BAR selection.
+ *
+ * Currently, each PHY contains 2 links and each link has 2
+ * BARs. The first BAR is assigned to the DLTL region which is
+	 * what the kernel uses. The second BAR is assigned to either
+	 * the PL or AT region, or left unassigned. The PL0/PL1/AT
+	 * MMIO regions are not exposed to the kernel, so we assign
+	 * them at the start of the available memory area followed by
+ * the DLTL regions. So we end up with the following memory
+ * map (assuming we're given a memory region starting at
+ * 0x3fff000000000):
+ *
+ * Link#0-BAR#0: NTL/NDL BAR (128KB) - 0x3fff000420000
+ * Link#0-BAR#1: PL0 BAR ( 2MB) - 0x3fff000000000
+ * Link#1-BAR#0: NTL/NDL BAR (128KB) - 0x3fff000440000
+ * Link#1-BAR#1: AT BAR ( 64KB) - 0x3fff000400000
+ * Link#2-BAR#0: NTL/NDL BAR (128KB) - 0x3fff000460000
+ * Link#2-BAR#1: PL1 BAR ( 2MB) - 0x3fff000200000
+ * Link#3-BAR#0: NTL/NDL BAR (128KB) - 0x3fff000480000
+ * Link#3-BAR#1: UNASSIGNED
+ */
+ xscom_write(gcid, xscom + NPU_AT_SCOM_OFFSET + NX_BAR,
+ 0x0211000043500000UL);
+
+ xscom_read(gcid, npu_link_scom_base(npu_dn, xscom, 0) + NX_MMIO_BAR_0,
+ &mem_start);
+ mem_start = GETFIELD(NX_MMIO_BAR_BASE, mem_start) << 12;
+
+ xscom_read(gcid, npu_link_scom_base(npu_dn, xscom, 5) + NX_MMIO_BAR_0,
+ &mem_end);
+ mem_end = (GETFIELD(NX_MMIO_BAR_BASE, mem_end) << 12) +
+ get_bar_size(mem_end);
+
+ /* PL0 BAR comes first at 0x3fff000000000 */
+ bar.xscom = npu_link_scom_base(npu_dn, xscom, 0) + NX_MMIO_BAR_1;
+ bar.base = mem_start;
+ bar.size = NX_MMIO_PL_SIZE;
+ npu_dev_bar_update(gcid, &bar, true);
+
+ /* PL1 BAR */
+ bar.xscom = npu_link_scom_base(npu_dn, xscom, 4) + NX_MMIO_BAR_1;
+ bar.base += bar.size;
+ bar.size = NX_MMIO_PL_SIZE;
+ npu_dev_bar_update(gcid, &bar, true);
+
+ /* Then the AT BAR */
+ bar.xscom = npu_link_scom_base(npu_dn, xscom, 1) + NX_MMIO_BAR_1;
+ bar.base += bar.size;
+ bar.size = NX_MMIO_AT_SIZE;
+ at_bar[0] = bar.base;
+ at_bar[1] = NX_MMIO_AT_SIZE;
+ npu_dev_bar_update(gcid, &bar, true);
+
+ /* Now we configure all the DLTL BARs. These are the ones
+ * actually exposed to the kernel. */
+ mm_win[0] = bar.base + bar.size;
+ dt_for_each_node(npu_dn, link) {
+ uint32_t index;
+
+ index = dt_prop_get_u32(link, "ibm,npu-link-index");
+ bar.xscom = npu_link_scom_base(npu_dn, xscom, index) +
+ NX_MMIO_BAR_0;
+ bar.base += bar.size;
+ bar.size = NX_MMIO_DL_SIZE;
+ bar.base = ALIGN_UP(bar.base, bar.size);
+ npu_dev_bar_update(gcid, &bar, false);
+ }
+ mm_win[1] = (bar.base + bar.size) - mm_win[0];
+
+	/* If we weren't given enough room to set up all the BARs we
+	 * require, it's better to crash here than to risk creating
+	 * overlapping BARs, which would xstop the machine randomly in
+	 * the future. */
+ assert(bar.base + bar.size <= mem_end);
+}
+
+/* Probe NPU device node and create PCI root device node
+ * accordingly. The NPU device node should specify the number
+ * of links and the xscom base address used to access them.
+ */
+static void npu_probe_phb(struct dt_node *dn)
+{
+ struct dt_node *np;
+ uint32_t gcid, index, phb_index, xscom;
+ uint64_t at_bar[2], mm_win[2];
+ uint32_t links;
+ char *path;
+
+ /* Retrieve chip id */
+ path = dt_get_path(dn);
+ gcid = dt_get_chip_id(dn);
+ index = dt_prop_get_u32(dn, "ibm,npu-index");
+ phb_index = dt_prop_get_u32(dn, "ibm,phb-index");
+ links = dt_prop_get_u32(dn, "ibm,npu-links");
+ prlog(PR_INFO, "Chip %d Found NPU%d (%d links) at %s\n",
+ gcid, index, links, path);
+ free(path);
+
+ /* Retrieve xscom base addr */
+ xscom = dt_get_address(dn, 0, NULL);
+ prlog(PR_INFO, " XSCOM Base: %08x\n", xscom);
+
+ assign_mmio_bars(gcid, xscom, dn, mm_win, at_bar);
+ prlog(PR_INFO, " AT BAR: %016llx (%lldKB)\n",
+ at_bar[0], at_bar[1] / 0x400);
+
+ /* Create PCI root device node */
+ np = dt_new_addr(dt_root, "pciex", at_bar[0]);
+ assert(np);
+
+ dt_add_property_strings(np, "compatible",
+ "ibm,power8-npu-pciex", "ibm,ioda2-npu-phb");
+ dt_add_property_strings(np, "device_type", "pciex");
+ dt_add_property(np, "reg", at_bar, sizeof(at_bar));
+
+ dt_add_property_cells(np, "ibm,phb-index", phb_index);
+ dt_add_property_cells(np, "ibm,npu-index", index);
+ dt_add_property_cells(np, "ibm,chip-id", gcid);
+ dt_add_property_cells(np, "ibm,xscom-base", xscom);
+ dt_add_property_cells(np, "ibm,npcq", dn->phandle);
+ dt_add_property_cells(np, "ibm,links", links);
+ dt_add_property(np, "ibm,mmio-window", mm_win, sizeof(mm_win));
+ dt_add_property_cells(np, "ibm,phb-diag-data-size", 0);
+
+ /* Disable fast reboot - not currently supported */
+ disable_fast_reboot("NVLink device enabled");
+}
+
+static void npu_dev_populate_vendor_cap(struct npu_dev_cap *cap)
+{
+ struct npu_dev *dev = cap->dev;
+ struct pci_virt_device *pvd = dev->pvd;
+ uint32_t offset = cap->start;
+ uint8_t val;
+
+ /* Add length and version information */
+ val = cap->end - cap->start;
+ PCI_VIRT_CFG_INIT_RO(pvd, offset + 2, 1, val);
+ PCI_VIRT_CFG_INIT_RO(pvd, offset + 3, 1, OPAL_NPU_VERSION);
+ offset += 4;
+
+	/* Defaults when the trap can't handle the read/write (e.g. due
+ * to reading/writing less than 4 bytes). */
+ val = 0x0;
+ PCI_VIRT_CFG_INIT_RO(pvd, offset, 4, val);
+ PCI_VIRT_CFG_INIT_RO(pvd, offset + 4, 4, val);
+
+ /* Create a trap for AT/PL procedures */
+ pci_virt_add_filter(pvd, offset, 8,
+ PCI_REG_FLAG_READ | PCI_REG_FLAG_WRITE,
+ npu_dev_procedure, NULL);
+ offset += 8;
+
+ PCI_VIRT_CFG_INIT_RO(pvd, offset, 1, dev->index);
+}
+
+static void npu_dev_populate_pcie_cap(struct npu_dev_cap *cap)
+{
+ struct npu_dev *dev = cap->dev;
+ struct pci_virt_device *pvd = dev->pvd;
+ uint32_t base = cap->start;
+ uint32_t val;
+
+ /* Sanity check on capability ID */
+ if (cap->id != PCI_CFG_CAP_ID_EXP) {
+ prlog(PR_NOTICE, "%s: Invalid capability ID %d (%d)\n",
+ __func__, cap->id, PCI_CFG_CAP_ID_EXP);
+ return;
+ }
+
+ /* Sanity check on spanned registers */
+ if ((cap->end - cap->start) < PCIE_CAP_START) {
+ prlog(PR_NOTICE, "%s: Invalid reg region [%x, %x] for cap %d\n",
+ __func__, cap->start, cap->end, cap->id);
+ return;
+ }
+
+ /* 0x00 - ID/PCIE capability */
+ val = cap->id;
+ val |= ((0x2 << 16) | (PCIE_TYPE_ENDPOINT << 20));
+ PCI_VIRT_CFG_INIT_RO(pvd, base, 4, val);
+
+ /* 0x04 - Device capability
+ *
+	 * We should support FLR. Otherwise, passing the device
+	 * through to userland via the Linux VFIO infrastructure
+	 * might be a problem.
+ */
+ val = ((PCIE_MPSS_128) |
+ (PCIE_PHANTOM_NONE << 3) |
+ (PCIE_L0SL_MAX_NO_LIMIT << 6) |
+ (PCIE_L1L_MAX_NO_LIMIT << 9) |
+ (PCICAP_EXP_DEVCAP_FUNC_RESET));
+ PCI_VIRT_CFG_INIT_RO(pvd, base + PCICAP_EXP_DEVCAP, 4, val);
+
+ pci_virt_add_filter(pvd, base + PCICAP_EXP_DEVCTL, 2,
+ PCI_REG_FLAG_WRITE,
+ npu_dev_cfg_exp_devcap, NULL);
+
+ /* 0x08 - Device control and status */
+ PCI_VIRT_CFG_INIT(pvd, base + PCICAP_EXP_DEVCTL, 4, 0x00002810,
+ 0xffff0000, 0x000f0000);
+
+ /* 0x0c - Link capability */
+ val = (PCIE_LSPEED_VECBIT_2 | (PCIE_LWIDTH_1X << 4));
+ PCI_VIRT_CFG_INIT_RO(pvd, base + PCICAP_EXP_LCAP, 4, val);
+
+ /* 0x10 - Link control and status */
+ PCI_VIRT_CFG_INIT(pvd, base + PCICAP_EXP_LCTL, 4, 0x00130000,
+ 0xfffff000, 0xc0000000);
+
+ /* 0x14 - Slot capability */
+ PCI_VIRT_CFG_INIT_RO(pvd, base + PCICAP_EXP_SLOTCAP, 4, 0x00000000);
+
+ /* 0x18 - Slot control and status */
+ PCI_VIRT_CFG_INIT_RO(pvd, base + PCICAP_EXP_SLOTCTL, 4, 0x00000000);
+
+ /* 0x1c - Root control and capability */
+ PCI_VIRT_CFG_INIT(pvd, base + PCICAP_EXP_RC, 4, 0x00000000,
+ 0xffffffe0, 0x00000000);
+
+ /* 0x20 - Root status */
+ PCI_VIRT_CFG_INIT(pvd, base + PCICAP_EXP_RSTAT, 4, 0x00000000,
+ 0xffffffff, 0x00010000);
+
+ /* 0x24 - Device capability 2 */
+ PCI_VIRT_CFG_INIT_RO(pvd, base + PCIECAP_EXP_DCAP2, 4, 0x00000000);
+
+ /* 0x28 - Device Control and status 2 */
+ PCI_VIRT_CFG_INIT(pvd, base + PCICAP_EXP_DCTL2, 4, 0x00070000,
+ 0xffff0000, 0x00000000);
+
+ /* 0x2c - Link capability 2 */
+ PCI_VIRT_CFG_INIT_RO(pvd, base + PCICAP_EXP_LCAP2, 4, 0x00000007);
+
+ /* 0x30 - Link control and status 2 */
+ PCI_VIRT_CFG_INIT(pvd, base + PCICAP_EXP_LCTL2, 4, 0x00000003,
+ 0xffff0000, 0x00200000);
+
+ /* 0x34 - Slot capability 2 */
+ PCI_VIRT_CFG_INIT_RO(pvd, base + PCICAP_EXP_SCAP2, 4, 0x00000000);
+
+ /* 0x38 - Slot control and status 2 */
+ PCI_VIRT_CFG_INIT_RO(pvd, base + PCICAP_EXP_SCTL2, 4, 0x00000000);
+}
+
+static struct npu_dev_cap *npu_dev_create_capability(struct npu_dev *dev,
+ void (*populate)(struct npu_dev_cap *),
+ uint16_t id,
+ uint16_t start,
+ uint16_t end)
+{
+ struct npu_dev_cap *cap;
+
+	/* Check if the capability already exists */
+ cap = npu_dev_find_capability(dev, id);
+ if (cap)
+ return cap;
+
+ /* Allocate new one */
+ cap = zalloc(sizeof(struct npu_dev_cap));
+ assert(cap);
+
+ /* Put it into the pool */
+ cap->id = id;
+ cap->start = start;
+ cap->end = end;
+ cap->dev = dev;
+ cap->populate = populate;
+ list_add_tail(&dev->capabilities, &cap->link);
+
+ return cap;
+}
+
+static struct npu_dev_cap *npu_dev_find_capability(struct npu_dev *dev,
+ uint16_t id)
+{
+ struct npu_dev_cap *cap;
+
+ list_for_each(&dev->capabilities, cap, link) {
+ if (cap->id == id)
+ return cap;
+ }
+
+ return NULL;
+}
+
+/*
+ * All capabilities should be put into the device capability
+ * list in ascending order of register offset, for easy
+ * access at a later point.
+ */
+static void npu_dev_create_capabilities(struct npu_dev *dev)
+{
+ list_head_init(&dev->capabilities);
+
+ /* PCI express capability */
+ npu_dev_create_capability(dev, npu_dev_populate_pcie_cap,
+ PCI_CFG_CAP_ID_EXP, PCIE_CAP_START,
+ PCIE_CAP_END);
+
+ /* Vendor specific capability */
+ npu_dev_create_capability(dev, npu_dev_populate_vendor_cap,
+ PCI_CFG_CAP_ID_VENDOR, VENDOR_CAP_START,
+ VENDOR_CAP_END);
+}
+
+static void npu_dev_create_cfg(struct npu_dev *dev)
+{
+ struct pci_virt_device *pvd = dev->pvd;
+ struct npu_dev_cap *cap;
+ uint32_t offset;
+ uint32_t last_cap_offset;
+
+ /* 0x00 - Vendor/Device ID */
+ PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_VENDOR_ID, 4, 0x04ea1014);
+
+ /* 0x04 - Command/Status
+ *
+ * Create one trap to trace toggling memory BAR enable bit
+ */
+ PCI_VIRT_CFG_INIT(pvd, PCI_CFG_CMD, 4, 0x00100000, 0xffb802b8,
+ 0xf9000000);
+
+ pci_virt_add_filter(pvd, PCI_CFG_CMD, 1, PCI_REG_FLAG_WRITE,
+ npu_dev_cfg_write_cmd, NULL);
+
+ /* 0x08 - Rev/Class/Cache */
+ PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_REV_ID, 4, 0x06800100);
+
+ /* 0x0c - CLS/Latency Timer/Header/BIST */
+ PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_CACHE_LINE_SIZE, 4, 0x00800000);
+
+ /* 0x10 - BARs, always 64-bits non-prefetchable
+ *
+ * Each emulated device represents one link and therefore
+ * there is one BAR for the associated DLTL region.
+ */
+
+ /* Low 32-bits */
+ PCI_VIRT_CFG_INIT(pvd, PCI_CFG_BAR0, 4,
+ (dev->bar.base & 0xfffffff0) | dev->bar.flags,
+ 0x0000000f, 0x00000000);
+
+ /* High 32-bits */
+ PCI_VIRT_CFG_INIT(pvd, PCI_CFG_BAR1, 4, (dev->bar.base >> 32),
+ 0x00000000, 0x00000000);
+
+ /*
+	 * Create trap. Writing 0xFFs to the BAR registers should be
+	 * trapped and return the size on the next read.
+ */
+ pci_virt_add_filter(pvd, PCI_CFG_BAR0, 8,
+ PCI_REG_FLAG_READ | PCI_REG_FLAG_WRITE,
+ npu_dev_cfg_bar, &dev->bar);
+
+ /* 0x18/1c/20/24 - Disabled BAR#2/3/4/5
+ *
+ * Mark those BARs readonly so that 0x0 will be returned when
+ * probing the length and the BARs will be skipped.
+ */
+ PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_BAR2, 4, 0x00000000);
+ PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_BAR3, 4, 0x00000000);
+ PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_BAR4, 4, 0x00000000);
+ PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_BAR5, 4, 0x00000000);
+
+ /* 0x28 - Cardbus CIS pointer */
+ PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_CARDBUS_CIS, 4, 0x00000000);
+
+ /* 0x2c - Subsystem ID */
+ PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_SUBSYS_VENDOR_ID, 4, 0x00000000);
+
+ /* 0x30 - ROM BAR
+ *
+ * Force its size to be zero so that the kernel will skip
+	 * probing the ROM BAR. We don't need to emulate the ROM BAR.
+ */
+ PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_ROMBAR, 4, 0xffffffff);
+
+ /* 0x34 - PCI Capability
+ *
+ * By default, we don't have any capabilities
+ */
+ PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_CAP, 4, 0x00000000);
+
+ last_cap_offset = PCI_CFG_CAP - 1;
+ list_for_each(&dev->capabilities, cap, link) {
+ offset = cap->start;
+
+ /* Initialize config space for the capability */
+ if (cap->populate)
+ cap->populate(cap);
+
+ /* Add capability header */
+ PCI_VIRT_CFG_INIT_RO(pvd, offset, 2, cap->id);
+
+ /* Update the next capability pointer */
+ PCI_VIRT_CFG_NORMAL_WR(pvd, last_cap_offset + 1, 1, offset);
+
+ last_cap_offset = offset;
+ }
+
+ /* 0x38 - Reserved */
+ PCI_VIRT_CFG_INIT_RO(pvd, 0x38, 4, 0x00000000);
+
+ /* 0x3c - INT line/pin/Minimal grant/Maximal latency */
+ if (!(dev->index % 2))
+ PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_INT_LINE, 4, 0x00000100);
+ else
+ PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_INT_LINE, 4, 0x00000200);
+}
+
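+/*
+ * Allocate a bdfn for a link: each link group maps to one PCI device
+ * number and the links within a group become consecutive functions.
+ * For example, the two links of group 1 end up as 0:01.0 (bdfn 0x08)
+ * and 0:01.1 (bdfn 0x09).
+ */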
+static uint32_t npu_allocate_bdfn(struct npu *p, uint32_t group)
+{
+ int i;
+ int bdfn = (group << 3);
+
+ for (i = 0; i < p->total_devices; i++) {
+ if ((p->devices[i].pvd->bdfn & 0xf8) == (bdfn & 0xf8))
+ bdfn++;
+ }
+
+ return bdfn;
+}
+
+static void npu_create_devices(struct dt_node *dn, struct npu *p)
+{
+ struct npu_dev *dev;
+ struct dt_node *npu_dn, *link;
+ uint32_t bdfn, npu_phandle, index = 0;
+ uint64_t buid_reg;
+ uint64_t lsisrcid;
+ uint64_t buid;
+
+
+ /* The bits in the LSI ID Base register are always compared and
+ * can be set to 0 in the buid base and mask fields. The
+ * buid (bus unit id) is the full irq minus the last 4 bits. */
+ lsisrcid = GETFIELD(NPU_LSI_SRC_ID_BASE, NPU_LSI_SRC_ID_BASE);
+ buid = p8_chip_irq_block_base(p->chip_id, P8_IRQ_BLOCK_MISC) >> 4;
+
+ buid_reg = SETFIELD(NP_IRQ_LEVELS, NP_BUID_ENABLE, ~0);
+ buid_reg = SETFIELD(NP_BUID_MASK, buid_reg, ~lsisrcid);
+ buid_reg = SETFIELD(NP_BUID_BASE, buid_reg, (buid & ~lsisrcid));
+
+	/* Get the npu node whose links we expand here into PCI-like
+	 * devices attached to our emulated phb. */
+ npu_phandle = dt_prop_get_u32(dn, "ibm,npcq");
+ npu_dn = dt_find_by_phandle(dt_root, npu_phandle);
+ assert(npu_dn);
+
+ /* Walk the link@x nodes to initialize devices */
+ p->total_devices = 0;
+ p->phb.scan_map = 0;
+ list_head_init(&p->phb.virt_devices);
+ dt_for_each_compatible(npu_dn, link, "ibm,npu-link") {
+ struct npu_dev_bar *bar;
+ uint32_t group_id;
+ uint64_t val;
+
+ dev = &p->devices[index];
+ dev->index = dt_prop_get_u32(link, "ibm,npu-link-index");
+ dev->xscom = npu_link_scom_base(npu_dn, p->xscom_base,
+ dev->index);
+
+ dev->npu = p;
+ dev->dt_node = link;
+
+ /* We don't support MMIO PHY access yet */
+ dev->pl_base = NULL;
+
+ group_id = dt_prop_get_u32(link, "ibm,npu-group-id");
+ bdfn = npu_allocate_bdfn(p, group_id);
+
+ /* This must be done after calling
+ * npu_allocate_bdfn() */
+ p->total_devices++;
+ p->phb.scan_map |= 0x1 << ((bdfn & 0xf8) >> 3);
+
+ dev->pl_xscom_base = dt_prop_get_u64(link, "ibm,npu-phy");
+ dev->lane_mask = dt_prop_get_u32(link, "ibm,npu-lane-mask");
+
+ /* Setup BUID/ISRN */
+ xscom_write(p->chip_id, dev->xscom + NX_NP_BUID, buid_reg);
+
+ /* Create PCI virtual device */
+ dev->pvd = pci_virt_add_device(&p->phb, bdfn, NPU_DEV_CFG_SIZE, dev);
+ assert(dev->pvd);
+ bar = &dev->bar;
+ bar->flags = (PCI_CFG_BAR_TYPE_MEM |
+ PCI_CFG_BAR_MEM64);
+
+ /* Update BAR info */
+ bar->xscom = dev->xscom + NX_MMIO_BAR_0;
+ xscom_read(p->chip_id, bar->xscom, &val);
+ bar->base = GETFIELD(NX_MMIO_BAR_BASE, val) << 12;
+ bar->size = get_bar_size(val);
+
+ /*
+ * The config space is initialised with the BARs
+ * disabled, so make sure it is actually disabled in
+ * hardware.
+ */
+ npu_dev_bar_update(p->chip_id, bar, false);
+
+ /* Initialize capabilities */
+ npu_dev_create_capabilities(dev);
+
+ /* Initialize config space */
+ npu_dev_create_cfg(dev);
+
+ index++;
+ }
+}
+
+static void npu_add_phb_properties(struct npu *p)
+{
+ struct dt_node *np = p->phb.dt_node;
+ uint32_t icsp = get_ics_phandle();
+ uint64_t tkill, mm_base, mm_size;
+ uint32_t base_lsi = p->base_lsi;
+ uint32_t map[] = {
+ /* Dev 0 INT#A (used by fn0) */
+ 0x0000, 0x0, 0x0, 0x1, icsp, base_lsi + NPU_LSI_INT_DL0, 1,
+ /* Dev 0 INT#B (used by fn1) */
+ 0x0000, 0x0, 0x0, 0x2, icsp, base_lsi + NPU_LSI_INT_DL1, 1,
+ /* Dev 1 INT#A (used by fn0) */
+ 0x0800, 0x0, 0x0, 0x1, icsp, base_lsi + NPU_LSI_INT_DL2, 1,
+ /* Dev 1 INT#B (used by fn1) */
+ 0x0800, 0x0, 0x0, 0x2, icsp, base_lsi + NPU_LSI_INT_DL3, 1,
+ };
+ /* Mask is bus, device and INT# */
+ uint32_t mask[] = {0xf800, 0x0, 0x0, 0x7};
+ char slotbuf[32];
+
+ /* Add various properties that HB doesn't have to
+ * add, some of them simply because they result from
+ * policy decisions made in skiboot rather than in HB
+ * such as the MMIO windows going to PCI, interrupts,
+ * etc.
+ */
+ dt_add_property_cells(np, "#address-cells", 3);
+ dt_add_property_cells(np, "#size-cells", 2);
+ dt_add_property_cells(np, "#interrupt-cells", 1);
+ dt_add_property_cells(np, "bus-range", 0, 0xff);
+ dt_add_property_cells(np, "clock-frequency", 0x200, 0);
+ dt_add_property_cells(np, "interrupt-parent", icsp);
+
+ /* DLPL Interrupts, we don't use the standard swizzle */
+ p->phb.lstate.int_size = 0;
+ dt_add_property(np, "interrupt-map", map, sizeof(map));
+ dt_add_property(np, "interrupt-map-mask", mask, sizeof(mask));
+
+ /* NPU PHB properties */
+	/* TODO: Due to an erratum, TCE KILL only works when DMA traffic
+	 * has been stopped. We need to implement the workaround,
+	 * which is to do a TCE kill all instead. */
+ tkill = cleanup_addr((uint64_t)p->at_regs) + NPU_TCE_KILL;
+ dt_add_property_cells(np, "ibm,opal-num-pes",
+ NPU_NUM_OF_PES);
+ dt_add_property_cells(np, "ibm,opal-reserved-pe",
+ 0);
+ dt_add_property_u64(np, "ibm,opal-tce-kill", tkill);
+
+	/* The memory window is exposed as a 32-bit non-prefetchable
+	 * one because the 64-bit prefetchable window is treated
+	 * specially by the kernel.
+ */
+ mm_base = p->mm_base;
+ mm_size = p->mm_size;
+ dt_add_property_cells(np, "ranges", 0x02000000,
+ hi32(mm_base), lo32(mm_base),
+ hi32(mm_base), lo32(mm_base),
+ hi32(mm_size), lo32(mm_size));
+
+ /* Set the slot location on the NPU PHB. This PHB can contain
+ * devices that correlate with multiple physical slots, so
+ * present the chip ID instead.
+ */
+ snprintf(slotbuf, sizeof(slotbuf), "NPU Chip %d", p->chip_id);
+ dt_add_property_string(np, "ibm,io-base-loc-code", slotbuf);
+}
+
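+/*
+ * Instantiate the emulated PHB described by one of the pciex nodes
+ * created in npu_probe_phb(): allocate the npu and per-link device
+ * structures, register the PHB and its LSIs, then initialize the
+ * hardware.
+ */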
+static void npu_create_phb(struct dt_node *dn)
+{
+ const struct dt_property *prop;
+ struct npu *p;
+ struct pci_slot *slot;
+ uint32_t links;
+ void *pmem;
+
+ /* Retrieve number of devices */
+ links = dt_prop_get_u32(dn, "ibm,links");
+ pmem = zalloc(sizeof(struct npu) + links * sizeof(struct npu_dev));
+ assert(pmem);
+
+ /* Populate PHB */
+ p = pmem;
+ p->index = dt_prop_get_u32(dn, "ibm,npu-index");
+ p->chip_id = dt_prop_get_u32(dn, "ibm,chip-id");
+ p->xscom_base = dt_prop_get_u32(dn, "ibm,xscom-base");
+ p->total_devices = links;
+
+ /* TODO: When hardware fences are implemented, detect them here */
+ p->fenced = false;
+
+ /* This is the AT base */
+ p->at_xscom = p->xscom_base + NPU_AT_SCOM_OFFSET;
+ p->at_regs = (void *)dt_get_address(dn, 0, NULL);
+
+ prop = dt_require_property(dn, "ibm,mmio-window", -1);
+ assert(prop->len >= (2 * sizeof(uint64_t)));
+ p->mm_base = ((const uint64_t *)prop->prop)[0];
+ p->mm_size = ((const uint64_t *)prop->prop)[1];
+
+ p->devices = pmem + sizeof(struct npu);
+
+ /* Interrupt */
+ p->base_lsi = p8_chip_irq_block_base(p->chip_id, P8_IRQ_BLOCK_MISC) +
+ NPU_LSI_IRQ_MIN;
+
+ /* Generic PHB */
+ p->phb.dt_node = dn;
+ p->phb.ops = &npu_ops;
+ p->phb.phb_type = phb_type_pcie_v3;
+
+ /* Populate devices */
+ npu_create_devices(dn, p);
+
+ /* Populate extra properties */
+ npu_add_phb_properties(p);
+
+ /* Create PHB slot */
+ slot = npu_slot_create(&p->phb);
+	if (!slot) {
+ /**
+ * @fwts-label NPUCannotCreatePHBSlot
+ * @fwts-advice Firmware probably ran out of memory creating
+ * NPU slot. NVLink functionality could be broken.
+ */
+ prlog(PR_ERR, "NPU: Cannot create PHB slot\n");
+ }
+
+ /* Register PHB */
+ pci_register_phb(&p->phb, OPAL_DYNAMIC_PHB_ID);
+
+ /* Initialize IODA cache */
+ npu_ioda_init(p);
+
+ /* Register interrupt source */
+ npu_register_irq(p);
+
+ /* Initialize hardware */
+ npu_hw_init(p);
+}
+
+void probe_npu(void)
+{
+ struct dt_node *np;
+
+ /* Scan NPU XSCOM nodes */
+ dt_for_each_compatible(dt_root, np, "ibm,power8-npu")
+ npu_probe_phb(np);
+
+ /* Scan newly created PHB nodes */
+ dt_for_each_compatible(dt_root, np, "ibm,power8-npu-pciex")
+ npu_create_phb(np);
+}
diff --git a/roms/skiboot/hw/npu2-common.c b/roms/skiboot/hw/npu2-common.c
new file mode 100644
index 000000000..3bc9bcee6
--- /dev/null
+++ b/roms/skiboot/hw/npu2-common.c
@@ -0,0 +1,681 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/* Copyright 2013-2019 IBM Corp. */
+
+#include <skiboot.h>
+#include <xscom.h>
+#include <pci.h>
+#include <npu2.h>
+#include <npu2-regs.h>
+#include <bitutils.h>
+#include <nvram.h>
+#include <i2c.h>
+#include <interrupts.h>
+#include <xive.h>
+
+#define NPU2_IRQ_BASE_SHIFT 13
+#define NPU2_N_DL_IRQS 35
+#define NPU2_N_DL_IRQS_ALIGN 64
+
+/*
+ * We use the indirect method because it uses the same addresses as
+ * the MMIO offsets (NPU RING)
+ */
+static void npu2_scom_set_addr(uint64_t gcid, uint64_t scom_base,
+ uint64_t addr, uint64_t size)
+{
+ addr = SETFIELD(NPU2_MISC_DA_ADDR, 0ull, addr);
+ addr = SETFIELD(NPU2_MISC_DA_LEN, addr, size);
+ xscom_write(gcid, scom_base + NPU2_MISC_SCOM_IND_SCOM_ADDR, addr);
+}
+
+void npu2_scom_write(uint64_t gcid, uint64_t scom_base,
+ uint64_t reg, uint64_t size,
+ uint64_t val)
+{
+ npu2_scom_set_addr(gcid, scom_base, reg, size);
+ xscom_write(gcid, scom_base + NPU2_MISC_SCOM_IND_SCOM_DATA, val);
+}
+
+uint64_t npu2_scom_read(uint64_t gcid, uint64_t scom_base,
+ uint64_t reg, uint64_t size)
+{
+ uint64_t val;
+
+ npu2_scom_set_addr(gcid, scom_base, reg, size);
+ xscom_read(gcid, scom_base + NPU2_MISC_SCOM_IND_SCOM_DATA, &val);
+
+ return val;
+}
+
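+/*
+ * 4-byte NPU2 registers are carried in the upper word of the 8-byte
+ * indirect SCOM data, hence the << 32 / >> 32 in the helpers below.
+ */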
+void npu2_write_4b(struct npu2 *p, uint64_t reg, uint32_t val)
+{
+ npu2_scom_write(p->chip_id, p->xscom_base, reg, NPU2_MISC_DA_LEN_4B,
+ (uint64_t)val << 32);
+}
+
+uint32_t npu2_read_4b(struct npu2 *p, uint64_t reg)
+{
+ return npu2_scom_read(p->chip_id, p->xscom_base, reg,
+ NPU2_MISC_DA_LEN_4B) >> 32;
+}
+
+void npu2_write(struct npu2 *p, uint64_t reg, uint64_t val)
+{
+ npu2_scom_write(p->chip_id, p->xscom_base, reg, NPU2_MISC_DA_LEN_8B, val);
+}
+
+uint64_t npu2_read(struct npu2 *p, uint64_t reg)
+{
+ return npu2_scom_read(p->chip_id, p->xscom_base, reg, NPU2_MISC_DA_LEN_8B);
+}
+
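+/*
+ * Read-modify-write helpers: only the bits set in 'mask' are updated;
+ * all other bits in the register are preserved.
+ */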
+void npu2_write_mask(struct npu2 *p, uint64_t reg, uint64_t val, uint64_t mask)
+{
+ uint64_t new_val;
+
+ new_val = npu2_read(p, reg);
+ new_val &= ~mask;
+ new_val |= val & mask;
+ npu2_scom_write(p->chip_id, p->xscom_base, reg, NPU2_MISC_DA_LEN_8B, new_val);
+}
+
+void npu2_write_mask_4b(struct npu2 *p, uint64_t reg, uint32_t val, uint32_t mask)
+{
+ uint32_t new_val;
+
+ new_val = npu2_read_4b(p, reg);
+ new_val &= ~mask;
+ new_val |= val & mask;
+ npu2_scom_write(p->chip_id, p->xscom_base, reg, NPU2_MISC_DA_LEN_4B,
+ (uint64_t)new_val << 32);
+}
+
+typedef struct {
+ const char *name;
+ uint32_t block;
+ uint32_t offset;
+} npu2_scom_dump_t;
+
+static npu2_scom_dump_t npu2_scom_dump_global[] = {
+ /* CQ State Machine */
+ { "CS.SM0.MISC.CERR_MESSAGE0", NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_MSG0 },
+ { "CS.SM1.MISC.CERR_MESSAGE0", NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_MSG0 },
+ { "CS.SM2.MISC.CERR_MESSAGE0", NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_MSG0 },
+ { "CS.SM3.MISC.CERR_MESSAGE0", NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_MSG0 },
+
+ { "CS.SM0.MISC.CERR_MESSAGE1", NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_MSG1 },
+ { "CS.SM1.MISC.CERR_MESSAGE1", NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_MSG1 },
+ { "CS.SM2.MISC.CERR_MESSAGE1", NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_MSG1 },
+ { "CS.SM3.MISC.CERR_MESSAGE1", NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_MSG1 },
+
+ { "CS.SM0.MISC.CERR_MESSAGE2", NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_MSG2 },
+ { "CS.SM1.MISC.CERR_MESSAGE2", NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_MSG2 },
+ { "CS.SM2.MISC.CERR_MESSAGE2", NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_MSG2 },
+ { "CS.SM3.MISC.CERR_MESSAGE2", NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_MSG2 },
+
+ { "CS.SM0.MISC.CERR_MESSAGE3", NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_MSG3 },
+ { "CS.SM1.MISC.CERR_MESSAGE3", NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_MSG3 },
+ { "CS.SM2.MISC.CERR_MESSAGE3", NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_MSG3 },
+ { "CS.SM3.MISC.CERR_MESSAGE3", NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_MSG3 },
+
+ { "CS.SM0.MISC.CERR_MESSAGE4", NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_MSG4 },
+ { "CS.SM1.MISC.CERR_MESSAGE4", NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_MSG4 },
+ { "CS.SM2.MISC.CERR_MESSAGE4", NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_MSG4 },
+ { "CS.SM3.MISC.CERR_MESSAGE4", NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_MSG4 },
+
+ { "CS.SM0.MISC.CERR_MESSAGE5", NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_MSG5 },
+ { "CS.SM1.MISC.CERR_MESSAGE5", NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_MSG5 },
+ { "CS.SM2.MISC.CERR_MESSAGE5", NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_MSG5 },
+ { "CS.SM3.MISC.CERR_MESSAGE5", NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_MSG5 },
+
+ { "CS.SM0.MISC.CERR_MESSAGE6", NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_MSG6 },
+ { "CS.SM1.MISC.CERR_MESSAGE6", NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_MSG6 },
+ { "CS.SM2.MISC.CERR_MESSAGE6", NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_MSG6 },
+ { "CS.SM3.MISC.CERR_MESSAGE6", NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_MSG6 },
+
+ { "CS.SM0.MISC.CERR_FIRST0", NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_FIRST0 },
+ { "CS.SM1.MISC.CERR_FIRST0", NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_FIRST0 },
+ { "CS.SM2.MISC.CERR_FIRST0", NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_FIRST0 },
+ { "CS.SM3.MISC.CERR_FIRST0", NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_FIRST0 },
+
+ { "CS.SM0.MISC.CERR_FIRST1", NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_FIRST1 },
+ { "CS.SM1.MISC.CERR_FIRST1", NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_FIRST1 },
+ { "CS.SM2.MISC.CERR_FIRST1", NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_FIRST1 },
+ { "CS.SM3.MISC.CERR_FIRST1", NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_FIRST1 },
+
+ { "CS.SM0.MISC.CERR_FIRST2", NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_FIRST2 },
+ { "CS.SM1.MISC.CERR_FIRST2", NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_FIRST2 },
+ { "CS.SM2.MISC.CERR_FIRST2", NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_FIRST2 },
+ { "CS.SM3.MISC.CERR_FIRST2", NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_FIRST2 },
+
+ /* CQ Control */
+ { "CS.CTL.MISC.CERR_MESSAGE0", NPU2_BLOCK_CTL, NPU2_CQ_C_ERR_RPT_MSG0 },
+ { "CS.CTL.MISC.CERR_MESSAGE1", NPU2_BLOCK_CTL, NPU2_CQ_C_ERR_RPT_MSG1 },
+ { "CS.CTL.MISC.CERR_FIRST0", NPU2_BLOCK_CTL, NPU2_CQ_C_ERR_RPT_FIRST0 },
+ { "CS.CTL.MISC.CERR_FIRST1", NPU2_BLOCK_CTL, NPU2_CQ_C_ERR_RPT_FIRST1 },
+
+ /* CQ Data */
+ { "DAT.MISC.CERR_ECC_HOLD", NPU2_BLOCK_DAT, NPU2_CQ_DAT_ECC_STATUS },
+ { "DAT.MISC.CERR_ECC_MASK", NPU2_BLOCK_DAT, NPU2_CQ_DAT_ECC_MASK },
+ { "DAT.MISC.CERR_ECC_FIRST", NPU2_BLOCK_DAT, NPU2_CQ_DAT_ECC_FIRST },
+ { "DAT.MISC.REM0", NPU2_BLOCK_DAT, NPU2_CQ_DAT_RAS_MSG0 },
+ { "DAT.MISC.REM1", NPU2_BLOCK_DAT, NPU2_CQ_DAT_RAS_MSG1 },
+};
+
+static npu2_scom_dump_t npu2_scom_dump_nvlink[] = {
+ { "NTL0.REGS.CERR_FIRST1", NPU2_BLOCK_NTL0, NPU2_NTL_ERR_FIRST1_OFF },
+ { "NTL1.REGS.CERR_FIRST1", NPU2_BLOCK_NTL1, NPU2_NTL_ERR_FIRST1_OFF },
+ { "NTL0.REGS.CERR_FIRST2", NPU2_BLOCK_NTL0, NPU2_NTL_ERR_FIRST2_OFF },
+ { "NTL1.REGS.CERR_FIRST2", NPU2_BLOCK_NTL1, NPU2_NTL_ERR_FIRST2_OFF },
+};
+
+static npu2_scom_dump_t npu2_scom_dump_ocapi[] = {
+ { "OTL0.MISC.C_ERR_RPT_HOLD0", NPU2_BLOCK_OTL0, NPU2_OTL_ERR_RPT_HOLD0 },
+ { "OTL1.MISC.C_ERR_RPT_HOLD0", NPU2_BLOCK_OTL1, NPU2_OTL_ERR_RPT_HOLD0 },
+ { "OTL0.MISC.OTL_REM0", NPU2_BLOCK_OTL0, NPU2_OTL_RAS_ERR_MSG0 },
+ { "OTL1.MISC.OTL_REM0", NPU2_BLOCK_OTL1, NPU2_OTL_RAS_ERR_MSG0 },
+ { "OTL0.MISC.ERROR_SIG_RXI", NPU2_BLOCK_OTL0, NPU2_OTL_RXI_ERR_SIG },
+ { "OTL1.MISC.ERROR_SIG_RXI", NPU2_BLOCK_OTL1, NPU2_OTL_RXI_ERR_SIG },
+ { "OTL0.MISC.ERROR_SIG_RXO", NPU2_BLOCK_OTL0, NPU2_OTL_RXO_ERR_SIG },
+ { "OTL1.MISC.ERROR_SIG_RXO", NPU2_BLOCK_OTL1, NPU2_OTL_RXO_ERR_SIG },
+ { "OTL0.MISC.C_ERR_RPT_HOLD1", NPU2_BLOCK_OTL0, NPU2_OTL_ERR_RPT_HOLD1 },
+ { "OTL1.MISC.C_ERR_RPT_HOLD1", NPU2_BLOCK_OTL1, NPU2_OTL_ERR_RPT_HOLD1 },
+};
+
+static void print_one_npu_reg(struct npu2 *npu, npu2_scom_dump_t *scom, int stack)
+{
+ uint64_t reg, val;
+
+ reg = NPU2_REG_OFFSET(stack, scom->block, scom->offset);
+ val = npu2_scom_read(npu->chip_id, npu->xscom_base,
+ reg, NPU2_MISC_DA_LEN_8B);
+
+ prlog(PR_ERR, "NPU[%d] STCK%d.%s 0x%llx = 0x%016llx\n",
+ npu->chip_id, stack - 4, scom->name, reg, val);
+}
+
+/* same as above, but for direct access registers */
+static void print_one_reg(int chip_id, int brick_index,
+ uint64_t reg_addr, const char *reg_name)
+{
+ uint64_t val;
+
+ xscom_read(chip_id, reg_addr, &val);
+ prlog(PR_ERR, "NPU[%d] %s brick %d 0x%llx = 0x%016llx\n",
+ chip_id, reg_name, brick_index, reg_addr, val);
+}
+
+static void show_nvlink_regs(struct npu2 *npu, int brick_index)
+{
+ uint32_t stack, ntl;
+ int i;
+
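+ /*
+ * Bricks are paired per stack: e.g. brick 0 maps to STCK_0/NTL0 and
+ * brick 3 to STCK_1/NTL1 (NTL block IDs are two apart, hence the "* 2").
+ */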
+ stack = NPU2_STACK_STCK_0 + brick_index / 2;
+ ntl = NPU2_BLOCK_NTL0 + (brick_index % 2) * 2;
+
+ for (i = 0; i < ARRAY_SIZE(npu2_scom_dump_nvlink); i++) {
+ if (npu2_scom_dump_nvlink[i].block == ntl)
+ print_one_npu_reg(npu, &npu2_scom_dump_nvlink[i], stack);
+ }
+}
+
+static void show_opencapi_regs(struct npu2 *npu, int brick_index)
+{
+ uint32_t stack, otl;
+ int i;
+
+ stack = NPU2_STACK_STCK_0 + brick_index / 2;
+ otl = NPU2_BLOCK_OTL0 + (brick_index % 2);
+
+ /* NPU registers */
+ for (i = 0; i < ARRAY_SIZE(npu2_scom_dump_ocapi); i++) {
+ if (npu2_scom_dump_ocapi[i].block == otl)
+ print_one_npu_reg(npu, &npu2_scom_dump_ocapi[i], stack);
+ }
+
+ /* Fabric registers */
+ print_one_reg(npu->chip_id, brick_index,
+ OB_ODL_STATUS(brick_index), "ODL status");
+ print_one_reg(npu->chip_id, brick_index,
+ OB_ODL_TRAINING_STATUS(brick_index), "ODL training status");
+ print_one_reg(npu->chip_id, brick_index,
+ OB_ODL_ENDPOINT_INFO(brick_index), "ODL endpoint info");
+}
+
+static void show_all_regs(struct npu2 *npu, int brick_index)
+{
+ int i, stack, stack_min, stack_max;
+ uint64_t fir_val, mask_val, fir_addr, mask_addr;
+ struct npu2_dev *dev;
+ npu2_scom_dump_t scom_reg;
+
+ if (brick_index != -1) {
+ stack_min = stack_max = NPU2_STACK_STCK_0 + brick_index / 2;
+ } else {
+ stack_min = NPU2_STACK_STCK_0;
+ stack_max = NPU2_STACK_STCK_2;
+ /* Avoid dumping unused stacks for opencapi on Lagrange */
+ if (npu->total_devices == 2)
+ stack_min = stack_max = NPU2_STACK_STCK_1;
+ }
+
+ /* NPU FIRs */
+ for (i = 0; i < NPU2_TOTAL_FIR_REGISTERS; i++) {
+ fir_addr = NPU2_FIR_REGISTER_0 + i * NPU2_FIR_OFFSET;
+ mask_addr = fir_addr + NPU2_FIR_MASK_OFFSET;
+ xscom_read(npu->chip_id, fir_addr, &fir_val);
+ xscom_read(npu->chip_id, mask_addr, &mask_val);
+ prlog(PR_ERR, "NPU[%d] FIR%d = 0x%016llx (mask 0x%016llx => 0x%016llx)\n",
+ npu->chip_id, i, fir_val, mask_val, fir_val & ~mask_val);
+ }
+
+ /* NPU global, per-stack registers */
+ for (i = 0; i < ARRAY_SIZE(npu2_scom_dump_global); i++) {
+ for (stack = stack_min; stack <= stack_max; stack++)
+ print_one_npu_reg(npu, &npu2_scom_dump_global[i], stack);
+ }
+
+ /*
+ * NPU global registers, stack independent
+ * We have only one for now, so dump it directly
+ */
+ scom_reg.name = "XTS.REG.ERR_HOLD";
+ scom_reg.block = NPU2_BLOCK_XTS;
+ scom_reg.offset = 0;
+ print_one_npu_reg(npu, &scom_reg, NPU2_STACK_MISC);
+
+ /* nvlink- or opencapi-specific registers */
+ for (i = 0; i < npu->total_devices; i++) {
+ dev = &npu->devices[i];
+ if (brick_index == -1 || dev->brick_index == brick_index) {
+ if (dev->type == NPU2_DEV_TYPE_NVLINK)
+ show_nvlink_regs(npu, dev->brick_index);
+ else if (dev->type == NPU2_DEV_TYPE_OPENCAPI)
+ show_opencapi_regs(npu, dev->brick_index);
+ }
+ }
+}
+
+void npu2_dump_scoms(int chip_id)
+{
+ struct npu2 *npu;
+ struct phb *phb;
+ struct npu2_dev *dev;
+
+ /*
+ * Look for the npu2 structure for that chip ID. We can access it
+ * through the array of phbs, looking for an nvlink or opencapi
+ * phb. We can have several entries, but they all point
+ * to the same npu2 structure.
+ */
+ for_each_phb(phb) {
+ npu = NULL;
+ if (phb->phb_type == phb_type_npu_v2) {
+ npu = phb_to_npu2_nvlink(phb);
+ } else if (phb->phb_type == phb_type_npu_v2_opencapi) {
+ dev = phb_to_npu2_dev_ocapi(phb);
+ npu = dev->npu;
+ }
+ if (npu && npu->chip_id == chip_id) {
+ show_all_regs(npu, -1 /* all bricks */);
+ break;
+ }
+ }
+}
+
+static uint64_t npu2_ipi_attributes(struct irq_source *is, uint32_t isn)
+{
+ struct npu2 *p = is->data;
+ uint32_t idx = isn - p->base_lsi;
+
+ if ((idx == 18) || (idx >= 27 && idx <= 34))
+ /*
+ * level 18: TCE Interrupt - used to detect a frozen PE (nvlink)
+ * level 27-30: OTL interrupt (opencapi)
+ * level 31-34: XSL interrupt (opencapi)
+ */
+ return IRQ_ATTR_TARGET_OPAL | IRQ_ATTR_TARGET_RARE | IRQ_ATTR_TYPE_MSI;
+ else
+ return IRQ_ATTR_TARGET_LINUX;
+}
+
+static char *npu2_ipi_name(struct irq_source *is, uint32_t isn)
+{
+ struct npu2 *p = is->data;
+ uint32_t idx = isn - p->base_lsi;
+ const char *name;
+
+ switch (idx) {
+ case 0: name = "NDL 0 Stall Event (brick 0)"; break;
+ case 1: name = "NDL 0 No-Stall Event (brick 0)"; break;
+ case 2: name = "NDL 1 Stall Event (brick 1)"; break;
+ case 3: name = "NDL 1 No-Stall Event (brick 1)"; break;
+ case 4: name = "NDL 2 Stall Event (brick 2)"; break;
+ case 5: name = "NDL 2 No-Stall Event (brick 2)"; break;
+ case 6: name = "NDL 5 Stall Event (brick 3)"; break;
+ case 7: name = "NDL 5 No-Stall Event (brick 3)"; break;
+ case 8: name = "NDL 4 Stall Event (brick 4)"; break;
+ case 9: name = "NDL 4 No-Stall Event (brick 4)"; break;
+ case 10: name = "NDL 3 Stall Event (brick 5)"; break;
+ case 11: name = "NDL 3 No-Stall Event (brick 5)"; break;
+ case 12: name = "NTL 0 Event"; break;
+ case 13: name = "NTL 1 Event"; break;
+ case 14: name = "NTL 2 Event"; break;
+ case 15: name = "NTL 3 Event"; break;
+ case 16: name = "NTL 4 Event"; break;
+ case 17: name = "NTL 5 Event"; break;
+ case 18: name = "TCE Event"; break;
+ case 19: name = "ATS Event"; break;
+ case 20: name = "CQ Event"; break;
+ case 21: name = "MISC Event"; break;
+ case 22: name = "NMMU Local Xstop"; break;
+ case 23: name = "Translate Fail (brick 2)"; break;
+ case 24: name = "Translate Fail (brick 3)"; break;
+ case 25: name = "Translate Fail (brick 4)"; break;
+ case 26: name = "Translate Fail (brick 5)"; break;
+ case 27: name = "OTL Event (brick 2)"; break;
+ case 28: name = "OTL Event (brick 3)"; break;
+ case 29: name = "OTL Event (brick 4)"; break;
+ case 30: name = "OTL Event (brick 5)"; break;
+ case 31: name = "XSL Event (brick 2)"; break;
+ case 32: name = "XSL Event (brick 3)"; break;
+ case 33: name = "XSL Event (brick 4)"; break;
+ case 34: name = "XSL Event (brick 5)"; break;
+ default: name = "Unknown";
+ }
+ return strdup(name);
+}
+
+static void npu2_err_interrupt(struct irq_source *is, uint32_t isn)
+{
+ struct npu2 *p = is->data;
+ uint32_t idx = isn - p->base_lsi;
+ char *irq_name;
+ int brick;
+
+ switch (idx) {
+ case 18:
+ opal_update_pending_evt(OPAL_EVENT_PCI_ERROR,
+ OPAL_EVENT_PCI_ERROR);
+ break;
+ case 27 ... 34:
+ /* opencapi only */
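+ /* levels 27-30 (OTL) and 31-34 (XSL) both map back to bricks 2-5 */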
+ brick = 2 + ((idx - 27) % 4);
+ irq_name = npu2_ipi_name(is, isn);
+ prlog(PR_ERR, "NPU[%d] received error interrupt '%s'\n",
+ p->chip_id, irq_name);
+ free(irq_name);
+ show_all_regs(p, brick);
+ /*
+ * P9 NPU doesn't support recovering a link going down
+ * unexpectedly. So we mark the device as broken and
+ * report it to the OS, so that the error is logged
+ * and the drivers notified.
+ */
+ npu2_opencapi_set_broken(p, brick);
+ opal_update_pending_evt(OPAL_EVENT_PCI_ERROR,
+ OPAL_EVENT_PCI_ERROR);
+ break;
+ default:
+ prerror("OPAL received unknown NPU2 interrupt %d\n", idx);
+ return;
+ }
+}
+
+static const struct irq_source_ops npu2_ipi_ops = {
+ .interrupt = npu2_err_interrupt,
+ .attributes = npu2_ipi_attributes,
+ .name = npu2_ipi_name,
+};
+
+static void setup_irqs(struct npu2 *p)
+{
+ uint64_t reg, val;
+ void *tp;
+
+ p->base_lsi = xive_alloc_ipi_irqs(p->chip_id, NPU2_N_DL_IRQS, NPU2_N_DL_IRQS_ALIGN);
+ if (p->base_lsi == XIVE_IRQ_ERROR) {
+ prlog(PR_ERR, "NPU: Failed to allocate interrupt sources\n");
+ return;
+ }
+ xive_register_ipi_source(p->base_lsi, NPU2_N_DL_IRQS, p, &npu2_ipi_ops);
+
+ /* Set IPI configuration */
+ reg = NPU2_REG_OFFSET(NPU2_STACK_MISC, NPU2_BLOCK_MISC, NPU2_MISC_CFG);
+ val = npu2_read(p, reg);
+ val = SETFIELD(NPU2_MISC_CFG_IPI_PS, val, NPU2_MISC_CFG_IPI_PS_64K);
+ val = SETFIELD(NPU2_MISC_CFG_IPI_OS, val, NPU2_MISC_CFG_IPI_OS_AIX);
+ npu2_write(p, reg, val);
+
+ /* Set IRQ base */
+ reg = NPU2_REG_OFFSET(NPU2_STACK_MISC, NPU2_BLOCK_MISC, NPU2_MISC_IRQ_BASE);
+ tp = xive_get_trigger_port(p->base_lsi);
+ val = ((uint64_t)tp) << NPU2_IRQ_BASE_SHIFT;
+ npu2_write(p, reg, val);
+}
+
+static bool _i2c_presence_detect(struct npu2_dev *dev)
+{
+ uint8_t state, data;
+ int rc;
+
+ rc = i2c_request_send(dev->npu->i2c_port_id_ocapi,
+ platform.ocapi->i2c_presence_addr,
+ SMBUS_READ, 0, 1,
+ &state, 1, 120);
+ if (rc) {
+ OCAPIERR(dev, "error detecting link presence: %d\n", rc);
+ return true; /* assume link exists */
+ }
+
+ OCAPIDBG(dev, "I2C presence detect: 0x%x\n", state);
+
+ switch (dev->link_index) {
+ case 2:
+ data = platform.ocapi->i2c_presence_brick2;
+ break;
+ case 3:
+ data = platform.ocapi->i2c_presence_brick3;
+ break;
+ case 4:
+ data = platform.ocapi->i2c_presence_brick4;
+ break;
+ case 5:
+ data = platform.ocapi->i2c_presence_brick5;
+ break;
+ default:
+ OCAPIERR(dev, "presence detection on invalid link\n");
+ return true;
+ }
+ /* Presence detect bits are active low */
+ return !(state & data);
+}
+
+/*
+ * A default presence detection implementation for platforms like ZZ and Zaius
+ * that don't implement their own. Assumes all devices found will be OpenCAPI.
+ */
+void npu2_i2c_presence_detect(struct npu2 *npu)
+{
+ struct npu2_dev *dev;
+ assert(platform.ocapi);
+ for (int i = 0; i < npu->total_devices; i++) {
+ dev = &npu->devices[i];
+ if (_i2c_presence_detect(dev))
+ dev->type = NPU2_DEV_TYPE_OPENCAPI;
+ else
+ dev->type = NPU2_DEV_TYPE_UNKNOWN;
+ }
+}
+
+static struct npu2 *setup_npu(struct dt_node *dn)
+{
+ struct npu2 *npu;
+ struct npu2_dev *dev;
+ struct dt_node *np;
+ uint32_t num_links;
+ char port_name[17];
+ void *npumem;
+ char *path;
+ int gcid;
+ struct proc_chip *chip;
+ int i = 0;
+
+ /* Retrieve chip ID */
+ path = dt_get_path(dn);
+ gcid = dt_get_chip_id(dn);
+ chip = get_chip(gcid);
+ assert(chip);
+
+ num_links = dt_prop_get_u32(dn, "ibm,npu-links");
+ npumem = zalloc(sizeof(struct npu2) + num_links *
+ sizeof(struct npu2_dev));
+ assert(npumem);
+ npu = npumem;
+
+ npu->dt_node = dn;
+ npu->index = dt_prop_get_u32(dn, "ibm,npu-index");
+ npu->chip_id = gcid;
+ npu->xscom_base = dt_get_address(dn, 0, NULL);
+
+ init_lock(&npu->i2c_lock);
+ npu->i2c_pin_mode = ~0; // input mode by default
+ npu->i2c_pin_wr_state = ~0; // reset is active low
+ if (platform.ocapi) {
+ /* Find I2C port for handling device presence/reset */
+ snprintf(port_name, sizeof(port_name), "p8_%08x_e%dp%d",
+ gcid, platform.ocapi->i2c_engine,
+ platform.ocapi->i2c_port);
+ prlog(PR_DEBUG, "NPU: Looking for I2C port %s\n", port_name);
+
+ dt_for_each_compatible(dt_root, np, "ibm,power9-i2c-port") {
+ if (streq(port_name, dt_prop_get(np, "ibm,port-name"))) {
+ npu->i2c_port_id_ocapi = dt_prop_get_u32(np, "ibm,opal-id");
+ break;
+ }
+ }
+
+ if (!npu->i2c_port_id_ocapi) {
+ prlog(PR_ERR, "NPU: Couldn't find I2C port %s\n",
+ port_name);
+ goto failed;
+ }
+ }
+
+ npu->devices = npumem + sizeof(struct npu2);
+
+ dt_for_each_compatible(dn, np, "ibm,npu-link") {
+ assert(i < num_links);
+ dev = &npu->devices[i];
+ dev->link_index = dt_prop_get_u32(np, "ibm,npu-link-index");
+ /* May be overridden by platform presence detection */
+ dev->brick_index = dev->link_index;
+ /* Will be overridden by presence detection */
+ dev->type = NPU2_DEV_TYPE_UNKNOWN;
+ dev->npu = npu;
+ dev->dt_node = np;
+ dev->pl_xscom_base = dt_prop_get_u64(np, "ibm,npu-phy");
+ dev->lane_mask = dt_prop_get_u32(np, "ibm,npu-lane-mask");
+ dev->link_speed = dt_prop_get_u64(np, "ibm,link-speed");
+ i++;
+ }
+ npu->total_devices = i;
+
+ prlog(PR_INFO, "NPU: Chip %d Found NPU2#%d (%d links) at %s\n",
+ npu->chip_id, npu->index, npu->total_devices, path);
+ prlog(PR_INFO, " SCOM Base: %08llx\n", npu->xscom_base);
+ free(path);
+ return npu;
+
+failed:
+ prlog(PR_ERR, "NPU: Chip %d NPU setup failed\n", gcid);
+ free(path);
+ free(npu);
+ return NULL;
+}
+
+static void setup_devices(struct npu2 *npu)
+{
+ bool nvlink_detected = false, ocapi_detected = false;
+ struct npu2_dev *dev;
+
+ /*
+ * TODO: In future, we'll do brick configuration here to support mixed
+ * setups.
+ */
+ for (int i = 0; i < npu->total_devices; i++) {
+ dev = &npu->devices[i];
+ switch (dev->type) {
+ case NPU2_DEV_TYPE_NVLINK:
+ nvlink_detected = true;
+ dt_add_property_strings(dev->dt_node,
+ "ibm,npu-link-type",
+ "nvlink");
+ break;
+ case NPU2_DEV_TYPE_OPENCAPI:
+ ocapi_detected = true;
+ dt_add_property_strings(dev->dt_node,
+ "ibm,npu-link-type",
+ "opencapi");
+ break;
+ default:
+ prlog(PR_INFO, "NPU: Link %d device not present\n",
+ npu->devices[i].link_index);
+ dt_add_property_strings(dev->dt_node,
+ "ibm,npu-link-type",
+ "unknown");
+ }
+ }
+
+ if (nvlink_detected && ocapi_detected) {
+ prlog(PR_ERR, "NPU: NVLink and OpenCAPI devices on same chip not supported, aborting NPU init\n");
+ return;
+ }
+
+ setup_irqs(npu);
+
+ if (nvlink_detected)
+ npu2_nvlink_init_npu(npu);
+ else if (ocapi_detected)
+ npu2_opencapi_init_npu(npu);
+}
+
+void probe_npu2(void)
+{
+ struct proc_chip *chip = next_chip(NULL);
+ struct npu2 *npu;
+ struct dt_node *np;
+ const char *zcal;
+
+ /* npu2 only */
+ if (!dt_find_compatible_node(dt_root, NULL, "ibm,power9-npu"))
+ return;
+
+ /* Abort if we're running on POWER9C DD1 (P9N DD1 is not supported) */
+ if (chip &&
+ chip->type == PROC_CHIP_P9_CUMULUS &&
+ (chip->ec_level & 0xf0) == 0x10) {
+ prlog(PR_INFO, "NPU2: DD1 not supported\n");
+ return;
+ }
+
+ /* Check for a zcal override */
+ zcal = nvram_query_dangerous("nv_zcal_override");
+ if (zcal) {
+ nv_zcal_nominal = atoi(zcal);
+ prlog(PR_WARNING, "NPU2: Using ZCAL impedance override = %d\n", nv_zcal_nominal);
+ }
+
+ if (!platform.npu2_device_detect) {
+ prlog(PR_INFO, "NPU: Platform does not support NPU\n");
+ return;
+ }
+
+ dt_for_each_compatible(dt_root, np, "ibm,power9-npu") {
+ npu = setup_npu(np);
+ if (!npu)
+ continue;
+ platform.npu2_device_detect(npu);
+ setup_devices(npu);
+ }
+}
diff --git a/roms/skiboot/hw/npu2-hw-procedures.c b/roms/skiboot/hw/npu2-hw-procedures.c
new file mode 100644
index 000000000..fb88dfdf6
--- /dev/null
+++ b/roms/skiboot/hw/npu2-hw-procedures.c
@@ -0,0 +1,1079 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * NPU2 (POWER9) Hardware Procedures
+ *
+ * Copyright 2013-2019 IBM Corp.
+ */
+
+#include <skiboot.h>
+#include <io.h>
+#include <timebase.h>
+#include <pci.h>
+#include <pci-virt.h>
+#include <interrupts.h>
+#include <npu2.h>
+#include <npu2-regs.h>
+#include <xscom.h>
+
+/* Set in npu2.c if there is an nvram override for the zcal settings on this
+ * machine */
+int nv_zcal_nominal = -1;
+
+/* PHY Registers. The documentation for the PHY training is written in
+ * terms of bits within an actual register so we use that
+ * representation here. */
+struct npu2_phy_reg {
+ uint64_t offset;
+ uint64_t start;
+ uint64_t len;
+};
+
+/*
+ * Currently unused, but documented here:
+static struct npu2_phy_reg NPU2_PHY_RX_DATA_DAC_SPARE_MODE = {0x000, 63, 64};
+static struct npu2_phy_reg NPU2_PHY_RX_DAC_CNTL6 = {0x00c, 63, 64};
+static struct npu2_phy_reg NPU2_PHY_RX_DAC_CNTL5 = {0x028, 63, 64};
+static struct npu2_phy_reg NPU2_PHY_RX_DAC_CNTL9 = {0x030, 63, 64};
+static struct npu2_phy_reg NPU2_PHY_RX_DAC_CNTL5_EO = {0x00a, 63, 64};
+static struct npu2_phy_reg NPU2_PHY_RX_DAC_CNTL4 = {0x026, 63, 64};
+*/
+static struct npu2_phy_reg NPU2_PHY_RX_RUN_LANE = {0x0c8, 48, 1};
+static struct npu2_phy_reg NPU2_PHY_RX_IORESET = {0x096, 63, 1};
+static struct npu2_phy_reg NPU2_PHY_TX_IORESET = {0x113, 48, 1};
+static struct npu2_phy_reg NPU2_PHY_RX_PR_RESET = {0x096, 62, 1};
+static struct npu2_phy_reg NPU2_PHY_RX_LANE_ANA_PDWN = {0x002, 54, 1};
+static struct npu2_phy_reg NPU2_PHY_RX_LANE_DIG_PDWN = {0x088, 48, 1};
+static struct npu2_phy_reg NPU2_PHY_RX_PR_IQ_RES_SEL = {0x004, 59, 3};
+static struct npu2_phy_reg NPU2_PHY_RX_PR_PHASE_STEP = {0x08a, 60, 4};
+static struct npu2_phy_reg NPU2_PHY_TX_LANE_PDWN = {0x101, 48, 1};
+static struct npu2_phy_reg NPU2_PHY_RX_RUN_DCCAL = {0x0c8, 49, 1};
+static struct npu2_phy_reg NPU2_PHY_RX_DCCAL_DONE = {0x0ca, 49, 1};
+static struct npu2_phy_reg NPU2_PHY_RX_LANE_BUSY = {0x0ca, 50, 1};
+static struct npu2_phy_reg NPU2_PHY_RX_B_BANK_CONTROLS = {0x002, 58, 6};
+static struct npu2_phy_reg NPU2_PHY_TX_UNLOAD_CLK_DISABLE = {0x103, 56, 1};
+static struct npu2_phy_reg NPU2_PHY_TX_FIFO_INIT = {0x105, 53, 1};
+static struct npu2_phy_reg NPU2_PHY_TX_RXCAL = {0x103, 57, 1};
+static struct npu2_phy_reg NPU2_PHY_RX_INIT_DONE = {0x0ca, 48, 1};
+static struct npu2_phy_reg NPU2_PHY_RX_PR_EDGE_TRACK_CNTL = {0x092, 48, 2};
+static struct npu2_phy_reg NPU2_PHY_RX_PR_BUMP_SL_1UI = {0x092, 57, 1};
+static struct npu2_phy_reg NPU2_PHY_RX_PR_FW_OFF = {0x08a, 56, 1};
+static struct npu2_phy_reg NPU2_PHY_RX_PR_FW_INERTIA_AMT = {0x08a, 57, 3};
+static struct npu2_phy_reg NPU2_PHY_RX_CFG_LTE_MC = {0x000, 60, 4};
+static struct npu2_phy_reg NPU2_PHY_RX_A_INTEG_COARSE_GAIN = {0x00a, 48, 4};
+static struct npu2_phy_reg NPU2_PHY_RX_A_CTLE_COARSE = {0x00c, 48, 5};
+static struct npu2_phy_reg NPU2_PHY_RX_A_CTLE_GAIN = {0x00c, 53, 4};
+static struct npu2_phy_reg NPU2_PHY_RX_B_INTEG_COARSE_GAIN = {0x026, 48, 4};
+static struct npu2_phy_reg NPU2_PHY_RX_B_CTLE_COARSE = {0x028, 48, 5};
+static struct npu2_phy_reg NPU2_PHY_RX_B_CTLE_GAIN = {0x028, 53, 4};
+static struct npu2_phy_reg NPU2_PHY_RX_E_INTEG_COARSE_GAIN = {0x030, 48, 4};
+static struct npu2_phy_reg NPU2_PHY_RX_E_CTLE_COARSE = {0x032, 48, 5};
+static struct npu2_phy_reg NPU2_PHY_RX_E_CTLE_GAIN = {0x032, 53, 4};
+
+/* These registers are per-PHY, not per lane */
+static struct npu2_phy_reg NPU2_PHY_RX_SPEED_SELECT = {0x262, 51, 2};
+static struct npu2_phy_reg NPU2_PHY_RX_AC_COUPLED = {0x262, 53, 1};
+static struct npu2_phy_reg NPU2_PHY_TX_ZCAL_SWO_EN = {0x3c9, 48, 1};
+static struct npu2_phy_reg NPU2_PHY_TX_ZCAL_REQ = {0x3c1, 49, 1};
+static struct npu2_phy_reg NPU2_PHY_TX_ZCAL_DONE = {0x3c1, 50, 1};
+static struct npu2_phy_reg NPU2_PHY_TX_ZCAL_ERROR = {0x3c1, 51, 1};
+static struct npu2_phy_reg NPU2_PHY_TX_ZCAL_N = {0x3c3, 48, 9};
+static struct npu2_phy_reg NPU2_PHY_TX_ZCAL_P = {0x3c5, 48, 9};
+static struct npu2_phy_reg NPU2_PHY_TX_FFE_BOOST_EN = {0x34b, 59, 1};
+static struct npu2_phy_reg NPU2_PHY_TX_PSEG_PRE_EN = {0x34d, 51, 5};
+static struct npu2_phy_reg NPU2_PHY_TX_PSEG_PRE_SELECT = {0x34d, 56, 5};
+static struct npu2_phy_reg NPU2_PHY_TX_NSEG_PRE_EN = {0x34f, 51, 5};
+static struct npu2_phy_reg NPU2_PHY_TX_NSEG_PRE_SELECT = {0x34f, 56, 5};
+static struct npu2_phy_reg NPU2_PHY_TX_PSEG_POST_EN = {0x361, 49, 7};
+static struct npu2_phy_reg NPU2_PHY_TX_PSEG_POST_SELECT = {0x361, 56, 7};
+static struct npu2_phy_reg NPU2_PHY_TX_NSEG_POST_EN = {0x363, 49, 7};
+static struct npu2_phy_reg NPU2_PHY_TX_NSEG_POST_SELECT = {0x363, 56, 7};
+static struct npu2_phy_reg NPU2_PHY_TX_PSEG_MARGINPU_EN = {0x351, 48, 8};
+static struct npu2_phy_reg NPU2_PHY_TX_NSEG_MARGINPU_EN = {0x353, 48, 8};
+static struct npu2_phy_reg NPU2_PHY_TX_PSEG_MARGINPD_EN = {0x351, 56, 8};
+static struct npu2_phy_reg NPU2_PHY_TX_NSEG_MARGINPD_EN = {0x353, 56, 8};
+static struct npu2_phy_reg NPU2_PHY_TX_MARGINPU_SELECT = {0x355, 48, 8};
+static struct npu2_phy_reg NPU2_PHY_TX_MARGINPD_SELECT = {0x355, 56, 8};
+static struct npu2_phy_reg NPU2_PHY_TX_PSEG_MAIN_EN = {0x357, 51, 7};
+static struct npu2_phy_reg NPU2_PHY_TX_NSEG_MAIN_EN = {0x359, 51, 7};
+/* Currently unused, but documented here
+static struct npu2_phy_reg NPU2_PHY_RX_HIST_MIN_EYE_WIDTH = {0x24e, 54, 8};
+static struct npu2_phy_reg NPU2_PHY_RX_HIST_MIN_EYE_WIDTH_LANE = {0x24e, 49, 5};
+static struct npu2_phy_reg NPU2_PHY_RX_HIST_MIN_EYE_WIDTH_VALID= {0x24e, 48, 1};
+*/
+static struct npu2_phy_reg NPU2_PHY_RX_RC_ENABLE_AUTO_RECAL = {0x25c, 51, 1};
+
+static struct npu2_phy_reg NPU2_PHY_RX_CLKDIST_PDWN = {0x204, 48, 3};
+static struct npu2_phy_reg NPU2_PHY_RX_IREF_PDWN = {0x230, 54, 1};
+static struct npu2_phy_reg NPU2_PHY_TX_CLKDIST_PDWN = {0x305, 48, 3};
+static struct npu2_phy_reg NPU2_PHY_RX_CTL_DATASM_CLKDIST_PDWN = {0x2e0, 60, 1};
+static struct npu2_phy_reg NPU2_PHY_TX_DRV_DATA_PATTERN_GCRMSG = {0x309, 50, 4};
+
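+/*
+ * Build the indirect SCOM address for a PHY register: the register offset
+ * is shifted into the upper address bits and OR'd into the PHY SCOM base,
+ * while the lane number (for per-lane registers, offset < 0x200) goes in
+ * bits 27-31.
+ */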
+#define NPU2_PHY_REG(scom_base, reg, lane) \
+ SETFIELD(PPC_BITMASK(27, 31), ((reg)->offset << 42) | scom_base, lane)
+
+#define NPU2_MAX_PHY_LANE 23
+
+/* This is a bit of a gross hack but it does the job */
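+/*
+ * lane_mask is MSB-first: bit (NPU2_MAX_PHY_LANE - lane) selects lane
+ * "lane", so e.g. a mask of 0x00ffff00 iterates over lanes 0-15.
+ */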
+#define FOR_EACH_LANE(ndev, lane) \
+ for (lane = 0; lane <= NPU2_MAX_PHY_LANE; lane++) \
+ if (!(ndev->lane_mask & (1 << (NPU2_MAX_PHY_LANE - lane)))) \
+ continue; \
+ else
+
+typedef uint32_t (*step)(struct npu2_dev *);
+
+struct procedure {
+ const char *name;
+ step steps[];
+};
+
+#define DEFINE_PROCEDURE(NAME, STEPS...) \
+ static struct procedure procedure_##NAME = \
+ {.name = #NAME, .steps = {NAME, ##STEPS}}
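+
+/*
+ * e.g. DEFINE_PROCEDURE(phy_reset, phy_reset_wait, phy_reset_complete)
+ * defines procedure_phy_reset; get_procedure_status() runs the steps in
+ * order, advancing whenever a step returns PROCEDURE_NEXT.
+ */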
+
+#define PROCEDURE_INPROGRESS (1 << 31)
+#define PROCEDURE_COMPLETE (1 << 30)
+#define PROCEDURE_NEXT (1 << 29)
+#define PROCEDURE_FAILED 2
+#define PROCEDURE_ABORTED 3
+#define PROCEDURE_UNSUPPORTED 4
+
+/* Mask defining which status bits we want to expose */
+#define PROCEDURE_STATUS_MASK 0xc000000f
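+/*
+ * e.g. a failed step ends up as PROCEDURE_COMPLETE | PROCEDURE_FAILED,
+ * i.e. 0x40000002 after masking with PROCEDURE_STATUS_MASK.
+ */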
+
+static void phy_write_lane(struct npu2_dev *ndev, struct npu2_phy_reg *reg, int lane, uint64_t val)
+{
+ uint64_t old_val, reg_addr;
+ int rc;
+ uint64_t mask = PPC_BITMASK(reg->start, reg->start + reg->len - 1);
+
+ /* Check to make sure we're not trying to specify a lane to a
+ * non-per-lane register */
+ if (lane >= 0)
+ assert(reg->offset < 0x200);
+ else
+ assert(reg->offset >= 0x200);
+
+ reg_addr = NPU2_PHY_REG(ndev->pl_xscom_base, reg, lane);
+ rc = xscom_read(ndev->npu->chip_id, reg_addr, &old_val);
+ if (rc)
+ NPU2DEVERR(ndev, "error %d reading scom 0x%llx\n", rc, reg_addr);
+ val = SETFIELD(mask, old_val, val);
+ rc = xscom_write(ndev->npu->chip_id, reg_addr, val);
+ if (rc)
+ NPU2DEVERR(ndev, "error %d writing scom 0x%llx\n", rc, reg_addr);
+}
+
+static uint64_t phy_read_lane(struct npu2_dev *ndev, struct npu2_phy_reg *reg, int lane)
+{
+ uint64_t val, reg_addr;
+ int rc;
+ uint64_t mask = PPC_BITMASK(reg->start, reg->start + reg->len - 1);
+
+ /* Check to make sure we're not trying to specify a lane to a
+ * non-per-lane register */
+ if (lane >= 0)
+ assert(reg->offset < 0x200);
+ else
+ assert(reg->offset >= 0x200);
+
+ reg_addr = NPU2_PHY_REG(ndev->pl_xscom_base, reg, lane);
+ rc = xscom_read(ndev->npu->chip_id, reg_addr, &val);
+ if (rc)
+ NPU2DEVERR(ndev, "error %d reading scom 0x%llx\n", rc, reg_addr);
+
+ return GETFIELD(mask, val);
+}
+
+#define phy_write(ndev, reg, val) phy_write_lane(ndev, reg, -1, val)
+#define phy_read(ndev, reg) phy_read_lane(ndev, reg, -1)
+
+static uint32_t stop(struct npu2_dev *npu_dev __unused)
+{
+ return PROCEDURE_COMPLETE | PROCEDURE_ABORTED;
+}
+DEFINE_PROCEDURE(stop);
+
+static uint32_t nop(struct npu2_dev *npu_dev __unused)
+{
+ return PROCEDURE_COMPLETE;
+}
+DEFINE_PROCEDURE(nop);
+
+/*
+ * Return the obus (0 or 1) of a device
+ *
+ * Using the brick index is dangerous, because it varies for a link
+ * depending on the mode (opencapi or nvlink)
+ */
+static int obus_index(struct npu2_dev *ndev)
+{
+ if ((ndev->pl_xscom_base & 0x3F000000) == 0x09000000)
+ return 0;
+ else
+ return 1;
+}
+
+/*
+ * Return the brick number (0-2) within an obus chiplet.
+ * Only valid for nvlink devices
+ */
+static int obus_brick_index(struct npu2_dev *ndev)
+{
+ int index = ndev->brick_index % 3;
+
+ assert(ndev->type != NPU2_DEV_TYPE_OPENCAPI);
+ /* On the second obus chiplet, index is reversed */
+ if ((ndev->pl_xscom_base & 0x3F000000) != 0x09000000)
+ return 2 - index;
+
+ return index;
+}
+
+static void set_iovalid(struct npu2_dev *ndev, bool raise)
+{
+ uint64_t addr, val, mask;
+ int rc;
+
+ if (ndev->type == NPU2_DEV_TYPE_OPENCAPI)
+ return;
+
+ addr = (ndev->pl_xscom_base & 0x3F000000) | 0x9;
+ mask = PPC_BIT(6 + obus_brick_index(ndev));
+ val = raise ? mask : 0;
+
+ rc = xscom_write_mask(ndev->npu->chip_id, addr, val, mask);
+ if (rc)
+ NPU2DEVERR(ndev, "error %d writing scom 0x%llx\n", rc, addr);
+}
+
+static bool poll_fence_status(struct npu2_dev *ndev, uint64_t val)
+{
+ uint64_t fs;
+ int i;
+
+ for (i = 0; i < 4096; i++) {
+ fs = npu2_read(ndev->npu, NPU2_NTL_CQ_FENCE_STATUS(ndev));
+ if ((fs & 0xc000000000000000UL) == val)
+ return true;
+ }
+
+ NPU2DEVERR(ndev, "NPU2_NTL_CQ_FENCE_STATUS timeout (0x%llx)\n", val);
+ return false;
+}
+
+/* Procedure 1.2.1 - Reset NPU/NDL */
+uint32_t reset_ntl(struct npu2_dev *ndev)
+{
+ uint64_t val, check;
+ int lane, i;
+
+ set_iovalid(ndev, true);
+
+ /* Power on clocks */
+ phy_write(ndev, &NPU2_PHY_RX_CLKDIST_PDWN, 0);
+ phy_write(ndev, &NPU2_PHY_RX_IREF_PDWN, 1);
+ phy_write(ndev, &NPU2_PHY_TX_CLKDIST_PDWN, 0);
+ phy_write(ndev, &NPU2_PHY_RX_CTL_DATASM_CLKDIST_PDWN, 0);
+
+ FOR_EACH_LANE(ndev, lane) {
+ phy_write_lane(ndev, &NPU2_PHY_RX_LANE_ANA_PDWN, lane, 0);
+ phy_write_lane(ndev, &NPU2_PHY_RX_LANE_DIG_PDWN, lane, 0);
+ phy_write_lane(ndev, &NPU2_PHY_TX_LANE_PDWN, lane, 0);
+ }
+
+ /* Clear fence state for the brick */
+ val = npu2_read(ndev->npu, NPU2_MISC_FENCE_STATE);
+ if (val) {
+ NPU2DEVINF(ndev, "Clearing all bricks fence\n");
+ npu2_write(ndev->npu, NPU2_MISC_FENCE_STATE, val);
+ for (i = 0, check = 0; i < 4096; i++) {
+ check = npu2_read(ndev->npu, NPU2_NTL_CQ_FENCE_STATUS(ndev));
+ if (!check)
+ break;
+ }
+ if (check)
+ NPU2DEVERR(ndev, "Clearing NPU2_MISC_FENCE_STATE=0x%llx timeout, current=0x%llx\n",
+ val, check);
+ }
+
+ /* Write PRI */
+ val = SETFIELD(PPC_BITMASK(0,1), 0ull, obus_brick_index(ndev));
+ npu2_write_mask(ndev->npu, NPU2_NTL_PRI_CFG(ndev), val, -1ULL);
+
+ val = NPU2_NTL_MISC_CFG2_NDL_RX_PARITY_ENA;
+ npu2_write_mask(ndev->npu, NPU2_NTL_MISC_CFG2(ndev), 0ull, val);
+
+ /* NTL Reset */
+ val = npu2_read(ndev->npu, NPU2_NTL_MISC_CFG1(ndev));
+ val |= PPC_BIT(8) | PPC_BIT(9);
+ npu2_write(ndev->npu, NPU2_NTL_MISC_CFG1(ndev), val);
+
+ if (!poll_fence_status(ndev, 0xc000000000000000UL))
+ return PROCEDURE_COMPLETE | PROCEDURE_FAILED;
+
+ return PROCEDURE_NEXT;
+}
+
+static uint32_t reset_ndl(struct npu2_dev *ndev)
+{
+ uint64_t val;
+
+ val = npu2_read_4b(ndev->npu, NPU2_NTL_DL_CONTROL(ndev));
+ val |= PPC_BIT32(0) | PPC_BIT32(1);
+ npu2_write_4b(ndev->npu, NPU2_NTL_DL_CONTROL(ndev), val);
+
+ val = npu2_read_4b(ndev->npu, NPU2_NTL_DL_CONTROL(ndev));
+ val &= ~(PPC_BIT32(0) | PPC_BIT32(1));
+ npu2_write_4b(ndev->npu, NPU2_NTL_DL_CONTROL(ndev), val);
+
+ val = PPC_BIT32(0);
+ npu2_write_4b(ndev->npu, NPU2_NTL_DL_CONFIG(ndev), val);
+
+ return PROCEDURE_NEXT;
+}
+
+static uint32_t reset_ntl_release(struct npu2_dev *ndev)
+{
+ uint64_t val;
+ uint64_t npu2_fir;
+ uint64_t npu2_fir_addr;
+ int i;
+
+ /* Clear FIR bits */
+ npu2_fir_addr = NPU2_FIR_REGISTER_0;
+ npu2_fir = 0;
+
+ for (i = 0; i < NPU2_TOTAL_FIR_REGISTERS; i++) {
+ xscom_write(ndev->npu->chip_id, npu2_fir_addr, npu2_fir);
+ npu2_fir_addr += NPU2_FIR_OFFSET;
+ }
+
+ val = npu2_read(ndev->npu, NPU2_NTL_MISC_CFG1(ndev));
+ val &= 0xFFBFFFFFFFFFFFFFUL;
+ npu2_write(ndev->npu, NPU2_NTL_MISC_CFG1(ndev), val);
+
+ if (!poll_fence_status(ndev, 0x8000000000000000UL))
+ return PROCEDURE_COMPLETE | PROCEDURE_FAILED;
+
+ return PROCEDURE_NEXT;
+}
+
+static uint32_t reset_ntl_finish(struct npu2_dev *ndev)
+{
+ /* Credit Setup */
+ npu2_write(ndev->npu, NPU2_NTL_CRED_HDR_CREDIT_TX(ndev), 0x0200000000000000UL);
+ npu2_write(ndev->npu, NPU2_NTL_PRB_HDR_CREDIT_TX(ndev), 0x0200000000000000UL);
+ npu2_write(ndev->npu, NPU2_NTL_ATR_HDR_CREDIT_TX(ndev), 0x0200000000000000UL);
+ npu2_write(ndev->npu, NPU2_NTL_RSP_HDR_CREDIT_TX(ndev), 0x0200000000000000UL);
+ npu2_write(ndev->npu, NPU2_NTL_CRED_DATA_CREDIT_TX(ndev), 0x1000000000000000UL);
+ npu2_write(ndev->npu, NPU2_NTL_RSP_DATA_CREDIT_TX(ndev), 0x1000000000000000UL);
+ npu2_write(ndev->npu, NPU2_NTL_CRED_HDR_CREDIT_RX(ndev), 0x0000BE0000000000UL);
+ npu2_write(ndev->npu, NPU2_NTL_DBD_HDR_CREDIT_RX(ndev), 0x0000640000000000UL);
+ npu2_write(ndev->npu, NPU2_NTL_ATSD_HDR_CREDIT_RX(ndev), 0x0000200000000000UL);
+ npu2_write(ndev->npu, NPU2_NTL_RSP_HDR_CREDIT_RX(ndev), 0x0000BE0000000000UL);
+ npu2_write(ndev->npu, NPU2_NTL_CRED_DATA_CREDIT_RX(ndev), 0x0001000000000000UL);
+ npu2_write(ndev->npu, NPU2_NTL_RSP_DATA_CREDIT_RX(ndev), 0x0001000000000000UL);
+
+ npu2_set_link_flag(ndev, NPU2_DEV_DL_RESET);
+
+ return PROCEDURE_COMPLETE;
+}
+DEFINE_PROCEDURE(reset_ntl, reset_ndl, reset_ntl_release, reset_ntl_finish);
+
+/* Procedure 1.2.2 - Reset I/O PHY Lanes */
+static uint32_t phy_reset(struct npu2_dev *ndev)
+{
+ int lane;
+
+ set_iovalid(ndev, false);
+
+ /* Power on clocks */
+ phy_write(ndev, &NPU2_PHY_RX_CLKDIST_PDWN, 0);
+ phy_write(ndev, &NPU2_PHY_RX_IREF_PDWN, 1);
+ phy_write(ndev, &NPU2_PHY_TX_CLKDIST_PDWN, 0);
+ phy_write(ndev, &NPU2_PHY_RX_CTL_DATASM_CLKDIST_PDWN, 0);
+
+ FOR_EACH_LANE(ndev, lane)
+ phy_write_lane(ndev, &NPU2_PHY_RX_RUN_LANE, lane, 0);
+
+ return PROCEDURE_NEXT;
+}
+
+static uint32_t phy_reset_wait(struct npu2_dev *ndev)
+{
+ int lane;
+
+ /* Wait for all lanes to become inactive */
+ FOR_EACH_LANE(ndev, lane)
+ if (phy_read_lane(ndev, &NPU2_PHY_RX_LANE_BUSY, lane))
+ return PROCEDURE_INPROGRESS;
+
+ FOR_EACH_LANE(ndev, lane) {
+ /* Set lane in reset */
+ phy_write_lane(ndev, &NPU2_PHY_RX_IORESET, lane, 1);
+ phy_write_lane(ndev, &NPU2_PHY_TX_IORESET, lane, 1);
+
+ /* Release lane from reset */
+ phy_write_lane(ndev, &NPU2_PHY_RX_IORESET, lane, 0);
+ phy_write_lane(ndev, &NPU2_PHY_TX_IORESET, lane, 0);
+
+ /* Reset the phase rotator */
+ phy_write_lane(ndev, &NPU2_PHY_RX_PR_RESET, lane, 1);
+ phy_write_lane(ndev, &NPU2_PHY_RX_PR_RESET, lane, 0);
+ }
+
+ return PROCEDURE_NEXT;
+}
+
+/* Procedure 1.2.3 - Initialise I/O PHY Registers */
+static uint32_t phy_reset_complete(struct npu2_dev *ndev)
+{
+ int lane;
+
+ FOR_EACH_LANE(ndev, lane) {
+ phy_write_lane(ndev, &NPU2_PHY_RX_LANE_ANA_PDWN, lane, 0);
+ phy_write_lane(ndev, &NPU2_PHY_RX_LANE_DIG_PDWN, lane, 0);
+ phy_write_lane(ndev, &NPU2_PHY_RX_PR_IQ_RES_SEL, lane, 0x7);
+ phy_write_lane(ndev, &NPU2_PHY_RX_PR_PHASE_STEP, lane, 0xc);
+ phy_write_lane(ndev, &NPU2_PHY_TX_LANE_PDWN, lane, 0);
+ phy_write_lane(ndev, &NPU2_PHY_RX_PR_FW_INERTIA_AMT, lane, 4);
+ phy_write_lane(ndev, &NPU2_PHY_RX_CFG_LTE_MC, lane, 3);
+ phy_write_lane(ndev, &NPU2_PHY_RX_A_INTEG_COARSE_GAIN, lane, 11);
+ phy_write_lane(ndev, &NPU2_PHY_RX_B_INTEG_COARSE_GAIN, lane, 11);
+ phy_write_lane(ndev, &NPU2_PHY_RX_E_INTEG_COARSE_GAIN, lane, 11);
+
+ if (ndev->type == NPU2_DEV_TYPE_OPENCAPI) {
+ phy_write_lane(ndev, &NPU2_PHY_RX_A_CTLE_GAIN, lane, 0);
+ phy_write_lane(ndev, &NPU2_PHY_RX_B_CTLE_GAIN, lane, 0);
+ phy_write_lane(ndev, &NPU2_PHY_RX_E_CTLE_GAIN, lane, 0);
+
+ phy_write_lane(ndev, &NPU2_PHY_RX_A_CTLE_COARSE, lane, 20);
+ phy_write_lane(ndev, &NPU2_PHY_RX_B_CTLE_COARSE, lane, 20);
+ phy_write_lane(ndev, &NPU2_PHY_RX_E_CTLE_COARSE, lane, 20);
+ }
+ }
+
+ set_iovalid(ndev, true);
+
+ return PROCEDURE_COMPLETE;
+}
+DEFINE_PROCEDURE(phy_reset, phy_reset_wait, phy_reset_complete);
+
+/* Procedure 1.2.6 - I/O PHY Tx Impedance Calibration */
+static uint32_t phy_tx_zcal(struct npu2_dev *ndev)
+{
+ if (ndev->npu->tx_zcal_complete[obus_index(ndev)])
+ return PROCEDURE_COMPLETE;
+
+ /* Turn off SW enable and enable zcal state machine */
+ phy_write(ndev, &NPU2_PHY_TX_ZCAL_SWO_EN, 0);
+
+ /* Start impedance calibration state machine */
+ phy_write(ndev, &NPU2_PHY_TX_ZCAL_REQ, 1);
+
+ return PROCEDURE_NEXT;
+}
+
+static uint32_t phy_tx_zcal_wait(struct npu2_dev *ndev)
+{
+ int done, error;
+
+ done = phy_read(ndev, &NPU2_PHY_TX_ZCAL_DONE);
+ error = phy_read(ndev, &NPU2_PHY_TX_ZCAL_ERROR);
+
+ /* We have never seen this in the field and it is not expected.
+ * Therefore it's best to error out, which will complain loudly. Nominal
+ * values may be set in nvram to ignore this error. */
+ if (error && nv_zcal_nominal < 0) {
+ NPU2DEVERR(ndev, "ZCAL failed. Nominal values may be used by"
+ " setting nvram variable nv_zcal_override = 50\n");
+ NPU2DEVERR(ndev, "However this may impact link performance\n");
+ return PROCEDURE_COMPLETE | PROCEDURE_FAILED;
+ }
+
+ if (!done)
+ return PROCEDURE_INPROGRESS;
+
+ return PROCEDURE_NEXT;
+}
+
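+/*
+ * The *_X2_* limits below (and ZCAL_MIN/ZCAL_MAX) are expressed in 2r
+ * (half-segment) units; therm_with_half() converts such "2r equivalent"
+ * values back into thermometer-coded enable masks.
+ */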
+#define MARGIN_RATIO (0)
+#define FFE_PRE_COEFF (0)
+#define FFE_POST_COEFF (0)
+
+#define PRE_WIDTH (5)
+#define POST_WIDTH (7)
+#define MAIN_WIDTH (7)
+#define ZCAL_MIN (16 * 2)
+#define ZCAL_MAX (33 * 2)
+#define PRECURSOR_X2_MAX (4 * 2 + 1)
+#define POSTCURSOR_X2_MAX (6 * 2 + 1)
+#define MARGIN_X2_MAX (8 * 2)
+#define MAIN_X2_MAX ((6 * 2) + 1)
+#define TOTAL_X2_MAX (PRECURSOR_X2_MAX + POSTCURSOR_X2_MAX + 2*MARGIN_X2_MAX + MAIN_X2_MAX)
+
+static uint32_t therm(uint32_t dec)
+{
+ return ((0x1 << dec) - 1);
+}
+
+static uint32_t therm_with_half(uint32_t dec, uint8_t width)
+{
+ /* If the LSB of the 2r equivalent is on, then we need to set the 2r bit (MSB) */
+ uint32_t half_on = ( dec & 0x1 ) << ( width - 1 );
+
+ /* Shift the 2r equivalent to a 1r value and convert to a thermometer code. */
+ uint32_t x1_equiv = ((1 << (dec >> 1 )) - 1);
+
+ /* Combine 1r equivalent thermometer code + the 2r MSB value. */
+ return half_on | x1_equiv;
+}
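+
+/*
+ * Worked example: therm(3) = 0b111; therm_with_half(7, 4) keeps the 2r LSB
+ * as the MSB (1 << 3 = 0b1000) and adds therm(7 >> 1) = 0b111, giving 0b1111.
+ */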
+
+static uint32_t phy_tx_zcal_calculate(struct npu2_dev *ndev)
+{
+ int p_value, n_value;
+ int ffe_pre_coeff = FFE_PRE_COEFF;
+ int ffe_post_coeff = FFE_POST_COEFF;
+ uint32_t zcal_n;
+ uint32_t zcal_p;
+ uint32_t p_main_enable = MAIN_X2_MAX;
+ uint32_t p_margin_pu_enable = MARGIN_X2_MAX;
+ uint32_t p_margin_pd_enable = MARGIN_X2_MAX;
+ uint32_t p_precursor_select;
+ uint32_t p_postcursor_select;
+ uint32_t margin_pu_select;
+ uint32_t n_main_enable = MAIN_X2_MAX;
+ uint32_t n_margin_pu_enable = MARGIN_X2_MAX;
+ uint32_t n_margin_pd_enable = MARGIN_X2_MAX;
+ uint32_t n_precursor_select;
+ uint32_t n_postcursor_select;
+ uint32_t margin_pd_select;
+ uint32_t margin_select;
+
+ if (nv_zcal_nominal < 0) {
+ /* Convert the value from 8R to 2R by / 4 */
+ zcal_n = phy_read(ndev, &NPU2_PHY_TX_ZCAL_N) / 4;
+ zcal_p = phy_read(ndev, &NPU2_PHY_TX_ZCAL_P) / 4;
+ } else {
+ zcal_n = zcal_p = nv_zcal_nominal;
+ NPU2DEVINF(ndev, "Using nominal values for zcal, performance may be impacted\n");
+ }
+
+ /* Again, if the hardware detects an unexpected condition it's
+ * better just to fail loudly. */
+ if ((zcal_n < ZCAL_MIN) || (zcal_n > ZCAL_MAX) ||
+ (zcal_p < ZCAL_MIN) || (zcal_p > ZCAL_MAX))
+ return PROCEDURE_COMPLETE | PROCEDURE_FAILED;
+
+ if (ndev->type == NPU2_DEV_TYPE_OPENCAPI &&
+ platform.ocapi->phy_setup) {
+ ffe_pre_coeff = platform.ocapi->phy_setup->tx_ffe_pre_coeff;
+ ffe_post_coeff = platform.ocapi->phy_setup->tx_ffe_post_coeff;
+ }
+
+ p_value = zcal_p - TOTAL_X2_MAX;
+ p_precursor_select = (p_value * ffe_pre_coeff)/128;
+ p_postcursor_select = (p_value * ffe_post_coeff)/128;
+ margin_pu_select = (p_value * MARGIN_RATIO)/256;
+
+ if (p_value % 2) {
+ p_main_enable--;
+ p_value++;
+ }
+
+ while (p_value < 0) {
+ if (p_main_enable > 1) {
+ p_main_enable -= 2;
+ } else if ((p_margin_pu_enable + p_margin_pd_enable) > 0) {
+ if (p_margin_pu_enable == p_margin_pd_enable)
+ p_margin_pd_enable -= 2;
+ else
+ p_margin_pu_enable -= 2;
+ }
+ p_value += 2;
+ }
+
+ n_value = zcal_n - TOTAL_X2_MAX;
+ n_precursor_select = (n_value * ffe_pre_coeff)/128;
+ n_postcursor_select = (n_value * ffe_post_coeff)/128;
+ margin_pd_select = (p_value * MARGIN_RATIO)/256;
+
+ if (n_value % 2) {
+ n_main_enable--;
+ n_value++;
+ }
+
+ while (n_value < 0) {
+ if (n_main_enable > 1) {
+ n_main_enable -= 2;
+ } else if ((n_margin_pu_enable + n_margin_pd_enable) > 0) {
+ if (n_margin_pu_enable == n_margin_pd_enable)
+ n_margin_pd_enable -= 2;
+ else
+ n_margin_pu_enable -= 2;
+ }
+ n_value += 2;
+ }
+
+ margin_select = therm((margin_pu_select + 1)/2) &
+ therm((margin_pd_select + 1)/2) &
+ therm((p_margin_pu_enable + 1)/2) &
+ therm((p_margin_pd_enable + 1)/2) &
+ therm((n_margin_pu_enable + 1)/2) &
+ therm((n_margin_pd_enable + 1)/2);
+
+ phy_write(ndev, &NPU2_PHY_TX_PSEG_PRE_EN, therm_with_half(PRECURSOR_X2_MAX, PRE_WIDTH));
+ phy_write(ndev, &NPU2_PHY_TX_PSEG_PRE_SELECT, therm_with_half(p_precursor_select, PRE_WIDTH));
+ phy_write(ndev, &NPU2_PHY_TX_PSEG_POST_EN, therm_with_half(POSTCURSOR_X2_MAX, POST_WIDTH));
+ phy_write(ndev, &NPU2_PHY_TX_PSEG_POST_SELECT, therm_with_half(p_postcursor_select, POST_WIDTH));
+ phy_write(ndev, &NPU2_PHY_TX_PSEG_MARGINPU_EN, therm((p_margin_pu_enable + 1)/2));
+ phy_write(ndev, &NPU2_PHY_TX_PSEG_MARGINPD_EN, therm((p_margin_pd_enable + 1)/2));
+ phy_write(ndev, &NPU2_PHY_TX_PSEG_MAIN_EN, therm_with_half(p_main_enable, MAIN_WIDTH));
+
+ phy_write(ndev, &NPU2_PHY_TX_NSEG_PRE_EN, therm_with_half(PRECURSOR_X2_MAX, PRE_WIDTH));
+ phy_write(ndev, &NPU2_PHY_TX_NSEG_PRE_SELECT, therm_with_half(n_precursor_select, PRE_WIDTH));
+ phy_write(ndev, &NPU2_PHY_TX_NSEG_POST_EN, therm_with_half(POSTCURSOR_X2_MAX, POST_WIDTH));
+ phy_write(ndev, &NPU2_PHY_TX_NSEG_POST_SELECT, therm_with_half(n_postcursor_select, POST_WIDTH));
+ phy_write(ndev, &NPU2_PHY_TX_NSEG_MARGINPU_EN, therm((n_margin_pu_enable + 1)/2));
+ phy_write(ndev, &NPU2_PHY_TX_NSEG_MARGINPD_EN, therm((n_margin_pd_enable + 1)/2));
+ phy_write(ndev, &NPU2_PHY_TX_NSEG_MAIN_EN, therm_with_half(n_main_enable, MAIN_WIDTH));
+
+ phy_write(ndev, &NPU2_PHY_TX_MARGINPU_SELECT, therm(margin_select + 1)/2);
+ phy_write(ndev, &NPU2_PHY_TX_MARGINPD_SELECT, therm(margin_select + 1)/2);
+
+ ndev->npu->tx_zcal_complete[obus_index(ndev)] = 1;
+ return PROCEDURE_COMPLETE;
+}
+DEFINE_PROCEDURE(phy_tx_zcal, phy_tx_zcal_wait, phy_tx_zcal_calculate);
+
+/* Procedure 1.2.8 - Enable Downstream Link Training */
+static uint32_t phy_enable_tx_rxcal(struct npu2_dev *ndev)
+{
+ int lane;
+
+ FOR_EACH_LANE(ndev, lane)
+ phy_write_lane(ndev, &NPU2_PHY_TX_RXCAL, lane, 1);
+
+ return PROCEDURE_COMPLETE;
+}
+DEFINE_PROCEDURE(phy_enable_tx_rxcal);
+
+/* Procedure 1.2.9 - Disable Downstream Link Training */
+static uint32_t phy_disable_tx_rxcal(struct npu2_dev *ndev)
+{
+ int lane;
+
+ FOR_EACH_LANE(ndev, lane)
+ phy_write_lane(ndev, &NPU2_PHY_TX_RXCAL, lane, 0);
+
+ return PROCEDURE_COMPLETE;
+}
+DEFINE_PROCEDURE(phy_disable_tx_rxcal);
+
+/* Procedure 1.2.4 - I/O PHY DC Calibration */
+static uint32_t phy_rx_dccal(struct npu2_dev *ndev)
+{
+ int lane;
+
+ set_iovalid(ndev, false);
+
+ FOR_EACH_LANE(ndev, lane)
+ phy_write_lane(ndev, &NPU2_PHY_RX_PR_FW_OFF, lane, 1);
+
+ FOR_EACH_LANE(ndev, lane)
+ phy_write_lane(ndev, &NPU2_PHY_RX_RUN_DCCAL, lane, 1);
+
+ return PROCEDURE_NEXT;
+}
+
+static uint32_t phy_rx_dccal_complete(struct npu2_dev *ndev)
+{
+ int lane;
+
+ FOR_EACH_LANE(ndev, lane)
+ if (!phy_read_lane(ndev, &NPU2_PHY_RX_DCCAL_DONE, lane))
+ return PROCEDURE_INPROGRESS;
+
+ FOR_EACH_LANE(ndev, lane)
+ phy_write_lane(ndev, &NPU2_PHY_RX_RUN_DCCAL, lane, 0);
+
+ FOR_EACH_LANE(ndev, lane) {
+ phy_write_lane(ndev, &NPU2_PHY_RX_B_BANK_CONTROLS, lane, 0);
+ phy_write_lane(ndev, &NPU2_PHY_RX_PR_EDGE_TRACK_CNTL, lane, 0);
+ phy_write_lane(ndev, &NPU2_PHY_RX_PR_FW_OFF, lane, 0);
+ }
+
+ set_iovalid(ndev, true);
+
+ return PROCEDURE_NEXT;
+}
+
+static uint32_t phy_rx_clock_sel(struct npu2_dev *ndev)
+{
+ if (ndev->type != NPU2_DEV_TYPE_OPENCAPI) {
+ /*
+ * Change the RX clk mux control to be done by
+ * software instead of HW. This avoids glitches caused
+ * by changing the mux setting.
+ *
+ * Work around a known DL bug by doing these writes
+ * twice.
+ */
+ npu2_write_mask_4b(ndev->npu, NPU2_NTL_DL_CLK_CTRL(ndev),
+ 0x80000002, 0x80000003);
+ npu2_write_mask_4b(ndev->npu, NPU2_NTL_DL_CLK_CTRL(ndev),
+ 0x80000002, 0x80000003);
+
+ npu2_write_mask_4b(ndev->npu, NPU2_NTL_DL_CLK_CTRL(ndev),
+ 0x80000000, 0x80000003);
+ npu2_write_mask_4b(ndev->npu, NPU2_NTL_DL_CLK_CTRL(ndev),
+ 0x80000000, 0x80000003);
+ }
+ return PROCEDURE_NEXT;
+}
+
+/* Procedure 1.2.5 - IO PHY Tx FIFO Init */
+static uint32_t phy_tx_fifo_init(struct npu2_dev *ndev)
+{
+ int lane;
+
+ FOR_EACH_LANE(ndev, lane) {
+ phy_write_lane(ndev, &NPU2_PHY_TX_UNLOAD_CLK_DISABLE, lane, 0);
+ phy_write_lane(ndev, &NPU2_PHY_TX_FIFO_INIT, lane, 1);
+ phy_write_lane(ndev, &NPU2_PHY_TX_UNLOAD_CLK_DISABLE, lane, 1);
+ }
+
+ return PROCEDURE_COMPLETE;
+}
+
+/* We group TX FIFO init in here mainly because that's what was done
+ * on NVLink1 */
+DEFINE_PROCEDURE(phy_rx_dccal, phy_rx_dccal_complete, phy_rx_clock_sel,
+ phy_tx_fifo_init);
+
+/* Procedure 1.2.7 - I/O PHY Upstream Link Training */
+static uint32_t phy_rx_training(struct npu2_dev *ndev)
+{
+ int lane;
+
+ FOR_EACH_LANE(ndev, lane)
+ phy_write_lane(ndev, &NPU2_PHY_RX_RUN_LANE, lane, 1);
+
+ return PROCEDURE_NEXT;
+}
+
+static uint32_t phy_rx_training_wait(struct npu2_dev *ndev)
+{
+ int lane;
+
+ FOR_EACH_LANE(ndev, lane)
+ if (!phy_read_lane(ndev, &NPU2_PHY_RX_INIT_DONE, lane))
+ return PROCEDURE_INPROGRESS;
+
+ return PROCEDURE_COMPLETE;
+}
+DEFINE_PROCEDURE(phy_rx_training, phy_rx_training_wait);
+
+static uint32_t check_credit(struct npu2_dev *ndev, uint64_t reg,
+ const char *reg_name, uint64_t expected)
+{
+ uint64_t val;
+
+ val = npu2_read(ndev->npu, reg);
+ if (val == expected)
+ return 0;
+
+ NPU2DEVERR(ndev, "%s: expected 0x%llx, read 0x%llx\n",
+ reg_name, expected, val);
+
+ return 1;
+}
+
+#define CHECK_CREDIT(ndev, reg, expected) \
+ check_credit(ndev, reg(ndev), #reg, expected);
+
+static uint32_t check_credits(struct npu2_dev *ndev)
+{
+ uint64_t val;
+
+ CHECK_CREDIT(ndev, NPU2_NTL_CRED_HDR_CREDIT_RX, 0x0BE0BE0000000000ULL);
+ CHECK_CREDIT(ndev, NPU2_NTL_RSP_HDR_CREDIT_RX, 0x0BE0BE0000000000ULL);
+ CHECK_CREDIT(ndev, NPU2_NTL_CRED_DATA_CREDIT_RX, 0x1001000000000000ULL);
+ CHECK_CREDIT(ndev, NPU2_NTL_RSP_DATA_CREDIT_RX, 0x1001000000000000ULL);
+ CHECK_CREDIT(ndev, NPU2_NTL_DBD_HDR_CREDIT_RX, 0x0640640000000000ULL);
+ CHECK_CREDIT(ndev, NPU2_NTL_ATSD_HDR_CREDIT_RX, 0x0200200000000000ULL);
+
+ val = npu2_read(ndev->npu, NPU2_NTL_MISC_CFG1(ndev));
+ val &= 0xFF3FFFFFFFFFFFFFUL;
+ npu2_write(ndev->npu, NPU2_NTL_MISC_CFG1(ndev), val);
+
+ if (!poll_fence_status(ndev, 0x0))
+ return PROCEDURE_COMPLETE | PROCEDURE_FAILED;
+
+ val = NPU2_NTL_MISC_CFG2_NDL_RX_PARITY_ENA;
+ npu2_write_mask(ndev->npu, NPU2_NTL_MISC_CFG2(ndev), val, val);
+
+ return PROCEDURE_COMPLETE;
+}
+DEFINE_PROCEDURE(check_credits);
+
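+/*
+ * Indexed by the procedure number written through the vendor-specific
+ * config space (npu_dev_procedure_write) or passed to run_procedure():
+ * 4 = phy_reset, 5 = phy_tx_zcal, 6 = phy_rx_dccal, 9 = phy_rx_training,
+ * 10 = reset_ntl, 13 = check_credits.
+ */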
+static struct procedure *npu_procedures[] = {
+ &procedure_stop,
+ &procedure_nop,
+ NULL,
+ NULL,
+ &procedure_phy_reset,
+ &procedure_phy_tx_zcal,
+ &procedure_phy_rx_dccal,
+ &procedure_phy_enable_tx_rxcal,
+ &procedure_phy_disable_tx_rxcal,
+ &procedure_phy_rx_training,
+ &procedure_reset_ntl,
+
+ /* Place holders for pre-terminate and terminate procedures */
+ &procedure_nop,
+ &procedure_nop,
+ &procedure_check_credits
+};
+
+/* Run a procedure step(s) and return status */
+static uint32_t get_procedure_status(struct npu2_dev *dev)
+{
+ uint32_t result;
+ uint16_t procedure = dev->procedure_number;
+ uint16_t step = dev->procedure_step;
+ const char *name = npu_procedures[procedure]->name;
+
+ do {
+ result = npu_procedures[procedure]->steps[step](dev);
+
+ if (result & PROCEDURE_NEXT) {
+ step++;
+ NPU2DEVINF(dev, "Running procedure %s step %d\n", name, step);
+ }
+ } while (result & PROCEDURE_NEXT);
+
+ dev->procedure_step = step;
+
+ if (result & PROCEDURE_COMPLETE)
+ NPU2DEVINF(dev, "Procedure %s complete\n", name);
+ else if (mftb() > dev->procedure_tb + msecs_to_tb(1000)) {
+ NPU2DEVINF(dev, "Procedure %s timed out\n", name);
+ result = PROCEDURE_COMPLETE | PROCEDURE_FAILED;
+ }
+
+ /* Mask off internal state bits */
+ dev->procedure_status = result & PROCEDURE_STATUS_MASK;
+
+ return dev->procedure_status;
+}
+
+static int64_t npu_dev_procedure_read(struct npu2_dev *dev, uint32_t offset,
+ uint32_t size, uint32_t *data)
+{
+ int64_t rc = OPAL_SUCCESS;
+
+ if (size != 4) {
+ /* Short config reads are not supported */
+ prlog(PR_ERR, "NPU%d: Short read of procedure register\n", npu2_dev_to_phb(dev)->opal_id);
+ return OPAL_PARAMETER;
+ }
+
+ *data = 0;
+
+ switch (offset) {
+ case 0:
+ /* Only run the procedure if not already complete */
+ if (dev->procedure_status & PROCEDURE_COMPLETE)
+ *data = dev->procedure_status;
+ else
+ *data = get_procedure_status(dev);
+
+ break;
+
+ case 4:
+ *data = dev->procedure_number;
+ break;
+
+ default:
+ prlog(PR_ERR, "NPU%d: Invalid vendor specific offset 0x%08x\n",
+ npu2_dev_to_phb(dev)->opal_id, offset);
+ rc = OPAL_PARAMETER;
+ }
+
+ return rc;
+}
+
+static int64_t npu_dev_procedure_write(struct npu2_dev *dev, uint32_t offset,
+ uint32_t size, uint32_t data)
+{
+ const char *name;
+ int64_t rc = OPAL_SUCCESS;
+
+ if (size != 4) {
+ /* Short config writes are not supported */
+ prlog(PR_ERR, "NPU%d: Short write of procedure register\n",
+ npu2_dev_to_phb(dev)->opal_id);
+ return OPAL_PARAMETER;
+ }
+
+ switch (offset) {
+ case 0:
+ /* We ignore writes to the status register */
+ NPU2DEVINF(dev, "Ignoring writes to status register\n");
+ break;
+
+ case 4:
+ if (data >= ARRAY_SIZE(npu_procedures) ||
+ !npu_procedures[data]) {
+ NPU2DEVINF(dev, "Unsupported procedure number %d\n", data);
+ dev->procedure_status = PROCEDURE_COMPLETE
+ | PROCEDURE_UNSUPPORTED;
+ break;
+ }
+
+ name = npu_procedures[data]->name;
+ if (dev->procedure_number == data
+ && !(dev->procedure_status & PROCEDURE_COMPLETE))
+ NPU2DEVINF(dev, "Restarting procedure %s\n", name);
+ else
+ NPU2DEVINF(dev, "Starting procedure %s\n", name);
+
+ dev->procedure_status = PROCEDURE_INPROGRESS;
+ dev->procedure_number = data;
+ dev->procedure_step = 0;
+ dev->procedure_tb = mftb();
+ break;
+
+ default:
+ NPU2DEVINF(dev, "Invalid vendor specific offset 0x%08x\n", offset);
+ rc = OPAL_PARAMETER;
+ }
+
+ return rc;
+}
+
+int64_t npu2_dev_procedure(void *dev, struct pci_cfg_reg_filter *pcrf,
+ uint32_t offset, uint32_t len, uint32_t *data,
+ bool write)
+{
+ struct pci_virt_device *pvd = dev;
+ struct npu2_dev *ndev = pvd->data;
+
+ if (write)
+ return npu_dev_procedure_write(ndev, offset - pcrf->start,
+ len, *data);
+
+ return npu_dev_procedure_read(ndev, offset - pcrf->start, len, data);
+}
+
+void npu2_dev_procedure_reset(struct npu2_dev *dev)
+{
+ uint64_t val;
+
+ /* Fence the brick */
+ val = npu2_read(dev->npu, NPU2_NTL_MISC_CFG1(dev));
+ val |= PPC_BIT(8) | PPC_BIT(9);
+ npu2_write(dev->npu, NPU2_NTL_MISC_CFG1(dev), val);
+
+ npu2_clear_link_flag(dev, NPU2_DEV_DL_RESET);
+}
+
+static uint32_t run_procedure(struct npu2_dev *dev, uint16_t procedure_number)
+{
+ struct procedure *proc;
+ const char *name;
+ uint32_t result;
+
+ assert(procedure_number < ARRAY_SIZE(npu_procedures));
+ proc = npu_procedures[procedure_number];
+ assert(proc);
+
+ name = proc->name;
+ NPU2DEVINF(dev, "Running procedure %s\n", name);
+ dev->procedure_status = PROCEDURE_INPROGRESS;
+ dev->procedure_number = procedure_number;
+ dev->procedure_step = 0;
+ dev->procedure_tb = mftb();
+
+ result = get_procedure_status(dev);
+ while (!(result & PROCEDURE_COMPLETE)) {
+ time_wait_ms(1);
+ result = get_procedure_status(dev);
+ }
+ return result;
+}
+
+void npu2_opencapi_bump_ui_lane(struct npu2_dev *dev)
+{
+ uint64_t reg;
+ uint64_t status_xscom;
+ int lane, bit = 7;
+
+ status_xscom = OB_ODL_TRAINING_STATUS(dev->brick_index);
+ xscom_read(dev->npu->chip_id, status_xscom, &reg);
+ reg = GETFIELD(OB_ODL_TRAINING_STATUS_STS_RX_PATTERN_B, reg);
+
+ FOR_EACH_LANE(dev, lane) {
+ if (reg & (1 << bit--))
+ continue;
+ prlog(PR_TRACE, "OCAPI: bumpui bumping lane %d\n", lane);
+ for (int i = 0; i < 4; i++) {
+ phy_write_lane(dev, &NPU2_PHY_RX_PR_BUMP_SL_1UI, lane, 1);
+ phy_write_lane(dev, &NPU2_PHY_RX_PR_BUMP_SL_1UI, lane, 0);
+ }
+ }
+}
+
+void npu2_opencapi_phy_init(struct npu2_dev *dev)
+{
+ if (platform.ocapi->phy_setup) {
+ OCAPIINF(dev, "Enabling platform-specific PHY setup\n");
+ phy_write(dev, &NPU2_PHY_TX_FFE_BOOST_EN,
+ platform.ocapi->phy_setup->tx_ffe_boost_en);
+ }
+
+ run_procedure(dev, 5); /* procedure_phy_tx_zcal */
+ /*
+ * This is only required for OpenCAPI - Hostboot tries to set this
+ * on systems where it can tell a link is OpenCAPI, but for
+ * Witherspoon it needs to be done in skiboot after device detection.
+ */
+ phy_write(dev, &NPU2_PHY_RX_RC_ENABLE_AUTO_RECAL, 0x1);
+ phy_write(dev, &NPU2_PHY_RX_AC_COUPLED, 1);
+
+ switch (dev->link_speed) {
+ case 20000000000UL:
+ OCAPIINF(dev, "Link speed set at 20Gb/s\n");
+ phy_write(dev, &NPU2_PHY_RX_SPEED_SELECT, 1);
+ break;
+ case 25000000000UL:
+ case 25781250000UL:
+ OCAPIINF(dev, "Link speed set at 25.xGb/s\n");
+ phy_write(dev, &NPU2_PHY_RX_SPEED_SELECT, 0);
+ break;
+ default:
+ OCAPIERR(dev, "Invalid link speed!\n");
+ assert(false);
+ }
+}
+
+int npu2_opencapi_phy_reset(struct npu2_dev *dev)
+{
+ int rc;
+
+ rc = run_procedure(dev, 4); /* procedure_phy_reset */
+ if (rc != PROCEDURE_COMPLETE)
+ return -1;
+ rc = run_procedure(dev, 6); /* procedure_phy_rx_dccal */
+ if (rc != PROCEDURE_COMPLETE)
+ return -1;
+ return 0;
+}
+
+void npu2_opencapi_phy_prbs31(struct npu2_dev *dev)
+{
+ phy_write(dev, &NPU2_PHY_TX_DRV_DATA_PATTERN_GCRMSG, 0xD);
+}
diff --git a/roms/skiboot/hw/npu2-opencapi.c b/roms/skiboot/hw/npu2-opencapi.c
new file mode 100644
index 000000000..035c6cdc3
--- /dev/null
+++ b/roms/skiboot/hw/npu2-opencapi.c
@@ -0,0 +1,2370 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Support for OpenCAPI on POWER9 NPUs
+ *
+ * This file provides support for OpenCAPI as implemented on POWER9.
+ *
+ * At present, we initialise the NPU separately from the NVLink code in npu2.c.
+ * As such, we don't currently support mixed NVLink and OpenCAPI configurations
+ * on the same NPU for machines such as Witherspoon.
+ *
+ * Procedure references in this file are to the POWER9 OpenCAPI NPU Workbook
+ * (IBM internal document).
+ *
+ * TODO:
+ * - Support for mixed NVLink and OpenCAPI on the same NPU
+ * - Support for link ganging (one AFU using multiple links)
+ * - Link reset and error handling
+ * - Presence detection
+ * - Consume HDAT NPU information
+ * - LPC Memory support
+ *
+ * Copyright 2013-2019 IBM Corp.
+ */
+
+#include <skiboot.h>
+#include <xscom.h>
+#include <io.h>
+#include <timebase.h>
+#include <pci.h>
+#include <pci-cfg.h>
+#include <pci-slot.h>
+#include <interrupts.h>
+#include <opal.h>
+#include <opal-api.h>
+#include <npu2.h>
+#include <npu2-regs.h>
+#include <phys-map.h>
+#include <i2c.h>
+#include <nvram.h>
+
+#define NPU_IRQ_LEVELS_XSL 23
+#define MAX_PE_HANDLE ((1 << 15) - 1)
+#define TL_MAX_TEMPLATE 63
+#define TL_RATE_BUF_SIZE 32
+
+#define OCAPI_SLOT_NORMAL PCI_SLOT_STATE_NORMAL
+#define OCAPI_SLOT_LINK PCI_SLOT_STATE_LINK
+#define OCAPI_SLOT_LINK_START (OCAPI_SLOT_LINK + 1)
+#define OCAPI_SLOT_LINK_WAIT (OCAPI_SLOT_LINK + 2)
+#define OCAPI_SLOT_LINK_TRAINED (OCAPI_SLOT_LINK + 3)
+#define OCAPI_SLOT_FRESET PCI_SLOT_STATE_FRESET
+#define OCAPI_SLOT_FRESET_START (OCAPI_SLOT_FRESET + 1)
+#define OCAPI_SLOT_FRESET_INIT (OCAPI_SLOT_FRESET + 2)
+#define OCAPI_SLOT_FRESET_ASSERT_DELAY (OCAPI_SLOT_FRESET + 3)
+#define OCAPI_SLOT_FRESET_DEASSERT_DELAY (OCAPI_SLOT_FRESET + 4)
+#define OCAPI_SLOT_FRESET_INIT_DELAY (OCAPI_SLOT_FRESET + 5)
+
+#define OCAPI_LINK_TRAINING_RETRIES 2
+#define OCAPI_LINK_TRAINING_TIMEOUT 3000 /* ms */
+#define OCAPI_LINK_STATE_TRAINED 0x7
+
+enum npu2_link_training_state {
+ NPU2_TRAIN_DEFAULT, /* fully train the link */
+ NPU2_TRAIN_PRBS31, /* used for Signal Integrity testing */
+ NPU2_TRAIN_NONE, /* used for testing with loopback cable */
+};
+static enum npu2_link_training_state npu2_ocapi_training_state = NPU2_TRAIN_DEFAULT;
+
+static const struct phb_ops npu2_opencapi_ops;
+
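+/*
+ * OpenCAPI link indexes 2/3 sit on NPU stack 1 and 4/5 on stack 2; within
+ * a stack, the even-numbered link uses OTL0 and the odd-numbered one OTL1.
+ */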
+static inline uint64_t index_to_stack(uint64_t index) {
+ switch (index) {
+ case 2:
+ case 3:
+ return NPU2_STACK_STCK_1;
+ break;
+ case 4:
+ case 5:
+ return NPU2_STACK_STCK_2;
+ break;
+ default:
+ assert(false);
+ }
+}
+
+static inline uint64_t index_to_stacku(uint64_t index) {
+ switch (index) {
+ case 2:
+ case 3:
+ return NPU2_STACK_STCK_1U;
+ break;
+ case 4:
+ case 5:
+ return NPU2_STACK_STCK_2U;
+ break;
+ default:
+ assert(false);
+ }
+}
+
+static inline uint64_t index_to_block(uint64_t index) {
+ switch (index) {
+ case 2:
+ case 4:
+ return NPU2_BLOCK_OTL0;
+ break;
+ case 3:
+ case 5:
+ return NPU2_BLOCK_OTL1;
+ break;
+ default:
+ assert(false);
+ }
+}
+
+static uint64_t get_odl_status(uint32_t gcid, uint64_t index)
+{
+ uint64_t reg, status_xscom;
+
+ status_xscom = OB_ODL_STATUS(index);
+ xscom_read(gcid, status_xscom, &reg);
+ return reg;
+}
+
+static uint64_t get_odl_training_status(uint32_t gcid, uint64_t index)
+{
+ uint64_t status_xscom, reg;
+
+ status_xscom = OB_ODL_TRAINING_STATUS(index);
+ xscom_read(gcid, status_xscom, &reg);
+ return reg;
+}
+
+static uint64_t get_odl_endpoint_info(uint32_t gcid, uint64_t index)
+{
+ uint64_t status_xscom, reg;
+
+ status_xscom = OB_ODL_ENDPOINT_INFO(index);
+ xscom_read(gcid, status_xscom, &reg);
+ return reg;
+}
+
+static void disable_nvlink(uint32_t gcid, int index)
+{
+ uint64_t phy_config_scom, reg;
+
+ switch (index) {
+ case 2:
+ case 3:
+ phy_config_scom = OBUS_LL0_IOOL_PHY_CONFIG;
+ break;
+ case 4:
+ case 5:
+ phy_config_scom = OBUS_LL3_IOOL_PHY_CONFIG;
+ break;
+ default:
+ assert(false);
+ }
+ /* Disable NV-Link link layers */
+ xscom_read(gcid, phy_config_scom, &reg);
+ reg &= ~OBUS_IOOL_PHY_CONFIG_NV0_NPU_ENABLED;
+ reg &= ~OBUS_IOOL_PHY_CONFIG_NV1_NPU_ENABLED;
+ reg &= ~OBUS_IOOL_PHY_CONFIG_NV2_NPU_ENABLED;
+ xscom_write(gcid, phy_config_scom, reg);
+}
+
+/* Procedure 13.1.3.1 - select OCAPI vs NVLink for bricks 2-3/4-5 */
+
+static void set_transport_mux_controls(uint32_t gcid, uint32_t scom_base,
+ int index, enum npu2_dev_type type)
+{
+ /* Step 1 - Set Transport MUX controls to select correct OTL or NTL */
+ uint64_t reg;
+ uint64_t field;
+
+ /* TODO: Rework this to select for NVLink too */
+ assert(type == NPU2_DEV_TYPE_OPENCAPI);
+
+ prlog(PR_DEBUG, "OCAPI: %s: Setting transport mux controls\n", __func__);
+
+ /* Optical IO Transport Mux Config for Bricks 0-2 and 4-5 */
+ reg = npu2_scom_read(gcid, scom_base, NPU2_MISC_OPTICAL_IO_CFG0,
+ NPU2_MISC_DA_LEN_8B);
+ switch (index) {
+ case 0:
+ case 1:
+ /* not valid for OpenCAPI */
+ assert(false);
+ break;
+ case 2: /* OTL1.0 */
+ field = GETFIELD(NPU2_MISC_OPTICAL_IO_CFG0_NDLMUX_BRK0TO2, reg);
+ field &= ~0b100;
+ reg = SETFIELD(NPU2_MISC_OPTICAL_IO_CFG0_NDLMUX_BRK0TO2, reg,
+ field);
+ field = GETFIELD(NPU2_MISC_OPTICAL_IO_CFG0_OCMUX_BRK0TO1, reg);
+ field |= 0b10;
+ reg = SETFIELD(NPU2_MISC_OPTICAL_IO_CFG0_OCMUX_BRK0TO1, reg,
+ field);
+ break;
+ case 3: /* OTL1.1 */
+ field = GETFIELD(NPU2_MISC_OPTICAL_IO_CFG0_NDLMUX_BRK0TO2, reg);
+ field &= ~0b010;
+ reg = SETFIELD(NPU2_MISC_OPTICAL_IO_CFG0_NDLMUX_BRK0TO2, reg,
+ field);
+ field = GETFIELD(NPU2_MISC_OPTICAL_IO_CFG0_OCMUX_BRK0TO1, reg);
+ field |= 0b01;
+ reg = SETFIELD(NPU2_MISC_OPTICAL_IO_CFG0_OCMUX_BRK0TO1, reg,
+ field);
+ break;
+ case 4: /* OTL2.0 */
+ field = GETFIELD(NPU2_MISC_OPTICAL_IO_CFG0_OCMUX_BRK4TO5, reg);
+ field |= 0b10;
+ reg = SETFIELD(NPU2_MISC_OPTICAL_IO_CFG0_OCMUX_BRK4TO5, reg,
+ field);
+ break;
+ case 5: /* OTL2.1 */
+ field = GETFIELD(NPU2_MISC_OPTICAL_IO_CFG0_OCMUX_BRK4TO5, reg);
+ field |= 0b01;
+ reg = SETFIELD(NPU2_MISC_OPTICAL_IO_CFG0_OCMUX_BRK4TO5, reg,
+ field);
+ break;
+ default:
+ assert(false);
+ }
+ npu2_scom_write(gcid, scom_base, NPU2_MISC_OPTICAL_IO_CFG0,
+ NPU2_MISC_DA_LEN_8B, reg);
+
+ /*
+ * PowerBus Optical Miscellaneous Config Register - select
+ * OpenCAPI for b4/5 and A-Link for b3
+ */
+ xscom_read(gcid, PU_IOE_PB_MISC_CFG, &reg);
+ switch (index) {
+ case 0:
+ case 1:
+ case 2:
+ case 3:
+ break;
+ case 4:
+ reg = SETFIELD(PU_IOE_PB_MISC_CFG_SEL_04_NPU_NOT_PB, reg, 1);
+ break;
+ case 5:
+ reg = SETFIELD(PU_IOE_PB_MISC_CFG_SEL_05_NPU_NOT_PB, reg, 1);
+ break;
+ }
+ xscom_write(gcid, PU_IOE_PB_MISC_CFG, reg);
+}
+
+static void assert_odl_reset(uint32_t gcid, int index)
+{
+ uint64_t reg, config_xscom;
+
+ config_xscom = OB_ODL_CONFIG(index);
+ /* Reset ODL */
+ reg = OB_ODL_CONFIG_RESET;
+ reg = SETFIELD(OB_ODL_CONFIG_VERSION, reg, 0b000001);
+ reg = SETFIELD(OB_ODL_CONFIG_TRAIN_MODE, reg, 0b0110);
+ reg = SETFIELD(OB_ODL_CONFIG_SUPPORTED_MODES, reg, 0b0010);
+ reg |= OB_ODL_CONFIG_X4_BACKOFF_ENABLE;
+ reg = SETFIELD(OB_ODL_CONFIG_PHY_CNTR_LIMIT, reg, 0b1111);
+ reg |= OB_ODL_CONFIG_DEBUG_ENABLE;
+ reg = SETFIELD(OB_ODL_CONFIG_FWD_PROGRESS_TIMER, reg, 0b0110);
+ xscom_write(gcid, config_xscom, reg);
+}
+
+static void deassert_odl_reset(uint32_t gcid, int index)
+{
+ uint64_t reg, config_xscom;
+
+ config_xscom = OB_ODL_CONFIG(index);
+ xscom_read(gcid, config_xscom, &reg);
+ reg &= ~OB_ODL_CONFIG_RESET;
+ xscom_write(gcid, config_xscom, reg);
+}
+
+static void enable_odl_phy_mux(uint32_t gcid, int index)
+{
+ uint64_t reg;
+ uint64_t phy_config_scom;
+ prlog(PR_DEBUG, "OCAPI: %s: Enabling ODL to PHY MUXes\n", __func__);
+ /* Step 2 - Enable MUXes for ODL to PHY connection */
+ switch (index) {
+ case 2:
+ case 3:
+ phy_config_scom = OBUS_LL0_IOOL_PHY_CONFIG;
+ break;
+ case 4:
+ case 5:
+ phy_config_scom = OBUS_LL3_IOOL_PHY_CONFIG;
+ break;
+ default:
+ assert(false);
+ }
+
+ /*
+ * ODL must be in reset when enabling.
+ * It stays in reset until the link is trained
+ */
+ assert_odl_reset(gcid, index);
+
+ /* PowerBus OLL PHY Training Config Register */
+ xscom_read(gcid, phy_config_scom, &reg);
+
+ /*
+ * Enable ODL to use shared PHYs
+ *
+ * On obus3, OTL0 is connected to ODL1 (and OTL1 to ODL0), so
+ * even though it may look odd at first, we do want to enable ODL0
+ * for links 2 and 5.
+ */
+ switch (index) {
+ case 2:
+ case 5:
+ reg |= OBUS_IOOL_PHY_CONFIG_ODL0_ENABLED;
+ break;
+ case 3:
+ case 4:
+ reg |= OBUS_IOOL_PHY_CONFIG_ODL1_ENABLED;
+ break;
+ }
+
+ /*
+ * Based on the platform, we may have to activate an extra mux
+ * to connect the ODL to the right set of lanes.
+ *
+ * FIXME: to be checked once we have merged with nvlink
+ * code. Need to verify that it's a platform parameter and not
+ * slot-dependent
+ */
+ if (platform.ocapi->odl_phy_swap)
+ reg |= OBUS_IOOL_PHY_CONFIG_ODL_PHY_SWAP;
+ else
+ reg &= ~OBUS_IOOL_PHY_CONFIG_ODL_PHY_SWAP;
+
+ /* Disable A-Link link layers */
+ reg &= ~OBUS_IOOL_PHY_CONFIG_LINK0_OLL_ENABLED;
+ reg &= ~OBUS_IOOL_PHY_CONFIG_LINK1_OLL_ENABLED;
+
+ xscom_write(gcid, phy_config_scom, reg);
+}
+
+static void disable_alink_fp(uint32_t gcid)
+{
+ uint64_t reg = 0;
+
+ prlog(PR_DEBUG, "OCAPI: %s: Disabling A-Link framer/parsers\n", __func__);
+ /* Step 3 - Disable A-Link framers/parsers */
+ /* TODO: Confirm if needed on OPAL system */
+
+ reg |= PU_IOE_PB_FP_CFG_FP0_FMR_DISABLE;
+ reg |= PU_IOE_PB_FP_CFG_FP0_PRS_DISABLE;
+ reg |= PU_IOE_PB_FP_CFG_FP1_FMR_DISABLE;
+ reg |= PU_IOE_PB_FP_CFG_FP1_PRS_DISABLE;
+ xscom_write(gcid, PU_IOE_PB_FP01_CFG, reg);
+ xscom_write(gcid, PU_IOE_PB_FP23_CFG, reg);
+ xscom_write(gcid, PU_IOE_PB_FP45_CFG, reg);
+ xscom_write(gcid, PU_IOE_PB_FP67_CFG, reg);
+}
+
+static void enable_xsl_clocks(uint32_t gcid, uint32_t scom_base, int index)
+{
+ /* Step 5 - Enable Clocks in XSL */
+
+ prlog(PR_DEBUG, "OCAPI: %s: Enable clocks in XSL\n", __func__);
+
+ npu2_scom_write(gcid, scom_base, NPU2_REG_OFFSET(index_to_stack(index),
+ NPU2_BLOCK_XSL,
+ NPU2_XSL_WRAP_CFG),
+ NPU2_MISC_DA_LEN_8B, NPU2_XSL_WRAP_CFG_XSLO_CLOCK_ENABLE);
+}
+
+#define CQ_CTL_STATUS_TIMEOUT 10 /* milliseconds */
+
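+/*
+ * Request a new fence state for a brick and wait for the CQ_CTL
+ * status register to report it, e.g. 0b11 fences the brick ("NPU
+ * Fenced"), 0b10 half-fences it and 0b00 lifts the fence (see
+ * set_npcq_config(), fence_brick() and unfence_brick()).
+ */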
+static int set_fence_control(uint32_t gcid, uint32_t scom_base,
+ int index, uint8_t status)
+{
+ int stack, block;
+ uint64_t reg, status_field;
+ uint8_t status_val;
+ uint64_t fence_control;
+ uint64_t timeout = mftb() + msecs_to_tb(CQ_CTL_STATUS_TIMEOUT);
+
+ stack = index_to_stack(index);
+ block = index_to_block(index);
+
+ fence_control = NPU2_REG_OFFSET(stack, NPU2_BLOCK_CTL,
+ block == NPU2_BLOCK_OTL0 ?
+ NPU2_CQ_CTL_FENCE_CONTROL_0 :
+ NPU2_CQ_CTL_FENCE_CONTROL_1);
+
+ reg = SETFIELD(NPU2_CQ_CTL_FENCE_CONTROL_REQUEST_FENCE, 0ull, status);
+ npu2_scom_write(gcid, scom_base, fence_control,
+ NPU2_MISC_DA_LEN_8B, reg);
+
+ /* Wait for fence status to update */
+ if (index_to_block(index) == NPU2_BLOCK_OTL0)
+ status_field = NPU2_CQ_CTL_STATUS_BRK0_AM_FENCED;
+ else
+ status_field = NPU2_CQ_CTL_STATUS_BRK1_AM_FENCED;
+
+ do {
+ reg = npu2_scom_read(gcid, scom_base,
+ NPU2_REG_OFFSET(index_to_stack(index),
+ NPU2_BLOCK_CTL,
+ NPU2_CQ_CTL_STATUS),
+ NPU2_MISC_DA_LEN_8B);
+ status_val = GETFIELD(status_field, reg);
+ if (status_val == status)
+ return OPAL_SUCCESS;
+ time_wait_ms(1);
+ } while (tb_compare(mftb(), timeout) == TB_ABEFOREB);
+
+ /**
+ * @fwts-label OCAPIFenceStatusTimeout
+ * @fwts-advice The NPU fence status did not update as expected. This
+ * could be the result of a firmware or hardware bug. OpenCAPI
+ * functionality could be broken.
+ */
+ prlog(PR_ERR,
+ "OCAPI: Fence status for brick %d stuck: expected 0x%x, got 0x%x\n",
+ index, status, status_val);
+ return OPAL_HARDWARE;
+}
+
+static void set_npcq_config(uint32_t gcid, uint32_t scom_base, int index)
+{
+ uint64_t reg, stack, block;
+
+ prlog(PR_DEBUG, "OCAPI: %s: Set NPCQ Config\n", __func__);
+ /* Step 6 - Set NPCQ configuration */
+ /* CQ_CTL Misc Config Register #0 */
+ stack = index_to_stack(index);
+ block = index_to_block(index);
+
+ /* Enable OTL */
+ npu2_scom_write(gcid, scom_base, NPU2_OTL_CONFIG0(stack, block),
+ NPU2_MISC_DA_LEN_8B, NPU2_OTL_CONFIG0_EN);
+ set_fence_control(gcid, scom_base, index, 0b01);
+ reg = npu2_scom_read(gcid, scom_base,
+ NPU2_REG_OFFSET(stack, NPU2_BLOCK_CTL,
+ NPU2_CQ_CTL_MISC_CFG),
+ NPU2_MISC_DA_LEN_8B);
+ /* Set OCAPI mode */
+ reg |= NPU2_CQ_CTL_MISC_CFG_CONFIG_OCAPI_MODE;
+ if (block == NPU2_BLOCK_OTL0)
+ reg |= NPU2_CQ_CTL_MISC_CFG_CONFIG_OTL0_ENABLE;
+ else
+ reg |= NPU2_CQ_CTL_MISC_CFG_CONFIG_OTL1_ENABLE;
+ npu2_scom_write(gcid, scom_base,
+ NPU2_REG_OFFSET(stack, NPU2_BLOCK_CTL,
+ NPU2_CQ_CTL_MISC_CFG),
+ NPU2_MISC_DA_LEN_8B, reg);
+
+ /* NPU Fenced */
+ set_fence_control(gcid, scom_base, index, 0b11);
+
+ /* NPU Half Fenced */
+ set_fence_control(gcid, scom_base, index, 0b10);
+
+ /* CQ_DAT Misc Config Register #1 */
+ reg = npu2_scom_read(gcid, scom_base,
+ NPU2_REG_OFFSET(stack, NPU2_BLOCK_DAT,
+ NPU2_CQ_DAT_MISC_CFG),
+ NPU2_MISC_DA_LEN_8B);
+ /* Set OCAPI mode for bricks 2-5 */
+ reg |= NPU2_CQ_DAT_MISC_CFG_CONFIG_OCAPI_MODE;
+ npu2_scom_write(gcid, scom_base,
+ NPU2_REG_OFFSET(stack, NPU2_BLOCK_DAT,
+ NPU2_CQ_DAT_MISC_CFG),
+ NPU2_MISC_DA_LEN_8B, reg);
+
+ /* CQ_SM Misc Config Register #0 */
+ for (block = NPU2_BLOCK_SM_0; block <= NPU2_BLOCK_SM_3; block++) {
+ reg = npu2_scom_read(gcid, scom_base,
+ NPU2_REG_OFFSET(stack, block,
+ NPU2_CQ_SM_MISC_CFG0),
+ NPU2_MISC_DA_LEN_8B);
+ /* Set OCAPI mode for bricks 2-5 */
+ reg |= NPU2_CQ_SM_MISC_CFG0_CONFIG_OCAPI_MODE;
+ npu2_scom_write(gcid, scom_base,
+ NPU2_REG_OFFSET(stack, block,
+ NPU2_CQ_SM_MISC_CFG0),
+ NPU2_MISC_DA_LEN_8B, reg);
+ }
+}
+
+static void enable_xsl_xts_interfaces(uint32_t gcid, uint32_t scom_base, int index)
+{
+ uint64_t reg;
+
+ prlog(PR_DEBUG, "OCAPI: %s: Enable XSL-XTS Interfaces\n", __func__);
+ /* Step 7 - Enable XSL-XTS interfaces */
+ /* XTS Config Register - Enable XSL-XTS interface */
+ reg = npu2_scom_read(gcid, scom_base, NPU2_XTS_CFG, NPU2_MISC_DA_LEN_8B);
+ reg |= NPU2_XTS_CFG_OPENCAPI;
+ npu2_scom_write(gcid, scom_base, NPU2_XTS_CFG, NPU2_MISC_DA_LEN_8B, reg);
+
+ /* XTS Config2 Register - Enable XSL1/2 */
+ reg = npu2_scom_read(gcid, scom_base, NPU2_XTS_CFG2, NPU2_MISC_DA_LEN_8B);
+ switch (index_to_stack(index)) {
+ case NPU2_STACK_STCK_1:
+ reg |= NPU2_XTS_CFG2_XSL1_ENA;
+ break;
+ case NPU2_STACK_STCK_2:
+ reg |= NPU2_XTS_CFG2_XSL2_ENA;
+ break;
+ }
+ npu2_scom_write(gcid, scom_base, NPU2_XTS_CFG2, NPU2_MISC_DA_LEN_8B, reg);
+}
+
+static void enable_sm_allocation(uint32_t gcid, uint32_t scom_base, int index)
+{
+ uint64_t reg, block;
+ int stack = index_to_stack(index);
+
+ prlog(PR_DEBUG, "OCAPI: %s: Enable State Machine Allocation\n", __func__);
+ /* Step 8 - Enable state-machine allocation */
+ /* Low-Water Marks Registers - Enable state machine allocation */
+ for (block = NPU2_BLOCK_SM_0; block <= NPU2_BLOCK_SM_3; block++) {
+ reg = npu2_scom_read(gcid, scom_base,
+ NPU2_REG_OFFSET(stack, block,
+ NPU2_LOW_WATER_MARKS),
+ NPU2_MISC_DA_LEN_8B);
+ reg |= NPU2_LOW_WATER_MARKS_ENABLE_MACHINE_ALLOC;
+ npu2_scom_write(gcid, scom_base,
+ NPU2_REG_OFFSET(stack, block,
+ NPU2_LOW_WATER_MARKS),
+ NPU2_MISC_DA_LEN_8B, reg);
+ }
+}
+
+static void enable_pb_snooping(uint32_t gcid, uint32_t scom_base, int index)
+{
+ uint64_t reg, block;
+ int stack = index_to_stack(index);
+
+ prlog(PR_DEBUG, "OCAPI: %s: Enable PowerBus snooping\n", __func__);
+ /* Step 9 - Enable PowerBus snooping */
+ /* CQ_SM Misc Config Register #0 - Enable PowerBus snooping */
+ for (block = NPU2_BLOCK_SM_0; block <= NPU2_BLOCK_SM_3; block++) {
+ reg = npu2_scom_read(gcid, scom_base,
+ NPU2_REG_OFFSET(stack, block,
+ NPU2_CQ_SM_MISC_CFG0),
+ NPU2_MISC_DA_LEN_8B);
+ reg |= NPU2_CQ_SM_MISC_CFG0_CONFIG_ENABLE_PBUS;
+ npu2_scom_write(gcid, scom_base,
+ NPU2_REG_OFFSET(stack, block,
+ NPU2_CQ_SM_MISC_CFG0),
+ NPU2_MISC_DA_LEN_8B, reg);
+ }
+}
+
+static void brick_config(uint32_t gcid, uint32_t scom_base, int index)
+{
+ /*
+ * We assume at this point that the PowerBus Hotplug Mode Control
+ * register is correctly set by Hostboot
+ */
+ disable_nvlink(gcid, index);
+ set_transport_mux_controls(gcid, scom_base, index,
+ NPU2_DEV_TYPE_OPENCAPI);
+ enable_odl_phy_mux(gcid, index);
+ disable_alink_fp(gcid);
+ enable_xsl_clocks(gcid, scom_base, index);
+ set_npcq_config(gcid, scom_base, index);
+ enable_xsl_xts_interfaces(gcid, scom_base, index);
+ enable_sm_allocation(gcid, scom_base, index);
+ enable_pb_snooping(gcid, scom_base, index);
+}
+
+/* Procedure 13.1.3.4 - Brick to PE Mapping */
+static void pe_config(struct npu2_dev *dev)
+{
+ /* We currently use a fixed PE assignment per brick */
+ uint64_t val, reg;
+ val = NPU2_MISC_BRICK_BDF2PE_MAP_ENABLE;
+ val = SETFIELD(NPU2_MISC_BRICK_BDF2PE_MAP_PE, val, NPU2_OCAPI_PE(dev));
+ val = SETFIELD(NPU2_MISC_BRICK_BDF2PE_MAP_BDF, val, 0);
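+ /*
+ * 0x18 below is the per-brick stride between BDF2PE map register
+ * sets in the MISC block (presumably three 8-byte MAP registers
+ * per brick).
+ */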
+ reg = NPU2_REG_OFFSET(NPU2_STACK_MISC, NPU2_BLOCK_MISC,
+ NPU2_MISC_BRICK0_BDF2PE_MAP0 +
+ (dev->brick_index * 0x18));
+ npu2_write(dev->npu, reg, val);
+}
+
+/* Procedure 13.1.3.5 - TL Configuration */
+static void tl_config(uint32_t gcid, uint32_t scom_base, uint64_t index)
+{
+ uint64_t reg;
+ uint64_t stack = index_to_stack(index);
+ uint64_t block = index_to_block(index);
+
+ prlog(PR_DEBUG, "OCAPI: %s: TL Configuration\n", __func__);
+ /* OTL Config 0 Register */
+ reg = 0;
+ /* OTL Enable */
+ reg |= NPU2_OTL_CONFIG0_EN;
+ /* Block PE Handle from ERAT Index */
+ reg |= NPU2_OTL_CONFIG0_BLOCK_PE_HANDLE;
+ /* OTL Brick ID */
+ reg = SETFIELD(NPU2_OTL_CONFIG0_BRICKID, reg, index - 2);
+ /* ERAT Hash 0 */
+ reg = SETFIELD(NPU2_OTL_CONFIG0_ERAT_HASH_0, reg, 0b011001);
+ /* ERAT Hash 1 */
+ reg = SETFIELD(NPU2_OTL_CONFIG0_ERAT_HASH_1, reg, 0b000111);
+ /* ERAT Hash 2 */
+ reg = SETFIELD(NPU2_OTL_CONFIG0_ERAT_HASH_2, reg, 0b101100);
+ /* ERAT Hash 3 */
+ reg = SETFIELD(NPU2_OTL_CONFIG0_ERAT_HASH_3, reg, 0b100110);
+ npu2_scom_write(gcid, scom_base, NPU2_OTL_CONFIG0(stack, block),
+ NPU2_MISC_DA_LEN_8B, reg);
+
+ /* OTL Config 1 Register */
+ reg = 0;
+ /*
+ * We leave Template 1-3 bits at 0 to force template 0 as required
+ * for unknown devices.
+ *
+ * The Template 0 transmit rate is set to the most conservative
+ * setting, which will always be supported. The other template
+ * transmit rates are left unset and will be set later by the OS.
+ */
+ reg = SETFIELD(NPU2_OTL_CONFIG1_TX_TEMP0_RATE, reg, 0b1111);
+ /* Extra wait cycles TXI-TXO */
+ reg = SETFIELD(NPU2_OTL_CONFIG1_TX_DRDY_WAIT, reg, 0b001);
+ /* Minimum Frequency to Return TLX Credits to AFU */
+ reg = SETFIELD(NPU2_OTL_CONFIG1_TX_CRET_FREQ, reg, 0b001);
+ /* Frequency to add age to Transmit Requests */
+ reg = SETFIELD(NPU2_OTL_CONFIG1_TX_AGE_FREQ, reg, 0b11000);
+ /* Response High Priority Threshold */
+ reg = SETFIELD(NPU2_OTL_CONFIG1_TX_RS2_HPWAIT, reg, 0b011011);
+ /* 4-slot Request High Priority Threshold */
+ reg = SETFIELD(NPU2_OTL_CONFIG1_TX_RQ4_HPWAIT, reg, 0b011011);
+ /* 6-slot Request High Priority */
+ reg = SETFIELD(NPU2_OTL_CONFIG1_TX_RQ6_HPWAIT, reg, 0b011011);
+ /* Stop the OCAPI Link on Uncorrectable Error
+ * TODO: Confirm final value - disabled for debug */
+
+ npu2_scom_write(gcid, scom_base, NPU2_OTL_CONFIG1(stack, block),
+ NPU2_MISC_DA_LEN_8B, reg);
+
+ /* TLX Credit Configuration Register */
+ reg = 0;
+ /* VC0/VC3/DCP0/DCP1 credits to send to AFU */
+ reg = SETFIELD(NPU2_OTL_TLX_CREDITS_VC0_CREDITS, reg, 0x40);
+ reg = SETFIELD(NPU2_OTL_TLX_CREDITS_VC3_CREDITS, reg, 0x40);
+ reg = SETFIELD(NPU2_OTL_TLX_CREDITS_DCP0_CREDITS, reg, 0x80);
+ reg = SETFIELD(NPU2_OTL_TLX_CREDITS_DCP1_CREDITS, reg, 0x80);
+ npu2_scom_write(gcid, scom_base, NPU2_OTL_TLX_CREDITS(stack, block),
+ NPU2_MISC_DA_LEN_8B, reg);
+}
+
+/* Detect Nimbus DD2.0 and DD2.01 */
+static int get_nimbus_level(void)
+{
+ struct proc_chip *chip = next_chip(NULL);
+
+ if (chip && chip->type == PROC_CHIP_P9_NIMBUS)
+ return chip->ec_level & 0xff;
+ return -1;
+}
+
+/* Procedure 13.1.3.6 - Address Translation Configuration */
+static void address_translation_config(uint32_t gcid, uint32_t scom_base,
+ uint64_t index)
+{
+ int chip_level;
+ uint64_t reg;
+ uint64_t stack = index_to_stack(index);
+
+ prlog(PR_DEBUG, "OCAPI: %s: Address Translation Configuration\n", __func__);
+ /* PSL_SCNTL_A0 Register */
+ /*
+ * ERAT shared between multiple AFUs
+ *
+ * The workbook has this bit the wrong way around compared to the hardware.
+ *
+ * TODO: handle correctly with link ganging
+ */
+ reg = npu2_scom_read(gcid, scom_base,
+ NPU2_REG_OFFSET(stack, NPU2_BLOCK_XSL,
+ NPU2_XSL_PSL_SCNTL_A0),
+ NPU2_MISC_DA_LEN_8B);
+ reg |= NPU2_XSL_PSL_SCNTL_A0_MULTI_AFU_DIAL;
+ npu2_scom_write(gcid, scom_base,
+ NPU2_REG_OFFSET(stack, NPU2_BLOCK_XSL,
+ NPU2_XSL_PSL_SCNTL_A0),
+ NPU2_MISC_DA_LEN_8B, reg);
+
+ chip_level = get_nimbus_level();
+ if (chip_level == 0x20) {
+ /*
+ * Errata HW408041 (section 15.1.10 of NPU workbook)
+ * "RA mismatch when both tlbie and checkout response
+ * are seen in same cycle"
+ */
+ /* XSL_GP Register - Bloom Filter Disable */
+ reg = npu2_scom_read(gcid, scom_base,
+ NPU2_REG_OFFSET(stack, NPU2_BLOCK_XSL, NPU2_XSL_GP),
+ NPU2_MISC_DA_LEN_8B);
+ /* To update XSL_GP, we must first write a magic value to it */
+ npu2_scom_write(gcid, scom_base,
+ NPU2_REG_OFFSET(stack, NPU2_BLOCK_XSL, NPU2_XSL_GP),
+ NPU2_MISC_DA_LEN_8B, 0x0523790323000000UL);
+ reg &= ~NPU2_XSL_GP_BLOOM_FILTER_ENABLE;
+ npu2_scom_write(gcid, scom_base,
+ NPU2_REG_OFFSET(stack, NPU2_BLOCK_XSL, NPU2_XSL_GP),
+ NPU2_MISC_DA_LEN_8B, reg);
+ }
+
+ if (chip_level == 0x20 || chip_level == 0x21) {
+ /*
+ * DD2.0/2.1 EOA Bug. Fixed in DD2.2
+ */
+ reg = 0x32F8000000000001UL;
+ npu2_scom_write(gcid, scom_base,
+ NPU2_REG_OFFSET(stack, NPU2_BLOCK_XSL,
+ NPU2_XSL_DEF),
+ NPU2_MISC_DA_LEN_8B, reg);
+ }
+}
+
+/* TODO: Merge this with NVLink implementation - we don't use the npu2_bar
+ * wrapper for the PHY BARs yet */
+static void write_bar(uint32_t gcid, uint32_t scom_base, uint64_t reg,
+ uint64_t addr, uint64_t size)
+{
+ uint64_t val;
+ int block;
+ switch (NPU2_REG(reg)) {
+ case NPU2_PHY_BAR:
+ val = SETFIELD(NPU2_PHY_BAR_ADDR, 0ul, addr >> 21);
+ val = SETFIELD(NPU2_PHY_BAR_ENABLE, val, 1);
+ break;
+ case NPU2_NTL0_BAR:
+ case NPU2_NTL1_BAR:
+ val = SETFIELD(NPU2_NTL_BAR_ADDR, 0ul, addr >> 16);
+ val = SETFIELD(NPU2_NTL_BAR_SIZE, val, ilog2(size >> 16));
+ val = SETFIELD(NPU2_NTL_BAR_ENABLE, val, 1);
+ break;
+ case NPU2_GENID_BAR:
+ val = SETFIELD(NPU2_GENID_BAR_ADDR, 0ul, addr >> 16);
+ val = SETFIELD(NPU2_GENID_BAR_ENABLE, val, 1);
+ break;
+ default:
+ val = 0ul;
+ }
+
+ for (block = NPU2_BLOCK_SM_0; block <= NPU2_BLOCK_SM_3; block++) {
+ npu2_scom_write(gcid, scom_base, NPU2_REG_OFFSET(0, block, reg),
+ NPU2_MISC_DA_LEN_8B, val);
+ prlog(PR_DEBUG, "OCAPI: Setting BAR %llx to %llx\n",
+ NPU2_REG_OFFSET(0, block, reg), val);
+ }
+}
+
+static void setup_global_mmio_bar(uint32_t gcid, uint32_t scom_base,
+ uint64_t reg[])
+{
+ uint64_t addr, size;
+
+ prlog(PR_DEBUG, "OCAPI: patching up PHY0 bar, %s\n", __func__);
+ phys_map_get(gcid, NPU_PHY, 0, &addr, &size);
+ write_bar(gcid, scom_base,
+ NPU2_REG_OFFSET(NPU2_STACK_STCK_2, 0, NPU2_PHY_BAR),
+ addr, size);
+ prlog(PR_DEBUG, "OCAPI: patching up PHY1 bar, %s\n", __func__);
+ phys_map_get(gcid, NPU_PHY, 1, &addr, &size);
+ write_bar(gcid, scom_base,
+ NPU2_REG_OFFSET(NPU2_STACK_STCK_1, 0, NPU2_PHY_BAR),
+ addr, size);
+
+ prlog(PR_DEBUG, "OCAPI: setup global mmio, %s\n", __func__);
+ phys_map_get(gcid, NPU_REGS, 0, &addr, &size);
+ write_bar(gcid, scom_base,
+ NPU2_REG_OFFSET(NPU2_STACK_STCK_0, 0, NPU2_PHY_BAR),
+ addr, size);
+ reg[0] = addr;
+ reg[1] = size;
+}
+
+/* Procedure 13.1.3.8 - AFU MMIO Range BARs */
+static void setup_afu_mmio_bars(uint32_t gcid, uint32_t scom_base,
+ struct npu2_dev *dev)
+{
+ uint64_t stack = index_to_stack(dev->brick_index);
+ uint64_t offset = index_to_block(dev->brick_index) == NPU2_BLOCK_OTL0 ?
+ NPU2_NTL0_BAR : NPU2_NTL1_BAR;
+ uint64_t pa_offset = index_to_block(dev->brick_index) == NPU2_BLOCK_OTL0 ?
+ NPU2_CQ_CTL_MISC_MMIOPA0_CONFIG :
+ NPU2_CQ_CTL_MISC_MMIOPA1_CONFIG;
+ uint64_t addr, size, reg;
+
+ prlog(PR_DEBUG, "OCAPI: %s: Setup AFU MMIO BARs\n", __func__);
+ phys_map_get(gcid, NPU_OCAPI_MMIO, dev->brick_index, &addr, &size);
+
+ prlog(PR_DEBUG, "OCAPI: AFU MMIO set to %llx, size %llx\n", addr, size);
+ write_bar(gcid, scom_base, NPU2_REG_OFFSET(stack, 0, offset), addr,
+ size);
+ dev->bars[0].npu2_bar.base = addr;
+ dev->bars[0].npu2_bar.size = size;
+
+ reg = SETFIELD(NPU2_CQ_CTL_MISC_MMIOPA_ADDR, 0ull, addr >> 16);
+ reg = SETFIELD(NPU2_CQ_CTL_MISC_MMIOPA_SIZE, reg, ilog2(size >> 16));
+ prlog(PR_DEBUG, "OCAPI: PA translation %llx\n", reg);
+ npu2_scom_write(gcid, scom_base,
+ NPU2_REG_OFFSET(stack, NPU2_BLOCK_CTL,
+ pa_offset),
+ NPU2_MISC_DA_LEN_8B, reg);
+}
+
+/* Procedure 13.1.3.9 - AFU Config BARs */
+static void setup_afu_config_bars(uint32_t gcid, uint32_t scom_base,
+ struct npu2_dev *dev)
+{
+ uint64_t stack = index_to_stack(dev->brick_index);
+ int stack_num = stack - NPU2_STACK_STCK_0;
+ uint64_t addr, size;
+
+ prlog(PR_DEBUG, "OCAPI: %s: Setup AFU Config BARs\n", __func__);
+ phys_map_get(gcid, NPU_GENID, stack_num, &addr, &size);
+ prlog(PR_DEBUG, "OCAPI: Assigning GENID BAR: %016llx\n", addr);
+ write_bar(gcid, scom_base, NPU2_REG_OFFSET(stack, 0, NPU2_GENID_BAR),
+ addr, size);
+ dev->bars[1].npu2_bar.base = addr;
+ dev->bars[1].npu2_bar.size = size;
+}
+
+static void otl_enabletx(uint32_t gcid, uint32_t scom_base,
+ struct npu2_dev *dev)
+{
+ uint64_t stack = index_to_stack(dev->brick_index);
+ uint64_t block = index_to_block(dev->brick_index);
+ uint64_t reg;
+
+ /* OTL Config 2 Register */
+ /* Transmit Enable */
+ OCAPIDBG(dev, "Enabling TX\n");
+ reg = 0;
+ reg |= NPU2_OTL_CONFIG2_TX_SEND_EN;
+ npu2_scom_write(gcid, scom_base, NPU2_OTL_CONFIG2(stack, block),
+ NPU2_MISC_DA_LEN_8B, reg);
+
+ reg = npu2_scom_read(gcid, scom_base, NPU2_OTL_VC_CREDITS(stack, block),
+ NPU2_MISC_DA_LEN_8B);
+ OCAPIDBG(dev, "credit counter: %llx\n", reg);
+ /* TODO: Abort if credits are zero */
+}
+
+static uint8_t get_reset_pin(struct npu2_dev *dev)
+{
+ uint8_t pin;
+
+ switch (dev->brick_index) {
+ case 2:
+ pin = platform.ocapi->i2c_reset_brick2;
+ break;
+ case 3:
+ pin = platform.ocapi->i2c_reset_brick3;
+ break;
+ case 4:
+ pin = platform.ocapi->i2c_reset_brick4;
+ break;
+ case 5:
+ pin = platform.ocapi->i2c_reset_brick5;
+ break;
+ default:
+ assert(false);
+ }
+ return pin;
+}
+
+static void assert_adapter_reset(struct npu2_dev *dev)
+{
+ uint8_t pin, data;
+ int rc;
+
+ pin = get_reset_pin(dev);
+ /*
+ * Set the i2c reset pin to output mode.
+ *
+ * On the 9554 device, register 3 is the configuration
+ * register and a pin is in output mode when its bit is 0.
+ */
+ lock(&dev->npu->i2c_lock);
+ dev->npu->i2c_pin_mode &= ~pin;
+ data = dev->npu->i2c_pin_mode;
+
+ rc = i2c_request_send(dev->npu->i2c_port_id_ocapi,
+ platform.ocapi->i2c_reset_addr, SMBUS_WRITE,
+ 0x3, 1,
+ &data, sizeof(data), 120);
+ if (rc)
+ goto err;
+
+ /* register 1 controls the signal, reset is active low */
+ dev->npu->i2c_pin_wr_state &= ~pin;
+ data = dev->npu->i2c_pin_wr_state;
+
+ rc = i2c_request_send(dev->npu->i2c_port_id_ocapi,
+ platform.ocapi->i2c_reset_addr, SMBUS_WRITE,
+ 0x1, 1,
+ &data, sizeof(data), 120);
+ if (rc)
+ goto err;
+ unlock(&dev->npu->i2c_lock);
+ return;
+
+err:
+ unlock(&dev->npu->i2c_lock);
+ /**
+ * @fwts-label OCAPIDeviceResetFailed
+ * @fwts-advice There was an error attempting to send
+ * a reset signal over I2C to the OpenCAPI device.
+ */
+ OCAPIERR(dev, "Error writing I2C reset signal: %d\n", rc);
+}
+
+static void deassert_adapter_reset(struct npu2_dev *dev)
+{
+ uint8_t pin, data;
+ int rc, rc2;
+
+ pin = get_reset_pin(dev);
+
+ /*
+ * All we need to do here is deassert the reset signal by
+ * setting the reset pin to high. However, we cannot leave the
+ * pin in output mode, as it can cause trouble with the
+ * opencapi adapter: when the slot is powered off (on a reboot,
+ * for example), if the i2c controller is actively driving the
+ * reset signal high, it maintains voltage on part of the
+ * fpga and can leak current. This can leave the fpga in an
+ * unspecified state and potentially cause damage.
+ *
+ * The workaround is to set the pin back to input
+ * mode. There are pull-up resistors on the planar on all
+ * platforms to make sure the signal will "naturally" be high,
+ * without the i2c controller actively driving it, so we won't
+ * have problems when the slot is powered off. And it takes
+ * the adapter out of reset.
+ *
+ * To summarize:
+ * 1. set the pin to input mode. That alone is enough to raise
+ * the signal
+ * 2. set the value of the pin to high. The pin is in input
+ * mode, so this has no immediate effect, but it is more
+ * consistent and avoids surprises on the next call to
+ * assert_adapter_reset()
+ */
+ lock(&dev->npu->i2c_lock);
+ dev->npu->i2c_pin_mode |= pin;
+ data = dev->npu->i2c_pin_mode;
+
+ rc = i2c_request_send(dev->npu->i2c_port_id_ocapi,
+ platform.ocapi->i2c_reset_addr, SMBUS_WRITE,
+ 0x3, 1,
+ &data, sizeof(data), 120);
+
+ dev->npu->i2c_pin_wr_state |= pin;
+ data = dev->npu->i2c_pin_wr_state;
+ rc2 = i2c_request_send(dev->npu->i2c_port_id_ocapi,
+ platform.ocapi->i2c_reset_addr, SMBUS_WRITE,
+ 0x1, 1,
+ &data, sizeof(data), 120);
+ unlock(&dev->npu->i2c_lock);
+ if (!rc)
+ rc = rc2;
+ if (rc) {
+ /**
+ * @fwts-label OCAPIDeviceResetFailed
+ * @fwts-advice There was an error attempting to send
+ * a reset signal over I2C to the OpenCAPI device.
+ */
+ OCAPIERR(dev, "Error writing I2C reset signal: %d\n", rc);
+ }
+}
+
+static void setup_perf_counters(struct npu2_dev *dev)
+{
+ uint64_t addr, reg, link;
+
+ /*
+ * setup the DLL perf counters to check CRC errors detected by
+ * the NPU or the adapter.
+ *
+ * Counter 0: link 0/ODL0, CRC error detected by ODL
+ * Counter 1: link 0/ODL0, CRC error detected by DLx
+ * Counter 2: link 1/ODL1, CRC error detected by ODL
+ * Counter 3: link 1/ODL1, CRC error detected by DLx
+ */
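+ /*
+ * Bricks 2 and 5 are wired to ODL0 and bricks 3 and 4 to ODL1
+ * (see enable_odl_phy_mux()), hence the link selection below.
+ */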
+ if ((dev->brick_index == 2) || (dev->brick_index == 5))
+ link = 0;
+ else
+ link = 1;
+
+ addr = OB_DLL_PERF_MONITOR_CONFIG(dev->brick_index);
+ xscom_read(dev->npu->chip_id, addr, &reg);
+ if (link == 0) {
+ reg = SETFIELD(OB_DLL_PERF_MONITOR_CONFIG_ENABLE, reg,
+ OB_DLL_PERF_MONITOR_CONFIG_LINK0);
+ reg = SETFIELD(OB_DLL_PERF_MONITOR_CONFIG_ENABLE >> 2, reg,
+ OB_DLL_PERF_MONITOR_CONFIG_LINK0);
+ } else {
+ reg = SETFIELD(OB_DLL_PERF_MONITOR_CONFIG_ENABLE >> 4, reg,
+ OB_DLL_PERF_MONITOR_CONFIG_LINK1);
+ reg = SETFIELD(OB_DLL_PERF_MONITOR_CONFIG_ENABLE >> 6, reg,
+ OB_DLL_PERF_MONITOR_CONFIG_LINK1);
+ }
+ reg = SETFIELD(OB_DLL_PERF_MONITOR_CONFIG_SIZE, reg,
+ OB_DLL_PERF_MONITOR_CONFIG_SIZE16);
+ xscom_write(dev->npu->chip_id,
+ OB_DLL_PERF_MONITOR_CONFIG(dev->brick_index), reg);
+ OCAPIDBG(dev, "perf counter config %llx = %llx\n", addr, reg);
+
+ addr = OB_DLL_PERF_MONITOR_SELECT(dev->brick_index);
+ xscom_read(dev->npu->chip_id, addr, &reg);
+ reg = SETFIELD(OB_DLL_PERF_MONITOR_SELECT_COUNTER >> (link * 16),
+ reg, OB_DLL_PERF_MONITOR_SELECT_CRC_ODL);
+ reg = SETFIELD(OB_DLL_PERF_MONITOR_SELECT_COUNTER >> ((link * 16) + 8),
+ reg, OB_DLL_PERF_MONITOR_SELECT_CRC_DLX);
+ xscom_write(dev->npu->chip_id, addr, reg);
+ OCAPIDBG(dev, "perf counter select %llx = %llx\n", addr, reg);
+}
+
+static void check_perf_counters(struct npu2_dev *dev)
+{
+ uint64_t addr, reg, link0, link1;
+
+ addr = OB_DLL_PERF_COUNTER0(dev->brick_index);
+ xscom_read(dev->npu->chip_id, addr, &reg);
+ link0 = GETFIELD(PPC_BITMASK(0, 31), reg);
+ link1 = GETFIELD(PPC_BITMASK(32, 63), reg);
+ if (link0 || link1)
+ OCAPIERR(dev, "CRC error count link0=%08llx link1=%08llx\n",
+ link0, link1);
+}
+
+static void set_init_pattern(uint32_t gcid, struct npu2_dev *dev)
+{
+ uint64_t reg, config_xscom;
+
+ config_xscom = OB_ODL_CONFIG(dev->brick_index);
+ /* Transmit Pattern A */
+ xscom_read(gcid, config_xscom, &reg);
+ reg = SETFIELD(OB_ODL_CONFIG_TRAIN_MODE, reg, 0b0001);
+ xscom_write(gcid, config_xscom, reg);
+}
+
+static void start_training(uint32_t gcid, struct npu2_dev *dev)
+{
+ uint64_t reg, config_xscom;
+
+ config_xscom = OB_ODL_CONFIG(dev->brick_index);
+ /* Start training */
+ xscom_read(gcid, config_xscom, &reg);
+ reg = SETFIELD(OB_ODL_CONFIG_TRAIN_MODE, reg, 0b1000);
+ xscom_write(gcid, config_xscom, reg);
+}
+
+static int64_t npu2_opencapi_get_presence_state(struct pci_slot __unused *slot,
+ uint8_t *val)
+{
+ /*
+ * Presence detection for OpenCAPI is currently done at the start of
+ * NPU initialisation, and we only create slots if a device is present.
+ * As such we will never be asked to get the presence of a slot that's
+ * empty.
+ *
+ * This may change if we ever support surprise hotplug down
+ * the track.
+ */
+ *val = OPAL_PCI_SLOT_PRESENT;
+ return OPAL_SUCCESS;
+}
+
+static void fence_brick(struct npu2_dev *dev)
+{
+ OCAPIDBG(dev, "Fencing brick\n");
+ set_fence_control(dev->npu->chip_id, dev->npu->xscom_base,
+ dev->brick_index, 0b11);
+ /* from 13.2.1, Quiesce Fence State */
+ npu2_write(dev->npu, NPU2_MISC_FENCE_STATE,
+ PPC_BIT(dev->brick_index + 6));
+}
+
+static void unfence_brick(struct npu2_dev *dev)
+{
+ OCAPIDBG(dev, "Unfencing brick\n");
+ npu2_write(dev->npu, NPU2_MISC_FENCE_STATE,
+ PPC_BIT(dev->brick_index));
+
+ set_fence_control(dev->npu->chip_id, dev->npu->xscom_base,
+ dev->brick_index, 0b10);
+ set_fence_control(dev->npu->chip_id, dev->npu->xscom_base,
+ dev->brick_index, 0b00);
+}
+
+static enum OpalShpcLinkState get_link_width(uint64_t odl_status)
+{
+ uint64_t tx_lanes, rx_lanes, state;
+
+ /*
+ * On P9, the 'trained mode' field of the ODL status is
+ * hard-coded to x8 and is useless for us. We need to look at
+ * the status of the individual lanes.
+ * The link trains at x8, x4 or not at all.
+ */
+ state = GETFIELD(OB_ODL_STATUS_TRAINING_STATE_MACHINE, odl_status);
+ if (state != OCAPI_LINK_STATE_TRAINED)
+ return OPAL_SHPC_LINK_DOWN;
+
+ rx_lanes = GETFIELD(OB_ODL_STATUS_RX_TRAINED_LANES, odl_status);
+ tx_lanes = GETFIELD(OB_ODL_STATUS_TX_TRAINED_LANES, odl_status);
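+ /* 0xff means all 8 lanes trained in that direction */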
+ if ((rx_lanes != 0xFF) || (tx_lanes != 0xFF))
+ return OPAL_SHPC_LINK_UP_x4;
+ else
+ return OPAL_SHPC_LINK_UP_x8;
+}
+
+static int64_t npu2_opencapi_get_link_state(struct pci_slot *slot, uint8_t *val)
+{
+ struct npu2_dev *dev = phb_to_npu2_dev_ocapi(slot->phb);
+ uint64_t reg;
+
+ reg = get_odl_status(dev->npu->chip_id, dev->brick_index);
+ *val = get_link_width(reg);
+ return OPAL_SUCCESS;
+}
+
+static int64_t npu2_opencapi_get_power_state(struct pci_slot *slot,
+ uint8_t *val)
+{
+ *val = slot->power_state;
+ return OPAL_SUCCESS;
+}
+
+static int64_t npu2_opencapi_set_power_state(struct pci_slot *slot, uint8_t val)
+{
+ struct npu2_dev *dev = phb_to_npu2_dev_ocapi(slot->phb);
+
+ switch (val) {
+ case PCI_SLOT_POWER_OFF:
+ OCAPIDBG(dev, "Fake power off\n");
+ fence_brick(dev);
+ assert_adapter_reset(dev);
+ slot->power_state = PCI_SLOT_POWER_OFF;
+ return OPAL_SUCCESS;
+
+ case PCI_SLOT_POWER_ON:
+ if (slot->power_state != PCI_SLOT_POWER_OFF)
+ return OPAL_SUCCESS;
+ OCAPIDBG(dev, "Fake power on\n");
+ slot->power_state = PCI_SLOT_POWER_ON;
+ slot->state = OCAPI_SLOT_NORMAL;
+ return OPAL_SUCCESS;
+
+ default:
+ return OPAL_UNSUPPORTED;
+ }
+}
+
+static void check_trained_link(struct npu2_dev *dev, uint64_t odl_status)
+{
+ if (get_link_width(odl_status) != OPAL_SHPC_LINK_UP_x8) {
+ OCAPIERR(dev, "Link trained in degraded mode (%016llx)\n",
+ odl_status);
+ OCAPIDBG(dev, "Link endpoint info: %016llx\n",
+ get_odl_endpoint_info(dev->npu->chip_id, dev->brick_index));
+ }
+}
+
+static int64_t npu2_opencapi_retry_state(struct pci_slot *slot,
+ uint64_t odl_status)
+{
+ struct npu2_dev *dev = phb_to_npu2_dev_ocapi(slot->phb);
+ uint32_t chip_id = dev->npu->chip_id;
+
+ if (!slot->link_retries--) {
+ /**
+ * @fwts-label OCAPILinkTrainingFailed
+ * @fwts-advice The OpenCAPI link training procedure failed.
+ * This indicates a hardware or firmware bug. OpenCAPI
+ * functionality will not be available on this link.
+ */
+ OCAPIERR(dev,
+ "Link failed to train, final link status: %016llx\n",
+ odl_status);
+ OCAPIDBG(dev, "Final link training status: %016llx\n",
+ get_odl_training_status(chip_id, dev->brick_index));
+ return OPAL_HARDWARE;
+ }
+
+ OCAPIERR(dev, "Link failed to train, retrying\n");
+ OCAPIDBG(dev, "Link status: %016llx, training status: %016llx\n",
+ odl_status,
+ get_odl_training_status(chip_id, dev->brick_index));
+
+ pci_slot_set_state(slot, OCAPI_SLOT_FRESET_INIT);
+ return pci_slot_set_sm_timeout(slot, msecs_to_tb(1));
+}
+
+static void npu2_opencapi_prepare_link_change(struct pci_slot *slot __unused,
+ bool up __unused)
+{
+ /*
+ * PCI hotplug wants it defined, but we don't need to do anything
+ */
+}
+
+static int64_t npu2_opencapi_poll_link(struct pci_slot *slot)
+{
+ struct npu2_dev *dev = phb_to_npu2_dev_ocapi(slot->phb);
+ uint32_t chip_id = dev->npu->chip_id;
+ uint64_t reg;
+
+ switch (slot->state) {
+ case OCAPI_SLOT_NORMAL:
+ case OCAPI_SLOT_LINK_START:
+ OCAPIDBG(dev, "Start polling\n");
+ pci_slot_set_state(slot, OCAPI_SLOT_LINK_WAIT);
+ /* fall-through */
+ case OCAPI_SLOT_LINK_WAIT:
+ reg = get_odl_status(chip_id, dev->brick_index);
+ if (GETFIELD(OB_ODL_STATUS_TRAINING_STATE_MACHINE, reg) ==
+ OCAPI_LINK_STATE_TRAINED) {
+ OCAPIINF(dev, "link trained in %ld ms\n",
+ tb_to_msecs(mftb() - dev->train_start));
+ check_trained_link(dev, reg);
+ pci_slot_set_state(slot, OCAPI_SLOT_LINK_TRAINED);
+ return pci_slot_set_sm_timeout(slot, msecs_to_tb(1));
+ }
+ if (tb_compare(mftb(), dev->train_timeout) == TB_AAFTERB)
+ return npu2_opencapi_retry_state(slot, reg);
+
+ return pci_slot_set_sm_timeout(slot, msecs_to_tb(1));
+
+ case OCAPI_SLOT_LINK_TRAINED:
+ otl_enabletx(chip_id, dev->npu->xscom_base, dev);
+ pci_slot_set_state(slot, OCAPI_SLOT_NORMAL);
+ if (dev->flags & NPU2_DEV_BROKEN) {
+ OCAPIERR(dev, "Resetting a device which hit a previous error. Device recovery is not supported, so future behavior is undefined\n");
+ dev->flags &= ~NPU2_DEV_BROKEN;
+ }
+ check_perf_counters(dev);
+ dev->phb_ocapi.scan_map = 1;
+ return OPAL_SUCCESS;
+
+ default:
+ OCAPIERR(dev, "unexpected slot state %08x\n", slot->state);
+ }
+ pci_slot_set_state(slot, OCAPI_SLOT_NORMAL);
+ return OPAL_HARDWARE;
+}
+
+static int64_t npu2_opencapi_creset(struct pci_slot *slot)
+{
+ struct npu2_dev *dev = phb_to_npu2_dev_ocapi(slot->phb);
+
+ OCAPIERR(dev, "creset not supported\n");
+ return OPAL_UNSUPPORTED;
+}
+
+static int64_t npu2_opencapi_freset(struct pci_slot *slot)
+{
+ struct npu2_dev *dev = phb_to_npu2_dev_ocapi(slot->phb);
+ uint32_t chip_id = dev->npu->chip_id;
+ uint8_t presence = 1;
+ int rc;
+
+ switch (slot->state) {
+ case OCAPI_SLOT_NORMAL:
+ case OCAPI_SLOT_FRESET_START:
+ OCAPIDBG(dev, "FRESET starts\n");
+
+ if (slot->ops.get_presence_state)
+ slot->ops.get_presence_state(slot, &presence);
+ if (!presence) {
+ /*
+ * FIXME: if there's no card on the link, we
+ * should consider powering off the unused
+ * lanes to save energy
+ */
+ OCAPIINF(dev, "no card detected\n");
+ return OPAL_SUCCESS;
+ }
+ slot->link_retries = OCAPI_LINK_TRAINING_RETRIES;
+ /* fall-through */
+ case OCAPI_SLOT_FRESET_INIT:
+ fence_brick(dev);
+ assert_odl_reset(chip_id, dev->brick_index);
+ assert_adapter_reset(dev);
+ pci_slot_set_state(slot,
+ OCAPI_SLOT_FRESET_ASSERT_DELAY);
+ /* assert for 5ms */
+ return pci_slot_set_sm_timeout(slot, msecs_to_tb(5));
+
+ case OCAPI_SLOT_FRESET_ASSERT_DELAY:
+ rc = npu2_opencapi_phy_reset(dev);
+ if (rc) {
+ OCAPIERR(dev, "FRESET: couldn't reset PHY state\n");
+ return OPAL_HARDWARE;
+ }
+ deassert_odl_reset(chip_id, dev->brick_index);
+ deassert_adapter_reset(dev);
+ pci_slot_set_state(slot,
+ OCAPI_SLOT_FRESET_DEASSERT_DELAY);
+ /* give the device 250ms to be ready */
+ return pci_slot_set_sm_timeout(slot, msecs_to_tb(250));
+
+ case OCAPI_SLOT_FRESET_DEASSERT_DELAY:
+ unfence_brick(dev);
+ set_init_pattern(chip_id, dev);
+ pci_slot_set_state(slot,
+ OCAPI_SLOT_FRESET_INIT_DELAY);
+ return pci_slot_set_sm_timeout(slot, msecs_to_tb(5));
+
+ case OCAPI_SLOT_FRESET_INIT_DELAY:
+ /* Bump lanes - this improves training reliability */
+ npu2_opencapi_bump_ui_lane(dev);
+ start_training(chip_id, dev);
+ dev->train_start = mftb();
+ dev->train_timeout = dev->train_start + msecs_to_tb(OCAPI_LINK_TRAINING_TIMEOUT);
+ pci_slot_set_state(slot, OCAPI_SLOT_LINK_START);
+ return slot->ops.poll_link(slot);
+
+ default:
+ OCAPIERR(dev, "FRESET: unexpected slot state %08x\n",
+ slot->state);
+ }
+ pci_slot_set_state(slot, OCAPI_SLOT_NORMAL);
+ return OPAL_HARDWARE;
+}
+
+static int64_t npu2_opencapi_hreset(struct pci_slot *slot)
+{
+ struct npu2_dev *dev = phb_to_npu2_dev_ocapi(slot->phb);
+
+ OCAPIERR(dev, "hreset not supported\n");
+ return OPAL_UNSUPPORTED;
+}
+
+static void make_slot_hotpluggable(struct pci_slot *slot, struct phb *phb)
+{
+ struct npu2_dev *dev = phb_to_npu2_dev_ocapi(phb);
+ char name[40];
+ const char *label = NULL;
+
+ /*
+ * Add a few definitions to the DT so that the linux PCI
+ * hotplug framework can find the slot and identify it as
+ * hot-pluggable.
+ *
+ * The "ibm,slot-label" property is used by linux as the slot name
+ */
+ slot->pluggable = 1;
+ pci_slot_add_dt_properties(slot, phb->dt_node);
+
+ if (platform.ocapi->ocapi_slot_label)
+ label = platform.ocapi->ocapi_slot_label(dev->npu->chip_id,
+ dev->brick_index);
+
+ if (!label) {
+ snprintf(name, sizeof(name), "OPENCAPI-%04x",
+ (int)PCI_SLOT_PHB_INDEX(slot->id));
+ label = name;
+ }
+ dt_add_property_string(phb->dt_node, "ibm,slot-label", label);
+}
+
+static struct pci_slot *npu2_opencapi_slot_create(struct phb *phb)
+{
+ struct pci_slot *slot;
+
+ slot = pci_slot_alloc(phb, NULL);
+ if (!slot)
+ return slot;
+
+ /* TODO: Figure out other slot functions */
+ slot->ops.get_presence_state = npu2_opencapi_get_presence_state;
+ slot->ops.get_link_state = npu2_opencapi_get_link_state;
+ slot->ops.get_power_state = npu2_opencapi_get_power_state;
+ slot->ops.get_attention_state = NULL;
+ slot->ops.get_latch_state = NULL;
+ slot->ops.set_power_state = npu2_opencapi_set_power_state;
+ slot->ops.set_attention_state = NULL;
+
+ slot->ops.prepare_link_change = npu2_opencapi_prepare_link_change;
+ slot->ops.poll_link = npu2_opencapi_poll_link;
+ slot->ops.creset = npu2_opencapi_creset;
+ slot->ops.freset = npu2_opencapi_freset;
+ slot->ops.hreset = npu2_opencapi_hreset;
+
+ return slot;
+}
+
+static int64_t npu2_opencapi_pcicfg_check(struct npu2_dev *dev, uint32_t offset,
+ uint32_t size)
+{
+ if (!dev || offset > 0xfff || (offset & (size - 1)))
+ return OPAL_PARAMETER;
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t npu2_opencapi_pcicfg_read(struct phb *phb, uint32_t bdfn,
+ uint32_t offset, uint32_t size,
+ void *data)
+{
+ uint64_t cfg_addr;
+ struct npu2_dev *dev = phb_to_npu2_dev_ocapi(phb);
+ uint64_t genid_base;
+ int64_t rc;
+
+ rc = npu2_opencapi_pcicfg_check(dev, offset, size);
+ if (rc)
+ return rc;
+
+ genid_base = dev->bars[1].npu2_bar.base +
+ (index_to_block(dev->brick_index) == NPU2_BLOCK_OTL1 ? 256 : 0);
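+ /*
+ * Each OTL has a 256-byte window in the GENID BAR: the config
+ * address register sits at offset 0 and the data register at
+ * offset 128.
+ */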
+
+ cfg_addr = NPU2_CQ_CTL_CONFIG_ADDR_ENABLE;
+ cfg_addr = SETFIELD(NPU2_CQ_CTL_CONFIG_ADDR_BUS_NUMBER |
+ NPU2_CQ_CTL_CONFIG_ADDR_DEVICE_NUMBER |
+ NPU2_CQ_CTL_CONFIG_ADDR_FUNCTION_NUMBER,
+ cfg_addr, bdfn);
+ cfg_addr = SETFIELD(NPU2_CQ_CTL_CONFIG_ADDR_REGISTER_NUMBER,
+ cfg_addr, offset & ~3u);
+
+ out_be64((beint64_t *)genid_base, cfg_addr);
+ sync();
+
+ switch (size) {
+ case 1:
+ *((uint8_t *)data) =
+ in_8((volatile uint8_t *)(genid_base + 128 + (offset & 3)));
+ break;
+ case 2:
+ *((uint16_t *)data) =
+ in_le16((volatile leint16_t *)(genid_base + 128 + (offset & 2)));
+ break;
+ case 4:
+ *((uint32_t *)data) = in_le32((volatile leint32_t *)(genid_base + 128));
+ break;
+ default:
+ return OPAL_PARAMETER;
+ }
+
+ return OPAL_SUCCESS;
+}
+
+#define NPU2_OPENCAPI_PCI_CFG_READ(size, type) \
+static int64_t npu2_opencapi_pcicfg_read##size(struct phb *phb, \
+ uint32_t bdfn, \
+ uint32_t offset, \
+ type *data) \
+{ \
+ /* Initialize data in case of error */ \
+ *data = (type)0xffffffff; \
+ return npu2_opencapi_pcicfg_read(phb, bdfn, offset, \
+ sizeof(type), data); \
+}
+
+static int64_t npu2_opencapi_pcicfg_write(struct phb *phb, uint32_t bdfn,
+ uint32_t offset, uint32_t size,
+ uint32_t data)
+{
+ uint64_t cfg_addr;
+ struct npu2_dev *dev = phb_to_npu2_dev_ocapi(phb);
+ uint64_t genid_base;
+ int64_t rc;
+
+ rc = npu2_opencapi_pcicfg_check(dev, offset, size);
+ if (rc)
+ return rc;
+
+ genid_base = dev->bars[1].npu2_bar.base +
+ (index_to_block(dev->brick_index) == NPU2_BLOCK_OTL1 ? 256 : 0);
+
+ cfg_addr = NPU2_CQ_CTL_CONFIG_ADDR_ENABLE;
+ cfg_addr = SETFIELD(NPU2_CQ_CTL_CONFIG_ADDR_BUS_NUMBER |
+ NPU2_CQ_CTL_CONFIG_ADDR_DEVICE_NUMBER |
+ NPU2_CQ_CTL_CONFIG_ADDR_FUNCTION_NUMBER,
+ cfg_addr, bdfn);
+ cfg_addr = SETFIELD(NPU2_CQ_CTL_CONFIG_ADDR_REGISTER_NUMBER,
+ cfg_addr, offset & ~3u);
+
+ out_be64((beint64_t *)genid_base, cfg_addr);
+ sync();
+
+ switch (size) {
+ case 1:
+ out_8((volatile uint8_t *)(genid_base + 128 + (offset & 3)),
+ data);
+ break;
+ case 2:
+ out_le16((volatile leint16_t *)(genid_base + 128 + (offset & 2)),
+ data);
+ break;
+ case 4:
+ out_le32((volatile leint32_t *)(genid_base + 128), data);
+ break;
+ default:
+ return OPAL_PARAMETER;
+ }
+
+ return OPAL_SUCCESS;
+}
+
+#define NPU2_OPENCAPI_PCI_CFG_WRITE(size, type) \
+static int64_t npu2_opencapi_pcicfg_write##size(struct phb *phb, \
+ uint32_t bdfn, \
+ uint32_t offset, \
+ type data) \
+{ \
+ return npu2_opencapi_pcicfg_write(phb, bdfn, offset, \
+ sizeof(type), data); \
+}
+
+NPU2_OPENCAPI_PCI_CFG_READ(8, u8)
+NPU2_OPENCAPI_PCI_CFG_READ(16, u16)
+NPU2_OPENCAPI_PCI_CFG_READ(32, u32)
+NPU2_OPENCAPI_PCI_CFG_WRITE(8, u8)
+NPU2_OPENCAPI_PCI_CFG_WRITE(16, u16)
+NPU2_OPENCAPI_PCI_CFG_WRITE(32, u32)
+
+static int64_t npu2_opencapi_ioda_reset(struct phb __unused *phb,
+ bool __unused purge)
+{
+ /* Not relevant to OpenCAPI - we do this just to silence the error */
+ return OPAL_SUCCESS;
+}
+
+static int64_t npu2_opencapi_set_pe(struct phb *phb,
+ uint64_t pe_num,
+ uint64_t __unused bdfn,
+ uint8_t __unused bcompare,
+ uint8_t __unused dcompare,
+ uint8_t __unused fcompare,
+ uint8_t action)
+{
+ struct npu2_dev *dev = phb_to_npu2_dev_ocapi(phb);
+ /*
+ * Ignored on OpenCAPI - we use fixed PE assignments. May need
+ * addressing when we support dual-link devices.
+ *
+ * We nonetheless store the PE reported by the OS so that we
+ * can send it back in case of error. If there are several PCI
+ * functions on the device, the OS can define several PEs; we
+ * only keep one and let the OS deal with it.
+ */
+ if (action != OPAL_MAP_PE && action != OPAL_UNMAP_PE)
+ return OPAL_PARAMETER;
+
+ if (action == OPAL_UNMAP_PE)
+ pe_num = -1;
+ dev->linux_pe = pe_num;
+ return OPAL_SUCCESS;
+}
+
+static int64_t npu2_opencapi_freeze_status(struct phb *phb __unused,
+ uint64_t pe_number __unused,
+ uint8_t *freeze_state,
+ uint16_t *pci_error_type,
+ uint16_t *severity)
+{
+ *freeze_state = OPAL_EEH_STOPPED_NOT_FROZEN;
+ *pci_error_type = OPAL_EEH_NO_ERROR;
+ if (severity)
+ *severity = OPAL_EEH_SEV_NO_ERROR;
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t npu2_opencapi_eeh_next_error(struct phb *phb,
+ uint64_t *first_frozen_pe,
+ uint16_t *pci_error_type,
+ uint16_t *severity)
+{
+ struct npu2_dev *dev = phb_to_npu2_dev_ocapi(phb);
+
+ if (!first_frozen_pe || !pci_error_type || !severity)
+ return OPAL_PARAMETER;
+
+ if (dev->flags & NPU2_DEV_BROKEN) {
+ OCAPIDBG(dev, "Reporting device as broken\n");
+ *first_frozen_pe = dev->linux_pe;
+ *pci_error_type = OPAL_EEH_PHB_ERROR;
+ *severity = OPAL_EEH_SEV_PHB_DEAD;
+ } else {
+ *first_frozen_pe = -1;
+ *pci_error_type = OPAL_EEH_NO_ERROR;
+ *severity = OPAL_EEH_SEV_NO_ERROR;
+ }
+ return OPAL_SUCCESS;
+}
+
+static int npu2_add_mmio_regs(struct phb *phb, struct pci_device *pd,
+ void *data __unused)
+{
+ uint32_t irq;
+ struct npu2_dev *dev = phb_to_npu2_dev_ocapi(phb);
+ uint64_t block = index_to_block(dev->brick_index);
+ uint64_t stacku = index_to_stacku(dev->brick_index);
+ uint64_t dsisr, dar, tfc, handle;
+
+ /*
+ * Pass the hw irq number for the translation fault irq.
+ * Irq levels 23 -> 26 are for translation faults, 1 per brick.
+ */
+ irq = dev->npu->base_lsi + NPU_IRQ_LEVELS_XSL;
+ if (stacku == NPU2_STACK_STCK_2U)
+ irq += 2;
+ if (block == NPU2_BLOCK_OTL1)
+ irq++;
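+ /* i.e. bricks 2, 3, 4, 5 get XSL irq levels 23, 24, 25, 26 */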
+
+ /*
+ * Add the addresses of the registers needed by the OS to handle
+ * faults. The OS accesses them by mmio.
+ */
+ dsisr = (uint64_t) dev->npu->regs + NPU2_OTL_OSL_DSISR(stacku, block);
+ dar = (uint64_t) dev->npu->regs + NPU2_OTL_OSL_DAR(stacku, block);
+ tfc = (uint64_t) dev->npu->regs + NPU2_OTL_OSL_TFC(stacku, block);
+ handle = (uint64_t) dev->npu->regs + NPU2_OTL_OSL_PEHANDLE(stacku,
+ block);
+ dt_add_property_cells(pd->dn, "ibm,opal-xsl-irq", irq);
+ dt_add_property_cells(pd->dn, "ibm,opal-xsl-mmio",
+ hi32(dsisr), lo32(dsisr),
+ hi32(dar), lo32(dar),
+ hi32(tfc), lo32(tfc),
+ hi32(handle), lo32(handle));
+ return 0;
+}
+
+static void npu2_opencapi_final_fixup(struct phb *phb)
+{
+ pci_walk_dev(phb, NULL, npu2_add_mmio_regs, NULL);
+}
+
+static void mask_nvlink_fir(struct npu2 *p)
+{
+ uint64_t reg;
+
+ /*
+ * From section 13.1.3.10 of the NPU workbook: "the NV-Link
+ * Datalink Layer Stall and NoStall signals are used for a
+ * different purpose when the link is configured for
+ * OpenCAPI. Therefore, the corresponding bits in NPU FIR
+ * Register 1 must be masked and configured to NOT cause the
+ * NPU to go into Freeze or Fence mode or send an Interrupt."
+ *
+ * FIXME: will need to revisit when mixing nvlink with
+ * opencapi. Assumes an opencapi-only setup on both PHYs for
+ * now.
+ */
+
+ /* Mask FIRs */
+ xscom_read(p->chip_id, p->xscom_base + NPU2_MISC_FIR1_MASK, &reg);
+ reg = SETFIELD(PPC_BITMASK(0, 11), reg, 0xFFF);
+ xscom_write(p->chip_id, p->xscom_base + NPU2_MISC_FIR1_MASK, reg);
+
+ /* freeze disable */
+ reg = npu2_scom_read(p->chip_id, p->xscom_base,
+ NPU2_MISC_FREEZE_ENABLE1, NPU2_MISC_DA_LEN_8B);
+ reg = SETFIELD(PPC_BITMASK(0, 11), reg, 0);
+ npu2_scom_write(p->chip_id, p->xscom_base,
+ NPU2_MISC_FREEZE_ENABLE1, NPU2_MISC_DA_LEN_8B, reg);
+
+ /* fence disable */
+ reg = npu2_scom_read(p->chip_id, p->xscom_base,
+ NPU2_MISC_FENCE_ENABLE1, NPU2_MISC_DA_LEN_8B);
+ reg = SETFIELD(PPC_BITMASK(0, 11), reg, 0);
+ npu2_scom_write(p->chip_id, p->xscom_base,
+ NPU2_MISC_FENCE_ENABLE1, NPU2_MISC_DA_LEN_8B, reg);
+
+ /* irq disable */
+ reg = npu2_scom_read(p->chip_id, p->xscom_base,
+ NPU2_MISC_IRQ_ENABLE1, NPU2_MISC_DA_LEN_8B);
+ reg = SETFIELD(PPC_BITMASK(0, 11), reg, 0);
+ npu2_scom_write(p->chip_id, p->xscom_base,
+ NPU2_MISC_IRQ_ENABLE1, NPU2_MISC_DA_LEN_8B, reg);
+}
+
+static int enable_interrupts(struct npu2 *p)
+{
+ uint64_t reg, xsl_fault, xstop_override, xsl_mask;
+
+ /*
+ * We need to:
+ * - enable translation interrupts for all bricks
+ * - override most brick-fatal errors from FIR2 to send an
+ * interrupt instead of the default action of checkstopping
+ * the system, since we can just fence the brick and keep
+ * the system alive.
+ * - the exception to the above is 2 FIRs for XSL errors
+ * resulting from bad AFU behavior, for which we don't want to
+ * checkstop but can't configure to send an error interrupt
+ * either, as the XSL errors are reported on 2 links (the
+ * XSL is shared between 2 links). Instead, we mask
+ * them. The XSL errors will result in an OTL error, which
+ * is reported only once, for the correct link.
+ *
+ * FIR bits configured to trigger an interrupt must have their
+ * default action masked
+ */
+ xsl_fault = PPC_BIT(0) | PPC_BIT(1) | PPC_BIT(2) | PPC_BIT(3);
+ xstop_override = 0x0FFFEFC00F91B000;
+ xsl_mask = NPU2_CHECKSTOP_REG2_XSL_XLAT_REQ_WHILE_SPAP_INVALID |
+ NPU2_CHECKSTOP_REG2_XSL_INVALID_PEE;
+
+ xscom_read(p->chip_id, p->xscom_base + NPU2_MISC_FIR2_MASK, &reg);
+ reg |= xsl_fault | xstop_override | xsl_mask;
+ xscom_write(p->chip_id, p->xscom_base + NPU2_MISC_FIR2_MASK, reg);
+
+ reg = npu2_scom_read(p->chip_id, p->xscom_base, NPU2_MISC_IRQ_ENABLE2,
+ NPU2_MISC_DA_LEN_8B);
+ reg |= xsl_fault | xstop_override;
+ npu2_scom_write(p->chip_id, p->xscom_base, NPU2_MISC_IRQ_ENABLE2,
+ NPU2_MISC_DA_LEN_8B, reg);
+
+ /*
+ * Make sure the brick is fenced on those errors.
+ * Fencing is incompatible with freezing, but there's no
+ * freeze defined for FIR2, so we don't have to worry about it
+ *
+ * For the 2 XSL bits we ignore, we need to make sure they
+ * don't fence the link, as the NPU logic could allow it even
+ * when masked.
+ */
+ reg = npu2_scom_read(p->chip_id, p->xscom_base, NPU2_MISC_FENCE_ENABLE2,
+ NPU2_MISC_DA_LEN_8B);
+ reg |= xstop_override;
+ reg &= ~NPU2_CHECKSTOP_REG2_XSL_XLAT_REQ_WHILE_SPAP_INVALID;
+ reg &= ~NPU2_CHECKSTOP_REG2_XSL_INVALID_PEE;
+ npu2_scom_write(p->chip_id, p->xscom_base, NPU2_MISC_FENCE_ENABLE2,
+ NPU2_MISC_DA_LEN_8B, reg);
+
+ mask_nvlink_fir(p);
+ return 0;
+}
+
+static void setup_debug_training_state(struct npu2_dev *dev)
+{
+ npu2_opencapi_phy_reset(dev);
+
+ switch (npu2_ocapi_training_state) {
+ case NPU2_TRAIN_PRBS31:
+ OCAPIINF(dev, "sending PRBS31 pattern per NVRAM setting\n");
+ npu2_opencapi_phy_prbs31(dev);
+ break;
+
+ case NPU2_TRAIN_NONE:
+ OCAPIINF(dev, "link not trained per NVRAM setting\n");
+ break;
+ default:
+ assert(false);
+ }
+}
+
+static void setup_device(struct npu2_dev *dev)
+{
+ struct dt_node *dn_phb;
+ struct pci_slot *slot;
+ uint64_t mm_win[2];
+
+ /* Populate PHB device node */
+ phys_map_get(dev->npu->chip_id, NPU_OCAPI_MMIO, dev->brick_index, &mm_win[0],
+ &mm_win[1]);
+ prlog(PR_DEBUG, "OCAPI: Setting MMIO window to %016llx + %016llx\n",
+ mm_win[0], mm_win[1]);
+ dn_phb = dt_new_addr(dt_root, "pciex", mm_win[0]);
+ assert(dn_phb);
+ dt_add_property_strings(dn_phb,
+ "compatible",
+ "ibm,power9-npu-opencapi-pciex",
+ "ibm,ioda2-npu2-opencapi-phb");
+
+ dt_add_property_cells(dn_phb, "#address-cells", 3);
+ dt_add_property_cells(dn_phb, "#size-cells", 2);
+ dt_add_property_cells(dn_phb, "#interrupt-cells", 1);
+ dt_add_property_cells(dn_phb, "bus-range", 0, 0xff);
+ dt_add_property_cells(dn_phb, "clock-frequency", 0x200, 0);
+ dt_add_property_cells(dn_phb, "interrupt-parent", get_ics_phandle());
+
+ dt_add_property_strings(dn_phb, "device_type", "pciex");
+ dt_add_property(dn_phb, "reg", mm_win, sizeof(mm_win));
+ dt_add_property_cells(dn_phb, "ibm,npu-index", dev->npu->index);
+ dt_add_property_cells(dn_phb, "ibm,phb-index",
+ npu2_get_phb_index(dev->brick_index));
+ dt_add_property_cells(dn_phb, "ibm,chip-id", dev->npu->chip_id);
+ dt_add_property_cells(dn_phb, "ibm,xscom-base", dev->npu->xscom_base);
+ dt_add_property_cells(dn_phb, "ibm,npcq", dev->npu->dt_node->phandle);
+ dt_add_property_cells(dn_phb, "ibm,links", 1);
+ dt_add_property(dn_phb, "ibm,mmio-window", mm_win, sizeof(mm_win));
+ dt_add_property_cells(dn_phb, "ibm,phb-diag-data-size", 0);
+
+ /*
+ * We ignore whatever PE numbers Linux tries to set, so we just
+ * advertise enough that Linux won't complain
+ */
+ dt_add_property_cells(dn_phb, "ibm,opal-num-pes", NPU2_MAX_PE_NUM);
+ dt_add_property_cells(dn_phb, "ibm,opal-reserved-pe", NPU2_RESERVED_PE_NUM);
+
+ dt_add_property_cells(dn_phb, "ranges", 0x02000000,
+ hi32(mm_win[0]), lo32(mm_win[0]),
+ hi32(mm_win[0]), lo32(mm_win[0]),
+ hi32(mm_win[1]), lo32(mm_win[1]));
+
+ dev->phb_ocapi.dt_node = dn_phb;
+ dev->phb_ocapi.ops = &npu2_opencapi_ops;
+ dev->phb_ocapi.phb_type = phb_type_npu_v2_opencapi;
+ dev->phb_ocapi.scan_map = 0;
+
+ dev->bdfn = 0;
+ dev->linux_pe = -1;
+
+ /* TODO: Procedure 13.1.3.7 - AFU Memory Range BARs */
+ /* Procedure 13.1.3.8 - AFU MMIO Range BARs */
+ setup_afu_mmio_bars(dev->npu->chip_id, dev->npu->xscom_base, dev);
+ /* Procedure 13.1.3.9 - AFU Config BARs */
+ setup_afu_config_bars(dev->npu->chip_id, dev->npu->xscom_base, dev);
+ setup_perf_counters(dev);
+ npu2_opencapi_phy_init(dev);
+
+ set_fence_control(dev->npu->chip_id, dev->npu->xscom_base, dev->brick_index, 0b00);
+
+ pci_register_phb(&dev->phb_ocapi, OPAL_DYNAMIC_PHB_ID);
+
+ if (npu2_ocapi_training_state != NPU2_TRAIN_DEFAULT) {
+ setup_debug_training_state(dev);
+ } else {
+ slot = npu2_opencapi_slot_create(&dev->phb_ocapi);
+ if (!slot) {
+ /**
+ * @fwts-label OCAPICannotCreatePHBSlot
+ * @fwts-advice Firmware probably ran out of memory creating
+ * NPU slot. OpenCAPI functionality could be broken.
+ */
+ prlog(PR_ERR, "OCAPI: Cannot create PHB slot\n");
+ }
+ make_slot_hotpluggable(slot, &dev->phb_ocapi);
+ }
+ return;
+}
+
+static void read_nvram_training_state(void)
+{
+ const char *state;
+
+ state = nvram_query_dangerous("opencapi-link-training");
+ if (state) {
+ if (!strcmp(state, "prbs31"))
+ npu2_ocapi_training_state = NPU2_TRAIN_PRBS31;
+ else if (!strcmp(state, "none"))
+ npu2_ocapi_training_state = NPU2_TRAIN_NONE;
+ else
+ prlog(PR_WARNING,
+ "OCAPI: invalid training state in NVRAM: %s\n",
+ state);
+ }
+}
+
+int npu2_opencapi_init_npu(struct npu2 *npu)
+{
+ struct npu2_dev *dev;
+ uint64_t reg[2];
+
+ assert(platform.ocapi);
+ read_nvram_training_state();
+
+ /* TODO: Test OpenCAPI with fast reboot and make it work */
+ disable_fast_reboot("OpenCAPI device enabled");
+
+ setup_global_mmio_bar(npu->chip_id, npu->xscom_base, reg);
+
+ npu->regs = (void *)reg[0];
+
+ for (int i = 0; i < npu->total_devices; i++) {
+ dev = &npu->devices[i];
+ if (dev->type != NPU2_DEV_TYPE_OPENCAPI)
+ continue;
+
+ prlog(PR_INFO, "OCAPI: Configuring link index %d, brick %d\n",
+ dev->link_index, dev->brick_index);
+
+ /* Procedure 13.1.3.1 - Select OCAPI vs NVLink */
+ brick_config(npu->chip_id, npu->xscom_base, dev->brick_index);
+
+ /* Procedure 13.1.3.4 - Brick to PE Mapping */
+ pe_config(dev);
+
+ /* Procedure 13.1.3.5 - Transaction Layer Configuration */
+ tl_config(npu->chip_id, npu->xscom_base, dev->brick_index);
+
+ /* Procedure 13.1.3.6 - Address Translation Configuration */
+ address_translation_config(npu->chip_id, npu->xscom_base, dev->brick_index);
+ }
+
+ enable_interrupts(npu);
+
+ for (int i = 0; i < npu->total_devices; i++) {
+ dev = &npu->devices[i];
+ if (dev->type != NPU2_DEV_TYPE_OPENCAPI)
+ continue;
+ setup_device(dev);
+ }
+
+ return 0;
+}
+
+static const struct phb_ops npu2_opencapi_ops = {
+ .cfg_read8 = npu2_opencapi_pcicfg_read8,
+ .cfg_read16 = npu2_opencapi_pcicfg_read16,
+ .cfg_read32 = npu2_opencapi_pcicfg_read32,
+ .cfg_write8 = npu2_opencapi_pcicfg_write8,
+ .cfg_write16 = npu2_opencapi_pcicfg_write16,
+ .cfg_write32 = npu2_opencapi_pcicfg_write32,
+ .device_init = NULL,
+ .phb_final_fixup = npu2_opencapi_final_fixup,
+ .ioda_reset = npu2_opencapi_ioda_reset,
+ .papr_errinjct_reset = NULL,
+ .pci_reinit = NULL,
+ .set_phb_mem_window = NULL,
+ .phb_mmio_enable = NULL,
+ .map_pe_mmio_window = NULL,
+ .map_pe_dma_window = NULL,
+ .map_pe_dma_window_real = NULL,
+ .pci_msi_eoi = NULL,
+ .set_xive_pe = NULL,
+ .get_msi_32 = NULL,
+ .get_msi_64 = NULL,
+ .set_pe = npu2_opencapi_set_pe,
+ .set_peltv = NULL,
+ .eeh_freeze_status = npu2_opencapi_freeze_status,
+ .eeh_freeze_clear = NULL,
+ .eeh_freeze_set = NULL,
+ .next_error = npu2_opencapi_eeh_next_error,
+ .err_inject = NULL,
+ .get_diag_data2 = NULL,
+ .set_capi_mode = NULL,
+ .set_capp_recovery = NULL,
+ .tce_kill = NULL,
+};
+
+void npu2_opencapi_set_broken(struct npu2 *npu, int brick)
+{
+ struct phb *phb;
+ struct npu2_dev *dev;
+
+ for_each_phb(phb) {
+ if (phb->phb_type == phb_type_npu_v2_opencapi) {
+ dev = phb_to_npu2_dev_ocapi(phb);
+ if (dev->npu == npu &&
+ dev->brick_index == brick)
+ dev->flags |= NPU2_DEV_BROKEN;
+ }
+ }
+}
+
+static int64_t opal_npu_spa_setup(uint64_t phb_id, uint32_t __unused bdfn,
+ uint64_t addr, uint64_t PE_mask)
+{
+ uint64_t stack, block, offset, reg;
+ struct phb *phb = pci_get_phb(phb_id);
+ struct npu2_dev *dev;
+ int rc;
+
+ if (!phb || phb->phb_type != phb_type_npu_v2_opencapi)
+ return OPAL_PARAMETER;
+
+ /* 4k aligned */
+ if (addr & 0xFFF)
+ return OPAL_PARAMETER;
+
+ if (PE_mask > 15)
+ return OPAL_PARAMETER;
+
+ dev = phb_to_npu2_dev_ocapi(phb);
+ if (!dev)
+ return OPAL_PARAMETER;
+
+ block = index_to_block(dev->brick_index);
+ stack = index_to_stack(dev->brick_index);
+ if (block == NPU2_BLOCK_OTL1)
+ offset = NPU2_XSL_PSL_SPAP_A1;
+ else
+ offset = NPU2_XSL_PSL_SPAP_A0;
+
+ lock(&dev->npu->lock);
+ /*
+ * set the SPAP used by the device
+ */
+ reg = npu2_scom_read(dev->npu->chip_id, dev->npu->xscom_base,
+ NPU2_REG_OFFSET(stack, NPU2_BLOCK_XSL, offset),
+ NPU2_MISC_DA_LEN_8B);
+ if ((addr && (reg & NPU2_XSL_PSL_SPAP_EN)) ||
+ (!addr && !(reg & NPU2_XSL_PSL_SPAP_EN))) {
+ rc = OPAL_BUSY;
+ goto out;
+ }
+ /* SPA is disabled by passing a NULL address */
+ reg = addr;
+ if (addr)
+ reg = addr | NPU2_XSL_PSL_SPAP_EN;
+
+ npu2_scom_write(dev->npu->chip_id, dev->npu->xscom_base,
+ NPU2_REG_OFFSET(stack, NPU2_BLOCK_XSL, offset),
+ NPU2_MISC_DA_LEN_8B, reg);
+
+ /*
+ * set the PE mask that the OS uses for PASID -> PE handle
+ * conversion
+ */
+ reg = npu2_scom_read(dev->npu->chip_id, dev->npu->xscom_base,
+ NPU2_OTL_CONFIG0(stack, block), NPU2_MISC_DA_LEN_8B);
+ reg &= ~NPU2_OTL_CONFIG0_PE_MASK;
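+ /* PE_mask is 4 bits wide (0-15); the shift places it in PPC bits 4-7 */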
+ reg |= (PE_mask << (63-7));
+ npu2_scom_write(dev->npu->chip_id, dev->npu->xscom_base,
+ NPU2_OTL_CONFIG0(stack, block), NPU2_MISC_DA_LEN_8B,
+ reg);
+ rc = OPAL_SUCCESS;
+out:
+ unlock(&dev->npu->lock);
+ return rc;
+}
+opal_call(OPAL_NPU_SPA_SETUP, opal_npu_spa_setup, 4);
+
+static int64_t opal_npu_spa_clear_cache(uint64_t phb_id, uint32_t __unused bdfn,
+ uint64_t PE_handle)
+{
+ uint64_t cc_inv, stack, block, reg, rc;
+ uint32_t retries = 5;
+ struct phb *phb = pci_get_phb(phb_id);
+ struct npu2_dev *dev;
+
+ if (!phb || phb->phb_type != phb_type_npu_v2_opencapi)
+ return OPAL_PARAMETER;
+
+ if (PE_handle > MAX_PE_HANDLE)
+ return OPAL_PARAMETER;
+
+ dev = phb_to_npu2_dev_ocapi(phb);
+ if (!dev)
+ return OPAL_PARAMETER;
+
+ block = index_to_block(dev->brick_index);
+ stack = index_to_stack(dev->brick_index);
+ cc_inv = NPU2_REG_OFFSET(stack, NPU2_BLOCK_XSL, NPU2_XSL_PSL_LLCMD_A0);
+
+ lock(&dev->npu->lock);
+ reg = npu2_scom_read(dev->npu->chip_id, dev->npu->xscom_base, cc_inv,
+ NPU2_MISC_DA_LEN_8B);
+ if (reg & PPC_BIT(16)) {
+ rc = OPAL_BUSY;
+ goto out;
+ }
+
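+ /*
+ * Kick the invalidation: PPC bit 15 issues the command for the
+ * given PE handle and bit 48 targets the second link (OTL1).
+ * Bit 16, polled below, stays set while the command is pending.
+ */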
+ reg = PE_handle | PPC_BIT(15);
+ if (block == NPU2_BLOCK_OTL1)
+ reg |= PPC_BIT(48);
+ npu2_scom_write(dev->npu->chip_id, dev->npu->xscom_base, cc_inv,
+ NPU2_MISC_DA_LEN_8B, reg);
+
+ rc = OPAL_HARDWARE;
+ while (retries--) {
+ reg = npu2_scom_read(dev->npu->chip_id, dev->npu->xscom_base,
+ cc_inv, NPU2_MISC_DA_LEN_8B);
+ if (!(reg & PPC_BIT(16))) {
+ rc = OPAL_SUCCESS;
+ break;
+ }
+ /* the bit is expected to flip in less than 200us */
+ time_wait_us(200);
+ }
+out:
+ unlock(&dev->npu->lock);
+ return rc;
+}
+opal_call(OPAL_NPU_SPA_CLEAR_CACHE, opal_npu_spa_clear_cache, 3);
+
+static int get_template_rate(unsigned int templ, char *rate_buf)
+{
+ int shift, idx, val;
+
+ /*
+ * Each rate is encoded over 4 bits (0->15), with 15 being the
+ * slowest. The buffer is a succession of rates for all the
+ * templates. The first 4 bits are for template 63, followed
+ * by 4 bits for template 62, ... etc. So the rate for
+ * template 0 is at the very end of the buffer.
+ */
+ idx = (TL_MAX_TEMPLATE - templ) / 2;
+ shift = 4 * (1 - ((TL_MAX_TEMPLATE - templ) % 2));
+ val = rate_buf[idx] >> shift;
+ return val;
+}
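+
+/*
+ * A worked example of the decode above (illustrative only, assuming
+ * TL_MAX_TEMPLATE is 63 and the rate buffer spans 64 templates x 4
+ * bits = 32 bytes):
+ *
+ *   templ = 63: idx = 0,  shift = 4 -> high nibble of rate_buf[0]
+ *   templ = 62: idx = 0,  shift = 0 -> low nibble of rate_buf[0]
+ *   templ = 0:  idx = 31, shift = 0 -> low nibble of rate_buf[31]
+ */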
+
+static bool is_template_supported(unsigned int templ, long capabilities)
+{
+ return !!(capabilities & (1ull << templ));
+}
+
+static int64_t opal_npu_tl_set(uint64_t phb_id, uint32_t __unused bdfn,
+ long capabilities, uint64_t rate_phys, int rate_sz)
+{
+ struct phb *phb = pci_get_phb(phb_id);
+ struct npu2_dev *dev;
+ uint64_t stack, block, reg, templ_rate;
+ int i, rate_pos;
+ char *rate = (char *) rate_phys;
+
+ if (!phb || phb->phb_type != phb_type_npu_v2_opencapi)
+ return OPAL_PARAMETER;
+ if (!opal_addr_valid(rate) || rate_sz != TL_RATE_BUF_SIZE)
+ return OPAL_PARAMETER;
+
+ dev = phb_to_npu2_dev_ocapi(phb);
+ if (!dev)
+ return OPAL_PARAMETER;
+
+ block = index_to_block(dev->brick_index);
+ stack = index_to_stack(dev->brick_index);
+ /*
+ * The 'capabilities' argument defines what TL template the
+ * device can receive. OpenCAPI 3.0 and 4.0 define 64 templates, so
+ * that's one bit per template.
+ *
+ * For each template, the device processing time may vary, so
+ * the device advertises at what rate a message of a given
+ * template can be sent. That's encoded in the 'rate' buffer.
+ *
+ * On P9, the NPU only knows about TL templates 0 -> 3.
+ * Per the spec, template 0 must be supported.
+ */
+ if (!is_template_supported(0, capabilities))
+ return OPAL_PARAMETER;
+
+ reg = npu2_scom_read(dev->npu->chip_id, dev->npu->xscom_base,
+ NPU2_OTL_CONFIG1(stack, block),
+ NPU2_MISC_DA_LEN_8B);
+ reg &= ~(NPU2_OTL_CONFIG1_TX_TEMP1_EN | NPU2_OTL_CONFIG1_TX_TEMP2_EN |
+ NPU2_OTL_CONFIG1_TX_TEMP3_EN);
+ for (i = 0; i < 4; i++) {
+ /* Skip template 0 as it is implicitly enabled */
+ if (i && is_template_supported(i, capabilities))
+ reg |= PPC_BIT(i);
+ /* The tx rate should still be set for template 0 */
+ templ_rate = get_template_rate(i, rate);
+ rate_pos = 8 + i * 4;
+ reg = SETFIELD(PPC_BITMASK(rate_pos, rate_pos + 3), reg,
+ templ_rate);
+ }
+ npu2_scom_write(dev->npu->chip_id, dev->npu->xscom_base,
+ NPU2_OTL_CONFIG1(stack, block), NPU2_MISC_DA_LEN_8B,
+ reg);
+ OCAPIDBG(dev, "OTL configuration 1 register set to %llx\n", reg);
+ return OPAL_SUCCESS;
+}
+opal_call(OPAL_NPU_TL_SET, opal_npu_tl_set, 5);
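+
+/*
+ * For illustration: with the loop above, a 'capabilities' value of
+ * 0xb (templates 0, 1 and 3 supported) sets PPC_BIT(1) and PPC_BIT(3)
+ * in OTL config1 (template 0 is implicitly enabled), while the four
+ * 4-bit rates land at bits 8-11, 12-15, 16-19 and 20-23 for
+ * templates 0 to 3 respectively.
+ */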
+
+static void set_mem_bar(struct npu2_dev *dev, uint64_t base, uint64_t size)
+{
+ uint64_t stack, val, reg, bar_offset, pa_config_offset;
+ uint8_t memsel;
+
+ stack = index_to_stack(dev->brick_index);
+ switch (dev->brick_index) {
+ case 2:
+ case 4:
+ bar_offset = NPU2_GPU0_MEM_BAR;
+ pa_config_offset = NPU2_CQ_CTL_MISC_PA0_CONFIG;
+ break;
+ case 3:
+ case 5:
+ bar_offset = NPU2_GPU1_MEM_BAR;
+ pa_config_offset = NPU2_CQ_CTL_MISC_PA1_CONFIG;
+ break;
+ default:
+ assert(false);
+ }
+
+ assert((!size && !base) || (size && base));
+
+ /*
+ * Memory select configuration:
+ * - 0b000 - BAR disabled
+ * - 0b001 - match 0b00, 0b01
+ * - 0b010 - match 0b01, 0b10
+ * - 0b011 - match 0b00, 0b10
+ * - 0b100 - match 0b00
+ * - 0b101 - match 0b01
+ * - 0b110 - match 0b10
+ * - 0b111 - match 0b00, 0b01, 0b10
+ */
+ memsel = GETFIELD(PPC_BITMASK(13, 14), base);
+ if (size)
+ val = SETFIELD(NPU2_MEM_BAR_EN | NPU2_MEM_BAR_SEL_MEM, 0ULL, 0b100 + memsel);
+ else
+ val = 0;
+
+ /* Base address - 12 bits, 1G aligned */
+ val = SETFIELD(NPU2_MEM_BAR_NODE_ADDR, val, GETFIELD(PPC_BITMASK(22, 33), base));
+
+ /* GCID */
+ val = SETFIELD(NPU2_MEM_BAR_GROUP, val, GETFIELD(PPC_BITMASK(15, 18), base));
+ val = SETFIELD(NPU2_MEM_BAR_CHIP, val, GETFIELD(PPC_BITMASK(19, 21), base));
+
+ /* Other settings */
+ val = SETFIELD(NPU2_MEM_BAR_POISON, val, 1);
+ val = SETFIELD(NPU2_MEM_BAR_GRANULE, val, 0);
+ val = SETFIELD(NPU2_MEM_BAR_BAR_SIZE, val, ilog2(size >> 30));
+ val = SETFIELD(NPU2_MEM_BAR_MODE, val, 0);
+
+ for (int block = NPU2_BLOCK_SM_0; block <= NPU2_BLOCK_SM_3; block++) {
+ reg = NPU2_REG_OFFSET(stack, block, bar_offset);
+ npu2_write(dev->npu, reg, val);
+ }
+
+ /* Set PA config */
+ if (size)
+ val = SETFIELD(NPU2_CQ_CTL_MISC_PA_CONFIG_MEMSELMATCH, 0ULL, 0b100 + memsel);
+ else
+ val = 0;
+ val = SETFIELD(NPU2_CQ_CTL_MISC_PA_CONFIG_GRANULE, val, 0);
+ val = SETFIELD(NPU2_CQ_CTL_MISC_PA_CONFIG_SIZE, val, ilog2(size >> 30));
+ val = SETFIELD(NPU2_CQ_CTL_MISC_PA_CONFIG_MODE, val, 0);
+ val = SETFIELD(NPU2_CQ_CTL_MISC_PA_CONFIG_MASK, val, 0);
+ reg = NPU2_REG_OFFSET(stack, NPU2_BLOCK_CTL, pa_config_offset);
+ npu2_write(dev->npu, reg, val);
+}
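+
+/*
+ * For illustration, the BAR value built above slices the (big-endian
+ * numbered) bits of 'base' as follows: bits 13-14 pick the memory
+ * select match (written as 0b100 + memsel), bits 15-18 the group,
+ * bits 19-21 the chip, and bits 22-33 the 1GB-aligned node address.
+ * 'size' must therefore be a power-of-two number of gigabytes, since
+ * only ilog2(size >> 30) is written to the BAR_SIZE field.
+ */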
+
+static int64_t alloc_mem_bar(struct npu2_dev *dev, uint64_t size, uint64_t *bar)
+{
+ uint64_t phys_map_base, phys_map_size, val;
+ int rc = OPAL_SUCCESS;
+
+ lock(&dev->npu->lock);
+
+ if (dev->lpc_mem_base) {
+ OCAPIERR(dev, "LPC allocation failed - BAR already in use\n");
+ rc = OPAL_RESOURCE;
+ goto out;
+ }
+
+ /*
+ * The supported chip address extension mask is 1100 100 (mask
+ * off 2 bits from group ID and 1 bit from chip ID).
+ *
+ * Fall back to only permitting a single allocation if we
+ * don't see this mask value.
+ */
+ xscom_read(dev->npu->chip_id, PB_CENT_MODE, &val);
+ if (GETFIELD(PB_CFG_CHIP_ADDR_EXTENSION_MASK_CENT, val) == 0b1100100) {
+ phys_map_get(dev->npu->chip_id, OCAPI_MEM,
+ dev->brick_index - 2, &phys_map_base,
+ &phys_map_size);
+ } else {
+ bool in_use = false;
+
+ for (int i = 0; i < dev->npu->total_devices; i++) {
+ if (dev->npu->devices[i].lpc_mem_base)
+ in_use = true;
+ }
+
+ if (in_use) {
+ OCAPIERR(dev, "LPC allocation failed - single device per chip limit, FW upgrade required (pb_cent_mode=0x%016llx)\n", val);
+ rc = OPAL_RESOURCE;
+ goto out;
+ }
+
+ phys_map_get(dev->npu->chip_id, OCAPI_MEM, 0, &phys_map_base,
+ &phys_map_size);
+ }
+
+ if (size > phys_map_size) {
+ /**
+ * @fwts-label OCAPIInvalidLPCMemoryBARSize
+ * @fwts-advice The operating system requested an unsupported
+ * amount of OpenCAPI LPC memory. This is possibly a kernel
+ * bug, or you may need to upgrade your firmware.
+ */
+ OCAPIERR(dev, "Invalid LPC memory BAR allocation size requested: 0x%llx bytes (limit 0x%llx)\n",
+ size, phys_map_size);
+ rc = OPAL_PARAMETER;
+ goto out;
+ }
+
+ /* Minimum BAR size is 1 GB */
+ if (size < (1 << 30)) {
+ size = 1 << 30;
+ }
+
+ if (!is_pow2(size)) {
+ size = 1ull << (ilog2(size) + 1);
+ }
+
+ set_mem_bar(dev, phys_map_base, size);
+ *bar = phys_map_base;
+ dev->lpc_mem_base = phys_map_base;
+ dev->lpc_mem_size = size;
+
+out:
+ unlock(&dev->npu->lock);
+ return rc;
+}
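+
+/*
+ * A couple of illustrative allocations for the rounding above: a
+ * request of 512MB is raised to the 1GB minimum, and a request of
+ * 0x60000000 bytes (1.5GB) is rounded up to the next power of two,
+ * i.e. 2GB, before the BAR is programmed.
+ */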
+
+static int64_t release_mem_bar(struct npu2_dev *dev)
+{
+ int rc = OPAL_SUCCESS;
+
+ lock(&dev->npu->lock);
+
+ if (!dev->lpc_mem_base) {
+ rc = OPAL_PARAMETER;
+ goto out;
+ }
+
+ set_mem_bar(dev, 0, 0);
+ dev->lpc_mem_base = 0;
+ dev->lpc_mem_size = 0;
+
+out:
+ unlock(&dev->npu->lock);
+ return rc;
+}
+
+static int64_t opal_npu_mem_alloc(uint64_t phb_id, uint32_t __unused bdfn,
+ uint64_t size, __be64 *__bar)
+{
+ struct phb *phb = pci_get_phb(phb_id);
+ struct npu2_dev *dev;
+ uint64_t bar;
+ int64_t rc;
+
+ if (!phb || phb->phb_type != phb_type_npu_v2_opencapi)
+ return OPAL_PARAMETER;
+
+ dev = phb_to_npu2_dev_ocapi(phb);
+ if (!dev)
+ return OPAL_PARAMETER;
+
+ if (!opal_addr_valid(__bar))
+ return OPAL_PARAMETER;
+
+ rc = alloc_mem_bar(dev, size, &bar);
+ if (rc == OPAL_SUCCESS)
+ *__bar = cpu_to_be64(bar);
+
+ return rc;
+}
+opal_call(OPAL_NPU_MEM_ALLOC, opal_npu_mem_alloc, 4);
+
+static int64_t opal_npu_mem_release(uint64_t phb_id, uint32_t __unused bdfn)
+{
+ struct phb *phb = pci_get_phb(phb_id);
+ struct npu2_dev *dev;
+
+ if (!phb || phb->phb_type != phb_type_npu_v2_opencapi)
+ return OPAL_PARAMETER;
+
+ dev = phb_to_npu2_dev_ocapi(phb);
+ if (!dev)
+ return OPAL_PARAMETER;
+
+ return release_mem_bar(dev);
+}
+opal_call(OPAL_NPU_MEM_RELEASE, opal_npu_mem_release, 2);
diff --git a/roms/skiboot/hw/npu2.c b/roms/skiboot/hw/npu2.c
new file mode 100644
index 000000000..cf57eeb0c
--- /dev/null
+++ b/roms/skiboot/hw/npu2.c
@@ -0,0 +1,2323 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * NPU - NVlink and OpenCAPI
+ *
+ * Copyright 2013-2019 IBM Corp.
+ */
+
+#include <skiboot.h>
+#include <io.h>
+#include <timebase.h>
+#include <pci-cfg.h>
+#include <pci.h>
+#include <pci-slot.h>
+#include <pci-virt.h>
+#include <opal.h>
+#include <opal-api.h>
+#include <cpu.h>
+#include <device.h>
+#include <ccan/str/str.h>
+#include <ccan/array_size/array_size.h>
+#include <affinity.h>
+#include <npu2.h>
+#include <lock.h>
+#include <xscom.h>
+#include <bitutils.h>
+#include <chip.h>
+#include <phys-map.h>
+#include <nvram.h>
+#include <xscom-p9-regs.h>
+#include <phb4.h>
+#include <cache-p9.h>
+
+#define VENDOR_CAP_START 0x80
+#define VENDOR_CAP_END 0x90
+#define VENDOR_CAP_LEN 0x10
+#define VENDOR_CAP_VERSION 0x01
+#define VENDOR_CAP_PCI_DEV_OFFSET 0x0d
+
+/*
+ * NPU2 BAR layout definition. We have 3 stacks and each of them
+ * contains 2 bricks. So every NPU2 has 6 bricks in total. There are 2
+ * PHY BARs and each of them is shared by 3 bricks. Every brick has
+ * one NTL BAR and two bricks share one GENID BAR. There is also a
+ * global MMIO BAR. We only expose DL and GENID BARs to the OS and all
+ * other BARs will be hidden in skiboot.
+ *
+ * Before the global MMIO BAR is configured, scom is the only way to
+ * access the BAR registers. At NPU2 PHB probing time, we rely on scom
+ * to assign all BARs until the global MMIO BAR is established.
+ *
+ * We need to access 4 SM registers in the same stack in order to
+ * configure one particular BAR.
+ */
+
+/* Set a specific flag in the vendor config space */
+void npu2_set_link_flag(struct npu2_dev *ndev, uint8_t flag)
+{
+ ndev->nvlink.link_flags |= flag;
+ PCI_VIRT_CFG_INIT_RO(ndev->nvlink.pvd, VENDOR_CAP_START +
+ VENDOR_CAP_PCI_DEV_OFFSET, 1, ndev->nvlink.link_flags);
+}
+
+void npu2_clear_link_flag(struct npu2_dev *ndev, uint8_t flag)
+{
+ ndev->nvlink.link_flags &= ~flag;
+ PCI_VIRT_CFG_INIT_RO(ndev->nvlink.pvd, VENDOR_CAP_START +
+ VENDOR_CAP_PCI_DEV_OFFSET, 1, ndev->nvlink.link_flags);
+}
+
+static inline void npu2_ioda_sel(struct npu2 *p, uint32_t table,
+ uint32_t index, bool autoinc)
+{
+ out_be64(p->regs + NPU2_ATS_IODA_TBL,
+ (autoinc ? NPU2_ATS_IODA_TBL_AUTOINC : 0ul) |
+ SETFIELD(NPU2_ATS_IODA_TBL_SELECT, 0ul, table) |
+ SETFIELD(NPU2_ATS_IODA_TBL_INDEX, 0ul, index));
+}
+
+static struct npu2_dev *npu2_bdf_to_dev(struct npu2 *p,
+ uint32_t bdfn)
+{
+ struct pci_virt_device *pvd;
+
+ /* All emulated devices are attached to the root bus */
+ if (bdfn & ~0xff)
+ return NULL;
+
+ pvd = pci_virt_find_device(&p->phb_nvlink, bdfn);
+ if (pvd)
+ return pvd->data;
+
+ return NULL;
+}
+
+static inline void npu2_get_bar(uint32_t gcid, struct npu2_bar *bar)
+{
+ phys_map_get(gcid, bar->type, bar->index, &bar->base, &bar->size);
+}
+
+static void npu2_read_bar(struct npu2 *p, struct npu2_bar *bar)
+{
+ uint64_t reg, val;
+ int enabled;
+
+ reg = NPU2_REG_OFFSET(0, NPU2_BLOCK_SM_0, bar->reg);
+ val = npu2_read(p, reg);
+
+ switch (NPU2_REG(bar->reg)) {
+ case NPU2_PHY_BAR:
+ bar->base = GETFIELD(NPU2_PHY_BAR_ADDR, val) << 21;
+ enabled = GETFIELD(NPU2_PHY_BAR_ENABLE, val);
+
+ if (NPU2_REG_STACK(reg) == NPU2_STACK_STCK_2)
+ /* This is the global MMIO BAR */
+ bar->size = 0x1000000;
+ else
+ bar->size = 0x200000;
+ break;
+ case NPU2_NTL0_BAR:
+ case NPU2_NTL1_BAR:
+ bar->base = GETFIELD(NPU2_NTL_BAR_ADDR, val) << 16;
+ enabled = GETFIELD(NPU2_NTL_BAR_ENABLE, val);
+ bar->size = 0x10000 << GETFIELD(NPU2_NTL_BAR_SIZE, val);
+ break;
+ case NPU2_GENID_BAR:
+ bar->base = GETFIELD(NPU2_GENID_BAR_ADDR, val) << 16;
+ enabled = GETFIELD(NPU2_GENID_BAR_ENABLE, val);
+ bar->size = 0x20000;
+ break;
+ default:
+ bar->base = 0ul;
+ enabled = 0;
+ bar->size = 0;
+ break;
+ }
+
+ bar->flags = SETFIELD(NPU2_BAR_FLAG_ENABLED, bar->flags, enabled);
+}
+
+static void npu2_write_bar(struct npu2 *p,
+ struct npu2_bar *bar,
+ uint32_t gcid,
+ uint32_t scom)
+{
+ uint64_t reg, val, enable = !!(bar->flags & NPU2_BAR_FLAG_ENABLED);
+ int block;
+
+ switch (NPU2_REG(bar->reg)) {
+ case NPU2_PHY_BAR:
+ val = SETFIELD(NPU2_PHY_BAR_ADDR, 0ul, bar->base >> 21);
+ val = SETFIELD(NPU2_PHY_BAR_ENABLE, val, enable);
+ break;
+ case NPU2_NTL0_BAR:
+ case NPU2_NTL1_BAR:
+ val = SETFIELD(NPU2_NTL_BAR_ADDR, 0ul, bar->base >> 16);
+ val = SETFIELD(NPU2_NTL_BAR_ENABLE, val, enable);
+ val = SETFIELD(NPU2_NTL_BAR_SIZE, val, 1);
+ break;
+ case NPU2_GENID_BAR:
+ val = SETFIELD(NPU2_GENID_BAR_ADDR, 0ul, bar->base >> 16);
+ val = SETFIELD(NPU2_GENID_BAR_ENABLE, val, enable);
+ break;
+ default:
+ val = 0ul;
+ }
+
+ for (block = NPU2_BLOCK_SM_0; block <= NPU2_BLOCK_SM_3; block++) {
+ reg = NPU2_REG_OFFSET(0, block, bar->reg);
+ if (p)
+ npu2_write(p, reg, val);
+ else
+ npu2_scom_write(gcid, scom, reg, NPU2_MISC_DA_LEN_8B, val);
+ }
+}
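+
+/*
+ * Note on the size encodings used above: npu2_read_bar() decodes an
+ * NTL BAR size as 0x10000 << NPU2_NTL_BAR_SIZE, while npu2_write_bar()
+ * always writes a size field of 1, i.e. a 0x20000 (128KB) NTL window.
+ */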
+
+/* Trap for PCI command (0x4) to enable or disable device's BARs */
+static int64_t npu2_cfg_write_cmd(void *dev,
+ struct pci_cfg_reg_filter *pcrf __unused,
+ uint32_t offset, uint32_t size,
+ uint32_t *data, bool write)
+{
+ struct pci_virt_device *pvd = dev;
+ struct npu2_dev *ndev = pvd->data;
+ struct npu2_bar *ntl_npu_bar, *genid_npu_bar;
+ bool enabled;
+
+ if (!write)
+ return OPAL_PARTIAL;
+
+ if (offset != PCI_CFG_CMD)
+ return OPAL_PARAMETER;
+ if (size != 1 && size != 2 && size != 4)
+ return OPAL_PARAMETER;
+
+ /*
+ * Enable or disable NTL and GENID BAR. Two bricks share
+ * one GENID BAR, which is exposed via the first brick.
+ */
+ enabled = !!(*data & PCI_CFG_CMD_MEM_EN);
+ ntl_npu_bar = &ndev->bars[0].npu2_bar;
+ genid_npu_bar = &ndev->bars[1].npu2_bar;
+
+ ntl_npu_bar->flags = SETFIELD(NPU2_BAR_FLAG_ENABLED, ntl_npu_bar->flags, enabled);
+ npu2_write_bar(ndev->npu, ntl_npu_bar, 0, 0);
+
+ /*
+ * Enable/disable the GENID BAR. Two bricks share one GENID
+ * BAR which is exposed via the first brick so we need to
+ * track the enables separately.
+ */
+ if (NPU2DEV_BRICK(ndev))
+ genid_npu_bar->flags = SETFIELD(NPU2_BAR_FLAG_ENABLED1, genid_npu_bar->flags,
+ enabled);
+ else
+ genid_npu_bar->flags = SETFIELD(NPU2_BAR_FLAG_ENABLED0, genid_npu_bar->flags,
+ enabled);
+
+ /* Enable the BAR if either device requests it enabled, otherwise disable it */
+ genid_npu_bar->flags = SETFIELD(NPU2_BAR_FLAG_ENABLED, genid_npu_bar->flags,
+ !!(genid_npu_bar->flags & (NPU2_BAR_FLAG_ENABLED0 |
+ NPU2_BAR_FLAG_ENABLED1)));
+ npu2_write_bar(ndev->npu, genid_npu_bar, 0, 0);
+
+ return OPAL_PARTIAL;
+}
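+
+/*
+ * To summarise the shared-enable handling above: each brick records
+ * its own request in NPU2_BAR_FLAG_ENABLED0/ENABLED1, and the real
+ * NPU2_BAR_FLAG_ENABLED bit written to hardware is the logical OR of
+ * the two, so the shared GENID BAR stays enabled as long as either
+ * emulated device has memory decoding turned on.
+ */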
+
+static int64_t npu2_cfg_read_bar(struct npu2_dev *dev __unused,
+ struct pci_cfg_reg_filter *pcrf,
+ uint32_t offset, uint32_t size,
+ uint32_t *data)
+{
+ struct npu2_pcie_bar *bar = (struct npu2_pcie_bar *) pcrf->data;
+
+ if (!(bar->flags & NPU2_PCIE_BAR_FLAG_TRAPPED))
+ return OPAL_PARTIAL;
+
+ if ((size != 4) ||
+ (offset != pcrf->start && offset != pcrf->start + 4))
+ return OPAL_PARAMETER;
+
+ if (bar->flags & NPU2_PCIE_BAR_FLAG_SIZE_HI)
+ *data = bar->npu2_bar.size >> 32;
+ else
+ *data = bar->npu2_bar.size;
+ bar->flags &= ~(NPU2_PCIE_BAR_FLAG_TRAPPED | NPU2_PCIE_BAR_FLAG_SIZE_HI);
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t npu2_cfg_write_bar(struct npu2_dev *dev,
+ struct pci_cfg_reg_filter *pcrf,
+ uint32_t offset, uint32_t size,
+ uint32_t data)
+{
+ struct npu2_pcie_bar *bar = (struct npu2_pcie_bar *) pcrf->data;
+ struct npu2_bar old_bar, *npu2_bar = &bar->npu2_bar;
+
+ if ((size != 4) ||
+ (offset != pcrf->start && offset != pcrf->start + 4))
+ return OPAL_PARAMETER;
+
+ /* Return BAR size on next read */
+ if (data == 0xffffffff) {
+ bar->flags |= NPU2_PCIE_BAR_FLAG_TRAPPED;
+ if (offset == pcrf->start + 4)
+ bar->flags |= NPU2_PCIE_BAR_FLAG_SIZE_HI;
+
+ return OPAL_SUCCESS;
+ }
+
+ if (offset == pcrf->start) {
+ npu2_bar->base &= 0xffffffff00000000UL;
+ npu2_bar->base |= (data & 0xfffffff0);
+ } else {
+ npu2_bar->base &= 0x00000000ffffffffUL;
+ npu2_bar->base |= ((uint64_t)data << 32);
+
+ if (NPU2_REG(npu2_bar->reg) == NPU2_GENID_BAR && NPU2DEV_BRICK(dev))
+ npu2_bar->base -= 0x10000;
+
+ old_bar.reg = npu2_bar->reg;
+ npu2_read_bar(dev->npu, &old_bar);
+
+ /* Only allow changing the base address if the BAR is not enabled */
+ if ((npu2_bar->flags & NPU2_BAR_FLAG_ENABLED) &&
+ (npu2_bar->base != old_bar.base)) {
+ npu2_bar->base = old_bar.base;
+ return OPAL_HARDWARE;
+ }
+
+ npu2_write_bar(dev->npu, &bar->npu2_bar, 0, 0);
+ }
+
+ /* To update the config cache */
+ return OPAL_PARTIAL;
+}
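+
+/*
+ * Example of the BAR sizing handshake emulated above: when the OS
+ * writes 0xffffffff to either half of the 64-bit BAR, the TRAPPED
+ * flag is set and the next config read of that half returns the
+ * corresponding half of the BAR size instead of the base. For a
+ * 128KB NTL BAR that read returns 0x00020000 from the low word and 0
+ * from the high word.
+ */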
+
+static int64_t npu2_dev_cfg_bar(void *dev, struct pci_cfg_reg_filter *pcrf,
+ uint32_t offset, uint32_t len, uint32_t *data,
+ bool write)
+{
+ struct pci_virt_device *pvd = dev;
+ struct npu2_dev *ndev = (struct npu2_dev *) pvd->data;
+
+ if (write)
+ return npu2_cfg_write_bar(ndev, pcrf, offset, len, *data);
+
+ return npu2_cfg_read_bar(ndev, pcrf, offset, len, data);
+}
+
+static int64_t npu2_dev_cfg_exp_devcap(void *dev,
+ struct pci_cfg_reg_filter *pcrf __unused,
+ uint32_t offset, uint32_t size,
+ uint32_t *data, bool write)
+{
+ struct pci_virt_device *pvd = dev;
+ struct npu2_dev *ndev = pvd->data;
+ int rc;
+
+ assert(write);
+
+ if ((size != 2) || (offset & 1)) {
+ /* Short config writes are not supported */
+ prlog(PR_ERR, "NPU%d: Unsupported write to pcie control register\n",
+ ndev->nvlink.phb->opal_id);
+ return OPAL_PARAMETER;
+ }
+
+ if (*data & PCICAP_EXP_DEVCTL_FUNC_RESET)
+ npu2_dev_procedure_reset(ndev);
+
+ rc = purge_l2_l3_caches();
+ if (rc)
+ return rc;
+
+ return OPAL_PARTIAL;
+}
+
+#define NPU2_CFG_READ(size, type) \
+static int64_t npu2_cfg_read##size(struct phb *phb, uint32_t bdfn, \
+ uint32_t offset, type *data) \
+{ \
+ uint32_t val; \
+ int64_t ret; \
+ \
+ ret = pci_virt_cfg_read(phb, bdfn, offset, \
+ sizeof(*data), &val); \
+ *data = (type)val; \
+ return ret; \
+}
+#define NPU2_CFG_WRITE(size, type) \
+static int64_t npu2_cfg_write##size(struct phb *phb, uint32_t bdfn, \
+ uint32_t offset, type data) \
+{ \
+ uint32_t val = data; \
+ int64_t ret; \
+ \
+ ret = pci_virt_cfg_write(phb, bdfn, offset, \
+ sizeof(data), val); \
+ return ret; \
+}
+
+NPU2_CFG_READ(8, u8);
+NPU2_CFG_READ(16, u16);
+NPU2_CFG_READ(32, u32);
+NPU2_CFG_WRITE(8, u8);
+NPU2_CFG_WRITE(16, u16);
+NPU2_CFG_WRITE(32, u32);
+
+static int __npu2_dev_bind_pci_dev(struct phb *phb __unused,
+ struct pci_device *pd,
+ void *data)
+{
+ struct npu2_dev *dev = data;
+ struct dt_node *pci_dt_node;
+ char *pcislot;
+
+ /* Ignore non-NVIDIA PCI devices */
+ if ((pd->vdid & 0xffff) != 0x10de)
+ return 0;
+
+ /* Find the PCI device's slot location */
+ for (pci_dt_node = pd->dn;
+ pci_dt_node && !dt_find_property(pci_dt_node, "ibm,loc-code");
+ pci_dt_node = pci_dt_node->parent);
+
+ if (!pci_dt_node)
+ return 0;
+
+ pcislot = (char *)dt_prop_get(pci_dt_node, "ibm,loc-code");
+
+ NPU2DEVDBG(dev, "Comparing GPU '%s' and NPU2 '%s'\n",
+ pcislot, dev->nvlink.slot_label);
+
+ if (streq(pcislot, dev->nvlink.slot_label))
+ return 1;
+
+ return 0;
+}
+
+static int64_t npu2_gpu_bridge_sec_bus_reset(void *dev,
+ struct pci_cfg_reg_filter *pcrf __unused,
+ uint32_t offset, uint32_t len,
+ uint32_t *data, bool write)
+{
+ struct pci_device *pd = dev;
+ struct pci_device *gpu;
+ struct phb *npphb;
+ struct npu2 *npu;
+ struct dt_node *np;
+ struct npu2_dev *ndev;
+ int i;
+
+ assert(write);
+
+ if ((len != 2) || (offset & 1)) {
+ /* Short config writes are not supported */
+ PCIERR(pd->phb, pd->bdfn,
+ "Unsupported write to bridge control register\n");
+ return OPAL_PARAMETER;
+ }
+
+ gpu = list_top(&pd->children, struct pci_device, link);
+ if (gpu && (*data & PCI_CFG_BRCTL_SECONDARY_RESET)) {
+ int64_t rc;
+
+ dt_for_each_compatible(dt_root, np, "ibm,power9-npu-pciex") {
+ npphb = pci_get_phb(dt_prop_get_cell(np,
+ "ibm,opal-phbid", 1));
+ if (!npphb || npphb->phb_type != phb_type_npu_v2)
+ continue;
+
+ npu = phb_to_npu2_nvlink(npphb);
+ for (i = 0; i < npu->total_devices; ++i) {
+ ndev = &npu->devices[i];
+ if (ndev->nvlink.pd == gpu)
+ npu2_dev_procedure_reset(ndev);
+ }
+ }
+
+ rc = purge_l2_l3_caches();
+ if (rc)
+ return rc;
+ }
+
+ return OPAL_PARTIAL;
+}
+
+static void npu2_dev_bind_pci_dev(struct npu2_dev *dev)
+{
+ struct phb *phb;
+ uint32_t i;
+
+ if (dev->nvlink.pd)
+ return;
+
+ for (i = 0; i < 64; i++) {
+ if (dev->npu->phb_nvlink.opal_id == i)
+ continue;
+
+ phb = pci_get_phb(i);
+ if (!phb)
+ continue;
+
+ dev->nvlink.pd = pci_walk_dev(phb, NULL, __npu2_dev_bind_pci_dev, dev);
+ if (dev->nvlink.pd) {
+ dev->nvlink.phb = phb;
+ /* Found the device, set the bit in config space */
+ npu2_set_link_flag(dev, NPU2_DEV_PCI_LINKED);
+
+ /*
+ * We define a custom sec bus reset handler for a slot
+ * with an NVLink-connected GPU to prevent HMIs which
+ * will otherwise happen if we reset the GPU before
+ * resetting the NVLinks.
+ */
+ if (dev->nvlink.pd->parent &&
+ dev->nvlink.pd->parent->slot)
+ pci_add_cfg_reg_filter(dev->nvlink.pd->parent,
+ PCI_CFG_BRCTL, 2,
+ PCI_REG_FLAG_WRITE,
+ npu2_gpu_bridge_sec_bus_reset);
+ return;
+ }
+ }
+
+ NPU2DEVINF(dev, "No PCI device found for slot '%s'\n",
+ dev->nvlink.slot_label);
+}
+
+static struct lock pci_npu_phandle_lock = LOCK_UNLOCKED;
+
+static void npu2_append_phandle(struct dt_node *dn,
+ u32 phandle)
+{
+ struct dt_property *prop;
+ uint32_t *npu_phandles;
+ size_t len;
+
+ /*
+ * Use a lock to make sure no one else has a reference to an
+ * ibm,npu property (this assumes this is the only function
+ * that holds a reference to it)
+ */
+ lock(&pci_npu_phandle_lock);
+
+ /* This function shouldn't be called unless ibm,npu exists */
+ prop = (struct dt_property *)dt_require_property(dn, "ibm,npu", -1);
+
+ /* Need to append to the properties */
+ len = prop->len + sizeof(*npu_phandles);
+ dt_resize_property(&prop, len);
+
+ npu_phandles = (uint32_t *)prop->prop;
+ npu_phandles[len / sizeof(*npu_phandles) - 1] = phandle;
+ unlock(&pci_npu_phandle_lock);
+}
+
+static struct dt_node *npu2_create_memory_dn(uint64_t addr, uint64_t size)
+{
+ struct dt_node *mem;
+ static u32 chip_id = 255;
+
+ mem = dt_find_by_name_addr(dt_root, "memory", addr);
+ if (mem)
+ return mem;
+
+ mem = dt_new_addr(dt_root, "memory", addr);
+ if (!mem)
+ return NULL;
+ dt_add_property_string(mem, "device_type", "memory");
+ dt_add_property_string(mem, "compatible", "ibm,coherent-device-memory");
+ dt_add_property_u64s(mem, "reg", addr, size);
+ dt_add_property_cells(mem, "ibm,chip-id", chip_id);
+ dt_add_property_u64s(mem, "linux,usable-memory", addr, 0);
+ dt_add_property_cells(mem, "ibm,associativity", 4, chip_id, chip_id, chip_id, chip_id);
+ chip_id--;
+
+ assert(chip_id);
+ return mem;
+}
+
+/* There are potentially multiple links per GPU, so look up the GPU memory based
+ * on the bdfn. */
+static void npu2_get_gpu_base(struct npu2_dev *ndev, uint64_t *addr, uint64_t *size)
+{
+ struct npu2 *p = ndev->npu;
+ int group;
+
+ group = PCI_DEV(ndev->bdfn);
+ phys_map_get(ndev->npu->chip_id, p->gpu_map_type, group, addr, size);
+}
+
+static void npu2_dn_fixup_gmb(struct dt_node *pd_dn, struct npu2_dev *ndev)
+{
+ uint64_t gpu_base, gpu_size, gta;
+ struct dt_node *mem_dn;
+
+ npu2_get_gpu_base(ndev, &gpu_base, &gpu_size);
+ mem_dn = npu2_create_memory_dn(gpu_base, gpu_size);
+ assert(mem_dn);
+ dt_add_property_cells(pd_dn, "memory-region", mem_dn->phandle);
+
+ /* Coral mode address compression. This is documented in Figure 3.5,
+ * "P9->GPU RA Compression (Coral)", of the NPU2 workbook. */
+ gta = ((gpu_base >> 42) & 0x1) << 42;
+ gta |= ((gpu_base >> 45) & 0x3) << 43;
+ gta |= ((gpu_base >> 49) & 0x3) << 45;
+ gta |= gpu_base & ((1UL << 43) - 1);
+
+ dt_add_property_u64s(pd_dn, "ibm,device-tgt-addr", gta);
+}
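+
+/*
+ * The compression above can be read as follows (bits counted from the
+ * least significant bit): GTA bits 0-42 are a straight copy of real
+ * address bits 0-42, RA bits 45-46 move down to GTA bits 43-44 and RA
+ * bits 49-50 move down to GTA bits 45-46; RA bits 43-44 and 47-48 are
+ * dropped.
+ */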
+
+static int npu2_assign_gmb(struct npu2_dev *ndev)
+{
+ struct npu2 *p = ndev->npu;
+ int peers, mode;
+ uint32_t bdfn;
+ uint64_t base, size, reg, val, gmb;
+
+ /* Need to work out the number of link peers. This amounts to
+ * working out the maximum function number. So start at the
+ * highest bdfn (fn = 6) and count back until we find an
+ * npu2_dev. */
+ for (bdfn = (ndev->bdfn & ~0x7) | NPU2_LINKS_PER_CHIP;
+ PCI_FUNC(bdfn) != 0x7; bdfn = (bdfn & ~0x7) | (PCI_FUNC(bdfn) - 1))
+ if (npu2_bdf_to_dev(p, bdfn))
+ break;
+ peers = PCI_FUNC(bdfn);
+
+ npu2_get_gpu_base(ndev, &base, &size);
+
+ NPU2DBG(p, "Setting BAR region dt:%llx\n", base);
+ val = SETFIELD(NPU2_MEM_BAR_EN, 0ULL, 1);
+ val = SETFIELD(NPU2_MEM_BAR_SEL_MEM, val, base >> (63-14));
+ val = SETFIELD(NPU2_MEM_BAR_GROUP, val, base >> (63-18));
+ val = SETFIELD(NPU2_MEM_BAR_CHIP, val, base >> (63-21));
+ val = SETFIELD(NPU2_MEM_BAR_NODE_ADDR, val, base >> (63-33));
+ val = SETFIELD(NPU2_MEM_BAR_POISON, val, 1);
+ val = SETFIELD(NPU2_MEM_BAR_GRANULE, val, 0);
+
+ /* We don't know how much memory the GPU has, so we may as well just
+ * pass the whole aperture through at this point. */
+ val = SETFIELD(NPU2_MEM_BAR_BAR_SIZE, val, ilog2(size >> 30));
+
+ switch (peers) {
+ case 0:
+ mode = 0;
+ break;
+ case 1:
+ mode = 1;
+ break;
+ case 2:
+ mode = 3;
+ break;
+ case 3:
+ mode = 6;
+ break;
+ case 5:
+ mode = 10;
+ break;
+ default:
+ /* Hardware does not support this configuration */
+ assert(0);
+ }
+
+ mode += PCI_FUNC(ndev->bdfn);
+ val = SETFIELD(NPU2_MEM_BAR_MODE, val, mode);
+
+ gmb = NPU2_GPU0_MEM_BAR;
+ if (NPU2DEV_BRICK(ndev))
+ gmb = NPU2_GPU1_MEM_BAR;
+
+ reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_0 + NPU2DEV_STACK(ndev),
+ NPU2_BLOCK_SM_0, gmb);
+
+ npu2_write(p, reg, val);
+ reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_0 + NPU2DEV_STACK(ndev),
+ NPU2_BLOCK_SM_1, gmb);
+ npu2_write(p, reg, val);
+ reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_0 + NPU2DEV_STACK(ndev),
+ NPU2_BLOCK_SM_2, gmb);
+ npu2_write(p, reg, val);
+ reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_0 + NPU2DEV_STACK(ndev),
+ NPU2_BLOCK_SM_3, gmb);
+ npu2_write(p, reg, val);
+
+ return 0;
+}
+
+static int npu2_dn_fixup(struct phb *phb,
+ struct pci_device *pd,
+ void *data __unused)
+{
+ struct npu2 *p = phb_to_npu2_nvlink(phb);
+ struct npu2_dev *dev;
+ uint32_t speed;
+ const char *label;
+
+ dev = npu2_bdf_to_dev(p, pd->bdfn);
+ assert(dev);
+ if (dev->nvlink.phb || dev->nvlink.pd)
+ return 0;
+
+ npu2_assign_gmb(dev);
+ npu2_dn_fixup_gmb(pd->dn, dev);
+ dt_add_property_cells(pd->dn, "ibm,nvlink", dev->dt_node->phandle);
+
+ /*
+ * NVLink supports multiple speeds and device drivers need to know what
+ * speed has been set by firmware. Hostboot does the inits that set the
+ * link speed and tells us via HDAT, and we need to copy that from the
+ * link node.
+ */
+ speed = dt_prop_get_u32_def(dev->dt_node, "nvidia,link-speed", 0xff);
+ if (speed != 0xff)
+ dt_add_property_cells(pd->dn, "ibm,nvlink-speed", speed);
+
+ /*
+ * NPU2 devices have a slot label that indicates which GPU slot
+ * this NPU is connected to. Add a location code to the NVlink
+ * device node based on the slot label.
+ */
+ label = dt_prop_get_def(dev->dt_node, "ibm,slot-label", NULL);
+ if (!label) {
+ /**
+ * @fwts-label NPUNoPHBSlotLabel
+ * @fwts-advice No GPU/NPU2 slot information was found.
+ * NVLink2 functionality will not work.
+ */
+ prlog(PR_ERR, "NPU: Cannot find GPU slot information\n");
+ return 0;
+ }
+ dt_add_property_string(pd->dn, "ibm,loc-code", label);
+
+ dev->nvlink.slot_label = label;
+
+ /*
+ * Bind the emulated PCI device with the real one, which can't
+ * be done until the PCI devices are populated. Once the real
+ * PCI device is identified, we also need to fix the device-tree
+ * for it.
+ */
+ npu2_dev_bind_pci_dev(dev);
+ if (dev->nvlink.phb && dev->nvlink.pd && dev->nvlink.pd->dn) {
+ if (dt_find_property(dev->nvlink.pd->dn, "ibm,npu"))
+ npu2_append_phandle(dev->nvlink.pd->dn, pd->dn->phandle);
+ else
+ dt_add_property_cells(dev->nvlink.pd->dn, "ibm,npu", pd->dn->phandle);
+
+ dt_add_property_cells(pd->dn, "ibm,gpu", dev->nvlink.pd->dn->phandle);
+ dev->nvlink.gpu_bdfn = dev->nvlink.pd->bdfn;
+ }
+
+ return 0;
+}
+
+static int npu2_links_per_gpu(struct phb *phb,
+ struct pci_device *pd,
+ void *data)
+{
+ struct npu2 *p = phb_to_npu2_nvlink(phb);
+ struct npu2_dev *dev;
+ int *nlinks = (int *)data;
+
+ dev = npu2_bdf_to_dev(p, pd->bdfn);
+ assert(dev);
+
+ if (dev->nvlink.phb && dev->nvlink.pd && dev->nvlink.pd->dn) {
+ const struct dt_property *prop;
+ int n;
+
+ /* The link count is the number of phandles in "ibm,npu" */
+ prop = dt_find_property(dev->nvlink.pd->dn, "ibm,npu");
+ if (!prop)
+ return 0;
+
+ /* Count could vary by GPU, so find the max */
+ n = prop->len / sizeof(uint32_t);
+ if (n > *nlinks)
+ *nlinks = n;
+ }
+
+ return 0;
+}
+
+static void npu2_phb_fixup_scominit(struct dt_node *dn, int links_per_gpu)
+{
+ uint32_t gcid = dt_get_chip_id(dn);
+ uint64_t val, mask;
+
+ /*
+ * MRBSP settings for 2- and 3-link GPU systems. These can improve
+ * GPU peer-to-peer fully ordered write performance.
+ */
+ if (links_per_gpu == 3) {
+ val = PPC_BIT(30) | PPC_BIT(34) | PPC_BIT(36) | PPC_BIT(37) |
+ PPC_BIT(44) | PPC_BIT(45);
+ mask = PPC_BITMASK(28,39) | PPC_BITMASK(44,47);
+ } else if (links_per_gpu == 2) {
+ val = PPC_BIT(46) | PPC_BIT(47);
+ mask = PPC_BITMASK(44,47);
+ } else
+ return;
+
+ xscom_write_mask(gcid, 0x50110c0, val, mask);
+ xscom_write_mask(gcid, 0x50112c0, val, mask);
+ xscom_write_mask(gcid, 0x50114c0, val, mask);
+}
+
+static void npu2_phb_final_fixup(struct phb *phb)
+{
+ int links_per_gpu = 0;
+ struct dt_node *np;
+
+ pci_walk_dev(phb, NULL, npu2_dn_fixup, NULL);
+
+ /*
+ * Now that the emulated devices are bound to the real ones, we can
+ * determine links_per_gpu and do some final init.
+ */
+ pci_walk_dev(phb, NULL, npu2_links_per_gpu, &links_per_gpu);
+ dt_for_each_compatible(dt_root, np, "ibm,power9-npu")
+ npu2_phb_fixup_scominit(np, links_per_gpu);
+}
+
+static void npu2_init_ioda_cache(struct npu2 *p)
+{
+ /* TVT */
+ memset(p->tve_cache, 0, sizeof(p->tve_cache));
+}
+
+static int64_t npu2_ioda_reset(struct phb *phb, bool purge)
+{
+ struct npu2 *p = phb_to_npu2_nvlink(phb);
+ uint32_t i;
+
+ if (purge) {
+ NPU2DBG(p, "Purging all IODA tables...\n");
+ npu2_init_ioda_cache(p);
+ }
+
+ /* TVT */
+ npu2_ioda_sel(p, NPU2_ATS_IODA_TBL_TVT, 0, true);
+ for (i = 0; i < ARRAY_SIZE(p->tve_cache); i++)
+ out_be64(p->regs + NPU2_ATS_IODA_DATA, p->tve_cache[i]);
+
+ return OPAL_SUCCESS;
+}
+
+static void npu2_write_mcd(struct npu2 *p, uint64_t pcb_addr, uint64_t addr,
+ uint64_t size)
+{
+ uint64_t val;
+
+ NPU2DBG(p, "Setting MCD addr:%llx\n", pcb_addr);
+ assert(is_pow2(size));
+
+ val = MCD_BANK_CN_VALID;
+ val = SETFIELD(MCD_BANK_CN_SIZE, val, (size >> 25) - 1);
+ val = SETFIELD(MCD_BANK_CN_ADDR, val, addr >> 25);
+ xscom_write(p->chip_id, pcb_addr, val);
+}
+
+static void npu2_mcd_init(struct npu2 *p)
+{
+ int i;
+ uint64_t size, addr, gpu_min_addr, gpu_max_addr, total_size;
+
+ /* Init memory cache directory (MCD) registers. */
+ phys_map_get(p->chip_id, p->gpu_map_type, NPU2_LINKS_PER_CHIP - 1,
+ &gpu_min_addr, NULL);
+ phys_map_get(p->chip_id, p->gpu_map_type, 0, &gpu_max_addr, &size);
+ gpu_max_addr += size;
+
+ /* We assume GPU memory is contiguous from the first possible GPU to the
+ * last and that the size is the same, so it's best to check that. */
+ for (i = 0; i < NPU2_LINKS_PER_CHIP; i++) {
+ uint64_t tmp;
+ phys_map_get(p->chip_id, p->gpu_map_type, i, &addr, &tmp);
+ assert((addr >= gpu_min_addr) && (addr + tmp <= gpu_max_addr));
+ assert(tmp == size);
+ }
+
+ /* We have two MCDs, so if necessary we can split the region covered
+ * across both when total_size is not a power of two. */
+ total_size = gpu_max_addr - gpu_min_addr;
+ size = 1ull << ilog2(total_size);
+
+ /* Allocate the biggest chunk first as we assume gpu_max_addr has the
+ * highest alignment. */
+ addr = gpu_max_addr - size;
+ npu2_write_mcd(p, MCD0_BANK0_CN3, addr, size);
+ total_size -= size;
+ if (total_size) {
+ /* total_size was not a power of two, but the remainder should
+ * be if all GPUs were assigned the same size. */
+ assert(is_pow2(total_size));
+ size = 1ull << ilog2(total_size);
+ addr -= size;
+ assert(addr <= gpu_min_addr);
+ npu2_write_mcd(p, MCD1_BANK0_CN3, addr, size);
+ }
+}
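+
+/*
+ * A hypothetical example of the split above: with six GPUs of 64GB
+ * each the region is 384GB, which is not a power of two, so MCD0
+ * covers the top 256GB and MCD1 the 128GB below it. Each MCD bank is
+ * programmed in 32MB units, since npu2_write_mcd() encodes the size
+ * as (size >> 25) - 1 and the address as addr >> 25.
+ */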
+
+static void npu2_hw_init(struct npu2 *p)
+{
+ uint64_t reg, val;
+ int s, b;
+
+ npu2_ioda_reset(&p->phb_nvlink, false);
+
+ /* Enable XTS retry mode */
+ val = npu2_read(p, NPU2_XTS_CFG);
+ npu2_write(p, NPU2_XTS_CFG, val | NPU2_XTS_CFG_MMIOSD | NPU2_XTS_CFG_TRY_ATR_RO);
+
+ val = npu2_read(p, NPU2_XTS_CFG2);
+ npu2_write(p, NPU2_XTS_CFG2, val | NPU2_XTS_CFG2_NO_FLUSH_ENA);
+
+ /*
+ * There are three different ways we configure the MCD and memory map.
+ * 1) Old way
+ * Skiboot configures the MCD and puts GPUs at 4TB and below
+ * 2) New way with MCD
+ * Hostboot configures the MCD and skiboot puts GPU at 4TB and above
+ * 3) New way without MCD
+ * No one configures the MCD and skiboot puts GPU at 4TB and below
+ *
+ * 1) Will go away eventually as it's a configuration that can
+ * cause an xstop or data integrity problems. We are keeping
+ * it around to support existing hostboot. Print an error
+ * message if used.
+ * 2) Is for smaller memory configurations and will be used
+ * initially for GPUs on Witherspoon. Supports only up to
+ * 512GB of memory and 4 GPUs per socket.
+ * 3) Is for fully populated configurations of 4TB of memory
+ * and 6 GPUs per socket. May have performance impacts.
+ *
+ * The different configurations can be detected via the following scoms:
+ * 1) 0x5011c0c bit 2 = 1, 0x5011c0a bits 42:48 = 0
+ * 2) 0x5011c0c bit 2 = 1, 0x5011c0a bits 42:48 = 7
+ * 3) 0x5011c0c bit 2 = 0, 0x5011c0a bits 42:48 = 0
+ */
+
+ /* Get 0x05011c0c bit 2 = 1 */
+ xscom_read(p->chip_id, PB_CENT_HP_MODE_CURR, &val);
+ if ((val & PB_CFG_CHG_RATE_GP_MASTER) != 0) {
+ /* Get 0x05011c0a bits 42:48 */
+ xscom_read(p->chip_id, PB_CENT_MODE, &val);
+ if (GETFIELD(PB_CFG_CHIP_ADDR_EXTENSION_MASK_CENT, val) == 0) {
+ /* 1) */
+ NPU2DBG(p, "Using old memory map + MCD enabled in skiboot\n");
+ NPU2ERR(p, "!!! Old firmware detected. Update hostboot for new MCD mapping !!!\n");
+ p->gpu_map_type = GPU_MEM_4T_DOWN;
+ npu2_mcd_init(p);
+ } else if (GETFIELD(PB_CFG_CHIP_ADDR_EXTENSION_MASK_CENT, val) == 7) {
+ /* 2) */
+ NPU2DBG(p, "Using small memory map + MCD enabled\n");
+ p->gpu_map_type = GPU_MEM_4T_UP;
+ } else
+ NPU2ERR(p, "!!! Unsupported NPU2 configuration. "
+ "0x%llx!!!\n", val);
+ } else {
+ /* 3) */
+ NPU2DBG(p, "Using large memory map + MCD disabled\n");
+ p->gpu_map_type = GPU_MEM_4T_DOWN;
+ }
+
+ /* Static initialization of every relaxed-ordering cfg[2] register */
+ val = NPU2_RELAXED_ORDERING_CMD_CL_DMA_W |
+ NPU2_RELAXED_ORDERING_CMD_CL_DMA_W_HP |
+ NPU2_RELAXED_ORDERING_CMD_CL_DMA_INJ |
+ NPU2_RELAXED_ORDERING_CMD_PR_DMA_INJ |
+ NPU2_RELAXED_ORDERING_CMD_DMA_PR_W |
+ NPU2_RELAXED_ORDERING_CMD_CL_RD_NC_F0 |
+ NPU2_RELAXED_ORDERING_SOURCE4_RDENA;
+
+ for (s = NPU2_STACK_STCK_0; s <= NPU2_STACK_STCK_2; s++) {
+ for (b = NPU2_BLOCK_SM_0; b <= NPU2_BLOCK_SM_3; b++) {
+ reg = NPU2_REG_OFFSET(s, b, NPU2_RELAXED_ORDERING_CFG(2));
+ npu2_write(p, reg, val);
+ }
+ }
+}
+
+static int64_t npu2_map_pe_dma_window_real(struct phb *phb,
+ uint64_t pe_num,
+ uint16_t window_id,
+ uint64_t pci_start_addr __unused,
+ uint64_t pci_mem_size __unused)
+{
+ struct npu2 *p = phb_to_npu2_nvlink(phb);
+ uint64_t tve;
+
+ /* Sanity check. Each PE has one corresponding TVE */
+ if (pe_num >= NPU2_MAX_PE_NUM ||
+ window_id != pe_num)
+ return OPAL_PARAMETER;
+
+ if (pci_mem_size) {
+ /* GPUs need to be able to access the MMIO memory space as well.
+ * On POWER9 this is above the top of RAM, so disable the TVT
+ * range check, allowing access to all memory addresses. */
+ tve = 0;
+ } else {
+ /* Disable */
+ tve = PPC_BIT(51);
+ }
+
+ npu2_ioda_sel(p, NPU2_ATS_IODA_TBL_TVT, window_id, false);
+ out_be64(p->regs + NPU2_ATS_IODA_DATA, tve);
+ p->tve_cache[window_id] = tve;
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t npu2_map_pe_dma_window(struct phb *phb,
+ uint64_t pe_num,
+ uint16_t window_id,
+ uint16_t tce_levels,
+ uint64_t tce_table_addr,
+ uint64_t tce_table_size,
+ uint64_t tce_page_size)
+{
+ struct npu2 *p = phb_to_npu2_nvlink(phb);
+ uint64_t tts_encoded;
+ uint64_t data64 = 0;
+
+ /* Sanity check. Each PE has one corresponding TVE */
+ if (pe_num >= NPU2_MAX_PE_NUM ||
+ window_id != pe_num)
+ return OPAL_PARAMETER;
+
+ /*
+ * Special condition: a zero TCE table size is used to disable
+ * the TVE.
+ */
+ if (!tce_table_size) {
+ npu2_ioda_sel(p, NPU2_ATS_IODA_TBL_TVT, window_id, false);
+ out_be64(p->regs + NPU2_ATS_IODA_DATA, 0ul);
+ p->tve_cache[window_id] = 0ul;
+ return OPAL_SUCCESS;
+ }
+
+ /* Additional arguments validation */
+ if (tce_levels < 1 ||
+ tce_levels > 4 ||
+ !is_pow2(tce_table_size) ||
+ tce_table_size < 0x1000)
+ return OPAL_PARAMETER;
+
+ /* TCE table size */
+ data64 = SETFIELD(NPU2_ATS_IODA_TBL_TVT_TTA, 0ul, tce_table_addr >> 12);
+ tts_encoded = ilog2(tce_table_size) - 11;
+ if (tts_encoded > 39)
+ return OPAL_PARAMETER;
+ data64 = SETFIELD(NPU2_ATS_IODA_TBL_TVT_SIZE, data64, tts_encoded);
+
+ /* TCE page size */
+ switch (tce_page_size) {
+ case 0x10000: /* 64K */
+ data64 = SETFIELD(NPU2_ATS_IODA_TBL_TVT_PSIZE, data64, 5);
+ break;
+ case 0x1000000: /* 16M */
+ data64 = SETFIELD(NPU2_ATS_IODA_TBL_TVT_PSIZE, data64, 13);
+ break;
+ case 0x10000000: /* 256M */
+ data64 = SETFIELD(NPU2_ATS_IODA_TBL_TVT_PSIZE, data64, 17);
+ break;
+ case 0x1000: /* 4K */
+ default:
+ data64 = SETFIELD(NPU2_ATS_IODA_TBL_TVT_PSIZE, data64, 1);
+ }
+
+ /* Number of levels */
+ data64 = SETFIELD(NPU2_ATS_IODA_TBL_TVT_LEVEL, data64, tce_levels - 1);
+
+ /* Update to hardware */
+ npu2_ioda_sel(p, NPU2_ATS_IODA_TBL_TVT, window_id, false);
+ out_be64(p->regs + NPU2_ATS_IODA_DATA, data64);
+ p->tve_cache[window_id] = data64;
+
+ return OPAL_SUCCESS;
+}
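+
+/*
+ * For illustration, both TVT size fields above follow an
+ * ilog2(x) - 11 encoding: a 4KB TCE table (or page) encodes as 1, a
+ * 64KB one as 5, 16MB as 13 and 256MB as 17. npu2_tce_kill() below
+ * reverses the page-size encoding with 1 << (11 + PSIZE) when
+ * validating OPAL_PCI_TCE_KILL_PAGES requests.
+ */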
+
+static int64_t npu2_set_pe(struct phb *phb,
+ uint64_t pe_num,
+ uint64_t bdfn,
+ uint8_t bcompare,
+ uint8_t dcompare,
+ uint8_t fcompare,
+ uint8_t action)
+{
+ struct npu2 *p;
+ struct npu2_dev *dev;
+ uint64_t reg, val;
+
+ /* Sanity check */
+ if (action != OPAL_MAP_PE && action != OPAL_UNMAP_PE)
+ return OPAL_PARAMETER;
+ if (pe_num >= NPU2_MAX_PE_NUM)
+ return OPAL_PARAMETER;
+ if (bdfn >> 8)
+ return OPAL_PARAMETER;
+ if (bcompare != OpalPciBusAll ||
+ dcompare != OPAL_COMPARE_RID_DEVICE_NUMBER ||
+ fcompare != OPAL_COMPARE_RID_FUNCTION_NUMBER)
+ return OPAL_UNSUPPORTED;
+ if (phb->phb_type != phb_type_npu_v2)
+ return OPAL_PARAMETER;
+
+ p = phb_to_npu2_nvlink(phb);
+ if (!p)
+ return OPAL_PARAMETER;
+
+ dev = npu2_bdf_to_dev(p, bdfn);
+ if (!dev)
+ return OPAL_PARAMETER;
+
+ val = NPU2_CQ_BRICK_BDF2PE_MAP_ENABLE;
+ val = SETFIELD(NPU2_CQ_BRICK_BDF2PE_MAP_PE, val, pe_num);
+ val = SETFIELD(NPU2_CQ_BRICK_BDF2PE_MAP_BDF, val, dev->nvlink.gpu_bdfn);
+
+ if (!NPU2DEV_BRICK(dev))
+ reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_0 + dev->brick_index/2,
+ NPU2_BLOCK_CTL, NPU2_CQ_BRICK0_BDF2PE_MAP0);
+ else
+ reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_0 + dev->brick_index/2,
+ NPU2_BLOCK_CTL, NPU2_CQ_BRICK1_BDF2PE_MAP0);
+
+ npu2_write(p, reg, val);
+ val = NPU2_MISC_BRICK_BDF2PE_MAP_ENABLE;
+ val = SETFIELD(NPU2_MISC_BRICK_BDF2PE_MAP_PE, val, pe_num);
+ val = SETFIELD(NPU2_MISC_BRICK_BDF2PE_MAP_BDF, val, dev->nvlink.gpu_bdfn);
+ reg = NPU2_REG_OFFSET(NPU2_STACK_MISC, NPU2_BLOCK_MISC,
+ NPU2_MISC_BRICK0_BDF2PE_MAP0 + (dev->brick_index * 0x18));
+ npu2_write(p, reg, val);
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t npu2_get_link_state(struct pci_slot *slot __unused, uint8_t *val)
+{
+ /*
+ * As we're emulating all PCI stuff, the link bandwidth
+ * isn't a big deal anyway.
+ */
+ *val = OPAL_SHPC_LINK_UP_x1;
+ return OPAL_SUCCESS;
+}
+
+static int64_t npu2_get_power_state(struct pci_slot *slot __unused, uint8_t *val)
+{
+ *val = PCI_SLOT_POWER_ON;
+ return OPAL_SUCCESS;
+}
+
+static int64_t npu2_hreset(struct pci_slot *slot __unused)
+{
+ struct npu2 *p;
+ int i;
+ struct npu2_dev *ndev;
+
+ p = phb_to_npu2_nvlink(slot->phb);
+ NPU2INF(p, "Hreset PHB state\n");
+
+ for (i = 0; i < p->total_devices; i++) {
+ ndev = &p->devices[i];
+ if (ndev) {
+ NPU2DEVINF(ndev, "Resetting device\n");
+ reset_ntl(ndev);
+ }
+ }
+ return purge_l2_l3_caches();
+}
+
+static int64_t npu2_freset(struct pci_slot *slot __unused)
+{
+ return OPAL_SUCCESS;
+}
+
+static int64_t npu2_creset(struct pci_slot *slot)
+{
+ struct npu2 *p;
+ int i;
+ struct npu2_dev *ndev;
+
+ p = phb_to_npu2_nvlink(slot->phb);
+ NPU2INF(p, "Creset PHB state\n");
+
+ for (i = 0; i < p->total_devices; i++) {
+ ndev = &p->devices[i];
+ if (ndev) {
+ NPU2DEVINF(ndev, "Resetting device\n");
+ reset_ntl(ndev);
+ }
+ }
+ return OPAL_SUCCESS;
+}
+
+static struct pci_slot *npu2_slot_create(struct phb *phb)
+{
+ struct pci_slot *slot;
+
+ slot = pci_slot_alloc(phb, NULL);
+ if (!slot)
+ return slot;
+
+ /* Elementary functions */
+ slot->ops.get_presence_state = NULL;
+ slot->ops.get_link_state = npu2_get_link_state;
+ slot->ops.get_power_state = npu2_get_power_state;
+ slot->ops.get_attention_state = NULL;
+ slot->ops.get_latch_state = NULL;
+ slot->ops.set_power_state = NULL;
+ slot->ops.set_attention_state = NULL;
+
+ slot->ops.prepare_link_change = NULL;
+ slot->ops.poll_link = NULL;
+ slot->ops.hreset = npu2_hreset;
+ slot->ops.freset = npu2_freset;
+ slot->ops.creset = npu2_creset;
+
+ return slot;
+}
+
+int64_t npu2_freeze_status(struct phb *phb __unused,
+ uint64_t pe_number __unused,
+ uint8_t *freeze_state,
+ uint16_t *pci_error_type,
+ uint16_t *severity)
+{
+ /*
+ * FIXME: When it's called by the skiboot PCI config accessor,
+ * the PE number is fixed to 0, which is incorrect. We need to
+ * introduce another PHB callback to translate it. For now,
+ * it keeps the skiboot PCI enumeration going.
+ */
+ *freeze_state = OPAL_EEH_STOPPED_NOT_FROZEN;
+ *pci_error_type = OPAL_EEH_NO_ERROR;
+ if (severity)
+ *severity = OPAL_EEH_SEV_NO_ERROR;
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t npu2_eeh_next_error(struct phb *phb,
+ uint64_t *first_frozen_pe,
+ uint16_t *pci_error_type,
+ uint16_t *severity)
+{
+ struct npu2 *p = phb_to_npu2_nvlink(phb);
+ int i;
+ uint64_t result = 0;
+
+ if (!first_frozen_pe || !pci_error_type || !severity)
+ return OPAL_PARAMETER;
+
+ *first_frozen_pe = -1;
+ *pci_error_type = OPAL_EEH_NO_ERROR;
+ *severity = OPAL_EEH_SEV_NO_ERROR;
+
+ for (i = 0; i < NPU2_MAX_PE_NUM; i++) {
+ result = npu2_read(p, NPU2_MISC_PESTB(i));
+ if (result > 0) {
+ *first_frozen_pe = i;
+ *pci_error_type = OPAL_EEH_PE_ERROR;
+ *severity = OPAL_EEH_SEV_PE_ER;
+ break;
+ }
+ }
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t npu2_tce_kill(struct phb *phb, uint32_t kill_type,
+ uint64_t pe_number, uint32_t tce_size,
+ uint64_t dma_addr, uint32_t npages)
+{
+ struct npu2 *npu = phb_to_npu2_nvlink(phb);
+ uint32_t tce_page_size;
+ uint64_t val;
+
+ if (pe_number > NPU2_MAX_PE_NUM)
+ return OPAL_PARAMETER;
+
+ sync();
+ switch(kill_type) {
+ case OPAL_PCI_TCE_KILL_PAGES:
+ tce_page_size = 1ULL << (
+ 11 + GETFIELD(npu->tve_cache[pe_number],
+ NPU2_ATS_IODA_TBL_TVT_PSIZE));
+ if (tce_page_size != tce_size) {
+ NPU2ERR(npu, "npu2_tce_kill: Unexpected TCE size (got 0x%x expected 0x%x)\n",
+ tce_size, tce_page_size);
+ return OPAL_PARAMETER;
+ }
+
+ if (npages < 128) {
+ while (npages--) {
+ val = SETFIELD(NPU2_ATS_TCE_KILL_PENUM, dma_addr, pe_number);
+ npu2_write(npu, NPU2_ATS_TCE_KILL, NPU2_ATS_TCE_KILL_ONE | val);
+ dma_addr += tce_size;
+ }
+ break;
+ }
+ /*
+ * If there are too many TCEs, do not bother with the loop above and
+ * simply flush everything; it is going to be a lot faster.
+ */
+ /* Fall through */
+ case OPAL_PCI_TCE_KILL_PE:
+ /*
+ * NPU2 doesn't support killing a PE so fall through
+ * and do a kill all instead.
+ */
+ case OPAL_PCI_TCE_KILL_ALL:
+ npu2_write(npu, NPU2_ATS_TCE_KILL, NPU2_ATS_TCE_KILL_ALL);
+ break;
+ default:
+ return OPAL_PARAMETER;
+ }
+
+ return OPAL_SUCCESS;
+}
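+
+/*
+ * Behaviour sketch of the kill path above: fewer than 128 pages are
+ * invalidated one TCE at a time via NPU2_ATS_TCE_KILL_ONE, anything
+ * larger (and the PE and ALL variants) falls through to a single
+ * NPU2_ATS_TCE_KILL_ALL write.
+ */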
+
+static const struct phb_ops npu_ops = {
+ .cfg_read8 = npu2_cfg_read8,
+ .cfg_read16 = npu2_cfg_read16,
+ .cfg_read32 = npu2_cfg_read32,
+ .cfg_write8 = npu2_cfg_write8,
+ .cfg_write16 = npu2_cfg_write16,
+ .cfg_write32 = npu2_cfg_write32,
+ .device_init = NULL,
+ .phb_final_fixup = npu2_phb_final_fixup,
+ .ioda_reset = npu2_ioda_reset,
+ .papr_errinjct_reset = NULL,
+ .pci_reinit = NULL,
+ .set_phb_mem_window = NULL,
+ .phb_mmio_enable = NULL,
+ .map_pe_mmio_window = NULL,
+ .map_pe_dma_window = npu2_map_pe_dma_window,
+ .map_pe_dma_window_real = npu2_map_pe_dma_window_real,
+ .pci_msi_eoi = NULL,
+ .set_xive_pe = NULL,
+ .get_msi_32 = NULL,
+ .get_msi_64 = NULL,
+ .set_pe = npu2_set_pe,
+ .set_peltv = NULL,
+ .eeh_freeze_status = npu2_freeze_status,
+ .eeh_freeze_clear = NULL,
+ .eeh_freeze_set = NULL,
+ .next_error = npu2_eeh_next_error,
+ .err_inject = NULL,
+ .get_diag_data2 = NULL,
+ .set_capi_mode = NULL,
+ .set_capp_recovery = NULL,
+ .tce_kill = npu2_tce_kill,
+};
+
+static void assign_mmio_bars(uint64_t gcid, uint32_t scom, uint64_t reg[2], uint64_t mm_win[2])
+{
+ uint32_t i;
+ struct npu2_bar *bar;
+ struct npu2_bar npu2_bars[] = {
+ /* NPU_REGS must be first in this list */
+ { .type = NPU_REGS, .index = 0,
+ .reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_0, 0, NPU2_PHY_BAR),
+ .flags = NPU2_BAR_FLAG_ENABLED },
+ { .type = NPU_PHY, .index = 0,
+ .reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_1, 0, NPU2_PHY_BAR),
+ .flags = NPU2_BAR_FLAG_ENABLED },
+ { .type = NPU_PHY, .index = 1,
+ .reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_2, 0, NPU2_PHY_BAR),
+ .flags = NPU2_BAR_FLAG_ENABLED },
+ { .type = NPU_NTL, .index = 0,
+ .reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_0, 0, NPU2_NTL0_BAR) },
+ { .type = NPU_NTL, .index = 1,
+ .reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_0, 0, NPU2_NTL1_BAR) },
+ { .type = NPU_NTL, .index = 2,
+ .reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_1, 0, NPU2_NTL0_BAR) },
+ { .type = NPU_NTL, .index = 3,
+ .reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_1, 0, NPU2_NTL1_BAR) },
+ { .type = NPU_NTL, .index = 4,
+ .reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_2, 0, NPU2_NTL0_BAR) },
+ { .type = NPU_NTL, .index = 5,
+ .reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_2, 0, NPU2_NTL1_BAR) },
+ { .type = NPU_GENID, .index = 0,
+ .reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_0, 0, NPU2_GENID_BAR) },
+ { .type = NPU_GENID, .index = 1,
+ .reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_1, 0, NPU2_GENID_BAR) },
+ { .type = NPU_GENID, .index = 2,
+ .reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_2, 0, NPU2_GENID_BAR) },
+ };
+
+ for (i = 0; i < ARRAY_SIZE(npu2_bars); i++) {
+ bar = &npu2_bars[i];
+ npu2_get_bar(gcid, bar);
+ npu2_write_bar(NULL, bar, gcid, scom);
+ }
+
+ /* Global MMIO BAR */
+ reg[0] = npu2_bars[0].base;
+ reg[1] = npu2_bars[0].size;
+
+ /* NTL and GENID BARs are exposed to the kernel via the MM
+ * window */
+ mm_win[0] = npu2_bars[3].base;
+ mm_win[1] = npu2_bars[ARRAY_SIZE(npu2_bars) - 1].base +
+ npu2_bars[ARRAY_SIZE(npu2_bars) - 1].size -
+ mm_win[0];
+}
+
+/*
+ * Set up the NPU for NVLink and create the PCI root device node
+ * accordingly.
+ */
+int npu2_nvlink_init_npu(struct npu2 *npu)
+{
+ struct dt_node *np;
+ uint64_t reg[2], mm_win[2], val, mask;
+
+ /* TODO: Clean this up with register names, etc. when we get
+ * time. This just turns NVLink mode on in each brick and should
+ * get replaced with a patch from ajd once we've worked out how
+ * things are going to work there.
+ *
+ * Obviously if the year is now 2020 that didn't happen and you
+ * should fix this :-) */
+
+ val = PPC_BIT(58);
+ mask = PPC_BIT(58) | /* CONFIG_NVLINK_MODE */
+ PPC_BIT(40); /* CONFIG_ENABLE_SNARF_CPM */
+
+ /*
+ * V100 GPUs are known to violate the NVLink2 protocol if some GPU memory
+ * mapped by a CPU was also "linear-block" mapped by a GPU. When this
+ * happens, it breaks the NPU2 cache coherency state machine and
+ * throws a machine checkstop. Disabling snarfing fixes this, so let's
+ * disable it by default.
+ */
+ if (nvram_query_eq_dangerous("opal-npu2-snarf-cpm", "enable")) {
+ prlog(PR_WARNING, "NPU2#%d: enabling Probe.I.MO snarfing, a bad GPU driver may crash the system!\n",
+ npu->index);
+ val |= PPC_BIT(40); /* CONFIG_ENABLE_SNARF_CPM */
+ }
+
+ xscom_write_mask(npu->chip_id, NPU_STCK0_CS_SM0_MISC_CONFIG0,
+ val, mask);
+ xscom_write_mask(npu->chip_id, NPU_STCK0_CS_SM1_MISC_CONFIG0,
+ val, mask);
+ xscom_write_mask(npu->chip_id, NPU_STCK0_CS_SM2_MISC_CONFIG0,
+ val, mask);
+ xscom_write_mask(npu->chip_id, NPU_STCK0_CS_SM3_MISC_CONFIG0,
+ val, mask);
+ xscom_write_mask(npu->chip_id, NPU_STCK1_CS_SM0_MISC_CONFIG0,
+ val, mask);
+ xscom_write_mask(npu->chip_id, NPU_STCK1_CS_SM1_MISC_CONFIG0,
+ val, mask);
+ xscom_write_mask(npu->chip_id, NPU_STCK1_CS_SM2_MISC_CONFIG0,
+ val, mask);
+ xscom_write_mask(npu->chip_id, NPU_STCK1_CS_SM3_MISC_CONFIG0,
+ val, mask);
+ xscom_write_mask(npu->chip_id, NPU_STCK2_CS_SM0_MISC_CONFIG0,
+ val, mask);
+ xscom_write_mask(npu->chip_id, NPU_STCK2_CS_SM1_MISC_CONFIG0,
+ val, mask);
+ xscom_write_mask(npu->chip_id, NPU_STCK2_CS_SM2_MISC_CONFIG0,
+ val, mask);
+ xscom_write_mask(npu->chip_id, NPU_STCK2_CS_SM3_MISC_CONFIG0,
+ val, mask);
+
+ xscom_write_mask(npu->chip_id, 0x50110c0, PPC_BIT(53), PPC_BIT(53));
+ xscom_write_mask(npu->chip_id, 0x50112c0, PPC_BIT(53), PPC_BIT(53));
+ xscom_write_mask(npu->chip_id, 0x50114c0, PPC_BIT(53), PPC_BIT(53));
+ xscom_write_mask(npu->chip_id, 0x50110f1, PPC_BIT(41), PPC_BIT(41));
+ xscom_write_mask(npu->chip_id, 0x50112f1, PPC_BIT(41), PPC_BIT(41));
+ xscom_write_mask(npu->chip_id, 0x50114f1, PPC_BIT(41), PPC_BIT(41));
+
+ val = NPU2_NTL_MISC_CFG2_BRICK_ENABLE |
+ NPU2_NTL_MISC_CFG2_NDL_TX_PARITY_ENA |
+ NPU2_NTL_MISC_CFG2_NDL_PRI_PARITY_ENA |
+ NPU2_NTL_MISC_CFG2_RCV_CREDIT_OVERFLOW_ENA;
+ xscom_write_mask(npu->chip_id, 0x5011110, val, val);
+ xscom_write_mask(npu->chip_id, 0x5011130, val, val);
+ xscom_write_mask(npu->chip_id, 0x5011310, val, val);
+ xscom_write_mask(npu->chip_id, 0x5011330, val, val);
+ xscom_write_mask(npu->chip_id, 0x5011510, val, val);
+ xscom_write_mask(npu->chip_id, 0x5011530, val, val);
+
+ val = PPC_BIT(6) | PPC_BIT(7) | PPC_BIT(11);
+ xscom_write_mask(npu->chip_id, 0x5011009, val, PPC_BITMASK(6,11));
+ xscom_write_mask(npu->chip_id, 0x5011039, val, PPC_BITMASK(6,11));
+ xscom_write_mask(npu->chip_id, 0x5011069, val, PPC_BITMASK(6,11));
+ xscom_write_mask(npu->chip_id, 0x5011099, val, PPC_BITMASK(6,11));
+ xscom_write_mask(npu->chip_id, 0x5011209, val, PPC_BITMASK(6,11));
+ xscom_write_mask(npu->chip_id, 0x5011239, val, PPC_BITMASK(6,11));
+ xscom_write_mask(npu->chip_id, 0x5011269, val, PPC_BITMASK(6,11));
+ xscom_write_mask(npu->chip_id, 0x5011299, val, PPC_BITMASK(6,11));
+ xscom_write_mask(npu->chip_id, 0x5011409, val, PPC_BITMASK(6,11));
+ xscom_write_mask(npu->chip_id, 0x5011439, val, PPC_BITMASK(6,11));
+ xscom_write_mask(npu->chip_id, 0x5011469, val, PPC_BITMASK(6,11));
+ xscom_write_mask(npu->chip_id, 0x5011499, val, PPC_BITMASK(6,11));
+
+ /* Reassign the BARs */
+ assign_mmio_bars(npu->chip_id, npu->xscom_base, reg, mm_win);
+ npu->regs = (void *)reg[0];
+ npu->mm_base = mm_win[0];
+ npu->mm_size = mm_win[1];
+
+ if (reg[0] && reg[1])
+ prlog(PR_INFO, " Global MMIO BAR: %016llx (%lldMB)\n",
+ reg[0], reg[1] >> 20);
+ else
+ prlog(PR_ERR, " Global MMIO BAR: Disabled\n");
+
+ /* Populate PCI root device node */
+ np = dt_new_addr(dt_root, "pciex", reg[0]);
+ assert(np);
+ dt_add_property_strings(np,
+ "compatible",
+ "ibm,power9-npu-pciex",
+ "ibm,ioda2-npu2-phb");
+ dt_add_property_strings(np, "device_type", "pciex");
+ dt_add_property(np, "reg", reg, sizeof(reg));
+ dt_add_property_cells(np, "ibm,phb-index", npu2_get_phb_index(0));
+ dt_add_property_cells(np, "ibm,npu-index", npu->index);
+ dt_add_property_cells(np, "ibm,chip-id", npu->chip_id);
+ dt_add_property_cells(np, "ibm,xscom-base", npu->xscom_base);
+ dt_add_property_cells(np, "ibm,npcq", npu->dt_node->phandle);
+ dt_add_property_cells(np, "ibm,links", npu->total_devices);
+ dt_add_property(np, "ibm,mmio-window", mm_win, sizeof(mm_win));
+ dt_add_property_cells(np, "ibm,phb-diag-data-size", 0);
+
+ /* Disable fast reboot - not currently supported */
+ disable_fast_reboot("NVLink device enabled");
+
+ npu2_nvlink_create_phb(npu, np);
+
+ return 0;
+}
+
+static uint32_t npu2_populate_pcie_cap(struct npu2_dev *dev,
+ uint32_t start,
+ uint32_t prev_cap)
+{
+ struct pci_virt_device *pvd = dev->nvlink.pvd;
+ uint32_t val;
+
+ /* Add capability list */
+ PCI_VIRT_CFG_INIT_RO(pvd, prev_cap, 1, start);
+ PCI_VIRT_CFG_INIT_RO(pvd, start, 1, PCI_CFG_CAP_ID_EXP);
+
+ /* 0x00 - ID/PCIE capability */
+ val = PCI_CFG_CAP_ID_EXP;
+ val |= ((0x2 << 16) | (PCIE_TYPE_ENDPOINT << 20));
+ PCI_VIRT_CFG_INIT_RO(pvd, start, 4, val);
+
+ /* 0x04 - Device capability
+ *
+ * We should support FLR. Otherwise, it might have
+ * problems passing through to userland via the Linux
+ * VFIO infrastructure.
+ */
+ val = ((PCIE_MPSS_128) |
+ (PCIE_PHANTOM_NONE << 3) |
+ (PCIE_L0SL_MAX_NO_LIMIT << 6) |
+ (PCIE_L1L_MAX_NO_LIMIT << 9) |
+ (PCICAP_EXP_DEVCAP_FUNC_RESET));
+ PCI_VIRT_CFG_INIT_RO(pvd, start + PCICAP_EXP_DEVCAP, 4, val);
+
+ pci_virt_add_filter(pvd, start + PCICAP_EXP_DEVCTL, 2,
+ PCI_REG_FLAG_WRITE,
+ npu2_dev_cfg_exp_devcap, NULL);
+
+ /* 0x08 - Device control and status */
+ PCI_VIRT_CFG_INIT(pvd, start + PCICAP_EXP_DEVCTL, 4, 0x00002810,
+ 0xffff0000, 0x000f0000);
+
+ /* 0x0c - Link capability */
+ val = (PCIE_LSPEED_VECBIT_2 | (PCIE_LWIDTH_1X << 4));
+ PCI_VIRT_CFG_INIT_RO(pvd, start + PCICAP_EXP_LCAP, 4, val);
+
+ /* 0x10 - Link control and status */
+ PCI_VIRT_CFG_INIT(pvd, start + PCICAP_EXP_LCTL, 4, 0x00130000,
+ 0xfffff000, 0xc0000000);
+
+ /* 0x14 - Slot capability */
+ PCI_VIRT_CFG_INIT_RO(pvd, start + PCICAP_EXP_SLOTCAP, 4, 0x00000000);
+
+ /* 0x18 - Slot control and status */
+ PCI_VIRT_CFG_INIT_RO(pvd, start + PCICAP_EXP_SLOTCTL, 4, 0x00000000);
+
+ /* 0x1c - Root control and capability */
+ PCI_VIRT_CFG_INIT(pvd, start + PCICAP_EXP_RC, 4, 0x00000000,
+ 0xffffffe0, 0x00000000);
+
+ /* 0x20 - Root status */
+ PCI_VIRT_CFG_INIT(pvd, start + PCICAP_EXP_RSTAT, 4, 0x00000000,
+ 0xffffffff, 0x00010000);
+
+ /* 0x24 - Device capability 2 */
+ PCI_VIRT_CFG_INIT_RO(pvd, start + PCIECAP_EXP_DCAP2, 4, 0x00000000);
+
+ /* 0x28 - Device Control and status 2 */
+ PCI_VIRT_CFG_INIT(pvd, start + PCICAP_EXP_DCTL2, 4, 0x00070000,
+ 0xffff0000, 0x00000000);
+
+ /* 0x2c - Link capability 2 */
+ PCI_VIRT_CFG_INIT_RO(pvd, start + PCICAP_EXP_LCAP2, 4, 0x00000007);
+
+ /* 0x30 - Link control and status 2 */
+ PCI_VIRT_CFG_INIT(pvd, start + PCICAP_EXP_LCTL2, 4, 0x00000003,
+ 0xffff0000, 0x00200000);
+
+ /* 0x34 - Slot capability 2 */
+ PCI_VIRT_CFG_INIT_RO(pvd, start + PCICAP_EXP_SCAP2, 4, 0x00000000);
+
+ /* 0x38 - Slot control and status 2 */
+ PCI_VIRT_CFG_INIT_RO(pvd, start + PCICAP_EXP_SCTL2, 4, 0x00000000);
+
+ return start + PCICAP_EXP_SCTL2 + 8;
+}
+
+static uint32_t npu2_populate_vendor_cap(struct npu2_dev *dev,
+ uint32_t start,
+ uint32_t prev_cap)
+{
+ struct pci_virt_device *pvd = dev->nvlink.pvd;
+
+ /* Capability list */
+ PCI_VIRT_CFG_INIT_RO(pvd, prev_cap, 1, start);
+ PCI_VIRT_CFG_INIT_RO(pvd, start, 1, PCI_CFG_CAP_ID_VENDOR);
+
+ /* Length and version */
+ PCI_VIRT_CFG_INIT_RO(pvd, start + 2, 1, VENDOR_CAP_LEN);
+ PCI_VIRT_CFG_INIT_RO(pvd, start + 3, 1, VENDOR_CAP_VERSION);
+
+ /*
+ * Defaults when the trap can't handle the read/write (e.g. due
+ * to reading/writing less than 4 bytes).
+ */
+ PCI_VIRT_CFG_INIT_RO(pvd, start + 4, 4, 0);
+ PCI_VIRT_CFG_INIT_RO(pvd, start + 8, 4, 0);
+
+ /* Add NVLink2 PHY procedures trap */
+ pci_virt_add_filter(pvd, start + 4, 8,
+ PCI_REG_FLAG_READ | PCI_REG_FLAG_WRITE,
+ npu2_dev_procedure,
+ NULL);
+
+ /* Link index */
+ PCI_VIRT_CFG_INIT_RO(pvd, start + 0xc, 1, dev->link_index);
+
+ return start + VENDOR_CAP_LEN;
+}
+
+static void npu2_populate_cfg(struct npu2_dev *dev)
+{
+ struct pci_virt_device *pvd = dev->nvlink.pvd;
+ struct npu2_pcie_bar *bar;
+ uint32_t pos;
+
+ /* 0x00 - Vendor/Device ID */
+ PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_VENDOR_ID, 4, 0x04ea1014);
+
+ /* 0x04 - Command/Status */
+ PCI_VIRT_CFG_INIT(pvd, PCI_CFG_CMD, 4, 0x00100000, 0xffb802b8,
+ 0xf9000000);
+
+ pci_virt_add_filter(pvd, PCI_CFG_CMD, 1, PCI_REG_FLAG_WRITE,
+ npu2_cfg_write_cmd, NULL);
+
+ /* 0x08 - Rev/Class/Cache */
+ PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_REV_ID, 4, 0x06800101);
+
+ /* 0x0c - CLS/Latency Timer/Header/BIST */
+ PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_CACHE_LINE_SIZE, 4, 0x00800000);
+
+ /* 0x10/14 - BAR#0, NTL BAR */
+ bar = &dev->bars[0];
+ PCI_VIRT_CFG_INIT(pvd, PCI_CFG_BAR0, 4,
+ (bar->npu2_bar.base & 0xfffffff0) | (bar->flags & 0xF),
+ 0x0000000f, 0x00000000);
+ PCI_VIRT_CFG_INIT(pvd, PCI_CFG_BAR1, 4, (bar->npu2_bar.base >> 32),
+ 0x00000000, 0x00000000);
+ pci_virt_add_filter(pvd, PCI_CFG_BAR0, 8,
+ PCI_REG_FLAG_READ | PCI_REG_FLAG_WRITE,
+ npu2_dev_cfg_bar, bar);
+
+ /* 0x18/1c - BAR#1, GENID BAR */
+ bar = &dev->bars[1];
+ if (NPU2DEV_BRICK(dev) == 0)
+ PCI_VIRT_CFG_INIT(pvd, PCI_CFG_BAR2, 4, (bar->npu2_bar.base & 0xfffffff0) |
+ (bar->flags & 0xF),
+ 0x0000000f, 0x00000000);
+ else
+ /* Brick 1 gets the upper portion of the generation id register */
+ PCI_VIRT_CFG_INIT(pvd, PCI_CFG_BAR2, 4, ((bar->npu2_bar.base + 0x10000) & 0xfffffff0) |
+ (bar->flags & 0xF),
+ 0x0000000f, 0x00000000);
+
+ PCI_VIRT_CFG_INIT(pvd, PCI_CFG_BAR3, 4, (bar->npu2_bar.base >> 32), 0x00000000,
+ 0x00000000);
+ pci_virt_add_filter(pvd, PCI_CFG_BAR2, 8,
+ PCI_REG_FLAG_READ | PCI_REG_FLAG_WRITE,
+ npu2_dev_cfg_bar, bar);
+
+ /* 0x20/0x24 - BARs, disabled */
+ PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_BAR4, 4, 0x00000000);
+ PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_BAR5, 4, 0x00000000);
+
+ /* 0x28 - Cardbus CIS pointer */
+ PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_CARDBUS_CIS, 4, 0x00000000);
+
+ /* 0x2c - Subsystem ID */
+ PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_SUBSYS_VENDOR_ID, 4, 0x00000000);
+
+ /* 0x30 - ROM BAR, zero sized */
+ PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_ROMBAR, 4, 0xffffffff);
+
+ /* 0x34 - PCI Capability */
+ PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_CAP, 4, 0x00000000);
+
+ /* 0x38 - Reserved */
+ PCI_VIRT_CFG_INIT_RO(pvd, 0x38, 4, 0x00000000);
+
+ /* 0x3c - INT line/pin/Minimal grant/Maximal latency */
+ PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_INT_LINE, 4, 0x00000100); /* INT A */
+
+ /* PCIE and vendor specific capability */
+ pos = npu2_populate_pcie_cap(dev, 0x40, PCI_CFG_CAP);
+ pos = npu2_populate_vendor_cap(dev, pos, 0x41);
+ PCI_VIRT_CFG_INIT_RO(pvd, pos + 1, 1, 0);
+}
+
+static uint32_t npu_allocate_bdfn(struct npu2 *p, uint32_t group)
+{
+ int i;
+ int bdfn = (group << 3);
+
+ for (i = 0; i < p->total_devices; i++) {
+ if ((p->devices[i].bdfn & 0xf8) == (bdfn & 0xf8))
+ bdfn++;
+ }
+
+ return bdfn;
+}
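+
+/*
+ * Illustrative note (not part of the original patch): the group ID forms
+ * the PCI device number (bdfn bits 7:3) and each link already allocated
+ * in that group bumps the function number (bits 2:0). For example, with
+ * two devices already in group 2, the next call returns
+ * (2 << 3) + 2 = 0x12, i.e. device 2, function 2.
+ */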
+
+static void npu2_populate_devices(struct npu2 *p,
+ struct dt_node *dn)
+{
+ struct npu2_dev *dev;
+ struct dt_node *npu2_dn, *link;
+ uint32_t npu_phandle, index = 0;
+ int stack;
+
+ /*
+ * Get the NPU node whose links we expand here into PCI-like
+ * devices attached to our emulated PHB.
+ */
+ npu_phandle = dt_prop_get_u32(dn, "ibm,npcq");
+ npu2_dn = dt_find_by_phandle(dt_root, npu_phandle);
+ assert(npu2_dn);
+
+ /* Walk the link@x nodes to initialize devices */
+ p->total_devices = 0;
+ p->phb_nvlink.scan_map = 0;
+ dt_for_each_compatible(npu2_dn, link, "ibm,npu-link") {
+ uint32_t group_id;
+ struct npu2_bar *npu2_bar;
+
+ dev = &p->devices[index];
+ dev->type = NPU2_DEV_TYPE_NVLINK;
+ dev->npu = p;
+ dev->dt_node = link;
+ dev->link_index = dt_prop_get_u32(link, "ibm,npu-link-index");
+ dev->brick_index = dev->link_index;
+
+ group_id = dt_prop_get_u32(link, "ibm,npu-group-id");
+ dev->bdfn = npu_allocate_bdfn(p, group_id);
+
+ /* This must be done after calling
+ * npu_allocate_bdfn() */
+ p->total_devices++;
+ p->phb_nvlink.scan_map |= 0x1 << ((dev->bdfn & 0xf8) >> 3);
+
+ dev->pl_xscom_base = dt_prop_get_u64(link, "ibm,npu-phy");
+ dev->lane_mask = dt_prop_get_u32(link, "ibm,npu-lane-mask");
+
+ /* Populate BARs. BAR0/1 is the NTL bar. */
+ stack = NPU2_STACK_STCK_0 + NPU2DEV_STACK(dev);
+ npu2_bar = &dev->bars[0].npu2_bar;
+ npu2_bar->type = NPU_NTL;
+ npu2_bar->index = dev->brick_index;
+ npu2_bar->reg = NPU2_REG_OFFSET(stack, 0, NPU2DEV_BRICK(dev) == 0 ?
+ NPU2_NTL0_BAR : NPU2_NTL1_BAR);
+ npu2_get_bar(p->chip_id, npu2_bar);
+
+ dev->bars[0].flags = PCI_CFG_BAR_TYPE_MEM | PCI_CFG_BAR_MEM64;
+
+ /* BAR2/3 is the GENID bar. */
+ npu2_bar = &dev->bars[1].npu2_bar;
+ npu2_bar->type = NPU_GENID;
+ npu2_bar->index = NPU2DEV_STACK(dev);
+ npu2_bar->reg = NPU2_REG_OFFSET(stack, 0, NPU2_GENID_BAR);
+ npu2_get_bar(p->chip_id, npu2_bar);
+
+ /* The GENID is a single physical BAR that we split
+ * for each emulated device */
+ npu2_bar->size = 0x10000;
+ if (NPU2DEV_BRICK(dev))
+ npu2_bar->base += 0x10000;
+ dev->bars[1].flags = PCI_CFG_BAR_TYPE_MEM | PCI_CFG_BAR_MEM64;
+
+ /* Initialize PCI virtual device */
+ dev->nvlink.pvd = pci_virt_add_device(&p->phb_nvlink, dev->bdfn, 0x100, dev);
+ if (dev->nvlink.pvd)
+ npu2_populate_cfg(dev);
+
+ index++;
+ }
+}
+
+static void npu2_add_interrupt_map(struct npu2 *p,
+ struct dt_node *dn)
+{
+ struct dt_node *npu2_dn, *link, *phb_dn;
+ uint32_t npu2_phandle, index = 0, i;
+ uint32_t icsp = get_ics_phandle();
+ uint32_t *map;
+ size_t map_size;
+ uint32_t mask[] = {0xff00, 0x0, 0x0, 0x7};
+
+ assert(p->phb_nvlink.dt_node);
+ phb_dn = p->phb_nvlink.dt_node;
+
+ npu2_phandle = dt_prop_get_u32(dn, "ibm,npcq");
+ npu2_dn = dt_find_by_phandle(dt_root, npu2_phandle);
+ assert(npu2_dn);
+ map_size = 7 * sizeof(*map) * p->total_devices;
+ map = malloc(map_size);
+ index = 0;
+ dt_for_each_compatible(npu2_dn, link, "ibm,npu-link") {
+ i = index * 7;
+ map[i + 0] = (p->devices[index].bdfn << 8);
+ map[i + 1] = 0;
+ map[i + 2] = 0;
+
+ map[i + 3] = 1; /* INT A */
+ map[i + 4] = icsp; /* interrupt-parent */
+ map[i + 5] = p->base_lsi + (index * 2) + 1; /* NDL No-Stall Event */
+ map[i + 6] = 0; /* 0 = EDGE, 1 = LEVEL. */
+ index++;
+ }
+ dt_add_property(phb_dn, "interrupt-map", map, map_size);
+ free(map);
+ dt_add_property(phb_dn, "interrupt-map-mask", mask, sizeof(mask));
+}
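+
+/*
+ * Illustrative note (not part of the original patch): each interrupt-map
+ * entry built above is 7 cells wide, following the standard OF encoding
+ * given the #address-cells = 3 and #interrupt-cells = 1 values set in
+ * npu2_add_phb_properties(): 3 cells of child unit address
+ * (bdfn << 8, 0, 0), 1 cell of child interrupt specifier (1 = INT A),
+ * the interrupt-parent phandle, and 2 cells of parent specifier
+ * (LSI number, 0 = edge trigger).
+ */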
+
+static void npu2_add_phb_properties(struct npu2 *p)
+{
+ struct dt_node *np = p->phb_nvlink.dt_node;
+ uint32_t icsp = get_ics_phandle();
+ uint64_t mm_base, mm_size;
+
+ /*
+ * Add various properties that HB doesn't have to
+ * add, some of them simply because they result from
+ * policy decisions made in skiboot rather than in HB
+ * such as the MMIO windows going to PCI, interrupts,
+ * etc.
+ */
+ dt_add_property_cells(np, "#address-cells", 3);
+ dt_add_property_cells(np, "#size-cells", 2);
+ dt_add_property_cells(np, "#interrupt-cells", 1);
+ dt_add_property_cells(np, "bus-range", 0, 0xff);
+ dt_add_property_cells(np, "clock-frequency", 0x200, 0);
+ dt_add_property_cells(np, "interrupt-parent", icsp);
+
+ /* NPU2 PHB properties */
+ dt_add_property_cells(np, "ibm,opal-num-pes",
+ NPU2_MAX_PE_NUM);
+ dt_add_property_cells(np, "ibm,opal-reserved-pe",
+ NPU2_RESERVED_PE_NUM);
+ dt_add_property_cells(np, "ibm,supported-tce-sizes",
+ 12, // 4K
+ 16, // 64K
+ 24, // 16M
+ 28); // 256M
+
+ dt_add_property_u64s(np, "ibm,mmio-atsd",
+ MMIO_ATSD_ADDR(p->regs, 0),
+ MMIO_ATSD_ADDR(p->regs, 1),
+ MMIO_ATSD_ADDR(p->regs, 2),
+ MMIO_ATSD_ADDR(p->regs, 3),
+ MMIO_ATSD_ADDR(p->regs, 4),
+ MMIO_ATSD_ADDR(p->regs, 5),
+ MMIO_ATSD_ADDR(p->regs, 6),
+ MMIO_ATSD_ADDR(p->regs, 7));
+
+ /*
+ * The memory window is exposed as a 64-bit non-prefetchable
+ * window because the kernel treats 64-bit prefetchable windows
+ * specially.
+ */
+ mm_base = p->mm_base;
+ mm_size = p->mm_size;
+ dt_add_property_cells(np, "ranges", 0x02000000,
+ hi32(mm_base), lo32(mm_base),
+ hi32(mm_base), lo32(mm_base),
+ hi32(mm_size), lo32(mm_size));
+}
+
+void npu2_nvlink_create_phb(struct npu2 *npu, struct dt_node *dn)
+{
+ struct pci_slot *slot;
+
+ /* Generic PHB */
+ npu->phb_nvlink.dt_node = dn;
+ npu->phb_nvlink.ops = &npu_ops;
+ npu->phb_nvlink.phb_type = phb_type_npu_v2;
+ init_lock(&npu->lock);
+ init_lock(&npu->phb_nvlink.lock);
+ list_head_init(&npu->phb_nvlink.devices);
+ list_head_init(&npu->phb_nvlink.virt_devices);
+
+ npu2_populate_devices(npu, dn);
+ npu2_add_interrupt_map(npu, dn);
+ npu2_add_phb_properties(npu);
+
+ slot = npu2_slot_create(&npu->phb_nvlink);
+ if (!slot)
+ {
+ /**
+ * @fwts-label NPUCannotCreatePHBSlot
+ * @fwts-advice Firmware probably ran out of memory creating
+ * NPU2 slot. NVLink functionality could be broken.
+ */
+ prlog(PR_ERR, "NPU: Cannot create PHB slot\n");
+ }
+
+ pci_register_phb(&npu->phb_nvlink, OPAL_DYNAMIC_PHB_ID);
+
+ npu2_init_ioda_cache(npu);
+ npu2_hw_init(npu);
+}
+
+/*
+ * Search a table for an entry with matching value under mask. Returns
+ * the index and the current value in *value.
+ */
+static int npu_table_search(struct npu2 *p, uint64_t table_addr, int stride,
+ int table_size, uint64_t *value, uint64_t mask)
+{
+ int i;
+ uint64_t val;
+
+ assert(value);
+
+ for (i = 0; i < table_size; i++) {
+ val = npu2_read(p, table_addr + i*stride);
+ if ((val & mask) == *value) {
+ *value = val;
+ return i;
+ }
+ }
+
+ return -1;
+}
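+
+/*
+ * Illustrative note (not part of the original patch): passing a mask of
+ * -1UL with *value set to 0 turns this into a "find a free slot" search,
+ * since only an all-zero entry can match. npu2_map_lpar() below relies
+ * on exactly that to pick an unused XTS_BDF_MAP entry.
+ */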
+
+/*
+ * Allocate a context ID and initialise the tables with the relevant
+ * information. Returns the ID on success or an error if one
+ * couldn't be allocated.
+ */
+#define NPU2_VALID_ATS_MSR_BITS (MSR_DR | MSR_HV | MSR_PR | MSR_SF)
+int64_t npu2_init_context(struct phb *phb, uint64_t msr, uint64_t bdf)
+{
+ struct npu2 *p;
+ uint64_t xts_bdf, old_xts_bdf_pid, xts_bdf_pid;
+ int id;
+
+ /*
+ * MSR bits should be masked by the caller to allow for future
+ * expansion if required.
+ */
+ if (msr & ~NPU2_VALID_ATS_MSR_BITS)
+ return OPAL_UNSUPPORTED;
+
+ /*
+ * Need to get LPARSHORT.
+ */
+ p = phb_to_npu2_nvlink(phb);
+ lock(&p->lock);
+ xts_bdf = SETFIELD(NPU2_XTS_BDF_MAP_BDF, 0ul, bdf);
+ if (npu_table_search(p, NPU2_XTS_BDF_MAP, 8, NPU2_XTS_BDF_MAP_SIZE,
+ &xts_bdf, NPU2_XTS_BDF_MAP_BDF) < 0) {
+ NPU2ERR(p, "LPARID not associated with any GPU\n");
+ id = OPAL_PARAMETER;
+ goto out;
+ }
+
+ id = GETFIELD(NPU2_XTS_BDF_MAP_LPARSHORT, xts_bdf);
+ NPU2DBG(p, "Found LPARSHORT = 0x%x for BDF = 0x%03llx\n", id, bdf);
+
+ /* Enable this mapping for both real and virtual addresses */
+ xts_bdf_pid = SETFIELD(NPU2_XTS_PID_MAP_VALID_ATRGPA0, 0UL, 1);
+ xts_bdf_pid = SETFIELD(NPU2_XTS_PID_MAP_VALID_ATRGPA1, xts_bdf_pid, 1);
+
+ /* Enables TLBIE/MMIOSD forwarding for this entry */
+ xts_bdf_pid = SETFIELD(NPU2_XTS_PID_MAP_VALID_ATSD, xts_bdf_pid, 1);
+ xts_bdf_pid = SETFIELD(NPU2_XTS_PID_MAP_LPARSHORT, xts_bdf_pid, id);
+
+ /* Set the relevant MSR bits */
+ xts_bdf_pid = SETFIELD(NPU2_XTS_PID_MAP_MSR_DR, xts_bdf_pid,
+ !!(msr & MSR_DR));
+ xts_bdf_pid = SETFIELD(NPU2_XTS_PID_MAP_MSR_HV, xts_bdf_pid,
+ !!(msr & MSR_HV));
+ xts_bdf_pid = SETFIELD(NPU2_XTS_PID_MAP_MSR_PR, xts_bdf_pid,
+ !!(msr & MSR_PR));
+
+ /* We don't support anything other than 64-bit so we can safely hardcode
+ * it here */
+ xts_bdf_pid = SETFIELD(NPU2_XTS_PID_MAP_MSR_SF, xts_bdf_pid, 1);
+
+ /*
+ * Throw an error if the wildcard entry for this bdf is already set
+ * with different msr bits.
+ */
+ old_xts_bdf_pid = npu2_read(p, NPU2_XTS_PID_MAP + id*0x20);
+ if (old_xts_bdf_pid) {
+ if (GETFIELD(NPU2_XTS_PID_MAP_MSR, old_xts_bdf_pid) !=
+ GETFIELD(NPU2_XTS_PID_MAP_MSR, xts_bdf_pid)) {
+ NPU2ERR(p, "%s: Unexpected MSR value\n", __func__);
+ id = OPAL_PARAMETER;
+ goto out;
+ } else if (!p->ctx_ref[id]) {
+ NPU2ERR(p, "%s: Unexpected mapping\n", __func__);
+ id = OPAL_INTERNAL_ERROR;
+ goto out;
+ }
+ }
+
+ /* Write the entry */
+ if (!p->ctx_ref[id]) {
+ NPU2DBG(p, "XTS_PID_MAP[%03d] = 0x%08llx\n", id, xts_bdf_pid);
+ npu2_write(p, NPU2_XTS_PID_MAP + id*0x20, xts_bdf_pid);
+
+ if (!GETFIELD(NPU2_XTS_BDF_MAP_VALID, xts_bdf)) {
+ xts_bdf = SETFIELD(NPU2_XTS_BDF_MAP_VALID, xts_bdf, 1);
+ npu2_write(p, NPU2_XTS_BDF_MAP + id*8, xts_bdf);
+ }
+ }
+ ++p->ctx_ref[id];
+
+out:
+ unlock(&p->lock);
+ return id;
+}
+
+int64_t npu2_destroy_context(struct phb *phb, uint64_t bdf)
+{
+ struct npu2 *p;
+ uint64_t xts_bdf;
+ int rc = OPAL_PARAMETER, id;
+
+ p = phb_to_npu2_nvlink(phb);
+ lock(&p->lock);
+
+ /* Need to find lparshort for this bdf */
+ xts_bdf = SETFIELD(NPU2_XTS_BDF_MAP_BDF, 0ul, bdf);
+ if (npu_table_search(p, NPU2_XTS_BDF_MAP, 8, NPU2_XTS_BDF_MAP_SIZE,
+ &xts_bdf, NPU2_XTS_BDF_MAP_BDF) < 0) {
+ NPU2ERR(p, "LPARID not associated with any GPU\n");
+ } else {
+ /*
+ * The bdf/pid table contains wildcard entries and MSR bits
+ * which we need to clear when switching a device from
+ * a host to a guest or vice versa.
+ */
+ id = GETFIELD(NPU2_XTS_BDF_MAP_LPARSHORT, xts_bdf);
+ if (p->ctx_ref[id]) {
+ --p->ctx_ref[id];
+ if (!p->ctx_ref[id]) {
+ NPU2DBG(p, "XTS_PID_MAP[%03d] = 0 (destroy)\n",
+ id);
+ npu2_write(p, NPU2_XTS_PID_MAP + id*0x20, 0);
+ }
+ rc = OPAL_SUCCESS;
+ }
+ }
+ unlock(&p->lock);
+ return rc;
+}
+
+/*
+ * Map the given virtual bdf to lparid with given lpcr.
+ */
+int64_t npu2_map_lpar(struct phb *phb, uint64_t bdf, uint64_t lparid,
+ uint64_t lpcr)
+{
+ struct npu2 *p;
+ struct npu2_dev *ndev = NULL;
+ uint64_t xts_bdf_lpar, atsd_lpar, rc = OPAL_SUCCESS;
+ int i;
+ int id;
+ static uint64_t atsd_lpar_regs[] = {
+ NPU2_XTS_MMIO_ATSD0_LPARID, NPU2_XTS_MMIO_ATSD1_LPARID,
+ NPU2_XTS_MMIO_ATSD2_LPARID, NPU2_XTS_MMIO_ATSD3_LPARID,
+ NPU2_XTS_MMIO_ATSD4_LPARID, NPU2_XTS_MMIO_ATSD5_LPARID,
+ NPU2_XTS_MMIO_ATSD6_LPARID, NPU2_XTS_MMIO_ATSD7_LPARID
+ };
+
+ if (lpcr)
+ /* The LPCR bits are only required for hash based ATS,
+ * which we don't currently support but may need to in
+ * future. */
+ return OPAL_UNSUPPORTED;
+
+ p = phb_to_npu2_nvlink(phb);
+ lock(&p->lock);
+
+ /* Find any existing entries and update them */
+ xts_bdf_lpar = SETFIELD(NPU2_XTS_BDF_MAP_BDF, 0L, bdf);
+ id = npu_table_search(p, NPU2_XTS_BDF_MAP, 8, NPU2_XTS_BDF_MAP_SIZE,
+ &xts_bdf_lpar, NPU2_XTS_BDF_MAP_BDF);
+ if (id < 0) {
+ /* No existing mapping found, find space for a new one */
+ xts_bdf_lpar = 0;
+ id = npu_table_search(p, NPU2_XTS_BDF_MAP, 8, NPU2_XTS_BDF_MAP_SIZE,
+ &xts_bdf_lpar, -1UL);
+ }
+
+ if (id < 0) {
+ /* Unable to find a free mapping */
+ NPU2ERR(p, "No free XTS_BDF[] entry\n");
+ rc = OPAL_RESOURCE;
+ goto out;
+ }
+
+ xts_bdf_lpar = SETFIELD(NPU2_XTS_BDF_MAP_UNFILT, 0UL, 1);
+ xts_bdf_lpar = SETFIELD(NPU2_XTS_BDF_MAP_BDF, xts_bdf_lpar, bdf);
+
+ /* We only support radix for the moment */
+ xts_bdf_lpar = SETFIELD(NPU2_XTS_BDF_MAP_XLAT, xts_bdf_lpar, 0x3);
+ xts_bdf_lpar = SETFIELD(NPU2_XTS_BDF_MAP_LPARID, xts_bdf_lpar, lparid);
+ xts_bdf_lpar = SETFIELD(NPU2_XTS_BDF_MAP_LPARSHORT, xts_bdf_lpar, id);
+
+ /* Need to find an NVLink to send the ATSDs for this device over */
+ for (i = 0; i < p->total_devices; i++) {
+ if (p->devices[i].nvlink.gpu_bdfn == bdf) {
+ ndev = &p->devices[i];
+ break;
+ }
+ }
+
+ if (!ndev) {
+ NPU2ERR(p, "Unable to find nvlink for bdf %llx\n", bdf);
+ rc = OPAL_PARAMETER;
+ goto out;
+ }
+
+ /*
+ * We need to allocate an ATSD per NVLink bridge if possible;
+ * use the ibm,npu-link-index property for that.
+ */
+ atsd_lpar = SETFIELD(NPU2_XTS_MMIO_ATSD_LPARID, 0, lparid);
+ if (!lparid)
+ atsd_lpar = SETFIELD(NPU2_XTS_MMIO_ATSD_MSR_HV, atsd_lpar, 1);
+
+ if (ndev->link_index < ARRAY_SIZE(atsd_lpar_regs))
+ npu2_write(p, atsd_lpar_regs[ndev->link_index], atsd_lpar);
+ else
+ NPU2ERR(p, "Unable to assign ATSD for link index %u\n",
+ ndev->link_index);
+
+ xts_bdf_lpar = SETFIELD(NPU2_XTS_BDF_MAP_STACK, xts_bdf_lpar,
+ 0x4 >> (ndev->brick_index / 2));
+ xts_bdf_lpar = SETFIELD(NPU2_XTS_BDF_MAP_BRICK, xts_bdf_lpar,
+ (ndev->brick_index % 2));
+
+ NPU2DBG(p, "XTS_BDF_MAP[%03d] = 0x%08llx\n", id, xts_bdf_lpar);
+ npu2_write(p, NPU2_XTS_BDF_MAP + id*8, xts_bdf_lpar);
+
+ /* Reset wildcard in the PID map and the refcounter */
+ if (npu2_read(p, NPU2_XTS_PID_MAP + id*0x20) || p->ctx_ref[id]) {
+ prlog(PR_INFO, "Resetting PID MAP for LPID %lld\n", lparid);
+ p->ctx_ref[id] = 0;
+ npu2_write(p, NPU2_XTS_PID_MAP + id*0x20, 0);
+ }
+
+out:
+ unlock(&p->lock);
+ return rc;
+}
+
+static inline uint32_t npu2_relaxed_ordering_source_grpchp(uint32_t gcid)
+{
+ if (gcid & ~0x1b)
+ return OPAL_PARAMETER;
+
+ /* Repack 0bGGGGCCC to 0bGGCC */
+ return ((gcid & 0x18) >> 1) | (gcid & 0x3);
+}
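+
+/*
+ * Illustrative note (not part of the original patch): a worked example of
+ * the repack above. For gcid 0b01010 (group 0b01, chip 0b10), the check
+ * against ~0x1b passes, (gcid & 0x18) >> 1 gives 0b0100 and (gcid & 0x3)
+ * gives 0b10, so the packed GRPCHP value is 0b0110.
+ */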
+
+static uint64_t npu2_relaxed_ordering_cfg_read(struct npu2_dev *ndev, int n)
+{
+ uint64_t reg = NPU2_SM_REG_OFFSET(ndev, 0, NPU2_RELAXED_ORDERING_CFG(n));
+
+ return npu2_read(ndev->npu, reg);
+}
+
+static void npu2_relaxed_ordering_cfg_write(struct npu2_dev *ndev, int n,
+ uint64_t val)
+{
+ uint64_t reg;
+ int sm;
+
+ /* Set every register on our stack */
+ for (sm = NPU2_BLOCK_SM_0; sm <= NPU2_BLOCK_SM_3; sm++) {
+ reg = NPU2_SM_REG_OFFSET(ndev, sm, NPU2_RELAXED_ORDERING_CFG(n));
+ npu2_write(ndev->npu, reg, val);
+ }
+}
+
+/*
+ * Parse the value of a relaxed ordering config register. Returns SOURCE0 or
+ * SOURCE1 register mask if relaxed ordering is set for the given chip/pec.
+ * Returns 0 if unset.
+ */
+static uint64_t npu2_relaxed_ordering_cfg_enabled(uint64_t val, uint32_t gcid,
+ int pec)
+{
+ uint32_t src, grpchp;
+ uint64_t mask;
+ int i;
+
+ for (i = 0; i < 2; i++) {
+ mask = NPU2_RELAXED_ORDERING_SOURCE(i);
+ src = GETFIELD(mask, val);
+
+ if (!GETFIELD(NPU2_RELAXED_ORDERING_SOURCE_ENA, src))
+ continue;
+
+ if (GETFIELD(NPU2_RELAXED_ORDERING_SOURCE_PECSEL, src) != pec)
+ continue;
+
+ grpchp = GETFIELD(NPU2_RELAXED_ORDERING_SOURCE_GRPCHP, src);
+ if (grpchp == npu2_relaxed_ordering_source_grpchp(gcid))
+ return mask;
+
+ if (grpchp == 0xf) /* match all */
+ return mask;
+ }
+
+ return 0;
+}
+
+static int npu2_enable_relaxed_ordering(struct npu2_dev *ndev, uint32_t gcid,
+ int pec)
+{
+ uint64_t val, mask;
+ uint32_t src;
+ int rc = OPAL_RESOURCE;
+ int i;
+
+ NPU2DEVINF(ndev, "Enabling relaxed ordering for PEC %d on chip %d\n", pec, gcid);
+ lock(&ndev->npu->lock);
+
+ for (i = 0; i < 2; i++) {
+ val = npu2_relaxed_ordering_cfg_read(ndev, i);
+ if (!npu2_relaxed_ordering_cfg_enabled(val, gcid, pec))
+ continue;
+
+ /* Already enabled */
+ rc = OPAL_SUCCESS;
+ goto out;
+ }
+
+ src = NPU2_RELAXED_ORDERING_SOURCE_WRENA |
+ NPU2_RELAXED_ORDERING_SOURCE_RDENA;
+ src = SETFIELD(NPU2_RELAXED_ORDERING_SOURCE_PECSEL, src, pec);
+ src = SETFIELD(NPU2_RELAXED_ORDERING_SOURCE_GRPCHP, src,
+ npu2_relaxed_ordering_source_grpchp(gcid));
+ src = SETFIELD(NPU2_RELAXED_ORDERING_SOURCE_WRMIN, src, 0);
+ src = SETFIELD(NPU2_RELAXED_ORDERING_SOURCE_WRMAX, src, 23);
+ src = SETFIELD(NPU2_RELAXED_ORDERING_SOURCE_RDMIN, src, 0);
+ src = SETFIELD(NPU2_RELAXED_ORDERING_SOURCE_RDMAX, src, 47);
+
+ /* Find somewhere to write this config */
+ for (i = 0; i < 2; i++) {
+ val = npu2_relaxed_ordering_cfg_read(ndev, i);
+
+ if (!GETFIELD(NPU2_RELAXED_ORDERING_SOURCE_ENA << 32, val))
+ mask = NPU2_RELAXED_ORDERING_SOURCE(0);
+ else if (!GETFIELD(NPU2_RELAXED_ORDERING_SOURCE_ENA, val))
+ mask = NPU2_RELAXED_ORDERING_SOURCE(1);
+ else
+ continue;
+
+ val = SETFIELD(mask, val, src);
+ npu2_relaxed_ordering_cfg_write(ndev, i, val);
+
+ rc = OPAL_SUCCESS;
+ break;
+ }
+
+out:
+ unlock(&ndev->npu->lock);
+ return rc;
+}
+
+static void npu2_disable_relaxed_ordering(struct npu2_dev *ndev, uint32_t gcid,
+ int pec)
+{
+ uint64_t val, mask;
+ int i;
+
+ NPU2DEVINF(ndev, "Disabling relaxed ordering for PEC %d on chip %d\n", pec, gcid);
+ lock(&ndev->npu->lock);
+
+ for (i = 0; i < 2; i++) {
+ val = npu2_relaxed_ordering_cfg_read(ndev, i);
+
+ mask = npu2_relaxed_ordering_cfg_enabled(val, gcid, pec);
+ if (!mask)
+ continue;
+
+ val = SETFIELD(mask, val, 0);
+ npu2_relaxed_ordering_cfg_write(ndev, i, val);
+ }
+
+ unlock(&ndev->npu->lock);
+}
+
+/*
+ * Enable or disable relaxed ordering on all nvlinks for a given PEC. May leave
+ * relaxed ordering partially enabled if there are insufficient HW resources to
+ * enable it on all links.
+ */
+int64_t npu2_set_relaxed_order(struct phb *phb, uint32_t gcid, int pec,
+ bool enable)
+{
+ struct npu2 *npu = phb_to_npu2_nvlink(phb);
+ struct npu2_dev *ndev;
+ int64_t rc = OPAL_SUCCESS;
+
+ for (int i = 0; i < npu->total_devices; i++) {
+ ndev = &npu->devices[i];
+ if (enable)
+ rc = npu2_enable_relaxed_ordering(ndev, gcid, pec);
+ else
+ npu2_disable_relaxed_ordering(ndev, gcid, pec);
+
+ if (rc != OPAL_SUCCESS) {
+ NPU2DEVINF(ndev, "Insufficient resources to activate relaxed ordering mode\n");
+ return OPAL_RESOURCE;
+ }
+ }
+
+ return OPAL_SUCCESS;
+}
diff --git a/roms/skiboot/hw/npu3-hw-procedures.c b/roms/skiboot/hw/npu3-hw-procedures.c
new file mode 100644
index 000000000..098e6e467
--- /dev/null
+++ b/roms/skiboot/hw/npu3-hw-procedures.c
@@ -0,0 +1,792 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Copyright 2019 IBM Corp.
+ */
+
+#include <skiboot.h>
+#include <npu3.h>
+#include <npu3-regs.h>
+#include <timebase.h>
+#include <xscom.h>
+#include <xscom-p9-regs.h>
+
+#define NPU3DEVLOG(l, dev, fmt, a...) \
+ prlog(l, "NPU[%d:%d:%d]: " fmt, \
+ (dev)->npu->chip_id, \
+ (dev)->npu->index, \
+ (dev)->index, ##a)
+#define NPU3DEVDBG(dev, fmt, a...) NPU3DEVLOG(PR_DEBUG, dev, fmt, ##a)
+#define NPU3DEVINF(dev, fmt, a...) NPU3DEVLOG(PR_INFO, dev, fmt, ##a)
+#define NPU3DEVERR(dev, fmt, a...) NPU3DEVLOG(PR_ERR, dev, fmt, ##a)
+
+/*
+ * The documentation for the PHY training is written in terms of bits within an
+ * actual register so we use that representation here.
+ */
+struct npu3_phy_reg {
+ uint64_t offset;
+ uint64_t mask;
+};
+
+static struct npu3_phy_reg
+NPU3_PHY_RX_RUN_LANE = { 0x0c8, PPC_BIT(48) },
+NPU3_PHY_RX_IORESET = { 0x096, PPC_BIT(63) },
+NPU3_PHY_TX_IORESET = { 0x113, PPC_BIT(48) },
+NPU3_PHY_RX_PR_RESET = { 0x096, PPC_BIT(62) },
+NPU3_PHY_RX_LANE_ANA_PDWN = { 0x002, PPC_BIT(54) },
+NPU3_PHY_RX_LANE_DIG_PDWN = { 0x088, PPC_BIT(48) },
+NPU3_PHY_RX_PR_PHASE_STEP = { 0x08a, PPC_BITMASK(60, 63) },
+NPU3_PHY_TX_LANE_PDWN = { 0x101, PPC_BIT(48) },
+NPU3_PHY_RX_RUN_DCCAL = { 0x0c8, PPC_BIT(49) },
+NPU3_PHY_RX_DCCAL_DONE = { 0x0ca, PPC_BIT(49) },
+NPU3_PHY_RX_LANE_BUSY = { 0x0ca, PPC_BIT(50) },
+NPU3_PHY_RX_B_BANK_CONTROLS = { 0x002, PPC_BITMASK(58, 63) },
+NPU3_PHY_TX_UNLOAD_CLK_DISABLE = { 0x103, PPC_BIT(56) },
+NPU3_PHY_TX_FIFO_INIT = { 0x105, PPC_BIT(53) },
+NPU3_PHY_TX_RXCAL = { 0x103, PPC_BIT(57) },
+NPU3_PHY_RX_INIT_DONE = { 0x0ca, PPC_BIT(48) },
+NPU3_PHY_RX_PR_EDGE_TRACK_CNTL = { 0x092, PPC_BITMASK(48, 49) },
+NPU3_PHY_RX_PR_FW_OFF = { 0x08a, PPC_BIT(56) },
+NPU3_PHY_RX_PR_FW_INERTIA_AMT = { 0x08a, PPC_BITMASK(57, 59) },
+NPU3_PHY_RX_CFG_LTE_MC = { 0x000, PPC_BITMASK(60, 63) },
+NPU3_PHY_RX_A_INTEG_COARSE_GAIN = { 0x00a, PPC_BITMASK(48, 51) },
+NPU3_PHY_RX_B_INTEG_COARSE_GAIN = { 0x026, PPC_BITMASK(48, 51) },
+NPU3_PHY_RX_E_INTEG_COARSE_GAIN = { 0x030, PPC_BITMASK(48, 51) },
+
+/* These registers are per-PHY, not per lane */
+NPU3_PHY_TX_ZCAL_SWO_EN = { 0x3c9, PPC_BIT(48) },
+NPU3_PHY_TX_ZCAL_REQ = { 0x3c1, PPC_BIT(49) },
+NPU3_PHY_TX_ZCAL_DONE = { 0x3c1, PPC_BIT(50) },
+NPU3_PHY_TX_ZCAL_ERROR = { 0x3c1, PPC_BIT(51) },
+NPU3_PHY_TX_ZCAL_N = { 0x3c3, PPC_BITMASK(48, 56) },
+NPU3_PHY_TX_ZCAL_P = { 0x3c5, PPC_BITMASK(48, 56) },
+NPU3_PHY_TX_PSEG_PRE_EN = { 0x34d, PPC_BITMASK(51, 55) },
+NPU3_PHY_TX_PSEG_PRE_SELECT = { 0x34d, PPC_BITMASK(56, 60) },
+NPU3_PHY_TX_NSEG_PRE_EN = { 0x34f, PPC_BITMASK(51, 55) },
+NPU3_PHY_TX_NSEG_PRE_SELECT = { 0x34f, PPC_BITMASK(56, 60) },
+NPU3_PHY_TX_PSEG_POST_EN = { 0x361, PPC_BITMASK(49, 55) },
+NPU3_PHY_TX_PSEG_POST_SELECT = { 0x361, PPC_BITMASK(56, 62) },
+NPU3_PHY_TX_NSEG_POST_EN = { 0x363, PPC_BITMASK(49, 55) },
+NPU3_PHY_TX_NSEG_POST_SELECT = { 0x363, PPC_BITMASK(56, 62) },
+NPU3_PHY_TX_PSEG_MARGINPU_EN = { 0x351, PPC_BITMASK(48, 55) },
+NPU3_PHY_TX_NSEG_MARGINPU_EN = { 0x353, PPC_BITMASK(48, 55) },
+NPU3_PHY_TX_PSEG_MARGINPD_EN = { 0x351, PPC_BITMASK(56, 63) },
+NPU3_PHY_TX_NSEG_MARGINPD_EN = { 0x353, PPC_BITMASK(56, 63) },
+NPU3_PHY_TX_MARGINPU_SELECT = { 0x355, PPC_BITMASK(48, 55) },
+NPU3_PHY_TX_MARGINPD_SELECT = { 0x355, PPC_BITMASK(56, 63) },
+NPU3_PHY_TX_PSEG_MAIN_EN = { 0x357, PPC_BITMASK(51, 57) },
+NPU3_PHY_TX_NSEG_MAIN_EN = { 0x359, PPC_BITMASK(51, 57) },
+NPU3_PHY_RX_CLKDIST_PDWN = { 0x204, PPC_BITMASK(48, 50) },
+NPU3_PHY_RX_IREF_PDWN = { 0x230, PPC_BIT(54) },
+NPU3_PHY_TX_CLKDIST_PDWN = { 0x305, PPC_BITMASK(48, 50) },
+NPU3_PHY_RX_CTL_DATASM_CLKDIST_PDWN = { 0x2e0, PPC_BIT(60) };
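+
+/*
+ * Illustrative note (not part of the original patch): each entry above
+ * pairs an indirect SCOM offset with the PPC bitmask of the field inside
+ * it. npu3_phy_scom() below treats offsets below 0x200 as per-lane
+ * registers and offsets at or above 0x200 as per-PHY registers.
+ */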
+
+static uint64_t npu3_phy_scom(struct npu3_dev *dev, struct npu3_phy_reg *reg,
+ int lane)
+{
+ uint64_t scom;
+
+ /* Don't specify a lane for a non-per-lane register */
+ if (lane >= 0)
+ assert(reg->offset < 0x200);
+ else
+ assert(reg->offset >= 0x200);
+
+ scom = OB_INDIRECT(dev->ob_chiplet);
+ scom = SETFIELD(PPC_BITMASK(12, 21), scom, reg->offset);
+
+ if (lane > 0)
+ scom = SETFIELD(PPC_BITMASK(27, 31), scom, lane);
+
+ return scom;
+}
+
+static void npu3_phy_write_lane(struct npu3_dev *dev, struct npu3_phy_reg *reg,
+ int lane, uint64_t val)
+{
+ struct npu3 *npu = dev->npu;
+ uint64_t scom, scom_val;
+
+ scom = npu3_phy_scom(dev, reg, lane);
+
+ xscom_read(npu->chip_id, scom, &scom_val);
+ scom_val = SETFIELD(reg->mask, scom_val, val);
+ xscom_write(npu->chip_id, scom, scom_val);
+}
+
+static uint64_t npu3_phy_read_lane(struct npu3_dev *dev,
+ struct npu3_phy_reg *reg,
+ int lane)
+{
+ struct npu3 *npu = dev->npu;
+ uint64_t scom, scom_val;
+
+ scom = npu3_phy_scom(dev, reg, lane);
+ xscom_read(npu->chip_id, scom, &scom_val);
+
+ return GETFIELD(reg->mask, scom_val);
+}
+
+static inline void npu3_phy_write(struct npu3_dev *dev,
+ struct npu3_phy_reg *reg,
+ uint64_t val)
+{
+ npu3_phy_write_lane(dev, reg, -1, val);
+}
+
+static inline uint64_t npu3_phy_read(struct npu3_dev *dev,
+ struct npu3_phy_reg *reg)
+{
+ return npu3_phy_read_lane(dev, reg, -1);
+}
+
+struct procedure {
+ const char *name;
+ uint32_t (*steps[])(struct npu3_dev *);
+};
+
+#define DEFINE_PROCEDURE(NAME, STEPS...) \
+static struct procedure procedure_##NAME = { \
+ .name = #NAME, \
+ .steps = { NAME, ##STEPS } \
+}
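+
+/*
+ * Illustrative note (not part of the original patch): for instance,
+ * DEFINE_PROCEDURE(stop) expands to
+ *
+ *   static struct procedure procedure_stop = {
+ *           .name = "stop",
+ *           .steps = { stop },
+ *   };
+ *
+ * and npu3_dev_procedure_run() below keeps calling the steps in order
+ * while they return NPU3_PROC_NEXT.
+ */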
+
+static uint32_t stop(struct npu3_dev *npu_dev __unused)
+{
+ return NPU3_PROC_COMPLETE | NPU3_PROC_ABORTED;
+}
+
+DEFINE_PROCEDURE(stop);
+
+static uint32_t nop(struct npu3_dev *npu_dev __unused)
+{
+ return NPU3_PROC_COMPLETE;
+}
+
+DEFINE_PROCEDURE(nop);
+
+static void set_iovalid(struct npu3_dev *dev, bool raise)
+{
+ struct npu3 *npu = dev->npu;
+ uint64_t reg, val;
+
+ reg = OB_CPLT_CONF1(dev->ob_chiplet);
+
+ xscom_read(npu->chip_id, reg, &val);
+ val = SETFIELD(OB_CPLT_CONF1_NV_IOVALID(dev->index), val, raise);
+ xscom_write(npu->chip_id, reg, val);
+}
+
+#define NPU3_PHY_LANES 24
+
+#define npu3_for_each_lane(lane, dev) \
+ for (lane = 0; lane < NPU3_PHY_LANES; lane++) \
+ if (dev->phy_lane_mask & PPC_BIT32(lane)) \
+
+static uint32_t phy_reset(struct npu3_dev *dev)
+{
+ uint32_t lane;
+
+ set_iovalid(dev, false);
+
+ npu3_for_each_lane(lane, dev)
+ npu3_phy_write_lane(dev, &NPU3_PHY_RX_RUN_LANE, lane, 0);
+
+ return NPU3_PROC_NEXT;
+}
+
+static uint32_t phy_reset_wait(struct npu3_dev *dev)
+{
+ int lane;
+
+ /* Wait for all lanes to become inactive */
+ npu3_for_each_lane(lane, dev)
+ if (npu3_phy_read_lane(dev, &NPU3_PHY_RX_LANE_BUSY, lane))
+ return NPU3_PROC_INPROGRESS;
+
+ npu3_for_each_lane(lane, dev) {
+ /* Set lane in reset */
+ npu3_phy_write_lane(dev, &NPU3_PHY_RX_IORESET, lane, 1);
+ npu3_phy_write_lane(dev, &NPU3_PHY_TX_IORESET, lane, 1);
+
+ /* Release lane from reset */
+ npu3_phy_write_lane(dev, &NPU3_PHY_RX_IORESET, lane, 0);
+ npu3_phy_write_lane(dev, &NPU3_PHY_TX_IORESET, lane, 0);
+
+ /* Reset the phase rotator */
+ npu3_phy_write_lane(dev, &NPU3_PHY_RX_PR_RESET, lane, 1);
+ npu3_phy_write_lane(dev, &NPU3_PHY_RX_PR_RESET, lane, 0);
+ }
+
+ return NPU3_PROC_NEXT;
+}
+
+/* Procedure 1.2.3 - Initialise I/O PHY Registers */
+static uint32_t phy_reset_complete(struct npu3_dev *dev)
+{
+ int lane;
+
+ npu3_for_each_lane(lane, dev) {
+ npu3_phy_write_lane(dev, &NPU3_PHY_RX_LANE_ANA_PDWN, lane, 0);
+ npu3_phy_write_lane(dev, &NPU3_PHY_RX_LANE_DIG_PDWN, lane, 0);
+ npu3_phy_write_lane(dev, &NPU3_PHY_RX_PR_PHASE_STEP, lane, 0xc);
+ npu3_phy_write_lane(dev, &NPU3_PHY_TX_LANE_PDWN, lane, 0);
+ npu3_phy_write_lane(dev, &NPU3_PHY_RX_PR_FW_INERTIA_AMT, lane, 4);
+ npu3_phy_write_lane(dev, &NPU3_PHY_RX_CFG_LTE_MC, lane, 3);
+ npu3_phy_write_lane(dev, &NPU3_PHY_RX_A_INTEG_COARSE_GAIN, lane, 11);
+ npu3_phy_write_lane(dev, &NPU3_PHY_RX_B_INTEG_COARSE_GAIN, lane, 11);
+ npu3_phy_write_lane(dev, &NPU3_PHY_RX_E_INTEG_COARSE_GAIN, lane, 11);
+ }
+
+ set_iovalid(dev, true);
+
+ return NPU3_PROC_COMPLETE;
+}
+
+DEFINE_PROCEDURE(phy_reset, phy_reset_wait, phy_reset_complete);
+
+/* Procedure 1.2.6 - I/O PHY Tx Impedance Calibration */
+static uint32_t phy_tx_zcal(struct npu3_dev *dev)
+{
+ if (dev->npu->tx_zcal_complete)
+ return NPU3_PROC_COMPLETE;
+
+ /* Turn off SW enable and enable zcal state machine */
+ npu3_phy_write(dev, &NPU3_PHY_TX_ZCAL_SWO_EN, 0);
+
+ /* Start impedance calibration state machine */
+ npu3_phy_write(dev, &NPU3_PHY_TX_ZCAL_REQ, 1);
+
+ return NPU3_PROC_NEXT;
+}
+
+static uint32_t phy_tx_zcal_wait(struct npu3_dev *dev)
+{
+ if (npu3_phy_read(dev, &NPU3_PHY_TX_ZCAL_ERROR))
+ return NPU3_PROC_COMPLETE | NPU3_PROC_FAILED;
+
+ if (!npu3_phy_read(dev, &NPU3_PHY_TX_ZCAL_DONE))
+ return NPU3_PROC_INPROGRESS;
+
+ return NPU3_PROC_NEXT;
+}
+
+#define MARGIN_RATIO 0
+#define FFE_PRE_COEFF 0
+#define FFE_POST_COEFF 0
+
+#define PRE_WIDTH 5
+#define POST_WIDTH 7
+#define MAIN_WIDTH 7
+#define ZCAL_MIN (16 * 2)
+#define ZCAL_MAX (33 * 2)
+#define PRECURSOR_X2_MAX (4 * 2 + 1)
+#define POSTCURSOR_X2_MAX (6 * 2 + 1)
+#define MARGIN_X2_MAX (8 * 2)
+#define MAIN_X2_MAX (6 * 2 + 1)
+#define TOTAL_X2_MAX (PRECURSOR_X2_MAX + POSTCURSOR_X2_MAX + \
+ 2 * MARGIN_X2_MAX + MAIN_X2_MAX)
+
+static uint32_t therm(uint32_t dec)
+{
+ return (0x1 << dec) - 1;
+}
+
+static uint32_t therm_with_half(uint32_t dec, uint8_t width)
+{
+ /* If the LSB of the 2r equivalent is on, then we need to set the 2r bit (MSB) */
+ uint32_t half_on = (dec & 0x1) << (width - 1);
+
+ /* Shift the 2r equivalent to a 1r value and convert to a thermometer code. */
+ uint32_t x1_equiv = ((1 << (dec >> 1)) - 1);
+
+ /* Combine 1r equivalent thermometer code + the 2r MSB value. */
+ return half_on | x1_equiv;
+}
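+
+/*
+ * Minimal sketch (not part of the original patch; the helper name is
+ * hypothetical): worked values for the encoders above. therm(3) yields
+ * 0b111; therm_with_half(5, 5) sets the half-strength MSB (0b10000) plus
+ * a 1R thermometer code of 0b00011, giving 0b10011.
+ */
+static void __unused npu3_therm_example(void)
+{
+ assert(therm(3) == 0x7);
+ assert(therm_with_half(5, 5) == 0x13);
+}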
+
+static uint32_t phy_tx_zcal_calculate(struct npu3_dev *dev)
+{
+ int p_value, n_value;
+ uint32_t zcal_n;
+ uint32_t zcal_p;
+ uint32_t p_main_enable = MAIN_X2_MAX;
+ uint32_t p_margin_pu_enable = MARGIN_X2_MAX;
+ uint32_t p_margin_pd_enable = MARGIN_X2_MAX;
+ uint32_t p_precursor_select;
+ uint32_t p_postcursor_select;
+ uint32_t margin_pu_select;
+ uint32_t n_main_enable = MAIN_X2_MAX;
+ uint32_t n_margin_pu_enable = MARGIN_X2_MAX;
+ uint32_t n_margin_pd_enable = MARGIN_X2_MAX;
+ uint32_t n_precursor_select;
+ uint32_t n_postcursor_select;
+ uint32_t margin_pd_select;
+ uint32_t margin_select;
+
+ /* Convert the value from 8R to 2R by / 4 */
+ zcal_n = npu3_phy_read(dev, &NPU3_PHY_TX_ZCAL_N) / 4;
+ zcal_p = npu3_phy_read(dev, &NPU3_PHY_TX_ZCAL_P) / 4;
+
+ /*
+ * If the hardware detects an unexpected condition, it's better
+ * just to fail loudly.
+ */
+ if (zcal_n < ZCAL_MIN || zcal_n > ZCAL_MAX ||
+ zcal_p < ZCAL_MIN || zcal_p > ZCAL_MAX)
+ return NPU3_PROC_COMPLETE | NPU3_PROC_FAILED;
+
+ p_value = zcal_p - TOTAL_X2_MAX;
+ p_precursor_select = p_value * FFE_PRE_COEFF / 128;
+ p_postcursor_select = p_value * FFE_POST_COEFF / 128;
+ margin_pu_select = p_value * MARGIN_RATIO / 256;
+
+ if (p_value % 2) {
+ p_main_enable--;
+ p_value++;
+ }
+
+ while (p_value < 0) {
+ if (p_main_enable > 1) {
+ p_main_enable -= 2;
+ } else if (p_margin_pu_enable + p_margin_pd_enable > 0) {
+ if (p_margin_pu_enable == p_margin_pd_enable)
+ p_margin_pd_enable -= 2;
+ else
+ p_margin_pu_enable -= 2;
+ }
+ p_value += 2;
+ }
+
+ n_value = zcal_n - TOTAL_X2_MAX;
+ n_precursor_select = n_value * FFE_PRE_COEFF / 128;
+ n_postcursor_select = n_value * FFE_POST_COEFF / 128;
+ margin_pd_select = p_value * MARGIN_RATIO / 256;
+
+ if (n_value % 2) {
+ n_main_enable--;
+ n_value++;
+ }
+
+ while (n_value < 0) {
+ if (n_main_enable > 1) {
+ n_main_enable -= 2;
+ } else if (n_margin_pu_enable + n_margin_pd_enable > 0) {
+ if (n_margin_pu_enable == n_margin_pd_enable)
+ n_margin_pd_enable -= 2;
+ else
+ n_margin_pu_enable -= 2;
+ }
+ n_value += 2;
+ }
+
+ margin_select = therm((margin_pu_select + 1) / 2) &
+ therm((margin_pd_select + 1) / 2) &
+ therm((p_margin_pu_enable + 1) / 2) &
+ therm((p_margin_pd_enable + 1) / 2) &
+ therm((n_margin_pu_enable + 1) / 2) &
+ therm((n_margin_pd_enable + 1) / 2);
+
+ npu3_phy_write(dev, &NPU3_PHY_TX_PSEG_PRE_EN, therm_with_half(PRECURSOR_X2_MAX, PRE_WIDTH));
+ npu3_phy_write(dev, &NPU3_PHY_TX_PSEG_PRE_SELECT, therm_with_half(p_precursor_select, PRE_WIDTH));
+ npu3_phy_write(dev, &NPU3_PHY_TX_PSEG_POST_EN, therm_with_half(POSTCURSOR_X2_MAX, POST_WIDTH));
+ npu3_phy_write(dev, &NPU3_PHY_TX_PSEG_POST_SELECT, therm_with_half(p_postcursor_select, POST_WIDTH));
+ npu3_phy_write(dev, &NPU3_PHY_TX_PSEG_MARGINPU_EN, therm((p_margin_pu_enable + 1) / 2));
+ npu3_phy_write(dev, &NPU3_PHY_TX_PSEG_MARGINPD_EN, therm((p_margin_pd_enable + 1) / 2));
+ npu3_phy_write(dev, &NPU3_PHY_TX_PSEG_MAIN_EN, therm_with_half(p_main_enable, MAIN_WIDTH));
+
+ npu3_phy_write(dev, &NPU3_PHY_TX_NSEG_PRE_EN, therm_with_half(PRECURSOR_X2_MAX, PRE_WIDTH));
+ npu3_phy_write(dev, &NPU3_PHY_TX_NSEG_PRE_SELECT, therm_with_half(n_precursor_select, PRE_WIDTH));
+ npu3_phy_write(dev, &NPU3_PHY_TX_NSEG_POST_EN, therm_with_half(POSTCURSOR_X2_MAX, POST_WIDTH));
+ npu3_phy_write(dev, &NPU3_PHY_TX_NSEG_POST_SELECT, therm_with_half(n_postcursor_select, POST_WIDTH));
+ npu3_phy_write(dev, &NPU3_PHY_TX_NSEG_MARGINPU_EN, therm((n_margin_pu_enable + 1) / 2));
+ npu3_phy_write(dev, &NPU3_PHY_TX_NSEG_MARGINPD_EN, therm((n_margin_pd_enable + 1) / 2));
+ npu3_phy_write(dev, &NPU3_PHY_TX_NSEG_MAIN_EN, therm_with_half(n_main_enable, MAIN_WIDTH));
+
+ npu3_phy_write(dev, &NPU3_PHY_TX_MARGINPU_SELECT, therm(margin_select + 1) / 2);
+ npu3_phy_write(dev, &NPU3_PHY_TX_MARGINPD_SELECT, therm(margin_select + 1) / 2);
+
+ dev->npu->tx_zcal_complete = true;
+
+ return NPU3_PROC_COMPLETE;
+}
+
+DEFINE_PROCEDURE(phy_tx_zcal, phy_tx_zcal_wait, phy_tx_zcal_calculate);
+
+/* Procedure 1.2.4 - I/O PHY DC Calibration */
+static uint32_t phy_rx_dccal(struct npu3_dev *dev)
+{
+ int lane;
+
+ set_iovalid(dev, false);
+
+ npu3_for_each_lane(lane, dev)
+ npu3_phy_write_lane(dev, &NPU3_PHY_RX_PR_FW_OFF, lane, 1);
+
+ npu3_for_each_lane(lane, dev)
+ npu3_phy_write_lane(dev, &NPU3_PHY_RX_RUN_DCCAL, lane, 1);
+
+ return NPU3_PROC_NEXT;
+}
+
+static uint32_t phy_rx_dccal_complete(struct npu3_dev *dev)
+{
+ int lane;
+
+ npu3_for_each_lane(lane, dev)
+ if (!npu3_phy_read_lane(dev, &NPU3_PHY_RX_DCCAL_DONE, lane))
+ return NPU3_PROC_INPROGRESS;
+
+ npu3_for_each_lane(lane, dev)
+ npu3_phy_write_lane(dev, &NPU3_PHY_RX_RUN_DCCAL, lane, 0);
+
+ npu3_for_each_lane(lane, dev) {
+ npu3_phy_write_lane(dev, &NPU3_PHY_RX_B_BANK_CONTROLS, lane, 0);
+ npu3_phy_write_lane(dev, &NPU3_PHY_RX_PR_EDGE_TRACK_CNTL, lane, 0);
+ npu3_phy_write_lane(dev, &NPU3_PHY_RX_PR_FW_OFF, lane, 0);
+ }
+
+ return NPU3_PROC_NEXT;
+}
+
+/* Procedure 1.2.5 - IO PHY Tx FIFO Init */
+static uint32_t phy_tx_fifo_init(struct npu3_dev *dev)
+{
+ int lane;
+
+ npu3_for_each_lane(lane, dev) {
+ npu3_phy_write_lane(dev, &NPU3_PHY_TX_UNLOAD_CLK_DISABLE, lane, 0);
+ npu3_phy_write_lane(dev, &NPU3_PHY_TX_FIFO_INIT, lane, 1);
+ npu3_phy_write_lane(dev, &NPU3_PHY_TX_UNLOAD_CLK_DISABLE, lane, 1);
+ }
+
+ set_iovalid(dev, true);
+
+ return NPU3_PROC_COMPLETE;
+}
+
+DEFINE_PROCEDURE(phy_rx_dccal, phy_rx_dccal_complete, phy_tx_fifo_init);
+
+/* Procedure 1.2.8 - Enable Downstream Link Training */
+static uint32_t phy_enable_tx_rxcal(struct npu3_dev *dev)
+{
+ int lane;
+
+ npu3_for_each_lane(lane, dev)
+ npu3_phy_write_lane(dev, &NPU3_PHY_TX_RXCAL, lane, 1);
+
+ return NPU3_PROC_COMPLETE;
+}
+DEFINE_PROCEDURE(phy_enable_tx_rxcal);
+
+/* Procedure 1.2.9 - Disable Downstream Link Training */
+static uint32_t phy_disable_tx_rxcal(struct npu3_dev *dev)
+{
+ int lane;
+
+ npu3_for_each_lane(lane, dev)
+ npu3_phy_write_lane(dev, &NPU3_PHY_TX_RXCAL, lane, 0);
+
+ return NPU3_PROC_COMPLETE;
+}
+DEFINE_PROCEDURE(phy_disable_tx_rxcal);
+
+/* Procedure 1.2.7 - I/O PHY Upstream Link Training */
+static uint32_t phy_rx_training(struct npu3_dev *dev)
+{
+ int lane;
+
+ npu3_for_each_lane(lane, dev)
+ npu3_phy_write_lane(dev, &NPU3_PHY_RX_RUN_LANE, lane, 1);
+
+ return NPU3_PROC_NEXT;
+}
+
+static uint32_t phy_rx_training_wait(struct npu3_dev *dev)
+{
+ int lane;
+
+ npu3_for_each_lane(lane, dev)
+ if (!npu3_phy_read_lane(dev, &NPU3_PHY_RX_INIT_DONE, lane))
+ return NPU3_PROC_INPROGRESS;
+
+ return NPU3_PROC_COMPLETE;
+}
+
+DEFINE_PROCEDURE(phy_rx_training, phy_rx_training_wait);
+
+static void npu3_dev_fence_set(struct npu3_dev *dev, uint8_t state)
+{
+ struct npu3 *npu = dev->npu;
+ uint64_t val;
+
+ val = npu3_read(npu, NPU3_NTL_MISC_CFG1(dev->index));
+ val = SETFIELD(NPU3_NTL_MISC_CFG1_NTL_RESET, val, state);
+ npu3_write(npu, NPU3_NTL_MISC_CFG1(dev->index), val);
+}
+
+static uint8_t npu3_dev_fence_get(struct npu3_dev *dev)
+{
+ uint64_t val;
+
+ val = npu3_read(dev->npu, NPU3_NTL_CQ_FENCE_STATUS(dev->index));
+ return GETFIELD(NPU3_NTL_CQ_FENCE_STATUS_FIELD, val);
+}
+
+/* Procedure 1.2.1 - Reset NPU/NDL */
+static uint32_t reset_ntl(struct npu3_dev *dev)
+{
+ struct npu3 *npu = dev->npu;
+ uint64_t val;
+ int lane;
+
+ set_iovalid(dev, true);
+
+ /* Power on clocks */
+ npu3_phy_write(dev, &NPU3_PHY_RX_CLKDIST_PDWN, 0);
+ npu3_phy_write(dev, &NPU3_PHY_RX_IREF_PDWN, 1);
+ npu3_phy_write(dev, &NPU3_PHY_TX_CLKDIST_PDWN, 0);
+ npu3_phy_write(dev, &NPU3_PHY_RX_CTL_DATASM_CLKDIST_PDWN, 0);
+
+ npu3_for_each_lane(lane, dev) {
+ npu3_phy_write_lane(dev, &NPU3_PHY_RX_LANE_ANA_PDWN, lane, 0);
+ npu3_phy_write_lane(dev, &NPU3_PHY_RX_LANE_DIG_PDWN, lane, 0);
+ npu3_phy_write_lane(dev, &NPU3_PHY_TX_LANE_PDWN, lane, 0);
+ }
+
+ /* Write PRI */
+ val = SETFIELD(NPU3_NTL_PRI_CFG_NDL, 0ull, dev->index);
+ npu3_write(npu, NPU3_NTL_PRI_CFG(dev->index), val);
+
+ /* Disable parity checking */
+ val = npu3_read(npu, NPU3_NTL_MISC_CFG2(dev->index));
+ val &= ~(NPU3_NTL_MISC_CFG2_NDL_RX_PARITY_ENA |
+ NPU3_NTL_MISC_CFG2_NDL_TX_PARITY_ENA |
+ NPU3_NTL_MISC_CFG2_NDL_PRI_PARITY_ENA);
+ npu3_write(npu, NPU3_NTL_MISC_CFG2(dev->index), val);
+
+ if (dev->type == NPU3_DEV_TYPE_NVLINK)
+ npu3_pvd_flag_clear(dev, NPU3_DEV_DL_RESET);
+
+ npu3_dev_fence_set(dev, NPU3_NTL_CQ_FENCE_STATUS_FULL);
+
+ return NPU3_PROC_NEXT;
+}
+
+static uint32_t reset_ndl(struct npu3_dev *dev)
+{
+ struct npu3 *npu = dev->npu;
+ uint64_t reg;
+ uint32_t val32;
+
+ if (npu3_dev_fence_get(dev) != NPU3_NTL_CQ_FENCE_STATUS_FULL)
+ return NPU3_PROC_INPROGRESS;
+
+ reg = NPU3_DLPL_CTL(dev->index);
+ val32 = npu3_read_4b(npu, reg);
+ val32 |= NPU3_DLPL_CTL_RESET_RX | NPU3_DLPL_CTL_RESET_MISC;
+ npu3_write_4b(npu, reg, val32);
+
+ val32 = npu3_read_4b(npu, reg);
+ val32 &= ~(NPU3_DLPL_CTL_RESET_RX | NPU3_DLPL_CTL_RESET_MISC);
+ npu3_write_4b(npu, reg, val32);
+
+ reg = NPU3_DLPL_CFG(dev->index);
+ val32 = NPU3_DLPL_CFG_PRI_BYTESWAP;
+ npu3_write_4b(npu, reg, val32);
+
+ /* Clear FIR bits */
+ for (uint32_t i = 0; i < NPU3_FIR_MAX; i++)
+ xscom_write(npu->chip_id, npu->xscom_base + NPU3_FIR(i), 0ull);
+
+ npu3_dev_fence_set(dev, NPU3_NTL_CQ_FENCE_STATUS_HALF);
+
+ return NPU3_PROC_NEXT;
+}
+
+static uint32_t reset_ntl_release(struct npu3_dev *dev)
+{
+ struct npu3 *npu = dev->npu;
+ uint32_t i = dev->index;
+
+ if (npu3_dev_fence_get(dev) != NPU3_NTL_CQ_FENCE_STATUS_HALF)
+ return NPU3_PROC_INPROGRESS;
+
+ /* Credit setup */
+ npu3_write(npu, NPU3_NTL_CREQ_HDR_CRED_SND(i), 0x0200000000000000);
+ npu3_write(npu, NPU3_NTL_PRB_HDR_CRED_SND(i), 0x0200000000000000);
+ npu3_write(npu, NPU3_NTL_ATR_HDR_CRED_SND(i), 0x0200000000000000);
+ npu3_write(npu, NPU3_NTL_RSP_HDR_CRED_SND(i), 0x0200000000000000);
+ npu3_write(npu, NPU3_NTL_CREQ_DAT_CRED_SND(i), 0x1000000000000000);
+ npu3_write(npu, NPU3_NTL_RSP_DAT_CRED_SND(i), 0x1000000000000000);
+
+ npu3_write(npu, NPU3_NTL_CREQ_HDR_CRED_RCV(i), 0x0000be0000000000);
+ npu3_write(npu, NPU3_NTL_DGD_HDR_CRED_RCV(i), 0x0000640000000000);
+ npu3_write(npu, NPU3_NTL_ATSD_HDR_CRED_RCV(i), 0x0000200000000000);
+ npu3_write(npu, NPU3_NTL_RSP_HDR_CRED_RCV(i), 0x0000be0000000000);
+ npu3_write(npu, NPU3_NTL_CREQ_DAT_CRED_RCV(i), 0x0001000000000000);
+ npu3_write(npu, NPU3_NTL_RSP_DAT_CRED_RCV(i), 0x0001000000000000);
+
+ npu3_dev_fence_set(dev, NPU3_NTL_CQ_FENCE_STATUS_NONE);
+
+ return NPU3_PROC_NEXT;
+}
+
+static uint32_t reset_ntl_finish(struct npu3_dev *dev) {
+ struct npu3 *npu = dev->npu;
+ uint64_t val;
+
+ if (npu3_dev_fence_get(dev) != NPU3_NTL_CQ_FENCE_STATUS_NONE)
+ return NPU3_PROC_INPROGRESS;
+
+ /* Enable parity checking */
+ val = npu3_read(npu, NPU3_NTL_MISC_CFG2(dev->index));
+ val |= NPU3_NTL_MISC_CFG2_NDL_RX_PARITY_ENA |
+ NPU3_NTL_MISC_CFG2_NDL_TX_PARITY_ENA |
+ NPU3_NTL_MISC_CFG2_NDL_PRI_PARITY_ENA;
+ npu3_write(npu, NPU3_NTL_MISC_CFG2(dev->index), val);
+
+ if (dev->type == NPU3_DEV_TYPE_NVLINK)
+ npu3_pvd_flag_set(dev, NPU3_DEV_DL_RESET);
+
+ return NPU3_PROC_COMPLETE;
+}
+
+DEFINE_PROCEDURE(reset_ntl, reset_ndl, reset_ntl_release, reset_ntl_finish);
+
+static int npu3_dev_regcmp(struct npu3_dev *dev, uint64_t reg,
+ const char *reg_name, uint64_t expected)
+{
+ uint64_t val;
+
+ val = npu3_read(dev->npu, reg);
+ if (val == expected)
+ return 0;
+
+ NPU3DEVERR(dev, "%s: expected 0x%llx, read 0x%llx\n",
+ reg_name, expected, val);
+
+ return 1;
+}
+
+#define REGCMP(reg, expected) \
+ npu3_dev_regcmp(dev, reg(dev->index), #reg, expected)
+
+static uint32_t check_credits(struct npu3_dev *dev)
+{
+ /* Use bitwise OR to prevent short-circuit evaluation */
+ if (REGCMP(NPU3_NTL_CREQ_HDR_CRED_RCV, 0x0be0be0000000000ull) |
+ REGCMP(NPU3_NTL_DGD_HDR_CRED_RCV, 0x0640640000000000ull) |
+ REGCMP(NPU3_NTL_ATSD_HDR_CRED_RCV, 0x0200200000000000ull) |
+ REGCMP(NPU3_NTL_RSP_HDR_CRED_RCV, 0x0be0be0000000000ull) |
+ REGCMP(NPU3_NTL_CREQ_DAT_CRED_RCV, 0x1001000000000000ull) |
+ REGCMP(NPU3_NTL_RSP_DAT_CRED_RCV, 0x1001000000000000ull))
+ return NPU3_PROC_COMPLETE | NPU3_PROC_FAILED;
+
+ return NPU3_PROC_COMPLETE;
+}
+
+DEFINE_PROCEDURE(check_credits);
+
+static struct procedure *procedures[] = {
+ [0] = &procedure_stop,
+ [1] = &procedure_nop,
+ [4] = &procedure_phy_reset,
+ [5] = &procedure_phy_tx_zcal,
+ [6] = &procedure_phy_rx_dccal,
+ [7] = &procedure_phy_enable_tx_rxcal,
+ [8] = &procedure_phy_disable_tx_rxcal,
+ [9] = &procedure_phy_rx_training,
+ [10] = &procedure_reset_ntl,
+ [11] = &procedure_nop, /* Placeholder for pre-terminate */
+ [12] = &procedure_nop, /* Placeholder for terminate */
+ [13] = &procedure_check_credits,
+};
+
+void npu3_dev_procedure_init(struct npu3_dev *dev, uint32_t pnum)
+{
+ struct npu3_procedure *proc = &dev->proc;
+ const char *name;
+
+ if (pnum >= ARRAY_SIZE(procedures) || !procedures[pnum]) {
+ NPU3DEVERR(dev, "Unsupported procedure number %d\n", pnum);
+ proc->status = NPU3_PROC_COMPLETE | NPU3_PROC_UNSUPPORTED;
+ return;
+ }
+
+ name = procedures[pnum]->name;
+
+ if (proc->number == pnum && !(proc->status & NPU3_PROC_COMPLETE))
+ NPU3DEVINF(dev, "Restarting procedure %s\n", name);
+ else
+ NPU3DEVINF(dev, "Starting procedure %s\n", name);
+
+ proc->status = NPU3_PROC_INPROGRESS;
+ proc->number = pnum;
+ proc->step = 0;
+ proc->timeout = mftb() + msecs_to_tb(1000);
+}
+
+static uint32_t npu3_dev_procedure_run_step(struct npu3_dev *dev)
+{
+ struct npu3_procedure *proc = &dev->proc;
+ uint32_t result;
+
+ result = procedures[proc->number]->steps[proc->step](dev);
+ if (result & NPU3_PROC_NEXT) {
+ proc->step++;
+
+ NPU3DEVINF(dev, "Running procedure %s step %d\n",
+ procedures[proc->number]->name, proc->step);
+ }
+
+ return result;
+}
+
+static void npu3_dev_procedure_run(struct npu3_dev *dev)
+{
+ struct npu3_procedure *proc = &dev->proc;
+ const char *name;
+ uint32_t result;
+
+ do {
+ result = npu3_dev_procedure_run_step(dev);
+ } while (result & NPU3_PROC_NEXT);
+
+ name = procedures[proc->number]->name;
+
+ if (result & NPU3_PROC_COMPLETE) {
+ NPU3DEVINF(dev, "Procedure %s complete\n", name);
+ } else if (tb_compare(mftb(), proc->timeout) == TB_AAFTERB) {
+ NPU3DEVINF(dev, "Procedure %s timed out\n", name);
+ result = NPU3_PROC_COMPLETE | NPU3_PROC_FAILED;
+ }
+
+ /* Mask off internal state bits */
+ proc->status = result & NPU3_PROC_STATUS_MASK;
+}
+
+uint32_t npu3_dev_procedure_status(struct npu3_dev *dev)
+{
+ /* Run the procedure if not already complete */
+ if (!(dev->proc.status & NPU3_PROC_COMPLETE))
+ npu3_dev_procedure_run(dev);
+
+ return dev->proc.status;
+}
+
+int64_t npu3_dev_reset(struct npu3_dev *dev)
+{
+ unsigned long timeout;
+
+ reset_ntl(dev);
+ timeout = mftb() + msecs_to_tb(1000);
+
+ while (npu3_dev_fence_get(dev) != NPU3_NTL_CQ_FENCE_STATUS_FULL) {
+ if (tb_compare(mftb(), timeout) == TB_AAFTERB) {
+ NPU3DEVINF(dev, "Device reset timed out\n");
+ return OPAL_BUSY;
+ }
+ }
+
+ return OPAL_SUCCESS;
+}
diff --git a/roms/skiboot/hw/npu3-nvlink.c b/roms/skiboot/hw/npu3-nvlink.c
new file mode 100644
index 000000000..920864b32
--- /dev/null
+++ b/roms/skiboot/hw/npu3-nvlink.c
@@ -0,0 +1,1828 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Copyright 2019 IBM Corp.
+ */
+
+#include <skiboot.h>
+#include <device.h>
+#include <phys-map.h>
+#include <npu3.h>
+#include <npu3-regs.h>
+#include <pci-virt.h>
+#include <xscom.h>
+#include <xscom-p9-regs.h>
+#include <interrupts.h>
+#include <pci-cfg.h>
+#include <pci-slot.h>
+#include <cache-p9.h>
+
+#define NPU3LOG(l, npu, fmt, a...) \
+ prlog(l, "NPU#%04x[%d:%d]: " fmt, \
+ (npu)->nvlink.phb.opal_id, \
+ (npu)->chip_id, \
+ (npu)->index, ##a)
+#define NPU3DBG(npu, fmt, a...) NPU3LOG(PR_DEBUG, npu, fmt, ##a)
+#define NPU3INF(npu, fmt, a...) NPU3LOG(PR_INFO, npu, fmt, ##a)
+#define NPU3ERR(npu, fmt, a...) NPU3LOG(PR_ERR, npu, fmt, ##a)
+
+#define NPU3DEVLOG(l, dev, fmt, a...) \
+ prlog(l, "NPU#%04x:%02x:%02x.%x " fmt, \
+ (dev)->npu->nvlink.phb.opal_id, \
+ PCI_BUS_NUM((dev)->nvlink.pvd->bdfn), \
+ PCI_DEV((dev)->nvlink.pvd->bdfn), \
+ PCI_FUNC((dev)->nvlink.pvd->bdfn), ##a)
+#define NPU3DEVDBG(dev, fmt, a...) NPU3DEVLOG(PR_DEBUG, dev, fmt, ##a)
+#define NPU3DEVINF(dev, fmt, a...) NPU3DEVLOG(PR_INFO, dev, fmt, ##a)
+#define NPU3DEVERR(dev, fmt, a...) NPU3DEVLOG(PR_ERR, dev, fmt, ##a)
+
+#define NPU3_CFG_READ(size, type) \
+static int64_t npu3_cfg_read##size(struct phb *phb, uint32_t bdfn, \
+ uint32_t offset, type *data) \
+{ \
+ uint32_t val; \
+ int64_t ret; \
+ \
+ ret = pci_virt_cfg_read(phb, bdfn, offset, \
+ sizeof(*data), &val); \
+ *data = (type)val; \
+ return ret; \
+}
+
+#define NPU3_CFG_WRITE(size, type) \
+static int64_t npu3_cfg_write##size(struct phb *phb, uint32_t bdfn, \
+ uint32_t offset, type data) \
+{ \
+ uint32_t val = data; \
+ int64_t ret; \
+ \
+ ret = pci_virt_cfg_write(phb, bdfn, offset, \
+ sizeof(data), val); \
+ return ret; \
+}
+
+NPU3_CFG_READ(8, u8);
+NPU3_CFG_READ(16, u16);
+NPU3_CFG_READ(32, u32);
+NPU3_CFG_WRITE(8, u8);
+NPU3_CFG_WRITE(16, u16);
+NPU3_CFG_WRITE(32, u32);
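+
+/*
+ * Illustrative note (not part of the original patch): each instantiation
+ * above generates a config accessor of the form
+ *
+ *   static int64_t npu3_cfg_read32(struct phb *phb, uint32_t bdfn,
+ *                                  uint32_t offset, u32 *data);
+ *
+ * which forwards to pci_virt_cfg_read()/pci_virt_cfg_write() with the
+ * access size taken from the data type.
+ */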
+
+static int64_t npu3_eeh_freeze_status(struct phb *phb __unused,
+ uint64_t pe_num __unused,
+ uint8_t *freeze_state,
+ uint16_t *pci_error_type,
+ uint16_t *severity)
+{
+ /*
+ * FIXME: When it's called by skiboot PCI config accessor,
+ * the PE number is fixed to 0, which is incorrect. We need to
+ * introduce another PHB callback to translate it. For now,
+ * it keeps the skiboot PCI enumeration going.
+ */
+ *freeze_state = OPAL_EEH_STOPPED_NOT_FROZEN;
+ *pci_error_type = OPAL_EEH_NO_ERROR;
+
+ if (severity)
+ *severity = OPAL_EEH_SEV_NO_ERROR;
+
+ return OPAL_SUCCESS;
+}
+
+/* Number of PEs supported */
+#define NPU3_MAX_PE_NUM 16
+#define NPU3_RESERVED_PE_NUM 15
+
+static int64_t npu3_ioda_reset(struct phb *phb, bool purge __unused)
+{
+ struct npu3 *npu = npu3_phb_to_npu(phb);
+ uint64_t val;
+
+ val = NPU3_ATS_IODA_ADDR_AUTO_INC;
+ val = SETFIELD(NPU3_ATS_IODA_ADDR_TBL_SEL, val,
+ NPU3_ATS_IODA_ADDR_TBL_TVT);
+ npu3_write(npu, NPU3_ATS_IODA_ADDR, val);
+
+ for (uint32_t i = 0; i < NPU3_MAX_PE_NUM; i++)
+ npu3_write(npu, NPU3_ATS_IODA_DATA, 0ull);
+
+ return OPAL_SUCCESS;
+}
+
+static inline void npu3_ioda_sel(struct npu3 *npu, uint32_t table,
+ uint32_t index)
+{
+ uint64_t val;
+
+ val = SETFIELD(NPU3_ATS_IODA_ADDR_TBL_SEL, 0ull, table);
+ val = SETFIELD(NPU3_ATS_IODA_ADDR_TBL_ADDR, val, index);
+ npu3_write(npu, NPU3_ATS_IODA_ADDR, val);
+}
+
+static int64_t npu3_map_pe_dma_window(struct phb *phb,
+ uint64_t pe_num,
+ uint16_t window_id,
+ uint16_t tce_levels,
+ uint64_t tce_table_addr,
+ uint64_t tce_table_size,
+ uint64_t tce_page_size)
+{
+ struct npu3 *npu = npu3_phb_to_npu(phb);
+ uint64_t tts_encoded, val;
+ uint32_t page_size;
+
+ /* Each PE has one corresponding TVE */
+ if (window_id != pe_num || pe_num >= NPU3_MAX_PE_NUM)
+ return OPAL_PARAMETER;
+
+ npu3_ioda_sel(npu, NPU3_ATS_IODA_ADDR_TBL_TVT, pe_num);
+
+ /* TCE table size zero is used to disable the TVE */
+ if (!tce_table_size) {
+ npu3_write(npu, NPU3_ATS_IODA_DATA, 0ull);
+ return OPAL_SUCCESS;
+ }
+
+ /* TCE table size */
+ if (!is_pow2(tce_table_size) || tce_table_size < 0x1000)
+ return OPAL_PARAMETER;
+
+ tts_encoded = ilog2(tce_table_size) - 11;
+ if (tts_encoded > 39)
+ return OPAL_PARAMETER;
+
+ val = SETFIELD(NPU3_ATS_IODA_TVT_TABLE_SIZE, 0ull, tts_encoded);
+
+ /* Number of levels */
+ if (tce_levels < 1 || tce_levels > 4)
+ return OPAL_PARAMETER;
+
+ val = SETFIELD(NPU3_ATS_IODA_TVT_TABLE_LEVEL, val, tce_levels - 1);
+
+ /* TCE page size */
+ switch (tce_page_size) {
+ case 256 << 20:
+ page_size = 17;
+ break;
+ case 16 << 20:
+ page_size = 13;
+ break;
+ case 64 << 10:
+ page_size = 5;
+ break;
+ default:
+ page_size = 1;
+ }
+
+ val = SETFIELD(NPU3_ATS_IODA_TVT_PAGE_SIZE, val, page_size);
+ val = SETFIELD(NPU3_ATS_IODA_TVT_XLAT_ADDR, val, tce_table_addr >> 12);
+ npu3_write(npu, NPU3_ATS_IODA_DATA, val);
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t npu3_map_pe_dma_window_real(struct phb *phb,
+ uint64_t pe_num,
+ uint16_t window_id,
+ uint64_t pci_start_addr __unused,
+ uint64_t pci_mem_size __unused)
+{
+ struct npu3 *npu = npu3_phb_to_npu(phb);
+ uint64_t val;
+
+ /* Each PE has one corresponding TVE */
+ if (window_id != pe_num || pe_num >= NPU3_MAX_PE_NUM)
+ return OPAL_PARAMETER;
+
+ if (pci_mem_size) {
+ /*
+ * GPUs need to be able to access the MMIO memory space as well.
+ * On POWER9 this is above the top of RAM, so disable the TVT
+ * range check, allowing access to all memory addresses.
+ */
+ val = 0;
+ } else {
+ /* Disable */
+ val = PPC_BIT(51);
+ }
+
+ npu3_ioda_sel(npu, NPU3_ATS_IODA_ADDR_TBL_TVT, pe_num);
+ npu3_write(npu, NPU3_ATS_IODA_DATA, val);
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t npu3_next_error(struct phb *phb,
+ uint64_t *first_frozen_pe,
+ uint16_t *pci_error_type,
+ uint16_t *severity)
+{
+ struct npu3 *npu = npu3_phb_to_npu(phb);
+ uint64_t val;
+ uint32_t pe_num;
+
+ if (!first_frozen_pe || !pci_error_type || !severity)
+ return OPAL_PARAMETER;
+
+ *first_frozen_pe = -1;
+ *pci_error_type = OPAL_EEH_NO_ERROR;
+ *severity = OPAL_EEH_SEV_NO_ERROR;
+
+ for (pe_num = 0; pe_num < NPU3_MAX_PE_NUM; pe_num++) {
+ val = npu3_read(npu, NPU3_MISC_PESTB_DATA(pe_num));
+ if (!GETFIELD(NPU3_MISC_PESTB_DATA_DMA_STOPPED_STATE, val))
+ continue;
+
+ *first_frozen_pe = pe_num;
+ *pci_error_type = OPAL_EEH_PE_ERROR;
+ *severity = OPAL_EEH_SEV_PE_ER;
+ break;
+ }
+
+ return OPAL_SUCCESS;
+}
+
+static struct npu3_dev *npu3_bdfn_to_dev(struct npu3 *npu, uint32_t bdfn)
+{
+ struct pci_virt_device *pvd;
+
+ /* All emulated devices are attached to root bus */
+ if (bdfn & ~0xff)
+ return NULL;
+
+ pvd = pci_virt_find_device(&npu->nvlink.phb, bdfn);
+ if (pvd)
+ return pvd->data;
+
+ return NULL;
+}
+
+static int npu3_match_gpu(struct phb *phb __unused, struct pci_device *pd,
+ void *data)
+{
+ const char *slot = data;
+ struct dt_node *dn;
+ char *loc_code;
+
+ /* Ignore non-NVIDIA devices */
+ if (PCI_VENDOR_ID(pd->vdid) != 0x10de)
+ return 0;
+
+ /* Find the PCI device's slot location */
+ for (dn = pd->dn;
+ dn && !dt_find_property(dn, "ibm,loc-code");
+ dn = dn->parent);
+
+ if (!dn)
+ return 0;
+
+ loc_code = (char *)dt_prop_get(dn, "ibm,loc-code");
+ if (streq(loc_code, slot))
+ return 1;
+
+ return 0;
+}
+
+static void npu3_dev_find_gpu(struct npu3_dev *dev)
+{
+ const char *slot = dev->nvlink.loc_code;
+ struct phb *phb;
+ struct pci_device *gpu;
+
+ if (!slot)
+ return;
+
+ for_each_phb(phb) {
+ gpu = pci_walk_dev(phb, NULL, npu3_match_gpu, (void *)slot);
+ if (!gpu)
+ continue;
+
+ dev->nvlink.gpu = gpu;
+ return;
+ }
+
+ NPU3DEVINF(dev, "No PCI device found for slot '%s'\n", slot);
+}
+
+#define VENDOR_CAP_START 0x80
+#define VENDOR_CAP_LINK_FLAG_OFFSET 0x0d
+
+void npu3_pvd_flag_set(struct npu3_dev *dev, uint8_t flag)
+{
+ uint32_t offset = VENDOR_CAP_START + VENDOR_CAP_LINK_FLAG_OFFSET;
+ uint32_t flags;
+
+ PCI_VIRT_CFG_RDONLY_RD(dev->nvlink.pvd, offset, 1, &flags);
+ flags |= flag;
+ PCI_VIRT_CFG_INIT_RO(dev->nvlink.pvd, offset, 1, flags);
+}
+
+void npu3_pvd_flag_clear(struct npu3_dev *dev, uint8_t flag)
+{
+ uint32_t offset = VENDOR_CAP_START + VENDOR_CAP_LINK_FLAG_OFFSET;
+ uint32_t flags;
+
+ PCI_VIRT_CFG_RDONLY_RD(dev->nvlink.pvd, offset, 1, &flags);
+ flags &= ~flag;
+ PCI_VIRT_CFG_INIT_RO(dev->nvlink.pvd, offset, 1, flags);
+}
+
+static struct lock npu3_phandle_lock = LOCK_UNLOCKED;
+
+static void npu3_append_phandle(struct dt_node *dn, const char *name,
+ uint32_t phandle)
+{
+ struct dt_property *prop;
+ uint32_t *phandles;
+ size_t len;
+
+ prop = __dt_find_property(dn, name);
+ if (!prop) {
+ dt_add_property_cells(dn, name, phandle);
+ return;
+ }
+
+ /*
+ * Make sure no one else has a reference to the property. Assume
+ * this is the only function that holds a reference to it.
+ */
+ lock(&npu3_phandle_lock);
+
+ /* Need to append to the property */
+ len = prop->len + sizeof(*phandles);
+ dt_resize_property(&prop, len);
+
+ phandles = (uint32_t *)prop->prop;
+ phandles[len / sizeof(*phandles) - 1] = phandle;
+
+ unlock(&npu3_phandle_lock);
+}
+
+static void npu3_dev_fixup_dt(struct npu3_dev *dev)
+{
+ struct pci_device *pd = dev->nvlink.pd;
+ struct pci_device *gpu = dev->nvlink.gpu;
+
+ dt_add_property_cells(pd->dn, "ibm,nvlink", dev->dn->phandle);
+ dt_add_property_string(pd->dn, "ibm,loc-code", dev->nvlink.loc_code);
+ if (dev->link_speed != 0xff)
+ dt_add_property_cells(pd->dn, "ibm,nvlink-speed",
+ lo32(dev->link_speed));
+
+ if (!gpu)
+ return;
+
+ npu3_append_phandle(gpu->dn, "ibm,npu", pd->dn->phandle);
+ dt_add_property_cells(pd->dn, "ibm,gpu", gpu->dn->phandle);
+}
+
+static int64_t npu3_gpu_bridge_sec_bus_reset(void *pdev,
+ struct pci_cfg_reg_filter *pcrf __unused,
+ uint32_t offset, uint32_t len,
+ uint32_t *data, bool write)
+{
+ struct pci_device *pd = pdev;
+ struct pci_device *gpu;
+ struct npu3 *npu;
+ struct npu3_dev *dev;
+ bool purge = false;
+
+ if (!write)
+ return OPAL_PARAMETER;
+
+ if (len != 2 || offset & 1) {
+ PCIERR(pd->phb, pd->bdfn,
+ "Unsupported write to bridge control register\n");
+ return OPAL_PARAMETER;
+ }
+
+ if (!(*data & PCI_CFG_BRCTL_SECONDARY_RESET))
+ return OPAL_PARTIAL;
+
+ gpu = list_top(&pd->children, struct pci_device, link);
+ if (!gpu)
+ return OPAL_PARTIAL;
+
+ npu3_for_each_nvlink_npu(npu)
+ npu3_for_each_nvlink_dev(dev, npu)
+ if (dev->nvlink.gpu == gpu)
+ if (!npu3_dev_reset(dev))
+ purge = true;
+
+ if (purge)
+ purge_l2_l3_caches();
+
+ return OPAL_PARTIAL;
+}
+
+static int npu3_dev_bind(struct phb *phb, struct pci_device *pd,
+ void *data __unused)
+{
+ struct npu3 *npu = npu3_phb_to_npu(phb);
+ struct npu3_dev *dev = npu3_bdfn_to_dev(npu, pd->bdfn);
+ struct pci_device *gpu;
+
+ dev->nvlink.pd = pd;
+
+ /* The slot label indicates which GPU this link is connected to */
+ dev->nvlink.loc_code = dt_prop_get_def(dev->dn, "ibm,slot-label", NULL);
+ if (!dev->nvlink.loc_code) {
+ /**
+ * @fwts-label NPUNoPHBSlotLabel
+ * @fwts-advice No GPU/NPU slot information was found.
+ * NVLink3 functionality will not work.
+ */
+ NPU3DEVERR(dev, "Cannot find GPU slot information\n");
+ }
+
+ npu3_dev_find_gpu(dev);
+ npu3_dev_fixup_dt(dev);
+
+ gpu = dev->nvlink.gpu;
+ if (!gpu)
+ return 0;
+
+ /* When a GPU is reset, ensure all of its links are reset too */
+ if (gpu->parent && gpu->parent->slot)
+ pci_add_cfg_reg_filter(gpu->parent, PCI_CFG_BRCTL, 2,
+ PCI_REG_FLAG_WRITE,
+ npu3_gpu_bridge_sec_bus_reset);
+
+ npu3_pvd_flag_set(dev, NPU3_DEV_PCI_LINKED);
+
+ return 0;
+}
+
+struct npu3 *npu3_next_nvlink_npu(struct npu3 *npu, uint32_t chip_id)
+{
+ uint64_t phb_id = 0;
+ struct phb *phb;
+
+ if (npu)
+ phb_id = npu->nvlink.phb.opal_id + 1;
+
+ for (; (phb = __pci_next_phb_idx(&phb_id));) {
+ if (phb->phb_type != phb_type_npu_v3)
+ continue;
+
+ npu = npu3_phb_to_npu(phb);
+ if (npu->chip_id == chip_id || chip_id == NPU3_ANY_CHIP)
+ return npu;
+ }
+
+ return NULL;
+}
+
+static struct npu3 *npu3_last_npu(void)
+{
+ static struct npu3 *last = NULL;
+ struct npu3 *npu;
+
+ if (last)
+ return last;
+
+ npu3_for_each_nvlink_npu(npu)
+ last = npu;
+
+ return last;
+}
+
+static uint32_t npu3_gpu_links(struct pci_device *gpu)
+{
+ const struct dt_property *prop;
+
+ if (!gpu)
+ return 0;
+
+ /* The link count is the number of phandles in "ibm,npu" */
+ prop = dt_find_property(gpu->dn, "ibm,npu");
+ if (!prop)
+ return 0;
+
+ return prop->len / sizeof(uint32_t);
+}
+
+static uint32_t npu3_links_per_gpu(void)
+{
+ struct npu3 *npu;
+ struct npu3_dev *dev;
+ uint32_t links = 0;
+
+ /* Use the first GPU we find to figure this out */
+ npu3_for_each_nvlink_npu(npu) {
+ npu3_for_each_nvlink_dev(dev, npu) {
+ links = npu3_gpu_links(dev->nvlink.gpu);
+ if (links)
+ goto out;
+ }
+ }
+
+out:
+ prlog(PR_DEBUG, "NPU: %s: %d\n", __func__, links);
+
+ return links;
+}
+
+int32_t npu3_dev_gpu_index(struct npu3_dev *dev)
+{
+ const char *slot;
+ char *p = NULL;
+ int ret;
+
+ slot = dev->nvlink.loc_code;
+ if (!slot)
+ return -1;
+
+ if (memcmp(slot, "GPU", 3))
+ return -1;
+
+ ret = strtol(slot + 3, &p, 10);
+ if (*p || p == slot + 3)
+ return -1;
+
+ return ret;
+}
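+
+/*
+ * Illustrative note (not part of the original patch): a slot label of
+ * "GPU4" yields 4; labels that don't start with "GPU" or don't end in a
+ * plain decimal number yield -1.
+ */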
+
+static uint32_t npu3_chip_possible_gpu_links(void)
+{
+ struct proc_chip *chip;
+ struct npu3 *npu;
+ struct npu3_dev *dev;
+ uint32_t possible = 0;
+
+ for_each_chip(chip) {
+ npu3_for_each_chip_nvlink_npu(npu, chip->id)
+ npu3_for_each_nvlink_dev(dev, npu)
+ if (npu3_dev_gpu_index(dev) != -1)
+ possible++;
+
+ if (possible)
+ break;
+ }
+
+ prlog(PR_DEBUG, "NPU: %s: %d\n", __func__, possible);
+
+ return possible;
+}
+
+uint32_t npu3_chip_possible_gpus(void)
+{
+ static uint32_t possible = -1;
+ uint32_t links_per_gpu;
+
+ /* Static value, same for all chips; only do this once */
+ if (possible != -1)
+ return possible;
+
+ possible = 0;
+
+ links_per_gpu = npu3_links_per_gpu();
+ if (links_per_gpu)
+ possible = npu3_chip_possible_gpu_links() / links_per_gpu;
+
+ prlog(PR_DEBUG, "NPU: %s: %d\n", __func__, possible);
+
+ return possible;
+}
+
+static void npu3_dev_assign_gmb(struct npu3_dev *dev, uint64_t addr,
+ uint64_t size)
+{
+ uint32_t mode;
+ uint64_t val;
+
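+ /*
+ * Pick a base BAR mode from the number of links feeding this GPU,
+ * then offset it by this link's PCI function number.
+ */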
+ switch (npu3_gpu_links(dev->nvlink.gpu)) {
+ case 0:
+ return;
+ case 1:
+ mode = 0;
+ break;
+ case 2:
+ mode = 1;
+ break;
+ case 3:
+ mode = 3;
+ break;
+ case 4:
+ mode = 6;
+ break;
+ case 6:
+ mode = 10;
+ break;
+ default:
+ /* Hardware does not support this configuration */
+ assert(0);
+ }
+
+ mode += PCI_FUNC(dev->nvlink.pvd->bdfn);
+
+ val = NPU3_GPU_MEM_BAR_ENABLE |
+ NPU3_GPU_MEM_BAR_POISON;
+ val = SETFIELD(NPU3_GPU_MEM_BAR_ADDR, val, addr >> 30);
+ val = SETFIELD(NPU3_GPU_MEM_BAR_SIZE, val, size >> 30);
+ val = SETFIELD(NPU3_GPU_MEM_BAR_MODE, val, mode);
+
+ npu3_write(dev->npu, NPU3_GPU_MEM_BAR(dev->index), val);
+}
+
+static struct dt_node *npu3_create_memory_dn(struct npu3_dev *dev,
+ uint32_t gpu_index, uint64_t addr,
+ uint64_t size)
+{
+ uint32_t nid = 255 - gpu_index;
+ struct dt_node *mem;
+
+ mem = dt_find_by_name_addr(dt_root, "memory", addr);
+ if (mem)
+ return mem;
+
+ mem = dt_new_addr(dt_root, "memory", addr);
+ assert(mem);
+
+ dt_add_property_string(mem, "device_type", "memory");
+ dt_add_property_string(mem, "compatible", "ibm,coherent-device-memory");
+ dt_add_property_u64s(mem, "reg", addr, size);
+ dt_add_property_u64s(mem, "linux,usable-memory", addr, 0);
+ dt_add_property_cells(mem, "ibm,chip-id", nid);
+ dt_add_property_cells(mem, "ibm,associativity", 4, nid, nid, nid, nid);
+
+ NPU3INF(dev->npu, "%s mem: 0x%016llx (nid %d)\n", dev->nvlink.loc_code,
+ addr, nid);
+
+ return mem;
+}
+
+static void npu3_dev_init_gpu_mem(struct npu3_dev *dev)
+{
+ struct pci_device *pd = dev->nvlink.pd;
+ struct npu3 *npu = dev->npu;
+ struct dt_node *mem;
+ uint64_t addr, size, gta;
+ uint32_t gpu_index;
+
+ if (!dev->nvlink.gpu)
+ return;
+
+ gpu_index = npu3_dev_gpu_index(dev) % npu3_chip_possible_gpus();
+ phys_map_get(npu->chip_id, GPU_MEM_4T_DOWN, gpu_index, &addr, &size);
+
+ npu3_dev_assign_gmb(dev, addr, size);
+ mem = npu3_create_memory_dn(dev, gpu_index, addr, size);
+
+ /*
+ * Coral mode address compression. This is documented in Figure 3.5 of
+ * the NPU workbook; "P9->GPU RA Compression (Coral)".
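+ * Bit 42 of the real address stays in place, bits 45:46 are packed
+ * down into bits 43:44, bits 49:50 into bits 45:46, and the low 43
+ * bits pass through unchanged.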
+ */
+ gta = (addr >> 42 & 0x1) << 42;
+ gta |= (addr >> 45 & 0x3) << 43;
+ gta |= (addr >> 49 & 0x3) << 45;
+ gta |= addr & ((1ul << 43) - 1);
+
+ dt_add_property_cells(pd->dn, "memory-region", mem->phandle);
+ dt_add_property_u64s(pd->dn, "ibm,device-tgt-addr", gta);
+}
+
+static void npu3_final_fixup(void)
+{
+ struct npu3 *npu;
+ struct npu3_dev *dev;
+
+ npu3_for_each_nvlink_npu(npu)
+ npu3_for_each_nvlink_dev(dev, npu)
+ npu3_dev_init_gpu_mem(dev);
+}
+
+static void npu3_phb_final_fixup(struct phb *phb)
+{
+ struct npu3 *npu = npu3_phb_to_npu(phb);
+
+ pci_walk_dev(phb, NULL, npu3_dev_bind, NULL);
+
+ /*
+ * After every npu's devices are bound, do gpu-related fixup. This
+ * counts on npu3_last_npu() walking the phbs in the same order as
+ * the PHB final fixup loop in __pci_init_slots().
+ */
+ if (npu == npu3_last_npu())
+ npu3_final_fixup();
+}
+
+static int64_t npu3_set_pe(struct phb *phb,
+ uint64_t pe_num,
+ uint64_t bdfn,
+ uint8_t bcompare,
+ uint8_t dcompare,
+ uint8_t fcompare,
+ uint8_t action)
+{
+ struct npu3 *npu = npu3_phb_to_npu(phb);
+ struct npu3_dev *dev;
+ uint64_t val;
+
+ dev = npu3_bdfn_to_dev(npu, bdfn);
+ if (!dev)
+ return OPAL_PARAMETER;
+
+ if (action != OPAL_MAP_PE && action != OPAL_UNMAP_PE)
+ return OPAL_PARAMETER;
+
+ if (pe_num >= NPU3_MAX_PE_NUM)
+ return OPAL_PARAMETER;
+
+ if (bcompare != OpalPciBusAll ||
+ dcompare != OPAL_COMPARE_RID_DEVICE_NUMBER ||
+ fcompare != OPAL_COMPARE_RID_FUNCTION_NUMBER)
+ return OPAL_UNSUPPORTED;
+
+ if (!dev->nvlink.gpu)
+ return OPAL_SUCCESS;
+
+ val = NPU3_CTL_BDF2PE_CFG_ENABLE;
+ val = SETFIELD(NPU3_CTL_BDF2PE_CFG_PE, val, pe_num);
+ val = SETFIELD(NPU3_CTL_BDF2PE_CFG_BDF, val, dev->nvlink.gpu->bdfn);
+ npu3_write(npu, NPU3_CTL_BDF2PE_CFG(pe_num), val);
+
+ val = NPU3_MISC_BDF2PE_CFG_ENABLE;
+ val = SETFIELD(NPU3_MISC_BDF2PE_CFG_PE, val, pe_num);
+ val = SETFIELD(NPU3_MISC_BDF2PE_CFG_BDF, val, dev->nvlink.gpu->bdfn);
+ npu3_write(npu, NPU3_MISC_BDF2PE_CFG(pe_num), val);
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t npu3_tce_kill_pages(struct npu3 *npu,
+ uint64_t pe_num,
+ uint32_t tce_size,
+ uint64_t dma_addr,
+ uint32_t npages)
+{
+ uint32_t check_tce_size;
+ uint64_t val;
+
+ if (pe_num >= NPU3_MAX_PE_NUM)
+ return OPAL_PARAMETER;
+
+ npu3_ioda_sel(npu, NPU3_ATS_IODA_ADDR_TBL_TVT, pe_num);
+ val = npu3_read(npu, NPU3_ATS_IODA_DATA);
+
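+ /* The TVT entry encodes the IO page size as 2KB << PAGE_SIZE field */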
+ check_tce_size = 0x800 << GETFIELD(NPU3_ATS_IODA_TVT_PAGE_SIZE, val);
+ if (check_tce_size != tce_size) {
+ NPU3ERR(npu, "%s: Unexpected TCE size (got 0x%x, expected 0x%x)\n",
+ __func__, tce_size, check_tce_size);
+
+ return OPAL_PARAMETER;
+ }
+
+ val = NPU3_ATS_TCE_KILL_ONE;
+ val = SETFIELD(NPU3_ATS_TCE_KILL_PE_NUMBER, val, pe_num);
+
+ while (npages--) {
+ val = SETFIELD(NPU3_ATS_TCE_KILL_ADDRESS, val, dma_addr >> 12);
+ npu3_write(npu, NPU3_ATS_TCE_KILL, val);
+
+ dma_addr += tce_size;
+ }
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t npu3_tce_kill(struct phb *phb,
+ uint32_t kill_type,
+ uint64_t pe_num,
+ uint32_t tce_size,
+ uint64_t dma_addr,
+ uint32_t npages)
+{
+ struct npu3 *npu = npu3_phb_to_npu(phb);
+
+ sync();
+
+ switch(kill_type) {
+ case OPAL_PCI_TCE_KILL_PAGES:
+ return npu3_tce_kill_pages(npu, pe_num, tce_size,
+ dma_addr, npages);
+ case OPAL_PCI_TCE_KILL_PE:
+ /*
+ * NPU doesn't support killing a PE so fall through
+ * and do a kill all instead.
+ */
+ case OPAL_PCI_TCE_KILL_ALL:
+ npu3_write(npu, NPU3_ATS_TCE_KILL, NPU3_ATS_TCE_KILL_ALL);
+ return OPAL_SUCCESS;
+ }
+
+ return OPAL_PARAMETER;
+}
+
+static const struct phb_ops npu_ops = {
+ .cfg_read8 = npu3_cfg_read8,
+ .cfg_read16 = npu3_cfg_read16,
+ .cfg_read32 = npu3_cfg_read32,
+ .cfg_write8 = npu3_cfg_write8,
+ .cfg_write16 = npu3_cfg_write16,
+ .cfg_write32 = npu3_cfg_write32,
+ .eeh_freeze_status = npu3_eeh_freeze_status,
+ .ioda_reset = npu3_ioda_reset,
+ .map_pe_dma_window = npu3_map_pe_dma_window,
+ .map_pe_dma_window_real = npu3_map_pe_dma_window_real,
+ .next_error = npu3_next_error,
+ .phb_final_fixup = npu3_phb_final_fixup,
+ .set_pe = npu3_set_pe,
+ .tce_kill = npu3_tce_kill,
+};
+
+static int64_t npu3_reset(struct pci_slot *slot)
+{
+ struct npu3 *npu = npu3_phb_to_npu(slot->phb);
+ struct npu3_dev *dev;
+ int64_t rc = OPAL_SUCCESS;
+ bool purge = false;
+
+ npu3_for_each_nvlink_dev(dev, npu) {
+ rc = npu3_dev_reset(dev);
+ if (rc)
+ break;
+
+ purge = true;
+ }
+
+ /* No devices reset; don't purge, just return */
+ if (!purge)
+ return rc;
+
+ /* All devices reset */
+ if (!rc)
+ return purge_l2_l3_caches();
+
+ /* Some devices successfully reset; purge, but still return error */
+ purge_l2_l3_caches();
+ return rc;
+}
+
+static int64_t npu3_freset(struct pci_slot *slot __unused)
+{
+ return OPAL_SUCCESS;
+}
+
+static int64_t npu3_get_link_state(struct pci_slot *slot __unused,
+ uint8_t *val)
+{
+ *val = OPAL_SHPC_LINK_UP_x1;
+ return OPAL_SUCCESS;
+}
+
+static int64_t npu3_get_power_state(struct pci_slot *slot __unused,
+ uint8_t *val)
+{
+ *val = PCI_SLOT_POWER_ON;
+ return OPAL_SUCCESS;
+}
+
+static void npu3_create_phb_slot(struct npu3 *npu)
+{
+ struct pci_slot *slot;
+
+ slot = pci_slot_alloc(&npu->nvlink.phb, NULL);
+ if (!slot)
+ return;
+
+ /* Elementary functions */
+ slot->ops.creset = npu3_reset;
+ slot->ops.freset = npu3_freset;
+ slot->ops.hreset = npu3_reset;
+ slot->ops.get_link_state = npu3_get_link_state;
+ slot->ops.get_power_state = npu3_get_power_state;
+}
+
+static void npu3_create_phb(struct npu3 *npu)
+{
+ struct phb *phb = &npu->nvlink.phb;
+
+ phb->phb_type = phb_type_npu_v3;
+ phb->ops = &npu_ops;
+ phb->dt_node = dt_new_addr(dt_root, "pciex", npu->regs[0]);
+ assert(phb->dt_node);
+
+ list_head_init(&phb->virt_devices);
+ pci_register_phb(phb, npu3_get_opal_id(npu->chip_id,
+ npu3_get_phb_index(npu->index)));
+ npu3_create_phb_slot(npu);
+ npu3_ioda_reset(phb, true);
+}
+
+static void npu3_dev_init_hw(struct npu3_dev *dev)
+{
+ struct npu3 *npu = dev->npu;
+ uint64_t reg, val;
+
+ reg = NPU3_RELAXED_CFG2(dev->index);
+ val = npu3_read(npu, reg);
+ val |= NPU3_RELAXED_CFG2_CMD_CL_DMA_W |
+ NPU3_RELAXED_CFG2_CMD_CL_DMA_W_HP |
+ NPU3_RELAXED_CFG2_CMD_CL_DMA_INJ |
+ NPU3_RELAXED_CFG2_CMD_PR_DMA_INJ |
+ NPU3_RELAXED_CFG2_CMD_DMA_PR_W |
+ NPU3_RELAXED_CFG2_CMD_CL_RD_NC_F0 |
+ NPU3_RELAXED_CFG2_SRC_RDENA(0);
+ npu3_write(npu, reg, val);
+
+ reg = NPU3_NTL_MISC_CFG2(dev->index);
+ val = npu3_read(npu, reg);
+ val |= NPU3_NTL_MISC_CFG2_BRICK_ENABLE |
+ NPU3_NTL_MISC_CFG2_RCV_CREDIT_OVERFLOW_ENA;
+ npu3_write(npu, reg, val);
+}
+
+static void npu3_init_hw(struct npu3 *npu)
+{
+ struct npu3_dev *dev;
+ uint64_t reg, val;
+
+ reg = NPU3_XTS_CFG;
+ val = npu3_read(npu, reg);
+ val |= NPU3_XTS_CFG_MMIOSD | NPU3_XTS_CFG_TRY_ATR_RO;
+ npu3_write(npu, reg, val);
+
+ reg = NPU3_XTS_CFG2;
+ val = npu3_read(npu, reg);
+ val |= NPU3_XTS_CFG2_NO_FLUSH_ENA;
+ npu3_write(npu, reg, val);
+
+ reg = NPU3_RELAXED_SRC(0);
+ val = NPU3_RELAXED_SRC_MASK_NPU;
+ npu3_write(npu, reg, val);
+
+ npu3_for_each_nvlink_dev(dev, npu)
+ npu3_dev_init_hw(dev);
+}
+
+/* PCI command register (BAR enable/disable) */
+static int64_t npu3_cfg_cmd(void *pvd,
+ struct pci_cfg_reg_filter *pcrf __unused,
+ uint32_t offset, uint32_t size,
+ uint32_t *data, bool write)
+{
+ struct npu3_dev *dev = ((struct pci_virt_device *)pvd)->data;
+
+ if (!write)
+ return OPAL_PARTIAL;
+
+ if (offset != PCI_CFG_CMD)
+ return OPAL_PARAMETER;
+
+ if (size != 1 && size != 2 && size != 4)
+ return OPAL_PARAMETER;
+
+ npu3_dev_enable_bars(dev, !!(*data & PCI_CFG_CMD_MEM_EN));
+
+ return OPAL_PARTIAL;
+}
+
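+/*
+ * Standard PCI BAR sizing: software writes all 1's to a BAR and reads it
+ * back to discover the BAR size. The write arms a "trap" bit so that the
+ * next read of the same dword returns the size instead of the address.
+ */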
+static int64_t npu3_cfg_bar_write(struct npu3_bar *bar, uint64_t mask,
+ uint32_t data)
+{
+ if (data != 0xffffffff)
+ return OPAL_HARDWARE;
+
+ /* Return BAR size on next read */
+ bar->trap |= mask;
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t npu3_cfg_bar_read(struct npu3_bar *bar, uint64_t mask,
+ uint32_t *data)
+{
+ if (!(bar->trap & mask))
+ return OPAL_PARTIAL;
+
+ *data = GETFIELD(mask, bar->size);
+ bar->trap &= ~mask;
+
+ return OPAL_SUCCESS;
+}
+
+/* PCI BAR registers (NTL/GENID) */
+static int64_t npu3_cfg_bar(void *pvd __unused,
+ struct pci_cfg_reg_filter *pcrf,
+ uint32_t offset, uint32_t size, uint32_t *data,
+ bool write)
+{
+ struct npu3_bar *bar = (struct npu3_bar *)pcrf->data;
+ uint64_t mask;
+
+ if (size != 4)
+ return OPAL_PARAMETER;
+
+ if (offset == pcrf->start)
+ mask = 0xffffffff;
+ else if (offset == pcrf->start + 4)
+ mask = 0xffffffffull << 32;
+ else
+ return OPAL_PARAMETER;
+
+ if (write)
+ return npu3_cfg_bar_write(bar, mask, *data);
+
+ return npu3_cfg_bar_read(bar, mask, data);
+}
+
+/* PCI control register */
+static int64_t npu3_cfg_devctl(void *pvd,
+ struct pci_cfg_reg_filter *pcrf __unused,
+ uint32_t offset, uint32_t size,
+ uint32_t *data, bool write)
+{
+ struct npu3_dev *dev = ((struct pci_virt_device *)pvd)->data;
+
+ if (!write)
+ return OPAL_HARDWARE;
+
+ if (size != 2 || offset & 1) {
+ NPU3DEVERR(dev, "Unsupported write to pcie control register\n");
+ return OPAL_PARAMETER;
+ }
+
+ if (*data & PCICAP_EXP_DEVCTL_FUNC_RESET)
+ if (!npu3_dev_reset(dev))
+ purge_l2_l3_caches();
+
+ return OPAL_PARTIAL;
+}
+
+static uint32_t npu3_cfg_populate_pcie_cap(struct npu3_dev *dev, uint32_t start,
+ uint32_t prev_cap)
+{
+ struct pci_virt_device *pvd = dev->nvlink.pvd;
+ uint32_t val;
+
+ /* Add capability list */
+ PCI_VIRT_CFG_INIT_RO(pvd, prev_cap, 1, start);
+ PCI_VIRT_CFG_INIT_RO(pvd, start, 1, PCI_CFG_CAP_ID_EXP);
+
+ /* 0x00 - ID/PCIE capability */
+ val = PCI_CFG_CAP_ID_EXP;
+ val |= 0x2 << 16 | PCIE_TYPE_ENDPOINT << 20;
+ PCI_VIRT_CFG_INIT_RO(pvd, start, 4, val);
+
+ /* 0x04 - Device capability */
+ val = PCIE_MPSS_128 |
+ PCIE_PHANTOM_NONE << 3 |
+ PCIE_L0SL_MAX_NO_LIMIT << 6 |
+ PCIE_L1L_MAX_NO_LIMIT << 9 |
+ PCICAP_EXP_DEVCAP_FUNC_RESET;
+ PCI_VIRT_CFG_INIT_RO(pvd, start + PCICAP_EXP_DEVCAP, 4, val);
+
+ pci_virt_add_filter(pvd, start + PCICAP_EXP_DEVCTL, 2,
+ PCI_REG_FLAG_WRITE,
+ npu3_cfg_devctl, NULL);
+
+ /* 0x08 - Device control and status */
+ PCI_VIRT_CFG_INIT(pvd, start + PCICAP_EXP_DEVCTL, 4, 0x00002810,
+ 0xffff0000, 0x000f0000);
+
+ /* 0x0c - Link capability */
+ val = PCIE_LSPEED_VECBIT_2 | PCIE_LWIDTH_1X << 4;
+ PCI_VIRT_CFG_INIT_RO(pvd, start + PCICAP_EXP_LCAP, 4, val);
+
+ /* 0x10 - Link control and status */
+ PCI_VIRT_CFG_INIT(pvd, start + PCICAP_EXP_LCTL, 4, 0x00130000,
+ 0xfffff000, 0xc0000000);
+
+ /* 0x14 - Slot capability */
+ PCI_VIRT_CFG_INIT_RO(pvd, start + PCICAP_EXP_SLOTCAP, 4, 0x00000000);
+
+ /* 0x18 - Slot control and status */
+ PCI_VIRT_CFG_INIT_RO(pvd, start + PCICAP_EXP_SLOTCTL, 4, 0x00000000);
+
+ /* 0x1c - Root control and capability */
+ PCI_VIRT_CFG_INIT(pvd, start + PCICAP_EXP_RC, 4, 0x00000000,
+ 0xffffffe0, 0x00000000);
+
+ /* 0x20 - Root status */
+ PCI_VIRT_CFG_INIT(pvd, start + PCICAP_EXP_RSTAT, 4, 0x00000000,
+ 0xffffffff, 0x00010000);
+
+ /* 0x24 - Device capability 2 */
+ PCI_VIRT_CFG_INIT_RO(pvd, start + PCIECAP_EXP_DCAP2, 4, 0x00000000);
+
+ /* 0x28 - Device Control and status 2 */
+ PCI_VIRT_CFG_INIT(pvd, start + PCICAP_EXP_DCTL2, 4, 0x00070000,
+ 0xffff0000, 0x00000000);
+
+ /* 0x2c - Link capability 2 */
+ PCI_VIRT_CFG_INIT_RO(pvd, start + PCICAP_EXP_LCAP2, 4, 0x00000007);
+
+ /* 0x30 - Link control and status 2 */
+ PCI_VIRT_CFG_INIT(pvd, start + PCICAP_EXP_LCTL2, 4, 0x00000003,
+ 0xffff0000, 0x00200000);
+
+ /* 0x34 - Slot capability 2 */
+ PCI_VIRT_CFG_INIT_RO(pvd, start + PCICAP_EXP_SCAP2, 4, 0x00000000);
+
+ /* 0x38 - Slot control and status 2 */
+ PCI_VIRT_CFG_INIT_RO(pvd, start + PCICAP_EXP_SCTL2, 4, 0x00000000);
+
+ return start + PCICAP_EXP_SCTL2 + 8;
+}
+
+static int64_t npu3_dev_procedure_write(struct npu3_dev *dev, uint32_t offset,
+ uint32_t data)
+{
+ switch (offset) {
+ case 0:
+ NPU3DEVINF(dev, "Ignoring write to status register\n");
+ break;
+ case 4:
+ npu3_dev_procedure_init(dev, data);
+ break;
+ default:
+ return OPAL_PARAMETER;
+ }
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t npu3_dev_procedure_read(struct npu3_dev *dev, uint32_t offset,
+ uint32_t *data)
+{
+ switch (offset) {
+ case 0:
+ *data = npu3_dev_procedure_status(dev);
+ break;
+ case 4:
+ *data = dev->proc.number;
+ break;
+ default:
+ *data = 0;
+ return OPAL_PARAMETER;
+ }
+
+ return OPAL_SUCCESS;
+}
+
+/* Hardware procedure control/status registers */
+static int64_t npu3_dev_procedure(void *pvd, struct pci_cfg_reg_filter *pcrf,
+ uint32_t offset, uint32_t size,
+ uint32_t *data, bool write)
+{
+ struct npu3_dev *dev = ((struct pci_virt_device *)pvd)->data;
+
+ if (size != 4)
+ return OPAL_PARAMETER;
+
+ offset -= pcrf->start;
+
+ if (write)
+ return npu3_dev_procedure_write(dev, offset, *data);
+
+ return npu3_dev_procedure_read(dev, offset, data);
+}
+
+/* PPE SRAM access is indirect via CSAR/CSDR */
+static void npu3_dev_ppe_sram_sel(struct npu3_dev *dev, uint32_t reg)
+{
+ uint64_t val;
+
+ val = SETFIELD(OB_PPE_CSAR_SRAM_ADDR, 0ull, reg);
+ xscom_write(dev->npu->chip_id, OB_PPE_CSAR(dev->ob_chiplet), val);
+}
+
+static void npu3_dev_ppe_sram_write(struct npu3_dev *dev, uint32_t reg,
+ uint64_t val)
+{
+ npu3_dev_ppe_sram_sel(dev, reg);
+ xscom_write(dev->npu->chip_id, OB_PPE_CSDR(dev->ob_chiplet), val);
+}
+
+static uint64_t npu3_dev_ppe_sram_read(struct npu3_dev *dev, uint32_t reg)
+{
+ uint64_t val;
+
+ npu3_dev_ppe_sram_sel(dev, reg);
+ xscom_read(dev->npu->chip_id, OB_PPE_CSDR(dev->ob_chiplet), &val);
+
+ return val;
+}
+
+/* Software-implemented autonomous link training (SALT) */
+static int64_t npu3_dev_salt(void *pvd, struct pci_cfg_reg_filter *pcrf,
+ uint32_t offset, uint32_t size, uint32_t *data,
+ bool write)
+{
+ struct npu3_dev *dev = ((struct pci_virt_device *)pvd)->data;
+ unsigned long timeout;
+ uint32_t cmd_reg;
+ uint64_t val;
+
+ if (size != 4 || offset != pcrf->start)
+ return OPAL_PARAMETER;
+
+ /* The config register before this one holds CMD_REG */
+ PCI_VIRT_CFG_NORMAL_RD(pvd, pcrf->start - 4, 4, &cmd_reg);
+ if (cmd_reg == 0xffffffff)
+ return OPAL_PARAMETER;
+
+ /* Check for another command in progress */
+ val = npu3_dev_ppe_sram_read(dev, OB_PPE_SALT_CMD);
+ if (GETFIELD(OB_PPE_SALT_CMD_READY, val)) {
+ NPU3DEVINF(dev, "SALT_CMD 0x%x: Not ready\n", cmd_reg);
+ return OPAL_BUSY;
+ }
+
+ val = OB_PPE_SALT_CMD_READY;
+ val = SETFIELD(OB_PPE_SALT_CMD_RW, val, write);
+ val = SETFIELD(OB_PPE_SALT_CMD_LINKNUM, val, npu3_chip_dev_index(dev));
+ val = SETFIELD(OB_PPE_SALT_CMD_REG, val, cmd_reg);
+ if (write)
+ val = SETFIELD(OB_PPE_SALT_CMD_DATA, val, *data);
+
+ npu3_dev_ppe_sram_write(dev, OB_PPE_SALT_CMD, val);
+
+ /* Wait for the go bit to clear */
+ timeout = mftb() + msecs_to_tb(1000);
+
+ while (GETFIELD(OB_PPE_SALT_CMD_READY, val)) {
+ if (tb_compare(mftb(), timeout) == TB_AAFTERB) {
+ NPU3DEVINF(dev, "SALT_CMD 0x%x: Timeout\n", cmd_reg);
+ return OPAL_BUSY;
+ }
+
+ val = npu3_dev_ppe_sram_read(dev, OB_PPE_SALT_CMD);
+ }
+
+ if (GETFIELD(OB_PPE_SALT_CMD_ERR, val))
+ NPU3DEVINF(dev, "SALT_CMD 0x%x: Error\n", cmd_reg);
+
+ if (!write)
+ *data = GETFIELD(OB_PPE_SALT_CMD_DATA, val);
+
+ return OPAL_SUCCESS;
+}
+
+#define VENDOR_CAP_LEN 0x1c
+#define VENDOR_CAP_VERSION 0x02
+
+static uint32_t npu3_cfg_populate_vendor_cap(struct npu3_dev *dev,
+ uint32_t start, uint32_t prev_cap)
+{
+ struct pci_virt_device *pvd = dev->nvlink.pvd;
+
+ /* Capabilities list */
+ PCI_VIRT_CFG_INIT_RO(pvd, prev_cap, 1, start);
+ PCI_VIRT_CFG_INIT_RO(pvd, start, 1, PCI_CFG_CAP_ID_VENDOR);
+
+ /* Length and version */
+ PCI_VIRT_CFG_INIT_RO(pvd, start + 2, 1, VENDOR_CAP_LEN);
+ PCI_VIRT_CFG_INIT_RO(pvd, start + 3, 1, VENDOR_CAP_VERSION);
+
+ /*
+ * Defaults when the trap can't handle the read/write (e.g. due to
+ * reading/writing less than 4 bytes).
+ */
+ PCI_VIRT_CFG_INIT_RO(pvd, start + 4, 4, 0);
+ PCI_VIRT_CFG_INIT_RO(pvd, start + 8, 4, 0);
+
+ /* PHY procedure trap */
+ pci_virt_add_filter(pvd, start + 4, 8,
+ PCI_REG_FLAG_READ | PCI_REG_FLAG_WRITE,
+ npu3_dev_procedure, NULL);
+
+ /* Link index */
+ PCI_VIRT_CFG_INIT_RO(pvd, start + 0xc, 1, npu3_chip_dev_index(dev));
+
+ /* SALT registers */
+ PCI_VIRT_CFG_INIT(pvd, start + 0x10, 4, 0xffffffff, 0, 0);
+ PCI_VIRT_CFG_INIT_RO(pvd, start + 0x14, 4, 0);
+
+ pci_virt_add_filter(pvd, start + 0x14, 4,
+ PCI_REG_FLAG_READ | PCI_REG_FLAG_WRITE,
+ npu3_dev_salt, NULL);
+
+ return start + VENDOR_CAP_LEN;
+}
+
+static void npu3_cfg_populate(struct npu3_dev *dev)
+{
+ struct pci_virt_device *pvd = dev->nvlink.pvd;
+ uint64_t addr;
+ uint32_t pos;
+
+ /* 0x00 - Vendor/Device ID */
+ PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_VENDOR_ID, 4, 0x04ea1014);
+
+ /* 0x04 - Command/Status */
+ PCI_VIRT_CFG_INIT(pvd, PCI_CFG_CMD, 4, 0x00100000, 0xffb802b8,
+ 0xf9000000);
+
+ pci_virt_add_filter(pvd, PCI_CFG_CMD, 1, PCI_REG_FLAG_WRITE,
+ npu3_cfg_cmd, NULL);
+
+ /* 0x08 - Rev/Class/Cache */
+ PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_REV_ID, 4, 0x06800102);
+
+ /* 0x0c - CLS/Latency Timer/Header/BIST */
+ PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_CACHE_LINE_SIZE, 4, 0x00800000);
+
+ /* 0x10/14 - NTL BAR */
+ addr = SETFIELD(0xf, dev->ntl_bar.addr,
+ PCI_CFG_BAR_TYPE_MEM | PCI_CFG_BAR_MEM64);
+ PCI_VIRT_CFG_INIT(pvd, PCI_CFG_BAR0, 4, lo32(addr), 0xf, 0);
+ PCI_VIRT_CFG_INIT(pvd, PCI_CFG_BAR1, 4, hi32(addr), 0, 0);
+
+ pci_virt_add_filter(pvd, PCI_CFG_BAR0, 8,
+ PCI_REG_FLAG_READ | PCI_REG_FLAG_WRITE,
+ npu3_cfg_bar, &dev->ntl_bar);
+
+ /* 0x18/1c - GENID BAR */
+ addr = SETFIELD(0xf, dev->genid_bar.addr,
+ PCI_CFG_BAR_TYPE_MEM | PCI_CFG_BAR_MEM64);
+ PCI_VIRT_CFG_INIT(pvd, PCI_CFG_BAR2, 4, lo32(addr), 0xf, 0);
+ PCI_VIRT_CFG_INIT(pvd, PCI_CFG_BAR3, 4, hi32(addr), 0, 0);
+
+ pci_virt_add_filter(pvd, PCI_CFG_BAR2, 8,
+ PCI_REG_FLAG_READ | PCI_REG_FLAG_WRITE,
+ npu3_cfg_bar, &dev->genid_bar);
+
+ /* 0x20/0x24 - BARs, disabled */
+ PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_BAR4, 4, 0x00000000);
+ PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_BAR5, 4, 0x00000000);
+
+ /* 0x28 - Cardbus CIS pointer */
+ PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_CARDBUS_CIS, 4, 0x00000000);
+
+ /* 0x2c - Subsystem ID */
+ PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_SUBSYS_VENDOR_ID, 4, 0x00000000);
+
+ /* 0x30 - ROM BAR, zero sized */
+ PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_ROMBAR, 4, 0xffffffff);
+
+ /* 0x34 - PCI Capability */
+ PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_CAP, 4, 0x00000000);
+
+ /* 0x38 - Reserved */
+ PCI_VIRT_CFG_INIT_RO(pvd, 0x38, 4, 0x00000000);
+
+ /* 0x3c - INT line/pin/Minimal grant/Maximal latency */
+ PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_INT_LINE, 4, 0x00000100); /* INT A */
+
+ /* PCIE and vendor specific capability */
+ pos = npu3_cfg_populate_pcie_cap(dev, 0x40, PCI_CFG_CAP);
+ pos = npu3_cfg_populate_vendor_cap(dev, pos, 0x41);
+ PCI_VIRT_CFG_INIT_RO(pvd, pos + 1, 1, 0);
+}
+
+static void npu3_dev_create_pvd(struct npu3_dev *dev)
+{
+ struct npu3 *npu = dev->npu;
+ struct phb *phb = &npu->nvlink.phb;
+
+ dev->nvlink.pvd = pci_virt_add_device(phb, dev->index, 0x100, dev);
+ if (!dev->nvlink.pvd)
+ return;
+
+ phb->scan_map |= 0x1 << GETFIELD(0xf8, dev->nvlink.pvd->bdfn);
+ npu3_cfg_populate(dev);
+}
+
+static void npu3_dt_add_mmio_atsd(struct npu3 *npu)
+{
+ struct dt_node *dn = npu->nvlink.phb.dt_node;
+ uint64_t mmio_atsd[NPU3_XTS_ATSD_MAX];
+
+ for (uint32_t i = 0; i < NPU3_XTS_ATSD_MAX; i++)
+ mmio_atsd[i] = npu->regs[0] + NPU3_XTS_ATSD_LAUNCH(i);
+
+ dt_add_property(dn, "ibm,mmio-atsd", mmio_atsd, sizeof(mmio_atsd));
+}
+
+static void npu3_dt_add_mmio_window(struct npu3 *npu)
+{
+ struct dt_node *dn = npu->nvlink.phb.dt_node;
+ uint32_t ntl0_index = npu->index * NPU3_LINKS_PER_NPU;
+ uint64_t addr, size, win[2];
+
+ /* Device MMIO window (NTL/GENID regs only) */
+ phys_map_get(npu->chip_id, NPU_NTL, ntl0_index, &win[0], NULL);
+ phys_map_get(npu->chip_id, NPU_GENID, npu->index, &addr, &size);
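+ /* Window length runs from the first NTL BAR of this NPU to the end of its GENID BAR */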
+ win[1] = addr + size - win[0];
+
+ dt_add_property(dn, "ibm,mmio-window", win, sizeof(win));
+ dt_add_property_cells(dn, "ranges", 0x02000000,
+ hi32(win[0]), lo32(win[0]),
+ hi32(win[0]), lo32(win[0]),
+ hi32(win[1]), lo32(win[1]));
+}
+
+/* NDL No-Stall Event level */
+static uint32_t npu3_dev_interrupt_level(struct npu3_dev *dev)
+{
+ const uint32_t level[12] = { 1, 3, 5, 7, 9, 11,
+ 43, 45, 47, 49, 51, 53 };
+
+ return level[npu3_chip_dev_index(dev)];
+}
+
+static void npu3_dt_add_interrupts(struct npu3 *npu)
+{
+ struct dt_node *dn = npu->nvlink.phb.dt_node;
+ uint32_t *map, icsp, i = 0;
+ struct npu3_dev *dev;
+ size_t map_size = 0;
+
+ npu3_for_each_nvlink_dev(dev, npu)
+ map_size += sizeof(*map) * 7;
+
+ if (!map_size)
+ return;
+
+ icsp = get_ics_phandle();
+ map = zalloc(map_size);
+ assert(map);
+
+ npu3_for_each_nvlink_dev(dev, npu) {
+ map[i] = dev->nvlink.pvd->bdfn << 8;
+ map[i + 3] = 1; /* INT A */
+ map[i + 4] = icsp; /* interrupt-parent */
+ map[i + 5] = npu->irq_base + npu3_dev_interrupt_level(dev);
+ map[i + 6] = 0; /* 0 = EDGE, 1 = LEVEL */
+ i += 7;
+ }
+
+ dt_add_property_cells(dn, "interrupt-parent", icsp);
+ dt_add_property(dn, "interrupt-map", map, map_size);
+ dt_add_property_cells(dn, "interrupt-map-mask", 0xff00, 0x0, 0x0, 0x7);
+
+ free(map);
+}
+
+/* Populate PCI root device node */
+static void npu3_dt_add_props(struct npu3 *npu)
+{
+ struct dt_node *dn = npu->nvlink.phb.dt_node;
+
+ dt_add_property_cells(dn, "#address-cells", 3);
+ dt_add_property_cells(dn, "#size-cells", 2);
+ dt_add_property_cells(dn, "#interrupt-cells", 1);
+ dt_add_property_cells(dn, "bus-range", 0, 0xff);
+ dt_add_property_cells(dn, "clock-frequency", 0x200, 0);
+
+ dt_add_property_strings(dn, "device_type", "pciex");
+
+ /*
+ * To the OS, npu2 and npu3 are both ibm,ioda2-npu2-phb. The added
+ * ibm,ioda2-npu3-phb allows for possible quirks.
+ */
+ dt_add_property_strings(dn, "compatible",
+ "ibm,power9-npu-pciex",
+ "ibm,ioda2-npu2-phb",
+ "ibm,ioda2-npu3-phb");
+
+ dt_add_property_cells(dn, "ibm,phb-index",
+ npu3_get_phb_index(npu->index));
+ dt_add_property_cells(dn, "ibm,phb-diag-data-size", 0);
+ dt_add_property_cells(dn, "ibm,opal-num-pes", NPU3_MAX_PE_NUM);
+ dt_add_property_cells(dn, "ibm,opal-reserved-pe", NPU3_RESERVED_PE_NUM);
+ dt_add_property_cells(dn, "ibm,supported-tce-sizes",
+ 12, /* 4K */
+ 16, /* 64K */
+ 24, /* 16M */
+ 28); /* 256M */
+
+ dt_add_property_cells(dn, "ibm,chip-id", npu->chip_id);
+ dt_add_property_cells(dn, "ibm,npu-index", npu->index);
+ dt_add_property_cells(dn, "ibm,npcq", npu->dt_node->phandle);
+ dt_add_property_cells(dn, "ibm,xscom-base", npu->xscom_base);
+ dt_add_property_cells(dn, "ibm,links", NPU3_LINKS_PER_NPU);
+
+ dt_add_property(dn, "reg", npu->regs, sizeof(npu->regs));
+
+ npu3_dt_add_mmio_atsd(npu);
+ npu3_dt_add_mmio_window(npu);
+ npu3_dt_add_interrupts(npu);
+}
+
+void npu3_init_nvlink(struct npu3 *npu)
+{
+ struct npu3_dev *dev;
+
+ if (!npu3_next_dev(npu, NULL, NPU3_DEV_TYPE_NVLINK))
+ return;
+
+ npu3_init_hw(npu);
+ npu3_create_phb(npu);
+
+ npu3_for_each_nvlink_dev(dev, npu)
+ npu3_dev_create_pvd(dev);
+
+ npu3_dt_add_props(npu);
+
+ /* TODO: Sort out if/why we still can't enable this */
+ disable_fast_reboot("NVLink device enabled");
+}
+
+static int64_t npu3_init_context_pid(struct npu3 *npu, uint32_t index,
+ uint64_t msr)
+{
+ uint64_t map, old_map;
+
+ /* Unfiltered XTS mode; index is lparshort */
+ map = SETFIELD(NPU3_XTS_PID_MAP_LPARSHORT, 0ull, index);
+
+ /* Enable this mapping for both real and virtual addresses */
+ map |= NPU3_XTS_PID_MAP_VALID_ATRGPA0 | NPU3_XTS_PID_MAP_VALID_ATRGPA1;
+
+ /* Enable TLBIE/MMIOSD forwarding for this entry */
+ map |= NPU3_XTS_PID_MAP_VALID_ATSD;
+
+ /* Set the relevant MSR bits */
+ if (msr & MSR_DR)
+ map |= NPU3_XTS_PID_MAP_MSR_DR;
+
+ if (msr & MSR_HV)
+ map |= NPU3_XTS_PID_MAP_MSR_HV;
+
+ if (msr & MSR_PR)
+ map |= NPU3_XTS_PID_MAP_MSR_PR;
+
+ /* We don't support anything other than 64-bit so hardcode it here */
+ map |= NPU3_XTS_PID_MAP_MSR_SF;
+
+ old_map = npu3_read(npu, NPU3_XTS_PID_MAP(index));
+
+ /* Error out if this entry is already set with different msr bits */
+ if (old_map && GETFIELD(NPU3_XTS_PID_MAP_MSR, old_map) !=
+ GETFIELD(NPU3_XTS_PID_MAP_MSR, map)) {
+ NPU3ERR(npu, "%s: Unexpected MSR value\n", __func__);
+ return OPAL_PARAMETER;
+ }
+
+ if (!old_map) {
+ NPU3DBG(npu, "XTS_PID_MAP[%03d] = 0x%08llx\n", index, map);
+ npu3_write(npu, NPU3_XTS_PID_MAP(index), map);
+ }
+
+ npu->nvlink.ctx_ref[index]++;
+
+ return OPAL_SUCCESS;
+}
+
+#define NPU3_VALID_ATS_MSR_BITS (MSR_DR | MSR_HV | MSR_PR | MSR_SF)
+
+/*
+ * Allocate a context ID and initialize the tables with the relevant
+ * information. Returns the ID or error if one couldn't be allocated.
+ */
+int64_t npu3_init_context(struct phb *phb, uint64_t msr, uint64_t bdf)
+{
+ struct npu3 *npu = npu3_phb_to_npu(phb);
+ uint32_t lparshort, i;
+ uint64_t map;
+ int64_t rc;
+
+ /*
+ * MSR bits should be masked by the caller to allow for future
+ * expansion if required.
+ */
+ if (msr & ~NPU3_VALID_ATS_MSR_BITS)
+ return OPAL_UNSUPPORTED;
+
+ lock(&npu->lock);
+
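+ /* Find the XTS_BDF_MAP entry that npu3_map_lpar() installed for this bdf */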
+ for (i = 0; i < NPU3_XTS_BDF_MAP_MAX; i++) {
+ map = npu3_read(npu, NPU3_XTS_BDF_MAP(i));
+
+ if (map && GETFIELD(NPU3_XTS_BDF_MAP_BDF, map) == bdf)
+ break;
+ }
+
+ if (i == NPU3_XTS_BDF_MAP_MAX) {
+ NPU3ERR(npu, "LPARID not associated with any GPU\n");
+ rc = OPAL_PARAMETER;
+ goto out;
+ }
+
+ lparshort = GETFIELD(NPU3_XTS_BDF_MAP_LPARSHORT, map);
+ NPU3DBG(npu, "Found LPARSHORT 0x%x for bdf %02llx:%02llx.%llx\n",
+ lparshort, PCI_BUS_NUM(bdf), PCI_DEV(bdf), PCI_FUNC(bdf));
+
+ rc = npu3_init_context_pid(npu, lparshort, msr);
+ if (rc)
+ goto out;
+
+ if (!(map & NPU3_XTS_BDF_MAP_VALID)) {
+ map |= NPU3_XTS_BDF_MAP_VALID;
+ npu3_write(npu, NPU3_XTS_BDF_MAP(i), map);
+ }
+
+ rc = lparshort;
+
+out:
+ unlock(&npu->lock);
+ return rc;
+}
+
+static int64_t npu3_destroy_context_pid(struct npu3 *npu, uint32_t index)
+{
+ if (!npu->nvlink.ctx_ref[index])
+ return OPAL_PARAMETER;
+
+ /* Only destroy when refcount hits 0 */
+ if (--npu->nvlink.ctx_ref[index])
+ return OPAL_PARTIAL;
+
+ NPU3DBG(npu, "XTS_PID_MAP[%03d] = 0 (destroy)\n", index);
+ npu3_write(npu, NPU3_XTS_PID_MAP(index), 0ull);
+
+ return OPAL_SUCCESS;
+}
+
+int64_t npu3_destroy_context(struct phb *phb, uint64_t bdf)
+{
+ struct npu3 *npu = npu3_phb_to_npu(phb);
+ uint32_t lparshort, i;
+ int64_t map, rc;
+
+ lock(&npu->lock);
+
+ for (i = 0; i < NPU3_XTS_BDF_MAP_MAX; i++) {
+ map = npu3_read(npu, NPU3_XTS_BDF_MAP(i));
+
+ if (map && GETFIELD(NPU3_XTS_BDF_MAP_BDF, map) == bdf)
+ break;
+ }
+
+ if (i == NPU3_XTS_BDF_MAP_MAX) {
+ NPU3ERR(npu, "LPARID not associated with any GPU\n");
+ rc = OPAL_PARAMETER;
+ goto out;
+ }
+
+ lparshort = GETFIELD(NPU3_XTS_BDF_MAP_LPARSHORT, map);
+ rc = npu3_destroy_context_pid(npu, lparshort);
+
+out:
+ unlock(&npu->lock);
+ return rc;
+}
+
+/* Map the given virtual bdf to lparid with given lpcr */
+int64_t npu3_map_lpar(struct phb *phb, uint64_t bdf, uint64_t lparid,
+ uint64_t lpcr)
+{
+ struct npu3 *npu = npu3_phb_to_npu(phb);
+ struct npu3_dev *dev;
+ int64_t rc = OPAL_SUCCESS;
+ uint64_t map, val;
+ uint32_t i;
+
+ /*
+ * The LPCR bits are only required for hash based ATS, which we don't
+ * currently support, but may need to in the future.
+ */
+ if (lpcr)
+ return OPAL_UNSUPPORTED;
+
+ lock(&npu->lock);
+
+ /* Update the entry if it already exists */
+ for (i = 0; i < NPU3_XTS_BDF_MAP_MAX; i++) {
+ map = npu3_read(npu, NPU3_XTS_BDF_MAP(i));
+
+ if (map && GETFIELD(NPU3_XTS_BDF_MAP_BDF, map) == bdf)
+ break;
+ }
+
+ if (i == NPU3_XTS_BDF_MAP_MAX) {
+ /* No existing mapping found, find space for a new one */
+ for (i = 0; i < NPU3_XTS_BDF_MAP_MAX; i++)
+ if (!npu3_read(npu, NPU3_XTS_BDF_MAP(i)))
+ break;
+ }
+
+ if (i == NPU3_XTS_BDF_MAP_MAX) {
+ NPU3ERR(npu, "No free XTS_BDF[] entry\n");
+ rc = OPAL_RESOURCE;
+ goto out;
+ }
+
+ map = NPU3_XTS_BDF_MAP_UNFILT;
+ map = SETFIELD(NPU3_XTS_BDF_MAP_BDF, map, bdf);
+ map = SETFIELD(NPU3_XTS_BDF_MAP_LPARID, map, lparid);
+ map = SETFIELD(NPU3_XTS_BDF_MAP_LPARSHORT, map, i);
+
+ /* We only support radix at the moment */
+ map = SETFIELD(NPU3_XTS_BDF_MAP_XLAT, map, 0x3);
+
+ /* Find a link on which to send ATSDs for this device */
+ npu3_for_each_nvlink_dev(dev, npu)
+ if (dev->nvlink.gpu->bdfn == bdf)
+ break;
+
+ if (!dev || dev->nvlink.gpu->bdfn != bdf) {
+ NPU3ERR(npu, "Can't find a link for bdf %02llx:%02llx.%llx\n",
+ PCI_BUS_NUM(bdf), PCI_DEV(bdf), PCI_FUNC(bdf));
+ rc = OPAL_PARAMETER;
+ goto out;
+ }
+
+ map = SETFIELD(NPU3_XTS_BDF_MAP_BRICK, map, dev->index);
+
+ NPU3DBG(npu, "XTS_BDF_MAP[%03d] = 0x%08llx\n", i, map);
+ npu3_write(npu, NPU3_XTS_BDF_MAP(i), map);
+
+ /* We need to allocate an ATSD per link */
+ val = SETFIELD(NPU3_XTS_ATSD_HYP_LPARID, 0ull, lparid);
+ if (!lparid)
+ val |= NPU3_XTS_ATSD_HYP_MSR_HV;
+
+ npu3_write(npu, NPU3_XTS_ATSD_HYP(dev->index), val);
+
+out:
+ unlock(&npu->lock);
+ return rc;
+}
+
+static int64_t npu3_relaxed_order_enable(struct npu3 *npu, uint64_t src)
+{
+ struct npu3_dev *dev;
+ uint32_t i;
+
+ for (i = 0; i < NPU3_RELAXED_SRC_MAX; i++)
+ if (npu3_read(npu, NPU3_RELAXED_SRC(i)) == src)
+ return OPAL_SUCCESS; /* Already enabled */
+
+ /* Find somewhere to write this source */
+ for (i = 0; i < NPU3_RELAXED_SRC_MAX; i++)
+ if (!npu3_read(npu, NPU3_RELAXED_SRC(i)))
+ break;
+
+ if (i == NPU3_RELAXED_SRC_MAX) {
+ NPU3ERR(npu, "Insufficient resources to activate relaxed ordering mode\n");
+ return OPAL_RESOURCE;
+ }
+
+ npu3_write(npu, NPU3_RELAXED_SRC(i), src);
+
+ npu3_for_each_nvlink_dev(dev, npu) {
+ uint64_t val = npu3_read(npu, NPU3_RELAXED_CFG2(dev->index));
+
+ val |= NPU3_RELAXED_CFG2_SRC_WRENA(i) |
+ NPU3_RELAXED_CFG2_SRC_RDENA(i);
+ npu3_write(npu, NPU3_RELAXED_CFG2(dev->index), val);
+ }
+
+ return OPAL_SUCCESS;
+}
+
+static void npu3_relaxed_order_disable(struct npu3 *npu, uint64_t src)
+{
+ struct npu3_dev *dev;
+ uint32_t i;
+
+ for (i = 0; i < NPU3_RELAXED_SRC_MAX; i++)
+ if (npu3_read(npu, NPU3_RELAXED_SRC(i)) == src)
+ break;
+
+ if (i == NPU3_RELAXED_SRC_MAX)
+ return; /* Already disabled */
+
+ npu3_for_each_nvlink_dev(dev, npu) {
+ uint64_t val = npu3_read(npu, NPU3_RELAXED_CFG2(dev->index));
+
+ val &= ~NPU3_RELAXED_CFG2_SRC_WRENA(i);
+ val &= ~NPU3_RELAXED_CFG2_SRC_RDENA(i);
+ npu3_write(npu, NPU3_RELAXED_CFG2(dev->index), val);
+ }
+
+ npu3_write(npu, NPU3_RELAXED_SRC(i), 0ull);
+}
+
+/* Enable or disable relaxed ordering on all nvlinks for a given PEC. */
+int64_t npu3_set_relaxed_order(struct phb *phb, uint32_t gcid, int pec,
+ bool enable)
+{
+ struct npu3 *npu = npu3_phb_to_npu(phb);
+ int64_t rc = OPAL_SUCCESS;
+ uint64_t src;
+
+ NPU3INF(npu, "%s relaxed ordering for PEC %d on chip %d\n",
+ enable ? "Enabling" : "Disabling",
+ pec, gcid);
+
+ lock(&npu->lock);
+
+ src = SETFIELD(NPU3_RELAXED_SRC_GRPCHP, 0ull, gcid);
+ src = SETFIELD(NPU3_RELAXED_SRC_PEC, src, pec);
+ src = SETFIELD(NPU3_RELAXED_SRC_RDSTART, src, 0);
+ src = SETFIELD(NPU3_RELAXED_SRC_RDEND, src, 47);
+ src = SETFIELD(NPU3_RELAXED_SRC_WRSTART, src, 0);
+ src = SETFIELD(NPU3_RELAXED_SRC_WREND, src, 23);
+
+ if (enable)
+ rc = npu3_relaxed_order_enable(npu, src);
+ else
+ npu3_relaxed_order_disable(npu, src);
+
+ unlock(&npu->lock);
+ return rc;
+}
diff --git a/roms/skiboot/hw/npu3.c b/roms/skiboot/hw/npu3.c
new file mode 100644
index 000000000..03461373e
--- /dev/null
+++ b/roms/skiboot/hw/npu3.c
@@ -0,0 +1,549 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Copyright 2019 IBM Corp.
+ */
+
+#include <io.h>
+#include <xscom.h>
+#include <npu3.h>
+#include <npu3-regs.h>
+#include <nvram.h>
+#include <interrupts.h>
+#include <xive.h>
+
+#define NPU3LOG(l, npu, fmt, a...) \
+ prlog(l, "NPU[%d:%d]: " fmt, (npu)->chip_id, (npu)->index, ##a)
+#define NPU3DBG(npu, fmt, a...) NPU3LOG(PR_DEBUG, npu, fmt, ##a)
+#define NPU3INF(npu, fmt, a...) NPU3LOG(PR_INFO, npu, fmt, ##a)
+#define NPU3ERR(npu, fmt, a...) NPU3LOG(PR_ERR, npu, fmt, ##a)
+
+#define NPU3DEVLOG(l, dev, fmt, a...) \
+ prlog(l, "NPU[%d:%d:%d]: " fmt, \
+ (dev)->npu->chip_id, \
+ (dev)->npu->index, \
+ (dev)->index, ##a)
+#define NPU3DEVDBG(dev, fmt, a...) NPU3DEVLOG(PR_DEBUG, dev, fmt, ##a)
+#define NPU3DEVINF(dev, fmt, a...) NPU3DEVLOG(PR_INFO, dev, fmt, ##a)
+#define NPU3DEVERR(dev, fmt, a...) NPU3DEVLOG(PR_ERR, dev, fmt, ##a)
+
+static void npu3_dt_create_link(struct dt_node *npu, uint32_t npu_index,
+ uint32_t dev_index)
+{
+ struct dt_node *link;
+ uint32_t phy_lane_mask, ob_chiplet;
+
+ link = dt_new_addr(npu, "link", dev_index);
+
+ dt_add_property_string(link, "compatible", "ibm,npu-link");
+ dt_add_property_cells(link, "reg", dev_index);
+ dt_add_property_cells(link, "ibm,npu-link-index", dev_index);
+
+ switch (npu_index) {
+ case 0:
+ /* fall through */
+ case 2:
+ ob_chiplet = npu_index ? 3 : 0;
+
+ switch (dev_index) {
+ case 0:
+ phy_lane_mask = PPC_BITMASK32(0, 3);
+ break;
+ case 1:
+ phy_lane_mask = PPC_BITMASK32(13, 16);
+ break;
+ case 2:
+ phy_lane_mask = PPC_BITMASK32(7, 10);
+ break;
+ case 3:
+ phy_lane_mask = PPC_BITMASK32(20, 23);
+ break;
+ }
+
+ break;
+ case 1:
+ switch (dev_index) {
+ case 0:
+ ob_chiplet = 1;
+ phy_lane_mask = PPC_BITMASK32(0, 3);
+ break;
+ case 1:
+ ob_chiplet = 2;
+ phy_lane_mask = PPC_BITMASK32(0, 3);
+ break;
+ case 2:
+ ob_chiplet = 1;
+ phy_lane_mask = PPC_BITMASK32(7, 10);
+ break;
+ case 3:
+ ob_chiplet = 2;
+ phy_lane_mask = PPC_BITMASK32(7, 10);
+ break;
+ }
+
+ break;
+ default:
+ return;
+ }
+
+ dt_add_property_cells(link, "ibm,npu-phy", ob_chiplet);
+ dt_add_property_cells(link, "ibm,npu-lane-mask", phy_lane_mask);
+}
+
+static void npu3_dt_create_npu(struct dt_node *xscom, uint32_t npu_index)
+{
+ const uint32_t npu_base[] = { 0x5011000, 0x5011400, 0x3011c00 };
+ struct dt_node *npu;
+
+ npu = dt_new_addr(xscom, "npu", npu_base[npu_index]);
+
+ dt_add_property_cells(npu, "#size-cells", 0);
+ dt_add_property_cells(npu, "#address-cells", 1);
+ dt_add_property_cells(npu, "reg", npu_base[npu_index], 0x2c);
+ dt_add_property_string(npu, "compatible", "ibm,power9-npu3");
+ dt_add_property_cells(npu, "ibm,npu-index", npu_index);
+
+ for (uint32_t i = 0; i < NPU3_LINKS_PER_NPU; i++)
+ npu3_dt_create_link(npu, npu_index, i);
+}
+
+/* This can be removed when/if we decide to use HDAT instead */
+static bool npu3_dt_create(void)
+{
+ struct proc_chip *chip = next_chip(NULL);
+ struct dt_node *xscom;
+
+ /* npu3 chips only */
+ if (proc_gen < proc_gen_p9 ||
+ chip->type == PROC_CHIP_P9_NIMBUS ||
+ chip->type == PROC_CHIP_P9_CUMULUS)
+ return false;
+
+ dt_for_each_compatible(dt_root, xscom, "ibm,xscom")
+ for (uint32_t i = 0; i < 3; i++)
+ npu3_dt_create_npu(xscom, i);
+
+ return true;
+}
+
+static struct npu3 *npu3_create(struct dt_node *dn)
+{
+ struct npu3 *npu;
+ struct dt_node *link;
+ struct npu3_dev *dev;
+ char *path;
+ uint32_t i;
+
+ npu = zalloc(sizeof(*npu));
+ assert(npu);
+
+ init_lock(&npu->lock);
+
+ npu->dt_node = dn;
+ npu->index = dt_prop_get_u32(dn, "ibm,npu-index");
+ npu->xscom_base = dt_get_address(dn, 0, NULL);
+
+ npu->chip_id = dt_get_chip_id(dn);
+ assert(get_chip(npu->chip_id));
+
+ dt_for_each_compatible(dn, link, "ibm,npu-link") {
+ i = dt_prop_get_u32(link, "ibm,npu-link-index");
+ assert(i < NPU3_LINKS_PER_NPU);
+
+ dev = &npu->devices[i];
+ dev->index = i;
+ dev->npu = npu;
+ dev->dn = link;
+ dev->ob_chiplet = dt_prop_get_u32(link, "ibm,npu-phy");
+ dev->phy_lane_mask = dt_prop_get_u32(link, "ibm,npu-lane-mask");
+ dev->proc.status = NPU3_PROC_COMPLETE;
+ };
+
+ path = dt_get_path(dn);
+ NPU3INF(npu, "Found %s\n", path);
+ NPU3INF(npu, "SCOM base: 0x%llx\n", npu->xscom_base);
+ free(path);
+
+ return npu;
+}
+
+struct npu3_dev *npu3_next_dev(struct npu3 *npu, struct npu3_dev *dev,
+ enum npu3_dev_type type)
+{
+ uint32_t i = 0;
+
+ if (dev)
+ i = dev->index + 1;
+
+ for (; i < NPU3_LINKS_PER_NPU; i++) {
+ dev = &npu->devices[i];
+
+ if (dev->type == type || type == NPU3_DEV_TYPE_ANY)
+ return dev;
+ }
+
+ return NULL;
+}
+
+static void npu3_device_detect_fixup(struct npu3_dev *dev)
+{
+ struct dt_node *dn = dev->dn;
+
+ if (dev->type == NPU3_DEV_TYPE_NVLINK) {
+ dt_add_property_strings(dn, "ibm,npu-link-type", "nvlink");
+ dev->link_speed = dt_prop_get_u32_def(
+ dn, "nvidia,link-speed", 0xff);
+ return;
+ }
+
+ NPU3DEVDBG(dev, "Link type unknown\n");
+ dt_add_property_strings(dn, "ibm,npu-link-type", "unknown");
+}
+
+/*
+ * We use the indirect method because it uses the same addresses as
+ * the MMIO offsets (NPU RING)
+ */
+static void npu3_scom_sel(struct npu3 *npu, uint64_t reg, uint64_t size)
+{
+ uint64_t val;
+
+ val = SETFIELD(NPU3_MISC_DA_ADDR, 0ull, reg);
+ val = SETFIELD(NPU3_MISC_DA_LEN, val, size);
+ xscom_write(npu->chip_id,
+ npu->xscom_base + NPU3_MISC_SCOM_IND_SCOM_ADDR,
+ val);
+}
+
+static void npu3_scom_write(struct npu3 *npu, uint64_t reg, uint64_t size,
+ uint64_t val)
+{
+ npu3_scom_sel(npu, reg, size);
+ xscom_write(npu->chip_id,
+ npu->xscom_base + NPU3_MISC_SCOM_IND_SCOM_DATA,
+ val);
+}
+
+static uint64_t npu3_scom_read(struct npu3 *npu, uint64_t reg, uint64_t size)
+{
+ uint64_t val;
+
+ npu3_scom_sel(npu, reg, size);
+ xscom_read(npu->chip_id,
+ npu->xscom_base + NPU3_MISC_SCOM_IND_SCOM_DATA,
+ &val);
+
+ return val;
+}
+
+void npu3_write(struct npu3 *npu, uint64_t reg, uint64_t val)
+{
+ void *mmio = (void *)npu->regs[0];
+
+ if (mmio)
+ out_be64(mmio + reg, val);
+ else
+ npu3_scom_write(npu, reg, NPU3_MISC_DA_LEN_8B, val);
+
+ /* CQ_SM writes should be mirrored in all four blocks */
+ if (NPU3_REG_BLOCK(reg) != NPU3_BLOCK_CQ_SM(0))
+ return;
+
+ for (uint32_t i = 1; i < 4; i++)
+ npu3_write(npu, NPU3_BLOCK_CQ_SM(i) + NPU3_REG_OFFSET(reg),
+ val);
+}
+
+uint64_t npu3_read(struct npu3 *npu, uint64_t reg)
+{
+ void *mmio = (void *)npu->regs[0];
+
+ if (mmio)
+ return in_be64(mmio + reg);
+
+ return npu3_scom_read(npu, reg, NPU3_MISC_DA_LEN_8B);
+}
+
+void npu3_write_4b(struct npu3 *npu, uint64_t reg, uint32_t val)
+{
+ void *mmio = (void *)npu->regs[0];
+
+ if (mmio)
+ out_be32(mmio + reg, val);
+ else
+ npu3_scom_write(npu, reg, NPU3_MISC_DA_LEN_4B,
+ (uint64_t)val << 32);
+
+ if (NPU3_REG_BLOCK(reg) != NPU3_BLOCK_CQ_SM(0))
+ return;
+
+ for (uint32_t i = 1; i < 4; i++)
+ npu3_write_4b(npu, NPU3_BLOCK_CQ_SM(i) + NPU3_REG_OFFSET(reg),
+ val);
+}
+
+uint32_t npu3_read_4b(struct npu3 *npu, uint64_t reg)
+{
+ void *mmio = (void *)npu->regs[0];
+
+ if (mmio)
+ return in_be32(mmio + reg);
+
+ return npu3_scom_read(npu, reg, NPU3_MISC_DA_LEN_4B) >> 32;
+}
+
+static void npu3_misc_config(struct npu3 *npu)
+{
+ struct npu3_dev *dev;
+ uint32_t typemap = 0;
+ uint64_t reg, val;
+
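+ /*
+ * One mask bit per brick: set for NVLink bricks; the complement is
+ * used below to flag the remaining bricks as OpenCAPI mode.
+ */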
+ npu3_for_each_nvlink_dev(dev, npu)
+ typemap |= 0x10 >> dev->index;
+
+ reg = NPU3_MCP_MISC_CFG0;
+ val = npu3_read(npu, reg);
+ val |= NPU3_MCP_MISC_CFG0_ENABLE_PBUS;
+ val &= ~NPU3_MCP_MISC_CFG0_ENABLE_SNARF_CPM;
+ val = SETFIELD(NPU3_MCP_MISC_CFG0_NVLINK_MODE, val, typemap);
+ val = SETFIELD(NPU3_MCP_MISC_CFG0_OCAPI_MODE, val, ~typemap);
+ npu3_write(npu, reg, val);
+
+ reg = NPU3_SNP_MISC_CFG0;
+ val = npu3_read(npu, reg);
+ val |= NPU3_SNP_MISC_CFG0_ENABLE_PBUS;
+ val = SETFIELD(NPU3_SNP_MISC_CFG0_NVLINK_MODE, val, typemap);
+ val = SETFIELD(NPU3_SNP_MISC_CFG0_OCAPI_MODE, val, ~typemap);
+ npu3_write(npu, reg, val);
+
+ reg = NPU3_CTL_MISC_CFG2;
+ val = npu3_read(npu, reg);
+ val = SETFIELD(NPU3_CTL_MISC_CFG2_NVLINK_MODE, val, typemap);
+ val = SETFIELD(NPU3_CTL_MISC_CFG2_OCAPI_MODE, val, ~typemap);
+ npu3_write(npu, reg, val);
+
+ reg = NPU3_DAT_MISC_CFG1;
+ val = npu3_read(npu, reg);
+ val = SETFIELD(NPU3_DAT_MISC_CFG1_NVLINK_MODE, val, typemap);
+ val = SETFIELD(NPU3_DAT_MISC_CFG1_OCAPI_MODE, val, ~typemap);
+ npu3_write(npu, reg, val);
+}
+
+static void npu3_assign_bars(struct npu3 *npu)
+{
+ struct npu3_dev *dev;
+ uint64_t addr, size, val;
+
+ /* Global MMIO bar (per npu) */
+ phys_map_get(npu->chip_id, NPU_REGS, npu->index, &addr, &size);
+ val = SETFIELD(NPU3_MMIO_BAR_ADDR, 0ull, addr >> 24);
+ val |= NPU3_MMIO_BAR_ENABLE;
+ npu3_write(npu, NPU3_MMIO_BAR, val);
+
+ NPU3INF(npu, "MMIO base: 0x%016llx (%lldMB)\n", addr, size >> 20);
+ npu->regs[0] = addr;
+ npu->regs[1] = size;
+
+ /* NTL bar (per device) */
+ npu3_for_each_dev(dev, npu) {
+ phys_map_get(npu->chip_id, NPU_NTL, npu3_chip_dev_index(dev),
+ &addr, &size);
+ val = SETFIELD(NPU3_NTL_BAR_ADDR, 0ull, addr >> 16);
+ val = SETFIELD(NPU3_NTL_BAR_SIZE, val, ilog2(size >> 16));
+ npu3_write(npu, NPU3_NTL_BAR(dev->index), val);
+
+ dev->ntl_bar.addr = addr;
+ dev->ntl_bar.size = size;
+ }
+
+ /* GENID bar (logically divided per device) */
+ phys_map_get(npu->chip_id, NPU_GENID, npu->index, &addr, NULL);
+ val = SETFIELD(NPU3_GENID_BAR_ADDR, 0ull, addr >> 19);
+ npu3_write(npu, NPU3_GENID_BAR, val);
+
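+ /* Carve the single GENID window into one 64K slice per device */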
+ npu3_for_each_dev(dev, npu) {
+ dev->genid_bar.addr = addr + (dev->index << 16);
+ dev->genid_bar.size = 64 << 10;
+ }
+}
+
+void npu3_dev_enable_bars(struct npu3_dev *dev, bool enable)
+{
+ struct npu3 *npu = dev->npu;
+ uint64_t reg, val;
+
+ if (dev->ntl_bar.enable == enable) /* No state change */
+ return;
+
+ dev->ntl_bar.enable = enable;
+ dev->genid_bar.enable = enable;
+
+ reg = NPU3_NTL_BAR(dev->index);
+ val = npu3_read(npu, reg);
+ val = SETFIELD(NPU3_NTL_BAR_ENABLE, val, enable);
+ npu3_write(npu, reg, val);
+
+ /*
+ * Generation IDs are a single space in the hardware but we split them
+ * per device. Only disable in hardware if every device has disabled.
+ */
+ if (!enable)
+ npu3_for_each_dev(dev, npu)
+ if (dev->genid_bar.enable)
+ return;
+
+ reg = NPU3_GENID_BAR;
+ val = npu3_read(npu, reg);
+ val = SETFIELD(NPU3_GENID_BAR_ENABLE, val, enable);
+ npu3_write(npu, reg, val);
+}
+
+static uint64_t npu3_ipi_attributes(struct irq_source *is, uint32_t isn)
+{
+ struct npu3 *npu = is->data;
+ uint32_t level = isn - npu->irq_base;
+
+ /* TCE interrupt is used to detect a frozen PE */
+ if (level == 18)
+ return IRQ_ATTR_TARGET_OPAL |
+ IRQ_ATTR_TARGET_RARE |
+ IRQ_ATTR_TYPE_MSI;
+
+ return IRQ_ATTR_TARGET_LINUX;
+}
+
+static void npu3_ipi_interrupt(struct irq_source *is, uint32_t isn)
+{
+ struct npu3 *npu = is->data;
+ uint32_t level = isn - npu->irq_base;
+
+ if (level != 18) {
+ NPU3ERR(npu, "Received unknown interrupt %d\n", level);
+ return;
+ }
+
+ opal_update_pending_evt(OPAL_EVENT_PCI_ERROR, OPAL_EVENT_PCI_ERROR);
+}
+
+#define NPU3_IRQ_LEVELS 60
+
+static char *npu3_ipi_name(struct irq_source *is, uint32_t isn)
+{
+ struct npu3 *npu = is->data;
+ uint32_t level = isn - npu->irq_base;
+ static const char *names[NPU3_IRQ_LEVELS] = {
+ [0] = "NDL 0 Stall Event (brick 0)",
+ [1] = "NDL 0 No-Stall Event (brick 0)",
+ [2] = "NDL 1 Stall Event (brick 1)",
+ [3] = "NDL 1 No-Stall Event (brick 1)",
+ [4] = "NDL 2 Stall Event (brick 2)",
+ [5] = "NDL 2 No-Stall Event (brick 2)",
+ [6] = "NDL 3 Stall Event (brick 3)",
+ [7] = "NDL 3 No-Stall Event (brick 3)",
+ [8] = "NDL 4 Stall Event (brick 4)",
+ [9] = "NDL 4 No-Stall Event (brick 4)",
+ [10] = "NDL 5 Stall Event (brick 5)",
+ [11] = "NDL 5 No-Stall Event (brick 5)",
+ [12] = "NTL 0 Event",
+ [13] = "NTL 1 Event",
+ [14] = "NTL 2 Event",
+ [15] = "NTL 3 Event",
+ [16] = "NTL 4 Event",
+ [17] = "NTL 5 Event",
+ [18] = "TCE Event",
+ [19] = "ATS Event",
+ [20] = "CQ Event",
+ [21] = "MISC Event",
+ [41] = "Memory Controller Event",
+ [42] = "NDL 6 Stall Event (brick 6)",
+ [43] = "NDL 6 No-Stall Event (brick 6)",
+ [44] = "NDL 7 Stall Event (brick 7)",
+ [45] = "NDL 7 No-Stall Event (brick 7)",
+ [46] = "NDL 8 Stall Event (brick 8)",
+ [47] = "NDL 8 No-Stall Event (brick 8)",
+ [48] = "NDL 9 Stall Event (brick 9)",
+ [49] = "NDL 9 No-Stall Event (brick 9)",
+ [50] = "NDL 10 Stall Event (brick 10)",
+ [51] = "NDL 10 No-Stall Event (brick 10)",
+ [52] = "NDL 11 Stall Event (brick 11)",
+ [53] = "NDL 11 No-Stall Event (brick 11)",
+ [54] = "NTL 6 Event",
+ [55] = "NTL 7 Event",
+ [56] = "NTL 8 Event",
+ [57] = "NTL 9 Event",
+ [58] = "NTL 10 Event",
+ [59] = "NTL 11 Event",
+ };
+
+ if (level >= NPU3_IRQ_LEVELS || !names[level])
+ return strdup("Unknown");
+
+ return strdup(names[level]);
+}
+
+static const struct irq_source_ops npu3_ipi_ops = {
+ .attributes = npu3_ipi_attributes,
+ .interrupt = npu3_ipi_interrupt,
+ .name = npu3_ipi_name,
+};
+
+static void npu3_setup_irqs(struct npu3 *npu)
+{
+ uint64_t reg, val;
+ uint32_t base;
+
+ base = xive_alloc_ipi_irqs(npu->chip_id, NPU3_IRQ_LEVELS, 64);
+ if (base == XIVE_IRQ_ERROR) {
+ NPU3ERR(npu, "Failed to allocate interrupt sources\n");
+ return;
+ }
+
+ xive_register_ipi_source(base, NPU3_IRQ_LEVELS, npu, &npu3_ipi_ops);
+
+ /* Set IPI configuration */
+ reg = NPU3_MISC_CFG;
+ val = npu3_read(npu, reg);
+ val = SETFIELD(NPU3_MISC_CFG_IPI_PS, val, NPU3_MISC_CFG_IPI_PS_64K);
+ val = SETFIELD(NPU3_MISC_CFG_IPI_OS, val, NPU3_MISC_CFG_IPI_OS_AIX);
+ npu3_write(npu, reg, val);
+
+ /* Set IRQ base */
+ reg = NPU3_MISC_INT_BAR;
+ val = SETFIELD(NPU3_MISC_INT_BAR_ADDR, 0ull,
+ (uint64_t)xive_get_trigger_port(base) >> 12);
+ npu3_write(npu, reg, val);
+
+ npu->irq_base = base;
+}
+
+static void npu3_init(struct npu3 *npu)
+{
+ struct npu3_dev *dev;
+
+ platform.npu3_device_detect(npu);
+ npu3_for_each_dev(dev, npu)
+ npu3_device_detect_fixup(dev);
+
+ npu3_misc_config(npu);
+ npu3_assign_bars(npu);
+ npu3_setup_irqs(npu);
+ npu3_init_nvlink(npu);
+}
+
+void probe_npu3(void)
+{
+ struct dt_node *dn;
+ struct npu3 *npu;
+
+ if (!npu3_dt_create())
+ return;
+
+ if (!platform.npu3_device_detect) {
+ prlog(PR_INFO, "NPU: Platform does not support NPU\n");
+ return;
+ }
+
+ dt_for_each_compatible(dt_root, dn, "ibm,power9-npu3") {
+ npu = npu3_create(dn);
+ npu3_init(npu);
+ }
+}
diff --git a/roms/skiboot/hw/nx-842.c b/roms/skiboot/hw/nx-842.c
new file mode 100644
index 000000000..0cb87dcc8
--- /dev/null
+++ b/roms/skiboot/hw/nx-842.c
@@ -0,0 +1,231 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * NX unit 842 compression accelerator
+ *
+ * Copyright 2015-2019 IBM Corp.
+ */
+
+#include <skiboot.h>
+#include <chip.h>
+#include <xscom.h>
+#include <io.h>
+#include <cpu.h>
+#include <nx.h>
+#include <vas.h>
+
+/* Configuration settings */
+#define CFG_842_FC_ENABLE (0x1f) /* enable all 842 functions */
+#define CFG_842_ENABLE (1) /* enable 842 engines */
+#define DMA_CSB_WR NX_DMA_CSB_WR_CI
+#define DMA_COMPLETION_MODE NX_DMA_COMPLETION_MODE_CI
+#define DMA_CPB_WR NX_DMA_CPB_WR_CI_PAD
+#define DMA_OUTPUT_DATA_WR NX_DMA_OUTPUT_DATA_WR_CI
+#define EE_1 (1) /* enable engine 842 1 */
+#define EE_0 (1) /* enable engine 842 0 */
+
+static int nx_cfg_842(u32 gcid, u64 xcfg)
+{
+ u64 cfg, ci, ct;
+ int rc, instance = gcid + 1;
+
+ BUILD_ASSERT(MAX_CHIPS < NX_842_CFG_CI_MAX);
+
+ rc = xscom_read(gcid, xcfg, &cfg);
+ if (rc) {
+ prerror("NX%d: ERROR: XSCOM 842 config read failure %d\n",
+ gcid, rc);
+ return rc;
+ }
+
+ ct = GETFIELD(NX_842_CFG_CT, cfg);
+ if (!ct)
+ prlog(PR_INFO, "NX%d: 842 CT set to %u\n", gcid, NX_CT_842);
+ else if (ct == NX_CT_842)
+ prlog(PR_INFO, "NX%d: 842 CT already set to %u\n",
+ gcid, NX_CT_842);
+ else
+ prlog(PR_INFO, "NX%d: 842 CT already set to %u, "
+ "changing to %u\n", gcid, (unsigned int)ct, NX_CT_842);
+ ct = NX_CT_842;
+ cfg = SETFIELD(NX_842_CFG_CT, cfg, ct);
+
+ /* Coprocessor Instance must be shifted left.
+ * See hw doc Section 5.5.1.
+ */
+ ci = GETFIELD(NX_842_CFG_CI, cfg) >> NX_842_CFG_CI_LSHIFT;
+ if (!ci)
+ prlog(PR_INFO, "NX%d: 842 CI set to %d\n", gcid, instance);
+ else if (ci == instance)
+ prlog(PR_INFO, "NX%d: 842 CI already set to %u\n", gcid,
+ (unsigned int)ci);
+ else
+ prlog(PR_INFO, "NX%d: 842 CI already set to %u, "
+ "changing to %d\n", gcid, (unsigned int)ci, instance);
+ ci = instance;
+ cfg = SETFIELD(NX_842_CFG_CI, cfg, ci << NX_842_CFG_CI_LSHIFT);
+
+ /* Enable all functions */
+ cfg = SETFIELD(NX_842_CFG_FC_ENABLE, cfg, CFG_842_FC_ENABLE);
+
+ cfg = SETFIELD(NX_842_CFG_ENABLE, cfg, CFG_842_ENABLE);
+
+ rc = xscom_write(gcid, xcfg, cfg);
+ if (rc)
+ prerror("NX%d: ERROR: 842 CT %u CI %u config failure %d\n",
+ gcid, (unsigned int)ct, (unsigned int)ci, rc);
+ else
+ prlog(PR_DEBUG, "NX%d: 842 Config 0x%016lx\n",
+ gcid, (unsigned long)cfg);
+
+ return rc;
+}
+
+static int nx_cfg_842_umac(struct dt_node *node, u32 gcid, u32 pb_base)
+{
+ int rc;
+ u64 umac_bar, umac_notify;
+ struct dt_node *nx_node;
+ static u32 nx842_tid = 1; /* tid counter within coprocessor type */
+
+ nx_node = dt_new(node, "ibm,842-high-fifo");
+ umac_bar = pb_base + NX_P9_842_HIGH_PRI_RX_FIFO_BAR;
+ umac_notify = pb_base + NX_P9_842_HIGH_PRI_RX_FIFO_NOTIFY_MATCH;
+ rc = nx_cfg_rx_fifo(nx_node, "ibm,p9-nx-842", "High", gcid,
+ NX_CT_842, nx842_tid++, umac_bar,
+ umac_notify);
+ if (rc)
+ return rc;
+
+ nx_node = dt_new(node, "ibm,842-normal-fifo");
+ umac_bar = pb_base + NX_P9_842_NORMAL_PRI_RX_FIFO_BAR;
+ umac_notify = pb_base + NX_P9_842_NORMAL_PRI_RX_FIFO_NOTIFY_MATCH;
+ rc = nx_cfg_rx_fifo(nx_node, "ibm,p9-nx-842", "Normal", gcid,
+ NX_CT_842, nx842_tid++, umac_bar,
+ umac_notify);
+
+ return rc;
+}
+
+static int nx_cfg_842_dma(u32 gcid, u64 xcfg)
+{
+ u64 cfg;
+ int rc;
+
+ rc = xscom_read(gcid, xcfg, &cfg);
+ if (rc) {
+ prerror("NX%d: ERROR: XSCOM DMA config read failure %d\n",
+ gcid, rc);
+ return rc;
+ }
+
+ cfg = SETFIELD(NX_DMA_CFG_842_COMPRESS_PREFETCH, cfg,
+ DMA_COMPRESS_PREFETCH);
+ cfg = SETFIELD(NX_DMA_CFG_842_DECOMPRESS_PREFETCH, cfg,
+ DMA_DECOMPRESS_PREFETCH);
+ cfg = SETFIELD(NX_DMA_CFG_842_COMPRESS_MAX_RR, cfg,
+ DMA_COMPRESS_MAX_RR);
+ cfg = SETFIELD(NX_DMA_CFG_842_DECOMPRESS_MAX_RR, cfg,
+ DMA_DECOMPRESS_MAX_RR);
+ cfg = SETFIELD(NX_DMA_CFG_842_SPBC, cfg,
+ DMA_SPBC);
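+ /* The CSB/CPB/completion-mode write settings only apply to pre-P9 NX */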
+ if (proc_gen < proc_gen_p9) {
+ cfg = SETFIELD(NX_DMA_CFG_842_CSB_WR, cfg,
+ DMA_CSB_WR);
+ cfg = SETFIELD(NX_DMA_CFG_842_COMPLETION_MODE, cfg,
+ DMA_COMPLETION_MODE);
+ cfg = SETFIELD(NX_DMA_CFG_842_CPB_WR, cfg,
+ DMA_CPB_WR);
+ cfg = SETFIELD(NX_DMA_CFG_842_OUTPUT_DATA_WR, cfg,
+ DMA_OUTPUT_DATA_WR);
+ }
+
+ rc = xscom_write(gcid, xcfg, cfg);
+ if (rc)
+ prerror("NX%d: ERROR: DMA config failure %d\n", gcid, rc);
+ else
+ prlog(PR_DEBUG, "NX%d: DMA 0x%016lx\n", gcid,
+ (unsigned long)cfg);
+
+ return rc;
+}
+
+static int nx_cfg_842_ee(u32 gcid, u64 xcfg)
+{
+ u64 cfg;
+ int rc;
+
+ rc = xscom_read(gcid, xcfg, &cfg);
+ if (rc) {
+ prerror("NX%d: ERROR: XSCOM EE config read failure %d\n",
+ gcid, rc);
+ return rc;
+ }
+
+ cfg = SETFIELD(NX_EE_CFG_CH1, cfg, EE_1);
+ cfg = SETFIELD(NX_EE_CFG_CH0, cfg, EE_0);
+
+ rc = xscom_write(gcid, xcfg, cfg);
+ if (rc)
+ prerror("NX%d: ERROR: Engine Enable failure %d\n", gcid, rc);
+ else
+ prlog(PR_DEBUG, "NX%d: Engine Enable 0x%016lx\n",
+ gcid, (unsigned long)cfg);
+
+ return rc;
+}
+
+void nx_enable_842(struct dt_node *node, u32 gcid, u32 pb_base)
+{
+ u64 cfg_dma, cfg_842, cfg_ee;
+ int rc;
+
+ if (dt_node_is_compatible(node, "ibm,power8-nx")) {
+ cfg_dma = pb_base + NX_P8_DMA_CFG;
+ cfg_842 = pb_base + NX_P8_842_CFG;
+ cfg_ee = pb_base + NX_P8_EE_CFG;
+ } else {
+ prerror("NX%d: ERROR: Unknown NX type!\n", gcid);
+ return;
+ }
+
+ rc = nx_cfg_842_dma(gcid, cfg_dma);
+ if (rc)
+ return;
+
+ rc = nx_cfg_842(gcid, cfg_842);
+ if (rc)
+ return;
+
+ rc = nx_cfg_842_ee(gcid, cfg_ee);
+ if (rc)
+ return;
+
+ prlog(PR_INFO, "NX%d: 842 Coprocessor Enabled\n", gcid);
+
+ dt_add_property_cells(node, "ibm,842-coprocessor-type", NX_CT_842);
+ dt_add_property_cells(node, "ibm,842-coprocessor-instance", gcid + 1);
+}
+
+void p9_nx_enable_842(struct dt_node *node, u32 gcid, u32 pb_base)
+{
+ u64 cfg_dma, cfg_ee;
+ int rc;
+
+ cfg_dma = pb_base + NX_P9_DMA_CFG;
+ cfg_ee = pb_base + NX_P9_EE_CFG;
+
+ rc = nx_cfg_842_dma(gcid, cfg_dma);
+ if (rc)
+ return;
+
+ rc = nx_cfg_842_umac(node, gcid, pb_base);
+ if (rc)
+ return;
+
+ rc = nx_cfg_842_ee(gcid, cfg_ee);
+ if (rc)
+ return;
+
+ prlog(PR_INFO, "NX%d: 842 Coprocessor Enabled\n", gcid);
+
+}
diff --git a/roms/skiboot/hw/nx-compress.c b/roms/skiboot/hw/nx-compress.c
new file mode 100644
index 000000000..9b3c6717d
--- /dev/null
+++ b/roms/skiboot/hw/nx-compress.c
@@ -0,0 +1,340 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * NX has 842 and GZIP (P9) accelerators
+ *
+ * Copyright 2015-2018 IBM Corp.
+ */
+
+#include <skiboot.h>
+#include <chip.h>
+#include <xscom.h>
+#include <io.h>
+#include <cpu.h>
+#include <nx.h>
+#include <vas.h>
+#include <opal.h>
+
+static int nx_cfg_umac_tx_wc(u32 gcid, u64 xcfg)
+{
+ int rc = 0;
+ u64 cfg;
+
+ cfg = vas_get_wcbs_bar(gcid);
+ if (!cfg) {
+ prerror("NX%d: ERROR finding WC Backing store BAR\n", gcid);
+ return -ENOMEM;
+ }
+
+ /*
+ * NOTE: Write the entire bar address to SCOM. VAS/NX will extract
+ * the relevant (NX_P9_UMAC_TX_WINDOW_CONTEXT_ADDR) bits.
+ * IOW, _don't_ just write the bit field like:
+ *
+ * cfg = SETFIELD(NX_P9_UMAC_TX_WINDOW_CONTEXT_ADDR, 0ULL, cfg);
+ */
+ rc = xscom_write(gcid, xcfg, cfg);
+
+ if (rc)
+ prerror("NX%d: ERROR: UMAC SEND WC BAR, %d\n", gcid, rc);
+ else
+ prlog(PR_DEBUG, "NX%d: UMAC SEND WC BAR, 0x%016lx, "
+ "xcfg 0x%llx\n",
+ gcid, (unsigned long)cfg, xcfg);
+
+ return rc;
+}
+
+static int nx_cfg_dma_vas_mmio(u32 gcid, u64 xcfg)
+{
+ int rc = 0;
+ u64 cfg;
+
+ cfg = vas_get_hvwc_mmio_bar(gcid);
+ /*
+ * NOTE: Write the entire bar address to SCOM. VAS/NX will extract
+ * the relevant (NX_P9_UMAC_VAS_MMIO_ADDR) bits. IOW, _don't_
+ * just write the bit field like:
+ *
+ * cfg = SETFIELD(NX_P9_DMA_VAS_MMIO_ADDR, 0ULL, cfg);
+ */
+ rc = xscom_write(gcid, xcfg, cfg);
+
+ if (rc)
+ prerror("NX%d: ERROR: DMA VAS MMIO BAR, %d\n", gcid, rc);
+ else
+ prlog(PR_DEBUG, "NX%d: DMA VAS MMIO BAR, 0x%016lx, xcfg 0x%llx\n",
+ gcid, (unsigned long)cfg, xcfg);
+
+ return rc;
+}
+
+static int nx_cfg_umac_vas_mmio(u32 gcid, u64 xcfg)
+{
+ int rc = 0;
+ u64 cfg;
+
+ cfg = vas_get_hvwc_mmio_bar(gcid);
+ /*
+ * NOTE: Write the entire bar address to SCOM. VAS/NX will extract
+ * the relevant (NX_P9_UMAC_VAS_MMIO_ADDR) bits. IOW, _don't_
+ * just write the bit field like:
+ *
+ * cfg = SETFIELD(NX_P9_UMAC_VAS_MMIO_ADDR, 0ULL, cfg);
+ */
+ rc = xscom_write(gcid, xcfg, cfg);
+
+ if (rc)
+ prerror("NX%d: ERROR: UMAC VAS MMIO BAR, %d\n", gcid, rc);
+ else
+ prlog(PR_DEBUG, "NX%d: UMAC VAS MMIO BAR, 0x%016lx, "
+ "xcfg 0x%llx\n",
+ gcid, (unsigned long)cfg, xcfg);
+
+ return rc;
+}
+
+static int nx_cfg_umac_status_ctrl(u32 gcid, u64 xcfg)
+{
+ u64 uctrl;
+ int rc;
+#define CRB_ENABLE 1
+
+ rc = xscom_read(gcid, xcfg, &uctrl);
+ if (rc)
+ return rc;
+
+ uctrl = SETFIELD(NX_P9_UMAC_STATUS_CTRL_CRB_ENABLE, uctrl, CRB_ENABLE);
+ rc = xscom_write(gcid, xcfg, uctrl);
+ if (rc)
+ prerror("NX%d: ERROR: Setting UMAC Status Control failure %d\n",
+ gcid, rc);
+ else
+ prlog(PR_DEBUG, "NX%d: Setting UMAC Status Control 0x%016lx\n",
+ gcid, (unsigned long)uctrl);
+
+ return rc;
+}
+
+static int nx_cfg_vas_rma_bar(u32 gcid, u64 xcfg)
+{
+ int rc = 0;
+ u64 cfg;
+
+ cfg = vas_get_rma_bar(gcid);
+ /*
+ * NOTE: Write the entire bar address to SCOM. VAS/NX will extract
+ * the relevant (NX_P10_VAS_RMA_WRITE_BAR) bits. IOW, _don't_
+ * just write the bit field like:
+ * cfg = SETFIELD(NX_P10_VAS_RMA_WRITE_BAR, 0ULL, cfg);
+ */
+ rc = xscom_write(gcid, xcfg, cfg);
+
+ if (rc)
+ prerror("NX%d: ERROR: VAS RMA WRITE BAR, %d\n", gcid, rc);
+ else
+ prlog(PR_DEBUG, "NX%d: VAS RMA WRITE BAR, 0x%016lx, "
+ "xcfg 0x%llx\n", gcid, (unsigned long)cfg,
+ xcfg);
+
+ return rc;
+}
+
+int nx_cfg_rx_fifo(struct dt_node *node, const char *compat,
+ const char *priority, u32 gcid, u32 pid, u32 tid,
+ u64 umac_bar, u64 umac_notify)
+{
+ u64 cfg;
+ int rc, size;
+ uint64_t fifo;
+ u32 lpid = 0xfff; /* All 1's for 12 bits in UMAC notify match reg */
+#define MATCH_ENABLE 1
+
+ fifo = (uint64_t) local_alloc(gcid, RX_FIFO_SIZE, RX_FIFO_SIZE);
+ assert(fifo);
+
+ /*
+ * When configuring the address of the Rx FIFO into the Receive FIFO
+ * BAR, we should _NOT_ shift the address into bits 8:53. Instead we
+ * should copy the address as is and VAS/NX will extract relevant bits.
+ */
+ /*
+ * Section 5.21 of P9 NX Workbook Version 2.42 shows Receive FIFO BAR
+ * 54:56 represents FIFO size
+ * 000 = 1KB, 8 CRBs
+ * 001 = 2KB, 16 CRBs
+ * 010 = 4KB, 32 CRBs
+ * 011 = 8KB, 64 CRBs
+ * 100 = 16KB, 128 CRBs
+ * 101 = 32KB, 256 CRBs
+ *	 110, 111 = reserved
+ */
+ size = RX_FIFO_SIZE / 1024;
+ cfg = SETFIELD(NX_P9_RX_FIFO_BAR_SIZE, fifo, ilog2(size));
+
+ rc = xscom_write(gcid, umac_bar, cfg);
+ if (rc) {
+ prerror("NX%d: ERROR: Setting UMAC FIFO bar failure %d\n",
+ gcid, rc);
+ return rc;
+ } else
+ prlog(PR_DEBUG, "NX%d: Setting UMAC FIFO bar 0x%016lx\n",
+ gcid, (unsigned long)cfg);
+
+ rc = xscom_read(gcid, umac_notify, &cfg);
+ if (rc)
+ return rc;
+
+ /*
+ * VAS issues asb_notify with the unique ID to identify the target
+ * co-processor/engine. Logical partition ID (lpid), process ID (pid),
+ * and thread ID (tid) combination is used to define the unique ID
+	 * in the system. Export these values in the device tree so that the
+	 * driver can configure the RxFIFO with VAS. Set these values in the
+	 * RxFIFO notify match register for each engine, which compares the ID
+	 * against each request.
+	 * To define a unique identification, 0xfff (1's for 12 bits),
+ * co-processor type, and counter within coprocessor type are used
+ * for lpid, pid, and tid respectively.
+ */
+ cfg = SETFIELD(NX_P9_RX_FIFO_NOTIFY_MATCH_LPID, cfg, lpid);
+ cfg = SETFIELD(NX_P9_RX_FIFO_NOTIFY_MATCH_PID, cfg, pid);
+ cfg = SETFIELD(NX_P9_RX_FIFO_NOTIFY_MATCH_TID, cfg, tid);
+ cfg = SETFIELD(NX_P9_RX_FIFO_NOTIFY_MATCH_MATCH_ENABLE, cfg,
+ MATCH_ENABLE);
+
+ rc = xscom_write(gcid, umac_notify, cfg);
+ if (rc) {
+ prerror("NX%d: ERROR: Setting UMAC notify match failure %d\n",
+ gcid, rc);
+ return rc;
+ } else
+ prlog(PR_DEBUG, "NX%d: Setting UMAC notify match 0x%016lx\n",
+ gcid, (unsigned long)cfg);
+
+ dt_add_property_string(node, "compatible", compat);
+ dt_add_property_string(node, "priority", priority);
+ dt_add_property_u64(node, "rx-fifo-address", fifo);
+ dt_add_property_cells(node, "rx-fifo-size", RX_FIFO_SIZE);
+ dt_add_property_cells(node, "lpid", lpid);
+ dt_add_property_cells(node, "pid", pid);
+ dt_add_property_cells(node, "tid", tid);
+
+ return 0;
+}
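A worked example of the Receive FIFO BAR size encoding documented above, assuming (purely for illustration) an RX_FIFO_SIZE of 32 KB; the real value and the real ilog2() helper come from skiboot headers, and fifo_size_field() below is a hypothetical stand-in:

    #include <stdint.h>

    /* 32 KB FIFO: 32768 / 1024 = 32 and log2(32) = 5 = 0b101, i.e. the
     * "101 = 32KB, 256 CRBs" row of the table above. */
    static inline uint64_t fifo_size_field(uint64_t fifo_bytes)
    {
            uint64_t kb = fifo_bytes / 1024;

            return 63 - __builtin_clzll(kb);   /* log2 for powers of two */
    }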
+
+static int nx_init_fifo_ctrl(u32 gcid, u64 fifo_ctrl)
+{
+ u64 cfg;
+ int rc = 0;
+
+ rc = xscom_read(gcid, fifo_ctrl, &cfg);
+ if (rc)
+ return rc;
+
+ cfg = SETFIELD(NX_P9_RX_FIFO_CTRL_READ_OFFSET, cfg, 0);
+ cfg = SETFIELD(NX_P9_RX_FIFO_CTRL_QUEUED, cfg, 0);
+
+ rc = xscom_write(gcid, fifo_ctrl, cfg);
+
+ return rc;
+}
+
+
+static int opal_nx_coproc_init(u32 gcid, u32 ct)
+{
+ struct proc_chip *chip;
+ u64 fifo, fifo_hi;
+ u32 nx_base;
+ int rc;
+
+ if (proc_gen < proc_gen_p9)
+ return OPAL_UNSUPPORTED;
+
+ chip = get_chip(gcid);
+ if (!chip)
+ return OPAL_PARAMETER;
+
+ nx_base = chip->nx_base;
+ if (!nx_base)
+ return OPAL_PARAMETER;
+
+ switch (ct) {
+ case NX_CT_842:
+ fifo_hi = nx_base + NX_P9_842_HIGH_PRI_RX_FIFO_CTRL;
+ fifo = nx_base + NX_P9_842_NORMAL_PRI_RX_FIFO_CTRL;
+ break;
+ case NX_CT_GZIP:
+ fifo_hi = nx_base + NX_P9_GZIP_HIGH_PRI_RX_FIFO_CTRL;
+ fifo = nx_base + NX_P9_GZIP_NORMAL_PRI_RX_FIFO_CTRL;
+ break;
+ default:
+ prlog(PR_EMERG, "OPAL: Unknown NX coprocessor type\n");
+ return OPAL_PARAMETER;
+ }
+
+ rc = nx_init_fifo_ctrl(gcid, fifo_hi);
+
+ if (!rc)
+ rc = nx_init_fifo_ctrl(gcid, fifo);
+
+ return rc;
+}
+
+opal_call(OPAL_NX_COPROC_INIT, opal_nx_coproc_init, 2);
+
+void nx_create_compress_node(struct dt_node *node)
+{
+ u32 gcid, pb_base;
+ struct proc_chip *chip;
+ int rc;
+
+ gcid = dt_get_chip_id(node);
+ pb_base = dt_get_address(node, 0, NULL);
+
+ chip = get_chip(gcid);
+ chip->nx_base = pb_base;
+
+ prlog(PR_INFO, "NX%d: 842 at 0x%x\n", gcid, pb_base);
+
+ /*
+	 * P10 is compatible with ibm,power9-nx, so the same
+	 * compatible string is used there.
+ */
+ if (dt_node_is_compatible(node, "ibm,power9-nx")) {
+ u64 cfg_mmio, cfg_txwc, cfg_uctrl, cfg_dma;
+
+ prlog(PR_DEBUG, "Found ibm,power9-nx\n");
+ cfg_mmio = pb_base + NX_P9_UMAC_VAS_MMIO_BAR;
+ cfg_dma = pb_base + NX_P9_DMA_VAS_MMIO_BAR;
+ cfg_txwc = pb_base + NX_P9_UMAC_TX_WINDOW_CONTEXT_BAR;
+ cfg_uctrl = pb_base + NX_P9_UMAC_STATUS_CTRL;
+
+ rc = nx_cfg_umac_vas_mmio(gcid, cfg_mmio);
+ if (rc)
+ return;
+
+ rc = nx_cfg_dma_vas_mmio(gcid, cfg_dma);
+ if (rc)
+ return;
+
+ rc = nx_cfg_umac_tx_wc(gcid, cfg_txwc);
+ if (rc)
+ return;
+
+ rc = nx_cfg_umac_status_ctrl(gcid, cfg_uctrl);
+ if (rc)
+ return;
+
+ if (proc_gen > proc_gen_p9) {
+ u64 cfg_rma = pb_base + NX_P10_VAS_RMA_WRITE_BAR;
+
+ rc = nx_cfg_vas_rma_bar(gcid, cfg_rma);
+ if (rc)
+ return;
+ }
+
+ p9_nx_enable_842(node, gcid, pb_base);
+ p9_nx_enable_gzip(node, gcid, pb_base);
+ } else
+ nx_enable_842(node, gcid, pb_base);
+}
diff --git a/roms/skiboot/hw/nx-crypto.c b/roms/skiboot/hw/nx-crypto.c
new file mode 100644
index 000000000..8b8ff5ee5
--- /dev/null
+++ b/roms/skiboot/hw/nx-crypto.c
@@ -0,0 +1,298 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * NX Cryptographic accelerators
+ *
+ * Copyright 2015-2019 IBM Corp.
+ */
+
+#include <skiboot.h>
+#include <chip.h>
+#include <xscom.h>
+#include <io.h>
+#include <cpu.h>
+#include <nx.h>
+
+/* Configuration settings */
+#define CFG_SYM_FC_ENABLE (0) /* disable all sym functions */
+#define CFG_SYM_ENABLE (0) /* disable sym engines */
+#define CFG_ASYM_FC_ENABLE (0) /* disable all asym functions */
+#define CFG_ASYM_ENABLE (0) /* disable asym engines */
+#define CFG_CRB_IQ_SYM (0) /* don't use any extra input queues */
+#define CFG_CRB_IQ_ASYM (0) /* don't use any extra input queues */
+#define AES_SHA_MAX_RR (1) /* valid range: 1-8 */
+#define AES_SHA_CSB_WR NX_DMA_CSB_WR_PDMA
+#define AES_SHA_COMPLETION_MODE NX_DMA_COMPLETION_MODE_PDMA
+#define AES_SHA_CPB_WR NX_DMA_CPB_WR_DMA_NOPAD
+#define AES_SHA_OUTPUT_DATA_WR NX_DMA_OUTPUT_DATA_WR_DMA
+#define AMF_MAX_RR (1) /* valid range: 1-8 */
+#define AMF_CSB_WR NX_DMA_CSB_WR_PDMA
+#define AMF_COMPLETION_MODE NX_DMA_COMPLETION_MODE_PDMA
+#define AMF_CPB_WR (0) /* CPB WR not done with AMF */
+#define AMF_OUTPUT_DATA_WR NX_DMA_OUTPUT_DATA_WR_DMA
+#define EE_CH7 (0) /* disable engine AMF 3(P8) */
+#define EE_CH6 (0) /* disable engine AMF 2(P8) */
+#define EE_CH5 (0) /* disable engine AMF 1(P8) */
+#define EE_CH4 (0) /* disable engine SYM AMF 0(P8) */
+#define EE_CH3 (0) /* disable engine SYM 1 */
+#define EE_CH2 (0) /* disable engine SYM 0 */
+
+static int nx_cfg_sym(u32 gcid, u64 xcfg)
+{
+ u64 cfg, ci, ct;
+ int rc, instance = gcid + 1;
+
+ BUILD_ASSERT(MAX_CHIPS < NX_SYM_CFG_CI_MAX);
+
+ rc = xscom_read(gcid, xcfg, &cfg);
+ if (rc) {
+ prerror("NX%d: ERROR: XSCOM SYM config read failure %d\n",
+ gcid, rc);
+ return rc;
+ }
+
+ ct = GETFIELD(NX_SYM_CFG_CT, cfg);
+ if (!ct)
+ prlog(PR_INFO, "NX%d: SYM CT set to %u\n", gcid, NX_CT_SYM);
+ else if (ct == NX_CT_SYM)
+ prlog(PR_INFO, "NX%d: SYM CT already set to %u\n",
+ gcid, NX_CT_SYM);
+ else
+ prlog(PR_INFO, "NX%d: SYM CT already set to %u, "
+ "changing to %u\n", gcid, (unsigned int)ct, NX_CT_SYM);
+ ct = NX_CT_SYM;
+ cfg = SETFIELD(NX_SYM_CFG_CT, cfg, ct);
+
+ /* Coprocessor Instance must be shifted left.
+ * See hw doc Section 5.5.1.
+ */
+ ci = GETFIELD(NX_SYM_CFG_CI, cfg) >> NX_SYM_CFG_CI_LSHIFT;
+ if (!ci)
+ prlog(PR_INFO, "NX%d: SYM CI set to %d\n", gcid, instance);
+ else if (ci == instance)
+ prlog(PR_INFO, "NX%d: SYM CI already set to %u\n", gcid,
+ (unsigned int)ci);
+ else
+ prlog(PR_INFO, "NX%d: SYM CI already set to %u, "
+ "changing to %d\n", gcid, (unsigned int)ci, instance);
+ ci = instance;
+ cfg = SETFIELD(NX_SYM_CFG_CI, cfg, ci << NX_SYM_CFG_CI_LSHIFT);
+
+ cfg = SETFIELD(NX_SYM_CFG_FC_ENABLE, cfg, CFG_SYM_FC_ENABLE);
+
+ cfg = SETFIELD(NX_SYM_CFG_ENABLE, cfg, CFG_SYM_ENABLE);
+
+ rc = xscom_write(gcid, xcfg, cfg);
+ if (rc)
+ prerror("NX%d: ERROR: SYM CT %u CI %u config failure %d\n",
+ gcid, (unsigned int)ct, (unsigned int)ci, rc);
+ else
+ prlog(PR_DEBUG, "NX%d: SYM Config 0x%016lx\n",
+ gcid, (unsigned long)cfg);
+
+ return rc;
+}
+
+static int nx_cfg_asym(u32 gcid, u64 xcfg)
+{
+ u64 cfg, ci, ct;
+ int rc, instance = gcid + 1;
+
+ BUILD_ASSERT(MAX_CHIPS < NX_ASYM_CFG_CI_MAX);
+
+ rc = xscom_read(gcid, xcfg, &cfg);
+ if (rc) {
+ prerror("NX%d: ERROR: XSCOM ASYM config read failure %d\n",
+ gcid, rc);
+ return rc;
+ }
+
+ ct = GETFIELD(NX_ASYM_CFG_CT, cfg);
+ if (!ct)
+ prlog(PR_INFO, "NX%d: ASYM CT set to %u\n",
+ gcid, NX_CT_ASYM);
+ else if (ct == NX_CT_ASYM)
+ prlog(PR_INFO, "NX%d: ASYM CT already set to %u\n",
+ gcid, NX_CT_ASYM);
+ else
+ prlog(PR_INFO, "NX%d: ASYM CT already set to %u, "
+ "changing to %u\n", gcid, (unsigned int)ct, NX_CT_ASYM);
+ ct = NX_CT_ASYM;
+ cfg = SETFIELD(NX_ASYM_CFG_CT, cfg, ct);
+
+ /* Coprocessor Instance must be shifted left.
+ * See hw doc Section 5.5.1.
+ */
+ ci = GETFIELD(NX_ASYM_CFG_CI, cfg) >> NX_ASYM_CFG_CI_LSHIFT;
+ if (!ci)
+ prlog(PR_INFO, "NX%d: ASYM CI set to %d\n", gcid, instance);
+ else if (ci == instance)
+ prlog(PR_INFO, "NX%d: ASYM CI already set to %u\n", gcid,
+ (unsigned int)ci);
+ else
+ prlog(PR_INFO, "NX%d: ASYM CI already set to %u, "
+ "changing to %d\n", gcid, (unsigned int)ci, instance);
+ ci = instance;
+ cfg = SETFIELD(NX_ASYM_CFG_CI, cfg, ci << NX_ASYM_CFG_CI_LSHIFT);
+
+ cfg = SETFIELD(NX_ASYM_CFG_FC_ENABLE, cfg, CFG_ASYM_FC_ENABLE);
+
+ cfg = SETFIELD(NX_ASYM_CFG_ENABLE, cfg, CFG_ASYM_ENABLE);
+
+ rc = xscom_write(gcid, xcfg, cfg);
+ if (rc)
+ prerror("NX%d: ERROR: ASYM CT %u CI %u config failure %d\n",
+ gcid, (unsigned int)ct, (unsigned int)ci, rc);
+ else
+ prlog(PR_DEBUG, "NX%d: ASYM Config 0x%016lx\n",
+ gcid, (unsigned long)cfg);
+
+ return rc;
+}
+
+static int nx_cfg_dma(u32 gcid, u64 xcfg)
+{
+ u64 cfg;
+ int rc;
+
+ rc = xscom_read(gcid, xcfg, &cfg);
+ if (rc) {
+ prerror("NX%d: ERROR: XSCOM DMA config read failure %d\n",
+ gcid, rc);
+ return rc;
+ }
+
+ cfg = SETFIELD(NX_DMA_CFG_AES_SHA_MAX_RR, cfg,
+ AES_SHA_MAX_RR);
+ cfg = SETFIELD(NX_DMA_CFG_AES_SHA_CSB_WR, cfg,
+ AES_SHA_CSB_WR);
+ cfg = SETFIELD(NX_DMA_CFG_AES_SHA_COMPLETION_MODE, cfg,
+ AES_SHA_COMPLETION_MODE);
+ cfg = SETFIELD(NX_DMA_CFG_AES_SHA_CPB_WR, cfg,
+ AES_SHA_CPB_WR);
+ cfg = SETFIELD(NX_DMA_CFG_AES_SHA_OUTPUT_DATA_WR, cfg,
+ AES_SHA_OUTPUT_DATA_WR);
+
+ cfg = SETFIELD(NX_DMA_CFG_AMF_MAX_RR, cfg,
+ AMF_MAX_RR);
+ cfg = SETFIELD(NX_DMA_CFG_AMF_CSB_WR, cfg,
+ AMF_CSB_WR);
+ cfg = SETFIELD(NX_DMA_CFG_AMF_COMPLETION_MODE, cfg,
+ AMF_COMPLETION_MODE);
+ cfg = SETFIELD(NX_DMA_CFG_AMF_CPB_WR, cfg,
+ AMF_CPB_WR);
+ cfg = SETFIELD(NX_DMA_CFG_AMF_OUTPUT_DATA_WR, cfg,
+ AMF_OUTPUT_DATA_WR);
+
+ rc = xscom_write(gcid, xcfg, cfg);
+ if (rc)
+ prerror("NX%d: ERROR: DMA config failure %d\n", gcid, rc);
+ else
+ prlog(PR_DEBUG, "NX%d: DMA 0x%016lx\n", gcid,
+ (unsigned long)cfg);
+
+ return rc;
+}
+
+static int nx_cfg_iq(u32 gcid, u64 xcfg)
+{
+ u64 cfg;
+ int rc;
+
+ rc = xscom_read(gcid, xcfg, &cfg);
+ if (rc) {
+ prerror("NX%d: ERROR: XSCOM CRB IQ config read failure %d\n",
+ gcid, rc);
+ return rc;
+ }
+
+ cfg = SETFIELD(NX_CRB_IQ_SYM, cfg, CFG_CRB_IQ_SYM);
+ cfg = SETFIELD(NX_CRB_IQ_ASYM, cfg, CFG_CRB_IQ_ASYM);
+
+ rc = xscom_write(gcid, xcfg, cfg);
+ if (rc)
+ prerror("NX%d: ERROR: CRB Input Queue failure %d\n", gcid, rc);
+ else
+ prlog(PR_DEBUG, "NX%d: CRB Input Queue 0x%016lx\n",
+ gcid, (unsigned long)cfg);
+
+ return rc;
+}
+
+static int nx_cfg_ee(u32 gcid, u64 xcfg)
+{
+ u64 cfg;
+ int rc;
+
+ rc = xscom_read(gcid, xcfg, &cfg);
+ if (rc) {
+ prerror("NX%d: ERROR: XSCOM EE config read failure %d\n",
+ gcid, rc);
+ return rc;
+ }
+
+ cfg = SETFIELD(NX_EE_CFG_CH7, cfg, EE_CH7);
+ cfg = SETFIELD(NX_EE_CFG_CH6, cfg, EE_CH6);
+ cfg = SETFIELD(NX_EE_CFG_CH5, cfg, EE_CH5);
+ cfg = SETFIELD(NX_EE_CFG_CH4, cfg, EE_CH4);
+ cfg = SETFIELD(NX_EE_CFG_CH3, cfg, EE_CH3);
+ cfg = SETFIELD(NX_EE_CFG_CH2, cfg, EE_CH2);
+
+ rc = xscom_write(gcid, xcfg, cfg);
+ if (rc)
+ prerror("NX%d: ERROR: Engine Enable failure %d\n", gcid, rc);
+ else
+ prlog(PR_DEBUG, "NX%d: Engine Enable 0x%016lx\n",
+ gcid, (unsigned long)cfg);
+
+ return rc;
+}
+
+void nx_create_crypto_node(struct dt_node *node)
+{
+ u32 gcid;
+ u32 pb_base;
+ u64 cfg_dma, cfg_sym, cfg_asym, cfg_iq, cfg_ee;
+ int rc;
+
+ gcid = dt_get_chip_id(node);
+ pb_base = dt_get_address(node, 0, NULL);
+
+ prlog(PR_INFO, "NX%d: Crypto at 0x%x\n", gcid, pb_base);
+
+ if (dt_node_is_compatible(node, "ibm,power8-nx")) {
+ cfg_dma = pb_base + NX_P8_DMA_CFG;
+ cfg_sym = pb_base + NX_P8_SYM_CFG;
+ cfg_asym = pb_base + NX_P8_ASYM_CFG;
+ cfg_iq = pb_base + NX_P8_CRB_IQ;
+ cfg_ee = pb_base + NX_P8_EE_CFG;
+ } else if (dt_node_is_compatible(node, "ibm,power9-nx")) {
+ prlog(PR_INFO, "NX%d: POWER9 nx-crypto not yet supported\n",
+ gcid);
+ return;
+ } else {
+ prerror("NX%d: ERROR: Unknown NX type!\n", gcid);
+ return;
+ }
+
+ rc = nx_cfg_dma(gcid, cfg_dma);
+ if (rc)
+ return;
+
+ rc = nx_cfg_sym(gcid, cfg_sym);
+ if (rc)
+ return;
+
+ rc = nx_cfg_asym(gcid, cfg_asym);
+ if (rc)
+ return;
+
+ rc = nx_cfg_iq(gcid, cfg_iq);
+ if (rc)
+ return;
+
+ rc = nx_cfg_ee(gcid, cfg_ee);
+ if (rc)
+ return;
+
+ prlog(PR_INFO, "NX%d: Crypto Coprocessors Disabled (not supported)\n", gcid);
+}
diff --git a/roms/skiboot/hw/nx-gzip.c b/roms/skiboot/hw/nx-gzip.c
new file mode 100644
index 000000000..9bc491e70
--- /dev/null
+++ b/roms/skiboot/hw/nx-gzip.c
@@ -0,0 +1,118 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * NX GZIP (P9) accelerator support
+ *
+ * Copyright 2016-2017 IBM Corp.
+ */
+
+#include <skiboot.h>
+#include <chip.h>
+#include <xscom.h>
+#include <io.h>
+#include <cpu.h>
+#include <nx.h>
+
+#define EE (1) /* enable gzip engine */
+
+static int nx_cfg_gzip_umac(struct dt_node *node, u32 gcid, u32 pb_base)
+{
+ int rc;
+ u64 umac_bar, umac_notify;
+ struct dt_node *nx_node;
+ static u32 nxgzip_tid = 1; /* tid counter within coprocessor type */
+
+ nx_node = dt_new(node, "ibm,gzip-high-fifo");
+ umac_bar = pb_base + NX_P9_GZIP_HIGH_PRI_RX_FIFO_BAR;
+ umac_notify = pb_base + NX_P9_GZIP_HIGH_PRI_RX_FIFO_NOTIFY_MATCH;
+
+ rc = nx_cfg_rx_fifo(nx_node, "ibm,p9-nx-gzip", "High", gcid,
+ NX_CT_GZIP, nxgzip_tid++, umac_bar,
+ umac_notify);
+ if (rc)
+ return rc;
+
+ nx_node = dt_new(node, "ibm,gzip-normal-fifo");
+ umac_bar = pb_base + NX_P9_GZIP_NORMAL_PRI_RX_FIFO_BAR;
+ umac_notify = pb_base + NX_P9_GZIP_NORMAL_PRI_RX_FIFO_NOTIFY_MATCH;
+
+ rc = nx_cfg_rx_fifo(nx_node, "ibm,p9-nx-gzip", "Normal", gcid,
+ NX_CT_GZIP, nxgzip_tid++, umac_bar,
+ umac_notify);
+
+ return rc;
+}
+
+static int nx_cfg_gzip_dma(u32 gcid, u64 xcfg)
+{
+ u64 cfg;
+ int rc;
+
+ rc = xscom_read(gcid, xcfg, &cfg);
+ if (rc)
+ return rc;
+
+ cfg = SETFIELD(NX_DMA_CFG_GZIP_COMPRESS_PREFETCH, cfg,
+ DMA_COMPRESS_PREFETCH);
+ cfg = SETFIELD(NX_DMA_CFG_GZIP_DECOMPRESS_PREFETCH, cfg,
+ DMA_DECOMPRESS_PREFETCH);
+
+ cfg = SETFIELD(NX_DMA_CFG_GZIP_COMPRESS_MAX_RR, cfg,
+ DMA_COMPRESS_MAX_RR);
+ cfg = SETFIELD(NX_DMA_CFG_GZIP_DECOMPRESS_MAX_RR, cfg,
+ DMA_DECOMPRESS_MAX_RR);
+
+ rc = xscom_write(gcid, xcfg, cfg);
+ if (rc)
+ prerror("NX%d: ERROR: DMA config failure %d\n", gcid, rc);
+ else
+ prlog(PR_DEBUG, "NX%d: DMA 0x%016lx\n", gcid,
+ (unsigned long)cfg);
+
+ return rc;
+}
+
+static int nx_cfg_gzip_ee(u32 gcid, u64 xcfg)
+{
+ u64 cfg;
+ int rc;
+
+ rc = xscom_read(gcid, xcfg, &cfg);
+ if (rc)
+ return rc;
+
+ cfg = SETFIELD(NX_P9_EE_CFG_CH4, cfg, EE);
+
+ rc = xscom_write(gcid, xcfg, cfg);
+ if (rc)
+ prerror("NX%d: ERROR: Engine Enable failure %d\n", gcid, rc);
+ else
+ prlog(PR_DEBUG, "NX%d: Engine Enable 0x%016lx\n",
+ gcid, (unsigned long)cfg);
+
+ return rc;
+}
+
+void p9_nx_enable_gzip(struct dt_node *node, u32 gcid, u32 pb_base)
+{
+ u64 cfg_dma, cfg_ee;
+ int rc;
+
+ prlog(PR_INFO, "NX%d: gzip at 0x%x\n", gcid, pb_base);
+
+ cfg_dma = pb_base + NX_P9_DMA_CFG;
+ cfg_ee = pb_base + NX_P9_EE_CFG;
+
+ rc = nx_cfg_gzip_dma(gcid, cfg_dma);
+ if (rc)
+ return;
+
+ rc = nx_cfg_gzip_ee(gcid, cfg_ee);
+ if (rc)
+ return;
+
+ rc = nx_cfg_gzip_umac(node, gcid, pb_base);
+ if (rc)
+ return;
+
+ prlog(PR_INFO, "NX%d: gzip Coprocessor Enabled\n", gcid);
+}
diff --git a/roms/skiboot/hw/nx-rng.c b/roms/skiboot/hw/nx-rng.c
new file mode 100644
index 000000000..274b33211
--- /dev/null
+++ b/roms/skiboot/hw/nx-rng.c
@@ -0,0 +1,121 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * NX Hardware Random Number Generator
+ *
+ * Copyright 2013-2019 IBM Corp.
+ */
+
+#include <skiboot.h>
+#include <xscom.h>
+#include <io.h>
+#include <cpu.h>
+#include <nx.h>
+#include <chip.h>
+#include <phys-map.h>
+#include <xscom-p9-regs.h>
+
+/*
+ * On P9 the DARN instruction is used to access the HW RNG. There is still
+ * an NX RNG BAR, but it is used to configure which NX a core will source
+ * random numbers from rather than being a MMIO window.
+ */
+static void nx_init_p9_rng(uint32_t chip_id)
+{
+ uint64_t bar, tmp;
+
+ if (chip_quirk(QUIRK_NO_RNG))
+ return;
+
+ phys_map_get(chip_id, NX_RNG, 0, &bar, NULL);
+ xscom_write(chip_id, P9X_NX_MMIO_BAR, bar | P9X_NX_MMIO_BAR_EN);
+
+ /* Read config register for pace info */
+ xscom_read(chip_id, P9X_NX_RNG_CFG, &tmp);
+ prlog(PR_INFO, "NX RNG[%x] pace:%lli\n", chip_id, 0xffff & (tmp >> 2));
+}
+
+void nx_create_rng_node(struct dt_node *node)
+{
+ u64 bar, cfg;
+ u64 xbar, xcfg;
+ u32 pb_base;
+ u32 gcid;
+ u64 rng_addr, rng_len, len, addr_mask;
+ struct dt_node *rng;
+ int rc;
+
+ gcid = dt_get_chip_id(node);
+ pb_base = dt_get_address(node, 0, NULL);
+
+ if (dt_node_is_compatible(node, "ibm,power8-nx")) {
+ xbar = pb_base + NX_P8_RNG_BAR;
+ xcfg = pb_base + NX_P8_RNG_CFG;
+ addr_mask = NX_P8_RNG_BAR_ADDR;
+ } else if (dt_node_is_compatible(node, "ibm,power9-nx")) {
+ nx_init_p9_rng(gcid);
+ return;
+ } else {
+ prerror("NX%d: Unknown NX type!\n", gcid);
+ return;
+ }
+
+ rc = xscom_read(gcid, xbar, &bar); /* Get RNG BAR */
+ if (rc) {
+ prerror("NX%d: ERROR: XSCOM RNG BAR read failure %d\n",
+ gcid, rc);
+ return;
+ }
+
+ rc = xscom_read(gcid, xcfg, &cfg); /* Get RNG CFG */
+ if (rc) {
+ prerror("NX%d: ERROR: XSCOM RNG config read failure %d\n",
+ gcid, rc);
+ return;
+ }
+
+ /*
+ * We mask in-place rather than using GETFIELD for the base address
+ * as we happen to *know* that it's properly aligned in the register.
+ *
+	 * FIXME? Always assume BAR gets a valid address from FSP
+ */
+ rng_addr = bar & addr_mask;
+ len = GETFIELD(NX_RNG_BAR_SIZE, bar);
+ if (len > 4) {
+ prerror("NX%d: Corrupted bar size %lld\n", gcid, len);
+ return;
+ }
+ rng_len = (u64[]){ 0x1000, /* 4K */
+ 0x10000, /* 64K */
+ 0x400000000UL, /* 16G*/
+ 0x100000, /* 1M */
+ 0x1000000 /* 16M */} [len];
+
+
+ prlog(PR_INFO, "NX%d: RNG BAR set to 0x%016llx..0x%016llx\n",
+ gcid, rng_addr, rng_addr + rng_len - 1);
+
+ /* RNG must be enabled before MMIO is enabled */
+ rc = xscom_write(gcid, xcfg, cfg | NX_RNG_CFG_ENABLE);
+ if (rc) {
+ prerror("NX%d: ERROR: XSCOM RNG config enable failure %d\n",
+ gcid, rc);
+ return;
+ }
+
+ /* The BAR needs to be enabled too */
+ rc = xscom_write(gcid, xbar, bar | NX_RNG_BAR_ENABLE);
+ if (rc) {
+ prerror("NX%d: ERROR: XSCOM RNG config enable failure %d\n",
+ gcid, rc);
+ return;
+ }
+
+ rng = dt_new_addr(dt_root, "hwrng", rng_addr);
+ if (!rng)
+ return;
+
+ dt_add_property_strings(rng, "compatible", "ibm,power-rng");
+ dt_add_property_u64s(rng, "reg", rng_addr, rng_len);
+ dt_add_property_cells(rng, "ibm,chip-id", gcid);
+}
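For example, a size field of 1 read back from the RNG BAR selects 0x10000 above, i.e. a 64 KB MMIO window starting at rng_addr. A compact, illustrative restatement of that decode (rng_window_sizes/rng_window_len are hypothetical helpers, not part of the patch):

    #include <stdint.h>

    static const uint64_t rng_window_sizes[] = {
            0x1000,          /* 0: 4K  */
            0x10000,         /* 1: 64K */
            0x400000000UL,   /* 2: 16G */
            0x100000,        /* 3: 1M  */
            0x1000000,       /* 4: 16M */
    };

    /* Returns 0 for an out-of-range field, mirroring the len > 4 check. */
    static inline uint64_t rng_window_len(uint64_t size_field)
    {
            return size_field > 4 ? 0 : rng_window_sizes[size_field];
    }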
diff --git a/roms/skiboot/hw/nx.c b/roms/skiboot/hw/nx.c
new file mode 100644
index 000000000..fdadf53c7
--- /dev/null
+++ b/roms/skiboot/hw/nx.c
@@ -0,0 +1,138 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * NX Accelerator unit support
+ *
+ * Copyright 2013-2019 IBM Corp.
+ */
+
+#include <skiboot.h>
+#include <xscom.h>
+#include <io.h>
+#include <cpu.h>
+#include <nx.h>
+#include <chip.h>
+#include <xscom-p9-regs.h>
+#include <xscom-p10-regs.h>
+#include <phys-map.h>
+#include <vas.h>
+#include <p9_stop_api.H>
+
+static void darn_init(void)
+{
+ struct dt_node *nx;
+ struct proc_chip *chip;
+ struct cpu_thread *c;
+ uint64_t bar, default_bar;
+
+ if (chip_quirk(QUIRK_NO_RNG))
+ return;
+
+ /*
+ * To allow the DARN instruction to function there must be at least
+ * one NX available in the system. Otherwise using DARN will result
+ * in a checkstop. I suppose we could mask the FIR...
+ */
+ dt_for_each_compatible(dt_root, nx, "ibm,power9-nx")
+ break;
+ assert(nx);
+
+ phys_map_get(dt_get_chip_id(nx), NX_RNG, 0, &default_bar, NULL);
+
+ for_each_chip(chip) {
+ /* is this NX enabled? */
+ xscom_read(chip->id, P9X_NX_MMIO_BAR, &bar);
+ if (!(bar & ~P9X_NX_MMIO_BAR_EN))
+ bar = default_bar;
+
+ for_each_available_core_in_chip(c, chip->id) {
+ uint64_t addr;
+
+ if (proc_gen == proc_gen_p9) {
+ addr = XSCOM_ADDR_P9_EX(pir_to_core_id(c->pir),
+ P9X_EX_NCU_DARN_BAR);
+ xscom_write(chip->id, addr,
+ bar | P9X_EX_NCU_DARN_BAR_EN);
+ } else if (proc_gen >= proc_gen_p10) {
+ addr = XSCOM_ADDR_P10_NCU(pir_to_core_id(c->pir),
+ P10_NCU_DARN_BAR);
+ xscom_write(chip->id, addr,
+ bar | P10_NCU_DARN_BAR_EN);
+ /* Init for sibling core also */
+ if (c->is_fused_core) {
+ addr = XSCOM_ADDR_P10_NCU(pir_to_core_id(c->pir + 1),
+ P10_NCU_DARN_BAR);
+ xscom_write(chip->id, addr,
+ bar | P10_NCU_DARN_BAR_EN);
+ }
+ }
+ }
+ }
+}
+
+void nx_p9_rng_late_init(void)
+{
+ struct cpu_thread *c;
+ uint64_t rc;
+
+ if (proc_gen < proc_gen_p9)
+ return;
+ if (chip_quirk(QUIRK_NO_RNG))
+ return;
+
+ prlog(PR_INFO, "SLW: Configuring self-restore for P9X_EX_NCU_DARN_BAR\n");
+ for_each_present_cpu(c) {
+ if(cpu_is_thread0(c)) {
+ struct proc_chip *chip = get_chip(c->chip_id);
+ uint64_t addr, bar;
+
+ phys_map_get(chip->id, NX_RNG, 0, &bar, NULL);
+ addr = XSCOM_ADDR_P9_EX(pir_to_core_id(c->pir),
+ P9X_EX_NCU_DARN_BAR);
+ /* Bail out if wakeup engine has already failed */
+ if ( wakeup_engine_state != WAKEUP_ENGINE_PRESENT) {
+ prlog(PR_ERR,"DARN BAR p9_stop_api fail detected\n");
+ break;
+ }
+ rc = p9_stop_save_scom((void *)chip->homer_base,
+ addr, bar | P9X_EX_NCU_DARN_BAR_EN,
+ P9_STOP_SCOM_REPLACE,
+ P9_STOP_SECTION_EQ_SCOM);
+ if (rc) {
+ prlog(PR_ERR,
+ "p9_stop_api for DARN_BAR failed rc= %lld",
+ rc);
+ prlog(PR_ERR, "Disabling deep stop states\n");
+ wakeup_engine_state = WAKEUP_ENGINE_FAILED;
+ break;
+ }
+ }
+ }
+}
+
+static void nx_init_one(struct dt_node *node)
+{
+ nx_create_rng_node(node);
+
+ if (!vas_nx_enabled())
+ return;
+
+ nx_create_crypto_node(node);
+
+ nx_create_compress_node(node);
+}
+
+void nx_init(void)
+{
+ struct dt_node *node;
+
+ dt_for_each_compatible(dt_root, node, "ibm,power-nx") {
+ nx_init_one(node);
+ }
+
+ dt_for_each_compatible(dt_root, node, "ibm,power9-nx") {
+ nx_init_one(node);
+ }
+
+ if (proc_gen >= proc_gen_p9)
+ darn_init();
+}
diff --git a/roms/skiboot/hw/occ-sensor.c b/roms/skiboot/hw/occ-sensor.c
new file mode 100644
index 000000000..6efaf908b
--- /dev/null
+++ b/roms/skiboot/hw/occ-sensor.c
@@ -0,0 +1,640 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * OCC (On Chip Controller) exports a bunch of sensors
+ *
+ * Copyright 2017-2019 IBM Corp.
+ */
+
+#include <skiboot.h>
+#include <opal.h>
+#include <chip.h>
+#include <sensor.h>
+#include <device.h>
+#include <cpu.h>
+#include <occ.h>
+
+enum sensor_attr {
+ SENSOR_SAMPLE,
+ SENSOR_SAMPLE_MIN, /* OCC's min/max */
+ SENSOR_SAMPLE_MAX,
+ SENSOR_CSM_MIN, /* CSM's min/max */
+ SENSOR_CSM_MAX,
+ SENSOR_ACCUMULATOR,
+ MAX_SENSOR_ATTR,
+};
+
+#define HWMON_SENSORS_MASK (OCC_SENSOR_TYPE_CURRENT | \
+ OCC_SENSOR_TYPE_VOLTAGE | \
+ OCC_SENSOR_TYPE_TEMPERATURE | \
+ OCC_SENSOR_TYPE_POWER)
+
+/*
+ * Standard HWMON linux interface expects the below units for the
+ * environment sensors:
+ * - Current : milliampere
+ * - Voltage : millivolt
+ * - Temperature : millidegree Celsius (scaled in kernel)
+ * - Power : microWatt (scaled in kernel)
+ * - Energy : microJoule
+ */
+
+/*
+ * OCC sensor units are obtained after scaling the sensor values.
+ * https://github.com/open-power/occ/blob/master/src/occ_405/sensor/sensor_info.c
+ */
+
+static struct str_map {
+ const char *occ_str;
+ const char *opal_str;
+} str_maps[] = {
+ {"PWRSYS", "System"},
+ /* Bulk power of the system: Watt */
+ {"PWRFAN", "Fan"},
+ /* Power consumption of the system fans: Watt */
+ {"PWRIO", "IO"},
+ /* Power consumption of the IO subsystem: Watt */
+ {"PWRSTORE", "Storage"},
+	/* Power consumption of the storage subsystem: Watt */
+ {"PWRGPU", "GPU"},
+ /* Power consumption for GPUs per socket read from APSS: Watt */
+ {"PWRAPSSCH", "APSS"},
+ /* Power Provided by APSS channel x (where x=0…15): Watt */
+ {"PWRPROC", ""},
+ /* Power consumption for this Processor: Watt */
+ {"PWRVDD", "Vdd"},
+ /* Power consumption for this Processor's Vdd(AVSBus readings): Watt */
+ {"PWRVDN", "Vdn"},
+	/* Power consumption for this Processor's Vdn (nest)
+ * Calculated from AVSBus readings: Watt */
+ {"PWRMEM", "Memory"},
+	/* Power consumption for Memory for this Processor read from APSS:
+ * Watt */
+ {"CURVDD", "Vdd"},
+ /* Processor Vdd Current (read from AVSBus): Ampere */
+ {"CURVDN", "Vdn"},
+ /* Processor Vdn Current (read from AVSBus): Ampere */
+ {"VOLTVDDSENSE", "Vdd Remote Sense"},
+ /* Vdd Voltage at the remote sense.
+ * AVS reading adjusted for loadline: millivolt */
+ {"VOLTVDNSENSE", "Vdn Remote Sense"},
+ /* Vdn Voltage at the remote sense.
+ * AVS reading adjusted for loadline: millivolt */
+ {"VOLTVDD", "Vdd"},
+ /* Processor Vdd Voltage (read from AVSBus): millivolt */
+ {"VOLTVDN", "Vdn"},
+ /* Processor Vdn Voltage (read from AVSBus): millivolt */
+ {"TEMPC", "Core"},
+ /* Average temperature of core DTS sensors for Processor's Core y:
+ * Celsius */
+ {"TEMPQ", "Quad"},
+ /* Average temperature of quad (in cache) DTS sensors for
+ * Processor’s Quad y: Celsius */
+ {"TEMPNEST", "Nest"},
+ /* Average temperature of nest DTS sensors: Celsius */
+ {"TEMPPROCTHRMC", "Core"},
+ /* The combined weighted core/quad temperature for processor core y:
+ * Celsius */
+ {"TEMPDIMM", "DIMM"},
+ /* DIMM temperature for DIMM x: Celsius */
+ {"TEMPGPU", "GPU"},
+ /* GPU x (0..2) board temperature: Celsius */
+ /* TEMPGPUxMEM: GPU x hottest HBM temperature (individual memory
+ * temperatures are not available): Celsius */
+ {"TEMPVDD", "VRM VDD"},
+ /* VRM Vdd temperature: Celsius */
+};
+
+static u64 occ_sensor_base;
+
+static inline
+struct occ_sensor_data_header *get_sensor_header_block(int occ_num)
+{
+ return (struct occ_sensor_data_header *)
+ (occ_sensor_base + occ_num * OCC_SENSOR_DATA_BLOCK_SIZE);
+}
+
+static inline
+struct occ_sensor_name *get_names_block(struct occ_sensor_data_header *hb)
+{
+ return ((struct occ_sensor_name *)((u64)hb + be32_to_cpu(hb->names_offset)));
+}
+
+static inline u32 sensor_handler(int occ_num, int sensor_id, int attr)
+{
+ return sensor_make_handler(SENSOR_OCC, occ_num, sensor_id, attr);
+}
+
+/*
+ * The scaling factor for the sensors is encoded in the below format:
+ * (((UINT32)mantissa << 8) | (UINT32)((UINT8) 256 + (UINT8)exp))
+ * https://github.com/open-power/occ/blob/master/src/occ_405/sensor/sensor.h
+ */
+static void scale_sensor(struct occ_sensor_name *md, u64 *sensor)
+{
+ u32 factor = be32_to_cpu(md->scale_factor);
+ int i;
+ s8 exp;
+
+ if (be16_to_cpu(md->type) == OCC_SENSOR_TYPE_CURRENT)
+ *sensor *= 1000; //convert to mA
+
+ *sensor *= factor >> 8;
+ exp = factor & 0xFF;
+
+ if (exp > 0) {
+ for (i = labs(exp); i > 0; i--)
+ *sensor *= 10;
+ } else {
+ for (i = labs(exp); i > 0; i--)
+ *sensor /= 10;
+ }
+}
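A worked example of the scale-factor encoding described above, with values chosen purely for illustration: mantissa = 1 and exponent = -3 encode as (1 << 8) | (uint8_t)(256 - 3) = 0x1FD, so scale_sensor() multiplies the raw sample by 1 (factor >> 8) and, because the low byte reads back as -3, divides it by 10^3 (scale_example() is a hypothetical stand-alone sketch):

    #include <stdint.h>

    static uint64_t scale_example(void)
    {
            uint32_t factor = (1u << 8) | (uint8_t)(256 - 3); /* 0x1FD */
            int8_t exp = factor & 0xFF;                       /* -3    */
            uint64_t sample = 12345 * (factor >> 8);          /* x 1   */

            /* exp < 0, so divide by 10^|exp|: 12345 / 1000 = 12 */
            return exp < 0 ? sample / 1000 : sample;
    }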
+
+static void scale_energy(struct occ_sensor_name *md, u64 *sensor)
+{
+ u32 factor = be32_to_cpu(md->freq);
+ int i;
+ s8 exp;
+
+ *sensor *= 1000000; //convert to uJ
+
+ *sensor /= factor >> 8;
+ exp = factor & 0xFF;
+
+ if (exp > 0) {
+ for (i = labs(exp); i > 0; i--)
+ *sensor /= 10;
+ } else {
+ for (i = labs(exp); i > 0; i--)
+ *sensor *= 10;
+ }
+}
+
+static u64 read_sensor(struct occ_sensor_record *sensor, int attr)
+{
+ switch (attr) {
+ case SENSOR_SAMPLE:
+ return be16_to_cpu(sensor->sample);
+ case SENSOR_SAMPLE_MIN:
+ return be16_to_cpu(sensor->sample_min);
+ case SENSOR_SAMPLE_MAX:
+ return be16_to_cpu(sensor->sample_max);
+ case SENSOR_CSM_MIN:
+ return be16_to_cpu(sensor->csm_min);
+ case SENSOR_CSM_MAX:
+ return be16_to_cpu(sensor->csm_max);
+ case SENSOR_ACCUMULATOR:
+ return be64_to_cpu(sensor->accumulator);
+ default:
+ break;
+ }
+
+ return 0;
+}
+
+static void *select_sensor_buffer(struct occ_sensor_data_header *hb, int id)
+{
+ struct occ_sensor_name *md;
+ u8 *ping, *pong;
+ void *buffer = NULL;
+ u32 reading_offset;
+
+ if (!hb)
+ return NULL;
+
+ md = get_names_block(hb);
+
+ ping = (u8 *)((u64)hb + be32_to_cpu(hb->reading_ping_offset));
+ pong = (u8 *)((u64)hb + be32_to_cpu(hb->reading_pong_offset));
+ reading_offset = be32_to_cpu(md[id].reading_offset);
+
+ /* Check which buffer is valid and read the data from that.
+ * Ping Pong Action
+ * 0 0 Return with error
+ * 0 1 Read Pong
+ * 1 0 Read Ping
+ * 1 1 Read the buffer with latest timestamp
+ */
+
+ if (*ping && *pong) {
+ u64 tping, tpong;
+ u64 ping_buf = (u64)ping + reading_offset;
+ u64 pong_buf = (u64)pong + reading_offset;
+
+ tping = be64_to_cpu(((struct occ_sensor_record *)ping_buf)->timestamp);
+ tpong = be64_to_cpu(((struct occ_sensor_record *)pong_buf)->timestamp);
+
+ if (tping > tpong)
+ buffer = ping;
+ else
+ buffer = pong;
+ } else if (*ping && !*pong) {
+ buffer = ping;
+ } else if (!*ping && *pong) {
+ buffer = pong;
+ } else if (!*ping && !*pong) {
+ prlog(PR_DEBUG, "OCC: Both ping and pong sensor buffers are invalid\n");
+ return NULL;
+ }
+
+ assert(buffer);
+ buffer = (void *)((u64)buffer + reading_offset);
+
+ return buffer;
+}
+
+int occ_sensor_read(u32 handle, __be64 *data)
+{
+ struct occ_sensor_data_header *hb;
+ struct occ_sensor_name *md;
+ u16 id = sensor_get_rid(handle);
+ u8 occ_num = sensor_get_frc(handle);
+ u8 attr = sensor_get_attr(handle);
+ u64 d;
+ void *buff;
+
+ if (occ_num > MAX_OCCS)
+ return OPAL_PARAMETER;
+
+ if (attr > MAX_SENSOR_ATTR)
+ return OPAL_PARAMETER;
+
+ if (is_occ_reset())
+ return OPAL_HARDWARE;
+
+ hb = get_sensor_header_block(occ_num);
+
+ if (hb->valid != 1)
+ return OPAL_HARDWARE;
+
+ if (id > be16_to_cpu(hb->nr_sensors))
+ return OPAL_PARAMETER;
+
+ buff = select_sensor_buffer(hb, id);
+ if (!buff)
+ return OPAL_HARDWARE;
+
+ d = read_sensor(buff, attr);
+ if (!d)
+ goto out_success;
+
+ md = get_names_block(hb);
+ if (be16_to_cpu(md[id].type) == OCC_SENSOR_TYPE_POWER && attr == SENSOR_ACCUMULATOR)
+ scale_energy(&md[id], &d);
+ else
+ scale_sensor(&md[id], &d);
+
+out_success:
+ *data = cpu_to_be64(d);
+
+ return OPAL_SUCCESS;
+}
+
+static bool occ_sensor_sanity(struct occ_sensor_data_header *hb, int chipid)
+{
+ if (hb->valid != 0x01) {
+ prerror("OCC: Chip %d sensor data invalid\n", chipid);
+ return false;
+ }
+
+ if (hb->version != 0x01) {
+ prerror("OCC: Chip %d unsupported sensor header block version %d\n",
+ chipid, hb->version);
+ return false;
+ }
+
+ if (hb->reading_version != 0x01) {
+ prerror("OCC: Chip %d unsupported sensor record format %d\n",
+ chipid, hb->reading_version);
+ return false;
+ }
+
+ if (hb->names_version != 0x01) {
+ prerror("OCC: Chip %d unsupported sensor names format %d\n",
+ chipid, hb->names_version);
+ return false;
+ }
+
+ if (hb->name_length != sizeof(struct occ_sensor_name)) {
+ prerror("OCC: Chip %d unsupported sensor names length %d\n",
+ chipid, hb->name_length);
+ return false;
+ }
+
+ if (!hb->nr_sensors) {
+ prerror("OCC: Chip %d has no sensors\n", chipid);
+ return false;
+ }
+
+ if (!hb->names_offset ||
+ !hb->reading_ping_offset ||
+ !hb->reading_pong_offset) {
+ prerror("OCC: Chip %d Invalid sensor buffer pointers\n",
+ chipid);
+ return false;
+ }
+
+ return true;
+}
+
+/*
+ * parse_entity: Parses OCC sensor name to return the entity number like
+ * chipid, core-id, dimm-no, gpu-no. 'end' is used to
+ * get the subentity strings. Returns -1 if no number is found.
+ *		TEMPC4 --> returns 4, end points to "" (nothing after the digits)
+ *		TEMPGPU2DRAM1 --> returns 2, end = "DRAM1"
+ *		PWRSYS --> returns -1, end is left unmodified
+ */
+static int parse_entity(const char *name, char **end)
+{
+ while (*name != '\0') {
+ if (isdigit(*name))
+ break;
+ name++;
+ }
+
+ if (*name)
+ return strtol(name, end, 10);
+ else
+ return -1;
+}
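Call-pattern sketch for the helper above (accurate to the code as written: when no digit is found the function returns -1 without touching *end):

    char *end = NULL;
    int n;

    n = parse_entity("TEMPGPU2DRAM1", &end); /* n == 2,  end -> "DRAM1"         */
    n = parse_entity("TEMPC4", &end);        /* n == 4,  end -> "" (string end) */
    n = parse_entity("PWRSYS", &end);        /* n == -1, end left unchanged     */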
+
+static void add_sensor_label(struct dt_node *node, struct occ_sensor_name *md,
+ int chipid)
+{
+ char sname[30] = "";
+ char prefix[30] = "";
+ uint16_t location = be16_to_cpu(md->location);
+ int i;
+
+ if (location != OCC_SENSOR_LOC_SYSTEM)
+ snprintf(prefix, sizeof(prefix), "%s %d ", "Chip", chipid);
+
+ for (i = 0; i < ARRAY_SIZE(str_maps); i++)
+ if (!strncmp(str_maps[i].occ_str, md->name,
+ strlen(str_maps[i].occ_str))) {
+ char *end;
+ int num = -1;
+
+ if (location != OCC_SENSOR_LOC_CORE)
+ num = parse_entity(md->name, &end);
+
+ if (num != -1) {
+ snprintf(sname, sizeof(sname), "%s%s %d %s",
+ prefix, str_maps[i].opal_str, num,
+ end);
+ } else {
+ snprintf(sname, sizeof(sname), "%s%s", prefix,
+ str_maps[i].opal_str);
+ }
+ dt_add_property_string(node, "label", sname);
+ return;
+ }
+
+ /* Fallback to OCC literal if mapping is not found */
+ if (location == OCC_SENSOR_LOC_SYSTEM) {
+ dt_add_property_string(node, "label", md->name);
+ } else {
+ snprintf(sname, sizeof(sname), "%s%s", prefix, md->name);
+ dt_add_property_string(node, "label", sname);
+ }
+}
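Illustrative label outcomes for chipid 0, following the str_maps table and the core-location special case above (sensor names taken from the comments in str_maps):

    /*  "TEMPC4"    (core location)   -> "Chip 0 Core" (the core id is
     *                                    exported via ibm,pir instead)
     *  "TEMPDIMM3" (memory location) -> "Chip 0 DIMM 3" (plus a trailing
     *                                    space from the empty sub-entity)
     *  "PWRSYS"    (system location) -> "System"
     */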
+
+static const char *get_sensor_type_string(enum occ_sensor_type type)
+{
+ switch (type) {
+ case OCC_SENSOR_TYPE_POWER:
+ return "power";
+ case OCC_SENSOR_TYPE_TEMPERATURE:
+ return "temp";
+ case OCC_SENSOR_TYPE_CURRENT:
+ return "curr";
+ case OCC_SENSOR_TYPE_VOLTAGE:
+ return "in";
+ default:
+ break;
+ }
+
+ return "unknown";
+}
+
+static const char *get_sensor_loc_string(enum occ_sensor_location loc)
+{
+ switch (loc) {
+ case OCC_SENSOR_LOC_SYSTEM:
+ return "sys";
+ case OCC_SENSOR_LOC_PROCESSOR:
+ return "proc";
+ case OCC_SENSOR_LOC_MEMORY:
+ return "mem";
+ case OCC_SENSOR_LOC_VRM:
+ return "vrm";
+ case OCC_SENSOR_LOC_CORE:
+ return "core";
+ case OCC_SENSOR_LOC_QUAD:
+ return "quad";
+ case OCC_SENSOR_LOC_GPU:
+ return "gpu";
+ default:
+ break;
+ }
+
+ return "unknown";
+}
+
+/*
+ * Power sensors can be zero-valued on a few platforms, such as Zaius
+ * and Romulus, which do not have an APSS. At the moment there is no
+ * HDAT/DT property to indicate whether an APSS is present, so for now
+ * skip zero-valued power sensors.
+ */
+static bool check_sensor_sample(struct occ_sensor_data_header *hb, u32 offset)
+{
+ struct occ_sensor_record *ping, *pong;
+
+ ping = (struct occ_sensor_record *)((u64)hb
+ + be32_to_cpu(hb->reading_ping_offset) + offset);
+ pong = (struct occ_sensor_record *)((u64)hb
+ + be32_to_cpu(hb->reading_pong_offset) + offset);
+ return ping->sample || pong->sample;
+}
+
+static void add_sensor_node(const char *loc, const char *type, int i, int attr,
+ struct occ_sensor_name *md, __be32 *phandle, u32 *ptype,
+ u32 pir, u32 occ_num, u32 chipid)
+{
+ char name[30];
+ struct dt_node *node;
+ u32 handler;
+
+ snprintf(name, sizeof(name), "%s-%s", loc, type);
+ handler = sensor_handler(occ_num, i, attr);
+ node = dt_new_addr(sensor_node, name, handler);
+ dt_add_property_string(node, "sensor-type", type);
+ dt_add_property_cells(node, "sensor-data", handler);
+ dt_add_property_cells(node, "reg", handler);
+ dt_add_property_string(node, "occ_label", md->name);
+ add_sensor_label(node, md, chipid);
+
+ if (be16_to_cpu(md->location) == OCC_SENSOR_LOC_CORE)
+ dt_add_property_cells(node, "ibm,pir", pir);
+
+ *ptype = be16_to_cpu(md->type);
+
+ if (attr == SENSOR_SAMPLE) {
+ handler = sensor_handler(occ_num, i, SENSOR_CSM_MAX);
+ dt_add_property_cells(node, "sensor-data-max", handler);
+
+ handler = sensor_handler(occ_num, i, SENSOR_CSM_MIN);
+ dt_add_property_cells(node, "sensor-data-min", handler);
+ }
+
+ dt_add_property_string(node, "compatible", "ibm,opal-sensor");
+ *phandle = cpu_to_be32(node->phandle);
+}
+
+bool occ_sensors_init(void)
+{
+ struct proc_chip *chip;
+ struct dt_node *sg, *exports;
+ int occ_num = 0, i;
+ bool has_gpu = false;
+
+	/* OCC inband sensors are only supported on P9/P10 */
+ if (proc_gen < proc_gen_p9)
+ return false;
+
+ /* Sensors are copied to BAR2 OCC Common Area */
+ chip = next_chip(NULL);
+ if (!chip->occ_common_base) {
+ prerror("OCC: Unassigned OCC Common Area. No sensors found\n");
+ return false;
+ }
+
+ occ_sensor_base = chip->occ_common_base + OCC_SENSOR_DATA_BLOCK_OFFSET;
+
+ sg = dt_new(opal_node, "sensor-groups");
+ if (!sg) {
+ prerror("OCC: Failed to create sensor groups node\n");
+ return false;
+ }
+ dt_add_property_string(sg, "compatible", "ibm,opal-sensor-group");
+ dt_add_property_cells(sg, "#address-cells", 1);
+ dt_add_property_cells(sg, "#size-cells", 0);
+
+ /*
+ * On POWER9, ibm,ioda2-npu2-phb indicates the presence of a
+ * GPU NVlink.
+ */
+ if (dt_find_compatible_node(dt_root, NULL, "ibm,ioda2-npu2-phb")) {
+
+ for_each_chip(chip) {
+ int max_gpus_per_chip = 3, i;
+
+ for(i = 0; i < max_gpus_per_chip; i++) {
+ has_gpu = occ_get_gpu_presence(chip, i);
+
+ if (has_gpu)
+ break;
+ }
+
+ if (has_gpu)
+ break;
+ }
+ }
+
+ for_each_chip(chip) {
+ struct occ_sensor_data_header *hb;
+ struct occ_sensor_name *md;
+ __be32 *phandles;
+ u32 *ptype, phcount = 0;
+ unsigned int nr_sensors;
+
+ hb = get_sensor_header_block(occ_num);
+ md = get_names_block(hb);
+
+ /* Sanity check of the Sensor Data Header Block */
+ if (!occ_sensor_sanity(hb, chip->id))
+ continue;
+
+ nr_sensors = be16_to_cpu(hb->nr_sensors);
+
+ phandles = malloc(nr_sensors * sizeof(__be32));
+ assert(phandles);
+ ptype = malloc(nr_sensors * sizeof(u32));
+ assert(ptype);
+
+ for (i = 0; i < nr_sensors; i++) {
+ const char *type_name, *loc;
+ struct cpu_thread *c = NULL;
+ uint32_t pir = 0;
+ uint16_t type = be16_to_cpu(md[i].type);
+ uint16_t location = be16_to_cpu(md[i].location);
+
+ if (md[i].structure_type != OCC_SENSOR_READING_FULL)
+ continue;
+
+ if (!(type & HWMON_SENSORS_MASK))
+ continue;
+
+ if (location == OCC_SENSOR_LOC_GPU && !has_gpu)
+ continue;
+
+ if (type == OCC_SENSOR_TYPE_POWER &&
+ !check_sensor_sample(hb, be32_to_cpu(md[i].reading_offset)))
+ continue;
+
+ if (location == OCC_SENSOR_LOC_CORE) {
+ int num = parse_entity(md[i].name, NULL);
+
+ for_each_available_core_in_chip(c, chip->id)
+ if (pir_to_core_id(c->pir) == num)
+ break;
+ if (!c)
+ continue;
+ pir = c->pir;
+ }
+
+ type_name = get_sensor_type_string(type);
+ loc = get_sensor_loc_string(location);
+
+ add_sensor_node(loc, type_name, i, SENSOR_SAMPLE, &md[i],
+ &phandles[phcount], &ptype[phcount],
+ pir, occ_num, chip->id);
+ phcount++;
+
+ /* Add energy sensors */
+ if (type == OCC_SENSOR_TYPE_POWER &&
+ md[i].structure_type == OCC_SENSOR_READING_FULL) {
+ add_sensor_node(loc, "energy", i,
+ SENSOR_ACCUMULATOR, &md[i],
+ &phandles[phcount], &ptype[phcount],
+ pir, occ_num, chip->id);
+ phcount++;
+ }
+
+ }
+ occ_num++;
+ occ_add_sensor_groups(sg, phandles, ptype, phcount, chip->id);
+ free(phandles);
+ free(ptype);
+ }
+	/* Delete the sensor-groups node if no sensors were added */
+ if (list_empty(&sg->children)) {
+ dt_free(sg);
+ }
+
+ if (!occ_num)
+ return false;
+
+ exports = dt_find_by_path(dt_root, "/ibm,opal/firmware/exports");
+ if (!exports) {
+ prerror("OCC: dt node /ibm,opal/firmware/exports not found\n");
+ return false;
+ }
+
+ dt_add_property_u64s(exports, "occ_inband_sensors", occ_sensor_base,
+ OCC_SENSOR_DATA_BLOCK_SIZE * occ_num);
+
+ return true;
+}
diff --git a/roms/skiboot/hw/occ.c b/roms/skiboot/hw/occ.c
new file mode 100644
index 000000000..8d7bcbec9
--- /dev/null
+++ b/roms/skiboot/hw/occ.c
@@ -0,0 +1,2339 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Interface with the On Chip Controller,
+ * which enforces power and thermal management
+ *
+ * Copyright 2013-2019 IBM Corp.
+ */
+
+#include <skiboot.h>
+#include <xscom.h>
+#include <xscom-p8-regs.h>
+#include <io.h>
+#include <cpu.h>
+#include <chip.h>
+#include <mem_region.h>
+#include <timebase.h>
+#include <errorlog.h>
+#include <opal-api.h>
+#include <opal-msg.h>
+#include <timer.h>
+#include <i2c.h>
+#include <powercap.h>
+#include <psr.h>
+#include <sensor.h>
+#include <occ.h>
+#include <psi.h>
+
+/* OCC Communication Area for PStates */
+
+#define P8_HOMER_OPAL_DATA_OFFSET 0x1F8000
+#define P9_HOMER_OPAL_DATA_OFFSET 0x0E2000
+
+#define OPAL_DYNAMIC_DATA_OFFSET 0x0B80
+/* relative to HOMER_OPAL_DATA_OFFSET */
+
+#define MAX_PSTATES 256
+#define MAX_P8_CORES 12
+#define MAX_P9_CORES 24
+#define MAX_P10_CORES 32
+
+#define MAX_OPAL_CMD_DATA_LENGTH 4090
+#define MAX_OCC_RSP_DATA_LENGTH 8698
+
+#define P8_PIR_CORE_MASK 0xFFF8
+#define P9_PIR_QUAD_MASK 0xFFF0
+#define P10_PIR_CHIP_MASK 0x0000
+#define FREQ_MAX_IN_DOMAIN 0
+#define FREQ_MOST_RECENTLY_SET 1
+
+/**
+ * OCC-OPAL Shared Memory Region
+ *
+ * Reference document :
+ * https://github.com/open-power/docs/blob/master/occ/OCC_OpenPwr_FW_Interfaces.pdf
+ *
+ * Supported layout versions:
+ * - 0x01, 0x02 : P8
+ * https://github.com/open-power/occ/blob/master_p8/src/occ/proc/proc_pstate.h
+ *
+ * - 0x90 : P9
+ * https://github.com/open-power/occ/blob/master/src/occ_405/proc/proc_pstate.h
+ * In 0x90 the data is separated into :-
+ * -- Static Data (struct occ_pstate_table): Data is written once by OCC
+ * -- Dynamic Data (struct occ_dynamic_data): Data is updated at runtime
+ *
+ * struct occ_pstate_table - Pstate table layout
+ * @valid: Indicates if data is valid
+ * @version: Layout version [Major/Minor]
+ * @v2.throttle: Reason for limiting the max pstate
+ * @v9.occ_role: OCC role (Master/Slave)
+ * @v#.pstate_min: Minimum pstate ever allowed
+ * @v#.pstate_nom: Nominal pstate
+ * @v#.pstate_turbo: Maximum turbo pstate
+ * @v#.pstate_ultra_turbo: Maximum ultra turbo pstate and the maximum
+ * pstate ever allowed
+ * @v#.pstates: Pstate-id and frequency list from Pmax to Pmin
+ * @v#.pstates.id: Pstate-id
+ * @v#.pstates.flags: Pstate-flag(reserved)
+ * @v2.pstates.vdd: Voltage Identifier
+ * @v2.pstates.vcs: Voltage Identifier
+ * @v#.pstates.freq_khz: Frequency in KHz
+ * @v#.core_max[1..N]: Max pstate with N active cores
+ * @spare/reserved/pad: Unused data
+ */
+struct occ_pstate_table {
+ u8 valid;
+ u8 version;
+ union __packed {
+ struct __packed { /* Version 0x01 and 0x02 */
+ u8 throttle;
+ s8 pstate_min;
+ s8 pstate_nom;
+ s8 pstate_turbo;
+ s8 pstate_ultra_turbo;
+ u8 spare;
+ u64 reserved;
+ struct __packed {
+ s8 id;
+ u8 flags;
+ u8 vdd;
+ u8 vcs;
+ __be32 freq_khz;
+ } pstates[MAX_PSTATES];
+ s8 core_max[MAX_P8_CORES];
+ u8 pad[100];
+ } v2;
+ struct __packed { /* Version 0x90 */
+ u8 occ_role;
+ u8 pstate_min;
+ u8 pstate_nom;
+ u8 pstate_turbo;
+ u8 pstate_ultra_turbo;
+ u8 spare;
+ u64 reserved1;
+ u64 reserved2;
+ struct __packed {
+ u8 id;
+ u8 flags;
+ u16 reserved;
+ __be32 freq_khz;
+ } pstates[MAX_PSTATES];
+ u8 core_max[MAX_P9_CORES];
+ u8 pad[56];
+ } v9;
+ struct __packed { /* Version 0xA0 */
+ u8 occ_role;
+ u8 pstate_min;
+ u8 pstate_fixed_freq;
+ u8 pstate_base;
+ u8 pstate_ultra_turbo;
+ u8 pstate_fmax;
+ u8 minor;
+ u8 pstate_bottom_throttle;
+ u8 spare;
+ u8 spare1;
+ u32 reserved_32;
+ u64 reserved_64;
+ struct __packed {
+ u8 id;
+ u8 valid;
+ u16 reserved;
+ __be32 freq_khz;
+ } pstates[MAX_PSTATES];
+ u8 core_max[MAX_P10_CORES];
+ u8 pad[48];
+ } v10;
+ };
+} __packed;
+
+/**
+ * OPAL-OCC Command Response Interface
+ *
+ * OPAL-OCC Command Buffer
+ *
+ * ---------------------------------------------------------------------
+ * | OPAL | Cmd | OPAL | | Cmd Data | Cmd Data | OPAL |
+ * | Cmd | Request | OCC | Reserved | Length | Length | Cmd |
+ * | Flags | ID | Cmd | | (MSB) | (LSB) | Data... |
+ * ---------------------------------------------------------------------
+ * | ….OPAL Command Data up to max of Cmd Data Length 4090 bytes |
+ * | |
+ * ---------------------------------------------------------------------
+ *
+ * OPAL Command Flag
+ *
+ * -----------------------------------------------------------------
+ * | Bit 7 | Bit 6 | Bit 5 | Bit 4 | Bit 3 | Bit 2 | Bit 1 | Bit 0 |
+ * | (msb) | | | | | | | (lsb) |
+ * -----------------------------------------------------------------
+ * |Cmd | | | | | | | |
+ * |Ready | | | | | | | |
+ * -----------------------------------------------------------------
+ *
+ * struct opal_command_buffer - Defines the layout of OPAL command buffer
+ * @flag: Provides general status of the command
+ * @request_id: Token to identify request
+ * @cmd: Command sent
+ * @data_size: Command data length
+ * @data: Command specific data
+ * @spare: Unused byte
+ */
+struct opal_command_buffer {
+ u8 flag;
+ u8 request_id;
+ u8 cmd;
+ u8 spare;
+ u16 data_size;
+ u8 data[MAX_OPAL_CMD_DATA_LENGTH];
+} __packed;
+
+/**
+ * OPAL-OCC Response Buffer
+ *
+ * ---------------------------------------------------------------------
+ * | OCC | Cmd | OPAL | Response | Rsp Data | Rsp Data | OPAL |
+ * | Rsp | Request | OCC | Status | Length | Length | Rsp |
+ * | Flags | ID | Cmd | | (MSB) | (LSB) | Data... |
+ * ---------------------------------------------------------------------
+ * | ….OPAL Response Data up to max of Rsp Data Length 8698 bytes |
+ * | |
+ * ---------------------------------------------------------------------
+ *
+ * OCC Response Flag
+ *
+ * -----------------------------------------------------------------
+ * | Bit 7 | Bit 6 | Bit 5 | Bit 4 | Bit 3 | Bit 2 | Bit 1 | Bit 0 |
+ * | (msb) | | | | | | | (lsb) |
+ * -----------------------------------------------------------------
+ * | | | | | | |OCC in | Rsp |
+ * | | | | | | |progress|Ready |
+ * -----------------------------------------------------------------
+ *
+ * struct occ_response_buffer - Defines the layout of OCC response buffer
+ * @flag: Provides general status of the response
+ * @request_id: Token to identify request
+ * @cmd: Command requested
+ * @status: Indicates success/failure status of
+ * the command
+ * @data_size: Response data length
+ * @data: Response specific data
+ */
+struct occ_response_buffer {
+ u8 flag;
+ u8 request_id;
+ u8 cmd;
+ u8 status;
+ u16 data_size;
+ u8 data[MAX_OCC_RSP_DATA_LENGTH];
+} __packed;
+
+/**
+ * OCC-OPAL Shared Memory Interface Dynamic Data Vx90
+ *
+ * struct occ_dynamic_data - Contains runtime attributes
+ * @occ_state: Current state of OCC
+ * @major_version: Major version number
+ * @minor_version: Minor version number (backwards compatible)
+ * Version 1 indicates GPU presence populated
+ * @gpus_present: Bitmask of GPUs present (on systems where GPU
+ * presence is detected through APSS)
+ * @cpu_throttle: Reason for limiting the max pstate
+ * @mem_throttle: Reason for throttling memory
+ * @quick_pwr_drop: Indicates if QPD is asserted
+ * @pwr_shifting_ratio: Indicates the current percentage of power to
+ * take away from the CPU vs GPU when shifting
+ * power to maintain a power cap. Value of 100
+ * means take all power from CPU.
+ * @pwr_cap_type: Indicates type of power cap in effect
+ * @hard_min_pwr_cap: Hard minimum system power cap in Watts.
+ * Guaranteed unless hardware failure
+ * @max_pwr_cap: Maximum allowed system power cap in Watts
+ * @cur_pwr_cap: Current system power cap
+ * @soft_min_pwr_cap: Soft powercap minimum. OCC may or may not be
+ * able to maintain this
+ * @spare/reserved: Unused data
+ * @cmd: Opal Command Buffer
+ * @rsp: OCC Response Buffer
+ */
+struct occ_dynamic_data {
+ u8 occ_state;
+ u8 major_version;
+ u8 minor_version;
+ u8 gpus_present;
+ struct __packed { /* Version 0x90 */
+ u8 spare1;
+ } v9;
+ struct __packed { /* Version 0xA0 */
+ u8 wof_enabled;
+ } v10;
+ u8 cpu_throttle;
+ u8 mem_throttle;
+ u8 quick_pwr_drop;
+ u8 pwr_shifting_ratio;
+ u8 pwr_cap_type;
+ u16 hard_min_pwr_cap;
+ u16 max_pwr_cap;
+ u16 cur_pwr_cap;
+ u16 soft_min_pwr_cap;
+ u8 pad[110];
+ struct opal_command_buffer cmd;
+ struct occ_response_buffer rsp;
+} __packed;
+
+static bool occ_reset;
+static struct lock occ_lock = LOCK_UNLOCKED;
+static unsigned long homer_opal_data_offset;
+
+DEFINE_LOG_ENTRY(OPAL_RC_OCC_PSTATE_INIT, OPAL_PLATFORM_ERR_EVT, OPAL_OCC,
+ OPAL_CEC_HARDWARE, OPAL_INFO,
+ OPAL_NA);
+
+DEFINE_LOG_ENTRY(OPAL_RC_OCC_TIMEOUT, OPAL_PLATFORM_ERR_EVT, OPAL_OCC,
+ OPAL_CEC_HARDWARE, OPAL_UNRECOVERABLE_ERR_GENERAL,
+ OPAL_NA);
+
+/*
+ * POWER9 and newer platforms have pstate values which are unsigned
+ * positive values. They form a continuous set of unsigned integers
+ * [0 to +N] where Pmax is 0 and Pmin is N. The linear ordering of
+ * pstates for P9 has changed compared to P8, which has negative
+ * pstate values advertised as [0 to -N] where Pmax is 0 and
+ * Pmin is -N. The following routine helps to abstract pstate
+ * comparison with pmax and perform sanity checks on pstate limits.
+ */
+
+/**
+ * cmp_pstates: Compares the given two pstates and determines which
+ * among them is associated with a higher pstate.
+ *
+ * @a,@b: The pstate ids of the pstates being compared.
+ *
+ * Returns: -1 : If pstate associated with @a is smaller than
+ * the pstate associated with @b.
+ * 0 : If pstates associated with @a and @b are equal.
+ * 1 : If pstate associated with @a is greater than
+ * the pstate associated with @b.
+ */
+static int cmp_pstates(int a, int b)
+{
+ /* P8 has 0 to -N (pmax to pmin), P9 has 0 to +N (pmax to pmin) */
+ if (a > b)
+ return (proc_gen == proc_gen_p8)? 1 : -1;
+ else if (a < b)
+ return (proc_gen == proc_gen_p8)? -1 : 1;
+
+ return 0;
+}
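Two concrete cases of the ordering convention described in the comment block above:

    /* P8:     pstates run 0 (Pmax) down to -N (Pmin); -2 is a higher
     *         pstate than -5, and a > b, so cmp_pstates(-2, -5) == 1.
     * P9/P10: pstates run 0 (Pmax) up to +N (Pmin); 2 is a higher
     *         pstate than 5, and a < b, so cmp_pstates(2, 5) == 1.
     */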
+
+static inline
+struct occ_pstate_table *get_occ_pstate_table(struct proc_chip *chip)
+{
+ return (struct occ_pstate_table *)
+ (chip->homer_base + homer_opal_data_offset);
+}
+
+static inline
+struct occ_dynamic_data *get_occ_dynamic_data(struct proc_chip *chip)
+{
+ return (struct occ_dynamic_data *)
+ (chip->homer_base + homer_opal_data_offset +
+ OPAL_DYNAMIC_DATA_OFFSET);
+}
+
+/*
+ * On Chips which have at least one active EX unit, check the
+ * HOMER area for pstate-table valid bit on versions 0x1 and 0x2, or
+ * HOMER dynamic area occ_state on version 0x90.
+ */
+static bool wait_for_all_occ_init(void)
+{
+ struct proc_chip *chip;
+ struct dt_node *xn;
+ struct occ_pstate_table *occ_data;
+ struct occ_dynamic_data *occ_dyn_data;
+ int tries;
+ uint64_t start_time, end_time;
+ uint32_t timeout = 0;
+
+ if (platform.occ_timeout)
+ timeout = platform.occ_timeout();
+
+ start_time = mftb();
+ for_each_chip(chip) {
+ u8 version;
+
+ /*
+		 * If the chip doesn't have any EX units present, then the OCC
+ * will not update the pstate-table. So, skip the
+ * check.
+ */
+ if (!chip->ex_present) {
+ prlog(PR_DEBUG, "OCC: Chip %02x has no active EX units. Skipping check\n",
+ chip->id);
+ continue;
+ }
+
+ /* Check for valid homer address */
+ if (!chip->homer_base) {
+ /**
+ * @fwts-label OCCInvalidHomerBase
+ * @fwts-advice The HOMER base address for a chip
+ * was not valid. This means that OCC (On Chip
+ * Controller) will be non-functional and CPU
+ * frequency scaling will not be functional. CPU may
+ * be set to a safe, low frequency. Power savings in
+ * CPU idle or CPU hotplug may be impacted.
+ */
+ prlog(PR_ERR,"OCC: Chip: %x homer_base is not valid\n",
+ chip->id);
+ return false;
+ }
+
+ /* Get PState table address */
+ occ_data = get_occ_pstate_table(chip);
+
+ /*
+ * Wait for the OCC to set an appropriate version bit.
+		 * The wait is needed since on some platforms (such as P8
+ * Tuletta), OCC is not loaded before OPAL boot. Hence
+ * initialization can take a while.
+ *
+ * Note: Checking for occ_data->version == (0x01/0x02/0x90/0xA0)
+ * is ok because we clear all of
+ * homer_base+size before passing memory to host
+ * services. This ensures occ_data->version == 0x0
+ * before OCC load.
+ */
+ tries = timeout * 10;
+ while (tries--) {
+ version = occ_data->version;
+
+ if (version == 0x01 || version == 0x02 ||
+ version == 0x90 || version == 0xA0)
+ break;
+
+ time_wait_ms(100);
+ }
+
+ version = occ_data->version;
+ switch (version) {
+ case 0x1:
+ case 0x2:
+ /*
+ * OCC-OPAL interface version 0x1 and 0x2 do not have
+			 * the dynamic data. Hence the only way to figure out
+ * if the OCC is up or not is to check the valid-bit
+ * in the pstate table.
+ */
+ if (occ_data->valid != 1) {
+ /**
+ * @fwts-label OCCInvalidPStateTable
+ * @fwts-advice The pstate table for a chip
+ * was not valid. This means that OCC (On Chip
+ * Controller) will be non-functional and CPU
+ * frequency scaling will not be functional. CPU may
+ * be set to a low, safe frequency. This means
+ * that CPU idle states and CPU frequency scaling
+ * may not be functional.
+ */
+ prlog(PR_ERR, "OCC: Chip: %x PState table is not valid\n",
+ chip->id);
+ return false;
+ }
+ break;
+
+ case 0x90:
+ /*
+ * OCC-OPAL interface version 0x90 has a
+ * dynamic data section. This has an
+ * occ_state field whose values inform about
+ * the state of the OCC.
+ *
+ * 0x00 = OCC not running. No communication
+ * allowed.
+ *
+ * 0x01 = Standby. No communication allowed.
+ *
+ * 0x02 = Observation State. Communication
+ * allowed and is command dependent.
+ *
+ * 0x03 = Active State. Communication allowed
+ * and is command dependent.
+ *
+ * 0x04 = Safe State. No communication
+ * allowed. Just like CPU throttle
+ * status, some failures will not allow
+ * for OCC to update state to safe.
+ *
+ * 0x05 = Characterization State.
+ * Communication allowed and is command
+ * dependent.
+ *
+ * We will error out if OCC is not in the
+ * Active State.
+ *
+ * XXX : Should we error out only if no
+ * communication is allowed with the
+ * OCC ?
+ */
+ occ_dyn_data = get_occ_dynamic_data(chip);
+ if (occ_dyn_data->occ_state != 0x3) {
+ /**
+ * @fwts-label OCCInactive
+ * @fwts-advice The OCC for a chip was not active.
+ * This means that CPU frequency scaling will
+ * not be functional. CPU may be set to a low,
+ * safe frequency. This means that CPU idle
+ * states and CPU frequency scaling may not be
+ * functional.
+ */
+ prlog(PR_ERR, "OCC: Chip: %x: OCC not active\n",
+ chip->id);
+ return false;
+ }
+ break;
+
+ case 0xA0:
+ /*
+			 * OCC-OPAL interface version 0xA0 also has a
+ * dynamic data section. This has an
+ * occ_state field whose values inform about
+ * the state of the OCC.
+ *
+ * 0x00 = OCC not running. No communication
+ * allowed.
+ *
+ * 0x01 = Standby. No communication allowed.
+ *
+ * 0x02 = Observation State. Communication
+ * allowed and is command dependent.
+ *
+ * 0x03 = Active State. Communication allowed
+ * and is command dependent.
+ *
+ * 0x04 = Safe State. No communication
+ * allowed. Just like CPU throttle
+ * status, some failures will not allow
+ * for OCC to update state to safe.
+ *
+ * 0x05 = Characterization State.
+ * Communication allowed and is command
+ * dependent.
+ *
+ * We will error out if OCC is not in the
+ * Active State.
+ *
+ * XXX : Should we error out only if no
+ * communication is allowed with the
+ * OCC ?
+ */
+ occ_dyn_data = get_occ_dynamic_data(chip);
+ if (occ_dyn_data->occ_state != 0x3) {
+ /**
+ * @fwts-label OCCInactive
+ * @fwts-advice The OCC for a chip was not active.
+ * This means that CPU frequency scaling will
+ * not be functional. CPU may be set to a low,
+ * safe frequency. This means that CPU idle
+ * states and CPU frequency scaling may not be
+ * functional.
+ */
+ prlog(PR_ERR, "OCC: Chip: %x: OCC not active\n",
+ chip->id);
+ return false;
+ }
+ break;
+
+ default:
+ prlog(PR_ERR, "OCC: Unknown OCC-OPAL interface version.\n");
+ return false;
+ }
+
+ if (!chip->occ_functional)
+ chip->occ_functional = true;
+
+ prlog(PR_DEBUG, "OCC: Chip %02x Data (%016llx) = %016llx\n",
+ chip->id, (uint64_t)occ_data, be64_to_cpu(*(__be64 *)occ_data));
+
+ if (version == 0x90 || version == 0xA0) {
+ occ_dyn_data = get_occ_dynamic_data(chip);
+ prlog(PR_DEBUG, "OCC: Chip %02x Dynamic Data (%016llx) = %016llx\n",
+ chip->id, (uint64_t)occ_dyn_data,
+ be64_to_cpu(*(__be64 *)occ_dyn_data));
+ }
+ }
+
+ end_time = mftb();
+ prlog(PR_NOTICE, "OCC: All Chip Rdy after %lu ms\n",
+ tb_to_msecs(end_time - start_time));
+
+ dt_for_each_compatible(dt_root, xn, "ibm,xscom") {
+ const struct dt_property *p;
+ p = dt_find_property(xn, "ibm,occ-functional-state");
+ if (!p)
+ dt_add_property_cells(xn, "ibm,occ-functional-state",
+ 0x1);
+ }
+ return true;
+}
+
+/*
+ * OCC provides pstate table entries in continuous descending order.
+ * Parse the pstate table to skip pstate_ids that are greater
+ * than Pmax. If a pstate_id is equal to Pmin then add it to
+ * the list and break from the loop as this is the last valid
+ * element in the pstate table.
+ */
+static void parse_pstates_v2(struct occ_pstate_table *data, __be32 *dt_id,
+ __be32 *dt_freq, int nr_pstates, int pmax, int pmin)
+{
+ int i, j;
+
+ for (i = 0, j = 0; i < MAX_PSTATES && j < nr_pstates; i++) {
+ if (cmp_pstates(data->v2.pstates[i].id, pmax) > 0)
+ continue;
+
+ dt_id[j] = cpu_to_be32(data->v2.pstates[i].id);
+ dt_freq[j] = cpu_to_be32(be32_to_cpu(data->v2.pstates[i].freq_khz) / 1000);
+ j++;
+
+ if (data->v2.pstates[i].id == pmin)
+ break;
+ }
+
+ if (j != nr_pstates)
+ prerror("OCC: Expected pstates(%d) is not equal to parsed pstates(%d)\n",
+ nr_pstates, j);
+}
+
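+/*
+ * Worked example for the parser above (illustrative numbers only,
+ * using P8-style signed ids where a larger value means a higher
+ * performance pstate): given a descending table { 4, 3, 2, 1, 0, -1 }
+ * with pmax = 2 and pmin = -1, ids 4 and 3 are skipped as being above
+ * pmax, ids 2, 1, 0 and -1 are copied into dt_id[]/dt_freq[], and the
+ * loop stops after adding pmin, so j ends up equal to nr_pstates (4).
+ * The v9/v10 variants below walk the table the same way.
+ */
+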
+static void parse_pstates_v9(struct occ_pstate_table *data, __be32 *dt_id,
+ __be32 *dt_freq, int nr_pstates, int pmax, int pmin)
+{
+ int i, j;
+
+ for (i = 0, j = 0; i < MAX_PSTATES && j < nr_pstates; i++) {
+ if (cmp_pstates(data->v9.pstates[i].id, pmax) > 0)
+ continue;
+
+ dt_id[j] = cpu_to_be32(data->v9.pstates[i].id);
+ dt_freq[j] = cpu_to_be32(be32_to_cpu(data->v9.pstates[i].freq_khz) / 1000);
+ j++;
+
+ if (data->v9.pstates[i].id == pmin)
+ break;
+ }
+
+ if (j != nr_pstates)
+ prerror("OCC: Expected pstates(%d) is not equal to parsed pstates(%d)\n",
+ nr_pstates, j);
+}
+
+static void parse_pstates_v10(struct occ_pstate_table *data, __be32 *dt_id,
+ __be32 *dt_freq, int nr_pstates, int pmax, int pmin)
+{
+ int i, j;
+ int invalid = 0;
+
+ for (i = 0, j = 0; i < MAX_PSTATES && j < nr_pstates; i++) {
+ if (cmp_pstates(data->v10.pstates[i].id, pmax) > 0)
+ continue;
+
+ if (!data->v10.pstates[i].valid) {
+ prlog(PR_WARNING, "OCC: Found Invalid pstate with index %d. Skipping it.\n", i);
+ invalid++;
+ continue;
+ }
+
+ dt_id[j] = cpu_to_be32(data->v10.pstates[i].id);
+ dt_freq[j] = cpu_to_be32(be32_to_cpu(data->v10.pstates[i].freq_khz) / 1000);
+ j++;
+
+ if (data->v10.pstates[i].id == pmin)
+ break;
+ }
+
+ if ((j + invalid) != nr_pstates) {
+ prerror("OCC: Expected pstates(%d) not equal to (Parsed pstates(%d) + Invalid Pstates (%d))\n",
+ nr_pstates, j, invalid);
+ }
+}
+
+static void parse_vid(struct occ_pstate_table *occ_data,
+ struct dt_node *node, u8 nr_pstates,
+ int pmax, int pmin)
+{
+ u8 *dt_vdd, *dt_vcs;
+ int i, j;
+
+ dt_vdd = malloc(nr_pstates);
+ assert(dt_vdd);
+ dt_vcs = malloc(nr_pstates);
+ assert(dt_vcs);
+
+ for (i = 0, j = 0; i < MAX_PSTATES && j < nr_pstates; i++) {
+ if (cmp_pstates(occ_data->v2.pstates[i].id, pmax) > 0)
+ continue;
+
+ dt_vdd[j] = occ_data->v2.pstates[i].vdd;
+ dt_vcs[j] = occ_data->v2.pstates[i].vcs;
+ j++;
+
+ if (occ_data->v2.pstates[i].id == pmin)
+ break;
+ }
+
+ dt_add_property(node, "ibm,pstate-vdds", dt_vdd, nr_pstates);
+ dt_add_property(node, "ibm,pstate-vcss", dt_vcs, nr_pstates);
+
+ free(dt_vdd);
+ free(dt_vcs);
+}
+
+/* Add device tree properties to describe pstates */
+/* Return nominal pstate to set in each core */
+static bool add_cpu_pstate_properties(struct dt_node *power_mgt,
+ int *pstate_nom)
+{
+ struct proc_chip *chip;
+ uint64_t occ_data_area;
+ struct occ_pstate_table *occ_data = NULL;
+ struct occ_dynamic_data *occ_dyn_data;
+ /* Arrays for device tree */
+ __be32 *dt_id, *dt_freq;
+ int pmax, pmin, pnom;
+ u8 nr_pstates;
+ bool ultra_turbo_supported;
+ int i, major, minor;
+
+ prlog(PR_DEBUG, "OCC: CPU pstate state device tree init\n");
+
+ /*
+ * Find the first chip with an OCC that has a valid
+ * pstate table
+ */
+ for_each_chip(chip) {
+ occ_data = get_occ_pstate_table(chip);
+
+ /* Dump first 16 bytes of PState table */
+ occ_data_area = (uint64_t)occ_data;
+ prlog(PR_DEBUG, "OCC: Chip %02d :Data (%16llx) = %16llx %16llx\n",
+ chip->id, occ_data_area,
+ be64_to_cpu(*(__be64 *)occ_data_area),
+ be64_to_cpu(*(__be64 *)(occ_data_area + 8)));
+
+ if (occ_data->valid)
+ break;
+ /*
+ * XXX : Error out if !occ_data->valid but Chip has at
+ * least one EX Unit?
+ */
+ }
+
+ assert(occ_data);
+ if (!occ_data->valid) {
+ /**
+ * @fwts-label OCCInvalidPStateTableDT
+ * @fwts-advice The pstate tables for none of the chips
+ * are valid. This means that OCC (On Chip
+ * Controller) will be non-functional. This means
+ * that CPU idle states and CPU frequency scaling
+ * will not be functional as OPAL doesn't populate
+ * the device tree with pstates in this case.
+ */
+ prlog(PR_ERR, "OCC: PState table is not valid\n");
+ return false;
+ }
+
+ /*
+ * Workload-Optimized-Frequency (WOF), or Ultra-Turbo, is supported
+ * from version 0x02 onwards. If WOF is disabled, the max
+ * ultra_turbo pstate will be equal to the max turbo pstate.
+ */
+ ultra_turbo_supported = true;
+
+ major = occ_data->version >> 4;
+ minor = occ_data->version & 0xF;
+
+ /* Parse Pmax, Pmin and Pnominal */
+ switch (major) {
+ case 0:
+ if (proc_gen >= proc_gen_p9) {
+ /**
+ * @fwts-label OCCInvalidVersion02
+ * @fwts-advice The PState table layout version is not
+ * supported in P9. So OPAL will not parse the PState
+ * table. CPU frequency scaling will not be functional
+ * as frequency and pstate-ids are not added to DT.
+ */
+ prerror("OCC: Version %x is not supported in P9\n",
+ occ_data->version);
+ return false;
+ }
+ if (minor == 0x1)
+ ultra_turbo_supported = false;
+ pmin = occ_data->v2.pstate_min;
+ pnom = occ_data->v2.pstate_nom;
+ if (ultra_turbo_supported)
+ pmax = occ_data->v2.pstate_ultra_turbo;
+ else
+ pmax = occ_data->v2.pstate_turbo;
+ break;
+ case 0x9:
+ if (proc_gen == proc_gen_p8) {
+ /**
+ * @fwts-label OCCInvalidVersion90
+ * @fwts-advice The PState table layout version is not
+ * supported in P8. So OPAL will not parse the PState
+ * table. CPU frequency scaling will not be functional
+ * as frequency and pstate-ids are not added to DT.
+ */
+ prerror("OCC: Version %x is not supported in P8\n",
+ occ_data->version);
+ return false;
+ }
+ pmin = occ_data->v9.pstate_min;
+ pnom = occ_data->v9.pstate_nom;
+ pmax = occ_data->v9.pstate_ultra_turbo;
+ break;
+ case 0xA:
+ pmin = occ_data->v10.pstate_min;
+ pnom = occ_data->v10.pstate_fixed_freq;
+ occ_dyn_data = get_occ_dynamic_data(chip);
+ if (occ_dyn_data->v10.wof_enabled)
+ pmax = occ_data->v10.pstate_ultra_turbo;
+ else
+ pmax = occ_data->v10.pstate_fmax;
+ break;
+ default:
+ /**
+ * @fwts-label OCCUnsupportedVersion
+ * @fwts-advice The PState table layout version is not
+ * supported. So OPAL will not parse the PState table.
+ * CPU frequency scaling will not be functional as OPAL
+ * doesn't populate the device tree with pstates.
+ */
+ prerror("OCC: Unsupported pstate table layout version %d\n",
+ occ_data->version);
+ return false;
+ }
+
+ /* Sanity check for pstate limits */
+ if (cmp_pstates(pmin, pmax) > 0) {
+ /**
+ * @fwts-label OCCInvalidPStateLimits
+ * @fwts-advice The min pstate is greater than the
+ * max pstate, this could be due to corrupted/invalid
+ * data in OCC-OPAL shared memory region. So OPAL has
+ * not added pstates to device tree. This means that
+ * CPU Frequency management will not be functional in
+ * the host.
+ */
+ prerror("OCC: Invalid pstate limits. Pmin(%d) > Pmax (%d)\n",
+ pmin, pmax);
+ return false;
+ }
+
+ if (cmp_pstates(pnom, pmax) > 0) {
+ /**
+ * @fwts-label OCCInvalidNominalPState
+ * @fwts-advice The nominal pstate is greater than the
+ * max pstate, this could be due to corrupted/invalid
+ * data in OCC-OPAL shared memory region. So OPAL has
+ * limited the nominal pstate to max pstate.
+ */
+ prerror("OCC: Clipping nominal pstate(%d) to Pmax(%d)\n",
+ pnom, pmax);
+ pnom = pmax;
+ }
+
+ nr_pstates = labs(pmax - pmin) + 1;
+ prlog(PR_DEBUG, "OCC: Version %x Min %d Nom %d Max %d Nr States %d\n",
+ occ_data->version, pmin, pnom, pmax, nr_pstates);
+ if (((major == 0x9 || major == 0xA) && nr_pstates <= 1) ||
+ (major == 0 && (nr_pstates <= 1 || nr_pstates > 128))) {
+ /**
+ * @fwts-label OCCInvalidPStateRange
+ * @fwts-advice The number of pstates is outside the valid
+ * range (currently <=1 or > 128 on p8, >255 on P9), so OPAL
+ * has not added pstates to the device tree. This means that
+ * OCC (On Chip Controller) will be non-functional. This means
+ * that CPU idle states and CPU frequency scaling
+ * will not be functional.
+ */
+ prerror("OCC: OCC range is not valid; No of pstates = %d\n",
+ nr_pstates);
+ return false;
+ }
+
+ dt_id = malloc(nr_pstates * sizeof(__be32));
+ assert(dt_id);
+ dt_freq = malloc(nr_pstates * sizeof(__be32));
+ assert(dt_freq);
+
+ switch (major) {
+ case 0:
+ parse_pstates_v2(occ_data, dt_id, dt_freq, nr_pstates,
+ pmax, pmin);
+ break;
+ case 0x9:
+ parse_pstates_v9(occ_data, dt_id, dt_freq, nr_pstates,
+ pmax, pmin);
+ break;
+ case 0xA:
+ parse_pstates_v10(occ_data, dt_id, dt_freq, nr_pstates,
+ pmax, pmin);
+ break;
+ default:
+ return false;
+ }
+
+ /* Add the device-tree entries */
+ dt_add_property(power_mgt, "ibm,pstate-ids", dt_id,
+ nr_pstates * sizeof(__be32));
+ dt_add_property(power_mgt, "ibm,pstate-frequencies-mhz", dt_freq,
+ nr_pstates * sizeof(__be32));
+ dt_add_property_cells(power_mgt, "ibm,pstate-min", pmin);
+ dt_add_property_cells(power_mgt, "ibm,pstate-nominal", pnom);
+ dt_add_property_cells(power_mgt, "ibm,pstate-max", pmax);
+
+ free(dt_freq);
+ free(dt_id);
+
+ /*
+ * Parse and add WOF properties: turbo, ultra-turbo and core_max array.
+ * core_max[1..n] array provides the max sustainable pstate that can be
+ * achieved with i active cores in the chip.
+ */
+ if (ultra_turbo_supported) {
+ int pturbo, pultra_turbo;
+ u8 nr_cores = get_available_nr_cores_in_chip(chip->id);
+ __be32 *dt_cmax;
+
+ dt_cmax = malloc(nr_cores * sizeof(u32));
+ assert(dt_cmax);
+ switch (major) {
+ case 0:
+ pturbo = occ_data->v2.pstate_turbo;
+ pultra_turbo = occ_data->v2.pstate_ultra_turbo;
+ for (i = 0; i < nr_cores; i++)
+ dt_cmax[i] = cpu_to_be32(occ_data->v2.core_max[i]);
+ break;
+ case 0x9:
+ pturbo = occ_data->v9.pstate_turbo;
+ pultra_turbo = occ_data->v9.pstate_ultra_turbo;
+ for (i = 0; i < nr_cores; i++)
+ dt_cmax[i] = cpu_to_be32(occ_data->v9.core_max[i]);
+ break;
+ case 0xA:
+ pturbo = occ_data->v10.pstate_base;
+ pultra_turbo = occ_data->v10.pstate_ultra_turbo;
+ for (i = 0; i < nr_cores; i++)
+ dt_cmax[i] = cpu_to_be32(occ_data->v10.core_max[i]);
+ break;
+ default:
+ return false;
+ }
+
+ if (cmp_pstates(pturbo, pmax) > 0) {
+ prerror("OCC: Clipping turbo pstate(%d) to Pmax(%d)\n",
+ pturbo, pmax);
+ dt_add_property_cells(power_mgt, "ibm,pstate-turbo",
+ pmax);
+ } else {
+ dt_add_property_cells(power_mgt, "ibm,pstate-turbo",
+ pturbo);
+ }
+
+ dt_add_property_cells(power_mgt, "ibm,pstate-ultra-turbo",
+ pultra_turbo);
+ dt_add_property(power_mgt, "ibm,pstate-core-max", dt_cmax,
+ nr_cores * sizeof(u32));
+
+ dt_add_property_cells(power_mgt, "ibm,pstate-base", pturbo);
+ free(dt_cmax);
+ }
+
+ if (major == 0x9 || major == 0xA)
+ goto out;
+
+ dt_add_property_cells(power_mgt, "#address-cells", 2);
+ dt_add_property_cells(power_mgt, "#size-cells", 1);
+
+ /* Add chip specific pstate properties */
+ for_each_chip(chip) {
+ struct dt_node *occ_node;
+
+ occ_data = get_occ_pstate_table(chip);
+ occ_node = dt_new_addr(power_mgt, "occ", (uint64_t)occ_data);
+ if (!occ_node) {
+ /**
+ * @fwts-label OCCDTFailedNodeCreation
+ * @fwts-advice Failed to create
+ * /ibm,opal/power-mgt/occ. Per-chip pstate properties
+ * are not added to Device Tree.
+ */
+ prerror("OCC: Failed to create /ibm,opal/power-mgt/occ@%llx\n",
+ (uint64_t)occ_data);
+ return false;
+ }
+
+ dt_add_property_cells(occ_node, "reg",
+ hi32((uint64_t)occ_data),
+ lo32((uint64_t)occ_data),
+ OPAL_DYNAMIC_DATA_OFFSET +
+ sizeof(struct occ_dynamic_data));
+ dt_add_property_cells(occ_node, "ibm,chip-id", chip->id);
+
+ /*
+ * Parse and add pstate Voltage Identifiers (VID) to DT which
+ * are provided by OCC in version 0x01 and 0x02
+ */
+ parse_vid(occ_data, occ_node, nr_pstates, pmax, pmin);
+ }
+out:
+ /* Return pstate to set for each core */
+ *pstate_nom = pnom;
+ return true;
+}
+
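+/*
+ * For illustration, a successful add_cpu_pstate_properties() run
+ * leaves a device tree fragment roughly like this (values elided):
+ *
+ *   /ibm,opal/power-mgt {
+ *           ibm,pstate-ids = < ... >;
+ *           ibm,pstate-frequencies-mhz = < ... >;
+ *           ibm,pstate-min = < ... >;
+ *           ibm,pstate-nominal = < ... >;
+ *           ibm,pstate-max = < ... >;
+ *   };
+ *
+ * plus ibm,pstate-turbo, ibm,pstate-ultra-turbo, ibm,pstate-base and
+ * ibm,pstate-core-max when ultra-turbo/WOF is supported, and per-chip
+ * occ@ nodes with VIDs on the v1/v2 (P8) layout.
+ */
+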
+/*
+ * Prepare chip for pstate transitions
+ */
+
+static bool cpu_pstates_prepare_core(struct proc_chip *chip,
+ struct cpu_thread *c,
+ int pstate_nom)
+{
+ uint32_t core = pir_to_core_id(c->pir);
+ uint64_t tmp, pstate;
+ int rc;
+
+ /*
+ * Currently Fastsleep init clears EX_PM_SPR_OVERRIDE_EN.
+ * Need to ensure only relevant bits are inited
+ */
+
+ /* Init PM GP1 for SCOM based PSTATE control to set nominal freq
+ *
+ * Use the OR SCOM to set the required bits in PM_GP1 register
+ * since the OCC might be manipulating the PM_GP1 register as well.
+ */
+ rc = xscom_write(chip->id, XSCOM_ADDR_P8_EX_SLAVE(core, EX_PM_SET_GP1),
+ EX_PM_SETUP_GP1_PM_SPR_OVERRIDE_EN);
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_OCC_PSTATE_INIT),
+ "OCC: Failed to write PM_GP1 in pstates init\n");
+ return false;
+ }
+
+ /* Set new pstate to core */
+ rc = xscom_read(chip->id, XSCOM_ADDR_P8_EX_SLAVE(core, EX_PM_PPMCR), &tmp);
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_OCC_PSTATE_INIT),
+ "OCC: Failed to read PM_PPMCR from OCC in pstates init\n");
+ return false;
+ }
+ tmp = tmp & ~0xFFFF000000000000ULL;
+ pstate = ((uint64_t) pstate_nom) & 0xFF;
+ tmp = tmp | (pstate << 56) | (pstate << 48);
+ rc = xscom_write(chip->id, XSCOM_ADDR_P8_EX_SLAVE(core, EX_PM_PPMCR), tmp);
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_OCC_PSTATE_INIT),
+ "OCC: Failed to write PM_PPMCR in pstates init\n");
+ return false;
+ }
+ time_wait_ms(1); /* Wait for PState to change */
+ /*
+ * Init PM GP1 for SPR based PSTATE control.
+ * Once the OCC is active, EX_PM_SETUP_GP1_DPLL_FREQ_OVERRIDE_EN will be
+ * cleared by the OCC, so Sapphire need not clear it.
+ * However, wait for the DVFS state machine to become idle after the
+ * min->nominal transition initiated above; otherwise the switchover to
+ * SPR control could fail.
+ *
+ * Use the AND SCOM to clear the required bits in the PM_GP1 register
+ * since the OCC might be manipulating the PM_GP1 register as well.
+ */
+ tmp = ~EX_PM_SETUP_GP1_PM_SPR_OVERRIDE_EN;
+ rc = xscom_write(chip->id, XSCOM_ADDR_P8_EX_SLAVE(core, EX_PM_CLEAR_GP1),
+ tmp);
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_OCC_PSTATE_INIT),
+ "OCC: Failed to write PM_GP1 in pstates init\n");
+ return false;
+ }
+
+ /* Just debug */
+ rc = xscom_read(chip->id, XSCOM_ADDR_P8_EX_SLAVE(core, EX_PM_PPMSR), &tmp);
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_OCC_PSTATE_INIT),
+ "OCC: Failed to read PM_PPMSR from OCC"
+ "in pstates init\n");
+ return false;
+ }
+ prlog(PR_DEBUG, "OCC: Chip %x Core %x PPMSR %016llx\n",
+ chip->id, core, tmp);
+
+ /*
+ * If PMSR is still in transition at this point due to PState change
+ * initiated above, then the switchover to SPR may not work.
+ * ToDo: Check for DVFS state machine idle before change.
+ */
+
+ return true;
+}
+
+static bool occ_opal_msg_outstanding = false;
+static void occ_msg_consumed(void *data __unused, int status __unused)
+{
+ lock(&occ_lock);
+ occ_opal_msg_outstanding = false;
+ unlock(&occ_lock);
+}
+
+static inline u8 get_cpu_throttle(struct proc_chip *chip)
+{
+ struct occ_pstate_table *pdata = get_occ_pstate_table(chip);
+ struct occ_dynamic_data *data;
+
+ switch (pdata->version >> 4) {
+ case 0:
+ return pdata->v2.throttle;
+ case 0x9:
+ case 0xA:
+ data = get_occ_dynamic_data(chip);
+ return data->cpu_throttle;
+ default:
+ return 0;
+ };
+}
+
+bool is_occ_reset(void)
+{
+ return occ_reset;
+}
+
+static void occ_throttle_poll(void *data __unused)
+{
+ struct proc_chip *chip;
+ struct occ_pstate_table *occ_data;
+ struct opal_occ_msg occ_msg;
+ int rc;
+
+ if (!try_lock(&occ_lock))
+ return;
+ if (occ_reset) {
+ int inactive = 0;
+
+ for_each_chip(chip) {
+ occ_data = get_occ_pstate_table(chip);
+ if (occ_data->valid != 1) {
+ inactive = 1;
+ break;
+ }
+ }
+ if (!inactive) {
+ /*
+ * Queue OCC_THROTTLE with throttle status as 0 to
+ * indicate all OCCs are active after a reset.
+ */
+ occ_msg.type = cpu_to_be64(OCC_THROTTLE);
+ occ_msg.chip = 0;
+ occ_msg.throttle_status = 0;
+ rc = _opal_queue_msg(OPAL_MSG_OCC, NULL, NULL,
+ sizeof(struct opal_occ_msg),
+ &occ_msg);
+ if (!rc)
+ occ_reset = false;
+ }
+ } else {
+ if (occ_opal_msg_outstanding)
+ goto done;
+ for_each_chip(chip) {
+ u8 throttle;
+
+ occ_data = get_occ_pstate_table(chip);
+ throttle = get_cpu_throttle(chip);
+ if ((occ_data->valid == 1) &&
+ (chip->throttle != throttle) &&
+ (throttle <= OCC_MAX_THROTTLE_STATUS)) {
+ occ_msg.type = cpu_to_be64(OCC_THROTTLE);
+ occ_msg.chip = cpu_to_be64(chip->id);
+ occ_msg.throttle_status = cpu_to_be64(throttle);
+ rc = _opal_queue_msg(OPAL_MSG_OCC, NULL,
+ occ_msg_consumed,
+ sizeof(struct opal_occ_msg),
+ &occ_msg);
+ if (!rc) {
+ chip->throttle = throttle;
+ occ_opal_msg_outstanding = true;
+ break;
+ }
+ }
+ }
+ }
+done:
+ unlock(&occ_lock);
+}
+
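+/*
+ * Note on the poller above: it serves two purposes. While occ_reset is
+ * set it waits for every chip's pstate table to become valid again and
+ * then queues a single OCC_THROTTLE message with a zero status to tell
+ * the host that all OCCs are back. Otherwise it reports per-chip
+ * throttle status changes, one message at a time, using
+ * occ_opal_msg_outstanding to avoid queueing a new notification before
+ * the previous one has been consumed.
+ */
+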
+/* OPAL-OCC Command/Response Interface */
+
+enum occ_state {
+ OCC_STATE_NOT_RUNNING = 0x00,
+ OCC_STATE_STANDBY = 0x01,
+ OCC_STATE_OBSERVATION = 0x02,
+ OCC_STATE_ACTIVE = 0x03,
+ OCC_STATE_SAFE = 0x04,
+ OCC_STATE_CHARACTERIZATION = 0x05,
+};
+
+enum occ_role {
+ OCC_ROLE_SLAVE = 0x0,
+ OCC_ROLE_MASTER = 0x1,
+};
+
+enum occ_cmd {
+ OCC_CMD_CLEAR_SENSOR_DATA,
+ OCC_CMD_SET_POWER_CAP,
+ OCC_CMD_SET_POWER_SHIFTING_RATIO,
+ OCC_CMD_SELECT_SENSOR_GROUP,
+};
+
+struct opal_occ_cmd_info {
+ enum occ_cmd cmd;
+ u8 cmd_value;
+ u16 cmd_size;
+ u16 rsp_size;
+ int timeout_ms;
+ u16 state_mask;
+ u8 role_mask;
+};
+
+static struct opal_occ_cmd_info occ_cmds[] = {
+ { OCC_CMD_CLEAR_SENSOR_DATA,
+ 0xD0, 4, 4, 1000,
+ PPC_BIT16(OCC_STATE_OBSERVATION) |
+ PPC_BIT16(OCC_STATE_ACTIVE) |
+ PPC_BIT16(OCC_STATE_CHARACTERIZATION),
+ PPC_BIT8(OCC_ROLE_MASTER) | PPC_BIT8(OCC_ROLE_SLAVE)
+ },
+ { OCC_CMD_SET_POWER_CAP,
+ 0xD1, 2, 2, 1000,
+ PPC_BIT16(OCC_STATE_OBSERVATION) |
+ PPC_BIT16(OCC_STATE_ACTIVE) |
+ PPC_BIT16(OCC_STATE_CHARACTERIZATION),
+ PPC_BIT8(OCC_ROLE_MASTER)
+ },
+ { OCC_CMD_SET_POWER_SHIFTING_RATIO,
+ 0xD2, 1, 1, 1000,
+ PPC_BIT16(OCC_STATE_OBSERVATION) |
+ PPC_BIT16(OCC_STATE_ACTIVE) |
+ PPC_BIT16(OCC_STATE_CHARACTERIZATION),
+ PPC_BIT8(OCC_ROLE_MASTER) | PPC_BIT8(OCC_ROLE_SLAVE)
+ },
+ { OCC_CMD_SELECT_SENSOR_GROUP,
+ 0xD3, 2, 2, 1000,
+ PPC_BIT16(OCC_STATE_OBSERVATION) |
+ PPC_BIT16(OCC_STATE_ACTIVE) |
+ PPC_BIT16(OCC_STATE_CHARACTERIZATION),
+ PPC_BIT8(OCC_ROLE_MASTER) | PPC_BIT8(OCC_ROLE_SLAVE)
+ },
+};
+
+enum occ_response_status {
+ OCC_RSP_SUCCESS = 0x00,
+ OCC_RSP_INVALID_COMMAND = 0x11,
+ OCC_RSP_INVALID_CMD_DATA_LENGTH = 0x12,
+ OCC_RSP_INVALID_DATA = 0x13,
+ OCC_RSP_INTERNAL_ERROR = 0x15,
+};
+
+#define OCC_FLAG_RSP_READY 0x01
+#define OCC_FLAG_CMD_IN_PROGRESS 0x02
+#define OPAL_FLAG_CMD_READY 0x80
+
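+/*
+ * Rough sketch of the command/response handshake implied by the flags
+ * above: OPAL fills the command buffer and sets OPAL_FLAG_CMD_READY
+ * (see write_occ_cmd() below); the OCC sets OCC_FLAG_CMD_IN_PROGRESS
+ * while it is working on the command and OCC_FLAG_RSP_READY once the
+ * response buffer holds a valid reply, at which point handle_occ_rsp()
+ * completes the caller's async token. A timer bounds the whole
+ * exchange in case the OCC never answers.
+ */
+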
+struct opal_occ_cmd_data {
+ u8 *data;
+ enum occ_cmd cmd;
+};
+
+static struct cmd_interface {
+ struct lock queue_lock;
+ struct timer timeout;
+ struct opal_occ_cmd_data *cdata;
+ struct opal_command_buffer *cmd;
+ struct occ_response_buffer *rsp;
+ u8 *occ_state;
+ u8 *valid;
+ u32 chip_id;
+ u32 token;
+ u16 enabled_sensor_mask;
+ u8 occ_role;
+ u8 request_id;
+ bool cmd_in_progress;
+ bool retry;
+} *chips;
+
+static int nr_occs;
+
+static inline struct cmd_interface *get_chip_cmd_interface(int chip_id)
+{
+ int i;
+
+ for (i = 0; i < nr_occs; i++)
+ if (chips[i].chip_id == chip_id)
+ return &chips[i];
+
+ return NULL;
+}
+
+static inline bool occ_in_progress(struct cmd_interface *chip)
+{
+ return (chip->rsp->flag == OCC_FLAG_CMD_IN_PROGRESS);
+}
+
+static int write_occ_cmd(struct cmd_interface *chip)
+{
+ struct opal_command_buffer *cmd = chip->cmd;
+ enum occ_cmd ocmd = chip->cdata->cmd;
+
+ if (!chip->retry && occ_in_progress(chip)) {
+ chip->cmd_in_progress = false;
+ return OPAL_BUSY;
+ }
+
+ cmd->flag = chip->rsp->flag = 0;
+ cmd->cmd = occ_cmds[ocmd].cmd_value;
+ cmd->request_id = chip->request_id++;
+ cmd->data_size = occ_cmds[ocmd].cmd_size;
+ memcpy(&cmd->data, chip->cdata->data, cmd->data_size);
+ cmd->flag = OPAL_FLAG_CMD_READY;
+
+ schedule_timer(&chip->timeout,
+ msecs_to_tb(occ_cmds[ocmd].timeout_ms));
+
+ return OPAL_ASYNC_COMPLETION;
+}
+
+static int64_t opal_occ_command(struct cmd_interface *chip, int token,
+ struct opal_occ_cmd_data *cdata)
+{
+ int rc;
+
+ if (!(*chip->valid) ||
+ (!(PPC_BIT16(*chip->occ_state) & occ_cmds[cdata->cmd].state_mask)))
+ return OPAL_HARDWARE;
+
+ if (!(PPC_BIT8(chip->occ_role) & occ_cmds[cdata->cmd].role_mask))
+ return OPAL_PERMISSION;
+
+ lock(&chip->queue_lock);
+ if (chip->cmd_in_progress) {
+ rc = OPAL_BUSY;
+ goto out;
+ }
+
+ chip->cdata = cdata;
+ chip->token = token;
+ chip->cmd_in_progress = true;
+ chip->retry = false;
+ rc = write_occ_cmd(chip);
+out:
+ unlock(&chip->queue_lock);
+ return rc;
+}
+
+static inline bool sanity_check_opal_cmd(struct opal_command_buffer *cmd,
+ struct cmd_interface *chip)
+{
+ return ((cmd->cmd == occ_cmds[chip->cdata->cmd].cmd_value) &&
+ (cmd->request_id == chip->request_id - 1) &&
+ (cmd->data_size == occ_cmds[chip->cdata->cmd].cmd_size));
+}
+
+static inline bool check_occ_rsp(struct opal_command_buffer *cmd,
+ struct occ_response_buffer *rsp)
+{
+ if (cmd->cmd != rsp->cmd) {
+ prlog(PR_DEBUG, "OCC: Command value mismatch in OCC response"
+ "rsp->cmd = %d cmd->cmd = %d\n", rsp->cmd, cmd->cmd);
+ return false;
+ }
+
+ if (cmd->request_id != rsp->request_id) {
+ prlog(PR_DEBUG, "OCC: Request ID mismatch in OCC response"
+ "rsp->request_id = %d cmd->request_id = %d\n",
+ rsp->request_id, cmd->request_id);
+ return false;
+ }
+
+ return true;
+}
+
+static inline void queue_occ_rsp_msg(int token, int rc)
+{
+ int ret;
+
+ ret = opal_queue_msg(OPAL_MSG_ASYNC_COMP, NULL, NULL,
+ cpu_to_be64(token),
+ cpu_to_be64(rc));
+ if (ret)
+ prerror("OCC: Failed to queue OCC response status message\n");
+}
+
+static void occ_cmd_timeout_handler(struct timer *t __unused, void *data,
+ uint64_t now __unused)
+{
+ struct cmd_interface *chip = data;
+
+ lock(&chip->queue_lock);
+ if (!chip->cmd_in_progress)
+ goto exit;
+
+ if (!chip->retry) {
+ prlog(PR_DEBUG, "OCC: Command timeout, retrying\n");
+ chip->retry = true;
+ write_occ_cmd(chip);
+ } else {
+ chip->cmd_in_progress = false;
+ queue_occ_rsp_msg(chip->token, OPAL_TIMEOUT);
+ prlog(PR_DEBUG, "OCC: Command timeout after retry\n");
+ }
+exit:
+ unlock(&chip->queue_lock);
+}
+
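+/*
+ * Both the timeout path above and the command/response mismatch path
+ * in handle_occ_rsp() below retry a command exactly once; a second
+ * failure completes the caller's async token with OPAL_TIMEOUT or
+ * OPAL_INTERNAL_ERROR respectively.
+ */
+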
+static int read_occ_rsp(struct occ_response_buffer *rsp)
+{
+ switch (rsp->status) {
+ case OCC_RSP_SUCCESS:
+ return OPAL_SUCCESS;
+ case OCC_RSP_INVALID_COMMAND:
+ prlog(PR_DEBUG, "OCC: Rsp status: Invalid command\n");
+ break;
+ case OCC_RSP_INVALID_CMD_DATA_LENGTH:
+ prlog(PR_DEBUG, "OCC: Rsp status: Invalid command data length\n");
+ break;
+ case OCC_RSP_INVALID_DATA:
+ prlog(PR_DEBUG, "OCC: Rsp status: Invalid command data\n");
+ break;
+ case OCC_RSP_INTERNAL_ERROR:
+ prlog(PR_DEBUG, "OCC: Rsp status: OCC internal error\n");
+ break;
+ default:
+ break;
+ }
+
+ /* Clear the OCC response flag */
+ rsp->flag = 0;
+ return OPAL_INTERNAL_ERROR;
+}
+
+static void handle_occ_rsp(uint32_t chip_id)
+{
+ struct cmd_interface *chip;
+ struct opal_command_buffer *cmd;
+ struct occ_response_buffer *rsp;
+
+ chip = get_chip_cmd_interface(chip_id);
+ if (!chip)
+ return;
+
+ cmd = chip->cmd;
+ rsp = chip->rsp;
+
+ /* Read rsp */
+ if (rsp->flag != OCC_FLAG_RSP_READY)
+ return;
+ lock(&chip->queue_lock);
+ if (!chip->cmd_in_progress)
+ goto exit;
+
+ cancel_timer(&chip->timeout);
+ if (!sanity_check_opal_cmd(cmd, chip) ||
+ !check_occ_rsp(cmd, rsp)) {
+ if (!chip->retry) {
+ prlog(PR_DEBUG, "OCC: Command-response mismatch, retrying\n");
+ chip->retry = true;
+ write_occ_cmd(chip);
+ } else {
+ chip->cmd_in_progress = false;
+ queue_occ_rsp_msg(chip->token, OPAL_INTERNAL_ERROR);
+ prlog(PR_DEBUG, "OCC: Command-response mismatch\n");
+ }
+ goto exit;
+ }
+
+ if (rsp->cmd == occ_cmds[OCC_CMD_SELECT_SENSOR_GROUP].cmd_value &&
+ rsp->status == OCC_RSP_SUCCESS)
+ chip->enabled_sensor_mask = *(u16 *)chip->cdata->data;
+
+ chip->cmd_in_progress = false;
+ queue_occ_rsp_msg(chip->token, read_occ_rsp(chip->rsp));
+exit:
+ unlock(&chip->queue_lock);
+}
+
+bool occ_get_gpu_presence(struct proc_chip *chip, int gpu_num)
+{
+ struct occ_dynamic_data *ddata;
+ static int max_retries = 20;
+ static bool found = false;
+
+ assert(gpu_num <= 2);
+
+ ddata = get_occ_dynamic_data(chip);
+ while (!found && max_retries) {
+ if (ddata->major_version == 0 && ddata->minor_version >= 1) {
+ found = true;
+ break;
+ }
+ time_wait_ms(100);
+ max_retries--;
+ ddata = get_occ_dynamic_data(chip);
+ }
+
+ if (!found) {
+ prlog(PR_INFO, "OCC: No GPU slot presence, assuming GPU present\n");
+ return true;
+ }
+
+ return (bool)(ddata->gpus_present & 1 << gpu_num);
+}
+
+static void occ_add_powercap_sensors(struct dt_node *power_mgt);
+static void occ_add_psr_sensors(struct dt_node *power_mgt);
+
+static void occ_cmd_interface_init(void)
+{
+ struct occ_dynamic_data *data;
+ struct occ_pstate_table *pdata;
+ struct dt_node *power_mgt;
+ struct proc_chip *chip;
+ int i = 0, major;
+
+ /* Check if the OCC data is valid */
+ for_each_chip(chip) {
+ pdata = get_occ_pstate_table(chip);
+ if (!pdata->valid)
+ return;
+ }
+
+ chip = next_chip(NULL);
+ pdata = get_occ_pstate_table(chip);
+ major = pdata->version >> 4;
+ if (major != 0x9 && major != 0xA)
+ return;
+
+ for_each_chip(chip)
+ nr_occs++;
+
+ chips = malloc(sizeof(*chips) * nr_occs);
+ assert(chips);
+
+ for_each_chip(chip) {
+ pdata = get_occ_pstate_table(chip);
+ data = get_occ_dynamic_data(chip);
+ chips[i].chip_id = chip->id;
+ chips[i].occ_state = &data->occ_state;
+ chips[i].valid = &pdata->valid;
+ chips[i].cmd = &data->cmd;
+ chips[i].rsp = &data->rsp;
+ switch (major) {
+ case 0x9:
+ chips[i].occ_role = pdata->v9.occ_role;
+ break;
+ case 0xA:
+ chips[i].occ_role = pdata->v10.occ_role;
+ break;
+ }
+ init_lock(&chips[i].queue_lock);
+ chips[i].cmd_in_progress = false;
+ chips[i].request_id = 0;
+ chips[i].enabled_sensor_mask = OCC_ENABLED_SENSOR_MASK;
+ init_timer(&chips[i].timeout, occ_cmd_timeout_handler,
+ &chips[i]);
+ i++;
+ }
+
+ power_mgt = dt_find_by_path(dt_root, "/ibm,opal/power-mgt");
+ if (!power_mgt) {
+ prerror("OCC: dt node /ibm,opal/power-mgt not found\n");
+ return;
+ }
+
+ /* Add powercap sensors to DT */
+ occ_add_powercap_sensors(power_mgt);
+
+ /* Add power-shifting-ratio CPU-GPU sensors to DT */
+ occ_add_psr_sensors(power_mgt);
+}
+
+/* Powercap interface */
+enum sensor_powercap_occ_attr {
+ POWERCAP_OCC_SOFT_MIN,
+ POWERCAP_OCC_MAX,
+ POWERCAP_OCC_CUR,
+ POWERCAP_OCC_HARD_MIN,
+};
+
+static void occ_add_powercap_sensors(struct dt_node *power_mgt)
+{
+ struct dt_node *pcap, *node;
+ u32 handle;
+
+ pcap = dt_new(power_mgt, "powercap");
+ if (!pcap) {
+ prerror("OCC: Failed to create powercap node\n");
+ return;
+ }
+
+ dt_add_property_string(pcap, "compatible", "ibm,opal-powercap");
+ node = dt_new(pcap, "system-powercap");
+ if (!node) {
+ prerror("OCC: Failed to create system powercap node\n");
+ return;
+ }
+
+ handle = powercap_make_handle(POWERCAP_CLASS_OCC, POWERCAP_OCC_CUR);
+ dt_add_property_cells(node, "powercap-current", handle);
+
+ handle = powercap_make_handle(POWERCAP_CLASS_OCC,
+ POWERCAP_OCC_SOFT_MIN);
+ dt_add_property_cells(node, "powercap-min", handle);
+
+ handle = powercap_make_handle(POWERCAP_CLASS_OCC, POWERCAP_OCC_MAX);
+ dt_add_property_cells(node, "powercap-max", handle);
+
+ handle = powercap_make_handle(POWERCAP_CLASS_OCC,
+ POWERCAP_OCC_HARD_MIN);
+ dt_add_property_cells(node, "powercap-hard-min", handle);
+
+}
+
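+/*
+ * The resulting device tree fragment looks roughly like this (the
+ * actual property values are the opaque handles generated by
+ * powercap_make_handle() above):
+ *
+ *   power-mgt {
+ *           powercap {
+ *                   compatible = "ibm,opal-powercap";
+ *                   system-powercap {
+ *                           powercap-current = < handle >;
+ *                           powercap-min = < handle >;
+ *                           powercap-max = < handle >;
+ *                           powercap-hard-min = < handle >;
+ *                   };
+ *           };
+ *   };
+ */
+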
+int occ_get_powercap(u32 handle, u32 *pcap)
+{
+ struct occ_pstate_table *pdata;
+ struct occ_dynamic_data *ddata;
+ struct proc_chip *chip;
+
+ chip = next_chip(NULL);
+ pdata = get_occ_pstate_table(chip);
+ ddata = get_occ_dynamic_data(chip);
+
+ if (!pdata->valid)
+ return OPAL_HARDWARE;
+
+ switch (powercap_get_attr(handle)) {
+ case POWERCAP_OCC_SOFT_MIN:
+ *pcap = ddata->soft_min_pwr_cap;
+ break;
+ case POWERCAP_OCC_MAX:
+ *pcap = ddata->max_pwr_cap;
+ break;
+ case POWERCAP_OCC_CUR:
+ *pcap = ddata->cur_pwr_cap;
+ break;
+ case POWERCAP_OCC_HARD_MIN:
+ *pcap = ddata->hard_min_pwr_cap;
+ break;
+ default:
+ *pcap = 0;
+ return OPAL_UNSUPPORTED;
+ }
+
+ return OPAL_SUCCESS;
+}
+
+static u16 pcap_cdata;
+static struct opal_occ_cmd_data pcap_data = {
+ .data = (u8 *)&pcap_cdata,
+ .cmd = OCC_CMD_SET_POWER_CAP,
+};
+
+int __attribute__((__const__)) occ_set_powercap(u32 handle, int token, u32 pcap)
+{
+ struct occ_dynamic_data *ddata;
+ struct proc_chip *chip;
+ int i;
+
+ if (powercap_get_attr(handle) != POWERCAP_OCC_CUR)
+ return OPAL_PERMISSION;
+
+ if (!chips)
+ return OPAL_HARDWARE;
+
+ for (i = 0; i < nr_occs; i++)
+ if (chips[i].occ_role == OCC_ROLE_MASTER)
+ break;
+
+ if (!(*chips[i].valid))
+ return OPAL_HARDWARE;
+
+ chip = get_chip(chips[i].chip_id);
+ ddata = get_occ_dynamic_data(chip);
+
+ if (pcap == ddata->cur_pwr_cap)
+ return OPAL_SUCCESS;
+
+ if (pcap && (pcap > ddata->max_pwr_cap ||
+ pcap < ddata->soft_min_pwr_cap))
+ return OPAL_PARAMETER;
+
+ pcap_cdata = pcap;
+ return opal_occ_command(&chips[i], token, &pcap_data);
+};
+
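+/*
+ * Usage note: occ_set_powercap() is asynchronous. When the new cap is
+ * accepted the call returns OPAL_ASYNC_COMPLETION and the final status
+ * is delivered later through the async token via queue_occ_rsp_msg(),
+ * once the master OCC responds (or the command times out). Only the
+ * "current" powercap attribute can be set here; min/max/hard-min are
+ * read-only.
+ */
+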
+/* Power-Shifting Ratio */
+enum psr_type {
+ PSR_TYPE_CPU_TO_GPU, /* 0% Cap GPU first, 100% Cap CPU first */
+};
+
+int occ_get_psr(u32 handle, u32 *ratio)
+{
+ struct occ_dynamic_data *ddata;
+ struct proc_chip *chip;
+ u8 i = psr_get_rid(handle);
+
+ if (psr_get_type(handle) != PSR_TYPE_CPU_TO_GPU)
+ return OPAL_UNSUPPORTED;
+
+ if (i > nr_occs)
+ return OPAL_UNSUPPORTED;
+
+ if (!(*chips[i].valid))
+ return OPAL_HARDWARE;
+
+ chip = get_chip(chips[i].chip_id);
+ ddata = get_occ_dynamic_data(chip);
+ *ratio = ddata->pwr_shifting_ratio;
+ return OPAL_SUCCESS;
+}
+
+static u8 psr_cdata;
+static struct opal_occ_cmd_data psr_data = {
+ .data = &psr_cdata,
+ .cmd = OCC_CMD_SET_POWER_SHIFTING_RATIO,
+};
+
+int occ_set_psr(u32 handle, int token, u32 ratio)
+{
+ struct occ_dynamic_data *ddata;
+ struct proc_chip *chip;
+ u8 i = psr_get_rid(handle);
+
+ if (psr_get_type(handle) != PSR_TYPE_CPU_TO_GPU)
+ return OPAL_UNSUPPORTED;
+
+ if (ratio > 100)
+ return OPAL_PARAMETER;
+
+ if (i > nr_occs)
+ return OPAL_UNSUPPORTED;
+
+ if (!(*chips[i].valid))
+ return OPAL_HARDWARE;
+
+ chip = get_chip(chips[i].chip_id);
+ ddata = get_occ_dynamic_data(chip);
+ if (ratio == ddata->pwr_shifting_ratio)
+ return OPAL_SUCCESS;
+
+ psr_cdata = ratio;
+ return opal_occ_command(&chips[i], token, &psr_data);
+}
+
+static void occ_add_psr_sensors(struct dt_node *power_mgt)
+{
+ struct dt_node *node;
+ int i;
+
+ node = dt_new(power_mgt, "psr");
+ if (!node) {
+ prerror("OCC: Failed to create power-shifting-ratio node\n");
+ return;
+ }
+
+ dt_add_property_string(node, "compatible",
+ "ibm,opal-power-shift-ratio");
+ dt_add_property_cells(node, "#address-cells", 1);
+ dt_add_property_cells(node, "#size-cells", 0);
+ for (i = 0; i < nr_occs; i++) {
+ struct dt_node *cnode;
+ char name[20];
+ u32 handle = psr_make_handle(PSR_CLASS_OCC, i,
+ PSR_TYPE_CPU_TO_GPU);
+
+ cnode = dt_new_addr(node, "cpu-to-gpu", handle);
+ if (!cnode) {
+ prerror("OCC: Failed to create power-shifting-ratio node\n");
+ return;
+ }
+
+ snprintf(name, 20, "cpu_to_gpu_%d", chips[i].chip_id);
+ dt_add_property_string(cnode, "label", name);
+ dt_add_property_cells(cnode, "handle", handle);
+ dt_add_property_cells(cnode, "reg", chips[i].chip_id);
+ }
+}
+
+/* OCC clear sensor limits CSM/Profiler/Job-scheduler */
+
+enum occ_sensor_limit_group {
+ OCC_SENSOR_LIMIT_GROUP_CSM = 0x10,
+ OCC_SENSOR_LIMIT_GROUP_PROFILER = 0x20,
+ OCC_SENSOR_LIMIT_GROUP_JOB_SCHED = 0x40,
+};
+
+static u32 sensor_limit;
+static struct opal_occ_cmd_data slimit_data = {
+ .data = (u8 *)&sensor_limit,
+ .cmd = OCC_CMD_CLEAR_SENSOR_DATA,
+};
+
+int occ_sensor_group_clear(u32 group_hndl, int token)
+{
+ u32 limit = sensor_get_rid(group_hndl);
+ u8 i = sensor_get_attr(group_hndl);
+
+ if (i > nr_occs)
+ return OPAL_UNSUPPORTED;
+
+ switch (limit) {
+ case OCC_SENSOR_LIMIT_GROUP_CSM:
+ case OCC_SENSOR_LIMIT_GROUP_PROFILER:
+ case OCC_SENSOR_LIMIT_GROUP_JOB_SCHED:
+ break;
+ default:
+ return OPAL_UNSUPPORTED;
+ }
+
+ if (!(*chips[i].valid))
+ return OPAL_HARDWARE;
+
+ sensor_limit = limit << 24;
+ return opal_occ_command(&chips[i], token, &slimit_data);
+}
+
+static u16 sensor_enable;
+static struct opal_occ_cmd_data sensor_mask_data = {
+ .data = (u8 *)&sensor_enable,
+ .cmd = OCC_CMD_SELECT_SENSOR_GROUP,
+};
+
+int occ_sensor_group_enable(u32 group_hndl, int token, bool enable)
+{
+ u16 type = sensor_get_rid(group_hndl);
+ u8 i = sensor_get_attr(group_hndl);
+
+ if (i > nr_occs)
+ return OPAL_UNSUPPORTED;
+
+ switch (type) {
+ case OCC_SENSOR_TYPE_GENERIC:
+ case OCC_SENSOR_TYPE_CURRENT:
+ case OCC_SENSOR_TYPE_VOLTAGE:
+ case OCC_SENSOR_TYPE_TEMPERATURE:
+ case OCC_SENSOR_TYPE_UTILIZATION:
+ case OCC_SENSOR_TYPE_TIME:
+ case OCC_SENSOR_TYPE_FREQUENCY:
+ case OCC_SENSOR_TYPE_POWER:
+ case OCC_SENSOR_TYPE_PERFORMANCE:
+ break;
+ default:
+ return OPAL_UNSUPPORTED;
+ }
+
+ if (!(*chips[i].valid))
+ return OPAL_HARDWARE;
+
+ if (enable && (type & chips[i].enabled_sensor_mask))
+ return OPAL_SUCCESS;
+ else if (!enable && !(type & chips[i].enabled_sensor_mask))
+ return OPAL_SUCCESS;
+
+ sensor_enable = enable ? type | chips[i].enabled_sensor_mask :
+ ~type & chips[i].enabled_sensor_mask;
+
+ return opal_occ_command(&chips[i], token, &sensor_mask_data);
+}
+
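+/*
+ * Worked example for the mask update above, assuming the
+ * OCC_SENSOR_TYPE_* values are single bit flags (which is how the
+ * arithmetic treats them): if enabled_sensor_mask currently contains
+ * OCC_SENSOR_TYPE_POWER | OCC_SENSOR_TYPE_TEMPERATURE and a request
+ * comes in to disable the power group, the command data becomes
+ * ~POWER & mask, i.e. just OCC_SENSOR_TYPE_TEMPERATURE; once the OCC
+ * acknowledges the command, handle_occ_rsp() stores that value back
+ * into enabled_sensor_mask.
+ */
+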
+void occ_add_sensor_groups(struct dt_node *sg, __be32 *phandles, u32 *ptype,
+ int nr_phandles, int chipid)
+{
+ struct group_info {
+ int type;
+ const char *str;
+ u32 ops;
+ } groups[] = {
+ { OCC_SENSOR_LIMIT_GROUP_CSM, "csm",
+ OPAL_SENSOR_GROUP_CLEAR
+ },
+ { OCC_SENSOR_LIMIT_GROUP_PROFILER, "profiler",
+ OPAL_SENSOR_GROUP_CLEAR
+ },
+ { OCC_SENSOR_LIMIT_GROUP_JOB_SCHED, "js",
+ OPAL_SENSOR_GROUP_CLEAR
+ },
+ { OCC_SENSOR_TYPE_GENERIC, "generic",
+ OPAL_SENSOR_GROUP_ENABLE
+ },
+ { OCC_SENSOR_TYPE_CURRENT, "curr",
+ OPAL_SENSOR_GROUP_ENABLE
+ },
+ { OCC_SENSOR_TYPE_VOLTAGE, "in",
+ OPAL_SENSOR_GROUP_ENABLE
+ },
+ { OCC_SENSOR_TYPE_TEMPERATURE, "temp",
+ OPAL_SENSOR_GROUP_ENABLE
+ },
+ { OCC_SENSOR_TYPE_UTILIZATION, "utilization",
+ OPAL_SENSOR_GROUP_ENABLE
+ },
+ { OCC_SENSOR_TYPE_TIME, "time",
+ OPAL_SENSOR_GROUP_ENABLE
+ },
+ { OCC_SENSOR_TYPE_FREQUENCY, "frequency",
+ OPAL_SENSOR_GROUP_ENABLE
+ },
+ { OCC_SENSOR_TYPE_POWER, "power",
+ OPAL_SENSOR_GROUP_ENABLE
+ },
+ { OCC_SENSOR_TYPE_PERFORMANCE, "performance",
+ OPAL_SENSOR_GROUP_ENABLE
+ },
+ };
+ int i, j;
+
+ /*
+ * Don't add sensor groups if the cmd-interface is not initialized
+ */
+ if (!chips)
+ return;
+
+ for (i = 0; i < nr_occs; i++)
+ if (chips[i].chip_id == chipid)
+ break;
+
+ for (j = 0; j < ARRAY_SIZE(groups); j++) {
+ struct dt_node *node;
+ char name[20];
+ u32 handle;
+
+ snprintf(name, 20, "occ-%s", groups[j].str);
+ handle = sensor_make_handler(SENSOR_OCC, 0,
+ groups[j].type, i);
+ node = dt_new_addr(sg, name, handle);
+ if (!node) {
+ prerror("Failed to create sensor group nodes\n");
+ return;
+ }
+
+ dt_add_property_cells(node, "sensor-group-id", handle);
+ dt_add_property_string(node, "type", groups[j].str);
+
+ if (groups[j].type == OCC_SENSOR_TYPE_CURRENT ||
+ groups[j].type == OCC_SENSOR_TYPE_VOLTAGE ||
+ groups[j].type == OCC_SENSOR_TYPE_TEMPERATURE ||
+ groups[j].type == OCC_SENSOR_TYPE_POWER) {
+ dt_add_property_string(node, "sensor-type",
+ groups[j].str);
+ dt_add_property_string(node, "compatible",
+ "ibm,opal-sensor");
+ }
+
+ dt_add_property_cells(node, "ibm,chip-id", chipid);
+ dt_add_property_cells(node, "reg", handle);
+ if (groups[j].ops == OPAL_SENSOR_GROUP_ENABLE) {
+ __be32 *_phandles;
+ int k, pcount = 0;
+
+ _phandles = malloc(sizeof(u32) * nr_phandles);
+ assert(_phandles);
+ for (k = 0; k < nr_phandles; k++)
+ if (ptype[k] == groups[j].type)
+ _phandles[pcount++] = phandles[k];
+ if (pcount)
+ dt_add_property(node, "sensors", _phandles,
+ pcount * sizeof(u32));
+ free(_phandles);
+ } else {
+ dt_add_property(node, "sensors", phandles,
+ nr_phandles * sizeof(u32));
+ }
+ dt_add_property_cells(node, "ops", groups[j].ops);
+ }
+}
+
+/* CPU-OCC PState init */
+/* Called after OCC init on P8, P9 and P10 */
+void occ_pstates_init(void)
+{
+ struct proc_chip *chip;
+ struct cpu_thread *c;
+ struct dt_node *power_mgt;
+ int pstate_nom;
+ u32 freq_domain_mask;
+ u8 domain_runs_at;
+ static bool occ_pstates_initialized;
+
+ power_mgt = dt_find_by_path(dt_root, "/ibm,opal/power-mgt");
+ if (!power_mgt) {
+ /**
+ * @fwts-label OCCDTNodeNotFound
+ * @fwts-advice Device tree node /ibm,opal/power-mgt not
+ * found. OPAL didn't add pstate information to device tree.
+ * Probably a firmware bug.
+ */
+ prlog(PR_ERR, "OCC: dt node /ibm,opal/power-mgt not found\n");
+ return;
+ }
+
+ /* Handle fast reboots */
+ if (occ_pstates_initialized) {
+ struct dt_node *child;
+ int i;
+ const char *props[] = {
+ "ibm,pstate-core-max",
+ "ibm,pstate-frequencies-mhz",
+ "ibm,pstate-ids",
+ "ibm,pstate-max",
+ "ibm,pstate-min",
+ "ibm,pstate-nominal",
+ "ibm,pstate-turbo",
+ "ibm,pstate-ultra-turbo",
+ "ibm,pstate-base",
+ "#address-cells",
+ "#size-cells",
+ };
+
+ for (i = 0; i < ARRAY_SIZE(props); i++)
+ dt_check_del_prop(power_mgt, props[i]);
+
+ dt_for_each_child(power_mgt, child)
+ if (!strncmp(child->name, "occ", 3))
+ dt_free(child);
+ }
+
+ switch (proc_gen) {
+ case proc_gen_p8:
+ homer_opal_data_offset = P8_HOMER_OPAL_DATA_OFFSET;
+ break;
+ case proc_gen_p9:
+ case proc_gen_p10:
+ homer_opal_data_offset = P9_HOMER_OPAL_DATA_OFFSET;
+ break;
+ default:
+ return;
+ }
+
+ chip = next_chip(NULL);
+ if (!chip->homer_base) {
+ log_simple_error(&e_info(OPAL_RC_OCC_PSTATE_INIT),
+ "OCC: No HOMER detected, assuming no pstates\n");
+ return;
+ }
+
+ /* Wait for all OCC to boot up */
+ if (!wait_for_all_occ_init()) {
+ log_simple_error(&e_info(OPAL_RC_OCC_TIMEOUT),
+ "OCC: Initialization on all chips did not complete"
+ "(timed out)\n");
+ return;
+ }
+
+ /*
+ * Check boundary conditions and add device tree nodes
+ * and return nominal pstate to set for the core
+ */
+ if (!add_cpu_pstate_properties(power_mgt, &pstate_nom)) {
+ log_simple_error(&e_info(OPAL_RC_OCC_PSTATE_INIT),
+ "Skiping core cpufreq init due to OCC error\n");
+ } else if (proc_gen == proc_gen_p8) {
+ /*
+ * Setup host based pstates and set nominal frequency only in
+ * P8.
+ */
+ for_each_chip(chip)
+ for_each_available_core_in_chip(c, chip->id)
+ cpu_pstates_prepare_core(chip, c, pstate_nom);
+ }
+
+ if (occ_pstates_initialized)
+ return;
+
+ /* Add opal_poller to poll OCC throttle status of each chip */
+ for_each_chip(chip)
+ chip->throttle = 0;
+ opal_add_poller(occ_throttle_poll, NULL);
+ occ_pstates_initialized = true;
+
+ /* Init OPAL-OCC command-response interface */
+ occ_cmd_interface_init();
+
+ /* TODO Firmware plumbing required so as to have two modes to set
+ * PMCR based on max in domain or most recently used. As of today,
+ * it is always max in domain for P9.
+ */
+ domain_runs_at = 0;
+ freq_domain_mask = 0;
+ if (proc_gen == proc_gen_p8) {
+ freq_domain_mask = P8_PIR_CORE_MASK;
+ domain_runs_at = FREQ_MOST_RECENTLY_SET;
+ } else if (proc_gen == proc_gen_p9) {
+ freq_domain_mask = P9_PIR_QUAD_MASK;
+ domain_runs_at = FREQ_MAX_IN_DOMAIN;
+ } else if (proc_gen == proc_gen_p10) {
+ freq_domain_mask = P10_PIR_CHIP_MASK;
+ domain_runs_at = FREQ_MAX_IN_DOMAIN;
+ } else {
+ assert(0);
+ }
+
+ dt_add_property_cells(power_mgt, "freq-domain-mask", freq_domain_mask);
+ dt_add_property_cells(power_mgt, "domain-runs-at", domain_runs_at);
+}
+
+int find_master_and_slave_occ(uint64_t **master, uint64_t **slave,
+ int *nr_masters, int *nr_slaves)
+{
+ struct proc_chip *chip;
+ int nr_chips = 0, i;
+ uint64_t chipids[MAX_CHIPS];
+
+ for_each_chip(chip) {
+ chipids[nr_chips++] = chip->id;
+ }
+
+ chip = next_chip(NULL);
+ /*
+ * Proc0 is the master OCC for Tuleta/Alpine boxes.
+ * Hostboot expects the pair of chips for MURANO, so pass the sibling
+ * chip id along with proc0 to hostboot.
+ */
+ *nr_masters = (chip->type == PROC_CHIP_P8_MURANO) ? 2 : 1;
+ *master = (uint64_t *)malloc(*nr_masters * sizeof(uint64_t));
+
+ if (!*master) {
+ printf("OCC: master array alloc failure\n");
+ return -ENOMEM;
+ }
+
+ if (nr_chips - *nr_masters > 0) {
+ *nr_slaves = nr_chips - *nr_masters;
+ *slave = (uint64_t *)malloc(*nr_slaves * sizeof(uint64_t));
+ if (!*slave) {
+ printf("OCC: slave array alloc failure\n");
+ return -ENOMEM;
+ }
+ }
+
+ for (i = 0; i < nr_chips; i++) {
+ if (i < *nr_masters) {
+ *(*master + i) = chipids[i];
+ continue;
+ }
+ *(*slave + i - *nr_masters) = chipids[i];
+ }
+ return 0;
+}
+
+
+int occ_msg_queue_occ_reset(void)
+{
+ struct opal_occ_msg occ_msg = { CPU_TO_BE64(OCC_RESET), 0, 0 };
+ struct proc_chip *chip;
+ int rc;
+
+ lock(&occ_lock);
+ rc = _opal_queue_msg(OPAL_MSG_OCC, NULL, NULL,
+ sizeof(struct opal_occ_msg), &occ_msg);
+ if (rc) {
+ prlog(PR_INFO, "OCC: Failed to queue OCC_RESET message\n");
+ goto out;
+ }
+ /*
+ * Set 'valid' byte of occ_pstate_table to 0 since OCC
+ * may not clear this byte on a reset.
+ * OCC will set the 'valid' byte to 1 when it becomes
+ * active again.
+ */
+ for_each_chip(chip) {
+ struct occ_pstate_table *occ_data;
+
+ occ_data = get_occ_pstate_table(chip);
+ occ_data->valid = 0;
+ chip->throttle = 0;
+ }
+ occ_reset = true;
+out:
+ unlock(&occ_lock);
+ return rc;
+}
+
+#define PV_OCC_GP0 0x01000000
+#define PV_OCC_GP0_AND 0x01000004
+#define PV_OCC_GP0_OR 0x01000005
+#define PV_OCC_GP0_PNOR_OWNER PPC_BIT(18) /* 1 = OCC / Host, 0 = BMC */
+
+static void occ_pnor_set_one_owner(uint32_t chip_id, enum pnor_owner owner)
+{
+ uint64_t reg, mask;
+
+ if (owner == PNOR_OWNER_HOST) {
+ reg = PV_OCC_GP0_OR;
+ mask = PV_OCC_GP0_PNOR_OWNER;
+ } else {
+ reg = PV_OCC_GP0_AND;
+ mask = ~PV_OCC_GP0_PNOR_OWNER;
+ }
+
+ xscom_write(chip_id, reg, mask);
+}
+
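+/*
+ * The OR/AND forms of the GP0 register give an atomic set/clear of
+ * individual bits without a read-modify-write: writing the owner bit
+ * to the OR address above hands the PNOR to the host/OCC side, while
+ * writing its complement to the AND address clears the bit and hands
+ * the flash back to the BMC.
+ */
+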
+void occ_pnor_set_owner(enum pnor_owner owner)
+{
+ struct proc_chip *chip;
+
+ for_each_chip(chip)
+ occ_pnor_set_one_owner(chip->id, owner);
+}
+
+
+#define P8_OCB_OCI_OCCMISC 0x6a020
+#define P8_OCB_OCI_OCCMISC_AND 0x6a021
+#define P8_OCB_OCI_OCCMISC_OR 0x6a022
+
+#define P9_OCB_OCI_OCCMISC 0x6c080
+#define P9_OCB_OCI_OCCMISC_CLEAR 0x6c081
+#define P9_OCB_OCI_OCCMISC_OR 0x6c082
+
+#define OCB_OCI_OCIMISC_IRQ PPC_BIT(0)
+#define OCB_OCI_OCIMISC_IRQ_TMGT PPC_BIT(1)
+#define OCB_OCI_OCIMISC_IRQ_SLW_TMR PPC_BIT(14)
+#define OCB_OCI_OCIMISC_IRQ_OPAL_DUMMY PPC_BIT(15)
+
+#define P8_OCB_OCI_OCIMISC_MASK (OCB_OCI_OCIMISC_IRQ_TMGT | \
+ OCB_OCI_OCIMISC_IRQ_OPAL_DUMMY | \
+ OCB_OCI_OCIMISC_IRQ_SLW_TMR)
+
+#define OCB_OCI_OCIMISC_IRQ_I2C PPC_BIT(2)
+#define OCB_OCI_OCIMISC_IRQ_SHMEM PPC_BIT(3)
+#define P9_OCB_OCI_OCIMISC_MASK (OCB_OCI_OCIMISC_IRQ_TMGT | \
+ OCB_OCI_OCIMISC_IRQ_I2C | \
+ OCB_OCI_OCIMISC_IRQ_SHMEM | \
+ OCB_OCI_OCIMISC_IRQ_OPAL_DUMMY)
+
+void occ_send_dummy_interrupt(void)
+{
+ struct psi *psi;
+ struct proc_chip *chip = get_chip(this_cpu()->chip_id);
+
+ /* Emulators don't do this */
+ if (chip_quirk(QUIRK_NO_OCC_IRQ))
+ return;
+
+ /* Find a functional PSI. This ensures an interrupt even if
+ * the psihb on the current chip is not configured */
+ if (chip->psi)
+ psi = chip->psi;
+ else
+ psi = psi_find_functional_chip();
+
+ if (!psi) {
+ prlog_once(PR_WARNING, "PSI: no functional PSI HB found, "
+ "no self interrupts delivered\n");
+ return;
+ }
+
+ switch (proc_gen) {
+ case proc_gen_p8:
+ xscom_write(psi->chip_id, P8_OCB_OCI_OCCMISC_OR,
+ OCB_OCI_OCIMISC_IRQ |
+ OCB_OCI_OCIMISC_IRQ_OPAL_DUMMY);
+ break;
+ case proc_gen_p9:
+ xscom_write(psi->chip_id, P9_OCB_OCI_OCCMISC_OR,
+ OCB_OCI_OCIMISC_IRQ |
+ OCB_OCI_OCIMISC_IRQ_OPAL_DUMMY);
+ break;
+ case proc_gen_p10:
+ xscom_write(psi->chip_id, P9_OCB_OCI_OCCMISC_OR,
+ OCB_OCI_OCIMISC_IRQ |
+ OCB_OCI_OCIMISC_IRQ_OPAL_DUMMY);
+ break;
+ default:
+ break;
+ }
+}
+
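+/*
+ * The "dummy" source bit written above is not tied to any hardware
+ * event; raising OCB_OCI_OCIMISC_IRQ with only that bit set is, in
+ * effect, a way for OPAL to interrupt itself through the OCC/PSI
+ * interrupt path. The handlers below clear it along with the other
+ * source bits but do not dispatch anything for it.
+ */
+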
+void occ_p8_interrupt(uint32_t chip_id)
+{
+ uint64_t ireg;
+ int64_t rc;
+
+ /* The OCC interrupt is used to mux up to 15 different sources */
+ rc = xscom_read(chip_id, P8_OCB_OCI_OCCMISC, &ireg);
+ if (rc) {
+ prerror("OCC: Failed to read interrupt status !\n");
+ /* Should we mask it in the XIVR ? */
+ return;
+ }
+ prlog(PR_TRACE, "OCC: IRQ received: %04llx\n", ireg >> 48);
+
+ /* Clear the bits */
+ xscom_write(chip_id, P8_OCB_OCI_OCCMISC_AND, ~ireg);
+
+ /* Dispatch */
+ if (ireg & OCB_OCI_OCIMISC_IRQ_TMGT)
+ prd_tmgt_interrupt(chip_id);
+ if (ireg & OCB_OCI_OCIMISC_IRQ_SLW_TMR)
+ check_timers(true);
+
+ /* We may have masked-out OCB_OCI_OCIMISC_IRQ in the previous
+ * OCCMISC_AND write. Check if there are any new source bits set,
+ * and trigger another interrupt if so.
+ */
+ rc = xscom_read(chip_id, P8_OCB_OCI_OCCMISC, &ireg);
+ if (!rc && (ireg & P8_OCB_OCI_OCIMISC_MASK))
+ xscom_write(chip_id, P8_OCB_OCI_OCCMISC_OR,
+ OCB_OCI_OCIMISC_IRQ);
+}
+
+void occ_p9_interrupt(uint32_t chip_id)
+{
+ u64 ireg;
+ s64 rc;
+
+ /* The OCC interrupt is used to mux up to 15 different sources */
+ rc = xscom_read(chip_id, P9_OCB_OCI_OCCMISC, &ireg);
+ if (rc) {
+ prerror("OCC: Failed to read interrupt status !\n");
+ return;
+ }
+ prlog(PR_TRACE, "OCC: IRQ received: %04llx\n", ireg >> 48);
+
+ /* Clear the bits */
+ xscom_write(chip_id, P9_OCB_OCI_OCCMISC_CLEAR, ireg);
+
+ /* Dispatch */
+ if (ireg & OCB_OCI_OCIMISC_IRQ_TMGT)
+ prd_tmgt_interrupt(chip_id);
+
+ if (ireg & OCB_OCI_OCIMISC_IRQ_SHMEM) {
+ occ_throttle_poll(NULL);
+ handle_occ_rsp(chip_id);
+ }
+
+ if (ireg & OCB_OCI_OCIMISC_IRQ_I2C)
+ p9_i2c_bus_owner_change(chip_id);
+
+ /* We may have masked-out OCB_OCI_OCIMISC_IRQ in the previous
+ * OCCMISC_AND write. Check if there are any new source bits set,
+ * and trigger another interrupt if so.
+ */
+ rc = xscom_read(chip_id, P9_OCB_OCI_OCCMISC, &ireg);
+ if (!rc && (ireg & P9_OCB_OCI_OCIMISC_MASK))
+ xscom_write(chip_id, P9_OCB_OCI_OCCMISC_OR,
+ OCB_OCI_OCIMISC_IRQ);
+}
diff --git a/roms/skiboot/hw/ocmb.c b/roms/skiboot/hw/ocmb.c
new file mode 100644
index 000000000..bc470d0ab
--- /dev/null
+++ b/roms/skiboot/hw/ocmb.c
@@ -0,0 +1,167 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Open Capi Memory Buffer chip
+ *
+ * Copyright 2020 IBM Corp.
+ */
+
+
+#define pr_fmt(fmt) "OCMB: " fmt
+
+#include <skiboot.h>
+#include <xscom.h>
+#include <device.h>
+#include <ocmb.h>
+#include <io.h>
+#include <inttypes.h>
+
+struct ocmb_range {
+ uint64_t start;
+ uint64_t end;
+ uint64_t flags;
+
+ /* flags come from hdat */
+#define ACCESS_8B PPC_BIT(0)
+#define ACCESS_4B PPC_BIT(1)
+#define ACCESS_SIZE_MASK (ACCESS_8B | ACCESS_4B)
+};
+
+struct ocmb {
+ struct scom_controller scom;
+ int range_count;
+ struct ocmb_range ranges[];
+};
+
+static const struct ocmb_range *find_range(const struct ocmb *o, uint64_t offset)
+{
+ int i;
+ uint64_t addr = offset & ~(HRMOR_BIT);
+
+ for (i = 0; i < o->range_count; i++) {
+ uint64_t start = o->ranges[i].start;
+ uint64_t end = o->ranges[i].end;
+
+ if (addr >= start && addr <= end)
+ return &o->ranges[i];
+ }
+
+ return NULL;
+}
+
+static int64_t ocmb_fake_scom_write(struct scom_controller *f,
+ uint32_t __unused chip_id,
+ uint64_t offset, uint64_t val)
+{
+ const struct ocmb *o = f->private;
+ const struct ocmb_range *r;
+
+ r = find_range(o, offset);
+ if (!r) {
+ prerror("no matching address range!\n");
+ return OPAL_XSCOM_ADDR_ERROR;
+ }
+
+ switch (r->flags & ACCESS_SIZE_MASK) {
+ case ACCESS_8B:
+ if (offset & 0x7)
+ return OPAL_XSCOM_ADDR_ERROR;
+ out_be64((void *) offset, val);
+ break;
+
+ case ACCESS_4B:
+ if (offset & 0x3)
+ return OPAL_XSCOM_ADDR_ERROR;
+ out_be32((void *) offset, val);
+ break;
+ default:
+ prerror("bad flags? %llx\n", r->flags);
+ return OPAL_XSCOM_ADDR_ERROR;
+ }
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t ocmb_fake_scom_read(struct scom_controller *f,
+ uint32_t chip_id __unused,
+ uint64_t offset, uint64_t *val)
+{
+ const struct ocmb *o = f->private;
+ const struct ocmb_range *r = NULL;
+
+ r = find_range(o, offset);
+ if (!r) {
+ prerror("no matching address range!\n");
+ return OPAL_XSCOM_ADDR_ERROR;
+ }
+
+
+ switch (r->flags & ACCESS_SIZE_MASK) {
+ case ACCESS_8B:
+ if (offset & 0x7)
+ return OPAL_XSCOM_ADDR_ERROR;
+ *val = in_be64((void *) offset);
+ break;
+
+ case ACCESS_4B:
+ if (offset & 0x3)
+ return OPAL_XSCOM_ADDR_ERROR;
+ *val = in_be32((void *) offset);
+ break;
+ default:
+ prerror("bad flags? %llx\n", r->flags);
+ return OPAL_XSCOM_ADDR_ERROR;
+ }
+
+ return OPAL_SUCCESS;
+}
+
+static bool ocmb_probe_one(struct dt_node *ocmb_node)
+{
+ uint64_t chip_id = dt_prop_get_u32(ocmb_node, "ibm,chip-id");
+ const struct dt_property *flags;
+ int i = 0, num = 0;
+ struct ocmb *ocmb;
+
+ num = dt_count_addresses(ocmb_node);
+
+ ocmb = zalloc(sizeof(*ocmb) + sizeof(*ocmb->ranges) * num);
+ if (!ocmb)
+ return false;
+
+ ocmb->scom.private = ocmb;
+ ocmb->scom.part_id = chip_id;
+ ocmb->scom.write = ocmb_fake_scom_write;
+ ocmb->scom.read = ocmb_fake_scom_read;
+ ocmb->range_count = num;
+
+ flags = dt_require_property(ocmb_node, "flags", sizeof(u64) * num);
+
+ for (i = 0; i < num; i++) {
+ uint64_t start, size;
+
+ start = dt_get_address(ocmb_node, i, &size);
+
+ ocmb->ranges[i].start = start;
+ ocmb->ranges[i].end = start + size - 1;
+ ocmb->ranges[i].flags = dt_property_get_u64(flags, i);
+
+ prlog(PR_DEBUG, "Added range: %" PRIx64 " - [%llx - %llx]\n",
+ chip_id, start, start + size - 1);
+ }
+
+ if (scom_register(&ocmb->scom))
+ prerror("Error registering fake scom\n");
+
+ dt_add_property(ocmb_node, "scom-controller", NULL, 0);
+ prlog(PR_NOTICE, "Added scom controller for %s\n", ocmb_node->name);
+
+ return true;
+}
+
+void ocmb_init(void)
+{
+ struct dt_node *dn;
+
+ dt_for_each_compatible(dt_root, dn, "ibm,explorer")
+ ocmb_probe_one(dn);
+}
diff --git a/roms/skiboot/hw/p8-i2c.c b/roms/skiboot/hw/p8-i2c.c
new file mode 100644
index 000000000..45815858e
--- /dev/null
+++ b/roms/skiboot/hw/p8-i2c.c
@@ -0,0 +1,1688 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * P8 i2c master
+ *
+ * Copyright 2013-2019 IBM Corp.
+ */
+
+#undef DEBUG
+
+#include <opal.h>
+#include <skiboot.h>
+#include <mem_region-malloc.h>
+#include <lock.h>
+#include <chip.h>
+#include <i2c.h>
+#include <xscom.h>
+#include <timebase.h>
+#include <timer.h>
+#include <opal-msg.h>
+#include <errorlog.h>
+#include <centaur.h>
+#include <debug_descriptor.h>
+
+DEFINE_LOG_ENTRY(OPAL_RC_I2C_INIT, OPAL_PLATFORM_ERR_EVT, OPAL_I2C,
+ OPAL_IO_SUBSYSTEM, OPAL_PREDICTIVE_ERR_DEGRADED_PERF,
+ OPAL_NA);
+DEFINE_LOG_ENTRY(OPAL_RC_I2C_START_REQ, OPAL_INPUT_OUTPUT_ERR_EVT, OPAL_I2C,
+ OPAL_IO_SUBSYSTEM, OPAL_INFO, OPAL_NA);
+DEFINE_LOG_ENTRY(OPAL_RC_I2C_TIMEOUT, OPAL_INPUT_OUTPUT_ERR_EVT, OPAL_I2C,
+ OPAL_IO_SUBSYSTEM, OPAL_INFO, OPAL_NA);
+DEFINE_LOG_ENTRY(OPAL_RC_I2C_TRANSFER, OPAL_INPUT_OUTPUT_ERR_EVT, OPAL_I2C,
+ OPAL_IO_SUBSYSTEM, OPAL_INFO, OPAL_NA);
+DEFINE_LOG_ENTRY(OPAL_RC_I2C_RESET, OPAL_INPUT_OUTPUT_ERR_EVT, OPAL_I2C,
+ OPAL_IO_SUBSYSTEM, OPAL_INFO, OPAL_NA);
+
+#ifdef DEBUG
+#define DBG(fmt...) prlog(PR_ERR, "I2C: " fmt)
+#define I2C_TIMEOUT_IRQ_MS 100 /* 100ms/byte timeout */
+#define I2C_TIMEOUT_POLL_MS 4000 /* 4s/byte timeout */
+#else
+#define DBG(fmt...) prlog(PR_TRACE, "I2C: " fmt)
+#define I2C_TIMEOUT_IRQ_MS 1 /* 1ms/byte timeout */
+#define I2C_TIMEOUT_POLL_MS 4000 /* 4s/byte timeout */
+#endif
+
+/* How long to keep the sensor cache disabled after an access
+ * in milliseconds
+ */
+#define SENSOR_CACHE_EN_DELAY 10
+
+#define USEC_PER_SEC 1000000
+#define USEC_PER_MSEC 1000
+#define I2C_RESET_DELAY_MS 5 /* 5 msecs */
+#define I2C_FIFO_HI_LVL 4
+#define I2C_FIFO_LO_LVL 4
+
+/*
+ * I2C registers set.
+ * Below is the offset of registers from base which is stored in the
+ * 'struct p8_i2c_master'
+ */
+
+/* I2C FIFO register */
+#define I2C_FIFO_REG 0x4
+#define I2C_FIFO PPC_BITMASK(0, 7)
+
+/* I2C command register */
+#define I2C_CMD_REG 0x5
+#define I2C_CMD_WITH_START PPC_BIT(0)
+#define I2C_CMD_WITH_ADDR PPC_BIT(1)
+#define I2C_CMD_READ_CONT PPC_BIT(2)
+#define I2C_CMD_WITH_STOP PPC_BIT(3)
+#define I2C_CMD_INTR_STEERING PPC_BITMASK(6,7) /* P9 */
+#define I2C_CMD_INTR_STEER_HOST 1
+#define I2C_CMD_INTR_STEER_OCC 2
+#define I2C_CMD_DEV_ADDR PPC_BITMASK(8, 14)
+#define I2C_CMD_READ_NOT_WRITE PPC_BIT(15)
+#define I2C_CMD_LEN_BYTES PPC_BITMASK(16, 31)
+#define I2C_MAX_TFR_LEN 0xfff0ull
+
+/* I2C mode register */
+#define I2C_MODE_REG 0x6
+#define I2C_MODE_BIT_RATE_DIV PPC_BITMASK(0, 15)
+#define I2C_MODE_PORT_NUM PPC_BITMASK(16, 21)
+#define I2C_MODE_ENHANCED PPC_BIT(28)
+#define I2C_MODE_DIAGNOSTIC PPC_BIT(29)
+#define I2C_MODE_PACING_ALLOW PPC_BIT(30)
+#define I2C_MODE_WRAP PPC_BIT(31)
+
+/* I2C watermark register */
+#define I2C_WATERMARK_REG 0x7
+#define I2C_WATERMARK_HIGH PPC_BITMASK(16, 19)
+#define I2C_WATERMARK_LOW PPC_BITMASK(24, 27)
+
+/*
+ * I2C interrupt mask and condition registers
+ *
+ * NB: The function of 0x9 and 0xa changes depending on whether you're reading
+ * or writing to them. When read they return the interrupt condition bits
+ * and on writes they update the interrupt mask register.
+ *
+ * The bit definitions are the same for all the interrupt registers.
+ */
+#define I2C_INTR_MASK_REG 0x8
+
+#define I2C_INTR_RAW_COND_REG 0x9 /* read */
+#define I2C_INTR_MASK_OR_REG 0x9 /* write */
+
+#define I2C_INTR_COND_REG 0xa /* read */
+#define I2C_INTR_MASK_AND_REG 0xa /* write */
+
+#define I2C_INTR_ALL PPC_BITMASK(16, 31)
+#define I2C_INTR_INVALID_CMD PPC_BIT(16)
+#define I2C_INTR_LBUS_PARITY_ERR PPC_BIT(17)
+#define I2C_INTR_BKEND_OVERRUN_ERR PPC_BIT(18)
+#define I2C_INTR_BKEND_ACCESS_ERR PPC_BIT(19)
+#define I2C_INTR_ARBT_LOST_ERR PPC_BIT(20)
+#define I2C_INTR_NACK_RCVD_ERR PPC_BIT(21)
+#define I2C_INTR_DATA_REQ PPC_BIT(22)
+#define I2C_INTR_CMD_COMP PPC_BIT(23)
+#define I2C_INTR_STOP_ERR PPC_BIT(24)
+#define I2C_INTR_I2C_BUSY PPC_BIT(25)
+#define I2C_INTR_NOT_I2C_BUSY PPC_BIT(26)
+#define I2C_INTR_SCL_EQ_1 PPC_BIT(28)
+#define I2C_INTR_SCL_EQ_0 PPC_BIT(29)
+#define I2C_INTR_SDA_EQ_1 PPC_BIT(30)
+#define I2C_INTR_SDA_EQ_0 PPC_BIT(31)
+
+/* I2C status register */
+#define I2C_RESET_I2C_REG 0xb
+#define I2C_RESET_ERRORS 0xc
+#define I2C_STAT_REG 0xb
+#define I2C_STAT_INVALID_CMD PPC_BIT(0)
+#define I2C_STAT_LBUS_PARITY_ERR PPC_BIT(1)
+#define I2C_STAT_BKEND_OVERRUN_ERR PPC_BIT(2)
+#define I2C_STAT_BKEND_ACCESS_ERR PPC_BIT(3)
+#define I2C_STAT_ARBT_LOST_ERR PPC_BIT(4)
+#define I2C_STAT_NACK_RCVD_ERR PPC_BIT(5)
+#define I2C_STAT_DATA_REQ PPC_BIT(6)
+#define I2C_STAT_CMD_COMP PPC_BIT(7)
+#define I2C_STAT_STOP_ERR PPC_BIT(8)
+#define I2C_STAT_UPPER_THRS PPC_BITMASK(9, 15)
+#define I2C_STAT_ANY_I2C_INTR PPC_BIT(16)
+#define I2C_STAT_PORT_HISTORY_BUSY PPC_BIT(19)
+#define I2C_STAT_SCL_INPUT_LEVEL PPC_BIT(20)
+#define I2C_STAT_SDA_INPUT_LEVEL PPC_BIT(21)
+#define I2C_STAT_PORT_BUSY PPC_BIT(22)
+#define I2C_STAT_INTERFACE_BUSY PPC_BIT(23)
+#define I2C_STAT_FIFO_ENTRY_COUNT PPC_BITMASK(24, 31)
+
+#define I2C_STAT_ANY_ERR (I2C_STAT_INVALID_CMD | I2C_STAT_LBUS_PARITY_ERR | \
+ I2C_STAT_BKEND_OVERRUN_ERR | \
+ I2C_STAT_BKEND_ACCESS_ERR | I2C_STAT_ARBT_LOST_ERR | \
+ I2C_STAT_NACK_RCVD_ERR | I2C_STAT_STOP_ERR)
+
+
+#define I2C_INTR_ACTIVE \
+ ((I2C_STAT_ANY_ERR >> 16) | I2C_INTR_CMD_COMP | I2C_INTR_DATA_REQ)
+
+/* Pseudo-status used for timeouts */
+#define I2C_STAT_PSEUDO_TIMEOUT PPC_BIT(63)
+
+
+/* I2C extended status register */
+#define I2C_EXTD_STAT_REG 0xc
+#define I2C_EXTD_STAT_FIFO_SIZE PPC_BITMASK(0, 7)
+#define I2C_EXTD_STAT_MSM_CURSTATE PPC_BITMASK(11, 15)
+#define I2C_EXTD_STAT_SCL_IN_SYNC PPC_BIT(16)
+#define I2C_EXTD_STAT_SDA_IN_SYNC PPC_BIT(17)
+#define I2C_EXTD_STAT_S_SCL PPC_BIT(18)
+#define I2C_EXTD_STAT_S_SDA PPC_BIT(19)
+#define I2C_EXTD_STAT_M_SCL PPC_BIT(20)
+#define I2C_EXTD_STAT_M_SDA PPC_BIT(21)
+#define I2C_EXTD_STAT_HIGH_WATER PPC_BIT(22)
+#define I2C_EXTD_STAT_LOW_WATER PPC_BIT(23)
+#define I2C_EXTD_STAT_I2C_BUSY PPC_BIT(24)
+#define I2C_EXTD_STAT_SELF_BUSY PPC_BIT(25)
+#define I2C_EXTD_STAT_I2C_VERSION PPC_BITMASK(27, 31)
+
+/* I2C residual front end/back end length */
+#define I2C_RESIDUAL_LEN_REG 0xd
+#define I2C_RESIDUAL_FRONT_END PPC_BITMASK(0, 15)
+#define I2C_RESIDUAL_BACK_END PPC_BITMASK(16, 31)
+
+/* Port busy register */
+#define I2C_PORT_BUSY_REG 0xe
+#define I2C_SET_S_SCL_REG 0xd
+#define I2C_RESET_S_SCL_REG 0xf
+#define I2C_SET_S_SDA_REG 0x10
+#define I2C_RESET_S_SDA_REG 0x11
+
+enum p8_i2c_master_type {
+ I2C_POWER8,
+ I2C_CENTAUR,
+ MAX_I2C_TYPE,
+};
+
+struct p8_i2c_master {
+ struct dt_node *dt_node;
+ struct lock lock; /* Lock to guard the members */
+ enum p8_i2c_master_type type; /* P8 vs. Centaur */
+ uint64_t start_time; /* Request start time */
+ uint64_t last_update;
+ uint64_t poll_interval; /* Polling interval */
+ uint64_t xscom_base; /* xscom base of i2cm */
+ uint32_t fifo_size; /* Maximum size of FIFO */
+ uint32_t chip_id; /* Chip the i2cm sits on */
+ uint32_t engine_id; /* Engine# on chip */
+ uint8_t obuf[4]; /* Offset buffer */
+ uint32_t bytes_sent;
+ bool irq_ok; /* Interrupt working ? */
+ bool occ_cache_dis; /* I have disabled the cache */
+ bool occ_lock_acquired; /* Acquired lock from OCC */
+ enum request_state {
+ state_idle,
+ state_occache_dis,
+ state_offset,
+ state_data,
+ state_error,
+ state_recovery,
+ } state;
+ struct list_head req_list; /* Request queue head */
+ struct timer poller;
+ struct timer timeout;
+ struct timer recovery;
+ struct timer sensor_cache;
+ uint8_t recovery_pass;
+ struct list_node link;
+ struct list_head ports;
+};
+
+struct p8_i2c_master_port {
+ struct i2c_bus bus; /* Abstract bus struct for the client */
+ struct p8_i2c_master *master;
+ uint32_t port_num;
+ uint32_t bit_rate_div; /* Divisor to set bus speed */
+ uint64_t byte_timeout; /* Timeout per byte */
+ uint64_t poll_interval; /* Polling interval */
+ struct list_node link;
+};
+
+static int occ_i2c_unlock(struct p8_i2c_master *master);
+
+static int64_t i2cm_read_reg(struct p8_i2c_master *m, int reg, uint64_t *val)
+{
+ return xscom_read(m->chip_id, m->xscom_base + reg, val);
+}
+
+static int64_t i2cm_write_reg(struct p8_i2c_master *m, int reg, uint64_t val)
+{
+ return xscom_write(m->chip_id, m->xscom_base + reg, val);
+}
+
+static void p8_i2c_print_debug_info(struct p8_i2c_master_port *port,
+ struct i2c_request *req, uint64_t end_time)
+{
+ struct p8_i2c_master *master = port->master;
+ uint64_t cmd, mode, stat, estat, intm, intc;
+
+ /* Print master and request structure bits */
+ log_simple_error(&e_info(OPAL_RC_I2C_TRANSFER),
+ "I2C: Chip %08x Eng. %d Port %d--\n"
+ " xscom_base=0x%016llx\tstate=%d\tbytes_sent=%d\n",
+ master->chip_id, master->engine_id, port->port_num,
+ master->xscom_base, master->state, master->bytes_sent);
+
+ log_simple_error(&e_info(OPAL_RC_I2C_TRANSFER), "I2C: Request info--\n"
+ " addr=0x%04x\toffset_bytes=%d\toffset=%d\tlen=%d\n",
+ req->dev_addr, req->offset_bytes, req->offset,
+ req->rw_len);
+
+ log_simple_error(&e_info(OPAL_RC_I2C_TRANSFER), "I2C: "
+ " start_time=%016llx end_time=%016llx (duration=%016llx)\n",
+ master->start_time, end_time, end_time - master->start_time);
+
+ /* initialise to some fake value in case of read errors */
+ cmd = mode = stat = estat = intm = intc = 0xDEAD;
+
+ /* Dump the current state of i2c registers */
+ i2cm_read_reg(master, I2C_CMD_REG, &cmd);
+ i2cm_read_reg(master, I2C_MODE_REG, &mode);
+ i2cm_read_reg(master, I2C_STAT_REG, &stat);
+ i2cm_read_reg(master, I2C_EXTD_STAT_REG, &estat);
+ i2cm_read_reg(master, I2C_INTR_MASK_REG, &intm);
+ i2cm_read_reg(master, I2C_INTR_RAW_COND_REG, &intc);
+
+ log_simple_error(&e_info(OPAL_RC_I2C_TRANSFER), "I2C: Register dump--\n"
+ " cmd:0x%016llx\tmode:0x%016llx\tstat:0x%016llx\n"
+ " estat:0x%016llx\tintm:0x%016llx\tintc:0x%016llx\n",
+ cmd, mode, stat, estat, intm, intc);
+
+ log_simple_error(&e_info(OPAL_RC_I2C_TRANSFER),
+ "I2C: Error bits set: %s%s%s%s%s%s%s\n",
+ (stat & I2C_STAT_NACK_RCVD_ERR) ? "nack, " : "",
+ (stat & I2C_STAT_INVALID_CMD) ? "cmd invalid, " : "",
+ (stat & I2C_STAT_LBUS_PARITY_ERR) ? "internal parity, " : "",
+ (stat & I2C_STAT_BKEND_OVERRUN_ERR) ? "backend overrun, " : "",
+ (stat & I2C_STAT_BKEND_ACCESS_ERR) ? "backend access, " : "",
+ (stat & I2C_STAT_ARBT_LOST_ERR) ? "arbitration loss, " : "",
+ (stat & I2C_STAT_STOP_ERR) ? "stop error, " : "");
+}
+
+static bool p8_i2c_has_irqs(struct p8_i2c_master *master)
+{
+ struct proc_chip *chip;
+
+ /* Centaur I2C doesn't have interrupts */
+ if (master->type == I2C_CENTAUR)
+ return false;
+
+ chip = get_chip(master->chip_id);
+
+ /* The i2c interrupts were only added in Murano DD2.1 and Venice
+ * DD2.0. When operating without interrupts, we need to bump the
+ * timeouts as we rely solely on the polls from Linux, which can
+ * be up to 2s apart !
+ */
+ if (proc_gen >= proc_gen_p9)
+ return true;
+ else if (chip->type == PROC_CHIP_P8_MURANO)
+ return chip->ec_level >= 0x21;
+ else if (chip->type == PROC_CHIP_P8_VENICE)
+ return chip->ec_level >= 0x20;
+
+ return true;
+}
+
+static int p8_i2c_enable_irqs(struct p8_i2c_master *master)
+{
+ int rc;
+
+ /* enable interrupts we're interested in */
+ rc = i2cm_write_reg(master, I2C_INTR_MASK_OR_REG, I2C_INTR_ACTIVE);
+ if (rc)
+ prlog(PR_ERR, "I2C: Failed to enable the interrupts\n");
+
+ return rc;
+}
+
+static void p8_i2c_reset_timeout(struct p8_i2c_master *master,
+ struct i2c_request *req)
+{
+ uint64_t now = mftb();
+
+ master->last_update = now;
+ schedule_timer_at(&master->timeout, now + msecs_to_tb(req->timeout));
+}
+
+static int p8_i2c_prog_watermark(struct p8_i2c_master *master)
+{
+ uint64_t watermark;
+ int rc;
+
+ rc = xscom_read(master->chip_id, master->xscom_base + I2C_WATERMARK_REG,
+ &watermark);
+ if (rc) {
+ prlog(PR_ERR, "I2C: Failed to read the WATERMARK_REG\n");
+ return rc;
+ }
+
+ /* Set the high/low watermark */
+ watermark = SETFIELD(I2C_WATERMARK_HIGH, watermark, I2C_FIFO_HI_LVL);
+ watermark = SETFIELD(I2C_WATERMARK_LOW, watermark, I2C_FIFO_LO_LVL);
+ rc = xscom_write(master->chip_id, master->xscom_base +
+ I2C_WATERMARK_REG, watermark);
+ if (rc)
+ prlog(PR_ERR, "I2C: Failed to set high/low watermark level\n");
+
+ return rc;
+}
+
+static int p8_i2c_prog_mode(struct p8_i2c_master_port *port, bool enhanced_mode)
+{
+ struct p8_i2c_master *master = port->master;
+ uint64_t mode, omode;
+ int rc;
+
+ rc = xscom_read(master->chip_id, master->xscom_base +
+ I2C_MODE_REG, &mode);
+ if (rc) {
+ prlog(PR_ERR, "I2C: Failed to read the MODE_REG\n");
+ return rc;
+ }
+ omode = mode;
+ mode = SETFIELD(I2C_MODE_PORT_NUM, mode, port->port_num);
+ mode = SETFIELD(I2C_MODE_BIT_RATE_DIV, mode, port->bit_rate_div);
+ if (enhanced_mode)
+ mode |= I2C_MODE_ENHANCED;
+ else
+ mode &= ~I2C_MODE_ENHANCED;
+ if (mode == omode)
+ return 0;
+
+ rc = xscom_write(master->chip_id, master->xscom_base + I2C_MODE_REG,
+ mode);
+ if (rc)
+ prlog(PR_ERR, "I2C: Failed to write the MODE_REG\n");
+
+ return rc;
+}
+
+static void p8_i2c_complete_request(struct p8_i2c_master *master,
+ struct i2c_request *req, int ret)
+{
+ /* We only complete the current top level request */
+ assert(req == list_top(&master->req_list, struct i2c_request, link));
+
+ cancel_timer_async(&master->timeout);
+
+ list_del(&req->link);
+ master->state = state_idle;
+ req->result = ret;
+ req->req_state = i2c_req_done;
+
+ /* Schedule re-enabling of sensor cache */
+ if (master->occ_cache_dis)
+ schedule_timer(&master->sensor_cache,
+ msecs_to_tb(SENSOR_CACHE_EN_DELAY));
+
+ /* If we're done with i2c master, allow OCC to use it */
+ if (master->occ_lock_acquired && list_empty(&master->req_list))
+ occ_i2c_unlock(master);
+
+ unlock(&master->lock);
+ if (req->completion)
+ req->completion(ret, req);
+ /* req might have been freed at this point */
+ lock(&master->lock);
+}
+
+
+static int p8_i2c_engine_reset(struct p8_i2c_master_port *port)
+{
+ struct p8_i2c_master *master = port->master;
+ int rc;
+
+ /* Reset the i2c engine */
+ rc = xscom_write(master->chip_id, master->xscom_base +
+ I2C_RESET_I2C_REG, 0);
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_I2C_RESET), "I2C: Failed "
+ "to reset the i2c engine\n");
+ return rc;
+ }
+
+ /* Reprogram the watermark and mode */
+ rc = p8_i2c_prog_watermark(port->master);
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_I2C_RESET), "I2C: Failed to "
+ "program the WATERMARK_REG\n");
+ return rc;
+ }
+
+ rc = p8_i2c_prog_mode(port, false);
+ if (rc)
+ log_simple_error(&e_info(OPAL_RC_I2C_RESET), "I2C: Failed to "
+ "program the MODE_REG\n");
+
+ return rc;
+}
+
+static void p8_i2c_translate_error(struct i2c_request *req, uint64_t status)
+{
+ /* Assuming there are not more than one type of error simultaneously */
+ if (status & I2C_STAT_NACK_RCVD_ERR)
+ req->result = OPAL_I2C_NACK_RCVD;
+ else if (status & I2C_STAT_INVALID_CMD)
+ req->result = OPAL_I2C_INVALID_CMD;
+ else if (status & I2C_STAT_LBUS_PARITY_ERR)
+ req->result = OPAL_I2C_LBUS_PARITY;
+ else if (status & I2C_STAT_BKEND_OVERRUN_ERR)
+ req->result = OPAL_I2C_BKEND_OVERRUN;
+ else if (status & I2C_STAT_BKEND_ACCESS_ERR)
+ req->result = OPAL_I2C_BKEND_ACCESS;
+ else if (status & I2C_STAT_ARBT_LOST_ERR)
+ req->result = OPAL_I2C_ARBT_LOST;
+ else if (status & I2C_STAT_STOP_ERR)
+ req->result = OPAL_I2C_STOP_ERR;
+ else if (status & I2C_STAT_PSEUDO_TIMEOUT)
+ req->result = OPAL_I2C_TIMEOUT;
+}
+
+static int p8_i2c_reset_port(struct p8_i2c_master_port *p)
+{
+ struct p8_i2c_master *master = p->master;
+ int reset_loops, rc;
+ uint64_t status;
+
+ /* FIXME: this should be per-port rather than per-master */
+ master->state = state_error;
+
+ /*
+ * Put the master into enhanced STOP mode when recovering the
+ * port. This causes the master to send additional STOP conditions
+ * to work around some particularly stupid I2C devices and it's
+ * required on secure I2C masters since they will not send a bare
+ * stop condition.
+ */
+ rc = p8_i2c_prog_mode(p, true);
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_I2C_RESET),
+ "I2C: Failed to enable enhanced mode\n");
+ return -1;
+ }
+
+ rc = xscom_write(master->chip_id, master->xscom_base +
+ I2C_CMD_REG, I2C_CMD_WITH_STOP);
+ if (rc)
+ goto err;
+
+ /* Wait for COMMAND COMPLETE */
+ for (reset_loops = 0; reset_loops < 10; reset_loops++) {
+ time_wait_ms(10);
+
+ rc = xscom_read(master->chip_id,
+ master->xscom_base + I2C_STAT_REG,
+ &status);
+ if (rc)
+ goto err;
+
+ if (status & I2C_STAT_CMD_COMP)
+ break;
+ }
+
+ if (status & I2C_STAT_CMD_COMP)
+ return 0;
+err:
+ prerror("I2C: Failed to reset c%de%dp%d\n",
+ master->chip_id, master->engine_id, p->port_num);
+ return -1;
+}
+
+static void p8_i2c_status_error(struct p8_i2c_master_port *port,
+ struct i2c_request *req,
+ uint64_t status, uint64_t end_time)
+{
+ struct p8_i2c_master *master = port->master;
+ int rc;
+
+ /* Display any error other than I2C_STAT_NACK_RCVD_ERR or a
+ * timeout, since getting NACKs is normal if Linux is probing
+ * the bus and timeouts will have already logged something.
+ */
+ if (!(status & (I2C_STAT_NACK_RCVD_ERR | I2C_STAT_PSEUDO_TIMEOUT))) {
+ log_simple_error(&e_info(OPAL_RC_I2C_TRANSFER),
+ "I2C: Transfer error occurred\n");
+ p8_i2c_print_debug_info(port, req, end_time);
+ } else if (status == I2C_STAT_PSEUDO_TIMEOUT) {
+ log_simple_error(&e_info(OPAL_RC_I2C_TIMEOUT),
+ "I2C: request timed out!\n");
+ p8_i2c_print_debug_info(port, req, end_time);
+ }
+
+ p8_i2c_translate_error(req, status);
+
+ rc = p8_i2c_engine_reset(port);
+ if (rc)
+ goto exit;
+
+ if (status & (I2C_STAT_LBUS_PARITY_ERR | I2C_STAT_ARBT_LOST_ERR |
+ I2C_STAT_STOP_ERR)) {
+ /*
+ * Don't bother issuing a STOP command for these errors;
+ * just get rid of the current request and start off with
+ * the next one in the list
+ */
+ p8_i2c_complete_request(master, req, req->result);
+ } else {
+ if (p8_i2c_reset_port(port))
+ goto exit;
+ /* Enable the interrupt */
+ p8_i2c_enable_irqs(master);
+ }
+ return;
+
+exit:
+ p8_i2c_complete_request(master, req, req->result);
+}
+
+static int p8_i2c_fifo_read(struct p8_i2c_master *master,
+ uint8_t *buf, uint32_t count)
+{
+ uint64_t fifo;
+ uint32_t i;
+ int rc = 0;
+
+ for (i = 0; i < count; i++, buf++) {
+ rc = xscom_read(master->chip_id, master->xscom_base +
+ I2C_FIFO_REG, &fifo);
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_I2C_TRANSFER),
+ "I2C: Failed to read the fifo\n");
+ break;
+ }
+
+ *buf = GETFIELD(I2C_FIFO, fifo);
+ }
+ return rc;
+}
+
+static int p8_i2c_fifo_write(struct p8_i2c_master *master,
+ uint8_t *buf, uint32_t count)
+{
+ uint64_t fifo;
+ uint32_t i;
+ int rc = 0;
+
+ for (i = 0; i < count; i++, buf++) {
+ fifo = SETFIELD(I2C_FIFO, 0ull, *buf);
+ rc = xscom_write(master->chip_id, master->xscom_base +
+ I2C_FIFO_REG, fifo);
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_I2C_TRANSFER),
+ "I2C: Failed to write the fifo\n");
+ break;
+ }
+ }
+ return rc;
+}
+
+static void p8_i2c_status_data_request(struct p8_i2c_master *master,
+ struct i2c_request *req,
+ uint64_t status)
+{
+ uint32_t fifo_count, fifo_free, count;
+ uint8_t *buf;
+ int rc = 0;
+
+ fifo_count = GETFIELD(I2C_STAT_FIFO_ENTRY_COUNT, status);
+ fifo_free = master->fifo_size - fifo_count;
+
+ DBG("Data request, state=%d fifo_count=%d/%d bytes_sent=%d\n",
+ master->state, fifo_count, master->fifo_size, master->bytes_sent);
+
+ switch(master->state) {
+ case state_offset:
+ /* We assume the offset can always be written in one go */
+ if (fifo_free < req->offset_bytes) {
+ log_simple_error(&e_info(OPAL_RC_I2C_TRANSFER),
+ "I2C: Fifo too small for offset !\n");
+ rc = OPAL_HARDWARE;
+ } else {
+ rc = p8_i2c_fifo_write(master, master->obuf,
+ req->offset_bytes);
+ }
+
+ /* For reads, wait for the address phase to complete */
+ if (rc || req->op != SMBUS_WRITE)
+ break;
+
+ /* For writes, transition to data phase now */
+ master->state = state_data;
+ fifo_free -= req->offset_bytes;
+ /* Fall through */
+ case state_data:
+ /* Sanity check */
+ if (master->bytes_sent >= req->rw_len) {
+ log_simple_error(&e_info(OPAL_RC_I2C_TRANSFER), "I2C: "
+ "Data req with no data to send sent=%d "
+ "req=%d\n", master->bytes_sent,
+ req->rw_len);
+ rc = OPAL_HARDWARE;
+ break;
+ }
+
+ /* Get next chunk */
+ buf = req->rw_buf + master->bytes_sent;
+ count = req->rw_len - master->bytes_sent;
+
+ /* Check direction */
+ if (req->op == I2C_READ || req->op == SMBUS_READ) {
+ if (count > fifo_count)
+ count = fifo_count;
+ rc = p8_i2c_fifo_read(master, buf, count);
+ } else {
+ if (count > fifo_free)
+ count = fifo_free;
+ rc = p8_i2c_fifo_write(master, buf, count);
+ }
+ if (rc == 0)
+ master->bytes_sent += count;
+ break;
+ default:
+ log_simple_error(&e_info(OPAL_RC_I2C_TRANSFER), "I2C: Invalid "
+ "state %d in data req !\n", master->state);
+ rc = OPAL_WRONG_STATE;
+ }
+
+ if (rc) {
+ p8_i2c_complete_request(master, req, rc);
+ } else {
+ p8_i2c_enable_irqs(master);
+ p8_i2c_reset_timeout(master, req);
+ }
+}
+
+static void p8_i2c_complete_offset(struct p8_i2c_master *master,
+ struct i2c_request *req)
+{
+ uint64_t cmd;
+ int rc = 0;
+
+ DBG("Completing offset phase\n");
+
+ /* If it's a write, we should only get here for empty
+ * write commands
+ */
+ if (req->op == SMBUS_WRITE && req->rw_len != 0) {
+ log_simple_error(&e_info(OPAL_RC_I2C_TRANSFER), "I2C: Write "
+ "completion in offset state !\n");
+ rc = OPAL_HARDWARE;
+ goto complete;
+ }
+
+ /* Switch to data phase */
+ master->state = state_data;
+
+ /* If it's not a read command, or there are no data to read,
+ * then we complete the command
+ */
+ if (req->op != SMBUS_READ || req->rw_len == 0)
+ goto complete;
+
+ /* Otherwise, let's start the data phase */
+ cmd = I2C_CMD_WITH_START | I2C_CMD_WITH_ADDR |
+ I2C_CMD_WITH_STOP | I2C_CMD_READ_NOT_WRITE;
+ cmd = SETFIELD(I2C_CMD_DEV_ADDR, cmd, req->dev_addr);
+ cmd = SETFIELD(I2C_CMD_LEN_BYTES, cmd, req->rw_len);
+ cmd = SETFIELD(I2C_CMD_INTR_STEERING, cmd, I2C_CMD_INTR_STEER_HOST);
+
+ DBG("Command: %016llx, state: %d\n", cmd, master->state);
+
+ /* Send command */
+ rc = xscom_write(master->chip_id, master->xscom_base + I2C_CMD_REG,
+ cmd);
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_I2C_TRANSFER), "I2C: Failed "
+ "to write the CMD_REG\n");
+ goto complete;
+ }
+
+ /* Enable the interrupts */
+ p8_i2c_enable_irqs(master);
+ p8_i2c_reset_timeout(master, req);
+ return;
+
+ complete:
+ p8_i2c_complete_request(master, req, rc);
+}
+
+static void p8_i2c_status_cmd_completion(struct p8_i2c_master *master,
+ struct i2c_request *req,
+ uint64_t end_time __unused)
+{
+ int rc;
+
+ DBG("Command completion, state=%d bytes_sent=%d\n",
+ master->state, master->bytes_sent);
+ DBG(" start_time=%016llx end_time=%016llx (duration=%016llx)\n",
+ master->start_time, end_time, end_time - master->start_time);
+
+ /* If we completed an offset, we probably need to transition
+ * to a data read; check that it all makes sense
+ */
+ if (master->state == state_offset) {
+ p8_i2c_complete_offset(master, req);
+ return;
+ }
+
+ /* If we are not already in error state, check if we have
+ * completed our data transfer properly
+ */
+ if (master->state != state_error && master->bytes_sent != req->rw_len) {
+ log_simple_error(&e_info(OPAL_RC_I2C_TRANSFER), "I2C: Request "
+ "complete with residual data req=%d done=%d\n",
+ req->rw_len, master->bytes_sent);
+ /* Should we error out here ? */
+ }
+ rc = master->state == state_error ? req->result : OPAL_SUCCESS;
+ p8_i2c_complete_request(master, req, rc);
+}
+
+static void p8_i2c_check_status(struct p8_i2c_master *master)
+{
+ struct p8_i2c_master_port *port;
+ uint64_t status, deadline, now;
+ struct i2c_request *req;
+ int rc;
+
+ /*
+ * When idle or waiting for the occ to release the bus there's
+ * nothing to check. Also ignore recovery state, as the bus
+ * can be reset in that state, and a request can think it's
+ * complete when it just means the reset is complete.
+ * Error states are handled when starting a new request.
+ */
+ if (master->state == state_idle || master->state == state_occache_dis ||
+ master->state == state_recovery)
+ return;
+
+ /* A non-idle master should always have a pending request */
+ req = list_top(&master->req_list, struct i2c_request, link);
+ if (!req) {
+ prerror("I2C: Master is not idle and has no pending request\n");
+ return;
+ }
+
+ rc = i2cm_read_reg(master, I2C_STAT_REG, &status);
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_I2C_TRANSFER),
+ "I2C: Failed to read the STAT_REG\n");
+ return;
+ }
+
+ /* mask interrupts while we're mucking with the master */
+ rc = i2cm_write_reg(master, I2C_INTR_MASK_AND_REG, ~I2C_INTR_ALL);
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_I2C_TRANSFER),
+ "I2C: Failed to disable the interrupts\n");
+ return;
+ }
+
+ /* Get port for current request */
+ port = container_of(req->bus, struct p8_i2c_master_port, bus);
+ now = mftb();
+
+ deadline = master->last_update + msecs_to_tb(req->timeout);
+
+ if (status & I2C_STAT_ANY_ERR)
+ p8_i2c_status_error(port, req, status & I2C_STAT_ANY_ERR, now);
+ else if (status & I2C_STAT_DATA_REQ)
+ p8_i2c_status_data_request(master, req, status);
+ else if (status & I2C_STAT_CMD_COMP)
+ p8_i2c_status_cmd_completion(master, req, now);
+ else if (tb_compare(now, deadline) == TB_AAFTERB)
+ p8_i2c_status_error(port, req, I2C_STAT_PSEUDO_TIMEOUT, now);
+ else
+ p8_i2c_enable_irqs(master);
+}
+
+static int p8_i2c_check_initial_status(struct p8_i2c_master_port *port)
+{
+ struct p8_i2c_master *master = port->master;
+ uint64_t status, estat;
+ int rc;
+
+ master->recovery_pass++;
+
+ /* Read status register */
+ rc = xscom_read(master->chip_id, master->xscom_base + I2C_STAT_REG,
+ &status);
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_I2C_START_REQ), "I2C: Failed "
+ "to read the STAT_REG\n");
+ return rc;
+ }
+
+ rc = xscom_read(master->chip_id,
+ master->xscom_base + I2C_EXTD_STAT_REG,
+ &estat);
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_I2C_START_REQ), "I2C: Failed "
+ "to read the EXTD_STAT_REG\n");
+ return rc;
+ }
+ if (estat & (I2C_EXTD_STAT_I2C_BUSY | I2C_EXTD_STAT_SELF_BUSY)) {
+ DBG("Initial estat busy ! %016llx\n", estat);
+ /* Just a warning for now */
+ }
+
+ /* Any errors left over from a previous transfer ? Try to recover */
+ if (status & I2C_STAT_ANY_ERR) {
+ log_simple_error(&e_info(OPAL_RC_I2C_START_REQ), "I2C: "
+ "Initial error status 0x%016llx\n", status);
+
+ if (master->recovery_pass > 1) {
+ log_simple_error(&e_info(OPAL_RC_I2C_START_REQ), "I2C: "
+ "Error stuck, aborting !!\n");
+ return OPAL_HARDWARE;
+ }
+
+ /* Mark state as "recovery" to block any other activity */
+ master->state = state_recovery;
+
+ /* Reset the engine */
+ p8_i2c_engine_reset(port);
+
+ /* Delay 5ms for bus to settle */
+ schedule_timer(&master->recovery, msecs_to_tb(5));
+ return OPAL_BUSY;
+ }
+
+ /* Still busy ? */
+ if (!(status & I2C_STAT_CMD_COMP)) {
+ log_simple_error(&e_info(OPAL_RC_I2C_START_REQ), "I2C: Initial "
+ "command complete not set\n");
+
+ if (master->recovery_pass > 5) {
+ log_simple_error(&e_info(OPAL_RC_I2C_START_REQ), "I2C: "
+ "Command stuck, aborting !!\n");
+ return OPAL_HARDWARE;
+ }
+
+
+ master->state = state_recovery;
+
+ /* Delay 5ms for bus to settle */
+ schedule_timer(&master->recovery, msecs_to_tb(5));
+ return OPAL_BUSY;
+ }
+
+ master->recovery_pass = 0;
+ return 0;
+}
+
+/*
+ * On POWER9, the OCC may also wish to use some of the i2cm engines,
+ * to do things like read sensor data. There are a couple of shared
+ * registers used to negotiate locking of the i2cm engines with the OCC.
+ * See occ/src/occ_405/lock/lock.c
+ */
+static bool occ_uses_master(struct p8_i2c_master *master)
+{
+ /* OCC uses I2CM Engines 1,2 and 3, only on POWER9/10 */
+ if (master->type == I2C_POWER8 && proc_gen >= proc_gen_p9)
+ return master->engine_id >= 1;
+
+ return false;
+}
+
+static uint32_t occflg;
+#define OCCFLG_BASE 0
+#define OCCFLG_CLEAR 1
+#define OCCFLG_SET 2
+
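+/*
+ * Layout of the lock bits in the OCC flag register, as implied by the
+ * busflag calculation below: each engine gets a host/OCC pair, i.e.
+ * engine 1 uses PPC bits 16 (host) and 17 (OCC), engine 2 bits 18/19,
+ * engine 3 bits 20/21. The "busflag >> 1" test in occ_i2c_lock() is
+ * therefore checking whether the OCC currently holds the same engine.
+ */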
+static int occ_i2c_lock(struct p8_i2c_master *master)
+{
+ u64 occflags, busflag;
+ int rc;
+
+ if (!occ_uses_master(master) || !occflg)
+ return 0;
+
+ if (master->occ_lock_acquired)
+ return 0;
+
+ rc = xscom_read(master->chip_id, occflg, &occflags);
+ if (rc) {
+ prerror("I2C: Failed to read OCC FLAG register\n");
+ return rc;
+ }
+
+ assert(master->engine_id > 0);
+
+ busflag = PPC_BIT(16 + (master->engine_id - 1) * 2);
+
+ DBG("I2C: c%de%d: occflags = %llx (locks = %x:%x:%x)\n",
+ master->chip_id, master->engine_id, (u64) occflags,
+ (u32) GETFIELD(PPC_BITMASK(16, 17), occflags),
+ (u32) GETFIELD(PPC_BITMASK(18, 19), occflags),
+ (u32) GETFIELD(PPC_BITMASK(20, 21), occflags));
+
+ rc = xscom_write(master->chip_id, occflg + OCCFLG_SET, busflag);
+ if (rc) {
+ prerror("I2C: Failed to write OCC FLAG register\n");
+ return rc;
+ }
+
+ /* If the OCC also has this bus locked then wait for IRQ */
+ if (occflags & (busflag >> 1)) {
+ DBG("I2C: c%de%d: Master in use by OCC\n",
+ master->chip_id, master->engine_id);
+ return 1;
+ }
+
+ master->occ_lock_acquired = true;
+
+ return 0;
+}
+
+static int occ_i2c_unlock(struct p8_i2c_master *master)
+{
+ u64 busflag, occflags;
+ int rc;
+
+ if (!occ_uses_master(master) || !occflg)
+ return 0;
+
+ rc = xscom_read(master->chip_id, occflg, &occflags);
+ if (rc) {
+ prerror("I2C: Failed to read OCC Flag register\n");
+ return rc;
+ }
+
+ busflag = PPC_BIT(16 + (master->engine_id - 1) * 2);
+
+ if (!(occflags & busflag)) {
+ DBG("I2C: spurious unlock for c%de%d, already cleared (flags = %.16llx)\n",
+ master->chip_id, master->engine_id, occflags);
+ }
+
+ rc = xscom_write(master->chip_id, occflg + OCCFLG_CLEAR, busflag);
+ if (rc)
+ prerror("I2C: Failed to write OCC Flag register\n");
+
+ master->occ_lock_acquired = false;
+
+ return rc;
+}
+
+static int p8_i2c_start_request(struct p8_i2c_master *master,
+ struct i2c_request *req)
+{
+ struct p8_i2c_master_port *port;
+ uint64_t cmd;
+ int64_t rc;
+
+ DBG("Starting req %d len=%d addr=%02x (offset=%x)\n",
+ req->op, req->rw_len, req->dev_addr, req->offset);
+
+ /* Get port */
+ port = container_of(req->bus, struct p8_i2c_master_port, bus);
+
+ /* Check if we need to disable the OCC cache first */
+ if (master->type == I2C_CENTAUR && !master->occ_cache_dis) {
+ DBG("Disabling OCC cache...\n");
+ rc = centaur_disable_sensor_cache(master->chip_id);
+
+ if (rc < 0) {
+ log_simple_error(&e_info(OPAL_RC_I2C_START_REQ),
+ "I2C: Failed "
+ "to disable the sensor cache\n");
+ return rc;
+ }
+ master->occ_cache_dis = true;
+
+ /* Do we need to wait ? */
+ if (rc > 0) {
+ DBG("Waiting %lld\n", rc);
+ master->state = state_occache_dis;
+ schedule_timer(&master->recovery, rc);
+ return 0;
+ }
+ }
+
+ /*
+ * On P9 we need to set the "I2C master in use" bit so we don't
+ * conflict with the OCC's use of the i2c master.
+ */
+ rc = occ_i2c_lock(master);
+ if (rc < 0) {
+ log_simple_error(&e_info(OPAL_RC_I2C_START_REQ),
+ "I2C: Failed to get I2CM lock from OCC\n");
+ return rc;
+ }
+ if (rc > 0) {
+ /* Wait for OCC IRQ */
+ master->state = state_occache_dis;
+ schedule_timer(&master->recovery, msecs_to_tb(10));
+ return 0;
+ }
+
+ /* Convert the offset if needed */
+ if (req->offset_bytes) {
+ int i;
+
+ for (i = 0; i < req->offset_bytes; i++) {
+ uint8_t b;
+
+ b = req->offset >> (8 * (req->offset_bytes - i - 1));
+ master->obuf[i] = b;
+ }
+ DBG("Offset %d bytes: %02x %02x %02x %02x\n",
+ req->offset_bytes, master->obuf[0], master->obuf[1],
+ master->obuf[2], master->obuf[3]);
+ }
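+ /*
+ * e.g. offset_bytes = 2 and offset = 0x1234 leave obuf[] holding
+ * { 0x12, 0x34 }, i.e. the device offset in big-endian byte order
+ * as it will be pushed into the FIFO during the offset phase.
+ */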
+
+ /* Program mode register */
+ rc = p8_i2c_prog_mode(port, false);
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_I2C_START_REQ), "I2C: Failed "
+ "to program the MODE_REG\n");
+ return rc;
+ }
+
+ /* Check status */
+ rc = p8_i2c_check_initial_status(port);
+ if (rc != OPAL_BUSY)
+ master->recovery_pass = 0;
+ if (rc)
+ return rc;
+
+ /* program the watermark register */
+ rc = p8_i2c_prog_watermark(master);
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_I2C_INIT),
+ "I2C: Failed to program the WATERMARK_REG\n");
+ return rc;
+ }
+
+ /* Initialize bytes_sent */
+ master->bytes_sent = 0;
+
+ /* Set up the command register */
+ cmd = I2C_CMD_WITH_START | I2C_CMD_WITH_ADDR;
+ cmd = SETFIELD(I2C_CMD_DEV_ADDR, cmd, req->dev_addr);
+ cmd = SETFIELD(I2C_CMD_INTR_STEERING, cmd, I2C_CMD_INTR_STEER_HOST);
+ switch (req->op) {
+ case I2C_READ:
+ cmd |= I2C_CMD_READ_NOT_WRITE;
+ /* Fall through */
+ case I2C_WRITE:
+ cmd |= I2C_CMD_WITH_STOP;
+ cmd = SETFIELD(I2C_CMD_LEN_BYTES, cmd, req->rw_len);
+ master->state = state_data;
+ break;
+ case SMBUS_READ:
+ cmd = SETFIELD(I2C_CMD_LEN_BYTES, cmd, req->offset_bytes);
+ master->state = state_offset;
+ break;
+ case SMBUS_WRITE:
+ cmd |= I2C_CMD_WITH_STOP;
+ cmd = SETFIELD(I2C_CMD_LEN_BYTES, cmd,
+ req->rw_len + req->offset_bytes);
+ master->state = state_offset;
+ break;
+ default:
+ return OPAL_PARAMETER;
+ }
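+ /*
+ * Note: for SMBUS_READ the command built above only covers sending
+ * the offset bytes; the actual read (with its own START/STOP and
+ * the read length) is issued later from p8_i2c_complete_offset()
+ * once the offset phase completes.
+ */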
+ DBG("Command: %016llx, state: %d\n", cmd, master->state);
+
+ master->start_time = mftb();
+
+ /* Send command */
+ rc = xscom_write(master->chip_id, master->xscom_base + I2C_CMD_REG,
+ cmd);
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_I2C_START_REQ), "I2C: Failed "
+ "to write the CMD_REG\n");
+ return rc;
+ }
+
+ /* Enable the interrupts */
+ p8_i2c_enable_irqs(master);
+
+ /* Run a poll timer for the boot case, or when interrupts
+ * aren't working
+ */
+ if (!opal_booting() && master->irq_ok)
+ master->poll_interval = TIMER_POLL;
+ else
+ master->poll_interval = port->poll_interval;
+ schedule_timer(&master->poller, master->poll_interval);
+
+ /* If we don't have a user-set timeout then use the master's default */
+ if (!req->timeout)
+ req->timeout = port->byte_timeout;
+
+ /* Start the timeout */
+ p8_i2c_reset_timeout(master, req);
+
+ return OPAL_SUCCESS;
+}
+
+static void p8_i2c_check_work(struct p8_i2c_master *master)
+{
+ struct i2c_request *req;
+ int rc;
+
+ while (master->state == state_idle && !list_empty(&master->req_list)) {
+ req = list_top(&master->req_list, struct i2c_request, link);
+ rc = p8_i2c_start_request(master, req);
+ if (rc) {
+ /*
+ * If it didn't work the first three times then
+ * odds are it's not going to work on the 4th.
+ */
+ if (rc && req->retries > 3)
+ p8_i2c_complete_request(master, req, rc);
+ else
+ req->retries++;
+ }
+ }
+}
+
+/* OCC IRQ Handler for I2C Ownership Change */
+void p9_i2c_bus_owner_change(u32 chip_id)
+{
+ struct proc_chip *chip = get_chip(chip_id);
+ struct p8_i2c_master *master = NULL;
+
+ assert(chip);
+ list_for_each(&chip->i2cms, master, link) {
+ lock(&master->lock);
+
+ /* spurious */
+ if (master->state != state_occache_dis)
+ goto done;
+
+ /* Can we now lock this master? */
+ if (occ_i2c_lock(master))
+ goto done;
+
+ /* clear the existing wait timer */
+ cancel_timer_async(&master->recovery);
+
+ /* re-start the request now that we own the master */
+ master->state = state_idle;
+
+ p8_i2c_check_work(master);
+ p8_i2c_check_status(master);
+done:
+ unlock(&master->lock);
+ }
+}
+
+static int p8_i2c_queue_request(struct i2c_request *req)
+{
+ struct i2c_bus *bus = req->bus;
+ struct p8_i2c_master_port *port =
+ container_of(bus, struct p8_i2c_master_port, bus);
+ struct p8_i2c_master *master = port->master;
+ int rc = 0;
+
+ /* Parameter check */
+ if (req->rw_len > I2C_MAX_TFR_LEN) {
+ prlog(PR_ERR, "I2C: Too large transfer %d bytes\n", req->rw_len);
+ return OPAL_PARAMETER;
+ }
+
+ if (req->offset_bytes > 4) {
+ prlog(PR_ERR, "I2C: Invalid offset size %d\n", req->offset_bytes);
+ return OPAL_PARAMETER;
+ }
+ lock(&master->lock);
+ list_add_tail(&master->req_list, &req->link);
+ p8_i2c_check_work(master);
+ unlock(&master->lock);
+
+ return rc;
+}
+
+static uint64_t p8_i2c_run_request(struct i2c_request *req)
+{
+ struct i2c_bus *bus = req->bus;
+ struct p8_i2c_master_port *port =
+ container_of(bus, struct p8_i2c_master_port, bus);
+ struct p8_i2c_master *master = port->master;
+ uint64_t poll_interval = 0;
+
+ lock(&master->lock);
+ p8_i2c_check_status(master);
+ p8_i2c_check_work(master);
+ poll_interval = master->poll_interval;
+ unlock(&master->lock);
+
+ return poll_interval;
+}
+
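+/*
+ * As a rough illustration: with the default 150 MHz local bus clock and
+ * a 100 kHz bus speed (the defaults used in p8_i2c_init_one_port()),
+ * the divisor below works out to (150000000 / 100000 - 1) / 4 = 374.
+ */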
+static inline uint32_t p8_i2c_get_bit_rate_divisor(uint32_t lb_freq,
+ uint32_t bus_speed)
+{
+ assert(bus_speed > 0);
+ return (((lb_freq / bus_speed) - 1) / 4);
+}
+
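+/*
+ * For example, at 100 kHz this gives 8 * 1000000 / (10 * 100000) = 8
+ * usecs between polls (assuming USEC_PER_SEC is 1000000).
+ */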
+static inline uint64_t p8_i2c_get_poll_interval(uint32_t bus_speed)
+{
+ uint64_t usec;
+
+ assert(bus_speed > 0);
+
+ /* Polling Interval = 8 * (1/bus_speed) * (1/10) -> convert to uSec */
+ usec = ((8 * USEC_PER_SEC) / (10 * bus_speed));
+ return usecs_to_tb(usec);
+}
+
+static void p8_i2c_timeout(struct timer *t __unused, void *data,
+ uint64_t __unused now)
+{
+ struct p8_i2c_master *master = data;
+
+ lock(&master->lock);
+
+ DBG("timeout on c%de%d\n", master->chip_id, master->engine_id);
+
+ /*
+ * Run through the usual status checks. It's possible to get spurious
+ * timeouts due to races between the interrupt/poller paths and the
+ * timeout handler. So we do all the checking, all the time.
+ */
+ p8_i2c_check_status(master);
+ p8_i2c_check_work(master);
+
+ unlock(&master->lock);
+}
+
+static void p8_i2c_recover(struct timer *t __unused, void *data,
+ uint64_t now __unused)
+{
+ struct p8_i2c_master *master = data;
+
+ lock(&master->lock);
+
+ /*
+ * The recovery timer can race with the OCC interrupt. If the interrupt
+ * comes in just before this is called, then we'll get a spurious
+ * timeout which we need to ignore.
+ */
+ if (master->state != state_recovery &&
+ master->state != state_occache_dis) {
+ unlock(&master->lock);
+ return;
+ }
+
+ master->state = state_idle;
+
+ /* We may or may not still have work pending; re-enable the sensor cache
+ * immediately if we don't (we just waited the recovery time, so there is
+ * little point in waiting longer).
+ */
+ if (master->occ_cache_dis && list_empty(&master->req_list)) {
+ DBG("Re-enabling OCC cache after recovery\n");
+ centaur_enable_sensor_cache(master->chip_id);
+ master->occ_cache_dis = false;
+ }
+
+ if (master->occ_lock_acquired && list_empty(&master->req_list))
+ occ_i2c_unlock(master);
+
+ /* Re-check for new work */
+ p8_i2c_check_work(master);
+ unlock(&master->lock);
+}
+
+static void p8_i2c_enable_scache(struct timer *t __unused, void *data,
+ uint64_t now __unused)
+{
+ struct p8_i2c_master *master = data;
+
+ lock(&master->lock);
+
+ /* Check if we are still idle */
+ if (master->state == state_idle && master->occ_cache_dis) {
+ DBG("Re-enabling OCC cache\n");
+ centaur_enable_sensor_cache(master->chip_id);
+ master->occ_cache_dis = false;
+ }
+ unlock(&master->lock);
+}
+
+static void p8_i2c_poll(struct timer *t __unused, void *data, uint64_t now)
+{
+ struct p8_i2c_master *master = data;
+
+ /*
+ * This is called when the interrupt isn't functional, or
+ * generally from the opal pollers, so it runs fast while booting
+ * and slowly once Linux is up.
+ */
+
+ /* Lockless fast bailout */
+ if (master->state == state_idle)
+ return;
+
+ lock(&master->lock);
+ p8_i2c_check_status(master);
+ if (master->state != state_idle)
+ schedule_timer_at(&master->poller, now + master->poll_interval);
+ p8_i2c_check_work(master);
+ unlock(&master->lock);
+}
+
+void p8_i2c_interrupt(uint32_t chip_id)
+{
+ struct proc_chip *chip = get_chip(chip_id);
+ struct p8_i2c_master *master = NULL;
+
+ assert(chip);
+ list_for_each(&chip->i2cms, master, link) {
+
+ /* Lockless fast bailout (shared interrupt) */
+ if (master->state == state_idle)
+ continue;
+
+ lock(&master->lock);
+
+ /* Run the state machine */
+ p8_i2c_check_status(master);
+
+ /* Check for new work */
+ p8_i2c_check_work(master);
+
+ unlock(&master->lock);
+ }
+}
+
+static const char *compat[] = {
+ "ibm,power8-i2cm",
+ "ibm,centaur-i2cm"
+};
+
+static void p8_i2c_add_bus_prop(struct p8_i2c_master_port *port)
+{
+ const struct dt_property *c, *p;
+ struct dt_node *np = port->bus.dt_node;
+ char name[32];
+
+ c = dt_find_property(np, "compatible");
+ p = dt_find_property(np, "ibm,port-name");
+
+ if (!c) {
+ if (port->master->type == I2C_POWER8)
+ dt_add_property_strings(np, "compatible",
+ "ibm,power8-i2c-port",
+ "ibm,opal-i2c");
+ else if (port->master->type == I2C_CENTAUR)
+ dt_add_property_strings(np, "compatible",
+ "ibm,centaur-i2c-port",
+ "ibm,opal-i2c");
+ }
+
+ if (!p) {
+ if (port->master->type == I2C_POWER8)
+ snprintf(name, sizeof(name), "p8_%08x_e%dp%d",
+ port->master->chip_id, port->master->engine_id,
+ port->port_num);
+ else if (port->master->type == I2C_CENTAUR)
+ snprintf(name, sizeof(name), "cen_%08x_e%dp%d",
+ port->master->chip_id, port->master->engine_id,
+ port->port_num);
+
+ dt_add_property_string(np, "ibm,port-name", name);
+ }
+}
+
+static struct p8_i2c_master_port *p8_i2c_init_one_port(struct p8_i2c_master *m,
+ struct dt_node *n)
+{
+ struct p8_i2c_master_port *port;
+ uint64_t def_timeout, lb_freq;
+ uint32_t speed, div;
+
+ port = zalloc(sizeof(*port));
+ if (!port)
+ return NULL;
+
+ def_timeout = m->irq_ok ? I2C_TIMEOUT_IRQ_MS : I2C_TIMEOUT_POLL_MS;
+
+ lb_freq = dt_prop_get_u32_def(m->dt_node, "clock-frequency", 150000000);
+ speed = dt_prop_get_u32_def(n, "bus-frequency", 100000);
+ div = p8_i2c_get_bit_rate_divisor(lb_freq, speed);
+
+ /* p8-i2c stuff */
+ port->master = m;
+ port->bit_rate_div = div;
+ port->poll_interval = p8_i2c_get_poll_interval(speed);
+ port->port_num = dt_prop_get_u32(n, "reg");
+ port->byte_timeout = dt_prop_get_u32_def(n, "timeout-ms", def_timeout);
+ list_add_tail(&m->ports, &port->link);
+
+ /* core i2c stuff */
+ port->bus.dt_node = n;
+ port->bus.queue_req = p8_i2c_queue_request;
+ port->bus.run_req = p8_i2c_run_request;
+ i2c_add_bus(&port->bus);
+
+ /* add the bus name and compatible (if needed) */
+ p8_i2c_add_bus_prop(port);
+
+ prlog(PR_INFO, " P%d: <%s> %d kHz\n", port->port_num,
+ (char *) dt_prop_get(n, "ibm,port-name"), speed / 1000);
+
+ return port;
+}
+
+static struct p8_i2c_master *p8_i2c_init_one(struct dt_node *i2cm,
+ enum p8_i2c_master_type type)
+{
+ struct p8_i2c_master *master;
+ struct list_head *chip_list;
+ struct dt_node *i2cm_port;
+ uint64_t ex_stat;
+ uint32_t lb_freq;
+ int64_t rc;
+
+ master = zalloc(sizeof(*master));
+ if (!master) {
+ log_simple_error(&e_info(OPAL_RC_I2C_INIT),
+ "I2C: Failed to allocate master "
+ "structure\n");
+ return NULL;
+ }
+ master->type = type;
+
+ /* Local bus speed in Hz */
+ lb_freq = dt_prop_get_u32(i2cm, "clock-frequency");
+
+ /* Initialise the i2c master structure */
+ master->state = state_idle;
+ master->chip_id = dt_get_chip_id(i2cm);
+ master->engine_id = dt_prop_get_u32(i2cm, "chip-engine#");
+ master->xscom_base = dt_get_address(i2cm, 0, NULL);
+ master->dt_node = i2cm;
+ if (master->type == I2C_CENTAUR) {
+ struct centaur_chip *centaur = get_centaur(master->chip_id);
+ if (centaur == NULL) {
+ log_simple_error(&e_info(OPAL_RC_I2C_INIT),
+ "I2C: Failed to get centaur 0x%x ",
+ master->chip_id);
+ free(master);
+ return NULL;
+ }
+ chip_list = &centaur->i2cms;
+
+ /* Detect bad device-tree from HostBoot giving us bogus
+ * i2c masters
+ */
+ if (master->engine_id > 0) {
+ prlog(PR_ERR, "I2C: Skipping Centaur Master #1\n");
+ free(master);
+ return NULL;
+ }
+ } else {
+ struct proc_chip *chip = get_chip(master->chip_id);
+ assert(chip);
+ chip_list = &chip->i2cms;
+ }
+ init_timer(&master->timeout, p8_i2c_timeout, master);
+ init_timer(&master->poller, p8_i2c_poll, master);
+ init_timer(&master->recovery, p8_i2c_recover, master);
+ init_timer(&master->sensor_cache, p8_i2c_enable_scache, master);
+
+ master->irq_ok = p8_i2c_has_irqs(master);
+
+ prlog(PR_INFO, "I2C: Chip %08x Eng. %d Clock %d MHz %s\n",
+ master->chip_id, master->engine_id, lb_freq / 1000000,
+ master->irq_ok ? "" : "(no interrupt)");
+
+ /* Disable OCC cache during inits */
+ if (master->type == I2C_CENTAUR) {
+ rc = centaur_disable_sensor_cache(master->chip_id);
+ if (rc < 0) {
+ log_simple_error(&e_info(OPAL_RC_I2C_INIT), "I2C: "
+ "Error %lld disabling sensor cache\n",
+ rc);
+ /* Ignore error and move on ... */
+ } else
+ time_wait(rc);
+ }
+ rc = xscom_read(master->chip_id, master->xscom_base +
+ I2C_EXTD_STAT_REG, &ex_stat);
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_I2C_INIT), "I2C: "
+ "Failed to read EXTD_STAT_REG\n");
+ if (master->type == I2C_CENTAUR)
+ centaur_enable_sensor_cache(master->chip_id);
+
+ free(master);
+ return NULL;
+ }
+
+ master->fifo_size = GETFIELD(I2C_EXTD_STAT_FIFO_SIZE, ex_stat);
+ list_head_init(&master->req_list);
+ list_head_init(&master->ports);
+
+ /* Re-enable the sensor cache, we aren't touching HW anymore */
+ if (master->type == I2C_CENTAUR)
+ centaur_enable_sensor_cache(master->chip_id);
+
+ /* Add master to chip's list */
+ list_add_tail(chip_list, &master->link);
+
+ /* initialise ports */
+ dt_for_each_child(i2cm, i2cm_port)
+ p8_i2c_init_one_port(master, i2cm_port);
+
+ return master;
+}
+
+void p8_i2c_init(void)
+{
+ struct dt_node *i2cm;
+ int i;
+
+ /* setup the handshake reg */
+ if (proc_gen <= proc_gen_p9)
+ occflg = 0x6C08A;
+ else if (proc_gen == proc_gen_p10)
+ occflg = 0x6C0AC;
+ else
+ return;
+
+ prlog(PR_INFO, "I2C: OCC flag reg: %x\n", occflg);
+
+ for (i = 0; i < MAX_I2C_TYPE; i++) {
+ dt_for_each_compatible(dt_root, i2cm, compat[i])
+ p8_i2c_init_one(i2cm, i);
+ }
+}
+
+struct i2c_bus *p8_i2c_find_bus_by_port(uint32_t chip_id, int eng, int port_num)
+{
+ struct proc_chip *chip = get_chip(chip_id);
+ struct p8_i2c_master *m, *master = NULL;
+ struct p8_i2c_master_port *port;
+
+ if (!chip)
+ return NULL;
+
+ list_for_each(&chip->i2cms, m, link) {
+ if (m->engine_id == eng) {
+ master = m;
+ break;
+ }
+ }
+
+ if (!master)
+ return NULL;
+
+ list_for_each(&master->ports, port, link)
+ if (port->port_num == port_num)
+ return &port->bus;
+
+ return NULL;
+}
+
+/* Adds a new i2c port to the DT and initialises it */
+struct i2c_bus *p8_i2c_add_bus(uint32_t chip_id, int eng_id, int port_id,
+ uint32_t bus_speed)
+{
+ struct proc_chip *c = get_chip(chip_id);
+ struct p8_i2c_master *m, *master = NULL;
+ struct p8_i2c_master_port *port;
+ struct dt_node *pn;
+
+ if (!c) {
+ prerror("I2C: Unable to add i2c bus: c%de%dp%d: chip doesn't exist\n",
+ chip_id, eng_id, port_id);
+ return NULL;
+ }
+
+ list_for_each(&c->i2cms, m, link) {
+ if (m->engine_id == eng_id) {
+ master = m;
+ break;
+ }
+ }
+
+ if (!master) {
+ struct dt_node *mn;
+
+ mn = p8_i2c_add_master_node(c->devnode, eng_id);
+ if (!mn) {
+ prerror("I2C: Unable to add DT node for I2CM c%xe%d\n",
+ chip_id, eng_id);
+ return NULL;
+ }
+
+ master = p8_i2c_init_one(mn, I2C_POWER8);
+ if (!master) {
+ prerror("I2C: Unable to initialise I2CM c%xe%d\n",
+ chip_id, eng_id);
+ return NULL;
+ }
+ }
+
+ list_for_each(&master->ports, port, link)
+ if (port->port_num == port_id)
+ return &port->bus;
+
+ pn = __p8_i2c_add_port_node(master->dt_node, port_id, bus_speed);
+ if (!pn) {
+ prerror("I2C: Unable to add dt node for bus c%xe%dp%d\n",
+ chip_id, eng_id, port_id);
+ return NULL;
+ }
+
+ port = p8_i2c_init_one_port(master, pn);
+ if (!port) {
+ prerror("I2C: Unable to init bus c%xe%dp%d\n",
+ chip_id, eng_id, port_id);
+ return NULL;
+ }
+
+ return &port->bus;
+}
diff --git a/roms/skiboot/hw/phb3.c b/roms/skiboot/hw/phb3.c
new file mode 100644
index 000000000..8af6b6164
--- /dev/null
+++ b/roms/skiboot/hw/phb3.c
@@ -0,0 +1,5052 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * PHB3: PCI Host Bridge 3, in POWER8
+ *
+ * Copyright 2013-2019 IBM Corp.
+ */
+
+#include <skiboot.h>
+#include <io.h>
+#include <timebase.h>
+#include <pci-cfg.h>
+#include <pci.h>
+#include <pci-slot.h>
+#include <vpd.h>
+#include <interrupts.h>
+#include <opal.h>
+#include <cpu.h>
+#include <device.h>
+#include <ccan/str/str.h>
+#include <ccan/array_size/array_size.h>
+#include <xscom.h>
+#include <affinity.h>
+#include <phb3.h>
+#include <phb3-regs.h>
+#include <phb3-capp.h>
+#include <capp.h>
+#include <fsp.h>
+#include <chip.h>
+#include <chiptod.h>
+
+/* Enable this to disable error interrupts for debug purposes */
+#undef DISABLE_ERR_INTS
+
+static void phb3_init_hw(struct phb3 *p, bool first_init);
+
+#define PHBDBG(p, fmt, a...) prlog(PR_DEBUG, "PHB#%04x: " fmt, \
+ (p)->phb.opal_id, ## a)
+#define PHBINF(p, fmt, a...) prlog(PR_INFO, "PHB#%04x: " fmt, \
+ (p)->phb.opal_id, ## a)
+#define PHBERR(p, fmt, a...) prlog(PR_ERR, "PHB#%04x: " fmt, \
+ (p)->phb.opal_id, ## a)
+
+#define PE_CAPP_EN 0x9013c03
+
+#define PE_REG_OFFSET(p) \
+ ((PHB3_IS_NAPLES(p) && (p)->index) ? 0x40 : 0x0)
+
+/* Helper to select an IODA table entry */
+static inline void phb3_ioda_sel(struct phb3 *p, uint32_t table,
+ uint32_t addr, bool autoinc)
+{
+ out_be64(p->regs + PHB_IODA_ADDR,
+ (autoinc ? PHB_IODA_AD_AUTOINC : 0) |
+ SETFIELD(PHB_IODA_AD_TSEL, 0ul, table) |
+ SETFIELD(PHB_IODA_AD_TADR, 0ul, addr));
+}
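+
+/*
+ * With autoinc set, successive accesses to the IODA data register step
+ * through consecutive table entries, so a table dump or restore only
+ * needs a single select. A rough sketch of the usual pattern:
+ *
+ *	phb3_ioda_sel(p, table, 0, true);
+ *	for (i = 0; i < count; i++)
+ *		entries[i] = in_be64(p->regs + PHB_IODA_DATA0);
+ */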
+
+static void phb3_eeh_dump_regs(struct phb3 *p,
+ struct OpalIoPhb3ErrorData *regs);
+
+/* Check if AIB is fenced via PBCQ NFIR */
+static bool phb3_fenced(struct phb3 *p)
+{
+ uint64_t nfir;
+
+ /* We probably still have working xscom access */
+ xscom_read(p->chip_id, p->pe_xscom + 0x0, &nfir);
+ if (nfir & PPC_BIT(16)) {
+ p->flags |= PHB3_AIB_FENCED;
+
+ phb3_eeh_dump_regs(p, NULL);
+ return true;
+ }
+ return false;
+}
+
+static int64_t phb3_pcicfg_rc_pref_window(void *dev __unused,
+ struct pci_cfg_reg_filter *pcrf,
+ uint32_t offset, uint32_t len,
+ uint32_t *data, bool write)
+{
+ uint8_t *pdata;
+ uint32_t i;
+
+ /* Cache whatever we received */
+ if (write) {
+ pdata = &pcrf->data[offset - pcrf->start];
+ for (i = 0; i < len; i++, pdata++)
+ *pdata = (uint8_t)(*data >> (8 * i));
+ return OPAL_SUCCESS;
+ }
+
+ /* Return whatever we cached */
+ *data = 0;
+ pdata = &pcrf->data[offset - pcrf->start + len - 1];
+ for (i = len; i > 0; i--, pdata--) {
+ *data = (*data) << 8;
+ if (offset + i == PCI_CFG_PREF_MEM_BASE) {
+ *data |= ((*pdata & 0xf0) | 0x1);
+ continue;
+ }
+
+ *data |= *pdata;
+ }
+
+ return OPAL_SUCCESS;
+}
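+
+/*
+ * In effect this filter emulates the root port's prefetchable window
+ * registers purely in software: writes are cached in pcrf->data and
+ * reads are served back from that cache, with the capability nibble of
+ * the prefetchable memory base forced to 0x1 so the window advertises
+ * 64-bit addressing support.
+ */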
+
+/*
+ * Configuration space access
+ *
+ * The PHB lock is assumed to be already held
+ */
+static int64_t phb3_pcicfg_check(struct phb3 *p, uint32_t bdfn,
+ uint32_t offset, uint32_t size,
+ uint8_t *pe)
+{
+ uint32_t sm = size - 1;
+
+ if (offset > 0xfff || bdfn > 0xffff)
+ return OPAL_PARAMETER;
+ if (offset & sm)
+ return OPAL_PARAMETER;
+
+ /* The root bus only has a device at 0 and we get into an
+ * error state if we try to probe beyond that, so let's
+ * avoid that and just return an error to Linux
+ */
+ if (PCI_BUS_NUM(bdfn) == 0 && (bdfn & 0xff))
+ return OPAL_HARDWARE;
+
+ /* Check PHB state */
+ if (p->broken)
+ return OPAL_HARDWARE;
+
+ /* Fetch the PE# from cache */
+ *pe = p->rte_cache[bdfn];
+
+ return OPAL_SUCCESS;
+}
+
+static void phb3_link_update(struct phb *phb, uint16_t data)
+{
+ struct phb3 *p = phb_to_phb3(phb);
+ uint32_t new_spd, new_wid;
+ uint32_t old_spd, old_wid;
+ uint16_t old_data;
+ uint64_t lreg;
+ int i;
+
+ /* Read the old speed and width */
+ pci_cfg_read16(phb, 0, 0x5a, &old_data);
+
+ /* Decode the register values */
+ new_spd = data & PCICAP_EXP_LSTAT_SPEED;
+ new_wid = (data & PCICAP_EXP_LSTAT_WIDTH) >> 4;
+ old_spd = old_data & PCICAP_EXP_LSTAT_SPEED;
+ old_wid = (old_data & PCICAP_EXP_LSTAT_WIDTH) >> 4;
+
+ /* Apply maximums */
+ if (new_wid > 16)
+ new_wid = 16;
+ if (new_wid < 1)
+ new_wid = 1;
+ if (new_spd > 3)
+ new_spd = 3;
+ if (new_spd < 1)
+ new_spd = 1;
+
+ PHBINF(p, "Link change request: speed %d->%d, width %d->%d\n",
+ old_spd, new_spd, old_wid, new_wid);
+
+ /* Check if width needs to be changed */
+ if (old_wid != new_wid) {
+ PHBINF(p, "Changing width...\n");
+ lreg = in_be64(p->regs + PHB_PCIE_LINK_MANAGEMENT);
+ lreg = SETFIELD(PHB_PCIE_LM_TGT_LINK_WIDTH, lreg, new_wid);
+ lreg |= PHB_PCIE_LM_CHG_LINK_WIDTH;
+ out_be64(p->regs + PHB_PCIE_LINK_MANAGEMENT, lreg);
+ for (i=0; i<10;i++) {
+ lreg = in_be64(p->regs + PHB_PCIE_LINK_MANAGEMENT);
+ if (lreg & PHB_PCIE_LM_DL_WCHG_PENDING)
+ break;
+ time_wait_ms_nopoll(1);
+ }
+ if (!(lreg & PHB_PCIE_LM_DL_WCHG_PENDING))
+ PHBINF(p, "Timeout waiting for width change start\n");
+ for (i=0; i<100;i++) {
+ lreg = in_be64(p->regs + PHB_PCIE_LINK_MANAGEMENT);
+ if (!(lreg & PHB_PCIE_LM_DL_WCHG_PENDING))
+ break;
+ time_wait_ms_nopoll(1);
+ }
+ if (lreg & PHB_PCIE_LM_DL_WCHG_PENDING)
+ PHBINF(p, "Timeout waiting for width change end\n");
+ }
+ /* Check if speed needs to be changed */
+ if (old_spd != new_spd) {
+ PHBINF(p, "Changing speed...\n");
+ lreg = in_be64(p->regs + PHB_PCIE_LINK_MANAGEMENT);
+ if (lreg & PPC_BIT(19)) {
+ uint16_t lctl2;
+ PHBINF(p, " Bit19 set ! working around...\n");
+ pci_cfg_read16(phb, 0, 0x78, &lctl2);
+ PHBINF(p, " LCTL2=%04x\n", lctl2);
+ lctl2 &= ~PCICAP_EXP_LCTL2_HWAUTSPDIS;
+ pci_cfg_write16(phb, 0, 0x78, lctl2);
+ }
+ lreg = in_be64(p->regs + PHB_PCIE_LINK_MANAGEMENT);
+ lreg = SETFIELD(PHB_PCIE_LM_TGT_SPEED, lreg, new_spd);
+ lreg |= PHB_PCIE_LM_CHG_SPEED;
+ out_be64(p->regs + PHB_PCIE_LINK_MANAGEMENT, lreg);
+ }
+}
+
+static int64_t phb3_pcicfg_rc_link_speed(void *dev,
+ struct pci_cfg_reg_filter *pcrf __unused,
+ uint32_t offset, uint32_t len,
+ uint32_t *data, bool write)
+{
+ struct pci_device *pd = dev;
+
+ /* Hack for link speed changes. We intercept attempts at writing
+ * the link control/status register
+ */
+ if (write && len == 4 && offset == 0x58) {
+ phb3_link_update(pd->phb, (*data) >> 16);
+ return OPAL_SUCCESS;
+ }
+ if (write && len == 2 && offset == 0x5a) {
+ phb3_link_update(pd->phb, *(uint16_t *)data);
+ return OPAL_SUCCESS;
+ }
+
+ return OPAL_PARTIAL;
+}
+
+#define PHB3_PCI_CFG_READ(size, type) \
+static int64_t phb3_pcicfg_read##size(struct phb *phb, uint32_t bdfn, \
+ uint32_t offset, type *data) \
+{ \
+ struct phb3 *p = phb_to_phb3(phb); \
+ uint64_t addr, val64; \
+ int64_t rc; \
+ uint8_t pe; \
+ bool use_asb = false; \
+ \
+ /* Initialize data in case of error */ \
+ *data = (type)0xffffffff; \
+ \
+ rc = phb3_pcicfg_check(p, bdfn, offset, sizeof(type), &pe); \
+ if (rc) \
+ return rc; \
+ \
+ if (p->flags & PHB3_AIB_FENCED) { \
+ if (!(p->flags & PHB3_CFG_USE_ASB)) \
+ return OPAL_HARDWARE; \
+ use_asb = true; \
+ } else if ((p->flags & PHB3_CFG_BLOCKED) && bdfn != 0) { \
+ return OPAL_HARDWARE; \
+ } \
+ \
+ rc = pci_handle_cfg_filters(phb, bdfn, offset, sizeof(type), \
+ (uint32_t *)data, false); \
+ if (rc != OPAL_PARTIAL) \
+ return rc; \
+ \
+ addr = PHB_CA_ENABLE; \
+ addr = SETFIELD(PHB_CA_BDFN, addr, bdfn); \
+ addr = SETFIELD(PHB_CA_REG, addr, offset); \
+ addr = SETFIELD(PHB_CA_PE, addr, pe); \
+ if (use_asb) { \
+ phb3_write_reg_asb(p, PHB_CONFIG_ADDRESS, addr); \
+ sync(); \
+ val64 = bswap_64(phb3_read_reg_asb(p, PHB_CONFIG_DATA)); \
+ *data = (type)(val64 >> (8 * (offset & (4 - sizeof(type))))); \
+ } else { \
+ out_be64(p->regs + PHB_CONFIG_ADDRESS, addr); \
+ *data = in_le##size(p->regs + PHB_CONFIG_DATA + \
+ (offset & (4 - sizeof(type)))); \
+ } \
+ \
+ return OPAL_SUCCESS; \
+}
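+
+/*
+ * In both the read and write accessors the "offset & (4 - sizeof(type))"
+ * term picks the byte lane within the 32-bit config data window: for
+ * instance, a 16-bit access at offset 0x6 uses lane 2, so the MMIO path
+ * accesses PHB_CONFIG_DATA + 2 while the ASB path shifts the 64-bit
+ * value by 16 bits.
+ */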
+
+#define PHB3_PCI_CFG_WRITE(size, type) \
+static int64_t phb3_pcicfg_write##size(struct phb *phb, uint32_t bdfn, \
+ uint32_t offset, type data) \
+{ \
+ struct phb3 *p = phb_to_phb3(phb); \
+ uint64_t addr, val64 = 0; \
+ int64_t rc; \
+ uint8_t pe; \
+ bool use_asb = false; \
+ \
+ rc = phb3_pcicfg_check(p, bdfn, offset, sizeof(type), &pe); \
+ if (rc) \
+ return rc; \
+ \
+ if (p->flags & PHB3_AIB_FENCED) { \
+ if (!(p->flags & PHB3_CFG_USE_ASB)) \
+ return OPAL_HARDWARE; \
+ use_asb = true; \
+ } else if ((p->flags & PHB3_CFG_BLOCKED) && bdfn != 0) { \
+ return OPAL_HARDWARE; \
+ } \
+ \
+ rc = pci_handle_cfg_filters(phb, bdfn, offset, sizeof(type), \
+ (uint32_t *)&data, true); \
+ if (rc != OPAL_PARTIAL) \
+ return rc; \
+ \
+ addr = PHB_CA_ENABLE; \
+ addr = SETFIELD(PHB_CA_BDFN, addr, bdfn); \
+ addr = SETFIELD(PHB_CA_REG, addr, offset); \
+ addr = SETFIELD(PHB_CA_PE, addr, pe); \
+ if (use_asb) { \
+ val64 = data; \
+ val64 = bswap_64(val64 << 8 * (offset & (4 - sizeof(type)))); \
+ phb3_write_reg_asb(p, PHB_CONFIG_ADDRESS, addr); \
+ sync(); \
+ phb3_write_reg_asb(p, PHB_CONFIG_DATA, val64); \
+ } else { \
+ out_be64(p->regs + PHB_CONFIG_ADDRESS, addr); \
+ out_le##size(p->regs + PHB_CONFIG_DATA + \
+ (offset & (4 - sizeof(type))), data); \
+ } \
+ \
+ return OPAL_SUCCESS; \
+}
+
+PHB3_PCI_CFG_READ(8, u8)
+PHB3_PCI_CFG_READ(16, u16)
+PHB3_PCI_CFG_READ(32, u32)
+PHB3_PCI_CFG_WRITE(8, u8)
+PHB3_PCI_CFG_WRITE(16, u16)
+PHB3_PCI_CFG_WRITE(32, u32)
+
+static int64_t phb3_get_reserved_pe_number(struct phb *phb __unused)
+{
+ return PHB3_RESERVED_PE_NUM;
+}
+
+static inline void phb3_enable_ecrc(struct phb *phb, bool enable)
+{
+ struct phb3 *p = phb_to_phb3(phb);
+ uint32_t ctl;
+
+ if (p->aercap <= 0)
+ return;
+
+ pci_cfg_read32(phb, 0, p->aercap + PCIECAP_AER_CAPCTL, &ctl);
+ if (enable) {
+ ctl |= (PCIECAP_AER_CAPCTL_ECRCG_EN |
+ PCIECAP_AER_CAPCTL_ECRCC_EN);
+ } else {
+ ctl &= ~(PCIECAP_AER_CAPCTL_ECRCG_EN |
+ PCIECAP_AER_CAPCTL_ECRCC_EN);
+ }
+
+ pci_cfg_write32(phb, 0, p->aercap + PCIECAP_AER_CAPCTL, ctl);
+}
+
+static void phb3_root_port_init(struct phb *phb, struct pci_device *dev,
+ int ecap, int aercap)
+{
+ struct phb3 *p = phb_to_phb3(phb);
+ uint16_t bdfn = dev->bdfn;
+ uint16_t val16;
+ uint32_t val32;
+
+ /* Use PHB's callback so that the UTL events will be masked
+ * or unmasked when the link is down or up.
+ */
+ if (dev->slot && dev->slot->ops.prepare_link_change &&
+ phb->slot && phb->slot->ops.prepare_link_change)
+ dev->slot->ops.prepare_link_change =
+ phb->slot->ops.prepare_link_change;
+
+ /* Mask UTL link down event if root slot supports surprise
+ * hotplug as the event should be handled by hotplug driver
+ * instead of EEH subsystem.
+ */
+ if (dev->slot && dev->slot->surprise_pluggable)
+ out_be64(p->regs + UTL_PCIE_PORT_IRQ_EN, 0xad42800000000000UL);
+
+ /* Enable SERR and parity checking */
+ pci_cfg_read16(phb, bdfn, PCI_CFG_CMD, &val16);
+ val16 |= (PCI_CFG_CMD_SERR_EN | PCI_CFG_CMD_PERR_RESP);
+ pci_cfg_write16(phb, bdfn, PCI_CFG_CMD, val16);
+
+ /* Enable reporting various errors */
+ if (!ecap) return;
+ pci_cfg_read16(phb, bdfn, ecap + PCICAP_EXP_DEVCTL, &val16);
+ val16 |= (PCICAP_EXP_DEVCTL_CE_REPORT |
+ PCICAP_EXP_DEVCTL_NFE_REPORT |
+ PCICAP_EXP_DEVCTL_FE_REPORT |
+ PCICAP_EXP_DEVCTL_UR_REPORT);
+ pci_cfg_write16(phb, bdfn, ecap + PCICAP_EXP_DEVCTL, val16);
+
+ if (!aercap) return;
+
+ /* Mask various unrecoverable errors. The link surprise down
+ * event should be masked when its PCI slot supports surprise
+ * hotplug, since it should be handled by the PCI hotplug driver
+ * instead of the EEH subsystem.
+ */
+ pci_cfg_read32(phb, bdfn, aercap + PCIECAP_AER_UE_MASK, &val32);
+ val32 |= (PCIECAP_AER_UE_MASK_POISON_TLP |
+ PCIECAP_AER_UE_MASK_COMPL_TIMEOUT |
+ PCIECAP_AER_UE_MASK_COMPL_ABORT |
+ PCIECAP_AER_UE_MASK_ECRC);
+ if (dev->slot && dev->slot->surprise_pluggable)
+ val32 |= PCIECAP_AER_UE_MASK_SURPRISE_DOWN;
+ pci_cfg_write32(phb, bdfn, aercap + PCIECAP_AER_UE_MASK, val32);
+
+ /* Report various unrecoverable errors as fatal errors */
+ pci_cfg_read32(phb, bdfn, aercap + PCIECAP_AER_UE_SEVERITY, &val32);
+ val32 |= (PCIECAP_AER_UE_SEVERITY_DLLP |
+ PCIECAP_AER_UE_SEVERITY_SURPRISE_DOWN |
+ PCIECAP_AER_UE_SEVERITY_FLOW_CTL_PROT |
+ PCIECAP_AER_UE_SEVERITY_UNEXP_COMPL |
+ PCIECAP_AER_UE_SEVERITY_RECV_OVFLOW |
+ PCIECAP_AER_UE_SEVERITY_MALFORMED_TLP);
+ pci_cfg_write32(phb, bdfn, aercap + PCIECAP_AER_UE_SEVERITY, val32);
+
+ /* Mask various recoverable errors */
+ pci_cfg_read32(phb, bdfn, aercap + PCIECAP_AER_CE_MASK, &val32);
+ val32 |= PCIECAP_AER_CE_MASK_ADV_NONFATAL;
+ pci_cfg_write32(phb, bdfn, aercap + PCIECAP_AER_CE_MASK, val32);
+
+ /* Enable ECRC check */
+ phb3_enable_ecrc(phb, true);
+
+ /* Enable all error reporting */
+ pci_cfg_read32(phb, bdfn, aercap + PCIECAP_AER_RERR_CMD, &val32);
+ val32 |= (PCIECAP_AER_RERR_CMD_FE |
+ PCIECAP_AER_RERR_CMD_NFE |
+ PCIECAP_AER_RERR_CMD_CE);
+ pci_cfg_write32(phb, bdfn, aercap + PCIECAP_AER_RERR_CMD, val32);
+}
+
+static void phb3_switch_port_init(struct phb *phb,
+ struct pci_device *dev,
+ int ecap, int aercap)
+{
+ struct phb3 *p = phb_to_phb3(phb);
+ uint16_t bdfn = dev->bdfn;
+ uint16_t val16;
+ uint32_t val32;
+
+ /* Enable SERR and parity checking and disable INTx */
+ pci_cfg_read16(phb, bdfn, PCI_CFG_CMD, &val16);
+ val16 |= (PCI_CFG_CMD_PERR_RESP |
+ PCI_CFG_CMD_SERR_EN |
+ PCI_CFG_CMD_INTx_DIS);
+ pci_cfg_write16(phb, bdfn, PCI_CFG_CMD, val16);
+
+ /* Disable parity error and enable system error */
+ pci_cfg_read16(phb, bdfn, PCI_CFG_BRCTL, &val16);
+ val16 &= ~PCI_CFG_BRCTL_PERR_RESP_EN;
+ val16 |= PCI_CFG_BRCTL_SERR_EN;
+ pci_cfg_write16(phb, bdfn, PCI_CFG_BRCTL, val16);
+
+ /* Enable reporting various errors */
+ if (!ecap) return;
+ pci_cfg_read16(phb, bdfn, ecap + PCICAP_EXP_DEVCTL, &val16);
+ val16 |= (PCICAP_EXP_DEVCTL_CE_REPORT |
+ PCICAP_EXP_DEVCTL_NFE_REPORT |
+ PCICAP_EXP_DEVCTL_FE_REPORT);
+ /* HW279570 - Disable reporting of correctable errors */
+ val16 &= ~PCICAP_EXP_DEVCTL_CE_REPORT;
+ pci_cfg_write16(phb, bdfn, ecap + PCICAP_EXP_DEVCTL, val16);
+
+ /* Unmask all unrecoverable errors for the upstream port. For
+ * downstream ports, surprise link down is masked because it
+ * should be handled by the hotplug driver instead of the EEH
+ * subsystem.
+ */
+ if (!aercap) return;
+ if (dev->dev_type == PCIE_TYPE_SWITCH_DNPORT &&
+ dev->slot && dev->slot->surprise_pluggable)
+ pci_cfg_write32(phb, bdfn, aercap + PCIECAP_AER_UE_MASK,
+ PCIECAP_AER_UE_MASK_SURPRISE_DOWN);
+ else
+ pci_cfg_write32(phb, bdfn, aercap + PCIECAP_AER_UE_MASK, 0x0);
+
+ /* Severity of unrecoverable errors */
+ if (dev->dev_type == PCIE_TYPE_SWITCH_UPPORT)
+ val32 = (PCIECAP_AER_UE_SEVERITY_DLLP |
+ PCIECAP_AER_UE_SEVERITY_SURPRISE_DOWN |
+ PCIECAP_AER_UE_SEVERITY_FLOW_CTL_PROT |
+ PCIECAP_AER_UE_SEVERITY_RECV_OVFLOW |
+ PCIECAP_AER_UE_SEVERITY_MALFORMED_TLP |
+ PCIECAP_AER_UE_SEVERITY_INTERNAL);
+ else
+ val32 = (PCIECAP_AER_UE_SEVERITY_FLOW_CTL_PROT |
+ PCIECAP_AER_UE_SEVERITY_INTERNAL);
+ pci_cfg_write32(phb, bdfn, aercap + PCIECAP_AER_UE_SEVERITY, val32);
+
+ /*
+ * Mask various correctable errors
+ *
+ * On Murano and Venice DD1.0 we disable emission of corrected
+ * error messages to the PHB completely to work around erratum
+ * HW257476, which causes the loss of tags.
+ */
+ if (p->rev < PHB3_REV_MURANO_DD20)
+ val32 = 0xffffffff;
+ else
+ val32 = PCIECAP_AER_CE_MASK_ADV_NONFATAL;
+ pci_cfg_write32(phb, bdfn, aercap + PCIECAP_AER_CE_MASK, val32);
+
+ /* Enable ECRC generation and disable ECRC check */
+ pci_cfg_read32(phb, bdfn, aercap + PCIECAP_AER_CAPCTL, &val32);
+ val32 |= PCIECAP_AER_CAPCTL_ECRCG_EN;
+ val32 &= ~PCIECAP_AER_CAPCTL_ECRCC_EN;
+ pci_cfg_write32(phb, bdfn, aercap + PCIECAP_AER_CAPCTL, val32);
+}
+
+static void phb3_endpoint_init(struct phb *phb,
+ struct pci_device *dev,
+ int ecap, int aercap)
+{
+ struct phb3 *p = phb_to_phb3(phb);
+ uint16_t bdfn = dev->bdfn;
+ uint16_t val16;
+ uint32_t val32;
+
+ /* Enable SERR and parity checking */
+ pci_cfg_read16(phb, bdfn, PCI_CFG_CMD, &val16);
+ val16 |= (PCI_CFG_CMD_PERR_RESP |
+ PCI_CFG_CMD_SERR_EN);
+ pci_cfg_write16(phb, bdfn, PCI_CFG_CMD, val16);
+
+ /* Enable reporting various errors */
+ if (!ecap) return;
+ pci_cfg_read16(phb, bdfn, ecap + PCICAP_EXP_DEVCTL, &val16);
+ val16 &= ~PCICAP_EXP_DEVCTL_CE_REPORT;
+ val16 |= (PCICAP_EXP_DEVCTL_NFE_REPORT |
+ PCICAP_EXP_DEVCTL_FE_REPORT |
+ PCICAP_EXP_DEVCTL_UR_REPORT);
+ /* HW279570 - Disable reporting of correctable errors */
+ val16 &= ~PCICAP_EXP_DEVCTL_CE_REPORT;
+ pci_cfg_write16(phb, bdfn, ecap + PCICAP_EXP_DEVCTL, val16);
+
+ /*
+ * On Murano and Venice DD1.0 we disable emission of corrected
+ * error messages to the PHB completely to work around erratum
+ * HW257476, which causes the loss of tags.
+ */
+ if (p->rev < PHB3_REV_MURANO_DD20)
+ pci_cfg_write32(phb, bdfn, aercap + PCIECAP_AER_CE_MASK,
+ 0xffffffff);
+
+ /* Enable ECRC generation and check */
+ pci_cfg_read32(phb, bdfn, aercap + PCIECAP_AER_CAPCTL, &val32);
+ val32 |= (PCIECAP_AER_CAPCTL_ECRCG_EN |
+ PCIECAP_AER_CAPCTL_ECRCC_EN);
+ pci_cfg_write32(phb, bdfn, aercap + PCIECAP_AER_CAPCTL, val32);
+}
+
+static int64_t phb3_pcicfg_no_dstate(void *dev __unused,
+ struct pci_cfg_reg_filter *pcrf,
+ uint32_t offset, uint32_t len __unused,
+ uint32_t *data __unused, bool write)
+{
+ uint32_t loff = offset - pcrf->start;
+
+ /* Disable D-state change on children of the PHB. For now we
+ * simply block all writes to the PM control/status
+ */
+ if (write && loff >= 4 && loff < 6)
+ return OPAL_SUCCESS;
+
+ return OPAL_PARTIAL;
+}
+
+static void phb3_check_device_quirks(struct phb *phb, struct pci_device *dev)
+{
+ struct phb3 *p = phb_to_phb3(phb);
+
+ if (dev->primary_bus != 0 &&
+ dev->primary_bus != 1)
+ return;
+
+ if (dev->primary_bus == 1) {
+ u64 modectl;
+
+ /*
+ * For these adapters, if they are directly under the PHB, we
+ * adjust the disable_wr_scope_group bit for performance
+ *
+ * 15b3:1003 Mellanox Travis3-EN (CX3)
+ * 15b3:1011 Mellanox HydePark (ConnectIB)
+ * 15b3:1013 Mellanox GlacierPark (CX4)
+ */
+ xscom_read(p->chip_id, p->pe_xscom + 0x0b, &modectl);
+ if (PCI_VENDOR_ID(dev->vdid) == 0x15b3 &&
+ (PCI_DEVICE_ID(dev->vdid) == 0x1003 ||
+ PCI_DEVICE_ID(dev->vdid) == 0x1011 ||
+ PCI_DEVICE_ID(dev->vdid) == 0x1013))
+ modectl |= PPC_BIT(14);
+ else
+ modectl &= ~PPC_BIT(14);
+ xscom_write(p->chip_id, p->pe_xscom + 0x0b, modectl);
+
+ /*
+ * Naples has a problem with D-states, at least on Mellanox CX4,
+ * so disable changing the D-state on Naples like we do for PHB4.
+ */
+ if (PHB3_IS_NAPLES(p) &&
+ pci_has_cap(dev, PCI_CFG_CAP_ID_PM, false)) {
+ pci_add_cfg_reg_filter(dev,
+ pci_cap(dev, PCI_CFG_CAP_ID_PM, false),
+ 8,
+ PCI_REG_FLAG_WRITE,
+ phb3_pcicfg_no_dstate);
+ }
+ } else if (dev->primary_bus == 0) {
+ /*
+ * Emulate the prefetchable window of the root port
+ * when the corresponding HW registers are readonly.
+ *
+ * 1014:03dc Root port on P8/P8E/P8NVL
+ */
+ if (PCI_VENDOR_ID(dev->vdid) == 0x1014 &&
+ PCI_DEVICE_ID(dev->vdid) == 0x03dc) {
+ uint32_t pref_hi, tmp;
+
+ pci_cfg_read32(phb, dev->bdfn,
+ PCI_CFG_PREF_MEM_BASE_U32, &pref_hi);
+ pci_cfg_write32(phb, dev->bdfn,
+ PCI_CFG_PREF_MEM_BASE_U32, ~pref_hi);
+ pci_cfg_read32(phb, dev->bdfn,
+ PCI_CFG_PREF_MEM_BASE_U32, &tmp);
+ pci_cfg_write32(phb, dev->bdfn,
+ PCI_CFG_PREF_MEM_BASE_U32, pref_hi);
+ if (tmp == pref_hi)
+ pci_add_cfg_reg_filter(dev,
+ PCI_CFG_PREF_MEM_BASE_U32, 12,
+ PCI_REG_FLAG_READ | PCI_REG_FLAG_WRITE,
+ phb3_pcicfg_rc_pref_window);
+ /* Add filter to control link speed */
+ pci_add_cfg_reg_filter(dev,
+ 0x58, 4,
+ PCI_REG_FLAG_WRITE,
+ phb3_pcicfg_rc_link_speed);
+ }
+ }
+}
+
+static inline int phb3_should_disable_ecrc(struct pci_device *pd)
+{
+ /*
+ * When we have a PMC PCIe switch, we need to disable ECRC on the
+ * root port. Otherwise, the adapters behind the switch downstream
+ * ports might not be probed successfully.
+ */
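+ /* vdid 0x854611f8 packs device ID 0x8546 with vendor ID 0x11f8 (PMC-Sierra) */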
+ if (pd->vdid == 0x854611f8)
+ return true;
+
+ return false;
+}
+
+static int phb3_device_init(struct phb *phb,
+ struct pci_device *dev,
+ void *data)
+{
+ struct phb3 *p = phb_to_phb3(phb);
+ int ecap, aercap;
+
+ /* Some special adapter tweaks for devices directly under the PHB */
+ phb3_check_device_quirks(phb, dev);
+
+ /* Common initialization for the device */
+ pci_device_init(phb, dev);
+
+ ecap = pci_cap(dev, PCI_CFG_CAP_ID_EXP, false);
+ aercap = pci_cap(dev, PCIECAP_ID_AER, true);
+ if (dev->dev_type == PCIE_TYPE_ROOT_PORT)
+ phb3_root_port_init(phb, dev, ecap, aercap);
+ else if (dev->dev_type == PCIE_TYPE_SWITCH_UPPORT ||
+ dev->dev_type == PCIE_TYPE_SWITCH_DNPORT)
+ phb3_switch_port_init(phb, dev, ecap, aercap);
+ else
+ phb3_endpoint_init(phb, dev, ecap, aercap);
+
+ /*
+ * Check if we need to disable ECRC functionality on the root port.
+ * This only happens when the PCI topology changes, meaning it's
+ * skipped when reinitializing a PCI device after an EEH reset.
+ */
+ if (!data && phb3_should_disable_ecrc(dev)) {
+ if (p->no_ecrc_devs++ == 0)
+ phb3_enable_ecrc(phb, false);
+ }
+
+ return 0;
+}
+
+static void phb3_device_remove(struct phb *phb, struct pci_device *pd)
+{
+ struct phb3 *p = phb_to_phb3(phb);
+
+ if (!phb3_should_disable_ecrc(pd) || p->no_ecrc_devs == 0)
+ return;
+
+ if (--p->no_ecrc_devs == 0)
+ phb3_enable_ecrc(phb, true);
+}
+
+static int64_t phb3_pci_reinit(struct phb *phb, uint64_t scope, uint64_t data)
+{
+ struct pci_device *pd;
+ uint16_t bdfn = data;
+ int ret;
+
+ if (scope != OPAL_REINIT_PCI_DEV)
+ return OPAL_PARAMETER;
+
+ pd = pci_find_dev(phb, bdfn);
+ if (!pd)
+ return OPAL_PARAMETER;
+
+ ret = phb3_device_init(phb, pd, pd);
+ if (ret)
+ return OPAL_HARDWARE;
+
+ return OPAL_SUCCESS;
+}
+
+/* Clear IODA cache tables */
+static void phb3_init_ioda_cache(struct phb3 *p)
+{
+ uint32_t i;
+ uint64_t *data64;
+
+ /*
+ * RTT and PELTV. RTE should be 0xFF's to indicate
+ * invalid PE# for the corresponding RID.
+ *
+ * Note: Instead we set all RTE entries to 0x00 to
+ * work around a problem where PE lookups might be
+ * done before Linux has established valid PE's
+ * (during PCI probing). We can revisit that once/if
+ * Linux has been fixed to always setup valid PEs.
+ *
+ * The value 0x00 corresponds to the default PE# Linux
+ * uses to check for config space freezes before it
+ * has assigned PE# to busses.
+ *
+ * WARNING: Additionally, we need to be careful: there's
+ * a HW issue where, if we get an MSI on an RTT entry that
+ * is FF, things will go bad. We need to ensure we never
+ * leave a live FF RTT entry, even temporarily, when
+ * resetting for EEH etc... (HW278969).
+ */
+ for (i = 0; i < ARRAY_SIZE(p->rte_cache); i++)
+ p->rte_cache[i] = PHB3_RESERVED_PE_NUM;
+ memset(p->peltv_cache, 0x0, sizeof(p->peltv_cache));
+
+ /* Disable all LSI */
+ for (i = 0; i < ARRAY_SIZE(p->lxive_cache); i++) {
+ data64 = &p->lxive_cache[i];
+ *data64 = SETFIELD(IODA2_LXIVT_PRIORITY, 0ul, 0xff);
+ *data64 = SETFIELD(IODA2_LXIVT_SERVER, *data64, 0x0);
+ }
+
+ /* Disable all MSIs */
+ for (i = 0; i < ARRAY_SIZE(p->ive_cache); i++) {
+ data64 = &p->ive_cache[i];
+ *data64 = SETFIELD(IODA2_IVT_PRIORITY, 0ul, 0xff);
+ *data64 = SETFIELD(IODA2_IVT_SERVER, *data64, 0x0);
+ }
+
+ /* Clear TVT */
+ memset(p->tve_cache, 0x0, sizeof(p->tve_cache));
+ /* Clear M32 domain */
+ memset(p->m32d_cache, 0x0, sizeof(p->m32d_cache));
+ /* Clear M64 domain */
+ memset(p->m64b_cache, 0x0, sizeof(p->m64b_cache));
+}
+
+/* phb3_ioda_reset - Reset the IODA tables
+ *
+ * @purge: If true, the cache is cleared and the cleared values
+ * are applied to HW. If false, the cached values are
+ * applied to HW
+ *
+ * This resets the IODA tables in the PHB. It is called at
+ * initialization time, on PHB reset, and can be called
+ * explicitly from OPAL.
+ */
+static int64_t phb3_ioda_reset(struct phb *phb, bool purge)
+{
+ struct phb3 *p = phb_to_phb3(phb);
+ uint64_t server, prio;
+ uint64_t *pdata64, data64;
+ uint32_t i;
+
+ if (purge) {
+ prlog(PR_DEBUG, "PHB%x: Purging all IODA tables...\n",
+ p->phb.opal_id);
+ phb3_init_ioda_cache(p);
+ }
+
+ /* Init_27..28 - LIXVT */
+ phb3_ioda_sel(p, IODA2_TBL_LXIVT, 0, true);
+ for (i = 0; i < ARRAY_SIZE(p->lxive_cache); i++) {
+ data64 = p->lxive_cache[i];
+ server = GETFIELD(IODA2_LXIVT_SERVER, data64);
+ prio = GETFIELD(IODA2_LXIVT_PRIORITY, data64);
+ data64 = SETFIELD(IODA2_LXIVT_SERVER, data64, server);
+ data64 = SETFIELD(IODA2_LXIVT_PRIORITY, data64, prio);
+ out_be64(p->regs + PHB_IODA_DATA0, data64);
+ }
+
+ /* Init_29..30 - MRT */
+ phb3_ioda_sel(p, IODA2_TBL_MRT, 0, true);
+ for (i = 0; i < 8; i++)
+ out_be64(p->regs + PHB_IODA_DATA0, 0);
+
+ /* Init_31..32 - TVT */
+ phb3_ioda_sel(p, IODA2_TBL_TVT, 0, true);
+ for (i = 0; i < ARRAY_SIZE(p->tve_cache); i++)
+ out_be64(p->regs + PHB_IODA_DATA0, p->tve_cache[i]);
+
+ /* Init_33..34 - M64BT */
+ phb3_ioda_sel(p, IODA2_TBL_M64BT, 0, true);
+ for (i = 0; i < ARRAY_SIZE(p->m64b_cache); i++)
+ out_be64(p->regs + PHB_IODA_DATA0, p->m64b_cache[i]);
+
+ /* Init_35..36 - M32DT */
+ phb3_ioda_sel(p, IODA2_TBL_M32DT, 0, true);
+ for (i = 0; i < ARRAY_SIZE(p->m32d_cache); i++)
+ out_be64(p->regs + PHB_IODA_DATA0, p->m32d_cache[i]);
+
+ /* Load RTE, PELTV */
+ if (p->tbl_rtt)
+ memcpy((void *)p->tbl_rtt, p->rte_cache, RTT_TABLE_SIZE);
+ if (p->tbl_peltv)
+ memcpy((void *)p->tbl_peltv, p->peltv_cache, PELTV_TABLE_SIZE);
+
+ /* Load IVT */
+ if (p->tbl_ivt) {
+ pdata64 = (uint64_t *)p->tbl_ivt;
+ for (i = 0; i < IVT_TABLE_ENTRIES; i++)
+ pdata64[i * IVT_TABLE_STRIDE] = p->ive_cache[i];
+ }
+
+ /* Invalidate RTE, IVE, TCE cache */
+ out_be64(p->regs + PHB_RTC_INVALIDATE, PHB_RTC_INVALIDATE_ALL);
+ out_be64(p->regs + PHB_IVC_INVALIDATE, PHB_IVC_INVALIDATE_ALL);
+ out_be64(p->regs + PHB_TCE_KILL, PHB_TCE_KILL_ALL);
+
+ /* Clear RBA */
+ if (p->rev >= PHB3_REV_MURANO_DD20) {
+ phb3_ioda_sel(p, IODA2_TBL_RBA, 0, true);
+ for (i = 0; i < 32; i++)
+ out_be64(p->regs + PHB_IODA_DATA0, 0x0ul);
+ }
+
+ /* Clear PEST & PEEV */
+ for (i = 0; i < PHB3_MAX_PE_NUM; i++) {
+ uint64_t pesta, pestb;
+
+ phb3_ioda_sel(p, IODA2_TBL_PESTA, i, false);
+ pesta = in_be64(p->regs + PHB_IODA_DATA0);
+ out_be64(p->regs + PHB_IODA_DATA0, 0);
+ phb3_ioda_sel(p, IODA2_TBL_PESTB, i, false);
+ pestb = in_be64(p->regs + PHB_IODA_DATA0);
+ out_be64(p->regs + PHB_IODA_DATA0, 0);
+
+ if ((pesta & IODA2_PESTA_MMIO_FROZEN) ||
+ (pestb & IODA2_PESTB_DMA_STOPPED))
+ PHBDBG(p, "Frozen PE#%x (%s - %s)\n",
+ i, (pesta & IODA2_PESTA_MMIO_FROZEN) ? "DMA" : "",
+ (pestb & IODA2_PESTB_DMA_STOPPED) ? "MMIO" : "");
+ }
+
+ phb3_ioda_sel(p, IODA2_TBL_PEEV, 0, true);
+ for (i = 0; i < 4; i++)
+ out_be64(p->regs + PHB_IODA_DATA0, 0);
+
+ return OPAL_SUCCESS;
+}
+
+/*
+ * Clear anything we have in the PAPR Error Injection registers. The
+ * spec says PAPR error injection should be one-shot, without a
+ * "sticky" bit, but that's false according to the experiments
+ * I ran. So we have to clear it at an appropriate point in the
+ * kernel to avoid an endlessly frozen PE.
+ */
+static int64_t phb3_papr_errinjct_reset(struct phb *phb)
+{
+ struct phb3 *p = phb_to_phb3(phb);
+
+ out_be64(p->regs + PHB_PAPR_ERR_INJ_CTL, 0x0ul);
+ out_be64(p->regs + PHB_PAPR_ERR_INJ_ADDR, 0x0ul);
+ out_be64(p->regs + PHB_PAPR_ERR_INJ_MASK, 0x0ul);
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t phb3_set_phb_mem_window(struct phb *phb,
+ uint16_t window_type,
+ uint16_t window_num,
+ uint64_t addr,
+ uint64_t __unused pci_addr,
+ uint64_t size)
+{
+ struct phb3 *p = phb_to_phb3(phb);
+ uint64_t data64;
+
+ /*
+ * By design, PHB3 doesn't support IODT any more.
+ * Besides, we can't enable the M32 BAR here either. So
+ * this function is only used for M64 mapping, and each
+ * BAR is supposed to be shared by all PEs.
+ */
+ switch (window_type) {
+ case OPAL_IO_WINDOW_TYPE:
+ case OPAL_M32_WINDOW_TYPE:
+ return OPAL_UNSUPPORTED;
+ case OPAL_M64_WINDOW_TYPE:
+ if (window_num >= 16)
+ return OPAL_PARAMETER;
+
+ data64 = p->m64b_cache[window_num];
+ if (data64 & IODA2_M64BT_SINGLE_PE) {
+ if ((addr & 0x1FFFFFFul) ||
+ (size & 0x1FFFFFFul))
+ return OPAL_PARAMETER;
+ } else {
+ if ((addr & 0xFFFFFul) ||
+ (size & 0xFFFFFul))
+ return OPAL_PARAMETER;
+ }
+
+ /* size should be 2^N */
+ if (!size || size & (size-1))
+ return OPAL_PARAMETER;
+
+ /* address should be size aligned */
+ if (addr & (size - 1))
+ return OPAL_PARAMETER;
+
+ break;
+ default:
+ return OPAL_PARAMETER;
+ }
+
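+ /*
+ * The base/mask granularity is 32MB for single-PE BARs and 1MB for
+ * shared BARs; for example, a shared 256MB window has size >> 20 =
+ * 0x100, giving a mask field of 0x40000000 - 0x100 = 0x3FFFFF00.
+ */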
+ if (data64 & IODA2_M64BT_SINGLE_PE) {
+ data64 = SETFIELD(IODA2_M64BT_SINGLE_BASE, data64,
+ addr >> 25);
+ data64 = SETFIELD(IODA2_M64BT_SINGLE_MASK, data64,
+ 0x20000000 - (size >> 25));
+ } else {
+ data64 = SETFIELD(IODA2_M64BT_BASE, data64,
+ addr >> 20);
+ data64 = SETFIELD(IODA2_M64BT_MASK, data64,
+ 0x40000000 - (size >> 20));
+ }
+ p->m64b_cache[window_num] = data64;
+
+ return OPAL_SUCCESS;
+}
+
+/*
+ * For one specific M64 BAR, it can be shared by all PEs,
+ * or owned by single PE exclusively.
+ */
+static int64_t phb3_phb_mmio_enable(struct phb *phb,
+ uint16_t window_type,
+ uint16_t window_num,
+ uint16_t enable)
+{
+ struct phb3 *p = phb_to_phb3(phb);
+ uint64_t data64, base, mask;
+
+ /*
+ * By design, PHB3 doesn't support IODT any more.
+ * Besides, we can't enable the M32 BAR here either. So
+ * this function is only used for M64 mapping, and each
+ * BAR is supposed to be shared by all PEs.
+ */
+ switch (window_type) {
+ case OPAL_IO_WINDOW_TYPE:
+ case OPAL_M32_WINDOW_TYPE:
+ return OPAL_UNSUPPORTED;
+ case OPAL_M64_WINDOW_TYPE:
+ if (window_num >= 16 ||
+ enable > OPAL_ENABLE_M64_NON_SPLIT)
+ return OPAL_PARAMETER;
+ break;
+ default:
+ return OPAL_PARAMETER;
+ }
+
+ /*
+ * We need to check the base/mask while enabling
+ * the M64 BAR. Otherwise, an invalid base/mask
+ * might fence the AIB unintentionally.
+ */
+ data64 = p->m64b_cache[window_num];
+ switch (enable) {
+ case OPAL_DISABLE_M64:
+ data64 &= ~IODA2_M64BT_SINGLE_PE;
+ data64 &= ~IODA2_M64BT_ENABLE;
+ break;
+ case OPAL_ENABLE_M64_SPLIT:
+ if (data64 & IODA2_M64BT_SINGLE_PE)
+ return OPAL_PARAMETER;
+ base = GETFIELD(IODA2_M64BT_BASE, data64);
+ base = (base << 20);
+ mask = GETFIELD(IODA2_M64BT_MASK, data64);
+ if (base < p->mm0_base || !mask)
+ return OPAL_PARTIAL;
+
+ data64 |= IODA2_M64BT_ENABLE;
+ break;
+ case OPAL_ENABLE_M64_NON_SPLIT:
+ if (!(data64 & IODA2_M64BT_SINGLE_PE))
+ return OPAL_PARAMETER;
+ base = GETFIELD(IODA2_M64BT_SINGLE_BASE, data64);
+ base = (base << 25);
+ mask = GETFIELD(IODA2_M64BT_SINGLE_MASK, data64);
+ if (base < p->mm0_base || !mask)
+ return OPAL_PARTIAL;
+
+ data64 |= IODA2_M64BT_SINGLE_PE;
+ data64 |= IODA2_M64BT_ENABLE;
+ break;
+ }
+
+ /* Update HW and cache */
+ phb3_ioda_sel(p, IODA2_TBL_M64BT, window_num, false);
+ out_be64(p->regs + PHB_IODA_DATA0, data64);
+ p->m64b_cache[window_num] = data64;
+ return OPAL_SUCCESS;
+}
+
+static int64_t phb3_map_pe_mmio_window(struct phb *phb,
+ uint64_t pe_number,
+ uint16_t window_type,
+ uint16_t window_num,
+ uint16_t segment_num)
+{
+ struct phb3 *p = phb_to_phb3(phb);
+ uint64_t data64, *cache;
+
+ if (pe_number >= PHB3_MAX_PE_NUM)
+ return OPAL_PARAMETER;
+
+ /*
+ * PHB3 doesn't support IODT any more. On the other
+ * hand, PHB3 supports M64DT with much more flexibility;
+ * we need to figure that out later. At least, we never
+ * use M64DT in the kernel.
+ */
+ switch(window_type) {
+ case OPAL_IO_WINDOW_TYPE:
+ return OPAL_UNSUPPORTED;
+ case OPAL_M32_WINDOW_TYPE:
+ if (window_num != 0 || segment_num >= PHB3_MAX_PE_NUM)
+ return OPAL_PARAMETER;
+
+ cache = &p->m32d_cache[segment_num];
+ phb3_ioda_sel(p, IODA2_TBL_M32DT, segment_num, false);
+ out_be64(p->regs + PHB_IODA_DATA0,
+ SETFIELD(IODA2_M32DT_PE, 0ull, pe_number));
+ *cache = SETFIELD(IODA2_M32DT_PE, 0ull, pe_number);
+
+ break;
+ case OPAL_M64_WINDOW_TYPE:
+ if (window_num >= 16)
+ return OPAL_PARAMETER;
+ cache = &p->m64b_cache[window_num];
+ data64 = *cache;
+
+ /* The BAR shouldn't be enabled yet */
+ if (data64 & IODA2_M64BT_ENABLE)
+ return OPAL_PARTIAL;
+
+ data64 |= IODA2_M64BT_SINGLE_PE;
+ data64 = SETFIELD(IODA2_M64BT_PE_HI, data64, pe_number >> 5);
+ data64 = SETFIELD(IODA2_M64BT_PE_LOW, data64, pe_number);
+ *cache = data64;
+
+ break;
+ default:
+ return OPAL_PARAMETER;
+ }
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t phb3_map_pe_dma_window(struct phb *phb,
+ uint64_t pe_number,
+ uint16_t window_id,
+ uint16_t tce_levels,
+ uint64_t tce_table_addr,
+ uint64_t tce_table_size,
+ uint64_t tce_page_size)
+{
+ struct phb3 *p = phb_to_phb3(phb);
+ uint64_t tts_encoded;
+ uint64_t data64 = 0;
+
+ /*
+ * Sanity check. We currently only support "2 windows per PE" mode,
+ * i.e. only bit 59 of the PCI address is used to select the window.
+ */
+ if (pe_number >= PHB3_MAX_PE_NUM ||
+ (window_id >> 1) != pe_number)
+ return OPAL_PARAMETER;
+
+ /*
+ * tce_table_size == 0 is used to disable an entry, in this case
+ * we ignore other arguments
+ */
+ if (tce_table_size == 0) {
+ phb3_ioda_sel(p, IODA2_TBL_TVT, window_id, false);
+ out_be64(p->regs + PHB_IODA_DATA0, 0);
+ p->tve_cache[window_id] = 0;
+ return OPAL_SUCCESS;
+ }
+
+ /* Additional arguments validation */
+ if (tce_levels < 1 || tce_levels > 5 ||
+ !is_pow2(tce_table_size) ||
+ tce_table_size < 0x1000)
+ return OPAL_PARAMETER;
+
+ /* Encode TCE table size */
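+ /*
+ * The encoded size is log2(bytes) - 11, so the minimum 4KB table
+ * encodes as 1 and, e.g., a 64KB table encodes as 5.
+ */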
+ data64 = SETFIELD(IODA2_TVT_TABLE_ADDR, 0ul, tce_table_addr >> 12);
+ tts_encoded = ilog2(tce_table_size) - 11;
+ if (tts_encoded > 31)
+ return OPAL_PARAMETER;
+ data64 = SETFIELD(IODA2_TVT_TCE_TABLE_SIZE, data64, tts_encoded);
+
+ /* Encode TCE page size */
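+ /* The IO_PSIZE values below follow log2(page size) - 11 */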
+ switch (tce_page_size) {
+ case 0x1000: /* 4K */
+ data64 = SETFIELD(IODA2_TVT_IO_PSIZE, data64, 1);
+ break;
+ case 0x10000: /* 64K */
+ data64 = SETFIELD(IODA2_TVT_IO_PSIZE, data64, 5);
+ break;
+ case 0x1000000: /* 16M */
+ data64 = SETFIELD(IODA2_TVT_IO_PSIZE, data64, 13);
+ break;
+ case 0x10000000: /* 256M */
+ data64 = SETFIELD(IODA2_TVT_IO_PSIZE, data64, 17);
+ break;
+ default:
+ return OPAL_PARAMETER;
+ }
+
+ /* Encode number of levels */
+ data64 = SETFIELD(IODA2_TVT_NUM_LEVELS, data64, tce_levels - 1);
+
+ phb3_ioda_sel(p, IODA2_TBL_TVT, window_id, false);
+ out_be64(p->regs + PHB_IODA_DATA0, data64);
+ p->tve_cache[window_id] = data64;
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t phb3_map_pe_dma_window_real(struct phb *phb,
+ uint64_t pe_number,
+ uint16_t window_id,
+ uint64_t pci_start_addr,
+ uint64_t pci_mem_size)
+{
+ struct phb3 *p = phb_to_phb3(phb);
+ uint64_t end;
+ uint64_t tve;
+
+ if (pe_number >= PHB3_MAX_PE_NUM ||
+ (window_id >> 1) != pe_number)
+ return OPAL_PARAMETER;
+
+ if (pci_mem_size) {
+ /* Enable */
+
+ /*
+ * Check that the start address has the right TVE index,
+ * we only support the 1 bit mode where each PE has 2
+ * TVEs
+ */
+ if ((pci_start_addr >> 59) != (window_id & 1))
+ return OPAL_PARAMETER;
+ pci_start_addr &= ((1ull << 59) - 1);
+ end = pci_start_addr + pci_mem_size;
+
+ /* We have to be 16M aligned */
+ if ((pci_start_addr & 0x00ffffff) ||
+ (pci_mem_size & 0x00ffffff))
+ return OPAL_PARAMETER;
+
+ /*
+ * It *looks* like this is the max we can support (we need
+ * to verify this). Also, we are not checking for rollover,
+ * but then we aren't trying too hard to protect ourselves
+ * against a completely broken OS.
+ */
+ if (end > 0x0003ffffffffffffull)
+ return OPAL_PARAMETER;
+
+ /*
+ * Put start address bits 49:24 into TVE[52:53]||[0:23]
+ * and end address bits 49:24 into TVE[54:55]||[24:47]
+ * and set TVE[51]
+ */
+ tve = (pci_start_addr << 16) & (0xffffffull << 48);
+ tve |= (pci_start_addr >> 38) & (3ull << 10);
+ tve |= (end >> 8) & (0xfffffful << 16);
+ tve |= (end >> 40) & (3ull << 8);
+ tve |= PPC_BIT(51);
+ } else {
+ /* Disable */
+ tve = 0;
+ }
+
+ phb3_ioda_sel(p, IODA2_TBL_TVT, window_id, false);
+ out_be64(p->regs + PHB_IODA_DATA0, tve);
+ p->tve_cache[window_id] = tve;
+
+ return OPAL_SUCCESS;
+}
+
+static bool phb3_pci_msi_check_q(struct phb3 *p, uint32_t ive_num)
+{
+ uint64_t ive, ivc, ffi, state;
+ uint8_t *q_byte;
+
+ /* Each IVE has 16-bytes or 128-bytes */
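+ /* IVT_TABLE_STRIDE is counted in 64-bit words, hence the "* 8" */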
+ ive = p->tbl_ivt + (ive_num * IVT_TABLE_STRIDE * 8);
+ q_byte = (uint8_t *)(ive + 5);
+
+ /*
+ * Handle the Q bit. If the Q bit isn't visible yet,
+ * do a CI load to force a flush so it shows up.
+ */
+ if (!(*q_byte & 0x1)) {
+ /* Read from random PHB reg to force flush */
+ in_be64(p->regs + PHB_IVC_UPDATE);
+
+ /* Order with subsequent read of Q */
+ sync();
+
+ /* Q still not set, bail out */
+ if (!(*q_byte & 0x1))
+ return false;
+ }
+
+ /* Lock FFI and send interrupt */
+ while (1) {
+ state = in_be64(p->regs + PHB_FFI_LOCK);
+ if (!state)
+ break;
+ if (state == ~0ULL) /* PHB Fenced */
+ return false;
+ }
+
+ /* Clear Q bit and update IVC */
+ *q_byte = 0;
+ ivc = SETFIELD(PHB_IVC_UPDATE_SID, 0ul, ive_num) |
+ PHB_IVC_UPDATE_ENABLE_Q;
+ out_be64(p->regs + PHB_IVC_UPDATE, ivc);
+
+ /*
+ * Resend the interrupt. Note the lock clear bit isn't documented in
+ * the PHB3 spec and thus is probably unnecessary, but it's in
+ * IODA2, so let's be safe here; it won't hurt to set it.
+ */
+ ffi = SETFIELD(PHB_FFI_REQUEST_ISN, 0ul, ive_num) | PHB_FFI_LOCK_CLEAR;
+ out_be64(p->regs + PHB_FFI_REQUEST, ffi);
+
+ return true;
+}
+
+static void phb3_pci_msi_flush_ive(struct phb3 *p, uint32_t ive_num)
+{
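+ /* dcbf flushes the cache line holding this in-memory IVE */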
+ asm volatile("dcbf %0,%1"
+ :
+ : "b" (p->tbl_ivt), "r" (ive_num * IVT_TABLE_STRIDE * 8)
+ : "memory");
+}
+
+static int64_t phb3_pci_msi_eoi(struct phb *phb,
+ uint32_t hwirq)
+{
+ struct phb3 *p = phb_to_phb3(phb);
+ uint32_t ive_num = PHB3_IRQ_NUM(hwirq);
+ uint64_t ive, ivc;
+ uint8_t *p_byte, gp, gen, newgen;
+
+ /* OS might not configure IVT yet */
+ if (!p->tbl_ivt)
+ return OPAL_HARDWARE;
+
+ /* Each IVE has 16-bytes or 128-bytes */
+ ive = p->tbl_ivt + (ive_num * IVT_TABLE_STRIDE * 8);
+ p_byte = (uint8_t *)(ive + 4);
+
+ /* Read generation and P */
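+ /* The generation count is a 2-bit field, so it wraps modulo 4 */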
+ gp = *p_byte;
+ gen = (gp >> 1) & 3;
+ newgen = (gen + 1) & 3;
+
+ /* Increment generation count and clear P */
+ *p_byte = newgen << 1;
+
+ /* If at this point:
+ * - the IVC is invalid (due to high IRQ load) and
+ * - we get a new interrupt on this hwirq.
+ * Due to the new interrupt, the IVC will fetch from the IVT.
+ * This IVC reload will result in P set and gen=n+1. This
+ * interrupt may not actually be delivered at this point
+ * though.
+ *
+ * Software will then try to clear P in the IVC (out_be64
+ * below). This could cause an interrupt to be lost because P
+ * is cleared in the IVC without the new interrupt being
+ * delivered.
+ *
+ * To avoid this race, we increment the generation count in
+ * the IVT when we clear P. When software writes the IVC with
+ * P cleared but with gen=n, the IVC won't actually clear P
+ * because gen doesn't match what it just cached from the IVT.
+ * Hence we don't lose P being set.
+ */
+
+ /* Update the P bit in the IVC if the gen count matches */
+ ivc = SETFIELD(PHB_IVC_UPDATE_SID, 0ul, ive_num) |
+ PHB_IVC_UPDATE_ENABLE_P |
+ PHB_IVC_UPDATE_ENABLE_GEN |
+ PHB_IVC_UPDATE_ENABLE_CON |
+ SETFIELD(PHB_IVC_UPDATE_GEN_MATCH, 0ul, gen) |
+ SETFIELD(PHB_IVC_UPDATE_GEN, 0ul, newgen);
+ /* out_be64 has a sync to order with the IVT update above */
+ out_be64(p->regs + PHB_IVC_UPDATE, ivc);
+
+ /* Handle Q bit */
+ phb3_pci_msi_check_q(p, ive_num);
+
+ phb3_pci_msi_flush_ive(p, ive_num);
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t phb3_set_ive_pe(struct phb *phb,
+ uint64_t pe_number,
+ uint32_t ive_num)
+{
+ struct phb3 *p = phb_to_phb3(phb);
+ uint64_t *cache, ivep, data64;
+ uint16_t *pe_word;
+
+ /* OS should enable the BAR in advance */
+ if (!p->tbl_ivt)
+ return OPAL_HARDWARE;
+
+ /* Each IVE reserves 128 bytes */
+ if (pe_number >= PHB3_MAX_PE_NUM ||
+ ive_num >= IVT_TABLE_ENTRIES)
+ return OPAL_PARAMETER;
+
+ /* Update IVE cache */
+ cache = &p->ive_cache[ive_num];
+ *cache = SETFIELD(IODA2_IVT_PE, *cache, pe_number);
+
+ /* Update in-memory IVE without clobbering P and Q */
+ ivep = p->tbl_ivt + (ive_num * IVT_TABLE_STRIDE * 8);
+ pe_word = (uint16_t *)(ivep + 6);
+ *pe_word = pe_number;
+
+ /* Invalidate IVC */
+ data64 = SETFIELD(PHB_IVC_INVALIDATE_SID, 0ul, ive_num);
+ out_be64(p->regs + PHB_IVC_INVALIDATE, data64);
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t phb3_get_msi_32(struct phb *phb __unused,
+ uint64_t pe_number,
+ uint32_t ive_num,
+ uint8_t msi_range,
+ uint32_t *msi_address,
+ uint32_t *message_data)
+{
+ /*
+ * Sanity check. We needn't check mve_number (PE#)
+ * on PHB3 since the interrupt source is purely determined
+ * by its DMA address and data, but the check isn't
+ * harmful.
+ */
+ if (pe_number >= PHB3_MAX_PE_NUM ||
+ ive_num >= IVT_TABLE_ENTRIES ||
+ msi_range != 1 || !msi_address|| !message_data)
+ return OPAL_PARAMETER;
+
+ /*
+ * DMA address and data will form the IVE index.
+ * For more details, please refer to IODA2 spec.
+ */
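+ /* The low 5 bits of the IVE number come from the MSI data; the
+ * remaining bits are carried in the MSI address. */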
+ *msi_address = 0xFFFF0000 | ((ive_num << 4) & 0xFFFFFE0F);
+ *message_data = ive_num & 0x1F;
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t phb3_get_msi_64(struct phb *phb __unused,
+ uint64_t pe_number,
+ uint32_t ive_num,
+ uint8_t msi_range,
+ uint64_t *msi_address,
+ uint32_t *message_data)
+{
+ /* Sanity check */
+ if (pe_number >= PHB3_MAX_PE_NUM ||
+ ive_num >= IVT_TABLE_ENTRIES ||
+ msi_range != 1 || !msi_address || !message_data)
+ return OPAL_PARAMETER;
+
+ /*
+ * DMA address and data will form the IVE index.
+ * For more details, please refer to IODA2 spec.
+ */
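+ /* Same split as the 32-bit case: the MSI data carries the low 5
+ * bits of the IVE number. */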
+ *msi_address = (0x1ul << 60) | ((ive_num << 4) & 0xFFFFFFFFFFFFFE0Ful);
+ *message_data = ive_num & 0x1F;
+
+ return OPAL_SUCCESS;
+}
+
+static bool phb3_err_check_pbcq(struct phb3 *p)
+{
+ uint64_t nfir, mask, wof, val64;
+ int32_t class, bit;
+ uint64_t severity[PHB3_ERR_CLASS_LAST] = {
+ 0x0000000000000000UL, /* NONE */
+ 0x018000F800000000UL, /* DEAD */
+ 0x7E7DC70000000000UL, /* FENCED */
+ 0x0000000000000000UL, /* ER */
+ 0x0000000000000000UL /* INF */
+ };
+
+ /*
+ * Read the NFIR to see if XSCOM is working properly.
+ * If XSCOM doesn't work, there's nothing more we can
+ * do with the PHB, so mark it dead.
+ */
+ xscom_read(p->chip_id, p->pe_xscom + 0x0, &nfir);
+ if (nfir == 0xffffffffffffffffUL) {
+ p->err.err_src = PHB3_ERR_SRC_NONE;
+ p->err.err_class = PHB3_ERR_CLASS_DEAD;
+ phb3_set_err_pending(p, true);
+ return true;
+ }
+
+ /*
+ * Check WOF. We need to handle unmasked errors first.
+ * We can run into the situation (on the simulator)
+ * where FIR bits are asserted, but WOF has nothing.
+ * In that case, we should check the FIR as well.
+ */
+ xscom_read(p->chip_id, p->pe_xscom + 0x3, &mask);
+ xscom_read(p->chip_id, p->pe_xscom + 0x8, &wof);
+ if (wof & ~mask)
+ wof &= ~mask;
+ if (!wof) {
+ if (nfir & ~mask)
+ nfir &= ~mask;
+ if (!nfir)
+ return false;
+ wof = nfir;
+ }
+
+ /* We shouldn't hit class PHB3_ERR_CLASS_NONE */
+ for (class = PHB3_ERR_CLASS_NONE;
+ class < PHB3_ERR_CLASS_LAST;
+ class++) {
+ val64 = wof & severity[class];
+ if (!val64)
+ continue;
+
+ for (bit = 0; bit < 64; bit++) {
+ if (val64 & PPC_BIT(bit)) {
+ p->err.err_src = PHB3_ERR_SRC_PBCQ;
+ p->err.err_class = class;
+ p->err.err_bit = 63 - bit;
+ phb3_set_err_pending(p, true);
+ return true;
+ }
+ }
+ }
+
+ return false;
+}
+
+static bool phb3_err_check_lem(struct phb3 *p)
+{
+ uint64_t fir, wof, mask, val64;
+ int32_t class, bit;
+ uint64_t severity[PHB3_ERR_CLASS_LAST] = {
+ 0x0000000000000000UL, /* NONE */
+ 0x0000000000000000UL, /* DEAD */
+ 0xADB670C980ADD151UL, /* FENCED */
+ 0x000800107F500A2CUL, /* ER */
+ 0x42018E2200002482UL /* INF */
+ };
+
+ /*
+ * Read the FIR. If XSCOM or ASB is frozen, there's no
+ * point going further; just mark the PHB as dead.
+ */
+ fir = phb3_read_reg_asb(p, PHB_LEM_FIR_ACCUM);
+ if (fir == 0xffffffffffffffffUL) {
+ p->err.err_src = PHB3_ERR_SRC_PHB;
+ p->err.err_class = PHB3_ERR_CLASS_DEAD;
+ phb3_set_err_pending(p, true);
+ return true;
+ }
+
+ /*
+ * Check WOF for unmasked errors first. In some situations,
+ * when running skiboot on the simulator, FIR bits are
+ * already asserted but WOF is still zero. In that case,
+ * we check the FIR directly.
+ */
+ wof = phb3_read_reg_asb(p, PHB_LEM_WOF);
+ mask = phb3_read_reg_asb(p, PHB_LEM_ERROR_MASK);
+ if (wof & ~mask)
+ wof &= ~mask;
+ if (!wof) {
+ if (fir & ~mask)
+ fir &= ~mask;
+ if (!fir)
+ return false;
+ wof = fir;
+ }
+
+ /* We shouldn't hit PHB3_ERR_CLASS_NONE */
+ for (class = PHB3_ERR_CLASS_NONE;
+ class < PHB3_ERR_CLASS_LAST;
+ class++) {
+ val64 = wof & severity[class];
+ if (!val64)
+ continue;
+
+ for (bit = 0; bit < 64; bit++) {
+ if (val64 & PPC_BIT(bit)) {
+ p->err.err_src = PHB3_ERR_SRC_PHB;
+ p->err.err_class = class;
+ p->err.err_bit = 63 - bit;
+ phb3_set_err_pending(p, true);
+ return true;
+ }
+ }
+ }
+
+ return false;
+}
+
+/*
+ * This function can be called during error recovery for the INF
+ * and ER classes. For the INF case, it's expected to be called
+ * when grabbing the error log. We call it explicitly
+ * when clearing frozen PE state for the ER case.
+ */
+static void phb3_err_ER_clear(struct phb3 *p)
+{
+ uint32_t val32;
+ uint64_t val64;
+ uint64_t fir = in_be64(p->regs + PHB_LEM_FIR_ACCUM);
+
+ /* Rec 1: Grab the PCI config lock */
+ /* Removed... unnecessary. We have our own lock here */
+
+ /* Rec 2/3/4: Take all inbound transactions */
+ out_be64(p->regs + PHB_CONFIG_ADDRESS, 0x8000001c00000000ul);
+ out_be32(p->regs + PHB_CONFIG_DATA, 0x10000000);
+
+ /* Rec 5/6/7: Clear pending non-fatal errors */
+ out_be64(p->regs + PHB_CONFIG_ADDRESS, 0x8000005000000000ul);
+ val32 = in_be32(p->regs + PHB_CONFIG_DATA);
+ out_be32(p->regs + PHB_CONFIG_DATA, (val32 & 0xe0700000) | 0x0f000f00);
+
+ /* Rec 8/9/10: Clear pending fatal errors for AER */
+ out_be64(p->regs + PHB_CONFIG_ADDRESS, 0x8000010400000000ul);
+ out_be32(p->regs + PHB_CONFIG_DATA, 0xffffffff);
+
+ /* Rec 11/12/13: Clear pending non-fatal errors for AER */
+ out_be64(p->regs + PHB_CONFIG_ADDRESS, 0x8000011000000000ul);
+ out_be32(p->regs + PHB_CONFIG_DATA, 0xffffffff);
+
+ /* Rec 22/23/24: Clear root port errors */
+ out_be64(p->regs + PHB_CONFIG_ADDRESS, 0x8000013000000000ul);
+ out_be32(p->regs + PHB_CONFIG_DATA, 0xffffffff);
+
+ /* Rec 25/26/27: Enable IO and MMIO bar */
+ out_be64(p->regs + PHB_CONFIG_ADDRESS, 0x8000004000000000ul);
+ out_be32(p->regs + PHB_CONFIG_DATA, 0x470100f8);
+
+ /* Rec 28: Release the PCI config lock */
+ /* Removed... unnecessary. We have our own lock here */
+
+ /* Rec 29...34: Clear UTL errors */
+ val64 = in_be64(p->regs + UTL_SYS_BUS_AGENT_STATUS);
+ out_be64(p->regs + UTL_SYS_BUS_AGENT_STATUS, val64);
+ val64 = in_be64(p->regs + UTL_PCIE_PORT_STATUS);
+ out_be64(p->regs + UTL_PCIE_PORT_STATUS, val64);
+ val64 = in_be64(p->regs + UTL_RC_STATUS);
+ out_be64(p->regs + UTL_RC_STATUS, val64);
+
+ /* Rec 39...66: Clear PHB error trap */
+ val64 = in_be64(p->regs + PHB_ERR_STATUS);
+ out_be64(p->regs + PHB_ERR_STATUS, val64);
+ out_be64(p->regs + PHB_ERR1_STATUS, 0x0ul);
+ out_be64(p->regs + PHB_ERR_LOG_0, 0x0ul);
+ out_be64(p->regs + PHB_ERR_LOG_1, 0x0ul);
+
+ val64 = in_be64(p->regs + PHB_OUT_ERR_STATUS);
+ out_be64(p->regs + PHB_OUT_ERR_STATUS, val64);
+ out_be64(p->regs + PHB_OUT_ERR1_STATUS, 0x0ul);
+ out_be64(p->regs + PHB_OUT_ERR_LOG_0, 0x0ul);
+ out_be64(p->regs + PHB_OUT_ERR_LOG_1, 0x0ul);
+
+ val64 = in_be64(p->regs + PHB_INA_ERR_STATUS);
+ out_be64(p->regs + PHB_INA_ERR_STATUS, val64);
+ out_be64(p->regs + PHB_INA_ERR1_STATUS, 0x0ul);
+ out_be64(p->regs + PHB_INA_ERR_LOG_0, 0x0ul);
+ out_be64(p->regs + PHB_INA_ERR_LOG_1, 0x0ul);
+
+ val64 = in_be64(p->regs + PHB_INB_ERR_STATUS);
+ out_be64(p->regs + PHB_INB_ERR_STATUS, val64);
+ out_be64(p->regs + PHB_INB_ERR1_STATUS, 0x0ul);
+ out_be64(p->regs + PHB_INB_ERR_LOG_0, 0x0ul);
+ out_be64(p->regs + PHB_INB_ERR_LOG_1, 0x0ul);
+
+ /* Rec 67/68: Clear FIR/WOF */
+ out_be64(p->regs + PHB_LEM_FIR_AND_MASK, ~fir);
+ out_be64(p->regs + PHB_LEM_WOF, 0x0ul);
+}
+
+static void phb3_read_phb_status(struct phb3 *p,
+ struct OpalIoPhb3ErrorData *stat)
+{
+ uint16_t val;
+ uint64_t *pPEST;
+ uint64_t val64 = 0;
+ uint32_t i;
+
+ memset(stat, 0, sizeof(struct OpalIoPhb3ErrorData));
+
+ /* Error data common part */
+ stat->common.version = OPAL_PHB_ERROR_DATA_VERSION_1;
+ stat->common.ioType = OPAL_PHB_ERROR_DATA_TYPE_PHB3;
+ stat->common.len = sizeof(struct OpalIoPhb3ErrorData);
+
+ /*
+ * We read some registers using config space through AIB.
+ *
+ * We access the other registers through ASB when possible, so that
+ * they can still be reached through a fence if one is present.
+ */
+
+ /* Use ASB to access PCICFG if the PHB has been fenced */
+ p->flags |= PHB3_CFG_USE_ASB;
+
+ /* Grab RC bridge control, make it 32-bit */
+ phb3_pcicfg_read16(&p->phb, 0, PCI_CFG_BRCTL, &val);
+ stat->brdgCtl = val;
+
+ /* Grab UTL status registers */
+ stat->portStatusReg = hi32(phb3_read_reg_asb(p, UTL_PCIE_PORT_STATUS));
+ stat->rootCmplxStatus = hi32(phb3_read_reg_asb(p, UTL_RC_STATUS));
+ stat->busAgentStatus = hi32(phb3_read_reg_asb(p, UTL_SYS_BUS_AGENT_STATUS));
+
+ /*
+ * Grab various RC PCIe capability registers. All device, slot
+ * and link status are 16-bit, so we grab the pair control+status
+ * for each of them
+ */
+ phb3_pcicfg_read32(&p->phb, 0, p->ecap + PCICAP_EXP_DEVCTL,
+ &stat->deviceStatus);
+ phb3_pcicfg_read32(&p->phb, 0, p->ecap + PCICAP_EXP_SLOTCTL,
+ &stat->slotStatus);
+ phb3_pcicfg_read32(&p->phb, 0, p->ecap + PCICAP_EXP_LCTL,
+ &stat->linkStatus);
+
+ /*
+ * I assume those are the standard config space header; cmd & status
+ * together make 32 bits. Secondary status is 16-bit, so I'll clear
+ * the top half on that one.
+ */
+ phb3_pcicfg_read32(&p->phb, 0, PCI_CFG_CMD, &stat->devCmdStatus);
+ phb3_pcicfg_read16(&p->phb, 0, PCI_CFG_SECONDARY_STATUS, &val);
+ stat->devSecStatus = val;
+
+ /* Grab a bunch of AER regs */
+ phb3_pcicfg_read32(&p->phb, 0, p->aercap + PCIECAP_AER_RERR_STA,
+ &stat->rootErrorStatus);
+ phb3_pcicfg_read32(&p->phb, 0, p->aercap + PCIECAP_AER_UE_STATUS,
+ &stat->uncorrErrorStatus);
+ phb3_pcicfg_read32(&p->phb, 0, p->aercap + PCIECAP_AER_CE_STATUS,
+ &stat->corrErrorStatus);
+ phb3_pcicfg_read32(&p->phb, 0, p->aercap + PCIECAP_AER_HDR_LOG0,
+ &stat->tlpHdr1);
+ phb3_pcicfg_read32(&p->phb, 0, p->aercap + PCIECAP_AER_HDR_LOG1,
+ &stat->tlpHdr2);
+ phb3_pcicfg_read32(&p->phb, 0, p->aercap + PCIECAP_AER_HDR_LOG2,
+ &stat->tlpHdr3);
+ phb3_pcicfg_read32(&p->phb, 0, p->aercap + PCIECAP_AER_HDR_LOG3,
+ &stat->tlpHdr4);
+ phb3_pcicfg_read32(&p->phb, 0, p->aercap + PCIECAP_AER_SRCID,
+ &stat->sourceId);
+
+ /* Restore to AIB */
+ p->flags &= ~PHB3_CFG_USE_ASB;
+
+ /* PEC NFIR */
+ xscom_read(p->chip_id, p->pe_xscom + 0x0, &stat->nFir);
+ xscom_read(p->chip_id, p->pe_xscom + 0x3, &stat->nFirMask);
+ xscom_read(p->chip_id, p->pe_xscom + 0x8, &stat->nFirWOF);
+
+ /* PHB3 inbound and outbound error Regs */
+ stat->phbPlssr = phb3_read_reg_asb(p, PHB_CPU_LOADSTORE_STATUS);
+ stat->phbCsr = phb3_read_reg_asb(p, PHB_DMA_CHAN_STATUS);
+ stat->lemFir = phb3_read_reg_asb(p, PHB_LEM_FIR_ACCUM);
+ stat->lemErrorMask = phb3_read_reg_asb(p, PHB_LEM_ERROR_MASK);
+ stat->lemWOF = phb3_read_reg_asb(p, PHB_LEM_WOF);
+ stat->phbErrorStatus = phb3_read_reg_asb(p, PHB_ERR_STATUS);
+ stat->phbFirstErrorStatus = phb3_read_reg_asb(p, PHB_ERR1_STATUS);
+ stat->phbErrorLog0 = phb3_read_reg_asb(p, PHB_ERR_LOG_0);
+ stat->phbErrorLog1 = phb3_read_reg_asb(p, PHB_ERR_LOG_1);
+ stat->mmioErrorStatus = phb3_read_reg_asb(p, PHB_OUT_ERR_STATUS);
+ stat->mmioFirstErrorStatus = phb3_read_reg_asb(p, PHB_OUT_ERR1_STATUS);
+ stat->mmioErrorLog0 = phb3_read_reg_asb(p, PHB_OUT_ERR_LOG_0);
+ stat->mmioErrorLog1 = phb3_read_reg_asb(p, PHB_OUT_ERR_LOG_1);
+ stat->dma0ErrorStatus = phb3_read_reg_asb(p, PHB_INA_ERR_STATUS);
+ stat->dma0FirstErrorStatus = phb3_read_reg_asb(p, PHB_INA_ERR1_STATUS);
+ stat->dma0ErrorLog0 = phb3_read_reg_asb(p, PHB_INA_ERR_LOG_0);
+ stat->dma0ErrorLog1 = phb3_read_reg_asb(p, PHB_INA_ERR_LOG_1);
+ stat->dma1ErrorStatus = phb3_read_reg_asb(p, PHB_INB_ERR_STATUS);
+ stat->dma1FirstErrorStatus = phb3_read_reg_asb(p, PHB_INB_ERR1_STATUS);
+ stat->dma1ErrorLog0 = phb3_read_reg_asb(p, PHB_INB_ERR_LOG_0);
+ stat->dma1ErrorLog1 = phb3_read_reg_asb(p, PHB_INB_ERR_LOG_1);
+
+ /*
+ * Grab PESTA & B content. The error bit (bit#0) should
+ * be fetched from IODA and the left content from memory
+ * resident tables.
+ */
+ pPEST = (uint64_t *)p->tbl_pest;
+ val64 = PHB_IODA_AD_AUTOINC;
+ val64 = SETFIELD(PHB_IODA_AD_TSEL, val64, IODA2_TBL_PESTA);
+ phb3_write_reg_asb(p, PHB_IODA_ADDR, val64);
+ for (i = 0; i < OPAL_PHB3_NUM_PEST_REGS; i++) {
+ stat->pestA[i] = phb3_read_reg_asb(p, PHB_IODA_DATA0);
+ stat->pestA[i] |= pPEST[2 * i];
+ }
+
+ val64 = PHB_IODA_AD_AUTOINC;
+ val64 = SETFIELD(PHB_IODA_AD_TSEL, val64, IODA2_TBL_PESTB);
+ phb3_write_reg_asb(p, PHB_IODA_ADDR, val64);
+ for (i = 0; i < OPAL_PHB3_NUM_PEST_REGS; i++) {
+ stat->pestB[i] = phb3_read_reg_asb(p, PHB_IODA_DATA0);
+ stat->pestB[i] |= pPEST[2 * i + 1];
+ }
+}
+
+static void phb3_eeh_dump_regs(struct phb3 *p, struct OpalIoPhb3ErrorData *regs)
+{
+ struct OpalIoPhb3ErrorData *s;
+ unsigned int i;
+
+ if (!verbose_eeh)
+ return;
+
+ if (!regs) {
+ s = zalloc(sizeof(struct OpalIoPhb3ErrorData));
+ if (!s) {
+ PHBERR(p, "Failed to allocate error info !\n");
+ return;
+ }
+
+ phb3_read_phb_status(p, s);
+ } else {
+ s = regs;
+ }
+
+ PHBERR(p, "Error detected!\n");
+
+ PHBERR(p, " portStatusReg = %08x\n", s->portStatusReg);
+ PHBERR(p, " rootCmplxStatus = %08x\n", s->rootCmplxStatus);
+ PHBERR(p, " busAgentStatus = %08x\n", s->busAgentStatus);
+
+ PHBERR(p, " errorClass = %016llx\n", s->errorClass);
+ PHBERR(p, " correlator = %016llx\n", s->correlator);
+
+ PHBERR(p, " brdgCtl = %08x\n", s->brdgCtl);
+ PHBERR(p, " deviceStatus = %08x\n", s->deviceStatus);
+ PHBERR(p, " slotStatus = %08x\n", s->slotStatus);
+ PHBERR(p, " linkStatus = %08x\n", s->linkStatus);
+ PHBERR(p, " devCmdStatus = %08x\n", s->devCmdStatus);
+ PHBERR(p, " devSecStatus = %08x\n", s->devSecStatus);
+ PHBERR(p, " rootErrorStatus = %08x\n", s->rootErrorStatus);
+ PHBERR(p, " corrErrorStatus = %08x\n", s->corrErrorStatus);
+ PHBERR(p, " uncorrErrorStatus = %08x\n", s->uncorrErrorStatus);
+
+ /* Byte swap TLP headers so they are the same as the PCIe spec */
+ PHBERR(p, " tlpHdr1 = %08x\n", bswap_32(s->tlpHdr1));
+ PHBERR(p, " tlpHdr2 = %08x\n", bswap_32(s->tlpHdr2));
+ PHBERR(p, " tlpHdr3 = %08x\n", bswap_32(s->tlpHdr3));
+ PHBERR(p, " tlpHdr4 = %08x\n", bswap_32(s->tlpHdr4));
+ PHBERR(p, " sourceId = %08x\n", s->sourceId);
+
+ PHBERR(p, " nFir = %016llx\n", s->nFir);
+ PHBERR(p, " nFirMask = %016llx\n", s->nFirMask);
+ PHBERR(p, " nFirWOF = %016llx\n", s->nFirWOF);
+ PHBERR(p, " phbPlssr = %016llx\n", s->phbPlssr);
+ PHBERR(p, " phbCsr = %016llx\n", s->phbCsr);
+ PHBERR(p, " lemFir = %016llx\n", s->lemFir);
+ PHBERR(p, " lemErrorMask = %016llx\n", s->lemErrorMask);
+ PHBERR(p, " lemWOF = %016llx\n", s->lemWOF);
+
+ PHBERR(p, " phbErrorStatus = %016llx\n", s->phbErrorStatus);
+ PHBERR(p, " phbFirstErrorStatus = %016llx\n", s->phbFirstErrorStatus);
+ PHBERR(p, " phbErrorLog0 = %016llx\n", s->phbErrorLog0);
+ PHBERR(p, " phbErrorLog1 = %016llx\n", s->phbErrorLog1);
+
+ PHBERR(p, " mmioErrorStatus = %016llx\n", s->mmioErrorStatus);
+ PHBERR(p, "mmioFirstErrorStatus = %016llx\n", s->mmioFirstErrorStatus);
+ PHBERR(p, " mmioErrorLog0 = %016llx\n", s->mmioErrorLog0);
+ PHBERR(p, " mmioErrorLog1 = %016llx\n", s->mmioErrorLog1);
+
+ PHBERR(p, " dma0ErrorStatus = %016llx\n", s->dma0ErrorStatus);
+ PHBERR(p, "dma0FirstErrorStatus = %016llx\n", s->dma0FirstErrorStatus);
+ PHBERR(p, " dma0ErrorLog0 = %016llx\n", s->dma0ErrorLog0);
+ PHBERR(p, " dma0ErrorLog1 = %016llx\n", s->dma0ErrorLog1);
+
+ PHBERR(p, " dma1ErrorStatus = %016llx\n", s->dma1ErrorStatus);
+ PHBERR(p, "dma1FirstErrorStatus = %016llx\n", s->dma1FirstErrorStatus);
+ PHBERR(p, " dma1ErrorLog0 = %016llx\n", s->dma1ErrorLog0);
+ PHBERR(p, " dma1ErrorLog1 = %016llx\n", s->dma1ErrorLog1);
+
+ for (i = 0; i < OPAL_PHB3_NUM_PEST_REGS; i++) {
+ if (!s->pestA[i] && !s->pestB[i])
+ continue;
+ PHBERR(p, " PEST[%03x] = %016llx %016llx\n",
+ i, s->pestA[i], s->pestB[i]);
+ }
+
+ if (s != regs)
+ free(s);
+}
+
+static int64_t phb3_msi_get_xive(struct irq_source *is, uint32_t isn,
+ uint16_t *server, uint8_t *prio)
+{
+ struct phb3 *p = is->data;
+ uint32_t chip, index, irq;
+ uint64_t ive;
+
+ chip = p8_irq_to_chip(isn);
+ index = p8_irq_to_phb(isn);
+ irq = PHB3_IRQ_NUM(isn);
+
+ if (chip != p->chip_id ||
+ index != p->index ||
+ irq > PHB3_MSI_IRQ_MAX)
+ return OPAL_PARAMETER;
+
+ /*
+ * Each IVE has 16 bytes in cache. Note that the kernel
+ * should strip the link bits from server field.
+ */
+ ive = p->ive_cache[irq];
+ *server = GETFIELD(IODA2_IVT_SERVER, ive);
+ *prio = GETFIELD(IODA2_IVT_PRIORITY, ive);
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t phb3_msi_set_xive(struct irq_source *is, uint32_t isn,
+ uint16_t server, uint8_t prio)
+{
+ struct phb3 *p = is->data;
+ uint32_t chip, index;
+ uint64_t *cache, ive_num, data64, m_server, m_prio, ivc;
+ uint32_t *ive;
+
+ chip = p8_irq_to_chip(isn);
+ index = p8_irq_to_phb(isn);
+ ive_num = PHB3_IRQ_NUM(isn);
+
+ if (p->broken || !p->tbl_rtt)
+ return OPAL_HARDWARE;
+ if (chip != p->chip_id ||
+ index != p->index ||
+ ive_num > PHB3_MSI_IRQ_MAX)
+ return OPAL_PARAMETER;
+
+ phb_lock(&p->phb);
+
+ /*
+ * We need to strip the link bits from the server. As Milton told
+ * me, the server number is laid out as follows, with the remaining
+ * bits unused: node/chip/core/thread/link = 2/3/4/3/2
+ *
+ * Note: the caller has already folded the link bits into the server.
+ */
+ m_server = server;
+ m_prio = prio;
+
+ cache = &p->ive_cache[ive_num];
+ *cache = SETFIELD(IODA2_IVT_SERVER, *cache, m_server);
+ *cache = SETFIELD(IODA2_IVT_PRIORITY, *cache, m_prio);
+
+ /*
+ * Update the IVT and IVC. We need to use the IVC update register
+ * to do that. Each IVE in the table is 128 bytes.
+ */
+ ive = (uint32_t *)(p->tbl_ivt + ive_num * IVT_TABLE_STRIDE * 8);
+ data64 = PHB_IVC_UPDATE_ENABLE_SERVER | PHB_IVC_UPDATE_ENABLE_PRI;
+ data64 = SETFIELD(PHB_IVC_UPDATE_SID, data64, ive_num);
+ data64 = SETFIELD(PHB_IVC_UPDATE_SERVER, data64, m_server);
+ data64 = SETFIELD(PHB_IVC_UPDATE_PRI, data64, m_prio);
+
+ /*
+ * We don't use SETFIELD because we are doing a 32-bit access
+ * in order to avoid touching the P and Q bits
+ */
+ *ive = (m_server << 8) | m_prio;
+ out_be64(p->regs + PHB_IVC_UPDATE, data64);
+
+ if (prio != 0xff) {
+ /*
+ * Handle Q bit if we're going to enable the
+ * interrupt. The OS should make sure the interrupt
+ * handler has been installed already.
+ */
+ if (phb3_pci_msi_check_q(p, ive_num))
+ phb3_pci_msi_flush_ive(p, ive_num);
+ } else {
+ /* Read from random PHB reg to force flush */
+ in_be64(p->regs + PHB_IVC_UPDATE);
+
+ /* Order with subsequent read of Q */
+ sync();
+
+ /* Clear P, Q and Gen, preserve PE# */
+ ive[1] &= 0x0000ffff;
+
+ /*
+ * Update the IVC with a match against the old gen
+ * count. No need to worry about racing with P being
+ * set in the cache since IRQ is masked at this point.
+ */
+ ivc = SETFIELD(PHB_IVC_UPDATE_SID, 0ul, ive_num) |
+ PHB_IVC_UPDATE_ENABLE_P |
+ PHB_IVC_UPDATE_ENABLE_Q |
+ PHB_IVC_UPDATE_ENABLE_GEN;
+ out_be64(p->regs + PHB_IVC_UPDATE, ivc);
+ }
+
+ phb_unlock(&p->phb);
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t phb3_lsi_get_xive(struct irq_source *is, uint32_t isn,
+ uint16_t *server, uint8_t *prio)
+{
+ struct phb3 *p = is->data;
+ uint32_t chip, index, irq;
+ uint64_t lxive;
+
+ chip = p8_irq_to_chip(isn);
+ index = p8_irq_to_phb(isn);
+ irq = PHB3_IRQ_NUM(isn);
+
+ if (chip != p->chip_id ||
+ index != p->index ||
+ irq < PHB3_LSI_IRQ_MIN ||
+ irq > PHB3_LSI_IRQ_MAX)
+ return OPAL_PARAMETER;
+
+ lxive = p->lxive_cache[irq - PHB3_LSI_IRQ_MIN];
+ *server = GETFIELD(IODA2_LXIVT_SERVER, lxive);
+ *prio = GETFIELD(IODA2_LXIVT_PRIORITY, lxive);
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t phb3_lsi_set_xive(struct irq_source *is, uint32_t isn,
+ uint16_t server, uint8_t prio)
+{
+ struct phb3 *p = is->data;
+ uint32_t chip, index, irq, entry;
+ uint64_t lxive;
+
+ chip = p8_irq_to_chip(isn);
+ index = p8_irq_to_phb(isn);
+ irq = PHB3_IRQ_NUM(isn);
+
+ if (p->broken)
+ return OPAL_HARDWARE;
+
+ if (chip != p->chip_id ||
+ index != p->index ||
+ irq < PHB3_LSI_IRQ_MIN ||
+ irq > PHB3_LSI_IRQ_MAX)
+ return OPAL_PARAMETER;
+
+ lxive = SETFIELD(IODA2_LXIVT_SERVER, 0ul, server);
+ lxive = SETFIELD(IODA2_LXIVT_PRIORITY, lxive, prio);
+
+ phb_lock(&p->phb);
+
+ /*
+ * We cache the arguments because we have to mangle
+ * them in order to hijack 3 bits of priority to extend
+ * the server number.
+ */
+ entry = irq - PHB3_LSI_IRQ_MIN;
+ p->lxive_cache[entry] = lxive;
+
+ /* We use HRT entry 0 always for now */
+ phb3_ioda_sel(p, IODA2_TBL_LXIVT, entry, false);
+ lxive = in_be64(p->regs + PHB_IODA_DATA0);
+ lxive = SETFIELD(IODA2_LXIVT_SERVER, lxive, server);
+ lxive = SETFIELD(IODA2_LXIVT_PRIORITY, lxive, prio);
+ out_be64(p->regs + PHB_IODA_DATA0, lxive);
+
+ phb_unlock(&p->phb);
+
+ return OPAL_SUCCESS;
+}
+
+static void phb3_err_interrupt(struct irq_source *is, uint32_t isn)
+{
+ struct phb3 *p = is->data;
+
+ PHBDBG(p, "Got interrupt 0x%08x\n", isn);
+
+ /* Update pending event */
+ opal_update_pending_evt(OPAL_EVENT_PCI_ERROR,
+ OPAL_EVENT_PCI_ERROR);
+
+ /* If the PHB is broken, go away */
+ if (p->broken)
+ return;
+
+ /*
+ * Mark the PHB as having a pending error so that the OS
+ * can handle it at a later point.
+ */
+ phb3_set_err_pending(p, true);
+}
+
+static uint64_t phb3_lsi_attributes(struct irq_source *is, uint32_t isn)
+{
+#ifndef DISABLE_ERR_INTS
+ struct phb3 *p = is->data;
+ uint32_t idx = isn - p->base_lsi;
+
+ if (idx == PHB3_LSI_PCIE_INF || idx == PHB3_LSI_PCIE_ER)
+ return IRQ_ATTR_TARGET_OPAL | IRQ_ATTR_TARGET_RARE | IRQ_ATTR_TYPE_LSI;
+#endif
+ return IRQ_ATTR_TARGET_LINUX;
+}
+
+/* MSIs (OS owned) */
+static const struct irq_source_ops phb3_msi_irq_ops = {
+ .get_xive = phb3_msi_get_xive,
+ .set_xive = phb3_msi_set_xive,
+};
+
+/* LSIs (OS owned) */
+static const struct irq_source_ops phb3_lsi_irq_ops = {
+ .get_xive = phb3_lsi_get_xive,
+ .set_xive = phb3_lsi_set_xive,
+ .attributes = phb3_lsi_attributes,
+ .interrupt = phb3_err_interrupt,
+};
+
+static int64_t phb3_set_pe(struct phb *phb,
+ uint64_t pe_number,
+ uint64_t bdfn,
+ uint8_t bcompare,
+ uint8_t dcompare,
+ uint8_t fcompare,
+ uint8_t action)
+{
+ struct phb3 *p = phb_to_phb3(phb);
+ uint64_t mask, val, tmp, idx;
+ int32_t all = 0;
+ uint16_t *rte;
+
+ /* Sanity check */
+ if (!p->tbl_rtt)
+ return OPAL_HARDWARE;
+ if (action != OPAL_MAP_PE && action != OPAL_UNMAP_PE)
+ return OPAL_PARAMETER;
+ if (pe_number >= PHB3_MAX_PE_NUM || bdfn > 0xffff ||
+ bcompare > OpalPciBusAll ||
+ dcompare > OPAL_COMPARE_RID_DEVICE_NUMBER ||
+ fcompare > OPAL_COMPARE_RID_FUNCTION_NUMBER)
+ return OPAL_PARAMETER;
+
+ /* Figure out the RID range */
+ if (bcompare == OpalPciBusAny) {
+ mask = 0x0;
+ val = 0x0;
+ all = 0x1;
+ } else {
+ tmp = ((0x1 << (bcompare + 1)) - 1) << (15 - bcompare);
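+ /* e.g. bcompare = 7 gives mask 0xff00, i.e. the full bus number field */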
+ mask = tmp;
+ val = bdfn & tmp;
+ }
+
+ if (dcompare == OPAL_IGNORE_RID_DEVICE_NUMBER)
+ all = (all << 1) | 0x1;
+ else {
+ mask |= 0xf8;
+ val |= (bdfn & 0xf8);
+ }
+
+ if (fcompare == OPAL_IGNORE_RID_FUNCTION_NUMBER)
+ all = (all << 1) | 0x1;
+ else {
+ mask |= 0x7;
+ val |= PCI_FUNC(bdfn);
+ }
+
+ /* Map or unmap the RTT range */
+ if (all == 0x7) {
+ if (action == OPAL_MAP_PE) {
+ for (idx = 0; idx < RTT_TABLE_ENTRIES; idx++)
+ p->rte_cache[idx] = pe_number;
+ } else {
+ for ( idx = 0; idx < ARRAY_SIZE(p->rte_cache); idx++)
+ p->rte_cache[idx] = PHB3_RESERVED_PE_NUM;
+ }
+ memcpy((void *)p->tbl_rtt, p->rte_cache, RTT_TABLE_SIZE);
+ } else {
+ rte = (uint16_t *)p->tbl_rtt;
+ for (idx = 0; idx < RTT_TABLE_ENTRIES; idx++, rte++) {
+ if ((idx & mask) != val)
+ continue;
+ if (action == OPAL_MAP_PE)
+ p->rte_cache[idx] = pe_number;
+ else
+ p->rte_cache[idx] = PHB3_RESERVED_PE_NUM;
+ *rte = p->rte_cache[idx];
+ }
+ }
+
+ /* Invalidate the entire RTC */
+ out_be64(p->regs + PHB_RTC_INVALIDATE, PHB_RTC_INVALIDATE_ALL);
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t phb3_set_peltv(struct phb *phb,
+ uint32_t parent_pe,
+ uint32_t child_pe,
+ uint8_t state)
+{
+ struct phb3 *p = phb_to_phb3(phb);
+ uint8_t *peltv;
+ uint32_t idx, mask;
+
+ /* Sanity check */
+ if (!p->tbl_peltv)
+ return OPAL_HARDWARE;
+ if (parent_pe >= PHB3_MAX_PE_NUM || child_pe >= PHB3_MAX_PE_NUM)
+ return OPAL_PARAMETER;
+
+ /* Find index for parent PE */
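+ /*
+ * Worked example (assuming PHB3_MAX_PE_NUM is 256): each parent PE
+ * owns a 256-bit (32-byte) vector, so child PE 10 lands in byte 1
+ * of that vector with mask 0x20.
+ */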
+ idx = parent_pe * (PHB3_MAX_PE_NUM / 8);
+ idx += (child_pe / 8);
+ mask = 0x1 << (7 - (child_pe % 8));
+
+ peltv = (uint8_t *)p->tbl_peltv;
+ peltv += idx;
+ if (state) {
+ *peltv |= mask;
+ p->peltv_cache[idx] |= mask;
+ } else {
+ *peltv &= ~mask;
+ p->peltv_cache[idx] &= ~mask;
+ }
+
+ return OPAL_SUCCESS;
+}
+
+static void phb3_prepare_link_change(struct pci_slot *slot,
+ bool is_up)
+{
+ struct phb3 *p = phb_to_phb3(slot->phb);
+ struct pci_device *pd = slot->pd;
+ uint32_t reg32;
+
+ p->has_link = is_up;
+ if (!is_up) {
+ if (!pd || !pd->slot || !pd->slot->surprise_pluggable) {
+ /* Mask PCIE port interrupts */
+ out_be64(p->regs + UTL_PCIE_PORT_IRQ_EN,
+ 0xad42800000000000UL);
+
+ pci_cfg_read32(&p->phb, 0,
+ p->aercap + PCIECAP_AER_UE_MASK, &reg32);
+ reg32 |= PCIECAP_AER_UE_MASK_SURPRISE_DOWN;
+ pci_cfg_write32(&p->phb, 0,
+ p->aercap + PCIECAP_AER_UE_MASK, reg32);
+ }
+
+ /* Mask AER receiver error */
+ phb3_pcicfg_read32(&p->phb, 0,
+ p->aercap + PCIECAP_AER_CE_MASK, &reg32);
+ reg32 |= PCIECAP_AER_CE_RECVR_ERR;
+ phb3_pcicfg_write32(&p->phb, 0,
+ p->aercap + PCIECAP_AER_CE_MASK, reg32);
+
+ /* Block PCI-CFG access */
+ p->flags |= PHB3_CFG_BLOCKED;
+ } else {
+ /* Clear AER receiver error status */
+ phb3_pcicfg_write32(&p->phb, 0,
+ p->aercap + PCIECAP_AER_CE_STATUS,
+ PCIECAP_AER_CE_RECVR_ERR);
+
+ /* Unmask receiver error status in AER */
+ phb3_pcicfg_read32(&p->phb, 0,
+ p->aercap + PCIECAP_AER_CE_MASK, &reg32);
+ reg32 &= ~PCIECAP_AER_CE_RECVR_ERR;
+ phb3_pcicfg_write32(&p->phb, 0,
+ p->aercap + PCIECAP_AER_CE_MASK, reg32);
+
+ /* Clear spurious errors and enable PCIE port interrupts */
+ out_be64(p->regs + UTL_PCIE_PORT_STATUS,
+ 0xffdfffffffffffffUL);
+
+ if (!pd || !pd->slot || !pd->slot->surprise_pluggable) {
+ out_be64(p->regs + UTL_PCIE_PORT_IRQ_EN,
+ 0xad52800000000000UL);
+
+ pci_cfg_read32(&p->phb, 0,
+ p->aercap + PCIECAP_AER_UE_MASK, &reg32);
+ reg32 &= ~PCIECAP_AER_UE_MASK_SURPRISE_DOWN;
+ pci_cfg_write32(&p->phb, 0,
+ p->aercap + PCIECAP_AER_UE_MASK, reg32);
+ }
+
+ /* Don't block PCI-CFG */
+ p->flags &= ~PHB3_CFG_BLOCKED;
+
+ /*
+ * We might lose the bus numbers during the reset operation
+ * and we need to restore them. Otherwise, some adapters (e.g.
+ * IPR) can't be probed properly by the kernel. We don't need
+ * to restore bus numbers for every kind of reset; however,
+ * it's not harmful to always restore them, which
+ * simplifies the logic.
+ */
+ pci_restore_bridge_buses(slot->phb, slot->pd);
+ if (slot->phb->ops->device_init)
+ pci_walk_dev(slot->phb, slot->pd,
+ slot->phb->ops->device_init, NULL);
+ }
+}
+
+static int64_t phb3_get_presence_state(struct pci_slot *slot, uint8_t *val)
+{
+ struct phb3 *p = phb_to_phb3(slot->phb);
+ uint64_t hp_override;
+
+ if (p->broken)
+ return OPAL_HARDWARE;
+
+ /*
+ * On P8, the slot status isn't wired up properly, so we have
+ * to use the hotplug override A/B bits.
+ */
+ hp_override = in_be64(p->regs + PHB_HOTPLUG_OVERRIDE);
+ if ((hp_override & PHB_HPOVR_PRESENCE_A) &&
+ (hp_override & PHB_HPOVR_PRESENCE_B))
+ *val = OPAL_PCI_SLOT_EMPTY;
+ else
+ *val = OPAL_PCI_SLOT_PRESENT;
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t phb3_get_link_state(struct pci_slot *slot, uint8_t *val)
+{
+ struct phb3 *p = phb_to_phb3(slot->phb);
+ uint64_t reg;
+ uint16_t state;
+ int64_t rc;
+
+ /* Check whether the link is up and, if so, report the negotiated width */
+ reg = in_be64(p->regs + PHB_PCIE_DLP_TRAIN_CTL);
+ if (!(reg & PHB_PCIE_DLP_TC_DL_LINKACT)) {
+ *val = 0;
+ return OPAL_SUCCESS;
+ }
+
+ rc = phb3_pcicfg_read16(&p->phb, 0,
+ p->ecap + PCICAP_EXP_LSTAT, &state);
+ if (rc != OPAL_SUCCESS) {
+ PHBERR(p, "%s: Error %lld getting link state\n", __func__, rc);
+ return OPAL_HARDWARE;
+ }
+
+ if (state & PCICAP_EXP_LSTAT_DLLL_ACT)
+ *val = ((state & PCICAP_EXP_LSTAT_WIDTH) >> 4);
+ else
+ *val = 0;
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t phb3_retry_state(struct pci_slot *slot)
+{
+ struct phb3 *p = phb_to_phb3(slot->phb);
+
+ if (slot->retry_state == PCI_SLOT_STATE_NORMAL)
+ return OPAL_WRONG_STATE;
+
+ PHBDBG(p, "Retry state %08x\n", slot->retry_state);
+ slot->delay_tgt_tb = 0;
+ pci_slot_set_state(slot, slot->retry_state);
+ slot->retry_state = PCI_SLOT_STATE_NORMAL;
+ return slot->ops.run_sm(slot);
+}
+
+static int64_t phb3_poll_link(struct pci_slot *slot)
+{
+ struct phb3 *p = phb_to_phb3(slot->phb);
+ uint64_t reg;
+ int64_t rc;
+
+ switch (slot->state) {
+ case PHB3_SLOT_NORMAL:
+ case PHB3_SLOT_LINK_START:
+ PHBDBG(p, "LINK: Start polling\n");
+ slot->retries = PHB3_LINK_ELECTRICAL_RETRIES;
+ pci_slot_set_state(slot, PHB3_SLOT_LINK_WAIT_ELECTRICAL);
+ return pci_slot_set_sm_timeout(slot, msecs_to_tb(100));
+ case PHB3_SLOT_LINK_WAIT_ELECTRICAL:
+ /*
+ * Wait for the link electrical connection to be
+ * established (shorter timeout). This allows us to
+ * work around spurious presence detect on some machines
+ * without waiting 10s each time.
+ *
+ * Note: We *also* check for the full link up bit here
+ * because simics doesn't seem to implement the electrical
+ * link bit at all
+ */
+ reg = in_be64(p->regs + PHB_PCIE_DLP_TRAIN_CTL);
+ if (reg & (PHB_PCIE_DLP_INBAND_PRESENCE |
+ PHB_PCIE_DLP_TC_DL_LINKACT)) {
+ PHBDBG(p, "LINK: Electrical link detected\n");
+ pci_slot_set_state(slot, PHB3_SLOT_LINK_WAIT);
+ slot->retries = PHB3_LINK_WAIT_RETRIES;
+ return pci_slot_set_sm_timeout(slot, msecs_to_tb(100));
+ }
+
+ if (slot->retries-- == 0) {
+ PHBDBG(p, "LINK: Timeout waiting for electrical link\n");
+ PHBDBG(p, "LINK: DLP train control: 0x%016llx\n", reg);
+ rc = phb3_retry_state(slot);
+ if (rc >= OPAL_SUCCESS)
+ return rc;
+
+ pci_slot_set_state(slot, PHB3_SLOT_NORMAL);
+ return OPAL_SUCCESS;
+ }
+ return pci_slot_set_sm_timeout(slot, msecs_to_tb(100));
+ case PHB3_SLOT_LINK_WAIT:
+ reg = in_be64(p->regs + PHB_PCIE_DLP_TRAIN_CTL);
+ if (reg & PHB_PCIE_DLP_TC_DL_LINKACT) {
+ PHBDBG(p, "LINK: Link is up\n");
+ if (slot->ops.prepare_link_change)
+ slot->ops.prepare_link_change(slot, true);
+ pci_slot_set_state(slot, PHB3_SLOT_NORMAL);
+ return OPAL_SUCCESS;
+ }
+
+ if (slot->retries-- == 0) {
+ PHBDBG(p, "LINK: Timeout waiting for link up\n");
+ PHBDBG(p, "LINK: DLP train control: 0x%016llx\n", reg);
+ rc = phb3_retry_state(slot);
+ if (rc >= OPAL_SUCCESS)
+ return rc;
+
+ pci_slot_set_state(slot, PHB3_SLOT_NORMAL);
+ return OPAL_SUCCESS;
+ }
+ return pci_slot_set_sm_timeout(slot, msecs_to_tb(100));
+ default:
+ PHBERR(p, "LINK: Unexpected slot state %08x\n",
+ slot->state);
+ }
+
+ pci_slot_set_state(slot, PHB3_SLOT_NORMAL);
+ return OPAL_HARDWARE;
+}
+
+static int64_t phb3_hreset(struct pci_slot *slot)
+{
+ struct phb3 *p = phb_to_phb3(slot->phb);
+ uint16_t brctl;
+ uint8_t presence = 1;
+
+ switch (slot->state) {
+ case PHB3_SLOT_NORMAL:
+ PHBDBG(p, "HRESET: Starts\n");
+ if (slot->ops.get_presence_state)
+ slot->ops.get_presence_state(slot, &presence);
+ if (!presence) {
+ PHBDBG(p, "HRESET: No device\n");
+ return OPAL_SUCCESS;
+ }
+
+ PHBDBG(p, "HRESET: Prepare for link down\n");
+ if (slot->ops.prepare_link_change)
+ slot->ops.prepare_link_change(slot, false);
+ /* fall through */
+ case PHB3_SLOT_HRESET_START:
+ PHBDBG(p, "HRESET: Assert\n");
+
+ phb3_pcicfg_read16(&p->phb, 0, PCI_CFG_BRCTL, &brctl);
+ brctl |= PCI_CFG_BRCTL_SECONDARY_RESET;
+ phb3_pcicfg_write16(&p->phb, 0, PCI_CFG_BRCTL, brctl);
+ pci_slot_set_state(slot, PHB3_SLOT_HRESET_DELAY);
+
+ return pci_slot_set_sm_timeout(slot, secs_to_tb(1));
+ case PHB3_SLOT_HRESET_DELAY:
+ PHBDBG(p, "HRESET: Deassert\n");
+
+ phb3_pcicfg_read16(&p->phb, 0, PCI_CFG_BRCTL, &brctl);
+ brctl &= ~PCI_CFG_BRCTL_SECONDARY_RESET;
+ phb3_pcicfg_write16(&p->phb, 0, PCI_CFG_BRCTL, brctl);
+
+ /*
+ * Due to some oddball adapters bouncing the link
+ * training a couple of times, we wait for a full second
+ * before we start checking the link status; otherwise
+ * we can get a spurious link down interrupt which
+ * causes us to EEH immediately.
+ */
+ pci_slot_set_state(slot, PHB3_SLOT_HRESET_DELAY2);
+ return pci_slot_set_sm_timeout(slot, secs_to_tb(1));
+ case PHB3_SLOT_HRESET_DELAY2:
+ pci_slot_set_state(slot, PHB3_SLOT_LINK_START);
+ return slot->ops.poll_link(slot);
+ default:
+ PHBERR(p, "Unexpected slot state %08x\n", slot->state);
+ }
+
+ pci_slot_set_state(slot, PHB3_SLOT_NORMAL);
+ return OPAL_HARDWARE;
+}
+
+static int64_t phb3_freset(struct pci_slot *slot)
+{
+ struct phb3 *p = phb_to_phb3(slot->phb);
+ uint8_t presence = 1;
+ uint64_t reg;
+
+ switch(slot->state) {
+ case PHB3_SLOT_NORMAL:
+ PHBDBG(p, "FRESET: Starts\n");
+
+ /* Nothing to do without adapter connected */
+ if (slot->ops.get_presence_state)
+ slot->ops.get_presence_state(slot, &presence);
+ if (!presence) {
+ PHBDBG(p, "FRESET: No device\n");
+ return OPAL_SUCCESS;
+ }
+
+ PHBDBG(p, "FRESET: Prepare for link down\n");
+ slot->retry_state = PHB3_SLOT_FRESET_START;
+ if (slot->ops.prepare_link_change)
+ slot->ops.prepare_link_change(slot, false);
+ /* fall through */
+ case PHB3_SLOT_FRESET_START:
+ if (!p->skip_perst) {
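+ /*
+ * Bit 0x2000000000000000 (PPC bit 2) of PHB_RESET drives PERST:
+ * cleared to assert the reset, set again to deassert it.
+ */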
+ PHBDBG(p, "FRESET: Assert\n");
+ reg = in_be64(p->regs + PHB_RESET);
+ reg &= ~0x2000000000000000ul;
+ out_be64(p->regs + PHB_RESET, reg);
+ pci_slot_set_state(slot,
+ PHB3_SLOT_FRESET_ASSERT_DELAY);
+ return pci_slot_set_sm_timeout(slot, secs_to_tb(1));
+ }
+
+ /* To skip the assert during boot time */
+ PHBDBG(p, "FRESET: Assert skipped\n");
+ pci_slot_set_state(slot, PHB3_SLOT_FRESET_ASSERT_DELAY);
+ p->skip_perst = false;
+ /* fall through */
+ case PHB3_SLOT_FRESET_ASSERT_DELAY:
+ PHBDBG(p, "FRESET: Deassert\n");
+ reg = in_be64(p->regs + PHB_RESET);
+ reg |= 0x2000000000000000ul;
+ out_be64(p->regs + PHB_RESET, reg);
+ pci_slot_set_state(slot,
+ PHB3_SLOT_FRESET_DEASSERT_DELAY);
+
+ /* CAPP FPGA requires 1s to flash before polling link */
+ return pci_slot_set_sm_timeout(slot, secs_to_tb(1));
+ case PHB3_SLOT_FRESET_DEASSERT_DELAY:
+ pci_slot_set_state(slot, PHB3_SLOT_LINK_START);
+ return slot->ops.poll_link(slot);
+ default:
+ PHBERR(p, "Unexpected slot state %08x\n", slot->state);
+ }
+
+ pci_slot_set_state(slot, PHB3_SLOT_NORMAL);
+ return OPAL_HARDWARE;
+}
+
+static int64_t load_capp_ucode(struct phb3 *p)
+{
+ int64_t rc;
+
+ if (p->index > PHB3_CAPP_MAX_PHB_INDEX(p))
+ return OPAL_HARDWARE;
+
+ /* 0x434150504c494448 = 'CAPPLIDH' in ASCII */
+ rc = capp_load_ucode(p->chip_id, p->phb.opal_id, p->index,
+ 0x434150504c494448UL, PHB3_CAPP_REG_OFFSET(p),
+ CAPP_APC_MASTER_ARRAY_ADDR_REG,
+ CAPP_APC_MASTER_ARRAY_WRITE_REG,
+ CAPP_SNP_ARRAY_ADDR_REG,
+ CAPP_SNP_ARRAY_WRITE_REG);
+ return rc;
+}
+
+static void do_capp_recovery_scoms(struct phb3 *p)
+{
+ uint64_t reg;
+ uint32_t offset;
+
+ PHBDBG(p, "Doing CAPP recovery scoms\n");
+
+ offset = PHB3_CAPP_REG_OFFSET(p);
+ /* disable snoops */
+ xscom_write(p->chip_id, SNOOP_CAPI_CONFIG + offset, 0);
+ load_capp_ucode(p);
+ /* clear err rpt reg*/
+ xscom_write(p->chip_id, CAPP_ERR_RPT_CLR + offset, 0);
+ /* clear capp fir */
+ xscom_write(p->chip_id, CAPP_FIR + offset, 0);
+
+ xscom_read(p->chip_id, CAPP_ERR_STATUS_CTRL + offset, &reg);
+ reg &= ~(PPC_BIT(0) | PPC_BIT(1));
+ xscom_write(p->chip_id, CAPP_ERR_STATUS_CTRL + offset, reg);
+}
+
+/*
+ * Disable CAPI mode on a PHB.
+ *
+ * Must be done while PHB is fenced and in recovery. Leaves CAPP in recovery -
+ * we can't come out of recovery until the PHB has been reinitialised.
+ *
+ * We don't reset generic error registers here - we rely on phb3_init_hw() to
+ * do that.
+ *
+ * Sets PHB3_CAPP_DISABLING flag when complete.
+ */
+static void disable_capi_mode(struct phb3 *p)
+{
+ struct proc_chip *chip = get_chip(p->chip_id);
+ uint64_t reg;
+ uint32_t offset = PHB3_CAPP_REG_OFFSET(p);
+
+ lock(&capi_lock);
+
+ xscom_read(p->chip_id, PE_CAPP_EN + PE_REG_OFFSET(p), &reg);
+ if (!(reg & PPC_BIT(0))) {
+ /* Not in CAPI mode, no action required */
+ goto out;
+ }
+
+ PHBDBG(p, "CAPP: Disabling CAPI mode\n");
+ if (!(chip->capp_phb3_attached_mask & (1 << p->index)))
+ PHBERR(p, "CAPP: CAPP attached mask not set!\n");
+
+ xscom_read(p->chip_id, CAPP_ERR_STATUS_CTRL + offset, &reg);
+ if (!(reg & PPC_BIT(0))) {
+ PHBERR(p, "CAPP: not in recovery, can't disable CAPI mode!\n");
+ goto out;
+ }
+
+ /* Snoop CAPI Configuration Register - disable snooping */
+ xscom_write(p->chip_id, SNOOP_CAPI_CONFIG + offset, 0ull);
+
+ /* APC Master PB Control Register - disable examining cResps */
+ xscom_read(p->chip_id, APC_MASTER_PB_CTRL + offset, &reg);
+ reg &= ~PPC_BIT(3);
+ xscom_write(p->chip_id, APC_MASTER_PB_CTRL + offset, reg);
+
+ /* APC Master Config Register - de-select PHBs */
+ xscom_read(p->chip_id, APC_MASTER_CAPI_CTRL + offset, &reg);
+ reg &= ~PPC_BITMASK(1, 3);
+ xscom_write(p->chip_id, APC_MASTER_CAPI_CTRL + offset, reg);
+
+ /* PE Bus AIB Mode Bits */
+ xscom_read(p->chip_id, p->pci_xscom + 0xf, &reg);
+ reg |= PPC_BITMASK(7, 8); /* Ch2 command credit */
+ reg &= ~PPC_BITMASK(40, 42); /* Disable HOL blocking */
+ xscom_write(p->chip_id, p->pci_xscom + 0xf, reg);
+
+ /* PCI Hardware Configuration 0 Register - all store queues free */
+ xscom_read(p->chip_id, p->pe_xscom + 0x18, &reg);
+ reg &= ~PPC_BIT(14);
+ reg |= PPC_BIT(15);
+ xscom_write(p->chip_id, p->pe_xscom + 0x18, reg);
+
+ /*
+ * PCI Hardware Configuration 1 Register - enable read response
+ * arrival/address request ordering
+ */
+ xscom_read(p->chip_id, p->pe_xscom + 0x19, &reg);
+ reg |= PPC_BITMASK(17,18);
+ xscom_write(p->chip_id, p->pe_xscom + 0x19, reg);
+
+ /*
+ * AIB TX Command Credit Register - set AIB credit values back to
+ * normal
+ */
+ xscom_read(p->chip_id, p->pci_xscom + 0xd, &reg);
+ reg |= PPC_BIT(42);
+ reg &= ~PPC_BITMASK(43, 47);
+ xscom_write(p->chip_id, p->pci_xscom + 0xd, reg);
+
+ /* AIB TX Credit Init Timer - reset timer */
+ xscom_write(p->chip_id, p->pci_xscom + 0xc, 0xff00000000000000UL);
+
+ /*
+ * PBCQ Mode Control Register - set dcache handling to normal, not CAPP
+ * mode
+ */
+ xscom_read(p->chip_id, p->pe_xscom + 0xb, &reg);
+ reg &= ~PPC_BIT(25);
+ xscom_write(p->chip_id, p->pe_xscom + 0xb, reg);
+
+ /* Registers touched by phb3_init_capp_regs() */
+
+ /* CAPP Transport Control Register */
+ xscom_write(p->chip_id, TRANSPORT_CONTROL + offset, 0x0001000000000000UL);
+
+ /* Canned pResp Map Register 0/1/2 */
+ xscom_write(p->chip_id, CANNED_PRESP_MAP0 + offset, 0);
+ xscom_write(p->chip_id, CANNED_PRESP_MAP1 + offset, 0);
+ xscom_write(p->chip_id, CANNED_PRESP_MAP2 + offset, 0);
+
+ /* Flush SUE State Map Register */
+ xscom_write(p->chip_id, FLUSH_SUE_STATE_MAP + offset, 0);
+
+ /* CAPP Epoch and Recovery Timers Control Register */
+ xscom_write(p->chip_id, CAPP_EPOCH_TIMER_CTRL + offset, 0);
+
+ /* PE Secure CAPP Enable Register - we're all done! Disable CAPP mode! */
+ xscom_write(p->chip_id, PE_CAPP_EN + PE_REG_OFFSET(p), 0ull);
+
+ /* Trigger CAPP recovery scoms after reinit */
+ p->flags |= PHB3_CAPP_DISABLING;
+
+ chip->capp_phb3_attached_mask &= ~(1 << p->index);
+
+out:
+ unlock(&capi_lock);
+}
+
+static int64_t phb3_creset(struct pci_slot *slot)
+{
+ struct phb3 *p = phb_to_phb3(slot->phb);
+ uint64_t cqsts, val;
+
+ switch (slot->state) {
+ case PHB3_SLOT_NORMAL:
+ case PHB3_SLOT_CRESET_START:
+ PHBDBG(p, "CRESET: Starts\n");
+
+ /* do steps 3-5 of capp recovery procedure */
+ if (p->flags & PHB3_CAPP_RECOVERY)
+ do_capp_recovery_scoms(p);
+
+ /*
+ * Users might be doing error injection through the PBCQ
+ * Error Inject Control Register. Without clearing that, we
+ * will get recursive errors during recovery and it will
+ * eventually fail.
+ */
+ xscom_write(p->chip_id, p->pe_xscom + 0xa, 0x0ul);
+
+ /*
+ * We might have escalated a frozen state on a non-existing PE
+ * to a fenced PHB. In that case the PHB isn't fenced at the
+ * hardware level and it's not safe to do an ETU reset, so we
+ * have to force-fence the PHB prior to the ETU reset.
+ */
+ if (!phb3_fenced(p))
+ xscom_write(p->chip_id, p->pe_xscom + 0x2, 0x000000f000000000ull);
+
+ /* Now that we're guaranteed to be fenced, disable CAPI mode */
+ if (!(p->flags & PHB3_CAPP_RECOVERY))
+ disable_capi_mode(p);
+
+ /* Clear errors in NFIR and raise ETU reset */
+ xscom_read(p->chip_id, p->pe_xscom + 0x0, &p->nfir_cache);
+
+ xscom_read(p->chip_id, p->spci_xscom + 1, &val);/* HW275117 */
+ xscom_write(p->chip_id, p->pci_xscom + 0xa,
+ 0x8000000000000000UL);
+ pci_slot_set_state(slot, PHB3_SLOT_CRESET_WAIT_CQ);
+ slot->retries = 500;
+ return pci_slot_set_sm_timeout(slot, msecs_to_tb(10));
+ case PHB3_SLOT_CRESET_WAIT_CQ:
+ xscom_read(p->chip_id, p->pe_xscom + 0x1c, &val);
+ xscom_read(p->chip_id, p->pe_xscom + 0x1d, &val);
+ xscom_read(p->chip_id, p->pe_xscom + 0x1e, &val);
+ xscom_read(p->chip_id, p->pe_xscom + 0xf, &cqsts);
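+ /* The top two bits of the CQ status flag outstanding transactions */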
+ if (!(cqsts & 0xC000000000000000UL)) {
+ PHBDBG(p, "CRESET: No pending transactions\n");
+ xscom_write(p->chip_id, p->pe_xscom + 0x1, ~p->nfir_cache);
+
+ pci_slot_set_state(slot, PHB3_SLOT_CRESET_REINIT);
+ return pci_slot_set_sm_timeout(slot, msecs_to_tb(100));
+ }
+
+ if (slot->retries-- == 0) {
+ PHBERR(p, "Timeout waiting for pending transaction\n");
+ goto error;
+ }
+ return pci_slot_set_sm_timeout(slot, msecs_to_tb(10));
+ case PHB3_SLOT_CRESET_REINIT:
+ PHBDBG(p, "CRESET: Reinitialization\n");
+
+ /*
+ * Clear AIB fenced state. Otherwise, we can't access the
+ * PCI config space of root complex when reinitializing
+ * the PHB.
+ */
+ p->flags &= ~PHB3_AIB_FENCED;
+ p->flags &= ~PHB3_CAPP_RECOVERY;
+ phb3_init_hw(p, false);
+
+ if (p->flags & PHB3_CAPP_DISABLING) {
+ do_capp_recovery_scoms(p);
+ p->flags &= ~PHB3_CAPP_DISABLING;
+ }
+
+ pci_slot_set_state(slot, PHB3_SLOT_CRESET_FRESET);
+ return pci_slot_set_sm_timeout(slot, msecs_to_tb(100));
+ case PHB3_SLOT_CRESET_FRESET:
+ pci_slot_set_state(slot, PHB3_SLOT_NORMAL);
+ return slot->ops.freset(slot);
+ default:
+ PHBERR(p, "CRESET: Unexpected slot state %08x\n",
+ slot->state);
+ }
+
+error:
+ return OPAL_HARDWARE;
+}
+
+/*
+ * Initialize root complex slot, which is mainly used to
+ * do fundamental reset before PCI enumeration in PCI core.
+ * When probing root complex and building its real slot,
+ * the operations will be copied over.
+ */
+static struct pci_slot *phb3_slot_create(struct phb *phb)
+{
+ struct pci_slot *slot;
+
+ slot = pci_slot_alloc(phb, NULL);
+ if (!slot)
+ return slot;
+
+ /* Elementary functions */
+ slot->ops.get_presence_state = phb3_get_presence_state;
+ slot->ops.get_link_state = phb3_get_link_state;
+ slot->ops.get_power_state = NULL;
+ slot->ops.get_attention_state = NULL;
+ slot->ops.get_latch_state = NULL;
+ slot->ops.set_power_state = NULL;
+ slot->ops.set_attention_state = NULL;
+
+ /*
+ * For PHB slots, we have to split the fundamental reset
+ * into 2 steps. We might not have the first step which
+ * is to power off/on the slot, or it's controlled by
+ * individual platforms.
+ */
+ slot->ops.prepare_link_change = phb3_prepare_link_change;
+ slot->ops.poll_link = phb3_poll_link;
+ slot->ops.hreset = phb3_hreset;
+ slot->ops.freset = phb3_freset;
+ slot->ops.creset = phb3_creset;
+
+ return slot;
+}
+
+static int64_t phb3_eeh_freeze_status(struct phb *phb, uint64_t pe_number,
+ uint8_t *freeze_state,
+ uint16_t *pci_error_type,
+ uint16_t *severity)
+{
+ struct phb3 *p = phb_to_phb3(phb);
+ uint64_t peev_bit = PPC_BIT(pe_number & 0x3f);
+ uint64_t peev, pesta, pestb;
+
+ /* Defaults: not frozen */
+ *freeze_state = OPAL_EEH_STOPPED_NOT_FROZEN;
+ *pci_error_type = OPAL_EEH_NO_ERROR;
+
+ /* Check dead */
+ if (p->broken) {
+ *freeze_state = OPAL_EEH_STOPPED_MMIO_DMA_FREEZE;
+ *pci_error_type = OPAL_EEH_PHB_ERROR;
+ if (severity)
+ *severity = OPAL_EEH_SEV_PHB_DEAD;
+ return OPAL_HARDWARE;
+ }
+
+ /* Check fence and CAPP recovery */
+ if (phb3_fenced(p) || (p->flags & PHB3_CAPP_RECOVERY)) {
+ *freeze_state = OPAL_EEH_STOPPED_MMIO_DMA_FREEZE;
+ *pci_error_type = OPAL_EEH_PHB_ERROR;
+ if (severity)
+ *severity = OPAL_EEH_SEV_PHB_FENCED;
+ return OPAL_SUCCESS;
+ }
+
+ /* Check the PEEV */
+ phb3_ioda_sel(p, IODA2_TBL_PEEV, pe_number / 64, false);
+ peev = in_be64(p->regs + PHB_IODA_DATA0);
+ if (!(peev & peev_bit))
+ return OPAL_SUCCESS;
+
+ /* Indicate that we have an ER pending */
+ phb3_set_err_pending(p, true);
+ if (severity)
+ *severity = OPAL_EEH_SEV_PE_ER;
+
+ /* Read the PESTA & PESTB */
+ phb3_ioda_sel(p, IODA2_TBL_PESTA, pe_number, false);
+ pesta = in_be64(p->regs + PHB_IODA_DATA0);
+ phb3_ioda_sel(p, IODA2_TBL_PESTB, pe_number, false);
+ pestb = in_be64(p->regs + PHB_IODA_DATA0);
+
+ /* Convert them */
+ if (pesta & IODA2_PESTA_MMIO_FROZEN)
+ *freeze_state |= OPAL_EEH_STOPPED_MMIO_FREEZE;
+ if (pestb & IODA2_PESTB_DMA_STOPPED)
+ *freeze_state |= OPAL_EEH_STOPPED_DMA_FREEZE;
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t phb3_eeh_freeze_clear(struct phb *phb, uint64_t pe_number,
+ uint64_t eeh_action_token)
+{
+ struct phb3 *p = phb_to_phb3(phb);
+ uint64_t err, peev[4];
+ int32_t i;
+ bool frozen_pe = false;
+
+ if (p->broken)
+ return OPAL_HARDWARE;
+
+ /* Check the error summary. If nothing is set, move on to
+ * clearing the PESTs, which can contain a freeze state from a
+ * previous error or one set explicitly by the user.
+ */
+ err = in_be64(p->regs + PHB_ETU_ERR_SUMMARY);
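+ /* An all-ones read usually means the ETU is inaccessible because the PHB is fenced */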
+ if (err == 0xffffffffffffffffUL) {
+ if (phb3_fenced(p)) {
+ PHBERR(p, "eeh_freeze_clear on fenced PHB\n");
+ return OPAL_HARDWARE;
+ }
+ }
+ if (err != 0)
+ phb3_err_ER_clear(p);
+
+ /*
+ * We also have the PEEV in system memory; accessing it there
+ * directly would give better performance.
+ */
+ if (eeh_action_token & OPAL_EEH_ACTION_CLEAR_FREEZE_MMIO) {
+ phb3_ioda_sel(p, IODA2_TBL_PESTA, pe_number, false);
+ out_be64(p->regs + PHB_IODA_DATA0, 0);
+ }
+ if (eeh_action_token & OPAL_EEH_ACTION_CLEAR_FREEZE_DMA) {
+ phb3_ioda_sel(p, IODA2_TBL_PESTB, pe_number, false);
+ out_be64(p->regs + PHB_IODA_DATA0, 0);
+ }
+
+ /* Update ER pending indication */
+ phb3_ioda_sel(p, IODA2_TBL_PEEV, 0, true);
+ for (i = 0; i < ARRAY_SIZE(peev); i++) {
+ peev[i] = in_be64(p->regs + PHB_IODA_DATA0);
+ if (peev[i]) {
+ frozen_pe = true;
+ break;
+ }
+ }
+ if (frozen_pe) {
+ p->err.err_src = PHB3_ERR_SRC_PHB;
+ p->err.err_class = PHB3_ERR_CLASS_ER;
+ p->err.err_bit = -1;
+ phb3_set_err_pending(p, true);
+ } else
+ phb3_set_err_pending(p, false);
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t phb3_eeh_freeze_set(struct phb *phb, uint64_t pe_number,
+ uint64_t eeh_action_token)
+{
+ struct phb3 *p = phb_to_phb3(phb);
+ uint64_t data;
+
+ if (p->broken)
+ return OPAL_HARDWARE;
+
+ if (pe_number >= PHB3_MAX_PE_NUM)
+ return OPAL_PARAMETER;
+
+ if (eeh_action_token != OPAL_EEH_ACTION_SET_FREEZE_MMIO &&
+ eeh_action_token != OPAL_EEH_ACTION_SET_FREEZE_DMA &&
+ eeh_action_token != OPAL_EEH_ACTION_SET_FREEZE_ALL)
+ return OPAL_PARAMETER;
+
+ if (eeh_action_token & OPAL_EEH_ACTION_SET_FREEZE_MMIO) {
+ phb3_ioda_sel(p, IODA2_TBL_PESTA, pe_number, false);
+ data = in_be64(p->regs + PHB_IODA_DATA0);
+ data |= IODA2_PESTA_MMIO_FROZEN;
+ out_be64(p->regs + PHB_IODA_DATA0, data);
+ }
+
+ if (eeh_action_token & OPAL_EEH_ACTION_SET_FREEZE_DMA) {
+ phb3_ioda_sel(p, IODA2_TBL_PESTB, pe_number, false);
+ data = in_be64(p->regs + PHB_IODA_DATA0);
+ data |= IODA2_PESTB_DMA_STOPPED;
+ out_be64(p->regs + PHB_IODA_DATA0, data);
+ }
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t phb3_eeh_next_error(struct phb *phb,
+ uint64_t *first_frozen_pe,
+ uint16_t *pci_error_type,
+ uint16_t *severity)
+{
+ struct phb3 *p = phb_to_phb3(phb);
+ uint64_t fir, peev[4];
+ uint32_t cfg32;
+ int32_t i, j;
+
+ /* If the PHB is broken, we needn't go forward */
+ if (p->broken) {
+ *pci_error_type = OPAL_EEH_PHB_ERROR;
+ *severity = OPAL_EEH_SEV_PHB_DEAD;
+ return OPAL_SUCCESS;
+ }
+
+ if ((p->flags & PHB3_CAPP_RECOVERY)) {
+ *pci_error_type = OPAL_EEH_PHB_ERROR;
+ *severity = OPAL_EEH_SEV_PHB_FENCED;
+ return OPAL_SUCCESS;
+ }
+
+ /*
+ * Check if we already have pending errors. If so, gather
+ * more information about them, trying the PBCQ prior to
+ * the PHB.
+ */
+ if (phb3_err_pending(p) &&
+ !phb3_err_check_pbcq(p) &&
+ !phb3_err_check_lem(p))
+ phb3_set_err_pending(p, false);
+
+ /* Clear result */
+ *pci_error_type = OPAL_EEH_NO_ERROR;
+ *severity = OPAL_EEH_SEV_NO_ERROR;
+ *first_frozen_pe = (uint64_t)-1;
+
+ /* Check frozen PEs */
+ if (!phb3_err_pending(p)) {
+ phb3_ioda_sel(p, IODA2_TBL_PEEV, 0, true);
+ for (i = 0; i < ARRAY_SIZE(peev); i++) {
+ peev[i] = in_be64(p->regs + PHB_IODA_DATA0);
+ if (peev[i]) {
+ p->err.err_src = PHB3_ERR_SRC_PHB;
+ p->err.err_class = PHB3_ERR_CLASS_ER;
+ p->err.err_bit = -1;
+ phb3_set_err_pending(p, true);
+ break;
+ }
+ }
+ }
+
+ /* Mapping errors */
+ if (phb3_err_pending(p)) {
+ /*
+ * If the frozen PE was caused by a malformed TLP, we need
+ * to reset the PHB, so convert the ER to a PHB-fatal error
+ * in that case.
+ */
+ if (p->err.err_class == PHB3_ERR_CLASS_ER) {
+ fir = phb3_read_reg_asb(p, PHB_LEM_FIR_ACCUM);
+ if (fir & PPC_BIT(60)) {
+ phb3_pcicfg_read32(&p->phb, 0,
+ p->aercap + PCIECAP_AER_UE_STATUS, &cfg32);
+ if (cfg32 & PCIECAP_AER_UE_MALFORMED_TLP)
+ p->err.err_class = PHB3_ERR_CLASS_FENCED;
+ }
+ }
+
+ switch (p->err.err_class) {
+ case PHB3_ERR_CLASS_DEAD:
+ *pci_error_type = OPAL_EEH_PHB_ERROR;
+ *severity = OPAL_EEH_SEV_PHB_DEAD;
+ break;
+ case PHB3_ERR_CLASS_FENCED:
+ *pci_error_type = OPAL_EEH_PHB_ERROR;
+ *severity = OPAL_EEH_SEV_PHB_FENCED;
+ break;
+ case PHB3_ERR_CLASS_ER:
+ *pci_error_type = OPAL_EEH_PE_ERROR;
+ *severity = OPAL_EEH_SEV_PE_ER;
+
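+ /* Scan the PEEV bitmap for the first frozen PE */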
+ phb3_ioda_sel(p, IODA2_TBL_PEEV, 0, true);
+ for (i = 0; i < ARRAY_SIZE(peev); i++)
+ peev[i] = in_be64(p->regs + PHB_IODA_DATA0);
+ for (i = ARRAY_SIZE(peev) - 1; i >= 0; i--) {
+ for (j = 0; j < 64; j++) {
+ if (peev[i] & PPC_BIT(j)) {
+ *first_frozen_pe = i * 64 + j;
+ break;
+ }
+ }
+
+ if (*first_frozen_pe != (uint64_t)(-1))
+ break;
+ }
+
+ /* No frozen PE ? */
+ if (*first_frozen_pe == (uint64_t)-1) {
+ *pci_error_type = OPAL_EEH_NO_ERROR;
+ *severity = OPAL_EEH_SEV_NO_ERROR;
+ phb3_set_err_pending(p, false);
+ }
+
+ break;
+ case PHB3_ERR_CLASS_INF:
+ *pci_error_type = OPAL_EEH_PHB_ERROR;
+ *severity = OPAL_EEH_SEV_INF;
+ break;
+ default:
+ *pci_error_type = OPAL_EEH_NO_ERROR;
+ *severity = OPAL_EEH_SEV_NO_ERROR;
+ phb3_set_err_pending(p, false);
+ }
+ }
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t phb3_err_inject_finalize(struct phb3 *p, uint64_t addr,
+ uint64_t mask, uint64_t ctrl,
+ bool is_write)
+{
+ if (is_write)
+ ctrl |= PHB_PAPR_ERR_INJ_CTL_WR;
+ else
+ ctrl |= PHB_PAPR_ERR_INJ_CTL_RD;
+
+ out_be64(p->regs + PHB_PAPR_ERR_INJ_ADDR, addr);
+ out_be64(p->regs + PHB_PAPR_ERR_INJ_MASK, mask);
+ out_be64(p->regs + PHB_PAPR_ERR_INJ_CTL, ctrl);
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t phb3_err_inject_mem32(struct phb3 *p, uint64_t pe_number,
+ uint64_t addr, uint64_t mask,
+ bool is_write)
+{
+ uint64_t base, len, segstart, segsize;
+ uint64_t a, m;
+ uint64_t ctrl = PHB_PAPR_ERR_INJ_CTL_OUTB;
+ uint32_t index;
+
+ segsize = (M32_PCI_SIZE / PHB3_MAX_PE_NUM);
+ a = base = len = 0x0ull;
+
+ for (index = 0; index < PHB3_MAX_PE_NUM; index++) {
+ if (GETFIELD(IODA2_M32DT_PE, p->m32d_cache[index]) != pe_number)
+ continue;
+
+ /* Obviously, we can't support discontiguous segments; we
+ * have to pick the first batch of contiguous segments in
+ * that case.
+ */
+ segstart = p->mm1_base + segsize * index;
+ if (!len) {
+ base = segstart;
+ len = segsize;
+ } else if ((base + len) == segstart) {
+ len += segsize;
+ }
+
+ /* Check whether the specified address is a valid one */
+ if (addr >= segstart && addr < (segstart + segsize)) {
+ a = addr;
+ break;
+ }
+ }
+
+ /* No MM32 segments assigned to the PE */
+ if (!len)
+ return OPAL_PARAMETER;
+
+ /* Specified address is out of range */
+ if (!a) {
+ a = base;
+ len = len & ~(len - 1);
+ m = ~(len - 1);
+ } else {
+ m = mask;
+ }
+
+ a = SETFIELD(PHB_PAPR_ERR_INJ_ADDR_MMIO, 0x0ull, a);
+ m = SETFIELD(PHB_PAPR_ERR_INJ_MASK_MMIO, 0x0ull, m);
+
+ return phb3_err_inject_finalize(p, a, m, ctrl, is_write);
+}
+
+static int64_t phb3_err_inject_mem64(struct phb3 *p, uint64_t pe_number,
+ uint64_t addr, uint64_t mask,
+ bool is_write)
+{
+ uint64_t base, len, segstart, segsize;
+ uint64_t cache, a, m;
+ uint64_t ctrl = PHB_PAPR_ERR_INJ_CTL_OUTB;
+ uint32_t index, s_index, e_index;
+
+ /* By default, the PE is a PCI device dependent one */
+ s_index = 0;
+ e_index = ARRAY_SIZE(p->m64b_cache) - 2;
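+ /* Device-dependent PEs may use any but the last M64 BAR; bus-dependent PEs use only the last one */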
+ for (index = 0; index < RTT_TABLE_ENTRIES; index++) {
+ if (p->rte_cache[index] != pe_number)
+ continue;
+
+ if (index + 8 >= RTT_TABLE_ENTRIES)
+ break;
+
+ /* PCI bus dependent PE */
+ if (p->rte_cache[index + 8] == pe_number) {
+ s_index = e_index = ARRAY_SIZE(p->m64b_cache) - 1;
+ break;
+ }
+ }
+
+ a = base = len = 0x0ull;
+ for (index = s_index; !len && index <= e_index; index++) {
+ cache = p->m64b_cache[index];
+ if (!(cache & IODA2_M64BT_ENABLE))
+ continue;
+
+ if (cache & IODA2_M64BT_SINGLE_PE) {
+ if (GETFIELD(IODA2_M64BT_PE_HI, cache) != (pe_number >> 5) ||
+ GETFIELD(IODA2_M64BT_PE_LOW, cache) != (pe_number & 0x1f))
+ continue;
+
+ segstart = GETFIELD(IODA2_M64BT_SINGLE_BASE, cache);
+ segstart <<= 25; /* 32MB aligned */
+ segsize = GETFIELD(IODA2_M64BT_SINGLE_MASK, cache);
+ segsize = (0x2000000ull - segsize) << 25;
+ } else {
+ segstart = GETFIELD(IODA2_M64BT_BASE, cache);
+ segstart <<= 20; /* 1MB aligned */
+ segsize = GETFIELD(IODA2_M64BT_MASK, cache);
+ segsize = (0x40000000ull - segsize) << 20;
+
+ segsize /= PHB3_MAX_PE_NUM;
+ segstart = segstart + segsize * pe_number;
+ }
+
+ /* The first window always wins, based on the ascending
+ * search priority the 16 BARs have. We use this feature
+ * to assign resources to SRIOV VFs.
+ */
+ if (!len) {
+ base = segstart;
+ len = segsize;
+ }
+
+ /* The specified address is a valid one */
+ if (addr >= segstart && addr < (segstart + segsize)) {
+ a = addr;
+ }
+ }
+
+ /* No MM64 segments assigned to the PE */
+ if (!len)
+ return OPAL_PARAMETER;
+
+ /* Address specified or calculated */
+ if (!a) {
+ a = base;
+ len = len & ~(len - 1);
+ m = ~(len - 1);
+ } else {
+ m = mask;
+ }
+
+ a = SETFIELD(PHB_PAPR_ERR_INJ_ADDR_MMIO, 0x0ull, a);
+ m = SETFIELD(PHB_PAPR_ERR_INJ_MASK_MMIO, 0x0ull, m);
+
+ return phb3_err_inject_finalize(p, a, m, ctrl, is_write);
+}
+
+static int64_t phb3_err_inject_cfg(struct phb3 *p, uint64_t pe_number,
+ uint64_t addr, uint64_t mask,
+ bool is_write)
+{
+ uint64_t a, m, prefer;
+ uint64_t ctrl = PHB_PAPR_ERR_INJ_CTL_CFG;
+ int bdfn;
+ bool is_bus_pe;
+
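+ /* 0xffff acts as a sentinel meaning "no config address found yet" */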
+ a = 0xffffull;
+ prefer = 0xffffull;
+ m = PHB_PAPR_ERR_INJ_MASK_CFG_ALL;
+ for (bdfn = 0; bdfn < RTT_TABLE_ENTRIES; bdfn++) {
+ if (p->rte_cache[bdfn] != pe_number)
+ continue;
+
+ /* The PE can be associated with a PCI bus or a device */
+ is_bus_pe = false;
+ if ((bdfn + 8) < RTT_TABLE_ENTRIES &&
+ p->rte_cache[bdfn + 8] == pe_number)
+ is_bus_pe = true;
+
+ /* Figure out the PCI config address */
+ if (prefer == 0xffffull) {
+ if (is_bus_pe) {
+ m = PHB_PAPR_ERR_INJ_MASK_CFG;
+ prefer = SETFIELD(m, 0x0ull, PCI_BUS_NUM(bdfn));
+ } else {
+ m = PHB_PAPR_ERR_INJ_MASK_CFG_ALL;
+ prefer = SETFIELD(m, 0x0ull, bdfn);
+ }
+ }
+
+ /* Check whether the input address is valid */
+ if (!is_bus_pe &&
+ GETFIELD(PHB_PAPR_ERR_INJ_MASK_CFG_ALL, addr) == bdfn) {
+ a = addr;
+ break;
+ }
+
+ if (is_bus_pe &&
+ GETFIELD(PHB_PAPR_ERR_INJ_MASK_CFG, addr) == PCI_BUS_NUM(bdfn)) {
+ a = addr;
+ break;
+ }
+ }
+
+ /* Invalid PE number */
+ if (prefer == 0xffffull)
+ return OPAL_PARAMETER;
+
+ /* Specified address is out of range */
+ if (a == 0xffffull)
+ a = prefer;
+ else
+ m = mask;
+
+ return phb3_err_inject_finalize(p, a, m, ctrl, is_write);
+}
+
+static int64_t phb3_err_inject_dma(struct phb3 *p, uint64_t pe_number,
+ uint64_t addr, uint64_t mask,
+ bool is_write, bool is_64bits)
+{
+ uint32_t index, page_size;
+ uint64_t tve, table_entries;
+ uint64_t base, start, end, len, a, m;
+ uint64_t ctrl = PHB_PAPR_ERR_INJ_CTL_INB;
+
+ /* TVE index and base address */
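+ /* Each PE owns a pair of TVEs: the even entry maps its 32-bit DMA window, the odd one its 64-bit window */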
+ if (!is_64bits) {
+ index = (pe_number << 1);
+ base = 0x0ull;
+ } else {
+ index = ((pe_number << 1) + 1);
+ base = (0x1ull << 59);
+ }
+
+ /* Raw data of table entries and page size */
+ tve = p->tve_cache[index];
+ table_entries = GETFIELD(IODA2_TVT_TCE_TABLE_SIZE, tve);
+ table_entries = (0x1ull << (table_entries + 8));
+ page_size = GETFIELD(IODA2_TVT_IO_PSIZE, tve);
+ if (!page_size && !(tve & PPC_BIT(51)))
+ return OPAL_UNSUPPORTED;
+
+ /* Check the page size */
+ switch (page_size) {
+ case 0: /* bypass */
+ start = ((tve & (0x3ull << 10)) << 14) |
+ ((tve & (0xffffffull << 40)) >> 40);
+ end = ((tve & (0x3ull << 8)) << 16) |
+ ((tve & (0xffffffull << 16)) >> 16);
+
+ /* 16MB aligned size */
+ len = (end - start) << 24;
+ break;
+ case 5: /* 64KB */
+ len = table_entries * 0x10000ull;
+ break;
+ case 13: /* 16MB */
+ len = table_entries * 0x1000000ull;
+ break;
+ case 17: /* 256MB */
+ len = table_entries * 0x10000000ull;
+ break;
+ case 1: /* 4KB */
+ default:
+ len = table_entries * 0x1000ull;
+ }
+
+ /* The specified address is in range */
+ if (addr && addr >= base && addr < (base + len)) {
+ a = addr;
+ m = mask;
+ } else {
+ a = base;
+ len = len & ~(len - 1);
+ m = ~(len - 1);
+ }
+
+ return phb3_err_inject_finalize(p, a, m, ctrl, is_write);
+}
+
+static int64_t phb3_err_inject_dma32(struct phb3 *p, uint64_t pe_number,
+ uint64_t addr, uint64_t mask,
+ bool is_write)
+{
+ return phb3_err_inject_dma(p, pe_number, addr, mask, is_write, false);
+}
+
+static int64_t phb3_err_inject_dma64(struct phb3 *p, uint64_t pe_number,
+ uint64_t addr, uint64_t mask,
+ bool is_write)
+{
+ return phb3_err_inject_dma(p, pe_number, addr, mask, is_write, true);
+}
+
+static int64_t phb3_err_inject(struct phb *phb, uint64_t pe_number,
+ uint32_t type, uint32_t func,
+ uint64_t addr, uint64_t mask)
+{
+ struct phb3 *p = phb_to_phb3(phb);
+ int64_t (*handler)(struct phb3 *p, uint64_t pe_number,
+ uint64_t addr, uint64_t mask, bool is_write);
+ bool is_write;
+
+ /* How could we get here without a valid RTT? */
+ if (!p->tbl_rtt)
+ return OPAL_HARDWARE;
+
+ /* We can't inject error to the reserved PE */
+ if (pe_number == PHB3_RESERVED_PE_NUM || pe_number >= PHB3_MAX_PE_NUM)
+ return OPAL_PARAMETER;
+
+ /* Clear leftover from last time */
+ out_be64(p->regs + PHB_PAPR_ERR_INJ_CTL, 0x0ul);
+
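+ /* Pick the handler based on the target space (MMIO, config or DMA) and 32/64-bit flavour */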
+ switch (func) {
+ case OPAL_ERR_INJECT_FUNC_IOA_LD_MEM_ADDR:
+ case OPAL_ERR_INJECT_FUNC_IOA_LD_MEM_DATA:
+ is_write = false;
+ if (type == OPAL_ERR_INJECT_TYPE_IOA_BUS_ERR64)
+ handler = phb3_err_inject_mem64;
+ else
+ handler = phb3_err_inject_mem32;
+ break;
+ case OPAL_ERR_INJECT_FUNC_IOA_ST_MEM_ADDR:
+ case OPAL_ERR_INJECT_FUNC_IOA_ST_MEM_DATA:
+ is_write = true;
+ if (type == OPAL_ERR_INJECT_TYPE_IOA_BUS_ERR64)
+ handler = phb3_err_inject_mem64;
+ else
+ handler = phb3_err_inject_mem32;
+ break;
+ case OPAL_ERR_INJECT_FUNC_IOA_LD_CFG_ADDR:
+ case OPAL_ERR_INJECT_FUNC_IOA_LD_CFG_DATA:
+ is_write = false;
+ handler = phb3_err_inject_cfg;
+ break;
+ case OPAL_ERR_INJECT_FUNC_IOA_ST_CFG_ADDR:
+ case OPAL_ERR_INJECT_FUNC_IOA_ST_CFG_DATA:
+ is_write = true;
+ handler = phb3_err_inject_cfg;
+ break;
+ case OPAL_ERR_INJECT_FUNC_IOA_DMA_RD_ADDR:
+ case OPAL_ERR_INJECT_FUNC_IOA_DMA_RD_DATA:
+ case OPAL_ERR_INJECT_FUNC_IOA_DMA_RD_MASTER:
+ case OPAL_ERR_INJECT_FUNC_IOA_DMA_RD_TARGET:
+ is_write = false;
+ if (type == OPAL_ERR_INJECT_TYPE_IOA_BUS_ERR64)
+ handler = phb3_err_inject_dma64;
+ else
+ handler = phb3_err_inject_dma32;
+ break;
+ case OPAL_ERR_INJECT_FUNC_IOA_DMA_WR_ADDR:
+ case OPAL_ERR_INJECT_FUNC_IOA_DMA_WR_DATA:
+ case OPAL_ERR_INJECT_FUNC_IOA_DMA_WR_MASTER:
+ case OPAL_ERR_INJECT_FUNC_IOA_DMA_WR_TARGET:
+ is_write = true;
+ if (type == OPAL_ERR_INJECT_TYPE_IOA_BUS_ERR64)
+ handler = phb3_err_inject_dma64;
+ else
+ handler = phb3_err_inject_dma32;
+ break;
+ default:
+ return OPAL_PARAMETER;
+ }
+
+ return handler(p, pe_number, addr, mask, is_write);
+}
+
+static int64_t phb3_get_diag_data(struct phb *phb,
+ void *diag_buffer,
+ uint64_t diag_buffer_len)
+{
+ struct phb3 *p = phb_to_phb3(phb);
+ struct OpalIoPhb3ErrorData *data = diag_buffer;
+ bool fenced;
+
+ if (diag_buffer_len < sizeof(struct OpalIoPhb3ErrorData))
+ return OPAL_PARAMETER;
+ if (p->broken)
+ return OPAL_HARDWARE;
+
+ /*
+ * Dummy check for fence so that phb3_read_phb_status knows
+ * whether to use ASB or AIB
+ */
+ fenced = phb3_fenced(p);
+ phb3_read_phb_status(p, data);
+
+ if (!fenced)
+ phb3_eeh_dump_regs(p, data);
+
+ /*
+ * We probably got here because of errors (INF class). In
+ * that case, we need to clear the error explicitly.
+ */
+ if (phb3_err_pending(p) &&
+ p->err.err_class == PHB3_ERR_CLASS_INF &&
+ p->err.err_src == PHB3_ERR_SRC_PHB) {
+ phb3_err_ER_clear(p);
+ phb3_set_err_pending(p, false);
+ }
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t phb3_get_capp_info(int chip_id, struct phb *phb,
+ struct capp_info *info)
+{
+ struct phb3 *p = phb_to_phb3(phb);
+ struct proc_chip *chip = get_chip(p->chip_id);
+ uint32_t offset;
+
+ if (chip_id != p->chip_id)
+ return OPAL_PARAMETER;
+
+ if (!((1 << p->index) & chip->capp_phb3_attached_mask))
+ return OPAL_PARAMETER;
+
+ offset = PHB3_CAPP_REG_OFFSET(p);
+
+ if (PHB3_IS_NAPLES(p)) {
+ if (p->index == 0)
+ info->capp_index = 0;
+ else
+ info->capp_index = 1;
+ } else
+ info->capp_index = 0;
+ info->phb_index = p->index;
+ info->capp_fir_reg = CAPP_FIR + offset;
+ info->capp_fir_mask_reg = CAPP_FIR_MASK + offset;
+ info->capp_fir_action0_reg = CAPP_FIR_ACTION0 + offset;
+ info->capp_fir_action1_reg = CAPP_FIR_ACTION1 + offset;
+ info->capp_err_status_ctrl_reg = CAPP_ERR_STATUS_CTRL + offset;
+
+ return OPAL_SUCCESS;
+}
+
+static void phb3_init_capp_regs(struct phb3 *p, bool dma_mode)
+{
+ uint64_t reg;
+ uint32_t offset;
+ uint64_t read_buffers = 0;
+
+ offset = PHB3_CAPP_REG_OFFSET(p);
+ xscom_read(p->chip_id, APC_MASTER_PB_CTRL + offset, &reg);
+ reg &= ~PPC_BITMASK(10, 11);
+ reg |= PPC_BIT(3);
+ if (dma_mode) {
+ /* In DMA mode, the CAPP only owns some of the PHB read buffers */
+ read_buffers = 0x1;
+
+ /*
+ * HW301991 - XSL sends PTE updates with nodal scope instead of
+ * group scope. The workaround is to force all commands to
+ * unlimited scope by setting bit 4. This may have a slight
+ * performance impact, but it would be negligible on the XSL.
+ * To avoid the possibility it might impact other cards, key it
+ * off DMA mode since the XSL based Mellanox CX4 is the only
+ * card to use this mode in P8 timeframe:
+ */
+ reg |= PPC_BIT(4);
+ }
+ reg |= read_buffers << PPC_BITLSHIFT(11);
+ xscom_write(p->chip_id, APC_MASTER_PB_CTRL + offset, reg);
+
+ /* Dynamically work out which PHB to connect to port 0 of the CAPP.
+ * Here is the table from the CAPP workbook:
+ * APC_MASTER CAPP CAPP
+ * bits 1:3 port0 port1
+ * 000 disabled disabled
+ * * 001 PHB2 disabled
+ * * 010 PHB1 disabled
+ * 011 PHB1 PHB2
+ * * 100 PHB0 disabled
+ * 101 PHB0 PHB2
+ * 110 PHB0 PHB1
+ *
+ * We don't use port1 so only those starred above are used.
+ * Hence reduce table to:
+ * PHB0 -> APC MASTER(bits 1:3) = 0b100
+ * PHB1 -> APC MASTER(bits 1:3) = 0b010
+ * PHB2 -> APC MASTER(bits 1:3) = 0b001
+ *
+ * Note: Naples has two CAPP units, statically mapped:
+ * CAPP0/PHB0 -> APC MASTER(bits 1:3) = 0b100
+ * CAPP1/PHB1 -> APC MASTER(bits 1:3) = 0b010
+ */
+ reg = 0x4000000000000000ULL >> p->index;
+ reg |= 0x0070000000000000UL;
+ xscom_write(p->chip_id, APC_MASTER_CAPI_CTRL + offset, reg);
+ PHBINF(p, "CAPP: port attached\n");
+
+ /* tlb and mmio */
+ xscom_write(p->chip_id, TRANSPORT_CONTROL + offset, 0x4028000104000000UL);
+
+ xscom_write(p->chip_id, CANNED_PRESP_MAP0 + offset, 0);
+ xscom_write(p->chip_id, CANNED_PRESP_MAP1 + offset, 0xFFFFFFFF00000000UL);
+ xscom_write(p->chip_id, CANNED_PRESP_MAP2 + offset, 0);
+
+ /* error recovery */
+ xscom_write(p->chip_id, CAPP_ERR_STATUS_CTRL + offset, 0);
+
+ xscom_write(p->chip_id, FLUSH_SUE_STATE_MAP + offset,
+ 0x1DC20B6600000000UL);
+ xscom_write(p->chip_id, CAPP_EPOCH_TIMER_CTRL + offset,
+ 0xC0000000FFF0FFE0UL);
+ xscom_write(p->chip_id, FLUSH_UOP_CONFIG1 + offset,
+ 0xB188280728000000UL);
+ xscom_write(p->chip_id, FLUSH_UOP_CONFIG2 + offset, 0xB188400F00000000UL);
+
+ reg = 0xA1F0000000000000UL;
+ reg |= read_buffers << PPC_BITLSHIFT(39);
+ xscom_write(p->chip_id, SNOOP_CAPI_CONFIG + offset, reg);
+}
+
+/* override some inits with CAPI defaults */
+static void phb3_init_capp_errors(struct phb3 *p)
+{
+ out_be64(p->regs + PHB_ERR_AIB_FENCE_ENABLE, 0xffffffdd8c80ffc0UL);
+ out_be64(p->regs + PHB_OUT_ERR_AIB_FENCE_ENABLE, 0x9cf3fe08f8dc700fUL);
+ out_be64(p->regs + PHB_INA_ERR_AIB_FENCE_ENABLE, 0xffff57fbff01ffdeUL);
+ out_be64(p->regs + PHB_INB_ERR_AIB_FENCE_ENABLE, 0xfcffe0fbff7ff0ecUL);
+ out_be64(p->regs + PHB_LEM_ERROR_MASK, 0x40018e2400022482UL);
+}
+
+/*
+ * Enable CAPI mode on a PHB
+ *
+ * Changes to this init sequence may require updating disable_capi_mode().
+ */
+static int64_t enable_capi_mode(struct phb3 *p, uint64_t pe_number, bool dma_mode)
+{
+ uint64_t reg;
+ int i;
+
+ xscom_read(p->chip_id, PE_CAPP_EN + PE_REG_OFFSET(p), &reg);
+ if (reg & PPC_BIT(0)) {
+ PHBDBG(p, "Already in CAPP mode\n");
+ }
+
+ /* poll cqstat */
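+ /* Wait up to ~5s (500000 x 10us) for outstanding transactions to drain */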
+ for (i = 0; i < 500000; i++) {
+ xscom_read(p->chip_id, p->pe_xscom + 0xf, &reg);
+ if (!(reg & 0xC000000000000000UL))
+ break;
+ time_wait_us(10);
+ }
+ if (reg & 0xC000000000000000UL) {
+ PHBERR(p, "CAPP: Timeout waiting for pending transaction\n");
+ return OPAL_HARDWARE;
+ }
+
+ /* pb aib capp enable */
+ reg = PPC_BIT(0); /* capp enable */
+ if (dma_mode)
+ reg |= PPC_BIT(1); /* capp dma mode */
+ xscom_write(p->chip_id, p->spci_xscom + 0x3, reg);
+
+ /* FIXME security timer bar
+ xscom_write(p->chip_id, p->spci_xscom + 0x4, 0x8000000000000000ull);
+ */
+
+ /* aib mode */
+ xscom_read(p->chip_id, p->pci_xscom + 0xf, &reg);
+ reg &= ~PPC_BITMASK(6,7);
+ reg |= PPC_BIT(8);
+ reg |= PPC_BITMASK(40, 41);
+ reg &= ~PPC_BIT(42);
+ xscom_write(p->chip_id, p->pci_xscom + 0xf, reg);
+
+ /* pci hwconf0 */
+ xscom_read(p->chip_id, p->pe_xscom + 0x18, &reg);
+ reg |= PPC_BIT(14);
+ reg &= ~PPC_BIT(15);
+ xscom_write(p->chip_id, p->pe_xscom + 0x18, reg);
+
+ /* pci hwconf1 */
+ xscom_read(p->chip_id, p->pe_xscom + 0x19, &reg);
+ reg &= ~PPC_BITMASK(17,18);
+ xscom_write(p->chip_id, p->pe_xscom + 0x19, reg);
+
+ /* aib tx cmd cred */
+ xscom_read(p->chip_id, p->pci_xscom + 0xd, &reg);
+ if (dma_mode) {
+ /*
+ * In DMA mode, increase AIB credit value for ch 2 (DMA read)
+ * for performance reasons
+ */
+ reg &= ~PPC_BITMASK(42, 47);
+ reg |= PPC_BITMASK(43, 45);
+ } else {
+ reg &= ~PPC_BITMASK(42, 46);
+ reg |= PPC_BIT(47);
+ }
+ xscom_write(p->chip_id, p->pci_xscom + 0xd, reg);
+
+ xscom_write(p->chip_id, p->pci_xscom + 0xc, 0xff00000000000000ull);
+
+ /* pci mode ctl */
+ xscom_read(p->chip_id, p->pe_xscom + 0xb, &reg);
+ reg |= PPC_BIT(25);
+ xscom_write(p->chip_id, p->pe_xscom + 0xb, reg);
+
+ /* set tve no translate mode allow mmio window */
+ memset(p->tve_cache, 0x0, sizeof(p->tve_cache));
+ if (dma_mode) {
+ /*
+ * CAPP DMA mode needs access to all of memory, set address
+ * range to 0x0000000000000000: 0x0002FFFFFFFFFFF
+ */
+ p->tve_cache[pe_number * 2] = 0x000000FFFFFF0200ULL;
+ } else {
+ /* Allow address range 0x0002000000000000: 0x0002FFFFFFFFFFF */
+ p->tve_cache[pe_number * 2] = 0x000000FFFFFF0a00ULL;
+ }
+
+ phb3_ioda_sel(p, IODA2_TBL_TVT, 0, true);
+ for (i = 0; i < ARRAY_SIZE(p->tve_cache); i++)
+ out_be64(p->regs + PHB_IODA_DATA0, p->tve_cache[i]);
+
+ /* set m64 bar to pass mmio window */
+ memset(p->m64b_cache, 0x0, sizeof(p->m64b_cache));
+ p->m64b_cache[0] = PPC_BIT(0); /*enable*/
+ p->m64b_cache[0] |= PPC_BIT(1); /*single pe*/
+ p->m64b_cache[0] |= (p->mm0_base << 12) | ((pe_number & 0x3e0) << 27); /*base and upper pe*/
+ p->m64b_cache[0] |= 0x3fffc000 | (pe_number & 0x1f); /*mask and lower pe*/
+
+ p->m64b_cache[1] = PPC_BIT(0); /*enable*/
+ p->m64b_cache[1] |= PPC_BIT(1); /*single pe*/
+ p->m64b_cache[1] |= (0x0002000000000000ULL << 12) | ((pe_number & 0x3e0) << 27); /*base and upper pe*/
+ p->m64b_cache[1] |= 0x3f000000 | (pe_number & 0x1f); /*mask and lower pe*/
+
+ phb3_ioda_sel(p, IODA2_TBL_M64BT, 0, true);
+ for (i = 0; i < ARRAY_SIZE(p->m64b_cache); i++)
+ out_be64(p->regs + PHB_IODA_DATA0, p->m64b_cache[i]);
+
+ out_be64(p->regs + PHB_PHB3_CONFIG, PHB_PHB3C_64B_TCE_EN);
+ out_be64(p->regs + PHB_PHB3_CONFIG, PHB_PHB3C_64BIT_MSI_EN);
+
+ phb3_init_capp_errors(p);
+
+ phb3_init_capp_regs(p, dma_mode);
+
+ if (!chiptod_capp_timebase_sync(p->chip_id, CAPP_TFMR, CAPP_TB,
+ PHB3_CAPP_REG_OFFSET(p))) {
+ PHBERR(p, "CAPP: Failed to sync timebase\n");
+ return OPAL_HARDWARE;
+ }
+
+ /* set callbacks to handle HMI events */
+ capi_ops.get_capp_info = &phb3_get_capp_info;
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t phb3_set_capi_mode(struct phb *phb, uint64_t mode,
+ uint64_t pe_number)
+{
+ struct phb3 *p = phb_to_phb3(phb);
+ struct proc_chip *chip = get_chip(p->chip_id);
+ uint64_t reg;
+ uint64_t read_buffers;
+ uint32_t offset;
+ u8 mask;
+
+ if (!capp_ucode_loaded(chip, p->index)) {
+ PHBERR(p, "CAPP: ucode not loaded\n");
+ return OPAL_RESOURCE;
+ }
+
+ lock(&capi_lock);
+ if (PHB3_IS_NAPLES(p)) {
+ /* Naples has two CAPP units, statically mapped. */
+ chip->capp_phb3_attached_mask |= 1 << p->index;
+ } else {
+ /*
+ * Check if the CAPP port is being used by any other PHB.
+ * Check and set chip->capp_phb3_attached_mask atomically in
+ * case two phb3_set_capi_mode() calls race.
+ */
+ mask = ~(1 << p->index);
+ if (chip->capp_phb3_attached_mask & mask) {
+ PHBERR(p,
+ "CAPP: port already in use by another PHB:%x\n",
+ chip->capp_phb3_attached_mask);
+ unlock(&capi_lock);
+ return false;
+ }
+ chip->capp_phb3_attached_mask = 1 << p->index;
+ }
+ unlock(&capi_lock);
+
+ offset = PHB3_CAPP_REG_OFFSET(p);
+ xscom_read(p->chip_id, CAPP_ERR_STATUS_CTRL + offset, &reg);
+ if ((reg & PPC_BIT(5))) {
+ PHBERR(p, "CAPP: recovery failed (%016llx)\n", reg);
+ return OPAL_HARDWARE;
+ } else if ((reg & PPC_BIT(0)) && (!(reg & PPC_BIT(1)))) {
+ PHBDBG(p, "CAPP: recovery in progress\n");
+ return OPAL_BUSY;
+ }
+
+ switch (mode) {
+ case OPAL_PHB_CAPI_MODE_PCIE:
+ /* Switching back to PCIe mode requires a creset */
+ return OPAL_UNSUPPORTED;
+
+ case OPAL_PHB_CAPI_MODE_CAPI:
+ return enable_capi_mode(p, pe_number, false);
+
+ case OPAL_PHB_CAPI_MODE_DMA:
+ return enable_capi_mode(p, pe_number, true);
+
+ case OPAL_PHB_CAPI_MODE_SNOOP_OFF:
+ xscom_write(p->chip_id, SNOOP_CAPI_CONFIG + offset,
+ 0x0000000000000000);
+ return OPAL_SUCCESS;
+
+ case OPAL_PHB_CAPI_MODE_SNOOP_ON:
+ xscom_write(p->chip_id, CAPP_ERR_STATUS_CTRL + offset,
+ 0x0000000000000000);
+ /*
+ * Make sure the PHB read buffers being snooped match those
+ * being used so we don't need another mode to set SNOOP+DMA
+ */
+ xscom_read(p->chip_id, APC_MASTER_PB_CTRL + offset, &reg);
+ read_buffers = (reg >> PPC_BITLSHIFT(11)) & 0x3;
+ reg = 0xA1F0000000000000UL;
+ reg |= read_buffers << PPC_BITLSHIFT(39);
+ xscom_write(p->chip_id, SNOOP_CAPI_CONFIG + offset, reg);
+
+ return OPAL_SUCCESS;
+ }
+
+ return OPAL_UNSUPPORTED;
+}
+
+static int64_t phb3_set_capp_recovery(struct phb *phb)
+{
+ struct phb3 *p = phb_to_phb3(phb);
+
+ if (p->flags & PHB3_CAPP_RECOVERY)
+ return 0;
+
+ /* set opal event flag to indicate eeh condition */
+ opal_update_pending_evt(OPAL_EVENT_PCI_ERROR,
+ OPAL_EVENT_PCI_ERROR);
+
+ p->flags |= PHB3_CAPP_RECOVERY;
+
+ return 0;
+}
+
+static const struct phb_ops phb3_ops = {
+ .cfg_read8 = phb3_pcicfg_read8,
+ .cfg_read16 = phb3_pcicfg_read16,
+ .cfg_read32 = phb3_pcicfg_read32,
+ .cfg_write8 = phb3_pcicfg_write8,
+ .cfg_write16 = phb3_pcicfg_write16,
+ .cfg_write32 = phb3_pcicfg_write32,
+ .get_reserved_pe_number = phb3_get_reserved_pe_number,
+ .device_init = phb3_device_init,
+ .device_remove = phb3_device_remove,
+ .ioda_reset = phb3_ioda_reset,
+ .papr_errinjct_reset = phb3_papr_errinjct_reset,
+ .pci_reinit = phb3_pci_reinit,
+ .set_phb_mem_window = phb3_set_phb_mem_window,
+ .phb_mmio_enable = phb3_phb_mmio_enable,
+ .map_pe_mmio_window = phb3_map_pe_mmio_window,
+ .map_pe_dma_window = phb3_map_pe_dma_window,
+ .map_pe_dma_window_real = phb3_map_pe_dma_window_real,
+ .pci_msi_eoi = phb3_pci_msi_eoi,
+ .set_xive_pe = phb3_set_ive_pe,
+ .get_msi_32 = phb3_get_msi_32,
+ .get_msi_64 = phb3_get_msi_64,
+ .set_pe = phb3_set_pe,
+ .set_peltv = phb3_set_peltv,
+ .eeh_freeze_status = phb3_eeh_freeze_status,
+ .eeh_freeze_clear = phb3_eeh_freeze_clear,
+ .eeh_freeze_set = phb3_eeh_freeze_set,
+ .next_error = phb3_eeh_next_error,
+ .err_inject = phb3_err_inject,
+ .get_diag_data2 = phb3_get_diag_data,
+ .set_capi_mode = phb3_set_capi_mode,
+ .set_capp_recovery = phb3_set_capp_recovery,
+};
+
+/*
+ * We should access those registers at the stage since the
+ * AIB isn't ready yet.
+ */
+static void phb3_setup_aib(struct phb3 *p)
+{
+ /* Init_2 - AIB TX Channel Mapping Register */
+ phb3_write_reg_asb(p, PHB_AIB_TX_CHAN_MAPPING, 0x0211230000000000UL);
+
+ /* Init_3 - AIB RX command credit register */
+ if (p->rev >= PHB3_REV_VENICE_DD20)
+ phb3_write_reg_asb(p, PHB_AIB_RX_CMD_CRED, 0x0020000100020001UL);
+ else
+ phb3_write_reg_asb(p, PHB_AIB_RX_CMD_CRED, 0x0020000100010001UL);
+
+ /* Init_4 - AIB rx data credit register */
+ if (p->rev >= PHB3_REV_VENICE_DD20)
+ phb3_write_reg_asb(p, PHB_AIB_RX_DATA_CRED, 0x0020002000010001UL);
+ else
+ phb3_write_reg_asb(p, PHB_AIB_RX_DATA_CRED, 0x0020002000000001UL);
+
+ /* Init_5 - AIB rx credit init timer register */
+ phb3_write_reg_asb(p, PHB_AIB_RX_CRED_INIT_TIMER, 0x0f00000000000000UL);
+
+ /* Init_6 - AIB Tag Enable register */
+ phb3_write_reg_asb(p, PHB_AIB_TAG_ENABLE, 0xffffffff00000000UL);
+
+ /* Init_7 - TCE Tag Enable register */
+ phb3_write_reg_asb(p, PHB_TCE_TAG_ENABLE, 0xffffffff00000000UL);
+}
+
+static void phb3_init_ioda2(struct phb3 *p)
+{
+ /* Init_14 - LSI Source ID */
+ out_be64(p->regs + PHB_LSI_SOURCE_ID,
+ SETFIELD(PHB_LSI_SRC_ID, 0ul, 0xff));
+
+ /* Init_15 - IVT BAR / Length
+ * Init_16 - RBA BAR
+ * - RTT BAR
+ * Init_17 - PELT-V BAR
+ */
+ out_be64(p->regs + PHB_RTT_BAR,
+ p->tbl_rtt | PHB_RTT_BAR_ENABLE);
+ out_be64(p->regs + PHB_PELTV_BAR,
+ p->tbl_peltv | PHB_PELTV_BAR_ENABLE);
+ out_be64(p->regs + PHB_IVT_BAR,
+ p->tbl_ivt | 0x800 | PHB_IVT_BAR_ENABLE);
+
+ /* DD2.0 and subsequent chips don't have a memory
+ * resident RBA.
+ */
+ if (p->rev >= PHB3_REV_MURANO_DD20)
+ out_be64(p->regs + PHB_RBA_BAR, 0x0ul);
+ else
+ out_be64(p->regs + PHB_RBA_BAR,
+ p->tbl_rba | PHB_RBA_BAR_ENABLE);
+
+ /* Init_18..21 - Setup M32 */
+ out_be64(p->regs + PHB_M32_BASE_ADDR, p->mm1_base);
+ out_be64(p->regs + PHB_M32_BASE_MASK, ~(M32_PCI_SIZE - 1));
+ out_be64(p->regs + PHB_M32_START_ADDR, M32_PCI_START);
+
+ /* Init_22 - Setup PEST BAR */
+ out_be64(p->regs + PHB_PEST_BAR,
+ p->tbl_pest | PHB_PEST_BAR_ENABLE);
+
+ /* Init_23 - PCIE Outbound upper address */
+ out_be64(p->regs + PHB_M64_UPPER_BITS, 0);
+
+ /* Init_24 - Interrupt represent timers
+ * The register doesn't take effect on Murano DD1.0
+ */
+ if (p->rev >= PHB3_REV_NAPLES_DD10)
+ out_be64(p->regs + PHB_INTREP_TIMER, 0x0014000000000000UL);
+ else if (p->rev >= PHB3_REV_MURANO_DD20)
+ out_be64(p->regs + PHB_INTREP_TIMER, 0x0004000000000000UL);
+ else
+ out_be64(p->regs + PHB_INTREP_TIMER, 0);
+
+ /* Init_25 - PHB3 Configuration Register. Clear TCE cache then
+ * configure the PHB
+ */
+ out_be64(p->regs + PHB_PHB3_CONFIG, PHB_PHB3C_64B_TCE_EN);
+ out_be64(p->regs + PHB_PHB3_CONFIG,
+ PHB_PHB3C_M32_EN | PHB_PHB3C_32BIT_MSI_EN |
+ PHB_PHB3C_64BIT_MSI_EN);
+
+ /* Init_26 - At least 512ns delay according to spec */
+ time_wait_us(2);
+
+ /* Init_27..36 - On-chip IODA tables init */
+ phb3_ioda_reset(&p->phb, false);
+}
+
+static bool phb3_wait_dlp_reset(struct phb3 *p)
+{
+ unsigned int i;
+ uint64_t val;
+
+ /*
+ * Firmware cannot access the UTL core regs or PCI config space
+ * until the cores are out of DL_PGRESET.
+ * DL_PGRESET should be polled until it is inactive with a value
+ * of '0'. The recommended polling frequency is once every 1ms.
+ * Firmware should poll at least 200 attempts before giving up.
+ * MMIO Stores to the link are silently dropped by the UTL core if
+ * the link is down.
+ * MMIO Loads to the link will be dropped by the UTL core and will
+ * eventually time-out and will return an all ones response if the
+ * link is down.
+ */
+#define DLP_RESET_ATTEMPTS 40000
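+/* 40000 polls at 10us apart is ~400ms, well beyond the recommended 200 x 1ms minimum */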
+
+ PHBDBG(p, "Waiting for DLP PG reset to complete...\n");
+ for (i = 0; i < DLP_RESET_ATTEMPTS; i++) {
+ val = in_be64(p->regs + PHB_PCIE_DLP_TRAIN_CTL);
+ if (!(val & PHB_PCIE_DLP_TC_DL_PGRESET))
+ break;
+ time_wait_us(10);
+ }
+ if (val & PHB_PCIE_DLP_TC_DL_PGRESET) {
+ PHBERR(p, "Timeout waiting for DLP PG reset !\n");
+ return false;
+ }
+ return true;
+}
+
+/* phb3_init_rc_cfg - Initialize the Root Complex config space
+ */
+static bool phb3_init_rc_cfg(struct phb3 *p)
+{
+ int64_t ecap, aercap;
+
+ /* XXX Handle errors ? */
+
+ /* Init_45..46:
+ *
+ * Set primary bus to 0, secondary to 1 and subordinate to 0xff
+ */
+ phb3_pcicfg_write32(&p->phb, 0, PCI_CFG_PRIMARY_BUS, 0x00ff0100);
+
+ /* Init_47..52
+ *
+ * IO and Memory base & limits are set to base > limit, which
+ * allows all inbounds.
+ *
+ * XXX This has the potential of confusing the OS which might
+ * think that nothing is forwarded downstream. We probably need
+ * to fix this to match the IO and M32 PHB windows
+ */
+ phb3_pcicfg_write16(&p->phb, 0, PCI_CFG_IO_BASE, 0x0010);
+ phb3_pcicfg_write32(&p->phb, 0, PCI_CFG_MEM_BASE, 0x00000010);
+ phb3_pcicfg_write32(&p->phb, 0, PCI_CFG_PREF_MEM_BASE, 0x00000010);
+
+ /* Init_53..54 - Setup bridge control enable forwarding of CORR, FATAL,
+ * and NONFATAL errors
+ */
+ phb3_pcicfg_write16(&p->phb, 0, PCI_CFG_BRCTL, PCI_CFG_BRCTL_SERR_EN);
+
+ /* Init_55..56
+ *
+ * PCIE Device control/status, enable error reporting, disable relaxed
+ * ordering, set MPS to 128 (see note), clear errors.
+ *
+ * Note: The doc recommends setting MPS to 4K. This has proved to have
+ * some issues as it requires specific clamping of MRSS on devices and
+ * we've found devices in the field that misbehave when doing that.
+ *
+ * We currently leave it all to 128 bytes (minimum setting) at init
+ * time. The generic PCIe probing later on might apply a different
+ * value, or the kernel will, but we play it safe at early init
+ */
+ if (p->ecap <= 0) {
+ ecap = pci_find_cap(&p->phb, 0, PCI_CFG_CAP_ID_EXP);
+ if (ecap < 0) {
+ PHBERR(p, "Can't locate PCI-E capability\n");
+ return false;
+ }
+ p->ecap = ecap;
+ } else {
+ ecap = p->ecap;
+ }
+
+ phb3_pcicfg_write16(&p->phb, 0, ecap + PCICAP_EXP_DEVSTAT,
+ PCICAP_EXP_DEVSTAT_CE |
+ PCICAP_EXP_DEVSTAT_NFE |
+ PCICAP_EXP_DEVSTAT_FE |
+ PCICAP_EXP_DEVSTAT_UE);
+
+ phb3_pcicfg_write16(&p->phb, 0, ecap + PCICAP_EXP_DEVCTL,
+ PCICAP_EXP_DEVCTL_CE_REPORT |
+ PCICAP_EXP_DEVCTL_NFE_REPORT |
+ PCICAP_EXP_DEVCTL_FE_REPORT |
+ PCICAP_EXP_DEVCTL_UR_REPORT |
+ SETFIELD(PCICAP_EXP_DEVCTL_MPS, 0, PCIE_MPS_128B));
+
+ /* Init_57..58
+ *
+ * Root Control Register. Enable error reporting
+ *
+ * Note: Added CRS visibility.
+ */
+ phb3_pcicfg_write16(&p->phb, 0, ecap + PCICAP_EXP_RC,
+ PCICAP_EXP_RC_SYSERR_ON_CE |
+ PCICAP_EXP_RC_SYSERR_ON_NFE |
+ PCICAP_EXP_RC_SYSERR_ON_FE |
+ PCICAP_EXP_RC_CRS_VISIBLE);
+
+ /* Init_59..60
+ *
+ * Device Control 2. Enable ARI fwd, set timer to RTOS timer
+ */
+ phb3_pcicfg_write16(&p->phb, 0, ecap + PCICAP_EXP_DCTL2,
+ SETFIELD(PCICAP_EXP_DCTL2_CMPTOUT, 0, 0xf) |
+ PCICAP_EXP_DCTL2_ARI_FWD);
+
+ /* Init_61..76
+ *
+ * AER inits
+ */
+ if (p->aercap <= 0) {
+ aercap = pci_find_ecap(&p->phb, 0, PCIECAP_ID_AER, NULL);
+ if (aercap < 0) {
+ PHBERR(p, "Can't locate AER capability\n");
+ return false;
+ }
+ p->aercap = aercap;
+ } else {
+ aercap = p->aercap;
+ }
+
+ /* Clear all UE status */
+ phb3_pcicfg_write32(&p->phb, 0, aercap + PCIECAP_AER_UE_STATUS,
+ 0xffffffff);
+ /* Disable some error reporting as per the PHB3 spec */
+ phb3_pcicfg_write32(&p->phb, 0, aercap + PCIECAP_AER_UE_MASK,
+ PCIECAP_AER_UE_POISON_TLP |
+ PCIECAP_AER_UE_COMPL_TIMEOUT |
+ PCIECAP_AER_UE_COMPL_ABORT |
+ PCIECAP_AER_UE_ECRC);
+ /* Report some errors as fatal */
+ phb3_pcicfg_write32(&p->phb, 0, aercap + PCIECAP_AER_UE_SEVERITY,
+ PCIECAP_AER_UE_DLP |
+ PCIECAP_AER_UE_SURPRISE_DOWN |
+ PCIECAP_AER_UE_FLOW_CTL_PROT |
+ PCIECAP_AER_UE_UNEXP_COMPL |
+ PCIECAP_AER_UE_RECV_OVFLOW |
+ PCIECAP_AER_UE_MALFORMED_TLP);
+ /* Clear all CE status */
+ phb3_pcicfg_write32(&p->phb, 0, aercap + PCIECAP_AER_CE_STATUS,
+ 0xffffffff);
+ /* Disable some error reporting as per the PHB3 spec */
+ /* Note: When link down, also disable rcvr errors */
+ phb3_pcicfg_write32(&p->phb, 0, aercap + PCIECAP_AER_CE_MASK,
+ PCIECAP_AER_CE_ADV_NONFATAL |
+ (p->has_link ? 0 : PCIECAP_AER_CE_RECVR_ERR));
+
+ /* Enable or disable ECRC generation & checking */
+ phb3_enable_ecrc(&p->phb, !p->no_ecrc_devs);
+
+ /* Enable reporting in root error control */
+ phb3_pcicfg_write32(&p->phb, 0, aercap + PCIECAP_AER_RERR_CMD,
+ PCIECAP_AER_RERR_CMD_FE |
+ PCIECAP_AER_RERR_CMD_NFE |
+ PCIECAP_AER_RERR_CMD_CE);
+ /* Clear root error status */
+ phb3_pcicfg_write32(&p->phb, 0, aercap + PCIECAP_AER_RERR_STA,
+ 0xffffffff);
+
+ return true;
+}
+
+static void phb3_init_utl(struct phb3 *p)
+{
+ /* Init_77..79: Clear spurious errors and assign errors to the
+ * right "interrupt" signal
+ */
+ out_be64(p->regs + UTL_SYS_BUS_AGENT_STATUS, 0xffffffffffffffffUL);
+ out_be64(p->regs + UTL_SYS_BUS_AGENT_ERR_SEVERITY, 0x5000000000000000UL);
+ out_be64(p->regs + UTL_SYS_BUS_AGENT_IRQ_EN, 0xfcc0000000000000UL);
+
+ /* Init_80..81: Setup tag allocations
+ *
+ * Stick to HW defaults. May differ between PHB implementations.
+ */
+
+ /* Init_82: PCI Express port control
+ * SW283991: Set Outbound Non-Posted request timeout to 16ms (RTOS).
+ */
+ out_be64(p->regs + UTL_PCIE_PORT_CONTROL, 0x8588007000000000UL);
+
+ /* Init_83..85: Clean & setup port errors */
+ out_be64(p->regs + UTL_PCIE_PORT_STATUS, 0xffdfffffffffffffUL);
+ out_be64(p->regs + UTL_PCIE_PORT_ERROR_SEV, 0x5039000000000000UL);
+
+ if (p->has_link)
+ out_be64(p->regs + UTL_PCIE_PORT_IRQ_EN, 0xad52800000000000UL);
+ else
+ out_be64(p->regs + UTL_PCIE_PORT_IRQ_EN, 0xad42800000000000UL);
+
+ /* Init_86 : Cleanup RC errors */
+ out_be64(p->regs + UTL_RC_STATUS, 0xffffffffffffffffUL);
+}
+
+static void phb3_init_errors(struct phb3 *p)
+{
+ /* Init_88: LEM Error Mask : Temporarily disable error interrupts */
+ out_be64(p->regs + PHB_LEM_ERROR_MASK, 0xffffffffffffffffUL);
+
+ /* Init_89..97: Disable all error interrupts until end of init */
+ out_be64(p->regs + PHB_ERR_STATUS, 0xffffffffffffffffUL);
+ out_be64(p->regs + PHB_ERR1_STATUS, 0x0000000000000000UL);
+ out_be64(p->regs + PHB_ERR_LEM_ENABLE, 0xffffffffffffffffUL);
+ out_be64(p->regs + PHB_ERR_FREEZE_ENABLE, 0x0000000080800000UL);
+ out_be64(p->regs + PHB_ERR_AIB_FENCE_ENABLE, 0xffffffdd0c00ffc0UL);
+ out_be64(p->regs + PHB_ERR_LOG_0, 0x0000000000000000UL);
+ out_be64(p->regs + PHB_ERR_LOG_1, 0x0000000000000000UL);
+ out_be64(p->regs + PHB_ERR_STATUS_MASK, 0x0000000000000000UL);
+ out_be64(p->regs + PHB_ERR1_STATUS_MASK, 0x0000000000000000UL);
+
+ /* Init_98_106: Configure MMIO error traps & clear old state
+ *
+ * Don't enable BAR multi-hit detection in bit 41.
+ */
+ out_be64(p->regs + PHB_OUT_ERR_STATUS, 0xffffffffffffffffUL);
+ out_be64(p->regs + PHB_OUT_ERR1_STATUS, 0x0000000000000000UL);
+ out_be64(p->regs + PHB_OUT_ERR_LEM_ENABLE, 0xfdffffffffbfffffUL);
+ out_be64(p->regs + PHB_OUT_ERR_FREEZE_ENABLE, 0x0000420800000000UL);
+ out_be64(p->regs + PHB_OUT_ERR_AIB_FENCE_ENABLE, 0x9cf3bc00f89c700fUL);
+ out_be64(p->regs + PHB_OUT_ERR_LOG_0, 0x0000000000000000UL);
+ out_be64(p->regs + PHB_OUT_ERR_LOG_1, 0x0000000000000000UL);
+ out_be64(p->regs + PHB_OUT_ERR_STATUS_MASK, 0x0000000000400000UL);
+ out_be64(p->regs + PHB_OUT_ERR1_STATUS_MASK, 0x0000000000400000UL);
+
+ /* Init_107_115: Configure DMA_A error traps & clear old state */
+ out_be64(p->regs + PHB_INA_ERR_STATUS, 0xffffffffffffffffUL);
+ out_be64(p->regs + PHB_INA_ERR1_STATUS, 0x0000000000000000UL);
+ out_be64(p->regs + PHB_INA_ERR_LEM_ENABLE, 0xffffffffffffffffUL);
+ out_be64(p->regs + PHB_INA_ERR_FREEZE_ENABLE, 0xc00003a901006000UL);
+ out_be64(p->regs + PHB_INA_ERR_AIB_FENCE_ENABLE, 0x3fff5452fe019fdeUL);
+ out_be64(p->regs + PHB_INA_ERR_LOG_0, 0x0000000000000000UL);
+ out_be64(p->regs + PHB_INA_ERR_LOG_1, 0x0000000000000000UL);
+ out_be64(p->regs + PHB_INA_ERR_STATUS_MASK, 0x0000000000000000UL);
+ out_be64(p->regs + PHB_INA_ERR1_STATUS_MASK, 0x0000000000000000UL);
+
+ /* Init_116_124: Configure DMA_B error traps & clear old state */
+ out_be64(p->regs + PHB_INB_ERR_STATUS, 0xffffffffffffffffUL);
+ out_be64(p->regs + PHB_INB_ERR1_STATUS, 0x0000000000000000UL);
+ out_be64(p->regs + PHB_INB_ERR_LEM_ENABLE, 0xffffffffffffffffUL);
+
+ /*
+ * Workaround for errata HW257476, turn correctable messages into
+ * ER freezes on Murano and Venice DD1.0
+ */
+ if (p->rev < PHB3_REV_MURANO_DD20)
+ out_be64(p->regs + PHB_INB_ERR_FREEZE_ENABLE,
+ 0x0000600000000070UL);
+ else
+ out_be64(p->regs + PHB_INB_ERR_FREEZE_ENABLE,
+ 0x0000600000000060UL);
+
+ out_be64(p->regs + PHB_INB_ERR_AIB_FENCE_ENABLE, 0xfcff80fbff7ff08cUL);
+ out_be64(p->regs + PHB_INB_ERR_LOG_0, 0x0000000000000000UL);
+ out_be64(p->regs + PHB_INB_ERR_LOG_1, 0x0000000000000000UL);
+ out_be64(p->regs + PHB_INB_ERR_STATUS_MASK, 0x0000000000000000UL);
+ out_be64(p->regs + PHB_INB_ERR1_STATUS_MASK, 0x0000000000000000UL);
+
+ /* Init_125..128: Cleanup & configure LEM */
+ out_be64(p->regs + PHB_LEM_FIR_ACCUM, 0x0000000000000000UL);
+ out_be64(p->regs + PHB_LEM_ACTION0, 0xffffffffffffffffUL);
+ out_be64(p->regs + PHB_LEM_ACTION1, 0xffffffffffffffffUL);
+ out_be64(p->regs + PHB_LEM_WOF, 0x0000000000000000UL);
+}
+
+static int64_t phb3_fixup_pec_inits(struct phb3 *p)
+{
+ int64_t rc;
+ uint64_t val;
+
+ /* These fixups handle some timer updates that HB doesn't yet do
+ * to work around problems with some adapters or external drawers
+ * (SW283991)
+ */
+
+ /* PCI Hardware Configuration 0 Register */
+ rc = xscom_read(p->chip_id, p->pe_xscom + 0x18, &val);
+ if (rc) {
+ PHBERR(p, "Can't read CS0 !\n");
+ return rc;
+ }
+ val = val & 0x0f0fffffffffffffull;
+ val = val | 0x1010000000000000ull;
+ rc = xscom_write(p->chip_id, p->pe_xscom + 0x18, val);
+ if (rc) {
+ PHBERR(p, "Can't write CS0 !\n");
+ return rc;
+ }
+ return 0;
+}
+
+static void phb3_init_hw(struct phb3 *p, bool first_init)
+{
+ uint64_t val;
+
+ PHBDBG(p, "Initializing PHB...\n");
+
+ /* Fixups for PEC inits */
+ if (phb3_fixup_pec_inits(p)) {
+ PHBERR(p, "Failed to init PEC, PHB appears broken\n");
+ goto failed;
+ }
+
+ /* Lift reset */
+ xscom_read(p->chip_id, p->spci_xscom + 1, &val);/* HW275117 */
+ xscom_write(p->chip_id, p->pci_xscom + 0xa, 0);
+
+ /* XXX FIXME, turn that into a state machine or a worker thread */
+ time_wait_ms(100);
+
+ /* Grab version and fit it in an int */
+ val = phb3_read_reg_asb(p, PHB_VERSION);
+ if (val == 0 || val == 0xffffffffffffffffUL) {
+ PHBERR(p, "Failed to read version, PHB appears broken\n");
+ goto failed;
+ }
+
+ p->rev = ((val >> 16) & 0x00ff0000) | (val & 0xffff);
+ PHBDBG(p, "Core revision 0x%x\n", p->rev);
+
+ /* Setup AIB credits etc... */
+ phb3_setup_aib(p);
+
+ /* Init_8 - PCIE System Configuration Register
+ *
+ * Use default values, clear bit 15 (SYS_EC00_SLOT) to avoid incorrect
+ * slot power limit message and adjust max speed based on system
+ * config. Don't hard wire default value as some bits are different
+ * between implementations.
+ */
+ val = in_be64(p->regs + PHB_PCIE_SYSTEM_CONFIG);
+ PHBDBG(p, "Default system config: 0x%016llx\n", val);
+ val = SETFIELD(PHB_PCIE_SCONF_SLOT, val, 0);
+ val = SETFIELD(PHB_PCIE_SCONF_MAXLINKSPEED, val, p->max_link_speed);
+ out_be64(p->regs + PHB_PCIE_SYSTEM_CONFIG, val);
+ PHBDBG(p, "New system config : 0x%016llx\n",
+ in_be64(p->regs + PHB_PCIE_SYSTEM_CONFIG));
+
+ /* Init_9..12 - PCIE DLP Lane EQ control */
+ if (p->lane_eq) {
+ out_be64(p->regs + PHB_PCIE_LANE_EQ_CNTL0,
+ be64_to_cpu(p->lane_eq[0]));
+ out_be64(p->regs + PHB_PCIE_LANE_EQ_CNTL1,
+ be64_to_cpu(p->lane_eq[1]));
+ out_be64(p->regs + PHB_PCIE_LANE_EQ_CNTL2,
+ be64_to_cpu(p->lane_eq[2]));
+ out_be64(p->regs + PHB_PCIE_LANE_EQ_CNTL3,
+ be64_to_cpu(p->lane_eq[3]));
+ }
+
+ /* Init_XX - (PHB2 errata)
+ *
+ * Set proper credits, needs adjustment due to wrong defaults
+ * on PHB2 before we lift the reset. This only applies to Murano
+ * and Venice
+ */
+ if (p->index == 2 && p->rev < PHB3_REV_NAPLES_DD10)
+ out_be64(p->regs + PHB_PCIE_SYS_LINK_INIT, 0x9008133332120000UL);
+
+ /* Init_13 - PCIE Reset */
+ /*
+ * Lift the PHB resets but not PERST, this will be lifted
+ * later by the initial PERST state machine
+ */
+ PHBDBG(p, "PHB_RESET is 0x%016llx\n", in_be64(p->regs + PHB_RESET));
+ out_be64(p->regs + PHB_RESET, 0xd000000000000000UL);
+
+ /* Architected IODA2 inits */
+ phb3_init_ioda2(p);
+
+ /* Init_37..42 - Clear UTL & DLP error logs */
+ out_be64(p->regs + PHB_PCIE_UTL_ERRLOG1, 0xffffffffffffffffUL);
+ out_be64(p->regs + PHB_PCIE_UTL_ERRLOG2, 0xffffffffffffffffUL);
+ out_be64(p->regs + PHB_PCIE_UTL_ERRLOG3, 0xffffffffffffffffUL);
+ out_be64(p->regs + PHB_PCIE_UTL_ERRLOG4, 0xffffffffffffffffUL);
+ out_be64(p->regs + PHB_PCIE_DLP_ERRLOG1, 0xffffffffffffffffUL);
+ out_be64(p->regs + PHB_PCIE_DLP_ERRLOG2, 0xffffffffffffffffUL);
+
+ /* Init_43 - Wait for UTL core to come out of reset */
+ if (!phb3_wait_dlp_reset(p))
+ goto failed;
+
+ /* Init_44 - Clear port status */
+ out_be64(p->regs + UTL_PCIE_PORT_STATUS, 0xffffffffffffffffUL);
+
+ /* Init_45..76: Init root complex config space */
+ if (!phb3_init_rc_cfg(p))
+ goto failed;
+
+ /* Init_77..86 : Init UTL */
+ phb3_init_utl(p);
+
+ /*
+ * Init_87: PHB Control register. Various PHB settings
+ * Enable IVC for Murano DD2.0 or later one
+ */
+#ifdef IVT_TABLE_IVE_16B
+ val = 0xf3a80e4b00000000UL;
+#else
+ val = 0xf3a80ecb00000000UL;
+#endif
+ if (p->rev >= PHB3_REV_MURANO_DD20)
+ val |= 0x0000010000000000UL;
+ if (first_init && p->rev >= PHB3_REV_NAPLES_DD10) {
+ /* Enable 32-bit bypass support on Naples and tell the OS
+ * about it
+ */
+ val |= 0x0010000000000000UL;
+ dt_add_property(p->phb.dt_node,
+ "ibm,32-bit-bypass-supported", NULL, 0);
+ }
+ out_be64(p->regs + PHB_CONTROL, val);
+
+ /* Init_88..128 : Setup error registers */
+ phb3_init_errors(p);
+
+ /* Init_129: Read error summary */
+ val = in_be64(p->regs + PHB_ETU_ERR_SUMMARY);
+ if (val) {
+ PHBERR(p, "Errors detected during PHB init: 0x%16llx\n", val);
+ goto failed;
+ }
+
+ /* NOTE: At this point the spec waits for the link to come up. We
+ * don't bother as we are doing a PERST soon.
+ */
+
+ /* XXX I don't know why the spec does this now and not earlier, so
+ * to be sure to get it right we might want to move it to the freset
+ * state machine, though the generic PCI layer will probably do
+ * this anyway (ie, enable MEM, etc... in the RC)
+ *
+ * Note: The spec enables IO, but PHB3 doesn't do IO space, so we
+ * leave that clear.
+ */
+ phb3_pcicfg_write16(&p->phb, 0, PCI_CFG_CMD,
+ PCI_CFG_CMD_MEM_EN |
+ PCI_CFG_CMD_BUS_MASTER_EN |
+ PCI_CFG_CMD_PERR_RESP |
+ PCI_CFG_CMD_SERR_EN);
+
+ /* Clear errors */
+ phb3_pcicfg_write16(&p->phb, 0, PCI_CFG_STAT,
+ PCI_CFG_STAT_SENT_TABORT |
+ PCI_CFG_STAT_RECV_TABORT |
+ PCI_CFG_STAT_RECV_MABORT |
+ PCI_CFG_STAT_SENT_SERR |
+ PCI_CFG_STAT_RECV_PERR);
+
+ /* Init_136 - Re-enable error interrupts */
+
+ /* TBD: Should we mask any of these for PERST ? */
+ out_be64(p->regs + PHB_ERR_IRQ_ENABLE, 0x0000002280b80000UL);
+ out_be64(p->regs + PHB_OUT_ERR_IRQ_ENABLE, 0x600c42fc042080f0UL);
+ out_be64(p->regs + PHB_INA_ERR_IRQ_ENABLE, 0xc000a3a901826020UL);
+ out_be64(p->regs + PHB_INB_ERR_IRQ_ENABLE, 0x0000600000800070UL);
+ out_be64(p->regs + PHB_LEM_ERROR_MASK, 0x42498e367f502eaeUL);
+
+ /*
+ * Init_141 - Enable DMA address speculation
+ *
+ * Errata#20131017: Disable speculation until Murano DD2.0
+ *
+ * Note: We keep IVT speculation disabled (bit 4). It should work with
+ * Murano DD2.0 and later but lacks sufficient testing. We will re-enable
+ * it once that has been done.
+ */
+ if (p->rev >= PHB3_REV_MURANO_DD20)
+ out_be64(p->regs + PHB_TCE_SPEC_CTL, 0xf000000000000000UL);
+ else
+ out_be64(p->regs + PHB_TCE_SPEC_CTL, 0x0ul);
+
+ /* Errata#20131017: avoid TCE queue overflow */
+ if (p->rev == PHB3_REV_MURANO_DD20)
+ phb3_write_reg_asb(p, PHB_TCE_WATERMARK, 0x0003000000030302UL);
+
+ /* Init_142 - PHB3 - Timeout Control Register 1
+ * SW283991: Increase timeouts
+ */
+ out_be64(p->regs + PHB_TIMEOUT_CTRL1, 0x1715152016200000UL);
+
+ /* Init_143 - PHB3 - Timeout Control Register 2 */
+ out_be64(p->regs + PHB_TIMEOUT_CTRL2, 0x2320d71600000000UL);
+
+ /* Mark the PHB as functional which enables all the various sequences */
+ p->broken = false;
+
+ PHBDBG(p, "Initialization complete\n");
+
+ return;
+
+ failed:
+ PHBERR(p, "Initialization failed\n");
+ p->broken = true;
+}
+
+static void phb3_allocate_tables(struct phb3 *p)
+{
+ uint16_t *rte;
+ uint32_t i;
+
+ /* XXX Our current memalign implementation sucks,
+ *
+ * It will do the job, however it doesn't support freeing
+ * the memory and wastes space by always allocating twice
+ * as much as requested (size + alignment)
+ */
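+ /* For illustration: each table below is requested with an alignment
+ * equal to its size, so in the worst case each allocation can consume
+ * roughly twice the table size (e.g. up to 2 * RTT_TABLE_SIZE).
+ */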
+ p->tbl_rtt = (uint64_t)local_alloc(p->chip_id, RTT_TABLE_SIZE, RTT_TABLE_SIZE);
+ assert(p->tbl_rtt);
+ rte = (uint16_t *)(p->tbl_rtt);
+ for (i = 0; i < RTT_TABLE_ENTRIES; i++, rte++)
+ *rte = PHB3_RESERVED_PE_NUM;
+
+ p->tbl_peltv = (uint64_t)local_alloc(p->chip_id, PELTV_TABLE_SIZE, PELTV_TABLE_SIZE);
+ assert(p->tbl_peltv);
+ memset((void *)p->tbl_peltv, 0, PELTV_TABLE_SIZE);
+
+ p->tbl_pest = (uint64_t)local_alloc(p->chip_id, PEST_TABLE_SIZE, PEST_TABLE_SIZE);
+ assert(p->tbl_pest);
+ memset((void *)p->tbl_pest, 0, PEST_TABLE_SIZE);
+
+ p->tbl_ivt = (uint64_t)local_alloc(p->chip_id, IVT_TABLE_SIZE, IVT_TABLE_SIZE);
+ assert(p->tbl_ivt);
+ memset((void *)p->tbl_ivt, 0, IVT_TABLE_SIZE);
+
+ p->tbl_rba = (uint64_t)local_alloc(p->chip_id, RBA_TABLE_SIZE, RBA_TABLE_SIZE);
+ assert(p->tbl_rba);
+ memset((void *)p->tbl_rba, 0, RBA_TABLE_SIZE);
+}
+
+static void phb3_add_properties(struct phb3 *p)
+{
+ struct dt_node *np = p->phb.dt_node;
+ uint32_t lsibase, icsp = get_ics_phandle();
+ uint64_t m32b, m64b, m64s, reg, tkill;
+
+ reg = cleanup_addr((uint64_t)p->regs);
+
+ /* Add various properties that HB doesn't have to
+ * add, some of them simply because they result from
+ * policy decisions made in skiboot rather than in HB
+ * such as the MMIO windows going to PCI, interrupts,
+ * etc...
+ */
+ dt_add_property_cells(np, "#address-cells", 3);
+ dt_add_property_cells(np, "#size-cells", 2);
+ dt_add_property_cells(np, "#interrupt-cells", 1);
+ dt_add_property_cells(np, "bus-range", 0, 0xff);
+ dt_add_property_cells(np, "clock-frequency", 0x200, 0); /* ??? */
+
+ dt_add_property_cells(np, "interrupt-parent", icsp);
+
+ /* XXX FIXME: add slot-name */
+ //dt_property_cell("bus-width", 8); /* Figure it out from VPD ? */
+
+ /* "ranges", we only expose M32 (PHB3 doesn't do IO)
+ *
+ * Note: The kernel expects us to have chopped off 64k from the
+ * M32 size (for the 32-bit MSIs). If we don't do that, it will
+ * get confused; OPAL does the chopping here.
+ */
+ m32b = cleanup_addr(p->mm1_base);
+ m64b = cleanup_addr(p->mm0_base);
+ m64s = p->mm0_size;
+ dt_add_property_cells(np, "ranges",
+ /* M32 space */
+ 0x02000000, 0x00000000, M32_PCI_START,
+ hi32(m32b), lo32(m32b), 0, M32_PCI_SIZE - 0x10000);
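+ /* Each "ranges" entry above is 7 cells: a 3-cell PCI (child) address,
+ * a 2-cell CPU (parent) address and a 2-cell size, matching the
+ * #address-cells = 3 / #size-cells = 2 values set earlier.
+ */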
+
+ /* XXX FIXME: add opal-memwin32, dmawins, etc... */
+ dt_add_property_u64s(np, "ibm,opal-m64-window", m64b, m64b, m64s);
+ dt_add_property(np, "ibm,opal-single-pe", NULL, 0);
+ //dt_add_property_cells(np, "ibm,opal-msi-ports", 2048);
+ dt_add_property_cells(np, "ibm,opal-num-pes", 256);
+ dt_add_property_cells(np, "ibm,opal-reserved-pe",
+ PHB3_RESERVED_PE_NUM);
+ dt_add_property_cells(np, "ibm,opal-msi-ranges",
+ p->base_msi, PHB3_MSI_IRQ_COUNT);
+ tkill = reg + PHB_TCE_KILL;
+ dt_add_property_cells(np, "ibm,opal-tce-kill",
+ hi32(tkill), lo32(tkill));
+ dt_add_property_cells(np, "ibm,supported-tce-sizes",
+ 12, // 4K
+ 16, // 64K
+ 24, // 16M
+ 28); // 256M
+
+ /*
+ * Indicate to Linux that the architected IODA2 MSI EOI method
+ * is supported
+ */
+ dt_add_property_string(np, "ibm,msi-eoi-method", "ioda2");
+
+ /* Indicate to Linux that CAPP timebase sync is supported */
+ dt_add_property_string(np, "ibm,capp-timebase-sync", NULL);
+
+ /* The interrupt maps will be generated in the RC node by the
+ * PCI code based on the content of this structure:
+ */
+ lsibase = p->base_lsi;
+ p->phb.lstate.int_size = 2;
+ p->phb.lstate.int_val[0][0] = lsibase + PHB3_LSI_PCIE_INTA;
+ p->phb.lstate.int_val[0][1] = 1;
+ p->phb.lstate.int_val[1][0] = lsibase + PHB3_LSI_PCIE_INTB;
+ p->phb.lstate.int_val[1][1] = 1;
+ p->phb.lstate.int_val[2][0] = lsibase + PHB3_LSI_PCIE_INTC;
+ p->phb.lstate.int_val[2][1] = 1;
+ p->phb.lstate.int_val[3][0] = lsibase + PHB3_LSI_PCIE_INTD;
+ p->phb.lstate.int_val[3][1] = 1;
+ p->phb.lstate.int_parent[0] = icsp;
+ p->phb.lstate.int_parent[1] = icsp;
+ p->phb.lstate.int_parent[2] = icsp;
+ p->phb.lstate.int_parent[3] = icsp;
+
+ /* Indicators for variable tables */
+ dt_add_property_cells(np, "ibm,opal-rtt-table",
+ hi32(p->tbl_rtt), lo32(p->tbl_rtt), RTT_TABLE_SIZE);
+ dt_add_property_cells(np, "ibm,opal-peltv-table",
+ hi32(p->tbl_peltv), lo32(p->tbl_peltv), PELTV_TABLE_SIZE);
+ dt_add_property_cells(np, "ibm,opal-pest-table",
+ hi32(p->tbl_pest), lo32(p->tbl_pest), PEST_TABLE_SIZE);
+ dt_add_property_cells(np, "ibm,opal-ivt-table",
+ hi32(p->tbl_ivt), lo32(p->tbl_ivt), IVT_TABLE_SIZE);
+ dt_add_property_cells(np, "ibm,opal-ive-stride",
+ IVT_TABLE_STRIDE);
+ dt_add_property_cells(np, "ibm,opal-rba-table",
+ hi32(p->tbl_rba), lo32(p->tbl_rba), RBA_TABLE_SIZE);
+
+ dt_add_property_cells(np, "ibm,phb-diag-data-size",
+ sizeof(struct OpalIoPhb3ErrorData));
+}
+
+static bool phb3_calculate_windows(struct phb3 *p)
+{
+ const struct dt_property *prop;
+
+ /* Get PBCQ MMIO windows from device-tree */
+ prop = dt_require_property(p->phb.dt_node,
+ "ibm,mmio-window", -1);
+ assert(prop->len >= (2 * sizeof(uint64_t)));
+
+ p->mm0_base = ((const uint64_t *)prop->prop)[0];
+ p->mm0_size = ((const uint64_t *)prop->prop)[1];
+ if (prop->len > 16) {
+ p->mm1_base = ((const uint64_t *)prop->prop)[2];
+ p->mm1_size = ((const uint64_t *)prop->prop)[3];
+ }
+
+ /* Sort them so that 0 is big and 1 is small */
+ if (p->mm1_size && p->mm1_size > p->mm0_size) {
+ uint64_t b = p->mm0_base;
+ uint64_t s = p->mm0_size;
+ p->mm0_base = p->mm1_base;
+ p->mm0_size = p->mm1_size;
+ p->mm1_base = b;
+ p->mm1_size = s;
+ }
+
+ /* If 1 is too small, ditch it */
+ if (p->mm1_size < M32_PCI_SIZE)
+ p->mm1_size = 0;
+
+ /* If 1 doesn't exist, carve it out of 0 */
+ if (p->mm1_size == 0) {
+ p->mm0_size /= 2;
+ p->mm1_base = p->mm0_base + p->mm0_size;
+ p->mm1_size = p->mm0_size;
+ }
+
+ /* Crop mm1 to our desired size */
+ if (p->mm1_size > M32_PCI_SIZE)
+ p->mm1_size = M32_PCI_SIZE;
+
+ return true;
+}
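+
+/* For illustration (hypothetical sizes): with a single 64G PBCQ window,
+ * the code above halves it into a 32G M64 (mm0) and a 32G mm1, then
+ * crops mm1 down to M32_PCI_SIZE for use as the M32 window.
+ */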
+
+/*
+ * Trigger a creset to disable CAPI mode on kernel shutdown.
+ *
+ * This helper is called repeatedly by the host sync notifier mechanism, which
+ * relies on the kernel to regularly poll the OPAL_SYNC_HOST_REBOOT call as it
+ * shuts down.
+ *
+ * This is a somewhat hacky abuse of the host sync notifier mechanism, but the
+ * alternatives require a new API call which won't work for older kernels.
+ */
+static bool phb3_host_sync_reset(void *data)
+{
+ struct phb3 *p = (struct phb3 *)data;
+ struct pci_slot *slot = p->phb.slot;
+ struct proc_chip *chip = get_chip(p->chip_id);
+ int64_t rc;
+
+ switch (slot->state) {
+ case PHB3_SLOT_NORMAL:
+ lock(&capi_lock);
+ rc = (chip->capp_phb3_attached_mask & (1 << p->index)) ?
+ OPAL_PHB_CAPI_MODE_CAPI :
+ OPAL_PHB_CAPI_MODE_PCIE;
+ unlock(&capi_lock);
+
+ if (rc == OPAL_PHB_CAPI_MODE_PCIE)
+ return true;
+
+ PHBINF(p, "PHB in CAPI mode, resetting\n");
+ p->flags &= ~PHB3_CAPP_RECOVERY;
+ phb3_creset(slot);
+ return false;
+ default:
+ rc = slot->ops.run_sm(slot);
+ return rc <= OPAL_SUCCESS;
+ }
+}
+
+static void phb3_create(struct dt_node *np)
+{
+ const struct dt_property *prop;
+ struct phb3 *p = zalloc(sizeof(struct phb3));
+ struct pci_slot *slot;
+ size_t lane_eq_len;
+ struct dt_node *iplp;
+ struct proc_chip *chip;
+ int opal_id;
+ char *path;
+
+ assert(p);
+
+ /* Populate base stuff */
+ p->index = dt_prop_get_u32(np, "ibm,phb-index");
+ p->chip_id = dt_prop_get_u32(np, "ibm,chip-id");
+ p->regs = (void *)dt_get_address(np, 0, NULL);
+ p->base_msi = PHB3_MSI_IRQ_BASE(p->chip_id, p->index);
+ p->base_lsi = PHB3_LSI_IRQ_BASE(p->chip_id, p->index);
+ p->phb.dt_node = np;
+ p->phb.ops = &phb3_ops;
+ p->phb.phb_type = phb_type_pcie_v3;
+ p->phb.scan_map = 0x1; /* Only device 0 to scan */
+
+ if (!phb3_calculate_windows(p))
+ return;
+
+ /* Get the various XSCOM register bases from the device-tree */
+ prop = dt_require_property(np, "ibm,xscom-bases", 3 * sizeof(uint32_t));
+ p->pe_xscom = ((const uint32_t *)prop->prop)[0];
+ p->spci_xscom = ((const uint32_t *)prop->prop)[1];
+ p->pci_xscom = ((const uint32_t *)prop->prop)[2];
+
+ /*
+ * We skip the initial PERST assertion requested by the generic code
+ * when doing a cold boot because we are coming out of cold boot already
+ * so we save boot time that way. The PERST state machine will still
+ * handle waiting for the link to come up, it will just avoid actually
+ * asserting & deasserting the PERST output
+ *
+ * For a hot IPL, we still do a PERST
+ *
+ * Note: In the absence of the property (i.e. FSP-less), we stick to the
+ * old behaviour and set skip_perst to true
+ */
+ p->skip_perst = true; /* Default */
+
+ iplp = dt_find_by_path(dt_root, "ipl-params/ipl-params");
+ if (iplp) {
+ const char *ipl_type = dt_prop_get_def(iplp, "cec-major-type", NULL);
+ if (ipl_type && (!strcmp(ipl_type, "hot")))
+ p->skip_perst = false;
+ }
+
+ /* By default link is assumed down */
+ p->has_link = false;
+
+ /* We register the PHB before we initialize it so we
+ * get a useful OPAL ID for it. We use a different numbering here
+ * between Naples and Venice/Murano in order to leave room for the
+ * NPU on Naples.
+ */
+ chip = next_chip(NULL); /* Just need any chip */
+ if (chip && chip->type == PROC_CHIP_P8_NAPLES)
+ opal_id = p->chip_id * 8 + p->index;
+ else
+ opal_id = p->chip_id * 4 + p->index;
+ pci_register_phb(&p->phb, opal_id);
+ slot = phb3_slot_create(&p->phb);
+ if (!slot)
+ PHBERR(p, "Cannot create PHB slot\n");
+
+ /* Hello ! */
+ path = dt_get_path(np);
+ PHBINF(p, "Found %s @[%d:%d]\n", path, p->chip_id, p->index);
+ PHBINF(p, " M32 [0x%016llx..0x%016llx]\n",
+ p->mm1_base, p->mm1_base + p->mm1_size - 1);
+ PHBINF(p, " M64 [0x%016llx..0x%016llx]\n",
+ p->mm0_base, p->mm0_base + p->mm0_size - 1);
+ free(path);
+
+ /* Find base location code from root node */
+ p->phb.base_loc_code = dt_prop_get_def(dt_root,
+ "ibm,io-base-loc-code", NULL);
+ if (!p->phb.base_loc_code)
+ PHBDBG(p, "Base location code not found !\n");
+
+ /* Priority order: NVRAM -> dt -> GEN3 */
+ p->max_link_speed = 3;
+ if (dt_has_node_property(np, "ibm,max-link-speed", NULL))
+ p->max_link_speed = dt_prop_get_u32(np, "ibm,max-link-speed");
+ if (pcie_max_link_speed)
+ p->max_link_speed = pcie_max_link_speed;
+ if (p->max_link_speed > 3) /* clamp to 3 */
+ p->max_link_speed = 3;
+ PHBINF(p, "Max link speed: GEN%i\n", p->max_link_speed);
+
+ /* Check for lane equalization values from HB or HDAT */
+ p->lane_eq = dt_prop_get_def_size(np, "ibm,lane-eq", NULL, &lane_eq_len);
+ if (p->lane_eq && lane_eq_len != (8 * 4)) {
+ PHBERR(p, "Device-tree has ibm,lane-eq with wrong len %ld\n",
+ lane_eq_len);
+ p->lane_eq = NULL;
+ }
+ if (p->lane_eq) {
+ PHBDBG(p, "Override lane equalization settings:\n");
+ PHBDBG(p, " 0x%016llx 0x%016llx\n",
+ be64_to_cpu(p->lane_eq[0]), be64_to_cpu(p->lane_eq[1]));
+ PHBDBG(p, " 0x%016llx 0x%016llx\n",
+ be64_to_cpu(p->lane_eq[2]), be64_to_cpu(p->lane_eq[3]));
+ }
+
+ /*
+ * Grab CEC IO VPD load info from the root of the device-tree,
+ * on P8 there's a single such VPD for the whole machine
+ */
+ prop = dt_find_property(dt_root, "ibm,io-vpd");
+ if (!prop) {
+ /* LX VPD Lid not already loaded */
+ if (platform.vpd_iohub_load)
+ platform.vpd_iohub_load(dt_root);
+ }
+
+ /* Allocate the SkiBoot internal in-memory tables for the PHB */
+ phb3_allocate_tables(p);
+
+ phb3_add_properties(p);
+
+ /* Clear IODA2 cache */
+ phb3_init_ioda_cache(p);
+
+ /* Register interrupt sources */
+ register_irq_source(&phb3_msi_irq_ops, p, p->base_msi,
+ PHB3_MSI_IRQ_COUNT);
+ register_irq_source(&phb3_lsi_irq_ops, p, p->base_lsi, 8);
+
+ /* Get the HW up and running */
+ phb3_init_hw(p, true);
+
+ /* Load capp microcode into capp unit */
+ load_capp_ucode(p);
+
+ opal_add_host_sync_notifier(phb3_host_sync_reset, p);
+
+ /* Platform additional setup */
+ if (platform.pci_setup_phb)
+ platform.pci_setup_phb(&p->phb, p->index);
+}
+
+static void phb3_probe_pbcq(struct dt_node *pbcq)
+{
+ uint32_t spci_xscom, pci_xscom, pe_xscom, gcid, pno;
+ uint64_t val, phb_bar, bar_en;
+ uint64_t mmio0_bar, mmio0_bmask, mmio0_sz;
+ uint64_t mmio1_bar, mmio1_bmask, mmio1_sz;
+ uint64_t reg[2];
+ uint64_t mmio_win[4];
+ unsigned int mmio_win_sz;
+ struct dt_node *np;
+ char *path;
+ uint64_t capp_ucode_base;
+ unsigned int max_link_speed;
+
+ gcid = dt_get_chip_id(pbcq);
+ pno = dt_prop_get_u32(pbcq, "ibm,phb-index");
+ path = dt_get_path(pbcq);
+ prlog(PR_NOTICE, "Chip %d Found PBCQ%d at %s\n", gcid, pno, path);
+ free(path);
+
+ pe_xscom = dt_get_address(pbcq, 0, NULL);
+ pci_xscom = dt_get_address(pbcq, 1, NULL);
+ spci_xscom = dt_get_address(pbcq, 2, NULL);
+ prlog(PR_DEBUG, "PHB3[%x:%x]: X[PE]=0x%08x X[PCI]=0x%08x"
+ " X[SPCI]=0x%08x\n",
+ gcid, pno, pe_xscom, pci_xscom, spci_xscom);
+
+ /* Check if CAPP mode */
+ if (xscom_read(gcid, spci_xscom + 0x03, &val)) {
+ prerror("PHB3[%x:%x]: Cannot read AIB CAPP ENABLE\n",
+ gcid, pno);
+ return;
+ }
+ if (val >> 63) {
+ prerror("PHB3[%x:%x]: Ignoring bridge in CAPP mode\n",
+ gcid, pno);
+ return;
+ }
+
+ /* Get PE BARs, assume only 0 and 2 are used for now */
+ xscom_read(gcid, pe_xscom + 0x42, &phb_bar);
+ phb_bar >>= 14;
+ prlog(PR_DEBUG, "PHB3[%x:%x] REGS = 0x%016llx [4k]\n",
+ gcid, pno, phb_bar);
+ if (phb_bar == 0) {
+ prerror("PHB3[%x:%x]: No PHB BAR set !\n", gcid, pno);
+ return;
+ }
+
+ /* Dbl check PHB BAR */
+ xscom_read(gcid, spci_xscom + 1, &val);/* HW275117 */
+ xscom_read(gcid, pci_xscom + 0x0b, &val);
+ val >>= 14;
+ prlog(PR_DEBUG, "PHB3[%x:%x] PCIBAR = 0x%016llx\n", gcid, pno, val);
+ if (phb_bar != val) {
+ prerror("PHB3[%x:%x] PCIBAR invalid, fixing up...\n",
+ gcid, pno);
+ xscom_read(gcid, spci_xscom + 1, &val);/* HW275117 */
+ xscom_write(gcid, pci_xscom + 0x0b, phb_bar << 14);
+ }
+
+ /* Check MMIO BARs */
+ xscom_read(gcid, pe_xscom + 0x40, &mmio0_bar);
+ xscom_read(gcid, pe_xscom + 0x43, &mmio0_bmask);
+ mmio0_bmask &= 0xffffffffc0000000ull;
+ mmio0_sz = ((~mmio0_bmask) >> 14) + 1;
+ mmio0_bar >>= 14;
+ prlog(PR_DEBUG, "PHB3[%x:%x] MMIO0 = 0x%016llx [0x%016llx]\n",
+ gcid, pno, mmio0_bar, mmio0_sz);
+ xscom_read(gcid, pe_xscom + 0x41, &mmio1_bar);
+ xscom_read(gcid, pe_xscom + 0x44, &mmio1_bmask);
+ mmio1_bmask &= 0xffffffffc0000000ull;
+ mmio1_sz = ((~mmio1_bmask) >> 14) + 1;
+ mmio1_bar >>= 14;
+ prlog(PR_DEBUG, "PHB3[%x:%x] MMIO1 = 0x%016llx [0x%016llx]\n",
+ gcid, pno, mmio1_bar, mmio1_sz);
+
+ /* Check BAR enable
+ *
+ * XXX BARs aren't always enabled by HB, so we assume that a BAR
+ * is valid if its value is non-zero
+ */
+ xscom_read(gcid, pe_xscom + 0x45, &bar_en);
+ prlog(PR_DEBUG, "PHB3[%x:%x] BAREN = 0x%016llx\n",
+ gcid, pno, bar_en);
+
+ /* Always enable PHB BAR */
+ bar_en |= 0x2000000000000000ull;
+
+ /* Build MMIO windows list */
+ mmio_win_sz = 0;
+ if (mmio0_bar) {
+ mmio_win[mmio_win_sz++] = mmio0_bar;
+ mmio_win[mmio_win_sz++] = mmio0_sz;
+ bar_en |= 0x8000000000000000ul;
+ }
+ if (mmio1_bar) {
+ mmio_win[mmio_win_sz++] = mmio1_bar;
+ mmio_win[mmio_win_sz++] = mmio1_sz;
+ bar_en |= 0x4000000000000000ul;
+ }
+
+ /* No MMIO windows ? Barf ! */
+ if (mmio_win_sz == 0) {
+ prerror("PHB3[%x:%x]: No MMIO windows enabled !\n",
+ gcid, pno);
+ return;
+ }
+
+ /* Set the interrupt routing stuff, 8 relevant bits in mask
+ * (11 bits per PHB)
+ */
+ val = p8_chip_irq_phb_base(gcid, pno);
+ val = (val << 45);
+ xscom_write(gcid, pe_xscom + 0x1a, val);
+ xscom_write(gcid, pe_xscom + 0x1b, 0xff00000000000000ul);
+
+ /* Configure LSI location to the top of the map */
+ xscom_write(gcid, pe_xscom + 0x1f, 0xff00000000000000ul);
+
+ /* Now add IRSN message bits to BAR enable and write it */
+ bar_en |= 0x1800000000000000ul;
+ xscom_write(gcid, pe_xscom + 0x45, bar_en);
+
+ prlog(PR_DEBUG, "PHB3[%x:%x] NEWBAREN = 0x%016llx\n",
+ gcid, pno, bar_en);
+
+ xscom_read(gcid, pe_xscom + 0x1a, &val);
+ prlog(PR_DEBUG, "PHB3[%x:%x] IRSNC = 0x%016llx\n",
+ gcid, pno, val);
+ xscom_read(gcid, pe_xscom + 0x1b, &val);
+ prlog(PR_DEBUG, "PHB3[%x:%x] IRSNM = 0x%016llx\n",
+ gcid, pno, val);
+ prlog(PR_DEBUG, "PHB3[%x:%x] LSI = 0x%016llx\n",
+ gcid, pno, val);
+
+ /* Create PHB node */
+ reg[0] = phb_bar;
+ reg[1] = 0x1000;
+
+ np = dt_new_addr(dt_root, "pciex", reg[0]);
+ if (!np)
+ return;
+
+ dt_add_property_strings(np, "compatible", "ibm,power8-pciex",
+ "ibm,ioda2-phb");
+ dt_add_property_strings(np, "device_type", "pciex");
+ dt_add_property(np, "reg", reg, sizeof(reg));
+
+ /* Everything else is handled later by skiboot, we just
+ * stick a few hints here
+ */
+ dt_add_property_cells(np, "ibm,xscom-bases",
+ pe_xscom, spci_xscom, pci_xscom);
+ dt_add_property(np, "ibm,mmio-window", mmio_win, 8 * mmio_win_sz);
+ dt_add_property_cells(np, "ibm,phb-index", pno);
+ dt_add_property_cells(np, "ibm,pbcq", pbcq->phandle);
+ dt_add_property_cells(np, "ibm,chip-id", gcid);
+ if (dt_has_node_property(pbcq, "ibm,use-ab-detect", NULL))
+ dt_add_property(np, "ibm,use-ab-detect", NULL, 0);
+ if (dt_has_node_property(pbcq, "ibm,hub-id", NULL))
+ dt_add_property_cells(np, "ibm,hub-id",
+ dt_prop_get_u32(pbcq, "ibm,hub-id"));
+ if (dt_has_node_property(pbcq, "ibm,loc-code", NULL)) {
+ const char *lc = dt_prop_get(pbcq, "ibm,loc-code");
+ dt_add_property_string(np, "ibm,loc-code", lc);
+ }
+ if (dt_has_node_property(pbcq, "ibm,lane-eq", NULL)) {
+ size_t leq_size;
+ const void *leq = dt_prop_get_def_size(pbcq, "ibm,lane-eq",
+ NULL, &leq_size);
+ if (leq != NULL && leq_size == 4 * 8)
+ dt_add_property(np, "ibm,lane-eq", leq, leq_size);
+ }
+ if (dt_has_node_property(pbcq, "ibm,capp-ucode", NULL)) {
+ capp_ucode_base = dt_prop_get_u32(pbcq, "ibm,capp-ucode");
+ dt_add_property_cells(np, "ibm,capp-ucode", capp_ucode_base);
+ }
+ if (dt_has_node_property(pbcq, "ibm,max-link-speed", NULL)) {
+ max_link_speed = dt_prop_get_u32(pbcq, "ibm,max-link-speed");
+ dt_add_property_cells(np, "ibm,max-link-speed", max_link_speed);
+ }
+ dt_add_property_cells(np, "ibm,capi-flags",
+ OPAL_PHB_CAPI_FLAG_SNOOP_CONTROL);
+
+ add_chip_dev_associativity(np);
+}
+
+
+void probe_phb3(void)
+{
+ struct dt_node *np;
+
+ /* Look for PBCQ XSCOM nodes */
+ dt_for_each_compatible(dt_root, np, "ibm,power8-pbcq")
+ phb3_probe_pbcq(np);
+
+ /* Look for newly created PHB nodes */
+ dt_for_each_compatible(dt_root, np, "ibm,power8-pciex")
+ phb3_create(np);
+}
+
+
diff --git a/roms/skiboot/hw/phb4.c b/roms/skiboot/hw/phb4.c
new file mode 100644
index 000000000..79083d4a1
--- /dev/null
+++ b/roms/skiboot/hw/phb4.c
@@ -0,0 +1,6400 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * PHB4: PCI Host Bridge 4, in POWER9
+ *
+ * Copyright 2013-2019 IBM Corp.
+ * Copyright 2018 Raptor Engineering, LLC
+ */
+
+/*
+ *
+ * FIXME:
+ * More stuff for EEH support:
+ * - PBCQ error reporting interrupt
+ * - I2C-based power management (replacing SHPC)
+ * - Directly detect fenced PHB through one dedicated HW reg
+ */
+
+/*
+ * This is a simplified view of the PHB4 reset and link training steps
+ *
+ * Step 1:
+ * - Check for hotplug status:
+ * o PHB_PCIE_HOTPLUG_STATUS bit PHB_PCIE_HPSTAT_PRESENCE
+ * o If not set -> Bail out (Slot is empty)
+ *
+ * Step 2:
+ * - Do complete PHB reset:
+ * o PHB/ETU reset procedure
+ *
+ * Step 3:
+ * - Drive PERST active (skip if already asserted, i.e. after cold reboot)
+ * - Wait 250ms (for cards to reset)
+ * o powervm has used 250ms for a long time without any problems
+ *
+ * Step 4:
+ * - Drive PERST inactive
+ *
+ * Step 5:
+ * - Look for inband presence:
+ * o From PERST we have two stages to get inband presence detected
+ * 1) Devices must enter Detect state within 20 ms of the end of
+ * Fundamental Reset
+ * 2) Receiver detect pulses occur every 12ms
+ * - Hence the minimum wait time is 20 + 12 = 32ms
+ * o Unfortunately, we've seen cards take 440ms
+ * o Hence we are conservative and poll here for 1000ms (> 440ms)
+ * - If no inband presence after 1000ms -> Bail out (Slot is broken)
+ * o PHB_PCIE_DLP_TRAIN_CTL bit PHB_PCIE_DLP_INBAND_PRESENCE
+ *
+ * Step 6:
+ * - Look for link training done:
+ * o PHB_PCIE_DLP_TRAIN_CTL bit PHB_PCIE_DLP_TL_LINKACT
+ * - If not set after 2000ms, Retry (3 times) -> Goto Step 2
+ * o a phy lockup could cause link training failure, hence going back
+ * to a complete PHB reset on retry
+ * o not expected to happen very often
+ *
+ * Step 7:
+ * - Wait for 1 sec (before touching device config space):
+ * - From PCIe spec:
+ * Root Complex and/or system software must allow at least 1.0 s after
+ * a Conventional Reset of a device, before it may determine that a
+ * device which fails to return a Successful Completion status for a
+ * valid Configuration Request is a broken device.
+ *
+ * Step 8:
+ * - Sanity check for fence and link still up:
+ * o If fenced or link down, Retry (3 times) -> Goto Step 2
+ * o This is not necessary but takes no time and can be useful
+ * o Once we leave here, it is much harder to recover from errors
+ *
+ * Step 9:
+ * - Check for optimised link for directly attached devices:
+ * o Wait for CRS (so we can read device config space)
+ * o Check chip and device are in the allowlist. If not, Goto Step 10
+ * o If trained link speed is degraded, retry -> Goto Step 2
+ * o If trained link width is degraded, retry -> Goto Step 2
+ * o If still degraded after 3 retries. Give up, Goto Step 10.
+ *
+ * Step 10:
+ * - PHB good, start probing config space.
+ * o core/pci.c: pci_reset_phb() -> pci_scan_phb()
+ */
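+
+/*
+ * Broadly, steps 2..9 above are driven by the PCI slot reset state
+ * machine implemented later in this file, while step 10 is handled by
+ * the generic PCI core (core/pci.c).
+ */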
+
+
+#undef NO_ASB
+#undef LOG_CFG
+
+#include <skiboot.h>
+#include <io.h>
+#include <timebase.h>
+#include <pci.h>
+#include <pci-cfg.h>
+#include <pci-slot.h>
+#include <vpd.h>
+#include <interrupts.h>
+#include <opal.h>
+#include <cpu.h>
+#include <device.h>
+#include <ccan/str/str.h>
+#include <ccan/array_size/array_size.h>
+#include <xscom.h>
+#include <affinity.h>
+#include <phb4.h>
+#include <phb4-regs.h>
+#include <phb4-capp.h>
+#include <capp.h>
+#include <fsp.h>
+#include <chip.h>
+#include <chiptod.h>
+#include <xive.h>
+#include <xscom-p9-regs.h>
+#include <phys-map.h>
+#include <nvram.h>
+
+/* Enable this to disable error interrupts for debug purposes */
+#undef DISABLE_ERR_INTS
+
+static void phb4_init_hw(struct phb4 *p);
+
+#define PHBDBG(p, fmt, a...) prlog(PR_DEBUG, "PHB#%04x[%d:%d]: " fmt, \
+ (p)->phb.opal_id, (p)->chip_id, \
+ (p)->index, ## a)
+#define PHBINF(p, fmt, a...) prlog(PR_INFO, "PHB#%04x[%d:%d]: " fmt, \
+ (p)->phb.opal_id, (p)->chip_id, \
+ (p)->index, ## a)
+#define PHBNOTICE(p, fmt, a...) prlog(PR_NOTICE, "PHB#%04x[%d:%d]: " fmt, \
+ (p)->phb.opal_id, (p)->chip_id, \
+ (p)->index, ## a)
+#define PHBERR(p, fmt, a...) prlog(PR_ERR, "PHB#%04x[%d:%d]: " fmt, \
+ (p)->phb.opal_id, (p)->chip_id, \
+ (p)->index, ## a)
+#ifdef LOG_CFG
+#define PHBLOGCFG(p, fmt, a...) PHBDBG(p, fmt, ## a)
+#else
+#define PHBLOGCFG(p, fmt, a...) do {} while (0)
+#endif
+
+static bool pci_eeh_mmio;
+static bool pci_retry_all;
+static int rx_err_max = PHB4_RX_ERR_MAX;
+
+static inline bool is_phb4(void)
+{
+ return (proc_gen == proc_gen_p9);
+}
+
+static inline bool is_phb5(void)
+{
+ return (proc_gen == proc_gen_p10);
+}
+
+/* PQ offloading on the XIVE IC. */
+static inline bool phb_pq_disable(struct phb4 *p __unused)
+{
+ if (is_phb5())
+ return xive2_cap_phb_pq_disable();
+
+ return false;
+}
+
+/*
+ * Use the ESB page of the XIVE IC for event notification. Latency
+ * improvement.
+ */
+static inline bool phb_abt_mode(struct phb4 *p __unused)
+{
+ if (is_phb5())
+ return xive2_cap_phb_abt();
+
+ return false;
+}
+
+static inline bool phb_can_store_eoi(struct phb4 *p)
+{
+ if (is_phb5())
+ /* PQ offloading is required for StoreEOI */
+ return XIVE2_STORE_EOI_ENABLED && phb_pq_disable(p);
+
+ return XIVE_STORE_EOI_ENABLED;
+}
+
+/* Note: The "ASB" name is historical, practically this means access via
+ * the XSCOM backdoor
+ */
+static inline uint64_t phb4_read_reg_asb(struct phb4 *p, uint32_t offset)
+{
+#ifdef NO_ASB
+ return in_be64(p->regs + offset);
+#else
+ int64_t rc;
+ uint64_t addr, val;
+
+ /* Address register: must use 4 bytes for built-in config space.
+ *
+ * This path isn't usable for outbound configuration space
+ */
+ if (((offset & 0xfffffffc) == PHB_CONFIG_DATA) && (offset & 3)) {
+ PHBERR(p, "XSCOM unaligned access to CONFIG_DATA unsupported\n");
+ return -1ull;
+ }
+ addr = XETU_HV_IND_ADDR_VALID | offset;
+ if ((offset >= 0x1000 && offset < 0x1800) || (offset == PHB_CONFIG_DATA))
+ addr |= XETU_HV_IND_ADDR_4B;
+ rc = xscom_write(p->chip_id, p->etu_xscom + XETU_HV_IND_ADDRESS, addr);
+ if (rc != 0) {
+ PHBERR(p, "XSCOM error addressing register 0x%x\n", offset);
+ return -1ull;
+ }
+ rc = xscom_read(p->chip_id, p->etu_xscom + XETU_HV_IND_DATA, &val);
+ if (rc != 0) {
+ PHBERR(p, "XSCOM error reading register 0x%x\n", offset);
+ return -1ull;
+ }
+ return val;
+#endif
+}
+
+static inline void phb4_write_reg_asb(struct phb4 *p,
+ uint32_t offset, uint64_t val)
+{
+#ifdef NO_ASB
+ out_be64(p->regs + offset, val);
+#else
+ int64_t rc;
+ uint64_t addr;
+
+ /* Address register: must use 4 bytes for built-in config space.
+ *
+ * This path isn't usable for outbound configuration space
+ */
+ if (((offset & 0xfffffffc) == PHB_CONFIG_DATA) && (offset & 3)) {
+ PHBERR(p, "XSCOM access to CONFIG_DATA unsupported\n");
+ return;
+ }
+ addr = XETU_HV_IND_ADDR_VALID | offset;
+ if ((offset >= 0x1000 && offset < 0x1800) || (offset == PHB_CONFIG_DATA))
+ addr |= XETU_HV_IND_ADDR_4B;
+ rc = xscom_write(p->chip_id, p->etu_xscom + XETU_HV_IND_ADDRESS, addr);
+ if (rc != 0) {
+ PHBERR(p, "XSCOM error addressing register 0x%x\n", offset);
+ return;
+ }
+ rc = xscom_write(p->chip_id, p->etu_xscom + XETU_HV_IND_DATA, val);
+ if (rc != 0) {
+ PHBERR(p, "XSCOM error writing register 0x%x\n", offset);
+ return;
+ }
+#endif
+}
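+
+/*
+ * Both ASB helpers above use the same two-step indirect sequence: write
+ * XETU_HV_IND_ADDRESS with the VALID bit, the register offset and, for
+ * the 4-byte windows, the 4B flag, then access XETU_HV_IND_DATA for the
+ * actual data transfer.
+ */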
+
+static uint64_t phb4_read_reg(struct phb4 *p, uint32_t offset)
+{
+ /* No register accesses are permitted while in reset */
+ if (p->flags & PHB4_ETU_IN_RESET)
+ return -1ull;
+
+ if (p->flags & PHB4_CFG_USE_ASB)
+ return phb4_read_reg_asb(p, offset);
+ else
+ return in_be64(p->regs + offset);
+}
+
+static void phb4_write_reg(struct phb4 *p, uint32_t offset, uint64_t val)
+{
+ /* No register accesses are permitted while in reset */
+ if (p->flags & PHB4_ETU_IN_RESET)
+ return;
+
+ if (p->flags & PHB4_CFG_USE_ASB)
+ phb4_write_reg_asb(p, offset, val);
+ else
+ return out_be64(p->regs + offset, val);
+}
+
+/* Helper to select an IODA table entry */
+static inline void phb4_ioda_sel(struct phb4 *p, uint32_t table,
+ uint32_t addr, bool autoinc)
+{
+ phb4_write_reg(p, PHB_IODA_ADDR,
+ (autoinc ? PHB_IODA_AD_AUTOINC : 0) |
+ SETFIELD(PHB_IODA_AD_TSEL, 0ul, table) |
+ SETFIELD(PHB_IODA_AD_TADR, 0ul, addr));
+}
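+
+/*
+ * Typical usage sketch: select a table with auto-increment and then
+ * stream entries through the data register, e.g.
+ *
+ *   phb4_ioda_sel(p, IODA3_TBL_MBT, 0, true);
+ *   out_be64(p->regs + PHB_IODA_DATA0, mbt0);
+ *   out_be64(p->regs + PHB_IODA_DATA0, mbt1);
+ *
+ * as done in phb4_ioda_reset() below.
+ */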
+
+/*
+ * Configuration space access
+ *
+ * The PHB lock is assumed to be already held
+ */
+static int64_t phb4_pcicfg_check(struct phb4 *p, uint32_t bdfn,
+ uint32_t offset, uint32_t size,
+ uint16_t *pe)
+{
+ uint32_t sm = size - 1;
+
+ if (offset > 0xfff || bdfn > 0xffff)
+ return OPAL_PARAMETER;
+ if (offset & sm)
+ return OPAL_PARAMETER;
+
+ /* The root bus only has a device at 0 and we get into an
+ * error state if we try to probe beyond that, so let's
+ * avoid that and just return an error to Linux
+ */
+ if (PCI_BUS_NUM(bdfn) == 0 && (bdfn & 0xff))
+ return OPAL_HARDWARE;
+
+ /* Check PHB state */
+ if (p->broken)
+ return OPAL_HARDWARE;
+
+ /* Fetch the PE# from cache */
+ *pe = be16_to_cpu(p->tbl_rtt[bdfn]);
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t phb4_rc_read(struct phb4 *p, uint32_t offset, uint8_t sz,
+ void *data, bool use_asb)
+{
+ uint32_t reg = offset & ~3;
+ uint32_t oval;
+
+ /* Some registers are handled locally */
+ switch (reg) {
+ /* Bridge base/limit registers are cached here as HW
+ * doesn't implement them (it hard codes values that
+ * will confuse a proper PCI implementation).
+ */
+ case PCI_CFG_MEM_BASE: /* Includes PCI_CFG_MEM_LIMIT */
+ oval = p->rc_cache[(reg - 0x20) >> 2] & 0xfff0fff0;
+ break;
+ case PCI_CFG_PREF_MEM_BASE: /* Includes PCI_CFG_PREF_MEM_LIMIT */
+ oval = p->rc_cache[(reg - 0x20) >> 2] & 0xfff0fff0;
+ oval |= 0x00010001;
+ break;
+ case PCI_CFG_IO_BASE_U16: /* Includes PCI_CFG_IO_LIMIT_U16 */
+ oval = 0;
+ break;
+ case PCI_CFG_PREF_MEM_BASE_U32:
+ case PCI_CFG_PREF_MEM_LIMIT_U32:
+ oval = p->rc_cache[(reg - 0x20) >> 2];
+ break;
+ default:
+ oval = 0xffffffff; /* default if offset too big */
+ if (reg < PHB_RC_CONFIG_SIZE) {
+ if (use_asb)
+ oval = bswap_32(phb4_read_reg_asb(p, PHB_RC_CONFIG_BASE
+ + reg));
+ else
+ oval = in_le32(p->regs + PHB_RC_CONFIG_BASE + reg);
+ }
+ }
+
+ /* Apply any post-read fixups */
+ switch (reg) {
+ case PCI_CFG_IO_BASE:
+ oval |= 0x01f1; /* Set IO base < limit to disable the window */
+ break;
+ }
+
+ switch (sz) {
+ case 1:
+ offset &= 3;
+ *((uint8_t *)data) = (oval >> (offset << 3)) & 0xff;
+ PHBLOGCFG(p, "000 CFG08 Rd %02x=%02x\n",
+ offset, *((uint8_t *)data));
+ break;
+ case 2:
+ offset &= 2;
+ *((uint16_t *)data) = (oval >> (offset << 3)) & 0xffff;
+ PHBLOGCFG(p, "000 CFG16 Rd %02x=%04x\n",
+ offset, *((uint16_t *)data));
+ break;
+ case 4:
+ *((uint32_t *)data) = oval;
+ PHBLOGCFG(p, "000 CFG32 Rd %02x=%08x\n",
+ offset, *((uint32_t *)data));
+ break;
+ default:
+ assert(false);
+ }
+ return OPAL_SUCCESS;
+}
+
+static int64_t phb4_rc_write(struct phb4 *p, uint32_t offset, uint8_t sz,
+ uint32_t val, bool use_asb)
+{
+ uint32_t reg = offset & ~3;
+ uint32_t old, mask, shift, oldold;
+ int64_t rc;
+
+ if (reg > PHB_RC_CONFIG_SIZE)
+ return OPAL_SUCCESS;
+
+ /* If size isn't 4-bytes, do a RMW cycle */
+ if (sz < 4) {
+ rc = phb4_rc_read(p, reg, 4, &old, use_asb);
+ if (rc != OPAL_SUCCESS)
+ return rc;
+
+ /*
+ * Since we have to Read-Modify-Write here, we need to filter
+ * out registers that have write-1-to-clear bits to prevent
+ * clearing stuff we shouldn't be. So for any register this
+ * applies to, mask out those bits.
+ */
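+ /* For instance, a 1-byte write to the I/O base register at 0x1C
+ * becomes a RMW of the whole dword; without the masking below, the
+ * W1C error bits of the secondary status (dword bits 24..31) would
+ * be written back as 1 and cleared behind the caller's back.
+ */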
+ oldold = old;
+ switch(reg) {
+ case 0x1C: /* Secondary status */
+ old &= 0x00ffffff; /* mask out 24-31 */
+ break;
+ case 0x50: /* EC - Device status */
+ old &= 0xfff0ffff; /* mask out 16-19 */
+ break;
+ case 0x58: /* EC - Link status */
+ old &= 0x3fffffff; /* mask out 30-31 */
+ break;
+ case 0x78: /* EC - Link status 2 */
+ old &= 0xf000ffff; /* mask out 16-27 */
+ break;
+ /* These registers *only* have write-1-to-clear bits */
+ case 0x104: /* AER - Uncorr. error status */
+ case 0x110: /* AER - Corr. error status */
+ case 0x130: /* AER - Root error status */
+ case 0x180: /* P16 - status */
+ case 0x184: /* P16 - LDPM status */
+ case 0x188: /* P16 - FRDPM status */
+ case 0x18C: /* P16 - SRDPM status */
+ old &= 0x00000000;
+ break;
+ }
+
+ if (old != oldold) {
+ PHBLOGCFG(p, "Rewrote %x to %x for reg %x for W1C\n",
+ oldold, old, reg);
+ }
+
+ if (sz == 1) {
+ shift = (offset & 3) << 3;
+ mask = 0xff << shift;
+ val = (old & ~mask) | ((val & 0xff) << shift);
+ } else {
+ shift = (offset & 2) << 3;
+ mask = 0xffff << shift;
+ val = (old & ~mask) | ((val & 0xffff) << shift);
+ }
+ }
+
+ /* Some registers are handled locally */
+ switch (reg) {
+ /* See comment in phb4_rc_read() */
+ case PCI_CFG_MEM_BASE: /* Includes PCI_CFG_MEM_LIMIT */
+ case PCI_CFG_PREF_MEM_BASE: /* Includes PCI_CFG_PREF_MEM_LIMIT */
+ case PCI_CFG_PREF_MEM_BASE_U32:
+ case PCI_CFG_PREF_MEM_LIMIT_U32:
+ p->rc_cache[(reg - 0x20) >> 2] = val;
+ break;
+ case PCI_CFG_IO_BASE_U16: /* Includes PCI_CFG_IO_LIMIT_U16 */
+ break;
+ default:
+ /* Workaround PHB config space enable */
+ PHBLOGCFG(p, "000 CFG%02d Wr %02x=%08x\n", 8 * sz, reg, val);
+ if (use_asb)
+ phb4_write_reg_asb(p, PHB_RC_CONFIG_BASE + reg, val);
+ else
+ out_le32(p->regs + PHB_RC_CONFIG_BASE + reg, val);
+ }
+ return OPAL_SUCCESS;
+}
+
+static int64_t phb4_pcicfg_read(struct phb4 *p, uint32_t bdfn,
+ uint32_t offset, uint32_t size,
+ void *data)
+{
+ uint64_t addr, val64;
+ int64_t rc;
+ uint16_t pe;
+ bool use_asb = false;
+
+ rc = phb4_pcicfg_check(p, bdfn, offset, size, &pe);
+ if (rc)
+ return rc;
+
+ if (p->flags & PHB4_AIB_FENCED) {
+ if (!(p->flags & PHB4_CFG_USE_ASB))
+ return OPAL_HARDWARE;
+ if (bdfn != 0)
+ return OPAL_HARDWARE;
+ use_asb = true;
+ } else if ((p->flags & PHB4_CFG_BLOCKED) && bdfn != 0) {
+ return OPAL_HARDWARE;
+ }
+
+ /* Handle per-device filters */
+ rc = pci_handle_cfg_filters(&p->phb, bdfn, offset, size,
+ (uint32_t *)data, false);
+ if (rc != OPAL_PARTIAL)
+ return rc;
+
+ /* Handle root complex MMIO based config space */
+ if (bdfn == 0)
+ return phb4_rc_read(p, offset, size, data, use_asb);
+
+ addr = PHB_CA_ENABLE;
+ addr = SETFIELD(PHB_CA_BDFN, addr, bdfn);
+ addr = SETFIELD(PHB_CA_REG, addr, offset & ~3u);
+ addr = SETFIELD(PHB_CA_PE, addr, pe);
+ if (use_asb) {
+ phb4_write_reg_asb(p, PHB_CONFIG_ADDRESS, addr);
+ sync();
+ val64 = bswap_64(phb4_read_reg_asb(p, PHB_CONFIG_DATA));
+ switch(size) {
+ case 1:
+ *((uint8_t *)data) = val64 >> (8 * (offset & 3));
+ break;
+ case 2:
+ *((uint16_t *)data) = val64 >> (8 * (offset & 2));
+ break;
+ case 4:
+ *((uint32_t *)data) = val64;
+ break;
+ default:
+ return OPAL_PARAMETER;
+ }
+ } else {
+ out_be64(p->regs + PHB_CONFIG_ADDRESS, addr);
+ switch(size) {
+ case 1:
+ *((uint8_t *)data) =
+ in_8(p->regs + PHB_CONFIG_DATA + (offset & 3));
+ PHBLOGCFG(p, "%03x CFG08 Rd %02x=%02x\n",
+ bdfn, offset, *((uint8_t *)data));
+ break;
+ case 2:
+ *((uint16_t *)data) =
+ in_le16(p->regs + PHB_CONFIG_DATA + (offset & 2));
+ PHBLOGCFG(p, "%03x CFG16 Rd %02x=%04x\n",
+ bdfn, offset, *((uint16_t *)data));
+ break;
+ case 4:
+ *((uint32_t *)data) = in_le32(p->regs + PHB_CONFIG_DATA);
+ PHBLOGCFG(p, "%03x CFG32 Rd %02x=%08x\n",
+ bdfn, offset, *((uint32_t *)data));
+ break;
+ default:
+ return OPAL_PARAMETER;
+ }
+ }
+ return OPAL_SUCCESS;
+}
+
+
+#define PHB4_PCI_CFG_READ(size, type) \
+static int64_t phb4_pcicfg_read##size(struct phb *phb, uint32_t bdfn, \
+ uint32_t offset, type *data) \
+{ \
+ struct phb4 *p = phb_to_phb4(phb); \
+ \
+ /* Initialize data in case of error */ \
+ *data = (type)0xffffffff; \
+ return phb4_pcicfg_read(p, bdfn, offset, sizeof(type), data); \
+}
+
+static int64_t phb4_pcicfg_write(struct phb4 *p, uint32_t bdfn,
+ uint32_t offset, uint32_t size,
+ uint32_t data)
+{
+ uint64_t addr;
+ int64_t rc;
+ uint16_t pe;
+ bool use_asb = false;
+
+ rc = phb4_pcicfg_check(p, bdfn, offset, size, &pe);
+ if (rc)
+ return rc;
+
+ if (p->flags & PHB4_AIB_FENCED) {
+ if (!(p->flags & PHB4_CFG_USE_ASB))
+ return OPAL_HARDWARE;
+ if (bdfn != 0)
+ return OPAL_HARDWARE;
+ use_asb = true;
+ } else if ((p->flags & PHB4_CFG_BLOCKED) && bdfn != 0) {
+ return OPAL_HARDWARE;
+ }
+
+ /* Handle per-device filters */
+ rc = pci_handle_cfg_filters(&p->phb, bdfn, offset, size,
+ (uint32_t *)&data, true);
+ if (rc != OPAL_PARTIAL)
+ return rc;
+
+ /* Handle root complex MMIO based config space */
+ if (bdfn == 0)
+ return phb4_rc_write(p, offset, size, data, use_asb);
+
+ addr = PHB_CA_ENABLE;
+ addr = SETFIELD(PHB_CA_BDFN, addr, bdfn);
+ addr = SETFIELD(PHB_CA_REG, addr, offset & ~3u);
+ addr = SETFIELD(PHB_CA_PE, addr, pe);
+ if (use_asb) {
+ /* We don't support ASB config space writes */
+ return OPAL_UNSUPPORTED;
+ } else {
+ out_be64(p->regs + PHB_CONFIG_ADDRESS, addr);
+ switch(size) {
+ case 1:
+ out_8(p->regs + PHB_CONFIG_DATA + (offset & 3), data);
+ break;
+ case 2:
+ out_le16(p->regs + PHB_CONFIG_DATA + (offset & 2), data);
+ break;
+ case 4:
+ out_le32(p->regs + PHB_CONFIG_DATA, data);
+ break;
+ default:
+ return OPAL_PARAMETER;
+ }
+ }
+ PHBLOGCFG(p, "%03x CFG%d Wr %02x=%08x\n", bdfn, 8 * size, offset, data);
+ return OPAL_SUCCESS;
+}
+
+#define PHB4_PCI_CFG_WRITE(size, type) \
+static int64_t phb4_pcicfg_write##size(struct phb *phb, uint32_t bdfn, \
+ uint32_t offset, type data) \
+{ \
+ struct phb4 *p = phb_to_phb4(phb); \
+ \
+ return phb4_pcicfg_write(p, bdfn, offset, sizeof(type), data); \
+}
+
+PHB4_PCI_CFG_READ(8, u8)
+PHB4_PCI_CFG_READ(16, u16)
+PHB4_PCI_CFG_READ(32, u32)
+PHB4_PCI_CFG_WRITE(8, u8)
+PHB4_PCI_CFG_WRITE(16, u16)
+PHB4_PCI_CFG_WRITE(32, u32)
+
+static int64_t phb4_get_reserved_pe_number(struct phb *phb)
+{
+ struct phb4 *p = phb_to_phb4(phb);
+
+ return PHB4_RESERVED_PE_NUM(p);
+}
+
+
+static void phb4_root_port_init(struct phb *phb, struct pci_device *dev,
+ int ecap, int aercap)
+{
+ struct phb4 *p = phb_to_phb4(phb);
+ struct pci_slot *slot = dev->slot;
+ uint16_t bdfn = dev->bdfn;
+ uint16_t val16;
+ uint32_t val32;
+
+ /*
+ * Use the PHB's callback so that UTL events will be masked or
+ * unmasked when the link is down or up.
+ */
+ if (dev->slot && dev->slot->ops.prepare_link_change &&
+ phb->slot && phb->slot->ops.prepare_link_change)
+ dev->slot->ops.prepare_link_change =
+ phb->slot->ops.prepare_link_change;
+
+ // FIXME: check recommended init values for phb4
+
+ /*
+ * Enable the bridge slot capability in the root port's config
+ * space. This should probably be done *before* we start
+ * scanning config space, but we need a pci_device struct to
+ * exist before we do a slot lookup so *faaaaaaaaaaaaaart*
+ */
+ if (slot && slot->pluggable && slot->power_limit) {
+ uint64_t val;
+
+ val = in_be64(p->regs + PHB_PCIE_SCR);
+ val |= PHB_PCIE_SCR_SLOT_CAP;
+ out_be64(p->regs + PHB_PCIE_SCR, val);
+
+ /* update the cached slotcap */
+ pci_cfg_read32(phb, bdfn, ecap + PCICAP_EXP_SLOTCAP,
+ &slot->slot_cap);
+ }
+
+ /* Enable SERR and parity checking */
+ pci_cfg_read16(phb, bdfn, PCI_CFG_CMD, &val16);
+ val16 |= (PCI_CFG_CMD_SERR_EN | PCI_CFG_CMD_PERR_RESP |
+ PCI_CFG_CMD_MEM_EN);
+ pci_cfg_write16(phb, bdfn, PCI_CFG_CMD, val16);
+
+ /* Enable reporting various errors */
+ if (!ecap) return;
+ pci_cfg_read16(phb, bdfn, ecap + PCICAP_EXP_DEVCTL, &val16);
+ val16 |= (PCICAP_EXP_DEVCTL_CE_REPORT |
+ PCICAP_EXP_DEVCTL_NFE_REPORT |
+ PCICAP_EXP_DEVCTL_FE_REPORT |
+ PCICAP_EXP_DEVCTL_UR_REPORT);
+ pci_cfg_write16(phb, bdfn, ecap + PCICAP_EXP_DEVCTL, val16);
+
+ if (!aercap) return;
+
+ /* Mask various unrecoverable errors */
+ pci_cfg_read32(phb, bdfn, aercap + PCIECAP_AER_UE_MASK, &val32);
+ val32 |= (PCIECAP_AER_UE_MASK_POISON_TLP |
+ PCIECAP_AER_UE_MASK_COMPL_TIMEOUT |
+ PCIECAP_AER_UE_MASK_COMPL_ABORT |
+ PCIECAP_AER_UE_MASK_ECRC);
+ pci_cfg_write32(phb, bdfn, aercap + PCIECAP_AER_UE_MASK, val32);
+
+ /* Report various unrecoverable errors as fatal errors */
+ pci_cfg_read32(phb, bdfn, aercap + PCIECAP_AER_UE_SEVERITY, &val32);
+ val32 |= (PCIECAP_AER_UE_SEVERITY_DLLP |
+ PCIECAP_AER_UE_SEVERITY_SURPRISE_DOWN |
+ PCIECAP_AER_UE_SEVERITY_FLOW_CTL_PROT |
+ PCIECAP_AER_UE_SEVERITY_UNEXP_COMPL |
+ PCIECAP_AER_UE_SEVERITY_RECV_OVFLOW |
+ PCIECAP_AER_UE_SEVERITY_MALFORMED_TLP);
+ pci_cfg_write32(phb, bdfn, aercap + PCIECAP_AER_UE_SEVERITY, val32);
+
+ /* Mask various recoverable errors */
+ pci_cfg_read32(phb, bdfn, aercap + PCIECAP_AER_CE_MASK, &val32);
+ val32 |= PCIECAP_AER_CE_MASK_ADV_NONFATAL;
+ pci_cfg_write32(phb, bdfn, aercap + PCIECAP_AER_CE_MASK, val32);
+
+ /* Enable ECRC check */
+ pci_cfg_read32(phb, bdfn, aercap + PCIECAP_AER_CAPCTL, &val32);
+ val32 |= (PCIECAP_AER_CAPCTL_ECRCG_EN |
+ PCIECAP_AER_CAPCTL_ECRCC_EN);
+ pci_cfg_write32(phb, bdfn, aercap + PCIECAP_AER_CAPCTL, val32);
+
+ /* Enable all error reporting */
+ pci_cfg_read32(phb, bdfn, aercap + PCIECAP_AER_RERR_CMD, &val32);
+ val32 |= (PCIECAP_AER_RERR_CMD_FE |
+ PCIECAP_AER_RERR_CMD_NFE |
+ PCIECAP_AER_RERR_CMD_CE);
+ pci_cfg_write32(phb, bdfn, aercap + PCIECAP_AER_RERR_CMD, val32);
+}
+
+static void phb4_switch_port_init(struct phb *phb,
+ struct pci_device *dev,
+ int ecap, int aercap)
+{
+ uint16_t bdfn = dev->bdfn;
+ uint16_t val16;
+ uint32_t val32;
+
+ // FIXME: update AER settings for phb4
+
+ /* Enable SERR and parity checking and disable INTx */
+ pci_cfg_read16(phb, bdfn, PCI_CFG_CMD, &val16);
+ val16 |= (PCI_CFG_CMD_PERR_RESP |
+ PCI_CFG_CMD_SERR_EN |
+ PCI_CFG_CMD_INTx_DIS);
+ pci_cfg_write16(phb, bdfn, PCI_CFG_CMD, val16);
+
+ /* Disable parity error response and enable system error reporting */
+ pci_cfg_read16(phb, bdfn, PCI_CFG_BRCTL, &val16);
+ val16 &= ~PCI_CFG_BRCTL_PERR_RESP_EN;
+ val16 |= PCI_CFG_BRCTL_SERR_EN;
+ pci_cfg_write16(phb, bdfn, PCI_CFG_BRCTL, val16);
+
+ /* Enable reporting various errors */
+ if (!ecap) return;
+ pci_cfg_read16(phb, bdfn, ecap + PCICAP_EXP_DEVCTL, &val16);
+ val16 |= (PCICAP_EXP_DEVCTL_CE_REPORT |
+ PCICAP_EXP_DEVCTL_NFE_REPORT |
+ PCICAP_EXP_DEVCTL_FE_REPORT);
+ /* HW279570 - Disable reporting of correctable errors */
+ val16 &= ~PCICAP_EXP_DEVCTL_CE_REPORT;
+ pci_cfg_write16(phb, bdfn, ecap + PCICAP_EXP_DEVCTL, val16);
+
+ /* Unmask all unrecoverable errors */
+ if (!aercap) return;
+ pci_cfg_write32(phb, bdfn, aercap + PCIECAP_AER_UE_MASK, 0x0);
+
+ /* Severity of unrecoverable errors */
+ if (dev->dev_type == PCIE_TYPE_SWITCH_UPPORT)
+ val32 = (PCIECAP_AER_UE_SEVERITY_DLLP |
+ PCIECAP_AER_UE_SEVERITY_SURPRISE_DOWN |
+ PCIECAP_AER_UE_SEVERITY_FLOW_CTL_PROT |
+ PCIECAP_AER_UE_SEVERITY_RECV_OVFLOW |
+ PCIECAP_AER_UE_SEVERITY_MALFORMED_TLP |
+ PCIECAP_AER_UE_SEVERITY_INTERNAL);
+ else
+ val32 = (PCIECAP_AER_UE_SEVERITY_FLOW_CTL_PROT |
+ PCIECAP_AER_UE_SEVERITY_INTERNAL);
+ pci_cfg_write32(phb, bdfn, aercap + PCIECAP_AER_UE_SEVERITY, val32);
+
+ /*
+ * Mask various correctable errors
+ */
+ val32 = PCIECAP_AER_CE_MASK_ADV_NONFATAL;
+ pci_cfg_write32(phb, bdfn, aercap + PCIECAP_AER_CE_MASK, val32);
+
+ /* Enable ECRC generation and disable ECRC check */
+ pci_cfg_read32(phb, bdfn, aercap + PCIECAP_AER_CAPCTL, &val32);
+ val32 |= PCIECAP_AER_CAPCTL_ECRCG_EN;
+ val32 &= ~PCIECAP_AER_CAPCTL_ECRCC_EN;
+ pci_cfg_write32(phb, bdfn, aercap + PCIECAP_AER_CAPCTL, val32);
+}
+
+static void phb4_endpoint_init(struct phb *phb,
+ struct pci_device *dev,
+ int ecap, int aercap)
+{
+ uint16_t bdfn = dev->bdfn;
+ uint16_t val16;
+ uint32_t val32;
+
+ /* Enable SERR and parity checking */
+ pci_cfg_read16(phb, bdfn, PCI_CFG_CMD, &val16);
+ val16 |= (PCI_CFG_CMD_PERR_RESP |
+ PCI_CFG_CMD_SERR_EN);
+ pci_cfg_write16(phb, bdfn, PCI_CFG_CMD, val16);
+
+ /* Enable reporting various errors */
+ if (!ecap) return;
+ pci_cfg_read16(phb, bdfn, ecap + PCICAP_EXP_DEVCTL, &val16);
+ val16 &= ~PCICAP_EXP_DEVCTL_CE_REPORT;
+ val16 |= (PCICAP_EXP_DEVCTL_NFE_REPORT |
+ PCICAP_EXP_DEVCTL_FE_REPORT |
+ PCICAP_EXP_DEVCTL_UR_REPORT);
+ pci_cfg_write16(phb, bdfn, ecap + PCICAP_EXP_DEVCTL, val16);
+
+ /* Enable ECRC generation and check */
+ if (!aercap)
+ return;
+
+ pci_cfg_read32(phb, bdfn, aercap + PCIECAP_AER_CAPCTL, &val32);
+ val32 |= (PCIECAP_AER_CAPCTL_ECRCG_EN |
+ PCIECAP_AER_CAPCTL_ECRCC_EN);
+ pci_cfg_write32(phb, bdfn, aercap + PCIECAP_AER_CAPCTL, val32);
+}
+
+static int64_t phb4_pcicfg_no_dstate(void *dev __unused,
+ struct pci_cfg_reg_filter *pcrf,
+ uint32_t offset, uint32_t len __unused,
+ uint32_t *data __unused, bool write)
+{
+ uint32_t loff = offset - pcrf->start;
+
+ /* Disable D-state change on children of the PHB. For now we
+ * simply block all writes to the PM control/status
+ */
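+ /* Offsets 4..5 within the PM capability hold the PMCSR, which is
+ * where the D-state is set.
+ */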
+ if (write && loff >= 4 && loff < 6)
+ return OPAL_SUCCESS;
+
+ return OPAL_PARTIAL;
+}
+
+void phb4_pec2_dma_engine_realloc(struct phb4 *p)
+{
+ uint64_t reg;
+
+ /*
+ * Allocate 16 extra dma read engines to stack 0, to boost dma
+ * performance for devices on stack 0 of PEC2, i.e. PHB3.
+ * It comes at a price of reduced read engine allocation for
+ * devices on stack 1 and 2. The engine allocation becomes
+ * 48/8/8 instead of the default 32/16/16.
+ *
+ * The reallocation magic value should be 0xffff0000ff008000,
+ * but per the PCI designers, dma engine 32 (bit 0) has a
+ * quirk, and 0x7fff80007F008000 has the same effect (engine
+ * 32 goes to PHB4).
+ */
+ if (p->index != 3) /* shared slot on PEC2 */
+ return;
+
+ PHBINF(p, "Allocating an extra 16 dma read engines on PEC2 stack0\n");
+ reg = 0x7fff80007F008000ULL;
+ xscom_write(p->chip_id,
+ p->pci_xscom + XPEC_PCI_PRDSTKOVR, reg);
+ xscom_write(p->chip_id,
+ p->pe_xscom + XPEC_NEST_READ_STACK_OVERRIDE, reg);
+}
+
+static void phb4_check_device_quirks(struct pci_device *dev)
+{
+ /* Some special adapter tweaks for devices directly under the PHB */
+ if (dev->primary_bus != 1)
+ return;
+
+ /* PM quirk */
+ if (!pci_has_cap(dev, PCI_CFG_CAP_ID_PM, false))
+ return;
+
+ pci_add_cfg_reg_filter(dev,
+ pci_cap(dev, PCI_CFG_CAP_ID_PM, false), 8,
+ PCI_REG_FLAG_WRITE,
+ phb4_pcicfg_no_dstate);
+}
+
+static int phb4_device_init(struct phb *phb, struct pci_device *dev,
+ void *data __unused)
+{
+ int ecap, aercap;
+
+ /* Setup special device quirks */
+ phb4_check_device_quirks(dev);
+
+ /* Common initialization for the device */
+ pci_device_init(phb, dev);
+
+ ecap = pci_cap(dev, PCI_CFG_CAP_ID_EXP, false);
+ aercap = pci_cap(dev, PCIECAP_ID_AER, true);
+ if (dev->dev_type == PCIE_TYPE_ROOT_PORT)
+ phb4_root_port_init(phb, dev, ecap, aercap);
+ else if (dev->dev_type == PCIE_TYPE_SWITCH_UPPORT ||
+ dev->dev_type == PCIE_TYPE_SWITCH_DNPORT)
+ phb4_switch_port_init(phb, dev, ecap, aercap);
+ else
+ phb4_endpoint_init(phb, dev, ecap, aercap);
+
+ return 0;
+}
+
+static int64_t phb4_pci_reinit(struct phb *phb, uint64_t scope, uint64_t data)
+{
+ struct pci_device *pd;
+ uint16_t bdfn = data;
+ int ret;
+
+ if (scope != OPAL_REINIT_PCI_DEV)
+ return OPAL_PARAMETER;
+
+ pd = pci_find_dev(phb, bdfn);
+ if (!pd)
+ return OPAL_PARAMETER;
+
+ ret = phb4_device_init(phb, pd, NULL);
+ if (ret)
+ return OPAL_HARDWARE;
+
+ return OPAL_SUCCESS;
+}
+
+/* Default value for MBT0, see comments in init_ioda_cache() */
+static uint64_t phb4_default_mbt0(struct phb4 *p, unsigned int bar_idx)
+{
+ uint64_t mbt0;
+
+ switch (p->mbt_size - bar_idx - 1) {
+ case 0:
+ mbt0 = SETFIELD(IODA3_MBT0_MODE, 0ull, IODA3_MBT0_MODE_MDT);
+ mbt0 = SETFIELD(IODA3_MBT0_MDT_COLUMN, mbt0, 3);
+ break;
+ case 1:
+ mbt0 = SETFIELD(IODA3_MBT0_MODE, 0ull, IODA3_MBT0_MODE_MDT);
+ mbt0 = SETFIELD(IODA3_MBT0_MDT_COLUMN, mbt0, 2);
+ break;
+ case 2:
+ mbt0 = SETFIELD(IODA3_MBT0_MODE, 0ull, IODA3_MBT0_MODE_MDT);
+ mbt0 = SETFIELD(IODA3_MBT0_MDT_COLUMN, mbt0, 1);
+ break;
+ default:
+ mbt0 = SETFIELD(IODA3_MBT0_MODE, 0ull, IODA3_MBT0_MODE_PE_SEG);
+ }
+ return mbt0;
+}
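+
+/*
+ * For illustration, with a hypothetical mbt_size of 32 the mapping above
+ * gives: BAR 31 -> MDT column 3, BAR 30 -> column 2, BAR 29 -> column 1,
+ * and BARs 1..28 default to fully segmented mode (segment# == PE#).
+ */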
+
+/*
+ * Clear the saved (cached) IODA state.
+ *
+ * The caches here are used to save the configuration of the IODA tables
+ * done by the OS. When the PHB is reset it loses all of its internal state
+ * so we need to keep a copy to restore from. This function re-initialises
+ * the saved state to sane defaults.
+ */
+static void phb4_init_ioda_cache(struct phb4 *p)
+{
+ uint32_t i;
+
+ /*
+ * The RTT entries (RTE) are supposed to be initialised to
+ * 0xFF which indicates an invalid PE# for that RTT index
+ * (the bdfn). However, we set them to the reserved PE# since
+ * Linux needs to find the devices first by scanning config
+ * space and this occurs before PEs have been assigned.
+ */
+ for (i = 0; i < RTT_TABLE_ENTRIES; i++)
+ p->tbl_rtt[i] = cpu_to_be16(PHB4_RESERVED_PE_NUM(p));
+ memset(p->tbl_peltv, 0x0, p->tbl_peltv_size);
+ memset(p->tve_cache, 0x0, sizeof(p->tve_cache));
+
+ /* XXX Should we mask them ? */
+ memset(p->mist_cache, 0x0, sizeof(p->mist_cache));
+
+ /* Configure MBT entries 1...N */
+
+ /* Column 0 is left 0 and will be used for M32 and configured
+ * by the OS. We use MDT columns 1..3 for the last 3 BARs, thus
+ * allowing Linux to remap those, and set up all the other ones
+ * for now in mode 00 (segment# == PE#). By default those
+ * columns are set to map the same way.
+ */
+ for (i = 0; i < p->max_num_pes; i++) {
+ p->mdt_cache[i] = SETFIELD(IODA3_MDT_PE_B, 0ull, i);
+ p->mdt_cache[i] |= SETFIELD(IODA3_MDT_PE_C, 0ull, i);
+ p->mdt_cache[i] |= SETFIELD(IODA3_MDT_PE_D, 0ull, i);
+ }
+
+ /* Initialize MBT entries for BARs 1...N */
+ for (i = 1; i < p->mbt_size; i++) {
+ p->mbt_cache[i][0] = phb4_default_mbt0(p, i);
+ p->mbt_cache[i][1] = 0;
+ }
+
+ /* Initialize M32 BAR using MBT entry 0, MDT column A */
+ p->mbt_cache[0][0] = SETFIELD(IODA3_MBT0_MODE, 0ull, IODA3_MBT0_MODE_MDT);
+ p->mbt_cache[0][0] |= SETFIELD(IODA3_MBT0_MDT_COLUMN, 0ull, 0);
+ p->mbt_cache[0][0] |= IODA3_MBT0_TYPE_M32 | (p->mm1_base & IODA3_MBT0_BASE_ADDR);
+ p->mbt_cache[0][1] = IODA3_MBT1_ENABLE | ((~(M32_PCI_SIZE - 1)) & IODA3_MBT1_MASK);
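+ /* The pair above encodes the M32 window: MBT0 carries the mode, MDT
+ * column select, type and base address, while MBT1 carries the enable
+ * bit and the address mask, i.e. ~(M32_PCI_SIZE - 1).
+ */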
+}
+
+static int64_t phb4_wait_bit(struct phb4 *p, uint32_t reg,
+ uint64_t mask, uint64_t want_val)
+{
+ uint64_t val;
+
+ /* Wait for all pending TCE kills to complete
+ *
+ * XXX Add timeout...
+ */
+ /* XXX SIMICS is nasty... */
+ if ((reg == PHB_TCE_KILL || reg == PHB_DMA_READ_WRITE_SYNC) &&
+ chip_quirk(QUIRK_SIMICS))
+ return OPAL_SUCCESS;
+
+ for (;;) {
+ val = in_be64(p->regs + reg);
+ if (val == 0xffffffffffffffffull) {
+ /* XXX Fenced ? */
+ return OPAL_HARDWARE;
+ }
+ if ((val & mask) == want_val)
+ break;
+
+ }
+ return OPAL_SUCCESS;
+}
+
+static int64_t phb4_tce_kill(struct phb *phb, uint32_t kill_type,
+ uint64_t pe_number, uint32_t tce_size,
+ uint64_t dma_addr, uint32_t npages)
+{
+ struct phb4 *p = phb_to_phb4(phb);
+ uint64_t val;
+ int64_t rc;
+
+ sync();
+ switch(kill_type) {
+ case OPAL_PCI_TCE_KILL_PAGES:
+ while (npages--) {
+ /* Wait for a slot in the HW kill queue */
+ rc = phb4_wait_bit(p, PHB_TCE_KILL,
+ PHB_TCE_KILL_ALL |
+ PHB_TCE_KILL_PE |
+ PHB_TCE_KILL_ONE, 0);
+ if (rc)
+ return rc;
+ val = SETFIELD(PHB_TCE_KILL_PENUM, dma_addr, pe_number);
+
+ /* Set appropriate page size */
+ switch(tce_size) {
+ case 0x1000:
+ if (dma_addr & 0xf000000000000fffull)
+ return OPAL_PARAMETER;
+ break;
+ case 0x10000:
+ if (dma_addr & 0xf00000000000ffffull)
+ return OPAL_PARAMETER;
+ val |= PHB_TCE_KILL_PSEL | PHB_TCE_KILL_64K;
+ break;
+ case 0x200000:
+ if (dma_addr & 0xf0000000001fffffull)
+ return OPAL_PARAMETER;
+ val |= PHB_TCE_KILL_PSEL | PHB_TCE_KILL_2M;
+ break;
+ case 0x40000000:
+ if (dma_addr & 0xf00000003fffffffull)
+ return OPAL_PARAMETER;
+ val |= PHB_TCE_KILL_PSEL | PHB_TCE_KILL_1G;
+ break;
+ default:
+ return OPAL_PARAMETER;
+ }
+ /* Perform kill */
+ out_be64(p->regs + PHB_TCE_KILL, PHB_TCE_KILL_ONE | val);
+ /* Next page */
+ dma_addr += tce_size;
+ }
+ break;
+ case OPAL_PCI_TCE_KILL_PE:
+ /* Wait for a slot in the HW kill queue */
+ rc = phb4_wait_bit(p, PHB_TCE_KILL,
+ PHB_TCE_KILL_ALL |
+ PHB_TCE_KILL_PE |
+ PHB_TCE_KILL_ONE, 0);
+ if (rc)
+ return rc;
+ /* Perform kill */
+ out_be64(p->regs + PHB_TCE_KILL, PHB_TCE_KILL_PE |
+ SETFIELD(PHB_TCE_KILL_PENUM, 0ull, pe_number));
+ break;
+ case OPAL_PCI_TCE_KILL_ALL:
+ /* Wait for a slot in the HW kill queue */
+ rc = phb4_wait_bit(p, PHB_TCE_KILL,
+ PHB_TCE_KILL_ALL |
+ PHB_TCE_KILL_PE |
+ PHB_TCE_KILL_ONE, 0);
+ if (rc)
+ return rc;
+ /* Perform kill */
+ out_be64(p->regs + PHB_TCE_KILL, PHB_TCE_KILL_ALL);
+ break;
+ default:
+ return OPAL_PARAMETER;
+ }
+
+ /* Start DMA sync process */
+ if (is_phb5()){
+ val = in_be64(p->regs + PHB_DMA_READ_WRITE_SYNC) &
+ (PHB_DMA_READ_SYNC_COMPLETE |
+ PHB_DMA_WRITE_SYNC_COMPLETE);
+ out_be64(p->regs + PHB_DMA_READ_WRITE_SYNC,
+ val | PHB_DMA_READ_SYNC_START);
+
+ } else {
+ out_be64(p->regs + PHB_DMA_READ_WRITE_SYNC,
+ PHB_DMA_READ_SYNC_START);
+ }
+
+ /* Wait for kill to complete */
+ rc = phb4_wait_bit(p, PHB_Q_DMA_R, PHB_Q_DMA_R_TCE_KILL_STATUS, 0);
+ if (rc)
+ return rc;
+
+ /* Wait for DMA sync to complete */
+ return phb4_wait_bit(p, PHB_DMA_READ_WRITE_SYNC,
+ PHB_DMA_READ_SYNC_COMPLETE,
+ PHB_DMA_READ_SYNC_COMPLETE);
+}
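+
+/*
+ * Note on the per-page kill path above: the DMA address must be naturally
+ * aligned to the TCE page size and must have bits 60..63 clear; the masks
+ * such as 0xf00000000000ffffull check both conditions at once.
+ */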
+
+/* phb4_ioda_reset - Reset the IODA tables
+ *
+ * @purge: If true, the cache is cleared and the cleared values
+ * are applied to HW. If false, the cached values are
+ * applied to HW
+ *
+ * This resets the IODA tables in the PHB. It is called at
+ * initialization time, on PHB reset, and can be called
+ * explicitly from OPAL
+ */
+static int64_t phb4_ioda_reset(struct phb *phb, bool purge)
+{
+ struct phb4 *p = phb_to_phb4(phb);
+ uint32_t i;
+ uint64_t val;
+
+ if (purge) {
+ PHBDBG(p, "Purging all IODA tables...\n");
+ if (phb->slot)
+ phb->slot->link_retries = PHB4_LINK_LINK_RETRIES;
+ phb4_init_ioda_cache(p);
+ }
+
+ /* Init_30..31 - Errata workaround, clear PESTA entry 0 */
+ phb4_ioda_sel(p, IODA3_TBL_PESTA, 0, false);
+ out_be64(p->regs + PHB_IODA_DATA0, 0);
+
+ /* Init_32..33 - MIST */
+ phb4_ioda_sel(p, IODA3_TBL_MIST, 0, true);
+ val = in_be64(p->regs + PHB_IODA_ADDR);
+ val = SETFIELD(PHB_IODA_AD_MIST_PWV, val, 0xf);
+ out_be64(p->regs + PHB_IODA_ADDR, val);
+ for (i = 0; i < (p->num_irqs/4); i++)
+ out_be64(p->regs + PHB_IODA_DATA0, p->mist_cache[i]);
+
+ /* Init_34..35 - MRT */
+ phb4_ioda_sel(p, IODA3_TBL_MRT, 0, true);
+ for (i = 0; i < p->mrt_size; i++)
+ out_be64(p->regs + PHB_IODA_DATA0, 0);
+
+ /* Init_36..37 - TVT */
+ phb4_ioda_sel(p, IODA3_TBL_TVT, 0, true);
+ for (i = 0; i < p->tvt_size; i++)
+ out_be64(p->regs + PHB_IODA_DATA0, p->tve_cache[i]);
+
+ /* Init_38..39 - MBT */
+ phb4_ioda_sel(p, IODA3_TBL_MBT, 0, true);
+ for (i = 0; i < p->mbt_size; i++) {
+ out_be64(p->regs + PHB_IODA_DATA0, p->mbt_cache[i][0]);
+ out_be64(p->regs + PHB_IODA_DATA0, p->mbt_cache[i][1]);
+ }
+
+ /* Init_40..41 - MDT */
+ phb4_ioda_sel(p, IODA3_TBL_MDT, 0, true);
+ for (i = 0; i < p->max_num_pes; i++)
+ out_be64(p->regs + PHB_IODA_DATA0, p->mdt_cache[i]);
+
+ /* Additional OPAL specific inits */
+
+ /* Clear PEST & PEEV */
+ for (i = 0; i < p->max_num_pes; i++) {
+ phb4_ioda_sel(p, IODA3_TBL_PESTA, i, false);
+ out_be64(p->regs + PHB_IODA_DATA0, 0);
+ phb4_ioda_sel(p, IODA3_TBL_PESTB, i, false);
+ out_be64(p->regs + PHB_IODA_DATA0, 0);
+ }
+
+ phb4_ioda_sel(p, IODA3_TBL_PEEV, 0, true);
+ for (i = 0; i < p->max_num_pes/64; i++)
+ out_be64(p->regs + PHB_IODA_DATA0, 0);
+
+ /* Invalidate RTE, TCE cache */
+ out_be64(p->regs + PHB_RTC_INVALIDATE, PHB_RTC_INVALIDATE_ALL);
+
+ return phb4_tce_kill(&p->phb, OPAL_PCI_TCE_KILL_ALL, 0, 0, 0, 0);
+}
+
+/*
+ * Clear anything we have in the PAPR Error Injection registers. The
+ * spec says PAPR error injection should be one-shot, without a "sticky"
+ * bit, but experiments show otherwise. So we have to clear the registers
+ * at the appropriate point in the kernel to avoid an endlessly frozen
+ * PE.
+ */
+static int64_t phb4_papr_errinjct_reset(struct phb *phb)
+{
+ struct phb4 *p = phb_to_phb4(phb);
+
+ out_be64(p->regs + PHB_PAPR_ERR_INJ_CTL, 0x0ul);
+ out_be64(p->regs + PHB_PAPR_ERR_INJ_ADDR, 0x0ul);
+ out_be64(p->regs + PHB_PAPR_ERR_INJ_MASK, 0x0ul);
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t phb4_set_phb_mem_window(struct phb *phb,
+ uint16_t window_type,
+ uint16_t window_num,
+ uint64_t addr,
+ uint64_t pci_addr __unused,
+ uint64_t size)
+{
+ struct phb4 *p = phb_to_phb4(phb);
+ uint64_t mbt0, mbt1;
+
+ /*
+ * We have a unified MBT for all BARs on PHB4.
+ *
+ * So we use it as follows:
+ *
+ * - M32 is hard wired to be MBT[0] and uses MDT column 0
+ * for remapping.
+ *
+ * - MBT[1..n] are available to the OS, currently only as
+ * fully segmented or single PE (we don't yet expose the
+ * new segmentation modes).
+ *
+ * - We configure the last 3 BARs to columns 1..3, initially
+ * set to segment# == PE#. We will need to provide some
+ * extensions to the existing APIs to enable remapping of
+ * segments on those BARs (and only those) as the current
+ * API forces single segment mode.
+ */
+ switch (window_type) {
+ case OPAL_IO_WINDOW_TYPE:
+ case OPAL_M32_WINDOW_TYPE:
+ return OPAL_UNSUPPORTED;
+ case OPAL_M64_WINDOW_TYPE:
+ if (window_num == 0 || window_num >= p->mbt_size) {
+ PHBERR(p, "%s: Invalid window %d\n",
+ __func__, window_num);
+ return OPAL_PARAMETER;
+ }
+
+ mbt0 = p->mbt_cache[window_num][0];
+ mbt1 = p->mbt_cache[window_num][1];
+
+ /* XXX For now we assume the 4K minimum alignment,
+ * todo: check with the HW folks what the exact limits
+ * are based on the segmentation model.
+ */
+ if ((addr & 0xFFFul) || (size & 0xFFFul)) {
+ PHBERR(p, "%s: Bad addr/size alignment %llx/%llx\n",
+ __func__, addr, size);
+ return OPAL_PARAMETER;
+ }
+
+ /* size should be 2^N */
+ if (!size || size & (size-1)) {
+ PHBERR(p, "%s: size not a power of 2: %llx\n",
+ __func__, size);
+ return OPAL_PARAMETER;
+ }
+
+ /* address should be size aligned */
+ if (addr & (size - 1)) {
+ PHBERR(p, "%s: addr not size aligned %llx/%llx\n",
+ __func__, addr, size);
+ return OPAL_PARAMETER;
+ }
+
+ break;
+ default:
+ return OPAL_PARAMETER;
+ }
+
+ /* The BAR shouldn't be enabled yet */
+ if (mbt0 & IODA3_MBT0_ENABLE)
+ return OPAL_PARTIAL;
+
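+	/*
+	 * Worked example of the encoding below: a 256MB window at
+	 * 0x600000000 puts 0x600000 (addr >> 12) in the base field and
+	 * ~((0x10000000 >> 12) - 1) = ~0xffff in the mask field, i.e.
+	 * ones in every mask bit above the 256MB span (SETFIELD clips
+	 * the value to the field width).
+	 */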
+ /* Apply the settings */
+ mbt0 = SETFIELD(IODA3_MBT0_BASE_ADDR, mbt0, addr >> 12);
+ mbt1 = SETFIELD(IODA3_MBT1_MASK, mbt1, ~((size >> 12) -1));
+ p->mbt_cache[window_num][0] = mbt0;
+ p->mbt_cache[window_num][1] = mbt1;
+
+ return OPAL_SUCCESS;
+}
+
+/*
+ * For one specific M64 BAR, it can be shared by all PEs,
+ * or owned by single PE exclusively.
+ */
+static int64_t phb4_phb_mmio_enable(struct phb *phb,
+ uint16_t window_type,
+ uint16_t window_num,
+ uint16_t enable)
+{
+ struct phb4 *p = phb_to_phb4(phb);
+ uint64_t mbt0, mbt1, base, mask;
+
+ /*
+	 * By design, PHB4 doesn't support IODT any more and the M32
+	 * BAR can't be enabled here either. So this function only
+	 * handles M64 mapping, and each BAR is supposed to be shared
+	 * by all PEs.
+ *
+ * TODO: Add support for some of the new PHB4 split modes
+ */
+ switch (window_type) {
+ case OPAL_IO_WINDOW_TYPE:
+ case OPAL_M32_WINDOW_TYPE:
+ return OPAL_UNSUPPORTED;
+ case OPAL_M64_WINDOW_TYPE:
+ /* Window 0 is reserved for M32 */
+ if (window_num == 0 || window_num >= p->mbt_size ||
+ enable > OPAL_ENABLE_M64_NON_SPLIT) {
+ PHBDBG(p,
+ "phb4_phb_mmio_enable wrong args (window %d enable %d)\n",
+ window_num, enable);
+ return OPAL_PARAMETER;
+ }
+ break;
+ default:
+ return OPAL_PARAMETER;
+ }
+
+ /*
+	 * We need to check the base/mask while enabling the M64 BAR.
+	 * Otherwise, an invalid base/mask might unintentionally cause
+	 * the AIB to be fenced.
+ */
+ mbt0 = p->mbt_cache[window_num][0];
+ mbt1 = p->mbt_cache[window_num][1];
+
+ if (enable == OPAL_DISABLE_M64) {
+ /* Reset the window to disabled & default mode */
+ mbt0 = phb4_default_mbt0(p, window_num);
+ mbt1 = 0;
+ } else {
+ /* Verify that the mode is valid and consistent */
+ if (enable == OPAL_ENABLE_M64_SPLIT) {
+ uint64_t mode = GETFIELD(IODA3_MBT0_MODE, mbt0);
+ if (mode != IODA3_MBT0_MODE_PE_SEG &&
+ mode != IODA3_MBT0_MODE_MDT)
+ return OPAL_PARAMETER;
+ } else if (enable == OPAL_ENABLE_M64_NON_SPLIT) {
+ if (GETFIELD(IODA3_MBT0_MODE, mbt0) !=
+ IODA3_MBT0_MODE_SINGLE_PE)
+ return OPAL_PARAMETER;
+ } else
+ return OPAL_PARAMETER;
+
+ base = GETFIELD(IODA3_MBT0_BASE_ADDR, mbt0);
+ base = (base << 12);
+ mask = GETFIELD(IODA3_MBT1_MASK, mbt1);
+ if (base < p->mm0_base || !mask)
+ return OPAL_PARTIAL;
+
+ mbt0 |= IODA3_MBT0_ENABLE;
+ mbt1 |= IODA3_MBT1_ENABLE;
+ }
+
+ /* Update HW and cache */
+ p->mbt_cache[window_num][0] = mbt0;
+ p->mbt_cache[window_num][1] = mbt1;
+ phb4_ioda_sel(p, IODA3_TBL_MBT, window_num << 1, true);
+ out_be64(p->regs + PHB_IODA_DATA0, mbt0);
+ out_be64(p->regs + PHB_IODA_DATA0, mbt1);
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t phb4_map_pe_mmio_window(struct phb *phb,
+ uint64_t pe_number,
+ uint16_t window_type,
+ uint16_t window_num,
+ uint16_t segment_num)
+{
+ struct phb4 *p = phb_to_phb4(phb);
+ uint64_t mbt0, mbt1, mdt0;
+
+ if (pe_number >= p->num_pes)
+ return OPAL_PARAMETER;
+
+ /*
+	 * We support a combined MDT that has 4 columns. We let the OS
+	 * use column 0 for M32.
+ *
+ * We configure the 3 last BARs to map column 3..1 which by default
+ * are set to map segment# == pe#, but can be remapped here if we
+ * extend this function.
+ *
+ * The problem is that the current API was "hijacked" so that an
+ * attempt at remapping any segment of an M64 has the effect of
+ * turning it into a single-PE mode BAR. So if we want to support
+	 * remapping we'll have to work around this, for example by creating
+ * a new API or a new window type...
+ */
+ switch(window_type) {
+ case OPAL_IO_WINDOW_TYPE:
+ return OPAL_UNSUPPORTED;
+ case OPAL_M32_WINDOW_TYPE:
+ if (window_num != 0 || segment_num >= p->num_pes)
+ return OPAL_PARAMETER;
+
+ mdt0 = p->mdt_cache[segment_num];
+ mdt0 = SETFIELD(IODA3_MDT_PE_A, mdt0, pe_number);
+ phb4_ioda_sel(p, IODA3_TBL_MDT, segment_num, false);
+ out_be64(p->regs + PHB_IODA_DATA0, mdt0);
+ break;
+ case OPAL_M64_WINDOW_TYPE:
+ if (window_num == 0 || window_num >= p->mbt_size)
+ return OPAL_PARAMETER;
+
+ mbt0 = p->mbt_cache[window_num][0];
+ mbt1 = p->mbt_cache[window_num][1];
+
+ /* The BAR shouldn't be enabled yet */
+ if (mbt0 & IODA3_MBT0_ENABLE)
+ return OPAL_PARTIAL;
+
+ /* Set to single PE mode and configure the PE */
+ mbt0 = SETFIELD(IODA3_MBT0_MODE, mbt0,
+ IODA3_MBT0_MODE_SINGLE_PE);
+ mbt1 = SETFIELD(IODA3_MBT1_SINGLE_PE_NUM, mbt1, pe_number);
+ p->mbt_cache[window_num][0] = mbt0;
+ p->mbt_cache[window_num][1] = mbt1;
+ break;
+ default:
+ return OPAL_PARAMETER;
+ }
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t phb4_map_pe_dma_window(struct phb *phb,
+ uint64_t pe_number,
+ uint16_t window_id,
+ uint16_t tce_levels,
+ uint64_t tce_table_addr,
+ uint64_t tce_table_size,
+ uint64_t tce_page_size)
+{
+ struct phb4 *p = phb_to_phb4(phb);
+ uint64_t tts_encoded;
+ uint64_t data64 = 0;
+
+ /*
+ * We configure the PHB in 2 TVE per PE mode to match phb3.
+ * Current Linux implementation *requires* the two windows per
+ * PE.
+ *
+ * Note: On DD2.0 this is the normal mode of operation.
+ */
+
+ /*
+ * Sanity check. We currently only support "2 window per PE" mode
+	 * i.e., only bit 59 of the PCI address is used to select the window
+ */
+ if (pe_number >= p->num_pes || (window_id >> 1) != pe_number)
+ return OPAL_PARAMETER;
+
+ /*
+ * tce_table_size == 0 is used to disable an entry, in this case
+ * we ignore other arguments
+ */
+ if (tce_table_size == 0) {
+ phb4_ioda_sel(p, IODA3_TBL_TVT, window_id, false);
+ out_be64(p->regs + PHB_IODA_DATA0, 0);
+ p->tve_cache[window_id] = 0;
+ return OPAL_SUCCESS;
+ }
+
+ /* Additional arguments validation */
+ if (tce_levels < 1 || tce_levels > 5 ||
+ !is_pow2(tce_table_size) ||
+ tce_table_size < 0x1000)
+ return OPAL_PARAMETER;
+
+ /* Encode TCE table size */
+ data64 = SETFIELD(IODA3_TVT_TABLE_ADDR, 0ul, tce_table_addr >> 12);
+ tts_encoded = ilog2(tce_table_size) - 11;
+ if (tts_encoded > 31)
+ return OPAL_PARAMETER;
+ data64 = SETFIELD(IODA3_TVT_TCE_TABLE_SIZE, data64, tts_encoded);
+
+ /* Encode TCE page size */
+ switch (tce_page_size) {
+ case 0x1000: /* 4K */
+ data64 = SETFIELD(IODA3_TVT_IO_PSIZE, data64, 1);
+ break;
+ case 0x10000: /* 64K */
+ data64 = SETFIELD(IODA3_TVT_IO_PSIZE, data64, 5);
+ break;
+ case 0x200000: /* 2M */
+ data64 = SETFIELD(IODA3_TVT_IO_PSIZE, data64, 10);
+ break;
+ case 0x40000000: /* 1G */
+ data64 = SETFIELD(IODA3_TVT_IO_PSIZE, data64, 19);
+ break;
+ default:
+ return OPAL_PARAMETER;
+ }
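+	/*
+	 * Worked example: a single-level 64KB TCE table of 64KB pages
+	 * encodes TCE_TABLE_SIZE = ilog2(0x10000) - 11 = 5, IO_PSIZE = 5
+	 * and (below) NUM_LEVELS = 0.
+	 */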
+
+ /* Encode number of levels */
+ data64 = SETFIELD(IODA3_TVT_NUM_LEVELS, data64, tce_levels - 1);
+
+ phb4_ioda_sel(p, IODA3_TBL_TVT, window_id, false);
+ out_be64(p->regs + PHB_IODA_DATA0, data64);
+ p->tve_cache[window_id] = data64;
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t phb4_map_pe_dma_window_real(struct phb *phb,
+ uint64_t pe_number,
+ uint16_t window_id,
+ uint64_t pci_start_addr,
+ uint64_t pci_mem_size)
+{
+ struct phb4 *p = phb_to_phb4(phb);
+ uint64_t end = pci_start_addr + pci_mem_size;
+ uint64_t tve;
+
+ if (pe_number >= p->num_pes ||
+ (window_id >> 1) != pe_number)
+ return OPAL_PARAMETER;
+
+ if (pci_mem_size) {
+ /* Enable */
+
+ /*
+		 * Check that the start address has the right TVE index;
+		 * we only support the 1-bit mode where each PE has 2
+		 * TVEs.
+ */
+ if ((pci_start_addr >> 59) != (window_id & 1))
+ return OPAL_PARAMETER;
+ pci_start_addr &= ((1ull << 59) - 1);
+ end = pci_start_addr + pci_mem_size;
+
+ /* We have to be 16M aligned */
+ if ((pci_start_addr & 0x00ffffff) ||
+ (pci_mem_size & 0x00ffffff))
+ return OPAL_PARAMETER;
+
+ /*
+		 * It *looks* like this is the max we can support (we need
+		 * to verify this). Also, we are not checking for rollover,
+		 * but then we aren't trying too hard to protect ourselves
+		 * against a completely broken OS.
+ */
+ if (end > 0x0003ffffffffffffull)
+ return OPAL_PARAMETER;
+
+ /*
+ * Put start address bits 49:24 into TVE[52:53]||[0:23]
+ * and end address bits 49:24 into TVE[54:55]||[24:47]
+ * and set TVE[51]
+ */
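+		/*
+		 * Because start and size are 16M aligned (checked above),
+		 * bits 23:0 of both start and end are zero, so bits 49:24
+		 * of each fully describe the window and fit in the split
+		 * fields below.
+		 */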
+ tve = (pci_start_addr << 16) & (0xffffffull << 40);
+ tve |= (pci_start_addr >> 38) & (3ull << 10);
+ tve |= (end >> 8) & (0xfffffful << 16);
+ tve |= (end >> 40) & (3ull << 8);
+ tve |= PPC_BIT(51) | IODA3_TVT_NON_TRANSLATE_50;
+ } else {
+ /* Disable */
+ tve = 0;
+ }
+
+ phb4_ioda_sel(p, IODA3_TBL_TVT, window_id, false);
+ out_be64(p->regs + PHB_IODA_DATA0, tve);
+ p->tve_cache[window_id] = tve;
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t phb4_set_option(struct phb *phb, enum OpalPhbOption opt,
+ uint64_t setting)
+{
+ struct phb4 *p = phb_to_phb4(phb);
+ uint64_t data64;
+
+ data64 = phb4_read_reg(p, PHB_CTRLR);
+ switch (opt) {
+ case OPAL_PHB_OPTION_TVE1_4GB:
+ if (setting > 1)
+ return OPAL_PARAMETER;
+
+ PHBDBG(p, "4GB bypass mode = %lld\n", setting);
+ if (setting)
+ data64 |= PPC_BIT(24);
+ else
+ data64 &= ~PPC_BIT(24);
+ break;
+ case OPAL_PHB_OPTION_MMIO_EEH_DISABLE:
+ if (setting > 1)
+ return OPAL_PARAMETER;
+
+ PHBDBG(p, "MMIO EEH Disable = %lld\n", setting);
+ if (setting)
+ data64 |= PPC_BIT(14);
+ else
+ data64 &= ~PPC_BIT(14);
+ break;
+ default:
+ return OPAL_UNSUPPORTED;
+ }
+ phb4_write_reg(p, PHB_CTRLR, data64);
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t phb4_get_option(struct phb *phb, enum OpalPhbOption opt,
+ __be64 *setting)
+{
+ struct phb4 *p = phb_to_phb4(phb);
+ uint64_t data64;
+
+ data64 = phb4_read_reg(p, PHB_CTRLR);
+ switch (opt) {
+ case OPAL_PHB_OPTION_TVE1_4GB:
+ *setting = cpu_to_be64((data64 & PPC_BIT(24)) ? 1 : 0);
+ break;
+ case OPAL_PHB_OPTION_MMIO_EEH_DISABLE:
+ *setting = cpu_to_be64((data64 & PPC_BIT(14)) ? 1 : 0);
+ break;
+ default:
+ return OPAL_UNSUPPORTED;
+ }
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t phb4_set_ive_pe(struct phb *phb,
+ uint64_t pe_number,
+ uint32_t ive_num)
+{
+ struct phb4 *p = phb_to_phb4(phb);
+ uint32_t mist_idx;
+ uint32_t mist_quad;
+ uint32_t mist_shift;
+ uint64_t val;
+
+ if (pe_number >= p->num_pes || ive_num >= (p->num_irqs - 8))
+ return OPAL_PARAMETER;
+
+ mist_idx = ive_num >> 2;
+ mist_quad = ive_num & 3;
+ mist_shift = (3 - mist_quad) << 4;
+ p->mist_cache[mist_idx] &= ~(0x0fffull << mist_shift);
+ p->mist_cache[mist_idx] |= ((uint64_t)pe_number) << mist_shift;
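+	/*
+	 * Worked example: for ive_num 0x46, mist_idx is 0x11, mist_quad
+	 * is 2 and mist_shift is 16, so the PE# lands in the low bits of
+	 * the third 16-bit quad (quads are packed MSB first: quad 0 at
+	 * shift 48, quad 3 at shift 0). The matching single-quad write
+	 * enable used below is 8 >> 2 = 2.
+	 */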
+
+ /* Note: This has the side effect of clearing P/Q, so this
+ * shouldn't be called while the interrupt is "hot"
+ */
+
+ phb4_ioda_sel(p, IODA3_TBL_MIST, mist_idx, false);
+
+ /* We need to inject the appropriate MIST write enable bit
+ * in the IODA table address register
+ */
+ val = in_be64(p->regs + PHB_IODA_ADDR);
+ val = SETFIELD(PHB_IODA_AD_MIST_PWV, val, 8 >> mist_quad);
+ out_be64(p->regs + PHB_IODA_ADDR, val);
+
+ /* Write entry */
+ out_be64(p->regs + PHB_IODA_DATA0, p->mist_cache[mist_idx]);
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t phb4_get_msi_32(struct phb *phb,
+ uint64_t pe_number,
+ uint32_t ive_num,
+ uint8_t msi_range,
+ uint32_t *msi_address,
+ uint32_t *message_data)
+{
+ struct phb4 *p = phb_to_phb4(phb);
+
+ /*
+	 * Sanity check. As on PHB3, we needn't check the mve_number
+	 * (PE#) since the interrupt source is purely determined by its
+	 * DMA address and data, but the check isn't harmful.
+ */
+ if (pe_number >= p->num_pes ||
+ ive_num >= (p->num_irqs - 8) ||
+ msi_range != 1 || !msi_address|| !message_data)
+ return OPAL_PARAMETER;
+
+ /*
+ * DMA address and data will form the IVE index.
+ * For more details, please refer to IODA2 spec.
+ */
+ *msi_address = 0xFFFF0000 | ((ive_num << 4) & 0xFFFFFE0F);
+ *message_data = ive_num & 0x1F;
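+	/*
+	 * Worked example: ive_num 0x123 yields msi_address 0xFFFF1200
+	 * and message_data 0x3; the low 5 bits of the IVE number travel
+	 * in the MSI data and the remaining bits in address bits 9 and
+	 * above.
+	 */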
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t phb4_get_msi_64(struct phb *phb,
+ uint64_t pe_number,
+ uint32_t ive_num,
+ uint8_t msi_range,
+ uint64_t *msi_address,
+ uint32_t *message_data)
+{
+ struct phb4 *p = phb_to_phb4(phb);
+
+ /* Sanity check */
+ if (pe_number >= p->num_pes ||
+ ive_num >= (p->num_irqs - 8) ||
+ msi_range != 1 || !msi_address || !message_data)
+ return OPAL_PARAMETER;
+
+ /*
+ * DMA address and data will form the IVE index.
+ * For more details, please refer to IODA2 spec.
+ */
+ *msi_address = (0x1ul << 60) | ((ive_num << 4) & 0xFFFFFFFFFFFFFE0Ful);
+ *message_data = ive_num & 0x1F;
+
+ return OPAL_SUCCESS;
+}
+
+static void phb4_rc_err_clear(struct phb4 *p)
+{
+ /* Init_47 - Clear errors */
+ phb4_pcicfg_write16(&p->phb, 0, PCI_CFG_SECONDARY_STATUS, 0xffff);
+
+ if (p->ecap <= 0)
+ return;
+
+ phb4_pcicfg_write16(&p->phb, 0, p->ecap + PCICAP_EXP_DEVSTAT,
+ PCICAP_EXP_DEVSTAT_CE |
+ PCICAP_EXP_DEVSTAT_NFE |
+ PCICAP_EXP_DEVSTAT_FE |
+ PCICAP_EXP_DEVSTAT_UE);
+
+ if (p->aercap <= 0)
+ return;
+
+ /* Clear all UE status */
+ phb4_pcicfg_write32(&p->phb, 0, p->aercap + PCIECAP_AER_UE_STATUS,
+ 0xffffffff);
+ /* Clear all CE status */
+ phb4_pcicfg_write32(&p->phb, 0, p->aercap + PCIECAP_AER_CE_STATUS,
+ 0xffffffff);
+ /* Clear root error status */
+ phb4_pcicfg_write32(&p->phb, 0, p->aercap + PCIECAP_AER_RERR_STA,
+ 0xffffffff);
+}
+
+static void phb4_err_clear_regb(struct phb4 *p)
+{
+ uint64_t val64;
+
+ val64 = phb4_read_reg(p, PHB_REGB_ERR_STATUS);
+ phb4_write_reg(p, PHB_REGB_ERR_STATUS, val64);
+ phb4_write_reg(p, PHB_REGB_ERR1_STATUS, 0x0ul);
+ phb4_write_reg(p, PHB_REGB_ERR_LOG_0, 0x0ul);
+ phb4_write_reg(p, PHB_REGB_ERR_LOG_1, 0x0ul);
+}
+
+/*
+ * The function can be called during error recovery for all classes of
+ * errors. This is new to PHB4; previous revisions had separate
+ * sequences for INF/ER/Fatal errors.
+ *
+ * "Rec #" in this function refer to "Recov_#" steps in the
+ * PHB4 INF recovery sequence.
+ */
+static void phb4_err_clear(struct phb4 *p)
+{
+ uint64_t val64;
+ uint64_t fir = phb4_read_reg(p, PHB_LEM_FIR_ACCUM);
+
+ /* Rec 1: Acquire the PCI config lock (we don't need to do this) */
+
+ /* Rec 2...15: Clear error status in RC config space */
+ phb4_rc_err_clear(p);
+
+ /* Rec 16...23: Clear PBL errors */
+ val64 = phb4_read_reg(p, PHB_PBL_ERR_STATUS);
+ phb4_write_reg(p, PHB_PBL_ERR_STATUS, val64);
+ phb4_write_reg(p, PHB_PBL_ERR1_STATUS, 0x0ul);
+ phb4_write_reg(p, PHB_PBL_ERR_LOG_0, 0x0ul);
+ phb4_write_reg(p, PHB_PBL_ERR_LOG_1, 0x0ul);
+
+ /* Rec 24...31: Clear REGB errors */
+ phb4_err_clear_regb(p);
+
+ /* Rec 32...59: Clear PHB error trap */
+ val64 = phb4_read_reg(p, PHB_TXE_ERR_STATUS);
+ phb4_write_reg(p, PHB_TXE_ERR_STATUS, val64);
+ phb4_write_reg(p, PHB_TXE_ERR1_STATUS, 0x0ul);
+ phb4_write_reg(p, PHB_TXE_ERR_LOG_0, 0x0ul);
+ phb4_write_reg(p, PHB_TXE_ERR_LOG_1, 0x0ul);
+
+ val64 = phb4_read_reg(p, PHB_RXE_ARB_ERR_STATUS);
+ phb4_write_reg(p, PHB_RXE_ARB_ERR_STATUS, val64);
+ phb4_write_reg(p, PHB_RXE_ARB_ERR1_STATUS, 0x0ul);
+ phb4_write_reg(p, PHB_RXE_ARB_ERR_LOG_0, 0x0ul);
+ phb4_write_reg(p, PHB_RXE_ARB_ERR_LOG_1, 0x0ul);
+
+ val64 = phb4_read_reg(p, PHB_RXE_MRG_ERR_STATUS);
+ phb4_write_reg(p, PHB_RXE_MRG_ERR_STATUS, val64);
+ phb4_write_reg(p, PHB_RXE_MRG_ERR1_STATUS, 0x0ul);
+ phb4_write_reg(p, PHB_RXE_MRG_ERR_LOG_0, 0x0ul);
+ phb4_write_reg(p, PHB_RXE_MRG_ERR_LOG_1, 0x0ul);
+
+ val64 = phb4_read_reg(p, PHB_RXE_TCE_ERR_STATUS);
+ phb4_write_reg(p, PHB_RXE_TCE_ERR_STATUS, val64);
+ phb4_write_reg(p, PHB_RXE_TCE_ERR1_STATUS, 0x0ul);
+ phb4_write_reg(p, PHB_RXE_TCE_ERR_LOG_0, 0x0ul);
+ phb4_write_reg(p, PHB_RXE_TCE_ERR_LOG_1, 0x0ul);
+
+ val64 = phb4_read_reg(p, PHB_ERR_STATUS);
+ phb4_write_reg(p, PHB_ERR_STATUS, val64);
+ phb4_write_reg(p, PHB_ERR1_STATUS, 0x0ul);
+ phb4_write_reg(p, PHB_ERR_LOG_0, 0x0ul);
+ phb4_write_reg(p, PHB_ERR_LOG_1, 0x0ul);
+
+ /* Rec 61/62: Clear FIR/WOF */
+ phb4_write_reg(p, PHB_LEM_FIR_AND_MASK, ~fir);
+ phb4_write_reg(p, PHB_LEM_WOF, 0x0ul);
+
+ /* Rec 63: Update LEM mask to its initial value */
+ phb4_write_reg(p, PHB_LEM_ERROR_MASK, 0x0ul);
+
+ /* Rec 64: Clear the PCI config lock (we don't need to do this) */
+}
+
+static void phb4_read_phb_status(struct phb4 *p,
+ struct OpalIoPhb4ErrorData *stat)
+{
+ uint32_t i;
+ __be64 *pPEST;
+ uint16_t __16;
+ uint32_t __32;
+ uint64_t __64;
+
+ memset(stat, 0, sizeof(struct OpalIoPhb4ErrorData));
+
+ /* Error data common part */
+ stat->common.version = cpu_to_be32(OPAL_PHB_ERROR_DATA_VERSION_1);
+ stat->common.ioType = cpu_to_be32(OPAL_PHB_ERROR_DATA_TYPE_PHB4);
+ stat->common.len = cpu_to_be32(sizeof(struct OpalIoPhb4ErrorData));
+
+ /* Use ASB for config space if the PHB is fenced */
+ if (p->flags & PHB4_AIB_FENCED)
+ p->flags |= PHB4_CFG_USE_ASB;
+
+ /* Grab RC bridge control, make it 32-bit */
+ phb4_pcicfg_read16(&p->phb, 0, PCI_CFG_BRCTL, &__16);
+ stat->brdgCtl = cpu_to_be32(__16);
+
+ /*
+ * Grab various RC PCIe capability registers. All device, slot
+ * and link status are 16-bit, so we grab the pair control+status
+ * for each of them
+ */
+ phb4_pcicfg_read32(&p->phb, 0, p->ecap + PCICAP_EXP_DEVCTL, &__32);
+ stat->deviceStatus = cpu_to_be32(__32);
+ phb4_pcicfg_read32(&p->phb, 0, p->ecap + PCICAP_EXP_SLOTCTL, &__32);
+ stat->slotStatus = cpu_to_be32(__32);
+ phb4_pcicfg_read32(&p->phb, 0, p->ecap + PCICAP_EXP_LCTL, &__32);
+ stat->linkStatus = cpu_to_be32(__32);
+
+ /*
+	 * I assume those are from the standard config space header: cmd &
+	 * status together make a 32-bit value. Secondary status is 16-bit,
+	 * so only the bottom half of that field is populated.
+ */
+ phb4_pcicfg_read32(&p->phb, 0, PCI_CFG_CMD, &__32);
+ stat->devCmdStatus = cpu_to_be32(__32);
+ phb4_pcicfg_read16(&p->phb, 0, PCI_CFG_SECONDARY_STATUS, &__16);
+ stat->devSecStatus = cpu_to_be32(__16);
+
+ /* Grab a bunch of AER regs */
+ phb4_pcicfg_read32(&p->phb, 0, p->aercap + PCIECAP_AER_RERR_STA, &__32);
+ stat->rootErrorStatus = cpu_to_be32(__32);
+ phb4_pcicfg_read32(&p->phb, 0, p->aercap + PCIECAP_AER_UE_STATUS, &__32);
+ stat->uncorrErrorStatus = cpu_to_be32(__32);
+
+ phb4_pcicfg_read32(&p->phb, 0, p->aercap + PCIECAP_AER_CE_STATUS, &__32);
+ stat->corrErrorStatus = cpu_to_be32(__32);
+
+ phb4_pcicfg_read32(&p->phb, 0, p->aercap + PCIECAP_AER_HDR_LOG0, &__32);
+ stat->tlpHdr1 = cpu_to_be32(__32);
+
+ phb4_pcicfg_read32(&p->phb, 0, p->aercap + PCIECAP_AER_HDR_LOG1, &__32);
+ stat->tlpHdr2 = cpu_to_be32(__32);
+
+ phb4_pcicfg_read32(&p->phb, 0, p->aercap + PCIECAP_AER_HDR_LOG2, &__32);
+ stat->tlpHdr3 = cpu_to_be32(__32);
+
+ phb4_pcicfg_read32(&p->phb, 0, p->aercap + PCIECAP_AER_HDR_LOG3, &__32);
+ stat->tlpHdr4 = cpu_to_be32(__32);
+
+ phb4_pcicfg_read32(&p->phb, 0, p->aercap + PCIECAP_AER_SRCID, &__32);
+ stat->sourceId = cpu_to_be32(__32);
+
+
+ /* PEC NFIR, same as P8/PHB3 */
+ xscom_read(p->chip_id, p->pe_stk_xscom + 0x0, &__64);
+ stat->nFir = cpu_to_be64(__64);
+ xscom_read(p->chip_id, p->pe_stk_xscom + 0x3, &__64);
+ stat->nFirMask = cpu_to_be64(__64);
+ xscom_read(p->chip_id, p->pe_stk_xscom + 0x8, &__64);
+ stat->nFirWOF = cpu_to_be64(__64);
+
+ /* PHB4 inbound and outbound error Regs */
+ stat->phbPlssr = cpu_to_be64(phb4_read_reg_asb(p, PHB_CPU_LOADSTORE_STATUS));
+ stat->phbCsr = cpu_to_be64(phb4_read_reg_asb(p, PHB_DMA_CHAN_STATUS));
+ stat->lemFir = cpu_to_be64(phb4_read_reg_asb(p, PHB_LEM_FIR_ACCUM));
+ stat->lemErrorMask = cpu_to_be64(phb4_read_reg_asb(p, PHB_LEM_ERROR_MASK));
+ stat->lemWOF = cpu_to_be64(phb4_read_reg_asb(p, PHB_LEM_WOF));
+ stat->phbErrorStatus = cpu_to_be64(phb4_read_reg_asb(p, PHB_ERR_STATUS));
+ stat->phbFirstErrorStatus = cpu_to_be64(phb4_read_reg_asb(p, PHB_ERR1_STATUS));
+ stat->phbErrorLog0 = cpu_to_be64(phb4_read_reg_asb(p, PHB_ERR_LOG_0));
+ stat->phbErrorLog1 = cpu_to_be64(phb4_read_reg_asb(p, PHB_ERR_LOG_1));
+ stat->phbTxeErrorStatus = cpu_to_be64(phb4_read_reg_asb(p, PHB_TXE_ERR_STATUS));
+ stat->phbTxeFirstErrorStatus = cpu_to_be64(phb4_read_reg_asb(p, PHB_TXE_ERR1_STATUS));
+ stat->phbTxeErrorLog0 = cpu_to_be64(phb4_read_reg_asb(p, PHB_TXE_ERR_LOG_0));
+ stat->phbTxeErrorLog1 = cpu_to_be64(phb4_read_reg_asb(p, PHB_TXE_ERR_LOG_1));
+ stat->phbRxeArbErrorStatus = cpu_to_be64(phb4_read_reg_asb(p, PHB_RXE_ARB_ERR_STATUS));
+ stat->phbRxeArbFirstErrorStatus = cpu_to_be64(phb4_read_reg_asb(p, PHB_RXE_ARB_ERR1_STATUS));
+ stat->phbRxeArbErrorLog0 = cpu_to_be64(phb4_read_reg_asb(p, PHB_RXE_ARB_ERR_LOG_0));
+ stat->phbRxeArbErrorLog1 = cpu_to_be64(phb4_read_reg_asb(p, PHB_RXE_ARB_ERR_LOG_1));
+ stat->phbRxeMrgErrorStatus = cpu_to_be64(phb4_read_reg_asb(p, PHB_RXE_MRG_ERR_STATUS));
+ stat->phbRxeMrgFirstErrorStatus = cpu_to_be64(phb4_read_reg_asb(p, PHB_RXE_MRG_ERR1_STATUS));
+ stat->phbRxeMrgErrorLog0 = cpu_to_be64(phb4_read_reg_asb(p, PHB_RXE_MRG_ERR_LOG_0));
+ stat->phbRxeMrgErrorLog1 = cpu_to_be64(phb4_read_reg_asb(p, PHB_RXE_MRG_ERR_LOG_1));
+ stat->phbRxeTceErrorStatus = cpu_to_be64(phb4_read_reg_asb(p, PHB_RXE_TCE_ERR_STATUS));
+ stat->phbRxeTceFirstErrorStatus = cpu_to_be64(phb4_read_reg_asb(p, PHB_RXE_TCE_ERR1_STATUS));
+ stat->phbRxeTceErrorLog0 = cpu_to_be64(phb4_read_reg_asb(p, PHB_RXE_TCE_ERR_LOG_0));
+ stat->phbRxeTceErrorLog1 = cpu_to_be64(phb4_read_reg_asb(p, PHB_RXE_TCE_ERR_LOG_1));
+
+ /* PHB4 REGB error registers */
+ stat->phbPblErrorStatus = cpu_to_be64(phb4_read_reg_asb(p, PHB_PBL_ERR_STATUS));
+ stat->phbPblFirstErrorStatus = cpu_to_be64(phb4_read_reg_asb(p, PHB_PBL_ERR1_STATUS));
+ stat->phbPblErrorLog0 = cpu_to_be64(phb4_read_reg_asb(p, PHB_PBL_ERR_LOG_0));
+ stat->phbPblErrorLog1 = cpu_to_be64(phb4_read_reg_asb(p, PHB_PBL_ERR_LOG_1));
+
+ stat->phbPcieDlpErrorStatus = cpu_to_be64(phb4_read_reg_asb(p, PHB_PCIE_DLP_ERR_STATUS));
+ stat->phbPcieDlpErrorLog1 = cpu_to_be64(phb4_read_reg_asb(p, PHB_PCIE_DLP_ERRLOG1));
+ stat->phbPcieDlpErrorLog2 = cpu_to_be64(phb4_read_reg_asb(p, PHB_PCIE_DLP_ERRLOG2));
+
+ stat->phbRegbErrorStatus = cpu_to_be64(phb4_read_reg_asb(p, PHB_REGB_ERR_STATUS));
+ stat->phbRegbFirstErrorStatus = cpu_to_be64(phb4_read_reg_asb(p, PHB_REGB_ERR1_STATUS));
+ stat->phbRegbErrorLog0 = cpu_to_be64(phb4_read_reg_asb(p, PHB_REGB_ERR_LOG_0));
+ stat->phbRegbErrorLog1 = cpu_to_be64(phb4_read_reg_asb(p, PHB_REGB_ERR_LOG_1));
+
+ /*
+ * Grab PESTA & B content. The error bit (bit#0) should
+	 * be fetched from IODA and the remaining content from the
+	 * memory-resident tables.
+ */
+ pPEST = (__be64 *)p->tbl_pest;
+ phb4_ioda_sel(p, IODA3_TBL_PESTA, 0, true);
+ for (i = 0; i < p->max_num_pes; i++) {
+ stat->pestA[i] = cpu_to_be64(phb4_read_reg_asb(p, PHB_IODA_DATA0));
+ stat->pestA[i] |= pPEST[2 * i];
+ }
+
+ phb4_ioda_sel(p, IODA3_TBL_PESTB, 0, true);
+ for (i = 0; i < p->max_num_pes; i++) {
+ stat->pestB[i] = cpu_to_be64(phb4_read_reg_asb(p, PHB_IODA_DATA0));
+ stat->pestB[i] |= pPEST[2 * i + 1];
+ }
+}
+
+static void __unused phb4_dump_peltv(struct phb4 *p)
+{
+ int stride = p->max_num_pes / 64;
+ uint64_t *tbl = (void *) p->tbl_peltv;
+ unsigned int pe;
+
+ PHBERR(p, "PELT-V: base addr: %p size: %llx (%d PEs, stride = %d)\n",
+ tbl, p->tbl_peltv_size, p->max_num_pes, stride);
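+	/*
+	 * With 512 PEs the stride is 8, i.e. each PE owns a 512-bit
+	 * (8 x 64-bit word) PELT-V entry; with 256 PEs it is 4.
+	 */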
+
+ for (pe = 0; pe < p->max_num_pes; pe++) {
+ unsigned int i, j;
+ uint64_t sum = 0;
+
+ i = pe * stride;
+
+ /*
+		 * Only print an entry if there are bits set in the PE's
+		 * PELT-V entry. There are a few hundred possible PEs and
+		 * generally only a handful will be in use.
+ */
+
+ for (j = 0; j < stride; j++)
+ sum |= tbl[i + j];
+ if (!sum)
+ continue; /* unused PE, skip it */
+
+ if (p->max_num_pes == 512) {
+ PHBERR(p, "PELT-V[%03x] = "
+ "%016llx %016llx %016llx %016llx"
+ "%016llx %016llx %016llx %016llx\n", pe,
+ tbl[i + 0], tbl[i + 1], tbl[i + 2], tbl[i + 3],
+ tbl[i + 4], tbl[i + 5], tbl[i + 6], tbl[i + 7]);
+ } else if (p->max_num_pes == 256) {
+ PHBERR(p, "PELT-V[%03x] = "
+ "%016llx %016llx %016llx %016llx\n", pe,
+ tbl[i + 0], tbl[i + 1], tbl[i + 2], tbl[i + 3]);
+ }
+ }
+}
+
+static void __unused phb4_dump_ioda_table(struct phb4 *p, int table)
+{
+ const char *name;
+ int entries, i;
+
+ switch (table) {
+ case IODA3_TBL_LIST:
+ name = "LIST";
+ entries = 8;
+ break;
+ case IODA3_TBL_MIST:
+ name = "MIST";
+ entries = 1024;
+ break;
+ case IODA3_TBL_RCAM:
+ name = "RCAM";
+ entries = 128;
+ break;
+ case IODA3_TBL_MRT:
+ name = "MRT";
+ entries = 16;
+ break;
+ case IODA3_TBL_PESTA:
+ name = "PESTA";
+ entries = 512;
+ break;
+ case IODA3_TBL_PESTB:
+ name = "PESTB";
+ entries = 512;
+ break;
+ case IODA3_TBL_TVT:
+ name = "TVT";
+ entries = 512;
+ break;
+ case IODA3_TBL_TCAM:
+ name = "TCAM";
+ entries = 1024;
+ break;
+ case IODA3_TBL_TDR:
+ name = "TDR";
+ entries = 1024;
+ break;
+ case IODA3_TBL_MBT: /* special case, see below */
+ name = "MBT";
+ entries = 64;
+ break;
+ case IODA3_TBL_MDT:
+ name = "MDT";
+ entries = 512;
+ break;
+ case IODA3_TBL_PEEV:
+ name = "PEEV";
+ entries = 8;
+ break;
+ default:
+ PHBERR(p, "Invalid IODA table %d!\n", table);
+ return;
+ }
+
+ PHBERR(p, "Start %s dump (only non-zero entries are printed):\n", name);
+
+ phb4_ioda_sel(p, table, 0, true);
+
+ /*
+ * Each entry in the MBT is 16 bytes. Every other table has 8 byte
+	 * entries, so we special case the MBT to keep the output readable.
+ */
+ if (table == IODA3_TBL_MBT) {
+ for (i = 0; i < 32; i++) {
+ uint64_t v1 = phb4_read_reg_asb(p, PHB_IODA_DATA0);
+ uint64_t v2 = phb4_read_reg_asb(p, PHB_IODA_DATA0);
+
+ if (!v1 && !v2)
+ continue;
+ PHBERR(p, "MBT[%03x] = %016llx %016llx\n", i, v1, v2);
+ }
+ } else {
+ for (i = 0; i < entries; i++) {
+ uint64_t v = phb4_read_reg_asb(p, PHB_IODA_DATA0);
+
+ if (!v)
+ continue;
+ PHBERR(p, "%s[%03x] = %016llx\n", name, i, v);
+ }
+ }
+
+ PHBERR(p, "End %s dump\n", name);
+}
+
+static void phb4_eeh_dump_regs(struct phb4 *p)
+{
+ struct OpalIoPhb4ErrorData *s;
+ uint16_t reg;
+ unsigned int i;
+
+ if (!verbose_eeh)
+ return;
+
+ s = zalloc(sizeof(struct OpalIoPhb4ErrorData));
+ if (!s) {
+ PHBERR(p, "Failed to allocate error info !\n");
+ return;
+ }
+ phb4_read_phb_status(p, s);
+
+ PHBERR(p, " brdgCtl = %08x\n", be32_to_cpu(s->brdgCtl));
+
+ /* PHB4 cfg regs */
+ PHBERR(p, " deviceStatus = %08x\n", be32_to_cpu(s->deviceStatus));
+ PHBERR(p, " slotStatus = %08x\n", be32_to_cpu(s->slotStatus));
+ PHBERR(p, " linkStatus = %08x\n", be32_to_cpu(s->linkStatus));
+ PHBERR(p, " devCmdStatus = %08x\n", be32_to_cpu(s->devCmdStatus));
+ PHBERR(p, " devSecStatus = %08x\n", be32_to_cpu(s->devSecStatus));
+ PHBERR(p, " rootErrorStatus = %08x\n", be32_to_cpu(s->rootErrorStatus));
+ PHBERR(p, " corrErrorStatus = %08x\n", be32_to_cpu(s->corrErrorStatus));
+ PHBERR(p, " uncorrErrorStatus = %08x\n", be32_to_cpu(s->uncorrErrorStatus));
+
+ /* Two non OPAL API registers that are useful */
+ phb4_pcicfg_read16(&p->phb, 0, p->ecap + PCICAP_EXP_DEVCTL, &reg);
+ PHBERR(p, " devctl = %08x\n", reg);
+ phb4_pcicfg_read16(&p->phb, 0, p->ecap + PCICAP_EXP_DEVSTAT,
+ &reg);
+ PHBERR(p, " devStat = %08x\n", reg);
+
+ /* Byte swap TLP headers so they are the same as the PCIe spec */
+ PHBERR(p, " tlpHdr1 = %08x\n", cpu_to_le32(be32_to_cpu(s->tlpHdr1)));
+ PHBERR(p, " tlpHdr2 = %08x\n", cpu_to_le32(be32_to_cpu(s->tlpHdr2)));
+ PHBERR(p, " tlpHdr3 = %08x\n", cpu_to_le32(be32_to_cpu(s->tlpHdr3)));
+ PHBERR(p, " tlpHdr4 = %08x\n", cpu_to_le32(be32_to_cpu(s->tlpHdr4)));
+ PHBERR(p, " sourceId = %08x\n", be32_to_cpu(s->sourceId));
+ PHBERR(p, " nFir = %016llx\n", be64_to_cpu(s->nFir));
+ PHBERR(p, " nFirMask = %016llx\n", be64_to_cpu(s->nFirMask));
+ PHBERR(p, " nFirWOF = %016llx\n", be64_to_cpu(s->nFirWOF));
+ PHBERR(p, " phbPlssr = %016llx\n", be64_to_cpu(s->phbPlssr));
+ PHBERR(p, " phbCsr = %016llx\n", be64_to_cpu(s->phbCsr));
+ PHBERR(p, " lemFir = %016llx\n", be64_to_cpu(s->lemFir));
+ PHBERR(p, " lemErrorMask = %016llx\n", be64_to_cpu(s->lemErrorMask));
+ PHBERR(p, " lemWOF = %016llx\n", be64_to_cpu(s->lemWOF));
+ PHBERR(p, " phbErrorStatus = %016llx\n", be64_to_cpu(s->phbErrorStatus));
+ PHBERR(p, " phbFirstErrorStatus = %016llx\n", be64_to_cpu(s->phbFirstErrorStatus));
+ PHBERR(p, " phbErrorLog0 = %016llx\n", be64_to_cpu(s->phbErrorLog0));
+ PHBERR(p, " phbErrorLog1 = %016llx\n", be64_to_cpu(s->phbErrorLog1));
+ PHBERR(p, " phbTxeErrorStatus = %016llx\n", be64_to_cpu(s->phbTxeErrorStatus));
+ PHBERR(p, " phbTxeFirstErrorStatus = %016llx\n", be64_to_cpu(s->phbTxeFirstErrorStatus));
+ PHBERR(p, " phbTxeErrorLog0 = %016llx\n", be64_to_cpu(s->phbTxeErrorLog0));
+ PHBERR(p, " phbTxeErrorLog1 = %016llx\n", be64_to_cpu(s->phbTxeErrorLog1));
+ PHBERR(p, " phbRxeArbErrorStatus = %016llx\n", be64_to_cpu(s->phbRxeArbErrorStatus));
+ PHBERR(p, "phbRxeArbFrstErrorStatus = %016llx\n", be64_to_cpu(s->phbRxeArbFirstErrorStatus));
+ PHBERR(p, " phbRxeArbErrorLog0 = %016llx\n", be64_to_cpu(s->phbRxeArbErrorLog0));
+ PHBERR(p, " phbRxeArbErrorLog1 = %016llx\n", be64_to_cpu(s->phbRxeArbErrorLog1));
+ PHBERR(p, " phbRxeMrgErrorStatus = %016llx\n", be64_to_cpu(s->phbRxeMrgErrorStatus));
+ PHBERR(p, "phbRxeMrgFrstErrorStatus = %016llx\n", be64_to_cpu(s->phbRxeMrgFirstErrorStatus));
+ PHBERR(p, " phbRxeMrgErrorLog0 = %016llx\n", be64_to_cpu(s->phbRxeMrgErrorLog0));
+ PHBERR(p, " phbRxeMrgErrorLog1 = %016llx\n", be64_to_cpu(s->phbRxeMrgErrorLog1));
+ PHBERR(p, " phbRxeTceErrorStatus = %016llx\n", be64_to_cpu(s->phbRxeTceErrorStatus));
+ PHBERR(p, "phbRxeTceFrstErrorStatus = %016llx\n", be64_to_cpu(s->phbRxeTceFirstErrorStatus));
+ PHBERR(p, " phbRxeTceErrorLog0 = %016llx\n", be64_to_cpu(s->phbRxeTceErrorLog0));
+ PHBERR(p, " phbRxeTceErrorLog1 = %016llx\n", be64_to_cpu(s->phbRxeTceErrorLog1));
+ PHBERR(p, " phbPblErrorStatus = %016llx\n", be64_to_cpu(s->phbPblErrorStatus));
+ PHBERR(p, " phbPblFirstErrorStatus = %016llx\n", be64_to_cpu(s->phbPblFirstErrorStatus));
+ PHBERR(p, " phbPblErrorLog0 = %016llx\n", be64_to_cpu(s->phbPblErrorLog0));
+ PHBERR(p, " phbPblErrorLog1 = %016llx\n", be64_to_cpu(s->phbPblErrorLog1));
+ PHBERR(p, " phbPcieDlpErrorLog1 = %016llx\n", be64_to_cpu(s->phbPcieDlpErrorLog1));
+ PHBERR(p, " phbPcieDlpErrorLog2 = %016llx\n", be64_to_cpu(s->phbPcieDlpErrorLog2));
+ PHBERR(p, " phbPcieDlpErrorStatus = %016llx\n", be64_to_cpu(s->phbPcieDlpErrorStatus));
+
+ PHBERR(p, " phbRegbErrorStatus = %016llx\n", be64_to_cpu(s->phbRegbErrorStatus));
+ PHBERR(p, " phbRegbFirstErrorStatus = %016llx\n", be64_to_cpu(s->phbRegbFirstErrorStatus));
+ PHBERR(p, " phbRegbErrorLog0 = %016llx\n", be64_to_cpu(s->phbRegbErrorLog0));
+ PHBERR(p, " phbRegbErrorLog1 = %016llx\n", be64_to_cpu(s->phbRegbErrorLog1));
+
+ for (i = 0; i < p->max_num_pes; i++) {
+ if (!s->pestA[i] && !s->pestB[i])
+ continue;
+ PHBERR(p, " PEST[%03x] = %016llx %016llx\n",
+ i, be64_to_cpu(s->pestA[i]), be64_to_cpu(s->pestB[i]));
+ }
+ free(s);
+}
+
+static int64_t phb4_set_pe(struct phb *phb,
+ uint64_t pe_number,
+ uint64_t bdfn,
+ uint8_t bcompare,
+ uint8_t dcompare,
+ uint8_t fcompare,
+ uint8_t action)
+{
+ struct phb4 *p = phb_to_phb4(phb);
+ uint64_t mask, idx;
+
+ /* Sanity check */
+ if (action != OPAL_MAP_PE && action != OPAL_UNMAP_PE)
+ return OPAL_PARAMETER;
+ if (pe_number >= p->num_pes || bdfn > 0xffff ||
+ bcompare > OpalPciBusAll ||
+ dcompare > OPAL_COMPARE_RID_DEVICE_NUMBER ||
+ fcompare > OPAL_COMPARE_RID_FUNCTION_NUMBER)
+ return OPAL_PARAMETER;
+
+ /* match everything by default */
+ mask = 0;
+
+ /* Figure out the RID range */
+ if (bcompare != OpalPciBusAny)
+ mask = ((0x1 << (bcompare + 1)) - 1) << (15 - bcompare);
+
+ if (dcompare == OPAL_COMPARE_RID_DEVICE_NUMBER)
+ mask |= 0xf8;
+
+ if (fcompare == OPAL_COMPARE_RID_FUNCTION_NUMBER)
+ mask |= 0x7;
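+	/*
+	 * For example, matching the full RID (bcompare == OpalPciBusAll,
+	 * i.e. all bus bits, plus device and function compares) gives
+	 * mask = 0xff00 | 0xf8 | 0x7 = 0xffff, so only the single RTT
+	 * entry equal to bdfn is updated below.
+	 */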
+
+ if (action == OPAL_UNMAP_PE)
+ pe_number = PHB4_RESERVED_PE_NUM(p);
+
+ /* Map or unmap the RTT range */
+ for (idx = 0; idx < RTT_TABLE_ENTRIES; idx++)
+ if ((idx & mask) == (bdfn & mask))
+ p->tbl_rtt[idx] = cpu_to_be16(pe_number);
+
+ /* Invalidate the RID Translation Cache (RTC) inside the PHB */
+ out_be64(p->regs + PHB_RTC_INVALIDATE, PHB_RTC_INVALIDATE_ALL);
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t phb4_set_peltv(struct phb *phb,
+ uint32_t parent_pe,
+ uint32_t child_pe,
+ uint8_t state)
+{
+ struct phb4 *p = phb_to_phb4(phb);
+ uint32_t idx, mask;
+
+ /* Sanity check */
+ if (parent_pe >= p->num_pes || child_pe >= p->num_pes)
+ return OPAL_PARAMETER;
+
+ /* Find index for parent PE */
+ idx = parent_pe * (p->max_num_pes / 8);
+ idx += (child_pe / 8);
+ mask = 0x1 << (7 - (child_pe % 8));
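+	/*
+	 * Worked example with 512 PEs: parent_pe 2, child_pe 9 gives
+	 * idx = 2 * 64 + 1 = 129 and mask = 1 << (7 - 1) = 0x40, i.e.
+	 * the child-9 bit of parent 2's PELT-V entry (children are
+	 * packed MSB first within each byte).
+	 */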
+
+ if (state)
+ p->tbl_peltv[idx] |= mask;
+ else
+ p->tbl_peltv[idx] &= ~mask;
+
+ return OPAL_SUCCESS;
+}
+
+static void phb4_prepare_link_change(struct pci_slot *slot, bool is_up)
+{
+ struct phb4 *p = phb_to_phb4(slot->phb);
+ uint32_t reg32;
+
+ p->has_link = is_up;
+
+ if (is_up) {
+ /* Clear AER receiver error status */
+ phb4_pcicfg_write32(&p->phb, 0, p->aercap +
+ PCIECAP_AER_CE_STATUS,
+ PCIECAP_AER_CE_RECVR_ERR);
+ /* Unmask receiver error status in AER */
+ phb4_pcicfg_read32(&p->phb, 0, p->aercap +
+ PCIECAP_AER_CE_MASK, &reg32);
+ reg32 &= ~PCIECAP_AER_CE_RECVR_ERR;
+ phb4_pcicfg_write32(&p->phb, 0, p->aercap +
+ PCIECAP_AER_CE_MASK, reg32);
+
+ /* Don't block PCI-CFG */
+ p->flags &= ~PHB4_CFG_BLOCKED;
+
+ /* Re-enable link down errors */
+ out_be64(p->regs + PHB_PCIE_MISC_STRAP,
+ 0x0000060000000000ull);
+
+ /* Re-enable error status indicators that trigger irqs */
+ out_be64(p->regs + PHB_REGB_ERR_INF_ENABLE,
+ 0x2130006efca8bc00ull);
+ out_be64(p->regs + PHB_REGB_ERR_ERC_ENABLE,
+ 0x0080000000000000ull);
+ out_be64(p->regs + PHB_REGB_ERR_FAT_ENABLE,
+ 0xde0fff91035743ffull);
+
+ } else {
+ /* Mask AER receiver error */
+ phb4_pcicfg_read32(&p->phb, 0, p->aercap +
+ PCIECAP_AER_CE_MASK, &reg32);
+ reg32 |= PCIECAP_AER_CE_RECVR_ERR;
+ phb4_pcicfg_write32(&p->phb, 0, p->aercap +
+ PCIECAP_AER_CE_MASK, reg32);
+
+ /* Clear error link enable & error link down kill enable */
+ out_be64(p->regs + PHB_PCIE_MISC_STRAP, 0);
+
+ /* Disable all error status indicators that trigger irqs */
+ out_be64(p->regs + PHB_REGB_ERR_INF_ENABLE, 0);
+ out_be64(p->regs + PHB_REGB_ERR_ERC_ENABLE, 0);
+ out_be64(p->regs + PHB_REGB_ERR_FAT_ENABLE, 0);
+
+ /* Block PCI-CFG access */
+ p->flags |= PHB4_CFG_BLOCKED;
+ }
+}
+
+static int64_t phb4_get_presence_state(struct pci_slot *slot, uint8_t *val)
+{
+ struct phb4 *p = phb_to_phb4(slot->phb);
+ uint64_t hps, dtctl;
+
+ /* Test for PHB in error state ? */
+ if (p->broken)
+ return OPAL_HARDWARE;
+
+ /* Check hotplug status */
+ hps = in_be64(p->regs + PHB_PCIE_HOTPLUG_STATUS);
+ if (!(hps & PHB_PCIE_HPSTAT_PRESENCE)) {
+ *val = OPAL_PCI_SLOT_PRESENT;
+ } else {
+ /*
+ * If it says not present but link is up, then we assume
+ * we are on a broken simulation environment and still
+ * return a valid presence. Otherwise, not present.
+ */
+ dtctl = in_be64(p->regs + PHB_PCIE_DLP_TRAIN_CTL);
+ if (dtctl & PHB_PCIE_DLP_TL_LINKACT) {
+ PHBERR(p, "Presence detect 0 but link set !\n");
+ *val = OPAL_PCI_SLOT_PRESENT;
+ } else {
+ *val = OPAL_PCI_SLOT_EMPTY;
+ }
+ }
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t phb4_get_link_info(struct pci_slot *slot, uint8_t *speed,
+ uint8_t *width)
+{
+ struct phb4 *p = phb_to_phb4(slot->phb);
+ uint64_t reg;
+ uint16_t state;
+ int64_t rc;
+ uint8_t s;
+
+ /* Link is up, let's find the actual speed */
+ reg = in_be64(p->regs + PHB_PCIE_DLP_TRAIN_CTL);
+ if (!(reg & PHB_PCIE_DLP_TL_LINKACT)) {
+ *width = 0;
+ if (speed)
+ *speed = 0;
+ return OPAL_SUCCESS;
+ }
+
+ rc = phb4_pcicfg_read16(&p->phb, 0,
+ p->ecap + PCICAP_EXP_LSTAT, &state);
+ if (rc != OPAL_SUCCESS) {
+ PHBERR(p, "%s: Error %lld getting link state\n", __func__, rc);
+ return OPAL_HARDWARE;
+ }
+
+ if (state & PCICAP_EXP_LSTAT_DLLL_ACT) {
+ *width = ((state & PCICAP_EXP_LSTAT_WIDTH) >> 4);
+ s = state & PCICAP_EXP_LSTAT_SPEED;
+ } else {
+ *width = 0;
+ s = 0;
+ }
+
+ if (speed)
+ *speed = s;
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t phb4_get_link_state(struct pci_slot *slot, uint8_t *val)
+{
+ return phb4_get_link_info(slot, NULL, val);
+}
+
+static int64_t phb4_retry_state(struct pci_slot *slot)
+{
+ struct phb4 *p = phb_to_phb4(slot->phb);
+
+ /* Mark link as down */
+ phb4_prepare_link_change(slot, false);
+
+ /* Last attempt to activate link */
+ if (slot->link_retries == 1) {
+ if (slot->state == PHB4_SLOT_LINK_WAIT) {
+ PHBERR(p, "Falling back to GEN1 training\n");
+ p->max_link_speed = 1;
+ }
+ }
+
+ if (!slot->link_retries--) {
+ switch (slot->state) {
+ case PHB4_SLOT_LINK_WAIT_ELECTRICAL:
+ PHBERR(p, "Presence detected but no electrical link\n");
+ break;
+ case PHB4_SLOT_LINK_WAIT:
+ PHBERR(p, "Electrical link detected but won't train\n");
+ break;
+ case PHB4_SLOT_LINK_STABLE:
+			PHBERR(p, "Link trained but was degraded or unstable\n");
+ break;
+ default:
+ PHBERR(p, "Unknown link issue\n");
+ }
+ return OPAL_HARDWARE;
+ }
+
+ pci_slot_set_state(slot, PHB4_SLOT_CRESET_START);
+ return pci_slot_set_sm_timeout(slot, msecs_to_tb(1));
+}
+
+static uint64_t phb4_train_info(struct phb4 *p, uint64_t reg, unsigned long dt)
+{
+ uint64_t ltssm_state = GETFIELD(PHB_PCIE_DLP_LTSSM_TRC, reg);
+ char s[80];
+
+ snprintf(s, sizeof(s), "TRACE:0x%016llx % 2lims",
+ reg, tb_to_msecs(dt));
+
+ if (reg & PHB_PCIE_DLP_TL_LINKACT)
+ snprintf(s, sizeof(s), "%s trained ", s);
+ else if (reg & PHB_PCIE_DLP_TRAINING)
+ snprintf(s, sizeof(s), "%s training", s);
+ else if (reg & PHB_PCIE_DLP_INBAND_PRESENCE)
+ snprintf(s, sizeof(s), "%s presence", s);
+ else
+ snprintf(s, sizeof(s), "%s ", s);
+
+ snprintf(s, sizeof(s), "%s GEN%lli:x%02lli:", s,
+ GETFIELD(PHB_PCIE_DLP_LINK_SPEED, reg),
+ GETFIELD(PHB_PCIE_DLP_LINK_WIDTH, reg));
+
+ switch (ltssm_state) {
+ case PHB_PCIE_DLP_LTSSM_RESET:
+ snprintf(s, sizeof(s), "%sreset", s);
+ break;
+ case PHB_PCIE_DLP_LTSSM_DETECT:
+ snprintf(s, sizeof(s), "%sdetect", s);
+ break;
+ case PHB_PCIE_DLP_LTSSM_POLLING:
+ snprintf(s, sizeof(s), "%spolling", s);
+ break;
+ case PHB_PCIE_DLP_LTSSM_CONFIG:
+ snprintf(s, sizeof(s), "%sconfig", s);
+ break;
+ case PHB_PCIE_DLP_LTSSM_L0:
+ snprintf(s, sizeof(s), "%sL0", s);
+ break;
+ case PHB_PCIE_DLP_LTSSM_REC:
+ snprintf(s, sizeof(s), "%srecovery", s);
+ break;
+ case PHB_PCIE_DLP_LTSSM_L1:
+ snprintf(s, sizeof(s), "%sL1", s);
+ break;
+ case PHB_PCIE_DLP_LTSSM_L2:
+ snprintf(s, sizeof(s), "%sL2", s);
+ break;
+ case PHB_PCIE_DLP_LTSSM_HOTRESET:
+ snprintf(s, sizeof(s), "%shotreset", s);
+ break;
+ case PHB_PCIE_DLP_LTSSM_DISABLED:
+ snprintf(s, sizeof(s), "%sdisabled", s);
+ break;
+ case PHB_PCIE_DLP_LTSSM_LOOPBACK:
+ snprintf(s, sizeof(s), "%sloopback", s);
+ break;
+ default:
+		snprintf(s, sizeof(s), "%sinvalid", s);
+ }
+ PHBNOTICE(p, "%s\n", s);
+
+ return ltssm_state;
+}
+
+static void phb4_dump_pec_err_regs(struct phb4 *p)
+{
+ uint64_t nfir_p_wof, nfir_n_wof, err_aib;
+ uint64_t err_rpt0, err_rpt1;
+
+ /* Read the PCI and NEST FIRs and dump them. Also cache PCI/NEST FIRs */
+ xscom_read(p->chip_id,
+ p->pci_stk_xscom + XPEC_PCI_STK_PCI_FIR, &p->pfir_cache);
+ xscom_read(p->chip_id,
+ p->pci_stk_xscom + XPEC_PCI_STK_PCI_FIR_WOF, &nfir_p_wof);
+ xscom_read(p->chip_id,
+ p->pe_stk_xscom + XPEC_NEST_STK_PCI_NFIR, &p->nfir_cache);
+ xscom_read(p->chip_id,
+ p->pe_stk_xscom + XPEC_NEST_STK_PCI_NFIR_WOF, &nfir_n_wof);
+ xscom_read(p->chip_id,
+ p->pe_stk_xscom + XPEC_NEST_STK_ERR_RPT0, &err_rpt0);
+ xscom_read(p->chip_id,
+ p->pe_stk_xscom + XPEC_NEST_STK_ERR_RPT1, &err_rpt1);
+ xscom_read(p->chip_id,
+ p->pci_stk_xscom + XPEC_PCI_STK_PBAIB_ERR_REPORT, &err_aib);
+
+ PHBERR(p, " PCI FIR=%016llx\n", p->pfir_cache);
+ PHBERR(p, " PCI FIR WOF=%016llx\n", nfir_p_wof);
+ PHBERR(p, " NEST FIR=%016llx\n", p->nfir_cache);
+ PHBERR(p, " NEST FIR WOF=%016llx\n", nfir_n_wof);
+ PHBERR(p, " ERR RPT0=%016llx\n", err_rpt0);
+ PHBERR(p, " ERR RPT1=%016llx\n", err_rpt1);
+ PHBERR(p, " AIB ERR=%016llx\n", err_aib);
+}
+
+static void phb4_dump_capp_err_regs(struct phb4 *p)
+{
+ uint64_t fir, apc_master_err, snoop_err, transport_err;
+ uint64_t tlbi_err, capp_err_status;
+ uint64_t offset = PHB4_CAPP_REG_OFFSET(p);
+
+ xscom_read(p->chip_id, CAPP_FIR + offset, &fir);
+ xscom_read(p->chip_id, CAPP_APC_MASTER_ERR_RPT + offset,
+ &apc_master_err);
+ xscom_read(p->chip_id, CAPP_SNOOP_ERR_RTP + offset, &snoop_err);
+ xscom_read(p->chip_id, CAPP_TRANSPORT_ERR_RPT + offset, &transport_err);
+ xscom_read(p->chip_id, CAPP_TLBI_ERR_RPT + offset, &tlbi_err);
+ xscom_read(p->chip_id, CAPP_ERR_STATUS_CTRL + offset, &capp_err_status);
+
+ PHBERR(p, " CAPP FIR=%016llx\n", fir);
+ PHBERR(p, "CAPP APC MASTER ERR=%016llx\n", apc_master_err);
+ PHBERR(p, " CAPP SNOOP ERR=%016llx\n", snoop_err);
+ PHBERR(p, " CAPP TRANSPORT ERR=%016llx\n", transport_err);
+ PHBERR(p, " CAPP TLBI ERR=%016llx\n", tlbi_err);
+ PHBERR(p, " CAPP ERR STATUS=%016llx\n", capp_err_status);
+}
+
+/* Check if AIB is fenced via PBCQ NFIR */
+static bool phb4_fenced(struct phb4 *p)
+{
+
+ /* Already fenced ? */
+ if (p->flags & PHB4_AIB_FENCED)
+ return true;
+
+ /*
+ * An all 1's from the PHB indicates a PHB freeze/fence. We
+	 * don't really differentiate them at this point.
+ */
+	if (in_be64(p->regs + PHB_CPU_LOADSTORE_STATUS) != 0xfffffffffffffffful)
+ return false;
+
+ /* Mark ourselves fenced */
+ p->flags |= PHB4_AIB_FENCED;
+
+ PHBERR(p, "PHB Freeze/Fence detected !\n");
+ phb4_dump_pec_err_regs(p);
+
+ /*
+ * dump capp error registers in case phb was fenced due to capp.
+ * Expect p->nfir_cache already updated in phb4_dump_pec_err_regs()
+ */
+ if (p->nfir_cache & XPEC_NEST_STK_PCI_NFIR_CXA_PE_CAPP)
+ phb4_dump_capp_err_regs(p);
+
+ phb4_eeh_dump_regs(p);
+
+ return true;
+}
+
+static bool phb4_check_reg(struct phb4 *p, uint64_t reg)
+{
+ if (reg == 0xffffffffffffffffUL)
+ return !phb4_fenced(p);
+ return true;
+}
+
+static void phb4_get_info(struct phb *phb, uint16_t bdfn, uint8_t *speed,
+ uint8_t *width)
+{
+ int32_t ecap;
+ uint32_t cap;
+
+ ecap = pci_find_cap(phb, bdfn, PCI_CFG_CAP_ID_EXP);
+ pci_cfg_read32(phb, bdfn, ecap + PCICAP_EXP_LCAP, &cap);
+ *width = (cap & PCICAP_EXP_LCAP_MAXWDTH) >> 4;
+ *speed = cap & PCICAP_EXP_LCAP_MAXSPD;
+}
+
+#define PVR_POWER9_CUMULUS 0x00002000
+
+static bool phb4_chip_retry_workaround(void)
+{
+ unsigned int pvr;
+
+ if (pci_retry_all)
+ return true;
+
+ /* Chips that need this retry are:
+ * - CUMULUS DD1.0
+ * - NIMBUS DD2.0 (and DD1.0, but it is unsupported so no check).
+ */
+ pvr = mfspr(SPR_PVR);
+ if (pvr & PVR_POWER9_CUMULUS) {
+ if ((PVR_VERS_MAJ(pvr) == 1) && (PVR_VERS_MIN(pvr) == 0))
+ return true;
+ } else { /* NIMBUS */
+ if ((PVR_VERS_MAJ(pvr) == 2) && (PVR_VERS_MIN(pvr) == 0))
+ return true;
+ }
+ return false;
+}
+
+struct pci_card_id {
+ uint16_t vendor;
+ uint16_t device;
+};
+
+static struct pci_card_id retry_allowlist[] = {
+ { 0x1000, 0x005d }, /* LSI Logic MegaRAID SAS-3 3108 */
+ { 0x1000, 0x00c9 }, /* LSI MPT SAS-3 */
+ { 0x104c, 0x8241 }, /* TI xHCI USB */
+ { 0x1077, 0x2261 }, /* QLogic ISP2722-based 16/32Gb FC */
+ { 0x10b5, 0x8725 }, /* PLX Switch: p9dsu, witherspoon */
+ { 0x10b5, 0x8748 }, /* PLX Switch: ZZ */
+ { 0x11f8, 0xf117 }, /* PMC-Sierra/MicroSemi NV1604 */
+ { 0x15b3, 0x1013 }, /* Mellanox ConnectX-4 */
+ { 0x15b3, 0x1017 }, /* Mellanox ConnectX-5 */
+ { 0x15b3, 0x1019 }, /* Mellanox ConnectX-5 Ex */
+ { 0x1a03, 0x1150 }, /* ASPEED AST2500 Switch */
+ { 0x8086, 0x10fb }, /* Intel x520 10G Eth */
+ { 0x9005, 0x028d }, /* MicroSemi PM8069 */
+};
+
+#define VENDOR(vdid) ((vdid) & 0xffff)
+#define DEVICE(vdid) (((vdid) >> 16) & 0xffff)
+
+static bool phb4_adapter_in_allowlist(uint32_t vdid)
+{
+ int i;
+
+ if (pci_retry_all)
+ return true;
+
+ for (i = 0; i < ARRAY_SIZE(retry_allowlist); i++)
+ if ((retry_allowlist[i].vendor == VENDOR(vdid)) &&
+ (retry_allowlist[i].device == DEVICE(vdid)))
+ return true;
+
+ return false;
+}
+
+static struct pci_card_id lane_eq_disable[] = {
+ { 0x10de, 0x17fd }, /* Nvidia GM200GL [Tesla M40] */
+ { 0x10de, 0x1db4 }, /* Nvidia GV100 */
+};
+
+static bool phb4_lane_eq_retry_allowlist(uint32_t vdid)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(lane_eq_disable); i++)
+ if ((lane_eq_disable[i].vendor == VENDOR(vdid)) &&
+ (lane_eq_disable[i].device == DEVICE(vdid)))
+ return true;
+ return false;
+}
+
+static void phb4_lane_eq_change(struct phb4 *p, uint32_t vdid)
+{
+ p->lane_eq_en = !phb4_lane_eq_retry_allowlist(vdid);
+}
+
+static bool phb4_link_optimal(struct pci_slot *slot, uint32_t *vdid)
+{
+ struct phb4 *p = phb_to_phb4(slot->phb);
+ uint64_t reg;
+ uint32_t id;
+ uint16_t bdfn, lane_errs;
+ uint8_t trained_speed, dev_speed, target_speed, rx_errs;
+ uint8_t trained_width, dev_width, target_width;
+ bool optimal_speed, optimal_width, optimal, retry_enabled, rx_err_ok;
+
+
+ /* Current trained state */
+ phb4_get_link_info(slot, &trained_speed, &trained_width);
+
+ /* Get device capability */
+ bdfn = 0x0100; /* bus=1 dev=0 device=0 */
+ /* Since this is the first access, we need to wait for CRS */
+ if (!pci_wait_crs(slot->phb, bdfn , &id))
+ return true;
+ phb4_get_info(slot->phb, bdfn, &dev_speed, &dev_width);
+
+ /* Work out if we are optimally trained */
+ target_speed = MIN(p->max_link_speed, dev_speed);
+ optimal_speed = (trained_speed >= target_speed);
+ target_width = MIN(p->max_link_width, dev_width);
+ optimal_width = (trained_width >= target_width);
+ optimal = optimal_width && optimal_speed;
+ retry_enabled = (phb4_chip_retry_workaround() &&
+ phb4_adapter_in_allowlist(id)) ||
+ phb4_lane_eq_retry_allowlist(id);
+ reg = in_be64(p->regs + PHB_PCIE_DLP_ERR_COUNTERS);
+ rx_errs = GETFIELD(PHB_PCIE_DLP_RX_ERR_CNT, reg);
+ rx_err_ok = (rx_errs < rx_err_max);
+ reg = in_be64(p->regs + PHB_PCIE_DLP_ERR_STATUS);
+ lane_errs = GETFIELD(PHB_PCIE_DLP_LANE_ERR, reg);
+
+ PHBDBG(p, "LINK: Card [%04x:%04x] %s Retry:%s\n", VENDOR(id),
+ DEVICE(id), optimal ? "Optimal" : "Degraded",
+ retry_enabled ? "enabled" : "disabled");
+ PHBDBG(p, "LINK: Speed Train:GEN%i PHB:GEN%i DEV:GEN%i%s\n",
+ trained_speed, p->max_link_speed, dev_speed,
+ optimal_speed ? "" : " *");
+ PHBDBG(p, "LINK: Width Train:x%02i PHB:x%02i DEV:x%02i%s\n",
+ trained_width, p->max_link_width, dev_width,
+ optimal_width ? "" : " *");
+ PHBDBG(p, "LINK: RX Errors Now:%i Max:%i Lane:0x%04x%s\n",
+ rx_errs, rx_err_max, lane_errs, rx_err_ok ? "" : " *");
+
+ if (vdid)
+ *vdid = id;
+
+ /* Always do RX error retry irrespective of chip and card */
+ if (!rx_err_ok)
+ return false;
+
+ if (!retry_enabled)
+ return true;
+
+ return optimal;
+}
+
+/*
+ * This is a trace function to watch what's happening during PCIe link
+ * training. If any errors are detected it simply returns so the
+ * normal code can deal with it.
+ */
+static void phb4_link_trace(struct phb4 *p, uint64_t target_state, int max_ms)
+{
+ unsigned long now, end, start = mftb(), state = 0;
+ uint64_t trwctl, reg, reglast = -1;
+ bool enabled;
+
+ /*
+	 * Enable the DLP trace outputs. If we don't, the LTSSM state in
+	 * PHB_PCIE_DLP_TRAIN_CTL won't be updated and will always read zero.
+ */
+ trwctl = phb4_read_reg(p, PHB_PCIE_DLP_TRWCTL);
+ enabled = !!(trwctl & PHB_PCIE_DLP_TRWCTL_EN);
+ if (!enabled) {
+ phb4_write_reg(p, PHB_PCIE_DLP_TRWCTL,
+ trwctl | PHB_PCIE_DLP_TRWCTL_EN);
+ }
+
+ end = start + msecs_to_tb(max_ms);
+ now = start;
+
+ do {
+ reg = in_be64(p->regs + PHB_PCIE_DLP_TRAIN_CTL);
+ if (reg != reglast)
+ state = phb4_train_info(p, reg, now - start);
+ reglast = reg;
+
+ if (!phb4_check_reg(p, reg)) {
+ PHBNOTICE(p, "TRACE: PHB fenced.\n");
+ goto out;
+ }
+
+ if (tb_compare(now, end) == TB_AAFTERB) {
+ PHBNOTICE(p, "TRACE: Timed out after %dms\n", max_ms);
+ goto out;
+ }
+
+ now = mftb();
+ } while (state != target_state);
+
+ PHBNOTICE(p, "TRACE: Reached target state\n");
+
+out:
+ /*
+ * The trace enable bit is a clock gate for the tracing logic. Turn
+ * it off to save power if we're not using it otherwise.
+ */
+ if (!enabled)
+ phb4_write_reg(p, PHB_PCIE_DLP_TRWCTL, trwctl);
+}
+
+/*
+ * This helper is called repeatedly by the host sync notifier mechanism, which
+ * relies on the kernel to regularly poll the OPAL_SYNC_HOST_REBOOT call as it
+ * shuts down.
+ */
+static bool phb4_host_sync_reset(void *data)
+{
+ struct phb4 *p = (struct phb4 *)data;
+ struct phb *phb = &p->phb;
+ int64_t rc = 0;
+
+ /* Make sure no-one modifies the phb flags while we are active */
+ phb_lock(phb);
+
+ /* Make sure CAPP is attached to the PHB */
+ if (p->capp)
+ /* Call phb ops to disable capi */
+ rc = phb->ops->set_capi_mode(phb, OPAL_PHB_CAPI_MODE_PCIE,
+ p->capp->attached_pe);
+ else
+ rc = OPAL_SUCCESS;
+
+ /* Continue kicking state-machine if in middle of a mode transition */
+ if (rc == OPAL_BUSY)
+ rc = phb->slot->ops.run_sm(phb->slot);
+
+ phb_unlock(phb);
+
+ return rc <= OPAL_SUCCESS;
+}
+
+/*
+ * Notification from the pci-core that a pci slot state machine completed.
+ * We use this callback to mark the CAPP disabled if we were waiting for it.
+ */
+static int64_t phb4_slot_sm_run_completed(struct pci_slot *slot, uint64_t err)
+{
+ struct phb4 *p = phb_to_phb4(slot->phb);
+
+ /* Check if we are disabling the capp */
+ if (p->flags & PHB4_CAPP_DISABLE) {
+
+		/* Unset struct capp so that we don't fall into a creset loop */
+ p->flags &= ~(PHB4_CAPP_DISABLE);
+ p->capp->phb = NULL;
+ p->capp->attached_pe = phb4_get_reserved_pe_number(&p->phb);
+
+		/* Remove the host sync notifier if we are done. */
+ opal_del_host_sync_notifier(phb4_host_sync_reset, p);
+ if (err) {
+ /* Force a CEC ipl reboot */
+ disable_fast_reboot("CAPP: reset failed");
+ PHBERR(p, "CAPP: Unable to reset. Error=%lld\n", err);
+ } else {
+ PHBINF(p, "CAPP: reset complete\n");
+ }
+ }
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t phb4_poll_link(struct pci_slot *slot)
+{
+ struct phb4 *p = phb_to_phb4(slot->phb);
+ uint64_t reg;
+ uint32_t vdid;
+
+ switch (slot->state) {
+ case PHB4_SLOT_NORMAL:
+ case PHB4_SLOT_LINK_START:
+ PHBDBG(p, "LINK: Start polling\n");
+ slot->retries = PHB4_LINK_ELECTRICAL_RETRIES;
+ pci_slot_set_state(slot, PHB4_SLOT_LINK_WAIT_ELECTRICAL);
+ /* Polling early here has no chance of a false positive */
+ return pci_slot_set_sm_timeout(slot, msecs_to_tb(1));
+ case PHB4_SLOT_LINK_WAIT_ELECTRICAL:
+ /*
+ * Wait for the link electrical connection to be
+ * established (shorter timeout). This allows us to
+		 * work around spurious presence detect on some machines
+ * without waiting 10s each time
+ *
+ * Note: We *also* check for the full link up bit here
+ * because simics doesn't seem to implement the electrical
+ * link bit at all
+ */
+ reg = in_be64(p->regs + PHB_PCIE_DLP_TRAIN_CTL);
+ if (!phb4_check_reg(p, reg)) {
+ PHBERR(p, "PHB fence waiting for electrical link\n");
+ return phb4_retry_state(slot);
+ }
+
+ if (reg & (PHB_PCIE_DLP_INBAND_PRESENCE |
+ PHB_PCIE_DLP_TL_LINKACT)) {
+ PHBDBG(p, "LINK: Electrical link detected\n");
+ pci_slot_set_state(slot, PHB4_SLOT_LINK_WAIT);
+ slot->retries = PHB4_LINK_WAIT_RETRIES;
+ /* No wait here since already have an elec link */
+ return pci_slot_set_sm_timeout(slot, msecs_to_tb(1));
+ }
+
+ if (slot->retries-- == 0) {
+ PHBDBG(p, "LINK: No in-band presence\n");
+ return OPAL_SUCCESS;
+ }
+ /* Retry */
+ return pci_slot_set_sm_timeout(slot, msecs_to_tb(10));
+ case PHB4_SLOT_LINK_WAIT:
+ reg = in_be64(p->regs + PHB_PCIE_DLP_TRAIN_CTL);
+ if (!phb4_check_reg(p, reg)) {
+ PHBERR(p, "LINK: PHB fence waiting for link training\n");
+ return phb4_retry_state(slot);
+ }
+ if (reg & PHB_PCIE_DLP_TL_LINKACT) {
+ PHBDBG(p, "LINK: Link is up\n");
+ phb4_prepare_link_change(slot, true);
+ pci_slot_set_state(slot, PHB4_SLOT_LINK_STABLE);
+ return pci_slot_set_sm_timeout(slot, secs_to_tb(1));
+ }
+
+ if (slot->retries-- == 0) {
+ PHBERR(p, "LINK: Timeout waiting for link up\n");
+ PHBDBG(p, "LINK: DLP train control: 0x%016llx\n", reg);
+ return phb4_retry_state(slot);
+ }
+ /* Retry */
+ return pci_slot_set_sm_timeout(slot, msecs_to_tb(10));
+ case PHB4_SLOT_LINK_STABLE:
+ /* Sanity check link */
+ if (phb4_fenced(p)) {
+			PHBERR(p, "LINK: PHB fenced waiting for stability\n");
+ return phb4_retry_state(slot);
+ }
+ reg = in_be64(p->regs + PHB_PCIE_DLP_TRAIN_CTL);
+ if (!phb4_check_reg(p, reg)) {
+ PHBERR(p, "LINK: PHB fence reading training control\n");
+ return phb4_retry_state(slot);
+ }
+ if (reg & PHB_PCIE_DLP_TL_LINKACT) {
+ PHBDBG(p, "LINK: Link is stable\n");
+ if (!phb4_link_optimal(slot, &vdid)) {
+ PHBDBG(p, "LINK: Link degraded\n");
+ if (slot->link_retries) {
+ phb4_lane_eq_change(p, vdid);
+ return phb4_retry_state(slot);
+ }
+ /*
+ * Link is degraded but no more retries, so
+ * settle for what we have :-(
+ */
+ PHBERR(p, "LINK: Degraded but no more retries\n");
+ }
+ pci_restore_slot_bus_configs(slot);
+ pci_slot_set_state(slot, PHB4_SLOT_NORMAL);
+ return OPAL_SUCCESS;
+ }
+		PHBERR(p, "LINK: Went down waiting for stability\n");
+ PHBDBG(p, "LINK: DLP train control: 0x%016llx\n", reg);
+ return phb4_retry_state(slot);
+ default:
+ PHBERR(p, "LINK: Unexpected slot state %08x\n",
+ slot->state);
+ }
+
+ pci_slot_set_state(slot, PHB4_SLOT_NORMAL);
+ return OPAL_HARDWARE;
+}
+
+static unsigned int phb4_get_max_link_speed(struct phb4 *p, struct dt_node *np)
+{
+ unsigned int max_link_speed, hw_max_link_speed;
+ struct proc_chip *chip;
+ chip = get_chip(p->chip_id);
+
+ hw_max_link_speed = 4;
+ if (is_phb5() && (p->index == 0 || p->index == 3))
+ hw_max_link_speed = 5;
+
+ /* Priority order: NVRAM -> dt -> GEN3 dd2.00 -> hw default */
+ max_link_speed = hw_max_link_speed;
+ if (p->rev == PHB4_REV_NIMBUS_DD20 &&
+ ((0xf & chip->ec_level) == 0) && chip->ec_rev == 0)
+ max_link_speed = 3;
+ if (np) {
+ if (dt_has_node_property(np, "ibm,max-link-speed", NULL)) {
+ max_link_speed = dt_prop_get_u32(np, "ibm,max-link-speed");
+ p->dt_max_link_speed = max_link_speed;
+ }
+ else {
+ p->dt_max_link_speed = 0;
+ }
+ }
+ else {
+ if (p->dt_max_link_speed > 0) {
+ max_link_speed = p->dt_max_link_speed;
+ }
+ }
+ if (pcie_max_link_speed)
+ max_link_speed = pcie_max_link_speed;
+ if (max_link_speed > hw_max_link_speed)
+ max_link_speed = hw_max_link_speed;
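+	/*
+	 * For example, a PHB5 stack 0 or 3 with no NVRAM or device-tree
+	 * override ends up at GEN5; other PHB5 stacks and PHB4 default
+	 * to GEN4 (or GEN3 on NIMBUS DD2.00 per the check above).
+	 */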
+
+ return max_link_speed;
+}
+
+static unsigned int __phb4_get_max_link_width(struct phb4 *p)
+{
+ uint64_t addr, reg;
+ unsigned int lane_config, width = 16;
+
+ /*
+ * On P9, only PEC2 is configurable (no-/bi-/tri-furcation)
+ */
+ switch (p->pec) {
+ case 0:
+ width = 16;
+ break;
+ case 1:
+ width = 8;
+ break;
+ case 2:
+ addr = XPEC_P9_PCI_CPLT_CONF1 + 2 * XPEC_PCI_CPLT_OFFSET;
+ xscom_read(p->chip_id, addr, &reg);
+ lane_config = GETFIELD(XPEC_P9_PCI_LANE_CFG, reg);
+
+ if (lane_config == 0b10 && p->index >= 4)
+ width = 4;
+ else
+ width = 8;
+ }
+ return width;
+}
+
+static unsigned int __phb5_get_max_link_width(struct phb4 *p)
+{
+ uint64_t addr, reg;
+ unsigned int lane_config, width = 16;
+
+ /*
+ * On P10, the 2 PECs are identical and each can have a
+ * different furcation, so we always need to check the PEC
+ * config
+ */
+ addr = XPEC_P10_PCI_CPLT_CONF1 + p->pec * XPEC_PCI_CPLT_OFFSET;
+ xscom_read(p->chip_id, addr, &reg);
+ lane_config = GETFIELD(XPEC_P10_PCI_LANE_CFG, reg);
+
+ switch (lane_config) {
+ case 0b00:
+ width = 16;
+ break;
+ case 0b01:
+ width = 8;
+ break;
+ case 0b10:
+ if (p->index == 0 || p->index == 3)
+ width = 8;
+ else
+ width = 4;
+ break;
+ default:
+ PHBERR(p, "Unexpected PEC lane config value %#x\n",
+ lane_config);
+ }
+ return width;
+}
+
+static unsigned int phb4_get_max_link_width(struct phb4 *p)
+{
+ if (is_phb5())
+ return __phb5_get_max_link_width(p);
+ else
+ return __phb4_get_max_link_width(p);
+}
+
+static void phb4_assert_perst(struct pci_slot *slot, bool assert)
+{
+ struct phb4 *p = phb_to_phb4(slot->phb);
+ uint16_t linkctl;
+ uint64_t reg;
+
+ /*
+ * Disable the link before asserting PERST. The Cursed RAID card
+ * in ozrom1 (9005:028c) has problems coming back if PERST is asserted
+ * while link is active. To work around the problem we assert the link
+ * disable bit before asserting PERST. Asserting the secondary reset
+	 * bit in the brctl register also works.
+ */
+ phb4_pcicfg_read16(&p->phb, 0, p->ecap + PCICAP_EXP_LCTL, &linkctl);
+ reg = phb4_read_reg(p, PHB_PCIE_CRESET);
+
+ if (assert) {
+ linkctl |= PCICAP_EXP_LCTL_LINK_DIS;
+ reg &= ~PHB_PCIE_CRESET_PERST_N;
+ } else {
+ linkctl &= ~PCICAP_EXP_LCTL_LINK_DIS;
+ reg |= PHB_PCIE_CRESET_PERST_N;
+ }
+
+ phb4_write_reg(p, PHB_PCIE_CRESET, reg);
+ phb4_pcicfg_write16(&p->phb, 0, p->ecap + PCICAP_EXP_LCTL, linkctl);
+}
+
+static void set_sys_disable_detect(struct phb4 *p, bool set)
+{
+ uint64_t val;
+
+ val = in_be64(p->regs + PHB_PCIE_DLP_TRAIN_CTL);
+ if (set)
+ val |= PHB_PCIE_DLP_SYS_DISABLEDETECT;
+ else
+ val &= ~PHB_PCIE_DLP_SYS_DISABLEDETECT;
+ out_be64(p->regs + PHB_PCIE_DLP_TRAIN_CTL, val);
+}
+
+static int64_t phb4_hreset(struct pci_slot *slot)
+{
+ struct phb4 *p = phb_to_phb4(slot->phb);
+ uint16_t brctl;
+ uint8_t presence = 1;
+
+ switch (slot->state) {
+ case PHB4_SLOT_NORMAL:
+ PHBDBG(p, "HRESET: Starts\n");
+ if (slot->ops.get_presence_state)
+ slot->ops.get_presence_state(slot, &presence);
+ if (!presence) {
+ PHBDBG(p, "HRESET: No device\n");
+ return OPAL_SUCCESS;
+ }
+
+ /* Workaround for HW551382 */
+ if (is_phb5()) {
+ PHBINF(p, "HRESET: Workaround for HW551382\n");
+ set_sys_disable_detect(p, true);
+ }
+
+ PHBDBG(p, "HRESET: Prepare for link down\n");
+ phb4_prepare_link_change(slot, false);
+ /* fall through */
+ case PHB4_SLOT_HRESET_START:
+ PHBDBG(p, "HRESET: Assert\n");
+
+ phb4_pcicfg_read16(&p->phb, 0, PCI_CFG_BRCTL, &brctl);
+ brctl |= PCI_CFG_BRCTL_SECONDARY_RESET;
+ phb4_pcicfg_write16(&p->phb, 0, PCI_CFG_BRCTL, brctl);
+ pci_slot_set_state(slot, PHB4_SLOT_HRESET_DELAY);
+
+ return pci_slot_set_sm_timeout(slot, secs_to_tb(1));
+ case PHB4_SLOT_HRESET_DELAY:
+ PHBDBG(p, "HRESET: Deassert\n");
+
+ /* Clear link errors before we deassert reset */
+ phb4_err_clear_regb(p);
+
+ phb4_pcicfg_read16(&p->phb, 0, PCI_CFG_BRCTL, &brctl);
+ brctl &= ~PCI_CFG_BRCTL_SECONDARY_RESET;
+ phb4_pcicfg_write16(&p->phb, 0, PCI_CFG_BRCTL, brctl);
+
+ /*
+ * Due to some oddball adapters bouncing the link
+ * training a couple of times, we wait for a full second
+ * before we start checking the link status, otherwise
+ * we can get a spurious link down interrupt which
+ * causes us to EEH immediately.
+ */
+ pci_slot_set_state(slot, PHB4_SLOT_HRESET_DELAY2);
+ return pci_slot_set_sm_timeout(slot, secs_to_tb(1));
+ case PHB4_SLOT_HRESET_DELAY2:
+ if (is_phb5())
+ set_sys_disable_detect(p, false);
+ pci_slot_set_state(slot, PHB4_SLOT_LINK_START);
+ return slot->ops.poll_link(slot);
+ default:
+ PHBERR(p, "Unexpected slot state %08x\n", slot->state);
+ }
+
+ pci_slot_set_state(slot, PHB4_SLOT_NORMAL);
+ return OPAL_HARDWARE;
+}
+
+static int64_t phb4_freset(struct pci_slot *slot)
+{
+ struct phb4 *p = phb_to_phb4(slot->phb);
+
+ switch(slot->state) {
+ case PHB4_SLOT_NORMAL:
+ case PHB4_SLOT_FRESET_START:
+ PHBDBG(p, "FRESET: Starts\n");
+
+ /* Reset max link speed for training */
+ p->max_link_speed = phb4_get_max_link_speed(p, NULL);
+
+ PHBDBG(p, "FRESET: Prepare for link down\n");
+ phb4_prepare_link_change(slot, false);
+
+ if (!p->skip_perst) {
+ /* Workaround for HW551382 */
+ if (is_phb5()) {
+ PHBINF(p, "FRESET: Workaround for HW551382\n");
+ set_sys_disable_detect(p, true);
+ }
+
+ PHBDBG(p, "FRESET: Assert\n");
+ phb4_assert_perst(slot, true);
+ pci_slot_set_state(slot, PHB4_SLOT_FRESET_ASSERT_DELAY);
+
+ /* 250ms assert time aligns with powernv */
+ return pci_slot_set_sm_timeout(slot, msecs_to_tb(250));
+ }
+
+ /* To skip the assert during boot time */
+ PHBDBG(p, "FRESET: Assert skipped\n");
+ pci_slot_set_state(slot, PHB4_SLOT_FRESET_ASSERT_DELAY);
+ p->skip_perst = false;
+ /* fall through */
+ case PHB4_SLOT_FRESET_ASSERT_DELAY:
+ /* Clear link errors before we deassert PERST */
+ phb4_err_clear_regb(p);
+
+ PHBDBG(p, "FRESET: Deassert\n");
+ phb4_assert_perst(slot, false);
+
+ if (pci_tracing)
+ phb4_link_trace(p, PHB_PCIE_DLP_LTSSM_L0, 3000);
+
+ if (is_phb5())
+ set_sys_disable_detect(p, false);
+
+ pci_slot_set_state(slot, PHB4_SLOT_LINK_START);
+ return slot->ops.poll_link(slot);
+ default:
+ PHBERR(p, "Unexpected slot state %08x\n", slot->state);
+ }
+
+ pci_slot_set_state(slot, PHB4_SLOT_NORMAL);
+ return OPAL_HARDWARE;
+}
+
+static int64_t load_capp_ucode(struct phb4 *p)
+{
+ int64_t rc;
+
+ if (p->index != CAPP0_PHB_INDEX && p->index != CAPP1_PHB_INDEX)
+ return OPAL_HARDWARE;
+
+ /* 0x434150504c494448 = 'CAPPLIDH' in ASCII */
+ rc = capp_load_ucode(p->chip_id, p->phb.opal_id, p->index,
+ 0x434150504c494448UL, PHB4_CAPP_REG_OFFSET(p),
+ CAPP_APC_MASTER_ARRAY_ADDR_REG,
+ CAPP_APC_MASTER_ARRAY_WRITE_REG,
+ CAPP_SNP_ARRAY_ADDR_REG,
+ CAPP_SNP_ARRAY_WRITE_REG);
+ return rc;
+}
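+
+/*
+ * A minimal sketch of how the 'CAPPLIDH' name quoted above maps onto the
+ * 64-bit LID identifier passed to capp_load_ucode(): each ASCII character
+ * occupies one byte, most significant byte first. This helper is purely
+ * illustrative and is not used by the code above.
+ */
+static inline uint64_t capp_lid_name_to_id(void)
+{
+	const char name[8] = { 'C', 'A', 'P', 'P', 'L', 'I', 'D', 'H' };
+	uint64_t lid = 0;
+	int i;
+
+	/* Pack the 8 characters, first character in the top byte */
+	for (i = 0; i < 8; i++)
+		lid = (lid << 8) | (uint64_t)(unsigned char)name[i];
+
+	/* lid == 0x434150504c494448UL, i.e. 'CAPPLIDH' */
+	return lid;
+}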
+
+static int do_capp_recovery_scoms(struct phb4 *p)
+{
+ uint64_t rc, reg, end;
+ uint64_t offset = PHB4_CAPP_REG_OFFSET(p);
+
+
+ /* Get the status of CAPP recovery */
+ xscom_read(p->chip_id, CAPP_ERR_STATUS_CTRL + offset, &reg);
+
+ /* No recovery in progress, ignore */
+ if ((reg & PPC_BIT(0)) == 0) {
+ PHBDBG(p, "CAPP: No recovery in progress\n");
+ return OPAL_SUCCESS;
+ }
+
+ PHBDBG(p, "CAPP: Waiting for recovery to complete\n");
+ /* recovery timer failure period 168ms */
+ end = mftb() + msecs_to_tb(168);
+ while ((reg & (PPC_BIT(1) | PPC_BIT(5) | PPC_BIT(9))) == 0) {
+
+ time_wait_ms(5);
+ xscom_read(p->chip_id, CAPP_ERR_STATUS_CTRL + offset, &reg);
+
+ if (tb_compare(mftb(), end) != TB_ABEFOREB) {
+ PHBERR(p, "CAPP: Capp recovery Timed-out.\n");
+ end = 0;
+ break;
+ }
+ }
+
+ /* Check if the recovery failed or passed */
+ if (reg & PPC_BIT(1)) {
+ uint64_t act0, act1, mask, fir;
+
+ /* Use the Action0/1 and mask to only clear the bits
+ * that cause local checkstop. Other bits need the
+ * attention of the PRD daemon.
+ */
+ xscom_read(p->chip_id, CAPP_FIR_ACTION0 + offset, &act0);
+ xscom_read(p->chip_id, CAPP_FIR_ACTION1 + offset, &act1);
+ xscom_read(p->chip_id, CAPP_FIR_MASK + offset, &mask);
+ xscom_read(p->chip_id, CAPP_FIR + offset, &fir);
+
+ fir = ~(fir & ~mask & act0 & act1);
+ PHBDBG(p, "Doing CAPP recovery scoms\n");
+
+ /* update capp fir clearing bits causing local checkstop */
+ PHBDBG(p, "Resetting CAPP Fir with mask 0x%016llX\n", fir);
+ xscom_write(p->chip_id, CAPP_FIR_CLEAR + offset, fir);
+
+ /* disable snoops */
+ xscom_write(p->chip_id, SNOOP_CAPI_CONFIG + offset, 0);
+ load_capp_ucode(p);
+
+ /* clear err rpt reg*/
+ xscom_write(p->chip_id, CAPP_ERR_RPT_CLR + offset, 0);
+
+ /* clear capp fir */
+ xscom_write(p->chip_id, CAPP_FIR + offset, 0);
+
+ /* Just reset bits 0 and 1 and don't touch any other bit */
+ xscom_read(p->chip_id, CAPP_ERR_STATUS_CTRL + offset, &reg);
+ reg &= ~(PPC_BIT(0) | PPC_BIT(1));
+ xscom_write(p->chip_id, CAPP_ERR_STATUS_CTRL + offset, reg);
+
+ PHBDBG(p, "CAPP recovery complete\n");
+ rc = OPAL_SUCCESS;
+
+ } else {
+ /* We will most likely checkstop here due to the FIR ACTION
+ * for a failed recovery, so this message would never be logged.
+ * But if we still get here, return an error forcing a
+ * fence of the PHB.
+ */
+ if (reg & PPC_BIT(5))
+ PHBERR(p, "CAPP: Capp recovery Failed\n");
+ else if (reg & PPC_BIT(9))
+ PHBERR(p, "CAPP: Capp recovery hang detected\n");
+ else if (end != 0)
+ PHBERR(p, "CAPP: Unknown recovery failure\n");
+
+ PHBDBG(p, "CAPP: Err/Status-reg=0x%016llx\n", reg);
+ rc = OPAL_HARDWARE;
+ }
+
+ return rc;
+}
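+
+/*
+ * A minimal sketch of the mask arithmetic used in the recovery path above:
+ * the bits cleared from the FIR are those that are currently set, are not
+ * masked, and have both ACTION0 and ACTION1 set (the local-checkstop bits,
+ * per the comment above). The value written to CAPP_FIR_CLEAR is the
+ * complement, so only those positions see a 0. Illustrative only.
+ */
+static inline uint64_t capp_fir_clear_pattern(uint64_t fir, uint64_t mask,
+					       uint64_t act0, uint64_t act1)
+{
+	/* Bits to clear: set in the FIR, unmasked, ACTION0=1 and ACTION1=1 */
+	uint64_t to_clear = fir & ~mask & act0 & act1;
+
+	/* Same expression as "fir = ~(fir & ~mask & act0 & act1)" above */
+	return ~to_clear;
+}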
+
+/*
+ * Disable CAPI mode on a PHB. Must be done while PHB is fenced and
+ * not in recovery.
+ */
+static void disable_capi_mode(struct phb4 *p)
+{
+ uint64_t reg;
+ struct capp *capp = p->capp;
+
+ PHBINF(p, "CAPP: Deactivating\n");
+
+ /* Check if CAPP attached to the PHB and active */
+ if (!capp || capp->phb != &p->phb) {
+ PHBDBG(p, "CAPP: Not attached to this PHB!\n");
+ return;
+ }
+
+ xscom_read(p->chip_id, p->pe_xscom + XPEC_NEST_CAPP_CNTL, &reg);
+ if (!(reg & PPC_BIT(0))) {
+ /* Not in CAPI mode, no action required */
+ PHBERR(p, "CAPP: Not enabled!\n");
+ return;
+ }
+
+ /* CAPP should already be out of recovery in this function */
+ capp_xscom_read(capp, CAPP_ERR_STATUS_CTRL, &reg);
+ if (reg & PPC_BIT(0)) {
+ PHBERR(p, "CAPP: Can't disable while still in recovery!\n");
+ return;
+ }
+
+ PHBINF(p, "CAPP: Disabling CAPI mode\n");
+
+ /* First Phase Reset CAPP Registers */
+ /* CAPP is about to be disabled: mark TLBI_FENCED and tlbi_psl_is_dead */
+ capp_xscom_write(capp, CAPP_ERR_STATUS_CTRL, PPC_BIT(3) | PPC_BIT(4));
+
+ /* Flush SUE uOP1 Register */
+ if (p->rev != PHB4_REV_NIMBUS_DD10)
+ capp_xscom_write(capp, FLUSH_SUE_UOP1, 0);
+
+ /* Release DMA/STQ engines */
+ capp_xscom_write(capp, APC_FSM_READ_MASK, 0ull);
+ capp_xscom_write(capp, XPT_FSM_RMM, 0ull);
+
+ /* Disable snoop */
+ capp_xscom_write(capp, SNOOP_CAPI_CONFIG, 0);
+
+ /* Clear flush SUE state map register */
+ capp_xscom_write(capp, FLUSH_SUE_STATE_MAP, 0);
+
+ /* Disable epoch timer */
+ capp_xscom_write(capp, EPOCH_RECOVERY_TIMERS_CTRL, 0);
+
+ /* CAPP Transport Control Register */
+ capp_xscom_write(capp, TRANSPORT_CONTROL, PPC_BIT(15));
+
+ /* Disable snooping */
+ capp_xscom_write(capp, SNOOP_CONTROL, 0);
+ capp_xscom_write(capp, SNOOP_CAPI_CONFIG, 0);
+
+ /* APC Master PB Control Register - disable examining cResps */
+ capp_xscom_write(capp, APC_MASTER_PB_CTRL, 0);
+
+ /* APC Master Config Register - de-select PHBs */
+ xscom_write_mask(p->chip_id, capp->capp_xscom_offset +
+ APC_MASTER_CAPI_CTRL, 0, PPC_BITMASK(2, 3));
+
+ /* Clear all error registers */
+ capp_xscom_write(capp, CAPP_ERR_RPT_CLR, 0);
+ capp_xscom_write(capp, CAPP_FIR, 0);
+ capp_xscom_write(capp, CAPP_FIR_ACTION0, 0);
+ capp_xscom_write(capp, CAPP_FIR_ACTION1, 0);
+ capp_xscom_write(capp, CAPP_FIR_MASK, 0);
+
+ /* Second Phase Reset PEC/PHB Registers */
+
+ /* Reset the stack overrides if any */
+ xscom_write(p->chip_id, p->pci_xscom + XPEC_PCI_PRDSTKOVR, 0);
+ xscom_write(p->chip_id, p->pe_xscom +
+ XPEC_NEST_READ_STACK_OVERRIDE, 0);
+
+ /* PE Bus AIB Mode Bits. Disable Tracing. Leave HOL Blocking as it is */
+ if (!(p->rev == PHB4_REV_NIMBUS_DD10) && p->index == CAPP1_PHB_INDEX)
+ xscom_write_mask(p->chip_id,
+ p->pci_xscom + XPEC_PCI_PBAIB_HW_CONFIG, 0,
+ PPC_BIT(30));
+
+ /* Reset for PCI to PB data movement */
+ xscom_write_mask(p->chip_id, p->pe_xscom + XPEC_NEST_PBCQ_HW_CONFIG,
+ 0, XPEC_NEST_PBCQ_HW_CONFIG_PBINIT);
+
+ /* Disable CAPP mode in PEC CAPP Control Register */
+ xscom_write(p->chip_id, p->pe_xscom + XPEC_NEST_CAPP_CNTL, 0ull);
+}
+
+static int64_t phb4_creset(struct pci_slot *slot)
+{
+ struct phb4 *p = phb_to_phb4(slot->phb);
+ struct capp *capp = p->capp;
+ uint64_t pbcq_status;
+ uint64_t creset_time, wait_time;
+
+ /* Don't even try fixing a broken PHB */
+ if (p->broken)
+ return OPAL_HARDWARE;
+
+ switch (slot->state) {
+ case PHB4_SLOT_NORMAL:
+ case PHB4_SLOT_CRESET_START:
+ PHBDBG(p, "CRESET: Starts\n");
+
+ p->creset_start_time = mftb();
+
+ /* Workaround for HW551382 */
+ if (is_phb5()) {
+ PHBINF(p, "CRESET: Workaround for HW551382\n");
+ set_sys_disable_detect(p, true);
+ }
+
+ phb4_prepare_link_change(slot, false);
+ /* Clear error inject register, preventing recursive errors */
+ xscom_write(p->chip_id, p->pe_xscom + 0x2, 0x0);
+
+ /* Prevent HMI when PHB gets fenced as we are disabling CAPP */
+ if (p->flags & PHB4_CAPP_DISABLE &&
+ capp && capp->phb == slot->phb) {
+ /* Since no HMI, So set the recovery flag manually. */
+ p->flags |= PHB4_CAPP_RECOVERY;
+ xscom_write_mask(p->chip_id, capp->capp_xscom_offset +
+ CAPP_FIR_MASK,
+ PPC_BIT(31), PPC_BIT(31));
+ }
+
+ /* Force fence on the PHB to work around a non-existent PE */
+ if (!phb4_fenced(p))
+ xscom_write(p->chip_id, p->pe_stk_xscom + 0x2,
+ 0x0000002000000000UL);
+
+ /*
+ * Force use of ASB for register access until the PHB has
+ * been fully reset.
+ */
+ p->flags |= PHB4_CFG_USE_ASB | PHB4_AIB_FENCED;
+
+ /* Assert PERST before clearing errors */
+ phb4_assert_perst(slot, true);
+
+ /* Clear errors, following the proper sequence */
+ phb4_err_clear(p);
+
+ /* Actual reset */
+ p->flags |= PHB4_ETU_IN_RESET;
+ xscom_write(p->chip_id, p->pci_stk_xscom + XPEC_PCI_STK_ETU_RESET,
+ 0x8000000000000000UL);
+
+ /* Read errors in PFIR and NFIR */
+ xscom_read(p->chip_id, p->pci_stk_xscom + 0x0, &p->pfir_cache);
+ xscom_read(p->chip_id, p->pe_stk_xscom + 0x0, &p->nfir_cache);
+
+ pci_slot_set_state(slot, PHB4_SLOT_CRESET_WAIT_CQ);
+ slot->retries = 500;
+ return pci_slot_set_sm_timeout(slot, msecs_to_tb(10));
+ case PHB4_SLOT_CRESET_WAIT_CQ:
+
+ // Wait until operations are complete
+ xscom_read(p->chip_id, p->pe_stk_xscom + 0xc, &pbcq_status);
+ if (!(pbcq_status & 0xC000000000000000UL)) {
+ PHBDBG(p, "CRESET: No pending transactions\n");
+
+ /* capp recovery */
+ if ((p->flags & PHB4_CAPP_RECOVERY) &&
+ (do_capp_recovery_scoms(p) != OPAL_SUCCESS))
+ goto error;
+
+ if (p->flags & PHB4_CAPP_DISABLE)
+ disable_capi_mode(p);
+
+ /* Clear errors in PFIR and NFIR */
+ xscom_write(p->chip_id, p->pci_stk_xscom + 0x1,
+ ~p->pfir_cache);
+ xscom_write(p->chip_id, p->pe_stk_xscom + 0x1,
+ ~p->nfir_cache);
+
+ /* Re-read errors in PFIR and NFIR and reset any new
+ * error reported.
+ */
+ xscom_read(p->chip_id, p->pci_stk_xscom +
+ XPEC_PCI_STK_PCI_FIR, &p->pfir_cache);
+ xscom_read(p->chip_id, p->pe_stk_xscom +
+ XPEC_NEST_STK_PCI_NFIR, &p->nfir_cache);
+
+ if (p->pfir_cache || p->nfir_cache) {
+ PHBERR(p, "CRESET: PHB still fenced !!\n");
+ phb4_dump_pec_err_regs(p);
+
+ /* Reset the PHB errors */
+ xscom_write(p->chip_id, p->pci_stk_xscom +
+ XPEC_PCI_STK_PCI_FIR, 0);
+ xscom_write(p->chip_id, p->pe_stk_xscom +
+ XPEC_NEST_STK_PCI_NFIR, 0);
+ }
+
+ /* Clear PHB from reset */
+ xscom_write(p->chip_id,
+ p->pci_stk_xscom + XPEC_PCI_STK_ETU_RESET, 0x0);
+ p->flags &= ~PHB4_ETU_IN_RESET;
+
+ pci_slot_set_state(slot, PHB4_SLOT_CRESET_REINIT);
+ /* After lifting PHB reset, wait while logic settles */
+ return pci_slot_set_sm_timeout(slot, msecs_to_tb(10));
+ }
+
+ if (slot->retries-- == 0) {
+ PHBERR(p, "Timeout waiting for pending transaction\n");
+ goto error;
+ }
+ return pci_slot_set_sm_timeout(slot, msecs_to_tb(100));
+ case PHB4_SLOT_CRESET_REINIT:
+ PHBDBG(p, "CRESET: Reinitialization\n");
+ p->flags &= ~PHB4_AIB_FENCED;
+ p->flags &= ~PHB4_CAPP_RECOVERY;
+ p->flags &= ~PHB4_CFG_USE_ASB;
+ phb4_init_hw(p);
+ pci_slot_set_state(slot, PHB4_SLOT_CRESET_FRESET);
+
+ /*
+ * The PERST is sticky across resets, but LINK_DIS isn't.
+ * Re-assert it here now that we've reset the PHB.
+ */
+ phb4_assert_perst(slot, true);
+
+ /*
+ * wait either 100ms (for the ETU logic) or until we've had
+ * PERST asserted for 250ms.
+ */
+ creset_time = tb_to_msecs(mftb() - p->creset_start_time);
+ if (creset_time < 250)
+ wait_time = MAX(100, 250 - creset_time);
+ else
+ wait_time = 100;
+ PHBDBG(p, "CRESET: wait_time = %lld\n", wait_time);
+ return pci_slot_set_sm_timeout(slot, msecs_to_tb(wait_time));
+
+ case PHB4_SLOT_CRESET_FRESET:
+ /*
+ * We asserted PERST at the beginning of the CRESET and we
+ * have waited long enough, so we can skip it in the freset
+ * procedure.
+ */
+ p->skip_perst = true;
+ pci_slot_set_state(slot, PHB4_SLOT_NORMAL);
+ return slot->ops.freset(slot);
+ default:
+ PHBERR(p, "CRESET: Unexpected slot state %08x, resetting...\n",
+ slot->state);
+ pci_slot_set_state(slot, PHB4_SLOT_NORMAL);
+ return slot->ops.creset(slot);
+
+ }
+
+error:
+ /* Mark the PHB as dead and expect it to be removed */
+ p->broken = true;
+ return OPAL_HARDWARE;
+}
+
+/*
+ * Initialize the root complex slot, which is mainly used to do a
+ * fundamental reset before PCI enumeration in the PCI core.
+ * When probing the root complex and building its real slot,
+ * the operations will be copied over.
+ */
+static struct pci_slot *phb4_slot_create(struct phb *phb)
+{
+ struct pci_slot *slot;
+
+ slot = pci_slot_alloc(phb, NULL);
+ if (!slot)
+ return slot;
+
+ /* Elementary functions */
+ slot->ops.get_presence_state = phb4_get_presence_state;
+ slot->ops.get_link_state = phb4_get_link_state;
+ slot->ops.get_power_state = NULL;
+ slot->ops.get_attention_state = NULL;
+ slot->ops.get_latch_state = NULL;
+ slot->ops.set_power_state = NULL;
+ slot->ops.set_attention_state = NULL;
+
+ /*
+ * For PHB slots, we have to split the fundamental reset
+ * into 2 steps. We might not have the first step, which
+ * is to power the slot off and on, or it may be controlled
+ * by individual platforms.
+ */
+ slot->ops.prepare_link_change = phb4_prepare_link_change;
+ slot->ops.poll_link = phb4_poll_link;
+ slot->ops.hreset = phb4_hreset;
+ slot->ops.freset = phb4_freset;
+ slot->ops.creset = phb4_creset;
+ slot->ops.completed_sm_run = phb4_slot_sm_run_completed;
+ slot->link_retries = PHB4_LINK_LINK_RETRIES;
+
+ return slot;
+}
+
+static void phb4_int_unmask_all(struct phb4 *p)
+{
+ /* Init_126..130 - Re-enable error interrupts */
+ out_be64(p->regs + PHB_ERR_IRQ_ENABLE, 0xca8880cc00000000ull);
+
+ if (is_phb5())
+ out_be64(p->regs + PHB_TXE_ERR_IRQ_ENABLE, 0x200850be08200020ull);
+ else
+ out_be64(p->regs + PHB_TXE_ERR_IRQ_ENABLE, 0x2008400e08200000ull);
+ out_be64(p->regs + PHB_RXE_ARB_ERR_IRQ_ENABLE, 0xc40038fc01804070ull);
+ out_be64(p->regs + PHB_RXE_MRG_ERR_IRQ_ENABLE, 0x00006100008000a8ull);
+ out_be64(p->regs + PHB_RXE_TCE_ERR_IRQ_ENABLE, 0x60510050c0000000ull);
+}
+
+/*
+ * Mask the IRQ for any currently set error bits. This prevents the PHB's ERR
+ * and INF interrupts from being re-fired before the kernel can handle the
+ * underlying condition.
+ */
+static void phb4_int_mask_active(struct phb4 *p)
+{
+ const uint64_t error_regs[] = {
+ PHB_ERR_STATUS,
+ PHB_TXE_ERR_STATUS,
+ PHB_RXE_ARB_ERR_STATUS,
+ PHB_RXE_MRG_ERR_STATUS,
+ PHB_RXE_TCE_ERR_STATUS
+ };
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(error_regs); i++) {
+ uint64_t stat, mask;
+
+ /* The IRQ mask reg is always offset 0x20 from the status reg */
+ stat = phb4_read_reg(p, error_regs[i]);
+ mask = phb4_read_reg(p, error_regs[i] + 0x20);
+
+ phb4_write_reg(p, error_regs[i] + 0x20, mask & ~stat);
+ }
+}
+
+static uint64_t phb4_get_pesta(struct phb4 *p, uint64_t pe_number)
+{
+ uint64_t pesta;
+ __be64 *pPEST;
+
+ pPEST = (__be64 *)p->tbl_pest;
+
+ phb4_ioda_sel(p, IODA3_TBL_PESTA, pe_number, false);
+ pesta = phb4_read_reg(p, PHB_IODA_DATA0);
+ if (pesta & IODA3_PESTA_MMIO_FROZEN)
+ pesta |= be64_to_cpu(pPEST[2*pe_number]);
+
+ return pesta;
+}
+
+/* Check if the chip requires escalating a freeze to fence on MMIO loads */
+static bool phb4_escalation_required(void)
+{
+ uint64_t pvr = mfspr(SPR_PVR);
+
+ /* Only on Power9 */
+ if (proc_gen != proc_gen_p9)
+ return false;
+
+ /*
+ * Escalation is required on the following chip versions:
+ * - Cumulus DD1.0
+ * - Nimbus DD2.0, DD2.1 (and DD1.0, but it is unsupported so no check).
+ */
+ if (pvr & PVR_POWER9_CUMULUS) {
+ if (PVR_VERS_MAJ(pvr) == 1 && PVR_VERS_MIN(pvr) == 0)
+ return true;
+ } else { /* Nimbus */
+ if (PVR_VERS_MAJ(pvr) == 2 && PVR_VERS_MIN(pvr) < 2)
+ return true;
+ }
+
+ return false;
+}
+
+static bool phb4_freeze_escalate(uint64_t pesta)
+{
+ if ((GETFIELD(IODA3_PESTA_TRANS_TYPE, pesta) ==
+ IODA3_PESTA_TRANS_TYPE_MMIOLOAD) &&
+ (pesta & (IODA3_PESTA_CA_CMPLT_TMT | IODA3_PESTA_UR)))
+ return true;
+ return false;
+}
+
+static int64_t phb4_eeh_freeze_status(struct phb *phb, uint64_t pe_number,
+ uint8_t *freeze_state,
+ uint16_t *pci_error_type,
+ uint16_t *severity)
+{
+ struct phb4 *p = phb_to_phb4(phb);
+ uint64_t peev_bit = PPC_BIT(pe_number & 0x3f);
+ uint64_t peev, pesta, pestb;
+
+ /* Defaults: not frozen */
+ *freeze_state = OPAL_EEH_STOPPED_NOT_FROZEN;
+ *pci_error_type = OPAL_EEH_NO_ERROR;
+
+ /* Check dead */
+ if (p->broken) {
+ *freeze_state = OPAL_EEH_STOPPED_MMIO_DMA_FREEZE;
+ *pci_error_type = OPAL_EEH_PHB_ERROR;
+ if (severity)
+ *severity = OPAL_EEH_SEV_PHB_DEAD;
+ return OPAL_HARDWARE;
+ }
+
+ /* Check fence and CAPP recovery */
+ if (phb4_fenced(p) || (p->flags & PHB4_CAPP_RECOVERY)) {
+ *freeze_state = OPAL_EEH_STOPPED_MMIO_DMA_FREEZE;
+ *pci_error_type = OPAL_EEH_PHB_ERROR;
+ if (severity)
+ *severity = OPAL_EEH_SEV_PHB_FENCED;
+ return OPAL_SUCCESS;
+ }
+
+ /* Check the PEEV */
+ phb4_ioda_sel(p, IODA3_TBL_PEEV, pe_number / 64, false);
+ peev = in_be64(p->regs + PHB_IODA_DATA0);
+ if (!(peev & peev_bit))
+ return OPAL_SUCCESS;
+
+ /* Indicate that we have an ER pending */
+ phb4_set_err_pending(p, true);
+ if (severity)
+ *severity = OPAL_EEH_SEV_PE_ER;
+
+ /* Read the full PESTA */
+ pesta = phb4_get_pesta(p, pe_number);
+ /* Check if we need to escalate to fence */
+ if (phb4_escalation_required() && phb4_freeze_escalate(pesta)) {
+ PHBERR(p, "Escalating freeze to fence PESTA[%lli]=%016llx\n",
+ pe_number, pesta);
+ *severity = OPAL_EEH_SEV_PHB_FENCED;
+ *pci_error_type = OPAL_EEH_PHB_ERROR;
+ }
+
+ /* Read the PESTB in the PHB */
+ phb4_ioda_sel(p, IODA3_TBL_PESTB, pe_number, false);
+ pestb = phb4_read_reg(p, PHB_IODA_DATA0);
+
+ /* Convert PESTA/B to freeze_state */
+ if (pesta & IODA3_PESTA_MMIO_FROZEN)
+ *freeze_state |= OPAL_EEH_STOPPED_MMIO_FREEZE;
+ if (pestb & IODA3_PESTB_DMA_STOPPED)
+ *freeze_state |= OPAL_EEH_STOPPED_DMA_FREEZE;
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t phb4_eeh_freeze_clear(struct phb *phb, uint64_t pe_number,
+ uint64_t eeh_action_token)
+{
+ struct phb4 *p = phb_to_phb4(phb);
+ uint64_t err, peev;
+ int32_t i;
+ bool frozen_pe = false;
+
+ if (p->broken)
+ return OPAL_HARDWARE;
+
+ /* Check the summary. If nothing is set, move on to clearing the
+ * PESTs, which can contain a freeze state from a previous error
+ * or one set explicitly by the user
+ */
+ err = in_be64(p->regs + PHB_ETU_ERR_SUMMARY);
+ if (err == 0xffffffffffffffffUL) {
+ if (phb4_fenced(p)) {
+ PHBERR(p, "eeh_freeze_clear on fenced PHB\n");
+ return OPAL_HARDWARE;
+ }
+ }
+ if (err != 0)
+ phb4_err_clear(p);
+
+ /*
+ * The PEEV also lives in system memory; accessing it there
+ * directly would perform better.
+ */
+ if (eeh_action_token & OPAL_EEH_ACTION_CLEAR_FREEZE_MMIO) {
+ phb4_ioda_sel(p, IODA3_TBL_PESTA, pe_number, false);
+ out_be64(p->regs + PHB_IODA_DATA0, 0);
+ }
+ if (eeh_action_token & OPAL_EEH_ACTION_CLEAR_FREEZE_DMA) {
+ phb4_ioda_sel(p, IODA3_TBL_PESTB, pe_number, false);
+ out_be64(p->regs + PHB_IODA_DATA0, 0);
+ }
+
+
+ /* Update ER pending indication */
+ phb4_ioda_sel(p, IODA3_TBL_PEEV, 0, true);
+ for (i = 0; i < p->num_pes/64; i++) {
+ peev = in_be64(p->regs + PHB_IODA_DATA0);
+ if (peev) {
+ frozen_pe = true;
+ break;
+ }
+ }
+ if (frozen_pe) {
+ p->err.err_src = PHB4_ERR_SRC_PHB;
+ p->err.err_class = PHB4_ERR_CLASS_ER;
+ p->err.err_bit = -1;
+ phb4_set_err_pending(p, true);
+ } else
+ phb4_set_err_pending(p, false);
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t phb4_eeh_freeze_set(struct phb *phb, uint64_t pe_number,
+ uint64_t eeh_action_token)
+{
+ struct phb4 *p = phb_to_phb4(phb);
+ uint64_t data;
+
+ if (p->broken)
+ return OPAL_HARDWARE;
+
+ if (pe_number >= p->num_pes)
+ return OPAL_PARAMETER;
+
+ if (eeh_action_token != OPAL_EEH_ACTION_SET_FREEZE_MMIO &&
+ eeh_action_token != OPAL_EEH_ACTION_SET_FREEZE_DMA &&
+ eeh_action_token != OPAL_EEH_ACTION_SET_FREEZE_ALL)
+ return OPAL_PARAMETER;
+
+ if (eeh_action_token & OPAL_EEH_ACTION_SET_FREEZE_MMIO) {
+ phb4_ioda_sel(p, IODA3_TBL_PESTA, pe_number, false);
+ data = in_be64(p->regs + PHB_IODA_DATA0);
+ data |= IODA3_PESTA_MMIO_FROZEN;
+ out_be64(p->regs + PHB_IODA_DATA0, data);
+ }
+
+ if (eeh_action_token & OPAL_EEH_ACTION_SET_FREEZE_DMA) {
+ phb4_ioda_sel(p, IODA3_TBL_PESTB, pe_number, false);
+ data = in_be64(p->regs + PHB_IODA_DATA0);
+ data |= IODA3_PESTB_DMA_STOPPED;
+ out_be64(p->regs + PHB_IODA_DATA0, data);
+ }
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t phb4_eeh_next_error(struct phb *phb,
+ uint64_t *first_frozen_pe,
+ uint16_t *pci_error_type,
+ uint16_t *severity)
+{
+ struct phb4 *p = phb_to_phb4(phb);
+ uint64_t peev, pesta;
+ uint32_t peev_size = p->num_pes/64;
+ int32_t i, j;
+
+ /* If the PHB is broken, we needn't go forward */
+ if (p->broken) {
+ *pci_error_type = OPAL_EEH_PHB_ERROR;
+ *severity = OPAL_EEH_SEV_PHB_DEAD;
+ return OPAL_SUCCESS;
+ }
+
+ if ((p->flags & PHB4_CAPP_RECOVERY)) {
+ *pci_error_type = OPAL_EEH_PHB_ERROR;
+ *severity = OPAL_EEH_SEV_PHB_FENCED;
+ return OPAL_SUCCESS;
+ }
+
+ /*
+ * Check if we already have pending errors. If so, gather
+ * more information about them, checking the PBCQ before
+ * the PHB.
+ */
+ if (phb4_err_pending(p) /*&&
+ !phb4_err_check_pbcq(p) &&
+ !phb4_err_check_lem(p) */)
+ phb4_set_err_pending(p, false);
+
+ /* Clear result */
+ *pci_error_type = OPAL_EEH_NO_ERROR;
+ *severity = OPAL_EEH_SEV_NO_ERROR;
+ *first_frozen_pe = (uint64_t)-1;
+
+ /* Check frozen PEs */
+ if (!phb4_err_pending(p)) {
+ phb4_ioda_sel(p, IODA3_TBL_PEEV, 0, true);
+ for (i = 0; i < peev_size; i++) {
+ peev = in_be64(p->regs + PHB_IODA_DATA0);
+ if (peev) {
+ p->err.err_src = PHB4_ERR_SRC_PHB;
+ p->err.err_class = PHB4_ERR_CLASS_ER;
+ p->err.err_bit = -1;
+ phb4_set_err_pending(p, true);
+ break;
+ }
+ }
+ }
+
+ if (!phb4_err_pending(p))
+ return OPAL_SUCCESS;
+ /*
+ * If the frozen PE is caused by a malfunctioning TLP, we
+ * need to reset the PHB, so convert the ER into a PHB-fatal
+ * error in that case.
+ */
+ if (p->err.err_class == PHB4_ERR_CLASS_ER) {
+ for (i = peev_size - 1; i >= 0; i--) {
+ phb4_ioda_sel(p, IODA3_TBL_PEEV, i, false);
+ peev = in_be64(p->regs + PHB_IODA_DATA0);
+ for (j = 0; j < 64; j++) {
+ if (peev & PPC_BIT(j)) {
+ *first_frozen_pe = i * 64 + j;
+ break;
+ }
+ }
+ if (*first_frozen_pe != (uint64_t)(-1))
+ break;
+ }
+ }
+
+ if (*first_frozen_pe != (uint64_t)(-1)) {
+ pesta = phb4_get_pesta(p, *first_frozen_pe);
+ if (phb4_escalation_required() && phb4_freeze_escalate(pesta)) {
+ PHBINF(p, "Escalating freeze to fence. PESTA[%lli]=%016llx\n",
+ *first_frozen_pe, pesta);
+ p->err.err_class = PHB4_ERR_CLASS_FENCED;
+ }
+ }
+
+ switch (p->err.err_class) {
+ case PHB4_ERR_CLASS_DEAD:
+ *pci_error_type = OPAL_EEH_PHB_ERROR;
+ *severity = OPAL_EEH_SEV_PHB_DEAD;
+ break;
+ case PHB4_ERR_CLASS_FENCED:
+ *pci_error_type = OPAL_EEH_PHB_ERROR;
+ *severity = OPAL_EEH_SEV_PHB_FENCED;
+ break;
+ case PHB4_ERR_CLASS_ER:
+ *pci_error_type = OPAL_EEH_PE_ERROR;
+ *severity = OPAL_EEH_SEV_PE_ER;
+
+ /* No frozen PE ? */
+ if (*first_frozen_pe == (uint64_t)-1) {
+ *pci_error_type = OPAL_EEH_NO_ERROR;
+ *severity = OPAL_EEH_SEV_NO_ERROR;
+ phb4_set_err_pending(p, false);
+ }
+
+ break;
+ case PHB4_ERR_CLASS_INF:
+ *pci_error_type = OPAL_EEH_PHB_ERROR;
+ *severity = OPAL_EEH_SEV_INF;
+ break;
+ default:
+ *pci_error_type = OPAL_EEH_NO_ERROR;
+ *severity = OPAL_EEH_SEV_NO_ERROR;
+ phb4_set_err_pending(p, false);
+ }
+
+ /*
+ * Unmask all our error interrupts once all pending errors
+ * have been handled.
+ */
+ if (!phb4_err_pending(p))
+ phb4_int_unmask_all(p);
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t phb4_err_inject_finalize(struct phb4 *phb, uint64_t addr,
+ uint64_t mask, uint64_t ctrl,
+ bool is_write)
+{
+ if (is_write)
+ ctrl |= PHB_PAPR_ERR_INJ_CTL_WR;
+ else
+ ctrl |= PHB_PAPR_ERR_INJ_CTL_RD;
+
+ out_be64(phb->regs + PHB_PAPR_ERR_INJ_ADDR, addr);
+ out_be64(phb->regs + PHB_PAPR_ERR_INJ_MASK, mask);
+ out_be64(phb->regs + PHB_PAPR_ERR_INJ_CTL, ctrl);
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t phb4_err_inject_mem32(struct phb4 *phb __unused,
+ uint64_t pe_number __unused,
+ uint64_t addr __unused,
+ uint64_t mask __unused,
+ bool is_write __unused)
+{
+ return OPAL_UNSUPPORTED;
+}
+
+static int64_t phb4_err_inject_mem64(struct phb4 *phb __unused,
+ uint64_t pe_number __unused,
+ uint64_t addr __unused,
+ uint64_t mask __unused,
+ bool is_write __unused)
+{
+ return OPAL_UNSUPPORTED;
+}
+
+static int64_t phb4_err_inject_cfg(struct phb4 *phb, uint64_t pe_number,
+ uint64_t addr, uint64_t mask,
+ bool is_write)
+{
+ uint64_t a, m, prefer, ctrl;
+ int bdfn;
+ bool is_bus_pe = false;
+
+ a = 0xffffull;
+ prefer = 0xffffull;
+ m = PHB_PAPR_ERR_INJ_MASK_CFG_ALL;
+ ctrl = PHB_PAPR_ERR_INJ_CTL_CFG;
+
+ for (bdfn = 0; bdfn < RTT_TABLE_ENTRIES; bdfn++) {
+ if (be16_to_cpu(phb->tbl_rtt[bdfn]) != pe_number)
+ continue;
+
+ /* The PE can be associated with PCI bus or device */
+ is_bus_pe = false;
+ if ((bdfn + 8) < RTT_TABLE_ENTRIES &&
+ be16_to_cpu(phb->tbl_rtt[bdfn + 8]) == pe_number)
+ is_bus_pe = true;
+
+ /* Figure out the PCI config address */
+ if (prefer == 0xffffull) {
+ if (is_bus_pe) {
+ m = PHB_PAPR_ERR_INJ_MASK_CFG;
+ prefer = SETFIELD(m, 0x0ull, PCI_BUS_NUM(bdfn));
+ } else {
+ m = PHB_PAPR_ERR_INJ_MASK_CFG_ALL;
+ prefer = SETFIELD(m, 0x0ull, bdfn);
+ }
+ }
+
+ /* Check whether the input address is valid */
+ if (!is_bus_pe &&
+ GETFIELD(PHB_PAPR_ERR_INJ_MASK_CFG_ALL, addr) == bdfn) {
+ a = addr;
+ break;
+ }
+
+ if (is_bus_pe &&
+ GETFIELD(PHB_PAPR_ERR_INJ_MASK_CFG, addr) == PCI_BUS_NUM(bdfn)) {
+ a = addr;
+ break;
+ }
+ }
+
+ /* Invalid PE number */
+ if (prefer == 0xffffull)
+ return OPAL_PARAMETER;
+
+ /* Specified address is out of range */
+ if (a == 0xffffull)
+ a = prefer;
+ else
+ m = mask;
+
+ return phb4_err_inject_finalize(phb, a, m, ctrl, is_write);
+}
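+
+/*
+ * A minimal sketch of the bdfn layout assumed by the RTT scan above: the
+ * routing table is indexed by the 16-bit bus/device/function number, so
+ * bdfn + 8 addresses the same function number on the next device of the
+ * same bus; if that entry maps to the same PE, the PE is treated as
+ * covering the whole bus. The helpers below only illustrate the field
+ * split (the bus field is what PCI_BUS_NUM() extracts above).
+ */
+static inline unsigned int bdfn_bus(unsigned int bdfn)
+{
+	return (bdfn >> 8) & 0xff;
+}
+
+static inline unsigned int bdfn_dev(unsigned int bdfn)
+{
+	return (bdfn >> 3) & 0x1f;
+}
+
+static inline unsigned int bdfn_fn(unsigned int bdfn)
+{
+	return bdfn & 0x7;
+}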
+
+static int64_t phb4_err_inject_dma(struct phb4 *phb __unused,
+ uint64_t pe_number __unused,
+ uint64_t addr __unused,
+ uint64_t mask __unused,
+ bool is_write __unused,
+ bool is_64bits __unused)
+{
+ return OPAL_UNSUPPORTED;
+}
+
+static int64_t phb4_err_inject_dma32(struct phb4 *phb, uint64_t pe_number,
+ uint64_t addr, uint64_t mask,
+ bool is_write)
+{
+ return phb4_err_inject_dma(phb, pe_number, addr, mask, is_write, false);
+}
+
+static int64_t phb4_err_inject_dma64(struct phb4 *phb, uint64_t pe_number,
+ uint64_t addr, uint64_t mask,
+ bool is_write)
+{
+ return phb4_err_inject_dma(phb, pe_number, addr, mask, is_write, true);
+}
+
+
+static int64_t phb4_err_inject(struct phb *phb, uint64_t pe_number,
+ uint32_t type, uint32_t func,
+ uint64_t addr, uint64_t mask)
+{
+ struct phb4 *p = phb_to_phb4(phb);
+ int64_t (*handler)(struct phb4 *p, uint64_t pe_number,
+ uint64_t addr, uint64_t mask, bool is_write);
+ bool is_write;
+
+ /* We can't inject errors into the reserved PE */
+ if (pe_number == PHB4_RESERVED_PE_NUM(p) || pe_number >= p->num_pes)
+ return OPAL_PARAMETER;
+
+ /* Clear leftover from last time */
+ out_be64(p->regs + PHB_PAPR_ERR_INJ_CTL, 0x0ul);
+
+ switch (func) {
+ case OPAL_ERR_INJECT_FUNC_IOA_LD_MEM_ADDR:
+ case OPAL_ERR_INJECT_FUNC_IOA_LD_MEM_DATA:
+ is_write = false;
+ if (type == OPAL_ERR_INJECT_TYPE_IOA_BUS_ERR64)
+ handler = phb4_err_inject_mem64;
+ else
+ handler = phb4_err_inject_mem32;
+ break;
+ case OPAL_ERR_INJECT_FUNC_IOA_ST_MEM_ADDR:
+ case OPAL_ERR_INJECT_FUNC_IOA_ST_MEM_DATA:
+ is_write = true;
+ if (type == OPAL_ERR_INJECT_TYPE_IOA_BUS_ERR64)
+ handler = phb4_err_inject_mem64;
+ else
+ handler = phb4_err_inject_mem32;
+ break;
+ case OPAL_ERR_INJECT_FUNC_IOA_LD_CFG_ADDR:
+ case OPAL_ERR_INJECT_FUNC_IOA_LD_CFG_DATA:
+ is_write = false;
+ handler = phb4_err_inject_cfg;
+ break;
+ case OPAL_ERR_INJECT_FUNC_IOA_ST_CFG_ADDR:
+ case OPAL_ERR_INJECT_FUNC_IOA_ST_CFG_DATA:
+ is_write = true;
+ handler = phb4_err_inject_cfg;
+ break;
+ case OPAL_ERR_INJECT_FUNC_IOA_DMA_RD_ADDR:
+ case OPAL_ERR_INJECT_FUNC_IOA_DMA_RD_DATA:
+ case OPAL_ERR_INJECT_FUNC_IOA_DMA_RD_MASTER:
+ case OPAL_ERR_INJECT_FUNC_IOA_DMA_RD_TARGET:
+ is_write = false;
+ if (type == OPAL_ERR_INJECT_TYPE_IOA_BUS_ERR64)
+ handler = phb4_err_inject_dma64;
+ else
+ handler = phb4_err_inject_dma32;
+ break;
+ case OPAL_ERR_INJECT_FUNC_IOA_DMA_WR_ADDR:
+ case OPAL_ERR_INJECT_FUNC_IOA_DMA_WR_DATA:
+ case OPAL_ERR_INJECT_FUNC_IOA_DMA_WR_MASTER:
+ case OPAL_ERR_INJECT_FUNC_IOA_DMA_WR_TARGET:
+ is_write = true;
+ if (type == OPAL_ERR_INJECT_TYPE_IOA_BUS_ERR64)
+ handler = phb4_err_inject_dma64;
+ else
+ handler = phb4_err_inject_dma32;
+ break;
+ default:
+ return OPAL_PARAMETER;
+ }
+
+ return handler(p, pe_number, addr, mask, is_write);
+}
+
+static int64_t phb4_get_diag_data(struct phb *phb,
+ void *diag_buffer,
+ uint64_t diag_buffer_len)
+{
+ bool fenced;
+ struct phb4 *p = phb_to_phb4(phb);
+ struct OpalIoPhb4ErrorData *data = diag_buffer;
+
+ if (diag_buffer_len < sizeof(struct OpalIoPhb4ErrorData))
+ return OPAL_PARAMETER;
+ if (p->broken)
+ return OPAL_HARDWARE;
+
+ /*
+ * Dummy check for fence so that phb4_read_phb_status knows
+ * whether to use ASB or AIB
+ */
+ fenced = phb4_fenced(p);
+ phb4_read_phb_status(p, data);
+
+ if (!fenced)
+ phb4_eeh_dump_regs(p);
+
+ /*
+ * We most likely got here because of an INF-class error.
+ * In that case, we need to clear the error explicitly.
+ */
+ if (phb4_err_pending(p) &&
+ p->err.err_class == PHB4_ERR_CLASS_INF &&
+ p->err.err_src == PHB4_ERR_SRC_PHB) {
+ phb4_err_clear(p);
+ phb4_set_err_pending(p, false);
+ }
+
+ return OPAL_SUCCESS;
+}
+
+static uint64_t tve_encode_50b_noxlate(uint64_t start_addr, uint64_t end_addr)
+{
+ uint64_t tve;
+
+ /*
+ * Put start address bits 49:24 into TVE[52:53]||[0:23]
+ * and end address bits 49:24 into TVE[54:55]||[24:47]
+ * and set TVE[51]
+ */
+ tve = (start_addr << 16) & (0xffffffull << 40);
+ tve |= (start_addr >> 38) & (3ull << 10);
+ tve |= (end_addr >> 8) & (0xfffffful << 16);
+ tve |= (end_addr >> 40) & (3ull << 8);
+ tve |= PPC_BIT(51) | IODA3_TVT_NON_TRANSLATE_50;
+ return tve;
+}
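+
+/*
+ * A minimal decode sketch for the packing above: it recovers bits 49:24 of
+ * the start and end addresses from a TVE built by tve_encode_50b_noxlate().
+ * The low 24 bits of each address are not stored in the TVE (16MB
+ * granularity), so they come back as zero. Illustrative only; the PHB
+ * hardware performs the equivalent range check itself.
+ */
+static inline void tve_decode_50b_noxlate(uint64_t tve, uint64_t *start_addr,
+					  uint64_t *end_addr)
+{
+	/* Start bits 47:24 come from TVE[0:23] (top 24 bits of the value) */
+	*start_addr = (tve >> 16) & (0xffffffull << 24);
+	/* Start bits 49:48 come from TVE[52:53] */
+	*start_addr |= (tve & (3ull << 10)) << 38;
+	/* End bits 47:24 come from TVE[24:47] */
+	*end_addr = (tve << 8) & (0xffffffull << 24);
+	/* End bits 49:48 come from TVE[54:55] */
+	*end_addr |= (tve & (3ull << 8)) << 40;
+}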
+
+static bool phb4_is_dd20(struct phb4 *p)
+{
+ struct proc_chip *chip = get_chip(p->chip_id);
+
+ if (p->rev == PHB4_REV_NIMBUS_DD20 && ((0xf & chip->ec_level) == 0))
+ return true;
+ return false;
+}
+
+static int64_t phb4_get_capp_info(int chip_id, struct phb *phb,
+ struct capp_info *info)
+{
+ struct phb4 *p = phb_to_phb4(phb);
+ uint32_t offset;
+
+ /* Not even supposed to be here on P10, but doesn't hurt */
+ if (is_phb5())
+ return OPAL_UNSUPPORTED;
+
+ if (chip_id != p->chip_id)
+ return OPAL_PARAMETER;
+
+ /* Check if CAPP is attached to the PHB */
+ if (p->capp == NULL || p->capp->phb != phb)
+ return OPAL_PARAMETER;
+
+ offset = PHB4_CAPP_REG_OFFSET(p);
+
+ if (p->index == CAPP0_PHB_INDEX)
+ info->capp_index = 0;
+ if (p->index == CAPP1_PHB_INDEX)
+ info->capp_index = 1;
+ info->phb_index = p->index;
+ info->capp_fir_reg = CAPP_FIR + offset;
+ info->capp_fir_mask_reg = CAPP_FIR_MASK + offset;
+ info->capp_fir_action0_reg = CAPP_FIR_ACTION0 + offset;
+ info->capp_fir_action1_reg = CAPP_FIR_ACTION1 + offset;
+ info->capp_err_status_ctrl_reg = CAPP_ERR_STATUS_CTRL + offset;
+
+ return OPAL_SUCCESS;
+}
+
+static void phb4_init_capp_regs(struct phb4 *p, uint32_t capp_eng)
+{
+ uint64_t addr, reg;
+ uint32_t offset;
+ uint8_t link_width_x16 = 1;
+
+ offset = PHB4_CAPP_REG_OFFSET(p);
+
+ /* Calculate the phb link width if card is attached to PEC2 */
+ if (p->index == CAPP1_PHB_INDEX) {
+ /* Check if PEC2 is in x8 or x16 mode.
+ * PEC0 is always in x16
+ */
+ addr = XPEC_P9_PCI_CPLT_CONF1 + 2 * XPEC_PCI_CPLT_OFFSET;
+ xscom_read(p->chip_id, addr, &reg);
+ link_width_x16 = ((reg & XPEC_P9_PCI_IOVALID_MASK) ==
+ XPEC_P9_PCI_IOVALID_X16);
+ }
+
+ /* APC Master PowerBus Control Register */
+ xscom_read(p->chip_id, APC_MASTER_PB_CTRL + offset, &reg);
+ reg |= PPC_BIT(0); /* enable cResp exam */
+ reg |= PPC_BIT(3); /* disable vg not sys */
+ reg |= PPC_BIT(12);/* HW417025: disable capp virtual machines */
+ reg |= PPC_BIT(2); /* disable nn rn */
+ reg |= PPC_BIT(4); /* disable g */
+ reg |= PPC_BIT(5); /* disable ln */
+ xscom_write(p->chip_id, APC_MASTER_PB_CTRL + offset, reg);
+
+ /* Set PHB mode, HPC Dir State and P9 mode */
+ xscom_write(p->chip_id, APC_MASTER_CAPI_CTRL + offset,
+ 0x1772000000000000UL);
+ PHBINF(p, "CAPP: port attached\n");
+
+ /* Set snoop ttype decoding, dir size to 512K */
+ xscom_write(p->chip_id, SNOOP_CAPI_CONFIG + offset, 0x9000000000000000UL);
+
+ /* Use Read Epsilon Tier2 for all scopes.
+ * Set Tier2 Read Epsilon.
+ */
+ xscom_read(p->chip_id, SNOOP_CONTROL + offset, &reg);
+ reg |= PPC_BIT(0);
+ reg |= PPC_BIT(35);
+ reg |= PPC_BIT(45);
+ reg |= PPC_BIT(46);
+ reg |= PPC_BIT(47);
+ reg |= PPC_BIT(50);
+ xscom_write(p->chip_id, SNOOP_CONTROL + offset, reg);
+
+ /* Transport Control Register */
+ xscom_read(p->chip_id, TRANSPORT_CONTROL + offset, &reg);
+ if (p->index == CAPP0_PHB_INDEX) {
+ reg |= PPC_BIT(1); /* Send Packet Timer Value */
+ reg |= PPC_BITMASK(10, 13); /* Send Packet Timer Value */
+ reg &= ~PPC_BITMASK(14, 17); /* Set Max LPC CI store buffer to zeros */
+ reg &= ~PPC_BITMASK(18, 21); /* Set Max tlbi divider */
+ if (capp_eng & CAPP_MIN_STQ_ENGINES) {
+ /* 2 CAPP msg engines */
+ reg |= PPC_BIT(58);
+ reg |= PPC_BIT(59);
+ reg |= PPC_BIT(60);
+ }
+ if (capp_eng & CAPP_MAX_STQ_ENGINES) {
+ /* 14 CAPP msg engines */
+ reg |= PPC_BIT(60);
+ }
+ reg |= PPC_BIT(62);
+ }
+ if (p->index == CAPP1_PHB_INDEX) {
+ reg |= PPC_BIT(4); /* Send Packet Timer Value */
+ reg &= ~PPC_BIT(10); /* Set CI Store Buffer Threshold=5 */
+ reg |= PPC_BIT(11); /* Set CI Store Buffer Threshold=5 */
+ reg &= ~PPC_BIT(12); /* Set CI Store Buffer Threshold=5 */
+ reg |= PPC_BIT(13); /* Set CI Store Buffer Threshold=5 */
+ reg &= ~PPC_BITMASK(14, 17); /* Set Max LPC CI store buffer to zeros */
+ reg &= ~PPC_BITMASK(18, 21); /* Set Max tlbi divider */
+ if (capp_eng & CAPP_MIN_STQ_ENGINES) {
+ /* 2 CAPP msg engines */
+ reg |= PPC_BIT(59);
+ reg |= PPC_BIT(60);
+
+ } else if (capp_eng & CAPP_MAX_STQ_ENGINES) {
+
+ if (link_width_x16)
+ /* 14 CAPP msg engines */
+ reg |= PPC_BIT(60) | PPC_BIT(62);
+ else
+ /* 6 CAPP msg engines */
+ reg |= PPC_BIT(60);
+ }
+ }
+ xscom_write(p->chip_id, TRANSPORT_CONTROL + offset, reg);
+
+ /* The transport control register needs to be loaded in two
+ * steps. Once the register values have been set, we have to
+ * write bit 63 to a '1', which loads the register values into
+ * the ci store buffer logic.
+ */
+ xscom_read(p->chip_id, TRANSPORT_CONTROL + offset, &reg);
+ reg |= PPC_BIT(63);
+ xscom_write(p->chip_id, TRANSPORT_CONTROL + offset, reg);
+
+ /* Enable epoch timer */
+ xscom_write(p->chip_id, EPOCH_RECOVERY_TIMERS_CTRL + offset,
+ 0xC0000000FFF8FFE0UL);
+
+ /* Flush SUE State Map Register */
+ xscom_write(p->chip_id, FLUSH_SUE_STATE_MAP + offset,
+ 0x08020A0000000000UL);
+
+ /* Flush SUE uOP1 Register */
+ xscom_write(p->chip_id, FLUSH_SUE_UOP1 + offset,
+ 0xDCE0280428000000);
+
+ /* capp owns PHB read buffers */
+ if (p->index == CAPP0_PHB_INDEX) {
+ /* max PHB read buffers 0-47 */
+ reg = 0xFFFFFFFFFFFF0000UL;
+ if (capp_eng & CAPP_MAX_DMA_READ_ENGINES)
+ reg = 0xF000000000000000UL;
+ xscom_write(p->chip_id, APC_FSM_READ_MASK + offset, reg);
+ xscom_write(p->chip_id, XPT_FSM_RMM + offset, reg);
+ }
+ if (p->index == CAPP1_PHB_INDEX) {
+
+ if (capp_eng & CAPP_MAX_DMA_READ_ENGINES) {
+ reg = 0xF000000000000000ULL;
+ } else if (link_width_x16) {
+ /* 0-47 (Read machines) are available for
+ * capp use
+ */
+ reg = 0x0000FFFFFFFFFFFFULL;
+ } else {
+ /* Set 30 Read machines for CAPP Minus
+ * 20-27 for DMA
+ */
+ reg = 0xFFFFF00E00000000ULL;
+ }
+ xscom_write(p->chip_id, APC_FSM_READ_MASK + offset, reg);
+ xscom_write(p->chip_id, XPT_FSM_RMM + offset, reg);
+ }
+
+ /* CAPP FIR Action 0 */
+ xscom_write(p->chip_id, CAPP_FIR_ACTION0 + offset, 0x0b1c000104060000UL);
+
+ /* CAPP FIR Action 1 */
+ xscom_write(p->chip_id, CAPP_FIR_ACTION1 + offset, 0x2b9c0001240E0000UL);
+
+ /* CAPP FIR MASK */
+ xscom_write(p->chip_id, CAPP_FIR_MASK + offset, 0x80031f98d8717000UL);
+
+ /* Mask the CAPP PSL Credit Timeout Register error */
+ xscom_write_mask(p->chip_id, CAPP_FIR_MASK + offset,
+ PPC_BIT(46), PPC_BIT(46));
+
+ /* Deassert TLBI_FENCED and tlbi_psl_is_dead */
+ xscom_write(p->chip_id, CAPP_ERR_STATUS_CTRL + offset, 0);
+}
+
+/* override some inits with CAPI defaults */
+static void phb4_init_capp_errors(struct phb4 *p)
+{
+ /* Init_77: TXE Error AIB Fence Enable Register */
+ if (phb4_is_dd20(p))
+ out_be64(p->regs + 0x0d30, 0xdfffbf0ff7ddfff0ull);
+ else
+ out_be64(p->regs + 0x0d30, 0xdff7bf0ff7ddfff0ull);
+ /* Init_86: RXE_ARB Error AIB Fence Enable Register */
+ out_be64(p->regs + 0x0db0, 0xfbffd7bbfb7fbfefull);
+
+ /* Init_95: RXE_MRG Error AIB Fence Enable Register */
+ out_be64(p->regs + 0x0e30, 0xfffffeffff7fff57ull);
+
+ /* Init_104: RXE_TCE Error AIB Fence Enable Register */
+ out_be64(p->regs + 0x0eb0, 0xffaeffafffffffffull);
+
+ /* Init_113: PHB Error AIB Fence Enable Register */
+ out_be64(p->regs + 0x0cb0, 0x35777073ff000000ull);
+}
+
+/*
+ * The capi, NBW and ASN indicators are used only on P9 to flag some
+ * types of incoming traffic for the PHB and have been removed on P10.
+ *
+ * The capi indicator is over the 8 most significant bits (and
+ * not 16). We stay away from bits 59 (TVE select), 60 and 61 (MSI)
+ *
+ * For the mask, we keep bit 59 in, as capi messages must hit TVE#0.
+ * Bit 56 is not part of the mask, so that a NBW message (see below)
+ * is also considered a capi message.
+ */
+#define CAPIIND 0x0200
+#define CAPIMASK 0xFE00
+
+/*
+ * Non-Blocking Write messages are a subset of capi messages, so the
+ * indicator is the same as capi + an extra bit (56) to differentiate.
+ * Mask is the same as capi + the extra bit
+ */
+#define NBWIND 0x0300
+#define NBWMASK 0xFF00
+
+/*
+ * The ASN indicator is used for tunneled operations (as_notify and
+ * atomics). Tunneled operation messages can be sent in PCI mode as
+ * well as CAPI mode.
+ *
+ * The format of those messages is specific and, for as_notify
+ * messages, the address field is hijacked to encode the LPID/PID/TID
+ * of the target thread, so those messages should not go through
+ * translation. They must hit TVE#1. Therefore bit 59 is part of the
+ * indicator.
+ */
+#define ASNIND 0x0C00
+#define ASNMASK 0xFF00
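+
+/*
+ * A minimal sketch of how these compare/mask pairs are meant to classify
+ * incoming traffic: the most-significant 16 address bits are compared
+ * against the indicator under the mask, matching the way PHB_CAPI_CMPM
+ * and PHB_ASN_CMPM are programmed below. With the values above, an NBW
+ * address (top bits 0x03xx) also matches the capi indicator because bit
+ * 56 is left out of CAPIMASK, while an ASN address (0x0Cxx) does not.
+ * Illustrative only.
+ */
+static inline bool phb_addr_matches_ind(uint64_t pci_addr, uint16_t ind,
+					uint16_t mask)
+{
+	uint16_t top = pci_addr >> 48;
+
+	return (top & mask) == ind;
+}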
+
+/* Power Bus Common Queue Registers
+ * All PBCQ and PBAIB registers are accessed via SCOM
+ * NestBase = 4010C00 for PEC0
+ * 4011000 for PEC1
+ * 4011400 for PEC2
+ * PCIBase = D010800 for PE0
+ * E010800 for PE1
+ * F010800 for PE2
+ *
+ * Some registers are shared amongst all of the stacks and will only
+ * have 1 copy. Other registers are implemented one per stack.
+ * Registers that are duplicated will have an additional offset
+ * of “StackBase” so that they have a unique address.
+ * Stackoffset = 00000040 for Stack0
+ * = 00000080 for Stack1
+ * = 000000C0 for Stack2
+ */
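+
+/*
+ * A minimal sketch of the addressing scheme described above: a per-stack
+ * nest register lives at NestBase(PEC) + StackOffset(stack) + register
+ * offset. The base and stack offsets are the values quoted in the comment;
+ * this helper is illustrative only and not used by the code below.
+ */
+static inline uint64_t pec_stack_nest_addr(unsigned int pec,
+					   unsigned int stack,
+					   uint64_t reg_offset)
+{
+	static const uint64_t nest_base[3] = {
+		0x4010C00, 0x4011000, 0x4011400,
+	};
+	static const uint64_t stack_offset[3] = {
+		0x00000040, 0x00000080, 0x000000C0,
+	};
+
+	return nest_base[pec] + stack_offset[stack] + reg_offset;
+}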
+static int64_t enable_capi_mode(struct phb4 *p, uint64_t pe_number,
+ uint32_t capp_eng)
+{
+ uint64_t addr, reg, start_addr, end_addr, stq_eng, dma_eng;
+ uint64_t mbt0, mbt1;
+ int i, window_num = -1;
+
+ /* CAPP Control Register */
+ xscom_read(p->chip_id, p->pe_xscom + XPEC_NEST_CAPP_CNTL, &reg);
+ if (reg & PPC_BIT(0)) {
+ PHBDBG(p, "Already in CAPP mode\n");
+ }
+
+ for (i = 0; i < 500000; i++) {
+ /* PBCQ General Status Register */
+ xscom_read(p->chip_id,
+ p->pe_stk_xscom + XPEC_NEST_STK_PBCQ_STAT,
+ &reg);
+ if (!(reg & 0xC000000000000000UL))
+ break;
+ time_wait_us(10);
+ }
+ if (reg & 0xC000000000000000UL) {
+ PHBERR(p, "CAPP: Timeout waiting for pending transaction\n");
+ return OPAL_HARDWARE;
+ }
+
+ stq_eng = 0x0000000000000000ULL;
+ dma_eng = 0x0000000000000000ULL;
+ if (p->index == CAPP0_PHB_INDEX) {
+ /* PBCQ is operating as a x16 stack
+ * - The maximum number of engines given to CAPP will be
+ * 14 and will be assigned in the order of STQ 15 to 2.
+ * - 0-47 (Read machines) are available for capp use.
+ */
+ stq_eng = 0x000E000000000000ULL; /* 14 CAPP msg engines */
+ dma_eng = 0x0000FFFFFFFFFFFFULL; /* 48 CAPP Read machines */
+ }
+
+ if (p->index == CAPP1_PHB_INDEX) {
+ /* Check if PEC is in x8 or x16 mode */
+ addr = XPEC_P9_PCI_CPLT_CONF1 + 2 * XPEC_PCI_CPLT_OFFSET;
+ xscom_read(p->chip_id, addr, &reg);
+ if ((reg & XPEC_P9_PCI_IOVALID_MASK) == XPEC_P9_PCI_IOVALID_X16) {
+ /* PBCQ is operating as a x16 stack
+ * - The maximum number of engines given to CAPP will be
+ * 14 and will be assigned in the order of STQ 15 to 2.
+ * - 0-47 (Read machines) are available for capp use.
+ */
+ stq_eng = 0x000E000000000000ULL;
+ dma_eng = 0x0000FFFFFFFFFFFFULL;
+ } else {
+
+ /* PBCQ is operating as a x8 stack
+ * - The maximum number of engines given to CAPP should
+ * be 6 and will be assigned in the order of 7 to 2.
+ * - 0-30 (Read machines) are available for capp use.
+ */
+ stq_eng = 0x0006000000000000ULL;
+ /* 30 Read machines for CAPP Minus 20-27 for DMA */
+ dma_eng = 0x0000FFFFF00E0000ULL;
+ }
+ }
+
+ if (capp_eng & CAPP_MIN_STQ_ENGINES)
+ stq_eng = 0x0002000000000000ULL; /* 2 capp msg engines */
+
+ /* CAPP Control Register. Enable CAPP Mode */
+ reg = 0x8000000000000000ULL; /* PEC works in CAPP Mode */
+ reg |= stq_eng;
+ if (capp_eng & CAPP_MAX_DMA_READ_ENGINES)
+ dma_eng = 0x0000F00000000000ULL; /* 4 CAPP Read machines */
+ reg |= dma_eng;
+ xscom_write(p->chip_id, p->pe_xscom + XPEC_NEST_CAPP_CNTL, reg);
+
+ /* PEC2 has 3 ETUs + 16 PCI lanes that can operate in x16,
+ * x8+x8 (bifurcated) or x8+x4+x4 (trifurcated) mode. When a
+ * Mellanox CX5 card is attached to stack0 of this PEC (indicated by
+ * a request to allocate CAPP_MAX_DMA_READ_ENGINES), we tweak the default
+ * DMA-read engine allocation to maximize DMA read performance
+ */
+ if ((p->index == CAPP1_PHB_INDEX) &&
+ (capp_eng & CAPP_MAX_DMA_READ_ENGINES))
+ phb4_pec2_dma_engine_realloc(p);
+
+ /* PCI to PB data movement ignores the PB init signal. */
+ xscom_write_mask(p->chip_id, p->pe_xscom + XPEC_NEST_PBCQ_HW_CONFIG,
+ XPEC_NEST_PBCQ_HW_CONFIG_PBINIT,
+ XPEC_NEST_PBCQ_HW_CONFIG_PBINIT);
+
+ /* If pump mode is enabled, don't do nodal broadcasts. */
+ xscom_read(p->chip_id, PB_CENT_HP_MODE_CURR, &reg);
+ if (reg & PB_CFG_PUMP_MODE) {
+ reg = XPEC_NEST_PBCQ_HW_CONFIG_DIS_NODAL;
+ reg |= XPEC_NEST_PBCQ_HW_CONFIG_DIS_RNNN;
+ xscom_write_mask(p->chip_id,
+ p->pe_xscom + XPEC_NEST_PBCQ_HW_CONFIG,
+ reg, reg);
+ }
+
+ /* PEC Phase 4 (PHB) registers adjustment
+ * Inbound CAPP traffic: The CAPI can send both CAPP packets and
+ * I/O packets. A PCIe packet is identified as a CAPP packet in
+ * the PHB if the PCIe address matches either the CAPI
+ * Compare/Mask register or its NBW Compare/Mask register.
+ */
+
+ /*
+ * Bit [0:7] XSL_DSNCTL[capiind]
+ * Init_26 - CAPI Compare/Mask
+ */
+ out_be64(p->regs + PHB_CAPI_CMPM,
+ ((u64)CAPIIND << 48) |
+ ((u64)CAPIMASK << 32) | PHB_CAPI_CMPM_ENABLE);
+
+ /* PB AIB Hardware Control Register
+ * Wait 32 PCI clocks for a credit to become available
+ * before rejecting.
+ */
+ xscom_read(p->chip_id, p->pci_xscom + XPEC_PCI_PBAIB_HW_CONFIG, &reg);
+ reg |= PPC_BITMASK(40, 42);
+ if (p->index == CAPP1_PHB_INDEX)
+ reg |= PPC_BIT(30);
+ xscom_write(p->chip_id, p->pci_xscom + XPEC_PCI_PBAIB_HW_CONFIG, reg);
+
+ /* non-translate/50-bit mode */
+ out_be64(p->regs + PHB_NXLATE_PREFIX, 0x0000000000000000Ull);
+
+ /* Set the TVEs to no-translate mode and allow the MMIO window */
+ memset(p->tve_cache, 0x0, sizeof(p->tve_cache));
+
+ /*
+ * In 50-bit non-translate mode, the fields of the TVE are
+ * used to perform an address range check. In this mode TCE
+ * Table Size(0) must be a '1' (TVE[51] = 1)
+ * PCI Addr(49:24) >= TVE[52:53]+TVE[0:23] and
+ * PCI Addr(49:24) < TVE[54:55]+TVE[24:47]
+ *
+ * TVE[51] = 1
+ * TVE[56] = 1: 50-bit Non-Translate Mode Enable
+ * TVE[0:23] = 0x000000
+ * TVE[24:47] = 0xFFFFFF
+ *
+ * capi dma mode: CAPP DMA mode needs access to all of memory
+ * capi mode: Allow address range (bit 14 = 1)
+ * 0x0002000000000000: 0x0002FFFFFFFFFFFF
+ * TVE[52:53] = '10' and TVE[54:55] = '10'
+ */
+
+ /* TVT#0: CAPI window + DMA, all memory */
+ start_addr = 0ull;
+ end_addr = 0x0003ffffffffffffull;
+ p->tve_cache[pe_number * 2] =
+ tve_encode_50b_noxlate(start_addr, end_addr);
+
+ /* TVT#1: CAPI window + DMA, all memory, in bypass mode */
+ start_addr = (1ull << 59);
+ end_addr = start_addr + 0x0003ffffffffffffull;
+ p->tve_cache[pe_number * 2 + 1] =
+ tve_encode_50b_noxlate(start_addr, end_addr);
+
+ phb4_ioda_sel(p, IODA3_TBL_TVT, 0, true);
+ for (i = 0; i < p->tvt_size; i++)
+ out_be64(p->regs + PHB_IODA_DATA0, p->tve_cache[i]);
+
+ /*
+ * Since TVT#0 is in by-pass mode, disable 32-bit MSI, as a
+ * DMA write targeting 0x00000000FFFFxxxx would be interpreted
+ * as a 32-bit MSI
+ */
+ reg = in_be64(p->regs + PHB_PHB4_CONFIG);
+ reg &= ~PHB_PHB4C_32BIT_MSI_EN;
+ out_be64(p->regs + PHB_PHB4_CONFIG, reg);
+
+ /* Set the MBT BAR to pass the CAPI MMIO window and keep the
+ * other MMIO values
+ */
+ mbt0 = IODA3_MBT0_ENABLE | IODA3_MBT0_TYPE_M64 |
+ SETFIELD(IODA3_MBT0_MODE, 0ull, IODA3_MBT0_MODE_SINGLE_PE) |
+ SETFIELD(IODA3_MBT0_MDT_COLUMN, 0ull, 0) |
+ (0x0002000000000000ULL & IODA3_MBT0_BASE_ADDR);
+
+ mbt1 = IODA3_MBT1_ENABLE |
+ (0x00ff000000000000ULL & IODA3_MBT1_MASK) |
+ SETFIELD(IODA3_MBT1_SINGLE_PE_NUM, 0ull, pe_number);
+
+ for (i = 0; i < p->mbt_size; i++) {
+ /* search if the capi mmio window is already present */
+ if ((p->mbt_cache[i][0] == mbt0) &&
+ (p->mbt_cache[i][1] == mbt1))
+ break;
+
+ /* search a free entry */
+ if ((window_num == -1) &&
+ ((!(p->mbt_cache[i][0] & IODA3_MBT0_ENABLE)) &&
+ (!(p->mbt_cache[i][1] & IODA3_MBT1_ENABLE))))
+ window_num = i;
+ }
+
+ if (window_num >= 0 && i == p->mbt_size) {
+ /* no capi mmio window found, so add it */
+ p->mbt_cache[window_num][0] = mbt0;
+ p->mbt_cache[window_num][1] = mbt1;
+
+ phb4_ioda_sel(p, IODA3_TBL_MBT, window_num << 1, true);
+ out_be64(p->regs + PHB_IODA_DATA0, mbt0);
+ out_be64(p->regs + PHB_IODA_DATA0, mbt1);
+ } else if (i == p->mbt_size) {
+ /* mbt cache full, this case should never happen */
+ PHBERR(p, "CAPP: Failed to add CAPI mmio window\n");
+ } else {
+ /* duplicate entry. Nothing to do */
+ }
+
+ phb4_init_capp_errors(p);
+
+ phb4_init_capp_regs(p, capp_eng);
+
+ if (!chiptod_capp_timebase_sync(p->chip_id, CAPP_TFMR,
+ CAPP_TB,
+ PHB4_CAPP_REG_OFFSET(p)))
+ PHBERR(p, "CAPP: Failed to sync timebase\n");
+
+ /* set callbacks to handle HMI events */
+ capi_ops.get_capp_info = &phb4_get_capp_info;
+
+ return OPAL_SUCCESS;
+}
+
+
+static int64_t phb4_init_capp(struct phb4 *p)
+{
+ struct capp *capp;
+ int rc;
+
+ if (p->index != CAPP0_PHB_INDEX &&
+ p->index != CAPP1_PHB_INDEX)
+ return OPAL_UNSUPPORTED;
+
+ capp = zalloc(sizeof(struct capp));
+ if (capp == NULL)
+ return OPAL_NO_MEM;
+
+ if (p->index == CAPP0_PHB_INDEX) {
+ capp->capp_index = 0;
+ capp->capp_xscom_offset = 0;
+
+ } else if (p->index == CAPP1_PHB_INDEX) {
+ capp->capp_index = 1;
+ capp->capp_xscom_offset = CAPP1_REG_OFFSET;
+ }
+
+ capp->attached_pe = phb4_get_reserved_pe_number(&p->phb);
+ capp->chip_id = p->chip_id;
+
+ /* Load capp microcode into the capp unit */
+ rc = load_capp_ucode(p);
+
+ if (rc == OPAL_SUCCESS)
+ p->capp = capp;
+ else
+ free(capp);
+
+ return rc;
+}
+
+static int64_t phb4_set_capi_mode(struct phb *phb, uint64_t mode,
+ uint64_t pe_number)
+{
+ struct phb4 *p = phb_to_phb4(phb);
+ struct proc_chip *chip = get_chip(p->chip_id);
+ struct capp *capp = p->capp;
+ uint64_t reg, ret;
+
+ /* No CAPI on P10. OpenCAPI only */
+ if (is_phb5())
+ return OPAL_UNSUPPORTED;
+
+ /* Can't do a mode switch while CAPP is in recovery mode */
+ ret = capp_xscom_read(capp, CAPP_ERR_STATUS_CTRL, &reg);
+ if (ret != OPAL_SUCCESS)
+ return ret;
+
+ if ((reg & PPC_BIT(0)) && (!(reg & PPC_BIT(1)))) {
+ PHBDBG(p, "CAPP: recovery in progress\n");
+ return OPAL_BUSY;
+ }
+
+
+ switch (mode) {
+
+ case OPAL_PHB_CAPI_MODE_DMA: /* Enabled by default on p9 */
+ case OPAL_PHB_CAPI_MODE_SNOOP_ON:
+ /* nothing to do on P9 if CAPP is already enabled */
+ ret = p->capp->phb ? OPAL_SUCCESS : OPAL_UNSUPPORTED;
+ break;
+
+ case OPAL_PHB_CAPI_MODE_SNOOP_OFF:
+ ret = p->capp->phb ? OPAL_UNSUPPORTED : OPAL_SUCCESS;
+ break;
+
+ case OPAL_PHB_CAPI_MODE_PCIE:
+ if (p->flags & PHB4_CAPP_DISABLE) {
+ /* We are in middle of a CAPP disable */
+ ret = OPAL_BUSY;
+
+ } else if (capp->phb) {
+ /* Kick start a creset */
+ p->flags |= PHB4_CAPP_DISABLE;
+ PHBINF(p, "CAPP: PCIE mode needs a cold-reset\n");
+ /* Kick off the pci state machine */
+ ret = phb4_creset(phb->slot);
+ ret = ret > 0 ? OPAL_BUSY : ret;
+
+ } else {
+ /* PHB already in PCI mode */
+ ret = OPAL_SUCCESS;
+ }
+ break;
+
+ case OPAL_PHB_CAPI_MODE_CAPI: /* Fall Through */
+ case OPAL_PHB_CAPI_MODE_DMA_TVT1:
+ /* Make sure that PHB is not disabling CAPP */
+ if (p->flags & PHB4_CAPP_DISABLE) {
+ PHBERR(p, "CAPP: Disable in progress\n");
+ ret = OPAL_BUSY;
+ break;
+ }
+
+ /* Check if ucode is available */
+ if (!capp_ucode_loaded(chip, p->index)) {
+ PHBERR(p, "CAPP: ucode not loaded\n");
+ ret = OPAL_RESOURCE;
+ break;
+ }
+
+ /*
+ * Mark the CAPP as attached to the PHB right away so that
+ * if an MCE happens during CAPP init we can handle it.
+ * In case of an error in CAPP init we remove the PHB
+ * from the attached_mask later.
+ */
+ capp->phb = phb;
+ capp->attached_pe = pe_number;
+
+ if (mode == OPAL_PHB_CAPI_MODE_DMA_TVT1)
+ ret = enable_capi_mode(p, pe_number,
+ CAPP_MIN_STQ_ENGINES |
+ CAPP_MAX_DMA_READ_ENGINES);
+
+ else
+ ret = enable_capi_mode(p, pe_number,
+ CAPP_MAX_STQ_ENGINES |
+ CAPP_MIN_DMA_READ_ENGINES);
+ if (ret == OPAL_SUCCESS) {
+ /* register notification on system shutdown */
+ opal_add_host_sync_notifier(&phb4_host_sync_reset, p);
+
+ } else {
+ /* In case of an error mark the PHB detached */
+ capp->phb = NULL;
+ capp->attached_pe = phb4_get_reserved_pe_number(phb);
+ }
+ break;
+
+ default:
+ ret = OPAL_UNSUPPORTED;
+ break;
+ };
+
+ return ret;
+}
+
+static void phb4_p2p_set_initiator(struct phb4 *p, uint16_t pe_number)
+{
+ uint64_t tve;
+ uint16_t window_id = (pe_number << 1) + 1;
+
+ /*
+ * Initiator needs access to the MMIO space of the target,
+ * which is well beyond the 'normal' memory area. Set its TVE
+ * with no range checking.
+ */
+ PHBDBG(p, "Setting TVE#1 for peer-to-peer for pe %d\n", pe_number);
+ tve = PPC_BIT(51);
+ phb4_ioda_sel(p, IODA3_TBL_TVT, window_id, false);
+ out_be64(p->regs + PHB_IODA_DATA0, tve);
+ p->tve_cache[window_id] = tve;
+}
+
+static void phb4_p2p_set_target(struct phb4 *p, bool enable)
+{
+ uint64_t val;
+
+ /*
+ * Enabling p2p on a target PHB reserves an outbound (as seen
+ * from the CPU) store queue for p2p
+ */
+ PHBDBG(p, "%s peer-to-peer\n", (enable ? "Enabling" : "Disabling"));
+ xscom_read(p->chip_id,
+ p->pe_stk_xscom + XPEC_NEST_STK_PBCQ_MODE, &val);
+ if (enable)
+ val |= XPEC_NEST_STK_PBCQ_MODE_P2P;
+ else
+ val &= ~XPEC_NEST_STK_PBCQ_MODE_P2P;
+ xscom_write(p->chip_id,
+ p->pe_stk_xscom + XPEC_NEST_STK_PBCQ_MODE, val);
+}
+
+static void phb4_set_p2p(struct phb *phb, uint64_t mode, uint64_t flags,
+ uint16_t pe_number)
+{
+ struct phb4 *p = phb_to_phb4(phb);
+
+ switch (mode) {
+ case OPAL_PCI_P2P_INITIATOR:
+ if (flags & OPAL_PCI_P2P_ENABLE)
+ phb4_p2p_set_initiator(p, pe_number);
+ /*
+ * When disabling p2p on the initiator, we should
+ * reset the TVE to its default bypass setting, but it
+ * is more easily done from the OS, as it knows the
+ * start and end address and there's already an
+ * OPAL call for it, so let Linux handle it.
+ */
+ break;
+ case OPAL_PCI_P2P_TARGET:
+ phb4_p2p_set_target(p, !!(flags & OPAL_PCI_P2P_ENABLE));
+ break;
+ default:
+ assert(0);
+ }
+}
+
+static int64_t phb4_set_capp_recovery(struct phb *phb)
+{
+ struct phb4 *p = phb_to_phb4(phb);
+
+ if (p->flags & PHB4_CAPP_RECOVERY)
+ return 0;
+
+ /* set opal event flag to indicate eeh condition */
+ opal_update_pending_evt(OPAL_EVENT_PCI_ERROR,
+ OPAL_EVENT_PCI_ERROR);
+
+ p->flags |= PHB4_CAPP_RECOVERY;
+
+ return 0;
+}
+
+/*
+ * Return the address out of a PBCQ Tunnel Bar register.
+ */
+static void phb4_get_tunnel_bar(struct phb *phb, uint64_t *addr)
+{
+ struct phb4 *p = phb_to_phb4(phb);
+ uint64_t val;
+
+ xscom_read(p->chip_id, p->pe_stk_xscom + XPEC_NEST_STK_TUNNEL_BAR,
+ &val);
+ *addr = val >> 8;
+}
+
+/*
+ * Set PBCQ Tunnel Bar register.
+ * Store addr bits [8:50] in PBCQ Tunnel Bar register bits [0:42].
+ * Note that addr bits [8:50] must also match PSL_TNR_ADDR[8:50].
+ * Reset register if val == 0.
+ *
+ * This interface is required to let device drivers set the Tunnel Bar
+ * value of their choice.
+ *
+ * Compatibility with older versions of Linux that do not set the
+ * Tunnel Bar with phb4_set_tunnel_bar() is ensured by enable_capi_mode(),
+ * which will set the default value that used to be assumed.
+ */
+static int64_t phb4_set_tunnel_bar(struct phb *phb, uint64_t addr)
+{
+ struct phb4 *p = phb_to_phb4(phb);
+ uint64_t mask = 0x00FFFFFFFFFFE000ULL;
+
+ if (!addr) {
+ /* Reset register */
+ xscom_write(p->chip_id,
+ p->pe_stk_xscom + XPEC_NEST_STK_TUNNEL_BAR, addr);
+ return OPAL_SUCCESS;
+ }
+ if ((addr & ~mask))
+ return OPAL_PARAMETER;
+ if (!(addr & mask))
+ return OPAL_PARAMETER;
+
+ xscom_write(p->chip_id, p->pe_stk_xscom + XPEC_NEST_STK_TUNNEL_BAR,
+ (addr & mask) << 8);
+ return OPAL_SUCCESS;
+}
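+
+/*
+ * A minimal round-trip sketch of the packing above: address bits [8:50]
+ * (the mask 0x00FFFFFFFFFFE000) are shifted up by 8 into Tunnel Bar bits
+ * [0:42] on write, and shifted back down on read. Illustrative only.
+ */
+static inline bool tunnel_bar_round_trips(uint64_t addr)
+{
+	const uint64_t mask = 0x00FFFFFFFFFFE000ULL;
+	uint64_t reg_val, readback;
+
+	/* Addresses with bits outside [8:50] are rejected by the setter */
+	if (addr & ~mask)
+		return false;
+
+	reg_val = (addr & mask) << 8;	/* as written by phb4_set_tunnel_bar() */
+	readback = reg_val >> 8;	/* as read by phb4_get_tunnel_bar() */
+
+	return readback == addr;
+}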
+
+static const struct phb_ops phb4_ops = {
+ .cfg_read8 = phb4_pcicfg_read8,
+ .cfg_read16 = phb4_pcicfg_read16,
+ .cfg_read32 = phb4_pcicfg_read32,
+ .cfg_write8 = phb4_pcicfg_write8,
+ .cfg_write16 = phb4_pcicfg_write16,
+ .cfg_write32 = phb4_pcicfg_write32,
+ .get_reserved_pe_number = phb4_get_reserved_pe_number,
+ .device_init = phb4_device_init,
+ .device_remove = NULL,
+ .ioda_reset = phb4_ioda_reset,
+ .papr_errinjct_reset = phb4_papr_errinjct_reset,
+ .pci_reinit = phb4_pci_reinit,
+ .set_phb_mem_window = phb4_set_phb_mem_window,
+ .phb_mmio_enable = phb4_phb_mmio_enable,
+ .map_pe_mmio_window = phb4_map_pe_mmio_window,
+ .map_pe_dma_window = phb4_map_pe_dma_window,
+ .map_pe_dma_window_real = phb4_map_pe_dma_window_real,
+ .set_option = phb4_set_option,
+ .get_option = phb4_get_option,
+ .set_xive_pe = phb4_set_ive_pe,
+ .get_msi_32 = phb4_get_msi_32,
+ .get_msi_64 = phb4_get_msi_64,
+ .set_pe = phb4_set_pe,
+ .set_peltv = phb4_set_peltv,
+ .eeh_freeze_status = phb4_eeh_freeze_status,
+ .eeh_freeze_clear = phb4_eeh_freeze_clear,
+ .eeh_freeze_set = phb4_eeh_freeze_set,
+ .next_error = phb4_eeh_next_error,
+ .err_inject = phb4_err_inject,
+ .get_diag_data2 = phb4_get_diag_data,
+ .tce_kill = phb4_tce_kill,
+ .set_capi_mode = phb4_set_capi_mode,
+ .set_p2p = phb4_set_p2p,
+ .set_capp_recovery = phb4_set_capp_recovery,
+ .get_tunnel_bar = phb4_get_tunnel_bar,
+ .set_tunnel_bar = phb4_set_tunnel_bar,
+};
+
+static void phb4_init_ioda3(struct phb4 *p)
+{
+ if (is_phb5()) {
+ /*
+ * When ABT is on, the MSIs on the PHB use the PQ state bits
+ * of the IC and MSI triggers from the PHB are forwarded
+ * directly to the IC ESB page. However, the LSIs are still
+ * controlled locally on the PHB and LSI triggers use a
+ * special offset for trigger injection.
+ */
+ if (phb_abt_mode(p)) {
+ uint64_t mmio_base = xive2_get_esb_base(p->base_msi);
+
+ PHBDBG(p, "Using ABT mode. ESB: 0x%016llx\n", mmio_base);
+
+ /* Init_18 - Interrupt Notify Base Address */
+ out_be64(p->regs + PHB_INT_NOTIFY_ADDR,
+ PHB_INT_NOTIFY_ADDR_64K | mmio_base);
+
+ /* Interrupt Notify Base Index is unused */
+ } else {
+ p->irq_port = xive2_get_notify_port(p->chip_id,
+ XIVE_HW_SRC_PHBn(p->index));
+
+ PHBDBG(p, "Using IC notif page at 0x%016llx\n",
+ p->irq_port);
+
+ /* Init_18 - Interrupt Notify Base Address */
+ out_be64(p->regs + PHB_INT_NOTIFY_ADDR, p->irq_port);
+
+ /* Init_19 - Interrupt Notify Base Index */
+ out_be64(p->regs + PHB_INT_NOTIFY_INDEX,
+ xive2_get_notify_base(p->base_msi));
+ }
+
+ } else { /* p9 */
+ p->irq_port = xive_get_notify_port(p->chip_id,
+ XIVE_HW_SRC_PHBn(p->index));
+ /* Init_18 - Interrupt Notify Base Address */
+ out_be64(p->regs + PHB_INT_NOTIFY_ADDR, p->irq_port);
+
+ /* Init_19 - Interrupt Notify Base Index */
+ out_be64(p->regs + PHB_INT_NOTIFY_INDEX,
+ xive_get_notify_base(p->base_msi));
+ }
+
+ /* Init_19x - Not in spec: Initialize source ID */
+ PHBDBG(p, "Reset state SRC_ID: %016llx\n",
+ in_be64(p->regs + PHB_LSI_SOURCE_ID));
+ out_be64(p->regs + PHB_LSI_SOURCE_ID,
+ SETFIELD(PHB_LSI_SRC_ID, 0ull, (p->num_irqs - 1) >> 3));
+
+ /* Init_20 - RTT BAR */
+ out_be64(p->regs + PHB_RTT_BAR, (u64) p->tbl_rtt | PHB_RTT_BAR_ENABLE);
+
+ /* Init_21 - PELT-V BAR */
+ out_be64(p->regs + PHB_PELTV_BAR,
+ (u64) p->tbl_peltv | PHB_PELTV_BAR_ENABLE);
+
+ /* Init_22 - Setup M32 starting address */
+ out_be64(p->regs + PHB_M32_START_ADDR, M32_PCI_START);
+
+ /* Init_23 - Setup PEST BAR */
+ out_be64(p->regs + PHB_PEST_BAR,
+ p->tbl_pest | PHB_PEST_BAR_ENABLE);
+
+ /* Init_24 - CRW Base Address Reg */
+ /* See enable_capi_mode() */
+
+ if (is_phb4()) {
+ /* Init_25 - ASN Compare/Mask - P9 only */
+ out_be64(p->regs + PHB_ASN_CMPM, ((u64)ASNIND << 48) |
+ ((u64)ASNMASK << 32) | PHB_ASN_CMPM_ENABLE);
+ }
+
+ /* Init_26 - CAPI Compare/Mask */
+ /* See enable_capi_mode() */
+ /* if CAPP being disabled then reset CAPI Compare/Mask Register */
+ if (p->flags & PHB4_CAPP_DISABLE)
+ out_be64(p->regs + PHB_CAPI_CMPM, 0);
+
+ /* Init_27 - PCIE Outbound upper address */
+ out_be64(p->regs + PHB_M64_UPPER_BITS, 0);
+
+ /* Init_28 - PHB4 Configuration */
+ out_be64(p->regs + PHB_PHB4_CONFIG,
+ PHB_PHB4C_32BIT_MSI_EN |
+ PHB_PHB4C_64BIT_MSI_EN);
+
+ /* Init_29 - At least 256ns delay according to spec. Do a dummy
+ * read first to flush posted writes
+ */
+ in_be64(p->regs + PHB_PHB4_CONFIG);
+ time_wait_us(2);
+
+ /* Init_30..41 - On-chip IODA tables init */
+ phb4_ioda_reset(&p->phb, false);
+}
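+
+/*
+ * Example of the Init_19x source ID computation above (the interrupt
+ * count is hypothetical): if the PHB reports 2048 interrupts, then
+ * (2048 - 1) >> 3 = 255 is the value programmed into PHB_LSI_SRC_ID.
+ */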
+
+/* phb4_init_rc_cfg - Initialize the Root Complex config space
+ */
+static bool phb4_init_rc_cfg(struct phb4 *p)
+{
+ int64_t ecap, aercap;
+
+ /* XXX Handle errors ? */
+
+ /* Init_46:
+ *
+ * Set primary bus to 0, secondary to 1 and subordinate to 0xff
+ */
+ phb4_pcicfg_write32(&p->phb, 0, PCI_CFG_PRIMARY_BUS, 0x00ff0100);
+
+ /* Init_47 - Clear errors */
+ /* see phb4_rc_err_clear() called below */
+
+ /* Init_48
+ *
+ * PCIE Device control/status, enable error reporting, disable relaxed
+ * ordering, set MPS to 128 (see note), clear errors.
+ *
+ * Note: The doc recommends setting MPS to 512. This has proved to have
+ * some issues as it requires specific clamping of MRRS on devices and
+ * we've found devices in the field that misbehave when doing that.
+ *
+ * We currently leave it at 128 bytes (the minimum setting) at init
+ * time. The generic PCIe probing later on, or the kernel, might apply
+ * a different value, but we play it safe at early init.
+ */
+ if (p->ecap <= 0) {
+ ecap = pci_find_cap(&p->phb, 0, PCI_CFG_CAP_ID_EXP);
+ if (ecap < 0) {
+ PHBERR(p, "Can't locate PCI-E capability\n");
+ return false;
+ }
+ p->ecap = ecap;
+ } else {
+ ecap = p->ecap;
+ }
+
+ phb4_pcicfg_write16(&p->phb, 0, ecap + PCICAP_EXP_DEVCTL,
+ PCICAP_EXP_DEVCTL_CE_REPORT |
+ PCICAP_EXP_DEVCTL_NFE_REPORT |
+ PCICAP_EXP_DEVCTL_FE_REPORT |
+ PCICAP_EXP_DEVCTL_UR_REPORT |
+ SETFIELD(PCICAP_EXP_DEVCTL_MPS, 0, PCIE_MPS_128B));
+
+ /* Init_49 - Device Control/Status 2 */
+ phb4_pcicfg_write16(&p->phb, 0, ecap + PCICAP_EXP_DCTL2,
+ SETFIELD(PCICAP_EXP_DCTL2_CMPTOUT, 0, 0x5) |
+ PCICAP_EXP_DCTL2_ARI_FWD);
+
+ /* Init_50..54
+ *
+ * AER inits
+ */
+ if (p->aercap <= 0) {
+ aercap = pci_find_ecap(&p->phb, 0, PCIECAP_ID_AER, NULL);
+ if (aercap < 0) {
+ PHBERR(p, "Can't locate AER capability\n");
+ return false;
+ }
+ p->aercap = aercap;
+ } else {
+ aercap = p->aercap;
+ }
+
+ /* Disable some error reporting as per the PHB4 spec */
+ phb4_pcicfg_write32(&p->phb, 0, aercap + PCIECAP_AER_UE_MASK,
+ PCIECAP_AER_UE_POISON_TLP |
+ PCIECAP_AER_UE_COMPL_TIMEOUT |
+ PCIECAP_AER_UE_COMPL_ABORT);
+
+ /* Enable ECRC generation & checking */
+ phb4_pcicfg_write32(&p->phb, 0, aercap + PCIECAP_AER_CAPCTL,
+ PCIECAP_AER_CAPCTL_ECRCG_EN |
+ PCIECAP_AER_CAPCTL_ECRCC_EN);
+
+ phb4_rc_err_clear(p);
+
+ return true;
+}
+
+static void phb4_init_errors(struct phb4 *p)
+{
+ /* Init_55..63 - PBL errors */
+ out_be64(p->regs + 0x1900, 0xffffffffffffffffull);
+ out_be64(p->regs + 0x1908, 0x0000000000000000ull);
+ out_be64(p->regs + 0x1920, 0x000000004d1780f8ull);
+ out_be64(p->regs + 0x1928, 0x0000000000000000ull);
+ out_be64(p->regs + 0x1930, 0xffffffffb2f87f07ull);
+ out_be64(p->regs + 0x1940, 0x0000000000000000ull);
+ out_be64(p->regs + 0x1948, 0x0000000000000000ull);
+ out_be64(p->regs + 0x1950, 0x0000000000000000ull);
+ out_be64(p->regs + 0x1958, 0x0000000000000000ull);
+
+ /* Init_64..72 - REGB errors */
+ out_be64(p->regs + 0x1c00, 0xffffffffffffffffull);
+ out_be64(p->regs + 0x1c08, 0x0000000000000000ull);
+ /* Enable/disable error status indicators that trigger irqs */
+ if (p->has_link) {
+ out_be64(p->regs + 0x1c20, 0x2130006efca8bc00ull);
+ out_be64(p->regs + 0x1c30, 0xde1fff91035743ffull);
+ } else {
+ out_be64(p->regs + 0x1c20, 0x0000000000000000ull);
+ out_be64(p->regs + 0x1c30, 0x0000000000000000ull);
+ }
+ out_be64(p->regs + 0x1c28, 0x0080000000000000ull);
+ out_be64(p->regs + 0x1c40, 0x0000000000000000ull);
+ out_be64(p->regs + 0x1c48, 0x0000000000000000ull);
+ out_be64(p->regs + 0x1c50, 0x0000000000000000ull);
+ out_be64(p->regs + 0x1c58, 0x0040000000000000ull);
+
+ /* Init_73..81 - TXE errors */
+ out_be64(p->regs + 0x0d08, 0x0000000000000000ull);
+
+ /* Errata: Clear bit 17, otherwise a CFG write UR/CA will incorrectly
+ * freeze a "random" PE (whichever PE last did an MMIO)
+ */
+ if (is_phb5()) {
+ out_be64(p->regs + 0x0d28, 0x0000500a00000000ull);
+ out_be64(p->regs + 0x0d00, 0xffffffffffffffffull);
+ out_be64(p->regs + 0x0d18, 0xffffff0fffffffffull);
+ out_be64(p->regs + 0x0d30, 0xdff7af41f7ddffdfull);
+ } else {
+ out_be64(p->regs + 0x0d28, 0x0000000a00000000ull);
+ if (phb4_is_dd20(p)) {
+ out_be64(p->regs + 0x0d00, 0xf3acff0ff7ddfff0ull);
+ out_be64(p->regs + 0x0d18, 0xf3acff0ff7ddfff0ull);
+ out_be64(p->regs + 0x0d30, 0xdfffbd05f7ddfff0ull); /* XXX CAPI has diff. value */
+ } else {
+ out_be64(p->regs + 0x0d00, 0xffffffffffffffffull);
+ out_be64(p->regs + 0x0d18, 0xffffff0fffffffffull);
+ out_be64(p->regs + 0x0d30, 0xdff7bd05f7ddfff0ull);
+ }
+ }
+
+ out_be64(p->regs + 0x0d40, 0x0000000000000000ull);
+ out_be64(p->regs + 0x0d48, 0x0000000000000000ull);
+ out_be64(p->regs + 0x0d50, 0x0000000000000000ull);
+ out_be64(p->regs + 0x0d58, 0x0000000000000000ull);
+
+ /* Init_82..90 - RXE_ARB errors */
+ out_be64(p->regs + 0x0d80, 0xffffffffffffffffull);
+ out_be64(p->regs + 0x0d88, 0x0000000000000000ull);
+ out_be64(p->regs + 0x0d98, 0xfffffffffbffffffull);
+ out_be64(p->regs + 0x0da8, 0xc00018b801000060ull);
+ /*
+ * Errata ER20161123 says we should set the top two bits in
+ * 0x0db0 but this causes config space accesses which don't
+ * get a response to fence the PHB. This breaks probing,
+ * hence we don't set them here.
+ */
+ out_be64(p->regs + 0x0db0, 0x3bffd703fa7fbf8full); /* XXX CAPI has diff. value */
+ out_be64(p->regs + 0x0dc0, 0x0000000000000000ull);
+ out_be64(p->regs + 0x0dc8, 0x0000000000000000ull);
+ out_be64(p->regs + 0x0dd0, 0x0000000000000000ull);
+ out_be64(p->regs + 0x0dd8, 0x0000000004000000ull);
+
+ /* Init_91..99 - RXE_MRG errors */
+ out_be64(p->regs + 0x0e00, 0xffffffffffffffffull);
+ out_be64(p->regs + 0x0e08, 0x0000000000000000ull);
+ out_be64(p->regs + 0x0e18, 0xffffffffffffffffull);
+ out_be64(p->regs + 0x0e28, 0x0000600000000000ull);
+ out_be64(p->regs + 0x0e30, 0xfffffeffff7fff57ull);
+ out_be64(p->regs + 0x0e40, 0x0000000000000000ull);
+ out_be64(p->regs + 0x0e48, 0x0000000000000000ull);
+ out_be64(p->regs + 0x0e50, 0x0000000000000000ull);
+ out_be64(p->regs + 0x0e58, 0x0000000000000000ull);
+
+ /* Init_100..108 - RXE_TCE errors */
+ out_be64(p->regs + 0x0e80, 0xffffffffffffffffull);
+ out_be64(p->regs + 0x0e88, 0x0000000000000000ull);
+ out_be64(p->regs + 0x0e98, 0xffffffffffffffffull);
+ out_be64(p->regs + 0x0ea8, 0x60000000c0000000ull);
+ out_be64(p->regs + 0x0eb0, 0x9faeffaf3fffffffull); /* XXX CAPI has diff. value */
+ out_be64(p->regs + 0x0ec0, 0x0000000000000000ull);
+ out_be64(p->regs + 0x0ec8, 0x0000000000000000ull);
+ out_be64(p->regs + 0x0ed0, 0x0000000000000000ull);
+ out_be64(p->regs + 0x0ed8, 0x0000000000000000ull);
+
+ /* Init_109..117 - RXPHB errors */
+ out_be64(p->regs + 0x0c80, 0xffffffffffffffffull);
+ out_be64(p->regs + 0x0c88, 0x0000000000000000ull);
+ out_be64(p->regs + 0x0c98, 0xffffffffffffffffull);
+ out_be64(p->regs + 0x0ca8, 0x0000004000000000ull);
+ out_be64(p->regs + 0x0cb0, 0x35777033ff000000ull); /* XXX CAPI has diff. value */
+ out_be64(p->regs + 0x0cc0, 0x0000000000000000ull);
+ out_be64(p->regs + 0x0cc8, 0x0000000000000000ull);
+ out_be64(p->regs + 0x0cd0, 0x0000000000000000ull);
+ out_be64(p->regs + 0x0cd8, 0x0000000000000000ull);
+
+ /* Init_118..121 - LEM */
+ out_be64(p->regs + 0x0c00, 0x0000000000000000ull);
+ if (phb4_is_dd20(p)) {
+ out_be64(p->regs + 0x0c30, 0xf3ffffffffffffffull);
+ out_be64(p->regs + 0x0c38, 0xf3ffffffffffffffull);
+ } else {
+ out_be64(p->regs + 0x0c30, 0xffffffffffffffffull);
+ out_be64(p->regs + 0x0c38, 0xffffffffffffffffull);
+ }
+ out_be64(p->regs + 0x0c40, 0x0000000000000000ull);
+}
+
+
+static bool phb4_wait_dlp_reset(struct phb4 *p)
+{
+ unsigned int i;
+ uint64_t val;
+
+ /*
+ * Firmware cannot access the UTL core regs or PCI config space
+ * until the cores are out of DL_PGRESET.
+ * DL_PGRESET should be polled until it is inactive with a value
+ * of '0'. The recommended polling frequency is once every 1ms.
+ * Firmware should make at least 200 polling attempts before
+ * giving up.
+ * MMIO stores to the link are silently dropped by the UTL core if
+ * the link is down.
+ * MMIO loads to the link will be dropped by the UTL core and will
+ * eventually time out, returning an all-ones response if the
+ * link is down.
+ */
+#define DLP_RESET_ATTEMPTS 200
+
+ PHBDBG(p, "Waiting for DLP PG reset to complete...\n");
+ for (i = 0; i < DLP_RESET_ATTEMPTS; i++) {
+ val = in_be64(p->regs + PHB_PCIE_DLP_TRAIN_CTL);
+ if (!(val & PHB_PCIE_DLP_DL_PGRESET))
+ break;
+ time_wait_ms(1);
+ }
+ if (val & PHB_PCIE_DLP_DL_PGRESET) {
+ PHBERR(p, "Timeout waiting for DLP PG reset !\n");
+ return false;
+ }
+ return true;
+}
+
+static void phb4_init_hw(struct phb4 *p)
+{
+ uint64_t val, creset;
+
+ PHBDBG(p, "Initializing PHB...\n");
+
+ /* Init_1 - Sync reset
+ *
+ * At this point we assume the PHB has already been reset.
+ */
+
+ /* Init_2 - Mask FIRs */
+ out_be64(p->regs + PHB_LEM_ERROR_MASK, 0xffffffffffffffffull);
+
+ /* Init_3 - TCE tag enable */
+ out_be64(p->regs + PHB_TCE_TAG_ENABLE, 0xffffffffffffffffull);
+
+ /* Init_4 - PCIE System Configuration Register
+ *
+ * Adjust max speed based on system config
+ */
+ val = in_be64(p->regs + PHB_PCIE_SCR);
+ PHBDBG(p, "Default system config: 0x%016llx\n", val);
+ val = SETFIELD(PHB_PCIE_SCR_MAXLINKSPEED, val, p->max_link_speed);
+ out_be64(p->regs + PHB_PCIE_SCR, val);
+ PHBDBG(p, "New system config : 0x%016llx\n",
+ in_be64(p->regs + PHB_PCIE_SCR));
+
+ /* Init_5 - deassert CFG reset */
+ creset = in_be64(p->regs + PHB_PCIE_CRESET);
+ PHBDBG(p, "Initial PHB CRESET is 0x%016llx\n", creset);
+ creset &= ~PHB_PCIE_CRESET_CFG_CORE;
+ out_be64(p->regs + PHB_PCIE_CRESET, creset);
+
+ /* Init_6..13 - PCIE DLP Lane EQ control */
+ if (p->lane_eq) {
+ out_be64(p->regs + PHB_PCIE_LANE_EQ_CNTL0, be64_to_cpu(p->lane_eq[0]));
+ out_be64(p->regs + PHB_PCIE_LANE_EQ_CNTL1, be64_to_cpu(p->lane_eq[1]));
+ out_be64(p->regs + PHB_PCIE_LANE_EQ_CNTL2, be64_to_cpu(p->lane_eq[2]));
+ out_be64(p->regs + PHB_PCIE_LANE_EQ_CNTL3, be64_to_cpu(p->lane_eq[3]));
+ out_be64(p->regs + PHB_PCIE_LANE_EQ_CNTL40, be64_to_cpu(p->lane_eq[4]));
+ out_be64(p->regs + PHB_PCIE_LANE_EQ_CNTL41, be64_to_cpu(p->lane_eq[5]));
+ if (is_phb5()) {
+ out_be64(p->regs + PHB_PCIE_LANE_EQ_CNTL50, be64_to_cpu(p->lane_eq[6]));
+ out_be64(p->regs + PHB_PCIE_LANE_EQ_CNTL51, be64_to_cpu(p->lane_eq[7]));
+ }
+ }
+ if (!p->lane_eq_en) {
+ /* Read-modify-write to set the two EQ bypass bits */
+ PHBDBG(p, "LINK: Disabling Lane EQ\n");
+ val = in_be64(p->regs + PHB_PCIE_DLP_CTL);
+ val |= PHB_PCIE_DLP_CTL_BYPASS_PH2 | PHB_PCIE_DLP_CTL_BYPASS_PH3;
+ out_be64(p->regs + PHB_PCIE_DLP_CTL, val);
+ }
+
+ if (is_phb5()) {
+ /* disable scaled flow control for now. SW527785 */
+ PHBDBG(p, "LINK: Disabling scaled flow control\n");
+ val = in_be64(p->regs + PHB_PCIE_DLP_CTL);
+ val |= PHB_PCIE_DLP_CTL_SFC_DISABLE;
+ out_be64(p->regs + PHB_PCIE_DLP_CTL, val);
+
+ /* lane equalization settings need to be tuned on P10 */
+ out_be64(p->regs + PHB_PCIE_PDL_PHY_EQ_CNTL,
+ 0x80F4FFFFFF0F9C00);
+ }
+
+ /* Init_14 - Clear link training */
+ phb4_pcicfg_write32(&p->phb, 0, 0x78,
+ 0x07FE0000 | p->max_link_speed);
+
+ /* Init_15 - deassert cores reset */
+ /*
+ * Lift the PHB resets but not PERST; that will be lifted
+ * later by the initial PERST state machine.
+ */
+ creset &= ~(PHB_PCIE_CRESET_TLDLP | PHB_PCIE_CRESET_PBL);
+ creset |= PHB_PCIE_CRESET_PIPE_N;
+ out_be64(p->regs + PHB_PCIE_CRESET, creset);
+
+ /* Init_16 - Wait for DLP PGRESET to clear */
+ if (!phb4_wait_dlp_reset(p))
+ goto failed;
+
+ /* Init_17 - PHB Control */
+ val = PHB_CTRLR_IRQ_PGSZ_64K;
+ val |= PHB_CTRLR_TCE_CLB_DISABLE; // HW557787 circumvention
+ val |= SETFIELD(PHB_CTRLR_TVT_ADDR_SEL, 0ull, TVT_2_PER_PE);
+ if (phb_pq_disable(p))
+ val |= PHB_CTRLR_IRQ_PQ_DISABLE;
+ if (phb_abt_mode(p))
+ val |= PHB_CTRLR_IRQ_ABT_MODE;
+ if (phb_can_store_eoi(p)) {
+ val |= PHB_CTRLR_IRQ_STORE_EOI;
+ PHBDBG(p, "store EOI is enabled\n");
+ }
+
+ if (!pci_eeh_mmio)
+ val |= PHB_CTRLR_MMIO_EEH_DISABLE;
+
+ out_be64(p->regs + PHB_CTRLR, val);
+
+ /* Init_18..41 - Architected IODA3 inits */
+ phb4_init_ioda3(p);
+
+ /* Init_42..45 - Clear DLP error logs */
+ out_be64(p->regs + 0x1aa0, 0xffffffffffffffffull);
+ out_be64(p->regs + 0x1aa8, 0xffffffffffffffffull);
+ out_be64(p->regs + 0x1ab0, 0xffffffffffffffffull);
+ out_be64(p->regs + 0x1ab8, 0x0);
+
+
+ /* Init_46..54 : Init root complex config space */
+ if (!phb4_init_rc_cfg(p))
+ goto failed;
+
+ /* Init_55..121 : Setup error registers */
+ phb4_init_errors(p);
+
+ /* Init_122..123 : Wait for link
+ * NOTE: At this point the spec waits for the link to come up. We
+ * don't bother as we are doing a PERST soon.
+ */
+
+ /* Init_124 : NBW. XXX TODO */
+ /* See enable_capi_mode() */
+
+ /* Init_125 : Setup PCI command/status on root complex
+ * It's unclear why the spec does this now and not earlier, so to be
+ * sure to get it right we might want to move it to the freset state
+ * machine, though the generic PCI layer will probably do this anyway
+ * (ie, enable MEM, etc... in the RC).
+ */
+ phb4_pcicfg_write16(&p->phb, 0, PCI_CFG_CMD,
+ PCI_CFG_CMD_MEM_EN |
+ PCI_CFG_CMD_BUS_MASTER_EN);
+
+ /* Clear errors */
+ phb4_pcicfg_write16(&p->phb, 0, PCI_CFG_STAT,
+ PCI_CFG_STAT_SENT_TABORT |
+ PCI_CFG_STAT_RECV_TABORT |
+ PCI_CFG_STAT_RECV_MABORT |
+ PCI_CFG_STAT_SENT_SERR |
+ PCI_CFG_STAT_RECV_PERR);
+
+ /* Init_126..130 - Re-enable error interrupts */
+ phb4_int_unmask_all(p);
+
+ /* Init_131 - Re-enable LEM error mask */
+ out_be64(p->regs + PHB_LEM_ERROR_MASK, 0x0000000000000000ull);
+
+
+ /* Init_132 - Enable DMA address speculation */
+ out_be64(p->regs + PHB_TCE_SPEC_CTL, 0x0000000000000000ull);
+
+ /* Init_133 - Timeout Control Register 1 */
+ out_be64(p->regs + PHB_TIMEOUT_CTRL1, 0x0015150000150000ull);
+
+ /* Init_134 - Timeout Control Register 2 */
+ out_be64(p->regs + PHB_TIMEOUT_CTRL2, 0x0000151500000000ull);
+
+ /* Init_135 - PBL Timeout Control Register */
+ out_be64(p->regs + PHB_PBL_TIMEOUT_CTRL, 0x2013000000000000ull);
+
+ /* Mark the PHB as functional which enables all the various sequences */
+ p->broken = false;
+
+ PHBDBG(p, "Initialization complete\n");
+
+ return;
+
+ failed:
+ PHBERR(p, "Initialization failed\n");
+ p->broken = true;
+}
+
+/* FIXME: Use SCOMs rather than MMIO in case we are fenced */
+static bool phb4_read_capabilities(struct phb4 *p)
+{
+ uint64_t val;
+
+ /* XXX Should make sure ETU is out of reset ! */
+
+ /* Grab version and fit it in an int */
+ val = phb4_read_reg_asb(p, PHB_VERSION);
+ if (val == 0 || val == 0xffffffffffffffffUL) {
+ PHBERR(p, "Failed to read version, PHB appears broken\n");
+ return false;
+ }
+
+ p->rev = ((val >> 16) & 0x00ff0000) | (val & 0xffff);
+ PHBDBG(p, "Core revision 0x%x\n", p->rev);
+
+ /* Read EEH capabilities */
+ val = in_be64(p->regs + PHB_PHB4_EEH_CAP);
+ if (val == 0xffffffffffffffffUL) {
+ PHBERR(p, "Failed to read EEH cap, PHB appears broken\n");
+ return false;
+ }
+ p->max_num_pes = val >> 52;
+ if (p->max_num_pes >= 512) {
+ p->mrt_size = 16;
+ p->mbt_size = 32;
+ p->tvt_size = 1024;
+ } else {
+ p->mrt_size = 8;
+ p->mbt_size = 16;
+ p->tvt_size = 512;
+ }
+
+ val = in_be64(p->regs + PHB_PHB4_IRQ_CAP);
+ if (val == 0xffffffffffffffffUL) {
+ PHBERR(p, "Failed to read IRQ cap, PHB appears broken\n");
+ return false;
+ }
+ p->num_irqs = val & 0xffff;
+
+ /* This works for 512 PEs. FIXME calculate for any hardware
+ * size returned above
+ */
+ p->tbl_peltv_size = PELTV_TABLE_SIZE_MAX;
+
+ p->tbl_pest_size = p->max_num_pes*16;
+
+ PHBDBG(p, "Found %d max PEs and %d IRQs \n",
+ p->max_num_pes, p->num_irqs);
+
+ return true;
+}
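+
+/*
+ * Illustrative example (actual values depend on the hardware): if
+ * PHB_PHB4_EEH_CAP reads back with 0x200 in its top 12 bits, then
+ * max_num_pes = 0x200 = 512, which selects the larger geometry above
+ * (mrt_size 16, mbt_size 32, tvt_size 1024) and a PEST table of
+ * 512 * 16 = 8KB.
+ */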
+
+static void phb4_allocate_tables(struct phb4 *p)
+{
+ uint32_t i;
+
+ /* XXX Our current memalign implementation sucks,
+ *
+ * It will do the job; however, it doesn't support freeing
+ * the memory and it wastes space by always allocating twice
+ * as much as requested (size + alignment).
+ */
+ p->tbl_rtt = local_alloc(p->chip_id, RTT_TABLE_SIZE, RTT_TABLE_SIZE);
+ assert(p->tbl_rtt);
+ for (i = 0; i < RTT_TABLE_ENTRIES; i++)
+ p->tbl_rtt[i] = cpu_to_be16(PHB4_RESERVED_PE_NUM(p));
+
+ p->tbl_peltv = local_alloc(p->chip_id, p->tbl_peltv_size, p->tbl_peltv_size);
+ assert(p->tbl_peltv);
+ memset(p->tbl_peltv, 0, p->tbl_peltv_size);
+
+ p->tbl_pest = (uint64_t)local_alloc(p->chip_id, p->tbl_pest_size, p->tbl_pest_size);
+ assert(p->tbl_pest);
+ memset((void *)p->tbl_pest, 0, p->tbl_pest_size);
+}
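+
+/*
+ * Note (for illustration): every RTT entry starts out pointing at the
+ * reserved PE number, so any RID that has not yet been assigned via
+ * phb4_set_pe() is funnelled into the reserved PE rather than into a
+ * random one.
+ */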
+
+static void phb4_add_properties(struct phb4 *p)
+{
+ struct dt_node *np = p->phb.dt_node;
+ uint32_t lsibase, icsp = get_ics_phandle();
+ uint64_t m32b, m64b, m64s;
+
+ /* Add various properties that HB doesn't have to
+ * add, some of them simply because they result from
+ * policy decisions made in skiboot rather than in HB,
+ * such as the MMIO windows going to PCI, interrupts,
+ * etc...
+ */
+ dt_add_property_cells(np, "#address-cells", 3);
+ dt_add_property_cells(np, "#size-cells", 2);
+ dt_add_property_cells(np, "#interrupt-cells", 1);
+ dt_add_property_cells(np, "bus-range", 0, 0xff);
+ dt_add_property_cells(np, "clock-frequency", 0x200, 0); /* ??? */
+
+ dt_add_property_cells(np, "interrupt-parent", icsp);
+
+ /* XXX FIXME: add slot-name */
+ //dt_property_cell("bus-width", 8); /* Figure it out from VPD ? */
+
+ /* "ranges", we only expose M32 (PHB4 doesn't do IO)
+ *
+ * Note: The kernel expects us to have chopped off 64k from the
+ * M32 size (for the 32-bit MSIs). If we don't do that, it will
+ * get confused (OPAL does it)
+ */
+ m32b = cleanup_addr(p->mm1_base);
+ m64b = cleanup_addr(p->mm0_base);
+ m64s = p->mm0_size;
+ dt_add_property_cells(np, "ranges",
+ /* M32 space */
+ 0x02000000, 0x00000000, M32_PCI_START,
+ hi32(m32b), lo32(m32b), 0, M32_PCI_SIZE - 0x10000);
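+
+ /*
+ * For reference, the cells above follow the standard PCI "ranges"
+ * layout implied by the #address-cells/#size-cells values set
+ * earlier: a 3-cell PCI address whose first cell 0x02000000 marks
+ * 32-bit memory space, a 2-cell CPU address (hi32/lo32 of m32b) and
+ * a 2-cell size, here M32_PCI_SIZE minus the 64k reserved for the
+ * 32-bit MSIs as noted above.
+ */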
+
+ /* XXX FIXME: add opal-memwin32, dmawins, etc... */
+ dt_add_property_u64s(np, "ibm,opal-m64-window", m64b, m64b, m64s);
+ dt_add_property(np, "ibm,opal-single-pe", NULL, 0);
+ dt_add_property_cells(np, "ibm,opal-num-pes", p->num_pes);
+ dt_add_property_cells(np, "ibm,opal-reserved-pe",
+ PHB4_RESERVED_PE_NUM(p));
+ dt_add_property_cells(np, "ibm,opal-msi-ranges",
+ p->base_msi, p->num_irqs - 8);
+ /* M64 ranges start at 1 as MBT0 is used for M32 */
+ dt_add_property_cells(np, "ibm,opal-available-m64-ranges",
+ 1, p->mbt_size - 1);
+ dt_add_property_cells(np, "ibm,supported-tce-sizes",
+ 12, // 4K
+ 16, // 64K
+ 21, // 2M
+ 30); // 1G
+
+ /* Tell Linux about alignment limits for segment splits.
+ *
+ * XXX We currently only expose splits of 1 and "num PEs",
+ */
+ dt_add_property_cells(np, "ibm,opal-m64-segment-splits",
+ /* Full split, number of segments: */
+ p->num_pes,
+ /* Encoding passed to the enable call */
+ OPAL_ENABLE_M64_SPLIT,
+ /* Alignment/size restriction in #bits */
+ /* XXX VERIFY VALUE */
+ 12,
+ /* Unused */
+ 0,
+ /* single PE, number of segments: */
+ 1,
+ /* Encoding passed to the enable call */
+ OPAL_ENABLE_M64_NON_SPLIT,
+ /* Alignment/size restriction in #bits */
+ /* XXX VERIFY VALUE */
+ 12,
+ /* Unused */
+ 0);
+
+ /* The interrupt maps will be generated in the RC node by the
+ * PCI code based on the content of this structure:
+ */
+ lsibase = p->base_lsi;
+ p->phb.lstate.int_size = 2;
+ p->phb.lstate.int_val[0][0] = lsibase + PHB4_LSI_PCIE_INTA;
+ p->phb.lstate.int_val[0][1] = 1;
+ p->phb.lstate.int_val[1][0] = lsibase + PHB4_LSI_PCIE_INTB;
+ p->phb.lstate.int_val[1][1] = 1;
+ p->phb.lstate.int_val[2][0] = lsibase + PHB4_LSI_PCIE_INTC;
+ p->phb.lstate.int_val[2][1] = 1;
+ p->phb.lstate.int_val[3][0] = lsibase + PHB4_LSI_PCIE_INTD;
+ p->phb.lstate.int_val[3][1] = 1;
+ p->phb.lstate.int_parent[0] = icsp;
+ p->phb.lstate.int_parent[1] = icsp;
+ p->phb.lstate.int_parent[2] = icsp;
+ p->phb.lstate.int_parent[3] = icsp;
+
+ /* Indicators for variable tables */
+ dt_add_property_cells(np, "ibm,opal-rtt-table",
+ hi32((u64) p->tbl_rtt), lo32((u64) p->tbl_rtt), RTT_TABLE_SIZE);
+
+ dt_add_property_cells(np, "ibm,opal-peltv-table",
+ hi32((u64) p->tbl_peltv), lo32((u64) p->tbl_peltv),
+ p->tbl_peltv_size);
+
+ dt_add_property_cells(np, "ibm,opal-pest-table",
+ hi32(p->tbl_pest), lo32(p->tbl_pest), p->tbl_pest_size);
+
+ dt_add_property_cells(np, "ibm,phb-diag-data-size",
+ sizeof(struct OpalIoPhb4ErrorData));
+
+ /* Indicate to Linux that CAPP timebase sync is supported */
+ dt_add_property_string(np, "ibm,capp-timebase-sync", NULL);
+
+ /* Tell Linux Compare/Mask indication values */
+ dt_add_property_cells(np, "ibm,phb-indications", CAPIIND, ASNIND,
+ NBWIND);
+}
+
+static bool phb4_calculate_windows(struct phb4 *p)
+{
+ const struct dt_property *prop;
+
+ /* Get PBCQ MMIO windows from device-tree */
+ prop = dt_require_property(p->phb.dt_node,
+ "ibm,mmio-windows", -1);
+ assert(prop->len >= (2 * sizeof(uint64_t)));
+
+ p->mm0_base = dt_property_get_u64(prop, 0);
+ p->mm0_size = dt_property_get_u64(prop, 1);
+ if (prop->len > 16) {
+ p->mm1_base = dt_property_get_u64(prop, 2);
+ p->mm1_size = dt_property_get_u64(prop, 3);
+ }
+
+ /* Sort them so that 0 is big and 1 is small */
+ if (p->mm1_size && p->mm1_size > p->mm0_size) {
+ uint64_t b = p->mm0_base;
+ uint64_t s = p->mm0_size;
+ p->mm0_base = p->mm1_base;
+ p->mm0_size = p->mm1_size;
+ p->mm1_base = b;
+ p->mm1_size = s;
+ }
+
+ /* If 1 is too small, ditch it */
+ if (p->mm1_size < M32_PCI_SIZE)
+ p->mm1_size = 0;
+
+ /* If 1 doesn't exist, carve it out of 0 */
+ if (p->mm1_size == 0) {
+ p->mm0_size /= 2;
+ p->mm1_base = p->mm0_base + p->mm0_size;
+ p->mm1_size = p->mm0_size;
+ }
+
+ /* Crop mm1 to our desired size */
+ if (p->mm1_size > M32_PCI_SIZE)
+ p->mm1_size = M32_PCI_SIZE;
+
+ return true;
+}
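+
+/*
+ * Illustrative example (window sizes are platform dependent): given a
+ * single 512GB window from the device-tree, the code above carves it in
+ * two, keeping the lower 256GB as mm0 (M64) and the upper 256GB as mm1,
+ * then crops mm1 down to M32_PCI_SIZE for the M32 space.
+ */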
+
+static void phb4_err_interrupt(struct irq_source *is, uint32_t isn)
+{
+ struct phb4 *p = is->data;
+
+ PHBDBG(p, "Got interrupt 0x%08x\n", isn);
+
+ /* mask the interrupt conditions to prevent it from re-firing */
+ phb4_int_mask_active(p);
+
+ /* Update pending event */
+ opal_update_pending_evt(OPAL_EVENT_PCI_ERROR,
+ OPAL_EVENT_PCI_ERROR);
+
+ /* If the PHB is broken, go away */
+ if (p->broken)
+ return;
+
+ /*
+ * Mark the PHB as having a pending error so that the OS
+ * can handle it at a later point.
+ */
+ phb4_set_err_pending(p, true);
+}
+
+static uint64_t phb4_lsi_attributes(struct irq_source *is __unused,
+ uint32_t isn __unused)
+{
+#ifndef DISABLE_ERR_INTS
+ struct phb4 *p = is->data;
+ uint32_t idx = isn - p->base_lsi;
+
+ if (idx == PHB4_LSI_PCIE_INF || idx == PHB4_LSI_PCIE_ER)
+ return IRQ_ATTR_TARGET_OPAL | IRQ_ATTR_TARGET_RARE | IRQ_ATTR_TYPE_LSI;
+#endif
+ return IRQ_ATTR_TARGET_LINUX;
+}
+
+static char *phb4_lsi_name(struct irq_source *is, uint32_t isn)
+{
+ struct phb4 *p = is->data;
+ uint32_t idx = isn - p->base_lsi;
+ char buf[32];
+
+ if (idx == PHB4_LSI_PCIE_INF)
+ snprintf(buf, 32, "phb#%04x-inf", p->phb.opal_id);
+ else if (idx == PHB4_LSI_PCIE_ER)
+ snprintf(buf, 32, "phb#%04x-err", p->phb.opal_id);
+ else
+ assert(0); /* PCIe LSIs should never be directed to OPAL */
+
+ return strdup(buf);
+}
+
+static const struct irq_source_ops phb4_lsi_ops = {
+ .interrupt = phb4_err_interrupt,
+ .attributes = phb4_lsi_attributes,
+ .name = phb4_lsi_name,
+};
+
+static __be64 lane_eq_default[8] = {
+ CPU_TO_BE64(0x5454545454545454UL), CPU_TO_BE64(0x5454545454545454UL),
+ CPU_TO_BE64(0x5454545454545454UL), CPU_TO_BE64(0x5454545454545454UL),
+ CPU_TO_BE64(0x7777777777777777UL), CPU_TO_BE64(0x7777777777777777UL),
+ CPU_TO_BE64(0x7777777777777777UL), CPU_TO_BE64(0x7777777777777777UL),
+};
+
+static __be64 lane_eq_phb5_default[8] = {
+ CPU_TO_BE64(0x4444444444444444UL), CPU_TO_BE64(0x4444444444444444UL),
+ CPU_TO_BE64(0x4444444444444444UL), CPU_TO_BE64(0x4444444444444444UL),
+ CPU_TO_BE64(0x4444444444444444UL), CPU_TO_BE64(0x4444444444444444UL),
+ CPU_TO_BE64(0x9999999999999999UL), CPU_TO_BE64(0x9999999999999999UL),
+};
+
+static void phb4_create(struct dt_node *np)
+{
+ const struct dt_property *prop;
+ struct phb4 *p;
+ struct pci_slot *slot;
+ size_t lane_eq_len, lane_eq_len_req;
+ struct dt_node *iplp;
+ char *path;
+ uint32_t irq_base, irq_flags;
+ int i, eq_reg_count;
+ int chip_id;
+
+ chip_id = dt_prop_get_u32(np, "ibm,chip-id");
+ p = local_alloc(chip_id, sizeof(struct phb4), 8);
+ assert(p);
+ memset(p, 0x0, sizeof(struct phb4));
+
+ /* Populate base stuff */
+ p->index = dt_prop_get_u32(np, "ibm,phb-index");
+ p->chip_id = chip_id;
+ p->pec = dt_prop_get_u32(np, "ibm,phb-pec-index");
+ p->regs = (void *)dt_get_address(np, 0, NULL);
+ p->int_mmio = (void *)dt_get_address(np, 1, NULL);
+ p->phb.dt_node = np;
+ p->phb.ops = &phb4_ops;
+ p->phb.phb_type = phb_type_pcie_v4;
+ p->phb.scan_map = 0x1; /* Only device 0 to scan */
+
+ if (!phb4_calculate_windows(p))
+ return;
+
+ /* Get the various XSCOM register bases from the device-tree */
+ prop = dt_require_property(np, "ibm,xscom-bases", 5 * sizeof(uint32_t));
+ p->pe_xscom = dt_property_get_cell(prop, 0);
+ p->pe_stk_xscom = dt_property_get_cell(prop, 1);
+ p->pci_xscom = dt_property_get_cell(prop, 2);
+ p->pci_stk_xscom = dt_property_get_cell(prop, 3);
+ p->etu_xscom = dt_property_get_cell(prop, 4);
+
+ /*
+ * We skip the initial PERST assertion requested by the generic code
+ * when doing a cold boot because we are coming out of a cold boot
+ * already, which saves boot time. The PERST state machine will still
+ * handle waiting for the link to come up; it will just avoid actually
+ * asserting & deasserting the PERST output.
+ *
+ * For a hot IPL, we still do a PERST.
+ *
+ * Note: In the absence of the property (ie, FSP-less), we stick to the
+ * old behaviour and set skip_perst to true.
+ */
+ p->skip_perst = true; /* Default */
+
+ iplp = dt_find_by_path(dt_root, "ipl-params/ipl-params");
+ if (iplp) {
+ const char *ipl_type = dt_prop_get_def(iplp, "cec-major-type", NULL);
+ if (ipl_type && (!strcmp(ipl_type, "hot")))
+ p->skip_perst = false;
+ }
+
+ /* By default link is assumed down */
+ p->has_link = false;
+
+ /* We register the PHB before we initialize it so we
+ * get a useful OPAL ID for it
+ */
+ pci_register_phb(&p->phb, phb4_get_opal_id(p->chip_id, p->index));
+
+ /* Create slot structure */
+ slot = phb4_slot_create(&p->phb);
+ if (!slot)
+ PHBERR(p, "Cannot create PHB slot\n");
+
+ /* Hello ! */
+ path = dt_get_path(np);
+ PHBINF(p, "Found %s @%p\n", path, p->regs);
+ PHBINF(p, " M32 [0x%016llx..0x%016llx]\n",
+ p->mm1_base, p->mm1_base + p->mm1_size - 1);
+ PHBINF(p, " M64 [0x%016llx..0x%016llx]\n",
+ p->mm0_base, p->mm0_base + p->mm0_size - 1);
+ free(path);
+
+ /* Find base location code from root node */
+ p->phb.base_loc_code = dt_prop_get_def(dt_root,
+ "ibm,io-base-loc-code", NULL);
+ if (!p->phb.base_loc_code)
+ PHBDBG(p, "Base location code not found !\n");
+
+ /*
+ * Grab CEC IO VPD load info from the root of the device-tree;
+ * on P8 there's a single such VPD for the whole machine.
+ */
+ prop = dt_find_property(dt_root, "ibm,io-vpd");
+ if (!prop) {
+ /* LX VPD Lid not already loaded */
+ if (platform.vpd_iohub_load)
+ platform.vpd_iohub_load(dt_root);
+ }
+
+ /* Obtain information about the PHB from the hardware directly */
+ if (!phb4_read_capabilities(p))
+ goto failed;
+
+ p->max_link_speed = phb4_get_max_link_speed(p, np);
+ p->max_link_width = phb4_get_max_link_width(p);
+ PHBINF(p, "Max link speed: GEN%i, max link width %i\n",
+ p->max_link_speed, p->max_link_width);
+
+ /* Check for lane equalization values from HB or HDAT */
+ p->lane_eq_en = true;
+ p->lane_eq = dt_prop_get_def_size(np, "ibm,lane-eq", NULL, &lane_eq_len);
+ if (is_phb5())
+ eq_reg_count = 8;
+ else
+ eq_reg_count = 6;
+ lane_eq_len_req = eq_reg_count * 8;
+ if (p->lane_eq) {
+ if (lane_eq_len < lane_eq_len_req) {
+ PHBERR(p, "Device-tree has ibm,lane-eq too short: %ld"
+ " (want %ld)\n", lane_eq_len, lane_eq_len_req);
+ p->lane_eq = NULL;
+ }
+ } else {
+ PHBDBG(p, "Using default lane equalization settings\n");
+ if (is_phb5())
+ p->lane_eq = lane_eq_phb5_default;
+ else
+ p->lane_eq = lane_eq_default;
+ }
+ if (p->lane_eq) {
+ PHBDBG(p, "Override lane equalization settings:\n");
+ for (i = 0 ; i < lane_eq_len_req/(8 * 2) ; i++)
+ PHBDBG(p, " 0x%016llx 0x%016llx\n",
+ be64_to_cpu(p->lane_eq[2 * i]),
+ be64_to_cpu(p->lane_eq[2 * i + 1]));
+ }
+
+ /* Allocate a block of interrupts. We need to know if it needs
+ * 2K or 4K interrupts ... for now we just use 4K, but that
+ * needs to be fixed.
+ */
+ if (is_phb5())
+ irq_base = xive2_alloc_hw_irqs(p->chip_id, p->num_irqs, p->num_irqs);
+ else
+ irq_base = xive_alloc_hw_irqs(p->chip_id, p->num_irqs, p->num_irqs);
+ if (irq_base == XIVE_IRQ_ERROR) {
+ PHBERR(p, "Failed to allocate %d interrupt sources\n",
+ p->num_irqs);
+ goto failed;
+ }
+ p->base_msi = irq_base;
+ p->base_lsi = irq_base + p->num_irqs - 8;
+ p->num_pes = p->max_num_pes;
+
+ /* Allocate the SkiBoot internal in-memory tables for the PHB */
+ phb4_allocate_tables(p);
+
+ phb4_add_properties(p);
+
+ /* Clear IODA3 cache */
+ phb4_init_ioda_cache(p);
+
+ /* Get the HW up and running */
+ phb4_init_hw(p);
+
+ /* init capp that might get attached to the phb */
+ if (is_phb4())
+ phb4_init_capp(p);
+
+ /* Compute XIVE source flags depending on PHB revision */
+ irq_flags = 0;
+ if (phb_can_store_eoi(p))
+ irq_flags |= XIVE_SRC_STORE_EOI;
+ else
+ irq_flags |= XIVE_SRC_TRIGGER_PAGE;
+
+ if (is_phb5()) {
+ /*
+ * Register sources with XIVE. If offloading is on, use the
+ * ESB pages of the XIVE IC for the MSI sources instead of the
+ * ESB pages of the PHB.
+ */
+ if (phb_pq_disable(p) || phb_abt_mode(p)) {
+ xive2_register_esb_source(p->base_msi, p->num_irqs - 8);
+ } else {
+ xive2_register_hw_source(p->base_msi,
+ p->num_irqs - 8, 16,
+ p->int_mmio, irq_flags,
+ NULL, NULL);
+ }
+
+ /*
+ * LSI sources always use the ESB pages of the PHB.
+ */
+ xive2_register_hw_source(p->base_lsi, 8, 16,
+ p->int_mmio + ((p->num_irqs - 8) << 16),
+ XIVE_SRC_LSI | irq_flags, p, &phb4_lsi_ops);
+ } else {
+ /* Register all interrupt sources with XIVE */
+ xive_register_hw_source(p->base_msi, p->num_irqs - 8, 16,
+ p->int_mmio, irq_flags, NULL, NULL);
+
+ xive_register_hw_source(p->base_lsi, 8, 16,
+ p->int_mmio + ((p->num_irqs - 8) << 16),
+ XIVE_SRC_LSI, p, &phb4_lsi_ops);
+ }
+
+ /* Platform additional setup */
+ if (platform.pci_setup_phb)
+ platform.pci_setup_phb(&p->phb, p->index);
+
+ dt_add_property_string(np, "status", "okay");
+
+ return;
+
+ failed:
+ p->broken = true;
+
+ /* Tell Linux it's broken */
+ dt_add_property_string(np, "status", "error");
+}
+
+static void phb4_probe_stack(struct dt_node *stk_node, uint32_t pec_index,
+ uint32_t nest_base, uint32_t pci_base)
+{
+ enum phys_map_type phys_mmio64, phys_mmio32, phys_xive_esb, phys_reg_spc;
+ uint32_t pci_stack, nest_stack, etu_base, gcid, phb_num, stk_index;
+ uint64_t val, phb_bar = 0, irq_bar = 0, bar_en;
+ uint64_t mmio0_bar = 0, mmio0_bmask, mmio0_sz;
+ uint64_t mmio1_bar = 0, mmio1_bmask, mmio1_sz;
+ void *foo;
+ __be64 mmio_win[4];
+ unsigned int mmio_win_sz;
+ struct dt_node *np;
+ char *path;
+ uint64_t capp_ucode_base;
+ unsigned int max_link_speed;
+ int rc;
+
+ assert(is_phb5() || is_phb4()); /* Sanity check */
+
+ gcid = dt_get_chip_id(stk_node);
+ stk_index = dt_prop_get_u32(stk_node, "reg");
+ phb_num = dt_prop_get_u32(stk_node, "ibm,phb-index");
+ path = dt_get_path(stk_node);
+ if (is_phb5()) {
+ phys_mmio64 = PHB5_64BIT_MMIO;
+ phys_mmio32 = PHB5_32BIT_MMIO;
+ phys_xive_esb = PHB5_XIVE_ESB;
+ phys_reg_spc = PHB5_REG_SPC;
+ prlog(PR_INFO, "PHB: Chip %d Found PHB5 PBCQ%d Stack %d at %s\n",
+ gcid, pec_index, stk_index, path);
+ } else {
+ phys_mmio64 = PHB4_64BIT_MMIO;
+ phys_mmio32 = PHB4_32BIT_MMIO;
+ phys_xive_esb = PHB4_XIVE_ESB;
+ phys_reg_spc = PHB4_REG_SPC;
+ prlog(PR_INFO, "PHB: Chip %d Found PHB4 PBCQ%d Stack %d at %s\n",
+ gcid, pec_index, stk_index, path);
+ }
+ free(path);
+
+ pci_stack = pci_base + 0x40 * (stk_index + 1);
+ nest_stack = nest_base + 0x40 * (stk_index + 1);
+ etu_base = pci_base + 0x100 + 0x40 * stk_index;
+
+ prlog(PR_DEBUG, "PHB[%d:%d] X[PE]=0x%08x/0x%08x X[PCI]=0x%08x/0x%08x X[ETU]=0x%08x\n",
+ gcid, phb_num, nest_base, nest_stack, pci_base, pci_stack, etu_base);
+
+ /* Default BAR enables */
+ bar_en = 0;
+
+ /* Initialize PHB register BAR */
+ phys_map_get(gcid, phys_reg_spc, phb_num, &phb_bar, NULL);
+ rc = xscom_write(gcid, nest_stack + XPEC_NEST_STK_PHB_REG_BAR,
+ phb_bar << 8);
+
+ /* A scom error here probably indicates a defective/garded PHB */
+ if (rc != OPAL_SUCCESS) {
+ prerror("PHB[%d:%d] Unable to set PHB BAR. Error=%d\n",
+ gcid, phb_num, rc);
+ return;
+ }
+
+ bar_en |= XPEC_NEST_STK_BAR_EN_PHB;
+
+ /* Same with INT BAR (ESB) */
+ phys_map_get(gcid, phys_xive_esb, phb_num, &irq_bar, NULL);
+ xscom_write(gcid, nest_stack + XPEC_NEST_STK_IRQ_BAR, irq_bar << 8);
+ bar_en |= XPEC_NEST_STK_BAR_EN_INT;
+
+
+ /* Same with MMIO windows */
+ phys_map_get(gcid, phys_mmio64, phb_num, &mmio0_bar, &mmio0_sz);
+ mmio0_bmask = (~(mmio0_sz - 1)) & 0x00FFFFFFFFFFFFFFULL;
+ xscom_write(gcid, nest_stack + XPEC_NEST_STK_MMIO_BAR0, mmio0_bar << 8);
+ xscom_write(gcid, nest_stack + XPEC_NEST_STK_MMIO_BAR0_MASK, mmio0_bmask << 8);
+
+ phys_map_get(gcid, phys_mmio32, phb_num, &mmio1_bar, &mmio1_sz);
+ mmio1_bmask = (~(mmio1_sz - 1)) & 0x00FFFFFFFFFFFFFFULL;
+ xscom_write(gcid, nest_stack + XPEC_NEST_STK_MMIO_BAR1, mmio1_bar << 8);
+ xscom_write(gcid, nest_stack + XPEC_NEST_STK_MMIO_BAR1_MASK, mmio1_bmask << 8);
+
+ /* Build MMIO windows list */
+ mmio_win_sz = 0;
+ if (mmio0_bar) {
+ mmio_win[mmio_win_sz++] = cpu_to_be64(mmio0_bar);
+ mmio_win[mmio_win_sz++] = cpu_to_be64(mmio0_sz);
+ bar_en |= XPEC_NEST_STK_BAR_EN_MMIO0;
+ }
+ if (mmio1_bar) {
+ mmio_win[mmio_win_sz++] = cpu_to_be64(mmio1_bar);
+ mmio_win[mmio_win_sz++] = cpu_to_be64(mmio1_sz);
+ bar_en |= XPEC_NEST_STK_BAR_EN_MMIO1;
+ }
+
+ /* Set the appropriate enables */
+ xscom_read(gcid, nest_stack + XPEC_NEST_STK_BAR_EN, &val);
+ val |= bar_en;
+ xscom_write(gcid, nest_stack + XPEC_NEST_STK_BAR_EN, val);
+
+ /* No MMIO windows ? Barf ! */
+ if (mmio_win_sz == 0) {
+ prerror("PHB[%d:%d] No MMIO windows enabled !\n", gcid, phb_num);
+ return;
+ }
+
+ /* Clear errors in PFIR and NFIR */
+ xscom_write(gcid, pci_stack + XPEC_PCI_STK_PCI_FIR, 0);
+ xscom_write(gcid, nest_stack + XPEC_NEST_STK_PCI_NFIR, 0);
+
+ /* Check ETU reset */
+ xscom_read(gcid, pci_stack + XPEC_PCI_STK_ETU_RESET, &val);
+ prlog_once(PR_DEBUG, "ETU reset: %llx\n", val);
+ xscom_write(gcid, pci_stack + XPEC_PCI_STK_ETU_RESET, 0);
+ time_wait_ms(1);
+
+ // show we can read phb mmio space
+ foo = (void *)(phb_bar + 0x800); // phb version register
+ prlog_once(PR_DEBUG, "Version reg: 0x%016llx\n", in_be64(foo));
+
+ /* Create PHB node */
+ np = dt_new_addr(dt_root, "pciex", phb_bar);
+ if (!np)
+ return;
+
+ if (is_phb5())
+ dt_add_property_strings(np, "compatible", "ibm,power10-pciex", "ibm,ioda3-phb");
+ else
+ dt_add_property_strings(np, "compatible", "ibm,power9-pciex", "ibm,ioda3-phb");
+ dt_add_property_strings(np, "device_type", "pciex");
+ dt_add_property_u64s(np, "reg",
+ phb_bar, 0x1000,
+ irq_bar, 0x10000000);
+
+ /* Everything else is handled later by skiboot; we just
+ * stick a few hints here.
+ */
+ dt_add_property_cells(np, "ibm,xscom-bases",
+ nest_base, nest_stack, pci_base, pci_stack, etu_base);
+ dt_add_property(np, "ibm,mmio-windows", mmio_win, 8 * mmio_win_sz);
+ dt_add_property_cells(np, "ibm,phb-index", phb_num);
+ dt_add_property_cells(np, "ibm,phb-pec-index", pec_index);
+ dt_add_property_cells(np, "ibm,phb-stack", stk_node->phandle);
+ dt_add_property_cells(np, "ibm,phb-stack-index", stk_index);
+ dt_add_property_cells(np, "ibm,chip-id", gcid);
+
+ /* read the hub-id out of the pbcq node */
+ if (dt_has_node_property(stk_node->parent, "ibm,hub-id", NULL)) {
+ uint32_t hub_id;
+
+ hub_id = dt_prop_get_u32(stk_node->parent, "ibm,hub-id");
+ dt_add_property_cells(np, "ibm,hub-id", hub_id);
+ }
+
+ if (dt_has_node_property(stk_node->parent, "ibm,loc-code", NULL)) {
+ const char *lc = dt_prop_get(stk_node->parent, "ibm,loc-code");
+ dt_add_property_string(np, "ibm,loc-code", lc);
+ }
+ if (dt_has_node_property(stk_node, "ibm,lane-eq", NULL)) {
+ size_t leq_size;
+ const void *leq = dt_prop_get_def_size(stk_node, "ibm,lane-eq",
+ NULL, &leq_size);
+ if (leq != NULL && leq_size >= 6 * 8)
+ dt_add_property(np, "ibm,lane-eq", leq, leq_size);
+ }
+ if (dt_has_node_property(stk_node, "ibm,capp-ucode", NULL)) {
+ capp_ucode_base = dt_prop_get_u32(stk_node, "ibm,capp-ucode");
+ dt_add_property_cells(np, "ibm,capp-ucode", capp_ucode_base);
+ }
+ if (dt_has_node_property(stk_node, "ibm,max-link-speed", NULL)) {
+ max_link_speed = dt_prop_get_u32(stk_node, "ibm,max-link-speed");
+ dt_add_property_cells(np, "ibm,max-link-speed", max_link_speed);
+ }
+ dt_add_property_cells(np, "ibm,capi-flags",
+ OPAL_PHB_CAPI_FLAG_SNOOP_CONTROL);
+
+ add_chip_dev_associativity(np);
+}
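+
+/*
+ * Worked example of the BAR programming above (addresses taken from the
+ * nimbus physical map below, chip 0, PHB 0): phys_map_get() returns
+ * 0x000600c3c0000000 for PHB4_REG_SPC, and the SCOM register is written
+ * with that value shifted left by 8, i.e. 0x0600c3c000000000. Likewise
+ * the 256GB 64-bit MMIO window gives mmio0_bmask =
+ * ~(0x4000000000 - 1) & 0x00FFFFFFFFFFFFFF = 0x00FFFFC000000000.
+ */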
+
+static void phb4_probe_pbcq(struct dt_node *pbcq)
+{
+ uint32_t nest_base, pci_base, pec_index;
+ struct dt_node *stk;
+
+ /* REMOVEME: force this for now until we stabilise PCIe */
+ verbose_eeh = 1;
+
+ nest_base = dt_get_address(pbcq, 0, NULL);
+ pci_base = dt_get_address(pbcq, 1, NULL);
+ pec_index = dt_prop_get_u32(pbcq, "ibm,pec-index");
+
+ dt_for_each_child(pbcq, stk) {
+ if (dt_node_is_enabled(stk))
+ phb4_probe_stack(stk, pec_index, nest_base, pci_base);
+ }
+}
+
+void probe_phb4(void)
+{
+ struct dt_node *np;
+ const char *s;
+
+ pci_eeh_mmio = !nvram_query_eq_dangerous("pci-eeh-mmio", "disabled");
+ pci_retry_all = nvram_query_eq_dangerous("pci-retry-all", "true");
+ s = nvram_query_dangerous("phb-rx-err-max");
+ if (s) {
+ rx_err_max = atoi(s);
+
+ /* Clip to uint8_t used by hardware */
+ rx_err_max = MAX(rx_err_max, 0);
+ rx_err_max = MIN(rx_err_max, 255);
+ }
+
+ if (is_phb5()) {
+ prlog(PR_DEBUG, "PHB5: Maximum RX errors during training: %d\n", rx_err_max);
+ /* Look for PBCQ XSCOM nodes */
+ dt_for_each_compatible(dt_root, np, "ibm,power10-pbcq")
+ phb4_probe_pbcq(np);
+
+ /* Look for newly created PHB nodes */
+ dt_for_each_compatible(dt_root, np, "ibm,power10-pciex")
+ phb4_create(np);
+ } else {
+ prlog(PR_DEBUG, "PHB4: Maximum RX errors during training: %d\n", rx_err_max);
+ /* Look for PBCQ XSCOM nodes */
+ dt_for_each_compatible(dt_root, np, "ibm,power9-pbcq")
+ phb4_probe_pbcq(np);
+
+ /* Look for newly created PHB nodes */
+ dt_for_each_compatible(dt_root, np, "ibm,power9-pciex")
+ phb4_create(np);
+ }
+}
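+
+/*
+ * Illustrative note (exact tooling is platform dependent): the NVRAM
+ * overrides queried above are usually set from the host, e.g. with
+ * something like "nvram -p ibm,skiboot --update-config phb-rx-err-max=32",
+ * and take effect on the next boot.
+ */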
diff --git a/roms/skiboot/hw/phys-map.c b/roms/skiboot/hw/phys-map.c
new file mode 100644
index 000000000..d6ff99fd8
--- /dev/null
+++ b/roms/skiboot/hw/phys-map.c
@@ -0,0 +1,445 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Physical memory map
+ *
+ * Copyright 2017-2019 IBM Corp.
+ */
+
+#include <phys-map.h>
+#include <chip.h>
+#include <skiboot.h>
+#include <opal-api.h>
+#include <stack.h>
+#include <inttypes.h>
+
+struct phys_map_entry {
+ enum phys_map_type type;
+ int index;
+ uint64_t addr;
+ uint64_t size;
+};
+
+struct phys_map_info {
+ int chip_select_shift;
+ const struct phys_map_entry *table;
+};
+
+static const struct phys_map_info *phys_map;
+
+static const struct phys_map_entry phys_map_table_p10[] = {
+ /* System memory up to 4TB minus GPU memory */
+ { SYSTEM_MEM, 0, 0x0000000000000000ull, 0x0000034000000000ull },
+
+ /* TODO: Figure out GPU memory */
+
+ /* 0 TB offset @ MMIO 0x0006000000000000ull */
+ { PHB5_64BIT_MMIO, 0, 0x0006000000000000ull, 0x0000004000000000ull },
+ { PHB5_64BIT_MMIO, 1, 0x0006004000000000ull, 0x0000004000000000ull },
+ { PHB5_64BIT_MMIO, 2, 0x0006008000000000ull, 0x0000004000000000ull },
+ { PHB5_32BIT_MMIO, 0, 0x000600c000000000ull, 0x0000000080000000ull },
+ { PHB5_32BIT_MMIO, 1, 0x000600c080000000ull, 0x0000000080000000ull },
+ { PHB5_32BIT_MMIO, 2, 0x000600c100000000ull, 0x0000000080000000ull },
+ { PHB5_32BIT_MMIO, 3, 0x000600c180000000ull, 0x0000000080000000ull },
+ { PHB5_32BIT_MMIO, 4, 0x000600c200000000ull, 0x0000000080000000ull },
+ { PHB5_32BIT_MMIO, 5, 0x000600c280000000ull, 0x0000000080000000ull },
+ { PHB5_XIVE_ESB , 0, 0x000600c300000000ull, 0x0000000020000000ull },
+ { PHB5_XIVE_ESB , 1, 0x000600c320000000ull, 0x0000000020000000ull },
+ { PHB5_XIVE_ESB , 2, 0x000600c340000000ull, 0x0000000020000000ull },
+ { PHB5_XIVE_ESB , 3, 0x000600c360000000ull, 0x0000000020000000ull },
+ { PHB5_XIVE_ESB , 4, 0x000600c380000000ull, 0x0000000020000000ull },
+ { PHB5_XIVE_ESB , 5, 0x000600c3a0000000ull, 0x0000000020000000ull },
+ { PHB5_REG_SPC , 0, 0x000600c3c0000000ull, 0x0000000000100000ull },
+ { PHB5_REG_SPC , 1, 0x000600c3c0100000ull, 0x0000000000100000ull },
+ { PHB5_REG_SPC , 2, 0x000600c3c0200000ull, 0x0000000000100000ull },
+ { PHB5_REG_SPC , 3, 0x000600c3c0300000ull, 0x0000000000100000ull },
+ { PHB5_REG_SPC , 4, 0x000600c3c0400000ull, 0x0000000000100000ull },
+ { PHB5_REG_SPC , 5, 0x000600c3c0500000ull, 0x0000000000100000ull },
+ { RESV , 0, 0x000600c3c0600000ull, 0x0000003c3fa00000ull },
+
+ /* 1 TB offset */
+ { RESV , 1, 0x0006010000000000ull, 0x0000010000000000ull },
+
+ /* 2 TB offset */
+ { PHB5_64BIT_MMIO, 3, 0x0006020000000000ull, 0x0000004000000000ull },
+ { PHB5_64BIT_MMIO, 4, 0x0006024000000000ull, 0x0000004000000000ull },
+ { PHB5_64BIT_MMIO, 5, 0x0006028000000000ull, 0x0000004000000000ull },
+ { RESV , 2, 0x000602c000000000ull, 0x0000004000000000ull },
+
+ /* 3 TB offset */
+ { LPC_BUS , 0, 0x0006030000000000ull, 0x0000000100000000ull },
+ { FSP_MMIO , 0, 0x0006030100000000ull, 0x0000000100000000ull },
+ { XIVE_IC , 0, 0x0006030200000000ull, 0x0000000002000000ull },
+ { PSIHB_ESB , 0, 0x0006030202000000ull, 0x0000000000100000ull },
+ { RESV , 3, 0x0006030202100000ull, 0x0000000000f00000ull },
+ { PSIHB_REG , 0, 0x0006030203000000ull, 0x0000000000100000ull },
+ { RESV , 4, 0x0006030203100000ull, 0x0000000000080000ull },
+ { XIVE_TM , 0, 0x0006030203180000ull, 0x0000000000040000ull },
+ { RESV , 5, 0x00060302031c0000ull, 0x0000000000010000ull },
+ { NX_RNG , 0, 0x00060302031d0000ull, 0x0000000000010000ull },
+ { RESV , 6, 0x00060302031e0000ull, 0x0000000004e20000ull },
+ { XIVE_NVC , 0, 0x0006030208000000ull, 0x0000000008000000ull },
+ { RESV , 7, 0x0006030210000000ull, 0x00000000ee000000ull },
+ { VAS_HYP_WIN , 0, 0x00060302fe000000ull, 0x0000000002000000ull },
+ { VAS_USER_WIN , 0, 0x0006030300000000ull, 0x0000000100000000ull },
+
+ /* TODO: MC, OCMB, PAU */
+ { RESV , 8, 0x0006030400000000ull, 0x000000f800000000ull },
+ { XSCOM , 0, 0x000603fc00000000ull, 0x0000000400000000ull },
+
+ /* 4 TB offset */
+ { XIVE_NVPG , 0, 0x0006040000000000ull, 0x0000010000000000ull },
+
+ /* 5 - 7 TB offset */
+ /* for P10 the END and ESB regions are separate in the MMIO
+ * table */
+ { XIVE_ESB , 0, 0x0006050000000000ull, 0x0000010000000000ull },
+ { XIVE_END , 0, 0x0006060000000000ull, 0x0000020000000000ull },
+
+ /* 8 - 13 TB offset */
+ { RESV , 9, 0x0006080000000000ull, 0x0000060000000000ull },
+
+ /* 14 TB offset */
+ { RESV ,10, 0x00060e0000000000ull, 0x0000008000000000ull },
+
+ { NULL_MAP, 0, 0, 0 },
+};
+
+static const struct phys_map_entry phys_map_table_nimbus[] = {
+
+ /* System memory upto 4TB minus GPU memory */
+ { SYSTEM_MEM, 0, 0x0000000000000000ull, 0x0000034000000000ull },
+ /* GPU memory from 4TB - 128GB*GPU */
+ { GPU_MEM_4T_DOWN, 5, 0x0000034000000000ull, 0x0000002000000000ull },
+ { GPU_MEM_4T_DOWN, 4, 0x0000036000000000ull, 0x0000002000000000ull },
+ { GPU_MEM_4T_DOWN, 3, 0x0000038000000000ull, 0x0000002000000000ull },
+ { GPU_MEM_4T_DOWN, 2, 0x000003a000000000ull, 0x0000002000000000ull },
+ { GPU_MEM_4T_DOWN, 1, 0x000003c000000000ull, 0x0000002000000000ull },
+ { GPU_MEM_4T_DOWN, 0, 0x000003e000000000ull, 0x0000002000000000ull },
+ /* GPU memory from 4TB + 128GB*GPU. 4 GPUs only */
+ { GPU_MEM_4T_UP, 0, 0x0000040000000000ull, 0x0000002000000000ull },
+ { GPU_MEM_4T_UP, 1, 0x0000042000000000ull, 0x0000002000000000ull },
+ { GPU_MEM_4T_UP, 2, 0x0000044000000000ull, 0x0000002000000000ull },
+ { GPU_MEM_4T_UP, 3, 0x0000046000000000ull, 0x0000002000000000ull },
+
+ /*
+ * OpenCAPI LPC Memory
+ *
+ * With chip address extension enabled, we allocate 4TB ranges
+ * (in the second non-mirrored region) for each OpenCAPI link
+ * by varying the upper 2 bits of the group ID.
+ *
+ * We don't currently support >4TB ranges.
+ */
+ { OCAPI_MEM, 0, 0x0002000000000000ull, 0x0000040000000000ull },
+ { OCAPI_MEM, 1, 0x0002800000000000ull, 0x0000040000000000ull },
+ { OCAPI_MEM, 2, 0x0003000000000000ull, 0x0000040000000000ull },
+ { OCAPI_MEM, 3, 0x0003800000000000ull, 0x0000040000000000ull },
+
+ /* 0 TB offset @ MMIO 0x0006000000000000ull */
+ { PHB4_64BIT_MMIO, 0, 0x0006000000000000ull, 0x0000004000000000ull },
+ { PHB4_64BIT_MMIO, 1, 0x0006004000000000ull, 0x0000004000000000ull },
+ { PHB4_64BIT_MMIO, 2, 0x0006008000000000ull, 0x0000004000000000ull },
+ { PHB4_32BIT_MMIO, 0, 0x000600c000000000ull, 0x0000000080000000ull },
+ { PHB4_32BIT_MMIO, 1, 0x000600c080000000ull, 0x0000000080000000ull },
+ { PHB4_32BIT_MMIO, 2, 0x000600c100000000ull, 0x0000000080000000ull },
+ { PHB4_32BIT_MMIO, 3, 0x000600c180000000ull, 0x0000000080000000ull },
+ { PHB4_32BIT_MMIO, 4, 0x000600c200000000ull, 0x0000000080000000ull },
+ { PHB4_32BIT_MMIO, 5, 0x000600c280000000ull, 0x0000000080000000ull },
+ { PHB4_XIVE_ESB , 0, 0x000600c300000000ull, 0x0000000020000000ull },
+ { PHB4_XIVE_ESB , 1, 0x000600c320000000ull, 0x0000000020000000ull },
+ { PHB4_XIVE_ESB , 2, 0x000600c340000000ull, 0x0000000020000000ull },
+ { PHB4_XIVE_ESB , 3, 0x000600c360000000ull, 0x0000000020000000ull },
+ { PHB4_XIVE_ESB , 4, 0x000600c380000000ull, 0x0000000020000000ull },
+ { PHB4_XIVE_ESB , 5, 0x000600c3a0000000ull, 0x0000000020000000ull },
+ { PHB4_REG_SPC , 0, 0x000600c3c0000000ull, 0x0000000000100000ull },
+ { PHB4_REG_SPC , 1, 0x000600c3c0100000ull, 0x0000000000100000ull },
+ { PHB4_REG_SPC , 2, 0x000600c3c0200000ull, 0x0000000000100000ull },
+ { PHB4_REG_SPC , 3, 0x000600c3c0300000ull, 0x0000000000100000ull },
+ { PHB4_REG_SPC , 4, 0x000600c3c0400000ull, 0x0000000000100000ull },
+ { PHB4_REG_SPC , 5, 0x000600c3c0500000ull, 0x0000000000100000ull },
+ { RESV , 0, 0x000600c3c0600000ull, 0x0000000c3fa00000ull },
+ { NPU_OCAPI_MMIO , 0, 0x000600d000000000ull, 0x0000000800000000ull },
+ { NPU_OCAPI_MMIO , 1, 0x000600d800000000ull, 0x0000000800000000ull },
+ { NPU_OCAPI_MMIO , 2, 0x000600e000000000ull, 0x0000000800000000ull },
+ { NPU_OCAPI_MMIO , 3, 0x000600e800000000ull, 0x0000000800000000ull },
+ { NPU_OCAPI_MMIO , 4, 0x000600f000000000ull, 0x0000000800000000ull },
+ { NPU_OCAPI_MMIO , 5, 0x000600f800000000ull, 0x0000000800000000ull },
+
+ /* 1 TB offset @ MMIO 0x0006000000000000ull */
+ { XIVE_VC , 0, 0x0006010000000000ull, 0x0000008000000000ull },
+ { XIVE_PC , 0, 0x0006018000000000ull, 0x0000001000000000ull },
+ { VAS_USER_WIN , 0, 0x0006019000000000ull, 0x0000000100000000ull },
+ { VAS_HYP_WIN , 0, 0x0006019100000000ull, 0x0000000002000000ull },
+ { RESV , 1, 0x0006019102000000ull, 0x000000001e000000ull },
+ { OCAB_XIVE_ESB , 0, 0x0006019120000000ull, 0x0000000020000000ull },
+ { RESV , 3, 0x0006019140000000ull, 0x0000006ec0000000ull },
+
+ /* 2 TB offset @ MMIO 0x0006000000000000ull */
+ { PHB4_64BIT_MMIO, 3, 0x0006020000000000ull, 0x0000004000000000ull },
+ { PHB4_64BIT_MMIO, 4, 0x0006024000000000ull, 0x0000004000000000ull },
+ { PHB4_64BIT_MMIO, 5, 0x0006028000000000ull, 0x0000004000000000ull },
+ { RESV , 4, 0x000602c000000000ull, 0x0000004000000000ull },
+
+ /* 3 TB offset @ MMIO 0x0006000000000000ull */
+ { LPC_BUS , 0, 0x0006030000000000ull, 0x0000000100000000ull },
+ { FSP_MMIO , 0, 0x0006030100000000ull, 0x0000000100000000ull },
+ { NPU_REGS , 0, 0x0006030200000000ull, 0x0000000001000000ull },
+ { NPU_USR , 0, 0x0006030201000000ull, 0x0000000000200000ull },
+ { NPU_PHY , 0, 0x0006030201200000ull, 0x0000000000200000ull },
+ { NPU_PHY , 1, 0x0006030201400000ull, 0x0000000000200000ull },
+ { NPU_NTL , 0, 0x0006030201600000ull, 0x0000000000020000ull },
+ { NPU_NTL , 1, 0x0006030201620000ull, 0x0000000000020000ull },
+ { NPU_NTL , 2, 0x0006030201640000ull, 0x0000000000020000ull },
+ { NPU_NTL , 3, 0x0006030201660000ull, 0x0000000000020000ull },
+ { NPU_NTL , 4, 0x0006030201680000ull, 0x0000000000020000ull },
+ { NPU_NTL , 5, 0x00060302016a0000ull, 0x0000000000020000ull },
+ { NPU_GENID , 0, 0x00060302016c0000ull, 0x0000000000020000ull },
+ { NPU_GENID , 1, 0x00060302016e0000ull, 0x0000000000020000ull },
+ { NPU_GENID , 2, 0x0006030201700000ull, 0x0000000000020000ull },
+ { RESV , 5, 0x0006030201720000ull, 0x00000000018e0000ull },
+ { PSIHB_REG , 0, 0x0006030203000000ull, 0x0000000000100000ull },
+ { XIVE_IC , 0, 0x0006030203100000ull, 0x0000000000080000ull },
+ { XIVE_TM , 0, 0x0006030203180000ull, 0x0000000000040000ull },
+ { PSIHB_ESB , 0, 0x00060302031c0000ull, 0x0000000000010000ull },
+ { NX_RNG , 0, 0x00060302031d0000ull, 0x0000000000010000ull },
+ { RESV , 6, 0x00060302031e0000ull, 0x000000001ce20000ull },
+ { CENTAUR_SCOM , 0, 0x0006030220000000ull, 0x0000000020000000ull },
+ { RESV , 7, 0x0006030240000000ull, 0x000000f9c0000000ull },
+ { XSCOM , 0, 0x000603fc00000000ull, 0x0000000400000000ull },
+
+ /* NULL entry at end */
+ { NULL_MAP, 0, 0, 0 },
+};
+
+static const struct phys_map_info phys_map_nimbus = {
+ .chip_select_shift = 42,
+ .table = phys_map_table_nimbus,
+};
+
+static const struct phys_map_entry phys_map_table_axone[] = {
+
+ /* System memory up to 4TB minus GPU memory */
+ { SYSTEM_MEM, 0, 0x0000000000000000ull, 0x0000034000000000ull },
+ /* GPU memory from 4TB - 128GB*GPU */
+ { GPU_MEM_4T_DOWN, 5, 0x0000034000000000ull, 0x0000002000000000ull },
+ { GPU_MEM_4T_DOWN, 4, 0x0000036000000000ull, 0x0000002000000000ull },
+ { GPU_MEM_4T_DOWN, 3, 0x0000038000000000ull, 0x0000002000000000ull },
+ { GPU_MEM_4T_DOWN, 2, 0x000003a000000000ull, 0x0000002000000000ull },
+ { GPU_MEM_4T_DOWN, 1, 0x000003c000000000ull, 0x0000002000000000ull },
+ { GPU_MEM_4T_DOWN, 0, 0x000003e000000000ull, 0x0000002000000000ull },
+
+ /* 0 TB offset @ MMIO 0x0006000000000000ull */
+ { PHB4_64BIT_MMIO, 0, 0x0006000000000000ull, 0x0000004000000000ull },
+ { PHB4_64BIT_MMIO, 1, 0x0006004000000000ull, 0x0000004000000000ull },
+ { PHB4_64BIT_MMIO, 2, 0x0006008000000000ull, 0x0000004000000000ull },
+ { PHB4_32BIT_MMIO, 0, 0x000600c000000000ull, 0x0000000080000000ull },
+ { PHB4_32BIT_MMIO, 1, 0x000600c080000000ull, 0x0000000080000000ull },
+ { PHB4_32BIT_MMIO, 2, 0x000600c100000000ull, 0x0000000080000000ull },
+ { PHB4_32BIT_MMIO, 3, 0x000600c180000000ull, 0x0000000080000000ull },
+ { PHB4_32BIT_MMIO, 4, 0x000600c200000000ull, 0x0000000080000000ull },
+ { PHB4_32BIT_MMIO, 5, 0x000600c280000000ull, 0x0000000080000000ull },
+ { PHB4_XIVE_ESB, 0, 0x000600c300000000ull, 0x0000000020000000ull },
+ { PHB4_XIVE_ESB, 1, 0x000600c320000000ull, 0x0000000020000000ull },
+ { PHB4_XIVE_ESB, 2, 0x000600c340000000ull, 0x0000000020000000ull },
+ { PHB4_XIVE_ESB, 3, 0x000600c360000000ull, 0x0000000020000000ull },
+ { PHB4_XIVE_ESB, 4, 0x000600c380000000ull, 0x0000000020000000ull },
+ { PHB4_XIVE_ESB, 5, 0x000600c3a0000000ull, 0x0000000020000000ull },
+ { PHB4_REG_SPC, 0, 0x000600c3c0000000ull, 0x0000000000100000ull },
+ { PHB4_REG_SPC, 1, 0x000600c3c0100000ull, 0x0000000000100000ull },
+ { PHB4_REG_SPC, 2, 0x000600c3c0200000ull, 0x0000000000100000ull },
+ { PHB4_REG_SPC, 3, 0x000600c3c0300000ull, 0x0000000000100000ull },
+ { PHB4_REG_SPC, 4, 0x000600c3c0400000ull, 0x0000000000100000ull },
+ { PHB4_REG_SPC, 5, 0x000600c3c0500000ull, 0x0000000000100000ull },
+ { RESV, 0, 0x000600c3c0600000ull, 0x0000000c3fa00000ull },
+ { NPU_OCAPI_MMIO, 0, 0x000600d000000000ull, 0x0000000800000000ull },
+ { NPU_OCAPI_MMIO, 1, 0x000600d800000000ull, 0x0000000800000000ull },
+ { NPU_OCAPI_MMIO, 2, 0x000600e000000000ull, 0x0000000800000000ull },
+ { NPU_OCAPI_MMIO, 3, 0x000600e800000000ull, 0x0000000800000000ull },
+ { NPU_OCAPI_MMIO, 4, 0x000600f000000000ull, 0x0000000800000000ull },
+ { NPU_OCAPI_MMIO, 5, 0x000600f800000000ull, 0x0000000800000000ull },
+
+ /* 1 TB offset @ MMIO 0x0006000000000000ull */
+ { XIVE_VC, 0, 0x0006010000000000ull, 0x0000008000000000ull },
+ { XIVE_PC, 0, 0x0006018000000000ull, 0x0000004000000000ull },
+ { VAS_USER_WIN, 0, 0x000601c000000000ull, 0x0000000100000000ull },
+ { VAS_HYP_WIN, 0, 0x000601c100000000ull, 0x0000000002000000ull },
+ { RESV, 1, 0x000601c102000000ull, 0x0000003efe000000ull },
+
+ /* 2 TB offset @ MMIO 0x0006000000000000ull */
+ { PHB4_64BIT_MMIO, 3, 0x0006020000000000ull, 0x0000004000000000ull },
+ { PHB4_64BIT_MMIO, 4, 0x0006024000000000ull, 0x0000004000000000ull },
+ { PHB4_64BIT_MMIO, 5, 0x0006028000000000ull, 0x0000004000000000ull },
+ { RESV, 2, 0x000602c000000000ull, 0x0000004000000000ull },
+
+ /* 3 TB offset @ MMIO 0x0006000000000000ull */
+ { LPC_BUS, 0, 0x0006030000000000ull, 0x0000000100000000ull },
+ { FSP_MMIO, 0, 0x0006030100000000ull, 0x0000000100000000ull },
+ { RESV, 3, 0x0006030200000000ull, 0x0000000003000000ull },
+ { PSIHB_REG, 0, 0x0006030203000000ull, 0x0000000000100000ull },
+ { XIVE_IC, 0, 0x0006030203100000ull, 0x0000000000080000ull },
+ { XIVE_TM, 0, 0x0006030203180000ull, 0x0000000000040000ull },
+ { PSIHB_ESB, 0, 0x00060302031c0000ull, 0x0000000000010000ull },
+ { NX_RNG, 0, 0x00060302031d0000ull, 0x0000000000010000ull },
+ { RESV, 4, 0x00060302031e0000ull, 0x00000001fce20000ull },
+ { MC_OCMB_CFG, 0, 0x0006030400000000ull, 0x0000000080000000ull },
+ { MC_OCMB_CFG, 1, 0x0006030480000000ull, 0x0000000080000000ull },
+ { MC_OCMB_MMIO, 0, 0x0006030500000000ull, 0x0000000080000000ull },
+ { MC_OCMB_MMIO, 1, 0x0006030580000000ull, 0x0000000080000000ull },
+ { MC_OCMB_CFG, 2, 0x0006030600000000ull, 0x0000000080000000ull },
+ { MC_OCMB_CFG, 3, 0x0006030680000000ull, 0x0000000080000000ull },
+ { MC_OCMB_MMIO, 2, 0x0006030700000000ull, 0x0000000080000000ull },
+ { MC_OCMB_MMIO, 3, 0x0006030780000000ull, 0x0000000080000000ull },
+ { MC_OCMB_CFG, 4, 0x0006030800000000ull, 0x0000000080000000ull },
+ { MC_OCMB_CFG, 5, 0x0006030880000000ull, 0x0000000080000000ull },
+ { MC_OCMB_MMIO, 4, 0x0006030900000000ull, 0x0000000080000000ull },
+ { MC_OCMB_MMIO, 5, 0x0006030980000000ull, 0x0000000080000000ull },
+ { MC_OCMB_CFG, 6, 0x0006030a00000000ull, 0x0000000080000000ull },
+ { MC_OCMB_CFG, 7, 0x0006030a80000000ull, 0x0000000080000000ull },
+ { MC_OCMB_MMIO, 6, 0x0006030b00000000ull, 0x0000000080000000ull },
+ { MC_OCMB_MMIO, 7, 0x0006030b80000000ull, 0x0000000080000000ull },
+ { MC_OCMB_CFG, 8, 0x0006030c00000000ull, 0x0000000080000000ull },
+ { MC_OCMB_CFG, 9, 0x0006030c80000000ull, 0x0000000080000000ull },
+ { MC_OCMB_MMIO, 8, 0x0006030d00000000ull, 0x0000000080000000ull },
+ { MC_OCMB_MMIO, 9, 0x0006030d80000000ull, 0x0000000080000000ull },
+ { MC_OCMB_CFG, 10, 0x0006030e00000000ull, 0x0000000080000000ull },
+ { MC_OCMB_CFG, 11, 0x0006030e80000000ull, 0x0000000080000000ull },
+ { MC_OCMB_MMIO, 10, 0x0006030f00000000ull, 0x0000000080000000ull },
+ { MC_OCMB_MMIO, 11, 0x0006030f80000000ull, 0x0000000080000000ull },
+ { MC_OCMB_CFG, 12, 0x0006031000000000ull, 0x0000000080000000ull },
+ { MC_OCMB_CFG, 13, 0x0006031080000000ull, 0x0000000080000000ull },
+ { MC_OCMB_MMIO, 12, 0x0006031100000000ull, 0x0000000080000000ull },
+ { MC_OCMB_MMIO, 13, 0x0006031180000000ull, 0x0000000080000000ull },
+ { MC_OCMB_CFG, 14, 0x0006031200000000ull, 0x0000000080000000ull },
+ { MC_OCMB_CFG, 15, 0x0006031280000000ull, 0x0000000080000000ull },
+ { MC_OCMB_MMIO, 14, 0x0006031300000000ull, 0x0000000080000000ull },
+ { MC_OCMB_MMIO, 15, 0x0006031380000000ull, 0x0000000080000000ull },
+ { RESV, 5, 0x0006031400000000ull, 0x000000d800000000ull },
+ { NPU_REGS, 0, 0x000603ec00000000ull, 0x0000000001000000ull },
+ { NPU_REGS, 1, 0x000603ec01000000ull, 0x0000000001000000ull },
+ { NPU_REGS, 2, 0x000603ec02000000ull, 0x0000000001000000ull },
+ { NPU_NTL, 0, 0x000603ec03000000ull, 0x0000000000020000ull },
+ { NPU_NTL, 1, 0x000603ec03020000ull, 0x0000000000020000ull },
+ { NPU_NTL, 2, 0x000603ec03040000ull, 0x0000000000020000ull },
+ { NPU_NTL, 3, 0x000603ec03060000ull, 0x0000000000020000ull },
+ { NPU_GENID, 0, 0x000603ec03080000ull, 0x0000000000080000ull },
+ { NPU_NTL, 4, 0x000603ec03100000ull, 0x0000000000020000ull },
+ { NPU_NTL, 5, 0x000603ec03120000ull, 0x0000000000020000ull },
+ { NPU_NTL, 6, 0x000603ec03140000ull, 0x0000000000020000ull },
+ { NPU_NTL, 7, 0x000603ec03160000ull, 0x0000000000020000ull },
+ { NPU_GENID, 1, 0x000603ec03180000ull, 0x0000000000080000ull },
+ { NPU_NTL, 8, 0x000603ec03200000ull, 0x0000000000020000ull },
+ { NPU_NTL, 9, 0x000603ec03220000ull, 0x0000000000020000ull },
+ { NPU_NTL, 10, 0x000603ec03240000ull, 0x0000000000020000ull },
+ { NPU_NTL, 11, 0x000603ec03260000ull, 0x0000000000020000ull },
+ { NPU_GENID, 2, 0x000603ec03280000ull, 0x0000000000080000ull },
+ { RESV, 6, 0x000603ec03300000ull, 0x0000000ffcd00000ull },
+ { XSCOM, 0, 0x000603fc00000000ull, 0x0000000400000000ull },
+
+ /* NULL entry at end */
+ { NULL_MAP, 0, 0, 0 },
+};
+
+static const struct phys_map_info phys_map_axone = {
+ .chip_select_shift = 42,
+ .table = phys_map_table_axone,
+};
+
+static const struct phys_map_info phys_map_p10 = {
+ .chip_select_shift = 44,
+ .table = phys_map_table_p10,
+};
+
+static inline bool phys_map_entry_null(const struct phys_map_entry *e)
+{
+ if (e->type == NULL_MAP)
+ return true;
+ return false;
+}
+
+
+/* This crashes skiboot on error as any bad calls here are almost
+ * certainly a developer error
+ */
+void __phys_map_get(uint64_t topology_idx, uint64_t gcid, enum phys_map_type type,
+ int index, uint64_t *addr, uint64_t *size) {
+ const struct phys_map_entry *e;
+ uint64_t a;
+
+ if (!phys_map)
+ goto error;
+
+ /* Find entry in table */
+ for (e = phys_map->table; ; e++) {
+
+ /* End of table */
+ if (phys_map_entry_null(e))
+ goto error;
+
+ /* Is this our entry? */
+ if (e->type != type)
+ continue;
+ if (e->index != index)
+ continue;
+
+ /* Found entry! */
+ break;
+ }
+ a = e->addr;
+ a += topology_idx << (phys_map->chip_select_shift);
+
+ if (addr)
+ *addr = a;
+ if (size)
+ *size = e->size;
+
+ prlog(PR_TRACE, "Assigning BAR [%"PRIx64"] type:%02i index:%x "
+ "0x%016"PRIx64" for 0x%016"PRIx64"\n",
+ gcid, type, index, a, e->size);
+
+ return;
+
+error:
+ /* Something has gone really wrong */
+ prlog(PR_EMERG, "ERROR: Failed to lookup BAR type:%i index:%i\n",
+ type, index);
+ assert(0);
+}
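+
+/*
+ * Worked example (illustrative only): with the P10 map, chip_select_shift
+ * is 44, so topology index 1 adds 1ull << 44 = 0x0000100000000000 to the
+ * table base.  Looking up LPC_BUS (base 0x0006030000000000) for topology
+ * index 1 therefore returns 0x0006130000000000.
+ */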
+
+void phys_map_get(uint64_t gcid, enum phys_map_type type,
+ int index, uint64_t *addr, uint64_t *size)
+{
+ struct proc_chip *chip;
+ uint64_t topology_idx = gcid;
+
+ if (proc_gen >= proc_gen_p10) {
+ chip = get_chip(gcid);
+ topology_idx = chip->primary_topology;
+ }
+
+ return __phys_map_get(topology_idx, gcid, type, index, addr, size);
+}
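+
+/*
+ * Typical usage (a minimal sketch; the surrounding caller code is
+ * hypothetical): a unit driver looks its BAR up by type and instance
+ * index, e.g.
+ *
+ *	uint64_t addr, size;
+ *
+ *	phys_map_get(chip->id, PSIHB_REG, 0, &addr, &size);
+ *
+ * A bad type/index combination asserts rather than returning an error,
+ * so callers do not check a return value.
+ */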
+
+void phys_map_init(unsigned long pvr)
+{
+ const char *name = "unused";
+
+ phys_map = NULL;
+
+ if (proc_gen == proc_gen_p9) {
+ switch(PVR_TYPE(pvr)) {
+ case PVR_TYPE_P9P:
+ name = "axone";
+ phys_map = &phys_map_axone;
+ break;
+ default:
+ name = "nimbus";
+ phys_map = &phys_map_nimbus;
+ }
+ } else if (proc_gen == proc_gen_p10) {
+ name = "p10";
+ phys_map = &phys_map_p10;
+ }
+
+ prlog(PR_DEBUG, "Assigning physical memory map table for %s\n", name);
+
+}
diff --git a/roms/skiboot/hw/prd.c b/roms/skiboot/hw/prd.c
new file mode 100644
index 000000000..45d765457
--- /dev/null
+++ b/roms/skiboot/hw/prd.c
@@ -0,0 +1,789 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * PRD: Processor Runtime Diagnostics
+ *
+ * Copyright 2014-2019 IBM Corp.
+ */
+
+#include <skiboot.h>
+#include <opal.h>
+#include <lock.h>
+#include <xscom.h>
+#include <chip.h>
+#include <opal-msg.h>
+#include <fsp.h>
+#include <mem_region.h>
+#include <prd-fw-msg.h>
+#include <hostservices.h>
+
+enum events {
+ EVENT_ATTN = 1 << 0,
+ EVENT_OCC_ERROR = 1 << 1,
+ EVENT_OCC_RESET = 1 << 2,
+ EVENT_SBE_PASSTHROUGH = 1 << 3,
+ EVENT_FSP_OCC_RESET = 1 << 4,
+ EVENT_FSP_OCC_LOAD_START = 1 << 5,
+};
+
+static uint8_t events[MAX_CHIPS];
+static uint64_t ipoll_status[MAX_CHIPS];
+static uint8_t _prd_msg_buf[sizeof(struct opal_prd_msg) +
+ sizeof(struct prd_fw_msg)];
+static struct opal_prd_msg *prd_msg = (struct opal_prd_msg *)&_prd_msg_buf;
+static struct opal_prd_msg *prd_msg_fsp_req;
+static struct opal_prd_msg *prd_msg_fsp_notify;
+static bool prd_msg_inuse, prd_active;
+static struct dt_node *prd_node;
+static bool prd_enabled = false;
+
+/* Locking:
+ *
+ * The events lock serialises access to the events, ipoll_status,
+ * prd_msg_inuse, and prd_active variables.
+ *
+ * The ipoll_lock protects against concurrent updates to the ipoll registers.
+ *
+ * The ipoll_lock may be acquired with events_lock held. This order must
+ * be preserved.
+ */
+static struct lock events_lock = LOCK_UNLOCKED;
+static struct lock ipoll_lock = LOCK_UNLOCKED;
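+
+/*
+ * Ordering sketch (illustrative, mirroring prd_psi_interrupt() below):
+ * a path that needs both locks must take events_lock first, e.g.
+ *
+ *	lock(&events_lock);
+ *	lock(&ipoll_lock);
+ *	...update the ipoll mask and the events[] bits...
+ *	unlock(&ipoll_lock);
+ *	unlock(&events_lock);
+ */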
+
+static uint64_t prd_ipoll_mask_reg;
+static uint64_t prd_ipoll_status_reg;
+static uint64_t prd_ipoll_mask;
+
+/* PRD registers */
+#define PRD_P8_IPOLL_REG_MASK 0x01020013
+#define PRD_P8_IPOLL_REG_STATUS 0x01020014
+#define PRD_P8_IPOLL_XSTOP PPC_BIT(0) /* Xstop for host/core/millicode */
+#define PRD_P8_IPOLL_RECOV PPC_BIT(1) /* Recoverable */
+#define PRD_P8_IPOLL_SPEC_ATTN PPC_BIT(2) /* Special attention */
+#define PRD_P8_IPOLL_HOST_ATTN PPC_BIT(3) /* Host attention */
+#define PRD_P8_IPOLL_MASK PPC_BITMASK(0, 3)
+
+#define PRD_P9_IPOLL_REG_MASK 0x000F0033
+#define PRD_P9_IPOLL_REG_STATUS 0x000F0034
+#define PRD_P9_IPOLL_XSTOP PPC_BIT(0) /* Xstop for host/core/millicode */
+#define PRD_P9_IPOLL_RECOV PPC_BIT(1) /* Recoverable */
+#define PRD_P9_IPOLL_SPEC_ATTN PPC_BIT(2) /* Special attention */
+#define PRD_P9_IPOLL_UNIT_CS PPC_BIT(3) /* Unit Xstop */
+#define PRD_P9_IPOLL_HOST_ATTN PPC_BIT(4) /* Host attention */
+#define PRD_P9_IPOLL_MASK_INTR PPC_BIT(5) /* Host interrupt */
+#define PRD_P9_IPOLL_MASK PPC_BITMASK(0, 5)
+
+static void send_next_pending_event(void);
+
+static void prd_msg_consumed(void *data, int status)
+{
+ struct opal_prd_msg *msg = data;
+ uint32_t proc;
+ int notify_status = OPAL_SUCCESS;
+ uint8_t event = 0;
+
+ lock(&events_lock);
+ switch (msg->hdr.type) {
+ case OPAL_PRD_MSG_TYPE_ATTN:
+ proc = be64_to_cpu(msg->attn.proc);
+
+ /* If other ipoll events have been received in the time
+ * between prd_msg creation and consumption, we'll need to
+ * raise a separate ATTN message for those. So, we only
+ * clear the event if we don't have any further ipoll_status
+ * bits.
+ */
+ ipoll_status[proc] &= ~be64_to_cpu(msg->attn.ipoll_status);
+ if (!ipoll_status[proc])
+ event = EVENT_ATTN;
+
+ break;
+ case OPAL_PRD_MSG_TYPE_OCC_ERROR:
+ proc = be64_to_cpu(msg->occ_error.chip);
+ event = EVENT_OCC_ERROR;
+ break;
+ case OPAL_PRD_MSG_TYPE_OCC_RESET:
+ proc = be64_to_cpu(msg->occ_reset.chip);
+ event = EVENT_OCC_RESET;
+ break;
+ case OPAL_PRD_MSG_TYPE_FIRMWARE_RESPONSE:
+ if (prd_msg_fsp_req) {
+ free(prd_msg_fsp_req);
+ prd_msg_fsp_req = NULL;
+ }
+ break;
+ case OPAL_PRD_MSG_TYPE_FIRMWARE_NOTIFY:
+ if (prd_msg_fsp_notify) {
+ free(prd_msg_fsp_notify);
+ prd_msg_fsp_notify = NULL;
+ }
+ if (status != 0) {
+ prlog(PR_DEBUG,
+ "PRD: Failed to send FSP -> HBRT message\n");
+ notify_status = FSP_STATUS_GENERIC_ERROR;
+ }
+ if (platform.prd && platform.prd->msg_response)
+ platform.prd->msg_response(notify_status);
+ break;
+ case OPAL_PRD_MSG_TYPE_SBE_PASSTHROUGH:
+ proc = be64_to_cpu(msg->sbe_passthrough.chip);
+ event = EVENT_SBE_PASSTHROUGH;
+ break;
+ case OPAL_PRD_MSG_TYPE_FSP_OCC_RESET:
+ proc = be64_to_cpu(msg->occ_reset.chip);
+ event = EVENT_FSP_OCC_RESET;
+ break;
+ case OPAL_PRD_MSG_TYPE_FSP_OCC_LOAD_START:
+ proc = be64_to_cpu(msg->occ_reset.chip);
+ event = EVENT_FSP_OCC_LOAD_START;
+ break;
+ default:
+ prlog(PR_ERR, "PRD: invalid msg consumed, type: 0x%x\n",
+ msg->hdr.type);
+ }
+
+ if (event)
+ events[proc] &= ~event;
+ prd_msg_inuse = false;
+ send_next_pending_event();
+ unlock(&events_lock);
+}
+
+/*
+ * The OPAL_MSG_PRD interface can handle message sizes <= OPAL_MSG_FIXED_PARAMS_SIZE.
+ * However, the kernel prd driver has a bug where it does not copy partial data to
+ * user space. Hence, use the OPAL_MSG_PRD interface only if the message size is
+ * <= sizeof(struct opal_prd_msg); otherwise fall back to OPAL_MSG_PRD2.
+ */
+static inline int opal_queue_prd_msg(struct opal_prd_msg *msg)
+{
+ enum opal_msg_type msg_type = OPAL_MSG_PRD2;
+
+ if (be16_to_cpu(msg->hdr.size) <= 0x20)
+ msg_type = OPAL_MSG_PRD;
+
+ return _opal_queue_msg(msg_type, msg, prd_msg_consumed,
+ be16_to_cpu(msg->hdr.size), msg);
+}
+
+static int populate_ipoll_msg(struct opal_prd_msg *msg, uint32_t proc)
+{
+ uint64_t ipoll_mask;
+ int rc;
+
+ lock(&ipoll_lock);
+ rc = xscom_read(proc, prd_ipoll_mask_reg, &ipoll_mask);
+ unlock(&ipoll_lock);
+
+ if (rc) {
+ prlog(PR_ERR, "PRD: Unable to read ipoll status (chip %d)!\n",
+ proc);
+ return -1;
+ }
+
+ msg->attn.proc = cpu_to_be64(proc);
+ msg->attn.ipoll_status = cpu_to_be64(ipoll_status[proc]);
+ msg->attn.ipoll_mask = cpu_to_be64(ipoll_mask);
+ return 0;
+}
+
+static void send_next_pending_event(void)
+{
+ struct proc_chip *chip;
+ uint32_t proc;
+ int rc;
+ uint8_t event;
+
+ assert(!prd_msg_inuse);
+
+ if (!prd_active)
+ return;
+
+ event = 0;
+
+ for_each_chip(chip) {
+ proc = chip->id;
+ if (events[proc]) {
+ event = events[proc];
+ break;
+ }
+ }
+
+ if (!event)
+ return;
+
+ prd_msg->token = 0;
+ prd_msg->hdr.size = cpu_to_be16(sizeof(*prd_msg));
+
+ if (event & EVENT_ATTN) {
+ prd_msg->hdr.type = OPAL_PRD_MSG_TYPE_ATTN;
+ populate_ipoll_msg(prd_msg, proc);
+ } else if (event & EVENT_OCC_ERROR) {
+ prd_msg->hdr.type = OPAL_PRD_MSG_TYPE_OCC_ERROR;
+ prd_msg->occ_error.chip = cpu_to_be64(proc);
+ } else if (event & EVENT_OCC_RESET) {
+ prd_msg->hdr.type = OPAL_PRD_MSG_TYPE_OCC_RESET;
+ prd_msg->occ_reset.chip = cpu_to_be64(proc);
+ occ_msg_queue_occ_reset();
+ } else if (event & EVENT_SBE_PASSTHROUGH) {
+ prd_msg->hdr.type = OPAL_PRD_MSG_TYPE_SBE_PASSTHROUGH;
+ prd_msg->sbe_passthrough.chip = cpu_to_be64(proc);
+ } else if (event & EVENT_FSP_OCC_RESET) {
+ prd_msg->hdr.type = OPAL_PRD_MSG_TYPE_FSP_OCC_RESET;
+ prd_msg->occ_reset.chip = cpu_to_be64(proc);
+ } else if (event & EVENT_FSP_OCC_LOAD_START) {
+ prd_msg->hdr.type = OPAL_PRD_MSG_TYPE_FSP_OCC_LOAD_START;
+ prd_msg->occ_reset.chip = cpu_to_be64(proc);
+ }
+
+ /*
+ * We always need to handle PSI interrupts, but if PRD is
+ * disabled then we shouldn't propagate PRD events to the host.
+ */
+ if (prd_enabled) {
+ rc = opal_queue_prd_msg(prd_msg);
+ if (!rc)
+ prd_msg_inuse = true;
+ }
+}
+
+static void __prd_event(uint32_t proc, uint8_t event)
+{
+ events[proc] |= event;
+ if (!prd_msg_inuse)
+ send_next_pending_event();
+}
+
+static void prd_event(uint32_t proc, uint8_t event)
+{
+ lock(&events_lock);
+ __prd_event(proc, event);
+ unlock(&events_lock);
+}
+
+static int __ipoll_update_mask(uint32_t proc, bool set, uint64_t bits)
+{
+ uint64_t mask;
+ int rc;
+
+ rc = xscom_read(proc, prd_ipoll_mask_reg, &mask);
+ if (rc)
+ return rc;
+
+ if (set)
+ mask |= bits;
+ else
+ mask &= ~bits;
+
+ return xscom_write(proc, prd_ipoll_mask_reg, mask);
+}
+
+static int ipoll_record_and_mask_pending(uint32_t proc)
+{
+ uint64_t status;
+ int rc;
+
+ lock(&ipoll_lock);
+ rc = xscom_read(proc, prd_ipoll_status_reg, &status);
+ status &= prd_ipoll_mask;
+ if (!rc)
+ __ipoll_update_mask(proc, true, status);
+ unlock(&ipoll_lock);
+
+ if (!rc)
+ ipoll_status[proc] |= status;
+
+ return rc;
+}
+
+/* Entry point for interrupts */
+void prd_psi_interrupt(uint32_t proc)
+{
+ int rc;
+
+ lock(&events_lock);
+
+ rc = ipoll_record_and_mask_pending(proc);
+ if (rc)
+ prlog(PR_ERR, "PRD: Failed to update IPOLL mask\n");
+
+ __prd_event(proc, EVENT_ATTN);
+
+ unlock(&events_lock);
+}
+
+void prd_tmgt_interrupt(uint32_t proc)
+{
+ prd_event(proc, EVENT_OCC_ERROR);
+}
+
+void prd_occ_reset(uint32_t proc)
+{
+ prd_event(proc, EVENT_OCC_RESET);
+}
+
+void prd_fsp_occ_reset(uint32_t proc)
+{
+ prd_event(proc, EVENT_FSP_OCC_RESET);
+}
+
+void prd_sbe_passthrough(uint32_t proc)
+{
+ prd_event(proc, EVENT_SBE_PASSTHROUGH);
+}
+
+void prd_fsp_occ_load_start(uint32_t proc)
+{
+ prd_event(proc, EVENT_FSP_OCC_LOAD_START);
+}
+
+void prd_fw_resp_fsp_response(int status)
+{
+ struct prd_fw_msg *fw_resp;
+ uint64_t fw_resp_len_old;
+ int rc;
+ uint16_t hdr_size;
+
+ lock(&events_lock);
+
+ /* In case of failure, return code is passed via generic_resp */
+ if (status != 0) {
+ fw_resp = (struct prd_fw_msg *)prd_msg_fsp_req->fw_resp.data;
+ fw_resp->type = cpu_to_be64(PRD_FW_MSG_TYPE_RESP_GENERIC);
+ fw_resp->generic_resp.status = cpu_to_be64(status);
+
+ fw_resp_len_old = be64_to_cpu(prd_msg_fsp_req->fw_resp.len);
+ prd_msg_fsp_req->fw_resp.len = cpu_to_be64(PRD_FW_MSG_BASE_SIZE +
+ sizeof(fw_resp->generic_resp));
+
+ /* Update prd message size */
+ hdr_size = be16_to_cpu(prd_msg_fsp_req->hdr.size);
+ hdr_size -= fw_resp_len_old;
+ hdr_size += be64_to_cpu(prd_msg_fsp_req->fw_resp.len);
+ prd_msg_fsp_req->hdr.size = cpu_to_be16(hdr_size);
+ }
+
+ rc = opal_queue_prd_msg(prd_msg_fsp_req);
+ if (!rc)
+ prd_msg_inuse = true;
+ unlock(&events_lock);
+}
+
+int prd_hbrt_fsp_msg_notify(void *data, u32 dsize)
+{
+ struct prd_fw_msg *fw_notify;
+ int size, fw_notify_size;
+ int rc = FSP_STATUS_GENERIC_ERROR;
+
+ if (!prd_enabled) {
+ prlog(PR_NOTICE, "PRD: %s: PRD daemon is not ready\n",
+ __func__);
+ return rc;
+ }
+
+ /* Calculate prd message size */
+ fw_notify_size = PRD_FW_MSG_BASE_SIZE + dsize;
+ size = sizeof(prd_msg->hdr) + sizeof(prd_msg->token) +
+ sizeof(prd_msg->fw_notify) + fw_notify_size;
+
+ if (size > OPAL_PRD_MSG_SIZE_MAX) {
+ prlog(PR_DEBUG, "PRD: FSP - HBRT notify message size (0x%x)"
+ " is bigger than prd interface can handle\n", size);
+ return rc;
+ }
+
+ lock(&events_lock);
+
+ /* FSP - HBRT messages are serialized */
+ if (prd_msg_fsp_notify) {
+ prlog(PR_DEBUG, "PRD: FSP - HBRT notify message is busy\n");
+ goto unlock_events;
+ }
+
+ /* Handle message allocation */
+ prd_msg_fsp_notify = zalloc(size);
+ if (!prd_msg_fsp_notify) {
+ prlog(PR_DEBUG,
+ "PRD: %s: Failed to allocate memory.\n", __func__);
+ goto unlock_events;
+ }
+
+ prd_msg_fsp_notify->hdr.type = OPAL_PRD_MSG_TYPE_FIRMWARE_NOTIFY;
+ prd_msg_fsp_notify->hdr.size = cpu_to_be16(size);
+ prd_msg_fsp_notify->token = 0;
+ prd_msg_fsp_notify->fw_notify.len = cpu_to_be64(fw_notify_size);
+ fw_notify = (void *)prd_msg_fsp_notify->fw_notify.data;
+ fw_notify->type = cpu_to_be64(PRD_FW_MSG_TYPE_HBRT_FSP);
+ memcpy(&(fw_notify->mbox_msg), data, dsize);
+
+ if (!prd_active) {
+ // save the message, we'll deliver it when prd starts
+ rc = FSP_STATUS_BUSY;
+ goto unlock_events;
+ }
+
+ rc = opal_queue_prd_msg(prd_msg_fsp_notify);
+ if (!rc)
+ prd_msg_inuse = true;
+
+unlock_events:
+ unlock(&events_lock);
+ return rc;
+}
+
+/* incoming message handlers */
+static int prd_msg_handle_attn_ack(struct opal_prd_msg *msg)
+{
+ int rc;
+
+ lock(&ipoll_lock);
+ rc = __ipoll_update_mask(be64_to_cpu(msg->attn_ack.proc), false,
+ be64_to_cpu(msg->attn_ack.ipoll_ack) & prd_ipoll_mask);
+ unlock(&ipoll_lock);
+
+ if (rc)
+ prlog(PR_ERR, "PRD: Unable to unmask ipoll!\n");
+
+ return rc;
+}
+
+static int prd_msg_handle_init(struct opal_prd_msg *msg)
+{
+ struct proc_chip *chip;
+
+ lock(&ipoll_lock);
+ for_each_chip(chip) {
+ __ipoll_update_mask(chip->id, false,
+ be64_to_cpu(msg->init.ipoll) & prd_ipoll_mask);
+ }
+ unlock(&ipoll_lock);
+
+ /* we're transitioning from inactive to active; send any pending tmgt
+ * interrupts */
+ lock(&events_lock);
+ prd_active = true;
+
+ if (prd_msg_fsp_notify) {
+ if (!opal_queue_prd_msg(prd_msg_fsp_notify))
+ prd_msg_inuse = true;
+ }
+ if (!prd_msg_inuse)
+ send_next_pending_event();
+ unlock(&events_lock);
+
+ return OPAL_SUCCESS;
+}
+
+static int prd_msg_handle_fini(void)
+{
+ struct proc_chip *chip;
+
+ lock(&events_lock);
+ prd_active = false;
+ unlock(&events_lock);
+
+ lock(&ipoll_lock);
+ for_each_chip(chip) {
+ __ipoll_update_mask(chip->id, true, prd_ipoll_mask);
+ }
+ unlock(&ipoll_lock);
+
+ return OPAL_SUCCESS;
+}
+
+static int prd_msg_handle_firmware_req(struct opal_prd_msg *msg)
+{
+ unsigned long fw_req_len, fw_resp_len, data_len;
+ struct prd_fw_msg *fw_req, *fw_resp;
+ int rc;
+ uint64_t resp_msg_size;
+
+ fw_req_len = be64_to_cpu(msg->fw_req.req_len);
+ fw_resp_len = be64_to_cpu(msg->fw_req.resp_len);
+ fw_req = (struct prd_fw_msg *)msg->fw_req.data;
+
+ /* do we have a full firmware message? */
+ if (fw_req_len < sizeof(struct prd_fw_msg))
+ return -EINVAL;
+
+ /* does the total (outer) PRD message len provide enough data for the
+ * claimed (inner) FW message?
+ */
+ if (be16_to_cpu(msg->hdr.size) < fw_req_len +
+ offsetof(struct opal_prd_msg, fw_req.data))
+ return -EINVAL;
+
+ /* is there enough response buffer for a base response? Type-specific
+ * responses may be larger, but anything less than BASE_SIZE is
+ * invalid. */
+ if (fw_resp_len < PRD_FW_MSG_BASE_SIZE)
+ return -EINVAL;
+
+ /* prepare a response message. */
+ lock(&events_lock);
+ prd_msg_inuse = true;
+ prd_msg->token = 0;
+ prd_msg->hdr.type = OPAL_PRD_MSG_TYPE_FIRMWARE_RESPONSE;
+ fw_resp = (void *)prd_msg->fw_resp.data;
+
+ switch (be64_to_cpu(fw_req->type)) {
+ case PRD_FW_MSG_TYPE_REQ_NOP:
+ fw_resp->type = cpu_to_be64(PRD_FW_MSG_TYPE_RESP_NOP);
+ prd_msg->fw_resp.len = cpu_to_be64(PRD_FW_MSG_BASE_SIZE);
+ prd_msg->hdr.size = cpu_to_be16(sizeof(*prd_msg));
+ rc = 0;
+ break;
+ case PRD_FW_MSG_TYPE_ERROR_LOG:
+ if (platform.prd == NULL ||
+ platform.prd->send_error_log == NULL) {
+ rc = OPAL_UNSUPPORTED;
+ break;
+ }
+
+ rc = platform.prd->send_error_log(be32_to_cpu(fw_req->errorlog.plid),
+ be32_to_cpu(fw_req->errorlog.size),
+ fw_req->errorlog.data);
+ /* Return generic response to HBRT */
+ fw_resp->type = cpu_to_be64(PRD_FW_MSG_TYPE_RESP_GENERIC);
+ fw_resp->generic_resp.status = cpu_to_be64(rc);
+ prd_msg->fw_resp.len = cpu_to_be64(PRD_FW_MSG_BASE_SIZE +
+ sizeof(fw_resp->generic_resp));
+ prd_msg->hdr.size = cpu_to_be16(sizeof(*prd_msg));
+ rc = 0;
+ break;
+ case PRD_FW_MSG_TYPE_HBRT_FSP:
+ if (platform.prd == NULL ||
+ platform.prd->send_hbrt_msg == NULL) {
+ rc = OPAL_UNSUPPORTED;
+ break;
+ }
+
+ /*
+ * HBRT -> FSP messages are serialized. Just to be sure, check
+ * whether the fsp_req message buffer is free.
+ */
+ if (prd_msg_fsp_req) {
+ prlog(PR_DEBUG, "PRD: HBRT - FSP message is busy\n");
+ rc = OPAL_BUSY;
+ break;
+ }
+
+ /*
+ * FSP interface doesn't tell us the response data size.
+ * Hence pass response length = request length.
+ */
+ resp_msg_size = sizeof(msg->hdr) + sizeof(msg->token) +
+ sizeof(msg->fw_resp) + fw_req_len;
+
+ if (resp_msg_size > OPAL_PRD_MSG_SIZE_MAX) {
+ prlog(PR_DEBUG, "PRD: HBRT - FSP response size (0x%llx)"
+ " is bigger than prd interface can handle\n",
+ resp_msg_size);
+ rc = OPAL_INTERNAL_ERROR;
+ break;
+ }
+
+ /*
+ * We will use fsp_queue_msg() to pass the HBRT data to the FSP.
+ * We cannot directly map the kernel-passed data, as the kernel
+ * will release that memory as soon as we return control.
+ * The FSP also uses the same memory to pass its response back
+ * to HBRT. Hence, copy the data into local memory and then pass
+ * that memory to the FSP via TCE mapping.
+ */
+ prd_msg_fsp_req = zalloc(resp_msg_size);
+ if (!prd_msg_fsp_req) {
+ prlog(PR_DEBUG, "PRD: Failed to allocate memory "
+ "for HBRT - FSP message\n");
+ rc = OPAL_RESOURCE;
+ break;
+ }
+
+ /* Update message header */
+ prd_msg_fsp_req->hdr.type = OPAL_PRD_MSG_TYPE_FIRMWARE_RESPONSE;
+ prd_msg_fsp_req->hdr.size = cpu_to_be16(resp_msg_size);
+ prd_msg_fsp_req->token = 0;
+ prd_msg_fsp_req->fw_resp.len = cpu_to_be64(fw_req_len);
+
+ /* copy HBRT data to local memory */
+ fw_resp = (struct prd_fw_msg *)prd_msg_fsp_req->fw_resp.data;
+ memcpy(fw_resp, fw_req, fw_req_len);
+
+ /* Update response type */
+ fw_resp->type = cpu_to_be64(PRD_FW_MSG_TYPE_HBRT_FSP);
+
+ /* Get MBOX message size */
+ data_len = fw_req_len - PRD_FW_MSG_BASE_SIZE;
+
+ /* We have to wait until FSP responds */
+ prd_msg_inuse = false;
+ /* Unlock to avoid recursive lock issue */
+ unlock(&events_lock);
+
+ /* Send message to FSP */
+ rc = platform.prd->send_hbrt_msg(&(fw_resp->mbox_msg), data_len);
+
+ /*
+ * Callback handler from hservice_send_hbrt_msg will take
+ * care of sending response to HBRT. So just send return
+ * code to Linux.
+ */
+ if (rc == OPAL_SUCCESS)
+ return rc;
+
+ lock(&events_lock);
+ if (prd_msg_fsp_req) {
+ free(prd_msg_fsp_req);
+ prd_msg_fsp_req = NULL;
+ }
+ break;
+ default:
+ prlog(PR_DEBUG, "PRD: Unsupported fw_request type : 0x%llx\n",
+ be64_to_cpu(fw_req->type));
+ rc = -ENOSYS;
+ }
+
+ if (!rc) {
+ rc = opal_queue_prd_msg(prd_msg);
+ if (rc)
+ prd_msg_inuse = false;
+ } else {
+ prd_msg_inuse = false;
+ }
+
+ unlock(&events_lock);
+
+ return rc;
+}
+
+/* Entry from the host above */
+static int64_t opal_prd_msg(struct opal_prd_msg *msg)
+{
+ int rc;
+
+ /* fini is a little special: the kernel (which may not have the entire
+ * opal_prd_msg definition) can send a FINI message, so we don't check
+ * the full size */
+ if (be16_to_cpu(msg->hdr.size) >= sizeof(struct opal_prd_msg_header) &&
+ msg->hdr.type == OPAL_PRD_MSG_TYPE_FINI)
+ return prd_msg_handle_fini();
+
+ if (be16_to_cpu(msg->hdr.size) < sizeof(*msg))
+ return OPAL_PARAMETER;
+
+ switch (msg->hdr.type) {
+ case OPAL_PRD_MSG_TYPE_INIT:
+ rc = prd_msg_handle_init(msg);
+ break;
+ case OPAL_PRD_MSG_TYPE_ATTN_ACK:
+ rc = prd_msg_handle_attn_ack(msg);
+ break;
+ case OPAL_PRD_MSG_TYPE_OCC_RESET_NOTIFY:
+ rc = occ_msg_queue_occ_reset();
+ break;
+ case OPAL_PRD_MSG_TYPE_FIRMWARE_REQUEST:
+ rc = prd_msg_handle_firmware_req(msg);
+ break;
+ case OPAL_PRD_MSG_TYPE_FSP_OCC_RESET_STATUS:
+ if (platform.prd == NULL ||
+ platform.prd->fsp_occ_reset_status == NULL) {
+ rc = OPAL_UNSUPPORTED;
+ break;
+ }
+ rc = platform.prd->fsp_occ_reset_status(
+ be64_to_cpu(msg->fsp_occ_reset_status.chip),
+ be64_to_cpu(msg->fsp_occ_reset_status.status));
+ break;
+ case OPAL_PRD_MSG_TYPE_CORE_SPECIAL_WAKEUP:
+ if (platform.prd == NULL ||
+ platform.prd->wakeup == NULL) {
+ rc = OPAL_UNSUPPORTED;
+ break;
+ }
+ rc = platform.prd->wakeup(be32_to_cpu(msg->spl_wakeup.core),
+ be32_to_cpu(msg->spl_wakeup.mode));
+ break;
+ case OPAL_PRD_MSG_TYPE_FSP_OCC_LOAD_START_STATUS:
+ if (platform.prd == NULL ||
+ platform.prd->fsp_occ_load_start_status == NULL) {
+ rc = OPAL_UNSUPPORTED;
+ break;
+ }
+ rc = platform.prd->fsp_occ_load_start_status(
+ be64_to_cpu(msg->fsp_occ_reset_status.chip),
+ be64_to_cpu(msg->fsp_occ_reset_status.status));
+ break;
+ default:
+ prlog(PR_DEBUG, "PRD: Unsupported prd message type : 0x%x\n",
+ msg->hdr.type);
+ rc = OPAL_UNSUPPORTED;
+ }
+
+ return rc;
+}
+
+
+/*
+ * Initialise the Opal backend for the PRD daemon. This must be called from
+ * platform probe or init function.
+ */
+void prd_init(void)
+{
+ struct proc_chip *chip;
+
+ switch (proc_gen) {
+ case proc_gen_p8:
+ prd_ipoll_mask_reg = PRD_P8_IPOLL_REG_MASK;
+ prd_ipoll_status_reg = PRD_P8_IPOLL_REG_STATUS;
+ prd_ipoll_mask = PRD_P8_IPOLL_MASK;
+ break;
+ case proc_gen_p9:
+ prd_ipoll_mask_reg = PRD_P9_IPOLL_REG_MASK;
+ prd_ipoll_status_reg = PRD_P9_IPOLL_REG_STATUS;
+ prd_ipoll_mask = PRD_P9_IPOLL_MASK;
+ break;
+ case proc_gen_p10: /* IPOLL regs are the same for p9 and p10 */
+ prd_ipoll_mask_reg = PRD_P9_IPOLL_REG_MASK;
+ prd_ipoll_status_reg = PRD_P9_IPOLL_REG_STATUS;
+ prd_ipoll_mask = PRD_P9_IPOLL_MASK;
+ break;
+ default:
+ assert(0);
+ }
+
+ /* mask everything */
+ lock(&ipoll_lock);
+ for_each_chip(chip) {
+ __ipoll_update_mask(chip->id, true, prd_ipoll_mask);
+ }
+ unlock(&ipoll_lock);
+
+ prd_enabled = true;
+ opal_register(OPAL_PRD_MSG, opal_prd_msg, 1);
+
+ prd_node = dt_new(opal_node, "diagnostics");
+ dt_add_property_strings(prd_node, "compatible", "ibm,opal-prd");
+}
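+
+/*
+ * Illustrative only (the platform hook below is hypothetical): a platform
+ * typically calls prd_init() from its probe or init function, e.g.
+ *
+ *	static void myplat_init(void)
+ *	{
+ *		...
+ *		prd_init();
+ *	}
+ */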
+
+void prd_register_reserved_memory(void)
+{
+ struct mem_region *region;
+
+ if (!prd_node)
+ return;
+
+ lock(&mem_region_lock);
+ for (region = mem_region_next(NULL); region;
+ region = mem_region_next(region)) {
+
+ if (region->type != REGION_FW_RESERVED)
+ continue;
+
+ if (!region->node)
+ continue;
+
+ if (!dt_find_property(region->node, "ibm,prd-label")) {
+ dt_add_property_string(region->node, "ibm,prd-label",
+ region->name);
+ }
+ }
+ unlock(&mem_region_lock);
+}
diff --git a/roms/skiboot/hw/psi.c b/roms/skiboot/hw/psi.c
new file mode 100644
index 000000000..de074ce4a
--- /dev/null
+++ b/roms/skiboot/hw/psi.c
@@ -0,0 +1,1079 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * PSI (Processor Service Interface) host bridge handling code
+ *
+ * Copyright 2013-2019 IBM Corp.
+ */
+
+#include <io.h>
+#include <psi.h>
+#include <fsp.h>
+#include <opal.h>
+#include <interrupts.h>
+#include <cpu.h>
+#include <dio-p9.h>
+#include <trace.h>
+#include <xscom.h>
+#include <chip.h>
+#include <lpc.h>
+#include <i2c.h>
+#include <timebase.h>
+#include <platform.h>
+#include <errorlog.h>
+#include <xive.h>
+#include <sbe-p9.h>
+#include <phys-map.h>
+#include <occ.h>
+
+static LIST_HEAD(psis);
+static u64 psi_link_timer;
+static u64 psi_link_timeout;
+static bool psi_link_poll_active;
+
+static void psi_activate_phb(struct psi *psi);
+
+struct lock psi_lock = LOCK_UNLOCKED;
+
+DEFINE_LOG_ENTRY(OPAL_RC_PSI_TIMEOUT, OPAL_PLATFORM_ERR_EVT, OPAL_PSI,
+ OPAL_PLATFORM_FIRMWARE,
+ OPAL_UNRECOVERABLE_ERR_LOSS_OF_FUNCTION, OPAL_NA);
+
+void psi_set_link_polling(bool active)
+{
+ printf("PSI: %sing link polling\n",
+ active ? "start" : "stopp");
+ psi_link_poll_active = active;
+}
+
+void psi_disable_link(struct psi *psi)
+{
+ lock(&psi_lock);
+
+ /*
+ * Note: This can be called with the link already down but
+ * not detected as such yet by this layer since psi_check_link_active()
+ * operates locklessly and thus won't update the PSI structure. This
+ * is a non-issue; the only consequence is log messages that first
+ * mention the link having gone down and then it being disabled.
+ */
+ if (psi->active) {
+ u64 reg;
+ psi->active = false;
+
+ /* Mask errors in SEMR */
+ reg = in_be64(psi->regs + PSIHB_SEMR);
+ reg &= ((0xfffull << 36) | (0xfffull << 20));
+ out_be64(psi->regs + PSIHB_SEMR, reg);
+ printf("PSI: SEMR set to %llx\n", reg);
+
+ /* Reset all the error bits in PSIHB_CR and
+ * disable FSP interrupts
+ */
+ reg = in_be64(psi->regs + PSIHB_CR);
+ reg &= ~(0x7ffull << 20);
+ reg &= ~PSIHB_CR_PSI_LINK_ENABLE; /* flip link enable */
+ /*
+ * Ensure no commands/spurious interrupts reach
+ * the processor, by flipping the command enable.
+ */
+ reg &= ~PSIHB_CR_FSP_CMD_ENABLE;
+ reg &= ~PSIHB_CR_FSP_IRQ_ENABLE;
+ reg &= ~PSIHB_CR_FSP_IRQ; /* Clear interrupt state too */
+ printf("PSI[0x%03x]: Disabling link!\n", psi->chip_id);
+ out_be64(psi->regs + PSIHB_CR, reg);
+ printf("PSI: PSIHB_CR (error bits) set to %llx\n",
+ in_be64(psi->regs + PSIHB_CR));
+ psi_set_link_polling(true);
+ }
+
+ unlock(&psi_lock);
+}
+
+/*
+ * Resetting the FSP is a multi step sequence:
+ * 1. Read the PSIHBCR
+ * 2. Set the PSIHBCR[6] -- write register back.
+ * 3. Read PSIHBCR again
+ * 4. Reset PSIHBCR[6] -- write register back.
+ */
+void psi_reset_fsp(struct psi *psi)
+{
+ lock(&psi_lock);
+
+ if (psi->active) {
+ u64 reg;
+
+ printf("PSI: Driving FSP reset via PSI\n");
+ reg = in_be64(psi->regs + PSIHB_CR);
+ reg &= ~(0xfffull << 20); /* Reset error bits */
+ reg |= PSIHB_CR_FSP_RESET; /* FSP reset trigger start */
+ out_be64(psi->regs + PSIHB_CR, reg);
+ printf("PSI[0x%03x]: FSP reset start PSIHBCR set to %llx\n",
+ psi->chip_id, in_be64(psi->regs + PSIHB_CR));
+
+ reg = in_be64(psi->regs + PSIHB_CR);
+ reg &= ~PSIHB_CR_FSP_RESET; /* Clear FSP reset bit */
+ out_be64(psi->regs + PSIHB_CR, reg); /* Complete reset */
+ printf("PSI[0x%03x]: FSP reset complete. PSIHBCR set to %llx\n",
+ psi->chip_id, in_be64(psi->regs + PSIHB_CR));
+ }
+ unlock(&psi_lock);
+
+ /* Now bring down the PSI link too... */
+ psi_disable_link(psi);
+}
+
+bool psi_check_link_active(struct psi *psi)
+{
+ u64 val = in_be64(psi->regs + PSIHB_CR);
+
+ /*
+ * Unlocked, used during fsp_poke_msg so we really want
+ * to avoid fancy link re-entrancy and deadlocks here
+ */
+ if (!psi->active)
+ return false;
+ return (val & PSIHB_CR_PSI_LINK_ENABLE) &&
+ (val & PSIHB_CR_FSP_LINK_ACTIVE);
+}
+
+struct psi *psi_find_link(uint32_t chip_id)
+{
+ struct psi *psi;
+
+ list_for_each(&psis, psi, list) {
+ if (psi->chip_id == chip_id)
+ return psi;
+ }
+ return NULL;
+}
+
+#define PSI_LINK_CHECK_INTERVAL 10 /* Interval in secs */
+#define PSI_LINK_RECOVERY_TIMEOUT 1800 /* 30 minutes */
+
+static void psi_link_poll(void *data __unused)
+{
+ struct psi *psi;
+ u64 now;
+
+ if (!psi_link_poll_active)
+ return;
+
+ now = mftb();
+ if (psi_link_timer == 0 ||
+ (tb_compare(now, psi_link_timer) == TB_AAFTERB) ||
+ (tb_compare(now, psi_link_timer) == TB_AEQUALB)) {
+
+ lock(&psi_lock);
+
+ list_for_each(&psis, psi, list) {
+ u64 val;
+
+ if (psi->active)
+ continue;
+
+ val = in_be64(psi->regs + PSIHB_CR);
+
+ printf("PSI[0x%03x]: Poll CR=0x%016llx\n",
+ psi->chip_id, val);
+
+ if ((val & PSIHB_CR_PSI_LINK_ENABLE) &&
+ (val & PSIHB_CR_FSP_LINK_ACTIVE)) {
+ printf("PSI[0x%03x]: Found active link!\n",
+ psi->chip_id);
+ psi_link_timeout = 0;
+ psi->active = true;
+ psi_activate_phb(psi);
+ psi_set_link_polling(false);
+ unlock(&psi_lock);
+ if (platform.psi && platform.psi->link_established)
+ platform.psi->link_established();
+ return;
+ }
+ }
+ if (!psi_link_timeout)
+ psi_link_timeout =
+ now + secs_to_tb(PSI_LINK_RECOVERY_TIMEOUT);
+
+ if (tb_compare(now, psi_link_timeout) == TB_AAFTERB) {
+ log_simple_error(&e_info(OPAL_RC_PSI_TIMEOUT),
+ "PSI: Link timeout -- loss of FSP\n");
+ /* Reset the link timeout and continue looking */
+ psi_link_timeout = 0;
+ }
+
+ /* Poll every 10 seconds */
+ psi_link_timer = now + secs_to_tb(PSI_LINK_CHECK_INTERVAL);
+
+ unlock(&psi_lock);
+ }
+}
+
+void psi_enable_fsp_interrupt(struct psi *psi)
+{
+ /* Enable FSP interrupts in the GXHB */
+ lock(&psi_lock);
+ out_be64(psi->regs + PSIHB_CR,
+ in_be64(psi->regs + PSIHB_CR) | PSIHB_CR_FSP_IRQ_ENABLE);
+ unlock(&psi_lock);
+}
+
+/* Multiple bits can be set on errors */
+static void decode_psihb_error(u64 val)
+{
+ if (val & PSIHB_CR_PSI_ERROR)
+ printf("PSI: PSI Reported Error\n");
+ if (val & PSIHB_CR_PSI_LINK_INACTIVE)
+ printf("PSI: PSI Link Inactive Transition\n");
+ if (val & PSIHB_CR_FSP_ACK_TIMEOUT)
+ printf("PSI: FSP Ack Timeout\n");
+ if (val & PSIHB_CR_MMIO_LOAD_TIMEOUT)
+ printf("PSI: MMIO Load Timeout\n");
+ if (val & PSIHB_CR_MMIO_LENGTH_ERROR)
+ printf("PSI: MMIO Length Error\n");
+ if (val & PSIHB_CR_MMIO_ADDRESS_ERROR)
+ printf("PSI: MMIO Address Error\n");
+ if (val & PSIHB_CR_MMIO_TYPE_ERROR)
+ printf("PSI: MMIO Type Error\n");
+ if (val & PSIHB_CR_UE)
+ printf("PSI: UE Detected\n");
+ if (val & PSIHB_CR_PARITY_ERROR)
+ printf("PSI: Internal Parity Error\n");
+ if (val & PSIHB_CR_SYNC_ERR_ALERT1)
+ printf("PSI: Sync Error Alert1\n");
+ if (val & PSIHB_CR_SYNC_ERR_ALERT2)
+ printf("PSI: Sync Error Alert2\n");
+ if (val & PSIHB_CR_FSP_COMMAND_ERROR)
+ printf("PSI: FSP Command Error\n");
+}
+
+
+static void handle_psi_interrupt(struct psi *psi, u64 val)
+{
+ printf("PSI[0x%03x]: PSI mgmnt interrupt CR=0x%016llx\n",
+ psi->chip_id, val);
+
+ if (val & (0xfffull << 20)) {
+ decode_psihb_error(val);
+ psi_disable_link(psi);
+ } else if (val & (0x1full << 11))
+ printf("PSI: FSP error detected\n");
+}
+
+static void psi_spurious_fsp_irq(struct psi *psi)
+{
+ u64 reg, bit;
+
+ prlog(PR_NOTICE, "PSI: Spurious interrupt, attempting clear\n");
+
+ if (proc_gen == proc_gen_p10) {
+ reg = PSIHB_XSCOM_P10_HBCSR_CLR;
+ bit = PSIHB_XSCOM_P10_HBSCR_FSP_IRQ;
+ } else if (proc_gen == proc_gen_p9) {
+ reg = PSIHB_XSCOM_P9_HBCSR_CLR;
+ bit = PSIHB_XSCOM_P9_HBSCR_FSP_IRQ;
+ } else if (proc_gen == proc_gen_p8) {
+ reg = PSIHB_XSCOM_P8_HBCSR_CLR;
+ bit = PSIHB_XSCOM_P8_HBSCR_FSP_IRQ;
+ } else {
+ assert(false);
+ }
+ xscom_write(psi->chip_id, psi->xscom_base + reg, bit);
+}
+
+bool psi_poll_fsp_interrupt(struct psi *psi)
+{
+ return !!(in_be64(psi->regs + PSIHB_CR) & PSIHB_CR_FSP_IRQ);
+}
+
+static void psihb_interrupt(struct irq_source *is, uint32_t isn __unused)
+{
+ struct psi *psi = is->data;
+ u64 val;
+
+ val = in_be64(psi->regs + PSIHB_CR);
+
+ if (psi_link_poll_active) {
+ printf("PSI[0x%03x]: PSI interrupt CR=0x%016llx (A=%d)\n",
+ psi->chip_id, val, psi->active);
+ }
+
+ /* Handle PSI interrupts first in case it's a link down */
+ if (val & PSIHB_CR_PSI_IRQ) {
+ handle_psi_interrupt(psi, val);
+
+ /*
+ * If the link went down, re-read PSIHB_CR as
+ * the FSP interrupt might have been cleared.
+ */
+ if (!psi->active)
+ val = in_be64(psi->regs + PSIHB_CR);
+ }
+
+
+ /*
+ * We avoid forwarding FSP interrupts if the link isn't
+ * active. They should be masked anyway but it looks
+ * like the CR bit can remain set.
+ */
+ if (val & PSIHB_CR_FSP_IRQ) {
+ /*
+ * We have seen cases of a flood of FSP mailbox interrupts
+ * when the link is down; see if we can clear
+ * the condition.
+ */
+ if (!psi->active)
+ psi_spurious_fsp_irq(psi);
+ else {
+ if (platform.psi && platform.psi->fsp_interrupt)
+ platform.psi->fsp_interrupt();
+ }
+ }
+
+ if (platform.psi && platform.psi->psihb_interrupt)
+ platform.psi->psihb_interrupt();
+}
+
+
+static const uint32_t psi_p8_irq_to_xivr[P8_IRQ_PSI_IRQ_COUNT] = {
+ [P8_IRQ_PSI_FSP] = PSIHB_XIVR_FSP,
+ [P8_IRQ_PSI_OCC] = PSIHB_XIVR_OCC,
+ [P8_IRQ_PSI_FSI] = PSIHB_XIVR_FSI,
+ [P8_IRQ_PSI_LPC] = PSIHB_XIVR_LPC,
+ [P8_IRQ_PSI_LOCAL_ERR] = PSIHB_XIVR_LOCAL_ERR,
+ [P8_IRQ_PSI_EXTERNAL]= PSIHB_XIVR_HOST_ERR,
+};
+
+static void psi_cleanup_irq(struct psi *psi)
+{
+ uint32_t irq;
+ uint64_t xivr, xivr_p;
+
+ for (irq = 0; irq < P8_IRQ_PSI_IRQ_COUNT; irq++) {
+ prlog(PR_DEBUG, "PSI[0x%03x]: Cleaning up IRQ %d\n",
+ psi->chip_id, irq);
+
+ xivr_p = psi_p8_irq_to_xivr[irq];
+ xivr = in_be64(psi->regs + xivr_p);
+ xivr |= (0xffull << 32);
+ out_be64(psi->regs + xivr_p, xivr);
+ time_wait_ms_nopoll(10);
+ xivr = in_be64(psi->regs + xivr_p);
+ if (xivr & PPC_BIT(39)) {
+ printf(" Need EOI !\n");
+ icp_send_eoi(psi->interrupt + irq);
+ }
+ }
+}
+
+/* Called on a fast reset, make sure we aren't stuck with
+ * an accepted and never EOId PSI interrupt
+ */
+void psi_irq_reset(void)
+{
+ struct psi *psi;
+
+ printf("PSI: Hot reset!\n");
+
+ assert(proc_gen == proc_gen_p8);
+
+ list_for_each(&psis, psi, list) {
+ psi_cleanup_irq(psi);
+ }
+}
+
+static int64_t psi_p8_set_xive(struct irq_source *is, uint32_t isn,
+ uint16_t server, uint8_t priority)
+{
+ struct psi *psi = is->data;
+ uint64_t xivr_p, xivr;
+ uint32_t irq_idx = isn & 7;
+
+ if (irq_idx >= P8_IRQ_PSI_IRQ_COUNT)
+ return OPAL_PARAMETER;
+ xivr_p = psi_p8_irq_to_xivr[irq_idx];
+
+ /* Populate the XIVR */
+ xivr = (uint64_t)server << 40;
+ xivr |= (uint64_t)priority << 32;
+ xivr |= (uint64_t)(isn & 7) << 29;
+
+ out_be64(psi->regs + xivr_p, xivr);
+
+ return OPAL_SUCCESS;
+}
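+
+/*
+ * Worked example (illustrative values): for server 0x0001, priority 0xff
+ * and source index 2, the XIVR built above is
+ *	(0x0001ull << 40) | (0xffull << 32) | (2ull << 29)
+ *	= 0x000001ff40000000
+ */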
+
+static int64_t psi_p8_get_xive(struct irq_source *is, uint32_t isn __unused,
+ uint16_t *server, uint8_t *priority)
+{
+ struct psi *psi = is->data;
+ uint64_t xivr_p, xivr;
+ uint32_t irq_idx = isn & 7;
+
+ if (irq_idx >= P8_IRQ_PSI_IRQ_COUNT)
+ return OPAL_PARAMETER;
+
+ xivr_p = psi_p8_irq_to_xivr[irq_idx];
+
+ /* Read & decode the XIVR */
+ xivr = in_be64(psi->regs + xivr_p);
+
+ *server = (xivr >> 40) & 0xffff;
+ *priority = (xivr >> 32) & 0xff;
+
+ return OPAL_SUCCESS;
+}
+
+static void psihb_p8_interrupt(struct irq_source *is, uint32_t isn)
+{
+ struct psi *psi = is->data;
+ uint32_t idx = isn - psi->interrupt;
+
+ switch (idx) {
+ case P8_IRQ_PSI_FSP:
+ psihb_interrupt(is, isn);
+ break;
+ case P8_IRQ_PSI_OCC:
+ occ_p8_interrupt(psi->chip_id);
+ break;
+ case P8_IRQ_PSI_FSI:
+ printf("PSI: FSI irq received\n");
+ break;
+ case P8_IRQ_PSI_LPC:
+ lpc_interrupt(psi->chip_id);
+
+ /*
+ * i2c interrupts are ORed with the LPC ones on
+ * Murano DD2.1 and Venice DD2.0
+ */
+ p8_i2c_interrupt(psi->chip_id);
+ break;
+ case P8_IRQ_PSI_LOCAL_ERR:
+ prd_psi_interrupt(psi->chip_id);
+ break;
+ case P8_IRQ_PSI_EXTERNAL:
+ if (platform.external_irq)
+ platform.external_irq(psi->chip_id);
+ break;
+ }
+
+ /*
+ * TODO: Per Vicente Chung, CRESPs don't generate interrupts,
+ * and are just informational. Need to define the policy
+ * to handle them.
+ */
+}
+
+static uint64_t psi_p8_irq_attributes(struct irq_source *is, uint32_t isn)
+{
+ struct psi *psi = is->data;
+ uint32_t idx = isn - psi->interrupt;
+ uint64_t attr;
+
+ if (psi->no_lpc_irqs && idx == P8_IRQ_PSI_LPC)
+ return IRQ_ATTR_TARGET_LINUX;
+
+ /* Only direct external interrupts to OPAL if we have a handler */
+ if (idx == P8_IRQ_PSI_EXTERNAL && !platform.external_irq)
+ return IRQ_ATTR_TARGET_LINUX;
+
+ attr = IRQ_ATTR_TARGET_OPAL | IRQ_ATTR_TYPE_LSI;
+ if (idx == P8_IRQ_PSI_EXTERNAL || idx == P8_IRQ_PSI_LPC ||
+ idx == P8_IRQ_PSI_FSP)
+ attr |= IRQ_ATTR_TARGET_FREQUENT;
+ return attr;
+}
+
+static char *psi_p8_irq_name(struct irq_source *is, uint32_t isn)
+{
+ struct psi *psi = is->data;
+ uint32_t idx = isn - psi->interrupt;
+ char tmp[30];
+
+ static const char *names[P8_IRQ_PSI_IRQ_COUNT] = {
+ "fsp",
+ "occ",
+ "fsi",
+ "lpchc",
+ "local_err",
+ "external",
+ };
+
+ if (idx >= P8_IRQ_PSI_IRQ_COUNT)
+ return NULL;
+
+ snprintf(tmp, sizeof(tmp), "psi#%x:%s",
+ psi->chip_id, names[idx]);
+
+ return strdup(tmp);
+}
+
+static const struct irq_source_ops psi_p8_irq_ops = {
+ .get_xive = psi_p8_get_xive,
+ .set_xive = psi_p8_set_xive,
+ .interrupt = psihb_p8_interrupt,
+ .attributes = psi_p8_irq_attributes,
+ .name = psi_p8_irq_name,
+};
+
+static const char *psi_p9_irq_names[P9_PSI_NUM_IRQS] = {
+ "fsp",
+ "occ",
+ "fsi",
+ "lpchc",
+ "local_err",
+ "global_err",
+ "external",
+ "lpc_serirq_mux0", /* Have a callback to get name ? */
+ "lpc_serirq_mux1", /* Have a callback to get name ? */
+ "lpc_serirq_mux2", /* Have a callback to get name ? */
+ "lpc_serirq_mux3", /* Have a callback to get name ? */
+ "i2c",
+ "dio",
+ "psu"
+};
+
+static void psi_p9_mask_all(struct psi *psi)
+{
+ struct irq_source *is;
+ int isn;
+
+ /* Mask all sources */
+ is = irq_find_source(psi->interrupt);
+ for (isn = is->start; isn < is->end; isn++)
+ xive_source_mask(is, isn);
+}
+
+static void psi_p9_mask_unhandled_irq(struct irq_source *is, uint32_t isn)
+{
+ struct psi *psi = is->data;
+ int idx = isn - psi->interrupt;
+ const char *name;
+
+ if (idx < ARRAY_SIZE(psi_p9_irq_names))
+ name = psi_p9_irq_names[idx];
+ else
+ name = "unknown!";
+
+ prerror("PSI[0x%03x]: Masking unhandled LSI %d (%s)\n",
+ psi->chip_id, idx, name);
+
+ /*
+ * All the PSI interrupts are LSIs and will be constantly re-fired
+ * unless the underlying interrupt condition is cleared. If we don't
+ * have a handler for the interrupt then it needs to be masked to
+ * prevent the IRQ from locking up the thread which handles it.
+ */
+ switch (proc_gen) {
+ case proc_gen_p9:
+ xive_source_mask(is, isn);
+ break;
+ case proc_gen_p10:
+ xive2_source_mask(is, isn);
+ return;
+ default:
+ assert(false);
+ }
+
+}
+
+static void psihb_p9_interrupt(struct irq_source *is, uint32_t isn)
+{
+ struct psi *psi = is->data;
+ uint32_t idx = isn - psi->interrupt;
+
+ switch (idx) {
+ case P9_PSI_IRQ_PSI:
+ psihb_interrupt(is, isn);
+ break;
+ case P9_PSI_IRQ_OCC:
+ occ_p9_interrupt(psi->chip_id);
+ break;
+ case P9_PSI_IRQ_LPCHC:
+ lpc_interrupt(psi->chip_id);
+ break;
+ case P9_PSI_IRQ_LOCAL_ERR:
+ prd_psi_interrupt(psi->chip_id);
+ break;
+ case P9_PSI_IRQ_EXTERNAL:
+ if (platform.external_irq)
+ platform.external_irq(psi->chip_id);
+ else
+ psi_p9_mask_unhandled_irq(is, isn);
+ break;
+ case P9_PSI_IRQ_LPC_SIRQ0:
+ case P9_PSI_IRQ_LPC_SIRQ1:
+ case P9_PSI_IRQ_LPC_SIRQ2:
+ case P9_PSI_IRQ_LPC_SIRQ3:
+ lpc_serirq(psi->chip_id, idx - P9_PSI_IRQ_LPC_SIRQ0);
+ break;
+ case P9_PSI_IRQ_SBE_I2C:
+ p8_i2c_interrupt(psi->chip_id);
+ break;
+ case P9_PSI_IRQ_DIO:
+ printf("PSI: DIO irq received\n");
+ dio_interrupt_handler(psi->chip_id);
+ break;
+ case P9_PSI_IRQ_PSU:
+ p9_sbe_interrupt(psi->chip_id);
+ break;
+
+ default:
+ psi_p9_mask_unhandled_irq(is, isn);
+ }
+}
+
+static uint64_t psi_p9_irq_attributes(struct irq_source *is,
+ uint32_t isn)
+{
+ struct psi *psi = is->data;
+ unsigned int idx = isn & 0xf;
+ bool is_lpc_serirq;
+
+ is_lpc_serirq =
+ (idx == P9_PSI_IRQ_LPC_SIRQ0 ||
+ idx == P9_PSI_IRQ_LPC_SIRQ1 ||
+ idx == P9_PSI_IRQ_LPC_SIRQ2 ||
+ idx == P9_PSI_IRQ_LPC_SIRQ3);
+
+ /* If LPC interrupts are disabled, route them to Linux
+ * (who will not request them since they aren't referenced
+ * in the device tree)
+ */
+ if (is_lpc_serirq && psi->no_lpc_irqs)
+ return IRQ_ATTR_TARGET_LINUX;
+
+ /* For serirq, check the LPC layer for policy */
+ if (is_lpc_serirq)
+ return lpc_get_irq_policy(psi->chip_id, idx - P9_PSI_IRQ_LPC_SIRQ0);
+
+ /* Only direct external interrupts to OPAL if we have a handler */
+ if (idx == P9_PSI_IRQ_EXTERNAL && !platform.external_irq)
+ return IRQ_ATTR_TARGET_LINUX | IRQ_ATTR_TYPE_LSI;
+
+ return IRQ_ATTR_TARGET_OPAL | IRQ_ATTR_TYPE_LSI;
+}
+
+static char *psi_p9_irq_name(struct irq_source *is, uint32_t isn)
+{
+ struct psi *psi = is->data;
+ uint32_t idx = isn - psi->interrupt;
+ char tmp[30];
+
+ if (idx >= ARRAY_SIZE(psi_p9_irq_names))
+ return NULL;
+
+ snprintf(tmp, sizeof(tmp), "psi#%x:%s",
+ psi->chip_id, psi_p9_irq_names[idx]);
+
+ return strdup(tmp);
+}
+
+static const struct irq_source_ops psi_p9_irq_ops = {
+ .interrupt = psihb_p9_interrupt,
+ .attributes = psi_p9_irq_attributes,
+ .name = psi_p9_irq_name,
+};
+
+static void psi_init_p8_interrupts(struct psi *psi)
+{
+ uint32_t irq;
+ uint64_t xivr_p;
+
+ /* On P8 we get a block of 8, set up the base/mask
+ * and mask all the sources for now
+ */
+ out_be64(psi->regs + PSIHB_IRSN,
+ SETFIELD(PSIHB_IRSN_COMP, 0ul, psi->interrupt) |
+ SETFIELD(PSIHB_IRSN_MASK, 0ul, 0x7fff8ul) |
+ PSIHB_IRSN_DOWNSTREAM_EN |
+ PSIHB_IRSN_UPSTREAM_EN);
+
+ for (irq = 0; irq < P8_IRQ_PSI_IRQ_COUNT; irq++) {
+ xivr_p = psi_p8_irq_to_xivr[irq];
+ out_be64(psi->regs + xivr_p, (0xffull << 32) | (irq << 29));
+ }
+
+ /*
+ * Register the IRQ sources FSP, OCC, FSI, LPC
+ * and Local Error. Host Error is actually the
+ * external interrupt and the policy for that comes
+ * from the platform
+ */
+ register_irq_source(&psi_p8_irq_ops, psi,
+ psi->interrupt, P8_IRQ_PSI_IRQ_COUNT);
+}
+
+static void psi_init_p9_interrupts(struct psi *psi)
+{
+ struct proc_chip *chip;
+ u64 val;
+
+ /* Grab chip */
+ chip = get_chip(psi->chip_id);
+ if (!chip)
+ return;
+
+ /* Configure the CI BAR */
+ phys_map_get(chip->id, PSIHB_ESB, 0, &val, NULL);
+ val |= PSIHB_ESB_CI_VALID;
+ out_be64(psi->regs + PSIHB_ESB_CI_BASE, val);
+
+ val = in_be64(psi->regs + PSIHB_ESB_CI_BASE);
+ psi->esb_mmio = (void *)(val & ~PSIHB_ESB_CI_VALID);
+ prlog(PR_DEBUG, "PSI[0x%03x]: ESB MMIO at @%p\n",
+ psi->chip_id, psi->esb_mmio);
+
+ /* Register sources */
+ prlog(PR_DEBUG,
+ "PSI[0x%03x]: Interrupts sources registered for P9 DD2.x\n",
+ psi->chip_id);
+ xive_register_hw_source(psi->interrupt, P9_PSI_NUM_IRQS,
+ 12, psi->esb_mmio, XIVE_SRC_LSI,
+ psi, &psi_p9_irq_ops);
+
+ psi_p9_mask_all(psi);
+
+ /* Setup interrupt offset */
+ val = xive_get_notify_base(psi->interrupt);
+ val <<= 32;
+ out_be64(psi->regs + PSIHB_IVT_OFFSET, val);
+
+ /* Grab and configure the notification port */
+ val = xive_get_notify_port(psi->chip_id, XIVE_HW_SRC_PSI);
+ val |= PSIHB_ESB_NOTIF_VALID;
+ out_be64(psi->regs + PSIHB_ESB_NOTIF_ADDR, val);
+
+ /* Reset irq handling and switch to ESB mode */
+ out_be64(psi->regs + PSIHB_INTERRUPT_CONTROL, PSIHB_IRQ_RESET);
+ out_be64(psi->regs + PSIHB_INTERRUPT_CONTROL, 0);
+}
+
+/*
+ * P9 and P10 have the same PSIHB interface
+ */
+static const struct irq_source_ops psi_p10_irq_ops = {
+ .interrupt = psihb_p9_interrupt,
+ .attributes = psi_p9_irq_attributes,
+ .name = psi_p9_irq_name,
+};
+
+#define PSIHB10_CAN_STORE_EOI(x) XIVE2_STORE_EOI_ENABLED
+
+static void psi_init_p10_interrupts(struct psi *psi)
+{
+ struct proc_chip *chip;
+ u64 val;
+ uint32_t esb_shift = 16;
+ uint32_t flags = XIVE_SRC_LSI;
+ struct irq_source *is;
+ int isn;
+
+ /* Grab chip */
+ chip = get_chip(psi->chip_id);
+ if (!chip)
+ return;
+
+ /* Configure the CI BAR */
+ phys_map_get(chip->id, PSIHB_ESB, 0, &val, NULL);
+ val |= PSIHB_ESB_CI_VALID;
+ if (esb_shift == 16)
+ val |= PSIHB10_ESB_CI_64K;
+ out_be64(psi->regs + PSIHB_ESB_CI_BASE, val);
+
+ val = in_be64(psi->regs + PSIHB_ESB_CI_BASE);
+ psi->esb_mmio = (void *)(val & ~(PSIHB_ESB_CI_VALID|PSIHB10_ESB_CI_64K));
+ prlog(PR_DEBUG, "PSI[0x%03x]: ESB MMIO at @%p\n",
+ psi->chip_id, psi->esb_mmio);
+
+ /* Store EOI */
+ if (PSIHB10_CAN_STORE_EOI(psi)) {
+ val = in_be64(psi->regs + PSIHB_CR);
+ val |= PSIHB10_CR_STORE_EOI;
+ out_be64(psi->regs + PSIHB_CR, val);
+ prlog(PR_DEBUG, "PSI[0x%03x]: store EOI is enabled\n",
+ psi->chip_id);
+ flags |= XIVE_SRC_STORE_EOI;
+ }
+
+ /* Register sources */
+ prlog(PR_DEBUG,
+ "PSI[0x%03x]: Interrupts sources registered for P10 DD%i.%i\n",
+ psi->chip_id, 0xf & (chip->ec_level >> 4), chip->ec_level & 0xf);
+
+ xive2_register_hw_source(psi->interrupt, P9_PSI_NUM_IRQS,
+ esb_shift, psi->esb_mmio, flags,
+ psi, &psi_p10_irq_ops);
+
+ /* Mask all sources */
+ is = irq_find_source(psi->interrupt);
+ for (isn = is->start; isn < is->end; isn++)
+ xive2_source_mask(is, isn);
+
+ /* Setup interrupt offset */
+ val = xive2_get_notify_base(psi->interrupt);
+ val <<= 32;
+ out_be64(psi->regs + PSIHB_IVT_OFFSET, val);
+
+ /* Grab and configure the notification port */
+ val = xive2_get_notify_port(psi->chip_id, XIVE_HW_SRC_PSI);
+ val |= PSIHB_ESB_NOTIF_VALID;
+ out_be64(psi->regs + PSIHB_ESB_NOTIF_ADDR, val);
+
+ /* Reset irq handling and switch to ESB mode */
+ out_be64(psi->regs + PSIHB_INTERRUPT_CONTROL, PSIHB_IRQ_RESET);
+ out_be64(psi->regs + PSIHB_INTERRUPT_CONTROL, 0);
+}
+
+static void psi_init_interrupts(struct psi *psi)
+{
+ /* Configure the interrupt BUID and mask it */
+ switch (proc_gen) {
+ case proc_gen_p8:
+ psi_init_p8_interrupts(psi);
+ break;
+ case proc_gen_p9:
+ psi_init_p9_interrupts(psi);
+ break;
+ case proc_gen_p10:
+ psi_init_p10_interrupts(psi);
+ break;
+ default:
+ /* Unknown: just no interrupts */
+ prerror("PSI: Unknown interrupt type\n");
+ }
+}
+
+static void psi_activate_phb(struct psi *psi)
+{
+ u64 reg;
+
+ /*
+ * Disable interrupt emission in the control register;
+ * it will be re-enabled later, once the mailbox
+ * interrupt has been enabled.
+ */
+ reg = in_be64(psi->regs + PSIHB_CR);
+ reg &= ~PSIHB_CR_FSP_IRQ_ENABLE;
+ out_be64(psi->regs + PSIHB_CR, reg);
+
+ /* Enable interrupts in the mask register. We enable everything
+ * except for bit "FSP command error detected" which the doc
+ * (P7 BookIV) says should be masked for normal ops. It also
+ * seems to be masked under OPAL.
+ */
+ reg = 0x0000010000100000ull;
+ out_be64(psi->regs + PSIHB_SEMR, reg);
+
+#if 0
+ /* Dump the GXHB registers */
+ printf(" PSIHB_BBAR : %llx\n",
+ in_be64(psi->regs + PSIHB_BBAR));
+ printf(" PSIHB_FSPBAR : %llx\n",
+ in_be64(psi->regs + PSIHB_FSPBAR));
+ printf(" PSIHB_FSPMMR : %llx\n",
+ in_be64(psi->regs + PSIHB_FSPMMR));
+ printf(" PSIHB_TAR : %llx\n",
+ in_be64(psi->regs + PSIHB_TAR));
+ printf(" PSIHB_CR : %llx\n",
+ in_be64(psi->regs + PSIHB_CR));
+ printf(" PSIHB_SEMR : %llx\n",
+ in_be64(psi->regs + PSIHB_SEMR));
+ printf(" PSIHB_XIVR : %llx\n",
+ in_be64(psi->regs + PSIHB_XIVR));
+#endif
+}
+
+static void psi_create_p9_int_map(struct psi *psi, struct dt_node *np)
+{
+ __be32 map[P9_PSI_NUM_IRQS][4];
+ int i;
+
+ for (i = 0; i < P9_PSI_NUM_IRQS; i++) {
+ map[i][0] = cpu_to_be32(i);
+ map[i][1] = cpu_to_be32(get_ics_phandle());
+ map[i][2] = cpu_to_be32(psi->interrupt + i);
+ map[i][3] = cpu_to_be32(1);
+ }
+ dt_add_property(np, "interrupt-map", map, sizeof(map));
+ dt_add_property_cells(np, "#address-cells", 0);
+ dt_add_property_cells(np, "#interrupt-cells", 1);
+}
+
+static void psi_create_mm_dtnode(struct psi *psi)
+{
+ struct dt_node *np;
+ uint64_t addr = (uint64_t)psi->regs;
+
+ np = dt_new_addr(dt_root, "psi", addr);
+ if (!np)
+ return;
+
+ /* Hard wire size to 4G */
+ dt_add_property_u64s(np, "reg", addr, 0x100000000ull);
+ switch (proc_gen) {
+ case proc_gen_p8:
+ dt_add_property_strings(np, "compatible", "ibm,psi",
+ "ibm,power8-psi");
+ break;
+ case proc_gen_p9:
+ case proc_gen_p10:
+ dt_add_property_strings(np, "compatible", "ibm,psi",
+ "ibm,power9-psi");
+ psi_create_p9_int_map(psi, np);
+ break;
+ default:
+ assert(0);
+ break;
+ }
+ dt_add_property_cells(np, "interrupt-parent", get_ics_phandle());
+ dt_add_property_cells(np, "interrupts", psi->interrupt, 1);
+ dt_add_property_cells(np, "ibm,chip-id", psi->chip_id);
+ psi->node = np;
+}
+
+static struct psi *alloc_psi(struct proc_chip *chip, uint64_t base)
+{
+ struct psi *psi;
+
+ psi = zalloc(sizeof(struct psi));
+ if (!psi) {
+ prerror("PSI: Could not allocate memory\n");
+ return NULL;
+ }
+ psi->xscom_base = base;
+ psi->chip_id = chip->id;
+ return psi;
+}
+
+static struct psi *psi_probe_p8(struct proc_chip *chip, u64 base)
+{
+ struct psi *psi = NULL;
+ uint64_t rc, val;
+
+ rc = xscom_read(chip->id, base + PSIHB_XSCOM_P8_BASE, &val);
+ if (rc) {
+ prerror("PSI[0x%03x]: Error %llx reading PSIHB BAR\n",
+ chip->id, rc);
+ return NULL;
+ }
+ if (val & PSIHB_XSCOM_P8_HBBAR_EN) {
+ psi = alloc_psi(chip, base);
+ if (!psi)
+ return NULL;
+ psi->regs = (void *)(val & ~PSIHB_XSCOM_P8_HBBAR_EN);
+ psi->interrupt = get_psi_interrupt(chip->id);
+ } else
+ printf("PSI[0x%03x]: Working chip not found\n", chip->id);
+
+ return psi;
+}
+
+static struct psi *psi_probe_p9(struct proc_chip *chip, u64 base)
+{
+ struct psi *psi = NULL;
+ uint64_t addr;
+
+ phys_map_get(chip->id, PSIHB_REG, 0, &addr, NULL);
+ xscom_write(chip->id, base + PSIHB_XSCOM_P9_BASE,
+ addr | PSIHB_XSCOM_P9_HBBAR_EN);
+
+ psi = alloc_psi(chip, base);
+ if (!psi)
+ return NULL;
+ psi->regs = (void *)addr;
+ psi->interrupt = xive_alloc_hw_irqs(chip->id, P9_PSI_NUM_IRQS, 16);
+ return psi;
+}
+
+static struct psi *psi_probe_p10(struct proc_chip *chip, u64 base)
+{
+ struct psi *psi = NULL;
+ uint64_t addr;
+
+ phys_map_get(chip->id, PSIHB_REG, 0, &addr, NULL);
+ xscom_write(chip->id, base + PSIHB_XSCOM_P9_BASE,
+ addr | PSIHB_XSCOM_P9_HBBAR_EN);
+
+ psi = alloc_psi(chip, base);
+ if (!psi)
+ return NULL;
+ psi->regs = (void *)addr;
+ psi->interrupt = xive2_alloc_hw_irqs(chip->id, P9_PSI_NUM_IRQS, 16);
+ return psi;
+}
+
+static bool psi_init_psihb(struct dt_node *psihb)
+{
+ uint32_t chip_id = dt_get_chip_id(psihb);
+ struct proc_chip *chip = get_chip(chip_id);
+ struct psi *psi = NULL;
+ u64 base, val;
+
+ if (!chip) {
+ prerror("PSI: Can't find chip!\n");
+ return false;
+ }
+
+ base = dt_get_address(psihb, 0, NULL);
+
+ if (dt_node_is_compatible(psihb, "ibm,power8-psihb-x"))
+ psi = psi_probe_p8(chip, base);
+ else if (dt_node_is_compatible(psihb, "ibm,power9-psihb-x"))
+ psi = psi_probe_p9(chip, base);
+ else if (dt_node_is_compatible(psihb, "ibm,power10-psihb-x"))
+ psi = psi_probe_p10(chip, base);
+ else {
+ prerror("PSI: Unknown processor type\n");
+ return false;
+ }
+ if (!psi)
+ return false;
+
+ list_add(&psis, &psi->list);
+
+ val = in_be64(psi->regs + PSIHB_CR);
+ if (val & PSIHB_CR_FSP_LINK_ACTIVE) {
+ lock(&psi_lock);
+ psi->active = true;
+ unlock(&psi_lock);
+ }
+ chip->psi = psi;
+
+ if (dt_has_node_property(psihb, "no-lpc-interrupts", NULL))
+ psi->no_lpc_irqs = true;
+
+ psi_activate_phb(psi);
+ psi_init_interrupts(psi);
+ psi_create_mm_dtnode(psi);
+
+ prlog(PR_INFO, "PSI[0x%03x]: Found PSI bridge [active=%d]\n",
+ psi->chip_id, psi->active);
+ return true;
+}
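+
+/*
+ * For reference, the shape of device-tree node this code matches (the
+ * address and values below are illustrative placeholders, not real HDAT
+ * output):
+ *
+ *	psihb@<xscom-offset> {
+ *		compatible = "ibm,power9-psihb-x", "ibm,psihb-x";
+ *		reg = <xscom-offset length>;
+ *		ibm,chip-id = <0>;
+ *	};
+ */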
+
+void psi_fsp_link_in_use(struct psi *psi __unused)
+{
+ static bool poller_created = false;
+
+ /* Do this once only */
+ if (!poller_created) {
+ poller_created = true;
+ opal_add_poller(psi_link_poll, NULL);
+ }
+}
+
+struct psi *psi_find_functional_chip(void)
+{
+ return list_top(&psis, struct psi, list);
+}
+
+void psi_init(void)
+{
+ struct dt_node *np;
+
+ dt_for_each_compatible(dt_root, np, "ibm,psihb-x")
+ psi_init_psihb(np);
+}
+
+
diff --git a/roms/skiboot/hw/sbe-p8.c b/roms/skiboot/hw/sbe-p8.c
new file mode 100644
index 000000000..73fa5f1f2
--- /dev/null
+++ b/roms/skiboot/hw/sbe-p8.c
@@ -0,0 +1,195 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * POWER8 Self Boot Engine (SLW - SLeep/Winkle)
+ *
+ * Copyright 2013-2018 IBM Corp.
+ */
+
+#include <device.h>
+#include <sbe-p8.h>
+#include <skiboot.h>
+#include <timebase.h>
+#include <xscom.h>
+
+/* SLW timer related stuff */
+static bool sbe_has_timer;
+static uint64_t sbe_timer_inc;
+static uint64_t sbe_timer_target;
+static uint32_t sbe_timer_chip;
+static uint64_t sbe_last_gen;
+static uint64_t sbe_last_gen_stamp;
+
+static void p8_sbe_dump_timer_ffdc(void)
+{
+ uint64_t i, val;
+ int64_t rc;
+
+ static const uint32_t dump_regs[] = {
+ 0xe0000, 0xe0001, 0xe0002, 0xe0003,
+ 0xe0004, 0xe0005, 0xe0006, 0xe0007,
+ 0xe0008, 0xe0009, 0xe000a, 0xe000b,
+ 0xe000c, 0xe000d, 0xe000e, 0xe000f,
+ 0xe0010, 0xe0011, 0xe0012, 0xe0013,
+ 0xe0014, 0xe0015, 0xe0016, 0xe0017,
+ 0xe0018, 0xe0019,
+ 0x5001c,
+ 0x50038, 0x50039, 0x5003a, 0x5003b
+ };
+
+ /**
+ * @fwts-label SLWRegisterDump
+ * @fwts-advice An error condition occurred in the sleep/winkle
+ * engine's timer state machine. Dumping debug information to
+ * help root-cause the issue. OPAL/skiboot may be stuck on some
+ * operation that requires the SLW timer state machine (e.g.
+ * core powersaving).
+ */
+ prlog(PR_DEBUG, "SLW: Register state:\n");
+
+ for (i = 0; i < ARRAY_SIZE(dump_regs); i++) {
+ uint32_t reg = dump_regs[i];
+ rc = xscom_read(sbe_timer_chip, reg, &val);
+ if (rc) {
+ prlog(PR_DEBUG, "SLW: XSCOM error %lld reading"
+ " reg 0x%x\n", rc, reg);
+ break;
+ }
+ prlog(PR_DEBUG, "SLW: %5x = %016llx\n", reg, val);
+ }
+}
+
+/* This is called with the timer lock held, so there is no
+ * issue with re-entrancy or concurrence
+ */
+void p8_sbe_update_timer_expiry(uint64_t new_target)
+{
+ uint64_t count, gen, gen2, req, now;
+ int64_t rc;
+
+ if (!sbe_has_timer || new_target == sbe_timer_target)
+ return;
+
+ sbe_timer_target = new_target;
+
+ _xscom_lock();
+ now = mftb();
+ /* Calculate how many increments from now, rounded up */
+ if (now < new_target)
+ count = (new_target - now + sbe_timer_inc - 1) / sbe_timer_inc;
+ else
+ count = 1;
+
+ /* Max counter is 24-bit */
+ if (count > 0xffffff)
+ count = 0xffffff;
+ /* Fabricate update request */
+ req = (1ull << 63) | (count << 32);
+
+ prlog(PR_TRACE, "SLW: TMR expiry: 0x%llx, req: %016llx\n", count, req);
+
+ do {
+ /* Grab generation and spin if odd */
+ for (;;) {
+ rc = _xscom_read(sbe_timer_chip, 0xE0006, &gen, false);
+ if (rc) {
+ prerror("SLW: Error %lld reading tmr gen "
+ " count\n", rc);
+ _xscom_unlock();
+ return;
+ }
+ if (!(gen & 1))
+ break;
+ if (tb_compare(now + msecs_to_tb(1), mftb()) == TB_ABEFOREB) {
+ /**
+ * @fwts-label SLWTimerStuck
+ * @fwts-advice The SLeep/Winkle Engine (SLW)
+ * failed to increment the generation number
+ * within our timeout period (it *should* have
+ * done so within ~10us, not >1ms). OPAL uses
+ * the SLW timer to schedule some operations,
+ * but can fall back to the (much less frequent)
+ * OPAL poller, which, although it does not affect
+ * functionality, runs *much* less frequently.
+ * This could have the effect of slow I2C
+ * operations (for example). It may also mean
+ * that you see an increase in jitter, due
+ * to slow interactions with the SLW.
+ * This error may also occur if the machine
+ * is connected to via soft FSI.
+ */
+ prerror("SLW: timer stuck, falling back to OPAL pollers. You will likely have slower I2C and may have experienced increased jitter.\n");
+ prlog(PR_DEBUG, "SLW: Stuck with odd generation !\n");
+ _xscom_unlock();
+ sbe_has_timer = false;
+ p8_sbe_dump_timer_ffdc();
+ return;
+ }
+ }
+
+ rc = _xscom_write(sbe_timer_chip, 0x5003A, req, false);
+ if (rc) {
+ prerror("SLW: Error %lld writing tmr request\n", rc);
+ _xscom_unlock();
+ return;
+ }
+
+ /* Re-check gen count */
+ rc = _xscom_read(sbe_timer_chip, 0xE0006, &gen2, false);
+ if (rc) {
+ prerror("SLW: Error %lld re-reading tmr gen "
+ " count\n", rc);
+ _xscom_unlock();
+ return;
+ }
+ } while(gen != gen2);
+ _xscom_unlock();
+
+ /* Check if the timer is working. If at least 1ms has elapsed
+ * since the last call to this function, check that the gen
+ * count has changed
+ */
+ if (tb_compare(sbe_last_gen_stamp + msecs_to_tb(1), now)
+ == TB_ABEFOREB) {
+ if (sbe_last_gen == gen) {
+ prlog(PR_ERR,
+ "SLW: Timer appears to not be running !\n");
+ sbe_has_timer = false;
+ p8_sbe_dump_timer_ffdc();
+ }
+ sbe_last_gen = gen;
+ sbe_last_gen_stamp = mftb();
+ }
+
+ prlog(PR_TRACE, "SLW: gen: %llx\n", gen);
+}
+
+bool p8_sbe_timer_ok(void)
+{
+ return sbe_has_timer;
+}
+
+void p8_sbe_init_timer(void)
+{
+ struct dt_node *np;
+ int64_t rc;
+ uint32_t tick_us;
+
+ np = dt_find_compatible_node(dt_root, NULL, "ibm,power8-sbe-timer");
+ if (!np)
+ return;
+
+ sbe_timer_chip = dt_get_chip_id(np);
+ tick_us = dt_prop_get_u32(np, "tick-time-us");
+ sbe_timer_inc = usecs_to_tb(tick_us);
+ sbe_timer_target = ~0ull;
+
+ rc = xscom_read(sbe_timer_chip, 0xE0006, &sbe_last_gen);
+ if (rc) {
+ prerror("SLW: Error %lld reading tmr gen count\n", rc);
+ return;
+ }
+ sbe_last_gen_stamp = mftb();
+
+ prlog(PR_INFO, "SLW: Timer facility on chip %d, resolution %dus\n",
+ sbe_timer_chip, tick_us);
+ sbe_has_timer = true;
+}
diff --git a/roms/skiboot/hw/sbe-p9.c b/roms/skiboot/hw/sbe-p9.c
new file mode 100644
index 000000000..898a1fb56
--- /dev/null
+++ b/roms/skiboot/hw/sbe-p9.c
@@ -0,0 +1,1040 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ *
+ * P9 OPAL - SBE communication driver
+ *
+ * SBE firmware at https://github.com/open-power/sbe
+ *
+ * The P9 chip has a Self Boot Engine (SBE). OPAL uses the SBE for various
+ * purposes like the timer, SCOM access, MPIPL, etc. Every chip has an SBE and
+ * OPAL can communicate with the SBE on all chips. Based on the message type it
+ * selects the appropriate SBE (e.g. schedule a timer on any chip).
+ *
+ * OPAL communicates with the SBE via a set of data and control registers
+ * provided by the PSU block in the P9 chip:
+ * - Four 8 byte registers for the Host to send command packets to the SBE.
+ * - Four 8 byte registers for the SBE to send response packets to the Host.
+ * - Two doorbell registers (one on each side) to alert either party
+ *   when data is placed in the above mentioned data registers. Once the
+ *   Host/SBE reads the incoming data, it should clear the doorbell register.
+ *   The interrupt is disabled as soon as the doorbell register is cleared.
+ *
+ * OPAL - SBE message format:
+ * - OPAL communicates with the SBE via a set of well defined commands.
+ * - Reg0 contains the message header (command class, subclass, flags etc).
+ * - Reg1-3 contain the actual data. If the data is large then the indirect
+ *   method is used (the data is passed via memory and the memory address/size
+ *   is passed in Reg1-3).
+ * - Every message has a defined timeout. The SBE must respond within the
+ *   specified time, otherwise OPAL discards the message and sends an error
+ *   message to the caller.
+ *
+ * Constraints:
+ * - Only one command is accepted in the command buffer until the response for
+ *   the command is enqueued in the response buffer by the SBE.
+ *
+ * Copyright 2017-2019 IBM Corp.
+ */
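+
+/*
+ * Illustrative usage sketch (not part of the driver): callers build a
+ * direct-data command with p9_sbe_mkmsg() and either queue it with
+ * p9_sbe_queue_msg() (asynchronous, with a completion callback) or block
+ * on it with p9_sbe_sync_msg(). The control flag below is one used elsewhere
+ * in this file; the cmd/reg/chip_id values are placeholders.
+ *
+ *	struct p9_sbe_msg *msg;
+ *	int rc;
+ *
+ *	msg = p9_sbe_mkmsg(cmd, SBE_CMD_CTRL_RESP_REQ, reg1, reg2, reg3);
+ *	if (msg)
+ *		rc = p9_sbe_sync_msg(chip_id, msg, true);
+ *
+ * Passing true as the last argument frees the message once it completes.
+ */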
+
+#define pr_fmt(fmt) "SBE: " fmt
+
+#include <chip.h>
+#include <errorlog.h>
+#include <lock.h>
+#include <opal.h>
+#include <opal-dump.h>
+#include <sbe-p9.h>
+#include <skiboot.h>
+#include <timebase.h>
+#include <timer.h>
+#include <trace.h>
+#include <xscom.h>
+
+enum p9_sbe_mbox_state {
+ sbe_mbox_idle = 0, /* Ready to send message */
+ sbe_mbox_send, /* Message sent, waiting for ack/response */
+ sbe_mbox_rr, /* SBE in R/R */
+};
+
+struct p9_sbe {
+ /* Chip ID to send message */
+ u32 chip_id;
+
+ /* List to hold SBE queue messages */
+ struct list_head msg_list;
+
+ struct lock lock;
+
+ enum p9_sbe_mbox_state state;
+
+ /* SBE MBOX message sequence number */
+ u16 cur_seq;
+};
+
+/* Default SBE chip ID */
+static int sbe_default_chip_id = -1;
+
+/* Is SBE timer running? */
+static bool sbe_has_timer = false;
+static bool sbe_timer_in_progress = false;
+static bool has_new_target = false;
+
+/* Inflight and next timer in TB */
+static uint64_t sbe_last_gen_stamp;
+static uint64_t sbe_timer_target;
+
+/* Timer lock */
+static struct lock sbe_timer_lock;
+
+/*
+ * The minimum timeout value for P9 is 500 microseconds. Beyond that, the
+ * SBE timer can handle a granularity of 1 microsecond.
+ */
+#define SBE_TIMER_DEFAULT_US 500
+static uint64_t sbe_timer_def_tb;
+
+/*
+ * Rate limit continuous timer updates.
+ * We can update the inflight timer if the new timer request is earlier than
+ * the inflight one. Limit such updates so that the SBE gets time to handle
+ * FIFO side requests.
+ */
+#define SBE_TIMER_UPDATE_MAX 2
+static uint32_t timer_update_cnt = 0;
+
+/* Timer control message */
+static struct p9_sbe_msg *timer_ctrl_msg;
+
+#define SBE_STATUS_PRI_SHIFT 0x30
+#define SBE_STATUS_SEC_SHIFT 0x20
+
+/* Forward declaration */
+static void p9_sbe_timeout_poll_one(struct p9_sbe *sbe);
+static void p9_sbe_timer_schedule(void);
+
+/* bit 0-15 : Primary status code */
+static inline u16 p9_sbe_get_primary_rc(struct p9_sbe_msg *resp)
+{
+ return (resp->reg[0] >> SBE_STATUS_PRI_SHIFT);
+}
+
+static inline void p9_sbe_set_primary_rc(struct p9_sbe_msg *resp, u64 rc)
+{
+ resp->reg[0] |= (rc << SBE_STATUS_PRI_SHIFT);
+}
+
+static u64 p9_sbe_rreg(u32 chip_id, u64 reg)
+{
+ u64 data = 0;
+ int rc;
+
+ rc = xscom_read(chip_id, reg, &data);
+ if (rc != OPAL_SUCCESS) {
+ prlog(PR_DEBUG, "XSCOM error %d reading reg 0x%llx\n", rc, reg);
+ return 0xffffffff;
+ }
+
+ return data;
+}
+
+static void p9_sbe_reg_dump(u32 chip_id)
+{
+#define SBE_DUMP_REG_ONE(chip_id, x) \
+ prlog(PR_DEBUG, " %20s: %016llx\n", #x, p9_sbe_rreg(chip_id, x))
+
+ prlog(PR_DEBUG, "MBOX register dump for chip : %x\n", chip_id);
+ SBE_DUMP_REG_ONE(chip_id, PSU_SBE_DOORBELL_REG_RW);
+ SBE_DUMP_REG_ONE(chip_id, PSU_HOST_SBE_MBOX_REG0);
+ SBE_DUMP_REG_ONE(chip_id, PSU_HOST_SBE_MBOX_REG1);
+ SBE_DUMP_REG_ONE(chip_id, PSU_HOST_SBE_MBOX_REG2);
+ SBE_DUMP_REG_ONE(chip_id, PSU_HOST_SBE_MBOX_REG3);
+ SBE_DUMP_REG_ONE(chip_id, PSU_HOST_DOORBELL_REG_RW);
+ SBE_DUMP_REG_ONE(chip_id, PSU_HOST_SBE_MBOX_REG4);
+ SBE_DUMP_REG_ONE(chip_id, PSU_HOST_SBE_MBOX_REG5);
+ SBE_DUMP_REG_ONE(chip_id, PSU_HOST_SBE_MBOX_REG6);
+ SBE_DUMP_REG_ONE(chip_id, PSU_HOST_SBE_MBOX_REG7);
+}
+
+void p9_sbe_freemsg(struct p9_sbe_msg *msg)
+{
+ if (msg && msg->resp)
+ free(msg->resp);
+ free(msg);
+}
+
+static void p9_sbe_fillmsg(struct p9_sbe_msg *msg, u16 cmd,
+ u16 ctrl_flag, u64 reg1, u64 reg2, u64 reg3)
+{
+ bool response = !!(ctrl_flag & SBE_CMD_CTRL_RESP_REQ);
+ u16 flag;
+
+ /*
+	 * Always set the ack required flag. The SBE will interrupt OPAL once it
+	 * reads the message from the mailbox register. If OPAL is expecting a
+	 * response, it will update the message timeout, otherwise it will send
+	 * the next message.
+ */
+ flag = ctrl_flag | SBE_CMD_CTRL_ACK_REQ;
+
+	/* The sequence ID is filled in by p9_sbe_queue_msg() */
+ msg->reg[0] = ((u64)flag << 32) | cmd;
+ msg->reg[1] = reg1;
+ msg->reg[2] = reg2;
+ msg->reg[3] = reg3;
+ msg->state = sbe_msg_unused;
+ msg->response = response;
+}
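+
+/*
+ * Resulting reg[0] layout (as built above, with the sequence number ORed in
+ * later by p9_sbe_queue_msg()), from low bits to high bits:
+ *
+ *	command (16 bits) | sequence number (16 bits) | control flags (16 bits)
+ *
+ * e.g. flags 0x0005, sequence 0x0001 and a (hypothetical) command value of
+ * 0xaabb would give reg[0] = 0x000000050001aabb.
+ */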
+
+static struct p9_sbe_msg *p9_sbe_allocmsg(bool alloc_resp)
+{
+ struct p9_sbe_msg *msg;
+
+ msg = zalloc(sizeof(struct p9_sbe_msg));
+ if (!msg) {
+ prlog(PR_ERR, "Failed to allocate SBE message\n");
+ return NULL;
+ }
+ if (alloc_resp) {
+ msg->resp = zalloc(sizeof(struct p9_sbe_msg));
+ if (!msg->resp) {
+ prlog(PR_ERR, "Failed to allocate SBE resp message\n");
+ free(msg);
+ return NULL;
+ }
+ }
+
+ return msg;
+}
+
+/*
+ * Handles "command with direct data" format only.
+ *
+ * Note: All mbox messages of interest to us use the direct data format. If we
+ * ever need the indirect data format then we may have to enhance this function.
+ */
+struct p9_sbe_msg *p9_sbe_mkmsg(u16 cmd, u16 ctrl_flag,
+ u64 reg1, u64 reg2, u64 reg3)
+{
+ struct p9_sbe_msg *msg;
+
+ msg = p9_sbe_allocmsg(!!(ctrl_flag & SBE_CMD_CTRL_RESP_REQ));
+ if (!msg)
+ return NULL;
+
+ p9_sbe_fillmsg(msg, cmd, ctrl_flag, reg1, reg2, reg3);
+ return msg;
+}
+
+static inline bool p9_sbe_mbox_busy(struct p9_sbe *sbe)
+{
+ return (sbe->state != sbe_mbox_idle);
+}
+
+static inline bool p9_sbe_msg_busy(struct p9_sbe_msg *msg)
+{
+ switch (msg->state) {
+ case sbe_msg_queued:
+ /* fall through */
+ case sbe_msg_sent:
+ case sbe_msg_wresp:
+ return true;
+ default: /* + sbe_msg_unused, sbe_msg_done,
+ sbe_msg_timeout, sbe_msg_error */
+ break;
+ }
+ return false;
+}
+
+static inline struct p9_sbe *p9_sbe_get_sbe(u32 chip_id)
+{
+ struct proc_chip *chip;
+
+ /* Default to SBE on master chip */
+ if (chip_id == -1) {
+ if (sbe_default_chip_id == -1)
+ return NULL;
+
+ chip = get_chip(sbe_default_chip_id);
+ } else {
+ chip = get_chip(chip_id);
+ }
+ if (chip == NULL || chip->sbe == NULL)
+ return NULL;
+
+ return chip->sbe;
+}
+
+static int p9_sbe_msg_send(struct p9_sbe *sbe, struct p9_sbe_msg *msg)
+{
+ int rc, i;
+ u64 addr, *data;
+
+ addr = PSU_HOST_SBE_MBOX_REG0;
+ data = &msg->reg[0];
+
+ for (i = 0; i < NR_HOST_SBE_MBOX_REG; i++) {
+ rc = xscom_write(sbe->chip_id, addr, *data);
+ if (rc)
+ return rc;
+
+ addr++;
+ data++;
+ }
+
+ rc = xscom_write(sbe->chip_id, PSU_SBE_DOORBELL_REG_OR,
+ HOST_SBE_MSG_WAITING);
+ if (rc != OPAL_SUCCESS)
+ return rc;
+
+ prlog(PR_TRACE, "Message queued [chip id = 0x%x]:\n", sbe->chip_id);
+ for (i = 0; i < 4; i++)
+ prlog(PR_TRACE, " Reg%d : %016llx\n", i, msg->reg[i]);
+
+ msg->timeout = mftb() + msecs_to_tb(SBE_CMD_TIMEOUT_MAX);
+ sbe->state = sbe_mbox_send;
+ msg->state = sbe_msg_sent;
+ return rc;
+}
+
+static int p9_sbe_msg_receive(u32 chip_id, struct p9_sbe_msg *resp)
+{
+ int i;
+ int rc = OPAL_SUCCESS;
+ u64 addr, *data;
+
+ addr = PSU_HOST_SBE_MBOX_REG4;
+ data = &resp->reg[0];
+
+ for (i = 0; i < NR_HOST_SBE_MBOX_REG; i++) {
+ rc = xscom_read(chip_id, addr, data);
+ if (rc)
+ return rc;
+
+ addr++;
+ data++;
+ }
+ return rc;
+}
+
+/* WARNING: This will drop sbe->lock */
+static void p9_sbe_msg_complete(struct p9_sbe *sbe, struct p9_sbe_msg *msg,
+ enum p9_sbe_msg_state msg_state)
+{
+ void (*comp)(struct p9_sbe_msg *msg);
+
+ prlog(PR_TRACE, "Completing msg [chip id = %x], reg0 : 0x%llx\n",
+ sbe->chip_id, msg->reg[0]);
+
+ comp = msg->complete;
+ list_del(&msg->link);
+ sync();
+ msg->state = msg_state;
+
+ if (comp) {
+ unlock(&sbe->lock);
+ comp(msg);
+ lock(&sbe->lock);
+ }
+}
+
+/* WARNING: This will drop sbe->lock */
+static void p9_sbe_send_complete(struct p9_sbe *sbe)
+{
+ struct p9_sbe_msg *msg;
+
+ if (list_empty(&sbe->msg_list))
+ return;
+
+ msg = list_top(&sbe->msg_list, struct p9_sbe_msg, link);
+ /* Need response */
+ if (msg->response) {
+ msg->state = sbe_msg_wresp;
+ } else {
+ sbe->state = sbe_mbox_idle;
+ p9_sbe_msg_complete(sbe, msg, sbe_msg_done);
+ }
+}
+
+/* WARNING: This will drop sbe->lock */
+static void p9_sbe_process_queue(struct p9_sbe *sbe)
+{
+ int rc, retry_cnt = 0;
+ struct p9_sbe_msg *msg = NULL;
+
+ if (p9_sbe_mbox_busy(sbe))
+ return;
+
+ while (!list_empty(&sbe->msg_list)) {
+ msg = list_top(&sbe->msg_list, struct p9_sbe_msg, link);
+ /* Send message */
+ rc = p9_sbe_msg_send(sbe, msg);
+ if (rc == OPAL_SUCCESS)
+ return;
+
+ prlog(PR_ERR, "Failed to send message to SBE [chip id = %x]\n",
+ sbe->chip_id);
+ if (msg->resp) {
+ p9_sbe_set_primary_rc(msg->resp,
+ SBE_STATUS_PRI_GENERIC_ERR);
+ }
+ p9_sbe_msg_complete(sbe, msg, sbe_msg_error);
+
+ /*
+		 * We repeatedly failed to send the message to the SBE.
+		 * Let's stop sending messages for now.
+ */
+ if (retry_cnt++ >= 3) {
+ prlog(PR_ERR, "Temporarily stopped sending "
+ "message to SBE\n");
+ return;
+ }
+ }
+}
+
+/*
+ * WARNING:
+ * Only one command is accepted in the command buffer until response
+ * to the command is enqueued in the response buffer by SBE.
+ *
+ * Head of msg_list contains in-flight message. Hence we should always
+ * add new message to tail of the list.
+ */
+int p9_sbe_queue_msg(u32 chip_id, struct p9_sbe_msg *msg,
+ void (*comp)(struct p9_sbe_msg *msg))
+{
+ struct p9_sbe *sbe;
+
+ if (!msg)
+ return OPAL_PARAMETER;
+
+ sbe = p9_sbe_get_sbe(chip_id);
+ if (!sbe)
+ return OPAL_HARDWARE;
+
+ lock(&sbe->lock);
+ /* Set completion and update sequence number */
+ msg->complete = comp;
+ msg->state = sbe_msg_queued;
+ msg->reg[0] = msg->reg[0] | ((u64)sbe->cur_seq << 16);
+ sbe->cur_seq++;
+
+ /* Reset sequence number */
+ if (sbe->cur_seq == 0xffff)
+ sbe->cur_seq = 1;
+
+ /* Add message to queue */
+ list_add_tail(&sbe->msg_list, &msg->link);
+ p9_sbe_process_queue(sbe);
+ unlock(&sbe->lock);
+
+ return OPAL_SUCCESS;
+}
+
+int p9_sbe_sync_msg(u32 chip_id, struct p9_sbe_msg *msg, bool autofree)
+{
+ int rc;
+ struct p9_sbe *sbe;
+
+ rc = p9_sbe_queue_msg(chip_id, msg, NULL);
+ if (rc)
+ goto free_msg;
+
+ sbe = p9_sbe_get_sbe(chip_id);
+ if (!sbe) {
+ rc = OPAL_HARDWARE;
+ goto free_msg;
+ }
+
+ while (p9_sbe_msg_busy(msg)) {
+ cpu_relax();
+ p9_sbe_timeout_poll_one(sbe);
+ }
+
+ if (msg->state == sbe_msg_done)
+ rc = SBE_STATUS_PRI_SUCCESS;
+ else
+ rc = SBE_STATUS_PRI_GENERIC_ERR;
+
+ if (msg->response && msg->resp)
+ rc = p9_sbe_get_primary_rc(msg->resp);
+
+free_msg:
+ if (autofree)
+ p9_sbe_freemsg(msg);
+
+ return rc;
+}
+
+/* Remove SBE message from queue. It will not remove inflight message */
+int p9_sbe_cancelmsg(u32 chip_id, struct p9_sbe_msg *msg)
+{
+ struct p9_sbe *sbe;
+
+ sbe = p9_sbe_get_sbe(chip_id);
+ if (!sbe)
+ return OPAL_PARAMETER;
+
+ lock(&sbe->lock);
+ if (msg->state != sbe_msg_queued) {
+ unlock(&sbe->lock);
+ return OPAL_BUSY;
+ }
+
+ list_del(&msg->link);
+ msg->state = sbe_msg_done;
+ unlock(&sbe->lock);
+ return OPAL_SUCCESS;
+}
+
+static void p9_sbe_handle_response(u32 chip_id, struct p9_sbe_msg *msg)
+{
+ u16 send_seq, resp_seq;
+ int rc;
+
+ if (msg == NULL || msg->resp == NULL)
+ return;
+
+ memset(msg->resp, 0, sizeof(struct p9_sbe_msg));
+
+ rc = p9_sbe_msg_receive(chip_id, msg->resp);
+ if (rc != OPAL_SUCCESS) {
+ prlog(PR_ERR, "Failed to read response message "
+ "[chip id = %x]\n", chip_id);
+ p9_sbe_set_primary_rc(msg->resp, SBE_STATUS_PRI_GENERIC_ERR);
+ return;
+ }
+
+ /* Validate sequence number */
+ send_seq = (msg->reg[0] >> 16) & 0xffff;
+ resp_seq = (msg->resp->reg[0] >> 16) & 0xffff;
+ if (send_seq != resp_seq) {
+ /*
+ * XXX Handle SBE R/R.
+		 * Let's send a sequence error to the caller until SBE reset works.
+ */
+ prlog(PR_ERR, "Invalid sequence id [chip id = %x]\n", chip_id);
+ p9_sbe_set_primary_rc(msg->resp, SBE_STATUS_PRI_SEQ_ERR);
+ return;
+ }
+}
+
+static int p9_sbe_clear_interrupt(struct p9_sbe *sbe, u64 bits)
+{
+ int rc;
+ u64 val;
+
+ /* Clear doorbell register */
+ val = SBE_HOST_RESPONSE_MASK & ~bits;
+ rc = xscom_write(sbe->chip_id, PSU_HOST_DOORBELL_REG_AND, val);
+ if (rc) {
+ prlog(PR_ERR, "Failed to clear SBE to Host doorbell "
+ "interrupt [chip id = %x]\n", sbe->chip_id);
+ }
+ return rc;
+}
+
+/* WARNING: This will drop sbe->lock */
+static void p9_sbe_timer_response(struct p9_sbe *sbe)
+{
+ if (sbe->chip_id != sbe_default_chip_id)
+ return;
+
+ sbe_timer_in_progress = false;
+ /* Drop lock and call timers */
+ unlock(&sbe->lock);
+
+ lock(&sbe_timer_lock);
+ /*
+	 * Once we get the timer expiry interrupt (even if it is a spurious
+	 * interrupt) we can schedule the next timer request.
+ */
+ timer_update_cnt = 0;
+ unlock(&sbe_timer_lock);
+
+ check_timers(true);
+ lock(&sbe->lock);
+}
+
+/* WARNING: This will drop sbe->lock */
+static void __p9_sbe_interrupt(struct p9_sbe *sbe)
+{
+ bool has_response;
+ int rc;
+ u64 data = 0, val;
+ struct p9_sbe_msg *msg = NULL;
+
+again:
+ /* Read doorbell register */
+ rc = xscom_read(sbe->chip_id, PSU_HOST_DOORBELL_REG_RW, &data);
+ if (rc) {
+ prlog(PR_ERR, "Failed to read SBE to Host doorbell register "
+ "[chip id = %x]\n", sbe->chip_id);
+ p9_sbe_reg_dump(sbe->chip_id);
+ return;
+ }
+
+ /* Completed processing all the bits */
+ if (!data)
+ return;
+
+ /* SBE came back from reset */
+ if (data & SBE_HOST_RESET) {
+ /* Clear all bits and restart sending message */
+ rc = p9_sbe_clear_interrupt(sbe, data);
+ if (rc)
+ return;
+
+ prlog(PR_NOTICE,
+ "Back from reset [chip id = %x]\n", sbe->chip_id);
+ /* Reset SBE MBOX state */
+ sbe->state = sbe_mbox_idle;
+
+ /* Reset message state */
+ if (!list_empty(&sbe->msg_list)) {
+ msg = list_top(&sbe->msg_list, struct p9_sbe_msg, link);
+ msg->state = sbe_msg_queued;
+ }
+ return;
+ }
+
+ /* Process ACK message before response */
+ if (data & SBE_HOST_MSG_READ) {
+ rc = p9_sbe_clear_interrupt(sbe, SBE_HOST_MSG_READ);
+ if (rc)
+ return;
+ p9_sbe_send_complete(sbe);
+ goto again;
+ }
+
+ /* Read SBE response before clearing doorbell register */
+ if (data & SBE_HOST_RESPONSE_WAITING) {
+ if (!list_empty(&sbe->msg_list)) {
+ msg = list_top(&sbe->msg_list, struct p9_sbe_msg, link);
+ p9_sbe_handle_response(sbe->chip_id, msg);
+ has_response = true;
+ } else {
+ has_response = false;
+ prlog(PR_DEBUG,
+ "Got response with no pending message\n");
+ }
+
+ rc = p9_sbe_clear_interrupt(sbe, SBE_HOST_RESPONSE_WAITING);
+ if (rc)
+ return;
+
+ /* Reset SBE MBOX state */
+ sbe->state = sbe_mbox_idle;
+ if (has_response)
+ p9_sbe_msg_complete(sbe, msg, sbe_msg_done);
+
+ goto again;
+ }
+
+ /* SBE passthrough command, call prd handler */
+ if (data & SBE_HOST_PASSTHROUGH) {
+ rc = p9_sbe_clear_interrupt(sbe, SBE_HOST_PASSTHROUGH);
+ if (rc)
+ return;
+ prd_sbe_passthrough(sbe->chip_id);
+ goto again;
+ }
+
+ /* Timer expired */
+ if (data & SBE_HOST_TIMER_EXPIRY) {
+ rc = p9_sbe_clear_interrupt(sbe, SBE_HOST_TIMER_EXPIRY);
+ if (rc)
+ return;
+ p9_sbe_timer_response(sbe);
+ goto again;
+ }
+
+ /* Unhandled bits */
+ val = data & ~(SBE_HOST_RESPONSE_MASK);
+ if (val) {
+ prlog(PR_ERR, "Unhandled interrupt bit [chip id = %x] : "
+ " %016llx\n", sbe->chip_id, val);
+ rc = p9_sbe_clear_interrupt(sbe, data);
+ if (rc)
+ return;
+ goto again;
+ }
+}
+
+void p9_sbe_interrupt(uint32_t chip_id)
+{
+ struct proc_chip *chip;
+ struct p9_sbe *sbe;
+
+ chip = get_chip(chip_id);
+ if (chip == NULL || chip->sbe == NULL)
+ return;
+
+ sbe = chip->sbe;
+ lock(&sbe->lock);
+ __p9_sbe_interrupt(sbe);
+ p9_sbe_process_queue(sbe);
+ unlock(&sbe->lock);
+}
+
+/*
+ * Check if the timer is working: if at least 10ms have elapsed since the
+ * last scheduled timer expiry, assume the timer is stuck.
+ */
+static void p9_sbe_timer_poll(struct p9_sbe *sbe)
+{
+ if (sbe->chip_id != sbe_default_chip_id)
+ return;
+
+ if (!sbe_has_timer || !sbe_timer_in_progress)
+ return;
+
+ if (tb_compare(mftb(), sbe_last_gen_stamp + msecs_to_tb(10))
+ != TB_AAFTERB)
+ return;
+
+ prlog(PR_ERR, "Timer stuck, falling back to OPAL pollers.\n");
+ prlog(PR_ERR, "You will likely have slower I2C and may have "
+ "experienced increased jitter.\n");
+ p9_sbe_reg_dump(sbe->chip_id);
+ sbe_has_timer = false;
+ sbe_timer_in_progress = false;
+}
+
+static void p9_sbe_timeout_poll_one(struct p9_sbe *sbe)
+{
+ struct p9_sbe_msg *msg;
+
+ if (sbe->chip_id == sbe_default_chip_id) {
+ if (list_empty_nocheck(&sbe->msg_list) &&
+ !sbe_timer_in_progress)
+ return;
+ } else {
+ if (list_empty_nocheck(&sbe->msg_list))
+ return;
+ }
+
+ lock(&sbe->lock);
+
+ /*
+ * In some cases there will be a delay in calling OPAL interrupt
+	 * handler routine (opal_handle_interrupt). In such cases it's
+	 * possible that the SBE has responded, but OPAL didn't act on it.
+ * Hence check for SBE response.
+ */
+ __p9_sbe_interrupt(sbe);
+ p9_sbe_timer_poll(sbe);
+
+ if (list_empty(&sbe->msg_list))
+ goto out;
+
+ /*
+	 * For some reason OPAL didn't send the message to the SBE.
+	 * Let's try to send it again.
+ */
+ if (!p9_sbe_mbox_busy(sbe)) {
+ p9_sbe_process_queue(sbe);
+ goto out;
+ }
+
+ msg = list_top(&sbe->msg_list, struct p9_sbe_msg, link);
+ if (tb_compare(mftb(), msg->timeout) != TB_AAFTERB)
+ goto out;
+
+ /* Message timeout */
+ prlog(PR_ERR, "Message timeout [chip id = %x], cmd = %llx, "
+ "subcmd = %llx\n", sbe->chip_id,
+ (msg->reg[0] >> 8) & 0xff, msg->reg[0] & 0xff);
+ p9_sbe_reg_dump(sbe->chip_id);
+ if (msg->resp) {
+ p9_sbe_set_primary_rc(msg->resp,
+ SBE_STATUS_PRI_GENERIC_ERR);
+ }
+
+ /* XXX Handle SBE R/R. Reset SBE state until SBE R/R works. */
+ sbe->state = sbe_mbox_idle;
+ p9_sbe_msg_complete(sbe, msg, sbe_msg_timeout);
+ p9_sbe_process_queue(sbe);
+
+out:
+ unlock(&sbe->lock);
+}
+
+static void p9_sbe_timeout_poll(void *user_data __unused)
+{
+ struct p9_sbe *sbe;
+ struct proc_chip *chip;
+
+ for_each_chip(chip) {
+ if (chip->sbe == NULL)
+ continue;
+ sbe = chip->sbe;
+ p9_sbe_timeout_poll_one(sbe);
+ }
+}
+
+static void p9_sbe_timer_resp(struct p9_sbe_msg *msg)
+{
+ if (msg->state != sbe_msg_done) {
+ prlog(PR_DEBUG, "Failed to schedule timer [chip id %x]\n",
+ sbe_default_chip_id);
+ } else {
+ /* Update last scheduled timer value */
+ sbe_last_gen_stamp = mftb() +
+ usecs_to_tb(timer_ctrl_msg->reg[1]);
+ sbe_timer_in_progress = true;
+ }
+
+ if (!has_new_target)
+ return;
+
+ lock(&sbe_timer_lock);
+ if (has_new_target) {
+ if (!p9_sbe_msg_busy(timer_ctrl_msg)) {
+ has_new_target = false;
+ p9_sbe_timer_schedule();
+ }
+ }
+ unlock(&sbe_timer_lock);
+}
+
+static void p9_sbe_timer_schedule(void)
+{
+ int rc;
+ u32 tick_us = SBE_TIMER_DEFAULT_US;
+ u64 tb_cnt, now = mftb();
+
+ if (sbe_timer_in_progress) {
+ if (sbe_timer_target >= sbe_last_gen_stamp)
+ return;
+
+ if (now >= sbe_last_gen_stamp)
+ return;
+
+ /* Remaining time of inflight timer <= sbe_timer_def_tb */
+ if ((sbe_last_gen_stamp - now) <= sbe_timer_def_tb)
+ return;
+ }
+
+ /* Stop sending timer update chipop until inflight timer expires */
+ if (timer_update_cnt > SBE_TIMER_UPDATE_MAX)
+ return;
+ timer_update_cnt++;
+
+ if (now < sbe_timer_target) {
+ /* Calculate how many microseconds from now, rounded up */
+ if ((sbe_timer_target - now) > sbe_timer_def_tb) {
+ tb_cnt = sbe_timer_target - now + usecs_to_tb(1) - 1;
+ tick_us = tb_to_usecs(tb_cnt);
+ }
+ }
+
+	/* Clear the sequence number; p9_sbe_queue_msg() will add a new sequence ID */
+ timer_ctrl_msg->reg[0] &= ~(PPC_BITMASK(32, 47));
+ /* Update timeout value */
+ timer_ctrl_msg->reg[1] = tick_us;
+ rc = p9_sbe_queue_msg(sbe_default_chip_id, timer_ctrl_msg,
+ p9_sbe_timer_resp);
+ if (rc != OPAL_SUCCESS) {
+ prlog(PR_ERR, "Failed to start timer [chip id = %x]\n",
+ sbe_default_chip_id);
+ return;
+ }
+}
+
+/*
+ * This is called with the timer lock held, so there is no
+ * issue with re-entrancy or concurrency
+ */
+void p9_sbe_update_timer_expiry(uint64_t new_target)
+{
+ if (!sbe_has_timer || new_target == sbe_timer_target)
+ return;
+
+ lock(&sbe_timer_lock);
+ /* Timer message is in flight. Record new timer and schedule later */
+ if (p9_sbe_msg_busy(timer_ctrl_msg) || has_new_target) {
+ if (new_target < sbe_timer_target) {
+ sbe_timer_target = new_target;
+ has_new_target = true;
+ }
+ } else {
+ sbe_timer_target = new_target;
+ p9_sbe_timer_schedule();
+ }
+ unlock(&sbe_timer_lock);
+}
+
+/* Initialize SBE timer */
+static void p9_sbe_timer_init(void)
+{
+ timer_ctrl_msg = p9_sbe_mkmsg(SBE_CMD_CONTROL_TIMER,
+ CONTROL_TIMER_START, 0, 0, 0);
+ assert(timer_ctrl_msg);
+ init_lock(&sbe_timer_lock);
+ sbe_has_timer = true;
+ sbe_timer_target = mftb();
+ sbe_last_gen_stamp = ~0ull;
+ sbe_timer_def_tb = usecs_to_tb(SBE_TIMER_DEFAULT_US);
+ prlog(PR_INFO, "Timer facility on chip %x\n", sbe_default_chip_id);
+}
+
+bool p9_sbe_timer_ok(void)
+{
+ return sbe_has_timer;
+}
+
+static void p9_sbe_stash_chipop_resp(struct p9_sbe_msg *msg)
+{
+ int rc = p9_sbe_get_primary_rc(msg->resp);
+ struct p9_sbe *sbe = (void *)msg->user_data;
+
+ if (rc == SBE_STATUS_PRI_SUCCESS) {
+ prlog(PR_DEBUG, "Sent stash MPIPL config [chip id =0x%x]\n",
+ sbe->chip_id);
+ } else {
+ prlog(PR_ERR, "Failed to send stash MPIPL config "
+ "[chip id = 0x%x, rc = %d]\n", sbe->chip_id, rc);
+ }
+
+ p9_sbe_freemsg(msg);
+}
+
+static void p9_sbe_send_relocated_base_single(struct p9_sbe *sbe, u64 reloc_base)
+{
+ u8 key = SBE_STASH_KEY_SKIBOOT_BASE;
+ u16 cmd = SBE_CMD_STASH_MPIPL_CONFIG;
+ u16 flag = SBE_CMD_CTRL_RESP_REQ;
+ struct p9_sbe_msg *msg;
+
+ msg = p9_sbe_mkmsg(cmd, flag, key, reloc_base, 0);
+ if (!msg) {
+ prlog(PR_ERR, "Message allocation failed\n");
+ return;
+ }
+
+ msg->user_data = (void *)sbe;
+ if (p9_sbe_queue_msg(sbe->chip_id, msg, p9_sbe_stash_chipop_resp)) {
+ prlog(PR_ERR, "Failed to queue stash MPIPL config message\n");
+ }
+}
+
+/* Send relocated skiboot base address to all SBE */
+void p9_sbe_send_relocated_base(uint64_t reloc_base)
+{
+ struct proc_chip *chip;
+
+ for_each_chip(chip) {
+ if (chip->sbe == NULL)
+ continue;
+
+ p9_sbe_send_relocated_base_single(chip->sbe, reloc_base);
+ }
+}
+
+void p9_sbe_init(void)
+{
+ struct dt_node *xn;
+ struct proc_chip *chip;
+ struct p9_sbe *sbe;
+
+ if (proc_gen < proc_gen_p9)
+ return;
+
+ dt_for_each_compatible(dt_root, xn, "ibm,xscom") {
+ sbe = zalloc(sizeof(struct p9_sbe));
+ assert(sbe);
+ sbe->chip_id = dt_get_chip_id(xn);
+ sbe->cur_seq = 1;
+ sbe->state = sbe_mbox_idle;
+ list_head_init(&sbe->msg_list);
+ init_lock(&sbe->lock);
+
+ chip = get_chip(sbe->chip_id);
+ assert(chip);
+ chip->sbe = sbe;
+
+ if (dt_has_node_property(xn, "primary", NULL)) {
+ sbe_default_chip_id = sbe->chip_id;
+ prlog(PR_DEBUG, "Master chip id : %x\n", sbe->chip_id);
+ }
+ }
+
+ if (sbe_default_chip_id == -1) {
+ prlog(PR_ERR, "Master chip ID not found.\n");
+ return;
+ }
+
+ /* Initiate SBE timer */
+ p9_sbe_timer_init();
+
+ /* Initiate SBE timeout poller */
+ opal_add_poller(p9_sbe_timeout_poll, NULL);
+}
+
+/* Terminate and initiate MPIPL */
+void p9_sbe_terminate(void)
+{
+ uint32_t primary_chip = -1;
+ int rc;
+ u64 wait_tb;
+ struct proc_chip *chip;
+
+ /* Return if MPIPL is not supported */
+ if (!is_mpipl_enabled())
+ return;
+
+ /* Save crashing CPU details */
+ opal_mpipl_save_crashing_pir();
+
+ /* Unregister flash. It will request BMC MBOX reset */
+ if (!flash_unregister()) {
+ prlog(PR_DEBUG, "Failed to reset BMC MBOX\n");
+ return;
+ }
+
+ /*
+ * Send S0 interrupt to all SBE. Sequence:
+ * - S0 interrupt on secondary chip SBE
+ * - S0 interrupt on Primary chip SBE
+ */
+ for_each_chip(chip) {
+ if (dt_has_node_property(chip->devnode, "primary", NULL)) {
+ primary_chip = chip->id;
+ continue;
+ }
+
+ rc = xscom_write(chip->id,
+ SBE_CONTROL_REG_RW, SBE_CONTROL_REG_S0);
+ /* Initiate normal reboot */
+ if (rc) {
+ prlog(PR_ERR, "Failed to write S0 interrupt [chip id = %x]\n",
+ chip->id);
+ return;
+ }
+ }
+
+ /* Initiate normal reboot */
+ if (primary_chip == -1) {
+ prlog(PR_ERR, "Primary chip ID not found.\n");
+ return;
+ }
+
+ rc = xscom_write(primary_chip,
+ SBE_CONTROL_REG_RW, SBE_CONTROL_REG_S0);
+ if (rc) {
+ prlog(PR_ERR, "Failed to write S0 interrupt [chip id = %x]\n",
+ primary_chip);
+ return;
+ }
+
+ /* XXX We expect SBE to act on interrupt, quiesce the system and start
+ * MPIPL flow. Currently we do not have a way to detect SBE state.
+ * Hence wait for max time SBE takes to respond and then trigger
+ * normal reboot.
+ */
+ prlog(PR_NOTICE, "Initiated MPIPL, waiting for SBE to respond...\n");
+ wait_tb = mftb() + msecs_to_tb(SBE_CMD_TIMEOUT_MAX);
+ while (mftb() < wait_tb) {
+ cpu_relax();
+ }
+
+ prlog(PR_ERR, "SBE did not respond within timeout period (%d secs).\n",
+ SBE_CMD_TIMEOUT_MAX / 1000);
+ prlog(PR_ERR, "Falling back to normal reboot\n");
+}
diff --git a/roms/skiboot/hw/sfc-ctrl.c b/roms/skiboot/hw/sfc-ctrl.c
new file mode 100644
index 000000000..34b5b8e20
--- /dev/null
+++ b/roms/skiboot/hw/sfc-ctrl.c
@@ -0,0 +1,510 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/* Copyright 2013-2014 IBM Corp. */
+
+#include <stdint.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <stdio.h>
+#include <string.h>
+#include <time.h>
+#include <lpc.h>
+#include <sfc-ctrl.h>
+
+#include <libflash/libflash.h>
+#include <libflash/libflash-priv.h>
+
+/* Offset of SFC registers in FW space */
+#define SFC_CMDREG_OFFSET 0x00000c00
+/* Offset of SFC command buffer in FW space */
+#define SFC_CMDBUF_OFFSET 0x00000d00
+/* Offset of flash MMIO mapping in FW space */
+#define SFC_MMIO_OFFSET 0x0c000000
+
+
+/*
+ * Register definitions
+ */
+#define SFC_REG_CONF 0x10 /* CONF: Direct Access Configuration */
+#define SFC_REG_CONF_FRZE (1 << 3)
+#define SFC_REG_CONF_ECCEN (1 << 2)
+#define SFC_REG_CONF_DRCD (1 << 1)
+#define SFC_REG_CONF_FLRLD (1 << 0)
+
+#define SFC_REG_STATUS 0x0C /* STATUS : Status Reg */
+#define SFC_REG_STATUS_NX_ON_SHFT 28
+#define SFC_REG_STATUS_RWP (1 << 27)
+#define SFC_REG_STATUS_FOURBYTEAD (1 << 26)
+#define SFC_REG_STATUS_ILLEGAL (1 << 4)
+#define SFC_REG_STATUS_ECCERRCNTN (1 << 3)
+#define SFC_REG_STATUS_ECCUEN (1 << 2)
+#define SFC_REG_STATUS_DONE (1 << 0)
+
+#define SFC_REG_CMD 0x40 /* CMD : Command */
+#define SFC_REG_CMD_OPCODE_SHFT 9
+#define SFC_REG_CMD_LENGTH_SHFT 0
+
+#define SFC_REG_SPICLK 0x3C /* SPICLK: SPI clock rate config */
+#define SFC_REG_SPICLK_OUTDLY_SHFT 24
+#define SFC_REG_SPICLK_INSAMPDLY_SHFT 16
+#define SFC_REG_SPICLK_CLKHI_SHFT 8
+#define SFC_REG_SPICLK_CLKLO_SHFT 0
+
+#define SFC_REG_ADR 0x44 /* ADR : Address */
+#define SFC_REG_ERASMS 0x48 /* ERASMS : Small Erase Block Size */
+#define SFC_REG_ERASLGS 0x4C /* ERALGS : Large Erase Block Size */
+#define SFC_REG_CONF4 0x54 /* CONF4 : SPI Op Code for Small Erase */
+#define SFC_REG_CONF5 0x58 /* CONF5 : Small Erase Size config reg */
+
+#define SFC_REG_CONF8 0x64 /* CONF8 : Read Command */
+#define SFC_REG_CONF8_CSINACTIVERD_SHFT 18
+#define SFC_REG_CONF8_DUMMY_SHFT 8
+#define SFC_REG_CONF8_READOP_SHFT 0
+
+#define SFC_REG_ADRCBF 0x80 /* ADRCBF : First Intf NOR Addr Offset */
+#define SFC_REG_ADRCMF 0x84 /* ADRCMF : First Intf NOR Allocation */
+#define SFC_REG_ADRCBS 0x88 /* ADRCBS : Second Intf NOR Addr Offset */
+#define SFC_REG_ADRCMS 0x8C /* ADRCMS : Second Intf NOR Allocation */
+#define SFC_REG_OADRNB	0x90	/* OADRNB : Direct Access OPB Window Base Address */
+#define SFC_REG_OADRNS	0x94	/* OADRNS : Direct Access OPB Window Size */
+
+#define SFC_REG_CHIPIDCONF 0x9C /* CHIPIDCONF : config ChipId CMD */
+#define SFC_REG_CHIPIDCONF_OPCODE_SHFT 24
+#define SFC_REG_CHIPIDCONF_READ (1 << 23)
+#define SFC_REG_CHIPIDCONF_WRITE (1 << 22)
+#define SFC_REG_CHIPIDCONF_USE_ADDR (1 << 21)
+#define SFC_REG_CHIPIDCONF_DUMMY_SHFT 16
+#define SFC_REG_CHIPIDCONF_LEN_SHFT 0
+
+/*
+ * SFC Opcodes
+ */
+#define SFC_OP_READRAW 0x03 /* Read Raw */
+#define SFC_OP_WRITERAW 0x02 /* Write Raw */
+#define SFC_OP_ERASM 0x32 /* Erase Small */
+#define SFC_OP_ERALG 0x34 /* Erase Large */
+#define SFC_OP_ENWRITPROT	0x53	/* Enable Write Protect */
+#define SFC_OP_CHIPID 0x1F /* Get Chip ID */
+#define SFC_OP_STATUS 0x05 /* Get Status */
+#define SFC_OP_TURNOFF 0x5E /* Turn Off */
+#define SFC_OP_TURNON 0x50 /* Turn On */
+#define SFC_OP_ABORT 0x6F /* Super-Abort */
+#define SFC_OP_START4BA 0x37 /* Start 4BA */
+#define SFC_OP_END4BA 0x69 /* End 4BA */
+
+/* Command buffer size */
+#define SFC_CMDBUF_SIZE 256
+
+struct sfc_ctrl {
+ /* Erase sizes */
+ uint32_t small_er_size;
+ uint32_t large_er_size;
+
+ /* Current 4b mode */
+ bool mode_4b;
+
+ /* Callbacks */
+ struct spi_flash_ctrl ops;
+};
+
+/* Command register support */
+static inline int sfc_reg_read(uint8_t reg, uint32_t *val)
+{
+ int rc;
+
+ *val = 0xffffffff;
+ rc = lpc_fw_read32(val, SFC_CMDREG_OFFSET + reg);
+ if (rc)
+ return rc;
+ return 0;
+}
+
+static inline int sfc_reg_write(uint8_t reg, uint32_t val)
+{
+ return lpc_fw_write32(val, SFC_CMDREG_OFFSET + reg);
+}
+
+static int sfc_buf_write(uint32_t len, const void *data)
+{
+ __be32 tmp;
+ uint32_t off = 0;
+ int rc;
+
+ if (len > SFC_CMDBUF_SIZE)
+ return FLASH_ERR_PARM_ERROR;
+
+ while (len >= 4) {
+ tmp = cpu_to_be32(*(const uint32_t *)data);
+ rc = lpc_fw_write32((u32)tmp, SFC_CMDBUF_OFFSET + off);
+ if (rc)
+ return rc;
+ off += 4;
+ len -= 4;
+ data += 4;
+ }
+ if (!len)
+ return 0;
+
+ /* lpc_fw_write operates on BE values so that's what we layout
+ * in memory with memcpy. The swap in the register on LE doesn't
+ * matter, the result in memory will be in the right order.
+ */
+ tmp = cpu_to_be32(-1);
+ memcpy(&tmp, data, len); /* XXX: is this right? */
+ return lpc_fw_write32((u32)tmp, SFC_CMDBUF_OFFSET + off);
+}
+
+static int sfc_buf_read(uint32_t len, void *data)
+{
+ uint32_t tmp, off = 0;
+ int rc;
+
+ if (len > SFC_CMDBUF_SIZE)
+ return FLASH_ERR_PARM_ERROR;
+
+ while (len >= 4) {
+ rc = lpc_fw_read32(data, SFC_CMDBUF_OFFSET + off);
+ if (rc)
+ return rc;
+ off += 4;
+ len -= 4;
+ data += 4;
+ }
+ if (!len)
+ return 0;
+
+ rc = lpc_fw_read32(&tmp, SFC_CMDBUF_OFFSET + off);
+ if (rc)
+ return rc;
+ /* We know tmp contains a big endian value, so memcpy is
+ * our friend here
+ */
+ memcpy(data, &tmp, len);
+ return 0;
+}
+
+/* Polls until SFC indicates command is complete */
+static int sfc_poll_complete(void)
+{
+ uint32_t status, timeout;
+ struct timespec ts;
+
+ /*
+	 * A full 256-byte read/write command will take at least
+	 * 126us. Smaller commands are faster but we use fewer of
+	 * them, so let's sleep in increments of 100us.
+ */
+ ts.tv_sec = 0;
+ ts.tv_nsec = 100000;
+
+ /*
+ * Use a 1s timeout which should be sufficient for the
+ * commands we use
+ */
+ timeout = 10000;
+
+ do {
+ int rc;
+
+ rc = sfc_reg_read(SFC_REG_STATUS, &status);
+ if (rc)
+ return rc;
+ if (status & SFC_REG_STATUS_DONE)
+ break;
+ if (--timeout == 0)
+ return FLASH_ERR_CTRL_TIMEOUT;
+ nanosleep(&ts, NULL);
+ } while (true);
+
+ return 0;
+}
+
+static int sfc_exec_command(uint8_t opcode, uint32_t length)
+{
+ int rc = 0;
+ uint32_t cmd_reg = 0;
+
+ if (opcode > 0x7f || length > 0x1ff)
+ return FLASH_ERR_PARM_ERROR;
+
+ /* Write command register to start execution */
+ cmd_reg |= (opcode << SFC_REG_CMD_OPCODE_SHFT);
+ cmd_reg |= (length << SFC_REG_CMD_LENGTH_SHFT);
+ rc = sfc_reg_write(SFC_REG_CMD, cmd_reg);
+ if (rc)
+ return rc;
+
+ /* Wait for command to complete */
+ return sfc_poll_complete();
+}
+
+static int sfc_chip_id(struct spi_flash_ctrl *ctrl, uint8_t *id_buf,
+ uint32_t *id_size)
+{
+ uint32_t idconf;
+ int rc;
+
+ (void)ctrl;
+
+ if ((*id_size) < 3)
+ return FLASH_ERR_PARM_ERROR;
+
+ /*
+ * XXX This will not work in locked down mode but we assume that
+ * in this case, the chip ID command is already properly programmed
+ * and the SFC will ignore this. However I haven't verified...
+ */
+ idconf = ((uint64_t)CMD_RDID) << SFC_REG_CHIPIDCONF_OPCODE_SHFT;
+ idconf |= SFC_REG_CHIPIDCONF_READ;
+ idconf |= (3ul << SFC_REG_CHIPIDCONF_LEN_SHFT);
+ (void)sfc_reg_write(SFC_REG_CHIPIDCONF, idconf);
+
+ /* Perform command */
+ rc = sfc_exec_command(SFC_OP_CHIPID, 0);
+ if (rc)
+ return rc;
+
+ /* Read chip ID */
+ rc = sfc_buf_read(3, id_buf);
+ if (rc)
+ return rc;
+ *id_size = 3;
+
+ return 0;
+}
+
+
+static int sfc_read(struct spi_flash_ctrl *ctrl, uint32_t pos,
+ void *buf, uint32_t len)
+{
+ (void)ctrl;
+
+ while(len) {
+ uint32_t chunk = len;
+ int rc;
+
+ if (chunk > SFC_CMDBUF_SIZE)
+ chunk = SFC_CMDBUF_SIZE;
+ rc = sfc_reg_write(SFC_REG_ADR, pos);
+ if (rc)
+ return rc;
+ rc = sfc_exec_command(SFC_OP_READRAW, chunk);
+ if (rc)
+ return rc;
+ rc = sfc_buf_read(chunk, buf);
+ if (rc)
+ return rc;
+ len -= chunk;
+ pos += chunk;
+ buf += chunk;
+ }
+ return 0;
+}
+
+static int sfc_write(struct spi_flash_ctrl *ctrl, uint32_t addr,
+ const void *buf, uint32_t size)
+{
+ uint32_t chunk;
+ int rc;
+
+ (void)ctrl;
+
+ while(size) {
+ /* We shall not cross a page boundary */
+ chunk = 0x100 - (addr & 0xff);
+ if (chunk > size)
+ chunk = size;
+
+ /* Write to SFC write buffer */
+ rc = sfc_buf_write(chunk, buf);
+ if (rc)
+ return rc;
+
+ /* Program address */
+ rc = sfc_reg_write(SFC_REG_ADR, addr);
+ if (rc)
+ return rc;
+
+ /* Send command */
+ rc = sfc_exec_command(SFC_OP_WRITERAW, chunk);
+ if (rc)
+ return rc;
+
+ addr += chunk;
+ buf += chunk;
+ size -= chunk;
+ }
+ return 0;
+}
+
+static int sfc_erase(struct spi_flash_ctrl *ctrl, uint32_t addr,
+ uint32_t size)
+{
+ struct sfc_ctrl *ct = container_of(ctrl, struct sfc_ctrl, ops);
+ uint32_t sm_mask = ct->small_er_size - 1;
+ uint32_t lg_mask = ct->large_er_size - 1;
+ uint32_t chunk;
+ uint8_t cmd;
+ int rc;
+
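+	/*
+	 * Illustrative example (sizes assumed): with a 4KB small and a 64KB
+	 * large erase size, erasing 0x10000 bytes at 0x20000 is done with a
+	 * single large erase, erasing 0x2000 bytes at 0x21000 falls back to
+	 * two small erases, and anything not aligned to the small erase size
+	 * is rejected with FLASH_ERR_ERASE_BOUNDARY.
+	 */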
+ while(size) {
+ /* Choose erase size for this chunk */
+ if (((addr | size) & lg_mask) == 0) {
+ chunk = ct->large_er_size;
+ cmd = SFC_OP_ERALG;
+ } else if (((addr | size) & sm_mask) == 0) {
+ chunk = ct->small_er_size;
+ cmd = SFC_OP_ERASM;
+ } else
+ return FLASH_ERR_ERASE_BOUNDARY;
+
+ rc = sfc_reg_write(SFC_REG_ADR, addr);
+ if (rc)
+ return rc;
+ rc = sfc_exec_command(cmd, 0);
+ if (rc)
+ return rc;
+ addr += chunk;
+ size -= chunk;
+ }
+ return 0;
+}
+
+static int sfc_setup(struct spi_flash_ctrl *ctrl, uint32_t *tsize)
+{
+ struct sfc_ctrl *ct = container_of(ctrl, struct sfc_ctrl, ops);
+ struct flash_info *info = ctrl->finfo;
+ uint32_t er_flags;
+
+ (void)tsize;
+
+ /* Keep non-erase related flags */
+ er_flags = ~FL_ERASE_ALL;
+
+ /* Add supported erase sizes */
+ if (ct->small_er_size == 0x1000 || ct->large_er_size == 0x1000)
+ er_flags |= FL_ERASE_4K;
+ if (ct->small_er_size == 0x8000 || ct->large_er_size == 0x8000)
+ er_flags |= FL_ERASE_32K;
+ if (ct->small_er_size == 0x10000 || ct->large_er_size == 0x10000)
+ er_flags |= FL_ERASE_64K;
+
+ /* Mask the flags out */
+ info->flags &= er_flags;
+
+ return 0;
+}
+
+static int sfc_set_4b(struct spi_flash_ctrl *ctrl, bool enable)
+{
+ struct sfc_ctrl *ct = container_of(ctrl, struct sfc_ctrl, ops);
+ int rc;
+
+ rc = sfc_exec_command(enable ? SFC_OP_START4BA : SFC_OP_END4BA, 0);
+ if (rc)
+ return rc;
+ ct->mode_4b = enable;
+ return 0;
+}
+
+static void sfc_validate_er_size(uint32_t *size)
+{
+ if (*size == 0)
+ return;
+
+ /* We only support 4k, 32k and 64k */
+ if (*size != 0x1000 && *size != 0x8000 && *size != 0x10000) {
+ FL_ERR("SFC: Erase size %d bytes unsupported\n", *size);
+ *size = 0;
+ }
+}
+
+static int sfc_init(struct sfc_ctrl *ct)
+{
+ int rc;
+ uint32_t status;
+
+ /*
+ * Assumptions: The controller has been fully initialized
+ * by an earlier FW layer setting the chip ID command, the
+ * erase sizes, and configuring the timings for reads and
+ * writes.
+ *
+ * This driver is meant to be usable if the configuration
+ * is in lock down.
+ *
+ * If that wasn't the case, we could configure some sane
+ * defaults here and tuned values in setup() after the
+ * chip has been identified.
+ */
+
+ /* Read erase sizes from flash */
+ rc = sfc_reg_read(SFC_REG_ERASMS, &ct->small_er_size);
+ if (rc)
+ return rc;
+ sfc_validate_er_size(&ct->small_er_size);
+ rc = sfc_reg_read(SFC_REG_ERASLGS, &ct->large_er_size);
+ if (rc)
+ return rc;
+ sfc_validate_er_size(&ct->large_er_size);
+
+ /* No erase sizes we can cope with ? Ouch... */
+ if ((ct->small_er_size == 0 && ct->large_er_size == 0) ||
+ (ct->large_er_size && (ct->small_er_size > ct->large_er_size))) {
+ FL_ERR("SFC: No supported erase sizes !\n");
+ return FLASH_ERR_CTRL_CONFIG_MISMATCH;
+ }
+
+	FL_INF("SFC: Supported erase sizes:");
+ if (ct->small_er_size)
+ FL_INF(" %dKB", ct->small_er_size >> 10);
+ if (ct->large_er_size)
+ FL_INF(" %dKB", ct->large_er_size >> 10);
+ FL_INF("\n");
+
+ /* Read current state of 4 byte addressing */
+ rc = sfc_reg_read(SFC_REG_STATUS, &status);
+ if (rc)
+ return rc;
+ ct->mode_4b = !!(status & SFC_REG_STATUS_FOURBYTEAD);
+
+ return 0;
+}
+
+int sfc_open(struct spi_flash_ctrl **ctrl)
+{
+ struct sfc_ctrl *ct;
+ int rc;
+
+ *ctrl = NULL;
+ ct = malloc(sizeof(*ct));
+ if (!ct) {
+ FL_ERR("SFC: Failed to allocate\n");
+ return FLASH_ERR_MALLOC_FAILED;
+ }
+ memset(ct, 0, sizeof(*ct));
+ ct->ops.chip_id = sfc_chip_id;
+ ct->ops.setup = sfc_setup;
+ ct->ops.set_4b = sfc_set_4b;
+ ct->ops.read = sfc_read;
+ ct->ops.write = sfc_write;
+ ct->ops.erase = sfc_erase;
+
+ rc = sfc_init(ct);
+ if (rc)
+ goto fail;
+ *ctrl = &ct->ops;
+ return 0;
+ fail:
+ free(ct);
+ return rc;
+}
+
+void sfc_close(struct spi_flash_ctrl *ctrl)
+{
+ struct sfc_ctrl *ct = container_of(ctrl, struct sfc_ctrl, ops);
+
+ /* Free the whole lot */
+ free(ct);
+}
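+
+/*
+ * Illustrative usage sketch (not taken from skiboot platform code): a
+ * platform would typically open the controller once and hand the returned
+ * ops structure to the flash layer, e.g.
+ *
+ *	struct spi_flash_ctrl *ctrl;
+ *
+ *	if (sfc_open(&ctrl) == 0) {
+ *		... register ctrl with the flash/libflash layer ...
+ *		sfc_close(ctrl);
+ *	}
+ */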
+
diff --git a/roms/skiboot/hw/slw.c b/roms/skiboot/hw/slw.c
new file mode 100644
index 000000000..56ba05b0a
--- /dev/null
+++ b/roms/skiboot/hw/slw.c
@@ -0,0 +1,1731 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Everything to do with deep power saving (stop) states:
+ * SLeep/Winkle, handling the ChipTOD chip & configuring core timebases
+ *
+ * Copyright 2013-2019 IBM Corp.
+ */
+
+#include <skiboot.h>
+#include <xscom.h>
+#include <xscom-p8-regs.h>
+#include <xscom-p9-regs.h>
+#include <xscom-p10-regs.h>
+#include <io.h>
+#include <cpu.h>
+#include <chip.h>
+#include <mem_region.h>
+#include <chiptod.h>
+#include <interrupts.h>
+#include <timebase.h>
+#include <errorlog.h>
+#include <libfdt/libfdt.h>
+#include <opal-api.h>
+#include <nvram.h>
+#include <sbe-p8.h>
+#include <xive.h>
+
+#include <p10_stop_api.H>
+#include <p8_pore_table_gen_api.H>
+#include <sbe_xip_image.h>
+
+static uint32_t slw_saved_reset[0x100];
+
+static bool slw_current_le = false;
+
+enum wakeup_engine_states wakeup_engine_state = WAKEUP_ENGINE_NOT_PRESENT;
+bool has_deep_states = false;
+
+DEFINE_LOG_ENTRY(OPAL_RC_SLW_INIT, OPAL_PLATFORM_ERR_EVT, OPAL_SLW,
+ OPAL_PLATFORM_FIRMWARE, OPAL_PREDICTIVE_ERR_GENERAL,
+ OPAL_NA);
+
+DEFINE_LOG_ENTRY(OPAL_RC_SLW_SET, OPAL_PLATFORM_ERR_EVT, OPAL_SLW,
+ OPAL_PLATFORM_FIRMWARE, OPAL_INFO,
+ OPAL_NA);
+
+DEFINE_LOG_ENTRY(OPAL_RC_SLW_GET, OPAL_PLATFORM_ERR_EVT, OPAL_SLW,
+ OPAL_PLATFORM_FIRMWARE, OPAL_INFO,
+ OPAL_NA);
+
+DEFINE_LOG_ENTRY(OPAL_RC_SLW_REG, OPAL_PLATFORM_ERR_EVT, OPAL_SLW,
+ OPAL_PLATFORM_FIRMWARE, OPAL_INFO,
+ OPAL_NA);
+
+static void slw_do_rvwinkle(void *data)
+{
+ struct cpu_thread *cpu = this_cpu();
+ struct cpu_thread *master = data;
+ uint64_t lpcr = mfspr(SPR_LPCR);
+ struct proc_chip *chip;
+
+ /* Setup our ICP to receive IPIs */
+ icp_prep_for_pm();
+
+ /* Setup LPCR to wakeup on external interrupts only */
+ mtspr(SPR_LPCR, ((lpcr & ~SPR_LPCR_P8_PECE) | SPR_LPCR_P8_PECE2));
+ isync();
+
+ prlog(PR_DEBUG, "SLW: CPU PIR 0x%04x going to rvwinkle...\n",
+ cpu->pir);
+
+ /* Tell that we got it */
+ cpu->state = cpu_state_rvwinkle;
+
+ enter_p8_pm_state(1);
+
+ /* Restore SPRs */
+ init_shared_sprs();
+ init_replicated_sprs();
+
+ /* Ok, it's ours again */
+ cpu->state = cpu_state_active;
+
+ prlog(PR_DEBUG, "SLW: CPU PIR 0x%04x woken up !\n", cpu->pir);
+
+ /* Cleanup our ICP */
+ reset_cpu_icp();
+
+ /* Resync timebase */
+ chiptod_wakeup_resync();
+
+ /* Restore LPCR */
+ mtspr(SPR_LPCR, lpcr);
+ isync();
+
+ /* If we are passed a master pointer we are the designated
+ * waker, let's proceed. If not, return, we are finished.
+ */
+ if (!master)
+ return;
+
+ prlog(PR_DEBUG, "SLW: CPU PIR 0x%04x waiting for master...\n",
+ cpu->pir);
+
+ /* Allriiiight... now wait for master to go down */
+ while(master->state != cpu_state_rvwinkle)
+ sync();
+
+ /* XXX Wait one second ! (should check xscom state ? ) */
+ time_wait_ms(1000);
+
+ for_each_chip(chip) {
+ struct cpu_thread *c;
+ uint64_t tmp;
+ for_each_available_core_in_chip(c, chip->id) {
+ xscom_read(chip->id,
+ XSCOM_ADDR_P8_EX_SLAVE(pir_to_core_id(c->pir),
+ EX_PM_IDLE_STATE_HISTORY_PHYP),
+ &tmp);
+ prlog(PR_TRACE, "SLW: core %x:%x"
+ " history: 0x%016llx (mid2)\n",
+ chip->id, pir_to_core_id(c->pir),
+ tmp);
+ }
+ }
+
+ prlog(PR_DEBUG, "SLW: Waking master (PIR 0x%04x)...\n", master->pir);
+
+ /* Now poke all the secondary threads on the master's core */
+ for_each_cpu(cpu) {
+ if (!cpu_is_sibling(cpu, master) || (cpu == master))
+ continue;
+ icp_kick_cpu(cpu);
+
+ /* Wait for it to claim to be back (XXX ADD TIMEOUT) */
+ while(cpu->state != cpu_state_active)
+ sync();
+ }
+
+ /* Now poke the master and be gone */
+ icp_kick_cpu(master);
+}
+
+static void slw_patch_reset(void)
+{
+ uint32_t *src, *dst, *sav;
+
+ src = &reset_patch_start;
+ dst = (uint32_t *)0x100;
+ sav = slw_saved_reset;
+ while(src < &reset_patch_end) {
+ *(sav++) = *(dst);
+ *(dst++) = *(src++);
+ }
+ sync_icache();
+}
+
+static void slw_unpatch_reset(void)
+{
+ extern uint32_t reset_patch_start;
+ extern uint32_t reset_patch_end;
+ uint32_t *src, *dst, *sav;
+
+ src = &reset_patch_start;
+ dst = (uint32_t *)0x100;
+ sav = slw_saved_reset;
+ while(src < &reset_patch_end) {
+ *(dst++) = *(sav++);
+ src++;
+ }
+ sync_icache();
+}
+
+static bool slw_general_init(struct proc_chip *chip, struct cpu_thread *c)
+{
+ uint32_t core = pir_to_core_id(c->pir);
+ uint64_t tmp;
+ int rc;
+
+ /* PowerManagement GP0 clear PM_DISABLE */
+ rc = xscom_read(chip->id, XSCOM_ADDR_P8_EX_SLAVE(core, EX_PM_GP0), &tmp);
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_SLW_INIT),
+ "SLW: Failed to read PM_GP0\n");
+ return false;
+ }
+ tmp = tmp & ~0x8000000000000000ULL;
+ rc = xscom_write(chip->id, XSCOM_ADDR_P8_EX_SLAVE(core, EX_PM_GP0), tmp);
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_SLW_INIT),
+ "SLW: Failed to write PM_GP0\n");
+ return false;
+ }
+ prlog(PR_TRACE, "SLW: PMGP0 set to 0x%016llx\n", tmp);
+
+ /* Read back for debug */
+ rc = xscom_read(chip->id, XSCOM_ADDR_P8_EX_SLAVE(core, EX_PM_GP0), &tmp);
+ if (rc)
+ log_simple_error(&e_info(OPAL_RC_SLW_INIT),
+ "SLW: Failed to re-read PM_GP0. Continuing...\n");
+
+ prlog(PR_TRACE, "SLW: PMGP0 read 0x%016llx\n", tmp);
+
+ return true;
+}
+
+static bool slw_set_overrides(struct proc_chip *chip, struct cpu_thread *c)
+{
+ uint32_t core = pir_to_core_id(c->pir);
+ int rc;
+
+ rc = xscom_write(chip->id,
+ XSCOM_ADDR_P8_EX_SLAVE(core, EX_PM_SPECIAL_WAKEUP_PHYP),
+ 0);
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_SLW_SET),
+ "SLW: Failed to write PM_SPECIAL_WAKEUP_PHYP\n");
+ return false;
+ }
+
+ return true;
+}
+
+static bool slw_set_overrides_p10(struct proc_chip *chip, struct cpu_thread *c)
+{
+ uint64_t tmp;
+ int rc;
+ uint32_t core = pir_to_core_id(c->pir);
+
+ /* Special wakeup bits that could hold power mgt */
+ rc = xscom_read(chip->id,
+ XSCOM_ADDR_P10_QME_CORE(core, P10_QME_SPWU_HYP),
+ &tmp);
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_SLW_SET),
+ "SLW: Failed to read P10_QME_SPWU_HYP\n");
+ return false;
+ }
+ if (tmp & P10_SPWU_REQ)
+ prlog(PR_WARNING,
+ "SLW: core %d P10_QME_SPWU_HYP requested 0x%016llx\n",
+ core, tmp);
+
+ return true;
+}
+
+
+static bool slw_set_overrides_p9(struct proc_chip *chip, struct cpu_thread *c)
+{
+ uint64_t tmp;
+ int rc;
+ uint32_t core = pir_to_core_id(c->pir);
+
+ /* Special wakeup bits that could hold power mgt */
+ rc = xscom_read(chip->id,
+ XSCOM_ADDR_P9_EC_SLAVE(core, EC_PPM_SPECIAL_WKUP_HYP),
+ &tmp);
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_SLW_SET),
+ "SLW: Failed to read EC_PPM_SPECIAL_WKUP_HYP\n");
+ return false;
+ }
+ if (tmp)
+ prlog(PR_WARNING,
+ "SLW: core %d EC_PPM_SPECIAL_WKUP_HYP read 0x%016llx\n",
+ core, tmp);
+ rc = xscom_read(chip->id,
+ XSCOM_ADDR_P9_EC_SLAVE(core, EC_PPM_SPECIAL_WKUP_OTR),
+ &tmp);
+ if (tmp)
+ prlog(PR_WARNING,
+ "SLW: core %d EC_PPM_SPECIAL_WKUP_OTR read 0x%016llx\n",
+ core, tmp);
+ return true;
+}
+
+static bool slw_unset_overrides(struct proc_chip *chip, struct cpu_thread *c)
+{
+ uint32_t core = pir_to_core_id(c->pir);
+
+ /* XXX FIXME: Save and restore the overrides */
+ prlog(PR_DEBUG, "SLW: slw_unset_overrides %x:%x\n", chip->id, core);
+ return true;
+}
+
+static bool slw_set_idle_mode(struct proc_chip *chip, struct cpu_thread *c)
+{
+ uint32_t core = pir_to_core_id(c->pir);
+ uint64_t tmp;
+ int rc;
+
+ /*
+ * PM GP1 allows fast/deep mode to be selected independently for sleep
+ * and winkle. Init PM GP1 so that sleep happens in fast mode and
+ * winkle happens in deep mode.
+ * Make use of the OR XSCOM for this since the OCC might be manipulating
+ * the PM_GP1 register as well. Before doing this ensure that the bits
+ * managing idle states are cleared so as to override any bits set at
+ * init time.
+ */
+
+ tmp = ~EX_PM_GP1_SLEEP_WINKLE_MASK;
+ rc = xscom_write(chip->id, XSCOM_ADDR_P8_EX_SLAVE(core, EX_PM_CLEAR_GP1),
+ tmp);
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_SLW_SET),
+ "SLW: Failed to write PM_GP1\n");
+ return false;
+ }
+
+ rc = xscom_write(chip->id, XSCOM_ADDR_P8_EX_SLAVE(core, EX_PM_SET_GP1),
+ EX_PM_SETUP_GP1_FAST_SLEEP_DEEP_WINKLE);
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_SLW_SET),
+ "SLW: Failed to write PM_GP1\n");
+ return false;
+ }
+
+ /* Read back for debug */
+ xscom_read(chip->id, XSCOM_ADDR_P8_EX_SLAVE(core, EX_PM_GP1), &tmp);
+ prlog(PR_TRACE, "SLW: PMGP1 read 0x%016llx\n", tmp);
+ return true;
+}
+
+static bool slw_get_idle_state_history(struct proc_chip *chip, struct cpu_thread *c)
+{
+ uint32_t core = pir_to_core_id(c->pir);
+ uint64_t tmp;
+ int rc;
+
+ /* Cleanup history */
+ rc = xscom_read(chip->id,
+ XSCOM_ADDR_P8_EX_SLAVE(core, EX_PM_IDLE_STATE_HISTORY_PHYP),
+ &tmp);
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_SLW_GET),
+ "SLW: Failed to read PM_IDLE_STATE_HISTORY\n");
+ return false;
+ }
+
+ prlog(PR_TRACE, "SLW: core %x:%x history: 0x%016llx (old1)\n",
+ chip->id, core, tmp);
+
+ rc = xscom_read(chip->id,
+ XSCOM_ADDR_P8_EX_SLAVE(core, EX_PM_IDLE_STATE_HISTORY_PHYP),
+ &tmp);
+
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_SLW_GET),
+ "SLW: Failed to read PM_IDLE_STATE_HISTORY\n");
+ return false;
+ }
+
+ prlog(PR_TRACE, "SLW: core %x:%x history: 0x%016llx (old2)\n",
+ chip->id, core, tmp);
+
+ return true;
+}
+
+static bool idle_prepare_core(struct proc_chip *chip, struct cpu_thread *c)
+{
+ prlog(PR_TRACE, "FASTSLEEP: Prepare core %x:%x\n",
+ chip->id, pir_to_core_id(c->pir));
+
+ if(!slw_general_init(chip, c))
+ return false;
+ if(!slw_set_overrides(chip, c))
+ return false;
+ if(!slw_set_idle_mode(chip, c))
+ return false;
+ if(!slw_get_idle_state_history(chip, c))
+ return false;
+
+ return true;
+
+}
+
+/* Define device-tree fields */
+#define MAX_NAME_LEN 16
+struct cpu_idle_states {
+ char name[MAX_NAME_LEN];
+ u32 latency_ns;
+ u32 residency_ns;
+ /*
+ * Register value/mask used to select different idle states.
+ * PMICR in POWER8 and PSSCR in POWER9
+ */
+ u64 pm_ctrl_reg_val;
+ u64 pm_ctrl_reg_mask;
+ u32 flags;
+};
+
+static struct cpu_idle_states nap_only_cpu_idle_states[] = {
+ { /* nap */
+ .name = "nap",
+ .latency_ns = 4000,
+ .residency_ns = 100000,
+ .flags = 0*OPAL_PM_DEC_STOP \
+ | 0*OPAL_PM_TIMEBASE_STOP \
+ | 1*OPAL_PM_LOSE_USER_CONTEXT \
+ | 0*OPAL_PM_LOSE_HYP_CONTEXT \
+ | 0*OPAL_PM_LOSE_FULL_CONTEXT \
+ | 1*OPAL_PM_NAP_ENABLED \
+ | 0*OPAL_PM_SLEEP_ENABLED \
+ | 0*OPAL_PM_WINKLE_ENABLED \
+ | 0*OPAL_USE_PMICR,
+ .pm_ctrl_reg_val = 0,
+ .pm_ctrl_reg_mask = 0 },
+};
+
+static struct cpu_idle_states power8_cpu_idle_states[] = {
+ { /* nap */
+ .name = "nap",
+ .latency_ns = 4000,
+ .residency_ns = 100000,
+ .flags = 0*OPAL_PM_DEC_STOP \
+ | 0*OPAL_PM_TIMEBASE_STOP \
+ | 1*OPAL_PM_LOSE_USER_CONTEXT \
+ | 0*OPAL_PM_LOSE_HYP_CONTEXT \
+ | 0*OPAL_PM_LOSE_FULL_CONTEXT \
+ | 1*OPAL_PM_NAP_ENABLED \
+ | 0*OPAL_USE_PMICR,
+ .pm_ctrl_reg_val = 0,
+ .pm_ctrl_reg_mask = 0 },
+ { /* fast sleep (with workaround) */
+ .name = "fastsleep_",
+ .latency_ns = 40000,
+ .residency_ns = 300000000,
+ .flags = 1*OPAL_PM_DEC_STOP \
+ | 1*OPAL_PM_TIMEBASE_STOP \
+ | 1*OPAL_PM_LOSE_USER_CONTEXT \
+ | 0*OPAL_PM_LOSE_HYP_CONTEXT \
+ | 0*OPAL_PM_LOSE_FULL_CONTEXT \
+ | 1*OPAL_PM_SLEEP_ENABLED_ER1 \
+ | 0*OPAL_USE_PMICR, /* Not enabled until deep
+ states are available */
+ .pm_ctrl_reg_val = OPAL_PM_FASTSLEEP_PMICR,
+ .pm_ctrl_reg_mask = OPAL_PM_SLEEP_PMICR_MASK },
+ { /* Winkle */
+ .name = "winkle",
+ .latency_ns = 10000000,
+ .residency_ns = 1000000000, /* Educated guess (not measured).
+ * Winkle is not currently used by
+ * linux cpuidle subsystem so we
+ * don't have real world user.
+ * However, this should be roughly
+ * accurate for when linux does
+ * use it. */
+ .flags = 1*OPAL_PM_DEC_STOP \
+ | 1*OPAL_PM_TIMEBASE_STOP \
+ | 1*OPAL_PM_LOSE_USER_CONTEXT \
+ | 1*OPAL_PM_LOSE_HYP_CONTEXT \
+ | 1*OPAL_PM_LOSE_FULL_CONTEXT \
+ | 1*OPAL_PM_WINKLE_ENABLED \
+ | 0*OPAL_USE_PMICR, /* Currently choosing deep vs
+ fast via EX_PM_GP1 reg */
+ .pm_ctrl_reg_val = 0,
+ .pm_ctrl_reg_mask = 0 },
+};
+
+/*
+ * cpu_idle_states for key idle states of POWER9 that we want to
+ * exploit.
+ * Note latency_ns and residency_ns are estimated values for now.
+ */
+static struct cpu_idle_states power9_cpu_idle_states[] = {
+ {
+ .name = "stop0_lite", /* Enter stop0 with no state loss */
+ .latency_ns = 1000,
+ .residency_ns = 10000,
+ .flags = 0*OPAL_PM_DEC_STOP \
+ | 0*OPAL_PM_TIMEBASE_STOP \
+ | 0*OPAL_PM_LOSE_USER_CONTEXT \
+ | 0*OPAL_PM_LOSE_HYP_CONTEXT \
+ | 0*OPAL_PM_LOSE_FULL_CONTEXT \
+ | 1*OPAL_PM_STOP_INST_FAST,
+ .pm_ctrl_reg_val = OPAL_PM_PSSCR_RL(0) \
+ | OPAL_PM_PSSCR_MTL(3) \
+ | OPAL_PM_PSSCR_TR(3),
+ .pm_ctrl_reg_mask = OPAL_PM_PSSCR_MASK },
+ {
+ .name = "stop0",
+ .latency_ns = 2000,
+ .residency_ns = 20000,
+ .flags = 0*OPAL_PM_DEC_STOP \
+ | 0*OPAL_PM_TIMEBASE_STOP \
+ | 1*OPAL_PM_LOSE_USER_CONTEXT \
+ | 0*OPAL_PM_LOSE_HYP_CONTEXT \
+ | 0*OPAL_PM_LOSE_FULL_CONTEXT \
+ | 1*OPAL_PM_STOP_INST_FAST,
+ .pm_ctrl_reg_val = OPAL_PM_PSSCR_RL(0) \
+ | OPAL_PM_PSSCR_MTL(3) \
+ | OPAL_PM_PSSCR_TR(3) \
+ | OPAL_PM_PSSCR_ESL \
+ | OPAL_PM_PSSCR_EC,
+ .pm_ctrl_reg_mask = OPAL_PM_PSSCR_MASK },
+
+ /* stop1_lite has been removed since it adds no additional benefit over stop0_lite */
+
+ {
+ .name = "stop1",
+ .latency_ns = 5000,
+ .residency_ns = 50000,
+ .flags = 0*OPAL_PM_DEC_STOP \
+ | 0*OPAL_PM_TIMEBASE_STOP \
+ | 1*OPAL_PM_LOSE_USER_CONTEXT \
+ | 0*OPAL_PM_LOSE_HYP_CONTEXT \
+ | 0*OPAL_PM_LOSE_FULL_CONTEXT \
+ | 1*OPAL_PM_STOP_INST_FAST,
+ .pm_ctrl_reg_val = OPAL_PM_PSSCR_RL(1) \
+ | OPAL_PM_PSSCR_MTL(3) \
+ | OPAL_PM_PSSCR_TR(3) \
+ | OPAL_PM_PSSCR_ESL \
+ | OPAL_PM_PSSCR_EC,
+ .pm_ctrl_reg_mask = OPAL_PM_PSSCR_MASK },
+ /*
+	 * stop2_lite has been removed since it currently adds minimal benefit over
+	 * stop2, and that benefit is eclipsed by the time required to ungate the clocks.
+ */
+
+ {
+ .name = "stop2",
+ .latency_ns = 10000,
+ .residency_ns = 100000,
+ .flags = 0*OPAL_PM_DEC_STOP \
+ | 0*OPAL_PM_TIMEBASE_STOP \
+ | 1*OPAL_PM_LOSE_USER_CONTEXT \
+ | 0*OPAL_PM_LOSE_HYP_CONTEXT \
+ | 0*OPAL_PM_LOSE_FULL_CONTEXT \
+ | 1*OPAL_PM_STOP_INST_FAST,
+ .pm_ctrl_reg_val = OPAL_PM_PSSCR_RL(2) \
+ | OPAL_PM_PSSCR_MTL(3) \
+ | OPAL_PM_PSSCR_TR(3) \
+ | OPAL_PM_PSSCR_ESL \
+ | OPAL_PM_PSSCR_EC,
+ .pm_ctrl_reg_mask = OPAL_PM_PSSCR_MASK },
+ {
+ .name = "stop4",
+ .latency_ns = 100000,
+ .residency_ns = 10000000,
+ .flags = 0*OPAL_PM_DEC_STOP \
+ | 0*OPAL_PM_TIMEBASE_STOP \
+ | 1*OPAL_PM_LOSE_USER_CONTEXT \
+ | 1*OPAL_PM_LOSE_HYP_CONTEXT \
+ | 1*OPAL_PM_LOSE_FULL_CONTEXT \
+ | 1*OPAL_PM_STOP_INST_DEEP,
+ .pm_ctrl_reg_val = OPAL_PM_PSSCR_RL(4) \
+ | OPAL_PM_PSSCR_MTL(7) \
+ | OPAL_PM_PSSCR_TR(3) \
+ | OPAL_PM_PSSCR_ESL \
+ | OPAL_PM_PSSCR_EC,
+ .pm_ctrl_reg_mask = OPAL_PM_PSSCR_MASK },
+ {
+ .name = "stop5",
+ .latency_ns = 200000,
+ .residency_ns = 20000000,
+ .flags = 0*OPAL_PM_DEC_STOP \
+ | 0*OPAL_PM_TIMEBASE_STOP \
+ | 1*OPAL_PM_LOSE_USER_CONTEXT \
+ | 1*OPAL_PM_LOSE_HYP_CONTEXT \
+ | 1*OPAL_PM_LOSE_FULL_CONTEXT \
+ | 1*OPAL_PM_STOP_INST_DEEP,
+ .pm_ctrl_reg_val = OPAL_PM_PSSCR_RL(5) \
+ | OPAL_PM_PSSCR_MTL(7) \
+ | OPAL_PM_PSSCR_TR(3) \
+ | OPAL_PM_PSSCR_ESL \
+ | OPAL_PM_PSSCR_EC,
+ .pm_ctrl_reg_mask = OPAL_PM_PSSCR_MASK },
+
+ {
+ .name = "stop8",
+ .latency_ns = 2000000,
+ .residency_ns = 20000000,
+ .flags = 1*OPAL_PM_DEC_STOP \
+ | 1*OPAL_PM_TIMEBASE_STOP \
+ | 1*OPAL_PM_LOSE_USER_CONTEXT \
+ | 1*OPAL_PM_LOSE_HYP_CONTEXT \
+ | 1*OPAL_PM_LOSE_FULL_CONTEXT \
+ | 1*OPAL_PM_STOP_INST_DEEP,
+ .pm_ctrl_reg_val = OPAL_PM_PSSCR_RL(8) \
+ | OPAL_PM_PSSCR_MTL(11) \
+ | OPAL_PM_PSSCR_TR(3) \
+ | OPAL_PM_PSSCR_ESL \
+ | OPAL_PM_PSSCR_EC,
+ .pm_ctrl_reg_mask = OPAL_PM_PSSCR_MASK },
+
+ {
+ .name = "stop11",
+ .latency_ns = 10000000,
+ .residency_ns = 100000000,
+ .flags = 1*OPAL_PM_DEC_STOP \
+ | 1*OPAL_PM_TIMEBASE_STOP \
+ | 1*OPAL_PM_LOSE_USER_CONTEXT \
+ | 1*OPAL_PM_LOSE_HYP_CONTEXT \
+ | 1*OPAL_PM_LOSE_FULL_CONTEXT \
+ | 1*OPAL_PM_STOP_INST_DEEP,
+ .pm_ctrl_reg_val = OPAL_PM_PSSCR_RL(11) \
+ | OPAL_PM_PSSCR_MTL(11) \
+ | OPAL_PM_PSSCR_TR(3) \
+ | OPAL_PM_PSSCR_ESL \
+ | OPAL_PM_PSSCR_EC,
+ .pm_ctrl_reg_mask = OPAL_PM_PSSCR_MASK },
+
+};
+
+/*
+ * Prior to Mambo.7.8.21, mambo did set the MSR correctly for lite stop
+ * Prior to Mambo.7.8.21, mambo did not set the MSR correctly for lite stop
+ */
+static struct cpu_idle_states power9_mambo_cpu_idle_states[] = {
+ {
+ .name = "stop0",
+ .latency_ns = 2000,
+ .residency_ns = 20000,
+ .flags = 0*OPAL_PM_DEC_STOP \
+ | 0*OPAL_PM_TIMEBASE_STOP \
+ | 1*OPAL_PM_LOSE_USER_CONTEXT \
+ | 0*OPAL_PM_LOSE_HYP_CONTEXT \
+ | 0*OPAL_PM_LOSE_FULL_CONTEXT \
+ | 1*OPAL_PM_STOP_INST_FAST,
+ .pm_ctrl_reg_val = OPAL_PM_PSSCR_RL(0) \
+ | OPAL_PM_PSSCR_MTL(3) \
+ | OPAL_PM_PSSCR_TR(3) \
+ | OPAL_PM_PSSCR_ESL \
+ | OPAL_PM_PSSCR_EC,
+ .pm_ctrl_reg_mask = OPAL_PM_PSSCR_MASK },
+ {
+ .name = "stop1",
+ .latency_ns = 5000,
+ .residency_ns = 50000,
+ .flags = 0*OPAL_PM_DEC_STOP \
+ | 0*OPAL_PM_TIMEBASE_STOP \
+ | 1*OPAL_PM_LOSE_USER_CONTEXT \
+ | 0*OPAL_PM_LOSE_HYP_CONTEXT \
+ | 0*OPAL_PM_LOSE_FULL_CONTEXT \
+ | 1*OPAL_PM_STOP_INST_FAST,
+ .pm_ctrl_reg_val = OPAL_PM_PSSCR_RL(1) \
+ | OPAL_PM_PSSCR_MTL(3) \
+ | OPAL_PM_PSSCR_TR(3) \
+ | OPAL_PM_PSSCR_ESL \
+ | OPAL_PM_PSSCR_EC,
+ .pm_ctrl_reg_mask = OPAL_PM_PSSCR_MASK },
+ {
+ .name = "stop2",
+ .latency_ns = 10000,
+ .residency_ns = 100000,
+ .flags = 0*OPAL_PM_DEC_STOP \
+ | 0*OPAL_PM_TIMEBASE_STOP \
+ | 1*OPAL_PM_LOSE_USER_CONTEXT \
+ | 0*OPAL_PM_LOSE_HYP_CONTEXT \
+ | 0*OPAL_PM_LOSE_FULL_CONTEXT \
+ | 1*OPAL_PM_STOP_INST_FAST,
+ .pm_ctrl_reg_val = OPAL_PM_PSSCR_RL(2) \
+ | OPAL_PM_PSSCR_MTL(3) \
+ | OPAL_PM_PSSCR_TR(3) \
+ | OPAL_PM_PSSCR_ESL \
+ | OPAL_PM_PSSCR_EC,
+ .pm_ctrl_reg_mask = OPAL_PM_PSSCR_MASK },
+ {
+ .name = "stop4",
+ .latency_ns = 100000,
+ .residency_ns = 1000000,
+ .flags = 1*OPAL_PM_DEC_STOP \
+ | 1*OPAL_PM_TIMEBASE_STOP \
+ | 1*OPAL_PM_LOSE_USER_CONTEXT \
+ | 1*OPAL_PM_LOSE_HYP_CONTEXT \
+ | 1*OPAL_PM_LOSE_FULL_CONTEXT \
+ | 1*OPAL_PM_STOP_INST_DEEP,
+ .pm_ctrl_reg_val = OPAL_PM_PSSCR_RL(4) \
+ | OPAL_PM_PSSCR_MTL(7) \
+ | OPAL_PM_PSSCR_TR(3) \
+ | OPAL_PM_PSSCR_ESL \
+ | OPAL_PM_PSSCR_EC,
+ .pm_ctrl_reg_mask = OPAL_PM_PSSCR_MASK },
+
+ {
+ .name = "stop8",
+ .latency_ns = 2000000,
+ .residency_ns = 20000000,
+ .flags = 1*OPAL_PM_DEC_STOP \
+ | 1*OPAL_PM_TIMEBASE_STOP \
+ | 1*OPAL_PM_LOSE_USER_CONTEXT \
+ | 1*OPAL_PM_LOSE_HYP_CONTEXT \
+ | 1*OPAL_PM_LOSE_FULL_CONTEXT \
+ | 1*OPAL_PM_STOP_INST_DEEP,
+ .pm_ctrl_reg_val = OPAL_PM_PSSCR_RL(8) \
+ | OPAL_PM_PSSCR_MTL(11) \
+ | OPAL_PM_PSSCR_TR(3) \
+ | OPAL_PM_PSSCR_ESL \
+ | OPAL_PM_PSSCR_EC,
+ .pm_ctrl_reg_mask = OPAL_PM_PSSCR_MASK },
+
+ {
+ .name = "stop11",
+ .latency_ns = 10000000,
+ .residency_ns = 100000000,
+ .flags = 1*OPAL_PM_DEC_STOP \
+ | 1*OPAL_PM_TIMEBASE_STOP \
+ | 1*OPAL_PM_LOSE_USER_CONTEXT \
+ | 1*OPAL_PM_LOSE_HYP_CONTEXT \
+ | 1*OPAL_PM_LOSE_FULL_CONTEXT \
+ | 1*OPAL_PM_STOP_INST_DEEP,
+ .pm_ctrl_reg_val = OPAL_PM_PSSCR_RL(11) \
+ | OPAL_PM_PSSCR_MTL(11) \
+ | OPAL_PM_PSSCR_TR(3) \
+ | OPAL_PM_PSSCR_ESL \
+ | OPAL_PM_PSSCR_EC,
+ .pm_ctrl_reg_mask = OPAL_PM_PSSCR_MASK },
+
+};
+
+/*
+ * cpu_idle_states for fused core configuration
+ * These will be a subset of power9 idle states.
+ */
+static struct cpu_idle_states power9_fusedcore_cpu_idle_states[] = {
+ {
+ .name = "stop0_lite", /* Enter stop0 with no state loss */
+ .latency_ns = 1000,
+ .residency_ns = 10000,
+ .flags = 0*OPAL_PM_DEC_STOP \
+ | 0*OPAL_PM_TIMEBASE_STOP \
+ | 0*OPAL_PM_LOSE_USER_CONTEXT \
+ | 0*OPAL_PM_LOSE_HYP_CONTEXT \
+ | 0*OPAL_PM_LOSE_FULL_CONTEXT \
+ | 1*OPAL_PM_STOP_INST_FAST,
+ .pm_ctrl_reg_val = OPAL_PM_PSSCR_RL(0) \
+ | OPAL_PM_PSSCR_MTL(3) \
+ | OPAL_PM_PSSCR_TR(3),
+ .pm_ctrl_reg_mask = OPAL_PM_PSSCR_MASK },
+ {
+ .name = "stop0",
+ .latency_ns = 2000,
+ .residency_ns = 20000,
+ .flags = 0*OPAL_PM_DEC_STOP \
+ | 0*OPAL_PM_TIMEBASE_STOP \
+ | 1*OPAL_PM_LOSE_USER_CONTEXT \
+ | 0*OPAL_PM_LOSE_HYP_CONTEXT \
+ | 0*OPAL_PM_LOSE_FULL_CONTEXT \
+ | 1*OPAL_PM_STOP_INST_FAST,
+ .pm_ctrl_reg_val = OPAL_PM_PSSCR_RL(0) \
+ | OPAL_PM_PSSCR_MTL(3) \
+ | OPAL_PM_PSSCR_TR(3) \
+ | OPAL_PM_PSSCR_ESL \
+ | OPAL_PM_PSSCR_EC,
+ .pm_ctrl_reg_mask = OPAL_PM_PSSCR_MASK },
+
+ /* stop1_lite has been removed since it adds no additional benefit over stop0_lite */
+
+ {
+ .name = "stop1",
+ .latency_ns = 5000,
+ .residency_ns = 50000,
+ .flags = 0*OPAL_PM_DEC_STOP \
+ | 0*OPAL_PM_TIMEBASE_STOP \
+ | 1*OPAL_PM_LOSE_USER_CONTEXT \
+ | 0*OPAL_PM_LOSE_HYP_CONTEXT \
+ | 0*OPAL_PM_LOSE_FULL_CONTEXT \
+ | 1*OPAL_PM_STOP_INST_FAST,
+ .pm_ctrl_reg_val = OPAL_PM_PSSCR_RL(1) \
+ | OPAL_PM_PSSCR_MTL(3) \
+ | OPAL_PM_PSSCR_TR(3) \
+ | OPAL_PM_PSSCR_ESL \
+ | OPAL_PM_PSSCR_EC,
+ .pm_ctrl_reg_mask = OPAL_PM_PSSCR_MASK },
+ /*
+ * stop2_lite has been removed since it currently adds minimal benefit over stop2,
+ * and whatever benefit it has is eclipsed by the time required to ungate the clocks.
+ */
+
+ {
+ .name = "stop2",
+ .latency_ns = 10000,
+ .residency_ns = 100000,
+ .flags = 0*OPAL_PM_DEC_STOP \
+ | 0*OPAL_PM_TIMEBASE_STOP \
+ | 1*OPAL_PM_LOSE_USER_CONTEXT \
+ | 0*OPAL_PM_LOSE_HYP_CONTEXT \
+ | 0*OPAL_PM_LOSE_FULL_CONTEXT \
+ | 1*OPAL_PM_STOP_INST_FAST,
+ .pm_ctrl_reg_val = OPAL_PM_PSSCR_RL(2) \
+ | OPAL_PM_PSSCR_MTL(3) \
+ | OPAL_PM_PSSCR_TR(3) \
+ | OPAL_PM_PSSCR_ESL \
+ | OPAL_PM_PSSCR_EC,
+ .pm_ctrl_reg_mask = OPAL_PM_PSSCR_MASK },
+};
+
+/*
+ * Note latency_ns and residency_ns are estimated values for now.
+ */
+static struct cpu_idle_states power10_cpu_idle_states[] = {
+ {
+ .name = "stop0_lite", /* Enter stop0 with no state loss */
+ .latency_ns = 1000,
+ .residency_ns = 10000,
+ .flags = 0*OPAL_PM_DEC_STOP \
+ | 0*OPAL_PM_TIMEBASE_STOP \
+ | 0*OPAL_PM_LOSE_USER_CONTEXT \
+ | 0*OPAL_PM_LOSE_HYP_CONTEXT \
+ | 0*OPAL_PM_LOSE_FULL_CONTEXT \
+ | 1*OPAL_PM_STOP_INST_FAST,
+ .pm_ctrl_reg_val = OPAL_PM_PSSCR_RL(0) \
+ | OPAL_PM_PSSCR_MTL(0) \
+ | OPAL_PM_PSSCR_TR(3),
+ .pm_ctrl_reg_mask = OPAL_PM_PSSCR_MASK },
+ {
+ .name = "stop0",
+ .latency_ns = 10000,
+ .residency_ns = 100000,
+ .flags = 0*OPAL_PM_DEC_STOP \
+ | 0*OPAL_PM_TIMEBASE_STOP \
+ | 1*OPAL_PM_LOSE_USER_CONTEXT \
+ | 0*OPAL_PM_LOSE_HYP_CONTEXT \
+ | 0*OPAL_PM_LOSE_FULL_CONTEXT \
+ | 1*OPAL_PM_STOP_INST_FAST,
+ .pm_ctrl_reg_val = OPAL_PM_PSSCR_RL(0) \
+ | OPAL_PM_PSSCR_MTL(0) \
+ | OPAL_PM_PSSCR_TR(3) \
+ | OPAL_PM_PSSCR_ESL \
+ | OPAL_PM_PSSCR_EC,
+ .pm_ctrl_reg_mask = OPAL_PM_PSSCR_MASK },
+ {
+ .name = "stop2",
+ .latency_ns = 20000,
+ .residency_ns = 200000,
+ .flags = 0*OPAL_PM_DEC_STOP \
+ | 0*OPAL_PM_TIMEBASE_STOP \
+ | 1*OPAL_PM_LOSE_USER_CONTEXT \
+ | 0*OPAL_PM_LOSE_HYP_CONTEXT \
+ | 0*OPAL_PM_LOSE_FULL_CONTEXT \
+ | 1*OPAL_PM_STOP_INST_FAST,
+ .pm_ctrl_reg_val = OPAL_PM_PSSCR_RL(2) \
+ | OPAL_PM_PSSCR_MTL(2) \
+ | OPAL_PM_PSSCR_TR(3) \
+ | OPAL_PM_PSSCR_ESL \
+ | OPAL_PM_PSSCR_EC,
+ .pm_ctrl_reg_mask = OPAL_PM_PSSCR_MASK },
+ {
+ .name = "stop3",
+ .latency_ns = 45000,
+ .residency_ns = 450000,
+ .flags = 0*OPAL_PM_DEC_STOP \
+ | 0*OPAL_PM_TIMEBASE_STOP \
+ | 1*OPAL_PM_LOSE_USER_CONTEXT \
+ | 0*OPAL_PM_LOSE_HYP_CONTEXT \
+ | 0*OPAL_PM_LOSE_FULL_CONTEXT \
+ | 1*OPAL_PM_STOP_INST_FAST,
+ .pm_ctrl_reg_val = OPAL_PM_PSSCR_RL(3) \
+ | OPAL_PM_PSSCR_MTL(3) \
+ | OPAL_PM_PSSCR_TR(3) \
+ | OPAL_PM_PSSCR_ESL \
+ | OPAL_PM_PSSCR_EC,
+ .pm_ctrl_reg_mask = OPAL_PM_PSSCR_MASK },
+#if 0
+ {
+ .name = "stop11",
+ .latency_ns = 10000000,
+ .residency_ns = 100000000,
+ .flags = 1*OPAL_PM_DEC_STOP \
+ | 1*OPAL_PM_TIMEBASE_STOP \
+ | 1*OPAL_PM_LOSE_USER_CONTEXT \
+ | 1*OPAL_PM_LOSE_HYP_CONTEXT \
+ | 1*OPAL_PM_LOSE_FULL_CONTEXT \
+ | 1*OPAL_PM_STOP_INST_DEEP,
+ .pm_ctrl_reg_val = OPAL_PM_PSSCR_RL(11) \
+ | OPAL_PM_PSSCR_MTL(11) \
+ | OPAL_PM_PSSCR_TR(3) \
+ | OPAL_PM_PSSCR_ESL \
+ | OPAL_PM_PSSCR_EC,
+ .pm_ctrl_reg_mask = OPAL_PM_PSSCR_MASK },
+#endif
+};
+
+static void slw_late_init_p9(struct proc_chip *chip)
+{
+ struct cpu_thread *c;
+ int rc;
+
+ prlog(PR_INFO, "SLW: Configuring self-restore for HRMOR\n");
+ for_each_available_cpu(c) {
+ if (c->chip_id != chip->id)
+ continue;
+ /*
+ * Clear HRMOR. Strictly this only needs to be done for thread 0
+ * of each core, but do it for all threads anyway.
+ */
+ rc = p9_stop_save_cpureg((void *)chip->homer_base,
+ P9_STOP_SPR_HRMOR, 0,
+ c->pir);
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_SLW_REG),
+ "SLW: Failed to set HRMOR for CPU %x,RC=0x%x\n",
+ c->pir, rc);
+ prlog(PR_ERR, "Disabling deep stop states\n");
+ }
+ }
+}
+
+static void slw_late_init_p10(struct proc_chip *chip)
+{
+ struct cpu_thread *c;
+ int rc;
+
+ prlog(PR_INFO, "SLW: Configuring self-restore for HRMOR\n");
+ for_each_available_cpu(c) {
+ if (c->chip_id != chip->id)
+ continue;
+ /*
+ * Clear HRMOR. Strictly this only needs to be done for thread 0
+ * of each core, but do it for all threads anyway.
+ */
+ rc = proc_stop_save_cpureg((void *)chip->homer_base,
+ PROC_STOP_SPR_HRMOR, 0,
+ c->pir);
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_SLW_REG),
+ "SLW: Failed to set HRMOR for CPU %x,RC=0x%x\n",
+ c->pir, rc);
+ prlog(PR_ERR, "Disabling deep stop states\n");
+ }
+ }
+}
+
+/* Add device tree properties to describe idle states */
+void add_cpu_idle_state_properties(void)
+{
+ struct dt_node *power_mgt;
+ struct cpu_idle_states *states;
+ struct proc_chip *chip;
+ int nr_states;
+
+ bool can_sleep = true;
+ bool has_stop_inst = false;
+ u8 i;
+
+ fdt64_t *pm_ctrl_reg_val_buf;
+ fdt64_t *pm_ctrl_reg_mask_buf;
+ u32 supported_states_mask;
+ u32 opal_disabled_states_mask = ~0xFC000000; /* disable all but stop0-5 */
+ const char* nvram_disable_str;
+ u32 nvram_disabled_states_mask = 0x00;
+ u32 stop_levels;
+
+ /* Variables to track buffer length */
+ u8 name_buf_len;
+ u8 num_supported_idle_states;
+
+ /* Buffers to hold idle state properties */
+ char *name_buf, *alloced_name_buf;
+ fdt32_t *latency_ns_buf;
+ fdt32_t *residency_ns_buf;
+ fdt32_t *flags_buf;
+
+ prlog(PR_DEBUG, "CPU idle state device tree init\n");
+
+ /* Create /ibm,opal/power-mgt if it doesn't exist already */
+ power_mgt = dt_new_check(opal_node, "power-mgt");
+ if (!power_mgt) {
+ /**
+ * @fwts-label CreateDTPowerMgtNodeFail
+ * @fwts-advice OPAL failed to add the power-mgt device tree
+ * node. This could mean that firmware ran out of memory,
+ * or there's a bug somewhere.
+ */
+ prlog(PR_ERR, "creating dt node /ibm,opal/power-mgt failed\n");
+ return;
+ }
+
+ /*
+ * Choose the right state table for the chip
+ *
+ * XXX We use the first chip's version; we should probably look
+ * for the lowest version across all chips instead.
+ */
+ chip = next_chip(NULL);
+ assert(chip);
+ if (proc_gen >= proc_gen_p9) {
+ if (chip->type == PROC_CHIP_P9_NIMBUS ||
+ chip->type == PROC_CHIP_P9_CUMULUS ||
+ chip->type == PROC_CHIP_P9P) {
+ if (proc_chip_quirks & QUIRK_MAMBO_CALLOUTS) {
+ states = power9_mambo_cpu_idle_states;
+ nr_states = ARRAY_SIZE(power9_mambo_cpu_idle_states);
+ } else if (this_cpu()->is_fused_core) {
+ states = power9_fusedcore_cpu_idle_states;
+ nr_states = ARRAY_SIZE(power9_fusedcore_cpu_idle_states);
+ } else {
+ states = power9_cpu_idle_states;
+ nr_states = ARRAY_SIZE(power9_cpu_idle_states);
+ }
+ } else if (chip->type == PROC_CHIP_P10) {
+ states = power10_cpu_idle_states;
+ nr_states = ARRAY_SIZE(power10_cpu_idle_states);
+ } else {
+ prlog(PR_ERR, "SLW: Cannot determine chip type\n");
+ return;
+ }
+
+ has_stop_inst = true;
+ stop_levels = dt_prop_get_u32_def(power_mgt,
+ "ibm,enabled-stop-levels", 0);
+ if (!stop_levels) {
+ prerror("SLW: No stop levels available. Power saving is disabled!\n");
+ has_deep_states = false;
+ } else {
+ /* Iterate to see if we have deep states enabled */
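+ /*
+ * Note: the low PSSCR RL bits of pm_ctrl_reg_val hold the stop
+ * level, and ibm,enabled-stop-levels is treated as an MSB-first
+ * bitmask (1ul << 31 corresponds to stop0), hence the
+ * level = 31 - RL conversion below (stop11 maps to 1ul << 20).
+ */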
+ for (i = 0; i < nr_states; i++) {
+ u32 level = 31 - (states[i].pm_ctrl_reg_val &
+ OPAL_PM_PSSCR_RL_MASK);
+
+ if ((stop_levels & (1ul << level)) &&
+ (states[i].flags & OPAL_PM_STOP_INST_DEEP))
+ has_deep_states = true;
+ }
+ }
+ if ((wakeup_engine_state == WAKEUP_ENGINE_PRESENT) && has_deep_states) {
+ if (chip->type == PROC_CHIP_P9_NIMBUS ||
+ chip->type == PROC_CHIP_P9_CUMULUS) {
+ slw_late_init_p9(chip);
+ xive_late_init();
+ nx_p9_rng_late_init();
+ } else if (chip->type == PROC_CHIP_P10) {
+ slw_late_init_p10(chip);
+ xive2_late_init();
+ }
+ }
+ if (wakeup_engine_state != WAKEUP_ENGINE_PRESENT)
+ has_deep_states = false;
+ } else if (chip->type == PROC_CHIP_P8_MURANO ||
+ chip->type == PROC_CHIP_P8_VENICE ||
+ chip->type == PROC_CHIP_P8_NAPLES) {
+ const struct dt_property *p;
+
+ p = dt_find_property(dt_root, "ibm,enabled-idle-states");
+ if (p)
+ prlog(PR_NOTICE,
+ "SLW: HB-provided idle states property found\n");
+ states = power8_cpu_idle_states;
+ nr_states = ARRAY_SIZE(power8_cpu_idle_states);
+
+ /* Check if hostboot say we can sleep */
+ if (!p || !dt_prop_find_string(p, "fast-sleep")) {
+ prlog(PR_WARNING, "SLW: Sleep not enabled by HB"
+ " on this platform\n");
+ can_sleep = false;
+ }
+
+ /* Clip to NAP only on Murano and Venice DD1.x */
+ if ((chip->type == PROC_CHIP_P8_MURANO ||
+ chip->type == PROC_CHIP_P8_VENICE) &&
+ chip->ec_level < 0x20) {
+ prlog(PR_NOTICE, "SLW: Sleep not enabled on P8 DD1.x\n");
+ can_sleep = false;
+ }
+
+ } else {
+ states = nap_only_cpu_idle_states;
+ nr_states = ARRAY_SIZE(nap_only_cpu_idle_states);
+ }
+
+
+ /*
+ * Currently we can't append strings and cells to dt properties.
+ * So create buffers to which you can append values, then create
+ * dt properties with this buffer content.
+ */
+
+ /* Allocate memory to idle state property buffers. */
+ alloced_name_buf= malloc(nr_states * sizeof(char) * MAX_NAME_LEN);
+ name_buf = alloced_name_buf;
+ latency_ns_buf = malloc(nr_states * sizeof(u32));
+ residency_ns_buf= malloc(nr_states * sizeof(u32));
+ flags_buf = malloc(nr_states * sizeof(u32));
+ pm_ctrl_reg_val_buf = malloc(nr_states * sizeof(u64));
+ pm_ctrl_reg_mask_buf = malloc(nr_states * sizeof(u64));
+
+ name_buf_len = 0;
+ num_supported_idle_states = 0;
+
+ /*
+ * Create a mask with the flags of all supported idle states
+ * set. Use this to only add supported idle states to the
+ * device-tree
+ */
+ if (has_stop_inst) {
+ /* Power 9/10 / POWER ISA 3.0 and above */
+ supported_states_mask = OPAL_PM_STOP_INST_FAST;
+ if (wakeup_engine_state == WAKEUP_ENGINE_PRESENT)
+ supported_states_mask |= OPAL_PM_STOP_INST_DEEP;
+ } else {
+ /* Power 7 and Power 8 */
+ supported_states_mask = OPAL_PM_NAP_ENABLED;
+ if (can_sleep)
+ supported_states_mask |= OPAL_PM_SLEEP_ENABLED |
+ OPAL_PM_SLEEP_ENABLED_ER1;
+ if (wakeup_engine_state == WAKEUP_ENGINE_PRESENT)
+ supported_states_mask |= OPAL_PM_WINKLE_ENABLED;
+ }
+ nvram_disable_str = nvram_query_dangerous("opal-stop-state-disable-mask");
+ if (nvram_disable_str)
+ nvram_disabled_states_mask = strtol(nvram_disable_str, NULL, 0);
+ prlog(PR_DEBUG, "NVRAM stop disable mask: %x\n", nvram_disabled_states_mask);
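+ /*
+ * Note: a state is skipped when its bit is set in either mask, and
+ * an NVRAM-provided mask fully overrides the OPAL default: a level
+ * whose bit is clear in the NVRAM mask is enabled even if OPAL
+ * disables it by default.
+ */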
+ for (i = 0; i < nr_states; i++) {
+ /* For each state, check if it is one of the supported states. */
+ if (!(states[i].flags & supported_states_mask))
+ continue;
+
+ /* We can only use the stop levels that HB has made available */
+ if (has_stop_inst) {
+ u32 level = 31 - (states[i].pm_ctrl_reg_val &
+ OPAL_PM_PSSCR_RL_MASK);
+
+ if (!(stop_levels & (1ul << level)))
+ continue;
+
+ if ((opal_disabled_states_mask |
+ nvram_disabled_states_mask) &
+ (1ul << level)) {
+ if (nvram_disable_str &&
+ !(nvram_disabled_states_mask & (1ul << level))) {
+ prlog(PR_NOTICE, "SLW: Enabling: %s "
+ "(disabled in OPAL, forced by "
+ "NVRAM)\n",states[i].name);
+ } else {
+ prlog(PR_NOTICE, "SLW: Disabling: %s in OPAL\n",
+ states[i].name);
+ continue;
+ }
+ }
+ }
+
+ prlog(PR_INFO, "SLW: Enabling: %s\n", states[i].name);
+
+ /*
+ * If a state is supported add each of its property
+ * to its corresponding property buffer.
+ */
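+ /*
+ * The names are packed back to back as NUL-terminated strings,
+ * the usual encoding for a device tree string-list property,
+ * hence the pointer advances by strlen() + 1.
+ */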
+ strncpy(name_buf, states[i].name, MAX_NAME_LEN);
+ name_buf = name_buf + strlen(states[i].name) + 1;
+
+ *latency_ns_buf = cpu_to_fdt32(states[i].latency_ns);
+ latency_ns_buf++;
+
+ *residency_ns_buf = cpu_to_fdt32(states[i].residency_ns);
+ residency_ns_buf++;
+
+ *flags_buf = cpu_to_fdt32(states[i].flags);
+ flags_buf++;
+
+ *pm_ctrl_reg_val_buf = cpu_to_fdt64(states[i].pm_ctrl_reg_val);
+ pm_ctrl_reg_val_buf++;
+
+ *pm_ctrl_reg_mask_buf = cpu_to_fdt64(states[i].pm_ctrl_reg_mask);
+ pm_ctrl_reg_mask_buf++;
+
+ /* Increment buffer length trackers */
+ name_buf_len += strlen(states[i].name) + 1;
+ num_supported_idle_states++;
+
+ }
+
+ /* Point buffer pointers back to beginning of the buffer */
+ name_buf -= name_buf_len;
+ latency_ns_buf -= num_supported_idle_states;
+ residency_ns_buf -= num_supported_idle_states;
+ flags_buf -= num_supported_idle_states;
+ pm_ctrl_reg_val_buf -= num_supported_idle_states;
+ pm_ctrl_reg_mask_buf -= num_supported_idle_states;
+ /* Create dt properties with the buffer content */
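+ /*
+ * Illustrative sketch of the result (values come from the tables
+ * above; the exact set depends on chip type and enabled levels),
+ * e.g. for a P9 fused core with stop0-stop2 enabled:
+ * ibm,cpu-idle-state-names = "stop0_lite", "stop0", "stop1", "stop2"
+ * ibm,cpu-idle-state-latencies-ns = <1000 2000 5000 10000>
+ * ibm,cpu-idle-state-residency-ns = <10000 20000 50000 100000>
+ * plus matching flags and PSSCR value/mask entries.
+ */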
+ dt_add_property(power_mgt, "ibm,cpu-idle-state-names", name_buf,
+ name_buf_len* sizeof(char));
+ dt_add_property(power_mgt, "ibm,cpu-idle-state-latencies-ns",
+ latency_ns_buf, num_supported_idle_states * sizeof(u32));
+ dt_add_property(power_mgt, "ibm,cpu-idle-state-residency-ns",
+ residency_ns_buf, num_supported_idle_states * sizeof(u32));
+ dt_add_property(power_mgt, "ibm,cpu-idle-state-flags", flags_buf,
+ num_supported_idle_states * sizeof(u32));
+
+ if (has_stop_inst) {
+ dt_add_property(power_mgt, "ibm,cpu-idle-state-psscr",
+ pm_ctrl_reg_val_buf,
+ num_supported_idle_states * sizeof(u64));
+ dt_add_property(power_mgt, "ibm,cpu-idle-state-psscr-mask",
+ pm_ctrl_reg_mask_buf,
+ num_supported_idle_states * sizeof(u64));
+ } else {
+ dt_add_property(power_mgt, "ibm,cpu-idle-state-pmicr",
+ pm_ctrl_reg_val_buf,
+ num_supported_idle_states * sizeof(u64));
+ dt_add_property(power_mgt, "ibm,cpu-idle-state-pmicr-mask",
+ pm_ctrl_reg_mask_buf,
+ num_supported_idle_states * sizeof(u64));
+ }
+ assert(alloced_name_buf == name_buf);
+ free(alloced_name_buf);
+ free(latency_ns_buf);
+ free(residency_ns_buf);
+ free(flags_buf);
+ free(pm_ctrl_reg_val_buf);
+ free(pm_ctrl_reg_mask_buf);
+}
+
+static void slw_cleanup_core(struct proc_chip *chip, struct cpu_thread *c)
+{
+ uint64_t tmp;
+ int rc;
+
+ /* Display history to check transition */
+ rc = xscom_read(chip->id,
+ XSCOM_ADDR_P8_EX_SLAVE(pir_to_core_id(c->pir),
+ EX_PM_IDLE_STATE_HISTORY_PHYP),
+ &tmp);
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_SLW_GET),
+ "SLW: Failed to read PM_IDLE_STATE_HISTORY\n");
+ /* XXX error handling ? return false; */
+ }
+
+ prlog(PR_DEBUG, "SLW: core %x:%x history: 0x%016llx (new1)\n",
+ chip->id, pir_to_core_id(c->pir), tmp);
+
+ rc = xscom_read(chip->id,
+ XSCOM_ADDR_P8_EX_SLAVE(pir_to_core_id(c->pir),
+ EX_PM_IDLE_STATE_HISTORY_PHYP),
+ &tmp);
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_SLW_GET),
+ "SLW: Failed to read PM_IDLE_STATE_HISTORY\n");
+ /* XXX error handling ? return false; */
+ }
+
+ prlog(PR_DEBUG, "SLW: core %x:%x history: 0x%016llx (new2)\n",
+ chip->id, pir_to_core_id(c->pir), tmp);
+
+ /*
+ * XXX FIXME: Error out if the transition didn't reach rvwinkle ?
+ */
+
+ /*
+ * XXX FIXME: We should restore a bunch of the EX bits we
+ * overwrite to sane values here
+ */
+ slw_unset_overrides(chip, c);
+}
+
+static void slw_cleanup_chip(struct proc_chip *chip)
+{
+ struct cpu_thread *c;
+
+ for_each_available_core_in_chip(c, chip->id)
+ slw_cleanup_core(chip, c);
+}
+
+static void slw_patch_scans(struct proc_chip *chip, bool le_mode)
+{
+ int64_t rc;
+ uint64_t old_val, new_val;
+
+ rc = sbe_xip_get_scalar((void *)chip->slw_base,
+ "skip_ex_override_ring_scans", &old_val);
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_SLW_REG),
+ "SLW: Failed to read scan override on chip %d\n",
+ chip->id);
+ return;
+ }
+
+ new_val = le_mode ? 0 : 1;
+
+ prlog(PR_TRACE, "SLW: Chip %d, LE value was: %lld, setting to %lld\n",
+ chip->id, old_val, new_val);
+
+ rc = sbe_xip_set_scalar((void *)chip->slw_base,
+ "skip_ex_override_ring_scans", new_val);
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_SLW_REG),
+ "SLW: Failed to set LE mode on chip %d\n", chip->id);
+ return;
+ }
+}
+
+int64_t slw_reinit(uint64_t flags)
+{
+ struct proc_chip *chip;
+ struct cpu_thread *cpu;
+ bool has_waker = false;
+ bool target_le = slw_current_le;
+
+ if (flags & OPAL_REINIT_CPUS_HILE_BE)
+ target_le = false;
+ if (flags & OPAL_REINIT_CPUS_HILE_LE)
+ target_le = true;
+
+ prlog(PR_TRACE, "SLW Reinit from CPU PIR 0x%04x,"
+ " HILE set to %s endian...\n",
+ this_cpu()->pir,
+ target_le ? "little" : "big");
+
+ /* Prepare chips/cores for rvwinkle */
+ for_each_chip(chip) {
+ if (!chip->slw_base) {
+ log_simple_error(&e_info(OPAL_RC_SLW_INIT),
+ "SLW: Not found on chip %d\n", chip->id);
+ return OPAL_HARDWARE;
+ }
+
+ slw_patch_scans(chip, target_le);
+ }
+ slw_current_le = target_le;
+
+ /* XXX Save HIDs ? Or do that in head.S ... */
+
+ slw_patch_reset();
+
+ /* rvwinkle everybody and pick one to wake me once I rvwinkle myself */
+ for_each_available_cpu(cpu) {
+ struct cpu_thread *master = NULL;
+
+ if (cpu == this_cpu())
+ continue;
+
+ /* Pick up a waker for myself: it must not be a sibling of
+ * the current CPU and must be a thread 0 (so it gets to
+ * sync its timebase before doing time_wait_ms())
+ */
+ if (!has_waker && !cpu_is_sibling(cpu, this_cpu()) &&
+ cpu_is_thread0(cpu)) {
+ has_waker = true;
+ master = this_cpu();
+ }
+ __cpu_queue_job(cpu, "slw_do_rvwinkle",
+ slw_do_rvwinkle, master, true);
+
+ /* Wait for it to claim to be down */
+ while(cpu->state != cpu_state_rvwinkle)
+ sync();
+ }
+
+ /* XXX Wait one second ! (should check xscom state ? ) */
+ prlog(PR_TRACE, "SLW: Waiting one second...\n");
+ time_wait_ms(1000);
+ prlog(PR_TRACE, "SLW: Done.\n");
+
+ for_each_chip(chip) {
+ struct cpu_thread *c;
+ uint64_t tmp;
+ for_each_available_core_in_chip(c, chip->id) {
+ xscom_read(chip->id,
+ XSCOM_ADDR_P8_EX_SLAVE(pir_to_core_id(c->pir),
+ EX_PM_IDLE_STATE_HISTORY_PHYP),
+ &tmp);
+ prlog(PR_DEBUG, "SLW: core %x:%x"
+ " history: 0x%016llx (mid)\n",
+ chip->id, pir_to_core_id(c->pir), tmp);
+ }
+ }
+
+
+ /* Wake everybody except on my core */
+ for_each_cpu(cpu) {
+ if (cpu->state != cpu_state_rvwinkle ||
+ cpu_is_sibling(cpu, this_cpu()))
+ continue;
+ icp_kick_cpu(cpu);
+
+ /* Wait for it to claim to be back (XXX ADD TIMEOUT) */
+ while(cpu->state != cpu_state_active)
+ sync();
+ }
+
+ /* Did we find a waker ? If we didn't, that means we had no
+ * other core in the system, we can't do it
+ */
+ if (!has_waker) {
+ prlog(PR_TRACE, "SLW: No candidate waker, giving up !\n");
+ return OPAL_HARDWARE;
+ }
+
+ /* Our siblings are rvwinkling, and our waker is waiting for us
+ * so let's just go down now
+ */
+ slw_do_rvwinkle(NULL);
+
+ slw_unpatch_reset();
+
+ for_each_chip(chip)
+ slw_cleanup_chip(chip);
+
+ prlog(PR_TRACE, "SLW Reinit complete !\n");
+
+ return OPAL_SUCCESS;
+}
+
+static void slw_patch_regs(struct proc_chip *chip)
+{
+ struct cpu_thread *c;
+ void *image = (void *)chip->slw_base;
+ int rc;
+
+ for_each_available_cpu(c) {
+ if (c->chip_id != chip->id)
+ continue;
+
+ /* Clear HRMOR */
+ rc = p8_pore_gen_cpureg_fixed(image, P8_SLW_MODEBUILD_SRAM,
+ P8_SPR_HRMOR, 0,
+ cpu_get_core_index(c),
+ cpu_get_thread_index(c));
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_SLW_REG),
+ "SLW: Failed to set HRMOR for CPU %x\n",
+ c->pir);
+ }
+
+ /* XXX Add HIDs etc... */
+ }
+}
+
+static void slw_init_chip_p9(struct proc_chip *chip)
+{
+ struct cpu_thread *c;
+
+ prlog(PR_DEBUG, "SLW: Init chip 0x%x\n", chip->id);
+
+ /* At power ON setup inits for power-mgt */
+ for_each_available_core_in_chip(c, chip->id)
+ slw_set_overrides_p9(chip, c);
+
+
+}
+
+static void slw_init_chip_p10(struct proc_chip *chip)
+{
+ struct cpu_thread *c;
+
+ prlog(PR_DEBUG, "SLW: Init chip 0x%x\n", chip->id);
+
+ /* At power ON setup inits for power-mgt */
+ for_each_available_core_in_chip(c, chip->id)
+ slw_set_overrides_p10(chip, c);
+
+
+}
+
+
+static bool slw_image_check_p9(struct proc_chip *chip)
+{
+
+ if (!chip->homer_base) {
+ log_simple_error(&e_info(OPAL_RC_SLW_REG),
+ "SLW: HOMER base not set %x\n",
+ chip->id);
+ return false;
+ } else
+ return true;
+
+
+}
+
+static bool slw_image_check_p8(struct proc_chip *chip)
+{
+ int64_t rc;
+
+ prlog(PR_DEBUG, "SLW: slw_check chip 0x%x\n", chip->id);
+ if (!chip->slw_base) {
+ prerror("SLW: No image found !\n");
+ return false;
+ }
+
+ /* Check actual image size */
+ rc = sbe_xip_get_scalar((void *)chip->slw_base, "image_size",
+ &chip->slw_image_size);
+ if (rc != 0) {
+ log_simple_error(&e_info(OPAL_RC_SLW_INIT),
+ "SLW: Error %lld reading SLW image size\n", rc);
+ /* XXX Panic ? */
+ chip->slw_base = 0;
+ chip->slw_bar_size = 0;
+ chip->slw_image_size = 0;
+ return false;
+ }
+ prlog(PR_DEBUG, "SLW: Image size from image: 0x%llx\n",
+ chip->slw_image_size);
+
+ if (chip->slw_image_size > chip->slw_bar_size) {
+ log_simple_error(&e_info(OPAL_RC_SLW_INIT),
+ "SLW: Built-in image size larger than BAR size !\n");
+ /* XXX Panic ? */
+ return false;
+ }
+ return true;
+
+}
+
+static void slw_late_init_p8(struct proc_chip *chip)
+{
+
+ prlog(PR_DEBUG, "SLW: late Init chip 0x%x\n", chip->id);
+
+ /* Patch SLW image */
+ slw_patch_regs(chip);
+
+}
+static void slw_init_chip_p8(struct proc_chip *chip)
+{
+ struct cpu_thread *c;
+
+ prlog(PR_DEBUG, "SLW: Init chip 0x%x\n", chip->id);
+ /* At power ON setup inits for fast-sleep */
+ for_each_available_core_in_chip(c, chip->id) {
+ idle_prepare_core(chip, c);
+ }
+}
+
+/* Workarounds while entering fast-sleep */
+
+static void fast_sleep_enter(void)
+{
+ uint32_t core = pir_to_core_id(this_cpu()->pir);
+ uint32_t chip_id = this_cpu()->chip_id;
+ struct cpu_thread *primary_thread;
+ uint64_t tmp;
+ int rc;
+
+ primary_thread = this_cpu()->primary;
+
+ rc = xscom_read(chip_id, XSCOM_ADDR_P8_EX(core, L2_FIR_ACTION1),
+ &tmp);
+ if (rc) {
+ prlog(PR_WARNING, "fast_sleep_enter XSCOM failed(1):"
+ " rc=%d chip_id=%d core=%d\n",
+ rc, chip_id, core);
+ return;
+ }
+
+ primary_thread->save_l2_fir_action1 = tmp;
+ primary_thread->in_fast_sleep = true;
+
+ tmp = tmp & ~0x0200000000000000ULL;
+ rc = xscom_write(chip_id, XSCOM_ADDR_P8_EX(core, L2_FIR_ACTION1),
+ tmp);
+ if (rc) {
+ prlog(PR_WARNING, "fast_sleep_enter XSCOM failed(2):"
+ " rc=%d chip_id=%d core=%d\n",
+ rc, chip_id, core);
+ return;
+ }
+ rc = xscom_read(chip_id, XSCOM_ADDR_P8_EX(core, L2_FIR_ACTION1),
+ &tmp);
+ if (rc) {
+ prlog(PR_WARNING, "fast_sleep_enter XSCOM failed(3):"
+ " rc=%d chip_id=%d core=%d\n",
+ rc, chip_id, core);
+ return;
+ }
+
+}
+
+/* Workarounds while exiting fast-sleep */
+
+void fast_sleep_exit(void)
+{
+ uint32_t core = pir_to_core_id(this_cpu()->pir);
+ uint32_t chip_id = this_cpu()->chip_id;
+ struct cpu_thread *primary_thread;
+ int rc;
+
+ primary_thread = this_cpu()->primary;
+ primary_thread->in_fast_sleep = false;
+
+ rc = xscom_write(chip_id, XSCOM_ADDR_P8_EX(core, L2_FIR_ACTION1),
+ primary_thread->save_l2_fir_action1);
+ if (rc) {
+ prlog(PR_WARNING, "fast_sleep_exit XSCOM failed:"
+ " rc=%d chip_id=%d core=%d\n",
+ rc, chip_id, core);
+ return;
+ }
+}
+
+/*
+ * Setup and cleanup method for fast-sleep workarounds
+ * state = 1 fast-sleep
+ * enter = 1 Enter state
+ * enter = 0 Exit state
+ */
+
+static int64_t opal_config_cpu_idle_state(uint64_t state, uint64_t enter)
+{
+ /* Only fast-sleep for now */
+ if (state != 1)
+ return OPAL_PARAMETER;
+
+ switch(enter) {
+ case 1:
+ fast_sleep_enter();
+ break;
+ case 0:
+ fast_sleep_exit();
+ break;
+ default:
+ return OPAL_PARAMETER;
+ }
+
+ return OPAL_SUCCESS;
+}
+
+opal_call(OPAL_CONFIG_CPU_IDLE_STATE, opal_config_cpu_idle_state, 2);
+
+int64_t opal_slw_set_reg(uint64_t cpu_pir, uint64_t sprn, uint64_t val)
+{
+
+ struct cpu_thread *c = find_cpu_by_pir(cpu_pir);
+ struct proc_chip *chip;
+ int rc;
+
+ if (!c) {
+ prerror("SLW: Unknown thread with pir %x\n", (u32) cpu_pir);
+ return OPAL_PARAMETER;
+ }
+
+ chip = get_chip(c->chip_id);
+ if (!chip) {
+ prerror("SLW: Unknown chip for thread with pir %x\n",
+ (u32) cpu_pir);
+ return OPAL_PARAMETER;
+ }
+
+ if (proc_gen >= proc_gen_p9) {
+ if (!has_deep_states) {
+ prlog(PR_INFO, "SLW: Deep states not enabled\n");
+ return OPAL_SUCCESS;
+ }
+
+ if (wakeup_engine_state != WAKEUP_ENGINE_PRESENT) {
+ log_simple_error(&e_info(OPAL_RC_SLW_REG),
+ "SLW: wakeup_engine in bad state=%d chip=%x\n",
+ wakeup_engine_state,chip->id);
+ return OPAL_INTERNAL_ERROR;
+ }
+ if (proc_gen == proc_gen_p9) {
+ rc = p9_stop_save_cpureg((void *)chip->homer_base,
+ sprn, val, cpu_pir);
+ } else {
+ rc = proc_stop_save_cpureg((void *)chip->homer_base,
+ sprn, val, cpu_pir);
+ }
+
+ } else if (proc_gen == proc_gen_p8) {
+ int spr_is_supported = 0;
+ void *image;
+ int i;
+
+ /* Check if the SPR is supported by libpore */
+ for (i = 0; i < SLW_SPR_REGS_SIZE ; i++) {
+ if (sprn == SLW_SPR_REGS[i].value) {
+ spr_is_supported = 1;
+ break;
+ }
+ }
+ if (!spr_is_supported) {
+ log_simple_error(&e_info(OPAL_RC_SLW_REG),
+ "SLW: Trying to set unsupported spr for CPU %x\n",
+ c->pir);
+ return OPAL_UNSUPPORTED;
+ }
+ image = (void *)chip->slw_base;
+ rc = p8_pore_gen_cpureg_fixed(image, P8_SLW_MODEBUILD_SRAM,
+ sprn, val,
+ cpu_get_core_index(c),
+ cpu_get_thread_index(c));
+ } else {
+ log_simple_error(&e_info(OPAL_RC_SLW_REG),
+ "SLW: proc_gen not supported\n");
+ return OPAL_UNSUPPORTED;
+
+ }
+
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_SLW_REG),
+ "SLW: Failed to set spr %llx for CPU %x, RC=0x%x\n",
+ sprn, c->pir, rc);
+ return OPAL_INTERNAL_ERROR;
+ }
+ prlog(PR_DEBUG, "SLW: restore spr:0x%llx on c:0x%x with 0x%llx\n",
+ sprn, c->pir, val);
+ return OPAL_SUCCESS;
+
+}
+
+opal_call(OPAL_SLW_SET_REG, opal_slw_set_reg, 3);
+
+void slw_init(void)
+{
+ struct proc_chip *chip;
+
+ if (proc_chip_quirks & QUIRK_MAMBO_CALLOUTS) {
+ wakeup_engine_state = WAKEUP_ENGINE_NOT_PRESENT;
+ add_cpu_idle_state_properties();
+ return;
+ }
+ if (proc_gen == proc_gen_p8) {
+ for_each_chip(chip) {
+ slw_init_chip_p8(chip);
+ if(slw_image_check_p8(chip))
+ wakeup_engine_state = WAKEUP_ENGINE_PRESENT;
+ if (wakeup_engine_state == WAKEUP_ENGINE_PRESENT)
+ slw_late_init_p8(chip);
+ }
+ p8_sbe_init_timer();
+ } else if (proc_gen == proc_gen_p9) {
+ for_each_chip(chip) {
+ slw_init_chip_p9(chip);
+ if(slw_image_check_p9(chip))
+ wakeup_engine_state = WAKEUP_ENGINE_PRESENT;
+ if (wakeup_engine_state == WAKEUP_ENGINE_PRESENT)
+ slw_late_init_p9(chip);
+ }
+ } else if (proc_gen == proc_gen_p10) {
+ for_each_chip(chip) {
+ slw_init_chip_p10(chip);
+ if(slw_image_check_p9(chip))
+ wakeup_engine_state = WAKEUP_ENGINE_PRESENT;
+ if (wakeup_engine_state == WAKEUP_ENGINE_PRESENT) {
+ slw_late_init_p10(chip);
+ }
+ }
+ }
+ add_cpu_idle_state_properties();
+}
diff --git a/roms/skiboot/hw/test/Makefile.check b/roms/skiboot/hw/test/Makefile.check
new file mode 100644
index 000000000..45eb8072f
--- /dev/null
+++ b/roms/skiboot/hw/test/Makefile.check
@@ -0,0 +1,29 @@
+# -*-Makefile-*-
+SUBDIRS += hw/test/
+HW_TEST := hw/test/phys-map-test hw/test/run-port80h
+
+.PHONY : hw-check
+hw-check: $(HW_TEST:%=%-check)
+
+.PHONY : hw-coverage
+hw-coverage: $(HW_TEST:%=%-gcov-run)
+
+check: hw-check
+coverage: hw-coverage
+
+$(HW_TEST:%=%-gcov-run) : %-run: %
+ $(call QTEST, TEST-COVERAGE ,$< , $<)
+
+$(HW_TEST:%=%-check) : %-check: %
+ $(call QTEST, RUN-TEST ,$(VALGRIND) $<, $<)
+
+$(HW_TEST) : % : %.c hw/phys-map.o
+ $(call Q, HOSTCC ,$(HOSTCC) $(HOSTCFLAGS) -O0 -g -I include -I . -o $@ $<, $<)
+
+$(HW_TEST:%=%-gcov): %-gcov : %.c %
+ $(call QTEST, HOSTCC ,$(HOSTCC) $(HOSTCFLAGS) $(HOSTGCOVCFLAGS) -I include -I . -lgcov -o $@ $<, $<)
+
+clean: hw-clean
+
+hw-clean:
+ $(RM) -f hw/test/*.[od] $(HW_TEST) $(HW_TEST:%=%-gcov)
diff --git a/roms/skiboot/hw/test/phys-map-test.c b/roms/skiboot/hw/test/phys-map-test.c
new file mode 100644
index 000000000..d507175fe
--- /dev/null
+++ b/roms/skiboot/hw/test/phys-map-test.c
@@ -0,0 +1,203 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Physical memory map test
+ *
+ * Copyright 2013-2017 IBM Corp.
+ */
+
+#include "../../core/test/stubs.c"
+#include "../phys-map.c"
+
+enum proc_gen proc_gen;
+
+static inline void print_entry(const struct phys_map_entry *e)
+{
+ printf("type:%i index:%i addr:%016lx size:%016lx",
+ e->type, e->index, e->addr, e->size);
+}
+
+/* Check table directly for overlaps */
+static void check_table_directly(void)
+{
+ const struct phys_map_entry *e, *prev;
+ uint64_t start, end, pstart, pend;
+ bool passed;
+
+ /* Loop over table entries ... */
+ for (e = phys_map->table; !phys_map_entry_null(e); e++) {
+
+ start = e->addr;
+ end = e->addr + e->size;
+ /* ... see if they overlap with previous entries */
+ for (prev = phys_map->table; prev != e; prev++) {
+ passed = true;
+ /* Check for overlapping regions */
+ pstart = prev->addr;
+ pend = prev->addr + prev->size;
+ if ((start > pstart) && (start < pend))
+ passed = false;
+ if ((end > pstart) && (end < pend))
+ passed = false;
+
+ /* Check for duplicate entries */
+ if ((e->type == prev->type) &&
+ (e->index == prev->index))
+ passed = false;
+
+ if (passed)
+ continue;
+
+ printf("Phys map direct test FAILED: Entry overlaps\n");
+ printf("First: ");
+ print_entry(prev);
+ printf("\n");
+ printf("Second: ");
+ print_entry(e);
+ printf("\n");
+ assert(0);
+ }
+ }
+}
+
+struct map_call_entry {
+ uint64_t start;
+ uint64_t end;
+};
+
+static inline bool map_call_entry_null(const struct map_call_entry *t)
+{
+ if ((t->start == 0) &&
+ (t->end == 0))
+ return true;
+ return false;
+}
+
+/* Check calls to map to see if they overlap.
+ * Creates a new table for each of the entries it gets to check against
+ */
+
+/* Pick a chip ID, any ID. */
+#define FAKE_CHIP_ID 8
+
+struct proc_chip *get_chip(uint32_t chip_id __unused)
+{
+ return NULL;
+}
+
+static void check_map_call(void)
+{
+ uint64_t start, size, end;
+ const struct phys_map_entry *e;
+ struct map_call_entry *tbl, *t, *tnext;
+ int tbl_size = 0;
+ bool passed;
+
+ for (e = phys_map->table; !phys_map_entry_null(e); e++)
+ tbl_size++;
+
+ tbl_size++; /* allow for null entry at end */
+ tbl_size *= sizeof(struct map_call_entry);
+ tbl = malloc(tbl_size);
+ assert(tbl != NULL);
+ memset(tbl, 0, tbl_size);
+
+ /* Loop over table entries ... */
+ for (e = phys_map->table; !phys_map_entry_null(e); e++) {
+ __phys_map_get(FAKE_CHIP_ID, FAKE_CHIP_ID, e->type, e->index, &start, &size);
+
+ /* Check for alignment */
+ if ((e->type != SYSTEM_MEM) && (e->type != RESV)) {
+ /* Size is power of 2? */
+ assert(__builtin_popcountl(size) == 1);
+ /* Start is aligned to size? */
+ assert((start % size) == 0);
+ }
+
+ end = start + size;
+ for (t = tbl; !map_call_entry_null(t); t++) {
+ passed = true;
+
+ /* Check for overlapping regions */
+ if ((start > t->start) && (start < t->end))
+ passed = false;
+ if ((end > t->start) && (end < t->end))
+ passed = false;
+
+ if (passed)
+ continue;
+
+ printf("Phys map call test FAILED: Entry overlaps\n");
+ printf("First: addr:%016lx size:%016lx\n",
+ t->start, t->end - t->start);
+ printf("Second: addr:%016lx size:%016lx\n ",
+ start, size);
+ print_entry(e);
+ printf("\n");
+ assert(0);
+ }
+ /* Insert entry at end of table */
+ t->start = start;
+ t->end = end;
+ }
+
+ for (t = tbl; !map_call_entry_null(t + 1); t++) {
+ tnext = t + 1;
+ /* Make sure the table is sorted */
+ if (t->start > tnext->start) {
+ printf("Phys map test FAILED: Entry not sorted\n");
+ printf("First: addr:%016lx size:%016lx\n",
+ t->start, t->end - t->start);
+ printf("Second: addr:%016lx size:%016lx\n",
+ tnext->start, tnext->end - tnext->start);
+ assert(0);
+ }
+
+ /* Look for holes in the table in MMIO region */
+ /* We assume over 1PB is MMIO. */
+ if ((t->end != tnext->start) &&
+ (t->start > 0x0004000000000000)) {
+ printf("Phys map test FAILED: Hole in map\n");
+ printf("First: addr:%016lx size:%016lx\n",
+ t->start, t->end - t->start);
+ printf("Second: addr:%016lx size:%016lx\n",
+ tnext->start, tnext->end - tnext->start);
+ assert(0);
+ }
+ }
+
+ free(tbl);
+}
+
+/* Fake PVR definitions. See include/processor.h */
+unsigned long fake_pvr[] = {
+ 0x004e0200, /* PVR_P9 */
+ 0x004f0100, /* PVR_P9P */
+ 0x00800100, /* PVR_P10 */
+};
+
+int main(void)
+{
+ for (int i = 0; i < ARRAY_SIZE(fake_pvr); i++) {
+ switch(PVR_TYPE(fake_pvr[i])) {
+ case PVR_TYPE_P9:
+ case PVR_TYPE_P9P:
+ proc_gen = proc_gen_p9;
+ break;
+ case PVR_TYPE_P10:
+ proc_gen = proc_gen_p10;
+ break;
+ default:
+ printf("Unknown PVR 0x%lx\n", fake_pvr[i]);
+ return 1;
+ break;
+ }
+
+ phys_map_init(fake_pvr[i]);
+
+ /* Run tests */
+ check_table_directly();
+ check_map_call();
+ }
+
+ return(0);
+}
diff --git a/roms/skiboot/hw/test/run-port80h.c b/roms/skiboot/hw/test/run-port80h.c
new file mode 100644
index 000000000..860a4244d
--- /dev/null
+++ b/roms/skiboot/hw/test/run-port80h.c
@@ -0,0 +1,99 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Test result of our LPC port 80h boot progress code
+ *
+ * Copyright 2018-2019 IBM Corp.
+ */
+
+#include <stdio.h>
+#include <stdarg.h>
+#include <stdint.h>
+#include <assert.h>
+
+#define __unused __attribute__((unused))
+
+#define __LPC_H
+
+uint8_t port80;
+uint16_t port8x;
+
+static int64_t lpc_probe_write(int addr_type __unused, uint32_t addr,
+ uint32_t data, uint32_t sz)
+{
+ assert((addr - 0x80) <= 2);
+ assert(sz == 1);
+ if (addr == 0x80)
+ port80 = data;
+ if (addr == 0x81)
+ port8x = data << 8 | (port8x & 0xff);
+ if (addr == 0x82)
+ port8x = (port8x & 0xff00) | data;
+ return 0;
+}
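+
+/*
+ * The stub above records what the port 80h code writes: port 0x80 holds
+ * the single-byte progress code, while ports 0x81/0x82 form a 16-bit
+ * extended code (0x81 = high byte, 0x82 = low byte). The asserts in
+ * main() below check both against the expected encodings.
+ */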
+
+#include "op-panel.h"
+
+void op_display_lpc(enum op_severity s, enum op_module m, uint16_t c);
+
+#include "../lpc-port80h.c"
+#include "../../core/test/stubs.c"
+
+enum proc_chip_quirks proc_chip_quirks;
+
+int main(void)
+{
+ op_display_lpc(OP_LOG, OP_MOD_INIT, 0x00);
+ assert(port80 == 0x80);
+ assert(port8x == 0x8000);
+ op_display_lpc(OP_WARN, OP_MOD_INIT, 0x00);
+ assert(port80 == 0x82);
+ assert(port8x == 0x8002);
+ op_display_lpc(OP_ERROR, OP_MOD_INIT, 0x00);
+ assert(port80 == 0x81);
+ assert(port8x == 0x8001);
+ op_display_lpc(OP_FATAL, OP_MOD_INIT, 0x00);
+ assert(port80 == 0x83);
+ assert(port8x == 0x8003);
+ op_display_lpc(OP_FATAL, OP_MOD_INIT, 0x0f);
+ assert(port80 == 0xBF);
+ assert(port8x == 0x803F);
+ op_display_lpc(OP_LOG, OP_MOD_INIT, 0x0f);
+ assert(port80 == 0xBC);
+ assert(port8x == 0x803C);
+ op_display_lpc(OP_FATAL, OP_MOD_CORE, 0x6666);
+ assert(port80 == 0xBF);
+ assert(port8x == 0x803F);
+ op_display_lpc(OP_LOG, OP_MOD_INIT, 0x01);
+ assert(port80 == 0x84);
+ assert(port8x == 0x8004);
+ op_display_lpc(OP_LOG, OP_MOD_CPU, 0x05);
+ assert(port80 == 0xC4);
+ assert(port8x == 0xC014);
+ op_display_lpc(OP_LOG, OP_MOD_LOCK, 0x07);
+ assert(port80 == 0xDC);
+ assert(port8x == 0xD01C);
+ op_display_lpc(OP_FATAL, OP_MOD_LOCK, 0x07);
+ assert(port80 == 0xDF);
+ assert(port8x == 0xD01F);
+ op_display_lpc(OP_FATAL, OP_MOD_MEM, 0x07);
+ assert(port80 == 0xEF);
+ assert(port8x == 0xE01F);
+ op_display_lpc(OP_WARN, OP_MOD_MEM, 0x02);
+ assert(port80 == 0xEA);
+ assert(port8x == 0xE00A);
+ op_display_lpc(OP_WARN, OP_MOD_CHIPTOD, 0x02);
+ assert(port80 == 0xFA);
+ assert(port8x == 0xF00A);
+
+ /*
+ * We can't assert that OP_MOD_FSP is invalid as we'd end up
+ * trying to set port80 in the assert path
+ */
+ op_display_lpc(OP_LOG, OP_MOD_FSP, 0x00);
+ assert(port80 == 0x80);
+ assert(port8x == 0x8000);
+ op_display_lpc(OP_LOG, OP_MOD_FSPCON, 0x00);
+ assert(port80 == 0x80);
+ assert(port8x == 0x8000);
+ return 0;
+}
diff --git a/roms/skiboot/hw/vas.c b/roms/skiboot/hw/vas.c
new file mode 100644
index 000000000..0dbe0bcda
--- /dev/null
+++ b/roms/skiboot/hw/vas.c
@@ -0,0 +1,639 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/* Copyright 2013-2018 IBM Corp. */
+
+#include <skiboot.h>
+#include <chip.h>
+#include <phys-map.h>
+#include <xscom.h>
+#include <io.h>
+#include <xive.h>
+#include <interrupts.h>
+#include <nvram.h>
+#include <vas.h>
+
+#define vas_err(__fmt,...) prlog(PR_ERR,"VAS: " __fmt, ##__VA_ARGS__)
+
+#ifdef VAS_VERBOSE_DEBUG
+#define vas_vdbg(__x,__fmt,...) prlog(PR_DEBUG,"VAS: " __fmt, ##__VA_ARGS__)
+#else
+#define vas_vdbg(__x,__fmt,...) do { } while (0)
+#endif
+
+static int vas_initialized;
+
+struct vas {
+ uint32_t chip_id;
+ uint32_t vas_id;
+ uint64_t xscom_base;
+ uint64_t wcbs;
+ uint32_t vas_irq;
+ uint64_t vas_port;
+};
+
+static inline void get_hvwc_mmio_bar(int chipid, uint64_t *start, uint64_t *len)
+{
+ phys_map_get(chipid, VAS_HYP_WIN, 0, start, len);
+}
+
+static inline void get_uwc_mmio_bar(int chipid, uint64_t *start, uint64_t *len)
+{
+ phys_map_get(chipid, VAS_USER_WIN, 0, start, len);
+}
+
+static inline uint64_t compute_vas_scom_addr(struct vas *vas, uint64_t reg)
+{
+ return vas->xscom_base + reg;
+}
+
+static int vas_scom_write(struct proc_chip *chip, uint64_t reg, uint64_t val)
+{
+ int rc;
+ uint64_t addr;
+
+ addr = compute_vas_scom_addr(chip->vas, reg);
+
+ rc = xscom_write(chip->id, addr, val);
+ if (rc != OPAL_SUCCESS) {
+ vas_err("Error writing 0x%llx to 0x%llx, rc %d\n", val, addr,
+ rc);
+ }
+
+ return rc;
+}
+
+/*
+ * Return true if NX crypto/compression is enabled on this processor.
+ *
+ * On POWER8, NX-842 crypto and compression are allowed, but they do not
+ * use VAS (return true).
+ *
+ * On POWER9, NX 842 and GZIP compression use VAS, but the PASTE instruction
+ * (and hence VAS) is not enabled in the following revisions:
+ *
+ * - Nimbus DD1.X, DD2.01, DD2.1
+ * - Cumulus DD1.0
+ *
+ * Return false for these revisions. Return true otherwise.
+ */
+__attrconst inline bool vas_nx_enabled(void)
+{
+ uint32_t pvr;
+ int major, minor;
+ struct proc_chip *chip;
+
+ chip = next_chip(NULL);
+
+ pvr = mfspr(SPR_PVR);
+ major = PVR_VERS_MAJ(pvr);
+ minor = PVR_VERS_MIN(pvr);
+
+ switch (chip->type) {
+ case PROC_CHIP_P9_NIMBUS:
+ return (major > 2 || (major == 2 && minor > 1));
+ case PROC_CHIP_P9_CUMULUS:
+ return (major > 1 || minor > 0);
+ default:
+ return true;
+ }
+}
+
+/* Interface for NX - make sure VAS is fully initialized first */
+__attrconst inline uint64_t vas_get_hvwc_mmio_bar(const int chipid)
+{
+ uint64_t addr;
+
+ if (!vas_initialized)
+ return 0ULL;
+
+ get_hvwc_mmio_bar(chipid, &addr, NULL);
+
+ return addr;
+}
+
+/* Interface for NX - make sure VAS is fully initialized first */
+__attrconst uint64_t vas_get_wcbs_bar(int chipid)
+{
+ struct proc_chip *chip;
+
+ if (!vas_initialized)
+ return 0ULL;
+
+ chip = get_chip(chipid);
+ if (!chip)
+ return 0ULL;
+
+ return chip->vas->wcbs;
+}
+
+static int init_north_ctl(struct proc_chip *chip)
+{
+ uint64_t val = 0ULL;
+
+ val = SETFIELD(VAS_64K_MODE_MASK, val, true);
+ val = SETFIELD(VAS_ACCEPT_PASTE_MASK, val, true);
+ val = SETFIELD(VAS_ENABLE_WC_MMIO_BAR, val, true);
+ val = SETFIELD(VAS_ENABLE_UWC_MMIO_BAR, val, true);
+ val = SETFIELD(VAS_ENABLE_RMA_MMIO_BAR, val, true);
+
+ return vas_scom_write(chip, VAS_MISC_N_CTL, val);
+}
+
+/*
+ * Ensure paste instructions are not accepted and MMIO BARs are disabled.
+ */
+static inline int reset_north_ctl(struct proc_chip *chip)
+{
+ return vas_scom_write(chip, VAS_MISC_N_CTL, 0ULL);
+}
+
+static void reset_fir(struct proc_chip *chip)
+{
+ vas_scom_write(chip, VAS_FIR0, 0x0000000000000000ULL);
+ /* From VAS workbook */
+ vas_scom_write(chip, VAS_FIR_MASK, 0x000001000001ffffULL);
+ vas_scom_write(chip, VAS_FIR_ACTION0, 0xf800fdfc0001ffffull);
+ vas_scom_write(chip, VAS_FIR_ACTION1, 0xf8fffefffffc8000ull);
+}
+
+/* VAS workbook: Section 1.3.3.1: Send Message w/ Paste Commands (cl_rma_w) */
+/* P9 paste base address format */
+#define P9_RMA_LSMP_64K_SYS_ID PPC_BITMASK(8, 12)
+#define P9_RMA_LSMP_64K_NODE_ID PPC_BITMASK(15, 18)
+#define P9_RMA_LSMP_64K_CHIP_ID PPC_BITMASK(19, 21)
+
+/* Paste base address format (on P10 or later) */
+#define RMA_FOREIGN_ADDR_ENABLE PPC_BITMASK(8, 11)
+#define RMA_TOPOLOGY_INDEX PPC_BITMASK(15, 19)
+
+#define RMA_LSMP_WINID_START_BIT 32
+#define RMA_LSMP_WINID_NUM_BITS 16
+
+/*
+ * The start/base of the paste BAR is computed using the tables 1.1 through
+ * 1.4 in Section 1.3.3.1 (Send Message w/Paste Commands (cl_rma_w)) of VAS
+ * P9 Workbook.
+ *
+ * With 64K mode and Large SMP Mode the bits are used as follows:
+ *
+ * Bits Values Comments
+ * --------------------------------------
+ * 0:7 0b 0000_0000 Reserved
+ * 8:12 0b 0000_1 System id/Foreign Index 0:4
+ * 13:14 0b 00 Foreign Index 5:6
+ *
+ * 15:18 0 through 15 Node id (0 through 15)
+ * 19:21 0 through 7 Chip id (0 through 7)
+ * 22:23 0b 00 Unused, Foreign index 7:8
+ *
+ * 24:31 0b 0000_0000 RPN 0:7, Reserved
+ * 32:47 0 through 64K Send Window Id
+ * 48:51 0b 0000 Spare
+ *
+ * 52 0b 0 Reserved
+ * 53 0b 1 Report Enable (Set to 1 for NX).
+ * 54 0b 0 Reserved
+ *
+ * 55:56 0b 00 Snoop Bus
+ * 57:63 0b 0000_000 Reserved
+ *
+ * Except for a few bits, the small SMP mode computation is similar.
+ *
+ * TODO: Detect and compute address for small SMP mode.
+ *
+ * Example: For Node 0, Chip 0, Window id 4, Report Enable 1:
+ *
+ * Byte0 Byte1 Byte2 Byte3 Byte4 Byte5 Byte6 Byte7
+ * 00000000 00001000 00000000 00000000 00000000 00000100 00000100 00000000
+ * | || | | | |
+ * +-+-++++ +-------+-------+ v
+ * | | | Report Enable
+ * v v v
+ * Node Chip Window id 4
+ *
+ * Thus the paste address for window id 4 is 0x00080000_00040400 and
+ * the _base_ paste address for Node 0 Chip 0 is 0x00080000_00000000.
+ */
+
+static void p9_get_rma_bar(int chipid, uint64_t *val)
+{
+ uint64_t v;
+
+ v = 0ULL;
+ v = SETFIELD(P9_RMA_LSMP_64K_SYS_ID, v, 1);
+ v = SETFIELD(P9_RMA_LSMP_64K_NODE_ID, v, P9_GCID2NODEID(chipid));
+ v = SETFIELD(P9_RMA_LSMP_64K_CHIP_ID, v, P9_GCID2CHIPID(chipid));
+
+ *val = v;
+}
+
+/*
+ * The start/base of the paste BAR is computed using the tables 1.1 through
+ * 1.3 in Section 1.3.3.1 (Send Message w/Paste Commands (cl_rma_w)) of VAS
+ * P10 Workbook.
+ *
+ * With 64K mode and Large SMP Mode the bits are used as follows:
+ *
+ * Bits Values Comments
+ * --------------------------------------
+ * 0:7 0b 0000_0000 Reserved
+ * 8:11 0b 0001 Foreign Address Enable
+ * 12 0b 0 SMF
+ * 13:14 0b 00 Memory Select
+ *
+ * 15:19 0 through 16 Topology Index
+ * 20:23 0b 0000 Chip Internal Address
+ *
+ * 24:31 0b 0000_0000 RPN 0:7, Reserved
+ * 32:47 0 through 64K Send Window Id
+ * 48:51 0b 0000 Spare
+ *
+ * 52 0b 0 Reserved
+ * 53 0b 1 Report Enable (Set to 1 for NX).
+ * 54 0b 0 Reserved
+ *
+ * 55:56 0b 00 Snoop Bus
+ * 57:63 0b 0000_000 Reserved
+ *
+ * Example: For Node 0, Chip 0, Window id 4, Report Enable 1:
+ *
+ * Byte0 Byte1 Byte2 Byte3 Byte4 Byte5 Byte6 Byte7
+ * 00000000 00010000 00000000 00000000 00000000 00000100 00000100 00000000
+ * | | | | |
+ * +---+ +-------+-------+ v
+ * | | Report Enable
+ * v v
+ * Topology Index Window id 4
+ *
+ * Thus the paste address for window id 4 is 0x00100000_00040400 and
+ * the _base_ paste address for Node 0 Chip 0 is 0x00100000_00000000.
+ *
+ * Note: Bit 11 (Foreign Address Enable) is set only for paste base address.
+ * Not for VAS/NX RMA BAR. RA(0:12) = 0 for VAS/NX RMA BAR.
+ */
+
+static void get_rma_bar(struct proc_chip *chip, uint64_t *val)
+{
+ uint64_t v;
+
+ v = 0ULL;
+ v = SETFIELD(RMA_TOPOLOGY_INDEX, v, chip->primary_topology);
+
+ *val = v;
+}
+
+/* Interface for NX - make sure VAS is fully initialized first */
+__attrconst uint64_t vas_get_rma_bar(int chipid)
+{
+ struct proc_chip *chip;
+ uint64_t addr;
+
+ if (!vas_initialized)
+ return 0ULL;
+
+ chip = get_chip(chipid);
+ if (!chip)
+ return 0ULL;
+
+ get_rma_bar(chip, &addr);
+
+ return addr;
+}
+
+/*
+ * Initialize RMA BAR on this chip to correspond to its node/chip id.
+ * This will cause VAS to accept paste commands targeted for this chip.
+ * Initialize RMA Base Address Mask Register (BAMR) to its default value.
+ */
+static int init_rma(struct proc_chip *chip)
+{
+ int rc;
+ uint64_t val;
+
+ if (proc_gen == proc_gen_p9)
+ p9_get_rma_bar(chip->id, &val);
+ else
+ get_rma_bar(chip, &val);
+
+ rc = vas_scom_write(chip, VAS_RMA_BAR, val);
+ if (rc)
+ return rc;
+
+ val = SETFIELD(VAS_RMA_BAMR_ADDR_MASK, 0ULL, 0xFFFC0000000ULL);
+
+ return vas_scom_write(chip, VAS_RMA_BAMR, val);
+}
+
+/*
+ * get_paste_bar():
+ *
+ * Compute and return the "paste base address region" for @chipid. This
+ * BAR contains the "paste" addreses for all windows on the chip. Linux
+ * uses this paste BAR to compute the hardware paste address of a (send)
+ * window using:
+ *
+ * paste_addr = base + (winid << shift)
+ *
+ * where winid is the window index and shift is computed as:
+ *
+ * start = RMA_LSMP_WINID_START_BIT;
+ * nbits = RMA_LSMP_WINID_NUM_BITS;
+ * shift = 63 - (start + nbits - 1);
+ *
+ * See also get_paste_bitfield() below, which is used to export the 'start'
+ * and 'nbits' to Linux through the DT.
+ *
+ * Each chip supports VAS_WINDOWS_PER_CHIP (64K on Power9) windows. To
+ * provide proper isolation, the paste address for each window is on a
+ * separate page. Thus with a page size of 64K, the length of the paste
+ * BAR for a chip is VAS_WINDOWS_PER_CHIP times 64K (or 4GB for Power9).
+ *
+ */
+#define VAS_PASTE_BAR_LEN (1ULL << 32) /* 4GB - see above */
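+
+/*
+ * Worked example of the shift computation described above: with
+ * start = RMA_LSMP_WINID_START_BIT (32) and nbits = RMA_LSMP_WINID_NUM_BITS
+ * (16), shift = 63 - (32 + 16 - 1) = 16, so window id 4 pastes at
+ * base + (4 << 16) = base + 0x40000. That matches the workbook example
+ * address 0x..._00040400, whose extra 0x400 is the Report Enable bit
+ * (bit 53).
+ */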
+
+static inline void get_paste_bar(int chipid, uint64_t *start, uint64_t *len)
+{
+ struct proc_chip *chip;
+ uint64_t val;
+
+ if (proc_gen == proc_gen_p9)
+ p9_get_rma_bar(chipid, &val);
+ else {
+ chip = get_chip(chipid);
+ if (!chip)
+ return;
+
+ get_rma_bar(chip, &val);
+
+ /*
+ * RA(11) (Foreign Address Enable) is set only for paste
+ * base address.
+ */
+ val = SETFIELD(RMA_FOREIGN_ADDR_ENABLE, val, 1);
+ }
+
+ *start = val;
+ *len = VAS_PASTE_BAR_LEN;
+}
+
+/*
+ * get_paste_bitfield():
+ *
+ * As explained in the function header for get_paste_bar(), the window
+ * id is encoded in bits 32:47 of the paste address. Export this bitfield
+ * to Linux via the device tree as a reg property (with start bit and
+ * number of bits).
+ */
+static inline void get_paste_bitfield(uint64_t *start, uint64_t *n_bits)
+{
+ *start = (uint64_t)RMA_LSMP_WINID_START_BIT;
+ *n_bits = (uint64_t)RMA_LSMP_WINID_NUM_BITS;
+}
+
+/*
+ * Window Context MMIO (WCM) Region for each chip is assigned in the P9
+ * MMIO MAP spreadsheet. Write this value to the SCOM address associated
+ * with WCM_BAR.
+ */
+static int init_wcm(struct proc_chip *chip)
+{
+ uint64_t wcmbar;
+
+ get_hvwc_mmio_bar(chip->id, &wcmbar, NULL);
+
+ /*
+ * Write the entire WCMBAR address to the SCOM address. VAS will
+ * extract bits that it thinks are relevant i.e bits 8..38
+ */
+ return vas_scom_write(chip, VAS_WCM_BAR, wcmbar);
+}
+
+/*
+ * OS/User Window Context MMIO (UWCM) Region for each chip is assigned in the
+ * P9 MMIO MAP spreadsheet. Write this value to the SCOM address associated
+ * with UWCM_BAR.
+ */
+static int init_uwcm(struct proc_chip *chip)
+{
+ uint64_t uwcmbar;
+
+ get_uwc_mmio_bar(chip->id, &uwcmbar, NULL);
+
+ /*
+ * Write the entire UWCMBAR address to the SCOM address. VAS will
+ * extract bits that it thinks are relevant i.e bits 8..35.
+ */
+ return vas_scom_write(chip, VAS_UWCM_BAR, uwcmbar);
+}
+
+static inline void free_wcbs(struct proc_chip *chip)
+{
+ if (chip->vas->wcbs) {
+ free((void *)chip->vas->wcbs);
+ chip->vas->wcbs = 0ULL;
+ }
+}
+
+/*
+ * VAS needs a backing store for the 64K window contexts on a chip.
+ * (64K times 512 = 8MB). This region needs to be contiguous, so
+ * allocate during early boot. Then write the allocated address to
+ * the SCOM address for the Backing store BAR.
+ */
+static int alloc_init_wcbs(struct proc_chip *chip)
+{
+ int rc;
+ uint64_t wcbs;
+ size_t size;
+
+ /* align to the backing store size */
+ size = (size_t)VAS_WCBS_SIZE;
+ wcbs = (uint64_t)local_alloc(chip->id, size, size);
+ if (!wcbs) {
+ vas_err("Unable to allocate memory for backing store\n");
+ return -ENOMEM;
+ }
+ memset((void *)wcbs, 0ULL, size);
+
+ /*
+ * Write entire WCBS_BAR address to the SCOM address. VAS will extract
+ * relevant bits.
+ */
+ rc = vas_scom_write(chip, VAS_WCBS_BAR, wcbs);
+ if (rc != OPAL_SUCCESS)
+ goto out;
+
+ chip->vas->wcbs = wcbs;
+ return OPAL_SUCCESS;
+
+out:
+ free((void *)wcbs);
+ return rc;
+}
+
+static struct vas *alloc_vas(uint32_t chip_id, uint32_t vas_id, uint64_t base)
+{
+ struct vas *vas;
+
+ vas = zalloc(sizeof(struct vas));
+ assert(vas);
+
+ vas->chip_id = chip_id;
+ vas->vas_id = vas_id;
+ vas->xscom_base = base;
+
+ return vas;
+}
+
+static void create_mm_dt_node(struct proc_chip *chip)
+{
+ struct dt_node *dn;
+ struct vas *vas;
+ const char *compat;
+ uint64_t hvwc_start, hvwc_len;
+ uint64_t uwc_start, uwc_len;
+ uint64_t pbf_start, pbf_nbits;
+ uint64_t pbar_start = 0, pbar_len = 0;
+
+ vas = chip->vas;
+ get_hvwc_mmio_bar(chip->id, &hvwc_start, &hvwc_len);
+ get_uwc_mmio_bar(chip->id, &uwc_start, &uwc_len);
+ get_paste_bar(chip->id, &pbar_start, &pbar_len);
+ get_paste_bitfield(&pbf_start, &pbf_nbits);
+
+ if (proc_gen == proc_gen_p9)
+ compat = "ibm,power9-vas";
+ else
+ compat = "ibm,power10-vas";
+
+ dn = dt_new_addr(dt_root, "vas", hvwc_start);
+
+ dt_add_property_strings(dn, "compatible", compat,
+ "ibm,vas");
+
+ dt_add_property_u64s(dn, "reg", hvwc_start, hvwc_len,
+ uwc_start, uwc_len,
+ pbar_start, pbar_len,
+ pbf_start, pbf_nbits);
+
+ dt_add_property_cells(dn, "ibm,vas-id", vas->vas_id);
+ dt_add_property_cells(dn, "ibm,chip-id", chip->id);
+ if (vas->vas_irq) {
+ dt_add_property_cells(dn, "interrupts", vas->vas_irq, 0);
+ dt_add_property_cells(dn, "interrupt-parent",
+ get_ics_phandle());
+ dt_add_property_u64(dn, "ibm,vas-port", vas->vas_port);
+ }
+}
+
+/*
+ * Disable one VAS instance.
+ *
+ * Free memory and ensure chip does not accept paste instructions.
+ */
+static void disable_vas_inst(struct dt_node *np)
+{
+ struct proc_chip *chip;
+
+ chip = get_chip(dt_get_chip_id(np));
+
+ if (!chip->vas)
+ return;
+
+ free_wcbs(chip);
+
+ reset_north_ctl(chip);
+}
+
+static void vas_setup_irq(struct proc_chip *chip)
+{
+ uint64_t port;
+ uint32_t irq;
+
+ irq = xive_alloc_ipi_irqs(chip->id, 1, 64);
+ if (irq == XIVE_IRQ_ERROR) {
+ vas_err("Failed to allocate interrupt sources for chipID %d\n",
+ chip->id);
+ return;
+ }
+
+ vas_vdbg("trigger port: 0x%p\n", xive_get_trigger_port(irq));
+
+ port = (uint64_t)xive_get_trigger_port(irq);
+
+ chip->vas->vas_irq = irq;
+ chip->vas->vas_port = port;
+}
+
+/*
+ * Initialize one VAS instance and enable it if @enable is true.
+ */
+static int init_vas_inst(struct dt_node *np, bool enable)
+{
+ uint32_t vas_id;
+ uint64_t xscom_base;
+ struct proc_chip *chip;
+
+ chip = get_chip(dt_get_chip_id(np));
+ vas_id = dt_prop_get_u32(np, "ibm,vas-id");
+ xscom_base = dt_get_address(np, 0, NULL);
+
+ chip->vas = alloc_vas(chip->id, vas_id, xscom_base);
+
+ if (!enable) {
+ reset_north_ctl(chip);
+ return 0;
+ }
+
+ if (alloc_init_wcbs(chip))
+ return -1;
+
+ reset_fir(chip);
+
+ if (init_wcm(chip) || init_uwcm(chip) || init_north_ctl(chip) ||
+ init_rma(chip))
+ return -1;
+
+ /*
+ * Use NVRAM 'vas-user-space' config for backward compatibility
+ * to older kernels. Remove this option in future if not needed.
+ */
+ if (nvram_query_eq_dangerous("vas-user-space", "enable"))
+ vas_setup_irq(chip);
+
+ create_mm_dt_node(chip);
+
+ prlog(PR_INFO, "VAS: Initialized chip %d\n", chip->id);
+ return 0;
+
+}
+
+void vas_init(void)
+{
+ bool enabled;
+ struct dt_node *np;
+ const char *compat;
+
+ if (proc_gen == proc_gen_p9)
+ compat = "ibm,power9-vas-x";
+ else if (proc_gen == proc_gen_p10)
+ compat = "ibm,power10-vas-x";
+ else
+ return;
+
+ enabled = vas_nx_enabled();
+
+ dt_for_each_compatible(dt_root, np, compat) {
+ if (init_vas_inst(np, enabled))
+ goto out;
+ }
+
+ vas_initialized = enabled;
+ return;
+
+out:
+ dt_for_each_compatible(dt_root, np, compat)
+ disable_vas_inst(np);
+
+ vas_err("Disabled (failed initialization)\n");
+ return;
+}
diff --git a/roms/skiboot/hw/xive.c b/roms/skiboot/hw/xive.c
new file mode 100644
index 000000000..51b03549a
--- /dev/null
+++ b/roms/skiboot/hw/xive.c
@@ -0,0 +1,5234 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * XIVE: eXternal Interrupt Virtualization Engine. POWER9 interrupt
+ * controller
+ *
+ * Copyright (c) 2016-2019, IBM Corporation.
+ */
+
+#include <skiboot.h>
+#include <xscom.h>
+#include <chip.h>
+#include <io.h>
+#include <xive.h>
+#include <xive-p9-regs.h>
+#include <xscom-p9-regs.h>
+#include <interrupts.h>
+#include <timebase.h>
+#include <bitmap.h>
+#include <buddy.h>
+#include <phys-map.h>
+#include <p9_stop_api.H>
+
+/* Always notify from EQ to VP (no EOI on EQs). Will speed up
+ * EOIs at the expense of potentially higher powerbus traffic.
+ */
+#define EQ_ALWAYS_NOTIFY
+
+/* Verbose debug */
+#undef XIVE_VERBOSE_DEBUG
+
+/* Extra debug options used in debug builds */
+#ifdef DEBUG
+#define XIVE_DEBUG_DUPLICATES
+#define XIVE_PERCPU_LOG
+#define XIVE_DEBUG_INIT_CACHE_UPDATES
+#define XIVE_EXTRA_CHECK_INIT_CACHE
+#undef XIVE_CHECK_MISROUTED_IPI
+#define XIVE_CHECK_LOCKS
+#else
+#undef XIVE_DEBUG_DUPLICATES
+#undef XIVE_PERCPU_LOG
+#undef XIVE_DEBUG_INIT_CACHE_UPDATES
+#undef XIVE_EXTRA_CHECK_INIT_CACHE
+#undef XIVE_CHECK_MISROUTED_IPI
+#undef XIVE_CHECK_LOCKS
+#endif
+
+/*
+ *
+ * VSDs, blocks, set translation etc...
+ *
+ * This stuff confused me to no end so here's an attempt at explaining
+ * my understanding of it and how I use it in OPAL & Linux
+ *
+ * For the following data structures, the XIVE use a mechanism called
+ * Virtualization Structure Tables (VST) to manage the memory layout
+ * and access: ESBs (Event State Buffers, aka IPI sources), EAS/IVT
+ * (Event assignment structures), END/EQs (Notification descriptors
+ * aka event queues) and NVT/VPD (Notification Virtual Targets).
+ *
+ * These structures divide those tables into 16 "blocks". Each XIVE
+ * instance has a definition for all 16 blocks that can either represent
+ * an actual table in memory or a remote XIVE MMIO port to access a
+ * block that is owned by that remote XIVE.
+ *
+ * Our SW design will consist of allocating one block per chip (and thus
+ * per XIVE instance) for now, thus giving us up to 16 supported chips in
+ * the system. We may have to revisit that if we ever support systems with
+ * more than 16 chips but that isn't on our radar at the moment or if we
+ * want to do like pHyp on some machines and dedicate 2 blocks per chip
+ * for some structures.
+ *
+ * Thus we need to be careful that we never expose to Linux the concept
+ * of block and block boundaries, but instead we provide full number ranges
+ * so that consecutive blocks can be supported.
+ *
+ * We will pre-allocate some of the tables in order to support a "fallback"
+ * mode of operation where an old-style XICS is emulated via OPAL calls. This
+ * is achieved by having a default of one VP per physical thread associated
+ * with one EQ and one IPI. There are also enough EATs to cover all the PHBs.
+ *
+ * Similarly, for MMIO access, the BARs support what is called "set
+ * translation" which allows the BAR to be divided into a certain
+ * number of sets. The VC BAR (ESBs, ENDs, ...) supports 64 sets and
+ * the PC BAR supports 16. Each "set" can be routed to a specific
+ * block and offset within a block.
+ *
+ * For now, we will not use much of that functionality. We will use a
+ * fixed split between ESBs and ENDs for the VC BAR as defined by the
+ * constants below, and we will allocate all the PC BAR sets to the
+ * local block of that chip.
+ */
+
+#define XIVE_VSD_SIZE sizeof(u64)
+
+/* VC BAR contains set translations for the ESBs and the EQs.
+ *
+ * It's divided in 64 sets, each of which can be either ESB pages or EQ pages.
+ * The table configuring this is the EDT
+ *
+ * Additionally, the ESB pages come in pairs if Linux_Trig_Mode isn't enabled
+ * (which we won't enable for now as it assumes write-only permission, which
+ * the MMU doesn't support).
+ *
+ * To get started we just hard wire the following setup:
+ *
+ * VC_BAR size is 512G. We split it into 384G of ESBs (48 sets) and 128G
+ * of ENDs (16 sets) for the time being, i.e. each set is thus 8GB.
+ */
+
+#define VC_ESB_SETS 48
+#define VC_END_SETS 16
+#define VC_MAX_SETS 64
+
+/* The table configuring the PC set translation (16 sets) is the VDT */
+#define PC_MAX_SETS 16
+
+/* XXX This is currently the top limit on the number of ESB/SBE entries
+ * and EAS/IVT entries pre-allocated per chip. This should probably
+ * turn into a device-tree property or NVRAM setting, or maybe
+ * calculated from the amount of system RAM...
+ *
+ * This is currently set to 1M
+ *
+ * This is independent of the sizing of the MMIO space.
+ *
+ * WARNING: Due to how XICS emulation works, we cannot support more
+ * interrupts per chip at this stage as the full interrupt number
+ * (block + index) has to fit in a 24-bit number.
+ *
+ * That gives us a pre-allocated space of 256KB per chip for the state
+ * bits and 8M per chip for the EAS/IVT.
+ *
+ * Note: The HW interrupts from PCIe and similar other entities that
+ * use their own state bit array will have to share that IVT space,
+ * so we could potentially make the IVT size twice as big, but for now
+ * we will simply share it and ensure we don't hand out IPIs that
+ * overlap the HW interrupts.
+ *
+ * TODO: adjust the VC BAR range for IPI ESBs on this value
+ */
+
+#define XIVE_INT_ORDER 20 /* 1M interrupts */
+#define XIVE_INT_COUNT (1ul << XIVE_INT_ORDER)
+
+/*
+ * First interrupt number, also the first logical interrupt number
+ * allocated by Linux (the first numbers are reserved for ISA)
+ */
+#define XIVE_INT_FIRST 0x10
+
+/* Corresponding direct table sizes */
+
+#define SBE_PER_BYTE 4 /* PQ bits couples */
+#define SBE_SIZE (XIVE_INT_COUNT / SBE_PER_BYTE)
+#define IVT_SIZE (XIVE_INT_COUNT * sizeof(struct xive_ive))
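+/*
+ * In other words, with XIVE_INT_COUNT = 1M and 8-byte IVEs this comes to
+ * SBE_SIZE = 256K and IVT_SIZE = 8M per chip, matching the figures quoted
+ * above.
+ */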
+
+/* Use 64K for everything by default */
+#define XIVE_ESB_SHIFT (16 + 1) /* trigger + mgmt pages */
+#define XIVE_ESB_PAGE_SIZE (1ul << XIVE_ESB_SHIFT) /* 2 pages */
+
+/* Max number of EQs. We allocate an indirect table big enough so
+ * that when fully populated we can have that many EQs.
+ *
+ * The max number of EQs we support in our MMIO space is 128G/128K
+ * ie. 1M. Since one EQ is 8 words (32 bytes), a 64K page can hold
+ * 2K EQs. We need 512 pointers, ie, 4K of memory for the indirect
+ * table.
+ *
+ * TODO: adjust the VC BAR range for END ESBs on this value
+ */
+#define EQ_PER_PAGE (PAGE_SIZE / sizeof(struct xive_eq))
+
+#define XIVE_EQ_ORDER 20 /* 1M ENDs */
+#define XIVE_EQ_COUNT (1ul << XIVE_EQ_ORDER)
+#define XIVE_EQ_TABLE_SIZE ((XIVE_EQ_COUNT / EQ_PER_PAGE) * XIVE_VSD_SIZE)
+
+#define XIVE_EQ_SHIFT (16 + 1) /* ESn + ESe pages */
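+/*
+ * Working out the arithmetic above, assuming the 64K PAGE_SIZE and the
+ * 32-byte EQs described earlier:
+ *   EQ_PER_PAGE        = 64K / 32      = 2K
+ *   XIVE_EQ_TABLE_SIZE = (1M / 2K) * 8 = 4K  (512 VSD pointers)
+ */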
+
+/* Number of priorities (and thus EQDs) we allocate for each VP */
+#define NUM_INT_PRIORITIES 8
+
+/* Max priority number */
+#define XIVE_MAX_PRIO 7
+
+/* Priority used for the one queue in XICS emulation */
+#define XIVE_EMULATION_PRIO 7
+
+/* Priority used for gather/silent escalation (KVM) */
+#define XIVE_ESCALATION_PRIO 7
+
+/* Max number of VPs. We allocate an indirect table big enough so
+ * that when fully populated we can have that many VPs.
+ *
+ * The max number of VPs we support in our MMIO space is 64G/64K
+ * ie. 1M. Since one VP is 16 words (64 bytes), a 64K page can hold
+ * 1K VPs. We need 1024 pointers, i.e. 8K of memory for the indirect
+ * table.
+ *
+ * HOWEVER: A block supports only up to 512K VPs (19 bits of target
+ * in the EQ). Since we currently only support 1 block per chip,
+ * we will allocate half of the above. We might add support for
+ * 2 blocks per chip later if necessary.
+ *
+ * TODO: adjust the PC BAR range
+ */
+#define VP_PER_PAGE (PAGE_SIZE / sizeof(struct xive_vp))
+
+#define NVT_SHIFT 19 /* in sync with EQ_W6_NVT_INDEX */
+
+/*
+ * We use 8 priorities per VP and the number of EQs is configured to
+ * 1M. Therefore, our VP space is limited to 128k.
+ */
+#define XIVE_VP_ORDER (XIVE_EQ_ORDER - 3) /* 128k */
+#define XIVE_VP_COUNT (1ul << XIVE_VP_ORDER)
+#define XIVE_VP_TABLE_SIZE ((XIVE_VP_COUNT / VP_PER_PAGE) * XIVE_VSD_SIZE)
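+/*
+ * With the same 64K pages and the 64-byte VPs described above:
+ *   VP_PER_PAGE        = 64K / 64        = 1K
+ *   XIVE_VP_TABLE_SIZE = (128K / 1K) * 8 = 1K  (128 VSD pointers)
+ */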
+
+/*
+ * VP ids for HW threads.
+ *
+ * These values are hardcoded in the CAM line of the HW context and
+ * they depend on the thread id bits of the chip, 7 bits for P9.
+ *
+ * HW CAM Line |chip|000000000001|thrdid |
+ * 23bits 4 12 7
+ */
+#define XIVE_THREADID_SHIFT 7
+#define XIVE_HW_VP_BASE (1 << XIVE_THREADID_SHIFT)
+#define XIVE_HW_VP_COUNT (1 << XIVE_THREADID_SHIFT)
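+/*
+ * I.e. the VPs for HW threads occupy indices 0x80..0xff of each chip's
+ * block, one per possible 7-bit thread id (128 entries), matching the
+ * CAM line layout above.
+ */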
+
+/* The xive operation mode indicates the active "API" and corresponds
+ * to the "mode" parameter of the opal_xive_reset() call
+ */
+static enum {
+ XIVE_MODE_EMU = OPAL_XIVE_MODE_EMU,
+ XIVE_MODE_EXPL = OPAL_XIVE_MODE_EXPL,
+ XIVE_MODE_NONE,
+} xive_mode = XIVE_MODE_NONE;
+
+
+/* Each source controller has one of these. There's one embedded
+ * in the XIVE struct for IPIs
+ */
+struct xive_src {
+ struct irq_source is;
+ const struct irq_source_ops *orig_ops;
+ struct xive *xive;
+ void *esb_mmio;
+ uint32_t esb_base;
+ uint32_t esb_shift;
+ uint32_t flags;
+};
+
+#define LOG_TYPE_XIRR 0
+#define LOG_TYPE_XIRR2 1
+#define LOG_TYPE_POPQ 2
+#define LOG_TYPE_EOI 3
+#define LOG_TYPE_EQD 4
+
+struct xive_log_ent {
+ uint8_t type;
+ uint8_t cnt;
+ uint64_t tb;
+#define MAX_LOG_DATA 8
+ uint32_t data[MAX_LOG_DATA];
+};
+#define MAX_LOG_ENT 32
+
+struct xive_cpu_state {
+ struct xive *xive;
+ void *tm_ring1;
+
+#ifdef XIVE_PERCPU_LOG
+ struct xive_log_ent log[MAX_LOG_ENT];
+ uint32_t log_pos;
+#endif
+ /* Base HW VP and associated queues */
+ uint32_t vp_blk;
+ uint32_t vp_idx;
+ uint32_t eq_blk;
+ uint32_t eq_idx; /* Base eq index of a block of 8 */
+ void *eq_page;
+
+ /* Pre-allocated IPI */
+ uint32_t ipi_irq;
+
+ /* Used for XICS emulation */
+ struct lock lock;
+ uint8_t cppr;
+ uint8_t mfrr;
+ uint8_t pending;
+ uint8_t prev_cppr;
+ uint32_t *eqbuf;
+ uint32_t eqptr;
+ uint32_t eqmsk;
+ uint8_t eqgen;
+ void *eqmmio;
+ uint64_t total_irqs;
+};
+
+#ifdef XIVE_PERCPU_LOG
+
+static void log_add(struct xive_cpu_state *xs, uint8_t type,
+ uint8_t count, ...)
+{
+ struct xive_log_ent *e = &xs->log[xs->log_pos];
+ va_list args;
+ int i;
+
+ e->type = type;
+ e->cnt = count;
+ e->tb = mftb();
+ va_start(args, count);
+ for (i = 0; i < count; i++)
+ e->data[i] = va_arg(args, u32);
+ va_end(args);
+ xs->log_pos = xs->log_pos + 1;
+ if (xs->log_pos == MAX_LOG_ENT)
+ xs->log_pos = 0;
+}
+
+static void log_print(struct xive_cpu_state *xs)
+{
+ uint32_t pos = xs->log_pos;
+ uint8_t buf[256];
+ int i, j;
+ static const char *lts[] = {
+ ">XIRR",
+ "<XIRR",
+ " POPQ",
+ " EOI",
+ " EQD"
+ };
+ for (i = 0; i < MAX_LOG_ENT; i++) {
+ struct xive_log_ent *e = &xs->log[pos];
+ uint8_t *b = buf, *eb = &buf[255];
+
+ b += snprintf(b, eb-b, "%08llx %s ", e->tb,
+ lts[e->type]);
+ for (j = 0; j < e->cnt && b < eb; j++)
+ b += snprintf(b, eb-b, "%08x ", e->data[j]);
+ printf("%s\n", buf);
+ pos = pos + 1;
+ if (pos == MAX_LOG_ENT)
+ pos = 0;
+ }
+}
+
+#else /* XIVE_PERCPU_LOG */
+
+static inline void log_add(struct xive_cpu_state *xs __unused,
+ uint8_t type __unused,
+ uint8_t count __unused, ...) { }
+static inline void log_print(struct xive_cpu_state *xs __unused) { }
+
+#endif /* XIVE_PERCPU_LOG */
+
+struct xive {
+ uint32_t chip_id;
+ uint32_t block_id;
+ struct dt_node *x_node;
+
+ uint64_t xscom_base;
+
+ /* MMIO regions */
+ void *ic_base;
+ uint64_t ic_size;
+ uint32_t ic_shift;
+ void *tm_base;
+ uint64_t tm_size;
+ uint32_t tm_shift;
+ void *pc_base;
+ uint64_t pc_size;
+ void *vc_base;
+ uint64_t vc_size;
+
+ void *esb_mmio;
+ void *eq_mmio;
+
+ /* Set on XSCOM register access error */
+ bool last_reg_error;
+
+ /* Per-XIVE mutex */
+ struct lock lock;
+
+ /* Pre-allocated tables.
+ *
+ * We set up all the VSDs for actual tables (i.e. as opposed to
+ * forwarding ports) as either direct pre-allocated or indirect
+ * and partially populated.
+ *
+ * Currently, the ESB/SBE and the EAS/IVT tables are direct and
+ * fully pre-allocated based on XIVE_INT_COUNT.
+ *
+ * The other tables are indirect, we thus pre-allocate the indirect
+ * table (ie, pages of pointers) and populate enough of the pages
+ * for our basic setup using 64K pages.
+ *
+ * The size of the indirect tables are driven by XIVE_VP_COUNT and
+ * XIVE_EQ_COUNT. The number of pre-allocated ones are driven by
+ * XIVE_HW_VP_COUNT (number of EQ depends on number of VP) in block
+ * mode, otherwise we only preallocate INITIAL_BLK0_VP_COUNT on
+ * block 0.
+ */
+
+ /* Direct SBE and IVT tables */
+ void *sbe_base;
+ void *ivt_base;
+
+ /* Indirect END/EQ table. NULL entries are unallocated, count is
+ * the number of pointers (i.e. sub-page placeholders).
+ */
+ __be64 *eq_ind_base;
+ uint32_t eq_ind_count;
+
+ /* EQ allocation bitmap. Each bit represent 8 EQs */
+ bitmap_t *eq_map;
+
+ /* Indirect NVT/VP table. NULL entries are unallocated, count is
+ * the number of pointers (i.e. sub-page placeholders).
+ */
+ __be64 *vp_ind_base;
+ uint32_t vp_ind_count;
+
+ /* Pool of donated pages for provisioning indirect EQ and VP pages */
+ struct list_head donated_pages;
+
+ /* To ease a possible change to supporting more than one block of
+ * interrupts per chip, we store here the "base" global number
+ * and max number of interrupts for this chip. The global number
+ * encompass the block number and index.
+ */
+ uint32_t int_base;
+ uint32_t int_max;
+
+ /* Due to the overlap between IPIs and HW sources in the IVT table,
+ * we keep some kind of top-down allocator. It is used for HW sources
+ * to "allocate" interrupt entries and will limit what can be handed
+ * out as IPIs. Of course this assumes we "allocate" all HW sources
+ * before we start handing out IPIs.
+ *
+ * Note: The numbers here are global interrupt numbers so that we can
+ * potentially handle more than one block per chip in the future.
+ */
+ uint32_t int_hw_bot; /* Bottom of HW allocation */
+ uint32_t int_ipi_top; /* Highest IPI handed out so far + 1 */
+
+ /* The IPI allocation bitmap */
+ bitmap_t *ipi_alloc_map;
+
+ /* We keep track of which interrupts were ever enabled to
+ * speed up xive_reset
+ */
+ bitmap_t *int_enabled_map;
+
+ /* Embedded source IPIs */
+ struct xive_src ipis;
+
+ /* Embedded escalation interrupts */
+ struct xive_src esc_irqs;
+
+ /* In memory queue overflow */
+ void *q_ovf;
+};
+
+#define XIVE_CAN_STORE_EOI(x) XIVE_STORE_EOI_ENABLED
+
+/* Global DT node */
+static struct dt_node *xive_dt_node;
+
+
+/* Block <-> Chip conversions.
+ *
+ * As chipIDs may not be within the range of 16 block IDs supported by XIVE,
+ * we have a 2 way conversion scheme.
+ *
+ * From block to chip, use the global table below.
+ *
+ * From chip to block, a field in struct proc_chip contains the first block
+ * of that chip. For now we only support one block per chip but that might
+ * change in the future
+ */
+#define XIVE_INVALID_CHIP 0xffffffff
+#define XIVE_MAX_CHIPS 16
+static uint32_t xive_block_to_chip[XIVE_MAX_CHIPS];
+static uint32_t xive_block_count;
+
+static uint32_t xive_chip_to_block(uint32_t chip_id)
+{
+ struct proc_chip *c = get_chip(chip_id);
+
+ assert(c);
+ assert(c->xive);
+ return c->xive->block_id;
+}
+
+/* Conversion between GIRQ and block/index.
+ *
+ * ------------------------------------
+ * |0000000E|BLOC| INDEX|
+ * ------------------------------------
+ * 8 4 20
+ *
+ * the E bit indicates that this is an escalation interrupt, in
+ * that case, the BLOCK/INDEX points to the EQ descriptor associated
+ * with the escalation.
+ *
+ * Global interrupt numbers for non-escalation interrupts are thus
+ * limited to 24 bits because the XICS emulation encodes the CPPR
+ * value in the top (MSB) 8 bits. Hence, 4 bits are left for the XIVE
+ * block number and the remaining 20 bits for the interrupt index
+ * number.
+ */
+#define INT_SHIFT 20
+#define INT_ESC_SHIFT (INT_SHIFT + 4) /* 4bits block id */
+
+#if XIVE_INT_ORDER > INT_SHIFT
+#error "Too many ESBs for IRQ encoding"
+#endif
+
+#if XIVE_EQ_ORDER > INT_SHIFT
+#error "Too many EQs for escalation IRQ number encoding"
+#endif
+
+#define GIRQ_TO_BLK(__g) (((__g) >> INT_SHIFT) & 0xf)
+#define GIRQ_TO_IDX(__g) ((__g) & ((1 << INT_SHIFT) - 1))
+#define BLKIDX_TO_GIRQ(__b,__i) (((uint32_t)(__b)) << INT_SHIFT | (__i))
+#define GIRQ_IS_ESCALATION(__g) ((__g) & (1 << INT_ESC_SHIFT))
+#define MAKE_ESCALATION_GIRQ(__b,__i)(BLKIDX_TO_GIRQ(__b,__i) | (1 << INT_ESC_SHIFT))
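+/*
+ * Illustration of the encoding, using arbitrary example values:
+ *   BLKIDX_TO_GIRQ(2, 0x123)       = 0x00200123
+ *   GIRQ_TO_BLK(0x00200123)        = 2
+ *   GIRQ_TO_IDX(0x00200123)        = 0x123
+ *   MAKE_ESCALATION_GIRQ(2, 0x123) = 0x01200123 (E bit set)
+ */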
+
+/* Block/IRQ to chip# conversions */
+#define PC_BLK_TO_CHIP(__b) (xive_block_to_chip[__b])
+#define VC_BLK_TO_CHIP(__b) (xive_block_to_chip[__b])
+#define GIRQ_TO_CHIP(__isn) (VC_BLK_TO_CHIP(GIRQ_TO_BLK(__isn)))
+
+/* Routing of physical processors to VPs */
+#define PIR2VP_IDX(__pir) (XIVE_HW_VP_BASE | P9_PIR2LOCALCPU(__pir))
+#define PIR2VP_BLK(__pir) (xive_chip_to_block(P9_PIR2GCID(__pir)))
+#define VP2PIR(__blk, __idx) (P9_PIRFROMLOCALCPU(VC_BLK_TO_CHIP(__blk), (__idx) & 0x7f))
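+/* For example, local thread 5 of a chip maps to VP index 0x85 (0x80 | 5)
+ * in that chip's block, and VP2PIR() reverses the mapping by keeping only
+ * the low 7 bits of the index.
+ */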
+
+/* Decoding of OPAL API VP IDs. The VP IDs are encoded as follow
+ *
+ * Block group mode:
+ *
+ * -----------------------------------
+ * |GVEOOOOO| INDEX|
+ * -----------------------------------
+ * || |
+ * || Order
+ * |Virtual
+ * Group
+ *
+ * G (Group) : Set to 1 for a group VP (not currently supported)
+ * V (Virtual) : Set to 1 for an allocated VP (vs. a physical processor ID)
+ * E (Error) : Should never be 1, used internally for errors
+ * O (Order) : Allocation order of the VP block
+ *
+ * The conversion is thus done as follows (groups aren't implemented yet):
+ *
+ * If V=0, O must be 0 and the 24-bit INDEX value is the PIR
+ * If V=1, the order O group is allocated such that if N is the number of
+ * chip bits considered for allocation (*)
+ * then the INDEX is constructed as follows (bit numbers such that 0=LSB)
+ * - the bottom O-N bits are the index within the "VP block"
+ * - the next N bits are the XIVE blockID of the VP
+ * - the remaining bits are the per-chip "base"
+ * so the conversion consists of "extracting" the block ID and moving
+ * down the upper bits by N bits.
+ *
+ * In non-block-group mode, the difference is that the blockID is
+ * on the left of the index (the entire VP block is in a single
+ * block ID)
+ */
+
+/* VP allocation */
+static uint32_t xive_chips_alloc_bits = 0;
+static struct buddy *xive_vp_buddy;
+static struct lock xive_buddy_lock = LOCK_UNLOCKED;
+
+/* VP# decoding/encoding */
+static bool xive_decode_vp(uint32_t vp, uint32_t *blk, uint32_t *idx,
+ uint8_t *order, bool *group)
+{
+ uint32_t o = (vp >> 24) & 0x1f;
+ uint32_t n = xive_chips_alloc_bits;
+ uint32_t index = vp & 0x00ffffff;
+ uint32_t imask = (1 << (o - n)) - 1;
+
+ /* Groups not supported yet */
+ if ((vp >> 31) & 1)
+ return false;
+ if (group)
+ *group = false;
+
+ /* PIR case */
+ if (((vp >> 30) & 1) == 0) {
+ if (find_cpu_by_pir(index) == NULL)
+ return false;
+ if (blk)
+ *blk = PIR2VP_BLK(index);
+ if (idx)
+ *idx = PIR2VP_IDX(index);
+ return true;
+ }
+
+ /* Ensure o > n, we have *at least* 2 VPs per block */
+ if (o <= n)
+ return false;
+
+ /* Combine the index base and index */
+ if (idx)
+ *idx = ((index >> n) & ~imask) | (index & imask);
+ /* Extract block ID */
+ if (blk)
+ *blk = (index >> (o - n)) & ((1 << n) - 1);
+
+ /* Return order as well if asked for */
+ if (order)
+ *order = o;
+
+ return true;
+}
+
+static uint32_t xive_encode_vp(uint32_t blk, uint32_t idx, uint32_t order)
+{
+ uint32_t vp = 0x40000000 | (order << 24);
+ uint32_t n = xive_chips_alloc_bits;
+ uint32_t imask = (1 << (order - n)) - 1;
+
+ vp |= (idx & ~imask) << n;
+ vp |= blk << (order - n);
+ vp |= idx & imask;
+ return vp;
+}
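+/*
+ * Worked example with illustrative values: with two chips considered for
+ * allocation (xive_chips_alloc_bits = 1), order 5, blk = 1 and idx = 9:
+ *
+ *   xive_encode_vp(1, 9, 5) = 0x40000000 | (5 << 24) | (1 << 4) | 9
+ *                           = 0x45000019
+ *
+ * (the (idx & ~imask) << n term is 0 here), and xive_decode_vp() on
+ * 0x45000019 recovers blk = 1, idx = 9, order = 5.
+ */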
+
+#define xive_regw(__x, __r, __v) \
+ __xive_regw(__x, __r, X_##__r, __v, #__r)
+#define xive_regr(__x, __r) \
+ __xive_regr(__x, __r, X_##__r, #__r)
+#define xive_regwx(__x, __r, __v) \
+ __xive_regw(__x, 0, X_##__r, __v, #__r)
+#define xive_regrx(__x, __r) \
+ __xive_regr(__x, 0, X_##__r, #__r)
+
+#ifdef XIVE_VERBOSE_DEBUG
+#define xive_vdbg(__x,__fmt,...) prlog(PR_DEBUG,"XIVE[ IC %02x ] " __fmt, (__x)->chip_id, ##__VA_ARGS__)
+#define xive_cpu_vdbg(__c,__fmt,...) prlog(PR_DEBUG,"XIVE[CPU %04x] " __fmt, (__c)->pir, ##__VA_ARGS__)
+#else
+#define xive_vdbg(x,fmt,...) do { } while(0)
+#define xive_cpu_vdbg(x,fmt,...) do { } while(0)
+#endif
+
+#define xive_dbg(__x,__fmt,...) prlog(PR_DEBUG,"XIVE[ IC %02x ] " __fmt, (__x)->chip_id, ##__VA_ARGS__)
+#define xive_cpu_dbg(__c,__fmt,...) prlog(PR_DEBUG,"XIVE[CPU %04x] " __fmt, (__c)->pir, ##__VA_ARGS__)
+#define xive_warn(__x,__fmt,...) prlog(PR_WARNING,"XIVE[ IC %02x ] " __fmt, (__x)->chip_id, ##__VA_ARGS__)
+#define xive_cpu_warn(__c,__fmt,...) prlog(PR_WARNING,"XIVE[CPU %04x] " __fmt, (__c)->pir, ##__VA_ARGS__)
+#define xive_err(__x,__fmt,...) prlog(PR_ERR,"XIVE[ IC %02x ] " __fmt, (__x)->chip_id, ##__VA_ARGS__)
+#define xive_cpu_err(__c,__fmt,...) prlog(PR_ERR,"XIVE[CPU %04x] " __fmt, (__c)->pir, ##__VA_ARGS__)
+
+static void __xive_regw(struct xive *x, uint32_t m_reg, uint32_t x_reg, uint64_t v,
+ const char *rname)
+{
+ bool use_xscom = (m_reg == 0) || !x->ic_base;
+ int64_t rc;
+
+ x->last_reg_error = false;
+
+ if (use_xscom) {
+ assert(x_reg != 0);
+ rc = xscom_write(x->chip_id, x->xscom_base + x_reg, v);
+ if (rc) {
+ if (!rname)
+ rname = "???";
+ xive_err(x, "Error writing register %s\n", rname);
+ /* Anything else we can do here ? */
+ x->last_reg_error = true;
+ }
+ } else {
+ out_be64(x->ic_base + m_reg, v);
+ }
+}
+
+static uint64_t __xive_regr(struct xive *x, uint32_t m_reg, uint32_t x_reg,
+ const char *rname)
+{
+ bool use_xscom = (m_reg == 0) || !x->ic_base;
+ int64_t rc;
+ uint64_t val;
+
+ x->last_reg_error = false;
+
+ if (use_xscom) {
+ assert(x_reg != 0);
+ rc = xscom_read(x->chip_id, x->xscom_base + x_reg, &val);
+ if (rc) {
+ if (!rname)
+ rname = "???";
+ xive_err(x, "Error reading register %s\n", rname);
+ /* Anything else we can do here ? */
+ x->last_reg_error = true;
+ return -1ull;
+ }
+ } else {
+ val = in_be64(x->ic_base + m_reg);
+ }
+ return val;
+}
+
+/* Locate a controller from an IRQ number */
+static struct xive *xive_from_isn(uint32_t isn)
+{
+ uint32_t chip_id = GIRQ_TO_CHIP(isn);
+ struct proc_chip *c = get_chip(chip_id);
+
+ if (!c)
+ return NULL;
+ return c->xive;
+}
+
+static struct xive *xive_from_pc_blk(uint32_t blk)
+{
+ uint32_t chip_id = PC_BLK_TO_CHIP(blk);
+ struct proc_chip *c = get_chip(chip_id);
+
+ if (!c)
+ return NULL;
+ return c->xive;
+}
+
+static struct xive *xive_from_vc_blk(uint32_t blk)
+{
+ uint32_t chip_id = VC_BLK_TO_CHIP(blk);
+ struct proc_chip *c = get_chip(chip_id);
+
+ if (!c)
+ return NULL;
+ return c->xive;
+}
+
+static struct xive_eq *xive_get_eq(struct xive *x, unsigned int idx)
+{
+ struct xive_eq *p;
+
+ if (idx >= (x->eq_ind_count * EQ_PER_PAGE))
+ return NULL;
+ p = (struct xive_eq *)(be64_to_cpu(x->eq_ind_base[idx / EQ_PER_PAGE]) &
+ VSD_ADDRESS_MASK);
+ if (!p)
+ return NULL;
+
+ return &p[idx % EQ_PER_PAGE];
+}
+
+static struct xive_ive *xive_get_ive(struct xive *x, unsigned int isn)
+{
+ struct xive_ive *ivt;
+ uint32_t idx = GIRQ_TO_IDX(isn);
+
+ if (GIRQ_IS_ESCALATION(isn)) {
+ /* All right, an escalation IVE is buried inside an EQ, let's
+ * try to find it
+ */
+ struct xive_eq *eq;
+
+ if (x->chip_id != VC_BLK_TO_CHIP(GIRQ_TO_BLK(isn))) {
+ xive_err(x, "xive_get_ive, ESC ISN 0x%x not on right chip\n", isn);
+ return NULL;
+ }
+ eq = xive_get_eq(x, idx);
+ if (!eq) {
+ xive_err(x, "xive_get_ive, ESC ISN 0x%x EQ not found\n", isn);
+ return NULL;
+ }
+
+ /* If using single-escalation, don't let anybody get to the individual
+ * escalation interrupts
+ */
+ if (xive_get_field32(EQ_W0_UNCOND_ESCALATE, eq->w0))
+ return NULL;
+
+ /* Grab the buried IVE */
+ return (struct xive_ive *)(char *)&eq->w4;
+ } else {
+ /* Check the block matches */
+ if (isn < x->int_base || isn >= x->int_max) {
+ xive_err(x, "xive_get_ive, ISN 0x%x not on right chip\n", isn);
+ return NULL;
+ }
+ assert (idx < XIVE_INT_COUNT);
+
+ /* If we support >1 block per chip, this should still work as
+ * we are likely to make the table contiguous anyway
+ */
+ ivt = x->ivt_base;
+ assert(ivt);
+
+ return ivt + idx;
+ }
+}
+
+static struct xive_vp *xive_get_vp(struct xive *x, unsigned int idx)
+{
+ struct xive_vp *p;
+
+ assert(idx < (x->vp_ind_count * VP_PER_PAGE));
+ p = (struct xive_vp *)(be64_to_cpu(x->vp_ind_base[idx / VP_PER_PAGE]) &
+ VSD_ADDRESS_MASK);
+ if (!p)
+ return NULL;
+
+ return &p[idx % VP_PER_PAGE];
+}
+
+static void xive_init_default_vp(struct xive_vp *vp,
+ uint32_t eq_blk, uint32_t eq_idx)
+{
+ memset(vp, 0, sizeof(struct xive_vp));
+
+ /* Stash the EQ base in the pressure relief interrupt field */
+ vp->w1 = cpu_to_be32((eq_blk << 28) | eq_idx);
+ vp->w0 = xive_set_field32(VP_W0_VALID, 0, 1);
+}
+
+static void xive_init_emu_eq(uint32_t vp_blk, uint32_t vp_idx,
+ struct xive_eq *eq, void *backing_page,
+ uint8_t prio)
+{
+ memset(eq, 0, sizeof(struct xive_eq));
+
+ eq->w1 = xive_set_field32(EQ_W1_GENERATION, 0, 1);
+ eq->w3 = cpu_to_be32(((uint64_t)backing_page) & EQ_W3_OP_DESC_LO);
+ eq->w2 = cpu_to_be32((((uint64_t)backing_page) >> 32) & EQ_W2_OP_DESC_HI);
+ eq->w6 = xive_set_field32(EQ_W6_NVT_BLOCK, 0, vp_blk) |
+ xive_set_field32(EQ_W6_NVT_INDEX, 0, vp_idx);
+ eq->w7 = xive_set_field32(EQ_W7_F0_PRIORITY, 0, prio);
+ eq->w0 = xive_set_field32(EQ_W0_VALID, 0, 1) |
+ xive_set_field32(EQ_W0_ENQUEUE, 0, 1) |
+ xive_set_field32(EQ_W0_FIRMWARE, 0, 1) |
+ xive_set_field32(EQ_W0_QSIZE, 0, EQ_QSIZE_64K) |
+#ifdef EQ_ALWAYS_NOTIFY
+ xive_set_field32(EQ_W0_UCOND_NOTIFY, 0, 1) |
+#endif
+ 0 ;
+}
+
+static uint32_t *xive_get_eq_buf(uint32_t eq_blk, uint32_t eq_idx)
+{
+ struct xive *x = xive_from_vc_blk(eq_blk);
+ struct xive_eq *eq;
+ uint64_t addr;
+
+ assert(x);
+ eq = xive_get_eq(x, eq_idx);
+ assert(eq);
+ assert(xive_get_field32(EQ_W0_VALID, eq->w0));
+ addr = ((((uint64_t)be32_to_cpu(eq->w2)) & 0x0fffffff) << 32) | be32_to_cpu(eq->w3);
+
+ return (uint32_t *)addr;
+}
+
+static void *xive_get_donated_page(struct xive *x)
+{
+ return (void *)list_pop_(&x->donated_pages, 0);
+}
+
+#define XIVE_ALLOC_IS_ERR(_idx) ((_idx) >= 0xfffffff0)
+
+#define XIVE_ALLOC_NO_SPACE 0xffffffff /* No possible space */
+#define XIVE_ALLOC_NO_IND 0xfffffffe /* Indirect need provisioning */
+#define XIVE_ALLOC_NO_MEM 0xfffffffd /* Local allocation failed */
+
+static uint32_t xive_alloc_eq_set(struct xive *x, bool alloc_indirect)
+{
+ uint32_t ind_idx;
+ int idx;
+ int eq_base_idx;
+
+ xive_vdbg(x, "Allocating EQ set...\n");
+
+ assert(x->eq_map);
+
+ /* Allocate from the EQ bitmap. Each bit is 8 EQs */
+ idx = bitmap_find_zero_bit(*x->eq_map, 0, XIVE_EQ_COUNT >> 3);
+ if (idx < 0) {
+ xive_dbg(x, "Allocation from EQ bitmap failed !\n");
+ return XIVE_ALLOC_NO_SPACE;
+ }
+
+ eq_base_idx = idx << 3;
+
+ xive_vdbg(x, "Got EQs 0x%x..0x%x\n", eq_base_idx,
+ eq_base_idx + XIVE_MAX_PRIO);
+
+ /* Calculate the indirect page where the EQs reside */
+ ind_idx = eq_base_idx / EQ_PER_PAGE;
+
+ /* Is there an indirect page ? If not, check if we can provision it */
+ if (!x->eq_ind_base[ind_idx]) {
+ /* Default flags */
+ uint64_t vsd_flags = SETFIELD(VSD_TSIZE, 0ull, 4) |
+ SETFIELD(VSD_MODE, 0ull, VSD_MODE_EXCLUSIVE);
+ void *page;
+
+ /* If alloc_indirect is set, allocate the memory from OPAL's own pool,
+ * otherwise try to provision from the donated pool
+ */
+ if (alloc_indirect) {
+ /* Allocate/provision indirect page during boot only */
+ xive_vdbg(x, "Indirect empty, provisioning from local pool\n");
+ page = local_alloc(x->chip_id, PAGE_SIZE, PAGE_SIZE);
+ if (!page) {
+ xive_dbg(x, "provisioning failed !\n");
+ return XIVE_ALLOC_NO_MEM;
+ }
+ vsd_flags |= VSD_FIRMWARE;
+ } else {
+ xive_vdbg(x, "Indirect empty, provisioning from donated pages\n");
+ page = xive_get_donated_page(x);
+ if (!page) {
+ xive_vdbg(x, "no idirect pages available !\n");
+ return XIVE_ALLOC_NO_IND;
+ }
+ }
+ memset(page, 0, PAGE_SIZE);
+ x->eq_ind_base[ind_idx] = cpu_to_be64(vsd_flags |
+ (((uint64_t)page) & VSD_ADDRESS_MASK));
+ /* Any cache scrub needed ? */
+ }
+
+ bitmap_set_bit(*x->eq_map, idx);
+ return eq_base_idx;
+}
+
+static void xive_free_eq_set(struct xive *x, uint32_t eqs)
+{
+ uint32_t idx;
+
+ xive_vdbg(x, "Freeing EQ 0x%x..0x%x\n", eqs, eqs + XIVE_MAX_PRIO);
+
+ assert((eqs & 7) == 0);
+ assert(x->eq_map);
+
+ idx = eqs >> 3;
+ bitmap_clr_bit(*x->eq_map, idx);
+}
+
+static bool xive_provision_vp_ind(struct xive *x, uint32_t vp_idx, uint32_t order)
+{
+ uint32_t pbase, pend, i;
+
+ pbase = vp_idx / VP_PER_PAGE;
+ pend = (vp_idx + (1 << order)) / VP_PER_PAGE;
+
+ for (i = pbase; i <= pend; i++) {
+ void *page;
+ u64 vsd;
+
+ /* Already provisioned ? */
+ if (x->vp_ind_base[i])
+ continue;
+
+ /* Try to grab a donated page */
+ page = xive_get_donated_page(x);
+ if (!page)
+ return false;
+
+ /* Install the page */
+ memset(page, 0, PAGE_SIZE);
+ vsd = ((uint64_t)page) & VSD_ADDRESS_MASK;
+ vsd |= SETFIELD(VSD_TSIZE, 0ull, 4);
+ vsd |= SETFIELD(VSD_MODE, 0ull, VSD_MODE_EXCLUSIVE);
+ x->vp_ind_base[i] = cpu_to_be64(vsd);
+ }
+ return true;
+}
+
+static void xive_init_vp_allocator(void)
+{
+ /* Initialize chip alloc bits */
+ xive_chips_alloc_bits = ilog2(xive_block_count);
+
+ prlog(PR_INFO, "XIVE: %d chips considered for VP allocations\n",
+ 1 << xive_chips_alloc_bits);
+
+ /* Allocate a buddy big enough for XIVE_VP_ORDER allocations.
+ *
+ * each bit in the buddy represents 1 << xive_chips_alloc_bits
+ * VPs.
+ */
+ xive_vp_buddy = buddy_create(XIVE_VP_ORDER);
+ assert(xive_vp_buddy);
+
+ /* We reserve the whole range of VPs representing HW chips.
+ *
+ * These are 0x80..0xff, so order 7 starting at 0x80. This will
+ * reserve that range on each chip.
+ */
+ assert(buddy_reserve(xive_vp_buddy, XIVE_HW_VP_BASE,
+ XIVE_THREADID_SHIFT));
+}
+
+static uint32_t xive_alloc_vps(uint32_t order)
+{
+ uint32_t local_order, i;
+ int vp;
+
+ /* The minimum order is 2 VPs per chip */
+ if (order < (xive_chips_alloc_bits + 1))
+ order = xive_chips_alloc_bits + 1;
+
+ /* We split the allocation */
+ local_order = order - xive_chips_alloc_bits;
+
+ /* We grab that in the global buddy */
+ assert(xive_vp_buddy);
+ lock(&xive_buddy_lock);
+ vp = buddy_alloc(xive_vp_buddy, local_order);
+ unlock(&xive_buddy_lock);
+ if (vp < 0)
+ return XIVE_ALLOC_NO_SPACE;
+
+ /* Provision on every chip considered for allocation */
+ for (i = 0; i < (1 << xive_chips_alloc_bits); i++) {
+ struct xive *x = xive_from_pc_blk(i);
+ bool success;
+
+ /* Return internal error & log rather than assert ? */
+ assert(x);
+ lock(&x->lock);
+ success = xive_provision_vp_ind(x, vp, local_order);
+ unlock(&x->lock);
+ if (!success) {
+ lock(&xive_buddy_lock);
+ buddy_free(xive_vp_buddy, vp, local_order);
+ unlock(&xive_buddy_lock);
+ return XIVE_ALLOC_NO_IND;
+ }
+ }
+
+ /* Encode the VP number. "blk" is 0 as this represents
+ * all blocks and the allocation always starts at 0
+ */
+ return xive_encode_vp(0, vp, order);
+}
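+/*
+ * For instance (illustrative numbers only): on a two-chip system
+ * (xive_chips_alloc_bits = 1), a request for order 4 (16 VPs) becomes a
+ * local_order 3 buddy allocation, i.e. 8 consecutive indices reserved on
+ * each of the two blocks, and the result is encoded with
+ * xive_encode_vp(0, vp, 4).
+ */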
+
+static void xive_free_vps(uint32_t vp)
+{
+ uint32_t idx;
+ uint8_t order, local_order;
+
+ assert(xive_decode_vp(vp, NULL, &idx, &order, NULL));
+
+ /* We split the allocation */
+ local_order = order - xive_chips_alloc_bits;
+
+ /* Free that in the buddy */
+ lock(&xive_buddy_lock);
+ buddy_free(xive_vp_buddy, idx, local_order);
+ unlock(&xive_buddy_lock);
+}
+
+enum xive_cache_type {
+ xive_cache_ivc,
+ xive_cache_sbc,
+ xive_cache_eqc,
+ xive_cache_vpc,
+};
+
+static int64_t __xive_cache_watch(struct xive *x, enum xive_cache_type ctype,
+ uint64_t block, uint64_t idx,
+ uint32_t start_dword, uint32_t dword_count,
+ __be64 *new_data, bool light_watch,
+ bool synchronous);
+
+static void xive_scrub_workaround_vp(struct xive *x, uint32_t block, uint32_t idx __unused)
+{
+ /* VP variant of the workaround described in __xive_cache_scrub(),
+ * we need to be careful to use for that workaround an NVT that
+ * sits on the same xive but is NOT part of a donated indirect
+ * entry.
+ *
+ * The reason is that the dummy cache watch will re-create a
+ * dirty entry in the cache, even if the entry is marked
+ * invalid.
+ *
+ * Thus if we are about to dispose of the indirect entry backing
+ * it, we'll cause a checkstop later on when trying to write it
+ * out.
+ *
+ * Note: This means the workaround only works for block group
+ * mode.
+ */
+ __xive_cache_watch(x, xive_cache_vpc, block, XIVE_HW_VP_BASE, 0,
+ 0, NULL, true, false);
+}
+
+static void xive_scrub_workaround_eq(struct xive *x, uint32_t block __unused, uint32_t idx)
+{
+ void *mmio;
+
+ /* EQ variant of the workaround described in __xive_cache_scrub(),
+ * a simple non-side effect load from ESn will do
+ */
+ mmio = x->eq_mmio + idx * XIVE_ESB_PAGE_SIZE;
+
+ /* Ensure the above load has returned before we do anything else,
+ * i.e. the XIVE store queue is completely empty
+ */
+ load_wait(in_be64(mmio + XIVE_ESB_GET));
+}
+
+static int64_t __xive_cache_scrub(struct xive *x, enum xive_cache_type ctype,
+ uint64_t block, uint64_t idx,
+ bool want_inval, bool want_disable)
+{
+ uint64_t sreg, sregx, mreg, mregx;
+ uint64_t mval, sval;
+
+#ifdef XIVE_CHECK_LOCKS
+ assert(lock_held_by_me(&x->lock));
+#endif
+
+ /* Workaround a HW bug in XIVE where the scrub completion
+ * isn't ordered by loads, thus the data might still be
+ * in a queue and may not have reached coherency.
+ *
+ * The workaround is twofold: we force the scrub to also
+ * invalidate, then after the scrub, we do a dummy cache
+ * watch which will make the HW read the data back, which
+ * should be ordered behind all the preceding stores.
+ *
+ * Update: For EQs we can do a non-side effect ESB load instead
+ * which is faster.
+ */
+ want_inval = true;
+
+ switch (ctype) {
+ case xive_cache_ivc:
+ sreg = VC_IVC_SCRUB_TRIG;
+ sregx = X_VC_IVC_SCRUB_TRIG;
+ mreg = VC_IVC_SCRUB_MASK;
+ mregx = X_VC_IVC_SCRUB_MASK;
+ break;
+ case xive_cache_sbc:
+ sreg = VC_SBC_SCRUB_TRIG;
+ sregx = X_VC_SBC_SCRUB_TRIG;
+ mreg = VC_SBC_SCRUB_MASK;
+ mregx = X_VC_SBC_SCRUB_MASK;
+ break;
+ case xive_cache_eqc:
+ sreg = VC_EQC_SCRUB_TRIG;
+ sregx = X_VC_EQC_SCRUB_TRIG;
+ mreg = VC_EQC_SCRUB_MASK;
+ mregx = X_VC_EQC_SCRUB_MASK;
+ break;
+ case xive_cache_vpc:
+ sreg = PC_VPC_SCRUB_TRIG;
+ sregx = X_PC_VPC_SCRUB_TRIG;
+ mreg = PC_VPC_SCRUB_MASK;
+ mregx = X_PC_VPC_SCRUB_MASK;
+ break;
+ default:
+ return OPAL_INTERNAL_ERROR;
+ }
+ if (ctype == xive_cache_vpc) {
+ mval = PC_SCRUB_BLOCK_ID | PC_SCRUB_OFFSET;
+ sval = SETFIELD(PC_SCRUB_BLOCK_ID, idx, block) |
+ PC_SCRUB_VALID;
+ } else {
+ mval = VC_SCRUB_BLOCK_ID | VC_SCRUB_OFFSET;
+ sval = SETFIELD(VC_SCRUB_BLOCK_ID, idx, block) |
+ VC_SCRUB_VALID;
+ }
+ if (want_inval)
+ sval |= PC_SCRUB_WANT_INVAL;
+ if (want_disable)
+ sval |= PC_SCRUB_WANT_DISABLE;
+
+ __xive_regw(x, mreg, mregx, mval, NULL);
+ __xive_regw(x, sreg, sregx, sval, NULL);
+
+ /* XXX Add timeout !!! */
+ for (;;) {
+ sval = __xive_regr(x, sreg, sregx, NULL);
+ if (!(sval & VC_SCRUB_VALID))
+ break;
+ /* Small delay */
+ time_wait(100);
+ }
+ sync();
+
+ /* Workaround for HW bug described above (only applies to
+ * EQC and VPC)
+ */
+ if (ctype == xive_cache_eqc)
+ xive_scrub_workaround_eq(x, block, idx);
+ else if (ctype == xive_cache_vpc)
+ xive_scrub_workaround_vp(x, block, idx);
+
+ return 0;
+}
+
+static int64_t xive_ivc_scrub(struct xive *x, uint64_t block, uint64_t idx)
+{
+ /* IVC has no "want_inval" bit, it always invalidates */
+ return __xive_cache_scrub(x, xive_cache_ivc, block, idx, false, false);
+}
+
+static int64_t xive_vpc_scrub(struct xive *x, uint64_t block, uint64_t idx)
+{
+ return __xive_cache_scrub(x, xive_cache_vpc, block, idx, false, false);
+}
+
+static int64_t xive_vpc_scrub_clean(struct xive *x, uint64_t block, uint64_t idx)
+{
+ return __xive_cache_scrub(x, xive_cache_vpc, block, idx, true, false);
+}
+
+static int64_t xive_eqc_scrub(struct xive *x, uint64_t block, uint64_t idx)
+{
+ return __xive_cache_scrub(x, xive_cache_eqc, block, idx, false, false);
+}
+
+#define XIVE_CACHE_WATCH_MAX_RETRIES 10
+
+static int64_t __xive_cache_watch(struct xive *x, enum xive_cache_type ctype,
+ uint64_t block, uint64_t idx,
+ uint32_t start_dword, uint32_t dword_count,
+ __be64 *new_data, bool light_watch,
+ bool synchronous)
+{
+ uint64_t sreg, sregx, dreg0, dreg0x;
+ uint64_t dval0, sval, status;
+ int64_t i;
+ int retries = 0;
+
+#ifdef XIVE_CHECK_LOCKS
+ assert(lock_held_by_me(&x->lock));
+#endif
+ switch (ctype) {
+ case xive_cache_eqc:
+ sreg = VC_EQC_CWATCH_SPEC;
+ sregx = X_VC_EQC_CWATCH_SPEC;
+ dreg0 = VC_EQC_CWATCH_DAT0;
+ dreg0x = X_VC_EQC_CWATCH_DAT0;
+ sval = SETFIELD(VC_EQC_CWATCH_BLOCKID, idx, block);
+ break;
+ case xive_cache_vpc:
+ sreg = PC_VPC_CWATCH_SPEC;
+ sregx = X_PC_VPC_CWATCH_SPEC;
+ dreg0 = PC_VPC_CWATCH_DAT0;
+ dreg0x = X_PC_VPC_CWATCH_DAT0;
+ sval = SETFIELD(PC_VPC_CWATCH_BLOCKID, idx, block);
+ break;
+ default:
+ return OPAL_INTERNAL_ERROR;
+ }
+
+ /* The full bit is in the same position for EQC and VPC */
+ if (!light_watch)
+ sval |= VC_EQC_CWATCH_FULL;
+
+ for (;;) {
+ /* Write the cache watch spec */
+ __xive_regw(x, sreg, sregx, sval, NULL);
+
+ /* Load data0 register to populate the watch */
+ dval0 = __xive_regr(x, dreg0, dreg0x, NULL);
+
+ /* If new_data is NULL, this is a dummy watch used as a
+ * workaround for a HW bug
+ */
+ if (!new_data) {
+ __xive_regw(x, dreg0, dreg0x, dval0, NULL);
+ return 0;
+ }
+
+ /* Write the words into the watch facility. We write in reverse
+ * order in case word 0 is part of it as it must be the last
+ * one written.
+ */
+ for (i = start_dword + dword_count - 1; i >= start_dword ;i--) {
+ uint64_t dw = be64_to_cpu(new_data[i - start_dword]);
+ __xive_regw(x, dreg0 + i * 8, dreg0x + i, dw, NULL);
+ }
+
+ /* Write data0 register to trigger the update if word 0 wasn't
+ * written above
+ */
+ if (start_dword > 0)
+ __xive_regw(x, dreg0, dreg0x, dval0, NULL);
+
+ /* This may not be necessary for light updates (it's possible
+ * that a sync is sufficient, TBD). Ensure the above is
+ * complete and check the status of the watch.
+ */
+ status = __xive_regr(x, sreg, sregx, NULL);
+
+ /* Bits FULL and CONFLICT are in the same position in
+ * EQC and VPC
+ */
+ if (!(status & VC_EQC_CWATCH_FULL) ||
+ !(status & VC_EQC_CWATCH_CONFLICT))
+ break;
+ if (!synchronous)
+ return OPAL_BUSY;
+
+ if (++retries == XIVE_CACHE_WATCH_MAX_RETRIES) {
+ xive_err(x, "Reached maximum retries %d when doing "
+ "a %s cache update\n", retries,
+ ctype == xive_cache_eqc ? "EQC" : "VPC");
+ return OPAL_BUSY;
+ }
+ }
+
+ /* Perform a scrub with "want_invalidate" set to false to push the
+ * cache updates to memory as well
+ */
+ return __xive_cache_scrub(x, ctype, block, idx, false, false);
+}
+
+static int64_t xive_escalation_ive_cache_update(struct xive *x, uint64_t block,
+ uint64_t idx, struct xive_ive *ive,
+ bool synchronous)
+{
+ return __xive_cache_watch(x, xive_cache_eqc, block, idx,
+ 2, 1, &ive->w, true, synchronous);
+}
+
+static int64_t xive_eqc_cache_update(struct xive *x, uint64_t block,
+ uint64_t idx, struct xive_eq *eq,
+ bool synchronous)
+{
+ return __xive_cache_watch(x, xive_cache_eqc, block, idx,
+ 0, 4, (__be64 *)eq, false, synchronous);
+}
+
+static int64_t xive_vpc_cache_update(struct xive *x, uint64_t block,
+ uint64_t idx, struct xive_vp *vp,
+ bool synchronous)
+{
+ return __xive_cache_watch(x, xive_cache_vpc, block, idx,
+ 0, 8, (__be64 *)vp, false, synchronous);
+}
+
+static bool xive_set_vsd(struct xive *x, uint32_t tbl, uint32_t idx, uint64_t v)
+{
+ /* Set VC version */
+ xive_regw(x, VC_VSD_TABLE_ADDR,
+ SETFIELD(VST_TABLE_SELECT, 0ull, tbl) |
+ SETFIELD(VST_TABLE_OFFSET, 0ull, idx));
+ if (x->last_reg_error)
+ return false;
+ xive_regw(x, VC_VSD_TABLE_DATA, v);
+ if (x->last_reg_error)
+ return false;
+
+ /* Except for IRQ table, also set PC version */
+ if (tbl == VST_TSEL_IRQ)
+ return true;
+
+ xive_regw(x, PC_VSD_TABLE_ADDR,
+ SETFIELD(VST_TABLE_SELECT, 0ull, tbl) |
+ SETFIELD(VST_TABLE_OFFSET, 0ull, idx));
+ if (x->last_reg_error)
+ return false;
+ xive_regw(x, PC_VSD_TABLE_DATA, v);
+ if (x->last_reg_error)
+ return false;
+ return true;
+}
+
+static bool xive_set_local_tables(struct xive *x)
+{
+ uint64_t base, i;
+
+ /* These have to be power of 2 sized */
+ assert(is_pow2(SBE_SIZE));
+ assert(is_pow2(IVT_SIZE));
+
+ /* All tables set as exclusive */
+ base = SETFIELD(VSD_MODE, 0ull, VSD_MODE_EXCLUSIVE);
+
+ /* Set IVT as direct mode */
+ if (!xive_set_vsd(x, VST_TSEL_IVT, x->block_id, base |
+ (((uint64_t)x->ivt_base) & VSD_ADDRESS_MASK) |
+ SETFIELD(VSD_TSIZE, 0ull, ilog2(IVT_SIZE) - 12)))
+ return false;
+
+ /* Set SBE as direct mode */
+ if (!xive_set_vsd(x, VST_TSEL_SBE, x->block_id, base |
+ (((uint64_t)x->sbe_base) & VSD_ADDRESS_MASK) |
+ SETFIELD(VSD_TSIZE, 0ull, ilog2(SBE_SIZE) - 12)))
+ return false;
+
+ /* Set EQDT as indirect mode with 64K subpages */
+ if (!xive_set_vsd(x, VST_TSEL_EQDT, x->block_id, base |
+ (((uint64_t)x->eq_ind_base) & VSD_ADDRESS_MASK) |
+ VSD_INDIRECT | SETFIELD(VSD_TSIZE, 0ull, 4)))
+ return false;
+
+ /* Set VPDT as indirect mode with 64K subpages */
+ if (!xive_set_vsd(x, VST_TSEL_VPDT, x->block_id, base |
+ (((uint64_t)x->vp_ind_base) & VSD_ADDRESS_MASK) |
+ VSD_INDIRECT | SETFIELD(VSD_TSIZE, 0ull, 4)))
+ return false;
+
+ /* Setup queue overflows */
+ for (i = 0; i < VC_QUEUE_OVF_COUNT; i++) {
+ u64 addr = ((uint64_t)x->q_ovf) + i * PAGE_SIZE;
+ u64 cfg, sreg, sregx;
+
+ if (!xive_set_vsd(x, VST_TSEL_IRQ, i, base |
+ (addr & VSD_ADDRESS_MASK) |
+ SETFIELD(VSD_TSIZE, 0ull, 4)))
+ return false;
+ sreg = VC_IRQ_CONFIG_IPI + i * 8;
+ sregx = X_VC_IRQ_CONFIG_IPI + i;
+ cfg = __xive_regr(x, sreg, sregx, NULL);
+ cfg |= VC_IRQ_CONFIG_MEMB_EN;
+ cfg = SETFIELD(VC_IRQ_CONFIG_MEMB_SZ, cfg, 4);
+ __xive_regw(x, sreg, sregx, cfg, NULL);
+ }
+
+ return true;
+}
+
+static bool xive_configure_bars(struct xive *x)
+{
+ uint64_t chip_id = x->chip_id;
+ uint64_t val;
+
+ /* IC BAR */
+ phys_map_get(chip_id, XIVE_IC, 0, (uint64_t *)&x->ic_base, &x->ic_size);
+ val = (uint64_t)x->ic_base | CQ_IC_BAR_VALID | CQ_IC_BAR_64K;
+ x->ic_shift = 16;
+
+ xive_regwx(x, CQ_IC_BAR, val);
+ if (x->last_reg_error)
+ return false;
+
+ /* TM BAR, only configure TM1. Note that this has the same address
+ * for each chip !!! Hence we create a fake chip 0 and use that for
+ * all phys_map_get(XIVE_TM) calls.
+ */
+ phys_map_get(0, XIVE_TM, 0, (uint64_t *)&x->tm_base, &x->tm_size);
+ val = (uint64_t)x->tm_base | CQ_TM_BAR_VALID | CQ_TM_BAR_64K;
+ x->tm_shift = 16;
+
+ xive_regwx(x, CQ_TM1_BAR, val);
+ if (x->last_reg_error)
+ return false;
+ xive_regwx(x, CQ_TM2_BAR, 0);
+ if (x->last_reg_error)
+ return false;
+
+ /* PC BAR. Clear first, write mask, then write value */
+ phys_map_get(chip_id, XIVE_PC, 0, (uint64_t *)&x->pc_base, &x->pc_size);
+ xive_regwx(x, CQ_PC_BAR, 0);
+ if (x->last_reg_error)
+ return false;
+ val = ~(x->pc_size - 1) & CQ_PC_BARM_MASK;
+ xive_regwx(x, CQ_PC_BARM, val);
+ if (x->last_reg_error)
+ return false;
+ val = (uint64_t)x->pc_base | CQ_PC_BAR_VALID;
+ xive_regwx(x, CQ_PC_BAR, val);
+ if (x->last_reg_error)
+ return false;
+
+ /* VC BAR. Clear first, write mask, then write value */
+ phys_map_get(chip_id, XIVE_VC, 0, (uint64_t *)&x->vc_base, &x->vc_size);
+ xive_regwx(x, CQ_VC_BAR, 0);
+ if (x->last_reg_error)
+ return false;
+ val = ~(x->vc_size - 1) & CQ_VC_BARM_MASK;
+ xive_regwx(x, CQ_VC_BARM, val);
+ if (x->last_reg_error)
+ return false;
+ val = (uint64_t)x->vc_base | CQ_VC_BAR_VALID;
+ xive_regwx(x, CQ_VC_BAR, val);
+ if (x->last_reg_error)
+ return false;
+
+ /* Calculate some MMIO bases in the VC BAR */
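+ /* The first VC_ESB_SETS sets (each vc_size / VC_MAX_SETS bytes) hold
+ * the IPI ESB pages; the END (EQ) ESB pages start right after them,
+ * which is what the eq_mmio computation below reflects.
+ */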
+ x->esb_mmio = x->vc_base;
+ x->eq_mmio = x->vc_base + (x->vc_size / VC_MAX_SETS) * VC_ESB_SETS;
+
+ /* Print things out */
+ xive_dbg(x, "IC: %14p [0x%012llx/%d]\n", x->ic_base, x->ic_size,
+ x->ic_shift);
+ xive_dbg(x, "TM: %14p [0x%012llx/%d]\n", x->tm_base, x->tm_size,
+ x->tm_shift);
+ xive_dbg(x, "PC: %14p [0x%012llx]\n", x->pc_base, x->pc_size);
+ xive_dbg(x, "VC: %14p [0x%012llx]\n", x->vc_base, x->vc_size);
+
+ return true;
+}
+
+static void xive_dump_mmio(struct xive *x)
+{
+ prlog(PR_DEBUG, " CQ_CFG_PB_GEN = %016llx\n",
+ in_be64(x->ic_base + CQ_CFG_PB_GEN));
+ prlog(PR_DEBUG, " CQ_MSGSND = %016llx\n",
+ in_be64(x->ic_base + CQ_MSGSND));
+}
+
+static bool xive_config_init(struct xive *x)
+{
+ uint64_t val;
+
+ /* Configure PC and VC page sizes and disable Linux trigger mode */
+ xive_regwx(x, CQ_PBI_CTL, CQ_PBI_PC_64K | CQ_PBI_VC_64K | CQ_PBI_FORCE_TM_LOCAL);
+ if (x->last_reg_error)
+ return false;
+
+ /*** The rest can use MMIO ***/
+
+ /* Enable indirect mode in VC config */
+ val = xive_regr(x, VC_GLOBAL_CONFIG);
+ val |= VC_GCONF_INDIRECT;
+ xive_regw(x, VC_GLOBAL_CONFIG, val);
+
+ /* Enable indirect mode in PC config */
+ val = xive_regr(x, PC_GLOBAL_CONFIG);
+ val |= PC_GCONF_INDIRECT;
+ val |= PC_GCONF_CHIPID_OVR;
+ val = SETFIELD(PC_GCONF_CHIPID, val, x->block_id);
+ xive_regw(x, PC_GLOBAL_CONFIG, val);
+ xive_dbg(x, "PC_GLOBAL_CONFIG=%016llx\n", val);
+
+ val = xive_regr(x, PC_TCTXT_CFG);
+ val |= PC_TCTXT_CFG_BLKGRP_EN | PC_TCTXT_CFG_HARD_CHIPID_BLK;
+ val |= PC_TCTXT_CHIPID_OVERRIDE;
+ val |= PC_TCTXT_CFG_TARGET_EN;
+ val = SETFIELD(PC_TCTXT_CHIPID, val, x->block_id);
+ val = SETFIELD(PC_TCTXT_INIT_AGE, val, 0x2);
+ val |= PC_TCTXT_CFG_LGS_EN;
+ /* Disable pressure relief as we hijack the field in the VPs */
+ val &= ~PC_TCTXT_CFG_STORE_ACK;
+ if (this_cpu()->is_fused_core)
+ val |= PC_TCTXT_CFG_FUSE_CORE_EN;
+ else
+ val &= ~PC_TCTXT_CFG_FUSE_CORE_EN;
+ xive_regw(x, PC_TCTXT_CFG, val);
+ xive_dbg(x, "PC_TCTXT_CFG=%016llx\n", val);
+
+ val = xive_regr(x, CQ_CFG_PB_GEN);
+ /* 1-block-per-chip mode */
+ val = SETFIELD(CQ_INT_ADDR_OPT, val, 2);
+ xive_regw(x, CQ_CFG_PB_GEN, val);
+
+ /* Enable StoreEOI */
+ val = xive_regr(x, VC_SBC_CONFIG);
+ if (XIVE_CAN_STORE_EOI(x))
+ val |= VC_SBC_CONF_CPLX_CIST | VC_SBC_CONF_CIST_BOTH;
+ else
+ xive_dbg(x, "store EOI is disabled\n");
+
+ val |= VC_SBC_CONF_NO_UPD_PRF;
+ xive_regw(x, VC_SBC_CONFIG, val);
+
+ /* Disable block tracking on Nimbus (we may want to enable
+ * it on Cumulus later). HW errata.
+ */
+ val = xive_regr(x, PC_TCTXT_TRACK);
+ val &= ~PC_TCTXT_TRACK_EN;
+ xive_regw(x, PC_TCTXT_TRACK, val);
+
+ /* Enable relaxed ordering of trigger forwarding */
+ val = xive_regr(x, VC_AIB_TX_ORDER_TAG2);
+ val |= VC_AIB_TX_ORDER_TAG2_REL_TF;
+ xive_regw(x, VC_AIB_TX_ORDER_TAG2, val);
+
+ /* Enable new END s and u bits for silent escalate */
+ val = xive_regr(x, VC_EQC_CONFIG);
+ val |= VC_EQC_CONF_ENABLE_END_s_BIT;
+ val |= VC_EQC_CONF_ENABLE_END_u_BIT;
+ xive_regw(x, VC_EQC_CONFIG, val);
+
+ /* Disable error reporting in the FIR for info errors
+ * from the VC.
+ */
+ xive_regw(x, CQ_FIRMASK_OR, CQ_FIR_VC_INFO_ERROR_0_1);
+
+ /* Mask CI Load and Store to bad location, as IPI trigger
+ * pages may be mapped to user space, and a read on the
+ * trigger page causes a checkstop
+ */
+ xive_regw(x, CQ_FIRMASK_OR, CQ_FIR_PB_RCMDX_CI_ERR1);
+
+ return true;
+}
+
+static bool xive_setup_set_xlate(struct xive *x)
+{
+ unsigned int i;
+
+ /* Configure EDT for ESBs (aka IPIs) */
+ xive_regw(x, CQ_TAR, CQ_TAR_TBL_AUTOINC | CQ_TAR_TSEL_EDT);
+ if (x->last_reg_error)
+ return false;
+ for (i = 0; i < VC_ESB_SETS; i++) {
+ xive_regw(x, CQ_TDR,
+ /* IPI type */
+ (1ull << 62) |
+ /* block ID */
+ (((uint64_t)x->block_id) << 48) |
+ /* offset */
+ (((uint64_t)i) << 32));
+ if (x->last_reg_error)
+ return false;
+ }
+
+ /* Configure EDT for ENDs (aka EQs) */
+ for (i = 0; i < VC_END_SETS; i++) {
+ xive_regw(x, CQ_TDR,
+ /* EQ type */
+ (2ull << 62) |
+ /* block ID */
+ (((uint64_t)x->block_id) << 48) |
+ /* offset */
+ (((uint64_t)i) << 32));
+ if (x->last_reg_error)
+ return false;
+ }
+
+ /* Configure VDT */
+ xive_regw(x, CQ_TAR, CQ_TAR_TBL_AUTOINC | CQ_TAR_TSEL_VDT);
+ if (x->last_reg_error)
+ return false;
+ for (i = 0; i < PC_MAX_SETS; i++) {
+ xive_regw(x, CQ_TDR,
+ /* Valid bit */
+ (1ull << 63) |
+ /* block ID */
+ (((uint64_t)x->block_id) << 48) |
+ /* offset */
+ (((uint64_t)i) << 32));
+ if (x->last_reg_error)
+ return false;
+ }
+ return true;
+}
+
+static bool xive_prealloc_tables(struct xive *x)
+{
+ uint32_t i, vp_init_count, vp_init_base;
+ uint32_t pbase, pend;
+ uint64_t al;
+
+ /* ESB/SBE has 4 entries per byte */
+ x->sbe_base = local_alloc(x->chip_id, SBE_SIZE, SBE_SIZE);
+ if (!x->sbe_base) {
+ xive_err(x, "Failed to allocate SBE\n");
+ return false;
+ }
+ /* SBEs are initialized to 0b01 which corresponds to "ints off" */
+ memset(x->sbe_base, 0x55, SBE_SIZE);
+ xive_dbg(x, "SBE at %p size 0x%lx\n", x->sbe_base, SBE_SIZE);
+
+ /* EAS/IVT entries are 8 bytes */
+ x->ivt_base = local_alloc(x->chip_id, IVT_SIZE, IVT_SIZE);
+ if (!x->ivt_base) {
+ xive_err(x, "Failed to allocate IVT\n");
+ return false;
+ }
+ /* We clear the entries (non-valid). They will be initialized
+ * when actually used
+ */
+ memset(x->ivt_base, 0, IVT_SIZE);
+ xive_dbg(x, "IVT at %p size 0x%lx\n", x->ivt_base, IVT_SIZE);
+
+ /* Indirect EQ table. Limited to one top page. */
+ al = ALIGN_UP(XIVE_EQ_TABLE_SIZE, PAGE_SIZE);
+ if (al > PAGE_SIZE) {
+ xive_err(x, "EQ indirect table is too big !\n");
+ return false;
+ }
+ x->eq_ind_base = local_alloc(x->chip_id, al, al);
+ if (!x->eq_ind_base) {
+ xive_err(x, "Failed to allocate EQ indirect table\n");
+ return false;
+ }
+ memset(x->eq_ind_base, 0, al);
+ xive_dbg(x, "EQi at %p size 0x%llx\n", x->eq_ind_base, al);
+ x->eq_ind_count = XIVE_EQ_TABLE_SIZE / XIVE_VSD_SIZE;
+
+ /* Indirect VP table. Limited to one top page. */
+ al = ALIGN_UP(XIVE_VP_TABLE_SIZE, PAGE_SIZE);
+ if (al > PAGE_SIZE) {
+ xive_err(x, "VP indirect table is too big !\n");
+ return false;
+ }
+ x->vp_ind_base = local_alloc(x->chip_id, al, al);
+ if (!x->vp_ind_base) {
+ xive_err(x, "Failed to allocate VP indirect table\n");
+ return false;
+ }
+ xive_dbg(x, "VPi at %p size 0x%llx\n", x->vp_ind_base, al);
+ x->vp_ind_count = XIVE_VP_TABLE_SIZE / XIVE_VSD_SIZE;
+ memset(x->vp_ind_base, 0, al);
+
+ /* Populate/initialize VP/EQs indirect backing */
+ vp_init_count = XIVE_HW_VP_COUNT;
+ vp_init_base = XIVE_HW_VP_BASE;
+
+ /* Allocate pages for some VPs in indirect mode */
+ pbase = vp_init_base / VP_PER_PAGE;
+ pend = (vp_init_base + vp_init_count) / VP_PER_PAGE;
+
+ xive_dbg(x, "Allocating pages %d to %d of VPs (for %d VPs)\n",
+ pbase, pend, vp_init_count);
+ for (i = pbase; i <= pend; i++) {
+ void *page;
+ u64 vsd;
+
+ /* Indirect entries have a VSD format */
+ page = local_alloc(x->chip_id, PAGE_SIZE, PAGE_SIZE);
+ if (!page) {
+ xive_err(x, "Failed to allocate VP page\n");
+ return false;
+ }
+ xive_dbg(x, "VP%d at %p size 0x%x\n", i, page, PAGE_SIZE);
+ memset(page, 0, PAGE_SIZE);
+ vsd = ((uint64_t)page) & VSD_ADDRESS_MASK;
+
+ vsd |= SETFIELD(VSD_TSIZE, 0ull, 4);
+ vsd |= SETFIELD(VSD_MODE, 0ull, VSD_MODE_EXCLUSIVE);
+ vsd |= VSD_FIRMWARE;
+ x->vp_ind_base[i] = cpu_to_be64(vsd);
+ }
+
+ /* Allocate the queue overflow pages */
+ x->q_ovf = local_alloc(x->chip_id, VC_QUEUE_OVF_COUNT * PAGE_SIZE, PAGE_SIZE);
+ if (!x->q_ovf) {
+ xive_err(x, "Failed to allocate queue overflow\n");
+ return false;
+ }
+ return true;
+}
+
+static void xive_add_provisioning_properties(void)
+{
+ __be32 chips[XIVE_MAX_CHIPS];
+ uint32_t i, count;
+
+ dt_add_property_cells(xive_dt_node,
+ "ibm,xive-provision-page-size", PAGE_SIZE);
+
+ count = 1 << xive_chips_alloc_bits;
+ for (i = 0; i < count; i++)
+ chips[i] = cpu_to_be32(xive_block_to_chip[i]);
+ dt_add_property(xive_dt_node, "ibm,xive-provision-chips",
+ chips, 4 * count);
+}
+
+static void xive_create_mmio_dt_node(struct xive *x)
+{
+ uint64_t tb = (uint64_t)x->tm_base;
+ uint32_t stride = 1u << x->tm_shift;
+
+ xive_dt_node = dt_new_addr(dt_root, "interrupt-controller", tb);
+ assert(xive_dt_node);
+
+ dt_add_property_u64s(xive_dt_node, "reg",
+ tb + 0 * stride, stride,
+ tb + 1 * stride, stride,
+ tb + 2 * stride, stride,
+ tb + 3 * stride, stride);
+
+ dt_add_property_strings(xive_dt_node, "compatible",
+ "ibm,opal-xive-pe", "ibm,opal-intc");
+
+ dt_add_property_cells(xive_dt_node, "ibm,xive-eq-sizes",
+ 12, 16, 21, 24);
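+ /* Orders of the supported queue sizes (4K, 64K, 2M and 16M bytes) */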
+
+ dt_add_property_cells(xive_dt_node, "ibm,xive-#priorities",
+ NUM_INT_PRIORITIES);
+ dt_add_property(xive_dt_node, "single-escalation-support", NULL, 0);
+
+ xive_add_provisioning_properties();
+}
+
+static void xive_setup_forward_ports(struct xive *x, struct proc_chip *remote_chip)
+{
+ struct xive *remote_xive = remote_chip->xive;
+ uint64_t base = SETFIELD(VSD_MODE, 0ull, VSD_MODE_FORWARD);
+ uint32_t remote_id = remote_xive->block_id;
+ uint64_t nport;
+
+ /* ESB(SBE), EAS(IVT) and END(EQ) point to the notify port */
+ nport = ((uint64_t)remote_xive->ic_base) + (1ul << remote_xive->ic_shift);
+ if (!xive_set_vsd(x, VST_TSEL_IVT, remote_id, base | nport))
+ goto error;
+ if (!xive_set_vsd(x, VST_TSEL_SBE, remote_id, base | nport))
+ goto error;
+ if (!xive_set_vsd(x, VST_TSEL_EQDT, remote_id, base | nport))
+ goto error;
+
+ /* NVT/VPD points to the remote NVT MMIO sets */
+ if (!xive_set_vsd(x, VST_TSEL_VPDT, remote_id,
+ base | ((uint64_t)remote_xive->pc_base) |
+ SETFIELD(VSD_TSIZE, 0ull, ilog2(x->pc_size) - 12)))
+ goto error;
+
+ return;
+
+ error:
+ xive_err(x, "Failure configuring forwarding ports\n");
+}
+
+static void late_init_one_xive(struct xive *x)
+{
+ struct proc_chip *chip;
+
+ /* We need to set up the cross-chip forward ports. Let's
+ * iterate over all chips and set them up accordingly
+ */
+ for_each_chip(chip) {
+ /* We skip ourselves or chips without a xive */
+ if (chip->xive == x || !chip->xive)
+ continue;
+
+ /* Setup our forward ports to that chip */
+ xive_setup_forward_ports(x, chip);
+ }
+}
+
+static bool xive_check_ipi_free(struct xive *x, uint32_t irq, uint32_t count)
+{
+ uint32_t i, idx = GIRQ_TO_IDX(irq);
+
+ for (i = 0; i < count; i++)
+ if (bitmap_tst_bit(*x->ipi_alloc_map, idx + i))
+ return false;
+ return true;
+}
+
+uint32_t xive_alloc_hw_irqs(uint32_t chip_id, uint32_t count, uint32_t align)
+{
+ struct proc_chip *chip = get_chip(chip_id);
+ struct xive *x;
+ uint32_t base, i;
+
+ assert(chip);
+ assert(is_pow2(align));
+
+ x = chip->xive;
+ assert(x);
+
+ lock(&x->lock);
+
+ /* Allocate the HW interrupts */
+ base = x->int_hw_bot - count;
+ base &= ~(align - 1);
+ if (base < x->int_ipi_top) {
+ xive_err(x,
+ "HW alloc request for %d interrupts aligned to %d failed\n",
+ count, align);
+ unlock(&x->lock);
+ return XIVE_IRQ_ERROR;
+ }
+ if (!xive_check_ipi_free(x, base, count)) {
+ xive_err(x, "HWIRQ boot allocator request overlaps dynamic allocator\n");
+ unlock(&x->lock);
+ return XIVE_IRQ_ERROR;
+ }
+
+ x->int_hw_bot = base;
+
+ /* Initialize the corresponding IVT entries to sane defaults,
+ * i.e. the entry is valid, masked and not routed, and the EQ data is
+ * set to the GIRQ number.
+ */
+ for (i = 0; i < count; i++) {
+ struct xive_ive *ive = xive_get_ive(x, base + i);
+
+ ive->w = xive_set_field64(IVE_VALID, 0ul, 1) |
+ xive_set_field64(IVE_MASKED, 0ul, 1) |
+ xive_set_field64(IVE_EQ_DATA, 0ul, base + i);
+ }
+
+ unlock(&x->lock);
+ return base;
+}
+
+uint32_t xive_alloc_ipi_irqs(uint32_t chip_id, uint32_t count, uint32_t align)
+{
+ struct proc_chip *chip = get_chip(chip_id);
+ struct xive *x;
+ uint32_t base, i;
+
+ assert(chip);
+ assert(is_pow2(align));
+
+ x = chip->xive;
+ assert(x);
+
+ lock(&x->lock);
+
+ /* Allocate the IPI interrupts */
+ base = x->int_ipi_top + (align - 1);
+ base &= ~(align - 1);
+ if (base >= x->int_hw_bot) {
+ xive_err(x,
+ "IPI alloc request for %d interrupts aligned to %d failed\n",
+ count, align);
+ unlock(&x->lock);
+ return XIVE_IRQ_ERROR;
+ }
+ if (!xive_check_ipi_free(x, base, count)) {
+ xive_err(x, "IPI boot allocator request overlaps dynamic allocator\n");
+ unlock(&x->lock);
+ return XIVE_IRQ_ERROR;
+ }
+
+ x->int_ipi_top = base + count;
+
+ /* Initialize the corresponding IVT entries to sane defaults,
+ * i.e. the entry is valid, masked and not routed, and the EQ data is
+ * set to the GIRQ number.
+ */
+ for (i = 0; i < count; i++) {
+ struct xive_ive *ive = xive_get_ive(x, base + i);
+
+ ive->w = xive_set_field64(IVE_VALID, 0ul, 1) |
+ xive_set_field64(IVE_MASKED, 0ul, 1) |
+ xive_set_field64(IVE_EQ_DATA, 0ul, base + i);
+ }
+
+ unlock(&x->lock);
+ return base;
+}
+
+void *xive_get_trigger_port(uint32_t girq)
+{
+ uint32_t idx = GIRQ_TO_IDX(girq);
+ struct xive *x;
+
+ /* Find XIVE on which the IVE resides */
+ x = xive_from_isn(girq);
+ if (!x)
+ return NULL;
+
+ if (GIRQ_IS_ESCALATION(girq)) {
+ /* There is no trigger page for escalation interrupts */
+ return NULL;
+ } else {
+ /* Make sure it's an IPI on that chip */
+ if (girq < x->int_base ||
+ girq >= x->int_ipi_top)
+ return NULL;
+
+ return x->esb_mmio + idx * XIVE_ESB_PAGE_SIZE;
+ }
+}
+
+uint64_t xive_get_notify_port(uint32_t chip_id, uint32_t ent)
+{
+ struct proc_chip *chip = get_chip(chip_id);
+ struct xive *x;
+ uint32_t offset = 0;
+
+ assert(chip);
+ x = chip->xive;
+ assert(x);
+
+ /* This is where we can assign a different HW queue to a different
+ * source by offsetting into the cache lines of the notify port
+ *
+ * For now we keep it very basic, this will have to be looked at
+ * again on real HW with some proper performance analysis.
+ *
+ * Here's what Florian says on the matter:
+ *
+ * <<
+ * The first 2k of the notify port page can all be used for PCIe triggers
+ *
+ * However the idea would be that we try to use the first 4 cache lines to
+ * balance the PCIe Interrupt requests to use the least used snoop buses
+ * (we went from 2 to 4 snoop buses for P9). snoop 0 is heavily used
+ * (I think TLBIs are using that in addition to the normal addresses),
+ * snoop 3 is used for all Int commands, so I think snoop 2 (CL 2 in the
+ * page) is the least used overall. So we probably should that one for
+ * the Int commands from PCIe.
+ *
+ * In addition, our EAS cache supports hashing to provide "private" cache
+ * areas for the PHBs in the shared 1k EAS cache. This allows e.g. to avoid
+ * that one "thrashing" PHB thrashes the EAS cache for everyone, or provide
+ * a PHB with a private area that would allow high cache hits in case of a
+ * device using very few interrupts. The hashing is based on the offset within
+ * the cache line. So using that, you can e.g. set the EAS cache up so that
+ * IPIs use 512 entries, the x16 PHB uses 256 entries and the x8 PHBs 128
+ * entries each - or IPIs using all entries and sharing with PHBs, so PHBs
+ * would use 512 entries and 256 entries respectively.
+ *
+ * This is a tuning we would probably do later in the lab, but as a "prep"
+ * we should set up the different PHBs such that they are using different
+ * 8B-aligned offsets within the cache line, so e.g.
+ * PH4_0 addr 0x100 (CL 2 DW0
+ * PH4_1 addr 0x108 (CL 2 DW1)
+ * PH4_2 addr 0x110 (CL 2 DW2)
+ * etc.
+ * >>
+ *
+ * I'm using snoop1 for PHB0 and snoop2 for everybody else.
+ */
+ switch(ent) {
+ case XIVE_HW_SRC_PHBn(0):
+ offset = 0x100;
+ break;
+ case XIVE_HW_SRC_PHBn(1):
+ offset = 0x208;
+ break;
+ case XIVE_HW_SRC_PHBn(2):
+ offset = 0x210;
+ break;
+ case XIVE_HW_SRC_PHBn(3):
+ offset = 0x218;
+ break;
+ case XIVE_HW_SRC_PHBn(4):
+ offset = 0x220;
+ break;
+ case XIVE_HW_SRC_PHBn(5):
+ offset = 0x228;
+ break;
+ case XIVE_HW_SRC_PSI:
+ offset = 0x230;
+ break;
+ default:
+ assert(false);
+ return 0;
+ }
+
+ /* Notify port is the second page of the IC BAR */
+ return ((uint64_t)x->ic_base) + (1ul << x->ic_shift) + offset;
+}
+
+/* Manufacture the powerbus packet bits 32:63 */
+__attrconst uint32_t xive_get_notify_base(uint32_t girq)
+{
+ return (GIRQ_TO_BLK(girq) << 28) | GIRQ_TO_IDX(girq);
+}
+
+static bool xive_get_irq_targetting(uint32_t isn, uint32_t *out_target,
+ uint8_t *out_prio, uint32_t *out_lirq)
+{
+ struct xive_ive *ive;
+ struct xive *x, *eq_x;
+ struct xive_eq *eq;
+ uint32_t eq_blk, eq_idx;
+ uint32_t vp_blk __unused, vp_idx;
+ uint32_t prio, server;
+ bool is_escalation = GIRQ_IS_ESCALATION(isn);
+
+ /* Find XIVE on which the IVE resides */
+ x = xive_from_isn(isn);
+ if (!x)
+ return false;
+ /* Grab the IVE */
+ ive = xive_get_ive(x, isn);
+ if (!ive)
+ return false;
+ if (!xive_get_field64(IVE_VALID, ive->w) && !is_escalation) {
+ xive_err(x, "ISN %x lead to invalid IVE !\n", isn);
+ return false;
+ }
+
+ if (out_lirq)
+ *out_lirq = xive_get_field64(IVE_EQ_DATA, ive->w);
+
+ /* Find the EQ and its xive instance */
+ eq_blk = xive_get_field64(IVE_EQ_BLOCK, ive->w);
+ eq_idx = xive_get_field64(IVE_EQ_INDEX, ive->w);
+ eq_x = xive_from_vc_blk(eq_blk);
+
+ /* This can fail if the interrupt hasn't been initialized yet
+ * but it should also be masked, so fail silently
+ */
+ if (!eq_x)
+ goto pick_default;
+ eq = xive_get_eq(eq_x, eq_idx);
+ if (!eq)
+ goto pick_default;
+
+ /* XXX Check valid and format 0 */
+
+ /* No priority conversion, return the actual one ! */
+ if (xive_get_field64(IVE_MASKED, ive->w))
+ prio = 0xff;
+ else
+ prio = xive_get_field32(EQ_W7_F0_PRIORITY, eq->w7);
+ if (out_prio)
+ *out_prio = prio;
+
+ vp_blk = xive_get_field32(EQ_W6_NVT_BLOCK, eq->w6);
+ vp_idx = xive_get_field32(EQ_W6_NVT_INDEX, eq->w6);
+ server = VP2PIR(vp_blk, vp_idx);
+
+ if (out_target)
+ *out_target = server;
+
+ xive_vdbg(eq_x, "EQ info for ISN %x: prio=%d, server=0x%x (VP %x/%x)\n",
+ isn, prio, server, vp_blk, vp_idx);
+ return true;
+
+pick_default:
+ xive_vdbg(eq_x, "EQ info for ISN %x: Using masked defaults\n", isn);
+
+ if (out_prio)
+ *out_prio = 0xff;
+ /* Pick a default; "me" (the current CPU) will be fine ... */
+ if (out_target)
+ *out_target = mfspr(SPR_PIR);
+ return true;
+}
+
+static inline bool xive_eq_for_target(uint32_t target, uint8_t prio,
+ uint32_t *out_eq_blk,
+ uint32_t *out_eq_idx)
+{
+ struct xive *x;
+ struct xive_vp *vp;
+ uint32_t vp_blk, vp_idx;
+ uint32_t eq_blk, eq_idx;
+
+ if (prio > XIVE_MAX_PRIO)
+ return false;
+
+ /* Get the VP block/index from the target word */
+ if (!xive_decode_vp(target, &vp_blk, &vp_idx, NULL, NULL))
+ return false;
+
+ /* Grab the target VP's XIVE */
+ x = xive_from_pc_blk(vp_blk);
+ if (!x)
+ return false;
+
+ /* Find the VP structure where we stashed the EQ number */
+ vp = xive_get_vp(x, vp_idx);
+ if (!vp)
+ return false;
+
+ /* Grab it, it's in the pressure relief interrupt field,
+ * top 4 bits are the block (word 1).
+ */
+ eq_blk = be32_to_cpu(vp->w1) >> 28;
+ eq_idx = be32_to_cpu(vp->w1) & 0x0fffffff;
+
+ /* Currently the EQ block and VP block should be the same */
+ if (eq_blk != vp_blk) {
+ xive_err(x, "eq_blk != vp_blk (%d vs. %d) for target 0x%08x/%d\n",
+ eq_blk, vp_blk, target, prio);
+ return false;
+ }
+
+ if (out_eq_blk)
+ *out_eq_blk = eq_blk;
+ if (out_eq_idx)
+ *out_eq_idx = eq_idx + prio;
+
+ return true;
+}
+
+static int64_t xive_set_irq_targetting(uint32_t isn, uint32_t target,
+ uint8_t prio, uint32_t lirq,
+ bool synchronous)
+{
+ struct xive *x;
+ struct xive_ive *ive, new_ive;
+ uint32_t eq_blk, eq_idx;
+ bool is_escalation = GIRQ_IS_ESCALATION(isn);
+ int64_t rc;
+
+ /* Find XIVE on which the IVE resides */
+ x = xive_from_isn(isn);
+ if (!x)
+ return OPAL_PARAMETER;
+ /* Grab the IVE */
+ ive = xive_get_ive(x, isn);
+ if (!ive)
+ return OPAL_PARAMETER;
+ if (!xive_get_field64(IVE_VALID, ive->w) && !is_escalation) {
+ xive_err(x, "ISN %x lead to invalid IVE !\n", isn);
+ return OPAL_PARAMETER;
+ }
+
+ lock(&x->lock);
+
+ /* If using emulation mode, fixup prio to the only supported one */
+ if (xive_mode == XIVE_MODE_EMU && prio != 0xff)
+ prio = XIVE_EMULATION_PRIO;
+
+ /* Read existing IVE */
+ new_ive = *ive;
+
+ /* Are we masking ? */
+ if (prio == 0xff && !is_escalation) {
+ new_ive.w = xive_set_field64(IVE_MASKED, new_ive.w, 1);
+ xive_vdbg(x, "ISN %x masked !\n", isn);
+
+ /* Put prio 7 in the EQ */
+ prio = XIVE_MAX_PRIO;
+ } else {
+ /* Unmasking */
+ new_ive.w = xive_set_field64(IVE_MASKED, new_ive.w, 0);
+ xive_vdbg(x, "ISN %x unmasked !\n", isn);
+
+ /* For normal interrupt sources, keep track of which ones
+ * we ever enabled since the last reset
+ */
+ if (!is_escalation)
+ bitmap_set_bit(*x->int_enabled_map, GIRQ_TO_IDX(isn));
+ }
+
+ /* If prio isn't 0xff, re-target the IVE. First find the EQ
+ * corresponding to the target
+ */
+ if (prio != 0xff) {
+ if (!xive_eq_for_target(target, prio, &eq_blk, &eq_idx)) {
+ xive_err(x, "Can't find EQ for target/prio 0x%x/%d\n",
+ target, prio);
+ unlock(&x->lock);
+ return OPAL_PARAMETER;
+ }
+
+ /* Try to update it atomically to avoid an intermediate
+ * stale state
+ */
+ new_ive.w = xive_set_field64(IVE_EQ_BLOCK, new_ive.w, eq_blk);
+ new_ive.w = xive_set_field64(IVE_EQ_INDEX, new_ive.w, eq_idx);
+ }
+ new_ive.w = xive_set_field64(IVE_EQ_DATA, new_ive.w, lirq);
+
+ xive_vdbg(x,"ISN %x routed to eq %x/%x lirq=%08x IVE=%016llx !\n",
+ isn, eq_blk, eq_idx, lirq, be64_to_cpu(new_ive.w));
+
+ /* Updating the cache differs between real IVEs and escalation
+ * IVEs inside an EQ
+ */
+ if (is_escalation) {
+ rc = xive_escalation_ive_cache_update(x, x->block_id,
+ GIRQ_TO_IDX(isn), &new_ive, synchronous);
+ } else {
+ sync();
+ *ive = new_ive;
+ rc = xive_ivc_scrub(x, x->block_id, GIRQ_TO_IDX(isn));
+ }
+
+ unlock(&x->lock);
+ return rc;
+}
+
+static int64_t xive_source_get_xive(struct irq_source *is __unused,
+ uint32_t isn, uint16_t *server,
+ uint8_t *prio)
+{
+ uint32_t target_id;
+
+ if (xive_get_irq_targetting(isn, &target_id, prio, NULL)) {
+ *server = target_id << 2;
+ return OPAL_SUCCESS;
+ } else
+ return OPAL_PARAMETER;
+}
+
+static void xive_update_irq_mask(struct xive_src *s, uint32_t idx, bool masked)
+{
+ void *mmio_base = s->esb_mmio + (1ul << s->esb_shift) * idx;
+ uint32_t offset;
+
+ /* XXX FIXME: A quick mask/unmask can make us shoot an interrupt
+ * more than once to a queue. We need to keep track better
+ */
+ if (s->flags & XIVE_SRC_EOI_PAGE1)
+ mmio_base += 1ull << (s->esb_shift - 1);
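+ /* A source is masked by setting its ESB to the PQ=01 ("off") state
+ * and unmasked by resetting it to PQ=00 so new events are forwarded
+ * again
+ */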
+ if (masked)
+ offset = XIVE_ESB_SET_PQ_01;
+ else
+ offset = XIVE_ESB_SET_PQ_00;
+
+ in_be64(mmio_base + offset);
+}
+
+static int64_t xive_sync(struct xive *x)
+{
+ uint64_t r;
+ void *p;
+
+ lock(&x->lock);
+
+ /* Second 2K range of second page */
+ p = x->ic_base + (1 << x->ic_shift) + 0x800;
+
+ /* TODO: Make this more fine grained */
+ out_be64(p + (10 << 7), 0); /* Sync OS escalations */
+ out_be64(p + (11 << 7), 0); /* Sync Hyp escalations */
+ out_be64(p + (12 << 7), 0); /* Sync Redistribution */
+ out_be64(p + ( 8 << 7), 0); /* Sync IPI */
+ out_be64(p + ( 9 << 7), 0); /* Sync HW */
+
+#define SYNC_MASK \
+ (VC_EQC_CONF_SYNC_IPI | \
+ VC_EQC_CONF_SYNC_HW | \
+ VC_EQC_CONF_SYNC_ESC1 | \
+ VC_EQC_CONF_SYNC_ESC2 | \
+ VC_EQC_CONF_SYNC_REDI)
+
+ /* XXX Add timeout */
+ for (;;) {
+ r = xive_regr(x, VC_EQC_CONFIG);
+ if ((r & SYNC_MASK) == SYNC_MASK)
+ break;
+ cpu_relax();
+ }
+ xive_regw(x, VC_EQC_CONFIG, r & ~SYNC_MASK);
+
+ /* Work around a HW issue: read back before allowing a new sync */
+ xive_regr(x, VC_GLOBAL_CONFIG);
+
+ unlock(&x->lock);
+
+ return 0;
+}
+
+static int64_t __xive_set_irq_config(struct irq_source *is, uint32_t girq,
+ uint64_t vp, uint8_t prio, uint32_t lirq,
+ bool update_esb, bool sync)
+{
+ struct xive_src *s = container_of(is, struct xive_src, is);
+ uint32_t old_target, vp_blk;
+ u8 old_prio;
+ int64_t rc;
+
+ /* Grab existing target */
+ if (!xive_get_irq_targetting(girq, &old_target, &old_prio, NULL))
+ return OPAL_PARAMETER;
+
+ /* Let XIVE configure the EQ. We do the update without the
+ * synchronous flag, thus a cache update failure will result
+ * in us returning OPAL_BUSY
+ */
+ rc = xive_set_irq_targetting(girq, vp, prio, lirq, false);
+ if (rc)
+ return rc;
+
+ /* Do we need to update the mask ? */
+ if (old_prio != prio && (old_prio == 0xff || prio == 0xff)) {
+ /* The source has special variants of masking/unmasking */
+ if (s->orig_ops && s->orig_ops->set_xive) {
+ /* We don't pass a server on source ops ! Targetting
+ * is handled by the XIVE
+ */
+ rc = s->orig_ops->set_xive(is, girq, 0, prio);
+ } else if (update_esb) {
+ /* Ensure it's enabled/disabled in the source
+ * controller
+ */
+ xive_update_irq_mask(s, girq - s->esb_base,
+ prio == 0xff);
+ }
+ }
+
+ /*
+ * Synchronize the source and old target XIVEs to ensure that
+ * all pending interrupts to the old target have reached their
+ * respective queue.
+ *
+ * WARNING: This assumes the VP and its queues are on the same
+ * XIVE instance !
+ */
+ if (!sync)
+ return OPAL_SUCCESS;
+ xive_sync(s->xive);
+ if (xive_decode_vp(old_target, &vp_blk, NULL, NULL, NULL)) {
+ struct xive *x = xive_from_pc_blk(vp_blk);
+ if (x)
+ xive_sync(x);
+ }
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t xive_set_irq_config(uint32_t girq, uint64_t vp, uint8_t prio,
+ uint32_t lirq, bool update_esb)
+{
+ struct irq_source *is = irq_find_source(girq);
+
+ return __xive_set_irq_config(is, girq, vp, prio, lirq, update_esb,
+ true);
+}
+
+static int64_t xive_source_set_xive(struct irq_source *is,
+ uint32_t isn, uint16_t server, uint8_t prio)
+{
+ /*
+ * WARNING: There is an inherent race with the use of the
+ * mask bit in the EAS/IVT. When masked, interrupts are "lost"
+ * but their P/Q bits are still set. So when unmasking, one has
+ * to check the P bit and possibly trigger a resend.
+ *
+ * We "deal" with it by relying on the fact that the OS will
+ * lazy disable MSIs. Thus mask will only be called if the
+ * interrupt occurred while already logically masked. Thus
+ * losing subsequent occurrences is of no consequence, we just
+ * need to "cleanup" P and Q when unmasking.
+ *
+ * This needs to be documented in the OPAL APIs
+ */
+
+ /* Unmangle server */
+ server >>= 2;
+
+ /* Set logical irq to match isn */
+ return __xive_set_irq_config(is, isn, server, prio, isn, true, true);
+}
+
+static void __xive_source_eoi(struct irq_source *is, uint32_t isn)
+{
+ struct xive_src *s = container_of(is, struct xive_src, is);
+ uint32_t idx = isn - s->esb_base;
+ struct xive_ive *ive;
+ void *mmio_base;
+ uint64_t eoi_val;
+
+ /* Grab the IVE */
+ ive = s->xive->ivt_base;
+ if (!ive)
+ return;
+ ive += GIRQ_TO_IDX(isn);
+
+ /* XXX To fix the races with mask/unmask potentially causing
+ * multiple queue entries, we need to keep track of EOIs here,
+ * before the masked test below
+ */
+
+ /* If it's invalid or masked, don't do anything */
+ if (xive_get_field64(IVE_MASKED, ive->w) || !xive_get_field64(IVE_VALID, ive->w))
+ return;
+
+ /* Grab MMIO control address for that ESB */
+ mmio_base = s->esb_mmio + (1ull << s->esb_shift) * idx;
+
+ /* If the XIVE supports the new "store EOI" facility, use it */
+ if (s->flags & XIVE_SRC_STORE_EOI)
+ out_be64(mmio_base + XIVE_ESB_STORE_EOI, 0);
+ else {
+ uint64_t offset;
+
+ /* Otherwise for EOI, we use the special MMIO that does
+ * a clear of both P and Q and returns the old Q.
+ *
+ * This allows us to then do a re-trigger if Q was set
+ * rather than synthesizing an interrupt in software
+ */
+ if (s->flags & XIVE_SRC_EOI_PAGE1)
+ mmio_base += 1ull << (s->esb_shift - 1);
+
+ /* LSIs don't need anything special, just EOI */
+ if (s->flags & XIVE_SRC_LSI)
+ in_be64(mmio_base);
+ else {
+ offset = XIVE_ESB_SET_PQ_00;
+ eoi_val = in_be64(mmio_base + offset);
+ xive_vdbg(s->xive, "ISN: %08x EOI=%llx\n",
+ isn, eoi_val);
+ if (!(eoi_val & 1))
+ return;
+
+ /* Re-trigger always on page0 or page1 ? */
+ out_be64(mmio_base + XIVE_ESB_STORE_TRIGGER, 0);
+ }
+ }
+}
+
+static void xive_source_eoi(struct irq_source *is, uint32_t isn)
+{
+ struct xive_src *s = container_of(is, struct xive_src, is);
+
+ if (s->orig_ops && s->orig_ops->eoi)
+ s->orig_ops->eoi(is, isn);
+ else
+ __xive_source_eoi(is, isn);
+}
+
+static void xive_source_interrupt(struct irq_source *is, uint32_t isn)
+{
+ struct xive_src *s = container_of(is, struct xive_src, is);
+
+ if (!s->orig_ops || !s->orig_ops->interrupt)
+ return;
+ s->orig_ops->interrupt(is, isn);
+}
+
+static uint64_t xive_source_attributes(struct irq_source *is, uint32_t isn)
+{
+ struct xive_src *s = container_of(is, struct xive_src, is);
+
+ if (!s->orig_ops || !s->orig_ops->attributes)
+ return IRQ_ATTR_TARGET_LINUX;
+ return s->orig_ops->attributes(is, isn);
+}
+
+static char *xive_source_name(struct irq_source *is, uint32_t isn)
+{
+ struct xive_src *s = container_of(is, struct xive_src, is);
+
+ if (!s->orig_ops || !s->orig_ops->name)
+ return NULL;
+ return s->orig_ops->name(is, isn);
+}
+
+void xive_source_mask(struct irq_source *is, uint32_t isn)
+{
+ struct xive_src *s = container_of(is, struct xive_src, is);
+
+ xive_update_irq_mask(s, isn - s->esb_base, true);
+}
+
+static const struct irq_source_ops xive_irq_source_ops = {
+ .get_xive = xive_source_get_xive,
+ .set_xive = xive_source_set_xive,
+ .eoi = xive_source_eoi,
+ .interrupt = xive_source_interrupt,
+ .attributes = xive_source_attributes,
+ .name = xive_source_name,
+};
+
+static void __xive_register_source(struct xive *x, struct xive_src *s,
+ uint32_t base, uint32_t count,
+ uint32_t shift, void *mmio, uint32_t flags,
+ bool secondary, void *data,
+ const struct irq_source_ops *orig_ops)
+{
+ s->esb_base = base;
+ s->esb_shift = shift;
+ s->esb_mmio = mmio;
+ s->flags = flags;
+ s->orig_ops = orig_ops;
+ s->xive = x;
+ s->is.start = base;
+ s->is.end = base + count;
+ s->is.ops = &xive_irq_source_ops;
+ s->is.data = data;
+
+ __register_irq_source(&s->is, secondary);
+}
+
+void xive_register_hw_source(uint32_t base, uint32_t count, uint32_t shift,
+ void *mmio, uint32_t flags, void *data,
+ const struct irq_source_ops *ops)
+{
+ struct xive_src *s;
+ struct xive *x = xive_from_isn(base);
+
+ assert(x);
+
+ s = malloc(sizeof(struct xive_src));
+ assert(s);
+ __xive_register_source(x, s, base, count, shift, mmio, flags,
+ false, data, ops);
+}
+
+void xive_register_ipi_source(uint32_t base, uint32_t count, void *data,
+ const struct irq_source_ops *ops)
+{
+ struct xive_src *s;
+ struct xive *x = xive_from_isn(base);
+ uint32_t base_idx = GIRQ_TO_IDX(base);
+ void *mmio_base;
+ uint32_t flags = XIVE_SRC_EOI_PAGE1 | XIVE_SRC_TRIGGER_PAGE;
+
+ assert(x);
+ assert(base >= x->int_base && (base + count) <= x->int_ipi_top);
+
+ s = malloc(sizeof(struct xive_src));
+ assert(s);
+
+ /* Store EOI supported on DD2.0 */
+ if (XIVE_CAN_STORE_EOI(x))
+ flags |= XIVE_SRC_STORE_EOI;
+
+ /* Callbacks assume the MMIO base corresponds to the first
+ * interrupt of that source structure so adjust it
+ */
+ mmio_base = x->esb_mmio + (1ul << XIVE_ESB_SHIFT) * base_idx;
+ __xive_register_source(x, s, base, count, XIVE_ESB_SHIFT, mmio_base,
+ flags, false, data, ops);
+}
+
+static struct xive *init_one_xive(struct dt_node *np)
+{
+ struct xive *x;
+ struct proc_chip *chip;
+ uint32_t flags;
+
+ x = zalloc(sizeof(struct xive));
+ assert(x);
+ x->x_node = np;
+ x->xscom_base = dt_get_address(np, 0, NULL);
+ x->chip_id = dt_get_chip_id(np);
+
+ /* "Allocate" a new block ID for the chip */
+ x->block_id = xive_block_count++;
+ assert (x->block_id < XIVE_MAX_CHIPS);
+ xive_block_to_chip[x->block_id] = x->chip_id;
+ init_lock(&x->lock);
+
+ chip = get_chip(x->chip_id);
+ assert(chip);
+
+ /* All supported P9 are revision 2 (Nimbus DD2) */
+ switch (chip->type) {
+ case PROC_CHIP_P9_NIMBUS:
+ /* We should not be able to boot a P9N DD1 */
+ assert((chip->ec_level & 0xf0) != 0x10);
+ /* Fallthrough */
+ case PROC_CHIP_P9_CUMULUS:
+ case PROC_CHIP_P9P:
+ break;
+ default:
+ assert(0);
+ }
+
+ xive_dbg(x, "Initializing block ID %d...\n", x->block_id);
+ chip->xive = x;
+
+ list_head_init(&x->donated_pages);
+
+ /* Base interrupt numbers and allocator init */
+ /* XXX Consider allocating half as many ESBs as MMIO space
+ * so that HW sources land outside of ESB space...
+ */
+ x->int_base = BLKIDX_TO_GIRQ(x->block_id, 0);
+ x->int_max = x->int_base + XIVE_INT_COUNT;
+ x->int_hw_bot = x->int_max;
+ x->int_ipi_top = x->int_base;
+
+ /* Make sure we never hand out "2" as it's reserved for XICS emulation
+ * IPI returns. Generally start handing out at 0x10
+ */
+ if (x->int_ipi_top < XIVE_INT_FIRST)
+ x->int_ipi_top = XIVE_INT_FIRST;
+
+ /* Allocate a few bitmaps */
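+ /* EQs are handed out in sets of 8 (one per priority), so the EQ
+ * allocation bitmap only needs one bit per set of 8 entries
+ */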
+ x->eq_map = local_alloc(x->chip_id, BITMAP_BYTES(XIVE_EQ_COUNT >> 3), PAGE_SIZE);
+ assert(x->eq_map);
+ memset(x->eq_map, 0, BITMAP_BYTES(XIVE_EQ_COUNT >> 3));
+
+ /* Make sure we don't hand out 0 */
+ bitmap_set_bit(*x->eq_map, 0);
+
+ x->int_enabled_map = local_alloc(x->chip_id, BITMAP_BYTES(XIVE_INT_COUNT), PAGE_SIZE);
+ assert(x->int_enabled_map);
+ memset(x->int_enabled_map, 0, BITMAP_BYTES(XIVE_INT_COUNT));
+ x->ipi_alloc_map = local_alloc(x->chip_id, BITMAP_BYTES(XIVE_INT_COUNT), PAGE_SIZE);
+ assert(x->ipi_alloc_map);
+ memset(x->ipi_alloc_map, 0, BITMAP_BYTES(XIVE_INT_COUNT));
+
+ xive_dbg(x, "Handling interrupts [%08x..%08x]\n",
+ x->int_base, x->int_max - 1);
+
+ /* Setup the BARs */
+ if (!xive_configure_bars(x))
+ goto fail;
+
+ /* Some basic global inits such as page sizes etc... */
+ if (!xive_config_init(x))
+ goto fail;
+
+ /* Configure the set translations for MMIO */
+ if (!xive_setup_set_xlate(x))
+ goto fail;
+
+ /* Dump some MMIO registers for diagnostics */
+ xive_dump_mmio(x);
+
+ /* Pre-allocate a number of tables */
+ if (!xive_prealloc_tables(x))
+ goto fail;
+
+ /* Configure local tables in VSDs (forward ports will be
+ * handled later)
+ */
+ if (!xive_set_local_tables(x))
+ goto fail;
+
+ /* Register built-in source controllers (aka IPIs) */
+ flags = XIVE_SRC_EOI_PAGE1 | XIVE_SRC_TRIGGER_PAGE;
+ if (XIVE_CAN_STORE_EOI(x))
+ flags |= XIVE_SRC_STORE_EOI;
+ __xive_register_source(x, &x->ipis, x->int_base,
+ x->int_hw_bot - x->int_base, XIVE_ESB_SHIFT,
+ x->esb_mmio, flags, true, NULL, NULL);
+
+ /* Register escalation sources */
+ __xive_register_source(x, &x->esc_irqs,
+ MAKE_ESCALATION_GIRQ(x->block_id, 0),
+ XIVE_EQ_COUNT, XIVE_EQ_SHIFT,
+ x->eq_mmio, XIVE_SRC_EOI_PAGE1,
+ false, NULL, NULL);
+
+
+ return x;
+ fail:
+ xive_err(x, "Initialization failed...\n");
+
+ /* Should this be fatal ? */
+ //assert(false);
+ return NULL;
+}
+
+/*
+ * XICS emulation
+ */
+static void xive_ipi_init(struct xive *x, struct cpu_thread *cpu)
+{
+ struct xive_cpu_state *xs = cpu->xstate;
+
+ assert(xs);
+
+ __xive_set_irq_config(&x->ipis.is, xs->ipi_irq, cpu->pir,
+ XIVE_EMULATION_PRIO, xs->ipi_irq,
+ true, true);
+}
+
+static void xive_ipi_eoi(struct xive *x, uint32_t idx)
+{
+ uint8_t *mm = x->esb_mmio + idx * XIVE_ESB_PAGE_SIZE;
+ uint8_t eoi_val;
+
+ /* For EOI, we use the special MMIO that does a clear of both
+ * P and Q and returns the old Q.
+ *
+ * This allows us to then do a re-trigger if Q was set rather
+ * than synthesizing an interrupt in software
+ */
+ eoi_val = in_8(mm + PAGE_SIZE + XIVE_ESB_SET_PQ_00);
+ if (eoi_val & 1) {
+ out_8(mm + XIVE_ESB_STORE_TRIGGER, 0);
+ }
+}
+
+static void xive_ipi_trigger(struct xive *x, uint32_t idx)
+{
+ uint8_t *mm = x->esb_mmio + idx * XIVE_ESB_PAGE_SIZE;
+
+ xive_vdbg(x, "Trigger IPI 0x%x\n", idx);
+
+ out_8(mm + XIVE_ESB_STORE_TRIGGER, 0);
+}
+
+
+static void xive_reset_enable_thread(struct cpu_thread *c)
+{
+ struct proc_chip *chip = get_chip(c->chip_id);
+ struct xive *x = chip->xive;
+ uint32_t fc, bit;
+ uint64_t enable;
+
+ /* Get fused core number */
+ fc = (c->pir >> 3) & 0xf;
+
+ /* Get bit in register */
+ bit = c->pir & 0x3f;
+
+ /* Get which register to access */
+ if (fc < 8) {
+ xive_regw(x, PC_THREAD_EN_REG0_CLR, PPC_BIT(bit));
+ xive_regw(x, PC_THREAD_EN_REG0_SET, PPC_BIT(bit));
+
+ /*
+ * To guarantee that the TIMA accesses will see the
+ * latest state of the enable register, add an extra
+ * load on PC_THREAD_EN_REG.
+ */
+ enable = xive_regr(x, PC_THREAD_EN_REG0);
+ if (!(enable & PPC_BIT(bit)))
+ xive_cpu_err(c, "Failed to enable thread\n");
+ } else {
+ xive_regw(x, PC_THREAD_EN_REG1_CLR, PPC_BIT(bit));
+ xive_regw(x, PC_THREAD_EN_REG1_SET, PPC_BIT(bit));
+
+ /* Same as above */
+ enable = xive_regr(x, PC_THREAD_EN_REG1);
+ if (!(enable & PPC_BIT(bit)))
+ xive_cpu_err(c, "Failed to enable thread\n");
+ }
+}
+
+void xive_cpu_callin(struct cpu_thread *cpu)
+{
+ struct xive_cpu_state *xs = cpu->xstate;
+ uint8_t old_w2 __unused, w2 __unused;
+
+ if (!xs)
+ return;
+
+ /* Reset the HW thread context and enable it */
+ xive_reset_enable_thread(cpu);
+
+ /* Set VT to 1 */
+ old_w2 = in_8(xs->tm_ring1 + TM_QW3_HV_PHYS + TM_WORD2);
+ out_8(xs->tm_ring1 + TM_QW3_HV_PHYS + TM_WORD2, 0x80);
+ w2 = in_8(xs->tm_ring1 + TM_QW3_HV_PHYS + TM_WORD2);
+
+ xive_cpu_vdbg(cpu, "Initialized TIMA VP=%x/%x W01=%016llx W2=%02x->%02x\n",
+ xs->vp_blk, xs->vp_idx,
+ in_be64(xs->tm_ring1 + TM_QW3_HV_PHYS),
+ old_w2, w2);
+}
+
+#ifdef XIVE_DEBUG_INIT_CACHE_UPDATES
+static bool xive_check_eq_update(struct xive *x, uint32_t idx, struct xive_eq *eq)
+{
+ struct xive_eq *eq_p = xive_get_eq(x, idx);
+ struct xive_eq eq2;
+
+ assert(eq_p);
+ eq2 = *eq_p;
+ if (memcmp(eq, &eq2, sizeof(struct xive_eq)) != 0) {
+ xive_err(x, "EQ update mismatch idx %d\n", idx);
+ xive_err(x, "want: %08x %08x %08x %08x\n",
+ be32_to_cpu(eq->w0), be32_to_cpu(eq->w1),
+ be32_to_cpu(eq->w2), be32_to_cpu(eq->w3));
+ xive_err(x, " %08x %08x %08x %08x\n",
+ be32_to_cpu(eq->w4), be32_to_cpu(eq->w5),
+ be32_to_cpu(eq->w6), be32_to_cpu(eq->w7));
+ xive_err(x, "got : %08x %08x %08x %08x\n",
+ be32_to_cpu(eq2.w0), be32_to_cpu(eq2.w1),
+ be32_to_cpu(eq2.w2), be32_to_cpu(eq2.w3));
+ xive_err(x, " %08x %08x %08x %08x\n",
+ be32_to_cpu(eq2.w4), be32_to_cpu(eq2.w5),
+ be32_to_cpu(eq2.w6), be32_to_cpu(eq2.w7));
+ return false;
+ }
+ return true;
+}
+
+static bool xive_check_vpc_update(struct xive *x, uint32_t idx, struct xive_vp *vp)
+{
+ struct xive_vp *vp_p = xive_get_vp(x, idx);
+ struct xive_vp vp2;
+
+ assert(vp_p);
+ vp2 = *vp_p;
+ if (memcmp(vp, &vp2, sizeof(struct xive_vp)) != 0) {
+ xive_err(x, "VP update mismatch idx %d\n", idx);
+ xive_err(x, "want: %08x %08x %08x %08x\n",
+ be32_to_cpu(vp->w0), be32_to_cpu(vp->w1),
+ be32_to_cpu(vp->w2), be32_to_cpu(vp->w3));
+ xive_err(x, " %08x %08x %08x %08x\n",
+ be32_to_cpu(vp->w4), be32_to_cpu(vp->w5),
+ be32_to_cpu(vp->w6), be32_to_cpu(vp->w7));
+ xive_err(x, "got : %08x %08x %08x %08x\n",
+ be32_to_cpu(vp2.w0), be32_to_cpu(vp2.w1),
+ be32_to_cpu(vp2.w2), be32_to_cpu(vp2.w3));
+ xive_err(x, " %08x %08x %08x %08x\n",
+ be32_to_cpu(vp2.w4), be32_to_cpu(vp2.w5),
+ be32_to_cpu(vp2.w6), be32_to_cpu(vp2.w7));
+ return false;
+ }
+ return true;
+}
+#else
+static inline bool xive_check_eq_update(struct xive *x __unused,
+ uint32_t idx __unused,
+ struct xive_eq *eq __unused)
+{
+ return true;
+}
+
+static inline bool xive_check_vpc_update(struct xive *x __unused,
+ uint32_t idx __unused,
+ struct xive_vp *vp __unused)
+{
+ return true;
+}
+#endif
+
+#ifdef XIVE_EXTRA_CHECK_INIT_CACHE
+static void xive_special_cache_check(struct xive *x, uint32_t blk, uint32_t idx)
+{
+ struct xive_vp vp = {0};
+ uint32_t i;
+
+ for (i = 0; i < 1000; i++) {
+ struct xive_vp *vp_m = xive_get_vp(x, idx);
+
+ memset(vp_m, (~i) & 0xff, sizeof(*vp_m));
+ sync();
+ vp.w1 = cpu_to_be32((i << 16) | i);
+ xive_vpc_cache_update(x, blk, idx, &vp, true);
+ if (!xive_check_vpc_update(x, idx, &vp)) {
+ xive_dbg(x, "Test failed at %d iterations\n", i);
+ return;
+ }
+ }
+ xive_dbg(x, "1000 iterations test success at %d/0x%x\n", blk, idx);
+}
+#else
+static inline void xive_special_cache_check(struct xive *x __unused,
+ uint32_t blk __unused,
+ uint32_t idx __unused)
+{
+}
+#endif
+
+static void xive_setup_hw_for_emu(struct xive_cpu_state *xs)
+{
+ struct xive_eq eq;
+ struct xive_vp vp;
+ struct xive *x_eq, *x_vp;
+
+ /* Grab the XIVE where the VP resides. It could be different from
+ * the local chip XIVE if not using block group mode
+ */
+ x_vp = xive_from_pc_blk(xs->vp_blk);
+ assert(x_vp);
+
+ /* Grab the XIVE where the EQ resides. It will be the same as the
+ * VP one with the current provisioning but I prefer not making
+ * this code depend on it.
+ */
+ x_eq = xive_from_vc_blk(xs->eq_blk);
+ assert(x_eq);
+
+ /* Initialize the structure */
+ xive_init_emu_eq(xs->vp_blk, xs->vp_idx, &eq,
+ xs->eq_page, XIVE_EMULATION_PRIO);
+
+ /* Use the cache watch to write it out */
+ lock(&x_eq->lock);
+ xive_eqc_cache_update(x_eq, xs->eq_blk, xs->eq_idx + XIVE_EMULATION_PRIO, &eq, true);
+ xive_check_eq_update(x_eq, xs->eq_idx + XIVE_EMULATION_PRIO, &eq);
+
+ /* Extra testing of cache watch & scrub facilities */
+ xive_special_cache_check(x_vp, xs->vp_blk, xs->vp_idx);
+ unlock(&x_eq->lock);
+
+ /* Initialize/enable the VP */
+ xive_init_default_vp(&vp, xs->eq_blk, xs->eq_idx);
+
+ /* Use the cache watch to write it out */
+ lock(&x_vp->lock);
+ xive_vpc_cache_update(x_vp, xs->vp_blk, xs->vp_idx, &vp, true);
+ xive_check_vpc_update(x_vp, xs->vp_idx, &vp);
+ unlock(&x_vp->lock);
+}
+
+static void xive_init_cpu_emulation(struct xive_cpu_state *xs,
+ struct cpu_thread *cpu)
+{
+ struct xive *x;
+
+ /* Setup HW EQ and VP */
+ xive_setup_hw_for_emu(xs);
+
+ /* Setup and unmask the IPI */
+ xive_ipi_init(xs->xive, cpu);
+
+ /* Initialize remaining state */
+ xs->cppr = 0;
+ xs->mfrr = 0xff;
+ xs->eqbuf = xive_get_eq_buf(xs->vp_blk,
+ xs->eq_idx + XIVE_EMULATION_PRIO);
+ assert(xs->eqbuf);
+ memset(xs->eqbuf, 0, PAGE_SIZE);
+
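+ /* The queue buffer is one page of 4-byte entries, so the ring
+ * index wraps at PAGE_SIZE / 4 entries
+ */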
+ xs->eqptr = 0;
+ xs->eqmsk = (PAGE_SIZE / 4) - 1;
+ xs->eqgen = 0;
+ x = xive_from_vc_blk(xs->eq_blk);
+ assert(x);
+ xs->eqmmio = x->eq_mmio + (xs->eq_idx + XIVE_EMULATION_PRIO) * XIVE_ESB_PAGE_SIZE;
+}
+
+static void xive_init_cpu_exploitation(struct xive_cpu_state *xs)
+{
+ struct xive_vp vp;
+ struct xive *x_vp;
+
+ /* Grab the XIVE where the VP resides. It could be different from
+ * the local chip XIVE if not using block group mode
+ */
+ x_vp = xive_from_pc_blk(xs->vp_blk);
+ assert(x_vp);
+
+ /* Initialize/enable the VP */
+ xive_init_default_vp(&vp, xs->eq_blk, xs->eq_idx);
+
+ /* Use the cache watch to write it out */
+ lock(&x_vp->lock);
+ xive_vpc_cache_update(x_vp, xs->vp_blk, xs->vp_idx, &vp, true);
+ unlock(&x_vp->lock);
+
+ /* Clean up remaining state */
+ xs->cppr = 0;
+ xs->mfrr = 0xff;
+ xs->eqbuf = NULL;
+ xs->eqptr = 0;
+ xs->eqmsk = 0;
+ xs->eqgen = 0;
+ xs->eqmmio = NULL;
+}
+
+static void xive_configure_ex_special_bar(struct xive *x, struct cpu_thread *c)
+{
+ uint64_t xa, val;
+ int64_t rc;
+
+ xive_cpu_vdbg(c, "Setting up special BAR\n");
+ xa = XSCOM_ADDR_P9_EX(pir_to_core_id(c->pir), P9X_EX_NCU_SPEC_BAR);
+ val = (uint64_t)x->tm_base | P9X_EX_NCU_SPEC_BAR_ENABLE;
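+ /* The TM area is 4 pages, one per ring, so with a 64K TM page
+ * size the special BAR has to cover 256K
+ */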
+ if (x->tm_shift == 16)
+ val |= P9X_EX_NCU_SPEC_BAR_256K;
+ xive_cpu_vdbg(c, "NCU_SPEC_BAR_XA[%08llx]=%016llx\n", xa, val);
+ rc = xscom_write(c->chip_id, xa, val);
+ if (rc) {
+ xive_cpu_err(c, "Failed to setup NCU_SPEC_BAR\n");
+ /* XXXX what to do now ? */
+ }
+}
+
+void xive_late_init(void)
+{
+ struct cpu_thread *c;
+
+ prlog(PR_INFO, "SLW: Configuring self-restore for NCU_SPEC_BAR\n");
+ for_each_present_cpu(c) {
+ if (cpu_is_thread0(c)) {
+ struct proc_chip *chip = get_chip(c->chip_id);
+ struct xive *x = chip->xive;
+ uint64_t xa, val, rc;
+ xa = XSCOM_ADDR_P9_EX(pir_to_core_id(c->pir),
+ P9X_EX_NCU_SPEC_BAR);
+ val = (uint64_t)x->tm_base | P9X_EX_NCU_SPEC_BAR_ENABLE;
+ /* Bail out if wakeup engine has already failed */
+ if (wakeup_engine_state != WAKEUP_ENGINE_PRESENT) {
+ prlog(PR_ERR, "XIVE p9_stop_api fail detected\n");
+ break;
+ }
+ rc = p9_stop_save_scom((void *)chip->homer_base, xa, val,
+ P9_STOP_SCOM_REPLACE, P9_STOP_SECTION_EQ_SCOM);
+ if (rc) {
+ xive_cpu_err(c, "p9_stop_api failed for NCU_SPEC_BAR rc=%lld\n",
+ rc);
+ wakeup_engine_state = WAKEUP_ENGINE_FAILED;
+ }
+ }
+ }
+}
+
+static void xive_provision_cpu(struct xive_cpu_state *xs, struct cpu_thread *c)
+{
+ struct xive *x;
+ void *p;
+
+ /* Physical VPs are pre-allocated */
+ xs->vp_blk = PIR2VP_BLK(c->pir);
+ xs->vp_idx = PIR2VP_IDX(c->pir);
+
+ /* For now we use identical block IDs for VC and PC but that might
+ * change. We allocate the EQs on the same XIVE as the VP.
+ */
+ xs->eq_blk = xs->vp_blk;
+
+ /* Grab the XIVE where the EQ resides. It could be different from
+ * the local chip XIVE if not using block group mode
+ */
+ x = xive_from_vc_blk(xs->eq_blk);
+ assert(x);
+
+ /* Allocate a set of EQs for that VP */
+ xs->eq_idx = xive_alloc_eq_set(x, true);
+ assert(!XIVE_ALLOC_IS_ERR(xs->eq_idx));
+
+ /* Provision one of the queues. Allocate the memory on the
+ * chip where the CPU resides
+ */
+ p = local_alloc(c->chip_id, PAGE_SIZE, PAGE_SIZE);
+ if (!p) {
+ xive_err(x, "Failed to allocate EQ backing store\n");
+ assert(false);
+ }
+ xs->eq_page = p;
+}
+
+static void xive_init_cpu(struct cpu_thread *c)
+{
+ struct proc_chip *chip = get_chip(c->chip_id);
+ struct xive *x = chip->xive;
+ struct xive_cpu_state *xs;
+
+ if (!x)
+ return;
+
+ /*
+ * Each core pair (EX) needs this special BAR setup to have the
+ * right powerbus cycle for the TM area (as it has the same address
+ * on all chips so it's somewhat special).
+ *
+ * Because we don't want to bother trying to figure out which core
+ * of a pair is present we just do the setup for each of them, which
+ * is harmless.
+ */
+ if (cpu_is_thread0(c) || cpu_is_core_chiplet_primary(c))
+ xive_configure_ex_special_bar(x, c);
+
+ /* Initialize the state structure */
+ c->xstate = xs = local_alloc(c->chip_id, sizeof(struct xive_cpu_state), 1);
+ assert(xs);
+ memset(xs, 0, sizeof(struct xive_cpu_state));
+ xs->xive = x;
+
+ init_lock(&xs->lock);
+
+ /* Shortcut to TM HV ring */
+ xs->tm_ring1 = x->tm_base + (1u << x->tm_shift);
+
+ /* Allocate an IPI */
+ xs->ipi_irq = xive_alloc_ipi_irqs(c->chip_id, 1, 1);
+
+ xive_cpu_vdbg(c, "CPU IPI is irq %08x\n", xs->ipi_irq);
+
+ /* Provision a VP and some EQDs for a physical CPU */
+ xive_provision_cpu(xs, c);
+
+ /* Initialize the XICS emulation related fields */
+ xive_init_cpu_emulation(xs, c);
+}
+
+static void xive_init_cpu_properties(struct cpu_thread *cpu)
+{
+ struct cpu_thread *t;
+ __be32 iprop[8][2] = { };
+ uint32_t i;
+
+ assert(cpu_thread_count <= 8);
+
+ if (!cpu->node)
+ return;
+ for (i = 0; i < cpu_thread_count; i++) {
+ t = (i == 0) ? cpu : find_cpu_by_pir(cpu->pir + i);
+ if (!t)
+ continue;
+ iprop[i][0] = cpu_to_be32(t->xstate->ipi_irq);
+ iprop[i][1] = 0; /* Edge */
+ }
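+ /* Two 4-byte cells per thread (the IPI number and the trigger
+ * type, 0 = edge), hence a property size of cpu_thread_count * 8
+ * bytes
+ */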
+ dt_add_property(cpu->node, "interrupts", iprop, cpu_thread_count * 8);
+ dt_add_property_cells(cpu->node, "interrupt-parent", get_ics_phandle());
+}
+
+#ifdef XIVE_DEBUG_DUPLICATES
+static uint32_t xive_count_irq_copies(struct xive_cpu_state *xs, uint32_t ref)
+{
+ uint32_t i, irq;
+ uint32_t cnt = 0;
+ uint32_t pos = xs->eqptr;
+ uint32_t gen = xs->eqgen;
+
+ for (i = 0; i < 0x3fff; i++) {
+ irq = xs->eqbuf[pos];
+ if ((irq >> 31) == gen)
+ break;
+ if (irq == ref)
+ cnt++;
+ pos = (pos + 1) & xs->eqmsk;
+ if (!pos)
+ gen ^= 1;
+ }
+ return cnt;
+}
+#else
+static inline uint32_t xive_count_irq_copies(struct xive_cpu_state *xs __unused,
+ uint32_t ref __unused)
+{
+ return 1;
+}
+#endif
+
+static uint32_t xive_read_eq(struct xive_cpu_state *xs, bool just_peek)
+{
+ uint32_t cur, copies;
+
+ xive_cpu_vdbg(this_cpu(), " EQ %s... IDX=%x MSK=%x G=%d\n",
+ just_peek ? "peek" : "read",
+ xs->eqptr, xs->eqmsk, xs->eqgen);
+ cur = xs->eqbuf[xs->eqptr];
+ xive_cpu_vdbg(this_cpu(), " cur: %08x [%08x %08x %08x ...]\n", cur,
+ xs->eqbuf[(xs->eqptr + 1) & xs->eqmsk],
+ xs->eqbuf[(xs->eqptr + 2) & xs->eqmsk],
+ xs->eqbuf[(xs->eqptr + 3) & xs->eqmsk]);
+ if ((cur >> 31) == xs->eqgen)
+ return 0;
+
+ /* Debug: check for duplicate interrupts in the queue */
+ copies = xive_count_irq_copies(xs, cur);
+ if (copies > 1) {
+ struct xive_eq *eq;
+
+ prerror("Wow ! Dups of irq %x, found %d copies !\n",
+ cur & 0x7fffffff, copies);
+ prerror("[%08x > %08x %08x %08x %08x ...] eqgen=%x eqptr=%x jp=%d\n",
+ xs->eqbuf[(xs->eqptr - 1) & xs->eqmsk],
+ xs->eqbuf[(xs->eqptr + 0) & xs->eqmsk],
+ xs->eqbuf[(xs->eqptr + 1) & xs->eqmsk],
+ xs->eqbuf[(xs->eqptr + 2) & xs->eqmsk],
+ xs->eqbuf[(xs->eqptr + 3) & xs->eqmsk],
+ xs->eqgen, xs->eqptr, just_peek);
+ lock(&xs->xive->lock);
+ __xive_cache_scrub(xs->xive, xive_cache_eqc, xs->eq_blk,
+ xs->eq_idx + XIVE_EMULATION_PRIO,
+ false, false);
+ unlock(&xs->xive->lock);
+ eq = xive_get_eq(xs->xive, xs->eq_idx + XIVE_EMULATION_PRIO);
+ prerror("EQ @%p W0=%08x W1=%08x qbuf @%p\n",
+ eq, be32_to_cpu(eq->w0), be32_to_cpu(eq->w1), xs->eqbuf);
+ }
+ log_add(xs, LOG_TYPE_POPQ, 7, cur,
+ xs->eqbuf[(xs->eqptr + 1) & xs->eqmsk],
+ xs->eqbuf[(xs->eqptr + 2) & xs->eqmsk],
+ copies,
+ xs->eqptr, xs->eqgen, just_peek);
+ if (!just_peek) {
+ xs->eqptr = (xs->eqptr + 1) & xs->eqmsk;
+ if (xs->eqptr == 0)
+ xs->eqgen ^= 1;
+ xs->total_irqs++;
+ }
+ return cur & 0x00ffffff;
+}
+
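+/*
+ * The emulation only uses a single actual priority, so collapse any
+ * CPPR other than the fully open (0xff) and fully closed (0) values
+ * onto XIVE_EMULATION_PRIO.
+ */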
+static uint8_t xive_sanitize_cppr(uint8_t cppr)
+{
+ if (cppr == 0xff || cppr == 0)
+ return cppr;
+ else
+ return XIVE_EMULATION_PRIO;
+}
+
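+/*
+ * Return the subset of pending priorities that can be presented to
+ * the CPU at the given CPPR.
+ */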
+static inline uint8_t opal_xive_check_pending(struct xive_cpu_state *xs,
+ uint8_t cppr)
+{
+ uint8_t mask = (cppr > 7) ? 0xff : ~((0x100 >> cppr) - 1);
+
+ return xs->pending & mask;
+}
+
+static void opal_xive_update_cppr(struct xive_cpu_state *xs, u8 cppr)
+{
+ /* Perform the update */
+ xs->cppr = cppr;
+ out_8(xs->tm_ring1 + TM_QW3_HV_PHYS + TM_CPPR, cppr);
+
+ /* Trigger the IPI if it's still more favored than the CPPR
+ *
+ * This can lead to a bunch of spurious retriggers if the
+ * IPI is queued up behind other interrupts but that's not
+ * a big deal and keeps the code simpler
+ */
+ if (xs->mfrr < cppr)
+ xive_ipi_trigger(xs->xive, GIRQ_TO_IDX(xs->ipi_irq));
+}
+
+static int64_t opal_xive_eoi(uint32_t xirr)
+{
+ struct cpu_thread *c = this_cpu();
+ struct xive_cpu_state *xs = c->xstate;
+ uint32_t isn = xirr & 0x00ffffff;
+ struct xive *src_x;
+ bool special_ipi = false;
+ uint8_t cppr;
+
+ /*
+ * In exploitation mode, this is supported as a way to perform
+ * an EOI via a FW call. This can be needed to work around HW
+ * implementation bugs, for example. In this case interrupts will
+ * have the OPAL_XIVE_IRQ_EOI_VIA_FW flag set.
+ *
+ * In that mode the entire "xirr" argument is interpreted as
+ * a global IRQ number (including the escalation bit), there is
+ * no split between the top 8 bits for CPPR and bottom 24 for
+ * the interrupt number.
+ */
+ if (xive_mode != XIVE_MODE_EMU)
+ return irq_source_eoi(xirr) ? OPAL_SUCCESS : OPAL_PARAMETER;
+
+ if (!xs)
+ return OPAL_INTERNAL_ERROR;
+
+ xive_cpu_vdbg(c, "EOI xirr=%08x cur_cppr=%d\n", xirr, xs->cppr);
+
+ /* Limit supported CPPR values from OS */
+ cppr = xive_sanitize_cppr(xirr >> 24);
+
+ lock(&xs->lock);
+
+ log_add(xs, LOG_TYPE_EOI, 3, isn, xs->eqptr, xs->eqgen);
+
+ /* If this was our magic IPI, convert to IRQ number */
+ if (isn == 2) {
+ isn = xs->ipi_irq;
+ special_ipi = true;
+ xive_cpu_vdbg(c, "User EOI for IPI !\n");
+ }
+
+ /* First check if we have stuff in that queue. If we do, don't bother with
+ * doing an EOI on the EQ. Just mark that priority pending, we'll come
+ * back later.
+ *
+ * If/when supporting multiple queues we would have to check them all
+ * in ascending prio order up to the passed-in CPPR value (exclusive).
+ */
+ if (xive_read_eq(xs, true)) {
+ xive_cpu_vdbg(c, " isn %08x, skip, queue non empty\n", xirr);
+ xs->pending |= 1 << XIVE_EMULATION_PRIO;
+ }
+#ifndef EQ_ALWAYS_NOTIFY
+ else {
+ uint8_t eoi_val;
+
+ /* Perform EQ level EOI. Only one EQ for now ...
+ *
+ * Note: We aren't doing an actual EOI. Instead we are clearing
+ * both P and Q and will re-check the queue if Q was set.
+ */
+ eoi_val = in_8(xs->eqmmio + XIVE_ESB_SET_PQ_00);
+ xive_cpu_vdbg(c, " isn %08x, eoi_val=%02x\n", xirr, eoi_val);
+
+ /* Q was set ? Check EQ again after doing a sync to ensure
+ * ordering.
+ */
+ if (eoi_val & 1) {
+ sync();
+ if (xive_read_eq(xs, true))
+ xs->pending |= 1 << XIVE_EMULATION_PRIO;
+ }
+ }
+#endif
+
+ /* Perform source level EOI if it's not our emulated MFRR IPI
+ * otherwise EOI ourselves
+ */
+ src_x = xive_from_isn(isn);
+ if (src_x) {
+ uint32_t idx = GIRQ_TO_IDX(isn);
+
+ /* Is it an IPI ? */
+ if (special_ipi) {
+ xive_ipi_eoi(src_x, idx);
+ } else {
+ /* Otherwise go through the source mechanism */
+ xive_vdbg(src_x, "EOI of IDX %x in EXT range\n", idx);
+ irq_source_eoi(isn);
+ }
+ } else {
+ xive_cpu_err(c, " EOI unknown ISN %08x\n", isn);
+ }
+
+ /* Finally restore CPPR */
+ opal_xive_update_cppr(xs, cppr);
+
+ xive_cpu_vdbg(c, " pending=0x%x cppr=%d\n", xs->pending, cppr);
+
+ unlock(&xs->lock);
+
+ /* Return whether something is pending that is suitable for
+ * delivery considering the new CPPR value. This can be done
+ * without lock as these fields are per-cpu.
+ */
+ return opal_xive_check_pending(xs, cppr) ? 1 : 0;
+}
+
+#ifdef XIVE_CHECK_MISROUTED_IPI
+static void xive_dump_eq(uint32_t eq_blk, uint32_t eq_idx)
+{
+ struct cpu_thread *me = this_cpu();
+ struct xive *x;
+ struct xive_eq *eq;
+
+ x = xive_from_vc_blk(eq_blk);
+ if (!x)
+ return;
+ eq = xive_get_eq(x, eq_idx);
+ if (!eq)
+ return;
+ xive_cpu_err(me, "EQ: %08x %08x %08x %08x (@%p)\n",
+ eq->w0, eq->w1, eq->w2, eq->w3, eq);
+ xive_cpu_err(me, " %08x %08x %08x %08x\n",
+ eq->w4, eq->w5, eq->w6, eq->w7);
+}
+static int64_t __opal_xive_dump_emu(struct xive_cpu_state *xs, uint32_t pir);
+
+static bool check_misrouted_ipi(struct cpu_thread *me, uint32_t irq)
+{
+ struct cpu_thread *c;
+
+ for_each_present_cpu(c) {
+ struct xive_cpu_state *xs = c->xstate;
+ struct xive_ive *ive;
+ uint32_t ipi_target, i, eq_blk, eq_idx;
+ struct proc_chip *chip;
+ struct xive *x;
+
+ if (!xs)
+ continue;
+ if (irq == xs->ipi_irq) {
+ xive_cpu_err(me, "misrouted IPI 0x%x, should"
+ " be aimed at CPU 0x%x\n",
+ irq, c->pir);
+ xive_cpu_err(me, " my eq_page=%p eqbuff=%p eq=0x%x/%x\n",
+ me->xstate->eq_page, me->xstate->eqbuf,
+ me->xstate->eq_blk, me->xstate->eq_idx + XIVE_EMULATION_PRIO);
+ xive_cpu_err(me, "tgt eq_page=%p eqbuff=%p eq=0x%x/%x\n",
+ c->xstate->eq_page, c->xstate->eqbuf,
+ c->xstate->eq_blk, c->xstate->eq_idx + XIVE_EMULATION_PRIO);
+ __opal_xive_dump_emu(me->xstate, me->pir);
+ __opal_xive_dump_emu(c->xstate, c->pir);
+ if (xive_get_irq_targetting(xs->ipi_irq, &ipi_target, NULL, NULL))
+ xive_cpu_err(me, "target=%08x\n", ipi_target);
+ else
+ xive_cpu_err(me, "target=???\n");
+ /* Find XIVE on which the IVE resides */
+ x = xive_from_isn(irq);
+ if (!x) {
+ xive_cpu_err(me, "no xive attached\n");
+ return true;
+ }
+ ive = xive_get_ive(x, irq);
+ if (!ive) {
+ xive_cpu_err(me, "no ive attached\n");
+ return true;
+ }
+ xive_cpu_err(me, "ive=%016llx\n", be64_to_cpu(ive->w));
+ for_each_chip(chip) {
+ x = chip->xive;
+ if (!x)
+ continue;
+ ive = x->ivt_base;
+ for (i = 0; i < XIVE_INT_COUNT; i++) {
+ if (xive_get_field64(IVE_EQ_DATA, ive[i].w) == irq) {
+ eq_blk = xive_get_field64(IVE_EQ_BLOCK, ive[i].w);
+ eq_idx = xive_get_field64(IVE_EQ_INDEX, ive[i].w);
+ xive_cpu_err(me, "Found source: 0x%x ive=%016llx\n"
+ " eq 0x%x/%x",
+ BLKIDX_TO_GIRQ(x->block_id, i),
+ be64_to_cpu(ive[i].w), eq_blk, eq_idx);
+ xive_dump_eq(eq_blk, eq_idx);
+ }
+ }
+ }
+ return true;
+ }
+ }
+ return false;
+}
+#else
+static inline bool check_misrouted_ipi(struct cpu_thread *c __unused,
+ uint32_t irq __unused)
+{
+ return false;
+}
+#endif
+
+static int64_t opal_xive_get_xirr(__be32 *out_xirr, bool just_poll)
+{
+ struct cpu_thread *c = this_cpu();
+ struct xive_cpu_state *xs = c->xstate;
+ uint16_t ack;
+ uint8_t active, old_cppr;
+
+ if (xive_mode != XIVE_MODE_EMU)
+ return OPAL_WRONG_STATE;
+ if (!xs)
+ return OPAL_INTERNAL_ERROR;
+ if (!out_xirr)
+ return OPAL_PARAMETER;
+
+ *out_xirr = 0;
+
+ lock(&xs->lock);
+
+ /*
+ * Due to the need to fetch multiple interrupts from the EQ, we
+ * need to play some tricks.
+ *
+ * The "pending" byte in "xs" keeps track of the priorities that
+ * are known to have stuff to read (currently we only use one).
+ *
+ * It is set in EOI and cleared when consumed here. We don't bother
+ * looking ahead here, EOI will do it.
+ *
+ * We still need to do an ACK every time in case a higher prio
+ * exception occurred (though we don't do prio yet... right ? still
+ * let's get the basic design right !).
+ *
+ * Note that if we haven't found anything via ack, but did find
+ * something in the queue, we must also raise CPPR back.
+ */
+
+ xive_cpu_vdbg(c, "get_xirr W01=%016llx W2=%08x\n",
+ __in_be64(xs->tm_ring1 + TM_QW3_HV_PHYS),
+ __in_be32(xs->tm_ring1 + TM_QW3_HV_PHYS + 8));
+
+ /* Perform the HV Ack cycle */
+ if (just_poll)
+ ack = __in_be64(xs->tm_ring1 + TM_QW3_HV_PHYS) >> 48;
+ else
+ ack = __in_be16(xs->tm_ring1 + TM_SPC_ACK_HV_REG);
+ sync();
+ xive_cpu_vdbg(c, "get_xirr,%s=%04x\n", just_poll ? "POLL" : "ACK", ack);
+
+ /* Capture the old CPPR which we will return with the interrupt */
+ old_cppr = xs->cppr;
+
+ switch(GETFIELD(TM_QW3_NSR_HE, (ack >> 8))) {
+ case TM_QW3_NSR_HE_NONE:
+ break;
+ case TM_QW3_NSR_HE_POOL:
+ break;
+ case TM_QW3_NSR_HE_PHYS:
+ /* Mark pending and keep track of the CPPR update */
+ if (!just_poll && (ack & 0xff) != 0xff) {
+ xs->cppr = ack & 0xff;
+ xs->pending |= 1 << xs->cppr;
+ }
+ break;
+ case TM_QW3_NSR_HE_LSI:
+ break;
+ }
+
+ /* Calculate "active" lines as being the pending interrupts
+ * masked by the "old" CPPR
+ */
+ active = opal_xive_check_pending(xs, old_cppr);
+
+ log_add(xs, LOG_TYPE_XIRR, 6, old_cppr, xs->cppr, xs->pending, active,
+ xs->eqptr, xs->eqgen);
+
+#ifdef XIVE_PERCPU_LOG
+ {
+ struct xive_eq *eq;
+ lock(&xs->xive->lock);
+ __xive_cache_scrub(xs->xive, xive_cache_eqc, xs->eq_blk,
+ xs->eq_idx + XIVE_EMULATION_PRIO,
+ false, false);
+ unlock(&xs->xive->lock);
+ eq = xive_get_eq(xs->xive, xs->eq_idx + XIVE_EMULATION_PRIO);
+ log_add(xs, LOG_TYPE_EQD, 2, be32_to_cpu(eq->w0), be32_to_cpu(eq->w1));
+ }
+#endif /* XIVE_PERCPU_LOG */
+
+ xive_cpu_vdbg(c, " cppr=%d->%d pending=0x%x active=%x\n",
+ old_cppr, xs->cppr, xs->pending, active);
+ if (active) {
+ /* Find highest pending */
+ uint8_t prio = ffs(active) - 1;
+ uint32_t val;
+
+ /* XXX Use "p" to select queue */
+ val = xive_read_eq(xs, just_poll);
+
+ if (val && val < XIVE_INT_FIRST)
+ xive_cpu_err(c, "Bogus interrupt 0x%x received !\n", val);
+
+ /* Convert to magic IPI if needed */
+ if (val == xs->ipi_irq)
+ val = 2;
+ if (check_misrouted_ipi(c, val))
+ val = 2;
+
+ *out_xirr = cpu_to_be32((old_cppr << 24) | val);
+
+ /* If we are polling, that's it */
+ if (just_poll)
+ goto skip;
+
+ /* Clear the pending bit. EOI will set it again if needed. We
+ * could check the queue but that's not really critical here.
+ */
+ xs->pending &= ~(1 << prio);
+
+ /* Spurious IPB bit, nothing to fetch, bring CPPR back */
+ if (!val)
+ prio = old_cppr;
+
+ /* We could have fetched a pending interrupt left over
+ * by a previous EOI, so the CPPR might need adjusting.
+ * The same applies if we had a spurious one.
+ */
+ if (xs->cppr != prio) {
+ xs->cppr = prio;
+ out_8(xs->tm_ring1 + TM_QW3_HV_PHYS + TM_CPPR, prio);
+ xive_cpu_vdbg(c, " adjusted CPPR to %d\n", prio);
+ }
+
+ if (val)
+ xive_cpu_vdbg(c, " found irq, prio=%d\n", prio);
+
+ } else {
+ /* Nothing was active, this is a fluke, restore CPPR */
+ opal_xive_update_cppr(xs, old_cppr);
+ xive_cpu_vdbg(c, " nothing active, restored CPPR to %d\n",
+ old_cppr);
+ }
+ skip:
+
+ log_add(xs, LOG_TYPE_XIRR2, 5, xs->cppr, xs->pending,
+ be32_to_cpu(*out_xirr), xs->eqptr, xs->eqgen);
+ xive_cpu_vdbg(c, " returning XIRR=%08x, pending=0x%x\n",
+ be32_to_cpu(*out_xirr), xs->pending);
+
+ unlock(&xs->lock);
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t opal_xive_set_cppr(uint8_t cppr)
+{
+ struct cpu_thread *c = this_cpu();
+ struct xive_cpu_state *xs = c->xstate;
+
+ if (xive_mode != XIVE_MODE_EMU)
+ return OPAL_WRONG_STATE;
+
+ /* Limit supported CPPR values */
+ cppr = xive_sanitize_cppr(cppr);
+
+ if (!xs)
+ return OPAL_INTERNAL_ERROR;
+ xive_cpu_vdbg(c, "CPPR setting to %d\n", cppr);
+
+ lock(&xs->lock);
+ opal_xive_update_cppr(xs, cppr);
+ unlock(&xs->lock);
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t opal_xive_set_mfrr(uint32_t cpu, uint8_t mfrr)
+{
+ struct cpu_thread *c = find_cpu_by_server(cpu);
+ struct xive_cpu_state *xs;
+ uint8_t old_mfrr;
+
+ if (xive_mode != XIVE_MODE_EMU)
+ return OPAL_WRONG_STATE;
+ if (!c)
+ return OPAL_PARAMETER;
+ xs = c->xstate;
+ if (!xs)
+ return OPAL_INTERNAL_ERROR;
+
+ lock(&xs->lock);
+ old_mfrr = xs->mfrr;
+ xive_cpu_vdbg(c, " Setting MFRR to %x, old is %x\n", mfrr, old_mfrr);
+ xs->mfrr = mfrr;
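+ /* Only trigger the IPI if the new MFRR is more favored than both
+ * the previous value and the current CPPR
+ */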
+ if (old_mfrr > mfrr && mfrr < xs->cppr)
+ xive_ipi_trigger(xs->xive, GIRQ_TO_IDX(xs->ipi_irq));
+ unlock(&xs->lock);
+
+ return OPAL_SUCCESS;
+}
+
+static uint64_t xive_convert_irq_flags(uint64_t iflags)
+{
+ uint64_t oflags = 0;
+
+ if (iflags & XIVE_SRC_STORE_EOI)
+ oflags |= OPAL_XIVE_IRQ_STORE_EOI;
+
+ /* OPAL_XIVE_IRQ_TRIGGER_PAGE is only meant to be set if
+ * the interrupt has a *separate* trigger page.
+ */
+ if ((iflags & XIVE_SRC_EOI_PAGE1) &&
+ (iflags & XIVE_SRC_TRIGGER_PAGE))
+ oflags |= OPAL_XIVE_IRQ_TRIGGER_PAGE;
+
+ if (iflags & XIVE_SRC_LSI)
+ oflags |= OPAL_XIVE_IRQ_LSI;
+ return oflags;
+}
+
+static int64_t opal_xive_get_irq_info(uint32_t girq,
+ __be64 *out_flags,
+ __be64 *out_eoi_page,
+ __be64 *out_trig_page,
+ __be32 *out_esb_shift,
+ __be32 *out_src_chip)
+{
+ struct irq_source *is = irq_find_source(girq);
+ struct xive_src *s = container_of(is, struct xive_src, is);
+ uint32_t idx;
+ uint64_t mm_base;
+ uint64_t eoi_page = 0, trig_page = 0;
+
+ if (xive_mode != XIVE_MODE_EXPL)
+ return OPAL_WRONG_STATE;
+ if (is == NULL || out_flags == NULL)
+ return OPAL_PARAMETER;
+ assert(is->ops == &xive_irq_source_ops);
+
+ if (out_flags)
+ *out_flags = cpu_to_be64(xive_convert_irq_flags(s->flags));
+
+ idx = girq - s->esb_base;
+
+ if (out_esb_shift)
+ *out_esb_shift = cpu_to_be32(s->esb_shift);
+
+ mm_base = (uint64_t)s->esb_mmio + (1ull << s->esb_shift) * idx;
+
+ /* The EOI page can either be the first or second page */
+ if (s->flags & XIVE_SRC_EOI_PAGE1) {
+ uint64_t p1off = 1ull << (s->esb_shift - 1);
+ eoi_page = mm_base + p1off;
+ } else
+ eoi_page = mm_base;
+
+ /* The trigger page, if it exists, is always the first page */
+ if (s->flags & XIVE_SRC_TRIGGER_PAGE)
+ trig_page = mm_base;
+
+ if (out_eoi_page)
+ *out_eoi_page = cpu_to_be64(eoi_page);
+ if (out_trig_page)
+ *out_trig_page = cpu_to_be64(trig_page);
+ if (out_src_chip)
+ *out_src_chip = cpu_to_be32(GIRQ_TO_CHIP(girq));
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t opal_xive_get_irq_config(uint32_t girq,
+ __be64 *out_vp,
+ uint8_t *out_prio,
+ __be32 *out_lirq)
+{
+ uint32_t vp;
+ uint32_t lirq;
+ uint8_t prio;
+
+ if (xive_mode != XIVE_MODE_EXPL)
+ return OPAL_WRONG_STATE;
+
+ if (xive_get_irq_targetting(girq, &vp, &prio, &lirq)) {
+ *out_vp = cpu_to_be64(vp);
+ *out_prio = prio;
+ *out_lirq = cpu_to_be32(lirq);
+ return OPAL_SUCCESS;
+ } else
+ return OPAL_PARAMETER;
+}
+
+static int64_t opal_xive_set_irq_config(uint32_t girq,
+ uint64_t vp,
+ uint8_t prio,
+ uint32_t lirq)
+{
+ /*
+ * This variant is meant for a XIVE-aware OS, thus it will
+ * *not* affect the ESB state of the interrupt. If used with
+ * a prio of FF, the IVT/EAS will be masked. In that case the
+ * races have to be handled by the OS.
+ *
+ * The exception to this rule is interrupts for which masking
+ * and unmasking is handled by firmware. In that case the ESB
+ * state isn't under OS control and will be dealt with here. This
+ * is currently only the case for LSIs, and only on P9 DD1.0, so
+ * it isn't an issue.
+ */
+
+ if (xive_mode != XIVE_MODE_EXPL)
+ return OPAL_WRONG_STATE;
+
+ return xive_set_irq_config(girq, vp, prio, lirq, false);
+}
+
+static int64_t opal_xive_get_queue_info(uint64_t vp, uint32_t prio,
+ __be64 *out_qpage,
+ __be64 *out_qsize,
+ __be64 *out_qeoi_page,
+ __be32 *out_escalate_irq,
+ __be64 *out_qflags)
+{
+ uint32_t blk, idx;
+ struct xive *x;
+ struct xive_eq *eq;
+
+ if (xive_mode != XIVE_MODE_EXPL)
+ return OPAL_WRONG_STATE;
+
+ if (!xive_eq_for_target(vp, prio, &blk, &idx))
+ return OPAL_PARAMETER;
+
+ x = xive_from_vc_blk(blk);
+ if (!x)
+ return OPAL_PARAMETER;
+
+ eq = xive_get_eq(x, idx);
+ if (!eq)
+ return OPAL_PARAMETER;
+
+ if (out_escalate_irq) {
+ uint32_t esc_idx = idx;
+
+ /* If escalations are routed to a single queue, fix up
+ * the escalation interrupt number here.
+ */
+ if (xive_get_field32(EQ_W0_UNCOND_ESCALATE, eq->w0))
+ esc_idx |= XIVE_ESCALATION_PRIO;
+
+ *out_escalate_irq =
+ cpu_to_be32(MAKE_ESCALATION_GIRQ(blk, esc_idx));
+ }
+
+ /* If this is a single-escalation gather queue, that's all
+ * there is to return
+ */
+ if (xive_get_field32(EQ_W0_SILENT_ESCALATE, eq->w0)) {
+ if (out_qflags)
+ *out_qflags = 0;
+ if (out_qpage)
+ *out_qpage = 0;
+ if (out_qsize)
+ *out_qsize = 0;
+ if (out_qeoi_page)
+ *out_qeoi_page = 0;
+ return OPAL_SUCCESS;
+ }
+
+ if (out_qpage) {
+ if (xive_get_field32(EQ_W0_ENQUEUE, eq->w0))
+ *out_qpage = cpu_to_be64(((uint64_t)xive_get_field32(EQ_W2_OP_DESC_HI, eq->w2) << 32) | be32_to_cpu(eq->w3));
+ else
+ *out_qpage = 0;
+ }
+ if (out_qsize) {
+ if (xive_get_field32(EQ_W0_ENQUEUE, eq->w0))
+ *out_qsize = cpu_to_be64(xive_get_field32(EQ_W0_QSIZE, eq->w0) + 12);
+ else
+ *out_qsize = 0;
+ }
+ if (out_qeoi_page) {
+ *out_qeoi_page =
+ cpu_to_be64((uint64_t)x->eq_mmio + idx * XIVE_ESB_PAGE_SIZE);
+ }
+ if (out_qflags) {
+ *out_qflags = 0;
+ if (xive_get_field32(EQ_W0_VALID, eq->w0))
+ *out_qflags |= cpu_to_be64(OPAL_XIVE_EQ_ENABLED);
+ if (xive_get_field32(EQ_W0_UCOND_NOTIFY, eq->w0))
+ *out_qflags |= cpu_to_be64(OPAL_XIVE_EQ_ALWAYS_NOTIFY);
+ if (xive_get_field32(EQ_W0_ESCALATE_CTL, eq->w0))
+ *out_qflags |= cpu_to_be64(OPAL_XIVE_EQ_ESCALATE);
+ }
+
+ return OPAL_SUCCESS;
+}
+
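+/*
+ * Reset an EQ to its unconfigured state: only the firmware-owned
+ * flag bits in W0 are preserved, and both the ESn and ESe bits are
+ * left in the "Q" state.
+ */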
+static void xive_cleanup_eq(struct xive_eq *eq)
+{
+ eq->w0 = xive_set_field32(EQ_W0_FIRMWARE, 0, xive_get_field32(EQ_W0_FIRMWARE, eq->w0));
+ eq->w1 = cpu_to_be32(EQ_W1_ESe_Q | EQ_W1_ESn_Q);
+ eq->w2 = eq->w3 = eq->w4 = eq->w5 = eq->w6 = eq->w7 = 0;
+}
+
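+/*
+ * Configure (or disable) the queue for a given VP/priority. qsize is
+ * the log2 of the queue size in bytes (4K, 64K, 2M or 16M), or 0 to
+ * run without a queue page.
+ */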
+static int64_t opal_xive_set_queue_info(uint64_t vp, uint32_t prio,
+ uint64_t qpage,
+ uint64_t qsize,
+ uint64_t qflags)
+{
+ uint32_t blk, idx;
+ struct xive *x;
+ struct xive_eq *old_eq;
+ struct xive_eq eq;
+ uint32_t vp_blk, vp_idx;
+ bool group;
+ int64_t rc;
+
+ if (xive_mode != XIVE_MODE_EXPL)
+ return OPAL_WRONG_STATE;
+ if (!xive_eq_for_target(vp, prio, &blk, &idx))
+ return OPAL_PARAMETER;
+
+ x = xive_from_vc_blk(blk);
+ if (!x)
+ return OPAL_PARAMETER;
+
+ old_eq = xive_get_eq(x, idx);
+ if (!old_eq)
+ return OPAL_PARAMETER;
+
+ /* If this is a silent escalation queue, it cannot be
+ * configured directly
+ */
+ if (xive_get_field32(EQ_W0_SILENT_ESCALATE, old_eq->w0))
+ return OPAL_PARAMETER;
+
+ /* This shouldn't fail or xive_eq_for_target would have
+ * failed already
+ */
+ if (!xive_decode_vp(vp, &vp_blk, &vp_idx, NULL, &group))
+ return OPAL_PARAMETER;
+
+ /*
+ * Make a local copy which we will later try to commit using
+ * the cache watch facility
+ */
+ eq = *old_eq;
+
+ if (qflags & OPAL_XIVE_EQ_ENABLED) {
+ switch(qsize) {
+ /* Supported sizes */
+ case 12:
+ case 16:
+ case 21:
+ case 24:
+ eq.w3 = cpu_to_be32(((uint64_t)qpage) & EQ_W3_OP_DESC_LO);
+ eq.w2 = cpu_to_be32((((uint64_t)qpage) >> 32) & EQ_W2_OP_DESC_HI);
+ eq.w0 = xive_set_field32(EQ_W0_ENQUEUE, eq.w0, 1);
+ eq.w0 = xive_set_field32(EQ_W0_QSIZE, eq.w0, qsize - 12);
+ break;
+ case 0:
+ eq.w2 = eq.w3 = 0;
+ eq.w0 = xive_set_field32(EQ_W0_ENQUEUE, eq.w0, 0);
+ break;
+ default:
+ return OPAL_PARAMETER;
+ }
+
+ /* Ensure the priority and target are correctly set (they will
+ * not be right after allocation)
+ */
+ eq.w6 = xive_set_field32(EQ_W6_NVT_BLOCK, 0, vp_blk) |
+ xive_set_field32(EQ_W6_NVT_INDEX, 0, vp_idx);
+ eq.w7 = xive_set_field32(EQ_W7_F0_PRIORITY, 0, prio);
+ /* XXX Handle group i bit when needed */
+
+ /* Always notify flag */
+ if (qflags & OPAL_XIVE_EQ_ALWAYS_NOTIFY)
+ eq.w0 = xive_set_field32(EQ_W0_UCOND_NOTIFY, eq.w0, 1);
+ else
+ eq.w0 = xive_set_field32(EQ_W0_UCOND_NOTIFY, eq.w0, 0);
+
+ /* Escalation flag */
+ if (qflags & OPAL_XIVE_EQ_ESCALATE)
+ eq.w0 = xive_set_field32(EQ_W0_ESCALATE_CTL, eq.w0, 1);
+ else
+ eq.w0 = xive_set_field32(EQ_W0_ESCALATE_CTL, eq.w0, 0);
+
+ /* Unconditionally clear the current queue pointer, set
+ * generation to 1 and disable escalation interrupts.
+ */
+ eq.w1 = xive_set_field32(EQ_W1_GENERATION, 0, 1) |
+ xive_set_field32(EQ_W1_ES, 0, xive_get_field32(EQ_W1_ES, old_eq->w1));
+
+ /* Enable. We always enable backlog for an enabled queue
+ * otherwise escalations won't work.
+ */
+ eq.w0 = xive_set_field32(EQ_W0_VALID, eq.w0, 1);
+ eq.w0 = xive_set_field32(EQ_W0_BACKLOG, eq.w0, 1);
+ } else
+ xive_cleanup_eq(&eq);
+
+ /* Update EQ, non-synchronous */
+ lock(&x->lock);
+ rc = xive_eqc_cache_update(x, blk, idx, &eq, false);
+ unlock(&x->lock);
+
+ return rc;
+}
+
+static int64_t opal_xive_get_queue_state(uint64_t vp, uint32_t prio,
+ __be32 *out_qtoggle,
+ __be32 *out_qindex)
+{
+ uint32_t blk, idx;
+ struct xive *x;
+ struct xive_eq *eq;
+ int64_t rc;
+
+ if (xive_mode != XIVE_MODE_EXPL)
+ return OPAL_WRONG_STATE;
+
+ if (!out_qtoggle || !out_qindex ||
+ !xive_eq_for_target(vp, prio, &blk, &idx))
+ return OPAL_PARAMETER;
+
+ x = xive_from_vc_blk(blk);
+ if (!x)
+ return OPAL_PARAMETER;
+
+ eq = xive_get_eq(x, idx);
+ if (!eq)
+ return OPAL_PARAMETER;
+
+ /* Scrub the queue */
+ lock(&x->lock);
+ rc = xive_eqc_scrub(x, blk, idx);
+ unlock(&x->lock);
+ if (rc)
+ return rc;
+
+ /* We don't deal with disabled queues */
+ if (!xive_get_field32(EQ_W0_VALID, eq->w0))
+ return OPAL_WRONG_STATE;
+
+ *out_qtoggle = cpu_to_be32(xive_get_field32(EQ_W1_GENERATION, eq->w1));
+ *out_qindex = cpu_to_be32(xive_get_field32(EQ_W1_PAGE_OFF, eq->w1));
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t opal_xive_set_queue_state(uint64_t vp, uint32_t prio,
+ uint32_t qtoggle, uint32_t qindex)
+{
+ uint32_t blk, idx;
+ struct xive *x;
+ struct xive_eq *eq, new_eq;
+ int64_t rc;
+
+ if (xive_mode != XIVE_MODE_EXPL)
+ return OPAL_WRONG_STATE;
+
+ if (!xive_eq_for_target(vp, prio, &blk, &idx))
+ return OPAL_PARAMETER;
+
+ x = xive_from_vc_blk(blk);
+ if (!x)
+ return OPAL_PARAMETER;
+
+ eq = xive_get_eq(x, idx);
+ if (!eq)
+ return OPAL_PARAMETER;
+
+ /* We don't deal with disabled queues */
+ if (!xive_get_field32(EQ_W0_VALID, eq->w0))
+ return OPAL_WRONG_STATE;
+
+ new_eq = *eq;
+
+ new_eq.w1 = xive_set_field32(EQ_W1_GENERATION, new_eq.w1, qtoggle);
+ new_eq.w1 = xive_set_field32(EQ_W1_PAGE_OFF, new_eq.w1, qindex);
+
+ lock(&x->lock);
+ rc = xive_eqc_cache_update(x, blk, idx, &new_eq, false);
+ unlock(&x->lock);
+
+ return rc;
+}
+
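+/*
+ * Accept a 64K-aligned page donated by the OS and queue it on the
+ * chip's list; these pages are consumed later when the XIVE code
+ * needs extra backing storage (e.g. VP/EQ provisioning).
+ */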
+static int64_t opal_xive_donate_page(uint32_t chip_id, uint64_t addr)
+{
+ struct proc_chip *c = get_chip(chip_id);
+ struct list_node *n;
+
+ if (xive_mode != XIVE_MODE_EXPL)
+ return OPAL_WRONG_STATE;
+ if (!c)
+ return OPAL_PARAMETER;
+ if (!c->xive)
+ return OPAL_PARAMETER;
+ if (addr & 0xffff)
+ return OPAL_PARAMETER;
+
+ n = (struct list_node *)addr;
+ lock(&c->xive->lock);
+ list_add(&c->xive->donated_pages, n);
+ unlock(&c->xive->lock);
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t opal_xive_get_vp_info(uint64_t vp_id,
+ __be64 *out_flags,
+ __be64 *out_cam_value,
+ __be64 *out_report_cl_pair,
+ __be32 *out_chip_id)
+{
+ struct xive *x;
+ struct xive_vp *vp;
+ uint32_t blk, idx;
+ bool group;
+
+ if (!xive_decode_vp(vp_id, &blk, &idx, NULL, &group))
+ return OPAL_PARAMETER;
+ /* We don't do groups yet */
+ if (group)
+ return OPAL_PARAMETER;
+ x = xive_from_pc_blk(blk);
+ if (!x)
+ return OPAL_PARAMETER;
+ vp = xive_get_vp(x, idx);
+ if (!vp)
+ return OPAL_PARAMETER;
+
+ if (out_flags) {
+ uint32_t eq_blk, eq_idx;
+ struct xive_eq *eq;
+ struct xive *eq_x;
+ *out_flags = 0;
+
+ /* We would like to have a way to stash a SW bit in the VP to
+ * know whether silent escalation is enabled or not, but
+ * unlike what happens with EQs, the PC cache watch doesn't
+ * implement the reserved bit in the VPs... so we have to go
+ * look at EQ 7 instead.
+ */
+ /* Grab EQ for prio 7 to check for silent escalation */
+ if (!xive_eq_for_target(vp_id, XIVE_ESCALATION_PRIO,
+ &eq_blk, &eq_idx))
+ return OPAL_PARAMETER;
+
+ eq_x = xive_from_vc_blk(eq_blk);
+ if (!eq_x)
+ return OPAL_PARAMETER;
+
+ eq = xive_get_eq(eq_x, eq_idx);
+ if (!eq)
+ return OPAL_PARAMETER;
+ if (xive_get_field32(VP_W0_VALID, vp->w0))
+ *out_flags |= cpu_to_be64(OPAL_XIVE_VP_ENABLED);
+ if (xive_get_field32(EQ_W0_SILENT_ESCALATE, eq->w0))
+ *out_flags |= cpu_to_be64(OPAL_XIVE_VP_SINGLE_ESCALATION);
+ }
+
+ if (out_cam_value)
+ *out_cam_value = cpu_to_be64((blk << NVT_SHIFT) | idx);
+
+ if (out_report_cl_pair) {
+ *out_report_cl_pair = cpu_to_be64(((uint64_t)(be32_to_cpu(vp->w6) & 0x0fffffff)) << 32);
+ *out_report_cl_pair |= cpu_to_be64(be32_to_cpu(vp->w7) & 0xffffff00);
+ }
+
+ if (out_chip_id)
+ *out_chip_id = cpu_to_be32(xive_block_to_chip[blk]);
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t xive_setup_silent_gather(uint64_t vp_id, bool enable)
+{
+ uint32_t blk, idx, i;
+ struct xive_eq *eq_orig;
+ struct xive_eq eq;
+ struct xive *x;
+ int64_t rc;
+
+ /* Get base EQ block */
+ if (!xive_eq_for_target(vp_id, 0, &blk, &idx))
+ return OPAL_PARAMETER;
+ x = xive_from_vc_blk(blk);
+ if (!x)
+ return OPAL_PARAMETER;
+
+ /* Grab prio 7 */
+ eq_orig = xive_get_eq(x, idx + XIVE_ESCALATION_PRIO);
+ if (!eq_orig)
+ return OPAL_PARAMETER;
+
+ /* If trying to enable silent gather, make sure prio 7 is not
+ * already enabled as a normal queue
+ */
+ if (enable && xive_get_field32(EQ_W0_VALID, eq_orig->w0) &&
+ !xive_get_field32(EQ_W0_SILENT_ESCALATE, eq_orig->w0)) {
+ xive_dbg(x, "Attempt at enabling silent gather but"
+ " prio 7 queue already in use\n");
+ return OPAL_PARAMETER;
+ }
+
+ eq = *eq_orig;
+
+ if (enable) {
+ /* W0: Enabled and "s" set, no other bit */
+ eq.w0 = xive_set_field32(EQ_W0_FIRMWARE, 0, xive_get_field32(EQ_W0_FIRMWARE, eq.w0)) |
+ xive_set_field32(EQ_W0_VALID, 0, 1) |
+ xive_set_field32(EQ_W0_SILENT_ESCALATE, 0, 1) |
+ xive_set_field32(EQ_W0_ESCALATE_CTL, 0, 1) |
+ xive_set_field32(EQ_W0_BACKLOG, 0, 1);
+
+ /* W1: Mark ESn as 01, ESe as 00 */
+ eq.w1 = xive_set_field32(EQ_W1_ESn_P, eq.w1, 0);
+ eq.w1 = xive_set_field32(EQ_W1_ESn_Q, eq.w1, 1);
+ eq.w1 = xive_set_field32(EQ_W1_ESe, eq.w1, 0);
+ } else if (xive_get_field32(EQ_W0_SILENT_ESCALATE, eq.w0))
+ xive_cleanup_eq(&eq);
+
+ if (!memcmp(eq_orig, &eq, sizeof(eq)))
+ rc = 0;
+ else
+ rc = xive_eqc_cache_update(x, blk, idx + XIVE_ESCALATION_PRIO,
+ &eq, false);
+ if (rc)
+ return rc;
+
+ /* Mark/unmark all other prios with the new "u" bit and update
+ * escalation
+ */
+ for (i = 0; i < NUM_INT_PRIORITIES; i++) {
+ if (i == XIVE_ESCALATION_PRIO)
+ continue;
+ eq_orig = xive_get_eq(x, idx + i);
+ if (!eq_orig)
+ continue;
+ eq = *eq_orig;
+ if (enable) {
+ /* Set new "u" bit */
+ eq.w0 = xive_set_field32(EQ_W0_UNCOND_ESCALATE, eq.w0, 1);
+
+ /* Re-route escalation interrupt (previous
+ * route is lost !) to the gather queue
+ */
+ eq.w4 = xive_set_field32(EQ_W4_ESC_EQ_BLOCK, eq.w4, blk);
+ eq.w4 = xive_set_field32(EQ_W4_ESC_EQ_INDEX, eq.w4, idx + XIVE_ESCALATION_PRIO);
+ } else if (xive_get_field32(EQ_W0_UNCOND_ESCALATE, eq.w0)) {
+ /* Clear the "u" bit, disable escalations if it was set */
+ eq.w0 = xive_set_field32(EQ_W0_UNCOND_ESCALATE, eq.w0, 0);
+ eq.w0 = xive_set_field32(EQ_W0_ESCALATE_CTL, eq.w0, 0);
+ }
+ if (!memcmp(eq_orig, &eq, sizeof(eq)))
+ continue;
+ rc = xive_eqc_cache_update(x, blk, idx + i, &eq, false);
+ if (rc)
+ break;
+ }
+
+ return rc;
+}
+
+static int64_t opal_xive_set_vp_info(uint64_t vp_id,
+ uint64_t flags,
+ uint64_t report_cl_pair)
+{
+ struct xive *x;
+ struct xive_vp *vp, vp_new;
+ uint32_t blk, idx;
+ bool group;
+ int64_t rc;
+
+ if (!xive_decode_vp(vp_id, &blk, &idx, NULL, &group))
+ return OPAL_PARAMETER;
+ /* We don't do groups yet */
+ if (group)
+ return OPAL_PARAMETER;
+ if (report_cl_pair & 0xff)
+ return OPAL_PARAMETER;
+ x = xive_from_pc_blk(blk);
+ if (!x)
+ return OPAL_PARAMETER;
+ vp = xive_get_vp(x, idx);
+ if (!vp)
+ return OPAL_PARAMETER;
+
+ lock(&x->lock);
+
+ vp_new = *vp;
+ if (flags & OPAL_XIVE_VP_ENABLED) {
+ vp_new.w0 = xive_set_field32(VP_W0_VALID, vp_new.w0, 1);
+ vp_new.w6 = cpu_to_be32(report_cl_pair >> 32);
+ vp_new.w7 = cpu_to_be32(report_cl_pair & 0xffffffff);
+
+ if (flags & OPAL_XIVE_VP_SINGLE_ESCALATION)
+ rc = xive_setup_silent_gather(vp_id, true);
+ else
+ rc = xive_setup_silent_gather(vp_id, false);
+ } else {
+ vp_new.w0 = vp_new.w6 = vp_new.w7 = 0;
+ rc = xive_setup_silent_gather(vp_id, false);
+ }
+
+ if (rc) {
+ if (rc != OPAL_BUSY)
+ xive_dbg(x, "Silent gather setup failed with err %lld\n", rc);
+ goto bail;
+ }
+
+ rc = xive_vpc_cache_update(x, blk, idx, &vp_new, false);
+ if (rc)
+ goto bail;
+
+ /* When disabling, we scrub clean (invalidate the entry) so
+ * we can avoid cache ops in alloc/free
+ */
+ if (!(flags & OPAL_XIVE_VP_ENABLED))
+ xive_vpc_scrub_clean(x, blk, idx);
+
+bail:
+ unlock(&x->lock);
+ return rc;
+}
+
+static int64_t opal_xive_get_vp_state(uint64_t vp_id, __be64 *out_state)
+{
+ struct xive *x;
+ struct xive_vp *vp;
+ uint32_t blk, idx;
+ int64_t rc;
+ bool group;
+
+ if (!out_state || !xive_decode_vp(vp_id, &blk, &idx, NULL, &group))
+ return OPAL_PARAMETER;
+ if (group)
+ return OPAL_PARAMETER;
+ x = xive_from_pc_blk(blk);
+ if (!x)
+ return OPAL_PARAMETER;
+ vp = xive_get_vp(x, idx);
+ if (!vp)
+ return OPAL_PARAMETER;
+
+ /* Scrub the vp */
+ lock(&x->lock);
+ rc = xive_vpc_scrub(x, blk, idx);
+ unlock(&x->lock);
+ if (rc)
+ return rc;
+
+ if (!xive_get_field32(VP_W0_VALID, vp->w0))
+ return OPAL_WRONG_STATE;
+
+ /*
+ * Return word4 and word5 which contain the saved HW thread
+	 * context. The IPB register is all we care about for now on P9.
+ */
+ *out_state = cpu_to_be64((((uint64_t)be32_to_cpu(vp->w4)) << 32) | be32_to_cpu(vp->w5));
+
+ return OPAL_SUCCESS;
+}
+
+static void xive_cleanup_cpu_tima(struct cpu_thread *c)
+{
+ struct xive_cpu_state *xs = c->xstate;
+ struct xive *x = xs->xive;
+ void *ind_tm_base = x->ic_base + (4 << x->ic_shift);
+ uint8_t old_w2 __unused, w2 __unused;
+
+ /* Reset the HW context */
+ xive_reset_enable_thread(c);
+
+ /* Setup indirect access to the corresponding thread */
+ xive_regw(x, PC_TCTXT_INDIR0,
+ PC_TCTXT_INDIR_VALID |
+ SETFIELD(PC_TCTXT_INDIR_THRDID, 0ull, c->pir & 0xff));
+
+ /* Workaround for HW issue: Need to read the above register
+ * back before doing the subsequent accesses
+ */
+ xive_regr(x, PC_TCTXT_INDIR0);
+
+ /* Set VT to 1 */
+ old_w2 = in_8(ind_tm_base + TM_QW3_HV_PHYS + TM_WORD2);
+ out_8(ind_tm_base + TM_QW3_HV_PHYS + TM_WORD2, 0x80);
+ w2 = in_8(ind_tm_base + TM_QW3_HV_PHYS + TM_WORD2);
+
+ /* Dump HV state */
+ xive_cpu_vdbg(c, "[reset] VP TIMA VP=%x/%x W01=%016llx W2=%02x->%02x\n",
+ xs->vp_blk, xs->vp_idx,
+ in_be64(ind_tm_base + TM_QW3_HV_PHYS),
+ old_w2, w2);
+
+ /* Reset indirect access */
+ xive_regw(x, PC_TCTXT_INDIR0, 0);
+}
+
+static int64_t xive_vc_ind_cache_kill(struct xive *x, uint64_t type)
+{
+ uint64_t val;
+
+ /* We clear the whole thing */
+ xive_regw(x, VC_AT_MACRO_KILL_MASK, 0);
+ xive_regw(x, VC_AT_MACRO_KILL, VC_KILL_VALID |
+ SETFIELD(VC_KILL_TYPE, 0ull, type));
+
+ /* XXX SIMICS problem ? */
+ if (chip_quirk(QUIRK_SIMICS))
+ return 0;
+
+ /* XXX Add timeout */
+ for (;;) {
+ val = xive_regr(x, VC_AT_MACRO_KILL);
+ if (!(val & VC_KILL_VALID))
+ break;
+ }
+ return 0;
+}
+
+static int64_t xive_pc_ind_cache_kill(struct xive *x)
+{
+ uint64_t val;
+
+ /* We clear the whole thing */
+ xive_regw(x, PC_AT_KILL_MASK, 0);
+ xive_regw(x, PC_AT_KILL, PC_AT_KILL_VALID);
+
+ /* XXX SIMICS problem ? */
+ if (chip_quirk(QUIRK_SIMICS))
+ return 0;
+
+ /* XXX Add timeout */
+ for (;;) {
+ val = xive_regr(x, PC_AT_KILL);
+ if (!(val & PC_AT_KILL_VALID))
+ break;
+ }
+ return 0;
+}
+
+static void xive_cleanup_vp_ind(struct xive *x)
+{
+ int i;
+
+ xive_dbg(x, "Cleaning up %d VP ind entries...\n", x->vp_ind_count);
+ for (i = 0; i < x->vp_ind_count; i++) {
+ if (be64_to_cpu(x->vp_ind_base[i]) & VSD_FIRMWARE) {
+ xive_dbg(x, " %04x ... skip (firmware)\n", i);
+ continue;
+ }
+ if (x->vp_ind_base[i] != 0) {
+ x->vp_ind_base[i] = 0;
+ xive_dbg(x, " %04x ... cleaned\n", i);
+ }
+ }
+ xive_pc_ind_cache_kill(x);
+}
+
+static void xive_cleanup_eq_ind(struct xive *x)
+{
+ int i;
+
+ xive_dbg(x, "Cleaning up %d EQ ind entries...\n", x->eq_ind_count);
+ for (i = 0; i < x->eq_ind_count; i++) {
+ if (be64_to_cpu(x->eq_ind_base[i]) & VSD_FIRMWARE) {
+ xive_dbg(x, " %04x ... skip (firmware)\n", i);
+ continue;
+ }
+ if (x->eq_ind_base[i] != 0) {
+ x->eq_ind_base[i] = 0;
+ xive_dbg(x, " %04x ... cleaned\n", i);
+ }
+ }
+ xive_vc_ind_cache_kill(x, VC_KILL_EQD);
+}
+
+static void xive_reset_one(struct xive *x)
+{
+ struct cpu_thread *c;
+ bool eq_firmware;
+ int i;
+
+ xive_dbg(x, "Resetting one xive...\n");
+
+ lock(&x->lock);
+
+ /* Check all interrupts are disabled */
+ i = bitmap_find_one_bit(*x->int_enabled_map, 0, XIVE_INT_COUNT);
+ if (i >= 0)
+ xive_warn(x, "Interrupt %d (and maybe more) not disabled"
+ " at reset !\n", i);
+
+ /* Reset IPI allocation */
+ xive_dbg(x, "freeing alloc map %p/%p\n",
+ x->ipi_alloc_map, *x->ipi_alloc_map);
+ memset(x->ipi_alloc_map, 0, BITMAP_BYTES(XIVE_INT_COUNT));
+
+ xive_dbg(x, "Resetting EQs...\n");
+
+ /* Reset all allocated EQs and free the user ones */
+ bitmap_for_each_one(*x->eq_map, XIVE_EQ_COUNT >> 3, i) {
+ struct xive_eq eq0;
+ struct xive_eq *eq;
+ int j;
+
+ if (i == 0)
+ continue;
+ eq_firmware = false;
+ for (j = 0; j < NUM_INT_PRIORITIES; j++) {
+ uint32_t idx = (i << 3) | j;
+
+ eq = xive_get_eq(x, idx);
+ if (!eq)
+ continue;
+
+ /* We need to preserve the firmware bit, otherwise
+ * we will incorrectly free the EQs that are reserved
+ * for the physical CPUs
+ */
+ if (xive_get_field32(EQ_W0_VALID, eq->w0)) {
+ if (!xive_get_field32(EQ_W0_FIRMWARE, eq->w0))
+ xive_dbg(x, "EQ 0x%x:0x%x is valid at reset: %08x %08x\n",
+ x->block_id, idx, be32_to_cpu(eq->w0), be32_to_cpu(eq->w1));
+ eq0 = *eq;
+ xive_cleanup_eq(&eq0);
+ xive_eqc_cache_update(x, x->block_id, idx, &eq0, true);
+ }
+ if (xive_get_field32(EQ_W0_FIRMWARE, eq->w0))
+ eq_firmware = true;
+ }
+ if (!eq_firmware)
+ bitmap_clr_bit(*x->eq_map, i);
+ }
+
+ /* Take out all VPs from HW and reset all CPPRs to 0 */
+ for_each_present_cpu(c) {
+ if (c->chip_id != x->chip_id)
+ continue;
+ if (!c->xstate)
+ continue;
+ xive_cleanup_cpu_tima(c);
+ }
+
+	/* Reset all user-allocated VPs. This is inefficient; we should
+	 * either keep a bitmap of allocated VPs or add an iterator to
+	 * the buddy, which is trickier but doable.
+ */
+ for (i = 0; i < XIVE_VP_COUNT; i++) {
+ struct xive_vp *vp;
+ struct xive_vp vp0 = {0};
+
+ /* Ignore the physical CPU VPs */
+ if (i >= XIVE_HW_VP_BASE &&
+ i < (XIVE_HW_VP_BASE + XIVE_HW_VP_COUNT))
+ continue;
+
+ /* Is the VP valid ? */
+ vp = xive_get_vp(x, i);
+ if (!vp || !xive_get_field32(VP_W0_VALID, vp->w0))
+ continue;
+
+ /* Clear it */
+ xive_dbg(x, "VP 0x%x:0x%x is valid at reset\n", x->block_id, i);
+ xive_vpc_cache_update(x, x->block_id, i, &vp0, true);
+ }
+
+ /* Forget about remaining donated pages */
+ list_head_init(&x->donated_pages);
+
+ /* And cleanup donated indirect VP and EQ pages */
+ xive_cleanup_vp_ind(x);
+ xive_cleanup_eq_ind(x);
+
+ /* The rest must not be called with the lock held */
+ unlock(&x->lock);
+
+ /* Re-configure VPs and emulation */
+ for_each_present_cpu(c) {
+ struct xive_cpu_state *xs = c->xstate;
+
+ if (c->chip_id != x->chip_id || !xs)
+ continue;
+
+ if (xive_mode == XIVE_MODE_EMU)
+ xive_init_cpu_emulation(xs, c);
+ else
+ xive_init_cpu_exploitation(xs);
+ }
+}
+
+static void xive_reset_mask_source_cb(struct irq_source *is,
+ void *data __unused)
+{
+ struct xive_src *s = container_of(is, struct xive_src, is);
+ struct xive *x;
+ uint32_t isn;
+
+ if (is->ops != &xive_irq_source_ops)
+ return;
+
+ /* Skip escalation sources */
+ if (GIRQ_IS_ESCALATION(is->start))
+ return;
+
+ x = s->xive;
+
+ /* Iterate all interrupts */
+ for (isn = is->start; isn < is->end; isn++) {
+ /* Has it ever been enabled ? */
+ if (!bitmap_tst_bit(*x->int_enabled_map, GIRQ_TO_IDX(isn)))
+ continue;
+ /* Mask it and clear the enabled map bit */
+ xive_vdbg(x, "[reset] disabling source 0x%x\n", isn);
+ __xive_set_irq_config(is, isn, 0, 0xff, isn, true, false);
+ bitmap_clr_bit(*x->int_enabled_map, GIRQ_TO_IDX(isn));
+ }
+}
+
+void xive_cpu_reset(void)
+{
+ struct cpu_thread *c = this_cpu();
+ struct xive_cpu_state *xs = c->xstate;
+
+ xs->cppr = 0;
+ out_8(xs->tm_ring1 + TM_QW3_HV_PHYS + TM_CPPR, 0);
+
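+	/* Note: the MMIO read below is what pulls (removes) the pool VP
+	 * context from this thread; the returned value is not needed.
+	 */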
+ in_be64(xs->tm_ring1 + TM_SPC_PULL_POOL_CTX);
+}
+
+static int64_t __xive_reset(uint64_t version)
+{
+ struct proc_chip *chip;
+
+ xive_mode = version;
+
+ /* Mask all interrupt sources */
+ irq_for_each_source(xive_reset_mask_source_cb, NULL);
+
+ /* For each XIVE do a sync... */
+ for_each_chip(chip) {
+ if (!chip->xive)
+ continue;
+ xive_sync(chip->xive);
+ }
+
+ /* For each XIVE reset everything else... */
+ for_each_chip(chip) {
+ if (!chip->xive)
+ continue;
+ xive_reset_one(chip->xive);
+ }
+
+ /* Cleanup global VP allocator */
+ buddy_reset(xive_vp_buddy);
+
+ /* We reserve the whole range of VPs representing HW chips.
+ *
+ * These are 0x80..0xff, so order 7 starting at 0x80. This will
+ * reserve that range on each chip.
+ */
+ assert(buddy_reserve(xive_vp_buddy, XIVE_HW_VP_BASE,
+ XIVE_THREADID_SHIFT));
+
+ return OPAL_SUCCESS;
+}
+
+/* Called by fast reboot */
+int64_t xive_reset(void)
+{
+ if (xive_mode == XIVE_MODE_NONE)
+ return OPAL_SUCCESS;
+ return __xive_reset(XIVE_MODE_EMU);
+}
+
+static int64_t opal_xive_reset(uint64_t version)
+{
+ prlog(PR_DEBUG, "XIVE reset, version: %d...\n", (int)version);
+
+ if (version > 1)
+ return OPAL_PARAMETER;
+
+ return __xive_reset(version);
+}
+
+static int64_t opal_xive_free_vp_block(uint64_t vp_base)
+{
+ uint32_t blk, idx, i, j, count;
+ uint8_t order;
+ bool group;
+
+ if (xive_mode != XIVE_MODE_EXPL)
+ return OPAL_WRONG_STATE;
+
+ if (!xive_decode_vp(vp_base, &blk, &idx, &order, &group))
+ return OPAL_PARAMETER;
+ if (group)
+ return OPAL_PARAMETER;
+ if (blk)
+ return OPAL_PARAMETER;
+ if (order < (xive_chips_alloc_bits + 1))
+ return OPAL_PARAMETER;
+ if (idx & ((1 << (order - xive_chips_alloc_bits)) - 1))
+ return OPAL_PARAMETER;
+
+ count = 1 << order;
+ for (i = 0; i < count; i++) {
+ uint32_t vp_id = vp_base + i;
+ uint32_t blk, idx, eq_blk, eq_idx;
+ struct xive *x;
+ struct xive_vp *vp;
+
+ if (!xive_decode_vp(vp_id, &blk, &idx, NULL, NULL)) {
+ prerror("XIVE: Couldn't decode VP id %u\n", vp_id);
+ return OPAL_INTERNAL_ERROR;
+ }
+ x = xive_from_pc_blk(blk);
+ if (!x) {
+ prerror("XIVE: Instance not found for deallocated VP"
+ " block %d\n", blk);
+ return OPAL_INTERNAL_ERROR;
+ }
+ vp = xive_get_vp(x, idx);
+ if (!vp) {
+			prerror("XIVE: VP not found for deallocation!\n");
+ return OPAL_INTERNAL_ERROR;
+ }
+
+ /* VP must be disabled */
+ if (xive_get_field32(VP_W0_VALID, vp->w0)) {
+ prlog(PR_ERR, "XIVE: freeing active VP %d\n", vp_id);
+ return OPAL_XIVE_FREE_ACTIVE;
+ }
+
+ /* Not populated */
+ if (vp->w1 == 0)
+ continue;
+ eq_blk = be32_to_cpu(vp->w1) >> 28;
+ eq_idx = be32_to_cpu(vp->w1) & 0x0fffffff;
+
+ lock(&x->lock);
+
+ /* Ensure EQs are disabled and cleaned up. Ideally the caller
+ * should have done it but we double check it here
+ */
+ for (j = 0; j < NUM_INT_PRIORITIES; j++) {
+ struct xive *eq_x = xive_from_vc_blk(eq_blk);
+ struct xive_eq eq, *orig_eq = xive_get_eq(eq_x, eq_idx + j);
+
+ if (!xive_get_field32(EQ_W0_VALID, orig_eq->w0))
+ continue;
+
+ prlog(PR_WARNING, "XIVE: freeing VP %d with queue %d active\n",
+ vp_id, j);
+ eq = *orig_eq;
+ xive_cleanup_eq(&eq);
+ xive_eqc_cache_update(x, eq_blk, eq_idx + j, &eq, true);
+ }
+
+ /* Mark it not populated so we don't try to free it again */
+ vp->w1 = 0;
+
+ if (eq_blk != blk) {
+ prerror("XIVE: Block mismatch trying to free EQs\n");
+ unlock(&x->lock);
+ return OPAL_INTERNAL_ERROR;
+ }
+
+ xive_free_eq_set(x, eq_idx);
+ unlock(&x->lock);
+ }
+
+ xive_free_vps(vp_base);
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t opal_xive_alloc_vp_block(uint32_t alloc_order)
+{
+ uint32_t vp_base, eqs, count, i;
+ int64_t rc;
+
+ if (xive_mode != XIVE_MODE_EXPL)
+ return OPAL_WRONG_STATE;
+
+ prlog(PR_TRACE, "opal_xive_alloc_vp_block(%d)\n", alloc_order);
+
+ vp_base = xive_alloc_vps(alloc_order);
+ if (XIVE_ALLOC_IS_ERR(vp_base)) {
+ if (vp_base == XIVE_ALLOC_NO_IND)
+ return OPAL_XIVE_PROVISIONING;
+ return OPAL_RESOURCE;
+ }
+
+ /* Allocate EQs and initialize VPs */
+ count = 1 << alloc_order;
+ for (i = 0; i < count; i++) {
+ uint32_t vp_id = vp_base + i;
+ uint32_t blk, idx;
+ struct xive *x;
+ struct xive_vp *vp;
+
+ if (!xive_decode_vp(vp_id, &blk, &idx, NULL, NULL)) {
+ prerror("XIVE: Couldn't decode VP id %u\n", vp_id);
+ return OPAL_INTERNAL_ERROR;
+ }
+ x = xive_from_pc_blk(blk);
+ if (!x) {
+ prerror("XIVE: Instance not found for allocated VP"
+ " block %d\n", blk);
+ rc = OPAL_INTERNAL_ERROR;
+ goto fail;
+ }
+ vp = xive_get_vp(x, idx);
+ if (!vp) {
+			prerror("XIVE: VP not found after allocation!\n");
+ rc = OPAL_INTERNAL_ERROR;
+ goto fail;
+ }
+
+ /* Allocate EQs, if fails, free the VPs and return */
+ lock(&x->lock);
+ eqs = xive_alloc_eq_set(x, false);
+ unlock(&x->lock);
+ if (XIVE_ALLOC_IS_ERR(eqs)) {
+ if (eqs == XIVE_ALLOC_NO_IND)
+ rc = OPAL_XIVE_PROVISIONING;
+ else
+ rc = OPAL_RESOURCE;
+ goto fail;
+ }
+
+		/* Initialize the VP structure. We don't use a cache watch
+		 * as we made sure to scrub the entries out of the cache
+		 * when freeing them.
+ */
+ memset(vp, 0, sizeof(*vp));
+ vp->w1 = cpu_to_be32((blk << 28) | eqs);
+ }
+ return vp_base;
+ fail:
+ opal_xive_free_vp_block(vp_base);
+
+ return rc;
+}
+
+static int64_t xive_try_allocate_irq(struct xive *x)
+{
+ int idx, base_idx, max_count, girq;
+ struct xive_ive *ive;
+
+ lock(&x->lock);
+
+ base_idx = x->int_ipi_top - x->int_base;
+ max_count = x->int_hw_bot - x->int_ipi_top;
+
+ idx = bitmap_find_zero_bit(*x->ipi_alloc_map, base_idx, max_count);
+ if (idx < 0) {
+ unlock(&x->lock);
+ return OPAL_RESOURCE;
+ }
+ bitmap_set_bit(*x->ipi_alloc_map, idx);
+ girq = x->int_base + idx;
+
+ /* Mark the IVE valid. Don't bother with the HW cache, it's
+ * still masked anyway, the cache will be updated when unmasked
+ * and configured.
+ */
+ ive = xive_get_ive(x, girq);
+ if (!ive) {
+ bitmap_clr_bit(*x->ipi_alloc_map, idx);
+ unlock(&x->lock);
+ return OPAL_PARAMETER;
+ }
+ ive->w = xive_set_field64(IVE_VALID, 0ul, 1) |
+ xive_set_field64(IVE_MASKED, 0ul, 1) |
+ xive_set_field64(IVE_EQ_DATA, 0ul, girq);
+ unlock(&x->lock);
+
+ return girq;
+}
+
+static int64_t opal_xive_allocate_irq(uint32_t chip_id)
+{
+ struct proc_chip *chip;
+ bool try_all = false;
+ int64_t rc;
+
+ if (xive_mode != XIVE_MODE_EXPL)
+ return OPAL_WRONG_STATE;
+
+ if (chip_id == OPAL_XIVE_ANY_CHIP) {
+ try_all = true;
+ chip_id = this_cpu()->chip_id;
+ }
+ chip = get_chip(chip_id);
+ if (!chip)
+ return OPAL_PARAMETER;
+
+ /* Try initial target chip */
+ if (!chip->xive)
+ rc = OPAL_PARAMETER;
+ else
+ rc = xive_try_allocate_irq(chip->xive);
+ if (rc >= 0 || !try_all)
+ return rc;
+
+ /* Failed and we try all... do so */
+ for_each_chip(chip) {
+ if (!chip->xive)
+ continue;
+ rc = xive_try_allocate_irq(chip->xive);
+ if (rc >= 0)
+ break;
+ }
+ return rc;
+}
+
+static int64_t opal_xive_free_irq(uint32_t girq)
+{
+ struct irq_source *is = irq_find_source(girq);
+ struct xive_src *s = container_of(is, struct xive_src, is);
+ struct xive *x = xive_from_isn(girq);
+ struct xive_ive *ive;
+ uint32_t idx;
+
+ if (xive_mode != XIVE_MODE_EXPL)
+ return OPAL_WRONG_STATE;
+ if (!x || !is)
+ return OPAL_PARAMETER;
+
+ idx = GIRQ_TO_IDX(girq);
+
+ lock(&x->lock);
+
+ ive = xive_get_ive(x, girq);
+ if (!ive) {
+ unlock(&x->lock);
+ return OPAL_PARAMETER;
+ }
+
+ /* Mask the interrupt source */
+ xive_update_irq_mask(s, girq - s->esb_base, true);
+
+ /* Mark the IVE masked and invalid */
+ ive->w = xive_set_field64(IVE_VALID, 0ul, 1) |
+ xive_set_field64(IVE_MASKED, 0ul, 1);
+ xive_ivc_scrub(x, x->block_id, idx);
+
+ /* Free it */
+ if (!bitmap_tst_bit(*x->ipi_alloc_map, idx)) {
+ unlock(&x->lock);
+ return OPAL_PARAMETER;
+ }
+ bitmap_clr_bit(*x->ipi_alloc_map, idx);
+ bitmap_clr_bit(*x->int_enabled_map, idx);
+ unlock(&x->lock);
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t opal_xive_dump_tm(uint32_t offset, const char *n, uint32_t pir)
+{
+ struct cpu_thread *c = find_cpu_by_pir(pir);
+ struct xive_cpu_state *xs;
+ struct xive *x;
+ void *ind_tm_base;
+ uint64_t v0,v1;
+
+ if (!c)
+ return OPAL_PARAMETER;
+ xs = c->xstate;
+ if (!xs || !xs->tm_ring1)
+ return OPAL_INTERNAL_ERROR;
+ x = xs->xive;
+ ind_tm_base = x->ic_base + (4 << x->ic_shift);
+
+ lock(&x->lock);
+
+ /* Setup indirect access to the corresponding thread */
+ xive_regw(x, PC_TCTXT_INDIR0,
+ PC_TCTXT_INDIR_VALID |
+ SETFIELD(PC_TCTXT_INDIR_THRDID, 0ull, pir & 0xff));
+
+ /* Workaround for HW issue: Need to read the above register
+ * back before doing the subsequent accesses
+ */
+ xive_regr(x, PC_TCTXT_INDIR0);
+
+ v0 = in_be64(ind_tm_base + offset);
+ if (offset == TM_QW3_HV_PHYS) {
+ v1 = in_8(ind_tm_base + offset + 8);
+ v1 <<= 56;
+ } else {
+ v1 = in_be32(ind_tm_base + offset + 8);
+ v1 <<= 32;
+ }
+ prlog(PR_INFO, "CPU[%04x]: TM state for QW %s\n", pir, n);
+ prlog(PR_INFO, "CPU[%04x]: NSR CPPR IPB LSMFB ACK# INC AGE PIPR"
+ " W2 W3\n", pir);
+ prlog(PR_INFO, "CPU[%04x]: %02x %02x %02x %02x %02x "
+ "%02x %02x %02x %08x %08x\n", pir,
+ (uint8_t)(v0 >> 58) & 0xff, (uint8_t)(v0 >> 48) & 0xff,
+ (uint8_t)(v0 >> 40) & 0xff, (uint8_t)(v0 >> 32) & 0xff,
+ (uint8_t)(v0 >> 24) & 0xff, (uint8_t)(v0 >> 16) & 0xff,
+ (uint8_t)(v0 >> 8) & 0xff, (uint8_t)(v0 ) & 0xff,
+ (uint32_t)(v1 >> 32) & 0xffffffff,
+ (uint32_t)(v1 & 0xffffffff));
+
+
+ xive_regw(x, PC_TCTXT_INDIR0, 0);
+ unlock(&x->lock);
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t opal_xive_dump_vp(uint32_t vp_id)
+{
+ uint32_t blk, idx;
+ uint8_t order;
+ bool group;
+ struct xive *x;
+ struct xive_vp *vp;
+ uint32_t *vpw;
+
+ if (!xive_decode_vp(vp_id, &blk, &idx, &order, &group))
+ return OPAL_PARAMETER;
+
+ x = xive_from_vc_blk(blk);
+ if (!x)
+ return OPAL_PARAMETER;
+ vp = xive_get_vp(x, idx);
+ if (!vp)
+ return OPAL_PARAMETER;
+ lock(&x->lock);
+
+ xive_vpc_scrub_clean(x, blk, idx);
+
+ vpw = ((uint32_t *)vp) + (group ? 8 : 0);
+ prlog(PR_INFO, "VP[%08x]: 0..3: %08x %08x %08x %08x\n", vp_id,
+ vpw[0], vpw[1], vpw[2], vpw[3]);
+ prlog(PR_INFO, "VP[%08x]: 4..7: %08x %08x %08x %08x\n", vp_id,
+ vpw[4], vpw[5], vpw[6], vpw[7]);
+ unlock(&x->lock);
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t __opal_xive_dump_emu(struct xive_cpu_state *xs, uint32_t pir)
+{
+ struct xive_eq *eq;
+ uint32_t ipi_target;
+ uint8_t *mm, pq;
+
+ prlog(PR_INFO, "CPU[%04x]: XIVE emulation state\n", pir);
+
+ prlog(PR_INFO, "CPU[%04x]: cppr=%02x mfrr=%02x pend=%02x"
+ " prev_cppr=%02x total_irqs=%llx\n", pir,
+ xs->cppr, xs->mfrr, xs->pending, xs->prev_cppr, xs->total_irqs);
+
+ prlog(PR_INFO, "CPU[%04x]: EQ IDX=%x MSK=%x G=%d [%08x %08x %08x > %08x %08x %08x %08x ...]\n",
+ pir, xs->eqptr, xs->eqmsk, xs->eqgen,
+ xs->eqbuf[(xs->eqptr - 3) & xs->eqmsk],
+ xs->eqbuf[(xs->eqptr - 2) & xs->eqmsk],
+ xs->eqbuf[(xs->eqptr - 1) & xs->eqmsk],
+ xs->eqbuf[(xs->eqptr + 0) & xs->eqmsk],
+ xs->eqbuf[(xs->eqptr + 1) & xs->eqmsk],
+ xs->eqbuf[(xs->eqptr + 2) & xs->eqmsk],
+ xs->eqbuf[(xs->eqptr + 3) & xs->eqmsk]);
+
+ mm = xs->xive->esb_mmio + GIRQ_TO_IDX(xs->ipi_irq) * XIVE_ESB_PAGE_SIZE;
+ pq = in_8(mm + 0x10800);
+ if (xive_get_irq_targetting(xs->ipi_irq, &ipi_target, NULL, NULL))
+ prlog(PR_INFO, "CPU[%04x]: IPI #%08x PQ=%x target=%08x\n",
+ pir, xs->ipi_irq, pq, ipi_target);
+ else
+ prlog(PR_INFO, "CPU[%04x]: IPI #%08x PQ=%x target=??\n",
+ pir, xs->ipi_irq, pq);
+
+
+
+ __xive_cache_scrub(xs->xive, xive_cache_eqc, xs->eq_blk,
+ xs->eq_idx + XIVE_EMULATION_PRIO,
+ false, false);
+ eq = xive_get_eq(xs->xive, xs->eq_idx + XIVE_EMULATION_PRIO);
+ prlog(PR_INFO, "CPU[%04x]: EQ @%p W0=%08x W1=%08x qbuf @%p\n",
+ pir, eq, be32_to_cpu(eq->w0), be32_to_cpu(eq->w1), xs->eqbuf);
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t opal_xive_dump_emu(uint32_t pir)
+{
+ struct cpu_thread *c = find_cpu_by_pir(pir);
+ struct xive_cpu_state *xs;
+ int64_t rc;
+
+ if (!c)
+ return OPAL_PARAMETER;
+
+ xs = c->xstate;
+ if (!xs) {
+ prlog(PR_INFO, " <none>\n");
+ return OPAL_SUCCESS;
+ }
+ lock(&xs->lock);
+ rc = __opal_xive_dump_emu(xs, pir);
+ log_print(xs);
+ unlock(&xs->lock);
+
+ return rc;
+}
+
+static int64_t opal_xive_sync_irq_src(uint32_t girq)
+{
+ struct xive *x = xive_from_isn(girq);
+
+ if (!x)
+ return OPAL_PARAMETER;
+ return xive_sync(x);
+}
+
+static int64_t opal_xive_sync_irq_target(uint32_t girq)
+{
+ uint32_t target, vp_blk;
+ struct xive *x;
+
+ if (!xive_get_irq_targetting(girq, &target, NULL, NULL))
+ return OPAL_PARAMETER;
+ if (!xive_decode_vp(target, &vp_blk, NULL, NULL, NULL))
+ return OPAL_PARAMETER;
+ x = xive_from_pc_blk(vp_blk);
+ if (!x)
+ return OPAL_PARAMETER;
+ return xive_sync(x);
+}
+
+static int64_t opal_xive_sync(uint32_t type, uint32_t id)
+{
+	int64_t rc = OPAL_SUCCESS;
+
+ if (type & XIVE_SYNC_EAS)
+ rc = opal_xive_sync_irq_src(id);
+ if (rc)
+ return rc;
+ if (type & XIVE_SYNC_QUEUE)
+ rc = opal_xive_sync_irq_target(id);
+ if (rc)
+ return rc;
+
+ /* Add more ... */
+
+ return rc;
+}
+
+static int64_t opal_xive_dump(uint32_t type, uint32_t id)
+{
+ switch (type) {
+ case XIVE_DUMP_TM_HYP:
+ return opal_xive_dump_tm(TM_QW3_HV_PHYS, "PHYS", id);
+ case XIVE_DUMP_TM_POOL:
+ return opal_xive_dump_tm(TM_QW2_HV_POOL, "POOL", id);
+ case XIVE_DUMP_TM_OS:
+ return opal_xive_dump_tm(TM_QW1_OS, "OS ", id);
+ case XIVE_DUMP_TM_USER:
+ return opal_xive_dump_tm(TM_QW0_USER, "USER", id);
+ case XIVE_DUMP_VP:
+ return opal_xive_dump_vp(id);
+ case XIVE_DUMP_EMU_STATE:
+ return opal_xive_dump_emu(id);
+ default:
+ return OPAL_PARAMETER;
+ }
+}
+
+static void xive_init_globals(void)
+{
+ uint32_t i;
+
+ for (i = 0; i < XIVE_MAX_CHIPS; i++)
+ xive_block_to_chip[i] = XIVE_INVALID_CHIP;
+}
+
+void init_xive(void)
+{
+ struct dt_node *np;
+ struct proc_chip *chip;
+ struct cpu_thread *cpu;
+ struct xive *one_xive;
+ bool first = true;
+
+ /* Look for xive nodes and do basic inits */
+ dt_for_each_compatible(dt_root, np, "ibm,power9-xive-x") {
+ struct xive *x;
+
+ /* Initialize some global stuff */
+ if (first)
+ xive_init_globals();
+
+ /* Create/initialize the xive instance */
+ x = init_one_xive(np);
+ if (first)
+ one_xive = x;
+ first = false;
+ }
+ if (first)
+ return;
+
+ xive_mode = XIVE_MODE_EMU;
+
+ /* Init VP allocator */
+ xive_init_vp_allocator();
+
+ /* Create a device-tree node for Linux use */
+ xive_create_mmio_dt_node(one_xive);
+
+	/* Some inits must be done after all XIVEs have been created
+ * such as setting up the forwarding ports
+ */
+ for_each_chip(chip) {
+ if (chip->xive)
+ late_init_one_xive(chip->xive);
+ }
+
+ /* Initialize XICS emulation per-cpu structures */
+ for_each_present_cpu(cpu) {
+ xive_init_cpu(cpu);
+ }
+	/* Add the interrupts property to each CPU node */
+ for_each_present_cpu(cpu) {
+ if (cpu_is_thread0(cpu))
+ xive_init_cpu_properties(cpu);
+ }
+
+	/* Call in the boot CPU */
+ xive_cpu_callin(this_cpu());
+
+ /* Register XICS emulation calls */
+ opal_register(OPAL_INT_GET_XIRR, opal_xive_get_xirr, 2);
+ opal_register(OPAL_INT_SET_CPPR, opal_xive_set_cppr, 1);
+ opal_register(OPAL_INT_EOI, opal_xive_eoi, 1);
+ opal_register(OPAL_INT_SET_MFRR, opal_xive_set_mfrr, 2);
+
+ /* Register XIVE exploitation calls */
+ opal_register(OPAL_XIVE_RESET, opal_xive_reset, 1);
+ opal_register(OPAL_XIVE_GET_IRQ_INFO, opal_xive_get_irq_info, 6);
+ opal_register(OPAL_XIVE_GET_IRQ_CONFIG, opal_xive_get_irq_config, 4);
+ opal_register(OPAL_XIVE_SET_IRQ_CONFIG, opal_xive_set_irq_config, 4);
+ opal_register(OPAL_XIVE_GET_QUEUE_INFO, opal_xive_get_queue_info, 7);
+ opal_register(OPAL_XIVE_SET_QUEUE_INFO, opal_xive_set_queue_info, 5);
+ opal_register(OPAL_XIVE_DONATE_PAGE, opal_xive_donate_page, 2);
+ opal_register(OPAL_XIVE_ALLOCATE_IRQ, opal_xive_allocate_irq, 1);
+ opal_register(OPAL_XIVE_FREE_IRQ, opal_xive_free_irq, 1);
+ opal_register(OPAL_XIVE_ALLOCATE_VP_BLOCK, opal_xive_alloc_vp_block, 1);
+ opal_register(OPAL_XIVE_FREE_VP_BLOCK, opal_xive_free_vp_block, 1);
+ opal_register(OPAL_XIVE_GET_VP_INFO, opal_xive_get_vp_info, 5);
+ opal_register(OPAL_XIVE_SET_VP_INFO, opal_xive_set_vp_info, 3);
+ opal_register(OPAL_XIVE_SYNC, opal_xive_sync, 2);
+ opal_register(OPAL_XIVE_DUMP, opal_xive_dump, 2);
+ opal_register(OPAL_XIVE_GET_QUEUE_STATE, opal_xive_get_queue_state, 4);
+ opal_register(OPAL_XIVE_SET_QUEUE_STATE, opal_xive_set_queue_state, 4);
+ opal_register(OPAL_XIVE_GET_VP_STATE, opal_xive_get_vp_state, 2);
+}
+
diff --git a/roms/skiboot/hw/xive2.c b/roms/skiboot/hw/xive2.c
new file mode 100644
index 000000000..d5814bcbf
--- /dev/null
+++ b/roms/skiboot/hw/xive2.c
@@ -0,0 +1,4666 @@
+// SPDX-License-Identifier: Apache-2.0
+/*
+ * XIVE2: eXternal Interrupt Virtualization Engine. POWER10 interrupt
+ * controller
+ *
+ * Copyright (c) 2016-2019, IBM Corporation.
+ */
+
+#define pr_fmt(fmt) "XIVE: " fmt
+
+#include <skiboot.h>
+#include <xscom.h>
+#include <chip.h>
+#include <io.h>
+#include <xive.h>
+#include <xive2-regs.h>
+#include <xscom-p10-regs.h>
+#include <interrupts.h>
+#include <timebase.h>
+#include <bitmap.h>
+#include <buddy.h>
+#include <phys-map.h>
+#include <p10_stop_api.H>
+
+
+/* Verbose debug */
+#undef XIVE_VERBOSE_DEBUG
+#undef DEBUG
+
+/* Extra debug options used in debug builds */
+#ifdef DEBUG
+#define XIVE_CHECK_LOCKS
+#define XIVE_DEBUG_INIT_CACHE_UPDATES
+#define XIVE_EXTRA_CHECK_INIT_CACHE
+#else
+#undef XIVE_CHECK_LOCKS
+#undef XIVE_DEBUG_INIT_CACHE_UPDATES
+#undef XIVE_EXTRA_CHECK_INIT_CACHE
+#endif
+
+/*
+ * VSDs, blocks, set translation etc...
+ *
+ * For the following data structures, the XIVE uses a mechanism called
+ * Virtualization Structure Tables (VST) to manage the memory layout
+ * and access: ESBs (Event State Buffers), EAS (Event assignment
+ * structures), ENDs (Event Notification Descriptors) and NVT/NVP
+ * (Notification Virtual Targets/Processors).
+ *
+ * These structures divide those tables into 16 "blocks". Each XIVE
+ * instance has a definition for all 16 blocks that can either represent
+ * an actual table in memory or a remote XIVE MMIO port to access a
+ * block that is owned by that remote XIVE.
+ *
+ * Our SW design will consist of allocating one block per chip (and thus
+ * per XIVE instance) for now, thus giving us up to 16 supported chips in
+ * the system. We may have to revisit that if we ever support systems with
+ * more than 16 chips, or if we want to do what pHyp does on some machines
+ * and dedicate 2 blocks per chip for some structures, but neither is on
+ * our radar at the moment.
+ *
+ * Thus we need to be careful that we never expose to Linux the concept
+ * of block and block boundaries, but instead we provide full number ranges
+ * so that consecutive blocks can be supported.
+ *
+ * Similarly, for MMIO access, the BARs support what is called "set
+ * translation" which allows the BAR to be divided into a certain
+ * number of sets. Each "set" can be routed to a specific block and
+ * offset within a block.
+ */
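+
+/*
+ * Illustration of the scheme above (not from the HW spec): on a 4-chip
+ * system, the XIVE on chip 2 owns block 2, i.e. its VSDs for block 2
+ * point at real tables in local memory, while its VSDs for blocks 0, 1
+ * and 3 are forwarding ports to the remote XIVEs owning those blocks.
+ */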
+
+#define XIVE_MAX_BLOCKS 16
+#define XIVE_VSD_SIZE 8
+
+/*
+ * Max number of ESBs. (direct table)
+ *
+ * The max number of ESBs supported in the P10 MMIO space is 1TB/128K: 8M.
+ *
+ * 1M is our current top limit of ESB entries and EAS entries
+ * pre-allocated per chip. That allocates 256KB per chip for the state
+ * bits and 8M per chip for the EAS.
+ */
+
+#define XIVE_INT_ORDER 20 /* 1M interrupts */
+#define XIVE_INT_COUNT (1ul << XIVE_INT_ORDER)
+
+/*
+ * First interrupt number, also the first logical interrupt number
+ * allocated by Linux (maximum ISA interrupt number + 1)
+ */
+#define XIVE_INT_FIRST 0x10
+
+/* Corresponding direct table sizes */
+#define XIVE_ESB_SIZE (XIVE_INT_COUNT / 4)
+#define XIVE_EAT_SIZE (XIVE_INT_COUNT * 8)
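+/* i.e. 2 bits of ESB state per interrupt (256KB for 1M interrupts) and
+ * one 8-byte EAS per interrupt (8MB), matching the per-chip figures
+ * given above.
+ */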
+
+/* Use 64K for everything by default */
+#define XIVE_ESB_SHIFT (16 + 1) /* trigger + mgmt pages */
+#define XIVE_ESB_PAGE_SIZE (1ul << XIVE_ESB_SHIFT) /* 2 pages */
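+/* i.e. each interrupt source gets a 64K trigger page plus a 64K
+ * management page, 128K of ESB MMIO space per source, which is where
+ * the 1TB/128K figure above comes from.
+ */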
+
+/*
+ * Max number of ENDs. (indirect table)
+ *
+ * The max number of ENDs supported in the P10 MMIO space is 2TB/128K: 16M.
+ * Since one END is 32 bytes, a 64K indirect subpage can hold 2K ENDs.
+ * We need 8192 subpages, ie, 64K of memory for the indirect table.
+ */
+#define END_PER_PAGE (PAGE_SIZE / sizeof(struct xive_end))
+
+#define XIVE_END_ORDER 23 /* 8M ENDs */
+#define XIVE_END_COUNT (1ul << XIVE_END_ORDER)
+#define XIVE_END_TABLE_SIZE ((XIVE_END_COUNT / END_PER_PAGE) * XIVE_VSD_SIZE)
+
+#define XIVE_END_SHIFT (16 + 1) /* ESn + ESe pages */
+
+/* One bit in the END bitmap per group of #priorities ENDs */
+#define xive_end_bitmap_size(x) (XIVE_END_COUNT >> xive_cfg_vp_prio_shift(x))
+
+/* Number of priorities (and thus ENDs) we allocate for each VP */
+#define xive_cfg_vp_prio_shift(x) GETFIELD(CQ_XIVE_CFG_VP_INT_PRIO, (x)->config)
+#define xive_cfg_vp_prio(x) (1 << xive_cfg_vp_prio_shift(x))
+
+/* Max priority number */
+#define xive_max_prio(x) (xive_cfg_vp_prio(x) - 1)
+
+/* Priority used for gather/silent escalation (KVM) */
+#define xive_escalation_prio(x) xive_max_prio(x)
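+
+/* For example, assuming the IC is configured for 8 interrupt priorities
+ * per VP (CQ_XIVE_CFG_VP_INT_PRIO field of 3): xive_cfg_vp_prio_shift()
+ * is 3, xive_cfg_vp_prio() is 8, xive_max_prio() is 7 and priority 7 is
+ * reserved as the escalation/gather priority.
+ */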
+
+/*
+ * Max number of VPs. (indirect table)
+ *
+ * The max number of NVPs we support in our MMIO space is 1TB/128K: 8M.
+ * Since one NVP is 32 bytes, a 64K indirect subpage can hold 2K NVPs.
+ * We need 4096 pointers, ie, 32K of memory for the indirect table.
+ *
+ * However, we use 8 priorities (by default) per NVP and the number of
+ * ENDs is configured to 8M. Therefore, our VP space is limited to 1M.
+ */
+#define VP_PER_PAGE (PAGE_SIZE / sizeof(struct xive_nvp))
+
+#define XIVE_VP_ORDER(x) (XIVE_END_ORDER - xive_cfg_vp_prio_shift(x))
+#define XIVE_VP_COUNT(x) (1ul << XIVE_VP_ORDER(x))
+#define XIVE_VP_TABLE_SIZE(x) ((XIVE_VP_COUNT(x) / VP_PER_PAGE) * XIVE_VSD_SIZE)
+
+#define XIVE_NVP_SHIFT 17 /* NVPG BAR: two pages, even NVP, odd NVG */
+
+/* VP Space maximums in Gen1 and Gen2 modes */
+#define VP_SHIFT_GEN1 19 /* in sync with END_W6_VP_OFFSET_GEN1 */
+#define VP_SHIFT_GEN2 24 /* in sync with END_W6_VP_OFFSET */
+
+/*
+ * VP ids for HW threads.
+ *
+ * Depends on the thread id bits configuration of the IC. 8bit is the
+ * default for P10 and 7bit for P9.
+ *
+ * These values are global because they should be common to all chips
+ */
+static uint32_t xive_threadid_shift;
+static uint32_t xive_hw_vp_base;
+static uint32_t xive_hw_vp_count;
+
+/*
+ * The XIVE operation mode indicates the active "API" and corresponds
+ * to the "version/mode" parameter of the opal_xive_reset() call
+ */
+static enum {
+ /* No XICS emulation */
+ XIVE_MODE_EXPL = OPAL_XIVE_MODE_EXPL, /* default */
+ XIVE_MODE_NONE,
+} xive_mode = XIVE_MODE_NONE;
+
+/*
+ * The XIVE exploitation mode options indicate the active features and
+ * are part of the mode parameter of the opal_xive_reset() call
+ */
+static uint64_t xive_expl_options;
+
+#define XIVE_EXPL_ALL_OPTIONS 0
+
+/*
+ * Each source controller has one of these. There's one embedded in
+ * the XIVE struct for IPIs
+ */
+struct xive_src {
+ struct irq_source is;
+ const struct irq_source_ops *orig_ops;
+ struct xive *xive;
+ void *esb_mmio;
+ uint32_t esb_base;
+ uint32_t esb_shift;
+ uint32_t flags;
+};
+
+struct xive_cpu_state {
+ struct xive *xive;
+ void *tm_ring1;
+
+ /* Base HW VP and associated queues */
+ uint32_t vp_blk;
+ uint32_t vp_idx;
+ uint32_t end_blk;
+ uint32_t end_idx; /* Base end index of a block of 8 */
+
+ struct lock lock;
+};
+
+enum xive_generation {
+ XIVE_GEN1 = 1, /* P9 compat mode */
+ XIVE_GEN2 = 2, /* P10 default */
+};
+
+enum xive_quirks {
+ /* HW527671 - 8bits Hardwired Thread Id range not implemented */
+ XIVE_QUIRK_THREADID_7BITS = 0x00000001,
+ /* HW542974 - interrupt command priority checker not working properly */
+ XIVE_QUIRK_BROKEN_PRIO_CHECK = 0x00000002,
+};
+
+struct xive {
+ uint32_t chip_id;
+ uint32_t block_id;
+ struct dt_node *x_node;
+
+ enum xive_generation generation;
+ uint64_t capabilities;
+ uint64_t config;
+
+ uint64_t xscom_base;
+
+ /* MMIO regions */
+ void *ic_base;
+ uint64_t ic_size;
+ uint32_t ic_shift;
+ void *ic_tm_direct_base;
+
+ void *tm_base;
+ uint64_t tm_size;
+ uint32_t tm_shift;
+ void *nvp_base;
+ uint64_t nvp_size;
+ void *esb_base;
+ uint64_t esb_size;
+ void *end_base;
+ uint64_t end_size;
+
+ /* Set on XSCOM register access error */
+ bool last_reg_error;
+
+ /* Per-XIVE mutex */
+ struct lock lock;
+
+ /* Pre-allocated tables.
+ *
+	 * We set up all the VSDs for actual tables (ie, as opposed to
+ * forwarding ports) as either direct pre-allocated or indirect
+ * and partially populated.
+ *
+ * Currently, the ESB and the EAS tables are direct and fully
+ * pre-allocated based on XIVE_INT_COUNT.
+ *
+ * The other tables are indirect, we thus pre-allocate the indirect
+ * table (ie, pages of pointers) and populate enough of the pages
+ * for our basic setup using 64K subpages.
+ *
+ * The size of the indirect tables are driven by XIVE_VP_COUNT
+ * and XIVE_END_COUNT. The number of pre-allocated ones are
+ * driven by xive_hw_vp_count for the HW threads. The number
+ * of END depends on number of VP.
+ */
+
+ /* Direct SBE and EAT tables */
+ void *sbe_base;
+ void *eat_base;
+
+ /* Indirect END table. NULL entries are unallocated, count is
+	 * the number of pointers (ie, sub page placeholders).
+ */
+ beint64_t *end_ind_base;
+ uint32_t end_ind_count;
+ uint64_t end_ind_size;
+
+ /* END allocation bitmap. Each bit represent #priority ENDs */
+ bitmap_t *end_map;
+
+ /* Indirect NVT/VP table. NULL entries are unallocated, count is
+	 * the number of pointers (ie, sub page placeholders).
+ */
+ beint64_t *vp_ind_base;
+ uint32_t vp_ind_count;
+ uint64_t vp_ind_size;
+
+ /* VP space size. Depends on Gen1/2 mode */
+ uint32_t vp_shift;
+
+ /* Pool of donated pages for provisioning indirect END and VP pages */
+ struct list_head donated_pages;
+
+ /* To ease a possible change to supporting more than one block of
+ * interrupts per chip, we store here the "base" global number
+ * and max number of interrupts for this chip. The global number
+	 * encompasses the block number and index.
+ */
+ uint32_t int_base;
+ uint32_t int_count;
+
+ /* Due to the overlap between IPIs and HW sources in the EAS table,
+ * we keep some kind of top-down allocator. It is used for HW sources
+ * to "allocate" interrupt entries and will limit what can be handed
+ * out as IPIs. Of course this assumes we "allocate" all HW sources
+ * before we start handing out IPIs.
+ *
+ * Note: The numbers here are global interrupt numbers so that we can
+ * potentially handle more than one block per chip in the future.
+ */
+ uint32_t int_hw_bot; /* Bottom of HW allocation */
+ uint32_t int_ipi_top; /* Highest IPI handed out so far + 1 */
+
+ /* The IPI allocation bitmap */
+ bitmap_t *ipi_alloc_map;
+
+ /* We keep track of which interrupts were ever enabled to
+ * speed up xive_reset
+ */
+ bitmap_t *int_enabled_map;
+
+ /* Embedded source IPIs */
+ struct xive_src ipis;
+
+ /* Embedded escalation interrupts */
+ struct xive_src esc_irqs;
+
+ /* In memory queue overflow */
+ void *q_ovf;
+
+ /* Cache/sync injection */
+ uint64_t sync_inject_size;
+ void *sync_inject;
+
+ /* INT HW Errata */
+ uint64_t quirks;
+};
+
+/* First XIVE unit configured on the system */
+static struct xive *one_xive;
+
+/* Global DT node */
+static struct dt_node *xive_dt_node;
+
+/* Block <-> Chip conversions.
+ *
+ * As chipIDs may not be within the range of 16 block IDs supported by XIVE,
+ * we have a 2 way conversion scheme.
+ *
+ * From block to chip, use the global table below.
+ *
+ * From chip to block, a field in struct proc_chip contains the first block
+ * of that chip. For now we only support one block per chip but that might
+ * change in the future
+ */
+#define XIVE_INVALID_CHIP 0xffffffff
+#define XIVE_MAX_CHIPS 16
+static uint32_t xive_block_to_chip[XIVE_MAX_CHIPS];
+static uint32_t xive_block_count;
+
+static uint32_t xive_chip_to_block(uint32_t chip_id)
+{
+ struct proc_chip *c = get_chip(chip_id);
+
+ assert(c);
+ assert(c->xive);
+ return c->xive->block_id;
+}
+
+/*
+ * Conversion between GIRQ and block/index.
+ *
+ * ------------------------------------
+ * |000E|BLOC| INDEX|
+ * ------------------------------------
+ * 4 4 24
+ *
+ * the E bit indicates that this is an escalation interrupt; in
+ * that case, the BLOC/INDEX represents the END containing the
+ * corresponding escalation descriptor.
+ *
+ * Global interrupt numbers for non-escalation interrupts are thus
+ * limited to 28 bits.
+ */
+
+#define INT_SHIFT 24
+#define INT_ESC_SHIFT (INT_SHIFT + 4) /* 4bits block id */
+
+#if XIVE_INT_ORDER > INT_SHIFT
+#error "Too many ESBs for IRQ encoding"
+#endif
+
+#if XIVE_END_ORDER > INT_SHIFT
+#error "Too many ENDs for escalation IRQ number encoding"
+#endif
+
+#define GIRQ_TO_BLK(__g) (((__g) >> INT_SHIFT) & 0xf)
+#define GIRQ_TO_IDX(__g) ((__g) & ((1 << INT_SHIFT) - 1))
+#define BLKIDX_TO_GIRQ(__b,__i) (((uint32_t)(__b)) << INT_SHIFT | (__i))
+
+#define GIRQ_IS_ESCALATION(__g) ((__g) & (1 << INT_ESC_SHIFT))
+#define MAKE_ESCALATION_GIRQ(__b,__i)(BLKIDX_TO_GIRQ(__b,__i) | (1 << INT_ESC_SHIFT))
+
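+/* Worked example of the encoding above: BLKIDX_TO_GIRQ(2, 0x123) gives
+ * GIRQ 0x02000123; GIRQ_TO_BLK() recovers 2 and GIRQ_TO_IDX() recovers
+ * 0x123; MAKE_ESCALATION_GIRQ(2, 0x123) sets the E bit and yields
+ * 0x12000123.
+ */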
+
+/* Block/IRQ to chip# conversions */
+#define PC_BLK_TO_CHIP(__b) (xive_block_to_chip[__b])
+#define VC_BLK_TO_CHIP(__b) (xive_block_to_chip[__b])
+#define GIRQ_TO_CHIP(__isn) (VC_BLK_TO_CHIP(GIRQ_TO_BLK(__isn)))
+
+/* Routing of physical processors to VPs */
+#define PIR2VP_IDX( __pir) (xive_hw_vp_base | P10_PIR2LOCALCPU(__pir))
+#define PIR2VP_BLK(__pir) (xive_chip_to_block(P10_PIR2GCID(__pir)))
+#define VP2PIR(__blk, __idx) (P10_PIRFROMLOCALCPU(VC_BLK_TO_CHIP(__blk), (__idx) & 0xff))
+
+/* Decoding of OPAL API VP IDs. The VP IDs are encoded as follows
+ *
+ * Block group mode:
+ *
+ * -----------------------------------
+ * |GVEOOOOO| INDEX|
+ * -----------------------------------
+ * || |
+ * || Order
+ * |Virtual
+ * Group
+ *
+ * G (Group) : Set to 1 for a group VP (not currently supported)
+ * V (Virtual) : Set to 1 for an allocated VP (vs. a physical processor ID)
+ * E (Error) : Should never be 1, used internally for errors
+ * O (Order) : Allocation order of the VP block
+ *
+ * The conversion is thus done as follows (groups aren't implemented yet)
+ *
+ * If V=0, O must be 0 and 24-bit INDEX value is the PIR
+ * If V=1, the order O group is allocated such that if N is the number of
+ * chip bits considered for allocation (*)
+ * then the INDEX is constructed as follows (bit numbers such as 0=LSB)
+ * - bottom O-N bits is the index within the "VP block"
+ * - next N bits is the XIVE blockID of the VP
+ * - the remaining bits is the per-chip "base"
+ * so the conversion consists of "extracting" the block ID and moving
+ * down the upper bits by N bits.
+ *
+ * In non-block-group mode, the difference is that the blockID is
+ * on the left of the index (the entire VP block is in a single
+ * block ID)
+ */
+
+#define VP_GROUP_SHIFT 31
+#define VP_VIRTUAL_SHIFT 30
+#define VP_ERROR_SHIFT 29
+#define VP_ORDER_SHIFT 24
+
+#define vp_group(vp) (((vp) >> VP_GROUP_SHIFT) & 1)
+#define vp_virtual(vp) (((vp) >> VP_VIRTUAL_SHIFT) & 1)
+#define vp_order(vp) (((vp) >> VP_ORDER_SHIFT) & 0x1f)
+#define vp_index(vp) ((vp) & ((1 << VP_ORDER_SHIFT) - 1))
+
+/* VP allocation */
+static uint32_t xive_chips_alloc_bits = 0;
+static struct buddy *xive_vp_buddy;
+static struct lock xive_buddy_lock = LOCK_UNLOCKED;
+
+/* VP# decoding/encoding */
+static bool xive_decode_vp(uint32_t vp, uint32_t *blk, uint32_t *idx,
+ uint8_t *order, bool *group)
+{
+ uint32_t o = vp_order(vp);
+ uint32_t n = xive_chips_alloc_bits;
+ uint32_t index = vp_index(vp);
+ uint32_t imask = (1 << (o - n)) - 1;
+
+ /* Groups not supported yet */
+ if (vp_group(vp))
+ return false;
+ if (group)
+ *group = false;
+
+ /* PIR case */
+ if (!vp_virtual(vp)) {
+ if (find_cpu_by_pir(index) == NULL)
+ return false;
+ if (blk)
+ *blk = PIR2VP_BLK(index);
+ if (idx)
+ *idx = PIR2VP_IDX(index);
+ return true;
+ }
+
+ /* Ensure o > n, we have *at least* 2 VPs per block */
+ if (o <= n)
+ return false;
+
+ /* Combine the index base and index */
+ if (idx)
+ *idx = ((index >> n) & ~imask) | (index & imask);
+ /* Extract block ID */
+ if (blk)
+ *blk = (index >> (o - n)) & ((1 << n) - 1);
+
+ /* Return order as well if asked for */
+ if (order)
+ *order = o;
+
+ return true;
+}
+
+static uint32_t xive_encode_vp(uint32_t blk, uint32_t idx, uint32_t order)
+{
+ uint32_t vp = (1 << VP_VIRTUAL_SHIFT) | (order << VP_ORDER_SHIFT);
+ uint32_t n = xive_chips_alloc_bits;
+ uint32_t imask = (1 << (order - n)) - 1;
+
+ vp |= (idx & ~imask) << n;
+ vp |= blk << (order - n);
+ vp |= idx & imask;
+ return vp;
+}
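+
+/*
+ * Worked example of the VP id encoding (assuming a hypothetical 2-chip
+ * system, i.e. xive_chips_alloc_bits == 1): xive_encode_vp(1, 0x12, 5)
+ * packs the within-block index (0x2), the block id (1) and the per-chip
+ * base (0x1) into index 0x32, giving VP id 0x45000032; xive_decode_vp()
+ * on that value recovers blk=1, idx=0x12 and order=5.
+ */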
+
+/*
+ * XSCOM/MMIO helpers
+ */
+#define XIVE_NO_MMIO -1
+
+#define xive_regw(__x, __r, __v) \
+ __xive_regw(__x, __r, X_##__r, __v, #__r)
+#define xive_regr(__x, __r) \
+ __xive_regr(__x, __r, X_##__r, #__r)
+#define xive_regwx(__x, __r, __v) \
+ __xive_regw(__x, XIVE_NO_MMIO, X_##__r, __v, #__r)
+#define xive_regrx(__x, __r) \
+ __xive_regr(__x, XIVE_NO_MMIO, X_##__r, #__r)
+
+#ifdef XIVE_VERBOSE_DEBUG
+#define xive_vdbg(__x,__fmt,...) prlog(PR_DEBUG,"[ IC %02x ] " __fmt, (__x)->chip_id, ##__VA_ARGS__)
+#define xive_cpu_vdbg(__c,__fmt,...) prlog(PR_DEBUG,"[CPU %04x] " __fmt, (__c)->pir, ##__VA_ARGS__)
+#else
+#define xive_vdbg(x,fmt,...) do { } while(0)
+#define xive_cpu_vdbg(x,fmt,...) do { } while(0)
+#endif
+
+#define xive_dbg(__x,__fmt,...) prlog(PR_DEBUG,"[ IC %02x ] " __fmt, (__x)->chip_id, ##__VA_ARGS__)
+#define xive_cpu_dbg(__c,__fmt,...) prlog(PR_DEBUG,"[CPU %04x] " __fmt, (__c)->pir, ##__VA_ARGS__)
+#define xive_notice(__x,__fmt,...) prlog(PR_NOTICE,"[ IC %02x ] " __fmt, (__x)->chip_id, ##__VA_ARGS__)
+#define xive_cpu_notice(__c,__fmt,...) prlog(PR_NOTICE,"[CPU %04x] " __fmt, (__c)->pir, ##__VA_ARGS__)
+#define xive_warn(__x,__fmt,...) prlog(PR_WARNING,"[ IC %02x ] " __fmt, (__x)->chip_id, ##__VA_ARGS__)
+#define xive_cpu_warn(__c,__fmt,...) prlog(PR_WARNING,"[CPU %04x] " __fmt, (__c)->pir, ##__VA_ARGS__)
+#define xive_err(__x,__fmt,...) prlog(PR_ERR,"[ IC %02x ] " __fmt, (__x)->chip_id, ##__VA_ARGS__)
+#define xive_cpu_err(__c,__fmt,...) prlog(PR_ERR,"[CPU %04x] " __fmt, (__c)->pir, ##__VA_ARGS__)
+
+/*
+ * The XIVE subengine being accessed can be deduced from the XSCOM
+ * reg, and from there, the page offset in the IC BAR.
+ */
+static void* xive_ic_page(struct xive *x, uint32_t x_reg)
+{
+ uint64_t pgoff = (x_reg >> 8) & 0x3;
+
+ return x->ic_base + (pgoff << x->ic_shift);
+}
+
+static void __xive_regw(struct xive *x, uint32_t m_reg, uint32_t x_reg, uint64_t v,
+ const char *rname)
+{
+ bool use_xscom = (m_reg == XIVE_NO_MMIO) || !x->ic_base;
+ int64_t rc;
+
+ x->last_reg_error = false;
+
+ assert(x_reg != 0);
+
+ if (use_xscom) {
+ rc = xscom_write(x->chip_id, x->xscom_base + x_reg, v);
+ if (rc) {
+ if (!rname)
+ rname = "???";
+ xive_err(x, "Error writing register %s\n", rname);
+ /* Anything else we can do here ? */
+ x->last_reg_error = true;
+ }
+ } else {
+ out_be64(xive_ic_page(x, x_reg) + m_reg, v);
+ }
+}
+
+static uint64_t __xive_regr(struct xive *x, uint32_t m_reg, uint32_t x_reg,
+ const char *rname)
+{
+ bool use_xscom = (m_reg == XIVE_NO_MMIO) || !x->ic_base;
+ int64_t rc;
+ uint64_t val;
+
+ x->last_reg_error = false;
+
+ assert(x_reg != 0);
+
+ if (use_xscom) {
+ rc = xscom_read(x->chip_id, x->xscom_base + x_reg, &val);
+ if (rc) {
+ if (!rname)
+ rname = "???";
+ xive_err(x, "Error reading register %s\n", rname);
+ /* Anything else we can do here ? */
+ x->last_reg_error = true;
+ return -1ull;
+ }
+ } else {
+ val = in_be64(xive_ic_page(x, x_reg) + m_reg);
+ }
+ return val;
+}
+
+/* Locate a controller from an IRQ number */
+static struct xive *xive_from_isn(uint32_t isn)
+{
+ uint32_t chip_id = GIRQ_TO_CHIP(isn);
+ struct proc_chip *c = get_chip(chip_id);
+
+ if (!c)
+ return NULL;
+ return c->xive;
+}
+
+static struct xive *xive_from_pc_blk(uint32_t blk)
+{
+ uint32_t chip_id = PC_BLK_TO_CHIP(blk);
+ struct proc_chip *c = get_chip(chip_id);
+
+ if (!c)
+ return NULL;
+ return c->xive;
+}
+
+static struct xive *xive_from_vc_blk(uint32_t blk)
+{
+ uint32_t chip_id = VC_BLK_TO_CHIP(blk);
+ struct proc_chip *c = get_chip(chip_id);
+
+ if (!c)
+ return NULL;
+ return c->xive;
+}
+
+static struct xive_end *xive_get_end(struct xive *x, unsigned int idx)
+{
+ struct xive_end *p;
+
+ if (idx >= (x->end_ind_count * END_PER_PAGE))
+ return NULL;
+ p = (struct xive_end *)(be64_to_cpu(x->end_ind_base[idx / END_PER_PAGE]) &
+ VSD_ADDRESS_MASK);
+ if (!p)
+ return NULL;
+
+ return &p[idx % END_PER_PAGE];
+}
+
+static struct xive_eas *xive_get_eas(struct xive *x, unsigned int isn)
+{
+ struct xive_eas *eat;
+ uint32_t idx = GIRQ_TO_IDX(isn);
+
+ if (GIRQ_IS_ESCALATION(isn)) {
+		/* Alright, an escalation EAS is buried inside an END, let's
+ * try to find it
+ */
+ struct xive_end *end;
+
+ if (x->chip_id != VC_BLK_TO_CHIP(GIRQ_TO_BLK(isn))) {
+ xive_err(x, "%s, ESC ISN 0x%x not on right chip\n",
+ __func__, isn);
+ return NULL;
+ }
+ end = xive_get_end(x, idx);
+ if (!end) {
+ xive_err(x, "%s, ESC ISN 0x%x END not found\n",
+ __func__, isn);
+ return NULL;
+ }
+
+ /* If using single-escalation, don't let anybody get
+ * to the individual escalation interrupts
+ */
+ if (xive_get_field32(END_W0_UNCOND_ESCALATE, end->w0))
+ return NULL;
+
+ /* Grab the escalation END */
+ return (struct xive_eas *)(char *)&end->w4;
+ } else {
+ /* Check the block matches */
+ if (isn < x->int_base || isn >= x->int_count) {
+ xive_err(x, "%s, ISN 0x%x not on right chip\n",
+ __func__, isn);
+ return NULL;
+ }
+ assert (idx < XIVE_INT_COUNT);
+
+ /* If we support >1 block per chip, this should still
+ * work as we are likely to make the table contiguous
+ * anyway
+ */
+ eat = x->eat_base;
+ assert(eat);
+
+ return eat + idx;
+ }
+}
+
+static struct xive_nvp *xive_get_vp(struct xive *x, unsigned int idx)
+{
+ struct xive_nvp *p;
+
+ assert(idx < (x->vp_ind_count * VP_PER_PAGE));
+ p = (struct xive_nvp *)(be64_to_cpu(x->vp_ind_base[idx / VP_PER_PAGE]) &
+ VSD_ADDRESS_MASK);
+ if (!p)
+ return NULL;
+
+ return &p[idx % VP_PER_PAGE];
+}
+
+/*
+ * Store the END base of the VP in W5, using the new architected field
+ * in P10. Used to be the pressure relief interrupt field on P9.
+ */
+static void xive_vp_set_end_base(struct xive_nvp *vp,
+ uint32_t end_blk, uint32_t end_idx)
+{
+ vp->w5 = xive_set_field32(NVP_W5_VP_END_BLOCK, 0, end_blk) |
+ xive_set_field32(NVP_W5_VP_END_INDEX, 0, end_idx);
+
+	/* This is the criterion used to know whether a VP was allocated */
+ assert(vp->w5 != 0);
+}
+
+static void xive_init_default_vp(struct xive_nvp *vp,
+ uint32_t end_blk, uint32_t end_idx)
+{
+ memset(vp, 0, sizeof(struct xive_nvp));
+
+ xive_vp_set_end_base(vp, end_blk, end_idx);
+
+ vp->w0 = xive_set_field32(NVP_W0_VALID, 0, 1);
+}
+
+/*
+ * VPs of the HW threads have their own set of ENDs which is allocated
+ * when XIVE is initialized. These are tagged with a FIRMWARE bit so
+ * that they can be identified when the driver is reset (kexec).
+ */
+static void xive_init_hw_end(struct xive_end *end)
+{
+ memset(end, 0, sizeof(struct xive_end));
+ end->w0 = xive_set_field32(END_W0_FIRMWARE1, 0, 1);
+}
+
+static void *xive_get_donated_page(struct xive *x)
+{
+ return (void *)list_pop_(&x->donated_pages, 0);
+}
+
+#define XIVE_ALLOC_IS_ERR(_idx) ((_idx) >= 0xfffffff0)
+
+#define XIVE_ALLOC_NO_SPACE 0xffffffff /* No possible space */
+#define XIVE_ALLOC_NO_IND 0xfffffffe /* Indirect need provisioning */
+#define XIVE_ALLOC_NO_MEM 0xfffffffd /* Local allocation failed */
+
+static uint32_t xive_alloc_end_set(struct xive *x, bool alloc_indirect)
+{
+ uint32_t ind_idx;
+ int idx;
+ int end_base_idx;
+
+ xive_vdbg(x, "Allocating END set...\n");
+
+ assert(x->end_map);
+
+	/* Allocate from the END bitmap. Each bit covers #priorities ENDs */
+ idx = bitmap_find_zero_bit(*x->end_map, 0, xive_end_bitmap_size(x));
+ if (idx < 0) {
+ xive_dbg(x, "Allocation from END bitmap failed !\n");
+ return XIVE_ALLOC_NO_SPACE;
+ }
+
+ end_base_idx = idx << xive_cfg_vp_prio_shift(x);
+
+ xive_vdbg(x, "Got ENDs 0x%x..0x%x\n", end_base_idx,
+ end_base_idx + xive_max_prio(x));
+
+ /* Calculate the indirect page where the ENDs reside */
+ ind_idx = end_base_idx / END_PER_PAGE;
+
+ /* Is there an indirect page ? If not, check if we can provision it */
+ if (!x->end_ind_base[ind_idx]) {
+ /* Default flags */
+ uint64_t vsd_flags = SETFIELD(VSD_TSIZE, 0ull, 4) |
+ SETFIELD(VSD_MODE, 0ull, VSD_MODE_EXCLUSIVE);
+ void *page;
+
+		/* If alloc_indirect is set, allocate the memory from OPAL's
+		 * own pool, otherwise try to provision from the donated pool
+ */
+ if (alloc_indirect) {
+ /* Allocate/provision indirect page during boot only */
+ xive_vdbg(x, "Indirect empty, provisioning from local pool\n");
+ page = local_alloc(x->chip_id, PAGE_SIZE, PAGE_SIZE);
+ if (!page) {
+ xive_dbg(x, "provisioning failed !\n");
+ return XIVE_ALLOC_NO_MEM;
+ }
+ vsd_flags |= VSD_FIRMWARE;
+ } else {
+ xive_vdbg(x, "Indirect empty, provisioning from donated pages\n");
+ page = xive_get_donated_page(x);
+ if (!page) {
+				xive_vdbg(x, "no indirect pages available!\n");
+ return XIVE_ALLOC_NO_IND;
+ }
+ }
+ memset(page, 0, PAGE_SIZE);
+ x->end_ind_base[ind_idx] = cpu_to_be64(vsd_flags |
+ (((uint64_t)page) & VSD_ADDRESS_MASK));
+ /* Any cache scrub needed ? */
+ }
+
+ bitmap_set_bit(*x->end_map, idx);
+ return end_base_idx;
+}
+
+static void xive_free_end_set(struct xive *x, uint32_t ends)
+{
+ uint32_t idx;
+ uint8_t prio_mask = xive_max_prio(x);
+
+ xive_vdbg(x, "Freeing END 0x%x..0x%x\n", ends, ends + xive_max_prio(x));
+
+ assert((ends & prio_mask) == 0);
+ assert(x->end_map);
+
+ idx = ends >> xive_cfg_vp_prio_shift(x);
+ bitmap_clr_bit(*x->end_map, idx);
+}
+
+static bool xive_provision_vp_ind(struct xive *x, uint32_t vp_idx, uint32_t order)
+{
+ uint32_t pbase, pend, i;
+
+ pbase = vp_idx / VP_PER_PAGE;
+ pend = (vp_idx + (1 << order)) / VP_PER_PAGE;
+
+ for (i = pbase; i <= pend; i++) {
+ void *page;
+ u64 vsd;
+
+ /* Already provisioned ? */
+ if (x->vp_ind_base[i])
+ continue;
+
+ /* Try to grab a donated page */
+ page = xive_get_donated_page(x);
+ if (!page)
+ return false;
+
+ /* Install the page */
+ memset(page, 0, PAGE_SIZE);
+ vsd = ((uint64_t)page) & VSD_ADDRESS_MASK;
+ vsd |= SETFIELD(VSD_TSIZE, 0ull, 4);
+ vsd |= SETFIELD(VSD_MODE, 0ull, VSD_MODE_EXCLUSIVE);
+ x->vp_ind_base[i] = cpu_to_be64(vsd);
+ }
+ return true;
+}
+
+static void xive_init_vp_allocator(void)
+{
+ /* Initialize chip alloc bits */
+ xive_chips_alloc_bits = ilog2(xive_block_count);
+
+ prlog(PR_INFO, "%d chips considered for VP allocations\n",
+ 1 << xive_chips_alloc_bits);
+
+ /* Allocate a buddy big enough for XIVE_VP_ORDER allocations.
+ *
+	 * Each bit in the buddy represents 1 << xive_chips_alloc_bits
+ * VPs.
+ */
+ xive_vp_buddy = buddy_create(XIVE_VP_ORDER(one_xive));
+ assert(xive_vp_buddy);
+
+ /*
+ * We reserve the whole range of VP ids representing HW threads.
+ */
+ assert(buddy_reserve(xive_vp_buddy, xive_hw_vp_base,
+ xive_threadid_shift));
+}
+
+static uint32_t xive_alloc_vps(uint32_t order)
+{
+ uint32_t local_order, i;
+ int vp;
+
+ /* The minimum order is 2 VPs per chip */
+ if (order < (xive_chips_alloc_bits + 1))
+ order = xive_chips_alloc_bits + 1;
+
+ /* We split the allocation */
+ local_order = order - xive_chips_alloc_bits;
+
+ /* We grab that in the global buddy */
+ assert(xive_vp_buddy);
+ lock(&xive_buddy_lock);
+ vp = buddy_alloc(xive_vp_buddy, local_order);
+ unlock(&xive_buddy_lock);
+ if (vp < 0)
+ return XIVE_ALLOC_NO_SPACE;
+
+ /* Provision on every chip considered for allocation */
+ for (i = 0; i < (1 << xive_chips_alloc_bits); i++) {
+ struct xive *x = xive_from_pc_blk(i);
+ bool success;
+
+ /* Return internal error & log rather than assert ? */
+ assert(x);
+ lock(&x->lock);
+ success = xive_provision_vp_ind(x, vp, local_order);
+ unlock(&x->lock);
+ if (!success) {
+ lock(&xive_buddy_lock);
+ buddy_free(xive_vp_buddy, vp, local_order);
+ unlock(&xive_buddy_lock);
+ return XIVE_ALLOC_NO_IND;
+ }
+ }
+
+ /* Encode the VP number. "blk" is 0 as this represents
+ * all blocks and the allocation always starts at 0
+ */
+ return xive_encode_vp(0, vp, order);
+}
+
+static void xive_free_vps(uint32_t vp)
+{
+ uint32_t idx;
+ uint8_t order, local_order;
+
+ assert(xive_decode_vp(vp, NULL, &idx, &order, NULL));
+
+ /* We split the allocation */
+ local_order = order - xive_chips_alloc_bits;
+
+ /* Free that in the buddy */
+ lock(&xive_buddy_lock);
+ buddy_free(xive_vp_buddy, idx, local_order);
+ unlock(&xive_buddy_lock);
+}
+
+enum xive_cache_type {
+ xive_cache_easc,
+ xive_cache_esbc,
+ xive_cache_endc,
+ xive_cache_nxc,
+};
+
+/*
+ * Cache update
+ */
+
+#define FLUSH_CTRL_POLL_VALID PPC_BIT(0) /* POLL bit is the same for all */
+
+static int64_t __xive_cache_scrub(struct xive *x,
+ enum xive_cache_type ctype,
+ uint64_t block, uint64_t idx,
+ bool want_inval __unused, bool want_disable __unused)
+{
+ uint64_t ctrl_reg, x_ctrl_reg;
+ uint64_t poll_val, ctrl_val;
+
+#ifdef XIVE_CHECK_LOCKS
+ assert(lock_held_by_me(&x->lock));
+#endif
+ switch (ctype) {
+ case xive_cache_easc:
+ poll_val =
+ SETFIELD(VC_EASC_FLUSH_POLL_BLOCK_ID, 0ll, block) |
+ SETFIELD(VC_EASC_FLUSH_POLL_OFFSET, 0ll, idx) |
+ VC_EASC_FLUSH_POLL_BLOCK_ID_MASK |
+ VC_EASC_FLUSH_POLL_OFFSET_MASK;
+ xive_regw(x, VC_EASC_FLUSH_POLL, poll_val);
+ ctrl_reg = VC_EASC_FLUSH_CTRL;
+ x_ctrl_reg = X_VC_EASC_FLUSH_CTRL;
+ break;
+ case xive_cache_esbc:
+ poll_val =
+ SETFIELD(VC_ESBC_FLUSH_POLL_BLOCK_ID, 0ll, block) |
+ SETFIELD(VC_ESBC_FLUSH_POLL_OFFSET, 0ll, idx) |
+ VC_ESBC_FLUSH_POLL_BLOCK_ID_MASK |
+ VC_ESBC_FLUSH_POLL_OFFSET_MASK;
+ xive_regw(x, VC_ESBC_FLUSH_POLL, poll_val);
+ ctrl_reg = VC_ESBC_FLUSH_CTRL;
+ x_ctrl_reg = X_VC_ESBC_FLUSH_CTRL;
+ break;
+ case xive_cache_endc:
+ poll_val =
+ SETFIELD(VC_ENDC_FLUSH_POLL_BLOCK_ID, 0ll, block) |
+ SETFIELD(VC_ENDC_FLUSH_POLL_OFFSET, 0ll, idx) |
+ VC_ENDC_FLUSH_POLL_BLOCK_ID_MASK |
+ VC_ENDC_FLUSH_POLL_OFFSET_MASK;
+ xive_regw(x, VC_ENDC_FLUSH_POLL, poll_val);
+ ctrl_reg = VC_ENDC_FLUSH_CTRL;
+ x_ctrl_reg = X_VC_ENDC_FLUSH_CTRL;
+ break;
+ case xive_cache_nxc:
+ poll_val =
+ SETFIELD(PC_NXC_FLUSH_POLL_BLOCK_ID, 0ll, block) |
+ SETFIELD(PC_NXC_FLUSH_POLL_OFFSET, 0ll, idx) |
+ PC_NXC_FLUSH_POLL_BLOCK_ID_MASK |
+ PC_NXC_FLUSH_POLL_OFFSET_MASK;
+ xive_regw(x, PC_NXC_FLUSH_POLL, poll_val);
+ ctrl_reg = PC_NXC_FLUSH_CTRL;
+ x_ctrl_reg = X_PC_NXC_FLUSH_CTRL;
+ break;
+ default:
+ return OPAL_INTERNAL_ERROR;
+ }
+
+ /* XXX Add timeout !!! */
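+ /*
+ * A bounded variant (sketch only, not wired in) could mirror the
+ * retry counter used by the cache watch below, with an arbitrary
+ * maximum:
+ *
+ * int retries = 0;
+ * while (__xive_regr(x, ctrl_reg, x_ctrl_reg, NULL) &
+ * FLUSH_CTRL_POLL_VALID) {
+ * if (++retries == 1000)
+ * return OPAL_BUSY;
+ * time_wait(100);
+ * }
+ */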
+ for (;;) {
+ ctrl_val = __xive_regr(x, ctrl_reg, x_ctrl_reg, NULL);
+ if (!(ctrl_val & FLUSH_CTRL_POLL_VALID))
+ break;
+ /* Small delay */
+ time_wait(100);
+ }
+ sync();
+ return 0;
+}
+
+static int64_t xive_easc_scrub(struct xive *x, uint64_t block, uint64_t idx)
+{
+ return __xive_cache_scrub(x, xive_cache_easc, block, idx, false, false);
+}
+
+static int64_t xive_nxc_scrub(struct xive *x, uint64_t block, uint64_t idx)
+{
+ return __xive_cache_scrub(x, xive_cache_nxc, block, idx, false, false);
+}
+
+static int64_t xive_nxc_scrub_clean(struct xive *x, uint64_t block, uint64_t idx)
+{
+ return __xive_cache_scrub(x, xive_cache_nxc, block, idx, true, false);
+}
+
+static int64_t xive_endc_scrub(struct xive *x, uint64_t block, uint64_t idx)
+{
+ return __xive_cache_scrub(x, xive_cache_endc, block, idx, false, false);
+}
+
+#define XIVE_CACHE_WATCH_MAX_RETRIES 10
+
+static int64_t __xive_cache_watch(struct xive *x, enum xive_cache_type ctype,
+ uint64_t block, uint64_t idx,
+ uint32_t start_dword, uint32_t dword_count,
+ beint64_t *new_data, bool light_watch,
+ bool synchronous)
+{
+ uint64_t sreg, sregx, dreg0, dreg0x;
+ uint64_t dval0, sval, status;
+ int64_t i;
+ int retries = 0;
+
+#ifdef XIVE_CHECK_LOCKS
+ assert(lock_held_by_me(&x->lock));
+#endif
+ switch (ctype) {
+ case xive_cache_endc:
+ sreg = VC_ENDC_WATCH0_SPEC;
+ sregx = X_VC_ENDC_WATCH0_SPEC;
+ dreg0 = VC_ENDC_WATCH0_DATA0;
+ dreg0x = X_VC_ENDC_WATCH0_DATA0;
+ sval = SETFIELD(VC_ENDC_WATCH_BLOCK_ID, idx, block);
+ break;
+ case xive_cache_nxc:
+ sreg = PC_NXC_WATCH0_SPEC;
+ sregx = X_PC_NXC_WATCH0_SPEC;
+ dreg0 = PC_NXC_WATCH0_DATA0;
+ dreg0x = X_PC_NXC_WATCH0_DATA0;
+ sval = SETFIELD(PC_NXC_WATCH_BLOCK_ID, idx, block);
+ break;
+ default:
+ return OPAL_INTERNAL_ERROR;
+ }
+
+ /* The full bit is in the same position for ENDC and NXC */
+ if (!light_watch)
+ sval |= VC_ENDC_WATCH_FULL;
+
+ for (;;) {
+ /* Write the cache watch spec */
+ __xive_regw(x, sreg, sregx, sval, NULL);
+
+ /* Load data0 register to populate the watch */
+ dval0 = __xive_regr(x, dreg0, dreg0x, NULL);
+
+ /* If new_data is NULL, this is a dummy watch used as a
+ * workaround for a HW bug
+ */
+ if (!new_data) {
+ __xive_regw(x, dreg0, dreg0x, dval0, NULL);
+ return 0;
+ }
+
+ /* Write the words into the watch facility. We write in reverse
+ * order in case word 0 is part of it as it must be the last
+ * one written.
+ */
+ for (i = start_dword + dword_count - 1; i >= start_dword ;i--) {
+ uint64_t dw = be64_to_cpu(new_data[i - start_dword]);
+ __xive_regw(x, dreg0 + i * 8, dreg0x + i, dw, NULL);
+ }
+
+ /* Write data0 register to trigger the update if word 0 wasn't
+ * written above
+ */
+ if (start_dword > 0)
+ __xive_regw(x, dreg0, dreg0x, dval0, NULL);
+
+ /* This may not be necessary for light updates (it's possible
+ * that a sync is sufficient, TBD). Ensure the above is
+ * complete and check the status of the watch.
+ */
+ status = __xive_regr(x, sreg, sregx, NULL);
+
+ /* Bits FULL and CONFLICT are in the same position in
+ * ENDC and NXC
+ */
+ if (!(status & VC_ENDC_WATCH_FULL) ||
+ !(status & VC_ENDC_WATCH_CONFLICT))
+ break;
+ if (!synchronous)
+ return OPAL_BUSY;
+
+ if (++retries == XIVE_CACHE_WATCH_MAX_RETRIES) {
+ xive_err(x, "Reached maximum retries %d when doing "
+ "a %s cache update\n", retries,
+ ctype == xive_cache_endc ? "ENDC" : "NXC");
+ return OPAL_BUSY;
+ }
+ }
+
+ /* Perform a scrub with "want_inval" set to false to push the
+ * cache updates to memory as well
+ */
+ return __xive_cache_scrub(x, ctype, block, idx, false, false);
+}
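+
+/*
+ * Note on the watch data registers: dword i of the target entry goes
+ * through WATCH0_DATA0 + i * 8 (MMIO) / + i (XSCOM). For instance the
+ * escalation update below uses start_dword = 2 and dword_count = 1, so
+ * only DATA2 is written (32-bit words 4 and 5 of the END, which hold
+ * the escalation EAS), followed by a rewrite of DATA0 with the value
+ * read earlier to trigger the update since word 0 itself was not
+ * touched.
+ */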
+
+#ifdef XIVE_DEBUG_INIT_CACHE_UPDATES
+static bool xive_check_endc_update(struct xive *x, uint32_t idx, struct xive_end *end)
+{
+ struct xive_end *end_p = xive_get_end(x, idx);
+ struct xive_end end2;
+
+ assert(end_p);
+ end2 = *end_p;
+ if (memcmp(end, &end2, sizeof(struct xive_end)) != 0) {
+ xive_err(x, "END update mismatch idx %d\n", idx);
+ xive_err(x, "want: %08x %08x %08x %08x\n",
+ end->w0, end->w1, end->w2, end->w3);
+ xive_err(x, " %08x %08x %08x %08x\n",
+ end->w4, end->w5, end->w6, end->w7);
+ xive_err(x, "got : %08x %08x %08x %08x\n",
+ end2.w0, end2.w1, end2.w2, end2.w3);
+ xive_err(x, " %08x %08x %08x %08x\n",
+ end2.w4, end2.w5, end2.w6, end2.w7);
+ return false;
+ }
+ return true;
+}
+
+static bool xive_check_nxc_update(struct xive *x, uint32_t idx, struct xive_nvp *vp)
+{
+ struct xive_nvp *vp_p = xive_get_vp(x, idx);
+ struct xive_nvp vp2;
+
+ assert(vp_p);
+ vp2 = *vp_p;
+ if (memcmp(vp, &vp2, sizeof(struct xive_nvp)) != 0) {
+ xive_err(x, "VP update mismatch idx %d\n", idx);
+ xive_err(x, "want: %08x %08x %08x %08x\n",
+ vp->w0, vp->w1, vp->w2, vp->w3);
+ xive_err(x, " %08x %08x %08x %08x\n",
+ vp->w4, vp->w5, vp->w6, vp->w7);
+ xive_err(x, "got : %08x %08x %08x %08x\n",
+ vp2.w0, vp2.w1, vp2.w2, vp2.w3);
+ xive_err(x, " %08x %08x %08x %08x\n",
+ vp2.w4, vp2.w5, vp2.w6, vp2.w7);
+ return false;
+ }
+ return true;
+}
+#else
+static inline bool xive_check_endc_update(struct xive *x __unused,
+ uint32_t idx __unused,
+ struct xive_end *end __unused)
+{
+ return true;
+}
+
+static inline bool xive_check_nxc_update(struct xive *x __unused,
+ uint32_t idx __unused,
+ struct xive_nvp *vp __unused)
+{
+ return true;
+}
+#endif
+
+static int64_t xive_escalation_ive_cache_update(struct xive *x, uint64_t block,
+ uint64_t idx, struct xive_eas *eas,
+ bool synchronous)
+{
+ return __xive_cache_watch(x, xive_cache_endc, block, idx,
+ 2, 1, &eas->w, true, synchronous);
+}
+
+static int64_t xive_endc_cache_update(struct xive *x, uint64_t block,
+ uint64_t idx, struct xive_end *end,
+ bool synchronous)
+{
+ int64_t ret;
+
+ ret = __xive_cache_watch(x, xive_cache_endc, block, idx,
+ 0, 4, (beint64_t *)end, false, synchronous);
+ xive_check_endc_update(x, idx, end);
+ return ret;
+}
+
+static int64_t xive_nxc_cache_update(struct xive *x, uint64_t block,
+ uint64_t idx, struct xive_nvp *vp,
+ bool synchronous)
+{
+ int64_t ret;
+
+ ret = __xive_cache_watch(x, xive_cache_nxc, block, idx,
+ 0, 4, (beint64_t *)vp, false, synchronous);
+ xive_check_nxc_update(x, idx, vp);
+ return ret;
+}
+
+/*
+ * VSD
+ */
+static bool xive_set_vsd(struct xive *x, uint32_t tbl, uint32_t idx, uint64_t v)
+{
+ /* Set VC subengine */
+ xive_regw(x, VC_VSD_TABLE_ADDR,
+ SETFIELD(VC_VSD_TABLE_SELECT, 0ull, tbl) |
+ SETFIELD(VC_VSD_TABLE_ADDRESS, 0ull, idx));
+ if (x->last_reg_error)
+ return false;
+ xive_regw(x, VC_VSD_TABLE_DATA, v);
+ if (x->last_reg_error)
+ return false;
+
+ /* also set PC subengine if table is used */
+ if (tbl == VST_EAS || tbl == VST_ERQ || tbl == VST_IC)
+ return true;
+
+ xive_regw(x, PC_VSD_TABLE_ADDR,
+ SETFIELD(PC_VSD_TABLE_SELECT, 0ull, tbl) |
+ SETFIELD(PC_VSD_TABLE_ADDRESS, 0ull, idx));
+ if (x->last_reg_error)
+ return false;
+ xive_regw(x, PC_VSD_TABLE_DATA, v);
+ if (x->last_reg_error)
+ return false;
+ return true;
+}
+
+static bool xive_set_local_tables(struct xive *x)
+{
+ uint64_t base, i;
+
+ /* These have to be power of 2 sized */
+ assert(is_pow2(XIVE_ESB_SIZE));
+ assert(is_pow2(XIVE_EAT_SIZE));
+
+ /* All tables set as exclusive */
+ base = SETFIELD(VSD_MODE, 0ull, VSD_MODE_EXCLUSIVE);
+
+ /* ESB: direct mode */
+ if (!xive_set_vsd(x, VST_ESB, x->block_id, base |
+ (((uint64_t)x->sbe_base) & VSD_ADDRESS_MASK) |
+ SETFIELD(VSD_TSIZE, 0ull, ilog2(XIVE_ESB_SIZE) - 12)))
+ return false;
+
+ /* EAS: direct mode */
+ if (!xive_set_vsd(x, VST_EAS, x->block_id, base |
+ (((uint64_t)x->eat_base) & VSD_ADDRESS_MASK) |
+ SETFIELD(VSD_TSIZE, 0ull, ilog2(XIVE_EAT_SIZE) - 12)))
+ return false;
+
+ /* END: indirect mode with 64K subpages */
+ if (!xive_set_vsd(x, VST_END, x->block_id, base |
+ (((uint64_t)x->end_ind_base) & VSD_ADDRESS_MASK) |
+ VSD_INDIRECT | SETFIELD(VSD_TSIZE, 0ull,
+ ilog2(x->end_ind_size) - 12)))
+ return false;
+
+ /* NVP: indirect mode with 64K subpages */
+ if (!xive_set_vsd(x, VST_NVP, x->block_id, base |
+ (((uint64_t)x->vp_ind_base) & VSD_ADDRESS_MASK) |
+ VSD_INDIRECT | SETFIELD(VSD_TSIZE, 0ull,
+ ilog2(x->vp_ind_size) - 12)))
+ return false;
+
+ /* NVG: not used */
+ /* NVC: not used */
+
+ /* INT and SYNC: indexed with the Topology# */
+ if (!xive_set_vsd(x, VST_IC, x->chip_id, base |
+ (((uint64_t)x->ic_base) & VSD_ADDRESS_MASK) |
+ SETFIELD(VSD_TSIZE, 0ull, ilog2(x->ic_size) - 12)))
+ return false;
+
+ if (!xive_set_vsd(x, VST_SYNC, x->chip_id, base |
+ (((uint64_t)x->sync_inject) & VSD_ADDRESS_MASK) |
+ SETFIELD(VSD_TSIZE, 0ull, ilog2(x->sync_inject_size) - 12)))
+ return false;
+
+ /*
+ * ERQ: one 64K page for each queue overflow. Indexed with :
+ *
+ * 0:IPI, 1:HWD, 2:NxC, 3:INT, 4:OS-Queue, 5:Pool-Queue, 6:Hard-Queue
+ */
+ for (i = 0; i < VC_QUEUE_COUNT; i++) {
+ u64 addr = ((uint64_t)x->q_ovf) + i * PAGE_SIZE;
+ u64 cfg, sreg, sregx;
+
+ if (!xive_set_vsd(x, VST_ERQ, i, base |
+ (addr & VSD_ADDRESS_MASK) |
+ SETFIELD(VSD_TSIZE, 0ull, 4)))
+ return false;
+
+ sreg = VC_QUEUES_CFG_REM0 + i * 8;
+ sregx = X_VC_QUEUES_CFG_REM0 + i;
+ cfg = __xive_regr(x, sreg, sregx, NULL);
+ cfg |= VC_QUEUES_CFG_MEMB_EN;
+ cfg = SETFIELD(VC_QUEUES_CFG_MEMB_SZ, cfg, 4);
+ __xive_regw(x, sreg, sregx, cfg, NULL);
+ }
+
+ return true;
+}
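+
+/*
+ * VSD_TSIZE encodes the table size as 4K << TSIZE, which is why the
+ * code above programs ilog2(size) - 12: a 64K table (ilog2 = 16) gives
+ * TSIZE = 4, the same value used directly for the 64K ERQ pages.
+ */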
+
+
+/*
+ * IC BAR layout
+ *
+ * Page 0: Internal CQ register accesses (reads & writes)
+ * Page 1: Internal PC register accesses (reads & writes)
+ * Page 2: Internal VC register accesses (reads & writes)
+ * Page 3: Internal TCTXT (TIMA) reg accesses (read & writes)
+ * Page 4: Notify Port page (writes only, w/data),
+ * Page 5: Reserved
+ * Page 6: Sync Poll page (writes only, dataless)
+ * Page 7: Sync Inject page (writes only, dataless)
+ * Page 8: LSI Trigger page (writes only, dataless)
+ * Page 9: LSI SB Management page (reads & writes dataless)
+ * Pages 10-255: Reserved
+ * Pages 256-383: Direct mapped Thread Context Area (reads & writes)
+ * covering the 128 threads in P10.
+ * Pages 384-511: Reserved
+ */
+
+#define XIVE_IC_CQ_PGOFF 0
+#define XIVE_IC_PC_PGOFF 1
+#define XIVE_IC_VC_PGOFF 2
+#define XIVE_IC_TCTXT_PGOFF 3
+#define XIVE_NOTIFY_PGOFF 4
+#define XIVE_SYNC_POLL_PGOFF 6
+#define XIVE_SYNC_INJECT_PGOFF 7
+#define XIVE_LSI_TRIGGER_PGOFF 8
+#define XIVE_LSI_MGMT_PGOFF 9
+#define XIVE_IC_TM_DIRECT_PGOFF 256
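+
+/*
+ * A sub-page of the IC BAR is reached at ic_base + (pgoff << ic_shift).
+ * With the 64K pages configured below (ic_shift = 16), the notify port
+ * page for instance sits at ic_base + 0x40000 and the sync poll page at
+ * ic_base + 0x60000.
+ */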
+
+static bool xive_configure_ic_bars(struct xive *x)
+{
+ uint64_t chip_id = x->chip_id;
+ uint64_t val;
+
+ /* Reset all bars to zero */
+ xive_regwx(x, CQ_RST_CTL, CQ_RST_PB_BAR_RESET);
+
+ /* IC BAR */
+ phys_map_get(chip_id, XIVE_IC, 0, (uint64_t *)&x->ic_base, &x->ic_size);
+ val = (uint64_t)x->ic_base | CQ_IC_BAR_VALID | CQ_IC_BAR_64K;
+ x->ic_shift = 16;
+
+ xive_regwx(x, CQ_IC_BAR, val);
+ if (x->last_reg_error)
+ return false;
+
+ /*
+ * TM BAR, same address for each chip. Hence we create a fake
+ * chip 0 and use that for all phys_map_get(XIVE_TM) calls.
+ */
+ phys_map_get(0, XIVE_TM, 0, (uint64_t *)&x->tm_base, &x->tm_size);
+ val = (uint64_t)x->tm_base | CQ_TM_BAR_VALID | CQ_TM_BAR_64K;
+ x->tm_shift = 16;
+
+ xive_regwx(x, CQ_TM_BAR, val);
+ if (x->last_reg_error)
+ return false;
+
+ /* IC BAR sub-pages shortcuts */
+ x->ic_tm_direct_base = x->ic_base +
+ (XIVE_IC_TM_DIRECT_PGOFF << x->ic_shift);
+
+ return true;
+}
+
+/*
+ * NVPG, NVC, ESB, END BARs have common attributes: 64k page and only
+ * one set covering the whole BAR.
+ */
+static bool xive_configure_bars(struct xive *x)
+{
+ uint64_t chip_id = x->chip_id;
+ uint64_t val;
+ uint64_t esb_size;
+ uint64_t end_size;
+ uint64_t nvp_size;
+
+ x->nvp_size = XIVE_VP_COUNT(x) << XIVE_NVP_SHIFT;
+ x->esb_size = XIVE_INT_COUNT << XIVE_ESB_SHIFT;
+ x->end_size = XIVE_END_COUNT << XIVE_END_SHIFT;
+
+ /*
+ * NVC BAR is not configured because we do not use the XIVE2
+ * Crowd capability.
+ */
+
+ /* NVPG BAR: two pages, even NVP, odd NVG */
+ phys_map_get(chip_id, XIVE_NVPG, 0, (uint64_t *)&x->nvp_base, &nvp_size);
+ if (x->nvp_size > nvp_size) {
+ xive_err(x, "NVP table is larger than default: "
+ "0x%012llx > 0x%012llx\n", x->nvp_size, nvp_size);
+ return false;
+ }
+
+ val = (uint64_t)x->nvp_base | CQ_BAR_VALID | CQ_BAR_64K |
+ SETFIELD(CQ_BAR_RANGE, 0ull, ilog2(x->nvp_size) - 24);
+ xive_regwx(x, CQ_NVPG_BAR, val);
+ if (x->last_reg_error)
+ return false;
+
+ /* ESB BAR */
+ phys_map_get(chip_id, XIVE_ESB, 0, (uint64_t *)&x->esb_base, &esb_size);
+ if (x->esb_size > esb_size) {
+ xive_err(x, "ESB table is larger than default: "
+ "0x%012llx > 0x%012llx\n", x->esb_size, esb_size);
+ return false;
+ }
+
+ val = (uint64_t)x->esb_base | CQ_BAR_VALID | CQ_BAR_64K |
+ SETFIELD(CQ_BAR_RANGE, 0ull, ilog2(x->esb_size) - 24);
+ xive_regwx(x, CQ_ESB_BAR, val);
+ if (x->last_reg_error)
+ return false;
+
+ /* END BAR */
+ phys_map_get(chip_id, XIVE_END, 0, (uint64_t *)&x->end_base, &end_size);
+ if (x->end_size > end_size) {
+ xive_err(x, "END table is larger than default: "
+ "0x%012llx > 0x%012llx\n", x->end_size, end_size);
+ return false;
+ }
+
+ val = (uint64_t)x->end_base | CQ_BAR_VALID | CQ_BAR_64K |
+ SETFIELD(CQ_BAR_RANGE, 0ull, ilog2(x->end_size) - 24);
+ xive_regwx(x, CQ_END_BAR, val);
+ if (x->last_reg_error)
+ return false;
+
+ xive_dbg(x, "IC: %14p [0x%012llx]\n", x->ic_base, x->ic_size);
+ xive_dbg(x, "TM: %14p [0x%012llx]\n", x->tm_base, x->tm_size);
+ xive_dbg(x, "NVP: %14p [0x%012llx]\n", x->nvp_base, x->nvp_size);
+ xive_dbg(x, "ESB: %14p [0x%012llx]\n", x->esb_base, x->esb_size);
+ xive_dbg(x, "END: %14p [0x%012llx]\n", x->end_base, x->end_size);
+ xive_dbg(x, "OVF: %14p [0x%012x]\n", x->q_ovf,
+ VC_QUEUE_COUNT * PAGE_SIZE);
+
+ return true;
+}
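+
+/*
+ * CQ_BAR_RANGE is programmed as ilog2(size) - 24, i.e. the BAR covers
+ * 16M << RANGE. As a purely illustrative example, a 1G ESB space would
+ * give RANGE = 6.
+ */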
+
+static void xive_dump_mmio(struct xive *x)
+{
+ prlog(PR_DEBUG, " CQ_CFG_PB_GEN = %016llx\n",
+ in_be64(x->ic_base + CQ_CFG_PB_GEN));
+ prlog(PR_DEBUG, " CQ_MSGSND = %016llx\n",
+ in_be64(x->ic_base + CQ_MSGSND));
+}
+
+static const struct {
+ uint64_t bitmask;
+ const char *name;
+} xive_capabilities[] = {
+ { CQ_XIVE_CAP_PHB_PQ_DISABLE, "PHB PQ disable mode support" },
+ { CQ_XIVE_CAP_PHB_ABT, "PHB address based trigger mode support" },
+ { CQ_XIVE_CAP_EXPLOITATION_MODE, "Exploitation mode" },
+ { CQ_XIVE_CAP_STORE_EOI, "StoreEOI mode support" },
+ { CQ_XIVE_CAP_VP_SAVE_RESTORE, "VP Context Save and Restore" },
+};
+
+static void xive_dump_capabilities(struct xive *x, uint64_t cap_val)
+{
+ int i;
+
+ xive_dbg(x, "capabilities: %016llx\n", cap_val);
+ xive_dbg(x, "\tVersion: %lld\n",
+ GETFIELD(CQ_XIVE_CAP_VERSION, cap_val));
+ xive_dbg(x, "\tUser interrupt priorities: [ 1 - %d ]\n",
+ 1 << GETFIELD(CQ_XIVE_CAP_USER_INT_PRIO, cap_val));
+ xive_dbg(x, "\tVP interrupt priorities: [ %d - 8 ]\n",
+ 1 << GETFIELD(CQ_XIVE_CAP_VP_INT_PRIO, cap_val));
+ xive_dbg(x, "\tExtended Blockid bits: %lld\n",
+ 4 + GETFIELD(CQ_XIVE_CAP_BLOCK_ID_WIDTH, cap_val));
+
+ for (i = 0; i < ARRAY_SIZE(xive_capabilities); i++) {
+ if (xive_capabilities[i].bitmask & cap_val)
+ xive_dbg(x, "\t%s\n", xive_capabilities[i].name);
+ }
+}
+
+static const struct {
+ uint64_t bitmask;
+ const char *name;
+} xive_configs[] = {
+ { CQ_XIVE_CFG_GEN1_TIMA_OS, "Gen1 mode TIMA OS" },
+ { CQ_XIVE_CFG_GEN1_TIMA_HYP, "Gen1 mode TIMA Hyp" },
+ { CQ_XIVE_CFG_GEN1_TIMA_HYP_BLK0, "Gen1 mode TIMA General Hypervisor Block0" },
+ { CQ_XIVE_CFG_GEN1_TIMA_CROWD_DIS, "Gen1 mode TIMA Crowd disable" },
+ { CQ_XIVE_CFG_GEN1_END_ESX, "Gen1 mode END ESx" },
+ { CQ_XIVE_CFG_EN_VP_SAVE_RESTORE, "VP Context Save and Restore" },
+ { CQ_XIVE_CFG_EN_VP_SAVE_REST_STRICT, "VP Context Save and Restore strict" },
+};
+
+static void xive_dump_configuration(struct xive *x, const char *prefix,
+ uint64_t cfg_val)
+{
+ int i;
+
+ xive_dbg(x, "%s configuration: %016llx\n", prefix, cfg_val);
+ xive_dbg(x, "\tHardwired Thread Id range: %lld bits\n",
+ 7 + GETFIELD(CQ_XIVE_CFG_HYP_HARD_RANGE, cfg_val));
+ xive_dbg(x, "\tUser Interrupt priorities: [ 1 - %d ]\n",
+ 1 << GETFIELD(CQ_XIVE_CFG_USER_INT_PRIO, cfg_val));
+ xive_dbg(x, "\tVP Interrupt priorities: [ 0 - %d ]\n", xive_max_prio(x));
+ xive_dbg(x, "\tBlockId bits: %lld bits\n",
+ 4 + GETFIELD(CQ_XIVE_CFG_BLOCK_ID_WIDTH, cfg_val));
+ if (CQ_XIVE_CFG_HYP_HARD_BLKID_OVERRIDE & cfg_val)
+ xive_dbg(x, "\tHardwired BlockId: %lld\n",
+ GETFIELD(CQ_XIVE_CFG_HYP_HARD_BLOCK_ID, cfg_val));
+
+ for (i = 0; i < ARRAY_SIZE(xive_configs); i++) {
+ if (xive_configs[i].bitmask & cfg_val)
+ xive_dbg(x, "\t%s\n", xive_configs[i].name);
+ }
+}
+
+/*
+ * Default XIVE configuration
+ */
+#define XIVE_CONFIGURATION \
+ (SETFIELD(CQ_XIVE_CFG_HYP_HARD_RANGE, 0ull, CQ_XIVE_CFG_THREADID_8BITS) | \
+ SETFIELD(CQ_XIVE_CFG_VP_INT_PRIO, 0ull, CQ_XIVE_CFG_INT_PRIO_8))
+
+/*
+ * Gen1 configuration for tests (QEMU)
+ */
+#define XIVE_CONFIGURATION_GEN1 \
+ (SETFIELD(CQ_XIVE_CFG_HYP_HARD_RANGE, 0ull, CQ_XIVE_CFG_THREADID_7BITS) | \
+ SETFIELD(CQ_XIVE_CFG_VP_INT_PRIO, 0ull, CQ_XIVE_CFG_INT_PRIO_8) | \
+ CQ_XIVE_CFG_GEN1_TIMA_OS | \
+ CQ_XIVE_CFG_GEN1_TIMA_HYP | \
+ CQ_XIVE_CFG_GEN1_TIMA_HYP_BLK0 | \
+ CQ_XIVE_CFG_GEN1_TIMA_CROWD_DIS | \
+ CQ_XIVE_CFG_GEN1_END_ESX)
+
+static bool xive_has_cap(struct xive *x, uint64_t cap)
+{
+ return !!x && !!(x->capabilities & cap);
+}
+
+#define XIVE_CAN_STORE_EOI(x) xive_has_cap(x, CQ_XIVE_CAP_STORE_EOI)
+
+static bool xive_cfg_save_restore(struct xive *x)
+{
+ return !!(x->config & CQ_XIVE_CFG_EN_VP_SAVE_RESTORE);
+}
+
+/*
+ * When PQ_disable is available, configure the ESB cache to improve
+ * performance for PHB ESBs.
+ *
+ * split_mode :
+ * 1/3rd of the cache is reserved for PHB ESBs and the rest to
+ * IPIs. This is sufficient to keep all the PHB ESBs in cache and
+ * avoid ESB cache misses during IO interrupt processing.
+ *
+ * hash_array_enable :
+ * Internal cache hashing optimization. The hash_array tracks for
+ * ESBs where the original trigger came from so that we avoid
+ * getting the EAS into the cache twice.
+ */
+static void xive_config_esb_cache(struct xive *x)
+{
+ uint64_t val = xive_regr(x, VC_ESBC_CFG);
+
+ if (xive_has_cap(x, CQ_XIVE_CAP_PHB_PQ_DISABLE)) {
+ val |= VC_ESBC_CFG_SPLIT_MODE | VC_ESBC_CFG_HASH_ARRAY_ENABLE;
+ val = SETFIELD(VC_ESBC_CFG_MAX_ENTRIES_IN_MODIFIED, val, 0xE);
+ xive_dbg(x, "ESB cache configured with split mode "
+ "and hash array. VC_ESBC_CFG=%016llx\n", val);
+ } else
+ val &= ~VC_ESBC_CFG_SPLIT_MODE;
+
+ xive_regw(x, VC_ESBC_CFG, val);
+}
+
+static void xive_config_fused_core(struct xive *x)
+{
+ uint64_t val = xive_regr(x, TCTXT_CFG);
+
+ if (this_cpu()->is_fused_core) {
+ val |= TCTXT_CFG_FUSE_CORE_EN;
+ xive_dbg(x, "configured for fused cores. "
+ "PC_TCTXT_CFG=%016llx\n", val);
+ } else
+ val &= ~TCTXT_CFG_FUSE_CORE_EN;
+ xive_regw(x, TCTXT_CFG, val);
+}
+
+static void xive_config_reduced_priorities_fixup(struct xive *x)
+{
+ if (xive_cfg_vp_prio_shift(x) < CQ_XIVE_CFG_INT_PRIO_8 &&
+ x->quirks & XIVE_QUIRK_BROKEN_PRIO_CHECK) {
+ uint64_t val = xive_regr(x, PC_ERR1_CFG1);
+
+ val &= ~PC_ERR1_CFG1_INTERRUPT_INVALID_PRIO;
+ xive_dbg(x, "workaround for reduced priorities. "
+ "PC_ERR1_CFG1=%016llx\n", val);
+ xive_regw(x, PC_ERR1_CFG1, val);
+ }
+}
+
+static bool xive_config_init(struct xive *x)
+{
+ x->capabilities = xive_regr(x, CQ_XIVE_CAP);
+ xive_dump_capabilities(x, x->capabilities);
+
+ x->generation = GETFIELD(CQ_XIVE_CAP_VERSION, x->capabilities);
+
+ /*
+ * Allow QEMU to override version for tests
+ */
+ if (x->generation != XIVE_GEN2 && !chip_quirk(QUIRK_QEMU)) {
+ xive_err(x, "Invalid XIVE controller version %d\n",
+ x->generation);
+ return false;
+ }
+
+ x->config = xive_regr(x, CQ_XIVE_CFG);
+ xive_dump_configuration(x, "default", x->config);
+
+ /* Start with default settings */
+ x->config = x->generation == XIVE_GEN1 ? XIVE_CONFIGURATION_GEN1 :
+ XIVE_CONFIGURATION;
+
+ if (x->quirks & XIVE_QUIRK_THREADID_7BITS)
+ x->config = SETFIELD(CQ_XIVE_CFG_HYP_HARD_RANGE, x->config,
+ CQ_XIVE_CFG_THREADID_7BITS);
+
+ /*
+ * Hardwire the block ID. The default value is the topology ID
+ * of the chip which is different from the block.
+ */
+ x->config |= CQ_XIVE_CFG_HYP_HARD_BLKID_OVERRIDE |
+ SETFIELD(CQ_XIVE_CFG_HYP_HARD_BLOCK_ID, 0ull, x->block_id);
+
+ /*
+ * Enable "VP Context Save and Restore" by default. It is
+ * compatible with KVM, which currently does the context
+ * save & restore in the entry/exit path of the vCPU.
+ */
+ if (x->capabilities & CQ_XIVE_CAP_VP_SAVE_RESTORE)
+ x->config |= CQ_XIVE_CFG_EN_VP_SAVE_RESTORE;
+
+ xive_dump_configuration(x, "new", x->config);
+ xive_regw(x, CQ_XIVE_CFG, x->config);
+ if (xive_regr(x, CQ_XIVE_CFG) != x->config) {
+ xive_err(x, "configuration setting failed\n");
+ }
+
+ /*
+ * Disable error reporting in the FIR for info errors from the VC.
+ */
+ xive_regw(x, CQ_FIRMASK_OR, CQ_FIR_VC_INFO_ERROR_0_2);
+
+ /*
+ * Mask CI Load and Store to bad location, as IPI trigger
+ * pages may be mapped to user space, and a read on the
+ * trigger page causes a checkstop
+ */
+ xive_regw(x, CQ_FIRMASK_OR, CQ_FIR_PB_RCMDX_CI_ERR1);
+
+ /*
+ * VP space settings. P9 mode is 19 bits.
+ */
+ x->vp_shift = x->generation == XIVE_GEN1 ?
+ VP_SHIFT_GEN1 : VP_SHIFT_GEN2;
+
+ /*
+ * VP ids for HW threads. These values are hardcoded in the
+ * CAM line of the HW context
+ *
+ * POWER10 |chip|0000000000000001|threadid|
+ * 28bits 4 16 8
+ *
+ * POWER9 |chip|000000000001|thrdid |
+ * 23bits 4 12 7
+ */
+
+ /* TODO (cosmetic): set VP ids for HW threads only once */
+ xive_threadid_shift = 7 + GETFIELD(CQ_XIVE_CFG_HYP_HARD_RANGE,
+ x->config);
+
+ xive_hw_vp_base = 1 << xive_threadid_shift;
+ xive_hw_vp_count = 1 << xive_threadid_shift;
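+
+ /*
+ * Example, assuming the default 8-bit hardwired thread id range set
+ * above (CQ_XIVE_CFG_THREADID_8BITS): xive_threadid_shift is 8, so
+ * the HW thread VPs use indices 0x100..0x1ff (base 0x100, count 256),
+ * matching the POWER10 |chip|...0001|threadid| CAM layout shown
+ * above.
+ */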
+
+ xive_dbg(x, "store EOI is %savailable\n",
+ XIVE_CAN_STORE_EOI(x) ? "" : "not ");
+
+ xive_config_fused_core(x);
+
+ xive_config_esb_cache(x);
+
+ xive_config_reduced_priorities_fixup(x);
+
+ return true;
+}
+
+/* Set Translation tables : 1 block per chip */
+static bool xive_setup_set_xlate(struct xive *x)
+{
+ unsigned int i;
+
+ /* Configure ESBs */
+ xive_regw(x, CQ_TAR,
+ CQ_TAR_AUTOINC | SETFIELD(CQ_TAR_SELECT, 0ull, CQ_TAR_ESB));
+ if (x->last_reg_error)
+ return false;
+ for (i = 0; i < XIVE_MAX_BLOCKS; i++) {
+ xive_regw(x, CQ_TDR, CQ_TDR_VALID |
+ SETFIELD(CQ_TDR_BLOCK_ID, 0ull, x->block_id));
+ if (x->last_reg_error)
+ return false;
+ }
+
+ /* Configure ENDs */
+ xive_regw(x, CQ_TAR,
+ CQ_TAR_AUTOINC | SETFIELD(CQ_TAR_SELECT, 0ull, CQ_TAR_END));
+ if (x->last_reg_error)
+ return false;
+ for (i = 0; i < XIVE_MAX_BLOCKS; i++) {
+ xive_regw(x, CQ_TDR, CQ_TDR_VALID |
+ SETFIELD(CQ_TDR_BLOCK_ID, 0ull, x->block_id));
+ if (x->last_reg_error)
+ return false;
+ }
+
+ /* Configure NVPs */
+ xive_regw(x, CQ_TAR,
+ CQ_TAR_AUTOINC | SETFIELD(CQ_TAR_SELECT, 0ull, CQ_TAR_NVPG));
+ if (x->last_reg_error)
+ return false;
+ for (i = 0; i < XIVE_MAX_BLOCKS; i++) {
+ xive_regw(x, CQ_TDR, CQ_TDR_VALID |
+ SETFIELD(CQ_TDR_BLOCK_ID, 0ull, x->block_id));
+ if (x->last_reg_error)
+ return false;
+ }
+ return true;
+}
+
+static bool xive_prealloc_tables(struct xive *x)
+{
+ uint32_t i;
+ uint32_t pbase, pend;
+
+ /* ESB has 4 entries per byte */
+ x->sbe_base = local_alloc(x->chip_id, XIVE_ESB_SIZE, XIVE_ESB_SIZE);
+ if (!x->sbe_base) {
+ xive_err(x, "Failed to allocate SBE\n");
+ return false;
+ }
+
+ /* PQs are initialized to 0b01 which corresponds to "ints off" */
+ memset(x->sbe_base, 0x55, XIVE_ESB_SIZE);
+ xive_dbg(x, "SBE at %p size 0x%lx\n", x->sbe_base, XIVE_ESB_SIZE);
+
+ /* EAS entries are 8 bytes */
+ x->eat_base = local_alloc(x->chip_id, XIVE_EAT_SIZE, XIVE_EAT_SIZE);
+ if (!x->eat_base) {
+ xive_err(x, "Failed to allocate EAS\n");
+ return false;
+ }
+
+ /*
+ * We clear the entries (non-valid). They will be initialized
+ * when actually used
+ */
+ memset(x->eat_base, 0, XIVE_EAT_SIZE);
+ xive_dbg(x, "EAT at %p size 0x%lx\n", x->eat_base, XIVE_EAT_SIZE);
+
+ /* Indirect END table. Limited to one top page. */
+ x->end_ind_size = ALIGN_UP(XIVE_END_TABLE_SIZE, PAGE_SIZE);
+ if (x->end_ind_size > PAGE_SIZE) {
+ xive_err(x, "END indirect table is too big !\n");
+ return false;
+ }
+ x->end_ind_base = local_alloc(x->chip_id, x->end_ind_size,
+ x->end_ind_size);
+ if (!x->end_ind_base) {
+ xive_err(x, "Failed to allocate END indirect table\n");
+ return false;
+ }
+ memset(x->end_ind_base, 0, x->end_ind_size);
+ xive_dbg(x, "ENDi at %p size 0x%llx #%ld entries\n", x->end_ind_base,
+ x->end_ind_size, XIVE_END_COUNT);
+ x->end_ind_count = XIVE_END_TABLE_SIZE / XIVE_VSD_SIZE;
+
+ /* Indirect VP table. Limited to one top page. */
+ x->vp_ind_size = ALIGN_UP(XIVE_VP_TABLE_SIZE(x), PAGE_SIZE);
+ if (x->vp_ind_size > PAGE_SIZE) {
+ xive_err(x, "VP indirect table is too big !\n");
+ return false;
+ }
+ x->vp_ind_base = local_alloc(x->chip_id, x->vp_ind_size,
+ x->vp_ind_size);
+ if (!x->vp_ind_base) {
+ xive_err(x, "Failed to allocate VP indirect table\n");
+ return false;
+ }
+ xive_dbg(x, "VPi at %p size 0x%llx #%ld entries\n", x->vp_ind_base,
+ x->vp_ind_size, XIVE_VP_COUNT(x));
+ x->vp_ind_count = XIVE_VP_TABLE_SIZE(x) / XIVE_VSD_SIZE;
+ memset(x->vp_ind_base, 0, x->vp_ind_size);
+
+ /* Allocate pages for the VP ids representing HW threads */
+ pbase = xive_hw_vp_base / VP_PER_PAGE;
+ pend = (xive_hw_vp_base + xive_hw_vp_count) / VP_PER_PAGE;
+
+ xive_dbg(x, "Allocating pages %d to %d of VPs (for %d VPs)\n",
+ pbase, pend, xive_hw_vp_count);
+ for (i = pbase; i <= pend; i++) {
+ void *page;
+ u64 vsd;
+
+ /* Indirect entries have a VSD format */
+ page = local_alloc(x->chip_id, PAGE_SIZE, PAGE_SIZE);
+ if (!page) {
+ xive_err(x, "Failed to allocate VP page\n");
+ return false;
+ }
+ xive_dbg(x, "VP%d at %p size 0x%x\n", i, page, PAGE_SIZE);
+ memset(page, 0, PAGE_SIZE);
+ vsd = ((uint64_t)page) & VSD_ADDRESS_MASK;
+
+ vsd |= SETFIELD(VSD_TSIZE, 0ull, 4);
+ vsd |= SETFIELD(VSD_MODE, 0ull, VSD_MODE_EXCLUSIVE);
+ vsd |= VSD_FIRMWARE;
+ x->vp_ind_base[i] = cpu_to_be64(vsd);
+ }
+
+ /*
+ * Allocate a page for cache and sync injection (512 * 128 HW
+ * threads) plus one extra page for future use
+ */
+ x->sync_inject_size = PAGE_SIZE + PAGE_SIZE;
+ x->sync_inject = local_alloc(x->chip_id, x->sync_inject_size,
+ x->sync_inject_size);
+ if (!x->sync_inject) {
+ xive_err(x, "Failed to allocate sync pages\n");
+ return false;
+ }
+
+ /*
+ * The Memory Coherence Directory uses 16M "granule" to track
+ * shared copies of a cache line. If any cache line within the
+ * 16M range gets touched by someone outside of the group, the
+ * MCD forces accesses to any cache line within the range to
+ * include everyone that might have a shared copy.
+ */
+#define QUEUE_OVF_ALIGN (16 << 20) /* MCD granule size */
+
+ /*
+ * Allocate the queue overflow pages and use a 16M alignment
+ * to avoid sharing with other structures and reduce traffic
+ * on the PowerBus.
+ */
+ x->q_ovf = local_alloc(x->chip_id, VC_QUEUE_COUNT * PAGE_SIZE,
+ QUEUE_OVF_ALIGN);
+ if (!x->q_ovf) {
+ xive_err(x, "Failed to allocate queue overflow\n");
+ return false;
+ }
+ return true;
+}
+
+static void xive_add_provisioning_properties(void)
+{
+ beint32_t chips[XIVE_MAX_CHIPS];
+ uint32_t i, count;
+
+ dt_add_property_cells(xive_dt_node,
+ "ibm,xive-provision-page-size", PAGE_SIZE);
+
+ count = 1 << xive_chips_alloc_bits;
+ for (i = 0; i < count; i++)
+ chips[i] = cpu_to_be32(xive_block_to_chip[i]);
+ dt_add_property(xive_dt_node, "ibm,xive-provision-chips",
+ chips, 4 * count);
+}
+
+static void xive_create_mmio_dt_node(struct xive *x)
+{
+ uint64_t tb = (uint64_t)x->tm_base;
+ uint32_t stride = 1u << x->tm_shift;
+
+ xive_dt_node = dt_new_addr(dt_root, "interrupt-controller", tb);
+ assert(xive_dt_node);
+
+ dt_add_property_u64s(xive_dt_node, "reg",
+ tb + 0 * stride, stride,
+ tb + 1 * stride, stride,
+ tb + 2 * stride, stride,
+ tb + 3 * stride, stride);
+
+ dt_add_property_strings(xive_dt_node, "compatible",
+ "ibm,opal-xive-pe", "ibm,opal-intc");
+
+ dt_add_property_cells(xive_dt_node, "ibm,xive-eq-sizes",
+ 12, 16, 21, 24);
+
+ dt_add_property_cells(xive_dt_node, "ibm,xive-#priorities",
+ xive_cfg_vp_prio(x));
+
+ dt_add_property(xive_dt_node, "single-escalation-support", NULL, 0);
+
+ if (XIVE_CAN_STORE_EOI(x))
+ dt_add_property(xive_dt_node, "store-eoi", NULL, 0);
+
+ if (xive_cfg_save_restore(x))
+ dt_add_property(xive_dt_node, "vp-save-restore", NULL, 0);
+
+ xive_add_provisioning_properties();
+}
+
+static void xive_setup_forward_ports(struct xive *x, struct proc_chip *remote_chip)
+{
+ struct xive *remote_xive = remote_chip->xive;
+ uint64_t base = SETFIELD(VSD_MODE, 0ull, VSD_MODE_FORWARD);
+
+ if (!xive_set_vsd(x, VST_ESB, remote_xive->block_id,
+ base | ((uint64_t)remote_xive->esb_base) |
+ SETFIELD(VSD_TSIZE, 0ull, ilog2(x->esb_size) - 12)))
+ goto error;
+
+ /* EAS: No remote */
+
+ if (!xive_set_vsd(x, VST_END, remote_xive->block_id,
+ base | ((uint64_t)remote_xive->end_base) |
+ SETFIELD(VSD_TSIZE, 0ull, ilog2(x->end_size) - 12)))
+ goto error;
+
+ if (!xive_set_vsd(x, VST_NVP, remote_xive->block_id,
+ base | ((uint64_t)remote_xive->nvp_base) |
+ SETFIELD(VSD_TSIZE, 0ull, ilog2(x->nvp_size) - 12)))
+ goto error;
+
+ /* NVG: not used */
+ /* NVC: not used */
+
+ if (!xive_set_vsd(x, VST_IC, remote_xive->chip_id,
+ base | ((uint64_t)remote_xive->ic_base) |
+ SETFIELD(VSD_TSIZE, 0ull, ilog2(x->ic_size) - 12)))
+ goto error;
+
+ if (!xive_set_vsd(x, VST_SYNC, remote_xive->chip_id,
+ base | ((uint64_t)remote_xive->sync_inject) |
+ SETFIELD(VSD_TSIZE, 0ull, ilog2(x->sync_inject_size) - 12)))
+ goto error;
+
+ /* ERQ: No remote */
+
+ return;
+
+ error:
+ xive_err(x, "Failure configuring forwarding ports\n");
+}
+
+static void late_init_one_xive(struct xive *x)
+{
+ struct proc_chip *chip;
+
+ /* We need to setup the cross-chip forward ports. Let's
+ * iterate over all chips and set them up accordingly
+ */
+ for_each_chip(chip) {
+ /* We skip ourselves or chips without a xive */
+ if (chip->xive == x || !chip->xive)
+ continue;
+
+ /* Setup our forward ports to that chip */
+ xive_setup_forward_ports(x, chip);
+ }
+}
+
+static bool xive_check_ipi_free(struct xive *x, uint32_t irq, uint32_t count)
+{
+ uint32_t i, idx = GIRQ_TO_IDX(irq);
+
+ for (i = 0; i < count; i++)
+ if (bitmap_tst_bit(*x->ipi_alloc_map, idx + i))
+ return false;
+ return true;
+}
+
+uint32_t xive2_alloc_hw_irqs(uint32_t chip_id, uint32_t count,
+ uint32_t align)
+{
+ struct proc_chip *chip = get_chip(chip_id);
+ struct xive *x;
+ uint32_t base, i;
+
+ assert(chip);
+ assert(is_pow2(align));
+
+ x = chip->xive;
+ assert(x);
+
+ lock(&x->lock);
+
+ /* Allocate the HW interrupts */
+ base = x->int_hw_bot - count;
+ base &= ~(align - 1);
+ if (base < x->int_ipi_top) {
+ xive_err(x,
+ "HW alloc request for %d interrupts aligned to %d failed\n",
+ count, align);
+ unlock(&x->lock);
+ return XIVE_IRQ_ERROR;
+ }
+ if (!xive_check_ipi_free(x, base, count)) {
+ xive_err(x, "HWIRQ boot allocator request overlaps dynamic allocator\n");
+ unlock(&x->lock);
+ return XIVE_IRQ_ERROR;
+ }
+
+ x->int_hw_bot = base;
+
+ /* Initialize the corresponding EAS entries to sane defaults,
+ * i.e. the entry is valid, not routed and masked, and the END
+ * data is set to the GIRQ number.
+ */
+ for (i = 0; i < count; i++) {
+ struct xive_eas *eas = xive_get_eas(x, base + i);
+
+ eas->w = xive_set_field64(EAS_VALID, 0, 1) |
+ xive_set_field64(EAS_MASKED, 0, 1) |
+ xive_set_field64(EAS_END_DATA, 0, base + i);
+ }
+
+ unlock(&x->lock);
+ return base;
+}
+
+uint32_t xive2_alloc_ipi_irqs(uint32_t chip_id, uint32_t count,
+ uint32_t align)
+{
+ struct proc_chip *chip = get_chip(chip_id);
+ struct xive *x;
+ uint32_t base, i;
+
+ assert(chip);
+ assert(is_pow2(align));
+
+ x = chip->xive;
+ assert(x);
+
+ lock(&x->lock);
+
+ /* Allocate the IPI interrupts */
+ base = x->int_ipi_top + (align - 1);
+ base &= ~(align - 1);
+ if (base >= x->int_hw_bot) {
+ xive_err(x,
+ "IPI alloc request for %d interrupts aligned to %d failed\n",
+ count, align);
+ unlock(&x->lock);
+ return XIVE_IRQ_ERROR;
+ }
+ if (!xive_check_ipi_free(x, base, count)) {
+ xive_err(x, "IPI boot allocator request overlaps dynamic allocator\n");
+ unlock(&x->lock);
+ return XIVE_IRQ_ERROR;
+ }
+
+ x->int_ipi_top = base + count;
+
+ /* Initialize the corresponding EAS entries to sane defaults,
+ * i.e. the entry is valid, not routed and masked, and the END data is set
+ * to the GIRQ number.
+ */
+ for (i = 0; i < count; i++) {
+ struct xive_eas *eas = xive_get_eas(x, base + i);
+
+ eas->w = xive_set_field64(EAS_VALID, 0, 1) |
+ xive_set_field64(EAS_MASKED, 0, 1) |
+ xive_set_field64(EAS_END_DATA, 0, base + i);
+ }
+
+ unlock(&x->lock);
+ return base;
+}
+
+void *xive2_get_trigger_port(uint32_t girq)
+{
+ uint32_t idx = GIRQ_TO_IDX(girq);
+ struct xive *x;
+
+ /* Find XIVE on which the EAS resides */
+ x = xive_from_isn(girq);
+ if (!x)
+ return NULL;
+
+ if (GIRQ_IS_ESCALATION(girq)) {
+ /* There is no trigger page for escalation interrupts */
+ return NULL;
+ } else {
+ /* Make sure it's an IPI on that chip */
+ if (girq < x->int_base ||
+ girq >= x->int_ipi_top)
+ return NULL;
+
+ return x->esb_base + idx * XIVE_ESB_PAGE_SIZE;
+ }
+}
+
+/*
+ * Notify Port page (writes only, w/data), separated into two
+ * categories, both sent to VC:
+ * - IPI queue (Addr bit 52 = 0) (for NPU)
+ * - HW queue (Addr bit 52 = 1)
+ */
+uint64_t xive2_get_notify_port(uint32_t chip_id, uint32_t ent)
+{
+ struct proc_chip *chip = get_chip(chip_id);
+ struct xive *x;
+ uint32_t offset = 0;
+
+ assert(chip);
+ x = chip->xive;
+ assert(x);
+
+ /* This is where we can assign a different HW queue to a different
+ * source by offsetting into the cache lines of the notify port
+ *
+ * For now we keep it very basic, this will have to be looked at
+ * again on real HW with some proper performance analysis.
+ *
+ * Here's what Florian says on the matter:
+ *
+ * <<
+ * The first 2k of the notify port page can all be used for PCIe triggers
+ *
+ * However the idea would be that we try to use the first 4 cache lines to
+ * balance the PCIe Interrupt requests to use the least used snoop buses
+ * (we went from 2 to 4 snoop buses for P9). snoop 0 is heavily used
+ * (I think TLBIs are using that in addition to the normal addresses),
+ * snoop 3 is used for all Int commands, so I think snoop 2 (CL 2 in the
+ * page) is the least used overall. So we should probably use that one for
+ * the Int commands from PCIe.
+ *
+ * In addition, our EAS cache supports hashing to provide "private" cache
+ * areas for the PHBs in the shared 1k EAS cache. This allows e.g. to avoid
+ * that one "thrashing" PHB thrashes the EAS cache for everyone, or provide
+ * a PHB with a private area that would allow high cache hits in case of a
+ * device using very few interrupts. The hashing is based on the offset within
+ * the cache line. So using that, you can e.g. set the EAS cache up so that
+ * IPIs use 512 entries, the x16 PHB uses 256 entries and the x8 PHBs 128
+ * entries each - or IPIs using all entries and sharing with PHBs, so PHBs
+ * would use 512 entries and 256 entries respectively.
+ *
+ * This is a tuning we would probably do later in the lab, but as a "prep"
+ * we should set up the different PHBs such that they are using different
+ * 8B-aligned offsets within the cache line, so e.g.
+ * PH4_0 addr 0x100 (CL 2 DW0)
+ * PH4_1 addr 0x108 (CL 2 DW1)
+ * PH4_2 addr 0x110 (CL 2 DW2)
+ * etc.
+ * >>
+ *
+ * I'm using snoop1 for PHB0 and snoop2 for everybody else.
+ */
+
+ /* Florian adds :
+ *
+ * we just set them up for a start to have different offsets
+ * within the cache line so that we could use the allocation
+ * restrictions that can be enforced in the interrupt
+ * controller
+ *
+ * P10 might now be randomizing the cache line bits in HW to
+ * balance snoop bus usage
+ */
+ switch(ent) {
+ case XIVE_HW_SRC_PHBn(0):
+ offset = 0x800;
+ break;
+ case XIVE_HW_SRC_PHBn(1):
+ offset = 0x908;
+ break;
+ case XIVE_HW_SRC_PHBn(2):
+ offset = 0x910;
+ break;
+ case XIVE_HW_SRC_PHBn(3):
+ offset = 0x918;
+ break;
+ case XIVE_HW_SRC_PHBn(4):
+ offset = 0x920;
+ break;
+ case XIVE_HW_SRC_PHBn(5):
+ offset = 0x928;
+ break;
+ case XIVE_HW_SRC_PSI:
+ offset = 0x930;
+ break;
+ default:
+ assert(false);
+ return 0;
+ }
+
+ return ((uint64_t)x->ic_base) +
+ (XIVE_NOTIFY_PGOFF << x->ic_shift) + offset;
+}
+
+/* Manufacture the powerbus packet bits 32:63 */
+__attrconst uint32_t xive2_get_notify_base(uint32_t girq)
+{
+ return (GIRQ_TO_BLK(girq) << 28) | GIRQ_TO_IDX(girq);
+}
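+
+/*
+ * e.g. a girq in block 1 with index 0x23 yields a notify base of
+ * (1 << 28) | 0x23 = 0x10000023.
+ */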
+
+static bool xive_get_irq_targetting(uint32_t isn, uint32_t *out_target,
+ uint8_t *out_prio, uint32_t *out_lirq)
+{
+ struct xive_eas *eas;
+ struct xive *x, *end_x;
+ struct xive_end *end;
+ uint32_t end_blk, end_idx;
+ uint32_t vp_blk, vp_idx;
+ uint32_t prio, server;
+ bool is_escalation = GIRQ_IS_ESCALATION(isn);
+
+ /* Find XIVE on which the EAS resides */
+ x = xive_from_isn(isn);
+ if (!x)
+ return false;
+ /* Grab the EAS */
+ eas = xive_get_eas(x, isn);
+ if (!eas)
+ return false;
+ if (!xive_get_field64(EAS_VALID, eas->w) && !is_escalation) {
+ xive_err(x, "ISN %x lead to invalid EAS !\n", isn);
+ return false;
+ }
+
+ if (out_lirq)
+ *out_lirq = xive_get_field64(EAS_END_DATA, eas->w);
+
+ /* Find the END and its xive instance */
+ end_blk = xive_get_field64(EAS_END_BLOCK, eas->w);
+ end_idx = xive_get_field64(EAS_END_INDEX, eas->w);
+ end_x = xive_from_vc_blk(end_blk);
+
+ /* This can fail if the interrupt hasn't been initialized yet
+ * but it should also be masked, so fail silently
+ */
+ if (!end_x)
+ goto pick_default;
+ end = xive_get_end(end_x, end_idx);
+ if (!end)
+ goto pick_default;
+
+ /* XXX Check valid and format 0 */
+
+ /* No priority conversion, return the actual one ! */
+ if (xive_get_field64(EAS_MASKED, eas->w))
+ prio = 0xff;
+ else
+ prio = xive_get_field32(END_W7_F0_PRIORITY, end->w7);
+ if (out_prio)
+ *out_prio = prio;
+
+ vp_blk = xive_get_field32(END_W6_VP_BLOCK, end->w6);
+ vp_idx = xive_get_field32(END_W6_VP_OFFSET, end->w6);
+ server = VP2PIR(vp_blk, vp_idx);
+
+ if (out_target)
+ *out_target = server;
+
+ xive_vdbg(end_x, "END info for ISN %x: prio=%d, server=0x%x (VP %x/%x)\n",
+ isn, prio, server, vp_blk, vp_idx);
+ return true;
+
+pick_default:
+ xive_vdbg(end_x, "END info for ISN %x: Using masked defaults\n", isn);
+
+ if (out_prio)
+ *out_prio = 0xff;
+ /* Pick a default: the current CPU (me) will be fine ... */
+ if (out_target)
+ *out_target = mfspr(SPR_PIR);
+ return true;
+}
+
+static inline bool xive_end_for_target(uint32_t target, uint8_t prio,
+ uint32_t *out_end_blk,
+ uint32_t *out_end_idx)
+{
+ struct xive *x;
+ struct xive_nvp *vp;
+ uint32_t vp_blk, vp_idx;
+ uint32_t end_blk, end_idx;
+
+ if (prio > xive_max_prio(one_xive))
+ return false;
+
+ /* Get the VP block/index from the target word */
+ if (!xive_decode_vp(target, &vp_blk, &vp_idx, NULL, NULL))
+ return false;
+
+ /* Grab the target VP's XIVE */
+ x = xive_from_pc_blk(vp_blk);
+ if (!x)
+ return false;
+
+ /* Find the VP structure where we stashed the END number */
+ vp = xive_get_vp(x, vp_idx);
+ if (!vp)
+ return false;
+
+ end_blk = xive_get_field32(NVP_W5_VP_END_BLOCK, vp->w5);
+ end_idx = xive_get_field32(NVP_W5_VP_END_INDEX, vp->w5);
+
+ /* Currently the END block and VP block should be the same */
+ if (end_blk != vp_blk) {
+ xive_err(x, "end_blk != vp_blk (%d vs. %d) for target 0x%08x/%d\n",
+ end_blk, vp_blk, target, prio);
+ assert(false);
+ }
+
+ if (out_end_blk)
+ *out_end_blk = end_blk;
+ if (out_end_idx)
+ *out_end_idx = end_idx + prio;
+
+ return true;
+}
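+
+/*
+ * In other words, a VP whose word 5 stashes END block b and index i is
+ * served, for priority p, by END (b, i + p). This is why a contiguous
+ * set of ENDs is allocated per HW thread (see xive_alloc_end_set() in
+ * xive_provision_cpu() below).
+ */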
+
+static int64_t xive_set_irq_targetting(uint32_t isn, uint32_t target,
+ uint8_t prio, uint32_t lirq,
+ bool synchronous)
+{
+ struct xive *x;
+ struct xive_eas *eas, new_eas;
+ uint32_t end_blk, end_idx;
+ bool is_escalation = GIRQ_IS_ESCALATION(isn);
+ int64_t rc;
+
+ /* Find XIVE on which the EAS resides */
+ x = xive_from_isn(isn);
+ if (!x)
+ return OPAL_PARAMETER;
+ /* Grab the EAS */
+ eas = xive_get_eas(x, isn);
+ if (!eas)
+ return OPAL_PARAMETER;
+ if (!xive_get_field64(EAS_VALID, eas->w) && !is_escalation) {
+ xive_err(x, "ISN %x lead to invalid EAS !\n", isn);
+ return OPAL_PARAMETER;
+ }
+
+ lock(&x->lock);
+
+ /* Read existing EAS */
+ new_eas = *eas;
+
+ /* Are we masking ? */
+ if (prio == 0xff && !is_escalation) {
+ new_eas.w = xive_set_field64(EAS_MASKED, new_eas.w, 1);
+ xive_vdbg(x, "ISN %x masked !\n", isn);
+
+ /* Put prio 7 in the END */
+ prio = xive_max_prio(x);
+ } else {
+ /* Unmasking */
+ new_eas.w = xive_set_field64(EAS_MASKED, new_eas.w, 0);
+ xive_vdbg(x, "ISN %x unmasked !\n", isn);
+
+ /* For normal interrupt sources, keep track of which ones
+ * we ever enabled since the last reset
+ */
+ if (!is_escalation)
+ bitmap_set_bit(*x->int_enabled_map, GIRQ_TO_IDX(isn));
+ }
+
+ /* If prio isn't 0xff, re-target the EAS. First find the END
+ * corresponding to the target
+ */
+ if (prio != 0xff) {
+ if (!xive_end_for_target(target, prio, &end_blk, &end_idx)) {
+ xive_err(x, "Can't find END for target/prio 0x%x/%d\n",
+ target, prio);
+ unlock(&x->lock);
+ return OPAL_PARAMETER;
+ }
+
+ /* Try to update it atomically to avoid an intermediary
+ * stale state
+ */
+ new_eas.w = xive_set_field64(EAS_END_BLOCK, new_eas.w, end_blk);
+ new_eas.w = xive_set_field64(EAS_END_INDEX, new_eas.w, end_idx);
+ }
+ new_eas.w = xive_set_field64(EAS_END_DATA, new_eas.w, lirq);
+
+ xive_vdbg(x,"ISN %x routed to end %x/%x lirq=%08x EAS=%016llx !\n",
+ isn, end_blk, end_idx, lirq, new_eas.w);
+
+ /* Updating the cache differs between real EAS and escalation
+ * EAS inside an END
+ */
+ if (is_escalation) {
+ rc = xive_escalation_ive_cache_update(x, x->block_id,
+ GIRQ_TO_IDX(isn), &new_eas, synchronous);
+ } else {
+ sync();
+ *eas = new_eas;
+ rc = xive_easc_scrub(x, x->block_id, GIRQ_TO_IDX(isn));
+ }
+
+ unlock(&x->lock);
+ return rc;
+}
+
+static void xive_update_irq_mask(struct xive_src *s, uint32_t idx, bool masked)
+{
+ void *mmio_base = s->esb_mmio + (1ul << s->esb_shift) * idx;
+ uint32_t offset;
+
+ /* XXX FIXME: A quick mask/unmask can make us shoot an interrupt
+ * more than once to a queue. We need to keep better track
+ */
+ if (s->flags & XIVE_SRC_EOI_PAGE1)
+ mmio_base += 1ull << (s->esb_shift - 1);
+ if (masked)
+ offset = XIVE_ESB_SET_PQ_01;
+ else
+ offset = XIVE_ESB_SET_PQ_00;
+
+ in_be64(mmio_base + offset);
+}
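+
+/*
+ * The ESB load offsets used above force the PQ bits of the source:
+ * SET_PQ_01 leaves it in the 0b01 "ints off" state (the same state the
+ * SBEs are initialized to in xive_prealloc_tables()), while SET_PQ_00
+ * re-arms it so a new trigger can be notified again.
+ */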
+
+#define XIVE_SYNC_IPI 0x000
+#define XIVE_SYNC_HW 0x080
+#define XIVE_SYNC_NxC 0x100
+#define XIVE_SYNC_INT 0x180
+#define XIVE_SYNC_OS_ESC 0x200
+#define XIVE_SYNC_POOL_ESC 0x280
+#define XIVE_SYNC_HARD_ESC 0x300
+
+static int64_t xive_sync(struct xive *x)
+{
+ uint64_t r;
+ void *sync_base;
+
+ lock(&x->lock);
+
+ sync_base = x->ic_base + (XIVE_SYNC_POLL_PGOFF << x->ic_shift);
+
+ out_be64(sync_base + XIVE_SYNC_IPI, 0);
+ out_be64(sync_base + XIVE_SYNC_HW, 0);
+ out_be64(sync_base + XIVE_SYNC_NxC, 0);
+ out_be64(sync_base + XIVE_SYNC_INT, 0);
+ out_be64(sync_base + XIVE_SYNC_OS_ESC, 0);
+ out_be64(sync_base + XIVE_SYNC_POOL_ESC, 0);
+ out_be64(sync_base + XIVE_SYNC_HARD_ESC, 0);
+
+ /* XXX Add timeout */
+ for (;;) {
+ r = xive_regr(x, VC_ENDC_SYNC_DONE);
+ if ((r & VC_ENDC_SYNC_POLL_DONE) == VC_ENDC_SYNC_POLL_DONE)
+ break;
+ cpu_relax();
+ }
+ xive_regw(x, VC_ENDC_SYNC_DONE, r & ~VC_ENDC_SYNC_POLL_DONE);
+
+ /*
+ * Do a read after clearing the sync done bit to prevent any
+ * race between CI write and next sync command
+ */
+ xive_regr(x, VC_ENDC_SYNC_DONE);
+
+ unlock(&x->lock);
+ return 0;
+}
+
+static int64_t __xive_set_irq_config(struct irq_source *is, uint32_t girq,
+ uint64_t vp, uint8_t prio, uint32_t lirq,
+ bool update_esb, bool sync)
+{
+ struct xive_src *s = container_of(is, struct xive_src, is);
+ uint32_t old_target, vp_blk;
+ u8 old_prio;
+ int64_t rc;
+
+ /* Grab existing target */
+ if (!xive_get_irq_targetting(girq, &old_target, &old_prio, NULL))
+ return OPAL_PARAMETER;
+
+ /* Let XIVE configure the END. We do the update without the
+ * synchronous flag, thus a cache update failure will result
+ * in us returning OPAL_BUSY
+ */
+ rc = xive_set_irq_targetting(girq, vp, prio, lirq, false);
+ if (rc)
+ return rc;
+
+ /* Do we need to update the mask ? */
+ if (old_prio != prio && (old_prio == 0xff || prio == 0xff)) {
+ /* The source has special variants of masking/unmasking */
+ if (update_esb) {
+ /* Ensure it's enabled/disabled in the source
+ * controller
+ */
+ xive_update_irq_mask(s, girq - s->esb_base,
+ prio == 0xff);
+ }
+ }
+
+ /*
+ * Synchronize the source and old target XIVEs to ensure that
+ * all pending interrupts to the old target have reached their
+ * respective queue.
+ *
+ * WARNING: This assumes the VP and its queues are on the same
+ * XIVE instance !
+ */
+ if (!sync)
+ return OPAL_SUCCESS;
+ xive_sync(s->xive);
+ if (xive_decode_vp(old_target, &vp_blk, NULL, NULL, NULL)) {
+ struct xive *x = xive_from_pc_blk(vp_blk);
+ if (x)
+ xive_sync(x);
+ }
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t xive_set_irq_config(uint32_t girq, uint64_t vp, uint8_t prio,
+ uint32_t lirq, bool update_esb)
+{
+ struct irq_source *is = irq_find_source(girq);
+
+ return __xive_set_irq_config(is, girq, vp, prio, lirq, update_esb,
+ true);
+}
+
+static void xive_source_interrupt(struct irq_source *is, uint32_t isn)
+{
+ struct xive_src *s = container_of(is, struct xive_src, is);
+
+ if (!s->orig_ops || !s->orig_ops->interrupt)
+ return;
+ s->orig_ops->interrupt(is, isn);
+}
+
+static uint64_t xive_source_attributes(struct irq_source *is, uint32_t isn)
+{
+ struct xive_src *s = container_of(is, struct xive_src, is);
+
+ if (!s->orig_ops || !s->orig_ops->attributes)
+ return IRQ_ATTR_TARGET_LINUX;
+ return s->orig_ops->attributes(is, isn);
+}
+
+static char *xive_source_name(struct irq_source *is, uint32_t isn)
+{
+ struct xive_src *s = container_of(is, struct xive_src, is);
+
+ if (!s->orig_ops || !s->orig_ops->name)
+ return NULL;
+ return s->orig_ops->name(is, isn);
+}
+
+void xive2_source_mask(struct irq_source *is, uint32_t isn)
+{
+ struct xive_src *s = container_of(is, struct xive_src, is);
+
+ xive_update_irq_mask(s, isn - s->esb_base, true);
+}
+
+static const struct irq_source_ops xive_irq_source_ops = {
+ .interrupt = xive_source_interrupt,
+ .attributes = xive_source_attributes,
+ .name = xive_source_name,
+};
+
+static void __xive_register_source(struct xive *x, struct xive_src *s,
+ uint32_t base, uint32_t count,
+ uint32_t shift, void *mmio, uint32_t flags,
+ bool secondary, void *data,
+ const struct irq_source_ops *orig_ops)
+{
+ s->esb_base = base;
+ s->esb_shift = shift;
+ s->esb_mmio = mmio;
+ s->flags = flags;
+ s->orig_ops = orig_ops;
+ s->xive = x;
+ s->is.start = base;
+ s->is.end = base + count;
+ s->is.ops = &xive_irq_source_ops;
+ s->is.data = data;
+
+ __register_irq_source(&s->is, secondary);
+}
+
+void xive2_register_hw_source(uint32_t base, uint32_t count, uint32_t shift,
+ void *mmio, uint32_t flags, void *data,
+ const struct irq_source_ops *ops)
+{
+ struct xive_src *s;
+ struct xive *x = xive_from_isn(base);
+
+ assert(x);
+
+ s = malloc(sizeof(struct xive_src));
+ assert(s);
+ __xive_register_source(x, s, base, count, shift, mmio, flags,
+ false, data, ops);
+}
+
+static void __xive2_register_esb_source(uint32_t base, uint32_t count,
+ void *data, const struct irq_source_ops *ops)
+{
+ struct xive_src *s;
+ struct xive *x = xive_from_isn(base);
+ uint32_t base_idx = GIRQ_TO_IDX(base);
+ void *mmio_base;
+ uint32_t flags = XIVE_SRC_EOI_PAGE1 | XIVE_SRC_TRIGGER_PAGE;
+
+ assert(x);
+
+ s = malloc(sizeof(struct xive_src));
+ assert(s);
+
+ if (XIVE_CAN_STORE_EOI(x))
+ flags |= XIVE_SRC_STORE_EOI;
+
+ /* Callbacks assume the MMIO base corresponds to the first
+ * interrupt of that source structure so adjust it
+ */
+ mmio_base = x->esb_base + (1ul << XIVE_ESB_SHIFT) * base_idx;
+ __xive_register_source(x, s, base, count, XIVE_ESB_SHIFT, mmio_base,
+ flags, false, data, ops);
+}
+
+/*
+ * Check that IPI sources have interrupt numbers in the IPI interrupt
+ * number range
+ */
+void xive2_register_ipi_source(uint32_t base, uint32_t count, void *data,
+ const struct irq_source_ops *ops)
+{
+ struct xive *x = xive_from_isn(base);
+
+ assert(x);
+ assert(base >= x->int_base && (base + count) <= x->int_ipi_top);
+
+ __xive2_register_esb_source(base, count, data, ops);
+}
+
+/*
+ * Some HW sources (PHB) can disable the use of their own ESB pages
+ * and offload all the checks on ESB pages of the IC. The interrupt
+ * numbers are not necessarily in the IPI range.
+ */
+void xive2_register_esb_source(uint32_t base, uint32_t count)
+{
+ __xive2_register_esb_source(base, count, NULL, NULL);
+}
+
+uint64_t xive2_get_esb_base(uint32_t base)
+{
+ struct xive *x = xive_from_isn(base);
+ uint32_t base_idx = GIRQ_TO_IDX(base);
+
+ assert(x);
+
+ return (uint64_t) x->esb_base + (1ul << XIVE_ESB_SHIFT) * base_idx;
+}
+
+static void xive_set_quirks(struct xive *x, struct proc_chip *chip __unused)
+{
+ uint64_t quirks = 0;
+
+ /* This extension is dropped for P10 */
+ if (proc_gen == proc_gen_p10)
+ quirks |= XIVE_QUIRK_THREADID_7BITS;
+
+ /* Broken check on invalid priority when reduced priorities is in use */
+ if (proc_gen == proc_gen_p10)
+ quirks |= XIVE_QUIRK_BROKEN_PRIO_CHECK;
+
+ xive_dbg(x, "setting XIVE quirks to %016llx\n", quirks);
+ x->quirks = quirks;
+}
+
+static struct xive *init_one_xive(struct dt_node *np)
+{
+ struct xive *x;
+ struct proc_chip *chip;
+ uint32_t flags;
+
+ x = zalloc(sizeof(struct xive));
+ assert(x);
+ x->x_node = np;
+ x->xscom_base = dt_get_address(np, 0, NULL);
+ x->chip_id = dt_get_chip_id(np);
+
+ /* "Allocate" a new block ID for the chip */
+ x->block_id = xive_block_count++;
+ assert (x->block_id < XIVE_MAX_CHIPS);
+ xive_block_to_chip[x->block_id] = x->chip_id;
+ init_lock(&x->lock);
+
+ chip = get_chip(x->chip_id);
+ assert(chip);
+
+ xive_notice(x, "Initializing XIVE block ID %d...\n", x->block_id);
+ chip->xive = x;
+
+ xive_set_quirks(x, chip);
+
+ list_head_init(&x->donated_pages);
+
+ /* Base interrupt numbers and allocator init */
+
+ x->int_base = BLKIDX_TO_GIRQ(x->block_id, 0);
+ x->int_count = x->int_base + XIVE_INT_COUNT;
+ x->int_hw_bot = x->int_count;
+ x->int_ipi_top = x->int_base;
+
+ if (x->int_ipi_top < XIVE_INT_FIRST)
+ x->int_ipi_top = XIVE_INT_FIRST;
+
+ /* Allocate a few bitmaps */
+ x->end_map = local_alloc(x->chip_id, BITMAP_BYTES(xive_end_bitmap_size(x)), PAGE_SIZE);
+ assert(x->end_map);
+ memset(x->end_map, 0, BITMAP_BYTES(xive_end_bitmap_size(x)));
+
+ /*
+ * Allocate END index 0 to make sure it cannot be used as an
+ * END base for a VP. This is the criterion to know if a VP was
+ * allocated.
+ */
+ bitmap_set_bit(*x->end_map, 0);
+
+ x->int_enabled_map = local_alloc(x->chip_id, BITMAP_BYTES(XIVE_INT_COUNT), PAGE_SIZE);
+ assert(x->int_enabled_map);
+ memset(x->int_enabled_map, 0, BITMAP_BYTES(XIVE_INT_COUNT));
+ x->ipi_alloc_map = local_alloc(x->chip_id, BITMAP_BYTES(XIVE_INT_COUNT), PAGE_SIZE);
+ assert(x->ipi_alloc_map);
+ memset(x->ipi_alloc_map, 0, BITMAP_BYTES(XIVE_INT_COUNT));
+
+ xive_dbg(x, "Handling interrupts [%08x..%08x]\n",
+ x->int_base, x->int_count - 1);
+
+ /* Setup the IC BARs */
+ if (!xive_configure_ic_bars(x))
+ goto fail;
+
+ /* Some basic global inits such as page sizes etc... */
+ if (!xive_config_init(x))
+ goto fail;
+
+ /* Configure the set translations for MMIO */
+ if (!xive_setup_set_xlate(x))
+ goto fail;
+
+ /* Dump some MMIO registers for diagnostics */
+ xive_dump_mmio(x);
+
+ /* Pre-allocate a number of tables */
+ if (!xive_prealloc_tables(x))
+ goto fail;
+
+ /* Set up the BARs for the XIVE structures */
+ if (!xive_configure_bars(x))
+ goto fail;
+
+ /*
+ * Configure local tables in VSDs (forward ports will be
+ * handled later)
+ */
+ if (!xive_set_local_tables(x))
+ goto fail;
+
+ /* Register built-in source controllers (aka IPIs) */
+ flags = XIVE_SRC_EOI_PAGE1 | XIVE_SRC_TRIGGER_PAGE;
+ if (XIVE_CAN_STORE_EOI(x))
+ flags |= XIVE_SRC_STORE_EOI;
+ __xive_register_source(x, &x->ipis, x->int_base,
+ x->int_hw_bot - x->int_base, XIVE_ESB_SHIFT,
+ x->esb_base, flags, true, NULL, NULL);
+
+ /* Register escalation sources (ENDs)
+ *
+ * The ESe PQ bits are used for coalescing and the END ESB for
+ * interrupt management. The word 4&5 of the END is the EAS
+ * for the escalation source and the indexing is the same as
+ * the END.
+ *
+ * This is an OPAL primary source, IPIs are secondary.
+ */
+ __xive_register_source(x, &x->esc_irqs,
+ MAKE_ESCALATION_GIRQ(x->block_id, 0),
+ XIVE_END_COUNT, XIVE_END_SHIFT,
+ x->end_base, XIVE_SRC_EOI_PAGE1,
+ false, NULL, NULL);
+
+ return x;
+ fail:
+ xive_err(x, "Initialization failed...\n");
+
+ /* Should this be fatal ? */
+ //assert(false);
+ return NULL;
+}
+
+static void xive_reset_enable_thread(struct cpu_thread *c)
+{
+ struct proc_chip *chip = get_chip(c->chip_id);
+ struct xive *x = chip->xive;
+ uint32_t fc, bit;
+ uint64_t enable;
+
+ /* Get fused core number */
+ fc = (c->pir >> 3) & 0xf;
+
+ /* Get bit in register */
+ bit = c->pir & 0x3f;
+
+ /* Get which register to access */
+ if (fc < 8) {
+ xive_regw(x, TCTXT_EN0_RESET, PPC_BIT(bit));
+ xive_regw(x, TCTXT_EN0_SET, PPC_BIT(bit));
+
+ enable = xive_regr(x, TCTXT_EN0);
+ if (!(enable & PPC_BIT(bit)))
+ xive_cpu_err(c, "Failed to enable thread\n");
+ } else {
+ xive_regw(x, TCTXT_EN1_RESET, PPC_BIT(bit));
+ xive_regw(x, TCTXT_EN1_SET, PPC_BIT(bit));
+
+ enable = xive_regr(x, TCTXT_EN1);
+ if (!(enable & PPC_BIT(bit)))
+ xive_cpu_err(c, "Failed to enable thread\n");
+ }
+}
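+
+/*
+ * Example of the PIR decode above: PIR 0x47 gives fc = (0x47 >> 3) & 0xf
+ * = 8, so the thread is controlled through the TCTXT_EN1 registers, at
+ * bit 0x47 & 0x3f = 7.
+ */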
+
+void xive2_cpu_callin(struct cpu_thread *cpu)
+{
+ struct xive_cpu_state *xs = cpu->xstate;
+ uint8_t old_w2 __unused, w2 __unused;
+
+ if (!xs)
+ return;
+
+ /* Reset the HW thread context and enable it */
+ xive_reset_enable_thread(cpu);
+
+ /* Set VT to 1 */
+ old_w2 = in_8(xs->tm_ring1 + TM_QW3_HV_PHYS + TM_WORD2);
+ out_8(xs->tm_ring1 + TM_QW3_HV_PHYS + TM_WORD2, 0x80);
+ w2 = in_8(xs->tm_ring1 + TM_QW3_HV_PHYS + TM_WORD2);
+
+ xive_cpu_vdbg(cpu, "Initialized TIMA VP=%x/%x W01=%016llx W2=%02x->%02x\n",
+ xs->vp_blk, xs->vp_idx,
+ in_be64(xs->tm_ring1 + TM_QW3_HV_PHYS),
+ old_w2, w2);
+}
+
+#ifdef XIVE_EXTRA_CHECK_INIT_CACHE
+#define CHECK_INIT_CACHE_LOOP 0x100
+static void xive_special_cache_check(struct xive *x, uint32_t blk, uint32_t idx)
+{
+ struct xive_nvp vp = {0};
+ uint32_t i;
+
+ /*
+ * SIMICS checks the value of reserved fields
+ */
+ if (chip_quirk(QUIRK_SIMICS))
+ return;
+
+ for (i = 0; i < CHECK_INIT_CACHE_LOOP; i++) {
+ struct xive_nvp *vp_m = xive_get_vp(x, idx);
+
+ memset(vp_m, (~i) & 0xff, sizeof(*vp_m));
+ sync();
+ vp.w1 = (i << 16) | i;
+ assert(!xive_nxc_cache_update(x, blk, idx, &vp, true));
+ if (!xive_check_nxc_update(x, idx, &vp)) {
+ xive_dbg(x, "NXC update test failed at %d iterations\n", i);
+ return;
+ }
+ }
+ xive_dbg(x, "NXC update test passed for %d/0x%x\n", blk, idx);
+}
+#else
+static inline void xive_special_cache_check(struct xive *x __unused,
+ uint32_t blk __unused,
+ uint32_t idx __unused)
+{
+}
+#endif
+
+static void xive_init_cpu_exploitation(struct xive_cpu_state *xs)
+{
+ struct xive_end end;
+ struct xive_nvp vp;
+ struct xive *x_vp, *x_end;
+ int i;
+
+ /* Grab the XIVE where the VP resides. It could be different from
+ * the local chip XIVE if not using block group mode
+ */
+ x_vp = xive_from_pc_blk(xs->vp_blk);
+ assert(x_vp);
+
+ /* Grab the XIVE where the END resides. It should be the same
+ * as the VP.
+ */
+ x_end = xive_from_vc_blk(xs->end_blk);
+ assert(x_end);
+
+ xive_init_hw_end(&end);
+
+ /* Use the cache watch to update all ENDs reserved for HW VPs */
+ lock(&x_end->lock);
+ for (i = 0; i < xive_cfg_vp_prio(x_end); i++)
+ xive_endc_cache_update(x_end, xs->end_blk, xs->end_idx + i,
+ &end, true);
+ unlock(&x_end->lock);
+
+ /* Initialize/enable the VP */
+ xive_init_default_vp(&vp, xs->end_blk, xs->end_idx);
+
+ /* Use the cache watch to write it out */
+ lock(&x_vp->lock);
+ xive_special_cache_check(x_vp, xs->vp_blk, xs->vp_idx);
+ xive_nxc_cache_update(x_vp, xs->vp_blk, xs->vp_idx, &vp, true);
+ unlock(&x_vp->lock);
+}
+
+static void xive_configure_ex_special_bar(struct xive *x, struct cpu_thread *c)
+{
+ uint64_t xa, val;
+ int64_t rc;
+
+ xive_cpu_vdbg(c, "Setting up special BAR\n");
+ xa = XSCOM_ADDR_P10_NCU(pir_to_core_id(c->pir), P10_NCU_SPEC_BAR);
+ val = (uint64_t)x->tm_base | P10_NCU_SPEC_BAR_ENABLE;
+ if (x->tm_shift == 16)
+ val |= P10_NCU_SPEC_BAR_256K;
+ xive_cpu_vdbg(c, "NCU_SPEC_BAR_XA[%08llx]=%016llx\n", xa, val);
+ rc = xscom_write(c->chip_id, xa, val);
+ if (rc) {
+ xive_cpu_err(c, "Failed to setup NCU_SPEC_BAR\n");
+ /* XXX what to do now ? */
+ }
+}
+
+void xive2_late_init(void)
+{
+ struct cpu_thread *c;
+
+ prlog(PR_INFO, "SLW: Configuring self-restore for NCU_SPEC_BAR\n");
+ for_each_present_cpu(c) {
+ if (cpu_is_thread0(c)) {
+ struct proc_chip *chip = get_chip(c->chip_id);
+ struct xive *x = chip->xive;
+ uint64_t xa, val, rc;
+ xa = XSCOM_ADDR_P10_NCU(pir_to_core_id(c->pir), P10_NCU_SPEC_BAR);
+ val = (uint64_t)x->tm_base | P10_NCU_SPEC_BAR_ENABLE;
+ /* Bail out if wakeup engine has already failed */
+ if (wakeup_engine_state != WAKEUP_ENGINE_PRESENT) {
+ prlog(PR_ERR, "XIVE proc_stop_api fail detected\n");
+ break;
+ }
+ rc = proc_stop_save_scom((void *)chip->homer_base, xa, val,
+ PROC_STOP_SCOM_REPLACE, PROC_STOP_SECTION_L3);
+ if (rc) {
+ xive_cpu_err(c, "proc_stop_save_scom failed for NCU_SPEC_BAR rc=%lld\n",
+ rc);
+ wakeup_engine_state = WAKEUP_ENGINE_FAILED;
+ }
+ }
+ }
+}
+
+static void xive_provision_cpu(struct xive_cpu_state *xs, struct cpu_thread *c)
+{
+ struct xive *x;
+
+ /* VP ids for HW threads are pre-allocated */
+ xs->vp_blk = PIR2VP_BLK(c->pir);
+ xs->vp_idx = PIR2VP_IDX(c->pir);
+
+ /* For now we use identical block IDs for VC and PC but that might
+ * change. We allocate the ENDs on the same XIVE as the VP.
+ */
+ xs->end_blk = xs->vp_blk;
+
+ /* Grab the XIVE where the END resides. It could be different from
+ * the local chip XIVE if not using block group mode
+ */
+ x = xive_from_vc_blk(xs->end_blk);
+ assert(x);
+
+ /* Allocate a set of ENDs for that VP */
+ xs->end_idx = xive_alloc_end_set(x, true);
+ assert(!XIVE_ALLOC_IS_ERR(xs->end_idx));
+}
+
+static void xive_init_cpu(struct cpu_thread *c)
+{
+ struct proc_chip *chip = get_chip(c->chip_id);
+ struct xive *x = chip->xive;
+ struct xive_cpu_state *xs;
+
+ if (!x)
+ return;
+
+ /*
+ * Each core pair (EX) needs this special BAR setup to have the
+ * right powerbus cycle for the TM area (as it has the same address
+ * on all chips so it's somewhat special).
+ *
+ * Because we don't want to bother trying to figure out which core
+ * of a pair is present we just do the setup for each of them, which
+ * is harmless.
+ */
+ if (cpu_is_thread0(c) || cpu_is_core_chiplet_primary(c))
+ xive_configure_ex_special_bar(x, c);
+
+ /* Initialize the state structure */
+ c->xstate = xs = local_alloc(c->chip_id, sizeof(struct xive_cpu_state), 1);
+ assert(xs);
+ memset(xs, 0, sizeof(struct xive_cpu_state));
+ xs->xive = x;
+
+ init_lock(&xs->lock);
+
+ /* Shortcut to TM HV ring */
+ xs->tm_ring1 = x->tm_base + (1u << x->tm_shift);
+
+ /* Provision a VP id and some ENDs for a HW thread */
+ xive_provision_cpu(xs, c);
+
+ xive_init_cpu_exploitation(xs);
+}
+
+static uint64_t xive_convert_irq_flags(uint64_t iflags)
+{
+ uint64_t oflags = 0;
+
+ if (iflags & XIVE_SRC_STORE_EOI)
+ oflags |= OPAL_XIVE_IRQ_STORE_EOI2;
+
+ /* OPAL_XIVE_IRQ_TRIGGER_PAGE is only meant to be set if
+ * the interrupt has a *separate* trigger page.
+ */
+ if ((iflags & XIVE_SRC_EOI_PAGE1) &&
+ (iflags & XIVE_SRC_TRIGGER_PAGE))
+ oflags |= OPAL_XIVE_IRQ_TRIGGER_PAGE;
+
+ if (iflags & XIVE_SRC_LSI)
+ oflags |= OPAL_XIVE_IRQ_LSI;
+
+ return oflags;
+}
+
+static int64_t opal_xive_get_irq_info(uint32_t girq,
+ beint64_t *out_flags,
+ beint64_t *out_eoi_page,
+ beint64_t *out_trig_page,
+ beint32_t *out_esb_shift,
+ beint32_t *out_src_chip)
+{
+ struct irq_source *is = irq_find_source(girq);
+ struct xive_src *s = container_of(is, struct xive_src, is);
+ uint32_t idx;
+ uint64_t mm_base;
+ uint64_t eoi_page = 0, trig_page = 0;
+
+ if (xive_mode != XIVE_MODE_EXPL)
+ return OPAL_WRONG_STATE;
+ if (is == NULL || out_flags == NULL)
+ return OPAL_PARAMETER;
+ assert(is->ops == &xive_irq_source_ops);
+
+ if (out_flags)
+ *out_flags = cpu_to_be64(xive_convert_irq_flags(s->flags));
+
+ idx = girq - s->esb_base;
+
+ if (out_esb_shift)
+ *out_esb_shift = cpu_to_be32(s->esb_shift);
+
+ mm_base = (uint64_t)s->esb_mmio + (1ull << s->esb_shift) * idx;
+
+ /* The EOI page can either be the first or second page */
+ if (s->flags & XIVE_SRC_EOI_PAGE1) {
+ uint64_t p1off = 1ull << (s->esb_shift - 1);
+ eoi_page = mm_base + p1off;
+ } else
+ eoi_page = mm_base;
+
+ /* The trigger page, if it exists, is always the first page */
+ if (s->flags & XIVE_SRC_TRIGGER_PAGE)
+ trig_page = mm_base;
+
+ if (out_eoi_page)
+ *out_eoi_page = cpu_to_be64(eoi_page);
+ if (out_trig_page)
+ *out_trig_page = cpu_to_be64(trig_page);
+ if (out_src_chip)
+ *out_src_chip = cpu_to_be32(GIRQ_TO_CHIP(girq));
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t opal_xive_get_irq_config(uint32_t girq,
+ beint64_t *out_vp,
+ uint8_t *out_prio,
+ beint32_t *out_lirq)
+{
+ uint32_t vp;
+ uint32_t lirq;
+ uint8_t prio;
+
+ if (xive_mode != XIVE_MODE_EXPL)
+ return OPAL_WRONG_STATE;
+
+ if (xive_get_irq_targetting(girq, &vp, &prio, &lirq)) {
+ *out_vp = cpu_to_be64(vp);
+ *out_prio = prio;
+ *out_lirq = cpu_to_be32(lirq);
+ return OPAL_SUCCESS;
+ } else
+ return OPAL_PARAMETER;
+}
+
+static int64_t opal_xive_set_irq_config(uint32_t girq,
+ uint64_t vp,
+ uint8_t prio,
+ uint32_t lirq)
+{
+ /*
+ * This variant is meant for a XIVE-aware OS, thus it will
+ * *not* affect the ESB state of the interrupt. If used with
+ * a prio of FF, the EAS will be masked. In that case the
+ * races have to be handled by the OS.
+ */
+ if (xive_mode != XIVE_MODE_EXPL)
+ return OPAL_WRONG_STATE;
+
+ return xive_set_irq_config(girq, vp, prio, lirq, false);
+}
+
+static int64_t opal_xive_get_queue_info(uint64_t vp, uint32_t prio,
+ beint64_t *out_qpage,
+ beint64_t *out_qsize,
+ beint64_t *out_qeoi_page,
+ beint32_t *out_escalate_irq,
+ beint64_t *out_qflags)
+{
+ uint32_t blk, idx;
+ struct xive *x;
+ struct xive_end *end;
+
+ if (xive_mode != XIVE_MODE_EXPL)
+ return OPAL_WRONG_STATE;
+
+ if (!xive_end_for_target(vp, prio, &blk, &idx))
+ return OPAL_PARAMETER;
+
+ x = xive_from_vc_blk(blk);
+ if (!x)
+ return OPAL_PARAMETER;
+
+ end = xive_get_end(x, idx);
+ if (!end)
+ return OPAL_PARAMETER;
+
+ if (out_escalate_irq) {
+ uint32_t esc_idx = idx;
+
+ /* If escalations are routed to a single queue, fix up
+ * the escalation interrupt number here.
+ */
+ if (xive_get_field32(END_W0_UNCOND_ESCALATE, end->w0))
+ esc_idx |= xive_escalation_prio(x);
+ *out_escalate_irq =
+ cpu_to_be32(MAKE_ESCALATION_GIRQ(blk, esc_idx));
+ }
+
+ /* If this is a single-escalation gather queue, that's all
+ * there is to return
+ */
+ if (xive_get_field32(END_W0_SILENT_ESCALATE, end->w0)) {
+ if (out_qflags)
+ *out_qflags = 0;
+ if (out_qpage)
+ *out_qpage = 0;
+ if (out_qsize)
+ *out_qsize = 0;
+ if (out_qeoi_page)
+ *out_qeoi_page = 0;
+ return OPAL_SUCCESS;
+ }
+
+ if (out_qpage) {
+ if (xive_get_field32(END_W0_ENQUEUE, end->w0))
+ *out_qpage = cpu_to_be64(
+ ((uint64_t)xive_get_field32(END_W2_EQ_ADDR_HI, end->w2) << 32) |
+ xive_get_field32(END_W3_EQ_ADDR_LO, end->w3));
+ else
+ *out_qpage = 0;
+ }
+ if (out_qsize) {
+ if (xive_get_field32(END_W0_ENQUEUE, end->w0))
+ *out_qsize = cpu_to_be64(xive_get_field32(END_W3_QSIZE, end->w3) + 12);
+ else
+ *out_qsize = 0;
+ }
+ if (out_qeoi_page) {
+ *out_qeoi_page = cpu_to_be64(
+ (uint64_t)x->end_base + idx * XIVE_ESB_PAGE_SIZE);
+ }
+ if (out_qflags) {
+ *out_qflags = 0;
+ if (xive_get_field32(END_W0_VALID, end->w0))
+ *out_qflags |= cpu_to_be64(OPAL_XIVE_EQ_ENABLED);
+ if (xive_get_field32(END_W0_UCOND_NOTIFY, end->w0))
+ *out_qflags |= cpu_to_be64(OPAL_XIVE_EQ_ALWAYS_NOTIFY);
+ if (xive_get_field32(END_W0_ESCALATE_CTL, end->w0))
+ *out_qflags |= cpu_to_be64(OPAL_XIVE_EQ_ESCALATE);
+ }
+
+ return OPAL_SUCCESS;
+}
+
+static void xive_cleanup_end(struct xive_end *end)
+{
+ end->w0 = xive_set_field32(END_W0_FIRMWARE1, 0, xive_end_is_firmware1(end));
+ end->w1 = xive_set_field32(END_W1_ESe_Q, 0, 1) |
+ xive_set_field32(END_W1_ESn_Q, 0, 1);
+ end->w2 = end->w3 = end->w4 = end->w5 = end->w6 = end->w7 = 0;
+}
+
+static int64_t opal_xive_set_queue_info(uint64_t vp, uint32_t prio,
+ uint64_t qpage,
+ uint64_t qsize,
+ uint64_t qflags)
+{
+ uint32_t blk, idx;
+ struct xive *x;
+ struct xive_end *old_end;
+ struct xive_end end;
+ uint32_t vp_blk, vp_idx;
+ bool group;
+ int64_t rc;
+
+ if (!xive_end_for_target(vp, prio, &blk, &idx))
+ return OPAL_PARAMETER;
+
+ x = xive_from_vc_blk(blk);
+ if (!x)
+ return OPAL_PARAMETER;
+
+ old_end = xive_get_end(x, idx);
+ if (!old_end)
+ return OPAL_PARAMETER;
+
+ /* If this is a silent escalation queue, it cannot be
+ * configured directly
+ */
+ if (xive_get_field32(END_W0_SILENT_ESCALATE, old_end->w0))
+ return OPAL_PARAMETER;
+
+ /* This shouldn't fail or xive_end_for_target would have
+ * failed already
+ */
+ if (!xive_decode_vp(vp, &vp_blk, &vp_idx, NULL, &group))
+ return OPAL_PARAMETER;
+
+ /*
+ * Make a local copy which we will later try to commit using
+ * the cache watch facility
+ */
+ end = *old_end;
+
+ if (qflags & OPAL_XIVE_EQ_ENABLED) {
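+		/*
+		 * qsize is log2 of the queue size in bytes (e.g. 16 for a
+		 * 64kB queue); the END encodes it biased by 12, i.e. in
+		 * units of 4kB pages, matching the "+ 12" decoding in
+		 * opal_xive_get_queue_info().
+		 */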
+ switch(qsize) {
+ /* Supported sizes */
+ case 12:
+ case 16:
+ case 21:
+ case 24:
+ end.w3 = cpu_to_be32(qpage & END_W3_EQ_ADDR_LO);
+ end.w2 = cpu_to_be32((qpage >> 32) & END_W2_EQ_ADDR_HI);
+ end.w3 = xive_set_field32(END_W3_QSIZE, end.w3, qsize - 12);
+ end.w0 = xive_set_field32(END_W0_ENQUEUE, end.w0, 1);
+ break;
+ case 0:
+ end.w2 = end.w3 = 0;
+ end.w0 = xive_set_field32(END_W0_ENQUEUE, end.w0, 0);
+ break;
+ default:
+ return OPAL_PARAMETER;
+ }
+
+		/* Ensure the priority and target are correctly set (they will
+		 * not be right after allocation)
+ */
+ end.w6 = xive_set_field32(END_W6_VP_BLOCK, 0, vp_blk) |
+ xive_set_field32(END_W6_VP_OFFSET, 0, vp_idx);
+ end.w7 = xive_set_field32(END_W7_F0_PRIORITY, 0, prio);
+ /* XXX Handle group i bit when needed */
+
+ /* Always notify flag */
+ if (qflags & OPAL_XIVE_EQ_ALWAYS_NOTIFY)
+ end.w0 = xive_set_field32(END_W0_UCOND_NOTIFY, end.w0, 1);
+ else
+ end.w0 = xive_set_field32(END_W0_UCOND_NOTIFY, end.w0, 0);
+
+ /* Escalation flag */
+ if (qflags & OPAL_XIVE_EQ_ESCALATE)
+ end.w0 = xive_set_field32(END_W0_ESCALATE_CTL, end.w0, 1);
+ else
+ end.w0 = xive_set_field32(END_W0_ESCALATE_CTL, end.w0, 0);
+
+ /* Unconditionally clear the current queue pointer, set
+ * generation to 1 and disable escalation interrupts.
+ */
+ end.w1 = xive_set_field32(END_W1_GENERATION, 0, 1) |
+ xive_set_field32(END_W1_ES, 0, xive_get_field32(END_W1_ES, old_end->w1));
+
+ /* Enable. We always enable backlog for an enabled queue
+ * otherwise escalations won't work.
+ */
+ end.w0 = xive_set_field32(END_W0_VALID, end.w0, 1);
+ end.w0 = xive_set_field32(END_W0_BACKLOG, end.w0, 1);
+ } else
+ xive_cleanup_end(&end);
+
+ /* Update END, non-synchronous */
+ lock(&x->lock);
+ rc = xive_endc_cache_update(x, blk, idx, &end, false);
+ unlock(&x->lock);
+
+ return rc;
+}
+
+static int64_t opal_xive_get_queue_state(uint64_t vp, uint32_t prio,
+ beint32_t *out_qtoggle,
+ beint32_t *out_qindex)
+{
+ uint32_t blk, idx;
+ struct xive *x;
+ struct xive_end *end;
+ int64_t rc;
+
+ if (xive_mode != XIVE_MODE_EXPL)
+ return OPAL_WRONG_STATE;
+
+ if (!out_qtoggle || !out_qindex ||
+ !xive_end_for_target(vp, prio, &blk, &idx))
+ return OPAL_PARAMETER;
+
+ x = xive_from_vc_blk(blk);
+ if (!x)
+ return OPAL_PARAMETER;
+
+ end = xive_get_end(x, idx);
+ if (!end)
+ return OPAL_PARAMETER;
+
+ /* Scrub the queue */
+ lock(&x->lock);
+ rc = xive_endc_scrub(x, blk, idx);
+ unlock(&x->lock);
+ if (rc)
+ return rc;
+
+	/* We don't support disabled queues */
+ if (!xive_get_field32(END_W0_VALID, end->w0))
+ return OPAL_WRONG_STATE;
+
+ *out_qtoggle = cpu_to_be32(xive_get_field32(END_W1_GENERATION, end->w1));
+ *out_qindex = cpu_to_be32(xive_get_field32(END_W1_PAGE_OFF, end->w1));
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t opal_xive_set_queue_state(uint64_t vp, uint32_t prio,
+ uint32_t qtoggle, uint32_t qindex)
+{
+ uint32_t blk, idx;
+ struct xive *x;
+ struct xive_end *end, new_end;
+ int64_t rc;
+
+ if (xive_mode != XIVE_MODE_EXPL)
+ return OPAL_WRONG_STATE;
+
+ if (!xive_end_for_target(vp, prio, &blk, &idx))
+ return OPAL_PARAMETER;
+
+ x = xive_from_vc_blk(blk);
+ if (!x)
+ return OPAL_PARAMETER;
+
+ end = xive_get_end(x, idx);
+ if (!end)
+ return OPAL_PARAMETER;
+
+	/* We don't support disabled queues */
+ if (!xive_get_field32(END_W0_VALID, end->w0))
+ return OPAL_WRONG_STATE;
+
+ new_end = *end;
+
+ new_end.w1 = xive_set_field32(END_W1_GENERATION, new_end.w1, qtoggle);
+ new_end.w1 = xive_set_field32(END_W1_PAGE_OFF, new_end.w1, qindex);
+
+ lock(&x->lock);
+ rc = xive_endc_cache_update(x, blk, idx, &new_end, false);
+ unlock(&x->lock);
+
+ return rc;
+}
+
+static int64_t opal_xive_donate_page(uint32_t chip_id, uint64_t addr)
+{
+ struct proc_chip *c = get_chip(chip_id);
+ struct list_node *n;
+
+ if (xive_mode != XIVE_MODE_EXPL)
+ return OPAL_WRONG_STATE;
+ if (!c)
+ return OPAL_PARAMETER;
+ if (!c->xive)
+ return OPAL_PARAMETER;
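+	/* Donated pages must be 64kB aligned (low 16 address bits clear) */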
+ if (addr & 0xffff)
+ return OPAL_PARAMETER;
+
+ n = (struct list_node *)addr;
+ lock(&c->xive->lock);
+ list_add(&c->xive->donated_pages, n);
+ unlock(&c->xive->lock);
+ return OPAL_SUCCESS;
+}
+
+static int64_t opal_xive_get_vp_info(uint64_t vp_id,
+ beint64_t *out_flags,
+ beint64_t *out_cam_value,
+ beint64_t *out_report_cl_pair,
+ beint32_t *out_chip_id)
+{
+ struct xive *x;
+ struct xive_nvp *vp;
+ uint32_t blk, idx;
+ bool group;
+
+ if (!xive_decode_vp(vp_id, &blk, &idx, NULL, &group))
+ return OPAL_PARAMETER;
+ /* We don't do groups yet */
+ if (group)
+ return OPAL_PARAMETER;
+ x = xive_from_pc_blk(blk);
+ if (!x)
+ return OPAL_PARAMETER;
+ vp = xive_get_vp(x, idx);
+ if (!vp)
+ return OPAL_PARAMETER;
+
+ if (out_flags) {
+ uint32_t end_blk, end_idx;
+ struct xive_end *end;
+ struct xive *end_x;
+ *out_flags = 0;
+
+ /*
+		 * We would like a way to stash a SW bit in the VP
+ * to know whether silent escalation is enabled or
+ * not, but unlike what happens with ENDs, the PC
+ * cache watch doesn't implement the reserved bit in
+ * the VPs... so we have to go look at END 7 instead.
+ */
+
+ /* Grab END for prio 7 to check for silent escalation */
+ if (!xive_end_for_target(vp_id, xive_escalation_prio(x),
+ &end_blk, &end_idx))
+ return OPAL_PARAMETER;
+
+ end_x = xive_from_vc_blk(end_blk);
+ if (!end_x)
+ return OPAL_PARAMETER;
+
+ end = xive_get_end(x, end_idx);
+ if (!end)
+ return OPAL_PARAMETER;
+ if (xive_get_field32(NVP_W0_VALID, vp->w0))
+ *out_flags |= cpu_to_be64(OPAL_XIVE_VP_ENABLED);
+ if (xive_cfg_save_restore(x))
+ *out_flags |= cpu_to_be64(OPAL_XIVE_VP_SAVE_RESTORE);
+ if (xive_get_field32(END_W0_SILENT_ESCALATE, end->w0))
+ *out_flags |= cpu_to_be64(OPAL_XIVE_VP_SINGLE_ESCALATION);
+ }
+
+ if (out_cam_value) {
+ uint64_t cam_value;
+
+ cam_value = (blk << x->vp_shift) | idx;
+
+ /*
+ * If save-restore is enabled, force the CAM line
+ * value with the H bit.
+ */
+ if (xive_cfg_save_restore(x))
+ cam_value |= TM10_QW1W2_HO;
+
+ *out_cam_value = cpu_to_be64(cam_value);
+ }
+
+ if (out_report_cl_pair) {
+ uint64_t report_cl_pair;
+
+ report_cl_pair = ((uint64_t)(be32_to_cpu(vp->w6) & 0x0fffffff)) << 32;
+ report_cl_pair |= be32_to_cpu(vp->w7) & 0xffffff00;
+
+ *out_report_cl_pair = cpu_to_be64(report_cl_pair);
+ }
+
+ if (out_chip_id)
+ *out_chip_id = cpu_to_be32(xive_block_to_chip[blk]);
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t xive_setup_silent_gather(uint64_t vp_id, bool enable)
+{
+ uint32_t blk, idx, i;
+ struct xive_end *end_orig;
+ struct xive_end end;
+ struct xive *x;
+ int64_t rc;
+
+ /* Get base END block */
+ if (!xive_end_for_target(vp_id, 0, &blk, &idx)) {
+ prlog(PR_ERR, "%s: Invalid VP 0x%08llx\n", __func__, vp_id);
+ return OPAL_PARAMETER;
+ }
+ x = xive_from_vc_blk(blk);
+ if (!x) {
+ prlog(PR_ERR, "%s: VP 0x%08llx has invalid block %d\n", __func__,
+ vp_id, blk);
+ return OPAL_PARAMETER;
+ }
+
+ /* Grab prio 7 */
+ end_orig = xive_get_end(x, idx + xive_escalation_prio(x));
+ if (!end_orig) {
+ xive_err(x, "Failed to get silent gather END 0x%x for VP 0x%08llx\n",
+ idx + xive_escalation_prio(x), vp_id);
+ return OPAL_PARAMETER;
+ }
+
+ /* If trying to enable silent gather, make sure prio 7 is not
+ * already enabled as a normal queue
+ */
+ if (enable && xive_get_field32(END_W0_VALID, end_orig->w0) &&
+ !xive_get_field32(END_W0_SILENT_ESCALATE, end_orig->w0)) {
+ xive_err(x, "silent gather END 0x%x already in use\n",
+ idx + xive_escalation_prio(x));
+ return OPAL_PARAMETER;
+ }
+
+ end = *end_orig;
+
+ if (enable) {
+ /* W0: Enabled and "s" set, no other bit */
+ end.w0 = xive_set_field32(END_W0_FIRMWARE1, end.w0, 0);
+ end.w0 = xive_set_field32(END_W0_VALID, end.w0, 1);
+ end.w0 = xive_set_field32(END_W0_SILENT_ESCALATE, end.w0, 1);
+ end.w0 = xive_set_field32(END_W0_ESCALATE_CTL, end.w0, 1);
+ end.w0 = xive_set_field32(END_W0_BACKLOG, end.w0, 1);
+
+ /* Set new "N" for END escalation (vs. ESB) */
+ end.w0 = xive_set_field32(END_W0_ESCALATE_END, end.w0, 1);
+
+ /* W1: Mark ESn as 01, ESe as 00 */
+ end.w1 = xive_set_field32(END_W1_ESn_P, end.w1, 0);
+ end.w1 = xive_set_field32(END_W1_ESn_Q, end.w1, 1);
+ end.w1 = xive_set_field32(END_W1_ESe, end.w1, 0);
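+		/*
+		 * In ESB terms this presumably leaves the notification ESB
+		 * (ESn) in the "off" state (PQ=01) and the escalation ESB
+		 * (ESe) in the "reset" state (PQ=00).
+		 */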
+ } else if (xive_get_field32(END_W0_SILENT_ESCALATE, end.w0))
+ xive_cleanup_end(&end);
+
+ if (!memcmp(end_orig, &end, sizeof(end)))
+ rc = 0;
+ else
+ rc = xive_endc_cache_update(x, blk, idx + xive_escalation_prio(x),
+ &end, false);
+ if (rc)
+ return rc;
+
+ /* Mark/unmark all other prios with the new "u" bit and update
+ * escalation
+ */
+ for (i = 0; i < xive_cfg_vp_prio(x); i++) {
+ if (i == xive_escalation_prio(x))
+ continue;
+ end_orig = xive_get_end(x, idx + i);
+ if (!end_orig)
+ continue;
+ end = *end_orig;
+ if (enable) {
+ /* Set "u" bit */
+ end.w0 = xive_set_field32(END_W0_UNCOND_ESCALATE, end.w0, 1);
+
+ /* Set new "N" for END escalation (vs. ESB) */
+ /* TODO (Gen2+) : use ESB escalation configuration */
+ end.w0 = xive_set_field32(END_W0_ESCALATE_END, end.w0, 1);
+
+ /* Re-route escalation interrupt (previous
+ * route is lost !) to the gather queue
+ */
+ end.w4 = xive_set_field32(END_W4_END_BLOCK, end.w4, blk);
+ end.w4 = xive_set_field32(END_W4_ESC_END_INDEX,
+ end.w4, idx + xive_escalation_prio(x));
+ } else if (xive_get_field32(END_W0_UNCOND_ESCALATE, end.w0)) {
+ /* Clear the "u" bit, disable escalations if it was set */
+ end.w0 = xive_set_field32(END_W0_UNCOND_ESCALATE, end.w0, 0);
+ end.w0 = xive_set_field32(END_W0_ESCALATE_CTL, end.w0, 0);
+ }
+ if (!memcmp(end_orig, &end, sizeof(end)))
+ continue;
+ rc = xive_endc_cache_update(x, blk, idx + i, &end, false);
+ if (rc)
+ break;
+ }
+
+ return rc;
+}
+
+static int64_t opal_xive_set_vp_info(uint64_t vp_id,
+ uint64_t flags,
+ uint64_t report_cl_pair)
+{
+ struct xive *x;
+ struct xive_nvp *vp, vp_new;
+ uint32_t blk, idx;
+ bool group;
+ int64_t rc;
+
+ if (!xive_decode_vp(vp_id, &blk, &idx, NULL, &group))
+ return OPAL_PARAMETER;
+ /* We don't do groups yet */
+ if (group)
+ return OPAL_PARAMETER;
+ if (report_cl_pair & 0xff)
+ return OPAL_PARAMETER;
+ x = xive_from_pc_blk(blk);
+ if (!x)
+ return OPAL_PARAMETER;
+ vp = xive_get_vp(x, idx);
+ if (!vp)
+ return OPAL_PARAMETER;
+
+ /* Consistency check. */
+ if ((flags & OPAL_XIVE_VP_SAVE_RESTORE) && !xive_cfg_save_restore(x))
+ return OPAL_PARAMETER;
+
+ lock(&x->lock);
+
+ vp_new = *vp;
+ if (flags & OPAL_XIVE_VP_ENABLED) {
+ vp_new.w0 = xive_set_field32(NVP_W0_VALID, vp_new.w0, 1);
+ vp_new.w6 = cpu_to_be32(report_cl_pair >> 32);
+ vp_new.w7 = cpu_to_be32(report_cl_pair & 0xffffffff);
+
+ if (flags & OPAL_XIVE_VP_SINGLE_ESCALATION)
+ rc = xive_setup_silent_gather(vp_id, true);
+ else
+ rc = xive_setup_silent_gather(vp_id, false);
+
+ /*
+ * Prepare NVP to be HW owned for automatic save-restore
+ */
+ if (xive_cfg_save_restore(x)) {
+ /*
+ * Set NVP privilege level. Default to OS.
+ * This check only makes sense for KVM guests
+ * currently. We would need an extra flag to
+ * distinguish from pool level.
+ */
+ vp_new.w0 = xive_set_field32(NVP_W0_VPRIV, vp_new.w0, 0);
+
+ vp_new.w2 = xive_set_field32(NVP_W2_CPPR, vp_new.w2, 0xFF);
+ vp_new.w0 = xive_set_field32(NVP_W0_HW, vp_new.w0, 1);
+ }
+ } else {
+ /*
+ * TODO (kvm): disabling a VP invalidates the associated ENDs.
+ *
+ * The loads then return all 1s which can be an issue for the
+ * Linux code to handle.
+ */
+
+ vp_new.w0 = vp_new.w6 = vp_new.w7 = 0;
+ rc = xive_setup_silent_gather(vp_id, false);
+ }
+
+ if (rc) {
+ if (rc != OPAL_BUSY)
+ xive_dbg(x, "Silent gather setup failed with err %lld\n", rc);
+ goto bail;
+ }
+
+ rc = xive_nxc_cache_update(x, blk, idx, &vp_new, false);
+ if (rc)
+ goto bail;
+
+ /* When disabling, we scrub clean (invalidate the entry) so
+ * we can avoid cache ops in alloc/free
+ */
+ if (!(flags & OPAL_XIVE_VP_ENABLED))
+ xive_nxc_scrub_clean(x, blk, idx);
+
+bail:
+ unlock(&x->lock);
+ return rc;
+}
+
+static int64_t opal_xive_get_vp_state(uint64_t vp_id, beint64_t *out_state)
+{
+ struct xive *x;
+ struct xive_nvp *vp;
+ uint32_t blk, idx;
+ int64_t rc;
+ bool group;
+
+ if (!out_state || !xive_decode_vp(vp_id, &blk, &idx, NULL, &group))
+ return OPAL_PARAMETER;
+ if (group)
+ return OPAL_PARAMETER;
+ x = xive_from_pc_blk(blk);
+ if (!x)
+ return OPAL_PARAMETER;
+ vp = xive_get_vp(x, idx);
+ if (!vp)
+ return OPAL_PARAMETER;
+
+ /* Scrub the vp */
+ lock(&x->lock);
+ rc = xive_nxc_scrub(x, blk, idx);
+ unlock(&x->lock);
+ if (rc)
+ return rc;
+
+ if (!xive_get_field32(NVP_W0_VALID, vp->w0))
+ return OPAL_WRONG_STATE;
+
+	/*
+	 * Return a state matching the layout of words 0-1 of the TIMA,
+	 * as this is what the current implementation expects.
+	 */
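+	/*
+	 * Resulting bit layout: CPPR in bits [55:48], IPB in [47:40],
+	 * LSMFB in [39:32]; the NSR byte [63:56] and the low word are
+	 * returned as zero.
+	 */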
+ *out_state = cpu_to_be64(((uint64_t) 0x0) << 54 |
+ (uint64_t)xive_get_field32(NVP_W2_CPPR, vp->w2) << 48 |
+ (uint64_t)xive_get_field32(NVP_W2_IPB, vp->w2) << 40 |
+ (uint64_t)xive_get_field32(NVP_W2_LSMFB, vp->w2) << 32);
+
+ return OPAL_SUCCESS;
+}
+
+static void *xive_cpu_get_tima(struct cpu_thread *c)
+{
+ struct xive_cpu_state *xs = c->xstate;
+ struct xive *x = xs->xive;
+
+ return x->ic_tm_direct_base + ((c->pir & 0xff) << x->ic_shift);
+}
+
+static void xive_cleanup_cpu_tima(struct cpu_thread *c)
+{
+ struct xive_cpu_state *xs __unused = c->xstate;
+ void *cpu_tm_base = xive_cpu_get_tima(c);
+ uint8_t old_w2 __unused, w2 __unused;
+
+ /* Reset the HW context */
+ xive_reset_enable_thread(c);
+
+ /* Set VT to 1 */
+ old_w2 = in_8(cpu_tm_base + TM_QW3_HV_PHYS + TM_WORD2);
+ out_8(cpu_tm_base + TM_QW3_HV_PHYS + TM_WORD2, 0x80);
+ w2 = in_8(cpu_tm_base + TM_QW3_HV_PHYS + TM_WORD2);
+
+ /* Dump HV state */
+ xive_cpu_vdbg(c, "[reset] VP TIMA VP=%x/%x W01=%016llx W2=%02x->%02x\n",
+ xs->vp_blk, xs->vp_idx,
+ in_be64(cpu_tm_base + TM_QW3_HV_PHYS),
+ old_w2, w2);
+}
+
+static int64_t xive_vc_ind_cache_kill(struct xive *x, uint64_t type)
+{
+ uint64_t val;
+
+ /* We clear the whole thing */
+ xive_regw(x, VC_AT_MACRO_KILL_MASK, 0);
+ xive_regw(x, VC_AT_MACRO_KILL, VC_AT_MACRO_KILL_VALID |
+ SETFIELD(VC_AT_MACRO_KILL_VSD, 0ull, type));
+
+ /* XXX Add timeout */
+ for (;;) {
+ val = xive_regr(x, VC_AT_MACRO_KILL);
+ if (!(val & VC_AT_MACRO_KILL_VALID))
+ break;
+ }
+ return 0;
+}
+
+static int64_t xive_pc_ind_cache_kill(struct xive *x)
+{
+ uint64_t val;
+
+ /* We clear the whole thing */
+ xive_regw(x, PC_AT_KILL_MASK, 0);
+ xive_regw(x, PC_AT_KILL, PC_AT_KILL_VALID |
+ SETFIELD(VC_AT_MACRO_KILL_VSD, 0ull, VST_NVP));
+
+ /* XXX Add timeout */
+ for (;;) {
+ val = xive_regr(x, PC_AT_KILL);
+ if (!(val & PC_AT_KILL_VALID))
+ break;
+ }
+ return 0;
+}
+
+static void xive_cleanup_vp_ind(struct xive *x)
+{
+ int i;
+
+ xive_dbg(x, "Cleaning up %d VP ind entries...\n", x->vp_ind_count);
+ for (i = 0; i < x->vp_ind_count; i++) {
+ if (be64_to_cpu(x->vp_ind_base[i]) & VSD_FIRMWARE) {
+ xive_dbg(x, " %04x ... skip (firmware)\n", i);
+ continue;
+ }
+ if (x->vp_ind_base[i] != 0) {
+ x->vp_ind_base[i] = 0;
+ xive_dbg(x, " %04x ... cleaned\n", i);
+ }
+ }
+ xive_pc_ind_cache_kill(x);
+}
+
+static void xive_cleanup_end_ind(struct xive *x)
+{
+ int i;
+
+ xive_dbg(x, "Cleaning up %d END ind entries...\n", x->end_ind_count);
+ for (i = 0; i < x->end_ind_count; i++) {
+ if (be64_to_cpu(x->end_ind_base[i]) & VSD_FIRMWARE) {
+ xive_dbg(x, " %04x ... skip (firmware)\n", i);
+ continue;
+ }
+ if (x->end_ind_base[i] != 0) {
+ x->end_ind_base[i] = 0;
+ xive_dbg(x, " %04x ... cleaned\n", i);
+ }
+ }
+ xive_vc_ind_cache_kill(x, VST_END);
+}
+
+static void xive_reset_one(struct xive *x)
+{
+ struct cpu_thread *c;
+ bool end_firmware;
+ int i;
+
+ xive_notice(x, "Resetting one xive...\n");
+
+ lock(&x->lock);
+
+ /* Check all interrupts are disabled */
+ i = bitmap_find_one_bit(*x->int_enabled_map, 0, XIVE_INT_COUNT);
+ if (i >= 0)
+ xive_warn(x, "Interrupt %d (and maybe more) not disabled"
+ " at reset !\n", i);
+
+ /* Reset IPI allocation */
+ xive_dbg(x, "freeing alloc map %p/%p\n",
+ x->ipi_alloc_map, *x->ipi_alloc_map);
+ memset(x->ipi_alloc_map, 0, BITMAP_BYTES(XIVE_INT_COUNT));
+
+ xive_dbg(x, "Resetting ENDs...\n");
+
+ /* Reset all allocated ENDs and free the user ones */
+ bitmap_for_each_one(*x->end_map, xive_end_bitmap_size(x), i) {
+ struct xive_end end0;
+ struct xive_end *end;
+ int j;
+
+ if (i == 0)
+ continue;
+ end_firmware = false;
+ for (j = 0; j < xive_cfg_vp_prio(x); j++) {
+ uint32_t idx = (i << xive_cfg_vp_prio_shift(x)) | j;
+
+ end = xive_get_end(x, idx);
+ if (!end)
+ continue;
+
+ /* We need to preserve the firmware bit, otherwise
+ * we will incorrectly free the ENDs that are reserved
+ * for the physical CPUs
+ */
+ if (xive_get_field32(END_W0_VALID, end->w0)) {
+ if (!xive_end_is_firmware1(end))
+ xive_dbg(x, "END 0x%x:0x%x is valid at reset: %08x %08x\n",
+ x->block_id, idx, end->w0, end->w1);
+ end0 = *end;
+ xive_cleanup_end(&end0);
+ xive_endc_cache_update(x, x->block_id, idx, &end0, true);
+ }
+ if (xive_end_is_firmware1(end))
+ end_firmware = true;
+ }
+ if (!end_firmware)
+ bitmap_clr_bit(*x->end_map, i);
+ }
+
+ /* Take out all VPs from HW and reset all CPPRs to 0 */
+ for_each_present_cpu(c) {
+ if (c->chip_id != x->chip_id)
+ continue;
+ if (!c->xstate)
+ continue;
+ xive_cleanup_cpu_tima(c);
+ }
+
+	/* Reset all user-allocated VPs. This is inefficient; we should
+	 * either keep a bitmap of allocated VPs or add an iterator to
+	 * the buddy, which is trickier but doable.
+ */
+ for (i = 0; i < XIVE_VP_COUNT(x); i++) {
+ struct xive_nvp *vp;
+ struct xive_nvp vp0 = {0};
+
+ /* Ignore the physical CPU VPs */
+		if (i >= xive_hw_vp_base &&
+ i < (xive_hw_vp_base + xive_hw_vp_count))
+ continue;
+
+ /* Is the VP valid ? */
+ vp = xive_get_vp(x, i);
+ if (!vp || !xive_get_field32(NVP_W0_VALID, vp->w0))
+ continue;
+
+ /* Clear it */
+ xive_dbg(x, "VP 0x%x:0x%x is valid at reset\n", x->block_id, i);
+ xive_nxc_cache_update(x, x->block_id, i, &vp0, true);
+ }
+
+ /* Forget about remaining donated pages */
+ list_head_init(&x->donated_pages);
+
+ /* And cleanup donated indirect VP and END pages */
+ xive_cleanup_vp_ind(x);
+ xive_cleanup_end_ind(x);
+
+ /* The rest must not be called with the lock held */
+ unlock(&x->lock);
+
+ /* Re-configure VPs */
+ for_each_present_cpu(c) {
+ struct xive_cpu_state *xs = c->xstate;
+
+ if (c->chip_id != x->chip_id || !xs)
+ continue;
+
+ xive_init_cpu_exploitation(xs);
+ }
+}
+
+static void xive_reset_mask_source_cb(struct irq_source *is,
+ void *data __unused)
+{
+ struct xive_src *s = container_of(is, struct xive_src, is);
+ struct xive *x;
+ uint32_t isn;
+
+ if (is->ops != &xive_irq_source_ops)
+ return;
+
+ /* Skip escalation sources */
+ if (GIRQ_IS_ESCALATION(is->start))
+ return;
+
+ x = s->xive;
+
+ /* Iterate all interrupts */
+ for (isn = is->start; isn < is->end; isn++) {
+ /* Has it ever been enabled ? */
+ if (!bitmap_tst_bit(*x->int_enabled_map, GIRQ_TO_IDX(isn)))
+ continue;
+ /* Mask it and clear the enabled map bit */
+ xive_vdbg(x, "[reset] disabling source 0x%x\n", isn);
+ __xive_set_irq_config(is, isn, 0, 0xff, isn, true, false);
+ bitmap_clr_bit(*x->int_enabled_map, GIRQ_TO_IDX(isn));
+ }
+}
+
+void xive2_cpu_reset(void)
+{
+ struct cpu_thread *c = this_cpu();
+ struct xive_cpu_state *xs = c->xstate;
+
+ out_8(xs->tm_ring1 + TM_QW3_HV_PHYS + TM_CPPR, 0);
+
+ in_be64(xs->tm_ring1 + TM_SPC_PULL_POOL_CTX);
+}
+
+static int64_t __xive_reset(uint64_t mode)
+{
+ struct proc_chip *chip;
+
+ xive_mode = mode;
+
+ /* Mask all interrupt sources */
+ irq_for_each_source(xive_reset_mask_source_cb, NULL);
+
+ /* For each XIVE do a sync... */
+ for_each_chip(chip) {
+ if (!chip->xive)
+ continue;
+ xive_sync(chip->xive);
+ }
+
+ /* For each XIVE reset everything else... */
+ for_each_chip(chip) {
+ if (!chip->xive)
+ continue;
+ xive_reset_one(chip->xive);
+ }
+
+ /* Cleanup global VP allocator */
+ buddy_reset(xive_vp_buddy);
+
+ /*
+ * We reserve the whole range of VP ids for HW threads.
+ */
+ assert(buddy_reserve(xive_vp_buddy, xive_hw_vp_base, xive_threadid_shift));
+
+ return OPAL_SUCCESS;
+}
+
+/* Called by fast reboot */
+int64_t xive2_reset(void)
+{
+ if (xive_mode == XIVE_MODE_NONE)
+ return OPAL_SUCCESS;
+ return __xive_reset(XIVE_MODE_EXPL);
+}
+
+static int64_t opal_xive_reset(uint64_t mode)
+{
+ prlog(PR_DEBUG, "XIVE reset. mode = %llx\n", mode);
+
+ if (!(mode & XIVE_MODE_EXPL)) {
+ prlog(PR_NOTICE, "No emulation mode. XIVE exploitation mode "
+ "is the default\n");
+ }
+
+ xive_expl_options = mode & ~XIVE_MODE_EXPL;
+ if (xive_expl_options & ~XIVE_EXPL_ALL_OPTIONS) {
+ prerror("invalid XIVE exploitation mode option %016llx\n",
+ xive_expl_options);
+ return OPAL_PARAMETER;
+ }
+
+ return __xive_reset(XIVE_MODE_EXPL);
+}
+
+static int64_t opal_xive_free_vp_block(uint64_t vp_base)
+{
+ uint32_t blk, idx, i, j, count;
+ uint8_t order;
+ bool group;
+
+ if (xive_mode != XIVE_MODE_EXPL)
+ return OPAL_WRONG_STATE;
+
+ if (!xive_decode_vp(vp_base, &blk, &idx, &order, &group))
+ return OPAL_PARAMETER;
+ if (group)
+ return OPAL_PARAMETER;
+ if (blk)
+ return OPAL_PARAMETER;
+ if (order < (xive_chips_alloc_bits + 1))
+ return OPAL_PARAMETER;
+ if (idx & ((1 << (order - xive_chips_alloc_bits)) - 1))
+ return OPAL_PARAMETER;
+
+ count = 1 << order;
+ for (i = 0; i < count; i++) {
+ uint32_t vp_id = vp_base + i;
+ uint32_t blk, idx, end_blk, end_idx;
+ struct xive *x;
+ struct xive_nvp *vp;
+
+ if (!xive_decode_vp(vp_id, &blk, &idx, NULL, NULL)) {
+ prerror("Couldn't decode VP id %u\n", vp_id);
+ return OPAL_INTERNAL_ERROR;
+ }
+ x = xive_from_pc_blk(blk);
+ if (!x) {
+ prerror("Instance not found for deallocated VP"
+ " block %d\n", blk);
+ return OPAL_INTERNAL_ERROR;
+ }
+ vp = xive_get_vp(x, idx);
+ if (!vp) {
+ prerror("VP not found for deallocation !");
+ return OPAL_INTERNAL_ERROR;
+ }
+
+ /* VP must be disabled */
+ if (xive_get_field32(NVP_W0_VALID, vp->w0)) {
+ prlog(PR_ERR, "freeing active VP %d\n", vp_id);
+ return OPAL_XIVE_FREE_ACTIVE;
+ }
+
+ /* Not populated */
+ if (vp->w5 == 0)
+ continue;
+
+ end_blk = xive_get_field32(NVP_W5_VP_END_BLOCK, vp->w5);
+ end_idx = xive_get_field32(NVP_W5_VP_END_INDEX, vp->w5);
+
+ lock(&x->lock);
+
+ /* Ensure ENDs are disabled and cleaned up. Ideally the caller
+ * should have done it but we double check it here
+ */
+ for (j = 0; j < xive_cfg_vp_prio(x); j++) {
+ struct xive *end_x = xive_from_vc_blk(end_blk);
+ struct xive_end end, *orig_end = xive_get_end(end_x, end_idx + j);
+
+ if (!xive_get_field32(END_W0_VALID, orig_end->w0))
+ continue;
+
+ prlog(PR_WARNING, "freeing VP %d with queue %d active\n",
+ vp_id, j);
+ end = *orig_end;
+ xive_cleanup_end(&end);
+ xive_endc_cache_update(x, end_blk, end_idx + j, &end, true);
+ }
+
+ /* Mark it not populated so we don't try to free it again */
+ vp->w5 = 0;
+
+ if (end_blk != blk) {
+ prerror("Block mismatch trying to free ENDs\n");
+ unlock(&x->lock);
+ return OPAL_INTERNAL_ERROR;
+ }
+
+ xive_free_end_set(x, end_idx);
+ unlock(&x->lock);
+ }
+
+ xive_free_vps(vp_base);
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t opal_xive_alloc_vp_block(uint32_t alloc_order)
+{
+ uint32_t vp_base, ends, count, i;
+ int64_t rc;
+
+ if (xive_mode != XIVE_MODE_EXPL)
+ return OPAL_WRONG_STATE;
+
+ prlog(PR_TRACE, "opal_xive_alloc_vp_block(%d)\n", alloc_order);
+
+ vp_base = xive_alloc_vps(alloc_order);
+ if (XIVE_ALLOC_IS_ERR(vp_base)) {
+ if (vp_base == XIVE_ALLOC_NO_IND)
+ return OPAL_XIVE_PROVISIONING;
+ return OPAL_RESOURCE;
+ }
+
+ /* Allocate ENDs and initialize VPs */
+ count = 1 << alloc_order;
+ for (i = 0; i < count; i++) {
+ uint32_t vp_id = vp_base + i;
+ uint32_t blk, idx;
+ struct xive *x;
+ struct xive_nvp *vp;
+
+ if (!xive_decode_vp(vp_id, &blk, &idx, NULL, NULL)) {
+ prerror("Couldn't decode VP id %u\n", vp_id);
+ return OPAL_INTERNAL_ERROR;
+ }
+ x = xive_from_pc_blk(blk);
+ if (!x) {
+ prerror("Instance not found for allocated VP"
+ " block %d\n", blk);
+ rc = OPAL_INTERNAL_ERROR;
+ goto fail;
+ }
+ vp = xive_get_vp(x, idx);
+ if (!vp) {
+ prerror("VP not found after allocation !");
+ rc = OPAL_INTERNAL_ERROR;
+ goto fail;
+ }
+
+ /* Allocate ENDs, if fails, free the VPs and return */
+ lock(&x->lock);
+ ends = xive_alloc_end_set(x, false);
+ unlock(&x->lock);
+ if (XIVE_ALLOC_IS_ERR(ends)) {
+ if (ends == XIVE_ALLOC_NO_IND)
+ rc = OPAL_XIVE_PROVISIONING;
+ else
+ rc = OPAL_RESOURCE;
+ goto fail;
+ }
+
+ /* Initialize the VP structure. We don't use a cache watch
+		 * as we made sure to scrub the entries out of the cache
+		 * when freeing them.
+ */
+ memset(vp, 0, sizeof(*vp));
+
+ /* Store the END base of the VP in W5 (new in p10) */
+ xive_vp_set_end_base(vp, blk, ends);
+ }
+ return vp_base;
+ fail:
+ opal_xive_free_vp_block(vp_base);
+
+ return rc;
+}
+
+static int64_t xive_try_allocate_irq(struct xive *x)
+{
+ int idx, base_idx, max_count, girq;
+ struct xive_eas *eas;
+
+ lock(&x->lock);
+
+ base_idx = x->int_ipi_top - x->int_base;
+ max_count = x->int_hw_bot - x->int_ipi_top;
+
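+	/*
+	 * Dynamically allocated IPIs come from the window between
+	 * int_ipi_top (first interrupt available for allocation) and
+	 * int_hw_bot (bottom of the HW source range).
+	 */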
+ idx = bitmap_find_zero_bit(*x->ipi_alloc_map, base_idx, max_count);
+ if (idx < 0) {
+ unlock(&x->lock);
+ return OPAL_RESOURCE;
+ }
+ bitmap_set_bit(*x->ipi_alloc_map, idx);
+ girq = x->int_base + idx;
+
+ /* Mark the EAS valid. Don't bother with the HW cache, it's
+ * still masked anyway, the cache will be updated when unmasked
+ * and configured.
+ */
+ eas = xive_get_eas(x, girq);
+ if (!eas) {
+ bitmap_clr_bit(*x->ipi_alloc_map, idx);
+ unlock(&x->lock);
+ return OPAL_PARAMETER;
+ }
+ eas->w = xive_set_field64(EAS_VALID, 0, 1) |
+ xive_set_field64(EAS_MASKED, 0, 1) |
+ xive_set_field64(EAS_END_DATA, 0, girq);
+ unlock(&x->lock);
+
+ return girq;
+}
+
+static int64_t opal_xive_allocate_irq(uint32_t chip_id)
+{
+ struct proc_chip *chip;
+ bool try_all = false;
+ int64_t rc;
+
+ if (xive_mode != XIVE_MODE_EXPL)
+ return OPAL_WRONG_STATE;
+
+ if (chip_id == OPAL_XIVE_ANY_CHIP) {
+ try_all = true;
+ chip_id = this_cpu()->chip_id;
+ }
+ chip = get_chip(chip_id);
+ if (!chip)
+ return OPAL_PARAMETER;
+
+ /* Try initial target chip */
+ if (!chip->xive)
+ rc = OPAL_PARAMETER;
+ else
+ rc = xive_try_allocate_irq(chip->xive);
+ if (rc >= 0 || !try_all)
+ return rc;
+
+ /* Failed and we try all... do so */
+ for_each_chip(chip) {
+ if (!chip->xive)
+ continue;
+ rc = xive_try_allocate_irq(chip->xive);
+ if (rc >= 0)
+ break;
+ }
+ return rc;
+}
+
+static int64_t opal_xive_free_irq(uint32_t girq)
+{
+ struct irq_source *is = irq_find_source(girq);
+ struct xive_src *s = container_of(is, struct xive_src, is);
+ struct xive *x = xive_from_isn(girq);
+ struct xive_eas *eas;
+ uint32_t idx;
+
+ if (xive_mode != XIVE_MODE_EXPL)
+ return OPAL_WRONG_STATE;
+ if (!x || !is)
+ return OPAL_PARAMETER;
+
+ idx = GIRQ_TO_IDX(girq);
+
+ lock(&x->lock);
+
+ eas = xive_get_eas(x, girq);
+ if (!eas) {
+ unlock(&x->lock);
+ return OPAL_PARAMETER;
+ }
+
+ /* Mask the interrupt source */
+ xive_update_irq_mask(s, girq - s->esb_base, true);
+
+ /* Mark the EAS masked and invalid */
+ eas->w = xive_set_field64(EAS_VALID, 0, 1) |
+ xive_set_field64(EAS_MASKED, 0, 1);
+ xive_easc_scrub(x, x->block_id, idx);
+
+ /* Free it */
+ if (!bitmap_tst_bit(*x->ipi_alloc_map, idx)) {
+ unlock(&x->lock);
+ return OPAL_PARAMETER;
+ }
+ bitmap_clr_bit(*x->ipi_alloc_map, idx);
+ bitmap_clr_bit(*x->int_enabled_map, idx);
+ unlock(&x->lock);
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t opal_xive_dump_tm(uint32_t offset, const char *n, uint32_t pir)
+{
+ struct cpu_thread *c = find_cpu_by_pir(pir);
+ struct xive_cpu_state *xs;
+ struct xive *x;
+ void *cpu_tm_base;
+ uint64_t v0,v1;
+
+ if (!c)
+ return OPAL_PARAMETER;
+ xs = c->xstate;
+ if (!xs || !xs->tm_ring1)
+ return OPAL_INTERNAL_ERROR;
+ x = xs->xive;
+ cpu_tm_base = xive_cpu_get_tima(c);
+
+ lock(&x->lock);
+ v0 = in_be64(cpu_tm_base + offset);
+ if (offset == TM_QW3_HV_PHYS) {
+ v1 = in_8(cpu_tm_base + offset + 8);
+ v1 <<= 56;
+ } else {
+ v1 = in_be32(cpu_tm_base + offset + 8);
+ v1 <<= 32;
+ }
+ prlog(PR_INFO, "CPU[%04x]: TM state for QW %s\n", pir, n);
+ prlog(PR_INFO, "CPU[%04x]: NSR CPPR IPB LSMFB ACK# INC AGE PIPR"
+ " W2 W3\n", pir);
+ prlog(PR_INFO, "CPU[%04x]: %02x %02x %02x %02x %02x "
+ "%02x %02x %02x %08x %08x\n", pir,
+		(uint8_t)(v0 >> 56) & 0xff, (uint8_t)(v0 >> 48) & 0xff,
+ (uint8_t)(v0 >> 40) & 0xff, (uint8_t)(v0 >> 32) & 0xff,
+ (uint8_t)(v0 >> 24) & 0xff, (uint8_t)(v0 >> 16) & 0xff,
+ (uint8_t)(v0 >> 8) & 0xff, (uint8_t)(v0 ) & 0xff,
+ (uint32_t)(v1 >> 32) & 0xffffffff,
+ (uint32_t)(v1 & 0xffffffff));
+ unlock(&x->lock);
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t opal_xive_dump_vp(uint32_t vp_id)
+{
+ uint32_t blk, idx;
+ uint8_t order;
+ bool group;
+ struct xive *x;
+ struct xive_nvp *vp;
+ uint32_t *vpw;
+
+ if (!xive_decode_vp(vp_id, &blk, &idx, &order, &group))
+ return OPAL_PARAMETER;
+
+ x = xive_from_vc_blk(blk);
+ if (!x)
+ return OPAL_PARAMETER;
+ vp = xive_get_vp(x, idx);
+ if (!vp)
+ return OPAL_PARAMETER;
+ lock(&x->lock);
+
+ xive_nxc_scrub_clean(x, blk, idx);
+
+ vpw = ((uint32_t *)vp) + (group ? 8 : 0);
+ prlog(PR_INFO, "VP[%08x]: 0..3: %08x %08x %08x %08x\n", vp_id,
+ vpw[0], vpw[1], vpw[2], vpw[3]);
+ prlog(PR_INFO, "VP[%08x]: 4..7: %08x %08x %08x %08x\n", vp_id,
+ vpw[4], vpw[5], vpw[6], vpw[7]);
+ unlock(&x->lock);
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t opal_xive_sync_irq_src(uint32_t girq)
+{
+ struct xive *x = xive_from_isn(girq);
+
+ if (!x)
+ return OPAL_PARAMETER;
+ return xive_sync(x);
+}
+
+static int64_t opal_xive_sync_irq_target(uint32_t girq)
+{
+ uint32_t target, vp_blk;
+ struct xive *x;
+
+ if (!xive_get_irq_targetting(girq, &target, NULL, NULL))
+ return OPAL_PARAMETER;
+ if (!xive_decode_vp(target, &vp_blk, NULL, NULL, NULL))
+ return OPAL_PARAMETER;
+ x = xive_from_pc_blk(vp_blk);
+ if (!x)
+ return OPAL_PARAMETER;
+ return xive_sync(x);
+}
+
+static int64_t opal_xive_sync(uint32_t type, uint32_t id)
+{
+	int64_t rc = OPAL_SUCCESS;
+
+ if (type & XIVE_SYNC_EAS)
+ rc = opal_xive_sync_irq_src(id);
+ if (rc)
+ return rc;
+ if (type & XIVE_SYNC_QUEUE)
+ rc = opal_xive_sync_irq_target(id);
+ if (rc)
+ return rc;
+
+ /* Add more ... */
+
+ return rc;
+}
+
+static int64_t opal_xive_dump(uint32_t type, uint32_t id)
+{
+ switch (type) {
+ case XIVE_DUMP_TM_HYP:
+ return opal_xive_dump_tm(TM_QW3_HV_PHYS, "PHYS", id);
+ case XIVE_DUMP_TM_POOL:
+ return opal_xive_dump_tm(TM_QW2_HV_POOL, "POOL", id);
+ case XIVE_DUMP_TM_OS:
+ return opal_xive_dump_tm(TM_QW1_OS, "OS ", id);
+ case XIVE_DUMP_TM_USER:
+ return opal_xive_dump_tm(TM_QW0_USER, "USER", id);
+ case XIVE_DUMP_VP:
+ return opal_xive_dump_vp(id);
+ default:
+ return OPAL_PARAMETER;
+ }
+}
+
+static void xive_init_globals(void)
+{
+ uint32_t i;
+
+ for (i = 0; i < XIVE_MAX_CHIPS; i++)
+ xive_block_to_chip[i] = XIVE_INVALID_CHIP;
+}
+
+/*
+ * The global availability of some capabilities used in other drivers
+ * (PHB, PSI) is deduced from the capabilities of the first XIVE chip
+ * of the system. It should be common to all chips.
+ */
+bool xive2_cap_phb_pq_disable(void)
+{
+ return xive_has_cap(one_xive, CQ_XIVE_CAP_PHB_PQ_DISABLE);
+}
+
+bool xive2_cap_phb_abt(void)
+{
+ if (!xive_has_cap(one_xive, CQ_XIVE_CAP_PHB_ABT))
+ return false;
+
+ /*
+ * We need 'PQ disable' to use ABT mode, else the OS will use
+ * two different sets of ESB pages (PHB and IC) to control the
+ * interrupt sources. Can not work.
+ */
+ if (!xive2_cap_phb_pq_disable()) {
+ prlog_once(PR_ERR, "ABT mode is set without PQ disable. "
+ "Ignoring bogus configuration\n");
+ return false;
+ }
+
+ return true;
+}
+
+bool xive2_cap_store_eoi(void)
+{
+ return xive_has_cap(one_xive, CQ_XIVE_CAP_STORE_EOI);
+}
+
+void xive2_init(void)
+{
+ struct dt_node *np;
+ struct proc_chip *chip;
+ struct cpu_thread *cpu;
+ bool first = true;
+
+ /* Look for xive nodes and do basic inits */
+ dt_for_each_compatible(dt_root, np, "ibm,power10-xive-x") {
+ struct xive *x;
+
+ /* Initialize some global stuff */
+ if (first)
+ xive_init_globals();
+
+ /* Create/initialize the xive instance */
+ x = init_one_xive(np);
+ if (first)
+ one_xive = x;
+ first = false;
+ }
+ if (first)
+ return;
+
+ /*
+ * P8 emulation is not supported on P10 anymore. Exploitation
+ * is the default XIVE mode. We might introduce a GEN2 mode.
+ */
+ xive_mode = XIVE_MODE_EXPL;
+
+ /* Init VP allocator */
+ xive_init_vp_allocator();
+
+ /* Create a device-tree node for Linux use */
+ xive_create_mmio_dt_node(one_xive);
+
+ /* Some inits must be done after all xive have been created
+ * such as setting up the forwarding ports
+ */
+ for_each_chip(chip) {
+ if (chip->xive)
+ late_init_one_xive(chip->xive);
+ }
+
+ /* Initialize per-cpu structures */
+ for_each_present_cpu(cpu) {
+ xive_init_cpu(cpu);
+ }
+
+ /* Calling boot CPU */
+ xive2_cpu_callin(this_cpu());
+
+ /* Register XIVE exploitation calls */
+ opal_register(OPAL_XIVE_RESET, opal_xive_reset, 1);
+ opal_register(OPAL_XIVE_GET_IRQ_INFO, opal_xive_get_irq_info, 6);
+ opal_register(OPAL_XIVE_GET_IRQ_CONFIG, opal_xive_get_irq_config, 4);
+ opal_register(OPAL_XIVE_SET_IRQ_CONFIG, opal_xive_set_irq_config, 4);
+ opal_register(OPAL_XIVE_GET_QUEUE_INFO, opal_xive_get_queue_info, 7);
+ opal_register(OPAL_XIVE_SET_QUEUE_INFO, opal_xive_set_queue_info, 5);
+ opal_register(OPAL_XIVE_DONATE_PAGE, opal_xive_donate_page, 2);
+ opal_register(OPAL_XIVE_ALLOCATE_IRQ, opal_xive_allocate_irq, 1);
+ opal_register(OPAL_XIVE_FREE_IRQ, opal_xive_free_irq, 1);
+ opal_register(OPAL_XIVE_ALLOCATE_VP_BLOCK, opal_xive_alloc_vp_block, 1);
+ opal_register(OPAL_XIVE_FREE_VP_BLOCK, opal_xive_free_vp_block, 1);
+ opal_register(OPAL_XIVE_GET_VP_INFO, opal_xive_get_vp_info, 5);
+ opal_register(OPAL_XIVE_SET_VP_INFO, opal_xive_set_vp_info, 3);
+ opal_register(OPAL_XIVE_SYNC, opal_xive_sync, 2);
+ opal_register(OPAL_XIVE_DUMP, opal_xive_dump, 2);
+ opal_register(OPAL_XIVE_GET_QUEUE_STATE, opal_xive_get_queue_state, 4);
+ opal_register(OPAL_XIVE_SET_QUEUE_STATE, opal_xive_set_queue_state, 4);
+ opal_register(OPAL_XIVE_GET_VP_STATE, opal_xive_get_vp_state, 2);
+}
diff --git a/roms/skiboot/hw/xscom.c b/roms/skiboot/hw/xscom.c
new file mode 100644
index 000000000..347457242
--- /dev/null
+++ b/roms/skiboot/hw/xscom.c
@@ -0,0 +1,1019 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * XSCOM driver
+ *
+ * Copyright 2013-2019 IBM Corp.
+ */
+
+#include <skiboot.h>
+#include <xscom.h>
+#include <io.h>
+#include <processor.h>
+#include <device.h>
+#include <chip.h>
+#include <centaur.h>
+#include <errorlog.h>
+#include <opal-api.h>
+#include <timebase.h>
+#include <nvram.h>
+
+/* Mask of bits to clear in HMER before an access */
+#define HMER_CLR_MASK (~(SPR_HMER_XSCOM_FAIL | \
+ SPR_HMER_XSCOM_DONE | \
+ SPR_HMER_XSCOM_STATUS))
+
+DEFINE_LOG_ENTRY(OPAL_RC_XSCOM_RW, OPAL_PLATFORM_ERR_EVT, OPAL_XSCOM,
+ OPAL_CEC_HARDWARE, OPAL_PREDICTIVE_ERR_GENERAL,
+ OPAL_NA);
+
+DEFINE_LOG_ENTRY(OPAL_RC_XSCOM_INDIRECT_RW, OPAL_PLATFORM_ERR_EVT, OPAL_XSCOM,
+ OPAL_CEC_HARDWARE, OPAL_PREDICTIVE_ERR_GENERAL,
+ OPAL_NA);
+
+DEFINE_LOG_ENTRY(OPAL_RC_XSCOM_RESET, OPAL_PLATFORM_ERR_EVT, OPAL_XSCOM,
+ OPAL_CEC_HARDWARE, OPAL_PREDICTIVE_ERR_GENERAL,
+ OPAL_NA);
+
+DEFINE_LOG_ENTRY(OPAL_RC_XSCOM_BUSY, OPAL_PLATFORM_ERR_EVT, OPAL_XSCOM,
+ OPAL_CEC_HARDWARE, OPAL_PREDICTIVE_ERR_GENERAL,
+ OPAL_NA);
+
+/* xscom details to trigger xstop */
+static struct {
+ uint64_t addr;
+ uint64_t fir_bit;
+} xstop_xscom;
+
+/*
+ * Locking notes:
+ *
+ * We used to have a per-target lock. However due to errata HW822317
+ * We used to have a per-target lock. However, due to erratum HW822317
+ * we can have issues on the issuer side if multiple threads try to
+ * send XSCOMs simultaneously (HMER responses get mixed up), so just
+ * use a global lock instead.
+static struct lock xscom_lock = LOCK_UNLOCKED;
+
+static inline void *xscom_addr(uint32_t gcid, uint32_t pcb_addr)
+{
+ struct proc_chip *chip = get_chip(gcid);
+ uint64_t addr;
+
+ assert(chip);
+ addr = chip->xscom_base;
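+	/*
+	 * P8 swizzles the PCB address bits (see the masks below); P9 and
+	 * later simply use an 8-byte stride, i.e. pcb_addr << 3.
+	 */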
+ if (proc_gen == proc_gen_p8) {
+ addr |= ((uint64_t)pcb_addr << 4) & ~0xfful;
+ addr |= (pcb_addr << 3) & 0x78;
+ } else
+ addr |= ((uint64_t)pcb_addr << 3);
+ return (void *)addr;
+}
+
+static uint64_t xscom_wait_done(void)
+{
+ uint64_t hmer;
+
+ do
+ hmer = mfspr(SPR_HMER);
+ while(!(hmer & SPR_HMER_XSCOM_DONE));
+
+ /*
+ * HW822317: We need to read a second time as the actual
+ * status can be delayed by 1 cycle after DONE
+ */
+ return mfspr(SPR_HMER);
+}
+
+static void xscom_reset(uint32_t gcid, bool need_delay)
+{
+ u64 hmer;
+ uint32_t recv_status_reg, log_reg, err_reg;
+ struct timespec ts;
+
+ /* Clear errors in HMER */
+ mtspr(SPR_HMER, HMER_CLR_MASK);
+
+ /* Setup local and target scom addresses */
+ if (proc_gen == proc_gen_p10) {
+ recv_status_reg = 0x00090018;
+ log_reg = 0x0090012;
+ err_reg = 0x0090013;
+ } else if (proc_gen == proc_gen_p9) {
+ recv_status_reg = 0x00090018;
+ log_reg = 0x0090012;
+ err_reg = 0x0090013;
+ } else {
+ recv_status_reg = 0x202000f;
+ log_reg = 0x2020007;
+ err_reg = 0x2020009;
+ }
+
+ /* First we need to write 0 to a register on our chip */
+ out_be64(xscom_addr(this_cpu()->chip_id, recv_status_reg), 0);
+ hmer = xscom_wait_done();
+ if (hmer & SPR_HMER_XSCOM_FAIL)
+ goto fail;
+
+ /* Then we need to clear those two other registers on the target */
+ out_be64(xscom_addr(gcid, log_reg), 0);
+ hmer = xscom_wait_done();
+ if (hmer & SPR_HMER_XSCOM_FAIL)
+ goto fail;
+ out_be64(xscom_addr(gcid, err_reg), 0);
+ hmer = xscom_wait_done();
+ if (hmer & SPR_HMER_XSCOM_FAIL)
+ goto fail;
+
+ if (need_delay) {
+ /*
+		 * It has been observed that an immediate retry of an
+		 * XSCOM operation sometimes returns wrong data, so add a
+		 * delay to let the XSCOM reset take effect. A delay of
+		 * 10 ms was found to work fine experimentally.
+		 * FIXME: Replace the 10ms delay with the exact delay needed,
+		 * or some other method of confirming XSCOM reset
+		 * completion, after checking with the HW folks.
+ */
+ ts.tv_sec = 0;
+ ts.tv_nsec = 10 * 1000;
+ nanosleep_nopoll(&ts, NULL);
+ }
+ return;
+ fail:
+ /* Fatal error resetting XSCOM */
+ log_simple_error(&e_info(OPAL_RC_XSCOM_RESET),
+ "XSCOM: Fatal error resetting engine after failed access !\n");
+
+ /* XXX Generate error log ? attn ? panic ?
+ * If we decide to panic, change the above severity to PANIC
+ */
+}
+
+static int xscom_clear_error(uint32_t gcid, uint32_t pcb_addr)
+{
+ u64 hmer;
+ uint32_t base_xscom_addr;
+ uint32_t xscom_clear_reg = 0x20010800;
+
+ /* only in case of p9 */
+ if (proc_gen != proc_gen_p9)
+ return 0;
+
+/* xscom clear address range/mask */
+#define XSCOM_CLEAR_RANGE_START 0x20010A00
+#define XSCOM_CLEAR_RANGE_END 0x20010ABF
+#define XSCOM_CLEAR_RANGE_MASK 0x200FFBFF
+
+	/*
+	 * Due to a hardware issue, a core whose response to a SCOM was
+	 * delayed by thread reconfiguration can leave the SCOM logic in a
+	 * state where subsequent SCOMs to that core get errors. This
+	 * affects the Core PC SCOM registers in the range
+	 * 20010A80-20010ABF.
+	 *
+	 * The workaround: if an XSCOM timeout occurs on one of the Core PC
+	 * SCOM registers in the range 20010A80-20010ABF, a clearing SCOM
+	 * write of '0x00000000' is done to 0x20010800. That write will
+	 * also time out, but it clears the SCOM logic errors. Once the
+	 * clearing write is done, the original SCOM operation can be
+	 * retried.
+	 *
+	 * The SCOM timeout is reported as status 0x4 (invalid address)
+	 * in HMER[21-23].
+	 */
+
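+	/*
+	 * For example, a (hypothetical) per-core copy such as 0x25010A85
+	 * masks down to 0x20010A85, which falls inside the
+	 * XSCOM_CLEAR_RANGE_START/END window and so triggers the clearing
+	 * write below.
+	 */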
+ base_xscom_addr = pcb_addr & XSCOM_CLEAR_RANGE_MASK;
+ if (!((base_xscom_addr >= XSCOM_CLEAR_RANGE_START) &&
+ (base_xscom_addr <= XSCOM_CLEAR_RANGE_END)))
+ return 0;
+
+ /*
+	 * Reset the XSCOM or the next SCOM operation will fail.
+	 * We also need a small delay before we go ahead with the clearing
+	 * write; we have observed that without a delay the clearing write
+	 * reports a wrong status.
+ */
+ xscom_reset(gcid, true);
+
+ /* Clear errors in HMER */
+ mtspr(SPR_HMER, HMER_CLR_MASK);
+
+ /* Write 0 to clear the xscom logic errors on target chip */
+ out_be64(xscom_addr(gcid, xscom_clear_reg), 0);
+ hmer = xscom_wait_done();
+
+ /*
+	 * The clearing xscom write above will time out and report an
+	 * invalid-address error since there is no register at that address.
+	 * This xscom operation only serves to clear the xscom logic error.
+ *
+ * On failure, reset the XSCOM or we'll hang on the next access
+ */
+ if (hmer & SPR_HMER_XSCOM_FAIL)
+ xscom_reset(gcid, true);
+
+ return 1;
+}
+
+static int64_t xscom_handle_error(uint64_t hmer, uint32_t gcid, uint32_t pcb_addr,
+ bool is_write, int64_t retries,
+ int64_t *xscom_clear_retries)
+{
+ unsigned int stat = GETFIELD(SPR_HMER_XSCOM_STATUS, hmer);
+ int64_t rc = OPAL_HARDWARE;
+
+ /* XXX Figure out error codes from doc and error
+ * recovery procedures
+ */
+ switch(stat) {
+ case 1:
+ /*
+ * XSCOM engine is blocked, need to retry. Reset XSCOM
+ * engine after crossing retry threshold before
+ * retrying again.
+ */
+ if (retries && !(retries % XSCOM_BUSY_RESET_THRESHOLD)) {
+ prlog(PR_NOTICE, "XSCOM: Busy even after %d retries, "
+ "resetting XSCOM now. Total retries = %lld\n",
+ XSCOM_BUSY_RESET_THRESHOLD, retries);
+ xscom_reset(gcid, true);
+
+ }
+
+ /* Log error if we have retried enough and its still busy */
+ if (retries == XSCOM_BUSY_MAX_RETRIES)
+ log_simple_error(&e_info(OPAL_RC_XSCOM_BUSY),
+ "XSCOM: %s-busy error gcid=0x%x pcb_addr=0x%x "
+ "stat=0x%x\n", is_write ? "write" : "read",
+ gcid, pcb_addr, stat);
+ return OPAL_XSCOM_BUSY;
+
+ case 2: /* CPU is asleep, reset XSCOM engine and return */
+ xscom_reset(gcid, false);
+ return OPAL_XSCOM_CHIPLET_OFF;
+ case 3: /* Partial good */
+ rc = OPAL_XSCOM_PARTIAL_GOOD;
+ break;
+ case 4: /* Invalid address / address error */
+ rc = OPAL_XSCOM_ADDR_ERROR;
+ if (xscom_clear_error(gcid, pcb_addr)) {
+ /* return busy if retries still pending. */
+ if ((*xscom_clear_retries)--)
+ return OPAL_XSCOM_BUSY;
+
+ prlog(PR_DEBUG, "XSCOM: error recovery failed for "
+ "gcid=0x%x pcb_addr=0x%x\n", gcid, pcb_addr);
+
+ }
+ break;
+ case 5: /* Clock error */
+ rc = OPAL_XSCOM_CLOCK_ERROR;
+ break;
+ case 6: /* Parity error */
+ rc = OPAL_XSCOM_PARITY_ERROR;
+ break;
+ case 7: /* Time out */
+ rc = OPAL_XSCOM_TIMEOUT;
+ break;
+ }
+
+ /*
+	 * If we're in an XSCOM OPAL call then squash the error;
+	 * we assume that the caller (probably opal-prd) will
+	 * handle logging it.
+ */
+ if (this_cpu()->current_token != OPAL_XSCOM_READ &&
+ this_cpu()->current_token != OPAL_XSCOM_WRITE) {
+ log_simple_error(&e_info(OPAL_RC_XSCOM_RW),
+ "XSCOM: %s error gcid=0x%x pcb_addr=0x%x stat=0x%x\n",
+ is_write ? "write" : "read", gcid, pcb_addr, stat);
+ }
+
+ /* We need to reset the XSCOM or we'll hang on the next access */
+ xscom_reset(gcid, false);
+
+ /* Non recovered ... just fail */
+ return rc;
+}
+
+static void xscom_handle_ind_error(uint64_t data, uint32_t gcid,
+ uint64_t pcb_addr, bool is_write)
+{
+ unsigned int stat = GETFIELD(XSCOM_DATA_IND_ERR, data);
+ bool timeout = !(data & XSCOM_DATA_IND_COMPLETE);
+
+ /* XXX: Create error log entry ? */
+ if (timeout)
+ log_simple_error(&e_info(OPAL_RC_XSCOM_INDIRECT_RW),
+ "XSCOM: indirect %s timeout, gcid=0x%x pcb_addr=0x%llx"
+ " stat=0x%x\n",
+ is_write ? "write" : "read", gcid, pcb_addr, stat);
+ else
+ log_simple_error(&e_info(OPAL_RC_XSCOM_INDIRECT_RW),
+ "XSCOM: indirect %s error, gcid=0x%x pcb_addr=0x%llx"
+ " stat=0x%x\n",
+ is_write ? "write" : "read", gcid, pcb_addr, stat);
+}
+
+static bool xscom_gcid_ok(uint32_t gcid)
+{
+ return get_chip(gcid) != NULL;
+}
+
+/* Determine if SCOM address is multicast */
+static inline bool xscom_is_multicast_addr(uint32_t addr)
+{
+ return (((addr >> 30) & 0x1) == 0x1);
+}
+
+/*
+ * Low level XSCOM access functions, perform a single direct xscom
+ * access via MMIO
+ */
+static int __xscom_read(uint32_t gcid, uint32_t pcb_addr, uint64_t *val)
+{
+ uint64_t hmer;
+ int64_t ret, retries;
+ int64_t xscom_clear_retries = XSCOM_CLEAR_MAX_RETRIES;
+
+ if (!xscom_gcid_ok(gcid)) {
+ prerror("%s: invalid XSCOM gcid 0x%x\n", __func__, gcid);
+ return OPAL_PARAMETER;
+ }
+
+ for (retries = 0; retries <= XSCOM_BUSY_MAX_RETRIES; retries++) {
+		/* Clear status bits in HMER (HMER is special:
+		 * writing to it *ands* bits)
+		 */
+ mtspr(SPR_HMER, HMER_CLR_MASK);
+
+ /* Read value from SCOM */
+ *val = in_be64(xscom_addr(gcid, pcb_addr));
+
+ /* Wait for done bit */
+ hmer = xscom_wait_done();
+
+ /* Check for error */
+ if (!(hmer & SPR_HMER_XSCOM_FAIL))
+ return OPAL_SUCCESS;
+
+ /* Handle error and possibly eventually retry */
+ ret = xscom_handle_error(hmer, gcid, pcb_addr, false, retries,
+ &xscom_clear_retries);
+ if (ret != OPAL_BUSY)
+ break;
+ }
+
+ /* Do not print error message for multicast SCOMS */
+ if (xscom_is_multicast_addr(pcb_addr) && ret == OPAL_XSCOM_CHIPLET_OFF)
+ return ret;
+
+ /*
+	 * Workaround on P9: PRD does operations it *knows* will fail with this
+	 * error to work around a hardware issue where accesses via the PIB
+	 * (FSI or OCC) work as expected, but accesses via the ADU (which
+	 * xscom goes through) do not. The chip logic will always return all
+	 * FFs if there is any error on the scom.
+ */
+ if (proc_gen == proc_gen_p9 && ret == OPAL_XSCOM_CHIPLET_OFF)
+ return ret;
+
+ /*
+ * If an OPAL call XSCOM read fails, then the OPAL-PRD will
+ * handle logging the error. Hence just print an
+ * informational message here.
+ */
+ if (this_cpu()->current_token == OPAL_XSCOM_READ)
+ prlog(PR_INFO, "XSCOM: Read failed, ret = %lld\n", ret);
+ else
+ prerror("XSCOM: Read failed, ret = %lld\n", ret);
+
+ return ret;
+}
+
+static int __xscom_write(uint32_t gcid, uint32_t pcb_addr, uint64_t val)
+{
+ uint64_t hmer;
+ int64_t ret, retries = 0;
+ int64_t xscom_clear_retries = XSCOM_CLEAR_MAX_RETRIES;
+
+ if (!xscom_gcid_ok(gcid)) {
+ prerror("%s: invalid XSCOM gcid 0x%x\n", __func__, gcid);
+ return OPAL_PARAMETER;
+ }
+
+ for (retries = 0; retries <= XSCOM_BUSY_MAX_RETRIES; retries++) {
+		/* Clear status bits in HMER (HMER is special:
+		 * writing to it *ands* bits)
+		 */
+ mtspr(SPR_HMER, HMER_CLR_MASK);
+
+ /* Write value to SCOM */
+ out_be64(xscom_addr(gcid, pcb_addr), val);
+
+ /* Wait for done bit */
+ hmer = xscom_wait_done();
+
+ /* Check for error */
+ if (!(hmer & SPR_HMER_XSCOM_FAIL))
+ return OPAL_SUCCESS;
+
+ /* Handle error and possibly eventually retry */
+ ret = xscom_handle_error(hmer, gcid, pcb_addr, true, retries,
+ &xscom_clear_retries);
+ if (ret != OPAL_BUSY)
+ break;
+ }
+
+ /* Do not print error message for multicast SCOMS */
+ if (xscom_is_multicast_addr(pcb_addr) && ret == OPAL_XSCOM_CHIPLET_OFF)
+ return ret;
+
+ /*
+	 * Workaround on P9: PRD does operations it *knows* will fail with this
+	 * error to work around a hardware issue where accesses via the PIB
+	 * (FSI or OCC) work as expected, but accesses via the ADU (which
+	 * xscom goes through) do not. The chip logic will always return all
+	 * FFs if there is any error on the scom.
+ */
+ if (proc_gen == proc_gen_p9 && ret == OPAL_XSCOM_CHIPLET_OFF)
+ return ret;
+ /*
+ * If an OPAL call XSCOM write fails, then the OPAL-PRD will
+ * handle logging the error. Hence just print an
+ * informational message here.
+ */
+ if (this_cpu()->current_token == OPAL_XSCOM_WRITE)
+ prlog(PR_INFO, "XSCOM: Write failed, ret = %lld\n", ret);
+ else
+ prerror("XSCOM: Write failed, ret = %lld\n", ret);
+
+ return ret;
+}
+
+/*
+ * Indirect XSCOM access functions
+ */
+static int xscom_indirect_read_form0(uint32_t gcid, uint64_t pcb_addr,
+ uint64_t *val)
+{
+ uint32_t addr;
+ uint64_t data;
+ int rc, retries;
+
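+	/*
+	 * Form 0 indirect access: write the indirect address (with the
+	 * read bit set) to the base SCOM register, then poll that same
+	 * register until the COMPLETE bit is set. The data comes back in
+	 * the low bits and any error in the IND_ERR field.
+	 */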
+ /* Write indirect address */
+ addr = pcb_addr & 0x7fffffff;
+ data = XSCOM_DATA_IND_READ |
+ (pcb_addr & XSCOM_ADDR_IND_ADDR);
+ rc = __xscom_write(gcid, addr, data);
+ if (rc)
+ goto bail;
+
+ /* Wait for completion */
+ for (retries = 0; retries < XSCOM_IND_MAX_RETRIES; retries++) {
+ rc = __xscom_read(gcid, addr, &data);
+ if (rc)
+ goto bail;
+ if ((data & XSCOM_DATA_IND_COMPLETE) &&
+ ((data & XSCOM_DATA_IND_ERR) == 0)) {
+ *val = data & XSCOM_DATA_IND_DATA;
+ break;
+ }
+ if ((data & XSCOM_DATA_IND_COMPLETE) ||
+ (retries >= XSCOM_IND_MAX_RETRIES)) {
+ xscom_handle_ind_error(data, gcid, pcb_addr,
+ false);
+ rc = OPAL_HARDWARE;
+ goto bail;
+ }
+ }
+ bail:
+ if (rc)
+ *val = (uint64_t)-1;
+ return rc;
+}
+
+static int xscom_indirect_form(uint64_t pcb_addr)
+{
+ return (pcb_addr >> 60) & 1;
+}
+
+static int xscom_indirect_read(uint32_t gcid, uint64_t pcb_addr, uint64_t *val)
+{
+ uint64_t form = xscom_indirect_form(pcb_addr);
+
+ if ((proc_gen >= proc_gen_p9) && (form == 1))
+ return OPAL_UNSUPPORTED;
+
+ return xscom_indirect_read_form0(gcid, pcb_addr, val);
+}
+
+static int xscom_indirect_write_form0(uint32_t gcid, uint64_t pcb_addr,
+ uint64_t val)
+{
+ uint32_t addr;
+ uint64_t data;
+ int rc, retries;
+
+ /* Only 16 bit data with indirect */
+ if (val & ~(XSCOM_ADDR_IND_DATA))
+ return OPAL_PARAMETER;
+
+ /* Write indirect address & data */
+ addr = pcb_addr & 0x7fffffff;
+ data = pcb_addr & XSCOM_ADDR_IND_ADDR;
+ data |= val & XSCOM_ADDR_IND_DATA;
+
+ rc = __xscom_write(gcid, addr, data);
+ if (rc)
+ goto bail;
+
+ /* Wait for completion */
+ for (retries = 0; retries < XSCOM_IND_MAX_RETRIES; retries++) {
+ rc = __xscom_read(gcid, addr, &data);
+ if (rc)
+ goto bail;
+ if ((data & XSCOM_DATA_IND_COMPLETE) &&
+ ((data & XSCOM_DATA_IND_ERR) == 0))
+ break;
+ if ((data & XSCOM_DATA_IND_COMPLETE) ||
+ (retries >= XSCOM_IND_MAX_RETRIES)) {
+ xscom_handle_ind_error(data, gcid, pcb_addr,
+ true);
+ rc = OPAL_HARDWARE;
+ goto bail;
+ }
+ }
+ bail:
+ return rc;
+}
+
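+/*
+ * Form-1 indirect write (P9 and later only): the upper address bits
+ * are folded into the data doubleword, so the whole operation goes
+ * out as a single direct write with no completion polling.
+ */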
+static int xscom_indirect_write_form1(uint32_t gcid, uint64_t pcb_addr,
+ uint64_t val)
+{
+ uint32_t addr;
+ uint64_t data;
+
+ if (proc_gen < proc_gen_p9)
+ return OPAL_UNSUPPORTED;
+ if (val & ~(XSCOM_DATA_IND_FORM1_DATA))
+ return OPAL_PARAMETER;
+
+ /* Mangle address and data for form1 */
+ addr = (pcb_addr & 0x000ffffffffUL);
+ data = (pcb_addr & 0xfff00000000UL) << 20;
+ data |= val;
+ return __xscom_write(gcid, addr, data);
+}
+
+static int xscom_indirect_write(uint32_t gcid, uint64_t pcb_addr, uint64_t val)
+{
+ uint64_t form = xscom_indirect_form(pcb_addr);
+
+ if ((proc_gen >= proc_gen_p9) && (form == 1))
+ return xscom_indirect_write_form1(gcid, pcb_addr, val);
+
+ return xscom_indirect_write_form0(gcid, pcb_addr, val);
+}
+
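+/*
+ * Decode an EX-chiplet partid (top nibble of 4): the middle bits give
+ * the chip id and the low nibble the core number. On P8 the core is
+ * folded into the PCB slave address; on P9 and later this is not
+ * supported and *pcb_addr is set to 0 (the read path turns that into
+ * OPAL_UNSUPPORTED).
+ */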
+static uint32_t xscom_decode_chiplet(uint32_t partid, uint64_t *pcb_addr)
+{
+ uint32_t gcid = (partid & 0x0fffffff) >> 4;
+ uint32_t core = partid & 0xf;
+
+ if (proc_gen >= proc_gen_p9) {
+ /* XXX Not supported */
+ *pcb_addr = 0;
+ } else {
+ *pcb_addr |= P8_EX_PCB_SLAVE_BASE;
+ *pcb_addr |= core << 24;
+ }
+
+ return gcid;
+}
+
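+/*
+ * Expose the global XSCOM lock. The HW822317 workaround requires all
+ * XSCOM traffic to be globally serialised, so a caller that needs to
+ * issue several accesses in one critical section can take the lock
+ * explicitly and use _xscom_read()/_xscom_write() with take_lock set
+ * to false.
+ */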
+void _xscom_lock(void)
+{
+ lock(&xscom_lock);
+}
+
+void _xscom_unlock(void)
+{
+ unlock(&xscom_lock);
+}
+
+/* sorted by the scom controller's partid */
+static LIST_HEAD(scom_list);
+
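+/*
+ * Register an external scom controller (a Centaur memory buffer, for
+ * instance) so that the partid dispatch in _xscom_read()/_xscom_write()
+ * can route accesses to it. The list is kept sorted by part_id, and a
+ * duplicate part_id is refused with OPAL_BUSY. Illustrative sketch
+ * only; the names below are hypothetical:
+ *
+ *	static struct scom_controller my_ctrl = {
+ *		.part_id = 0x80000001,
+ *		.read = my_read,	// called as read(scom, partid, addr, &val)
+ *		.write = my_write,	// called as write(scom, partid, addr, val)
+ *	};
+ *	scom_register(&my_ctrl);
+ */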
+int64_t scom_register(struct scom_controller *new)
+{
+ struct scom_controller *cur;
+
+ list_for_each(&scom_list, cur, link) {
+ if (cur->part_id == new->part_id) {
+ prerror("Attempted to add duplicate scom, partid %x\n",
+ new->part_id);
+ return OPAL_BUSY;
+ }
+
+ if (cur->part_id > new->part_id) {
+ list_add_before(&scom_list, &new->link, &cur->link);
+ return 0;
+ }
+ }
+
+ /* If we never found a larger partid, this one is the largest */
+ list_add_tail(&scom_list, &new->link);
+
+ return 0;
+}
+
+static struct scom_controller *scom_find(uint32_t partid)
+{
+ struct scom_controller *cur;
+
+ list_for_each(&scom_list, cur, link)
+ if (partid == cur->part_id)
+ return cur;
+
+ return NULL;
+}
+
+static int64_t scom_read(struct scom_controller *scom, uint32_t partid,
+ uint64_t pcbaddr, uint64_t *val)
+{
+ int64_t rc = scom->read(scom, partid, pcbaddr, val);
+
+ if (rc) {
+ prerror("%s: to %x off: %llx rc = %lld\n",
+ __func__, partid, pcbaddr, rc);
+ }
+
+ return rc;
+}
+
+static int64_t scom_write(struct scom_controller *scom, uint32_t partid,
+ uint64_t pcbaddr, uint64_t val)
+{
+ int64_t rc = scom->write(scom, partid, pcbaddr, val);
+
+ if (rc) {
+ prerror("%s: to %x off: %llx rc = %lld\n",
+ __func__, partid, pcbaddr, rc);
+ }
+
+ return rc;
+}
+
+/*
+ * External API
+ */
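+/*
+ * partid dispatch: a top nibble of 0 addresses a processor chip
+ * directly, 4 addresses an EX chiplet (P8 only), and anything else is
+ * looked up among the registered scom controllers. Because of
+ * HW822317 the access is done under the global XSCOM lock unless the
+ * caller already holds it (take_lock == false).
+ */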
+int _xscom_read(uint32_t partid, uint64_t pcb_addr, uint64_t *val, bool take_lock)
+{
+ struct scom_controller *scom;
+ uint32_t gcid;
+ int rc;
+
+ if (!opal_addr_valid(val))
+ return OPAL_PARAMETER;
+
+ /* Due to a bug in some versions of the PRD wrapper app, errors
+ * might not be properly forwarded to PRD, in which case the data
+ * set here will be used. Rather than leaving a random value,
+ * initialize the data to a known clean state.
+ */
+ *val = 0xdeadbeefdeadbeefull;
+
+ /* Handle part ID decoding */
+ switch(partid >> 28) {
+ case 0: /* Normal processor chip */
+ gcid = partid;
+ break;
+ case 4: /* EX chiplet */
+ gcid = xscom_decode_chiplet(partid, &pcb_addr);
+ if (pcb_addr == 0)
+ return OPAL_UNSUPPORTED;
+ break;
+ default:
+ /* is it one of our hacks? */
+ scom = scom_find(partid);
+ if (scom)
+ return scom_read(scom, partid, pcb_addr, val);
+
+ /**
+ * @fwts-label XSCOMReadInvalidPartID
+ * @fwts-advice xscom_read was called with an invalid partid.
+ * There's likely a bug somewhere in the stack that's causing
+ * someone to try an xscom_read on something that isn't a
+ * processor, Centaur or EX chiplet.
+ */
+ prerror("%s: invalid XSCOM partid 0x%x\n", __func__, partid);
+ return OPAL_PARAMETER;
+ }
+
+ /* HW822317 requires us to do global locking */
+ if (take_lock)
+ lock(&xscom_lock);
+
+ /* Direct vs indirect access */
+ if (pcb_addr & XSCOM_ADDR_IND_FLAG)
+ rc = xscom_indirect_read(gcid, pcb_addr, val);
+ else
+ rc = __xscom_read(gcid, pcb_addr & 0x7fffffff, val);
+
+ /* Unlock it */
+ if (take_lock)
+ unlock(&xscom_lock);
+ return rc;
+}
+
+static int64_t opal_xscom_read(uint32_t partid, uint64_t pcb_addr, __be64 *__val)
+{
+ uint64_t val;
+ int64_t rc;
+
+ rc = xscom_read(partid, pcb_addr, &val);
+ *__val = cpu_to_be64(val);
+
+ return rc;
+}
+opal_call(OPAL_XSCOM_READ, opal_xscom_read, 3);
+
+int _xscom_write(uint32_t partid, uint64_t pcb_addr, uint64_t val, bool take_lock)
+{
+ struct scom_controller *scom;
+ uint32_t gcid;
+ int rc;
+
+ /* Handle part ID decoding */
+ switch(partid >> 28) {
+ case 0: /* Normal processor chip */
+ gcid = partid;
+ break;
+ case 4: /* EX chiplet */
+ gcid = xscom_decode_chiplet(partid, &pcb_addr);
+ break;
+ default:
+ /* is it one of our hacks? */
+ scom = scom_find(partid);
+ if (scom)
+ return scom_write(scom, partid, pcb_addr, val);
+
+ /**
+ * @fwts-label XSCOMWriteInvalidPartID
+ * @fwts-advice xscom_write was called with an invalid partid.
+ * There's likely a bug somewhere in the stack that's causing
+ * someone to try an xscom_write on something that isn't a
+ * processor, Centaur or EX chiplet.
+ */
+ prerror("%s: invalid XSCOM partid 0x%x\n", __func__, partid);
+ return OPAL_PARAMETER;
+ }
+
+ /* HW822317 requires us to do global locking */
+ if (take_lock)
+ lock(&xscom_lock);
+
+ /* Direct vs indirect access */
+ if (pcb_addr & XSCOM_ADDR_IND_FLAG)
+ rc = xscom_indirect_write(gcid, pcb_addr, val);
+ else
+ rc = __xscom_write(gcid, pcb_addr & 0x7fffffff, val);
+
+ /* Unlock it */
+ if (take_lock)
+ unlock(&xscom_lock);
+ return rc;
+}
+
+static int64_t opal_xscom_write(uint32_t partid, uint64_t pcb_addr, uint64_t val)
+{
+ return xscom_write(partid, pcb_addr, val);
+}
+opal_call(OPAL_XSCOM_WRITE, opal_xscom_write, 3);
+
+/*
+ * Perform a xscom read-modify-write.
+ */
+int xscom_write_mask(uint32_t partid, uint64_t pcb_addr, uint64_t val, uint64_t mask)
+{
+ int rc;
+ uint64_t old_val;
+
+ rc = xscom_read(partid, pcb_addr, &old_val);
+ if (rc)
+ return rc;
+ val = (old_val & ~mask) | (val & mask);
+ return xscom_write(partid, pcb_addr, val);
+}
+
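+/* Convenience wrappers targeting the chip the calling CPU runs on. */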
+int xscom_readme(uint64_t pcb_addr, uint64_t *val)
+{
+ return xscom_read(this_cpu()->chip_id, pcb_addr, val);
+}
+
+int xscom_writeme(uint64_t pcb_addr, uint64_t val)
+{
+ return xscom_write(this_cpu()->chip_id, pcb_addr, val);
+}
+
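+/*
+ * Read the CFAM chip id from SCOM 0xf000f; the id lives in the top
+ * bits of that register. Simulated chips flagged with QUIRK_NO_F000F
+ * do not implement it, so a plausible hard-coded value is returned
+ * for them instead.
+ */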
+int64_t xscom_read_cfam_chipid(uint32_t partid, uint32_t *chip_id)
+{
+ uint64_t val;
+ int64_t rc = OPAL_SUCCESS;
+
+ /* The Mambo chip model lacks the f000f register, so just make
+ * something up
+ */
+ if (chip_quirk(QUIRK_NO_F000F)) {
+ if (proc_gen == proc_gen_p10)
+ val = 0x120DA04980000000UL; /* P10 DD1.0 */
+ else if (proc_gen == proc_gen_p9)
+ val = 0x203D104980000000UL; /* P9 Nimbus DD2.3 */
+ else
+ val = 0x221EF04980000000UL; /* P8 Murano DD2.1 */
+ } else
+ rc = xscom_read(partid, 0xf000f, &val);
+
+ /* Extract CFAM id */
+ if (rc == OPAL_SUCCESS)
+ *chip_id = (uint32_t)(val >> 44);
+
+ return rc;
+}
+
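+/*
+ * Use the CFAM id to fill in the chip type and EC (DD) level and, on
+ * P9, the sub-revision derived from the ECID bits at SCOM 0x18002.
+ */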
+static void xscom_init_chip_info(struct proc_chip *chip)
+{
+ uint32_t val;
+ int64_t rc;
+
+ rc = xscom_read_cfam_chipid(chip->id, &val);
+ if (rc) {
+ prerror("XSCOM: Error %lld reading 0xf000f register\n", rc);
+ /* We leave chip type to UNKNOWN */
+ return;
+ }
+
+ /* Identify chip */
+ switch(val & 0xff) {
+ case 0xef:
+ chip->type = PROC_CHIP_P8_MURANO;
+ assert(proc_gen == proc_gen_p8);
+ break;
+ case 0xea:
+ chip->type = PROC_CHIP_P8_VENICE;
+ assert(proc_gen == proc_gen_p8);
+ break;
+ case 0xd3:
+ chip->type = PROC_CHIP_P8_NAPLES;
+ assert(proc_gen == proc_gen_p8);
+ break;
+ case 0xd1:
+ chip->type = PROC_CHIP_P9_NIMBUS;
+ assert(proc_gen == proc_gen_p9);
+ break;
+ case 0xd4:
+ chip->type = PROC_CHIP_P9_CUMULUS;
+ assert(proc_gen == proc_gen_p9);
+ break;
+ case 0xd9:
+ chip->type = PROC_CHIP_P9P;
+ assert(proc_gen == proc_gen_p9);
+ break;
+ case 0xda:
+ chip->type = PROC_CHIP_P10;
+ assert(proc_gen == proc_gen_p10);
+ break;
+ default:
+ printf("CHIP: Unknown chip type 0x%02x !!!\n",
+ (unsigned char)(val & 0xff));
+ }
+
+ /* Get EC level from CFAM ID */
+ chip->ec_level = ((val >> 16) & 0xf) << 4;
+ chip->ec_level |= (val >> 8) & 0xf;
+
+ /*
+ * On P9, grab the ECID bits to differentiate
+ * DD1.01, 1.02, 2.00, etc.
+ */
+ if (chip_quirk(QUIRK_MAMBO_CALLOUTS)) {
+ chip->ec_rev = 0;
+ } else if (proc_gen == proc_gen_p9) {
+ uint64_t ecid2 = 0;
+ uint8_t rev;
+ xscom_read(chip->id, 0x18002, &ecid2);
+ switch((ecid2 >> 45) & 7) {
+ case 0:
+ rev = 0;
+ break;
+ case 1:
+ rev = 1;
+ break;
+ case 3:
+ rev = 2;
+ break;
+ case 7:
+ rev = 3;
+ break;
+ default:
+ rev = 0;
+ }
+ prlog(PR_INFO, "P9 DD%i.%i%d detected\n", 0xf & (chip->ec_level >> 4),
+ chip->ec_level & 0xf, rev);
+ chip->ec_rev = rev;
+ } /* XXX P10 */
+}
+
+/*
+ * Trigger a checkstop (xstop) by writing to the software checkstop
+ * FIR via XSCOM. The machine enters the xstop state once the write
+ * completes.
+ */
+int64_t xscom_trigger_xstop(void)
+{
+ int rc = OPAL_UNSUPPORTED;
+ bool xstop_disabled = false;
+
+ if (nvram_query_eq_dangerous("opal-sw-xstop", "disable"))
+ xstop_disabled = true;
+
+ if (xstop_disabled) {
+ prlog(PR_NOTICE, "Software initiated checkstop disabled.\n");
+ return rc;
+ }
+
+ if (xstop_xscom.addr)
+ rc = xscom_writeme(xstop_xscom.addr,
+ PPC_BIT(xstop_xscom.fir_bit));
+
+ return rc;
+}
+
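+/*
+ * Walk every "ibm,xscom" node in the device tree, record each chip's
+ * XSCOM base address, identify its type and EC level, and pick up the
+ * "ibm,sw-checkstop-fir" property used by xscom_trigger_xstop().
+ */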
+void xscom_init(void)
+{
+ struct dt_node *xn;
+ const struct dt_property *p;
+
+ dt_for_each_compatible(dt_root, xn, "ibm,xscom") {
+ uint32_t gcid = dt_get_chip_id(xn);
+ const struct dt_property *reg;
+ struct proc_chip *chip;
+ const char *chip_name;
+ static const char *chip_names[] = {
+ "UNKNOWN", "P8E", "P8", "P8NVL", "P9N", "P9C", "P9P",
+ "P10",
+ };
+
+ chip = get_chip(gcid);
+ assert(chip);
+
+ /* XXX We need proper address parsing. For now, we just
+ * "know" that we are looking at a u64
+ */
+ reg = dt_find_property(xn, "reg");
+ assert(reg);
+
+ chip->xscom_base = dt_translate_address(xn, 0, NULL);
+
+ /* Grab processor type and EC level */
+ xscom_init_chip_info(chip);
+
+ if (chip->type >= ARRAY_SIZE(chip_names))
+ chip_name = "INVALID";
+ else
+ chip_name = chip_names[chip->type];
+
+ /* We keep a "CHIP" prefix to make the log more user-friendly */
+ prlog(PR_NOTICE, "CHIP: Chip ID %04x type: %s DD%x.%x%d\n",
+ gcid, chip_name, chip->ec_level >> 4,
+ chip->ec_level & 0xf, chip->ec_rev);
+ prlog(PR_DEBUG, "XSCOM: Base address: 0x%llx\n", chip->xscom_base);
+ }
+
+ /* Collect details to trigger xstop via XSCOM write */
+ p = dt_find_property(dt_root, "ibm,sw-checkstop-fir");
+ if (p) {
+ xstop_xscom.addr = dt_property_get_cell(p, 0);
+ xstop_xscom.fir_bit = dt_property_get_cell(p, 1);
+ prlog(PR_DEBUG, "XSTOP: XSCOM addr = 0x%llx, FIR bit = %lld\n",
+ xstop_xscom.addr, xstop_xscom.fir_bit);
+ } else
+ prlog(PR_DEBUG, "XSTOP: ibm,sw-checkstop-fir prop not found\n");
+}
+
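+/*
+ * Called when the console path starts using XSCOM: flag the global
+ * lock as being in the console path, then take and release it once so
+ * that any holder from before the flag was set has drained.
+ */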
+void xscom_used_by_console(void)
+{
+ xscom_lock.in_con_path = true;
+
+ /*
+ * Some other processor might hold it without having
+ * disabled the console locally, so let's make sure that
+ * is over by taking and releasing the lock ourselves.
+ */
+ lock(&xscom_lock);
+ unlock(&xscom_lock);
+}
+
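+/*
+ * True when it is safe for the calling CPU to issue an XSCOM, i.e. it
+ * does not already hold the global XSCOM lock.
+ */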
+bool xscom_ok(void)
+{
+ return !lock_held_by_me(&xscom_lock);
+}