author | 2023-10-10 14:33:42 +0000
---|---
committer | 2023-10-10 14:33:42 +0000
commit | af1a266670d040d2f4083ff309d732d648afba2a (patch)
tree | 2fc46203448ddcc6f81546d379abfaeb323575e9 /roms/skiboot/hw
parent | e02cda008591317b1625707ff8e115a4841aa889 (diff)
Change-Id: Iaf8d18082d3991dec7c0ebbea540f092188eb4ec
Diffstat (limited to 'roms/skiboot/hw')
94 files changed, 74585 insertions, 0 deletions
diff --git a/roms/skiboot/hw/Makefile.inc b/roms/skiboot/hw/Makefile.inc new file mode 100644 index 000000000..37256d3cc --- /dev/null +++ b/roms/skiboot/hw/Makefile.inc @@ -0,0 +1,19 @@ +# -*-Makefile-*- +SUBDIRS += hw +HW_OBJS = xscom.o chiptod.o lpc.o lpc-uart.o psi.o +HW_OBJS += homer.o slw.o occ.o fsi-master.o centaur.o imc.o +HW_OBJS += nx.o nx-rng.o nx-crypto.o nx-compress.o nx-842.o nx-gzip.o +HW_OBJS += phb3.o sfc-ctrl.o fake-rtc.o bt.o p8-i2c.o prd.o +HW_OBJS += dts.o lpc-rtc.o npu.o npu-hw-procedures.o xive.o phb4.o +HW_OBJS += fake-nvram.o lpc-mbox.o npu2.o npu2-hw-procedures.o +HW_OBJS += npu2-common.o npu2-opencapi.o phys-map.o sbe-p9.o capp.o +HW_OBJS += occ-sensor.o vas.o sbe-p8.o dio-p9.o lpc-port80h.o cache-p9.o +HW_OBJS += npu-opal.o npu3.o npu3-nvlink.o npu3-hw-procedures.o +HW_OBJS += ocmb.o xive2.o +HW=hw/built-in.a + +include $(SRC)/hw/fsp/Makefile.inc +include $(SRC)/hw/ast-bmc/Makefile.inc +include $(SRC)/hw/ipmi/Makefile.inc + +$(HW): $(HW_OBJS:%=hw/%) $(FSP) $(EC) $(AST_BMC) $(IPMI) diff --git a/roms/skiboot/hw/ast-bmc/Makefile.inc b/roms/skiboot/hw/ast-bmc/Makefile.inc new file mode 100644 index 000000000..e7ded0e88 --- /dev/null +++ b/roms/skiboot/hw/ast-bmc/Makefile.inc @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +SUBDIRS += hw/ast-bmc + +AST_BMC_OBJS = ast-io.o ast-sf-ctrl.o +AST_BMC = hw/ast-bmc/built-in.a +$(AST_BMC): $(AST_BMC_OBJS:%=hw/ast-bmc/%) diff --git a/roms/skiboot/hw/ast-bmc/ast-io.c b/roms/skiboot/hw/ast-bmc/ast-io.c new file mode 100644 index 000000000..f0f8c4c4d --- /dev/null +++ b/roms/skiboot/hw/ast-bmc/ast-io.c @@ -0,0 +1,498 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* + * Note about accesses to the AST2400 internal memory map: + * + * There are two ways to genrate accesses to the AHB bus of the AST2400 + * from the host. The LPC->AHB bridge and the iLPC->AHB bridge. + * + * LPC->AHB bridge + * --------------- + * + * This bridge directly converts memory or firmware accesses using + * a set of registers for establishing a remapping window. We prefer + * using FW space as normal memory space is limited to byte accesses + * to a fixed 256M window, while FW space allows us to use different + * access sizes and to control the IDSEL bits which essentially enable + * a full 4G address space. + * + * The way FW accesses map onto AHB is controlled via two registers + * in the BMC's LPC host controller: + * + * HICR7 at 0x1e789088 [31:16] : ADRBASE + * [15:00] : HWMBASE + * + * HICR8 at 0x1e78908c [31:16] : ADRMASK + * [15:00] : HWNCARE + * + * All decoding/remapping happens on the top 16 bits of the LPC address + * named LPC_ADDR as follow: + * + * - For decoding, LPC_ADDR bits are compared with HWMBASE if the + * corresponding bit in HWNCARE is 0. 
+ * + * - For remapping, the AHB address is constructed by taking bits + * from LPC_ADDR if the corresponding bit in ADRMASK is 0 or in + * ADRBASE if the corresponding bit in ADRMASK is 1 + * + * Example of 2MB SPI flash, LPC 0xFCE00000~0xFCFFFFFF onto + * AHB 0x30000000~0x301FFFFF (SPI flash) + * + * ADRBASE=0x3000 HWMBASE=0xFCE0 + * ADRMASK=0xFFE0 HWNCARE=0x001F + * + * This comes pre-configured by the BMC or HostBoot to access the PNOR + * flash from IDSEL 0 as follow: + * + * ADRBASE=0x3000 HWMBASE=0x0e00 for 32MB + * ADRMASK=0xfe00 HWNCARE=0x01ff + * + * Which means mapping of LPC 0x0e000000..0x0fffffff onto + * AHB 0x30000000..0x31ffffff + * + * iLPC->AHB bridge + * --------------- + * + * This bridge is hosted in the SuperIO part of the BMC and is + * controlled by a series of byte-sized registers accessed indirectly + * via IO ports 0x2e and 0x2f. + * + * Via these, byte by byte, we can construct an AHB address and + * fill a data buffer to trigger a write cycle, or we can do a + * read cycle and read back the data, byte after byte. + * + * This is fairly convoluted and slow but works regardless of what + * mapping was established in the LPC->AHB bridge. + * + * For the time being, we use the iLPC->AHB for everything except + * pnor accesses. In the long run, we will reconfigure the LPC->AHB + * to provide more direct access to all of the BMC address space but + * we'll only do that after the boot script/program on the BMC is + * updated to restore the bridge to a state compatible with the SBE + * expectations on boot. + * + * Copyright 2013-2019 IBM Corp. + */ + +#include <skiboot.h> +#include <lpc.h> +#include <lock.h> +#include <device.h> + +#include "ast.h" + +#define BMC_SIO_SCR28 0x28 +#define BOOT_FLAGS_VERSION 0x42 + +/* + * SIO Register 0x29: Boot Flags (normal bit ordering) + * + * [7:6] Hostboot Boot mode: + * 00 : Normal + * 01 : Terminate on first error + * 10 : istep mode + * 11 : reserved + * [5:4] Boot options + * 00 : reserved + * 01 : Memboot + * 10 : Clear gard + * 11 : reserved + * [ 3 ] BMC mbox PNOR driver + * [2:0] Hostboot Log level: + * 000 : Normal + * 001 : Enable Scan trace + * xxx : reserved + */ + +#define BMC_SIO_SCR29 0x29 +#define BMC_SIO_SCR29_MBOX 0x08 +#define BMC_SIO_SCR29_MEMBOOT 0x10 + +/* + * SIO Register 0x2d: Platform Flags (normal bit ordering) + * + * [ 7 ] Hostboot configures SUART + * [ 6 ] Hostboot configures VUART + * [5:1] Reserved + * [ 0 ] Isolate Service Processor + */ +#define BMC_SIO_PLAT_FLAGS 0x2d +#define BMC_SIO_PLAT_ISOLATE_SP 0x01 + +enum { + BMC_SIO_DEV_NONE = -1, + BMC_SIO_DEV_UART1 = 2, + BMC_SIO_DEV_UART2 = 3, + BMC_SIO_DEV_SWC = 4, + BMC_SIO_DEV_KBC = 5, + BMC_SIO_DEV_P80 = 7, + BMC_SIO_DEV_UART3 = 0xb, + BMC_SIO_DEV_UART4 = 0xc, + BMC_SIO_DEV_LPC2AHB = 0xd, + BMC_SIO_DEV_MBOX = 0xe, +}; + +static struct lock bmc_sio_lock = LOCK_UNLOCKED; +static int bmc_sio_cur_dev = BMC_SIO_DEV_NONE; + +/* + * SuperIO indirect accesses + */ +static void bmc_sio_outb(uint8_t val, uint8_t reg) +{ + lpc_outb(reg, 0x2e); + lpc_outb(val, 0x2f); +} + +static uint8_t bmc_sio_inb(uint8_t reg) +{ + lpc_outb(reg, 0x2e); + return lpc_inb(0x2f); +} + +static void bmc_sio_get(int dev) +{ + lock(&bmc_sio_lock); + + if (bmc_sio_cur_dev == dev || dev < 0) + return; + + if (bmc_sio_cur_dev == BMC_SIO_DEV_NONE) { + /* Send SuperIO password */ + lpc_outb(0xa5, 0x2e); + lpc_outb(0xa5, 0x2e); + } + + /* Select logical dev */ + bmc_sio_outb(dev, 0x07); + + bmc_sio_cur_dev = dev; +} + +static void bmc_sio_put(bool lock_sio) +{ + if (lock_sio) { + /* 
Re-lock SuperIO */ + lpc_outb(0xaa, 0x2e); + + bmc_sio_cur_dev = BMC_SIO_DEV_NONE; + } + unlock(&bmc_sio_lock); +} + +/* + * AHB accesses via iLPC->AHB in SuperIO. Works on byteswapped + * values (ie. Little Endian registers) + */ +static void bmc_sio_ahb_prep(uint32_t reg, uint8_t type) +{ + /* Enable iLPC->AHB */ + bmc_sio_outb(0x01, 0x30); + + /* Address */ + bmc_sio_outb((reg >> 24) & 0xff, 0xf0); + bmc_sio_outb((reg >> 16) & 0xff, 0xf1); + bmc_sio_outb((reg >> 8) & 0xff, 0xf2); + bmc_sio_outb((reg ) & 0xff, 0xf3); + + /* bytes cycle type */ + bmc_sio_outb(type, 0xf8); +} + +static void bmc_sio_ahb_writel(uint32_t val, uint32_t reg) +{ + bmc_sio_get(BMC_SIO_DEV_LPC2AHB); + + bmc_sio_ahb_prep(reg, 2); + + /* Write data */ + bmc_sio_outb(val >> 24, 0xf4); + bmc_sio_outb(val >> 16, 0xf5); + bmc_sio_outb(val >> 8, 0xf6); + bmc_sio_outb(val , 0xf7); + + /* Trigger */ + bmc_sio_outb(0xcf, 0xfe); + + bmc_sio_put(false); +} + +static uint32_t bmc_sio_ahb_readl(uint32_t reg) +{ + uint32_t val = 0; + + bmc_sio_get(BMC_SIO_DEV_LPC2AHB); + + bmc_sio_ahb_prep(reg, 2); + + /* Trigger */ + bmc_sio_inb(0xfe); + + /* Read results */ + val = (val << 8) | bmc_sio_inb(0xf4); + val = (val << 8) | bmc_sio_inb(0xf5); + val = (val << 8) | bmc_sio_inb(0xf6); + val = (val << 8) | bmc_sio_inb(0xf7); + + bmc_sio_put(false); + + return val; +} + +/* + * External API + * + * We only support 4-byte accesses to all of AHB. We additionally + * support 1-byte accesses to the flash area only. + * + * We could support all access sizes via iLPC but we don't need + * that for now. + */ + +void ast_ahb_writel(uint32_t val, uint32_t reg) +{ + /* For now, always use iLPC->AHB, it will byteswap */ + bmc_sio_ahb_writel(val, reg); +} + +uint32_t ast_ahb_readl(uint32_t reg) +{ + /* For now, always use iLPC->AHB, it will byteswap */ + return bmc_sio_ahb_readl(reg); +} + +static void ast_setup_sio_irq_polarity(void) +{ + /* Select logical dev 2 */ + bmc_sio_get(BMC_SIO_DEV_UART1); + bmc_sio_outb(0x01, 0x71); /* level low */ + bmc_sio_put(false); + + /* Select logical dev 3 */ + bmc_sio_get(BMC_SIO_DEV_UART2); + bmc_sio_outb(0x01, 0x71); /* irq level low */ + bmc_sio_put(false); + + /* Select logical dev 4 */ + bmc_sio_get(BMC_SIO_DEV_SWC); + bmc_sio_outb(0x01, 0x71); /* irq level low */ + bmc_sio_put(false); + + /* Select logical dev 5 */ + bmc_sio_get(BMC_SIO_DEV_KBC); + bmc_sio_outb(0x01, 0x71); /* irq level low */ + bmc_sio_outb(0x01, 0x73); /* irq level low */ + bmc_sio_put(false); + + /* Select logical dev 7 */ + bmc_sio_get(BMC_SIO_DEV_P80); + bmc_sio_outb(0x01, 0x71); /* irq level low */ + bmc_sio_put(false); + + /* Select logical dev d */ + bmc_sio_get(BMC_SIO_DEV_UART3); + bmc_sio_outb(0x01, 0x71); /* irq level low */ + bmc_sio_put(false); + + /* Select logical dev c */ + bmc_sio_get(BMC_SIO_DEV_UART4); + bmc_sio_outb(0x01, 0x71); /* irq level low */ + bmc_sio_put(false); + + /* Select logical dev d */ + bmc_sio_get(BMC_SIO_DEV_LPC2AHB); + bmc_sio_outb(0x01, 0x71); /* irq level low */ + bmc_sio_put(false); + + /* Select logical dev e */ + bmc_sio_get(BMC_SIO_DEV_MBOX); + bmc_sio_outb(0x01, 0x71); /* irq level low */ + bmc_sio_put(true); +} + +bool ast_sio_is_enabled(void) +{ + bool enabled; + int64_t rc; + + lock(&bmc_sio_lock); + /* + * Probe by attempting to lock the SIO device, this way the + * post-condition is that the SIO device is locked or not able to be + * unlocked. This turns out neater than trying to use the unlock code. 
+ */ + rc = lpc_probe_write(OPAL_LPC_IO, 0x2e, 0xaa, 1); + if (rc) { + enabled = false; + /* If we can't lock it, then we can't unlock it either */ + goto out; + } + + /* + * Now that we know that is locked and able to be unlocked, unlock it + * if skiboot's recorded device state indicates it was previously + * unlocked. + */ + if (bmc_sio_cur_dev != BMC_SIO_DEV_NONE) { + /* Send SuperIO password */ + lpc_outb(0xa5, 0x2e); + lpc_outb(0xa5, 0x2e); + + /* Ensure the previously selected logical dev is selected */ + bmc_sio_outb(bmc_sio_cur_dev, 0x07); + } + + enabled = true; +out: + unlock(&bmc_sio_lock); + + return enabled; +} + +bool ast_sio_init(void) +{ + bool enabled = ast_sio_is_enabled(); + + /* Configure all AIO interrupts to level low */ + if (enabled) + ast_setup_sio_irq_polarity(); + + return enabled; +} + +bool ast_io_is_rw(void) +{ + return !(ast_ahb_readl(LPC_HICRB) & LPC_HICRB_ILPC_DISABLE); +} + +bool ast_io_init(void) +{ + return ast_io_is_rw(); +} + +bool ast_lpc_fw_ipmi_hiomap(void) +{ + return platform.bmc->sw->ipmi_oem_hiomap_cmd != 0; +} + +bool ast_lpc_fw_mbox_hiomap(void) +{ + struct dt_node *n; + + n = dt_find_compatible_node(dt_root, NULL, "mbox"); + + return n != NULL; +} + +bool ast_lpc_fw_maps_flash(void) +{ + uint8_t boot_version; + uint8_t boot_flags; + + boot_version = bmc_sio_inb(BMC_SIO_SCR28); + if (boot_version != BOOT_FLAGS_VERSION) + return true; + + boot_flags = bmc_sio_inb(BMC_SIO_SCR29); + return !(boot_flags & BMC_SIO_SCR29_MEMBOOT); +} + +bool ast_scratch_reg_is_mbox(void) +{ + uint8_t boot_version; + uint8_t boot_flags; + + boot_version = bmc_sio_inb(BMC_SIO_SCR28); + if (boot_version != BOOT_FLAGS_VERSION) + return false; + + boot_flags = bmc_sio_inb(BMC_SIO_SCR29); + return boot_flags & BMC_SIO_SCR29_MBOX; +} + +void ast_setup_ibt(uint16_t io_base, uint8_t irq) +{ + uint32_t v; + + v = bmc_sio_ahb_readl(LPC_iBTCR0); + v = v & ~(0xfffffc00u); + v = v | (((uint32_t)io_base) << 16); + v = v | (((uint32_t)irq) << 12); + bmc_sio_ahb_writel(v, LPC_iBTCR0); +} + +bool ast_is_vuart1_enabled(void) +{ + uint32_t v; + + v = bmc_sio_ahb_readl(VUART1_GCTRLA); + return !!(v & 1); +} + +void ast_setup_vuart1(uint16_t io_base, uint8_t irq) +{ + uint32_t v; + + /* IRQ level low */ + v = bmc_sio_ahb_readl(VUART1_GCTRLA); + v = v & ~2u; + bmc_sio_ahb_writel(v, VUART1_GCTRLA); + v = bmc_sio_ahb_readl(VUART1_GCTRLA); + + /* IRQ number */ + v = bmc_sio_ahb_readl(VUART1_GCTRLB); + v = (v & ~0xf0u) | (irq << 4); + bmc_sio_ahb_writel(v, VUART1_GCTRLB); + + /* Address */ + bmc_sio_ahb_writel(io_base & 0xff, VUART1_ADDRL); + bmc_sio_ahb_writel(io_base >> 8, VUART1_ADDRH); +} + +/* Setup SuperIO UART 1 */ +void ast_setup_sio_uart1(uint16_t io_base, uint8_t irq) +{ + bmc_sio_get(BMC_SIO_DEV_UART1); + + /* Disable UART1 for configuration */ + bmc_sio_outb(0x00, 0x30); + + /* Configure base and interrupt */ + bmc_sio_outb(io_base >> 8, 0x60); + bmc_sio_outb(io_base & 0xff, 0x61); + bmc_sio_outb(irq, 0x70); + bmc_sio_outb(0x01, 0x71); /* level low */ + + /* Enable UART1 */ + bmc_sio_outb(0x01, 0x30); + + bmc_sio_put(true); +} + +void ast_disable_sio_uart1(void) +{ + bmc_sio_get(BMC_SIO_DEV_UART1); + + /* Disable UART1 */ + bmc_sio_outb(0x00, 0x30); + + bmc_sio_put(true); +} + +void ast_setup_sio_mbox(uint16_t io_base, uint8_t irq) +{ + bmc_sio_get(BMC_SIO_DEV_MBOX); + + /* Disable for configuration */ + bmc_sio_outb(0x00, 0x30); + + bmc_sio_outb(io_base >> 8, 0x60); + bmc_sio_outb(io_base & 0xff, 0x61); + bmc_sio_outb(irq, 0x70); + bmc_sio_outb(0x01, 0x71); /* level low */ + + 
/* Enable MailBox */ + bmc_sio_outb(0x01, 0x30); + + bmc_sio_put(true); +} + diff --git a/roms/skiboot/hw/ast-bmc/ast-sf-ctrl.c b/roms/skiboot/hw/ast-bmc/ast-sf-ctrl.c new file mode 100644 index 000000000..03cc44318 --- /dev/null +++ b/roms/skiboot/hw/ast-bmc/ast-sf-ctrl.c @@ -0,0 +1,1020 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* Copyright 2013-2018 IBM Corp. */ + +#include <stdint.h> +#include <stdbool.h> +#include <stdlib.h> +#include <errno.h> +#include <stdio.h> +#include <string.h> + +#include <libflash/libflash.h> +#include <libflash/libflash-priv.h> +#ifdef __SKIBOOT__ +#include "lpc.h" +#endif + +#include "ast.h" + +#ifndef __unused +#define __unused __attribute__((unused)) +#endif + +#define CALIBRATE_BUF_SIZE 16384 + +struct ast_sf_ctrl { + /* We have 2 controllers, one for the BMC flash, one for the PNOR */ + uint8_t type; + + /* Address and previous value of the ctrl register */ + uint32_t ctl_reg; + + /* Control register value for normal commands */ + uint32_t ctl_val; + + /* Control register value for (fast) reads */ + uint32_t ctl_read_val; + + /* Flash read timing register */ + uint32_t fread_timing_reg; + uint32_t fread_timing_val; + + /* Address of the flash mapping */ + uint32_t flash; + + /* Current 4b mode */ + bool mode_4b; + + /* Callbacks */ + struct spi_flash_ctrl ops; +}; + +static uint32_t ast_ahb_freq; + +static const uint32_t ast_ct_hclk_divs[] = { + 0xf, /* HCLK */ + 0x7, /* HCLK/2 */ + 0xe, /* HCLK/3 */ + 0x6, /* HCLK/4 */ + 0xd, /* HCLK/5 */ +}; + +#ifdef __SKIBOOT__ +#define PNOR_AHB_ADDR 0x30000000 +static uint32_t pnor_lpc_offset; + +static int ast_copy_to_ahb(uint32_t reg, const void *src, uint32_t len) +{ + /* Check we don't cross IDSEL segments */ + if ((reg ^ (reg + len - 1)) >> 28) + return -EINVAL; + + /* SPI flash, use LPC->AHB bridge */ + if ((reg >> 28) == (PNOR_AHB_ADDR >> 28)) { + uint32_t chunk, off = reg - PNOR_AHB_ADDR + pnor_lpc_offset; + int64_t rc; + + while(len) { + /* Chose access size */ + if (len > 3 && !(off & 3)) { + rc = lpc_write(OPAL_LPC_FW, off, + *(uint32_t *)src, 4); + chunk = 4; + } else { + rc = lpc_write(OPAL_LPC_FW, off, + *(uint8_t *)src, 1); + chunk = 1; + } + if (rc) { + prerror("AST_IO: lpc_write.sb failure %lld" + " to FW 0x%08x\n", rc, off); + return rc; + } + len -= chunk; + off += chunk; + src += chunk; + } + return 0; + } + + /* Otherwise we don't do byte access (... yet) */ + prerror("AST_IO: Attempted write bytes access to %08x\n", reg); + return -EINVAL; +} + +static int ast_copy_from_ahb(void *dst, uint32_t reg, uint32_t len) +{ + /* Check we don't cross IDSEL segments */ + if ((reg ^ (reg + len - 1)) >> 28) + return -EINVAL; + + /* SPI flash, use LPC->AHB bridge */ + if ((reg >> 28) == (PNOR_AHB_ADDR >> 28)) { + uint32_t chunk, off = reg - PNOR_AHB_ADDR + pnor_lpc_offset; + int64_t rc; + + while(len) { + uint32_t dat; + + /* Chose access size */ + if (len > 3 && !(off & 3)) { + rc = lpc_read(OPAL_LPC_FW, off, &dat, 4); + if (!rc) + *(uint32_t *)dst = dat; + chunk = 4; + } else { + rc = lpc_read(OPAL_LPC_FW, off, &dat, 1); + if (!rc) + *(uint8_t *)dst = dat; + chunk = 1; + } + if (rc) { + prerror("AST_IO: lpc_read.sb failure %lld" + " to FW 0x%08x\n", rc, off); + return rc; + } + len -= chunk; + off += chunk; + dst += chunk; + } + return 0; + } + /* Otherwise we don't do byte access (... 
yet) */ + prerror("AST_IO: Attempted read bytes access to %08x\n", reg); + return -EINVAL; +} +#endif /* __SKIBOOT__ */ + +static int ast_sf_start_cmd(struct ast_sf_ctrl *ct, uint8_t cmd) +{ + /* Switch to user mode, CE# dropped */ + ast_ahb_writel(ct->ctl_val | 7, ct->ctl_reg); + + /* user mode, CE# active */ + ast_ahb_writel(ct->ctl_val | 3, ct->ctl_reg); + + /* write cmd */ + return ast_copy_to_ahb(ct->flash, &cmd, 1); +} + +static void ast_sf_end_cmd(struct ast_sf_ctrl *ct) +{ + /* clear CE# */ + ast_ahb_writel(ct->ctl_val | 7, ct->ctl_reg); + + /* Switch back to read mode */ + ast_ahb_writel(ct->ctl_read_val, ct->ctl_reg); +} + +static int ast_sf_send_addr(struct ast_sf_ctrl *ct, uint32_t addr) +{ + const void *ap; + beint32_t tmp; + + /* Layout address MSB first in memory */ + tmp = cpu_to_be32(addr); + + /* Send the right amount of bytes */ + ap = (char *)&tmp; + + if (ct->mode_4b) + return ast_copy_to_ahb(ct->flash, ap, 4); + else + return ast_copy_to_ahb(ct->flash, ap + 1, 3); +} + +static int ast_sf_cmd_rd(struct spi_flash_ctrl *ctrl, uint8_t cmd, + bool has_addr, uint32_t addr, void *buffer, + uint32_t size) +{ + struct ast_sf_ctrl *ct = container_of(ctrl, struct ast_sf_ctrl, ops); + int rc; + + rc = ast_sf_start_cmd(ct, cmd); + if (rc) + goto bail; + if (has_addr) { + rc = ast_sf_send_addr(ct, addr); + if (rc) + goto bail; + } + if (buffer && size) + rc = ast_copy_from_ahb(buffer, ct->flash, size); + bail: + ast_sf_end_cmd(ct); + return rc; +} + +static int ast_sf_cmd_wr(struct spi_flash_ctrl *ctrl, uint8_t cmd, + bool has_addr, uint32_t addr, const void *buffer, + uint32_t size) +{ + struct ast_sf_ctrl *ct = container_of(ctrl, struct ast_sf_ctrl, ops); + int rc; + + rc = ast_sf_start_cmd(ct, cmd); + if (rc) + goto bail; + if (has_addr) { + rc = ast_sf_send_addr(ct, addr); + if (rc) + goto bail; + } + if (buffer && size) + rc = ast_copy_to_ahb(ct->flash, buffer, size); + bail: + ast_sf_end_cmd(ct); + return rc; +} + +static int ast_sf_set_4b(struct spi_flash_ctrl *ctrl, bool enable) +{ + struct ast_sf_ctrl *ct = container_of(ctrl, struct ast_sf_ctrl, ops); + uint32_t ce_ctrl = 0; + + if (ct->type == AST_SF_TYPE_BMC && ct->ops.finfo->size > 0x1000000) + ce_ctrl = ast_ahb_readl(BMC_SPI_FCTL_CE_CTRL); + else if (ct->type != AST_SF_TYPE_PNOR) + return enable ? FLASH_ERR_4B_NOT_SUPPORTED : 0; + + /* + * We update the "old" value as well since when quitting + * we don't restore the mode of the flash itself so we need + * to leave the controller in a compatible setup + */ + if (enable) { + ct->ctl_val |= 0x2000; + ct->ctl_read_val |= 0x2000; + ce_ctrl |= 0x1; + } else { + ct->ctl_val &= ~0x2000; + ct->ctl_read_val &= ~0x2000; + ce_ctrl &= ~0x1; + } + ct->mode_4b = enable; + + /* Update read mode */ + ast_ahb_writel(ct->ctl_read_val, ct->ctl_reg); + + if (ce_ctrl && ct->type == AST_SF_TYPE_BMC) + ast_ahb_writel(ce_ctrl, BMC_SPI_FCTL_CE_CTRL); + + return 0; +} + +static int ast_sf_read(struct spi_flash_ctrl *ctrl, uint32_t pos, + void *buf, uint32_t len) +{ + struct ast_sf_ctrl *ct = container_of(ctrl, struct ast_sf_ctrl, ops); + + /* + * We are in read mode by default. 
We don't yet support fancy + * things like fast read or X2 mode + */ + return ast_copy_from_ahb(buf, ct->flash + pos, len); +} + +static void ast_get_ahb_freq(void) +{ + static const uint32_t cpu_freqs_24_48[] = { + 384000000, + 360000000, + 336000000, + 408000000 + }; + static const uint32_t cpu_freqs_25[] = { + 400000000, + 375000000, + 350000000, + 425000000 + }; + static const uint32_t ahb_div[] = { 1, 2, 4, 3 }; + uint32_t strap, cpu_clk, div; + + if (ast_ahb_freq) + return; + + /* HW strapping gives us the CPU freq and AHB divisor */ + strap = ast_ahb_readl(SCU_HW_STRAPPING); + if (strap & 0x00800000) { + FL_DBG("AST: CLKIN 25Mhz\n"); + cpu_clk = cpu_freqs_25[(strap >> 8) & 3]; + } else { + FL_DBG("AST: CLKIN 24/48Mhz\n"); + cpu_clk = cpu_freqs_24_48[(strap >> 8) & 3]; + } + FL_DBG("AST: CPU frequency: %d Mhz\n", cpu_clk / 1000000); + div = ahb_div[(strap >> 10) & 3]; + ast_ahb_freq = cpu_clk / div; + FL_DBG("AST: AHB frequency: %d Mhz\n", ast_ahb_freq / 1000000); +} + +static int ast_sf_check_reads(struct ast_sf_ctrl *ct, + const uint8_t *golden_buf, uint8_t *test_buf) +{ + int i, rc; + + for (i = 0; i < 10; i++) { + rc = ast_copy_from_ahb(test_buf, ct->flash, CALIBRATE_BUF_SIZE); + if (rc) + return rc; + if (memcmp(test_buf, golden_buf, CALIBRATE_BUF_SIZE) != 0) + return FLASH_ERR_VERIFY_FAILURE; + } + return 0; +} + +static int ast_sf_calibrate_reads(struct ast_sf_ctrl *ct, uint32_t hdiv, + const uint8_t *golden_buf, uint8_t *test_buf) +{ + int i, rc; + int good_pass = -1, pass_count = 0; + uint32_t shift = (hdiv - 1) << 2; + uint32_t mask = ~(0xfu << shift); + +#define FREAD_TPASS(i) (((i) / 2) | (((i) & 1) ? 0 : 8)) + + /* Try HCLK delay 0..5, each one with/without delay and look for a + * good pair. + */ + for (i = 0; i < 12; i++) { + bool pass; + + ct->fread_timing_val &= mask; + ct->fread_timing_val |= FREAD_TPASS(i) << shift; + ast_ahb_writel(ct->fread_timing_val, ct->fread_timing_reg); + rc = ast_sf_check_reads(ct, golden_buf, test_buf); + if (rc && rc != FLASH_ERR_VERIFY_FAILURE) + return rc; + pass = (rc == 0); + FL_DBG(" * [%08x] %d HCLK delay, %dns DI delay : %s\n", + ct->fread_timing_val, i/2, (i & 1) ? 0 : 4, pass ? "PASS" : "FAIL"); + if (pass) { + pass_count++; + if (pass_count == 3) { + good_pass = i - 1; + break; + } + } else + pass_count = 0; + } + + /* No good setting for this frequency */ + if (good_pass < 0) + return FLASH_ERR_VERIFY_FAILURE; + + /* We have at least one pass of margin, let's use first pass */ + ct->fread_timing_val &= mask; + ct->fread_timing_val |= FREAD_TPASS(good_pass) << shift; + ast_ahb_writel(ct->fread_timing_val, ct->fread_timing_reg); + FL_DBG("AST: * -> good is pass %d [0x%08x]\n", + good_pass, ct->fread_timing_val); + return 0; +} + +static bool ast_calib_data_usable(const uint8_t *test_buf, uint32_t size) +{ + const uint32_t *tb32 = (const uint32_t *)test_buf; + uint32_t i, cnt = 0; + + /* We check if we have enough words that are neither all 0 + * nor all 1's so the calibration can be considered valid. 
+ * + * I use an arbitrary threshold for now of 64 + */ + size >>= 2; + for (i = 0; i < size; i++) { + if (tb32[i] != 0 && tb32[i] != 0xffffffff) + cnt++; + } + return cnt >= 64; +} + +static int ast_sf_optimize_reads(struct ast_sf_ctrl *ct, + struct flash_info *info __unused, + uint32_t max_freq) +{ + uint8_t *golden_buf, *test_buf; + int i, rc, best_div = -1; + uint32_t save_read_val = ct->ctl_read_val; + + test_buf = malloc(CALIBRATE_BUF_SIZE * 2); + golden_buf = test_buf + CALIBRATE_BUF_SIZE; + + /* We start with the dumbest setting and read some data */ + ct->ctl_read_val = (ct->ctl_read_val & 0x2000) | + (0x00 << 28) | /* Single bit */ + (0x00 << 24) | /* CE# max */ + (0x03 << 16) | /* use normal reads */ + (0x00 << 8) | /* HCLK/16 */ + (0x00 << 6) | /* no dummy cycle */ + (0x00); /* normal read */ + ast_ahb_writel(ct->ctl_read_val, ct->ctl_reg); + + rc = ast_copy_from_ahb(golden_buf, ct->flash, CALIBRATE_BUF_SIZE); + if (rc) { + free(test_buf); + return rc; + } + + /* Establish our read mode with freq field set to 0 */ + ct->ctl_read_val = save_read_val & 0xfffff0ff; + + /* Check if calibration data is suitable */ + if (!ast_calib_data_usable(golden_buf, CALIBRATE_BUF_SIZE)) { + FL_INF("AST: Calibration area too uniform, " + "using low speed\n"); + ast_ahb_writel(ct->ctl_read_val, ct->ctl_reg); + free(test_buf); + return 0; + } + + /* Now we iterate the HCLK dividers until we find our breaking point */ + for (i = 5; i > 0; i--) { + uint32_t tv, freq; + + /* Compare timing to max */ + freq = ast_ahb_freq / i; + if (freq >= max_freq) + continue; + + /* Set the timing */ + tv = ct->ctl_read_val | (ast_ct_hclk_divs[i - 1] << 8); + ast_ahb_writel(tv, ct->ctl_reg); + FL_DBG("AST: Trying HCLK/%d...\n", i); + rc = ast_sf_calibrate_reads(ct, i, golden_buf, test_buf); + + /* Some other error occurred, bail out */ + if (rc && rc != FLASH_ERR_VERIFY_FAILURE) { + free(test_buf); + return rc; + } + if (rc == 0) + best_div = i; + } + free(test_buf); + + /* Nothing found ? */ + if (best_div < 0) + FL_ERR("AST: No good frequency, using dumb slow\n"); + else { + FL_DBG("AST: Found good read timings at HCLK/%d\n", best_div); + ct->ctl_read_val |= (ast_ct_hclk_divs[best_div - 1] << 8); + } + ast_ahb_writel(ct->ctl_read_val, ct->ctl_reg); + + return 0; +} + +static int ast_sf_get_hclk(uint32_t *ctl_val, uint32_t max_freq) +{ + int i; + + /* It appears that running commands at HCLK/2 on some micron + * chips results in occasionally reads of bogus status (that + * or unrelated chip hangs). + * + * Since we cannot calibrate properly the reads for commands, + * instead, let's limit our SPI frequency to HCLK/4 to stay + * on the safe side of things + */ +#define MIN_CMD_FREQ 4 + for (i = MIN_CMD_FREQ; i <= 5; i++) { + uint32_t freq = ast_ahb_freq / i; + if (freq >= max_freq) + continue; + *ctl_val |= (ast_ct_hclk_divs[i - 1] << 8); + return i; + } + return 0; +} + +static int ast_sf_setup_macronix(struct ast_sf_ctrl *ct, struct flash_info *info) +{ + int rc, div __unused; + uint8_t srcr[2]; + + /* + * Those Macronix chips support dual reads at 104Mhz + * and dual IO at 84Mhz with 4 dummies. + * + * Our calibration algo should give us something along + * the lines of HCLK/3 (HCLK/2 seems to work sometimes + * but appears to be fairly unreliable) which is 64Mhz + * + * So we chose dual IO mode. + * + * The CE# inactive width for reads must be 7ns, we set it + * to 3T which is about 15ns at the fastest speed we support + * HCLK/2) as I've had issue with smaller values. 
+ * + * For write and program it's 30ns so let's set the value + * for normal ops to 6T. + * + * Preserve the current 4b mode. + */ + FL_DBG("AST: Setting up Macronix...\n"); + + /* + * Read the status and config registers + */ + rc = ast_sf_cmd_rd(&ct->ops, CMD_RDSR, false, 0, &srcr[0], 1); + if (rc != 0) { + FL_ERR("AST: Failed to read status\n"); + return rc; + } + rc = ast_sf_cmd_rd(&ct->ops, CMD_RDCR, false, 0, &srcr[1], 1); + if (rc != 0) { + FL_ERR("AST: Failed to read configuration\n"); + return rc; + } + + FL_DBG("AST: Macronix SR:CR: 0x%02x:%02x\n", srcr[0], srcr[1]); + + /* Switch to 8 dummy cycles to enable 104Mhz operations */ + srcr[1] = (srcr[1] & 0x3f) | 0x80; + + rc = fl_wren(&ct->ops); + if (rc) { + FL_ERR("AST: Failed to WREN for Macronix config\n"); + return rc; + } + + rc = ast_sf_cmd_wr(&ct->ops, CMD_WRSR, false, 0, srcr, 2); + if (rc != 0) { + FL_ERR("AST: Failed to write Macronix config\n"); + return rc; + } + rc = fl_sync_wait_idle(&ct->ops);; + if (rc != 0) { + FL_ERR("AST: Failed waiting for config write\n"); + return rc; + } + + FL_DBG("AST: Macronix SR:CR: 0x%02x:%02x\n", srcr[0], srcr[1]); + + /* Use 2READ */ + ct->ctl_read_val = (ct->ctl_read_val & 0x2000) | + (0x03 << 28) | /* Dual IO */ + (0x0d << 24) | /* CE# width 3T */ + (0xbb << 16) | /* 2READ command */ + (0x00 << 8) | /* HCLK/16 (optimize later) */ + (0x02 << 6) | /* 2 bytes dummy cycle (8 clocks) */ + (0x01); /* fast read */ + + /* Configure SPI flash read timing */ + rc = ast_sf_optimize_reads(ct, info, 104000000); + if (rc) { + FL_ERR("AST: Failed to setup proper read timings, rc=%d\n", rc); + return rc; + } + + /* + * For other commands and writes also increase the SPI clock + * to HCLK/2 since the chip supports up to 133Mhz and set + * CE# inactive to 6T. We request a timing that is 20% below + * the limit of the chip, so about 106Mhz which should fit. + */ + ct->ctl_val = (ct->ctl_val & 0x2000) | + (0x00 << 28) | /* Single bit */ + (0x0a << 24) | /* CE# width 6T (b1010) */ + (0x00 << 16) | /* no command */ + (0x00 << 8) | /* HCLK/16 (done later) */ + (0x00 << 6) | /* no dummy cycle */ + (0x00); /* normal read */ + + div = ast_sf_get_hclk(&ct->ctl_val, 106000000); + FL_DBG("AST: Command timing set to HCLK/%d\n", div); + + /* Update chip with current read config */ + ast_ahb_writel(ct->ctl_read_val, ct->ctl_reg); + return 0; +} + +static int ast_sf_setup_winbond(struct ast_sf_ctrl *ct, struct flash_info *info) +{ + int rc, div __unused; + + FL_DBG("AST: Setting up Windbond...\n"); + + /* + * This Windbond chip support dual reads at 104Mhz + * with 8 dummy cycles. + * + * The CE# inactive width for reads must be 10ns, we set it + * to 3T which is about 15.6ns. + */ + ct->ctl_read_val = (ct->ctl_read_val & 0x2000) | + (0x02 << 28) | /* Dual bit data only */ + (0x0e << 24) | /* CE# width 2T (b1110) */ + (0x3b << 16) | /* DREAD command */ + (0x00 << 8) | /* HCLK/16 */ + (0x01 << 6) | /* 1-byte dummy cycle */ + (0x01); /* fast read */ + + /* Configure SPI flash read timing */ + rc = ast_sf_optimize_reads(ct, info, 104000000); + if (rc) { + FL_ERR("AST: Failed to setup proper read timings, rc=%d\n", rc); + return rc; + } + + /* + * For other commands and writes also increase the SPI clock + * to HCLK/2 since the chip supports up to 133Mhz. CE# inactive + * for write and erase is 50ns so let's set it to 10T. 
+ */ + ct->ctl_val = (ct->ctl_read_val & 0x2000) | + (0x00 << 28) | /* Single bit */ + (0x06 << 24) | /* CE# width 10T (b0110) */ + (0x00 << 16) | /* no command */ + (0x00 << 8) | /* HCLK/16 */ + (0x00 << 6) | /* no dummy cycle */ + (0x01); /* fast read */ + + div = ast_sf_get_hclk(&ct->ctl_val, 106000000); + FL_DBG("AST: Command timing set to HCLK/%d\n", div); + + /* Update chip with current read config */ + ast_ahb_writel(ct->ctl_read_val, ct->ctl_reg); + return 0; +} + +static int ast_sf_setup_micron(struct ast_sf_ctrl *ct, struct flash_info *info) +{ + uint8_t vconf, ext_id[6]; + int rc, div __unused; + + FL_DBG("AST: Setting up Micron...\n"); + + /* + * Read the extended chip ID to try to detect old vs. new + * flashes since old Micron flashes have a lot of issues + */ + rc = ast_sf_cmd_rd(&ct->ops, CMD_RDID, false, 0, ext_id, 6); + if (rc != 0) { + FL_ERR("AST: Failed to read Micron ext ID, sticking to dumb speed\n"); + return 0; + } + /* Check ID matches expectations */ + if (ext_id[0] != ((info->id >> 16) & 0xff) || + ext_id[1] != ((info->id >> 8) & 0xff) || + ext_id[2] != ((info->id ) & 0xff)) { + FL_ERR("AST: Micron ext ID mismatch, sticking to dumb speed\n"); + return 0; + } + FL_DBG("AST: Micron ext ID byte: 0x%02x\n", ext_id[4]); + + /* Check for old (<45nm) chips, don't try to be fancy on those */ + if (!(ext_id[4] & 0x40)) { + FL_DBG("AST: Old chip, using dumb timings\n"); + goto dumb; + } + + /* + * Read the micron specific volatile configuration reg + */ + rc = ast_sf_cmd_rd(&ct->ops, CMD_MIC_RDVCONF, false, 0, &vconf, 1); + if (rc != 0) { + FL_ERR("AST: Failed to read Micron vconf, sticking to dumb speed\n"); + goto dumb; + } + FL_DBG("AST: Micron VCONF: 0x%02x\n", vconf); + + /* Switch to 8 dummy cycles (we might be able to operate with 4 + * but let's keep some margin + */ + vconf = (vconf & 0x0f) | 0x80; + + rc = ast_sf_cmd_wr(&ct->ops, CMD_MIC_WRVCONF, false, 0, &vconf, 1); + if (rc != 0) { + FL_ERR("AST: Failed to write Micron vconf, " + " sticking to dumb speed\n"); + goto dumb; + } + rc = fl_sync_wait_idle(&ct->ops);; + if (rc != 0) { + FL_ERR("AST: Failed waiting for config write\n"); + return rc; + } + FL_DBG("AST: Updated to : 0x%02x\n", vconf); + + /* + * Try to do full dual IO, with 8 dummy cycles it supports 133Mhz + * + * The CE# inactive width for reads must be 20ns, we set it + * to 4T which is about 20.8ns. + */ + ct->ctl_read_val = (ct->ctl_read_val & 0x2000) | + (0x03 << 28) | /* Single bit */ + (0x0c << 24) | /* CE# 4T */ + (0xbb << 16) | /* 2READ command */ + (0x00 << 8) | /* HCLK/16 (optimize later) */ + (0x02 << 6) | /* 8 dummy cycles (2 bytes) */ + (0x01); /* fast read */ + + /* Configure SPI flash read timing */ + rc = ast_sf_optimize_reads(ct, info, 133000000); + if (rc) { + FL_ERR("AST: Failed to setup proper read timings, rc=%d\n", rc); + return rc; + } + + /* + * For other commands and writes also increase the SPI clock + * to HCLK/2 since the chip supports up to 133Mhz. CE# inactive + * for write and erase is 50ns so let's set it to 10T. 
+ */ + ct->ctl_val = (ct->ctl_read_val & 0x2000) | + (0x00 << 28) | /* Single bit */ + (0x06 << 24) | /* CE# width 10T (b0110) */ + (0x00 << 16) | /* no command */ + (0x00 << 8) | /* HCLK/16 */ + (0x00 << 6) | /* no dummy cycle */ + (0x00); /* norm read */ + + div = ast_sf_get_hclk(&ct->ctl_val, 133000000); + FL_DBG("AST: Command timing set to HCLK/%d\n", div); + + /* Update chip with current read config */ + ast_ahb_writel(ct->ctl_read_val, ct->ctl_reg); + + return 0; + + dumb: + ct->ctl_val = ct->ctl_read_val = (ct->ctl_read_val & 0x2000) | + (0x00 << 28) | /* Single bit */ + (0x00 << 24) | /* CE# max */ + (0x03 << 16) | /* use normal reads */ + (0x06 << 8) | /* HCLK/4 */ + (0x00 << 6) | /* no dummy cycle */ + (0x00); /* normal read */ + + /* Update chip with current read config */ + ast_ahb_writel(ct->ctl_read_val, ct->ctl_reg); + + return 0; +} + +static int ast_sf_setup(struct spi_flash_ctrl *ctrl, uint32_t *tsize) +{ + struct ast_sf_ctrl *ct = container_of(ctrl, struct ast_sf_ctrl, ops); + struct flash_info *info = ctrl->finfo; + + (void)tsize; + + /* + * Configure better timings and read mode for known + * flash chips + */ + switch(info->id) { + case 0xc22018: /* MX25L12835F */ + case 0xc22019: /* MX25L25635F */ + case 0xc2201a: /* MX66L51235F */ + case 0xc2201b: /* MX66L1G45G */ + return ast_sf_setup_macronix(ct, info); + case 0xef4018: /* W25Q128BV */ + return ast_sf_setup_winbond(ct, info); + case 0x20ba20: /* MT25Qx512xx */ + return ast_sf_setup_micron(ct, info); + } + /* No special tuning */ + return 0; +} + +static bool ast_sf_init_pnor(struct ast_sf_ctrl *ct) +{ + uint32_t reg; + + ct->ctl_reg = PNOR_SPI_FCTL_CTRL; + ct->fread_timing_reg = PNOR_SPI_FREAD_TIMING; + ct->flash = PNOR_FLASH_BASE; + + /* Enable writing to the controller */ + reg = ast_ahb_readl(PNOR_SPI_FCTL_CONF); + if (reg == 0xffffffff) { + FL_ERR("AST_SF: Failed read from controller config\n"); + return false; + } + ast_ahb_writel(reg | 1, PNOR_SPI_FCTL_CONF); + + /* + * Snapshot control reg and sanitize it for our + * use, switching to 1-bit mode, clearing user + * mode if set, etc... + * + * Also configure SPI clock to something safe + * like HCLK/8 (24Mhz) + */ + ct->ctl_val = ast_ahb_readl(ct->ctl_reg); + if (ct->ctl_val == 0xffffffff) { + FL_ERR("AST_SF: Failed read from controller control\n"); + return false; + } + + ct->ctl_val = (ct->ctl_val & 0x2000) | + (0x00 << 28) | /* Single bit */ + (0x00 << 24) | /* CE# width 16T */ + (0x00 << 16) | /* no command */ + (0x04 << 8) | /* HCLK/8 */ + (0x00 << 6) | /* no dummy cycle */ + (0x00); /* normal read */ + + /* Initial read mode is default */ + ct->ctl_read_val = ct->ctl_val; + + /* Initial read timings all 0 */ + ct->fread_timing_val = 0; + + /* Configure for read */ + ast_ahb_writel(ct->ctl_read_val, ct->ctl_reg); + ast_ahb_writel(ct->fread_timing_val, ct->fread_timing_reg); + + if (ct->ctl_val & 0x2000) + ct->mode_4b = true; + else + ct->mode_4b = false; + + return true; +} + +static bool ast_sf_init_bmc(struct ast_sf_ctrl *ct) +{ + ct->ctl_reg = BMC_SPI_FCTL_CTRL; + ct->fread_timing_reg = BMC_SPI_FREAD_TIMING; + ct->flash = BMC_FLASH_BASE; + + /* + * Snapshot control reg and sanitize it for our + * use, switching to 1-bit mode, clearing user + * mode if set, etc... 
+ * + * Also configure SPI clock to something safe + * like HCLK/8 (24Mhz) + */ + ct->ctl_val = + (0x00 << 28) | /* Single bit */ + (0x00 << 24) | /* CE# width 16T */ + (0x00 << 16) | /* no command */ + (0x04 << 8) | /* HCLK/8 */ + (0x00 << 6) | /* no dummy cycle */ + (0x00); /* normal read */ + + /* Initial read mode is default */ + ct->ctl_read_val = ct->ctl_val; + + /* Initial read timings all 0 */ + ct->fread_timing_val = 0; + + /* Configure for read */ + ast_ahb_writel(ct->ctl_read_val, ct->ctl_reg); + ast_ahb_writel(ct->fread_timing_val, ct->fread_timing_reg); + + ct->mode_4b = false; + + return true; +} + +static int ast_mem_set4b(struct spi_flash_ctrl *ctrl __unused, + bool enable __unused) +{ + return 0; +} + +static int ast_mem_setup(struct spi_flash_ctrl *ctrl __unused, + uint32_t *tsize __unused) +{ + return 0; +} + +static int ast_mem_chipid(struct spi_flash_ctrl *ctrl __unused, uint8_t *id_buf, + uint32_t *id_size) +{ + if (*id_size < 3) + return -1; + + id_buf[0] = 0xaa; + id_buf[1] = 0x55; + id_buf[2] = 0xaa; + *id_size = 3; + return 0; +} + +static int ast_mem_write(struct spi_flash_ctrl *ctrl, uint32_t pos, + const void *buf, uint32_t len) +{ + struct ast_sf_ctrl *ct = container_of(ctrl, struct ast_sf_ctrl, ops); + + /* + * This only works when the ahb is pointed at system memory. + */ + return ast_copy_to_ahb(ct->flash + pos, buf, len); +} + +static int ast_mem_erase(struct spi_flash_ctrl *ctrl, uint32_t addr, uint32_t size) +{ + struct ast_sf_ctrl *ct = container_of(ctrl, struct ast_sf_ctrl, ops); + uint32_t pos, len, end = addr + size; + uint64_t zero = 0; + int ret; + + for (pos = addr; pos < end; pos += sizeof(zero)) { + if (pos + sizeof(zero) > end) + len = end - pos; + else + len = sizeof(zero); + + ret = ast_copy_to_ahb(ct->flash + pos, &zero, len); + if (ret) + return ret; + } + + return 0; +} + +int ast_sf_open(uint8_t type, struct spi_flash_ctrl **ctrl) +{ + struct ast_sf_ctrl *ct; +#ifdef __SKIBOOT__ + uint32_t hicr7; + + if (!ast_sio_is_enabled()) + return -ENODEV; +#endif /* __SKIBOOT__ */ + + if (type != AST_SF_TYPE_PNOR && type != AST_SF_TYPE_BMC + && type != AST_SF_TYPE_MEM) + return -EINVAL; + + *ctrl = NULL; + ct = malloc(sizeof(*ct)); + if (!ct) { + FL_ERR("AST_SF: Failed to allocate\n"); + return -ENOMEM; + } + memset(ct, 0, sizeof(*ct)); + ct->type = type; + + if (type == AST_SF_TYPE_MEM) { + ct->ops.cmd_wr = NULL; + ct->ops.cmd_rd = NULL; + ct->ops.read = ast_sf_read; + ct->ops.set_4b = ast_mem_set4b; + ct->ops.write = ast_mem_write; + ct->ops.erase = ast_mem_erase; + ct->ops.setup = ast_mem_setup; + ct->ops.chip_id = ast_mem_chipid; + ct->flash = PNOR_FLASH_BASE; + } else { + ct->ops.cmd_wr = ast_sf_cmd_wr; + ct->ops.cmd_rd = ast_sf_cmd_rd; + ct->ops.set_4b = ast_sf_set_4b; + ct->ops.read = ast_sf_read; + ct->ops.setup = ast_sf_setup; + } + + ast_get_ahb_freq(); + + if (type == AST_SF_TYPE_PNOR) { + if (!ast_sf_init_pnor(ct)) + goto fail; + } else if (type == AST_SF_TYPE_BMC) { + if (!ast_sf_init_bmc(ct)) + goto fail; + } + +#ifdef __SKIBOOT__ + /* Read the configuration of the LPC->AHB bridge for PNOR + * to extract the PNOR LPC offset which can be different + * depending on flash size + */ + hicr7 = ast_ahb_readl(LPC_HICR7); + pnor_lpc_offset = (hicr7 & 0xffffu) << 16; + prlog(PR_DEBUG, "AST: PNOR LPC offset: 0x%08x\n", pnor_lpc_offset); +#endif /* __SKIBOOT__ */ + + *ctrl = &ct->ops; + + return 0; + fail: + free(ct); + return -EIO; +} + +void ast_sf_close(struct spi_flash_ctrl *ctrl) +{ + struct ast_sf_ctrl *ct = container_of(ctrl, struct 
ast_sf_ctrl, ops); + + /* Restore control reg to read */ + ast_ahb_writel(ct->ctl_read_val, ct->ctl_reg); + + /* Additional cleanup */ + if (ct->type == AST_SF_TYPE_PNOR) { + uint32_t reg = ast_ahb_readl(PNOR_SPI_FCTL_CONF); + if (reg != 0xffffffff) + ast_ahb_writel(reg & ~1, PNOR_SPI_FCTL_CONF); + } + + /* Free the whole lot */ + free(ct); +} diff --git a/roms/skiboot/hw/bt.c b/roms/skiboot/hw/bt.c new file mode 100644 index 000000000..5016feab6 --- /dev/null +++ b/roms/skiboot/hw/bt.c @@ -0,0 +1,720 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* + * Block Transfer, typically what IPMI goes over + * + * Copyright 2013-2019 IBM Corp. + */ + +#define pr_fmt(fmt) "BT: " fmt + +#include <skiboot.h> +#include <lpc.h> +#include <lock.h> +#include <device.h> +#include <timebase.h> +#include <ipmi.h> +#include <bt.h> +#include <timer.h> +#include <ipmi.h> +#include <timebase.h> +#include <chip.h> +#include <interrupts.h> + +/* BT registers */ +#define BT_CTRL 0 +#define BT_CTRL_B_BUSY 0x80 +#define BT_CTRL_H_BUSY 0x40 +#define BT_CTRL_OEM0 0x20 +#define BT_CTRL_SMS_ATN 0x10 +#define BT_CTRL_B2H_ATN 0x08 +#define BT_CTRL_H2B_ATN 0x04 +#define BT_CTRL_CLR_RD_PTR 0x02 +#define BT_CTRL_CLR_WR_PTR 0x01 +#define BT_HOST2BMC 1 +#define BT_INTMASK 2 +#define BT_INTMASK_B2H_IRQEN 0x01 +#define BT_INTMASK_B2H_IRQ 0x02 +#define BT_INTMASK_BMC_HWRST 0x80 + +/* Maximum size of the HW FIFO */ +#define BT_FIFO_LEN 64 + +/* Default poll interval before interrupts are working */ +#define BT_DEFAULT_POLL_MS 200 + +/* + * Minimum size of an IPMI request/response including + * mandatory headers. + */ +#define BT_MIN_REQ_LEN 3 +#define BT_MIN_RESP_LEN 4 + +/* How long (in uS) to poll for new ipmi data. */ +#define POLL_TIMEOUT 10000 + +/* Maximum number of outstanding messages to allow in the queue. */ +#define BT_MAX_QUEUE_LEN 10 + +/* How long (in seconds) before a message is timed out. */ +#define BT_MSG_TIMEOUT 3 + +/* Maximum number of times to attempt sending a message before giving up. */ +#define BT_MAX_RETRIES 1 + +/* Macro to enable printing BT message queue for debug */ +#define BT_QUEUE_DEBUG 0 + +/* BT message logging macros */ +#define _BT_Q_LOG(level, msg, fmt, args...) \ + do { if (msg) \ + prlog(level, "seq 0x%02x netfn 0x%02x cmd 0x%02x: " fmt "\n", \ + (msg)->seq, ((msg)->ipmi_msg.netfn >> 2), (msg)->ipmi_msg.cmd, ##args); \ + else \ + prlog(level, "seq 0x?? netfn 0x?? cmd 0x??: " fmt "\n", ##args); \ + } while (0) + +#define BT_Q_ERR(msg, fmt, args...) \ + _BT_Q_LOG(PR_ERR, msg, fmt, ##args) + +#define BT_Q_DBG(msg, fmt, args...) \ + _BT_Q_LOG(PR_DEBUG, msg, fmt, ##args) + +#define BT_Q_TRACE(msg, fmt, args...) 
\ + _BT_Q_LOG(PR_TRACE, msg, fmt, ##args) + +struct bt_msg { + struct list_node link; + unsigned long tb; + uint8_t seq; + uint8_t send_count; + bool disable_retry; + struct ipmi_msg ipmi_msg; +}; + +struct bt_caps { + uint8_t num_requests; + uint16_t input_buf_len; + uint16_t output_buf_len; + uint8_t msg_timeout; + uint8_t max_retries; +}; + +struct bt { + uint32_t base_addr; + struct lock lock; + struct list_head msgq; + struct list_head msgq_sync; /* separate list for synchronous messages */ + struct timer poller; + bool irq_ok; + int queue_len; + struct bt_caps caps; +}; + +static struct bt bt; +static struct bt_msg *inflight_bt_msg; /* Holds in flight message */ + +static int ipmi_seq; + +static inline uint8_t bt_inb(uint32_t reg) +{ + return lpc_inb(bt.base_addr + reg); +} + +static inline void bt_outb(uint8_t data, uint32_t reg) +{ + lpc_outb(data, bt.base_addr + reg); +} + +static inline void bt_set_h_busy(bool value) +{ + uint8_t rval; + + rval = bt_inb(BT_CTRL); + if (value != !!(rval & BT_CTRL_H_BUSY)) + bt_outb(BT_CTRL_H_BUSY, BT_CTRL); +} + +static inline void bt_assert_h_busy(void) +{ + uint8_t rval; + rval = bt_inb(BT_CTRL); + assert(rval & BT_CTRL_H_BUSY); +} + +static void get_bt_caps_complete(struct ipmi_msg *msg) +{ + /* Ignore errors, we'll fallback to using the defaults, no big deal */ + if (msg->data[0] == 0) { + prlog(PR_DEBUG, "Got illegal BMC BT capability\n"); + goto out; + } + + if (msg->data[1] != BT_FIFO_LEN) { + prlog(PR_DEBUG, "Got a input buffer len (%u) cap which differs from the default\n", + msg->data[1]); + } + + if (msg->data[2] != BT_FIFO_LEN) { + prlog(PR_DEBUG, "Got a output buffer len (%u) cap which differs from the default\n", + msg->data[2]); + } + + /* + * IPMI Spec says that the value for buffer sizes are: + * "the largest value allowed in first byte" + * Therefore we want to add one to what we get + */ + bt.caps.num_requests = msg->data[0]; + bt.caps.input_buf_len = msg->data[1] + 1; + bt.caps.output_buf_len = msg->data[2] + 1; + bt.caps.msg_timeout = msg->data[3]; + bt.caps.max_retries = msg->data[4]; + prlog(PR_DEBUG, "BMC BT capabilities received:\n"); + prlog(PR_DEBUG, "buffer sizes: %d input %d output\n", + bt.caps.input_buf_len, bt.caps.output_buf_len); + prlog(PR_DEBUG, "number of requests: %d\n", bt.caps.num_requests); + prlog(PR_DEBUG, "msg timeout: %d max retries: %d\n", + bt.caps.msg_timeout, bt.caps.max_retries); + +out: + ipmi_free_msg(msg); +} + +static void get_bt_caps(void) +{ + + struct ipmi_msg *bmc_caps; + /* + * Didn't sent a message, now is a good time to ask the BMC for its + * capabilities. 
+ */ + bmc_caps = ipmi_mkmsg(IPMI_DEFAULT_INTERFACE, IPMI_GET_BT_CAPS, + get_bt_caps_complete, NULL, NULL, 0, sizeof(struct bt_caps)); + if (!bmc_caps) + prerror("Couldn't create BMC BT capabilities msg\n"); + + if (bmc_caps && ipmi_queue_msg(bmc_caps)) + prerror("Couldn't enqueue request for BMC BT capabilities\n"); + + /* Ignore errors, we'll fallback to using the defaults, no big deal */ +} + +static inline bool bt_idle(void) +{ + uint8_t bt_ctrl = bt_inb(BT_CTRL); + + return !(bt_ctrl & BT_CTRL_B_BUSY) && !(bt_ctrl & BT_CTRL_H2B_ATN); +} + +/* Must be called with bt.lock held */ +static void bt_msg_del(struct bt_msg *bt_msg) +{ + list_del(&bt_msg->link); + bt.queue_len--; + + /* once inflight_bt_msg out of list, it should be emptyed */ + if (bt_msg == inflight_bt_msg) + inflight_bt_msg = NULL; + + unlock(&bt.lock); + ipmi_cmd_done(bt_msg->ipmi_msg.cmd, + IPMI_NETFN_RETURN_CODE(bt_msg->ipmi_msg.netfn), + IPMI_TIMEOUT_ERR, &bt_msg->ipmi_msg); + lock(&bt.lock); +} + +static void bt_init_interface(void) +{ + /* Clear interrupt condition & enable irq */ + bt_outb(BT_INTMASK_B2H_IRQ | BT_INTMASK_B2H_IRQEN, BT_INTMASK); + + /* Take care of a stable H_BUSY if any */ + bt_set_h_busy(false); +} + +static void bt_reset_interface(void) +{ + bt_outb(BT_INTMASK_BMC_HWRST, BT_INTMASK); + bt_init_interface(); +} + +/* + * Try and send a message from the message queue. Caller must hold + * bt.bt_lock and bt.lock and ensue the message queue is not + * empty. + */ +static void bt_send_msg(struct bt_msg *bt_msg) +{ + int i; + struct ipmi_msg *ipmi_msg; + + ipmi_msg = &bt_msg->ipmi_msg; + + /* Send the message */ + bt_outb(BT_CTRL_CLR_WR_PTR, BT_CTRL); + + /* Byte 1 - Length */ + bt_outb(ipmi_msg->req_size + BT_MIN_REQ_LEN, BT_HOST2BMC); + + /* Byte 2 - NetFn/LUN */ + bt_outb(ipmi_msg->netfn, BT_HOST2BMC); + + /* Byte 3 - Seq */ + bt_outb(bt_msg->seq, BT_HOST2BMC); + + /* Byte 4 - Cmd */ + bt_outb(ipmi_msg->cmd, BT_HOST2BMC); + + /* Byte 5:N - Data */ + for (i = 0; i < ipmi_msg->req_size; i++) + bt_outb(ipmi_msg->data[i], BT_HOST2BMC); + + BT_Q_TRACE(bt_msg, "Message sent to host"); + bt_msg->send_count++; + + bt_outb(BT_CTRL_H2B_ATN, BT_CTRL); + + return; +} + +static void bt_clear_fifo(void) +{ + int i; + + for (i = 0; i < bt.caps.input_buf_len; i++) + bt_outb(0xff, BT_HOST2BMC); +} + +static void bt_flush_msg(void) +{ + bt_assert_h_busy(); + bt_outb(BT_CTRL_B2H_ATN | BT_CTRL_CLR_RD_PTR | BT_CTRL_CLR_WR_PTR, BT_CTRL); + bt_clear_fifo(); + /* Can't hurt to clear the write pointer again, just to be sure */ + bt_outb(BT_CTRL_CLR_WR_PTR, BT_CTRL); + bt_set_h_busy(false); +} + +static void bt_get_resp(void) +{ + int i; + struct ipmi_msg *ipmi_msg; + uint8_t resp_len, netfn, seq, cmd; + uint8_t cc = IPMI_CC_NO_ERROR; + + /* Indicate to the BMC that we are busy */ + bt_set_h_busy(true); + + /* Clear B2H_ATN and read pointer */ + bt_outb(BT_CTRL_B2H_ATN, BT_CTRL); + bt_outb(BT_CTRL_CLR_RD_PTR, BT_CTRL); + + /* Read the response */ + /* Byte 1 - Length (includes header size) */ + resp_len = bt_inb(BT_HOST2BMC) - BT_MIN_RESP_LEN; + + /* Byte 2 - NetFn/LUN */ + netfn = bt_inb(BT_HOST2BMC); + + /* Byte 3 - Seq */ + seq = bt_inb(BT_HOST2BMC); + + /* Byte 4 - Cmd */ + cmd = bt_inb(BT_HOST2BMC); + + /* Byte 5 - Completion Code */ + cc = bt_inb(BT_HOST2BMC); + + /* Find the corresponding message */ + if (inflight_bt_msg == NULL || inflight_bt_msg->seq != seq) { + /* A response to a message we no longer care about. 
*/ + prlog(PR_INFO, "Nobody cared about a response to an BT/IPMI message" + "(seq 0x%02x netfn 0x%02x cmd 0x%02x)\n", seq, (netfn >> 2), cmd); + bt_flush_msg(); + return; + } + + ipmi_msg = &inflight_bt_msg->ipmi_msg; + + /* + * Make sure we have enough room to store the response. As all values + * are unsigned we will also trigger this error if + * bt_inb(BT_HOST2BMC) < BT_MIN_RESP_LEN (which should never occur). + */ + if (resp_len > ipmi_msg->resp_size) { + BT_Q_ERR(inflight_bt_msg, "Invalid resp_len %d", resp_len); + resp_len = ipmi_msg->resp_size; + cc = IPMI_ERR_MSG_TRUNCATED; + } + ipmi_msg->resp_size = resp_len; + + /* Byte 6:N - Data */ + for (i = 0; i < resp_len; i++) + ipmi_msg->data[i] = bt_inb(BT_HOST2BMC); + bt_set_h_busy(false); + + BT_Q_TRACE(inflight_bt_msg, "IPMI MSG done"); + + list_del(&inflight_bt_msg->link); + /* Ready to send next message */ + inflight_bt_msg = NULL; + bt.queue_len--; + unlock(&bt.lock); + + /* Call IPMI layer to finish processing the message. */ + ipmi_cmd_done(cmd, netfn, cc, ipmi_msg); + lock(&bt.lock); + + return; +} + +static void bt_expire_old_msg(uint64_t tb) +{ + struct bt_msg *bt_msg = inflight_bt_msg; + + if (bt_msg && bt_msg->tb > 0 && !chip_quirk(QUIRK_SIMICS) && + (tb_compare(tb, bt_msg->tb + + secs_to_tb(bt.caps.msg_timeout)) == TB_AAFTERB)) { + if (bt_msg->send_count <= bt.caps.max_retries && + !bt_msg->disable_retry) { + /* A message timeout is usually due to the BMC + * clearing the H2B_ATN flag without actually + * doing anything. The data will still be in the + * FIFO so just reset the flag.*/ + BT_Q_ERR(bt_msg, "Retry sending message"); + + /* This means we have started message timeout, but not + * yet sent message to BMC as driver was not free to + * send message. Lets resend message. + */ + if (bt_msg->send_count == 0) + bt_send_msg(bt_msg); + else + bt_outb(BT_CTRL_H2B_ATN, BT_CTRL); + + bt_msg->send_count++; + bt_msg->tb = tb; + } else { + BT_Q_ERR(bt_msg, "Timeout sending message"); + bt_msg_del(bt_msg); + + /* + * Timing out a message is inherently racy as the BMC + * may start writing just as we decide to kill the + * message. Hopefully resetting the interface is + * sufficient to guard against such things. + */ + bt_reset_interface(); + } + } +} + +#if BT_QUEUE_DEBUG +static void print_debug_queue_info(void) +{ + struct bt_msg *msg; + static bool printed; + + if (!list_empty(&bt.msgq_sync) || !list_empty(&bt.msgq)) { + printed = false; + prlog(PR_DEBUG, "-------- BT Sync Msg Queue -------\n"); + list_for_each(&bt.msgq_sync, msg, link) { + BT_Q_DBG(msg, "[ sent %d ]", msg->send_count); + } + prlog(PR_DEBUG, "---------- BT Msg Queue ----------\n"); + list_for_each(&bt.msgq, msg, link) { + BT_Q_DBG(msg, "[ sent %d ]", msg->send_count); + } + prlog(PR_DEBUG, "----------------------------------\n"); + } else if (!printed) { + printed = true; + prlog(PR_DEBUG, "------- BT Msg Queue Empty -------\n"); + } +} +#endif + +static void bt_send_and_unlock(void) +{ + /* Busy? */ + if (inflight_bt_msg) + goto out_unlock; + + if (!lpc_ok()) + goto out_unlock; + + /* Synchronous messages gets priority over normal message */ + if (!list_empty(&bt.msgq_sync)) + inflight_bt_msg = list_top(&bt.msgq_sync, struct bt_msg, link); + else if (!list_empty(&bt.msgq)) + inflight_bt_msg = list_top(&bt.msgq, struct bt_msg, link); + else + goto out_unlock; + + assert(inflight_bt_msg); + /* + * Start the message timeout once it gets to the top + * of the queue. 
This will ensure we timeout messages + * in the case of a broken bt interface as occurs when + * the BMC is not responding to any IPMI messages. + */ + if (inflight_bt_msg->tb == 0) + inflight_bt_msg->tb = mftb(); + + /* + * Only send it if we haven't already. + * Timeouts and retries happen in bt_expire_old_msg() + * called from bt_poll() + */ + if (bt_idle() && inflight_bt_msg->send_count == 0) + bt_send_msg(inflight_bt_msg); + +out_unlock: + unlock(&bt.lock); +} + +static void bt_poll(struct timer *t __unused, void *data __unused, + uint64_t now) +{ + uint8_t bt_ctrl; + + /* Don't do anything if the LPC bus is offline */ + if (!lpc_ok()) + return; + + /* + * If we can't get the lock assume someone else will notice + * the new message and process it. + */ + lock(&bt.lock); + +#if BT_QUEUE_DEBUG + print_debug_queue_info(); +#endif + + bt_ctrl = bt_inb(BT_CTRL); + + /* Is there a response waiting for us? */ + if (bt_ctrl & BT_CTRL_B2H_ATN) + bt_get_resp(); + + bt_expire_old_msg(now); + + /* Check for sms_atn */ + if (bt_inb(BT_CTRL) & BT_CTRL_SMS_ATN) { + bt_outb(BT_CTRL_SMS_ATN, BT_CTRL); + unlock(&bt.lock); + ipmi_sms_attention(); + lock(&bt.lock); + } + + /* + * Send messages if we can. If the BMC was really quick we + * could loop back to the start and check for a response + * instead of unlocking, but testing shows the BMC isn't that + * fast so we will wait for the IRQ or a call to the pollers instead. + */ + bt_send_and_unlock(); + + schedule_timer(&bt.poller, + bt.irq_ok ? TIMER_POLL : msecs_to_tb(BT_DEFAULT_POLL_MS)); +} + +static void bt_ipmi_poll(void) +{ + bt_poll(NULL, NULL, mftb()); +} + +static void bt_add_msg(struct bt_msg *bt_msg) +{ + bt_msg->tb = 0; + bt_msg->seq = ipmi_seq++; + bt_msg->send_count = 0; + bt.queue_len++; + if (bt.queue_len > BT_MAX_QUEUE_LEN) { + /* Maximum queue length exceeded, remove oldest messages. */ + BT_Q_ERR(bt_msg, "Maximum queue length exceeded"); + /* First try to remove message from normal queue */ + if (!list_empty(&bt.msgq)) + bt_msg = list_tail(&bt.msgq, struct bt_msg, link); + else if (!list_empty(&bt.msgq_sync)) + bt_msg = list_tail(&bt.msgq_sync, struct bt_msg, link); + assert(bt_msg); + BT_Q_ERR(bt_msg, "Removed from queue"); + bt_msg_del(bt_msg); + } +} + +/* Add message to synchronous message list */ +static int bt_add_ipmi_msg_head(struct ipmi_msg *ipmi_msg) +{ + struct bt_msg *bt_msg = container_of(ipmi_msg, struct bt_msg, ipmi_msg); + + lock(&bt.lock); + bt_add_msg(bt_msg); + list_add_tail(&bt.msgq_sync, &bt_msg->link); + bt_send_and_unlock(); + + return 0; +} + +static int bt_add_ipmi_msg(struct ipmi_msg *ipmi_msg) +{ + struct bt_msg *bt_msg = container_of(ipmi_msg, struct bt_msg, ipmi_msg); + + lock(&bt.lock); + bt_add_msg(bt_msg); + list_add_tail(&bt.msgq, &bt_msg->link); + bt_send_and_unlock(); + + return 0; +} + +static void bt_irq(uint32_t chip_id __unused, uint32_t irq_mask __unused) +{ + uint8_t ireg; + + ireg = bt_inb(BT_INTMASK); + + bt.irq_ok = true; + if (ireg & BT_INTMASK_B2H_IRQ) { + bt_outb(BT_INTMASK_B2H_IRQ | BT_INTMASK_B2H_IRQEN, BT_INTMASK); + bt_poll(NULL, NULL, mftb()); + } +} + +/* + * Allocate an ipmi message and bt container and return the ipmi + * message struct. Allocates enough space for the request and response + * data. 
+ */ +static struct ipmi_msg *bt_alloc_ipmi_msg(size_t request_size, size_t response_size) +{ + struct bt_msg *bt_msg; + + bt_msg = zalloc(sizeof(struct bt_msg) + MAX(request_size, response_size)); + if (!bt_msg) + return NULL; + + bt_msg->ipmi_msg.req_size = request_size; + bt_msg->ipmi_msg.resp_size = response_size; + bt_msg->ipmi_msg.data = (uint8_t *) (bt_msg + 1); + + return &bt_msg->ipmi_msg; +} + +/* + * Free a previously allocated ipmi message. + */ +static void bt_free_ipmi_msg(struct ipmi_msg *ipmi_msg) +{ + struct bt_msg *bt_msg = container_of(ipmi_msg, struct bt_msg, ipmi_msg); + + free(bt_msg); +} + +/* + * Do not resend IPMI messages to BMC. + */ +static void bt_disable_ipmi_msg_retry(struct ipmi_msg *ipmi_msg) +{ + struct bt_msg *bt_msg = container_of(ipmi_msg, struct bt_msg, ipmi_msg); + + bt_msg->disable_retry = true; +} + +/* + * Remove a message from the queue. The memory allocated for the ipmi message + * will need to be freed by the caller with bt_free_ipmi_msg() as it will no + * longer be in the queue of messages. + */ +static int bt_del_ipmi_msg(struct ipmi_msg *ipmi_msg) +{ + struct bt_msg *bt_msg = container_of(ipmi_msg, struct bt_msg, ipmi_msg); + + lock(&bt.lock); + list_del(&bt_msg->link); + bt.queue_len--; + bt_send_and_unlock(); + return 0; +} + +static struct ipmi_backend bt_backend = { + .alloc_msg = bt_alloc_ipmi_msg, + .free_msg = bt_free_ipmi_msg, + .queue_msg = bt_add_ipmi_msg, + .queue_msg_head = bt_add_ipmi_msg_head, + .dequeue_msg = bt_del_ipmi_msg, + .disable_retry = bt_disable_ipmi_msg_retry, + .poll = bt_ipmi_poll, +}; + +static struct lpc_client bt_lpc_client = { + .interrupt = bt_irq, +}; + +void bt_init(void) +{ + struct dt_node *n; + const struct dt_property *prop; + uint32_t irq; + + /* Set sane capability defaults */ + bt.caps.num_requests = 1; + bt.caps.input_buf_len = BT_FIFO_LEN; + bt.caps.output_buf_len = BT_FIFO_LEN; + bt.caps.msg_timeout = BT_MSG_TIMEOUT; + bt.caps.max_retries = BT_MAX_RETRIES; + + /* We support only one */ + n = dt_find_compatible_node(dt_root, NULL, "ipmi-bt"); + if (!n) { + prerror("No BT device\n"); + return; + } + + /* Get IO base */ + prop = dt_find_property(n, "reg"); + if (!prop) { + prerror("Can't find reg property\n"); + return; + } + if (dt_property_get_cell(prop, 0) != OPAL_LPC_IO) { + prerror("Only supports IO addresses\n"); + return; + } + bt.base_addr = dt_property_get_cell(prop, 1); + init_timer(&bt.poller, bt_poll, NULL); + + bt_init_interface(); + init_lock(&bt.lock); + + /* + * The iBT interface comes up in the busy state until the daemon has + * initialised it. 
+ */ + list_head_init(&bt.msgq); + list_head_init(&bt.msgq_sync); + inflight_bt_msg = NULL; + bt.queue_len = 0; + + prlog(PR_INFO, "Interface initialized, IO 0x%04x\n", bt.base_addr); + + ipmi_register_backend(&bt_backend); + + /* + * We initially schedule the poller as a relatively fast timer, at + * least until we have at least one interrupt occurring at which + * point we turn it into a background poller + */ + schedule_timer(&bt.poller, msecs_to_tb(BT_DEFAULT_POLL_MS)); + + irq = dt_prop_get_u32(n, "interrupts"); + bt_lpc_client.interrupts = LPC_IRQ(irq); + lpc_register_client(dt_get_chip_id(n), &bt_lpc_client, + IRQ_ATTR_TARGET_OPAL); + + /* Enqueue an IPMI message to ask the BMC about its BT capabilities */ + get_bt_caps(); + + prlog(PR_DEBUG, "Using LPC IRQ %d\n", irq); +} diff --git a/roms/skiboot/hw/cache-p9.c b/roms/skiboot/hw/cache-p9.c new file mode 100644 index 000000000..fb5ce3087 --- /dev/null +++ b/roms/skiboot/hw/cache-p9.c @@ -0,0 +1,162 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* + * Copyright 2019 IBM Corp. + */ + +#include <skiboot.h> +#include <chip.h> +#include <xscom.h> +#include <timebase.h> +#include <xscom-p9-regs.h> +#include <cache-p9.h> + +/* Registers and bits used to clear the L2 and L3 cache */ +#define L2_PRD_PURGE_CMD_REG 0x1080e +#define L2_PRD_PURGE_CMD_TRIGGER PPC_BIT(0) +#define L2_PRD_PURGE_CMD_TYPE_MASK PPC_BITMASK(1, 4) +#define L2CAC_FLUSH 0x0 +#define L2_PRD_PURGE_CMD_REG_BUSY PPC_BIT(9) +#define L3_PRD_PURGE_REG 0x1180e +#define L3_PRD_PURGE_REQ PPC_BIT(0) +#define L3_PRD_PURGE_TTYPE_MASK PPC_BITMASK(1, 4) +#define L3_FULL_PURGE 0x0 + +#define L2_L3_PRD_PURGE_TIMEOUT_MS 20 + +static int start_l2_purge(uint32_t chip_id, uint32_t core_id) +{ + uint64_t addr = XSCOM_ADDR_P9_EX(core_id, L2_PRD_PURGE_CMD_REG); + int rc; + + rc = xscom_write_mask(chip_id, addr, L2CAC_FLUSH, + L2_PRD_PURGE_CMD_TYPE_MASK); + if (!rc) + rc = xscom_write_mask(chip_id, addr, L2_PRD_PURGE_CMD_TRIGGER, + L2_PRD_PURGE_CMD_TRIGGER); + if (rc) + prlog(PR_ERR, "PURGE L2 on core 0x%x: XSCOM write_mask " + "failed %i\n", core_id, rc); + return rc; +} + +static int wait_l2_purge(uint32_t chip_id, uint32_t core_id) +{ + uint64_t val; + uint64_t addr = XSCOM_ADDR_P9_EX(core_id, L2_PRD_PURGE_CMD_REG); + unsigned long now = mftb(); + unsigned long end = now + msecs_to_tb(L2_L3_PRD_PURGE_TIMEOUT_MS); + int rc; + + while (1) { + rc = xscom_read(chip_id, addr, &val); + if (rc) { + prlog(PR_ERR, "PURGE L2 on core 0x%x: XSCOM read " + "failed %i\n", core_id, rc); + break; + } + if (!(val & L2_PRD_PURGE_CMD_REG_BUSY)) + break; + now = mftb(); + if (tb_compare(now, end) == TB_AAFTERB) { + prlog(PR_ERR, "PURGE L2 on core 0x%x timed out %i\n", + core_id, rc); + return OPAL_BUSY; + } + } + + /* We have to clear the trigger bit ourselves */ + val &= ~L2_PRD_PURGE_CMD_TRIGGER; + rc = xscom_write(chip_id, addr, val); + if (rc) + prlog(PR_ERR, "PURGE L2 on core 0x%x: XSCOM write failed %i\n", + core_id, rc); + return rc; +} + +static int start_l3_purge(uint32_t chip_id, uint32_t core_id) +{ + uint64_t addr = XSCOM_ADDR_P9_EX(core_id, L3_PRD_PURGE_REG); + int rc; + + rc = xscom_write_mask(chip_id, addr, L3_FULL_PURGE, + L3_PRD_PURGE_TTYPE_MASK); + if (!rc) + rc = xscom_write_mask(chip_id, addr, L3_PRD_PURGE_REQ, + L3_PRD_PURGE_REQ); + if (rc) + prlog(PR_ERR, "PURGE L3 on core 0x%x: XSCOM write_mask " + "failed %i\n", core_id, rc); + return rc; +} + +static int wait_l3_purge(uint32_t chip_id, uint32_t core_id) +{ + uint64_t val; + uint64_t addr = XSCOM_ADDR_P9_EX(core_id, 
L3_PRD_PURGE_REG); + unsigned long now = mftb(); + unsigned long end = now + msecs_to_tb(L2_L3_PRD_PURGE_TIMEOUT_MS); + int rc; + + /* Trigger bit is automatically set to zero when flushing is done */ + while (1) { + rc = xscom_read(chip_id, addr, &val); + if (rc) { + prlog(PR_ERR, "PURGE L3 on core 0x%x: XSCOM read " + "failed %i\n", core_id, rc); + break; + } + if (!(val & L3_PRD_PURGE_REQ)) + break; + now = mftb(); + if (tb_compare(now, end) == TB_AAFTERB) { + prlog(PR_ERR, "PURGE L3 on core 0x%x timed out %i\n", + core_id, rc); + return OPAL_BUSY; + } + } + return rc; +} + +int64_t purge_l2_l3_caches(void) +{ + struct cpu_thread *t; + uint64_t core_id, prev_core_id = (uint64_t)-1; + int rc; + unsigned long now = mftb(); + + for_each_ungarded_cpu(t) { + /* Only need to do it once per core chiplet */ + core_id = pir_to_core_id(t->pir); + if (prev_core_id == core_id) + continue; + prev_core_id = core_id; + rc = start_l2_purge(t->chip_id, core_id); + if (rc) + goto trace_exit; + rc = start_l3_purge(t->chip_id, core_id); + if (rc) + goto trace_exit; + } + + prev_core_id = (uint64_t)-1; + for_each_ungarded_cpu(t) { + /* Only need to do it once per core chiplet */ + core_id = pir_to_core_id(t->pir); + if (prev_core_id == core_id) + continue; + prev_core_id = core_id; + + rc = wait_l2_purge(t->chip_id, core_id); + if (rc) + goto trace_exit; + rc = wait_l3_purge(t->chip_id, core_id); + if (rc) + goto trace_exit; + } + +trace_exit: + prlog(PR_TRACE, "L2/L3 purging took %ldus\n", + tb_to_usecs(mftb() - now)); + + return rc; +} diff --git a/roms/skiboot/hw/capp.c b/roms/skiboot/hw/capp.c new file mode 100644 index 000000000..a1aa1caa9 --- /dev/null +++ b/roms/skiboot/hw/capp.c @@ -0,0 +1,243 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* + * CAPP unit (i.e. CAPI) + * + * Copyright 2013-2019 IBM Corp. + */ + +#include <skiboot.h> +#include <io.h> +#include <opal.h> +#include <chip.h> +#include <xscom.h> +#include <capp.h> + +#define PHBERR(opal_id, chip_id, index, fmt, a...) \ + prlog(PR_ERR, "PHB#%04x[%d:%d]: " fmt, \ + opal_id, chip_id, \ + index, ## a) + +static struct { + uint32_t ec_level; + struct capp_lid_hdr *lid; + size_t size; + int load_result; +} capp_ucode_info = { 0, NULL, 0, false }; + +#define CAPP_UCODE_MAX_SIZE 0x20000 + +struct lock capi_lock = LOCK_UNLOCKED; +struct capp_ops capi_ops = { NULL }; + +bool capp_ucode_loaded(struct proc_chip *chip, unsigned int index) +{ + return (chip->capp_ucode_loaded & (1 << index)); +} + +int preload_capp_ucode(void) +{ + struct dt_node *p; + struct proc_chip *chip; + uint32_t index; + uint64_t rc; + int ret; + + /* CAPI is supported on P8 and P9 only */ + p = dt_find_compatible_node(dt_root, NULL, "ibm,power8-pbcq"); + if (!p) + p = dt_find_compatible_node(dt_root, NULL, "ibm,power9-pbcq"); + if (!p) + return OPAL_SUCCESS; + + chip = get_chip(dt_get_chip_id(p)); + + rc = xscom_read_cfam_chipid(chip->id, &index); + if (rc) { + prerror("CAPP: Error reading cfam chip-id\n"); + ret = OPAL_HARDWARE; + return ret; + } + /* Keep ChipID and Major/Minor EC. Mask out the Location Code. */ + index = index & 0xf0fff; + + /* Assert that we're preloading */ + assert(capp_ucode_info.lid == NULL); + capp_ucode_info.load_result = OPAL_EMPTY; + + capp_ucode_info.ec_level = index; + + /* Is the ucode preloaded like for BML? 
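+ * (On BML the "ibm,capp-ucode" property carries the address of an
+ * image that is already resident in memory, so there is nothing to
+ * preload; the branch below just records that address and reports
+ * success.)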
*/ + if (dt_has_node_property(p, "ibm,capp-ucode", NULL)) { + capp_ucode_info.lid = (struct capp_lid_hdr *)(u64) + dt_prop_get_u32(p, "ibm,capp-ucode"); + capp_ucode_info.load_result = OPAL_SUCCESS; + ret = OPAL_SUCCESS; + goto end; + } + /* If we successfully download the ucode, we leave it around forever */ + capp_ucode_info.size = CAPP_UCODE_MAX_SIZE; + capp_ucode_info.lid = malloc(CAPP_UCODE_MAX_SIZE); + if (!capp_ucode_info.lid) { + prerror("CAPP: Can't allocate space for ucode lid\n"); + ret = OPAL_NO_MEM; + goto end; + } + + prlog(PR_INFO, "CAPI: Preloading ucode %x\n", capp_ucode_info.ec_level); + + ret = start_preload_resource(RESOURCE_ID_CAPP, index, + capp_ucode_info.lid, + &capp_ucode_info.size); + + if (ret != OPAL_SUCCESS) { + prerror("CAPI: Failed to preload resource %d\n", ret); + capp_ucode_info.load_result = ret; + } + +end: + return ret; +} + +static int64_t capp_lid_download(void) +{ + int64_t ret; + + if (capp_ucode_info.load_result != OPAL_EMPTY) + return capp_ucode_info.load_result; + + capp_ucode_info.load_result = wait_for_resource_loaded( + RESOURCE_ID_CAPP, + capp_ucode_info.ec_level); + + if (capp_ucode_info.load_result != OPAL_SUCCESS) { + prerror("CAPP: Error loading ucode lid. index=%x\n", + capp_ucode_info.ec_level); + ret = OPAL_RESOURCE; + free(capp_ucode_info.lid); + capp_ucode_info.lid = NULL; + goto end; + } + + ret = OPAL_SUCCESS; +end: + return ret; +} + +int64_t capp_load_ucode(unsigned int chip_id, uint32_t opal_id, + unsigned int index, u64 lid_eyecatcher, + uint32_t reg_offset, + uint64_t apc_master_addr, uint64_t apc_master_write, + uint64_t snp_array_addr, uint64_t snp_array_write) +{ + struct proc_chip *chip = get_chip(chip_id); + struct capp_ucode_lid *ucode; + struct capp_ucode_data *data; + struct capp_lid_hdr *lid; + uint64_t rc, val, addr; + uint32_t chunk_count, offset; + int i; + + if (capp_ucode_loaded(chip, index)) + return OPAL_SUCCESS; + + rc = capp_lid_download(); + if (rc) + return rc; + + prlog(PR_INFO, "CHIP%i: CAPP ucode lid loaded at %p\n", + chip_id, capp_ucode_info.lid); + + lid = capp_ucode_info.lid; + /* + * If lid header is present (on FSP machines), it'll tell us where to + * find the ucode. Otherwise this is the ucode. 
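+ *
+ * Sketch of the two layouts handled below (field names as used in
+ * this file, offsets illustrative):
+ *
+ *   FSP lid:    [capp_lid_hdr] ... [capp_ucode_lid at lid->ucode_offset]
+ *   bare image: [capp_ucode_lid][capp_ucode_data chunks ...]
+ *
+ * The eyecatcher comparison just below picks between the two.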
+ */ + ucode = (struct capp_ucode_lid *)lid; + if (be64_to_cpu(lid->eyecatcher) == lid_eyecatcher) { + if (be64_to_cpu(lid->version) != 0x1) { + PHBERR(opal_id, chip_id, index, + "capi ucode lid header invalid\n"); + return OPAL_HARDWARE; + } + ucode = (struct capp_ucode_lid *) + ((char *)ucode + be64_to_cpu(lid->ucode_offset)); + } + + /* 'CAPPULID' in ASCII */ + if ((be64_to_cpu(ucode->eyecatcher) != 0x43415050554C4944UL) || + (be64_to_cpu(ucode->version) != 1)) { + PHBERR(opal_id, chip_id, index, + "CAPP: ucode header invalid\n"); + return OPAL_HARDWARE; + } + + offset = 0; + while (offset < be64_to_cpu(ucode->data_size)) { + data = (struct capp_ucode_data *) + ((char *)&ucode->data + offset); + chunk_count = be32_to_cpu(data->hdr.chunk_count); + offset += sizeof(struct capp_ucode_data_hdr) + chunk_count * 8; + + /* 'CAPPUCOD' in ASCII */ + if (be64_to_cpu(data->hdr.eyecatcher) != 0x4341505055434F44UL) { + PHBERR(opal_id, chip_id, index, + "CAPP: ucode data header invalid:%i\n", + offset); + return OPAL_HARDWARE; + } + + switch (data->hdr.reg) { + case apc_master_cresp: + xscom_write(chip_id, apc_master_addr + reg_offset, + 0); + addr = apc_master_write; + break; + case apc_master_uop_table: + xscom_write(chip_id, apc_master_addr + reg_offset, + 0x180ULL << 52); + addr = apc_master_write; + break; + case snp_ttype: + xscom_write(chip_id, snp_array_addr + reg_offset, + 0x5000ULL << 48); + addr = snp_array_write; + break; + case snp_uop_table: + xscom_write(chip_id, snp_array_addr + reg_offset, + 0x4000ULL << 48); + addr = snp_array_write; + break; + default: + continue; + } + + for (i = 0; i < chunk_count; i++) { + val = be64_to_cpu(data->data[i]); + xscom_write(chip_id, addr + reg_offset, val); + } + } + + chip->capp_ucode_loaded |= (1 << index); + + return OPAL_SUCCESS; +} + +int64_t capp_get_info(int chip_id, struct phb *phb, struct capp_info *info) +{ + if (capi_ops.get_capp_info) + return capi_ops.get_capp_info(chip_id, phb, info); + + return OPAL_PARAMETER; +} + +int64_t capp_xscom_read(struct capp *capp, int64_t off, uint64_t *val) +{ + return capp == NULL ? OPAL_PARAMETER : + xscom_read(capp->chip_id, off + capp->capp_xscom_offset, val); +} + +int64_t capp_xscom_write(struct capp *capp, int64_t off, uint64_t val) +{ + return capp == NULL ? OPAL_PARAMETER : + xscom_write(capp->chip_id, off + capp->capp_xscom_offset, val); +} diff --git a/roms/skiboot/hw/centaur.c b/roms/skiboot/hw/centaur.c new file mode 100644 index 000000000..e9ff4197f --- /dev/null +++ b/roms/skiboot/hw/centaur.c @@ -0,0 +1,555 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* + * Centaur memory buffer chip + * + * Copyright 2013-2017 IBM Corp. + */ + +#include <skiboot.h> +#include <xscom.h> +#include <processor.h> +#include <device.h> +#include <chip.h> +#include <centaur.h> +#include <lock.h> +#include <fsi-master.h> +#include <timebase.h> + +/* + * Centaur chip IDs are using the XSCOM "partID" encoding + * described in xscom.h. recap: + * + * 0b1000.0000.0000.0000.0000.00NN.NCCC.MMMM + * N=Node, C=Chip, M=Memory Channel + * + * We currently use FSI exclusively for centaur access. We can + * start using MMIO on Centaur DD2.x when we have a way to handle + * machine checks happening inside Sapphire which we don't at the + * moment. + */ + +/* Is that correct ? 
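+ * (The 4-bit memory-channel field in the part ID above could encode
+ * up to 16 channels; 8 presumably corresponds to one Centaur per
+ * memory channel on a P8 chip.)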
*/ +#define MAX_CENTAURS_PER_CHIP 8 + +/* Mark the centaur offline after this many consecutive errors */ +#define CENTAUR_ERR_OFFLINE_THRESHOLD 10 + +/* + * FSI2PIB register definitions (this could be moved out if we were to + * support FSI master to other chips. + */ +#define FSI_DATA0_REG 0x1000 +#define FSI_DATA1_REG 0x1004 +#define FSI_CMD_REG 0x1008 +#define FSI_CMD_WR 0x80000000 +#define FSI_CMD_RD 0x00000000 +#define FSI_ENG_RESET_REG 0x1018 +#define FSI_STATUS_REG 0x101c +#define FSI_STATUS_ABORT 0x00100000 +#define FSI_STATUS_ERRORS 0x00007000 + +/* Some Centaur XSCOMs we care about */ +#define SCAC_CONFIG_REG 0x020115ce +#define SCAC_CONFIG_SET 0x020115cf +#define SCAC_CONFIG_CLR 0x020115d0 +#define SCAC_ENABLE_MSK PPC_BIT(0) + +#define cent_log(__lev, __c, __fmt, ...) \ + prlog(__lev, "CENTAUR %x: " __fmt, __c->part_id, ##__VA_ARGS__) + +static int64_t centaur_fsiscom_complete(struct centaur_chip *centaur) +{ + int64_t rc; + uint32_t stat; + + rc = mfsi_read(centaur->fsi_master_chip_id, centaur->fsi_master_engine, + centaur->fsi_master_port, FSI_STATUS_REG, &stat); + if (rc) { + cent_log(PR_ERR, centaur, "MFSI read error %lld reading STAT\n", rc); + return rc; + } + if ((stat & (FSI_STATUS_ABORT | FSI_STATUS_ERRORS)) == 0) + return OPAL_SUCCESS; + + cent_log(PR_ERR, centaur, "Remote FSI SCOM error, status=0x%08x\n", stat); + + /* All 1's ? Assume it's gone */ + if (stat == 0xffffffffu) { + cent_log(PR_ERR, centaur, "Chip appears to be dead !\n"); + centaur->valid = false; + + /* Here, hostboot grabs a pile of FFDC from the FSI layer, + * we could do that too ... + */ + return OPAL_HARDWARE; + } + + /* Here HB prints the GPx registers which I believe are only + * in the host (FSI master). We skip that for now, we don't have + * a good API to them + */ + + /* Recovery sequence from HostBoot fsiscom.C + * if SCOM fails and FSI Master displays "MasterTimeOut" + * then 7,6 <covered by FSI driver> + * else if SCOM fails and FSI2PIB Status shows PIB abort + * then just perform unit reset (6) and wait 1 ms + * else (PIB_abort='0' but PIB error is unequal 0) + * then just perform unit reset (6) (wait not needed). + * + * Note: Waiting 1ms inside OPAL is a BIG NO NO !!! We have + * no choice but doing it at the moment but that will have + * to be fixed one way or another, possibly by returning some + * kind of busy status until the delay is expired. 
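+ *
+ * One possible shape for that fix (hypothetical sketch, these fields
+ * do not exist yet): issue the unit reset, record when it happened and
+ * return OPAL_BUSY so the caller re-drives the SCOM later, e.g.
+ *
+ *   mfsi_write(..., FSI_ENG_RESET_REG, 0);
+ *   centaur->busy_until = mftb() + msecs_to_tb(1);
+ *   return OPAL_BUSY;
+ *
+ * with callers only retrying once busy_until has passed.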
+ */ + rc = mfsi_write(centaur->fsi_master_chip_id, centaur->fsi_master_engine, + centaur->fsi_master_port, FSI_ENG_RESET_REG, 0); + if (rc) { + cent_log(PR_ERR, centaur, "MFSI write error %lld resetting SCOM engine\n", + rc); + } + return OPAL_HARDWARE; +} + +static int64_t centaur_fsiscom_read(struct centaur_chip *centaur, uint32_t pcb_addr, + uint64_t *val) +{ + int64_t rc; + uint32_t data0, data1; + + rc = mfsi_write(centaur->fsi_master_chip_id, centaur->fsi_master_engine, + centaur->fsi_master_port, FSI_CMD_REG, pcb_addr | FSI_CMD_RD); + if (rc) { + cent_log(PR_ERR, centaur, "MFSI write error %lld writing CMD\n", rc); + return rc; + } + + rc = centaur_fsiscom_complete(centaur); + if (rc) + return rc; + + rc = mfsi_read(centaur->fsi_master_chip_id, centaur->fsi_master_engine, + centaur->fsi_master_port, FSI_DATA0_REG, &data0); + if (rc) { + cent_log(PR_ERR, centaur, "MFSI read error %lld reading DATA0\n", rc); + return rc; + } + rc = mfsi_read(centaur->fsi_master_chip_id, centaur->fsi_master_engine, + centaur->fsi_master_port, FSI_DATA1_REG, &data1); + if (rc) { + cent_log(PR_ERR, centaur, "MFSI read error %lld readking DATA1\n", rc); + return rc; + } + + *val = (((uint64_t)data0) << 32) | data1; + + return OPAL_SUCCESS; +} + +static int64_t centaur_fsiscom_write(struct centaur_chip *centaur, uint32_t pcb_addr, + uint64_t val) +{ + int64_t rc; + + rc = mfsi_write(centaur->fsi_master_chip_id, centaur->fsi_master_engine, + centaur->fsi_master_port, FSI_DATA0_REG, hi32(val)); + if (rc) { + cent_log(PR_ERR, centaur, "MFSI write error %lld writing DATA0\n", rc); + return rc; + } + rc = mfsi_write(centaur->fsi_master_chip_id, centaur->fsi_master_engine, + centaur->fsi_master_port, FSI_DATA1_REG, lo32(val)); + if (rc) { + cent_log(PR_ERR, centaur, "MFSI write error %lld writing DATA1\n", rc); + return rc; + } + rc = mfsi_write(centaur->fsi_master_chip_id, centaur->fsi_master_engine, + centaur->fsi_master_port, FSI_CMD_REG, pcb_addr | FSI_CMD_WR); + if (rc) { + cent_log(PR_ERR, centaur, "MFSI write error %lld writing CMD\n", rc); + return rc; + } + + return centaur_fsiscom_complete(centaur); +} + +struct centaur_chip *get_centaur(uint32_t part_id) +{ + uint32_t hchip_id, mchan; + struct proc_chip *hchip; + struct centaur_chip *centaur; + + if ((part_id >> 28) != 8) { + prerror("CENTAUR: Invalid part ID 0x%x\n", part_id); + return NULL; + } + hchip_id = (part_id & 0x0fffffff) >> 4; + mchan = part_id & 0xf; + + hchip = get_chip(hchip_id); + if (!hchip) { + prerror("CENTAUR: Centaur 0x%x not found on non-existing chip 0%x\n", + part_id, hchip_id); + return NULL; + } + if (mchan >= MAX_CENTAURS_PER_CHIP) { + prerror("CENTAUR: Centaur 0x%x channel out of bounds !\n", part_id); + return NULL; + } + if (!hchip->centaurs) { + prerror("CENTAUR: Centaur 0x%x not found on chip 0%x (no centaurs)\n", + part_id, hchip_id); + return NULL; + } + centaur = &hchip->centaurs[mchan]; + if (!centaur->valid) { + prerror("CENTAUR: Centaur 0x%x not valid on chip 0%x\n", + part_id, hchip_id); + return NULL; + } + return centaur; +} + +/* + * Indirect XSCOM access functions. Copied from xscom.c, at a + * latter date, we should merge these properly. + */ +static void centaur_xscom_handle_ind_error(struct centaur_chip *centaur, + uint64_t data, uint64_t pcb_addr, + bool is_write) +{ + unsigned int stat = GETFIELD(XSCOM_DATA_IND_ERR, data); + bool timeout = !(data & XSCOM_DATA_IND_COMPLETE); + + /* XXX: Create error log entry ? 
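+ * (Both pieces of status come back in the data doubleword itself: a
+ * request that never set XSCOM_DATA_IND_COMPLETE is reported as a
+ * timeout, otherwise the XSCOM_DATA_IND_ERR field holds the error
+ * code that is printed below.)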
*/ + if (timeout) + cent_log(PR_ERR, centaur, + "inddirect %s timeout, pcb_addr=0x%llx stat=0x%x\n", + is_write ? "write" : "read", pcb_addr, stat); + else + cent_log(PR_ERR, centaur, + "indirect %s error, pcb_addr=0x%llx stat=0x%x\n", + is_write ? "write" : "read", pcb_addr, stat); +} + +static int centaur_xscom_ind_read(struct centaur_chip *centaur, + uint64_t pcb_addr, uint64_t *val) +{ + uint32_t addr; + uint64_t data; + int rc, retries; + + /* Write indirect address */ + addr = pcb_addr & 0x7fffffff; + data = XSCOM_DATA_IND_READ | + (pcb_addr & XSCOM_ADDR_IND_ADDR); + rc = centaur_fsiscom_write(centaur, addr, data); + if (rc) + goto bail; + + /* Wait for completion */ + for (retries = 0; retries < XSCOM_IND_MAX_RETRIES; retries++) { + rc = centaur_fsiscom_read(centaur, addr, &data); + if (rc) + goto bail; + if ((data & XSCOM_DATA_IND_COMPLETE) && + ((data & XSCOM_DATA_IND_ERR) == 0)) { + *val = data & XSCOM_DATA_IND_DATA; + break; + } + if ((data & XSCOM_DATA_IND_COMPLETE) || + (retries >= XSCOM_IND_MAX_RETRIES)) { + centaur_xscom_handle_ind_error(centaur, data, pcb_addr, + false); + rc = OPAL_HARDWARE; + goto bail; + } + } + bail: + if (rc) + *val = (uint64_t)-1; + return rc; +} + +static int centaur_xscom_ind_write(struct centaur_chip *centaur, + uint64_t pcb_addr, uint64_t val) +{ + uint32_t addr; + uint64_t data; + int rc, retries; + + /* Write indirect address & data */ + addr = pcb_addr & 0x7fffffff; + data = pcb_addr & XSCOM_ADDR_IND_ADDR; + data |= val & XSCOM_ADDR_IND_DATA; + + rc = centaur_fsiscom_write(centaur, addr, data); + if (rc) + goto bail; + + /* Wait for completion */ + for (retries = 0; retries < XSCOM_IND_MAX_RETRIES; retries++) { + rc = centaur_fsiscom_read(centaur, addr, &data); + if (rc) + goto bail; + if ((data & XSCOM_DATA_IND_COMPLETE) && + ((data & XSCOM_DATA_IND_ERR) == 0)) + break; + if ((data & XSCOM_DATA_IND_COMPLETE) || + (retries >= XSCOM_IND_MAX_RETRIES)) { + centaur_xscom_handle_ind_error(centaur, data, pcb_addr, + true); + rc = OPAL_HARDWARE; + goto bail; + } + } + bail: + return rc; +} + +static int64_t centaur_xscom_read(struct scom_controller *scom, + uint32_t id __unused, uint64_t pcb_addr, + uint64_t *val) +{ + struct centaur_chip *centaur = scom->private; + int64_t rc; + + if (!centaur) + return OPAL_PARAMETER; + if (!centaur->online) + return OPAL_XSCOM_CTR_OFFLINED; + + lock(¢aur->lock); + if (pcb_addr & XSCOM_ADDR_IND_FLAG) + rc = centaur_xscom_ind_read(centaur, pcb_addr, val); + else + rc = centaur_fsiscom_read(centaur, pcb_addr, val); + + /* We mark the centaur offline if we get too many errors on + * consecutive accesses + */ + if (rc) { + centaur->error_count++; + if (centaur->error_count > CENTAUR_ERR_OFFLINE_THRESHOLD) { + centaur->online = false; + /** + * @fwts-label CentaurOfflinedTooManyErrors + * @fwts-advice OPAL marked a Centaur (memory buffer) + * as offline due to CENTAUR_ERR_OFFLINE_THRESHOLD (10) + * consecutive errors on XSCOMs to this centaur. + * OPAL will now return OPAL_XSCOM_CTR_OFFLINED and not + * try any further XSCOMs. This is likely caused by + * some hardware issue or PRD recovery issue. + */ + prlog(PR_ERR, "CENTAUR: Offlined %x due to > %d consecutive XSCOM errors. 
No more XSCOMs to this centaur.\n", + id, CENTAUR_ERR_OFFLINE_THRESHOLD); + } + } else + centaur->error_count = 0; + unlock(¢aur->lock); + + return rc; +} + +static int64_t centaur_xscom_write(struct scom_controller *scom, + uint32_t id __unused, uint64_t pcb_addr, + uint64_t val) +{ + struct centaur_chip *centaur = scom->private; + int64_t rc; + + if (!centaur) + return OPAL_PARAMETER; + if (!centaur->online) + return OPAL_XSCOM_CTR_OFFLINED; + + lock(¢aur->lock); + if (pcb_addr & XSCOM_ADDR_IND_FLAG) + rc = centaur_xscom_ind_write(centaur, pcb_addr, val); + else + rc = centaur_fsiscom_write(centaur, pcb_addr, val); + + /* We mark the centaur offline if we get too many errors on + * consecutive accesses + */ + if (rc) { + centaur->error_count++; + if (centaur->error_count > CENTAUR_ERR_OFFLINE_THRESHOLD) + centaur->online = false; + } else + centaur->error_count = 0; + unlock(¢aur->lock); + + return rc; +} + +static bool centaur_check_id(struct centaur_chip *centaur) +{ + int64_t rc; + uint64_t val; + + rc = centaur_fsiscom_read(centaur, 0xf000f, &val); + if (rc) { + cent_log(PR_ERR, centaur, + " FSISCOM error %lld reading ID register\n", + rc); + return false; + } + + /* Extract CFAM id */ + val >>= 44; + + /* Identify chip */ + if ((val & 0xff) != 0xe9) { + cent_log(PR_ERR, centaur, + " CFAM ID 0x%02x is not a Centaur !\n", + (unsigned int)(val & 0xff)); + return false; + } + + /* Get EC level from CFAM ID */ + centaur->ec_level = ((val >> 16) & 0xf) << 4; + centaur->ec_level |= (val >> 8) & 0xf; + + return true; +} + +static bool centaur_add(uint32_t part_id, uint32_t mchip, uint32_t meng, + uint32_t mport) +{ + uint32_t hchip_id, mchan; + struct proc_chip *hchip; + struct centaur_chip *centaur; + + if ((part_id >> 28) != 8) { + prerror("CENTAUR: Invalid part ID 0x%x\n", part_id); + return false; + } + hchip_id = (part_id & 0x0fffffff) >> 4; + mchan = part_id & 0xf; + + printf("CENTAUR: Found centaur for chip 0x%x channel %d\n", + hchip_id, mchan); + printf("CENTAUR: FSI host: 0x%x cMFSI%d port %d\n", + mchip, meng, mport); + + hchip = get_chip(hchip_id); + if (!hchip) { + prerror("CENTAUR: No such chip !!!\n"); + return false; + } + + if (mchan >= MAX_CENTAURS_PER_CHIP) { + prerror("CENTAUR: Channel out of bounds !\n"); + return false; + } + + if (!hchip->centaurs) { + hchip->centaurs = + zalloc(sizeof(struct centaur_chip) * + MAX_CENTAURS_PER_CHIP); + assert(hchip->centaurs); + } + + centaur = &hchip->centaurs[mchan]; + if (centaur->valid) { + prerror("CENTAUR: Duplicate centaur !\n"); + return false; + } + centaur->part_id = part_id; + centaur->fsi_master_chip_id = mchip; + centaur->fsi_master_port = mport; + centaur->fsi_master_engine = meng ? 
MFSI_cMFSI1 : MFSI_cMFSI0; + centaur->online = true; + init_lock(¢aur->lock); + list_head_init(¢aur->i2cms); + + if (!centaur_check_id(centaur)) + return false; + + centaur->scom.part_id = part_id; + centaur->scom.private = centaur; + centaur->scom.read = centaur_xscom_read; + centaur->scom.write = centaur_xscom_write; + scom_register(¢aur->scom); + + cent_log(PR_INFO, centaur, "Found DD%x.%x chip\n", + centaur->ec_level >> 4, + centaur->ec_level & 0xf); + + centaur->valid = true; + return true; +} + +/* Returns how long to wait for logic to stop in TB ticks or a negative + * value on error + */ +int64_t centaur_disable_sensor_cache(uint32_t part_id) +{ + struct centaur_chip *centaur = get_centaur(part_id); + int64_t rc = 0; + uint64_t ctrl; + + if (!centaur) + return false; + + lock(¢aur->lock); + centaur->scache_disable_count++; + if (centaur->scache_disable_count == 1) { + centaur->scache_was_enabled = false; + rc = centaur_fsiscom_read(centaur, SCAC_CONFIG_REG, &ctrl); + if (rc) + goto bail; + centaur->scache_was_enabled = !!(ctrl & SCAC_ENABLE_MSK); + rc = centaur_fsiscom_write(centaur, SCAC_CONFIG_CLR, SCAC_ENABLE_MSK); + if (rc) + goto bail; + rc = msecs_to_tb(30); + } + bail: + unlock(¢aur->lock); + return rc; +} + +int64_t centaur_enable_sensor_cache(uint32_t part_id) +{ + struct centaur_chip *centaur = get_centaur(part_id); + int64_t rc = 0; + + if (!centaur) + return false; + + lock(¢aur->lock); + if (centaur->scache_disable_count == 0) { + cent_log(PR_ERR, centaur, "Cache count going negative !\n"); + backtrace(); + goto bail; + } + centaur->scache_disable_count--; + if (centaur->scache_disable_count == 0 && centaur->scache_was_enabled) + rc = centaur_fsiscom_write(centaur, SCAC_CONFIG_SET, SCAC_ENABLE_MSK); + bail: + unlock(¢aur->lock); + return rc; +} + +void centaur_init(void) +{ + struct dt_node *cn; + + dt_for_each_compatible(dt_root, cn, "ibm,centaur") { + uint32_t chip_id, mchip, meng, mport; + + chip_id = dt_prop_get_u32(cn, "ibm,chip-id"); + mchip = dt_prop_get_u32(cn, "ibm,fsi-master-chip-id"); + meng = dt_prop_get_cell(cn, "ibm,fsi-master-port", 0); + mport = dt_prop_get_cell(cn, "ibm,fsi-master-port", 1); + + /* + * If adding the centaur succeeds, we expose it to + * Linux as a scom-controller + */ + if (centaur_add(chip_id, mchip, meng, mport)) + dt_add_property(cn, "scom-controller", NULL, 0); + } +} diff --git a/roms/skiboot/hw/chiptod.c b/roms/skiboot/hw/chiptod.c new file mode 100644 index 000000000..7c0a1ffc7 --- /dev/null +++ b/roms/skiboot/hw/chiptod.c @@ -0,0 +1,2067 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* + * Handle ChipTOD chip & configure core and CAPP timebases + * + * Copyright 2013-2019 IBM Corp. 
+ */ + +#define pr_fmt(fmt) "CHIPTOD: " fmt + +#include <skiboot.h> +#include <xscom.h> +#include <pci.h> +#include <chiptod.h> +#include <chip.h> +#include <io.h> +#include <cpu.h> +#include <timebase.h> +#include <opal-api.h> + +/* TOD chip XSCOM addresses */ +#define TOD_MASTER_PATH_CTRL 0x00040000 /* Master Path ctrl reg */ +#define TOD_PRI_PORT0_CTRL 0x00040001 /* Primary port0 ctrl reg */ +#define TOD_PRI_PORT1_CTRL 0x00040002 /* Primary port1 ctrl reg */ +#define TOD_SEC_PORT0_CTRL 0x00040003 /* Secondary p0 ctrl reg */ +#define TOD_SEC_PORT1_CTRL 0x00040004 /* Secondary p1 ctrl reg */ +#define TOD_SLAVE_PATH_CTRL 0x00040005 /* Slave Path ctrl reg */ +#define TOD_INTERNAL_PATH_CTRL 0x00040006 /* Internal Path ctrl reg */ + +/* -- TOD primary/secondary master/slave control register -- */ +#define TOD_PSMS_CTRL 0x00040007 +#define TOD_PSMSC_PM_TOD_SELECT PPC_BIT(1) /* Primary Master TOD */ +#define TOD_PSMSC_PM_DRAW_SELECT PPC_BIT(2) /* Primary Master Drawer */ +#define TOD_PSMSC_SM_TOD_SELECT PPC_BIT(9) /* Secondary Master TOD */ +#define TOD_PSMSC_SM_DRAW_SELECT PPC_BIT(10) /* Secondary Master Draw */ + +/* -- TOD primary/secondary master/slave status register -- */ +#define TOD_STATUS 0x00040008 +#define TOD_ST_TOPOLOGY_SELECT PPC_BITMASK(0, 2) +#define TOD_ST_MPATH0_STEP_VALID PPC_BIT(6) /* MasterPath0 step valid */ +#define TOD_ST_MPATH1_STEP_VALID PPC_BIT(7) /* MasterPath1 step valid */ +#define TOD_ST_SPATH0_STEP_VALID PPC_BIT(8) /* SlavePath0 step valid */ +#define TOD_ST_SPATH1_STEP_VALID PPC_BIT(10) /* SlavePath1 step valid */ +/* Primary master/slave path select (0 = PATH_0, 1 = PATH_1) */ +#define TOD_ST_PRI_MPATH_SELECT PPC_BIT(12) /* Primary MPath Select */ +#define TOD_ST_PRI_SPATH_SELECT PPC_BIT(15) /* Primary SPath Select */ +/* Secondary master/slave path select (0 = PATH_0, 1 = PATH_1) */ +#define TOD_ST_SEC_MPATH_SELECT PPC_BIT(16) /* Secondary MPath Select */ +#define TOD_ST_SEC_SPATH_SELECT PPC_BIT(19) /* Secondary SPath Select */ +#define TOD_ST_ACTIVE_MASTER PPC_BIT(23) +#define TOD_ST_BACKUP_MASTER PPC_BIT(24) + +/* TOD chip XSCOM addresses */ +#define TOD_CHIP_CTRL 0x00040010 /* Chip control register */ +#define TOD_TTYPE_0 0x00040011 +#define TOD_TTYPE_1 0x00040012 /* PSS switch */ +#define TOD_TTYPE_2 0x00040013 /* Enable step checkers */ +#define TOD_TTYPE_3 0x00040014 /* Request TOD */ +#define TOD_TTYPE_4 0x00040015 /* Send TOD */ +#define TOD_TTYPE_5 0x00040016 /* Invalidate TOD */ +#define TOD_CHIPTOD_TO_TB 0x00040017 +#define TOD_LOAD_TOD_MOD 0x00040018 +#define TOD_CHIPTOD_VALUE 0x00040020 +#define TOD_CHIPTOD_LOAD_TB 0x00040021 +#define TOD_CHIPTOD_FSM 0x00040024 + +/* -- TOD PIB Master reg -- */ +#define TOD_PIB_MASTER 0x00040027 +#define TOD_PIBM_ADDR_CFG_MCAST PPC_BIT(25) +#define TOD_PIBM_ADDR_CFG_SLADDR PPC_BITMASK(26, 31) +#define TOD_PIBM_TTYPE4_SEND_MODE PPC_BIT(32) +#define TOD_PIBM_TTYPE4_SEND_ENBL PPC_BIT(33) + +/* -- TOD Error interrupt register -- */ +#define TOD_ERROR 0x00040030 +/* SYNC errors */ +#define TOD_ERR_CRMO_PARITY PPC_BIT(0) +#define TOD_ERR_OSC0_PARITY PPC_BIT(1) +#define TOD_ERR_OSC1_PARITY PPC_BIT(2) +#define TOD_ERR_PPORT0_CREG_PARITY PPC_BIT(3) +#define TOD_ERR_PPORT1_CREG_PARITY PPC_BIT(4) +#define TOD_ERR_SPORT0_CREG_PARITY PPC_BIT(5) +#define TOD_ERR_SPORT1_CREG_PARITY PPC_BIT(6) +#define TOD_ERR_SPATH_CREG_PARITY PPC_BIT(7) +#define TOD_ERR_IPATH_CREG_PARITY PPC_BIT(8) +#define TOD_ERR_PSMS_CREG_PARITY PPC_BIT(9) +#define TOD_ERR_CRITC_PARITY PPC_BIT(13) +#define TOD_ERR_MP0_STEP_CHECK PPC_BIT(14) +#define 
TOD_ERR_MP1_STEP_CHECK PPC_BIT(15) +#define TOD_ERR_PSS_HAMMING_DISTANCE PPC_BIT(18) +#define TOD_ERR_DELAY_COMPL_PARITY PPC_BIT(22) +/* CNTR errors */ +#define TOD_ERR_CTCR_PARITY PPC_BIT(32) +#define TOD_ERR_TOD_SYNC_CHECK PPC_BIT(33) +#define TOD_ERR_TOD_FSM_PARITY PPC_BIT(34) +#define TOD_ERR_TOD_REGISTER_PARITY PPC_BIT(35) +#define TOD_ERR_OVERFLOW_YR2042 PPC_BIT(36) +#define TOD_ERR_TOD_WOF_LSTEP_PARITY PPC_BIT(37) +#define TOD_ERR_TTYPE0_RECVD PPC_BIT(38) +#define TOD_ERR_TTYPE1_RECVD PPC_BIT(39) +#define TOD_ERR_TTYPE2_RECVD PPC_BIT(40) +#define TOD_ERR_TTYPE3_RECVD PPC_BIT(41) +#define TOD_ERR_TTYPE4_RECVD PPC_BIT(42) +#define TOD_ERR_TTYPE5_RECVD PPC_BIT(43) + +/* -- TOD Error interrupt register -- */ +#define TOD_ERROR_INJECT 0x00040031 + +/* PC unit PIB address which recieves the timebase transfer from TOD */ +#define PC_TOD 0x4A3 + +/* Local FIR EH.TPCHIP.TPC.LOCAL_FIR */ +#define LOCAL_CORE_FIR 0x0104000C +#define LFIR_SWITCH_COMPLETE PPC_BIT(18) + +/* Number of iterations for the various timeouts */ +#define TIMEOUT_LOOPS 20000000 + +/* TOD active Primary/secondary configuration */ +#define TOD_PRI_CONF_IN_USE 0 /* Tod using primary topology*/ +#define TOD_SEC_CONF_IN_USE 7 /* Tod using secondary topo */ + +/* Timebase State Machine error state */ +#define TBST_STATE_ERROR 9 + +static enum chiptod_type { + chiptod_unknown, + chiptod_p8, + chiptod_p9, + chiptod_p10, +} chiptod_type; + +enum chiptod_chip_role { + chiptod_chip_role_UNKNOWN = -1, + chiptod_chip_role_MDMT = 0, /* Master Drawer Master TOD */ + chiptod_chip_role_MDST, /* Master Drawer Slave TOD */ + chiptod_chip_role_SDMT, /* Slave Drawer Master TOD */ + chiptod_chip_role_SDST, /* Slave Drawer Slave TOD */ +}; + +enum chiptod_chip_status { + chiptod_active_master = 0, /* Chip TOD is Active master */ + chiptod_backup_master = 1, /* Chip TOD is backup master */ + chiptod_backup_disabled, /* Chip TOD is backup but disabled */ +}; + +struct chiptod_chip_config_info { + int32_t id; /* chip id */ + enum chiptod_chip_role role; /* Chip role */ + enum chiptod_chip_status status; /* active/backup/disabled */ +}; + +static int32_t chiptod_primary = -1; +static int32_t chiptod_secondary = -1; +static enum chiptod_topology current_topology = chiptod_topo_unknown; + +/* + * chiptod_topology_info holds primary/secondary chip configuration info. + * This info is initialized during chiptod_init(). This is an array of two: + * [0] = [chiptod_topo_primary] = Primary topology config info + * [1] = [chiptod_topo_secondary] = Secondary topology config info + */ +static struct chiptod_chip_config_info chiptod_topology_info[2]; + +/* + * Array of TOD control registers that holds last known valid values. + * + * Cache chiptod control register values at following instances: + * 1. Chiptod initialization + * 2. After topology switch is complete. + * 3. Upon receiving enable/disable topology request from FSP. 
+ * + * Cache following chip TOD control registers: + * - Master Path control register (0x00040000) + * - Primary Port-0 control register (0x00040001) + * - Primary Port-1 control register (0x00040002) + * - Secondary Port-0 control register (0x00040003) + * - Secondary Port-1 control register (0x00040004) + * - Slave Path control register (0x00040005) + * - Internal Path control register (0x00040006) + * - Primary/secondary master/slave control register (0x00040007) + * - Chip control register (0x00040010) + * + * This data is used for restoring respective TOD registers to sane values + * whenever parity errors are reported on these registers (through HMI). + * The error_bit maps to corresponding bit from TOD error register that + * reports parity error on respective TOD registers. + */ +static struct chiptod_tod_regs { + /* error bit from TOD Error reg */ + const uint64_t error_bit; + + /* xscom address of TOD register to be restored. */ + const uint64_t xscom_addr; + /* per chip cached value of TOD control registers to be restored. */ + struct { + uint64_t data; + bool valid; + } val[MAX_CHIPS]; +} chiptod_tod_regs[] = { + { TOD_ERR_CRMO_PARITY, TOD_MASTER_PATH_CTRL, { } }, + { TOD_ERR_PPORT0_CREG_PARITY, TOD_PRI_PORT0_CTRL, { } }, + { TOD_ERR_PPORT1_CREG_PARITY, TOD_PRI_PORT1_CTRL, { } }, + { TOD_ERR_SPORT0_CREG_PARITY, TOD_SEC_PORT0_CTRL, { } }, + { TOD_ERR_SPORT1_CREG_PARITY, TOD_SEC_PORT1_CTRL, { } }, + { TOD_ERR_SPATH_CREG_PARITY, TOD_SLAVE_PATH_CTRL, { } }, + { TOD_ERR_IPATH_CREG_PARITY, TOD_INTERNAL_PATH_CTRL, { } }, + { TOD_ERR_PSMS_CREG_PARITY, TOD_PSMS_CTRL, { } }, + { TOD_ERR_CTCR_PARITY, TOD_CHIP_CTRL, { } }, +}; + +/* The base TFMR value is the same for the whole machine + * for now as far as I can tell + */ +static uint64_t base_tfmr; + +/* + * For now, we use a global lock for runtime chiptod operations, + * eventually make this a per-core lock for wakeup rsync and + * take all of them for RAS cases. 
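+ * (The runtime paths that take this lock, such as wakeup resync and
+ * TOD error recovery, are infrequent, so a single global lock is
+ * adequate for now.)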
+ */ +static struct lock chiptod_lock = LOCK_UNLOCKED; +static bool chiptod_unrecoverable; + +#define NUM_SYNC_RETRIES 10 + +static void _chiptod_cache_tod_regs(int32_t chip_id) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(chiptod_tod_regs); i++) { + if (xscom_read(chip_id, chiptod_tod_regs[i].xscom_addr, + &(chiptod_tod_regs[i].val[chip_id].data))) { + prerror("XSCOM error reading 0x%08llx reg.\n", + chiptod_tod_regs[i].xscom_addr); + /* Invalidate this record and continue */ + chiptod_tod_regs[i].val[chip_id].valid = 0; + continue; + } + chiptod_tod_regs[i].val[chip_id].valid = 1; + } +} + +static void chiptod_cache_tod_registers(void) +{ + struct proc_chip *chip; + + for_each_chip(chip) + _chiptod_cache_tod_regs(chip->id); +} + +static void print_topo_info(enum chiptod_topology topo) +{ + const char *role[] = { "Unknown", "MDMT", "MDST", "SDMT", "SDST" }; + const char *status[] = { "Unknown", + "Active Master", "Backup Master", "Backup Master Disabled" }; + + prlog(PR_DEBUG, " Chip id: %d, Role: %s, Status: %s\n", + chiptod_topology_info[topo].id, + role[chiptod_topology_info[topo].role + 1], + status[chiptod_topology_info[topo].status + 1]); +} + +static void print_topology_info(void) +{ + const char *topo[] = { "Unknown", "Primary", "Secondary" }; + + if (current_topology < 0) + return; + + prlog(PR_DEBUG, "TOD Topology in Use: %s\n", + topo[current_topology+1]); + prlog(PR_DEBUG, " Primary configuration:\n"); + print_topo_info(chiptod_topo_primary); + prlog(PR_DEBUG, " Secondary configuration:\n"); + print_topo_info(chiptod_topo_secondary); +} + +static enum chiptod_topology query_current_topology(void) +{ + uint64_t tod_status; + + if (xscom_readme(TOD_STATUS, &tod_status)) { + prerror("XSCOM error reading TOD_STATUS reg\n"); + return chiptod_topo_unknown; + } + + /* + * Tod status register bit [0-2] tells configuration in use. + * 000 <= primary configuration in use + * 111 <= secondary configuration in use + */ + if ((tod_status & TOD_ST_TOPOLOGY_SELECT) == TOD_PRI_CONF_IN_USE) + return chiptod_topo_primary; + else + return chiptod_topo_secondary; +} + +static enum chiptod_chip_role +chiptod_get_chip_role(enum chiptod_topology topology, int32_t chip_id) +{ + uint64_t tod_ctrl; + enum chiptod_chip_role role = chiptod_chip_role_UNKNOWN; + + if (chip_id < 0) + return role; + + if (xscom_read(chip_id, TOD_PSMS_CTRL, &tod_ctrl)) { + prerror("XSCOM error reading TOD_PSMS_CTRL\n"); + return chiptod_chip_role_UNKNOWN; + } + + switch (topology) { + case chiptod_topo_primary: + if (tod_ctrl & TOD_PSMSC_PM_DRAW_SELECT) { + if (tod_ctrl & TOD_PSMSC_PM_TOD_SELECT) + role = chiptod_chip_role_MDMT; + else + role = chiptod_chip_role_MDST; + } else { + if (tod_ctrl & TOD_PSMSC_PM_TOD_SELECT) + role = chiptod_chip_role_SDMT; + else + role = chiptod_chip_role_SDST; + } + break; + case chiptod_topo_secondary: + if (tod_ctrl & TOD_PSMSC_SM_DRAW_SELECT) { + if (tod_ctrl & TOD_PSMSC_SM_TOD_SELECT) + role = chiptod_chip_role_MDMT; + else + role = chiptod_chip_role_MDST; + } else { + if (tod_ctrl & TOD_PSMSC_SM_TOD_SELECT) + role = chiptod_chip_role_SDMT; + else + role = chiptod_chip_role_SDST; + } + break; + case chiptod_topo_unknown: + default: + break; + } + return role; +} + +/* + * Check and return the status of sync step network for a given + * topology configuration. 
+ * Return values: + * true: Sync Step network is running + * false: Sync Step network is not running + */ +static bool chiptod_sync_step_check_running(enum chiptod_topology topology) +{ + uint64_t tod_status; + enum chiptod_chip_role role; + bool running = false; + int32_t chip_id = chiptod_topology_info[topology].id; + + /* Sanity check */ + if (chip_id < 0) + return false; + + if (xscom_read(chip_id, TOD_STATUS, &tod_status)) { + prerror("XSCOM error reading TOD_STATUS reg\n"); + return false; + } + + switch (topology) { + case chiptod_topo_primary: + /* Primary configuration */ + role = chiptod_topology_info[topology].role; + if (role == chiptod_chip_role_MDMT) { + /* + * Chip is using Master path. + * Check if it is using path_0/path_1 and then + * validity of that path. + * + * TOD_STATUS[12]: 0 = PATH_0, 1 = PATH_1 + */ + if (tod_status & TOD_ST_PRI_MPATH_SELECT) { + if (tod_status & TOD_ST_MPATH1_STEP_VALID) + running = true; + } else { + if (tod_status & TOD_ST_MPATH0_STEP_VALID) + running = true; + } + } else { + /* + * Chip is using Slave path. + * + * TOD_STATUS[15]: 0 = PATH_0, 1 = PATH_1 + */ + if (tod_status & TOD_ST_PRI_SPATH_SELECT) { + if (tod_status & TOD_ST_SPATH1_STEP_VALID) + running = true; + } else { + if (tod_status & TOD_ST_SPATH0_STEP_VALID) + running = true; + } + } + break; + case chiptod_topo_secondary: + /* Secondary configuration */ + role = chiptod_topology_info[topology].role; + if (role == chiptod_chip_role_MDMT) { + /* + * Chip is using Master path. + * Check if it is using path_0/path_1 and then + * validity of that path. + * + * TOD_STATUS[12]: 0 = PATH_0, 1 = PATH_1 + */ + if (tod_status & TOD_ST_SEC_MPATH_SELECT) { + if (tod_status & TOD_ST_MPATH1_STEP_VALID) + running = true; + } else { + if (tod_status & TOD_ST_MPATH0_STEP_VALID) + running = true; + } + } else { + /* + * Chip is using Slave path. + * + * TOD_STATUS[15]: 0 = PATH_0, 1 = PATH_1 + */ + if (tod_status & TOD_ST_SEC_SPATH_SELECT) { + if (tod_status & TOD_ST_SPATH1_STEP_VALID) + running = true; + } else { + if (tod_status & TOD_ST_SPATH0_STEP_VALID) + running = true; + } + } + break; + default: + break; + } + return running; +} + +static enum chiptod_chip_status _chiptod_get_chip_status(int32_t chip_id) +{ + uint64_t tod_status; + enum chiptod_chip_status status = -1; + + if (chip_id < 0) + return chiptod_backup_disabled; + + if (xscom_read(chip_id, TOD_STATUS, &tod_status)) { + prerror("XSCOM error reading TOD_STATUS reg\n"); + return status; + } + + if (tod_status & TOD_ST_ACTIVE_MASTER) + status = chiptod_active_master; + else if (tod_status & TOD_ST_BACKUP_MASTER) + status = chiptod_backup_master; + + return status; +} + +static enum chiptod_chip_status +chiptod_get_chip_status(enum chiptod_topology topology) +{ + return _chiptod_get_chip_status(chiptod_topology_info[topology].id); +} + +static void chiptod_update_topology(enum chiptod_topology topo) +{ + int32_t chip_id = chiptod_topology_info[topo].id; + + if (chip_id < 0) + return; + + chiptod_topology_info[topo].role = chiptod_get_chip_role(topo, chip_id); + chiptod_topology_info[topo].status = chiptod_get_chip_status(topo); + + /* + * If chip TOD on this topology is a backup master then check if + * sync/step network is running on this topology. If not, + * then mark status as backup not valid. 
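+ * (A backup master whose sync/step network is not running cannot take
+ * over, so it is downgraded to chiptod_backup_disabled here and
+ * chiptod_backup_valid() will later refuse a topology switch to it.)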
+ */ + if ((chiptod_topology_info[topo].status == chiptod_backup_master) && + !chiptod_sync_step_check_running(topo)) + chiptod_topology_info[topo].status = chiptod_backup_disabled; +} + +static void chiptod_setup_base_tfmr(void) +{ + struct dt_node *cpu = this_cpu()->node; + uint64_t core_freq, tod_freq; + uint64_t mcbs; + + base_tfmr = SPR_TFMR_TB_ECLIPZ; + + /* Get CPU and TOD freqs in Hz */ + if (dt_has_node_property(cpu, "ibm,extended-clock-frequency", NULL)) + core_freq = dt_prop_get_u64(cpu, "ibm,extended-clock-frequency"); + else + core_freq = dt_prop_get_u32(cpu, "clock-frequency"); + + if (!core_freq) { + prlog(PR_ERR, "CPU clock frequency is not set\n"); + abort(); + } + + tod_freq = 32000000; + + /* Calculate the "Max Cycles Between Steps" value according + * to the magic formula: + * + * mcbs = (core_freq * max_jitter_factor) / (4 * tod_freq) / 100; + * + * The max jitter factor is set to 240 based on what pHyp uses. + */ + mcbs = (core_freq * 240) / (4 * tod_freq) / 100; + prlog(PR_INFO, "Calculated MCBS is 0x%llx" + " (Cfreq=%lld Tfreq=%lld)\n", + mcbs, core_freq, tod_freq); + + /* Bake that all into TFMR */ + base_tfmr = SETFIELD(SPR_TFMR_MAX_CYC_BET_STEPS, base_tfmr, mcbs); + base_tfmr = SETFIELD(SPR_TFMR_N_CLKS_PER_STEP, base_tfmr, 0); + base_tfmr = SETFIELD(SPR_TFMR_SYNC_BIT_SEL, base_tfmr, 4); +} + +static bool chiptod_mod_tb(void) +{ + uint64_t tfmr = base_tfmr; + uint64_t timeout = 0; + + /* Switch timebase to "Not Set" state */ + mtspr(SPR_TFMR, tfmr | SPR_TFMR_LOAD_TOD_MOD); + do { + if (++timeout >= (TIMEOUT_LOOPS*2)) { + prerror("TB \"Not Set\" timeout\n"); + return false; + } + tfmr = mfspr(SPR_TFMR); + if (tfmr & SPR_TFMR_TFMR_CORRUPT) { + prerror("TB \"Not Set\" TFMR corrupt\n"); + return false; + } + if (GETFIELD(SPR_TFMR_TBST_ENCODED, tfmr) == 9) { + prerror("TB \"Not Set\" TOD in error state\n"); + return false; + } + } while (tfmr & SPR_TFMR_LOAD_TOD_MOD); + + return true; +} + +static bool chiptod_interrupt_check(void) +{ + uint64_t tfmr; + uint64_t timeout = 0; + + do { + if (++timeout >= TIMEOUT_LOOPS) { + prerror("Interrupt check fail\n"); + return false; + } + tfmr = mfspr(SPR_TFMR); + if (tfmr & SPR_TFMR_TFMR_CORRUPT) { + prerror("Interrupt check TFMR corrupt !\n"); + return false; + } + } while (tfmr & SPR_TFMR_CHIP_TOD_INTERRUPT); + + return true; +} + +static bool chiptod_running_check(uint32_t chip_id) +{ + uint64_t tval; + + if (xscom_read(chip_id, TOD_CHIPTOD_FSM, &tval)) { + prerror("XSCOM error polling run\n"); + return false; + } + if (tval & 0x0800000000000000UL) + return true; + else + return false; +} + +static bool chiptod_poll_running(void) +{ + uint64_t timeout = 0; + uint64_t tval; + + /* Chip TOD running check */ + do { + if (++timeout >= TIMEOUT_LOOPS) { + prerror("Running check fail timeout\n"); + return false; + } + if (xscom_readme(TOD_CHIPTOD_FSM, &tval)) { + prerror("XSCOM error polling run\n"); + return false; + } + } while (!(tval & 0x0800000000000000UL)); + + return true; +} + +static bool chiptod_to_tb(void) +{ + uint32_t pir = this_cpu()->pir; + uint64_t tval, tfmr; + uint64_t timeout = 0; + + /* Tell the ChipTOD about our fabric address + * + * The pib_master value is calculated from the CPU core ID, given in + * the PIR. Because we have different core/thread arrangements in the + * PIR between p7 and p8, we need to do the calculation differently. 
+ * + * p7: 0b00001 || 3-bit core id + * p8: 0b0001 || 4-bit core id + * p9: 0b001 || 5-bit core id + * p10: 0b001 || 5-bit core id + * + * However in P10 we don't use the core ID addressing, but rather core + * scom addressing mode, which appears to work better. + */ + + if (xscom_readme(TOD_PIB_MASTER, &tval)) { + prerror("XSCOM error reading PIB_MASTER\n"); + return false; + } + + if (chiptod_type == chiptod_p10) { + uint32_t core_id = pir_to_core_id(pir); + + if (this_cpu()->is_fused_core && + PVR_VERS_MAJ(mfspr(SPR_PVR)) == 2) { + /* Workaround: must address the even small core. */ + core_id &= ~1; + } + + tval = XSCOM_ADDR_P10_EC(core_id, PC_TOD); + + tval <<= 32; /* PIB slave address goes in PPC bits [0:31] */ + + tval |= PPC_BIT(35); /* Enable SCOM addressing. */ + + } else { + uint64_t tvbits; + + if (chiptod_type == chiptod_p9) { + tvbits = (pir >> 2) & 0x1f; + tvbits |= 0x20; + } else if (chiptod_type == chiptod_p8) { + tvbits = (pir >> 3) & 0xf; + tvbits |= 0x10; + } else { + tvbits = (pir >> 2) & 0x7; + tvbits |= 0x08; + } + tval &= ~TOD_PIBM_ADDR_CFG_MCAST; + tval = SETFIELD(TOD_PIBM_ADDR_CFG_SLADDR, tval, tvbits); + } + + if (xscom_writeme(TOD_PIB_MASTER, tval)) { + prerror("XSCOM error writing PIB_MASTER\n"); + return false; + } + + /* Make us ready to get the TB from the chipTOD */ + mtspr(SPR_TFMR, base_tfmr | SPR_TFMR_MOVE_CHIP_TOD_TO_TB); + + /* Tell the ChipTOD to send it */ + if (xscom_writeme(TOD_CHIPTOD_TO_TB, PPC_BIT(0))) { + prerror("XSCOM error writing CHIPTOD_TO_TB\n"); + return false; + } + + /* Wait for it to complete */ + timeout = 0; + do { + if (++timeout >= TIMEOUT_LOOPS) { + prerror("Chip to TB timeout\n"); + return false; + } + tfmr = mfspr(SPR_TFMR); + if (tfmr & SPR_TFMR_TFMR_CORRUPT) { + prerror("MoveToTB: corrupt TFMR !\n"); + return false; + } + } while (tfmr & SPR_TFMR_MOVE_CHIP_TOD_TO_TB); + + return true; +} + +static bool chiptod_check_tb_running(void) +{ + /* We used to wait for two SYNC pulses in TFMR but that + * doesn't seem to occur in sim, so instead we use a + * method similar to what pHyp does which is to check for + * TFMR SPR_TFMR_TB_VALID and not SPR_TFMR_TFMR_CORRUPT + */ +#if 0 + uint64_t tfmr, timeout; + unsigned int i; + + for (i = 0; i < 2; i++) { + tfmr = mfspr(SPR_TFMR); + tfmr &= ~SPR_TFMR_TB_SYNC_OCCURED; + mtspr(SPR_TFMR, tfmr); + timeout = 0; + do { + if (++timeout >= TIMEOUT_LOOPS) { + prerror("CHIPTOD: No sync pulses\n"); + return false; + } + tfmr = mfspr(SPR_TFMR); + } while (!(tfmr & SPR_TFMR_TB_SYNC_OCCURED)); + } +#else + uint64_t tfmr = mfspr(SPR_TFMR); + + return (tfmr & SPR_TFMR_TB_VALID) && + !(tfmr & SPR_TFMR_TFMR_CORRUPT); +#endif + return true; +} + +static bool chiptod_reset_tb_errors(void) +{ + uint64_t tfmr; + unsigned long timeout = 0; + + /* Ask for automatic clear of errors */ + tfmr = base_tfmr | SPR_TFMR_CLEAR_TB_ERRORS; + + /* Additionally pHyp sets these (write-1-to-clear ?) */ + tfmr |= SPR_TFMR_TB_MISSING_SYNC; + tfmr |= SPR_TFMR_TB_MISSING_STEP; + tfmr |= SPR_TFMR_TB_RESIDUE_ERR; + mtspr(SPR_TFMR, tfmr); + + /* We have to write "Clear TB Errors" again */ + tfmr = base_tfmr | SPR_TFMR_CLEAR_TB_ERRORS; + mtspr(SPR_TFMR, tfmr); + + do { + if (++timeout >= TIMEOUT_LOOPS) { + /* Don't actually do anything on error for + * now ... not much we can do, panic maybe ? 
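+ * (The callers in this file currently ignore the false return; if the
+ * TFMR really is stuck, the subsequent sync steps will also time out
+ * and the sync attempt as a whole is reported as failed and retried.)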
+ */ + prerror("TB error reset timeout !\n"); + return false; + } + tfmr = mfspr(SPR_TFMR); + if (tfmr & SPR_TFMR_TFMR_CORRUPT) { + prerror("TB error reset: corrupt TFMR !\n"); + return false; + } + } while (tfmr & SPR_TFMR_CLEAR_TB_ERRORS); + return true; +} + +static void chiptod_cleanup_thread_tfmr(void) +{ + uint64_t tfmr = base_tfmr; + + tfmr |= SPR_TFMR_PURR_PARITY_ERR; + tfmr |= SPR_TFMR_SPURR_PARITY_ERR; + tfmr |= SPR_TFMR_DEC_PARITY_ERR; + tfmr |= SPR_TFMR_TFMR_CORRUPT; + tfmr |= SPR_TFMR_PURR_OVERFLOW; + tfmr |= SPR_TFMR_SPURR_OVERFLOW; + mtspr(SPR_TFMR, tfmr); +} + +static void chiptod_reset_tod_errors(void) +{ + uint64_t terr; + + /* + * At boot, we clear the errors that the firmware is + * supposed to handle. List provided by the pHyp folks. + */ + + terr = TOD_ERR_CRITC_PARITY; + terr |= TOD_ERR_PSS_HAMMING_DISTANCE; + terr |= TOD_ERR_DELAY_COMPL_PARITY; + terr |= TOD_ERR_CTCR_PARITY; + terr |= TOD_ERR_TOD_SYNC_CHECK; + terr |= TOD_ERR_TOD_FSM_PARITY; + terr |= TOD_ERR_TOD_REGISTER_PARITY; + + if (xscom_writeme(TOD_ERROR, terr)) { + prerror("XSCOM error writing TOD_ERROR !\n"); + /* Not much we can do here ... abort ? */ + } +} + +static void chiptod_sync_master(void *data) +{ + uint64_t initial_tb_value; + bool *result = data; + + prlog(PR_DEBUG, "Master sync on CPU PIR 0x%04x...\n", + this_cpu()->pir); + + /* Apply base tfmr */ + mtspr(SPR_TFMR, base_tfmr); + + /* From recipe provided by pHyp folks, reset various errors + * before attempting the sync + */ + chiptod_reset_tb_errors(); + + /* Cleanup thread tfmr bits */ + chiptod_cleanup_thread_tfmr(); + + /* Reset errors in the chiptod itself */ + chiptod_reset_tod_errors(); + + /* Switch timebase to "Not Set" state */ + if (!chiptod_mod_tb()) + goto error; + prlog(PR_INSANE, "SYNC MASTER Step 2 TFMR=0x%016lx\n", mfspr(SPR_TFMR)); + + /* Chip TOD step checkers enable */ + if (xscom_writeme(TOD_TTYPE_2, PPC_BIT(0))) { + prerror("XSCOM error enabling steppers\n"); + goto error; + } + + prlog(PR_INSANE, "SYNC MASTER Step 3 TFMR=0x%016lx\n", mfspr(SPR_TFMR)); + + /* Chip TOD interrupt check */ + if (!chiptod_interrupt_check()) + goto error; + prlog(PR_INSANE, "SYNC MASTER Step 4 TFMR=0x%016lx\n", mfspr(SPR_TFMR)); + + /* Switch local chiptod to "Not Set" state */ + if (xscom_writeme(TOD_LOAD_TOD_MOD, PPC_BIT(0))) { + prerror("XSCOM error sending LOAD_TOD_MOD\n"); + goto error; + } + + /* Switch all remote chiptod to "Not Set" state */ + if (xscom_writeme(TOD_TTYPE_5, PPC_BIT(0))) { + prerror("XSCOM error sending TTYPE_5\n"); + goto error; + } + + /* + * Load the master's current timebase value into the Chip TOD + * network. This is so we have sane timestamps across the whole + * IPL process. The Chip TOD documentation says that the loaded + * value needs to be one STEP before a SYNC. In other words, + * set the low bits to 0x1ff0. 
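+ *
+ * Worked example (value illustrative): if mftb() reads
+ * 0x0000123456789abc, then (tb & ~0x1fff) | 0x1ff0 yields
+ * 0x0000123456789ff0, which is 0x10 ticks short of the next
+ * 0x2000-aligned boundary at 0x000012345678a000.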
+ */ + initial_tb_value = (mftb() & ~0x1fff) | 0x1ff0; + + /* Chip TOD load initial value */ + if (xscom_writeme(TOD_CHIPTOD_LOAD_TB, initial_tb_value)) { + prerror("XSCOM error setting init TB\n"); + goto error; + } + + prlog(PR_INSANE, "SYNC MASTER Step 5 TFMR=0x%016lx\n", mfspr(SPR_TFMR)); + + if (!chiptod_poll_running()) + goto error; + prlog(PR_INSANE, "SYNC MASTER Step 6 TFMR=0x%016lx\n", mfspr(SPR_TFMR)); + + /* Move chiptod value to core TB */ + if (!chiptod_to_tb()) + goto error; + prlog(PR_INSANE, "SYNC MASTER Step 7 TFMR=0x%016lx\n", mfspr(SPR_TFMR)); + + /* Send local chip TOD to all chips TOD */ + if (xscom_writeme(TOD_TTYPE_4, PPC_BIT(0))) { + prerror("XSCOM error sending TTYPE_4\n"); + goto error; + } + + /* Check if TB is running */ + if (!chiptod_check_tb_running()) + goto error; + + prlog(PR_INSANE, "Master sync completed, TB=%lx\n", mfspr(SPR_TBRL)); + + /* + * A little delay to make sure the remote chips get up to + * speed before we start syncing them. + * + * We have to do it here because we know our TB is running + * while the boot thread TB might not yet. + */ + time_wait_ms(1); + + *result = true; + return; + error: + prerror("Master sync failed! TFMR=0x%016lx, retrying...\n", mfspr(SPR_TFMR)); + *result = false; +} + +static void chiptod_sync_slave(void *data) +{ + bool *result = data; + bool do_sync = false; + + /* Only get primaries, not threads */ + if (!this_cpu()->is_secondary) + do_sync = true; + + if (chiptod_type == chiptod_p10 && this_cpu()->is_fused_core && + PVR_VERS_MAJ(mfspr(SPR_PVR)) == 2) { + /* P10 DD2 fused core workaround, must sync on small cores */ + if (this_cpu() == this_cpu()->ec_primary) + do_sync = true; + } + + if (!do_sync) { + /* Just cleanup the TFMR */ + chiptod_cleanup_thread_tfmr(); + *result = true; + return; + } + + prlog(PR_DEBUG, "Slave sync on CPU PIR 0x%04x...\n", + this_cpu()->pir); + + /* Apply base tfmr */ + mtspr(SPR_TFMR, base_tfmr); + + /* From recipe provided by pHyp folks, reset various errors + * before attempting the sync + */ + chiptod_reset_tb_errors(); + + /* Cleanup thread tfmr bits */ + chiptod_cleanup_thread_tfmr(); + + /* Switch timebase to "Not Set" state */ + if (!chiptod_mod_tb()) + goto error; + prlog(PR_INSANE, "SYNC SLAVE Step 2 TFMR=0x%016lx\n", mfspr(SPR_TFMR)); + + /* Chip TOD running check */ + if (!chiptod_poll_running()) + goto error; + prlog(PR_INSANE, "SYNC SLAVE Step 3 TFMR=0x%016lx\n", mfspr(SPR_TFMR)); + + /* Chip TOD interrupt check */ + if (!chiptod_interrupt_check()) + goto error; + prlog(PR_INSANE, "SYNC SLAVE Step 4 TFMR=0x%016lx\n", mfspr(SPR_TFMR)); + + /* Move chiptod value to core TB */ + if (!chiptod_to_tb()) + goto error; + prlog(PR_INSANE, "SYNC SLAVE Step 5 TFMR=0x%016lx\n", mfspr(SPR_TFMR)); + + /* Check if TB is running */ + if (!chiptod_check_tb_running()) + goto error; + + prlog(PR_INSANE, "Slave sync completed, TB=%lx\n", mfspr(SPR_TBRL)); + + *result = true; + return; + error: + prerror("Slave sync failed ! 
TFMR=0x%016lx, retrying...\n", mfspr(SPR_TFMR)); + *result = false; +} + +bool chiptod_wakeup_resync(void) +{ + if (chiptod_primary < 0) + return 0; + + lock(&chiptod_lock); + + /* Apply base tfmr */ + mtspr(SPR_TFMR, base_tfmr); + + /* From recipe provided by pHyp folks, reset various errors + * before attempting the sync + */ + chiptod_reset_tb_errors(); + + /* Cleanup thread tfmr bits */ + chiptod_cleanup_thread_tfmr(); + + /* Switch timebase to "Not Set" state */ + if (!chiptod_mod_tb()) + goto error; + + /* Move chiptod value to core TB */ + if (!chiptod_to_tb()) + goto error; + + unlock(&chiptod_lock); + + return true; + error: + prerror("Resync failed ! TFMR=0x%16lx\n", mfspr(SPR_TFMR)); + unlock(&chiptod_lock); + return false; +} + +/* + * Fixup for p10 TOD bug workaround. + * + * The TOD may fail to start if all clocks in the system are derived from + * the same reference oscillator. + * + * Avoiding this is pretty easy: Whenever we clear/reset the TOD registers, + * make sure to init bits 26:31 of TOD_SLAVE_PATH_CTRL (0x40005) to 0b111111 + * instead of 0b000000. The value 0 in TOD_S_PATH_CTRL_REG(26:31) must be + * avoided, and if it does get written it must be followed up by writing a + * value of all ones to clean up the resulting bad state before the (nonzero) + * final value can be written. + */ +static void fixup_tod_reg_value(struct chiptod_tod_regs *treg_entry) +{ + int32_t chip_id = this_cpu()->chip_id; + + if (proc_gen != proc_gen_p10) + return; + + if (treg_entry->xscom_addr == TOD_SLAVE_PATH_CTRL) + treg_entry->val[chip_id].data |= PPC_BITMASK(26,31); +} + +static int __chiptod_recover_tod_errors(void) +{ + uint64_t terr; + uint64_t treset = 0; + int i, rc = -1; + int32_t chip_id = this_cpu()->chip_id; + + /* Read TOD error register */ + if (xscom_readme(TOD_ERROR, &terr)) { + prerror("XSCOM error reading TOD_ERROR reg\n"); + return 0; + } + /* Check for sync check error and recover */ + if ((terr & TOD_ERR_TOD_SYNC_CHECK) || + (terr & TOD_ERR_TOD_FSM_PARITY) || + (terr & TOD_ERR_CTCR_PARITY) || + (terr & TOD_ERR_PSS_HAMMING_DISTANCE) || + (terr & TOD_ERR_DELAY_COMPL_PARITY) || + (terr & TOD_ERR_TOD_REGISTER_PARITY)) { + chiptod_reset_tod_errors(); + rc = 1; + } + + /* + * Check for TOD control register parity errors and restore those + * registers with last saved valid values. + */ + for (i = 0; i < ARRAY_SIZE(chiptod_tod_regs); i++) { + if (!(terr & chiptod_tod_regs[i].error_bit)) + continue; + + /* Check if we have valid last saved register value. 
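+ * (The cache is filled at chiptod init and again after a topology
+ * switch; an entry is only invalid if that XSCOM read failed, in
+ * which case there is no safe value to write back and we give up
+ * rather than restoring garbage.)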
*/ + if (!chiptod_tod_regs[i].val[chip_id].valid) { + prerror("Failed to restore TOD register: %08llx", + chiptod_tod_regs[i].xscom_addr); + return 0; + } + + fixup_tod_reg_value(&chiptod_tod_regs[i]); + + prlog(PR_DEBUG, "Parity error, Restoring TOD register: " + "%08llx = %016llx\n", + chiptod_tod_regs[i].xscom_addr, + chiptod_tod_regs[i].val[chip_id].data); + if (xscom_writeme(chiptod_tod_regs[i].xscom_addr, + chiptod_tod_regs[i].val[chip_id].data)) { + prerror("XSCOM error writing 0x%08llx reg.\n", + chiptod_tod_regs[i].xscom_addr); + return 0; + } + treset |= chiptod_tod_regs[i].error_bit; + } + + if (treset && (xscom_writeme(TOD_ERROR, treset))) { + prerror("XSCOM error writing TOD_ERROR !\n"); + return 0; + } + /* We have handled all the TOD errors routed to hypervisor */ + if (treset) + rc = 1; + return rc; +} + +int chiptod_recover_tod_errors(void) +{ + int rc; + + lock(&chiptod_lock); + rc = __chiptod_recover_tod_errors(); + unlock(&chiptod_lock); + return rc; +} + +static int32_t chiptod_get_active_master(void) +{ + if (current_topology < 0) + return -1; + + if (chiptod_topology_info[current_topology].status == + chiptod_active_master) + return chiptod_topology_info[current_topology].id; + return -1; +} + +/* Return true if Active master TOD is running. */ +static bool chiptod_master_running(void) +{ + int32_t active_master_chip; + + active_master_chip = chiptod_get_active_master(); + if (active_master_chip != -1) { + if (chiptod_running_check(active_master_chip)) + return true; + } + return false; +} + +static bool chiptod_set_ttype4_mode(struct proc_chip *chip, bool enable) +{ + uint64_t tval; + + /* Sanity check */ + if (!chip) + return false; + + if (xscom_read(chip->id, TOD_PIB_MASTER, &tval)) { + prerror("XSCOM error reading PIB_MASTER\n"); + return false; + } + + if (enable) { + /* + * Enable TTYPE4 send mode. This allows TOD to respond to + * TTYPE3 request. + */ + tval |= TOD_PIBM_TTYPE4_SEND_MODE; + tval |= TOD_PIBM_TTYPE4_SEND_ENBL; + } else { + /* Disable TTYPE4 send mode. */ + tval &= ~TOD_PIBM_TTYPE4_SEND_MODE; + tval &= ~TOD_PIBM_TTYPE4_SEND_ENBL; + } + + if (xscom_write(chip->id, TOD_PIB_MASTER, tval)) { + prerror("XSCOM error writing PIB_MASTER\n"); + return false; + } + return true; +} + +/* Stop TODs on slave chips in backup topology. */ +static void chiptod_stop_slave_tods(void) +{ + struct proc_chip *chip = NULL; + enum chiptod_topology backup_topo; + uint64_t terr = 0; + + /* Inject TOD sync check error on salve TODs to stop them. */ + terr |= TOD_ERR_TOD_SYNC_CHECK; + + if (current_topology == chiptod_topo_primary) + backup_topo = chiptod_topo_secondary; + else + backup_topo = chiptod_topo_primary; + + for_each_chip(chip) { + enum chiptod_chip_role role; + + /* Current chip TOD is already in stooped state */ + if (chip->id == this_cpu()->chip_id) + continue; + + role = chiptod_get_chip_role(backup_topo, chip->id); + + /* Skip backup master chip TOD. */ + if (role == chiptod_chip_role_MDMT) + continue; + + if (xscom_write(chip->id, TOD_ERROR_INJECT, terr)) + prerror("XSCOM error writing TOD_ERROR_INJ\n"); + + if (chiptod_running_check(chip->id)) { + prlog(PR_DEBUG, + "Failed to stop TOD on slave CHIP [%d]\n", + chip->id); + } + } +} + +static bool is_topology_switch_required(void) +{ + int32_t active_master_chip; + uint64_t tod_error; + + active_master_chip = chiptod_get_active_master(); + + /* Check if TOD is running on Active master. */ + if (chiptod_master_running()) + return false; + + /* + * Check if sync/step network is running. 
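(We already know at this point that the active master TOD is not running.)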
+ * + * If sync/step network is not running on current active topology + * then we need switch topology to recover from TOD error. + */ + if (!chiptod_sync_step_check_running(current_topology)) { + prlog(PR_DEBUG, "Sync/Step network not running\n"); + return true; + } + + /* + * Check if there is a step check error reported on + * Active master. + */ + if (xscom_read(active_master_chip, TOD_ERROR, &tod_error)) { + prerror("XSCOM error reading TOD_ERROR reg\n"); + /* + * Can't do anything here. But we already found that + * sync/step network is running. Hence return false. + */ + return false; + } + + if (tod_error & TOD_ERR_MP0_STEP_CHECK) { + prlog(PR_DEBUG, "TOD step check error\n"); + return true; + } + + return false; +} + +static bool chiptod_backup_valid(void) +{ + enum chiptod_topology backup_topo; + + if (current_topology < 0) + return false; + + if (current_topology == chiptod_topo_primary) + backup_topo = chiptod_topo_secondary; + else + backup_topo = chiptod_topo_primary; + + if (chiptod_topology_info[backup_topo].status == chiptod_backup_master) + return chiptod_sync_step_check_running(backup_topo); + + return false; +} + +static void chiptod_topology_switch_complete(void) +{ + /* + * After the topology switch, we may have a non-functional backup + * topology, and we won't be able to recover from future TOD errors + * that requires topology switch. Someone needs to either fix it OR + * configure new functional backup topology. + * + * Bit 18 of the Pervasive FIR is used to signal that TOD error + * analysis needs to be performed. This allows FSP/PRD to + * investigate and re-configure new backup topology if required. + * Once new backup topology is configured and ready, FSP sends a + * mailbox command xE6, s/c 0x06, mod 0, to enable the backup + * topology. + * + * This isn't documented anywhere. This info is provided by FSP + * folks. + */ + if (xscom_writeme(LOCAL_CORE_FIR, LFIR_SWITCH_COMPLETE)) { + prerror("XSCOM error writing LOCAL_CORE_FIR\n"); + return; + } + + /* Save TOD control registers values. */ + chiptod_cache_tod_registers(); + + prlog(PR_DEBUG, "Topology switch complete\n"); + print_topology_info(); +} + +/* + * Sync up TOD with other chips and get TOD in running state. + * Check if current topology is active and running. If not, then + * trigger a topology switch. + */ +static int chiptod_start_tod(void) +{ + struct proc_chip *chip = NULL; + + /* Do a topology switch if required. */ + if (is_topology_switch_required()) { + int32_t mchip = chiptod_get_active_master(); + + prlog(PR_DEBUG, "Need topology switch to recover\n"); + /* + * There is a failure in StepSync network in current + * active topology. TOD is not running on active master chip. + * We need to sync with backup master chip TOD. + * But before we do that we need to switch topology to make + * backup master as the new active master. Once we switch the + * topology we can then request TOD value from new active + * master. But make sure we move local chiptod to Not Set + * before requesting TOD value. + * + * Before triggering a topology switch, check if backup + * is valid and stop all slave TODs in backup topology. + */ + if (!chiptod_backup_valid()) { + prerror("Backup master is not enabled. " + "Can not do a topology switch.\n"); + goto error_out; + } + + chiptod_stop_slave_tods(); + + if (xscom_write(mchip, TOD_TTYPE_1, PPC_BIT(0))) { + prerror("XSCOM error switching primary/secondary\n"); + goto error_out; + } + + /* Update topology info. 
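The TTYPE_1 write above requested the primary/secondary switch, so re-query which topology is now active and refresh both entries.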
*/ + current_topology = query_current_topology(); + chiptod_update_topology(chiptod_topo_primary); + chiptod_update_topology(chiptod_topo_secondary); + + /* + * We just switched topologies to recover. + * Check if new master TOD is running. + */ + if (!chiptod_master_running()) { + prerror("TOD is not running on new master.\n"); + goto error_out; + } + + /* + * Enable step checkers on all Chip TODs + * + * During topology switch, step checkers are disabled + * on all Chip TODs by default. Enable them. + */ + if (xscom_writeme(TOD_TTYPE_2, PPC_BIT(0))) { + prerror("XSCOM error enabling steppers\n"); + goto error_out; + } + + chiptod_topology_switch_complete(); + } + + if (!chiptod_master_running()) { + /* + * Active Master TOD is not running, which means it won't + * respond to TTYPE_3 request. + * + * Find a chip that has TOD in running state and configure + * it to respond to TTYPE_3 request. + */ + for_each_chip(chip) { + if (chiptod_running_check(chip->id)) { + if (chiptod_set_ttype4_mode(chip, true)) + break; + } + } + } + + /* Switch local chiptod to "Not Set" state */ + if (xscom_writeme(TOD_LOAD_TOD_MOD, PPC_BIT(0))) { + prerror("XSCOM error sending LOAD_TOD_MOD\n"); + goto error_out; + } + + /* + * Request the current TOD value from another chip. + * This will move TOD in running state + */ + if (xscom_writeme(TOD_TTYPE_3, PPC_BIT(0))) { + prerror("XSCOM error sending TTYPE_3\n"); + goto error_out; + } + + /* Check if chip TOD is running. */ + if (!chiptod_poll_running()) + goto error_out; + + /* Restore the ttype4_mode. */ + chiptod_set_ttype4_mode(chip, false); + return 1; + +error_out: + chiptod_unrecoverable = true; + return 0; +} + +static bool tfmr_recover_tb_errors(uint64_t tfmr) +{ + uint64_t tfmr_reset_error; + unsigned long timeout = 0; + + /* Ask for automatic clear of errors */ + tfmr_reset_error = base_tfmr | SPR_TFMR_CLEAR_TB_ERRORS; + + /* Additionally pHyp sets these (write-1-to-clear ?) */ + if (tfmr & SPR_TFMR_TB_MISSING_SYNC) + tfmr_reset_error |= SPR_TFMR_TB_MISSING_SYNC; + + if (tfmr & SPR_TFMR_TB_MISSING_STEP) + tfmr_reset_error |= SPR_TFMR_TB_MISSING_STEP; + + /* + * write 1 to bit 45 to clear TB residue the error. + * TB register has already been reset to zero as part pre-recovery. + */ + if (tfmr & SPR_TFMR_TB_RESIDUE_ERR) + tfmr_reset_error |= SPR_TFMR_TB_RESIDUE_ERR; + + if (tfmr & SPR_TFMR_FW_CONTROL_ERR) + tfmr_reset_error |= SPR_TFMR_FW_CONTROL_ERR; + + if (tfmr & SPR_TFMR_TBST_CORRUPT) + tfmr_reset_error |= SPR_TFMR_TBST_CORRUPT; + + mtspr(SPR_TFMR, tfmr_reset_error); + + /* We have to write "Clear TB Errors" again */ + tfmr_reset_error = base_tfmr | SPR_TFMR_CLEAR_TB_ERRORS; + mtspr(SPR_TFMR, tfmr_reset_error); + + do { + if (++timeout >= TIMEOUT_LOOPS) { + prerror("TB error reset timeout !\n"); + return false; + } + tfmr = mfspr(SPR_TFMR); + if (tfmr & SPR_TFMR_TFMR_CORRUPT) { + prerror("TB error reset: corrupt TFMR !\n"); + return false; + } + } while (tfmr & SPR_TFMR_CLEAR_TB_ERRORS); + return true; +} + +bool tfmr_recover_local_errors(uint64_t tfmr) +{ + uint64_t tfmr_reset_errors = 0; + + if (tfmr & SPR_TFMR_DEC_PARITY_ERR) { + /* Set DEC with all ones */ + mtspr(SPR_DEC, ~0); + + /* set bit 59 to clear TFMR DEC parity error. */ + tfmr_reset_errors |= SPR_TFMR_DEC_PARITY_ERR; + } + + /* + * Reset PURR/SPURR to recover. We also need help from KVM + * layer to handle this change in PURR/SPURR. That needs + * to be handled in kernel KVM layer. For now, to recover just + * reset it. 
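PURR and SPURR are simply zeroed below, and the corresponding TFMR parity bits are then cleared by writing TFMR twice.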
+ */ + if (tfmr & SPR_TFMR_PURR_PARITY_ERR) { + /* set PURR register with sane value or reset it. */ + mtspr(SPR_PURR, 0); + + /* set bit 57 to clear TFMR PURR parity error. */ + tfmr_reset_errors |= SPR_TFMR_PURR_PARITY_ERR; + } + + if (tfmr & SPR_TFMR_SPURR_PARITY_ERR) { + /* set PURR register with sane value or reset it. */ + mtspr(SPR_SPURR, 0); + + /* set bit 58 to clear TFMR PURR parity error. */ + tfmr_reset_errors |= SPR_TFMR_SPURR_PARITY_ERR; + } + + /* Write TFMR twice to clear the error */ + mtspr(SPR_TFMR, base_tfmr | tfmr_reset_errors); + mtspr(SPR_TFMR, base_tfmr | tfmr_reset_errors); + + /* Get fresh copy of TFMR */ + tfmr = mfspr(SPR_TFMR); + + /* Check if TFMR non-TB errors still present. */ + if (tfmr & tfmr_reset_errors) { + prerror("TFMR non-TB error recovery failed! " + "TFMR=0x%016lx\n", mfspr(SPR_TFMR)); + return false; + } + return true; +} + +/* + * TFMR parity error recovery as per pc_workbook: + * MT(TFMR) bits 11 and 60 are b’1’ + * MT(HMER) all bits 1 except for bits 4,5 + */ +bool recover_corrupt_tfmr(void) +{ + uint64_t tfmr; + + /* Get the base TFMR */ + tfmr = base_tfmr; + + /* Set bit 60 to clear TFMR parity error. */ + tfmr |= SPR_TFMR_TFMR_CORRUPT; + mtspr(SPR_TFMR, tfmr); + + /* Write twice to clear the error */ + mtspr(SPR_TFMR, tfmr); + + /* Get fresh copy of TFMR */ + tfmr = mfspr(SPR_TFMR); + + /* Check if TFMR parity error still present. */ + if (tfmr & SPR_TFMR_TFMR_CORRUPT) { + prerror("TFMR error recovery: corrupt TFMR !\n"); + return false; + } + + /* + * Now that we have sane value in TFMR, check if Timebase machine + * state is in ERROR state. If yes, clear TB errors so that + * Timebase machine state changes to RESET state. Once in RESET state + * then we can then load TB with TOD value. + */ + if (GETFIELD(SPR_TFMR_TBST_ENCODED, tfmr) == TBST_STATE_ERROR) { + if (!chiptod_reset_tb_errors()) + return false; + } + return true; +} + +void tfmr_cleanup_core_errors(uint64_t tfmr) +{ + /* If HDEC is bad, clean it on all threads before we clear the + * error condition. + */ + if (tfmr & SPR_TFMR_HDEC_PARITY_ERROR) + mtspr(SPR_HDEC, 0); + + /* If TB is invalid, clean it on all threads as well, it will be + * restored after the next rendez-vous + */ + if (!(tfmr & SPR_TFMR_TB_VALID)) { + mtspr(SPR_TBWU, 0); + mtspr(SPR_TBWU, 0); + } +} + +int tfmr_clear_core_errors(uint64_t tfmr) +{ + uint64_t tfmr_reset_errors = 0; + + /* return -1 if there is nothing to be fixed. */ + if (!(tfmr & SPR_TFMR_HDEC_PARITY_ERROR)) + return -1; + + tfmr_reset_errors |= SPR_TFMR_HDEC_PARITY_ERROR; + + /* Write TFMR twice to clear the error */ + mtspr(SPR_TFMR, base_tfmr | tfmr_reset_errors); + mtspr(SPR_TFMR, base_tfmr | tfmr_reset_errors); + + return 1; +} + +/* + * Recover from TB and TOD errors. + * Timebase register is per core and first thread that gets chance to + * handle interrupt would fix actual TFAC errors and rest of the threads + * from same core would see no errors. Return -1 if no errors have been + * found. The caller (handle_hmi_exception) of this function would not + * send an HMI event to host if return value is -1. + * + * Return values: + * 0 <= Failed to recover from errors + * 1 <= Successfully recovered from errors + * -1 <= No errors found. Errors are already been fixed. + */ +int chiptod_recover_tb_errors(bool *out_resynced) +{ + uint64_t tfmr; + int rc = -1; + + *out_resynced = false; + + if (chiptod_primary < 0) + return 0; + + lock(&chiptod_lock); + + /* + * Return if TOD is unrecoverable. 
+ * The previous attempt to recover TOD has been failed. + */ + if (chiptod_unrecoverable) { + rc = 0; + goto error_out; + } + + /* Get fresh copy of TFMR */ + tfmr = mfspr(SPR_TFMR); + + /* + * Check for TB errors. + * On Sync check error, bit 44 of TFMR is set. Check for it and + * clear it. + * + * In some rare situations we may have all TB errors already cleared, + * but TB stuck in waiting for new value from TOD with TFMR bit 18 + * set to '1'. This uncertain state of TB would fail the process + * of getting TB back into running state. Get TB in clean initial + * state by clearing TB errors if TFMR[18] is set. + */ + if ((tfmr & SPR_TFMR_TB_MISSING_STEP) || + (tfmr & SPR_TFMR_TB_RESIDUE_ERR) || + (tfmr & SPR_TFMR_FW_CONTROL_ERR) || + (tfmr & SPR_TFMR_TBST_CORRUPT) || + (tfmr & SPR_TFMR_MOVE_CHIP_TOD_TO_TB) || + (tfmr & SPR_TFMR_TB_MISSING_SYNC)) { + if (!tfmr_recover_tb_errors(tfmr)) { + rc = 0; + goto error_out; + } + } + + /* + * Check for TOD sync check error. + * On TOD errors, bit 51 of TFMR is set. If this bit is on then we + * need to fetch TOD error register and recover from TOD errors. + * Bit 33 of TOD error register indicates sync check error. + */ + if (tfmr & SPR_TFMR_CHIP_TOD_INTERRUPT) + rc = __chiptod_recover_tod_errors(); + + /* Check if TB is running. If not then we need to get it running. */ + if (!(tfmr & SPR_TFMR_TB_VALID)) { + rc = 0; + + /* Place TB in Notset state. */ + if (!chiptod_mod_tb()) + goto error_out; + + /* + * Before we move TOD to core TB check if TOD is running. + * If not, then get TOD in running state. + */ + if (!chiptod_running_check(this_cpu()->chip_id)) + if (!chiptod_start_tod()) + goto error_out; + + /* Move chiptod value to core TB */ + if (!chiptod_to_tb()) + goto error_out; + + *out_resynced = true; + + /* We have successfully able to get TB running. */ + rc = 1; + } + +error_out: + unlock(&chiptod_lock); + return rc; +} + +static int64_t opal_resync_timebase(void) +{ + /* Mambo and qemu doesn't simulate the chiptod */ + if (chip_quirk(QUIRK_NO_CHIPTOD)) + return OPAL_SUCCESS; + + if (!chiptod_wakeup_resync()) { + prerror("OPAL: Resync timebase failed on CPU 0x%04x\n", + this_cpu()->pir); + return OPAL_HARDWARE; + } + return OPAL_SUCCESS; +} +opal_call(OPAL_RESYNC_TIMEBASE, opal_resync_timebase, 0); + +static void chiptod_print_tb(void *data __unused) +{ + prlog(PR_DEBUG, "PIR 0x%04x TB=%lx\n", this_cpu()->pir, + mfspr(SPR_TBRL)); +} + +static bool chiptod_probe(void) +{ + struct dt_node *np; + + dt_for_each_compatible(dt_root, np, "ibm,power-chiptod") { + uint32_t chip; + + /* Old DT has chip-id in chiptod node, newer only in the + * parent xscom bridge + */ + chip = dt_get_chip_id(np); + + if (dt_has_node_property(np, "primary", NULL)) { + chiptod_primary = chip; + if (dt_node_is_compatible(np, "ibm,power8-chiptod")) + chiptod_type = chiptod_p8; + if (dt_node_is_compatible(np, "ibm,power9-chiptod")) + chiptod_type = chiptod_p9; + if (dt_node_is_compatible(np, "ibm,power10-chiptod")) + chiptod_type = chiptod_p10; + } + + if (dt_has_node_property(np, "secondary", NULL)) + chiptod_secondary = chip; + + } + + if (chiptod_type == chiptod_unknown) { + prerror("Unknown TOD type !\n"); + return false; + } + + return true; +} + +static void chiptod_discover_new_backup(enum chiptod_topology topo) +{ + struct proc_chip *chip = NULL; + + /* Scan through available chips to find new backup master chip */ + for_each_chip(chip) { + if (_chiptod_get_chip_status(chip->id) == chiptod_backup_master) + break; + } + + /* Found new backup master chip. 
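(chip is NULL here if no chip reported backup-master status.)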
Update the topology info */ + if (chip) { + prlog(PR_DEBUG, "New backup master: CHIP [%d]\n", + chip->id); + + if (topo == chiptod_topo_primary) + chiptod_primary = chip->id; + else + chiptod_secondary = chip->id; + chiptod_topology_info[topo].id = chip->id; + chiptod_update_topology(topo); + + prlog(PR_DEBUG, + "Backup topology configuration changed.\n"); + print_topology_info(); + } + + /* + * Topology configuration has changed. Save TOD control registers + * values. + */ + chiptod_cache_tod_registers(); +} + +/* + * Enable/disable backup topology. + * If request is to enable topology, then discover new backup master + * chip and update the topology configuration info. If the request is + * to disable topology, then mark the current backup topology as disabled. + * Return error (-1) if the action is requested on currenlty active + * topology. + * + * Return values: + * true <= Success + * false <= Topology is active and in use. + */ +bool chiptod_adjust_topology(enum chiptod_topology topo, bool enable) +{ + uint8_t rc = true; + /* + * The FSP can only request that the currently inactive topology + * be disabled or enabled. If the requested topology is currently + * the active topology, then fail this request with a -1 (TOD + * topology in use) status as return code. + */ + lock(&chiptod_lock); + if (topo == current_topology) { + rc = false; + goto out; + } + + if (enable) + chiptod_discover_new_backup(topo); + else + chiptod_topology_info[topo].status = chiptod_backup_disabled; +out: + unlock(&chiptod_lock); + return rc; +} + +static void chiptod_init_topology_info(void) +{ + /* Find and update current topology in use. */ + current_topology = query_current_topology(); + + /* Initialized primary topology chip config info */ + chiptod_topology_info[chiptod_topo_primary].id = chiptod_primary; + chiptod_update_topology(chiptod_topo_primary); + + /* Initialized secondary topology chip config info */ + chiptod_topology_info[chiptod_topo_secondary].id = chiptod_secondary; + chiptod_update_topology(chiptod_topo_secondary); + + /* Cache TOD control registers values. 
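These cached values are what __chiptod_recover_tod_errors() restores after a TOD register parity error.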
*/ + chiptod_cache_tod_registers(); + print_topology_info(); +} + +void chiptod_init(void) +{ + struct cpu_thread *cpu0, *cpu; + bool sres; + int i; + + /* Mambo and qemu doesn't simulate the chiptod */ + if (chip_quirk(QUIRK_NO_CHIPTOD)) + return; + + op_display(OP_LOG, OP_MOD_CHIPTOD, 0); + + if (!chiptod_probe()) { + prerror("Failed ChipTOD detection !\n"); + op_display(OP_FATAL, OP_MOD_CHIPTOD, 0); + abort(); + } + + op_display(OP_LOG, OP_MOD_CHIPTOD, 1); + + /* Pick somebody on the primary */ + cpu0 = find_cpu_by_chip_id(chiptod_primary); + + /* Calculate the base TFMR value used for everybody */ + chiptod_setup_base_tfmr(); + + prlog(PR_DEBUG, "Base TFMR=0x%016llx\n", base_tfmr); + + i = NUM_SYNC_RETRIES; + do { + /* Schedule master sync */ + sres = false; + cpu_wait_job(cpu_queue_job(cpu0, "chiptod_sync_master", + chiptod_sync_master, &sres), true); + } while (!sres && i--); + + if (!sres) { + op_display(OP_FATAL, OP_MOD_CHIPTOD, 2); + abort(); + } + + op_display(OP_LOG, OP_MOD_CHIPTOD, 2); + + /* Schedule slave sync */ + for_each_available_cpu(cpu) { + /* Skip master */ + if (cpu == cpu0) + continue; + + i = NUM_SYNC_RETRIES; + do { + /* Queue job */ + sres = false; + cpu_wait_job(cpu_queue_job(cpu, "chiptod_sync_slave", + chiptod_sync_slave, &sres), + true); + } while (!sres && i--); + + if (!sres) { + op_display(OP_WARN, OP_MOD_CHIPTOD, 3|(cpu->pir << 8)); + prerror("CHIPTOD: Failed to sync PIR 0x%04x\n", + this_cpu()->pir); + + /* Disable threads */ + cpu_disable_all_threads(cpu); + } + op_display(OP_LOG, OP_MOD_CHIPTOD, 3|(cpu->pir << 8)); + } + + /* Display TBs */ + for_each_available_cpu(cpu) { + /* Only do primaries, not threads */ + if (cpu->is_secondary) + continue; + cpu_wait_job(cpu_queue_job(cpu, "chiptod_print_tb", + chiptod_print_tb, NULL), true); + } + + chiptod_init_topology_info(); + op_display(OP_LOG, OP_MOD_CHIPTOD, 4); +} + +/* CAPP timebase sync */ + +static bool chiptod_capp_reset_tb_errors(uint32_t chip_id, + uint32_t tfmr_addr, + uint32_t offset) +{ + uint64_t tfmr; + unsigned long timeout = 0; + + /* Ask for automatic clear of errors */ + tfmr = base_tfmr | SPR_TFMR_CLEAR_TB_ERRORS; + + /* Additionally pHyp sets these (write-1-to-clear ?) 
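Unlike tfmr_recover_tb_errors() above, which only sets the bits actually reported in TFMR, they are all set unconditionally here.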
*/ + tfmr |= SPR_TFMR_TB_MISSING_SYNC; + tfmr |= SPR_TFMR_TB_MISSING_STEP; + tfmr |= SPR_TFMR_TB_RESIDUE_ERR; + tfmr |= SPR_TFMR_TBST_CORRUPT; + tfmr |= SPR_TFMR_TFMR_CORRUPT; + + /* Write CAPP TFMR */ + xscom_write(chip_id, tfmr_addr + offset, tfmr); + + /* We have to write "Clear TB Errors" again */ + tfmr = base_tfmr | SPR_TFMR_CLEAR_TB_ERRORS; + /* Write CAPP TFMR */ + xscom_write(chip_id, tfmr_addr + offset, tfmr); + + do { + if (++timeout >= TIMEOUT_LOOPS) { + prerror("CAPP: TB error reset timeout !\n"); + return false; + } + /* Read CAPP TFMR */ + xscom_read(chip_id, tfmr_addr + offset, &tfmr); + if (tfmr & SPR_TFMR_TFMR_CORRUPT) { + prerror("CAPP: TB error reset: corrupt TFMR!\n"); + return false; + } + } while (tfmr & SPR_TFMR_CLEAR_TB_ERRORS); + return true; +} + +static bool chiptod_capp_mod_tb(uint32_t chip_id, uint32_t tfmr_addr, + uint32_t offset) +{ + uint64_t timeout = 0; + uint64_t tfmr; + + /* Switch CAPP timebase to "Not Set" state */ + tfmr = base_tfmr | SPR_TFMR_LOAD_TOD_MOD; + xscom_write(chip_id, tfmr_addr + offset, tfmr); + do { + if (++timeout >= (TIMEOUT_LOOPS*2)) { + prerror("CAPP: TB \"Not Set\" timeout\n"); + return false; + } + xscom_read(chip_id, tfmr_addr + offset, &tfmr); + if (tfmr & SPR_TFMR_TFMR_CORRUPT) { + prerror("CAPP: TB \"Not Set\" TFMR corrupt\n"); + return false; + } + if (GETFIELD(SPR_TFMR_TBST_ENCODED, tfmr) == 9) { + prerror("CAPP: TB \"Not Set\" TOD in error state\n"); + return false; + } + } while (tfmr & SPR_TFMR_LOAD_TOD_MOD); + + return true; +} + +static bool chiptod_wait_for_chip_sync(void) +{ + uint64_t tfmr; + uint64_t timeout = 0; + + /* Read core TFMR, mask bit 42, write core TFMR back */ + tfmr = mfspr(SPR_TFMR); + tfmr &= ~SPR_TFMR_TB_SYNC_OCCURED; + mtspr(SPR_TFMR, tfmr); + + /* Read core TFMR until the TB sync occurred */ + do { + if (++timeout >= TIMEOUT_LOOPS) { + prerror("No sync pulses\n"); + return false; + } + tfmr = mfspr(SPR_TFMR); + } while (!(tfmr & SPR_TFMR_TB_SYNC_OCCURED)); + return true; +} + +static bool chiptod_capp_check_tb_running(uint32_t chip_id, + uint32_t tfmr_addr, + uint32_t offset) +{ + uint64_t tfmr; + uint64_t timeout = 0; + + /* Read CAPP TFMR until TB becomes valid */ + do { + if (++timeout >= (TIMEOUT_LOOPS*2)) { + prerror("CAPP: TB Invalid!\n"); + return false; + } + xscom_read(chip_id, tfmr_addr + offset, &tfmr); + if (tfmr & SPR_TFMR_TFMR_CORRUPT) { + prerror("CAPP: TFMR corrupt!\n"); + return false; + } + } while (!(tfmr & SPR_TFMR_TB_VALID)); + return true; +} + +bool chiptod_capp_timebase_sync(unsigned int chip_id, uint32_t tfmr_addr, + uint32_t tb_addr, uint32_t offset) +{ + uint64_t tfmr; + uint64_t capp_tb; + int64_t delta; + unsigned int retry = 0; + + /* Set CAPP TFMR to base tfmr value */ + xscom_write(chip_id, tfmr_addr + offset, base_tfmr); + + /* Reset CAPP TB errors before attempting the sync */ + if (!chiptod_capp_reset_tb_errors(chip_id, tfmr_addr, offset)) + return false; + + /* Switch CAPP TB to "Not Set" state */ + if (!chiptod_capp_mod_tb(chip_id, tfmr_addr, offset)) + return false; + + /* Sync CAPP TB with core TB, retry while difference > 16usecs */ + do { + if (retry++ > 5) { + prerror("CAPP: TB sync: giving up!\n"); + return false; + } + + /* Make CAPP ready to get the TB, wait for chip sync */ + tfmr = base_tfmr | SPR_TFMR_MOVE_CHIP_TOD_TO_TB; + xscom_write(chip_id, tfmr_addr + offset, tfmr); + if (!chiptod_wait_for_chip_sync()) + return false; + + /* Set CAPP TB from core TB */ + xscom_write(chip_id, tb_addr + offset, mftb()); + + /* Wait for CAPP TFMR tb_valid bit */ + 
if (!chiptod_capp_check_tb_running(chip_id, tfmr_addr, offset)) + return false; + + /* Read CAPP TB, read core TB, compare */ + xscom_read(chip_id, tb_addr + offset, &capp_tb); + delta = mftb() - capp_tb; + if (delta < 0) + delta = -delta; + } while (tb_to_usecs(delta) > 16); + + return true; +} diff --git a/roms/skiboot/hw/dio-p9.c b/roms/skiboot/hw/dio-p9.c new file mode 100644 index 000000000..5153f6eeb --- /dev/null +++ b/roms/skiboot/hw/dio-p9.c @@ -0,0 +1,132 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* Copyright 2019 IBM Corp. */ + +#define pr_fmt(fmt) "DIO: " fmt + +#include <chip.h> +#include <dio-p9.h> +#include <opal.h> +#include <xscom.h> +#include <xscom-p9-regs.h> + +void p9_dio_init(void) +{ + struct dt_node *xn; + struct proc_chip *chip; + struct p9_dio *dio; + + if (proc_gen < proc_gen_p9) + return; + + dt_for_each_compatible(dt_root, xn, "ibm,xscom") { + dio = zalloc(sizeof(struct p9_dio)); + assert(dio); + chip = get_chip(dt_get_chip_id(xn)); + assert(chip); + chip->dio = dio; + } +} + +int dio_interrupt_register(struct proc_chip *chip, + int port, dio_interrupt_callback callback) +{ + u64 val; + int rc; + + assert(chip); + assert(chip->dio); + + if (port < 0 || port >= NUM_OF_P9_DIO_PORTS) + return OPAL_PARAMETER; + + if (chip->dio->callbacks[port]) /* This port already has a callback */ + return OPAL_PARAMETER; + + rc = xscom_read(chip->id, P9_GPIO_INTERRUPT_ENABLE, &val); + if (rc != OPAL_SUCCESS) { + prlog(PR_ERR, "XSCOM error %d reading reg 0x%llx\n", + rc, P9_GPIO_INTERRUPT_ENABLE); + return OPAL_HARDWARE; + } + + val |= PPC_BIT(port); + rc = xscom_write(chip->id, P9_GPIO_INTERRUPT_ENABLE, val); + if (rc != OPAL_SUCCESS) { + prlog(PR_ERR, "XSCOM error %d writing reg 0x%llx\n", + rc, P9_GPIO_INTERRUPT_ENABLE); + return OPAL_HARDWARE; + } + + chip->dio->callbacks[port] = callback; + + return OPAL_SUCCESS; +} + +int dio_interrupt_deregister(struct proc_chip* chip, + int port, dio_interrupt_callback callback) +{ + u64 val; + int rc; + + assert(chip); + assert(chip->dio); + + if (port < 0 || port >= NUM_OF_P9_DIO_PORTS) + return OPAL_PARAMETER; + + if (chip->dio->callbacks[port] != callback) + return OPAL_PARAMETER; + + rc = xscom_read(chip->id, P9_GPIO_INTERRUPT_ENABLE, &val); + if (rc != OPAL_SUCCESS) { + prlog(PR_ERR, "XSCOM error %d reading reg 0x%llx\n", + rc, P9_GPIO_INTERRUPT_ENABLE); + return OPAL_HARDWARE; + } + + val &= ~PPC_BIT(port); + rc = xscom_write(chip->id, P9_GPIO_INTERRUPT_ENABLE, val); + if (rc != OPAL_SUCCESS) { + prlog(PR_ERR, "XSCOM error %d writing reg 0x%llx\n", + rc, P9_GPIO_INTERRUPT_ENABLE); + return OPAL_HARDWARE; + } + + chip->dio->callbacks[port] = NULL; + + return OPAL_SUCCESS; +} + +void dio_interrupt_handler(uint32_t chip_id) +{ + struct proc_chip *chip; + u64 val; + int rc; + int i; + + chip = get_chip(chip_id); + if (chip == NULL || chip->dio == NULL) + return; + + rc = xscom_read(chip->id, P9_GPIO_INTERRUPT_STATUS, &val); + if (rc != OPAL_SUCCESS) { + prlog(PR_ERR, "XSCOM error %d reading reg 0x%llx\n", + rc, P9_GPIO_INTERRUPT_STATUS); + return; + } + + for (i = 0; i < NUM_OF_P9_DIO_PORTS; ++i) { + if (val & PPC_BIT(i)) { + if (chip->dio->callbacks[i]) + chip->dio->callbacks[i](chip); + else + prlog(PR_ERR, + "DIO interrupt triggerd on chip 0x%x" + " port %d but no handler\n", + chip->id, i); + /* Write 1 to clear the interrupt status */ + xscom_write(chip->id, P9_GPIO_INTERRUPT_CONDITION, + val & PPC_BIT(i)); + } + } +} diff --git a/roms/skiboot/hw/dts.c b/roms/skiboot/hw/dts.c new file mode 100644 index 
000000000..d8831e4d3 --- /dev/null +++ b/roms/skiboot/hw/dts.c @@ -0,0 +1,416 @@ +// SPDX-License-Identifier: Apache-2.0 +/* Copyright 2013-2019 IBM Corp. */ + +#include <xscom.h> +#include <chip.h> +#include <sensor.h> +#include <dts.h> +#include <skiboot.h> +#include <opal-api.h> +#include <opal-msg.h> +#include <timer.h> +#include <timebase.h> + +struct dts { + uint8_t valid; + uint8_t trip; + int16_t temp; +}; + +/* + * Attributes for the core temperature sensor + */ +enum { + SENSOR_DTS_ATTR_TEMP_MAX, + SENSOR_DTS_ATTR_TEMP_TRIP +}; + + +/* Therm mac result masking for DTS (result(0:15) + * 0:3 - 0x0 + * 4:11 - Temperature in degrees C + * 12:13 - trip bits: 00 - no trip; 01 - warning; 10 - critical; 11 - fatal + * 14 - spare + * 15 - valid + */ +static void dts_decode_one_dts(uint16_t raw, struct dts *dts) +{ + /* + * The value is both signed and unsigned :-) 0xff could be + * either 255C or -1C, so for now we treat this as unsigned + * which is sufficient for our purpose. We could try to be + * a bit smarter and treat it as signed for values between + * -10 and 0 and unsigned to 239 or something like that... + */ + dts->valid = raw & 1; + if (dts->valid) { + dts->temp = (raw >> 4) & 0xff; + dts->trip = (raw >> 2) & 0x3; + } else { + dts->temp = 0; + dts->trip = 0; + } +} + +static void dts_keep_max(struct dts *temps, int n, struct dts *dts) +{ + int i; + + for (i = 0; i < n; i++) { + int16_t t = temps[i].temp; + + if (!temps[i].valid) + continue; + + if (t > dts->temp) + dts->temp = t; + + dts->valid++; + dts->trip |= temps[i].trip; + } +} + +/* Per core Digital Thermal Sensors */ +#define EX_THERM_DTS_RESULT0 0x10050000 +#define EX_THERM_DTS_RESULT1 0x10050001 + +/* Different sensor locations */ +#define P8_CT_ZONE_LSU 0 +#define P8_CT_ZONE_ISU 1 +#define P8_CT_ZONE_FXU 2 +#define P8_CT_ZONE_L3C 3 +#define P8_CT_ZONES 4 + +/* + * Returns the temperature as the max of all 4 zones and a global trip + * attribute. + */ +static int dts_read_core_temp_p8(uint32_t pir, struct dts *dts) +{ + int32_t chip_id = pir_to_chip_id(pir); + int32_t core = pir_to_core_id(pir); + uint64_t dts0, dts1; + struct dts temps[P8_CT_ZONES]; + int rc; + + rc = xscom_read(chip_id, XSCOM_ADDR_P8_EX(core, EX_THERM_DTS_RESULT0), + &dts0); + if (rc) + return rc; + + rc = xscom_read(chip_id, XSCOM_ADDR_P8_EX(core, EX_THERM_DTS_RESULT1), + &dts1); + if (rc) + return rc; + + dts_decode_one_dts(dts0 >> 48, &temps[P8_CT_ZONE_LSU]); + dts_decode_one_dts(dts0 >> 32, &temps[P8_CT_ZONE_ISU]); + dts_decode_one_dts(dts0 >> 16, &temps[P8_CT_ZONE_FXU]); + dts_decode_one_dts(dts1 >> 48, &temps[P8_CT_ZONE_L3C]); + + dts_keep_max(temps, P8_CT_ZONES, dts); + + prlog(PR_TRACE, "DTS: Chip %x Core %x temp:%dC trip:%x\n", + chip_id, core, dts->temp, dts->trip); + + /* + * FIXME: The trip bits are always set ?! Just discard + * them for the moment until we understand why. + */ + dts->trip = 0; + return 0; +} + +/* Per core Digital Thermal Sensors */ +#define EC_THERM_P9_DTS_RESULT0 0x050000 + +/* Different sensor locations */ +#define P9_CORE_DTS0 0 +#define P9_CORE_DTS1 1 +#define P9_CORE_ZONES 2 + +/* + * Returns the temperature as the max of all zones and a global trip + * attribute. 
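On P9 there are only two core DTS zones, both read from a single DTS_RESULT0 register.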
+ */ +static int dts_read_core_temp_p9(uint32_t pir, struct dts *dts) +{ + int32_t chip_id = pir_to_chip_id(pir); + int32_t core = pir_to_core_id(pir); + uint64_t dts0; + struct dts temps[P9_CORE_ZONES]; + int rc; + + rc = xscom_read(chip_id, XSCOM_ADDR_P9_EC(core, EC_THERM_P9_DTS_RESULT0), + &dts0); + if (rc) + return rc; + + dts_decode_one_dts(dts0 >> 48, &temps[P9_CORE_DTS0]); + dts_decode_one_dts(dts0 >> 32, &temps[P9_CORE_DTS1]); + + dts_keep_max(temps, P9_CORE_ZONES, dts); + + prlog(PR_TRACE, "DTS: Chip %x Core %x temp:%dC trip:%x\n", + chip_id, core, dts->temp, dts->trip); + + /* + * FIXME: The trip bits are always set ?! Just discard + * them for the moment until we understand why. + */ + dts->trip = 0; + return 0; +} + +static void dts_async_read_temp(struct timer *t __unused, void *data, + u64 now __unused) +{ + struct dts dts = {0}; + int rc, swkup_rc; + struct cpu_thread *cpu = data; + + swkup_rc = dctl_set_special_wakeup(cpu); + + if (proc_gen == proc_gen_p9) + rc = dts_read_core_temp_p9(cpu->pir, &dts); + else /* (proc_gen == proc_gen_p10) */ + rc = OPAL_UNSUPPORTED; /* XXX P10 */ + + if (!rc) { + if (cpu->sensor_attr == SENSOR_DTS_ATTR_TEMP_MAX) + *cpu->sensor_data = cpu_to_be64(dts.temp); + else if (cpu->sensor_attr == SENSOR_DTS_ATTR_TEMP_TRIP) + *cpu->sensor_data = cpu_to_be64(dts.trip); + } + + if (!swkup_rc) + dctl_clear_special_wakeup(cpu); + + check_sensor_read(cpu->token); + rc = opal_queue_msg(OPAL_MSG_ASYNC_COMP, NULL, NULL, + cpu_to_be64(cpu->token), + cpu_to_be64(rc)); + if (rc) + prerror("Failed to queue async message\n"); + + cpu->dts_read_in_progress = false; +} + +static int dts_read_core_temp(u32 pir, struct dts *dts, u8 attr, + int token, __be64 *sensor_data) +{ + struct cpu_thread *cpu; + int rc; + + switch (proc_gen) { + case proc_gen_p8: + rc = dts_read_core_temp_p8(pir, dts); + break; + case proc_gen_p9: /* Asynchronus read */ + cpu = find_cpu_by_pir(pir); + if (!cpu) + return OPAL_PARAMETER; + lock(&cpu->dts_lock); + if (cpu->dts_read_in_progress) { + unlock(&cpu->dts_lock); + return OPAL_BUSY; + } + cpu->dts_read_in_progress = true; + cpu->sensor_attr = attr; + cpu->sensor_data = sensor_data; + cpu->token = token; + schedule_timer(&cpu->dts_timer, 0); + rc = OPAL_ASYNC_COMPLETION; + unlock(&cpu->dts_lock); + break; + case proc_gen_p10: /* XXX P10 */ + default: + rc = OPAL_UNSUPPORTED; + } + return rc; +} + +/* Per memory controller Digital Thermal Sensors */ +#define THERM_MEM_DTS_RESULT0 0x2050000 + +/* Different sensor locations */ +#define P8_MEM_DTS0 0 +#define P8_MEM_DTS1 1 +#define P8_MEM_ZONES 2 + +static int dts_read_mem_temp(uint32_t chip_id, struct dts *dts) +{ + uint64_t dts0; + struct dts temps[P8_MEM_ZONES]; + int i; + int rc; + + rc = xscom_read(chip_id, THERM_MEM_DTS_RESULT0, &dts0); + if (rc) + return rc; + + dts_decode_one_dts(dts0 >> 48, &temps[P8_MEM_DTS0]); + dts_decode_one_dts(dts0 >> 32, &temps[P8_MEM_DTS1]); + + for (i = 0; i < P8_MEM_ZONES; i++) { + int16_t t = temps[i].temp; + + if (!temps[i].valid) + continue; + + /* keep the max temperature of all 4 sensors */ + if (t > dts->temp) + dts->temp = t; + + dts->valid++; + dts->trip |= temps[i].trip; + } + + prlog(PR_TRACE, "DTS: Chip %x temp:%dC trip:%x\n", + chip_id, dts->temp, dts->trip); + + /* + * FIXME: The trip bits are always set ?! Just discard + * them for the moment until we understand why. + */ + dts->trip = 0; + return 0; +} + +/* + * DTS sensor class ids. Only one for the moment: the core + * temperature. 
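The enum below also carries SENSOR_DTS_MEM_TEMP for Centaur memory buffer temperatures.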
+ */ +enum sensor_dts_class { + SENSOR_DTS_CORE_TEMP, + SENSOR_DTS_MEM_TEMP, + /* To be continued */ +}; + +/* + * Extract the centaur chip id which was truncated to fit in the + * resource identifier field of the sensor handler + */ +#define centaur_get_id(rid) (0x80000000 | ((rid) & 0x3ff)) + +int64_t dts_sensor_read(u32 sensor_hndl, int token, __be64 *sensor_data) +{ + uint8_t attr = sensor_get_attr(sensor_hndl); + uint32_t rid = sensor_get_rid(sensor_hndl); + struct dts dts = {0}; + int64_t rc; + + if (attr > SENSOR_DTS_ATTR_TEMP_TRIP) + return OPAL_PARAMETER; + + memset(&dts, 0, sizeof(struct dts)); + + switch (sensor_get_frc(sensor_hndl)) { + case SENSOR_DTS_CORE_TEMP: + rc = dts_read_core_temp(rid, &dts, attr, token, sensor_data); + break; + case SENSOR_DTS_MEM_TEMP: + rc = dts_read_mem_temp(centaur_get_id(rid), &dts); + break; + default: + rc = OPAL_PARAMETER; + break; + } + if (rc) + return rc; + + if (attr == SENSOR_DTS_ATTR_TEMP_MAX) + *sensor_data = cpu_to_be64(dts.temp); + else if (attr == SENSOR_DTS_ATTR_TEMP_TRIP) + *sensor_data = cpu_to_be64(dts.trip); + + return 0; +} + +/* + * We only have two bytes for the resource identifier in the sensor + * handler. Let's trunctate the centaur chip id to squeeze it in. + * + * Centaur chip IDs are using the XSCOM "partID" encoding described in + * xscom.h. recap: + * + * 0b1000.0000.0000.0000.0000.00NN.NCCC.MMMM + * N=Node, C=Chip, M=Memory Channel + */ +#define centaur_make_id(cen_id, dimm_id) \ + (((chip_id) & 0x3ff) | ((dimm_id) << 10)) + +#define core_handler(core_id, attr_id) \ + sensor_make_handler(SENSOR_DTS, SENSOR_DTS_CORE_TEMP, \ + core_id, attr_id) + +#define cen_handler(cen_id, attr_id) \ + sensor_make_handler(SENSOR_DTS, SENSOR_DTS_MEM_TEMP, \ + centaur_make_id(chip_id, 0), attr_id) + +bool dts_sensor_create_nodes(struct dt_node *sensors) +{ + struct proc_chip *chip; + struct dt_node *cn; + char name[64]; + + /* build the device tree nodes : + * + * sensors/core-temp@pir + * + * The core is identified by its PIR, is stored in the resource + * number of the sensor handler. 
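e.g. a core with PIR 0x20 is exposed as core-temp@20, with the TEMP_MAX handler in "sensor-data" and the TEMP_TRIP handler in "sensor-status".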
+ */ + for_each_chip(chip) { + struct cpu_thread *c; + + for_each_available_core_in_chip(c, chip->id) { + struct dt_node *node; + uint32_t handler; + + snprintf(name, sizeof(name), "core-temp@%x", c->pir); + + handler = core_handler(c->pir, SENSOR_DTS_ATTR_TEMP_MAX); + node = dt_new(sensors, name); + dt_add_property_string(node, "compatible", + "ibm,opal-sensor"); + dt_add_property_cells(node, "sensor-data", handler); + handler = core_handler(c->pir, SENSOR_DTS_ATTR_TEMP_TRIP); + dt_add_property_cells(node, "sensor-status", handler); + dt_add_property_string(node, "sensor-type", "temp"); + dt_add_property_cells(node, "ibm,pir", c->pir); + dt_add_property_cells(node, "reg", handler); + dt_add_property_string(node, "label", "Core"); + init_timer(&c->dts_timer, dts_async_read_temp, c); + c->dts_read_in_progress = false; + } + } + + /* + * sensors/mem-temp@chip for Centaurs + */ + dt_for_each_compatible(dt_root, cn, "ibm,centaur") { + uint32_t chip_id; + struct dt_node *node; + uint32_t handler; + + chip_id = dt_prop_get_u32(cn, "ibm,chip-id"); + + snprintf(name, sizeof(name), "mem-temp@%x", chip_id); + + handler = cen_handler(chip_id, SENSOR_DTS_ATTR_TEMP_MAX); + node = dt_new(sensors, name); + dt_add_property_string(node, "compatible", + "ibm,opal-sensor"); + dt_add_property_cells(node, "sensor-data", handler); + + handler = cen_handler(chip_id, SENSOR_DTS_ATTR_TEMP_TRIP); + dt_add_property_cells(node, "sensor-status", handler); + dt_add_property_string(node, "sensor-type", "temp"); + dt_add_property_cells(node, "ibm,chip-id", chip_id); + dt_add_property_cells(node, "reg", handler); + dt_add_property_string(node, "label", "Centaur"); + } + + return true; +} diff --git a/roms/skiboot/hw/fake-nvram.c b/roms/skiboot/hw/fake-nvram.c new file mode 100644 index 000000000..44adde4a3 --- /dev/null +++ b/roms/skiboot/hw/fake-nvram.c @@ -0,0 +1,49 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* Copyright 2013-2017 IBM Corp. */ + +#include <skiboot.h> +#include <opal.h> +#include <mem_region.h> +#include <lock.h> + +static struct mem_region *nvram_region; +static struct lock fake_nvram_lock = LOCK_UNLOCKED; + +int fake_nvram_info(uint32_t *total_size) +{ + nvram_region = find_mem_region("ibm,fake-nvram"); + + if (!nvram_region) + return OPAL_HARDWARE; + + *total_size = nvram_region->len; + + return OPAL_SUCCESS; +} + +int fake_nvram_start_read(void *dst, uint32_t src, uint32_t len) +{ + if (!nvram_region) + return -ENODEV; + + lock(&fake_nvram_lock); + memcpy(dst, (void *) (nvram_region->start + src), len); + unlock(&fake_nvram_lock); + + nvram_read_complete(true); + + return 0; +} + +int fake_nvram_write(uint32_t offset, void *src, uint32_t size) +{ + if (!nvram_region) + return OPAL_HARDWARE; + + lock(&fake_nvram_lock); + memcpy((void *) (nvram_region->start + offset), src, size); + unlock(&fake_nvram_lock); + + return 0; +} + diff --git a/roms/skiboot/hw/fake-rtc.c b/roms/skiboot/hw/fake-rtc.c new file mode 100644 index 000000000..3f083050c --- /dev/null +++ b/roms/skiboot/hw/fake-rtc.c @@ -0,0 +1,100 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* Copyright 2013-2017 IBM Corp. */ + +#include <skiboot.h> +#include <opal.h> +#include <mem_region.h> +#include <device.h> +#include <timebase.h> +#include <time-utils.h> +#include <lock.h> + +/* timebase when tm_offset was assigned */ +static unsigned long tb_synctime; + +/* + * Absolute time that was last assigned. + * Current rtc value is calculated from this. 
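The value read back is tm_offset plus the timebase seconds elapsed since tb_synctime (see fake_rtc_read()).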
+*/ +static struct tm tm_offset; + +/* protects tm_offset & tb_synctime */ +static struct lock emulation_lock; + +static int64_t fake_rtc_write(uint32_t ymd, uint64_t hmsm) +{ + + lock(&emulation_lock); + + datetime_to_tm(ymd, hmsm, &tm_offset); + tb_synctime = mftb(); + + unlock(&emulation_lock); + + return OPAL_SUCCESS; +} + +static int64_t fake_rtc_read(__be32 *__ymd, __be64 *__hmsm) +{ + + time_t sec; + struct tm tm_calculated; + uint32_t ymd; + uint64_t hmsm; + + if (!__ymd || !__hmsm) + return OPAL_PARAMETER; + + /* Compute the emulated clock value */ + lock(&emulation_lock); + + sec = tb_to_secs(mftb() - tb_synctime) + mktime(&tm_offset); + gmtime_r(&sec, &tm_calculated); + tm_to_datetime(&tm_calculated, &ymd, &hmsm); + + unlock(&emulation_lock); + + *__ymd = cpu_to_be32(ymd); + *__hmsm = cpu_to_be64(hmsm); + + return OPAL_SUCCESS; +} + +void fake_rtc_init(void) +{ + struct mem_region *rtc_region = NULL; + uint32_t *rtc = NULL, *fake_ymd; + uint64_t *fake_hmsm; + struct dt_node *np; + + /* Read initial values from reserved memory */ + rtc_region = find_mem_region("ibm,fake-rtc"); + + /* Should we register anyway? */ + if (!rtc_region) { + prlog(PR_TRACE, "No initial RTC value found\n"); + return; + } + + init_lock(&emulation_lock); + + /* Fetch the initial rtc values */ + rtc = (uint32_t *) rtc_region->start; + + fake_ymd = rtc; + fake_hmsm = ((uint64_t *) &rtc[1]); + + fake_rtc_write(*fake_ymd, *fake_hmsm); + + /* Register opal calls */ + opal_register(OPAL_RTC_READ, fake_rtc_read, 2); + opal_register(OPAL_RTC_WRITE, fake_rtc_write, 2); + + /* add the fake rtc dt node */ + np = dt_new(opal_node, "rtc"); + dt_add_property_strings(np, "compatible", "ibm,opal-rtc"); + + prlog(PR_TRACE, "Init fake RTC to Date:%d-%d-%d Time:%d-%d-%d\n", + tm_offset.tm_mon, tm_offset.tm_mday, tm_offset.tm_year, + tm_offset.tm_hour, tm_offset.tm_min, tm_offset.tm_sec); +} diff --git a/roms/skiboot/hw/fsi-master.c b/roms/skiboot/hw/fsi-master.c new file mode 100644 index 000000000..410542a19 --- /dev/null +++ b/roms/skiboot/hw/fsi-master.c @@ -0,0 +1,675 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* Copyright 2013-2017 IBM Corp. */ + +#include <skiboot.h> +#include <xscom.h> +#include <lock.h> +#include <timebase.h> +#include <chip.h> +#include <fsi-master.h> + +/* + * FSI Masters sit on OPB busses behind PIB2OPB bridges + * + * There are two cMFSI behind two different bridges at + * different XSCOM addresses. 
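(0x20000 for the bridge carrying cMFSI0 and hMFSI0, 0x30000 for cMFSI1.)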
For now we don't have them in + * the device-tree so we hard code the address + */ +#define PIB2OPB_MFSI0_ADDR 0x20000 +#define PIB2OPB_MFSI1_ADDR 0x30000 + +/* + * Bridge registers on XSCOM that allow generatoin + * of OPB cycles + */ +#define PIB2OPB_REG_CMD 0x0 +#define OPB_CMD_WRITE 0x80000000 +#define OPB_CMD_READ 0x00000000 +#define OPB_CMD_8BIT 0x00000000 +#define OPB_CMD_16BIT 0x20000000 +#define OPB_CMD_32BIT 0x60000000 +#define PIB2OPB_REG_STAT 0x1 +#define OPB_STAT_ANY_ERR 0x80000000 +#define OPB_STAT_ERR_OPB 0x7FEC0000 +#define OPB_STAT_ERRACK 0x00100000 +#define OPB_STAT_BUSY 0x00010000 +#define OPB_STAT_READ_VALID 0x00020000 +#define OPB_STAT_ERR_CMFSI 0x0000FC00 +#define OPB_STAT_ERR_HMFSI 0x000000FC +#define OPB_STAT_ERR_BASE (OPB_STAT_ANY_ERR | \ + OPB_STAT_ERR_OPB | \ + OPB_STAT_ERRACK) +#define PIB2OPB_REG_LSTAT 0x2 +#define PIB2OPB_REG_RESET 0x4 +#define PIB2OPB_REG_cRSIC 0x5 +#define PIB2OPB_REG_cRSIM 0x6 +#define PIB2OPB_REG_cRSIS 0x7 +#define PIB2OPB_REG_hRSIC 0x8 +#define PIB2OPB_REG_hRSIM 0x9 +#define PIB2OPB_REG_hRSIS 0xA + +/* Low level errors from OPB contain the status in the bottom 32-bit + * and one of these in the top 32-bit + */ +#define OPB_ERR_XSCOM_ERR 0x100000000ull +#define OPB_ERR_TIMEOUT_ERR 0x200000000ull +#define OPB_ERR_BAD_OPB_ADDR 0x400000000ull + +/* + * PIB2OPB 0 has 2 MFSIs, cMFSI and hMFSI, PIB2OPB 1 only + * has cMFSI + */ +#define cMFSI_OPB_PORTS_BASE 0x40000 +#define cMFSI_OPB_REG_BASE 0x03000 +#define hMFSI_OPB_PORTS_BASE 0x80000 +#define hMFSI_OPB_REG_BASE 0x03400 +#define MFSI_OPB_PORT_STRIDE 0x08000 + +/* MFSI control registers */ +#define MFSI_REG_MSTAP(__n) (0x0D0 + (__n) * 4) +#define MFSI_REG_MATRB0 0x1D8 +#define MFSI_REG_MDTRB0 0x1DC +#define MFSI_REG_MESRB0 0x1D0 +#define MFSI_REG_MAESP0 0x050 +#define MFSI_REG_MAEB 0x070 +#define MFSI_REG_MSCSB0 0x1D4 + +/* FSI Slave registers */ +#define FSI_SLAVE_REGS 0x000800 /**< FSI Slave Register */ +#define FSI_SMODE (FSI_SLAVE_REGS | 0x00) +#define FSI_SLBUS (FSI_SLAVE_REGS | 0x30) +#define FSI_SLRES (FSI_SLAVE_REGS | 0x34) + +#define FSI2PIB_ENGINE 0x001000 /**< FSI2PIB Engine (SCOM) */ +#define FSI2PIB_RESET (FSI2PIB_ENGINE | 0x18) +#define FSI2PIB_STATUS (FSI2PIB_ENGINE | 0x1C) +#define FSI2PIB_COMPMASK (FSI2PIB_ENGINE | 0x30) +#define FSI2PIB_TRUEMASK (FSI2PIB_ENGINE | 0x34) + +struct mfsi { + uint32_t chip_id; + uint32_t unit; + uint32_t xscom_base; + uint32_t ports_base; + uint32_t reg_base; + uint32_t err_bits; +}; + +#define mfsi_log(__lev, __m, __fmt, ...) \ + prlog(__lev, "MFSI %x:%x: " __fmt, __m->chip_id, __m->unit, ##__VA_ARGS__) +/* + * Use a global FSI lock for now. Beware of re-entrancy + * if we ever add support for normal chip XSCOM via FSI, in + * which case we'll probably have to consider either per chip + * lock (which can have AB->BA deadlock issues) or a re-entrant + * global lock or something else. ... + */ +static struct lock fsi_lock = LOCK_UNLOCKED; + +/* + * OPB accessors + */ + +/* We try up to 1.2ms for an OPB access */ +#define MFSI_OPB_MAX_TRIES 1200 + +static uint64_t mfsi_opb_poll(struct mfsi *mfsi, uint32_t *read_data) +{ + unsigned long retries = MFSI_OPB_MAX_TRIES; + uint64_t sval; + uint32_t stat; + int64_t rc; + + /* We try again every 10us for a bit more than 1ms */ + for (;;) { + /* Read OPB status register */ + rc = xscom_read(mfsi->chip_id, mfsi->xscom_base + PIB2OPB_REG_STAT, &sval); + if (rc) { + /* Do something here ? 
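For now the failure is only logged and reported to the caller as OPB_ERR_XSCOM_ERR.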
*/ + mfsi_log(PR_ERR, mfsi, "XSCOM error %lld read OPB STAT\n", rc); + return OPB_ERR_XSCOM_ERR; + } + mfsi_log(PR_INSANE, mfsi, " STAT=0x%16llx...\n", sval); + + stat = sval >> 32; + + /* Complete */ + if (!(stat & OPB_STAT_BUSY)) + break; + if (retries-- == 0) { + /* This isn't supposed to happen (HW timeout) */ + mfsi_log(PR_ERR, mfsi, "OPB POLL timeout !\n"); + return OPB_ERR_TIMEOUT_ERR | (stat & mfsi->err_bits); + } + time_wait_us(1); + } + + /* Did we have an error ? */ + if (stat & mfsi->err_bits) + return stat & mfsi->err_bits; + + if (read_data) { + if (!(stat & OPB_STAT_READ_VALID)) { + mfsi_log(PR_ERR, mfsi, "Read successful but no data !\n"); + + /* What do do here ? can it actually happen ? */ + sval = 0xffffffff; + } + *read_data = sval & 0xffffffff; + } + + return 0; +} + +static uint64_t mfsi_opb_read(struct mfsi *mfsi, uint32_t opb_addr, uint32_t *data) +{ + uint64_t opb_cmd = OPB_CMD_READ | OPB_CMD_32BIT; + int64_t rc; + + if (opb_addr > 0x00ffffff) + return OPB_ERR_BAD_OPB_ADDR; + + opb_cmd |= opb_addr; + opb_cmd <<= 32; + + mfsi_log(PR_INSANE, mfsi, "MFSI_OPB_READ: Writing 0x%16llx to XSCOM %x\n", + opb_cmd, mfsi->xscom_base); + + rc = xscom_write(mfsi->chip_id, mfsi->xscom_base + PIB2OPB_REG_CMD, opb_cmd); + if (rc) { + mfsi_log(PR_ERR, mfsi, "XSCOM error %lld writing OPB CMD\n", rc); + return OPB_ERR_XSCOM_ERR; + } + return mfsi_opb_poll(mfsi, data); +} + +static uint64_t mfsi_opb_write(struct mfsi *mfsi, uint32_t opb_addr, uint32_t data) +{ + uint64_t opb_cmd = OPB_CMD_WRITE | OPB_CMD_32BIT; + int64_t rc; + + if (opb_addr > 0x00ffffff) + return OPB_ERR_BAD_OPB_ADDR; + + opb_cmd |= opb_addr; + opb_cmd <<= 32; + opb_cmd |= data; + + mfsi_log(PR_INSANE, mfsi, "MFSI_OPB_WRITE: Writing 0x%16llx to XSCOM %x\n", + opb_cmd, mfsi->xscom_base); + + rc = xscom_write(mfsi->chip_id, mfsi->xscom_base + PIB2OPB_REG_CMD, opb_cmd); + if (rc) { + mfsi_log(PR_ERR, mfsi, "XSCOM error %lld writing OPB CMD\n", rc); + return OPB_ERR_XSCOM_ERR; + } + return mfsi_opb_poll(mfsi, NULL); +} + +static struct mfsi *mfsi_get(uint32_t chip_id, uint32_t unit) +{ + struct proc_chip *chip = get_chip(chip_id); + struct mfsi *mfsi; + + if (!chip || unit > MFSI_hMFSI0) + return NULL; + mfsi = &chip->fsi_masters[unit]; + if (mfsi->xscom_base == 0) + return NULL; + return mfsi; +} + +static int64_t mfsi_reset_pib2opb(struct mfsi *mfsi) +{ + uint64_t stat; + int64_t rc; + + rc = xscom_write(mfsi->chip_id, + mfsi->xscom_base + PIB2OPB_REG_RESET, (1ul << 63)); + if (rc) { + mfsi_log(PR_ERR, mfsi, "XSCOM error %lld resetting PIB2OPB\n", rc); + return rc; + } + rc = xscom_write(mfsi->chip_id, + mfsi->xscom_base + PIB2OPB_REG_STAT, (1ul << 63)); + if (rc) { + mfsi_log(PR_ERR, mfsi, "XSCOM error %lld resetting status\n", rc); + return rc; + } + rc = xscom_read(mfsi->chip_id, + mfsi->xscom_base + PIB2OPB_REG_STAT, &stat); + if (rc) { + mfsi_log(PR_ERR, mfsi, "XSCOM error %lld reading status\n", rc); + return rc; + } + return 0; +} + + +static void mfsi_dump_pib2opb_state(struct mfsi *mfsi) +{ + uint64_t val; + + /* Dump a bunch of registers */ + if (xscom_read(mfsi->chip_id, mfsi->xscom_base + PIB2OPB_REG_CMD, &val)) + goto xscom_error; + mfsi_log(PR_ERR, mfsi, " PIB2OPB CMD = %016llx\n", val); + if (xscom_read(mfsi->chip_id, mfsi->xscom_base + PIB2OPB_REG_STAT, &val)) + goto xscom_error; + mfsi_log(PR_ERR, mfsi, " PIB2OPB STAT = %016llx\n", val); + if (xscom_read(mfsi->chip_id, mfsi->xscom_base + PIB2OPB_REG_LSTAT, &val)) + goto xscom_error; + mfsi_log(PR_ERR, mfsi, " PIB2OPB LSTAT = %016llx\n", val); + + if 
(mfsi->unit == MFSI_cMFSI0 || mfsi->unit == MFSI_cMFSI1) { + if (xscom_read(mfsi->chip_id, mfsi->xscom_base + PIB2OPB_REG_cRSIC, &val)) + goto xscom_error; + mfsi_log(PR_ERR, mfsi, " PIB2OPB cRSIC = %016llx\n", val); + if (xscom_read(mfsi->chip_id, mfsi->xscom_base + PIB2OPB_REG_cRSIM, &val)) + goto xscom_error; + mfsi_log(PR_ERR, mfsi, " PIB2OPB cRSIM = %016llx\n", val); + if (xscom_read(mfsi->chip_id, mfsi->xscom_base + PIB2OPB_REG_cRSIS, &val)) + goto xscom_error; + mfsi_log(PR_ERR, mfsi, " PIB2OPB cRSIS = %016llx\n", val); + } else if (mfsi->unit == MFSI_hMFSI0) { + if (xscom_read(mfsi->chip_id, mfsi->xscom_base + PIB2OPB_REG_hRSIC, &val)) + goto xscom_error; + mfsi_log(PR_ERR, mfsi, " PIB2OPB hRSIC = %016llx\n", val); + if (xscom_read(mfsi->chip_id, mfsi->xscom_base + PIB2OPB_REG_hRSIM, &val)) + goto xscom_error; + mfsi_log(PR_ERR, mfsi, " PIB2OPB hRSIM = %016llx\n", val); + if (xscom_read(mfsi->chip_id, mfsi->xscom_base + PIB2OPB_REG_hRSIS, &val)) + goto xscom_error; + mfsi_log(PR_ERR, mfsi, " PIB2OPB hRSIS = %016llx\n", val); + } + return; + xscom_error: + mfsi_log(PR_ERR, mfsi, "XSCOM error reading PIB2OPB registers\n"); +} + +static int64_t mfsi_dump_ctrl_regs(struct mfsi *mfsi) +{ + uint64_t opb_stat; + uint32_t i; + + /* List of registers to dump (from HB) */ + static uint32_t dump_regs[] = { + MFSI_REG_MATRB0, + MFSI_REG_MDTRB0, + MFSI_REG_MESRB0, + MFSI_REG_MAESP0, + MFSI_REG_MAEB, + MFSI_REG_MSCSB0, + }; + static const char *dump_regs_names[] = { + "MFSI_REG_MATRB0", + "MFSI_REG_MDTRB0", + "MFSI_REG_MESRB0", + "MFSI_REG_MAESP0", + "MFSI_REG_MAEB ", + "MFSI_REG_MSCSB0", + }; + for (i = 0; i < ARRAY_SIZE(dump_regs); i++) { + uint32_t val; + + opb_stat = mfsi_opb_read(mfsi, mfsi->reg_base + dump_regs[i], &val); + if (opb_stat) { + /* Error on dump, give up */ + mfsi_log(PR_ERR, mfsi, " OPB stat 0x%016llx dumping reg %x\n", + opb_stat, dump_regs[i]); + return OPAL_HARDWARE; + } + mfsi_log(PR_ERR, mfsi, " %s = %08x\n", dump_regs_names[i], val); + } + for (i = 0; i < 8; i++) { + uint32_t val; + + opb_stat = mfsi_opb_read(mfsi, mfsi->reg_base + MFSI_REG_MSTAP(i), &val); + if (opb_stat) { + /* Error on dump, give up */ + mfsi_log(PR_ERR, mfsi, " OPB stat 0x%016llx dumping reg %x\n", + opb_stat, MFSI_REG_MSTAP(i)); + return OPAL_HARDWARE; + } + mfsi_log(PR_ERR, mfsi, " MFSI_REG_MSTAP%d = %08x\n", i, val); + } + return OPAL_SUCCESS; +} + +static int64_t mfsi_master_cleanup(struct mfsi *mfsi, uint32_t port) +{ + uint64_t opb_stat; + uint32_t port_base, compmask, truemask; + + /* Reset the bridge to clear up the residual errors */ + + /* bit0 = Bridge: General reset */ + opb_stat = mfsi_opb_write(mfsi, mfsi->reg_base + MFSI_REG_MESRB0, 0x80000000u); + if (opb_stat) { + mfsi_log(PR_ERR, mfsi, " OPB stat 0x%016llx writing reset to MESRB0\n", + opb_stat); + return OPAL_HARDWARE; + } + + /* Calculate base address of port */ + port_base = mfsi->ports_base + port * MFSI_OPB_PORT_STRIDE; + + /* Perform error reset on Centaur fsi slave: */ + /* write 0x4000000 to addr=834 */ + opb_stat = mfsi_opb_write(mfsi, port_base + FSI_SLRES, 0x04000000); + if (opb_stat) { + mfsi_log(PR_ERR, mfsi, + " OPB stat 0x%016llx writing reset to FSI slave\n", + opb_stat); + return OPAL_HARDWARE; + } + + /* Further step is to issue a PIB reset to the FSI2PIB engine + * in busy state, i.e. write arbitrary data to 101c + * (putcfam 1007) register of the previously failed FSI2PIB + * engine on Centaur. + * + * XXX BenH: Should that be done by the upper FSI XSCOM layer ? 
+ */ + opb_stat = mfsi_opb_write(mfsi, port_base + FSI2PIB_STATUS, 0xFFFFFFFF); + if (opb_stat) { + mfsi_log(PR_ERR, mfsi, + " OPB stat 0x%016llx clearing FSI2PIB_STATUS\n", + opb_stat); + return OPAL_HARDWARE; + } + + /* Need to save/restore the true/comp masks or the FSP (PRD ?) will + * get annoyed + */ + opb_stat = mfsi_opb_read(mfsi, port_base + FSI2PIB_COMPMASK, &compmask); + if (opb_stat) { + mfsi_log(PR_ERR, mfsi, + " OPB stat 0x%016llx reading FSI2PIB_COMPMASK\n", + opb_stat); + return OPAL_HARDWARE; + } + opb_stat = mfsi_opb_read(mfsi, port_base + FSI2PIB_TRUEMASK, &truemask); + if (opb_stat) { + mfsi_log(PR_ERR, mfsi, + " OPB stat 0x%016llx reading FSI2PIB_TRUEMASK\n", + opb_stat); + return OPAL_HARDWARE; + } + + /* Then, write arbitrary data to 1018 (putcfam 1006) to + * reset any pending FSI2PIB errors. + */ + opb_stat = mfsi_opb_write(mfsi, port_base + FSI2PIB_RESET, 0xFFFFFFFF); + if (opb_stat) { + mfsi_log(PR_ERR, mfsi, + " OPB stat 0x%016llx writing FSI2PIB_RESET\n", + opb_stat); + return OPAL_HARDWARE; + } + + /* Restore the true/comp masks */ + opb_stat = mfsi_opb_write(mfsi, port_base + FSI2PIB_COMPMASK, compmask); + if (opb_stat) { + mfsi_log(PR_ERR, mfsi, + " OPB stat 0x%016llx writing FSI2PIB_COMPMASK\n", + opb_stat); + return OPAL_HARDWARE; + } + opb_stat = mfsi_opb_write(mfsi, port_base + FSI2PIB_TRUEMASK, truemask); + if (opb_stat) { + mfsi_log(PR_ERR, mfsi, + " OPB stat 0x%016llx writing FSI2PIB_TRUEMASK\n", + opb_stat); + return OPAL_HARDWARE; + } + return OPAL_SUCCESS; +} + +static int64_t mfsi_analyse_fsi_error(struct mfsi *mfsi) +{ + uint64_t opb_stat; + uint32_t mesrb0; + + /* Most of the code below is adapted from HB. The main difference is + * that we don't gard + */ + + /* Read MESRB0 */ + opb_stat = mfsi_opb_read(mfsi, mfsi->reg_base + MFSI_REG_MESRB0, &mesrb0); + if (opb_stat) { + mfsi_log(PR_ERR, mfsi, " OPB stat 0x%016llx reading MESRB0\n", opb_stat); + return OPAL_HARDWARE; + } + mfsi_log(PR_ERR, mfsi, " MESRB0=%08x\n", mesrb0); + + /* bits 8:15 are internal parity errors in the master */ + if (mesrb0 & 0x00FF0000) { + mfsi_log(PR_ERR, mfsi, " Master parity error !\n"); + } else { + /* bits 0:3 are a specific error code */ + switch ((mesrb0 & 0xF0000000) >> 28) { + case 0x1: /* OPB error */ + case 0x2: /* Invalid state of OPB state machine */ + /* error is inside the OPB logic */ + mfsi_log(PR_ERR, mfsi, " OPB logic error !\n"); + break; + case 0x3: /* Port access error */ + /* probably some kind of code collision */ + /* could also be something weird in the chip */ + mfsi_log(PR_ERR, mfsi, " Port access error !\n"); + break; + case 0x4: /* ID mismatch */ + mfsi_log(PR_ERR, mfsi, " Port ID mismatch !\n"); + break; + case 0x6: /* port timeout error */ + mfsi_log(PR_ERR, mfsi, " Port timeout !\n"); + break; + case 0x7: /* master timeout error */ + mfsi_log(PR_ERR, mfsi, " Master timeout !\n"); + break; + case 0x9: /* Any error response from Slave */ + mfsi_log(PR_ERR, mfsi, " Slave error response !\n"); + break; + case 0xC: /* bridge parity error */ + mfsi_log(PR_ERR, mfsi, " Bridge parity error !\n"); + break; + case 0xB: /* protocol error */ + mfsi_log(PR_ERR, mfsi, " Protocol error !\n"); + break; + case 0x8: /* master CRC error */ + mfsi_log(PR_ERR, mfsi, " Master CRC error !\n"); + break; + case 0xA: /* Slave CRC error */ + mfsi_log(PR_ERR, mfsi, " Slave CRC error !\n"); + break; + default: + mfsi_log(PR_ERR, mfsi, " Unknown error !\n"); + break; + } + } + return OPAL_SUCCESS; +} + +static int64_t mfsi_handle_error(struct mfsi *mfsi, uint32_t 
port, + uint64_t opb_stat, uint32_t fsi_addr) +{ + int rc; + bool found_root_cause = false; + + mfsi_log(PR_ERR, mfsi, "Access error on port %d, stat=%012llx\n", + port, opb_stat); + + /* First handle stat codes we synthetized */ + if (opb_stat & OPB_ERR_XSCOM_ERR) + return OPAL_HARDWARE; + if (opb_stat & OPB_ERR_BAD_OPB_ADDR) + return OPAL_PARAMETER; + + /* Dump a bunch of regisers from PIB2OPB and reset it */ + mfsi_dump_pib2opb_state(mfsi); + + /* Reset PIB2OPB */ + mfsi_reset_pib2opb(mfsi); + + /* This one is not supposed to happen but ... */ + if (opb_stat & OPB_ERR_TIMEOUT_ERR) + return OPAL_HARDWARE; + + /* Dump some FSI control registers */ + rc = mfsi_dump_ctrl_regs(mfsi); + + /* If that failed, reset PIB2OPB again and return */ + if (rc) { + mfsi_dump_pib2opb_state(mfsi); + mfsi_reset_pib2opb(mfsi); + return OPAL_HARDWARE; + } + + /* Now check for known root causes (from HB) */ + + /* First check if it's a ctrl register access error and we got an OPB NACK, + * which means an out of bounds control reg + */ + if ((opb_stat & OPB_STAT_ERRACK) && + ((fsi_addr & ~0x2ffu) == mfsi->reg_base)) { + mfsi_log(PR_ERR, mfsi, " Error appears to be out of bounds reg %08x\n", + fsi_addr); + found_root_cause = true; + } + /* Else check for other OPB errors */ + else if (opb_stat & OPB_STAT_ERR_OPB) { + mfsi_log(PR_ERR, mfsi, " Error appears to be an OPB error\n"); + found_root_cause = true; + } + + /* Root cause not found, dig into FSI logic */ + if (!found_root_cause) { + rc = mfsi_analyse_fsi_error(mfsi); + if (!rc) { + /* If that failed too, reset the PIB2OPB again */ + mfsi_reset_pib2opb(mfsi); + } + } + + /* Cleanup MFSI master */ + mfsi_master_cleanup(mfsi, port); + + return OPAL_HARDWARE; +} + +int64_t mfsi_read(uint32_t chip, uint32_t unit, uint32_t port, + uint32_t fsi_addr, uint32_t *data) +{ + struct mfsi *mfsi = mfsi_get(chip, unit); + uint32_t port_addr; + uint64_t opb_stat; + int64_t rc = OPAL_SUCCESS; + + if (!mfsi || port > 7) + return OPAL_PARAMETER; + + lock(&fsi_lock); + + /* Calculate port address */ + port_addr = mfsi->ports_base + port * MFSI_OPB_PORT_STRIDE; + port_addr += fsi_addr; + + /* Perform OPB access */ + opb_stat = mfsi_opb_read(mfsi, port_addr, data); + if (opb_stat) + rc = mfsi_handle_error(mfsi, port, opb_stat, port_addr); + + unlock(&fsi_lock); + + return rc; +} + +int64_t mfsi_write(uint32_t chip, uint32_t unit, uint32_t port, + uint32_t fsi_addr, uint32_t data) +{ + struct mfsi *mfsi = mfsi_get(chip, unit); + uint32_t port_addr; + uint64_t opb_stat; + int64_t rc = OPAL_SUCCESS; + + if (!mfsi || port > 7) + return OPAL_PARAMETER; + + lock(&fsi_lock); + + /* Calculate port address */ + port_addr = mfsi->ports_base + port * MFSI_OPB_PORT_STRIDE; + port_addr += fsi_addr; + + /* Perform OPB access */ + opb_stat = mfsi_opb_write(mfsi, port_addr, data); + if (opb_stat) + rc = mfsi_handle_error(mfsi, port, opb_stat, port_addr); + + unlock(&fsi_lock); + + return rc; +} + +static void mfsi_add(struct proc_chip *chip, struct mfsi *mfsi, uint32_t unit) +{ + mfsi->chip_id = chip->id; + mfsi->unit = unit; + + /* We hard code everything for now */ + switch (unit) { + case MFSI_cMFSI0: + mfsi->xscom_base = PIB2OPB_MFSI0_ADDR; + mfsi->ports_base = cMFSI_OPB_PORTS_BASE; + mfsi->reg_base = cMFSI_OPB_REG_BASE; + mfsi->err_bits = OPB_STAT_ERR_BASE | OPB_STAT_ERR_CMFSI; + break; + case MFSI_cMFSI1: + mfsi->xscom_base = PIB2OPB_MFSI1_ADDR; + mfsi->ports_base = cMFSI_OPB_PORTS_BASE; + mfsi->reg_base = cMFSI_OPB_REG_BASE; + mfsi->err_bits = OPB_STAT_ERR_BASE | OPB_STAT_ERR_CMFSI; + 
break; + case MFSI_hMFSI0: + mfsi->xscom_base = PIB2OPB_MFSI0_ADDR; + mfsi->ports_base = hMFSI_OPB_PORTS_BASE; + mfsi->reg_base = hMFSI_OPB_REG_BASE; + mfsi->err_bits = OPB_STAT_ERR_BASE | OPB_STAT_ERR_HMFSI; + break; + default: + /* ??? */ + return; + } + + /* Hardware Bug HW222712 on Murano DD1.0 causes the + * any_error bit to be un-clearable so we just + * have to ignore it. Additionally, HostBoot applies + * this to Venice too, though the comment there claims + * this is a Simics workaround. + * + * The doc says that bit can be safely ignored, so let's + * just not bother and always take it out. + */ + + /* 16: cMFSI any-master-error */ + /* 24: hMFSI any-master-error */ + mfsi->err_bits &= 0xFFFF7F7F; + + mfsi_log(PR_INFO, mfsi, "Initialized\n"); +} + +void mfsi_init(void) +{ + struct proc_chip *chip; + + for_each_chip(chip) { + chip->fsi_masters = zalloc(sizeof(struct mfsi) * 3); + assert(chip->fsi_masters); + mfsi_add(chip, &chip->fsi_masters[MFSI_cMFSI0], MFSI_cMFSI0); + mfsi_add(chip, &chip->fsi_masters[MFSI_hMFSI0], MFSI_hMFSI0); + mfsi_add(chip, &chip->fsi_masters[MFSI_cMFSI1], MFSI_cMFSI1); + + } +} + diff --git a/roms/skiboot/hw/fsp/Makefile.inc b/roms/skiboot/hw/fsp/Makefile.inc new file mode 100644 index 000000000..21dc52a9f --- /dev/null +++ b/roms/skiboot/hw/fsp/Makefile.inc @@ -0,0 +1,13 @@ +SUBDIRS += hw/fsp + +FSP_OBJS = fsp.o fsp-console.o fsp-rtc.o fsp-nvram.o fsp-sysparam.o +FSP_OBJS += fsp-surveillance.o fsp-codeupdate.o fsp-sensor.o +FSP_OBJS += fsp-diag.o fsp-leds.o fsp-mem-err.o fsp-op-panel.o +FSP_OBJS += fsp-elog-read.o fsp-elog-write.o fsp-epow.o fsp-dpo.o +FSP_OBJS += fsp-dump.o fsp-sysdump.o fsp-chiptod.o fsp-ipmi.o +FSP_OBJS += fsp-attn.o fsp-occ.o fsp-psi.o +FSP = hw/fsp/built-in.a + +ifeq ($(CONFIG_FSP),1) +$(FSP): $(FSP_OBJS:%=hw/fsp/%) +endif diff --git a/roms/skiboot/hw/fsp/fsp-attn.c b/roms/skiboot/hw/fsp/fsp-attn.c new file mode 100644 index 000000000..6e358e0d4 --- /dev/null +++ b/roms/skiboot/hw/fsp/fsp-attn.c @@ -0,0 +1,143 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* + * FSP ATTentioN support + * + * FSP can grab a bunch of things on host firmware dying, + * let's set that up. + * + * Copyright 2013-2019 IBM Corp. +*/ +#include <fsp.h> +#include <skiboot.h> +#include <fsp-elog.h> +#include <fsp-attn.h> +#include <hdata/spira.h> +#include <stack.h> +#include <processor.h> +#include <opal-dump.h> + +#define TI_CMD_VALID 0x1 /* Command valid */ +#define TI_CMD 0xA1 /* Terminate Immediate command */ +#define TI_DATA_LEN 0x0400 /* Data length */ +/* Controls dump actions + * - Non-destructive hardware dump (bit 0) + * - memory dump (bit 1) + * - Destructive hardware dump (bit 2) + */ +#define TI_DMP_CTL 0x6 +/* Dump type + * 0 - Abbreviated hardware dump + * 1 - Complete hardware dump + * 2 - No hardware dump + */ +#define TI_DUMP_TYPE 0x1 +#define TI_FORMAT 0x02 /* SRC format */ +#define TI_SRC_FLAGS 0x0 /* SRC flags */ +#define TI_ASCII_WORDS 0x0 /* Number of ASCII words */ + +/* HEX words: Number of hex words of data added, up to 8 total + * this value is one more. 
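+ * (so TI_HEX_WORDS = 0x02 below corresponds to a single hex data word,
+ * src_word[0], being filled in by update_sp_attn_area()).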
+ */ +#define TI_HEX_WORDS 0x02 +/* SRC length : 8 byte header, 8 hex words of data and + * 32 byte ASCII SRC + */ +#define TI_SRC_LEN 0x48 + +static struct ti_attn *ti_attn; + +/* Initialises SP attention area with default values */ +static void init_sp_attn_area(void) +{ + /* Already done */ + if (ti_attn) + return; + + /* We are just enabling attention area 1 */ + ti_attn = (struct ti_attn *)&cpu_ctl_sp_attn_area1; + + /* Attention component checks Attn area 2 first, if its NULL + * it will check for Attn area 1. + */ + memset(&cpu_ctl_sp_attn_area1, 0, sizeof(struct sp_attn_area)); + memset(&cpu_ctl_sp_attn_area2, 0, sizeof(struct sp_attn_area)); + + ti_attn->cmd_valid = TI_CMD_VALID; + ti_attn->attn_cmd = TI_CMD; + ti_attn->data_len = CPU_TO_BE16(TI_DATA_LEN); + /* Dump control byte not used as of now */ + ti_attn->dump_ctrl =TI_DMP_CTL; + ti_attn->dump_type = CPU_TO_BE16(TI_DUMP_TYPE); + + /* SRC format */ + ti_attn->src_fmt = TI_FORMAT; + /* SRC flags */ + ti_attn->src_flags = TI_SRC_FLAGS; + /* #ASCII words */ + ti_attn->ascii_cnt = TI_ASCII_WORDS; + /* #HEX words */ + ti_attn->hex_cnt = TI_HEX_WORDS; + ti_attn->src_len = CPU_TO_BE16(TI_SRC_LEN); + snprintf(ti_attn->src, SRC_LEN, "%X", generate_src_from_comp(OPAL_RC_ATTN)); +} + +/* Updates src in sp attention area + */ +static void update_sp_attn_area(const char *msg) +{ +#define STACK_BUF_ENTRIES 20 + struct bt_entry bt_buf[STACK_BUF_ENTRIES]; + struct bt_metadata metadata; + unsigned int len; + + if (!fsp_present()) + return; + + /* This can be called early */ + if (!ti_attn) + init_sp_attn_area(); + + ti_attn->src_word[0] = + cpu_to_be32((uint32_t)((uint64_t)__builtin_return_address(0) & 0xffffffff)); + + snprintf(ti_attn->msg.version, VERSION_LEN, "%s", version); + backtrace_create(bt_buf, STACK_BUF_ENTRIES, &metadata); + metadata.token = OPAL_LAST + 1; + len = BT_FRAME_LEN; + backtrace_print(bt_buf, &metadata, ti_attn->msg.bt_buf, &len, false); + snprintf(ti_attn->msg.file_info, FILE_INFO_LEN, "%s", msg); + + ti_attn->msg_len = cpu_to_be32(VERSION_LEN + BT_FRAME_LEN + + strlen(ti_attn->msg.file_info)); +} + +void __attribute__((noreturn)) ibm_fsp_terminate(const char *msg) +{ + /* Update SP attention area */ + update_sp_attn_area(msg); + + /* Update op panel op_display */ + op_display(OP_FATAL, OP_MOD_CORE, 0x6666); + + /* Save crashing CPU details */ + opal_mpipl_save_crashing_pir(); + + /* XXX FIXME: We should fsp_poll for a while to ensure any pending + * console writes have made it out, but until we have decent PSI + * link handling we must not do it forever. Polling can prevent the + * FSP from bringing the PSI link up and it can get stuck in a + * reboot loop. + */ + + trigger_attn(); + for (;;) ; +} + +/* Intialises SP attention area */ +void fsp_attn_init(void) +{ + if (!fsp_present()) + return; + + init_sp_attn_area(); +} diff --git a/roms/skiboot/hw/fsp/fsp-chiptod.c b/roms/skiboot/hw/fsp/fsp-chiptod.c new file mode 100644 index 000000000..e4ede3c1c --- /dev/null +++ b/roms/skiboot/hw/fsp/fsp-chiptod.c @@ -0,0 +1,69 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* + * On some chiptod errors, ask the FSP for a new topology + * + * Copyright 2013-2017 IBM Corp. 
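+ *
+ * (Summary of the flow implemented below, for reference: the FSP sends
+ * an "Enable/Disable Topology" notification in the HW maintenance
+ * class; we hand the requested action and topology to
+ * chiptod_adjust_topology() and respond with status 0x00 on success,
+ * or FSP_STATUS_TOPO_IN_USE when the topology cannot be switched.)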
+ */ + +#define pr_fmt(fmt) "CHIPTOD: " fmt + +#include <skiboot.h> +#include <chiptod.h> +#include <fsp.h> + +/* Response status for fsp command 0xE6, s/c 0x06 (Enable/Disable Topology) */ +#define FSP_STATUS_TOPO_IN_USE 0xb8 /* topology is in use */ + +static bool fsp_chiptod_update_topology(uint32_t cmd_sub_mod, + struct fsp_msg *msg) +{ + struct fsp_msg *resp; + enum chiptod_topology topo; + bool action; + uint8_t status = 0; + + switch (cmd_sub_mod) { + case FSP_CMD_TOPO_ENABLE_DISABLE: + /* + * Action Values: 0x00 = Disable, 0x01 = Enable + * Topology Values: 0x00 = Primary, 0x01 = Secondary + */ + action = !!msg->data.bytes[2]; + topo = msg->data.bytes[3]; + prlog(PR_DEBUG, "Topology update event:\n"); + prlog(PR_DEBUG, " Action = %s, Topology = %s\n", + action ? "Enable" : "Disable", + topo ? "Secondary" : "Primary"); + + if (!chiptod_adjust_topology(topo, action)) + status = FSP_STATUS_TOPO_IN_USE; + else + status = 0x00; + + resp = fsp_mkmsg(FSP_RSP_TOPO_ENABLE_DISABLE | status, 0); + if (!resp) { + prerror("Response allocation failed\n"); + return false; + } + if (fsp_queue_msg(resp, fsp_freemsg)) { + fsp_freemsg(resp); + prerror("Failed to queue response msg\n"); + return false; + } + return true; + default: + prlog(PR_DEBUG, "Unhandled sub cmd: %06x\n", cmd_sub_mod); + break; + } + return false; +} + +static struct fsp_client fsp_chiptod_client = { + .message = fsp_chiptod_update_topology, +}; + +void fsp_chiptod_init(void) +{ + /* Register for Class E6 (HW maintanance) */ + fsp_register_client(&fsp_chiptod_client, FSP_MCLASS_HW_MAINT); +} diff --git a/roms/skiboot/hw/fsp/fsp-codeupdate.c b/roms/skiboot/hw/fsp/fsp-codeupdate.c new file mode 100644 index 000000000..3cd5b2bc9 --- /dev/null +++ b/roms/skiboot/hw/fsp/fsp-codeupdate.c @@ -0,0 +1,1315 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* + * Firmware code update for FSP systems + * + * Copyright 2013-2018 IBM Corp. 
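+ *
+ * Rough flow, as implemented below: the OS calls OPAL_FLASH_VALIDATE,
+ * OPAL_FLASH_MANAGE and OPAL_FLASH_UPDATE. A successful
+ * OPAL_FLASH_UPDATE only stashes the scatter-gather image list and sets
+ * fsp_flash_term_hook; the LIDs are actually written out to the FSP by
+ * fsp_flash_firmware() just before the following reboot.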
+ */ + +#include <skiboot.h> +#include <fsp.h> +#include <fsp-sysparam.h> +#include <lock.h> +#include <device.h> +#include <ccan/endian/endian.h> +#include <errorlog.h> +#include <opal-api.h> +#include <timebase.h> + +#include "fsp-codeupdate.h" + +enum flash_state { + FLASH_STATE_ABSENT, + FLASH_STATE_INVALID, /* IPL side marker lid is invalid */ + FLASH_STATE_READING, + FLASH_STATE_READ, + FLASH_STATE_ABORT, +}; + +enum lid_fetch_side { + FETCH_T_SIDE_ONLY, + FETCH_P_SIDE_ONLY, + FETCH_BOTH_SIDE, +}; + +static enum flash_state flash_state = FLASH_STATE_INVALID; +static enum lid_fetch_side lid_fetch_side = FETCH_BOTH_SIDE; + +/* Image buffers */ +static struct opal_sg_list *image_data; +static uint32_t tce_start; +static void *lid_data; +static char validate_buf[VALIDATE_BUF_SIZE]; + +/* TCE buffer lock */ +static struct lock flash_lock = LOCK_UNLOCKED; + +/* FW VPD data */ +static struct fw_image_vpd fw_vpd[2]; + +/* Code update related sys parameters */ +static uint32_t ipl_side; +static uint32_t hmc_managed; +static uint32_t update_policy; +static uint32_t in_flight_params; + +/* If non-NULL, this gets called just before rebooting */ +int (*fsp_flash_term_hook)(void); + +DEFINE_LOG_ENTRY(OPAL_RC_CU_INIT, OPAL_PLATFORM_ERR_EVT, OPAL_CODEUPDATE, + OPAL_PLATFORM_FIRMWARE, + OPAL_PREDICTIVE_ERR_FAULT_RECTIFY_REBOOT, OPAL_NA); + +DEFINE_LOG_ENTRY(OPAL_RC_CU_FLASH, OPAL_PLATFORM_ERR_EVT, OPAL_CODEUPDATE, + OPAL_PLATFORM_FIRMWARE, + OPAL_PREDICTIVE_ERR_FAULT_RECTIFY_REBOOT, OPAL_NA); + +DEFINE_LOG_ENTRY(OPAL_RC_CU_SG_LIST, OPAL_PLATFORM_ERR_EVT, OPAL_CODEUPDATE, + OPAL_PLATFORM_FIRMWARE, + OPAL_PREDICTIVE_ERR_FAULT_RECTIFY_REBOOT, OPAL_NA); + +DEFINE_LOG_ENTRY(OPAL_RC_CU_COMMIT, OPAL_PLATFORM_ERR_EVT, OPAL_CODEUPDATE, + OPAL_PLATFORM_FIRMWARE, + OPAL_PREDICTIVE_ERR_FAULT_RECTIFY_REBOOT, OPAL_NA); + +DEFINE_LOG_ENTRY(OPAL_RC_CU_MSG, OPAL_PLATFORM_ERR_EVT, OPAL_CODEUPDATE, + OPAL_PLATFORM_FIRMWARE, + OPAL_PREDICTIVE_ERR_FAULT_RECTIFY_REBOOT, OPAL_NA); + +DEFINE_LOG_ENTRY(OPAL_RC_CU_NOTIFY, OPAL_PLATFORM_ERR_EVT, OPAL_CODEUPDATE, + OPAL_PLATFORM_FIRMWARE, + OPAL_PREDICTIVE_ERR_FAULT_RECTIFY_REBOOT, OPAL_NA); + +DEFINE_LOG_ENTRY(OPAL_RC_CU_MARKER_LID, OPAL_PLATFORM_ERR_EVT, OPAL_CODEUPDATE, + OPAL_PLATFORM_FIRMWARE, + OPAL_PREDICTIVE_ERR_FAULT_RECTIFY_REBOOT, OPAL_NA); + +static inline void code_update_tce_map(uint32_t tce_offset, + void *buffer, uint32_t size) +{ + uint32_t tlen = ALIGN_UP(size, TCE_PSIZE); + + fsp_tce_map(PSI_DMA_CODE_UPD + tce_offset, buffer, tlen); +} + +static inline void code_update_tce_unmap(uint32_t size) +{ + fsp_tce_unmap(PSI_DMA_CODE_UPD, size); +} + +static inline void set_def_fw_version(uint32_t side) +{ + strncpy(fw_vpd[side].mi_keyword, FW_VERSION_UNKNOWN, MI_KEYWORD_SIZE); + strncpy(fw_vpd[side].ext_fw_id, FW_VERSION_UNKNOWN, ML_KEYWORD_SIZE); +} + +/* + * Get IPL side + */ +static void get_ipl_side(void) +{ + struct dt_node *iplp; + const char *side = NULL; + + iplp = dt_find_by_path(dt_root, "ipl-params/ipl-params"); + if (iplp) + side = dt_prop_get_def(iplp, "cec-ipl-side", NULL); + prlog(PR_NOTICE, "CUPD: IPL SIDE = %s\n", side); + + if (!side || !strcmp(side, "temp")) + ipl_side = FW_IPL_SIDE_TEMP; + else + ipl_side = FW_IPL_SIDE_PERM; +} + + +/* + * Helper routines to retrieve code update related + * system parameters from FSP. 
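+ *
+ * The retrieval is asynchronous. The pattern used below is roughly:
+ *
+ *   inc_in_flight_param();
+ *   rc = fsp_get_sys_param(SYS_PARAM_FLASH_POLICY, &update_policy, 4,
+ *                          got_code_update_policy, NULL);
+ *   if (rc)
+ *           dec_in_flight_param();   (no callback will ever run)
+ *
+ * with the completion callback byte-swapping the returned value and
+ * calling dec_in_flight_param(). fsp_code_update_wait_vpd() later waits
+ * for in_flight_params to drain before the values are relied upon.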
+ */ + +static void inc_in_flight_param(void) +{ + lock(&flash_lock); + in_flight_params++; + unlock(&flash_lock); +} + +static void dec_in_flight_param(void) +{ + lock(&flash_lock); + assert(in_flight_params > 0); + in_flight_params--; + unlock(&flash_lock); +} + +static void got_code_update_policy(uint32_t param_id __unused, int err_len, + void *data __unused) +{ + if (err_len != 4) { + log_simple_error(&e_info(OPAL_RC_CU_INIT), "CUPD: Error " + "retrieving code update policy: %d\n", err_len); + } else { + update_policy = be32_to_cpu((__be32)update_policy); + prlog(PR_NOTICE, "CUPD: Code update policy from FSP: %d\n", + update_policy); + } + + dec_in_flight_param(); +} + +static void get_code_update_policy(void) +{ + int rc; + + inc_in_flight_param(); + rc = fsp_get_sys_param(SYS_PARAM_FLASH_POLICY, &update_policy, 4, + got_code_update_policy, NULL); + if (rc) { + log_simple_error(&e_info(OPAL_RC_CU_INIT), + "CUPD: Error %d queueing param request\n", rc); + dec_in_flight_param(); + } +} + +static void got_platform_hmc_managed(uint32_t param_id __unused, int err_len, + void *data __unused) +{ + if (err_len != 4) { + log_simple_error(&e_info(OPAL_RC_CU_INIT), "CUPD: Error " + "retrieving hmc managed status: %d\n", err_len); + } else { + hmc_managed = be32_to_cpu((__be32)hmc_managed); + prlog(PR_NOTICE, "CUPD: HMC managed status from FSP: %d\n", + hmc_managed); + } + + dec_in_flight_param(); +} + +static void get_platform_hmc_managed(void) +{ + int rc; + + inc_in_flight_param(); + rc = fsp_get_sys_param(SYS_PARAM_HMC_MANAGED, &hmc_managed, 4, + got_platform_hmc_managed, NULL); + if (rc) { + log_simple_error(&e_info(OPAL_RC_CU_INIT), + "CUPD: Error %d queueing param request\n", rc); + dec_in_flight_param(); + } +} + +static bool fw_ipl_side_update_notify(struct fsp_msg *msg) +{ + u32 param_id = fsp_msg_get_data_word(msg, 0); + int dlen = fsp_msg_get_data_word(msg, 1) & 0xffff; + uint32_t state = fsp_msg_get_data_word(msg, 2); + + if (param_id != SYS_PARAM_FW_IPL_SIDE) + return false; + + if (dlen != 4) { + prlog(PR_DEBUG, + "CUPD: Invalid sysparams notify len : 0x%x\n", dlen); + return false; + } + + prlog(PR_NOTICE, "CUPD: FW IPL side changed. Disable fast reboot\n"); + prlog(PR_NOTICE, "CUPD: Next IPL side : %s\n", + state == FW_IPL_SIDE_TEMP ? "temp" : "perm"); + + disable_fast_reboot("FSP IPL Side Change"); + return true; +} + +static int64_t code_update_check_state(void) +{ + switch(flash_state) { + case FLASH_STATE_ABSENT: + return OPAL_HARDWARE; + case FLASH_STATE_INVALID: + case FLASH_STATE_ABORT: + return OPAL_INTERNAL_ERROR; + case FLASH_STATE_READING: + return OPAL_BUSY; + default: + break; + } + return OPAL_SUCCESS; +} + +/* + * Get common marker LID additional data section + */ +static void *get_adf_sec_data(struct com_marker_adf_sec *adf_sec, + uint32_t name) +{ + struct com_marker_adf_header *adf_header; + int i; + + adf_header = (void *)adf_sec->adf_data; + for (i = 0; i < be32_to_cpu(adf_sec->adf_cnt); i++) { + if (be32_to_cpu(adf_header->name) == name) + return adf_header; + + adf_header = (void *)adf_header + be32_to_cpu(adf_header->size); + } + return NULL; +} + +/* + * Parse common marker LID to get FW version details + * + * Note: + * At present, we are parsing "Service Pack Nomenclature ADF" + * section only. If we are adding FW IP support, then we have + * to parse "Firmware IP Protection ADF" as well. 
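+ *
+ * Layout walked below (simplified sketch, offsets are byte offsets
+ * stored in the LID itself):
+ *
+ *   com_marker_header
+ *     MI_offset  --> com_marker_mi_section
+ *                      mi_keyword[40]   ("NNSSS_FFF")
+ *                      adf_offset --> com_marker_adf_sec
+ *                                       adf_cnt + ADF entries, searched
+ *                                       by name; the SPNM entry yields
+ *                                       the ML / ext_fw_id string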
+ */ +static void parse_marker_lid(uint32_t side) +{ + struct com_marker_header *header; + struct com_marker_mi_section *mi_sec; + struct com_marker_adf_sec *adf_sec; + struct com_marker_adf_sp *adf_sp; + + header = (void *)lid_data; + + /* Get MI details */ + mi_sec = (void *)header + be32_to_cpu(header->MI_offset); + /* + * If Marker LID is invalid, then FSP will return a Marker + * LID with ASCII zeros for the entire MI keyword. + */ + if (mi_sec->mi_keyword[0] == '0') + return; + + strncpy(fw_vpd[side].mi_keyword, mi_sec->mi_keyword, MI_KEYWORD_SIZE); + fw_vpd[side].mi_keyword[MI_KEYWORD_SIZE - 1] = '\0'; + prlog(PR_NOTICE, "CUPD: %s side MI Keyword = %s\n", + side == 0x00 ? "P" : "T", fw_vpd[side].mi_keyword); + + /* Get ML details */ + adf_sec = (void *)header + be32_to_cpu(mi_sec->adf_offset); + adf_sp = get_adf_sec_data(adf_sec, ADF_NAME_SP); + if (!adf_sp) + return; + + strncpy(fw_vpd[side].ext_fw_id, + (void *)adf_sp + be32_to_cpu(adf_sp->sp_name_offset), + ML_KEYWORD_SIZE); + fw_vpd[side].ext_fw_id[ML_KEYWORD_SIZE - 1] = '\0'; + prlog(PR_NOTICE, "CUPD: %s side ML Keyword = %s\n", + side == 0x00 ? "P" : "T", fw_vpd[side].ext_fw_id); +} + +static void validate_com_marker_lid(void) +{ + if (!strncmp(fw_vpd[ipl_side].mi_keyword, FW_VERSION_UNKNOWN, + sizeof(FW_VERSION_UNKNOWN))) { + log_simple_error(&e_info(OPAL_RC_CU_MARKER_LID), + "CUPD: IPL side Marker LID is not valid\n"); + flash_state = FLASH_STATE_INVALID; + return; + } + + flash_state = FLASH_STATE_READ; +} + +static void fetch_lid_data_complete(struct fsp_msg *msg) +{ + void *buffer; + size_t length, chunk; + uint32_t lid_id, offset; + uint16_t id; + uint8_t flags, status; + int rc; + + status = (msg->resp->word1 >> 8) & 0xff; + flags = (fsp_msg_get_data_word(msg, 0) >> 16) & 0xff; + id = fsp_msg_get_data_word(msg, 0) & 0xffff; + lid_id = fsp_msg_get_data_word(msg, 1); + offset = fsp_msg_get_data_word(msg->resp, 1); + length = fsp_msg_get_data_word(msg->resp, 2); + + prlog(PR_NOTICE, "CUPD: Marker LID id : size : status = " + "0x%x : 0x%x : 0x%x\n", + fsp_msg_get_data_word(msg, 1), fsp_msg_get_data_word(msg->resp, 2), status); + + fsp_freemsg(msg); + + switch (status) { + case FSP_STATUS_SUCCESS: /* Read complete, parse VPD */ + parse_marker_lid(lid_id == P_COM_MARKER_LID_ID ? 0 : 1); + break; + case FSP_STATUS_MORE_DATA: /* More data left */ + offset += length; + chunk = MARKER_LID_SIZE - offset; + if (chunk > 0) { + buffer = (void *)PSI_DMA_CODE_UPD + offset; + rc = fsp_fetch_data_queue(flags, id, lid_id, + offset, buffer, &chunk, + fetch_lid_data_complete); + + /* If queue msg fails, then continue with marker LID + * validation hoping that we have at least boot side + * information. + */ + if (rc == OPAL_SUCCESS) + return; + } + break; + default: /* Fetch LID call failed */ + break; + } + + /* If required, fetch T side marker LID */ + if (lid_id == P_COM_MARKER_LID_ID && + lid_fetch_side == FETCH_BOTH_SIDE) { + length = MARKER_LID_SIZE; + rc = fsp_fetch_data_queue(flags, id, T_COM_MARKER_LID_ID, + 0, (void *)PSI_DMA_CODE_UPD, + &length, fetch_lid_data_complete); + + /* If queue msg fails, then continue with marker LID + * validation hoping that we have at least boot side + * information. 
+ */ + if (rc == OPAL_SUCCESS) + return; + } + + lock(&flash_lock); + + /* Validate marker LID data */ + validate_com_marker_lid(); + /* TCE unmap */ + code_update_tce_unmap(MARKER_LID_SIZE); + + unlock(&flash_lock); +} + +static void fetch_com_marker_lid(void) +{ + size_t length = MARKER_LID_SIZE; + uint32_t lid_id; + int rc; + + /* Read in progress? */ + rc = code_update_check_state(); + if (rc == OPAL_HARDWARE || rc == OPAL_BUSY) + return; + + if (lid_fetch_side == FETCH_T_SIDE_ONLY) { + lid_id = T_COM_MARKER_LID_ID; + set_def_fw_version(FW_IPL_SIDE_TEMP); + } else if (lid_fetch_side == FETCH_P_SIDE_ONLY) { + lid_id = P_COM_MARKER_LID_ID; + set_def_fw_version(FW_IPL_SIDE_PERM); + } else { + lid_id = P_COM_MARKER_LID_ID; + set_def_fw_version(FW_IPL_SIDE_PERM); + set_def_fw_version(FW_IPL_SIDE_TEMP); + } + + code_update_tce_map(0, lid_data, length); + rc = fsp_fetch_data_queue(0x00, 0x05, lid_id, 0, + (void *)PSI_DMA_CODE_UPD, &length, + fetch_lid_data_complete); + if (!rc) + flash_state = FLASH_STATE_READING; + else + flash_state = FLASH_STATE_INVALID; +} + +/* + * Add MI and ML keyword details into DT + */ +#define FW_VER_SIZE 64 +static void add_opal_firmware_version(void) +{ + struct dt_node *dt_fw; + char buffer[FW_VER_SIZE]; + int offset; + + dt_fw = dt_find_by_path(dt_root, "ibm,opal/firmware"); + if (!dt_fw) + return; + + /* MI version */ + offset = snprintf(buffer, FW_VER_SIZE, "MI %s %s", + fw_vpd[FW_IPL_SIDE_TEMP].mi_keyword, + fw_vpd[FW_IPL_SIDE_PERM].mi_keyword); + if (ipl_side == FW_IPL_SIDE_TEMP) + snprintf(buffer + offset, FW_VER_SIZE - offset, + " %s", fw_vpd[FW_IPL_SIDE_TEMP].mi_keyword); + else + snprintf(buffer + offset, FW_VER_SIZE - offset, + " %s", fw_vpd[FW_IPL_SIDE_PERM].mi_keyword); + + dt_add_property(dt_fw, "mi-version", buffer, strlen(buffer)); + + /* ML version */ + offset = snprintf(buffer, FW_VER_SIZE, "ML %s %s", + fw_vpd[FW_IPL_SIDE_TEMP].ext_fw_id, + fw_vpd[FW_IPL_SIDE_PERM].ext_fw_id); + if (ipl_side == FW_IPL_SIDE_TEMP) + snprintf(buffer + offset, FW_VER_SIZE - offset, + " %s", fw_vpd[FW_IPL_SIDE_TEMP].ext_fw_id); + else + snprintf(buffer + offset, FW_VER_SIZE - offset, + " %s", fw_vpd[FW_IPL_SIDE_PERM].ext_fw_id); + + dt_add_property(dt_fw, "ml-version", buffer, strlen(buffer)); +} + +/* + * This is called right before starting the payload (Linux) to + * ensure the common marker LID read and parsing has happened + * before we transfer control. 
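+ *
+ * The wait is a simple polling loop; flash_lock is dropped across each
+ * time_wait_ms(5) so that the LID fetch completion and the sysparam
+ * callbacks (which also take flash_lock) can make progress:
+ *
+ *   while (flash_state == FLASH_STATE_READING || in_flight_params) {
+ *           unlock(&flash_lock);
+ *           time_wait_ms(5);
+ *           lock(&flash_lock);
+ *   }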
+ */ +void fsp_code_update_wait_vpd(bool is_boot) +{ + int waited = 0; + + if (!fsp_present()) + return; + + prlog(PR_NOTICE, "CUPD: Waiting read marker LID" + " and in flight parsm completion...\n"); + + lock(&flash_lock); + while(true) { + if (!(flash_state == FLASH_STATE_READING || in_flight_params)) + break; + unlock(&flash_lock); + time_wait_ms(5); + waited+=5; + lock(&flash_lock); + } + unlock(&flash_lock); + + if (waited) + prlog(PR_DEBUG, "CUPD: fsp_code_update_wait_vpd %d\n", waited); + + if (is_boot) + add_opal_firmware_version(); +} + +static int code_update_start(void) +{ + struct fsp_msg *msg; + int rc; + uint16_t comp = 0x00; /* All components */ + uint8_t side = OPAL_COMMIT_TMP_SIDE; /* Temporary side */ + + msg = fsp_mkmsg(FSP_CMD_FLASH_START, 1, side << 16 | comp); + if (!msg) { + log_simple_error(&e_info(OPAL_RC_CU_MSG), + "CUPD: CMD_FLASH_START message allocation failed !\n"); + return OPAL_INTERNAL_ERROR; + } + if (fsp_sync_msg(msg, false)) { + fsp_freemsg(msg); + return OPAL_INTERNAL_ERROR; + } + rc = (msg->resp->word1 >> 8) & 0xff; + fsp_freemsg(msg); + return rc; +} + +static int code_update_write_lid(uint32_t lid_id, uint32_t size) +{ + struct fsp_msg *msg; + int rc, n_pairs = 1; + + msg = fsp_mkmsg(FSP_CMD_FLASH_WRITE, 5, lid_id, + n_pairs, 0, tce_start, size); + if (!msg) { + log_simple_error(&e_info(OPAL_RC_CU_MSG), + "CUPD: CMD_FLASH_WRITE message allocation failed !\n"); + return OPAL_INTERNAL_ERROR; + } + if (fsp_sync_msg(msg, false)) { + fsp_freemsg(msg); + return OPAL_INTERNAL_ERROR; + } + rc = (msg->resp->word1 >> 8) & 0xff; + fsp_freemsg(msg); + return rc; +} + +static int code_update_del_lid(uint32_t lid_id) +{ + struct fsp_msg *msg; + int rc; + + msg = fsp_mkmsg(FSP_CMD_FLASH_DEL, 1, lid_id); + if (!msg) { + log_simple_error(&e_info(OPAL_RC_CU_MSG), + "CUPD: CMD_FLASH_DEL message allocation failed !\n"); + return OPAL_INTERNAL_ERROR; + } + if (fsp_sync_msg(msg, false)) { + fsp_freemsg(msg); + return OPAL_INTERNAL_ERROR; + } + rc = (msg->resp->word1 >> 8) & 0xff; + fsp_freemsg(msg); + return rc; +} + +static int code_update_complete(uint32_t cmd) +{ + struct fsp_msg *msg; + int rc; + + msg = fsp_mkmsg(cmd, 0); + if (!msg) { + log_simple_error(&e_info(OPAL_RC_CU_MSG), + "CUPD: CUPD COMPLETE message allocation failed !\n"); + return OPAL_INTERNAL_ERROR; + } + if (fsp_sync_msg(msg, false)) { + fsp_freemsg(msg); + return OPAL_INTERNAL_ERROR; + } + rc = (msg->resp->word1 >> 8) & 0xff; + fsp_freemsg(msg); + return rc; +} + +static int code_update_swap_side(void) +{ + struct fsp_msg *msg; + int rc; + + msg = fsp_mkmsg(FSP_CMD_FLASH_SWAP, 0); + if (!msg) { + log_simple_error(&e_info(OPAL_RC_CU_MSG), + "CUPD: CMD_FLASH_SWAP message allocation failed !\n"); + return OPAL_INTERNAL_ERROR; + } + + if (fsp_sync_msg(msg, false)) { + fsp_freemsg(msg); + return OPAL_INTERNAL_ERROR; + } + rc = (msg->resp->word1 >> 8) & 0xff; + fsp_freemsg(msg); + return rc; +} + +static int code_update_set_ipl_side(void) +{ + struct fsp_msg *msg; + uint8_t side = FW_IPL_SIDE_TEMP; /* Next IPL side */ + int rc; + + msg = fsp_mkmsg(FSP_CMD_SET_IPL_SIDE, 1, side << 16); + if (!msg) { + log_simple_error(&e_info(OPAL_RC_CU_MSG), + "CUPD: CMD_SET_IPL_SIDE message allocation failed!\n"); + return OPAL_INTERNAL_ERROR; + } + if (fsp_sync_msg(msg, false)) { + fsp_freemsg(msg); + log_simple_error(&e_info(OPAL_RC_CU_MSG), + "CUPD: Setting next IPL side failed!\n"); + return OPAL_INTERNAL_ERROR; + } + rc = (msg->resp->word1 >> 8) & 0xff; + fsp_freemsg(msg); + return rc; +} + +static void 
code_update_commit_complete(struct fsp_msg *msg) +{ + int rc; + uint8_t type; + + rc = (msg->resp->word1 >> 8) & 0xff; + type = (msg->word1 >> 8) & 0xff; + fsp_freemsg(msg); + if (rc) { + log_simple_error(&e_info(OPAL_RC_CU_COMMIT), + "CUPD: Code update commit failed, err 0x%x\n", rc); + return; + } + + /* Reset cached VPD data */ + lock(&flash_lock); + + /* Find commit type */ + if (type == 0x01) { + lid_fetch_side = FETCH_P_SIDE_ONLY; + } else if (type == 0x02) + lid_fetch_side = FETCH_T_SIDE_ONLY; + else + lid_fetch_side = FETCH_BOTH_SIDE; + + fetch_com_marker_lid(); + + unlock(&flash_lock); +} + +static int code_update_commit(uint32_t cmd) +{ + struct fsp_msg *msg; + + msg = fsp_mkmsg(cmd, 0); + if (!msg) { + log_simple_error(&e_info(OPAL_RC_CU_MSG), + "CUPD: COMMIT message allocation failed !\n"); + return OPAL_INTERNAL_ERROR; + } + if (fsp_queue_msg(msg, code_update_commit_complete)) { + log_simple_error(&e_info(OPAL_RC_CU_COMMIT), + "CUPD: Failed to queue code update commit message\n"); + fsp_freemsg(msg); + return OPAL_INTERNAL_ERROR; + } + return OPAL_SUCCESS; +} + +/* + * Inband code update is allowed? + */ +static int64_t validate_inband_policy(void) +{ + /* Quirk: + * If the code update policy is out-of-band, but the system + * is not HMC-managed, then inband update is allowed. + */ + if (hmc_managed != PLATFORM_HMC_MANAGED) + return 0; + if (update_policy == INBAND_UPDATE_ALLOWED) + return 0; + + return -1; +} + +/* + * Validate magic Number + */ +static int64_t validate_magic_num(uint16_t magic) +{ + if (magic != IMAGE_MAGIC_NUMBER) + return -1; + return 0; +} + +/* + * Compare MI keyword to make sure candidate image + * is valid for this platform. + */ +static int64_t validate_image_version(struct update_image_header *header, + uint32_t *result) +{ + struct fw_image_vpd vpd; + int t_valid = 0, p_valid = 0, cton_ver = -1, ptot_ver = -1; + + /* Valid flash image level? 
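+	 * (i.e. does each side carry a real MI keyword rather than the
+	 * FW_VERSION_UNKNOWN default; index 0 is the P side, index 1 the
+	 * T side)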
*/ + if (strncmp(fw_vpd[0].mi_keyword, FW_VERSION_UNKNOWN, + sizeof(FW_VERSION_UNKNOWN)) != 0) + p_valid = 1; + + if (strncmp(fw_vpd[1].mi_keyword, FW_VERSION_UNKNOWN, + sizeof(FW_VERSION_UNKNOWN)) != 0) + t_valid = 1; + + /* Validate with IPL side image */ + vpd = fw_vpd[ipl_side]; + + /* Validate platform identifier (first two char of MI keyword) */ + if (strncmp(vpd.mi_keyword, header->mi_keyword_data, 2) != 0) { + *result = VALIDATE_INVALID_IMG; + return OPAL_SUCCESS; + } + + /* Don't flash different FW series (like P7 image on P8) */ + if (vpd.mi_keyword[2] != header->mi_keyword_data[2]) { + *result = VALIDATE_INVALID_IMG; + return OPAL_SUCCESS; + } + + /* Get current to new version difference */ + cton_ver = strncmp(vpd.mi_keyword + 3, header->mi_keyword_data + 3, 6); + + /* Get P to T version difference */ + if (t_valid && p_valid) + ptot_ver = strncmp(fw_vpd[0].mi_keyword + 3, + fw_vpd[1].mi_keyword + 3, 6); + + /* Update validation result */ + if (ipl_side == FW_IPL_SIDE_TEMP) { + if (!ptot_ver && cton_ver > 0) /* downgrade T side */ + *result = VALIDATE_TMP_UPDATE_DL; + else if (!ptot_ver && cton_ver <= 0) /* upgrade T side */ + *result = VALIDATE_TMP_UPDATE; + else if (cton_ver > 0) /* Implied commit & downgrade T side */ + *result = VALIDATE_TMP_COMMIT_DL; + else /* Implied commit & upgrade T side */ + *result = VALIDATE_TMP_COMMIT; + } else { + if (!t_valid) /* Current unknown */ + *result = VALIDATE_CUR_UNKNOWN; + else if (cton_ver > 0) /* downgrade FW version */ + *result = VALIDATE_TMP_UPDATE_DL; + else /* upgrade FW version */ + *result = VALIDATE_TMP_UPDATE; + } + return OPAL_SUCCESS; +} + +/* + * Validate candidate image + */ +static int validate_candidate_image(uint64_t buffer, + uint32_t size, uint32_t *result) +{ + struct update_image_header *header; + int rc = OPAL_PARAMETER; + + if (size < VALIDATE_BUF_SIZE) + goto out; + + rc = code_update_check_state(); + if (rc != OPAL_SUCCESS) + goto out; + + if (validate_inband_policy() != 0) { + *result = VALIDATE_FLASH_AUTH; + rc = OPAL_SUCCESS; + goto out; + } + + memcpy(validate_buf, (void *)buffer, VALIDATE_BUF_SIZE); + header = (struct update_image_header *)validate_buf; + + if (validate_magic_num(be16_to_cpu(header->magic)) != 0) { + *result = VALIDATE_INVALID_IMG; + rc = OPAL_SUCCESS; + goto out; + } + rc = validate_image_version(header, result); +out: + return rc; +} + +static int validate_out_buf_mi_data(void *buffer, int offset, uint32_t result) +{ + struct update_image_header *header = (void *)validate_buf; + + /* Current T & P side MI data */ + offset += snprintf(buffer + offset, VALIDATE_BUF_SIZE - offset, + "MI %s %s\n", + fw_vpd[1].mi_keyword, fw_vpd[0].mi_keyword); + + /* New T & P side MI data */ + offset += snprintf(buffer + offset, VALIDATE_BUF_SIZE - offset, + "MI %s", header->mi_keyword_data); + if (result == VALIDATE_TMP_COMMIT_DL || + result == VALIDATE_TMP_COMMIT) + offset += snprintf(buffer + offset, + VALIDATE_BUF_SIZE - offset, + " %s\n", fw_vpd[1].mi_keyword); + else + offset += snprintf(buffer + offset, + VALIDATE_BUF_SIZE - offset, + " %s\n", fw_vpd[0].mi_keyword); + return offset; +} + +static int validate_out_buf_ml_data(void *buffer, int offset, uint32_t result) +{ + struct update_image_header *header = (void *)validate_buf; + /* Candidate image ML data */ + char *ext_fw_id = (void *)header->data; + + /* Current T & P side ML data */ + offset += snprintf(buffer + offset, VALIDATE_BUF_SIZE - offset, + "ML %s %s\n", + fw_vpd[1].ext_fw_id, fw_vpd[0].ext_fw_id); + + /* New T & P side ML data */ + 
offset += snprintf(buffer + offset, VALIDATE_BUF_SIZE - offset, + "ML %s", ext_fw_id); + if (result == VALIDATE_TMP_COMMIT_DL || + result == VALIDATE_TMP_COMMIT) + offset += snprintf(buffer + offset, + VALIDATE_BUF_SIZE - offset, + " %s\n", fw_vpd[1].ext_fw_id); + else + offset += snprintf(buffer + offset, + VALIDATE_BUF_SIZE - offset, + " %s\n", fw_vpd[0].ext_fw_id); + + return offset; +} + +/* + * Copy LID data to TCE buffer + */ +static int get_lid_data(struct opal_sg_list *list, + int lid_size, int lid_offset) +{ + struct opal_sg_list *sg; + struct opal_sg_entry *entry; + int length, num_entries, i, buf_pos = 0; + int map_act, map_size; + bool last = false; + + /* Reset TCE start address */ + tce_start = 0; + + for (sg = list; sg; sg = (struct opal_sg_list*)be64_to_cpu(sg->next)) { + length = (be64_to_cpu(sg->length) & ~(SG_LIST_VERSION << 56)) - 16; + num_entries = length / sizeof(struct opal_sg_entry); + if (num_entries <= 0) + return -1; + + for (i = 0; i < num_entries; i++) { + entry = &sg->entry[i]; + + /* + * Continue until we get data block which + * contains LID data + */ + if (lid_offset > be64_to_cpu(entry->length)) { + lid_offset -= be64_to_cpu(entry->length); + continue; + } + + /* + * SG list entry size can be more than 4k. + * Map only required pages, instead of + * mapping entire entry. + */ + map_act = be64_to_cpu(entry->length); + map_size = be64_to_cpu(entry->length); + + /* First TCE mapping */ + if (!tce_start) { + tce_start = PSI_DMA_CODE_UPD + + (lid_offset & 0xfff); + map_act = be64_to_cpu(entry->length) - lid_offset; + lid_offset &= ~0xfff; + map_size = be64_to_cpu(entry->length) - lid_offset; + } + + /* Check pending LID size to map */ + if (lid_size <= map_act) { + /* (map_size - map_act) gives page + * start to tce offset difference. + * This is required when LID size + * is <= 4k. + */ + map_size = (map_size - map_act) + lid_size; + last = true; + } + + /* Ajust remaining size to map */ + lid_size -= map_act; + + /* TCE mapping */ + code_update_tce_map(buf_pos, + (void*)(be64_to_cpu(entry->data) + + lid_offset), + map_size); + buf_pos += map_size; + /* Reset LID offset count */ + lid_offset = 0; + + if (last) + return OPAL_SUCCESS; + } + } /* outer loop */ + return -1; +} + +/* + * If IPL side is T, then swap P & T sides to add + * new fix to T side. 
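+ *
+ * In other words the update always lands on the T side: if we are
+ * currently running from T, a FSP_CMD_FLASH_SWAP ("rename") is issued
+ * first so the running image becomes the P side and the new image can
+ * then be written to the (new) T side.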
+ */ +static int validate_ipl_side(void) +{ + if (ipl_side == FW_IPL_SIDE_PERM) + return 0; + return code_update_swap_side(); +} + +static int64_t fsp_opal_validate_flash(uint64_t buffer, + __be32 *size, __be32 *result) +{ + int64_t rc = 0; + int offset; + uint32_t r; + + lock(&flash_lock); + + rc = validate_candidate_image(buffer, be32_to_cpu(*size), &r); + /* Fill output buffer + * + * Format: + * MI<sp>current-T-image<sp>current-P-image<0x0A> + * MI<sp>new-T-image<sp>new-P-image<0x0A> + * ML<sp>current-T-image<sp>current-P-image<0x0A> + * ML<sp>new-T-image<sp>new-P-image<0x0A> + */ + if (!rc && (r != VALIDATE_FLASH_AUTH && r != VALIDATE_INVALID_IMG)) { + /* Clear output buffer */ + memset((void *)buffer, 0, VALIDATE_BUF_SIZE); + + offset = validate_out_buf_mi_data((void *)buffer, 0, r); + offset += validate_out_buf_ml_data((void *)buffer, offset, r); + *size = cpu_to_be32(offset); + } + *result = cpu_to_be32(r); + + unlock(&flash_lock); + return rc; +} + +/* Commit/Reject T side image */ +static int64_t fsp_opal_manage_flash(uint8_t op) +{ + uint32_t cmd; + int rc; + + lock(&flash_lock); + rc = code_update_check_state(); + unlock(&flash_lock); + + if (rc != OPAL_SUCCESS) + return rc; + + if (op != OPAL_REJECT_TMP_SIDE && op != OPAL_COMMIT_TMP_SIDE) + return OPAL_PARAMETER; + + if ((op == OPAL_COMMIT_TMP_SIDE && ipl_side == FW_IPL_SIDE_PERM) || + (op == OPAL_REJECT_TMP_SIDE && ipl_side == FW_IPL_SIDE_TEMP)) + return OPAL_ACTIVE_SIDE_ERR; + + if (op == OPAL_COMMIT_TMP_SIDE) + cmd = FSP_CMD_FLASH_NORMAL; + else + cmd = FSP_CMD_FLASH_REMOVE; + + return code_update_commit(cmd); +} + +static int fsp_flash_firmware(void) +{ + struct update_image_header *header; + struct lid_index_entry *idx_entry; + struct opal_sg_list *list; + struct opal_sg_entry *entry; + int rc, i; + + /* Make sure no outstanding LID read is in progress */ + rc = code_update_check_state(); + if (rc == OPAL_BUSY) + fsp_code_update_wait_vpd(false); + + /* Get LID Index */ + list = image_data; + if (!list) + goto out; + entry = &list->entry[0]; + header = (struct update_image_header *)be64_to_cpu(entry->data); + idx_entry = (void *)header + be16_to_cpu(header->lid_index_offset); + + /* FIXME: + * At present we depend on FSP to validate CRC for + * individual LIDs. Calculate and validate individual + * LID CRC here. + */ + + if (validate_ipl_side() != 0) { + log_simple_error(&e_info(OPAL_RC_CU_FLASH), "CUPD: " + "Rename (Swap T and P) failed!\n"); + goto out; + } + + /* Set next IPL side */ + if (code_update_set_ipl_side() != 0) { + log_simple_error(&e_info(OPAL_RC_CU_FLASH), "CUPD: " + "Setting next IPL side failed!\n"); + goto out; + } + + /* Start code update process */ + if (code_update_start() != 0) { + log_simple_error(&e_info(OPAL_RC_CU_FLASH), "CUPD: " + "Code update start failed!\n"); + goto out; + } + + /* + * Delete T side LIDs before writing. + * + * Note: + * - Applicable for FWv >= 760. + * - Current Code Update design is to ignore + * any delete lid failure, and continue with + * the update. + */ + rc = code_update_del_lid(DEL_UPD_SIDE_LIDS); + + if (rc) + prlog(PR_TRACE, "CUPD: Failed to delete LIDs (%d). 
This is okay, continuing..", rc); + + for (i = 0; i < be16_to_cpu(header->number_lids); i++) { + if (be32_to_cpu(idx_entry->size) > LID_MAX_SIZE) { + log_simple_error(&e_info(OPAL_RC_CU_FLASH), "CUPD: LID" + " (0x%x) size 0x%x is > max LID size (0x%x).\n", + be32_to_cpu(idx_entry->id), + be32_to_cpu(idx_entry->size), LID_MAX_SIZE); + goto abort_update; + } + + rc = get_lid_data(list, be32_to_cpu(idx_entry->size), + be32_to_cpu(idx_entry->offset)); + if (rc) { + log_simple_error(&e_info(OPAL_RC_CU_FLASH), "CUPD: " + "Failed to parse LID from firmware image." + " (rc : %d).\n", rc); + goto abort_update; + } + + rc = code_update_write_lid(be32_to_cpu(idx_entry->id), + be32_to_cpu(idx_entry->size)); + if (rc) { + log_simple_error(&e_info(OPAL_RC_CU_FLASH), "CUPD: " + "Failed to write LID to FSP. (rc : %d).\n", rc); + goto abort_update; + } + + /* Unmap TCE */ + code_update_tce_unmap(PSI_DMA_CODE_UPD_SIZE); + + /* Next LID index */ + idx_entry = (void *)idx_entry + sizeof(struct lid_index_entry); + } + + /* Code update completed */ + rc = code_update_complete(FSP_CMD_FLASH_COMPLETE); + + return rc; + +abort_update: + rc = code_update_complete(FSP_CMD_FLASH_ABORT); + if (rc) + log_simple_error(&e_info(OPAL_RC_CU_FLASH), "CUPD: " + "Code update abort command failed. (rc : %d).", rc); + +out: + return -1; +} + +static int64_t validate_sglist(struct opal_sg_list *list) +{ + struct opal_sg_list *sg; + struct opal_sg_entry *prev_entry, *entry; + int length, num_entries, i; + + prev_entry = NULL; + for (sg = list; sg; sg = (struct opal_sg_list*)be64_to_cpu(sg->next)) { + length = (be64_to_cpu(sg->length) & ~(SG_LIST_VERSION << 56)) - 16; + num_entries = length / sizeof(struct opal_sg_entry); + if (num_entries <= 0) + return -1; + + for (i = 0; i < num_entries; i++) { + entry = &sg->entry[i]; + + /* All entries must be aligned */ + if (((uint64_t)be64_to_cpu(entry->data)) & 0xfff) + return OPAL_PARAMETER; + + /* All non-terminal entries size must be aligned */ + if (prev_entry && (be64_to_cpu(prev_entry->length) & 0xfff)) + return OPAL_PARAMETER; + + prev_entry = entry; + } + } + return OPAL_SUCCESS; +} + +static int64_t fsp_opal_update_flash(struct opal_sg_list *list) +{ + struct opal_sg_entry *entry; + int length, num_entries, result = 0, rc = OPAL_PARAMETER; + + /* Ensure that the sg list honors our alignment requirements */ + rc = validate_sglist(list); + if (rc) { + log_simple_error(&e_info(OPAL_RC_CU_SG_LIST), + "CUPD: sglist fails alignment requirements\n"); + return rc; + } + + lock(&flash_lock); + if (!list) { /* Cancel update request */ + fsp_flash_term_hook = NULL; + image_data = NULL; + rc = OPAL_SUCCESS; + goto out; + } + + disable_fast_reboot("FSP Code Update"); + + length = (be64_to_cpu(list->length) & ~(SG_LIST_VERSION << 56)) - 16; + num_entries = length / sizeof(struct opal_sg_entry); + if (num_entries <= 0) + goto out; + + /* Validate image header */ + entry = &list->entry[0]; + rc = validate_candidate_image((uint64_t)be64_to_cpu(entry->data), + VALIDATE_BUF_SIZE, &result); + if (!rc && (result != VALIDATE_FLASH_AUTH && + result != VALIDATE_INVALID_IMG)) { + image_data = list; + fsp_flash_term_hook = fsp_flash_firmware; + goto out; + } + + /* Adjust return code */ + if (result == VALIDATE_FLASH_AUTH) + rc = OPAL_FLASH_NO_AUTH; + else if (result == VALIDATE_INVALID_IMG) + rc = OPAL_INVALID_IMAGE; + +out: + unlock(&flash_lock); + return rc; +} + +/* + * Code Update notifications + * + * Note: At present we just ACK these notifications. 
+ * Reset cached VPD data if we are going to support + * concurrent image maint in future. + */ +static bool code_update_notify(uint32_t cmd_sub_mod, struct fsp_msg *msg) +{ + int rc; + uint32_t cmd; + + switch(cmd_sub_mod) { + case FSP_CMD_FLASH_CACHE: + cmd = FSP_CMD_FLASH_CACHE_RSP; + prlog(PR_NOTICE, "CUPD: Update LID cache event [data = 0x%x]\n", + fsp_msg_get_data_word(msg, 0)); + break; + case FSP_CMD_FLASH_OUTC: + case FSP_CMD_FLASH_OUTR: + case FSP_CMD_FLASH_OUTS: + cmd = FSP_CMD_FLASH_OUT_RSP; + prlog(PR_NOTICE, "CUPD: Out of band commit notify " + "[Type = 0x%x]\n", (msg->word1 >> 8) & 0xff); + break; + default: + log_simple_error(&e_info(OPAL_RC_CU_NOTIFY), "CUPD: Unknown " + "notification [cmd = 0x%x]\n", cmd_sub_mod); + return false; + } + + rc = fsp_queue_msg(fsp_mkmsg(cmd, 0), fsp_freemsg); + if (rc) + log_simple_error(&e_info(OPAL_RC_CU_NOTIFY), "CUPD: Failed to " + "queue code update notification response :%d\n", rc); + + return true; +} + +/* + * Handle FSP R/R event. + * + * Note: + * If FSP R/R happens during code update, then entire system reboots + * and comes up with P side image (and T side image will be invalid). + * Hence we don't need to handle R/R during code update. + * + * Also if FSP R/R happens in init path (while retrieving in_flight_params) + * then system fails to continue booting (because we have not yet loaded + * all required data/LID from FSP). Hence we don't need to handle R/R + * for system params. + */ +static bool fsp_code_update_rr(uint32_t cmd_sub_mod, + struct fsp_msg *msg __unused) +{ + switch (cmd_sub_mod) { + case FSP_RESET_START: + lock(&flash_lock); + + if (code_update_check_state() == OPAL_BUSY) + flash_state = FLASH_STATE_ABORT; + + unlock(&flash_lock); + return true; + case FSP_RELOAD_COMPLETE: + lock(&flash_lock); + + /* Lets try to parse marker LID again, if we failed + * to parse marker LID last time. 
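+		 * (code_update_check_state() returns OPAL_INTERNAL_ERROR
+		 * when flash_state is FLASH_STATE_INVALID or
+		 * FLASH_STATE_ABORT, which is exactly the "failed last
+		 * time" case re-tried here).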
+ */ + if (code_update_check_state() == OPAL_INTERNAL_ERROR) + fetch_com_marker_lid(); + + unlock(&flash_lock); + return true; + } + return false; +} + +static struct fsp_client fsp_cupd_client_rr = { + .message = fsp_code_update_rr, +}; + +static struct fsp_client fsp_get_notify = { + .message = code_update_notify, +}; + +void fsp_code_update_init(void) +{ + if (!fsp_present()) { + flash_state = FLASH_STATE_ABSENT; + return; + } + + /* OPAL interface */ + opal_register(OPAL_FLASH_VALIDATE, fsp_opal_validate_flash, 3); + opal_register(OPAL_FLASH_MANAGE, fsp_opal_manage_flash, 1); + opal_register(OPAL_FLASH_UPDATE, fsp_opal_update_flash, 1); + + /* register Code Update Class D3 */ + fsp_register_client(&fsp_get_notify, FSP_MCLASS_CODE_UPDATE); + /* Register for Class AA (FSP R/R) */ + fsp_register_client(&fsp_cupd_client_rr, FSP_MCLASS_RR_EVENT); + + /* Register for firmware IPL side update notification */ + sysparam_add_update_notifier(fw_ipl_side_update_notify); + + /* Flash hook */ + fsp_flash_term_hook = NULL; + + /* Fetch various code update related sys parameters */ + get_ipl_side(); + get_code_update_policy(); + get_platform_hmc_managed(); + + /* Fetch common marker LID */ + lid_data = memalign(TCE_PSIZE, MARKER_LID_SIZE); + if (!lid_data) { + log_simple_error(&e_info(OPAL_RC_CU_INIT), + "CUPD: Failed to allocate memory for marker LID\n"); + flash_state = FLASH_STATE_ABSENT; + return; + } + fetch_com_marker_lid(); +} diff --git a/roms/skiboot/hw/fsp/fsp-codeupdate.h b/roms/skiboot/hw/fsp/fsp-codeupdate.h new file mode 100644 index 000000000..2b86619ef --- /dev/null +++ b/roms/skiboot/hw/fsp/fsp-codeupdate.h @@ -0,0 +1,222 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* Copyright 2013-2015 IBM Corp. */ + +#ifndef __CODEUPDATE_H +#define __CODEUPDATE_H + +/* Flash SG list version */ +#define SG_LIST_VERSION (1UL) + +/* LID size <= 16M */ +#define LID_MAX_SIZE 0x1000000 + +/* Delete all LIDs in */ +#define DEL_UPD_SIDE_LIDS 0xFFFFFFFF + +/* System parameter values used in code update validation */ +#define INBAND_UPDATE_ALLOWED 0x01 +#define PLATFORM_HMC_MANAGED 0x01 +#define FW_LICENSE_ACCEPT 0x01 + +/* Running image side */ +#define FW_IPL_SIDE_TEMP 0x01 +#define FW_IPL_SIDE_PERM 0x00 + +/* Manage operations */ +#define OPAL_REJECT_TMP_SIDE 0 +#define OPAL_COMMIT_TMP_SIDE 1 + +/* Validate image size */ +#define VALIDATE_BUF_SIZE 4096 + +/* Code update operation status */ +#define OPAL_INVALID_IMAGE -1003 /* Unacceptable image */ +#define OPAL_ACTIVE_SIDE_ERR -9001 +#define OPAL_FLASH_NO_AUTH -9002 + +/* Validate image update result tokens */ +#define VALIDATE_TMP_UPDATE 0 /* T side will be updated */ +#define VALIDATE_FLASH_AUTH 1 /* Partition does not have authority */ +#define VALIDATE_INVALID_IMG 2 /* Candidate image is not valid */ +#define VALIDATE_CUR_UNKNOWN 3 /* Current fixpack level is unknown */ +/* + * Current T side will be committed to P side before being replace with new + * image, and the new image is downlevel from current image + */ +#define VALIDATE_TMP_COMMIT_DL 4 +/* + * Current T side will be committed to P side before being replaced with new + * image + */ +#define VALIDATE_TMP_COMMIT 5 +/* + * T side will be updated with a downlevel image + */ +#define VALIDATE_TMP_UPDATE_DL 6 +/* + * The candidate image's release date is later than the system's firmware + * service entitlement date - service warranty period has expired + */ +#define VALIDATE_OUT_OF_WRNTY 7 + +/* default version */ +#define FW_VERSION_UNKNOWN "UNKNOWN" + +/* Actual size of MI 
& ML keyword including NULL */ +#define MI_KEYWORD_SIZE 10 +#define ML_KEYWORD_SIZE 9 + +/* Firmware image VPD data */ +struct fw_image_vpd { + char mi_keyword[MI_KEYWORD_SIZE]; /* NNSSS_FFF */ + char ext_fw_id[ML_KEYWORD_SIZE]; /* FWxxx.yy */ +}; + +/* Master LID header */ +struct master_lid_header { + char key[3]; /* "MLH" */ + uint8_t version; /* 0x02 */ + __be16 header_size; + __be16 entry_size; + uint8_t reserved[56]; +}; + +/* LID index entry */ +struct lid_index_entry { + __be32 id; + __be32 size; + __be32 offset; + __be32 crc; +}; + +/* SP flags */ +#define FW_ONE_OFF_SP 0x80000000 +#define FW_EMERGENCY_SP 0x40000000 + +/* + * SP GA date + * + * sp_flag addr = header->data + header->ext_fw_id_size + */ +struct update_image_ga_date { + __be32 sp_flag; + char sp_ga_date[8]; /* YYYYMMDD */ +}; + +/* Image magic number */ +#define IMAGE_MAGIC_NUMBER 0x5549 + +/* Image header structure */ +struct update_image_header { + __be16 magic; + __be16 version; + __be32 package_size; + __be32 crc; + __be16 lid_index_offset; + __be16 number_lids; + __be16 package_flags; + __be16 mi_keyword_size; + char mi_keyword_data[40]; + __be16 ext_fw_id_size; + /* Rest of the image data including ext fw id, sp flags */ + char data[]; +}; + +/* FipS header */ +struct fips_header { + __be16 magic; + __be16 version; + __be32 lid_id; + __be32 lid_date; /* YYYYMMDD */ + __be16 lid_time; /* HHMM */ + __be16 lid_class; + __be32 crc; + __be32 lid_size; /* Number of bytes below header */ + __be32 header_size; + uint8_t mtd_number; + uint8_t valid; /* 1 = valid, 0 = invalid */ + uint8_t reserved; + uint8_t lid_info_size; + char lid_info[64]; /* code level */ + __be32 update_date; /* YYYYMMDD */ + __be16 update_time; /* HHMM */ + __be16 phylum_len; + uint8_t lid_phylum[]; +}; + +/* Approximate LID size */ +#define MASTER_LID_SIZE 0x5000 +/* + * Note: + * Doc indicates non-SP LIDs size is 0-8MB. However + * in reality marker LID size less than 4k. Allocating + * 8k to give some breathing space. + */ +#define MARKER_LID_SIZE 0x00002000 + +/* Common marker LID no */ +#define P_COM_MARKER_LID_ID 0x80A00001 +#define T_COM_MARKER_LID_ID (P_COM_MARKER_LID_ID | ADJUST_T_SIDE_LID_NO) + +/* + * Common marker LID structure + * + * Note that we are populating only required sections, + * not all ADF sections in common marker LID. + */ +struct com_marker_header { + __be32 version; + __be32 MI_offset; /* Offset to MI section */ + __be32 iseries_offset; +}; + +/* MI Keyword section */ +struct com_marker_mi_section { + __be32 MI_size; + char mi_keyword[40]; /* MI Keyword */ + char lst_disrupt_fix_lvl[3]; + char skip[21]; /* Skip not interested fields */ + __be32 adf_offset; /* Offset to ADF section */ +}; + +/* Additional Data Fields */ +struct com_marker_adf_sec { + __be32 adf_cnt; /* ADF count */ + char adf_data[]; /* ADF data */ +}; + +/* ADF common header */ +struct com_marker_adf_header { + __be32 size; /* Section size */ + __be32 name; /* Section name */ +}; + +/* + * Service Pack Nomenclature ADF + * + * Service pack release name. + */ +#define ADF_NAME_SP 0x53504E4D /* SPNM */ +struct com_marker_adf_sp +{ + struct com_marker_adf_header header; + __be32 sp_name_offset; /* Offset from start of ADF */ + __be32 sp_name_size; + __be32 skip[4]; /* Skip rest of fields */ +}; + +/* + * Firmware IP Protection ADF + * + * Service Pack flags and GA date. 
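+ *
+ * (Not consumed by parse_marker_lid() at present; see the note there
+ * about only the Service Pack Nomenclature ADF being parsed today.)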
+ */ +#define ADF_NAME_FW_IP 0x46495050 /* FIPP */ +struct com_marker_fw_ip { + struct com_marker_adf_header header; + __be32 sp_flag_offset; /* Offset from start of ADF */ + __be32 sp_flag_size; + __be32 sp_ga_offset; /* Offset from start of ADF*/ + __be32 sp_ga_size; +}; + +#endif /* __CODEUPDATE_H */ diff --git a/roms/skiboot/hw/fsp/fsp-console.c b/roms/skiboot/hw/fsp/fsp-console.c new file mode 100644 index 000000000..dc23ac46f --- /dev/null +++ b/roms/skiboot/hw/fsp/fsp-console.c @@ -0,0 +1,1062 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* + * Flexible Service Processor (FSP) serial console handling code + * + * Copyright 2013-2018 IBM Corp. + */ + +#include <skiboot.h> +#include <processor.h> +#include <io.h> +#include <fsp.h> +#include <console.h> +#include <opal.h> +#include <timebase.h> +#include <device.h> +#include <fsp-sysparam.h> +#include <errorlog.h> +#include <lock.h> + +DEFINE_LOG_ENTRY(OPAL_RC_CONSOLE_HANG, OPAL_PLATFORM_ERR_EVT, OPAL_CONSOLE, + OPAL_PLATFORM_FIRMWARE, + OPAL_PREDICTIVE_ERR_GENERAL, OPAL_NA); + +struct fsp_serbuf_hdr { + __be16 partition_id; + u8 session_id; + u8 hmc_id; + __be16 data_offset; + __be16 last_valid; + __be16 ovf_count; + __be16 next_in; + u8 flags; + u8 reserved; + __be16 next_out; + u8 data[]; +}; +#define SER_BUF_DATA_SIZE (0x10000 - sizeof(struct fsp_serbuf_hdr)) + +struct fsp_serial { + bool available; + bool open; + bool has_part0; + bool has_part1; + bool log_port; + bool out_poke; + char loc_code[LOC_CODE_SIZE]; + u16 rsrc_id; + struct fsp_serbuf_hdr *in_buf; + struct fsp_serbuf_hdr *out_buf; + struct fsp_msg *poke_msg; + u8 waiting; + u64 irq; + u16 out_buf_prev_len; + u64 out_buf_timeout; +}; + +#define SER_BUFFER_SIZE 0x00040000UL +#define MAX_SERIAL 4 + +#define SER_BUFFER_OUT_TIMEOUT 10 + +static struct fsp_serial fsp_serials[MAX_SERIAL]; +static bool got_intf_query; +static struct lock fsp_con_lock = LOCK_UNLOCKED; +static void* ser_buffer = NULL; + +static void fsp_console_reinit(void) +{ + int i; + void *base; + struct fsp_msg *msg; + + /* Initialize out data structure pointers & TCE maps */ + base = ser_buffer; + for (i = 0; i < MAX_SERIAL; i++) { + struct fsp_serial *ser = &fsp_serials[i]; + + ser->in_buf = base; + ser->out_buf = base + SER_BUFFER_SIZE/2; + base += SER_BUFFER_SIZE; + } + fsp_tce_map(PSI_DMA_SER0_BASE, ser_buffer, + 4 * PSI_DMA_SER0_SIZE); + + for (i = 0; i < MAX_SERIAL; i++) { + struct fsp_serial *fs = &fsp_serials[i]; + + if (!fs->available) + continue; + + if (fs->rsrc_id == 0xffff) + continue; + prlog(PR_DEBUG, "FSP: Reassociating HVSI console %d\n", i); + msg = fsp_mkmsg(FSP_CMD_ASSOC_SERIAL, 2, + (fs->rsrc_id << 16) | 1, i); + if (!msg) { + prerror("FSPCON: Failed to allocate associate msg\n"); + return; + } + if (fsp_queue_msg(msg, fsp_freemsg)) { + fsp_freemsg(msg); + prerror("FSPCON: Failed to queue associate msg\n"); + return; + } + } +} + +static void fsp_close_consoles(void) +{ + unsigned int i; + + for (i = 0; i < MAX_SERIAL; i++) { + struct fsp_serial *fs = &fsp_serials[i]; + + if (!fs->available) + continue; + + lock(&fsp_con_lock); + if (fs->open) { + fs->open = false; + fs->out_poke = false; + if (fs->poke_msg->state != fsp_msg_unused) + fsp_cancelmsg(fs->poke_msg); + fsp_freemsg(fs->poke_msg); + fs->poke_msg = NULL; + } + unlock(&fsp_con_lock); + } + prlog(PR_DEBUG, "FSPCON: Closed consoles due to FSP reset/reload\n"); +} + +static void fsp_pokemsg_reclaim(struct fsp_msg *msg) +{ + struct fsp_serial *fs = msg->user_data; + + /* + * The poke_msg might have been 
"detached" from the console + * in vserial_close, so we need to check whether it's current + * before touching the state, otherwise, just free it + */ + lock(&fsp_con_lock); + if (fs->open && fs->poke_msg == msg) { + if (fs->out_poke) { + if (fsp_queue_msg(fs->poke_msg, fsp_pokemsg_reclaim)) { + prerror("FSPCON: failed to queue poke msg\n"); + } else { + fs->out_poke = false; + } + } else + fs->poke_msg->state = fsp_msg_unused; + } else + fsp_freemsg(msg); + unlock(&fsp_con_lock); +} + +/* Called with the fsp_con_lock held */ +static size_t fsp_write_vserial(struct fsp_serial *fs, const char *buf, + size_t len) +{ + struct fsp_serbuf_hdr *sb = fs->out_buf; + u16 old_nin = be16_to_cpu(sb->next_in); + u16 space, chunk; + + if (!fs->open) + return 0; + + space = (be16_to_cpu(sb->next_out) + SER_BUF_DATA_SIZE - old_nin - 1) + % SER_BUF_DATA_SIZE; + if (space < len) + len = space; + if (!len) + return 0; + + chunk = SER_BUF_DATA_SIZE - old_nin; + if (chunk > len) + chunk = len; + memcpy(&sb->data[old_nin], buf, chunk); + if (chunk < len) + memcpy(&sb->data[0], buf + chunk, len - chunk); + lwsync(); + sb->next_in = cpu_to_be16((old_nin + len) % SER_BUF_DATA_SIZE); + sync(); + + if (be16_to_cpu(sb->next_out) == old_nin && fs->poke_msg) { + if (fs->poke_msg->state == fsp_msg_unused) { + if (fsp_queue_msg(fs->poke_msg, fsp_pokemsg_reclaim)) + prerror("FSPCON: poke msg queuing failed\n"); + } else + fs->out_poke = true; + } +#ifndef DISABLE_CON_PENDING_EVT + opal_update_pending_evt(OPAL_EVENT_CONSOLE_OUTPUT, + OPAL_EVENT_CONSOLE_OUTPUT); +#endif + return len; +} + +#ifdef DVS_CONSOLE +static int fsp_con_port = -1; +static bool fsp_con_full; + +/* + * This is called by the code in console.c without the con_lock + * held. However it can be called as the result of any printf + * thus any other lock might be held including possibly the + * FSP lock + */ +static size_t fsp_con_write(const char *buf, size_t len) +{ + size_t written; + + if (fsp_con_port < 0) + return 0; + + lock(&fsp_con_lock); + written = fsp_write_vserial(&fsp_serials[fsp_con_port], buf, len); + fsp_con_full = (written < len); + unlock(&fsp_con_lock); + + return written; +} + +static struct con_ops fsp_con_ops = { + .write = fsp_con_write, +}; +#endif /* DVS_CONSOLE */ + +static void fsp_open_vserial(struct fsp_msg *msg) +{ + struct fsp_msg *resp; + + u16 part_id = fsp_msg_get_data_word(msg, 0) & 0xffff; + u16 sess_id = fsp_msg_get_data_word(msg, 1) & 0xffff; + u8 hmc_sess = msg->data.bytes[0]; + u8 hmc_indx = msg->data.bytes[1]; + u8 authority = msg->data.bytes[4]; + u32 tce_in, tce_out; + struct fsp_serial *fs; + + prlog(PR_INFO, "FSPCON: Got VSerial Open\n"); + prlog(PR_DEBUG, " part_id = 0x%04x\n", part_id); + prlog(PR_DEBUG, " sess_id = 0x%04x\n", sess_id); + prlog(PR_DEBUG, " hmc_sess = 0x%02x\n", hmc_sess); + prlog(PR_DEBUG, " hmc_indx = 0x%02x\n", hmc_indx); + prlog(PR_DEBUG, " authority = 0x%02x\n", authority); + + if (sess_id >= MAX_SERIAL || !fsp_serials[sess_id].available) { + prlog(PR_WARNING, "FSPCON: 0x%04x NOT AVAILABLE!\n", sess_id); + resp = fsp_mkmsg(FSP_RSP_OPEN_VSERIAL | 0x2f, 0); + if (!resp) { + prerror("FSPCON: Response allocation failed\n"); + return; + } + if (fsp_queue_msg(resp, fsp_freemsg)) { + fsp_freemsg(resp); + prerror("FSPCON: Failed to queue response msg\n"); + } + return; + } + + fs = &fsp_serials[sess_id]; + + /* Hack ! On blades, the console opened via the mm has partition 1 + * while the debug DVS generally has partition 0 (though you can + * use what you want really). 
+ * We don't want a DVS open/close to crap on the blademm console + * thus if it's a raw console, gets an open with partID 1, we + * set a flag that ignores the close of partid 0 + */ + if (fs->rsrc_id == 0xffff) { + if (part_id == 0) + fs->has_part0 = true; + if (part_id == 1) + fs->has_part1 = true; + } + + tce_in = PSI_DMA_SER0_BASE + PSI_DMA_SER0_SIZE * sess_id; + tce_out = tce_in + SER_BUFFER_SIZE/2; + + lock(&fsp_con_lock); + if (fs->open) { + prlog(PR_DEBUG, " already open, skipping init !\n"); + unlock(&fsp_con_lock); + goto already_open; + } + + fs->poke_msg = fsp_mkmsg(FSP_CMD_VSERIAL_OUT, 2, + fsp_msg_get_data_word(msg, 0), + fsp_msg_get_data_word(msg, 1) & 0xffff); + if (fs->poke_msg == NULL) { + prerror("FSPCON: Failed to allocate poke_msg\n"); + unlock(&fsp_con_lock); + return; + } + + fs->open = true; + fs->poke_msg->user_data = fs; + + fs->in_buf->partition_id = fs->out_buf->partition_id = cpu_to_be16(part_id); + fs->in_buf->session_id = fs->out_buf->session_id = sess_id; + fs->in_buf->hmc_id = fs->out_buf->hmc_id = hmc_indx; + fs->in_buf->data_offset = fs->out_buf->data_offset = + cpu_to_be16(sizeof(struct fsp_serbuf_hdr)); + fs->in_buf->last_valid = fs->out_buf->last_valid = + cpu_to_be16(SER_BUF_DATA_SIZE - 1); + fs->in_buf->ovf_count = fs->out_buf->ovf_count = 0; + fs->in_buf->next_in = fs->out_buf->next_in = 0; + fs->in_buf->flags = fs->out_buf->flags = 0; + fs->in_buf->reserved = fs->out_buf->reserved = 0; + fs->in_buf->next_out = fs->out_buf->next_out = 0; + fs->out_buf_prev_len = 0; + fs->out_buf_timeout = 0; + unlock(&fsp_con_lock); + + already_open: + resp = fsp_mkmsg(FSP_RSP_OPEN_VSERIAL, 6, fsp_msg_get_data_word(msg, 0), + fsp_msg_get_data_word(msg, 1) & 0xffff, 0, tce_in, 0, tce_out); + if (!resp) { + prerror("FSPCON: Failed to allocate open msg response\n"); + return; + } + if (fsp_queue_msg(resp, fsp_freemsg)) { + fsp_freemsg(resp); + prerror("FSPCON: Failed to queue open msg response\n"); + return; + } + +#ifdef DVS_CONSOLE + prlog(PR_DEBUG, " log_port = %d\n", fs->log_port); + if (fs->log_port) { + fsp_con_port = sess_id; + sync(); + /* + * We mark the FSP lock as being in the console + * path. 
We do that only once, we never unmark it + * (there is really no much point) + */ + fsp_used_by_console(); + fsp_con_lock.in_con_path = true; + /* See comment in fsp_used_by_console */ + lock(&fsp_con_lock); + unlock(&fsp_con_lock); + set_console(&fsp_con_ops); + } +#endif +} + +static void fsp_close_vserial(struct fsp_msg *msg) +{ + u16 part_id = fsp_msg_get_data_word(msg, 0) & 0xffff; + u16 sess_id = fsp_msg_get_data_word(msg, 1) & 0xffff; + u8 hmc_sess = msg->data.bytes[0]; + u8 hmc_indx = msg->data.bytes[1]; + u8 authority = msg->data.bytes[4]; + struct fsp_serial *fs; + struct fsp_msg *resp; + + prlog(PR_INFO, "FSPCON: Got VSerial Close\n"); + prlog(PR_DEBUG, " part_id = 0x%04x\n", part_id); + prlog(PR_DEBUG, " sess_id = 0x%04x\n", sess_id); + prlog(PR_DEBUG, " hmc_sess = 0x%02x\n", hmc_sess); + prlog(PR_DEBUG, " hmc_indx = 0x%02x\n", hmc_indx); + prlog(PR_DEBUG, " authority = 0x%02x\n", authority); + + if (sess_id >= MAX_SERIAL || !fsp_serials[sess_id].available) { + prlog(PR_WARNING, "FSPCON: 0x%04x NOT AVAILABLE!\n", sess_id); + goto skip_close; + } + + fs = &fsp_serials[sess_id]; + + /* See "HACK" comment in open */ + if (fs->rsrc_id == 0xffff) { + if (part_id == 0) + fs->has_part0 = false; + if (part_id == 1) + fs->has_part1 = false; + if (fs->has_part0 || fs->has_part1) { + prlog(PR_DEBUG, " skipping close !\n"); + goto skip_close; + } + } + +#ifdef DVS_CONSOLE + if (fs->log_port) { + fsp_con_port = -1; + set_console(NULL); + } +#endif + + lock(&fsp_con_lock); + if (fs->open) { + fs->open = false; + fs->out_poke = false; + if (fs->poke_msg && fs->poke_msg->state == fsp_msg_unused) { + fsp_freemsg(fs->poke_msg); + fs->poke_msg = NULL; + } + } + unlock(&fsp_con_lock); + skip_close: + resp = fsp_mkmsg(FSP_RSP_CLOSE_VSERIAL, 2, fsp_msg_get_data_word(msg, 0), + fsp_msg_get_data_word(msg, 1) & 0xffff); + if (!resp) { + prerror("FSPCON: Failed to allocate close msg response\n"); + return; + } + if (fsp_queue_msg(resp, fsp_freemsg)) { + fsp_freemsg(resp); + prerror("FSPCON: Failed to queue close msg response\n"); + } +} + +static bool fsp_con_msg_hmc(u32 cmd_sub_mod, struct fsp_msg *msg) +{ + struct fsp_msg *resp; + + /* Associate response */ + if ((cmd_sub_mod >> 8) == 0xe08a) { + prlog(PR_TRACE, "FSPCON: Got associate response, status" + " 0x%02x\n", cmd_sub_mod & 0xff); + return true; + } + if ((cmd_sub_mod >> 8) == 0xe08b) { + prlog(PR_TRACE, "Got unassociate response, status 0x%02x\n", + cmd_sub_mod & 0xff); + return true; + } + switch(cmd_sub_mod) { + case FSP_CMD_OPEN_VSERIAL: + fsp_open_vserial(msg); + return true; + case FSP_CMD_CLOSE_VSERIAL: + fsp_close_vserial(msg); + return true; + case FSP_CMD_HMC_INTF_QUERY: + prlog(PR_DEBUG, "FSPCON: Got HMC interface query\n"); + got_intf_query = true; + resp = fsp_mkmsg(FSP_RSP_HMC_INTF_QUERY, 1, + fsp_msg_get_data_word(msg, 0) & 0x00ffffff); + if (!resp) { + prerror("FSPCON: Failed to allocate hmc intf response\n"); + return true; + } + if (fsp_queue_msg(resp, fsp_freemsg)) { + fsp_freemsg(resp); + prerror("FSPCON: Failed to queue hmc intf response\n"); + } + return true; + } + return false; +} + +static bool fsp_con_msg_vt(u32 cmd_sub_mod, struct fsp_msg *msg) +{ + u16 sess_id = fsp_msg_get_data_word(msg, 1) & 0xffff; + + if (cmd_sub_mod == FSP_CMD_VSERIAL_IN && sess_id < MAX_SERIAL) { + struct fsp_serial *fs = &fsp_serials[sess_id]; + + if (!fs->open) + return true; + + /* FSP is signaling some incoming data. 
We take the console + * lock to avoid racing with a simultaneous read, though we + * might want to consider to simplify all that locking into + * one single lock that covers the console and the pending + * events. + */ + lock(&fsp_con_lock); + opal_update_pending_evt(OPAL_EVENT_CONSOLE_INPUT, + OPAL_EVENT_CONSOLE_INPUT); + opal_update_pending_evt(fs->irq, fs->irq); + unlock(&fsp_con_lock); + } + return true; +} + +static bool fsp_con_msg_rr(u32 cmd_sub_mod, struct fsp_msg *msg) +{ + assert(msg == NULL); + + switch (cmd_sub_mod) { + case FSP_RESET_START: + fsp_close_consoles(); + return true; + case FSP_RELOAD_COMPLETE: + fsp_console_reinit(); + return true; + } + return false; +} + +static struct fsp_client fsp_con_client_hmc = { + .message = fsp_con_msg_hmc, +}; + +static struct fsp_client fsp_con_client_vt = { + .message = fsp_con_msg_vt, +}; + +static struct fsp_client fsp_con_client_rr = { + .message = fsp_con_msg_rr, +}; + +static void fsp_serial_add(int index, u16 rsrc_id, const char *loc_code, + bool log_port) +{ + struct fsp_serial *ser; + struct fsp_msg *msg; + + lock(&fsp_con_lock); + ser = &fsp_serials[index]; + + if (ser->available) { + unlock(&fsp_con_lock); + return; + } + + ser->rsrc_id = rsrc_id; + memset(ser->loc_code, 0x00, LOC_CODE_SIZE); + strncpy(ser->loc_code, loc_code, LOC_CODE_SIZE - 1); + ser->available = true; + ser->log_port = log_port; + unlock(&fsp_con_lock); + + /* DVS doesn't have that */ + if (rsrc_id != 0xffff) { + msg = fsp_mkmsg(FSP_CMD_ASSOC_SERIAL, 2, + (rsrc_id << 16) | 1, index); + if (!msg) { + prerror("FSPCON: Assoc serial alloc failed\n"); + return; + } + if (fsp_queue_msg(msg, fsp_freemsg)) { + fsp_freemsg(msg); + prerror("FSPCON: Assoc serial queue failed\n"); + return; + } + } +} + +void fsp_console_preinit(void) +{ + int i; + void *base; + + if (!fsp_present()) + return; + + ser_buffer = memalign(TCE_PSIZE, SER_BUFFER_SIZE * MAX_SERIAL); + + /* Initialize out data structure pointers & TCE maps */ + base = ser_buffer; + for (i = 0; i < MAX_SERIAL; i++) { + struct fsp_serial *ser = &fsp_serials[i]; + + ser->in_buf = base; + ser->out_buf = base + SER_BUFFER_SIZE/2; + base += SER_BUFFER_SIZE; + } + fsp_tce_map(PSI_DMA_SER0_BASE, ser_buffer, + 4 * PSI_DMA_SER0_SIZE); + + /* Register for class E0 and E1 */ + fsp_register_client(&fsp_con_client_hmc, FSP_MCLASS_HMC_INTFMSG); + fsp_register_client(&fsp_con_client_vt, FSP_MCLASS_HMC_VT); + fsp_register_client(&fsp_con_client_rr, FSP_MCLASS_RR_EVENT); + + /* Add DVS ports. We currently have session 0 and 3, 0 is for + * OS use. 3 is our debug port. We need to add those before + * we complete the OPL or we'll potentially miss the + * console setup on Firebird blades. 
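+ * Sessions 1 and 2 are left for the FSP serial ports that
+ * fsp_console_init() later discovers under ipl-params/fsp-serial.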
+ */ + fsp_serial_add(0, 0xffff, "DVS_OS", false); + op_display(OP_LOG, OP_MOD_FSPCON, 0x0001); + fsp_serial_add(3, 0xffff, "DVS_FW", true); + op_display(OP_LOG, OP_MOD_FSPCON, 0x0002); + +} + +static int64_t fsp_console_write(int64_t term_number, __be64 *__length, + const uint8_t *buffer) +{ + struct fsp_serial *fs; + size_t written, requested; + + if (term_number < 0 || term_number >= MAX_SERIAL) + return OPAL_PARAMETER; + fs = &fsp_serials[term_number]; + if (!fs->available || fs->log_port) + return OPAL_PARAMETER; + lock(&fsp_con_lock); + if (!fs->open) { + unlock(&fsp_con_lock); + return OPAL_CLOSED; + } + /* Clamp to a reasonable size */ + requested = be64_to_cpu(*__length); + if (requested > 0x1000) + requested = 0x1000; + written = fsp_write_vserial(fs, buffer, requested); + + if (written) { + /* If we wrote anything, reset timeout */ + fs->out_buf_prev_len = 0; + fs->out_buf_timeout = 0; + } + +#ifdef OPAL_DEBUG_CONSOLE_IO + prlog(PR_TRACE, "OPAL: console write req=%ld written=%ld" + " ni=%d no=%d\n", + requested, written, be16_to_cpu(fs->out_buf->next_in), + be16_to_cpu(fs->out_buf->next_out)); + prlog(PR_TRACE, " %02x %02x %02x %02x " + "%02x \'%c\' %02x \'%c\' %02x \'%c\'.%02x \'%c\'..\n", + buffer[0], buffer[1], buffer[2], buffer[3], + buffer[4], buffer[4], buffer[5], buffer[5], + buffer[6], buffer[6], buffer[7], buffer[7]); +#endif /* OPAL_DEBUG_CONSOLE_IO */ + + *__length = cpu_to_be64(written); + unlock(&fsp_con_lock); + + if (written) + return OPAL_SUCCESS; + + return OPAL_HARDWARE; +} + +static int64_t fsp_console_write_buffer_space(int64_t term_number, + __be64 *__length) +{ + static bool elog_generated = false; + struct fsp_serial *fs; + struct fsp_serbuf_hdr *sb; + int64_t length; + + if (term_number < 0 || term_number >= MAX_SERIAL) + return OPAL_PARAMETER; + fs = &fsp_serials[term_number]; + if (!fs->available || fs->log_port) + return OPAL_PARAMETER; + lock(&fsp_con_lock); + if (!fs->open) { + unlock(&fsp_con_lock); + return OPAL_CLOSED; + } + sb = fs->out_buf; + length = (be16_to_cpu(sb->next_out) + SER_BUF_DATA_SIZE + - be16_to_cpu(sb->next_in) - 1) + % SER_BUF_DATA_SIZE; + unlock(&fsp_con_lock); + + /* Console buffer has enough space to write incoming data */ + if (length != fs->out_buf_prev_len) { + fs->out_buf_prev_len = length; + fs->out_buf_timeout = 0; + + *__length = cpu_to_be64(length); + return OPAL_SUCCESS; + } + + /* + * Buffer is full, start internal timer. We will continue returning + * SUCCESS until timeout happens, hoping FSP will consume data within + * timeout period. + */ + if (fs->out_buf_timeout == 0) { + fs->out_buf_timeout = mftb() + + secs_to_tb(SER_BUFFER_OUT_TIMEOUT); + } + + if (tb_compare(mftb(), fs->out_buf_timeout) != TB_AAFTERB) { + *__length = cpu_to_be64(length); + return OPAL_SUCCESS; + } + + /* + * FSP is still active but not reading console data. Hence + * our console buffer became full. Most likely IPMI daemon + * on FSP is buggy. Lets log error and return OPAL_RESOURCE + * to payload (Linux). + */ + if (!elog_generated) { + elog_generated = true; + log_simple_error(&e_info(OPAL_RC_CONSOLE_HANG), "FSPCON: Console " + "buffer is full, dropping console data\n"); + } + + /* Timeout happened. 
Lets drop incoming data */ + return OPAL_RESOURCE; +} + +static int64_t fsp_console_read(int64_t term_number, __be64 *__length, + uint8_t *buffer) +{ + struct fsp_serial *fs; + struct fsp_serbuf_hdr *sb; + bool pending = false; + uint32_t old_nin, n, i, chunk, req = be64_to_cpu(*__length); + int rc = OPAL_SUCCESS; + + if (term_number < 0 || term_number >= MAX_SERIAL) + return OPAL_PARAMETER; + fs = &fsp_serials[term_number]; + if (!fs->available || fs->log_port) + return OPAL_PARAMETER; + lock(&fsp_con_lock); + if (!fs->open) { + rc = OPAL_CLOSED; + goto clr_flag; + } + if (fs->waiting) + fs->waiting = 0; + sb = fs->in_buf; + old_nin = be16_to_cpu(sb->next_in); + lwsync(); + n = (old_nin + SER_BUF_DATA_SIZE - be16_to_cpu(sb->next_out)) + % SER_BUF_DATA_SIZE; + if (n > req) { + pending = true; + n = req; + } + *__length = cpu_to_be64(n); + + chunk = SER_BUF_DATA_SIZE - be16_to_cpu(sb->next_out); + if (chunk > n) + chunk = n; + memcpy(buffer, &sb->data[be16_to_cpu(sb->next_out)], chunk); + if (chunk < n) + memcpy(buffer + chunk, &sb->data[0], n - chunk); + sb->next_out = cpu_to_be16(((be16_to_cpu(sb->next_out)) + n) % SER_BUF_DATA_SIZE); + +#ifdef OPAL_DEBUG_CONSOLE_IO + prlog(PR_TRACE, "OPAL: console read req=%d read=%d ni=%d no=%d\n", + req, n, be16_to_cpu(sb->next_in), be16_to_cpu(sb->next_out)); + prlog(PR_TRACE, " %02x %02x %02x %02x %02x %02x %02x %02x ...\n", + buffer[0], buffer[1], buffer[2], buffer[3], + buffer[4], buffer[5], buffer[6], buffer[7]); +#endif /* OPAL_DEBUG_CONSOLE_IO */ + +clr_flag: + /* Might clear the input pending flag */ + for (i = 0; i < MAX_SERIAL && !pending; i++) { + struct fsp_serial *fs = &fsp_serials[i]; + struct fsp_serbuf_hdr *sb = fs->in_buf; + + if (fs->log_port || !fs->open) + continue; + if (sb->next_out != sb->next_in) { + /* + * HACK: Some kernels (4.1+) may fail to properly + * register hvc1 and will never read it. This can lead + * to RCU stalls, so if we notice this console is not + * being read, do not set OPAL_EVENT_CONSOLE_INPUT even + * if it has data + */ + if (fs->waiting < 5) { + pending = true; + fs->waiting++; + } + } + } + if (!pending) { + opal_update_pending_evt(fs->irq, 0); + opal_update_pending_evt(OPAL_EVENT_CONSOLE_INPUT, 0); + } + + unlock(&fsp_con_lock); + + return rc; +} + +void fsp_console_poll(void *data __unused) +{ +#ifdef OPAL_DEBUG_CONSOLE_POLL + static int debug; +#endif + + /* + * We don't get messages for out buffer being consumed, so we + * need to poll. We also defer sending of poke messages from + * the sapphire console to avoid a locking nightmare with + * beging called from printf() deep into an existing lock nest + * stack. + */ + if (fsp_con_full || + (opal_pending_events & OPAL_EVENT_CONSOLE_OUTPUT)) { + unsigned int i; + bool pending = false; + + /* We take the console lock. 
This is somewhat inefficient + * but it guarantees we aren't racing with a write, and + * thus clearing an event improperly + */ + lock(&fsp_con_lock); + for (i = 0; i < MAX_SERIAL && !pending; i++) { + struct fsp_serial *fs = &fsp_serials[i]; + struct fsp_serbuf_hdr *sb = fs->out_buf; + + if (!fs->open) + continue; + if (sb->next_out == sb->next_in) { + continue; + } + if (fs->log_port) { + flush_console(); + } else { +#ifdef OPAL_DEBUG_CONSOLE_POLL + if (debug < 5) { + prlog(PR_DEBUG,"OPAL: %d still pending" + " ni=%d no=%d\n", + i, be16_to_cpu(sb->next_in), + be16_to_cpu(sb->next_out)); + debug++; + } +#endif /* OPAL_DEBUG_CONSOLE_POLL */ + pending = true; + } + } + if (!pending) { + opal_update_pending_evt(OPAL_EVENT_CONSOLE_OUTPUT, 0); +#ifdef OPAL_DEBUG_CONSOLE_POLL + debug = 0; +#endif + } + unlock(&fsp_con_lock); + } +} + +void fsp_console_init(void) +{ + struct dt_node *serials, *ser; + int i; + + if (!fsp_present()) + return; + + /* Wait until we got the intf query before moving on */ + while (!got_intf_query) + opal_run_pollers(); + + op_display(OP_LOG, OP_MOD_FSPCON, 0x0000); + + /* Register poller */ + opal_add_poller(fsp_console_poll, NULL); + + /* Register OPAL console backend */ + set_opal_console(&fsp_opal_con); + + /* Parse serial port data */ + serials = dt_find_by_path(dt_root, "ipl-params/fsp-serial"); + if (!serials) { + prerror("FSPCON: No FSP serial ports in device-tree\n"); + return; + } + + i = 1; + dt_for_each_child(serials, ser) { + u32 rsrc_id = dt_prop_get_u32(ser, "reg"); + const void *lc = dt_prop_get(ser, "ibm,loc-code"); + + prlog(PR_NOTICE, "FSPCON: Serial %d rsrc: %04x loc: %s\n", + i, rsrc_id, (const char *)lc); + fsp_serial_add(i++, rsrc_id, lc, false); + op_display(OP_LOG, OP_MOD_FSPCON, 0x0010 + i); + } + + op_display(OP_LOG, OP_MOD_FSPCON, 0x0005); +} + +static int64_t fsp_console_flush(int64_t terminal __unused) +{ + /* FIXME: There's probably something we can do here... */ + return OPAL_PARAMETER; +} + +struct opal_con_ops fsp_opal_con = { + .name = "FSP OPAL console", + .init = NULL, /* all the required setup is done in fsp_console_init() */ + .read = fsp_console_read, + .write = fsp_console_write, + .space = fsp_console_write_buffer_space, + .flush = fsp_console_flush, +}; + +static void flush_all_input(void) +{ + unsigned int i; + + lock(&fsp_con_lock); + for (i = 0; i < MAX_SERIAL; i++) { + struct fsp_serial *fs = &fsp_serials[i]; + struct fsp_serbuf_hdr *sb = fs->in_buf; + + if (fs->log_port) + continue; + + sb->next_out = sb->next_in; + } + unlock(&fsp_con_lock); +} + +static bool send_all_hvsi_close(void) +{ + unsigned int i; + bool has_hvsi = false; + static const uint8_t close_packet[] = { 0xfe, 6, 0, 1, 0, 3 }; + + for (i = 0; i < MAX_SERIAL; i++) { + struct fsp_serial *fs = &fsp_serials[i]; + struct fsp_serbuf_hdr *sb = fs->out_buf; + unsigned int space, timeout = 10; + + if (fs->log_port) + continue; + if (fs->rsrc_id == 0xffff) + continue; + has_hvsi = true; + + /* Do we have room ? 
Wait a bit if not */ + while(timeout--) { + space = (be16_to_cpu(sb->next_out) + SER_BUF_DATA_SIZE - + be16_to_cpu(sb->next_in) - 1) % SER_BUF_DATA_SIZE; + if (space >= 6) + break; + time_wait_ms(500); + } + lock(&fsp_con_lock); + fsp_write_vserial(fs, close_packet, 6); + unlock(&fsp_con_lock); + } + + return has_hvsi; +} + +static void reopen_all_hvsi(void) +{ + unsigned int i; + + for (i = 0; i < MAX_SERIAL; i++) { + struct fsp_serial *fs = &fsp_serials[i]; + + if (!fs->available) + continue; + + if (fs->rsrc_id == 0xffff) + continue; + prlog(PR_NOTICE, "FSP: Deassociating HVSI console %d\n", i); + fsp_sync_msg(fsp_mkmsg(FSP_CMD_UNASSOC_SERIAL, 1, + (i << 16) | 1), true); + } + for (i = 0; i < MAX_SERIAL; i++) { + struct fsp_serial *fs = &fsp_serials[i]; + + if (!fs->available) + continue; + + if (fs->rsrc_id == 0xffff) + continue; + prlog(PR_NOTICE, "FSP: Reassociating HVSI console %d\n", i); + fsp_sync_msg(fsp_mkmsg(FSP_CMD_ASSOC_SERIAL, 2, + (fs->rsrc_id << 16) | 1, i), true); + } +} + +void fsp_console_reset(void) +{ + if (!fsp_present()) + return; + + prlog(PR_NOTICE, "FSP: Console reset !\n"); + + /* This is called on a fast-reset. To work around issues with HVSI + * initial negotiation, before we reboot the kernel, we flush all + * input and send an HVSI close packet. + */ + flush_all_input(); + + /* Returns false if there is no HVSI console */ + if (!send_all_hvsi_close()) + return; + + time_wait_ms(500); + + reopen_all_hvsi(); + +} + +void fsp_console_add_nodes(void) +{ + struct dt_node *opal_event; + unsigned int i; + + opal_event = dt_find_by_name(opal_node, "event"); + + for (i = 0; i < MAX_SERIAL; i++) { + struct fsp_serial *fs = &fsp_serials[i]; + struct dt_node *fs_node; + const char *type; + + if (fs->log_port || !fs->available) + continue; + + if (fs->rsrc_id == 0xffff) + type = "raw"; + else + type = "hvsi"; + + fs_node = add_opal_console_node(i, type, SER_BUF_DATA_SIZE); + + fs->irq = opal_dynamic_event_alloc(); + dt_add_property_cells(fs_node, "interrupts", ilog2(fs->irq)); + + if (opal_event) + dt_add_property_cells(fs_node, "interrupt-parent", + opal_event->phandle); + } +} + +void fsp_console_select_stdout(void) +{ + bool use_serial = false; + int rc; + u8 param; + + if (!fsp_present()) + return; + + rc = fsp_get_sys_param(SYS_PARAM_CONSOLE_SELECT, + ¶m, 1, NULL, NULL); + if (rc != 1) { + prerror("FSPCON: Failed to get console" + " sysparam rc %d\n", rc); + } else { + switch(param) { + case 0: + use_serial = false; + break; + case 1: + use_serial = true; + break; + default: + prerror("FSPCON: Unknown console" + " sysparam %d\n", param); + } + } + + dt_check_del_prop(dt_chosen, "linux,stdout-path"); + + if (fsp_serials[1].open && use_serial) { + dt_add_property_string(dt_chosen, "linux,stdout-path", + "/ibm,opal/consoles/serial@1"); + prlog(PR_NOTICE, "FSPCON: default console set to serial A\n"); + } else { + dt_add_property_string(dt_chosen, "linux,stdout-path", + "/ibm,opal/consoles/serial@0"); + prlog(PR_NOTICE, "FSPCON: default console set to SOL/DVS\n"); + } +} + diff --git a/roms/skiboot/hw/fsp/fsp-diag.c b/roms/skiboot/hw/fsp/fsp-diag.c new file mode 100644 index 000000000..d9101f31b --- /dev/null +++ b/roms/skiboot/hw/fsp/fsp-diag.c @@ -0,0 +1,46 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* + * Code for handling FSP_MCLASS_DIAG messages (cmd 0xee) + * Receiving a high level ack timeout is likely indicative of a firmware bug + * + * Copyright 2013-2014 IBM Corp. 
+ */ + +#include <skiboot.h> +#include <fsp.h> +#include <lock.h> +#include <processor.h> +#include <timebase.h> +#include <opal.h> +#include <fsp-sysparam.h> + +static bool fsp_diag_msg(u32 cmd_sub_mod, struct fsp_msg *msg) +{ + + if (cmd_sub_mod == FSP_RSP_DIAG_LINK_ERROR) { + printf("FIXME: Unhandled FSP_MCLASS_DIAG Link Error Report\n"); + return false; + } + + if (cmd_sub_mod != FSP_RSP_DIAG_ACK_TIMEOUT) { + printf("BUG: Unhandled subcommand: 0x%x (New FSP spec?)\n", + cmd_sub_mod); + return false; + } + + printf("BUG: High Level ACK timeout (FSP_MCLASS_DIAG) for 0x%x\n", + fsp_msg_get_data_word(msg, 0) & 0xffff0000); + + return true; +} + +static struct fsp_client fsp_diag = { + .message = fsp_diag_msg, +}; + +/* This is called at boot time */ +void fsp_init_diag(void) +{ + /* Register for the diag event */ + fsp_register_client(&fsp_diag, FSP_MCLASS_DIAG); +} diff --git a/roms/skiboot/hw/fsp/fsp-dpo.c b/roms/skiboot/hw/fsp/fsp-dpo.c new file mode 100644 index 000000000..91919f915 --- /dev/null +++ b/roms/skiboot/hw/fsp/fsp-dpo.c @@ -0,0 +1,154 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* + * FSP DPO (Delayed Power Off) event support + * + * Copyright 2013-2017 IBM Corp. + */ + +#define pr_fmt(fmt) "FSP-DPO: " fmt + +#include <skiboot.h> +#include <fsp.h> +#include <stdio.h> +#include <timebase.h> +#include <opal.h> +#include <opal-msg.h> + +#define DPO_CMD_SGN_BYTE0 0xf4 /* Byte[0] signature */ +#define DPO_CMD_SGN_BYTE1 0x20 /* Byte[1] signature */ +#define DPO_TIMEOUT 2700 /* 45 minutes in seconds */ + +bool fsp_dpo_pending; +static unsigned long fsp_dpo_init_tb; + +/* + * OPAL DPO interface + * + * Returns zero if DPO is not active, positive value indicating number + * of seconds remaining for a forced system shutdown. This will enable + * the host to schedule for shutdown voluntarily before timeout occurs. 
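+ * For example, a host that queries 10 minutes (600 seconds) after the
+ * DPO notification arrived gets back 2700 - 600 = 2100 seconds of
+ * grace time before the FSP forces the power off.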
+ */ +static int64_t fsp_opal_get_dpo_status(__be64 *dpo_timeout) +{ + if (!fsp_dpo_pending) { + *dpo_timeout = 0; + return OPAL_WRONG_STATE; + } + + *dpo_timeout = cpu_to_be64(DPO_TIMEOUT - tb_to_secs(mftb() - fsp_dpo_init_tb)); + return OPAL_SUCCESS; +} + +/* Process FSP DPO init message */ +static void fsp_process_dpo(struct fsp_msg *msg) +{ + struct fsp_msg *resp; + u32 cmd = FSP_RSP_INIT_DPO; + int rc; + + /* DPO message does not have the correct signatures */ + if ((msg->data.bytes[0] != DPO_CMD_SGN_BYTE0) + || (msg->data.bytes[1] != DPO_CMD_SGN_BYTE1)) { + prerror("Message signatures did not match\n"); + cmd |= FSP_STATUS_INVALID_CMD; + resp = fsp_mkmsg(cmd, 0); + if (resp == NULL) { + prerror("%s : Message allocation failed\n", __func__); + return; + } + if (fsp_queue_msg(resp, fsp_freemsg)) { + fsp_freemsg(resp); + prerror("%s : Failed to queue response " + "message\n", __func__); + } + return; + } + + /* OPAL is already in "DPO pending" state */ + if (fsp_dpo_pending) { + prlog(PR_INFO, "OPAL already in DPO pending state\n"); + cmd |= FSP_STATUS_INVALID_DPOSTATE; + resp = fsp_mkmsg(cmd, 0); + if (resp == NULL) { + prerror("%s : Message allocation failed\n", __func__); + return; + } + if (fsp_queue_msg(resp, fsp_freemsg)) { + fsp_freemsg(resp); + prerror("%s : Failed to queue response " + "message\n", __func__); + } + return; + } + + + /* Inform the host about DPO */ + rc = opal_queue_msg(OPAL_MSG_DPO, NULL, NULL); + if (rc) { + prerror("OPAL message queuing failed\n"); + cmd |= FSP_STATUS_GENERIC_ERROR; + resp = fsp_mkmsg(cmd, 0); + if (resp == NULL) { + prerror("%s : Message allocation failed\n", __func__); + return; + } + if (fsp_queue_msg(resp, fsp_freemsg)) { + fsp_freemsg(resp); + prerror("%s : Failed to queue response " + "message\n", __func__); + } + return; + } else + prlog(PR_INFO, "Notified host about DPO event\n"); + + /* Acknowledge the FSP on DPO */ + resp = fsp_mkmsg(cmd, 0); + if (resp == NULL) { + prerror("%s : Message allocation failed\n", __func__); + return; + } + if (fsp_queue_msg(resp, fsp_freemsg)) { + fsp_freemsg(resp); + prerror("%s : Failed to queue response message\n", __func__); + return; + } + + /* Record DPO init time and set DPO pending flag */ + fsp_dpo_init_tb = mftb(); + fsp_dpo_pending = true; + + /* + * OPAL is now in DPO pending state. After first detecting DPO + * condition from OPAL, the host will have 45 minutes to prepare + * the system for shutdown. The host must take all necessary actions + * required in that regard and at the end shutdown itself. The host + * shutdown sequence eventually will make the call OPAL_CEC_POWER_DOWN + * which in turn ask the FSP to shutdown the CEC. If the FSP does not + * receive the cec power down command from OPAL within 45 minutes, + * it will assume that the host and the OPAL has processed the DPO + * sequence successfully and hence force power off the system. 
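+ * While fsp_dpo_pending is set, fsp_opal_get_dpo_status() above
+ * reports the remaining grace time to OPAL_GET_DPO_STATUS callers.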
+ */ +} + +/* Handle DPO sub-command from FSP */ +static bool fsp_dpo_message(u32 cmd_sub_mod, struct fsp_msg *msg) +{ + if (cmd_sub_mod == FSP_CMD_INIT_DPO) { + prlog(PR_INFO, "Delayed Power Off (DPO) notification received\n"); + fsp_process_dpo(msg); + return true; + } + + return false; +} + +static struct fsp_client fsp_dpo_client = { + .message = fsp_dpo_message, +}; + +void fsp_dpo_init(void) +{ + fsp_register_client(&fsp_dpo_client, FSP_MCLASS_SERVICE); + opal_register(OPAL_GET_DPO_STATUS, fsp_opal_get_dpo_status, 1); + prlog(PR_INFO, "FSP DPO support initialized\n"); +} diff --git a/roms/skiboot/hw/fsp/fsp-dump.c b/roms/skiboot/hw/fsp/fsp-dump.c new file mode 100644 index 000000000..96cb45e6f --- /dev/null +++ b/roms/skiboot/hw/fsp/fsp-dump.c @@ -0,0 +1,916 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* + * Dump support: + * We get dump notification from different sources: + * - During system initialization via HDAT + * - During FSP reset/reload (FipS dump) + * - Dump available notification MBOX command (0xCE, 0x78, 0x00) + * + * To avoid complications, we keep list of dumps in a list and fetch + * them serially. + * + * Dump retrieve process: + * - Once we get notification from FSP we enqueue the dump ID and notify + * Linux via OPAL event notification. + * - Linux reads dump info and allocates required memory to fetch the dump + * and makes dump read call. + * - Sapphire fetches dump data from FSP. + * - Linux writes dump to disk and sends acknowledgement. + * - Sapphire acknowledges FSP. + * + * Copyright 2013-2015 IBM Corp. + */ + +#include <fsp.h> +#include <psi.h> +#include <lock.h> +#include <device.h> +#include <skiboot.h> +#include <errorlog.h> +#include <opal-api.h> + +/* + * Max outstanding dumps to retrieve + * + * Note: + * Dumps are serialized. We don't get notification for second + * dump of given type until we acknowledge first one. But we + * may get notification for different dump type. And our dump + * retrieval code is serialized. Hence we use list to keep + * track of outstanding dumps to be retrieved. 
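+ * The list is bounded: init_dump_free_list() pre-allocates
+ * MAX_DUMP_RECORD records, and a notification that arrives while the
+ * free list is empty is simply not queued (OPAL_RESOURCE).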
+ */ +#define MAX_DUMP_RECORD 0x04 + +/* Max retry */ +#define FIPS_DUMP_MAX_RETRY 0x03 + +/* Dump type */ +#define DUMP_TYPE_FSP 0x01 +#define DUMP_TYPE_SYS 0x02 +#define DUMP_TYPE_SMA 0x03 + +/* Dump fetch size */ +#define DUMP_FETCH_SIZE_FSP 0x500000 +#define DUMP_FETCH_SIZE_SYS 0x400000 +#define DUMP_FETCH_SIZE_RES 0x200000 + +/* Params for Fips dump */ +#define FSP_DUMP_TOOL_TYPE "SYS " +#define FSP_DUMP_CLIENT_ID "SAPPHIRE_CLIENT" + +enum dump_state { + DUMP_STATE_ABSENT, /* No FSP dump */ + DUMP_STATE_NONE, /* No dump to retrieve */ + DUMP_STATE_NOTIFY, /* Notified Linux */ + DUMP_STATE_FETCHING, /* Dump retrieval is in progress */ + DUMP_STATE_FETCH, /* Dump retrieve complete */ + DUMP_STATE_PARTIAL, /* Partial read */ + DUMP_STATE_ABORTING, /* Aborting due to kexec */ +}; + +/* Pending dump list */ +struct dump_record { + uint8_t type; + uint32_t id; + uint32_t size; + struct list_node link; +}; + +/* List definations */ +static LIST_HEAD(dump_pending); +static LIST_HEAD(dump_free); + +/* Dump retrieve state */ +static enum dump_state dump_state = DUMP_STATE_NONE; + +/* Dump buffer SG list */ +static struct opal_sg_list *dump_data; +static struct dump_record *dump_entry; +static int64_t dump_offset; +static size_t fetch_remain; + +/* FipS dump retry count */ +static int retry_cnt; + +/* Protect list and dump retrieve state */ +static struct lock dump_lock = LOCK_UNLOCKED; + +/* Forward declaration */ +static int64_t fsp_opal_dump_init(uint8_t dump_type); +static int64_t fsp_dump_read(void); + +DEFINE_LOG_ENTRY(OPAL_RC_DUMP_INIT, OPAL_PLATFORM_ERR_EVT, OPAL_DUMP, + OPAL_PLATFORM_FIRMWARE, + OPAL_PREDICTIVE_ERR_FAULT_RECTIFY_REBOOT, + OPAL_NA); + +DEFINE_LOG_ENTRY(OPAL_RC_DUMP_LIST, OPAL_PLATFORM_ERR_EVT, OPAL_DUMP, + OPAL_PLATFORM_FIRMWARE, + OPAL_INFO, + OPAL_NA); + +DEFINE_LOG_ENTRY(OPAL_RC_DUMP_ACK, OPAL_PLATFORM_ERR_EVT, OPAL_DUMP, + OPAL_PLATFORM_FIRMWARE, OPAL_INFO, + OPAL_NA); + +/* + * Helper functions + */ +static inline void update_dump_state(enum dump_state state) +{ + dump_state = state; +} + +static int64_t check_dump_state(void) +{ + switch (dump_state) { + case DUMP_STATE_ABSENT: + return OPAL_HARDWARE; + case DUMP_STATE_NONE: + case DUMP_STATE_NOTIFY: + /* During dump fetch, notify is wrong state */ + return OPAL_WRONG_STATE; + case DUMP_STATE_FETCHING: + case DUMP_STATE_ABORTING: + return OPAL_BUSY_EVENT; + case DUMP_STATE_FETCH: + return OPAL_SUCCESS; + case DUMP_STATE_PARTIAL: + return OPAL_PARTIAL; + } + return OPAL_SUCCESS; +} + +static inline void dump_tce_map(uint32_t tce_offset, + void *buffer, uint32_t size) +{ + uint32_t tlen = ALIGN_UP(size, TCE_PSIZE); + fsp_tce_map(PSI_DMA_DUMP_DATA + tce_offset, buffer, tlen); +} + +static inline void dump_tce_unmap(uint32_t size) +{ + fsp_tce_unmap(PSI_DMA_DUMP_DATA, size); +} + +/* + * Returns Data set ID for the given dump type + */ +static inline uint16_t get_dump_data_set_id(uint8_t type) +{ + switch (type) { + case DUMP_TYPE_FSP: + return FSP_DATASET_SP_DUMP; + case DUMP_TYPE_SYS: + return FSP_DATASET_HW_DUMP; + default: + break; + } + return OPAL_INTERNAL_ERROR; +} + +/* + * Returns max data we can fetch from FSP fetch data call + */ +static inline int64_t get_dump_fetch_max_size(uint8_t type) +{ + switch (type) { + case DUMP_TYPE_FSP: + return DUMP_FETCH_SIZE_FSP; + case DUMP_TYPE_SYS: + return DUMP_FETCH_SIZE_SYS; + default: + break; + } + return OPAL_INTERNAL_ERROR; +} + +/* + * Get dump record from pending list + */ +static inline struct dump_record *get_dump_rec_from_list(uint32_t id) +{ + struct 
dump_record *record; + + list_for_each(&dump_pending, record, link) { + if (record->id == id) + return record; + } + return NULL; +} + +/* + * New dump available notification to Linux + */ +static void update_opal_dump_notify(void) +{ + /* + * Wait until current dump retrieval to complete + * before notifying again. + */ + if (dump_state != DUMP_STATE_NONE) + return; + + /* More dump's to retrieve */ + if (!list_empty(&dump_pending)) { + update_dump_state(DUMP_STATE_NOTIFY); + opal_update_pending_evt(OPAL_EVENT_DUMP_AVAIL, + OPAL_EVENT_DUMP_AVAIL); + } +} + +static int64_t remove_dump_id_from_list(uint32_t dump_id) +{ + struct dump_record *record, *nxt_record; + int rc = OPAL_SUCCESS; + bool found = false; + + /* Remove record from pending list */ + list_for_each_safe(&dump_pending, record, nxt_record, link) { + if (record->id != dump_id) + continue; + + found = true; + list_del(&record->link); + list_add(&dump_free, &record->link); + break; + } + + /* + * Continue update_opal_dump_notify even if it fails + * to remove ID. So that we can resend notification + * for the same dump ID to Linux. + */ + if (!found) { /* List corrupted? */ + log_simple_error(&e_info(OPAL_RC_DUMP_LIST), + "DUMP: ID 0x%x not found in list!\n", + dump_id); + rc = OPAL_PARAMETER; + } + + /* Update state */ + update_dump_state(DUMP_STATE_NONE); + /* Notify next available dump to retrieve */ + update_opal_dump_notify(); + + return rc; +} + +static int64_t add_dump_id_to_list(uint8_t dump_type, + uint32_t dump_id, uint32_t dump_size) +{ + struct dump_record *record; + int rc = OPAL_SUCCESS; + + lock(&dump_lock); + + rc = check_dump_state(); + if (rc == OPAL_HARDWARE) + goto out; + + /* List is full ? */ + if (list_empty(&dump_free)) { + printf("DUMP: Dump ID 0x%x is not queued.\n", dump_id); + rc = OPAL_RESOURCE; + goto out; + } + + /* Already queued? 
*/ + record = get_dump_rec_from_list(dump_id); + if (record) { + rc = OPAL_SUCCESS; + goto out; + } + + /* Add to list */ + record = list_pop(&dump_free, struct dump_record, link); + record->type = dump_type; + record->id = dump_id; + record->size = dump_size; + list_add_tail(&dump_pending, &record->link); + + /* OPAL notification */ + update_opal_dump_notify(); + rc = OPAL_SUCCESS; + +out: + unlock(&dump_lock); + return rc; +} + +static void dump_init_complete(struct fsp_msg *msg) +{ + uint8_t status = (msg->resp->word1 >> 8) & 0xff; + + printf("DUMP: FipS dump init status = 0x%x\n", status); + fsp_freemsg(msg); + + switch (status) { + case FSP_STATUS_SUCCESS: + printf("DUMP: Initiated FipS dump.\n"); + break; + case FSP_STATUS_BUSY: /* Retry, if FSP is busy */ + if (retry_cnt++ < FIPS_DUMP_MAX_RETRY) + if (fsp_opal_dump_init(DUMP_TYPE_FSP) == OPAL_SUCCESS) + return; + break; + default: + break; + } + /* Reset max retry count */ + retry_cnt = 0; +} + +/* + * Initiate new FipS dump + */ +static int64_t fsp_opal_dump_init(uint8_t dump_type) +{ + struct fsp_msg *msg; + int rc = OPAL_SUCCESS; + uint32_t *tool_type = (void *)FSP_DUMP_TOOL_TYPE; + uint32_t *client_id = (void *)FSP_DUMP_CLIENT_ID; + + /* Only FipS dump generate request is supported */ + if (dump_type != DUMP_TYPE_FSP) + return OPAL_PARAMETER; + + msg = fsp_mkmsg(FSP_CMD_FSP_DUMP_INIT, 6, *tool_type, + sizeof(FSP_DUMP_CLIENT_ID), *client_id, + *(client_id + 1), *(client_id + 2), *(client_id + 3)); + + if (!msg) { + log_simple_error(&e_info(OPAL_RC_DUMP_INIT), + "DUMP: Message allocation failed.\n"); + rc = OPAL_INTERNAL_ERROR; + } else if (fsp_queue_msg(msg, dump_init_complete)) { + log_simple_error(&e_info(OPAL_RC_DUMP_INIT), + "DUMP: Failed to queue FipS dump init request.\n"); + fsp_freemsg(msg); + rc = OPAL_INTERNAL_ERROR; + } + + return rc; +} + +/* + * OPAL interface to send dump information to Linux. + */ +static int64_t fsp_opal_dump_info2(__be32 *dump_id, __be32 *dump_size, + __be32 *dump_type) +{ + struct dump_record *record; + int rc = OPAL_SUCCESS; + + lock(&dump_lock); + + /* Clear notification */ + opal_update_pending_evt(OPAL_EVENT_DUMP_AVAIL, 0); + + record = list_top(&dump_pending, struct dump_record, link); + if (!record) { /* List corrupted? 
*/ + update_dump_state(DUMP_STATE_NONE); + rc = OPAL_INTERNAL_ERROR; + goto out; + } + *dump_id = cpu_to_be32(record->id); + *dump_size = cpu_to_be32(record->size); + *dump_type = cpu_to_be32(record->type); + +out: + unlock(&dump_lock); + return rc; +} + +static int64_t fsp_opal_dump_info(__be32 *dump_id, __be32 *dump_size) +{ + __be32 dump_type; + return fsp_opal_dump_info2(dump_id, dump_size, &dump_type); +} + +static int64_t validate_dump_sglist(struct opal_sg_list *list, + int64_t *size) +{ + struct opal_sg_list *sg; + struct opal_sg_entry *prev_entry, *entry; + int length, num_entries, i; + + prev_entry = NULL; + *size = 0; + for (sg = list; sg; sg = (struct opal_sg_list*)be64_to_cpu(sg->next)) { + length = be64_to_cpu(sg->length) - 16; + num_entries = length / sizeof(struct opal_sg_entry); + if (num_entries <= 0) + return OPAL_PARAMETER; + + for (i = 0; i < num_entries; i++) { + entry = &sg->entry[i]; + *size += be64_to_cpu(entry->length); + + /* All entries must be aligned */ + if (((uint64_t)be64_to_cpu(entry->data)) & 0xfff) + return OPAL_PARAMETER; + + /* All non-terminal entries size must be aligned */ + if (prev_entry && (be64_to_cpu(prev_entry->length) & 0xfff)) + return OPAL_PARAMETER; + + prev_entry = entry; + } + } + return OPAL_SUCCESS; +} + +/* + * Map dump buffer to TCE buffer + */ +static int64_t map_dump_buffer(void) +{ + struct opal_sg_list *sg; + struct opal_sg_entry *entry; + int64_t fetch_max; + int length, num_entries, i; + int buf_off, fetch_off, tce_off, sg_off; + bool last = false; + + /* FSP fetch max size */ + fetch_max = get_dump_fetch_max_size(dump_entry->type); + if (fetch_max > (dump_entry->size - dump_offset)) + fetch_remain = dump_entry->size - dump_offset; + else + fetch_remain = fetch_max; + + /* offsets */ + fetch_off = fetch_remain; + tce_off = sg_off = 0; + + for (sg = dump_data; sg; sg = (struct opal_sg_list*)be64_to_cpu(sg->next)) { + num_entries = (be64_to_cpu(sg->length) - 16) / + sizeof(struct opal_sg_entry); + if (num_entries <= 0) + return OPAL_PARAMETER; + + for (i = 0; i < num_entries; i++) { + entry = &sg->entry[i]; + + /* Continue until we get offset */ + if ((sg_off + be64_to_cpu(entry->length)) < dump_offset) { + sg_off += be64_to_cpu(entry->length); + continue; + } + + /* + * SG list entry size can be more than 4k. + * Map only required pages, instead of + * mapping entire entry. 
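+ * For example, resuming at dump_offset 0x3800 inside an entry whose
+ * data starts at sg_off 0x2000 gives buf_off = 0x1800 & ~0xfff =
+ * 0x1000, so the mapping begins at the page containing the offset.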
+ */ + if (!tce_off) { + buf_off = (dump_offset - sg_off) & ~0xfff; + length = be64_to_cpu(entry->length) - buf_off; + } else { + buf_off = 0; + length = be64_to_cpu(entry->length); + } + + /* Adjust length for last mapping */ + if (fetch_off <= length) { + length = fetch_off; + last = true; + } + + /* Adjust offset */ + sg_off += be64_to_cpu(entry->length); + fetch_off -= length; + + /* TCE mapping */ + dump_tce_map(tce_off, (void*)(be64_to_cpu(entry->data) + buf_off), length); + tce_off += length; + + /* TCE mapping complete */ + if (last) + return OPAL_SUCCESS; + } + } /* outer loop */ + return OPAL_PARAMETER; +} + +static void dump_read_complete(struct fsp_msg *msg) +{ + void *buffer; + size_t length, offset; + int rc; + uint32_t dump_id; + uint16_t id; + uint8_t flags, status; + bool compl = false; + + status = (msg->resp->word1 >> 8) & 0xff; + flags = (fsp_msg_get_data_word(msg, 0) >> 16) & 0xff; + id = fsp_msg_get_data_word(msg, 0) & 0xffff; + dump_id = fsp_msg_get_data_word(msg, 1); + offset = fsp_msg_get_data_word(msg->resp, 1); + length = fsp_msg_get_data_word(msg->resp, 2); + + fsp_freemsg(msg); + + lock(&dump_lock); + + if (dump_state == DUMP_STATE_ABORTING) { + printf("DUMP: Fetch dump aborted, ID = 0x%x\n", dump_id); + dump_tce_unmap(PSI_DMA_DUMP_DATA_SIZE); + update_dump_state(DUMP_STATE_NONE); + goto bail; + } + + switch (status) { + case FSP_STATUS_SUCCESS: /* Fetch next dump block */ + if (dump_offset < dump_entry->size) { + dump_tce_unmap(PSI_DMA_DUMP_DATA_SIZE); + rc = fsp_dump_read(); + if (rc == OPAL_SUCCESS) + goto bail; + } else { /* Dump read complete */ + compl = true; + } + break; + case FSP_STATUS_MORE_DATA: /* More data to read */ + offset += length; + buffer = (void *)PSI_DMA_DUMP_DATA + offset; + fetch_remain -= length; + + rc = fsp_fetch_data_queue(flags, id, dump_id, offset, buffer, + &fetch_remain, dump_read_complete); + if (rc == OPAL_SUCCESS) + goto bail; + break; + default: + break; + } + + dump_tce_unmap(PSI_DMA_DUMP_DATA_SIZE); + + /* Update state */ + if (compl) { + printf("DUMP: Fetch dump success. ID = 0x%x\n", dump_id); + update_dump_state(DUMP_STATE_FETCH); + } else { + printf("DUMP: Fetch dump partial. ID = 0x%x\n", dump_id); + update_dump_state(DUMP_STATE_PARTIAL); + } + bail: + unlock(&dump_lock); +} + +/* + * Fetch dump data from FSP + */ +static int64_t fsp_dump_read(void) +{ + int64_t rc; + uint16_t data_set; + uint8_t flags = 0x00; + + /* Get data set ID */ + data_set = get_dump_data_set_id(dump_entry->type); + + /* Map TCE buffer */ + rc = map_dump_buffer(); + if (rc != OPAL_SUCCESS) { + printf("DUMP: TCE mapping failed\n"); + return rc; + } + + printf("DUMP: Fetch Dump. ID = %02x, sub ID = %08x, len = %ld\n", + data_set, dump_entry->id, fetch_remain); + + /* Fetch data */ + rc = fsp_fetch_data_queue(flags, data_set, dump_entry->id, + dump_offset, (void *)PSI_DMA_DUMP_DATA, + &fetch_remain, dump_read_complete); + + /* Adjust dump fetch offset */ + dump_offset += fetch_remain; + + return rc; +} + +static int64_t fsp_opal_dump_read(uint32_t dump_id, + struct opal_sg_list *list) +{ + struct dump_record *record; + int64_t rc, size; + + lock(&dump_lock); + + /* Check state */ + if (dump_state != DUMP_STATE_NOTIFY) { + rc = check_dump_state(); + goto out; + } + + /* Validate dump ID */ + record = get_dump_rec_from_list(dump_id); + if (!record) { /* List corrupted? 
*/ + rc = OPAL_INTERNAL_ERROR; + goto out; + } + + /* Validate dump buffer and size */ + rc = validate_dump_sglist(list, &size); + if (rc != OPAL_SUCCESS) { + printf("DUMP: SG list validation failed\n"); + goto out; + } + + if (size < record->size) { /* Insuffient buffer */ + printf("DUMP: Insufficient buffer\n"); + rc = OPAL_PARAMETER; + goto out; + } + + /* Update state */ + update_dump_state(DUMP_STATE_FETCHING); + + /* Fetch dump data */ + dump_entry = record; + dump_data = list; + dump_offset = 0; + rc = fsp_dump_read(); + if (rc != OPAL_SUCCESS) + goto out; + + /* Check status after initiating fetch data */ + rc = check_dump_state(); + +out: + unlock(&dump_lock); + return rc; +} + +static void dump_ack_complete(struct fsp_msg *msg) +{ + uint8_t status = (msg->resp->word1 >> 8) & 0xff; + + if (status) + log_simple_error(&e_info(OPAL_RC_DUMP_ACK), + "DUMP: ACK failed for ID: 0x%x\n", + fsp_msg_get_data_word(msg, 0)); + else + printf("DUMP: ACKed dump ID: 0x%x\n", fsp_msg_get_data_word(msg, 0)); + + fsp_freemsg(msg); +} + +/* + * Acknowledge dump + */ +static int64_t fsp_opal_dump_ack(uint32_t dump_id) +{ + struct dump_record *record; + struct fsp_msg *msg; + int rc; + uint32_t cmd; + uint8_t dump_type = 0; + + /* Get dump type */ + lock(&dump_lock); + record = get_dump_rec_from_list(dump_id); + if (record) + dump_type = record->type; + + /* + * Next available dump in pending list will be of different + * type. Hence we don't need to wait for ack complete. + * + * Note: + * This allows us to proceed even if we fail to ACK. + * In the worst case we may get notification for the + * same dump again, which is probably better than + * looping forever. + */ + rc = remove_dump_id_from_list(dump_id); + if (rc != OPAL_SUCCESS) /* Invalid dump id */ + goto out; + + /* Adjust mod value */ + cmd = FSP_CMD_ACK_DUMP | (dump_type & 0xff); + msg = fsp_mkmsg(cmd, 1, dump_id); + if (!msg) { + log_simple_error(&e_info(OPAL_RC_DUMP_ACK), + "DUMP: Message allocation failed.!\n"); + rc = OPAL_INTERNAL_ERROR; + } else if (fsp_queue_msg(msg, dump_ack_complete)) { + log_simple_error(&e_info(OPAL_RC_DUMP_ACK), + "DUMP: Failed to queue dump ack message.\n"); + fsp_freemsg(msg); + rc = OPAL_INTERNAL_ERROR; + } +out: + unlock(&dump_lock); + return rc; +} + +/* Resend dump available notification */ +static int64_t fsp_opal_dump_resend_notification(void) +{ + lock(&dump_lock); + + if (dump_state != DUMP_STATE_ABSENT) + update_dump_state(DUMP_STATE_NONE); + + update_opal_dump_notify(); + + unlock(&dump_lock); + + return OPAL_SUCCESS; +} + +/* + * Handle FSP R/R event. + */ +static bool fsp_dump_retrieve_rr(uint32_t cmd_sub_mod, + struct fsp_msg *msg __unused) +{ + switch (cmd_sub_mod) { + case FSP_RESET_START: + lock(&dump_lock); + /* Reset dump state */ + if (dump_state == DUMP_STATE_FETCHING) + update_dump_state(DUMP_STATE_ABORTING); + unlock(&dump_lock); + return true; + case FSP_RELOAD_COMPLETE: + lock(&dump_lock); + + /* Reset TCE mapping */ + dump_tce_unmap(PSI_DMA_DUMP_DATA_SIZE); + + /* Reset dump state */ + update_dump_state(DUMP_STATE_NONE); + + /* + * For now keeping R/R handler simple. In the worst case + * we may endup resending dump available notification for + * same dump ID twice to Linux. + */ + update_opal_dump_notify(); + unlock(&dump_lock); + return true; + } + return false; +} + +/* + * Handle host kexec'ing scenarios + */ +static bool opal_kexec_dump_notify(void *data __unused) +{ + bool ready = true; + + lock(&dump_lock); + + /* Dump retrieve is in progress? 
*/ + if (dump_state == DUMP_STATE_FETCHING) + dump_state = DUMP_STATE_ABORTING; + + /* Not yet safe to kexec */ + if (dump_state == DUMP_STATE_ABORTING) + ready = false; + + unlock(&dump_lock); + + return ready; +} + +/* + * FipS dump notification + */ +void fsp_fips_dump_notify(uint32_t dump_id, uint32_t dump_size) +{ + printf("DUMP: FipS dump available. ID = 0x%x [size: %d bytes]\n", + dump_id, dump_size); + add_dump_id_to_list(DUMP_TYPE_FSP, dump_id, dump_size); +} + +/* + * System/Platform dump notification + */ +static bool fsp_sys_dump_notify(uint32_t cmd_sub_mod, struct fsp_msg *msg) +{ + /* + * Though spec says mod 00 is deprecated we still + * seems to get mod 00 notification (at least on + * P7 machine). + */ + if (cmd_sub_mod != FSP_RSP_SYS_DUMP && + cmd_sub_mod != FSP_RSP_SYS_DUMP_OLD) + return false; + + printf("DUMP: Platform dump available. ID = 0x%x [size: %d bytes]\n", + fsp_msg_get_data_word(msg, 0), fsp_msg_get_data_word(msg, 1)); + + add_dump_id_to_list(DUMP_TYPE_SYS, + fsp_msg_get_data_word(msg, 0), + fsp_msg_get_data_word(msg, 1)); + return true; +} + +/* + * If platform dump available during IPL time, then we + * get notification via HDAT. Check for DT for the dump + * presence. + */ +static void check_ipl_sys_dump(void) +{ + struct dt_node *dump_node, *opal_node; + uint32_t dump_id, dump_size; + + if (proc_gen >= proc_gen_p9) { + opal_node = dt_find_by_path(dt_root, "ibm,opal"); + if (!opal_node) + return; + dump_node = dt_find_by_path(opal_node, "dump"); + if (dump_node) { + if (dt_find_property(dump_node, "mpipl-boot")) + return; + } + } + + dump_node = dt_find_by_path(dt_root, "ipl-params/platform-dump"); + if (!dump_node) + return; + + if (!dt_find_property(dump_node, "dump-id")) + return; + + dump_id = dt_prop_get_u32(dump_node, "dump-id"); + dump_size = (uint32_t)dt_prop_get_u64(dump_node, "total-size"); + + printf("DUMP: Platform dump present during IPL.\n"); + printf(" ID = 0x%x [size: %d bytes]\n", dump_id, dump_size); + + add_dump_id_to_list(DUMP_TYPE_SYS, dump_id, dump_size); +} + +/* + * Allocate and initialize dump list + */ +static int init_dump_free_list(void) +{ + struct dump_record *entry; + int i; + + entry = zalloc(sizeof(struct dump_record) * MAX_DUMP_RECORD); + if (!entry) { + log_simple_error(&e_info(OPAL_RC_DUMP_INIT), + "DUMP: Out of memory\n"); + return -ENOMEM; + } + + for (i = 0; i < MAX_DUMP_RECORD; i++) { + list_add_tail(&dump_free, &entry->link); + entry++; + } + return 0; +} + +static struct fsp_client fsp_sys_dump_client = { + .message = fsp_sys_dump_notify, +}; + +static struct fsp_client fsp_dump_client_rr = { + .message = fsp_dump_retrieve_rr, +}; + +void fsp_dump_init(void) +{ + if (!fsp_present()) { + update_dump_state(DUMP_STATE_ABSENT); + return; + } + + /* Initialize list */ + if (init_dump_free_list() != 0) { + update_dump_state(DUMP_STATE_ABSENT); + return; + } + + /* Register for Class CE */ + fsp_register_client(&fsp_sys_dump_client, FSP_MCLASS_SERVICE); + /* Register for Class AA (FSP R/R) */ + fsp_register_client(&fsp_dump_client_rr, FSP_MCLASS_RR_EVENT); + + /* Register for sync on host reboot call */ + opal_add_host_sync_notifier(opal_kexec_dump_notify, NULL); + + /* OPAL interface */ + opal_register(OPAL_DUMP_INIT, fsp_opal_dump_init, 1); + opal_register(OPAL_DUMP_INFO, fsp_opal_dump_info, 2); + opal_register(OPAL_DUMP_INFO2, fsp_opal_dump_info2, 3); + opal_register(OPAL_DUMP_READ, fsp_opal_dump_read, 2); + opal_register(OPAL_DUMP_ACK, fsp_opal_dump_ack, 1); + opal_register(OPAL_DUMP_RESEND, 
fsp_opal_dump_resend_notification, 0); + + /* Check for platform dump presence during IPL time */ + check_ipl_sys_dump(); +} diff --git a/roms/skiboot/hw/fsp/fsp-elog-read.c b/roms/skiboot/hw/fsp/fsp-elog-read.c new file mode 100644 index 000000000..bd23ffbe8 --- /dev/null +++ b/roms/skiboot/hw/fsp/fsp-elog-read.c @@ -0,0 +1,608 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* + * This code will enable retrieving of error log from FSP -> Sapphire in + * sequence. + * Here, FSP would send next log only when Sapphire sends a new log notification + * response to FSP. On Completion of reading the log from FSP, + * OPAL_EVENT_ERROR_LOG_AVAIL is signaled. This will remain raised until a call + * to opal_elog_read() is made and OPAL_SUCCESS is returned. Upon which, the + * operation is complete and the event is cleared. This is READ action from FSP. + * + * Copyright 2013-2017 IBM Corp. + */ + +/* + * Design of READ error log : + * When we receive a new error log entry notification from FSP, we queue it into + * the "pending" list. If the "pending" list is not empty, then we start + * fetching log from FSP. + * + * When Linux reads a log entry, we dequeue it from the "pending" list and + * enqueue it to another "processed" list. At this point, if the "pending" + * list is not empty, we continue to fetch the next log. + * + * When Linux calls opal_resend_pending_logs(), we fetch the log corresponding + * to the head of the pending list and move it to the processed list, and + * continue this process until the pending list is empty. If the pending list + * was empty earlier and is currently non-empty, we initiate an error log fetch. + * + * When Linux acks an error log, we remove it from processed list. + */ + +#include <errno.h> +#include <fsp.h> +#include <fsp-elog.h> +#include <lock.h> +#include <opal-api.h> +#include <psi.h> +#include <skiboot.h> + +/* + * Maximum number of entries that are pre-allocated + * to keep track of pending elogs to be fetched. + */ +#define ELOG_READ_MAX_RECORD 128 + +/* Structure to maintain log-id, log-size, pending and processed list. */ +struct fsp_log_entry { + uint32_t log_id; + size_t log_size; + struct list_node link; +}; + +static LIST_HEAD(elog_read_pending); +static LIST_HEAD(elog_read_processed); +static LIST_HEAD(elog_read_free); +/* + * Lock is used to protect overwriting of processed and pending list + * and also used while updating state of each log. + */ +static struct lock elog_read_lock = LOCK_UNLOCKED; + +#define ELOG_READ_BUFFER_SIZE 0x00004000 +/* Log buffer to copy FSP log for read */ +static void *elog_read_buffer; +static uint32_t elog_head_id; /* FSP entry ID */ +static size_t elog_head_size; /* Actual FSP log size */ +static uint32_t elog_read_retries; /* Bad response status count */ + +/* Initialize the state of the log */ +static enum elog_head_state elog_read_from_fsp_head_state = ELOG_STATE_NONE; + +static bool elog_enabled = false; + +/* Need forward declaration because of circular dependency. */ +static void fsp_elog_queue_fetch(void); + +/* + * Check the response message for mbox acknowledgement + * command send to FSP. + */ +static void fsp_elog_ack_complete(struct fsp_msg *msg) +{ + uint8_t val; + + val = (msg->resp->word1 >> 8) & 0xff; + if (val != 0) + prerror("ELOG: Acknowledgement error\n"); + + fsp_freemsg(msg); +} + +/* Send error log PHYP acknowledgement to FSP with entry ID. 
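+ * The completion handler fsp_elog_ack_complete() above only logs a
+ * bad status byte; the acknowledgement is not retried.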
*/ +static int64_t fsp_send_elog_ack(uint32_t log_id) +{ + struct fsp_msg *ack_msg; + + ack_msg = fsp_mkmsg(FSP_CMD_ERRLOG_PHYP_ACK, 1, log_id); + if (!ack_msg) { + prerror("ELOG: Failed to allocate ack message\n"); + return OPAL_INTERNAL_ERROR; + } + + if (fsp_queue_msg(ack_msg, fsp_elog_ack_complete)) { + fsp_freemsg(ack_msg); + ack_msg = NULL; + prerror("ELOG: Error queueing elog ack complete\n"); + return OPAL_INTERNAL_ERROR; + } + + return OPAL_SUCCESS; +} + +/* Retrieve error log from FSP with TCE for the data transfer. */ +static void fsp_elog_check_and_fetch_head(void) +{ + lock(&elog_read_lock); + if (elog_read_from_fsp_head_state != ELOG_STATE_NONE || + list_empty(&elog_read_pending)) { + unlock(&elog_read_lock); + return; + } + + elog_read_retries = 0; + /* Start fetching first entry from the pending list */ + fsp_elog_queue_fetch(); + unlock(&elog_read_lock); +} + +void elog_set_head_state(bool opal_logs, enum elog_head_state state) +{ + static enum elog_head_state opal_logs_state = ELOG_STATE_NONE; + static enum elog_head_state fsp_logs_state = ELOG_STATE_NONE; + + /* ELOG disabled */ + if (!elog_enabled) + return; + + if (opal_logs) + opal_logs_state = state; + else + fsp_logs_state = state; + + if (fsp_logs_state == ELOG_STATE_FETCHED_DATA || + opal_logs_state == ELOG_STATE_FETCHED_DATA) + opal_update_pending_evt(OPAL_EVENT_ERROR_LOG_AVAIL, + OPAL_EVENT_ERROR_LOG_AVAIL); + else + opal_update_pending_evt(OPAL_EVENT_ERROR_LOG_AVAIL, 0); +} + +/* This function should be called with the lock held. */ +static inline void fsp_elog_set_head_state(enum elog_head_state state) +{ + elog_set_head_state(false, state); + elog_read_from_fsp_head_state = state; +} + +/* + * When, we try maximum time of fetching log from FSP + * we call following function to delete log from the + * pending list and update the state to fetch next log. + * + * This function should be called with the lock held. + */ +static void fsp_elog_fetch_failure(uint8_t fsp_status) +{ + struct fsp_log_entry *log_data; + + /* Read top list and delete the node */ + log_data = list_top(&elog_read_pending, struct fsp_log_entry, link); + if (!log_data) { + /** + * @fwts-label ElogFetchFailureInconsistent + * @fwts-advice Inconsistent state between OPAL and FSP + * in code path for handling failure of fetching error log + * from FSP. Likely a bug in interaction between FSP and OPAL. + */ + prlog(PR_ERR, "%s: Inconsistent internal list state !\n", + __func__); + } else { + list_del(&log_data->link); + list_add(&elog_read_free, &log_data->link); + prerror("ELOG: received invalid data: %x FSP status: 0x%x\n", + log_data->log_id, fsp_status); + } + + fsp_elog_set_head_state(ELOG_STATE_NONE); +} + +/* Read response value from FSP for fetch sp data mbox command */ +static void fsp_elog_read_complete(struct fsp_msg *read_msg) +{ + uint8_t val; + + lock(&elog_read_lock); + val = (read_msg->resp->word1 >> 8) & 0xff; + fsp_freemsg(read_msg); + if (elog_read_from_fsp_head_state == ELOG_STATE_REJECTED) { + fsp_elog_set_head_state(ELOG_STATE_NONE); + goto elog_read_out; + } + + switch (val) { + case FSP_STATUS_SUCCESS: + fsp_elog_set_head_state(ELOG_STATE_FETCHED_DATA); + break; + + case FSP_STATUS_DMA_ERROR: + if (elog_read_retries++ < MAX_RETRIES) { + /* + * For a error response value from FSP, we try to + * send fetch sp data mbox command again for three + * times if response from FSP is still not valid + * we send generic error response to FSP. 
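+ * Note that elog_read_retries is reset in
+ * fsp_elog_check_and_fetch_head() each time a fresh fetch starts.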
+ */ + fsp_elog_queue_fetch(); + break; + } + + fsp_elog_fetch_failure(val); + break; + + default: + fsp_elog_fetch_failure(val); + } + +elog_read_out: + unlock(&elog_read_lock); + + /* Check if a new log needs fetching */ + fsp_elog_check_and_fetch_head(); +} + +/* Read error log from FSP through mbox commands */ +static void fsp_elog_queue_fetch(void) +{ + int rc; + uint8_t flags = 0; + struct fsp_log_entry *entry; + + entry = list_top(&elog_read_pending, struct fsp_log_entry, link); + if (!entry) { + /** + * @fwts-label ElogQueueInconsistent + * @fwts-advice Bug in interaction between FSP and OPAL. We + * expected there to be a pending read from FSP but the list + * was empty. + */ + prlog(PR_ERR, "%s: Inconsistent internal list state !\n", + __func__); + fsp_elog_set_head_state(ELOG_STATE_NONE); + return; + } + + fsp_elog_set_head_state(ELOG_STATE_FETCHING); + elog_head_id = entry->log_id; + elog_head_size = entry->log_size; + rc = fsp_fetch_data_queue(flags, FSP_DATASET_ERRLOG, elog_head_id, + 0, (void *)PSI_DMA_ERRLOG_READ_BUF, + &elog_head_size, fsp_elog_read_complete); + if (rc) { + prerror("ELOG: failed to queue read message: %d\n", rc); + fsp_elog_set_head_state(ELOG_STATE_NONE); + } +} + +/* OPAL interface for PowerNV to read log size and log ID from Sapphire. */ +static int64_t fsp_opal_elog_info(__be64 *opal_elog_id, + __be64 *opal_elog_size, __be64 *elog_type) +{ + struct fsp_log_entry *log_data; + + /* Copy type of the error log */ + *elog_type = cpu_to_be64(ELOG_TYPE_PEL); + + /* Check if any OPAL log needs to be reported to the host */ + if (opal_elog_info(opal_elog_id, opal_elog_size)) + return OPAL_SUCCESS; + + lock(&elog_read_lock); + if (elog_read_from_fsp_head_state != ELOG_STATE_FETCHED_DATA) { + unlock(&elog_read_lock); + return OPAL_WRONG_STATE; + } + + log_data = list_top(&elog_read_pending, struct fsp_log_entry, link); + if (!log_data) { + /** + * @fwts-label ElogInfoInconsistentState + * @fwts-advice We expected there to be an entry in the list + * of error logs for the error log we're fetching information + * for. There wasn't. This means there's a bug. + */ + prlog(PR_ERR, "%s: Inconsistent internal list state !\n", + __func__); + fsp_elog_set_head_state(ELOG_STATE_NONE); + unlock(&elog_read_lock); + return OPAL_WRONG_STATE; + } + + *opal_elog_id = cpu_to_be64(log_data->log_id); + *opal_elog_size = cpu_to_be64(log_data->log_size); + fsp_elog_set_head_state(ELOG_STATE_HOST_INFO); + unlock(&elog_read_lock); + return OPAL_SUCCESS; +} + +/* OPAL interface for PowerNV to read log from Sapphire. */ +static int64_t fsp_opal_elog_read(void *buffer, uint64_t opal_elog_size, + uint64_t opal_elog_id) +{ + int size = opal_elog_size; + struct fsp_log_entry *log_data; + + /* Check if any OPAL log needs to be reported to the PowerNV */ + if (opal_elog_read(buffer, opal_elog_size, opal_elog_id)) + return OPAL_SUCCESS; + + /* + * Read top entry from list. + * As we know always top record of the list is fetched from FSP + */ + lock(&elog_read_lock); + if (elog_read_from_fsp_head_state != ELOG_STATE_HOST_INFO) { + unlock(&elog_read_lock); + return OPAL_WRONG_STATE; + } + + log_data = list_top(&elog_read_pending, struct fsp_log_entry, link); + if (!log_data) { + /** + * @fwts-label ElogReadInconsistentState + * @fwts-advice Inconsistent state while reading error log + * from FSP. Bug in OPAL and FSP interaction. 
+ */ + prlog(PR_ERR, "%s: Inconsistent internal list state !\n", + __func__); + fsp_elog_set_head_state(ELOG_STATE_NONE); + unlock(&elog_read_lock); + return OPAL_WRONG_STATE; + } + + /* Check log ID and then read log from buffer */ + if (opal_elog_id != log_data->log_id) { + unlock(&elog_read_lock); + return OPAL_PARAMETER; + } + + /* Do not copy more than actual log size */ + if (opal_elog_size > log_data->log_size) + size = log_data->log_size; + + memset(buffer, 0, opal_elog_size); + memcpy(buffer, elog_read_buffer, size); + + /* + * Once log is read from linux move record from pending + * to processed list and delete record from pending list + * and change state of the log to fetch next record. + */ + list_del(&log_data->link); + list_add(&elog_read_processed, &log_data->link); + fsp_elog_set_head_state(ELOG_STATE_NONE); + unlock(&elog_read_lock); + + /* Read error log from FSP */ + fsp_elog_check_and_fetch_head(); + + return OPAL_SUCCESS; +} + +/* Set state of the log head before fetching the log. */ +static void elog_reject_head(void) +{ + if (elog_read_from_fsp_head_state == ELOG_STATE_FETCHING) + fsp_elog_set_head_state(ELOG_STATE_REJECTED); + else + fsp_elog_set_head_state(ELOG_STATE_NONE); +} + +/* OPAL interface for PowerNV to send ack to FSP with log ID */ +static int64_t fsp_opal_elog_ack(uint64_t ack_id) +{ + int rc = 0; + struct fsp_log_entry *record, *next_record; + + if (opal_elog_ack(ack_id)) + return rc; + + /* Send acknowledgement to FSP */ + rc = fsp_send_elog_ack(ack_id); + if (rc != OPAL_SUCCESS) { + prerror("ELOG: failed to send acknowledgement: %d\n", rc); + return rc; + } + + lock(&elog_read_lock); + list_for_each_safe(&elog_read_processed, record, next_record, link) { + if (record->log_id != ack_id) + continue; + + list_del(&record->link); + list_add(&elog_read_free, &record->link); + unlock(&elog_read_lock); + return rc; + } + + list_for_each_safe(&elog_read_pending, record, next_record, link) { + if (record->log_id != ack_id) + continue; + /* + * It means PowerNV has sent ACK without reading actual data. + * Because of this elog_read_from_fsp_head_state may be + * stuck in wrong state (ELOG_STATE_HOST_INFO) and not able + * to send remaining ELOGs to PowerNV. Hence reset ELOG state + * and start sending remaining ELOGs. + */ + list_del(&record->link); + list_add(&elog_read_free, &record->link); + elog_reject_head(); + unlock(&elog_read_lock); + fsp_elog_check_and_fetch_head(); + return rc; + } + + unlock(&elog_read_lock); + return OPAL_PARAMETER; +} + +/* + * Once Linux kexec's it ask to resend all logs which + * are not acknowledged from Linux. + */ +static void fsp_opal_resend_pending_logs(void) +{ + struct fsp_log_entry *entry; + + lock(&elog_read_lock); + elog_enabled = true; + unlock(&elog_read_lock); + + /* Check if any Sapphire logs are pending. */ + opal_resend_pending_logs(); + + lock(&elog_read_lock); + /* + * If processed list is not empty add all record from + * processed list to pending list at head of the list + * and delete records from processed list. 
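+ *
+ * Illustrative example: if the processed list holds [A, B],
+ * the loop below pops A then B and re-adds each at the head of
+ * the pending list, so a freshly kexec'd host is offered every
+ * un-acknowledged log again.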
+ */ + while (!list_empty(&elog_read_processed)) { + entry = list_pop(&elog_read_processed, + struct fsp_log_entry, link); + list_add(&elog_read_pending, &entry->link); + } + + unlock(&elog_read_lock); + + /* Read error log from FSP */ + elog_reject_head(); + fsp_elog_check_and_fetch_head(); +} + +/* Disable ELOG event flag until PowerNV is ready to receive event */ +static bool opal_kexec_elog_notify(void *data __unused) +{ + lock(&elog_read_lock); + elog_enabled = false; + opal_update_pending_evt(OPAL_EVENT_ERROR_LOG_AVAIL, 0); + unlock(&elog_read_lock); + + return true; +} + +/* FSP elog notify function */ +static bool fsp_elog_msg(uint32_t cmd_sub_mod, struct fsp_msg *msg) +{ + int rc = 0; + struct fsp_log_entry *record; + uint32_t log_id; + uint32_t log_size; + + if (cmd_sub_mod != FSP_CMD_ERRLOG_NOTIFICATION) + return false; + + log_id = fsp_msg_get_data_word(msg, 0); + log_size = fsp_msg_get_data_word(msg, 1); + + prlog(PR_TRACE, "ELOG: Notified of log 0x%08x (size: %d)\n", + log_id, log_size); + + /* Make sure we don't cross read buffer size */ + if (log_size > ELOG_READ_BUFFER_SIZE) { + log_size = ELOG_READ_BUFFER_SIZE; + printf("ELOG: Truncated log (0x%08x) to 0x%x\n", + log_id, log_size); + } + + /* Take a lock until we take out the node from elog_read_free */ + lock(&elog_read_lock); + if (!list_empty(&elog_read_free)) { + /* Create a new entry in the pending list. */ + record = list_pop(&elog_read_free, struct fsp_log_entry, link); + record->log_id = log_id; + record->log_size = log_size; + list_add_tail(&elog_read_pending, &record->link); + unlock(&elog_read_lock); + + /* Send response back to FSP for a new elog notify message. */ + rc = fsp_queue_msg(fsp_mkmsg(FSP_RSP_ERRLOG_NOTIFICATION, + 1, log_id), fsp_freemsg); + if (rc) + prerror("ELOG: Failed to queue errlog notification" + " response: %d\n", rc); + + /* Read error log from FSP */ + fsp_elog_check_and_fetch_head(); + + } else { + prlog(PR_TRACE, "ELOG: Log entry 0x%08x discarded\n", log_id); + + /* Unlock if elog_read_free is empty. */ + unlock(&elog_read_lock); + + rc = fsp_queue_msg(fsp_mkmsg(FSP_RSP_ERRLOG_NOTIFICATION, + 1, log_id), fsp_freemsg); + if (rc) + prerror("ELOG: Failed to queue errlog notification" + " response: %d\n", rc); + + /* + * If list is full with max record then we send discarded by + * phyp (condition full) ack to FSP. + * + * At some point in the future, we'll get notified again. + * This is largely up to FSP as to when they tell us about + * the log again. 
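+ *
+ * Summary of the two notification paths in this handler (a
+ * sketch, not FSP documentation): with a free entry we queue
+ * the log, respond to the notification and kick
+ * fsp_elog_check_and_fetch_head(); with the free list empty we
+ * only respond and then send the "discarded, list full" ack
+ * below so the FSP will re-notify us later.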
+ */ + rc = fsp_queue_msg(fsp_mkmsg(FSP_CMD_ERRLOG_PHYP_ACK | 0x02, + 1, log_id), fsp_freemsg); + if (rc) + prerror("ELOG: Failed to queue errlog ack" + " response: %d\n", rc); + } + + return true; +} + +static struct fsp_client fsp_get_elog_notify = { + .message = fsp_elog_msg, +}; + +/* Pre-allocate memory for reading error log from FSP */ +static int init_elog_read_free_list(uint32_t num_entries) +{ + struct fsp_log_entry *entry; + int i; + + entry = zalloc(sizeof(struct fsp_log_entry) * num_entries); + if (!entry) + goto out_err; + + for (i = 0; i < num_entries; ++i) { + list_add_tail(&elog_read_free, &entry->link); + entry++; + } + + return 0; + +out_err: + return -ENOMEM; +} + +/* FSP elog read init function */ +void fsp_elog_read_init(void) +{ + int val = 0; + + if (!fsp_present()) + return; + + elog_read_buffer = memalign(TCE_PSIZE, ELOG_READ_BUFFER_SIZE); + if (!elog_read_buffer) { + prerror("FSP: could not allocate FSP ELOG_READ_BUFFER!\n"); + return; + } + + /* Map TCEs */ + fsp_tce_map(PSI_DMA_ERRLOG_READ_BUF, elog_read_buffer, + PSI_DMA_ERRLOG_READ_BUF_SZ); + + /* Pre allocate memory for 128 record */ + val = init_elog_read_free_list(ELOG_READ_MAX_RECORD); + if (val != 0) + return; + + /* Register error log class D2 */ + fsp_register_client(&fsp_get_elog_notify, FSP_MCLASS_ERR_LOG); + + /* Register for sync on PowerNV reboot call */ + opal_add_host_sync_notifier(opal_kexec_elog_notify, NULL); + + /* Register OPAL interface */ + opal_register(OPAL_ELOG_READ, fsp_opal_elog_read, 3); + opal_register(OPAL_ELOG_ACK, fsp_opal_elog_ack, 1); + opal_register(OPAL_ELOG_RESEND, fsp_opal_resend_pending_logs, 0); + opal_register(OPAL_ELOG_SIZE, fsp_opal_elog_info, 3); +} diff --git a/roms/skiboot/hw/fsp/fsp-elog-write.c b/roms/skiboot/hw/fsp/fsp-elog-write.c new file mode 100644 index 000000000..7b26a1867 --- /dev/null +++ b/roms/skiboot/hw/fsp/fsp-elog-write.c @@ -0,0 +1,441 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* + * This code will enable generation and pushing of error log from Sapphire + * to FSP. + * Critical events from Sapphire that needs to be reported will be pushed + * on to FSP after converting the error log to Platform Error Log(PEL) format. + * This is termed as write action to FSP. + * + * Copyright 2013-2016 IBM Corp. + */ + +#include <cpu.h> +#include <errno.h> +#include <fsp.h> +#include <fsp-elog.h> +#include <lock.h> +#include <opal-api.h> +#include <pel.h> +#include <pool.h> +#include <skiboot.h> +#include <timebase.h> + +static LIST_HEAD(elog_write_to_fsp_pending); +static LIST_HEAD(elog_write_to_host_pending); +static LIST_HEAD(elog_write_to_host_processed); + +static struct lock elog_write_lock = LOCK_UNLOCKED; +static struct lock elog_panic_write_lock = LOCK_UNLOCKED; +static struct lock elog_write_to_host_lock = LOCK_UNLOCKED; + +#define ELOG_WRITE_TO_FSP_BUFFER_SIZE 0x00004000 +/* Log buffer to copy OPAL log for write to FSP. 
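+ * 0x4000 bytes, TCE-mapped to PSI_DMA_ERRLOG_WRITE_BUF in
+ * fsp_elog_write_init(); only the head of
+ * elog_write_to_fsp_pending is rendered into it at any one
+ * time.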
*/ +static void *elog_write_to_fsp_buffer; + +#define ELOG_PANIC_WRITE_BUFFER_SIZE 0x00004000 +static void *elog_panic_write_buffer; + +#define ELOG_WRITE_TO_HOST_BUFFER_SIZE 0x00004000 +static void *elog_write_to_host_buffer; + +static uint32_t elog_write_retries; + +/* Manipulate this only with write_lock held */ +static uint32_t elog_plid_fsp_commit = -1; +static enum elog_head_state elog_write_to_host_head_state = ELOG_STATE_NONE; + +/* Need forward declaration because of circular dependency */ +static int opal_send_elog_to_fsp(void); + +static void remove_elog_head_entry(void) +{ + struct errorlog *head, *entry; + + lock(&elog_write_lock); + if (!list_empty(&elog_write_to_fsp_pending)) { + head = list_top(&elog_write_to_fsp_pending, + struct errorlog, link); + if (head->plid == elog_plid_fsp_commit) { + entry = list_pop(&elog_write_to_fsp_pending, + struct errorlog, link); + opal_elog_complete(entry, + elog_write_retries < MAX_RETRIES); + /* Reset the counter */ + elog_plid_fsp_commit = -1; + } + } + + elog_write_retries = 0; + unlock(&elog_write_lock); +} + +static void opal_fsp_write_complete(struct fsp_msg *read_msg) +{ + uint8_t val; + + val = (read_msg->resp->word1 >> 8) & 0xff; + fsp_freemsg(read_msg); + + switch (val) { + case FSP_STATUS_SUCCESS: + remove_elog_head_entry(); + break; + default: + if (elog_write_retries++ >= MAX_RETRIES) { + remove_elog_head_entry(); + prerror("ELOG: Error in writing to FSP (0x%x)!\n", val); + } + + break; + } + + if (opal_send_elog_to_fsp() != OPAL_SUCCESS) + prerror("ELOG: Error sending elog to FSP !\n"); +} + +/* Write PEL format hex dump of the log to FSP */ +static int64_t fsp_opal_elog_write(size_t opal_elog_size) +{ + struct fsp_msg *elog_msg; + + elog_msg = fsp_mkmsg(FSP_CMD_CREATE_ERRLOG, 3, opal_elog_size, + 0, PSI_DMA_ERRLOG_WRITE_BUF); + if (!elog_msg) { + prerror("ELOG: Failed to create message for WRITE to FSP\n"); + return OPAL_INTERNAL_ERROR; + } + + if (fsp_queue_msg(elog_msg, opal_fsp_write_complete)) { + fsp_freemsg(elog_msg); + elog_msg = NULL; + prerror("FSP: Error queueing elog update\n"); + return OPAL_INTERNAL_ERROR; + } + + return OPAL_SUCCESS; +} + +/* This should be called with elog_write_to_host_lock lock */ +static inline void fsp_elog_write_set_head_state(enum elog_head_state state) +{ + elog_set_head_state(true, state); + elog_write_to_host_head_state = state; +} + +bool opal_elog_info(__be64 *opal_elog_id, __be64 *opal_elog_size) +{ + struct errorlog *head; + bool rc = false; + + lock(&elog_write_to_host_lock); + if (elog_write_to_host_head_state == ELOG_STATE_FETCHED_DATA) { + head = list_top(&elog_write_to_host_pending, + struct errorlog, link); + if (!head) { + /** + * @fwts-label ElogListInconsistent + * @fwts-advice Bug in interaction between FSP and + * OPAL. The state maintained by OPAL didn't match + * what the FSP sent. 
+ */ + prlog(PR_ERR, + "%s: Inconsistent internal list state !\n", + __func__); + fsp_elog_write_set_head_state(ELOG_STATE_NONE); + } else { + *opal_elog_id = cpu_to_be64(head->plid); + *opal_elog_size = cpu_to_be64(head->log_size); + fsp_elog_write_set_head_state(ELOG_STATE_HOST_INFO); + rc = true; + } + } + + unlock(&elog_write_to_host_lock); + return rc; +} + +static void opal_commit_elog_in_host(void) +{ + struct errorlog *buf; + + lock(&elog_write_to_host_lock); + if (!list_empty(&elog_write_to_host_pending) && + (elog_write_to_host_head_state == ELOG_STATE_NONE)) { + buf = list_top(&elog_write_to_host_pending, + struct errorlog, link); + buf->log_size = create_pel_log(buf, + (char *)elog_write_to_host_buffer, + ELOG_WRITE_TO_HOST_BUFFER_SIZE); + fsp_elog_write_set_head_state(ELOG_STATE_FETCHED_DATA); + } + + unlock(&elog_write_to_host_lock); +} + +bool opal_elog_read(void *buffer, uint64_t opal_elog_size, + uint64_t opal_elog_id) +{ + struct errorlog *log_data; + bool rc = false; + + lock(&elog_write_to_host_lock); + if (elog_write_to_host_head_state == ELOG_STATE_HOST_INFO) { + log_data = list_top(&elog_write_to_host_pending, + struct errorlog, link); + if (!log_data) { + fsp_elog_write_set_head_state(ELOG_STATE_NONE); + unlock(&elog_write_to_host_lock); + return rc; + } + + if ((opal_elog_id != log_data->plid) && + (opal_elog_size != log_data->log_size)) { + unlock(&elog_write_to_host_lock); + return rc; + } + + memcpy(buffer, elog_write_to_host_buffer, opal_elog_size); + list_del(&log_data->link); + list_add(&elog_write_to_host_processed, &log_data->link); + fsp_elog_write_set_head_state(ELOG_STATE_NONE); + rc = true; + } + + unlock(&elog_write_to_host_lock); + opal_commit_elog_in_host(); + return rc; +} + +bool opal_elog_ack(uint64_t ack_id) +{ + bool rc = false; + struct errorlog *log_data; + struct errorlog *record, *next_record; + + lock(&elog_write_to_host_lock); + if (!list_empty(&elog_write_to_host_processed)) { + list_for_each_safe(&elog_write_to_host_processed, record, + next_record, link) { + if (record->plid != ack_id) + continue; + + list_del(&record->link); + opal_elog_complete(record, true); + rc = true; + } + } + + if ((!rc) && (!list_empty(&elog_write_to_host_pending))) { + log_data = list_top(&elog_write_to_host_pending, + struct errorlog, link); + if (ack_id == log_data->plid) + fsp_elog_write_set_head_state(ELOG_STATE_NONE); + + list_for_each_safe(&elog_write_to_host_pending, record, + next_record, link) { + if (record->plid != ack_id) + continue; + + list_del(&record->link); + opal_elog_complete(record, true); + rc = true; + unlock(&elog_write_to_host_lock); + opal_commit_elog_in_host(); + return rc; + } + } + + unlock(&elog_write_to_host_lock); + return rc; +} + +void opal_resend_pending_logs(void) +{ + struct errorlog *record; + + lock(&elog_write_to_host_lock); + while (!list_empty(&elog_write_to_host_processed)) { + record = list_pop(&elog_write_to_host_processed, + struct errorlog, link); + list_add_tail(&elog_write_to_host_pending, &record->link); + } + + fsp_elog_write_set_head_state(ELOG_STATE_NONE); + unlock(&elog_write_to_host_lock); + opal_commit_elog_in_host(); +} + +static inline u64 get_elog_timeout(void) +{ + return (mftb() + secs_to_tb(ERRORLOG_TIMEOUT_INTERVAL)); +} + +static int opal_send_elog_to_fsp(void) +{ + struct errorlog *head; + int rc = OPAL_SUCCESS; + + /* + * Convert entry to PEL and push it down to FSP. + * Then we wait for the ack from FSP. 
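+ *
+ * Rough flow (a sketch of this code, not a spec):
+ * elog_fsp_commit() queues the entry and, when the queue was
+ * idle, calls this function; the head of
+ * elog_write_to_fsp_pending is rendered with create_pel_log()
+ * and sent via fsp_opal_elog_write(); opal_fsp_write_complete()
+ * then removes the entry on success (or after MAX_RETRIES
+ * failures) and sends the next one.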
+ */ + lock(&elog_write_lock); + if (!list_empty(&elog_write_to_fsp_pending)) { + head = list_top(&elog_write_to_fsp_pending, + struct errorlog, link); + /* Error needs to be committed, update the time out value */ + head->elog_timeout = get_elog_timeout(); + + elog_plid_fsp_commit = head->plid; + head->log_size = create_pel_log(head, + (char *)elog_write_to_fsp_buffer, + ELOG_WRITE_TO_FSP_BUFFER_SIZE); + rc = fsp_opal_elog_write(head->log_size); + unlock(&elog_write_lock); + return rc; + } + + unlock(&elog_write_lock); + return rc; +} + +static int opal_push_logs_sync_to_fsp(struct errorlog *buf) +{ + struct fsp_msg *elog_msg; + int opal_elog_size = 0; + int rc = OPAL_SUCCESS; + + lock(&elog_panic_write_lock); + + /* Error needs to be committed, update the time out value */ + buf->elog_timeout = get_elog_timeout(); + + opal_elog_size = create_pel_log(buf, + (char *)elog_panic_write_buffer, + ELOG_PANIC_WRITE_BUFFER_SIZE); + + elog_msg = fsp_mkmsg(FSP_CMD_CREATE_ERRLOG, 3, opal_elog_size, + 0, PSI_DMA_ELOG_PANIC_WRITE_BUF); + if (!elog_msg) { + prerror("ELOG: PLID: 0x%x Failed to create message for WRITE " + "to FSP\n", buf->plid); + unlock(&elog_panic_write_lock); + opal_elog_complete(buf, false); + return OPAL_INTERNAL_ERROR; + } + + if (fsp_sync_msg(elog_msg, false)) { + fsp_freemsg(elog_msg); + rc = OPAL_INTERNAL_ERROR; + } else { + rc = (elog_msg->resp->word1 >> 8) & 0xff; + fsp_freemsg(elog_msg); + } + + unlock(&elog_panic_write_lock); + if (rc != OPAL_SUCCESS) + opal_elog_complete(buf, false); + else + opal_elog_complete(buf, true); + + return rc; +} + +int elog_fsp_commit(struct errorlog *buf) +{ + int rc = OPAL_SUCCESS; + + if (buf->event_severity == OPAL_ERROR_PANIC) { + rc = opal_push_logs_sync_to_fsp(buf); + return rc; + } + + lock(&elog_write_lock); + if (list_empty(&elog_write_to_fsp_pending)) { + list_add_tail(&elog_write_to_fsp_pending, &buf->link); + unlock(&elog_write_lock); + rc = opal_send_elog_to_fsp(); + return rc; + } + + list_add_tail(&elog_write_to_fsp_pending, &buf->link); + unlock(&elog_write_lock); + return rc; +} + +static void elog_append_write_to_host(struct errorlog *buf) +{ + lock(&elog_write_to_host_lock); + if (list_empty(&elog_write_to_host_pending)) { + list_add(&elog_write_to_host_pending, &buf->link); + unlock(&elog_write_to_host_lock); + opal_commit_elog_in_host(); + } else { + list_add_tail(&elog_write_to_host_pending, &buf->link); + unlock(&elog_write_to_host_lock); + } +} + +static void elog_timeout_poll(void *data __unused) +{ + uint64_t now; + struct errorlog *head, *entry; + + lock(&elog_write_lock); + if (list_empty(&elog_write_to_fsp_pending)) { + unlock(&elog_write_lock); + return; + } + + head = list_top(&elog_write_to_fsp_pending, struct errorlog, link); + now = mftb(); + if ((tb_compare(now, head->elog_timeout) == TB_AAFTERB) || + (tb_compare(now, head->elog_timeout) == TB_AEQUALB)) { + entry = list_pop(&elog_write_to_fsp_pending, + struct errorlog, link); + unlock(&elog_write_lock); + elog_append_write_to_host(entry); + } else { + unlock(&elog_write_lock); + } +} + +/* FSP elog init function */ +void fsp_elog_write_init(void) +{ + if (!fsp_present()) + return; + + elog_panic_write_buffer = memalign(TCE_PSIZE, + ELOG_PANIC_WRITE_BUFFER_SIZE); + if (!elog_panic_write_buffer) { + prerror("FSP: could not allocate ELOG_PANIC_WRITE_BUFFER!\n"); + return; + } + + elog_write_to_fsp_buffer = memalign(TCE_PSIZE, + ELOG_WRITE_TO_FSP_BUFFER_SIZE); + if (!elog_write_to_fsp_buffer) { + prerror("FSP: could not allocate ELOG_WRITE_BUFFER!\n"); + return; 
+ } + + elog_write_to_host_buffer = memalign(TCE_PSIZE, + ELOG_WRITE_TO_HOST_BUFFER_SIZE); + if (!elog_write_to_host_buffer) { + prerror("FSP: could not allocate ELOG_WRITE_TO_HOST_BUFFER!\n"); + return; + } + + /* Map TCEs */ + fsp_tce_map(PSI_DMA_ELOG_PANIC_WRITE_BUF, elog_panic_write_buffer, + PSI_DMA_ELOG_PANIC_WRITE_BUF_SZ); + + fsp_tce_map(PSI_DMA_ERRLOG_WRITE_BUF, elog_write_to_fsp_buffer, + PSI_DMA_ERRLOG_WRITE_BUF_SZ); + + elog_init(); + + /* Add a poller */ + opal_add_poller(elog_timeout_poll, NULL); +} diff --git a/roms/skiboot/hw/fsp/fsp-epow.c b/roms/skiboot/hw/fsp/fsp-epow.c new file mode 100644 index 000000000..8869e91e6 --- /dev/null +++ b/roms/skiboot/hw/fsp/fsp-epow.c @@ -0,0 +1,192 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* + * FSP Environmental and Power Warnings (EPOW) support + * + * Copyright 2013-2016 IBM Corp. + */ + +#define pr_fmt(fmt) "FSP-EPOW: " fmt + +#include <fsp.h> +#include <device.h> +#include <lock.h> +#include <opal-msg.h> +#include <opal-api.h> + +#include "fsp-epow.h" + +/* + * System EPOW status + * + * This value is exported to the host. Each individual element in this + * array [0...(OPAL_SYSEPOW_MAX-1)] contains bitwise EPOW event info + * corresponding to particular defined EPOW sub class. For example. + * opal_epow_status[OPAL_SYSEPOW_POWER] will reflect power related EPOW events. + */ +static int16_t epow_status[OPAL_SYSEPOW_MAX]; + +/* EPOW lock */ +static struct lock epow_lock = LOCK_UNLOCKED; + +/* Process FSP sent EPOW based information */ +static void epow_process_ex1_event(u8 *epow) +{ + memset(epow_status, 0, sizeof(epow_status)); + + if (epow[4] == EPOW_TMP_INT) { + prlog(PR_INFO, "Internal temp above normal\n"); + epow_status[OPAL_SYSEPOW_TEMP] = OPAL_SYSTEMP_INT; + + } else if (epow[4] == EPOW_TMP_AMB) { + prlog(PR_INFO, "Ambient temp above normal\n"); + epow_status[OPAL_SYSEPOW_TEMP] = OPAL_SYSTEMP_AMB; + + } else if (epow[4] == EPOW_ON_UPS) { + prlog(PR_INFO, "System running on UPS power\n"); + epow_status[OPAL_SYSEPOW_POWER] = OPAL_SYSPOWER_UPS; + + } +} + +/* Process EPOW event */ +static void fsp_process_epow(struct fsp_msg *msg, int epow_type) +{ + int rc; + u8 epow[8]; + bool epow_changed = false; + int16_t old_epow_status[OPAL_SYSEPOW_MAX]; + + /* Basic EPOW signature */ + if (msg->data.bytes[0] != 0xF2) { + /** + * @fwts-label EPOWSignatureMismatch + * @fwts-advice Bug in skiboot/FSP code for EPOW event handling + */ + prlog(PR_ERR, "Signature mismatch\n"); + return; + } + + lock(&epow_lock); + + /* Copy over and clear system EPOW status */ + memcpy(old_epow_status, epow_status, sizeof(old_epow_status)); + + switch(epow_type) { + case EPOW_NORMAL: + case EPOW_EX2: + break; + case EPOW_EX1: + epow[0] = msg->data.bytes[0]; + epow[1] = msg->data.bytes[1]; + epow[2] = msg->data.bytes[2]; + epow[3] = msg->data.bytes[3]; + epow[4] = msg->data.bytes[4]; + + epow_process_ex1_event(epow); + break; + default: + prlog(PR_WARNING, "Unknown EPOW event notification\n"); + break; + } + + if (memcmp(epow_status, old_epow_status, sizeof(epow_status))) + epow_changed = true; + + unlock(&epow_lock); + + /* Send OPAL message notification */ + if (epow_changed) { + rc = opal_queue_msg(OPAL_MSG_EPOW, NULL, NULL); + if (rc) { + /** + * @fwts-label EPOWMessageQueueFailed + * @fwts-advice Queueing a message from OPAL to FSP + * failed. This is likely due to either an OPAL bug + * or the FSP going away. 
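+ * The updated EPOW status is still recorded in epow_status[],
+ * so the host can pick it up later through
+ * OPAL_GET_EPOW_STATUS even though this notification was lost.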
+ */ + prlog(PR_ERR, "OPAL EPOW message queuing failed\n"); + return; + } + prlog(PR_INFO, "Notified host about EPOW event\n"); + } +} + +/* + * EPOW OPAL interface + * + * The host requests for the system EPOW status through this + * OPAl call, where it passes a buffer with a give length. + * Sapphire fills the buffer with updated system EPOW status + * and then updates the length variable back to reflect the + * number of EPOW sub classes it has updated the buffer with. + */ +static int64_t fsp_opal_get_epow_status(__be16 *out_epow, __be16 *length) +{ + int i; + int n_epow_class; + int l = be16_to_cpu(*length); + + /* + * There can be situations where the host and the Sapphire versions + * don't match with eact other and hence the expected system EPOW status + * details. Newer hosts might be expecting status for more number of EPOW + * sub classes which Sapphire may not know about and older hosts might be + * expecting status for EPOW sub classes which is a subset of what + * Sapphire really knows about. Both these situations are handled here. + * + * (A) Host version >= Sapphire version + * + * Sapphire sends out EPOW status for sub classes it knows about + * and keeps the status. Updates the length variable for the host. + * + * (B) Host version < Sapphire version + * + * Sapphire sends out EPOW status for sub classes host knows about + * and can interpret correctly. + */ + if (l >= OPAL_SYSEPOW_MAX) { + n_epow_class = OPAL_SYSEPOW_MAX; + *length = cpu_to_be16(OPAL_SYSEPOW_MAX); + } else { + n_epow_class = l; + } + + /* Transfer EPOW Status */ + for (i = 0; i < n_epow_class; i++) + out_epow[i] = cpu_to_be16(epow_status[i]); + + return OPAL_SUCCESS; +} + +/* Handle EPOW sub-commands from FSP */ +static bool fsp_epow_message(u32 cmd_sub_mod, struct fsp_msg *msg) +{ + switch(cmd_sub_mod) { + case FSP_CMD_PANELSTATUS: + fsp_process_epow(msg, EPOW_NORMAL); + return true; + case FSP_CMD_PANELSTATUS_EX1: + fsp_process_epow(msg, EPOW_EX1); + return true; + case FSP_CMD_PANELSTATUS_EX2: + fsp_process_epow(msg, EPOW_EX2); + return true; + } + return false; +} + +static struct fsp_client fsp_epow_client = { + .message = fsp_epow_message, +}; + +void fsp_epow_init(void) +{ + struct dt_node *np; + + fsp_register_client(&fsp_epow_client, FSP_MCLASS_SERVICE); + opal_register(OPAL_GET_EPOW_STATUS, fsp_opal_get_epow_status, 2); + np = dt_new(opal_node, "epow"); + dt_add_property_strings(np, "compatible", "ibm,opal-v3-epow"); + dt_add_property_strings(np, "epow-classes", "power", "temperature", "cooling"); + prlog(PR_INFO, "FSP EPOW support initialized\n"); +} diff --git a/roms/skiboot/hw/fsp/fsp-epow.h b/roms/skiboot/hw/fsp/fsp-epow.h new file mode 100644 index 000000000..bc1df258e --- /dev/null +++ b/roms/skiboot/hw/fsp/fsp-epow.h @@ -0,0 +1,21 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* + * Handle FSP EPOW event notifications + * + * Copyright 2013-2015 IBM Corp. 
+ */ + +#ifndef __FSP_EPOW_H +#define __FSP_EPOW_H + +/* FSP based EPOW event notifications */ +#define EPOW_NORMAL 0x00 /* panel status normal */ +#define EPOW_EX1 0x01 /* panel status extended 1 */ +#define EPOW_EX2 0x02 /* Panel status extended 2 */ + +/* EPOW reason code notifications */ +#define EPOW_ON_UPS 1 /* System on UPS */ +#define EPOW_TMP_AMB 2 /* Over ambient temperature */ +#define EPOW_TMP_INT 3 /* Over internal temperature */ + +#endif diff --git a/roms/skiboot/hw/fsp/fsp-ipmi.c b/roms/skiboot/hw/fsp/fsp-ipmi.c new file mode 100644 index 000000000..e368c2828 --- /dev/null +++ b/roms/skiboot/hw/fsp/fsp-ipmi.c @@ -0,0 +1,400 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* + * Conduit for IPMI messages to/from FSP + * + * Copyright 2014-2019 IBM Corp. + */ + +#include <errorlog.h> +#include <fsp.h> +#include <ipmi.h> +#include <lock.h> +#include <opal-api.h> + +/* + * Under the hood, FSP IPMI component implements the KCS (Keyboard Controller + * Style) interface + * + * KCS interface request message format + * + * BYTE 1 BYTE 2 BYTE 3:N + * ------------------------------------- + * | NetFn/LUN | Cmd | Data | + * ------------------------------------- + * + * KCS interface response message format + * + * BYTE 1 BYTE 2 BYTE 3 BYTE 4:N + * ------------------------------------------------ + * | NetFn/LUN | Cmd | CompCode | Data | + * ------------------------------------------------ + + */ + +#define FSP_IPMI_REQ_MIN_LEN 2 /* NetFn + Cmd */ +#define FSP_IPMI_RESP_MIN_LEN 3 /* NetFn + Cmd + Completion code */ + +DEFINE_LOG_ENTRY(OPAL_RC_IPMI_REQ, OPAL_PLATFORM_ERR_EVT, OPAL_IPMI, + OPAL_PLATFORM_FIRMWARE, OPAL_PREDICTIVE_ERR_GENERAL, + OPAL_NA); +DEFINE_LOG_ENTRY(OPAL_RC_IPMI_RESP, OPAL_PLATFORM_ERR_EVT, OPAL_IPMI, + OPAL_PLATFORM_FIRMWARE, OPAL_PREDICTIVE_ERR_GENERAL, + OPAL_NA); + +DEFINE_LOG_ENTRY(OPAL_RC_IPMI_DMA_ERROR_RESP, OPAL_PLATFORM_ERR_EVT, OPAL_IPMI, + OPAL_PLATFORM_FIRMWARE, OPAL_INFO, + OPAL_NA); + +struct fsp_ipmi_msg { + struct list_node link; + struct ipmi_msg ipmi_msg; +}; + +static struct fsp_ipmi { + struct list_head msg_queue; + void *ipmi_req_buf; + void *ipmi_resp_buf; + /* There can only be one outstanding request whose reference is stored + * in 'cur_msg' and the 'lock' protects against the concurrent updates + * of it through request and response. The same 'lock' also protects + * the list manipulation. 
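+ *
+ * Illustrative request lifecycle (a sketch of the code below):
+ * queue_msg() appends to msg_queue and kicks
+ * fsp_ipmi_send_request(); that takes the list head as cur_msg
+ * (if nothing is outstanding) and queues FSP_CMD_FETCH_PLAT_DATA;
+ * when the FSP response arrives, fsp_ipmi_cmd_done() unlinks
+ * cur_msg, clears it, and the next queued request is sent.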
+ */ + struct fsp_ipmi_msg *cur_msg; + struct lock lock; +} fsp_ipmi; + +static int fsp_ipmi_send_request(void); + +static void fsp_ipmi_cmd_done(uint8_t cmd, uint8_t netfn, uint8_t cc) +{ + struct fsp_ipmi_msg *fsp_ipmi_msg = fsp_ipmi.cur_msg; + + lock(&fsp_ipmi.lock); + if (fsp_ipmi.cur_msg == NULL) { + unlock(&fsp_ipmi.lock); + return; + } + list_del(&fsp_ipmi_msg->link); + fsp_ipmi.cur_msg = NULL; + unlock(&fsp_ipmi.lock); + + ipmi_cmd_done(cmd, netfn, cc, &fsp_ipmi_msg->ipmi_msg); +} + + +static void fsp_ipmi_req_complete(struct fsp_msg *msg) +{ + uint8_t status = (msg->resp->word1 >> 8) & 0xff; + uint32_t length = fsp_msg_get_data_word(msg->resp, 0); + struct fsp_ipmi_msg *fsp_ipmi_msg = msg->user_data; + struct ipmi_msg *ipmi_msg; + + fsp_freemsg(msg); + + if (status != FSP_STATUS_SUCCESS) { + assert(fsp_ipmi_msg == fsp_ipmi.cur_msg); + + ipmi_msg = &fsp_ipmi_msg->ipmi_msg; + + if (length != (ipmi_msg->req_size + FSP_IPMI_REQ_MIN_LEN)) + prlog(PR_DEBUG, "IPMI: Length mismatch in req completion " + "(%d, %d)\n", ipmi_msg->req_size, length); + + log_simple_error(&e_info(OPAL_RC_IPMI_REQ), "IPMI: Request " + "failed with status:0x%02x\n", status); + /* FSP will not send the response now, so clear the current + * outstanding request + */ + fsp_ipmi_cmd_done(ipmi_msg->cmd, + IPMI_NETFN_RETURN_CODE(ipmi_msg->netfn), + IPMI_ERR_UNSPECIFIED); + + /* Send the next request in the queue */ + fsp_ipmi_send_request(); + } +} + +static int fsp_ipmi_send_request(void) +{ + uint8_t *req_buf = fsp_ipmi.ipmi_req_buf; + struct ipmi_msg *ipmi_msg; + struct fsp_msg *msg; + int rc; + + if (fsp_in_rr()) + return OPAL_BUSY; + + lock(&fsp_ipmi.lock); + /* An outstanding request is still pending */ + if (fsp_ipmi.cur_msg) { + unlock(&fsp_ipmi.lock); + return OPAL_SUCCESS; + } + + fsp_ipmi.cur_msg = list_top(&fsp_ipmi.msg_queue, struct fsp_ipmi_msg, + link); + unlock(&fsp_ipmi.lock); + + if (!fsp_ipmi.cur_msg) + return OPAL_SUCCESS; + + ipmi_msg = &fsp_ipmi.cur_msg->ipmi_msg; + prlog(PR_TRACE, "IPMI: Send request, netfn:0x%02x, cmd:0x%02x, " + "req_len:%d\n", ipmi_msg->netfn, ipmi_msg->cmd, ipmi_msg->req_size); + + /* KCS request message format */ + *req_buf++ = ipmi_msg->netfn; /* BYTE 1 */ + *req_buf++ = ipmi_msg->cmd; /* BYTE 2 */ + if (ipmi_msg->req_size) + memcpy(req_buf, ipmi_msg->data, ipmi_msg->req_size); + + msg = fsp_mkmsg(FSP_CMD_FETCH_PLAT_DATA, 5, 0, PSI_DMA_PLAT_REQ_BUF, + 0, PSI_DMA_PLAT_RESP_BUF, + ipmi_msg->req_size + FSP_IPMI_REQ_MIN_LEN); + if (!msg) { + log_simple_error(&e_info(OPAL_RC_IPMI_REQ), "IPMI: Failed to " + "allocate request message\n"); + fsp_ipmi_cmd_done(ipmi_msg->cmd, + IPMI_NETFN_RETURN_CODE(ipmi_msg->netfn), + IPMI_ERR_UNSPECIFIED); + return OPAL_NO_MEM; + } + + msg->user_data = fsp_ipmi.cur_msg; + rc = fsp_queue_msg(msg, fsp_ipmi_req_complete); + if (rc) { + log_simple_error(&e_info(OPAL_RC_IPMI_REQ), "IPMI: Failed to " + "queue request message (%d)\n", rc); + fsp_freemsg(msg); + fsp_ipmi_cmd_done(ipmi_msg->cmd, + IPMI_NETFN_RETURN_CODE(ipmi_msg->netfn), + IPMI_ERR_UNSPECIFIED); + return OPAL_INTERNAL_ERROR; + } + + return OPAL_SUCCESS; +} + +static struct ipmi_msg *fsp_ipmi_alloc_msg(size_t req_size, size_t resp_size) +{ + struct fsp_ipmi_msg *fsp_ipmi_msg; + struct ipmi_msg *ipmi_msg; + + fsp_ipmi_msg = zalloc(sizeof(*fsp_ipmi_msg) + MAX(req_size, resp_size)); + if (!fsp_ipmi_msg) + return NULL; + + ipmi_msg = &fsp_ipmi_msg->ipmi_msg; + + ipmi_msg->req_size = req_size; + ipmi_msg->resp_size = resp_size; + ipmi_msg->data = (uint8_t *)(fsp_ipmi_msg + 1); + + return 
ipmi_msg; +} + +static void fsp_ipmi_free_msg(struct ipmi_msg *ipmi_msg) +{ + struct fsp_ipmi_msg *fsp_ipmi_msg = container_of(ipmi_msg, + struct fsp_ipmi_msg, ipmi_msg); + + free(fsp_ipmi_msg); +} + +static int fsp_ipmi_queue_msg(struct ipmi_msg *ipmi_msg) +{ + struct fsp_ipmi_msg *fsp_ipmi_msg = container_of(ipmi_msg, + struct fsp_ipmi_msg, ipmi_msg); + + if (fsp_in_rr()) + return OPAL_BUSY; + + lock(&fsp_ipmi.lock); + list_add_tail(&fsp_ipmi.msg_queue, &fsp_ipmi_msg->link); + unlock(&fsp_ipmi.lock); + + return fsp_ipmi_send_request(); +} + +static int fsp_ipmi_queue_msg_head(struct ipmi_msg *ipmi_msg) +{ + struct fsp_ipmi_msg *fsp_ipmi_msg = container_of(ipmi_msg, + struct fsp_ipmi_msg, ipmi_msg); + + if (fsp_in_rr()) + return OPAL_BUSY; + + lock(&fsp_ipmi.lock); + list_add(&fsp_ipmi.msg_queue, &fsp_ipmi_msg->link); + unlock(&fsp_ipmi.lock); + + return fsp_ipmi_send_request(); +} + +static int fsp_ipmi_dequeue_msg(struct ipmi_msg *ipmi_msg) +{ + struct fsp_ipmi_msg *fsp_ipmi_msg = container_of(ipmi_msg, + struct fsp_ipmi_msg, ipmi_msg); + + lock(&fsp_ipmi.lock); + list_del_from(&fsp_ipmi.msg_queue, &fsp_ipmi_msg->link); + unlock(&fsp_ipmi.lock); + + return 0; +} + +static struct ipmi_backend fsp_ipmi_backend = { + .alloc_msg = fsp_ipmi_alloc_msg, + .free_msg = fsp_ipmi_free_msg, + .queue_msg = fsp_ipmi_queue_msg, + .queue_msg_head = fsp_ipmi_queue_msg_head, + .dequeue_msg = fsp_ipmi_dequeue_msg, + /* FIXME if ever use ipmi_queue_msg_sync on FSP */ + .poll = NULL, +}; + +static bool fsp_ipmi_rr_notify(uint32_t cmd_sub_mod, + struct fsp_msg *msg __unused) +{ + struct ipmi_msg *ipmi_msg; + + switch (cmd_sub_mod) { + case FSP_RESET_START: + return true; + case FSP_RELOAD_COMPLETE: + /* + * We will not get response for outstanding request. Send error + * message to caller and start sending new ipmi messages. 
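+ * Concretely (a sketch of the code below): any in-flight
+ * cur_msg is completed with IPMI_ERR_UNSPECIFIED and
+ * fsp_ipmi_send_request() restarts the queue.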
+ */ + if (fsp_ipmi.cur_msg) { + ipmi_msg = &fsp_ipmi.cur_msg->ipmi_msg; + fsp_ipmi_cmd_done(ipmi_msg->cmd, + IPMI_NETFN_RETURN_CODE(ipmi_msg->netfn), + IPMI_ERR_UNSPECIFIED); + } + fsp_ipmi_send_request(); + return true; + } + return false; +} + +static struct fsp_client fsp_ipmi_client_rr = { + .message = fsp_ipmi_rr_notify, +}; + +static bool fsp_ipmi_send_response(uint32_t cmd) +{ + struct fsp_msg *resp; + int rc; + + resp = fsp_mkmsg(cmd, 0); + if (!resp) { + log_simple_error(&e_info(OPAL_RC_IPMI_RESP), "IPMI: Failed to " + "allocate response message\n"); + return false; + } + + rc = fsp_queue_msg(resp, fsp_freemsg); + if (rc) { + fsp_freemsg(resp); + log_simple_error(&e_info(OPAL_RC_IPMI_RESP), "IPMI: Failed to " + "queue response message\n"); + return false; + } + + return true; +} + +static bool fsp_ipmi_read_response(struct fsp_msg *msg) +{ + uint8_t *resp_buf = fsp_ipmi.ipmi_resp_buf; + uint32_t status = fsp_msg_get_data_word(msg, 3); + uint32_t length = fsp_msg_get_data_word(msg, 2); + struct ipmi_msg *ipmi_msg; + uint8_t netfn, cmd, cc; + + assert(fsp_ipmi.cur_msg); + ipmi_msg = &fsp_ipmi.cur_msg->ipmi_msg; + + /* Response TCE token */ + assert(fsp_msg_get_data_word(msg, 1) == PSI_DMA_PLAT_RESP_BUF); + + if (status != FSP_STATUS_SUCCESS) { + if(status == FSP_STATUS_DMA_ERROR) + log_simple_error(&e_info(OPAL_RC_IPMI_DMA_ERROR_RESP), "IPMI: Received " + "DMA ERROR response from FSP, this may be due to FSP " + "is in termination state:0x%02x\n", status); + else + log_simple_error(&e_info(OPAL_RC_IPMI_RESP), "IPMI: FSP response " + "received with bad status:0x%02x\n", status); + + fsp_ipmi_cmd_done(ipmi_msg->cmd, + IPMI_NETFN_RETURN_CODE(ipmi_msg->netfn), + IPMI_ERR_UNSPECIFIED); + return fsp_ipmi_send_response(FSP_RSP_PLAT_DATA | + FSP_STATUS_SUCCESS); + } + + /* KCS response message format */ + netfn = *resp_buf++; + cmd = *resp_buf++; + cc = *resp_buf++; + length -= FSP_IPMI_RESP_MIN_LEN; + + prlog(PR_TRACE, "IPMI: fsp response received, netfn:0x%02x, cmd:0x%02x," + " cc:0x%02x, length:%d\n", netfn, cmd, cc, length); + + if (length > ipmi_msg->resp_size) { + prlog(PR_DEBUG, "IPMI: Length mismatch in response (%d, %d)\n", + length, ipmi_msg->resp_size); + length = ipmi_msg->resp_size; /* Truncate */ + cc = IPMI_ERR_MSG_TRUNCATED; + } + + ipmi_msg->resp_size = length; + if (length) + memcpy(ipmi_msg->data, resp_buf, length); + + fsp_ipmi_cmd_done(cmd, netfn, cc); + + return fsp_ipmi_send_response(FSP_RSP_PLAT_DATA); +} + +static bool fsp_ipmi_response(uint32_t cmd_sub_mod, struct fsp_msg *msg) +{ + bool rc; + + switch (cmd_sub_mod) { + case FSP_CMD_SEND_PLAT_DATA: + prlog(PR_TRACE, "FSP_CMD_SEND_PLAT_DATA command received\n"); + rc = fsp_ipmi_read_response(msg); + break; + default: + return false; + }; + + /* If response sent successfully, pick the next request */ + if (rc == true) + fsp_ipmi_send_request(); + + return rc; +} + +static struct fsp_client fsp_ipmi_client = { + .message = fsp_ipmi_response, +}; + +void fsp_ipmi_init(void) +{ + fsp_tce_map(PSI_DMA_PLAT_REQ_BUF, fsp_ipmi.ipmi_req_buf, + PSI_DMA_PLAT_REQ_BUF_SIZE); + fsp_tce_map(PSI_DMA_PLAT_RESP_BUF, fsp_ipmi.ipmi_resp_buf, + PSI_DMA_PLAT_RESP_BUF_SIZE); + + list_head_init(&fsp_ipmi.msg_queue); + init_lock(&fsp_ipmi.lock); + + fsp_register_client(&fsp_ipmi_client, FSP_MCLASS_FETCH_SPDATA); + fsp_register_client(&fsp_ipmi_client_rr, FSP_MCLASS_RR_EVENT); + ipmi_register_backend(&fsp_ipmi_backend); +} diff --git a/roms/skiboot/hw/fsp/fsp-leds.c b/roms/skiboot/hw/fsp/fsp-leds.c new file mode 100644 index 
000000000..5a552ab3e --- /dev/null +++ b/roms/skiboot/hw/fsp/fsp-leds.c @@ -0,0 +1,1939 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* + * LED location code and indicator handling + * + * Copyright 2013-2019 IBM Corp. + */ + +#define pr_fmt(fmt) "FSPLED: " fmt +#include <skiboot.h> +#include <fsp.h> +#include <device.h> +#include <spcn.h> +#include <lock.h> +#include <errorlog.h> +#include <opal.h> +#include <opal-msg.h> +#include <fsp-leds.h> +#include <fsp-sysparam.h> + +#define buf_write(p, type, val) do { *(type *)(p) = val;\ + p += sizeof(type); } while(0) +#define buf_read(p, type, addr) do { *addr = *(type *)(p);\ + p += sizeof(type); } while(0) + +/* SPCN replay threshold */ +#define SPCN_REPLAY_THRESHOLD 2 + +/* LED support status */ +enum led_support_state { + LED_STATE_ABSENT, + LED_STATE_READING, + LED_STATE_PRESENT, +}; + +static enum led_support_state led_support = LED_STATE_ABSENT; + +/* + * PSI mapped buffer for LED data + * + * Mapped once and never unmapped. Used for fetching all + * available LED information and creating the list. Also + * used for setting individual LED state. + * + */ +static void *led_buffer; +static u8 *loc_code_list_buffer = NULL; + +/* Maintain list of all LEDs + * + * The contents here will be used to cater requests from FSP + * async commands and HV initiated OPAL calls. + */ +static struct list_head cec_ledq; /* CEC LED list */ +static struct list_head encl_ledq; /* Enclosure LED list */ +static struct list_head spcn_cmdq; /* SPCN command queue */ + +/* LED lock */ +static struct lock led_lock = LOCK_UNLOCKED; +static struct lock spcn_cmd_lock = LOCK_UNLOCKED; +static struct lock sai_lock = LOCK_UNLOCKED; + +static bool spcn_cmd_complete = true; /* SPCN command complete */ + +/* Last SPCN command */ +static u32 last_spcn_cmd; +static int replay = 0; + +/* + * FSP controls System Attention Indicator. But it expects hypervisor + * keep track of the status and serve get LED state request (both from + * Linux and FSP itself)! 
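+ * sai_data below caches that state: fsp_get_sai() reads the
+ * initial value from the FSP, sai_update_notification() tracks
+ * sysparam updates, and fsp_set_sai() changes it on request
+ * (setting the platform virtual SAI / resetting the real SAI).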
+ */ +static struct sai_data sai_data; + +/* Forward declaration */ +static void fsp_read_leds_data_complete(struct fsp_msg *msg); +static int process_led_state_change(void); + + +DEFINE_LOG_ENTRY(OPAL_RC_LED_SPCN, OPAL_PLATFORM_ERR_EVT, OPAL_LED, + OPAL_PLATFORM_FIRMWARE, OPAL_PREDICTIVE_ERR_GENERAL, + OPAL_NA); + +DEFINE_LOG_ENTRY(OPAL_RC_LED_BUFF, OPAL_PLATFORM_ERR_EVT, OPAL_LED, + OPAL_PLATFORM_FIRMWARE, OPAL_PREDICTIVE_ERR_GENERAL, + OPAL_NA); + +DEFINE_LOG_ENTRY(OPAL_RC_LED_LC, OPAL_PLATFORM_ERR_EVT, OPAL_LED, + OPAL_PLATFORM_FIRMWARE, OPAL_INFO, OPAL_NA); + +DEFINE_LOG_ENTRY(OPAL_RC_LED_STATE, OPAL_PLATFORM_ERR_EVT, OPAL_LED, + OPAL_PLATFORM_FIRMWARE, OPAL_PREDICTIVE_ERR_GENERAL, + OPAL_NA); + +DEFINE_LOG_ENTRY(OPAL_RC_LED_SUPPORT, OPAL_PLATFORM_ERR_EVT, OPAL_LED, + OPAL_PLATFORM_FIRMWARE, OPAL_INFO, OPAL_NA); + + +/* Find descendent LED record with CEC location code in CEC list */ +static struct fsp_led_data *fsp_find_cec_led(char *loc_code) +{ + struct fsp_led_data *led, *next; + + list_for_each_safe(&cec_ledq, led, next, link) { + if (strcmp(led->loc_code, loc_code)) + continue; + return led; + } + return NULL; +} + +/* Find encl LED record with ENCL location code in ENCL list */ +static struct fsp_led_data *fsp_find_encl_led(char *loc_code) +{ + struct fsp_led_data *led, *next; + + list_for_each_safe(&encl_ledq, led, next, link) { + if (strcmp(led->loc_code, loc_code)) + continue; + return led; + } + return NULL; +} + +/* Find encl LED record with CEC location code in CEC list */ +static struct fsp_led_data *fsp_find_encl_cec_led(char *loc_code) +{ + struct fsp_led_data *led, *next; + + list_for_each_safe(&cec_ledq, led, next, link) { + if (strstr(led->loc_code, "-")) + continue; + if (!strstr(loc_code, led->loc_code)) + continue; + return led; + } + return NULL; +} + +/* Find encl LED record with CEC location code in ENCL list */ +static struct fsp_led_data *fsp_find_encl_encl_led(char *loc_code) +{ + struct fsp_led_data *led, *next; + + list_for_each_safe(&encl_ledq, led, next, link) { + if (!strstr(loc_code, led->loc_code)) + continue; + return led; + } + return NULL; +} + +/* Compute the ENCL LED status in CEC list */ +static void compute_encl_status_cec(struct fsp_led_data *encl_led) +{ + struct fsp_led_data *led, *next; + + encl_led->status &= ~SPCN_LED_IDENTIFY_MASK; + encl_led->status &= ~SPCN_LED_FAULT_MASK; + + list_for_each_safe(&cec_ledq, led, next, link) { + if (!strstr(led->loc_code, encl_led->loc_code)) + continue; + + /* Don't count the enclsure LED itself */ + if (!strcmp(led->loc_code, encl_led->loc_code)) + continue; + + if (led->status & SPCN_LED_IDENTIFY_MASK) + encl_led->status |= SPCN_LED_IDENTIFY_MASK; + + if (led->status & SPCN_LED_FAULT_MASK) + encl_led->status |= SPCN_LED_FAULT_MASK; + } +} + +/* Is a enclosure LED */ +static bool is_enclosure_led(char *loc_code) +{ + if (strstr(loc_code, "-")) + return false; + if (!fsp_find_cec_led(loc_code) || !fsp_find_encl_led(loc_code)) + return false; + return true; +} + +static inline void opal_led_update_complete(u64 async_token, u64 result) +{ + opal_queue_msg(OPAL_MSG_ASYNC_COMP, NULL, NULL, + cpu_to_be64(async_token), + cpu_to_be64(result)); +} + +static inline bool is_sai_loc_code(const char *loc_code) +{ + if (!loc_code) + return false; + + if (!strncmp(sai_data.loc_code, loc_code, strlen(sai_data.loc_code))) + return true; + + return false; +} + +/* Set/Reset System attention indicator */ +static void fsp_set_sai_complete(struct fsp_msg *msg) +{ + int ret = OPAL_SUCCESS; + int rc = msg->resp->word1 & 
0xff00; + struct led_set_cmd *spcn_cmd = (struct led_set_cmd *)msg->user_data; + + if (rc) { + /** + * @fwts-label FSPSAIFailed + * @fwts-advice Failed to update System Attention Indicator. + * Likely means some bug with OPAL interacting with FSP. + */ + prlog(PR_ERR, "Update SAI cmd failed [rc=%d].\n", rc); + ret = OPAL_INTERNAL_ERROR; + + /* Roll back */ + lock(&sai_lock); + sai_data.state = spcn_cmd->ckpt_status; + unlock(&sai_lock); + } + + if (spcn_cmd->cmd_src == SPCN_SRC_OPAL) + opal_led_update_complete(spcn_cmd->async_token, ret); + + /* free msg and spcn command */ + free(spcn_cmd); + fsp_freemsg(msg); + + /* Process pending LED update request */ + process_led_state_change(); +} + +static int fsp_set_sai(struct led_set_cmd *spcn_cmd) +{ + int rc = -ENOMEM; + uint32_t cmd = FSP_CMD_SA_INDICATOR; + struct fsp_msg *msg; + + /* + * FSP does not allow hypervisor to set real SAI, but we can + * reset real SAI. Also in our case only host can control + * LEDs, not guests. Hence we will set platform virtual SAI + * and reset real SAI. + */ + if (spcn_cmd->state == LED_STATE_ON) + cmd |= FSP_LED_SET_PLAT_SAI; + else + cmd |= FSP_LED_RESET_REAL_SAI; + + prlog(PR_TRACE, "Update SAI Indicator [cur : 0x%x, new : 0x%x].\n", + sai_data.state, spcn_cmd->state); + + msg = fsp_mkmsg(cmd, 0); + if (!msg) { + /** + * @fwts-label SAIMallocFail + * @fwts-advice OPAL ran out of memory while trying to + * allocate an FSP message in SAI code path. This indicates + * an OPAL bug that caused OPAL to run out of memory. + */ + prlog(PR_ERR, "%s: Memory allocation failed.\n", __func__); + goto sai_fail; + } + + spcn_cmd->ckpt_status = sai_data.state; + msg->user_data = spcn_cmd; + rc = fsp_queue_msg(msg, fsp_set_sai_complete); + if (rc) { + fsp_freemsg(msg); + /** + * @fwts-label SAIQueueFail + * @fwts-advice Error in queueing message to FSP in SAI code + * path. Likely an OPAL bug. + */ + prlog(PR_ERR, "%s: Failed to queue the message\n", __func__); + goto sai_fail; + } + + lock(&sai_lock); + sai_data.state = spcn_cmd->state; + unlock(&sai_lock); + + return OPAL_SUCCESS; + +sai_fail: + if (spcn_cmd->cmd_src == SPCN_SRC_OPAL) + opal_led_update_complete(spcn_cmd->async_token, + OPAL_INTERNAL_ERROR); + + return OPAL_INTERNAL_ERROR; +} + +static void fsp_get_sai_complete(struct fsp_msg *msg) +{ + int rc = msg->resp->word1 & 0xff00; + + if (rc) { + /** + * @fwts-label FSPSAIGetFailed + * @fwts-advice Possibly an error on FSP side, OPAL failed + * to read state from FSP. + */ + prlog(PR_ERR, "Read real SAI cmd failed [rc = 0x%x].\n", rc); + } else { /* Update SAI state */ + lock(&sai_lock); + sai_data.state = fsp_msg_get_data_word(msg->resp, 0) & 0xff; + unlock(&sai_lock); + + prlog(PR_TRACE, "SAI initial state = 0x%x\n", sai_data.state); + } + + fsp_freemsg(msg); +} + +/* Read initial SAI state. */ +static void fsp_get_sai(void) +{ + int rc; + uint32_t cmd = FSP_CMD_SA_INDICATOR | FSP_LED_READ_REAL_SAI; + struct fsp_msg *msg; + + msg = fsp_mkmsg(cmd, 0); + if (!msg) { + /** + * @fwts-label FSPGetSAIMallocFail + * @fwts-advice OPAL ran out of memory: OPAL bug. 
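+ * On failure the cached sai_data.state is simply left at its
+ * initialised value.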
+ */ + prlog(PR_ERR, "%s: Memory allocation failed.\n", __func__); + return; + } + rc = fsp_queue_msg(msg, fsp_get_sai_complete); + if (rc) { + fsp_freemsg(msg); + /** + * @fwts-label FSPGetSAIQueueFail + * @fwts-advice Failed to queue message to FSP: OPAL bug + */ + prlog(PR_ERR, "%s: Failed to queue the message\n", __func__); + } +} + +static bool sai_update_notification(struct fsp_msg *msg) +{ + uint32_t state = fsp_msg_get_data_word(msg, 2); + uint32_t param_id = fsp_msg_get_data_word(msg, 0); + int len = fsp_msg_get_data_word(msg, 1) & 0xffff; + + if (param_id != SYS_PARAM_REAL_SAI && param_id != SYS_PARAM_PLAT_SAI) + return false; + + if (len != 4) + return false; + + if (state != LED_STATE_ON && state != LED_STATE_OFF) + return false; + + /* Update SAI state */ + lock(&sai_lock); + sai_data.state = state; + unlock(&sai_lock); + + prlog(PR_TRACE, "SAI updated. New SAI state = 0x%x\n", state); + return true; +} + + +/* + * Update both the local LED lists to reflect upon led state changes + * occurred with the recent SPCN command. Subsequent LED requests will + * be served with these updates changed to the list. + */ +static void update_led_list(char *loc_code, u32 led_state, u32 excl_bit) +{ + struct fsp_led_data *led = NULL, *encl_led = NULL, *encl_cec_led = NULL; + bool is_encl_led = is_enclosure_led(loc_code); + + /* Enclosure LED in CEC list */ + encl_cec_led = fsp_find_encl_cec_led(loc_code); + if (!encl_cec_led) { + log_simple_error(&e_info(OPAL_RC_LED_LC), + "Could not find enclosure LED in CEC LC=%s\n", + loc_code); + return; + } + + /* Update state */ + if (is_encl_led) { + /* Enclosure exclusive bit */ + encl_cec_led->excl_bit = excl_bit; + } else { /* Descendant LED in CEC list */ + led = fsp_find_cec_led(loc_code); + if (!led) { + log_simple_error(&e_info(OPAL_RC_LED_LC), + "Could not find descendent LED in \ + CEC LC=%s\n", loc_code); + return; + } + led->status = led_state; + } + + /* Enclosure LED in ENCL list */ + encl_led = fsp_find_encl_encl_led(loc_code); + if (!encl_led) { + log_simple_error(&e_info(OPAL_RC_LED_LC), + "Could not find enclosure LED in ENCL LC=%s\n", + loc_code); + return; + } + + /* Compute descendent rolled up status */ + compute_encl_status_cec(encl_cec_led); + + /* Check whether exclussive bits set */ + if (encl_cec_led->excl_bit & FSP_LED_EXCL_FAULT) + encl_cec_led->status |= SPCN_LED_FAULT_MASK; + + if (encl_cec_led->excl_bit & FSP_LED_EXCL_IDENTIFY) + encl_cec_led->status |= SPCN_LED_IDENTIFY_MASK; + + /* Copy over */ + encl_led->status = encl_cec_led->status; + encl_led->excl_bit = encl_cec_led->excl_bit; +} + +static int fsp_set_led_response(uint32_t cmd) +{ + struct fsp_msg *msg; + int rc = -1; + + msg = fsp_mkmsg(cmd, 0); + if (!msg) { + prerror("Failed to allocate FSP_RSP_SET_LED_STATE [cmd=%x])\n", + cmd); + } else { + rc = fsp_queue_msg(msg, fsp_freemsg); + if (rc != OPAL_SUCCESS) { + fsp_freemsg(msg); + prerror("Failed to queue FSP_RSP_SET_LED_STATE" + " [cmd=%x]\n", cmd); + } + } + return rc; +} + +static void fsp_spcn_set_led_completion(struct fsp_msg *msg) +{ + struct fsp_msg *resp = msg->resp; + u32 cmd = FSP_RSP_SET_LED_STATE; + u8 status = resp->word1 & 0xff00; + struct led_set_cmd *spcn_cmd = (struct led_set_cmd *)msg->user_data; + + lock(&led_lock); + + /* + * LED state update request came as part of FSP async message + * FSP_CMD_SET_LED_STATE, we need to send response message. + * + * Also if SPCN command failed, then roll back changes. 
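+ *
+ * Rollback sketch (mirrors the code below): the pre-command
+ * status and excl_bit were checkpointed into spcn_cmd by
+ * fsp_msg_set_led_state(), so on failure we simply replay them
+ * through update_led_list().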
+ */ + if (status != FSP_STATUS_SUCCESS) { + log_simple_error(&e_info(OPAL_RC_LED_SPCN), + "Last SPCN command failed, status=%02x\n", + status); + cmd |= FSP_STATUS_GENERIC_ERROR; + + /* Rollback the changes */ + update_led_list(spcn_cmd->loc_code, + spcn_cmd->ckpt_status, spcn_cmd->ckpt_excl_bit); + } + + /* FSP initiated SPCN command */ + if (spcn_cmd->cmd_src == SPCN_SRC_FSP) + fsp_set_led_response(cmd); + + /* OPAL initiated SPCN command */ + if (spcn_cmd->cmd_src == SPCN_SRC_OPAL) { + if (status != FSP_STATUS_SUCCESS) + opal_led_update_complete(spcn_cmd->async_token, + OPAL_INTERNAL_ERROR); + else + opal_led_update_complete(spcn_cmd->async_token, + OPAL_SUCCESS); + } + + unlock(&led_lock); + + /* free msg and spcn command */ + free(spcn_cmd); + fsp_freemsg(msg); + + /* Process pending LED update request */ + process_led_state_change(); +} + +/* + * Set the state of the LED pointed by the location code + * + * LED command: FAULT state or IDENTIFY state + * LED state : OFF (reset) or ON (set) + * + * SPCN TCE mapped buffer entries for setting LED state + * + * struct spcn_led_data { + * u8 lc_len; + * u16 state; + * char lc_code[LOC_CODE_SIZE]; + *}; + */ +static int fsp_msg_set_led_state(struct led_set_cmd *spcn_cmd) +{ + struct spcn_led_data sled; + struct fsp_msg *msg = NULL; + struct fsp_led_data *led = NULL; + void *buf = led_buffer; + u16 data_len = 0; + u32 cmd_hdr = 0; + u32 cmd = FSP_RSP_SET_LED_STATE; + int rc = -1; + + memset(sled.lc_code, 0, LOC_CODE_SIZE); + sled.lc_len = strlen(spcn_cmd->loc_code); + if (sled.lc_len >= LOC_CODE_SIZE) + sled.lc_len = LOC_CODE_SIZE - 1; + strncpy(sled.lc_code, spcn_cmd->loc_code, LOC_CODE_SIZE - 1); + + lock(&led_lock); + + /* Location code length + Location code + LED control */ + data_len = LOC_CODE_LEN + sled.lc_len + LED_CONTROL_LEN; + cmd_hdr = SPCN_MOD_SET_LED_CTL_LOC_CODE << 24 | SPCN_CMD_SET << 16 | + data_len; + + /* Fetch the current state of LED */ + led = fsp_find_cec_led(spcn_cmd->loc_code); + + /* LED not present */ + if (led == NULL) { + if (spcn_cmd->cmd_src == SPCN_SRC_FSP) { + cmd |= FSP_STATUS_INVALID_LC; + fsp_set_led_response(cmd); + } + + if (spcn_cmd->cmd_src == SPCN_SRC_OPAL) + opal_led_update_complete(spcn_cmd->async_token, + OPAL_INTERNAL_ERROR); + + unlock(&led_lock); + return rc; + } + + /* + * Checkpoint the status here, will use it if the SPCN + * command eventually fails. 
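+ *
+ * Example TCE buffer contents for a hypothetical location code
+ * "U1234.001.ABC0001" (17 characters, illustration only):
+ *
+ *   byte 0       : lc_len = 17
+ *   bytes 1..17  : "U1234.001.ABC0001"
+ *   bytes 18..19 : state, big-endian SPCN_LED_* mask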
+ */ + spcn_cmd->ckpt_status = led->status; + spcn_cmd->ckpt_excl_bit = led->excl_bit; + sled.state = cpu_to_be16(led->status); + + /* Update the exclussive LED bits */ + if (is_enclosure_led(spcn_cmd->loc_code)) { + if (spcn_cmd->command == LED_COMMAND_FAULT) { + if (spcn_cmd->state == LED_STATE_ON) + led->excl_bit |= FSP_LED_EXCL_FAULT; + if (spcn_cmd->state == LED_STATE_OFF) + led->excl_bit &= ~FSP_LED_EXCL_FAULT; + } + + if (spcn_cmd->command == LED_COMMAND_IDENTIFY) { + if (spcn_cmd->state == LED_STATE_ON) + led->excl_bit |= FSP_LED_EXCL_IDENTIFY; + if (spcn_cmd->state == LED_STATE_OFF) + led->excl_bit &= ~FSP_LED_EXCL_IDENTIFY; + } + } + + /* LED FAULT commad */ + if (spcn_cmd->command == LED_COMMAND_FAULT) { + if (spcn_cmd->state == LED_STATE_ON) + sled.state |= cpu_to_be16(SPCN_LED_FAULT_MASK); + if (spcn_cmd->state == LED_STATE_OFF) + sled.state &= cpu_to_be16(~SPCN_LED_FAULT_MASK); + } + + /* LED IDENTIFY command */ + if (spcn_cmd->command == LED_COMMAND_IDENTIFY) { + if (spcn_cmd->state == LED_STATE_ON) + sled.state |= cpu_to_be16(SPCN_LED_IDENTIFY_MASK); + if (spcn_cmd->state == LED_STATE_OFF) + sled.state &= cpu_to_be16(~SPCN_LED_IDENTIFY_MASK); + } + + /* Write into SPCN TCE buffer */ + buf_write(buf, u8, sled.lc_len); /* Location code length */ + memcpy(buf, sled.lc_code, sled.lc_len); /* Location code */ + buf += sled.lc_len; + buf_write(buf, __be16, sled.state); /* LED state */ + + msg = fsp_mkmsg(FSP_CMD_SPCN_PASSTHRU, 4, + SPCN_ADDR_MODE_CEC_NODE, cmd_hdr, 0, PSI_DMA_LED_BUF); + if (!msg) { + cmd |= FSP_STATUS_GENERIC_ERROR; + rc = -1; + goto update_fail; + } + + /* + * Update the local lists based on the attempted SPCN command to + * set/reset an individual led (CEC or ENCL). + */ + update_led_list(spcn_cmd->loc_code, be16_to_cpu(sled.state), led->excl_bit); + msg->user_data = spcn_cmd; + + rc = fsp_queue_msg(msg, fsp_spcn_set_led_completion); + if (rc != OPAL_SUCCESS) { + cmd |= FSP_STATUS_GENERIC_ERROR; + fsp_freemsg(msg); + /* Revert LED state update */ + update_led_list(spcn_cmd->loc_code, spcn_cmd->ckpt_status, + spcn_cmd->ckpt_excl_bit); + } + +update_fail: + if (rc) { + log_simple_error(&e_info(OPAL_RC_LED_STATE), + "Set led state failed at LC=%s\n", + spcn_cmd->loc_code); + + if (spcn_cmd->cmd_src == SPCN_SRC_FSP) + fsp_set_led_response(cmd); + + if (spcn_cmd->cmd_src == SPCN_SRC_OPAL) + opal_led_update_complete(spcn_cmd->async_token, + OPAL_INTERNAL_ERROR); + } + + unlock(&led_lock); + return rc; +} + +/* + * process_led_state_change + * + * If the command queue is empty, it sets the 'spcn_cmd_complete' as true + * and just returns. Else it pops one element from the command queue + * and processes the command for the requested LED state change. + */ +static int process_led_state_change(void) +{ + struct led_set_cmd *spcn_cmd; + int rc = 0; + + /* + * The command queue is empty. This will only + * happen during the SPCN command callback path + * in which case we set 'spcn_cmd_complete' as true. + */ + lock(&spcn_cmd_lock); + if (list_empty(&spcn_cmdq)) { + spcn_cmd_complete = true; + unlock(&spcn_cmd_lock); + return rc; + } + + spcn_cmd = list_pop(&spcn_cmdq, struct led_set_cmd, link); + unlock(&spcn_cmd_lock); + + if (is_sai_loc_code(spcn_cmd->loc_code)) + rc = fsp_set_sai(spcn_cmd); + else + rc = fsp_msg_set_led_state(spcn_cmd); + + if (rc) { + free(spcn_cmd); + process_led_state_change(); + } + + return rc; +} + +/* + * queue_led_state_change + * + * FSP async command or OPAL based request for LED state change gets queued + * up in the command queue. 
If no previous SPCN command is pending, then it + * immediately pops up one element from the list and processes it. If previous + * SPCN commands are still pending then it just queues up and return. When the + * SPCN command callback gets to execute, it processes one element from the + * list and keeps the chain execution going. At last when there are no elements + * in the command queue it sets 'spcn_cmd_complete' as true again. + */ +static int queue_led_state_change(char *loc_code, u8 command, + u8 state, int cmd_src, uint64_t async_token) +{ + struct led_set_cmd *cmd; + int rc = 0; + + /* New request node */ + cmd = zalloc(sizeof(struct led_set_cmd)); + if (!cmd) { + /** + * @fwts-label FSPLEDRequestMallocFail + * @fwts-advice OPAL failed to allocate memory for FSP LED + * command. Likely an OPAL bug led to out of memory. + */ + prlog(PR_ERR, "SPCN set command node allocation failed\n"); + return -1; + } + + /* Save the request */ + strncpy(cmd->loc_code, loc_code, LOC_CODE_SIZE - 1); + cmd->command = command; + cmd->state = state; + cmd->cmd_src = cmd_src; + cmd->async_token = async_token; + + /* Add to the queue */ + lock(&spcn_cmd_lock); + list_add_tail(&spcn_cmdq, &cmd->link); + + /* No previous SPCN command pending */ + if (spcn_cmd_complete) { + spcn_cmd_complete = false; + unlock(&spcn_cmd_lock); + rc = process_led_state_change(); + return rc; + } + + unlock(&spcn_cmd_lock); + return rc; +} + +/* + * Write single location code information into the TCE outbound buffer + * + * Data layout + * + * 2 bytes - Length of location code structure + * 4 bytes - CCIN in ASCII + * 1 byte - Resource status flag + * 1 byte - Indicator state + * 1 byte - Raw loc code length + * 1 byte - Loc code field size + * Field size byte - Null terminated ASCII string padded to 4 byte boundary + * + */ +static u32 fsp_push_data_to_tce(struct fsp_led_data *led, u8 *out_data, + u32 total_size) +{ + struct fsp_loc_code_data lcode; + + /* CCIN value is irrelevant */ + lcode.ccin = 0x0; + + lcode.status = FSP_IND_NOT_IMPLMNTD; + + if (led->parms & SPCN_LED_IDENTIFY_MASK) + lcode.status = FSP_IND_IMPLMNTD; + + /* LED indicator status */ + lcode.ind_state = FSP_IND_INACTIVE; + if (led->status & SPCN_LED_IDENTIFY_MASK) + lcode.ind_state |= FSP_IND_IDENTIFY_ACTV; + if (led->status & SPCN_LED_FAULT_MASK) + lcode.ind_state |= FSP_IND_FAULT_ACTV; + + /* Location code */ + memset(lcode.loc_code, 0, LOC_CODE_SIZE); + lcode.raw_len = strlen(led->loc_code); + strncpy(lcode.loc_code, led->loc_code, LOC_CODE_SIZE - 1); + lcode.fld_sz = sizeof(lcode.loc_code); + + /* Rest of the structure */ + lcode.size = cpu_to_be16(sizeof(lcode)); + lcode.status &= 0x0f; + + /* + * Check for outbound buffer overflow. If there are still + * more LEDs to be sent across to FSP, don't send, ignore. + */ + if ((total_size + be16_to_cpu(lcode.size)) > PSI_DMA_LOC_COD_BUF_SZ) + return 0; + + /* Copy over to the buffer */ + memcpy(out_data, &lcode, sizeof(lcode)); + + return be16_to_cpu(lcode.size); +} + +/* + * Send out LED information structure pointed by "loc_code" + * to FSP through the PSI DMA mapping. Buffer layout structure + * must be followed. 
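+ *
+ * Outbound buffer sketch (as built below, illustrative):
+ *
+ *   bytes 0..3 : total size, including this 8-byte header
+ *   bytes 4..5 : header size (OUTBUF_HEADER_SIZE)
+ *   bytes 6..7 : flags (0x8000 for descendant requests)
+ *   bytes 8..  : per-LED records laid out by fsp_push_data_to_tce()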
+ */ +static void fsp_ret_loc_code_list(u16 req_type, char *loc_code) +{ + struct fsp_led_data *led, *next; + struct fsp_msg *msg; + + u8 *data; /* Start of TCE mapped buffer */ + u8 *out_data; /* Start of location code data */ + u32 bytes_sent = 0, total_size = 0; + u16 header_size = 0, flags = 0; + + if (loc_code_list_buffer == NULL) { + prerror("No loc_code_list_buffer\n"); + return; + } + + /* Init the addresses */ + data = loc_code_list_buffer; + out_data = NULL; + + /* Unmapping through FSP_CMD_RET_LOC_BUFFER command */ + fsp_tce_map(PSI_DMA_LOC_COD_BUF, (void *)data, PSI_DMA_LOC_COD_BUF_SZ); + out_data = data + 8; + + /* CEC LED list */ + list_for_each_safe(&cec_ledq, led, next, link) { + /* + * When the request type is system wide led list + * i.e GET_LC_CMPLT_SYS, send the entire contents + * of the CEC list including both all descendents + * and all of their enclosures. + */ + + if (req_type == GET_LC_ENCLOSURES) + break; + + if (req_type == GET_LC_ENCL_DESCENDANTS) { + if (strstr(led->loc_code, loc_code) == NULL) + continue; + } + + if (req_type == GET_LC_SINGLE_LOC_CODE) { + if (strcmp(led->loc_code, loc_code)) + continue; + } + + /* Push the data into TCE buffer */ + bytes_sent = fsp_push_data_to_tce(led, out_data, total_size); + + /* Advance the TCE pointer */ + out_data += bytes_sent; + total_size += bytes_sent; + } + + /* Enclosure LED list */ + if (req_type == GET_LC_ENCLOSURES) { + list_for_each_safe(&encl_ledq, led, next, link) { + + /* Push the data into TCE buffer */ + bytes_sent = fsp_push_data_to_tce(led, + out_data, total_size); + + /* Advance the TCE pointer */ + out_data += bytes_sent; + total_size += bytes_sent; + } + } + + /* Count from 'data' instead of 'data_out' */ + total_size += 8; + memcpy(data, &total_size, sizeof(total_size)); + + header_size = OUTBUF_HEADER_SIZE; + memcpy(data + sizeof(total_size), &header_size, sizeof(header_size)); + + if (req_type == GET_LC_ENCL_DESCENDANTS) + flags = 0x8000; + + memcpy(data + sizeof(total_size) + sizeof(header_size), &flags, + sizeof(flags)); + msg = fsp_mkmsg(FSP_RSP_GET_LED_LIST, 3, 0, + PSI_DMA_LOC_COD_BUF, total_size); + if (!msg) { + prerror("Failed to allocate FSP_RSP_GET_LED_LIST.\n"); + } else { + if (fsp_queue_msg(msg, fsp_freemsg)) { + fsp_freemsg(msg); + prerror("Failed to queue FSP_RSP_GET_LED_LIST\n"); + } + } +} + +/* + * FSP async command: FSP_CMD_GET_LED_LIST + * + * (1) FSP sends the list of location codes through inbound buffer + * (2) HV sends the status of those location codes through outbound buffer + * + * Inbound buffer data layout (loc code request structure) + * + * 2 bytes - Length of entire structure + * 2 bytes - Request type + * 1 byte - Raw length of location code + * 1 byte - Location code field size + * `Field size` bytes - NULL terminated ASCII location code string + */ +static void fsp_get_led_list(struct fsp_msg *msg) +{ + struct fsp_loc_code_req req; + u32 tce_token = fsp_msg_get_data_word(msg, 1); + void *buf; + + /* Parse inbound buffer */ + buf = fsp_inbound_buf_from_tce(tce_token); + if (!buf) { + struct fsp_msg *msg; + msg = fsp_mkmsg(FSP_RSP_GET_LED_LIST | FSP_STATUS_INVALID_DATA, + 0); + if (!msg) { + prerror("Failed to allocate FSP_RSP_GET_LED_LIST" + " | FSP_STATUS_INVALID_DATA\n"); + } else { + if (fsp_queue_msg(msg, fsp_freemsg)) { + fsp_freemsg(msg); + prerror("Failed to queue " + "FSP_RSP_GET_LED_LIST |" + " FSP_STATUS_INVALID_DATA\n"); + } + } + return; + } + memcpy(&req, buf, sizeof(req)); + + prlog(PR_TRACE, "Request for loc code list type 0x%04x LC=%s\n", + 
be16_to_cpu(req.req_type), req.loc_code); + + fsp_ret_loc_code_list(be16_to_cpu(req.req_type), req.loc_code); +} + +/* + * FSP async command: FSP_CMD_RET_LOC_BUFFER + * + * With this command FSP returns ownership of the outbound buffer + * used by Sapphire to pass the indicator list previous time. That + * way FSP tells Sapphire that it has consumed all the data present + * on the outbound buffer and Sapphire can reuse it for next request. + */ +static void fsp_free_led_list_buf(struct fsp_msg *msg) +{ + u32 tce_token = fsp_msg_get_data_word(msg, 1); + u32 cmd = FSP_RSP_RET_LED_BUFFER; + struct fsp_msg *resp; + + /* Token does not point to outbound buffer */ + if (tce_token != PSI_DMA_LOC_COD_BUF) { + log_simple_error(&e_info(OPAL_RC_LED_BUFF), + "Invalid tce token from FSP\n"); + cmd |= FSP_STATUS_GENERIC_ERROR; + resp = fsp_mkmsg(cmd, 0); + if (!resp) { + prerror("Failed to allocate FSP_RSP_RET_LED_BUFFER" + "| FSP_STATUS_GENERIC_ERROR\n"); + return; + } + + if (fsp_queue_msg(resp, fsp_freemsg)) { + fsp_freemsg(resp); + prerror("Failed to queue " + "RET_LED_BUFFER|ERROR\n"); + } + return; + } + + /* Unmap the location code DMA buffer */ + fsp_tce_unmap(PSI_DMA_LOC_COD_BUF, PSI_DMA_LOC_COD_BUF_SZ); + + resp = fsp_mkmsg(cmd, 0); + if (!resp) { + prerror("Failed to allocate FSP_RSP_RET_LED_BUFFER\n"); + return; + } + if (fsp_queue_msg(resp, fsp_freemsg)) { + fsp_freemsg(resp); + prerror("Failed to queue FSP_RSP_RET_LED_BUFFER\n"); + } +} + +static void fsp_ret_led_state(char *loc_code) +{ + bool found = false; + u8 ind_state = 0; + u32 cmd = FSP_RSP_GET_LED_STATE; + struct fsp_led_data *led, *next; + struct fsp_msg *msg; + + if (is_sai_loc_code(loc_code)) { + if (sai_data.state & OPAL_SLOT_LED_STATE_ON) + ind_state = FSP_IND_FAULT_ACTV; + found = true; + } else { + list_for_each_safe(&cec_ledq, led, next, link) { + if (strcmp(loc_code, led->loc_code)) + continue; + + /* Found the location code */ + if (led->status & SPCN_LED_IDENTIFY_MASK) + ind_state |= FSP_IND_IDENTIFY_ACTV; + if (led->status & SPCN_LED_FAULT_MASK) + ind_state |= FSP_IND_FAULT_ACTV; + + found = true; + break; + } + } + + /* Location code not found */ + if (!found) { + log_simple_error(&e_info(OPAL_RC_LED_LC), + "Could not find the location code LC=%s\n", + loc_code); + cmd |= FSP_STATUS_INVALID_LC; + ind_state = 0xff; + } + + msg = fsp_mkmsg(cmd, 1, ind_state); + if (!msg) { + prerror("Couldn't alloc FSP_RSP_GET_LED_STATE\n"); + return; + } + + if (fsp_queue_msg(msg, fsp_freemsg)) { + fsp_freemsg(msg); + prerror("Couldn't queue FSP_RSP_GET_LED_STATE\n"); + } +} + +/* + * FSP async command: FSP_CMD_GET_LED_STATE + * + * With this command FSP query the state for any given LED + */ +static void fsp_get_led_state(struct fsp_msg *msg) +{ + struct fsp_get_ind_state_req req; + u32 tce_token = fsp_msg_get_data_word(msg, 1); + void *buf; + + /* Parse the inbound buffer */ + buf = fsp_inbound_buf_from_tce(tce_token); + if (!buf) { + struct fsp_msg *msg; + msg = fsp_mkmsg(FSP_RSP_GET_LED_STATE | + FSP_STATUS_INVALID_DATA, 0); + if (!msg) { + prerror("Failed to allocate FSP_RSP_GET_LED_STATE" + " | FSP_STATUS_INVALID_DATA\n"); + return; + } + if (fsp_queue_msg(msg, fsp_freemsg)) { + fsp_freemsg(msg); + prerror("Failed to queue FSP_RSP_GET_LED_STATE" + " | FSP_STATUS_INVALID_DATA\n"); + } + return; + } + memcpy(&req, buf, sizeof(req)); + + prlog(PR_TRACE, "%s: tce=0x%08x buf=%p rq.sz=%d rq.lc_len=%d" + " rq.fld_sz=%d LC: %02x %02x %02x %02x....\n", __func__, + tce_token, buf, req.size, req.lc_len, req.fld_sz, + req.loc_code[0], 
req.loc_code[1], + req.loc_code[2], req.loc_code[3]); + + /* Bound check */ + if (req.lc_len >= LOC_CODE_SIZE) { + log_simple_error(&e_info(OPAL_RC_LED_LC), + "Loc code too large in %s: %d bytes\n", + __func__, req.lc_len); + req.lc_len = LOC_CODE_SIZE - 1; + } + /* Ensure NULL termination */ + req.loc_code[req.lc_len] = 0; + + /* Do the deed */ + fsp_ret_led_state(req.loc_code); +} + +/* + * FSP async command: FSP_CMD_SET_LED_STATE + * + * With this command FSP sets/resets the state for any given LED + */ +static void fsp_set_led_state(struct fsp_msg *msg) +{ + struct fsp_set_ind_state_req req; + struct fsp_led_data *led, *next; + u32 tce_token = fsp_msg_get_data_word(msg, 1); + bool command, state; + void *buf; + int rc; + + /* Parse the inbound buffer */ + buf = fsp_inbound_buf_from_tce(tce_token); + if (!buf) { + fsp_set_led_response(FSP_RSP_SET_LED_STATE | + FSP_STATUS_INVALID_DATA); + return; + } + memcpy(&req, buf, sizeof(req)); + + prlog(PR_TRACE, "%s: tce=0x%08x buf=%p rq.sz=%d rq.typ=0x%04x" + " rq.lc_len=%d rq.fld_sz=%d LC: %02x %02x %02x %02x....\n", + __func__, tce_token, buf, be16_to_cpu(req.size), req.lc_len, req.fld_sz, + be16_to_cpu(req.req_type), + req.loc_code[0], req.loc_code[1], + req.loc_code[2], req.loc_code[3]); + + /* Bound check */ + if (req.lc_len >= LOC_CODE_SIZE) { + log_simple_error(&e_info(OPAL_RC_LED_LC), + "Loc code too large in %s: %d bytes\n", + __func__, req.lc_len); + req.lc_len = LOC_CODE_SIZE - 1; + } + /* Ensure NULL termination */ + req.loc_code[req.lc_len] = 0; + + /* Decode command */ + command = (req.ind_state & LOGICAL_IND_STATE_MASK) ? + LED_COMMAND_FAULT : LED_COMMAND_IDENTIFY; + state = (req.ind_state & ACTIVE_LED_STATE_MASK) ? + LED_STATE_ON : LED_STATE_OFF; + + /* Handle requests */ + switch (be16_to_cpu(req.req_type)) { + case SET_IND_ENCLOSURE: + list_for_each_safe(&cec_ledq, led, next, link) { + /* Only descendants of the same enclosure */ + if (!strstr(led->loc_code, req.loc_code)) + continue; + + /* Skip the enclosure */ + if (!strcmp(led->loc_code, req.loc_code)) + continue; + + rc = queue_led_state_change(led->loc_code, command, + state, SPCN_SRC_FSP, 0); + if (rc != 0) + fsp_set_led_response(FSP_RSP_SET_LED_STATE | + FSP_STATUS_GENERIC_ERROR); + } + break; + case SET_IND_SINGLE_LOC_CODE: + /* Set led state for single descendent led */ + rc = queue_led_state_change(req.loc_code, + command, state, SPCN_SRC_FSP, 0); + if (rc != 0) + fsp_set_led_response(FSP_RSP_SET_LED_STATE | + FSP_STATUS_GENERIC_ERROR); + break; + default: + fsp_set_led_response(FSP_RSP_SET_LED_STATE | + FSP_STATUS_NOT_SUPPORTED); + break; + } +} + +/* Handle received indicator message from FSP */ +static bool fsp_indicator_message(u32 cmd_sub_mod, struct fsp_msg *msg) +{ + u32 cmd; + struct fsp_msg *resp; + + /* LED support not available yet */ + if (led_support != LED_STATE_PRESENT) { + log_simple_error(&e_info(OPAL_RC_LED_SUPPORT), + "Indicator message while LED support not" + " available yet\n"); + return false; + } + + switch (cmd_sub_mod) { + case FSP_CMD_GET_LED_LIST: + prlog(PR_TRACE, "FSP_CMD_GET_LED_LIST command received\n"); + fsp_get_led_list(msg); + return true; + case FSP_CMD_RET_LED_BUFFER: + prlog(PR_TRACE, "FSP_CMD_RET_LED_BUFFER command received\n"); + fsp_free_led_list_buf(msg); + return true; + case FSP_CMD_GET_LED_STATE: + prlog(PR_TRACE, "FSP_CMD_GET_LED_STATE command received\n"); + fsp_get_led_state(msg); + return true; + case FSP_CMD_SET_LED_STATE: + prlog(PR_TRACE, "FSP_CMD_SET_LED_STATE command received\n"); + fsp_set_led_state(msg); + 
return true; + /* + * FSP async sub commands which have not been implemented. + * For these async sub commands, print for the log and ack + * the field service processor with a generic error. + */ + case FSP_CMD_GET_MTMS_LIST: + prlog(PR_TRACE, "FSP_CMD_GET_MTMS_LIST command received\n"); + cmd = FSP_RSP_GET_MTMS_LIST; + break; + case FSP_CMD_RET_MTMS_BUFFER: + prlog(PR_TRACE, "FSP_CMD_RET_MTMS_BUFFER command received\n"); + cmd = FSP_RSP_RET_MTMS_BUFFER; + break; + case FSP_CMD_SET_ENCL_MTMS: + prlog(PR_TRACE, "FSP_CMD_SET_MTMS command received\n"); + cmd = FSP_RSP_SET_ENCL_MTMS; + break; + case FSP_CMD_CLR_INCT_ENCL: + prlog(PR_TRACE, "FSP_CMD_CLR_INCT_ENCL command received\n"); + cmd = FSP_RSP_CLR_INCT_ENCL; + break; + case FSP_CMD_ENCL_MCODE_INIT: + prlog(PR_TRACE, "FSP_CMD_ENCL_MCODE_INIT command received\n"); + cmd = FSP_RSP_ENCL_MCODE_INIT; + break; + case FSP_CMD_ENCL_MCODE_INTR: + prlog(PR_TRACE, "FSP_CMD_ENCL_MCODE_INTR command received\n"); + cmd = FSP_RSP_ENCL_MCODE_INTR; + break; + case FSP_CMD_ENCL_POWR_TRACE: + prlog(PR_TRACE, "FSP_CMD_ENCL_POWR_TRACE command received\n"); + cmd = FSP_RSP_ENCL_POWR_TRACE; + break; + case FSP_CMD_RET_ENCL_TRACE_BUFFER: + prlog(PR_TRACE, "FSP_CMD_RET_ENCL_TRACE_BUFFER command received\n"); + cmd = FSP_RSP_RET_ENCL_TRACE_BUFFER; + break; + case FSP_CMD_GET_SPCN_LOOP_STATUS: + prlog(PR_TRACE, "FSP_CMD_GET_SPCN_LOOP_STATUS command received\n"); + cmd = FSP_RSP_GET_SPCN_LOOP_STATUS; + break; + case FSP_CMD_INITIATE_LAMP_TEST: + /* XXX: FSP ACK not required for this sub command */ + prlog(PR_TRACE, "FSP_CMD_INITIATE_LAMP_TEST command received\n"); + return true; + default: + return false; + } + cmd |= FSP_STATUS_GENERIC_ERROR; + resp = fsp_mkmsg(cmd, 0); + if (!resp) { + prerror("Failed to allocate FSP_STATUS_GENERIC_ERROR\n"); + return false; + } + if (fsp_queue_msg(resp, fsp_freemsg)) { + fsp_freemsg(resp); + prerror("Failed to queue FSP_STATUS_GENERIC_ERROR\n"); + return false; + } + return true; +} + +/* Indicator class client */ +static struct fsp_client fsp_indicator_client = { + .message = fsp_indicator_message, +}; + + +static int fsp_opal_get_sai(__be64 *led_mask, __be64 *led_value) +{ + *led_mask |= cpu_to_be64(OPAL_SLOT_LED_STATE_ON << OPAL_SLOT_LED_TYPE_ATTN); + if (sai_data.state & OPAL_SLOT_LED_STATE_ON) + *led_value |= cpu_to_be64(OPAL_SLOT_LED_STATE_ON << OPAL_SLOT_LED_TYPE_ATTN); + + return OPAL_SUCCESS; +} + +static int fsp_opal_set_sai(uint64_t async_token, char *loc_code, + const u64 led_mask, const u64 led_value) +{ + int state = LED_STATE_OFF; + + if (!((led_mask >> OPAL_SLOT_LED_TYPE_ATTN) & OPAL_SLOT_LED_STATE_ON)) + return OPAL_PARAMETER; + + if ((led_value >> OPAL_SLOT_LED_TYPE_ATTN) & OPAL_SLOT_LED_STATE_ON) + state = LED_STATE_ON; + + return queue_led_state_change(loc_code, 0, + state, SPCN_SRC_OPAL, async_token); +} + +/* + * fsp_opal_leds_get_ind (OPAL_LEDS_GET_INDICATOR) + * + * Argument Description Updated By + * -------- ----------- ---------- + * loc_code Location code of the LEDs (Host) + * led_mask LED types whose status is available (OPAL) + * led_value Status of the available LED types (OPAL) + * max_led_type Maximum number of supported LED types (Host/OPAL) + * + * The host will pass the location code of the LED types (loc_code) and + * maximum number of LED types it understands (max_led_type). OPAL will + * update the 'led_mask' with set bits pointing to LED types whose status + * is available and updates the 'led_value' with actual status. 
OPAL checks + * the 'max_led_type' to understand whether the host is newer or older + * compared to itself. In the case where the OPAL is newer compared + * to host (OPAL's max_led_type > host's max_led_type), it will update + * led_mask and led_value according to max_led_type requested by the host. + * When the host is newer compared to the OPAL (host's max_led_type > + * OPAL's max_led_type), OPAL updates 'max_led_type' to the maximum + * number of LED type it understands and updates 'led_mask', 'led_value' + * based on that maximum value of LED types. + */ +static int64_t fsp_opal_leds_get_ind(char *loc_code, __be64 *led_mask, + __be64 *led_value, __be64 *max_led_type) +{ + bool supported = true; + int64_t max; + int rc; + struct fsp_led_data *led; + + /* FSP not present */ + if (!fsp_present()) + return OPAL_HARDWARE; + + /* LED support not available */ + if (led_support != LED_STATE_PRESENT) + return OPAL_HARDWARE; + + max = be64_to_cpu(*max_led_type); + + /* Adjust max LED type */ + if (max > OPAL_SLOT_LED_TYPE_MAX) { + supported = false; + max = OPAL_SLOT_LED_TYPE_MAX; + *max_led_type = cpu_to_be64(max); + } + + /* Invalid parameter */ + if (max <= 0) + return OPAL_PARAMETER; + + /* Get System attention indicator state */ + if (is_sai_loc_code(loc_code)) { + rc = fsp_opal_get_sai(led_mask, led_value); + return rc; + } + + /* LED not found */ + led = fsp_find_cec_led(loc_code); + if (!led) + return OPAL_PARAMETER; + + *led_mask = 0; + *led_value = 0; + + /* Identify LED */ + --max; + *led_mask |= cpu_to_be64(OPAL_SLOT_LED_STATE_ON << OPAL_SLOT_LED_TYPE_ID); + if (led->status & SPCN_LED_IDENTIFY_MASK) + *led_value |= cpu_to_be64(OPAL_SLOT_LED_STATE_ON << OPAL_SLOT_LED_TYPE_ID); + + /* Fault LED */ + if (!max) + return OPAL_SUCCESS; + + --max; + *led_mask |= cpu_to_be64(OPAL_SLOT_LED_STATE_ON << OPAL_SLOT_LED_TYPE_FAULT); + if (led->status & SPCN_LED_FAULT_MASK) + *led_value |= cpu_to_be64(OPAL_SLOT_LED_STATE_ON << OPAL_SLOT_LED_TYPE_FAULT); + + /* OPAL doesn't support all the LED type requested by payload */ + if (!supported) + return OPAL_PARTIAL; + + return OPAL_SUCCESS; +} + +/* + * fsp_opal_leds_set_ind (OPAL_LEDS_SET_INDICATOR) + * + * Argument Description Updated By + * -------- ----------- ---------- + * loc_code Location code of the LEDs (Host) + * led_mask LED types whose status will be updated (Host) + * led_value Requested status of various LED types (Host) + * max_led_type Maximum number of supported LED types (Host/OPAL) + * + * The host will pass the location code of the LED types, mask, value + * and maximum number of LED types it understands. OPAL will update + * LED status for all the LED types mentioned in the mask with their + * value mentioned. OPAL checks the 'max_led_type' to understand + * whether the host is newer or older compared to itself. In case where + * the OPAL is newer compared to the host (OPAL's max_led_type > + * host's max_led_type), it updates LED status based on max_led_type + * requested from the host. When the host is newer compared to the OPAL + * (host's max_led_type > OPAL's max_led_type), OPAL updates + * 'max_led_type' to the maximum number of LED type it understands and + * then it updates LED status based on that updated maximum value of LED + * types. Host needs to check the returned updated value of max_led_type + * to figure out which part of it's request got served and which ones got + * ignored. 
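Editor's note: both OPAL_LEDS_GET_INDICATOR and OPAL_LEDS_SET_INDICATOR use the same one-bit-per-LED-type convention for led_mask/led_value plus a negotiated max_led_type. Below is a small stand-alone sketch of that encoding and of the clamping step; the SLOT_LED_* values are illustrative stand-ins, not the opal-api.h definitions.

#include <stdint.h>
#include <stdio.h>

/* Illustrative stand-ins for the slot LED encoding. */
enum { SLOT_LED_TYPE_ID = 0, SLOT_LED_TYPE_FAULT = 1, SLOT_LED_TYPE_ATTN = 2,
       SLOT_LED_TYPE_MAX = 3 };
#define SLOT_LED_STATE_ON 0x01u

/* One bit per LED type: set in 'mask' if reported, in 'value' if lit. */
static void report_led(uint64_t *mask, uint64_t *value, int type, int lit)
{
        *mask |= (uint64_t)SLOT_LED_STATE_ON << type;
        if (lit)
                *value |= (uint64_t)SLOT_LED_STATE_ON << type;
}

int main(void)
{
        uint64_t mask = 0, value = 0, max_led_type = 8;

        /* Both sides agree on the smaller of the two type counts. */
        if (max_led_type > SLOT_LED_TYPE_MAX)
                max_led_type = SLOT_LED_TYPE_MAX;

        report_led(&mask, &value, SLOT_LED_TYPE_ID, 1);     /* identify on */
        report_led(&mask, &value, SLOT_LED_TYPE_FAULT, 0);  /* fault off   */

        printf("max=%llu mask=0x%llx value=0x%llx fault lit=%d\n",
               (unsigned long long)max_led_type,
               (unsigned long long)mask, (unsigned long long)value,
               (int)((value >> SLOT_LED_TYPE_FAULT) & SLOT_LED_STATE_ON));
        return 0;
}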
+ */ +static int64_t fsp_opal_leds_set_ind(uint64_t async_token, + char *loc_code, const u64 led_mask, + const u64 led_value, __be64 *max_led_type) +{ + bool supported = true; + int command, state, rc = OPAL_SUCCESS; + int64_t max; + struct fsp_led_data *led; + + /* FSP not present */ + if (!fsp_present()) + return OPAL_HARDWARE; + + /* LED support not available */ + if (led_support != LED_STATE_PRESENT) + return OPAL_HARDWARE; + + max = be64_to_cpu(*max_led_type); + + /* Adjust max LED type */ + if (max > OPAL_SLOT_LED_TYPE_MAX) { + supported = false; + max = OPAL_SLOT_LED_TYPE_MAX; + *max_led_type = cpu_to_be64(max); + } + + /* Invalid parameter */ + if (max <= 0) + return OPAL_PARAMETER; + + /* Set System attention indicator state */ + if (is_sai_loc_code(loc_code)) { + supported = true; + rc = fsp_opal_set_sai(async_token, + loc_code, led_mask, led_value); + goto success; + } + + /* LED not found */ + led = fsp_find_cec_led(loc_code); + if (!led) + return OPAL_PARAMETER; + + /* Indentify LED mask */ + --max; + + if ((led_mask >> OPAL_SLOT_LED_TYPE_ID) & OPAL_SLOT_LED_STATE_ON) { + supported = true; + + command = LED_COMMAND_IDENTIFY; + state = LED_STATE_OFF; + if ((led_value >> OPAL_SLOT_LED_TYPE_ID) + & OPAL_SLOT_LED_STATE_ON) + state = LED_STATE_ON; + + rc = queue_led_state_change(loc_code, command, + state, SPCN_SRC_OPAL, async_token); + } + + if (!max) + goto success; + + /* Fault LED mask */ + --max; + if ((led_mask >> OPAL_SLOT_LED_TYPE_FAULT) & OPAL_SLOT_LED_STATE_ON) { + supported = true; + + command = LED_COMMAND_FAULT; + state = LED_STATE_OFF; + if ((led_value >> OPAL_SLOT_LED_TYPE_FAULT) + & OPAL_SLOT_LED_STATE_ON) + state = LED_STATE_ON; + + rc = queue_led_state_change(loc_code, command, + state, SPCN_SRC_OPAL, async_token); + } + +success: + /* Unsupported LED type */ + if (!supported) + return OPAL_UNSUPPORTED; + + if (rc == OPAL_SUCCESS) + rc = OPAL_ASYNC_COMPLETION; + else + rc = OPAL_INTERNAL_ERROR; + + return rc; +} + +/* Get LED node from device tree */ +static struct dt_node *dt_get_led_node(void) +{ + struct dt_node *pled; + + if (!opal_node) { + prlog(PR_WARNING, "OPAL parent device node not available\n"); + return NULL; + } + + pled = dt_find_by_path(opal_node, DT_PROPERTY_LED_NODE); + if (!pled) + prlog(PR_WARNING, "Parent device node not available\n"); + + return pled; +} + +/* Get System attention indicator location code from device tree */ +static void dt_get_sai_loc_code(void) +{ + struct dt_node *pled, *child; + const char *led_type = NULL; + + memset(sai_data.loc_code, 0, LOC_CODE_SIZE); + + pled = dt_get_led_node(); + if (!pled) + return; + + list_for_each(&pled->children, child, list) { + led_type = dt_prop_get(child, DT_PROPERTY_LED_TYPES); + if (!led_type) + continue; + + if (strcmp(led_type, LED_TYPE_ATTENTION)) + continue; + + memcpy(sai_data.loc_code, child->name, LOC_CODE_SIZE - 1); + + prlog(PR_TRACE, "SAI Location code = %s\n", sai_data.loc_code); + return; + } +} + +/* + * create_led_device_node + * + * Creates the system parent LED device node and all individual + * child LED device nodes under it. This is called right before + * starting the payload (Linux) to ensure that the SPCN command + * sequence to fetch the LED location code list has been finished + * and to have a better chance of creating the deviced nodes. 
+ */ +void create_led_device_nodes(void) +{ + const char *led_mode = NULL; + struct fsp_led_data *led, *next; + struct dt_node *pled, *cled; + + if (!fsp_present()) + return; + + /* Make sure LED list read is completed */ + while (led_support == LED_STATE_READING) + opal_run_pollers(); + + if (led_support == LED_STATE_ABSENT) { + prlog(PR_WARNING, "LED support not available, \ + hence device tree nodes will not be created\n"); + return; + } + + /* Get LED node */ + pled = dt_get_led_node(); + if (!pled) + return; + + /* Check if already populated (fast-reboot) */ + if (dt_has_node_property(pled, "compatible", NULL)) + return; + dt_add_property_strings(pled, "compatible", DT_PROPERTY_LED_COMPATIBLE); + + led_mode = dt_prop_get(pled, DT_PROPERTY_LED_MODE); + if (!led_mode) { + prlog(PR_WARNING, "Unknown LED operating mode\n"); + return; + } + + /* LED child nodes */ + list_for_each_safe(&cec_ledq, led, next, link) { + /* Duplicate LED location code */ + if (dt_find_by_path(pled, led->loc_code)) { + prlog(PR_WARNING, "duplicate location code %s\n", + led->loc_code); + continue; + } + + cled = dt_new(pled, led->loc_code); + if (!cled) { + prlog(PR_WARNING, "Child device node creation " + "failed\n"); + continue; + } + + if (!strcmp(led_mode, LED_MODE_LIGHT_PATH)) + dt_add_property_strings(cled, DT_PROPERTY_LED_TYPES, + LED_TYPE_IDENTIFY, + LED_TYPE_FAULT); + else + dt_add_property_strings(cled, DT_PROPERTY_LED_TYPES, + LED_TYPE_IDENTIFY); + } +} + +/* + * Process the received LED data from SPCN + * + * Every LED state data is added into the CEC list. If the location + * code is a enclosure type, its added into the enclosure list as well. + * + */ +static void fsp_process_leds_data(u16 len) +{ + struct fsp_led_data *led_data = NULL; + void *buf = NULL; + + /* + * Process the entire captured data from the last command + * + * TCE mapped 'led_buffer' contains the fsp_led_data structure + * one after the other till the total length 'len'. + * + */ + buf = led_buffer; + while (len) { + size_t lc_len; + __be16 tmp; + + /* Prepare */ + led_data = zalloc(sizeof(struct fsp_led_data)); + assert(led_data); + + /* Resource ID */ + buf_read(buf, __be16, &tmp); + led_data->rid = be16_to_cpu(tmp); + len -= sizeof(led_data->rid); + + /* Location code length */ + buf_read(buf, u8, &led_data->lc_len); + len -= sizeof(led_data->lc_len); + + lc_len = led_data->lc_len; + if (lc_len == 0) { + free(led_data); + break; + } + + if (lc_len >= LOC_CODE_SIZE) + lc_len = LOC_CODE_SIZE - 1; + + /* Location code */ + strncpy(led_data->loc_code, buf, lc_len); + led_data->loc_code[lc_len] = '\0'; + + buf += led_data->lc_len; + len -= led_data->lc_len; + + /* Parameters */ + buf_read(buf, __be16, &tmp); + led_data->parms = be16_to_cpu(tmp); + len -= sizeof(led_data->parms); + + /* Status */ + buf_read(buf, __be16, &tmp); + led_data->status = be16_to_cpu(tmp); + len -= sizeof(led_data->status); + + /* + * This is Enclosure LED's location code, need to go + * inside the enclosure LED list as well. 
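Editor's note: the record walk above can be reproduced outside skiboot. This sketch parses the same on-the-wire layout (big-endian resource ID, one length byte, the raw location code, then big-endian parms and status) from a flat buffer, stopping on a zero-length code just as the loop above does; the struct name and the 80-byte cap are local assumptions.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define LOC_CODE_MAX 80   /* assumed cap for the example */

struct led_record {
        uint16_t rid, parms, status;
        char loc_code[LOC_CODE_MAX];
};

static uint16_t get_be16(const uint8_t *p) { return (p[0] << 8) | p[1]; }

/* Parse one record; returns bytes consumed, 0 on end or short buffer. */
static size_t parse_led_record(const uint8_t *buf, size_t len,
                               struct led_record *out)
{
        size_t lc_len, need;

        if (len < 3)
                return 0;
        lc_len = buf[2];
        need = 2 + 1 + lc_len + 2 + 2;
        if (lc_len == 0 || len < need)
                return 0;

        out->rid = get_be16(&buf[0]);
        memset(out->loc_code, 0, sizeof(out->loc_code));
        memcpy(out->loc_code, &buf[3],
               lc_len < LOC_CODE_MAX ? lc_len : LOC_CODE_MAX - 1);
        out->parms  = get_be16(&buf[3 + lc_len]);
        out->status = get_be16(&buf[3 + lc_len + 2]);
        return need;
}

int main(void)
{
        /* rid=0x0001, lc_len=2, "P1", parms=0x00c0, status=0x4000 */
        const uint8_t wire[] = { 0x00, 0x01, 0x02, 'P', '1',
                                 0x00, 0xc0, 0x40, 0x00 };
        struct led_record rec;
        size_t used = parse_led_record(wire, sizeof(wire), &rec);

        printf("used=%zu rid=%04x lc=%s parms=%04x status=%04x\n",
               used, rec.rid, rec.loc_code, rec.parms, rec.status);
        return 0;
}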
+ */ + if (!strstr(led_data->loc_code, "-")) { + struct fsp_led_data *encl_led_data = NULL; + encl_led_data = zalloc(sizeof(struct fsp_led_data)); + assert(encl_led_data); + + /* copy over the original */ + memcpy(encl_led_data, led_data, sizeof(struct fsp_led_data)); + + /* Add to the list of enclosure LEDs */ + list_add_tail(&encl_ledq, &encl_led_data->link); + } + + /* Push this onto the list */ + list_add_tail(&cec_ledq, &led_data->link); + } +} + +/* Replay the SPCN command */ +static void replay_spcn_cmd(u32 last_spcn_cmd) +{ + u32 cmd_hdr = 0; + int rc = -1; + + /* Reached threshold */ + if (replay == SPCN_REPLAY_THRESHOLD) { + replay = 0; + led_support = LED_STATE_ABSENT; + return; + } + + replay++; + if (last_spcn_cmd == SPCN_MOD_PRS_LED_DATA_FIRST) { + cmd_hdr = SPCN_MOD_PRS_LED_DATA_FIRST << 24 | + SPCN_CMD_PRS << 16; + rc = fsp_queue_msg(fsp_mkmsg(FSP_CMD_SPCN_PASSTHRU, 4, + SPCN_ADDR_MODE_CEC_NODE, + cmd_hdr, 0, + PSI_DMA_LED_BUF), + fsp_read_leds_data_complete); + if (rc) + prlog(PR_ERR, "Replay SPCN_MOD_PRS_LED_DATA_FIRST" + " command could not be queued\n"); + } + + if (last_spcn_cmd == SPCN_MOD_PRS_LED_DATA_SUB) { + cmd_hdr = SPCN_MOD_PRS_LED_DATA_SUB << 24 | SPCN_CMD_PRS << 16; + rc = fsp_queue_msg(fsp_mkmsg(FSP_CMD_SPCN_PASSTHRU, 4, + SPCN_ADDR_MODE_CEC_NODE, cmd_hdr, + 0, PSI_DMA_LED_BUF), + fsp_read_leds_data_complete); + if (rc) + prlog(PR_ERR, "Replay SPCN_MOD_PRS_LED_DATA_SUB" + " command could not be queued\n"); + } + + /* Failed to queue MBOX message */ + if (rc) + led_support = LED_STATE_ABSENT; +} + +/* + * FSP message response handler for following SPCN LED commands + * which are used to fetch all of the LED data from SPCN + * + * 1. SPCN_MOD_PRS_LED_DATA_FIRST --> First 1KB of LED data + * 2. SPCN_MOD_PRS_LED_DATA_SUB --> Subsequent 1KB of LED data + * + * Once the SPCN_RSP_STATUS_SUCCESS response code has been received + * indicating the last batch of 1KB LED data is here, the list addition + * process is now complete and we enable LED support for FSP async commands + * and for OPAL interface. 
+ */ +static void fsp_read_leds_data_complete(struct fsp_msg *msg) +{ + struct fsp_led_data *led, *next; + struct fsp_msg *resp = msg->resp; + u32 cmd_hdr = 0; + int rc = 0; + + u32 msg_status = resp->word1 & 0xff00; + u32 led_status = (fsp_msg_get_data_word(resp, 1) >> 24) & 0xff; + u16 data_len = (u16)(fsp_msg_get_data_word(resp, 1) & 0xffff); + + if (msg_status != FSP_STATUS_SUCCESS) { + log_simple_error(&e_info(OPAL_RC_LED_SUPPORT), + "FSP returned error %x LED not supported\n", + msg_status); + /* LED support not available */ + led_support = LED_STATE_ABSENT; + + fsp_freemsg(msg); + return; + } + + /* SPCN command status */ + switch (led_status) { + /* Last 1KB of LED data */ + case SPCN_RSP_STATUS_SUCCESS: + prlog(PR_DEBUG, "SPCN_RSP_STATUS_SUCCESS: %d bytes received\n", + data_len); + + led_support = LED_STATE_PRESENT; + + /* Copy data to the local list */ + fsp_process_leds_data(data_len); + + /* LEDs captured on the system */ + prlog(PR_DEBUG, "CEC LEDs captured on the system:\n"); + list_for_each_safe(&cec_ledq, led, next, link) { + prlog(PR_DEBUG, + "rid: %x\t" + "len: %x " + "lcode: %-30s\t" + "parms: %04x\t" + "status: %04x\n", + led->rid, + led->lc_len, + led->loc_code, + led->parms, + led->status); + } + + prlog(PR_DEBUG, "ENCL LEDs captured on the system:\n"); + list_for_each_safe(&encl_ledq, led, next, link) { + prlog(PR_DEBUG, + "rid: %x\t" + "len: %x " + "lcode: %-30s\t" + "parms: %04x\t" + "status: %04x\n", + led->rid, + led->lc_len, + led->loc_code, + led->parms, + led->status); + } + + break; + + /* If more 1KB of LED data present */ + case SPCN_RSP_STATUS_COND_SUCCESS: + prlog(PR_DEBUG, "SPCN_RSP_STATUS_COND_SUCCESS: %d bytes " + " received\n", data_len); + + /* Copy data to the local list */ + fsp_process_leds_data(data_len); + + /* Fetch the remaining data from SPCN */ + last_spcn_cmd = SPCN_MOD_PRS_LED_DATA_SUB; + cmd_hdr = SPCN_MOD_PRS_LED_DATA_SUB << 24 | SPCN_CMD_PRS << 16; + rc = fsp_queue_msg(fsp_mkmsg(FSP_CMD_SPCN_PASSTHRU, 4, + SPCN_ADDR_MODE_CEC_NODE, + cmd_hdr, 0, PSI_DMA_LED_BUF), + fsp_read_leds_data_complete); + if (rc) { + prlog(PR_ERR, "SPCN_MOD_PRS_LED_DATA_SUB command" + " could not be queued\n"); + + led_support = LED_STATE_ABSENT; + } + break; + + /* Other expected error codes*/ + case SPCN_RSP_STATUS_INVALID_RACK: + case SPCN_RSP_STATUS_INVALID_SLAVE: + case SPCN_RSP_STATUS_INVALID_MOD: + case SPCN_RSP_STATUS_STATE_PROHIBIT: + case SPCN_RSP_STATUS_UNKNOWN: + default: + /* Replay the previous SPCN command */ + replay_spcn_cmd(last_spcn_cmd); + } + fsp_freemsg(msg); +} + +/* + * Init the LED state + * + * This is called during the host boot process. This is the place where + * we figure out all the LEDs present on the system, their state and then + * create structure out of those information and popullate two master lists. + * One for all the LEDs on the CEC and one for all the LEDs on the enclosure. + * The LED information contained in the lists will cater either to various + * FSP initiated async commands or POWERNV initiated OPAL calls. Need to make + * sure that this initialization process is complete before allowing any requets + * on LED. Also need to be called to re-fetch data from SPCN after any LED state + * have been updated. 
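Editor's note: the 1KB-at-a-time fetch is steered by two fields packed into the first response data word, the SPCN status in the top byte and the payload length in the bottom 16 bits. Below is a tiny sketch of that decode and of the FIRST/SUB continuation decision; the two status values are illustrative placeholders, not the real SPCN_RSP_STATUS_* codes.

#include <stdint.h>
#include <stdio.h>

/* Illustrative stand-ins for the two status codes the loop cares about. */
#define PRS_STATUS_SUCCESS      0x00  /* last chunk of LED data    */
#define PRS_STATUS_COND_SUCCESS 0x01  /* more chunks still to come */

/* Status in the top byte, payload length in the bottom 16 bits. */
static void decode_prs_word(uint32_t word, uint8_t *status, uint16_t *len)
{
        *status = (word >> 24) & 0xff;
        *len = word & 0xffff;
}

int main(void)
{
        /* Pretend the FSP returned "conditional success, 1024 bytes". */
        uint32_t word = ((uint32_t)PRS_STATUS_COND_SUCCESS << 24) | 1024;
        uint8_t status;
        uint16_t len;

        decode_prs_word(word, &status, &len);
        printf("got %u bytes, %s\n", (unsigned)len,
               status == PRS_STATUS_COND_SUCCESS ?
               "queue PRS_LED_DATA_SUB for the next chunk" :
               "last chunk, enable LED support");
        return 0;
}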
+ */ +static void fsp_leds_query_spcn(void) +{ + struct fsp_led_data *led = NULL; + int rc = 0; + + u32 cmd_hdr = SPCN_MOD_PRS_LED_DATA_FIRST << 24 | SPCN_CMD_PRS << 16; + + /* Till the last batch of LED data */ + last_spcn_cmd = 0; + + /* Empty the lists */ + while (!list_empty(&cec_ledq)) { + led = list_pop(&cec_ledq, struct fsp_led_data, link); + free(led); + } + + while (!list_empty(&encl_ledq)) { + led = list_pop(&encl_ledq, struct fsp_led_data, link); + free(led); + } + + /* Allocate buffer with alignment requirements */ + if (led_buffer == NULL) { + led_buffer = memalign(TCE_PSIZE, PSI_DMA_LED_BUF_SZ); + if (!led_buffer) + return; + } + + /* TCE mapping - will not unmap */ + fsp_tce_map(PSI_DMA_LED_BUF, led_buffer, PSI_DMA_LED_BUF_SZ); + + /* Request the first 1KB of LED data */ + last_spcn_cmd = SPCN_MOD_PRS_LED_DATA_FIRST; + rc = fsp_queue_msg(fsp_mkmsg(FSP_CMD_SPCN_PASSTHRU, 4, + SPCN_ADDR_MODE_CEC_NODE, cmd_hdr, 0, + PSI_DMA_LED_BUF), fsp_read_leds_data_complete); + if (rc) + prlog(PR_ERR, + "SPCN_MOD_PRS_LED_DATA_FIRST command could" + " not be queued\n"); + else /* Initiated LED list fetch MBOX command */ + led_support = LED_STATE_READING; +} + +/* Init the LED subsystem at boot time */ +void fsp_led_init(void) +{ + led_buffer = NULL; + + if (!fsp_present()) + return; + + /* Init the master lists */ + list_head_init(&cec_ledq); + list_head_init(&encl_ledq); + list_head_init(&spcn_cmdq); + + fsp_leds_query_spcn(); + + loc_code_list_buffer = memalign(TCE_PSIZE, PSI_DMA_LOC_COD_BUF_SZ); + if (loc_code_list_buffer == NULL) + prerror("ERROR: Unable to allocate loc_code_list_buffer!\n"); + + prlog(PR_TRACE, "Init completed\n"); + + /* Get System attention indicator state */ + dt_get_sai_loc_code(); + fsp_get_sai(); + + /* Handle FSP initiated async LED commands */ + fsp_register_client(&fsp_indicator_client, FSP_MCLASS_INDICATOR); + prlog(PR_TRACE, "FSP async command client registered\n"); + + /* Register for SAI update notification */ + sysparam_add_update_notifier(sai_update_notification); + + opal_register(OPAL_LEDS_GET_INDICATOR, fsp_opal_leds_get_ind, 4); + opal_register(OPAL_LEDS_SET_INDICATOR, fsp_opal_leds_set_ind, 5); + prlog(PR_TRACE, "LED OPAL interface registered\n"); +} diff --git a/roms/skiboot/hw/fsp/fsp-mem-err.c b/roms/skiboot/hw/fsp/fsp-mem-err.c new file mode 100644 index 000000000..2e3e65401 --- /dev/null +++ b/roms/skiboot/hw/fsp/fsp-mem-err.c @@ -0,0 +1,401 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* + * Sometimes some memory needs to go and sit in the naughty corner + * + * Copyright 2013-2019 IBM Corp. + */ + +#define pr_fmt(fmt) "FSPMEMERR: " fmt +#include <skiboot.h> +#include <opal.h> +#include <opal-msg.h> +#include <lock.h> +#include <fsp.h> +#include <errorlog.h> + +/* FSP sends real address of 4K memory page. */ +#define MEM_ERR_PAGE_SIZE_4K (1UL << 12) + +/* maximum number of error event to hold until linux consumes it. */ +#define MERR_MAX_RECORD 1024 + +struct fsp_mem_err_node { + struct list_node list; + struct OpalMemoryErrorData data; +}; + +static LIST_HEAD(merr_free_list); +static LIST_HEAD(mem_error_list); +/* + * lock is used to protect overwriting of merr_free_list and mem_error_list + * list. 
+ */ +static struct lock mem_err_lock = LOCK_UNLOCKED; + +DEFINE_LOG_ENTRY(OPAL_RC_MEM_ERR_RES, OPAL_PLATFORM_ERR_EVT, OPAL_MEM_ERR, + OPAL_MISC_SUBSYSTEM, OPAL_PREDICTIVE_ERR_GENERAL, + OPAL_NA); + +DEFINE_LOG_ENTRY(OPAL_RC_MEM_ERR_DEALLOC, OPAL_PLATFORM_ERR_EVT, OPAL_MEM_ERR, + OPAL_MISC_SUBSYSTEM, OPAL_PREDICTIVE_ERR_GENERAL, + OPAL_NA); + +static bool send_response_to_fsp(u32 cmd_sub_mod) +{ + struct fsp_msg *rsp; + int rc = -ENOMEM; + + rsp = fsp_mkmsg(cmd_sub_mod, 0); + if (rsp) + rc = fsp_queue_msg(rsp, fsp_freemsg); + if (rc) { + fsp_freemsg(rsp); + /* XXX Generate error logs */ + prerror("Error %d queueing FSP memory error reply\n", rc); + return false; + } + return true; +} + +/* + * Queue up the memory error message for delivery. + * + * queue_event_for_delivery get called from two places. + * 1) from queue_mem_err_node when new fsp mem error is available and + * 2) from completion callback indicating that linux has consumed an message. + * + * TODO: + * There is a chance that, we may not get a free slot to queue our event + * for delivery to linux during both the above invocations. In that case + * we end up holding events with us until next fsp memory error comes in. + * We need to address this case either here OR fix up messaging infrastructure + * to make sure at least one slot will always be available per message type. + * + * XXX: BenH: I changed the msg infrastructure to attempt an allocation + * in that case, at least until we clarify a bit better how + * we want to handle things. + */ +static void queue_event_for_delivery(void *data __unused, int staus __unused) +{ + struct fsp_mem_err_node *entry; + uint64_t *merr_data; + int rc; + + lock(&mem_err_lock); + entry = list_pop(&mem_error_list, struct fsp_mem_err_node, list); + unlock(&mem_err_lock); + + if (!entry) + return; + + /* + * struct OpalMemoryErrorData is of (4 * 64 bits) size and well packed + * structure. Hence use uint64_t pointer to pass entire structure + * using 4 params in generic message format. + */ + merr_data = (uint64_t *)&entry->data; + + /* queue up for delivery */ + rc = opal_queue_msg(OPAL_MSG_MEM_ERR, NULL, queue_event_for_delivery, + cpu_to_be64(merr_data[0]), + cpu_to_be64(merr_data[1]), + cpu_to_be64(merr_data[2]), + cpu_to_be64(merr_data[3])); + lock(&mem_err_lock); + if (rc) { + /* + * Failed to queue up the event for delivery. No free slot + * available. There is a chance that we are trying to queue + * up multiple event at the same time. We may already have + * at least one event queued up, in that case we will be + * called again through completion callback and we should + * be able to grab empty slot then. + * + * For now, put this node back on mem_error_list. + */ + list_add(&mem_error_list, &entry->list); + } else + list_add(&merr_free_list, &entry->list); + unlock(&mem_err_lock); +} + +static int queue_mem_err_node(struct OpalMemoryErrorData *merr_evt) +{ + struct fsp_mem_err_node *entry; + + lock(&mem_err_lock); + entry = list_pop(&merr_free_list, struct fsp_mem_err_node, list); + if (!entry) { + printf("Failed to queue up memory error event.\n"); + unlock(&mem_err_lock); + return -ENOMEM; + } + + entry->data = *merr_evt; + list_add(&mem_error_list, &entry->list); + unlock(&mem_err_lock); + + /* Queue up the event for delivery to OS. */ + queue_event_for_delivery(NULL, OPAL_SUCCESS); + return 0; +} + +/* Check if memory resilience event for same address already exists. 
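Editor's note: the buffering above is a fixed pool whose nodes move between a free list and a pending list; a node returns to the free list once its payload has been handed to the OPAL message queue and stays pending for a retry when that queue is full. Below is a minimal single-threaded version of the same pattern, without the skiboot list/lock primitives and with names local to the example.

#include <stdio.h>
#include <stdlib.h>

struct merr_node {
        struct merr_node *next;
        unsigned long long paddr;     /* stand-in for the event payload */
};

static struct merr_node *free_list, *pending_list;

static void push(struct merr_node **list, struct merr_node *n)
{
        n->next = *list;
        *list = n;
}

static struct merr_node *pop(struct merr_node **list)
{
        struct merr_node *n = *list;
        if (n)
                *list = n->next;
        return n;
}

/* Pre-allocate the whole pool once, as init_merr_free_list() does. */
static int pool_init(unsigned int count)
{
        struct merr_node *nodes = calloc(count, sizeof(*nodes));
        unsigned int i;

        if (!nodes)
                return -1;
        for (i = 0; i < count; i++)
                push(&free_list, &nodes[i]);
        return 0;
}

/* 0 on success; when "delivery" fails the node stays pending for a retry. */
static int queue_event(unsigned long long paddr, int delivery_ok)
{
        struct merr_node *n = pop(&free_list);

        if (!n)
                return -1;                  /* pool exhausted             */
        n->paddr = paddr;
        if (delivery_ok)
                push(&free_list, n);        /* consumed, recycle the node */
        else
                push(&pending_list, n);     /* keep it until the next try */
        return 0;
}

int main(void)
{
        if (pool_init(4))
                return 1;
        queue_event(0x1000, 1);
        queue_event(0x2000, 0);
        printf("pending retry: %s\n", pending_list ? "yes" : "no");
        return 0;
}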
*/ +static bool is_resilience_event_exist(u64 paddr) +{ + struct fsp_mem_err_node *entry; + struct OpalMemoryErrorData *merr_evt; + int found = 0; + + lock(&mem_err_lock); + list_for_each(&mem_error_list, entry, list) { + merr_evt = &entry->data; + if ((merr_evt->type == OPAL_MEM_ERR_TYPE_RESILIENCE) && + (be64_to_cpu(merr_evt->u.resilience.physical_address_start) + == paddr)) { + found = 1; + break; + } + } + unlock(&mem_err_lock); + return !!found; +} + +/* + * handle Memory Resilience error message. + * Section 28.2 of Hypervisor to FSP Mailbox Interface Specification. + * + * The flow for Memory Resilence Event is: + * 1. PRD component in FSP gets a recoverable attention from hardware when + * there is a corretable/uncorrectable memory error to free up a page. + * 2. PRD sends Memory Resilence Command to hypervisor with the real address of + * the 4K memory page in which the error occurred. + * 3. The hypervisor acknowledges with a status immediately. Immediate + * acknowledgment doesn’t require the freeing of the page to be completed. + */ +static bool handle_memory_resilience(u32 cmd_sub_mod, u64 paddr) +{ + int rc = 0; + struct OpalMemoryErrorData mem_err_evt; + struct errorlog *buf; + + memset(&mem_err_evt, 0, sizeof(struct OpalMemoryErrorData)); + /* Check arguments */ + if (paddr == 0) { + prerror("memory resilience: Invalid real address.\n"); + return send_response_to_fsp(FSP_RSP_MEM_RES | + FSP_STATUS_GENERIC_ERROR); + } + + /* Check if event already exist for same address. */ + if (is_resilience_event_exist(paddr)) + goto send_response; + + /* Populate an event. */ + mem_err_evt.version = OpalMemErr_V1; + mem_err_evt.type = OPAL_MEM_ERR_TYPE_RESILIENCE; + + switch (cmd_sub_mod) { + case FSP_CMD_MEM_RES_CE: + /* + * Should we keep counter for corrected errors in + * sapphire OR let linux (PowerNV) handle it? + * + * For now, send corrected errors to linux and let + * linux handle corrected errors thresholding. + */ + mem_err_evt.flags |= cpu_to_be16(OPAL_MEM_CORRECTED_ERROR); + mem_err_evt.u.resilience.resil_err_type = + OPAL_MEM_RESILIENCE_CE; + break; + case FSP_CMD_MEM_RES_UE: + mem_err_evt.u.resilience.resil_err_type = + OPAL_MEM_RESILIENCE_UE; + break; + case FSP_CMD_MEM_RES_UE_SCRB: + mem_err_evt.u.resilience.resil_err_type = + OPAL_MEM_RESILIENCE_UE_SCRUB; + break; + } + mem_err_evt.u.resilience.physical_address_start = cpu_to_be64(paddr); + mem_err_evt.u.resilience.physical_address_end = + cpu_to_be64(paddr + MEM_ERR_PAGE_SIZE_4K); + + /* Queue up the event and inform OS about it. */ + rc = queue_mem_err_node(&mem_err_evt); + +send_response: + /* Queue up an OK response to the resilience message itself */ + if (!rc) + return send_response_to_fsp(FSP_RSP_MEM_RES); + else { + buf = opal_elog_create(&e_info(OPAL_RC_MEM_ERR_RES), 0); + log_append_msg(buf, + "OPAL_MEM_ERR: Cannot queue up memory " + "resilience error event to the OS"); + log_add_section(buf, OPAL_ELOG_SEC_DESC); + log_append_data(buf, (char *) &mem_err_evt, + sizeof(struct OpalMemoryErrorData)); + log_commit(buf); + return false; + } +} + +/* update existing event entry if match is found. 
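Editor's note: handle_memory_resilience() above always reports exactly one 4K page, deriving the end address from the start and marking only the CE case as corrected. Below is a reduced sketch of that translation from sub-command to an OS-visible event; the struct is a cut-down stand-in, not the real OpalMemoryErrorData layout.

#include <stdint.h>
#include <stdio.h>

#define MEM_ERR_PAGE_SIZE_4K (1ULL << 12)

enum resil_type { RESIL_CE, RESIL_UE, RESIL_UE_SCRUB };
enum resil_cmd  { CMD_MEM_RES_CE, CMD_MEM_RES_UE, CMD_MEM_RES_UE_SCRB };

/* Cut-down stand-in for the event handed to the OS. */
struct mem_resilience_event {
        enum resil_type type;
        int corrected;
        uint64_t start, end;
};

static int build_resilience_event(enum resil_cmd cmd, uint64_t paddr,
                                  struct mem_resilience_event *ev)
{
        if (paddr == 0)
                return -1;                      /* invalid real address */

        ev->corrected = (cmd == CMD_MEM_RES_CE);
        ev->type = (cmd == CMD_MEM_RES_CE) ? RESIL_CE :
                   (cmd == CMD_MEM_RES_UE) ? RESIL_UE : RESIL_UE_SCRUB;
        ev->start = paddr;
        ev->end = paddr + MEM_ERR_PAGE_SIZE_4K;  /* one 4K page */
        return 0;
}

int main(void)
{
        struct mem_resilience_event ev;

        if (!build_resilience_event(CMD_MEM_RES_UE, 0x20000000ULL, &ev))
                printf("UE page 0x%llx-0x%llx corrected=%d\n",
                       (unsigned long long)ev.start,
                       (unsigned long long)ev.end, ev.corrected);
        return 0;
}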
*/ +static bool update_memory_deallocation_event(u64 paddr_start, u64 paddr_end) +{ + struct fsp_mem_err_node *entry; + struct OpalMemoryErrorData *merr_evt; + int found = 0; + + lock(&mem_err_lock); + list_for_each(&mem_error_list, entry, list) { + merr_evt = &entry->data; + if ((merr_evt->type == OPAL_MEM_ERR_TYPE_DYN_DALLOC) && + (be64_to_cpu(merr_evt->u.dyn_dealloc.physical_address_start) + == paddr_start)) { + found = 1; + if (be64_to_cpu(merr_evt->u.dyn_dealloc.physical_address_end) + < paddr_end) + merr_evt->u.dyn_dealloc.physical_address_end = + cpu_to_be64(paddr_end); + break; + } + } + unlock(&mem_err_lock); + return !!found; +} + +/* + * Handle dynamic memory deallocation message. + * + * When a condition occurs in which we need to do a large scale memory + * deallocation, PRD will send a starting and ending address of an area of + * memory to Hypervisor. Hypervisor then need to use this to deallocate all + * pages between and including the addresses. + * + */ +static bool handle_memory_deallocation(u64 paddr_start, u64 paddr_end) +{ + int rc = 0; + u8 err = 0; + struct OpalMemoryErrorData mem_err_evt; + struct errorlog *buf; + + memset(&mem_err_evt, 0, sizeof(struct OpalMemoryErrorData)); + /* Check arguments */ + if ((paddr_start == 0) || (paddr_end == 0)) { + prerror("memory deallocation: Invalid " + "starting/ending real address.\n"); + err = FSP_STATUS_GENERIC_ERROR; + } + + /* If we had an error, send response to fsp and return */ + if (err) + return send_response_to_fsp(FSP_RSP_MEM_DYN_DEALLOC | err); + + /* + * FSP can send dynamic memory deallocation multiple times for the + * same address/address ranges. Hence check and update if we already + * have sam event queued. + */ + if (update_memory_deallocation_event(paddr_start, paddr_end)) + goto send_response; + + /* Populate an new event. */ + mem_err_evt.version = OpalMemErr_V1; + mem_err_evt.type = OPAL_MEM_ERR_TYPE_DYN_DALLOC; + mem_err_evt.u.dyn_dealloc.dyn_err_type = + OPAL_MEM_DYNAMIC_DEALLOC; + mem_err_evt.u.dyn_dealloc.physical_address_start = cpu_to_be64(paddr_start); + mem_err_evt.u.dyn_dealloc.physical_address_end = cpu_to_be64(paddr_end); + + /* Queue up the event and inform OS about it. */ + rc = queue_mem_err_node(&mem_err_evt); + +send_response: + /* Queue up an OK response to the memory deallocation message itself */ + if (!rc) + return send_response_to_fsp(FSP_RSP_MEM_DYN_DEALLOC); + else { + buf = opal_elog_create(&e_info(OPAL_RC_MEM_ERR_DEALLOC), 0); + log_append_msg(buf, + "OPAL_MEM_ERR: Cannot queue up memory " + "deallocation error event to the OS"); + log_add_section(buf, OPAL_ELOG_SEC_DESC); + log_append_data(buf, (char *)&mem_err_evt, + sizeof(struct OpalMemoryErrorData)); + log_commit(buf); + return false; + } +} + +/* Receive a memory error mesages and handle it. */ +static bool fsp_mem_err_msg(u32 cmd_sub_mod, struct fsp_msg *msg) +{ + u64 paddr_start, paddr_end; + + printf("Received 0x%08ux command\n", cmd_sub_mod); + switch (cmd_sub_mod) { + case FSP_CMD_MEM_RES_CE: + case FSP_CMD_MEM_RES_UE: + case FSP_CMD_MEM_RES_UE_SCRB: + /* + * We get the memory relilence command from FSP for + * correctable/Uncorrectable/scrub UE errors with real + * address of 4K memory page in which the error occurred. 
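Editor's note: the de-duplication above keys on the start address and only ever grows the end of an already-queued range. The same merge rule is shown below over a plain array, with endianness and list handling left out and names local to the sketch.

#include <stdint.h>
#include <stdio.h>

struct dealloc_range { uint64_t start, end; };

/*
 * If a queued range with the same start exists, extend its end when the
 * new request reaches further and report "merged"; otherwise the caller
 * would queue a fresh event.
 */
static int merge_dealloc_range(struct dealloc_range *q, int n,
                               uint64_t start, uint64_t end)
{
        int i;

        for (i = 0; i < n; i++) {
                if (q[i].start != start)
                        continue;
                if (q[i].end < end)
                        q[i].end = end;
                return 1;       /* merged into an existing event */
        }
        return 0;               /* no match: queue a new event   */
}

int main(void)
{
        struct dealloc_range q[] = { { 0x1000, 0x2000 } };

        printf("merged=%d end=0x%llx\n",
               merge_dealloc_range(q, 1, 0x1000, 0x3000),
               (unsigned long long)q[0].end);
        return 0;
}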
+ */ + paddr_start = be64_to_cpu(*((__be64 *)&msg->data.bytes[0])); + printf("Got memory resilience error message for " + "paddr=0x%016llux\n", paddr_start); + return handle_memory_resilience(cmd_sub_mod, paddr_start); + case FSP_CMD_MEM_DYN_DEALLOC: + paddr_start = be64_to_cpu(*((__be64 *)&msg->data.bytes[0])); + paddr_end = be64_to_cpu(*((__be64 *)&msg->data.bytes[8])); + printf("Got dynamic memory deallocation message: " + "paddr_start=0x%016llux, paddr_end=0x%016llux\n", + paddr_start, paddr_end); + return handle_memory_deallocation(paddr_start, paddr_end); + } + return false; +} + +/* + * pre allocate memory to hold maximum of 128 memory error event until linux + * consumes it. + */ +static int init_merr_free_list(uint32_t num_entries) +{ + struct fsp_mem_err_node *entry; + int i; + + entry = zalloc(sizeof(struct fsp_mem_err_node) * num_entries); + if (!entry) + return -ENOMEM; + + for (i = 0; i < num_entries; ++i, entry++) + list_add_tail(&merr_free_list, &entry->list); + + return 0; +} + +static struct fsp_client fsp_mem_err_client = { + .message = fsp_mem_err_msg, +}; + +void fsp_memory_err_init(void) +{ + int rc; + + printf("Intializing fsp memory handling.\n"); + /* If we have an FSP, register for notifications */ + if (!fsp_present()) + return; + + /* pre allocate memory for 128 record */ + rc = init_merr_free_list(MERR_MAX_RECORD); + if (rc < 0) + return; + + fsp_register_client(&fsp_mem_err_client, FSP_MCLASS_MEMORY_ERR); +} diff --git a/roms/skiboot/hw/fsp/fsp-nvram.c b/roms/skiboot/hw/fsp/fsp-nvram.c new file mode 100644 index 000000000..aa17cb5e7 --- /dev/null +++ b/roms/skiboot/hw/fsp/fsp-nvram.c @@ -0,0 +1,424 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* + * Read/Write NVRAM from/to FSP + * + * Copyright 2013-2017 IBM Corp. + */ + +#include <skiboot.h> +#include <fsp.h> +#include <opal.h> +#include <lock.h> +#include <device.h> +#include <errorlog.h> + +/* + * The FSP NVRAM API operates in "blocks" of 4K. It is entirely exposed + * to the OS via the OPAL APIs. + * + * In order to avoid dealing with complicated read/modify/write state + * machines (and added issues related to FSP failover in the middle) + * we keep a memory copy of the entire nvram which we load at boot + * time. We save only modified blocks. + * + * To limit the amount of memory used by the nvram image, we limit + * how much nvram we support to NVRAM_SIZE. Additionally, this limit + * of 1M is the maximum that the CHRP/PAPR nvram partition format + * supports for a partition entry. + * + * (Q: should we save the whole thing in case of FSP failover ?) + * + * The nvram is expected to comply with the CHRP/PAPR defined format, + * and specifically contain a System partition (ID 0x70) named "common" + * with configuration variables for the bootloader and a FW private + * partition for future use by skiboot. + * + * If the partition layout appears broken or lacks one of the above + * partitions, we reformat the entire nvram at boot time. + * + * We do not exploit the ability of the FSP to store a checksum. This + * is documented as possibly going away. 
The CHRP format for nvram + * that Linux uses has its own (though weak) checksum mechanism already + * + */ + +#define NVRAM_BLKSIZE 0x1000 + +struct nvram_triplet { + __be64 dma_addr; + __be32 blk_offset; + __be32 blk_count; +} __packed; + +#define NVRAM_FLAG_CLEAR_WPEND 0x80000000 + +enum nvram_state { + NVRAM_STATE_CLOSED, + NVRAM_STATE_OPENING, + NVRAM_STATE_BROKEN, + NVRAM_STATE_OPEN, + NVRAM_STATE_ABSENT, +}; + +static void *fsp_nvram_image; +static uint32_t fsp_nvram_size; +static struct lock fsp_nvram_lock = LOCK_UNLOCKED; +static struct fsp_msg *fsp_nvram_msg; +static uint32_t fsp_nvram_dirty_start; +static uint32_t fsp_nvram_dirty_end; +static bool fsp_nvram_was_read; +static struct nvram_triplet fsp_nvram_triplet __align(0x1000); +static enum nvram_state fsp_nvram_state = NVRAM_STATE_CLOSED; + +DEFINE_LOG_ENTRY(OPAL_RC_NVRAM_INIT, OPAL_PLATFORM_ERR_EVT , OPAL_NVRAM, + OPAL_MISC_SUBSYSTEM, OPAL_PREDICTIVE_ERR_GENERAL, + OPAL_NA); + +DEFINE_LOG_ENTRY(OPAL_RC_NVRAM_OPEN, OPAL_PLATFORM_ERR_EVT, OPAL_NVRAM, + OPAL_MISC_SUBSYSTEM, OPAL_PREDICTIVE_ERR_GENERAL, + OPAL_NA); + +DEFINE_LOG_ENTRY(OPAL_RC_NVRAM_SIZE, OPAL_PLATFORM_ERR_EVT, OPAL_NVRAM, + OPAL_MISC_SUBSYSTEM, OPAL_PREDICTIVE_ERR_GENERAL, + OPAL_NA); + +DEFINE_LOG_ENTRY(OPAL_RC_NVRAM_READ, OPAL_PLATFORM_ERR_EVT, OPAL_NVRAM, + OPAL_MISC_SUBSYSTEM, OPAL_PREDICTIVE_ERR_GENERAL, + OPAL_NA); + +DEFINE_LOG_ENTRY(OPAL_RC_NVRAM_WRITE, OPAL_PLATFORM_ERR_EVT, OPAL_NVRAM, + OPAL_MISC_SUBSYSTEM, OPAL_PREDICTIVE_ERR_GENERAL, + OPAL_NA); + +static void fsp_nvram_send_write(void); + +static void fsp_nvram_wr_complete(struct fsp_msg *msg) +{ + struct fsp_msg *resp = msg->resp; + uint8_t rc; + + lock(&fsp_nvram_lock); + fsp_nvram_msg = NULL; + + /* Check for various errors. If an error occurred, + * we generally assume the nvram is completely dirty + * but we won't trigger a new write until we get + * either a new attempt at writing, or an FSP reset + * reload (TODO) + */ + if (!resp || resp->state != fsp_msg_response) + goto fail_dirty; + rc = (msg->word1 >> 8) & 0xff; + switch(rc) { + case 0: + case 0x44: + /* Sync to secondary required... XXX */ + case 0x45: + break; + case 0xef: + /* Sync to secondary failed, let's ignore that for now, + * maybe when (if) we handle redundant FSPs ... 
+ */ + prerror("FSP: NVRAM sync to secondary failed\n"); + break; + default: + log_simple_error(&e_info(OPAL_RC_NVRAM_WRITE), + "FSP: NVRAM write return error 0x%02x\n", rc); + goto fail_dirty; + } + fsp_freemsg(msg); + if (fsp_nvram_dirty_start <= fsp_nvram_dirty_end) + fsp_nvram_send_write(); + unlock(&fsp_nvram_lock); + return; + fail_dirty: + fsp_nvram_dirty_start = 0; + fsp_nvram_dirty_end = fsp_nvram_size - 1; + fsp_freemsg(msg); + unlock(&fsp_nvram_lock); +} + +static void fsp_nvram_send_write(void) +{ + uint32_t start = fsp_nvram_dirty_start; + uint32_t end = fsp_nvram_dirty_end; + uint32_t count; + + if (start > end || fsp_nvram_state != NVRAM_STATE_OPEN) + return; + count = (end - start) / NVRAM_BLKSIZE + 1; + fsp_nvram_triplet.dma_addr = cpu_to_be64(PSI_DMA_NVRAM_BODY + start); + fsp_nvram_triplet.blk_offset = cpu_to_be32(start / NVRAM_BLKSIZE); + fsp_nvram_triplet.blk_count = cpu_to_be32(count); + fsp_nvram_msg = fsp_mkmsg(FSP_CMD_WRITE_VNVRAM, 6, + 0, PSI_DMA_NVRAM_TRIPL, 1, + NVRAM_FLAG_CLEAR_WPEND, 0, 0); + if (fsp_queue_msg(fsp_nvram_msg, fsp_nvram_wr_complete)) { + fsp_freemsg(fsp_nvram_msg); + fsp_nvram_msg = NULL; + log_simple_error(&e_info(OPAL_RC_NVRAM_WRITE), + "FSP: Error queueing nvram update\n"); + return; + } + fsp_nvram_dirty_start = fsp_nvram_size; + fsp_nvram_dirty_end = 0; +} + +static void fsp_nvram_rd_complete(struct fsp_msg *msg) +{ + int64_t rc; + + lock(&fsp_nvram_lock); + + /* Read complete, check status. What to do if the read fails ? + * + * Well, there could be various reasons such as an FSP reboot + * at the wrong time, but there is really not much we can do + * so for now I'll just mark the nvram as closed, and we'll + * attempt a re-open and re-read whenever the OS tries to + * access it + */ + rc = (msg->resp->word1 >> 8) & 0xff; + fsp_nvram_msg = NULL; + fsp_freemsg(msg); + if (rc) { + prerror("FSP: NVRAM read failed, will try again later\n"); + fsp_nvram_state = NVRAM_STATE_CLOSED; + } else { + /* nvram was read once, no need to do it ever again */ + fsp_nvram_was_read = true; + fsp_nvram_state = NVRAM_STATE_OPEN; + + /* XXX Here we should look for nvram settings that concern + * us such as guest kernel arguments etc... + */ + } + unlock(&fsp_nvram_lock); + nvram_read_complete(fsp_nvram_state == NVRAM_STATE_OPEN); + if (fsp_nvram_state != NVRAM_STATE_OPEN) + log_simple_error(&e_info(OPAL_RC_NVRAM_INIT), + "FSP: NVRAM not read, skipping init\n"); +} + +static void fsp_nvram_send_read(void) +{ + fsp_nvram_msg = fsp_mkmsg(FSP_CMD_READ_VNVRAM, 4, + 0, PSI_DMA_NVRAM_BODY, 0, + fsp_nvram_size / NVRAM_BLKSIZE); + if (fsp_queue_msg(fsp_nvram_msg, fsp_nvram_rd_complete)) { + /* If the nvram read fails to queue, we mark ourselves + * closed. Shouldn't have happened anyway. Not much else + * we can do. 
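Editor's note: the write path never sends individual bytes: fsp_nvram_write() widens a dirty window rounded to NVRAM_BLKSIZE, and fsp_nvram_send_write() converts that window into a block offset and block count for the DMA triplet before marking everything clean again. The sketch below covers just that arithmetic; the window struct and the example offsets are assumptions for illustration.

#include <stdint.h>
#include <stdio.h>

#define BLKSIZE 0x1000u

struct dirty_window { uint32_t start, end, size; };   /* byte offsets */

/* Widen the window to cover [offset, offset+len), rounded to blocks. */
static void mark_dirty(struct dirty_window *w, uint32_t offset, uint32_t len)
{
        uint32_t end = (offset + len - 1) & ~(BLKSIZE - 1);

        offset &= ~(BLKSIZE - 1);
        if (w->start > offset)
                w->start = offset;
        if (w->end < end)
                w->end = end;
}

/* Convert the window into the block offset/count a write would carry,
 * then mark everything clean (start > end means "nothing dirty"). */
static int flush_window(struct dirty_window *w,
                        uint32_t *blk_offset, uint32_t *blk_count)
{
        if (w->start > w->end)
                return 0;                        /* nothing to write */
        *blk_offset = w->start / BLKSIZE;
        *blk_count = (w->end - w->start) / BLKSIZE + 1;
        w->start = w->size;
        w->end = 0;
        return 1;
}

int main(void)
{
        struct dirty_window w = { .start = 0x100000, .end = 0, .size = 0x100000 };
        uint32_t off, cnt;

        mark_dirty(&w, 0x1234, 0x20);      /* touches block 1 only   */
        mark_dirty(&w, 0x3f00, 0x300);     /* touches blocks 3 and 4 */
        if (flush_window(&w, &off, &cnt))
                printf("write blocks %u..%u (%u blocks)\n",
                       off, off + cnt - 1, cnt);
        return 0;
}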
+ */ + fsp_nvram_state = NVRAM_STATE_CLOSED; + fsp_freemsg(fsp_nvram_msg); + fsp_nvram_msg = NULL; + log_simple_error(&e_info(OPAL_RC_NVRAM_READ), + "FSP: Error queueing nvram read\n"); + return; + } +} + +static void fsp_nvram_open_complete(struct fsp_msg *msg) +{ + int8_t rc; + + lock(&fsp_nvram_lock); + + /* Open complete, check status */ + rc = (msg->resp->word1 >> 8) & 0xff; + fsp_nvram_msg = NULL; + fsp_freemsg(msg); + if (rc) { + log_simple_error(&e_info(OPAL_RC_NVRAM_OPEN), + "FSP: NVRAM open failed, FSP error 0x%02x\n", rc); + goto failed; + } + if (fsp_nvram_was_read) + fsp_nvram_state = NVRAM_STATE_OPEN; + else + fsp_nvram_send_read(); + unlock(&fsp_nvram_lock); + return; + failed: + fsp_nvram_state = NVRAM_STATE_CLOSED; + unlock(&fsp_nvram_lock); +} + +static void fsp_nvram_send_open(void) +{ + printf("FSP NVRAM: Opening nvram...\n"); + fsp_nvram_msg = fsp_mkmsg(FSP_CMD_OPEN_VNVRAM, 1, fsp_nvram_size); + assert(fsp_nvram_msg); + fsp_nvram_state = NVRAM_STATE_OPENING; + if (!fsp_queue_msg(fsp_nvram_msg, fsp_nvram_open_complete)) + return; + + prerror("FSP NVRAM: Failed to queue nvram open message\n"); + fsp_freemsg(fsp_nvram_msg); + fsp_nvram_msg = NULL; + fsp_nvram_state = NVRAM_STATE_CLOSED; +} + +static bool fsp_nvram_get_size(uint32_t *out_size) +{ + struct fsp_msg *msg; + int rc, size; + + msg = fsp_mkmsg(FSP_CMD_GET_VNVRAM_SIZE, 0); + assert(msg); + + rc = fsp_sync_msg(msg, false); + size = msg->resp ? fsp_msg_get_data_word(msg->resp, 0) : 0; + fsp_freemsg(msg); + if (rc || size == 0) { + log_simple_error(&e_info(OPAL_RC_NVRAM_SIZE), + "FSP: Error %d nvram size reported is %d\n", rc, size); + fsp_nvram_state = NVRAM_STATE_BROKEN; + return false; + } + printf("FSP: NVRAM file size from FSP is %d bytes\n", size); + *out_size = size; + return true; +} + +static bool fsp_nvram_msg_rr(u32 cmd_sub_mod, struct fsp_msg *msg) +{ + assert(msg == NULL); + + switch (cmd_sub_mod) { + case FSP_RESET_START: + printf("FSP: Closing NVRAM on account of FSP Reset\n"); + fsp_nvram_state = NVRAM_STATE_CLOSED; + return true; + case FSP_RELOAD_COMPLETE: + printf("FSP: Reopening NVRAM of FSP Reload complete\n"); + lock(&fsp_nvram_lock); + fsp_nvram_send_open(); + unlock(&fsp_nvram_lock); + return true; + } + return false; +} + +static struct fsp_client fsp_nvram_client_rr = { + .message = fsp_nvram_msg_rr, +}; + +static bool fsp_vnvram_msg(u32 cmd_sub_mod, struct fsp_msg *msg __unused) +{ + u32 cmd; + struct fsp_msg *resp; + + switch (cmd_sub_mod) { + case FSP_CMD_GET_VNV_STATS: + prlog(PR_DEBUG, + "FSP NVRAM: Get vNVRAM statistics not supported\n"); + cmd = FSP_RSP_GET_VNV_STATS | FSP_STATUS_INVALID_SUBCMD; + break; + case FSP_CMD_FREE_VNV_STATS: + prlog(PR_DEBUG, + "FSP NVRAM: Free vNVRAM statistics buffer not supported\n"); + cmd = FSP_RSP_FREE_VNV_STATS | FSP_STATUS_INVALID_SUBCMD; + break; + default: + return false; + } + + resp = fsp_mkmsg(cmd, 0); + if (!resp) { + prerror("FSP NVRAM: Failed to allocate resp message\n"); + return false; + } + if (fsp_queue_msg(resp, fsp_freemsg)) { + prerror("FSP NVRAM: Failed to queue resp message\n"); + fsp_freemsg(resp); + return false; + } + return true; +} + +static struct fsp_client fsp_vnvram_client = { + .message = fsp_vnvram_msg, +}; + +int fsp_nvram_info(uint32_t *total_size) +{ + if (!fsp_present()) { + fsp_nvram_state = NVRAM_STATE_ABSENT; + return OPAL_HARDWARE; + } + + if (!fsp_nvram_get_size(total_size)) + return OPAL_HARDWARE; + return OPAL_SUCCESS; +} + +int fsp_nvram_start_read(void *dst, uint32_t src, uint32_t len) +{ + /* We are 
currently limited to fully aligned transfers */ + assert((((uint64_t)dst) & 0xfff) == 0); + assert(dst); + + /* Currently don't support src!=0 */ + assert(src == 0); + + if (!fsp_present()) + return -ENODEV; + + op_display(OP_LOG, OP_MOD_INIT, 0x0007); + + lock(&fsp_nvram_lock); + + /* Store image info */ + fsp_nvram_image = dst; + fsp_nvram_size = len; + + /* Mark nvram as not dirty */ + fsp_nvram_dirty_start = len; + fsp_nvram_dirty_end = 0; + + /* Map TCEs */ + fsp_tce_map(PSI_DMA_NVRAM_TRIPL, &fsp_nvram_triplet, + PSI_DMA_NVRAM_TRIPL_SZ); + fsp_tce_map(PSI_DMA_NVRAM_BODY, dst, PSI_DMA_NVRAM_BODY_SZ); + + /* Register for the reset/reload event */ + fsp_register_client(&fsp_nvram_client_rr, FSP_MCLASS_RR_EVENT); + + /* Register for virtual NVRAM interface events */ + fsp_register_client(&fsp_vnvram_client, FSP_MCLASS_VIRTUAL_NVRAM); + + /* Open and load the nvram from the FSP */ + fsp_nvram_send_open(); + + unlock(&fsp_nvram_lock); + + return 0; +} + +int fsp_nvram_write(uint32_t offset, void *src, uint32_t size) +{ + uint64_t end = offset + size - 1; + + /* We only support writing from the original image */ + if (src != fsp_nvram_image + offset) + return OPAL_HARDWARE; + + offset &= ~(NVRAM_BLKSIZE - 1); + end &= ~(NVRAM_BLKSIZE - 1); + + lock(&fsp_nvram_lock); + /* If the nvram is closed, try re-opening */ + if (fsp_nvram_state == NVRAM_STATE_CLOSED) + fsp_nvram_send_open(); + if (fsp_nvram_dirty_start > offset) + fsp_nvram_dirty_start = offset; + if (fsp_nvram_dirty_end < end) + fsp_nvram_dirty_end = end; + if (!fsp_nvram_msg && fsp_nvram_state == NVRAM_STATE_OPEN) + fsp_nvram_send_write(); + unlock(&fsp_nvram_lock); + + return 0; +} diff --git a/roms/skiboot/hw/fsp/fsp-occ.c b/roms/skiboot/hw/fsp/fsp-occ.c new file mode 100644 index 000000000..58926f408 --- /dev/null +++ b/roms/skiboot/hw/fsp/fsp-occ.c @@ -0,0 +1,417 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* + * FSP/OCC interactions + * + * Unlike OpenPOWER machines, FSP machines are much more tightly coupled + * between FSP, host, and OCC. On P8 we have to do a dance to start the + * OCC, but on P9 Hostboot does that, consistent with what we do on + * OpenPOWER. + * + * Copyright 2013-2019 IBM Corp. + */ + +#include <skiboot.h> +#include <xscom.h> +#include <xscom-p8-regs.h> +#include <io.h> +#include <cpu.h> +#include <chip.h> +#include <mem_region.h> +#include <fsp.h> +#include <timebase.h> +#include <hostservices.h> +#include <errorlog.h> +#include <opal-api.h> +#include <opal-msg.h> +#include <timer.h> +#include <i2c.h> +#include <powercap.h> +#include <psr.h> +#include <sensor.h> +#include <occ.h> + +DEFINE_LOG_ENTRY(OPAL_RC_OCC_LOAD, OPAL_PLATFORM_ERR_EVT, OPAL_OCC, + OPAL_CEC_HARDWARE, OPAL_PREDICTIVE_ERR_GENERAL, + OPAL_NA); + +DEFINE_LOG_ENTRY(OPAL_RC_OCC_RESET, OPAL_PLATFORM_ERR_EVT, OPAL_OCC, + OPAL_CEC_HARDWARE, OPAL_PREDICTIVE_ERR_GENERAL, + OPAL_NA); + +struct occ_load_req { + u8 scope; + u32 dbob_id; + u32 seq_id; + struct list_node link; +}; +static LIST_HEAD(occ_load_req_list); + + +static void occ_queue_load(u8 scope, u32 dbob_id, u32 seq_id) +{ + struct occ_load_req *occ_req; + + occ_req = zalloc(sizeof(struct occ_load_req)); + if (!occ_req) { + /** + * @fwts-label OCCload_reqENOMEM + * @fwts-advice ENOMEM while allocating OCC load message. + * OCCs not started, consequently no power/frequency scaling + * will be functional. 
+ */ + prlog(PR_ERR, "OCC: Could not allocate occ_load_req\n"); + return; + } + + occ_req->scope = scope; + occ_req->dbob_id = dbob_id; + occ_req->seq_id = seq_id; + list_add_tail(&occ_load_req_list, &occ_req->link); +} + +static void __occ_do_load(u8 scope, u32 dbob_id __unused, u32 seq_id) +{ + struct fsp_msg *stat; + int rc = -ENOMEM; + int status_word = 0; + struct proc_chip *chip = next_chip(NULL); + + /* Call HBRT... */ + rc = host_services_occ_load(); + + /* Handle fallback to preload */ + if (rc == -ENOENT && chip->homer_base) { + prlog(PR_INFO, "OCC: Load: Fallback to preloaded image\n"); + rc = 0; + } else if (!rc) { + struct opal_occ_msg occ_msg = { CPU_TO_BE64(OCC_LOAD), 0, 0 }; + + rc = _opal_queue_msg(OPAL_MSG_OCC, NULL, NULL, + sizeof(struct opal_occ_msg), &occ_msg); + if (rc) + prlog(PR_INFO, "OCC: Failed to queue message %d\n", + OCC_LOAD); + + /* Success, start OCC */ + rc = host_services_occ_start(); + } + if (rc) { + /* If either of hostservices call fail, send fail to FSP */ + /* Find a chip ID to send failure */ + for_each_chip(chip) { + if (scope == 0x01 && dbob_id != chip->dbob_id) + continue; + status_word = 0xB500 | (chip->pcid & 0xff); + break; + } + log_simple_error(&e_info(OPAL_RC_OCC_LOAD), + "OCC: Error %d in load/start OCC\n", rc); + } + + /* Send a single response for all chips */ + stat = fsp_mkmsg(FSP_CMD_LOAD_OCC_STAT, 2, status_word, seq_id); + if (stat) + rc = fsp_queue_msg(stat, fsp_freemsg); + if (rc) { + log_simple_error(&e_info(OPAL_RC_OCC_LOAD), + "OCC: Error %d queueing FSP OCC LOAD STATUS msg", rc); + fsp_freemsg(stat); + } +} + +void occ_poke_load_queue(void) +{ + struct occ_load_req *occ_req, *next; + + if (list_empty(&occ_load_req_list)) + return; + + list_for_each_safe(&occ_load_req_list, occ_req, next, link) { + __occ_do_load(occ_req->scope, occ_req->dbob_id, + occ_req->seq_id); + list_del(&occ_req->link); + free(occ_req); + } +} + +static u32 last_seq_id; +static bool in_ipl = true; +static void occ_do_load(u8 scope, u32 dbob_id __unused, u32 seq_id) +{ + struct fsp_msg *rsp; + int rc = -ENOMEM; + u8 err = 0; + + if (scope != 0x01 && scope != 0x02) { + /** + * @fwts-label OCCLoadInvalidScope + * @fwts-advice Invalid request for loading OCCs. Power and + * frequency management not functional + */ + prlog(PR_ERR, "OCC: Load message with invalid scope 0x%x\n", + scope); + err = 0x22; + } + + /* First queue up an OK response to the load message itself */ + rsp = fsp_mkmsg(FSP_RSP_LOAD_OCC | err, 0); + if (rsp) + rc = fsp_queue_msg(rsp, fsp_freemsg); + if (rc) { + log_simple_error(&e_info(OPAL_RC_OCC_LOAD), + "OCC: Error %d queueing FSP OCC LOAD reply\n", rc); + fsp_freemsg(rsp); + return; + } + + if (err) + return; + + if (proc_gen >= proc_gen_p9) { + if (in_ipl) { + /* OCC is pre-loaded in P9, so send SUCCESS to FSP */ + rsp = fsp_mkmsg(FSP_CMD_LOAD_OCC_STAT, 2, 0, seq_id); + if (!rsp) + return; + + rc = fsp_queue_msg(rsp, fsp_freemsg); + if (rc) { + log_simple_error(&e_info(OPAL_RC_OCC_LOAD), + "OCC: Error %d queueing OCC LOAD STATUS msg", + rc); + fsp_freemsg(rsp); + } + in_ipl = false; + } else { + struct proc_chip *chip = next_chip(NULL); + + last_seq_id = seq_id; + prd_fsp_occ_load_start(chip->id); + } + return; + } + + /* + * Check if hostservices lid caching is complete. If not, queue + * the load request. 
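A hedged sketch of the scope handling used in the chip walks above: the meaning of the two values is not spelled out in this file, but from the checks, scope 0x01 appears to restrict an OCC operation to chips whose dbob_id matches the request, while 0x02 applies to every chip. The struct below is a minimal stand-in, not skiboot's proc_chip.

#include <stdbool.h>
#include <stdint.h>

// Minimal stand-in for the proc_chip fields used by the scope checks
struct occ_chip { uint32_t dbob_id; uint32_t pcid; };

// Per-chip scope test: 0x02 = whole system, 0x01 = one DBOB only
static bool occ_scope_matches(uint8_t scope, uint32_t dbob_id,
                              const struct occ_chip *c)
{
        if (scope == 0x02)
                return true;
        return scope == 0x01 && c->dbob_id == dbob_id;
}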
+ */ + if (!hservices_lid_preload_complete()) { + occ_queue_load(scope, dbob_id, seq_id); + return; + } + + __occ_do_load(scope, dbob_id, seq_id); +} + +int fsp_occ_reset_status(u64 chipid, s64 status) +{ + struct fsp_msg *stat; + int rc = OPAL_NO_MEM; + int status_word = 0; + + prlog(PR_INFO, "HBRT: OCC stop() completed with %lld\n", status); + + if (status) { + struct proc_chip *chip = get_chip(chipid); + + if (!chip) + return OPAL_PARAMETER; + + status_word = 0xfe00 | (chip->pcid & 0xff); + log_simple_error(&e_info(OPAL_RC_OCC_RESET), + "OCC: Error %lld in OCC reset of chip %lld\n", + status, chipid); + } else { + occ_msg_queue_occ_reset(); + } + + stat = fsp_mkmsg(FSP_CMD_RESET_OCC_STAT, 2, status_word, last_seq_id); + if (!stat) + return rc; + + rc = fsp_queue_msg(stat, fsp_freemsg); + if (rc) { + fsp_freemsg(stat); + log_simple_error(&e_info(OPAL_RC_OCC_RESET), + "OCC: Error %d queueing FSP OCC RESET STATUS message\n", + rc); + } + return rc; +} + +int fsp_occ_load_start_status(u64 chipid, s64 status) +{ + struct fsp_msg *stat; + int rc = OPAL_NO_MEM; + int status_word = 0; + + if (status) { + struct proc_chip *chip = get_chip(chipid); + + if (!chip) + return OPAL_PARAMETER; + + status_word = 0xB500 | (chip->pcid & 0xff); + log_simple_error(&e_info(OPAL_RC_OCC_LOAD), + "OCC: Error %d in load/start OCC %lld\n", rc, + chipid); + } + + stat = fsp_mkmsg(FSP_CMD_LOAD_OCC_STAT, 2, status_word, last_seq_id); + if (!stat) + return rc; + + rc = fsp_queue_msg(stat, fsp_freemsg); + if (rc) { + fsp_freemsg(stat); + log_simple_error(&e_info(OPAL_RC_OCC_LOAD), + "OCC: Error %d queueing FSP OCC LOAD STATUS msg", rc); + } + + return rc; +} + +static void occ_do_reset(u8 scope, u32 dbob_id, u32 seq_id) +{ + struct fsp_msg *rsp, *stat; + struct proc_chip *chip = next_chip(NULL); + int rc = -ENOMEM; + u8 err = 0; + + /* Check arguments */ + if (scope != 0x01 && scope != 0x02) { + /** + * @fwts-label OCCResetInvalidScope + * @fwts-advice Invalid request for resetting OCCs. Power and + * frequency management not functional + */ + prlog(PR_ERR, "OCC: Reset message with invalid scope 0x%x\n", + scope); + err = 0x22; + } + + /* First queue up an OK response to the reset message itself */ + rsp = fsp_mkmsg(FSP_RSP_RESET_OCC | err, 0); + if (rsp) + rc = fsp_queue_msg(rsp, fsp_freemsg); + if (rc) { + fsp_freemsg(rsp); + log_simple_error(&e_info(OPAL_RC_OCC_RESET), + "OCC: Error %d queueing FSP OCC RESET reply\n", rc); + return; + } + + /* If we had an error, return */ + if (err) + return; + + /* + * Call HBRT to stop OCC and leave it stopped. FSP will send load/start + * request subsequently. Also after few runtime restarts (currently 3), + * FSP will request OCC to left in stopped state. 
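For reference, a small helper capturing how the status words in the surrounding handlers are composed; the 0xB5 and 0xFE codes are taken from the uses above (load/start failure and reset failure respectively), with the low byte carrying the failing chip's pcid.

#include <stdint.h>

// Status word used with FSP_CMD_LOAD_OCC_STAT / FSP_CMD_RESET_OCC_STAT:
// high byte = result code (0x00 success, 0xB5 load/start failure,
// 0xFE reset failure), low byte = processor chip id of the failing chip.
static inline uint16_t occ_fail_status(uint8_t code, uint32_t pcid)
{
        return (uint16_t)(((uint32_t)code << 8) | (pcid & 0xff));
}

// e.g. occ_fail_status(0xB5, chip_pcid) mirrors 0xB500 | (chip->pcid & 0xff)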
+ */ + + switch (proc_gen) { + case proc_gen_p8: + rc = host_services_occ_stop(); + break; + case proc_gen_p9: + case proc_gen_p10: + last_seq_id = seq_id; + chip = next_chip(NULL); + prd_fsp_occ_reset(chip->id); + return; + default: + return; + } + + /* Handle fallback to preload */ + if (rc == -ENOENT && chip->homer_base) { + prlog(PR_INFO, "OCC: Reset: Fallback to preloaded image\n"); + rc = 0; + } + if (!rc) { + /* Send a single success response for all chips */ + stat = fsp_mkmsg(FSP_CMD_RESET_OCC_STAT, 2, 0, seq_id); + if (stat) + rc = fsp_queue_msg(stat, fsp_freemsg); + if (rc) { + fsp_freemsg(stat); + log_simple_error(&e_info(OPAL_RC_OCC_RESET), + "OCC: Error %d queueing FSP OCC RESET" + " STATUS message\n", rc); + } + occ_msg_queue_occ_reset(); + } else { + + /* + * Then send a matching OCC Reset Status message with an 0xFE + * (fail) response code as well to the first matching chip + */ + for_each_chip(chip) { + if (scope == 0x01 && dbob_id != chip->dbob_id) + continue; + rc = -ENOMEM; + stat = fsp_mkmsg(FSP_CMD_RESET_OCC_STAT, 2, + 0xfe00 | (chip->pcid & 0xff), seq_id); + if (stat) + rc = fsp_queue_msg(stat, fsp_freemsg); + if (rc) { + fsp_freemsg(stat); + log_simple_error(&e_info(OPAL_RC_OCC_RESET), + "OCC: Error %d queueing FSP OCC RESET" + " STATUS message\n", rc); + } + break; + } + } +} + +static bool fsp_occ_msg(u32 cmd_sub_mod, struct fsp_msg *msg) +{ + u32 dbob_id, seq_id; + u8 scope; + + switch (cmd_sub_mod) { + case FSP_CMD_LOAD_OCC: + /* + * We get the "Load OCC" command at boot. We don't currently + * support loading it ourselves (we don't have the procedures, + * they will come with Host Services). For now HostBoot will + * have loaded a OCC firmware for us, but we still need to + * be nice and respond to OCC. + */ + scope = msg->data.bytes[3]; + dbob_id = fsp_msg_get_data_word(msg, 1); + seq_id = fsp_msg_get_data_word(msg, 2); + prlog(PR_INFO, "OCC: Got OCC Load message, scope=0x%x" + " dbob=0x%x seq=0x%x\n", scope, dbob_id, seq_id); + occ_do_load(scope, dbob_id, seq_id); + return true; + + case FSP_CMD_RESET_OCC: + /* + * We shouldn't be getting this one, but if we do, we have + * to reply something sensible or the FSP will get upset + */ + scope = msg->data.bytes[3]; + dbob_id = fsp_msg_get_data_word(msg, 1); + seq_id = fsp_msg_get_data_word(msg, 2); + prlog(PR_INFO, "OCC: Got OCC Reset message, scope=0x%x" + " dbob=0x%x seq=0x%x\n", scope, dbob_id, seq_id); + occ_do_reset(scope, dbob_id, seq_id); + return true; + } + return false; +} + +static struct fsp_client fsp_occ_client = { + .message = fsp_occ_msg, +}; + +void occ_fsp_init(void) +{ + /* If we have an FSP, register for notifications */ + if (fsp_present()) + fsp_register_client(&fsp_occ_client, FSP_MCLASS_OCC); +} diff --git a/roms/skiboot/hw/fsp/fsp-op-panel.c b/roms/skiboot/hw/fsp/fsp-op-panel.c new file mode 100644 index 000000000..a8ac00b7a --- /dev/null +++ b/roms/skiboot/hw/fsp/fsp-op-panel.c @@ -0,0 +1,266 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* + * Small LCD screen on the front of FSP machines + * + * Copyright 2013-2019 IBM Corp. 
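The OCC handlers above all follow the same allocate/queue/free pattern for FSP responses; shown in isolation below, assuming skiboot's fsp.h declarations for fsp_mkmsg(), fsp_queue_msg() and fsp_freemsg(). The helper name itself is made up for illustration.

#include <fsp.h>

// Common response pattern: allocate, queue with fsp_freemsg as completion
// (so the message is freed once sent), free immediately if queueing fails.
static void send_simple_response(u32 cmd)
{
        struct fsp_msg *rsp = fsp_mkmsg(cmd, 0);

        if (!rsp)
                return;
        if (fsp_queue_msg(rsp, fsp_freemsg))
                fsp_freemsg(rsp);
}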
+ */ + +#include <skiboot.h> +#include <fsp.h> +#include <lock.h> +#include <opal.h> +#include <device.h> +#include <processor.h> +#include <opal-msg.h> +#include <errorlog.h> + +DEFINE_LOG_ENTRY(OPAL_RC_PANEL_WRITE, OPAL_PLATFORM_ERR_EVT, OPAL_OP_PANEL, + OPAL_MISC_SUBSYSTEM, OPAL_PREDICTIVE_ERR_GENERAL, OPAL_NA); + +/* For OPAL OP_PANEL API we can only have one in flight due to TCEs */ +static struct fsp_msg *op_req; +static uint64_t op_async_token; +static struct lock op_lock = LOCK_UNLOCKED; + +static void fsp_op_display_fatal(uint32_t w0, uint32_t w1) +{ + static struct fsp_msg op_msg_resp; + static struct fsp_msg op_msg = { + .resp = &op_msg_resp, + }; + + fsp_fillmsg(&op_msg, FSP_CMD_DISP_SRC_DIRECT, 3, 1, w0, w1); + + /* + * A special way to send a message: it doesn't run pollers. + * This means we can call it while in a poller, which we may + * well be in when we're terminating (and thus displaying a *fatal* + * message on the op-panel). + */ + fsp_fatal_msg(&op_msg); +} + +void fsp_op_display(enum op_severity sev, enum op_module mod, uint16_t code) +{ + struct fsp_msg *op_msg; + uint32_t w0; + uint32_t w1; + + if (!fsp_present()) + return; + + w0 = sev << 16 | mod; + + w1 = tohex((code >> 12) & 0xf) << 24; + w1 |= tohex((code >> 8) & 0xf) << 16; + w1 |= tohex((code >> 4) & 0xf) << 8; + w1 |= tohex((code ) & 0xf); + + if (sev == OP_FATAL) { + fsp_op_display_fatal(w0, w1); + } else { + op_msg = fsp_allocmsg(true); + if (!op_msg) { + prerror("Failed to allocate FSP message for PANEL\n"); + return; + } + + fsp_fillmsg(op_msg, FSP_CMD_DISP_SRC_DIRECT, 3, 1, w0, w1); + + if(fsp_queue_msg(op_msg, fsp_freemsg)) + prerror("Failed to queue FSP message for OP PANEL\n"); + } +} + +void op_panel_disable_src_echo(void) +{ + struct fsp_msg op_msg_resp; + struct fsp_msg op_msg = { + .resp = &op_msg_resp, + }; + + if (!fsp_present()) + return; + + fsp_fillmsg(&op_msg, FSP_CMD_DIS_SRC_ECHO, 0); + fsp_sync_msg(&op_msg, false); +} + +void op_panel_clear_src(void) +{ + struct fsp_msg op_msg_resp; + struct fsp_msg op_msg = { + .resp = &op_msg_resp, + }; + + if (!fsp_present()) + return; + + fsp_fillmsg(&op_msg, FSP_CMD_CLEAR_SRC, 0); + fsp_sync_msg(&op_msg, false); +} + +/* opal_write_oppanel - Write to the physical op panel. + * + * Pass in an array of oppanel_line_t structs defining the ASCII characters + * to display on each line of the oppanel. If there are two lines on the + * physical panel, and you only want to write to the first line, you only + * need to pass in one line. If you only want to write to the second line, + * you need to pass in both lines, and set the line_len of the first line + * to zero. + * + * This command is asynchronous. If OPAL_SUCCESS is returned, then the + * operation was initiated successfully. Subsequent calls will return + * OPAL_BUSY until the current operation is complete. 
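A hypothetical caller-side sketch of the convention described above, updating only the second panel line by passing both lines with a zero length for the first. The field names follow the accesses made further down (line / line_len, stored big-endian); the helper name and message text are made up for illustration.

#include <string.h>

// Write "BOOTING OS" to line 2 only; line 1 is skipped via line_len == 0.
static int64_t update_second_line(uint64_t async_token)
{
        static const char text[] = "BOOTING OS";
        static oppanel_line_t lines[2];

        lines[0].line     = 0;
        lines[0].line_len = 0;                         // leave line 1 untouched
        lines[1].line     = cpu_to_be64((uint64_t)(unsigned long)text);
        lines[1].line_len = cpu_to_be64(strlen(text));

        return opal_write_oppanel_async(async_token, lines, 2);
}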
+ */ +struct op_src { + uint8_t version; +#define OP_SRC_VERSION 2 + uint8_t flags; + uint8_t reserved; + uint8_t hex_word_cnt; + __be16 reserved2; + __be16 total_size; + __be32 word2; /* SRC format in low byte */ + __be32 word3; + __be32 word4; + __be32 word5; + __be32 word6; + __be32 word7; + __be32 word8; + __be32 word9; + uint8_t ascii[OP_PANEL_NUM_LINES * OP_PANEL_LINE_LEN]; /* Word 11 */ +} __packed __align(4); + +/* Page align for the sake of TCE mapping */ +static struct op_src op_src __align(0x1000); + +static void __op_panel_write_complete(struct fsp_msg *msg) +{ + fsp_tce_unmap(PSI_DMA_OP_PANEL_MISC, 0x1000); + + lock(&op_lock); + op_req = NULL; + unlock(&op_lock); + + fsp_freemsg(msg); +} + +static void op_panel_write_complete(struct fsp_msg *msg) +{ + uint8_t rc = (msg->resp->word1 >> 8) & 0xff; + + if (rc) + prerror("OPPANEL: Error 0x%02x in display command\n", rc); + + __op_panel_write_complete(msg); + + opal_queue_msg(OPAL_MSG_ASYNC_COMP, NULL, NULL, + cpu_to_be64(1), + cpu_to_be64(op_async_token)); +} + +static int64_t __opal_write_oppanel(oppanel_line_t *lines, uint64_t num_lines, + uint64_t async_token) +{ + int64_t rc = OPAL_ASYNC_COMPLETION; + int len; + int i; + + if (num_lines < 1 || num_lines > OP_PANEL_NUM_LINES) + return OPAL_PARAMETER; + + /* Only one in flight */ + lock(&op_lock); + if (op_req) { + rc = OPAL_BUSY_EVENT; + unlock(&op_lock); + goto bail; + } + + op_req = fsp_allocmsg(true); + if (!op_req) { + rc = OPAL_NO_MEM; + unlock(&op_lock); + goto bail; + } + unlock(&op_lock); + + op_async_token = async_token; + + memset(&op_src, 0, sizeof(op_src)); + + op_src.version = OP_SRC_VERSION; + op_src.flags = 0; + op_src.reserved = 0; + op_src.hex_word_cnt = 1; /* header word only */ + op_src.reserved2 = 0; + op_src.total_size = cpu_to_be16(sizeof(op_src)); + op_src.word2 = 0; /* should be unneeded */ + + for (i = 0; i < num_lines; i++) { + uint8_t *current_line = op_src.ascii + (i * OP_PANEL_LINE_LEN); + + len = be64_to_cpu(lines[i].line_len); + if (len < OP_PANEL_LINE_LEN) + memset(current_line + len, ' ', OP_PANEL_LINE_LEN-len); + else + len = OP_PANEL_LINE_LEN; + memcpy(current_line, (void *) be64_to_cpu(lines[i].line), len); + } + + for (i = 0; i < sizeof(op_src.ascii); i++) { + /* + * So, there's this interesting thing if you send + * HTML/Javascript through the Operator Panel. + * You get to inject it into the ASM web ui! + * So we filter out anything suspect here, + * at least for the time being. + * + * Allowed characters: + * . / 0-9 : a-z A-Z SPACE + */ + if (! ((op_src.ascii[i] >= '.' 
&& op_src.ascii[i] <= ':') || + (op_src.ascii[i] >= 'a' && op_src.ascii[i] <= 'z') || + (op_src.ascii[i] >= 'A' && op_src.ascii[i] <= 'Z') || + op_src.ascii[i] == ' ')) { + op_src.ascii[i] = '.'; + } + } + + fsp_tce_map(PSI_DMA_OP_PANEL_MISC, &op_src, 0x1000); + + fsp_fillmsg(op_req, FSP_CMD_DISP_SRC_INDIR, 3, 0, + PSI_DMA_OP_PANEL_MISC, sizeof(struct op_src)); + rc = fsp_queue_msg(op_req, op_panel_write_complete); + if (rc) { + __op_panel_write_complete(op_req); + rc = OPAL_INTERNAL_ERROR; + } + bail: + log_simple_error(&e_info(OPAL_RC_PANEL_WRITE), + "FSP: Error updating Op Panel: %lld\n", rc); + return rc; +} + +static int64_t opal_write_oppanel_async(uint64_t async_token, + oppanel_line_t *lines, + uint64_t num_lines) +{ + return __opal_write_oppanel(lines, num_lines, async_token); +} + +void fsp_oppanel_init(void) +{ + struct dt_node *oppanel; + + if (!fsp_present()) + return; + + opal_register(OPAL_WRITE_OPPANEL_ASYNC, opal_write_oppanel_async, 3); + + oppanel = dt_new(opal_node, "oppanel"); + dt_add_property_cells(oppanel, "#length", OP_PANEL_LINE_LEN); + dt_add_property_cells(oppanel, "#lines", OP_PANEL_NUM_LINES); + dt_add_property_string(oppanel, "compatible", "ibm,opal-oppanel"); +} diff --git a/roms/skiboot/hw/fsp/fsp-psi.c b/roms/skiboot/hw/fsp/fsp-psi.c new file mode 100644 index 000000000..38f130dd7 --- /dev/null +++ b/roms/skiboot/hw/fsp/fsp-psi.c @@ -0,0 +1,75 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* Copyright 2013-2019 IBM Corp. */ + +#include <io.h> +#include <psi.h> +#include <lock.h> +#include <fsp.h> + +static void psi_tce_enable(struct psi *psi, bool enable) +{ + void *addr = psi->regs + PSIHB_PHBSCR; + u64 val; + + val = in_be64(addr); + if (enable) + val |= PSIHB_PHBSCR_TCE_ENABLE; + else + val &= ~PSIHB_PHBSCR_TCE_ENABLE; + out_be64(addr, val); +} + +/* + * Configure the PSI interface for communicating with + * an FSP, such as enabling the TCEs, FSP commands, + * etc... + */ +void psi_init_for_fsp(struct psi *psi) +{ + uint64_t reg; + bool enable_tce = true; + + lock(&psi_lock); + + /* Disable and setup TCE base address */ + psi_tce_enable(psi, false); + + switch (proc_gen) { + case proc_gen_p8: + case proc_gen_p9: + case proc_gen_p10: + out_be64(psi->regs + PSIHB_TAR, PSI_TCE_TABLE_BASE | + PSIHB_TAR_256K_ENTRIES); + break; + default: + enable_tce = false; + }; + + /* Enable various other configuration register bits based + * on what pHyp does. We keep interrupts disabled until + * after the mailbox has been properly configured. We assume + * basic stuff such as PSI link enable is already there. + * + * - FSP CMD Enable + * - FSP MMIO Enable + * - TCE Enable + * - Error response enable + * + * Clear all other error bits + */ + if (!psi->active) { + prerror("PSI: psi_init_for_fsp() called on inactive link!\n"); + unlock(&psi_lock); + return; + } + + reg = in_be64(psi->regs + PSIHB_CR); + reg |= PSIHB_CR_FSP_CMD_ENABLE; + reg |= PSIHB_CR_FSP_MMIO_ENABLE; + reg |= PSIHB_CR_FSP_ERR_RSP_ENABLE; + reg &= ~0x00000000ffffffffull; + out_be64(psi->regs + PSIHB_CR, reg); + psi_tce_enable(psi, enable_tce); + + unlock(&psi_lock); +} diff --git a/roms/skiboot/hw/fsp/fsp-rtc.c b/roms/skiboot/hw/fsp/fsp-rtc.c new file mode 100644 index 000000000..237560a8d --- /dev/null +++ b/roms/skiboot/hw/fsp/fsp-rtc.c @@ -0,0 +1,567 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* + * Real Time Clock (RTC) attached to FSP + * + * Copyright 2013-2017 IBM Corp. 
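A standalone rework of the op-panel ASCII filter above, runnable on its own: the only characters passed through are '.' to ':' (which covers '.', '/', the digits and ':'), the two letter ranges and space; everything else becomes '.'.

#include <stdio.h>

// Same accept-list as the loop above; anything else is replaced with '.'
static char oppanel_sanitize(char c)
{
        if ((c >= '.' && c <= ':') ||
            (c >= 'a' && c <= 'z') ||
            (c >= 'A' && c <= 'Z') ||
            c == ' ')
                return c;
        return '.';
}

int main(void)
{
        char buf[] = "<script>alert(1)</script>";
        int i;

        for (i = 0; buf[i]; i++)
                buf[i] = oppanel_sanitize(buf[i]);
        printf("%s\n", buf);          // prints ".script.alert.1../script."
        return 0;
}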
+ */ + +#include <skiboot.h> +#include <fsp.h> +#include <lock.h> +#include <timebase.h> +#include <time.h> +#include <time-utils.h> +#include <opal-api.h> +#include <opal-msg.h> +#include <errorlog.h> +#include <device.h> + +/* + * Note on how those operate: + * + * Because the RTC calls can be pretty slow, these functions will shoot + * an asynchronous request to the FSP (if none is already pending) + * + * The requests will return OPAL_BUSY_EVENT as long as the event has + * not been completed. + * + * WARNING: An attempt at doing an RTC write while one is already pending + * will simply ignore the new arguments and continue returning + * OPAL_BUSY_EVENT. This is to be compatible with existing Linux code. + * + * Completion of the request will result in an event OPAL_EVENT_RTC + * being signaled, which will remain raised until a corresponding call + * to opal_rtc_read() or opal_rtc_write() finally returns OPAL_SUCCESS, + * at which point the operation is complete and the event cleared. + * + * If we end up taking longer than rtc_read_timeout_ms millieconds waiting + * for the response from a read request, we simply return a cached value (plus + * an offset calculated from the timebase. When the read request finally + * returns, we update our cache value accordingly. + * + * There is two separate set of state for reads and writes. If both are + * attempted at the same time, the event bit will remain set as long as either + * of the two has a pending event to signal. + */ + +#include <rtc.h> + +/* All of the below state is protected by rtc_lock. + * It should be held for the shortest amount of time possible. + * Certainly not across calls to FSP. + */ +static struct lock rtc_lock; + +static enum { + RTC_TOD_VALID, + RTC_TOD_INVALID, + RTC_TOD_PERMANENT_ERROR, +} rtc_tod_state = RTC_TOD_INVALID; + +/* State machine for getting an RTC request. 
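A hypothetical consumer of the protocol described above: keep retrying while the call reports OPAL_BUSY_EVENT and stop once it returns a final status (a successful return also clears OPAL_EVENT_RTC). Real callers would sleep or poll events between retries; that is elided here.

// Busy-wait sketch only; a real OS client waits on OPAL_EVENT_RTC instead
static int64_t read_rtc_blocking(__be32 *ymd, __be64 *hmsm)
{
        int64_t rc;

        do {
                rc = fsp_opal_rtc_read(ymd, hmsm);
        } while (rc == OPAL_BUSY_EVENT);

        return rc;    // OPAL_SUCCESS, OPAL_HARDWARE or OPAL_INTERNAL_ERROR
}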
+ * RTC_{READ/WRITE}_NO_REQUEST -> RTC_{READ/WRITE}_PENDING_REQUEST (one in flight) + * RTC_{READ/WRITE}_PENDING_REQUEST -> RTC_{READ/WRITE}_REQUEST_AVAILABLE, + * when FSP responds + * RTC_{READ/WRITE}_REQUEST_AVAILABLE -> RTC_{READ/WRITE}_NO_REQUEST, + * when OS retrieves it + */ +static enum { + RTC_READ_NO_REQUEST, + RTC_READ_PENDING_REQUEST, + RTC_READ_REQUEST_AVAILABLE, +} rtc_read_request_state = RTC_READ_NO_REQUEST; + +static enum { + RTC_WRITE_NO_REQUEST, + RTC_WRITE_PENDING_REQUEST, + RTC_WRITE_REQUEST_AVAILABLE, +} rtc_write_request_state = RTC_WRITE_NO_REQUEST; + +static bool rtc_tod_cache_dirty = false; + +struct opal_tpo_data { + uint64_t tpo_async_token; + __be32 *year_month_day; + __be32 *hour_min; +}; + +/* Timebase value when we last initiated a RTC read request */ +static unsigned long read_req_tb; + +/* If a RTC read takes longer than this, we return a value generated + * from the cache + timebase */ +static const int rtc_read_timeout_ms = 1500; + +DEFINE_LOG_ENTRY(OPAL_RC_RTC_TOD, OPAL_PLATFORM_ERR_EVT, OPAL_RTC, + OPAL_PLATFORM_FIRMWARE, OPAL_INFO, OPAL_NA); + +DEFINE_LOG_ENTRY(OPAL_RC_RTC_READ, OPAL_PLATFORM_ERR_EVT, OPAL_RTC, + OPAL_PLATFORM_FIRMWARE, OPAL_INFO, OPAL_NA); + +static void fsp_tpo_req_complete(struct fsp_msg *read_resp) +{ + struct opal_tpo_data *attr = read_resp->user_data; + int val; + int rc; + + val = (read_resp->resp->word1 >> 8) & 0xff; + switch (val) { + case FSP_STATUS_TOD_RESET: + log_simple_error(&e_info(OPAL_RC_RTC_TOD), + "RTC TPO in invalid state\n"); + rc = OPAL_INTERNAL_ERROR; + break; + + case FSP_STATUS_TOD_PERMANENT_ERROR: + log_simple_error(&e_info(OPAL_RC_RTC_TOD), + "RTC TPO in permanent error state\n"); + rc = OPAL_INTERNAL_ERROR; + break; + case FSP_STATUS_INVALID_DATA: + log_simple_error(&e_info(OPAL_RC_RTC_TOD), + "RTC TPO: Invalid data\n"); + rc = OPAL_PARAMETER; + break; + case FSP_STATUS_SUCCESS: + /* Save the read TPO value in our cache */ + if (attr->year_month_day) + *attr->year_month_day = cpu_to_be32(fsp_msg_get_data_word(read_resp->resp, 0)); + if (attr->hour_min) + *attr->hour_min = cpu_to_be32(fsp_msg_get_data_word(read_resp->resp, 1)); + rc = OPAL_SUCCESS; + break; + + default: + log_simple_error(&e_info(OPAL_RC_RTC_TOD), + "TPO read failed: %d\n", val); + rc = OPAL_INTERNAL_ERROR; + break; + } + opal_queue_msg(OPAL_MSG_ASYNC_COMP, NULL, NULL, + cpu_to_be64(attr->tpo_async_token), + cpu_to_be64(rc)); + free(attr); + fsp_freemsg(read_resp); +} + +static void fsp_rtc_process_read(struct fsp_msg *read_resp) +{ + int val = (read_resp->word1 >> 8) & 0xff; + struct tm tm; + + assert(lock_held_by_me(&rtc_lock)); + + assert(rtc_read_request_state == RTC_READ_PENDING_REQUEST); + + switch (val) { + case FSP_STATUS_TOD_RESET: + log_simple_error(&e_info(OPAL_RC_RTC_TOD), + "RTC TOD in invalid state\n"); + rtc_tod_state = RTC_TOD_INVALID; + break; + + case FSP_STATUS_TOD_PERMANENT_ERROR: + log_simple_error(&e_info(OPAL_RC_RTC_TOD), + "RTC TOD in permanent error state\n"); + rtc_tod_state = RTC_TOD_PERMANENT_ERROR; + break; + + case FSP_STATUS_SUCCESS: + /* Save the read RTC value in our cache */ + rtc_tod_state = RTC_TOD_VALID; + datetime_to_tm(fsp_msg_get_data_word(read_resp, 0), + (u64)fsp_msg_get_data_word(read_resp, 1) << 32, &tm); + rtc_cache_update(&tm); + prlog(PR_TRACE, "FSP-RTC Got time: %d-%d-%d %d:%d:%d\n", + tm.tm_year, tm.tm_mon, tm.tm_mday, + tm.tm_hour, tm.tm_min, tm.tm_sec); + break; + + default: + log_simple_error(&e_info(OPAL_RC_RTC_TOD), + "RTC TOD read failed: %d\n", val); + rtc_tod_state = RTC_TOD_INVALID; + 
} + rtc_read_request_state = RTC_READ_REQUEST_AVAILABLE; +} + +static void opal_rtc_eval_events(bool read_write) +{ + bool request_available; + + if (read_write) + request_available = (rtc_read_request_state == + RTC_READ_REQUEST_AVAILABLE); + else + request_available = (rtc_write_request_state == + RTC_WRITE_REQUEST_AVAILABLE); + + assert(lock_held_by_me(&rtc_lock)); + opal_update_pending_evt(OPAL_EVENT_RTC, + request_available ? OPAL_EVENT_RTC : 0); +} + +static void fsp_rtc_req_complete(struct fsp_msg *msg) +{ + lock(&rtc_lock); + prlog(PR_TRACE, "RTC completion %p\n", msg); + + if (fsp_msg_cmd(msg) == (FSP_CMD_READ_TOD & 0xffffff)) { + fsp_rtc_process_read(msg->resp); + opal_rtc_eval_events(true); + } else { + assert(rtc_write_request_state == RTC_WRITE_PENDING_REQUEST); + rtc_write_request_state = RTC_WRITE_REQUEST_AVAILABLE; + opal_rtc_eval_events(false); + } + + unlock(&rtc_lock); + fsp_freemsg(msg); +} + +static int64_t fsp_rtc_send_read_request(void) +{ + struct fsp_msg *msg; + int rc; + + assert(lock_held_by_me(&rtc_lock)); + assert(rtc_read_request_state == RTC_READ_NO_REQUEST); + + msg = fsp_mkmsg(FSP_CMD_READ_TOD, 0); + if (!msg) { + log_simple_error(&e_info(OPAL_RC_RTC_READ), + "RTC: failed to allocate read message\n"); + return OPAL_INTERNAL_ERROR; + } + + rc = fsp_queue_msg(msg, fsp_rtc_req_complete); + if (rc) { + fsp_freemsg(msg); + log_simple_error(&e_info(OPAL_RC_RTC_READ), + "RTC: failed to queue read message: %d\n", rc); + return OPAL_INTERNAL_ERROR; + } + + rtc_read_request_state = RTC_READ_PENDING_REQUEST; + + read_req_tb = mftb(); + + return OPAL_BUSY_EVENT; +} + +static int64_t fsp_opal_rtc_read(__be32 *__ymd, __be64 *__hmsm) +{ + int64_t rc; + uint32_t ymd; + uint64_t hmsm; + + if (!__ymd || !__hmsm) + return OPAL_PARAMETER; + + lock(&rtc_lock); + + if (rtc_tod_state == RTC_TOD_PERMANENT_ERROR) { + rc = OPAL_HARDWARE; + goto out; + } + + /* During R/R of FSP, read cached TOD */ + if (fsp_in_rr()) { + if (rtc_tod_state == RTC_TOD_VALID) { + rtc_cache_get_datetime(&ymd, &hmsm); + rc = OPAL_SUCCESS; + } else { + rc = OPAL_INTERNAL_ERROR; + } + goto out; + } + + /* If we don't have a read pending already, fire off a request and + * return */ + if (rtc_read_request_state == RTC_READ_NO_REQUEST) { + prlog(PR_TRACE, "Sending new RTC read request\n"); + rc = fsp_rtc_send_read_request(); + /* If our pending read is done, clear events and return the time + * from the cache */ + } else if (rtc_read_request_state == RTC_READ_REQUEST_AVAILABLE) { + prlog(PR_TRACE, "RTC read complete, state %d\n", rtc_tod_state); + rtc_read_request_state = RTC_READ_NO_REQUEST; + + opal_rtc_eval_events(true); + + if (rtc_tod_state == RTC_TOD_VALID) { + rtc_cache_get_datetime(&ymd, &hmsm); + prlog(PR_TRACE,"FSP-RTC Cached datetime: %x %llx\n", + ymd, hmsm); + rc = OPAL_SUCCESS; + } else { + rc = OPAL_INTERNAL_ERROR; + } + + /* Timeout: return our cached value (updated from tb), but leave the + * read request pending so it will update the cache later */ + } else if (mftb() > read_req_tb + msecs_to_tb(rtc_read_timeout_ms)) { + prlog(PR_TRACE, "RTC read timed out\n"); + + if (rtc_tod_state == RTC_TOD_VALID) { + rtc_cache_get_datetime(&ymd, &hmsm); + rc = OPAL_SUCCESS; + } else { + rc = OPAL_INTERNAL_ERROR; + } + /* Otherwise, we're still waiting on the read to complete */ + } else { + assert(rtc_read_request_state == RTC_READ_PENDING_REQUEST); + rc = OPAL_BUSY_EVENT; + } +out: + unlock(&rtc_lock); + + if (rc == OPAL_SUCCESS) { + *__ymd = cpu_to_be32(ymd); + *__hmsm = cpu_to_be64(hmsm); + } + + 
return rc; +} + +static int64_t fsp_rtc_send_write_request(uint32_t year_month_day, + uint64_t hour_minute_second_millisecond) +{ + struct fsp_msg *msg; + uint32_t w0, w1, w2; + + assert(lock_held_by_me(&rtc_lock)); + assert(rtc_write_request_state == RTC_WRITE_NO_REQUEST); + + /* Create a request and send it. Just like for read, we ignore + * the "millisecond" field which is probably supposed to be + * microseconds and which Linux ignores as well anyway + */ + w0 = year_month_day; + w1 = (hour_minute_second_millisecond >> 32) & 0xffffff00; + w2 = 0; + + msg = fsp_mkmsg(FSP_CMD_WRITE_TOD, 3, w0, w1, w2); + if (!msg) { + prlog(PR_TRACE, " -> allocation failed !\n"); + return OPAL_INTERNAL_ERROR; + } + prlog(PR_TRACE, " -> req at %p\n", msg); + + if (fsp_queue_msg(msg, fsp_rtc_req_complete)) { + prlog(PR_TRACE, " -> queueing failed !\n"); + fsp_freemsg(msg); + return OPAL_INTERNAL_ERROR; + } + + rtc_write_request_state = RTC_WRITE_PENDING_REQUEST; + + return OPAL_BUSY_EVENT; +} + +static int64_t fsp_opal_rtc_write(uint32_t year_month_day, + uint64_t hour_minute_second_millisecond) +{ + int rc; + struct tm tm; + + lock(&rtc_lock); + if (rtc_tod_state == RTC_TOD_PERMANENT_ERROR) { + rc = OPAL_HARDWARE; + goto out; + } + + if (fsp_in_rr()) { + datetime_to_tm(year_month_day, + hour_minute_second_millisecond, &tm); + rtc_cache_update(&tm); + rtc_tod_cache_dirty = true; + rc = OPAL_SUCCESS; + goto out; + } + + if (rtc_write_request_state == RTC_WRITE_NO_REQUEST) { + prlog(PR_TRACE, "Sending new RTC write request\n"); + rc = fsp_rtc_send_write_request(year_month_day, + hour_minute_second_millisecond); + } else if (rtc_write_request_state == RTC_WRITE_PENDING_REQUEST) { + rc = OPAL_BUSY_EVENT; + } else { + assert(rtc_write_request_state == RTC_WRITE_REQUEST_AVAILABLE); + rtc_write_request_state = RTC_WRITE_NO_REQUEST; + + opal_rtc_eval_events(false); + rc = OPAL_SUCCESS; + } + +out: + unlock(&rtc_lock); + return rc; +} + +/* Set timed power on values to fsp */ +static int64_t fsp_opal_tpo_write(uint64_t async_token, uint32_t y_m_d, + uint32_t hr_min) +{ + static struct opal_tpo_data *attr; + struct fsp_msg *msg; + + if (!fsp_present()) + return OPAL_HARDWARE; + + attr = zalloc(sizeof(struct opal_tpo_data)); + if (!attr) + return OPAL_NO_MEM; + + /* Create a request and send it.*/ + attr->tpo_async_token = async_token; + + /* check if this is a disable tpo request */ + if (y_m_d == 0 && hr_min == 0) { + prlog(PR_TRACE, "Sending TPO disable request...\n"); + msg = fsp_mkmsg(FSP_CMD_TPO_DISABLE, 0); + } else { + prlog(PR_TRACE, "Sending TPO write request...\n"); + msg = fsp_mkmsg(FSP_CMD_TPO_WRITE, 2, y_m_d, hr_min); + } + + if (!msg) { + prerror("TPO: Failed to create message for WRITE to FSP\n"); + free(attr); + return OPAL_INTERNAL_ERROR; + } + msg->user_data = attr; + if (fsp_queue_msg(msg, fsp_tpo_req_complete)) { + free(attr); + fsp_freemsg(msg); + return OPAL_INTERNAL_ERROR; + } + return OPAL_ASYNC_COMPLETION; +} + +/* Read Timed power on (TPO) from FSP */ +static int64_t fsp_opal_tpo_read(uint64_t async_token, __be32 *y_m_d, + __be32 *hr_min) +{ + static struct opal_tpo_data *attr; + struct fsp_msg *msg; + int64_t rc; + + if (!fsp_present()) + return OPAL_HARDWARE; + + if (!y_m_d || !hr_min) + return OPAL_PARAMETER; + + attr = zalloc(sizeof(*attr)); + if (!attr) + return OPAL_NO_MEM; + + /* Send read requet to FSP */ + attr->tpo_async_token = async_token; + attr->year_month_day = y_m_d; + attr->hour_min = hr_min; + + prlog(PR_TRACE, "Sending new TPO read request\n"); + msg = 
fsp_mkmsg(FSP_CMD_TPO_READ, 0); + if (!msg) { + log_simple_error(&e_info(OPAL_RC_RTC_READ), + "TPO: failed to allocate read message\n"); + free(attr); + return OPAL_INTERNAL_ERROR; + } + msg->user_data = attr; + rc = fsp_queue_msg(msg, fsp_tpo_req_complete); + if (rc) { + free(attr); + fsp_freemsg(msg); + log_simple_error(&e_info(OPAL_RC_RTC_READ), + "TPO: failed to queue read message: %lld\n", rc); + return OPAL_INTERNAL_ERROR; + } + return OPAL_ASYNC_COMPLETION; +} + +static void rtc_flush_cached_tod(void) +{ + struct fsp_msg *msg; + uint64_t h_m_s_m; + uint32_t y_m_d; + + if (rtc_cache_get_datetime(&y_m_d, &h_m_s_m)) + return; + msg = fsp_mkmsg(FSP_CMD_WRITE_TOD, 3, y_m_d, + (h_m_s_m >> 32) & 0xffffff00, 0); + if (!msg) { + prerror("TPO: %s : Failed to allocate write TOD message\n", + __func__); + return; + } + if (fsp_queue_msg(msg, fsp_freemsg)) { + fsp_freemsg(msg); + prerror("TPO: %s : Failed to queue WRITE_TOD command\n", + __func__); + return; + } +} + +static bool fsp_rtc_msg_rr(u32 cmd_sub_mod, struct fsp_msg *msg) +{ + + int rc = false; + assert(msg == NULL); + + switch (cmd_sub_mod) { + case FSP_RESET_START: + rc = true; + break; + case FSP_RELOAD_COMPLETE: + lock(&rtc_lock); + if (rtc_tod_cache_dirty) { + rtc_flush_cached_tod(); + rtc_tod_cache_dirty = false; + } + unlock(&rtc_lock); + rc = true; + break; + } + + return rc; +} + +static struct fsp_client fsp_rtc_client_rr = { + .message = fsp_rtc_msg_rr, +}; + +void fsp_rtc_init(void) +{ + struct dt_node *np; + + if (!fsp_present()) { + rtc_tod_state = RTC_TOD_PERMANENT_ERROR; + return; + } + + opal_register(OPAL_RTC_READ, fsp_opal_rtc_read, 2); + opal_register(OPAL_RTC_WRITE, fsp_opal_rtc_write, 2); + opal_register(OPAL_WRITE_TPO, fsp_opal_tpo_write, 3); + opal_register(OPAL_READ_TPO, fsp_opal_tpo_read, 3); + + np = dt_new(opal_node, "rtc"); + dt_add_property_strings(np, "compatible", "ibm,opal-rtc"); + dt_add_property(np, "has-tpo", NULL, 0); + + /* Register for the reset/reload event */ + fsp_register_client(&fsp_rtc_client_rr, FSP_MCLASS_RR_EVENT); + + prlog(PR_TRACE, "Getting initial RTC TOD\n"); + + /* We don't wait for RTC response and this is actually okay as + * any OPAL callers will wait correctly and if we ever have + * internal users then they should check the state properly + */ + lock(&rtc_lock); + fsp_rtc_send_read_request(); + unlock(&rtc_lock); +} diff --git a/roms/skiboot/hw/fsp/fsp-sensor.c b/roms/skiboot/hw/fsp/fsp-sensor.c new file mode 100644 index 000000000..ffcd004f3 --- /dev/null +++ b/roms/skiboot/hw/fsp/fsp-sensor.c @@ -0,0 +1,860 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* + * This code will enable the 'powernv' to retrieve sensor related data from FSP + * using SPCN passthru mailbox commands. + * + * The OPAL read sensor API in Sapphire is implemented as an 'asynchronous' read + * call that returns after queuing the read request. A unique sensor-id is + * expected as an argument for OPAL read call which has already been exported + * to the device tree during fsp init. The sapphire code decodes this Id to + * determine requested attribute and sensor. + * + * Copyright 2013-2017 IBM Corp. 
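The same word packing is open-coded twice in this file (fsp_rtc_send_write_request() and rtc_flush_cached_tod() above); a small helper capturing it, with the caveat that only the masking is taken from the code, while the byte meanings follow the surrounding comments (hour/minute/second kept, the "millisecond" byte dropped).

#include <stdint.h>

// Pack the FSP_CMD_WRITE_TOD data words: w0 carries the date as given,
// w1 keeps the top three bytes of the upper half of hmsm and drops the
// low "millisecond" byte, w2 is always zero.
static void pack_write_tod(uint32_t ymd, uint64_t hmsm,
                           uint32_t *w0, uint32_t *w1, uint32_t *w2)
{
        *w0 = ymd;
        *w1 = (uint32_t)(hmsm >> 32) & 0xffffff00;
        *w2 = 0;
}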
+ */ + +#include <skiboot.h> +#include <fsp.h> +#include <lock.h> +#include <device.h> +#include <spcn.h> +#include <opal-api.h> +#include <opal-msg.h> +#include <errorlog.h> +#include <sensor.h> + +#define INVALID_DATA ((uint32_t)-1) + +/* Entry size of PRS command modifiers */ +#define PRS_STATUS_ENTRY_SZ 0x08 +#define SENSOR_PARAM_ENTRY_SZ 0x10 +#define SENSOR_DATA_ENTRY_SZ 0x08 +#define PROC_JUNC_ENTRY_SZ 0x04 + +DEFINE_LOG_ENTRY(OPAL_RC_SENSOR_INIT, OPAL_PLATFORM_ERR_EVT, OPAL_SENSOR, + OPAL_MISC_SUBSYSTEM, + OPAL_PREDICTIVE_ERR_FAULT_RECTIFY_REBOOT, + OPAL_NA); + +DEFINE_LOG_ENTRY(OPAL_RC_SENSOR_READ, OPAL_PLATFORM_ERR_EVT, OPAL_SENSOR, + OPAL_MISC_SUBSYSTEM, OPAL_INFO, + OPAL_NA); + +DEFINE_LOG_ENTRY(OPAL_RC_SENSOR_ASYNC_COMPLETE, OPAL_PLATFORM_ERR_EVT, + OPAL_SENSOR, OPAL_MISC_SUBSYSTEM, OPAL_INFO, + OPAL_NA); + +/* FSP response status codes */ +enum { + SP_RSP_STATUS_VALID_DATA = 0x00, + SP_RSP_STATUS_INVALID_DATA = 0x22, + SP_RSP_STATUS_SPCN_ERR = 0xA8, + SP_RSP_STATUS_DMA_ERR = 0x24, +}; + +enum sensor_state { + SENSOR_VALID_DATA, + SENSOR_INVALID_DATA, + SENSOR_SPCN_ERROR, + SENSOR_DMA_ERROR, + SENSOR_PERMANENT_ERROR, + SENSOR_OPAL_ERROR, +}; + +enum spcn_attr { + SENSOR_STATUS, + SENSOR_THRS, + SENSOR_DATA, + SENSOR_MAX, +}; + +/* Parsed sensor attributes, passed through OPAL */ +struct opal_sensor_data { + uint64_t async_token; /* Asynchronous token */ + __be64 *sensor_data; /* Kernel pointer to copy data */ + enum spcn_attr spcn_attr; /* Modifier attribute */ + uint16_t rid; /* Sensor RID */ + uint8_t frc; /* Sensor resource class */ + uint32_t mod_index; /* Modifier index*/ + uint32_t offset; /* Offset in sensor buffer */ +}; + +struct spcn_mod { + uint8_t mod; /* Modifier code */ + uint8_t entry_size; /* Size of each entry in response buffer */ + uint16_t entry_count; /* Number of entries */ +}; + +static struct spcn_mod spcn_mod_data[] = { + {SPCN_MOD_PRS_STATUS_FIRST, PRS_STATUS_ENTRY_SZ, 0 }, + {SPCN_MOD_PRS_STATUS_SUBS, PRS_STATUS_ENTRY_SZ, 0 }, + {SPCN_MOD_SENSOR_PARAM_FIRST, SENSOR_PARAM_ENTRY_SZ, 0 }, + {SPCN_MOD_SENSOR_PARAM_SUBS, SENSOR_PARAM_ENTRY_SZ, 0 }, + {SPCN_MOD_SENSOR_DATA_FIRST, SENSOR_DATA_ENTRY_SZ, 0 }, + {SPCN_MOD_SENSOR_DATA_SUBS, SENSOR_DATA_ENTRY_SZ, 0 }, + /* TODO Support this modifier '0x14', if required */ + /* {SPCN_MOD_PROC_JUNC_TEMP, PROC_JUNC_ENTRY_SZ, 0, NULL}, */ + {SPCN_MOD_SENSOR_POWER, SENSOR_DATA_ENTRY_SZ, 0 }, + {SPCN_MOD_LAST, 0xff, 0xffff} +}; + +/* Frame resource class (FRC) names */ +static const char *frc_names[] = { + /* 0x00 and 0x01 are reserved */ + NULL, + NULL, + "power-controller", + "power", + "regulator", + "cooling-fan", + "cooling-controller", + "battery-charger", + "battery-pack", + "amb-temp", + "temp", + "vrm", + "riser-card", + "io-backplane" +}; + +#define SENSOR_MAX_SIZE 0x00100000 +static void *sensor_buffer = NULL; +static enum sensor_state sensor_state; +static bool prev_msg_consumed = true; +static struct lock sensor_lock; + +/* Function prototypes */ +static int64_t fsp_sensor_send_read_request(struct opal_sensor_data *attr); +static void queue_msg_for_delivery(int rc, struct opal_sensor_data *attr); + + +/* + * Power Resource Status (PRS) + * Command: 0x42 + * + * Modifier: 0x01 + * -------------------------------------------------------------------------- + * | 0 1 2 3 4 5 6 7 | + * -------------------------------------------------------------------------- + * |Frame resrc class| PRID | SRC | Status | + * -------------------------------------------------------------------------- + * + * + * Modifier: 0x10 + 
* -------------------------------------------------------------------------- + * | 0 1 2 3 4 5 6 7 | + * -------------------------------------------------------------------------- + * |Frame resrc class| PRID | Sensor location | + * -------------------------------------------------------------------------- + * -------------------------------------------------------------------------- + * | 8 9 10 11 12 13 14 15 | + * -------------------------------------------------------------------------- + * | Reserved | Reserved | Threshold | Status | + * -------------------------------------------------------------------------- + * + * + * Modifier: 0x12 + * -------------------------------------------------------------------------- + * | 0 1 2 3 4 5 6 7 | + * -------------------------------------------------------------------------- + * |Frame resrc class| PRID | Sensor data | Status | + * -------------------------------------------------------------------------- + * + * + * Modifier: 0x14 + * -------------------------------------------------------------------------- + * | 0 1 2 3 | + * -------------------------------------------------------------------------- + * |Enclosure Tj Avg | Chip Tj Avg | Reserved | Reserved | + * -------------------------------------------------------------------------- + */ + + +/* + * When coming from a SENSOR_POWER modifier command, the resource id + * of a power supply is on one byte and misses a "subclass" byte + * (0x10). This routine adds it to be consistent with the PRS_STATUS + * modifier command. + */ +#define normalize_power_rid(rid) (0x1000|(rid)) + +static uint32_t sensor_power_process_data(uint16_t rid, + struct sensor_power *power) +{ + int i; + + if (!sensor_power_is_valid(power)) { + prlog(PR_TRACE, "Power Sensor data not valid\n"); + return INVALID_DATA; + } + + for (i = 0; i < sensor_power_count(power); i++) { + prlog(PR_TRACE, "Power[%d]: %d mW\n", i, + power->supplies[i].milliwatts); + if (rid == normalize_power_rid(power->supplies[i].rid)) + return be32_to_cpu(power->supplies[i].milliwatts) / 1000; + } + + return 0; +} + +static inline uint16_t convert_status_to_fault(uint16_t status) +{ + return status & 0x06; +} + +static void fsp_sensor_process_data(struct opal_sensor_data *attr) +{ + uint8_t *sensor_buf_ptr = (uint8_t *)sensor_buffer; + uint32_t sensor_data = INVALID_DATA; + __be16 sensor_mod_data[8]; + int count; + + for (count = 0; count < spcn_mod_data[attr->mod_index].entry_count; + count++) { + memcpy((void *)sensor_mod_data, sensor_buf_ptr, + spcn_mod_data[attr->mod_index].entry_size); + if (spcn_mod_data[attr->mod_index].mod == SPCN_MOD_PROC_JUNC_TEMP) { + /* TODO Support this modifier '0x14', if required */ + + } else if (spcn_mod_data[attr->mod_index].mod == SPCN_MOD_SENSOR_POWER) { + sensor_data = sensor_power_process_data(attr->rid, + (struct sensor_power *) sensor_buf_ptr); + break; + } else if (be16_to_cpu(sensor_mod_data[0]) == attr->frc && + be16_to_cpu(sensor_mod_data[1]) == attr->rid) { + switch (attr->spcn_attr) { + case SENSOR_STATUS: + sensor_data = + convert_status_to_fault(be16_to_cpu(sensor_mod_data[3])); + break; + case SENSOR_THRS: + sensor_data = be16_to_cpu(sensor_mod_data[6]); + break; + case SENSOR_DATA: + sensor_data = be16_to_cpu(sensor_mod_data[2]); + break; + default: + break; + } + + break; + } + + sensor_buf_ptr += spcn_mod_data[attr->mod_index].entry_size; + } + + *attr->sensor_data = cpu_to_be64(sensor_data); + if (sensor_data == INVALID_DATA) + queue_msg_for_delivery(OPAL_PARTIAL, attr); + else + 
queue_msg_for_delivery(OPAL_SUCCESS, attr); +} + +static int fsp_sensor_process_read(struct fsp_msg *resp_msg) +{ + uint8_t mbx_rsp_status; + uint32_t size = 0; + + mbx_rsp_status = (resp_msg->word1 >> 8) & 0xff; + switch (mbx_rsp_status) { + case SP_RSP_STATUS_VALID_DATA: + sensor_state = SENSOR_VALID_DATA; + size = fsp_msg_get_data_word(resp_msg, 1) & 0xffff; + break; + case SP_RSP_STATUS_INVALID_DATA: + log_simple_error(&e_info(OPAL_RC_SENSOR_READ), + "SENSOR: %s: Received invalid data\n", __func__); + sensor_state = SENSOR_INVALID_DATA; + break; + case SP_RSP_STATUS_SPCN_ERR: + log_simple_error(&e_info(OPAL_RC_SENSOR_READ), + "SENSOR: %s: Failure due to SPCN error\n", __func__); + sensor_state = SENSOR_SPCN_ERROR; + break; + case SP_RSP_STATUS_DMA_ERR: + log_simple_error(&e_info(OPAL_RC_SENSOR_READ), + "SENSOR: %s: Failure due to DMA error\n", __func__); + sensor_state = SENSOR_DMA_ERROR; + break; + default: + log_simple_error(&e_info(OPAL_RC_SENSOR_READ), + "SENSOR %s: Read failed, status:0x%02X\n", + __func__, mbx_rsp_status); + sensor_state = SENSOR_INVALID_DATA; + break; + } + + return size; +} + +static void queue_msg_for_delivery(int rc, struct opal_sensor_data *attr) +{ + prlog(PR_INSANE, "%s: rc:%d, data:%lld\n", + __func__, rc, *(attr->sensor_data)); + check_sensor_read(attr->async_token); + opal_queue_msg(OPAL_MSG_ASYNC_COMP, NULL, NULL, + cpu_to_be64(attr->async_token), + cpu_to_be64(rc)); + spcn_mod_data[attr->mod_index].entry_count = 0; + free(attr); + prev_msg_consumed = true; +} + +static void fsp_sensor_read_complete(struct fsp_msg *msg) +{ + struct opal_sensor_data *attr = msg->user_data; + enum spcn_rsp_status status; + int rc, size; + + prlog(PR_INSANE, "%s()\n", __func__); + + status = (fsp_msg_get_data_word(msg->resp, 1) >> 24) & 0xff; + size = fsp_sensor_process_read(msg->resp); + fsp_freemsg(msg); + + lock(&sensor_lock); + if (sensor_state == SENSOR_VALID_DATA) { + spcn_mod_data[attr->mod_index].entry_count += (size / + spcn_mod_data[attr->mod_index].entry_size); + attr->offset += size; + /* Fetch the subsequent entries of the same modifier type */ + if (status == SPCN_RSP_STATUS_COND_SUCCESS) { + switch (spcn_mod_data[attr->mod_index].mod) { + case SPCN_MOD_PRS_STATUS_FIRST: + case SPCN_MOD_SENSOR_PARAM_FIRST: + case SPCN_MOD_SENSOR_DATA_FIRST: + attr->mod_index++; + spcn_mod_data[attr->mod_index].entry_count = + spcn_mod_data[attr->mod_index - 1]. 
+ entry_count; + spcn_mod_data[attr->mod_index - 1].entry_count = 0; + break; + default: + break; + } + + rc = fsp_sensor_send_read_request(attr); + if (rc != OPAL_ASYNC_COMPLETION) + goto err; + } else { /* Notify 'powernv' of read completion */ + fsp_sensor_process_data(attr); + } + } else { + rc = OPAL_INTERNAL_ERROR; + goto err; + } + unlock(&sensor_lock); + return; +err: + *attr->sensor_data = cpu_to_be64(INVALID_DATA); + queue_msg_for_delivery(rc, attr); + unlock(&sensor_lock); + log_simple_error(&e_info(OPAL_RC_SENSOR_ASYNC_COMPLETE), + "SENSOR: %s: Failed to queue the " + "read request to fsp\n", __func__); +} + +static int64_t fsp_sensor_send_read_request(struct opal_sensor_data *attr) +{ + int rc; + struct fsp_msg *msg; + uint32_t align; + uint32_t cmd_header; + + if (fsp_in_rr()) + return OPAL_BUSY; + + prlog(PR_INSANE, "Get the data for modifier [%x]\n", + spcn_mod_data[attr->mod_index].mod); + + if (spcn_mod_data[attr->mod_index].mod == SPCN_MOD_PROC_JUNC_TEMP) { + /* TODO Support this modifier '0x14', if required */ + align = attr->offset % sizeof(uint32_t); + if (align) + attr->offset += (sizeof(uint32_t) - align); + + /* TODO Add 8 byte command data required for mod 0x14 */ + + attr->offset += 8; + + cmd_header = spcn_mod_data[attr->mod_index].mod << 24 | + SPCN_CMD_PRS << 16 | 0x0008; + } else { + cmd_header = spcn_mod_data[attr->mod_index].mod << 24 | + SPCN_CMD_PRS << 16; + } + + msg = fsp_mkmsg(FSP_CMD_SPCN_PASSTHRU, 4, + SPCN_ADDR_MODE_CEC_NODE, cmd_header, 0, + PSI_DMA_SENSOR_BUF + attr->offset); + + if (!msg) { + log_simple_error(&e_info(OPAL_RC_SENSOR_READ), "SENSOR: Failed " + "to allocate read message\n"); + return OPAL_INTERNAL_ERROR; + } + + msg->user_data = attr; + rc = fsp_queue_msg(msg, fsp_sensor_read_complete); + if (rc) { + fsp_freemsg(msg); + msg = NULL; + log_simple_error(&e_info(OPAL_RC_SENSOR_READ), "SENSOR: Failed " + "to queue read message (%d)\n", rc); + return OPAL_INTERNAL_ERROR; + } + + return OPAL_ASYNC_COMPLETION; +} + +/* + * These are the resources we know about and for which we provide a + * mapping in the device tree to capture data from the OS. Just + * discard the other ones for the moment. + */ +static inline bool sensor_frc_is_valid(uint16_t frc) +{ + switch (frc) { + case SENSOR_FRC_POWER_SUPPLY: + case SENSOR_FRC_COOLING_FAN: + case SENSOR_FRC_AMB_TEMP: + return true; + default: + return false; + } +} + +/* + * Each attribute of a resource needs a request to the FSP to capture + * its data. The routine below provides the mapping between the + * attribute and the PRS command modifier to use. 
+ * + * resource | data | thrs | status | + * ----------------+--------+--------+-----------+ + * power_supply | POWER | | | + * | | | PRS | + * ----------------+--------+--------+-----------+ + * amb-temp | DATA | | DATA | + * | | PARAM | PARAM (*) | + * ----------------+--------+--------+-----------+ + * fan | DATA | | DATA (*) | + * | | PARAM | PARAM (*) | + * | | | PRS | + * + * (*) don't use the attribute given by this command modifier + */ +static int64_t parse_sensor_id(uint32_t handler, struct opal_sensor_data *attr) +{ + uint32_t mod, index; + + attr->frc = sensor_get_frc(handler); + attr->rid = sensor_get_rid(handler); + attr->spcn_attr = sensor_get_attr(handler); + + if (!sensor_frc_is_valid(attr->frc)) + return OPAL_PARAMETER; + + /* now compute the PRS command modifier which will be used to + * request a resource attribute from the FSP */ + switch (attr->spcn_attr) { + case SENSOR_DATA: + if (attr->frc == SENSOR_FRC_POWER_SUPPLY) + mod = SPCN_MOD_SENSOR_POWER; + else + mod = SPCN_MOD_SENSOR_DATA_FIRST; + break; + + case SENSOR_THRS: + mod = SPCN_MOD_SENSOR_PARAM_FIRST; + break; + + case SENSOR_STATUS: + switch (attr->frc) { + case SENSOR_FRC_AMB_TEMP: + mod = SPCN_MOD_SENSOR_DATA_FIRST; + break; + case SENSOR_FRC_POWER_SUPPLY: + case SENSOR_FRC_COOLING_FAN: + mod = SPCN_MOD_PRS_STATUS_FIRST; + break; + default: + return OPAL_PARAMETER; + } + break; + + default: + return OPAL_PARAMETER; + } + + for (index = 0; spcn_mod_data[index].mod != SPCN_MOD_LAST; index++) { + if (spcn_mod_data[index].mod == mod) + break; + } + + attr->mod_index = index; + return 0; +} + + +int64_t fsp_opal_read_sensor(uint32_t sensor_hndl, int token, + __be64 *sensor_data) +{ + struct opal_sensor_data *attr; + int64_t rc; + + prlog(PR_INSANE, "fsp_opal_read_sensor [%08x]\n", sensor_hndl); + + if (fsp_in_rr()) + return OPAL_BUSY; + + if (sensor_state == SENSOR_PERMANENT_ERROR) { + rc = OPAL_HARDWARE; + goto out; + } + + if (!sensor_hndl) { + rc = OPAL_PARAMETER; + goto out; + } + + lock(&sensor_lock); + if (prev_msg_consumed) { + attr = zalloc(sizeof(*attr)); + if (!attr) { + log_simple_error(&e_info(OPAL_RC_SENSOR_READ), + "SENSOR: Failed to allocate memory\n"); + rc = OPAL_NO_MEM; + goto out_lock; + } + + /* Parse the sensor id and store them to the local structure */ + rc = parse_sensor_id(sensor_hndl, attr); + if (rc) { + log_simple_error(&e_info(OPAL_RC_SENSOR_READ), + "SENSOR: %s: Failed to parse the sensor " + "handle[0x%08x]\n", __func__, sensor_hndl); + goto out_free; + } + /* Kernel buffer pointer to copy the data later when ready */ + attr->sensor_data = sensor_data; + attr->async_token = token; + + rc = fsp_sensor_send_read_request(attr); + if (rc != OPAL_ASYNC_COMPLETION) { + log_simple_error(&e_info(OPAL_RC_SENSOR_READ), + "SENSOR: %s: Failed to queue the read " + "request to fsp\n", __func__); + goto out_free; + } + + prev_msg_consumed = false; + } else { + rc = OPAL_BUSY_EVENT; + } + + unlock(&sensor_lock); + return rc; + +out_free: + free(attr); +out_lock: + unlock(&sensor_lock); +out: + return rc; +} + + +#define MAX_NAME 64 + +static struct dt_node *sensor_get_node(struct dt_node *sensors, + struct sensor_header *header, const char* attrname) +{ + char name[MAX_NAME]; + struct dt_node *node; + + /* + * Just use the resource class name and resource id. This + * should be obvious enough for a node name. 
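As a worked example of the table above (the resource id is made up): the handle an OS would pass to fsp_opal_read_sensor() for the data attribute of an ambient temperature sensor can be built with the same helper the device-tree code below uses, and parse_sensor_id() would then select SPCN_MOD_SENSOR_DATA_FIRST for it.

// Illustrative handle: amb-temp resource 3, "data" attribute
static uint32_t example_amb_temp_handle(void)
{
        return sensor_make_handler(SENSOR_FSP, SENSOR_FRC_AMB_TEMP,
                                   3 /* rid, illustrative */, SENSOR_DATA);
}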
+ */ + snprintf(name, sizeof(name), "%s#%d-%s", frc_names[be16_to_cpu(header->frc)], be16_to_cpu(header->rid), attrname); + + /* + * The same resources are reported by the different PRS + * subcommands (PRS_STATUS, SENSOR_PARAM, SENSOR_DATA). So we + * need to check that we did not already create the device + * node. + */ + node = dt_find_by_path(sensors, name); + if (!node) { + prlog(PR_INFO, "SENSOR: creating node %s\n", name); + + node = dt_new(sensors, name); + + snprintf(name, sizeof(name), "ibm,opal-sensor-%s", + frc_names[be16_to_cpu(header->frc)]); + dt_add_property_string(node, "compatible", name); + } else { + /** + * @fwts-label OPALSensorNodeExists + * @fwts-advice OPAL had trouble creating the sensor + * nodes in the device tree as there was already one there. + * This indicates either the device tree from Hostboot + * already filled in sensors or an OPAL bug. + */ + prlog(PR_ERR, "SENSOR: node %s exists\n", name); + } + return node; +} + +#define sensor_handler(header, attr_num) \ + sensor_make_handler(SENSOR_FSP, be16_to_cpu((header).frc), be16_to_cpu((header).rid), attr_num) + +static int add_sensor_prs(struct dt_node *sensors, struct sensor_prs *prs) +{ + struct dt_node *node; + + node = sensor_get_node(sensors, &prs->header, "faulted"); + if (!node) + return -1; + + dt_add_property_cells(node, "sensor-id", + sensor_handler(prs->header, SENSOR_STATUS)); + return 0; +} + +static int add_sensor_param(struct dt_node *sensors, struct sensor_param *param) +{ + struct dt_node *node; + + node = sensor_get_node(sensors, ¶m->header, "thrs"); + if (!node) + return -1; + + dt_add_property_string(node, "ibm,loc-code", param->location); + dt_add_property_cells(node, "sensor-id", + sensor_handler(param->header, SENSOR_THRS)); + /* don't use the status coming from the response of the + * SENSOR_PARAM subcommand */ + return 0; +} + +static int add_sensor_data(struct dt_node *sensors, + struct sensor_data *data) +{ + struct dt_node *node; + + node = sensor_get_node(sensors, &data->header, "data"); + if (!node) + return -1; + + dt_add_property_cells(node, "sensor-id", + sensor_handler(data->header, SENSOR_DATA)); + + /* Let's make sure we are not adding a duplicate device node. + * Some resource, like fans, get their status attribute from + * three different commands ... + */ + if (be16_to_cpu(data->header.frc) == SENSOR_FRC_AMB_TEMP) { + node = sensor_get_node(sensors, &data->header, "faulted"); + if (!node) + return -1; + + dt_add_property_cells(node, "sensor-id", + sensor_handler(data->header, SENSOR_STATUS)); + } + + return 0; +} + +static int add_sensor_power(struct dt_node *sensors, struct sensor_power *power) +{ + int i; + struct dt_node *node; + + if (!sensor_power_is_valid(power)) + return -1; + + for (i = 0; i < sensor_power_count(power); i++) { + struct sensor_header header = { + cpu_to_be16(SENSOR_FRC_POWER_SUPPLY), + cpu_to_be16(normalize_power_rid(power->supplies[i].rid)) + }; + + node = sensor_get_node(sensors, &header, "data"); + + prlog(PR_TRACE, "SENSOR: Power[%d] : %d mW\n", + power->supplies[i].rid, + be32_to_cpu(power->supplies[i].milliwatts)); + + dt_add_property_cells(node, "sensor-id", + sensor_handler(header, SENSOR_DATA)); + } + return 0; +} + +static void add_sensor_ids(struct dt_node *sensors) +{ + uint8_t *sensor_buf_ptr = (uint8_t *)sensor_buffer; + struct spcn_mod *smod; + int i; + + for (smod = spcn_mod_data; smod->mod != SPCN_MOD_LAST; smod++) { + /* + * SPCN_MOD_SENSOR_POWER (0x1C) has a different layout. 
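A standalone illustration of the node naming used by sensor_get_node() above, runnable on its own; the class name and resource id are examples.

#include <stdio.h>

// "<frc-name>#<rid>-<attribute>" plus a matching compatible string,
// e.g. a cooling fan with resource id 2 exposing its data attribute.
int main(void)
{
        char name[64], compat[64];
        const char *frc_name = "cooling-fan";   // one of the frc_names[] entries
        int rid = 2;                            // illustrative resource id

        snprintf(name, sizeof(name), "%s#%d-%s", frc_name, rid, "data");
        snprintf(compat, sizeof(compat), "ibm,opal-sensor-%s", frc_name);
        printf("%s (compatible \"%s\")\n", name, compat);
        return 0;
}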
+ */ + if (smod->mod == SPCN_MOD_SENSOR_POWER) { + add_sensor_power(sensors, + (struct sensor_power *) sensor_buf_ptr); + + sensor_buf_ptr += smod->entry_size * smod->entry_count; + continue; + } + + for (i = 0; i < smod->entry_count; i++) { + struct sensor_header *header = + (struct sensor_header *) sensor_buf_ptr; + + if (!sensor_frc_is_valid(be16_to_cpu(header->frc))) + goto out_sensor; + + switch (smod->mod) { + case SPCN_MOD_PROC_JUNC_TEMP: + /* TODO Support this modifier '0x14', + if required */ + break; + + case SPCN_MOD_PRS_STATUS_FIRST: + case SPCN_MOD_PRS_STATUS_SUBS: + add_sensor_prs(sensors, + (struct sensor_prs *) header); + break; + + case SPCN_MOD_SENSOR_PARAM_FIRST: + case SPCN_MOD_SENSOR_PARAM_SUBS: + add_sensor_param(sensors, + (struct sensor_param *) header); + break; + + case SPCN_MOD_SENSOR_DATA_FIRST: + case SPCN_MOD_SENSOR_DATA_SUBS: + add_sensor_data(sensors, + (struct sensor_data *) header); + + break; + + default: + prerror("SENSOR: unknown modifier : %x\n", + smod->mod); + } + +out_sensor: + sensor_buf_ptr += smod->entry_size; + } + } +} + +static void add_opal_sensor_node(void) +{ + int index; + + if (!fsp_present()) + return; + + add_sensor_ids(sensor_node); + + /* Reset the entry count of each modifier */ + for (index = 0; spcn_mod_data[index].mod != SPCN_MOD_LAST; + index++) + spcn_mod_data[index].entry_count = 0; +} + +void fsp_init_sensor(void) +{ + uint32_t cmd_header, align, size, psi_dma_offset = 0; + enum spcn_rsp_status status; + struct fsp_msg msg, resp; + int index, rc; + + if (!fsp_present()) { + sensor_state = SENSOR_PERMANENT_ERROR; + return; + } + + sensor_buffer = memalign(TCE_PSIZE, SENSOR_MAX_SIZE); + if (!sensor_buffer) { + log_simple_error(&e_info(OPAL_RC_SENSOR_INIT), "SENSOR: could " + "not allocate sensor_buffer!\n"); + return; + } + + /* Map TCE */ + fsp_tce_map(PSI_DMA_SENSOR_BUF, sensor_buffer, PSI_DMA_SENSOR_BUF_SZ); + + msg.resp = &resp; + + /* Traverse using all the modifiers to know all the sensors available + * in the system */ + for (index = 0; spcn_mod_data[index].mod != SPCN_MOD_LAST && + sensor_state == SENSOR_VALID_DATA;) { + prlog(PR_TRACE, "Get the data for modifier [%d]\n", + spcn_mod_data[index].mod); + if (spcn_mod_data[index].mod == SPCN_MOD_PROC_JUNC_TEMP) { + /* TODO Support this modifier 0x14, if required */ + align = psi_dma_offset % sizeof(uint32_t); + if (align) + psi_dma_offset += (sizeof(uint32_t) - align); + + /* TODO Add 8 byte command data required for mod 0x14 */ + psi_dma_offset += 8; + + cmd_header = spcn_mod_data[index].mod << 24 | + SPCN_CMD_PRS << 16 | 0x0008; + } else { + cmd_header = spcn_mod_data[index].mod << 24 | + SPCN_CMD_PRS << 16; + } + + fsp_fillmsg(&msg, FSP_CMD_SPCN_PASSTHRU, 4, + SPCN_ADDR_MODE_CEC_NODE, cmd_header, 0, + PSI_DMA_SENSOR_BUF + psi_dma_offset); + + rc = fsp_sync_msg(&msg, false); + if (rc >= 0) { + status = (fsp_msg_get_data_word(&resp, 1) >> 24) & 0xff; + size = fsp_sensor_process_read(&resp); + psi_dma_offset += size; + spcn_mod_data[index].entry_count += (size / + spcn_mod_data[index].entry_size); + } else { + sensor_state = SENSOR_PERMANENT_ERROR; + break; + } + + switch (spcn_mod_data[index].mod) { + case SPCN_MOD_PRS_STATUS_FIRST: + case SPCN_MOD_SENSOR_PARAM_FIRST: + case SPCN_MOD_SENSOR_DATA_FIRST: + if (status == SPCN_RSP_STATUS_COND_SUCCESS) + index++; + else + index += 2; + + break; + case SPCN_MOD_PRS_STATUS_SUBS: + case SPCN_MOD_SENSOR_PARAM_SUBS: + case SPCN_MOD_SENSOR_DATA_SUBS: + if (status != SPCN_RSP_STATUS_COND_SUCCESS) + index++; + break; + case 
SPCN_MOD_SENSOR_POWER: + index++; + default: + break; + } + } + + if (sensor_state != SENSOR_VALID_DATA) + sensor_state = SENSOR_PERMANENT_ERROR; + else + add_opal_sensor_node(); +} diff --git a/roms/skiboot/hw/fsp/fsp-surveillance.c b/roms/skiboot/hw/fsp/fsp-surveillance.c new file mode 100644 index 000000000..84e6878f3 --- /dev/null +++ b/roms/skiboot/hw/fsp/fsp-surveillance.c @@ -0,0 +1,226 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* + * We don't want to go on the cart! + * + * Copyright 2013-2018 IBM Corp. + */ + +#include <skiboot.h> +#include <fsp.h> +#include <lock.h> +#include <processor.h> +#include <timebase.h> +#include <fsp-sysparam.h> +#include <errorlog.h> +#include <opal-api.h> + +static bool fsp_surv_state = false; +static bool fsp_surv_ack_pending = false; +static u64 surv_timer; +static u64 surv_ack_timer; +static u32 surv_state_param; +static struct lock surv_lock = LOCK_UNLOCKED; + +#define FSP_SURV_ACK_TIMEOUT 120 /* surv ack timeout in seconds */ + +DEFINE_LOG_ENTRY(OPAL_RC_SURVE_INIT, OPAL_MISC_ERR_EVT, OPAL_SURVEILLANCE, + OPAL_SURVEILLANCE_ERR, OPAL_PREDICTIVE_ERR_GENERAL, + OPAL_MISCELLANEOUS_INFO_ONLY); + +DEFINE_LOG_ENTRY(OPAL_RC_SURVE_STATUS, OPAL_MISC_ERR_EVT, OPAL_SURVEILLANCE, + OPAL_SURVEILLANCE_ERR, OPAL_PREDICTIVE_ERR_GENERAL, + OPAL_MISCELLANEOUS_INFO_ONLY); + +DEFINE_LOG_ENTRY(OPAL_RC_SURVE_ACK, OPAL_MISC_ERR_EVT, OPAL_SURVEILLANCE, + OPAL_SURVEILLANCE_ERR, OPAL_PREDICTIVE_ERR_GENERAL, + OPAL_MISCELLANEOUS_INFO_ONLY); + +static void fsp_surv_ack(struct fsp_msg *msg) +{ + uint8_t val; + + if (!msg->resp) + return; + + val = (msg->resp->word1 >> 8) & 0xff; + if (val == 0) { + /* reset the pending flag */ + prlog(PR_TRACE, + "SURV: Received heartbeat acknowledge from FSP\n"); + lock(&surv_lock); + fsp_surv_ack_pending = false; + unlock(&surv_lock); + } else { + /** + * @fwts-label FSPHeartbeatAckError + * @fwts-advice Error in acknowledging heartbeat to FSP. + * This could mean the FSP has gone away or it may mean + * the FSP may kill us for missing too many heartbeats. + */ + prlog(PR_ERR, + "SURV: Heartbeat Acknowledgment error from FSP\n"); + } + + fsp_freemsg(msg); +} + +static void fsp_surv_check_timeout(void) +{ + u64 now = mftb(); + + /* + * We just checked fsp_surv_ack_pending to be true in fsp_surv_hbeat + * and we haven't dropped the surv_lock between then and now. So, we + * just go ahead and check timeouts. + */ + if (tb_compare(now, surv_ack_timer) == TB_AAFTERB) { + uint32_t plid = log_simple_error(&e_info(OPAL_RC_SURVE_ACK), + "SURV: Surv ACK timed out; initiating R/R\n"); + + /* Reset the pending trigger too */ + fsp_surv_ack_pending = false; + fsp_trigger_reset(plid); + } + + return; +} + +/* Send surveillance heartbeat based on a timebase trigger */ +static void fsp_surv_hbeat(void) +{ + u64 now = mftb(); + struct fsp_msg *msg; + + /* Check if an ack is pending... if so, don't send the ping just yet */ + if (fsp_surv_ack_pending) { + fsp_surv_check_timeout(); + return; + } + + /* add timebase callbacks */ + /* + * XXX This packet needs to be pushed to FSP in an interval + * less than 120s that's advertised to FSP. + * + * Verify if the command building format and call is fine. 
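A minimal sketch of the timebase arithmetic used by the heartbeat below, assuming skiboot's mftb()/secs_to_tb()/tb_compare() helpers: the ping is re-armed every 60 seconds while a 120 second deadline (FSP_SURV_ACK_TIMEOUT) is advertised to, and enforced against, the FSP.

#include <stdbool.h>

static u64 next_send, ack_deadline;

// Arm both timers from the current timebase value
static void surv_arm_timers(void)
{
        u64 now = mftb();

        next_send    = now + secs_to_tb(60);
        ack_deadline = now + secs_to_tb(FSP_SURV_ACK_TIMEOUT);
}

// True once the 60 second resend point has been reached or passed
static bool surv_send_due(void)
{
        int cmp = tb_compare(mftb(), next_send);

        return cmp == TB_AAFTERB || cmp == TB_AEQUALB;
}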
+ */ + if (surv_timer == 0 || + (tb_compare(now, surv_timer) == TB_AAFTERB) || + (tb_compare(now, surv_timer) == TB_AEQUALB)) { + prlog(PR_TRACE, + "SURV: Sending the heartbeat command to FSP\n"); + msg = fsp_mkmsg(FSP_CMD_SURV_HBEAT, 1, 120); + if (!msg) { + prerror("SURV: Failed to allocate heartbeat msg\n"); + return; + } + if (fsp_queue_msg(msg, fsp_surv_ack)) { + fsp_freemsg(msg); + prerror("SURV: Failed to queue heartbeat msg\n"); + } else { + fsp_surv_ack_pending = true; + surv_timer = now + secs_to_tb(60); + surv_ack_timer = now + secs_to_tb(FSP_SURV_ACK_TIMEOUT); + } + } +} + +static void fsp_surv_poll(void *data __unused) +{ + if (!fsp_surv_state) + return; + lock(&surv_lock); + fsp_surv_hbeat(); + unlock(&surv_lock); +} + +static void fsp_surv_got_param(uint32_t param_id __unused, int err_len, + void *data __unused) +{ + if (err_len != 4) { + uint32_t plid = log_simple_error(&e_info(OPAL_RC_SURVE_STATUS), + "SURV: Error (%d) retrieving surv status; initiating R/R\n", + err_len); + fsp_trigger_reset(plid); + return; + } + + surv_state_param = be32_to_cpu((__be32)surv_state_param); + if (!(surv_state_param & 0x01)) { + prlog(PR_NOTICE, "SURV: Status from FSP: disabled\n"); + return; + } + prlog(PR_NOTICE, "SURV: Status from FSP: enabled\n"); + + lock(&surv_lock); + fsp_surv_state = true; + + /* Also send one heartbeat now. The next one will not happen + * until we hit the OS. + */ + fsp_surv_hbeat(); + unlock(&surv_lock); +} + +void fsp_surv_query(void) +{ + int rc; + + printf("SURV: Querying FSP's surveillance status\n"); + + /* Reset surveillance settings */ + lock(&surv_lock); + fsp_surv_state = false; + surv_timer = 0; + surv_ack_timer = 0; + unlock(&surv_lock); + + /* Query FPS for surveillance state */ + rc = fsp_get_sys_param(SYS_PARAM_SURV, &surv_state_param, 4, + fsp_surv_got_param, NULL); + if (rc) { + log_simple_error(&e_info(OPAL_RC_SURVE_INIT), + "SURV: Error %d queueing param request\n", rc); + } +} + +static bool fsp_surv_msg_rr(u32 cmd_sub_mod, struct fsp_msg *msg) +{ + assert(msg == NULL); + + switch (cmd_sub_mod) { + case FSP_RESET_START: + printf("SURV: Disabling surveillance\n"); + lock(&surv_lock); + fsp_surv_state = false; + fsp_surv_ack_pending = false; + unlock(&surv_lock); + return true; + case FSP_RELOAD_COMPLETE: + fsp_surv_query(); + return true; + } + return false; +} + +static struct fsp_client fsp_surv_client_rr = { + .message = fsp_surv_msg_rr, +}; + +/* This is called at boot time */ +void fsp_init_surveillance(void) +{ + /* Always register the poller, so we don't have to add/remove + * it on reset-reload or change of surveillance state. Also the + * poller list has no locking so we don't want to play with it + * at runtime. + */ + opal_add_poller(fsp_surv_poll, NULL); + + /* Register for the reset/reload event */ + fsp_register_client(&fsp_surv_client_rr, FSP_MCLASS_RR_EVENT); + + /* Send query to FSP */ + fsp_surv_query(); +} + diff --git a/roms/skiboot/hw/fsp/fsp-sysdump.c b/roms/skiboot/hw/fsp/fsp-sysdump.c new file mode 100644 index 000000000..cd8744062 --- /dev/null +++ b/roms/skiboot/hw/fsp/fsp-sysdump.c @@ -0,0 +1,407 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* + * Sapphire dump design: + * - During initialization we setup Memory Dump Source Table (MDST) table + * which contains address, size pair. + * - We send MDST table update notification to FSP via MBOX command. + * - During Sapphire checkstop: + * - FSP retrieves HWDUMP. + * - FSP retrieves CEC memory based on MDST table. 
+ * - Once Sapphire reboot FSP sends new dump avialable notification via HDAT + * + * Copyright 2013-2016 IBM Corp. + */ + +#include <fsp.h> +#include <psi.h> +#include <opal.h> +#include <lock.h> +#include <skiboot.h> +#include <errorlog.h> +#include <opal-dump.h> + +/* + * Sapphire dump size + * This is the maximum memory that FSP can retrieve during checkstop. + * + * Note: + * Presently we are hardcoding this parameter. Eventually we need + * new System parameter so that we can get max size dynamically. + */ +#define MAX_SAPPHIRE_DUMP_SIZE 0x1000000 + +DEFINE_LOG_ENTRY(OPAL_RC_DUMP_MDST_INIT, OPAL_PLATFORM_ERR_EVT, OPAL_DUMP, + OPAL_PLATFORM_FIRMWARE, OPAL_PREDICTIVE_ERR_FAULT_RECTIFY_REBOOT, + OPAL_NA); + +DEFINE_LOG_ENTRY(OPAL_RC_DUMP_MDST_UPDATE, OPAL_PLATFORM_ERR_EVT, OPAL_DUMP, + OPAL_PLATFORM_FIRMWARE, + OPAL_PREDICTIVE_ERR_FAULT_RECTIFY_REBOOT, + OPAL_NA); + +DEFINE_LOG_ENTRY(OPAL_RC_DUMP_MDST_ADD, OPAL_PLATFORM_ERR_EVT, OPAL_DUMP, + OPAL_PLATFORM_FIRMWARE, OPAL_INFO, OPAL_NA); + +DEFINE_LOG_ENTRY(OPAL_RC_DUMP_MDST_REMOVE, OPAL_PLATFORM_ERR_EVT, OPAL_DUMP, + OPAL_PLATFORM_FIRMWARE, OPAL_INFO, OPAL_NA); + + +static struct mdst_table *mdst_table; +static struct mdst_table *dump_mem_region; + +static int cur_mdst_entry; +static int max_mdst_entry; +static int cur_dump_size; +/* + * Presently both sizes are same.. But if someday FSP gives more space + * than our TCE mapping then we need this validation.. + * + * Also once FSP implements MAX_SAPPHIRE_DUMP_SIZE system param, we can + * move this validation to separate function. + */ +static int max_dump_size = MIN(MAX_SAPPHIRE_DUMP_SIZE, PSI_DMA_HYP_DUMP_SIZE); + +/* Protect MDST table entries */ +static struct lock mdst_lock = LOCK_UNLOCKED; + +static inline uint32_t get_dump_region_map_size(uint64_t addr, uint32_t size) +{ + uint64_t start, end; + + start = addr & ~TCE_MASK; + end = addr + size; + end = ALIGN_UP(end, TCE_PSIZE); + + return (end - start); +} + +static int dump_region_tce_map(void) +{ + int i; + uint32_t t_size = 0, size; + uint64_t addr; + + for (i = 0; i < cur_mdst_entry; i++) { + + addr = be64_to_cpu(dump_mem_region[i].addr) & ~TCE_MASK; + size = get_dump_region_map_size(be64_to_cpu(dump_mem_region[i].addr), + be32_to_cpu(dump_mem_region[i].size)); + + if (t_size + size > max_dump_size) + break; + + /* TCE mapping */ + fsp_tce_map(PSI_DMA_HYP_DUMP + t_size, (void *)addr, size); + + /* Add entry to MDST table */ + mdst_table[i].data_region = dump_mem_region[i].data_region; + mdst_table[i].size = dump_mem_region[i].size; + mdst_table[i].addr = cpu_to_be64(PSI_DMA_HYP_DUMP + t_size); + + /* TCE alignment adjustment */ + mdst_table[i].addr = cpu_to_be64(be64_to_cpu(mdst_table[i].addr) + + (be64_to_cpu(dump_mem_region[i].addr) & 0xfff)); + + t_size += size; + } + + return i; +} + +static inline void dump_region_tce_unmap(void) +{ + fsp_tce_unmap(PSI_DMA_HYP_DUMP, PSI_DMA_HYP_DUMP_SIZE); +} + +static void update_mdst_table_complete(struct fsp_msg *msg) +{ + uint8_t status = (msg->resp->word1 >> 8) & 0xff; + + if (status) + log_simple_error(&e_info(OPAL_RC_DUMP_MDST_UPDATE), + "MDST: Update table MBOX command failed: " + "0x%x\n", status); + else + printf("MDST: Table updated.\n"); + + fsp_freemsg(msg); +} + +/* Send MDST table to FSP */ +static int64_t fsp_update_mdst_table(void) +{ + struct fsp_msg *msg; + int count; + int rc = OPAL_SUCCESS; + + if (cur_mdst_entry <= 0) { + printf("MDST: Table is empty\n"); + return OPAL_INTERNAL_ERROR; + } + + lock(&mdst_lock); + + /* Unmap previous mapping */ + 
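
For illustration, a stand-alone version of the page rounding done by get_dump_region_map_size() above, assuming the usual 4KB TCE page size: the region start is rounded down and the end rounded up so the whole buffer is covered by full pages.

#include <stdint.h>
#include <stdio.h>

#define TCE_PSIZE 0x1000ULL            /* assumed 4KB TCE page */
#define TCE_MASK  (TCE_PSIZE - 1)

/* Bytes of TCE window needed to cover [addr, addr + size) */
static uint64_t tce_map_size(uint64_t addr, uint32_t size)
{
	uint64_t start = addr & ~TCE_MASK;                   /* round down */
	uint64_t end = (addr + size + TCE_MASK) & ~TCE_MASK; /* round up */

	return end - start;
}

int main(void)
{
	/* A 0x100-byte region straddling a page boundary needs two pages */
	printf("0x%llx\n", (unsigned long long)tce_map_size(0xFF80, 0x100));
	return 0;
}
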
dump_region_tce_unmap(); + count = dump_region_tce_map(); + + msg = fsp_mkmsg(FSP_CMD_HYP_MDST_TABLE, 4, 0, + PSI_DMA_MDST_TABLE, + sizeof(*mdst_table) * count, + sizeof(*mdst_table)); + unlock(&mdst_lock); + + if (!msg) { + log_simple_error(&e_info(OPAL_RC_DUMP_MDST_UPDATE), + "MDST: Message allocation failed.!\n"); + rc = OPAL_INTERNAL_ERROR; + } else if (fsp_queue_msg(msg, update_mdst_table_complete)) { + log_simple_error(&e_info(OPAL_RC_DUMP_MDST_UPDATE), + "MDST: Failed to queue MDST table message.\n"); + fsp_freemsg(msg); + rc = OPAL_INTERNAL_ERROR; + } + return rc; +} + +static int dump_region_del_entry(uint32_t id) +{ + int i; + uint32_t size; + bool found = false; + int rc = OPAL_SUCCESS; + + lock(&mdst_lock); + + for (i = 0; i < cur_mdst_entry; i++) { + if (dump_mem_region[i].data_region != id) + continue; + + found = true; + break; + } + + if (!found) { + rc = OPAL_PARAMETER; + goto del_out; + } + + /* Adjust current dump size */ + size = get_dump_region_map_size(be64_to_cpu(dump_mem_region[i].addr), + be32_to_cpu(dump_mem_region[i].size)); + cur_dump_size -= size; + + for ( ; i < cur_mdst_entry - 1; i++) + dump_mem_region[i] = dump_mem_region[i + 1]; + + dump_mem_region[i].data_region = 0; + cur_mdst_entry--; + +del_out: + unlock(&mdst_lock); + return rc; +} + +/* Add entry to MDST table */ +static int __dump_region_add_entry(uint32_t id, uint64_t addr, uint32_t size) +{ + int rc = OPAL_INTERNAL_ERROR; + uint32_t act_size; + + /* Delete function takes lock before modifying table */ + dump_region_del_entry(id); + + lock(&mdst_lock); + + if (cur_mdst_entry >= max_mdst_entry) { + log_simple_error(&e_info(OPAL_RC_DUMP_MDST_ADD), + "MDST: Table is full.\n"); + goto out; + } + + /* TCE alignment adjustment */ + act_size = get_dump_region_map_size(addr, size); + + /* Make sure we don't cross dump size limit */ + if (cur_dump_size + act_size > max_dump_size) { + log_simple_error(&e_info(OPAL_RC_DUMP_MDST_ADD), + "MDST: 0x%x is crossing max dump size (0x%x) limit.\n", + cur_dump_size + act_size, max_dump_size); + goto out; + } + + /* Add entry to dump memory region table */ + dump_mem_region[cur_mdst_entry].data_region = (u8)id; + dump_mem_region[cur_mdst_entry].addr = cpu_to_be64(addr); + dump_mem_region[cur_mdst_entry].size = cpu_to_be32(size); + + /* Update dump region count and dump size */ + cur_mdst_entry++; + cur_dump_size += act_size; + + printf("MDST: Addr = 0x%llx [size : 0x%x bytes] added to MDST table.\n", + (uint64_t)addr, size); + + rc = OPAL_SUCCESS; + +out: + unlock(&mdst_lock); + return rc; +} + +static int dump_region_add_entries(void) +{ + int rc; + + /* Add console buffer */ + rc = __dump_region_add_entry(DUMP_REGION_CONSOLE, + INMEM_CON_START, INMEM_CON_LEN); + if (rc) + return rc; + + /* Add HBRT buffer */ + rc = __dump_region_add_entry(DUMP_REGION_HBRT_LOG, + HBRT_CON_START, HBRT_CON_LEN); + + return rc; +} + +static int64_t fsp_opal_register_dump_region(uint32_t id, + uint64_t addr, uint64_t size) +{ + int rc = OPAL_SUCCESS; + + if (!fsp_present()) + return OPAL_UNSUPPORTED; + + /* Validate memory region id */ + if (id < DUMP_REGION_HOST_START || id > DUMP_REGION_HOST_END) { + log_simple_error(&e_info(OPAL_RC_DUMP_MDST_ADD), + "MDST: Invalid dump region id : 0x%x\n", id); + return OPAL_PARAMETER; + } + + if (size <= 0) { + log_simple_error(&e_info(OPAL_RC_DUMP_MDST_ADD), + "MDST: Invalid size : 0x%llx\n", size); + return OPAL_PARAMETER; + } + + rc = __dump_region_add_entry(id, addr, size); + if (rc) + return rc; + + /* Send updated MDST to FSP */ + rc = 
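
A minimal sketch of the running-budget check that __dump_region_add_entry() above applies before accepting a region; the page size and the budget below are made-up values chosen only to keep the example small.

#include <stdint.h>
#include <stdio.h>

#define PAGE          0x1000u          /* assumed 4KB TCE page */
#define MAX_DUMP_SIZE 0x4000u          /* small budget for the example */

static uint32_t cur_size;              /* space accounted for so far */

/* Account for one region; 0 on success, -1 if it would exceed the budget */
static int add_region(uint64_t addr, uint32_t size)
{
	uint64_t start = addr & ~(uint64_t)(PAGE - 1);
	uint64_t end = (addr + size + PAGE - 1) & ~(uint64_t)(PAGE - 1);
	uint32_t act = end - start;        /* page-rounded footprint */

	if (cur_size + act > MAX_DUMP_SIZE)
		return -1;
	cur_size += act;
	return 0;
}

int main(void)
{
	printf("%d %d %d\n", add_region(0x1000, 0x2000),
	       add_region(0x5100, 0x1000),   /* straddles a page: 2 pages */
	       add_region(0x9000, 0x1000));  /* would exceed the budget */
	return 0;
}
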
fsp_update_mdst_table(); + + return rc; +} + +static int64_t fsp_opal_unregister_dump_region(uint32_t id) +{ + int rc = OPAL_SUCCESS; + + if (!fsp_present()) + return OPAL_UNSUPPORTED; + + /* Validate memory region id */ + if (id < DUMP_REGION_HOST_START || id > DUMP_REGION_HOST_END) { + log_simple_error(&e_info(OPAL_RC_DUMP_MDST_REMOVE), + "MDST: Invalid dump region id : 0x%x\n", id); + return OPAL_PARAMETER; + } + + rc = dump_region_del_entry(id); + if (rc) { + log_simple_error(&e_info(OPAL_RC_DUMP_MDST_REMOVE), + "MDST: dump region id : 0x%x not found\n", id); + return OPAL_PARAMETER; + } + + /* Send updated MDST to FSP */ + rc = fsp_update_mdst_table(); + + return rc; +} + +/* TCE mapping */ +static inline void mdst_table_tce_map(void) +{ + fsp_tce_map(PSI_DMA_MDST_TABLE, mdst_table, PSI_DMA_MDST_TABLE_SIZE); +} + +/* Initialize MDST table */ +static int mdst_table_init(void) +{ + dump_mem_region = memalign(TCE_PSIZE, PSI_DMA_MDST_TABLE_SIZE); + if (!dump_mem_region) { + log_simple_error(&e_info(OPAL_RC_DUMP_MDST_INIT), + "MDST: Failed to allocate memory for dump " + "memory region table.\n"); + return -ENOMEM; + } + + memset(dump_mem_region, 0, PSI_DMA_MDST_TABLE_SIZE); + + mdst_table = memalign(TCE_PSIZE, PSI_DMA_MDST_TABLE_SIZE); + if (!mdst_table) { + log_simple_error(&e_info(OPAL_RC_DUMP_MDST_INIT), + "MDST: Failed to allocate memory for MDST table.\n"); + return -ENOMEM; + } + + memset(mdst_table, 0, PSI_DMA_MDST_TABLE_SIZE); + mdst_table_tce_map(); + + max_mdst_entry = PSI_DMA_MDST_TABLE_SIZE / sizeof(*mdst_table); + printf("MDST: Max entries in MDST table : %d\n", max_mdst_entry); + + return OPAL_SUCCESS; +} + +/* + * Handle FSP R/R event. + */ +static bool fsp_mdst_update_rr(uint32_t cmd_sub_mod, + struct fsp_msg *msg __unused) +{ + switch (cmd_sub_mod) { + case FSP_RESET_START: + return true; + case FSP_RELOAD_COMPLETE: /* Send MDST to FSP */ + fsp_update_mdst_table(); + return true; + } + return false; +} + +static struct fsp_client fsp_mdst_client_rr = { + .message = fsp_mdst_update_rr, +}; + +/* Initialize MDST table and send notification to FSP */ +void fsp_mdst_table_init(void) +{ + if (!fsp_present()) + return; + + /* OPAL interface */ + opal_register(OPAL_REGISTER_DUMP_REGION, + fsp_opal_register_dump_region, 3); + opal_register(OPAL_UNREGISTER_DUMP_REGION, + fsp_opal_unregister_dump_region, 1); + + /* Initiate MDST */ + if (mdst_table_init() != OPAL_SUCCESS) + return; + + /* + * Ignore return code from mdst_table_add_entries so that + * we can atleast capture partial dump. + */ + dump_region_add_entries(); + fsp_update_mdst_table(); + + /* Register for Class AA (FSP R/R) */ + fsp_register_client(&fsp_mdst_client_rr, FSP_MCLASS_RR_EVENT); +} diff --git a/roms/skiboot/hw/fsp/fsp-sysparam.c b/roms/skiboot/hw/fsp/fsp-sysparam.c new file mode 100644 index 000000000..adb424e5e --- /dev/null +++ b/roms/skiboot/hw/fsp/fsp-sysparam.c @@ -0,0 +1,508 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* + * There's some system level parameters that aren't over IPMI or NVRAM + * but that the FSP exposes through this interface. + * + * We expose these through an OPAL API as there really isn't any other/better + * way of doing so. + * + * Copyright 2013-2017 IBM Corp. 
+ */ + +#include <skiboot.h> +#include <fsp.h> +#include <opal.h> +#include <device.h> +#include <lock.h> +#include <processor.h> +#include <psi.h> +#include <opal-msg.h> +#include <fsp-sysparam.h> + +struct sysparam_comp_data { + uint32_t param_len; + uint64_t async_token; +}; + +struct sysparam_req { + sysparam_compl_t completion; + void *comp_data; + void *ubuf; + uint32_t ulen; + struct fsp_msg msg; + struct fsp_msg resp; + bool done; +}; + +static struct sysparam_attr { + const char *name; + uint32_t id; + uint32_t length; + uint8_t perm; +} sysparam_attrs[] = { +#define _R OPAL_SYSPARAM_READ +#define _W OPAL_SYSPARAM_WRITE +#define _RW OPAL_SYSPARAM_RW + {"surveillance", SYS_PARAM_SURV, 4, _RW}, + {"hmc-management", SYS_PARAM_HMC_MANAGED, 4, _R}, + {"cupd-policy", SYS_PARAM_FLASH_POLICY, 4, _RW}, + {"plat-hmc-managed", SYS_PARAM_NEED_HMC, 4, _RW}, + {"fw-license-policy", SYS_PARAM_FW_LICENSE, 4, _RW}, + {"world-wide-port-num", SYS_PARAM_WWPN, 12, _W}, + {"default-boot-device", SYS_PARAM_DEF_BOOT_DEV, 1, _RW}, + {"next-boot-device", SYS_PARAM_NEXT_BOOT_DEV,1, _RW}, + {"console-select", SYS_PARAM_CONSOLE_SELECT,1, _RW}, + {"boot-device-path", SYS_PARAM_BOOT_DEV_PATH,48, _RW} +#undef _R +#undef _W +#undef _RW +}; + +static int fsp_sysparam_process(struct sysparam_req *r) +{ + u32 param_id, len; + int stlen = 0; + u8 fstat; + /* Snapshot completion before we set the "done" flag */ + sysparam_compl_t comp = r->completion; + void *cdata = r->comp_data; + + if (r->msg.state != fsp_msg_done) { + prerror("FSP: Request for sysparam 0x%x got FSP failure!\n", + fsp_msg_get_data_word(&r->msg, 0)); + stlen = -1; /* XXX Find saner error codes */ + goto complete; + } + + param_id = fsp_msg_get_data_word(&r->resp, 0); + len = fsp_msg_get_data_word(&r->resp, 1) & 0xffff; + + /* Check params validity */ + if (param_id != fsp_msg_get_data_word(&r->msg, 0)) { + prerror("FSP: Request for sysparam 0x%x got resp. for 0x%x!\n", + fsp_msg_get_data_word(&r->msg, 0), param_id); + stlen = -2; /* XXX Sane error codes */ + goto complete; + } + if (len > r->ulen) { + prerror("FSP: Request for sysparam 0x%x truncated!\n", + param_id); + len = r->ulen; + } + + /* Decode the request status */ + fstat = (r->msg.resp->word1 >> 8) & 0xff; + switch(fstat) { + case 0x00: /* XXX Is that even possible ? 
*/ + case 0x11: /* Data in request */ + memcpy(r->ubuf, &r->resp.data.bytes[8], len); + /* fallthrough */ + case 0x12: /* Data in TCE */ + stlen = len; + break; + default: + stlen = -fstat; + } + complete: + /* Call completion if any */ + if (comp) + comp(fsp_msg_get_data_word(&r->msg, 0), stlen, cdata); + + free(r); + + return stlen; +} + +static void fsp_sysparam_get_complete(struct fsp_msg *msg) +{ + struct sysparam_req *r = container_of(msg, struct sysparam_req, msg); + + /* If it's an asynchronous request, process it now */ + if (r->completion) { + fsp_sysparam_process(r); + return; + } + + /* Else just set the done flag */ + + /* Another CPU can be polling on the "done" flag without the + * lock held, so let's order the udpates to the structure + */ + lwsync(); + r->done = true; +} + +int fsp_get_sys_param(uint32_t param_id, void *buffer, uint32_t length, + sysparam_compl_t async_complete, void *comp_data) +{ + struct sysparam_req *r; + uint64_t baddr, tce_token; + int rc; + + if (!fsp_present()) + return -ENODEV; + /* + * XXX FIXME: We currently always allocate the sysparam_req here + * however, we want to avoid runtime allocations as much as + * possible, so if this is going to be used a lot at runtime, + * we probably want to pre-allocate a pool of these + */ + if (length > 4096) + return -EINVAL; + r = zalloc(sizeof(struct sysparam_req)); + if (!r) + return -ENOMEM; + r->completion = async_complete; + r->comp_data = comp_data; + r->done = false; + r->ubuf = buffer; + r->ulen = length; + r->msg.resp = &r->resp; + + /* Map always 1 page ... easier that way and none of that + * is performance critical + */ + baddr = (uint64_t)buffer; + fsp_tce_map(PSI_DMA_GET_SYSPARAM, (void *)(baddr & ~0xffful), 0x1000); + tce_token = PSI_DMA_GET_SYSPARAM | (baddr & 0xfff); + fsp_fillmsg(&r->msg, FSP_CMD_QUERY_SPARM, 3, + param_id, length, tce_token); + rc = fsp_queue_msg(&r->msg, fsp_sysparam_get_complete); + + if (rc) + free(r); + + /* Asynchronous operation or queueing failure, return */ + if (rc || async_complete) + return rc; + + /* Synchronous operation requested, spin and process */ + while(!r->done) + opal_run_pollers(); + + /* Will free the request */ + return fsp_sysparam_process(r); +} + +static void fsp_opal_getparam_complete(uint32_t param_id __unused, int err_len, + void *data) +{ + struct sysparam_comp_data *comp_data = data; + int rc = OPAL_SUCCESS; + + if (comp_data->param_len != err_len) + rc = OPAL_INTERNAL_ERROR; + + opal_queue_msg(OPAL_MSG_ASYNC_COMP, NULL, NULL, + cpu_to_be64(comp_data->async_token), + cpu_to_be64(rc)); + free(comp_data); +} + +static void fsp_opal_setparam_complete(struct fsp_msg *msg) +{ + struct sysparam_comp_data *comp_data = msg->user_data; + u8 fstat; + uint32_t param_id; + int rc = OPAL_SUCCESS; + + if (msg->state != fsp_msg_done) { + prerror("FSP: Request for set sysparam 0x%x got FSP failure!\n", + fsp_msg_get_data_word(msg, 0)); + rc = OPAL_INTERNAL_ERROR; + goto out; + } + + param_id = fsp_msg_get_data_word(msg->resp, 0); + if (param_id != fsp_msg_get_data_word(msg, 0)) { + prerror("FSP: Request for set sysparam 0x%x got resp. for 0x%x!" 
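
A stand-alone illustration of the single-page mapping done in fsp_get_sys_param() above; the window base and buffer address below are hypothetical, only the split into a page address (mapped through the TCE window) and a low-order offset (carried in the token) matches the code.

#include <stdint.h>
#include <stdio.h>

#define PAGE_MASK       0xfffULL
#define DMA_WINDOW_BASE 0x0d000000ULL   /* hypothetical window address */

int main(void)
{
	uint64_t buf = 0x30001234ULL;       /* hypothetical buffer address */
	uint64_t page = buf & ~PAGE_MASK;   /* page to map at the window */
	uint64_t token = DMA_WINDOW_BASE | (buf & PAGE_MASK);

	printf("map page 0x%llx, pass token 0x%llx to the FSP\n",
	       (unsigned long long)page, (unsigned long long)token);
	return 0;
}
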
+ "\n", fsp_msg_get_data_word(msg, 0), param_id); + rc = OPAL_INTERNAL_ERROR; + goto out; + } + + fstat = (msg->resp->word1 >> 8) & 0xff; + switch (fstat) { + case 0x00: + rc = OPAL_SUCCESS; + break; + case 0x22: + prerror("%s: Response status 0x%x, invalid data\n", __func__, + fstat); + rc = OPAL_INTERNAL_ERROR; + break; + case 0x24: + prerror("%s: Response status 0x%x, DMA error\n", __func__, + fstat); + rc = OPAL_INTERNAL_ERROR; + break; + default: + rc = OPAL_INTERNAL_ERROR; + break; + } + +out: + opal_queue_msg(OPAL_MSG_ASYNC_COMP, NULL, NULL, + cpu_to_be64(comp_data->async_token), + cpu_to_be64(rc)); + free(comp_data); + fsp_freemsg(msg); +} + +/* OPAL interface for PowerNV to read the system parameter from FSP */ +static int64_t fsp_opal_get_param(uint64_t async_token, uint32_t param_id, + uint64_t buffer, uint64_t length) +{ + struct sysparam_comp_data *comp_data; + int count, rc, i; + + if (!fsp_present()) + return OPAL_HARDWARE; + + count = ARRAY_SIZE(sysparam_attrs); + for (i = 0; i < count; i++) + if (sysparam_attrs[i].id == param_id) + break; + if (i == count) + return OPAL_PARAMETER; + + if (length < sysparam_attrs[i].length) + return OPAL_PARAMETER; + if (!(sysparam_attrs[i].perm & OPAL_SYSPARAM_READ)) + return OPAL_PERMISSION; + + comp_data = zalloc(sizeof(struct sysparam_comp_data)); + if (!comp_data) + return OPAL_NO_MEM; + + comp_data->param_len = sysparam_attrs[i].length; + comp_data->async_token = async_token; + rc = fsp_get_sys_param(param_id, (void *)buffer, + sysparam_attrs[i].length, fsp_opal_getparam_complete, + comp_data); + if (rc) { + free(comp_data); + prerror("%s: Error %d queuing param request\n", __func__, rc); + return OPAL_INTERNAL_ERROR; + } + + return OPAL_ASYNC_COMPLETION; +} + +/* OPAL interface for PowerNV to update the system parameter to FSP */ +static int64_t fsp_opal_set_param(uint64_t async_token, uint32_t param_id, + uint64_t buffer, uint64_t length) +{ + struct sysparam_comp_data *comp_data; + struct fsp_msg *msg; + uint64_t tce_token; + int count, rc, i; + + if (!fsp_present()) + return OPAL_HARDWARE; + + count = ARRAY_SIZE(sysparam_attrs); + for (i = 0; i < count; i++) + if (sysparam_attrs[i].id == param_id) + break; + if (i == count) + return OPAL_PARAMETER; + + if (length < sysparam_attrs[i].length) + return OPAL_PARAMETER; + if (!(sysparam_attrs[i].perm & OPAL_SYSPARAM_WRITE)) + return OPAL_PERMISSION; + + fsp_tce_map(PSI_DMA_SET_SYSPARAM, (void *)(buffer & ~0xffful), 0x1000); + tce_token = PSI_DMA_SET_SYSPARAM | (buffer & 0xfff); + + msg = fsp_mkmsg(FSP_CMD_SET_SPARM_2, 4, param_id, length, + tce_token >> 32, tce_token); + if (!msg) { + prerror("%s: Failed to allocate the message\n", __func__); + return OPAL_INTERNAL_ERROR; + } + + comp_data = zalloc(sizeof(struct sysparam_comp_data)); + if (!comp_data) { + fsp_freemsg(msg); + return OPAL_NO_MEM; + } + + comp_data->param_len = length; + comp_data->async_token = async_token; + msg->user_data = comp_data; + + rc = fsp_queue_msg(msg, fsp_opal_setparam_complete); + if (rc) { + free(comp_data); + fsp_freemsg(msg); + prerror("%s: Failed to queue the message\n", __func__); + return OPAL_INTERNAL_ERROR; + } + + return OPAL_ASYNC_COMPLETION; +} + +struct sysparam_notify_entry { + struct list_node link; + sysparam_update_notify notify; +}; + +static LIST_HEAD(sysparam_update_notifiers); + +/* Add client to notifier chain */ +void sysparam_add_update_notifier(sysparam_update_notify notify) +{ + struct sysparam_notify_entry *entry; + + entry = zalloc(sizeof(struct sysparam_notify_entry)); + 
assert(entry); + + entry->notify = notify; + list_add_tail(&sysparam_update_notifiers, &entry->link); +} + +/* Remove client from notifier chain */ +void sysparam_del_update_notifier(sysparam_update_notify notify) +{ + struct sysparam_notify_entry *entry; + + list_for_each(&sysparam_update_notifiers, entry, link) { + if (entry->notify == notify) { + list_del(&entry->link); + free(entry); + return; + } + } +} + +/* Update notification chain */ +static void sysparam_run_update_notifier(struct fsp_msg *msg) +{ + bool ret; + struct sysparam_notify_entry *entry; + + list_for_each(&sysparam_update_notifiers, entry, link) { + ret = entry->notify(msg); + if (ret == true) + break; + } +} + +static bool fsp_sysparam_msg(u32 cmd_sub_mod, struct fsp_msg *msg) +{ + struct fsp_msg *rsp; + int rc = -ENOMEM; + + switch(cmd_sub_mod) { + case FSP_CMD_SP_SPARM_UPD_0: + case FSP_CMD_SP_SPARM_UPD_1: + printf("FSP: Got sysparam update, param ID 0x%x\n", + fsp_msg_get_data_word(msg, 0)); + + sysparam_run_update_notifier(msg); + + rsp = fsp_mkmsg((cmd_sub_mod & 0xffff00) | 0x008000, 0); + if (rsp) + rc = fsp_queue_msg(rsp, fsp_freemsg); + if (rc) { + prerror("FSP: Error %d queuing sysparam reply\n", rc); + /* What to do here ? R/R ? */ + fsp_freemsg(rsp); + } + return true; + } + return false; +} + +static struct fsp_client fsp_sysparam_client = { + .message = fsp_sysparam_msg, +}; + +static void add_opal_sysparam_node(void) +{ + struct dt_node *sysparams; + char *names, *s; + __be32 *ids, *lens; + uint8_t *perms; + unsigned int i, count, size = 0; + + if (!fsp_present()) + return; + + sysparams = dt_new(opal_node, "sysparams"); + dt_add_property_string(sysparams, "compatible", "ibm,opal-sysparams"); + + count = ARRAY_SIZE(sysparam_attrs); + for (i = 0; i < count; i++) + size = size + strlen(sysparam_attrs[i].name) + 1; + + names = zalloc(size); + if (!names) { + prerror("%s: Failed to allocate memory for parameter names\n", + __func__); + return; + } + + ids = zalloc(count * sizeof(*ids)); + if (!ids) { + prerror("%s: Failed to allocate memory for parameter ids\n", + __func__); + goto out_free_name; + } + + lens = zalloc(count * sizeof(*lens)); + if (!lens) { + prerror("%s: Failed to allocate memory for parameter length\n", + __func__); + goto out_free_id; + } + + perms = zalloc(count * sizeof(*perms)); + if (!perms) { + prerror("%s: Failed to allocate memory for parameter length\n", + __func__); + goto out_free_len; + } + + s = names; + for (i = 0; i < count; i++) { + strcpy(s, sysparam_attrs[i].name); + s = s + strlen(sysparam_attrs[i].name) + 1; + + ids[i] = cpu_to_be32(sysparam_attrs[i].id); + lens[i] = cpu_to_be32(sysparam_attrs[i].length); + perms[i] = sysparam_attrs[i].perm; + } + + dt_add_property(sysparams, "param-name", names, size); + dt_add_property(sysparams, "param-id", ids, count * sizeof(*ids)); + dt_add_property(sysparams, "param-len", lens, count * sizeof(*lens)); + dt_add_property(sysparams, "param-perm", perms, count * sizeof(*perms)); + + free(perms); + +out_free_len: + free(lens); +out_free_id: + free(ids); +out_free_name: + free(names); +} + +void fsp_sysparam_init(void) +{ + if (!fsp_present()) + return; + + /* Register change notifications */ + fsp_register_client(&fsp_sysparam_client, FSP_MCLASS_SERVICE); + + /* Register OPAL interfaces */ + opal_register(OPAL_GET_PARAM, fsp_opal_get_param, 4); + opal_register(OPAL_SET_PARAM, fsp_opal_set_param, 4); + + /* Add device-tree nodes */ + add_opal_sysparam_node(); +} diff --git a/roms/skiboot/hw/fsp/fsp.c b/roms/skiboot/hw/fsp/fsp.c new file 
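
The "param-name" property built by add_opal_sysparam_node() above is a flat byte array of NUL-terminated strings; a consumer could walk it as sketched below (the sample names are arbitrary, only the layout matches).

#include <stdio.h>
#include <string.h>

int main(void)
{
	/* Two names packed back to back, each with its terminating NUL */
	const char names[] = "surveillance\0hmc-management\0";
	size_t size = sizeof(names) - 1;   /* drop the literal's extra NUL */

	for (size_t off = 0; off < size; off += strlen(names + off) + 1)
		printf("param: %s\n", names + off);
	return 0;
}
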
mode 100644 index 000000000..2c5f9d71b --- /dev/null +++ b/roms/skiboot/hw/fsp/fsp.c @@ -0,0 +1,2709 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* + * Base FSP (Flexible Service Processor) Support + * + * FSP is the BMC-like thing in some IBM POWER servers + * + * Copyright 2013-2019 IBM Corp. + */ + +#include <stdarg.h> +#include <processor.h> +#include <io.h> +#include <fsp.h> +#include <lock.h> +#include <interrupts.h> +#include <device.h> +#include <trace.h> +#include <timebase.h> +#include <cpu.h> +#include <errorlog.h> +#include <opal.h> +#include <opal-msg.h> +#include <ccan/list/list.h> + +extern uint32_t hir_trigger; + +DEFINE_LOG_ENTRY(OPAL_RC_FSP_POLL_TIMEOUT, OPAL_PLATFORM_ERR_EVT, OPAL_FSP, + OPAL_PLATFORM_FIRMWARE, OPAL_RECOVERED_ERR_GENERAL, OPAL_NA); + +DEFINE_LOG_ENTRY(OPAL_RC_FSP_MBOX_ERR, OPAL_PLATFORM_ERR_EVT, OPAL_FSP, + OPAL_PLATFORM_FIRMWARE, OPAL_RECOVERED_ERR_GENERAL, OPAL_NA); + +DEFINE_LOG_ENTRY(OPAL_RC_FSP_DISR_HIR_MASK, OPAL_PLATFORM_ERR_EVT, OPAL_FSP, + OPAL_PLATFORM_FIRMWARE, OPAL_RECOVERED_ERR_GENERAL, OPAL_NA); + +/* We make this look like a Surveillance error, even though it really + * isn't one. + */ +DEFINE_LOG_ENTRY(OPAL_INJECTED_HIR, OPAL_MISC_ERR_EVT, OPAL_SURVEILLANCE, + OPAL_SURVEILLANCE_ERR, OPAL_PREDICTIVE_ERR_GENERAL, + OPAL_MISCELLANEOUS_INFO_ONLY); + +#define FSP_TRACE_MSG +#define FSP_TRACE_EVENT + +#define FSP_MAX_IOPATH 4 + +enum fsp_path_state { + fsp_path_bad, + fsp_path_backup, + fsp_path_active, +}; + +struct fsp_iopath { + enum fsp_path_state state; + void *fsp_regs; + struct psi *psi; +}; + +enum fsp_mbx_state { + fsp_mbx_idle, /* Mailbox ready to send */ + fsp_mbx_send, /* Mailbox sent, waiting for ack */ + fsp_mbx_crit_op, /* Critical operation in progress */ + fsp_mbx_prep_for_reset, /* Prepare for reset sent */ + fsp_mbx_hir_seq_done, /* HIR sequence done, link forced down */ + fsp_mbx_err, /* Mailbox in error state, waiting for r&r */ + fsp_mbx_rr, /* Mailbox in r&r */ +}; + +struct fsp { + struct fsp *link; + unsigned int index; + enum fsp_mbx_state state; + struct fsp_msg *pending; + + unsigned int iopath_count; + int active_iopath; /* -1: no active IO path */ + struct fsp_iopath iopath[FSP_MAX_IOPATH]; +}; + +enum ipl_state { + ipl_initial = 0x00000000, + ipl_opl_sent = 0x00000001, + ipl_got_continue = 0x00000002, + ipl_got_new_role = 0x00000004, + ipl_got_caps = 0x00000008, + ipl_got_fsp_functional = 0x00000010 +}; +static enum ipl_state ipl_state = ipl_initial; + +static struct fsp *first_fsp; +static struct fsp *active_fsp; +static u16 fsp_curseq = 0x8000; +static __be64 *fsp_tce_table; + +#define FSP_INBOUND_SIZE 0x00100000UL +static void *fsp_inbound_buf = NULL; +static u32 fsp_inbound_off; + +static struct lock fsp_lock = LOCK_UNLOCKED; +static struct lock fsp_poll_lock = LOCK_UNLOCKED; + +static u64 fsp_cmdclass_resp_bitmask; +static u64 timeout_timer; + +static u64 fsp_hir_timeout; + +#define FSP_CRITICAL_OP_TIMEOUT 128 +#define FSP_DRCR_CLEAR_TIMEOUT 128 + +/* LID numbers. 
For now we hijack some of pHyp's own until i figure + * out the whole business with the MasterLID + */ +#define KERNEL_LID_PHYP 0x80a00701 +#define KERNEL_LID_OPAL 0x80f00101 +#define INITRAMFS_LID_OPAL 0x80f00102 + +/* + * We keep track on last logged values for some things to print only on + * value changes, but also to relieve pressure on the tracer which + * doesn't do a very good job at detecting repeats when called from + * many different CPUs + */ +static u32 disr_last_print; +static u32 drcr_last_print; +static u32 hstate_last_print; + +void fsp_handle_resp(struct fsp_msg *msg); + +struct fsp_cmdclass { + int timeout; + bool busy; + struct list_head msgq; + struct list_head clientq; + struct list_head rr_queue; /* To queue up msgs during R/R */ + u64 timesent; +}; + +static struct fsp_cmdclass fsp_cmdclass_rr; + +static struct fsp_cmdclass fsp_cmdclass[FSP_MCLASS_LAST - FSP_MCLASS_FIRST + 1] += { +#define DEF_CLASS(_cl, _to) [_cl - FSP_MCLASS_FIRST] = { .timeout = _to } + DEF_CLASS(FSP_MCLASS_SERVICE, 16), + DEF_CLASS(FSP_MCLASS_PCTRL_MSG, 16), + DEF_CLASS(FSP_MCLASS_PCTRL_ABORTS, 16), + DEF_CLASS(FSP_MCLASS_ERR_LOG, 16), + DEF_CLASS(FSP_MCLASS_CODE_UPDATE, 40), + DEF_CLASS(FSP_MCLASS_FETCH_SPDATA, 16), + DEF_CLASS(FSP_MCLASS_FETCH_HVDATA, 16), + DEF_CLASS(FSP_MCLASS_NVRAM, 16), + DEF_CLASS(FSP_MCLASS_MBOX_SURV, 2), + DEF_CLASS(FSP_MCLASS_RTC, 16), + DEF_CLASS(FSP_MCLASS_SMART_CHIP, 20), + DEF_CLASS(FSP_MCLASS_INDICATOR, 180), + DEF_CLASS(FSP_MCLASS_HMC_INTFMSG, 16), + DEF_CLASS(FSP_MCLASS_HMC_VT, 16), + DEF_CLASS(FSP_MCLASS_HMC_BUFFERS, 16), + DEF_CLASS(FSP_MCLASS_SHARK, 16), + DEF_CLASS(FSP_MCLASS_MEMORY_ERR, 16), + DEF_CLASS(FSP_MCLASS_CUOD_EVENT, 16), + DEF_CLASS(FSP_MCLASS_HW_MAINT, 16), + DEF_CLASS(FSP_MCLASS_VIO, 16), + DEF_CLASS(FSP_MCLASS_SRC_MSG, 16), + DEF_CLASS(FSP_MCLASS_DATA_COPY, 16), + DEF_CLASS(FSP_MCLASS_TONE, 16), + DEF_CLASS(FSP_MCLASS_VIRTUAL_NVRAM, 16), + DEF_CLASS(FSP_MCLASS_TORRENT, 16), + DEF_CLASS(FSP_MCLASS_NODE_PDOWN, 16), + DEF_CLASS(FSP_MCLASS_DIAG, 16), + DEF_CLASS(FSP_MCLASS_PCIE_LINK_TOPO, 16), + DEF_CLASS(FSP_MCLASS_OCC, 16), + DEF_CLASS(FSP_MCLASS_TRUSTED_BOOT, 2), + DEF_CLASS(FSP_MCLASS_HBRT, 2), +}; + +static void fsp_trace_msg(struct fsp_msg *msg, u8 dir __unused) +{ + union trace fsp __unused; +#ifdef FSP_TRACE_MSG + size_t len = offsetof(struct trace_fsp_msg, data[msg->dlen]); + + fsp.fsp_msg.dlen = msg->dlen; + fsp.fsp_msg.word0 = cpu_to_be32(msg->word0); + fsp.fsp_msg.word1 = cpu_to_be32(msg->word1); + fsp.fsp_msg.dir = dir; + memcpy(fsp.fsp_msg.data, msg->data.bytes, msg->dlen); + trace_add(&fsp, TRACE_FSP_MSG, len); +#endif /* FSP_TRACE_MSG */ + assert(msg->dlen <= sizeof(fsp.fsp_msg.data)); +} + +static struct fsp *fsp_get_active(void) +{ + /* XXX Handle transition between FSPs */ + return active_fsp; +} + +static u64 fsp_get_class_bit(u8 class) +{ + /* Alias classes CE and CF as the FSP has a single queue */ + if (class == FSP_MCLASS_IPL) + class = FSP_MCLASS_SERVICE; + + return 1ul << (class - FSP_MCLASS_FIRST); +} + +static struct fsp_cmdclass *__fsp_get_cmdclass(u8 class) +{ + struct fsp_cmdclass *ret; + + /* RR class is special */ + if (class == FSP_MCLASS_RR_EVENT) + return &fsp_cmdclass_rr; + + /* Bound check */ + if (class < FSP_MCLASS_FIRST || class > FSP_MCLASS_LAST) + return NULL; + + /* Alias classes CE and CF as the FSP has a single queue */ + if (class == FSP_MCLASS_IPL) + class = FSP_MCLASS_SERVICE; + + ret = &fsp_cmdclass[class - FSP_MCLASS_FIRST]; + + /* Unknown class */ + if (ret->timeout == 0) + return NULL; + + return 
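
A stand-alone sketch of the class-to-bit mapping used by fsp_get_class_bit() above, including the CE/CF aliasing the comment mentions; the numeric class values are assumptions, only the relative layout matters here.

#include <stdint.h>
#include <stdio.h>

/* Hypothetical values; only the relative layout matters */
#define MCLASS_FIRST   0xce
#define MCLASS_SERVICE 0xce
#define MCLASS_IPL     0xcf

static uint64_t class_bit(uint8_t class)
{
	/* CE and CF share one FSP queue, so they share one bit */
	if (class == MCLASS_IPL)
		class = MCLASS_SERVICE;
	return 1ull << (class - MCLASS_FIRST);
}

int main(void)
{
	printf("CE -> %#llx, CF -> %#llx\n",
	       (unsigned long long)class_bit(0xce),
	       (unsigned long long)class_bit(0xcf));
	return 0;
}
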
ret; +} + +static struct fsp_cmdclass *fsp_get_cmdclass(struct fsp_msg *msg) +{ + u8 c = msg->word0 & 0xff; + + return __fsp_get_cmdclass(c); +} + +static struct fsp_msg *__fsp_allocmsg(void) +{ + return zalloc(sizeof(struct fsp_msg)); +} + +struct fsp_msg *fsp_allocmsg(bool alloc_response) +{ + struct fsp_msg *msg; + + msg = __fsp_allocmsg(); + if (!msg) + return NULL; + if (alloc_response) { + msg->resp = __fsp_allocmsg(); + if (!msg->resp) { + free(msg); + return NULL; + } + } + + return msg; +} + +void __fsp_freemsg(struct fsp_msg *msg) +{ + free(msg); +} + +void fsp_freemsg(struct fsp_msg *msg) +{ + if (msg && msg->resp) + __fsp_freemsg(msg->resp); + __fsp_freemsg(msg); +} + +void fsp_cancelmsg(struct fsp_msg *msg) +{ + bool need_unlock = false; + struct fsp_cmdclass* cmdclass = fsp_get_cmdclass(msg); + + if (!fsp_in_rr()) { + prerror("FSP: Message cancel allowed only when" + "FSP is in reset\n"); + return; + } + + if (!cmdclass) + return; + + /* Recursive locking */ + need_unlock = lock_recursive(&fsp_lock); + + list_del(&msg->link); + msg->state = fsp_msg_cancelled; + + if (need_unlock) + unlock(&fsp_lock); +} + +static void fsp_wreg(struct fsp *fsp, u32 reg, u32 val) +{ + struct fsp_iopath *iop; + + if (fsp->active_iopath < 0) + return; + iop = &fsp->iopath[fsp->active_iopath]; + if (iop->state == fsp_path_bad) + return; + out_be32(iop->fsp_regs + reg, val); +} + +static u32 fsp_rreg(struct fsp *fsp, u32 reg) +{ + struct fsp_iopath *iop; + + if (fsp->active_iopath < 0) + return 0xffffffff; + iop = &fsp->iopath[fsp->active_iopath]; + if (iop->state == fsp_path_bad) + return 0xffffffff; + return in_be32(iop->fsp_regs + reg); +} + +static void fsp_reg_dump(void) +{ +#define FSP_DUMP_ONE(x) \ + prlog(PR_DEBUG, " %20s: %x\n", #x, fsp_rreg(fsp, x)); + + struct fsp *fsp = fsp_get_active(); + + if (!fsp) + return; + + prlog(PR_DEBUG, "FSP #%d: Register dump (state=%d)\n", + fsp->index, fsp->state); + FSP_DUMP_ONE(FSP_DRCR_REG); + FSP_DUMP_ONE(FSP_DISR_REG); + FSP_DUMP_ONE(FSP_MBX1_HCTL_REG); + FSP_DUMP_ONE(FSP_MBX1_FCTL_REG); + FSP_DUMP_ONE(FSP_MBX2_HCTL_REG); + FSP_DUMP_ONE(FSP_MBX2_FCTL_REG); + FSP_DUMP_ONE(FSP_SDES_REG); + FSP_DUMP_ONE(FSP_HDES_REG); + FSP_DUMP_ONE(FSP_HDIR_REG); + FSP_DUMP_ONE(FSP_HDIM_SET_REG); + FSP_DUMP_ONE(FSP_PDIR_REG); + FSP_DUMP_ONE(FSP_PDIM_SET_REG); + FSP_DUMP_ONE(FSP_SCRATCH0_REG); + FSP_DUMP_ONE(FSP_SCRATCH1_REG); + FSP_DUMP_ONE(FSP_SCRATCH2_REG); + FSP_DUMP_ONE(FSP_SCRATCH3_REG); +} + +static void fsp_notify_rr_state(u32 state) +{ + struct fsp_client *client, *next; + struct fsp_cmdclass *cmdclass = __fsp_get_cmdclass(FSP_MCLASS_RR_EVENT); + + assert(cmdclass); + list_for_each_safe(&cmdclass->clientq, client, next, link) + client->message(state, NULL); +} + +static void fsp_reset_cmdclass(void) +{ + int i; + struct fsp_msg *msg; + + /* + * The FSP is in reset and hence we can't expect any response + * to outstanding messages that we've already sent. Clear the + * bitmap to reflect that. 
+ */ + fsp_cmdclass_resp_bitmask = 0; + for (i = 0; i <= (FSP_MCLASS_LAST - FSP_MCLASS_FIRST); i++) { + struct fsp_cmdclass *cmdclass = &fsp_cmdclass[i]; + cmdclass->busy = false; + cmdclass->timesent = 0; + + /* Make sure the message queue is empty */ + while(!list_empty(&cmdclass->msgq)) { + msg = list_pop(&cmdclass->msgq, struct fsp_msg, + link); + list_add_tail(&cmdclass->rr_queue, &msg->link); + } + } +} + +static bool fsp_in_hir(struct fsp *fsp) +{ + switch (fsp->state) { + case fsp_mbx_crit_op: + case fsp_mbx_prep_for_reset: + return true; + default: + return false; + } +} + +static bool fsp_in_reset(struct fsp *fsp) +{ + switch (fsp->state) { + case fsp_mbx_hir_seq_done: /* FSP reset triggered */ + case fsp_mbx_err: /* Will be reset soon */ + case fsp_mbx_rr: /* Mbx activity stopped pending reset */ + return true; + default: + return false; + } +} + +bool fsp_in_rr(void) +{ + struct fsp *fsp = fsp_get_active(); + struct fsp_iopath *iop; + + if (fsp->active_iopath < 0) + return true; + + iop = &fsp->iopath[fsp->active_iopath]; + + if (fsp_in_reset(fsp) || fsp_in_hir(fsp) || !(psi_check_link_active(iop->psi))) + return true; + + return false; +} + +static bool fsp_hir_state_timeout(void) +{ + u64 now = mftb(); + + if (tb_compare(now, fsp_hir_timeout) == TB_AAFTERB) + return true; + + return false; +} + +static void fsp_set_hir_timeout(u32 seconds) +{ + u64 now = mftb(); + fsp_hir_timeout = now + secs_to_tb(seconds); +} + +static bool fsp_crit_op_in_progress(struct fsp *fsp) +{ + u32 disr = fsp_rreg(fsp, FSP_DISR_REG); + + if (disr & FSP_DISR_CRIT_OP_IN_PROGRESS) + return true; + + return false; +} + +/* Notify the FSP that it will be reset soon by writing to the DRCR */ +static void fsp_prep_for_reset(struct fsp *fsp) +{ + u32 drcr; + + /* + * Its possible that the FSP went into reset by itself between the + * time the HIR is triggered and we get here. Check and bail out if so. + */ + if (fsp_in_rr()) + return; + + drcr = fsp_rreg(fsp, FSP_DRCR_REG); + + prlog(PR_TRACE, "FSP: Writing reset to DRCR\n"); + drcr_last_print = drcr; + fsp_wreg(fsp, FSP_DRCR_REG, (drcr | FSP_PREP_FOR_RESET_CMD)); + fsp->state = fsp_mbx_prep_for_reset; + fsp_set_hir_timeout(FSP_DRCR_CLEAR_TIMEOUT); +} + +static void fsp_hir_poll(struct fsp *fsp, struct psi *psi) +{ + u32 drcr; + + if (fsp_in_reset(fsp) || !(psi_check_link_active(psi))) + return; + + switch (fsp->state) { + case fsp_mbx_crit_op: + if (fsp_crit_op_in_progress(fsp)) { + if (fsp_hir_state_timeout()) + prerror("FSP: Critical operation timeout\n"); + /* XXX What do do next? Check with FSP folks */ + } else { + fsp_prep_for_reset(fsp); + } + break; + case fsp_mbx_prep_for_reset: + drcr = fsp_rreg(fsp, FSP_DRCR_REG); + + if (drcr != drcr_last_print) { + prlog(PR_TRACE, "FSP: DRCR changed, old = %x," + " new = %x\n", + drcr_last_print, drcr); + drcr_last_print = drcr; + } + + if (drcr & FSP_DRCR_ACK_MASK) { + if (fsp_hir_state_timeout()) { + prerror("FSP: Ack timeout. Triggering reset\n"); + psi_reset_fsp(psi); + fsp->state = fsp_mbx_hir_seq_done; + } + } else { + prlog(PR_TRACE, "FSP: DRCR ack received." + " Triggering reset\n"); + psi_reset_fsp(psi); + fsp->state = fsp_mbx_hir_seq_done; + } + break; + default: + break; + } +} + +/* + * This is the main entry for the host initiated reset case. + * This gets called when: + * a. Surveillance ack is not received in 120 seconds + * b. A mailbox command doesn't get a response within the stipulated time. 
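
The host-initiated reset sequence driven by fsp_hir_poll() above can be summarised as a small state machine; this is a simplified sketch (it ignores the debug-in-progress bail-out and the real 128-second timeouts), not the actual implementation.

#include <stdio.h>

enum hir_state { MBX_IDLE, MBX_CRIT_OP, MBX_PREP_FOR_RESET, MBX_HIR_DONE };

/* One step of the host-initiated-reset sequence, sketched from the
 * poll/trigger logic in this file. */
static enum hir_state hir_step(enum hir_state st, int crit_op_in_progress,
			       int drcr_acked, int timed_out)
{
	switch (st) {
	case MBX_IDLE:
		return crit_op_in_progress ? MBX_CRIT_OP : MBX_PREP_FOR_RESET;
	case MBX_CRIT_OP:            /* wait for the FSP to finish */
		return crit_op_in_progress ? st : MBX_PREP_FOR_RESET;
	case MBX_PREP_FOR_RESET:     /* DRCR written, wait for the ack */
		return (drcr_acked || timed_out) ? MBX_HIR_DONE : st;
	default:
		return st;
	}
}

int main(void)
{
	enum hir_state st = hir_step(MBX_IDLE, 1, 0, 0);

	st = hir_step(st, 0, 0, 0);      /* critical op finished */
	st = hir_step(st, 0, 1, 0);      /* FSP acked the DRCR write */
	printf("final state: %d\n", st); /* 3 == MBX_HIR_DONE */
	return 0;
}
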
+ */ +static void __fsp_trigger_reset(void) +{ + struct fsp *fsp = fsp_get_active(); + u32 disr; + + /* Already in one of the error processing states */ + if (fsp_in_hir(fsp) || fsp_in_reset(fsp)) + return; + + prerror("FSP: fsp_trigger_reset() entry\n"); + + drcr_last_print = 0; + /* + * Check if we are allowed to reset the FSP. We aren't allowed to + * reset the FSP if the FSP_DISR_DBG_IN_PROGRESS is set. + */ + disr = fsp_rreg(fsp, FSP_DISR_REG); + if (disr & FSP_DISR_DBG_IN_PROGRESS) { + prerror("FSP: Host initiated reset disabled\n"); + return; + } + + /* + * Check if some critical operation is in progress as indicated + * by FSP_DISR_CRIT_OP_IN_PROGRESS. Timeout is 128 seconds + */ + if (fsp_crit_op_in_progress(fsp)) { + prlog(PR_NOTICE, "FSP: Critical operation in progress\n"); + fsp->state = fsp_mbx_crit_op; + fsp_set_hir_timeout(FSP_CRITICAL_OP_TIMEOUT); + } else + fsp_prep_for_reset(fsp); +} + +static uint32_t fsp_hir_reason_plid; + +void fsp_trigger_reset(uint32_t plid) +{ + lock(&fsp_lock); + fsp_hir_reason_plid = plid; + __fsp_trigger_reset(); + unlock(&fsp_lock); +} + +/* + * Called when we trigger a HIR or when the FSP tells us via the DISR's + * RR bit that one is impending. We should therefore stop all mbox activity. + */ +static void fsp_start_rr(struct fsp *fsp) +{ + struct fsp_iopath *iop; + + if (fsp->state == fsp_mbx_rr) + return; + + /* We no longer have an active path on that FSP */ + if (fsp->active_iopath >= 0) { + iop = &fsp->iopath[fsp->active_iopath]; + iop->state = fsp_path_bad; + fsp->active_iopath = -1; + } + fsp->state = fsp_mbx_rr; + disr_last_print = 0; + hstate_last_print = 0; + + /* + * Mark all command classes as non-busy and clear their + * timeout, then flush all messages in our staging queue + */ + fsp_reset_cmdclass(); + + /* Notify clients. We have to drop the lock here */ + unlock(&fsp_lock); + fsp_notify_rr_state(FSP_RESET_START); + lock(&fsp_lock); + + /* + * Unlike earlier, we don't trigger the PSI link polling + * from this point. We wait for the PSI interrupt to tell + * us the FSP is really down and then start the polling there. + */ +} + +/* + * Called on normal/quick shutdown to give up the PSI link + */ +void fsp_reset_links(void) +{ + struct fsp *fsp = fsp_get_active(); + struct fsp_iopath *iop; + + if (!fsp) + return; + + /* Already in one of the error states? */ + if (fsp_in_hir(fsp) || fsp_in_reset(fsp)) + return; + + iop = &fsp->iopath[fsp->active_iopath]; + prlog(PR_NOTICE, "FSP #%d: Host initiated shutdown." 
+ " Giving up the PSI link\n", fsp->index); + psi_disable_link(iop->psi); + return; +} + +static void fsp_trace_event(struct fsp *fsp, u32 evt, + u32 data0, u32 data1, u32 data2, u32 data3) +{ + union trace tfsp __unused; +#ifdef FSP_TRACE_EVENT + size_t len = sizeof(struct trace_fsp_event); + + tfsp.fsp_evt.event = cpu_to_be16(evt); + tfsp.fsp_evt.fsp_state = cpu_to_be16(fsp->state); + tfsp.fsp_evt.data[0] = cpu_to_be32(data0); + tfsp.fsp_evt.data[1] = cpu_to_be32(data1); + tfsp.fsp_evt.data[2] = cpu_to_be32(data2); + tfsp.fsp_evt.data[3] = cpu_to_be32(data3); + trace_add(&tfsp, TRACE_FSP_EVENT, len); +#endif /* FSP_TRACE_EVENT */ +} + +static void fsp_handle_errors(struct fsp *fsp) +{ + u32 hstate; + struct fsp_iopath *iop; + struct psi *psi; + u32 disr; + + if (fsp->active_iopath < 0) { + prerror("FSP #%d: fsp_handle_errors() with no active IOP\n", + fsp->index); + return; + } + + iop = &fsp->iopath[fsp->active_iopath]; + if (!iop->psi) { + prerror("FSP: Active IOP with no PSI link !\n"); + return; + } + psi = iop->psi; + + /* + * If the link is not up, start R&R immediately, we do call + * psi_disable_link() in this case as while the link might + * not be up, it might still be enabled and the PSI layer + * "active" bit still set + */ + if (!psi_check_link_active(psi)) { + /* Start R&R process */ + fsp_trace_event(fsp, TRACE_FSP_EVT_LINK_DOWN, 0, 0, 0, 0); + prerror("FSP #%d: Link down, starting R&R\n", fsp->index); + + fsp_start_rr(fsp); + return; + } + + /* Link is up, check for other conditions */ + disr = fsp_rreg(fsp, FSP_DISR_REG); + + /* If in R&R, log values */ + if (disr != disr_last_print) { + fsp_trace_event(fsp, TRACE_FSP_EVT_DISR_CHG, disr, 0, 0, 0); + + prlog(PR_TRACE, "FSP #%d: DISR stat change = 0x%08x\n", + fsp->index, disr); + disr_last_print = disr; + } + + /* On a deferred mbox error, trigger a HIR + * Note: We may never get here since the link inactive case is handled + * above and the other case is when the iop->psi is NULL, which is + * quite rare. + */ + if (fsp->state == fsp_mbx_err) { + uint32_t plid; + plid = log_simple_error(&e_info(OPAL_RC_FSP_MBOX_ERR), + "FSP #%d: Triggering HIR on mbx_err\n", + fsp->index); + fsp_trigger_reset(plid); + return; + } + + /* + * If we get here as part of normal flow, the FSP is telling + * us that there will be an impending R&R, so we stop all mbox + * activity. The actual link down trigger is via a PSI + * interrupt that may arrive in due course. + */ + if (disr & FSP_DISR_FSP_IN_RR) { + /* + * If we get here with DEBUG_IN_PROGRESS also set, the + * FSP is in debug and we should *not* reset it now + */ + if (disr & FSP_DISR_DBG_IN_PROGRESS) + return; + + /* + * When the linux comes back up, we still see that bit + * set for a bit, so just move on, nothing to see here + */ + if (fsp->state == fsp_mbx_rr) + return; + + if (fsp_dpo_pending) { + /* + * If we are about to process a reset when DPO + * is pending, its possible that the host has + * gone down, and OPAL is on its way down and + * hence will not see the subsequent PSI interrupt. + * So, just give up the link here. + */ + prlog(PR_NOTICE, "FSP #%d: FSP reset with DPO pending." + " Giving up PSI link\n", + fsp->index); + psi_disable_link(psi); + } else { + prlog(PR_NOTICE, "FSP #%d: FSP in Reset." + " Waiting for PSI interrupt\n", + fsp->index); + } + fsp_start_rr(fsp); + } + + /* + * However, if any of Unit Check or Runtime Termintated or + * Flash Terminated bits is also set, the FSP is asking us + * to trigger a HIR so it can try to recover via the DRCR route. 
+ */ + if (disr & FSP_DISR_HIR_TRIGGER_MASK) { + const char *reason = "Unknown FSP_DISR_HIR_TRIGGER"; + uint32_t plid; + fsp_trace_event(fsp, TRACE_FSP_EVT_SOFT_RR, disr, 0, 0, 0); + + if (disr & FSP_DISR_FSP_UNIT_CHECK) + reason = "DISR Unit Check set"; + else if (disr & FSP_DISR_FSP_RUNTIME_TERM) + reason = "DISR Runtime Terminate set"; + else if (disr & FSP_DISR_FSP_FLASH_TERM) + reason = "DISR Flash Terminate set"; + + plid = log_simple_error(&e_info(OPAL_RC_FSP_DISR_HIR_MASK), + "FSP: %s. Triggering host initiated " + "reset.", reason); + + /* Clear all interrupt conditions */ + fsp_wreg(fsp, FSP_HDIR_REG, FSP_DBIRQ_ALL); + + /* Make sure this happened */ + fsp_rreg(fsp, FSP_HDIR_REG); + + fsp_trigger_reset(plid); + return; + } + + /* + * We detect an R&R complete indication, acknolwedge it + */ + if (disr & FSP_DISR_FSP_RR_COMPLETE) { + /* + * Acking this bit doens't make it go away immediately, so + * only do it while still in R&R state + */ + if (fsp->state == fsp_mbx_rr) { + fsp_trace_event(fsp, TRACE_FSP_EVT_RR_COMPL, 0,0,0,0); + + prlog(PR_NOTICE, "FSP #%d: Detected R&R complete," + " acking\n", fsp->index); + + /* Clear HDATA area */ + fsp_wreg(fsp, FSP_MBX1_HDATA_AREA, 0xff); + + /* Ack it (XDN) and clear HPEND & counts */ + fsp_wreg(fsp, FSP_MBX1_HCTL_REG, + FSP_MBX_CTL_PTS | + FSP_MBX_CTL_XDN | + FSP_MBX_CTL_HPEND | + FSP_MBX_CTL_HCSP_MASK | + FSP_MBX_CTL_DCSP_MASK); + + /* + * Mark the mbox as usable again so we can process + * incoming messages + */ + fsp->state = fsp_mbx_idle; + + /* Also clear R&R complete bit in DISR */ + fsp_wreg(fsp, FSP_DISR_REG, FSP_DISR_FSP_RR_COMPLETE); + + psi_enable_fsp_interrupt(psi); + } + } + + /* + * XXX + * + * Here we detect a number of errors, should we initiate + * and R&R ? + */ + + hstate = fsp_rreg(fsp, FSP_HDES_REG); + if (hstate != hstate_last_print) { + fsp_trace_event(fsp, TRACE_FSP_EVT_HDES_CHG, hstate, 0, 0, 0); + + prlog(PR_DEBUG, "FSP #%d: HDES stat change = 0x%08x\n", + fsp->index, hstate); + hstate_last_print = hstate; + } + + if (hstate == 0xffffffff) + return; + + /* Clear errors */ + fsp_wreg(fsp, FSP_HDES_REG, FSP_DBERRSTAT_CLR1); + + /* + * Most of those errors shouldn't have happened, we just clear + * the error state and return. In the long run, we might want + * to start retrying commands, switching FSPs or links, etc... + * + * We currently don't set our mailbox to a permanent error state. + */ + if (hstate & FSP_DBERRSTAT_ILLEGAL1) + prerror("FSP #%d: Illegal command error !\n", fsp->index); + + if (hstate & FSP_DBERRSTAT_WFULL1) + prerror("FSP #%d: Write to a full mbox !\n", fsp->index); + + if (hstate & FSP_DBERRSTAT_REMPTY1) + prerror("FSP #%d: Read from an empty mbox !\n", fsp->index); + + if (hstate & FSP_DBERRSTAT_PAR1) + prerror("FSP #%d: Parity error !\n", fsp->index); +} + +/* + * This is called by fsp_post_msg() to check if the mbox + * is in a state that allows sending of a message + * + * Due to the various "interesting" contexts fsp_post_msg() + * can be called from, including recursive locks from lock + * error messages or console code, this should avoid doing + * anything more complex than checking a bit of state. + * + * Specifically, we cannot initiate an R&R and call back into + * clients etc... from this function. + * + * The best we can do is to se the mbox in error state and + * handle it later during a poll or interrupts. 
+ */ +static bool fsp_check_can_send(struct fsp *fsp) +{ + struct fsp_iopath *iop; + struct psi *psi; + + /* Look for FSP in non-idle state */ + if (fsp->state != fsp_mbx_idle) + return false; + + /* Look for an active IO path */ + if (fsp->active_iopath < 0) + goto mbox_error; + iop = &fsp->iopath[fsp->active_iopath]; + if (!iop->psi) { + prerror("FSP: Active IOP with no PSI link !\n"); + goto mbox_error; + } + psi = iop->psi; + + /* Check if link has gone down. This will be handled later */ + if (!psi_check_link_active(psi)) { + prerror("FSP #%d: Link seems to be down on send\n", fsp->index); + goto mbox_error; + } + + /* XXX Do we want to check for other error conditions ? */ + return true; + + /* + * An error of some case occurred, we'll handle it later + * from a more normal "poll" context + */ + mbox_error: + fsp->state = fsp_mbx_err; + return false; +} + +static bool fsp_post_msg(struct fsp *fsp, struct fsp_msg *msg) +{ + u32 ctl, reg; + int i, wlen; + + prlog(PR_INSANE, "FSP #%d: fsp_post_msg (w0: 0x%08x w1: 0x%08x)\n", + fsp->index, msg->word0, msg->word1); + + /* Note: We used to read HCTL here and only modify some of + * the bits in it. This was bogus, because we would write back + * the incoming bits as '1' and clear them, causing fsp_poll() + * to then miss them. Let's just start with 0, which is how + * I suppose the HW intends us to do. + */ + + /* Set ourselves as busy */ + fsp->pending = msg; + fsp->state = fsp_mbx_send; + msg->state = fsp_msg_sent; + + /* We trace after setting the mailbox state so that if the + * tracing recurses, it ends up just queuing the message up + */ + fsp_trace_msg(msg, TRACE_FSP_MSG_OUT); + + /* Build the message in the mailbox */ + reg = FSP_MBX1_HDATA_AREA; + fsp_wreg(fsp, reg, msg->word0); reg += 4; + fsp_wreg(fsp, reg, msg->word1); reg += 4; + wlen = (msg->dlen + 3) >> 2; + for (i = 0; i < wlen; i++) { + fsp_wreg(fsp, reg, fsp_msg_get_data_word(msg, i)); + reg += 4; + } + + /* Write the header */ + fsp_wreg(fsp, FSP_MBX1_HHDR0_REG, (msg->dlen + 8) << 16); + + /* Write the control register */ + ctl = 4 << FSP_MBX_CTL_HCHOST_SHIFT; + ctl |= (msg->dlen + 8) << FSP_MBX_CTL_DCHOST_SHIFT; + ctl |= FSP_MBX_CTL_PTS | FSP_MBX_CTL_SPPEND; + prlog(PR_INSANE, " new ctl: %08x\n", ctl); + fsp_wreg(fsp, FSP_MBX1_HCTL_REG, ctl); + + return true; +} + +static void fsp_poke_queue(struct fsp_cmdclass *cmdclass) +{ + struct fsp *fsp = fsp_get_active(); + struct fsp_msg *msg; + + if (!fsp) + return; + if (!fsp_check_can_send(fsp)) + return; + + /* From here to the point where fsp_post_msg() sets fsp->state + * to !idle we must not cause any re-entrancy (no debug or trace) + * in a code path that may hit fsp_post_msg() (it's ok to do so + * if we are going to bail out), as we are committed to calling + * fsp_post_msg() and so a re-entrancy could cause us to do a + * double-send into the mailbox. 
+ */ + if (cmdclass->busy || list_empty(&cmdclass->msgq)) + return; + + msg = list_top(&cmdclass->msgq, struct fsp_msg, link); + assert(msg); + cmdclass->busy = true; + + if (!fsp_post_msg(fsp, msg)) { + prerror("FSP #%d: Failed to send message\n", fsp->index); + cmdclass->busy = false; + return; + } +} + +static void __fsp_fillmsg(struct fsp_msg *msg, u32 cmd_sub_mod, + u8 add_words, va_list list) +{ + bool response = !!(cmd_sub_mod & 0x1000000); + u8 cmd = (cmd_sub_mod >> 16) & 0xff; + u8 sub = (cmd_sub_mod >> 8) & 0xff; + u8 mod = cmd_sub_mod & 0xff; + int i; + + msg->word0 = cmd & 0xff; + msg->word1 = mod << 8 | sub; + msg->response = response; + msg->dlen = add_words << 2; + + for (i = 0; i < add_words; i++) + fsp_msg_set_data_word(msg, i, va_arg(list, unsigned int)); +} + +void fsp_fillmsg(struct fsp_msg *msg, u32 cmd_sub_mod, u32 add_words, ...) +{ + va_list list; + + va_start(list, add_words); + __fsp_fillmsg(msg, cmd_sub_mod, add_words, list); + va_end(list); +} + +struct fsp_msg *fsp_mkmsg(u32 cmd_sub_mod, u32 add_words, ...) +{ + struct fsp_msg *msg = fsp_allocmsg(!!(cmd_sub_mod & 0x1000000)); + va_list list; + + if (!msg) { + prerror("FSP: Failed to allocate struct fsp_msg\n"); + return NULL; + } + + va_start(list, add_words); + __fsp_fillmsg(msg, cmd_sub_mod, add_words, list); + va_end(list); + + return msg; +} + +/* + * IMPORTANT NOTE: This is *guaranteed* to not call the completion + * routine recusrively for *any* fsp message, either the + * queued one or a previous one. Thus it is *ok* to call + * this function with a lock held which will itself be + * taken by the completion function. + * + * Any change to this implementation must respect this + * rule. This will be especially true of things like + * reset/reload and error handling, if we fail to queue + * we must just return an error, not call any completion + * from the scope of fsp_queue_msg(). + */ +int fsp_queue_msg(struct fsp_msg *msg, void (*comp)(struct fsp_msg *msg)) +{ + struct fsp_cmdclass *cmdclass; + struct fsp *fsp = fsp_get_active(); + bool need_unlock; + u16 seq; + int rc = 0; + + if (!fsp || !msg) + return -1; + + /* Recursive locking */ + need_unlock = lock_recursive(&fsp_lock); + + /* Grab a new sequence number */ + seq = fsp_curseq; + fsp_curseq = fsp_curseq + 1; + if (fsp_curseq == 0) + fsp_curseq = 0x8000; + msg->word0 = (msg->word0 & 0xffff) | seq << 16; + + /* Set completion */ + msg->complete = comp; + + /* Clear response state */ + if (msg->resp) + msg->resp->state = fsp_msg_unused; + + /* Queue the message in the appropriate queue */ + cmdclass = fsp_get_cmdclass(msg); + if (!cmdclass) { + prerror("FSP: Invalid msg in fsp_queue_msg w0/1=0x%08x/%08x\n", + msg->word0, msg->word1); + rc = -1; + goto unlock; + } + + msg->state = fsp_msg_queued; + + /* + * If we have initiated or about to initiate a reset/reload operation, + * we stash the message on the R&R backup queue. Otherwise, queue it + * normally and poke the HW + */ + if (fsp_in_hir(fsp) || fsp_in_reset(fsp)) + list_add_tail(&cmdclass->rr_queue, &msg->link); + else { + list_add_tail(&cmdclass->msgq, &msg->link); + fsp_poke_queue(cmdclass); + } + + unlock: + if (need_unlock) + unlock(&fsp_lock); + + return rc; +} + +/* WARNING: This will drop the FSP lock !!! 
*/ +static void fsp_complete_msg(struct fsp_msg *msg) +{ + struct fsp_cmdclass *cmdclass = fsp_get_cmdclass(msg); + void (*comp)(struct fsp_msg *msg); + + assert(cmdclass); + + prlog(PR_INSANE, " completing msg, word0: 0x%08x\n", msg->word0); + + comp = msg->complete; + list_del_from(&cmdclass->msgq, &msg->link); + cmdclass->busy = false; + msg->state = fsp_msg_done; + + unlock(&fsp_lock); + if (comp) + (*comp)(msg); + lock(&fsp_lock); +} + +/* WARNING: This will drop the FSP lock !!! */ +static void fsp_complete_send(struct fsp *fsp) +{ + struct fsp_msg *msg = fsp->pending; + struct fsp_cmdclass *cmdclass = fsp_get_cmdclass(msg); + + assert(msg); + assert(cmdclass); + + fsp->pending = NULL; + + prlog(PR_INSANE, " completing send, word0: 0x%08x, resp: %d\n", + msg->word0, msg->response); + + if (msg->response) { + u64 setbit = fsp_get_class_bit(msg->word0 & 0xff); + msg->state = fsp_msg_wresp; + fsp_cmdclass_resp_bitmask |= setbit; + cmdclass->timesent = mftb(); + } else + fsp_complete_msg(msg); +} + +static void fsp_alloc_inbound(struct fsp_msg *msg) +{ + u16 func_id = fsp_msg_get_data_word(msg, 0) & 0xffff; + u32 len = fsp_msg_get_data_word(msg, 1); + u32 tce_token = 0, act_len = 0; + u8 rc = 0; + void *buf; + struct fsp_msg *resp; + + prlog(PR_DEBUG, "FSP: Allocate inbound buffer func: %04x len: %d\n", + func_id, len); + + lock(&fsp_lock); + if ((fsp_inbound_off + len) > FSP_INBOUND_SIZE) { + prerror("FSP: Out of space in buffer area !\n"); + rc = 0xeb; + goto reply; + } + + if (!fsp_inbound_buf) { + fsp_inbound_buf = memalign(TCE_PSIZE, FSP_INBOUND_SIZE); + if (!fsp_inbound_buf) { + prerror("FSP: could not allocate fsp_inbound_buf!\n"); + rc = 0xeb; + goto reply; + } + } + + buf = fsp_inbound_buf + fsp_inbound_off; + tce_token = PSI_DMA_INBOUND_BUF + fsp_inbound_off; + len = (len + TCE_MASK) & ~TCE_MASK; + fsp_inbound_off += len; + fsp_tce_map(tce_token, buf, len); + prlog(PR_DEBUG, "FSP: -> buffer at 0x%p, TCE: 0x%08x, alen: 0x%x\n", + buf, tce_token, len); + act_len = len; + + reply: + unlock(&fsp_lock); + + resp = fsp_mkmsg(FSP_RSP_ALLOC_INBOUND | rc, 3, 0, tce_token, act_len); + if (!resp) { + prerror("FSP: response message allocation failed\n"); + return; + } + if (fsp_queue_msg(resp, fsp_freemsg)) { + fsp_freemsg(resp); + prerror("FSP: Failed to queue response message\n"); + return; + } +} + +void *fsp_inbound_buf_from_tce(u32 tce_token) +{ + u32 offset = tce_token - PSI_DMA_INBOUND_BUF; + + if (tce_token < PSI_DMA_INBOUND_BUF || offset >= fsp_inbound_off) { + prerror("FSP: TCE token 0x%x out of bounds\n", tce_token); + return NULL; + } + return fsp_inbound_buf + offset; +} + +static void fsp_repost_queued_msgs_post_rr(void) +{ + struct fsp_msg *msg; + int i; + + for (i = 0; i <= (FSP_MCLASS_LAST - FSP_MCLASS_FIRST); i++) { + struct fsp_cmdclass *cmdclass = &fsp_cmdclass[i]; + bool poke = false; + + while(!list_empty(&cmdclass->rr_queue)) { + msg = list_pop(&cmdclass->rr_queue, + struct fsp_msg, link); + list_add_tail(&cmdclass->msgq, &msg->link); + poke = true; + } + if (poke) + fsp_poke_queue(cmdclass); + } +} + +static bool fsp_local_command(u32 cmd_sub_mod, struct fsp_msg *msg) +{ + u32 cmd = 0; + u32 rsp_data = 0; + struct fsp_msg *resp; + + switch(cmd_sub_mod) { + case FSP_CMD_CONTINUE_IPL: + /* We get a CONTINUE_IPL as a response to OPL */ + prlog(PR_NOTICE, "FSP: Got CONTINUE_IPL !\n"); + ipl_state |= ipl_got_continue; + return true; + + case FSP_CMD_HV_STATE_CHG: + prlog(PR_NOTICE, "FSP: Got HV state change request to %d\n", + msg->data.bytes[0]); + + /* Send response 
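
The inbound buffer carving in fsp_alloc_inbound() above always hands out whole TCE pages, and fsp_inbound_buf_from_tce() later converts a token back into a bounded offset; a stand-alone sketch of that round trip, with an assumed 4KB page and a hypothetical window base.

#include <stdint.h>
#include <stdio.h>

#define TCE_MASK    0xfffu              /* assumed 4KB TCE pages */
#define WINDOW_BASE 0x08000000u         /* hypothetical inbound window */

static uint32_t used;                   /* bytes handed out so far */

/* Reserve 'len' bytes, rounded up to whole TCE pages; return the token */
static uint32_t alloc_inbound(uint32_t len)
{
	uint32_t token = WINDOW_BASE + used;

	used += (len + TCE_MASK) & ~TCE_MASK;
	return token;
}

/* Convert a token back to an offset, rejecting out-of-range values */
static int token_to_offset(uint32_t token, uint32_t *off)
{
	if (token < WINDOW_BASE || token - WINDOW_BASE >= used)
		return -1;
	*off = token - WINDOW_BASE;
	return 0;
}

int main(void)
{
	uint32_t off, token = alloc_inbound(100); /* consumes one full page */

	if (token_to_offset(token, &off) == 0)
		printf("token 0x%x -> offset %u\n", token, off);
	return 0;
}
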
synchronously for now, we might want to + * deal with that sort of stuff asynchronously if/when + * we add support for auto-freeing of messages + */ + resp = fsp_mkmsg(FSP_RSP_HV_STATE_CHG, 0); + if (!resp) + prerror("FSP: Failed to allocate HV state response\n"); + else { + if (fsp_queue_msg(resp, fsp_freemsg)) { + fsp_freemsg(resp); + prerror("FSP: Failed to queue HV state resp\n"); + } + } + return true; + + case FSP_CMD_SP_NEW_ROLE: + /* FSP is assuming a new role */ + prlog(PR_INFO, "FSP: FSP assuming new role\n"); + resp = fsp_mkmsg(FSP_RSP_SP_NEW_ROLE, 0); + if (!resp) + prerror("FSP: Failed to allocate SP role response\n"); + else { + if (fsp_queue_msg(resp, fsp_freemsg)) { + fsp_freemsg(resp); + prerror("FSP: Failed to queue SP role resp\n"); + } + } + ipl_state |= ipl_got_new_role; + return true; + + case FSP_CMD_SP_QUERY_CAPS: + prlog(PR_INFO, "FSP: FSP query capabilities\n"); + /* XXX Do something saner. For now do a synchronous + * response and hard code our capabilities + */ + resp = fsp_mkmsg(FSP_RSP_SP_QUERY_CAPS, 4, 0x3ff80000, 0, 0, 0); + if (!resp) + prerror("FSP: Failed to allocate CAPS response\n"); + else { + if (fsp_queue_msg(resp, fsp_freemsg)) { + fsp_freemsg(resp); + prerror("FSP: Failed to queue CAPS resp\n"); + } + } + ipl_state |= ipl_got_caps; + return true; + case FSP_CMD_FSP_FUNCTNAL: + prlog(PR_INFO, "FSP: Got FSP Functional\n"); + ipl_state |= ipl_got_fsp_functional; + return true; + case FSP_CMD_ALLOC_INBOUND: + fsp_alloc_inbound(msg); + return true; + case FSP_CMD_SP_RELOAD_COMP: + if (msg->data.bytes[3] & PPC_BIT8(0)) { + fsp_fips_dump_notify(fsp_msg_get_data_word(msg, 1), + fsp_msg_get_data_word(msg, 2)); + + if (msg->data.bytes[3] & PPC_BIT8(1)) + prlog(PR_DEBUG, " PLID is %x\n", + fsp_msg_get_data_word(msg, 3)); + } + if (msg->data.bytes[3] & PPC_BIT8(2)) { + prlog(PR_INFO, "FSP: SP Reset/Reload was NOT done\n"); + } else { + prlog(PR_INFO, "FSP: SP says Reset/Reload complete\n"); + /* Notify clients that the FSP is back up */ + fsp_notify_rr_state(FSP_RELOAD_COMPLETE); + fsp_repost_queued_msgs_post_rr(); + } + return true; + case FSP_CMD_CLOSE_HMC_INTF: + /* Close the HMC interface */ + /* Though Sapphire does not support a HMC connection, the FSP + * sends this message when it is trying to open any new + * hypervisor session. So returning an error 0x51. 
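+ *
+ * (Aside: the allocate/queue/free-on-failure pattern below is repeated
+ * for every locally generated response in this function; a hypothetical
+ * helper, not part of this file, could factor it out, e.g.:)
+ *
+ *   static void fsp_queue_simple_resp(struct fsp_msg *resp, const char *what)
+ *   {
+ *           if (!resp) {
+ *                   prerror("FSP: Failed to allocate %s response\n", what);
+ *                   return;
+ *           }
+ *           if (fsp_queue_msg(resp, fsp_freemsg)) {
+ *                   fsp_freemsg(resp);
+ *                   prerror("FSP: Failed to queue %s response\n", what);
+ *           }
+ *   }
+ *
+ *   // e.g.: fsp_queue_simple_resp(fsp_mkmsg(FSP_RSP_SP_NEW_ROLE, 0), "SP role");
+ *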
+ */ + cmd = FSP_RSP_CLOSE_HMC_INTF | FSP_STAUS_INVALID_HMC_ID; + rsp_data = msg->data.bytes[0] << 24 | msg->data.bytes[1] << 16; + rsp_data &= 0xffff0000; + resp = fsp_mkmsg(cmd, 1, rsp_data); + if (!resp) + prerror("FSP: Failed to allocate HMC close response\n"); + else { + if (fsp_queue_msg(resp, fsp_freemsg)) { + fsp_freemsg(resp); + prerror("FSP: Failed to queue HMC close resp\n"); + } + } + return true; + case FSP_CMD_GET_HIR_PLID: + /* Get Platform Log Id with reason for Host Initiated Reset */ + prlog(PR_DEBUG, "FSP: Sending PLID 0x%x as HIR reason\n", + fsp_hir_reason_plid); + resp = fsp_mkmsg(FSP_RSP_GET_HIR_PLID, 1, fsp_hir_reason_plid); + if (!resp) + prerror("FSP: Failed to allocate GET_HIR_PLID response\n"); + else { + if (fsp_queue_msg(resp, fsp_freemsg)) { + fsp_freemsg(resp); + prerror("FSP: Failed to queue GET_HIR_PLID resp\n"); + } + } + fsp_hir_reason_plid = 0; + return true; + } + return false; +} + + +/* This is called without the FSP lock */ +static void fsp_handle_command(struct fsp_msg *msg) +{ + struct fsp_cmdclass *cmdclass = fsp_get_cmdclass(msg); + struct fsp_client *client, *next; + struct fsp_msg *resp; + u32 cmd_sub_mod; + + if (!cmdclass) { + prerror("FSP: Got message for unknown class %x\n", + msg->word0 & 0xff); + goto free; + } + + cmd_sub_mod = (msg->word0 & 0xff) << 16; + cmd_sub_mod |= (msg->word1 & 0xff) << 8; + cmd_sub_mod |= (msg->word1 >> 8) & 0xff; + + /* Some commands are handled locally */ + if (fsp_local_command(cmd_sub_mod, msg)) + goto free; + + /* The rest go to clients */ + list_for_each_safe(&cmdclass->clientq, client, next, link) { + if (client->message(cmd_sub_mod, msg)) + goto free; + } + + prerror("FSP: Unhandled message %06x\n", cmd_sub_mod); + + /* We don't know whether the message expected some kind of + * response, so we send one anyway + */ + resp = fsp_mkmsg((cmd_sub_mod & 0xffff00) | 0x008020, 0); + if (!resp) + prerror("FSP: Failed to allocate default response\n"); + else { + if (fsp_queue_msg(resp, fsp_freemsg)) { + fsp_freemsg(resp); + prerror("FSP: Failed to queue default response\n"); + } + } + + free: + fsp_freemsg(msg); +} + +static void __fsp_fill_incoming(struct fsp *fsp, struct fsp_msg *msg, + int dlen, u32 w0, u32 w1) +{ + unsigned int wlen, i, reg; + + msg->dlen = dlen - 8; + msg->word0 = w0; + msg->word1 = w1; + wlen = (dlen + 3) >> 2; + reg = FSP_MBX1_FDATA_AREA + 8; + for (i = 0; i < wlen; i++) { + fsp_msg_set_data_word(msg, i, fsp_rreg(fsp, reg)); + reg += 4; + } + + /* Ack it (XDN) and clear HPEND & counts */ + fsp_wreg(fsp, FSP_MBX1_HCTL_REG, + FSP_MBX_CTL_PTS | + FSP_MBX_CTL_XDN | + FSP_MBX_CTL_HPEND | + FSP_MBX_CTL_HCSP_MASK | + FSP_MBX_CTL_DCSP_MASK); + + fsp_trace_msg(msg, TRACE_FSP_MSG_IN); +} + +static void __fsp_drop_incoming(struct fsp *fsp) +{ + /* Ack it (XDN) and clear HPEND & counts */ + fsp_wreg(fsp, FSP_MBX1_HCTL_REG, + FSP_MBX_CTL_PTS | + FSP_MBX_CTL_XDN | + FSP_MBX_CTL_HPEND | + FSP_MBX_CTL_HCSP_MASK | + FSP_MBX_CTL_DCSP_MASK); +} + +/* WARNING: This will drop the FSP lock */ +static void fsp_handle_incoming(struct fsp *fsp) +{ + struct fsp_msg *msg; + u32 h0, w0, w1; + unsigned int dlen; + bool special_response = false; + + h0 = fsp_rreg(fsp, FSP_MBX1_FHDR0_REG); + dlen = (h0 >> 16) & 0xff; + + w0 = fsp_rreg(fsp, FSP_MBX1_FDATA_AREA); + w1 = fsp_rreg(fsp, FSP_MBX1_FDATA_AREA + 4); + + prlog(PR_INSANE, " Incoming: w0: 0x%08x, w1: 0x%08x, dlen: %d\n", + w0, w1, dlen); + + /* Some responses are expected out of band */ + if ((w0 & 0xff) == FSP_MCLASS_HMC_INTFMSG && + ((w1 & 0xff) == 0x8a || ((w1 & 
0xff) == 0x8b))) + special_response = true; + + /* Check for response bit */ + if (w1 & 0x80 && !special_response) { + struct fsp_cmdclass *cmdclass = __fsp_get_cmdclass(w0 & 0xff); + struct fsp_msg *req; + + if (!cmdclass) { + prerror("FSP: Got response for unknown class %x\n", + w0 & 0xff); + __fsp_drop_incoming(fsp); + return; + } + + if (!cmdclass->busy || list_empty(&cmdclass->msgq)) { + prerror("FSP #%d: Got orphan response! w0 = 0x%08x w1 = 0x%08x\n", + fsp->index, w0, w1); + __fsp_drop_incoming(fsp); + return; + } + req = list_top(&cmdclass->msgq, struct fsp_msg, link); + + /* Check if the response seems to match the message */ + if (req->state != fsp_msg_wresp || + (req->word0 & 0xff) != (w0 & 0xff) || + (req->word1 & 0xff) != (w1 & 0x7f)) { + __fsp_drop_incoming(fsp); + prerror("FSP #%d: Response doesn't match pending msg. w0 = 0x%08x w1 = 0x%08x\n", + fsp->index, w0, w1); + return; + } else { + u64 resetbit = ~fsp_get_class_bit(req->word0 & 0xff); + fsp_cmdclass_resp_bitmask &= resetbit; + cmdclass->timesent = 0; + } + + /* Allocate response if needed XXX We need to complete + * the original message with some kind of error here ? + */ + if (!req->resp) { + req->resp = __fsp_allocmsg(); + if (!req->resp) { + __fsp_drop_incoming(fsp); + prerror("FSP #%d: Failed to allocate response\n", + fsp->index); + return; + } + } + + /* Populate and complete (will drop the lock) */ + req->resp->state = fsp_msg_response; + __fsp_fill_incoming(fsp, req->resp, dlen, w0, w1); + fsp_complete_msg(req); + return; + } + + /* Allocate an incoming message */ + msg = __fsp_allocmsg(); + if (!msg) { + __fsp_drop_incoming(fsp); + prerror("FSP #%d: Failed to allocate incoming msg\n", + fsp->index); + return; + } + msg->state = fsp_msg_incoming; + __fsp_fill_incoming(fsp, msg, dlen, w0, w1); + + /* Handle FSP commands. This can recurse into fsp_queue_msg etc.. */ + unlock(&fsp_lock); + fsp_handle_command(msg); + lock(&fsp_lock); +} + +static void fsp_check_queues(struct fsp *fsp) +{ + int i; + + /* XXX In the long run, we might want to have a queue of + * classes waiting to be serviced to speed this up, either + * that or a bitmap. + */ + for (i = 0; i <= (FSP_MCLASS_LAST - FSP_MCLASS_FIRST); i++) { + struct fsp_cmdclass *cmdclass = &fsp_cmdclass[i]; + + if (fsp->state != fsp_mbx_idle) + break; + if (cmdclass->busy || list_empty(&cmdclass->msgq)) + continue; + fsp_poke_queue(cmdclass); + } +} + +static void __fsp_poll(bool interrupt) +{ + struct fsp_iopath *iop; + struct fsp *fsp = fsp_get_active(); + u32 ctl, hdir = 0; + bool psi_irq; + + /* + * The tracer isn't terribly efficient at detecting dups + * especially when coming from multiple CPUs so we do our + * own change-detection locally + */ + static u32 hdir_last_trace; + static u32 ctl_last_trace; + static bool psi_irq_last_trace; + static bool irq_last_trace; + + if (!fsp) + return; + + /* Crazy interrupt handling scheme: + * + * In order to avoid "losing" interrupts when polling the mbox + * we only clear interrupt conditions when called as a result of + * an interrupt. + * + * That way, if a poll clears, for example, the HPEND condition, + * the interrupt remains, causing a dummy interrupt later on + * thus allowing the OS to be notified of a state change (ie it + * doesn't need every poll site to monitor every state change). 
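+ *
+ * (Concretely: fsp_interrupt() further down calls __fsp_poll(true) and is
+ * the path that acknowledges HDIR at the end of the loop and re-polls,
+ * while the opal poller, fsp_opal_poll(), calls __fsp_poll(false) and
+ * normally leaves the interrupt conditions latched.)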
+ * + * However, this scheme is complicated by the fact that we need + * to clear the interrupt condition after we have cleared the + * original condition in HCTL, and we might have long stale + * interrupts which we do need to eventually get rid of. However + * clearing interrupts in such a way is racy, so we need to loop + * and re-poll HCTL after having done so or we might miss an + * event. It's a latency risk, but unlikely and probably worth it. + */ + + again: + if (fsp->active_iopath < 0) { + /* That should never happen */ + if (interrupt && (fsp->state != fsp_mbx_rr)) + prerror("FSP: Interrupt with no working IO path\n"); + return; + } + iop = &fsp->iopath[fsp->active_iopath]; + + /* Check for error state and handle R&R completion */ + fsp_handle_errors(fsp); + + /* Handle host initiated resets */ + if (fsp_in_hir(fsp)) { + fsp_hir_poll(fsp, iop->psi); + return; + } + + /* + * The above might have triggered and R&R, check that we + * are still functional + */ + if ((fsp->active_iopath < 0) || fsp_in_hir(fsp)) + return; + iop = &fsp->iopath[fsp->active_iopath]; + + /* Read interrupt status (we may or may not use it) */ + hdir = fsp_rreg(fsp, FSP_HDIR_REG); + + /* Read control now as well so we can trace them */ + ctl = fsp_rreg(fsp, FSP_MBX1_HCTL_REG); + + /* Ditto with PSI irq state */ + psi_irq = psi_poll_fsp_interrupt(iop->psi); + + /* Trace it if anything changes */ + if (hdir != hdir_last_trace || ctl != ctl_last_trace || + interrupt != irq_last_trace || psi_irq != psi_irq_last_trace) { + fsp_trace_event(fsp, TRACE_FSP_EVT_POLL_IRQ, + interrupt, hdir, ctl, psi_irq); + + hdir_last_trace = hdir; + ctl_last_trace = ctl; + irq_last_trace = interrupt; + psi_irq_last_trace = psi_irq; + } + + /* + * We *MUST* ignore the MBOX2 bits here. While MBOX2 cannot generate + * interrupt, it might still latch some bits here (and we found cases + * where the MBOX2 XUP would be set). If that happens, clearing HDIR + * never works (the bit gets set again immediately) because we don't + * clear the condition in HTCL2 and thus we loop forever. + */ + hdir &= FSP_DBIRQ_MBOX1; + + /* + * Sanity check: If an interrupt is pending and we are in polling + * mode, check that the PSI side is also pending. If some bit is + * set, just clear and move on. + */ + if (hdir && !interrupt && !psi_irq) { + prerror("FSP: WARNING ! HDIR 0x%08x but no PSI irq !\n", hdir); + fsp_wreg(fsp, FSP_HDIR_REG, hdir); + } + + /* + * We should never have the mbox in error state here unless it + * was fine until some printf inside fsp_handle_errors() caused + * the console to poke the FSP which detected a branch new error + * in the process. Let's be safe rather than sorry and handle that + * here + */ + if (fsp_in_hir(fsp) || fsp->state == fsp_mbx_err) { + prerror("FSP: Late error state detection\n"); + goto again; + } + + /* + * If we are in an R&R state with an active IO path, we + * shouldn't be getting interrupts. If we do, just clear + * the condition and print a message + */ + if (fsp->state == fsp_mbx_rr) { + if (interrupt) { + prerror("FSP: Interrupt in RR state [HDIR=0x%08x]\n", + hdir); + fsp_wreg(fsp, FSP_HDIR_REG, hdir); + } + return; + } + + /* Poll FSP CTL */ + if (ctl & (FSP_MBX_CTL_XUP | FSP_MBX_CTL_HPEND)) + prlog(PR_INSANE, "FSP #%d: poll, ctl: %x\n", fsp->index, ctl); + + /* Do we have a pending message waiting to complete ? 
*/ + if (ctl & FSP_MBX_CTL_XUP) { + fsp_wreg(fsp, FSP_MBX1_HCTL_REG, FSP_MBX_CTL_XUP); + if (fsp->state == fsp_mbx_send) { + /* mbox is free */ + fsp->state = fsp_mbx_idle; + + /* Complete message (will break the lock) */ + fsp_complete_send(fsp); + + /* Lock can have been broken, so ctl is now + * potentially invalid, let's recheck + */ + goto again; + } else { + prerror("FSP #%d: Got XUP with no pending message !\n", + fsp->index); + } + } + + if (fsp->state == fsp_mbx_send) { + /* XXX Handle send timeouts!!! */ + } + + /* Is there an incoming message ? This will break the lock as well */ + if (ctl & FSP_MBX_CTL_HPEND) + fsp_handle_incoming(fsp); + + /* Note: Lock may have been broken above, thus ctl might be invalid + * now, don't use it any further. + */ + + /* Check for something else to send */ + if (fsp->state == fsp_mbx_idle) + fsp_check_queues(fsp); + + /* Clear interrupts, and recheck HCTL if any occurred */ + if (interrupt && hdir) { + fsp_wreg(fsp, FSP_HDIR_REG, hdir); + goto again; + } +} + +void fsp_interrupt(void) +{ + lock(&fsp_lock); + __fsp_poll(true); + unlock(&fsp_lock); +} + + +int fsp_sync_msg(struct fsp_msg *msg, bool autofree) +{ + int rc; + + rc = fsp_queue_msg(msg, NULL); + if (rc) + goto bail; + + while(fsp_msg_busy(msg)) { + if (fsp_in_rr()) { + fsp_cancelmsg(msg); + rc = -1; + goto bail; + } + cpu_relax(); + opal_run_pollers(); + } + + switch(msg->state) { + case fsp_msg_done: + rc = 0; + break; + case fsp_msg_timeout: + rc = -1; /* XXX to improve */ + break; + default: + rc = -1; /* Should not happen... (assert ?) */ + } + + if (msg->resp) + rc = (msg->resp->word1 >> 8) & 0xff; + bail: + if (autofree) + fsp_freemsg(msg); + return rc; +} + +void fsp_register_client(struct fsp_client *client, u8 msgclass) +{ + struct fsp_cmdclass *cmdclass = __fsp_get_cmdclass(msgclass); + + if (!fsp_present()) + return; + assert(cmdclass); + list_add_tail(&cmdclass->clientq, &client->link); +} + +void fsp_unregister_client(struct fsp_client *client, u8 msgclass) +{ + struct fsp_cmdclass *cmdclass = __fsp_get_cmdclass(msgclass); + + if (!fsp_present()) + return; + assert(cmdclass); + list_del_from(&cmdclass->clientq, &client->link); +} + +static int fsp_init_mbox(struct fsp *fsp) +{ + unsigned int i; + u32 reg; + + /* + * Note: The documentation contradicts itself as to + * whether the HDIM bits should be set or cleared to + * enable interrupts + * + * This seems to work... + */ + + /* Mask all interrupts */ + fsp_wreg(fsp, FSP_HDIM_CLR_REG, FSP_DBIRQ_ALL); + + /* Clear all errors */ + fsp_wreg(fsp, FSP_HDES_REG, FSP_DBERRSTAT_CLR1 | FSP_DBERRSTAT_CLR2); + + /* Initialize data area as the doco says */ + for (i = 0; i < 0x40; i += 4) + fsp_wreg(fsp, FSP_MBX1_HDATA_AREA + i, 0); + + /* + * Clear whatever crap may remain in HDCR. Do not write XDN as that + * would be interpreted incorrectly as an R&R completion which + * we aren't ready to send yet ! 
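+ *
+ * (Aside, illustrating the client hooks registered a few functions above
+ * via fsp_register_client() and dispatched from fsp_handle_command(); the
+ * MY_SUBSYS_* and my_subsys_* names are purely illustrative:)
+ *
+ *   static bool my_subsys_msg(u32 cmd_sub_mod, struct fsp_msg *msg)
+ *   {
+ *           if (cmd_sub_mod != MY_SUBSYS_CMD)
+ *                   return false;   // not ours, try the next client
+ *           // ... handle msg, queue a response if one is expected ...
+ *           return true;            // consumed; fsp_handle_command() frees msg
+ *   }
+ *
+ *   static struct fsp_client my_subsys_client = {
+ *           .message = my_subsys_msg,
+ *   };
+ *
+ *   // during init: fsp_register_client(&my_subsys_client, MY_SUBSYS_MCLASS);
+ *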
+ */ + fsp_wreg(fsp, FSP_MBX1_HCTL_REG, FSP_MBX_CTL_XUP | FSP_MBX_CTL_HPEND | + FSP_MBX_CTL_HCSP_MASK | FSP_MBX_CTL_DCSP_MASK | + FSP_MBX_CTL_PTS); + + /* Clear all pending interrupts */ + fsp_wreg(fsp, FSP_HDIR_REG, FSP_DBIRQ_ALL); + + /* Enable all mbox1 interrupts */ + fsp_wreg(fsp, FSP_HDIM_SET_REG, FSP_DBIRQ_MBOX1); + + /* Decode what FSP we are connected to */ + reg = fsp_rreg(fsp, FSP_SCRATCH0_REG); + if (reg & PPC_BIT32(0)) { /* Is it a valid connection */ + if (reg & PPC_BIT32(3)) + prlog(PR_INFO, "FSP: Connected to FSP-B\n"); + else + prlog(PR_INFO, "FSP: Connected to FSP-A\n"); + } + + return 0; +} + +/* We use a single fixed TCE table for all PSI interfaces */ +static void fsp_init_tce_table(void) +{ + fsp_tce_table = (__be64 *)PSI_TCE_TABLE_BASE; + + memset(fsp_tce_table, 0, PSI_TCE_TABLE_SIZE); +} + +void fsp_tce_map(u32 offset, void *addr, u32 size) +{ + u64 raddr = (u64)addr; + + assert(!(offset & TCE_MASK)); + assert(!(raddr & TCE_MASK)); + assert(!(size & TCE_MASK)); + + size >>= TCE_SHIFT; + offset >>= TCE_SHIFT; + + while(size--) { + fsp_tce_table[offset++] = cpu_to_be64(raddr | 0x3); + raddr += TCE_PSIZE; + } +} + +void fsp_tce_unmap(u32 offset, u32 size) +{ + assert(!(offset & TCE_MASK)); + assert(!(size & TCE_MASK)); + + size >>= TCE_SHIFT; + offset >>= TCE_SHIFT; + + while(size--) + fsp_tce_table[offset++] = 0; +} + +static struct fsp *fsp_find_by_index(int index) +{ + struct fsp *fsp = first_fsp; + + do { + if (fsp->index == index) + return fsp; + } while (fsp->link != first_fsp); + + return NULL; +} + +static void fsp_init_links(struct dt_node *fsp_node) +{ + const struct dt_property *linksprop; + int i, index; + struct fsp *fsp; + struct fsp_iopath *fiop; + + linksprop = dt_find_property(fsp_node, "ibm,psi-links"); + assert(linksprop); + + index = dt_prop_get_u32(fsp_node, "reg"); + fsp = fsp_find_by_index(index); + if (!fsp) { + prerror("FSP: FSP with index %d not found\n", index); + return; + } + + fsp->state = fsp_mbx_idle; + + /* Iterate all links */ + for (i = 0; i < fsp->iopath_count; i++) { + u64 reg; + u32 link; + + link = dt_property_get_cell(linksprop, i); + fiop = &fsp->iopath[i]; + fiop->psi = psi_find_link(link); + if (fiop->psi == NULL) { + prerror("FSP #%d: Couldn't find PSI link\n", + fsp->index); + continue; + } + + prlog(PR_DEBUG, "FSP #%d: Found PSI HB link to chip %d\n", + fsp->index, link); + + psi_fsp_link_in_use(fiop->psi); + + /* Get the FSP register window */ + reg = in_be64(fiop->psi->regs + PSIHB_FSPBAR); + fiop->fsp_regs = (void *)(reg | (1ULL << 63) | + dt_prop_get_u32(fsp_node, "reg-offset")); + } +} + +static void fsp_update_links_states(struct fsp *fsp) +{ + struct fsp_iopath *fiop; + unsigned int i; + + /* Iterate all links */ + for (i = 0; i < fsp->iopath_count; i++) { + fiop = &fsp->iopath[i]; + if (!fiop->psi) + fiop->state = fsp_path_bad; + else if (fiop->psi->active) { + fsp->active_iopath = i; + fiop->state = fsp_path_active; + } else + fiop->state = fsp_path_backup; + } + + if (fsp->active_iopath >= 0) { + if (!active_fsp || (active_fsp != fsp)) + active_fsp = fsp; + + fsp_inbound_off = 0; + fiop = &fsp->iopath[fsp->active_iopath]; + psi_init_for_fsp(fiop->psi); + fsp_init_mbox(fsp); + } +} + +void fsp_reinit_fsp(void) +{ + struct fsp *fsp; + + /* Notify all FSPs to check for an updated link state */ + for (fsp = first_fsp; fsp; fsp = fsp->link) + fsp_update_links_states(fsp); +} + +static void fsp_create_fsp(struct dt_node *fsp_node) +{ + const struct dt_property *linksprop; + struct fsp *fsp; + int count, index; + + index 
= dt_prop_get_u32(fsp_node, "reg"); + prlog(PR_INFO, "FSP #%d: Found in device-tree, setting up...\n", + index); + + linksprop = dt_find_property(fsp_node, "ibm,psi-links"); + if (!linksprop || linksprop->len < 4) { + prerror("FSP #%d: No links !\n", index); + return; + } + + fsp = zalloc(sizeof(struct fsp)); + if (!fsp) { + prerror("FSP #%d: Can't allocate memory !\n", index); + return; + } + + fsp->index = index; + fsp->active_iopath = -1; + + count = linksprop->len / 4; + prlog(PR_DEBUG, "FSP #%d: Found %d IO PATH\n", index, count); + if (count > FSP_MAX_IOPATH) { + prerror("FSP #%d: WARNING, limited to %d IO PATH\n", + index, FSP_MAX_IOPATH); + count = FSP_MAX_IOPATH; + } + fsp->iopath_count = count; + + fsp->link = first_fsp; + first_fsp = fsp; + + fsp_init_links(fsp_node); + fsp_update_links_states(fsp); + + if (fsp->active_iopath >= 0) + psi_enable_fsp_interrupt(fsp->iopath[fsp->active_iopath].psi); +} + +static void fsp_opal_poll(void *data __unused) +{ + /* Test the host initiated reset */ + if (hir_trigger == 0xdeadbeef) { + uint32_t plid = log_simple_error(&e_info(OPAL_INJECTED_HIR), + "SURV: Injected HIR, initiating FSP R/R\n"); + fsp_trigger_reset(plid); + hir_trigger = 0; + } + + if (try_lock(&fsp_lock)) { + __fsp_poll(false); + unlock(&fsp_lock); + } +} + +int fsp_fatal_msg(struct fsp_msg *msg) +{ + int rc = 0; + + rc = fsp_queue_msg(msg, NULL); + if (rc) + return rc; + + while(fsp_msg_busy(msg)) { + if (fsp_in_rr()) { + fsp_cancelmsg(msg); + return -1; + } + + cpu_relax(); + fsp_opal_poll(NULL); + } + + switch(msg->state) { + case fsp_msg_done: + rc = 0; + break; + case fsp_msg_timeout: + rc = -1; /* XXX to improve */ + break; + default: + rc = -1; /* Should not happen... (assert ?) */ + } + + if (msg->resp) + rc = (msg->resp->word1 >> 8) & 0xff; + + return rc; +} + +static bool fsp_init_one(const char *compat) +{ + struct dt_node *fsp_node; + bool inited = false; + + dt_for_each_compatible(dt_root, fsp_node, compat) { + if (!inited) { + int i; + + /* Initialize the per-class msg queues */ + for (i = 0; + i <= (FSP_MCLASS_LAST - FSP_MCLASS_FIRST); i++) { + list_head_init(&fsp_cmdclass[i].msgq); + list_head_init(&fsp_cmdclass[i].clientq); + list_head_init(&fsp_cmdclass[i].rr_queue); + } + + /* Init the queues for RR notifier cmdclass */ + list_head_init(&fsp_cmdclass_rr.msgq); + list_head_init(&fsp_cmdclass_rr.clientq); + list_head_init(&fsp_cmdclass_rr.rr_queue); + + /* Register poller */ + opal_add_poller(fsp_opal_poll, NULL); + + inited = true; + } + + /* Create the FSP data structure */ + fsp_create_fsp(fsp_node); + } + + return inited; +} + +void fsp_init(void) +{ + prlog(PR_DEBUG, "FSP: Looking for FSP...\n"); + + fsp_init_tce_table(); + + if (!fsp_init_one("ibm,fsp1") && !fsp_init_one("ibm,fsp2")) { + prlog(PR_DEBUG, "FSP: No FSP on this machine\n"); + return; + } +} + +bool fsp_present(void) +{ + return first_fsp != NULL; +} + +static void fsp_timeout_poll(void *data __unused) +{ + u64 now = mftb(); + u64 timeout_val = 0; + u64 cmdclass_resp_bitmask = fsp_cmdclass_resp_bitmask; + struct fsp_cmdclass *cmdclass = NULL; + struct fsp_msg *req = NULL; + u32 index = 0; + + if (timeout_timer == 0) + timeout_timer = now + secs_to_tb(30); + + /* The lowest granularity for a message timeout is 30 secs. 
+ * So every 30secs, check if there is any message + * waiting for a response from the FSP + */ + if (tb_compare(now, timeout_timer) == TB_ABEFOREB) + return; + if (!try_lock(&fsp_poll_lock)) + return; + if (tb_compare(now, timeout_timer) == TB_ABEFOREB) { + unlock(&fsp_poll_lock); + return; + } + + while (cmdclass_resp_bitmask) { + u64 time_sent = 0; + u64 time_to_comp = 0; + + if (!(cmdclass_resp_bitmask & 0x1)) + goto next_bit; + + cmdclass = &fsp_cmdclass[index]; + timeout_val = secs_to_tb((cmdclass->timeout) * 60); + time_sent = cmdclass->timesent; + time_to_comp = now - cmdclass->timesent; + + /* Now check if the response has timed out */ + if (tb_compare(time_to_comp, timeout_val) == TB_AAFTERB) { + u32 w0, w1; + enum fsp_msg_state mstate; + + /* Take the FSP lock now and re-check */ + lock(&fsp_lock); + if (!(fsp_cmdclass_resp_bitmask & (1ull << index)) || + time_sent != cmdclass->timesent) { + unlock(&fsp_lock); + goto next_bit; + } + req = list_top(&cmdclass->msgq, struct fsp_msg, link); + if (!req) { + printf("FSP: Timeout state mismatch on class %d\n", + index); + fsp_cmdclass_resp_bitmask &= ~(1ull << index); + cmdclass->timesent = 0; + unlock(&fsp_lock); + goto next_bit; + } + w0 = req->word0; + w1 = req->word1; + mstate = req->state; + prlog(PR_WARNING, "FSP: Response from FSP timed out," + " cmd = %x subcmd = %x mod = %x state: %d\n", + w0 & 0xff, w1 & 0xff, (w1 >> 8) & 0xff, mstate); + fsp_reg_dump(); + fsp_cmdclass_resp_bitmask &= ~(1ull << index); + cmdclass->timesent = 0; + if (req->resp) { + req->resp->state = fsp_msg_timeout; + req->resp->word1 = (FSP_STATUS_BUSY << 8) | + (req->resp->word1 & 0xff); + } + fsp_complete_msg(req); + __fsp_trigger_reset(); + unlock(&fsp_lock); + fsp_hir_reason_plid = log_simple_error( + &e_info(OPAL_RC_FSP_POLL_TIMEOUT), + "FSP: Response from FSP timed out," + " cmd = %x subcmd = %x mod = %x state: %d\n", + w0 & 0xff, w1 & 0xff, (w1 >> 8) & 0xff, mstate); + } + next_bit: + cmdclass_resp_bitmask = cmdclass_resp_bitmask >> 1; + index++; + } + unlock(&fsp_poll_lock); +} + +void fsp_opl(void) +{ + struct dt_node *iplp; + + if (!fsp_present()) + return; + + /* Send OPL */ + ipl_state |= ipl_opl_sent; + fsp_sync_msg(fsp_mkmsg(FSP_CMD_OPL, 0), true); + while(!(ipl_state & ipl_got_continue)) { + opal_run_pollers(); + cpu_relax(); + } + + /* Send continue ACK */ + fsp_sync_msg(fsp_mkmsg(FSP_CMD_CONTINUE_ACK, 0), true); + + /* Wait for various FSP messages */ + prlog(PR_INFO, "INIT: Waiting for FSP to advertise new role...\n"); + while(!(ipl_state & ipl_got_new_role)) { + cpu_relax(); + opal_run_pollers(); + } + prlog(PR_INFO, "INIT: Waiting for FSP to request capabilities...\n"); + while(!(ipl_state & ipl_got_caps)) { + cpu_relax(); + opal_run_pollers(); + } + + /* Initiate the timeout poller */ + opal_add_poller(fsp_timeout_poll, NULL); + + /* Tell FSP we are in standby */ + prlog(PR_INFO, "INIT: Sending HV Functional: Standby...\n"); + fsp_sync_msg(fsp_mkmsg(FSP_CMD_HV_FUNCTNAL, 1, 0x01000000), true); + + /* Wait for FSP functional */ + prlog(PR_INFO, "INIT: Waiting for FSP functional\n"); + while(!(ipl_state & ipl_got_fsp_functional)) { + cpu_relax(); + opal_run_pollers(); + } + + /* Tell FSP we are in running state */ + prlog(PR_INFO, "INIT: Sending HV Functional: Runtime...\n"); + fsp_sync_msg(fsp_mkmsg(FSP_CMD_HV_FUNCTNAL, 1, 0x02000000), true); + + /* + * For the factory reset case, FSP sends us the PCI Bus + * Reset request. 
We don't have to do anything special with + * PCI bus numbers here; just send the Power Down message + * with modifier 0x02 to FSP. + */ + iplp = dt_find_by_path(dt_root, "ipl-params/ipl-params"); + if (iplp && dt_find_property(iplp, "pci-busno-reset-ipl")) { + prlog(PR_DEBUG, "INIT: PCI Bus Reset requested." + " Sending Power Down\n"); + fsp_sync_msg(fsp_mkmsg(FSP_CMD_POWERDOWN_PCIRS, 0), true); + } + + /* + * Tell FSP we are in running state with all partitions. + * + * This is need otherwise the FSP will not reset it's reboot count + * on failures. Ideally we should send that when we know the + * OS is up but we don't currently have a very good way to do + * that so this will do as a stop-gap + */ + prlog(PR_NOTICE, "INIT: Sending HV Functional: Runtime all partitions\n"); + fsp_sync_msg(fsp_mkmsg(FSP_CMD_HV_FUNCTNAL, 1, 0x04000000), true); +} + +uint32_t fsp_adjust_lid_side(uint32_t lid_no) +{ + struct dt_node *iplp; + const char *side = NULL; + + iplp = dt_find_by_path(dt_root, "ipl-params/ipl-params"); + if (iplp) + side = dt_prop_get_def(iplp, "cec-ipl-side", NULL); + if (!side || !strcmp(side, "temp")) + lid_no |= ADJUST_T_SIDE_LID_NO; + return lid_no; +} + +struct fsp_fetch_lid_item { + enum resource_id id; + uint32_t idx; + + uint32_t lid; + uint32_t lid_no; + uint64_t bsize; + uint32_t offset; + void *buffer; + size_t *length; + size_t remaining; + size_t chunk_requested; + struct list_node link; + int result; +}; + +/* + * We have a queue of things to fetch + * when fetched, it moves to fsp_fetched_lid until we're asked if it + * has been fetched, in which case it's free()d. + * + * Everything is protected with fsp_fetch_lock. + * + * We use PSI_DMA_FETCH TCE entry for this fetching queue. If something + * is in the fsp_fetch_lid_queue, it means we're using this TCE entry! + * + * If we add the first entry to fsp_fetch_lid_queue, we trigger fetching! 
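+ *
+ * (A rough caller-side sketch of fsp_fetch_data_queue() declared below;
+ * the names are illustrative, and the buffer argument must be an address
+ * in PSI DMA space, i.e. already TCE-mapped, not a raw skiboot pointer:)
+ *
+ *   static void my_fetch_done(struct fsp_msg *msg)
+ *   {
+ *           u8 rc = (msg->resp->word1 >> 8) & 0xff;
+ *           u32 written = fsp_msg_get_data_word(msg->resp, 2);
+ *
+ *           prlog(PR_DEBUG, "fetch done rc=0x%02x len=0x%x\n", rc, written);
+ *           fsp_freemsg(msg);
+ *   }
+ *
+ *   size_t len = chunk_size;
+ *   fsp_fetch_data_queue(0, FSP_DATASET_NONSP_LID, my_lid_id, 0,
+ *                        (void *)(unsigned long)my_tce_addr, &len,
+ *                        my_fetch_done);
+ *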
+ */ +static LIST_HEAD(fsp_fetch_lid_queue); +static LIST_HEAD(fsp_fetched_lid); +static struct lock fsp_fetch_lock = LOCK_UNLOCKED; + +/* + * Asynchronous fsp fetch data call + * + * Note: + * buffer = PSI DMA address space + */ +int fsp_fetch_data_queue(uint8_t flags, uint16_t id, uint32_t sub_id, + uint32_t offset, void *buffer, size_t *length, + void (*comp)(struct fsp_msg *msg)) +{ + struct fsp_msg *msg; + uint32_t chunk = *length; + + if (!comp) + return OPAL_PARAMETER; + + msg = fsp_mkmsg(FSP_CMD_FETCH_SP_DATA, 0x6, flags << 16 | id, + sub_id, offset, 0, buffer, chunk); + if (!msg) { + prerror("FSP: allocation failed!\n"); + return OPAL_INTERNAL_ERROR; + } + if (fsp_queue_msg(msg, comp)) { + fsp_freemsg(msg); + prerror("FSP: Failed to queue fetch data message\n"); + return OPAL_INTERNAL_ERROR; + } + return OPAL_SUCCESS; +} + +#define CAPP_IDX_VENICE_DD10 0x100ea +#define CAPP_IDX_VENICE_DD20 0x200ea +#define CAPP_IDX_MURANO_DD20 0x200ef +#define CAPP_IDX_MURANO_DD21 0x201ef +#define CAPP_IDX_NAPLES_DD10 0x100d3 +#define CAPP_IDX_NIMBUS_DD10 0x100d1 +#define CAPP_IDX_NIMBUS_DD20 0x200d1 +#define CAPP_IDX_NIMBUS_DD21 0x201d1 +#define CAPP_IDX_NIMBUS_DD22 0x202d1 +#define CAPP_IDX_NIMBUS_DD23 0x203d1 + +#define IMA_CATALOG_NIMBUS 0x4e0200 +#define IMA_CATALOG_P10_DD1 0x800100 +#define IMA_CATALOG_P10_DD2 0x800200 + + +static struct { + enum resource_id id; + uint32_t idx; + uint32_t lid_no; +} fsp_lid_map[] = { + { RESOURCE_ID_KERNEL, RESOURCE_SUBID_NONE, KERNEL_LID_OPAL }, + { RESOURCE_ID_INITRAMFS,RESOURCE_SUBID_NONE, INITRAMFS_LID_OPAL }, + { RESOURCE_ID_IMA_CATALOG,IMA_CATALOG_NIMBUS, 0x80f00103 }, + { RESOURCE_ID_CAPP, CAPP_IDX_MURANO_DD20, 0x80a02002 }, + { RESOURCE_ID_CAPP, CAPP_IDX_MURANO_DD21, 0x80a02001 }, + { RESOURCE_ID_CAPP, CAPP_IDX_VENICE_DD10, 0x80a02003 }, + { RESOURCE_ID_CAPP, CAPP_IDX_VENICE_DD20, 0x80a02004 }, + { RESOURCE_ID_CAPP, CAPP_IDX_NAPLES_DD10, 0x80a02005 }, + { RESOURCE_ID_CAPP, CAPP_IDX_NIMBUS_DD10, 0x80a02006 }, + { RESOURCE_ID_CAPP, CAPP_IDX_NIMBUS_DD20, 0x80a02007 }, + { RESOURCE_ID_CAPP, CAPP_IDX_NIMBUS_DD21, 0x80a02007 }, + { RESOURCE_ID_CAPP, CAPP_IDX_NIMBUS_DD22, 0x80a02007 }, + { RESOURCE_ID_CAPP, CAPP_IDX_NIMBUS_DD23, 0x80a02007 }, + { RESOURCE_ID_IMA_CATALOG,IMA_CATALOG_P10_DD1, 0x80f00103 }, + { RESOURCE_ID_IMA_CATALOG,IMA_CATALOG_P10_DD2, 0x80f00103 }, +}; + +static void fsp_start_fetching_next_lid(void); +static void fsp_fetch_lid_next_chunk(struct fsp_fetch_lid_item *last); + +static void fsp_fetch_lid_complete(struct fsp_msg *msg) +{ + struct fsp_fetch_lid_item *last; + uint32_t woffset, wlen; + uint8_t rc; + + lock(&fsp_fetch_lock); + last = list_top(&fsp_fetch_lid_queue, struct fsp_fetch_lid_item, link); + fsp_tce_unmap(PSI_DMA_FETCH, last->bsize); + + woffset = fsp_msg_get_data_word(msg->resp, 1); + wlen = fsp_msg_get_data_word(msg->resp, 2); + rc = (msg->resp->word1 >> 8) & 0xff; + + /* Fall back to a PHYP LID for kernel loads */ + if (rc && last->lid_no == KERNEL_LID_OPAL) { + const char *ltype = dt_prop_get_def(dt_root, "lid-type", NULL); + if (!ltype || strcmp(ltype, "opal")) { + prerror("Failed to load in OPAL mode...\n"); + last->result = OPAL_PARAMETER; + last = list_pop(&fsp_fetch_lid_queue, + struct fsp_fetch_lid_item, link); + list_add_tail(&fsp_fetched_lid, &last->link); + fsp_start_fetching_next_lid(); + unlock(&fsp_fetch_lock); + return; + } + printf("Trying to load as PHYP LID...\n"); + last->lid = KERNEL_LID_PHYP; + /* Retry with different LID */ + fsp_fetch_lid_next_chunk(last); + } + + if (rc !=0 && rc != 2) { + 
last->result = -EIO; + last = list_pop(&fsp_fetch_lid_queue, struct fsp_fetch_lid_item, link); + prerror("FSP LID %08x load ERROR %d\n", last->lid_no, rc); + list_add_tail(&fsp_fetched_lid, &last->link); + fsp_start_fetching_next_lid(); + unlock(&fsp_fetch_lock); + return; + } + + /* + * As per documentation, rc=2 means end of file not reached and + * rc=1 means we reached end of file. But it looks like we always + * get rc=0 irrespective of whether end of file is reached or not. + * The old implementation (fsp_sync_msg) used to rely on + * (wlen < chunk) to decide whether we reached end of file. + * + * Ideally FSP folks should be fix their code as per documentation. + * but until they do, adding the old check (hack) here again. + * + * Without this hack some systems would load partial lid and won't + * be able to boot into petitboot kernel. + */ + if (rc == 0 && (wlen < last->chunk_requested)) + last->result = OPAL_SUCCESS; + + fsp_freemsg(msg); + + last->remaining -= wlen; + *(last->length) += wlen; + last->buffer += wlen; + last->offset += wlen; + + prlog(PR_DEBUG, "FSP: LID %x Chunk read -> rc=0x%02x off: %08x" + " twritten: %08x\n", last->lid, rc, woffset, wlen); + + fsp_fetch_lid_next_chunk(last); + + unlock(&fsp_fetch_lock); +} + +static void fsp_fetch_lid_next_chunk(struct fsp_fetch_lid_item *last) +{ + uint64_t baddr; + uint64_t balign, boff; + uint32_t chunk; + uint32_t taddr; + struct fsp_msg *msg; + uint8_t flags = 0; + uint16_t id = FSP_DATASET_NONSP_LID; + uint32_t sub_id; + + assert(lock_held_by_me(&fsp_fetch_lock)); + + if (last->remaining == 0 || last->result == OPAL_SUCCESS) { + last->result = OPAL_SUCCESS; + last = list_pop(&fsp_fetch_lid_queue, + struct fsp_fetch_lid_item, link); + list_add_tail(&fsp_fetched_lid, &last->link); + fsp_start_fetching_next_lid(); + return; + } + + baddr = (uint64_t)last->buffer; + balign = baddr & ~TCE_MASK; + boff = baddr & TCE_MASK; + + chunk = last->remaining; + if (chunk > (PSI_DMA_FETCH_SIZE - boff)) + chunk = PSI_DMA_FETCH_SIZE - boff; + last->bsize = ((boff + chunk) + TCE_MASK) & ~TCE_MASK; + last->chunk_requested = chunk; + + prlog(PR_DEBUG, "FSP: LID %08x chunk 0x%08x bytes balign=%llx" + " boff=%llx bsize=%llx\n", + last->lid_no, chunk, balign, boff, last->bsize); + + fsp_tce_map(PSI_DMA_FETCH, (void *)balign, last->bsize); + taddr = PSI_DMA_FETCH + boff; + + sub_id = last->lid; + + msg = fsp_mkmsg(FSP_CMD_FETCH_SP_DATA, 6, + flags << 16 | id, sub_id, last->offset, + 0, taddr, chunk); + + if (fsp_queue_msg(msg, fsp_fetch_lid_complete)) { + fsp_freemsg(msg); + prerror("FSP: Failed to queue fetch data message\n"); + last->result = OPAL_INTERNAL_ERROR; + last = list_pop(&fsp_fetch_lid_queue, + struct fsp_fetch_lid_item, link); + list_add_tail(&fsp_fetched_lid, &last->link); + } + last->result = OPAL_BUSY; +} + +static void fsp_start_fetching_next_lid(void) +{ + struct fsp_fetch_lid_item *last; + + assert(lock_held_by_me(&fsp_fetch_lock)); + + last = list_top(&fsp_fetch_lid_queue, struct fsp_fetch_lid_item, link); + + if (last == NULL) + return; + + /* If we're not already fetching */ + if (last->result == OPAL_EMPTY) + fsp_fetch_lid_next_chunk(last); +} + +int fsp_start_preload_resource(enum resource_id id, uint32_t idx, + void *buf, size_t *size) +{ + struct fsp_fetch_lid_item *resource; + uint32_t lid_no = 0; + int i; + + resource = malloc(sizeof(struct fsp_fetch_lid_item)); + assert(resource != NULL); + + resource->id = id; + resource->idx = idx; + + resource->offset = 0; + resource->buffer = buf; + resource->remaining = *size; + 
*size = 0; + resource->length = size; + resource->result = OPAL_EMPTY; + + for (i = 0; i < ARRAY_SIZE(fsp_lid_map); i++) { + if (id != fsp_lid_map[i].id) + continue; + + if (fsp_lid_map[i].idx == idx) { + lid_no = fsp_lid_map[i].lid_no; + break; + } + } + if (lid_no == 0) + return OPAL_PARAMETER; + + printf("Trying to load OPAL LID %08x...\n", lid_no); + resource->lid_no = lid_no; + resource->lid = fsp_adjust_lid_side(lid_no); + + lock(&fsp_fetch_lock); + list_add_tail(&fsp_fetch_lid_queue, &resource->link); + fsp_start_fetching_next_lid(); + unlock(&fsp_fetch_lock); + + return OPAL_SUCCESS; +} + +int fsp_resource_loaded(enum resource_id id, uint32_t idx) +{ + struct fsp_fetch_lid_item *resource = NULL; + struct fsp_fetch_lid_item *r; + int rc = OPAL_BUSY; + + lock(&fsp_fetch_lock); + list_for_each(&fsp_fetched_lid, r, link) { + if (r->id == id && r->idx == idx) { + resource = r; + break; + } + } + + if (resource) { + rc = resource->result; + list_del(&resource->link); + free(resource); + } + unlock(&fsp_fetch_lock); + + return rc; +} + +static int fsp_lid_loaded(uint32_t lid_no) +{ + struct fsp_fetch_lid_item *resource = NULL; + struct fsp_fetch_lid_item *r; + int rc = OPAL_BUSY; + + lock(&fsp_fetch_lock); + list_for_each(&fsp_fetched_lid, r, link) { + if (r->lid_no == lid_no) { + resource = r; + break; + } + } + + if (resource) { + rc = resource->result; + if (rc == OPAL_SUCCESS) { + list_del(&resource->link); + free(resource); + } + } + unlock(&fsp_fetch_lock); + + return rc; +} + +int fsp_preload_lid(uint32_t lid_no, char *buf, size_t *size) +{ + struct fsp_fetch_lid_item *resource; + int r = OPAL_SUCCESS; + + resource = malloc(sizeof(struct fsp_fetch_lid_item)); + assert(resource != NULL); + + resource->id = -1; + resource->idx = -1; + + resource->offset = 0; + resource->buffer = buf; + resource->remaining = *size; + *size = 0; + resource->length = size; + resource->result = OPAL_EMPTY; + + if (lid_no == 0) + return OPAL_PARAMETER; + + printf("Trying to load LID %08x from FSP\n", lid_no); + resource->lid_no = lid_no; + resource->lid = fsp_adjust_lid_side(lid_no); + + lock(&fsp_fetch_lock); + list_add_tail(&fsp_fetch_lid_queue, &resource->link); + fsp_start_fetching_next_lid(); + unlock(&fsp_fetch_lock); + + return r; +} + +int fsp_wait_lid_loaded(uint32_t lid_no) +{ + int r; + int waited = 0; + + r = fsp_lid_loaded(lid_no); + + while(r == OPAL_BUSY) { + opal_run_pollers(); + time_wait_nopoll(msecs_to_tb(5)); + waited+=5; + cpu_relax(); + r = fsp_lid_loaded(lid_no); + } + + prlog(PR_DEBUG, "FSP: fsp_wait_lid_loaded %x %u ms\n", lid_no, waited); + + return r; +} + +void fsp_used_by_console(void) +{ + fsp_lock.in_con_path = true; + + /* + * Some other processor might hold it without having + * disabled the console locally so let's make sure that + * is over by taking/releasing the lock ourselves + */ + lock(&fsp_lock); + unlock(&fsp_lock); +} diff --git a/roms/skiboot/hw/homer.c b/roms/skiboot/hw/homer.c new file mode 100644 index 000000000..3ff6ed1ae --- /dev/null +++ b/roms/skiboot/hw/homer.c @@ -0,0 +1,252 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* Copyright 2013-2019 IBM Corp. 
*/ + +#include <skiboot.h> +#include <xscom.h> +#include <io.h> +#include <cpu.h> +#include <chip.h> +#include <mem_region.h> +#include <hostservices.h> + +#define P8_PBA_BAR0 0x2013f00 +#define P8_PBA_BARMASK0 0x2013f04 + +#define P9_PBA_BAR0 0x5012B00 +#define P9_PBA_BARMASK0 0x5012B04 + +#define P10_PBA_BAR0 0x01010CDA +#define P10_PBA_BARMASK0 0x01010CDE + +#define PBA_MASK_ALL_BITS 0x000001FFFFF00000ULL /* Bits 23:43 */ + +enum P8_BAR { + P8_BAR_HOMER = 0, + P8_BAR_CENTAUR = 1, + P8_BAR_SLW = 2, + P8_BAR_OCC_COMMON = 3, +}; + +enum P9_BAR { + P9_BAR_HOMER = 0, + P9_BAR_CENTAUR = 1, + P9_BAR_OCC_COMMON = 2, + P9_BAR_SBE = 3, +}; + +enum P10_BAR { + P10_BAR_HOMER = 0, + P10_BAR_OCMB_THERMAL = 1, + P10_BAR_OCC_COMMON = 2, + P10_BAR_SBE = 3, +}; + +static u64 pba_bar0, pba_barmask0; +static u8 bar_homer, bar_slw, bar_occ_common; + +static bool read_pba_bar(struct proc_chip *chip, unsigned int bar_no, + uint64_t *base, uint64_t *size) +{ + uint64_t bar, mask; + int rc; + + rc = xscom_read(chip->id, pba_bar0 + bar_no, &bar); + if (rc) { + prerror("SLW: Error %d reading PBA BAR%d on chip %d\n", + rc, bar_no, chip->id); + return false; + } + rc = xscom_read(chip->id, pba_barmask0 + bar_no, &mask); + if (rc) { + prerror("SLW: Error %d reading PBA BAR MASK%d on chip %d\n", + rc, bar_no, chip->id); + return false; + } + prlog(PR_DEBUG, " PBA BAR%d : 0x%016llx\n", bar_no, bar); + prlog(PR_DEBUG, " PBA MASK%d: 0x%016llx\n", bar_no, mask); + + if (mask == PBA_MASK_ALL_BITS) { + /* + * This could happen if all HOMER users are not enabled during + * early system bringup. Skip using the PBA BAR. + */ + mask = 0; + bar = 0; + prerror(" PBA MASK%d uninitalized skipping BAR\n", bar_no); + } + + *base = bar & 0x0ffffffffffffffful; + *size = (mask | 0xfffff) + 1; + + return (*base) != 0; +} + +static void homer_init_chip(struct proc_chip *chip) +{ + uint64_t hbase = 0, hsize = 0; + uint64_t sbase, ssize, obase, osize; + + /* + * PBA BARs assigned by HB: + * + * P8: + * 0 : Entire HOMER + * 1 : OCC to Centaur path (we don't care) + * 2 : SLW image + * 3 : OCC Common area + * + * We need to reserve the memory covered by BAR 0 and BAR 3, however + * on earlier HBs, BAR0 isn't set so we need BAR 2 instead in that + * case to cover SLW (OCC not running). + * + * P9: + * 0 : Entire HOMER + * 1 : OCC to Centaur path (Cumulus only) + * 2 : OCC Common area + * 3 : SBE communication + * + */ + if (read_pba_bar(chip, bar_homer, &hbase, &hsize)) { + prlog(PR_DEBUG, " HOMER Image at 0x%llx size %lldMB\n", + hbase, hsize / 0x100000); + + if (!mem_range_is_reserved(hbase, hsize)) { + prlog(PR_WARNING, + "HOMER image is not reserved! Reserving\n"); + mem_reserve_fw("ibm,homer-image", hbase, hsize); + } + + chip->homer_base = hbase; + chip->homer_size = hsize; + } + + /* + * We always read the SLW BAR since we need to grab info about the + * SLW image in the struct proc_chip for use by the slw.c code + */ + if (proc_gen == proc_gen_p8 && + read_pba_bar(chip, bar_slw, &sbase, &ssize)) { + prlog(PR_DEBUG, " SLW Image at 0x%llx size %lldMB\n", + sbase, ssize / 0x100000); + + /* + * Only reserve it if we have no homer image or if it + * doesn't fit in it (only check the base). + */ + if ((sbase < hbase || sbase > (hbase + hsize) || + (hbase == 0 && sbase > 0)) && + !mem_range_is_reserved(sbase, ssize)) { + prlog(PR_WARNING, + "SLW image is not reserved! 
Reserving\n"); + mem_reserve_fw("ibm,slw-image", sbase, ssize); + } + + chip->slw_base = sbase; + chip->slw_bar_size = ssize; + chip->slw_image_size = ssize; /* will be adjusted later */ + } + + if (read_pba_bar(chip, bar_occ_common, &obase, &osize)) { + prlog(PR_DEBUG, " OCC Common Area at 0x%llx size %lldMB\n", + obase, osize / 0x100000); + chip->occ_common_base = obase; + chip->occ_common_size = osize; + } +} + + +static void host_services_occ_base_setup(void) +{ + struct proc_chip *chip; + uint64_t occ_common; + + chip = next_chip(NULL); /* Frist chip */ + occ_common = (uint64_t) local_alloc(chip->id, OCC_COMMON_SIZE, OCC_COMMON_SIZE); + + for_each_chip(chip) { + chip->occ_common_base = occ_common; + chip->occ_common_size = OCC_COMMON_SIZE; + + chip->homer_base = (uint64_t) local_alloc(chip->id, HOMER_IMAGE_SIZE, + HOMER_IMAGE_SIZE); + chip->homer_size = HOMER_IMAGE_SIZE; + memset((void *)chip->homer_base, 0, chip->homer_size); + + prlog(PR_DEBUG, "HBRT: Chip %d HOMER base %016llx : %08llx\n", + chip->id, chip->homer_base, chip->homer_size); + prlog(PR_DEBUG, "HBRT: OCC common base %016llx : %08llx\n", + chip->occ_common_base, chip->occ_common_size); + } +} + +void homer_init(void) +{ + struct proc_chip *chip; + + if (chip_quirk(QUIRK_NO_PBA)) + return; + + switch (proc_gen) { + case proc_gen_p8: + pba_bar0 = P8_PBA_BAR0; + pba_barmask0 = P8_PBA_BARMASK0; + bar_homer = P8_BAR_HOMER; + bar_slw = P8_BAR_SLW; + bar_occ_common = P8_BAR_OCC_COMMON; + break; + case proc_gen_p9: + pba_bar0 = P9_PBA_BAR0; + pba_barmask0 = P9_PBA_BARMASK0; + bar_homer = P9_BAR_HOMER; + bar_occ_common = P9_BAR_OCC_COMMON; + break; + case proc_gen_p10: + pba_bar0 = P10_PBA_BAR0; + pba_barmask0 = P10_PBA_BARMASK0; + bar_homer = P10_BAR_HOMER; + bar_occ_common = P10_BAR_OCC_COMMON; + break; + default: + return; + }; + + /* + * XXX This is temporary, on P8 we look for any configured + * SLW/OCC BAR and reserve the memory. Eventually, this will be + * done via HostBoot using the device-tree "reserved-ranges" + * or we'll load the SLW & OCC images ourselves using Host Services. + */ + for_each_chip(chip) { + prlog(PR_DEBUG, "HOMER: Init chip %d\n", chip->id); + homer_init_chip(chip); + } + + /* + * Check is PBA BARs are already loaded with HOMER and + * skip host services. + */ + + chip = next_chip(NULL); + /* Both HOMER images and OCC areas are setup */ + if (chip->homer_base && chip->occ_common_base) { + /* Reserve OCC common area from BAR */ + if (!mem_range_is_reserved(chip->occ_common_base, + chip->occ_common_size)) { + prlog(PR_WARNING, + "OCC common area is not reserved! Reserving\n"); + mem_reserve_fw("ibm,occ-common-area", + chip->occ_common_base, + chip->occ_common_size); + } + } else if (chip->homer_base) { + /* + * HOMER is setup but not OCC!! Do not allocate HOMER + * regions. This case is possible during early system + * bringup where OCC images are not yet operational. + */ + } else { + /* Allocate memory for HOMER and OCC common area */ + host_services_occ_base_setup(); + } +} + diff --git a/roms/skiboot/hw/imc.c b/roms/skiboot/hw/imc.c new file mode 100644 index 000000000..cbd68edc4 --- /dev/null +++ b/roms/skiboot/hw/imc.c @@ -0,0 +1,1075 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* + * In-Memory Counters (IMC) + * Sometimes called IMA, but that's also a different thing. + * + * Copyright 2016-2019 IBM Corp. 
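+ *
+ * The catalog handling in this file is split across boot; the sequence
+ * suggested by the code below (the actual call sites live elsewhere) is
+ * roughly:
+ *
+ *   imc_catalog_preload();      // kick off async PNOR read of the catalog
+ *   imc_decompress_catalog();   // wait for the read, start xz decompression
+ *   imc_init();                 // wait for decompression, then build and
+ *                               // attach the "imc-counters" subtree
+ *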
+ */ + +#define pr_fmt(fmt) "IMC: " fmt +#include <skiboot.h> +#include <xscom.h> +#include <imc.h> +#include <chip.h> +#include <libxz/xz.h> +#include <device.h> +#include <p9_stop_api.H> + +/* + * IMC trace scom values + */ +#define IMC_TRACE_CPMC1 0 /* select cpmc1 */ +#define IMC_TRACE_CPMC2 1 /* select cpmc2 */ +#define IMC_TRACE_CPMCLOAD_VAL 0xfa /* + * Value to be loaded into cpmc2 + * at sampling start + */ + +/* Event: CPM_32MHZ_CYC */ +#define IMC_TRACE_CPMC2SEL_VAL 2 +#define IMC_TRACE_CPMC1SEL_VAL 4 + +#define IMC_TRACE_BUFF_SIZE 0 /* + * b’000’- 4K entries * 64 per + * entry = 256K buffersize + */ +static uint64_t TRACE_IMC_ADDR; +static uint64_t CORE_IMC_EVENT_MASK_ADDR; +static uint64_t trace_scom_val; +/* + * Initialise these with the pdbar and htm scom port address array + * at run time, based on the processor version. + */ +static unsigned int *pdbar_scom_index; +static unsigned int *htm_scom_index; + +/* + * Nest IMC PMU names along with their bit values as represented in the + * imc_chip_avl_vector(in struct imc_chip_cb, look at include/imc.h). + * nest_pmus[] is an array containing all the possible nest IMC PMU node names. + */ +static char const *nest_pmus[] = { + "powerbus0", + "mcs0", + "mcs1", + "mcs2", + "mcs3", + "mcs4", + "mcs5", + "mcs6", + "mcs7", + "mba0", + "mba1", + "mba2", + "mba3", + "mba4", + "mba5", + "mba6", + "mba7", + "cen0", + "cen1", + "cen2", + "cen3", + "cen4", + "cen5", + "cen6", + "cen7", + "xlink0", + "xlink1", + "xlink2", + "mcd0", + "mcd1", + "phb0", + "phb1", + "phb2", + "phb3", + "phb4", + "phb5", + "nx", + "capp0", + "capp1", + "vas", + "int", + "alink0", + "alink1", + "alink2", + "alink3", + "nvlink0", + "nvlink1", + "nvlink2", + "nvlink3", + "nvlink4", + "nvlink5", + /* reserved bits : 51 - 63 */ +}; + +/* + * Due to Nest HW/OCC restriction, microcode will not support individual unit + * events for these nest units mcs0, mcs1 ... mcs7 in the accumulation mode. + * And events to monitor each mcs units individually will be supported only + * in the debug mode (which will be supported by microcode in the future). + * These will be advertised only when OPAL provides interface for the it. + */ +static char const *debug_mode_units[] = { + "mcs0", + "mcs1", + "mcs2", + "mcs3", + "mcs4", + "mcs5", + "mcs6", + "mcs7", +}; + +/* + * Combined unit node events are counted when any of the individual + * unit is enabled in the availability vector. That is, + * ex, mcs01 unit node should be enabled only when mcs0 or mcs1 enabled. + * mcs23 unit node should be enabled only when mcs2 or mcs3 is enabled + */ +static struct combined_units_node cu_node[] = { + { .name = "mcs01", .unit1 = PPC_BIT(1), .unit2 = PPC_BIT(2) }, + { .name = "mcs23", .unit1 = PPC_BIT(3), .unit2 = PPC_BIT(4) }, + { .name = "mcs45", .unit1 = PPC_BIT(5), .unit2 = PPC_BIT(6) }, + { .name = "mcs67", .unit1 = PPC_BIT(7), .unit2 = PPC_BIT(8) }, +}; + +static char *compress_buf; +static size_t compress_buf_size; +const char **prop_to_fix(struct dt_node *node); +static const char *props_to_fix[] = {"events", NULL}; + +static bool is_nest_mem_initialized(struct imc_chip_cb *ptr) +{ + /* + * Non zero value in "Status" field indicate memory initialized. + */ + if (!ptr->imc_chip_run_status) + return false; + + return true; +} + +/* + * A Quad contains 4 cores in Power 9, and there are 4 addresses for + * the Core Hardware Trace Macro (CHTM) attached to each core. 
+ * So, for core index 0 to core index 3, we have a sequential range of + * SCOM port addresses in the arrays below, each for Hardware Trace Macro (HTM) + * mode and PDBAR. + */ +static unsigned int pdbar_scom_index_p9[] = { + 0x1001220B, + 0x1001230B, + 0x1001260B, + 0x1001270B +}; +static unsigned int htm_scom_index_p9[] = { + 0x10012200, + 0x10012300, + 0x10012600, + 0x10012700 +}; + +static unsigned int pdbar_scom_index_p10[] = { + 0x2001868B, + 0x2001468B, + 0x2001268B, + 0x2001168B +}; + +static unsigned int htm_scom_index_p10[] = { + 0x20018680, + 0x20014680, + 0x20012680, + 0x20011680 +}; + +static struct imc_chip_cb *get_imc_cb(uint32_t chip_id) +{ + struct proc_chip *chip = get_chip(chip_id); + struct imc_chip_cb *cb; + + if (!chip->homer_base) + return NULL; /* The No Homers Club */ + + cb = (struct imc_chip_cb *)(chip->homer_base + P9_CB_STRUCT_OFFSET); + if (!is_nest_mem_initialized(cb)) + return NULL; + + return cb; +} + +static int pause_microcode_at_boot(void) +{ + struct proc_chip *chip; + struct imc_chip_cb *cb; + + for_each_chip(chip) { + cb = get_imc_cb(chip->id); + if (cb) + cb->imc_chip_command = cpu_to_be64(NEST_IMC_DISABLE); + else + return -1; /* ucode is not init-ed */ + } + + return 0; +} + +/* + * Function return list of properties names for the fixup + */ +const char **prop_to_fix(struct dt_node *node) +{ + if (dt_node_is_compatible(node, "ibm,imc-counters")) + return props_to_fix; + + return NULL; +} + +/* Helper to get the IMC device type for a device node */ +static int get_imc_device_type(struct dt_node *node) +{ + const struct dt_property *type; + u32 val=0; + + if (!node) + return -1; + + type = dt_find_property(node, "type"); + if (!type) + return -1; + + val = dt_prop_get_u32(node, "type"); + switch (val){ + case IMC_COUNTER_CHIP: + return IMC_COUNTER_CHIP; + case IMC_COUNTER_CORE: + return IMC_COUNTER_CORE; + case IMC_COUNTER_THREAD: + return IMC_COUNTER_THREAD; + case IMC_COUNTER_TRACE: + return IMC_COUNTER_TRACE; + default: + break; + } + + /* Unknown/Unsupported IMC device type */ + return -1; +} + +static bool is_nest_node(struct dt_node *node) +{ + if (get_imc_device_type(node) == IMC_COUNTER_CHIP) + return true; + + return false; +} + +static bool is_imc_device_type_supported(struct dt_node *node) +{ + u32 val = get_imc_device_type(node); + struct proc_chip *chip = get_chip(this_cpu()->chip_id); + uint64_t pvr; + + if ((val == IMC_COUNTER_CHIP) || (val == IMC_COUNTER_CORE) || + (val == IMC_COUNTER_THREAD)) + return true; + + if (val == IMC_COUNTER_TRACE) { + pvr = mfspr(SPR_PVR); + + switch (chip->type) { + case PROC_CHIP_P9_NIMBUS: + /* + * Trace mode is supported in Nimbus DD2.2 + * and later versions. + */ + if ((PVR_VERS_MAJ(pvr) == 2) && + (PVR_VERS_MIN(pvr) >= 2)) + return true; + break; + case PROC_CHIP_P10: + return true; + default: + return false; + } + + } + return false; +} + +/* + * Helper to check for the imc device type in the incoming device tree. + * Remove unsupported device node. + */ +static void check_imc_device_type(struct dt_node *dev) +{ + struct dt_node *node; + + dt_for_each_compatible(dev, node, "ibm,imc-counters") { + if (!is_imc_device_type_supported(node)) { + /* + * ah nice, found a device type which I didnt know. + * Remove it and also mark node as NULL, since dt_next + * will try to fetch info for "prev" which is removed + * by dt_free. 
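+ *
+ * (That is: dt_for_each_compatible() presumably resolves each step via
+ * dt_find_compatible_node(dev, prev, ...), so resetting "node" to NULL
+ * after dt_free() makes the next lookup restart from the top of the
+ * subtree instead of being handed a freed node.)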
+ */ + dt_free(node); + node = NULL; + } + } + + return; +} + +static void imc_dt_exports_prop_add(struct dt_node *dev) +{ + struct dt_node *node; + struct proc_chip *chip; + const struct dt_property *type; + uint32_t offset = 0, size = 0; + uint64_t baddr; + char namebuf[32]; + + + dt_for_each_compatible(dev, node, "ibm,imc-counters") { + type = dt_find_property(node, "type"); + if (type && is_nest_node(node)) { + offset = dt_prop_get_u32(node, "offset"); + size = dt_prop_get_u32(node, "size"); + } + } + + /* + * Enable only if we have valid values. + */ + if (!size && !offset) + return; + + node = dt_find_by_name(opal_node, "exports"); + if (!node) + return; + + for_each_chip(chip) { + snprintf(namebuf, sizeof(namebuf), "imc_nest_chip_%x", chip->id); + baddr = chip->homer_base; + baddr += offset; + dt_add_property_u64s(node, namebuf, baddr, size); + } +} + +/* + * Remove the PMU device nodes from the incoming new subtree, if they are not + * available in the hardware. The availability is described by the + * control block's imc_chip_avl_vector. + * Each bit represents a device unit. If the device is available, then + * the bit is set else its unset. + */ +static void disable_unavailable_units(struct dt_node *dev) +{ + uint64_t avl_vec; + struct imc_chip_cb *cb; + struct dt_node *target; + int i; + bool disable_all_nests = false; + struct proc_chip *chip; + + /* + * Check the state of ucode in all the chip. + * Disable the nest unit if ucode is not initialized + * in any of the chip. + */ + for_each_chip(chip) { + cb = get_imc_cb(chip->id); + if (!cb) { + /* + * At least currently, if one chip isn't functioning, + * none of the IMC Nest units will be functional. + * So while you may *think* this should be per chip, + * it isn't. + */ + disable_all_nests = true; + break; + } + } + + /* Add a property to "exports" node in opal_node */ + imc_dt_exports_prop_add(dev); + + /* Fetch the IMC control block structure */ + cb = get_imc_cb(this_cpu()->chip_id); + if (cb && !disable_all_nests) + avl_vec = be64_to_cpu(cb->imc_chip_avl_vector); + else { + avl_vec = 0; /* Remove only nest imc device nodes */ + + /* Incase of mambo, just fake it */ + if (proc_chip_quirks & QUIRK_MAMBO_CALLOUTS) + avl_vec = (0xffULL) << 56; + } + + for (i = 0; i < ARRAY_SIZE(nest_pmus); i++) { + if (!(PPC_BITMASK(i, i) & avl_vec)) { + /* Check if the device node exists */ + target = dt_find_by_name(dev, nest_pmus[i]); + if (!target) + continue; + /* Remove the device node */ + dt_free(target); + } + } + + /* + * Loop to detect debug mode units and remove them + * since the microcode does not support debug mode function yet. + */ + for (i = 0; i < ARRAY_SIZE(debug_mode_units); i++) { + target = dt_find_by_name(dev, debug_mode_units[i]); + if (!target) + continue; + /* Remove the device node */ + dt_free(target); + } + + /* + * Based on availability unit vector from control block, + * check and enable combined unit nodes in the device tree. 
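+ *
+ * (Worked example, using the bit layout implied by cu_node[] above: if
+ * avl_vec has PPC_BIT(1) set (mcs0 present) but PPC_BIT(2) clear (mcs1
+ * absent), the nest_pmus[] loop above has already removed the "mcs1"
+ * node, while the loop below keeps "mcs01" because at least one of its
+ * two units is available; "mcs23" is removed only when both PPC_BIT(3)
+ * and PPC_BIT(4) are clear.)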
+ */ + for (i = 0; i < MAX_NEST_COMBINED_UNITS ; i++ ) { + if (!(cu_node[i].unit1 & avl_vec) && + !(cu_node[i].unit2 & avl_vec)) { + target = dt_find_by_name(dev, cu_node[i].name); + if (!target) + continue; + + /* Remove the device node */ + dt_free(target); + } + } + + return; +} + +static void disable_imc_type_from_dt(struct dt_node *dev, int imc_type) +{ + struct dt_node *node; + + dt_for_each_compatible(dev, node, "ibm,imc-counters") { + if (get_imc_device_type(node) == imc_type) { + dt_free(node); + node = NULL; + } + } + + return; +} + +/* + * Function to queue the loading of imc catalog data + * from the IMC pnor partition. + */ +void imc_catalog_preload(void) +{ + uint32_t pvr = (mfspr(SPR_PVR) & ~(0xf0ff)); + int ret = OPAL_SUCCESS; + compress_buf_size = MAX_COMPRESSED_IMC_DTB_SIZE; + + if (proc_chip_quirks & QUIRK_MAMBO_CALLOUTS) + return; + + /* Enable only for power 9/10 */ + if (proc_gen < proc_gen_p9) + return; + + compress_buf = malloc(MAX_COMPRESSED_IMC_DTB_SIZE); + if (!compress_buf) { + prerror("Memory allocation for catalog failed\n"); + return; + } + + ret = start_preload_resource(RESOURCE_ID_IMA_CATALOG, + pvr, compress_buf, &compress_buf_size); + if (ret != OPAL_SUCCESS) { + prerror("Failed to load IMA_CATALOG: %d\n", ret); + free(compress_buf); + compress_buf = NULL; + } + + return; +} + +static void imc_dt_update_nest_node(struct dt_node *dev) +{ + struct proc_chip *chip; + __be64 *base_addr = NULL; + __be32 *chipids = NULL; + int i=0, nr_chip = nr_chips(); + struct dt_node *node; + const struct dt_property *type; + + /* Add the base_addr and chip-id properties for the nest node */ + base_addr = malloc(sizeof(u64) * nr_chip); + chipids = malloc(sizeof(u32) * nr_chip); + for_each_chip(chip) { + base_addr[i] = cpu_to_be64(chip->homer_base); + chipids[i] = cpu_to_be32(chip->id); + i++; + } + + dt_for_each_compatible(dev, node, "ibm,imc-counters") { + type = dt_find_property(node, "type"); + if (type && is_nest_node(node)) { + dt_add_property(node, "base-addr", base_addr, (i * sizeof(u64))); + dt_add_property(node, "chip-id", chipids, (i * sizeof(u32))); + } + } +} + +static struct xz_decompress *imc_xz; + +void imc_decompress_catalog(void) +{ + void *decompress_buf = NULL; + uint32_t pvr = (mfspr(SPR_PVR) & ~(0xf0ff)); + int ret; + + /* Check we succeeded in starting the preload */ + if (compress_buf == NULL) + return; + + ret = wait_for_resource_loaded(RESOURCE_ID_IMA_CATALOG, pvr); + if (ret != OPAL_SUCCESS) { + prerror("IMC Catalog load failed\n"); + return; + } + + /* + * Memory for decompression. 
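+ *
+ * (Note: the pvr value computed above is the index of the catalog
+ * sub-partition and must match the one used by imc_catalog_preload(),
+ * which applies the same ~0xf0ff mask to drop the revision-specific
+ * nibbles. For example:
+ *
+ *   0x004e1202 & ~0xf0ff == 0x004e0200   // IMA_CATALOG_NIMBUS, as listed
+ *                                        // in the fsp.c LID map earlier
+ *                                        // in this patch
+ *
+ * The specific PVR value here is illustrative.)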
+ */ + decompress_buf = malloc(MAX_DECOMPRESSED_IMC_DTB_SIZE); + if (!decompress_buf) { + prerror("No memory for decompress_buf \n"); + return; + } + + /* + * Decompress the compressed buffer + */ + imc_xz = malloc(sizeof(struct xz_decompress)); + if (!imc_xz) { + prerror("No memory to decompress IMC catalog\n"); + free(decompress_buf); + return; + } + + imc_xz->dst = decompress_buf; + imc_xz->src = compress_buf; + imc_xz->dst_size = MAX_DECOMPRESSED_IMC_DTB_SIZE; + imc_xz->src_size = compress_buf_size; + xz_start_decompress(imc_xz); +} + +static int setup_imc_scoms(void) +{ + switch (proc_gen) { + case proc_gen_p9: + CORE_IMC_EVENT_MASK_ADDR = CORE_IMC_EVENT_MASK_ADDR_P9; + TRACE_IMC_ADDR = TRACE_IMC_ADDR_P9; + pdbar_scom_index = pdbar_scom_index_p9; + htm_scom_index = htm_scom_index_p9; + trace_scom_val = TRACE_IMC_SCOM(IMC_TRACE_CPMC2, + IMC_TRACE_CPMCLOAD_VAL, + IMC_TRACE_CPMC1SEL_VAL, + IMC_TRACE_CPMC2SEL_VAL, + IMC_TRACE_BUFF_SIZE); + return 0; + case proc_gen_p10: + CORE_IMC_EVENT_MASK_ADDR = CORE_IMC_EVENT_MASK_ADDR_P10; + TRACE_IMC_ADDR = TRACE_IMC_ADDR_P10; + pdbar_scom_index = pdbar_scom_index_p10; + htm_scom_index = htm_scom_index_p10; + trace_scom_val = TRACE_IMC_SCOM(IMC_TRACE_CPMC1, + IMC_TRACE_CPMCLOAD_VAL, + IMC_TRACE_CPMC1SEL_VAL, + IMC_TRACE_CPMC2SEL_VAL, + IMC_TRACE_BUFF_SIZE); + return 0; + default: + prerror("%s: Unknown cpu type\n", __func__); + break; + } + return -1; +} + +/* + * Load the IMC pnor partition and find the appropriate sub-partition + * based on the platform's PVR. + * Decompress the sub-partition and link the imc device tree to the + * existing device tree. + */ +void imc_init(void) +{ + struct dt_node *dev; + int err_flag = -1; + + if (proc_chip_quirks & QUIRK_MAMBO_CALLOUTS) { + dev = dt_find_compatible_node(dt_root, NULL, + "ibm,opal-in-memory-counters"); + if (!dev) + return; + + goto imc_mambo; + } + + /* Enable only for power 9/10 */ + if (proc_gen < proc_gen_p9) + return; + + if (!imc_xz) + return; + + wait_xz_decompress(imc_xz); + if (imc_xz->status != OPAL_SUCCESS) { + prerror("IMC: xz_decompress failed\n"); + goto err; + } + + /* + * Flow of the data from PNOR to main device tree: + * + * PNOR -> compressed local buffer (compress_buf) + * compressed local buffer -> decompressed local buf (decompress_buf) + * decompress local buffer -> main device tree + * free compressed local buffer + */ + + + /* Create a device tree entry for imc counters */ + dev = dt_new_root("imc-counters"); + if (!dev) { + prerror("IMC: Failed to add an imc-counters root node\n"); + goto err; + } + + /* + * Attach the new decompress_buf to the imc-counters node. + * dt_expand_node() does sanity checks for fdt_header, piggyback + */ + if (dt_expand_node(dev, imc_xz->dst, 0) < 0) { + dt_free(dev); + prerror("IMC: dt_expand_node failed\n"); + goto err; + } + +imc_mambo: + if (setup_imc_scoms()) { + prerror("IMC: Failed to setup the scoms\n"); + goto err; + } + + /* Check and remove unsupported imc device types */ + check_imc_device_type(dev); + + /* + * Check and remove unsupported nest unit nodes by the microcode, + * from the incoming device tree. + */ + disable_unavailable_units(dev); + + /* Fix the phandle in the incoming device tree */ + dt_adjust_subtree_phandle(dev, prop_to_fix); + + /* Update the base_addr and chip-id for nest nodes */ + imc_dt_update_nest_node(dev); + + if (proc_chip_quirks & QUIRK_MAMBO_CALLOUTS) + return; + + /* + * IMC nest counters has both in-band (ucode access) and out of band + * access to it. 
Since not all nest counter configurations are supported + * by ucode, out of band tools are used to characterize other + * configuration. + * + * If the ucode not paused and OS does not have IMC driver support, + * then out to band tools will race with ucode and end up getting + * undesirable values. Hence pause the ucode if it is already running. + */ + if (pause_microcode_at_boot()) { + prerror("IMC: Pausing ucode failed, disabling nest imc\n"); + disable_imc_type_from_dt(dev, IMC_COUNTER_CHIP); + } + + /* + * If the dt_attach_root() fails, "imc-counters" node will not be + * seen in the device-tree and hence OS should not make any + * OPAL_IMC_* calls. + */ + if (!dt_attach_root(dt_root, dev)) { + dt_free(dev); + prerror("IMC: Failed to attach imc-counter node to dt root\n"); + goto err; + } + + err_flag = OPAL_SUCCESS; + +err: + if (err_flag != OPAL_SUCCESS) + prerror("IMC Devices not added\n"); + + free(compress_buf); + free(imc_xz->dst); + free(imc_xz); +} + +static int stop_api_init(struct proc_chip *chip, int phys_core_id, + uint32_t scoms, uint64_t data, + const ScomOperation_t operation, + const ScomSection_t section, + const char *type) +{ + int ret; + + prlog(PR_DEBUG, "Configuring stopapi for IMC\n"); + ret = p9_stop_save_scom((void *)chip->homer_base, scoms, + data, operation, section); + if (ret) { + prerror("IMC %s stopapi ret = %d, scoms = %x (core id = %x)\n",\ + type, ret, scoms, phys_core_id); + if (ret != STOP_SAVE_SCOM_ENTRY_UPDATE_FAILED) + wakeup_engine_state = WAKEUP_ENGINE_FAILED; + else + prerror("SCOM entries are full\n"); + return OPAL_HARDWARE; + } + + return ret; +} + +/* Function to return the scom address for the specified core */ +static uint32_t get_imc_scom_addr_for_core(int core, uint64_t addr) +{ + uint32_t scom_addr; + + switch (proc_gen) { + case proc_gen_p9: + scom_addr = XSCOM_ADDR_P9_EC(core, addr); + return scom_addr; + case proc_gen_p10: + scom_addr = XSCOM_ADDR_P10_EC(core, addr); + return scom_addr; + default: + return 0; + } +} + +/* Function to return the scom address for the specified core in the quad */ +static uint32_t get_imc_scom_addr_for_quad(int core, uint64_t addr) +{ + uint32_t scom_addr; + + switch (proc_gen) { + case proc_gen_p9: + scom_addr = XSCOM_ADDR_P9_EQ(core, addr); + return scom_addr; + case proc_gen_p10: + scom_addr = XSCOM_ADDR_P10_EQ(core, addr); + return scom_addr; + default: + return 0; + } +} + +static int64_t core_imc_counters_init(uint64_t addr, int port_id, + int phys_core_id, struct cpu_thread *c) +{ + uint32_t pdbar_addr, event_mask_addr, htm_addr; + int ret; + + /* Get the scom address for this core, based on the platform */ + pdbar_addr = get_imc_scom_addr_for_quad(phys_core_id, + pdbar_scom_index[port_id]); + event_mask_addr = get_imc_scom_addr_for_core(phys_core_id, + CORE_IMC_EVENT_MASK_ADDR); + + /* + * Core IMC hardware mandate initing of three scoms + * to enbale or disable of the Core IMC engine. + * + * PDBAR: Scom contains the real address to store per-core + * counter data in memory along with other bits. + * + * EventMask: Scom contain bits to denote event to multiplex + * at different MSR[HV PR] values, along with bits for + * sampling duration. + * + * HTM Scom: scom to enable counter data movement to memory. 
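+	 *
+	 * The sequence below boils down to (illustrative; the addresses
+	 * are obtained from get_imc_scom_addr_for_quad()/_core()):
+	 *
+	 *   xscom_write(chip, pdbar_addr,      addr & CORE_IMC_PDBAR_MASK);
+	 *   xscom_write(chip, event_mask_addr, CORE_IMC_EVENT_MASK);
+	 *   xscom_write(chip, htm_addr,        CORE_IMC_HTM_MODE_DISABLE);
+	 *
+	 * When deep stop states are enabled, the PDBAR and EventMask
+	 * values are additionally registered with the stop-api
+	 * (stop_api_init) so the wakeup engine can restore them.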
+ */ + + + if (xscom_write(c->chip_id, pdbar_addr, + (u64)(CORE_IMC_PDBAR_MASK & addr))) { + prerror("error in xscom_write for pdbar\n"); + return OPAL_HARDWARE; + } + + if (has_deep_states) { + if (wakeup_engine_state == WAKEUP_ENGINE_PRESENT) { + struct proc_chip *chip = get_chip(c->chip_id); + + ret = stop_api_init(chip, phys_core_id, pdbar_addr, + (u64)(CORE_IMC_PDBAR_MASK & addr), + P9_STOP_SCOM_REPLACE, + P9_STOP_SECTION_EQ_SCOM, + "pdbar"); + if (ret) + return ret; + ret = stop_api_init(chip, phys_core_id, + event_mask_addr, + (u64)CORE_IMC_EVENT_MASK, + P9_STOP_SCOM_REPLACE, + P9_STOP_SECTION_CORE_SCOM, + "event_mask"); + if (ret) + return ret; + } else { + prerror("IMC: Wakeup engine not present!"); + return OPAL_HARDWARE; + } + } + + if (xscom_write(c->chip_id, event_mask_addr, + (u64)CORE_IMC_EVENT_MASK)) { + prerror("error in xscom_write for event mask\n"); + return OPAL_HARDWARE; + } + + /* Get the scom address for htm_mode scom based on the platform */ + htm_addr = get_imc_scom_addr_for_quad(phys_core_id, + htm_scom_index[port_id]); + if (xscom_write(c->chip_id, htm_addr, + (u64)CORE_IMC_HTM_MODE_DISABLE)) { + prerror("error in xscom_write for htm mode\n"); + return OPAL_HARDWARE; + } + return OPAL_SUCCESS; +} + +/* + * opal_imc_counters_init : This call initialize the IMC engine. + * + * For Nest IMC, this is no-op and returns OPAL_SUCCESS at this point. + * For Core IMC, this initializes core IMC Engine, by initializing + * these scoms "PDBAR", "HTM_MODE" and the "EVENT_MASK" in a given cpu. + */ +static int64_t opal_imc_counters_init(uint32_t type, uint64_t addr, uint64_t cpu_pir) +{ + struct cpu_thread *c = find_cpu_by_pir(cpu_pir); + int port_id, phys_core_id; + int ret; + uint32_t htm_addr, trace_addr; + + switch (type) { + case OPAL_IMC_COUNTERS_NEST: + return OPAL_SUCCESS; + case OPAL_IMC_COUNTERS_CORE: + if (!c) + return OPAL_PARAMETER; + + /* + * Core IMC hardware mandates setting of htm_mode and + * pdbar in specific scom ports. port_id are in + * pdbar_scom_index[] and htm_scom_index[]. + */ + phys_core_id = pir_to_core_id(c->pir); + port_id = phys_core_id % 4; + + if (proc_chip_quirks & QUIRK_MAMBO_CALLOUTS) + return OPAL_SUCCESS; + + ret = core_imc_counters_init(addr, port_id, phys_core_id, c); + if (ret < 0) + return ret; + /* + * If fused core is supported, do the scoms for the + * secondary core also. 
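+		 *
+		 * The secondary of the fused pair is located by flipping
+		 * the low PIR bit (find_cpu_by_pir(cpu_pir ^ 1) below) and
+		 * is given the same PDBAR, EventMask and HTM_MODE setup.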
+ */ + if (this_cpu()->is_fused_core) { + struct cpu_thread *c1 = find_cpu_by_pir(cpu_pir ^ 1); + + phys_core_id = pir_to_core_id(c1->pir); + port_id = phys_core_id % 4; + + ret = core_imc_counters_init(addr, port_id, phys_core_id, c1); + if (ret < 0) + return ret; + } + return ret; + case OPAL_IMC_COUNTERS_TRACE: + if (!c) + return OPAL_PARAMETER; + + phys_core_id = pir_to_core_id(c->pir); + port_id = phys_core_id % 4; + + if (proc_chip_quirks & QUIRK_MAMBO_CALLOUTS) + return OPAL_SUCCESS; + + trace_addr = get_imc_scom_addr_for_core(phys_core_id, + TRACE_IMC_ADDR); + htm_addr = get_imc_scom_addr_for_quad(phys_core_id, + htm_scom_index[port_id]); + + if (has_deep_states) { + if (wakeup_engine_state == WAKEUP_ENGINE_PRESENT) { + struct proc_chip *chip = get_chip(c->chip_id); + + ret = stop_api_init(chip, phys_core_id, + trace_addr, + trace_scom_val, + P9_STOP_SCOM_REPLACE, + P9_STOP_SECTION_CORE_SCOM, + "trace_imc"); + if (ret) + return ret; + } else { + prerror("IMC-trace:Wakeup engine not present!"); + return OPAL_HARDWARE; + } + } + if (xscom_write(c->chip_id, htm_addr, (u64)CORE_IMC_HTM_MODE_DISABLE)) { + prerror("IMC-trace: error in xscom_write for htm mode\n"); + return OPAL_HARDWARE; + } + if (xscom_write(c->chip_id, trace_addr, trace_scom_val)) { + prerror("IMC-trace: error in xscom_write for trace mode\n"); + return OPAL_HARDWARE; + } + return OPAL_SUCCESS; + + } + + return OPAL_SUCCESS; +} +opal_call(OPAL_IMC_COUNTERS_INIT, opal_imc_counters_init, 3); + +/* opal_imc_counters_control_start: This call starts the nest/core imc engine. */ +static int64_t opal_imc_counters_start(uint32_t type, uint64_t cpu_pir) +{ + u64 op; + struct cpu_thread *c = find_cpu_by_pir(cpu_pir); + struct imc_chip_cb *cb; + int port_id, phys_core_id; + uint32_t htm_addr; + + if (!c) + return OPAL_PARAMETER; + + switch (type) { + case OPAL_IMC_COUNTERS_NEST: + /* Fetch the IMC control block structure */ + cb = get_imc_cb(c->chip_id); + if (!cb) + return OPAL_HARDWARE; + + /* Set the run command */ + op = NEST_IMC_ENABLE; + + if (proc_chip_quirks & QUIRK_MAMBO_CALLOUTS) + return OPAL_SUCCESS; + + /* Write the command to the control block now */ + cb->imc_chip_command = cpu_to_be64(op); + + return OPAL_SUCCESS; + case OPAL_IMC_COUNTERS_CORE: + case OPAL_IMC_COUNTERS_TRACE: + /* + * Core IMC hardware mandates setting of htm_mode in specific + * scom ports (port_id are in htm_scom_index[]) + */ + phys_core_id = pir_to_core_id(c->pir); + port_id = phys_core_id % 4; + + if (proc_chip_quirks & QUIRK_MAMBO_CALLOUTS) + return OPAL_SUCCESS; + + htm_addr = get_imc_scom_addr_for_quad(phys_core_id, + htm_scom_index[port_id]); + /* + * Enables the core imc engine by appropriately setting + * bits 4-9 of the HTM_MODE scom port. No initialization + * is done in this call. This just enables the the counters + * to count with the previous initialization. + */ + if (xscom_write(c->chip_id, htm_addr, (u64)CORE_IMC_HTM_MODE_ENABLE)) { + prerror("IMC OPAL_start: error in xscom_write for htm_mode\n"); + return OPAL_HARDWARE; + } + + return OPAL_SUCCESS; + } + + return OPAL_SUCCESS; +} +opal_call(OPAL_IMC_COUNTERS_START, opal_imc_counters_start, 2); + +/* opal_imc_counters_control_stop: This call stops the nest imc engine. 
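+ *
+ * For OPAL_IMC_COUNTERS_NEST the stop is done by writing NEST_IMC_DISABLE
+ * into the per-chip control block; for OPAL_IMC_COUNTERS_CORE and
+ * OPAL_IMC_COUNTERS_TRACE it is done by writing CORE_IMC_HTM_MODE_DISABLE
+ * to the HTM_MODE scom, undoing what opal_imc_counters_start() enabled.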
*/ +static int64_t opal_imc_counters_stop(uint32_t type, uint64_t cpu_pir) +{ + u64 op; + struct imc_chip_cb *cb; + struct cpu_thread *c = find_cpu_by_pir(cpu_pir); + int port_id, phys_core_id; + uint32_t htm_addr; + + if (!c) + return OPAL_PARAMETER; + + switch (type) { + case OPAL_IMC_COUNTERS_NEST: + /* Fetch the IMC control block structure */ + cb = get_imc_cb(c->chip_id); + if (!cb) + return OPAL_HARDWARE; + + /* Set the run command */ + op = NEST_IMC_DISABLE; + + if (proc_chip_quirks & QUIRK_MAMBO_CALLOUTS) + return OPAL_SUCCESS; + + /* Write the command to the control block */ + cb->imc_chip_command = cpu_to_be64(op); + + return OPAL_SUCCESS; + + case OPAL_IMC_COUNTERS_CORE: + case OPAL_IMC_COUNTERS_TRACE: + /* + * Core IMC hardware mandates setting of htm_mode in specific + * scom ports (port_id are in htm_scom_index[]) + */ + phys_core_id = pir_to_core_id(c->pir); + port_id = phys_core_id % 4; + + if (proc_chip_quirks & QUIRK_MAMBO_CALLOUTS) + return OPAL_SUCCESS; + + htm_addr = get_imc_scom_addr_for_quad(phys_core_id, + htm_scom_index[port_id]); + /* + * Disables the core imc engine by clearing + * bits 4-9 of the HTM_MODE scom port. + */ + if (xscom_write(c->chip_id, htm_addr, (u64) CORE_IMC_HTM_MODE_DISABLE)) { + prerror("error in xscom_write for htm_mode\n"); + return OPAL_HARDWARE; + } + + return OPAL_SUCCESS; + } + + return OPAL_SUCCESS; +} +opal_call(OPAL_IMC_COUNTERS_STOP, opal_imc_counters_stop, 2); diff --git a/roms/skiboot/hw/ipmi/Makefile.inc b/roms/skiboot/hw/ipmi/Makefile.inc new file mode 100644 index 000000000..c6b36a2b3 --- /dev/null +++ b/roms/skiboot/hw/ipmi/Makefile.inc @@ -0,0 +1,9 @@ +# SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later + +SUBDIRS += hw/ipmi + +IPMI_OBJS = ipmi-rtc.o ipmi-power.o ipmi-fru.o ipmi-sel.o +IPMI_OBJS += ipmi-watchdog.o ipmi-sensor.o ipmi-attn.o ipmi-info.o + +IPMI = hw/ipmi/built-in.a +$(IPMI): $(IPMI_OBJS:%=hw/ipmi/%) diff --git a/roms/skiboot/hw/ipmi/ipmi-attn.c b/roms/skiboot/hw/ipmi/ipmi-attn.c new file mode 100644 index 000000000..280b2525f --- /dev/null +++ b/roms/skiboot/hw/ipmi/ipmi-attn.c @@ -0,0 +1,100 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* + * When everything is terrible, tell the FSP as much as possible as to why + * + * Copyright 2013-2019 IBM Corp. 
+ */ + +#include <errorlog.h> +#include <ipmi.h> +#include <pel.h> +#include <platform.h> +#include <processor.h> +#include <sbe-p9.h> +#include <skiboot.h> +#include <stack.h> +#include <timebase.h> +#include <xscom.h> + +/* Use same attention SRC for BMC based machine */ +DEFINE_LOG_ENTRY(OPAL_RC_ATTN, OPAL_PLATFORM_ERR_EVT, + OPAL_ATTN, OPAL_PLATFORM_FIRMWARE, + OPAL_ERROR_PANIC, OPAL_ABNORMAL_POWER_OFF); + +/* Maximum buffer size to capture backtrace and other useful information */ +#define IPMI_TI_BUFFER_SIZE (IPMI_MAX_PEL_SIZE - PEL_MIN_SIZE) +static char ti_buffer[IPMI_TI_BUFFER_SIZE]; + +#define STACK_BUF_ENTRIES 20 +static struct bt_entry bt_buf[STACK_BUF_ENTRIES]; + +/* Log eSEL event with OPAL backtrace */ +static void ipmi_log_terminate_event(const char *msg) +{ + struct bt_metadata metadata; + unsigned int ti_len; + unsigned int ti_size; + struct errorlog *elog_buf; + + /* Fill OPAL version */ + ti_len = snprintf(ti_buffer, IPMI_TI_BUFFER_SIZE, + "OPAL version : %s\n", version); + + /* File information */ + ti_len += snprintf(ti_buffer + ti_len, IPMI_TI_BUFFER_SIZE - ti_len, + "File info : %s\n", msg); + ti_size = IPMI_TI_BUFFER_SIZE - ti_len; + + /* Backtrace */ + backtrace_create(bt_buf, STACK_BUF_ENTRIES, &metadata); + metadata.token = OPAL_LAST + 1; + backtrace_print(bt_buf, &metadata, ti_buffer + ti_len, &ti_size, true); + + /* Create eSEL event and commit */ + elog_buf = opal_elog_create(&e_info(OPAL_RC_ATTN), 0); + log_append_data(elog_buf, (char *)&ti_buffer, ti_len + ti_size); + log_commit(elog_buf); +} + +void __attribute__((noreturn)) ipmi_terminate(const char *msg) +{ + /* Log eSEL event */ + if (ipmi_present()) + ipmi_log_terminate_event(msg); + + /* + * If mpipl is supported then trigger SBE interrupt + * to initiate mpipl + */ + p9_sbe_terminate(); + + /* + * Trigger software xstop (OPAL TI). It will stop all the CPU threads + * moving them into quiesced state. OCC will collect all FIR data. + * Upon checkstop signal, BMC will then decide whether to reboot/IPL or + * not depending on AutoReboot policy, if any. This helps in cases + * where OPAL is crashing/terminating before host reaches to runtime. + * With OpenBMC AutoReboot policy, in such cases, it will make sure + * that system is moved to Quiesced state after 3 or so attempts to + * IPL. Without OPAL TI, OpenBMC will never know that OPAL is + * terminating and system would go into never ending IPL'ing loop. + * + * Once the system reaches to runtime, OpenBMC resets the boot counter. + * Hence next time when BMC receieves the OPAL TI, it will IPL the + * system if AutoReboot is enabled. We don't need to worry about self + * rebooting. + */ + + xscom_trigger_xstop(); + /* + * Control will not reach here if software xstop has been supported and + * enabled. If not supported then fallback to cec reboot path below. + */ + + /* Reboot call */ + if (platform.cec_reboot) + platform.cec_reboot(); + + while (1) + time_wait_ms(100); +} diff --git a/roms/skiboot/hw/ipmi/ipmi-fru.c b/roms/skiboot/hw/ipmi/ipmi-fru.c new file mode 100644 index 000000000..86c9ca0ce --- /dev/null +++ b/roms/skiboot/hw/ipmi/ipmi-fru.c @@ -0,0 +1,231 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* + * Fill out firmware related FRUs (Field Replaceable Units) + * + * Copyright 2013-2019 IBM Corp. 
+ */ + +#include <skiboot.h> +#include <stdlib.h> +#include <string.h> +#include <ipmi.h> +#include <lock.h> +#include <opal.h> +#include <device.h> + +struct product_info { + char *manufacturer; + char *product; + char *part_no; + char *version; + char *serial_no; + char *asset_tag; +}; + +struct common_header { + u8 version; + u8 internal_offset; + u8 chassis_offset; + u8 board_offset; + u8 product_offset; + u8 multirecord_offset; + u8 pad; + u8 checksum; +} __packed; + +/* The maximum amount of FRU data we can store. */ +#define FRU_DATA_SIZE 256 + +/* We allocate two bytes at these locations in the data array to track + * state. */ +#define WRITE_INDEX 256 +#define REMAINING 257 + +/* The ASCII string encoding used only has 5 bits to encode length + * hence the maximum is 31 characters. */ +#define MAX_STR_LEN 31 + +static u8 fru_dev_id = 0; + +static int fru_insert_string(u8 *buf, char *str) +{ + int len = strlen(str); + + /* The ASCII type/length format only supports a string length + * between 2 and 31 characters. Zero characters is ok though + * as it indicates no data present. */ + if (len == 1 || len > MAX_STR_LEN) + return OPAL_PARAMETER; + + buf[0] = 0xc0 | len; + memcpy(&buf[1], str, len); + + return len + 1; +} + +static u8 fru_checksum(u8 *buf, int len) +{ + int i; + u8 checksum = 0; + + for(i = 0; i < len; i++) { + checksum += buf[i]; + } + checksum = ~checksum + 1; + return checksum; +} + +#define FRU_INSERT_STRING(x, y) \ + ({ rc = fru_insert_string(x, y); \ + { if (rc < 1) return OPAL_PARAMETER; } rc; }) + +static int fru_fill_product_info(u8 *buf, struct product_info *info, size_t size) +{ + size_t total_size = 11; + int index = 0; + int rc; + + total_size += strlen(info->manufacturer); + total_size += strlen(info->product); + total_size += strlen(info->part_no); + total_size += strlen(info->version); + total_size += strlen(info->serial_no); + total_size += strlen(info->asset_tag); + total_size += (8 - (total_size % 8)) % 8; + if (total_size > size) + return OPAL_PARAMETER; + + buf[index++] = 0x1; /* Version */ + buf[index++] = total_size / 8; /* Size */ + buf[index++] = 0; /* Language code (English) */ + + index += FRU_INSERT_STRING(&buf[index], info->manufacturer); + index += FRU_INSERT_STRING(&buf[index], info->product); + index += FRU_INSERT_STRING(&buf[index], info->part_no); + index += FRU_INSERT_STRING(&buf[index], info->version); + index += FRU_INSERT_STRING(&buf[index], info->serial_no); + index += FRU_INSERT_STRING(&buf[index], info->asset_tag); + + buf[index++] = 0xc1; /* End of data marker */ + memset(&buf[index], 0, total_size - index - 1); + index += total_size - index - 1; + buf[index] = fru_checksum(buf, index); + assert(index == total_size - 1); + + return total_size; +} + +static int fru_add(u8 *buf, int size) +{ + int len; + struct common_header common_hdr; + char *short_version; + struct product_info info = { + .manufacturer = (char *) "IBM", + .product = (char *) "skiboot", + .part_no = (char *) "", + .serial_no = (char *) "", + .asset_tag = (char *) "", + }; + + if (size < sizeof(common_hdr)) + return OPAL_PARAMETER; + + /* We currently only support adding the version number at the + * product information offset. We choose an offset of 64 bytes + * because that's what the standard recommends. 
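+	 *
+	 * Resulting layout of the FRU_DATA_SIZE (256 byte) buffer built
+	 * below, for illustration:
+	 *
+	 *   0x00        common header (8 bytes, version 1, checksummed)
+	 *   0x08..0x3f  unused
+	 *   0x40        product info area from fru_fill_product_info()
+	 *
+	 * Common header offsets are stored in multiples of 8 bytes, hence
+	 * product_offset = 64/8.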
*/ + common_hdr.version = 1; + common_hdr.internal_offset = 0; + common_hdr.chassis_offset = 0; + common_hdr.board_offset = 0; + common_hdr.product_offset = 64/8; + common_hdr.multirecord_offset = 0; + common_hdr.pad = 0; + common_hdr.checksum = fru_checksum((u8 *) &common_hdr, sizeof(common_hdr) - 1); + memcpy(buf, &common_hdr, sizeof(common_hdr)); + + short_version = strdup(version); + info.version = short_version; + if (!strncmp(version, "skiboot-", 8)) + info.version = &short_version[8]; + + if (strlen(info.version) >= MAX_STR_LEN) { + if (info.version[MAX_STR_LEN] != '\0') + info.version[MAX_STR_LEN - 1] = '+'; + info.version[MAX_STR_LEN] = '\0'; + } + + len = fru_fill_product_info(&buf[64], &info, size - 64); + free(short_version); + if (len < 0) + return OPAL_PARAMETER; + + return len + 64; +} + +static void fru_write_complete(struct ipmi_msg *msg) +{ + u8 write_count = msg->data[0]; + u16 offset; + + msg->data[WRITE_INDEX] += write_count; + msg->data[REMAINING] -= write_count; + if (msg->data[REMAINING] == 0) + goto out; + + offset = msg->data[WRITE_INDEX]; + ipmi_init_msg(msg, IPMI_DEFAULT_INTERFACE, IPMI_WRITE_FRU, + fru_write_complete, NULL, + MIN(msg->data[REMAINING] + 3, IPMI_MAX_REQ_SIZE), 2); + + memmove(&msg->data[3], &msg->data[offset + 3], msg->req_size - 3); + + msg->data[0] = fru_dev_id; /* FRU Device ID */ + msg->data[1] = offset & 0xff; /* Offset LSB */ + msg->data[2] = (offset >> 8) & 0xff; /* Offset MSB */ + + ipmi_queue_msg(msg); + + return; + +out: + ipmi_free_msg(msg); +} + +static int fru_write(void) +{ + struct ipmi_msg *msg; + int len; + + /* We allocate FRU_DATA_SIZE + 5 bytes for the message: + * - 3 bytes for the the write FRU command header + * - FRU_DATA_SIZE bytes for FRU data + * - 2 bytes for offset & bytes remaining count + */ + msg = ipmi_mkmsg(IPMI_DEFAULT_INTERFACE, IPMI_WRITE_FRU, + fru_write_complete, NULL, NULL, FRU_DATA_SIZE + 5, 2); + if (!msg) + return OPAL_RESOURCE; + + msg->data[0] = fru_dev_id; /* FRU Device ID */ + msg->data[1] = 0x0; /* Offset LSB (we always write a new common header) */ + msg->data[2] = 0x0; /* Offset MSB */ + len = fru_add(&msg->data[3], FRU_DATA_SIZE); + + if (len < 0) + return len; + + /* Three bytes for the actual FRU Data Command */ + msg->data[WRITE_INDEX] = 0; + msg->data[REMAINING] = len; + msg->req_size = MIN(len + 3, IPMI_MAX_REQ_SIZE); + return ipmi_queue_msg(msg); +} + +void ipmi_fru_init(u8 dev_id) +{ + fru_dev_id = dev_id; + fru_write(); + + return; +} diff --git a/roms/skiboot/hw/ipmi/ipmi-info.c b/roms/skiboot/hw/ipmi/ipmi-info.c new file mode 100644 index 000000000..d93b59d7d --- /dev/null +++ b/roms/skiboot/hw/ipmi/ipmi-info.c @@ -0,0 +1,206 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* + * Various bits of info retreived over IPMI + * + * Copyright 2018-2019 IBM Corp. + */ + +#include <device.h> +#include <skiboot.h> +#include <stdlib.h> +#include <ipmi.h> +#include <mem_region-malloc.h> +#include <opal.h> +#include <timebase.h> + +/* + * Response data from IPMI Get device ID command (As defined in + * Section 20.1 Get Device ID Command - IPMI standard spec). + */ +struct ipmi_dev_id { + uint8_t dev_id; + uint8_t dev_revision; + uint8_t fw_rev1; + uint8_t fw_rev2; + uint8_t ipmi_ver; + uint8_t add_dev_support; + uint8_t manufactur_id[3]; + uint8_t product_id[2]; + uint8_t aux_fw_rev[4]; +}; +static struct ipmi_dev_id *ipmi_dev_id; + +/* + * Response data from IPMI Chassis Get System Boot Option (As defined in + * Section 28.13 Get System Boot Options Command - IPMI standard spec). 
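+ *
+ * Only OEM parameter 0x62 is requested here (see the request bytes in
+ * ipmi_get_chassis_boot_opt_request() below), so the response decoded
+ * into this structure is simply:
+ *
+ *   byte 0  param_version
+ *   byte 1  param_valid (parameter valid/selector byte, expected 0x62)
+ *   byte 2  flag_set (signals whether SBE validation is requested)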
+ */ +struct ipmi_sys_boot_opt { + uint8_t param_version; + uint8_t param_valid; + /* + * Fields for OEM parameter 0x62. This parameter does not follow + * the normal layout and just has a single byte to signal if it + * is active or not. + */ + uint8_t flag_set; +}; +static struct ipmi_sys_boot_opt *ipmi_sys_boot_opt; + +/* Got response from BMC? */ +static bool bmc_info_waiting = false; +static bool bmc_info_valid = false; +static bool bmc_boot_opt_waiting = false; +static bool bmc_boot_opt_valid = false; + +/* This will free ipmi_dev_id structure */ +void ipmi_dt_add_bmc_info(void) +{ + char buf[8]; + struct dt_node *dt_fw_version; + + while (bmc_info_waiting) + time_wait_ms(5); + + if (!bmc_info_valid) + return; + + dt_fw_version = dt_find_by_name(dt_root, "ibm,firmware-versions"); + if (!dt_fw_version) { + free(ipmi_dev_id); + return; + } + + memset(buf, 0, sizeof(buf)); + snprintf(buf, sizeof(buf), "%x.%02x", + ipmi_dev_id->fw_rev1, ipmi_dev_id->fw_rev2); + dt_add_property_string(dt_fw_version, "bmc-firmware-version", buf); + + free(ipmi_dev_id); +} + +static void ipmi_get_bmc_info_resp(struct ipmi_msg *msg) +{ + bmc_info_waiting = false; + + if (msg->cc != IPMI_CC_NO_ERROR) { + prlog(PR_ERR, "IPMI: IPMI_BMC_GET_DEVICE_ID cmd returned error" + " [rc : 0x%x]\n", msg->data[0]); + return; + } + + /* ipmi_dev_id has optional fields */ + if (msg->resp_size <= sizeof(struct ipmi_dev_id)) { + bmc_info_valid = true; + memcpy(ipmi_dev_id, msg->data, msg->resp_size); + } else { + prlog(PR_WARNING, "IPMI: IPMI_BMC_GET_DEVICE_ID unexpected response size\n"); + } + + ipmi_free_msg(msg); +} + +int ipmi_get_bmc_info_request(void) +{ + int rc; + struct ipmi_msg *msg; + + ipmi_dev_id = zalloc(sizeof(struct ipmi_dev_id)); + assert(ipmi_dev_id); + + msg = ipmi_mkmsg(IPMI_DEFAULT_INTERFACE, IPMI_BMC_GET_DEVICE_ID, + ipmi_get_bmc_info_resp, NULL, NULL, + 0, sizeof(struct ipmi_dev_id)); + if (!msg) + return OPAL_NO_MEM; + + msg->error = ipmi_get_bmc_info_resp; + prlog(PR_INFO, "IPMI: Requesting IPMI_BMC_GET_DEVICE_ID\n"); + rc = ipmi_queue_msg(msg); + if (rc) { + prlog(PR_ERR, "IPMI: Failed to queue IPMI_BMC_GET_DEVICE_ID\n"); + ipmi_free_msg(msg); + return rc; + } + + bmc_info_waiting = true; + return rc; +} + +/* This will free ipmi_sys_boot_opt structure */ +int ipmi_chassis_check_sbe_validation(void) +{ + int rc = -1; + + while (bmc_boot_opt_waiting) + time_wait_ms(10); + + if (!bmc_boot_opt_valid) + goto out; + + if ((ipmi_sys_boot_opt->param_valid & 0x8) != 0) + goto out; + if (ipmi_sys_boot_opt->param_valid != 0x62) + goto out; + + rc = ipmi_sys_boot_opt->flag_set; + +out: + free(ipmi_sys_boot_opt); + return rc; +} + +static void ipmi_get_chassis_boot_opt_resp(struct ipmi_msg *msg) +{ + bmc_boot_opt_waiting = false; + + if (msg->cc != IPMI_CC_NO_ERROR) { + prlog(PR_INFO, "IPMI: IPMI_CHASSIS_GET_BOOT_OPT cmd returned error" + " [rc : 0x%x]\n", msg->data[0]); + ipmi_free_msg(msg); + return; + } + + if (msg->resp_size == sizeof(struct ipmi_sys_boot_opt)) { + bmc_boot_opt_valid = true; + memcpy(ipmi_sys_boot_opt, msg->data, msg->resp_size); + } else { + prlog(PR_WARNING, "IPMI: IPMI_CHASSIS_GET_BOOT_OPT unexpected response size\n"); + } + + ipmi_free_msg(msg); +} + +int ipmi_get_chassis_boot_opt_request(void) +{ + int rc; + struct ipmi_msg *msg; + uint8_t req[] = { + 0x62, /* OEM parameter (SBE Validation on astbmc) */ + 0x00, /* no set selector */ + 0x00, /* no block selector */ + }; + + ipmi_sys_boot_opt = zalloc(sizeof(struct ipmi_sys_boot_opt)); + assert(ipmi_sys_boot_opt); + + msg = 
ipmi_mkmsg(IPMI_DEFAULT_INTERFACE, IPMI_CHASSIS_GET_BOOT_OPT, + ipmi_get_chassis_boot_opt_resp, NULL, req, + sizeof(req), sizeof(struct ipmi_sys_boot_opt)); + if (!msg) { + free(ipmi_sys_boot_opt); + return OPAL_NO_MEM; + } + + msg->error = ipmi_get_chassis_boot_opt_resp; + prlog(PR_INFO, "IPMI: Requesting IPMI_CHASSIS_GET_BOOT_OPT\n"); + rc = ipmi_queue_msg(msg); + if (rc) { + prlog(PR_ERR, "IPMI: Failed to queue IPMI_CHASSIS_GET_BOOT_OPT\n"); + free(ipmi_sys_boot_opt); + ipmi_free_msg(msg); + return rc; + } + + bmc_boot_opt_waiting = true; + return rc; +} diff --git a/roms/skiboot/hw/ipmi/ipmi-power.c b/roms/skiboot/hw/ipmi/ipmi-power.c new file mode 100644 index 000000000..8101a8524 --- /dev/null +++ b/roms/skiboot/hw/ipmi/ipmi-power.c @@ -0,0 +1,85 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* + * Power as in electricity, not POWER as in POWER + * + * Copyright 2013-2019 IBM Corp. + */ + +#include <skiboot.h> +#include <stdlib.h> +#include <ipmi.h> +#include <opal.h> +#include <timebase.h> + +static void ipmi_chassis_control_complete(struct ipmi_msg *msg) +{ + uint8_t request = msg->data[0]; + uint8_t cc = msg->cc; + + ipmi_free_msg(msg); + if (cc == IPMI_CC_NO_ERROR) + return; + + prlog(PR_INFO, "IPMI: Chassis control request failed. " + "request=0x%02x, rc=0x%02x\n", request, cc); + + if (ipmi_chassis_control(request)) { + prlog(PR_INFO, "IPMI: Failed to resend chassis control " + "request [0x%02x]\n", request); + } +} + +int ipmi_chassis_control(uint8_t request) +{ + struct ipmi_msg *msg; + + if (!ipmi_present()) + return OPAL_CLOSED; + + if (request > IPMI_CHASSIS_SOFT_SHUTDOWN) + return OPAL_PARAMETER; + + msg = ipmi_mkmsg(IPMI_DEFAULT_INTERFACE, IPMI_CHASSIS_CONTROL, + ipmi_chassis_control_complete, NULL, + &request, sizeof(request), 0); + if (!msg) + return OPAL_HARDWARE; + /* Set msg->error callback function */ + msg->error = ipmi_chassis_control_complete; + + prlog(PR_INFO, "IPMI: sending chassis control request 0x%02x\n", + request); + + return ipmi_queue_msg(msg); +} + +int ipmi_set_power_state(uint8_t system, uint8_t device) +{ + struct ipmi_msg *msg; + struct { + uint8_t system; + uint8_t device; + } power_state; + + if (!ipmi_present()) + return OPAL_CLOSED; + + power_state.system = system; + power_state.device = device; + + if (system != IPMI_PWR_NOCHANGE) + power_state.system |= 0x80; + if (device != IPMI_PWR_NOCHANGE) + power_state.device |= 0x80; + + msg = ipmi_mkmsg_simple(IPMI_SET_POWER_STATE, &power_state, + sizeof(power_state)); + + if (!msg) + return OPAL_HARDWARE; + + prlog(PR_INFO, "IPMI: setting power state: sys %02x, dev %02x\n", + power_state.system, power_state.device); + + return ipmi_queue_msg(msg); +} diff --git a/roms/skiboot/hw/ipmi/ipmi-rtc.c b/roms/skiboot/hw/ipmi/ipmi-rtc.c new file mode 100644 index 000000000..52da2946c --- /dev/null +++ b/roms/skiboot/hw/ipmi/ipmi-rtc.c @@ -0,0 +1,127 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* + * Talk to a Real Time Clock (RTC) over IPMI + * + * Copyright 2013-2015 IBM Corp. 
+ */ + +#include <stdlib.h> +#include <string.h> +#include <ipmi.h> +#include <time.h> +#include <time-utils.h> +#include <device.h> +#include <opal.h> +#include <rtc.h> + +static enum {idle, waiting, updated, error} time_status; + +static void get_sel_time_error(struct ipmi_msg *msg) +{ + time_status = error; + ipmi_free_msg(msg); +} + +static void get_sel_time_complete(struct ipmi_msg *msg) +{ + struct tm tm; + le32 result; + time_t time; + + memcpy(&result, msg->data, 4); + time = le32_to_cpu(result); + gmtime_r(&time, &tm); + rtc_cache_update(&tm); + time_status = updated; + ipmi_free_msg(msg); +} + +static int64_t ipmi_get_sel_time(void) +{ + struct ipmi_msg *msg; + + msg = ipmi_mkmsg(IPMI_DEFAULT_INTERFACE, IPMI_GET_SEL_TIME, + get_sel_time_complete, NULL, NULL, 0, 4); + if (!msg) + return OPAL_HARDWARE; + + msg->error = get_sel_time_error; + + return ipmi_queue_msg(msg); +} + +static int64_t ipmi_set_sel_time(uint32_t _tv) +{ + struct ipmi_msg *msg; + const le32 tv = cpu_to_le32(_tv); + + msg = ipmi_mkmsg_simple(IPMI_SET_SEL_TIME, (void*)&tv, sizeof(tv)); + if (!msg) + return OPAL_HARDWARE; + + return ipmi_queue_msg(msg); +} + +static int64_t ipmi_opal_rtc_read(__be32 *__ymd, __be64 *__hmsm) +{ + int ret = 0; + uint32_t ymd; + uint64_t hmsm; + + if (!__ymd || !__hmsm) + return OPAL_PARAMETER; + + switch(time_status) { + case idle: + if (ipmi_get_sel_time() < 0) + return OPAL_HARDWARE; + time_status = waiting; + ret = OPAL_BUSY_EVENT; + break; + + case waiting: + ret = OPAL_BUSY_EVENT; + break; + + case updated: + rtc_cache_get_datetime(&ymd, &hmsm); + *__ymd = cpu_to_be32(ymd); + *__hmsm = cpu_to_be64(hmsm); + time_status = idle; + ret = OPAL_SUCCESS; + break; + + case error: + time_status = idle; + ret = OPAL_HARDWARE; + break; + } + + return ret; +} + +static int64_t ipmi_opal_rtc_write(uint32_t year_month_day, + uint64_t hour_minute_second_millisecond) +{ + time_t t; + struct tm tm; + + datetime_to_tm(year_month_day, hour_minute_second_millisecond, &tm); + t = mktime(&tm); + if (ipmi_set_sel_time(t)) + return OPAL_HARDWARE; + + return OPAL_SUCCESS; +} + +void ipmi_rtc_init(void) +{ + struct dt_node *np = dt_new(opal_node, "rtc"); + dt_add_property_strings(np, "compatible", "ibm,opal-rtc"); + + opal_register(OPAL_RTC_READ, ipmi_opal_rtc_read, 2); + opal_register(OPAL_RTC_WRITE, ipmi_opal_rtc_write, 2); + + /* Initialise the rtc cache */ + ipmi_get_sel_time(); +} diff --git a/roms/skiboot/hw/ipmi/ipmi-sel.c b/roms/skiboot/hw/ipmi/ipmi-sel.c new file mode 100644 index 000000000..215b8ba7d --- /dev/null +++ b/roms/skiboot/hw/ipmi/ipmi-sel.c @@ -0,0 +1,701 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* Copyright 2013-2018 IBM Corp. 
*/ + +#define pr_fmt(fmt) "IPMI: " fmt +#include <ccan/list/list.h> +#include <ccan/str/str.h> +#include <compiler.h> +#include <errno.h> +#include <skiboot.h> +#include <stdlib.h> +#include <string.h> +#include <ipmi.h> +#include <device.h> +#include <opal.h> +#include <lock.h> +#include <errorlog.h> +#include <pel.h> +#include <opal-msg.h> +#include <debug_descriptor.h> +#include <occ.h> +#include <timebase.h> + +/* OEM SEL fields */ +#define SEL_OEM_ID_0 0x55 +#define SEL_OEM_ID_1 0x55 +#define SEL_RECORD_TYPE_OEM 0xC0 +#define SEL_RECORD_TYPE_EVENT 0x02 + +#define SEL_NETFN_IBM 0x3a + +/* OEM SEL Commands */ +/* TODO: Move these to their respective source files */ +#define CMD_AMI_POWER 0x04 +#define CMD_AMI_PNOR_ACCESS 0x07 +#define CMD_AMI_OCC_RESET 0x0e +#define CMD_HEARTBEAT 0xff + +/* XXX: Listed here for completeness, registered in libflash/ipmi-flash.c */ +#define CMD_OP_HIOMAP_EVENT 0x0f + +#define SOFT_OFF 0x00 +#define SOFT_REBOOT 0x01 + +#define RELEASE_PNOR 0x00 +#define REQUEST_PNOR 0x01 + +/* 32.1 SEL Event Records type */ +#define SEL_REC_TYPE_SYS_EVENT 0x02 +#define SEL_REC_TYPE_AMI_ESEL 0xDF + +/* OEM SEL generator ID for AMI */ +#define SEL_GENERATOR_ID_AMI 0x0020 + +/* IPMI SEL version */ +#define SEL_EVM_VER_1 0x03 +#define SEL_EVM_VER_2 0x04 + +/* + * Sensor type for System events + * + * Sensor information (type, number, etc) is passed to us via + * device tree. Currently we are using System Event type to + * log OPAL events. + */ +#define SENSOR_TYPE_SYS_EVENT 0x12 + +/* + * 42.1 Event/Reading Type Codes + * + * Note that device hotplug and availability related events + * are not defined as we are not using those events type. + */ +#define SEL_EVENT_DIR_TYPE_UNSPECIFIED 0x00 +#define SEL_EVENT_DIR_TYPE_THRESHOLD 0x01 +#define SEL_EVENT_DIR_TYPE_STATE 0x03 +#define SEL_EVENT_DIR_TYPE_PREDICTIVE 0x04 +#define SEL_EVENT_DIR_TYPE_LIMIT 0x05 +#define SEL_EVENT_DIR_TYPE_PERFORMANCE 0x06 +#define SEL_EVENT_DIR_TYPE_TRANSITION 0x07 +#define SEL_EVENT_DIR_TYPE_OEM 0x70 + +/* + * 42.1 Event/Reading Type Codes + */ +#define SEL_DATA1_AMI 0xAA +#define SEL_DATA1_DEASSERTED 0x00 +#define SEL_DATA1_ASSERTED 0x01 +#define SEL_DATA1_OK 0x00 +#define SEL_DATA1_NON_CRIT_FROM_OK 0x01 +#define SEL_DATA1_CRIT_FROM_LESS_SEV 0x02 +#define SEL_DATA1_NON_REC_FROM_LESS_SEV 0x03 +#define SEL_DATA1_NON_CRIT 0x04 +#define SEL_DATA1_CRITICAL 0x05 +#define SEL_DATA1_NON_RECOVERABLE 0X06 +#define SEL_DATA1_MONITOR 0x07 +#define SEL_DATA1_INFORMATIONAL 0x08 + +/* SEL Record Entry */ +struct sel_record { + le16 record_id; + uint8_t record_type; + le32 timestamp; + le16 generator_id; + uint8_t evm_ver; + uint8_t sensor_type; + uint8_t sensor_number; + uint8_t event_dir_type; + uint8_t event_data1; + uint8_t event_data2; + uint8_t event_data3; +} __packed; + +static struct sel_record sel_record; + +struct oem_sel { + /* SEL header */ + uint8_t id[2]; + uint8_t type; + uint8_t timestamp[4]; + uint8_t manuf_id[3]; + /* OEM SEL data (6 bytes) follows */ + uint8_t netfun; + uint8_t cmd; + uint8_t data[4]; +}; + +#define ESEL_HDR_SIZE 7 + +/* Used for sending PANIC events like abort() path */ +struct ipmi_sel_panic_msg { + bool busy; + struct ipmi_msg *msg; + struct lock lock; +}; +static struct ipmi_sel_panic_msg ipmi_sel_panic_msg; + +static LIST_HEAD(sel_handlers); + +/* Forward declaration */ +static void ipmi_elog_poll(struct ipmi_msg *msg); + +/* + * Allocate IPMI message: + * For normal event, allocate memory using ipmi_mkmsg and for PANIC + * event, use pre-allocated buffer. 
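+ *
+ * The panic message is allocated once in ipmi_sel_init() and guarded by
+ * ipmi_sel_panic_msg.lock and a busy flag: ipmi_sel_alloc_msg() only
+ * re-initialises it with ipmi_init_msg() for each panic eSEL, and
+ * ipmi_sel_free_msg() clears the busy flag again once the message has
+ * been sent, so the panic path normally does not need to allocate memory.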
+ */ +static struct ipmi_msg *ipmi_sel_alloc_msg(struct errorlog *elog_buf) +{ + struct ipmi_msg *msg = NULL; + + if (elog_buf->event_severity == OPAL_ERROR_PANIC) { + /* Called before initialization completes */ + if (ipmi_sel_panic_msg.msg == NULL) { + ipmi_sel_init(); /* Try to allocate IPMI message */ + if (ipmi_sel_panic_msg.msg == NULL) + return NULL; + } + + if (ipmi_sel_panic_msg.busy == true) + return NULL; + + lock(&ipmi_sel_panic_msg.lock); + msg = ipmi_sel_panic_msg.msg; + ipmi_sel_panic_msg.busy = true; + unlock(&ipmi_sel_panic_msg.lock); + + ipmi_init_msg(msg, IPMI_DEFAULT_INTERFACE, IPMI_RESERVE_SEL, + ipmi_elog_poll, elog_buf, IPMI_MAX_REQ_SIZE, 2); + } else { + msg = ipmi_mkmsg(IPMI_DEFAULT_INTERFACE, IPMI_RESERVE_SEL, + ipmi_elog_poll, elog_buf, NULL, + IPMI_MAX_REQ_SIZE, 2); + } + + return msg; +} + +static void ipmi_sel_free_msg(struct ipmi_msg *msg) +{ + if (msg == ipmi_sel_panic_msg.msg) { + lock(&ipmi_sel_panic_msg.lock); + ipmi_sel_panic_msg.busy = false; + unlock(&ipmi_sel_panic_msg.lock); + } else { + ipmi_free_msg(msg); + } + + msg = NULL; +} + +/* Initialize eSEL record */ +static void ipmi_init_esel_record(void) +{ + memset(&sel_record, 0, sizeof(struct sel_record)); + sel_record.record_type = SEL_REC_TYPE_AMI_ESEL; + sel_record.generator_id = cpu_to_le16(SEL_GENERATOR_ID_AMI); + sel_record.evm_ver = SEL_EVM_VER_2; + sel_record.sensor_type = SENSOR_TYPE_SYS_EVENT; + sel_record.sensor_number = + ipmi_get_sensor_number(SENSOR_TYPE_SYS_EVENT); + sel_record.event_dir_type = SEL_EVENT_DIR_TYPE_OEM; + sel_record.event_data1 = SEL_DATA1_AMI; +} + +/* Update required fields in SEL record */ +static void ipmi_update_sel_record(uint8_t event_severity, uint16_t esel_record_id) +{ + sel_record.record_type = SEL_REC_TYPE_SYS_EVENT; + sel_record.event_data2 = (esel_record_id >> 8) & 0xff; + sel_record.event_data3 = esel_record_id & 0xff; + + switch (event_severity) { + case OPAL_ERROR_PANIC: + sel_record.event_dir_type = SEL_EVENT_DIR_TYPE_TRANSITION; + sel_record.event_data1 = SEL_DATA1_CRITICAL; + break; + case OPAL_UNRECOVERABLE_ERR_GENERAL: /* Fall through */ + case OPAL_UNRECOVERABLE_ERR_DEGRADE_PERF: + case OPAL_UNRECOVERABLE_ERR_LOSS_REDUNDANCY: + case OPAL_UNRECOVERABLE_ERR_LOSS_REDUNDANCY_PERF: + case OPAL_UNRECOVERABLE_ERR_LOSS_OF_FUNCTION: + sel_record.event_dir_type = SEL_EVENT_DIR_TYPE_TRANSITION; + sel_record.event_data1 = SEL_DATA1_NON_RECOVERABLE; + break; + case OPAL_PREDICTIVE_ERR_GENERAL: /* Fall through */ + case OPAL_PREDICTIVE_ERR_DEGRADED_PERF: + case OPAL_PREDICTIVE_ERR_FAULT_RECTIFY_REBOOT: + case OPAL_PREDICTIVE_ERR_FAULT_RECTIFY_BOOT_DEGRADE_PERF: + case OPAL_PREDICTIVE_ERR_LOSS_OF_REDUNDANCY: + sel_record.event_dir_type = SEL_EVENT_DIR_TYPE_PREDICTIVE; + sel_record.event_data1 = SEL_DATA1_NON_CRIT_FROM_OK; + break; + case OPAL_RECOVERED_ERR_GENERAL: + sel_record.event_dir_type = SEL_EVENT_DIR_TYPE_TRANSITION; + sel_record.event_data1 = SEL_DATA1_OK; + break; + case OPAL_INFO: + sel_record.event_dir_type = SEL_EVENT_DIR_TYPE_TRANSITION; + sel_record.event_data1 = SEL_DATA1_INFORMATIONAL; + break; + default: + sel_record.event_dir_type = SEL_EVENT_DIR_TYPE_STATE; + sel_record.event_data1 = SEL_DATA1_ASSERTED; + break; + } +} + +static void ipmi_elog_error(struct ipmi_msg *msg) +{ + if (msg->cc == IPMI_LOST_ARBITRATION_ERR) + /* Retry due to SEL erase */ + ipmi_queue_msg(msg); + else { + opal_elog_complete(msg->user_data, false); + ipmi_sel_free_msg(msg); + } +} + +static void ipmi_log_sel_event_error(struct ipmi_msg *msg) +{ + if (msg->cc != 
IPMI_CC_NO_ERROR) + prlog(PR_INFO, "SEL: Failed to log SEL event\n"); + + ipmi_sel_free_msg(msg); +} + +static void ipmi_log_sel_event_complete(struct ipmi_msg *msg) +{ + prlog(PR_INFO, "SEL: New event logged [ID : %x%x]\n", msg->data[1], + msg->data[0]); + + ipmi_sel_free_msg(msg); +} + +/* Log SEL event with eSEL record ID */ +static void ipmi_log_sel_event(struct ipmi_msg *msg, uint8_t event_severity, + uint16_t esel_record_id) +{ + /* Fill required SEL event fields */ + ipmi_update_sel_record(event_severity, esel_record_id); + + /* Fill IPMI message */ + ipmi_init_msg(msg, IPMI_DEFAULT_INTERFACE, IPMI_ADD_SEL_EVENT, + ipmi_log_sel_event_complete, NULL, + sizeof(struct sel_record), 2); + + /* Copy SEL data */ + memcpy(msg->data, &sel_record, sizeof(struct sel_record)); + + msg->error = ipmi_log_sel_event_error; + ipmi_queue_msg_head(msg); +} + +/* Goes through the required steps to add a complete eSEL: + * + * 1. Get a reservation + * 2. Add eSEL header + * 3. Partially add data to the SEL + * + * Because a reservation is needed we need to ensure eSEL's are added + * as a single transaction as concurrent/interleaved adds would cancel + * the reservation. We guarantee this by always adding our messages to + * the head of the transmission queue, blocking any other messages + * being sent until we have completed sending this message. + * + * There is still a very small chance that we will accidentally + * interleave a message if there is another one waiting at the head of + * the ipmi queue and another cpu calls the ipmi poller before we + * complete. However this should just cause a resevation cancelled + * error which we have to deal with anyway (eg. because there may be a + * SEL erase in progress) so it shouldn't cause any problems. + */ +static void ipmi_elog_poll(struct ipmi_msg *msg) +{ + static bool first = false; + static char pel_buf[IPMI_MAX_PEL_SIZE]; + static size_t pel_size; + static size_t esel_size; + static int esel_index = 0; + int pel_index; + static unsigned int reservation_id = 0; + static unsigned int record_id = 0; + struct errorlog *elog_buf = (struct errorlog *) msg->user_data; + size_t req_size; + + if (bmc_platform->sw->ipmi_oem_partial_add_esel == 0) { + prlog(PR_WARNING, "Dropped eSEL: BMC code is buggy/missing\n"); + ipmi_sel_free_msg(msg); + return; + } + + ipmi_init_esel_record(); + if (msg->cmd == IPMI_CMD(IPMI_RESERVE_SEL)) { + first = true; + reservation_id = msg->data[0]; + reservation_id |= msg->data[1] << 8; + if (!reservation_id) { + /* + * According to specification we should never + * get here, but just in case we do we cancel + * sending the message. + */ + prerror("Invalid reservation id"); + opal_elog_complete(elog_buf, false); + ipmi_sel_free_msg(msg); + return; + } + + pel_size = create_pel_log(elog_buf, pel_buf, IPMI_MAX_PEL_SIZE); + esel_size = pel_size + sizeof(struct sel_record); + esel_index = 0; + record_id = 0; + } else { + record_id = msg->data[0]; + record_id |= msg->data[1] << 8; + } + + /* Start or continue the IPMI_PARTIAL_ADD_SEL */ + if (esel_index >= esel_size) { + /* + * We're all done. Invalidate the resevation id to + * ensure we get an error if we cut in on another eSEL + * message. 
+ */ + reservation_id = 0; + esel_index = 0; + + /* Log SEL event and free ipmi message */ + ipmi_log_sel_event(msg, elog_buf->event_severity, record_id); + + opal_elog_complete(elog_buf, true); + return; + } + + if ((esel_size - esel_index) <= (IPMI_MAX_REQ_SIZE - ESEL_HDR_SIZE)) { + /* Last data to send */ + msg->data[6] = 1; + req_size = esel_size - esel_index + ESEL_HDR_SIZE; + } else { + msg->data[6] = 0; + req_size = IPMI_MAX_REQ_SIZE; + } + + ipmi_init_msg(msg, IPMI_DEFAULT_INTERFACE, + bmc_platform->sw->ipmi_oem_partial_add_esel, + ipmi_elog_poll, elog_buf, req_size, 2); + + msg->data[0] = reservation_id & 0xff; + msg->data[1] = (reservation_id >> 8) & 0xff; + msg->data[2] = record_id & 0xff; + msg->data[3] = (record_id >> 8) & 0xff; + msg->data[4] = esel_index & 0xff; + msg->data[5] = (esel_index >> 8) & 0xff; + + if (first) { + first = false; + memcpy(&msg->data[ESEL_HDR_SIZE], &sel_record, + sizeof(struct sel_record)); + esel_index = sizeof(struct sel_record); + msg->req_size = esel_index + ESEL_HDR_SIZE; + } else { + pel_index = esel_index - sizeof(struct sel_record); + memcpy(&msg->data[ESEL_HDR_SIZE], &pel_buf[pel_index], + msg->req_size - ESEL_HDR_SIZE); + esel_index += msg->req_size - ESEL_HDR_SIZE; + } + + ipmi_queue_msg_head(msg); + return; +} + +int ipmi_elog_commit(struct errorlog *elog_buf) +{ + struct ipmi_msg *msg; + + /* Only log events that needs attention */ + if (elog_buf->event_severity < + OPAL_PREDICTIVE_ERR_FAULT_RECTIFY_REBOOT || + elog_buf->elog_origin != ORG_SAPPHIRE) { + prlog(PR_INFO, "dropping non severe PEL event\n"); + opal_elog_complete(elog_buf, true); + return 0; + } + + /* + * We pass a large request size in to mkmsg so that we have a + * large enough allocation to reuse the message to pass the + * PEL data via a series of partial add commands. + */ + msg = ipmi_sel_alloc_msg(elog_buf); + if (!msg) { + opal_elog_complete(elog_buf, false); + return OPAL_RESOURCE; + } + + msg->error = ipmi_elog_error; + msg->req_size = 0; + if (elog_buf->event_severity == OPAL_ERROR_PANIC) { + ipmi_queue_msg_sync(msg); + + /* + * eSEL logs are split into multiple smaller chunks and sent + * to BMC. Lets wait until we finish sending all the chunks + * to BMC. + */ + while (ipmi_sel_panic_msg.busy != false) { + if (msg->backend->poll) + msg->backend->poll(); + time_wait_ms(10); + } + } else { + ipmi_queue_msg(msg); + } + + return 0; +} + +#define ACCESS_DENIED 0x00 +#define ACCESS_GRANTED 0x01 + +static void sel_pnor(uint8_t access, void *context __unused) +{ + struct ipmi_msg *msg; + uint8_t granted = ACCESS_GRANTED; + + switch (access) { + case REQUEST_PNOR: + prlog(PR_NOTICE, "PNOR access requested\n"); + if (bmc_platform->sw->ipmi_oem_pnor_access_status == 0) { + /** + * @fwts-label PNORAccessYeahButNoBut + * @fwts-advice OPAL doesn't know that the BMC supports + * PNOR access commands. This will be a bug in the OPAL + * support for this BMC. + */ + prlog(PR_ERR, "PNOR BUG: access requested but BMC doesn't support request\n"); + break; + } + + granted = flash_reserve(); + if (granted) + occ_pnor_set_owner(PNOR_OWNER_EXTERNAL); + /* Ack the request */ + msg = ipmi_mkmsg_simple(bmc_platform->sw->ipmi_oem_pnor_access_status, &granted, 1); + ipmi_queue_msg(msg); + break; + case RELEASE_PNOR: + prlog(PR_NOTICE, "PNOR access released\n"); + flash_release(); + occ_pnor_set_owner(PNOR_OWNER_HOST); + break; + default: + /** + * @fwts-label InvalidPNORAccessRequest + * @fwts-advice In negotiating PNOR access with BMC, we + * got an odd/invalid request from the BMC. 
Likely a bug + * in OPAL/BMC interaction. + */ + prlog(PR_ERR, "invalid PNOR access requested: %02x\n", + access); + } +} + +static void sel_power(uint8_t power, void *context __unused) +{ + switch (power) { + case SOFT_OFF: + prlog(PR_NOTICE, "Soft shutdown requested\n"); + if (opal_booting() && platform.cec_power_down) { + prlog(PR_NOTICE, "Host not up, shutting down now\n"); + platform.cec_power_down(IPMI_CHASSIS_PWR_DOWN); + } else { + opal_queue_msg(OPAL_MSG_SHUTDOWN, NULL, NULL, + cpu_to_be64(SOFT_OFF)); + } + + break; + case SOFT_REBOOT: + prlog(PR_NOTICE, "Soft reboot requested\n"); + if (opal_booting() && platform.cec_reboot) { + prlog(PR_NOTICE, "Host not up, rebooting now\n"); + platform.cec_reboot(); + } else { + opal_queue_msg(OPAL_MSG_SHUTDOWN, NULL, NULL, + cpu_to_be64(SOFT_REBOOT)); + } + + break; + default: + prlog(PR_WARNING, "requested bad power state: %02x\n", + power); + } +} + +static void sel_heartbeat(uint8_t heartbeat, void *context __unused) +{ + /* There is only one sub-command so no processing needed */ + prlog(PR_DEBUG, "BMC issued heartbeat command: %02x\n", + heartbeat); +} + +static uint32_t occ_sensor_id_to_chip(uint8_t sensor, uint32_t *chip) +{ + struct dt_node *node, *bmc_node, *sensors_node; + + /* Default chip id */ + *chip = 0; + + bmc_node = dt_find_by_name(dt_root, "bmc"); + if (!bmc_node) + return 0; + + sensors_node = dt_find_by_name(bmc_node, "sensors"); + if (!sensors_node) + return 0; + + node = dt_find_by_name_addr(sensors_node, "sensor", sensor); + if (!node) { + prlog(PR_DEBUG, "Could not find OCC sensor node. Id : %d\n", + (u32)sensor); + return 0; + } + + if (!dt_has_node_property(node, "ibm,chip-id", NULL)) { + prlog(PR_DEBUG, "Could not find chip-id for OCC sensor : %d\n", + (u32)sensor); + return 0; + } + + *chip = dt_get_chip_id(node); + return 0; +} + +static void sel_occ_reset(uint8_t sensor, void *context __unused) +{ + uint32_t chip; + int rc; + + rc = occ_sensor_id_to_chip(sensor, &chip); + if (rc) { + /** + * @fwts-label: SELUnknownOCCReset + * @fwts-advice: Likely bug in what sent us the OCC reset. 
+ */ + prlog(PR_ERR, "SEL message to reset an unknown OCC " + "(sensor ID 0x%02x)\n", sensor); + return; + } + + prd_occ_reset(chip); +} + +struct ipmi_sel_handler { + uint8_t oem_cmd; + void (*fn)(uint8_t data, void *context); + void *context; + struct list_node node; +}; + +int ipmi_sel_register(uint8_t oem_cmd, + void (*fn)(uint8_t data, void *context), + void *context) +{ + struct ipmi_sel_handler *handler; + + list_for_each(&sel_handlers, handler, node) { + if (handler->oem_cmd == oem_cmd) { + prerror("Handler for SEL command 0x%02x already registered\n", + oem_cmd); + return -EINVAL; + } + } + + handler = malloc(sizeof(*handler)); + if (!handler) + return -ENOMEM; + + handler->oem_cmd = oem_cmd; + handler->fn = fn; + handler->context = context; + + list_add(&sel_handlers, &handler->node); + + return 0; +} + +void ipmi_sel_init(void) +{ + int rc; + + /* Already done */ + if (ipmi_sel_panic_msg.msg != NULL) + return; + + memset(&ipmi_sel_panic_msg, 0, sizeof(struct ipmi_sel_panic_msg)); + ipmi_sel_panic_msg.msg = ipmi_mkmsg(IPMI_DEFAULT_INTERFACE, + IPMI_RESERVE_SEL, ipmi_elog_poll, + NULL, NULL, IPMI_MAX_REQ_SIZE, 2); + + /* Hackishly register these old-style handlers here for now */ + /* TODO: Move them to their appropriate source files */ + rc = ipmi_sel_register(CMD_AMI_POWER, sel_power, NULL); + if (rc < 0) { + prerror("Failed to register SEL handler for %s", + stringify(CMD_AMI_POWER)); + } + + rc = ipmi_sel_register(CMD_AMI_OCC_RESET, sel_occ_reset, NULL); + if (rc < 0) { + prerror("Failed to register SEL handler for %s", + stringify(CMD_AMI_OCC_RESET)); + } + + rc = ipmi_sel_register(CMD_AMI_PNOR_ACCESS, sel_pnor, NULL); + if (rc < 0) { + prerror("Failed to register SEL handler for %s", + stringify(CMD_AMI_PNOR_ACCESS)); + } + + rc = ipmi_sel_register(CMD_HEARTBEAT, sel_heartbeat, NULL); + if (rc < 0) { + prerror("Failed to register SEL handler for %s", + stringify(CMD_HEARTBEAT)); + } +} + +void ipmi_parse_sel(struct ipmi_msg *msg) +{ + struct ipmi_sel_handler *handler; + struct oem_sel sel; + + assert(msg->resp_size <= 16); + + memcpy(&sel, msg->data, msg->resp_size); + + /* We do not process system event records */ + if (sel.type == SEL_RECORD_TYPE_EVENT) { + prlog(PR_INFO, "dropping System Event Record SEL\n"); + return; + } + + prlog(PR_DEBUG, "SEL received (%d bytes, netfn %d, cmd %d)\n", + msg->resp_size, sel.netfun, sel.cmd); + + /* Only accept OEM SEL messages */ + if (sel.id[0] != SEL_OEM_ID_0 || sel.id[1] != SEL_OEM_ID_1 || + sel.type != SEL_RECORD_TYPE_OEM) { + prlog(PR_WARNING, "unknown SEL %02x%02x (type %02x)\n", + sel.id[0], sel.id[1], sel.type); + return; + } + + list_for_each(&sel_handlers, handler, node) { + if (handler->oem_cmd == sel.cmd) { + handler->fn(sel.data[0], handler->context); + return; + } + } + + prlog(PR_WARNING, "unknown OEM SEL command %02x received\n", sel.cmd); +} diff --git a/roms/skiboot/hw/ipmi/ipmi-sensor.c b/roms/skiboot/hw/ipmi/ipmi-sensor.c new file mode 100644 index 000000000..857b789e4 --- /dev/null +++ b/roms/skiboot/hw/ipmi/ipmi-sensor.c @@ -0,0 +1,160 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* Copyright 2013-2017 IBM Corp. 
*/ + +#include <device.h> +#include <ipmi.h> +#include <opal.h> +#include <skiboot.h> +#include <string.h> +#include <stdbool.h> + +#define IPMI_WRITE_SENSOR (1 << 0) + +#define FW_PROGRESS_SENSOR_TYPE 0x0F +#define BOOT_COUNT_SENSOR_TYPE 0xC3 + +static int16_t sensors[MAX_IPMI_SENSORS]; + +static bool sensors_present = false; + +struct set_sensor_req { + u8 sensor_number; + u8 operation; + u8 sensor_reading; + u8 assertion_mask[2]; + u8 deassertion_mask[2]; + u8 event_data[3]; +}; + +static bool ipmi_sensor_type_present(uint8_t sensor_type) +{ + const struct dt_property *type_prop; + uint8_t type; + struct dt_node *node; + + dt_for_each_compatible(dt_root, node, "ibm,ipmi-sensor") { + type_prop = dt_find_property(node, "ipmi-sensor-type"); + if (!type_prop) { + prlog(PR_ERR, "IPMI: sensor doesn't have ipmi-sensor-type\n"); + continue; + } + + type = (uint8_t)dt_property_get_cell(type_prop, 0); + if (type == sensor_type) + return true; + } + return false; +} + +uint8_t ipmi_get_sensor_number(uint8_t sensor_type) +{ + assert(sensor_type < MAX_IPMI_SENSORS); + return sensors[sensor_type]; +} + +int ipmi_set_boot_count(void) +{ + struct set_sensor_req req; + struct ipmi_msg *msg; + int boot_count_sensor; + + if (!sensors_present) + return OPAL_UNSUPPORTED; + + if (!ipmi_present()) + return OPAL_CLOSED; + + if (!ipmi_sensor_type_present(BOOT_COUNT_SENSOR_TYPE)) + return OPAL_HARDWARE; + + boot_count_sensor = sensors[BOOT_COUNT_SENSOR_TYPE]; + + if (boot_count_sensor < 0) { + prlog(PR_DEBUG, "IPMI: boot count set but not present\n"); + return OPAL_HARDWARE; + } + + memset(&req, 0, sizeof(req)); + + req.sensor_number = boot_count_sensor; + req.operation = IPMI_WRITE_SENSOR; + req.sensor_reading = 0x00; + req.assertion_mask[0] = 0x02; + + msg = ipmi_mkmsg_simple(IPMI_SET_SENSOR_READING, &req, sizeof(req)); + if (!msg) + return OPAL_HARDWARE; + + printf("IPMI: Resetting boot count on successful boot\n"); + + return ipmi_queue_msg(msg); +} + +int ipmi_set_fw_progress_sensor(uint8_t state) +{ + struct ipmi_msg *msg; + struct set_sensor_req request; + int fw_sensor_num; + + if (!sensors_present) + return OPAL_UNSUPPORTED; + + if (!ipmi_present()) + return OPAL_CLOSED; + + if (!ipmi_sensor_type_present(FW_PROGRESS_SENSOR_TYPE)) + return OPAL_HARDWARE; + + fw_sensor_num = sensors[FW_PROGRESS_SENSOR_TYPE]; + + if (fw_sensor_num < 0) { + prlog(PR_DEBUG, "IPMI: fw progress set but not present\n"); + return OPAL_HARDWARE; + } + + memset(&request, 0, sizeof(request)); + + request.sensor_number = fw_sensor_num; + request.operation = 0xa0; /* Set event data bytes, assertion bits */ + request.assertion_mask[0] = 0x04; /* Firmware progress offset */ + request.event_data[0] = 0xc2; + request.event_data[1] = state; + + prlog(PR_INFO, "IPMI: setting fw progress sensor %02x to %02x\n", + request.sensor_number, request.event_data[1]); + + msg = ipmi_mkmsg_simple(IPMI_SET_SENSOR_READING, &request, + sizeof(request)); + if (!msg) + return OPAL_HARDWARE; + + return ipmi_queue_msg(msg); +} + +void ipmi_sensor_init(void) +{ + const struct dt_property *type_prop, *num_prop; + uint8_t num, type; + struct dt_node *n; + + memset(sensors, -1, sizeof(sensors)); + + dt_for_each_compatible(dt_root, n, "ibm,ipmi-sensor") { + type_prop = dt_find_property(n, "ipmi-sensor-type"); + if (!type_prop) { + prerror("IPMI: sensor doesn't have ipmi-sensor-type\n"); + continue; + } + + num_prop = dt_find_property(n, "reg"); + if (!num_prop) { + prerror("IPMI: sensor doesn't have reg property\n"); + continue; + } + num = 
(uint8_t)dt_property_get_cell(num_prop, 0); + type = (uint8_t)dt_property_get_cell(type_prop, 0); + assert(type < MAX_IPMI_SENSORS); + sensors[type] = num; + } + sensors_present = true; +} diff --git a/roms/skiboot/hw/ipmi/ipmi-watchdog.c b/roms/skiboot/hw/ipmi/ipmi-watchdog.c new file mode 100644 index 000000000..dc0a9e5b4 --- /dev/null +++ b/roms/skiboot/hw/ipmi/ipmi-watchdog.c @@ -0,0 +1,218 @@ +// SPDX-License-Identifier: Apache-2.0 +/* + * Copyright 2013-2018 IBM Corp. + * Copyright 2018 Google Corp. + */ + +#include <stdlib.h> +#include <ipmi.h> +#include <lock.h> +#include <opal.h> +#include <device.h> +#include <timer.h> +#include <timebase.h> +#include <pool.h> +#include <skiboot.h> + +#define TIMER_USE_DONT_LOG 0x80 +#define TIMER_USE_DONT_STOP 0x40 +#define TIMER_USE_POST 0x02 + +/* WDT expiration actions */ +#define WDT_PRETIMEOUT_SMI 0x10 +#define WDT_RESET_ACTION 0x01 +#define WDT_NO_ACTION 0x00 + +/* IPMI defined custom completion codes for the watchdog */ +#define WDT_CC_OK 0x00 +#define WDT_CC_NOT_INITIALIZED 0x80 + +/* Flags used for IPMI callbacks */ +#define WDT_SET_DO_RESET 0x01 +#define WDT_RESET_NO_REINIT 0x01 + +/* How long to set the overall watchdog timeout for. In units of + * 100ms. If the timer is not reset within this time the watchdog + * expiration action will occur. */ +#define WDT_TIMEOUT 600 + +/* How often to reset the timer using schedule_timer(). Too short and +we risk accidentally resetting the system due to opal_run_pollers() not +being called in time, too short and we waste time resetting the wdt +more frequently than necessary. */ +#define WDT_MARGIN 300 + +static struct timer wdt_timer; +static bool wdt_stopped; +static bool wdt_ticking; + +/* Saved values from the last watchdog set action */ +static uint8_t last_action; +static uint16_t last_count; +static uint8_t last_pretimeout; + +static void reset_wdt(struct timer *t, void *data, uint64_t now); + +static void set_wdt_complete(struct ipmi_msg *msg) +{ + const uintptr_t flags = (uintptr_t)msg->user_data; + + if (flags & WDT_SET_DO_RESET) { + /* Make sure the reset action does not create a loop and + * perform a reset in the case where the BMC send an + * uninitialized error. */ + reset_wdt(NULL, (void *)WDT_RESET_NO_REINIT, 0); + } + + ipmi_free_msg(msg); +} + +static void set_wdt(uint8_t action, uint16_t count, uint8_t pretimeout, + bool dont_stop, bool do_reset) +{ + struct ipmi_msg *ipmi_msg; + uintptr_t completion_flags = 0; + + if (do_reset) + completion_flags |= WDT_SET_DO_RESET; + + /* Save the values prior to issuing the set operation so that we can + * re-initialize the watchdog in error cases. */ + last_action = action; + last_count = count; + last_pretimeout = pretimeout; + + ipmi_msg = ipmi_mkmsg(IPMI_DEFAULT_INTERFACE, IPMI_SET_WDT, + set_wdt_complete, NULL, NULL, 6, 0); + if (!ipmi_msg) { + prerror("Unable to allocate set wdt message\n"); + return; + } + ipmi_msg->error = set_wdt_complete; + ipmi_msg->user_data = (void *)completion_flags; + ipmi_msg->data[0] = TIMER_USE_POST | + TIMER_USE_DONT_LOG | + (dont_stop ? 
TIMER_USE_DONT_STOP : 0); + ipmi_msg->data[1] = action; /* Timer Actions */ + ipmi_msg->data[2] = pretimeout; /* Pre-timeout Interval */ + ipmi_msg->data[3] = 0; /* Timer Use Flags */ + ipmi_msg->data[4] = count & 0xff; /* Initial countdown (lsb) */ + ipmi_msg->data[5] = (count >> 8) & 0xff; /* Initial countdown (msb) */ + ipmi_queue_msg(ipmi_msg); +} + +static void reset_wdt_complete(struct ipmi_msg *msg) +{ + const uintptr_t flags = (uintptr_t)msg->user_data; + uint64_t reset_delay_ms = (WDT_TIMEOUT - WDT_MARGIN) * 100; + + if (msg->cc == WDT_CC_NOT_INITIALIZED && + !(flags & WDT_RESET_NO_REINIT)) { + /* If our timer was not initialized on the BMC side, we should + * perform a single attempt to set it up again. */ + set_wdt(last_action, last_count, last_pretimeout, true, true); + } else if (msg->cc != WDT_CC_OK) { + /* Use a short (10s) timeout before performing the next reset + * if we encounter an unknown error. This makes sure that we + * are able to reset and re-initialize the timer since it might + * expire. */ + reset_delay_ms = 10 * 1000; + } + + /* If we are inside of skiboot we need to periodically restart the + * timer. Reschedule a reset so it happens before the timeout. */ + if (wdt_ticking) + schedule_timer(&wdt_timer, msecs_to_tb(reset_delay_ms)); + + ipmi_free_msg(msg); +} + +static struct ipmi_msg *wdt_reset_mkmsg(void) +{ + struct ipmi_msg *ipmi_msg; + + ipmi_msg = ipmi_mkmsg(IPMI_DEFAULT_INTERFACE, IPMI_RESET_WDT, + reset_wdt_complete, NULL, NULL, 0, 0); + if (!ipmi_msg) { + prerror("Unable to allocate reset wdt message\n"); + return NULL; + } + ipmi_msg->error = reset_wdt_complete; + + return ipmi_msg; +} + +static void sync_reset_wdt(void) +{ + struct ipmi_msg *ipmi_msg; + + if ((ipmi_msg = wdt_reset_mkmsg())) + ipmi_queue_msg_sync(ipmi_msg); +} + +static void reset_wdt(struct timer *t __unused, void *data, + uint64_t now __unused) +{ + struct ipmi_msg *ipmi_msg; + + if ((ipmi_msg = wdt_reset_mkmsg())) { + ipmi_msg->user_data = data; + ipmi_queue_msg_head(ipmi_msg); + } +} + +void ipmi_wdt_stop(void) +{ + if (!wdt_stopped) { + /* Make sure the background reset timer is disabled before + * stopping the watchdog. If we issue a reset after disabling + * the timer, it will be re-enabled. */ + wdt_ticking = false; + cancel_timer(&wdt_timer); + + /* Configure the watchdog to be disabled and do no action + * in case the underlying implementation is buggy and times + * out anyway. */ + wdt_stopped = true; + set_wdt(WDT_NO_ACTION, 100, 0, false, false); + } +} + +void ipmi_wdt_final_reset(void) +{ + /* We can safely stop the timer prior to setting up our final + * watchdog timeout since we have enough margin before the + * timeout. */ + wdt_ticking = false; + cancel_timer(&wdt_timer); + + /* + * We're going to wait a little while before requiring + * BOOTKERNEL to have IPMI watchdog support so that people + * can catch up in their development environments. + * If you still read this after 2018, send a patch! + */ +#if 0 + /* Configure the watchdog and make sure it is still enabled */ + set_wdt(WDT_RESET_ACTION | WDT_PRETIMEOUT_SMI, WDT_TIMEOUT, + WDT_MARGIN/10, true, true); + sync_reset_wdt(); +#else + set_wdt(WDT_NO_ACTION, 100, 0, false, false); +#endif + ipmi_set_boot_count(); +} + +void ipmi_wdt_init(void) +{ + init_timer(&wdt_timer, reset_wdt, NULL); + set_wdt(WDT_RESET_ACTION, WDT_TIMEOUT, 0, true, false); + + /* Start the WDT. We do it synchronously to make sure it has + * started before skiboot continues booting. 
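
As an aside on the message built by set_wdt() above, here is a minimal standalone sketch (constant names copied from the code, the program itself is hypothetical) of how the six data bytes of the IPMI Set Watchdog Timer request are packed: timer-use flags, expiration action, pre-timeout in seconds, expiration-flag clear mask, then the initial countdown in 100 ms units, least significant byte first.

#include <stdint.h>
#include <stdio.h>

#define TIMER_USE_DONT_LOG	0x80
#define TIMER_USE_DONT_STOP	0x40
#define TIMER_USE_POST		0x02
#define WDT_RESET_ACTION	0x01

/* Pack the 6-byte body of an IPMI Set Watchdog Timer request. */
static void pack_set_wdt(uint8_t data[6], uint8_t action, uint16_t count,
			 uint8_t pretimeout, int dont_stop)
{
	data[0] = TIMER_USE_POST | TIMER_USE_DONT_LOG |
		  (dont_stop ? TIMER_USE_DONT_STOP : 0);
	data[1] = action;		/* expiration action */
	data[2] = pretimeout;		/* pre-timeout interval, seconds */
	data[3] = 0;			/* timer-use expiration flags to clear */
	data[4] = count & 0xff;		/* initial countdown, LSB, 100ms units */
	data[5] = (count >> 8) & 0xff;	/* initial countdown, MSB */
}

int main(void)
{
	uint8_t d[6];
	int i;

	/* 600 * 100ms = 60s timeout, hard reset on expiry, don't stop on set */
	pack_set_wdt(d, WDT_RESET_ACTION, 600, 0, 1);
	for (i = 0; i < 6; i++)
		printf("data[%d] = 0x%02x\n", i, d[i]);
	return 0;
}
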
Otherwise we + * could crash before the wdt has actually been started. */ + wdt_ticking = true; + sync_reset_wdt(); + + return; +} diff --git a/roms/skiboot/hw/ipmi/test/Makefile.check b/roms/skiboot/hw/ipmi/test/Makefile.check new file mode 100644 index 000000000..ceed1ed39 --- /dev/null +++ b/roms/skiboot/hw/ipmi/test/Makefile.check @@ -0,0 +1,34 @@ +# -*-Makefile-*- +IPMI_TEST := hw/ipmi/test/run-fru + +LCOV_EXCLUDE += $(IPMI_TEST:%=%.c) + +.PHONY : hw-ipmi-check hw-ipmi-coverage +hw-ipmi-check: $(IPMI_TEST:%=%-check) +hw-ipmi-coverage: $(IPMI_TEST:%=%-gcov-run) + +check: hw-ipmi-check +coverage: hw-ipmi-coverage + +$(IPMI_TEST:%=%-gcov-run) : %-run: % + $(call Q, TEST-COVERAGE ,$< , $<) + +$(IPMI_TEST:%=%-check) : %-check: % + $(call Q, RUN-TEST ,$(VALGRIND) $<, $<) + +$(IPMI_TEST) : % : %.c + $(call Q, HOSTCC ,$(HOSTCC) $(HOSTCFLAGS) -O0 -g -I include -I . -o $@ $<, $<) + +$(IPMI_TEST:%=%-gcov): %-gcov : %.c % + $(call Q, HOSTCC ,$(HOSTCC) $(HOSTCFLAGS) $(HOSTGCOVCFLAGS) -I include -I . -I libfdt -lgcov -o $@ $<, $<) + +$(IPMI_TEST:%=%-gcov): % : $(%.d:-gcov=) + +-include $(wildcard hw/ipmi/test/*.d) + +clean: ipmi-test-clean + +ipmi-test-clean: + $(RM) -f hw/ipmi/test/*.[od] $(IPMI_TEST) $(IPMI_TEST:%=%-gcov) + $(RM) -f *.gcda *.gcno skiboot.info + $(RM) -rf coverage-report diff --git a/roms/skiboot/hw/ipmi/test/run-fru.c b/roms/skiboot/hw/ipmi/test/run-fru.c new file mode 100644 index 000000000..fa79c98a1 --- /dev/null +++ b/roms/skiboot/hw/ipmi/test/run-fru.c @@ -0,0 +1,116 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* Copyright 2013-2019 IBM Corp. */ + +#include <unistd.h> +#include <sys/stat.h> +#include <fcntl.h> + +#define __TEST__ + +#include "../ipmi-fru.c" + +#include <string.h> + +int error = 0; + +const char version[] = "a-too-long-version-test-string-is-here"; + +void ipmi_free_msg(struct ipmi_msg __unused *msg) +{ +} + +void ipmi_init_msg(struct ipmi_msg __unused *msg, int __unused interface, + uint32_t __unused code, + void __unused (*complete)(struct ipmi_msg *), + void __unused *user_data, size_t __unused req_size, + size_t __unused resp_size) +{ +} + +struct ipmi_msg *ipmi_mkmsg(int __unused interface, uint32_t __unused code, + void __unused (*complete)(struct ipmi_msg *), + void __unused *user_data, void __unused *req_data, size_t __unused req_size, + size_t __unused resp_size) +{ + return NULL; +} + +int ipmi_queue_msg(struct ipmi_msg __unused *msg) +{ + return 0; +} + +void _prlog(int __unused log_level, const __unused char* fmt, ...) 
+{ + return; +} + +int main(void) +{ + u8 *buf; + int len; + struct product_info info = { + .manufacturer = (char *) "IBM", + .product = (char *) "skiboot", + .part_no = (char *) "hello", + .version = (char *) "12345", + .serial_no = (char *) "12345", + .asset_tag = (char *) "abcd", + }; + struct product_info invalid_info = { + .manufacturer = (char *) "I", + .product = (char *) "skiboot", + .part_no = (char *) "hello", + .version = (char *) "12345", + .serial_no = (char *) "12345", + .asset_tag = (char *) "abcd", + }; + struct product_info invalid_info2 = { + .manufacturer = (char *) "IBM", + .product = (char *) "skiboot", + .part_no = (char *) "this is a really long string that's more" + "than 32 characters, because it turns out that's invalid.", + .version = (char *) "12345", + .serial_no = (char *) "12345", + .asset_tag = (char *) "abcd", + }; + + buf = malloc(256); + + len = fru_fill_product_info(buf, &info, 40); + assert(len == 40); + assert(memcmp(buf, "\001\005\000\303IBM\307skiboot\305hello" + "\30512345\30512345\304abcd\301-",len) == 0); + + + /* Make sure the checksum is right */ + assert(!fru_checksum(buf, len)); + + /* This should fail (not enough space) */ + assert(fru_fill_product_info(buf, &info, 39) < 0); + + memset(buf, 0, 256); + len = fru_fill_product_info(buf, &invalid_info, 40); + assert(len == OPAL_PARAMETER); + + memset(buf, 0, 256); + len = fru_fill_product_info(buf, &invalid_info2, 256); + assert(len == OPAL_PARAMETER); + + memset(buf, 0, 256); + assert(fru_add(buf, 256) > 0); + assert(0 == memcmp(&buf[64], "\001\a\000\303IBM\307skiboot\300" + "\337a-too-long-version-test-string+\300\300\301" + "\0\0\0",54)); + + + memset(buf, 0, 256); + assert(fru_add(buf, 1) == OPAL_PARAMETER); + + memset(buf, 0, 256); + assert(fru_add(buf, 65) == OPAL_PARAMETER); + + free(buf); + + return 0; +} diff --git a/roms/skiboot/hw/lpc-mbox.c b/roms/skiboot/hw/lpc-mbox.c new file mode 100644 index 000000000..f5bb97ea4 --- /dev/null +++ b/roms/skiboot/hw/lpc-mbox.c @@ -0,0 +1,346 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* + * LPC MBOX + * + * Copyright 2017-2018 IBM Corp. 
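
The escaped byte strings asserted against in the FRU test above are easier to read with the IPMI FRU type/length encoding in mind: each field begins with a byte whose top two bits give the type (11b for 8-bit ASCII) and whose low six bits give the length, so \303 introduces the three-byte "IBM" and \301 marks the end of the area. A standalone sketch of that encoding (helper name hypothetical):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Encode one ASCII field as an IPMI FRU type/length byte plus data. */
static int fru_pack_ascii(uint8_t *buf, size_t space, const char *s)
{
	size_t len = strlen(s);

	if (len > 0x3f || len + 1 > space)
		return -1;		/* length field is only 6 bits */
	buf[0] = 0xc0 | len;		/* 11b = 8-bit ASCII, low bits = length */
	memcpy(buf + 1, s, len);
	return (int)(len + 1);
}

int main(void)
{
	uint8_t buf[32];
	int n = fru_pack_ascii(buf, sizeof(buf), "IBM");

	/* Prints 0xc3 / 4, matching the \303IBM prefix expected above */
	printf("type/length = 0x%02x, field size = %d\n", buf[0], n);
	return 0;
}
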
+ */ + +#define pr_fmt(fmt) "LPC-MBOX: " fmt + +#include <skiboot.h> +#include <lpc.h> +#include <console.h> +#include <opal.h> +#include <device.h> +#include <interrupts.h> +#include <processor.h> +#include <errorlog.h> +#include <trace.h> +#include <timebase.h> +#include <timer.h> +#include <cpu.h> +#include <chip.h> +#include <io.h> + +#include <lpc-mbox.h> + +#define MBOX_FLAG_REG 0x0f +#define MBOX_STATUS_0 0x10 +#define MBOX_STATUS_1 0x11 +#define MBOX_STATUS_1_ATTN (1 << 7) +#define MBOX_STATUS_1_RESP (1 << 5) +#define MBOX_BMC_CTRL 0x12 +#define MBOX_CTRL_INT_STATUS (1 << 7) +#define MBOX_CTRL_INT_MASK (1 << 1) +#define MBOX_CTRL_INT_PING (1 << 0) +#define MBOX_CTRL_INT_SEND (MBOX_CTRL_INT_PING | MBOX_CTRL_INT_MASK) +#define MBOX_HOST_CTRL 0x13 +#define MBOX_BMC_INT_EN_0 0x14 +#define MBOX_BMC_INT_EN_1 0x15 +#define MBOX_HOST_INT_EN_0 0x16 +#define MBOX_HOST_INT_EN_1 0x17 + +#define MBOX_MAX_QUEUE_LEN 5 + +struct mbox { + uint32_t base; + int queue_len; + bool irq_ok; + uint8_t seq; + struct timer poller; + void (*callback)(struct bmc_mbox_msg *msg, void *priv); + void *drv_data; + void (*attn)(uint8_t bits, void *priv); + void *attn_data; + struct lock lock; + uint8_t sequence; + unsigned long timeout; +}; + +static struct mbox mbox; + +/* + * MBOX accesses + */ + +static void bmc_mbox_outb(uint8_t val, uint8_t reg) +{ + lpc_outb(val, mbox.base + reg); +} + +static uint8_t bmc_mbox_inb(uint8_t reg) +{ + return lpc_inb(mbox.base + reg); +} + +static void bmc_mbox_recv_message(struct bmc_mbox_msg *msg) +{ + uint8_t *msg_data = (uint8_t *)msg; + int i; + + for (i = 0; i < BMC_MBOX_READ_REGS; i++) + msg_data[i] = bmc_mbox_inb(i); +} + +/* This needs work, don't write the data bytes that aren't needed */ +static void bmc_mbox_send_message(struct bmc_mbox_msg *msg) +{ + uint8_t *msg_data = (uint8_t *)msg; + int i; + + if (!lpc_ok()) + /* We're going to have to handle this better */ + prlog(PR_ERR, "LPC isn't ok\n"); + + for (i = 0; i < BMC_MBOX_WRITE_REGS; i++) + bmc_mbox_outb(msg_data[i], i); + + /* + * Don't touch the response byte - it's setup to generate an interrupt + * to the host (us) when written to, or the host status reg - we don't + * currently use it, or the BMC status reg - we're not allowed to. + */ + + /* Ping */ + prlog(PR_TRACE, "Sending BMC interrupt\n"); + bmc_mbox_outb(MBOX_CTRL_INT_SEND, MBOX_HOST_CTRL); +} + +int bmc_mbox_enqueue(struct bmc_mbox_msg *msg, unsigned int timeout_sec) +{ + if (!mbox.base) { + prlog(PR_CRIT, "Using MBOX without init!\n"); + return OPAL_WRONG_STATE; + } + + lock(&mbox.lock); + if (mbox.timeout) { + prlog(PR_DEBUG, "MBOX message already in flight\n"); + if (mftb() > mbox.timeout) { + prlog(PR_ERR, "In flight message dropped on the floor\n"); + } else { + unlock(&mbox.lock); + return OPAL_BUSY; + } + } + + mbox.timeout = mftb() + secs_to_tb(timeout_sec); + msg->seq = ++mbox.sequence; + + bmc_mbox_send_message(msg); + unlock(&mbox.lock); + + schedule_timer(&mbox.poller, mbox.irq_ok ? + TIMER_POLL : msecs_to_tb(MBOX_DEFAULT_POLL_MS)); + + return 0; +} + +static void mbox_poll(struct timer *t __unused, void *data __unused, + uint64_t now __unused) +{ + struct bmc_mbox_msg msg; + + if (!lpc_ok()) + return; + + /* + * This status bit being high means that someone touched the + * response byte (byte 13). 
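
The in-flight bookkeeping in bmc_mbox_enqueue() above is a plain timebase deadline: record "now plus timeout" when the message goes out and treat any later reading beyond that value as an expired message. A standalone sketch of the pattern, with a counter standing in for mftb() and an assumed 512 MHz timebase rate:

#include <stdint.h>
#include <stdio.h>

static uint64_t fake_tb;				/* stands in for mftb() */
static const uint64_t ticks_per_sec = 512000000ULL;	/* assumed timebase rate */

static uint64_t mftb_stub(void)
{
	return fake_tb;
}

int main(void)
{
	/* Message sent now with a 3 second timeout */
	uint64_t deadline = mftb_stub() + 3 * ticks_per_sec;

	fake_tb += 2 * ticks_per_sec;
	printf("expired after 2s? %s\n", mftb_stub() > deadline ? "yes" : "no");

	fake_tb += 2 * ticks_per_sec;
	printf("expired after 4s? %s\n", mftb_stub() > deadline ? "yes" : "no");
	return 0;
}
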
+ * There is probably a response for the previously sent commant + */ + lock(&mbox.lock); + if (bmc_mbox_inb(MBOX_STATUS_1) & MBOX_STATUS_1_RESP) { + /* W1C on that reg */ + bmc_mbox_outb(MBOX_STATUS_1_RESP, MBOX_STATUS_1); + + prlog(PR_INSANE, "Got a regular interrupt\n"); + + bmc_mbox_recv_message(&msg); + if (mbox.sequence != msg.seq) { + prlog(PR_ERR, "Got a response to a message we no longer care about\n"); + goto out_response; + } + + mbox.timeout = 0; + if (mbox.callback) + mbox.callback(&msg, mbox.drv_data); + else + prlog(PR_ERR, "Detected NULL callback for mbox message\n"); + } + +out_response: + + /* + * The BMC has touched byte 15 to get our attention as it has + * something to tell us. + */ + if (bmc_mbox_inb(MBOX_STATUS_1) & MBOX_STATUS_1_ATTN) { + uint8_t action, all; + + /* W1C on that reg */ + bmc_mbox_outb(MBOX_STATUS_1_ATTN, MBOX_STATUS_1); + + all = action = bmc_mbox_inb(MBOX_FLAG_REG); + prlog(PR_TRACE, "Got a status register interrupt with action 0x%02x\n", + action); + if (action & MBOX_ATTN_BMC_REBOOT) { + /* + * It's unlikely that something needs to be done at the + * driver level. Let libflash deal with it. + * Print something just in case, it is quite a signficant + * event. + */ + prlog(PR_WARNING, "BMC reset detected\n"); + action &= ~MBOX_ATTN_BMC_REBOOT; + } + + if (action & MBOX_ATTN_BMC_WINDOW_RESET) + action &= ~MBOX_ATTN_BMC_WINDOW_RESET; + + if (action & MBOX_ATTN_BMC_FLASH_LOST) + action &= ~MBOX_ATTN_BMC_FLASH_LOST; + + if (action & MBOX_ATTN_BMC_DAEMON_READY) + action &= ~MBOX_ATTN_BMC_DAEMON_READY; + + if (action) + prlog(PR_ERR, "Got a status bit set that don't know about: 0x%02x\n", + action); + + mbox.attn(all, mbox.attn_data); + } + + unlock(&mbox.lock); + + schedule_timer(&mbox.poller, + mbox.irq_ok ? TIMER_POLL : msecs_to_tb(MBOX_DEFAULT_POLL_MS)); +} + +static void mbox_irq(uint32_t chip_id __unused, uint32_t irq_mask __unused) +{ + mbox.irq_ok = true; + mbox_poll(NULL, NULL, 0); +} + +static struct lpc_client mbox_lpc_client = { + .interrupt = mbox_irq, +}; + +static bool mbox_init_hw(void) +{ + /* Disable all status interrupts except attentions */ + bmc_mbox_outb(0x00, MBOX_HOST_INT_EN_0); + bmc_mbox_outb(MBOX_STATUS_1_ATTN, MBOX_HOST_INT_EN_1); + + /* Cleanup host interrupt and status */ + bmc_mbox_outb(MBOX_CTRL_INT_STATUS, MBOX_HOST_CTRL); + + /* Disable host control interrupt for now (will be + * re-enabled when needed). Clear BMC interrupts + */ + bmc_mbox_outb(MBOX_CTRL_INT_MASK, MBOX_BMC_CTRL); + + return true; +} + +int bmc_mbox_register_callback(void (*callback)(struct bmc_mbox_msg *msg, void *priv), + void *drv_data) +{ + mbox.callback = callback; + mbox.drv_data = drv_data; + return 0; +} + +int bmc_mbox_register_attn(void (*callback)(uint8_t bits, void *priv), + void *drv_data) +{ + mbox.attn = callback; + mbox.attn_data = drv_data; + return 0; +} + +uint8_t bmc_mbox_get_attn_reg(void) +{ + return bmc_mbox_inb(MBOX_FLAG_REG); +} + +void mbox_init(void) +{ + const struct dt_property *prop; + struct dt_node *np; + uint32_t irq, chip_id; + + if (mbox.base) { + prlog(PR_ERR, "Duplicate call to mbox_init()\n"); + return; + } + + prlog(PR_DEBUG, "Attempting mbox init\n"); + np = dt_find_compatible_node(dt_root, NULL, "mbox"); + if (!np) { + /* Only an ERROR on P9 and above, otherwise just + * a warning for someone doing development + */ + prlog((proc_gen <= proc_gen_p8) ? 
PR_DEBUG : PR_ERR, + "No device tree entry\n"); + return; + } + + /* Read the interrupts property if any */ + irq = dt_prop_get_u32_def(np, "interrupts", 0); + if (!irq) { + prlog(PR_ERR, "No interrupts property\n"); + return; + } + + if (!lpc_present()) { + prlog(PR_ERR, "LPC not present\n"); + return; + } + + /* Get IO base */ + prop = dt_find_property(np, "reg"); + if (!prop) { + prlog(PR_ERR, "Can't find reg property\n"); + return; + } + if (dt_property_get_cell(prop, 0) != OPAL_LPC_IO) { + prlog(PR_ERR, "Only supports IO addresses\n"); + return; + } + mbox.base = dt_property_get_cell(prop, 1); + + if (!mbox_init_hw()) { + prlog(PR_DEBUG, "Couldn't init HW\n"); + return; + } + + /* Disable the standard interrupt we don't care */ + bmc_mbox_outb(MBOX_CTRL_INT_MASK, MBOX_HOST_CTRL); + + /* Clear the status reg bits that we intend to use for interrupts */ + /* W1C */ + bmc_mbox_outb(MBOX_STATUS_1_RESP | MBOX_STATUS_1_ATTN, MBOX_STATUS_1); + + mbox.queue_len = 0; + mbox.callback = NULL; + mbox.drv_data = NULL; + mbox.timeout = 0; + mbox.sequence = 0; + init_lock(&mbox.lock); + + init_timer(&mbox.poller, mbox_poll, NULL); + + chip_id = dt_get_chip_id(np); + mbox_lpc_client.interrupts = LPC_IRQ(irq); + lpc_register_client(chip_id, &mbox_lpc_client, IRQ_ATTR_TARGET_OPAL); + + /* Enable interrupts */ + bmc_mbox_outb(MBOX_STATUS_1_ATTN | MBOX_STATUS_1_RESP, MBOX_HOST_INT_EN_1); + + prlog(PR_DEBUG, "Enabled on chip %d, IO port 0x%x, IRQ %d\n", + chip_id, mbox.base, irq); +} + + diff --git a/roms/skiboot/hw/lpc-port80h.c b/roms/skiboot/hw/lpc-port80h.c new file mode 100644 index 000000000..0d1fee99e --- /dev/null +++ b/roms/skiboot/hw/lpc-port80h.c @@ -0,0 +1,173 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* + * op_display() but over the 1 byte LPC port 80h just like an original IBM PC + * + * Copyright 2018-2019 IBM Corp. + */ + +#define pr_fmt(fmt) "Port80h: " fmt + +#include <lpc.h> +#include <op-panel.h> +#include <chip.h> + +/* + * Convert our detailed op_display() call into 1 byte for LPC port 80h + * + * Our layout looks like this: + * MSB (bit 7): 1 = Comes from OPAL + * bit 6 : 0 = OP_MOD_INIT (the main one), 1 = (see bit 5) + * bit 5432 : (if bit 6=0, low nibble of op-panel code) + * bit 5432 : (if bit 6=1, other OP_MOD_ values in bits 54: + * 00b=OP_MOD_CPU, 01b=OP_MOD_LOCK, + * 10b=OP_MOD_MEM, 11b=OP_MOD_CHIPTOD + * bits 0,1 from code in bits 32) + * + * bit 1,0: 00b=OP_LOG, 10b=OP_WARN, 01b=OP_ERROR, 11b=OP_FATAL + * i.e. bit 0 indicates ERROR or FATAL. + * + * If port 80h number has the MSB and LSB set, then you died in OPAL. + * Any *odd* number with the MSB set (i.e. > 0x80) indicates error. + */ +static inline uint8_t op_display_to_port80(uint8_t last_value, enum op_severity s, enum op_module m, uint16_t c) +{ + uint8_t r = 0x80; /* Start with top bit set indicating in OPAL */ + + switch(m) { + case OP_MOD_INIT: + /* bit 6 is zero */ + /* bits 5432 have low nibble of c */ + r |= (c & 0x0f) << 2; + break; + case OP_MOD_CPU: + r |= 0x40 | (c & 0x03) << 2; + break; + case OP_MOD_LOCK: + r |= 0x50 | (c & 0x03) << 2; + break; + case OP_MOD_MEM: + r |= 0x60 | (c & 0x03) << 2; + break; + case OP_MOD_CHIPTOD: + r |= 0x70 | (c & 0x03) << 2; + break; + case OP_MOD_CORE: + /* + * Only current OP_MOD_CORE is where we're OP_FATAL, + * So let's go for the last value set and tweak the + * bits for OP_FATAL. + */ + r = last_value & 0xFC; + break; + case OP_MOD_FSP: + case OP_MOD_FSPCON: + /* Should never be hit, port80h only used on non-FSP! 
*/ + break; + } + + switch(s) { + case OP_LOG: + break; + case OP_WARN: + r |= 0x02; + break; + case OP_ERROR: + r |= 0x01; + break; + case OP_FATAL: + r |= 0x03; + } + + return r; +} + +/* + * Convert our detailed op_display() call into 2 bytes for LPC port 81h and 82h + * + * This looks pretty similar to our port80 code. + * Notably we now have more bits to throw progress into. + * + * Our layout looks like this: + * MSB (bit 15): 1 = Comes from OPAL + * bit 14 : 0 = OP_MOD_INIT (the main one), 1 = (see bit 13) + * bits 13-2 : (if bit 6=0, low 12 bits of op-panel code) + * bit 13,12 : (if bit 6=1, other OP_MOD_ values in bits 13 and 12: + * 00b=OP_MOD_CPU, 01b=OP_MOD_LOCK, + * 10b=OP_MOD_MEM, 11b=OP_MOD_CHIPTOD) + * and bits 11-2 are low 10 bits of op-panel code) + * + * bit 1,0: 00b=OP_LOG, 10b=OP_WARN, 01b=OP_ERROR, 11b=OP_FATAL + * i.e. bit 0 indicates ERROR or FATAL. + * + * If port 80h number has the MSB and LSB set, then you died in OPAL. + * Any *odd* number with the MSB set (i.e. > 0x80) indicates error. + */ +static inline uint16_t op_display_to_port8x(uint16_t last_value, enum op_severity s, enum op_module m, uint16_t c) +{ + uint16_t r = 0x8000; /* Start with top bit set indicating in OPAL */ + + switch(m) { + case OP_MOD_INIT: + /* bit 6 is zero */ + /* bits 13 through 2 have low 12 bits of c */ + r |= (c & 0xFFF) << 2; + break; + case OP_MOD_CPU: + r |= 0x4000 | (c & 0x03FF) << 2; + break; + case OP_MOD_LOCK: + r |= 0x5000 | (c & 0x03FF) << 2; + break; + case OP_MOD_MEM: + r |= 0x6000 | (c & 0x03FF) << 2; + break; + case OP_MOD_CHIPTOD: + r |= 0x7000 | (c & 0x03FF) << 2; + break; + case OP_MOD_CORE: + /* + * Only current OP_MOD_CORE is where we're OP_FATAL, + * So let's go for the last value set and tweak the + * bits for OP_FATAL. + */ + r = last_value & 0xFFFC; + break; + case OP_MOD_FSP: + case OP_MOD_FSPCON: + /* Should never be hit, port80h only used on non-FSP! */ + break; + } + + switch(s) { + case OP_LOG: + break; + case OP_WARN: + r |= 0x02; + break; + case OP_ERROR: + r |= 0x01; + break; + case OP_FATAL: + r |= 0x03; + } + + return r; +} + + +void op_display_lpc(enum op_severity s, enum op_module m, uint16_t c) +{ + static uint8_t port80_val = 0x80; + static uint16_t port8x_val = 0x8000; + + if (chip_quirk(QUIRK_SIMICS)) + return; + + port80_val = op_display_to_port80(port80_val, s, m, c); + port8x_val = op_display_to_port8x(port8x_val, s, m, c); + + lpc_probe_write(OPAL_LPC_IO, 0x80, port80_val, 1); + lpc_probe_write(OPAL_LPC_IO, 0x81, port8x_val >> 8, 1); + lpc_probe_write(OPAL_LPC_IO, 0x82, port8x_val & 0xff, 1); +} + diff --git a/roms/skiboot/hw/lpc-rtc.c b/roms/skiboot/hw/lpc-rtc.c new file mode 100644 index 000000000..dc4a484b3 --- /dev/null +++ b/roms/skiboot/hw/lpc-rtc.c @@ -0,0 +1,235 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* + * Real Time Clock hanging off LPC + * + * Copyright 2015 IBM Corp. 
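
To make the bit packing described above concrete, here is a standalone sketch (helper name hypothetical) that encodes an OP_MOD_INIT progress code plus a severity the same way and prints the resulting port 80h byte; any value with both the MSB and LSB set indicates an error or fatal stop inside OPAL:

#include <stdint.h>
#include <stdio.h>

enum sev { SEV_LOG = 0x0, SEV_ERROR = 0x1, SEV_WARN = 0x2, SEV_FATAL = 0x3 };

/* Pack an OP_MOD_INIT progress code and severity into a port 80h byte. */
static uint8_t port80_init_code(uint16_t code, enum sev s)
{
	uint8_t r = 0x80;		/* bit 7: value comes from OPAL */

	r |= (code & 0x0f) << 2;	/* OP_MOD_INIT: low nibble of the code */
	r |= s;				/* severity in the two low bits */
	return r;
}

int main(void)
{
	/* init step 0x2 ending fatally -> 0x8b: MSB and LSB both set */
	printf("port 80h = 0x%02x\n", port80_init_code(0x2, SEV_FATAL));
	return 0;
}
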
+ */ + +#include <stdlib.h> +#include <string.h> +#include <ipmi.h> +#include <time.h> +#include <time-utils.h> +#include <device.h> +#include <opal.h> +#include <rtc.h> +#include <lpc.h> +#include <lock.h> +#include <timebase.h> + +/* Legacy RTC registers */ +#define RTC_REG_SECONDS 0 +#define RTC_REG_MINUTES 2 +#define RTC_REG_HOURS 4 +#define RTC_REG_DAY_OF_WEEK 6 +#define RTC_REG_DAY_OF_MONTH 7 +#define RTC_REG_MONTH 8 +#define RTC_REG_YEAR 9 +#define RTC_REG_A 10 +#define RTC_REG_A_UIP 0x80 +#define RTC_REG_B 11 +#define RTC_REG_B_DIS_UPD 0x80 +#define RTC_REG_B_PIE 0x40 +#define RTC_REG_B_AIE 0x20 +#define RTC_REG_B_UIE 0x10 +#define RTC_REG_B_SQWE 0x08 +#define RTC_REG_B_DM_BINARY 0x04 +#define RTC_REG_B_24H 0x02 +#define RTC_REG_B_DST_EN 0x01 +#define RTC_REG_C 12 +#define RTC_REG_D 13 +#define RTC_REG_D_VALID 0x80 + +/* Init value is no interrupts, 24H mode, updates enabled */ +#define RTC_REG_B_INIT (RTC_REG_B_24H) + +static u32 rtc_port; +static struct lock rtc_lock = LOCK_UNLOCKED; + +static uint8_t rtc_read(uint8_t reg) +{ + lpc_outb(reg, rtc_port); + return lpc_inb(rtc_port + 1); +} + +static void rtc_write(uint8_t reg, uint8_t val) +{ + lpc_outb(reg, rtc_port); + lpc_outb(val, rtc_port + 1); +} + +static bool lpc_rtc_read_tm(struct tm *tm) +{ + struct tm tm2; + unsigned int loops = 0; + + /* Read until two series provide identical values, this + * should deal with update races in all practical cases + */ + for (;;) { + tm2 = *tm; + tm->tm_sec = rtc_read(RTC_REG_SECONDS); + tm->tm_min = rtc_read(RTC_REG_MINUTES); + tm->tm_hour = rtc_read(RTC_REG_HOURS); + tm->tm_mday = rtc_read(RTC_REG_DAY_OF_MONTH); + tm->tm_mon = rtc_read(RTC_REG_MONTH); + tm->tm_year = rtc_read(RTC_REG_YEAR); + if (loops > 0 && memcmp(&tm2, tm, sizeof(struct tm)) == 0) + break; + loops++; + if (loops > 10) { + prerror("RTC: Failed to obtain stable values\n"); + return false; + } + } + tm->tm_sec = bcd_byte(tm->tm_sec, 0); + tm->tm_min = bcd_byte(tm->tm_min, 0); + tm->tm_hour = bcd_byte(tm->tm_hour, 0); + tm->tm_mday = bcd_byte(tm->tm_mday, 0); + tm->tm_mon = bcd_byte(tm->tm_mon, 0) - 1; + tm->tm_year = bcd_byte(tm->tm_year, 0); + + /* 2000 wrap */ + if (tm->tm_year < 69) + tm->tm_year += 100; + + /* Base */ + tm->tm_year += 1900; + + return true; +} + +static void lpc_rtc_write_tm(struct tm *tm __unused) +{ + /* XXX */ +} + +static void lpc_init_time(void) +{ + uint8_t val; + struct tm tm; + bool valid; + + memset(&tm, 0, sizeof(tm)); + + lock(&rtc_lock); + + /* If update is in progress, wait a bit */ + val = rtc_read(RTC_REG_A); + if (val & RTC_REG_A_UIP) + time_wait_ms(10); + + /* Read from RTC */ + valid = lpc_rtc_read_tm(&tm); + + unlock(&rtc_lock); + + /* Update cache */ + if (valid) + rtc_cache_update(&tm); +} + +static void lpc_init_hw(void) +{ + lock(&rtc_lock); + + /* Set REG B to a suitable default */ + rtc_write(RTC_REG_B, RTC_REG_B_INIT); + + unlock(&rtc_lock); +} + +static int64_t lpc_opal_rtc_read(__be32 *__ymd, __be64 *__hmsm) +{ + uint8_t val; + int64_t rc = OPAL_SUCCESS; + struct tm tm; + uint32_t ymd; + uint64_t hmsm; + + if (!__ymd || !__hmsm) + return OPAL_PARAMETER; + + /* Return busy if updating. This is somewhat racy, but will + * do for now, most RTCs nowadays are smart enough to atomically + * update. Alternatively we could just read from the cache... 
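
The legacy RTC registers read above hold BCD digits, and the year is a two-digit value that has to be windowed into the right century. A standalone sketch of the conversion (bcd_byte() fills this role in skiboot proper):

#include <stdint.h>
#include <stdio.h>

/* Convert one packed-BCD byte (two decimal digits) to binary. */
static unsigned int bcd_to_bin(uint8_t v)
{
	return (v >> 4) * 10 + (v & 0x0f);
}

int main(void)
{
	uint8_t year_reg = 0x24;	/* RTC year register reads "24" in BCD */
	unsigned int year = bcd_to_bin(year_reg);

	if (year < 69)			/* values below 69 mean 20xx */
		year += 100;
	year += 1900;

	printf("0x%02x -> %u\n", year_reg, year);	/* prints 2024 */
	return 0;
}
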
+ */ + lock(&rtc_lock); + val = rtc_read(RTC_REG_A); + if (val & RTC_REG_A_UIP) { + unlock(&rtc_lock); + return OPAL_BUSY_EVENT; + } + + /* Read from RTC */ + if (lpc_rtc_read_tm(&tm)) + rc = OPAL_SUCCESS; + else + rc = OPAL_HARDWARE; + unlock(&rtc_lock); + + if (rc == OPAL_SUCCESS) { + /* Update cache */ + rtc_cache_update(&tm); + + /* Convert to OPAL time */ + tm_to_datetime(&tm, &ymd, &hmsm); + *__ymd = cpu_to_be32(ymd); + *__hmsm = cpu_to_be64(hmsm); + } + + return rc; +} + +static int64_t lpc_opal_rtc_write(uint32_t year_month_day, + uint64_t hour_minute_second_millisecond) +{ + struct tm tm; + + /* Convert to struct tm */ + datetime_to_tm(year_month_day, hour_minute_second_millisecond, &tm); + + /* Write it out */ + lock(&rtc_lock); + lpc_rtc_write_tm(&tm); + unlock(&rtc_lock); + + return OPAL_SUCCESS; +} + +void lpc_rtc_init(void) +{ + struct dt_node *rtc_node, *np; + + if (!lpc_present()) + return; + + /* We support only one */ + rtc_node = dt_find_compatible_node(dt_root, NULL, "pnpPNP,b00"); + if (!rtc_node) + return; + + /* Get IO base */ + rtc_port = dt_prop_get_cell_def(rtc_node, "reg", 1, 0); + if (!rtc_port) { + prerror("RTC: Can't find reg property\n"); + return; + } + if (dt_prop_get_cell_def(rtc_node, "reg", 0, 0) != OPAL_LPC_IO) { + prerror("RTC: Unsupported address type\n"); + return; + } + + /* Init the HW */ + lpc_init_hw(); + + /* Create OPAL API node and register OPAL calls */ + np = dt_new(opal_node, "rtc"); + dt_add_property_strings(np, "compatible", "ibm,opal-rtc"); + + opal_register(OPAL_RTC_READ, lpc_opal_rtc_read, 2); + opal_register(OPAL_RTC_WRITE, lpc_opal_rtc_write, 2); + + /* Initialise the rtc cache */ + lpc_init_time(); +} diff --git a/roms/skiboot/hw/lpc-uart.c b/roms/skiboot/hw/lpc-uart.c new file mode 100644 index 000000000..834011b37 --- /dev/null +++ b/roms/skiboot/hw/lpc-uart.c @@ -0,0 +1,738 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* + * Serial port hanging off LPC + * + * Copyright 2013-2019 IBM Corp. 
+ */ + +#include <skiboot.h> +#include <lpc.h> +#include <console.h> +#include <opal.h> +#include <device.h> +#include <interrupts.h> +#include <processor.h> +#include <errorlog.h> +#include <trace.h> +#include <timebase.h> +#include <cpu.h> +#include <chip.h> +#include <io.h> +#include <nvram.h> + +DEFINE_LOG_ENTRY(OPAL_RC_UART_INIT, OPAL_PLATFORM_ERR_EVT, OPAL_UART, + OPAL_CEC_HARDWARE, OPAL_PREDICTIVE_ERR_GENERAL, + OPAL_NA); + +/* UART reg defs */ +#define REG_RBR 0 +#define REG_THR 0 +#define REG_DLL 0 +#define REG_IER 1 +#define REG_DLM 1 +#define REG_FCR 2 +#define REG_IIR 2 +#define REG_LCR 3 +#define REG_MCR 4 +#define REG_LSR 5 +#define REG_MSR 6 +#define REG_SCR 7 + +#define LSR_DR 0x01 /* Data ready */ +#define LSR_OE 0x02 /* Overrun */ +#define LSR_PE 0x04 /* Parity error */ +#define LSR_FE 0x08 /* Framing error */ +#define LSR_BI 0x10 /* Break */ +#define LSR_THRE 0x20 /* Xmit holding register empty */ +#define LSR_TEMT 0x40 /* Xmitter empty */ +#define LSR_ERR 0x80 /* Error */ + +#define LCR_DLAB 0x80 /* DLL access */ + +#define IER_RX 0x01 +#define IER_THRE 0x02 +#define IER_ALL 0x0f + +static struct lock uart_lock = LOCK_UNLOCKED; +static struct dt_node *uart_node; +static uint32_t uart_base; +static uint64_t uart_tx_full_time; +static bool has_irq = false, irq_ok, rx_full, tx_full; +static uint8_t tx_room; +static uint8_t cached_ier; +static void *mmio_uart_base; +static int uart_console_policy = UART_CONSOLE_OPAL; +static int lpc_irq = -1; + +void uart_set_console_policy(int policy) +{ + uart_console_policy = policy; +} + +static void uart_trace(u8 ctx, u8 cnt, u8 irq_state, u8 in_count) +{ + union trace t; + + t.uart.ctx = ctx; + t.uart.cnt = cnt; + t.uart.irq_state = irq_state; + t.uart.in_count = cpu_to_be16(in_count); + trace_add(&t, TRACE_UART, sizeof(struct trace_uart)); +} + +static inline uint8_t uart_read(unsigned int reg) +{ + if (mmio_uart_base) + return in_8(mmio_uart_base + reg); + else + return lpc_inb(uart_base + reg); +} + +static inline void uart_write(unsigned int reg, uint8_t val) +{ + if (mmio_uart_base) + out_8(mmio_uart_base + reg, val); + else + lpc_outb(val, uart_base + reg); +} + +static bool uart_check_tx_room(void) +{ + if (tx_room) + return true; + + if (uart_read(REG_LSR) & LSR_THRE) { + /* FIFO is 16 entries */ + tx_room = 16; + tx_full = false; + return true; + } + + return false; +} + +/* Must be called with UART lock held */ +static void uart_write_thr(uint8_t val) +{ + uart_write(REG_THR, val); + + tx_room--; + if (tx_room == 0) { + if (!uart_check_tx_room()) + uart_tx_full_time = mftb(); + } +} + +static bool uart_timed_out(unsigned long msecs) +{ + if (uart_check_tx_room()) + return false; + + if (chip_quirk(QUIRK_SLOW_SIM)) + msecs *= 5; + + if (tb_compare(mftb(), uart_tx_full_time + msecs_to_tb(msecs)) == TB_AAFTERB) + return true; + + return false; +} + +static bool uart_wait_tx_room(void) +{ + if (uart_check_tx_room()) + return true; + + smt_lowest(); + while (!uart_check_tx_room()) { + if (uart_timed_out(100)) { + smt_medium(); + return false; + } + } + smt_medium(); + + return true; +} + +static void uart_update_ier(void) +{ + uint8_t ier = 0; + + if (!has_irq) + return; + + /* If we have never got an interrupt, enable them all, + * the first interrupt received will tell us if interrupts + * are functional (some boards are missing an EC or FPGA + * programming causing LPC interrupts not to work). 
+ */ + if (!irq_ok) + ier = IER_ALL; + if (!rx_full) + ier |= IER_RX; + if (tx_full) + ier |= IER_THRE; + if (ier != cached_ier) { + uart_write(REG_IER, ier); + cached_ier = ier; + } +} + +bool uart_enabled(void) +{ + return mmio_uart_base || uart_base; +} + +/* + * Internal console driver (output only) + */ +static size_t uart_con_write(const char *buf, size_t len) +{ + size_t written = 0; + + /* If LPC bus is bad, we just swallow data */ + if (!lpc_ok() && !mmio_uart_base) + return len; + + lock(&uart_lock); + while (written < len) { + if (!uart_wait_tx_room()) + break; + + uart_write_thr(buf[written++]); + } + + if (!written && uart_timed_out(1000)) { + unlock(&uart_lock); + return len; /* swallow data */ + } + + unlock(&uart_lock); + + return written; +} + +static struct con_ops uart_con_driver = { + .write = uart_con_write, +}; + +/* + * OPAL console driver + */ + +/* + * We implement a simple buffer to buffer input data as some bugs in + * Linux make it fail to read fast enough after we get an interrupt. + * + * We use it on non-interrupt operations as well while at it because + * it doesn't cost us much and might help in a few cases where Linux + * is calling opal_poll_events() but not actually reading. + * + * Most of the time I expect we'll flush it completely to Linux into + * it's tty flip buffers so I don't bother with a ring buffer. + */ +#define IN_BUF_SIZE 0x1000 +static uint8_t *in_buf; +static uint32_t in_count; + +/* + * We implement a ring buffer for output data as well to speed things + * up a bit. This allows us to have interrupt driven sends. This is only + * for the output data coming from the OPAL API, not the internal one + * which is already bufferred. + */ +#define OUT_BUF_SIZE 0x1000 +static uint8_t *out_buf; +static uint32_t out_buf_prod; +static uint32_t out_buf_cons; + +/* Asynchronous flush, uart_lock must be held */ +static int64_t uart_con_flush(void) +{ + bool tx_was_full = tx_full; + uint32_t out_buf_cons_initial = out_buf_cons; + + while(out_buf_prod != out_buf_cons) { + if (tx_room == 0) { + /* + * If the interrupt is not functional, + * we force a full synchronous flush, + * otherwise the Linux console isn't + * usable (too slow). 
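
Both the transmit-room checks above and the receive path further down hinge on individual Line Status Register bits. A standalone sketch decoding an example LSR value (the bit definitions match the ones earlier in this file):

#include <stdint.h>
#include <stdio.h>

#define LSR_DR		0x01	/* receive data ready */
#define LSR_THRE	0x20	/* transmit holding register empty */
#define LSR_TEMT	0x40	/* transmitter completely idle */

int main(void)
{
	uint8_t lsr = LSR_TEMT | LSR_THRE | LSR_DR;	/* example status: 0x61 */

	printf("rx ready: %d, tx room: %d, tx idle: %d\n",
	       !!(lsr & LSR_DR), !!(lsr & LSR_THRE), !!(lsr & LSR_TEMT));
	return 0;
}
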
+ */ + if (irq_ok) + uart_check_tx_room(); + else + uart_wait_tx_room(); + } + if (tx_room == 0) { + tx_full = true; + break; + } + + uart_write_thr(out_buf[out_buf_cons++]); + out_buf_cons %= OUT_BUF_SIZE; + } + if (tx_full != tx_was_full) + uart_update_ier(); + if (out_buf_prod != out_buf_cons) { + /* Return busy if nothing was flushed this call */ + if (out_buf_cons == out_buf_cons_initial) { + if (uart_timed_out(1000)) + return OPAL_TIMEOUT; + return OPAL_BUSY; + } + /* Return partial if there's more to flush */ + return OPAL_PARTIAL; + } + + return OPAL_SUCCESS; +} + +static uint32_t uart_tx_buf_space(void) +{ + return OUT_BUF_SIZE - 1 - + (out_buf_prod + OUT_BUF_SIZE - out_buf_cons) % OUT_BUF_SIZE; +} + +static int64_t uart_opal_write(int64_t term_number, __be64 *__length, + const uint8_t *buffer) +{ + size_t written = 0, len = be64_to_cpu(*__length); + int64_t ret = OPAL_SUCCESS; + + if (term_number != 0) + return OPAL_PARAMETER; + + lock(&uart_lock); + + /* Copy data to out buffer */ + while (uart_tx_buf_space() && len--) { + out_buf[out_buf_prod++] = *(buffer++); + out_buf_prod %= OUT_BUF_SIZE; + written++; + } + + /* Flush out buffer again */ + uart_con_flush(); + + if (!written && uart_timed_out(1000)) + ret = OPAL_TIMEOUT; + unlock(&uart_lock); + + *__length = cpu_to_be64(written); + + return ret; +} + +static int64_t uart_opal_write_buffer_space(int64_t term_number, + __be64 *__length) +{ + int64_t ret = OPAL_SUCCESS; + int64_t tx_buf_len; + + if (term_number != 0) + return OPAL_PARAMETER; + + lock(&uart_lock); + tx_buf_len = uart_tx_buf_space(); + + if ((tx_buf_len < be64_to_cpu(*__length)) && uart_timed_out(1000)) + ret = OPAL_TIMEOUT; + + *__length = cpu_to_be64(tx_buf_len); + unlock(&uart_lock); + + return ret; +} + +/* Must be called with UART lock held */ +static void uart_read_to_buffer(void) +{ + /* As long as there is room in the buffer */ + while(in_count < IN_BUF_SIZE) { + /* Read status register */ + uint8_t lsr = uart_read(REG_LSR); + + /* Nothing to read ... 
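
The output path above is a classic single-producer ring: uart_opal_write() advances out_buf_prod, uart_con_flush() advances out_buf_cons, and uart_tx_buf_space() keeps one slot unused so that prod == cons can only ever mean empty. A standalone sketch of that space calculation:

#include <stdint.h>
#include <stdio.h>

#define OUT_BUF_SIZE 0x1000

static uint32_t prod, cons;

static uint32_t buf_space(void)
{
	/* One slot is reserved so a full ring never looks empty */
	return OUT_BUF_SIZE - 1 -
		(prod + OUT_BUF_SIZE - cons) % OUT_BUF_SIZE;
}

int main(void)
{
	printf("empty: %u bytes free\n", buf_space());			/* 4095 */

	prod = (prod + 10) % OUT_BUF_SIZE;				/* queue 10 bytes */
	printf("after queueing 10: %u bytes free\n", buf_space());	/* 4085 */

	cons = prod;							/* flush everything */
	printf("after flushing: %u bytes free\n", buf_space());	/* 4095 */
	return 0;
}
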
*/ + if ((lsr & LSR_DR) == 0) + break; + + /* Read and add to buffer */ + in_buf[in_count++] = uart_read(REG_RBR); + } + + /* If the buffer is full disable the interrupt */ + rx_full = (in_count == IN_BUF_SIZE); + uart_update_ier(); +} + +static void uart_adjust_opal_event(void) +{ + if (in_count) + opal_update_pending_evt(OPAL_EVENT_CONSOLE_INPUT, + OPAL_EVENT_CONSOLE_INPUT); + else + opal_update_pending_evt(OPAL_EVENT_CONSOLE_INPUT, 0); +} + +/* This is called with the console lock held */ +static int64_t uart_opal_read(int64_t term_number, __be64 *__length, + uint8_t *buffer) +{ + size_t req_count = be64_to_cpu(*__length), read_cnt = 0; + uint8_t lsr = 0; + + if (term_number != 0) + return OPAL_PARAMETER; + if (!in_buf) + return OPAL_INTERNAL_ERROR; + + lock(&uart_lock); + + /* Read from buffer first */ + if (in_count) { + read_cnt = in_count; + if (req_count < read_cnt) + read_cnt = req_count; + memcpy(buffer, in_buf, read_cnt); + req_count -= read_cnt; + if (in_count != read_cnt) + memmove(in_buf, in_buf + read_cnt, in_count - read_cnt); + in_count -= read_cnt; + } + + /* + * If there's still room in the user buffer, read from the UART + * directly + */ + while(req_count) { + lsr = uart_read(REG_LSR); + if ((lsr & LSR_DR) == 0) + break; + buffer[read_cnt++] = uart_read(REG_RBR); + req_count--; + } + + /* Finally, flush whatever's left in the UART into our buffer */ + uart_read_to_buffer(); + + uart_trace(TRACE_UART_CTX_READ, read_cnt, tx_full, in_count); + + unlock(&uart_lock); + + /* Adjust the OPAL event */ + uart_adjust_opal_event(); + + *__length = cpu_to_be64(read_cnt); + return OPAL_SUCCESS; +} + +static int64_t uart_opal_flush(int64_t term_number) +{ + int64_t rc; + + if (term_number != 0) + return OPAL_PARAMETER; + + lock(&uart_lock); + rc = uart_con_flush(); + unlock(&uart_lock); + + return rc; +} + +static void __uart_do_poll(u8 trace_ctx) +{ + if (!in_buf) + return; + + lock(&uart_lock); + uart_read_to_buffer(); + uart_con_flush(); + uart_trace(trace_ctx, 0, tx_full, in_count); + unlock(&uart_lock); + + uart_adjust_opal_event(); +} + +static void uart_console_poll(void *data __unused) +{ + __uart_do_poll(TRACE_UART_CTX_POLL); +} + +static void uart_irq(uint32_t chip_id __unused, uint32_t irq_mask __unused) +{ + if (!irq_ok) { + prlog(PR_DEBUG, "UART: IRQ functional !\n"); + irq_ok = true; + } + __uart_do_poll(TRACE_UART_CTX_IRQ); +} + +/* + * Common setup/inits + */ + +static void uart_setup_os_passthrough(void) +{ + char *path; + + static struct lpc_client uart_lpc_os_client = { + .reset = NULL, + .interrupt = NULL, + .interrupts = 0 + }; + + dt_add_property_strings(uart_node, "status", "ok"); + path = dt_get_path(uart_node); + dt_add_property_string(dt_chosen, "linux,stdout-path", path); + free(path); + + /* Setup LPC client for OS interrupts */ + if (lpc_irq >= 0) { + uint32_t chip_id = dt_get_chip_id(uart_node); + uart_lpc_os_client.interrupts = LPC_IRQ(lpc_irq); + lpc_register_client(chip_id, &uart_lpc_os_client, + IRQ_ATTR_TARGET_LINUX); + } + prlog(PR_DEBUG, "UART: Enabled as OS pass-through\n"); +} + +static void uart_setup_opal_console(void) +{ + static struct lpc_client uart_lpc_opal_client = { + .interrupt = uart_irq, + }; + + /* Add the opal console node */ + add_opal_console_node(0, "raw", OUT_BUF_SIZE); + + dt_add_property_string(dt_chosen, "linux,stdout-path", + "/ibm,opal/consoles/serial@0"); + + /* + * We mark the UART as reserved since we don't want the + * kernel to start using it with its own 8250 driver + */ + dt_add_property_strings(uart_node, 
"status", "reserved"); + + /* Allocate an input buffer */ + in_buf = zalloc(IN_BUF_SIZE); + out_buf = zalloc(OUT_BUF_SIZE); + + /* Setup LPC client for OPAL interrupts */ + if (lpc_irq >= 0) { + uint32_t chip_id = dt_get_chip_id(uart_node); + uart_lpc_opal_client.interrupts = LPC_IRQ(lpc_irq); + lpc_register_client(chip_id, &uart_lpc_opal_client, + IRQ_ATTR_TARGET_OPAL); + has_irq = true; + } + + /* + * If the interrupt is enabled, turn on RX interrupts (and + * only these for now + */ + tx_full = rx_full = false; + uart_update_ier(); + + /* Start console poller */ + opal_add_poller(uart_console_poll, NULL); +} + +static void uart_init_opal_console(void) +{ + const char *nv_policy; + + /* Update the policy if the corresponding nvram variable + * is present + */ + nv_policy = nvram_query_dangerous("uart-con-policy"); + if (nv_policy) { + if (!strcmp(nv_policy, "opal")) + uart_console_policy = UART_CONSOLE_OPAL; + else if (!strcmp(nv_policy, "os")) + uart_console_policy = UART_CONSOLE_OS; + else + prlog(PR_WARNING, + "UART: Unknown console policy in NVRAM: %s\n", + nv_policy); + } + if (uart_console_policy == UART_CONSOLE_OPAL) + uart_setup_opal_console(); + else + uart_setup_os_passthrough(); +} + +struct opal_con_ops uart_opal_con = { + .name = "OPAL UART console", + .init = uart_init_opal_console, + .read = uart_opal_read, + .write = uart_opal_write, + .space = uart_opal_write_buffer_space, + .flush = uart_opal_flush, +}; + +static bool uart_init_hw(unsigned int speed, unsigned int clock) +{ + unsigned int dll = (clock / 16) / speed; + + /* Clear line control */ + uart_write(REG_LCR, 0x00); + + /* Check if the UART responds */ + uart_write(REG_IER, 0x01); + if (uart_read(REG_IER) != 0x01) + goto detect_fail; + uart_write(REG_IER, 0x00); + if (uart_read(REG_IER) != 0x00) + goto detect_fail; + + uart_write(REG_LCR, LCR_DLAB); + uart_write(REG_DLL, dll & 0xff); + uart_write(REG_DLM, dll >> 8); + uart_write(REG_LCR, 0x03); /* 8N1 */ + uart_write(REG_MCR, 0x03); /* RTS/DTR */ + uart_write(REG_FCR, 0x07); /* clear & en. fifos */ + + /* + * On some UART implementations[1], we have observed that characters + * written to the UART during early boot (where no RX path is used, + * so we don't read from RBR) can cause a character timeout interrupt + * once we eventually enable interrupts through the IER. This + * interrupt can only be cleared by reading from RBR (even though we've + * cleared the RX FIFO!). + * + * Unfortunately though, the LCR[DR] bit does *not* indicate that there + * are characters to be read from RBR, so we may never read it, so the + * interrupt continuously fires. + * + * So, manually clear the timeout interrupt by reading the RBR here. + * We discard the read data, but that shouldn't matter as we've just + * reset the FIFO anyway. + * + * 1: seen on the AST2500 SUART. I assume this applies to 2400 too. + */ + uart_read(REG_RBR); + + return true; + + detect_fail: + prerror("UART: Presence detect failed !\n"); + return false; +} + +/* + * early_uart_init() is similar to uart_init() in that it configures skiboot + * console log to output via a UART. The main differences are that the early + * version only works with MMIO UARTs and will not setup interrupts or locks. 
+ */ +void early_uart_init(void) +{ + struct dt_node *uart_node; + u32 clk, baud; + + uart_node = dt_find_compatible_node(dt_root, NULL, "ns16550"); + if (!uart_node) + return; + + /* Try translate the address, if this fails then it's not a MMIO UART */ + mmio_uart_base = (void *) dt_translate_address(uart_node, 0, NULL); + if (!mmio_uart_base) + return; + + clk = dt_prop_get_u32(uart_node, "clock-frequency"); + baud = dt_prop_get_u32(uart_node, "current-speed"); + + if (uart_init_hw(baud, clk)) { + set_console(&uart_con_driver); + prlog(PR_DEBUG, "UART: Using UART at %p\n", mmio_uart_base); + } else { + prerror("UART: Early init failed!"); + mmio_uart_base = NULL; + } +} + +void uart_init(void) +{ + const struct dt_property *prop; + struct dt_node *n; + char *path __unused; + const be32 *irqp; + + /* Clean up after early_uart_init() */ + mmio_uart_base = NULL; + + /* UART lock is in the console path and thus must block + * printf re-entrancy + */ + uart_lock.in_con_path = true; + + /* We support only one */ + uart_node = n = dt_find_compatible_node(dt_root, NULL, "ns16550"); + if (!n) + return; + + /* Read the interrupts property if any */ + irqp = dt_prop_get_def(n, "interrupts", NULL); + + /* Now check if the UART is on the root bus. This is the case of + * directly mapped UARTs in simulation environments + */ + if (n->parent == dt_root) { + printf("UART: Found at root !\n"); + mmio_uart_base = (void *)dt_translate_address(n, 0, NULL); + if (!mmio_uart_base) { + printf("UART: Failed to translate address !\n"); + return; + } + + /* If it has an interrupt properly, we consider this to be + * a direct XICS/XIVE interrupt + */ + if (irqp) + has_irq = true; + + } else { + if (!lpc_present()) + return; + + /* Get IO base */ + prop = dt_find_property(n, "reg"); + if (!prop) { + log_simple_error(&e_info(OPAL_RC_UART_INIT), + "UART: Can't find reg property\n"); + return; + } + if (dt_property_get_cell(prop, 0) != OPAL_LPC_IO) { + log_simple_error(&e_info(OPAL_RC_UART_INIT), + "UART: Only supports IO addresses\n"); + return; + } + uart_base = dt_property_get_cell(prop, 1); + + if (irqp) { + lpc_irq = be32_to_cpu(*irqp); + prlog(PR_DEBUG, "UART: Using LPC IRQ %d\n", lpc_irq); + } + } + + + if (!uart_init_hw(dt_prop_get_u32(n, "current-speed"), + dt_prop_get_u32(n, "clock-frequency"))) { + prerror("UART: Initialization failed\n"); + dt_add_property_strings(n, "status", "bad"); + return; + } + + /* + * Mark LPC used by the console (will mark the relevant + * locks to avoid deadlocks when flushing the console) + */ + lpc_used_by_console(); + + /* Install console backend for printf() */ + set_console(&uart_con_driver); +} + diff --git a/roms/skiboot/hw/lpc.c b/roms/skiboot/hw/lpc.c new file mode 100644 index 000000000..bf3ab1fae --- /dev/null +++ b/roms/skiboot/hw/lpc.c @@ -0,0 +1,1407 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* + * Low Pin Count (LPC) Bus. + * + * Copyright 2013-2019 IBM Corp. + */ + +#define pr_fmt(fmt) "LPC: " fmt + +#include <skiboot.h> +#include <xscom.h> +#include <io.h> +#include <lock.h> +#include <chip.h> +#include <lpc.h> +#include <timebase.h> +#include <errorlog.h> +#include <opal-api.h> +#include <platform.h> +#include <psi.h> +#include <interrupts.h> + +//#define DBG_IRQ(fmt...) prerror(fmt) +#define DBG_IRQ(fmt...) 
do { } while(0) + +DEFINE_LOG_ENTRY(OPAL_RC_LPC_READ, OPAL_PLATFORM_ERR_EVT, OPAL_LPC, + OPAL_MISC_SUBSYSTEM, OPAL_PREDICTIVE_ERR_GENERAL, + OPAL_NA); + +DEFINE_LOG_ENTRY(OPAL_RC_LPC_WRITE, OPAL_PLATFORM_ERR_EVT, OPAL_LPC, + OPAL_MISC_SUBSYSTEM, OPAL_PREDICTIVE_ERR_GENERAL, + OPAL_NA); + +DEFINE_LOG_ENTRY(OPAL_RC_LPC_SYNC, OPAL_PLATFORM_ERR_EVT, OPAL_LPC, + OPAL_MISC_SUBSYSTEM, OPAL_PREDICTIVE_ERR_GENERAL, + OPAL_NA); + +/* Used exclusively in manufacturing mode */ +DEFINE_LOG_ENTRY(OPAL_RC_LPC_SYNC_PERF, OPAL_PLATFORM_ERR_EVT, OPAL_LPC, + OPAL_MISC_SUBSYSTEM, OPAL_UNRECOVERABLE_ERR_DEGRADE_PERF, + OPAL_NA); + +#define ECCB_CTL 0 /* b0020 -> b00200 */ +#define ECCB_STAT 2 /* b0022 -> b00210 */ +#define ECCB_DATA 3 /* b0023 -> b00218 */ + +#define ECCB_CTL_MAGIC 0xd000000000000000ul +#define ECCB_CTL_DATASZ PPC_BITMASK(4,7) +#define ECCB_CTL_READ PPC_BIT(15) +#define ECCB_CTL_ADDRLEN PPC_BITMASK(23,25) +#define ECCB_ADDRLEN_4B 0x4 +#define ECCB_CTL_ADDR PPC_BITMASK(32,63) + +#define ECCB_STAT_PIB_ERR PPC_BITMASK(0,5) +#define ECCB_STAT_RD_DATA PPC_BITMASK(6,37) +#define ECCB_STAT_BUSY PPC_BIT(44) +#define ECCB_STAT_ERRORS1 PPC_BITMASK(45,51) +#define ECCB_STAT_OP_DONE PPC_BIT(52) +#define ECCB_STAT_ERRORS2 PPC_BITMASK(53,55) + +#define ECCB_STAT_ERR_MASK (ECCB_STAT_PIB_ERR | \ + ECCB_STAT_ERRORS1 | \ + ECCB_STAT_ERRORS2) + +#define ECCB_TIMEOUT 1000000 + +/* OPB Master LS registers */ +#define OPB_MASTER_LS_IRQ_STAT 0x50 +#define OPB_MASTER_LS_IRQ_MASK 0x54 +#define OPB_MASTER_LS_IRQ_POL 0x58 +#define OPB_MASTER_IRQ_LPC 0x00000800 + +/* LPC HC registers */ +#define LPC_HC_FW_SEG_IDSEL 0x24 +#define LPC_HC_FW_RD_ACC_SIZE 0x28 +#define LPC_HC_FW_RD_1B 0x00000000 +#define LPC_HC_FW_RD_2B 0x01000000 +#define LPC_HC_FW_RD_4B 0x02000000 +#define LPC_HC_FW_RD_16B 0x04000000 +#define LPC_HC_FW_RD_128B 0x07000000 +#define LPC_HC_IRQSER_CTRL 0x30 +#define LPC_HC_IRQSER_EN 0x80000000 +#define LPC_HC_IRQSER_QMODE 0x40000000 +#define LPC_HC_IRQSER_START_MASK 0x03000000 +#define LPC_HC_IRQSER_START_4CLK 0x00000000 +#define LPC_HC_IRQSER_START_6CLK 0x01000000 +#define LPC_HC_IRQSER_START_8CLK 0x02000000 +#define LPC_HC_IRQSER_AUTO_CLEAR 0x00800000 +#define LPC_HC_IRQMASK 0x34 /* same bit defs as LPC_HC_IRQSTAT */ +#define LPC_HC_IRQSTAT 0x38 +#define LPC_HC_IRQ_SERIRQ0 0x80000000u /* all bits down to ... 
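
For readers unfamiliar with the indirect ECCB path used when no MMIO mapping is available, here is a standalone sketch of how a control word for a 4-byte read is assembled from the fields defined above, assuming skiboot's usual IBM bit numbering where bit 0 is the most significant bit of the 64-bit doubleword (SETFIELD and PPC_BIT do this in the real code):

#include <stdint.h>
#include <stdio.h>

#define IBM_BIT(n)	(1ull << (63 - (n)))

int main(void)
{
	uint64_t ctl = 0xd000000000000000ull;	/* ECCB_CTL_MAGIC */
	uint32_t addr = 0xd0010080;		/* example OPB address */
	uint64_t sz = 4;			/* 4-byte access */

	ctl |= sz << (63 - 7);			/* ECCB_CTL_DATASZ, bits 4:7 */
	ctl |= IBM_BIT(15);			/* ECCB_CTL_READ */
	ctl |= 4ull << (63 - 25);		/* ECCB_CTL_ADDRLEN, 4-byte address */
	ctl |= addr;				/* ECCB_CTL_ADDR, bits 32:63 */

	printf("ECCB_CTL = 0x%016llx\n", (unsigned long long)ctl);
	return 0;
}
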
*/ +#define LPC_HC_IRQ_SERIRQ16 0x00008000 /* IRQ16=IOCHK#, IRQ2=SMI# */ +#define LPC_HC_IRQ_SERIRQ_ALL 0xffff8000 +#define LPC_HC_IRQ_LRESET 0x00000400 +#define LPC_HC_IRQ_SYNC_ABNORM_ERR 0x00000080 +#define LPC_HC_IRQ_SYNC_NORESP_ERR 0x00000040 +#define LPC_HC_IRQ_SYNC_NORM_ERR 0x00000020 +#define LPC_HC_IRQ_SYNC_TIMEOUT_ERR 0x00000010 +#define LPC_HC_IRQ_TARG_TAR_ERR 0x00000008 +#define LPC_HC_IRQ_BM_TAR_ERR 0x00000004 +#define LPC_HC_IRQ_BM0_REQ 0x00000002 +#define LPC_HC_IRQ_BM1_REQ 0x00000001 +#define LPC_HC_IRQ_BASE_IRQS ( \ + LPC_HC_IRQ_LRESET | \ + LPC_HC_IRQ_SYNC_ABNORM_ERR | \ + LPC_HC_IRQ_SYNC_NORESP_ERR | \ + LPC_HC_IRQ_SYNC_NORM_ERR | \ + LPC_HC_IRQ_SYNC_TIMEOUT_ERR | \ + LPC_HC_IRQ_TARG_TAR_ERR | \ + LPC_HC_IRQ_BM_TAR_ERR) +#define LPC_HC_ERROR_ADDRESS 0x40 + +#define LPC_NUM_SERIRQ 17 + +enum { + LPC_ROUTE_FREE = 0, + LPC_ROUTE_OPAL, + LPC_ROUTE_LINUX +}; + +struct lpc_error_entry { + int64_t rc; + const char *description; +}; + +struct lpcm { + uint32_t chip_id; + uint32_t xbase; + void *mbase; + struct lock lock; + uint8_t fw_idsel; + uint8_t fw_rdsz; + struct list_head clients; + bool has_serirq; + uint8_t sirq_routes[LPC_NUM_SERIRQ]; + bool sirq_routed[LPC_NUM_SERIRQ]; + uint32_t sirq_rmasks[4]; + uint8_t sirq_ralloc[4]; + struct dt_node *node; +}; + + +#define LPC_BUS_DEGRADED_PERF_THRESHOLD 5 + +struct lpc_client_entry { + struct list_node node; + const struct lpc_client *clt; + uint32_t policy; +}; + +/* Default LPC bus */ +static int32_t lpc_default_chip_id = -1; +static bool lpc_irqs_ready; + +/* + * These are expected to be the same on all chips and should probably + * be read (or configured) dynamically. This is how things are configured + * today on Tuletta. + */ +static uint32_t lpc_io_opb_base = 0xd0010000; +static uint32_t lpc_mem_opb_base = 0xe0000000; +static uint32_t lpc_fw_opb_base = 0xf0000000; +static uint32_t lpc_reg_opb_base = 0xc0012000; +static uint32_t opb_master_reg_base = 0xc0010000; + +static int64_t opb_mmio_write(struct lpcm *lpc, uint32_t addr, uint32_t data, + uint32_t sz) +{ + switch (sz) { + case 1: + out_8(lpc->mbase + addr, data); + return OPAL_SUCCESS; + case 2: + out_be16(lpc->mbase + addr, data); + return OPAL_SUCCESS; + case 4: + out_be32(lpc->mbase + addr, data); + return OPAL_SUCCESS; + } + prerror("Invalid data size %d\n", sz); + return OPAL_PARAMETER; +} + +static int64_t opb_write(struct lpcm *lpc, uint32_t addr, uint32_t data, + uint32_t sz) +{ + uint64_t ctl = ECCB_CTL_MAGIC, stat; + int64_t rc, tout; + uint64_t data_reg; + + if (lpc->mbase) + return opb_mmio_write(lpc, addr, data, sz); + + switch(sz) { + case 1: + data_reg = ((uint64_t)data) << 56; + break; + case 2: + data_reg = ((uint64_t)data) << 48; + break; + case 4: + data_reg = ((uint64_t)data) << 32; + break; + default: + prerror("Invalid data size %d\n", sz); + return OPAL_PARAMETER; + } + + rc = xscom_write(lpc->chip_id, lpc->xbase + ECCB_DATA, data_reg); + if (rc) { + log_simple_error(&e_info(OPAL_RC_LPC_WRITE), + "LPC: XSCOM write to ECCB DATA error %lld\n", rc); + return rc; + } + + ctl = SETFIELD(ECCB_CTL_DATASZ, ctl, sz); + ctl = SETFIELD(ECCB_CTL_ADDRLEN, ctl, ECCB_ADDRLEN_4B); + ctl = SETFIELD(ECCB_CTL_ADDR, ctl, addr); + rc = xscom_write(lpc->chip_id, lpc->xbase + ECCB_CTL, ctl); + if (rc) { + log_simple_error(&e_info(OPAL_RC_LPC_WRITE), + "LPC: XSCOM write to ECCB CTL error %lld\n", rc); + return rc; + } + + for (tout = 0; tout < ECCB_TIMEOUT; tout++) { + rc = xscom_read(lpc->chip_id, lpc->xbase + ECCB_STAT, + &stat); + if (rc) { + 
log_simple_error(&e_info(OPAL_RC_LPC_WRITE), + "LPC: XSCOM read from ECCB STAT err %lld\n", + rc); + return rc; + } + if (stat & ECCB_STAT_OP_DONE) { + if (stat & ECCB_STAT_ERR_MASK) { + log_simple_error(&e_info(OPAL_RC_LPC_WRITE), + "LPC: Error status: 0x%llx\n", stat); + return OPAL_HARDWARE; + } + return OPAL_SUCCESS; + } + time_wait_nopoll(100); + } + log_simple_error(&e_info(OPAL_RC_LPC_WRITE), "LPC: Write timeout !\n"); + return OPAL_HARDWARE; +} + +static int64_t opb_mmio_read(struct lpcm *lpc, uint32_t addr, uint32_t *data, + uint32_t sz) +{ + switch (sz) { + case 1: + *data = in_8(lpc->mbase + addr); + return OPAL_SUCCESS; + case 2: + *data = in_be16(lpc->mbase + addr); + return OPAL_SUCCESS; + case 4: + *data = in_be32(lpc->mbase + addr); + return OPAL_SUCCESS; + } + prerror("Invalid data size %d\n", sz); + return OPAL_PARAMETER; +} + +static int64_t opb_read(struct lpcm *lpc, uint32_t addr, uint32_t *data, + uint32_t sz) +{ + uint64_t ctl = ECCB_CTL_MAGIC | ECCB_CTL_READ, stat; + int64_t rc, tout; + + if (lpc->mbase) + return opb_mmio_read(lpc, addr, data, sz); + + if (sz != 1 && sz != 2 && sz != 4) { + prerror("Invalid data size %d\n", sz); + return OPAL_PARAMETER; + } + + ctl = SETFIELD(ECCB_CTL_DATASZ, ctl, sz); + ctl = SETFIELD(ECCB_CTL_ADDRLEN, ctl, ECCB_ADDRLEN_4B); + ctl = SETFIELD(ECCB_CTL_ADDR, ctl, addr); + rc = xscom_write(lpc->chip_id, lpc->xbase + ECCB_CTL, ctl); + if (rc) { + log_simple_error(&e_info(OPAL_RC_LPC_READ), + "LPC: XSCOM write to ECCB CTL error %lld\n", rc); + return rc; + } + + for (tout = 0; tout < ECCB_TIMEOUT; tout++) { + rc = xscom_read(lpc->chip_id, lpc->xbase + ECCB_STAT, + &stat); + if (rc) { + log_simple_error(&e_info(OPAL_RC_LPC_READ), + "LPC: XSCOM read from ECCB STAT err %lld\n", + rc); + return rc; + } + if (stat & ECCB_STAT_OP_DONE) { + uint32_t rdata = GETFIELD(ECCB_STAT_RD_DATA, stat); + if (stat & ECCB_STAT_ERR_MASK) { + log_simple_error(&e_info(OPAL_RC_LPC_READ), + "LPC: Error status: 0x%llx\n", stat); + return OPAL_HARDWARE; + } + switch(sz) { + case 1: + *data = rdata >> 24; + break; + case 2: + *data = rdata >> 16; + break; + default: + *data = rdata; + break; + } + return 0; + } + time_wait_nopoll(100); + } + log_simple_error(&e_info(OPAL_RC_LPC_READ), "LPC: Read timeout !\n"); + return OPAL_HARDWARE; +} + +static int64_t lpc_set_fw_idsel(struct lpcm *lpc, uint8_t idsel) +{ + uint32_t val; + int64_t rc; + + if (idsel == lpc->fw_idsel) + return OPAL_SUCCESS; + if (idsel > 0xf) + return OPAL_PARAMETER; + + rc = opb_read(lpc, lpc_reg_opb_base + LPC_HC_FW_SEG_IDSEL, + &val, 4); + if (rc) { + prerror("Failed to read HC_FW_SEG_IDSEL register !\n"); + return rc; + } + val = (val & 0xfffffff0) | idsel; + rc = opb_write(lpc, lpc_reg_opb_base + LPC_HC_FW_SEG_IDSEL, + val, 4); + if (rc) { + prerror("Failed to write HC_FW_SEG_IDSEL register !\n"); + return rc; + } + lpc->fw_idsel = idsel; + return OPAL_SUCCESS; +} + +static int64_t lpc_set_fw_rdsz(struct lpcm *lpc, uint8_t rdsz) +{ + uint32_t val; + int64_t rc; + + if (rdsz == lpc->fw_rdsz) + return OPAL_SUCCESS; + switch(rdsz) { + case 1: + val = LPC_HC_FW_RD_1B; + break; + case 2: + val = LPC_HC_FW_RD_2B; + break; + case 4: + val = LPC_HC_FW_RD_4B; + break; + default: + /* + * The HW supports 16 and 128 via a buffer/cache + * but I have never exprimented with it and am not + * sure it works the way we expect so let's leave it + * at that for now + */ + return OPAL_PARAMETER; + } + rc = opb_write(lpc, lpc_reg_opb_base + LPC_HC_FW_RD_ACC_SIZE, + val, 4); + if (rc) { + prerror("Failed to write 
LPC_HC_FW_RD_ACC_SIZE !\n"); + return rc; + } + lpc->fw_rdsz = rdsz; + return OPAL_SUCCESS; +} + +static int64_t lpc_opb_prepare(struct lpcm *lpc, + enum OpalLPCAddressType addr_type, + uint32_t addr, uint32_t sz, + uint32_t *opb_base, bool is_write) +{ + uint32_t top = addr + sz; + uint8_t fw_idsel; + int64_t rc; + + /* Address wraparound */ + if (top < addr) + return OPAL_PARAMETER; + + /* + * Bound check access and get the OPB base address for + * the window corresponding to the access type + */ + switch(addr_type) { + case OPAL_LPC_IO: + /* IO space is 64K */ + if (top > 0x10000) + return OPAL_PARAMETER; + /* And only supports byte accesses */ + if (sz != 1) + return OPAL_PARAMETER; + *opb_base = lpc_io_opb_base; + break; + case OPAL_LPC_MEM: + /* MEM space is 256M */ + if (top > 0x10000000) + return OPAL_PARAMETER; + /* And only supports byte accesses */ + if (sz != 1) + return OPAL_PARAMETER; + *opb_base = lpc_mem_opb_base; + break; + case OPAL_LPC_FW: + /* + * FW space is in segments of 256M controlled + * by IDSEL, make sure we don't cross segments + */ + *opb_base = lpc_fw_opb_base; + fw_idsel = (addr >> 28); + if (((top - 1) >> 28) != fw_idsel) + return OPAL_PARAMETER; + + /* Set segment */ + rc = lpc_set_fw_idsel(lpc, fw_idsel); + if (rc) + return rc; + /* Set read access size */ + if (!is_write) { + rc = lpc_set_fw_rdsz(lpc, sz); + if (rc) + return rc; + } + break; + default: + return OPAL_PARAMETER; + } + return OPAL_SUCCESS; +} + +#define LPC_ERROR_IDX(x) (__builtin_ffs(x) - 1 - 2) +#define LPC_ERROR(_sts, _rc, _description) \ + [LPC_ERROR_IDX(_sts)] = { _rc, _description } +static const struct lpc_error_entry lpc_error_table[] = { + LPC_ERROR(LPC_HC_IRQ_BM_TAR_ERR, OPAL_WRONG_STATE, "Got bus master TAR error."), + LPC_ERROR(LPC_HC_IRQ_TARG_TAR_ERR, OPAL_WRONG_STATE, "Got abnormal TAR error."), + LPC_ERROR(LPC_HC_IRQ_SYNC_TIMEOUT_ERR, OPAL_TIMEOUT, "Got SYNC timeout error."), + LPC_ERROR(LPC_HC_IRQ_SYNC_NORM_ERR, OPAL_WRONG_STATE, "Got SYNC normal error."), + LPC_ERROR(LPC_HC_IRQ_SYNC_NORESP_ERR, OPAL_HARDWARE, "Got SYNC no-response error."), + LPC_ERROR(LPC_HC_IRQ_SYNC_ABNORM_ERR, OPAL_WRONG_STATE, "Got SYNC abnormal error."), +}; + +static int64_t lpc_probe_prepare(struct lpcm *lpc) +{ + const uint32_t irqmask_addr = lpc_reg_opb_base + LPC_HC_IRQMASK; + const uint32_t irqstat_addr = lpc_reg_opb_base + LPC_HC_IRQSTAT; + uint32_t irqmask; + int rc; + + rc = opb_read(lpc, irqmask_addr, &irqmask, 4); + if (rc) + return rc; + + irqmask &= ~LPC_HC_IRQ_SYNC_NORESP_ERR; + rc = opb_write(lpc, irqmask_addr, irqmask, 4); + if (rc) + return rc; + + return opb_write(lpc, irqstat_addr, LPC_HC_IRQ_SYNC_NORESP_ERR, 4); +} + +static int64_t lpc_probe_test(struct lpcm *lpc) +{ + const uint32_t irqmask_addr = lpc_reg_opb_base + LPC_HC_IRQMASK; + const uint32_t irqstat_addr = lpc_reg_opb_base + LPC_HC_IRQSTAT; + uint32_t irqmask, irqstat; + int64_t idx; + int rc; + + rc = opb_read(lpc, irqstat_addr, &irqstat, 4); + if (rc) + return rc; + + rc = opb_write(lpc, irqstat_addr, LPC_HC_IRQ_SYNC_NORESP_ERR, 4); + if (rc) + return rc; + + rc = opb_read(lpc, irqmask_addr, &irqmask, 4); + if (rc) + return rc; + + irqmask |= LPC_HC_IRQ_SYNC_NORESP_ERR; + rc = opb_write(lpc, irqmask_addr, irqmask, 4); + if (rc) + return rc; + + if (!(irqstat & LPC_HC_IRQ_BASE_IRQS)) + return OPAL_SUCCESS; + + /* Ensure we can perform a valid lookup in the error table */ + idx = LPC_ERROR_IDX(irqstat); + if (idx < 0 || idx >= ARRAY_SIZE(lpc_error_table)) { + prerror("LPC bus error translation failed with status 0x%x\n", + 
irqstat); + return OPAL_PARAMETER; + } + + rc = lpc_error_table[idx].rc; + return rc; +} + +static int64_t __lpc_write(struct lpcm *lpc, enum OpalLPCAddressType addr_type, + uint32_t addr, uint32_t data, uint32_t sz, + bool probe) +{ + uint32_t opb_base; + int64_t rc; + + lock(&lpc->lock); + if (probe) { + rc = lpc_probe_prepare(lpc); + if (rc) + goto bail; + } + + /* + * Convert to an OPB access and handle LPC HC configuration + * for FW accesses (IDSEL) + */ + rc = lpc_opb_prepare(lpc, addr_type, addr, sz, &opb_base, true); + if (rc) + goto bail; + + /* Perform OPB access */ + rc = opb_write(lpc, opb_base + addr, data, sz); + if (rc) + goto bail; + + if (probe) + rc = lpc_probe_test(lpc); + bail: + unlock(&lpc->lock); + return rc; +} + +static int64_t __lpc_write_sanity(enum OpalLPCAddressType addr_type, + uint32_t addr, uint32_t data, uint32_t sz, + bool probe) +{ + struct proc_chip *chip; + + if (lpc_default_chip_id < 0) + return OPAL_PARAMETER; + chip = get_chip(lpc_default_chip_id); + if (!chip || !chip->lpc) + return OPAL_PARAMETER; + return __lpc_write(chip->lpc, addr_type, addr, data, sz, probe); +} + +int64_t lpc_write(enum OpalLPCAddressType addr_type, uint32_t addr, + uint32_t data, uint32_t sz) +{ + return __lpc_write_sanity(addr_type, addr, data, sz, false); +} + +int64_t lpc_probe_write(enum OpalLPCAddressType addr_type, uint32_t addr, + uint32_t data, uint32_t sz) +{ + return __lpc_write_sanity(addr_type, addr, data, sz, true); +} + +/* + * The "OPAL" variant add the emulation of 2 and 4 byte accesses using + * byte accesses for IO and MEM space in order to be compatible with + * existing Linux expectations + */ +static int64_t opal_lpc_write(uint32_t chip_id, enum OpalLPCAddressType addr_type, + uint32_t addr, uint32_t data, uint32_t sz) +{ + struct proc_chip *chip; + int64_t rc; + + chip = get_chip(chip_id); + if (!chip || !chip->lpc) + return OPAL_PARAMETER; + + if (addr_type == OPAL_LPC_FW || sz == 1) + return __lpc_write(chip->lpc, addr_type, addr, data, sz, false); + while(sz--) { + rc = __lpc_write(chip->lpc, addr_type, addr, data & 0xff, 1, false); + if (rc) + return rc; + addr++; + data >>= 8; + } + return OPAL_SUCCESS; +} + +static int64_t __lpc_read(struct lpcm *lpc, enum OpalLPCAddressType addr_type, + uint32_t addr, uint32_t *data, uint32_t sz, + bool probe) +{ + uint32_t opb_base; + int64_t rc; + + lock(&lpc->lock); + if (probe) { + rc = lpc_probe_prepare(lpc); + if (rc) + goto bail; + } + + /* + * Convert to an OPB access and handle LPC HC configuration + * for FW accesses (IDSEL and read size) + */ + rc = lpc_opb_prepare(lpc, addr_type, addr, sz, &opb_base, false); + if (rc) + goto bail; + + /* Perform OPB access */ + rc = opb_read(lpc, opb_base + addr, data, sz); + if (rc) + goto bail; + + if (probe) + rc = lpc_probe_test(lpc); + bail: + unlock(&lpc->lock); + return rc; +} + +static int64_t __lpc_read_sanity(enum OpalLPCAddressType addr_type, + uint32_t addr, uint32_t *data, uint32_t sz, + bool probe) +{ + struct proc_chip *chip; + + if (lpc_default_chip_id < 0) + return OPAL_PARAMETER; + chip = get_chip(lpc_default_chip_id); + if (!chip || !chip->lpc) + return OPAL_PARAMETER; + return __lpc_read(chip->lpc, addr_type, addr, data, sz, probe); +} + +int64_t lpc_read(enum OpalLPCAddressType addr_type, uint32_t addr, + uint32_t *data, uint32_t sz) +{ + return __lpc_read_sanity(addr_type, addr, data, sz, false); +} + +int64_t lpc_probe_read(enum OpalLPCAddressType addr_type, uint32_t addr, + uint32_t *data, uint32_t sz) +{ + return __lpc_read_sanity(addr_type, 
addr, data, sz, true); +} + +/* + * The "OPAL" variant add the emulation of 2 and 4 byte accesses using + * byte accesses for IO and MEM space in order to be compatible with + * existing Linux expectations + */ +static int64_t opal_lpc_read(uint32_t chip_id, enum OpalLPCAddressType addr_type, + uint32_t addr, __be32 *data, uint32_t sz) +{ + struct proc_chip *chip; + int64_t rc; + uint32_t tmp; + + chip = get_chip(chip_id); + if (!chip || !chip->lpc) + return OPAL_PARAMETER; + + if (addr_type == OPAL_LPC_FW) { + rc = __lpc_read(chip->lpc, addr_type, addr, &tmp, sz, false); + if (rc) + return rc; + + } else { + tmp = 0; + while (sz--) { + uint32_t byte; + + rc = __lpc_read(chip->lpc, addr_type, addr, &byte, 1, false); + if (rc) + return rc; + tmp = tmp | (byte << (8 * sz)); + addr++; + } + } + + *data = cpu_to_be32(tmp); + + return OPAL_SUCCESS; +} + +bool lpc_present(void) +{ + return lpc_default_chip_id >= 0; +} + +/* Called with LPC lock held */ +static void lpc_setup_serirq(struct lpcm *lpc) +{ + struct lpc_client_entry *ent; + uint32_t mask = LPC_HC_IRQ_BASE_IRQS; + int rc; + + if (!lpc_irqs_ready) + return; + + /* Collect serirq enable bits */ + list_for_each(&lpc->clients, ent, node) + mask |= ent->clt->interrupts & LPC_HC_IRQ_SERIRQ_ALL; + + rc = opb_write(lpc, lpc_reg_opb_base + LPC_HC_IRQMASK, mask, 4); + if (rc) { + prerror("Failed to update irq mask\n"); + return; + } + DBG_IRQ("IRQ mask set to 0x%08x\n", mask); + + /* Enable the LPC interrupt in the OPB Master */ + opb_write(lpc, opb_master_reg_base + OPB_MASTER_LS_IRQ_POL, 0, 4); + rc = opb_write(lpc, opb_master_reg_base + OPB_MASTER_LS_IRQ_MASK, + OPB_MASTER_IRQ_LPC, 4); + if (rc) + prerror("Failed to enable IRQs in OPB\n"); + + /* Check whether we should enable serirq */ + if (mask & LPC_HC_IRQ_SERIRQ_ALL) { + rc = opb_write(lpc, lpc_reg_opb_base + LPC_HC_IRQSER_CTRL, + LPC_HC_IRQSER_EN | + LPC_HC_IRQSER_START_4CLK | + /* + * New mode bit for P9N DD2.0 (ignored otherwise) + * when set we no longer have to manually clear + * the SerIRQs on EOI. + */ + LPC_HC_IRQSER_AUTO_CLEAR, 4); + DBG_IRQ("SerIRQ enabled\n"); + } else { + rc = opb_write(lpc, lpc_reg_opb_base + LPC_HC_IRQSER_CTRL, + 0, 4); + DBG_IRQ("SerIRQ disabled\n"); + } + if (rc) + prerror("Failed to configure SerIRQ\n"); + { + u32 val; + rc = opb_read(lpc, lpc_reg_opb_base + LPC_HC_IRQMASK, &val, 4); + if (rc) + prerror("Failed to readback mask"); + else + DBG_IRQ("MASK READBACK=%x\n", val); + + rc = opb_read(lpc, lpc_reg_opb_base + LPC_HC_IRQSER_CTRL, + &val, 4); + if (rc) + prerror("Failed to readback ctrl"); + else + DBG_IRQ("CTRL READBACK=%x\n", val); + } +} + +static void lpc_route_serirq(struct lpcm *lpc, uint32_t sirq, + uint32_t psi_idx) +{ + uint32_t reg, shift, val, psi_old; + int64_t rc; + + psi_old = lpc->sirq_routes[sirq]; + lpc->sirq_rmasks[psi_old] &= ~(LPC_HC_IRQ_SERIRQ0 >> sirq); + lpc->sirq_rmasks[psi_idx] |= (LPC_HC_IRQ_SERIRQ0 >> sirq); + lpc->sirq_routes[sirq] = psi_idx; + lpc->sirq_routed[sirq] = true; + + /* We may not be ready yet ... 
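+	 * (lpc->has_serirq is only set once lpc_init_interrupts_one() has
+	 * run; until then we just record the route above and it is programmed
+	 * into the OPB master registers when interrupts are initialized.)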
*/ + if (!lpc->has_serirq) + return; + + if (sirq < 14) { + reg = 0xc; + shift = 4 + (sirq << 1); + } else { + reg = 0x8; + shift = 8 + ((sirq - 14) << 1); + } + shift = 30-shift; + rc = opb_read(lpc, opb_master_reg_base + reg, &val, 4); + if (rc) + return; + val = val & ~(3 << shift); + val |= (psi_idx & 3) << shift; + opb_write(lpc, opb_master_reg_base + reg, val, 4); +} + +static void lpc_alloc_route(struct lpcm *lpc, unsigned int irq, + unsigned int policy) +{ + unsigned int i, r, c; + int route = -1; + + if (policy == IRQ_ATTR_TARGET_OPAL) + r = LPC_ROUTE_OPAL; + else + r = LPC_ROUTE_LINUX; + + prlog(PR_DEBUG, "Routing irq %d, policy: %d (r=%d)\n", + irq, policy, r); + + /* Are we already routed ? */ + if (lpc->sirq_routed[irq] && + r != lpc->sirq_ralloc[lpc->sirq_routes[irq]]) { + prerror("irq %d has conflicting policies\n", irq); + return; + } + + /* First try to find a free route. Leave one for another + * policy though + */ + for (i = 0, c = 0; i < 4; i++) { + /* Count routes with identical policy */ + if (lpc->sirq_ralloc[i] == r) + c++; + + /* Use the route if it's free and there is no more + * than 3 existing routes with that policy + */ + if (lpc->sirq_ralloc[i] == LPC_ROUTE_FREE && c < 4) { + lpc->sirq_ralloc[i] = r; + route = i; + break; + } + } + + /* If we couldn't get a free one, try to find an existing one + * with a matching policy + */ + for (i = 0; route < 0 && i < 4; i++) { + if (lpc->sirq_ralloc[i] == r) + route = i; + } + + /* Still no route ? bail. That should never happen */ + if (route < 0) { + prerror("Can't find a route for irq %d\n", irq); + return; + } + + /* Program route */ + lpc_route_serirq(lpc, irq, route); + + prlog(PR_DEBUG, "SerIRQ %d using route %d targetted at %s\n", + irq, route, r == LPC_ROUTE_LINUX ? "OS" : "OPAL"); +} + +unsigned int lpc_get_irq_policy(uint32_t chip_id, uint32_t psi_idx) +{ + struct proc_chip *c = get_chip(chip_id); + + if (!c || !c->lpc) + return IRQ_ATTR_TARGET_LINUX; + + if (c->lpc->sirq_ralloc[psi_idx] == LPC_ROUTE_LINUX) + return IRQ_ATTR_TARGET_LINUX; + else + return IRQ_ATTR_TARGET_OPAL | IRQ_ATTR_TYPE_LSI; +} + +static void lpc_create_int_map(struct lpcm *lpc, struct dt_node *psi_node) +{ + __be32 map[LPC_NUM_SERIRQ * 5], *pmap; + uint32_t i; + + if (!psi_node) + return; + pmap = map; + for (i = 0; i < LPC_NUM_SERIRQ; i++) { + if (!lpc->sirq_routed[i]) + continue; + *(pmap++) = 0; + *(pmap++) = 0; + *(pmap++) = cpu_to_be32(i); + *(pmap++) = cpu_to_be32(psi_node->phandle); + *(pmap++) = cpu_to_be32(lpc->sirq_routes[i] + P9_PSI_IRQ_LPC_SIRQ0); + } + if (pmap == map) + return; + dt_add_property(lpc->node, "interrupt-map", map, + (pmap - map) * sizeof(uint32_t)); + dt_add_property_cells(lpc->node, "interrupt-map-mask", 0, 0, 0xff); + dt_add_property_cells(lpc->node, "#interrupt-cells", 1); +} + +void lpc_finalize_interrupts(void) +{ + struct proc_chip *chip; + + lpc_irqs_ready = true; + + for_each_chip(chip) { + if (chip->lpc && chip->psi && + (chip->type == PROC_CHIP_P9_NIMBUS || + chip->type == PROC_CHIP_P9_CUMULUS || + chip->type == PROC_CHIP_P9P || + chip->type == PROC_CHIP_P10)) + lpc_create_int_map(chip->lpc, chip->psi->node); + } +} + +static void lpc_init_interrupts_one(struct proc_chip *chip) +{ + struct lpcm *lpc = chip->lpc; + int i, rc; + + lock(&lpc->lock); + + /* First mask them all */ + rc = opb_write(lpc, lpc_reg_opb_base + LPC_HC_IRQMASK, 0, 4); + if (rc) { + prerror("Failed to init interrutps\n"); + goto bail; + } + + switch(chip->type) { + case PROC_CHIP_P8_MURANO: + case PROC_CHIP_P8_VENICE: + /* On 
Murano/Venice, there is no SerIRQ, only enable error + * interrupts + */ + rc = opb_write(lpc, lpc_reg_opb_base + LPC_HC_IRQMASK, + LPC_HC_IRQ_BASE_IRQS, 4); + if (rc) { + prerror("Failed to set interrupt mask\n"); + goto bail; + } + opb_write(lpc, lpc_reg_opb_base + LPC_HC_IRQSER_CTRL, 0, 4); + break; + case PROC_CHIP_P8_NAPLES: + /* On Naples, we support LPC interrupts, enable them based + * on what clients requests. This will setup the mask and + * enable processing + */ + lpc->has_serirq = true; + lpc_setup_serirq(lpc); + break; + case PROC_CHIP_P9_NIMBUS: + case PROC_CHIP_P9_CUMULUS: + case PROC_CHIP_P9P: + case PROC_CHIP_P10: + /* On P9, we additionally setup the routing. */ + lpc->has_serirq = true; + for (i = 0; i < LPC_NUM_SERIRQ; i++) { + if (lpc->sirq_routed[i]) + lpc_route_serirq(lpc, i, lpc->sirq_routes[i]); + } + lpc_setup_serirq(lpc); + break; + default: + ; + } + bail: + unlock(&lpc->lock); +} + +void lpc_init_interrupts(void) +{ + struct proc_chip *chip; + + lpc_irqs_ready = true; + + for_each_chip(chip) { + if (chip->lpc) + lpc_init_interrupts_one(chip); + } +} + +static void lpc_dispatch_reset(struct lpcm *lpc) +{ + struct lpc_client_entry *ent; + + /* XXX We are going to hit this repeatedly while reset is + * asserted which might be sub-optimal. We should instead + * detect assertion and start a poller that will wait for + * de-assertion. We could notify clients of LPC being + * on/off rather than just reset + */ + + prerror("Got LPC reset on chip 0x%x !\n", lpc->chip_id); + + /* Collect serirq enable bits */ + list_for_each(&lpc->clients, ent, node) { + if (!ent->clt->reset) + continue; + unlock(&lpc->lock); + ent->clt->reset(lpc->chip_id); + lock(&lpc->lock); + } + + /* Reconfigure serial interrupts */ + if (lpc->has_serirq) + lpc_setup_serirq(lpc); +} + +static void lpc_dispatch_err_irqs(struct lpcm *lpc, uint32_t irqs) +{ + const struct lpc_error_entry *err; + static int lpc_bus_err_count; + struct opal_err_info *info; + uint32_t addr; + int64_t idx; + int rc; + + /* Write back to clear error interrupts, we clear SerIRQ later + * as they are handled as level interrupts + */ + rc = opb_write(lpc, lpc_reg_opb_base + LPC_HC_IRQSTAT, + LPC_HC_IRQ_BASE_IRQS, 4); + if (rc) + prerror("Failed to clear IRQ error latches !\n"); + + if (irqs & LPC_HC_IRQ_LRESET) { + lpc_dispatch_reset(lpc); + return; + } + + /* Ensure we can perform a valid lookup in the error table */ + idx = LPC_ERROR_IDX(irqs); + if (idx < 0 || idx >= ARRAY_SIZE(lpc_error_table)) { + prerror("LPC bus error translation failed with status 0x%x\n", + irqs); + return; + } + + /* Find and report the error */ + err = &lpc_error_table[idx]; + lpc_bus_err_count++; + if (manufacturing_mode && (lpc_bus_err_count > LPC_BUS_DEGRADED_PERF_THRESHOLD)) + info = &e_info(OPAL_RC_LPC_SYNC_PERF); + else + info = &e_info(OPAL_RC_LPC_SYNC); + + rc = opb_read(lpc, lpc_reg_opb_base + LPC_HC_ERROR_ADDRESS, &addr, 4); + if (rc) + log_simple_error(info, "LPC[%03x]: %s " + "Error reading error address register\n", + lpc->chip_id, err->description); + else + log_simple_error(info, "LPC[%03x]: %s Error address reg: " + "0x%08x\n", + lpc->chip_id, err->description, addr); +} + +static void lpc_dispatch_ser_irqs(struct lpcm *lpc, uint32_t irqs, + bool clear_latch) +{ + struct lpc_client_entry *ent; + uint32_t cirqs; + int rc; + + irqs &= LPC_HC_IRQ_SERIRQ_ALL; + + /* Collect serirq enable bits */ + list_for_each(&lpc->clients, ent, node) { + if (!ent->clt->interrupt) + continue; + cirqs = ent->clt->interrupts & irqs; + if (cirqs) { + 
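+			/* Drop the LPC lock across the callback so the client
+			 * handler can perform LPC accesses of its own (they
+			 * take this same lock) without deadlocking.
+			 */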
unlock(&lpc->lock); + ent->clt->interrupt(lpc->chip_id, cirqs); + lock(&lpc->lock); + } + } + + /* Our SerIRQ are level sensitive, we clear the latch after + * we call the handler. + */ + if (!clear_latch) + return; + + rc = opb_write(lpc, lpc_reg_opb_base + LPC_HC_IRQSTAT, irqs, 4); + if (rc) + prerror("Failed to clear SerIRQ latches !\n"); +} + +void lpc_interrupt(uint32_t chip_id) +{ + struct proc_chip *chip = get_chip(chip_id); + struct lpcm *lpc; + uint32_t irqs, opb_irqs; + int rc; + + /* No initialized LPC controller on that chip */ + if (!chip || !chip->lpc) + return; + lpc = chip->lpc; + + lock(&lpc->lock); + + /* Grab OPB Master LS interrupt status */ + rc = opb_read(lpc, opb_master_reg_base + OPB_MASTER_LS_IRQ_STAT, + &opb_irqs, 4); + if (rc) { + prerror("Failed to read OPB IRQ state\n"); + unlock(&lpc->lock); + return; + } + + DBG_IRQ("OPB IRQ on chip 0x%x, oirqs=0x%08x\n", chip_id, opb_irqs); + + /* Check if it's an LPC interrupt */ + if (!(opb_irqs & OPB_MASTER_IRQ_LPC)) { + /* Something we don't support ? Ack it anyway... */ + goto bail; + } + + /* Handle the lpc interrupt source (errors etc...) */ + rc = opb_read(lpc, lpc_reg_opb_base + LPC_HC_IRQSTAT, &irqs, 4); + if (rc) { + prerror("Failed to read LPC IRQ state\n"); + goto bail; + } + + DBG_IRQ("LPC IRQ on chip 0x%x, irqs=0x%08x\n", chip_id, irqs); + + /* Handle error interrupts */ + if (irqs & LPC_HC_IRQ_BASE_IRQS) + lpc_dispatch_err_irqs(lpc, irqs); + + /* Handle SerIRQ interrupts */ + if (irqs & LPC_HC_IRQ_SERIRQ_ALL) + lpc_dispatch_ser_irqs(lpc, irqs, true); + bail: + /* Ack it at the OPB level */ + opb_write(lpc, opb_master_reg_base + OPB_MASTER_LS_IRQ_STAT, + opb_irqs, 4); + unlock(&lpc->lock); +} + +void lpc_serirq(uint32_t chip_id, uint32_t index) +{ + struct proc_chip *chip = get_chip(chip_id); + struct lpcm *lpc; + uint32_t irqs, rmask; + int rc; + + /* No initialized LPC controller on that chip */ + if (!chip || !chip->lpc) + return; + lpc = chip->lpc; + + lock(&lpc->lock); + + /* Handle the lpc interrupt source (errors etc...) */ + rc = opb_read(lpc, lpc_reg_opb_base + LPC_HC_IRQSTAT, &irqs, 4); + if (rc) { + prerror("Failed to read LPC IRQ state\n"); + goto bail; + } + rmask = lpc->sirq_rmasks[index]; + + DBG_IRQ("IRQ on chip 0x%x, irqs=0x%08x rmask=0x%08x\n", + chip_id, irqs, rmask); + irqs &= rmask; + + /* + * Handle SerIRQ interrupts. 
Don't clear the latch, + * it will be done in our special EOI callback if + * necessary on DD1 + */ + if (irqs) + lpc_dispatch_ser_irqs(lpc, irqs, false); + + bail: + unlock(&lpc->lock); +} + +void lpc_all_interrupts(uint32_t chip_id) +{ + struct proc_chip *chip = get_chip(chip_id); + struct lpcm *lpc; + + /* No initialized LPC controller on that chip */ + if (!chip || !chip->lpc) + return; + lpc = chip->lpc; + + /* Dispatch all */ + lock(&lpc->lock); + lpc_dispatch_ser_irqs(lpc, LPC_HC_IRQ_SERIRQ_ALL, false); + unlock(&lpc->lock); +} + +static void lpc_init_chip_p8(struct dt_node *xn) + { + uint32_t gcid = dt_get_chip_id(xn); + struct proc_chip *chip; + struct lpcm *lpc; + + chip = get_chip(gcid); + assert(chip); + + lpc = zalloc(sizeof(struct lpcm)); + assert(lpc); + lpc->chip_id = gcid; + lpc->xbase = dt_get_address(xn, 0, NULL); + lpc->fw_idsel = 0xff; + lpc->fw_rdsz = 0xff; + lpc->node = xn; + list_head_init(&lpc->clients); + init_lock(&lpc->lock); + + if (lpc_default_chip_id < 0 || + dt_has_node_property(xn, "primary", NULL)) { + lpc_default_chip_id = gcid; + } + + /* Mask all interrupts for now */ + opb_write(lpc, lpc_reg_opb_base + LPC_HC_IRQMASK, 0, 4); + + printf("LPC[%03x]: Initialized, access via XSCOM @0x%x\n", + gcid, lpc->xbase); + + dt_add_property(xn, "interrupt-controller", NULL, 0); + dt_add_property_cells(xn, "#interrupt-cells", 1); + assert(dt_prop_get_u32(xn, "#address-cells") == 2); + + chip->lpc = lpc; +} + +static void lpc_init_chip_p9(struct dt_node *opb_node) +{ + uint32_t gcid = dt_get_chip_id(opb_node); + struct dt_node *lpc_node; + struct proc_chip *chip; + struct lpcm *lpc; + u64 addr; + u32 val; + + chip = get_chip(gcid); + assert(chip); + + /* Grab OPB base address */ + addr = dt_prop_get_cell(opb_node, "ranges", 1); + addr <<= 32; + addr |= dt_prop_get_cell(opb_node, "ranges", 2); + + /* Find the "lpc" child node */ + lpc_node = dt_find_compatible_node(opb_node, NULL, "ibm,power9-lpc"); + if (!lpc_node) + return; + + lpc = zalloc(sizeof(struct lpcm)); + assert(lpc); + lpc->chip_id = gcid; + lpc->mbase = (void *)addr; + lpc->fw_idsel = 0xff; + lpc->fw_rdsz = 0xff; + lpc->node = lpc_node; + list_head_init(&lpc->clients); + init_lock(&lpc->lock); + + if (lpc_default_chip_id < 0 || + dt_has_node_property(opb_node, "primary", NULL)) { + lpc_default_chip_id = gcid; + } + + /* Mask all interrupts for now */ + opb_write(lpc, lpc_reg_opb_base + LPC_HC_IRQMASK, 0, 4); + + /* Clear any stale LPC bus errors */ + opb_write(lpc, lpc_reg_opb_base + LPC_HC_IRQSTAT, + LPC_HC_IRQ_BASE_IRQS, 4); + + /* Default with routing to PSI SerIRQ 0, this will be updated + * later when interrupts are initialized. 
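+	 * OPB master register 0xc holds the 2-bit route fields for SerIRQ
+	 * 0-13 and register 0x8 those for SerIRQ 14 and up (see
+	 * lpc_route_serirq()); the masks below zero all of those fields so
+	 * every SerIRQ initially targets PSI interrupt 0. For example,
+	 * SerIRQ 5 sits in register 0xc at shift 30 - (4 + 2*5) = 16,
+	 * i.e. bits 17:16.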
+ */ + opb_read(lpc, opb_master_reg_base + 8, &val, 4); + val &= 0xff03ffff; + opb_write(lpc, opb_master_reg_base + 8, val, 4); + opb_read(lpc, opb_master_reg_base + 0xc, &val, 4); + val &= 0xf0000000; + opb_write(lpc, opb_master_reg_base + 0xc, val, 4); + + prlog(PR_INFO, "LPC[%03x]: Initialized\n", gcid); + prlog(PR_DEBUG,"access via MMIO @%p\n", lpc->mbase); + + chip->lpc = lpc; +} + +void lpc_init(void) +{ + struct dt_node *xn; + bool has_lpc = false; + + /* Look for P9 first as the DT is compatile for both 8 and 9 */ + dt_for_each_compatible(dt_root, xn, "ibm,power9-lpcm-opb") { + lpc_init_chip_p9(xn); + has_lpc = true; + } + + if (!has_lpc) { + dt_for_each_compatible(dt_root, xn, "ibm,power8-lpc") { + lpc_init_chip_p8(xn); + has_lpc = true; + } + } + if (lpc_default_chip_id >= 0) + prlog(PR_DEBUG, "Default bus on chip 0x%x\n", + lpc_default_chip_id); + + if (has_lpc) { + opal_register(OPAL_LPC_WRITE, opal_lpc_write, 5); + opal_register(OPAL_LPC_READ, opal_lpc_read, 5); + } +} + +void lpc_used_by_console(void) +{ + struct proc_chip *chip; + + xscom_used_by_console(); + + for_each_chip(chip) { + struct lpcm *lpc = chip->lpc; + if (lpc) { + lpc->lock.in_con_path = true; + lock(&lpc->lock); + unlock(&lpc->lock); + } + } +} + +bool lpc_ok(void) +{ + struct proc_chip *chip; + + if (lpc_default_chip_id < 0) + return false; + if (!xscom_ok()) + return false; + chip = get_chip(lpc_default_chip_id); + if (!chip->lpc) + return false; + return !lock_held_by_me(&chip->lpc->lock); +} + +void lpc_register_client(uint32_t chip_id, + const struct lpc_client *clt, + uint32_t policy) +{ + struct lpc_client_entry *ent; + struct proc_chip *chip; + struct lpcm *lpc; + bool has_routes; + + chip = get_chip(chip_id); + assert(chip); + lpc = chip->lpc; + if (!lpc) { + prerror("Attempt to register client on bad chip 0x%x\n", + chip_id); + return; + } + + has_routes = + chip->type == PROC_CHIP_P9_NIMBUS || + chip->type == PROC_CHIP_P9_CUMULUS || + chip->type == PROC_CHIP_P9P || + chip->type == PROC_CHIP_P10; + + if (policy != IRQ_ATTR_TARGET_OPAL && !has_routes) { + prerror("Chip doesn't support OS interrupt policy\n"); + return; + } + + ent = malloc(sizeof(*ent)); + assert(ent); + ent->clt = clt; + ent->policy = policy; + lock(&lpc->lock); + list_add(&lpc->clients, &ent->node); + + if (has_routes) { + unsigned int i; + for (i = 0; i < LPC_NUM_SERIRQ; i++) + if (clt->interrupts & LPC_IRQ(i)) + lpc_alloc_route(lpc, i, policy); + } + + if (lpc->has_serirq) + lpc_setup_serirq(lpc); + unlock(&lpc->lock); +} diff --git a/roms/skiboot/hw/npu-hw-procedures.c b/roms/skiboot/hw/npu-hw-procedures.c new file mode 100644 index 000000000..91bbb0f15 --- /dev/null +++ b/roms/skiboot/hw/npu-hw-procedures.c @@ -0,0 +1,608 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* + * NPU (NVLink1, POWER8NVL) Hardware Procedures + * + * Copyright 2013-2019 IBM Corp. + */ + +#include <skiboot.h> +#include <io.h> +#include <timebase.h> +#include <pci.h> +#include <pci-virt.h> +#include <interrupts.h> +#include <npu-regs.h> +#include <npu.h> +#include <xscom.h> + +typedef uint32_t (*step)(struct npu_dev *); + +struct procedure { + const char *name; + step steps[]; +}; + +#define DEFINE_PROCEDURE(NAME, STEPS...) 
\ + static struct procedure procedure_##NAME = \ + {.name = #NAME, .steps = {NAME, ##STEPS}} + +#define PROCEDURE_INPROGRESS (1 << 31) +#define PROCEDURE_COMPLETE (1 << 30) +#define PROCEDURE_NEXT (1 << 29) +#define PROCEDURE_FAILED 2 +#define PROCEDURE_ABORTED 3 +#define PROCEDURE_UNSUPPORTED 4 + +/* Mask defining which status bits we want to expose */ +#define PROCEDURE_STATUS_MASK 0xc000000f + +/* Accesors for PHY registers. These can be done either via MMIO or SCOM. */ +static bool pl_use_scom = 1; +static void phy_write(struct npu_dev *npu_dev, uint64_t addr, uint32_t val) +{ + if (pl_use_scom) + xscom_write(npu_dev->npu->chip_id, npu_dev->pl_xscom_base | addr, val); + else + out_be16((void *) npu_dev->pl_base + PL_MMIO_ADDR(addr), val); +} + +static uint16_t phy_read(struct npu_dev *npu_dev, uint64_t addr) +{ + uint64_t val; + + if (pl_use_scom) + xscom_read(npu_dev->npu->chip_id, npu_dev->pl_xscom_base + addr, &val); + else + val = in_be16((void *) npu_dev->pl_base + PL_MMIO_ADDR(addr)); + + return val & 0xffff; +} + +/* The DL registers can be accessed indirectly via the NTL */ +static void dl_write(struct npu_dev *npu_dev, uint32_t addr, uint32_t val) +{ + xscom_write(npu_dev->npu->chip_id, + npu_dev->xscom + NX_DL_REG_ADDR, addr); + xscom_write(npu_dev->npu->chip_id, + npu_dev->xscom + NX_DL_REG_DATA, val); +} + +static uint64_t __unused dl_read(struct npu_dev *npu_dev, uint32_t addr) +{ + uint64_t val; + + xscom_write(npu_dev->npu->chip_id, + npu_dev->xscom + NX_DL_REG_ADDR, addr); + xscom_read(npu_dev->npu->chip_id, + npu_dev->xscom + NX_DL_REG_DATA, &val); + return val; +} + +/* Our hardware bits are backwards here. The lane vectors are 16-bit + * values represented in IBM bit ordering. This means lane 0 is + * represented by bit 15 in most of the registers. Internally we keep + * this sane (ie. npu_dev->lane_mask[0] == lane 0) as we need sane + * numbering for set_lane_reg() anyway. */ +static uint32_t phy_lane_mask(struct npu_dev *npu_dev) +{ + /* We only train 8 lanes at a time so we don't do a full + * bit-swap */ + assert(npu_dev->lane_mask == 0xff00 || npu_dev->lane_mask == 0xff); + + return ~npu_dev->lane_mask & 0xffff; +} + +static void set_lane_reg(struct npu_dev *npu_dev, uint64_t base_reg, + uint64_t data, uint64_t mask) +{ + uint64_t val, i; + uint32_t lane_mask = npu_dev->lane_mask; + + for (i = 0; i <= 23; i++) { + if (lane_mask & (1ul << i)) { + uint64_t tx_rxcal_reg = base_reg + (i << 32); + val = phy_read(npu_dev, tx_rxcal_reg); + val = (val & ~mask) | data; + phy_write(npu_dev, tx_rxcal_reg, val); + } + } +} + +static uint32_t stop(struct npu_dev *npu_dev __unused) +{ + return PROCEDURE_COMPLETE | PROCEDURE_ABORTED; +} +DEFINE_PROCEDURE(stop); + +static uint32_t nop(struct npu_dev *npu_dev __unused) +{ + return PROCEDURE_COMPLETE; +} +DEFINE_PROCEDURE(nop); + +/* Procedure 1.2.1 (RESET_NPU_DL) from opt_programmerguide.odt. Also + * incorporates AT reset. 
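+ * The NTL reset is held asserted around the Nvidia DL reset, then the
+ * TL credit registers are re-primed and the AT/IODA PESTB entries are
+ * zeroed.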
*/ +static uint32_t reset_npu_dl(struct npu_dev *npu_dev) +{ + uint64_t val; + + /* Assert NPU reset */ + xscom_read(npu_dev->npu->chip_id, npu_dev->xscom + NX_NTL_CONTROL, &val); + val |= NTL_CONTROL_RESET; + xscom_write(npu_dev->npu->chip_id, npu_dev->xscom + NX_NTL_CONTROL, val); + + /* Put the Nvidia logic in reset */ + dl_write(npu_dev, NDL_CONTROL, 0xe8000000); + + /* Release Nvidia logic from reset */ + dl_write(npu_dev, NDL_CONTROL, 0); + + /* Release NPU from reset */ + val &= ~NTL_CONTROL_RESET; + xscom_write(npu_dev->npu->chip_id, npu_dev->xscom + NX_NTL_CONTROL, val); + + /* Setup up TL credits */ + xscom_write(npu_dev->npu->chip_id, npu_dev->xscom + NX_TL_CMD_CR, PPC_BIT(0)); + xscom_write(npu_dev->npu->chip_id, npu_dev->xscom + NX_TL_CMD_D_CR, PPC_BIT(0)); + xscom_write(npu_dev->npu->chip_id, npu_dev->xscom + NX_TL_RSP_CR, PPC_BIT(15)); + xscom_write(npu_dev->npu->chip_id, npu_dev->xscom + NX_TL_RSP_D_CR, PPC_BIT(15)); + + /* Reset error registers. TODO: are there more we should clear here? */ + npu_ioda_sel(npu_dev->npu, NPU_IODA_TBL_PESTB, 0, true); + for (val = 0; val < NPU_NUM_OF_PES; val++) + out_be64(npu_dev->npu->at_regs + NPU_IODA_DATA0, 0); + + return PROCEDURE_COMPLETE; +} +DEFINE_PROCEDURE(reset_npu_dl); + +/* Procedures 1.2.3 (reset_lanes) & 1.2.4 + * (io_register_write_reset_values) */ +static uint32_t phy_reset(struct npu_dev *npu_dev) +{ + uint16_t val; + + /* Lower run_lane inputs for lanes to be reset */ + val = phy_read(npu_dev, RX_RUN_LANE_VEC_0_15); + val &= ~phy_lane_mask(npu_dev); + phy_write(npu_dev, RX_RUN_LANE_VEC_0_15, val); + + return PROCEDURE_NEXT; +} + +static uint32_t phy_reset_wait(struct npu_dev *npu_dev) +{ + uint16_t val; + + /* Wait for lane busy outputs to go to zero for lanes to be + * reset */ + val = phy_read(npu_dev, RX_LANE_BUSY_VEC_0_15); + if (val & phy_lane_mask(npu_dev)) + return PROCEDURE_INPROGRESS; + + return PROCEDURE_NEXT; +} + +static uint32_t phy_reset_complete(struct npu_dev *npu_dev) +{ + uint16_t val; + uint32_t lane_mask = phy_lane_mask(npu_dev); + + /* Set ioreset_vec for the desired lanes bit positions */ + val = phy_read(npu_dev, RX_IORESET_VEC_0_15); + phy_write(npu_dev, RX_IORESET_VEC_0_15, val | lane_mask); + + val = phy_read(npu_dev, TX_IORESET_VEC_0_15); + phy_write(npu_dev, TX_IORESET_VEC_0_15, val | lane_mask); + + /* Clear ioreset_vec */ + val = phy_read(npu_dev, RX_IORESET_VEC_0_15); + phy_write(npu_dev, RX_IORESET_VEC_0_15, val & ~lane_mask); + + val = phy_read(npu_dev, TX_IORESET_VEC_0_15); + phy_write(npu_dev, TX_IORESET_VEC_0_15, val & ~lane_mask); + + /* Reset RX phase rotators */ + set_lane_reg(npu_dev, RX_PR_CNTL_PL, RX_PR_RESET, RX_PR_RESET); + set_lane_reg(npu_dev, RX_PR_CNTL_PL, 0, RX_PR_RESET); + + /* Restore registers from scominit that may have changed */ + set_lane_reg(npu_dev, RX_PR_MODE, 0x8, RX_PR_PHASE_STEP); + set_lane_reg(npu_dev, RX_A_DAC_CNTL, + 0x7 << MASK_TO_LSH(RX_PR_IQ_RES_SEL), + RX_PR_IQ_RES_SEL); + set_lane_reg(npu_dev, TX_MODE1_PL, 0, TX_LANE_PDWN); + set_lane_reg(npu_dev, RX_BANK_CONTROLS, 0, RX_LANE_ANA_PDWN); + set_lane_reg(npu_dev, RX_MODE, 0, RX_LANE_DIG_PDWN); + + return PROCEDURE_COMPLETE; +} +DEFINE_PROCEDURE(phy_reset, phy_reset_wait, phy_reset_complete); + +/* Round a fixed decimal number. 
Frac is the number of fractional + * bits */ +static uint32_t round(uint32_t val, int frac) +{ + if (val >> (frac - 1) & 0x1) + return (val >> frac) + 1; + else + return val >> frac; +} + +#define ZCAL_MIN (10 << 3) +#define ZCAL_MAX (40 << 3) +#define ZCAL_K0 0x0 +#define ZCAL_M 128 +/* TODO: add a test case for the following values: + + Initial values: + zcal_n = 0xda; + zcal_p = 0xc7; + + Results: + pre_p = 0x0 + pre_n = 0x0 + margin_p = 0x0 + margin_n = 0x0 + total_en_p = 0x32 + total_en_n = 0x37 + */ + +static uint32_t phy_tx_zcal(struct npu_dev *npu_dev) +{ + uint64_t val; + + if (npu_dev->index < 2 && npu_dev->npu->tx_zcal_complete[0]) + return PROCEDURE_COMPLETE; + + if (npu_dev->index >= 2 && npu_dev->npu->tx_zcal_complete[1]) + return PROCEDURE_COMPLETE; + + /* Start calibration */ + val = phy_read(npu_dev, TX_IMPCAL_SWO1_PB); + val &= TX_ZCAL_SWO_EN; + phy_write(npu_dev, TX_IMPCAL_SWO1_PB, val); + phy_write(npu_dev, TX_IMPCAL_SWO2_PB, 0x50 << 2); + val = phy_read(npu_dev, TX_IMPCAL_PB); + val |= TX_ZCAL_REQ; + phy_write(npu_dev, TX_IMPCAL_PB, val); + + return PROCEDURE_NEXT; +} + +static uint32_t phy_tx_zcal_wait(struct npu_dev *npu_dev) +{ + uint64_t val; + + val = phy_read(npu_dev, TX_IMPCAL_PB); + if (!(val & TX_ZCAL_DONE)) + return PROCEDURE_INPROGRESS; + + if (val & TX_ZCAL_ERROR) + return PROCEDURE_COMPLETE | PROCEDURE_FAILED; + + return PROCEDURE_NEXT; +} + +static uint32_t phy_tx_zcal_calculate(struct npu_dev *npu_dev) +{ + uint64_t val; + uint64_t zcal_n; + uint64_t zcal_p; + uint64_t margin_n; + uint64_t margin_p; + uint64_t pre_n; + uint64_t pre_p; + uint64_t total_en_n; + uint64_t total_en_p; + + val = phy_read(npu_dev, TX_IMPCAL_NVAL_PB); + zcal_n = GETFIELD(TX_ZCAL_N, val); + val = phy_read(npu_dev, TX_IMPCAL_PVAL_PB); + zcal_p = GETFIELD(TX_ZCAL_P, val); + + if ((zcal_n < ZCAL_MIN) || (zcal_n > ZCAL_MAX) || + (zcal_p < ZCAL_MIN) || (zcal_p > ZCAL_MAX)) + return PROCEDURE_COMPLETE | PROCEDURE_FAILED; + + margin_n = (0x80 - ZCAL_M) * zcal_n / 2; + margin_p = (0x80 - ZCAL_M) * zcal_p / 2; + pre_n = (((0x80 * zcal_n) - (2 * margin_n)) * ZCAL_K0) / 0x80; + pre_p = (((0x80 * zcal_p) - (2 * margin_p)) * ZCAL_K0) / 0x80; + + total_en_n = 0x80 * zcal_n - (2 * margin_n) - (pre_n & 1023); + total_en_p = 0x80 * zcal_p - (2 * margin_p) - (pre_p & 1023); + + pre_p = round(pre_p, 9); + pre_n = round(pre_n, 9); + margin_p = round(margin_p, 9); + margin_n = round(margin_n, 9); + total_en_p = round(total_en_p, 9); + total_en_n = round(total_en_n, 9); + + val = SETFIELD(TX_FFE_TOTAL_ENABLE_N_ENC, 0, total_en_n); + val = SETFIELD(TX_FFE_TOTAL_ENABLE_P_ENC, val, total_en_p); + phy_write(npu_dev, TX_FFE_TOTAL_2RSTEP_EN, val); + + val = SETFIELD(TX_FFE_PRE_N_SEL_ENC, 0, pre_n); + val = SETFIELD(TX_FFE_PRE_P_SEL_ENC, val, pre_p); + phy_write(npu_dev, TX_FFE_PRE_2RSTEP_SEL, val); + + val = SETFIELD(TX_FFE_MARGIN_PD_N_SEL_ENC, 0, margin_n); + val = SETFIELD(TX_FFE_MARGIN_PU_P_SEL_ENC, val, margin_p); + phy_write(npu_dev, TX_FFE_MARGIN_2RSTEP_SEL, val); + + if (npu_dev->index < 2) + npu_dev->npu->tx_zcal_complete[0] = true; + else + npu_dev->npu->tx_zcal_complete[1] = true; + + return PROCEDURE_COMPLETE; +} +DEFINE_PROCEDURE(phy_tx_zcal, phy_tx_zcal_wait, phy_tx_zcal_calculate); + +static uint32_t phy_enable_tx_rxcal(struct npu_dev *npu_dev) +{ + /* Turn common mode on */ + set_lane_reg(npu_dev, TX_MODE2_PL, TX_RXCAL, TX_RXCAL); + + return PROCEDURE_COMPLETE; +} +DEFINE_PROCEDURE(phy_enable_tx_rxcal); + +static uint32_t phy_disable_tx_rxcal(struct npu_dev *npu_dev) +{ + /* Turn common mode 
off */ + set_lane_reg(npu_dev, TX_MODE2_PL, 0, TX_RXCAL); + + return PROCEDURE_COMPLETE; +} +DEFINE_PROCEDURE(phy_disable_tx_rxcal); + +static uint32_t phy_rx_dccal(struct npu_dev *npu_dev) +{ + if (phy_read(npu_dev, RX_LANE_BUSY_VEC_0_15) + & ~phy_read(npu_dev, RX_INIT_DONE_VEC_0_15)) + return PROCEDURE_INPROGRESS; + + return PROCEDURE_NEXT; +} + +static uint32_t phy_rx_dccal_start(struct npu_dev *npu_dev) +{ + uint64_t val; + + /* Save EO step control */ + val = phy_read(npu_dev, RX_EO_STEP_CNTL_PG); + npu_dev->procedure_data = val; + + phy_write(npu_dev, RX_EO_STEP_CNTL_PG, + RX_EO_ENABLE_LATCH_OFFSET_CAL + | RX_EO_ENABLE_CM_COARSE_CAL); + + val = phy_read(npu_dev, RX_RECAL_ABORT_VEC_0_15); + val |= phy_lane_mask(npu_dev); + phy_write(npu_dev, RX_RECAL_ABORT_VEC_0_15, val); + + val = phy_read(npu_dev, RX_RUN_LANE_VEC_0_15); + val |= phy_lane_mask(npu_dev); + phy_write(npu_dev, RX_RUN_LANE_VEC_0_15, val); + + return PROCEDURE_NEXT; +} + +static uint32_t phy_rx_dccal_complete(struct npu_dev *npu_dev) +{ + /* Poll for completion on relevant lanes */ + if ((phy_read(npu_dev, RX_INIT_DONE_VEC_0_15) & phy_lane_mask(npu_dev)) + != phy_lane_mask(npu_dev)) + return PROCEDURE_INPROGRESS; + + return PROCEDURE_NEXT; +} + +static uint32_t phy_rx_dccal_fifo_init(struct npu_dev *npu_dev) +{ + uint64_t val; + + val = phy_read(npu_dev, RX_RUN_LANE_VEC_0_15); + val &= ~phy_lane_mask(npu_dev); + phy_write(npu_dev, RX_RUN_LANE_VEC_0_15, val); + + /* Turn off recal abort */ + val = phy_read(npu_dev, RX_RECAL_ABORT_VEC_0_15); + val &= ~phy_lane_mask(npu_dev); + phy_write(npu_dev, RX_RECAL_ABORT_VEC_0_15, val); + + /* Restore original settings */ + phy_write(npu_dev, RX_EO_STEP_CNTL_PG, npu_dev->procedure_data); + + /* FIFO Init */ + set_lane_reg(npu_dev, TX_MODE2_PL, 0, TX_UNLOAD_CLK_DISABLE); + set_lane_reg(npu_dev, TX_CNTL_STAT2, TX_FIFO_INIT, TX_FIFO_INIT); + set_lane_reg(npu_dev, TX_MODE2_PL, TX_UNLOAD_CLK_DISABLE, + TX_UNLOAD_CLK_DISABLE); + + return PROCEDURE_COMPLETE; +} +DEFINE_PROCEDURE(phy_rx_dccal, phy_rx_dccal_start, phy_rx_dccal_complete, + phy_rx_dccal_fifo_init); + +static uint32_t phy_rx_training(struct npu_dev *npu_dev) +{ + uint16_t val; + + if (!npu_dev->procedure_data) { + val = phy_read(npu_dev, RX_RUN_LANE_VEC_0_15); + val |= phy_lane_mask(npu_dev); + phy_write(npu_dev, RX_RUN_LANE_VEC_0_15, val); + } + + npu_dev->procedure_data++; + if (npu_dev->procedure_data >= 1000000) + return PROCEDURE_COMPLETE | PROCEDURE_FAILED; + + val = phy_read(npu_dev, RX_RUN_LANE_VEC_0_15); + if ((val & phy_lane_mask(npu_dev)) != phy_lane_mask(npu_dev)) + return PROCEDURE_INPROGRESS; + + return PROCEDURE_COMPLETE; +} +DEFINE_PROCEDURE(phy_rx_training); + +static struct procedure *npu_procedures[] = { + &procedure_stop, + &procedure_nop, + NULL, + NULL, + &procedure_phy_reset, + &procedure_phy_tx_zcal, + &procedure_phy_rx_dccal, + &procedure_phy_enable_tx_rxcal, + &procedure_phy_disable_tx_rxcal, + &procedure_phy_rx_training, + &procedure_reset_npu_dl, + + /* Place holders for pre-terminate and terminate procedures */ + &procedure_nop, + &procedure_nop}; + +/* Run a procedure step(s) and return status */ +static uint32_t get_procedure_status(struct npu_dev *dev) +{ + uint32_t result; + uint16_t procedure = dev->procedure_number; + uint16_t step = dev->procedure_step; + const char *name = npu_procedures[procedure]->name; + + do { + result = npu_procedures[procedure]->steps[step](dev); + + if (result & PROCEDURE_NEXT) { + step++; + NPUDEVINF(dev, "Running procedure %s step %d\n", name, step); + } + } while 
(result & PROCEDURE_NEXT); + + dev->procedure_step = step; + + if (result & PROCEDURE_COMPLETE) + NPUDEVINF(dev, "Procedure %s complete\n", name); + else if (mftb() > dev->procedure_tb + msecs_to_tb(100)) { + NPUDEVINF(dev, "Procedure %s timed out\n", name); + result = PROCEDURE_COMPLETE | PROCEDURE_FAILED; + } + + /* Mask off internal state bits */ + dev->procedure_status = result & PROCEDURE_STATUS_MASK; + + return dev->procedure_status; +} + +static int64_t npu_dev_procedure_read(struct npu_dev *dev, uint32_t offset, + uint32_t size, uint32_t *data) +{ + int64_t rc = OPAL_SUCCESS; + + if (size != 4) { + /* Short config reads are not supported */ + prlog(PR_ERR, "NPU%d: Short read of procedure register\n", dev->npu->phb.opal_id); + return OPAL_PARAMETER; + } + + *data = 0; + + switch (offset) { + case 0: + /* Only run the procedure if not already complete */ + if (dev->procedure_status & PROCEDURE_COMPLETE) + *data = dev->procedure_status; + else + *data = get_procedure_status(dev); + + break; + + case 4: + *data = dev->procedure_number; + break; + + default: + prlog(PR_ERR, "NPU%d: Invalid vendor specific offset 0x%08x\n", + dev->npu->phb.opal_id, offset); + rc = OPAL_PARAMETER; + } + + return rc; +} + +static int64_t npu_dev_procedure_write(struct npu_dev *dev, uint32_t offset, + uint32_t size, uint32_t data) +{ + const char *name; + int64_t rc = OPAL_SUCCESS; + + if (size != 4) { + /* Short config writes are not supported */ + prlog(PR_ERR, "NPU%d: Short read of procedure register\n", + dev->npu->phb.opal_id); + return OPAL_PARAMETER; + } + + switch (offset) { + case 0: + /* We ignore writes to the status register */ + NPUDEVINF(dev, "Ignoring writes to status register\n"); + break; + + case 4: + if (data >= ARRAY_SIZE(npu_procedures) || + !npu_procedures[data]) { + NPUDEVINF(dev, "Unsupported procedure number %d\n", data); + dev->procedure_status = PROCEDURE_COMPLETE + | PROCEDURE_UNSUPPORTED; + break; + } + + name = npu_procedures[data]->name; + if (dev->procedure_number == data + && !(dev->procedure_status & PROCEDURE_COMPLETE)) + NPUDEVINF(dev, "Restarting procuedure %s\n", name); + else + NPUDEVINF(dev, "Starting procedure %s\n", name); + + dev->procedure_status = PROCEDURE_INPROGRESS; + dev->procedure_number = data; + dev->procedure_step = 0; + dev->procedure_data = 0; + dev->procedure_tb = mftb(); + break; + + default: + NPUDEVINF(dev, "Invalid vendor specific offset 0x%08x\n", offset); + rc = OPAL_PARAMETER; + } + + return rc; +} + +int64_t npu_dev_procedure(void *dev, struct pci_cfg_reg_filter *pcrf, + uint32_t offset, uint32_t len, uint32_t *data, + bool write) +{ + struct pci_virt_device *pvd = dev; + struct npu_dev *ndev = pvd->data; + + if (write) + return npu_dev_procedure_write(ndev, offset - pcrf->start, + len, *data); + + return npu_dev_procedure_read(ndev, offset - pcrf->start, len, data); +} + +void npu_dev_procedure_reset(struct npu_dev *dev) +{ + dev->procedure_status = 0; + dev->procedure_number = 0; + dev->procedure_step = 0; + dev->procedure_data = 0; +} diff --git a/roms/skiboot/hw/npu-opal.c b/roms/skiboot/hw/npu-opal.c new file mode 100644 index 000000000..412ea460e --- /dev/null +++ b/roms/skiboot/hw/npu-opal.c @@ -0,0 +1,176 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* + * Copyright 2019 IBM Corp. 
+ */ + +#include <skiboot.h> +#include <pci.h> +#include <phb4.h> +#include <npu2.h> +#include <npu3.h> + +static int64_t opal_npu_init_context(uint64_t phb_id, int pid __unused, + uint64_t msr, uint64_t bdf) +{ + struct phb *phb = pci_get_phb(phb_id); + + if (!phb) + return OPAL_PARAMETER; + + if (phb->phb_type == phb_type_npu_v2) + return npu2_init_context(phb, msr, bdf); + + if (phb->phb_type == phb_type_npu_v3) + return npu3_init_context(phb, msr, bdf); + + return OPAL_PARAMETER; +} +opal_call(OPAL_NPU_INIT_CONTEXT, opal_npu_init_context, 4); + +static int64_t opal_npu_destroy_context(uint64_t phb_id, uint64_t pid __unused, + uint64_t bdf) +{ + struct phb *phb = pci_get_phb(phb_id); + + if (!phb) + return OPAL_PARAMETER; + + if (phb->phb_type == phb_type_npu_v2) + return npu2_destroy_context(phb, bdf); + + if (phb->phb_type == phb_type_npu_v3) + return npu3_destroy_context(phb, bdf); + + return OPAL_PARAMETER; +} +opal_call(OPAL_NPU_DESTROY_CONTEXT, opal_npu_destroy_context, 3); + +static int64_t opal_npu_map_lpar(uint64_t phb_id, uint64_t bdf, uint64_t lparid, + uint64_t lpcr) +{ + struct phb *phb = pci_get_phb(phb_id); + + if (!phb) + return OPAL_PARAMETER; + + if (phb->phb_type == phb_type_npu_v2) + return npu2_map_lpar(phb, bdf, lparid, lpcr); + + if (phb->phb_type == phb_type_npu_v3) + return npu3_map_lpar(phb, bdf, lparid, lpcr); + + return OPAL_PARAMETER; +} +opal_call(OPAL_NPU_MAP_LPAR, opal_npu_map_lpar, 4); + +static int npu_check_relaxed_ordering(struct phb *phb, struct pci_device *pd, + void *enable) +{ + /* + * IBM PCIe bridge devices (ie. the root ports) can always allow relaxed + * ordering + */ + if (pd->vdid == 0x04c11014) + pd->allow_relaxed_ordering = true; + + PCIDBG(phb, pd->bdfn, "Checking relaxed ordering config\n"); + if (pd->allow_relaxed_ordering) + return 0; + + PCIDBG(phb, pd->bdfn, "Relaxed ordering not allowed\n"); + *(bool *)enable = false; + + return 1; +} + +static int64_t npu_set_relaxed_order(uint32_t gcid, int pec, bool enable) +{ + struct phb *phb; + int64_t rc; + + for_each_phb(phb) { + if (phb->phb_type == phb_type_npu_v2) + rc = npu2_set_relaxed_order(phb, gcid, pec, enable); + else if (phb->phb_type == phb_type_npu_v3) + rc = npu3_set_relaxed_order(phb, gcid, pec, enable); + else + continue; + + if (rc) + return rc; + } + + return OPAL_SUCCESS; +} + +static int64_t opal_npu_set_relaxed_order(uint64_t phb_id, uint16_t bdfn, + bool request_enabled) +{ + struct phb *phb = pci_get_phb(phb_id); + struct phb4 *phb4; + uint32_t chip_id, pec; + struct pci_device *pd; + bool enable = true; + + if (!phb || phb->phb_type != phb_type_pcie_v4) + return OPAL_PARAMETER; + + phb4 = phb_to_phb4(phb); + pec = phb4->pec; + chip_id = phb4->chip_id; + + if (chip_id & ~0x1b) + return OPAL_PARAMETER; + + pd = pci_find_dev(phb, bdfn); + if (!pd) + return OPAL_PARAMETER; + + /* + * Not changing state, so no need to rescan PHB devices to determine if + * we need to enable/disable it + */ + if (pd->allow_relaxed_ordering == request_enabled) + return OPAL_SUCCESS; + + pd->allow_relaxed_ordering = request_enabled; + + /* + * Walk all devices on this PHB to ensure they all support relaxed + * ordering + */ + pci_walk_dev(phb, NULL, npu_check_relaxed_ordering, &enable); + + if (request_enabled && !enable) { + /* + * Not all devices on this PHB support relaxed-ordering + * mode so we can't enable it as requested + */ + prlog(PR_INFO, "Cannot set relaxed ordering for PEC %d on chip %d\n", + pec, chip_id); + return OPAL_CONSTRAINED; + } + + if (npu_set_relaxed_order(chip_id, 
pec, request_enabled)) { + npu_set_relaxed_order(chip_id, pec, false); + return OPAL_RESOURCE; + } + + phb4->ro_state = request_enabled; + return OPAL_SUCCESS; +} +opal_call(OPAL_NPU_SET_RELAXED_ORDER, opal_npu_set_relaxed_order, 3); + +static int64_t opal_npu_get_relaxed_order(uint64_t phb_id, + uint16_t bdfn __unused) +{ + struct phb *phb = pci_get_phb(phb_id); + struct phb4 *phb4; + + if (!phb || phb->phb_type != phb_type_pcie_v4) + return OPAL_PARAMETER; + + phb4 = phb_to_phb4(phb); + return phb4->ro_state; +} +opal_call(OPAL_NPU_GET_RELAXED_ORDER, opal_npu_get_relaxed_order, 2); diff --git a/roms/skiboot/hw/npu.c b/roms/skiboot/hw/npu.c new file mode 100644 index 000000000..dba7ee50f --- /dev/null +++ b/roms/skiboot/hw/npu.c @@ -0,0 +1,1693 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* + * NVLink1, supported by the NPU (POWER8) + * + * Copyright 2013-2019 IBM Corp. + */ + +#include <skiboot.h> +#include <io.h> +#include <timebase.h> +#include <pci.h> +#include <pci-cfg.h> +#include <pci-virt.h> +#include <pci-slot.h> +#include <interrupts.h> +#include <opal.h> +#include <opal-api.h> +#include <cpu.h> +#include <device.h> +#include <ccan/str/str.h> +#include <ccan/array_size/array_size.h> +#include <ccan/build_assert/build_assert.h> +#include <affinity.h> +#include <npu-regs.h> +#include <npu.h> +#include <xscom.h> +#include <string.h> + +/* + * Terminology: + * + * Brick - A group of either 8 TX or 8 RX lanes + * Link - A group of 8 TX and 8 RX lanes + * + * Each link is represented in system software as an emulated PCI + * device. Garrison has two chips each with 4 links, therefore there + * are 8 emulated PCI devices in total. + * + * +----------------------------------------------------------------+ + * | PBCQ3 (SCOM Base Address 0x2012c00) | + * | PHB3 (SCOM Base Address 0x9012c00) | + * +----------------------------------------------------------------+ + * |||||||| |||||||| + * |||||||| |||||||| + * |||||||| |||||||| + * |||||||| |||||||| + * +----------------------------------------------------------------+ + * | PCIe x8 | + * +----------------------------------------------------------------+ + * | GPU0 | + * +--------------------------------+-------------------------------+ + * | NV Link 1 | NV Link 0 | + * +---------------+----------------+---------------+---------------+ + * | RX | TX | RX | TX | + * +---------------+----------------+---------------+---------------+ + * |||||||| |||||||| |||||||| |||||||| + * |||||||| |||||||| |||||||| |||||||| + * |||||||| |||||||| |||||||| |||||||| + * |||||||| |||||||| |||||||| |||||||| + * +---------------+----------------+---------------+---------------+ + * | TX | RX | TX | RX | + * +---------------+----------------+---------------+---------------+ + * | Lanes [0:7] PHY 0 Lanes [8:15] | + * | SCOM Base Address 0x8000080008010c3f | + * +--------------------------------+-------------------------------+ + * | Link 0 NDL/NTL | Link 1 NTL/NDL | + * | SCOM Base Address 0x8013c00 | SCOM Base Address 0x8013c40 | + * +--------------------------------+-------------------------------+ + * | | + * | Address Translation/AT (shared for all links) | + * | SCOM Base Address 0x8013d80 | + * | | + * +--------------------------------+-------------------------------+ + * | Link 3 NDL/NTL | Link 4 NTL/NDL | + * | SCOM Base Address 0x8013d00 | SCOM Base Address 0x8013d40 | + * +--------------------------------+-------------------------------+ + * | Lanes [8:15] PHY 1 Lanes [0:7] | + * | SCOM Base Address 0x8000080008010c7f | + * 
+---------------+----------------+---------------+---------------+ + * | TX | RX | TX | RX | + * +---------------+----------------+---------------+---------------+ + * |||||||| |||||||| |||||||| |||||||| + * |||||||| |||||||| |||||||| |||||||| + * |||||||| |||||||| |||||||| |||||||| + * |||||||| |||||||| |||||||| |||||||| + * +---------------+----------------+---------------+---------------+ + * | RX | TX | RX | TX | + * +---------------+----------------+---------------+---------------+ + * | NV Link 2 | NV Link 3 | + * +--------------------------------+-------------------------------+ + * | GPU1 | + * +----------------------------------------------------------------+ + * | PCIe x8 | + * +----------------------------------------------------------------+ + * |||||||| |||||||| + * |||||||| |||||||| + * |||||||| |||||||| + * |||||||| |||||||| + * +----------------------------------------------------------------+ + * | PHB2 (SCOM Base Address 0x9012800) | + * | PBCQ2 (SCOM Base Address 0x2012800) | + * +----------------------------------------------------------------+ + * + */ + +static struct npu_dev_cap *npu_dev_find_capability(struct npu_dev *dev, + uint16_t id); + +#define OPAL_NPU_VERSION 0x02 + +#define PCIE_CAP_START 0x40 +#define PCIE_CAP_END 0x80 +#define VENDOR_CAP_START 0x80 +#define VENDOR_CAP_END 0x90 + +#define VENDOR_CAP_PCI_DEV_OFFSET 0x0d + +/* Returns the scom base for the given link index */ +static uint64_t npu_link_scom_base(struct dt_node *dn, uint32_t scom_base, + int index) +{ + struct dt_node *link; + uint32_t link_index; + char namebuf[32]; + + snprintf(namebuf, sizeof(namebuf), "link@%x", index); + link = dt_find_by_name(dn, namebuf); + assert(link); + link_index = dt_prop_get_u32(link, "ibm,npu-link-index"); + return scom_base + (link_index * NPU_LINK_SIZE); +} + +static uint64_t get_bar_size(uint64_t bar) +{ + return (1 << GETFIELD(NX_MMIO_BAR_SIZE, bar)) * 0x10000; +} + +/* Update the changes of the device BAR to link BARs */ +static void npu_dev_bar_update(uint32_t gcid, struct npu_dev_bar *bar, + bool enable) +{ + uint64_t val; + + if (!bar->xscom) + return; + + val = bar->base; + val = SETFIELD(NX_MMIO_BAR_SIZE, val, ilog2(bar->size / 0x10000)); + if (enable) + val |= NX_MMIO_BAR_ENABLE; + xscom_write(gcid, bar->xscom, val); +} + +/* Trap for PCI command (0x4) to enable or disable device's BARs */ +static int64_t npu_dev_cfg_write_cmd(void *dev, + struct pci_cfg_reg_filter *pcrf __unused, + uint32_t offset, uint32_t size, + uint32_t *data, bool write) +{ + struct pci_virt_device *pvd = dev; + struct npu_dev *ndev = pvd->data; + bool enable; + + if (!write) + return OPAL_PARTIAL; + + if (offset != PCI_CFG_CMD) + return OPAL_PARAMETER; + if (size != 1 && size != 2 && size != 4) + return OPAL_PARAMETER; + + /* Update device BARs and link BARs will be syncrhonized + * with hardware automatically. + */ + enable = !!(*data & PCI_CFG_CMD_MEM_EN); + npu_dev_bar_update(ndev->npu->chip_id, &ndev->bar, enable); + + /* Normal path to update PCI config buffer */ + return OPAL_PARTIAL; +} + +/* + * Trap for memory BARs: 0xFF's should be written to BAR register + * prior to getting its size. 
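+ * Writing 0xffffffff to either word of the 64-bit BAR latches that word
+ * of the BAR size, and the next 4-byte read of the same offset returns
+ * it in place of the base address (this emulation's equivalent of the
+ * standard PCI BAR sizing handshake).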
+ */ +static int64_t npu_dev_cfg_bar_read(struct npu_dev *dev __unused, + struct pci_cfg_reg_filter *pcrf, + uint32_t offset, uint32_t size, + uint32_t *data) +{ + struct npu_dev_bar *bar = (struct npu_dev_bar *)(pcrf->data); + + /* Revert to normal path if we weren't trapped for BAR size */ + if (!bar->trapped) + return OPAL_PARTIAL; + + if (offset != pcrf->start && + offset != pcrf->start + 4) + return OPAL_PARAMETER; + if (size != 4) + return OPAL_PARAMETER; + + bar->trapped = false; + *data = bar->bar_sz; + return OPAL_SUCCESS; +} + +static int64_t npu_dev_cfg_bar_write(struct npu_dev *dev, + struct pci_cfg_reg_filter *pcrf, + uint32_t offset, uint32_t size, + uint32_t data) +{ + struct pci_virt_device *pvd = dev->pvd; + struct npu_dev_bar *bar = (struct npu_dev_bar *)(pcrf->data); + uint32_t pci_cmd; + + if (offset != pcrf->start && + offset != pcrf->start + 4) + return OPAL_PARAMETER; + if (size != 4) + return OPAL_PARAMETER; + + /* Return BAR size on next read */ + if (data == 0xffffffff) { + bar->trapped = true; + if (offset == pcrf->start) + bar->bar_sz = (bar->size & 0xffffffff); + else + bar->bar_sz = (bar->size >> 32); + + return OPAL_SUCCESS; + } + + /* Update BAR base address */ + if (offset == pcrf->start) { + bar->base &= 0xffffffff00000000UL; + bar->base |= (data & 0xfffffff0); + } else { + bar->base &= 0x00000000ffffffffUL; + bar->base |= ((uint64_t)data << 32); + + PCI_VIRT_CFG_NORMAL_RD(pvd, PCI_CFG_CMD, 4, &pci_cmd); + npu_dev_bar_update(dev->npu->chip_id, bar, + !!(pci_cmd & PCI_CFG_CMD_MEM_EN)); + } + + /* We still depend on the normal path to update the + * cached config buffer. + */ + return OPAL_PARAMETER; +} + +static int64_t npu_dev_cfg_bar(void *dev, struct pci_cfg_reg_filter *pcrf, + uint32_t offset, uint32_t len, uint32_t *data, + bool write) +{ + struct pci_virt_device *pvd = dev; + struct npu_dev *ndev = pvd->data; + + if (write) + return npu_dev_cfg_bar_write(ndev, pcrf, offset, len, *data); + + return npu_dev_cfg_bar_read(ndev, pcrf, offset, len, data); +} + +static int64_t npu_dev_cfg_exp_devcap(void *dev, + struct pci_cfg_reg_filter *pcrf __unused, + uint32_t offset, uint32_t size, + uint32_t *data, bool write) +{ + struct pci_virt_device *pvd = dev; + struct npu_dev *ndev = pvd->data; + + assert(write); + + if ((size != 2) || (offset & 1)) { + /* Short config writes are not supported */ + prlog(PR_ERR, "NPU%d: Unsupported write to pcie control register\n", + ndev->phb->opal_id); + return OPAL_PARAMETER; + } + + if (*data & PCICAP_EXP_DEVCTL_FUNC_RESET) + npu_dev_procedure_reset(ndev); + + return OPAL_PARTIAL; +} + +static struct npu_dev *bdfn_to_npu_dev(struct npu *p, uint32_t bdfn) +{ + struct pci_virt_device *pvd; + + /* Sanity check */ + if (bdfn & ~0xff) + return NULL; + + pvd = pci_virt_find_device(&p->phb, bdfn); + if (pvd) + return pvd->data; + + return NULL; +} + +#define NPU_CFG_READ(size, type) \ +static int64_t npu_cfg_read##size(struct phb *phb, uint32_t bdfn, \ + uint32_t offset, type *data) \ +{ \ + uint32_t val; \ + int64_t ret; \ + \ + ret = pci_virt_cfg_read(phb, bdfn, offset, sizeof(*data), &val); \ + *data = (type)val; \ + return ret; \ +} +#define NPU_CFG_WRITE(size, type) \ +static int64_t npu_cfg_write##size(struct phb *phb, uint32_t bdfn, \ + uint32_t offset, type data) \ +{ \ + uint32_t val = data; \ + \ + return pci_virt_cfg_write(phb, bdfn, offset, sizeof(data), val); \ +} + +NPU_CFG_READ(8, u8); +NPU_CFG_READ(16, u16); +NPU_CFG_READ(32, u32); +NPU_CFG_WRITE(8, u8); +NPU_CFG_WRITE(16, u16); +NPU_CFG_WRITE(32, u32); + +static 
int __npu_dev_bind_pci_dev(struct phb *phb __unused, + struct pci_device *pd, + void *data) +{ + struct npu_dev *dev = data; + struct dt_node *pci_dt_node; + char *pcislot; + + /* Ignore non-nvidia PCI devices */ + if ((pd->vdid & 0xffff) != 0x10de) + return 0; + + /* Find the PCI device's slot location */ + for (pci_dt_node = pd->dn; + pci_dt_node && !dt_find_property(pci_dt_node, "ibm,slot-label"); + pci_dt_node = pci_dt_node->parent); + + if (!pci_dt_node) + return 0; + + pcislot = (char *)dt_prop_get(pci_dt_node, "ibm,slot-label"); + + prlog(PR_DEBUG, "NPU: comparing GPU %s and NPU %s\n", + pcislot, dev->slot_label); + + if (streq(pcislot, dev->slot_label)) + return 1; + + return 0; +} + +static void npu_dev_bind_pci_dev(struct npu_dev *dev) +{ + struct phb *phb; + uint32_t i; + + if (dev->pd) + return; + + for (i = 0; i < 64; i++) { + if (dev->npu->phb.opal_id == i) + continue; + + phb = pci_get_phb(i); + if (!phb) + continue; + + dev->pd = pci_walk_dev(phb, NULL, __npu_dev_bind_pci_dev, dev); + if (dev->pd) { + dev->phb = phb; + /* Found the device, set the bit in config space */ + PCI_VIRT_CFG_INIT_RO(dev->pvd, VENDOR_CAP_START + + VENDOR_CAP_PCI_DEV_OFFSET, 1, 0x01); + return; + } + } + + prlog(PR_INFO, "%s: No PCI device for NPU device %04x:%02x:%02x.%x to bind to. If you expect a GPU to be there, this is a problem.\n", + __func__, dev->npu->phb.opal_id, + dev->pvd->bdfn >> 8 & 0xff, + dev->pvd->bdfn >> 3 & 0x1f, + dev->pvd->bdfn & 0x7); + +} + +static struct lock pci_npu_phandle_lock = LOCK_UNLOCKED; + +/* Appends an NPU phandle to the given PCI device node ibm,npu + * property */ +static void npu_append_pci_phandle(struct dt_node *dn, u32 phandle) +{ + uint32_t *npu_phandles; + struct dt_property *pci_npu_phandle_prop; + size_t prop_len; + + /* Use a lock to make sure no one else has a reference to an + * ibm,npu property (this assumes this is the only function + * that holds a reference to it). */ + lock(&pci_npu_phandle_lock); + + /* This function shouldn't be called unless ibm,npu exists */ + pci_npu_phandle_prop = (struct dt_property *) + dt_require_property(dn, "ibm,npu", -1); + + /* Need to append to the properties */ + prop_len = pci_npu_phandle_prop->len; + prop_len += sizeof(*npu_phandles); + dt_resize_property(&pci_npu_phandle_prop, prop_len); + + npu_phandles = (uint32_t *) pci_npu_phandle_prop->prop; + npu_phandles[prop_len/sizeof(*npu_phandles) - 1] = phandle; + unlock(&pci_npu_phandle_lock); +} + +static int npu_dn_fixup(struct phb *phb, + struct pci_device *pd, + void *data __unused) +{ + struct npu *p = phb_to_npu(phb); + struct npu_dev *dev; + + dev = bdfn_to_npu_dev(p, pd->bdfn); + assert(dev); + + if (dev->phb || dev->pd) + return 0; + + /* NPU devices require a slot location to associate with GPUs */ + dev->slot_label = dt_prop_get(pd->dn, "ibm,slot-label"); + + /* Bind the emulated PCI device with the real one, which can't + * be done until the PCI devices are populated. 
Once the real + * PCI device is identified, we also need fix the device-tree + * for it + */ + npu_dev_bind_pci_dev(dev); + if (dev->phb && dev->pd && dev->pd->dn) { + if (dt_find_property(dev->pd->dn, "ibm,npu")) + npu_append_pci_phandle(dev->pd->dn, pd->dn->phandle); + else + dt_add_property_cells(dev->pd->dn, "ibm,npu", pd->dn->phandle); + + dt_add_property_cells(pd->dn, "ibm,gpu", dev->pd->dn->phandle); + } + + return 0; +} + +static void npu_phb_final_fixup(struct phb *phb) +{ + pci_walk_dev(phb, NULL, npu_dn_fixup, NULL); +} + +static void npu_ioda_init(struct npu *p) +{ + uint64_t *data64; + uint32_t i; + + /* LXIVT - Disable all LSIs */ + for (i = 0; i < ARRAY_SIZE(p->lxive_cache); i++) { + data64 = &p->lxive_cache[i]; + *data64 = SETFIELD(NPU_IODA_LXIVT_PRIORITY, 0ul, 0xff); + *data64 = SETFIELD(NPU_IODA_LXIVT_SERVER, *data64, 0); + } + + /* PCT - Reset to reserved PE# */ + for (i = 0; i < ARRAY_SIZE(p->pce_cache); i++) { + data64 = &p->pce_cache[i]; + *data64 = SETFIELD(NPU_IODA_PCT_PE, 0ul, 0ul); + *data64 |= NPU_IODA_PCT_LINK_ENABLED; + } + + /* Clear TVT */ + memset(p->tve_cache, 0, sizeof(p->tve_cache)); +} + +static int64_t npu_ioda_reset(struct phb *phb, bool purge) +{ + struct npu *p = phb_to_npu(phb); + uint32_t i; + + if (purge) { + NPUDBG(p, "Purging all IODA tables...\n"); + npu_ioda_init(p); + } + + /* LIST */ + npu_ioda_sel(p, NPU_IODA_TBL_LIST, 0, true); + for (i = 0; i < 8; i++) + out_be64(p->at_regs + NPU_IODA_DATA0, 0x1); + + /* LIXVT */ + npu_ioda_sel(p, NPU_IODA_TBL_LXIVT, 0, true); + for (i = 0; i < ARRAY_SIZE(p->lxive_cache); i++) + out_be64(p->at_regs + NPU_IODA_DATA0, p->lxive_cache[i]); + + /* PCT */ + npu_ioda_sel(p, NPU_IODA_TBL_PCT, 0, true); + for (i = 0; i < ARRAY_SIZE(p->pce_cache); i++) + out_be64(p->at_regs + NPU_IODA_DATA0, p->pce_cache[i]); + + /* TVT */ + npu_ioda_sel(p, NPU_IODA_TBL_TVT, 0, true); + for (i = 0; i < ARRAY_SIZE(p->tve_cache); i++) + out_be64(p->at_regs + NPU_IODA_DATA0, p->tve_cache[i]); + + return OPAL_SUCCESS; +} + +static int npu_isn_valid(struct npu *p, uint32_t isn) +{ + if (p->chip_id != p8_irq_to_chip(isn) || p->index != 0 || + NPU_IRQ_NUM(isn) < NPU_LSI_IRQ_MIN || + NPU_IRQ_NUM(isn) > NPU_LSI_IRQ_MAX) { + /** + * @fwts-label NPUisnInvalid + * @fwts-advice NVLink not functional + */ + prlog(PR_ERR, "NPU%d: isn 0x%x not valid for this NPU\n", + p->phb.opal_id, isn); + return false; + } + + return true; +} + +static int64_t npu_lsi_get_xive(struct irq_source *is, uint32_t isn, + uint16_t *server, uint8_t *prio) +{ + struct npu *p = is->data; + uint32_t irq = NPU_IRQ_NUM(isn); + uint64_t lxive; + + if (!npu_isn_valid(p, isn)) + return OPAL_PARAMETER; + + /* The content is fetched from the cache, which requires + * that the initial cache should be initialized with the + * default values + */ + irq -= NPU_LSI_IRQ_MIN; + lxive = p->lxive_cache[irq]; + *server = GETFIELD(NPU_IODA_LXIVT_SERVER, lxive); + *prio = GETFIELD(NPU_IODA_LXIVT_PRIORITY, lxive); + + return OPAL_SUCCESS; +} + +static int64_t npu_lsi_set_xive(struct irq_source *is, uint32_t isn, + uint16_t server, uint8_t prio) +{ + struct npu *p = is->data; + uint32_t irq = NPU_IRQ_NUM(isn); + uint64_t lxive; + + if (!npu_isn_valid(p, isn)) + return OPAL_PARAMETER; + + /* Figure out LXIVT entry */ + lxive = SETFIELD(NPU_IODA_LXIVT_SERVER, 0ul, server); + lxive = SETFIELD(NPU_IODA_LXIVT_PRIORITY, lxive, prio); + + /* Cache LXIVT entry */ + irq -= NPU_LSI_IRQ_MIN; + p->lxive_cache[irq] = lxive; + + /* Update to LXIVT entry */ + npu_ioda_sel(p, NPU_IODA_TBL_LXIVT, irq, false); 
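+	/* Read-modify-write the live LXIVT entry: only the server and
+	 * priority fields are refreshed, the other bits are kept as read
+	 * back from the hardware.
+	 */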
+ lxive = in_be64(p->at_regs + NPU_IODA_DATA0); + lxive = SETFIELD(NPU_IODA_LXIVT_SERVER, lxive, server); + lxive = SETFIELD(NPU_IODA_LXIVT_PRIORITY, lxive, prio); + out_be64(p->at_regs + NPU_IODA_DATA0, lxive); + + return OPAL_SUCCESS; +} + +static void npu_err_interrupt(struct irq_source *is, uint32_t isn) +{ + struct npu *p = is->data; + uint32_t irq = NPU_IRQ_NUM(isn); + + if (!npu_isn_valid(p, isn)) + return; + + /* There're 4 LSIs used for error reporting: 4/5 for data + * link error reporting while 6/7 for frozen PE detection + */ + irq -= NPU_LSI_IRQ_MIN; + switch (irq) { + case 4 ... 5: + prerror("Invalid NPU error interrupt received\n"); + break; + case 6 ... 7: + opal_update_pending_evt(OPAL_EVENT_PCI_ERROR, + OPAL_EVENT_PCI_ERROR); + } +} + +static uint64_t npu_lsi_attributes(struct irq_source *is, uint32_t isn) +{ + struct npu *p = is->data; + uint32_t idx = isn - p->base_lsi; + + if (idx >= 4) + return IRQ_ATTR_TARGET_OPAL | IRQ_ATTR_TARGET_RARE | IRQ_ATTR_TYPE_LSI; + return IRQ_ATTR_TARGET_LINUX; +} + +/* Error LSIs (skiboot owned) */ +static const struct irq_source_ops npu_lsi_irq_ops = { + .get_xive = npu_lsi_get_xive, + .set_xive = npu_lsi_set_xive, + .attributes = npu_lsi_attributes, + .interrupt = npu_err_interrupt, +}; + +static void npu_register_irq(struct npu *p) +{ + register_irq_source(&npu_lsi_irq_ops, p, p->base_lsi, 8); +} + +static void npu_hw_init(struct npu *p) +{ + /* 3 MMIO setup for AT */ + out_be64(p->at_regs + NPU_LSI_SOURCE_ID, + SETFIELD(NPU_LSI_SRC_ID_BASE, 0ul, NPU_LSI_IRQ_MIN >> 4)); + BUILD_ASSERT((NPU_LSI_IRQ_MIN & 0x07F0) == NPU_LSI_IRQ_MIN); + out_be64(p->at_regs + NPU_INTREP_TIMER, 0x0ul); + npu_ioda_reset(&p->phb, false); +} + +static int64_t npu_map_pe_dma_window_real(struct phb *phb, + uint64_t pe_number, + uint16_t window_id, + uint64_t pci_start_addr, + uint64_t pci_mem_size) +{ + struct npu *p = phb_to_npu(phb); + uint64_t end; + uint64_t tve; + + /* Sanity check. Each PE has one corresponding TVE */ + if (pe_number >= NPU_NUM_OF_PES || + window_id != pe_number) + return OPAL_PARAMETER; + + if (pci_mem_size) { + /* Enable */ + + end = pci_start_addr + pci_mem_size; + + /* We have to be 16M aligned */ + if ((pci_start_addr & 0x00ffffff) || + (pci_mem_size & 0x00ffffff)) + return OPAL_PARAMETER; + + /* + * It *looks* like this is the max we can support (we need + * to verify this. Also we are not checking for rollover, + * but then we aren't trying too hard to protect ourselves + * againt a completely broken OS. + */ + if (end > 0x0003ffffffffffffull) + return OPAL_PARAMETER; + + /* + * Put start address bits 49:24 into TVE[52:53]||[0:23] + * and end address bits 49:24 into TVE[54:55]||[24:47] + * and set TVE[51] + */ + tve = (pci_start_addr << 16) & (0xffffffull << 48); + tve |= (pci_start_addr >> 38) & (3ull << 10); + tve |= (end >> 8) & (0xfffffful << 16); + tve |= (end >> 40) & (3ull << 8); + tve |= PPC_BIT(51); + } else { + /* Disable */ + tve = 0; + } + + npu_ioda_sel(p, NPU_IODA_TBL_TVT, window_id, false); + out_be64(p->at_regs + NPU_IODA_DATA0, tve); + p->tve_cache[window_id] = tve; + + return OPAL_SUCCESS; +} + +static int64_t npu_map_pe_dma_window(struct phb *phb, + uint64_t pe_number, + uint16_t window_id, + uint16_t tce_levels, + uint64_t tce_table_addr, + uint64_t tce_table_size, + uint64_t tce_page_size) +{ + struct npu *p = phb_to_npu(phb); + uint64_t tts_encoded; + uint64_t data64 = 0; + + /* Sanity check. 
Each PE has one corresponding TVE */ + if (pe_number >= NPU_NUM_OF_PES || + window_id != pe_number) + return OPAL_PARAMETER; + + /* Special condition, zero TCE table size used to disable + * the TVE. + */ + if (!tce_table_size) { + npu_ioda_sel(p, NPU_IODA_TBL_TVT, window_id, false); + out_be64(p->at_regs + NPU_IODA_DATA0, 0ul); + p->tve_cache[window_id] = 0ul; + return OPAL_SUCCESS; + } + + /* Additional arguments validation */ + if (tce_levels < 1 || + tce_levels > 4 || + !is_pow2(tce_table_size) || + tce_table_size < 0x1000) + return OPAL_PARAMETER; + + /* TCE table size */ + data64 = SETFIELD(NPU_IODA_TVT_TTA, 0ul, tce_table_addr >> 12); + tts_encoded = ilog2(tce_table_size) - 11; + if (tts_encoded > 39) + return OPAL_PARAMETER; + data64 = SETFIELD(NPU_IODA_TVT_SIZE, data64, tts_encoded); + + /* TCE page size */ + switch (tce_page_size) { + case 0x10000: /* 64K */ + data64 = SETFIELD(NPU_IODA_TVT_PSIZE, data64, 5); + break; + case 0x1000000: /* 16M */ + data64 = SETFIELD(NPU_IODA_TVT_PSIZE, data64, 13); + break; + case 0x10000000: /* 256M */ + data64 = SETFIELD(NPU_IODA_TVT_PSIZE, data64, 17); + break; + case 0x1000: /* 4K */ + default: + data64 = SETFIELD(NPU_IODA_TVT_PSIZE, data64, 1); + } + + /* Number of levels */ + data64 = SETFIELD(NPU_IODA_TVT_LEVELS, data64, tce_levels - 1); + + /* Update to hardware */ + npu_ioda_sel(p, NPU_IODA_TBL_TVT, window_id, false); + out_be64(p->at_regs + NPU_IODA_DATA0, data64); + p->tve_cache[window_id] = data64; + + return OPAL_SUCCESS; +} + +static int64_t npu_set_pe(struct phb *phb, + uint64_t pe_number, + uint64_t bdfn, + uint8_t bcompare, + uint8_t dcompare, + uint8_t fcompare, + uint8_t action) +{ + struct npu *p = phb_to_npu(phb); + struct npu_dev *dev; + uint32_t link_idx; + uint64_t *data64; + + /* Sanity check */ + if (action != OPAL_MAP_PE && + action != OPAL_UNMAP_PE) + return OPAL_PARAMETER; + if (pe_number >= NPU_NUM_OF_PES) + return OPAL_PARAMETER; + + /* All emulated PCI devices hooked to root bus, whose + * bus number is zero. + */ + dev = bdfn_to_npu_dev(p, bdfn); + if (PCI_BUS_NUM(bdfn) || !dev) + return OPAL_PARAMETER; + + link_idx = dev->index; + dev->pe_number = pe_number; + + /* Separate links will be mapped to different PEs */ + if (bcompare != OpalPciBusAll || + dcompare != OPAL_COMPARE_RID_DEVICE_NUMBER || + fcompare != OPAL_COMPARE_RID_FUNCTION_NUMBER) + return OPAL_UNSUPPORTED; + + /* Map the link to the corresponding PE */ + data64 = &p->pce_cache[link_idx]; + if (action == OPAL_MAP_PE) + *data64 = SETFIELD(NPU_IODA_PCT_PE, *data64, + pe_number); + else + *data64 = SETFIELD(NPU_IODA_PCT_PE, *data64, + NPU_NUM_OF_PES); + + *data64 |= NPU_IODA_PCT_LINK_ENABLED; + + npu_ioda_sel(p, NPU_IODA_TBL_PCT, link_idx, false); + out_be64(p->at_regs + NPU_IODA_DATA0, *data64); + + return OPAL_SUCCESS; +} + +static int64_t npu_get_link_state(struct pci_slot *slot __unused, uint8_t *val) +{ + /* As we're emulating all PCI stuff, the link bandwidth + * isn't big deal anyway. + */ + *val = OPAL_SHPC_LINK_UP_x1; + return OPAL_SUCCESS; +} + +static int64_t npu_get_power_state(struct pci_slot *slot __unused, uint8_t *val) +{ + *val = PCI_SLOT_POWER_ON; + return OPAL_SUCCESS; +} + +static int64_t npu_hreset(struct pci_slot *slot __unused) +{ + prlog(PR_DEBUG, "NPU: driver should call reset procedure here\n"); + + return OPAL_SUCCESS; +} + +static int64_t npu_freset(struct pci_slot *slot __unused) +{ + /* FIXME: PHB fundamental reset, which need to be + * figured out later. It's used by EEH recovery + * upon fenced AT. 
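+	 * Until that procedure exists this is a no-op that simply
+	 * reports success, so a fenced AT can effectively only be
+	 * recovered by a reboot (see npu_set_fence_state()).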
+ */ + return OPAL_SUCCESS; +} + +static struct pci_slot *npu_slot_create(struct phb *phb) +{ + struct pci_slot *slot; + + slot = pci_slot_alloc(phb, NULL); + if (!slot) + return slot; + + /* Elementary functions */ + slot->ops.get_presence_state = NULL; + slot->ops.get_link_state = npu_get_link_state; + slot->ops.get_power_state = npu_get_power_state; + slot->ops.get_attention_state = NULL; + slot->ops.get_latch_state = NULL; + slot->ops.set_power_state = NULL; + slot->ops.set_attention_state = NULL; + + slot->ops.prepare_link_change = NULL; + slot->ops.poll_link = NULL; + slot->ops.hreset = npu_hreset; + slot->ops.freset = npu_freset; + slot->ops.creset = NULL; + + return slot; +} + +static int64_t npu_freeze_status(struct phb *phb, + uint64_t pe_number __unused, + uint8_t *freeze_state, + uint16_t *pci_error_type __unused, + uint16_t *severity __unused) +{ + /* FIXME: When it's called by skiboot PCI config accessor, + * the PE number is fixed to 0, which is incorrect. We need + * introduce another PHB callback to translate it. For now, + * it keeps the skiboot PCI enumeration going. + */ + struct npu *p = phb_to_npu(phb); + if (p->fenced) + *freeze_state = OPAL_EEH_STOPPED_MMIO_DMA_FREEZE; + else + *freeze_state = OPAL_EEH_STOPPED_NOT_FROZEN; + return OPAL_SUCCESS; +} + +static int64_t npu_eeh_next_error(struct phb *phb, + uint64_t *first_frozen_pe, + uint16_t *pci_error_type, + uint16_t *severity) +{ + struct npu *p = phb_to_npu(phb); + int i; + uint64_t result = 0; + *first_frozen_pe = -1; + *pci_error_type = OPAL_EEH_NO_ERROR; + *severity = OPAL_EEH_SEV_NO_ERROR; + + if (p->fenced) { + *pci_error_type = OPAL_EEH_PHB_ERROR; + *severity = OPAL_EEH_SEV_PHB_FENCED; + return OPAL_SUCCESS; + } + + npu_ioda_sel(p, NPU_IODA_TBL_PESTB, 0, true); + for (i = 0; i < NPU_NUM_OF_PES; i++) { + result = in_be64(p->at_regs + NPU_IODA_DATA0); + if (result > 0) { + *first_frozen_pe = i; + *pci_error_type = OPAL_EEH_PE_ERROR; + *severity = OPAL_EEH_SEV_PE_ER; + break; + } + } + + return OPAL_SUCCESS; +} + +/* For use in error injection and handling. */ +void npu_set_fence_state(struct npu *p, bool fence) { + p->fenced = fence; + + if (fence) + prlog(PR_ERR, "NPU: Chip %x is fenced, reboot required.\n", + p->chip_id); + else + prlog(PR_WARNING, "NPU: un-fencing is dangerous and should \ + only be used for development purposes."); +} + +/* Sets the NPU to trigger an error when a DMA occurs */ +static int64_t npu_err_inject(struct phb *phb, uint64_t pe_number, + uint32_t type, uint32_t func __unused, + uint64_t addr __unused, uint64_t mask __unused) +{ + struct npu *p = phb_to_npu(phb); + struct npu_dev *dev = NULL; + int i; + + if (pe_number >= NPU_NUM_OF_PES) { + prlog(PR_ERR, "NPU: error injection failed, bad PE given\n"); + return OPAL_PARAMETER; + } + + for (i = 0; i < p->total_devices; i++) { + if (p->devices[i].pe_number == pe_number) { + dev = &p->devices[i]; + break; + } + } + + if (!dev) { + prlog(PR_ERR, "NPU: couldn't find device with PE%llx\n", pe_number); + return OPAL_PARAMETER; + } + + /* TODO: extend this to conform to OPAL injection standards */ + if (type > 1) { + prlog(PR_ERR, "NPU: invalid error injection type\n"); + return OPAL_PARAMETER; + } else if (type == 1) { + /* Emulate fence mode. */ + npu_set_fence_state(p, true); + } else { + /* Cause a freeze with an invalid MMIO read. If the BAR is not + * enabled, this will checkstop the machine. 
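+		 * That is why npu_dev_bar_update() is called below with
+		 * the enable flag set before the in_be64() load from
+		 * bar.base is issued.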
+ */ + npu_dev_bar_update(p->chip_id, &dev->bar, true); + in_be64((void *)dev->bar.base); + } + + return OPAL_SUCCESS; +} + +static const struct phb_ops npu_ops = { + .cfg_read8 = npu_cfg_read8, + .cfg_read16 = npu_cfg_read16, + .cfg_read32 = npu_cfg_read32, + .cfg_write8 = npu_cfg_write8, + .cfg_write16 = npu_cfg_write16, + .cfg_write32 = npu_cfg_write32, + .get_reserved_pe_number = NULL, + .device_init = NULL, + .phb_final_fixup = npu_phb_final_fixup, + .ioda_reset = npu_ioda_reset, + .papr_errinjct_reset = NULL, + .pci_reinit = NULL, + .set_phb_mem_window = NULL, + .phb_mmio_enable = NULL, + .map_pe_mmio_window = NULL, + .map_pe_dma_window = npu_map_pe_dma_window, + .map_pe_dma_window_real = npu_map_pe_dma_window_real, + .pci_msi_eoi = NULL, + .set_xive_pe = NULL, + .get_msi_32 = NULL, + .get_msi_64 = NULL, + .set_pe = npu_set_pe, + .set_peltv = NULL, + .eeh_freeze_status = npu_freeze_status, + .eeh_freeze_clear = NULL, + .eeh_freeze_set = NULL, + .next_error = npu_eeh_next_error, + .err_inject = npu_err_inject, + .get_diag_data2 = NULL, + .set_capi_mode = NULL, + .set_capp_recovery = NULL, +}; + +static void assign_mmio_bars(uint32_t gcid, uint32_t xscom, + struct dt_node *npu_dn, uint64_t mm_win[2], + uint64_t at_bar[2]) +{ + uint64_t mem_start, mem_end; + struct npu_dev_bar bar; + struct dt_node *link; + + /* Configure BAR selection. + * + * Currently, each PHY contains 2 links and each link has 2 + * BARs. The first BAR is assigned to the DLTL region which is + * what the kernel uses. The second BAR is either assigned to + * either the PL or AT region or unassigned. The PL0/PL1/AT + * MMIO regions are not exposed to the kernel so we assigned + * them at the start of the available memory area followed by + * the DLTL regions. So we end up with the following memory + * map (assuming we're given a memory region starting at + * 0x3fff000000000): + * + * Link#0-BAR#0: NTL/NDL BAR (128KB) - 0x3fff000420000 + * Link#0-BAR#1: PL0 BAR ( 2MB) - 0x3fff000000000 + * Link#1-BAR#0: NTL/NDL BAR (128KB) - 0x3fff000440000 + * Link#1-BAR#1: AT BAR ( 64KB) - 0x3fff000400000 + * Link#2-BAR#0: NTL/NDL BAR (128KB) - 0x3fff000460000 + * Link#2-BAR#1: PL1 BAR ( 2MB) - 0x3fff000200000 + * Link#3-BAR#0: NTL/NDL BAR (128KB) - 0x3fff000480000 + * Link#3-BAR#1: UNASSIGNED + */ + xscom_write(gcid, xscom + NPU_AT_SCOM_OFFSET + NX_BAR, + 0x0211000043500000UL); + + xscom_read(gcid, npu_link_scom_base(npu_dn, xscom, 0) + NX_MMIO_BAR_0, + &mem_start); + mem_start = GETFIELD(NX_MMIO_BAR_BASE, mem_start) << 12; + + xscom_read(gcid, npu_link_scom_base(npu_dn, xscom, 5) + NX_MMIO_BAR_0, + &mem_end); + mem_end = (GETFIELD(NX_MMIO_BAR_BASE, mem_end) << 12) + + get_bar_size(mem_end); + + /* PL0 BAR comes first at 0x3fff000000000 */ + bar.xscom = npu_link_scom_base(npu_dn, xscom, 0) + NX_MMIO_BAR_1; + bar.base = mem_start; + bar.size = NX_MMIO_PL_SIZE; + npu_dev_bar_update(gcid, &bar, true); + + /* PL1 BAR */ + bar.xscom = npu_link_scom_base(npu_dn, xscom, 4) + NX_MMIO_BAR_1; + bar.base += bar.size; + bar.size = NX_MMIO_PL_SIZE; + npu_dev_bar_update(gcid, &bar, true); + + /* Then the AT BAR */ + bar.xscom = npu_link_scom_base(npu_dn, xscom, 1) + NX_MMIO_BAR_1; + bar.base += bar.size; + bar.size = NX_MMIO_AT_SIZE; + at_bar[0] = bar.base; + at_bar[1] = NX_MMIO_AT_SIZE; + npu_dev_bar_update(gcid, &bar, true); + + /* Now we configure all the DLTL BARs. These are the ones + * actually exposed to the kernel. 
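+	 * mm_win[] records the start and size of the combined DLTL
+	 * window; it is exported through the "ibm,mmio-window"
+	 * property and ends up as the PHB memory window advertised
+	 * to the OS.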
*/ + mm_win[0] = bar.base + bar.size; + dt_for_each_node(npu_dn, link) { + uint32_t index; + + index = dt_prop_get_u32(link, "ibm,npu-link-index"); + bar.xscom = npu_link_scom_base(npu_dn, xscom, index) + + NX_MMIO_BAR_0; + bar.base += bar.size; + bar.size = NX_MMIO_DL_SIZE; + bar.base = ALIGN_UP(bar.base, bar.size); + npu_dev_bar_update(gcid, &bar, false); + } + mm_win[1] = (bar.base + bar.size) - mm_win[0]; + + /* If we weren't given enough room to setup all the BARs we + * require it's better to crash here than risk creating + * overlapping BARs which will xstop the machine randomly in + * the future.*/ + assert(bar.base + bar.size <= mem_end); +} + +/* Probe NPU device node and create PCI root device node + * accordingly. The NPU deivce node should specify number + * of links and xscom base address to access links. + */ +static void npu_probe_phb(struct dt_node *dn) +{ + struct dt_node *np; + uint32_t gcid, index, phb_index, xscom; + uint64_t at_bar[2], mm_win[2]; + uint32_t links; + char *path; + + /* Retrieve chip id */ + path = dt_get_path(dn); + gcid = dt_get_chip_id(dn); + index = dt_prop_get_u32(dn, "ibm,npu-index"); + phb_index = dt_prop_get_u32(dn, "ibm,phb-index"); + links = dt_prop_get_u32(dn, "ibm,npu-links"); + prlog(PR_INFO, "Chip %d Found NPU%d (%d links) at %s\n", + gcid, index, links, path); + free(path); + + /* Retrieve xscom base addr */ + xscom = dt_get_address(dn, 0, NULL); + prlog(PR_INFO, " XSCOM Base: %08x\n", xscom); + + assign_mmio_bars(gcid, xscom, dn, mm_win, at_bar); + prlog(PR_INFO, " AT BAR: %016llx (%lldKB)\n", + at_bar[0], at_bar[1] / 0x400); + + /* Create PCI root device node */ + np = dt_new_addr(dt_root, "pciex", at_bar[0]); + assert(np); + + dt_add_property_strings(np, "compatible", + "ibm,power8-npu-pciex", "ibm,ioda2-npu-phb"); + dt_add_property_strings(np, "device_type", "pciex"); + dt_add_property(np, "reg", at_bar, sizeof(at_bar)); + + dt_add_property_cells(np, "ibm,phb-index", phb_index); + dt_add_property_cells(np, "ibm,npu-index", index); + dt_add_property_cells(np, "ibm,chip-id", gcid); + dt_add_property_cells(np, "ibm,xscom-base", xscom); + dt_add_property_cells(np, "ibm,npcq", dn->phandle); + dt_add_property_cells(np, "ibm,links", links); + dt_add_property(np, "ibm,mmio-window", mm_win, sizeof(mm_win)); + dt_add_property_cells(np, "ibm,phb-diag-data-size", 0); + + /* Disable fast reboot - not currently supported */ + disable_fast_reboot("NVLink device enabled"); +} + +static void npu_dev_populate_vendor_cap(struct npu_dev_cap *cap) +{ + struct npu_dev *dev = cap->dev; + struct pci_virt_device *pvd = dev->pvd; + uint32_t offset = cap->start; + uint8_t val; + + /* Add length and version information */ + val = cap->end - cap->start; + PCI_VIRT_CFG_INIT_RO(pvd, offset + 2, 1, val); + PCI_VIRT_CFG_INIT_RO(pvd, offset + 3, 1, OPAL_NPU_VERSION); + offset += 4; + + /* Defaults when the trap can't handle the read/write (eg. due + * to reading/writing less than 4 bytes). 
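+	 * The two read-only words below back the 8-byte procedure
+	 * control/status area trapped by npu_dev_procedure(), so
+	 * accesses the trap does not handle simply read as zero.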
*/ + val = 0x0; + PCI_VIRT_CFG_INIT_RO(pvd, offset, 4, val); + PCI_VIRT_CFG_INIT_RO(pvd, offset + 4, 4, val); + + /* Create a trap for AT/PL procedures */ + pci_virt_add_filter(pvd, offset, 8, + PCI_REG_FLAG_READ | PCI_REG_FLAG_WRITE, + npu_dev_procedure, NULL); + offset += 8; + + PCI_VIRT_CFG_INIT_RO(pvd, offset, 1, dev->index); +} + +static void npu_dev_populate_pcie_cap(struct npu_dev_cap *cap) +{ + struct npu_dev *dev = cap->dev; + struct pci_virt_device *pvd = dev->pvd; + uint32_t base = cap->start; + uint32_t val; + + /* Sanity check on capability ID */ + if (cap->id != PCI_CFG_CAP_ID_EXP) { + prlog(PR_NOTICE, "%s: Invalid capability ID %d (%d)\n", + __func__, cap->id, PCI_CFG_CAP_ID_EXP); + return; + } + + /* Sanity check on spanned registers */ + if ((cap->end - cap->start) < PCIE_CAP_START) { + prlog(PR_NOTICE, "%s: Invalid reg region [%x, %x] for cap %d\n", + __func__, cap->start, cap->end, cap->id); + return; + } + + /* 0x00 - ID/PCIE capability */ + val = cap->id; + val |= ((0x2 << 16) | (PCIE_TYPE_ENDPOINT << 20)); + PCI_VIRT_CFG_INIT_RO(pvd, base, 4, val); + + /* 0x04 - Device capability + * + * We should support FLR. Otherwise, it might have + * problem passing it through to userland via Linux + * VFIO infrastructure + */ + val = ((PCIE_MPSS_128) | + (PCIE_PHANTOM_NONE << 3) | + (PCIE_L0SL_MAX_NO_LIMIT << 6) | + (PCIE_L1L_MAX_NO_LIMIT << 9) | + (PCICAP_EXP_DEVCAP_FUNC_RESET)); + PCI_VIRT_CFG_INIT_RO(pvd, base + PCICAP_EXP_DEVCAP, 4, val); + + pci_virt_add_filter(pvd, base + PCICAP_EXP_DEVCTL, 2, + PCI_REG_FLAG_WRITE, + npu_dev_cfg_exp_devcap, NULL); + + /* 0x08 - Device control and status */ + PCI_VIRT_CFG_INIT(pvd, base + PCICAP_EXP_DEVCTL, 4, 0x00002810, + 0xffff0000, 0x000f0000); + + /* 0x0c - Link capability */ + val = (PCIE_LSPEED_VECBIT_2 | (PCIE_LWIDTH_1X << 4)); + PCI_VIRT_CFG_INIT_RO(pvd, base + PCICAP_EXP_LCAP, 4, val); + + /* 0x10 - Link control and status */ + PCI_VIRT_CFG_INIT(pvd, base + PCICAP_EXP_LCTL, 4, 0x00130000, + 0xfffff000, 0xc0000000); + + /* 0x14 - Slot capability */ + PCI_VIRT_CFG_INIT_RO(pvd, base + PCICAP_EXP_SLOTCAP, 4, 0x00000000); + + /* 0x18 - Slot control and status */ + PCI_VIRT_CFG_INIT_RO(pvd, base + PCICAP_EXP_SLOTCTL, 4, 0x00000000); + + /* 0x1c - Root control and capability */ + PCI_VIRT_CFG_INIT(pvd, base + PCICAP_EXP_RC, 4, 0x00000000, + 0xffffffe0, 0x00000000); + + /* 0x20 - Root status */ + PCI_VIRT_CFG_INIT(pvd, base + PCICAP_EXP_RSTAT, 4, 0x00000000, + 0xffffffff, 0x00010000); + + /* 0x24 - Device capability 2 */ + PCI_VIRT_CFG_INIT_RO(pvd, base + PCIECAP_EXP_DCAP2, 4, 0x00000000); + + /* 0x28 - Device Control and status 2 */ + PCI_VIRT_CFG_INIT(pvd, base + PCICAP_EXP_DCTL2, 4, 0x00070000, + 0xffff0000, 0x00000000); + + /* 0x2c - Link capability 2 */ + PCI_VIRT_CFG_INIT_RO(pvd, base + PCICAP_EXP_LCAP2, 4, 0x00000007); + + /* 0x30 - Link control and status 2 */ + PCI_VIRT_CFG_INIT(pvd, base + PCICAP_EXP_LCTL2, 4, 0x00000003, + 0xffff0000, 0x00200000); + + /* 0x34 - Slot capability 2 */ + PCI_VIRT_CFG_INIT_RO(pvd, base + PCICAP_EXP_SCAP2, 4, 0x00000000); + + /* 0x38 - Slot control and status 2 */ + PCI_VIRT_CFG_INIT_RO(pvd, base + PCICAP_EXP_SCTL2, 4, 0x00000000); +} + +static struct npu_dev_cap *npu_dev_create_capability(struct npu_dev *dev, + void (*populate)(struct npu_dev_cap *), + uint16_t id, + uint16_t start, + uint16_t end) +{ + struct npu_dev_cap *cap; + + /* Check if the capability is existing */ + cap = npu_dev_find_capability(dev, id); + if (cap) + return cap; + + /* Allocate new one */ + cap = zalloc(sizeof(struct 
npu_dev_cap)); + assert(cap); + + /* Put it into the pool */ + cap->id = id; + cap->start = start; + cap->end = end; + cap->dev = dev; + cap->populate = populate; + list_add_tail(&dev->capabilities, &cap->link); + + return cap; +} + +static struct npu_dev_cap *npu_dev_find_capability(struct npu_dev *dev, + uint16_t id) +{ + struct npu_dev_cap *cap; + + list_for_each(&dev->capabilities, cap, link) { + if (cap->id == id) + return cap; + } + + return NULL; +} + +/* + * All capabilities should be put into the device capability + * list according to register offset in ascending order for + * easy access at later point. + */ +static void npu_dev_create_capabilities(struct npu_dev *dev) +{ + list_head_init(&dev->capabilities); + + /* PCI express capability */ + npu_dev_create_capability(dev, npu_dev_populate_pcie_cap, + PCI_CFG_CAP_ID_EXP, PCIE_CAP_START, + PCIE_CAP_END); + + /* Vendor specific capability */ + npu_dev_create_capability(dev, npu_dev_populate_vendor_cap, + PCI_CFG_CAP_ID_VENDOR, VENDOR_CAP_START, + VENDOR_CAP_END); +} + +static void npu_dev_create_cfg(struct npu_dev *dev) +{ + struct pci_virt_device *pvd = dev->pvd; + struct npu_dev_cap *cap; + uint32_t offset; + uint32_t last_cap_offset; + + /* 0x00 - Vendor/Device ID */ + PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_VENDOR_ID, 4, 0x04ea1014); + + /* 0x04 - Command/Status + * + * Create one trap to trace toggling memory BAR enable bit + */ + PCI_VIRT_CFG_INIT(pvd, PCI_CFG_CMD, 4, 0x00100000, 0xffb802b8, + 0xf9000000); + + pci_virt_add_filter(pvd, PCI_CFG_CMD, 1, PCI_REG_FLAG_WRITE, + npu_dev_cfg_write_cmd, NULL); + + /* 0x08 - Rev/Class/Cache */ + PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_REV_ID, 4, 0x06800100); + + /* 0x0c - CLS/Latency Timer/Header/BIST */ + PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_CACHE_LINE_SIZE, 4, 0x00800000); + + /* 0x10 - BARs, always 64-bits non-prefetchable + * + * Each emulated device represents one link and therefore + * there is one BAR for the associated DLTL region. + */ + + /* Low 32-bits */ + PCI_VIRT_CFG_INIT(pvd, PCI_CFG_BAR0, 4, + (dev->bar.base & 0xfffffff0) | dev->bar.flags, + 0x0000000f, 0x00000000); + + /* High 32-bits */ + PCI_VIRT_CFG_INIT(pvd, PCI_CFG_BAR1, 4, (dev->bar.base >> 32), + 0x00000000, 0x00000000); + + /* + * Create trap. Writting 0xFF's to BAR registers should be + * trapped and return size on next read + */ + pci_virt_add_filter(pvd, PCI_CFG_BAR0, 8, + PCI_REG_FLAG_READ | PCI_REG_FLAG_WRITE, + npu_dev_cfg_bar, &dev->bar); + + /* 0x18/1c/20/24 - Disabled BAR#2/3/4/5 + * + * Mark those BARs readonly so that 0x0 will be returned when + * probing the length and the BARs will be skipped. + */ + PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_BAR2, 4, 0x00000000); + PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_BAR3, 4, 0x00000000); + PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_BAR4, 4, 0x00000000); + PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_BAR5, 4, 0x00000000); + + /* 0x28 - Cardbus CIS pointer */ + PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_CARDBUS_CIS, 4, 0x00000000); + + /* 0x2c - Subsystem ID */ + PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_SUBSYS_VENDOR_ID, 4, 0x00000000); + + /* 0x30 - ROM BAR + * + * Force its size to be zero so that the kernel will skip + * probing the ROM BAR. We needn't emulate ROM BAR. 
+ */ + PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_ROMBAR, 4, 0xffffffff); + + /* 0x34 - PCI Capability + * + * By default, we don't have any capabilities + */ + PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_CAP, 4, 0x00000000); + + last_cap_offset = PCI_CFG_CAP - 1; + list_for_each(&dev->capabilities, cap, link) { + offset = cap->start; + + /* Initialize config space for the capability */ + if (cap->populate) + cap->populate(cap); + + /* Add capability header */ + PCI_VIRT_CFG_INIT_RO(pvd, offset, 2, cap->id); + + /* Update the next capability pointer */ + PCI_VIRT_CFG_NORMAL_WR(pvd, last_cap_offset + 1, 1, offset); + + last_cap_offset = offset; + } + + /* 0x38 - Reserved */ + PCI_VIRT_CFG_INIT_RO(pvd, 0x38, 4, 0x00000000); + + /* 0x3c - INT line/pin/Minimal grant/Maximal latency */ + if (!(dev->index % 2)) + PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_INT_LINE, 4, 0x00000100); + else + PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_INT_LINE, 4, 0x00000200); +} + +static uint32_t npu_allocate_bdfn(struct npu *p, uint32_t group) +{ + int i; + int bdfn = (group << 3); + + for (i = 0; i < p->total_devices; i++) { + if ((p->devices[i].pvd->bdfn & 0xf8) == (bdfn & 0xf8)) + bdfn++; + } + + return bdfn; +} + +static void npu_create_devices(struct dt_node *dn, struct npu *p) +{ + struct npu_dev *dev; + struct dt_node *npu_dn, *link; + uint32_t bdfn, npu_phandle, index = 0; + uint64_t buid_reg; + uint64_t lsisrcid; + uint64_t buid; + + + /* The bits in the LSI ID Base register are always compared and + * can be set to 0 in the buid base and mask fields. The + * buid (bus unit id) is the full irq minus the last 4 bits. */ + lsisrcid = GETFIELD(NPU_LSI_SRC_ID_BASE, NPU_LSI_SRC_ID_BASE); + buid = p8_chip_irq_block_base(p->chip_id, P8_IRQ_BLOCK_MISC) >> 4; + + buid_reg = SETFIELD(NP_IRQ_LEVELS, NP_BUID_ENABLE, ~0); + buid_reg = SETFIELD(NP_BUID_MASK, buid_reg, ~lsisrcid); + buid_reg = SETFIELD(NP_BUID_BASE, buid_reg, (buid & ~lsisrcid)); + + /* Get the npu node which has the links which we expand here + * into pci like devices attached to our emulated phb. 
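+	 * The phandle comes from the "ibm,npcq" property that
+	 * npu_probe_phb() added to the emulated PHB node, pointing
+	 * back at the original NPU xscom node.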
*/ + npu_phandle = dt_prop_get_u32(dn, "ibm,npcq"); + npu_dn = dt_find_by_phandle(dt_root, npu_phandle); + assert(npu_dn); + + /* Walk the link@x nodes to initialize devices */ + p->total_devices = 0; + p->phb.scan_map = 0; + list_head_init(&p->phb.virt_devices); + dt_for_each_compatible(npu_dn, link, "ibm,npu-link") { + struct npu_dev_bar *bar; + uint32_t group_id; + uint64_t val; + + dev = &p->devices[index]; + dev->index = dt_prop_get_u32(link, "ibm,npu-link-index"); + dev->xscom = npu_link_scom_base(npu_dn, p->xscom_base, + dev->index); + + dev->npu = p; + dev->dt_node = link; + + /* We don't support MMIO PHY access yet */ + dev->pl_base = NULL; + + group_id = dt_prop_get_u32(link, "ibm,npu-group-id"); + bdfn = npu_allocate_bdfn(p, group_id); + + /* This must be done after calling + * npu_allocate_bdfn() */ + p->total_devices++; + p->phb.scan_map |= 0x1 << ((bdfn & 0xf8) >> 3); + + dev->pl_xscom_base = dt_prop_get_u64(link, "ibm,npu-phy"); + dev->lane_mask = dt_prop_get_u32(link, "ibm,npu-lane-mask"); + + /* Setup BUID/ISRN */ + xscom_write(p->chip_id, dev->xscom + NX_NP_BUID, buid_reg); + + /* Create PCI virtual device */ + dev->pvd = pci_virt_add_device(&p->phb, bdfn, NPU_DEV_CFG_SIZE, dev); + assert(dev->pvd); + bar = &dev->bar; + bar->flags = (PCI_CFG_BAR_TYPE_MEM | + PCI_CFG_BAR_MEM64); + + /* Update BAR info */ + bar->xscom = dev->xscom + NX_MMIO_BAR_0; + xscom_read(p->chip_id, bar->xscom, &val); + bar->base = GETFIELD(NX_MMIO_BAR_BASE, val) << 12; + bar->size = get_bar_size(val); + + /* + * The config space is initialised with the BARs + * disabled, so make sure it is actually disabled in + * hardware. + */ + npu_dev_bar_update(p->chip_id, bar, false); + + /* Initialize capabilities */ + npu_dev_create_capabilities(dev); + + /* Initialize config space */ + npu_dev_create_cfg(dev); + + index++; + } +} + +static void npu_add_phb_properties(struct npu *p) +{ + struct dt_node *np = p->phb.dt_node; + uint32_t icsp = get_ics_phandle(); + uint64_t tkill, mm_base, mm_size; + uint32_t base_lsi = p->base_lsi; + uint32_t map[] = { + /* Dev 0 INT#A (used by fn0) */ + 0x0000, 0x0, 0x0, 0x1, icsp, base_lsi + NPU_LSI_INT_DL0, 1, + /* Dev 0 INT#B (used by fn1) */ + 0x0000, 0x0, 0x0, 0x2, icsp, base_lsi + NPU_LSI_INT_DL1, 1, + /* Dev 1 INT#A (used by fn0) */ + 0x0800, 0x0, 0x0, 0x1, icsp, base_lsi + NPU_LSI_INT_DL2, 1, + /* Dev 1 INT#B (used by fn1) */ + 0x0800, 0x0, 0x0, 0x2, icsp, base_lsi + NPU_LSI_INT_DL3, 1, + }; + /* Mask is bus, device and INT# */ + uint32_t mask[] = {0xf800, 0x0, 0x0, 0x7}; + char slotbuf[32]; + + /* Add various properties that HB doesn't have to + * add, some of them simply because they result from + * policy decisions made in skiboot rather than in HB + * such as the MMIO windows going to PCI, interrupts, + * etc. + */ + dt_add_property_cells(np, "#address-cells", 3); + dt_add_property_cells(np, "#size-cells", 2); + dt_add_property_cells(np, "#interrupt-cells", 1); + dt_add_property_cells(np, "bus-range", 0, 0xff); + dt_add_property_cells(np, "clock-frequency", 0x200, 0); + dt_add_property_cells(np, "interrupt-parent", icsp); + + /* DLPL Interrupts, we don't use the standard swizzle */ + p->phb.lstate.int_size = 0; + dt_add_property(np, "interrupt-map", map, sizeof(map)); + dt_add_property(np, "interrupt-map-mask", mask, sizeof(mask)); + + /* NPU PHB properties */ + /* TODO: Due to an errata TCE KILL only works when DMA traffic + * has been stopped. We need to implement the work around + * which is to do a TCE kill all instead. 
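+	 * The register address is still advertised to the OS below
+	 * through the "ibm,opal-tce-kill" property.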
*/ + tkill = cleanup_addr((uint64_t)p->at_regs) + NPU_TCE_KILL; + dt_add_property_cells(np, "ibm,opal-num-pes", + NPU_NUM_OF_PES); + dt_add_property_cells(np, "ibm,opal-reserved-pe", + 0); + dt_add_property_u64(np, "ibm,opal-tce-kill", tkill); + + /* Memory window is exposed as 32-bits non-prefetchable + * one because 64-bits prefetchable one is kind of special + * to kernel. + */ + mm_base = p->mm_base; + mm_size = p->mm_size; + dt_add_property_cells(np, "ranges", 0x02000000, + hi32(mm_base), lo32(mm_base), + hi32(mm_base), lo32(mm_base), + hi32(mm_size), lo32(mm_size)); + + /* Set the slot location on the NPU PHB. This PHB can contain + * devices that correlate with multiple physical slots, so + * present the chip ID instead. + */ + snprintf(slotbuf, sizeof(slotbuf), "NPU Chip %d", p->chip_id); + dt_add_property_string(np, "ibm,io-base-loc-code", slotbuf); +} + +static void npu_create_phb(struct dt_node *dn) +{ + const struct dt_property *prop; + struct npu *p; + struct pci_slot *slot; + uint32_t links; + void *pmem; + + /* Retrieve number of devices */ + links = dt_prop_get_u32(dn, "ibm,links"); + pmem = zalloc(sizeof(struct npu) + links * sizeof(struct npu_dev)); + assert(pmem); + + /* Populate PHB */ + p = pmem; + p->index = dt_prop_get_u32(dn, "ibm,npu-index"); + p->chip_id = dt_prop_get_u32(dn, "ibm,chip-id"); + p->xscom_base = dt_prop_get_u32(dn, "ibm,xscom-base"); + p->total_devices = links; + + /* TODO: When hardware fences are implemented, detect them here */ + p->fenced = false; + + /* This is the AT base */ + p->at_xscom = p->xscom_base + NPU_AT_SCOM_OFFSET; + p->at_regs = (void *)dt_get_address(dn, 0, NULL); + + prop = dt_require_property(dn, "ibm,mmio-window", -1); + assert(prop->len >= (2 * sizeof(uint64_t))); + p->mm_base = ((const uint64_t *)prop->prop)[0]; + p->mm_size = ((const uint64_t *)prop->prop)[1]; + + p->devices = pmem + sizeof(struct npu); + + /* Interrupt */ + p->base_lsi = p8_chip_irq_block_base(p->chip_id, P8_IRQ_BLOCK_MISC) + + NPU_LSI_IRQ_MIN; + + /* Generic PHB */ + p->phb.dt_node = dn; + p->phb.ops = &npu_ops; + p->phb.phb_type = phb_type_pcie_v3; + + /* Populate devices */ + npu_create_devices(dn, p); + + /* Populate extra properties */ + npu_add_phb_properties(p); + + /* Create PHB slot */ + slot = npu_slot_create(&p->phb); + if (!slot) + { + /** + * @fwts-label NPUCannotCreatePHBSlot + * @fwts-advice Firmware probably ran out of memory creating + * NPU slot. NVLink functionality could be broken. + */ + prlog(PR_ERR, "NPU: Cannot create PHB slot\n"); + } + + /* Register PHB */ + pci_register_phb(&p->phb, OPAL_DYNAMIC_PHB_ID); + + /* Initialize IODA cache */ + npu_ioda_init(p); + + /* Register interrupt source */ + npu_register_irq(p); + + /* Initialize hardware */ + npu_hw_init(p); +} + +void probe_npu(void) +{ + struct dt_node *np; + + /* Scan NPU XSCOM nodes */ + dt_for_each_compatible(dt_root, np, "ibm,power8-npu") + npu_probe_phb(np); + + /* Scan newly created PHB nodes */ + dt_for_each_compatible(dt_root, np, "ibm,power8-npu-pciex") + npu_create_phb(np); +} diff --git a/roms/skiboot/hw/npu2-common.c b/roms/skiboot/hw/npu2-common.c new file mode 100644 index 000000000..3bc9bcee6 --- /dev/null +++ b/roms/skiboot/hw/npu2-common.c @@ -0,0 +1,681 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* Copyright 2013-2019 IBM Corp. 
*/ + +#include <skiboot.h> +#include <xscom.h> +#include <pci.h> +#include <npu2.h> +#include <npu2-regs.h> +#include <bitutils.h> +#include <nvram.h> +#include <i2c.h> +#include <interrupts.h> +#include <xive.h> + +#define NPU2_IRQ_BASE_SHIFT 13 +#define NPU2_N_DL_IRQS 35 +#define NPU2_N_DL_IRQS_ALIGN 64 + +/* + * We use the indirect method because it uses the same addresses as + * the MMIO offsets (NPU RING) + */ +static void npu2_scom_set_addr(uint64_t gcid, uint64_t scom_base, + uint64_t addr, uint64_t size) +{ + addr = SETFIELD(NPU2_MISC_DA_ADDR, 0ull, addr); + addr = SETFIELD(NPU2_MISC_DA_LEN, addr, size); + xscom_write(gcid, scom_base + NPU2_MISC_SCOM_IND_SCOM_ADDR, addr); +} + +void npu2_scom_write(uint64_t gcid, uint64_t scom_base, + uint64_t reg, uint64_t size, + uint64_t val) +{ + npu2_scom_set_addr(gcid, scom_base, reg, size); + xscom_write(gcid, scom_base + NPU2_MISC_SCOM_IND_SCOM_DATA, val); +} + +uint64_t npu2_scom_read(uint64_t gcid, uint64_t scom_base, + uint64_t reg, uint64_t size) +{ + uint64_t val; + + npu2_scom_set_addr(gcid, scom_base, reg, size); + xscom_read(gcid, scom_base + NPU2_MISC_SCOM_IND_SCOM_DATA, &val); + + return val; +} + +void npu2_write_4b(struct npu2 *p, uint64_t reg, uint32_t val) +{ + npu2_scom_write(p->chip_id, p->xscom_base, reg, NPU2_MISC_DA_LEN_4B, + (uint64_t)val << 32); +} + +uint32_t npu2_read_4b(struct npu2 *p, uint64_t reg) +{ + return npu2_scom_read(p->chip_id, p->xscom_base, reg, + NPU2_MISC_DA_LEN_4B) >> 32; +} + +void npu2_write(struct npu2 *p, uint64_t reg, uint64_t val) +{ + npu2_scom_write(p->chip_id, p->xscom_base, reg, NPU2_MISC_DA_LEN_8B, val); +} + +uint64_t npu2_read(struct npu2 *p, uint64_t reg) +{ + return npu2_scom_read(p->chip_id, p->xscom_base, reg, NPU2_MISC_DA_LEN_8B); +} + +void npu2_write_mask(struct npu2 *p, uint64_t reg, uint64_t val, uint64_t mask) +{ + uint64_t new_val; + + new_val = npu2_read(p, reg); + new_val &= ~mask; + new_val |= val & mask; + npu2_scom_write(p->chip_id, p->xscom_base, reg, NPU2_MISC_DA_LEN_8B, new_val); +} + +void npu2_write_mask_4b(struct npu2 *p, uint64_t reg, uint32_t val, uint32_t mask) +{ + uint32_t new_val; + + new_val = npu2_read_4b(p, reg); + new_val &= ~mask; + new_val |= val & mask; + npu2_scom_write(p->chip_id, p->xscom_base, reg, NPU2_MISC_DA_LEN_4B, + (uint64_t)new_val << 32); +} + +typedef struct { + const char *name; + uint32_t block; + uint32_t offset; +} npu2_scom_dump_t; + +static npu2_scom_dump_t npu2_scom_dump_global[] = { + /* CQ State Machine */ + { "CS.SM0.MISC.CERR_MESSAGE0", NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_MSG0 }, + { "CS.SM1.MISC.CERR_MESSAGE0", NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_MSG0 }, + { "CS.SM2.MISC.CERR_MESSAGE0", NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_MSG0 }, + { "CS.SM3.MISC.CERR_MESSAGE0", NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_MSG0 }, + + { "CS.SM0.MISC.CERR_MESSAGE1", NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_MSG1 }, + { "CS.SM1.MISC.CERR_MESSAGE1", NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_MSG1 }, + { "CS.SM2.MISC.CERR_MESSAGE1", NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_MSG1 }, + { "CS.SM3.MISC.CERR_MESSAGE1", NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_MSG1 }, + + { "CS.SM0.MISC.CERR_MESSAGE2", NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_MSG2 }, + { "CS.SM1.MISC.CERR_MESSAGE2", NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_MSG2 }, + { "CS.SM2.MISC.CERR_MESSAGE2", NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_MSG2 }, + { "CS.SM3.MISC.CERR_MESSAGE2", NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_MSG2 }, + + { "CS.SM0.MISC.CERR_MESSAGE3", NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_MSG3 }, + { "CS.SM1.MISC.CERR_MESSAGE3", NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_MSG3 }, + { 
"CS.SM2.MISC.CERR_MESSAGE3", NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_MSG3 }, + { "CS.SM3.MISC.CERR_MESSAGE3", NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_MSG3 }, + + { "CS.SM0.MISC.CERR_MESSAGE4", NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_MSG4 }, + { "CS.SM1.MISC.CERR_MESSAGE4", NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_MSG4 }, + { "CS.SM2.MISC.CERR_MESSAGE4", NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_MSG4 }, + { "CS.SM3.MISC.CERR_MESSAGE4", NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_MSG4 }, + + { "CS.SM0.MISC.CERR_MESSAGE5", NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_MSG5 }, + { "CS.SM1.MISC.CERR_MESSAGE5", NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_MSG5 }, + { "CS.SM2.MISC.CERR_MESSAGE5", NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_MSG5 }, + { "CS.SM3.MISC.CERR_MESSAGE5", NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_MSG5 }, + + { "CS.SM0.MISC.CERR_MESSAGE6", NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_MSG6 }, + { "CS.SM1.MISC.CERR_MESSAGE6", NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_MSG6 }, + { "CS.SM2.MISC.CERR_MESSAGE6", NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_MSG6 }, + { "CS.SM3.MISC.CERR_MESSAGE6", NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_MSG6 }, + + { "CS.SM0.MISC.CERR_FIRST0", NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_FIRST0 }, + { "CS.SM1.MISC.CERR_FIRST0", NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_FIRST0 }, + { "CS.SM2.MISC.CERR_FIRST0", NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_FIRST0 }, + { "CS.SM3.MISC.CERR_FIRST0", NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_FIRST0 }, + + { "CS.SM0.MISC.CERR_FIRST1", NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_FIRST1 }, + { "CS.SM1.MISC.CERR_FIRST1", NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_FIRST1 }, + { "CS.SM2.MISC.CERR_FIRST1", NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_FIRST1 }, + { "CS.SM3.MISC.CERR_FIRST1", NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_FIRST1 }, + + { "CS.SM0.MISC.CERR_FIRST2", NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_FIRST2 }, + { "CS.SM1.MISC.CERR_FIRST2", NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_FIRST2 }, + { "CS.SM2.MISC.CERR_FIRST2", NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_FIRST2 }, + { "CS.SM3.MISC.CERR_FIRST2", NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_FIRST2 }, + + /* CQ Control */ + { "CS.CTL.MISC.CERR_MESSAGE0", NPU2_BLOCK_CTL, NPU2_CQ_C_ERR_RPT_MSG0 }, + { "CS.CTL.MISC.CERR_MESSAGE1", NPU2_BLOCK_CTL, NPU2_CQ_C_ERR_RPT_MSG1 }, + { "CS.CTL.MISC.CERR_FIRST0", NPU2_BLOCK_CTL, NPU2_CQ_C_ERR_RPT_FIRST0 }, + { "CS.CTL.MISC.CERR_FIRST1", NPU2_BLOCK_CTL, NPU2_CQ_C_ERR_RPT_FIRST1 }, + + /* CQ Data */ + { "DAT.MISC.CERR_ECC_HOLD", NPU2_BLOCK_DAT, NPU2_CQ_DAT_ECC_STATUS }, + { "DAT.MISC.CERR_ECC_MASK", NPU2_BLOCK_DAT, NPU2_CQ_DAT_ECC_MASK }, + { "DAT.MISC.CERR_ECC_FIRST", NPU2_BLOCK_DAT, NPU2_CQ_DAT_ECC_FIRST }, + { "DAT.MISC.REM0", NPU2_BLOCK_DAT, NPU2_CQ_DAT_RAS_MSG0 }, + { "DAT.MISC.REM1", NPU2_BLOCK_DAT, NPU2_CQ_DAT_RAS_MSG1 }, +}; + +static npu2_scom_dump_t npu2_scom_dump_nvlink[] = { + { "NTL0.REGS.CERR_FIRST1", NPU2_BLOCK_NTL0, NPU2_NTL_ERR_FIRST1_OFF }, + { "NTL1.REGS.CERR_FIRST1", NPU2_BLOCK_NTL1, NPU2_NTL_ERR_FIRST1_OFF }, + { "NTL0.REGS.CERR_FIRST2", NPU2_BLOCK_NTL0, NPU2_NTL_ERR_FIRST2_OFF }, + { "NTL1.REGS.CERR_FIRST2", NPU2_BLOCK_NTL1, NPU2_NTL_ERR_FIRST2_OFF }, +}; + +static npu2_scom_dump_t npu2_scom_dump_ocapi[] = { + { "OTL0.MISC.C_ERR_RPT_HOLD0", NPU2_BLOCK_OTL0, NPU2_OTL_ERR_RPT_HOLD0 }, + { "OTL1.MISC.C_ERR_RPT_HOLD0", NPU2_BLOCK_OTL1, NPU2_OTL_ERR_RPT_HOLD0 }, + { "OTL0.MISC.OTL_REM0", NPU2_BLOCK_OTL0, NPU2_OTL_RAS_ERR_MSG0 }, + { "OTL1.MISC.OTL_REM0", NPU2_BLOCK_OTL1, NPU2_OTL_RAS_ERR_MSG0 }, + { "OTL0.MISC.ERROR_SIG_RXI", NPU2_BLOCK_OTL0, NPU2_OTL_RXI_ERR_SIG }, + { "OTL1.MISC.ERROR_SIG_RXI", NPU2_BLOCK_OTL1, NPU2_OTL_RXI_ERR_SIG }, + { "OTL0.MISC.ERROR_SIG_RXO", NPU2_BLOCK_OTL0, NPU2_OTL_RXO_ERR_SIG }, + { "OTL1.MISC.ERROR_SIG_RXO", NPU2_BLOCK_OTL1, 
NPU2_OTL_RXO_ERR_SIG }, + { "OTL0.MISC.C_ERR_RPT_HOLD1", NPU2_BLOCK_OTL0, NPU2_OTL_ERR_RPT_HOLD1 }, + { "OTL1.MISC.C_ERR_RPT_HOLD1", NPU2_BLOCK_OTL1, NPU2_OTL_ERR_RPT_HOLD1 }, +}; + +static void print_one_npu_reg(struct npu2 *npu, npu2_scom_dump_t *scom, int stack) +{ + uint64_t reg, val; + + reg = NPU2_REG_OFFSET(stack, scom->block, scom->offset); + val = npu2_scom_read(npu->chip_id, npu->xscom_base, + reg, NPU2_MISC_DA_LEN_8B); + + prlog(PR_ERR, "NPU[%d] STCK%d.%s 0x%llx = 0x%016llx\n", + npu->chip_id, stack - 4, scom->name, reg, val); +} + +/* same as above, but for direct access registers */ +static void print_one_reg(int chip_id, int brick_index, + uint64_t reg_addr, const char *reg_name) +{ + uint64_t val; + + xscom_read(chip_id, reg_addr, &val); + prlog(PR_ERR, "NPU[%d] %s brick %d 0x%llx = 0x%016llx\n", + chip_id, reg_name, brick_index, reg_addr, val); +} + +static void show_nvlink_regs(struct npu2 *npu, int brick_index) +{ + uint32_t stack, ntl; + int i; + + stack = NPU2_STACK_STCK_0 + brick_index / 2; + ntl = NPU2_BLOCK_NTL0 + (brick_index % 2) * 2; + + for (i = 0; i < ARRAY_SIZE(npu2_scom_dump_nvlink); i++) { + if (npu2_scom_dump_nvlink[i].block == ntl) + print_one_npu_reg(npu, &npu2_scom_dump_nvlink[i], stack); + } +} + +static void show_opencapi_regs(struct npu2 *npu, int brick_index) +{ + uint32_t stack, otl; + int i; + + stack = NPU2_STACK_STCK_0 + brick_index / 2; + otl = NPU2_BLOCK_OTL0 + (brick_index % 2); + + /* NPU registers */ + for (i = 0; i < ARRAY_SIZE(npu2_scom_dump_ocapi); i++) { + if (npu2_scom_dump_ocapi[i].block == otl) + print_one_npu_reg(npu, &npu2_scom_dump_ocapi[i], stack); + } + + /* Fabric registers */ + print_one_reg(npu->chip_id, brick_index, + OB_ODL_STATUS(brick_index), "ODL status"); + print_one_reg(npu->chip_id, brick_index, + OB_ODL_TRAINING_STATUS(brick_index), "ODL training status"); + print_one_reg(npu->chip_id, brick_index, + OB_ODL_ENDPOINT_INFO(brick_index), "ODL endpoint info"); +} + +static void show_all_regs(struct npu2 *npu, int brick_index) +{ + int i, stack, stack_min, stack_max; + uint64_t fir_val, mask_val, fir_addr, mask_addr; + struct npu2_dev *dev; + npu2_scom_dump_t scom_reg; + + if (brick_index != -1) { + stack_min = stack_max = NPU2_STACK_STCK_0 + brick_index / 2; + } else { + stack_min = NPU2_STACK_STCK_0; + stack_max = NPU2_STACK_STCK_2; + /* Avoid dumping unused stacks for opencapi on Lagrange */ + if (npu->total_devices == 2) + stack_min = stack_max = NPU2_STACK_STCK_1; + } + + /* NPU FIRs */ + for (i = 0; i < NPU2_TOTAL_FIR_REGISTERS; i++) { + fir_addr = NPU2_FIR_REGISTER_0 + i * NPU2_FIR_OFFSET; + mask_addr = fir_addr + NPU2_FIR_MASK_OFFSET; + xscom_read(npu->chip_id, fir_addr, &fir_val); + xscom_read(npu->chip_id, mask_addr, &mask_val); + prlog(PR_ERR, "NPU[%d] FIR%d = 0x%016llx (mask 0x%016llx => 0x%016llx)\n", + npu->chip_id, i, fir_val, mask_val, fir_val & ~mask_val); + } + + /* NPU global, per-stack registers */ + for (i = 0; i < ARRAY_SIZE(npu2_scom_dump_global); i++) { + for (stack = stack_min; stack <= stack_max; stack++) + print_one_npu_reg(npu, &npu2_scom_dump_global[i], stack); + } + + /* + * NPU global registers, stack independent + * We have only one for now, so dump it directly + */ + scom_reg.name = "XTS.REG.ERR_HOLD"; + scom_reg.block = NPU2_BLOCK_XTS; + scom_reg.offset = 0; + print_one_npu_reg(npu, &scom_reg, NPU2_STACK_MISC); + + /* nvlink- or opencapi-specific registers */ + for (i = 0; i < npu->total_devices; i++) { + dev = &npu->devices[i]; + if (brick_index == -1 || dev->brick_index == brick_index) { 
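+			/* Dump the per-brick link registers matching the
+			 * detected device type: NTL registers for NVLink
+			 * bricks, OTL/ODL registers for OpenCAPI bricks.
+			 */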
+ if (dev->type == NPU2_DEV_TYPE_NVLINK) + show_nvlink_regs(npu, dev->brick_index); + else if (dev->type == NPU2_DEV_TYPE_OPENCAPI) + show_opencapi_regs(npu, dev->brick_index); + } + } +} + +void npu2_dump_scoms(int chip_id) +{ + struct npu2 *npu; + struct phb *phb; + struct npu2_dev *dev; + + /* + * Look for the npu2 structure for that chip ID. We can access it + * through the array of phbs, looking for a nvlink or opencapi + * phb. We can have several entries, but they all point + * to the same npu2 structure + */ + for_each_phb(phb) { + npu = NULL; + if (phb->phb_type == phb_type_npu_v2) { + npu = phb_to_npu2_nvlink(phb); + } else if (phb->phb_type == phb_type_npu_v2_opencapi) { + dev = phb_to_npu2_dev_ocapi(phb); + npu = dev->npu; + } + if (npu && npu->chip_id == chip_id) { + show_all_regs(npu, -1 /* all bricks */); + break; + } + } +} + +static uint64_t npu2_ipi_attributes(struct irq_source *is __unused, uint32_t isn __unused) +{ + struct npu2 *p = is->data; + uint32_t idx = isn - p->base_lsi; + + if ((idx == 18) || (idx >= 27 && idx <= 34)) + /* + * level 18: TCE Interrupt - used to detect a frozen PE (nvlink) + * level 27-30: OTL interrupt (opencapi) + * level 31-34: XSL interrupt (opencapi) + */ + return IRQ_ATTR_TARGET_OPAL | IRQ_ATTR_TARGET_RARE | IRQ_ATTR_TYPE_MSI; + else + return IRQ_ATTR_TARGET_LINUX; +} + +static char *npu2_ipi_name(struct irq_source *is, uint32_t isn) +{ + struct npu2 *p = is->data; + uint32_t idx = isn - p->base_lsi; + const char *name; + + switch (idx) { + case 0: name = "NDL 0 Stall Event (brick 0)"; break; + case 1: name = "NDL 0 No-Stall Event (brick 0)"; break; + case 2: name = "NDL 1 Stall Event (brick 1)"; break; + case 3: name = "NDL 1 No-Stall Event (brick 1)"; break; + case 4: name = "NDL 2 Stall Event (brick 2)"; break; + case 5: name = "NDL 2 No-Stall Event (brick 2)"; break; + case 6: name = "NDL 5 Stall Event (brick 3)"; break; + case 7: name = "NDL 5 No-Stall Event (brick 3)"; break; + case 8: name = "NDL 4 Stall Event (brick 4)"; break; + case 9: name = "NDL 4 No-Stall Event (brick 4)"; break; + case 10: name = "NDL 3 Stall Event (brick 5)"; break; + case 11: name = "NDL 3 No-Stall Event (brick 5)"; break; + case 12: name = "NTL 0 Event"; break; + case 13: name = "NTL 1 Event"; break; + case 14: name = "NTL 2 Event"; break; + case 15: name = "NTL 3 Event"; break; + case 16: name = "NTL 4 Event"; break; + case 17: name = "NTL 5 Event"; break; + case 18: name = "TCE Event"; break; + case 19: name = "ATS Event"; break; + case 20: name = "CQ Event"; break; + case 21: name = "MISC Event"; break; + case 22: name = "NMMU Local Xstop"; break; + case 23: name = "Translate Fail (brick 2)"; break; + case 24: name = "Translate Fail (brick 3)"; break; + case 25: name = "Translate Fail (brick 4)"; break; + case 26: name = "Translate Fail (brick 5)"; break; + case 27: name = "OTL Event (brick 2)"; break; + case 28: name = "OTL Event (brick 3)"; break; + case 29: name = "OTL Event (brick 4)"; break; + case 30: name = "OTL Event (brick 5)"; break; + case 31: name = "XSL Event (brick 2)"; break; + case 32: name = "XSL Event (brick 3)"; break; + case 33: name = "XSL Event (brick 4)"; break; + case 34: name = "XSL Event (brick 5)"; break; + default: name = "Unknown"; + } + return strdup(name); +} + +static void npu2_err_interrupt(struct irq_source *is, uint32_t isn) +{ + struct npu2 *p = is->data; + uint32_t idx = isn - p->base_lsi; + char *irq_name; + int brick; + + switch (idx) { + case 18: + opal_update_pending_evt(OPAL_EVENT_PCI_ERROR, + 
OPAL_EVENT_PCI_ERROR); + break; + case 27 ... 34: + /* opencapi only */ + brick = 2 + ((idx - 27) % 4); + irq_name = npu2_ipi_name(is, isn); + prlog(PR_ERR, "NPU[%d] received error interrupt '%s'\n", + p->chip_id, irq_name); + free(irq_name); + show_all_regs(p, brick); + /* + * P9 NPU doesn't support recovering a link going down + * unexpectedly. So we mark the device as broken and + * report it to the OS, so that the error is logged + * and the drivers notified. + */ + npu2_opencapi_set_broken(p, brick); + opal_update_pending_evt(OPAL_EVENT_PCI_ERROR, + OPAL_EVENT_PCI_ERROR); + break; + default: + prerror("OPAL received unknown NPU2 interrupt %d\n", idx); + return; + } +} + +static const struct irq_source_ops npu2_ipi_ops = { + .interrupt = npu2_err_interrupt, + .attributes = npu2_ipi_attributes, + .name = npu2_ipi_name, +}; + +static void setup_irqs(struct npu2 *p) +{ + uint64_t reg, val; + void *tp; + + p->base_lsi = xive_alloc_ipi_irqs(p->chip_id, NPU2_N_DL_IRQS, NPU2_N_DL_IRQS_ALIGN); + if (p->base_lsi == XIVE_IRQ_ERROR) { + prlog(PR_ERR, "NPU: Failed to allocate interrupt sources\n"); + return; + } + xive_register_ipi_source(p->base_lsi, NPU2_N_DL_IRQS, p, &npu2_ipi_ops); + + /* Set IPI configuration */ + reg = NPU2_REG_OFFSET(NPU2_STACK_MISC, NPU2_BLOCK_MISC, NPU2_MISC_CFG); + val = npu2_read(p, reg); + val = SETFIELD(NPU2_MISC_CFG_IPI_PS, val, NPU2_MISC_CFG_IPI_PS_64K); + val = SETFIELD(NPU2_MISC_CFG_IPI_OS, val, NPU2_MISC_CFG_IPI_OS_AIX); + npu2_write(p, reg, val); + + /* Set IRQ base */ + reg = NPU2_REG_OFFSET(NPU2_STACK_MISC, NPU2_BLOCK_MISC, NPU2_MISC_IRQ_BASE); + tp = xive_get_trigger_port(p->base_lsi); + val = ((uint64_t)tp) << NPU2_IRQ_BASE_SHIFT; + npu2_write(p, reg, val); +} + +static bool _i2c_presence_detect(struct npu2_dev *dev) +{ + uint8_t state, data; + int rc; + + rc = i2c_request_send(dev->npu->i2c_port_id_ocapi, + platform.ocapi->i2c_presence_addr, + SMBUS_READ, 0, 1, + &state, 1, 120); + if (rc) { + OCAPIERR(dev, "error detecting link presence: %d\n", rc); + return true; /* assume link exists */ + } + + OCAPIDBG(dev, "I2C presence detect: 0x%x\n", state); + + switch (dev->link_index) { + case 2: + data = platform.ocapi->i2c_presence_brick2; + break; + case 3: + data = platform.ocapi->i2c_presence_brick3; + break; + case 4: + data = platform.ocapi->i2c_presence_brick4; + break; + case 5: + data = platform.ocapi->i2c_presence_brick5; + break; + default: + OCAPIERR(dev, "presence detection on invalid link\n"); + return true; + } + /* Presence detect bits are active low */ + return !(state & data); +} + +/* + * A default presence detection implementation for platforms like ZZ and Zaius + * that don't implement their own. Assumes all devices found will be OpenCAPI. 
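+ * Note that _i2c_presence_detect() treats an I2C error as "link
+ * present", so a failing presence circuit errs on the side of
+ * configuring the brick.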
+ */ +void npu2_i2c_presence_detect(struct npu2 *npu) +{ + struct npu2_dev *dev; + assert(platform.ocapi); + for (int i = 0; i < npu->total_devices; i++) { + dev = &npu->devices[i]; + if (_i2c_presence_detect(dev)) + dev->type = NPU2_DEV_TYPE_OPENCAPI; + else + dev->type = NPU2_DEV_TYPE_UNKNOWN; + } +} + +static struct npu2 *setup_npu(struct dt_node *dn) +{ + struct npu2 *npu; + struct npu2_dev *dev; + struct dt_node *np; + uint32_t num_links; + char port_name[17]; + void *npumem; + char *path; + int gcid; + struct proc_chip *chip; + int i = 0; + + /* Retrieve chip ID */ + path = dt_get_path(dn); + gcid = dt_get_chip_id(dn); + chip = get_chip(gcid); + assert(chip); + + num_links = dt_prop_get_u32(dn, "ibm,npu-links"); + npumem = zalloc(sizeof(struct npu2) + num_links * + sizeof(struct npu2_dev)); + assert(npumem); + npu = npumem; + + npu->dt_node = dn; + npu->index = dt_prop_get_u32(dn, "ibm,npu-index"); + npu->chip_id = gcid; + npu->xscom_base = dt_get_address(dn, 0, NULL); + + init_lock(&npu->i2c_lock); + npu->i2c_pin_mode = ~0; // input mode by default + npu->i2c_pin_wr_state = ~0; // reset is active low + if (platform.ocapi) { + /* Find I2C port for handling device presence/reset */ + snprintf(port_name, sizeof(port_name), "p8_%08x_e%dp%d", + gcid, platform.ocapi->i2c_engine, + platform.ocapi->i2c_port); + prlog(PR_DEBUG, "NPU: Looking for I2C port %s\n", port_name); + + dt_for_each_compatible(dt_root, np, "ibm,power9-i2c-port") { + if (streq(port_name, dt_prop_get(np, "ibm,port-name"))) { + npu->i2c_port_id_ocapi = dt_prop_get_u32(np, "ibm,opal-id"); + break; + } + } + + if (!npu->i2c_port_id_ocapi) { + prlog(PR_ERR, "NPU: Couldn't find I2C port %s\n", + port_name); + goto failed; + } + } + + npu->devices = npumem + sizeof(struct npu2); + + dt_for_each_compatible(dn, np, "ibm,npu-link") { + assert(i < num_links); + dev = &npu->devices[i]; + dev->link_index = dt_prop_get_u32(np, "ibm,npu-link-index"); + /* May be overridden by platform presence detection */ + dev->brick_index = dev->link_index; + /* Will be overridden by presence detection */ + dev->type = NPU2_DEV_TYPE_UNKNOWN; + dev->npu = npu; + dev->dt_node = np; + dev->pl_xscom_base = dt_prop_get_u64(np, "ibm,npu-phy"); + dev->lane_mask = dt_prop_get_u32(np, "ibm,npu-lane-mask"); + dev->link_speed = dt_prop_get_u64(np, "ibm,link-speed"); + i++; + }; + npu->total_devices = i; + + prlog(PR_INFO, "NPU: Chip %d Found NPU2#%d (%d links) at %s\n", + npu->chip_id, npu->index, npu->total_devices, path); + prlog(PR_INFO, " SCOM Base: %08llx\n", npu->xscom_base); + free(path); + return npu; + +failed: + prlog(PR_ERR, "NPU: Chip %d NPU setup failed\n", gcid); + free(path); + free(npu); + return NULL; +} + +static void setup_devices(struct npu2 *npu) +{ + bool nvlink_detected = false, ocapi_detected = false; + struct npu2_dev *dev; + + /* + * TODO: In future, we'll do brick configuration here to support mixed + * setups. 
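+	 * For now a chip must be all-NVLink or all-OpenCAPI; the
+	 * check further down aborts NPU init if both device types
+	 * are detected on the same chip.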
+ */ + for (int i = 0; i < npu->total_devices; i++) { + dev = &npu->devices[i]; + switch (dev->type) { + case NPU2_DEV_TYPE_NVLINK: + nvlink_detected = true; + dt_add_property_strings(dev->dt_node, + "ibm,npu-link-type", + "nvlink"); + break; + case NPU2_DEV_TYPE_OPENCAPI: + ocapi_detected = true; + dt_add_property_strings(dev->dt_node, + "ibm,npu-link-type", + "opencapi"); + break; + default: + prlog(PR_INFO, "NPU: Link %d device not present\n", + npu->devices[i].link_index); + dt_add_property_strings(dev->dt_node, + "ibm,npu-link-type", + "unknown"); + } + } + + if (nvlink_detected && ocapi_detected) { + prlog(PR_ERR, "NPU: NVLink and OpenCAPI devices on same chip not supported, aborting NPU init\n"); + return; + } + + setup_irqs(npu); + + if (nvlink_detected) + npu2_nvlink_init_npu(npu); + else if (ocapi_detected) + npu2_opencapi_init_npu(npu); +} + +void probe_npu2(void) +{ + struct proc_chip *chip = next_chip(NULL); + struct npu2 *npu; + struct dt_node *np; + const char *zcal; + + /* npu2 only */ + if (!dt_find_compatible_node(dt_root, NULL, "ibm,power9-npu")) + return; + + /* Abort if we're running on POWER9C DD1 (P9N DD1 is not supported) */ + if (chip && + chip->type == PROC_CHIP_P9_CUMULUS && + (chip->ec_level & 0xf0) == 0x10) { + prlog(PR_INFO, "NPU2: DD1 not supported\n"); + return; + } + + /* Check for a zcal override */ + zcal = nvram_query_dangerous("nv_zcal_override"); + if (zcal) { + nv_zcal_nominal = atoi(zcal); + prlog(PR_WARNING, "NPU2: Using ZCAL impedance override = %d\n", nv_zcal_nominal); + } + + if (!platform.npu2_device_detect) { + prlog(PR_INFO, "NPU: Platform does not support NPU\n"); + return; + } + + dt_for_each_compatible(dt_root, np, "ibm,power9-npu") { + npu = setup_npu(np); + if (!npu) + continue; + platform.npu2_device_detect(npu); + setup_devices(npu); + } +} diff --git a/roms/skiboot/hw/npu2-hw-procedures.c b/roms/skiboot/hw/npu2-hw-procedures.c new file mode 100644 index 000000000..fb88dfdf6 --- /dev/null +++ b/roms/skiboot/hw/npu2-hw-procedures.c @@ -0,0 +1,1079 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* + * NPU2 (POWER9) Hardware Procedures + * + * Copyright 2013-2019 IBM Corp. + */ + +#include <skiboot.h> +#include <io.h> +#include <timebase.h> +#include <pci.h> +#include <pci-virt.h> +#include <interrupts.h> +#include <npu2.h> +#include <npu2-regs.h> +#include <xscom.h> + +/* Set in npu2.c if there is an nvram override for the zcal settings on this + * machine */ +int nv_zcal_nominal = -1; + +/* PHY Registers. The documentation for the PHY training is written in + * terms of bits within an actual register so we use that + * representation here. 
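+ * Each entry gives the register offset plus the starting bit and
+ * field length; phy_read_lane()/phy_write_lane() turn that into a
+ * PPC_BITMASK(start, start + len - 1) when accessing the SCOM.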
*/ +struct npu2_phy_reg { + uint64_t offset; + uint64_t start; + uint64_t len; +}; + +/* + * Currently unused, but documented here: +static struct npu2_phy_reg NPU2_PHY_RX_DATA_DAC_SPARE_MODE = {0x000, 63, 64}; +static struct npu2_phy_reg NPU2_PHY_RX_DAC_CNTL6 = {0x00c, 63, 64}; +static struct npu2_phy_reg NPU2_PHY_RX_DAC_CNTL5 = {0x028, 63, 64}; +static struct npu2_phy_reg NPU2_PHY_RX_DAC_CNTL9 = {0x030, 63, 64}; +static struct npu2_phy_reg NPU2_PHY_RX_DAC_CNTL5_EO = {0x00a, 63, 64}; +static struct npu2_phy_reg NPU2_PHY_RX_DAC_CNTL4 = {0x026, 63, 64}; +*/ +static struct npu2_phy_reg NPU2_PHY_RX_RUN_LANE = {0x0c8, 48, 1}; +static struct npu2_phy_reg NPU2_PHY_RX_IORESET = {0x096, 63, 1}; +static struct npu2_phy_reg NPU2_PHY_TX_IORESET = {0x113, 48, 1}; +static struct npu2_phy_reg NPU2_PHY_RX_PR_RESET = {0x096, 62, 1}; +static struct npu2_phy_reg NPU2_PHY_RX_LANE_ANA_PDWN = {0x002, 54, 1}; +static struct npu2_phy_reg NPU2_PHY_RX_LANE_DIG_PDWN = {0x088, 48, 1}; +static struct npu2_phy_reg NPU2_PHY_RX_PR_IQ_RES_SEL = {0x004, 59, 3}; +static struct npu2_phy_reg NPU2_PHY_RX_PR_PHASE_STEP = {0x08a, 60, 4}; +static struct npu2_phy_reg NPU2_PHY_TX_LANE_PDWN = {0x101, 48, 1}; +static struct npu2_phy_reg NPU2_PHY_RX_RUN_DCCAL = {0x0c8, 49, 1}; +static struct npu2_phy_reg NPU2_PHY_RX_DCCAL_DONE = {0x0ca, 49, 1}; +static struct npu2_phy_reg NPU2_PHY_RX_LANE_BUSY = {0x0ca, 50, 1}; +static struct npu2_phy_reg NPU2_PHY_RX_B_BANK_CONTROLS = {0x002, 58, 6}; +static struct npu2_phy_reg NPU2_PHY_TX_UNLOAD_CLK_DISABLE = {0x103, 56, 1}; +static struct npu2_phy_reg NPU2_PHY_TX_FIFO_INIT = {0x105, 53, 1}; +static struct npu2_phy_reg NPU2_PHY_TX_RXCAL = {0x103, 57, 1}; +static struct npu2_phy_reg NPU2_PHY_RX_INIT_DONE = {0x0ca, 48, 1}; +static struct npu2_phy_reg NPU2_PHY_RX_PR_EDGE_TRACK_CNTL = {0x092, 48, 2}; +static struct npu2_phy_reg NPU2_PHY_RX_PR_BUMP_SL_1UI = {0x092, 57, 1}; +static struct npu2_phy_reg NPU2_PHY_RX_PR_FW_OFF = {0x08a, 56, 1}; +static struct npu2_phy_reg NPU2_PHY_RX_PR_FW_INERTIA_AMT = {0x08a, 57, 3}; +static struct npu2_phy_reg NPU2_PHY_RX_CFG_LTE_MC = {0x000, 60, 4}; +static struct npu2_phy_reg NPU2_PHY_RX_A_INTEG_COARSE_GAIN = {0x00a, 48, 4}; +static struct npu2_phy_reg NPU2_PHY_RX_A_CTLE_COARSE = {0x00c, 48, 5}; +static struct npu2_phy_reg NPU2_PHY_RX_A_CTLE_GAIN = {0x00c, 53, 4}; +static struct npu2_phy_reg NPU2_PHY_RX_B_INTEG_COARSE_GAIN = {0x026, 48, 4}; +static struct npu2_phy_reg NPU2_PHY_RX_B_CTLE_COARSE = {0x028, 48, 5}; +static struct npu2_phy_reg NPU2_PHY_RX_B_CTLE_GAIN = {0x028, 53, 4}; +static struct npu2_phy_reg NPU2_PHY_RX_E_INTEG_COARSE_GAIN = {0x030, 48, 4}; +static struct npu2_phy_reg NPU2_PHY_RX_E_CTLE_COARSE = {0x032, 48, 5}; +static struct npu2_phy_reg NPU2_PHY_RX_E_CTLE_GAIN = {0x032, 53, 4}; + +/* These registers are per-PHY, not per lane */ +static struct npu2_phy_reg NPU2_PHY_RX_SPEED_SELECT = {0x262, 51, 2}; +static struct npu2_phy_reg NPU2_PHY_RX_AC_COUPLED = {0x262, 53, 1}; +static struct npu2_phy_reg NPU2_PHY_TX_ZCAL_SWO_EN = {0x3c9, 48, 1}; +static struct npu2_phy_reg NPU2_PHY_TX_ZCAL_REQ = {0x3c1, 49, 1}; +static struct npu2_phy_reg NPU2_PHY_TX_ZCAL_DONE = {0x3c1, 50, 1}; +static struct npu2_phy_reg NPU2_PHY_TX_ZCAL_ERROR = {0x3c1, 51, 1}; +static struct npu2_phy_reg NPU2_PHY_TX_ZCAL_N = {0x3c3, 48, 9}; +static struct npu2_phy_reg NPU2_PHY_TX_ZCAL_P = {0x3c5, 48, 9}; +static struct npu2_phy_reg NPU2_PHY_TX_FFE_BOOST_EN = {0x34b, 59, 1}; +static struct npu2_phy_reg NPU2_PHY_TX_PSEG_PRE_EN = {0x34d, 51, 5}; +static struct npu2_phy_reg 
NPU2_PHY_TX_PSEG_PRE_SELECT = {0x34d, 56, 5}; +static struct npu2_phy_reg NPU2_PHY_TX_NSEG_PRE_EN = {0x34f, 51, 5}; +static struct npu2_phy_reg NPU2_PHY_TX_NSEG_PRE_SELECT = {0x34f, 56, 5}; +static struct npu2_phy_reg NPU2_PHY_TX_PSEG_POST_EN = {0x361, 49, 7}; +static struct npu2_phy_reg NPU2_PHY_TX_PSEG_POST_SELECT = {0x361, 56, 7}; +static struct npu2_phy_reg NPU2_PHY_TX_NSEG_POST_EN = {0x363, 49, 7}; +static struct npu2_phy_reg NPU2_PHY_TX_NSEG_POST_SELECT = {0x363, 56, 7}; +static struct npu2_phy_reg NPU2_PHY_TX_PSEG_MARGINPU_EN = {0x351, 48, 8}; +static struct npu2_phy_reg NPU2_PHY_TX_NSEG_MARGINPU_EN = {0x353, 48, 8}; +static struct npu2_phy_reg NPU2_PHY_TX_PSEG_MARGINPD_EN = {0x351, 56, 8}; +static struct npu2_phy_reg NPU2_PHY_TX_NSEG_MARGINPD_EN = {0x353, 56, 8}; +static struct npu2_phy_reg NPU2_PHY_TX_MARGINPU_SELECT = {0x355, 48, 8}; +static struct npu2_phy_reg NPU2_PHY_TX_MARGINPD_SELECT = {0x355, 56, 8}; +static struct npu2_phy_reg NPU2_PHY_TX_PSEG_MAIN_EN = {0x357, 51, 7}; +static struct npu2_phy_reg NPU2_PHY_TX_NSEG_MAIN_EN = {0x359, 51, 7}; +/* Currently unused, but documented here +static struct npu2_phy_reg NPU2_PHY_RX_HIST_MIN_EYE_WIDTH = {0x24e, 54, 8}; +static struct npu2_phy_reg NPU2_PHY_RX_HIST_MIN_EYE_WIDTH_LANE = {0x24e, 49, 5}; +static struct npu2_phy_reg NPU2_PHY_RX_HIST_MIN_EYE_WIDTH_VALID= {0x24e, 48, 1}; +*/ +static struct npu2_phy_reg NPU2_PHY_RX_RC_ENABLE_AUTO_RECAL = {0x25c, 51, 1}; + +static struct npu2_phy_reg NPU2_PHY_RX_CLKDIST_PDWN = {0x204, 48, 3}; +static struct npu2_phy_reg NPU2_PHY_RX_IREF_PDWN = {0x230, 54, 1}; +static struct npu2_phy_reg NPU2_PHY_TX_CLKDIST_PDWN = {0x305, 48, 3}; +static struct npu2_phy_reg NPU2_PHY_RX_CTL_DATASM_CLKDIST_PDWN = {0x2e0, 60, 1}; +static struct npu2_phy_reg NPU2_PHY_TX_DRV_DATA_PATTERN_GCRMSG = {0x309, 50, 4}; + +#define NPU2_PHY_REG(scom_base, reg, lane) \ + SETFIELD(PPC_BITMASK(27, 31), ((reg)->offset << 42) | scom_base, lane) + +#define NPU2_MAX_PHY_LANE 23 + +/* This is a bit of a gross hack but it does the job */ +#define FOR_EACH_LANE(ndev, lane) \ + for (lane = 0; lane <= NPU2_MAX_PHY_LANE; lane++) \ + if (!(ndev->lane_mask & (1 << (NPU2_MAX_PHY_LANE - lane)))) \ + continue; \ + else + +typedef uint32_t (*step)(struct npu2_dev *); + +struct procedure { + const char *name; + step steps[]; +}; + +#define DEFINE_PROCEDURE(NAME, STEPS...) 
\ + static struct procedure procedure_##NAME = \ + {.name = #NAME, .steps = {NAME, ##STEPS}} + +#define PROCEDURE_INPROGRESS (1 << 31) +#define PROCEDURE_COMPLETE (1 << 30) +#define PROCEDURE_NEXT (1 << 29) +#define PROCEDURE_FAILED 2 +#define PROCEDURE_ABORTED 3 +#define PROCEDURE_UNSUPPORTED 4 + +/* Mask defining which status bits we want to expose */ +#define PROCEDURE_STATUS_MASK 0xc000000f + +static void phy_write_lane(struct npu2_dev *ndev, struct npu2_phy_reg *reg, int lane, uint64_t val) +{ + uint64_t old_val, reg_addr; + int rc; + uint64_t mask = PPC_BITMASK(reg->start, reg->start + reg->len - 1); + + /* Check to make sure we're not trying to specify a lane to a + * non-per-lane register */ + if (lane >= 0) + assert(reg->offset < 0x200); + else + assert(reg->offset >= 0x200); + + reg_addr = NPU2_PHY_REG(ndev->pl_xscom_base, reg, lane); + rc = xscom_read(ndev->npu->chip_id, reg_addr, &old_val); + if (rc) + NPU2DEVERR(ndev, "error %d reading scom 0x%llx\n", rc, reg_addr); + val = SETFIELD(mask, old_val, val); + rc = xscom_write(ndev->npu->chip_id, reg_addr, val); + if (rc) + NPU2DEVERR(ndev, "error %d writing scom 0x%llx\n", rc, reg_addr); +} + +static uint64_t phy_read_lane(struct npu2_dev *ndev, struct npu2_phy_reg *reg, int lane) +{ + uint64_t val, reg_addr; + int rc; + uint64_t mask = PPC_BITMASK(reg->start, reg->start + reg->len - 1); + + /* Check to make sure we're not trying to specify a lane to a + * non-per-lane register */ + if (lane >= 0) + assert(reg->offset < 0x200); + else + assert(reg->offset >= 0x200); + + reg_addr = NPU2_PHY_REG(ndev->pl_xscom_base, reg, lane); + rc = xscom_read(ndev->npu->chip_id, reg_addr, &val); + if (rc) + NPU2DEVERR(ndev, "error %d reading scom 0x%llx\n", rc, reg_addr); + + return GETFIELD(mask, val); +} + +#define phy_write(ndev, reg, val) phy_write_lane(ndev, reg, -1, val) +#define phy_read(ndev, reg) phy_read_lane(ndev, reg, -1) + +static uint32_t stop(struct npu2_dev *npu_dev __unused) +{ + return PROCEDURE_COMPLETE | PROCEDURE_ABORTED; +} +DEFINE_PROCEDURE(stop); + +static uint32_t nop(struct npu2_dev *npu_dev __unused) +{ + return PROCEDURE_COMPLETE; +} +DEFINE_PROCEDURE(nop); + +/* + * Return the obus (0 or 1) of a device + * + * Using the brick index is dangerous, because it varies for a link + * depending on the mode (opencapi or nvlink) + */ +static int obus_index(struct npu2_dev *ndev) +{ + if ((ndev->pl_xscom_base & 0x3F000000) == 0x09000000) + return 0; + else + return 1; +} + +/* + * Return the brick number (0-2) within an obus chiplet. + * Only valid for nvlink devices + */ +static int obus_brick_index(struct npu2_dev *ndev) +{ + int index = ndev->brick_index % 3; + + assert(ndev->type != NPU2_DEV_TYPE_OPENCAPI); + /* On the second obus chiplet, index is reversed */ + if ((ndev->pl_xscom_base & 0x3F000000) != 0x09000000) + return 2 - index; + + return index; +} + +static void set_iovalid(struct npu2_dev *ndev, bool raise) +{ + uint64_t addr, val, mask; + int rc; + + if (ndev->type == NPU2_DEV_TYPE_OPENCAPI) + return; + + addr = (ndev->pl_xscom_base & 0x3F000000) | 0x9; + mask = PPC_BIT(6 + obus_brick_index(ndev)); + val = raise ? 
mask : 0; + + rc = xscom_write_mask(ndev->npu->chip_id, addr, val, mask); + if (rc) + NPU2DEVERR(ndev, "error %d writing scom 0x%llx\n", rc, addr); +} + +static bool poll_fence_status(struct npu2_dev *ndev, uint64_t val) +{ + uint64_t fs; + int i; + + for (i = 0; i < 4096; i++) { + fs = npu2_read(ndev->npu, NPU2_NTL_CQ_FENCE_STATUS(ndev)); + if ((fs & 0xc000000000000000UL) == val) + return true; + } + + NPU2DEVERR(ndev, "NPU2_NTL_CQ_FENCE_STATUS timeout (0x%llx)\n", val); + return false; +} + +/* Procedure 1.2.1 - Reset NPU/NDL */ +uint32_t reset_ntl(struct npu2_dev *ndev) +{ + uint64_t val, check; + int lane, i; + + set_iovalid(ndev, true); + + /* Power on clocks */ + phy_write(ndev, &NPU2_PHY_RX_CLKDIST_PDWN, 0); + phy_write(ndev, &NPU2_PHY_RX_IREF_PDWN, 1); + phy_write(ndev, &NPU2_PHY_TX_CLKDIST_PDWN, 0); + phy_write(ndev, &NPU2_PHY_RX_CTL_DATASM_CLKDIST_PDWN, 0); + + FOR_EACH_LANE(ndev, lane) { + phy_write_lane(ndev, &NPU2_PHY_RX_LANE_ANA_PDWN, lane, 0); + phy_write_lane(ndev, &NPU2_PHY_RX_LANE_DIG_PDWN, lane, 0); + phy_write_lane(ndev, &NPU2_PHY_TX_LANE_PDWN, lane, 0); + } + + /* Clear fence state for the brick */ + val = npu2_read(ndev->npu, NPU2_MISC_FENCE_STATE); + if (val) { + NPU2DEVINF(ndev, "Clearing all bricks fence\n"); + npu2_write(ndev->npu, NPU2_MISC_FENCE_STATE, val); + for (i = 0, check = 0; i < 4096; i++) { + check = npu2_read(ndev->npu, NPU2_NTL_CQ_FENCE_STATUS(ndev)); + if (!check) + break; + } + if (check) + NPU2DEVERR(ndev, "Clearing NPU2_MISC_FENCE_STATE=0x%llx timeout, current=0x%llx\n", + val, check); + } + + /* Write PRI */ + val = SETFIELD(PPC_BITMASK(0,1), 0ull, obus_brick_index(ndev)); + npu2_write_mask(ndev->npu, NPU2_NTL_PRI_CFG(ndev), val, -1ULL); + + val = NPU2_NTL_MISC_CFG2_NDL_RX_PARITY_ENA; + npu2_write_mask(ndev->npu, NPU2_NTL_MISC_CFG2(ndev), 0ull, val); + + /* NTL Reset */ + val = npu2_read(ndev->npu, NPU2_NTL_MISC_CFG1(ndev)); + val |= PPC_BIT(8) | PPC_BIT(9); + npu2_write(ndev->npu, NPU2_NTL_MISC_CFG1(ndev), val); + + if (!poll_fence_status(ndev, 0xc000000000000000UL)) + return PROCEDURE_COMPLETE | PROCEDURE_FAILED; + + return PROCEDURE_NEXT; +} + +static uint32_t reset_ndl(struct npu2_dev *ndev) +{ + uint64_t val; + + val = npu2_read_4b(ndev->npu, NPU2_NTL_DL_CONTROL(ndev)); + val |= PPC_BIT32(0) | PPC_BIT32(1); + npu2_write_4b(ndev->npu, NPU2_NTL_DL_CONTROL(ndev), val); + + val = npu2_read_4b(ndev->npu, NPU2_NTL_DL_CONTROL(ndev)); + val &= ~(PPC_BIT32(0) | PPC_BIT32(1)); + npu2_write_4b(ndev->npu, NPU2_NTL_DL_CONTROL(ndev), val); + + val = PPC_BIT32(0); + npu2_write_4b(ndev->npu, NPU2_NTL_DL_CONFIG(ndev), val); + + return PROCEDURE_NEXT; +} + +static uint32_t reset_ntl_release(struct npu2_dev *ndev) +{ + uint64_t val; + uint64_t npu2_fir; + uint64_t npu2_fir_addr; + int i; + + /* Clear FIR bits */ + npu2_fir_addr = NPU2_FIR_REGISTER_0; + npu2_fir = 0; + + for (i = 0; i < NPU2_TOTAL_FIR_REGISTERS; i++) { + xscom_write(ndev->npu->chip_id, npu2_fir_addr, npu2_fir); + npu2_fir_addr += NPU2_FIR_OFFSET; + + } + + val = npu2_read(ndev->npu, NPU2_NTL_MISC_CFG1(ndev)); + val &= 0xFFBFFFFFFFFFFFFFUL; + npu2_write(ndev->npu, NPU2_NTL_MISC_CFG1(ndev), val); + + if (!poll_fence_status(ndev, 0x8000000000000000UL)) + return PROCEDURE_COMPLETE | PROCEDURE_FAILED; + + return PROCEDURE_NEXT; +} + +static uint32_t reset_ntl_finish(struct npu2_dev *ndev) +{ + /* Credit Setup */ + npu2_write(ndev->npu, NPU2_NTL_CRED_HDR_CREDIT_TX(ndev), 0x0200000000000000UL); + npu2_write(ndev->npu, NPU2_NTL_PRB_HDR_CREDIT_TX(ndev), 0x0200000000000000UL); + npu2_write(ndev->npu, 
NPU2_NTL_ATR_HDR_CREDIT_TX(ndev), 0x0200000000000000UL); + npu2_write(ndev->npu, NPU2_NTL_RSP_HDR_CREDIT_TX(ndev), 0x0200000000000000UL); + npu2_write(ndev->npu, NPU2_NTL_CRED_DATA_CREDIT_TX(ndev), 0x1000000000000000UL); + npu2_write(ndev->npu, NPU2_NTL_RSP_DATA_CREDIT_TX(ndev), 0x1000000000000000UL); + npu2_write(ndev->npu, NPU2_NTL_CRED_HDR_CREDIT_RX(ndev), 0x0000BE0000000000UL); + npu2_write(ndev->npu, NPU2_NTL_DBD_HDR_CREDIT_RX(ndev), 0x0000640000000000UL); + npu2_write(ndev->npu, NPU2_NTL_ATSD_HDR_CREDIT_RX(ndev), 0x0000200000000000UL); + npu2_write(ndev->npu, NPU2_NTL_RSP_HDR_CREDIT_RX(ndev), 0x0000BE0000000000UL); + npu2_write(ndev->npu, NPU2_NTL_CRED_DATA_CREDIT_RX(ndev), 0x0001000000000000UL); + npu2_write(ndev->npu, NPU2_NTL_RSP_DATA_CREDIT_RX(ndev), 0x0001000000000000UL); + + npu2_set_link_flag(ndev, NPU2_DEV_DL_RESET); + + return PROCEDURE_COMPLETE; +} +DEFINE_PROCEDURE(reset_ntl, reset_ndl, reset_ntl_release, reset_ntl_finish); + +/* Procedure 1.2.2 - Reset I/O PHY Lanes */ +static uint32_t phy_reset(struct npu2_dev *ndev) +{ + int lane; + + set_iovalid(ndev, false); + + /* Power on clocks */ + phy_write(ndev, &NPU2_PHY_RX_CLKDIST_PDWN, 0); + phy_write(ndev, &NPU2_PHY_RX_IREF_PDWN, 1); + phy_write(ndev, &NPU2_PHY_TX_CLKDIST_PDWN, 0); + phy_write(ndev, &NPU2_PHY_RX_CTL_DATASM_CLKDIST_PDWN, 0); + + FOR_EACH_LANE(ndev, lane) + phy_write_lane(ndev, &NPU2_PHY_RX_RUN_LANE, lane, 0); + + return PROCEDURE_NEXT; +} + +static uint32_t phy_reset_wait(struct npu2_dev *ndev) +{ + int lane; + + /* Wait for all lanes to become inactive */ + FOR_EACH_LANE(ndev, lane) + if (phy_read_lane(ndev, &NPU2_PHY_RX_LANE_BUSY, lane)) + return PROCEDURE_INPROGRESS; + + FOR_EACH_LANE(ndev, lane) { + /* Set lane in reset */ + phy_write_lane(ndev, &NPU2_PHY_RX_IORESET, lane, 1); + phy_write_lane(ndev, &NPU2_PHY_TX_IORESET, lane, 1); + + /* Release lane from reset */ + phy_write_lane(ndev, &NPU2_PHY_RX_IORESET, lane, 0); + phy_write_lane(ndev, &NPU2_PHY_TX_IORESET, lane, 0); + + /* Reset the phase rotator */ + phy_write_lane(ndev, &NPU2_PHY_RX_PR_RESET, lane, 1); + phy_write_lane(ndev, &NPU2_PHY_RX_PR_RESET, lane, 0); + } + + return PROCEDURE_NEXT; +} + +/* Procedure 1.2.3 - Initialise I/O PHY Registers */ +static uint32_t phy_reset_complete(struct npu2_dev *ndev) +{ + int lane; + + FOR_EACH_LANE(ndev, lane) { + phy_write_lane(ndev, &NPU2_PHY_RX_LANE_ANA_PDWN, lane, 0); + phy_write_lane(ndev, &NPU2_PHY_RX_LANE_DIG_PDWN, lane, 0); + phy_write_lane(ndev, &NPU2_PHY_RX_PR_IQ_RES_SEL, lane, 0x7); + phy_write_lane(ndev, &NPU2_PHY_RX_PR_PHASE_STEP, lane, 0xc); + phy_write_lane(ndev, &NPU2_PHY_TX_LANE_PDWN, lane, 0); + phy_write_lane(ndev, &NPU2_PHY_RX_PR_FW_INERTIA_AMT, lane, 4); + phy_write_lane(ndev, &NPU2_PHY_RX_CFG_LTE_MC, lane, 3); + phy_write_lane(ndev, &NPU2_PHY_RX_A_INTEG_COARSE_GAIN, lane, 11); + phy_write_lane(ndev, &NPU2_PHY_RX_B_INTEG_COARSE_GAIN, lane, 11); + phy_write_lane(ndev, &NPU2_PHY_RX_E_INTEG_COARSE_GAIN, lane, 11); + + if (ndev->type == NPU2_DEV_TYPE_OPENCAPI) { + phy_write_lane(ndev, &NPU2_PHY_RX_A_CTLE_GAIN, lane, 0); + phy_write_lane(ndev, &NPU2_PHY_RX_B_CTLE_GAIN, lane, 0); + phy_write_lane(ndev, &NPU2_PHY_RX_E_CTLE_GAIN, lane, 0); + + phy_write_lane(ndev, &NPU2_PHY_RX_A_CTLE_COARSE, lane, 20); + phy_write_lane(ndev, &NPU2_PHY_RX_B_CTLE_COARSE, lane, 20); + phy_write_lane(ndev, &NPU2_PHY_RX_E_CTLE_COARSE, lane, 20); + } + } + + set_iovalid(ndev, true); + + return PROCEDURE_COMPLETE; +} +DEFINE_PROCEDURE(phy_reset, phy_reset_wait, phy_reset_complete); + +/* Procedure 1.2.6 - I/O PHY Tx 
Impedance Calibration */ +static uint32_t phy_tx_zcal(struct npu2_dev *ndev) +{ + if (ndev->npu->tx_zcal_complete[obus_index(ndev)]) + return PROCEDURE_COMPLETE; + + /* Turn off SW enable and enable zcal state machine */ + phy_write(ndev, &NPU2_PHY_TX_ZCAL_SWO_EN, 0); + + /* Start impedance calibration state machine */ + phy_write(ndev, &NPU2_PHY_TX_ZCAL_REQ, 1); + + return PROCEDURE_NEXT; +} + +static uint32_t phy_tx_zcal_wait(struct npu2_dev *ndev) +{ + int done, error; + + done = phy_read(ndev, &NPU2_PHY_TX_ZCAL_DONE); + error = phy_read(ndev, &NPU2_PHY_TX_ZCAL_ERROR); + + /* We have never seen this in the field and it is not expected. + * Therefore it's best to error out which will complain loudly. Nominal + * vaules may be set in nvram to ignore this error. */ + if (error && nv_zcal_nominal < 0) { + NPU2DEVERR(ndev, "ZCAL failed. Nominal values may be used by" + " setting nvram variable nv_zcal_override = 50\n"); + NPU2DEVERR(ndev, "However this may impact link performance\n"); + return PROCEDURE_COMPLETE | PROCEDURE_FAILED; + } + + if (!done) + return PROCEDURE_INPROGRESS; + + return PROCEDURE_NEXT; +} + +#define MARGIN_RATIO (0) +#define FFE_PRE_COEFF (0) +#define FFE_POST_COEFF (0) + +#define PRE_WIDTH (5) +#define POST_WIDTH (7) +#define MAIN_WIDTH (7) +#define ZCAL_MIN (16 * 2) +#define ZCAL_MAX (33 * 2) +#define PRECURSOR_X2_MAX (4 * 2 + 1) +#define POSTCURSOR_X2_MAX (6 * 2 + 1) +#define MARGIN_X2_MAX (8 * 2) +#define MAIN_X2_MAX ((6 * 2) + 1) +#define TOTAL_X2_MAX (PRECURSOR_X2_MAX + POSTCURSOR_X2_MAX + 2*MARGIN_X2_MAX + MAIN_X2_MAX) + +static uint32_t therm(uint32_t dec) +{ + return ((0x1 << dec) - 1); +} + +static uint32_t therm_with_half(uint32_t dec, uint8_t width) +{ + /* If the LSB of the 2r equivalent is on, then we need to set the 2r bit (MSB) */ + uint32_t half_on = ( dec & 0x1 ) << ( width - 1 ); + + /* Shift the 2r equivalent to a 1r value and convert to a thermometer code. */ + uint32_t x1_equiv = ((1 << (dec >> 1 )) - 1); + + /* Combine 1r equivalent thermometer code + the 2r MSB value. */ + return half_on | x1_equiv; +} + +static uint32_t phy_tx_zcal_calculate(struct npu2_dev *ndev) +{ + int p_value, n_value; + int ffe_pre_coeff = FFE_PRE_COEFF; + int ffe_post_coeff = FFE_POST_COEFF; + uint32_t zcal_n; + uint32_t zcal_p; + uint32_t p_main_enable = MAIN_X2_MAX; + uint32_t p_margin_pu_enable = MARGIN_X2_MAX; + uint32_t p_margin_pd_enable = MARGIN_X2_MAX; + uint32_t p_precursor_select; + uint32_t p_postcursor_select; + uint32_t margin_pu_select; + uint32_t n_main_enable = MAIN_X2_MAX; + uint32_t n_margin_pu_enable = MARGIN_X2_MAX; + uint32_t n_margin_pd_enable = MARGIN_X2_MAX; + uint32_t n_precursor_select; + uint32_t n_postcursor_select; + uint32_t margin_pd_select; + uint32_t margin_select; + + if (nv_zcal_nominal < 0) { + /* Convert the value from 8R to 2R by / 4 */ + zcal_n = phy_read(ndev, &NPU2_PHY_TX_ZCAL_N) / 4; + zcal_p = phy_read(ndev, &NPU2_PHY_TX_ZCAL_P) / 4; + } else { + zcal_n = zcal_p = nv_zcal_nominal; + NPU2DEVINF(ndev, "Using nominal values for zcal, performance may be impacted\n"); + } + + /* Again, if the hardware detects an unexpected condition it's + * better just to fail loudly. 
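 * (The range check just below does exactly that: out-of-range zcal
 *  results abort with PROCEDURE_FAILED. If the nominal override is
 *  wanted instead, nv_zcal_override is read from nvram in probe_npu2();
 *  e.g. `nvram -p ibm,skiboot --update-config nv_zcal_override=50` is
 *  one illustrative way of setting it from the host.)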
*/ + if ((zcal_n < ZCAL_MIN) || (zcal_n > ZCAL_MAX) || + (zcal_p < ZCAL_MIN) || (zcal_p > ZCAL_MAX)) + return PROCEDURE_COMPLETE | PROCEDURE_FAILED; + + if (ndev->type == NPU2_DEV_TYPE_OPENCAPI && + platform.ocapi->phy_setup) { + ffe_pre_coeff = platform.ocapi->phy_setup->tx_ffe_pre_coeff; + ffe_post_coeff = platform.ocapi->phy_setup->tx_ffe_post_coeff; + } + + p_value = zcal_p - TOTAL_X2_MAX; + p_precursor_select = (p_value * ffe_pre_coeff)/128; + p_postcursor_select = (p_value * ffe_post_coeff)/128; + margin_pu_select = (p_value * MARGIN_RATIO)/256; + + if (p_value % 2) { + p_main_enable--; + p_value++; + } + + while (p_value < 0) { + if (p_main_enable > 1) { + p_main_enable -= 2; + } else if ((p_margin_pu_enable + p_margin_pd_enable) > 0) { + if (p_margin_pu_enable == p_margin_pd_enable) + p_margin_pd_enable -= 2; + else + p_margin_pu_enable -= 2; + } + p_value += 2; + } + + n_value = zcal_n - TOTAL_X2_MAX; + n_precursor_select = (n_value * ffe_pre_coeff)/128; + n_postcursor_select = (n_value * ffe_post_coeff)/128; + margin_pd_select = (p_value * MARGIN_RATIO)/256; + + if (n_value % 2) { + n_main_enable--; + n_value++; + } + + while (n_value < 0) { + if (n_main_enable > 1) { + n_main_enable -= 2; + } else if ((n_margin_pu_enable + n_margin_pd_enable) > 0) { + if (n_margin_pu_enable == n_margin_pd_enable) + n_margin_pd_enable -= 2; + else + n_margin_pu_enable -= 2; + } + n_value += 2; + } + + margin_select = therm((margin_pu_select + 1)/2) & + therm((margin_pd_select + 1)/2) & + therm((p_margin_pu_enable + 1)/2) & + therm((p_margin_pd_enable + 1)/2) & + therm((n_margin_pu_enable + 1)/2) & + therm((n_margin_pd_enable + 1)/2); + + phy_write(ndev, &NPU2_PHY_TX_PSEG_PRE_EN, therm_with_half(PRECURSOR_X2_MAX, PRE_WIDTH)); + phy_write(ndev, &NPU2_PHY_TX_PSEG_PRE_SELECT, therm_with_half(p_precursor_select, PRE_WIDTH)); + phy_write(ndev, &NPU2_PHY_TX_PSEG_POST_EN, therm_with_half(POSTCURSOR_X2_MAX, POST_WIDTH)); + phy_write(ndev, &NPU2_PHY_TX_PSEG_POST_SELECT, therm_with_half(p_postcursor_select, POST_WIDTH)); + phy_write(ndev, &NPU2_PHY_TX_PSEG_MARGINPU_EN, therm((p_margin_pu_enable + 1)/2)); + phy_write(ndev, &NPU2_PHY_TX_PSEG_MARGINPD_EN, therm((p_margin_pd_enable + 1)/2)); + phy_write(ndev, &NPU2_PHY_TX_PSEG_MAIN_EN, therm_with_half(p_main_enable, MAIN_WIDTH)); + + phy_write(ndev, &NPU2_PHY_TX_NSEG_PRE_EN, therm_with_half(PRECURSOR_X2_MAX, PRE_WIDTH)); + phy_write(ndev, &NPU2_PHY_TX_NSEG_PRE_SELECT, therm_with_half(n_precursor_select, PRE_WIDTH)); + phy_write(ndev, &NPU2_PHY_TX_NSEG_POST_EN, therm_with_half(POSTCURSOR_X2_MAX, POST_WIDTH)); + phy_write(ndev, &NPU2_PHY_TX_NSEG_POST_SELECT, therm_with_half(n_postcursor_select, POST_WIDTH)); + phy_write(ndev, &NPU2_PHY_TX_NSEG_MARGINPU_EN, therm((n_margin_pu_enable + 1)/2)); + phy_write(ndev, &NPU2_PHY_TX_NSEG_MARGINPD_EN, therm((n_margin_pd_enable + 1)/2)); + phy_write(ndev, &NPU2_PHY_TX_NSEG_MAIN_EN, therm_with_half(n_main_enable, MAIN_WIDTH)); + + phy_write(ndev, &NPU2_PHY_TX_MARGINPU_SELECT, therm(margin_select + 1)/2); + phy_write(ndev, &NPU2_PHY_TX_MARGINPD_SELECT, therm(margin_select + 1)/2); + + ndev->npu->tx_zcal_complete[obus_index(ndev)] = 1; + return PROCEDURE_COMPLETE; +} +DEFINE_PROCEDURE(phy_tx_zcal, phy_tx_zcal_wait, phy_tx_zcal_calculate); + +/* Procedure 1.2.8 - Enable Downstream Link Training */ +static uint32_t phy_enable_tx_rxcal(struct npu2_dev *ndev) +{ + int lane; + + FOR_EACH_LANE(ndev, lane) + phy_write_lane(ndev, &NPU2_PHY_TX_RXCAL, lane, 1); + + return PROCEDURE_COMPLETE; +} +DEFINE_PROCEDURE(phy_enable_tx_rxcal); + 
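/*
 * Illustrative sketch, not part of the original source: how the
 * procedure framework above is used. Each step returns PROCEDURE_NEXT
 * to fall straight through to the next step, PROCEDURE_INPROGRESS to
 * be polled again later, or PROCEDURE_COMPLETE (optionally OR'd with
 * PROCEDURE_FAILED/PROCEDURE_ABORTED) to finish; get_procedure_status()
 * below drives the steps and applies a 1000ms timeout. The example_*
 * names are hypothetical.
 */
#if 0
static uint32_t example_kick(struct npu2_dev *ndev)
{
	/* start a hardware action, then fall through to polling */
	return PROCEDURE_NEXT;
}

static uint32_t example_poll(struct npu2_dev *ndev)
{
	/* poll a per-PHY status bit defined in the table above */
	if (!phy_read(ndev, &NPU2_PHY_TX_ZCAL_DONE))
		return PROCEDURE_INPROGRESS;
	return PROCEDURE_COMPLETE;
}
DEFINE_PROCEDURE(example_kick, example_poll);
#endif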
+/* Procedure 1.2.9 - Disable Downstream Link Training */ +static uint32_t phy_disable_tx_rxcal(struct npu2_dev *ndev) +{ + int lane; + + FOR_EACH_LANE(ndev, lane) + phy_write_lane(ndev, &NPU2_PHY_TX_RXCAL, lane, 0); + + return PROCEDURE_COMPLETE; +} +DEFINE_PROCEDURE(phy_disable_tx_rxcal); + +/* Procedure 1.2.4 - I/O PHY DC Calibration */ +static uint32_t phy_rx_dccal(struct npu2_dev *ndev) +{ + int lane; + + set_iovalid(ndev, false); + + FOR_EACH_LANE(ndev, lane) + phy_write_lane(ndev, &NPU2_PHY_RX_PR_FW_OFF, lane, 1); + + FOR_EACH_LANE(ndev, lane) + phy_write_lane(ndev, &NPU2_PHY_RX_RUN_DCCAL, lane, 1); + + return PROCEDURE_NEXT; +} + +static uint32_t phy_rx_dccal_complete(struct npu2_dev *ndev) +{ + int lane; + + FOR_EACH_LANE(ndev, lane) + if (!phy_read_lane(ndev, &NPU2_PHY_RX_DCCAL_DONE, lane)) + return PROCEDURE_INPROGRESS; + + FOR_EACH_LANE(ndev, lane) + phy_write_lane(ndev, &NPU2_PHY_RX_RUN_DCCAL, lane, 0); + + FOR_EACH_LANE(ndev, lane) { + phy_write_lane(ndev, &NPU2_PHY_RX_B_BANK_CONTROLS, lane, 0); + phy_write_lane(ndev, &NPU2_PHY_RX_PR_EDGE_TRACK_CNTL, lane, 0); + phy_write_lane(ndev, &NPU2_PHY_RX_PR_FW_OFF, lane, 0); + } + + set_iovalid(ndev, true); + + return PROCEDURE_NEXT; +} + +static uint32_t phy_rx_clock_sel(struct npu2_dev *ndev) +{ + if (ndev->type != NPU2_DEV_TYPE_OPENCAPI) { + /* + * Change the RX clk mux control to be done by + * software instead of HW. This avoids glitches caused + * by changing the mux setting. + * + * Work around a known DL bug by doing these writes + * twice. + */ + npu2_write_mask_4b(ndev->npu, NPU2_NTL_DL_CLK_CTRL(ndev), + 0x80000002, 0x80000003); + npu2_write_mask_4b(ndev->npu, NPU2_NTL_DL_CLK_CTRL(ndev), + 0x80000002, 0x80000003); + + npu2_write_mask_4b(ndev->npu, NPU2_NTL_DL_CLK_CTRL(ndev), + 0x80000000, 0x80000003); + npu2_write_mask_4b(ndev->npu, NPU2_NTL_DL_CLK_CTRL(ndev), + 0x80000000, 0x80000003); + } + return PROCEDURE_NEXT; +} + +/* Procedure 1.2.5 - IO PHY Tx FIFO Init */ +static uint32_t phy_tx_fifo_init(struct npu2_dev *ndev) +{ + int lane; + + FOR_EACH_LANE(ndev, lane) { + phy_write_lane(ndev, &NPU2_PHY_TX_UNLOAD_CLK_DISABLE, lane, 0); + phy_write_lane(ndev, &NPU2_PHY_TX_FIFO_INIT, lane, 1); + phy_write_lane(ndev, &NPU2_PHY_TX_UNLOAD_CLK_DISABLE, lane, 1); + } + + return PROCEDURE_COMPLETE; +} + +/* We group TX FIFO init in here mainly because that's what was done + * on NVLink1 */ +DEFINE_PROCEDURE(phy_rx_dccal, phy_rx_dccal_complete, phy_rx_clock_sel, + phy_tx_fifo_init); + +/* Procedure 1.2.7 - I/O PHY Upstream Link Training */ +static uint32_t phy_rx_training(struct npu2_dev *ndev) +{ + int lane; + + FOR_EACH_LANE(ndev, lane) + phy_write_lane(ndev, &NPU2_PHY_RX_RUN_LANE, lane, 1); + + return PROCEDURE_NEXT; +} + +static uint32_t phy_rx_training_wait(struct npu2_dev *ndev) +{ + int lane; + + FOR_EACH_LANE(ndev, lane) + if (!phy_read_lane(ndev, &NPU2_PHY_RX_INIT_DONE, lane)) + return PROCEDURE_INPROGRESS; + + return PROCEDURE_COMPLETE; +} +DEFINE_PROCEDURE(phy_rx_training, phy_rx_training_wait); + +static uint32_t check_credit(struct npu2_dev *ndev, uint64_t reg, + const char *reg_name, uint64_t expected) +{ + uint64_t val; + + val = npu2_read(ndev->npu, reg); + if (val == expected) + return 0; + + NPU2DEVERR(ndev, "%s: expected 0x%llx, read 0x%llx\n", + reg_name, expected, val); + + return 1; +} + +#define CHECK_CREDIT(ndev, reg, expected) \ + check_credit(ndev, reg(ndev), #reg, expected); + +static uint32_t check_credits(struct npu2_dev *ndev) +{ + uint64_t val; + + CHECK_CREDIT(ndev, NPU2_NTL_CRED_HDR_CREDIT_RX, 
0x0BE0BE0000000000ULL); + CHECK_CREDIT(ndev, NPU2_NTL_RSP_HDR_CREDIT_RX, 0x0BE0BE0000000000ULL); + CHECK_CREDIT(ndev, NPU2_NTL_CRED_DATA_CREDIT_RX, 0x1001000000000000ULL); + CHECK_CREDIT(ndev, NPU2_NTL_RSP_DATA_CREDIT_RX, 0x1001000000000000ULL); + CHECK_CREDIT(ndev, NPU2_NTL_DBD_HDR_CREDIT_RX, 0x0640640000000000ULL); + CHECK_CREDIT(ndev, NPU2_NTL_ATSD_HDR_CREDIT_RX, 0x0200200000000000ULL); + + val = npu2_read(ndev->npu, NPU2_NTL_MISC_CFG1(ndev)); + val &= 0xFF3FFFFFFFFFFFFFUL; + npu2_write(ndev->npu, NPU2_NTL_MISC_CFG1(ndev), val); + + if (!poll_fence_status(ndev, 0x0)) + return PROCEDURE_COMPLETE | PROCEDURE_FAILED; + + val = NPU2_NTL_MISC_CFG2_NDL_RX_PARITY_ENA; + npu2_write_mask(ndev->npu, NPU2_NTL_MISC_CFG2(ndev), val, val); + + return PROCEDURE_COMPLETE; +} +DEFINE_PROCEDURE(check_credits); + +static struct procedure *npu_procedures[] = { + &procedure_stop, + &procedure_nop, + NULL, + NULL, + &procedure_phy_reset, + &procedure_phy_tx_zcal, + &procedure_phy_rx_dccal, + &procedure_phy_enable_tx_rxcal, + &procedure_phy_disable_tx_rxcal, + &procedure_phy_rx_training, + &procedure_reset_ntl, + + /* Place holders for pre-terminate and terminate procedures */ + &procedure_nop, + &procedure_nop, + &procedure_check_credits +}; + +/* Run a procedure step(s) and return status */ +static uint32_t get_procedure_status(struct npu2_dev *dev) +{ + uint32_t result; + uint16_t procedure = dev->procedure_number; + uint16_t step = dev->procedure_step; + const char *name = npu_procedures[procedure]->name; + + do { + result = npu_procedures[procedure]->steps[step](dev); + + if (result & PROCEDURE_NEXT) { + step++; + NPU2DEVINF(dev, "Running procedure %s step %d\n", name, step); + } + } while (result & PROCEDURE_NEXT); + + dev->procedure_step = step; + + if (result & PROCEDURE_COMPLETE) + NPU2DEVINF(dev, "Procedure %s complete\n", name); + else if (mftb() > dev->procedure_tb + msecs_to_tb(1000)) { + NPU2DEVINF(dev, "Procedure %s timed out\n", name); + result = PROCEDURE_COMPLETE | PROCEDURE_FAILED; + } + + /* Mask off internal state bits */ + dev->procedure_status = result & PROCEDURE_STATUS_MASK; + + return dev->procedure_status; +} + +static int64_t npu_dev_procedure_read(struct npu2_dev *dev, uint32_t offset, + uint32_t size, uint32_t *data) +{ + int64_t rc = OPAL_SUCCESS; + + if (size != 4) { + /* Short config reads are not supported */ + prlog(PR_ERR, "NPU%d: Short read of procedure register\n", npu2_dev_to_phb(dev)->opal_id); + return OPAL_PARAMETER; + } + + *data = 0; + + switch (offset) { + case 0: + /* Only run the procedure if not already complete */ + if (dev->procedure_status & PROCEDURE_COMPLETE) + *data = dev->procedure_status; + else + *data = get_procedure_status(dev); + + break; + + case 4: + *data = dev->procedure_number; + break; + + default: + prlog(PR_ERR, "NPU%d: Invalid vendor specific offset 0x%08x\n", + npu2_dev_to_phb(dev)->opal_id, offset); + rc = OPAL_PARAMETER; + } + + return rc; +} + +static int64_t npu_dev_procedure_write(struct npu2_dev *dev, uint32_t offset, + uint32_t size, uint32_t data) +{ + const char *name; + int64_t rc = OPAL_SUCCESS; + + if (size != 4) { + /* Short config writes are not supported */ + prlog(PR_ERR, "NPU%d: Short read of procedure register\n", + npu2_dev_to_phb(dev)->opal_id); + return OPAL_PARAMETER; + } + + switch (offset) { + case 0: + /* We ignore writes to the status register */ + NPU2DEVINF(dev, "Ignoring writes to status register\n"); + break; + + case 4: + if (data >= ARRAY_SIZE(npu_procedures) || + !npu_procedures[data]) { + NPU2DEVINF(dev, 
"Unsupported procedure number %d\n", data); + dev->procedure_status = PROCEDURE_COMPLETE + | PROCEDURE_UNSUPPORTED; + break; + } + + name = npu_procedures[data]->name; + if (dev->procedure_number == data + && !(dev->procedure_status & PROCEDURE_COMPLETE)) + NPU2DEVINF(dev, "Restarting procedure %s\n", name); + else + NPU2DEVINF(dev, "Starting procedure %s\n", name); + + dev->procedure_status = PROCEDURE_INPROGRESS; + dev->procedure_number = data; + dev->procedure_step = 0; + dev->procedure_tb = mftb(); + break; + + default: + NPU2DEVINF(dev, "Invalid vendor specific offset 0x%08x\n", offset); + rc = OPAL_PARAMETER; + } + + return rc; +} + +int64_t npu2_dev_procedure(void *dev, struct pci_cfg_reg_filter *pcrf, + uint32_t offset, uint32_t len, uint32_t *data, + bool write) +{ + struct pci_virt_device *pvd = dev; + struct npu2_dev *ndev = pvd->data; + + if (write) + return npu_dev_procedure_write(ndev, offset - pcrf->start, + len, *data); + + return npu_dev_procedure_read(ndev, offset - pcrf->start, len, data); +} + +void npu2_dev_procedure_reset(struct npu2_dev *dev) +{ + uint64_t val; + + /* Fence the brick */ + val = npu2_read(dev->npu, NPU2_NTL_MISC_CFG1(dev)); + val |= PPC_BIT(8) | PPC_BIT(9); + npu2_write(dev->npu, NPU2_NTL_MISC_CFG1(dev), val); + + npu2_clear_link_flag(dev, NPU2_DEV_DL_RESET); +} + +static uint32_t run_procedure(struct npu2_dev *dev, uint16_t procedure_number) +{ + struct procedure *proc; + const char *name; + uint32_t result; + + assert(procedure_number <= ARRAY_SIZE(npu_procedures)); + proc = npu_procedures[procedure_number]; + assert(proc); + + name = proc->name; + NPU2DEVINF(dev, "Running procedure %s\n", name); + dev->procedure_status = PROCEDURE_INPROGRESS; + dev->procedure_number = procedure_number; + dev->procedure_step = 0; + dev->procedure_tb = mftb(); + + result = get_procedure_status(dev); + while (!(result & PROCEDURE_COMPLETE)) { + time_wait_ms(1); + result = get_procedure_status(dev); + } + return result; +} + +void npu2_opencapi_bump_ui_lane(struct npu2_dev *dev) +{ + uint64_t reg; + uint64_t status_xscom; + int lane, bit = 7; + + status_xscom = OB_ODL_TRAINING_STATUS(dev->brick_index); + xscom_read(dev->npu->chip_id, status_xscom, ®); + reg = GETFIELD(OB_ODL_TRAINING_STATUS_STS_RX_PATTERN_B, reg); + + FOR_EACH_LANE(dev, lane) { + if (reg & (1 << bit--)) + continue; + prlog(PR_TRACE, "OCAPI: bumpui bumping lane %d\n", lane); + for (int i = 0; i < 4; i++) { + phy_write_lane(dev, &NPU2_PHY_RX_PR_BUMP_SL_1UI, lane, 1); + phy_write_lane(dev, &NPU2_PHY_RX_PR_BUMP_SL_1UI, lane, 0); + } + } +} + +void npu2_opencapi_phy_init(struct npu2_dev *dev) +{ + if (platform.ocapi->phy_setup) { + OCAPIINF(dev, "Enabling platform-specific PHY setup\n"); + phy_write(dev, &NPU2_PHY_TX_FFE_BOOST_EN, + platform.ocapi->phy_setup->tx_ffe_boost_en); + } + + run_procedure(dev, 5); /* procedure_phy_tx_zcal */ + /* + * This is only required for OpenCAPI - Hostboot tries to set this + * on systems where it can tell a link is OpenCAPI, but for + * Witherspoon it needs to be done in skiboot after device detection. 
+ */ + phy_write(dev, &NPU2_PHY_RX_RC_ENABLE_AUTO_RECAL, 0x1); + phy_write(dev, &NPU2_PHY_RX_AC_COUPLED, 1); + + switch (dev->link_speed) { + case 20000000000UL: + OCAPIINF(dev, "Link speed set at 20Gb/s\n"); + phy_write(dev, &NPU2_PHY_RX_SPEED_SELECT, 1); + break; + case 25000000000UL: + case 25781250000UL: + OCAPIINF(dev, "Link speed set at 25.xGb/s\n"); + phy_write(dev, &NPU2_PHY_RX_SPEED_SELECT, 0); + break; + default: + OCAPIERR(dev, "Invalid link speed!\n"); + assert(false); + } +} + +int npu2_opencapi_phy_reset(struct npu2_dev *dev) +{ + int rc; + + rc = run_procedure(dev, 4); /* procedure_phy_reset */ + if (rc != PROCEDURE_COMPLETE) + return -1; + rc = run_procedure(dev, 6); /* procedure_phy_rx_dccal */ + if (rc != PROCEDURE_COMPLETE) + return -1; + return 0; +} + +void npu2_opencapi_phy_prbs31(struct npu2_dev *dev) +{ + phy_write(dev, &NPU2_PHY_TX_DRV_DATA_PATTERN_GCRMSG, 0xD); +} diff --git a/roms/skiboot/hw/npu2-opencapi.c b/roms/skiboot/hw/npu2-opencapi.c new file mode 100644 index 000000000..035c6cdc3 --- /dev/null +++ b/roms/skiboot/hw/npu2-opencapi.c @@ -0,0 +1,2370 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* + * Support for OpenCAPI on POWER9 NPUs + * + * This file provides support for OpenCAPI as implemented on POWER9. + * + * At present, we initialise the NPU separately from the NVLink code in npu2.c. + * As such, we don't currently support mixed NVLink and OpenCAPI configurations + * on the same NPU for machines such as Witherspoon. + * + * Procedure references in this file are to the POWER9 OpenCAPI NPU Workbook + * (IBM internal document). + * + * TODO: + * - Support for mixed NVLink and OpenCAPI on the same NPU + * - Support for link ganging (one AFU using multiple links) + * - Link reset and error handling + * - Presence detection + * - Consume HDAT NPU information + * - LPC Memory support + * + * Copyright 2013-2019 IBM Corp. 
+ */ + +#include <skiboot.h> +#include <xscom.h> +#include <io.h> +#include <timebase.h> +#include <pci.h> +#include <pci-cfg.h> +#include <pci-slot.h> +#include <interrupts.h> +#include <opal.h> +#include <opal-api.h> +#include <npu2.h> +#include <npu2-regs.h> +#include <phys-map.h> +#include <i2c.h> +#include <nvram.h> + +#define NPU_IRQ_LEVELS_XSL 23 +#define MAX_PE_HANDLE ((1 << 15) - 1) +#define TL_MAX_TEMPLATE 63 +#define TL_RATE_BUF_SIZE 32 + +#define OCAPI_SLOT_NORMAL PCI_SLOT_STATE_NORMAL +#define OCAPI_SLOT_LINK PCI_SLOT_STATE_LINK +#define OCAPI_SLOT_LINK_START (OCAPI_SLOT_LINK + 1) +#define OCAPI_SLOT_LINK_WAIT (OCAPI_SLOT_LINK + 2) +#define OCAPI_SLOT_LINK_TRAINED (OCAPI_SLOT_LINK + 3) +#define OCAPI_SLOT_FRESET PCI_SLOT_STATE_FRESET +#define OCAPI_SLOT_FRESET_START (OCAPI_SLOT_FRESET + 1) +#define OCAPI_SLOT_FRESET_INIT (OCAPI_SLOT_FRESET + 2) +#define OCAPI_SLOT_FRESET_ASSERT_DELAY (OCAPI_SLOT_FRESET + 3) +#define OCAPI_SLOT_FRESET_DEASSERT_DELAY (OCAPI_SLOT_FRESET + 4) +#define OCAPI_SLOT_FRESET_INIT_DELAY (OCAPI_SLOT_FRESET + 5) + +#define OCAPI_LINK_TRAINING_RETRIES 2 +#define OCAPI_LINK_TRAINING_TIMEOUT 3000 /* ms */ +#define OCAPI_LINK_STATE_TRAINED 0x7 + +enum npu2_link_training_state { + NPU2_TRAIN_DEFAULT, /* fully train the link */ + NPU2_TRAIN_PRBS31, /* used for Signal Integrity testing */ + NPU2_TRAIN_NONE, /* used for testing with loopback cable */ +}; +static enum npu2_link_training_state npu2_ocapi_training_state = NPU2_TRAIN_DEFAULT; + +static const struct phb_ops npu2_opencapi_ops; + +static inline uint64_t index_to_stack(uint64_t index) { + switch (index) { + case 2: + case 3: + return NPU2_STACK_STCK_1; + break; + case 4: + case 5: + return NPU2_STACK_STCK_2; + break; + default: + assert(false); + } +} + +static inline uint64_t index_to_stacku(uint64_t index) { + switch (index) { + case 2: + case 3: + return NPU2_STACK_STCK_1U; + break; + case 4: + case 5: + return NPU2_STACK_STCK_2U; + break; + default: + assert(false); + } +} + +static inline uint64_t index_to_block(uint64_t index) { + switch (index) { + case 2: + case 4: + return NPU2_BLOCK_OTL0; + break; + case 3: + case 5: + return NPU2_BLOCK_OTL1; + break; + default: + assert(false); + } +} + +static uint64_t get_odl_status(uint32_t gcid, uint64_t index) +{ + uint64_t reg, status_xscom; + + status_xscom = OB_ODL_STATUS(index); + xscom_read(gcid, status_xscom, ®); + return reg; +} + +static uint64_t get_odl_training_status(uint32_t gcid, uint64_t index) +{ + uint64_t status_xscom, reg; + + status_xscom = OB_ODL_TRAINING_STATUS(index); + xscom_read(gcid, status_xscom, ®); + return reg; +} + +static uint64_t get_odl_endpoint_info(uint32_t gcid, uint64_t index) +{ + uint64_t status_xscom, reg; + + status_xscom = OB_ODL_ENDPOINT_INFO(index); + xscom_read(gcid, status_xscom, ®); + return reg; +} + +static void disable_nvlink(uint32_t gcid, int index) +{ + uint64_t phy_config_scom, reg; + + switch (index) { + case 2: + case 3: + phy_config_scom = OBUS_LL0_IOOL_PHY_CONFIG; + break; + case 4: + case 5: + phy_config_scom = OBUS_LL3_IOOL_PHY_CONFIG; + break; + default: + assert(false); + } + /* Disable NV-Link link layers */ + xscom_read(gcid, phy_config_scom, ®); + reg &= ~OBUS_IOOL_PHY_CONFIG_NV0_NPU_ENABLED; + reg &= ~OBUS_IOOL_PHY_CONFIG_NV1_NPU_ENABLED; + reg &= ~OBUS_IOOL_PHY_CONFIG_NV2_NPU_ENABLED; + xscom_write(gcid, phy_config_scom, reg); +} + +/* Procedure 13.1.3.1 - select OCAPI vs NVLink for bricks 2-3/4-5 */ + +static void set_transport_mux_controls(uint32_t gcid, uint32_t scom_base, + int index, 
enum npu2_dev_type type) +{ + /* Step 1 - Set Transport MUX controls to select correct OTL or NTL */ + uint64_t reg; + uint64_t field; + + /* TODO: Rework this to select for NVLink too */ + assert(type == NPU2_DEV_TYPE_OPENCAPI); + + prlog(PR_DEBUG, "OCAPI: %s: Setting transport mux controls\n", __func__); + + /* Optical IO Transport Mux Config for Bricks 0-2 and 4-5 */ + reg = npu2_scom_read(gcid, scom_base, NPU2_MISC_OPTICAL_IO_CFG0, + NPU2_MISC_DA_LEN_8B); + switch (index) { + case 0: + case 1: + /* not valid for OpenCAPI */ + assert(false); + break; + case 2: /* OTL1.0 */ + field = GETFIELD(NPU2_MISC_OPTICAL_IO_CFG0_NDLMUX_BRK0TO2, reg); + field &= ~0b100; + reg = SETFIELD(NPU2_MISC_OPTICAL_IO_CFG0_NDLMUX_BRK0TO2, reg, + field); + field = GETFIELD(NPU2_MISC_OPTICAL_IO_CFG0_OCMUX_BRK0TO1, reg); + field |= 0b10; + reg = SETFIELD(NPU2_MISC_OPTICAL_IO_CFG0_OCMUX_BRK0TO1, reg, + field); + break; + case 3: /* OTL1.1 */ + field = GETFIELD(NPU2_MISC_OPTICAL_IO_CFG0_NDLMUX_BRK0TO2, reg); + field &= ~0b010; + reg = SETFIELD(NPU2_MISC_OPTICAL_IO_CFG0_NDLMUX_BRK0TO2, reg, + field); + field = GETFIELD(NPU2_MISC_OPTICAL_IO_CFG0_OCMUX_BRK0TO1, reg); + field |= 0b01; + reg = SETFIELD(NPU2_MISC_OPTICAL_IO_CFG0_OCMUX_BRK0TO1, reg, + field); + break; + case 4: /* OTL2.0 */ + field = GETFIELD(NPU2_MISC_OPTICAL_IO_CFG0_OCMUX_BRK4TO5, reg); + field |= 0b10; + reg = SETFIELD(NPU2_MISC_OPTICAL_IO_CFG0_OCMUX_BRK4TO5, reg, + field); + break; + case 5: /* OTL2.1 */ + field = GETFIELD(NPU2_MISC_OPTICAL_IO_CFG0_OCMUX_BRK4TO5, reg); + field |= 0b01; + reg = SETFIELD(NPU2_MISC_OPTICAL_IO_CFG0_OCMUX_BRK4TO5, reg, + field); + break; + default: + assert(false); + } + npu2_scom_write(gcid, scom_base, NPU2_MISC_OPTICAL_IO_CFG0, + NPU2_MISC_DA_LEN_8B, reg); + + /* + * PowerBus Optical Miscellaneous Config Register - select + * OpenCAPI for b4/5 and A-Link for b3 + */ + xscom_read(gcid, PU_IOE_PB_MISC_CFG, ®); + switch (index) { + case 0: + case 1: + case 2: + case 3: + break; + case 4: + reg = SETFIELD(PU_IOE_PB_MISC_CFG_SEL_04_NPU_NOT_PB, reg, 1); + break; + case 5: + reg = SETFIELD(PU_IOE_PB_MISC_CFG_SEL_05_NPU_NOT_PB, reg, 1); + break; + } + xscom_write(gcid, PU_IOE_PB_MISC_CFG, reg); +} + +static void assert_odl_reset(uint32_t gcid, int index) +{ + uint64_t reg, config_xscom; + + config_xscom = OB_ODL_CONFIG(index); + /* Reset ODL */ + reg = OB_ODL_CONFIG_RESET; + reg = SETFIELD(OB_ODL_CONFIG_VERSION, reg, 0b000001); + reg = SETFIELD(OB_ODL_CONFIG_TRAIN_MODE, reg, 0b0110); + reg = SETFIELD(OB_ODL_CONFIG_SUPPORTED_MODES, reg, 0b0010); + reg |= OB_ODL_CONFIG_X4_BACKOFF_ENABLE; + reg = SETFIELD(OB_ODL_CONFIG_PHY_CNTR_LIMIT, reg, 0b1111); + reg |= OB_ODL_CONFIG_DEBUG_ENABLE; + reg = SETFIELD(OB_ODL_CONFIG_FWD_PROGRESS_TIMER, reg, 0b0110); + xscom_write(gcid, config_xscom, reg); +} + +static void deassert_odl_reset(uint32_t gcid, int index) +{ + uint64_t reg, config_xscom; + + config_xscom = OB_ODL_CONFIG(index); + xscom_read(gcid, config_xscom, ®); + reg &= ~OB_ODL_CONFIG_RESET; + xscom_write(gcid, config_xscom, reg); +} + +static void enable_odl_phy_mux(uint32_t gcid, int index) +{ + uint64_t reg; + uint64_t phy_config_scom; + prlog(PR_DEBUG, "OCAPI: %s: Enabling ODL to PHY MUXes\n", __func__); + /* Step 2 - Enable MUXes for ODL to PHY connection */ + switch (index) { + case 2: + case 3: + phy_config_scom = OBUS_LL0_IOOL_PHY_CONFIG; + break; + case 4: + case 5: + phy_config_scom = OBUS_LL3_IOOL_PHY_CONFIG; + break; + default: + assert(false); + } + + /* + * ODL must be in reset when enabling. 
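 * (assert_odl_reset(), called just below, drives OB_ODL_CONFIG_RESET
 *  and programs the ODL version, training mode and timer fields while
 *  the link is held in reset; deassert_odl_reset() clears only the
 *  reset bit.)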
+ * It stays in reset until the link is trained + */ + assert_odl_reset(gcid, index); + + /* PowerBus OLL PHY Training Config Register */ + xscom_read(gcid, phy_config_scom, ®); + + /* + * Enable ODL to use shared PHYs + * + * On obus3, OTL0 is connected to ODL1 (and OTL1 to ODL0), so + * even if it may look odd at first, we do want to enable ODL0 + * for links 2 and 5 + */ + switch (index) { + case 2: + case 5: + reg |= OBUS_IOOL_PHY_CONFIG_ODL0_ENABLED; + break; + case 3: + case 4: + reg |= OBUS_IOOL_PHY_CONFIG_ODL1_ENABLED; + break; + } + + /* + * Based on the platform, we may have to activate an extra mux + * to connect the ODL to the right set of lanes. + * + * FIXME: to be checked once we have merged with nvlink + * code. Need to verify that it's a platform parameter and not + * slot-dependent + */ + if (platform.ocapi->odl_phy_swap) + reg |= OBUS_IOOL_PHY_CONFIG_ODL_PHY_SWAP; + else + reg &= ~OBUS_IOOL_PHY_CONFIG_ODL_PHY_SWAP; + + /* Disable A-Link link layers */ + reg &= ~OBUS_IOOL_PHY_CONFIG_LINK0_OLL_ENABLED; + reg &= ~OBUS_IOOL_PHY_CONFIG_LINK1_OLL_ENABLED; + + xscom_write(gcid, phy_config_scom, reg); +} + +static void disable_alink_fp(uint32_t gcid) +{ + uint64_t reg = 0; + + prlog(PR_DEBUG, "OCAPI: %s: Disabling A-Link framer/parsers\n", __func__); + /* Step 3 - Disable A-Link framers/parsers */ + /* TODO: Confirm if needed on OPAL system */ + + reg |= PU_IOE_PB_FP_CFG_FP0_FMR_DISABLE; + reg |= PU_IOE_PB_FP_CFG_FP0_PRS_DISABLE; + reg |= PU_IOE_PB_FP_CFG_FP1_FMR_DISABLE; + reg |= PU_IOE_PB_FP_CFG_FP1_PRS_DISABLE; + xscom_write(gcid, PU_IOE_PB_FP01_CFG, reg); + xscom_write(gcid, PU_IOE_PB_FP23_CFG, reg); + xscom_write(gcid, PU_IOE_PB_FP45_CFG, reg); + xscom_write(gcid, PU_IOE_PB_FP67_CFG, reg); +} + +static void enable_xsl_clocks(uint32_t gcid, uint32_t scom_base, int index) +{ + /* Step 5 - Enable Clocks in XSL */ + + prlog(PR_DEBUG, "OCAPI: %s: Enable clocks in XSL\n", __func__); + + npu2_scom_write(gcid, scom_base, NPU2_REG_OFFSET(index_to_stack(index), + NPU2_BLOCK_XSL, + NPU2_XSL_WRAP_CFG), + NPU2_MISC_DA_LEN_8B, NPU2_XSL_WRAP_CFG_XSLO_CLOCK_ENABLE); +} + +#define CQ_CTL_STATUS_TIMEOUT 10 /* milliseconds */ + +static int set_fence_control(uint32_t gcid, uint32_t scom_base, + int index, uint8_t status) +{ + int stack, block; + uint64_t reg, status_field; + uint8_t status_val; + uint64_t fence_control; + uint64_t timeout = mftb() + msecs_to_tb(CQ_CTL_STATUS_TIMEOUT); + + stack = index_to_stack(index); + block = index_to_block(index); + + fence_control = NPU2_REG_OFFSET(stack, NPU2_BLOCK_CTL, + block == NPU2_BLOCK_OTL0 ? + NPU2_CQ_CTL_FENCE_CONTROL_0 : + NPU2_CQ_CTL_FENCE_CONTROL_1); + + reg = SETFIELD(NPU2_CQ_CTL_FENCE_CONTROL_REQUEST_FENCE, 0ull, status); + npu2_scom_write(gcid, scom_base, fence_control, + NPU2_MISC_DA_LEN_8B, reg); + + /* Wait for fence status to update */ + if (index_to_block(index) == NPU2_BLOCK_OTL0) + status_field = NPU2_CQ_CTL_STATUS_BRK0_AM_FENCED; + else + status_field = NPU2_CQ_CTL_STATUS_BRK1_AM_FENCED; + + do { + reg = npu2_scom_read(gcid, scom_base, + NPU2_REG_OFFSET(index_to_stack(index), + NPU2_BLOCK_CTL, + NPU2_CQ_CTL_STATUS), + NPU2_MISC_DA_LEN_8B); + status_val = GETFIELD(status_field, reg); + if (status_val == status) + return OPAL_SUCCESS; + time_wait_ms(1); + } while (tb_compare(mftb(), timeout) == TB_ABEFOREB); + + /** + * @fwts-label OCAPIFenceStatusTimeout + * @fwts-advice The NPU fence status did not update as expected. This + * could be the result of a firmware or hardware bug. OpenCAPI + * functionality could be broken. 
+ */ + prlog(PR_ERR, + "OCAPI: Fence status for brick %d stuck: expected 0x%x, got 0x%x\n", + index, status, status_val); + return OPAL_HARDWARE; +} + +static void set_npcq_config(uint32_t gcid, uint32_t scom_base, int index) +{ + uint64_t reg, stack, block; + + prlog(PR_DEBUG, "OCAPI: %s: Set NPCQ Config\n", __func__); + /* Step 6 - Set NPCQ configuration */ + /* CQ_CTL Misc Config Register #0 */ + stack = index_to_stack(index); + block = index_to_block(index); + + /* Enable OTL */ + npu2_scom_write(gcid, scom_base, NPU2_OTL_CONFIG0(stack, block), + NPU2_MISC_DA_LEN_8B, NPU2_OTL_CONFIG0_EN); + set_fence_control(gcid, scom_base, index, 0b01); + reg = npu2_scom_read(gcid, scom_base, + NPU2_REG_OFFSET(stack, NPU2_BLOCK_CTL, + NPU2_CQ_CTL_MISC_CFG), + NPU2_MISC_DA_LEN_8B); + /* Set OCAPI mode */ + reg |= NPU2_CQ_CTL_MISC_CFG_CONFIG_OCAPI_MODE; + if (block == NPU2_BLOCK_OTL0) + reg |= NPU2_CQ_CTL_MISC_CFG_CONFIG_OTL0_ENABLE; + else + reg |= NPU2_CQ_CTL_MISC_CFG_CONFIG_OTL1_ENABLE; + npu2_scom_write(gcid, scom_base, + NPU2_REG_OFFSET(stack, NPU2_BLOCK_CTL, + NPU2_CQ_CTL_MISC_CFG), + NPU2_MISC_DA_LEN_8B, reg); + + /* NPU Fenced */ + set_fence_control(gcid, scom_base, index, 0b11); + + /* NPU Half Fenced */ + set_fence_control(gcid, scom_base, index, 0b10); + + /* CQ_DAT Misc Config Register #1 */ + reg = npu2_scom_read(gcid, scom_base, + NPU2_REG_OFFSET(stack, NPU2_BLOCK_DAT, + NPU2_CQ_DAT_MISC_CFG), + NPU2_MISC_DA_LEN_8B); + /* Set OCAPI mode for bricks 2-5 */ + reg |= NPU2_CQ_DAT_MISC_CFG_CONFIG_OCAPI_MODE; + npu2_scom_write(gcid, scom_base, + NPU2_REG_OFFSET(stack, NPU2_BLOCK_DAT, + NPU2_CQ_DAT_MISC_CFG), + NPU2_MISC_DA_LEN_8B, reg); + + /* CQ_SM Misc Config Register #0 */ + for (block = NPU2_BLOCK_SM_0; block <= NPU2_BLOCK_SM_3; block++) { + reg = npu2_scom_read(gcid, scom_base, + NPU2_REG_OFFSET(stack, block, + NPU2_CQ_SM_MISC_CFG0), + NPU2_MISC_DA_LEN_8B); + /* Set OCAPI mode for bricks 2-5 */ + reg |= NPU2_CQ_SM_MISC_CFG0_CONFIG_OCAPI_MODE; + npu2_scom_write(gcid, scom_base, + NPU2_REG_OFFSET(stack, block, + NPU2_CQ_SM_MISC_CFG0), + NPU2_MISC_DA_LEN_8B, reg); + } +} + +static void enable_xsl_xts_interfaces(uint32_t gcid, uint32_t scom_base, int index) +{ + uint64_t reg; + + prlog(PR_DEBUG, "OCAPI: %s: Enable XSL-XTS Interfaces\n", __func__); + /* Step 7 - Enable XSL-XTS interfaces */ + /* XTS Config Register - Enable XSL-XTS interface */ + reg = npu2_scom_read(gcid, scom_base, NPU2_XTS_CFG, NPU2_MISC_DA_LEN_8B); + reg |= NPU2_XTS_CFG_OPENCAPI; + npu2_scom_write(gcid, scom_base, NPU2_XTS_CFG, NPU2_MISC_DA_LEN_8B, reg); + + /* XTS Config2 Register - Enable XSL1/2 */ + reg = npu2_scom_read(gcid, scom_base, NPU2_XTS_CFG2, NPU2_MISC_DA_LEN_8B); + switch (index_to_stack(index)) { + case NPU2_STACK_STCK_1: + reg |= NPU2_XTS_CFG2_XSL1_ENA; + break; + case NPU2_STACK_STCK_2: + reg |= NPU2_XTS_CFG2_XSL2_ENA; + break; + } + npu2_scom_write(gcid, scom_base, NPU2_XTS_CFG2, NPU2_MISC_DA_LEN_8B, reg); +} + +static void enable_sm_allocation(uint32_t gcid, uint32_t scom_base, int index) +{ + uint64_t reg, block; + int stack = index_to_stack(index); + + prlog(PR_DEBUG, "OCAPI: %s: Enable State Machine Allocation\n", __func__); + /* Step 8 - Enable state-machine allocation */ + /* Low-Water Marks Registers - Enable state machine allocation */ + for (block = NPU2_BLOCK_SM_0; block <= NPU2_BLOCK_SM_3; block++) { + reg = npu2_scom_read(gcid, scom_base, + NPU2_REG_OFFSET(stack, block, + NPU2_LOW_WATER_MARKS), + NPU2_MISC_DA_LEN_8B); + reg |= NPU2_LOW_WATER_MARKS_ENABLE_MACHINE_ALLOC; + npu2_scom_write(gcid, 
scom_base, + NPU2_REG_OFFSET(stack, block, + NPU2_LOW_WATER_MARKS), + NPU2_MISC_DA_LEN_8B, reg); + } +} + +static void enable_pb_snooping(uint32_t gcid, uint32_t scom_base, int index) +{ + uint64_t reg, block; + int stack = index_to_stack(index); + + prlog(PR_DEBUG, "OCAPI: %s: Enable PowerBus snooping\n", __func__); + /* Step 9 - Enable PowerBus snooping */ + /* CQ_SM Misc Config Register #0 - Enable PowerBus snooping */ + for (block = NPU2_BLOCK_SM_0; block <= NPU2_BLOCK_SM_3; block++) { + reg = npu2_scom_read(gcid, scom_base, + NPU2_REG_OFFSET(stack, block, + NPU2_CQ_SM_MISC_CFG0), + NPU2_MISC_DA_LEN_8B); + reg |= NPU2_CQ_SM_MISC_CFG0_CONFIG_ENABLE_PBUS; + npu2_scom_write(gcid, scom_base, + NPU2_REG_OFFSET(stack, block, + NPU2_CQ_SM_MISC_CFG0), + NPU2_MISC_DA_LEN_8B, reg); + } +} + +static void brick_config(uint32_t gcid, uint32_t scom_base, int index) +{ + /* + * We assume at this point that the PowerBus Hotplug Mode Control + * register is correctly set by Hostboot + */ + disable_nvlink(gcid, index); + set_transport_mux_controls(gcid, scom_base, index, + NPU2_DEV_TYPE_OPENCAPI); + enable_odl_phy_mux(gcid, index); + disable_alink_fp(gcid); + enable_xsl_clocks(gcid, scom_base, index); + set_npcq_config(gcid, scom_base, index); + enable_xsl_xts_interfaces(gcid, scom_base, index); + enable_sm_allocation(gcid, scom_base, index); + enable_pb_snooping(gcid, scom_base, index); +} + +/* Procedure 13.1.3.4 - Brick to PE Mapping */ +static void pe_config(struct npu2_dev *dev) +{ + /* We currently use a fixed PE assignment per brick */ + uint64_t val, reg; + val = NPU2_MISC_BRICK_BDF2PE_MAP_ENABLE; + val = SETFIELD(NPU2_MISC_BRICK_BDF2PE_MAP_PE, val, NPU2_OCAPI_PE(dev)); + val = SETFIELD(NPU2_MISC_BRICK_BDF2PE_MAP_BDF, val, 0); + reg = NPU2_REG_OFFSET(NPU2_STACK_MISC, NPU2_BLOCK_MISC, + NPU2_MISC_BRICK0_BDF2PE_MAP0 + + (dev->brick_index * 0x18)); + npu2_write(dev->npu, reg, val); +} + +/* Procedure 13.1.3.5 - TL Configuration */ +static void tl_config(uint32_t gcid, uint32_t scom_base, uint64_t index) +{ + uint64_t reg; + uint64_t stack = index_to_stack(index); + uint64_t block = index_to_block(index); + + prlog(PR_DEBUG, "OCAPI: %s: TL Configuration\n", __func__); + /* OTL Config 0 Register */ + reg = 0; + /* OTL Enable */ + reg |= NPU2_OTL_CONFIG0_EN; + /* Block PE Handle from ERAT Index */ + reg |= NPU2_OTL_CONFIG0_BLOCK_PE_HANDLE; + /* OTL Brick ID */ + reg = SETFIELD(NPU2_OTL_CONFIG0_BRICKID, reg, index - 2); + /* ERAT Hash 0 */ + reg = SETFIELD(NPU2_OTL_CONFIG0_ERAT_HASH_0, reg, 0b011001); + /* ERAT Hash 1 */ + reg = SETFIELD(NPU2_OTL_CONFIG0_ERAT_HASH_1, reg, 0b000111); + /* ERAT Hash 2 */ + reg = SETFIELD(NPU2_OTL_CONFIG0_ERAT_HASH_2, reg, 0b101100); + /* ERAT Hash 3 */ + reg = SETFIELD(NPU2_OTL_CONFIG0_ERAT_HASH_3, reg, 0b100110); + npu2_scom_write(gcid, scom_base, NPU2_OTL_CONFIG0(stack, block), + NPU2_MISC_DA_LEN_8B, reg); + + /* OTL Config 1 Register */ + reg = 0; + /* + * We leave Template 1-3 bits at 0 to force template 0 as required + * for unknown devices. + * + * Template 0 Transmit Rate is set to most conservative setting which + * will always be supported. Other Template Transmit rates are left + * unset and will be set later by OS. 
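 * (Hence, of the per-template rates, only NPU2_OTL_CONFIG1_TX_TEMP0_RATE
 *  is programmed below, with the most conservative encoding, 0b1111.)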
+ */ + reg = SETFIELD(NPU2_OTL_CONFIG1_TX_TEMP0_RATE, reg, 0b1111); + /* Extra wait cycles TXI-TXO */ + reg = SETFIELD(NPU2_OTL_CONFIG1_TX_DRDY_WAIT, reg, 0b001); + /* Minimum Frequency to Return TLX Credits to AFU */ + reg = SETFIELD(NPU2_OTL_CONFIG1_TX_CRET_FREQ, reg, 0b001); + /* Frequency to add age to Transmit Requests */ + reg = SETFIELD(NPU2_OTL_CONFIG1_TX_AGE_FREQ, reg, 0b11000); + /* Response High Priority Threshold */ + reg = SETFIELD(NPU2_OTL_CONFIG1_TX_RS2_HPWAIT, reg, 0b011011); + /* 4-slot Request High Priority Threshold */ + reg = SETFIELD(NPU2_OTL_CONFIG1_TX_RQ4_HPWAIT, reg, 0b011011); + /* 6-slot Request High Priority */ + reg = SETFIELD(NPU2_OTL_CONFIG1_TX_RQ6_HPWAIT, reg, 0b011011); + /* Stop the OCAPI Link on Uncorrectable Error + * TODO: Confirm final value - disabled for debug */ + + npu2_scom_write(gcid, scom_base, NPU2_OTL_CONFIG1(stack, block), + NPU2_MISC_DA_LEN_8B, reg); + + /* TLX Credit Configuration Register */ + reg = 0; + /* VC0/VC3/DCP0/DCP1 credits to send to AFU */ + reg = SETFIELD(NPU2_OTL_TLX_CREDITS_VC0_CREDITS, reg, 0x40); + reg = SETFIELD(NPU2_OTL_TLX_CREDITS_VC3_CREDITS, reg, 0x40); + reg = SETFIELD(NPU2_OTL_TLX_CREDITS_DCP0_CREDITS, reg, 0x80); + reg = SETFIELD(NPU2_OTL_TLX_CREDITS_DCP1_CREDITS, reg, 0x80); + npu2_scom_write(gcid, scom_base, NPU2_OTL_TLX_CREDITS(stack, block), + NPU2_MISC_DA_LEN_8B, reg); +} + +/* Detect Nimbus DD2.0 and DD2.01 */ +static int get_nimbus_level(void) +{ + struct proc_chip *chip = next_chip(NULL); + + if (chip && chip->type == PROC_CHIP_P9_NIMBUS) + return chip->ec_level & 0xff; + return -1; +} + +/* Procedure 13.1.3.6 - Address Translation Configuration */ +static void address_translation_config(uint32_t gcid, uint32_t scom_base, + uint64_t index) +{ + int chip_level; + uint64_t reg; + uint64_t stack = index_to_stack(index); + + prlog(PR_DEBUG, "OCAPI: %s: Address Translation Configuration\n", __func__); + /* PSL_SCNTL_A0 Register */ + /* + * ERAT shared between multiple AFUs + * + * The workbook has this bit around the wrong way from the hardware. + * + * TODO: handle correctly with link ganging + */ + reg = npu2_scom_read(gcid, scom_base, + NPU2_REG_OFFSET(stack, NPU2_BLOCK_XSL, + NPU2_XSL_PSL_SCNTL_A0), + NPU2_MISC_DA_LEN_8B); + reg |= NPU2_XSL_PSL_SCNTL_A0_MULTI_AFU_DIAL; + npu2_scom_write(gcid, scom_base, + NPU2_REG_OFFSET(stack, NPU2_BLOCK_XSL, + NPU2_XSL_PSL_SCNTL_A0), + NPU2_MISC_DA_LEN_8B, reg); + + chip_level = get_nimbus_level(); + if (chip_level == 0x20) { + /* + * Errata HW408041 (section 15.1.10 of NPU workbook) + * "RA mismatch when both tlbie and checkout response + * are seen in same cycle" + */ + /* XSL_GP Register - Bloom Filter Disable */ + reg = npu2_scom_read(gcid, scom_base, + NPU2_REG_OFFSET(stack, NPU2_BLOCK_XSL, NPU2_XSL_GP), + NPU2_MISC_DA_LEN_8B); + /* To update XSL_GP, we must first write a magic value to it */ + npu2_scom_write(gcid, scom_base, + NPU2_REG_OFFSET(stack, NPU2_BLOCK_XSL, NPU2_XSL_GP), + NPU2_MISC_DA_LEN_8B, 0x0523790323000000UL); + reg &= ~NPU2_XSL_GP_BLOOM_FILTER_ENABLE; + npu2_scom_write(gcid, scom_base, + NPU2_REG_OFFSET(stack, NPU2_BLOCK_XSL, NPU2_XSL_GP), + NPU2_MISC_DA_LEN_8B, reg); + } + + if (chip_level == 0x20 || chip_level == 0x21) { + /* + * DD2.0/2.1 EOA Bug. 
Fixed in DD2.2 + */ + reg = 0x32F8000000000001UL; + npu2_scom_write(gcid, scom_base, + NPU2_REG_OFFSET(stack, NPU2_BLOCK_XSL, + NPU2_XSL_DEF), + NPU2_MISC_DA_LEN_8B, reg); + } +} + +/* TODO: Merge this with NVLink implementation - we don't use the npu2_bar + * wrapper for the PHY BARs yet */ +static void write_bar(uint32_t gcid, uint32_t scom_base, uint64_t reg, + uint64_t addr, uint64_t size) +{ + uint64_t val; + int block; + switch (NPU2_REG(reg)) { + case NPU2_PHY_BAR: + val = SETFIELD(NPU2_PHY_BAR_ADDR, 0ul, addr >> 21); + val = SETFIELD(NPU2_PHY_BAR_ENABLE, val, 1); + break; + case NPU2_NTL0_BAR: + case NPU2_NTL1_BAR: + val = SETFIELD(NPU2_NTL_BAR_ADDR, 0ul, addr >> 16); + val = SETFIELD(NPU2_NTL_BAR_SIZE, val, ilog2(size >> 16)); + val = SETFIELD(NPU2_NTL_BAR_ENABLE, val, 1); + break; + case NPU2_GENID_BAR: + val = SETFIELD(NPU2_GENID_BAR_ADDR, 0ul, addr >> 16); + val = SETFIELD(NPU2_GENID_BAR_ENABLE, val, 1); + break; + default: + val = 0ul; + } + + for (block = NPU2_BLOCK_SM_0; block <= NPU2_BLOCK_SM_3; block++) { + npu2_scom_write(gcid, scom_base, NPU2_REG_OFFSET(0, block, reg), + NPU2_MISC_DA_LEN_8B, val); + prlog(PR_DEBUG, "OCAPI: Setting BAR %llx to %llx\n", + NPU2_REG_OFFSET(0, block, reg), val); + } +} + +static void setup_global_mmio_bar(uint32_t gcid, uint32_t scom_base, + uint64_t reg[]) +{ + uint64_t addr, size; + + prlog(PR_DEBUG, "OCAPI: patching up PHY0 bar, %s\n", __func__); + phys_map_get(gcid, NPU_PHY, 0, &addr, &size); + write_bar(gcid, scom_base, + NPU2_REG_OFFSET(NPU2_STACK_STCK_2, 0, NPU2_PHY_BAR), + addr, size); + prlog(PR_DEBUG, "OCAPI: patching up PHY1 bar, %s\n", __func__); + phys_map_get(gcid, NPU_PHY, 1, &addr, &size); + write_bar(gcid, scom_base, + NPU2_REG_OFFSET(NPU2_STACK_STCK_1, 0, NPU2_PHY_BAR), + addr, size); + + prlog(PR_DEBUG, "OCAPI: setup global mmio, %s\n", __func__); + phys_map_get(gcid, NPU_REGS, 0, &addr, &size); + write_bar(gcid, scom_base, + NPU2_REG_OFFSET(NPU2_STACK_STCK_0, 0, NPU2_PHY_BAR), + addr, size); + reg[0] = addr; + reg[1] = size; +} + +/* Procedure 13.1.3.8 - AFU MMIO Range BARs */ +static void setup_afu_mmio_bars(uint32_t gcid, uint32_t scom_base, + struct npu2_dev *dev) +{ + uint64_t stack = index_to_stack(dev->brick_index); + uint64_t offset = index_to_block(dev->brick_index) == NPU2_BLOCK_OTL0 ? + NPU2_NTL0_BAR : NPU2_NTL1_BAR; + uint64_t pa_offset = index_to_block(dev->brick_index) == NPU2_BLOCK_OTL0 ? 
+ NPU2_CQ_CTL_MISC_MMIOPA0_CONFIG : + NPU2_CQ_CTL_MISC_MMIOPA1_CONFIG; + uint64_t addr, size, reg; + + prlog(PR_DEBUG, "OCAPI: %s: Setup AFU MMIO BARs\n", __func__); + phys_map_get(gcid, NPU_OCAPI_MMIO, dev->brick_index, &addr, &size); + + prlog(PR_DEBUG, "OCAPI: AFU MMIO set to %llx, size %llx\n", addr, size); + write_bar(gcid, scom_base, NPU2_REG_OFFSET(stack, 0, offset), addr, + size); + dev->bars[0].npu2_bar.base = addr; + dev->bars[0].npu2_bar.size = size; + + reg = SETFIELD(NPU2_CQ_CTL_MISC_MMIOPA_ADDR, 0ull, addr >> 16); + reg = SETFIELD(NPU2_CQ_CTL_MISC_MMIOPA_SIZE, reg, ilog2(size >> 16)); + prlog(PR_DEBUG, "OCAPI: PA translation %llx\n", reg); + npu2_scom_write(gcid, scom_base, + NPU2_REG_OFFSET(stack, NPU2_BLOCK_CTL, + pa_offset), + NPU2_MISC_DA_LEN_8B, reg); +} + +/* Procedure 13.1.3.9 - AFU Config BARs */ +static void setup_afu_config_bars(uint32_t gcid, uint32_t scom_base, + struct npu2_dev *dev) +{ + uint64_t stack = index_to_stack(dev->brick_index); + int stack_num = stack - NPU2_STACK_STCK_0; + uint64_t addr, size; + + prlog(PR_DEBUG, "OCAPI: %s: Setup AFU Config BARs\n", __func__); + phys_map_get(gcid, NPU_GENID, stack_num, &addr, &size); + prlog(PR_DEBUG, "OCAPI: Assigning GENID BAR: %016llx\n", addr); + write_bar(gcid, scom_base, NPU2_REG_OFFSET(stack, 0, NPU2_GENID_BAR), + addr, size); + dev->bars[1].npu2_bar.base = addr; + dev->bars[1].npu2_bar.size = size; +} + +static void otl_enabletx(uint32_t gcid, uint32_t scom_base, + struct npu2_dev *dev) +{ + uint64_t stack = index_to_stack(dev->brick_index); + uint64_t block = index_to_block(dev->brick_index); + uint64_t reg; + + /* OTL Config 2 Register */ + /* Transmit Enable */ + OCAPIDBG(dev, "Enabling TX\n"); + reg = 0; + reg |= NPU2_OTL_CONFIG2_TX_SEND_EN; + npu2_scom_write(gcid, scom_base, NPU2_OTL_CONFIG2(stack, block), + NPU2_MISC_DA_LEN_8B, reg); + + reg = npu2_scom_read(gcid, scom_base, NPU2_OTL_VC_CREDITS(stack, block), + NPU2_MISC_DA_LEN_8B); + OCAPIDBG(dev, "credit counter: %llx\n", reg); + /* TODO: Abort if credits are zero */ +} + +static uint8_t get_reset_pin(struct npu2_dev *dev) +{ + uint8_t pin; + + switch (dev->brick_index) { + case 2: + pin = platform.ocapi->i2c_reset_brick2; + break; + case 3: + pin = platform.ocapi->i2c_reset_brick3; + break; + case 4: + pin = platform.ocapi->i2c_reset_brick4; + break; + case 5: + pin = platform.ocapi->i2c_reset_brick5; + break; + default: + assert(false); + } + return pin; +} + +static void assert_adapter_reset(struct npu2_dev *dev) +{ + uint8_t pin, data; + int rc; + + pin = get_reset_pin(dev); + /* + * set the i2c reset pin in output mode + * + * On the 9554 device, register 3 is the configuration + * register and a pin is in output mode if its value is 0 + */ + lock(&dev->npu->i2c_lock); + dev->npu->i2c_pin_mode &= ~pin; + data = dev->npu->i2c_pin_mode; + + rc = i2c_request_send(dev->npu->i2c_port_id_ocapi, + platform.ocapi->i2c_reset_addr, SMBUS_WRITE, + 0x3, 1, + &data, sizeof(data), 120); + if (rc) + goto err; + + /* register 1 controls the signal, reset is active low */ + dev->npu->i2c_pin_wr_state &= ~pin; + data = dev->npu->i2c_pin_wr_state; + + rc = i2c_request_send(dev->npu->i2c_port_id_ocapi, + platform.ocapi->i2c_reset_addr, SMBUS_WRITE, + 0x1, 1, + &data, sizeof(data), 120); + if (rc) + goto err; + unlock(&dev->npu->i2c_lock); + return; + +err: + unlock(&dev->npu->i2c_lock); + /** + * @fwts-label OCAPIDeviceResetFailed + * @fwts-advice There was an error attempting to send + * a reset signal over I2C to the OpenCAPI device. 
+ */ + OCAPIERR(dev, "Error writing I2C reset signal: %d\n", rc); +} + +static void deassert_adapter_reset(struct npu2_dev *dev) +{ + uint8_t pin, data; + int rc, rc2; + + pin = get_reset_pin(dev); + + /* + * All we need to do here is deassert the reset signal by + * setting the reset pin to high. However, we cannot leave the + * pin in output mode, as it can cause troubles with the + * opencapi adapter: when the slot is powered off (on a reboot + * for example), if the i2c controller is actively setting the + * reset signal to high, it maintains voltage on part of the + * fpga and can leak current. It can lead the fpga to be in an + * unspecified state and potentially cause damage. + * + * The circumvention is to set the pin back to input + * mode. There are pullup resistors on the planar on all + * platforms to make sure the signal will "naturally" be high, + * without the i2c controller actively setting it, so we won't + * have problems when the slot is powered off. And it takes + * the adapter out of reset. + * + * To summarize: + * 1. set the pin to input mode. That is enough to raise the + * signal + * 2. set the value of the pin to high. The pin is input mode, + * so it won't really do anything. But it's more coherent + * and avoids bad surprises on the next call to + * assert_adapter_reset() + */ + lock(&dev->npu->i2c_lock); + dev->npu->i2c_pin_mode |= pin; + data = dev->npu->i2c_pin_mode; + + rc = i2c_request_send(dev->npu->i2c_port_id_ocapi, + platform.ocapi->i2c_reset_addr, SMBUS_WRITE, + 0x3, 1, + &data, sizeof(data), 120); + + dev->npu->i2c_pin_wr_state |= pin; + data = dev->npu->i2c_pin_wr_state; + rc2 = i2c_request_send(dev->npu->i2c_port_id_ocapi, + platform.ocapi->i2c_reset_addr, SMBUS_WRITE, + 0x1, 1, + &data, sizeof(data), 120); + unlock(&dev->npu->i2c_lock); + if (!rc) + rc = rc2; + if (rc) { + /** + * @fwts-label OCAPIDeviceResetFailed + * @fwts-advice There was an error attempting to send + * a reset signal over I2C to the OpenCAPI device. + */ + OCAPIERR(dev, "Error writing I2C reset signal: %d\n", rc); + } +} + +static void setup_perf_counters(struct npu2_dev *dev) +{ + uint64_t addr, reg, link; + + /* + * setup the DLL perf counters to check CRC errors detected by + * the NPU or the adapter. 
+ * + * Counter 0: link 0/ODL0, CRC error detected by ODL + * Counter 1: link 0/ODL0, CRC error detected by DLx + * Counter 2: link 1/ODL1, CRC error detected by ODL + * Counter 3: link 1/ODL1, CRC error detected by DLx + */ + if ((dev->brick_index == 2) || (dev->brick_index == 5)) + link = 0; + else + link = 1; + + addr = OB_DLL_PERF_MONITOR_CONFIG(dev->brick_index); + xscom_read(dev->npu->chip_id, addr, ®); + if (link == 0) { + reg = SETFIELD(OB_DLL_PERF_MONITOR_CONFIG_ENABLE, reg, + OB_DLL_PERF_MONITOR_CONFIG_LINK0); + reg = SETFIELD(OB_DLL_PERF_MONITOR_CONFIG_ENABLE >> 2, reg, + OB_DLL_PERF_MONITOR_CONFIG_LINK0); + } else { + reg = SETFIELD(OB_DLL_PERF_MONITOR_CONFIG_ENABLE >> 4, reg, + OB_DLL_PERF_MONITOR_CONFIG_LINK1); + reg = SETFIELD(OB_DLL_PERF_MONITOR_CONFIG_ENABLE >> 6, reg, + OB_DLL_PERF_MONITOR_CONFIG_LINK1); + } + reg = SETFIELD(OB_DLL_PERF_MONITOR_CONFIG_SIZE, reg, + OB_DLL_PERF_MONITOR_CONFIG_SIZE16); + xscom_write(dev->npu->chip_id, + OB_DLL_PERF_MONITOR_CONFIG(dev->brick_index), reg); + OCAPIDBG(dev, "perf counter config %llx = %llx\n", addr, reg); + + addr = OB_DLL_PERF_MONITOR_SELECT(dev->brick_index); + xscom_read(dev->npu->chip_id, addr, ®); + reg = SETFIELD(OB_DLL_PERF_MONITOR_SELECT_COUNTER >> (link * 16), + reg, OB_DLL_PERF_MONITOR_SELECT_CRC_ODL); + reg = SETFIELD(OB_DLL_PERF_MONITOR_SELECT_COUNTER >> ((link * 16) + 8), + reg, OB_DLL_PERF_MONITOR_SELECT_CRC_DLX); + xscom_write(dev->npu->chip_id, addr, reg); + OCAPIDBG(dev, "perf counter select %llx = %llx\n", addr, reg); +} + +static void check_perf_counters(struct npu2_dev *dev) +{ + uint64_t addr, reg, link0, link1; + + addr = OB_DLL_PERF_COUNTER0(dev->brick_index); + xscom_read(dev->npu->chip_id, addr, ®); + link0 = GETFIELD(PPC_BITMASK(0, 31), reg); + link1 = GETFIELD(PPC_BITMASK(32, 63), reg); + if (link0 || link1) + OCAPIERR(dev, "CRC error count link0=%08llx link1=%08llx\n", + link0, link1); +} + +static void set_init_pattern(uint32_t gcid, struct npu2_dev *dev) +{ + uint64_t reg, config_xscom; + + config_xscom = OB_ODL_CONFIG(dev->brick_index); + /* Transmit Pattern A */ + xscom_read(gcid, config_xscom, ®); + reg = SETFIELD(OB_ODL_CONFIG_TRAIN_MODE, reg, 0b0001); + xscom_write(gcid, config_xscom, reg); +} + +static void start_training(uint32_t gcid, struct npu2_dev *dev) +{ + uint64_t reg, config_xscom; + + config_xscom = OB_ODL_CONFIG(dev->brick_index); + /* Start training */ + xscom_read(gcid, config_xscom, ®); + reg = SETFIELD(OB_ODL_CONFIG_TRAIN_MODE, reg, 0b1000); + xscom_write(gcid, config_xscom, reg); +} + +static int64_t npu2_opencapi_get_presence_state(struct pci_slot __unused *slot, + uint8_t *val) +{ + /* + * Presence detection for OpenCAPI is currently done at the start of + * NPU initialisation, and we only create slots if a device is present. + * As such we will never be asked to get the presence of a slot that's + * empty. + * + * This may change if we ever support surprise hotplug down + * the track. 
+ */ + *val = OPAL_PCI_SLOT_PRESENT; + return OPAL_SUCCESS; +} + +static void fence_brick(struct npu2_dev *dev) +{ + OCAPIDBG(dev, "Fencing brick\n"); + set_fence_control(dev->npu->chip_id, dev->npu->xscom_base, + dev->brick_index, 0b11); + /* from 13.2.1, Quiesce Fence State */ + npu2_write(dev->npu, NPU2_MISC_FENCE_STATE, + PPC_BIT(dev->brick_index + 6)); +} + +static void unfence_brick(struct npu2_dev *dev) +{ + OCAPIDBG(dev, "Unfencing brick\n"); + npu2_write(dev->npu, NPU2_MISC_FENCE_STATE, + PPC_BIT(dev->brick_index)); + + set_fence_control(dev->npu->chip_id, dev->npu->xscom_base, + dev->brick_index, 0b10); + set_fence_control(dev->npu->chip_id, dev->npu->xscom_base, + dev->brick_index, 0b00); +} + +static enum OpalShpcLinkState get_link_width(uint64_t odl_status) +{ + uint64_t tx_lanes, rx_lanes, state; + + /* + * On P9, the 'trained mode' field of the ODL status is + * hard-coded to x8 and is useless for us. We need to look at + * the status of the individual lanes. + * The link trains at x8, x4 or not at all. + */ + state = GETFIELD(OB_ODL_STATUS_TRAINING_STATE_MACHINE, odl_status); + if (state != OCAPI_LINK_STATE_TRAINED) + return OPAL_SHPC_LINK_DOWN; + + rx_lanes = GETFIELD(OB_ODL_STATUS_RX_TRAINED_LANES, odl_status); + tx_lanes = GETFIELD(OB_ODL_STATUS_TX_TRAINED_LANES, odl_status); + if ((rx_lanes != 0xFF) || (tx_lanes != 0xFF)) + return OPAL_SHPC_LINK_UP_x4; + else + return OPAL_SHPC_LINK_UP_x8; +} + +static int64_t npu2_opencapi_get_link_state(struct pci_slot *slot, uint8_t *val) +{ + struct npu2_dev *dev = phb_to_npu2_dev_ocapi(slot->phb); + uint64_t reg; + + reg = get_odl_status(dev->npu->chip_id, dev->brick_index); + *val = get_link_width(reg); + return OPAL_SUCCESS; +} + +static int64_t npu2_opencapi_get_power_state(struct pci_slot *slot, + uint8_t *val) +{ + *val = slot->power_state; + return OPAL_SUCCESS; +} + +static int64_t npu2_opencapi_set_power_state(struct pci_slot *slot, uint8_t val) +{ + struct npu2_dev *dev = phb_to_npu2_dev_ocapi(slot->phb); + + switch (val) { + case PCI_SLOT_POWER_OFF: + OCAPIDBG(dev, "Fake power off\n"); + fence_brick(dev); + assert_adapter_reset(dev); + slot->power_state = PCI_SLOT_POWER_OFF; + return OPAL_SUCCESS; + + case PCI_SLOT_POWER_ON: + if (slot->power_state != PCI_SLOT_POWER_OFF) + return OPAL_SUCCESS; + OCAPIDBG(dev, "Fake power on\n"); + slot->power_state = PCI_SLOT_POWER_ON; + slot->state = OCAPI_SLOT_NORMAL; + return OPAL_SUCCESS; + + default: + return OPAL_UNSUPPORTED; + } +} + +static void check_trained_link(struct npu2_dev *dev, uint64_t odl_status) +{ + if (get_link_width(odl_status) != OPAL_SHPC_LINK_UP_x8) { + OCAPIERR(dev, "Link trained in degraded mode (%016llx)\n", + odl_status); + OCAPIDBG(dev, "Link endpoint info: %016llx\n", + get_odl_endpoint_info(dev->npu->chip_id, dev->brick_index)); + } +} + +static int64_t npu2_opencapi_retry_state(struct pci_slot *slot, + uint64_t odl_status) +{ + struct npu2_dev *dev = phb_to_npu2_dev_ocapi(slot->phb); + uint32_t chip_id = dev->npu->chip_id; + + if (!slot->link_retries--) { + /** + * @fwts-label OCAPILinkTrainingFailed + * @fwts-advice The OpenCAPI link training procedure failed. + * This indicates a hardware or firmware bug. OpenCAPI + * functionality will not be available on this link. 
+ */ + OCAPIERR(dev, + "Link failed to train, final link status: %016llx\n", + odl_status); + OCAPIDBG(dev, "Final link training status: %016llx\n", + get_odl_training_status(chip_id, dev->brick_index)); + return OPAL_HARDWARE; + } + + OCAPIERR(dev, "Link failed to train, retrying\n"); + OCAPIDBG(dev, "Link status: %016llx, training status: %016llx\n", + odl_status, + get_odl_training_status(chip_id, dev->brick_index)); + + pci_slot_set_state(slot, OCAPI_SLOT_FRESET_INIT); + return pci_slot_set_sm_timeout(slot, msecs_to_tb(1)); +} + +static void npu2_opencapi_prepare_link_change(struct pci_slot *slot __unused, + bool up __unused) +{ + /* + * PCI hotplug wants it defined, but we don't need to do anything + */ +} + +static int64_t npu2_opencapi_poll_link(struct pci_slot *slot) +{ + struct npu2_dev *dev = phb_to_npu2_dev_ocapi(slot->phb); + uint32_t chip_id = dev->npu->chip_id; + uint64_t reg; + + switch (slot->state) { + case OCAPI_SLOT_NORMAL: + case OCAPI_SLOT_LINK_START: + OCAPIDBG(dev, "Start polling\n"); + pci_slot_set_state(slot, OCAPI_SLOT_LINK_WAIT); + /* fall-through */ + case OCAPI_SLOT_LINK_WAIT: + reg = get_odl_status(chip_id, dev->brick_index); + if (GETFIELD(OB_ODL_STATUS_TRAINING_STATE_MACHINE, reg) == + OCAPI_LINK_STATE_TRAINED) { + OCAPIINF(dev, "link trained in %ld ms\n", + tb_to_msecs(mftb() - dev->train_start)); + check_trained_link(dev, reg); + pci_slot_set_state(slot, OCAPI_SLOT_LINK_TRAINED); + return pci_slot_set_sm_timeout(slot, msecs_to_tb(1)); + } + if (tb_compare(mftb(), dev->train_timeout) == TB_AAFTERB) + return npu2_opencapi_retry_state(slot, reg); + + return pci_slot_set_sm_timeout(slot, msecs_to_tb(1)); + + case OCAPI_SLOT_LINK_TRAINED: + otl_enabletx(chip_id, dev->npu->xscom_base, dev); + pci_slot_set_state(slot, OCAPI_SLOT_NORMAL); + if (dev->flags & NPU2_DEV_BROKEN) { + OCAPIERR(dev, "Resetting a device which hit a previous error. 
Device recovery is not supported, so future behavior is undefined\n"); + dev->flags &= ~NPU2_DEV_BROKEN; + } + check_perf_counters(dev); + dev->phb_ocapi.scan_map = 1; + return OPAL_SUCCESS; + + default: + OCAPIERR(dev, "unexpected slot state %08x\n", slot->state); + + } + pci_slot_set_state(slot, OCAPI_SLOT_NORMAL); + return OPAL_HARDWARE; +} + +static int64_t npu2_opencapi_creset(struct pci_slot *slot) +{ + struct npu2_dev *dev = phb_to_npu2_dev_ocapi(slot->phb); + + OCAPIERR(dev, "creset not supported\n"); + return OPAL_UNSUPPORTED; +} + +static int64_t npu2_opencapi_freset(struct pci_slot *slot) +{ + struct npu2_dev *dev = phb_to_npu2_dev_ocapi(slot->phb); + uint32_t chip_id = dev->npu->chip_id; + uint8_t presence = 1; + int rc; + + switch (slot->state) { + case OCAPI_SLOT_NORMAL: + case OCAPI_SLOT_FRESET_START: + OCAPIDBG(dev, "FRESET starts\n"); + + if (slot->ops.get_presence_state) + slot->ops.get_presence_state(slot, &presence); + if (!presence) { + /* + * FIXME: if there's no card on the link, we + * should consider powering off the unused + * lanes to save energy + */ + OCAPIINF(dev, "no card detected\n"); + return OPAL_SUCCESS; + } + slot->link_retries = OCAPI_LINK_TRAINING_RETRIES; + /* fall-through */ + case OCAPI_SLOT_FRESET_INIT: + fence_brick(dev); + assert_odl_reset(chip_id, dev->brick_index); + assert_adapter_reset(dev); + pci_slot_set_state(slot, + OCAPI_SLOT_FRESET_ASSERT_DELAY); + /* assert for 5ms */ + return pci_slot_set_sm_timeout(slot, msecs_to_tb(5)); + + case OCAPI_SLOT_FRESET_ASSERT_DELAY: + rc = npu2_opencapi_phy_reset(dev); + if (rc) { + OCAPIERR(dev, "FRESET: couldn't reset PHY state\n"); + return OPAL_HARDWARE; + } + deassert_odl_reset(chip_id, dev->brick_index); + deassert_adapter_reset(dev); + pci_slot_set_state(slot, + OCAPI_SLOT_FRESET_DEASSERT_DELAY); + /* give 250ms to device to be ready */ + return pci_slot_set_sm_timeout(slot, msecs_to_tb(250)); + + case OCAPI_SLOT_FRESET_DEASSERT_DELAY: + unfence_brick(dev); + set_init_pattern(chip_id, dev); + pci_slot_set_state(slot, + OCAPI_SLOT_FRESET_INIT_DELAY); + return pci_slot_set_sm_timeout(slot, msecs_to_tb(5)); + + case OCAPI_SLOT_FRESET_INIT_DELAY: + /* Bump lanes - this improves training reliability */ + npu2_opencapi_bump_ui_lane(dev); + start_training(chip_id, dev); + dev->train_start = mftb(); + dev->train_timeout = dev->train_start + msecs_to_tb(OCAPI_LINK_TRAINING_TIMEOUT); + pci_slot_set_state(slot, OCAPI_SLOT_LINK_START); + return slot->ops.poll_link(slot); + + default: + OCAPIERR(dev, "FRESET: unexpected slot state %08x\n", + slot->state); + } + pci_slot_set_state(slot, OCAPI_SLOT_NORMAL); + return OPAL_HARDWARE; +} + +static int64_t npu2_opencapi_hreset(struct pci_slot *slot __unused) +{ + struct npu2_dev *dev = phb_to_npu2_dev_ocapi(slot->phb); + + OCAPIERR(dev, "hreset not supported\n"); + return OPAL_UNSUPPORTED; +} + +static void make_slot_hotpluggable(struct pci_slot *slot, struct phb *phb) +{ + struct npu2_dev *dev = phb_to_npu2_dev_ocapi(phb); + char name[40]; + const char *label = NULL; + + /* + * Add a few definitions to the DT so that the linux PCI + * hotplug framework can find the slot and identify it as + * hot-pluggable. 
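+	 * The label comes from the platform when available, otherwise a
+	 * default of the form "OPENCAPI-<phb index>" is generated below.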
+ * + * The "ibm,slot-label" property is used by linux as the slot name + */ + slot->pluggable = 1; + pci_slot_add_dt_properties(slot, phb->dt_node); + + if (platform.ocapi->ocapi_slot_label) + label = platform.ocapi->ocapi_slot_label(dev->npu->chip_id, + dev->brick_index); + + if (!label) { + snprintf(name, sizeof(name), "OPENCAPI-%04x", + (int)PCI_SLOT_PHB_INDEX(slot->id)); + label = name; + } + dt_add_property_string(phb->dt_node, "ibm,slot-label", label); +} + +static struct pci_slot *npu2_opencapi_slot_create(struct phb *phb) +{ + struct pci_slot *slot; + + slot = pci_slot_alloc(phb, NULL); + if (!slot) + return slot; + + /* TODO: Figure out other slot functions */ + slot->ops.get_presence_state = npu2_opencapi_get_presence_state; + slot->ops.get_link_state = npu2_opencapi_get_link_state; + slot->ops.get_power_state = npu2_opencapi_get_power_state; + slot->ops.get_attention_state = NULL; + slot->ops.get_latch_state = NULL; + slot->ops.set_power_state = npu2_opencapi_set_power_state; + slot->ops.set_attention_state = NULL; + + slot->ops.prepare_link_change = npu2_opencapi_prepare_link_change; + slot->ops.poll_link = npu2_opencapi_poll_link; + slot->ops.creset = npu2_opencapi_creset; + slot->ops.freset = npu2_opencapi_freset; + slot->ops.hreset = npu2_opencapi_hreset; + + return slot; +} + +static int64_t npu2_opencapi_pcicfg_check(struct npu2_dev *dev, uint32_t offset, + uint32_t size) +{ + if (!dev || offset > 0xfff || (offset & (size - 1))) + return OPAL_PARAMETER; + + return OPAL_SUCCESS; +} + +static int64_t npu2_opencapi_pcicfg_read(struct phb *phb, uint32_t bdfn, + uint32_t offset, uint32_t size, + void *data) +{ + uint64_t cfg_addr; + struct npu2_dev *dev = phb_to_npu2_dev_ocapi(phb); + uint64_t genid_base; + int64_t rc; + + rc = npu2_opencapi_pcicfg_check(dev, offset, size); + if (rc) + return rc; + + genid_base = dev->bars[1].npu2_bar.base + + (index_to_block(dev->brick_index) == NPU2_BLOCK_OTL1 ? 256 : 0); + + cfg_addr = NPU2_CQ_CTL_CONFIG_ADDR_ENABLE; + cfg_addr = SETFIELD(NPU2_CQ_CTL_CONFIG_ADDR_BUS_NUMBER | + NPU2_CQ_CTL_CONFIG_ADDR_DEVICE_NUMBER | + NPU2_CQ_CTL_CONFIG_ADDR_FUNCTION_NUMBER, + cfg_addr, bdfn); + cfg_addr = SETFIELD(NPU2_CQ_CTL_CONFIG_ADDR_REGISTER_NUMBER, + cfg_addr, offset & ~3u); + + out_be64((beint64_t *)genid_base, cfg_addr); + sync(); + + switch (size) { + case 1: + *((uint8_t *)data) = + in_8((volatile uint8_t *)(genid_base + 128 + (offset & 3))); + break; + case 2: + *((uint16_t *)data) = + in_le16((volatile leint16_t *)(genid_base + 128 + (offset & 2))); + break; + case 4: + *((uint32_t *)data) = in_le32((volatile leint32_t *)(genid_base + 128)); + break; + default: + return OPAL_PARAMETER; + } + + return OPAL_SUCCESS; +} + +#define NPU2_OPENCAPI_PCI_CFG_READ(size, type) \ +static int64_t npu2_opencapi_pcicfg_read##size(struct phb *phb, \ + uint32_t bdfn, \ + uint32_t offset, \ + type *data) \ +{ \ + /* Initialize data in case of error */ \ + *data = (type)0xffffffff; \ + return npu2_opencapi_pcicfg_read(phb, bdfn, offset, \ + sizeof(type), data); \ +} + +static int64_t npu2_opencapi_pcicfg_write(struct phb *phb, uint32_t bdfn, + uint32_t offset, uint32_t size, + uint32_t data) +{ + uint64_t cfg_addr; + struct npu2_dev *dev = phb_to_npu2_dev_ocapi(phb); + uint64_t genid_base; + int64_t rc; + + rc = npu2_opencapi_pcicfg_check(dev, offset, size); + if (rc) + return rc; + + genid_base = dev->bars[1].npu2_bar.base + + (index_to_block(dev->brick_index) == NPU2_BLOCK_OTL1 ? 
256 : 0); + + cfg_addr = NPU2_CQ_CTL_CONFIG_ADDR_ENABLE; + cfg_addr = SETFIELD(NPU2_CQ_CTL_CONFIG_ADDR_BUS_NUMBER | + NPU2_CQ_CTL_CONFIG_ADDR_DEVICE_NUMBER | + NPU2_CQ_CTL_CONFIG_ADDR_FUNCTION_NUMBER, + cfg_addr, bdfn); + cfg_addr = SETFIELD(NPU2_CQ_CTL_CONFIG_ADDR_REGISTER_NUMBER, + cfg_addr, offset & ~3u); + + out_be64((beint64_t *)genid_base, cfg_addr); + sync(); + + switch (size) { + case 1: + out_8((volatile uint8_t *)(genid_base + 128 + (offset & 3)), + data); + break; + case 2: + out_le16((volatile leint16_t *)(genid_base + 128 + (offset & 2)), + data); + break; + case 4: + out_le32((volatile leint32_t *)(genid_base + 128), data); + break; + default: + return OPAL_PARAMETER; + } + + return OPAL_SUCCESS; +} + +#define NPU2_OPENCAPI_PCI_CFG_WRITE(size, type) \ +static int64_t npu2_opencapi_pcicfg_write##size(struct phb *phb, \ + uint32_t bdfn, \ + uint32_t offset, \ + type data) \ +{ \ + return npu2_opencapi_pcicfg_write(phb, bdfn, offset, \ + sizeof(type), data); \ +} + +NPU2_OPENCAPI_PCI_CFG_READ(8, u8) +NPU2_OPENCAPI_PCI_CFG_READ(16, u16) +NPU2_OPENCAPI_PCI_CFG_READ(32, u32) +NPU2_OPENCAPI_PCI_CFG_WRITE(8, u8) +NPU2_OPENCAPI_PCI_CFG_WRITE(16, u16) +NPU2_OPENCAPI_PCI_CFG_WRITE(32, u32) + +static int64_t npu2_opencapi_ioda_reset(struct phb __unused *phb, + bool __unused purge) +{ + /* Not relevant to OpenCAPI - we do this just to silence the error */ + return OPAL_SUCCESS; +} + +static int64_t npu2_opencapi_set_pe(struct phb *phb, + uint64_t pe_num, + uint64_t __unused bdfn, + uint8_t __unused bcompare, + uint8_t __unused dcompare, + uint8_t __unused fcompare, + uint8_t action) +{ + struct npu2_dev *dev = phb_to_npu2_dev_ocapi(phb); + /* + * Ignored on OpenCAPI - we use fixed PE assignments. May need + * addressing when we support dual-link devices. + * + * We nonetheless store the PE reported by the OS so that we + * can send it back in case of error. If there are several PCI + * functions on the device, the OS can define many PEs, we + * only keep one, the OS will handle it. 
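+	 * The stored value is reported back through
+	 * npu2_opencapi_eeh_next_error() if the device is later flagged
+	 * NPU2_DEV_BROKEN.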
+ */ + if (action != OPAL_MAP_PE && action != OPAL_UNMAP_PE) + return OPAL_PARAMETER; + + if (action == OPAL_UNMAP_PE) + pe_num = -1; + dev->linux_pe = pe_num; + return OPAL_SUCCESS; +} + +static int64_t npu2_opencapi_freeze_status(struct phb *phb __unused, + uint64_t pe_number __unused, + uint8_t *freeze_state, + uint16_t *pci_error_type, + uint16_t *severity) +{ + *freeze_state = OPAL_EEH_STOPPED_NOT_FROZEN; + *pci_error_type = OPAL_EEH_NO_ERROR; + if (severity) + *severity = OPAL_EEH_SEV_NO_ERROR; + + return OPAL_SUCCESS; +} + +static int64_t npu2_opencapi_eeh_next_error(struct phb *phb, + uint64_t *first_frozen_pe, + uint16_t *pci_error_type, + uint16_t *severity) +{ + struct npu2_dev *dev = phb_to_npu2_dev_ocapi(phb); + + if (!first_frozen_pe || !pci_error_type || !severity) + return OPAL_PARAMETER; + + if (dev->flags & NPU2_DEV_BROKEN) { + OCAPIDBG(dev, "Reporting device as broken\n"); + *first_frozen_pe = dev->linux_pe; + *pci_error_type = OPAL_EEH_PHB_ERROR; + *severity = OPAL_EEH_SEV_PHB_DEAD; + } else { + *first_frozen_pe = -1; + *pci_error_type = OPAL_EEH_NO_ERROR; + *severity = OPAL_EEH_SEV_NO_ERROR; + } + return OPAL_SUCCESS; +} + +static int npu2_add_mmio_regs(struct phb *phb, struct pci_device *pd, + void *data __unused) +{ + uint32_t irq; + struct npu2_dev *dev = phb_to_npu2_dev_ocapi(phb); + uint64_t block = index_to_block(dev->brick_index); + uint64_t stacku = index_to_stacku(dev->brick_index); + uint64_t dsisr, dar, tfc, handle; + + /* + * Pass the hw irq number for the translation fault irq + * irq levels 23 -> 26 are for translation faults, 1 per brick + */ + irq = dev->npu->base_lsi + NPU_IRQ_LEVELS_XSL; + if (stacku == NPU2_STACK_STCK_2U) + irq += 2; + if (block == NPU2_BLOCK_OTL1) + irq++; + + /* + * Add the addresses of the registers needed by the OS to handle + * faults. The OS accesses them by mmio. + */ + dsisr = (uint64_t) dev->npu->regs + NPU2_OTL_OSL_DSISR(stacku, block); + dar = (uint64_t) dev->npu->regs + NPU2_OTL_OSL_DAR(stacku, block); + tfc = (uint64_t) dev->npu->regs + NPU2_OTL_OSL_TFC(stacku, block); + handle = (uint64_t) dev->npu->regs + NPU2_OTL_OSL_PEHANDLE(stacku, + block); + dt_add_property_cells(pd->dn, "ibm,opal-xsl-irq", irq); + dt_add_property_cells(pd->dn, "ibm,opal-xsl-mmio", + hi32(dsisr), lo32(dsisr), + hi32(dar), lo32(dar), + hi32(tfc), lo32(tfc), + hi32(handle), lo32(handle)); + return 0; +} + +static void npu2_opencapi_final_fixup(struct phb *phb) +{ + pci_walk_dev(phb, NULL, npu2_add_mmio_regs, NULL); +} + +static void mask_nvlink_fir(struct npu2 *p) +{ + uint64_t reg; + + /* + * From section 13.1.3.10 of the NPU workbook: "the NV-Link + * Datalink Layer Stall and NoStall signals are used for a + * different purpose when the link is configured for + * OpenCAPI. Therefore, the corresponding bits in NPU FIR + * Register 1 must be masked and configured to NOT cause the + * NPU to go into Freeze or Fence mode or send an Interrupt." + * + * FIXME: will need to revisit when mixing nvlink with + * opencapi. Assumes an opencapi-only setup on both PHYs for + * now. 
+ */ + + /* Mask FIRs */ + xscom_read(p->chip_id, p->xscom_base + NPU2_MISC_FIR1_MASK, ®); + reg = SETFIELD(PPC_BITMASK(0, 11), reg, 0xFFF); + xscom_write(p->chip_id, p->xscom_base + NPU2_MISC_FIR1_MASK, reg); + + /* freeze disable */ + reg = npu2_scom_read(p->chip_id, p->xscom_base, + NPU2_MISC_FREEZE_ENABLE1, NPU2_MISC_DA_LEN_8B); + reg = SETFIELD(PPC_BITMASK(0, 11), reg, 0); + npu2_scom_write(p->chip_id, p->xscom_base, + NPU2_MISC_FREEZE_ENABLE1, NPU2_MISC_DA_LEN_8B, reg); + + /* fence disable */ + reg = npu2_scom_read(p->chip_id, p->xscom_base, + NPU2_MISC_FENCE_ENABLE1, NPU2_MISC_DA_LEN_8B); + reg = SETFIELD(PPC_BITMASK(0, 11), reg, 0); + npu2_scom_write(p->chip_id, p->xscom_base, + NPU2_MISC_FENCE_ENABLE1, NPU2_MISC_DA_LEN_8B, reg); + + /* irq disable */ + reg = npu2_scom_read(p->chip_id, p->xscom_base, + NPU2_MISC_IRQ_ENABLE1, NPU2_MISC_DA_LEN_8B); + reg = SETFIELD(PPC_BITMASK(0, 11), reg, 0); + npu2_scom_write(p->chip_id, p->xscom_base, + NPU2_MISC_IRQ_ENABLE1, NPU2_MISC_DA_LEN_8B, reg); +} + +static int enable_interrupts(struct npu2 *p) +{ + uint64_t reg, xsl_fault, xstop_override, xsl_mask; + + /* + * We need to: + * - enable translation interrupts for all bricks + * - override most brick-fatal errors from FIR2 to send an + * interrupt instead of the default action of checkstopping + * the systems, since we can just fence the brick and keep + * the system alive. + * - the exception to the above is 2 FIRs for XSL errors + * resulting from bad AFU behavior, for which we don't want to + * checkstop but can't configure to send an error interrupt + * either, as the XSL errors are reported on 2 links (the + * XSL is shared between 2 links). Instead, we mask + * them. The XSL errors will result in an OTL error, which + * is reported only once, for the correct link. + * + * FIR bits configured to trigger an interrupt must have their + * default action masked + */ + xsl_fault = PPC_BIT(0) | PPC_BIT(1) | PPC_BIT(2) | PPC_BIT(3); + xstop_override = 0x0FFFEFC00F91B000; + xsl_mask = NPU2_CHECKSTOP_REG2_XSL_XLAT_REQ_WHILE_SPAP_INVALID | + NPU2_CHECKSTOP_REG2_XSL_INVALID_PEE; + + xscom_read(p->chip_id, p->xscom_base + NPU2_MISC_FIR2_MASK, ®); + reg |= xsl_fault | xstop_override | xsl_mask; + xscom_write(p->chip_id, p->xscom_base + NPU2_MISC_FIR2_MASK, reg); + + reg = npu2_scom_read(p->chip_id, p->xscom_base, NPU2_MISC_IRQ_ENABLE2, + NPU2_MISC_DA_LEN_8B); + reg |= xsl_fault | xstop_override; + npu2_scom_write(p->chip_id, p->xscom_base, NPU2_MISC_IRQ_ENABLE2, + NPU2_MISC_DA_LEN_8B, reg); + + /* + * Make sure the brick is fenced on those errors. + * Fencing is incompatible with freezing, but there's no + * freeze defined for FIR2, so we don't have to worry about it + * + * For the 2 XSL bits we ignore, we need to make sure they + * don't fence the link, as the NPU logic could allow it even + * when masked. 
+ */ + reg = npu2_scom_read(p->chip_id, p->xscom_base, NPU2_MISC_FENCE_ENABLE2, + NPU2_MISC_DA_LEN_8B); + reg |= xstop_override; + reg &= ~NPU2_CHECKSTOP_REG2_XSL_XLAT_REQ_WHILE_SPAP_INVALID; + reg &= ~NPU2_CHECKSTOP_REG2_XSL_INVALID_PEE; + npu2_scom_write(p->chip_id, p->xscom_base, NPU2_MISC_FENCE_ENABLE2, + NPU2_MISC_DA_LEN_8B, reg); + + mask_nvlink_fir(p); + return 0; +} + +static void setup_debug_training_state(struct npu2_dev *dev) +{ + npu2_opencapi_phy_reset(dev); + + switch (npu2_ocapi_training_state) { + case NPU2_TRAIN_PRBS31: + OCAPIINF(dev, "sending PRBS31 pattern per NVRAM setting\n"); + npu2_opencapi_phy_prbs31(dev); + break; + + case NPU2_TRAIN_NONE: + OCAPIINF(dev, "link not trained per NVRAM setting\n"); + break; + default: + assert(false); + } +} + +static void setup_device(struct npu2_dev *dev) +{ + struct dt_node *dn_phb; + struct pci_slot *slot; + uint64_t mm_win[2]; + + /* Populate PHB device node */ + phys_map_get(dev->npu->chip_id, NPU_OCAPI_MMIO, dev->brick_index, &mm_win[0], + &mm_win[1]); + prlog(PR_DEBUG, "OCAPI: Setting MMIO window to %016llx + %016llx\n", + mm_win[0], mm_win[1]); + dn_phb = dt_new_addr(dt_root, "pciex", mm_win[0]); + assert(dn_phb); + dt_add_property_strings(dn_phb, + "compatible", + "ibm,power9-npu-opencapi-pciex", + "ibm,ioda2-npu2-opencapi-phb"); + + dt_add_property_cells(dn_phb, "#address-cells", 3); + dt_add_property_cells(dn_phb, "#size-cells", 2); + dt_add_property_cells(dn_phb, "#interrupt-cells", 1); + dt_add_property_cells(dn_phb, "bus-range", 0, 0xff); + dt_add_property_cells(dn_phb, "clock-frequency", 0x200, 0); + dt_add_property_cells(dn_phb, "interrupt-parent", get_ics_phandle()); + + dt_add_property_strings(dn_phb, "device_type", "pciex"); + dt_add_property(dn_phb, "reg", mm_win, sizeof(mm_win)); + dt_add_property_cells(dn_phb, "ibm,npu-index", dev->npu->index); + dt_add_property_cells(dn_phb, "ibm,phb-index", + npu2_get_phb_index(dev->brick_index)); + dt_add_property_cells(dn_phb, "ibm,chip-id", dev->npu->chip_id); + dt_add_property_cells(dn_phb, "ibm,xscom-base", dev->npu->xscom_base); + dt_add_property_cells(dn_phb, "ibm,npcq", dev->npu->dt_node->phandle); + dt_add_property_cells(dn_phb, "ibm,links", 1); + dt_add_property(dn_phb, "ibm,mmio-window", mm_win, sizeof(mm_win)); + dt_add_property_cells(dn_phb, "ibm,phb-diag-data-size", 0); + + /* + * We ignore whatever PE numbers Linux tries to set, so we just + * advertise enough that Linux won't complain + */ + dt_add_property_cells(dn_phb, "ibm,opal-num-pes", NPU2_MAX_PE_NUM); + dt_add_property_cells(dn_phb, "ibm,opal-reserved-pe", NPU2_RESERVED_PE_NUM); + + dt_add_property_cells(dn_phb, "ranges", 0x02000000, + hi32(mm_win[0]), lo32(mm_win[0]), + hi32(mm_win[0]), lo32(mm_win[0]), + hi32(mm_win[1]), lo32(mm_win[1])); + + dev->phb_ocapi.dt_node = dn_phb; + dev->phb_ocapi.ops = &npu2_opencapi_ops; + dev->phb_ocapi.phb_type = phb_type_npu_v2_opencapi; + dev->phb_ocapi.scan_map = 0; + + dev->bdfn = 0; + dev->linux_pe = -1; + + /* TODO: Procedure 13.1.3.7 - AFU Memory Range BARs */ + /* Procedure 13.1.3.8 - AFU MMIO Range BARs */ + setup_afu_mmio_bars(dev->npu->chip_id, dev->npu->xscom_base, dev); + /* Procedure 13.1.3.9 - AFU Config BARs */ + setup_afu_config_bars(dev->npu->chip_id, dev->npu->xscom_base, dev); + setup_perf_counters(dev); + npu2_opencapi_phy_init(dev); + + set_fence_control(dev->npu->chip_id, dev->npu->xscom_base, dev->brick_index, 0b00); + + pci_register_phb(&dev->phb_ocapi, OPAL_DYNAMIC_PHB_ID); + + if (npu2_ocapi_training_state != NPU2_TRAIN_DEFAULT) { + 
setup_debug_training_state(dev); + } else { + slot = npu2_opencapi_slot_create(&dev->phb_ocapi); + if (!slot) { + /** + * @fwts-label OCAPICannotCreatePHBSlot + * @fwts-advice Firmware probably ran out of memory creating + * NPU slot. OpenCAPI functionality could be broken. + */ + prlog(PR_ERR, "OCAPI: Cannot create PHB slot\n"); + } + make_slot_hotpluggable(slot, &dev->phb_ocapi); + } + return; +} + +static void read_nvram_training_state(void) +{ + const char *state; + + state = nvram_query_dangerous("opencapi-link-training"); + if (state) { + if (!strcmp(state, "prbs31")) + npu2_ocapi_training_state = NPU2_TRAIN_PRBS31; + else if (!strcmp(state, "none")) + npu2_ocapi_training_state = NPU2_TRAIN_NONE; + else + prlog(PR_WARNING, + "OCAPI: invalid training state in NVRAM: %s\n", + state); + } +} + +int npu2_opencapi_init_npu(struct npu2 *npu) +{ + struct npu2_dev *dev; + uint64_t reg[2]; + + assert(platform.ocapi); + read_nvram_training_state(); + + /* TODO: Test OpenCAPI with fast reboot and make it work */ + disable_fast_reboot("OpenCAPI device enabled"); + + setup_global_mmio_bar(npu->chip_id, npu->xscom_base, reg); + + npu->regs = (void *)reg[0]; + + for (int i = 0; i < npu->total_devices; i++) { + dev = &npu->devices[i]; + if (dev->type != NPU2_DEV_TYPE_OPENCAPI) + continue; + + prlog(PR_INFO, "OCAPI: Configuring link index %d, brick %d\n", + dev->link_index, dev->brick_index); + + /* Procedure 13.1.3.1 - Select OCAPI vs NVLink */ + brick_config(npu->chip_id, npu->xscom_base, dev->brick_index); + + /* Procedure 13.1.3.4 - Brick to PE Mapping */ + pe_config(dev); + + /* Procedure 13.1.3.5 - Transaction Layer Configuration */ + tl_config(npu->chip_id, npu->xscom_base, dev->brick_index); + + /* Procedure 13.1.3.6 - Address Translation Configuration */ + address_translation_config(npu->chip_id, npu->xscom_base, dev->brick_index); + } + + enable_interrupts(npu); + + for (int i = 0; i < npu->total_devices; i++) { + dev = &npu->devices[i]; + if (dev->type != NPU2_DEV_TYPE_OPENCAPI) + continue; + setup_device(dev); + } + + return 0; +} + +static const struct phb_ops npu2_opencapi_ops = { + .cfg_read8 = npu2_opencapi_pcicfg_read8, + .cfg_read16 = npu2_opencapi_pcicfg_read16, + .cfg_read32 = npu2_opencapi_pcicfg_read32, + .cfg_write8 = npu2_opencapi_pcicfg_write8, + .cfg_write16 = npu2_opencapi_pcicfg_write16, + .cfg_write32 = npu2_opencapi_pcicfg_write32, + .device_init = NULL, + .phb_final_fixup = npu2_opencapi_final_fixup, + .ioda_reset = npu2_opencapi_ioda_reset, + .papr_errinjct_reset = NULL, + .pci_reinit = NULL, + .set_phb_mem_window = NULL, + .phb_mmio_enable = NULL, + .map_pe_mmio_window = NULL, + .map_pe_dma_window = NULL, + .map_pe_dma_window_real = NULL, + .pci_msi_eoi = NULL, + .set_xive_pe = NULL, + .get_msi_32 = NULL, + .get_msi_64 = NULL, + .set_pe = npu2_opencapi_set_pe, + .set_peltv = NULL, + .eeh_freeze_status = npu2_opencapi_freeze_status, + .eeh_freeze_clear = NULL, + .eeh_freeze_set = NULL, + .next_error = npu2_opencapi_eeh_next_error, + .err_inject = NULL, + .get_diag_data2 = NULL, + .set_capi_mode = NULL, + .set_capp_recovery = NULL, + .tce_kill = NULL, +}; + +void npu2_opencapi_set_broken(struct npu2 *npu, int brick) +{ + struct phb *phb; + struct npu2_dev *dev; + + for_each_phb(phb) { + if (phb->phb_type == phb_type_npu_v2_opencapi) { + dev = phb_to_npu2_dev_ocapi(phb); + if (dev->npu == npu && + dev->brick_index == brick) + dev->flags |= NPU2_DEV_BROKEN; + } + } +} + +static int64_t opal_npu_spa_setup(uint64_t phb_id, uint32_t __unused bdfn, + uint64_t addr, uint64_t 
PE_mask) +{ + uint64_t stack, block, offset, reg; + struct phb *phb = pci_get_phb(phb_id); + struct npu2_dev *dev; + int rc; + + if (!phb || phb->phb_type != phb_type_npu_v2_opencapi) + return OPAL_PARAMETER; + + /* 4k aligned */ + if (addr & 0xFFF) + return OPAL_PARAMETER; + + if (PE_mask > 15) + return OPAL_PARAMETER; + + dev = phb_to_npu2_dev_ocapi(phb); + if (!dev) + return OPAL_PARAMETER; + + block = index_to_block(dev->brick_index); + stack = index_to_stack(dev->brick_index); + if (block == NPU2_BLOCK_OTL1) + offset = NPU2_XSL_PSL_SPAP_A1; + else + offset = NPU2_XSL_PSL_SPAP_A0; + + + lock(&dev->npu->lock); + /* + * set the SPAP used by the device + */ + reg = npu2_scom_read(dev->npu->chip_id, dev->npu->xscom_base, + NPU2_REG_OFFSET(stack, NPU2_BLOCK_XSL, offset), + NPU2_MISC_DA_LEN_8B); + if ((addr && (reg & NPU2_XSL_PSL_SPAP_EN)) || + (!addr && !(reg & NPU2_XSL_PSL_SPAP_EN))) { + rc = OPAL_BUSY; + goto out; + } + /* SPA is disabled by passing a NULL address */ + reg = addr; + if (addr) + reg = addr | NPU2_XSL_PSL_SPAP_EN; + + npu2_scom_write(dev->npu->chip_id, dev->npu->xscom_base, + NPU2_REG_OFFSET(stack, NPU2_BLOCK_XSL, offset), + NPU2_MISC_DA_LEN_8B, reg); + + /* + * set the PE mask that the OS uses for PASID -> PE handle + * conversion + */ + reg = npu2_scom_read(dev->npu->chip_id, dev->npu->xscom_base, + NPU2_OTL_CONFIG0(stack, block), NPU2_MISC_DA_LEN_8B); + reg &= ~NPU2_OTL_CONFIG0_PE_MASK; + reg |= (PE_mask << (63-7)); + npu2_scom_write(dev->npu->chip_id, dev->npu->xscom_base, + NPU2_OTL_CONFIG0(stack, block), NPU2_MISC_DA_LEN_8B, + reg); + rc = OPAL_SUCCESS; +out: + unlock(&dev->npu->lock); + return rc; +} +opal_call(OPAL_NPU_SPA_SETUP, opal_npu_spa_setup, 4); + +static int64_t opal_npu_spa_clear_cache(uint64_t phb_id, uint32_t __unused bdfn, + uint64_t PE_handle) +{ + uint64_t cc_inv, stack, block, reg, rc; + uint32_t retries = 5; + struct phb *phb = pci_get_phb(phb_id); + struct npu2_dev *dev; + + if (!phb || phb->phb_type != phb_type_npu_v2_opencapi) + return OPAL_PARAMETER; + + if (PE_handle > MAX_PE_HANDLE) + return OPAL_PARAMETER; + + dev = phb_to_npu2_dev_ocapi(phb); + if (!dev) + return OPAL_PARAMETER; + + block = index_to_block(dev->brick_index); + stack = index_to_stack(dev->brick_index); + cc_inv = NPU2_REG_OFFSET(stack, NPU2_BLOCK_XSL, NPU2_XSL_PSL_LLCMD_A0); + + lock(&dev->npu->lock); + reg = npu2_scom_read(dev->npu->chip_id, dev->npu->xscom_base, cc_inv, + NPU2_MISC_DA_LEN_8B); + if (reg & PPC_BIT(16)) { + rc = OPAL_BUSY; + goto out; + } + + reg = PE_handle | PPC_BIT(15); + if (block == NPU2_BLOCK_OTL1) + reg |= PPC_BIT(48); + npu2_scom_write(dev->npu->chip_id, dev->npu->xscom_base, cc_inv, + NPU2_MISC_DA_LEN_8B, reg); + + rc = OPAL_HARDWARE; + while (retries--) { + reg = npu2_scom_read(dev->npu->chip_id, dev->npu->xscom_base, + cc_inv, NPU2_MISC_DA_LEN_8B); + if (!(reg & PPC_BIT(16))) { + rc = OPAL_SUCCESS; + break; + } + /* the bit expected to flip in less than 200us */ + time_wait_us(200); + } +out: + unlock(&dev->npu->lock); + return rc; +} +opal_call(OPAL_NPU_SPA_CLEAR_CACHE, opal_npu_spa_clear_cache, 3); + +static int get_template_rate(unsigned int templ, char *rate_buf) +{ + int shift, idx, val; + + /* + * Each rate is encoded over 4 bits (0->15), with 15 being the + * slowest. The buffer is a succession of rates for all the + * templates. The first 4 bits are for template 63, followed + * by 4 bits for template 62, ... etc. So the rate for + * template 0 is at the very end of the buffer. 
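+	 *
+	 * For instance, assuming TL_MAX_TEMPLATE is 63 (64 templates)
+	 * and a 32-byte rate buffer:
+	 *   template 63 -> high nibble of rate_buf[0]
+	 *   template 62 -> low  nibble of rate_buf[0]
+	 *   ...
+	 *   template  1 -> high nibble of rate_buf[31]
+	 *   template  0 -> low  nibble of rate_buf[31]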
+ */ + idx = (TL_MAX_TEMPLATE - templ) / 2; + shift = 4 * (1 - ((TL_MAX_TEMPLATE - templ) % 2)); + val = rate_buf[idx] >> shift; + return val; +} + +static bool is_template_supported(unsigned int templ, long capabilities) +{ + return !!(capabilities & (1ull << templ)); +} + +static int64_t opal_npu_tl_set(uint64_t phb_id, uint32_t __unused bdfn, + long capabilities, uint64_t rate_phys, int rate_sz) +{ + struct phb *phb = pci_get_phb(phb_id); + struct npu2_dev *dev; + uint64_t stack, block, reg, templ_rate; + int i, rate_pos; + char *rate = (char *) rate_phys; + + if (!phb || phb->phb_type != phb_type_npu_v2_opencapi) + return OPAL_PARAMETER; + if (!opal_addr_valid(rate) || rate_sz != TL_RATE_BUF_SIZE) + return OPAL_PARAMETER; + + dev = phb_to_npu2_dev_ocapi(phb); + if (!dev) + return OPAL_PARAMETER; + + block = index_to_block(dev->brick_index); + stack = index_to_stack(dev->brick_index); + /* + * The 'capabilities' argument defines what TL template the + * device can receive. OpenCAPI 3.0 and 4.0 define 64 templates, so + * that's one bit per template. + * + * For each template, the device processing time may vary, so + * the device advertises at what rate a message of a given + * template can be sent. That's encoded in the 'rate' buffer. + * + * On P9, NPU only knows about TL templates 0 -> 3. + * Per the spec, template 0 must be supported. + */ + if (!is_template_supported(0, capabilities)) + return OPAL_PARAMETER; + + reg = npu2_scom_read(dev->npu->chip_id, dev->npu->xscom_base, + NPU2_OTL_CONFIG1(stack, block), + NPU2_MISC_DA_LEN_8B); + reg &= ~(NPU2_OTL_CONFIG1_TX_TEMP1_EN | NPU2_OTL_CONFIG1_TX_TEMP2_EN | + NPU2_OTL_CONFIG1_TX_TEMP3_EN); + for (i = 0; i < 4; i++) { + /* Skip template 0 as it is implicitly enabled */ + if (i && is_template_supported(i, capabilities)) + reg |= PPC_BIT(i); + /* The tx rate should still be set for template 0 */ + templ_rate = get_template_rate(i, rate); + rate_pos = 8 + i * 4; + reg = SETFIELD(PPC_BITMASK(rate_pos, rate_pos + 3), reg, + templ_rate); + } + npu2_scom_write(dev->npu->chip_id, dev->npu->xscom_base, + NPU2_OTL_CONFIG1(stack, block), NPU2_MISC_DA_LEN_8B, + reg); + OCAPIDBG(dev, "OTL configuration 1 register set to %llx\n", reg); + return OPAL_SUCCESS; +} +opal_call(OPAL_NPU_TL_SET, opal_npu_tl_set, 5); + +static void set_mem_bar(struct npu2_dev *dev, uint64_t base, uint64_t size) +{ + uint64_t stack, val, reg, bar_offset, pa_config_offset; + uint8_t memsel; + + stack = index_to_stack(dev->brick_index); + switch (dev->brick_index) { + case 2: + case 4: + bar_offset = NPU2_GPU0_MEM_BAR; + pa_config_offset = NPU2_CQ_CTL_MISC_PA0_CONFIG; + break; + case 3: + case 5: + bar_offset = NPU2_GPU1_MEM_BAR; + pa_config_offset = NPU2_CQ_CTL_MISC_PA1_CONFIG; + break; + default: + assert(false); + } + + assert((!size && !base) || (size && base)); + + /* + * Memory select configuration: + * - 0b000 - BAR disabled + * - 0b001 - match 0b00, 0b01 + * - 0b010 - match 0b01, 0b10 + * - 0b011 - match 0b00, 0b10 + * - 0b100 - match 0b00 + * - 0b101 - match 0b01 + * - 0b110 - match 0b10 + * - 0b111 - match 0b00, 0b01, 0b10 + */ + memsel = GETFIELD(PPC_BITMASK(13, 14), base); + if (size) + val = SETFIELD(NPU2_MEM_BAR_EN | NPU2_MEM_BAR_SEL_MEM, 0ULL, 0b100 + memsel); + else + val = 0; + + /* Base address - 12 bits, 1G aligned */ + val = SETFIELD(NPU2_MEM_BAR_NODE_ADDR, val, GETFIELD(PPC_BITMASK(22, 33), base)); + + /* GCID */ + val = SETFIELD(NPU2_MEM_BAR_GROUP, val, GETFIELD(PPC_BITMASK(15, 18), base)); + val = SETFIELD(NPU2_MEM_BAR_CHIP, val, GETFIELD(PPC_BITMASK(19, 
21), base)); + + /* Other settings */ + val = SETFIELD(NPU2_MEM_BAR_POISON, val, 1); + val = SETFIELD(NPU2_MEM_BAR_GRANULE, val, 0); + val = SETFIELD(NPU2_MEM_BAR_BAR_SIZE, val, ilog2(size >> 30)); + val = SETFIELD(NPU2_MEM_BAR_MODE, val, 0); + + for (int block = NPU2_BLOCK_SM_0; block <= NPU2_BLOCK_SM_3; block++) { + reg = NPU2_REG_OFFSET(stack, block, bar_offset); + npu2_write(dev->npu, reg, val); + } + + /* Set PA config */ + if (size) + val = SETFIELD(NPU2_CQ_CTL_MISC_PA_CONFIG_MEMSELMATCH, 0ULL, 0b100 + memsel); + else + val = 0; + val = SETFIELD(NPU2_CQ_CTL_MISC_PA_CONFIG_GRANULE, val, 0); + val = SETFIELD(NPU2_CQ_CTL_MISC_PA_CONFIG_SIZE, val, ilog2(size >> 30)); + val = SETFIELD(NPU2_CQ_CTL_MISC_PA_CONFIG_MODE, val, 0); + val = SETFIELD(NPU2_CQ_CTL_MISC_PA_CONFIG_MASK, val, 0); + reg = NPU2_REG_OFFSET(stack, NPU2_BLOCK_CTL, pa_config_offset); + npu2_write(dev->npu, reg, val); +} + +static int64_t alloc_mem_bar(struct npu2_dev *dev, uint64_t size, uint64_t *bar) +{ + uint64_t phys_map_base, phys_map_size, val; + int rc = OPAL_SUCCESS; + + lock(&dev->npu->lock); + + if (dev->lpc_mem_base) { + OCAPIERR(dev, "LPC allocation failed - BAR already in use\n"); + rc = OPAL_RESOURCE; + goto out; + } + + /* + * The supported chip address extension mask is 1100 100 (mask + * off 2 bits from group ID and 1 bit from chip ID). + * + * Fall back to only permitting a single allocation if we + * don't see this mask value. + */ + xscom_read(dev->npu->chip_id, PB_CENT_MODE, &val); + if (GETFIELD(PB_CFG_CHIP_ADDR_EXTENSION_MASK_CENT, val) == 0b1100100) { + phys_map_get(dev->npu->chip_id, OCAPI_MEM, + dev->brick_index - 2, &phys_map_base, + &phys_map_size); + } else { + bool in_use = false; + + for (int i = 0; i < dev->npu->total_devices; i++) { + if (dev->npu->devices[i].lpc_mem_base) + in_use = true; + } + + if (in_use) { + OCAPIERR(dev, "LPC allocation failed - single device per chip limit, FW upgrade required (pb_cent_mode=0x%016llx)\n", val); + rc = OPAL_RESOURCE; + goto out; + } + + phys_map_get(dev->npu->chip_id, OCAPI_MEM, 0, &phys_map_base, + &phys_map_size); + } + + if (size > phys_map_size) { + /** + * @fwts-label OCAPIInvalidLPCMemoryBARSize + * @fwts-advice The operating system requested an unsupported + * amount of OpenCAPI LPC memory. This is possibly a kernel + * bug, or you may need to upgrade your firmware. 
+ */ + OCAPIERR(dev, "Invalid LPC memory BAR allocation size requested: 0x%llx bytes (limit 0x%llx)\n", + size, phys_map_size); + rc = OPAL_PARAMETER; + goto out; + } + + /* Minimum BAR size is 1 GB */ + if (size < (1 << 30)) { + size = 1 << 30; + } + + if (!is_pow2(size)) { + size = 1ull << (ilog2(size) + 1); + } + + set_mem_bar(dev, phys_map_base, size); + *bar = phys_map_base; + dev->lpc_mem_base = phys_map_base; + dev->lpc_mem_size = size; + +out: + unlock(&dev->npu->lock); + return rc; +} + +static int64_t release_mem_bar(struct npu2_dev *dev) +{ + int rc = OPAL_SUCCESS; + + lock(&dev->npu->lock); + + if (!dev->lpc_mem_base) { + rc = OPAL_PARAMETER; + goto out; + } + + set_mem_bar(dev, 0, 0); + dev->lpc_mem_base = 0; + dev->lpc_mem_size = 0; + +out: + unlock(&dev->npu->lock); + return rc; +} + +static int64_t opal_npu_mem_alloc(uint64_t phb_id, uint32_t __unused bdfn, + uint64_t size, __be64 *__bar) +{ + struct phb *phb = pci_get_phb(phb_id); + struct npu2_dev *dev; + uint64_t bar; + int64_t rc; + + + if (!phb || phb->phb_type != phb_type_npu_v2_opencapi) + return OPAL_PARAMETER; + + dev = phb_to_npu2_dev_ocapi(phb); + if (!dev) + return OPAL_PARAMETER; + + if (!opal_addr_valid(__bar)) + return OPAL_PARAMETER; + + rc = alloc_mem_bar(dev, size, &bar); + if (rc == OPAL_SUCCESS) + *__bar = cpu_to_be64(bar); + + return rc; +} +opal_call(OPAL_NPU_MEM_ALLOC, opal_npu_mem_alloc, 4); + +static int64_t opal_npu_mem_release(uint64_t phb_id, uint32_t __unused bdfn) +{ + struct phb *phb = pci_get_phb(phb_id); + struct npu2_dev *dev; + + + if (!phb || phb->phb_type != phb_type_npu_v2_opencapi) + return OPAL_PARAMETER; + + dev = phb_to_npu2_dev_ocapi(phb); + if (!dev) + return OPAL_PARAMETER; + + return release_mem_bar(dev); +} +opal_call(OPAL_NPU_MEM_RELEASE, opal_npu_mem_release, 2); diff --git a/roms/skiboot/hw/npu2.c b/roms/skiboot/hw/npu2.c new file mode 100644 index 000000000..cf57eeb0c --- /dev/null +++ b/roms/skiboot/hw/npu2.c @@ -0,0 +1,2323 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* + * NPU - NVlink and OpenCAPI + * + * Copyright 2013-2019 IBM Corp. + */ + +#include <skiboot.h> +#include <io.h> +#include <timebase.h> +#include <pci-cfg.h> +#include <pci.h> +#include <pci-slot.h> +#include <pci-virt.h> +#include <opal.h> +#include <opal-api.h> +#include <cpu.h> +#include <device.h> +#include <ccan/str/str.h> +#include <ccan/array_size/array_size.h> +#include <affinity.h> +#include <npu2.h> +#include <lock.h> +#include <xscom.h> +#include <bitutils.h> +#include <chip.h> +#include <phys-map.h> +#include <nvram.h> +#include <xscom-p9-regs.h> +#include <phb4.h> +#include <cache-p9.h> + +#define VENDOR_CAP_START 0x80 +#define VENDOR_CAP_END 0x90 +#define VENDOR_CAP_LEN 0x10 +#define VENDOR_CAP_VERSION 0x01 +#define VENDOR_CAP_PCI_DEV_OFFSET 0x0d + +/* + * NPU2 BAR layout definition. We have 3 stacks and each of them + * contains 2 bricks. So every NPU2 has 6 bricks in total. There are 2 + * PHY BARs and each of them is shared by 3 bricks. Every brick has + * one NTL BAR and two bricks share one GENID BAR. There is also a + * global MMIO BAR. We only expose DL and GENID BARs to the OS and all + * other BARs will be hidden in skiboot. + * + * Before the global MMIO BAR is configured, scom is the only way to + * access the BAR registers. At NPU2 PHB probing time, we rely on scom + * to assign all BARs until the global MMIO BAR is established. + * + * We need to access 4 SM registers in the same stack in order to + * configure one particular BAR. 
+ */ + +/* Set a specific flag in the vendor config space */ +void npu2_set_link_flag(struct npu2_dev *ndev, uint8_t flag) +{ + ndev->nvlink.link_flags |= flag; + PCI_VIRT_CFG_INIT_RO(ndev->nvlink.pvd, VENDOR_CAP_START + + VENDOR_CAP_PCI_DEV_OFFSET, 1, ndev->nvlink.link_flags); +} + +void npu2_clear_link_flag(struct npu2_dev *ndev, uint8_t flag) +{ + ndev->nvlink.link_flags &= ~flag; + PCI_VIRT_CFG_INIT_RO(ndev->nvlink.pvd, VENDOR_CAP_START + + VENDOR_CAP_PCI_DEV_OFFSET, 1, ndev->nvlink.link_flags); +} + +static inline void npu2_ioda_sel(struct npu2 *p, uint32_t table, + uint32_t index, bool autoinc) +{ + out_be64(p->regs + NPU2_ATS_IODA_TBL, + (autoinc ? NPU2_ATS_IODA_TBL_AUTOINC : 0ul) | + SETFIELD(NPU2_ATS_IODA_TBL_SELECT, 0ul, table) | + SETFIELD(NPU2_ATS_IODA_TBL_INDEX, 0ul, index)); +} + +static struct npu2_dev *npu2_bdf_to_dev(struct npu2 *p, + uint32_t bdfn) +{ + struct pci_virt_device *pvd; + + /* All emulated devices are attached to root bus */ + if (bdfn & ~0xff) + return NULL; + + pvd = pci_virt_find_device(&p->phb_nvlink, bdfn); + if (pvd) + return pvd->data; + + return NULL; +} + +static inline void npu2_get_bar(uint32_t gcid, struct npu2_bar *bar) +{ + phys_map_get(gcid, bar->type, bar->index, &bar->base, &bar->size); +} + +static void npu2_read_bar(struct npu2 *p, struct npu2_bar *bar) +{ + uint64_t reg, val; + int enabled; + + reg = NPU2_REG_OFFSET(0, NPU2_BLOCK_SM_0, bar->reg); + val = npu2_read(p, reg); + + switch (NPU2_REG(bar->reg)) { + case NPU2_PHY_BAR: + bar->base = GETFIELD(NPU2_PHY_BAR_ADDR, val) << 21; + enabled = GETFIELD(NPU2_PHY_BAR_ENABLE, val); + + if (NPU2_REG_STACK(reg) == NPU2_STACK_STCK_2) + /* This is the global MMIO BAR */ + bar->size = 0x1000000; + else + bar->size = 0x200000; + break; + case NPU2_NTL0_BAR: + case NPU2_NTL1_BAR: + bar->base = GETFIELD(NPU2_NTL_BAR_ADDR, val) << 16; + enabled = GETFIELD(NPU2_NTL_BAR_ENABLE, val); + bar->size = 0x10000 << GETFIELD(NPU2_NTL_BAR_SIZE, val); + break; + case NPU2_GENID_BAR: + bar->base = GETFIELD(NPU2_GENID_BAR_ADDR, val) << 16; + enabled = GETFIELD(NPU2_GENID_BAR_ENABLE, val); + bar->size = 0x20000; + break; + default: + bar->base = 0ul; + enabled = 0; + bar->size = 0; + break; + } + + bar->flags = SETFIELD(NPU2_BAR_FLAG_ENABLED, bar->flags, enabled); +} + +static void npu2_write_bar(struct npu2 *p, + struct npu2_bar *bar, + uint32_t gcid, + uint32_t scom) +{ + uint64_t reg, val, enable = !!(bar->flags & NPU2_BAR_FLAG_ENABLED); + int block; + + switch (NPU2_REG(bar->reg)) { + case NPU2_PHY_BAR: + val = SETFIELD(NPU2_PHY_BAR_ADDR, 0ul, bar->base >> 21); + val = SETFIELD(NPU2_PHY_BAR_ENABLE, val, enable); + break; + case NPU2_NTL0_BAR: + case NPU2_NTL1_BAR: + val = SETFIELD(NPU2_NTL_BAR_ADDR, 0ul, bar->base >> 16); + val = SETFIELD(NPU2_NTL_BAR_ENABLE, val, enable); + val = SETFIELD(NPU2_NTL_BAR_SIZE, val, 1); + break; + case NPU2_GENID_BAR: + val = SETFIELD(NPU2_GENID_BAR_ADDR, 0ul, bar->base >> 16); + val = SETFIELD(NPU2_GENID_BAR_ENABLE, val, enable); + break; + default: + val = 0ul; + } + + for (block = NPU2_BLOCK_SM_0; block <= NPU2_BLOCK_SM_3; block++) { + reg = NPU2_REG_OFFSET(0, block, bar->reg); + if (p) + npu2_write(p, reg, val); + else + npu2_scom_write(gcid, scom, reg, NPU2_MISC_DA_LEN_8B, val); + } +} + +/* Trap for PCI command (0x4) to enable or disable device's BARs */ +static int64_t npu2_cfg_write_cmd(void *dev, + struct pci_cfg_reg_filter *pcrf __unused, + uint32_t offset, uint32_t size, + uint32_t *data, bool write) +{ + struct pci_virt_device *pvd = dev; + struct npu2_dev *ndev = 
pvd->data; + struct npu2_bar *ntl_npu_bar, *genid_npu_bar; + bool enabled; + + if (!write) + return OPAL_PARTIAL; + + if (offset != PCI_CFG_CMD) + return OPAL_PARAMETER; + if (size != 1 && size != 2 && size != 4) + return OPAL_PARAMETER; + + /* + * Enable or disable NTL and GENID BAR. Two bricks share + * one GENID BAR, which is exposed via the first brick. + */ + enabled = !!(*data & PCI_CFG_CMD_MEM_EN); + ntl_npu_bar = &ndev->bars[0].npu2_bar; + genid_npu_bar = &ndev->bars[1].npu2_bar; + + ntl_npu_bar->flags = SETFIELD(NPU2_BAR_FLAG_ENABLED, ntl_npu_bar->flags, enabled); + npu2_write_bar(ndev->npu, ntl_npu_bar, 0, 0); + + /* + * Enable/disable the GENID BAR. Two bricks share one GENID + * BAR which is exposed via the first brick so we need to + * track the enables separately. + */ + if (NPU2DEV_BRICK(ndev)) + genid_npu_bar->flags = SETFIELD(NPU2_BAR_FLAG_ENABLED1, genid_npu_bar->flags, + enabled); + else + genid_npu_bar->flags = SETFIELD(NPU2_BAR_FLAG_ENABLED0, genid_npu_bar->flags, + enabled); + + /* Enable the BAR if either device requests it enabled, otherwise disable it */ + genid_npu_bar->flags = SETFIELD(NPU2_BAR_FLAG_ENABLED, genid_npu_bar->flags, + !!(genid_npu_bar->flags & (NPU2_BAR_FLAG_ENABLED0 | + NPU2_BAR_FLAG_ENABLED1))); + npu2_write_bar(ndev->npu, genid_npu_bar, 0, 0); + + return OPAL_PARTIAL; +} + +static int64_t npu2_cfg_read_bar(struct npu2_dev *dev __unused, + struct pci_cfg_reg_filter *pcrf, + uint32_t offset, uint32_t size, + uint32_t *data) +{ + struct npu2_pcie_bar *bar = (struct npu2_pcie_bar *) pcrf->data; + + if (!(bar->flags & NPU2_PCIE_BAR_FLAG_TRAPPED)) + return OPAL_PARTIAL; + + if ((size != 4) || + (offset != pcrf->start && offset != pcrf->start + 4)) + return OPAL_PARAMETER; + + if (bar->flags & NPU2_PCIE_BAR_FLAG_SIZE_HI) + *data = bar->npu2_bar.size >> 32; + else + *data = bar->npu2_bar.size; + bar->flags &= ~(NPU2_PCIE_BAR_FLAG_TRAPPED | NPU2_PCIE_BAR_FLAG_SIZE_HI); + + return OPAL_SUCCESS; +} + +static int64_t npu2_cfg_write_bar(struct npu2_dev *dev, + struct pci_cfg_reg_filter *pcrf, + uint32_t offset, uint32_t size, + uint32_t data) +{ + struct npu2_pcie_bar *bar = (struct npu2_pcie_bar *) pcrf->data; + struct npu2_bar old_bar, *npu2_bar = &bar->npu2_bar; + + if ((size != 4) || + (offset != pcrf->start && offset != pcrf->start + 4)) + return OPAL_PARAMETER; + + /* Return BAR size on next read */ + if (data == 0xffffffff) { + bar->flags |= NPU2_PCIE_BAR_FLAG_TRAPPED; + if (offset == pcrf->start + 4) + bar->flags |= NPU2_PCIE_BAR_FLAG_SIZE_HI; + + return OPAL_SUCCESS; + } + + if (offset == pcrf->start) { + npu2_bar->base &= 0xffffffff00000000UL; + npu2_bar->base |= (data & 0xfffffff0); + } else { + npu2_bar->base &= 0x00000000ffffffffUL; + npu2_bar->base |= ((uint64_t)data << 32); + + if (NPU2_REG(npu2_bar->reg) == NPU2_GENID_BAR && NPU2DEV_BRICK(dev)) + npu2_bar->base -= 0x10000; + + old_bar.reg = npu2_bar->reg; + npu2_read_bar(dev->npu, &old_bar); + + /* Only allow changing the base address if the BAR is not enabled */ + if ((npu2_bar->flags & NPU2_BAR_FLAG_ENABLED) && + (npu2_bar->base != old_bar.base)) { + npu2_bar->base = old_bar.base; + return OPAL_HARDWARE; + } + + npu2_write_bar(dev->npu, &bar->npu2_bar, 0, 0); + } + + /* To update the config cache */ + return OPAL_PARTIAL; +} + +static int64_t npu2_dev_cfg_bar(void *dev, struct pci_cfg_reg_filter *pcrf, + uint32_t offset, uint32_t len, uint32_t *data, + bool write) +{ + struct pci_virt_device *pvd = dev; + struct npu2_dev *ndev = (struct npu2_dev *) pvd->data; + + if (write) + return 
npu2_cfg_write_bar(ndev, pcrf, offset, len, *data); + + return npu2_cfg_read_bar(ndev, pcrf, offset, len, data); +} + +static int64_t npu2_dev_cfg_exp_devcap(void *dev, + struct pci_cfg_reg_filter *pcrf __unused, + uint32_t offset, uint32_t size, + uint32_t *data, bool write) +{ + struct pci_virt_device *pvd = dev; + struct npu2_dev *ndev = pvd->data; + int rc; + + assert(write); + + if ((size != 2) || (offset & 1)) { + /* Short config writes are not supported */ + prlog(PR_ERR, "NPU%d: Unsupported write to pcie control register\n", + ndev->nvlink.phb->opal_id); + return OPAL_PARAMETER; + } + + if (*data & PCICAP_EXP_DEVCTL_FUNC_RESET) + npu2_dev_procedure_reset(ndev); + + rc = purge_l2_l3_caches(); + if (rc) + return rc; + + return OPAL_PARTIAL; +} + +#define NPU2_CFG_READ(size, type) \ +static int64_t npu2_cfg_read##size(struct phb *phb, uint32_t bdfn, \ + uint32_t offset, type *data) \ +{ \ + uint32_t val; \ + int64_t ret; \ + \ + ret = pci_virt_cfg_read(phb, bdfn, offset, \ + sizeof(*data), &val); \ + *data = (type)val; \ + return ret; \ +} +#define NPU2_CFG_WRITE(size, type) \ +static int64_t npu2_cfg_write##size(struct phb *phb, uint32_t bdfn, \ + uint32_t offset, type data) \ +{ \ + uint32_t val = data; \ + int64_t ret; \ + \ + ret = pci_virt_cfg_write(phb, bdfn, offset, \ + sizeof(data), val); \ + return ret; \ +} + +NPU2_CFG_READ(8, u8); +NPU2_CFG_READ(16, u16); +NPU2_CFG_READ(32, u32); +NPU2_CFG_WRITE(8, u8); +NPU2_CFG_WRITE(16, u16); +NPU2_CFG_WRITE(32, u32); + +static int __npu2_dev_bind_pci_dev(struct phb *phb __unused, + struct pci_device *pd, + void *data) +{ + struct npu2_dev *dev = data; + struct dt_node *pci_dt_node; + char *pcislot; + + /* Ignore non-nvidia PCI devices */ + if ((pd->vdid & 0xffff) != 0x10de) + return 0; + + /* Find the PCI device's slot location */ + for (pci_dt_node = pd->dn; + pci_dt_node && !dt_find_property(pci_dt_node, "ibm,loc-code"); + pci_dt_node = pci_dt_node->parent); + + if (!pci_dt_node) + return 0; + + pcislot = (char *)dt_prop_get(pci_dt_node, "ibm,loc-code"); + + NPU2DEVDBG(dev, "Comparing GPU '%s' and NPU2 '%s'\n", + pcislot, dev->nvlink.slot_label); + + if (streq(pcislot, dev->nvlink.slot_label)) + return 1; + + return 0; +} + +static int64_t npu2_gpu_bridge_sec_bus_reset(void *dev, + struct pci_cfg_reg_filter *pcrf __unused, + uint32_t offset, uint32_t len, + uint32_t *data, bool write) +{ + struct pci_device *pd = dev; + struct pci_device *gpu; + struct phb *npphb; + struct npu2 *npu; + struct dt_node *np; + struct npu2_dev *ndev; + int i; + + assert(write); + + if ((len != 2) || (offset & 1)) { + /* Short config writes are not supported */ + PCIERR(pd->phb, pd->bdfn, + "Unsupported write to bridge control register\n"); + return OPAL_PARAMETER; + } + + gpu = list_top(&pd->children, struct pci_device, link); + if (gpu && (*data & PCI_CFG_BRCTL_SECONDARY_RESET)) { + int64_t rc; + + dt_for_each_compatible(dt_root, np, "ibm,power9-npu-pciex") { + npphb = pci_get_phb(dt_prop_get_cell(np, + "ibm,opal-phbid", 1)); + if (!npphb || npphb->phb_type != phb_type_npu_v2) + continue; + + npu = phb_to_npu2_nvlink(npphb); + for (i = 0; i < npu->total_devices; ++i) { + ndev = &npu->devices[i]; + if (ndev->nvlink.pd == gpu) + npu2_dev_procedure_reset(ndev); + } + } + + rc = purge_l2_l3_caches(); + if (rc) + return rc; + } + + return OPAL_PARTIAL; +} + +static void npu2_dev_bind_pci_dev(struct npu2_dev *dev) +{ + struct phb *phb; + uint32_t i; + + if (dev->nvlink.pd) + return; + + for (i = 0; i < 64; i++) { + if (dev->npu->phb_nvlink.opal_id == i) + 
continue; + + phb = pci_get_phb(i); + if (!phb) + continue; + + dev->nvlink.pd = pci_walk_dev(phb, NULL, __npu2_dev_bind_pci_dev, dev); + if (dev->nvlink.pd) { + dev->nvlink.phb = phb; + /* Found the device, set the bit in config space */ + npu2_set_link_flag(dev, NPU2_DEV_PCI_LINKED); + + /* + * We define a custom sec bus reset handler for a slot + * with an NVLink-connected GPU to prevent HMIs which + * will otherwise happen if we reset GPU before + * resetting NVLinks. + */ + if (dev->nvlink.pd->parent && + dev->nvlink.pd->parent->slot) + pci_add_cfg_reg_filter(dev->nvlink.pd->parent, + PCI_CFG_BRCTL, 2, + PCI_REG_FLAG_WRITE, + npu2_gpu_bridge_sec_bus_reset); + return; + } + } + + NPU2DEVINF(dev, "No PCI device found for slot '%s'\n", + dev->nvlink.slot_label); +} + +static struct lock pci_npu_phandle_lock = LOCK_UNLOCKED; + +static void npu2_append_phandle(struct dt_node *dn, + u32 phandle) +{ + struct dt_property *prop; + uint32_t *npu_phandles; + size_t len; + + /* + * Use a lock to make sure no one else has a reference to an + * ibm,npu property (this assumes this is the only function + * that holds a reference to it) + */ + lock(&pci_npu_phandle_lock); + + /* This function shouldn't be called unless ibm,npu exists */ + prop = (struct dt_property *)dt_require_property(dn, "ibm,npu", -1); + + /* Need to append to the properties */ + len = prop->len + sizeof(*npu_phandles); + dt_resize_property(&prop, len); + + npu_phandles = (uint32_t *)prop->prop; + npu_phandles[len / sizeof(*npu_phandles) - 1] = phandle; + unlock(&pci_npu_phandle_lock); +} + +static struct dt_node *npu2_create_memory_dn(uint64_t addr, uint64_t size) +{ + struct dt_node *mem; + static u32 chip_id = 255; + + mem = dt_find_by_name_addr(dt_root, "memory", addr); + if (mem) + return mem; + + mem = dt_new_addr(dt_root, "memory", addr); + if (!mem) + return NULL; + dt_add_property_string(mem, "device_type", "memory"); + dt_add_property_string(mem, "compatible", "ibm,coherent-device-memory"); + dt_add_property_u64s(mem, "reg", addr, size); + dt_add_property_cells(mem, "ibm,chip-id", chip_id); + dt_add_property_u64s(mem, "linux,usable-memory", addr, 0); + dt_add_property_cells(mem, "ibm,associativity", 4, chip_id, chip_id, chip_id, chip_id); + chip_id--; + + assert(chip_id); + return mem; +} + +/* There are potentially multiple links per GPU, so lookup the GPU memory based + * on bdfn. */ +static void npu2_get_gpu_base(struct npu2_dev *ndev, uint64_t *addr, uint64_t *size) +{ + struct npu2 *p = ndev->npu; + int group; + + group = PCI_DEV(ndev->bdfn); + phys_map_get(ndev->npu->chip_id, p->gpu_map_type, group, addr, size); +} + +static void npu2_dn_fixup_gmb(struct dt_node *pd_dn, struct npu2_dev *ndev) +{ + uint64_t gpu_base, gpu_size, gta; + struct dt_node *mem_dn; + + npu2_get_gpu_base(ndev, &gpu_base, &gpu_size); + mem_dn = npu2_create_memory_dn(gpu_base, gpu_size); + assert(mem_dn); + dt_add_property_cells(pd_dn, "memory-region", mem_dn->phandle); + + /* Coral mode address compression. This is documented in Figure 3.5 + * "P9->GPU RA Compression (Coral) of the NPU2 workbook". */ + gta = ((gpu_base >> 42) & 0x1) << 42; + gta |= ((gpu_base >> 45) & 0x3) << 43; + gta |= ((gpu_base >> 49) & 0x3) << 45; + gta |= gpu_base & ((1UL << 43) - 1); + + dt_add_property_u64s(pd_dn, "ibm,device-tgt-addr", gta); +} + +static int npu2_assign_gmb(struct npu2_dev *ndev) +{ + struct npu2 *p = ndev->npu; + int peers, mode; + uint32_t bdfn; + uint64_t base, size, reg, val, gmb; + + /* Need to work out number of link peers. 
This amount to + * working out the maximum function number. So work start at + * the highest bdfn (fn = 6) and count back until we find a + * npu2_dev. */ + for (bdfn = (ndev->bdfn & ~0x7) | NPU2_LINKS_PER_CHIP; + PCI_FUNC(bdfn) != 0x7; bdfn = (bdfn & ~0x7) | (PCI_FUNC(bdfn) - 1)) + if (npu2_bdf_to_dev(p, bdfn)) + break; + peers = PCI_FUNC(bdfn); + + npu2_get_gpu_base(ndev, &base, &size); + + NPU2DBG(p, "Setting BAR region dt:%llx\n", base); + val = SETFIELD(NPU2_MEM_BAR_EN, 0ULL, 1); + val = SETFIELD(NPU2_MEM_BAR_SEL_MEM, val, base >> (63-14)); + val = SETFIELD(NPU2_MEM_BAR_GROUP, val, base >> (63-18)); + val = SETFIELD(NPU2_MEM_BAR_CHIP, val, base >> (63-21)); + val = SETFIELD(NPU2_MEM_BAR_NODE_ADDR, val, base >> (63-33)); + val = SETFIELD(NPU2_MEM_BAR_POISON, val, 1); + val = SETFIELD(NPU2_MEM_BAR_GRANULE, val, 0); + + /* We don't know how much memory the GPU has, so we may as well just + * pass the whole aperture through at this point. */ + val = SETFIELD(NPU2_MEM_BAR_BAR_SIZE, val, ilog2(size >> 30)); + + switch (peers) { + case 0: + mode = 0; + break; + case 1: + mode = 1; + break; + case 2: + mode = 3; + break; + case 3: + mode = 6; + break; + case 5: + mode = 10; + break; + default: + /* Hardware does not support this configuration */ + assert(0); + } + + mode += PCI_FUNC(ndev->bdfn); + val = SETFIELD(NPU2_MEM_BAR_MODE, val, mode); + + gmb = NPU2_GPU0_MEM_BAR; + if (NPU2DEV_BRICK(ndev)) + gmb = NPU2_GPU1_MEM_BAR; + + reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_0 + NPU2DEV_STACK(ndev), + NPU2_BLOCK_SM_0, gmb); + + npu2_write(p, reg, val); + reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_0 + NPU2DEV_STACK(ndev), + NPU2_BLOCK_SM_1, gmb); + npu2_write(p, reg, val); + reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_0 + NPU2DEV_STACK(ndev), + NPU2_BLOCK_SM_2, gmb); + npu2_write(p, reg, val); + reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_0 + NPU2DEV_STACK(ndev), + NPU2_BLOCK_SM_3, gmb); + npu2_write(p, reg, val); + + return 0; +} + +static int npu2_dn_fixup(struct phb *phb, + struct pci_device *pd, + void *data __unused) +{ + struct npu2 *p = phb_to_npu2_nvlink(phb); + struct npu2_dev *dev; + uint32_t speed; + const char *label; + + dev = npu2_bdf_to_dev(p, pd->bdfn); + assert(dev); + if (dev->nvlink.phb || dev->nvlink.pd) + return 0; + + npu2_assign_gmb(dev); + npu2_dn_fixup_gmb(pd->dn, dev); + dt_add_property_cells(pd->dn, "ibm,nvlink", dev->dt_node->phandle); + + /* + * NVLink supports multiple speeds and device drivers need to know what + * speed has been set by firmware. Hostboot does the inits that set the + * link speed and tell us via HDAT and we need to copy that from the + * link node. + */ + speed = dt_prop_get_u32_def(dev->dt_node, "nvidia,link-speed", 0xff); + if (speed != 0xff) + dt_add_property_cells(pd->dn, "ibm,nvlink-speed", speed); + + /* + * NPU2 devices have a slot label that indicates which GPU slot + * this NPU is connected to. Add a location code to the NVlink + * device node based on the slot label. + */ + label = dt_prop_get_def(dev->dt_node, "ibm,slot-label", NULL); + if (!label) { + /** + * @fwts-label NPUNoPHBSlotLabel + * @fwts-advice No GPU/NPU2 slot information was found. + * NVLink2 functionality will not work. + */ + prlog(PR_ERR, "NPU: Cannot find GPU slot information\n"); + return 0; + } + dt_add_property_string(pd->dn, "ibm,loc-code", label); + + dev->nvlink.slot_label = label; + + /* + * Bind the emulated PCI device with the real one, which can't + * be done until the PCI devices are populated. 
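The binding matches the GPU by comparing its "ibm,loc-code" against our slot label.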
Once the real + * PCI device is identified, we also need fix the device-tree + * for it + */ + npu2_dev_bind_pci_dev(dev); + if (dev->nvlink.phb && dev->nvlink.pd && dev->nvlink.pd->dn) { + if (dt_find_property(dev->nvlink.pd->dn, "ibm,npu")) + npu2_append_phandle(dev->nvlink.pd->dn, pd->dn->phandle); + else + dt_add_property_cells(dev->nvlink.pd->dn, "ibm,npu", pd->dn->phandle); + + dt_add_property_cells(pd->dn, "ibm,gpu", dev->nvlink.pd->dn->phandle); + dev->nvlink.gpu_bdfn = dev->nvlink.pd->bdfn; + } + + return 0; +} + +static int npu2_links_per_gpu(struct phb *phb, + struct pci_device *pd, + void *data) +{ + struct npu2 *p = phb_to_npu2_nvlink(phb); + struct npu2_dev *dev; + int *nlinks = (int *)data; + + dev = npu2_bdf_to_dev(p, pd->bdfn); + assert(dev); + + if (dev->nvlink.phb && dev->nvlink.pd && dev->nvlink.pd->dn) { + const struct dt_property *prop; + int n; + + /* The link count is the number of phandles in "ibm,npu" */ + prop = dt_find_property(dev->nvlink.pd->dn, "ibm,npu"); + if (!prop) + return 0; + + /* Count could vary by gpu, so find the max */ + n = prop->len / sizeof(uint32_t); + if (n > *nlinks) + *nlinks = n; + } + + return 0; +} + +static void npu2_phb_fixup_scominit(struct dt_node *dn, int links_per_gpu) +{ + uint32_t gcid = dt_get_chip_id(dn); + uint64_t val, mask; + + /* + * MRBSP settings for 2- and 3-link GPU systems. These can improve + * GPU peer-to-peer fully ordered write performance. + */ + if (links_per_gpu == 3) { + val = PPC_BIT(30) | PPC_BIT(34) | PPC_BIT(36) | PPC_BIT(37) | + PPC_BIT(44) | PPC_BIT(45); + mask = PPC_BITMASK(28,39) | PPC_BITMASK(44,47); + } else if (links_per_gpu == 2) { + val = PPC_BIT(46) | PPC_BIT(47); + mask = PPC_BITMASK(44,47); + } else + return; + + xscom_write_mask(gcid, 0x50110c0, val, mask); + xscom_write_mask(gcid, 0x50112c0, val, mask); + xscom_write_mask(gcid, 0x50114c0, val, mask); +} + +static void npu2_phb_final_fixup(struct phb *phb) +{ + int links_per_gpu = 0; + struct dt_node *np; + + pci_walk_dev(phb, NULL, npu2_dn_fixup, NULL); + + /* + * Now that the emulated devices are bound to the real ones, we can + * determine links_per_gpu and do some final init. + */ + pci_walk_dev(phb, NULL, npu2_links_per_gpu, &links_per_gpu); + dt_for_each_compatible(dt_root, np, "ibm,power9-npu") + npu2_phb_fixup_scominit(np, links_per_gpu); +} + +static void npu2_init_ioda_cache(struct npu2 *p) +{ + /* TVT */ + memset(p->tve_cache, 0, sizeof(p->tve_cache)); +} + +static int64_t npu2_ioda_reset(struct phb *phb, bool purge) +{ + struct npu2 *p = phb_to_npu2_nvlink(phb); + uint32_t i; + + if (purge) { + NPU2DBG(p, "Purging all IODA tables...\n"); + npu2_init_ioda_cache(p); + } + + /* TVT */ + npu2_ioda_sel(p, NPU2_ATS_IODA_TBL_TVT, 0, true); + for (i = 0; i < ARRAY_SIZE(p->tve_cache); i++) + out_be64(p->regs + NPU2_ATS_IODA_DATA, p->tve_cache[i]); + + return OPAL_SUCCESS; +} + +static void npu2_write_mcd(struct npu2 *p, uint64_t pcb_addr, uint64_t addr, + uint64_t size) +{ + uint64_t val; + + NPU2DBG(p, "Setting MCD addr:%llx\n", pcb_addr); + assert(is_pow2(size)); + + val = MCD_BANK_CN_VALID; + val = SETFIELD(MCD_BANK_CN_SIZE, val, (size >> 25) - 1); + val = SETFIELD(MCD_BANK_CN_ADDR, val, addr >> 25); + xscom_write(p->chip_id, pcb_addr, val); +} + +static void npu2_mcd_init(struct npu2 *p) +{ + int i; + uint64_t size, addr, gpu_min_addr, gpu_max_addr, total_size; + + /* Init memory cache directory (MCD) registers. 
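Each MCD bank covers one power-of-two sized chunk of GPU memory (in 32MB granules), so the combined GPU range is carved into at most two such chunks, one per MCD.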
*/ + phys_map_get(p->chip_id, p->gpu_map_type, NPU2_LINKS_PER_CHIP - 1, + &gpu_min_addr, NULL); + phys_map_get(p->chip_id, p->gpu_map_type, 0, &gpu_max_addr, &size); + gpu_max_addr += size; + + /* We assume GPU memory is contiguous from the first possible GPU to the + * last and that the size is the same so best to check that. */ + for (i = 0; i < NPU2_LINKS_PER_CHIP; i++) { + uint64_t tmp; + phys_map_get(p->chip_id, p->gpu_map_type, i, &addr, &tmp); + assert((addr >= gpu_min_addr) && (addr + tmp <= gpu_max_addr)); + assert(tmp == size); + } + + /* We have two MCDs, so if neccessary we can split the region covered + * across both if total_size is not a power of two. */ + total_size = gpu_max_addr - gpu_min_addr; + size = 1ull << ilog2(total_size); + + /* Allocate the biggest chunk first as we assume gpu_max_addr has the + * highest alignment. */ + addr = gpu_max_addr - size; + npu2_write_mcd(p, MCD0_BANK0_CN3, addr, size); + total_size -= size; + if (total_size) { + /* total_size was not a power of two, but the remainder should + * be if all GPUs were assigned the same size. */ + assert(is_pow2(total_size)); + size = 1ull << ilog2(total_size); + addr -= size; + assert(addr <= gpu_min_addr); + npu2_write_mcd(p, MCD1_BANK0_CN3, addr, size); + } +} + +static void npu2_hw_init(struct npu2 *p) +{ + uint64_t reg, val; + int s, b; + + npu2_ioda_reset(&p->phb_nvlink, false); + + /* Enable XTS retry mode */ + val = npu2_read(p, NPU2_XTS_CFG); + npu2_write(p, NPU2_XTS_CFG, val | NPU2_XTS_CFG_MMIOSD | NPU2_XTS_CFG_TRY_ATR_RO); + + val = npu2_read(p, NPU2_XTS_CFG2); + npu2_write(p, NPU2_XTS_CFG2, val | NPU2_XTS_CFG2_NO_FLUSH_ENA); + + /* + * There are three different ways we configure the MCD and memory map. + * 1) Old way + * Skiboot configures the MCD and puts GPUs at 4TB and below + * 2) New way with MCD + * Hostboot configures the MCD and skiboot puts GPU at 4TB and above + * 3) New way without MCD + * No one configures the MCD and skiboot puts GPU at 4TB and below + * + * 1) Will go away evenutally as it's a configuration that can + * cause an xstop or data integrity problems. We are keeping + * it around to support existing hostboot. Print error + * message if used. + * 2) Is for smaller memory configurations and will be used + * initially for GPUs on Witherspoon. Supports only to + * 512GB of memory and 4 GPUs per socket. + * 3) Is for fully populated configurations of 4TB of memory + * and 6GPUs per socket. May have performance impacts. + * + * The different configurations can be detected via the following scoms: + * 1) 0x5011c0c bit 2 = 1, 0x5011c0a bits 42:48 = 0 + * 2) 0x5011c0c bit 2 = 1, 0x5011c0a bits 42:48 = 7 + * 3) 0x5011c0c bit 2 = 0, 0x5011c0a bits 42:48 = 0 + */ + + /* Get 0x05011c0c bit 2 = 1 */ + xscom_read(p->chip_id, PB_CENT_HP_MODE_CURR, &val); + if ((val & PB_CFG_CHG_RATE_GP_MASTER) != 0) { + /* Get 0x05011c0a bits 42:48 */ + xscom_read(p->chip_id, PB_CENT_MODE, &val); + if (GETFIELD(PB_CFG_CHIP_ADDR_EXTENSION_MASK_CENT, val) == 0) { + /* 1) */ + NPU2DBG(p, "Using old memory map + MCD enabled in skiboot\n"); + NPU2ERR(p, "!!! Old firmware detected. Update hostboot for new MCD mapping !!!\n"); + p->gpu_map_type = GPU_MEM_4T_DOWN; + npu2_mcd_init(p); + } else if (GETFIELD(PB_CFG_CHIP_ADDR_EXTENSION_MASK_CENT, val) == 7) { + /* 2) */ + NPU2DBG(p, "Using small memory map + MCD enabled\n"); + p->gpu_map_type = GPU_MEM_4T_UP; + } else + NPU2ERR(p, "!!! Unsupported NPU2 configuration. 
" + "0x%llx!!!\n", val); + } else { + /* 3) */ + NPU2DBG(p, "Using large memory map + MCD disabled\n"); + p->gpu_map_type = GPU_MEM_4T_DOWN; + } + + /* Static initialization of every relaxed-ordering cfg[2] register */ + val = NPU2_RELAXED_ORDERING_CMD_CL_DMA_W | + NPU2_RELAXED_ORDERING_CMD_CL_DMA_W_HP | + NPU2_RELAXED_ORDERING_CMD_CL_DMA_INJ | + NPU2_RELAXED_ORDERING_CMD_PR_DMA_INJ | + NPU2_RELAXED_ORDERING_CMD_DMA_PR_W | + NPU2_RELAXED_ORDERING_CMD_CL_RD_NC_F0 | + NPU2_RELAXED_ORDERING_SOURCE4_RDENA; + + for (s = NPU2_STACK_STCK_0; s <= NPU2_STACK_STCK_2; s++) { + for (b = NPU2_BLOCK_SM_0; b <= NPU2_BLOCK_SM_3; b++) { + reg = NPU2_REG_OFFSET(s, b, NPU2_RELAXED_ORDERING_CFG(2)); + npu2_write(p, reg, val); + } + } +} + +static int64_t npu2_map_pe_dma_window_real(struct phb *phb, + uint64_t pe_num, + uint16_t window_id, + uint64_t pci_start_addr __unused, + uint64_t pci_mem_size __unused) +{ + struct npu2 *p = phb_to_npu2_nvlink(phb); + uint64_t tve; + + /* Sanity check. Each PE has one corresponding TVE */ + if (pe_num >= NPU2_MAX_PE_NUM || + window_id != pe_num) + return OPAL_PARAMETER; + + if (pci_mem_size) { + /* GPUs need to be able to access the MMIO memory space as well. + * On POWER9 this is above the top of ram so disable the TVT + * range check allowing access to all memory addresses. */ + tve = 0; + } else { + /* Disable */ + tve = PPC_BIT(51); + } + + npu2_ioda_sel(p, NPU2_ATS_IODA_TBL_TVT, window_id, false); + out_be64(p->regs + NPU2_ATS_IODA_DATA, tve); + p->tve_cache[window_id] = tve; + + return OPAL_SUCCESS; +} + +static int64_t npu2_map_pe_dma_window(struct phb *phb, + uint64_t pe_num, + uint16_t window_id, + uint16_t tce_levels, + uint64_t tce_table_addr, + uint64_t tce_table_size, + uint64_t tce_page_size) +{ + struct npu2 *p = phb_to_npu2_nvlink(phb); + uint64_t tts_encoded; + uint64_t data64 = 0; + + /* Sanity check. Each PE has one corresponding TVE */ + if (pe_num >= NPU2_MAX_PE_NUM || + window_id != pe_num) + return OPAL_PARAMETER; + + /* + * Special condition, zero TCE table size used to disable + * the TVE. 
+ */ + if (!tce_table_size) { + npu2_ioda_sel(p, NPU2_ATS_IODA_TBL_TVT, window_id, false); + out_be64(p->regs + NPU2_ATS_IODA_DATA, 0ul); + p->tve_cache[window_id] = 0ul; + return OPAL_SUCCESS; + } + + /* Additional arguments validation */ + if (tce_levels < 1 || + tce_levels > 4 || + !is_pow2(tce_table_size) || + tce_table_size < 0x1000) + return OPAL_PARAMETER; + + /* TCE table size */ + data64 = SETFIELD(NPU2_ATS_IODA_TBL_TVT_TTA, 0ul, tce_table_addr >> 12); + tts_encoded = ilog2(tce_table_size) - 11; + if (tts_encoded > 39) + return OPAL_PARAMETER; + data64 = SETFIELD(NPU2_ATS_IODA_TBL_TVT_SIZE, data64, tts_encoded); + + /* TCE page size */ + switch (tce_page_size) { + case 0x10000: /* 64K */ + data64 = SETFIELD(NPU2_ATS_IODA_TBL_TVT_PSIZE, data64, 5); + break; + case 0x1000000: /* 16M */ + data64 = SETFIELD(NPU2_ATS_IODA_TBL_TVT_PSIZE, data64, 13); + break; + case 0x10000000: /* 256M */ + data64 = SETFIELD(NPU2_ATS_IODA_TBL_TVT_PSIZE, data64, 17); + break; + case 0x1000: /* 4K */ + default: + data64 = SETFIELD(NPU2_ATS_IODA_TBL_TVT_PSIZE, data64, 1); + } + + /* Number of levels */ + data64 = SETFIELD(NPU2_ATS_IODA_TBL_TVT_LEVEL, data64, tce_levels - 1); + + /* Update to hardware */ + npu2_ioda_sel(p, NPU2_ATS_IODA_TBL_TVT, window_id, false); + out_be64(p->regs + NPU2_ATS_IODA_DATA, data64); + p->tve_cache[window_id] = data64; + + return OPAL_SUCCESS; +} + +static int64_t npu2_set_pe(struct phb *phb, + uint64_t pe_num, + uint64_t bdfn, + uint8_t bcompare, + uint8_t dcompare, + uint8_t fcompare, + uint8_t action) +{ + struct npu2 *p; + struct npu2_dev *dev; + uint64_t reg, val; + + /* Sanity check */ + if (action != OPAL_MAP_PE && action != OPAL_UNMAP_PE) + return OPAL_PARAMETER; + if (pe_num >= NPU2_MAX_PE_NUM) + return OPAL_PARAMETER; + if (bdfn >> 8) + return OPAL_PARAMETER; + if (bcompare != OpalPciBusAll || + dcompare != OPAL_COMPARE_RID_DEVICE_NUMBER || + fcompare != OPAL_COMPARE_RID_FUNCTION_NUMBER) + return OPAL_UNSUPPORTED; + if (phb->phb_type != phb_type_npu_v2) + return OPAL_PARAMETER; + + p = phb_to_npu2_nvlink(phb); + if (!p) + return OPAL_PARAMETER; + + dev = npu2_bdf_to_dev(p, bdfn); + if (!dev) + return OPAL_PARAMETER; + + val = NPU2_CQ_BRICK_BDF2PE_MAP_ENABLE; + val = SETFIELD(NPU2_CQ_BRICK_BDF2PE_MAP_PE, val, pe_num); + val = SETFIELD(NPU2_CQ_BRICK_BDF2PE_MAP_BDF, val, dev->nvlink.gpu_bdfn); + + if (!NPU2DEV_BRICK(dev)) + reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_0 + dev->brick_index/2, + NPU2_BLOCK_CTL, NPU2_CQ_BRICK0_BDF2PE_MAP0); + else + reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_0 + dev->brick_index/2, + NPU2_BLOCK_CTL, NPU2_CQ_BRICK1_BDF2PE_MAP0); + + npu2_write(p, reg, val); + val = NPU2_MISC_BRICK_BDF2PE_MAP_ENABLE; + val = SETFIELD(NPU2_MISC_BRICK_BDF2PE_MAP_PE, val, pe_num); + val = SETFIELD(NPU2_MISC_BRICK_BDF2PE_MAP_BDF, val, dev->nvlink.gpu_bdfn); + reg = NPU2_REG_OFFSET(NPU2_STACK_MISC, NPU2_BLOCK_MISC, + NPU2_MISC_BRICK0_BDF2PE_MAP0 + (dev->brick_index * 0x18)); + npu2_write(p, reg, val); + + return OPAL_SUCCESS; +} + +static int64_t npu2_get_link_state(struct pci_slot *slot __unused, uint8_t *val) +{ + /* + * As we're emulating all PCI stuff, the link bandwidth + * isn't big deal anyway. 
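Just report the emulated link as up at x1 width.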
+ */ + *val = OPAL_SHPC_LINK_UP_x1; + return OPAL_SUCCESS; +} + +static int64_t npu2_get_power_state(struct pci_slot *slot __unused, uint8_t *val) +{ + *val = PCI_SLOT_POWER_ON; + return OPAL_SUCCESS; +} + +static int64_t npu2_hreset(struct pci_slot *slot __unused) +{ + struct npu2 *p; + int i; + struct npu2_dev *ndev; + + p = phb_to_npu2_nvlink(slot->phb); + NPU2INF(p, "Hreset PHB state\n"); + + for (i = 0; i < p->total_devices; i++) { + ndev = &p->devices[i]; + if (ndev) { + NPU2DEVINF(ndev, "Resetting device\n"); + reset_ntl(ndev); + } + } + return purge_l2_l3_caches(); +} + +static int64_t npu2_freset(struct pci_slot *slot __unused) +{ + return OPAL_SUCCESS; +} + +static int64_t npu2_creset(struct pci_slot *slot) +{ + struct npu2 *p; + int i; + struct npu2_dev *ndev; + + p = phb_to_npu2_nvlink(slot->phb); + NPU2INF(p, "Creset PHB state\n"); + + for (i = 0; i < p->total_devices; i++) { + ndev = &p->devices[i]; + if (ndev) { + NPU2DEVINF(ndev, "Resetting device\n"); + reset_ntl(ndev); + } + } + return OPAL_SUCCESS; +} + +static struct pci_slot *npu2_slot_create(struct phb *phb) +{ + struct pci_slot *slot; + + slot = pci_slot_alloc(phb, NULL); + if (!slot) + return slot; + + /* Elementary functions */ + slot->ops.get_presence_state = NULL; + slot->ops.get_link_state = npu2_get_link_state; + slot->ops.get_power_state = npu2_get_power_state; + slot->ops.get_attention_state = NULL; + slot->ops.get_latch_state = NULL; + slot->ops.set_power_state = NULL; + slot->ops.set_attention_state = NULL; + + slot->ops.prepare_link_change = NULL; + slot->ops.poll_link = NULL; + slot->ops.hreset = npu2_hreset; + slot->ops.freset = npu2_freset; + slot->ops.creset = npu2_creset; + + return slot; +} + +int64_t npu2_freeze_status(struct phb *phb __unused, + uint64_t pe_number __unused, + uint8_t *freeze_state, + uint16_t *pci_error_type, + uint16_t *severity) +{ + /* + * FIXME: When it's called by skiboot PCI config accessor, + * the PE number is fixed to 0, which is incorrect. We need + * introduce another PHB callback to translate it. For now, + * it keeps the skiboot PCI enumeration going. 
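Until then, unconditionally report an unfrozen, error-free state.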
+ */ + *freeze_state = OPAL_EEH_STOPPED_NOT_FROZEN; + *pci_error_type = OPAL_EEH_NO_ERROR; + if (severity) + *severity = OPAL_EEH_SEV_NO_ERROR; + + return OPAL_SUCCESS; +} + +static int64_t npu2_eeh_next_error(struct phb *phb, + uint64_t *first_frozen_pe, + uint16_t *pci_error_type, + uint16_t *severity) +{ + struct npu2 *p = phb_to_npu2_nvlink(phb); + int i; + uint64_t result = 0; + + if (!first_frozen_pe || !pci_error_type || !severity) + return OPAL_PARAMETER; + + *first_frozen_pe = -1; + *pci_error_type = OPAL_EEH_NO_ERROR; + *severity = OPAL_EEH_SEV_NO_ERROR; + + for (i = 0; i < NPU2_MAX_PE_NUM; i++) { + result = npu2_read(p, NPU2_MISC_PESTB(i)); + if (result > 0) { + *first_frozen_pe = i; + *pci_error_type = OPAL_EEH_PE_ERROR; + *severity = OPAL_EEH_SEV_PE_ER; + break; + } + } + + return OPAL_SUCCESS; +} + +static int64_t npu2_tce_kill(struct phb *phb, uint32_t kill_type, + uint64_t pe_number, uint32_t tce_size, + uint64_t dma_addr, uint32_t npages) +{ + struct npu2 *npu = phb_to_npu2_nvlink(phb); + uint32_t tce_page_size; + uint64_t val; + + if (pe_number > NPU2_MAX_PE_NUM) + return OPAL_PARAMETER; + + sync(); + switch(kill_type) { + case OPAL_PCI_TCE_KILL_PAGES: + tce_page_size = 1ULL << ( + 11 + GETFIELD(npu->tve_cache[pe_number], + NPU2_ATS_IODA_TBL_TVT_PSIZE)); + if (tce_page_size != tce_size) { + NPU2ERR(npu, "npu2_tce_kill: Unexpected TCE size (got 0x%x expected 0x%x)\n", + tce_size, tce_page_size); + return OPAL_PARAMETER; + } + + if (npages < 128) { + while (npages--) { + val = SETFIELD(NPU2_ATS_TCE_KILL_PENUM, dma_addr, pe_number); + npu2_write(npu, NPU2_ATS_TCE_KILL, NPU2_ATS_TCE_KILL_ONE | val); + dma_addr += tce_size; + } + break; + } + /* + * For too many TCEs do not bother with the loop above and simply + * flush everything, going to be lot faster. + */ + /* Fall through */ + case OPAL_PCI_TCE_KILL_PE: + /* + * NPU2 doesn't support killing a PE so fall through + * and do a kill all instead. 
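A kill-all invalidates every TCE cached by this NPU, which is a harmless superset of what was asked for.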
+ */ + case OPAL_PCI_TCE_KILL_ALL: + npu2_write(npu, NPU2_ATS_TCE_KILL, NPU2_ATS_TCE_KILL_ALL); + break; + default: + return OPAL_PARAMETER; + } + + return OPAL_SUCCESS; +} + +static const struct phb_ops npu_ops = { + .cfg_read8 = npu2_cfg_read8, + .cfg_read16 = npu2_cfg_read16, + .cfg_read32 = npu2_cfg_read32, + .cfg_write8 = npu2_cfg_write8, + .cfg_write16 = npu2_cfg_write16, + .cfg_write32 = npu2_cfg_write32, + .device_init = NULL, + .phb_final_fixup = npu2_phb_final_fixup, + .ioda_reset = npu2_ioda_reset, + .papr_errinjct_reset = NULL, + .pci_reinit = NULL, + .set_phb_mem_window = NULL, + .phb_mmio_enable = NULL, + .map_pe_mmio_window = NULL, + .map_pe_dma_window = npu2_map_pe_dma_window, + .map_pe_dma_window_real = npu2_map_pe_dma_window_real, + .pci_msi_eoi = NULL, + .set_xive_pe = NULL, + .get_msi_32 = NULL, + .get_msi_64 = NULL, + .set_pe = npu2_set_pe, + .set_peltv = NULL, + .eeh_freeze_status = npu2_freeze_status, + .eeh_freeze_clear = NULL, + .eeh_freeze_set = NULL, + .next_error = npu2_eeh_next_error, + .err_inject = NULL, + .get_diag_data2 = NULL, + .set_capi_mode = NULL, + .set_capp_recovery = NULL, + .tce_kill = npu2_tce_kill, +}; + +static void assign_mmio_bars(uint64_t gcid, uint32_t scom, uint64_t reg[2], uint64_t mm_win[2]) +{ + uint32_t i; + struct npu2_bar *bar; + struct npu2_bar npu2_bars[] = { + /* NPU_REGS must be first in this list */ + { .type = NPU_REGS, .index = 0, + .reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_0, 0, NPU2_PHY_BAR), + .flags = NPU2_BAR_FLAG_ENABLED }, + { .type = NPU_PHY, .index = 0, + .reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_1, 0, NPU2_PHY_BAR), + .flags = NPU2_BAR_FLAG_ENABLED }, + { .type = NPU_PHY, .index = 1, + .reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_2, 0, NPU2_PHY_BAR), + .flags = NPU2_BAR_FLAG_ENABLED }, + { .type = NPU_NTL, .index = 0, + .reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_0, 0, NPU2_NTL0_BAR) }, + { .type = NPU_NTL, .index = 1, + .reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_0, 0, NPU2_NTL1_BAR) }, + { .type = NPU_NTL, .index = 2, + .reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_1, 0, NPU2_NTL0_BAR) }, + { .type = NPU_NTL, .index = 3, + .reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_1, 0, NPU2_NTL1_BAR) }, + { .type = NPU_NTL, .index = 4, + .reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_2, 0, NPU2_NTL0_BAR) }, + { .type = NPU_NTL, .index = 5, + .reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_2, 0, NPU2_NTL1_BAR) }, + { .type = NPU_GENID, .index = 0, + .reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_0, 0, NPU2_GENID_BAR) }, + { .type = NPU_GENID, .index = 1, + .reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_1, 0, NPU2_GENID_BAR) }, + { .type = NPU_GENID, .index = 2, + .reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_2, 0, NPU2_GENID_BAR) }, + }; + + for (i = 0; i < ARRAY_SIZE(npu2_bars); i++) { + bar = &npu2_bars[i]; + npu2_get_bar(gcid, bar); + npu2_write_bar(NULL, bar, gcid, scom); + } + + /* Global MMIO BAR */ + reg[0] = npu2_bars[0].base; + reg[1] = npu2_bars[0].size; + + /* NTL and GENID BARs are exposed to kernel via the mm + * window */ + mm_win[0] = npu2_bars[3].base; + mm_win[1] = npu2_bars[ARRAY_SIZE(npu2_bars) - 1].base + + npu2_bars[ARRAY_SIZE(npu2_bars) - 1].size - + mm_win[0]; +} + +/* + * Set up NPU for NVLink and create PCI root device node + * accordingly. + */ +int npu2_nvlink_init_npu(struct npu2 *npu) +{ + struct dt_node *np; + uint64_t reg[2], mm_win[2], val, mask; + + /* TODO: Clean this up with register names, etc. when we get + * time. 
This just turns NVLink mode on in each brick and should + * get replaced with a patch from ajd once we've worked out how + * things are going to work there. + * + * Obviously if the year is now 2020 that didn't happen and you + * should fix this :-) */ + + val = PPC_BIT(58); + mask = PPC_BIT(58) | /* CONFIG_NVLINK_MODE */ + PPC_BIT(40); /* CONFIG_ENABLE_SNARF_CPM */ + + /* + * V100 GPUs are known to violate NVLink2 protocol if some GPU memory + * mapped by a CPU was also "linear-block" mapped by a GPU. When this + * happens, it breaks the NPU2 cache coherency state machine and + * it throws machine checkstop. Disabling snarfing fixes this so let's + * disable it by default. + */ + if (nvram_query_eq_dangerous("opal-npu2-snarf-cpm", "enable")) { + prlog(PR_WARNING, "NPU2#%d: enabling Probe.I.MO snarfing, a bad GPU driver may crash the system!\n", + npu->index); + val |= PPC_BIT(40); /* CONFIG_ENABLE_SNARF_CPM */ + } + + xscom_write_mask(npu->chip_id, NPU_STCK0_CS_SM0_MISC_CONFIG0, + val, mask); + xscom_write_mask(npu->chip_id, NPU_STCK0_CS_SM1_MISC_CONFIG0, + val, mask); + xscom_write_mask(npu->chip_id, NPU_STCK0_CS_SM2_MISC_CONFIG0, + val, mask); + xscom_write_mask(npu->chip_id, NPU_STCK0_CS_SM3_MISC_CONFIG0, + val, mask); + xscom_write_mask(npu->chip_id, NPU_STCK1_CS_SM0_MISC_CONFIG0, + val, mask); + xscom_write_mask(npu->chip_id, NPU_STCK1_CS_SM1_MISC_CONFIG0, + val, mask); + xscom_write_mask(npu->chip_id, NPU_STCK1_CS_SM2_MISC_CONFIG0, + val, mask); + xscom_write_mask(npu->chip_id, NPU_STCK1_CS_SM3_MISC_CONFIG0, + val, mask); + xscom_write_mask(npu->chip_id, NPU_STCK2_CS_SM0_MISC_CONFIG0, + val, mask); + xscom_write_mask(npu->chip_id, NPU_STCK2_CS_SM1_MISC_CONFIG0, + val, mask); + xscom_write_mask(npu->chip_id, NPU_STCK2_CS_SM2_MISC_CONFIG0, + val, mask); + xscom_write_mask(npu->chip_id, NPU_STCK2_CS_SM3_MISC_CONFIG0, + val, mask); + + xscom_write_mask(npu->chip_id, 0x50110c0, PPC_BIT(53), PPC_BIT(53)); + xscom_write_mask(npu->chip_id, 0x50112c0, PPC_BIT(53), PPC_BIT(53)); + xscom_write_mask(npu->chip_id, 0x50114c0, PPC_BIT(53), PPC_BIT(53)); + xscom_write_mask(npu->chip_id, 0x50110f1, PPC_BIT(41), PPC_BIT(41)); + xscom_write_mask(npu->chip_id, 0x50112f1, PPC_BIT(41), PPC_BIT(41)); + xscom_write_mask(npu->chip_id, 0x50114f1, PPC_BIT(41), PPC_BIT(41)); + + val = NPU2_NTL_MISC_CFG2_BRICK_ENABLE | + NPU2_NTL_MISC_CFG2_NDL_TX_PARITY_ENA | + NPU2_NTL_MISC_CFG2_NDL_PRI_PARITY_ENA | + NPU2_NTL_MISC_CFG2_RCV_CREDIT_OVERFLOW_ENA; + xscom_write_mask(npu->chip_id, 0x5011110, val, val); + xscom_write_mask(npu->chip_id, 0x5011130, val, val); + xscom_write_mask(npu->chip_id, 0x5011310, val, val); + xscom_write_mask(npu->chip_id, 0x5011330, val, val); + xscom_write_mask(npu->chip_id, 0x5011510, val, val); + xscom_write_mask(npu->chip_id, 0x5011530, val, val); + + val = PPC_BIT(6) | PPC_BIT(7) | PPC_BIT(11); + xscom_write_mask(npu->chip_id, 0x5011009, val, PPC_BITMASK(6,11)); + xscom_write_mask(npu->chip_id, 0x5011039, val, PPC_BITMASK(6,11)); + xscom_write_mask(npu->chip_id, 0x5011069, val, PPC_BITMASK(6,11)); + xscom_write_mask(npu->chip_id, 0x5011099, val, PPC_BITMASK(6,11)); + xscom_write_mask(npu->chip_id, 0x5011209, val, PPC_BITMASK(6,11)); + xscom_write_mask(npu->chip_id, 0x5011239, val, PPC_BITMASK(6,11)); + xscom_write_mask(npu->chip_id, 0x5011269, val, PPC_BITMASK(6,11)); + xscom_write_mask(npu->chip_id, 0x5011299, val, PPC_BITMASK(6,11)); + xscom_write_mask(npu->chip_id, 0x5011409, val, PPC_BITMASK(6,11)); + xscom_write_mask(npu->chip_id, 0x5011439, val, PPC_BITMASK(6,11)); + 
xscom_write_mask(npu->chip_id, 0x5011469, val, PPC_BITMASK(6,11)); + xscom_write_mask(npu->chip_id, 0x5011499, val, PPC_BITMASK(6,11)); + + /* Reassign the BARs */ + assign_mmio_bars(npu->chip_id, npu->xscom_base, reg, mm_win); + npu->regs = (void *)reg[0]; + npu->mm_base = mm_win[0]; + npu->mm_size = mm_win[1]; + + if (reg[0] && reg[1]) + prlog(PR_INFO, " Global MMIO BAR: %016llx (%lldMB)\n", + reg[0], reg[1] >> 20); + else + prlog(PR_ERR, " Global MMIO BAR: Disabled\n"); + + /* Populate PCI root device node */ + np = dt_new_addr(dt_root, "pciex", reg[0]); + assert(np); + dt_add_property_strings(np, + "compatible", + "ibm,power9-npu-pciex", + "ibm,ioda2-npu2-phb"); + dt_add_property_strings(np, "device_type", "pciex"); + dt_add_property(np, "reg", reg, sizeof(reg)); + dt_add_property_cells(np, "ibm,phb-index", npu2_get_phb_index(0)); + dt_add_property_cells(np, "ibm,npu-index", npu->index); + dt_add_property_cells(np, "ibm,chip-id", npu->chip_id); + dt_add_property_cells(np, "ibm,xscom-base", npu->xscom_base); + dt_add_property_cells(np, "ibm,npcq", npu->dt_node->phandle); + dt_add_property_cells(np, "ibm,links", npu->total_devices); + dt_add_property(np, "ibm,mmio-window", mm_win, sizeof(mm_win)); + dt_add_property_cells(np, "ibm,phb-diag-data-size", 0); + + /* Disable fast reboot - not currently supported */ + disable_fast_reboot("NVLink device enabled"); + + npu2_nvlink_create_phb(npu, np); + + return 0; +} + +static uint32_t npu2_populate_pcie_cap(struct npu2_dev *dev, + uint32_t start, + uint32_t prev_cap) +{ + struct pci_virt_device *pvd = dev->nvlink.pvd; + uint32_t val; + + /* Add capability list */ + PCI_VIRT_CFG_INIT_RO(pvd, prev_cap, 1, start); + PCI_VIRT_CFG_INIT_RO(pvd, start, 1, PCI_CFG_CAP_ID_EXP); + + /* 0x00 - ID/PCIE capability */ + val = PCI_CFG_CAP_ID_EXP; + val |= ((0x2 << 16) | (PCIE_TYPE_ENDPOINT << 20)); + PCI_VIRT_CFG_INIT_RO(pvd, start, 4, val); + + /* 0x04 - Device capability + * + * We should support FLR. 
Otherwise, it might have + * problem passing it through to userland via Linux + * VFIO infrastructure + */ + val = ((PCIE_MPSS_128) | + (PCIE_PHANTOM_NONE << 3) | + (PCIE_L0SL_MAX_NO_LIMIT << 6) | + (PCIE_L1L_MAX_NO_LIMIT << 9) | + (PCICAP_EXP_DEVCAP_FUNC_RESET)); + PCI_VIRT_CFG_INIT_RO(pvd, start + PCICAP_EXP_DEVCAP, 4, val); + + pci_virt_add_filter(pvd, start + PCICAP_EXP_DEVCTL, 2, + PCI_REG_FLAG_WRITE, + npu2_dev_cfg_exp_devcap, NULL); + + /* 0x08 - Device control and status */ + PCI_VIRT_CFG_INIT(pvd, start + PCICAP_EXP_DEVCTL, 4, 0x00002810, + 0xffff0000, 0x000f0000); + + /* 0x0c - Link capability */ + val = (PCIE_LSPEED_VECBIT_2 | (PCIE_LWIDTH_1X << 4)); + PCI_VIRT_CFG_INIT_RO(pvd, start + PCICAP_EXP_LCAP, 4, val); + + /* 0x10 - Link control and status */ + PCI_VIRT_CFG_INIT(pvd, start + PCICAP_EXP_LCTL, 4, 0x00130000, + 0xfffff000, 0xc0000000); + + /* 0x14 - Slot capability */ + PCI_VIRT_CFG_INIT_RO(pvd, start + PCICAP_EXP_SLOTCAP, 4, 0x00000000); + + /* 0x18 - Slot control and status */ + PCI_VIRT_CFG_INIT_RO(pvd, start + PCICAP_EXP_SLOTCTL, 4, 0x00000000); + + /* 0x1c - Root control and capability */ + PCI_VIRT_CFG_INIT(pvd, start + PCICAP_EXP_RC, 4, 0x00000000, + 0xffffffe0, 0x00000000); + + /* 0x20 - Root status */ + PCI_VIRT_CFG_INIT(pvd, start + PCICAP_EXP_RSTAT, 4, 0x00000000, + 0xffffffff, 0x00010000); + + /* 0x24 - Device capability 2 */ + PCI_VIRT_CFG_INIT_RO(pvd, start + PCIECAP_EXP_DCAP2, 4, 0x00000000); + + /* 0x28 - Device Control and status 2 */ + PCI_VIRT_CFG_INIT(pvd, start + PCICAP_EXP_DCTL2, 4, 0x00070000, + 0xffff0000, 0x00000000); + + /* 0x2c - Link capability 2 */ + PCI_VIRT_CFG_INIT_RO(pvd, start + PCICAP_EXP_LCAP2, 4, 0x00000007); + + /* 0x30 - Link control and status 2 */ + PCI_VIRT_CFG_INIT(pvd, start + PCICAP_EXP_LCTL2, 4, 0x00000003, + 0xffff0000, 0x00200000); + + /* 0x34 - Slot capability 2 */ + PCI_VIRT_CFG_INIT_RO(pvd, start + PCICAP_EXP_SCAP2, 4, 0x00000000); + + /* 0x38 - Slot control and status 2 */ + PCI_VIRT_CFG_INIT_RO(pvd, start + PCICAP_EXP_SCTL2, 4, 0x00000000); + + return start + PCICAP_EXP_SCTL2 + 8; +} + +static uint32_t npu2_populate_vendor_cap(struct npu2_dev *dev, + uint32_t start, + uint32_t prev_cap) +{ + struct pci_virt_device *pvd = dev->nvlink.pvd; + + /* Capbility list */ + PCI_VIRT_CFG_INIT_RO(pvd, prev_cap, 1, start); + PCI_VIRT_CFG_INIT_RO(pvd, start, 1, PCI_CFG_CAP_ID_VENDOR); + + /* Length and version */ + PCI_VIRT_CFG_INIT_RO(pvd, start + 2, 1, VENDOR_CAP_LEN); + PCI_VIRT_CFG_INIT_RO(pvd, start + 3, 1, VENDOR_CAP_VERSION); + + /* + * Defaults when the trap can't handle the read/write (eg. due + * to reading/writing less than 4 bytes). 
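The two dwords at +4 and +8 are trapped below for the NVLink PHY procedure registers, and the byte at +0xc exposes the link index.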
+ */ + PCI_VIRT_CFG_INIT_RO(pvd, start + 4, 4, 0); + PCI_VIRT_CFG_INIT_RO(pvd, start + 8, 4, 0); + + /* Add NVLink2 PHY procedures trap */ + pci_virt_add_filter(pvd, start + 4, 8, + PCI_REG_FLAG_READ | PCI_REG_FLAG_WRITE, + npu2_dev_procedure, + NULL); + + /* Link index */ + PCI_VIRT_CFG_INIT_RO(pvd, start + 0xc, 1, dev->link_index); + + return start + VENDOR_CAP_LEN; +} + +static void npu2_populate_cfg(struct npu2_dev *dev) +{ + struct pci_virt_device *pvd = dev->nvlink.pvd; + struct npu2_pcie_bar *bar; + uint32_t pos; + + /* 0x00 - Vendor/Device ID */ + PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_VENDOR_ID, 4, 0x04ea1014); + + /* 0x04 - Command/Status */ + PCI_VIRT_CFG_INIT(pvd, PCI_CFG_CMD, 4, 0x00100000, 0xffb802b8, + 0xf9000000); + + pci_virt_add_filter(pvd, PCI_CFG_CMD, 1, PCI_REG_FLAG_WRITE, + npu2_cfg_write_cmd, NULL); + + /* 0x08 - Rev/Class/Cache */ + PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_REV_ID, 4, 0x06800101); + + /* 0x0c - CLS/Latency Timer/Header/BIST */ + PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_CACHE_LINE_SIZE, 4, 0x00800000); + + /* 0x10/14 - BAR#0, NTL BAR */ + bar = &dev->bars[0]; + PCI_VIRT_CFG_INIT(pvd, PCI_CFG_BAR0, 4, + (bar->npu2_bar.base & 0xfffffff0) | (bar->flags & 0xF), + 0x0000000f, 0x00000000); + PCI_VIRT_CFG_INIT(pvd, PCI_CFG_BAR1, 4, (bar->npu2_bar.base >> 32), + 0x00000000, 0x00000000); + pci_virt_add_filter(pvd, PCI_CFG_BAR0, 8, + PCI_REG_FLAG_READ | PCI_REG_FLAG_WRITE, + npu2_dev_cfg_bar, bar); + + /* 0x18/1c - BAR#1, GENID BAR */ + bar = &dev->bars[1]; + if (NPU2DEV_BRICK(dev) == 0) + PCI_VIRT_CFG_INIT(pvd, PCI_CFG_BAR2, 4, (bar->npu2_bar.base & 0xfffffff0) | + (bar->flags & 0xF), + 0x0000000f, 0x00000000); + else + /* Brick 1 gets the upper portion of the generation id register */ + PCI_VIRT_CFG_INIT(pvd, PCI_CFG_BAR2, 4, ((bar->npu2_bar.base + 0x10000) & 0xfffffff0) | + (bar->flags & 0xF), + 0x0000000f, 0x00000000); + + PCI_VIRT_CFG_INIT(pvd, PCI_CFG_BAR3, 4, (bar->npu2_bar.base >> 32), 0x00000000, + 0x00000000); + pci_virt_add_filter(pvd, PCI_CFG_BAR2, 8, + PCI_REG_FLAG_READ | PCI_REG_FLAG_WRITE, + npu2_dev_cfg_bar, bar); + + /* 0x20/0x24 - BARs, disabled */ + PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_BAR4, 4, 0x00000000); + PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_BAR5, 4, 0x00000000); + + /* 0x28 - Cardbus CIS pointer */ + PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_CARDBUS_CIS, 4, 0x00000000); + + /* 0x2c - Subsystem ID */ + PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_SUBSYS_VENDOR_ID, 4, 0x00000000); + + /* 0x30 - ROM BAR, zero sized */ + PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_ROMBAR, 4, 0xffffffff); + + /* 0x34 - PCI Capability */ + PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_CAP, 4, 0x00000000); + + /* 0x38 - Reserved */ + PCI_VIRT_CFG_INIT_RO(pvd, 0x38, 4, 0x00000000); + + /* 0x3c - INT line/pin/Minimal grant/Maximal latency */ + PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_INT_LINE, 4, 0x00000100); /* INT A */ + + /* PCIE and vendor specific capability */ + pos = npu2_populate_pcie_cap(dev, 0x40, PCI_CFG_CAP); + pos = npu2_populate_vendor_cap(dev, pos, 0x41); + PCI_VIRT_CFG_INIT_RO(pvd, pos + 1, 1, 0); +} + +static uint32_t npu_allocate_bdfn(struct npu2 *p, uint32_t group) +{ + int i; + int bdfn = (group << 3); + + for (i = 0; i < p->total_devices; i++) { + if ((p->devices[i].bdfn & 0xf8) == (bdfn & 0xf8)) + bdfn++; + } + + return bdfn; +} + +static void npu2_populate_devices(struct npu2 *p, + struct dt_node *dn) +{ + struct npu2_dev *dev; + struct dt_node *npu2_dn, *link; + uint32_t npu_phandle, index = 0; + int stack; + + /* + * Get the npu node which has the links which we expand here + * into pci like devices attached 
to our emulated phb. + */ + npu_phandle = dt_prop_get_u32(dn, "ibm,npcq"); + npu2_dn = dt_find_by_phandle(dt_root, npu_phandle); + assert(npu2_dn); + + /* Walk the link@x nodes to initialize devices */ + p->total_devices = 0; + p->phb_nvlink.scan_map = 0; + dt_for_each_compatible(npu2_dn, link, "ibm,npu-link") { + uint32_t group_id; + struct npu2_bar *npu2_bar; + + dev = &p->devices[index]; + dev->type = NPU2_DEV_TYPE_NVLINK; + dev->npu = p; + dev->dt_node = link; + dev->link_index = dt_prop_get_u32(link, "ibm,npu-link-index"); + dev->brick_index = dev->link_index; + + group_id = dt_prop_get_u32(link, "ibm,npu-group-id"); + dev->bdfn = npu_allocate_bdfn(p, group_id); + + /* This must be done after calling + * npu_allocate_bdfn() */ + p->total_devices++; + p->phb_nvlink.scan_map |= 0x1 << ((dev->bdfn & 0xf8) >> 3); + + dev->pl_xscom_base = dt_prop_get_u64(link, "ibm,npu-phy"); + dev->lane_mask = dt_prop_get_u32(link, "ibm,npu-lane-mask"); + + /* Populate BARs. BAR0/1 is the NTL bar. */ + stack = NPU2_STACK_STCK_0 + NPU2DEV_STACK(dev); + npu2_bar = &dev->bars[0].npu2_bar; + npu2_bar->type = NPU_NTL; + npu2_bar->index = dev->brick_index; + npu2_bar->reg = NPU2_REG_OFFSET(stack, 0, NPU2DEV_BRICK(dev) == 0 ? + NPU2_NTL0_BAR : NPU2_NTL1_BAR); + npu2_get_bar(p->chip_id, npu2_bar); + + dev->bars[0].flags = PCI_CFG_BAR_TYPE_MEM | PCI_CFG_BAR_MEM64; + + /* BAR2/3 is the GENID bar. */ + npu2_bar = &dev->bars[1].npu2_bar; + npu2_bar->type = NPU_GENID; + npu2_bar->index = NPU2DEV_STACK(dev); + npu2_bar->reg = NPU2_REG_OFFSET(stack, 0, NPU2_GENID_BAR); + npu2_get_bar(p->chip_id, npu2_bar); + + /* The GENID is a single physical BAR that we split + * for each emulated device */ + npu2_bar->size = 0x10000; + if (NPU2DEV_BRICK(dev)) + npu2_bar->base += 0x10000; + dev->bars[1].flags = PCI_CFG_BAR_TYPE_MEM | PCI_CFG_BAR_MEM64; + + /* Initialize PCI virtual device */ + dev->nvlink.pvd = pci_virt_add_device(&p->phb_nvlink, dev->bdfn, 0x100, dev); + if (dev->nvlink.pvd) + npu2_populate_cfg(dev); + + index++; + } +} + +static void npu2_add_interrupt_map(struct npu2 *p, + struct dt_node *dn) +{ + struct dt_node *npu2_dn, *link, *phb_dn; + uint32_t npu2_phandle, index = 0, i; + uint32_t icsp = get_ics_phandle(); + uint32_t *map; + size_t map_size; + uint32_t mask[] = {0xff00, 0x0, 0x0, 0x7}; + + assert(p->phb_nvlink.dt_node); + phb_dn = p->phb_nvlink.dt_node; + + npu2_phandle = dt_prop_get_u32(dn, "ibm,npcq"); + npu2_dn = dt_find_by_phandle(dt_root, npu2_phandle); + assert(npu2_dn); + map_size = 7 * sizeof(*map) * p->total_devices; + map = malloc(map_size); + index = 0; + dt_for_each_compatible(npu2_dn, link, "ibm,npu-link") { + i = index * 7; + map[i + 0] = (p->devices[index].bdfn << 8); + map[i + 1] = 0; + map[i + 2] = 0; + + map[i + 3] = 1; /* INT A */ + map[i + 4] = icsp; /* interrupt-parent */ + map[i + 5] = p->base_lsi + (index * 2) + 1; /* NDL No-Stall Event */ + map[i + 6] = 0; /* 0 = EDGE, 1 = LEVEL. */ + index++; + } + dt_add_property(phb_dn, "interrupt-map", map, map_size); + free(map); + dt_add_property(phb_dn, "interrupt-map-mask", mask, sizeof(mask)); +} + +static void npu2_add_phb_properties(struct npu2 *p) +{ + struct dt_node *np = p->phb_nvlink.dt_node; + uint32_t icsp = get_ics_phandle(); + uint64_t mm_base, mm_size; + + /* + * Add various properties that HB doesn't have to + * add, some of them simply because they result from + * policy decisions made in skiboot rather than in HB + * such as the MMIO windows going to PCI, interrupts, + * etc. 
+ */ + dt_add_property_cells(np, "#address-cells", 3); + dt_add_property_cells(np, "#size-cells", 2); + dt_add_property_cells(np, "#interrupt-cells", 1); + dt_add_property_cells(np, "bus-range", 0, 0xff); + dt_add_property_cells(np, "clock-frequency", 0x200, 0); + dt_add_property_cells(np, "interrupt-parent", icsp); + + /* NPU2 PHB properties */ + dt_add_property_cells(np, "ibm,opal-num-pes", + NPU2_MAX_PE_NUM); + dt_add_property_cells(np, "ibm,opal-reserved-pe", + NPU2_RESERVED_PE_NUM); + dt_add_property_cells(np, "ibm,supported-tce-sizes", + 12, // 4K + 16, // 64K + 24, // 16M + 28); // 256M + + dt_add_property_u64s(np, "ibm,mmio-atsd", + MMIO_ATSD_ADDR(p->regs, 0), + MMIO_ATSD_ADDR(p->regs, 1), + MMIO_ATSD_ADDR(p->regs, 2), + MMIO_ATSD_ADDR(p->regs, 3), + MMIO_ATSD_ADDR(p->regs, 4), + MMIO_ATSD_ADDR(p->regs, 5), + MMIO_ATSD_ADDR(p->regs, 6), + MMIO_ATSD_ADDR(p->regs, 7)); + + /* + * Memory window is exposed as 64-bits non-prefetchable + * one because 64-bits prefetchable one is kind of special + * to kernel. + */ + mm_base = p->mm_base; + mm_size = p->mm_size; + dt_add_property_cells(np, "ranges", 0x02000000, + hi32(mm_base), lo32(mm_base), + hi32(mm_base), lo32(mm_base), + hi32(mm_size), lo32(mm_size)); +} + +void npu2_nvlink_create_phb(struct npu2 *npu, struct dt_node *dn) +{ + struct pci_slot *slot; + + /* Generic PHB */ + npu->phb_nvlink.dt_node = dn; + npu->phb_nvlink.ops = &npu_ops; + npu->phb_nvlink.phb_type = phb_type_npu_v2; + init_lock(&npu->lock); + init_lock(&npu->phb_nvlink.lock); + list_head_init(&npu->phb_nvlink.devices); + list_head_init(&npu->phb_nvlink.virt_devices); + + npu2_populate_devices(npu, dn); + npu2_add_interrupt_map(npu, dn); + npu2_add_phb_properties(npu); + + slot = npu2_slot_create(&npu->phb_nvlink); + if (!slot) + { + /** + * @fwts-label NPUCannotCreatePHBSlot + * @fwts-advice Firmware probably ran out of memory creating + * NPU2 slot. NVLink functionality could be broken. + */ + prlog(PR_ERR, "NPU: Cannot create PHB slot\n"); + } + + pci_register_phb(&npu->phb_nvlink, OPAL_DYNAMIC_PHB_ID); + + npu2_init_ioda_cache(npu); + npu2_hw_init(npu); +} + +/* + * Search a table for an entry with matching value under mask. Returns + * the index and the current value in *value. + */ +static int npu_table_search(struct npu2 *p, uint64_t table_addr, int stride, + int table_size, uint64_t *value, uint64_t mask) +{ + int i; + uint64_t val; + + assert(value); + + for (i = 0; i < table_size; i++) { + val = npu2_read(p, table_addr + i*stride); + if ((val & mask) == *value) { + *value = val; + return i; + } + } + + return -1; +} + +/* + * Allocate a context ID and initialise the tables with the relevant + * information. Returns the ID on or error if one couldn't be + * allocated. + */ +#define NPU2_VALID_ATS_MSR_BITS (MSR_DR | MSR_HV | MSR_PR | MSR_SF) +int64_t npu2_init_context(struct phb *phb, uint64_t msr, uint64_t bdf) +{ + struct npu2 *p; + uint64_t xts_bdf, old_xts_bdf_pid, xts_bdf_pid; + int id; + + /* + * MSR bits should be masked by the caller to allow for future + * expansion if required. + */ + if (msr & ~NPU2_VALID_ATS_MSR_BITS) + return OPAL_UNSUPPORTED; + + /* + * Need to get LPARSHORT. 
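The LPARSHORT is the index of this BDF's entry in the XTS_BDF_MAP table (set up in npu2_map_lpar() below); it is reused as the index into XTS_PID_MAP and the context refcount.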
+ */ + p = phb_to_npu2_nvlink(phb); + lock(&p->lock); + xts_bdf = SETFIELD(NPU2_XTS_BDF_MAP_BDF, 0ul, bdf); + if (npu_table_search(p, NPU2_XTS_BDF_MAP, 8, NPU2_XTS_BDF_MAP_SIZE, + &xts_bdf, NPU2_XTS_BDF_MAP_BDF) < 0) { + NPU2ERR(p, "LPARID not associated with any GPU\n"); + id = OPAL_PARAMETER; + goto out; + } + + id = GETFIELD(NPU2_XTS_BDF_MAP_LPARSHORT, xts_bdf); + NPU2DBG(p, "Found LPARSHORT = 0x%x for BDF = 0x%03llx\n", id, bdf); + + /* Enable this mapping for both real and virtual addresses */ + xts_bdf_pid = SETFIELD(NPU2_XTS_PID_MAP_VALID_ATRGPA0, 0UL, 1); + xts_bdf_pid = SETFIELD(NPU2_XTS_PID_MAP_VALID_ATRGPA1, xts_bdf_pid, 1); + + /* Enables TLBIE/MMIOSD forwarding for this entry */ + xts_bdf_pid = SETFIELD(NPU2_XTS_PID_MAP_VALID_ATSD, xts_bdf_pid, 1); + xts_bdf_pid = SETFIELD(NPU2_XTS_PID_MAP_LPARSHORT, xts_bdf_pid, id); + + /* Set the relevant MSR bits */ + xts_bdf_pid = SETFIELD(NPU2_XTS_PID_MAP_MSR_DR, xts_bdf_pid, + !!(msr & MSR_DR)); + xts_bdf_pid = SETFIELD(NPU2_XTS_PID_MAP_MSR_HV, xts_bdf_pid, + !!(msr & MSR_HV)); + xts_bdf_pid = SETFIELD(NPU2_XTS_PID_MAP_MSR_PR, xts_bdf_pid, + !!(msr & MSR_PR)); + + /* We don't support anything other than 64-bit so we can safely hardcode + * it here */ + xts_bdf_pid = SETFIELD(NPU2_XTS_PID_MAP_MSR_SF, xts_bdf_pid, 1); + + /* + * Throw an error if the wildcard entry for this bdf is already set + * with different msr bits. + */ + old_xts_bdf_pid = npu2_read(p, NPU2_XTS_PID_MAP + id*0x20); + if (old_xts_bdf_pid) { + if (GETFIELD(NPU2_XTS_PID_MAP_MSR, old_xts_bdf_pid) != + GETFIELD(NPU2_XTS_PID_MAP_MSR, xts_bdf_pid)) { + NPU2ERR(p, "%s: Unexpected MSR value\n", __func__); + id = OPAL_PARAMETER; + goto out; + } else if (!p->ctx_ref[id]) { + NPU2ERR(p, "%s: Unexpected mapping\n", __func__); + id = OPAL_INTERNAL_ERROR; + goto out; + } + } + + /* Write the entry */ + if (!p->ctx_ref[id]) { + NPU2DBG(p, "XTS_PID_MAP[%03d] = 0x%08llx\n", id, xts_bdf_pid); + npu2_write(p, NPU2_XTS_PID_MAP + id*0x20, xts_bdf_pid); + + if (!GETFIELD(NPU2_XTS_BDF_MAP_VALID, xts_bdf)) { + xts_bdf = SETFIELD(NPU2_XTS_BDF_MAP_VALID, xts_bdf, 1); + npu2_write(p, NPU2_XTS_BDF_MAP + id*8, xts_bdf); + } + } + ++p->ctx_ref[id]; + +out: + unlock(&p->lock); + return id; +} + +int64_t npu2_destroy_context(struct phb *phb, uint64_t bdf) +{ + struct npu2 *p; + uint64_t xts_bdf; + int rc = OPAL_PARAMETER, id; + + p = phb_to_npu2_nvlink(phb); + lock(&p->lock); + + /* Need to find lparshort for this bdf */ + xts_bdf = SETFIELD(NPU2_XTS_BDF_MAP_BDF, 0ul, bdf); + if (npu_table_search(p, NPU2_XTS_BDF_MAP, 8, NPU2_XTS_BDF_MAP_SIZE, + &xts_bdf, NPU2_XTS_BDF_MAP_BDF) < 0) { + NPU2ERR(p, "LPARID not associated with any GPU\n"); + } else { + /* + * The bdf/pid table contains wildcard entries and MSR bits + * which we need to clear between switching a device from + * a host to a guest or vice versa. + */ + id = GETFIELD(NPU2_XTS_BDF_MAP_LPARSHORT, xts_bdf); + if (p->ctx_ref[id]) { + --p->ctx_ref[id]; + if (!p->ctx_ref[id]) { + NPU2DBG(p, "XTS_PID_MAP[%03d] = 0 (destroy)\n", + id); + npu2_write(p, NPU2_XTS_PID_MAP + id*0x20, 0); + } + rc = OPAL_SUCCESS; + } + } + unlock(&p->lock); + return rc; +} + +/* + * Map the given virtual bdf to lparid with given lpcr. 
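This finds or allocates an XTS_BDF_MAP entry for the BDF, programs radix translation plus the stack/brick routing for the link, and points one of the MMIO ATSD registers at the LPAR.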
+ */ +int64_t npu2_map_lpar(struct phb *phb, uint64_t bdf, uint64_t lparid, + uint64_t lpcr) +{ + struct npu2 *p; + struct npu2_dev *ndev = NULL; + uint64_t xts_bdf_lpar, atsd_lpar, rc = OPAL_SUCCESS; + int i; + int id; + static uint64_t atsd_lpar_regs[] = { + NPU2_XTS_MMIO_ATSD0_LPARID, NPU2_XTS_MMIO_ATSD1_LPARID, + NPU2_XTS_MMIO_ATSD2_LPARID, NPU2_XTS_MMIO_ATSD3_LPARID, + NPU2_XTS_MMIO_ATSD4_LPARID, NPU2_XTS_MMIO_ATSD5_LPARID, + NPU2_XTS_MMIO_ATSD6_LPARID, NPU2_XTS_MMIO_ATSD7_LPARID + }; + + if (lpcr) + /* The LPCR bits are only required for hash based ATS, + * which we don't currently support but may need to in + * future. */ + return OPAL_UNSUPPORTED; + + p = phb_to_npu2_nvlink(phb); + lock(&p->lock); + + /* Find any existing entries and update them */ + xts_bdf_lpar = SETFIELD(NPU2_XTS_BDF_MAP_BDF, 0L, bdf); + id = npu_table_search(p, NPU2_XTS_BDF_MAP, 8, NPU2_XTS_BDF_MAP_SIZE, + &xts_bdf_lpar, NPU2_XTS_BDF_MAP_BDF); + if (id < 0) { + /* No existing mapping found, find space for a new one */ + xts_bdf_lpar = 0; + id = npu_table_search(p, NPU2_XTS_BDF_MAP, 8, NPU2_XTS_BDF_MAP_SIZE, + &xts_bdf_lpar, -1UL); + } + + if (id < 0) { + /* Unable to find a free mapping */ + NPU2ERR(p, "No free XTS_BDF[] entry\n"); + rc = OPAL_RESOURCE; + goto out; + } + + xts_bdf_lpar = SETFIELD(NPU2_XTS_BDF_MAP_UNFILT, 0UL, 1); + xts_bdf_lpar = SETFIELD(NPU2_XTS_BDF_MAP_BDF, xts_bdf_lpar, bdf); + + /* We only support radix for the moment */ + xts_bdf_lpar = SETFIELD(NPU2_XTS_BDF_MAP_XLAT, xts_bdf_lpar, 0x3); + xts_bdf_lpar = SETFIELD(NPU2_XTS_BDF_MAP_LPARID, xts_bdf_lpar, lparid); + xts_bdf_lpar = SETFIELD(NPU2_XTS_BDF_MAP_LPARSHORT, xts_bdf_lpar, id); + + /* Need to find an NVLink to send the ATSDs for this device over */ + for (i = 0; i < p->total_devices; i++) { + if (p->devices[i].nvlink.gpu_bdfn == bdf) { + ndev = &p->devices[i]; + break; + } + } + + if (!ndev) { + NPU2ERR(p, "Unable to find nvlink for bdf %llx\n", bdf); + rc = OPAL_PARAMETER; + goto out; + } + + /* + * We need to allocate an ATSD per NVLink bridge if possible, + * use the ibm,npu-link-index property for that. 
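There are only eight MMIO ATSD registers per NPU, so links with a higher index simply go without one.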
+ */ + atsd_lpar = SETFIELD(NPU2_XTS_MMIO_ATSD_LPARID, 0, lparid); + if (!lparid) + atsd_lpar = SETFIELD(NPU2_XTS_MMIO_ATSD_MSR_HV, atsd_lpar, 1); + + if (ndev->link_index < ARRAY_SIZE(atsd_lpar_regs)) + npu2_write(p, atsd_lpar_regs[ndev->link_index], atsd_lpar); + else + NPU2ERR(p, "Unable to assign ATSD for link index %u\n", + ndev->link_index); + + xts_bdf_lpar = SETFIELD(NPU2_XTS_BDF_MAP_STACK, xts_bdf_lpar, + 0x4 >> (ndev->brick_index / 2)); + xts_bdf_lpar = SETFIELD(NPU2_XTS_BDF_MAP_BRICK, xts_bdf_lpar, + (ndev->brick_index % 2)); + + NPU2DBG(p, "XTS_BDF_MAP[%03d] = 0x%08llx\n", id, xts_bdf_lpar); + npu2_write(p, NPU2_XTS_BDF_MAP + id*8, xts_bdf_lpar); + + /* Reset wildcard in the PID map and the refcounter */ + if (npu2_read(p, NPU2_XTS_PID_MAP + id*0x20) || p->ctx_ref[id]) { + prlog(PR_INFO, "Resetting PID MAP for LPID %lld\n", lparid); + p->ctx_ref[id] = 0; + npu2_write(p, NPU2_XTS_PID_MAP + id*0x20, 0); + } + +out: + unlock(&p->lock); + return rc; +} + +static inline uint32_t npu2_relaxed_ordering_source_grpchp(uint32_t gcid) +{ + if (gcid & ~0x1b) + return OPAL_PARAMETER; + + /* Repack 0bGGGGCCC to 0bGGCC */ + return ((gcid & 0x18) >> 1) | (gcid & 0x3); +} + +static uint64_t npu2_relaxed_ordering_cfg_read(struct npu2_dev *ndev, int n) +{ + uint64_t reg = NPU2_SM_REG_OFFSET(ndev, 0, NPU2_RELAXED_ORDERING_CFG(n)); + + return npu2_read(ndev->npu, reg); +} + +static void npu2_relaxed_ordering_cfg_write(struct npu2_dev *ndev, int n, + uint64_t val) +{ + uint64_t reg; + int sm; + + /* Set every register on our stack */ + for (sm = NPU2_BLOCK_SM_0; sm <= NPU2_BLOCK_SM_3; sm++) { + reg = NPU2_SM_REG_OFFSET(ndev, sm, NPU2_RELAXED_ORDERING_CFG(n)); + npu2_write(ndev->npu, reg, val); + } +} + +/* + * Parse the value of a relaxed ordering config register. Returns SOURCE0 or + * SOURCE1 register mask if relaxed ordering is set for the given chip/pec. + * Returns 0 if unset. 
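A GRPCHP value of 0xf in a source field is a wildcard that matches any chip.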
+ */ +static uint64_t npu2_relaxed_ordering_cfg_enabled(uint64_t val, uint32_t gcid, + int pec) +{ + uint32_t src, grpchp; + uint64_t mask; + int i; + + for (i = 0; i < 2; i++) { + mask = NPU2_RELAXED_ORDERING_SOURCE(i); + src = GETFIELD(mask, val); + + if (!GETFIELD(NPU2_RELAXED_ORDERING_SOURCE_ENA, src)) + continue; + + if (GETFIELD(NPU2_RELAXED_ORDERING_SOURCE_PECSEL, src) != pec) + continue; + + grpchp = GETFIELD(NPU2_RELAXED_ORDERING_SOURCE_GRPCHP, src); + if (grpchp == npu2_relaxed_ordering_source_grpchp(gcid)) + return mask; + + if (grpchp == 0xf) /* match all */ + return mask; + } + + return 0; +} + +static int npu2_enable_relaxed_ordering(struct npu2_dev *ndev, uint32_t gcid, + int pec) +{ + uint64_t val, mask; + uint32_t src; + int rc = OPAL_RESOURCE; + int i; + + NPU2DEVINF(ndev, "Enabling relaxed ordering for PEC %d on chip %d\n", pec, gcid); + lock(&ndev->npu->lock); + + for (i = 0; i < 2; i++) { + val = npu2_relaxed_ordering_cfg_read(ndev, i); + if (!npu2_relaxed_ordering_cfg_enabled(val, gcid, pec)) + continue; + + /* Already enabled */ + rc = OPAL_SUCCESS; + goto out; + } + + src = NPU2_RELAXED_ORDERING_SOURCE_WRENA | + NPU2_RELAXED_ORDERING_SOURCE_RDENA; + src = SETFIELD(NPU2_RELAXED_ORDERING_SOURCE_PECSEL, src, pec); + src = SETFIELD(NPU2_RELAXED_ORDERING_SOURCE_GRPCHP, src, + npu2_relaxed_ordering_source_grpchp(gcid)); + src = SETFIELD(NPU2_RELAXED_ORDERING_SOURCE_WRMIN, src, 0); + src = SETFIELD(NPU2_RELAXED_ORDERING_SOURCE_WRMAX, src, 23); + src = SETFIELD(NPU2_RELAXED_ORDERING_SOURCE_RDMIN, src, 0); + src = SETFIELD(NPU2_RELAXED_ORDERING_SOURCE_RDMAX, src, 47); + + /* Find somewhere to write this config */ + for (i = 0; i < 2; i++) { + val = npu2_relaxed_ordering_cfg_read(ndev, i); + + if (!GETFIELD(NPU2_RELAXED_ORDERING_SOURCE_ENA << 32, val)) + mask = NPU2_RELAXED_ORDERING_SOURCE(0); + else if (!GETFIELD(NPU2_RELAXED_ORDERING_SOURCE_ENA, val)) + mask = NPU2_RELAXED_ORDERING_SOURCE(1); + else + continue; + + val = SETFIELD(mask, val, src); + npu2_relaxed_ordering_cfg_write(ndev, i, val); + + rc = OPAL_SUCCESS; + break; + } + +out: + unlock(&ndev->npu->lock); + return rc; +} + +static void npu2_disable_relaxed_ordering(struct npu2_dev *ndev, uint32_t gcid, + int pec) +{ + uint64_t val, mask; + int i; + + NPU2DEVINF(ndev, "Disabling relaxed ordering for PEC %d on chip %d\n", pec, gcid); + lock(&ndev->npu->lock); + + for (i = 0; i < 2; i++) { + val = npu2_relaxed_ordering_cfg_read(ndev, i); + + mask = npu2_relaxed_ordering_cfg_enabled(val, gcid, pec); + if (!mask) + continue; + + val = SETFIELD(mask, val, 0); + npu2_relaxed_ordering_cfg_write(ndev, i, val); + } + + unlock(&ndev->npu->lock); +} + +/* + * Enable or disable relaxed ordering on all nvlinks for a given PEC. May leave + * relaxed ordering partially enabled if there are insufficient HW resources to + * enable it on all links. 
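Each link has only four source slots (two SOURCE fields in each of the two config registers used here), so at most four chip/PEC pairs can be enabled at a time.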
+ */ +int64_t npu2_set_relaxed_order(struct phb *phb, uint32_t gcid, int pec, + bool enable) +{ + struct npu2 *npu = phb_to_npu2_nvlink(phb); + struct npu2_dev *ndev; + int64_t rc = OPAL_SUCCESS; + + for (int i = 0; i < npu->total_devices; i++) { + ndev = &npu->devices[i]; + if (enable) + rc = npu2_enable_relaxed_ordering(ndev, gcid, pec); + else + npu2_disable_relaxed_ordering(ndev, gcid, pec); + + if (rc != OPAL_SUCCESS) { + NPU2DEVINF(ndev, "Insufficient resources to activate relaxed ordering mode\n"); + return OPAL_RESOURCE; + } + } + + return OPAL_SUCCESS; +} diff --git a/roms/skiboot/hw/npu3-hw-procedures.c b/roms/skiboot/hw/npu3-hw-procedures.c new file mode 100644 index 000000000..098e6e467 --- /dev/null +++ b/roms/skiboot/hw/npu3-hw-procedures.c @@ -0,0 +1,792 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* + * Copyright 2019 IBM Corp. + */ + +#include <skiboot.h> +#include <npu3.h> +#include <npu3-regs.h> +#include <timebase.h> +#include <xscom.h> +#include <xscom-p9-regs.h> + +#define NPU3DEVLOG(l, dev, fmt, a...) \ + prlog(l, "NPU[%d:%d:%d]: " fmt, \ + (dev)->npu->chip_id, \ + (dev)->npu->index, \ + (dev)->index, ##a) +#define NPU3DEVDBG(dev, fmt, a...) NPU3DEVLOG(PR_DEBUG, dev, fmt, ##a) +#define NPU3DEVINF(dev, fmt, a...) NPU3DEVLOG(PR_INFO, dev, fmt, ##a) +#define NPU3DEVERR(dev, fmt, a...) NPU3DEVLOG(PR_ERR, dev, fmt, ##a) + +/* + * The documentation for the PHY training is written in terms of bits within an + * actual register so we use that representation here. + */ +struct npu3_phy_reg { + uint64_t offset; + uint64_t mask; +}; + +static struct npu3_phy_reg +NPU3_PHY_RX_RUN_LANE = { 0x0c8, PPC_BIT(48) }, +NPU3_PHY_RX_IORESET = { 0x096, PPC_BIT(63) }, +NPU3_PHY_TX_IORESET = { 0x113, PPC_BIT(48) }, +NPU3_PHY_RX_PR_RESET = { 0x096, PPC_BIT(62) }, +NPU3_PHY_RX_LANE_ANA_PDWN = { 0x002, PPC_BIT(54) }, +NPU3_PHY_RX_LANE_DIG_PDWN = { 0x088, PPC_BIT(48) }, +NPU3_PHY_RX_PR_PHASE_STEP = { 0x08a, PPC_BITMASK(60, 63) }, +NPU3_PHY_TX_LANE_PDWN = { 0x101, PPC_BIT(48) }, +NPU3_PHY_RX_RUN_DCCAL = { 0x0c8, PPC_BIT(49) }, +NPU3_PHY_RX_DCCAL_DONE = { 0x0ca, PPC_BIT(49) }, +NPU3_PHY_RX_LANE_BUSY = { 0x0ca, PPC_BIT(50) }, +NPU3_PHY_RX_B_BANK_CONTROLS = { 0x002, PPC_BITMASK(58, 63) }, +NPU3_PHY_TX_UNLOAD_CLK_DISABLE = { 0x103, PPC_BIT(56) }, +NPU3_PHY_TX_FIFO_INIT = { 0x105, PPC_BIT(53) }, +NPU3_PHY_TX_RXCAL = { 0x103, PPC_BIT(57) }, +NPU3_PHY_RX_INIT_DONE = { 0x0ca, PPC_BIT(48) }, +NPU3_PHY_RX_PR_EDGE_TRACK_CNTL = { 0x092, PPC_BITMASK(48, 49) }, +NPU3_PHY_RX_PR_FW_OFF = { 0x08a, PPC_BIT(56) }, +NPU3_PHY_RX_PR_FW_INERTIA_AMT = { 0x08a, PPC_BITMASK(57, 59) }, +NPU3_PHY_RX_CFG_LTE_MC = { 0x000, PPC_BITMASK(60, 63) }, +NPU3_PHY_RX_A_INTEG_COARSE_GAIN = { 0x00a, PPC_BITMASK(48, 51) }, +NPU3_PHY_RX_B_INTEG_COARSE_GAIN = { 0x026, PPC_BITMASK(48, 51) }, +NPU3_PHY_RX_E_INTEG_COARSE_GAIN = { 0x030, PPC_BITMASK(48, 51) }, + +/* These registers are per-PHY, not per lane */ +NPU3_PHY_TX_ZCAL_SWO_EN = { 0x3c9, PPC_BIT(48) }, +NPU3_PHY_TX_ZCAL_REQ = { 0x3c1, PPC_BIT(49) }, +NPU3_PHY_TX_ZCAL_DONE = { 0x3c1, PPC_BIT(50) }, +NPU3_PHY_TX_ZCAL_ERROR = { 0x3c1, PPC_BIT(51) }, +NPU3_PHY_TX_ZCAL_N = { 0x3c3, PPC_BITMASK(48, 56) }, +NPU3_PHY_TX_ZCAL_P = { 0x3c5, PPC_BITMASK(48, 56) }, +NPU3_PHY_TX_PSEG_PRE_EN = { 0x34d, PPC_BITMASK(51, 55) }, +NPU3_PHY_TX_PSEG_PRE_SELECT = { 0x34d, PPC_BITMASK(56, 60) }, +NPU3_PHY_TX_NSEG_PRE_EN = { 0x34f, PPC_BITMASK(51, 55) }, +NPU3_PHY_TX_NSEG_PRE_SELECT = { 0x34f, PPC_BITMASK(56, 60) }, +NPU3_PHY_TX_PSEG_POST_EN = { 0x361, PPC_BITMASK(49, 55) }, 
+NPU3_PHY_TX_PSEG_POST_SELECT = { 0x361, PPC_BITMASK(56, 62) }, +NPU3_PHY_TX_NSEG_POST_EN = { 0x363, PPC_BITMASK(49, 55) }, +NPU3_PHY_TX_NSEG_POST_SELECT = { 0x363, PPC_BITMASK(56, 62) }, +NPU3_PHY_TX_PSEG_MARGINPU_EN = { 0x351, PPC_BITMASK(48, 55) }, +NPU3_PHY_TX_NSEG_MARGINPU_EN = { 0x353, PPC_BITMASK(48, 55) }, +NPU3_PHY_TX_PSEG_MARGINPD_EN = { 0x351, PPC_BITMASK(56, 63) }, +NPU3_PHY_TX_NSEG_MARGINPD_EN = { 0x353, PPC_BITMASK(56, 63) }, +NPU3_PHY_TX_MARGINPU_SELECT = { 0x355, PPC_BITMASK(48, 55) }, +NPU3_PHY_TX_MARGINPD_SELECT = { 0x355, PPC_BITMASK(56, 63) }, +NPU3_PHY_TX_PSEG_MAIN_EN = { 0x357, PPC_BITMASK(51, 57) }, +NPU3_PHY_TX_NSEG_MAIN_EN = { 0x359, PPC_BITMASK(51, 57) }, +NPU3_PHY_RX_CLKDIST_PDWN = { 0x204, PPC_BITMASK(48, 50) }, +NPU3_PHY_RX_IREF_PDWN = { 0x230, PPC_BIT(54) }, +NPU3_PHY_TX_CLKDIST_PDWN = { 0x305, PPC_BITMASK(48, 50) }, +NPU3_PHY_RX_CTL_DATASM_CLKDIST_PDWN = { 0x2e0, PPC_BIT(60) }; + +static uint64_t npu3_phy_scom(struct npu3_dev *dev, struct npu3_phy_reg *reg, + int lane) +{ + uint64_t scom; + + /* Don't specify a lane for a non-per-lane register */ + if (lane >= 0) + assert(reg->offset < 0x200); + else + assert(reg->offset >= 0x200); + + scom = OB_INDIRECT(dev->ob_chiplet); + scom = SETFIELD(PPC_BITMASK(12, 21), scom, reg->offset); + + if (lane > 0) + scom = SETFIELD(PPC_BITMASK(27, 31), scom, lane); + + return scom; +} + +static void npu3_phy_write_lane(struct npu3_dev *dev, struct npu3_phy_reg *reg, + int lane, uint64_t val) +{ + struct npu3 *npu = dev->npu; + uint64_t scom, scom_val; + + scom = npu3_phy_scom(dev, reg, lane); + + xscom_read(npu->chip_id, scom, &scom_val); + scom_val = SETFIELD(reg->mask, scom_val, val); + xscom_write(npu->chip_id, scom, scom_val); +} + +static uint64_t npu3_phy_read_lane(struct npu3_dev *dev, + struct npu3_phy_reg *reg, + int lane) +{ + struct npu3 *npu = dev->npu; + uint64_t scom, scom_val; + + scom = npu3_phy_scom(dev, reg, lane); + xscom_read(npu->chip_id, scom, &scom_val); + + return GETFIELD(reg->mask, scom_val); +} + +static inline void npu3_phy_write(struct npu3_dev *dev, + struct npu3_phy_reg *reg, + uint64_t val) +{ + npu3_phy_write_lane(dev, reg, -1, val); +} + +static inline uint64_t npu3_phy_read(struct npu3_dev *dev, + struct npu3_phy_reg *reg) +{ + return npu3_phy_read_lane(dev, reg, -1); +} + +struct procedure { + const char *name; + uint32_t (*steps[])(struct npu3_dev *); +}; + +#define DEFINE_PROCEDURE(NAME, STEPS...) 
\ +static struct procedure procedure_##NAME = { \ + .name = #NAME, \ + .steps = { NAME, ##STEPS } \ +} + +static uint32_t stop(struct npu3_dev *npu_dev __unused) +{ + return NPU3_PROC_COMPLETE | NPU3_PROC_ABORTED; +} + +DEFINE_PROCEDURE(stop); + +static uint32_t nop(struct npu3_dev *npu_dev __unused) +{ + return NPU3_PROC_COMPLETE; +} + +DEFINE_PROCEDURE(nop); + +static void set_iovalid(struct npu3_dev *dev, bool raise) +{ + struct npu3 *npu = dev->npu; + uint64_t reg, val; + + reg = OB_CPLT_CONF1(dev->ob_chiplet); + + xscom_read(npu->chip_id, reg, &val); + val = SETFIELD(OB_CPLT_CONF1_NV_IOVALID(dev->index), val, raise); + xscom_write(npu->chip_id, reg, val); +} + +#define NPU3_PHY_LANES 24 + +#define npu3_for_each_lane(lane, dev) \ + for (lane = 0; lane < NPU3_PHY_LANES; lane++) \ + if (dev->phy_lane_mask & PPC_BIT32(lane)) \ + +static uint32_t phy_reset(struct npu3_dev *dev) +{ + uint32_t lane; + + set_iovalid(dev, false); + + npu3_for_each_lane(lane, dev) + npu3_phy_write_lane(dev, &NPU3_PHY_RX_RUN_LANE, lane, 0); + + return NPU3_PROC_NEXT; +} + +static uint32_t phy_reset_wait(struct npu3_dev *dev) +{ + int lane; + + /* Wait for all lanes to become inactive */ + npu3_for_each_lane(lane, dev) + if (npu3_phy_read_lane(dev, &NPU3_PHY_RX_LANE_BUSY, lane)) + return NPU3_PROC_INPROGRESS; + + npu3_for_each_lane(lane, dev) { + /* Set lane in reset */ + npu3_phy_write_lane(dev, &NPU3_PHY_RX_IORESET, lane, 1); + npu3_phy_write_lane(dev, &NPU3_PHY_TX_IORESET, lane, 1); + + /* Release lane from reset */ + npu3_phy_write_lane(dev, &NPU3_PHY_RX_IORESET, lane, 0); + npu3_phy_write_lane(dev, &NPU3_PHY_TX_IORESET, lane, 0); + + /* Reset the phase rotator */ + npu3_phy_write_lane(dev, &NPU3_PHY_RX_PR_RESET, lane, 1); + npu3_phy_write_lane(dev, &NPU3_PHY_RX_PR_RESET, lane, 0); + } + + return NPU3_PROC_NEXT; +} + +/* Procedure 1.2.3 - Initialise I/O PHY Registers */ +static uint32_t phy_reset_complete(struct npu3_dev *dev) +{ + int lane; + + npu3_for_each_lane(lane, dev) { + npu3_phy_write_lane(dev, &NPU3_PHY_RX_LANE_ANA_PDWN, lane, 0); + npu3_phy_write_lane(dev, &NPU3_PHY_RX_LANE_DIG_PDWN, lane, 0); + npu3_phy_write_lane(dev, &NPU3_PHY_RX_PR_PHASE_STEP, lane, 0xc); + npu3_phy_write_lane(dev, &NPU3_PHY_TX_LANE_PDWN, lane, 0); + npu3_phy_write_lane(dev, &NPU3_PHY_RX_PR_FW_INERTIA_AMT, lane, 4); + npu3_phy_write_lane(dev, &NPU3_PHY_RX_CFG_LTE_MC, lane, 3); + npu3_phy_write_lane(dev, &NPU3_PHY_RX_A_INTEG_COARSE_GAIN, lane, 11); + npu3_phy_write_lane(dev, &NPU3_PHY_RX_B_INTEG_COARSE_GAIN, lane, 11); + npu3_phy_write_lane(dev, &NPU3_PHY_RX_E_INTEG_COARSE_GAIN, lane, 11); + } + + set_iovalid(dev, true); + + return NPU3_PROC_COMPLETE; +} + +DEFINE_PROCEDURE(phy_reset, phy_reset_wait, phy_reset_complete); + +/* Procedure 1.2.6 - I/O PHY Tx Impedance Calibration */ +static uint32_t phy_tx_zcal(struct npu3_dev *dev) +{ + if (dev->npu->tx_zcal_complete) + return NPU3_PROC_COMPLETE; + + /* Turn off SW enable and enable zcal state machine */ + npu3_phy_write(dev, &NPU3_PHY_TX_ZCAL_SWO_EN, 0); + + /* Start impedance calibration state machine */ + npu3_phy_write(dev, &NPU3_PHY_TX_ZCAL_REQ, 1); + + return NPU3_PROC_NEXT; +} + +static uint32_t phy_tx_zcal_wait(struct npu3_dev *dev) +{ + if (npu3_phy_read(dev, &NPU3_PHY_TX_ZCAL_ERROR)) + return NPU3_PROC_COMPLETE | NPU3_PROC_FAILED; + + if (!npu3_phy_read(dev, &NPU3_PHY_TX_ZCAL_DONE)) + return NPU3_PROC_INPROGRESS; + + return NPU3_PROC_NEXT; +} + +#define MARGIN_RATIO 0 +#define FFE_PRE_COEFF 0 +#define FFE_POST_COEFF 0 + +#define PRE_WIDTH 5 +#define POST_WIDTH 7 +#define 
MAIN_WIDTH 7 +#define ZCAL_MIN (16 * 2) +#define ZCAL_MAX (33 * 2) +#define PRECURSOR_X2_MAX (4 * 2 + 1) +#define POSTCURSOR_X2_MAX (6 * 2 + 1) +#define MARGIN_X2_MAX (8 * 2) +#define MAIN_X2_MAX (6 * 2 + 1) +#define TOTAL_X2_MAX (PRECURSOR_X2_MAX + POSTCURSOR_X2_MAX + \ + 2 * MARGIN_X2_MAX + MAIN_X2_MAX) + +static uint32_t therm(uint32_t dec) +{ + return (0x1 << dec) - 1; +} + +static uint32_t therm_with_half(uint32_t dec, uint8_t width) +{ + /* If the LSB of the 2r equivalent is on, then we need to set the 2r bit (MSB) */ + uint32_t half_on = (dec & 0x1) << (width - 1); + + /* Shift the 2r equivalent to a 1r value and convert to a thermometer code. */ + uint32_t x1_equiv = ((1 << (dec >> 1)) - 1); + + /* Combine 1r equivalent thermometer code + the 2r MSB value. */ + return half_on | x1_equiv; +} + +static uint32_t phy_tx_zcal_calculate(struct npu3_dev *dev) +{ + int p_value, n_value; + uint32_t zcal_n; + uint32_t zcal_p; + uint32_t p_main_enable = MAIN_X2_MAX; + uint32_t p_margin_pu_enable = MARGIN_X2_MAX; + uint32_t p_margin_pd_enable = MARGIN_X2_MAX; + uint32_t p_precursor_select; + uint32_t p_postcursor_select; + uint32_t margin_pu_select; + uint32_t n_main_enable = MAIN_X2_MAX; + uint32_t n_margin_pu_enable = MARGIN_X2_MAX; + uint32_t n_margin_pd_enable = MARGIN_X2_MAX; + uint32_t n_precursor_select; + uint32_t n_postcursor_select; + uint32_t margin_pd_select; + uint32_t margin_select; + + /* Convert the value from 8R to 2R by / 4 */ + zcal_n = npu3_phy_read(dev, &NPU3_PHY_TX_ZCAL_N) / 4; + zcal_p = npu3_phy_read(dev, &NPU3_PHY_TX_ZCAL_P) / 4; + + /* + * Again, if the hardware detects an unexpected condition it's + * better just to fail loudly. + */ + if (zcal_n < ZCAL_MIN || zcal_n > ZCAL_MAX || + zcal_p < ZCAL_MIN || zcal_p > ZCAL_MAX) + return NPU3_PROC_COMPLETE | NPU3_PROC_FAILED; + + p_value = zcal_p - TOTAL_X2_MAX; + p_precursor_select = p_value * FFE_PRE_COEFF / 128; + p_postcursor_select = p_value * FFE_POST_COEFF / 128; + margin_pu_select = p_value * MARGIN_RATIO / 256; + + if (p_value % 2) { + p_main_enable--; + p_value++; + } + + while (p_value < 0) { + if (p_main_enable > 1) { + p_main_enable -= 2; + } else if (p_margin_pu_enable + p_margin_pd_enable > 0) { + if (p_margin_pu_enable == p_margin_pd_enable) + p_margin_pd_enable -= 2; + else + p_margin_pu_enable -= 2; + } + p_value += 2; + } + + n_value = zcal_n - TOTAL_X2_MAX; + n_precursor_select = n_value * FFE_PRE_COEFF / 128; + n_postcursor_select = n_value * FFE_POST_COEFF / 128; + margin_pd_select = p_value * MARGIN_RATIO / 256; + + if (n_value % 2) { + n_main_enable--; + n_value++; + } + + while (n_value < 0) { + if (n_main_enable > 1) { + n_main_enable -= 2; + } else if (n_margin_pu_enable + n_margin_pd_enable > 0) { + if (n_margin_pu_enable == n_margin_pd_enable) + n_margin_pd_enable -= 2; + else + n_margin_pu_enable -= 2; + } + n_value += 2; + } + + margin_select = therm((margin_pu_select + 1) / 2) & + therm((margin_pd_select + 1) / 2) & + therm((p_margin_pu_enable + 1) / 2) & + therm((p_margin_pd_enable + 1) / 2) & + therm((n_margin_pu_enable + 1) / 2) & + therm((n_margin_pd_enable + 1) / 2); + + npu3_phy_write(dev, &NPU3_PHY_TX_PSEG_PRE_EN, therm_with_half(PRECURSOR_X2_MAX, PRE_WIDTH)); + npu3_phy_write(dev, &NPU3_PHY_TX_PSEG_PRE_SELECT, therm_with_half(p_precursor_select, PRE_WIDTH)); + npu3_phy_write(dev, &NPU3_PHY_TX_PSEG_POST_EN, therm_with_half(POSTCURSOR_X2_MAX, POST_WIDTH)); + npu3_phy_write(dev, &NPU3_PHY_TX_PSEG_POST_SELECT, therm_with_half(p_postcursor_select, POST_WIDTH)); + npu3_phy_write(dev, 
&NPU3_PHY_TX_PSEG_MARGINPU_EN, therm((p_margin_pu_enable + 1) / 2)); + npu3_phy_write(dev, &NPU3_PHY_TX_PSEG_MARGINPD_EN, therm((p_margin_pd_enable + 1) / 2)); + npu3_phy_write(dev, &NPU3_PHY_TX_PSEG_MAIN_EN, therm_with_half(p_main_enable, MAIN_WIDTH)); + + npu3_phy_write(dev, &NPU3_PHY_TX_NSEG_PRE_EN, therm_with_half(PRECURSOR_X2_MAX, PRE_WIDTH)); + npu3_phy_write(dev, &NPU3_PHY_TX_NSEG_PRE_SELECT, therm_with_half(n_precursor_select, PRE_WIDTH)); + npu3_phy_write(dev, &NPU3_PHY_TX_NSEG_POST_EN, therm_with_half(POSTCURSOR_X2_MAX, POST_WIDTH)); + npu3_phy_write(dev, &NPU3_PHY_TX_NSEG_POST_SELECT, therm_with_half(n_postcursor_select, POST_WIDTH)); + npu3_phy_write(dev, &NPU3_PHY_TX_NSEG_MARGINPU_EN, therm((n_margin_pu_enable + 1) / 2)); + npu3_phy_write(dev, &NPU3_PHY_TX_NSEG_MARGINPD_EN, therm((n_margin_pd_enable + 1) / 2)); + npu3_phy_write(dev, &NPU3_PHY_TX_NSEG_MAIN_EN, therm_with_half(n_main_enable, MAIN_WIDTH)); + + npu3_phy_write(dev, &NPU3_PHY_TX_MARGINPU_SELECT, therm(margin_select + 1) / 2); + npu3_phy_write(dev, &NPU3_PHY_TX_MARGINPD_SELECT, therm(margin_select + 1) / 2); + + dev->npu->tx_zcal_complete = true; + + return NPU3_PROC_COMPLETE; +} + +DEFINE_PROCEDURE(phy_tx_zcal, phy_tx_zcal_wait, phy_tx_zcal_calculate); + +/* Procedure 1.2.4 - I/O PHY DC Calibration */ +static uint32_t phy_rx_dccal(struct npu3_dev *dev) +{ + int lane; + + set_iovalid(dev, false); + + npu3_for_each_lane(lane, dev) + npu3_phy_write_lane(dev, &NPU3_PHY_RX_PR_FW_OFF, lane, 1); + + npu3_for_each_lane(lane, dev) + npu3_phy_write_lane(dev, &NPU3_PHY_RX_RUN_DCCAL, lane, 1); + + return NPU3_PROC_NEXT; +} + +static uint32_t phy_rx_dccal_complete(struct npu3_dev *dev) +{ + int lane; + + npu3_for_each_lane(lane, dev) + if (!npu3_phy_read_lane(dev, &NPU3_PHY_RX_DCCAL_DONE, lane)) + return NPU3_PROC_INPROGRESS; + + npu3_for_each_lane(lane, dev) + npu3_phy_write_lane(dev, &NPU3_PHY_RX_RUN_DCCAL, lane, 0); + + npu3_for_each_lane(lane, dev) { + npu3_phy_write_lane(dev, &NPU3_PHY_RX_B_BANK_CONTROLS, lane, 0); + npu3_phy_write_lane(dev, &NPU3_PHY_RX_PR_EDGE_TRACK_CNTL, lane, 0); + npu3_phy_write_lane(dev, &NPU3_PHY_RX_PR_FW_OFF, lane, 0); + } + + return NPU3_PROC_NEXT; +} + +/* Procedure 1.2.5 - IO PHY Tx FIFO Init */ +static uint32_t phy_tx_fifo_init(struct npu3_dev *dev) +{ + int lane; + + npu3_for_each_lane(lane, dev) { + npu3_phy_write_lane(dev, &NPU3_PHY_TX_UNLOAD_CLK_DISABLE, lane, 0); + npu3_phy_write_lane(dev, &NPU3_PHY_TX_FIFO_INIT, lane, 1); + npu3_phy_write_lane(dev, &NPU3_PHY_TX_UNLOAD_CLK_DISABLE, lane, 1); + } + + set_iovalid(dev, true); + + return NPU3_PROC_COMPLETE; +} + +DEFINE_PROCEDURE(phy_rx_dccal, phy_rx_dccal_complete, phy_tx_fifo_init); + +/* Procedure 1.2.8 - Enable Downstream Link Training */ +static uint32_t phy_enable_tx_rxcal(struct npu3_dev *dev) +{ + int lane; + + npu3_for_each_lane(lane, dev) + npu3_phy_write_lane(dev, &NPU3_PHY_TX_RXCAL, lane, 1); + + return NPU3_PROC_COMPLETE; +} +DEFINE_PROCEDURE(phy_enable_tx_rxcal); + +/* Procedure 1.2.9 - Disable Downstream Link Training */ +static uint32_t phy_disable_tx_rxcal(struct npu3_dev *dev) +{ + int lane; + + npu3_for_each_lane(lane, dev) + npu3_phy_write_lane(dev, &NPU3_PHY_TX_RXCAL, lane, 0); + + return NPU3_PROC_COMPLETE; +} +DEFINE_PROCEDURE(phy_disable_tx_rxcal); + +/* Procedure 1.2.7 - I/O PHY Upstream Link Training */ +static uint32_t phy_rx_training(struct npu3_dev *dev) +{ + int lane; + + npu3_for_each_lane(lane, dev) + npu3_phy_write_lane(dev, &NPU3_PHY_RX_RUN_LANE, lane, 1); + + return NPU3_PROC_NEXT; +} + +static uint32_t 
phy_rx_training_wait(struct npu3_dev *dev) +{ + int lane; + + npu3_for_each_lane(lane, dev) + if (!npu3_phy_read_lane(dev, &NPU3_PHY_RX_INIT_DONE, lane)) + return NPU3_PROC_INPROGRESS; + + return NPU3_PROC_COMPLETE; +} + +DEFINE_PROCEDURE(phy_rx_training, phy_rx_training_wait); + +static void npu3_dev_fence_set(struct npu3_dev *dev, uint8_t state) +{ + struct npu3 *npu = dev->npu; + uint64_t val; + + val = npu3_read(npu, NPU3_NTL_MISC_CFG1(dev->index)); + val = SETFIELD(NPU3_NTL_MISC_CFG1_NTL_RESET, val, state); + npu3_write(npu, NPU3_NTL_MISC_CFG1(dev->index), val); +} + +static uint8_t npu3_dev_fence_get(struct npu3_dev *dev) +{ + uint64_t val; + + val = npu3_read(dev->npu, NPU3_NTL_CQ_FENCE_STATUS(dev->index)); + return GETFIELD(NPU3_NTL_CQ_FENCE_STATUS_FIELD, val); +} + +/* Procedure 1.2.1 - Reset NPU/NDL */ +static uint32_t reset_ntl(struct npu3_dev *dev) +{ + struct npu3 *npu = dev->npu; + uint64_t val; + int lane; + + set_iovalid(dev, true); + + /* Power on clocks */ + npu3_phy_write(dev, &NPU3_PHY_RX_CLKDIST_PDWN, 0); + npu3_phy_write(dev, &NPU3_PHY_RX_IREF_PDWN, 1); + npu3_phy_write(dev, &NPU3_PHY_TX_CLKDIST_PDWN, 0); + npu3_phy_write(dev, &NPU3_PHY_RX_CTL_DATASM_CLKDIST_PDWN, 0); + + npu3_for_each_lane(lane, dev) { + npu3_phy_write_lane(dev, &NPU3_PHY_RX_LANE_ANA_PDWN, lane, 0); + npu3_phy_write_lane(dev, &NPU3_PHY_RX_LANE_DIG_PDWN, lane, 0); + npu3_phy_write_lane(dev, &NPU3_PHY_TX_LANE_PDWN, lane, 0); + } + + /* Write PRI */ + val = SETFIELD(NPU3_NTL_PRI_CFG_NDL, 0ull, dev->index); + npu3_write(npu, NPU3_NTL_PRI_CFG(dev->index), val); + + /* Disable parity checking */ + val = npu3_read(npu, NPU3_NTL_MISC_CFG2(dev->index)); + val &= ~(NPU3_NTL_MISC_CFG2_NDL_RX_PARITY_ENA | + NPU3_NTL_MISC_CFG2_NDL_TX_PARITY_ENA | + NPU3_NTL_MISC_CFG2_NDL_PRI_PARITY_ENA); + npu3_write(npu, NPU3_NTL_MISC_CFG2(dev->index), val); + + if (dev->type == NPU3_DEV_TYPE_NVLINK) + npu3_pvd_flag_clear(dev, NPU3_DEV_DL_RESET); + + npu3_dev_fence_set(dev, NPU3_NTL_CQ_FENCE_STATUS_FULL); + + return NPU3_PROC_NEXT; +} + +static uint32_t reset_ndl(struct npu3_dev *dev) +{ + struct npu3 *npu = dev->npu; + uint64_t reg; + uint32_t val32; + + if (npu3_dev_fence_get(dev) != NPU3_NTL_CQ_FENCE_STATUS_FULL) + return NPU3_PROC_INPROGRESS; + + reg = NPU3_DLPL_CTL(dev->index); + val32 = npu3_read_4b(npu, reg); + val32 |= NPU3_DLPL_CTL_RESET_RX | NPU3_DLPL_CTL_RESET_MISC; + npu3_write_4b(npu, reg, val32); + + val32 = npu3_read_4b(npu, reg); + val32 &= ~(NPU3_DLPL_CTL_RESET_RX | NPU3_DLPL_CTL_RESET_MISC); + npu3_write_4b(npu, reg, val32); + + reg = NPU3_DLPL_CFG(dev->index); + val32 = NPU3_DLPL_CFG_PRI_BYTESWAP; + npu3_write_4b(npu, reg, val32); + + /* Clear FIR bits */ + for (uint32_t i = 0; i < NPU3_FIR_MAX; i++) + xscom_write(npu->chip_id, npu->xscom_base + NPU3_FIR(i), 0ull); + + npu3_dev_fence_set(dev, NPU3_NTL_CQ_FENCE_STATUS_HALF); + + return NPU3_PROC_NEXT; +} + +static uint32_t reset_ntl_release(struct npu3_dev *dev) +{ + struct npu3 *npu = dev->npu; + uint32_t i = dev->index; + + if (npu3_dev_fence_get(dev) != NPU3_NTL_CQ_FENCE_STATUS_HALF) + return NPU3_PROC_INPROGRESS; + + /* Credit setup */ + npu3_write(npu, NPU3_NTL_CREQ_HDR_CRED_SND(i), 0x0200000000000000); + npu3_write(npu, NPU3_NTL_PRB_HDR_CRED_SND(i), 0x0200000000000000); + npu3_write(npu, NPU3_NTL_ATR_HDR_CRED_SND(i), 0x0200000000000000); + npu3_write(npu, NPU3_NTL_RSP_HDR_CRED_SND(i), 0x0200000000000000); + npu3_write(npu, NPU3_NTL_CREQ_DAT_CRED_SND(i), 0x1000000000000000); + npu3_write(npu, NPU3_NTL_RSP_DAT_CRED_SND(i), 0x1000000000000000); + + 
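	/*
+	 * Receive-side credit limits. Note (an observation, not documented
+	 * behaviour): check_credits() later expects to read back values of
+	 * the form 0x0be0be00.. from these *_CRED_RCV registers, which
+	 * looks like the limit programmed here mirrored into the running
+	 * credit count once the link has returned all credits.
+	 */
+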
npu3_write(npu, NPU3_NTL_CREQ_HDR_CRED_RCV(i), 0x0000be0000000000); + npu3_write(npu, NPU3_NTL_DGD_HDR_CRED_RCV(i), 0x0000640000000000); + npu3_write(npu, NPU3_NTL_ATSD_HDR_CRED_RCV(i), 0x0000200000000000); + npu3_write(npu, NPU3_NTL_RSP_HDR_CRED_RCV(i), 0x0000be0000000000); + npu3_write(npu, NPU3_NTL_CREQ_DAT_CRED_RCV(i), 0x0001000000000000); + npu3_write(npu, NPU3_NTL_RSP_DAT_CRED_RCV(i), 0x0001000000000000); + + npu3_dev_fence_set(dev, NPU3_NTL_CQ_FENCE_STATUS_NONE); + + return NPU3_PROC_NEXT; +} + +static uint32_t reset_ntl_finish(struct npu3_dev *dev) { + struct npu3 *npu = dev->npu; + uint64_t val; + + if (npu3_dev_fence_get(dev) != NPU3_NTL_CQ_FENCE_STATUS_NONE) + return NPU3_PROC_INPROGRESS; + + /* Enable parity checking */ + val = npu3_read(npu, NPU3_NTL_MISC_CFG2(dev->index)); + val |= NPU3_NTL_MISC_CFG2_NDL_RX_PARITY_ENA | + NPU3_NTL_MISC_CFG2_NDL_TX_PARITY_ENA | + NPU3_NTL_MISC_CFG2_NDL_PRI_PARITY_ENA; + npu3_write(npu, NPU3_NTL_MISC_CFG2(dev->index), val); + + if (dev->type == NPU3_DEV_TYPE_NVLINK) + npu3_pvd_flag_set(dev, NPU3_DEV_DL_RESET); + + return NPU3_PROC_COMPLETE; +} + +DEFINE_PROCEDURE(reset_ntl, reset_ndl, reset_ntl_release, reset_ntl_finish); + +static int npu3_dev_regcmp(struct npu3_dev *dev, uint64_t reg, + const char *reg_name, uint64_t expected) +{ + uint64_t val; + + val = npu3_read(dev->npu, reg); + if (val == expected) + return 0; + + NPU3DEVERR(dev, "%s: expected 0x%llx, read 0x%llx\n", + reg_name, expected, val); + + return 1; +} + +#define REGCMP(reg, expected) \ + npu3_dev_regcmp(dev, reg(dev->index), #reg, expected) + +static uint32_t check_credits(struct npu3_dev *dev) +{ + /* Use bitwise OR to prevent short-circuit evaluation */ + if (REGCMP(NPU3_NTL_CREQ_HDR_CRED_RCV, 0x0be0be0000000000ull) | + REGCMP(NPU3_NTL_DGD_HDR_CRED_RCV, 0x0640640000000000ull) | + REGCMP(NPU3_NTL_ATSD_HDR_CRED_RCV, 0x0200200000000000ull) | + REGCMP(NPU3_NTL_RSP_HDR_CRED_RCV, 0x0be0be0000000000ull) | + REGCMP(NPU3_NTL_CREQ_DAT_CRED_RCV, 0x1001000000000000ull) | + REGCMP(NPU3_NTL_RSP_DAT_CRED_RCV, 0x1001000000000000ull)) + return NPU3_PROC_COMPLETE | NPU3_PROC_FAILED; + + return NPU3_PROC_COMPLETE; +} + +DEFINE_PROCEDURE(check_credits); + +static struct procedure *procedures[] = { + [0] = &procedure_stop, + [1] = &procedure_nop, + [4] = &procedure_phy_reset, + [5] = &procedure_phy_tx_zcal, + [6] = &procedure_phy_rx_dccal, + [7] = &procedure_phy_enable_tx_rxcal, + [8] = &procedure_phy_disable_tx_rxcal, + [9] = &procedure_phy_rx_training, + [10] = &procedure_reset_ntl, + [11] = &procedure_nop, /* Placeholder for pre-terminate */ + [12] = &procedure_nop, /* Placeholder for terminate */ + [13] = &procedure_check_credits, +}; + +void npu3_dev_procedure_init(struct npu3_dev *dev, uint32_t pnum) +{ + struct npu3_procedure *proc = &dev->proc; + const char *name; + + if (pnum >= ARRAY_SIZE(procedures) || !procedures[pnum]) { + NPU3DEVERR(dev, "Unsupported procedure number %d\n", pnum); + proc->status = NPU3_PROC_COMPLETE | NPU3_PROC_UNSUPPORTED; + return; + } + + name = procedures[pnum]->name; + + if (proc->number == pnum && !(proc->status & NPU3_PROC_COMPLETE)) + NPU3DEVINF(dev, "Restarting procedure %s\n", name); + else + NPU3DEVINF(dev, "Starting procedure %s\n", name); + + proc->status = NPU3_PROC_INPROGRESS; + proc->number = pnum; + proc->step = 0; + proc->timeout = mftb() + msecs_to_tb(1000); +} + +static uint32_t npu3_dev_procedure_run_step(struct npu3_dev *dev) +{ + struct npu3_procedure *proc = &dev->proc; + uint32_t result; + + result = 
procedures[proc->number]->steps[proc->step](dev); + if (result & NPU3_PROC_NEXT) { + proc->step++; + + NPU3DEVINF(dev, "Running procedure %s step %d\n", + procedures[proc->number]->name, proc->step); + } + + return result; +} + +static void npu3_dev_procedure_run(struct npu3_dev *dev) +{ + struct npu3_procedure *proc = &dev->proc; + const char *name; + uint32_t result; + + do { + result = npu3_dev_procedure_run_step(dev); + } while (result & NPU3_PROC_NEXT); + + name = procedures[proc->number]->name; + + if (result & NPU3_PROC_COMPLETE) { + NPU3DEVINF(dev, "Procedure %s complete\n", name); + } else if (tb_compare(mftb(), proc->timeout) == TB_AAFTERB) { + NPU3DEVINF(dev, "Procedure %s timed out\n", name); + result = NPU3_PROC_COMPLETE | NPU3_PROC_FAILED; + } + + /* Mask off internal state bits */ + proc->status = result & NPU3_PROC_STATUS_MASK; +} + +uint32_t npu3_dev_procedure_status(struct npu3_dev *dev) +{ + /* Run the procedure if not already complete */ + if (!(dev->proc.status & NPU3_PROC_COMPLETE)) + npu3_dev_procedure_run(dev); + + return dev->proc.status; +} + +int64_t npu3_dev_reset(struct npu3_dev *dev) +{ + unsigned long timeout; + + reset_ntl(dev); + timeout = mftb() + msecs_to_tb(1000); + + while (npu3_dev_fence_get(dev) != NPU3_NTL_CQ_FENCE_STATUS_FULL) { + if (tb_compare(mftb(), timeout) == TB_AAFTERB) { + NPU3DEVINF(dev, "Device reset timed out\n"); + return OPAL_BUSY; + } + } + + return OPAL_SUCCESS; +} diff --git a/roms/skiboot/hw/npu3-nvlink.c b/roms/skiboot/hw/npu3-nvlink.c new file mode 100644 index 000000000..920864b32 --- /dev/null +++ b/roms/skiboot/hw/npu3-nvlink.c @@ -0,0 +1,1828 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* + * Copyright 2019 IBM Corp. + */ + +#include <skiboot.h> +#include <device.h> +#include <phys-map.h> +#include <npu3.h> +#include <npu3-regs.h> +#include <pci-virt.h> +#include <xscom.h> +#include <xscom-p9-regs.h> +#include <interrupts.h> +#include <pci-cfg.h> +#include <pci-slot.h> +#include <cache-p9.h> + +#define NPU3LOG(l, npu, fmt, a...) \ + prlog(l, "NPU#%04x[%d:%d]: " fmt, \ + (npu)->nvlink.phb.opal_id, \ + (npu)->chip_id, \ + (npu)->index, ##a) +#define NPU3DBG(npu, fmt, a...) NPU3LOG(PR_DEBUG, npu, fmt, ##a) +#define NPU3INF(npu, fmt, a...) NPU3LOG(PR_INFO, npu, fmt, ##a) +#define NPU3ERR(npu, fmt, a...) NPU3LOG(PR_ERR, npu, fmt, ##a) + +#define NPU3DEVLOG(l, dev, fmt, a...) \ + prlog(l, "NPU#%04x:%02x:%02x.%x " fmt, \ + (dev)->npu->nvlink.phb.opal_id, \ + PCI_BUS_NUM((dev)->nvlink.pvd->bdfn), \ + PCI_DEV((dev)->nvlink.pvd->bdfn), \ + PCI_FUNC((dev)->nvlink.pvd->bdfn), ##a) +#define NPU3DEVDBG(dev, fmt, a...) NPU3DEVLOG(PR_DEBUG, dev, fmt, ##a) +#define NPU3DEVINF(dev, fmt, a...) NPU3DEVLOG(PR_INFO, dev, fmt, ##a) +#define NPU3DEVERR(dev, fmt, a...) 
NPU3DEVLOG(PR_ERR, dev, fmt, ##a) + +#define NPU3_CFG_READ(size, type) \ +static int64_t npu3_cfg_read##size(struct phb *phb, uint32_t bdfn, \ + uint32_t offset, type *data) \ +{ \ + uint32_t val; \ + int64_t ret; \ + \ + ret = pci_virt_cfg_read(phb, bdfn, offset, \ + sizeof(*data), &val); \ + *data = (type)val; \ + return ret; \ +} + +#define NPU3_CFG_WRITE(size, type) \ +static int64_t npu3_cfg_write##size(struct phb *phb, uint32_t bdfn, \ + uint32_t offset, type data) \ +{ \ + uint32_t val = data; \ + int64_t ret; \ + \ + ret = pci_virt_cfg_write(phb, bdfn, offset, \ + sizeof(data), val); \ + return ret; \ +} + +NPU3_CFG_READ(8, u8); +NPU3_CFG_READ(16, u16); +NPU3_CFG_READ(32, u32); +NPU3_CFG_WRITE(8, u8); +NPU3_CFG_WRITE(16, u16); +NPU3_CFG_WRITE(32, u32); + +static int64_t npu3_eeh_freeze_status(struct phb *phb __unused, + uint64_t pe_num __unused, + uint8_t *freeze_state, + uint16_t *pci_error_type, + uint16_t *severity) +{ + /* + * FIXME: When it's called by skiboot PCI config accessor, + * the PE number is fixed to 0, which is incorrect. We need + * introduce another PHB callback to translate it. For now, + * it keeps the skiboot PCI enumeration going. + */ + *freeze_state = OPAL_EEH_STOPPED_NOT_FROZEN; + *pci_error_type = OPAL_EEH_NO_ERROR; + + if (severity) + *severity = OPAL_EEH_SEV_NO_ERROR; + + return OPAL_SUCCESS; +} + +/* Number of PEs supported */ +#define NPU3_MAX_PE_NUM 16 +#define NPU3_RESERVED_PE_NUM 15 + +static int64_t npu3_ioda_reset(struct phb *phb, bool purge __unused) +{ + struct npu3 *npu = npu3_phb_to_npu(phb); + uint64_t val; + + val = NPU3_ATS_IODA_ADDR_AUTO_INC; + val = SETFIELD(NPU3_ATS_IODA_ADDR_TBL_SEL, val, + NPU3_ATS_IODA_ADDR_TBL_TVT); + npu3_write(npu, NPU3_ATS_IODA_ADDR, val); + + for (uint32_t i = 0; i < NPU3_MAX_PE_NUM; i++) + npu3_write(npu, NPU3_ATS_IODA_DATA, 0ull); + + return OPAL_SUCCESS; +} + +static inline void npu3_ioda_sel(struct npu3 *npu, uint32_t table, + uint32_t index) +{ + uint64_t val; + + val = SETFIELD(NPU3_ATS_IODA_ADDR_TBL_SEL, 0ull, table); + val = SETFIELD(NPU3_ATS_IODA_ADDR_TBL_ADDR, val, index); + npu3_write(npu, NPU3_ATS_IODA_ADDR, val); +} + +static int64_t npu3_map_pe_dma_window(struct phb *phb, + uint64_t pe_num, + uint16_t window_id, + uint16_t tce_levels, + uint64_t tce_table_addr, + uint64_t tce_table_size, + uint64_t tce_page_size) +{ + struct npu3 *npu = npu3_phb_to_npu(phb); + uint64_t tts_encoded, val; + uint32_t page_size; + + /* Each PE has one corresponding TVE */ + if (window_id != pe_num || pe_num >= NPU3_MAX_PE_NUM) + return OPAL_PARAMETER; + + npu3_ioda_sel(npu, NPU3_ATS_IODA_ADDR_TBL_TVT, pe_num); + + /* TCE table size zero is used to disable the TVE */ + if (!tce_table_size) { + npu3_write(npu, NPU3_ATS_IODA_DATA, 0ull); + return OPAL_SUCCESS; + } + + /* TCE table size */ + if (!is_pow2(tce_table_size) || tce_table_size < 0x1000) + return OPAL_PARAMETER; + + tts_encoded = ilog2(tce_table_size) - 11; + if (tts_encoded > 39) + return OPAL_PARAMETER; + + val = SETFIELD(NPU3_ATS_IODA_TVT_TABLE_SIZE, 0ull, tts_encoded); + + /* Number of levels */ + if (tce_levels < 1 || tce_levels > 4) + return OPAL_PARAMETER; + + val = SETFIELD(NPU3_ATS_IODA_TVT_TABLE_LEVEL, val, tce_levels - 1); + + /* TCE page size */ + switch (tce_page_size) { + case 256 << 20: + page_size = 17; + break; + case 16 << 20: + page_size = 13; + break; + case 64 << 10: + page_size = 5; + break; + default: + page_size = 1; + } + + val = SETFIELD(NPU3_ATS_IODA_TVT_PAGE_SIZE, val, page_size); + val = SETFIELD(NPU3_ATS_IODA_TVT_XLAT_ADDR, val, 
tce_table_addr >> 12); + npu3_write(npu, NPU3_ATS_IODA_DATA, val); + + return OPAL_SUCCESS; +} + +static int64_t npu3_map_pe_dma_window_real(struct phb *phb, + uint64_t pe_num, + uint16_t window_id, + uint64_t pci_start_addr __unused, + uint64_t pci_mem_size __unused) +{ + struct npu3 *npu = npu3_phb_to_npu(phb); + uint64_t val; + + /* Each PE has one corresponding TVE */ + if (window_id != pe_num || pe_num >= NPU3_MAX_PE_NUM) + return OPAL_PARAMETER; + + if (pci_mem_size) { + /* + * GPUs need to be able to access the MMIO memory space as well. + * On POWER9 this is above the top of RAM, so disable the TVT + * range check, allowing access to all memory addresses. + */ + val = 0; + } else { + /* Disable */ + val = PPC_BIT(51); + } + + npu3_ioda_sel(npu, NPU3_ATS_IODA_ADDR_TBL_TVT, pe_num); + npu3_write(npu, NPU3_ATS_IODA_DATA, val); + + return OPAL_SUCCESS; +} + +static int64_t npu3_next_error(struct phb *phb, + uint64_t *first_frozen_pe, + uint16_t *pci_error_type, + uint16_t *severity) +{ + struct npu3 *npu = npu3_phb_to_npu(phb); + uint64_t val; + uint32_t pe_num; + + if (!first_frozen_pe || !pci_error_type || !severity) + return OPAL_PARAMETER; + + *first_frozen_pe = -1; + *pci_error_type = OPAL_EEH_NO_ERROR; + *severity = OPAL_EEH_SEV_NO_ERROR; + + for (pe_num = 0; pe_num < NPU3_MAX_PE_NUM; pe_num++) { + val = npu3_read(npu, NPU3_MISC_PESTB_DATA(pe_num)); + if (!GETFIELD(NPU3_MISC_PESTB_DATA_DMA_STOPPED_STATE, val)) + continue; + + *first_frozen_pe = pe_num; + *pci_error_type = OPAL_EEH_PE_ERROR; + *severity = OPAL_EEH_SEV_PE_ER; + break; + } + + return OPAL_SUCCESS; +} + +static struct npu3_dev *npu3_bdfn_to_dev(struct npu3 *npu, uint32_t bdfn) +{ + struct pci_virt_device *pvd; + + /* All emulated devices are attached to root bus */ + if (bdfn & ~0xff) + return NULL; + + pvd = pci_virt_find_device(&npu->nvlink.phb, bdfn); + if (pvd) + return pvd->data; + + return NULL; +} + +static int npu3_match_gpu(struct phb *phb __unused, struct pci_device *pd, + void *data) +{ + const char *slot = data; + struct dt_node *dn; + char *loc_code; + + /* Ignore non-NVIDIA devices */ + if (PCI_VENDOR_ID(pd->vdid) != 0x10de) + return 0; + + /* Find the PCI device's slot location */ + for (dn = pd->dn; + dn && !dt_find_property(dn, "ibm,loc-code"); + dn = dn->parent); + + if (!dn) + return 0; + + loc_code = (char *)dt_prop_get(dn, "ibm,loc-code"); + if (streq(loc_code, slot)) + return 1; + + return 0; +} + +static void npu3_dev_find_gpu(struct npu3_dev *dev) +{ + const char *slot = dev->nvlink.loc_code; + struct phb *phb; + struct pci_device *gpu; + + if (!slot) + return; + + for_each_phb(phb) { + gpu = pci_walk_dev(phb, NULL, npu3_match_gpu, (void *)slot); + if (!gpu) + continue; + + dev->nvlink.gpu = gpu; + return; + } + + NPU3DEVINF(dev, "No PCI device found for slot '%s'\n", slot); +} + +#define VENDOR_CAP_START 0x80 +#define VENDOR_CAP_LINK_FLAG_OFFSET 0x0d + +void npu3_pvd_flag_set(struct npu3_dev *dev, uint8_t flag) +{ + uint32_t offset = VENDOR_CAP_START + VENDOR_CAP_LINK_FLAG_OFFSET; + uint32_t flags; + + PCI_VIRT_CFG_RDONLY_RD(dev->nvlink.pvd, offset, 1, &flags); + flags |= flag; + PCI_VIRT_CFG_INIT_RO(dev->nvlink.pvd, offset, 1, flags); +} + +void npu3_pvd_flag_clear(struct npu3_dev *dev, uint8_t flag) +{ + uint32_t offset = VENDOR_CAP_START + VENDOR_CAP_LINK_FLAG_OFFSET; + uint32_t flags; + + PCI_VIRT_CFG_RDONLY_RD(dev->nvlink.pvd, offset, 1, &flags); + flags &= ~flag; + PCI_VIRT_CFG_INIT_RO(dev->nvlink.pvd, offset, 1, flags); +} + +static struct lock npu3_phandle_lock = LOCK_UNLOCKED; + 
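+/*
+ * npu3_pvd_flag_set/clear() above rewrite the vendor capability's
+ * link-flag byte, which is exposed read-only in the emulated config
+ * space; hence the read-modify-reinit rather than a plain config write.
+ * Flags routed through it include NPU3_DEV_DL_RESET (toggled by the
+ * reset procedures) and NPU3_DEV_PCI_LINKED (set by npu3_dev_bind()
+ * once the emulated device is bound to a real PCI device).
+ */
+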
+static void npu3_append_phandle(struct dt_node *dn, const char *name, + uint32_t phandle) +{ + struct dt_property *prop; + uint32_t *phandles; + size_t len; + + prop = __dt_find_property(dn, name); + if (!prop) { + dt_add_property_cells(dn, name, phandle); + return; + } + + /* + * Make sure no one else has a reference to the property. Assume + * this is the only function that holds a reference to it. + */ + lock(&npu3_phandle_lock); + + /* Need to append to the property */ + len = prop->len + sizeof(*phandles); + dt_resize_property(&prop, len); + + phandles = (uint32_t *)prop->prop; + phandles[len / sizeof(*phandles) - 1] = phandle; + + unlock(&npu3_phandle_lock); +} + +static void npu3_dev_fixup_dt(struct npu3_dev *dev) +{ + struct pci_device *pd = dev->nvlink.pd; + struct pci_device *gpu = dev->nvlink.gpu; + + dt_add_property_cells(pd->dn, "ibm,nvlink", dev->dn->phandle); + dt_add_property_string(pd->dn, "ibm,loc-code", dev->nvlink.loc_code); + if (dev->link_speed != 0xff) + dt_add_property_cells(pd->dn, "ibm,nvlink-speed", + lo32(dev->link_speed)); + + if (!gpu) + return; + + npu3_append_phandle(gpu->dn, "ibm,npu", pd->dn->phandle); + dt_add_property_cells(pd->dn, "ibm,gpu", gpu->dn->phandle); +} + +static int64_t npu3_gpu_bridge_sec_bus_reset(void *pdev, + struct pci_cfg_reg_filter *pcrf __unused, + uint32_t offset, uint32_t len, + uint32_t *data, bool write) +{ + struct pci_device *pd = pdev; + struct pci_device *gpu; + struct npu3 *npu; + struct npu3_dev *dev; + bool purge = false; + + if (!write) + return OPAL_PARAMETER; + + if (len != 2 || offset & 1) { + PCIERR(pd->phb, pd->bdfn, + "Unsupported write to bridge control register\n"); + return OPAL_PARAMETER; + } + + if (!(*data & PCI_CFG_BRCTL_SECONDARY_RESET)) + return OPAL_PARTIAL; + + gpu = list_top(&pd->children, struct pci_device, link); + if (!gpu) + return OPAL_PARTIAL; + + npu3_for_each_nvlink_npu(npu) + npu3_for_each_nvlink_dev(dev, npu) + if (dev->nvlink.gpu == gpu) + if (!npu3_dev_reset(dev)) + purge = true; + + if (purge) + purge_l2_l3_caches(); + + return OPAL_PARTIAL; +} + +static int npu3_dev_bind(struct phb *phb, struct pci_device *pd, + void *data __unused) +{ + struct npu3 *npu = npu3_phb_to_npu(phb); + struct npu3_dev *dev = npu3_bdfn_to_dev(npu, pd->bdfn); + struct pci_device *gpu; + + dev->nvlink.pd = pd; + + /* The slot label indicates which GPU this link is connected to */ + dev->nvlink.loc_code = dt_prop_get_def(dev->dn, "ibm,slot-label", NULL); + if (!dev->nvlink.loc_code) { + /** + * @fwts-label NPUNoPHBSlotLabel + * @fwts-advice No GPU/NPU slot information was found. + * NVLink3 functionality will not work. 
+ */ + NPU3DEVERR(dev, "Cannot find GPU slot information\n"); + } + + npu3_dev_find_gpu(dev); + npu3_dev_fixup_dt(dev); + + gpu = dev->nvlink.gpu; + if (!gpu) + return 0; + + /* When a GPU is reset, ensure all of its links are reset too */ + if (gpu->parent && gpu->parent->slot) + pci_add_cfg_reg_filter(gpu->parent, PCI_CFG_BRCTL, 2, + PCI_REG_FLAG_WRITE, + npu3_gpu_bridge_sec_bus_reset); + + npu3_pvd_flag_set(dev, NPU3_DEV_PCI_LINKED); + + return 0; +} + +struct npu3 *npu3_next_nvlink_npu(struct npu3 *npu, uint32_t chip_id) +{ + uint64_t phb_id = 0; + struct phb *phb; + + if (npu) + phb_id = npu->nvlink.phb.opal_id + 1; + + for (; (phb = __pci_next_phb_idx(&phb_id));) { + if (phb->phb_type != phb_type_npu_v3) + continue; + + npu = npu3_phb_to_npu(phb); + if (npu->chip_id == chip_id || chip_id == NPU3_ANY_CHIP) + return npu; + } + + return NULL; +} + +static struct npu3 *npu3_last_npu(void) +{ + static struct npu3 *last = NULL; + struct npu3 *npu; + + if (last) + return last; + + npu3_for_each_nvlink_npu(npu) + last = npu; + + return last; +} + +static uint32_t npu3_gpu_links(struct pci_device *gpu) +{ + const struct dt_property *prop; + + if (!gpu) + return 0; + + /* The link count is the number of phandles in "ibm,npu" */ + prop = dt_find_property(gpu->dn, "ibm,npu"); + if (!prop) + return 0; + + return prop->len / sizeof(uint32_t); +} + +static uint32_t npu3_links_per_gpu(void) +{ + struct npu3 *npu; + struct npu3_dev *dev; + uint32_t links = 0; + + /* Use the first GPU we find to figure this out */ + npu3_for_each_nvlink_npu(npu) { + npu3_for_each_nvlink_dev(dev, npu) { + links = npu3_gpu_links(dev->nvlink.gpu); + if (links) + goto out; + } + } + +out: + prlog(PR_DEBUG, "NPU: %s: %d\n", __func__, links); + + return links; +} + +int32_t npu3_dev_gpu_index(struct npu3_dev *dev) +{ + const char *slot; + char *p = NULL; + int ret; + + slot = dev->nvlink.loc_code; + if (!slot) + return -1; + + if (memcmp(slot, "GPU", 3)) + return -1; + + ret = strtol(slot + 3, &p, 10); + if (*p || p == slot + 3) + return -1; + + return ret; +} + +static uint32_t npu3_chip_possible_gpu_links(void) +{ + struct proc_chip *chip; + struct npu3 *npu; + struct npu3_dev *dev; + uint32_t possible = 0; + + for_each_chip(chip) { + npu3_for_each_chip_nvlink_npu(npu, chip->id) + npu3_for_each_nvlink_dev(dev, npu) + if (npu3_dev_gpu_index(dev) != -1) + possible++; + + if (possible) + break; + } + + prlog(PR_DEBUG, "NPU: %s: %d\n", __func__, possible); + + return possible; +} + +uint32_t npu3_chip_possible_gpus(void) +{ + static uint32_t possible = -1; + uint32_t links_per_gpu; + + /* Static value, same for all chips; only do this once */ + if (possible != -1) + return possible; + + possible = 0; + + links_per_gpu = npu3_links_per_gpu(); + if (links_per_gpu) + possible = npu3_chip_possible_gpu_links() / links_per_gpu; + + prlog(PR_DEBUG, "NPU: %s: %d\n", __func__, possible); + + return possible; +} + +static void npu3_dev_assign_gmb(struct npu3_dev *dev, uint64_t addr, + uint64_t size) +{ + uint32_t mode; + uint64_t val; + + switch (npu3_gpu_links(dev->nvlink.gpu)) { + case 0: + return; + case 1: + mode = 0; + break; + case 2: + mode = 1; + break; + case 3: + mode = 3; + break; + case 4: + mode = 6; + break; + case 6: + mode = 10; + break; + default: + /* Hardware does not support this configuration */ + assert(0); + } + + mode += PCI_FUNC(dev->nvlink.pvd->bdfn); + + val = NPU3_GPU_MEM_BAR_ENABLE | + NPU3_GPU_MEM_BAR_POISON; + val = SETFIELD(NPU3_GPU_MEM_BAR_ADDR, val, addr >> 30); + val = SETFIELD(NPU3_GPU_MEM_BAR_SIZE, 
val, size >> 30); + val = SETFIELD(NPU3_GPU_MEM_BAR_MODE, val, mode); + + npu3_write(dev->npu, NPU3_GPU_MEM_BAR(dev->index), val); +} + +static struct dt_node *npu3_create_memory_dn(struct npu3_dev *dev, + uint32_t gpu_index, uint64_t addr, + uint64_t size) +{ + uint32_t nid = 255 - gpu_index; + struct dt_node *mem; + + mem = dt_find_by_name_addr(dt_root, "memory", addr); + if (mem) + return mem; + + mem = dt_new_addr(dt_root, "memory", addr); + assert(mem); + + dt_add_property_string(mem, "device_type", "memory"); + dt_add_property_string(mem, "compatible", "ibm,coherent-device-memory"); + dt_add_property_u64s(mem, "reg", addr, size); + dt_add_property_u64s(mem, "linux,usable-memory", addr, 0); + dt_add_property_cells(mem, "ibm,chip-id", nid); + dt_add_property_cells(mem, "ibm,associativity", 4, nid, nid, nid, nid); + + NPU3INF(dev->npu, "%s mem: 0x%016llx (nid %d)\n", dev->nvlink.loc_code, + addr, nid); + + return mem; +} + +static void npu3_dev_init_gpu_mem(struct npu3_dev *dev) +{ + struct pci_device *pd = dev->nvlink.pd; + struct npu3 *npu = dev->npu; + struct dt_node *mem; + uint64_t addr, size, gta; + uint32_t gpu_index; + + if (!dev->nvlink.gpu) + return; + + gpu_index = npu3_dev_gpu_index(dev) % npu3_chip_possible_gpus(); + phys_map_get(npu->chip_id, GPU_MEM_4T_DOWN, gpu_index, &addr, &size); + + npu3_dev_assign_gmb(dev, addr, size); + mem = npu3_create_memory_dn(dev, gpu_index, addr, size); + + /* + * Coral mode address compression. This is documented in Figure 3.5 of + * the NPU workbook; "P9->GPU RA Compression (Coral)". + */ + gta = (addr >> 42 & 0x1) << 42; + gta |= (addr >> 45 & 0x3) << 43; + gta |= (addr >> 49 & 0x3) << 45; + gta |= addr & ((1ul << 43) - 1); + + dt_add_property_cells(pd->dn, "memory-region", mem->phandle); + dt_add_property_u64s(pd->dn, "ibm,device-tgt-addr", gta); +} + +static void npu3_final_fixup(void) +{ + struct npu3 *npu; + struct npu3_dev *dev; + + npu3_for_each_nvlink_npu(npu) + npu3_for_each_nvlink_dev(dev, npu) + npu3_dev_init_gpu_mem(dev); +} + +static void npu3_phb_final_fixup(struct phb *phb) +{ + struct npu3 *npu = npu3_phb_to_npu(phb); + + pci_walk_dev(phb, NULL, npu3_dev_bind, NULL); + + /* + * After every npu's devices are bound, do gpu-related fixup. This + * counts on npu3_last_npu() walking the phbs in the same order as + * the PHB final fixup loop in __pci_init_slots(). 
+ */ + if (npu == npu3_last_npu()) + npu3_final_fixup(); +} + +static int64_t npu3_set_pe(struct phb *phb, + uint64_t pe_num, + uint64_t bdfn, + uint8_t bcompare, + uint8_t dcompare, + uint8_t fcompare, + uint8_t action) +{ + struct npu3 *npu = npu3_phb_to_npu(phb); + struct npu3_dev *dev; + uint64_t val; + + dev = npu3_bdfn_to_dev(npu, bdfn); + if (!dev) + return OPAL_PARAMETER; + + if (action != OPAL_MAP_PE && action != OPAL_UNMAP_PE) + return OPAL_PARAMETER; + + if (pe_num >= NPU3_MAX_PE_NUM) + return OPAL_PARAMETER; + + if (bcompare != OpalPciBusAll || + dcompare != OPAL_COMPARE_RID_DEVICE_NUMBER || + fcompare != OPAL_COMPARE_RID_FUNCTION_NUMBER) + return OPAL_UNSUPPORTED; + + if (!dev->nvlink.gpu) + return OPAL_SUCCESS; + + val = NPU3_CTL_BDF2PE_CFG_ENABLE; + val = SETFIELD(NPU3_CTL_BDF2PE_CFG_PE, val, pe_num); + val = SETFIELD(NPU3_CTL_BDF2PE_CFG_BDF, val, dev->nvlink.gpu->bdfn); + npu3_write(npu, NPU3_CTL_BDF2PE_CFG(pe_num), val); + + val = NPU3_MISC_BDF2PE_CFG_ENABLE; + val = SETFIELD(NPU3_MISC_BDF2PE_CFG_PE, val, pe_num); + val = SETFIELD(NPU3_MISC_BDF2PE_CFG_BDF, val, dev->nvlink.gpu->bdfn); + npu3_write(npu, NPU3_MISC_BDF2PE_CFG(pe_num), val); + + return OPAL_SUCCESS; +} + +static int64_t npu3_tce_kill_pages(struct npu3 *npu, + uint64_t pe_num, + uint32_t tce_size, + uint64_t dma_addr, + uint32_t npages) +{ + uint32_t check_tce_size; + uint64_t val; + + if (pe_num >= NPU3_MAX_PE_NUM) + return OPAL_PARAMETER; + + npu3_ioda_sel(npu, NPU3_ATS_IODA_ADDR_TBL_TVT, pe_num); + val = npu3_read(npu, NPU3_ATS_IODA_DATA); + + check_tce_size = 0x800 << GETFIELD(NPU3_ATS_IODA_TVT_PAGE_SIZE, val); + if (check_tce_size != tce_size) { + NPU3ERR(npu, "%s: Unexpected TCE size (got 0x%x, expected 0x%x)\n", + __func__, tce_size, check_tce_size); + + return OPAL_PARAMETER; + } + + val = NPU3_ATS_TCE_KILL_ONE; + val = SETFIELD(NPU3_ATS_TCE_KILL_PE_NUMBER, val, pe_num); + + while (npages--) { + val = SETFIELD(NPU3_ATS_TCE_KILL_ADDRESS, val, dma_addr >> 12); + npu3_write(npu, NPU3_ATS_TCE_KILL, val); + + dma_addr += tce_size; + } + + return OPAL_SUCCESS; +} + +static int64_t npu3_tce_kill(struct phb *phb, + uint32_t kill_type, + uint64_t pe_num, + uint32_t tce_size, + uint64_t dma_addr, + uint32_t npages) +{ + struct npu3 *npu = npu3_phb_to_npu(phb); + + sync(); + + switch(kill_type) { + case OPAL_PCI_TCE_KILL_PAGES: + return npu3_tce_kill_pages(npu, pe_num, tce_size, + dma_addr, npages); + case OPAL_PCI_TCE_KILL_PE: + /* + * NPU doesn't support killing a PE so fall through + * and do a kill all instead. 
+ */ + case OPAL_PCI_TCE_KILL_ALL: + npu3_write(npu, NPU3_ATS_TCE_KILL, NPU3_ATS_TCE_KILL_ALL); + return OPAL_SUCCESS; + } + + return OPAL_PARAMETER; +} + +static const struct phb_ops npu_ops = { + .cfg_read8 = npu3_cfg_read8, + .cfg_read16 = npu3_cfg_read16, + .cfg_read32 = npu3_cfg_read32, + .cfg_write8 = npu3_cfg_write8, + .cfg_write16 = npu3_cfg_write16, + .cfg_write32 = npu3_cfg_write32, + .eeh_freeze_status = npu3_eeh_freeze_status, + .ioda_reset = npu3_ioda_reset, + .map_pe_dma_window = npu3_map_pe_dma_window, + .map_pe_dma_window_real = npu3_map_pe_dma_window_real, + .next_error = npu3_next_error, + .phb_final_fixup = npu3_phb_final_fixup, + .set_pe = npu3_set_pe, + .tce_kill = npu3_tce_kill, +}; + +static int64_t npu3_reset(struct pci_slot *slot) +{ + struct npu3 *npu = npu3_phb_to_npu(slot->phb); + struct npu3_dev *dev; + int64_t rc = OPAL_SUCCESS; + bool purge = false; + + npu3_for_each_nvlink_dev(dev, npu) { + rc = npu3_dev_reset(dev); + if (rc) + break; + + purge = true; + } + + /* No devices reset; don't purge, just return */ + if (!purge) + return rc; + + /* All devices reset */ + if (!rc) + return purge_l2_l3_caches(); + + /* Some devices successfully reset; purge, but still return error */ + purge_l2_l3_caches(); + return rc; +} + +static int64_t npu3_freset(struct pci_slot *slot __unused) +{ + return OPAL_SUCCESS; +} + +static int64_t npu3_get_link_state(struct pci_slot *slot __unused, + uint8_t *val) +{ + *val = OPAL_SHPC_LINK_UP_x1; + return OPAL_SUCCESS; +} + +static int64_t npu3_get_power_state(struct pci_slot *slot __unused, + uint8_t *val) +{ + *val = PCI_SLOT_POWER_ON; + return OPAL_SUCCESS; +} + +static void npu3_create_phb_slot(struct npu3 *npu) +{ + struct pci_slot *slot; + + slot = pci_slot_alloc(&npu->nvlink.phb, NULL); + if (!slot) + return; + + /* Elementary functions */ + slot->ops.creset = npu3_reset; + slot->ops.freset = npu3_freset; + slot->ops.hreset = npu3_reset; + slot->ops.get_link_state = npu3_get_link_state; + slot->ops.get_power_state = npu3_get_power_state; +} + +static void npu3_create_phb(struct npu3 *npu) +{ + struct phb *phb = &npu->nvlink.phb; + + phb->phb_type = phb_type_npu_v3; + phb->ops = &npu_ops; + phb->dt_node = dt_new_addr(dt_root, "pciex", npu->regs[0]); + assert(phb->dt_node); + + list_head_init(&phb->virt_devices); + pci_register_phb(phb, npu3_get_opal_id(npu->chip_id, + npu3_get_phb_index(npu->index))); + npu3_create_phb_slot(npu); + npu3_ioda_reset(phb, true); +} + +static void npu3_dev_init_hw(struct npu3_dev *dev) +{ + struct npu3 *npu = dev->npu; + uint64_t reg, val; + + reg = NPU3_RELAXED_CFG2(dev->index); + val = npu3_read(npu, reg); + val |= NPU3_RELAXED_CFG2_CMD_CL_DMA_W | + NPU3_RELAXED_CFG2_CMD_CL_DMA_W_HP | + NPU3_RELAXED_CFG2_CMD_CL_DMA_INJ | + NPU3_RELAXED_CFG2_CMD_PR_DMA_INJ | + NPU3_RELAXED_CFG2_CMD_DMA_PR_W | + NPU3_RELAXED_CFG2_CMD_CL_RD_NC_F0 | + NPU3_RELAXED_CFG2_SRC_RDENA(0); + npu3_write(npu, reg, val); + + reg = NPU3_NTL_MISC_CFG2(dev->index); + val = npu3_read(npu, reg); + val |= NPU3_NTL_MISC_CFG2_BRICK_ENABLE | + NPU3_NTL_MISC_CFG2_RCV_CREDIT_OVERFLOW_ENA; + npu3_write(npu, reg, val); +} + +static void npu3_init_hw(struct npu3 *npu) +{ + struct npu3_dev *dev; + uint64_t reg, val; + + reg = NPU3_XTS_CFG; + val = npu3_read(npu, reg); + val |= NPU3_XTS_CFG_MMIOSD | NPU3_XTS_CFG_TRY_ATR_RO; + npu3_write(npu, reg, val); + + reg = NPU3_XTS_CFG2; + val = npu3_read(npu, reg); + val |= NPU3_XTS_CFG2_NO_FLUSH_ENA; + npu3_write(npu, reg, val); + + reg = NPU3_RELAXED_SRC(0); + val = NPU3_RELAXED_SRC_MASK_NPU; + 
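	/*
+	 * Relaxed-ordering source slot 0 is seeded with the NPU mask; the
+	 * remaining NPU3_RELAXED_SRC() slots are left at zero so that
+	 * npu3_relaxed_order_enable() can hand them out at runtime to the
+	 * per-PEC sources requested via npu3_set_relaxed_order().
+	 */
+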
npu3_write(npu, reg, val); + + npu3_for_each_nvlink_dev(dev, npu) + npu3_dev_init_hw(dev); +} + +/* PCI command register (BAR enable/disable) */ +static int64_t npu3_cfg_cmd(void *pvd, + struct pci_cfg_reg_filter *pcrf __unused, + uint32_t offset, uint32_t size, + uint32_t *data, bool write) +{ + struct npu3_dev *dev = ((struct pci_virt_device *)pvd)->data; + + if (!write) + return OPAL_PARTIAL; + + if (offset != PCI_CFG_CMD) + return OPAL_PARAMETER; + + if (size != 1 && size != 2 && size != 4) + return OPAL_PARAMETER; + + npu3_dev_enable_bars(dev, !!(*data & PCI_CFG_CMD_MEM_EN)); + + return OPAL_PARTIAL; +} + +static int64_t npu3_cfg_bar_write(struct npu3_bar *bar, uint64_t mask, + uint32_t data) +{ + if (data != 0xffffffff) + return OPAL_HARDWARE; + + /* Return BAR size on next read */ + bar->trap |= mask; + + return OPAL_SUCCESS; +} + +static int64_t npu3_cfg_bar_read(struct npu3_bar *bar, uint64_t mask, + uint32_t *data) +{ + if (!(bar->trap & mask)) + return OPAL_PARTIAL; + + *data = GETFIELD(mask, bar->size); + bar->trap &= ~mask; + + return OPAL_SUCCESS; +} + +/* PCI BAR registers (NTL/GENID) */ +static int64_t npu3_cfg_bar(void *pvd __unused, + struct pci_cfg_reg_filter *pcrf, + uint32_t offset, uint32_t size, uint32_t *data, + bool write) +{ + struct npu3_bar *bar = (struct npu3_bar *)pcrf->data; + uint64_t mask; + + if (size != 4) + return OPAL_PARAMETER; + + if (offset == pcrf->start) + mask = 0xffffffff; + else if (offset == pcrf->start + 4) + mask = 0xffffffffull << 32; + else + return OPAL_PARAMETER; + + if (write) + return npu3_cfg_bar_write(bar, mask, *data); + + return npu3_cfg_bar_read(bar, mask, data); +} + +/* PCI control register */ +static int64_t npu3_cfg_devctl(void *pvd, + struct pci_cfg_reg_filter *pcrf __unused, + uint32_t offset, uint32_t size, + uint32_t *data, bool write) +{ + struct npu3_dev *dev = ((struct pci_virt_device *)pvd)->data; + + if (!write) + return OPAL_HARDWARE; + + if (size != 2 || offset & 1) { + NPU3DEVERR(dev, "Unsupported write to pcie control register\n"); + return OPAL_PARAMETER; + } + + if (*data & PCICAP_EXP_DEVCTL_FUNC_RESET) + if (!npu3_dev_reset(dev)) + purge_l2_l3_caches(); + + return OPAL_PARTIAL; +} + +static uint32_t npu3_cfg_populate_pcie_cap(struct npu3_dev *dev, uint32_t start, + uint32_t prev_cap) +{ + struct pci_virt_device *pvd = dev->nvlink.pvd; + uint32_t val; + + /* Add capability list */ + PCI_VIRT_CFG_INIT_RO(pvd, prev_cap, 1, start); + PCI_VIRT_CFG_INIT_RO(pvd, start, 1, PCI_CFG_CAP_ID_EXP); + + /* 0x00 - ID/PCIE capability */ + val = PCI_CFG_CAP_ID_EXP; + val |= 0x2 << 16 | PCIE_TYPE_ENDPOINT << 20; + PCI_VIRT_CFG_INIT_RO(pvd, start, 4, val); + + /* 0x04 - Device capability */ + val = PCIE_MPSS_128 | + PCIE_PHANTOM_NONE << 3 | + PCIE_L0SL_MAX_NO_LIMIT << 6 | + PCIE_L1L_MAX_NO_LIMIT << 9 | + PCICAP_EXP_DEVCAP_FUNC_RESET; + PCI_VIRT_CFG_INIT_RO(pvd, start + PCICAP_EXP_DEVCAP, 4, val); + + pci_virt_add_filter(pvd, start + PCICAP_EXP_DEVCTL, 2, + PCI_REG_FLAG_WRITE, + npu3_cfg_devctl, NULL); + + /* 0x08 - Device control and status */ + PCI_VIRT_CFG_INIT(pvd, start + PCICAP_EXP_DEVCTL, 4, 0x00002810, + 0xffff0000, 0x000f0000); + + /* 0x0c - Link capability */ + val = PCIE_LSPEED_VECBIT_2 | PCIE_LWIDTH_1X << 4; + PCI_VIRT_CFG_INIT_RO(pvd, start + PCICAP_EXP_LCAP, 4, val); + + /* 0x10 - Link control and status */ + PCI_VIRT_CFG_INIT(pvd, start + PCICAP_EXP_LCTL, 4, 0x00130000, + 0xfffff000, 0xc0000000); + + /* 0x14 - Slot capability */ + PCI_VIRT_CFG_INIT_RO(pvd, start + PCICAP_EXP_SLOTCAP, 4, 0x00000000); + + /* 0x18 - 
Slot control and status */ + PCI_VIRT_CFG_INIT_RO(pvd, start + PCICAP_EXP_SLOTCTL, 4, 0x00000000); + + /* 0x1c - Root control and capability */ + PCI_VIRT_CFG_INIT(pvd, start + PCICAP_EXP_RC, 4, 0x00000000, + 0xffffffe0, 0x00000000); + + /* 0x20 - Root status */ + PCI_VIRT_CFG_INIT(pvd, start + PCICAP_EXP_RSTAT, 4, 0x00000000, + 0xffffffff, 0x00010000); + + /* 0x24 - Device capability 2 */ + PCI_VIRT_CFG_INIT_RO(pvd, start + PCIECAP_EXP_DCAP2, 4, 0x00000000); + + /* 0x28 - Device Control and status 2 */ + PCI_VIRT_CFG_INIT(pvd, start + PCICAP_EXP_DCTL2, 4, 0x00070000, + 0xffff0000, 0x00000000); + + /* 0x2c - Link capability 2 */ + PCI_VIRT_CFG_INIT_RO(pvd, start + PCICAP_EXP_LCAP2, 4, 0x00000007); + + /* 0x30 - Link control and status 2 */ + PCI_VIRT_CFG_INIT(pvd, start + PCICAP_EXP_LCTL2, 4, 0x00000003, + 0xffff0000, 0x00200000); + + /* 0x34 - Slot capability 2 */ + PCI_VIRT_CFG_INIT_RO(pvd, start + PCICAP_EXP_SCAP2, 4, 0x00000000); + + /* 0x38 - Slot control and status 2 */ + PCI_VIRT_CFG_INIT_RO(pvd, start + PCICAP_EXP_SCTL2, 4, 0x00000000); + + return start + PCICAP_EXP_SCTL2 + 8; +} + +static int64_t npu3_dev_procedure_write(struct npu3_dev *dev, uint32_t offset, + uint32_t data) +{ + switch (offset) { + case 0: + NPU3DEVINF(dev, "Ignoring write to status register\n"); + break; + case 4: + npu3_dev_procedure_init(dev, data); + break; + default: + return OPAL_PARAMETER; + } + + return OPAL_SUCCESS; +} + +static int64_t npu3_dev_procedure_read(struct npu3_dev *dev, uint32_t offset, + uint32_t *data) +{ + switch (offset) { + case 0: + *data = npu3_dev_procedure_status(dev); + break; + case 4: + *data = dev->proc.number; + break; + default: + *data = 0; + return OPAL_PARAMETER; + } + + return OPAL_SUCCESS; +} + +/* Hardware procedure control/status registers */ +static int64_t npu3_dev_procedure(void *pvd, struct pci_cfg_reg_filter *pcrf, + uint32_t offset, uint32_t size, + uint32_t *data, bool write) +{ + struct npu3_dev *dev = ((struct pci_virt_device *)pvd)->data; + + if (size != 4) + return OPAL_PARAMETER; + + offset -= pcrf->start; + + if (write) + return npu3_dev_procedure_write(dev, offset, *data); + + return npu3_dev_procedure_read(dev, offset, data); +} + +/* PPE SRAM access is indirect via CSAR/CSDR */ +static void npu3_dev_ppe_sram_sel(struct npu3_dev *dev, uint32_t reg) +{ + uint64_t val; + + val = SETFIELD(OB_PPE_CSAR_SRAM_ADDR, 0ull, reg); + xscom_write(dev->npu->chip_id, OB_PPE_CSAR(dev->ob_chiplet), val); +} + +static void npu3_dev_ppe_sram_write(struct npu3_dev *dev, uint32_t reg, + uint64_t val) +{ + npu3_dev_ppe_sram_sel(dev, reg); + xscom_write(dev->npu->chip_id, OB_PPE_CSDR(dev->ob_chiplet), val); +} + +static uint64_t npu3_dev_ppe_sram_read(struct npu3_dev *dev, uint32_t reg) +{ + uint64_t val; + + npu3_dev_ppe_sram_sel(dev, reg); + xscom_read(dev->npu->chip_id, OB_PPE_CSDR(dev->ob_chiplet), &val); + + return val; +} + +/* Software-implemented autonomous link training (SALT) */ +static int64_t npu3_dev_salt(void *pvd, struct pci_cfg_reg_filter *pcrf, + uint32_t offset, uint32_t size, uint32_t *data, + bool write) +{ + struct npu3_dev *dev = ((struct pci_virt_device *)pvd)->data; + unsigned long timeout; + uint32_t cmd_reg; + uint64_t val; + + if (size != 4 || offset != pcrf->start) + return OPAL_PARAMETER; + + /* The config register before this one holds CMD_REG */ + PCI_VIRT_CFG_NORMAL_RD(pvd, pcrf->start - 4, 4, &cmd_reg); + if (cmd_reg == 0xffffffff) + return OPAL_PARAMETER; + + /* Check for another command in progress */ + val = npu3_dev_ppe_sram_read(dev, 
OB_PPE_SALT_CMD); + if (GETFIELD(OB_PPE_SALT_CMD_READY, val)) { + NPU3DEVINF(dev, "SALT_CMD 0x%x: Not ready\n", cmd_reg); + return OPAL_BUSY; + } + + val = OB_PPE_SALT_CMD_READY; + val = SETFIELD(OB_PPE_SALT_CMD_RW, val, write); + val = SETFIELD(OB_PPE_SALT_CMD_LINKNUM, val, npu3_chip_dev_index(dev)); + val = SETFIELD(OB_PPE_SALT_CMD_REG, val, cmd_reg); + if (write) + val = SETFIELD(OB_PPE_SALT_CMD_DATA, val, *data); + + npu3_dev_ppe_sram_write(dev, OB_PPE_SALT_CMD, val); + + /* Wait for the go bit to clear */ + timeout = mftb() + msecs_to_tb(1000); + + while (GETFIELD(OB_PPE_SALT_CMD_READY, val)) { + if (tb_compare(mftb(), timeout) == TB_AAFTERB) { + NPU3DEVINF(dev, "SALT_CMD 0x%x: Timeout\n", cmd_reg); + return OPAL_BUSY; + } + + val = npu3_dev_ppe_sram_read(dev, OB_PPE_SALT_CMD); + } + + if (GETFIELD(OB_PPE_SALT_CMD_ERR, val)) + NPU3DEVINF(dev, "SALT_CMD 0x%x: Error\n", cmd_reg); + + if (!write) + *data = GETFIELD(OB_PPE_SALT_CMD_DATA, val); + + return OPAL_SUCCESS; +} + +#define VENDOR_CAP_LEN 0x1c +#define VENDOR_CAP_VERSION 0x02 + +static uint32_t npu3_cfg_populate_vendor_cap(struct npu3_dev *dev, + uint32_t start, uint32_t prev_cap) +{ + struct pci_virt_device *pvd = dev->nvlink.pvd; + + /* Capabilities list */ + PCI_VIRT_CFG_INIT_RO(pvd, prev_cap, 1, start); + PCI_VIRT_CFG_INIT_RO(pvd, start, 1, PCI_CFG_CAP_ID_VENDOR); + + /* Length and version */ + PCI_VIRT_CFG_INIT_RO(pvd, start + 2, 1, VENDOR_CAP_LEN); + PCI_VIRT_CFG_INIT_RO(pvd, start + 3, 1, VENDOR_CAP_VERSION); + + /* + * Defaults when the trap can't handle the read/write (eg. due to + * reading/writing less than 4 bytes). + */ + PCI_VIRT_CFG_INIT_RO(pvd, start + 4, 4, 0); + PCI_VIRT_CFG_INIT_RO(pvd, start + 8, 4, 0); + + /* PHY procedure trap */ + pci_virt_add_filter(pvd, start + 4, 8, + PCI_REG_FLAG_READ | PCI_REG_FLAG_WRITE, + npu3_dev_procedure, NULL); + + /* Link index */ + PCI_VIRT_CFG_INIT_RO(pvd, start + 0xc, 1, npu3_chip_dev_index(dev)); + + /* SALT registers */ + PCI_VIRT_CFG_INIT(pvd, start + 0x10, 4, 0xffffffff, 0, 0); + PCI_VIRT_CFG_INIT_RO(pvd, start + 0x14, 4, 0); + + pci_virt_add_filter(pvd, start + 0x14, 4, + PCI_REG_FLAG_READ | PCI_REG_FLAG_WRITE, + npu3_dev_salt, NULL); + + return start + VENDOR_CAP_LEN; +} + +static void npu3_cfg_populate(struct npu3_dev *dev) +{ + struct pci_virt_device *pvd = dev->nvlink.pvd; + uint64_t addr; + uint32_t pos; + + /* 0x00 - Vendor/Device ID */ + PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_VENDOR_ID, 4, 0x04ea1014); + + /* 0x04 - Command/Status */ + PCI_VIRT_CFG_INIT(pvd, PCI_CFG_CMD, 4, 0x00100000, 0xffb802b8, + 0xf9000000); + + pci_virt_add_filter(pvd, PCI_CFG_CMD, 1, PCI_REG_FLAG_WRITE, + npu3_cfg_cmd, NULL); + + /* 0x08 - Rev/Class/Cache */ + PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_REV_ID, 4, 0x06800102); + + /* 0x0c - CLS/Latency Timer/Header/BIST */ + PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_CACHE_LINE_SIZE, 4, 0x00800000); + + /* 0x10/14 - NTL BAR */ + addr = SETFIELD(0xf, dev->ntl_bar.addr, + PCI_CFG_BAR_TYPE_MEM | PCI_CFG_BAR_MEM64); + PCI_VIRT_CFG_INIT(pvd, PCI_CFG_BAR0, 4, lo32(addr), 0xf, 0); + PCI_VIRT_CFG_INIT(pvd, PCI_CFG_BAR1, 4, hi32(addr), 0, 0); + + pci_virt_add_filter(pvd, PCI_CFG_BAR0, 8, + PCI_REG_FLAG_READ | PCI_REG_FLAG_WRITE, + npu3_cfg_bar, &dev->ntl_bar); + + /* 0x18/1c - GENID BAR */ + addr = SETFIELD(0xf, dev->genid_bar.addr, + PCI_CFG_BAR_TYPE_MEM | PCI_CFG_BAR_MEM64); + PCI_VIRT_CFG_INIT(pvd, PCI_CFG_BAR2, 4, lo32(addr), 0xf, 0); + PCI_VIRT_CFG_INIT(pvd, PCI_CFG_BAR3, 4, hi32(addr), 0, 0); + + pci_virt_add_filter(pvd, PCI_CFG_BAR2, 8, + PCI_REG_FLAG_READ | 
PCI_REG_FLAG_WRITE, + npu3_cfg_bar, &dev->genid_bar); + + /* 0x20/0x24 - BARs, disabled */ + PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_BAR4, 4, 0x00000000); + PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_BAR5, 4, 0x00000000); + + /* 0x28 - Cardbus CIS pointer */ + PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_CARDBUS_CIS, 4, 0x00000000); + + /* 0x2c - Subsystem ID */ + PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_SUBSYS_VENDOR_ID, 4, 0x00000000); + + /* 0x30 - ROM BAR, zero sized */ + PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_ROMBAR, 4, 0xffffffff); + + /* 0x34 - PCI Capability */ + PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_CAP, 4, 0x00000000); + + /* 0x38 - Reserved */ + PCI_VIRT_CFG_INIT_RO(pvd, 0x38, 4, 0x00000000); + + /* 0x3c - INT line/pin/Minimal grant/Maximal latency */ + PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_INT_LINE, 4, 0x00000100); /* INT A */ + + /* PCIE and vendor specific capability */ + pos = npu3_cfg_populate_pcie_cap(dev, 0x40, PCI_CFG_CAP); + pos = npu3_cfg_populate_vendor_cap(dev, pos, 0x41); + PCI_VIRT_CFG_INIT_RO(pvd, pos + 1, 1, 0); +} + +static void npu3_dev_create_pvd(struct npu3_dev *dev) +{ + struct npu3 *npu = dev->npu; + struct phb *phb = &npu->nvlink.phb; + + dev->nvlink.pvd = pci_virt_add_device(phb, dev->index, 0x100, dev); + if (!dev->nvlink.pvd) + return; + + phb->scan_map |= 0x1 << GETFIELD(0xf8, dev->nvlink.pvd->bdfn); + npu3_cfg_populate(dev); +} + +static void npu3_dt_add_mmio_atsd(struct npu3 *npu) +{ + struct dt_node *dn = npu->nvlink.phb.dt_node; + uint64_t mmio_atsd[NPU3_XTS_ATSD_MAX]; + + for (uint32_t i = 0; i < NPU3_XTS_ATSD_MAX; i++) + mmio_atsd[i] = npu->regs[0] + NPU3_XTS_ATSD_LAUNCH(i); + + dt_add_property(dn, "ibm,mmio-atsd", mmio_atsd, sizeof(mmio_atsd)); +} + +static void npu3_dt_add_mmio_window(struct npu3 *npu) +{ + struct dt_node *dn = npu->nvlink.phb.dt_node; + uint32_t ntl0_index = npu->index * NPU3_LINKS_PER_NPU; + uint64_t addr, size, win[2]; + + /* Device MMIO window (NTL/GENID regs only) */ + phys_map_get(npu->chip_id, NPU_NTL, ntl0_index, &win[0], NULL); + phys_map_get(npu->chip_id, NPU_GENID, npu->index, &addr, &size); + win[1] = addr + size - win[0]; + + dt_add_property(dn, "ibm,mmio-window", win, sizeof(win)); + dt_add_property_cells(dn, "ranges", 0x02000000, + hi32(win[0]), lo32(win[0]), + hi32(win[0]), lo32(win[0]), + hi32(win[1]), lo32(win[1])); +} + +/* NDL No-Stall Event level */ +static uint32_t npu3_dev_interrupt_level(struct npu3_dev *dev) +{ + const uint32_t level[12] = { 1, 3, 5, 7, 9, 11, + 43, 45, 47, 49, 51, 53 }; + + return level[npu3_chip_dev_index(dev)]; +} + +static void npu3_dt_add_interrupts(struct npu3 *npu) +{ + struct dt_node *dn = npu->nvlink.phb.dt_node; + uint32_t *map, icsp, i = 0; + struct npu3_dev *dev; + size_t map_size = 0; + + npu3_for_each_nvlink_dev(dev, npu) + map_size += sizeof(*map) * 7; + + if (!map_size) + return; + + icsp = get_ics_phandle(); + map = zalloc(map_size); + assert(map); + + npu3_for_each_nvlink_dev(dev, npu) { + map[i] = dev->nvlink.pvd->bdfn << 8; + map[i + 3] = 1; /* INT A */ + map[i + 4] = icsp; /* interrupt-parent */ + map[i + 5] = npu->irq_base + npu3_dev_interrupt_level(dev); + map[i + 6] = 0; /* 0 = EDGE, 1 = LEVEL */ + i += 7; + } + + dt_add_property_cells(dn, "interrupt-parent", icsp); + dt_add_property(dn, "interrupt-map", map, map_size); + dt_add_property_cells(dn, "interrupt-map-mask", 0xff00, 0x0, 0x0, 0x7); + + free(map); +} + +/* Populate PCI root device node */ +static void npu3_dt_add_props(struct npu3 *npu) +{ + struct dt_node *dn = npu->nvlink.phb.dt_node; + + dt_add_property_cells(dn, "#address-cells", 3); + 
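	/*
+	 * Standard PCI bus binding values: three address cells
+	 * (phys.hi/mid/lo), two size cells and one interrupt cell.
+	 */
+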
dt_add_property_cells(dn, "#size-cells", 2); + dt_add_property_cells(dn, "#interrupt-cells", 1); + dt_add_property_cells(dn, "bus-range", 0, 0xff); + dt_add_property_cells(dn, "clock-frequency", 0x200, 0); + + dt_add_property_strings(dn, "device_type", "pciex"); + + /* + * To the OS, npu2 and npu3 are both ibm,ioda2-npu2-phb. The added + * ibm,ioda3-npu3-phb allows for possible quirks. + */ + dt_add_property_strings(dn, "compatible", + "ibm,power9-npu-pciex", + "ibm,ioda2-npu2-phb", + "ibm,ioda2-npu3-phb"); + + dt_add_property_cells(dn, "ibm,phb-index", + npu3_get_phb_index(npu->index)); + dt_add_property_cells(dn, "ibm,phb-diag-data-size", 0); + dt_add_property_cells(dn, "ibm,opal-num-pes", NPU3_MAX_PE_NUM); + dt_add_property_cells(dn, "ibm,opal-reserved-pe", NPU3_RESERVED_PE_NUM); + dt_add_property_cells(dn, "ibm,supported-tce-sizes", + 12, /* 4K */ + 16, /* 64K */ + 24, /* 16M */ + 28); /* 256M */ + + dt_add_property_cells(dn, "ibm,chip-id", npu->chip_id); + dt_add_property_cells(dn, "ibm,npu-index", npu->index); + dt_add_property_cells(dn, "ibm,npcq", npu->dt_node->phandle); + dt_add_property_cells(dn, "ibm,xscom-base", npu->xscom_base); + dt_add_property_cells(dn, "ibm,links", NPU3_LINKS_PER_NPU); + + dt_add_property(dn, "reg", npu->regs, sizeof(npu->regs)); + + npu3_dt_add_mmio_atsd(npu); + npu3_dt_add_mmio_window(npu); + npu3_dt_add_interrupts(npu); +} + +void npu3_init_nvlink(struct npu3 *npu) +{ + struct npu3_dev *dev; + + if (!npu3_next_dev(npu, NULL, NPU3_DEV_TYPE_NVLINK)) + return; + + npu3_init_hw(npu); + npu3_create_phb(npu); + + npu3_for_each_nvlink_dev(dev, npu) + npu3_dev_create_pvd(dev); + + npu3_dt_add_props(npu); + + /* TODO: Sort out if/why we still can't enable this */ + disable_fast_reboot("NVLink device enabled"); +} + +static int64_t npu3_init_context_pid(struct npu3 *npu, uint32_t index, + uint64_t msr) +{ + uint64_t map, old_map; + + /* Unfiltered XTS mode; index is lparshort */ + map = SETFIELD(NPU3_XTS_PID_MAP_LPARSHORT, 0ull, index); + + /* Enable this mapping for both real and virtual addresses */ + map |= NPU3_XTS_PID_MAP_VALID_ATRGPA0 | NPU3_XTS_PID_MAP_VALID_ATRGPA1; + + /* Enable TLBIE/MMIOSD forwarding for this entry */ + map |= NPU3_XTS_PID_MAP_VALID_ATSD; + + /* Set the relevant MSR bits */ + if (msr & MSR_DR) + map |= NPU3_XTS_PID_MAP_MSR_DR; + + if (msr & MSR_HV) + map |= NPU3_XTS_PID_MAP_MSR_HV; + + if (msr & MSR_PR) + map |= NPU3_XTS_PID_MAP_MSR_PR; + + /* We don't support anything other than 64-bit so hardcode it here */ + map |= NPU3_XTS_PID_MAP_MSR_SF; + + old_map = npu3_read(npu, NPU3_XTS_PID_MAP(index)); + + /* Error out if this entry is already set with different msr bits */ + if (old_map && GETFIELD(NPU3_XTS_PID_MAP_MSR, old_map) != + GETFIELD(NPU3_XTS_PID_MAP_MSR, map)) { + NPU3ERR(npu, "%s: Unexpected MSR value\n", __func__); + return OPAL_PARAMETER; + } + + if (!old_map) { + NPU3DBG(npu, "XTS_PID_MAP[%03d] = 0x%08llx\n", index, map); + npu3_write(npu, NPU3_XTS_PID_MAP(index), map); + } + + npu->nvlink.ctx_ref[index]++; + + return OPAL_SUCCESS; +} + +#define NPU3_VALID_ATS_MSR_BITS (MSR_DR | MSR_HV | MSR_PR | MSR_SF) + +/* + * Allocate a context ID and initialize the tables with the relevant + * information. Returns the ID or error if one couldn't be allocated. + */ +int64_t npu3_init_context(struct phb *phb, uint64_t msr, uint64_t bdf) +{ + struct npu3 *npu = npu3_phb_to_npu(phb); + uint32_t lparshort, i; + uint64_t map; + int64_t rc; + + /* + * MSR bits should be masked by the caller to allow for future + * expansion if required. 
+ */ + if (msr & ~NPU3_VALID_ATS_MSR_BITS) + return OPAL_UNSUPPORTED; + + lock(&npu->lock); + + for (i = 0; i < NPU3_XTS_BDF_MAP_MAX; i++) { + map = npu3_read(npu, NPU3_XTS_BDF_MAP(i)); + + if (map && GETFIELD(NPU3_XTS_BDF_MAP_BDF, map) == bdf) + break; + } + + if (i == NPU3_XTS_BDF_MAP_MAX) { + NPU3ERR(npu, "LPARID not associated with any GPU\n"); + rc = OPAL_PARAMETER; + goto out; + } + + lparshort = GETFIELD(NPU3_XTS_BDF_MAP_LPARSHORT, map); + NPU3DBG(npu, "Found LPARSHORT 0x%x for bdf %02llx:%02llx.%llx\n", + lparshort, PCI_BUS_NUM(bdf), PCI_DEV(bdf), PCI_FUNC(bdf)); + + rc = npu3_init_context_pid(npu, lparshort, msr); + if (rc) + goto out; + + if (!(map & NPU3_XTS_BDF_MAP_VALID)) { + map |= NPU3_XTS_BDF_MAP_VALID; + npu3_write(npu, NPU3_XTS_BDF_MAP(i), map); + } + + rc = lparshort; + +out: + unlock(&npu->lock); + return rc; +} + +static int64_t npu3_destroy_context_pid(struct npu3 *npu, uint32_t index) +{ + if (!npu->nvlink.ctx_ref[index]) + return OPAL_PARAMETER; + + /* Only destroy when refcount hits 0 */ + if (--npu->nvlink.ctx_ref[index]) + return OPAL_PARTIAL; + + NPU3DBG(npu, "XTS_PID_MAP[%03d] = 0 (destroy)\n", index); + npu3_write(npu, NPU3_XTS_PID_MAP(index), 0ull); + + return OPAL_SUCCESS; +} + +int64_t npu3_destroy_context(struct phb *phb, uint64_t bdf) +{ + struct npu3 *npu = npu3_phb_to_npu(phb); + uint32_t lparshort, i; + int64_t map, rc; + + lock(&npu->lock); + + for (i = 0; i < NPU3_XTS_BDF_MAP_MAX; i++) { + map = npu3_read(npu, NPU3_XTS_BDF_MAP(i)); + + if (map && GETFIELD(NPU3_XTS_BDF_MAP_BDF, map) == bdf) + break; + } + + if (i == NPU3_XTS_BDF_MAP_MAX) { + NPU3ERR(npu, "LPARID not associated with any GPU\n"); + rc = OPAL_PARAMETER; + goto out; + } + + lparshort = GETFIELD(NPU3_XTS_BDF_MAP_LPARSHORT, map); + rc = npu3_destroy_context_pid(npu, lparshort); + +out: + unlock(&npu->lock); + return rc; +} + +/* Map the given virtual bdf to lparid with given lpcr */ +int64_t npu3_map_lpar(struct phb *phb, uint64_t bdf, uint64_t lparid, + uint64_t lpcr) +{ + struct npu3 *npu = npu3_phb_to_npu(phb); + struct npu3_dev *dev; + int64_t rc = OPAL_SUCCESS; + uint64_t map, val; + uint32_t i; + + /* + * The LPCR bits are only required for hash based ATS, which we don't + * currently support, but may need to in the future. 
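+	 * Only radix translation is configured here (XTS_BDF_MAP_XLAT is
+	 * hardcoded to radix further down), so any non-zero lpcr is rejected.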
+ */ + if (lpcr) + return OPAL_UNSUPPORTED; + + lock(&npu->lock); + + /* Update the entry if it already exists */ + for (i = 0; i < NPU3_XTS_BDF_MAP_MAX; i++) { + map = npu3_read(npu, NPU3_XTS_BDF_MAP(i)); + + if (map && GETFIELD(NPU3_XTS_BDF_MAP_BDF, map) == bdf) + break; + } + + if (i == NPU3_XTS_BDF_MAP_MAX) { + /* No existing mapping found, find space for a new one */ + for (i = 0; i < NPU3_XTS_BDF_MAP_MAX; i++) + if (!npu3_read(npu, NPU3_XTS_BDF_MAP(i))) + break; + } + + if (i == NPU3_XTS_BDF_MAP_MAX) { + NPU3ERR(npu, "No free XTS_BDF[] entry\n"); + rc = OPAL_RESOURCE; + goto out; + } + + map = NPU3_XTS_BDF_MAP_UNFILT; + map = SETFIELD(NPU3_XTS_BDF_MAP_BDF, map, bdf); + map = SETFIELD(NPU3_XTS_BDF_MAP_LPARID, map, lparid); + map = SETFIELD(NPU3_XTS_BDF_MAP_LPARSHORT, map, i); + + /* We only support radix at the moment */ + map = SETFIELD(NPU3_XTS_BDF_MAP_XLAT, map, 0x3); + + /* Find a link on which to send ATSDs for this device */ + npu3_for_each_nvlink_dev(dev, npu) + if (dev->nvlink.gpu->bdfn == bdf) + break; + + if (!dev || dev->nvlink.gpu->bdfn != bdf) { + NPU3ERR(npu, "Can't find a link for bdf %02llx:%02llx.%llx\n", + PCI_BUS_NUM(bdf), PCI_DEV(bdf), PCI_FUNC(bdf)); + rc = OPAL_PARAMETER; + goto out; + } + + map = SETFIELD(NPU3_XTS_BDF_MAP_BRICK, map, dev->index); + + NPU3DBG(npu, "XTS_BDF_MAP[%03d] = 0x%08llx\n", i, map); + npu3_write(npu, NPU3_XTS_BDF_MAP(i), map); + + /* We need to allocate an ATSD per link */ + val = SETFIELD(NPU3_XTS_ATSD_HYP_LPARID, 0ull, lparid); + if (!lparid) + val |= NPU3_XTS_ATSD_HYP_MSR_HV; + + npu3_write(npu, NPU3_XTS_ATSD_HYP(dev->index), val); + +out: + unlock(&npu->lock); + return rc; +} + +static int64_t npu3_relaxed_order_enable(struct npu3 *npu, uint64_t src) +{ + struct npu3_dev *dev; + uint32_t i; + + for (i = 0; i < NPU3_RELAXED_SRC_MAX; i++) + if (npu3_read(npu, NPU3_RELAXED_SRC(i)) == src) + return OPAL_SUCCESS; /* Already enabled */ + + /* Find somewhere to write this source */ + for (i = 0; i < NPU3_RELAXED_SRC_MAX; i++) + if (!npu3_read(npu, NPU3_RELAXED_SRC(i))) + break; + + if (i == NPU3_RELAXED_SRC_MAX) { + NPU3ERR(npu, "Insufficient resources to activate relaxed ordering mode\n"); + return OPAL_RESOURCE; + } + + npu3_write(npu, NPU3_RELAXED_SRC(i), src); + + npu3_for_each_nvlink_dev(dev, npu) { + uint64_t val = npu3_read(npu, NPU3_RELAXED_CFG2(dev->index)); + + val |= NPU3_RELAXED_CFG2_SRC_WRENA(i) | + NPU3_RELAXED_CFG2_SRC_RDENA(i); + npu3_write(npu, NPU3_RELAXED_CFG2(dev->index), val); + } + + return OPAL_SUCCESS; +} + +static void npu3_relaxed_order_disable(struct npu3 *npu, uint64_t src) +{ + struct npu3_dev *dev; + uint32_t i; + + for (i = 0; i < NPU3_RELAXED_SRC_MAX; i++) + if (npu3_read(npu, NPU3_RELAXED_SRC(i)) == src) + break; + + if (i == NPU3_RELAXED_SRC_MAX) + return; /* Already disabled */ + + npu3_for_each_nvlink_dev(dev, npu) { + uint64_t val = npu3_read(npu, NPU3_RELAXED_CFG2(dev->index)); + + val &= ~NPU3_RELAXED_CFG2_SRC_WRENA(i); + val &= ~NPU3_RELAXED_CFG2_SRC_RDENA(i); + npu3_write(npu, NPU3_RELAXED_CFG2(dev->index), val); + } + + npu3_write(npu, NPU3_RELAXED_SRC(i), 0ull); +} + +/* Enable or disable relaxed ordering on all nvlinks for a given PEC. */ +int64_t npu3_set_relaxed_order(struct phb *phb, uint32_t gcid, int pec, + bool enable) +{ + struct npu3 *npu = npu3_phb_to_npu(phb); + int64_t rc = OPAL_SUCCESS; + uint64_t src; + + NPU3INF(npu, "%s relaxed ordering for PEC %d on chip %d\n", + enable ? 
"Enabling" : "Disabling", + pec, gcid); + + lock(&npu->lock); + + src = SETFIELD(NPU3_RELAXED_SRC_GRPCHP, 0ull, gcid); + src = SETFIELD(NPU3_RELAXED_SRC_PEC, src, pec); + src = SETFIELD(NPU3_RELAXED_SRC_RDSTART, src, 0); + src = SETFIELD(NPU3_RELAXED_SRC_RDEND, src, 47); + src = SETFIELD(NPU3_RELAXED_SRC_WRSTART, src, 0); + src = SETFIELD(NPU3_RELAXED_SRC_WREND, src, 23); + + if (enable) + rc = npu3_relaxed_order_enable(npu, src); + else + npu3_relaxed_order_disable(npu, src); + + unlock(&npu->lock); + return rc; +} diff --git a/roms/skiboot/hw/npu3.c b/roms/skiboot/hw/npu3.c new file mode 100644 index 000000000..03461373e --- /dev/null +++ b/roms/skiboot/hw/npu3.c @@ -0,0 +1,549 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* + * Copyright 2019 IBM Corp. + */ + +#include <io.h> +#include <xscom.h> +#include <npu3.h> +#include <npu3-regs.h> +#include <nvram.h> +#include <interrupts.h> +#include <xive.h> + +#define NPU3LOG(l, npu, fmt, a...) \ + prlog(l, "NPU[%d:%d]: " fmt, (npu)->chip_id, (npu)->index, ##a) +#define NPU3DBG(npu, fmt, a...) NPU3LOG(PR_DEBUG, npu, fmt, ##a) +#define NPU3INF(npu, fmt, a...) NPU3LOG(PR_INFO, npu, fmt, ##a) +#define NPU3ERR(npu, fmt, a...) NPU3LOG(PR_ERR, npu, fmt, ##a) + +#define NPU3DEVLOG(l, dev, fmt, a...) \ + prlog(l, "NPU[%d:%d:%d]: " fmt, \ + (dev)->npu->chip_id, \ + (dev)->npu->index, \ + (dev)->index, ##a) +#define NPU3DEVDBG(dev, fmt, a...) NPU3DEVLOG(PR_DEBUG, dev, fmt, ##a) +#define NPU3DEVINF(dev, fmt, a...) NPU3DEVLOG(PR_INFO, dev, fmt, ##a) +#define NPU3DEVERR(dev, fmt, a...) NPU3DEVLOG(PR_ERR, dev, fmt, ##a) + +static void npu3_dt_create_link(struct dt_node *npu, uint32_t npu_index, + uint32_t dev_index) +{ + struct dt_node *link; + uint32_t phy_lane_mask, ob_chiplet; + + link = dt_new_addr(npu, "link", dev_index); + + dt_add_property_string(link, "compatible", "ibm,npu-link"); + dt_add_property_cells(link, "reg", dev_index); + dt_add_property_cells(link, "ibm,npu-link-index", dev_index); + + switch (npu_index) { + case 0: + /* fall through */ + case 2: + ob_chiplet = npu_index ? 
3 : 0; + + switch (dev_index) { + case 0: + phy_lane_mask = PPC_BITMASK32(0, 3); + break; + case 1: + phy_lane_mask = PPC_BITMASK32(13, 16); + break; + case 2: + phy_lane_mask = PPC_BITMASK32(7, 10); + break; + case 3: + phy_lane_mask = PPC_BITMASK32(20, 23); + break; + } + + break; + case 1: + switch (dev_index) { + case 0: + ob_chiplet = 1; + phy_lane_mask = PPC_BITMASK32(0, 3); + break; + case 1: + ob_chiplet = 2; + phy_lane_mask = PPC_BITMASK32(0, 3); + break; + case 2: + ob_chiplet = 1; + phy_lane_mask = PPC_BITMASK32(7, 10); + break; + case 3: + ob_chiplet = 2; + phy_lane_mask = PPC_BITMASK32(7, 10); + break; + } + + break; + default: + return; + } + + dt_add_property_cells(link, "ibm,npu-phy", ob_chiplet); + dt_add_property_cells(link, "ibm,npu-lane-mask", phy_lane_mask); +} + +static void npu3_dt_create_npu(struct dt_node *xscom, uint32_t npu_index) +{ + const uint32_t npu_base[] = { 0x5011000, 0x5011400, 0x3011c00 }; + struct dt_node *npu; + + npu = dt_new_addr(xscom, "npu", npu_base[npu_index]); + + dt_add_property_cells(npu, "#size-cells", 0); + dt_add_property_cells(npu, "#address-cells", 1); + dt_add_property_cells(npu, "reg", npu_base[npu_index], 0x2c); + dt_add_property_string(npu, "compatible", "ibm,power9-npu3"); + dt_add_property_cells(npu, "ibm,npu-index", npu_index); + + for (uint32_t i = 0; i < NPU3_LINKS_PER_NPU; i++) + npu3_dt_create_link(npu, npu_index, i); +} + +/* This can be removed when/if we decide to use HDAT instead */ +static bool npu3_dt_create(void) +{ + struct proc_chip *chip = next_chip(NULL); + struct dt_node *xscom; + + /* npu3 chips only */ + if (proc_gen < proc_gen_p9 || + chip->type == PROC_CHIP_P9_NIMBUS || + chip->type == PROC_CHIP_P9_CUMULUS) + return false; + + dt_for_each_compatible(dt_root, xscom, "ibm,xscom") + for (uint32_t i = 0; i < 3; i++) + npu3_dt_create_npu(xscom, i); + + return true; +} + +static struct npu3 *npu3_create(struct dt_node *dn) +{ + struct npu3 *npu; + struct dt_node *link; + struct npu3_dev *dev; + char *path; + uint32_t i; + + npu = zalloc(sizeof(*npu)); + assert(npu); + + init_lock(&npu->lock); + + npu->dt_node = dn; + npu->index = dt_prop_get_u32(dn, "ibm,npu-index"); + npu->xscom_base = dt_get_address(dn, 0, NULL); + + npu->chip_id = dt_get_chip_id(dn); + assert(get_chip(npu->chip_id)); + + dt_for_each_compatible(dn, link, "ibm,npu-link") { + i = dt_prop_get_u32(link, "ibm,npu-link-index"); + assert(i < NPU3_LINKS_PER_NPU); + + dev = &npu->devices[i]; + dev->index = i; + dev->npu = npu; + dev->dn = link; + dev->ob_chiplet = dt_prop_get_u32(link, "ibm,npu-phy"); + dev->phy_lane_mask = dt_prop_get_u32(link, "ibm,npu-lane-mask"); + dev->proc.status = NPU3_PROC_COMPLETE; + }; + + path = dt_get_path(dn); + NPU3INF(npu, "Found %s\n", path); + NPU3INF(npu, "SCOM base: 0x%llx\n", npu->xscom_base); + free(path); + + return npu; +} + +struct npu3_dev *npu3_next_dev(struct npu3 *npu, struct npu3_dev *dev, + enum npu3_dev_type type) +{ + uint32_t i = 0; + + if (dev) + i = dev->index + 1; + + for (; i < NPU3_LINKS_PER_NPU; i++) { + dev = &npu->devices[i]; + + if (dev->type == type || type == NPU3_DEV_TYPE_ANY) + return dev; + } + + return NULL; +} + +static void npu3_device_detect_fixup(struct npu3_dev *dev) +{ + struct dt_node *dn = dev->dn; + + if (dev->type == NPU3_DEV_TYPE_NVLINK) { + dt_add_property_strings(dn, "ibm,npu-link-type", "nvlink"); + dev->link_speed = dt_prop_get_u32_def( + dn, "nvidia,link-speed", 0xff); + return; + } + + NPU3DEVDBG(dev, "Link type unknown\n"); + dt_add_property_strings(dn, "ibm,npu-link-type", 
"unknown"); +} + +/* + * We use the indirect method because it uses the same addresses as + * the MMIO offsets (NPU RING) + */ +static void npu3_scom_sel(struct npu3 *npu, uint64_t reg, uint64_t size) +{ + uint64_t val; + + val = SETFIELD(NPU3_MISC_DA_ADDR, 0ull, reg); + val = SETFIELD(NPU3_MISC_DA_LEN, val, size); + xscom_write(npu->chip_id, + npu->xscom_base + NPU3_MISC_SCOM_IND_SCOM_ADDR, + val); +} + +static void npu3_scom_write(struct npu3 *npu, uint64_t reg, uint64_t size, + uint64_t val) +{ + npu3_scom_sel(npu, reg, size); + xscom_write(npu->chip_id, + npu->xscom_base + NPU3_MISC_SCOM_IND_SCOM_DATA, + val); +} + +static uint64_t npu3_scom_read(struct npu3 *npu, uint64_t reg, uint64_t size) +{ + uint64_t val; + + npu3_scom_sel(npu, reg, size); + xscom_read(npu->chip_id, + npu->xscom_base + NPU3_MISC_SCOM_IND_SCOM_DATA, + &val); + + return val; +} + +void npu3_write(struct npu3 *npu, uint64_t reg, uint64_t val) +{ + void *mmio = (void *)npu->regs[0]; + + if (mmio) + out_be64(mmio + reg, val); + else + npu3_scom_write(npu, reg, NPU3_MISC_DA_LEN_8B, val); + + /* CQ_SM writes should be mirrored in all four blocks */ + if (NPU3_REG_BLOCK(reg) != NPU3_BLOCK_CQ_SM(0)) + return; + + for (uint32_t i = 1; i < 4; i++) + npu3_write(npu, NPU3_BLOCK_CQ_SM(i) + NPU3_REG_OFFSET(reg), + val); +} + +uint64_t npu3_read(struct npu3 *npu, uint64_t reg) +{ + void *mmio = (void *)npu->regs[0]; + + if (mmio) + return in_be64(mmio + reg); + + return npu3_scom_read(npu, reg, NPU3_MISC_DA_LEN_8B); +} + +void npu3_write_4b(struct npu3 *npu, uint64_t reg, uint32_t val) +{ + void *mmio = (void *)npu->regs[0]; + + if (mmio) + out_be32(mmio + reg, val); + else + npu3_scom_write(npu, reg, NPU3_MISC_DA_LEN_4B, + (uint64_t)val << 32); + + if (NPU3_REG_BLOCK(reg) != NPU3_BLOCK_CQ_SM(0)) + return; + + for (uint32_t i = 1; i < 4; i++) + npu3_write_4b(npu, NPU3_BLOCK_CQ_SM(i) + NPU3_REG_OFFSET(reg), + val); +} + +uint32_t npu3_read_4b(struct npu3 *npu, uint64_t reg) +{ + void *mmio = (void *)npu->regs[0]; + + if (mmio) + return in_be32(mmio + reg); + + return npu3_scom_read(npu, reg, NPU3_MISC_DA_LEN_4B) >> 32; +} + +static void npu3_misc_config(struct npu3 *npu) +{ + struct npu3_dev *dev; + uint32_t typemap = 0; + uint64_t reg, val; + + npu3_for_each_nvlink_dev(dev, npu) + typemap |= 0x10 >> dev->index; + + reg = NPU3_MCP_MISC_CFG0; + val = npu3_read(npu, reg); + val |= NPU3_MCP_MISC_CFG0_ENABLE_PBUS; + val &= ~NPU3_MCP_MISC_CFG0_ENABLE_SNARF_CPM; + val = SETFIELD(NPU3_MCP_MISC_CFG0_NVLINK_MODE, val, typemap); + val = SETFIELD(NPU3_MCP_MISC_CFG0_OCAPI_MODE, val, ~typemap); + npu3_write(npu, reg, val); + + reg = NPU3_SNP_MISC_CFG0; + val = npu3_read(npu, reg); + val |= NPU3_SNP_MISC_CFG0_ENABLE_PBUS; + val = SETFIELD(NPU3_SNP_MISC_CFG0_NVLINK_MODE, val, typemap); + val = SETFIELD(NPU3_SNP_MISC_CFG0_OCAPI_MODE, val, ~typemap); + npu3_write(npu, reg, val); + + reg = NPU3_CTL_MISC_CFG2; + val = npu3_read(npu, reg); + val = SETFIELD(NPU3_CTL_MISC_CFG2_NVLINK_MODE, val, typemap); + val = SETFIELD(NPU3_CTL_MISC_CFG2_OCAPI_MODE, val, ~typemap); + npu3_write(npu, reg, val); + + reg = NPU3_DAT_MISC_CFG1; + val = npu3_read(npu, reg); + val = SETFIELD(NPU3_DAT_MISC_CFG1_NVLINK_MODE, val, typemap); + val = SETFIELD(NPU3_DAT_MISC_CFG1_OCAPI_MODE, val, ~typemap); + npu3_write(npu, reg, val); +} + +static void npu3_assign_bars(struct npu3 *npu) +{ + struct npu3_dev *dev; + uint64_t addr, size, val; + + /* Global MMIO bar (per npu) */ + phys_map_get(npu->chip_id, NPU_REGS, npu->index, &addr, &size); + val = SETFIELD(NPU3_MMIO_BAR_ADDR, 
0ull, addr >> 24); + val |= NPU3_MMIO_BAR_ENABLE; + npu3_write(npu, NPU3_MMIO_BAR, val); + + NPU3INF(npu, "MMIO base: 0x%016llx (%lldMB)\n", addr, size >> 20); + npu->regs[0] = addr; + npu->regs[1] = size; + + /* NTL bar (per device) */ + npu3_for_each_dev(dev, npu) { + phys_map_get(npu->chip_id, NPU_NTL, npu3_chip_dev_index(dev), + &addr, &size); + val = SETFIELD(NPU3_NTL_BAR_ADDR, 0ull, addr >> 16); + val = SETFIELD(NPU3_NTL_BAR_SIZE, val, ilog2(size >> 16)); + npu3_write(npu, NPU3_NTL_BAR(dev->index), val); + + dev->ntl_bar.addr = addr; + dev->ntl_bar.size = size; + } + + /* GENID bar (logically divided per device) */ + phys_map_get(npu->chip_id, NPU_GENID, npu->index, &addr, NULL); + val = SETFIELD(NPU3_GENID_BAR_ADDR, 0ull, addr >> 19); + npu3_write(npu, NPU3_GENID_BAR, val); + + npu3_for_each_dev(dev, npu) { + dev->genid_bar.addr = addr + (dev->index << 16); + dev->genid_bar.size = 64 << 10; + } +} + +void npu3_dev_enable_bars(struct npu3_dev *dev, bool enable) +{ + struct npu3 *npu = dev->npu; + uint64_t reg, val; + + if (dev->ntl_bar.enable == enable) /* No state change */ + return; + + dev->ntl_bar.enable = enable; + dev->genid_bar.enable = enable; + + reg = NPU3_NTL_BAR(dev->index); + val = npu3_read(npu, reg); + val = SETFIELD(NPU3_NTL_BAR_ENABLE, val, enable); + npu3_write(npu, reg, val); + + /* + * Generation IDs are a single space in the hardware but we split them + * per device. Only disable in hardware if every device has disabled. + */ + if (!enable) + npu3_for_each_dev(dev, npu) + if (dev->genid_bar.enable) + return; + + reg = NPU3_GENID_BAR; + val = npu3_read(npu, reg); + val = SETFIELD(NPU3_GENID_BAR_ENABLE, val, enable); + npu3_write(npu, reg, val); +} + +static uint64_t npu3_ipi_attributes(struct irq_source *is, uint32_t isn) +{ + struct npu3 *npu = is->data; + uint32_t level = isn - npu->irq_base; + + /* TCE interrupt is used to detect a frozen PE */ + if (level == 18) + return IRQ_ATTR_TARGET_OPAL | + IRQ_ATTR_TARGET_RARE | + IRQ_ATTR_TYPE_MSI; + + return IRQ_ATTR_TARGET_LINUX; +} + +static void npu3_ipi_interrupt(struct irq_source *is, uint32_t isn) +{ + struct npu3 *npu = is->data; + uint32_t level = isn - npu->irq_base; + + if (level != 18) { + NPU3ERR(npu, "Received unknown interrupt %d\n", level); + return; + } + + opal_update_pending_evt(OPAL_EVENT_PCI_ERROR, OPAL_EVENT_PCI_ERROR); +} + +#define NPU3_IRQ_LEVELS 60 + +static char *npu3_ipi_name(struct irq_source *is, uint32_t isn) +{ + struct npu3 *npu = is->data; + uint32_t level = isn - npu->irq_base; + static const char *names[NPU3_IRQ_LEVELS] = { + [0] = "NDL 0 Stall Event (brick 0)", + [1] = "NDL 0 No-Stall Event (brick 0)", + [2] = "NDL 1 Stall Event (brick 1)", + [3] = "NDL 1 No-Stall Event (brick 1)", + [4] = "NDL 2 Stall Event (brick 2)", + [5] = "NDL 2 No-Stall Event (brick 2)", + [6] = "NDL 3 Stall Event (brick 3)", + [7] = "NDL 3 No-Stall Event (brick 3)", + [8] = "NDL 4 Stall Event (brick 4)", + [9] = "NDL 4 No-Stall Event (brick 4)", + [10] = "NDL 5 Stall Event (brick 5)", + [11] = "NDL 5 No-Stall Event (brick 5)", + [12] = "NTL 0 Event", + [13] = "NTL 1 Event", + [14] = "NTL 2 Event", + [15] = "NTL 3 Event", + [16] = "NTL 4 Event", + [17] = "NTL 5 Event", + [18] = "TCE Event", + [19] = "ATS Event", + [20] = "CQ Event", + [21] = "MISC Event", + [41] = "Memory Controller Event", + [42] = "NDL 6 Stall Event (brick 6)", + [43] = "NDL 6 No-Stall Event (brick 6)", + [44] = "NDL 7 Stall Event (brick 7)", + [45] = "NDL 7 No-Stall Event (brick 7)", + [46] = "NDL 8 Stall Event (brick 8)", + [47] = "NDL 8 
No-Stall Event (brick 8)", + [48] = "NDL 9 Stall Event (brick 9)", + [49] = "NDL 9 No-Stall Event (brick 9)", + [50] = "NDL 10 Stall Event (brick 10)", + [51] = "NDL 10 No-Stall Event (brick 10)", + [52] = "NDL 11 Stall Event (brick 11)", + [53] = "NDL 11 No-Stall Event (brick 11)", + [54] = "NTL 6 Event", + [55] = "NTL 7 Event", + [56] = "NTL 8 Event", + [57] = "NTL 9 Event", + [58] = "NTL 10 Event", + [59] = "NTL 11 Event", + }; + + if (level >= NPU3_IRQ_LEVELS || !names[level]) + return strdup("Unknown"); + + return strdup(names[level]); +} + +static const struct irq_source_ops npu3_ipi_ops = { + .attributes = npu3_ipi_attributes, + .interrupt = npu3_ipi_interrupt, + .name = npu3_ipi_name, +}; + +static void npu3_setup_irqs(struct npu3 *npu) +{ + uint64_t reg, val; + uint32_t base; + + base = xive_alloc_ipi_irqs(npu->chip_id, NPU3_IRQ_LEVELS, 64); + if (base == XIVE_IRQ_ERROR) { + NPU3ERR(npu, "Failed to allocate interrupt sources\n"); + return; + } + + xive_register_ipi_source(base, NPU3_IRQ_LEVELS, npu, &npu3_ipi_ops); + + /* Set IPI configuration */ + reg = NPU3_MISC_CFG; + val = npu3_read(npu, reg); + val = SETFIELD(NPU3_MISC_CFG_IPI_PS, val, NPU3_MISC_CFG_IPI_PS_64K); + val = SETFIELD(NPU3_MISC_CFG_IPI_OS, val, NPU3_MISC_CFG_IPI_OS_AIX); + npu3_write(npu, reg, val); + + /* Set IRQ base */ + reg = NPU3_MISC_INT_BAR; + val = SETFIELD(NPU3_MISC_INT_BAR_ADDR, 0ull, + (uint64_t)xive_get_trigger_port(base) >> 12); + npu3_write(npu, reg, val); + + npu->irq_base = base; +} + +static void npu3_init(struct npu3 *npu) +{ + struct npu3_dev *dev; + + platform.npu3_device_detect(npu); + npu3_for_each_dev(dev, npu) + npu3_device_detect_fixup(dev); + + npu3_misc_config(npu); + npu3_assign_bars(npu); + npu3_setup_irqs(npu); + npu3_init_nvlink(npu); +} + +void probe_npu3(void) +{ + struct dt_node *dn; + struct npu3 *npu; + + if (!npu3_dt_create()) + return; + + if (!platform.npu3_device_detect) { + prlog(PR_INFO, "NPU: Platform does not support NPU\n"); + return; + } + + dt_for_each_compatible(dt_root, dn, "ibm,power9-npu3") { + npu = npu3_create(dn); + npu3_init(npu); + } +} diff --git a/roms/skiboot/hw/nx-842.c b/roms/skiboot/hw/nx-842.c new file mode 100644 index 000000000..0cb87dcc8 --- /dev/null +++ b/roms/skiboot/hw/nx-842.c @@ -0,0 +1,231 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* + * NX unit 842 compression accellerator + * + * Copyright 2015-2019 IBM Corp. 
+ */ + +#include <skiboot.h> +#include <chip.h> +#include <xscom.h> +#include <io.h> +#include <cpu.h> +#include <nx.h> +#include <vas.h> + +/* Configuration settings */ +#define CFG_842_FC_ENABLE (0x1f) /* enable all 842 functions */ +#define CFG_842_ENABLE (1) /* enable 842 engines */ +#define DMA_CSB_WR NX_DMA_CSB_WR_CI +#define DMA_COMPLETION_MODE NX_DMA_COMPLETION_MODE_CI +#define DMA_CPB_WR NX_DMA_CPB_WR_CI_PAD +#define DMA_OUTPUT_DATA_WR NX_DMA_OUTPUT_DATA_WR_CI +#define EE_1 (1) /* enable engine 842 1 */ +#define EE_0 (1) /* enable engine 842 0 */ + +static int nx_cfg_842(u32 gcid, u64 xcfg) +{ + u64 cfg, ci, ct; + int rc, instance = gcid + 1; + + BUILD_ASSERT(MAX_CHIPS < NX_842_CFG_CI_MAX); + + rc = xscom_read(gcid, xcfg, &cfg); + if (rc) { + prerror("NX%d: ERROR: XSCOM 842 config read failure %d\n", + gcid, rc); + return rc; + } + + ct = GETFIELD(NX_842_CFG_CT, cfg); + if (!ct) + prlog(PR_INFO, "NX%d: 842 CT set to %u\n", gcid, NX_CT_842); + else if (ct == NX_CT_842) + prlog(PR_INFO, "NX%d: 842 CT already set to %u\n", + gcid, NX_CT_842); + else + prlog(PR_INFO, "NX%d: 842 CT already set to %u, " + "changing to %u\n", gcid, (unsigned int)ct, NX_CT_842); + ct = NX_CT_842; + cfg = SETFIELD(NX_842_CFG_CT, cfg, ct); + + /* Coprocessor Instance must be shifted left. + * See hw doc Section 5.5.1. + */ + ci = GETFIELD(NX_842_CFG_CI, cfg) >> NX_842_CFG_CI_LSHIFT; + if (!ci) + prlog(PR_INFO, "NX%d: 842 CI set to %d\n", gcid, instance); + else if (ci == instance) + prlog(PR_INFO, "NX%d: 842 CI already set to %u\n", gcid, + (unsigned int)ci); + else + prlog(PR_INFO, "NX%d: 842 CI already set to %u, " + "changing to %d\n", gcid, (unsigned int)ci, instance); + ci = instance; + cfg = SETFIELD(NX_842_CFG_CI, cfg, ci << NX_842_CFG_CI_LSHIFT); + + /* Enable all functions */ + cfg = SETFIELD(NX_842_CFG_FC_ENABLE, cfg, CFG_842_FC_ENABLE); + + cfg = SETFIELD(NX_842_CFG_ENABLE, cfg, CFG_842_ENABLE); + + rc = xscom_write(gcid, xcfg, cfg); + if (rc) + prerror("NX%d: ERROR: 842 CT %u CI %u config failure %d\n", + gcid, (unsigned int)ct, (unsigned int)ci, rc); + else + prlog(PR_DEBUG, "NX%d: 842 Config 0x%016lx\n", + gcid, (unsigned long)cfg); + + return rc; +} + +static int nx_cfg_842_umac(struct dt_node *node, u32 gcid, u32 pb_base) +{ + int rc; + u64 umac_bar, umac_notify; + struct dt_node *nx_node; + static u32 nx842_tid = 1; /* tid counter within coprocessor type */ + + nx_node = dt_new(node, "ibm,842-high-fifo"); + umac_bar = pb_base + NX_P9_842_HIGH_PRI_RX_FIFO_BAR; + umac_notify = pb_base + NX_P9_842_HIGH_PRI_RX_FIFO_NOTIFY_MATCH; + rc = nx_cfg_rx_fifo(nx_node, "ibm,p9-nx-842", "High", gcid, + NX_CT_842, nx842_tid++, umac_bar, + umac_notify); + if (rc) + return rc; + + nx_node = dt_new(node, "ibm,842-normal-fifo"); + umac_bar = pb_base + NX_P9_842_NORMAL_PRI_RX_FIFO_BAR; + umac_notify = pb_base + NX_P9_842_NORMAL_PRI_RX_FIFO_NOTIFY_MATCH; + rc = nx_cfg_rx_fifo(nx_node, "ibm,p9-nx-842", "Normal", gcid, + NX_CT_842, nx842_tid++, umac_bar, + umac_notify); + + return rc; +} + +static int nx_cfg_842_dma(u32 gcid, u64 xcfg) +{ + u64 cfg; + int rc; + + rc = xscom_read(gcid, xcfg, &cfg); + if (rc) { + prerror("NX%d: ERROR: XSCOM DMA config read failure %d\n", + gcid, rc); + return rc; + } + + cfg = SETFIELD(NX_DMA_CFG_842_COMPRESS_PREFETCH, cfg, + DMA_COMPRESS_PREFETCH); + cfg = SETFIELD(NX_DMA_CFG_842_DECOMPRESS_PREFETCH, cfg, + DMA_DECOMPRESS_PREFETCH); + cfg = SETFIELD(NX_DMA_CFG_842_COMPRESS_MAX_RR, cfg, + DMA_COMPRESS_MAX_RR); + cfg = SETFIELD(NX_DMA_CFG_842_DECOMPRESS_MAX_RR, cfg, + 
DMA_DECOMPRESS_MAX_RR); + cfg = SETFIELD(NX_DMA_CFG_842_SPBC, cfg, + DMA_SPBC); + if (proc_gen < proc_gen_p9) { + cfg = SETFIELD(NX_DMA_CFG_842_CSB_WR, cfg, + DMA_CSB_WR); + cfg = SETFIELD(NX_DMA_CFG_842_COMPLETION_MODE, cfg, + DMA_COMPLETION_MODE); + cfg = SETFIELD(NX_DMA_CFG_842_CPB_WR, cfg, + DMA_CPB_WR); + cfg = SETFIELD(NX_DMA_CFG_842_OUTPUT_DATA_WR, cfg, + DMA_OUTPUT_DATA_WR); + } + + rc = xscom_write(gcid, xcfg, cfg); + if (rc) + prerror("NX%d: ERROR: DMA config failure %d\n", gcid, rc); + else + prlog(PR_DEBUG, "NX%d: DMA 0x%016lx\n", gcid, + (unsigned long)cfg); + + return rc; +} + +static int nx_cfg_842_ee(u32 gcid, u64 xcfg) +{ + u64 cfg; + int rc; + + rc = xscom_read(gcid, xcfg, &cfg); + if (rc) { + prerror("NX%d: ERROR: XSCOM EE config read failure %d\n", + gcid, rc); + return rc; + } + + cfg = SETFIELD(NX_EE_CFG_CH1, cfg, EE_1); + cfg = SETFIELD(NX_EE_CFG_CH0, cfg, EE_0); + + rc = xscom_write(gcid, xcfg, cfg); + if (rc) + prerror("NX%d: ERROR: Engine Enable failure %d\n", gcid, rc); + else + prlog(PR_DEBUG, "NX%d: Engine Enable 0x%016lx\n", + gcid, (unsigned long)cfg); + + return rc; +} + +void nx_enable_842(struct dt_node *node, u32 gcid, u32 pb_base) +{ + u64 cfg_dma, cfg_842, cfg_ee; + int rc; + + if (dt_node_is_compatible(node, "ibm,power8-nx")) { + cfg_dma = pb_base + NX_P8_DMA_CFG; + cfg_842 = pb_base + NX_P8_842_CFG; + cfg_ee = pb_base + NX_P8_EE_CFG; + } else { + prerror("NX%d: ERROR: Unknown NX type!\n", gcid); + return; + } + + rc = nx_cfg_842_dma(gcid, cfg_dma); + if (rc) + return; + + rc = nx_cfg_842(gcid, cfg_842); + if (rc) + return; + + rc = nx_cfg_842_ee(gcid, cfg_ee); + if (rc) + return; + + prlog(PR_INFO, "NX%d: 842 Coprocessor Enabled\n", gcid); + + dt_add_property_cells(node, "ibm,842-coprocessor-type", NX_CT_842); + dt_add_property_cells(node, "ibm,842-coprocessor-instance", gcid + 1); +} + +void p9_nx_enable_842(struct dt_node *node, u32 gcid, u32 pb_base) +{ + u64 cfg_dma, cfg_ee; + int rc; + + cfg_dma = pb_base + NX_P9_DMA_CFG; + cfg_ee = pb_base + NX_P9_EE_CFG; + + rc = nx_cfg_842_dma(gcid, cfg_dma); + if (rc) + return; + + rc = nx_cfg_842_umac(node, gcid, pb_base); + if (rc) + return; + + rc = nx_cfg_842_ee(gcid, cfg_ee); + if (rc) + return; + + prlog(PR_INFO, "NX%d: 842 Coprocessor Enabled\n", gcid); + +} diff --git a/roms/skiboot/hw/nx-compress.c b/roms/skiboot/hw/nx-compress.c new file mode 100644 index 000000000..9b3c6717d --- /dev/null +++ b/roms/skiboot/hw/nx-compress.c @@ -0,0 +1,340 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* + * NX has 842 and GZIP (P9) accellerators + * + * Copyright 2015-2018 IBM Corp. + */ + +#include <skiboot.h> +#include <chip.h> +#include <xscom.h> +#include <io.h> +#include <cpu.h> +#include <nx.h> +#include <vas.h> +#include <opal.h> + +static int nx_cfg_umac_tx_wc(u32 gcid, u64 xcfg) +{ + int rc = 0; + u64 cfg; + + cfg = vas_get_wcbs_bar(gcid); + if (!cfg) { + prerror("NX%d: ERROR finding WC Backing store BAR\n", gcid); + return -ENOMEM; + } + + /* + * NOTE: Write the entire bar address to SCOM. VAS/NX will extract + * the relevant (NX_P9_UMAC_TX_WINDOW_CONTEXT_ADDR) bits. 
+ * IOW, _don't_ just write the bit field like: + * + * cfg = SETFIELD(NX_P9_UMAC_TX_WINDOW_CONTEXT_ADDR, 0ULL, cfg); + */ + rc = xscom_write(gcid, xcfg, cfg); + + if (rc) + prerror("NX%d: ERROR: UMAC SEND WC BAR, %d\n", gcid, rc); + else + prlog(PR_DEBUG, "NX%d: UMAC SEND WC BAR, 0x%016lx, " + "xcfg 0x%llx\n", + gcid, (unsigned long)cfg, xcfg); + + return rc; +} + +static int nx_cfg_dma_vas_mmio(u32 gcid, u64 xcfg) +{ + int rc = 0; + u64 cfg; + + cfg = vas_get_hvwc_mmio_bar(gcid); + /* + * NOTE: Write the entire bar address to SCOM. VAS/NX will extract + * the relevant (NX_P9_UMAC_VAS_MMIO_ADDR) bits. IOW, _don't_ + * just write the bit field like: + * + * cfg = SETFIELD(NX_P9_DMA_VAS_MMIO_ADDR, 0ULL, cfg); + */ + rc = xscom_write(gcid, xcfg, cfg); + + if (rc) + prerror("NX%d: ERROR: DMA VAS MMIO BAR, %d\n", gcid, rc); + else + prlog(PR_DEBUG, "NX%d: DMA VAS MMIO BAR, 0x%016lx, xcfg 0x%llx\n", + gcid, (unsigned long)cfg, xcfg); + + return rc; +} + +static int nx_cfg_umac_vas_mmio(u32 gcid, u64 xcfg) +{ + int rc = 0; + u64 cfg; + + cfg = vas_get_hvwc_mmio_bar(gcid); + /* + * NOTE: Write the entire bar address to SCOM. VAS/NX will extract + * the relevant (NX_P9_UMAC_VAS_MMIO_ADDR) bits. IOW, _don't_ + * just write the bit field like: + * + * cfg = SETFIELD(NX_P9_UMAC_VAS_MMIO_ADDR, 0ULL, cfg); + */ + rc = xscom_write(gcid, xcfg, cfg); + + if (rc) + prerror("NX%d: ERROR: UMAC VAS MMIO BAR, %d\n", gcid, rc); + else + prlog(PR_DEBUG, "NX%d: UMAC VAS MMIO BAR, 0x%016lx, " + "xcfg 0x%llx\n", + gcid, (unsigned long)cfg, xcfg); + + return rc; +} + +static int nx_cfg_umac_status_ctrl(u32 gcid, u64 xcfg) +{ + u64 uctrl; + int rc; +#define CRB_ENABLE 1 + + rc = xscom_read(gcid, xcfg, &uctrl); + if (rc) + return rc; + + uctrl = SETFIELD(NX_P9_UMAC_STATUS_CTRL_CRB_ENABLE, uctrl, CRB_ENABLE); + rc = xscom_write(gcid, xcfg, uctrl); + if (rc) + prerror("NX%d: ERROR: Setting UMAC Status Control failure %d\n", + gcid, rc); + else + prlog(PR_DEBUG, "NX%d: Setting UMAC Status Control 0x%016lx\n", + gcid, (unsigned long)uctrl); + + return rc; +} + +static int nx_cfg_vas_rma_bar(u32 gcid, u64 xcfg) +{ + int rc = 0; + u64 cfg; + + cfg = vas_get_rma_bar(gcid); + /* + * NOTE: Write the entire bar address to SCOM. VAS/NX will extract + * the relevant (NX_P10_VAS_RMA_WRITE_BAR) bits. IOW, _don't_ + * just write the bit field like: + * cfg = SETFIELD(NX_P10_VAS_RMA_WRITE_BAR, 0ULL, cfg); + */ + rc = xscom_write(gcid, xcfg, cfg); + + if (rc) + prerror("NX%d: ERROR: VAS RMA WRITE BAR, %d\n", gcid, rc); + else + prlog(PR_DEBUG, "NX%d: VAS RMA WRITE BAR, 0x%016lx, " + "xcfg 0x%llx\n", gcid, (unsigned long)cfg, + xcfg); + + return rc; +} + +int nx_cfg_rx_fifo(struct dt_node *node, const char *compat, + const char *priority, u32 gcid, u32 pid, u32 tid, + u64 umac_bar, u64 umac_notify) +{ + u64 cfg; + int rc, size; + uint64_t fifo; + u32 lpid = 0xfff; /* All 1's for 12 bits in UMAC notify match reg */ +#define MATCH_ENABLE 1 + + fifo = (uint64_t) local_alloc(gcid, RX_FIFO_SIZE, RX_FIFO_SIZE); + assert(fifo); + + /* + * When configuring the address of the Rx FIFO into the Receive FIFO + * BAR, we should _NOT_ shift the address into bits 8:53. Instead we + * should copy the address as is and VAS/NX will extract relevant bits. 
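+	 * (The FIFO was allocated above with local_alloc(), aligned to its
+	 * own size, so its low address bits do not collide with the size
+	 * field set below.)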
+ */ + /* + * Section 5.21 of P9 NX Workbook Version 2.42 shows Receive FIFO BAR + * 54:56 represents FIFO size + * 000 = 1KB, 8 CRBs + * 001 = 2KB, 16 CRBs + * 010 = 4KB, 32 CRBs + * 011 = 8KB, 64 CRBs + * 100 = 16KB, 128 CRBs + * 101 = 32KB, 256 CRBs + * 110 = 111 reserved + */ + size = RX_FIFO_SIZE / 1024; + cfg = SETFIELD(NX_P9_RX_FIFO_BAR_SIZE, fifo, ilog2(size)); + + rc = xscom_write(gcid, umac_bar, cfg); + if (rc) { + prerror("NX%d: ERROR: Setting UMAC FIFO bar failure %d\n", + gcid, rc); + return rc; + } else + prlog(PR_DEBUG, "NX%d: Setting UMAC FIFO bar 0x%016lx\n", + gcid, (unsigned long)cfg); + + rc = xscom_read(gcid, umac_notify, &cfg); + if (rc) + return rc; + + /* + * VAS issues asb_notify with the unique ID to identify the target + * co-processor/engine. Logical partition ID (lpid), process ID (pid), + * and thread ID (tid) combination is used to define the unique ID + * in the system. Export these values in device-tree such that the + * driver configure RxFIFO with VAS. Set these values in RxFIFO notify + * match register for each engine which compares the ID with each + * request. + * To define unique indentification, 0xfff (1's for 12 bits), + * co-processor type, and counter within coprocessor type are used + * for lpid, pid, and tid respectively. + */ + cfg = SETFIELD(NX_P9_RX_FIFO_NOTIFY_MATCH_LPID, cfg, lpid); + cfg = SETFIELD(NX_P9_RX_FIFO_NOTIFY_MATCH_PID, cfg, pid); + cfg = SETFIELD(NX_P9_RX_FIFO_NOTIFY_MATCH_TID, cfg, tid); + cfg = SETFIELD(NX_P9_RX_FIFO_NOTIFY_MATCH_MATCH_ENABLE, cfg, + MATCH_ENABLE); + + rc = xscom_write(gcid, umac_notify, cfg); + if (rc) { + prerror("NX%d: ERROR: Setting UMAC notify match failure %d\n", + gcid, rc); + return rc; + } else + prlog(PR_DEBUG, "NX%d: Setting UMAC notify match 0x%016lx\n", + gcid, (unsigned long)cfg); + + dt_add_property_string(node, "compatible", compat); + dt_add_property_string(node, "priority", priority); + dt_add_property_u64(node, "rx-fifo-address", fifo); + dt_add_property_cells(node, "rx-fifo-size", RX_FIFO_SIZE); + dt_add_property_cells(node, "lpid", lpid); + dt_add_property_cells(node, "pid", pid); + dt_add_property_cells(node, "tid", tid); + + return 0; +} + +static int nx_init_fifo_ctrl(u32 gcid, u64 fifo_ctrl) +{ + u64 cfg; + int rc = 0; + + rc = xscom_read(gcid, fifo_ctrl, &cfg); + if (rc) + return rc; + + cfg = SETFIELD(NX_P9_RX_FIFO_CTRL_READ_OFFSET, cfg, 0); + cfg = SETFIELD(NX_P9_RX_FIFO_CTRL_QUEUED, cfg, 0); + + rc = xscom_write(gcid, fifo_ctrl, cfg); + + return rc; +} + + +static int opal_nx_coproc_init(u32 gcid, u32 ct) +{ + struct proc_chip *chip; + u64 fifo, fifo_hi; + u32 nx_base; + int rc; + + if (proc_gen < proc_gen_p9) + return OPAL_UNSUPPORTED; + + chip = get_chip(gcid); + if (!chip) + return OPAL_PARAMETER; + + nx_base = chip->nx_base; + if (!nx_base) + return OPAL_PARAMETER; + + switch (ct) { + case NX_CT_842: + fifo_hi = nx_base + NX_P9_842_HIGH_PRI_RX_FIFO_CTRL; + fifo = nx_base + NX_P9_842_NORMAL_PRI_RX_FIFO_CTRL; + break; + case NX_CT_GZIP: + fifo_hi = nx_base + NX_P9_GZIP_HIGH_PRI_RX_FIFO_CTRL; + fifo = nx_base + NX_P9_GZIP_NORMAL_PRI_RX_FIFO_CTRL; + break; + default: + prlog(PR_EMERG, "OPAL: Unknown NX coprocessor type\n"); + return OPAL_PARAMETER; + } + + rc = nx_init_fifo_ctrl(gcid, fifo_hi); + + if (!rc) + rc = nx_init_fifo_ctrl(gcid, fifo); + + return rc; +} + +opal_call(OPAL_NX_COPROC_INIT, opal_nx_coproc_init, 2); + +void nx_create_compress_node(struct dt_node *node) +{ + u32 gcid, pb_base; + struct proc_chip *chip; + int rc; + + gcid = dt_get_chip_id(node); + pb_base = 
dt_get_address(node, 0, NULL); + + chip = get_chip(gcid); + chip->nx_base = pb_base; + + prlog(PR_INFO, "NX%d: 842 at 0x%x\n", gcid, pb_base); + + /* + * ibm,power9-nx is compatible on P10. So using same + * compatible string. + */ + if (dt_node_is_compatible(node, "ibm,power9-nx")) { + u64 cfg_mmio, cfg_txwc, cfg_uctrl, cfg_dma; + + prlog(PR_DEBUG, "Found ibm,power9-nx\n"); + cfg_mmio = pb_base + NX_P9_UMAC_VAS_MMIO_BAR; + cfg_dma = pb_base + NX_P9_DMA_VAS_MMIO_BAR; + cfg_txwc = pb_base + NX_P9_UMAC_TX_WINDOW_CONTEXT_BAR; + cfg_uctrl = pb_base + NX_P9_UMAC_STATUS_CTRL; + + rc = nx_cfg_umac_vas_mmio(gcid, cfg_mmio); + if (rc) + return; + + rc = nx_cfg_dma_vas_mmio(gcid, cfg_dma); + if (rc) + return; + + rc = nx_cfg_umac_tx_wc(gcid, cfg_txwc); + if (rc) + return; + + rc = nx_cfg_umac_status_ctrl(gcid, cfg_uctrl); + if (rc) + return; + + if (proc_gen > proc_gen_p9) { + u64 cfg_rma = pb_base + NX_P10_VAS_RMA_WRITE_BAR; + + rc = nx_cfg_vas_rma_bar(gcid, cfg_rma); + if (rc) + return; + } + + p9_nx_enable_842(node, gcid, pb_base); + p9_nx_enable_gzip(node, gcid, pb_base); + } else + nx_enable_842(node, gcid, pb_base); +} diff --git a/roms/skiboot/hw/nx-crypto.c b/roms/skiboot/hw/nx-crypto.c new file mode 100644 index 000000000..8b8ff5ee5 --- /dev/null +++ b/roms/skiboot/hw/nx-crypto.c @@ -0,0 +1,298 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* + * NX Cryptographic accellerators + * + * Copyright 2015-2019 IBM Corp. + */ + +#include <skiboot.h> +#include <chip.h> +#include <xscom.h> +#include <io.h> +#include <cpu.h> +#include <nx.h> + +/* Configuration settings */ +#define CFG_SYM_FC_ENABLE (0) /* disable all sym functions */ +#define CFG_SYM_ENABLE (0) /* disable sym engines */ +#define CFG_ASYM_FC_ENABLE (0) /* disable all asym functions */ +#define CFG_ASYM_ENABLE (0) /* disable asym engines */ +#define CFG_CRB_IQ_SYM (0) /* don't use any extra input queues */ +#define CFG_CRB_IQ_ASYM (0) /* don't use any extra input queues */ +#define AES_SHA_MAX_RR (1) /* valid range: 1-8 */ +#define AES_SHA_CSB_WR NX_DMA_CSB_WR_PDMA +#define AES_SHA_COMPLETION_MODE NX_DMA_COMPLETION_MODE_PDMA +#define AES_SHA_CPB_WR NX_DMA_CPB_WR_DMA_NOPAD +#define AES_SHA_OUTPUT_DATA_WR NX_DMA_OUTPUT_DATA_WR_DMA +#define AMF_MAX_RR (1) /* valid range: 1-8 */ +#define AMF_CSB_WR NX_DMA_CSB_WR_PDMA +#define AMF_COMPLETION_MODE NX_DMA_COMPLETION_MODE_PDMA +#define AMF_CPB_WR (0) /* CPB WR not done with AMF */ +#define AMF_OUTPUT_DATA_WR NX_DMA_OUTPUT_DATA_WR_DMA +#define EE_CH7 (0) /* disable engine AMF 3(P8) */ +#define EE_CH6 (0) /* disable engine AMF 2(P8) */ +#define EE_CH5 (0) /* disable engine AMF 1(P8) */ +#define EE_CH4 (0) /* disable engine SYM AMF 0(P8) */ +#define EE_CH3 (0) /* disable engine SYM 1 */ +#define EE_CH2 (0) /* disable engine SYM 0 */ + +static int nx_cfg_sym(u32 gcid, u64 xcfg) +{ + u64 cfg, ci, ct; + int rc, instance = gcid + 1; + + BUILD_ASSERT(MAX_CHIPS < NX_SYM_CFG_CI_MAX); + + rc = xscom_read(gcid, xcfg, &cfg); + if (rc) { + prerror("NX%d: ERROR: XSCOM SYM config read failure %d\n", + gcid, rc); + return rc; + } + + ct = GETFIELD(NX_SYM_CFG_CT, cfg); + if (!ct) + prlog(PR_INFO, "NX%d: SYM CT set to %u\n", gcid, NX_CT_SYM); + else if (ct == NX_CT_SYM) + prlog(PR_INFO, "NX%d: SYM CT already set to %u\n", + gcid, NX_CT_SYM); + else + prlog(PR_INFO, "NX%d: SYM CT already set to %u, " + "changing to %u\n", gcid, (unsigned int)ct, NX_CT_SYM); + ct = NX_CT_SYM; + cfg = SETFIELD(NX_SYM_CFG_CT, cfg, ct); + + /* Coprocessor Instance must be shifted left. + * See hw doc Section 5.5.1. 
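+	 * The field is read back shifted right by NX_SYM_CFG_CI_LSHIFT and
+	 * written back shifted left by the same amount below.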
+ */ + ci = GETFIELD(NX_SYM_CFG_CI, cfg) >> NX_SYM_CFG_CI_LSHIFT; + if (!ci) + prlog(PR_INFO, "NX%d: SYM CI set to %d\n", gcid, instance); + else if (ci == instance) + prlog(PR_INFO, "NX%d: SYM CI already set to %u\n", gcid, + (unsigned int)ci); + else + prlog(PR_INFO, "NX%d: SYM CI already set to %u, " + "changing to %d\n", gcid, (unsigned int)ci, instance); + ci = instance; + cfg = SETFIELD(NX_SYM_CFG_CI, cfg, ci << NX_SYM_CFG_CI_LSHIFT); + + cfg = SETFIELD(NX_SYM_CFG_FC_ENABLE, cfg, CFG_SYM_FC_ENABLE); + + cfg = SETFIELD(NX_SYM_CFG_ENABLE, cfg, CFG_SYM_ENABLE); + + rc = xscom_write(gcid, xcfg, cfg); + if (rc) + prerror("NX%d: ERROR: SYM CT %u CI %u config failure %d\n", + gcid, (unsigned int)ct, (unsigned int)ci, rc); + else + prlog(PR_DEBUG, "NX%d: SYM Config 0x%016lx\n", + gcid, (unsigned long)cfg); + + return rc; +} + +static int nx_cfg_asym(u32 gcid, u64 xcfg) +{ + u64 cfg, ci, ct; + int rc, instance = gcid + 1; + + BUILD_ASSERT(MAX_CHIPS < NX_ASYM_CFG_CI_MAX); + + rc = xscom_read(gcid, xcfg, &cfg); + if (rc) { + prerror("NX%d: ERROR: XSCOM ASYM config read failure %d\n", + gcid, rc); + return rc; + } + + ct = GETFIELD(NX_ASYM_CFG_CT, cfg); + if (!ct) + prlog(PR_INFO, "NX%d: ASYM CT set to %u\n", + gcid, NX_CT_ASYM); + else if (ct == NX_CT_ASYM) + prlog(PR_INFO, "NX%d: ASYM CT already set to %u\n", + gcid, NX_CT_ASYM); + else + prlog(PR_INFO, "NX%d: ASYM CT already set to %u, " + "changing to %u\n", gcid, (unsigned int)ct, NX_CT_ASYM); + ct = NX_CT_ASYM; + cfg = SETFIELD(NX_ASYM_CFG_CT, cfg, ct); + + /* Coprocessor Instance must be shifted left. + * See hw doc Section 5.5.1. + */ + ci = GETFIELD(NX_ASYM_CFG_CI, cfg) >> NX_ASYM_CFG_CI_LSHIFT; + if (!ci) + prlog(PR_INFO, "NX%d: ASYM CI set to %d\n", gcid, instance); + else if (ci == instance) + prlog(PR_INFO, "NX%d: ASYM CI already set to %u\n", gcid, + (unsigned int)ci); + else + prlog(PR_INFO, "NX%d: ASYM CI already set to %u, " + "changing to %d\n", gcid, (unsigned int)ci, instance); + ci = instance; + cfg = SETFIELD(NX_ASYM_CFG_CI, cfg, ci << NX_ASYM_CFG_CI_LSHIFT); + + cfg = SETFIELD(NX_ASYM_CFG_FC_ENABLE, cfg, CFG_ASYM_FC_ENABLE); + + cfg = SETFIELD(NX_ASYM_CFG_ENABLE, cfg, CFG_ASYM_ENABLE); + + rc = xscom_write(gcid, xcfg, cfg); + if (rc) + prerror("NX%d: ERROR: ASYM CT %u CI %u config failure %d\n", + gcid, (unsigned int)ct, (unsigned int)ci, rc); + else + prlog(PR_DEBUG, "NX%d: ASYM Config 0x%016lx\n", + gcid, (unsigned long)cfg); + + return rc; +} + +static int nx_cfg_dma(u32 gcid, u64 xcfg) +{ + u64 cfg; + int rc; + + rc = xscom_read(gcid, xcfg, &cfg); + if (rc) { + prerror("NX%d: ERROR: XSCOM DMA config read failure %d\n", + gcid, rc); + return rc; + } + + cfg = SETFIELD(NX_DMA_CFG_AES_SHA_MAX_RR, cfg, + AES_SHA_MAX_RR); + cfg = SETFIELD(NX_DMA_CFG_AES_SHA_CSB_WR, cfg, + AES_SHA_CSB_WR); + cfg = SETFIELD(NX_DMA_CFG_AES_SHA_COMPLETION_MODE, cfg, + AES_SHA_COMPLETION_MODE); + cfg = SETFIELD(NX_DMA_CFG_AES_SHA_CPB_WR, cfg, + AES_SHA_CPB_WR); + cfg = SETFIELD(NX_DMA_CFG_AES_SHA_OUTPUT_DATA_WR, cfg, + AES_SHA_OUTPUT_DATA_WR); + + cfg = SETFIELD(NX_DMA_CFG_AMF_MAX_RR, cfg, + AMF_MAX_RR); + cfg = SETFIELD(NX_DMA_CFG_AMF_CSB_WR, cfg, + AMF_CSB_WR); + cfg = SETFIELD(NX_DMA_CFG_AMF_COMPLETION_MODE, cfg, + AMF_COMPLETION_MODE); + cfg = SETFIELD(NX_DMA_CFG_AMF_CPB_WR, cfg, + AMF_CPB_WR); + cfg = SETFIELD(NX_DMA_CFG_AMF_OUTPUT_DATA_WR, cfg, + AMF_OUTPUT_DATA_WR); + + rc = xscom_write(gcid, xcfg, cfg); + if (rc) + prerror("NX%d: ERROR: DMA config failure %d\n", gcid, rc); + else + prlog(PR_DEBUG, "NX%d: DMA 0x%016lx\n", gcid, + 
(unsigned long)cfg); + + return rc; +} + +static int nx_cfg_iq(u32 gcid, u64 xcfg) +{ + u64 cfg; + int rc; + + rc = xscom_read(gcid, xcfg, &cfg); + if (rc) { + prerror("NX%d: ERROR: XSCOM CRB IQ config read failure %d\n", + gcid, rc); + return rc; + } + + cfg = SETFIELD(NX_CRB_IQ_SYM, cfg, CFG_CRB_IQ_SYM); + cfg = SETFIELD(NX_CRB_IQ_ASYM, cfg, CFG_CRB_IQ_ASYM); + + rc = xscom_write(gcid, xcfg, cfg); + if (rc) + prerror("NX%d: ERROR: CRB Input Queue failure %d\n", gcid, rc); + else + prlog(PR_DEBUG, "NX%d: CRB Input Queue 0x%016lx\n", + gcid, (unsigned long)cfg); + + return rc; +} + +static int nx_cfg_ee(u32 gcid, u64 xcfg) +{ + u64 cfg; + int rc; + + rc = xscom_read(gcid, xcfg, &cfg); + if (rc) { + prerror("NX%d: ERROR: XSCOM EE config read failure %d\n", + gcid, rc); + return rc; + } + + cfg = SETFIELD(NX_EE_CFG_CH7, cfg, EE_CH7); + cfg = SETFIELD(NX_EE_CFG_CH6, cfg, EE_CH6); + cfg = SETFIELD(NX_EE_CFG_CH5, cfg, EE_CH5); + cfg = SETFIELD(NX_EE_CFG_CH4, cfg, EE_CH4); + cfg = SETFIELD(NX_EE_CFG_CH3, cfg, EE_CH3); + cfg = SETFIELD(NX_EE_CFG_CH2, cfg, EE_CH2); + + rc = xscom_write(gcid, xcfg, cfg); + if (rc) + prerror("NX%d: ERROR: Engine Enable failure %d\n", gcid, rc); + else + prlog(PR_DEBUG, "NX%d: Engine Enable 0x%016lx\n", + gcid, (unsigned long)cfg); + + return rc; +} + +void nx_create_crypto_node(struct dt_node *node) +{ + u32 gcid; + u32 pb_base; + u64 cfg_dma, cfg_sym, cfg_asym, cfg_iq, cfg_ee; + int rc; + + gcid = dt_get_chip_id(node); + pb_base = dt_get_address(node, 0, NULL); + + prlog(PR_INFO, "NX%d: Crypto at 0x%x\n", gcid, pb_base); + + if (dt_node_is_compatible(node, "ibm,power8-nx")) { + cfg_dma = pb_base + NX_P8_DMA_CFG; + cfg_sym = pb_base + NX_P8_SYM_CFG; + cfg_asym = pb_base + NX_P8_ASYM_CFG; + cfg_iq = pb_base + NX_P8_CRB_IQ; + cfg_ee = pb_base + NX_P8_EE_CFG; + } else if (dt_node_is_compatible(node, "ibm,power9-nx")) { + prlog(PR_INFO, "NX%d: POWER9 nx-crypto not yet supported\n", + gcid); + return; + } else { + prerror("NX%d: ERROR: Unknown NX type!\n", gcid); + return; + } + + rc = nx_cfg_dma(gcid, cfg_dma); + if (rc) + return; + + rc = nx_cfg_sym(gcid, cfg_sym); + if (rc) + return; + + rc = nx_cfg_asym(gcid, cfg_asym); + if (rc) + return; + + rc = nx_cfg_iq(gcid, cfg_iq); + if (rc) + return; + + rc = nx_cfg_ee(gcid, cfg_ee); + if (rc) + return; + + prlog(PR_INFO, "NX%d: Crypto Coprocessors Disabled (not supported)\n", gcid); +} diff --git a/roms/skiboot/hw/nx-gzip.c b/roms/skiboot/hw/nx-gzip.c new file mode 100644 index 000000000..9bc491e70 --- /dev/null +++ b/roms/skiboot/hw/nx-gzip.c @@ -0,0 +1,118 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* + * NX GZIP (p9) accellerator support + * + * Copyright 2016-2017 IBM Corp. 
+ */ + +#include <skiboot.h> +#include <chip.h> +#include <xscom.h> +#include <io.h> +#include <cpu.h> +#include <nx.h> + +#define EE (1) /* enable gzip engine */ + +static int nx_cfg_gzip_umac(struct dt_node *node, u32 gcid, u32 pb_base) +{ + int rc; + u64 umac_bar, umac_notify; + struct dt_node *nx_node; + static u32 nxgzip_tid = 1; /* tid counter within coprocessor type */ + + nx_node = dt_new(node, "ibm,gzip-high-fifo"); + umac_bar = pb_base + NX_P9_GZIP_HIGH_PRI_RX_FIFO_BAR; + umac_notify = pb_base + NX_P9_GZIP_HIGH_PRI_RX_FIFO_NOTIFY_MATCH; + + rc = nx_cfg_rx_fifo(nx_node, "ibm,p9-nx-gzip", "High", gcid, + NX_CT_GZIP, nxgzip_tid++, umac_bar, + umac_notify); + if (rc) + return rc; + + nx_node = dt_new(node, "ibm,gzip-normal-fifo"); + umac_bar = pb_base + NX_P9_GZIP_NORMAL_PRI_RX_FIFO_BAR; + umac_notify = pb_base + NX_P9_GZIP_NORMAL_PRI_RX_FIFO_NOTIFY_MATCH; + + rc = nx_cfg_rx_fifo(nx_node, "ibm,p9-nx-gzip", "Normal", gcid, + NX_CT_GZIP, nxgzip_tid++, umac_bar, + umac_notify); + + return rc; +} + +static int nx_cfg_gzip_dma(u32 gcid, u64 xcfg) +{ + u64 cfg; + int rc; + + rc = xscom_read(gcid, xcfg, &cfg); + if (rc) + return rc; + + cfg = SETFIELD(NX_DMA_CFG_GZIP_COMPRESS_PREFETCH, cfg, + DMA_COMPRESS_PREFETCH); + cfg = SETFIELD(NX_DMA_CFG_GZIP_DECOMPRESS_PREFETCH, cfg, + DMA_DECOMPRESS_PREFETCH); + + cfg = SETFIELD(NX_DMA_CFG_GZIP_COMPRESS_MAX_RR, cfg, + DMA_COMPRESS_MAX_RR); + cfg = SETFIELD(NX_DMA_CFG_GZIP_DECOMPRESS_MAX_RR, cfg, + DMA_DECOMPRESS_MAX_RR); + + rc = xscom_write(gcid, xcfg, cfg); + if (rc) + prerror("NX%d: ERROR: DMA config failure %d\n", gcid, rc); + else + prlog(PR_DEBUG, "NX%d: DMA 0x%016lx\n", gcid, + (unsigned long)cfg); + + return rc; +} + +static int nx_cfg_gzip_ee(u32 gcid, u64 xcfg) +{ + u64 cfg; + int rc; + + rc = xscom_read(gcid, xcfg, &cfg); + if (rc) + return rc; + + cfg = SETFIELD(NX_P9_EE_CFG_CH4, cfg, EE); + + rc = xscom_write(gcid, xcfg, cfg); + if (rc) + prerror("NX%d: ERROR: Engine Enable failure %d\n", gcid, rc); + else + prlog(PR_DEBUG, "NX%d: Engine Enable 0x%016lx\n", + gcid, (unsigned long)cfg); + + return rc; +} + +void p9_nx_enable_gzip(struct dt_node *node, u32 gcid, u32 pb_base) +{ + u64 cfg_dma, cfg_ee; + int rc; + + prlog(PR_INFO, "NX%d: gzip at 0x%x\n", gcid, pb_base); + + cfg_dma = pb_base + NX_P9_DMA_CFG; + cfg_ee = pb_base + NX_P9_EE_CFG; + + rc = nx_cfg_gzip_dma(gcid, cfg_dma); + if (rc) + return; + + rc = nx_cfg_gzip_ee(gcid, cfg_ee); + if (rc) + return; + + rc = nx_cfg_gzip_umac(node, gcid, pb_base); + if (rc) + return; + + prlog(PR_INFO, "NX%d: gzip Coprocessor Enabled\n", gcid); +} diff --git a/roms/skiboot/hw/nx-rng.c b/roms/skiboot/hw/nx-rng.c new file mode 100644 index 000000000..274b33211 --- /dev/null +++ b/roms/skiboot/hw/nx-rng.c @@ -0,0 +1,121 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* + * NX Hardware Random Number Generator + * + * Copyright 2013-2019 IBM Corp. + */ + +#include <skiboot.h> +#include <xscom.h> +#include <io.h> +#include <cpu.h> +#include <nx.h> +#include <chip.h> +#include <phys-map.h> +#include <xscom-p9-regs.h> + +/* + * On P9 the DARN instruction is used to access the HW RNG. There is still + * an NX RNG BAR, but it is used to configure which NX a core will source + * random numbers from rather than being a MMIO window. 
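+ * nx_init_p9_rng() below therefore only sets up the NX MMIO BAR and
+ * reports the configured pace; the per-core DARN BARs are programmed
+ * separately in darn_init() (hw/nx.c).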
+ */ +static void nx_init_p9_rng(uint32_t chip_id) +{ + uint64_t bar, tmp; + + if (chip_quirk(QUIRK_NO_RNG)) + return; + + phys_map_get(chip_id, NX_RNG, 0, &bar, NULL); + xscom_write(chip_id, P9X_NX_MMIO_BAR, bar | P9X_NX_MMIO_BAR_EN); + + /* Read config register for pace info */ + xscom_read(chip_id, P9X_NX_RNG_CFG, &tmp); + prlog(PR_INFO, "NX RNG[%x] pace:%lli\n", chip_id, 0xffff & (tmp >> 2)); +} + +void nx_create_rng_node(struct dt_node *node) +{ + u64 bar, cfg; + u64 xbar, xcfg; + u32 pb_base; + u32 gcid; + u64 rng_addr, rng_len, len, addr_mask; + struct dt_node *rng; + int rc; + + gcid = dt_get_chip_id(node); + pb_base = dt_get_address(node, 0, NULL); + + if (dt_node_is_compatible(node, "ibm,power8-nx")) { + xbar = pb_base + NX_P8_RNG_BAR; + xcfg = pb_base + NX_P8_RNG_CFG; + addr_mask = NX_P8_RNG_BAR_ADDR; + } else if (dt_node_is_compatible(node, "ibm,power9-nx")) { + nx_init_p9_rng(gcid); + return; + } else { + prerror("NX%d: Unknown NX type!\n", gcid); + return; + } + + rc = xscom_read(gcid, xbar, &bar); /* Get RNG BAR */ + if (rc) { + prerror("NX%d: ERROR: XSCOM RNG BAR read failure %d\n", + gcid, rc); + return; + } + + rc = xscom_read(gcid, xcfg, &cfg); /* Get RNG CFG */ + if (rc) { + prerror("NX%d: ERROR: XSCOM RNG config read failure %d\n", + gcid, rc); + return; + } + + /* + * We mask in-place rather than using GETFIELD for the base address + * as we happen to *know* that it's properly aligned in the register. + * + * FIXME? Always assusme BAR gets a valid address from FSP + */ + rng_addr = bar & addr_mask; + len = GETFIELD(NX_RNG_BAR_SIZE, bar); + if (len > 4) { + prerror("NX%d: Corrupted bar size %lld\n", gcid, len); + return; + } + rng_len = (u64[]){ 0x1000, /* 4K */ + 0x10000, /* 64K */ + 0x400000000UL, /* 16G*/ + 0x100000, /* 1M */ + 0x1000000 /* 16M */} [len]; + + + prlog(PR_INFO, "NX%d: RNG BAR set to 0x%016llx..0x%016llx\n", + gcid, rng_addr, rng_addr + rng_len - 1); + + /* RNG must be enabled before MMIO is enabled */ + rc = xscom_write(gcid, xcfg, cfg | NX_RNG_CFG_ENABLE); + if (rc) { + prerror("NX%d: ERROR: XSCOM RNG config enable failure %d\n", + gcid, rc); + return; + } + + /* The BAR needs to be enabled too */ + rc = xscom_write(gcid, xbar, bar | NX_RNG_BAR_ENABLE); + if (rc) { + prerror("NX%d: ERROR: XSCOM RNG config enable failure %d\n", + gcid, rc); + return; + } + + rng = dt_new_addr(dt_root, "hwrng", rng_addr); + if (!rng) + return; + + dt_add_property_strings(rng, "compatible", "ibm,power-rng"); + dt_add_property_u64s(rng, "reg", rng_addr, rng_len); + dt_add_property_cells(rng, "ibm,chip-id", gcid); +} diff --git a/roms/skiboot/hw/nx.c b/roms/skiboot/hw/nx.c new file mode 100644 index 000000000..fdadf53c7 --- /dev/null +++ b/roms/skiboot/hw/nx.c @@ -0,0 +1,138 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* + * NX Accellerator unit support + * + * Copyright 2013-2019 IBM Corp. + */ + +#include <skiboot.h> +#include <xscom.h> +#include <io.h> +#include <cpu.h> +#include <nx.h> +#include <chip.h> +#include <xscom-p9-regs.h> +#include <xscom-p10-regs.h> +#include <phys-map.h> +#include <vas.h> +#include <p9_stop_api.H> + +static void darn_init(void) +{ + struct dt_node *nx; + struct proc_chip *chip; + struct cpu_thread *c; + uint64_t bar, default_bar; + + if (chip_quirk(QUIRK_NO_RNG)) + return; + + /* + * To allow the DARN instruction to function there must be at least + * one NX available in the system. Otherwise using DARN will result + * in a checkstop. I suppose we could mask the FIR... 
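+	 * If a chip has no NX BAR address programmed, the loop below falls
+	 * back to the default NX_RNG BAR of the chip owning the first
+	 * "ibm,power9-nx" node found in the device tree.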
+ */ + dt_for_each_compatible(dt_root, nx, "ibm,power9-nx") + break; + assert(nx); + + phys_map_get(dt_get_chip_id(nx), NX_RNG, 0, &default_bar, NULL); + + for_each_chip(chip) { + /* is this NX enabled? */ + xscom_read(chip->id, P9X_NX_MMIO_BAR, &bar); + if (!(bar & ~P9X_NX_MMIO_BAR_EN)) + bar = default_bar; + + for_each_available_core_in_chip(c, chip->id) { + uint64_t addr; + + if (proc_gen == proc_gen_p9) { + addr = XSCOM_ADDR_P9_EX(pir_to_core_id(c->pir), + P9X_EX_NCU_DARN_BAR); + xscom_write(chip->id, addr, + bar | P9X_EX_NCU_DARN_BAR_EN); + } else if (proc_gen >= proc_gen_p10) { + addr = XSCOM_ADDR_P10_NCU(pir_to_core_id(c->pir), + P10_NCU_DARN_BAR); + xscom_write(chip->id, addr, + bar | P10_NCU_DARN_BAR_EN); + /* Init for sibling core also */ + if (c->is_fused_core) { + addr = XSCOM_ADDR_P10_NCU(pir_to_core_id(c->pir + 1), + P10_NCU_DARN_BAR); + xscom_write(chip->id, addr, + bar | P10_NCU_DARN_BAR_EN); + } + } + } + } +} + +void nx_p9_rng_late_init(void) +{ + struct cpu_thread *c; + uint64_t rc; + + if (proc_gen < proc_gen_p9) + return; + if (chip_quirk(QUIRK_NO_RNG)) + return; + + prlog(PR_INFO, "SLW: Configuring self-restore for P9X_EX_NCU_DARN_BAR\n"); + for_each_present_cpu(c) { + if(cpu_is_thread0(c)) { + struct proc_chip *chip = get_chip(c->chip_id); + uint64_t addr, bar; + + phys_map_get(chip->id, NX_RNG, 0, &bar, NULL); + addr = XSCOM_ADDR_P9_EX(pir_to_core_id(c->pir), + P9X_EX_NCU_DARN_BAR); + /* Bail out if wakeup engine has already failed */ + if ( wakeup_engine_state != WAKEUP_ENGINE_PRESENT) { + prlog(PR_ERR,"DARN BAR p9_stop_api fail detected\n"); + break; + } + rc = p9_stop_save_scom((void *)chip->homer_base, + addr, bar | P9X_EX_NCU_DARN_BAR_EN, + P9_STOP_SCOM_REPLACE, + P9_STOP_SECTION_EQ_SCOM); + if (rc) { + prlog(PR_ERR, + "p9_stop_api for DARN_BAR failed rc= %lld", + rc); + prlog(PR_ERR, "Disabling deep stop states\n"); + wakeup_engine_state = WAKEUP_ENGINE_FAILED; + break; + } + } + } +} + +static void nx_init_one(struct dt_node *node) +{ + nx_create_rng_node(node); + + if (!vas_nx_enabled()) + return; + + nx_create_crypto_node(node); + + nx_create_compress_node(node); +} + +void nx_init(void) +{ + struct dt_node *node; + + dt_for_each_compatible(dt_root, node, "ibm,power-nx") { + nx_init_one(node); + } + + dt_for_each_compatible(dt_root, node, "ibm,power9-nx") { + nx_init_one(node); + } + + if (proc_gen >= proc_gen_p9) + darn_init(); +} diff --git a/roms/skiboot/hw/occ-sensor.c b/roms/skiboot/hw/occ-sensor.c new file mode 100644 index 000000000..6efaf908b --- /dev/null +++ b/roms/skiboot/hw/occ-sensor.c @@ -0,0 +1,640 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* + * OCC (On Chip Controller) exports a bunch of sensors + * + * Copyright 2017-2019 IBM Corp. 
+ */ + +#include <skiboot.h> +#include <opal.h> +#include <chip.h> +#include <sensor.h> +#include <device.h> +#include <cpu.h> +#include <occ.h> + +enum sensor_attr { + SENSOR_SAMPLE, + SENSOR_SAMPLE_MIN, /* OCC's min/max */ + SENSOR_SAMPLE_MAX, + SENSOR_CSM_MIN, /* CSM's min/max */ + SENSOR_CSM_MAX, + SENSOR_ACCUMULATOR, + MAX_SENSOR_ATTR, +}; + +#define HWMON_SENSORS_MASK (OCC_SENSOR_TYPE_CURRENT | \ + OCC_SENSOR_TYPE_VOLTAGE | \ + OCC_SENSOR_TYPE_TEMPERATURE | \ + OCC_SENSOR_TYPE_POWER) + +/* + * Standard HWMON linux interface expects the below units for the + * environment sensors: + * - Current : milliampere + * - Voltage : millivolt + * - Temperature : millidegree Celsius (scaled in kernel) + * - Power : microWatt (scaled in kernel) + * - Energy : microJoule + */ + +/* + * OCC sensor units are obtained after scaling the sensor values. + * https://github.com/open-power/occ/blob/master/src/occ_405/sensor/sensor_info.c + */ + +static struct str_map { + const char *occ_str; + const char *opal_str; +} str_maps[] = { + {"PWRSYS", "System"}, + /* Bulk power of the system: Watt */ + {"PWRFAN", "Fan"}, + /* Power consumption of the system fans: Watt */ + {"PWRIO", "IO"}, + /* Power consumption of the IO subsystem: Watt */ + {"PWRSTORE", "Storage"}, + /* Power comsumption of the storage subsystem: Watt */ + {"PWRGPU", "GPU"}, + /* Power consumption for GPUs per socket read from APSS: Watt */ + {"PWRAPSSCH", "APSS"}, + /* Power Provided by APSS channel x (where x=0…15): Watt */ + {"PWRPROC", ""}, + /* Power consumption for this Processor: Watt */ + {"PWRVDD", "Vdd"}, + /* Power consumption for this Processor's Vdd(AVSBus readings): Watt */ + {"PWRVDN", "Vdn"}, + /* Power consumption for this Processor's Vdn (nest) + * Calculated from AVSBus readings: Watt */ + {"PWRMEM", "Memory"}, + /* Power consumption for Memory for this Processor read from APSS: + * Watt */ + {"CURVDD", "Vdd"}, + /* Processor Vdd Current (read from AVSBus): Ampere */ + {"CURVDN", "Vdn"}, + /* Processor Vdn Current (read from AVSBus): Ampere */ + {"VOLTVDDSENSE", "Vdd Remote Sense"}, + /* Vdd Voltage at the remote sense. + * AVS reading adjusted for loadline: millivolt */ + {"VOLTVDNSENSE", "Vdn Remote Sense"}, + /* Vdn Voltage at the remote sense. 
+ * AVS reading adjusted for loadline: millivolt */ + {"VOLTVDD", "Vdd"}, + /* Processor Vdd Voltage (read from AVSBus): millivolt */ + {"VOLTVDN", "Vdn"}, + /* Processor Vdn Voltage (read from AVSBus): millivolt */ + {"TEMPC", "Core"}, + /* Average temperature of core DTS sensors for Processor's Core y: + * Celsius */ + {"TEMPQ", "Quad"}, + /* Average temperature of quad (in cache) DTS sensors for + * Processor’s Quad y: Celsius */ + {"TEMPNEST", "Nest"}, + /* Average temperature of nest DTS sensors: Celsius */ + {"TEMPPROCTHRMC", "Core"}, + /* The combined weighted core/quad temperature for processor core y: + * Celsius */ + {"TEMPDIMM", "DIMM"}, + /* DIMM temperature for DIMM x: Celsius */ + {"TEMPGPU", "GPU"}, + /* GPU x (0..2) board temperature: Celsius */ + /* TEMPGPUxMEM: GPU x hottest HBM temperature (individual memory + * temperatures are not available): Celsius */ + {"TEMPVDD", "VRM VDD"}, + /* VRM Vdd temperature: Celsius */ +}; + +static u64 occ_sensor_base; + +static inline +struct occ_sensor_data_header *get_sensor_header_block(int occ_num) +{ + return (struct occ_sensor_data_header *) + (occ_sensor_base + occ_num * OCC_SENSOR_DATA_BLOCK_SIZE); +} + +static inline +struct occ_sensor_name *get_names_block(struct occ_sensor_data_header *hb) +{ + return ((struct occ_sensor_name *)((u64)hb + be32_to_cpu(hb->names_offset))); +} + +static inline u32 sensor_handler(int occ_num, int sensor_id, int attr) +{ + return sensor_make_handler(SENSOR_OCC, occ_num, sensor_id, attr); +} + +/* + * The scaling factor for the sensors is encoded in the below format: + * (((UINT32)mantissa << 8) | (UINT32)((UINT8) 256 + (UINT8)exp)) + * https://github.com/open-power/occ/blob/master/src/occ_405/sensor/sensor.h + */ +static void scale_sensor(struct occ_sensor_name *md, u64 *sensor) +{ + u32 factor = be32_to_cpu(md->scale_factor); + int i; + s8 exp; + + if (be16_to_cpu(md->type) == OCC_SENSOR_TYPE_CURRENT) + *sensor *= 1000; //convert to mA + + *sensor *= factor >> 8; + exp = factor & 0xFF; + + if (exp > 0) { + for (i = labs(exp); i > 0; i--) + *sensor *= 10; + } else { + for (i = labs(exp); i > 0; i--) + *sensor /= 10; + } +} + +static void scale_energy(struct occ_sensor_name *md, u64 *sensor) +{ + u32 factor = be32_to_cpu(md->freq); + int i; + s8 exp; + + *sensor *= 1000000; //convert to uJ + + *sensor /= factor >> 8; + exp = factor & 0xFF; + + if (exp > 0) { + for (i = labs(exp); i > 0; i--) + *sensor /= 10; + } else { + for (i = labs(exp); i > 0; i--) + *sensor *= 10; + } +} + +static u64 read_sensor(struct occ_sensor_record *sensor, int attr) +{ + switch (attr) { + case SENSOR_SAMPLE: + return be16_to_cpu(sensor->sample); + case SENSOR_SAMPLE_MIN: + return be16_to_cpu(sensor->sample_min); + case SENSOR_SAMPLE_MAX: + return be16_to_cpu(sensor->sample_max); + case SENSOR_CSM_MIN: + return be16_to_cpu(sensor->csm_min); + case SENSOR_CSM_MAX: + return be16_to_cpu(sensor->csm_max); + case SENSOR_ACCUMULATOR: + return be64_to_cpu(sensor->accumulator); + default: + break; + } + + return 0; +} + +static void *select_sensor_buffer(struct occ_sensor_data_header *hb, int id) +{ + struct occ_sensor_name *md; + u8 *ping, *pong; + void *buffer = NULL; + u32 reading_offset; + + if (!hb) + return NULL; + + md = get_names_block(hb); + + ping = (u8 *)((u64)hb + be32_to_cpu(hb->reading_ping_offset)); + pong = (u8 *)((u64)hb + be32_to_cpu(hb->reading_pong_offset)); + reading_offset = be32_to_cpu(md[id].reading_offset); + + /* Check which buffer is valid and read the data from that. 
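+	 * A buffer is treated as valid when the first byte at its
+	 * reading_ping/pong_offset is non-zero:
+	 *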
+ * Ping Pong Action + * 0 0 Return with error + * 0 1 Read Pong + * 1 0 Read Ping + * 1 1 Read the buffer with latest timestamp + */ + + if (*ping && *pong) { + u64 tping, tpong; + u64 ping_buf = (u64)ping + reading_offset; + u64 pong_buf = (u64)pong + reading_offset; + + tping = be64_to_cpu(((struct occ_sensor_record *)ping_buf)->timestamp); + tpong = be64_to_cpu(((struct occ_sensor_record *)pong_buf)->timestamp); + + if (tping > tpong) + buffer = ping; + else + buffer = pong; + } else if (*ping && !*pong) { + buffer = ping; + } else if (!*ping && *pong) { + buffer = pong; + } else if (!*ping && !*pong) { + prlog(PR_DEBUG, "OCC: Both ping and pong sensor buffers are invalid\n"); + return NULL; + } + + assert(buffer); + buffer = (void *)((u64)buffer + reading_offset); + + return buffer; +} + +int occ_sensor_read(u32 handle, __be64 *data) +{ + struct occ_sensor_data_header *hb; + struct occ_sensor_name *md; + u16 id = sensor_get_rid(handle); + u8 occ_num = sensor_get_frc(handle); + u8 attr = sensor_get_attr(handle); + u64 d; + void *buff; + + if (occ_num > MAX_OCCS) + return OPAL_PARAMETER; + + if (attr > MAX_SENSOR_ATTR) + return OPAL_PARAMETER; + + if (is_occ_reset()) + return OPAL_HARDWARE; + + hb = get_sensor_header_block(occ_num); + + if (hb->valid != 1) + return OPAL_HARDWARE; + + if (id > be16_to_cpu(hb->nr_sensors)) + return OPAL_PARAMETER; + + buff = select_sensor_buffer(hb, id); + if (!buff) + return OPAL_HARDWARE; + + d = read_sensor(buff, attr); + if (!d) + goto out_success; + + md = get_names_block(hb); + if (be16_to_cpu(md[id].type) == OCC_SENSOR_TYPE_POWER && attr == SENSOR_ACCUMULATOR) + scale_energy(&md[id], &d); + else + scale_sensor(&md[id], &d); + +out_success: + *data = cpu_to_be64(d); + + return OPAL_SUCCESS; +} + +static bool occ_sensor_sanity(struct occ_sensor_data_header *hb, int chipid) +{ + if (hb->valid != 0x01) { + prerror("OCC: Chip %d sensor data invalid\n", chipid); + return false; + } + + if (hb->version != 0x01) { + prerror("OCC: Chip %d unsupported sensor header block version %d\n", + chipid, hb->version); + return false; + } + + if (hb->reading_version != 0x01) { + prerror("OCC: Chip %d unsupported sensor record format %d\n", + chipid, hb->reading_version); + return false; + } + + if (hb->names_version != 0x01) { + prerror("OCC: Chip %d unsupported sensor names format %d\n", + chipid, hb->names_version); + return false; + } + + if (hb->name_length != sizeof(struct occ_sensor_name)) { + prerror("OCC: Chip %d unsupported sensor names length %d\n", + chipid, hb->name_length); + return false; + } + + if (!hb->nr_sensors) { + prerror("OCC: Chip %d has no sensors\n", chipid); + return false; + } + + if (!hb->names_offset || + !hb->reading_ping_offset || + !hb->reading_pong_offset) { + prerror("OCC: Chip %d Invalid sensor buffer pointers\n", + chipid); + return false; + } + + return true; +} + +/* + * parse_entity: Parses OCC sensor name to return the entity number like + * chipid, core-id, dimm-no, gpu-no. 'end' is used to + * get the subentity strings. Returns -1 if no number is found. 
+ * TEMPC4 --> returns 4, end will be NULL + * TEMPGPU2DRAM1 --> returns 2, end = "DRAM1" + * PWRSYS --> returns -1, end = NULL + */ +static int parse_entity(const char *name, char **end) +{ + while (*name != '\0') { + if (isdigit(*name)) + break; + name++; + } + + if (*name) + return strtol(name, end, 10); + else + return -1; +} + +static void add_sensor_label(struct dt_node *node, struct occ_sensor_name *md, + int chipid) +{ + char sname[30] = ""; + char prefix[30] = ""; + uint16_t location = be16_to_cpu(md->location); + int i; + + if (location != OCC_SENSOR_LOC_SYSTEM) + snprintf(prefix, sizeof(prefix), "%s %d ", "Chip", chipid); + + for (i = 0; i < ARRAY_SIZE(str_maps); i++) + if (!strncmp(str_maps[i].occ_str, md->name, + strlen(str_maps[i].occ_str))) { + char *end; + int num = -1; + + if (location != OCC_SENSOR_LOC_CORE) + num = parse_entity(md->name, &end); + + if (num != -1) { + snprintf(sname, sizeof(sname), "%s%s %d %s", + prefix, str_maps[i].opal_str, num, + end); + } else { + snprintf(sname, sizeof(sname), "%s%s", prefix, + str_maps[i].opal_str); + } + dt_add_property_string(node, "label", sname); + return; + } + + /* Fallback to OCC literal if mapping is not found */ + if (location == OCC_SENSOR_LOC_SYSTEM) { + dt_add_property_string(node, "label", md->name); + } else { + snprintf(sname, sizeof(sname), "%s%s", prefix, md->name); + dt_add_property_string(node, "label", sname); + } +} + +static const char *get_sensor_type_string(enum occ_sensor_type type) +{ + switch (type) { + case OCC_SENSOR_TYPE_POWER: + return "power"; + case OCC_SENSOR_TYPE_TEMPERATURE: + return "temp"; + case OCC_SENSOR_TYPE_CURRENT: + return "curr"; + case OCC_SENSOR_TYPE_VOLTAGE: + return "in"; + default: + break; + } + + return "unknown"; +} + +static const char *get_sensor_loc_string(enum occ_sensor_location loc) +{ + switch (loc) { + case OCC_SENSOR_LOC_SYSTEM: + return "sys"; + case OCC_SENSOR_LOC_PROCESSOR: + return "proc"; + case OCC_SENSOR_LOC_MEMORY: + return "mem"; + case OCC_SENSOR_LOC_VRM: + return "vrm"; + case OCC_SENSOR_LOC_CORE: + return "core"; + case OCC_SENSOR_LOC_QUAD: + return "quad"; + case OCC_SENSOR_LOC_GPU: + return "gpu"; + default: + break; + } + + return "unknown"; +} + +/* + * Power sensors can be 0 valued in few platforms like Zaius, Romulus + * which do not have APSS. At the moment there is no HDAT/DT property + * to indicate if APSS is present. So for now skip zero valued power + * sensors. 
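+ * check_sensor_sample() below reports a sensor as present only when
+ * its sample is non-zero in either the ping or the pong buffer, so
+ * zero-valued power sensors are not exported.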
+ */ +static bool check_sensor_sample(struct occ_sensor_data_header *hb, u32 offset) +{ + struct occ_sensor_record *ping, *pong; + + ping = (struct occ_sensor_record *)((u64)hb + + be32_to_cpu(hb->reading_ping_offset) + offset); + pong = (struct occ_sensor_record *)((u64)hb + + be32_to_cpu(hb->reading_pong_offset) + offset); + return ping->sample || pong->sample; +} + +static void add_sensor_node(const char *loc, const char *type, int i, int attr, + struct occ_sensor_name *md, __be32 *phandle, u32 *ptype, + u32 pir, u32 occ_num, u32 chipid) +{ + char name[30]; + struct dt_node *node; + u32 handler; + + snprintf(name, sizeof(name), "%s-%s", loc, type); + handler = sensor_handler(occ_num, i, attr); + node = dt_new_addr(sensor_node, name, handler); + dt_add_property_string(node, "sensor-type", type); + dt_add_property_cells(node, "sensor-data", handler); + dt_add_property_cells(node, "reg", handler); + dt_add_property_string(node, "occ_label", md->name); + add_sensor_label(node, md, chipid); + + if (be16_to_cpu(md->location) == OCC_SENSOR_LOC_CORE) + dt_add_property_cells(node, "ibm,pir", pir); + + *ptype = be16_to_cpu(md->type); + + if (attr == SENSOR_SAMPLE) { + handler = sensor_handler(occ_num, i, SENSOR_CSM_MAX); + dt_add_property_cells(node, "sensor-data-max", handler); + + handler = sensor_handler(occ_num, i, SENSOR_CSM_MIN); + dt_add_property_cells(node, "sensor-data-min", handler); + } + + dt_add_property_string(node, "compatible", "ibm,opal-sensor"); + *phandle = cpu_to_be32(node->phandle); +} + +bool occ_sensors_init(void) +{ + struct proc_chip *chip; + struct dt_node *sg, *exports; + int occ_num = 0, i; + bool has_gpu = false; + + /* OCC inband sensors is only supported in P9/10 */ + if (proc_gen < proc_gen_p9) + return false; + + /* Sensors are copied to BAR2 OCC Common Area */ + chip = next_chip(NULL); + if (!chip->occ_common_base) { + prerror("OCC: Unassigned OCC Common Area. No sensors found\n"); + return false; + } + + occ_sensor_base = chip->occ_common_base + OCC_SENSOR_DATA_BLOCK_OFFSET; + + sg = dt_new(opal_node, "sensor-groups"); + if (!sg) { + prerror("OCC: Failed to create sensor groups node\n"); + return false; + } + dt_add_property_string(sg, "compatible", "ibm,opal-sensor-group"); + dt_add_property_cells(sg, "#address-cells", 1); + dt_add_property_cells(sg, "#size-cells", 0); + + /* + * On POWER9, ibm,ioda2-npu2-phb indicates the presence of a + * GPU NVlink. 
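+	 * When such a PHB exists, the chips are probed (up to three GPU
+	 * slots each) until a GPU is found; GPU-located sensors are
+	 * exported only if at least one GPU is reported present.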
+ */ + if (dt_find_compatible_node(dt_root, NULL, "ibm,ioda2-npu2-phb")) { + + for_each_chip(chip) { + int max_gpus_per_chip = 3, i; + + for(i = 0; i < max_gpus_per_chip; i++) { + has_gpu = occ_get_gpu_presence(chip, i); + + if (has_gpu) + break; + } + + if (has_gpu) + break; + } + } + + for_each_chip(chip) { + struct occ_sensor_data_header *hb; + struct occ_sensor_name *md; + __be32 *phandles; + u32 *ptype, phcount = 0; + unsigned int nr_sensors; + + hb = get_sensor_header_block(occ_num); + md = get_names_block(hb); + + /* Sanity check of the Sensor Data Header Block */ + if (!occ_sensor_sanity(hb, chip->id)) + continue; + + nr_sensors = be16_to_cpu(hb->nr_sensors); + + phandles = malloc(nr_sensors * sizeof(__be32)); + assert(phandles); + ptype = malloc(nr_sensors * sizeof(u32)); + assert(ptype); + + for (i = 0; i < nr_sensors; i++) { + const char *type_name, *loc; + struct cpu_thread *c = NULL; + uint32_t pir = 0; + uint16_t type = be16_to_cpu(md[i].type); + uint16_t location = be16_to_cpu(md[i].location); + + if (md[i].structure_type != OCC_SENSOR_READING_FULL) + continue; + + if (!(type & HWMON_SENSORS_MASK)) + continue; + + if (location == OCC_SENSOR_LOC_GPU && !has_gpu) + continue; + + if (type == OCC_SENSOR_TYPE_POWER && + !check_sensor_sample(hb, be32_to_cpu(md[i].reading_offset))) + continue; + + if (location == OCC_SENSOR_LOC_CORE) { + int num = parse_entity(md[i].name, NULL); + + for_each_available_core_in_chip(c, chip->id) + if (pir_to_core_id(c->pir) == num) + break; + if (!c) + continue; + pir = c->pir; + } + + type_name = get_sensor_type_string(type); + loc = get_sensor_loc_string(location); + + add_sensor_node(loc, type_name, i, SENSOR_SAMPLE, &md[i], + &phandles[phcount], &ptype[phcount], + pir, occ_num, chip->id); + phcount++; + + /* Add energy sensors */ + if (type == OCC_SENSOR_TYPE_POWER && + md[i].structure_type == OCC_SENSOR_READING_FULL) { + add_sensor_node(loc, "energy", i, + SENSOR_ACCUMULATOR, &md[i], + &phandles[phcount], &ptype[phcount], + pir, occ_num, chip->id); + phcount++; + } + + } + occ_num++; + occ_add_sensor_groups(sg, phandles, ptype, phcount, chip->id); + free(phandles); + free(ptype); + } + /* clear the device tree property if no sensors */ + if (list_empty(&sg->children)) { + dt_free(sg); + } + + if (!occ_num) + return false; + + exports = dt_find_by_path(dt_root, "/ibm,opal/firmware/exports"); + if (!exports) { + prerror("OCC: dt node /ibm,opal/firmware/exports not found\n"); + return false; + } + + dt_add_property_u64s(exports, "occ_inband_sensors", occ_sensor_base, + OCC_SENSOR_DATA_BLOCK_SIZE * occ_num); + + return true; +} diff --git a/roms/skiboot/hw/occ.c b/roms/skiboot/hw/occ.c new file mode 100644 index 000000000..8d7bcbec9 --- /dev/null +++ b/roms/skiboot/hw/occ.c @@ -0,0 +1,2339 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* + * Interface with the On Chip Controller, + * which enforces power and thermal management + * + * Copyright 2013-2019 IBM Corp. 
+ */ + +#include <skiboot.h> +#include <xscom.h> +#include <xscom-p8-regs.h> +#include <io.h> +#include <cpu.h> +#include <chip.h> +#include <mem_region.h> +#include <timebase.h> +#include <errorlog.h> +#include <opal-api.h> +#include <opal-msg.h> +#include <timer.h> +#include <i2c.h> +#include <powercap.h> +#include <psr.h> +#include <sensor.h> +#include <occ.h> +#include <psi.h> + +/* OCC Communication Area for PStates */ + +#define P8_HOMER_OPAL_DATA_OFFSET 0x1F8000 +#define P9_HOMER_OPAL_DATA_OFFSET 0x0E2000 + +#define OPAL_DYNAMIC_DATA_OFFSET 0x0B80 +/* relative to HOMER_OPAL_DATA_OFFSET */ + +#define MAX_PSTATES 256 +#define MAX_P8_CORES 12 +#define MAX_P9_CORES 24 +#define MAX_P10_CORES 32 + +#define MAX_OPAL_CMD_DATA_LENGTH 4090 +#define MAX_OCC_RSP_DATA_LENGTH 8698 + +#define P8_PIR_CORE_MASK 0xFFF8 +#define P9_PIR_QUAD_MASK 0xFFF0 +#define P10_PIR_CHIP_MASK 0x0000 +#define FREQ_MAX_IN_DOMAIN 0 +#define FREQ_MOST_RECENTLY_SET 1 + +/** + * OCC-OPAL Shared Memory Region + * + * Reference document : + * https://github.com/open-power/docs/blob/master/occ/OCC_OpenPwr_FW_Interfaces.pdf + * + * Supported layout versions: + * - 0x01, 0x02 : P8 + * https://github.com/open-power/occ/blob/master_p8/src/occ/proc/proc_pstate.h + * + * - 0x90 : P9 + * https://github.com/open-power/occ/blob/master/src/occ_405/proc/proc_pstate.h + * In 0x90 the data is separated into :- + * -- Static Data (struct occ_pstate_table): Data is written once by OCC + * -- Dynamic Data (struct occ_dynamic_data): Data is updated at runtime + * + * struct occ_pstate_table - Pstate table layout + * @valid: Indicates if data is valid + * @version: Layout version [Major/Minor] + * @v2.throttle: Reason for limiting the max pstate + * @v9.occ_role: OCC role (Master/Slave) + * @v#.pstate_min: Minimum pstate ever allowed + * @v#.pstate_nom: Nominal pstate + * @v#.pstate_turbo: Maximum turbo pstate + * @v#.pstate_ultra_turbo: Maximum ultra turbo pstate and the maximum + * pstate ever allowed + * @v#.pstates: Pstate-id and frequency list from Pmax to Pmin + * @v#.pstates.id: Pstate-id + * @v#.pstates.flags: Pstate-flag(reserved) + * @v2.pstates.vdd: Voltage Identifier + * @v2.pstates.vcs: Voltage Identifier + * @v#.pstates.freq_khz: Frequency in KHz + * @v#.core_max[1..N]: Max pstate with N active cores + * @spare/reserved/pad: Unused data + */ +struct occ_pstate_table { + u8 valid; + u8 version; + union __packed { + struct __packed { /* Version 0x01 and 0x02 */ + u8 throttle; + s8 pstate_min; + s8 pstate_nom; + s8 pstate_turbo; + s8 pstate_ultra_turbo; + u8 spare; + u64 reserved; + struct __packed { + s8 id; + u8 flags; + u8 vdd; + u8 vcs; + __be32 freq_khz; + } pstates[MAX_PSTATES]; + s8 core_max[MAX_P8_CORES]; + u8 pad[100]; + } v2; + struct __packed { /* Version 0x90 */ + u8 occ_role; + u8 pstate_min; + u8 pstate_nom; + u8 pstate_turbo; + u8 pstate_ultra_turbo; + u8 spare; + u64 reserved1; + u64 reserved2; + struct __packed { + u8 id; + u8 flags; + u16 reserved; + __be32 freq_khz; + } pstates[MAX_PSTATES]; + u8 core_max[MAX_P9_CORES]; + u8 pad[56]; + } v9; + struct __packed { /* Version 0xA0 */ + u8 occ_role; + u8 pstate_min; + u8 pstate_fixed_freq; + u8 pstate_base; + u8 pstate_ultra_turbo; + u8 pstate_fmax; + u8 minor; + u8 pstate_bottom_throttle; + u8 spare; + u8 spare1; + u32 reserved_32; + u64 reserved_64; + struct __packed { + u8 id; + u8 valid; + u16 reserved; + __be32 freq_khz; + } pstates[MAX_PSTATES]; + u8 core_max[MAX_P10_CORES]; + u8 pad[48]; + } v10; + }; +} __packed; + +/** + * OPAL-OCC Command Response Interface 
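+ *
+ * The command and response buffers live in the dynamic data section
+ * of the OCC-OPAL shared memory region (struct occ_dynamic_data) and
+ * are laid out by struct opal_command_buffer and struct
+ * occ_response_buffer below.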
+ * + * OPAL-OCC Command Buffer + * + * --------------------------------------------------------------------- + * | OPAL | Cmd | OPAL | | Cmd Data | Cmd Data | OPAL | + * | Cmd | Request | OCC | Reserved | Length | Length | Cmd | + * | Flags | ID | Cmd | | (MSB) | (LSB) | Data... | + * --------------------------------------------------------------------- + * | ….OPAL Command Data up to max of Cmd Data Length 4090 bytes | + * | | + * --------------------------------------------------------------------- + * + * OPAL Command Flag + * + * ----------------------------------------------------------------- + * | Bit 7 | Bit 6 | Bit 5 | Bit 4 | Bit 3 | Bit 2 | Bit 1 | Bit 0 | + * | (msb) | | | | | | | (lsb) | + * ----------------------------------------------------------------- + * |Cmd | | | | | | | | + * |Ready | | | | | | | | + * ----------------------------------------------------------------- + * + * struct opal_command_buffer - Defines the layout of OPAL command buffer + * @flag: Provides general status of the command + * @request_id: Token to identify request + * @cmd: Command sent + * @data_size: Command data length + * @data: Command specific data + * @spare: Unused byte + */ +struct opal_command_buffer { + u8 flag; + u8 request_id; + u8 cmd; + u8 spare; + u16 data_size; + u8 data[MAX_OPAL_CMD_DATA_LENGTH]; +} __packed; + +/** + * OPAL-OCC Response Buffer + * + * --------------------------------------------------------------------- + * | OCC | Cmd | OPAL | Response | Rsp Data | Rsp Data | OPAL | + * | Rsp | Request | OCC | Status | Length | Length | Rsp | + * | Flags | ID | Cmd | | (MSB) | (LSB) | Data... | + * --------------------------------------------------------------------- + * | ….OPAL Response Data up to max of Rsp Data Length 8698 bytes | + * | | + * --------------------------------------------------------------------- + * + * OCC Response Flag + * + * ----------------------------------------------------------------- + * | Bit 7 | Bit 6 | Bit 5 | Bit 4 | Bit 3 | Bit 2 | Bit 1 | Bit 0 | + * | (msb) | | | | | | | (lsb) | + * ----------------------------------------------------------------- + * | | | | | | |OCC in | Rsp | + * | | | | | | |progress|Ready | + * ----------------------------------------------------------------- + * + * struct occ_response_buffer - Defines the layout of OCC response buffer + * @flag: Provides general status of the response + * @request_id: Token to identify request + * @cmd: Command requested + * @status: Indicates success/failure status of + * the command + * @data_size: Response data length + * @data: Response specific data + */ +struct occ_response_buffer { + u8 flag; + u8 request_id; + u8 cmd; + u8 status; + u16 data_size; + u8 data[MAX_OCC_RSP_DATA_LENGTH]; +} __packed; + +/** + * OCC-OPAL Shared Memory Interface Dynamic Data Vx90 + * + * struct occ_dynamic_data - Contains runtime attributes + * @occ_state: Current state of OCC + * @major_version: Major version number + * @minor_version: Minor version number (backwards compatible) + * Version 1 indicates GPU presence populated + * @gpus_present: Bitmask of GPUs present (on systems where GPU + * presence is detected through APSS) + * @cpu_throttle: Reason for limiting the max pstate + * @mem_throttle: Reason for throttling memory + * @quick_pwr_drop: Indicates if QPD is asserted + * @pwr_shifting_ratio: Indicates the current percentage of power to + * take away from the CPU vs GPU when shifting + * power to maintain a power cap. Value of 100 + * means take all power from CPU. 
+ * @pwr_cap_type: Indicates type of power cap in effect + * @hard_min_pwr_cap: Hard minimum system power cap in Watts. + * Guaranteed unless hardware failure + * @max_pwr_cap: Maximum allowed system power cap in Watts + * @cur_pwr_cap: Current system power cap + * @soft_min_pwr_cap: Soft powercap minimum. OCC may or may not be + * able to maintain this + * @spare/reserved: Unused data + * @cmd: Opal Command Buffer + * @rsp: OCC Response Buffer + */ +struct occ_dynamic_data { + u8 occ_state; + u8 major_version; + u8 minor_version; + u8 gpus_present; + struct __packed { /* Version 0x90 */ + u8 spare1; + } v9; + struct __packed { /* Version 0xA0 */ + u8 wof_enabled; + } v10; + u8 cpu_throttle; + u8 mem_throttle; + u8 quick_pwr_drop; + u8 pwr_shifting_ratio; + u8 pwr_cap_type; + u16 hard_min_pwr_cap; + u16 max_pwr_cap; + u16 cur_pwr_cap; + u16 soft_min_pwr_cap; + u8 pad[110]; + struct opal_command_buffer cmd; + struct occ_response_buffer rsp; +} __packed; + +static bool occ_reset; +static struct lock occ_lock = LOCK_UNLOCKED; +static unsigned long homer_opal_data_offset; + +DEFINE_LOG_ENTRY(OPAL_RC_OCC_PSTATE_INIT, OPAL_PLATFORM_ERR_EVT, OPAL_OCC, + OPAL_CEC_HARDWARE, OPAL_INFO, + OPAL_NA); + +DEFINE_LOG_ENTRY(OPAL_RC_OCC_TIMEOUT, OPAL_PLATFORM_ERR_EVT, OPAL_OCC, + OPAL_CEC_HARDWARE, OPAL_UNRECOVERABLE_ERR_GENERAL, + OPAL_NA); + +/* + * POWER9 and newer platforms have pstate values which are unsigned + * positive values. They are continuous set of unsigned integers + * [0 to +N] where Pmax is 0 and Pmin is N. The linear ordering of + * pstates for P9 has changed compared to P8. Where P8 has negative + * pstate values advertised as [0 to -N] where Pmax is 0 and + * Pmin is -N. The following routine helps to abstract pstate + * comparison with pmax and perform sanity checks on pstate limits. + */ + +/** + * cmp_pstates: Compares the given two pstates and determines which + * among them is associated with a higher pstate. + * + * @a,@b: The pstate ids of the pstates being compared. + * + * Returns: -1 : If pstate associated with @a is smaller than + * the pstate associated with @b. + * 0 : If pstates associated with @a and @b are equal. + * 1 : If pstate associated with @a is greater than + * the pstate associated with @b. + */ +static int cmp_pstates(int a, int b) +{ + /* P8 has 0 to -N (pmax to pmin), P9 has 0 to +N (pmax to pmin) */ + if (a > b) + return (proc_gen == proc_gen_p8)? 1 : -1; + else if (a < b) + return (proc_gen == proc_gen_p8)? -1 : 1; + + return 0; +} + +static inline +struct occ_pstate_table *get_occ_pstate_table(struct proc_chip *chip) +{ + return (struct occ_pstate_table *) + (chip->homer_base + homer_opal_data_offset); +} + +static inline +struct occ_dynamic_data *get_occ_dynamic_data(struct proc_chip *chip) +{ + return (struct occ_dynamic_data *) + (chip->homer_base + homer_opal_data_offset + + OPAL_DYNAMIC_DATA_OFFSET); +} + +/* + * On Chips which have at least one active EX unit, check the + * HOMER area for pstate-table valid bit on versions 0x1 and 0x2, or + * HOMER dynamic area occ_state on version 0x90. + */ +static bool wait_for_all_occ_init(void) +{ + struct proc_chip *chip; + struct dt_node *xn; + struct occ_pstate_table *occ_data; + struct occ_dynamic_data *occ_dyn_data; + int tries; + uint64_t start_time, end_time; + uint32_t timeout = 0; + + if (platform.occ_timeout) + timeout = platform.occ_timeout(); + + start_time = mftb(); + for_each_chip(chip) { + u8 version; + + /* + * If the chip doesn't any EX unit present, then OCC + * will not update the pstate-table. 
So, skip the + * check. + */ + if (!chip->ex_present) { + prlog(PR_DEBUG, "OCC: Chip %02x has no active EX units. Skipping check\n", + chip->id); + continue; + } + + /* Check for valid homer address */ + if (!chip->homer_base) { + /** + * @fwts-label OCCInvalidHomerBase + * @fwts-advice The HOMER base address for a chip + * was not valid. This means that OCC (On Chip + * Controller) will be non-functional and CPU + * frequency scaling will not be functional. CPU may + * be set to a safe, low frequency. Power savings in + * CPU idle or CPU hotplug may be impacted. + */ + prlog(PR_ERR,"OCC: Chip: %x homer_base is not valid\n", + chip->id); + return false; + } + + /* Get PState table address */ + occ_data = get_occ_pstate_table(chip); + + /* + * Wait for the OCC to set an appropriate version bit. + * The wait is needed since on some platforms (such P8 + * Tuletta), OCC is not loaded before OPAL boot. Hence + * initialization can take a while. + * + * Note: Checking for occ_data->version == (0x01/0x02/0x90/0xA0) + * is ok because we clear all of + * homer_base+size before passing memory to host + * services. This ensures occ_data->version == 0x0 + * before OCC load. + */ + tries = timeout * 10; + while (tries--) { + version = occ_data->version; + + if (version == 0x01 || version == 0x02 || + version == 0x90 || version == 0xA0) + break; + + time_wait_ms(100); + } + + version = occ_data->version; + switch (version) { + case 0x1: + case 0x2: + /* + * OCC-OPAL interface version 0x1 and 0x2 do not have + * the dynamic data. Hence the the only way to figure out + * if the OCC is up or not is to check the valid-bit + * in the pstate table. + */ + if (occ_data->valid != 1) { + /** + * @fwts-label OCCInvalidPStateTable + * @fwts-advice The pstate table for a chip + * was not valid. This means that OCC (On Chip + * Controller) will be non-functional and CPU + * frequency scaling will not be functional. CPU may + * be set to a low, safe frequency. This means + * that CPU idle states and CPU frequency scaling + * may not be functional. + */ + prlog(PR_ERR, "OCC: Chip: %x PState table is not valid\n", + chip->id); + return false; + } + break; + + case 0x90: + /* + * OCC-OPAL interface version 0x90 has a + * dynamic data section. This has an + * occ_state field whose values inform about + * the state of the OCC. + * + * 0x00 = OCC not running. No communication + * allowed. + * + * 0x01 = Standby. No communication allowed. + * + * 0x02 = Observation State. Communication + * allowed and is command dependent. + * + * 0x03 = Active State. Communication allowed + * and is command dependent. + * + * 0x04 = Safe State. No communication + * allowed. Just like CPU throttle + * status, some failures will not allow + * for OCC to update state to safe. + * + * 0x05 = Characterization State. + * Communication allowed and is command + * dependent. + * + * We will error out if OCC is not in the + * Active State. + * + * XXX : Should we error out only if no + * communication is allowed with the + * OCC ? + */ + occ_dyn_data = get_occ_dynamic_data(chip); + if (occ_dyn_data->occ_state != 0x3) { + /** + * @fwts-label OCCInactive + * @fwts-advice The OCC for a chip was not active. + * This means that CPU frequency scaling will + * not be functional. CPU may be set to a low, + * safe frequency. This means that CPU idle + * states and CPU frequency scaling may not be + * functional. 
+ */ + prlog(PR_ERR, "OCC: Chip: %x: OCC not active\n", + chip->id); + return false; + } + break; + + case 0xA0: + /* + * OCC-OPAL interface version 0x90 has a + * dynamic data section. This has an + * occ_state field whose values inform about + * the state of the OCC. + * + * 0x00 = OCC not running. No communication + * allowed. + * + * 0x01 = Standby. No communication allowed. + * + * 0x02 = Observation State. Communication + * allowed and is command dependent. + * + * 0x03 = Active State. Communication allowed + * and is command dependent. + * + * 0x04 = Safe State. No communication + * allowed. Just like CPU throttle + * status, some failures will not allow + * for OCC to update state to safe. + * + * 0x05 = Characterization State. + * Communication allowed and is command + * dependent. + * + * We will error out if OCC is not in the + * Active State. + * + * XXX : Should we error out only if no + * communication is allowed with the + * OCC ? + */ + occ_dyn_data = get_occ_dynamic_data(chip); + if (occ_dyn_data->occ_state != 0x3) { + /** + * @fwts-label OCCInactive + * @fwts-advice The OCC for a chip was not active. + * This means that CPU frequency scaling will + * not be functional. CPU may be set to a low, + * safe frequency. This means that CPU idle + * states and CPU frequency scaling may not be + * functional. + */ + prlog(PR_ERR, "OCC: Chip: %x: OCC not active\n", + chip->id); + return false; + } + break; + + default: + prlog(PR_ERR, "OCC: Unknown OCC-OPAL interface version.\n"); + return false; + } + + if (!chip->occ_functional) + chip->occ_functional = true; + + prlog(PR_DEBUG, "OCC: Chip %02x Data (%016llx) = %016llx\n", + chip->id, (uint64_t)occ_data, be64_to_cpu(*(__be64 *)occ_data)); + + if (version == 0x90 || version == 0xA0) { + occ_dyn_data = get_occ_dynamic_data(chip); + prlog(PR_DEBUG, "OCC: Chip %02x Dynamic Data (%016llx) = %016llx\n", + chip->id, (uint64_t)occ_dyn_data, + be64_to_cpu(*(__be64 *)occ_dyn_data)); + } + } + + end_time = mftb(); + prlog(PR_NOTICE, "OCC: All Chip Rdy after %lu ms\n", + tb_to_msecs(end_time - start_time)); + + dt_for_each_compatible(dt_root, xn, "ibm,xscom") { + const struct dt_property *p; + p = dt_find_property(xn, "ibm,occ-functional-state"); + if (!p) + dt_add_property_cells(xn, "ibm,occ-functional-state", + 0x1); + } + return true; +} + +/* + * OCC provides pstate table entries in continuous descending order. + * Parse the pstate table to skip pstate_ids that are greater + * than Pmax. If a pstate_id is equal to Pmin then add it to + * the list and break from the loop as this is the last valid + * element in the pstate table. 
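+ *
+ * The v2, v9 and v10 parsers below share this walk and differ only
+ * in the pstate record layout, with v10 additionally skipping
+ * entries whose valid flag is clear.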
+ */ +static void parse_pstates_v2(struct occ_pstate_table *data, __be32 *dt_id, + __be32 *dt_freq, int nr_pstates, int pmax, int pmin) +{ + int i, j; + + for (i = 0, j = 0; i < MAX_PSTATES && j < nr_pstates; i++) { + if (cmp_pstates(data->v2.pstates[i].id, pmax) > 0) + continue; + + dt_id[j] = cpu_to_be32(data->v2.pstates[i].id); + dt_freq[j] = cpu_to_be32(be32_to_cpu(data->v2.pstates[i].freq_khz) / 1000); + j++; + + if (data->v2.pstates[i].id == pmin) + break; + } + + if (j != nr_pstates) + prerror("OCC: Expected pstates(%d) is not equal to parsed pstates(%d)\n", + nr_pstates, j); +} + +static void parse_pstates_v9(struct occ_pstate_table *data, __be32 *dt_id, + __be32 *dt_freq, int nr_pstates, int pmax, int pmin) +{ + int i, j; + + for (i = 0, j = 0; i < MAX_PSTATES && j < nr_pstates; i++) { + if (cmp_pstates(data->v9.pstates[i].id, pmax) > 0) + continue; + + dt_id[j] = cpu_to_be32(data->v9.pstates[i].id); + dt_freq[j] = cpu_to_be32(be32_to_cpu(data->v9.pstates[i].freq_khz) / 1000); + j++; + + if (data->v9.pstates[i].id == pmin) + break; + } + + if (j != nr_pstates) + prerror("OCC: Expected pstates(%d) is not equal to parsed pstates(%d)\n", + nr_pstates, j); +} + +static void parse_pstates_v10(struct occ_pstate_table *data, __be32 *dt_id, + __be32 *dt_freq, int nr_pstates, int pmax, int pmin) +{ + int i, j; + int invalid = 0; + + for (i = 0, j = 0; i < MAX_PSTATES && j < nr_pstates; i++) { + if (cmp_pstates(data->v10.pstates[i].id, pmax) > 0) + continue; + + if (!data->v10.pstates[i].valid) { + prlog(PR_WARNING, "OCC: Found Invalid pstate with index %d. Skipping it.\n", i); + invalid++; + continue; + } + + dt_id[j] = cpu_to_be32(data->v10.pstates[i].id); + dt_freq[j] = cpu_to_be32(be32_to_cpu(data->v10.pstates[i].freq_khz) / 1000); + j++; + + if (data->v10.pstates[i].id == pmin) + break; + } + + if ((j + invalid) != nr_pstates) { + prerror("OCC: Expected pstates(%d) not equal to (Parsed pstates(%d) + Invalid Pstates (%d))\n", + nr_pstates, j, invalid); + } +} + +static void parse_vid(struct occ_pstate_table *occ_data, + struct dt_node *node, u8 nr_pstates, + int pmax, int pmin) +{ + u8 *dt_vdd, *dt_vcs; + int i, j; + + dt_vdd = malloc(nr_pstates); + assert(dt_vdd); + dt_vcs = malloc(nr_pstates); + assert(dt_vcs); + + for (i = 0, j = 0; i < MAX_PSTATES && j < nr_pstates; i++) { + if (cmp_pstates(occ_data->v2.pstates[i].id, pmax) > 0) + continue; + + dt_vdd[j] = occ_data->v2.pstates[i].vdd; + dt_vcs[j] = occ_data->v2.pstates[i].vcs; + j++; + + if (occ_data->v2.pstates[i].id == pmin) + break; + } + + dt_add_property(node, "ibm,pstate-vdds", dt_vdd, nr_pstates); + dt_add_property(node, "ibm,pstate-vcss", dt_vcs, nr_pstates); + + free(dt_vdd); + free(dt_vcs); +} + +/* Add device tree properties to describe pstates states */ +/* Return nominal pstate to set in each core */ +static bool add_cpu_pstate_properties(struct dt_node *power_mgt, + int *pstate_nom) +{ + struct proc_chip *chip; + uint64_t occ_data_area; + struct occ_pstate_table *occ_data = NULL; + struct occ_dynamic_data *occ_dyn_data; + /* Arrays for device tree */ + __be32 *dt_id, *dt_freq; + int pmax, pmin, pnom; + u8 nr_pstates; + bool ultra_turbo_supported; + int i, major, minor; + + prlog(PR_DEBUG, "OCC: CPU pstate state device tree init\n"); + + /* + * Find first chip with an OCC which has as a valid + * pstate-table + */ + for_each_chip(chip) { + occ_data = get_occ_pstate_table(chip); + + /* Dump first 16 bytes of PState table */ + occ_data_area = (uint64_t)occ_data; + prlog(PR_DEBUG, "OCC: Chip %02d :Data (%16llx) = %16llx 
%16llx\n", + chip->id, occ_data_area, + be64_to_cpu(*(__be64 *)occ_data_area), + be64_to_cpu(*(__be64 *)(occ_data_area + 8))); + + if (occ_data->valid) + break; + /* + * XXX : Error out if !occ_data->valid but Chip has at + * least one EX Unit? + */ + } + + assert(occ_data); + if (!occ_data->valid) { + /** + * @fwts-label OCCInvalidPStateTableDT + * @fwts-advice The pstate tables for none of the chips + * are valid. This means that OCC (On Chip + * Controller) will be non-functional. This means + * that CPU idle states and CPU frequency scaling + * will not be functional as OPAL doesn't populate + * the device tree with pstates in this case. + */ + prlog(PR_ERR, "OCC: PState table is not valid\n"); + return false; + } + + /* + * Workload-Optimized-Frequency(WOF) or Ultra-Turbo is supported + * from version 0x02 onwards. If WOF is disabled then, the max + * ultra_turbo pstate will be equal to max turbo pstate. + */ + ultra_turbo_supported = true; + + major = occ_data->version >> 4; + minor = occ_data->version & 0xF; + + /* Parse Pmax, Pmin and Pnominal */ + switch (major) { + case 0: + if (proc_gen >= proc_gen_p9) { + /** + * @fwts-label OCCInvalidVersion02 + * @fwts-advice The PState table layout version is not + * supported in P9. So OPAL will not parse the PState + * table. CPU frequency scaling will not be functional + * as frequency and pstate-ids are not added to DT. + */ + prerror("OCC: Version %x is not supported in P9\n", + occ_data->version); + return false; + } + if (minor == 0x1) + ultra_turbo_supported = false; + pmin = occ_data->v2.pstate_min; + pnom = occ_data->v2.pstate_nom; + if (ultra_turbo_supported) + pmax = occ_data->v2.pstate_ultra_turbo; + else + pmax = occ_data->v2.pstate_turbo; + break; + case 0x9: + if (proc_gen == proc_gen_p8) { + /** + * @fwts-label OCCInvalidVersion90 + * @fwts-advice The PState table layout version is not + * supported in P8. So OPAL will not parse the PState + * table. CPU frequency scaling will not be functional + * as frequency and pstate-ids are not added to DT. + */ + prerror("OCC: Version %x is not supported in P8\n", + occ_data->version); + return false; + } + pmin = occ_data->v9.pstate_min; + pnom = occ_data->v9.pstate_nom; + pmax = occ_data->v9.pstate_ultra_turbo; + break; + case 0xA: + pmin = occ_data->v10.pstate_min; + pnom = occ_data->v10.pstate_fixed_freq; + occ_dyn_data = get_occ_dynamic_data(chip); + if (occ_dyn_data->v10.wof_enabled) + pmax = occ_data->v10.pstate_ultra_turbo; + else + pmax = occ_data->v10.pstate_fmax; + break; + default: + /** + * @fwts-label OCCUnsupportedVersion + * @fwts-advice The PState table layout version is not + * supported. So OPAL will not parse the PState table. + * CPU frequency scaling will not be functional as OPAL + * doesn't populate the device tree with pstates. + */ + prerror("OCC: Unsupported pstate table layout version %d\n", + occ_data->version); + return false; + } + + /* Sanity check for pstate limits */ + if (cmp_pstates(pmin, pmax) > 0) { + /** + * @fwts-label OCCInvalidPStateLimits + * @fwts-advice The min pstate is greater than the + * max pstate, this could be due to corrupted/invalid + * data in OCC-OPAL shared memory region. So OPAL has + * not added pstates to device tree. This means that + * CPU Frequency management will not be functional in + * the host. + */ + prerror("OCC: Invalid pstate limits. 
Pmin(%d) > Pmax (%d)\n", + pmin, pmax); + return false; + } + + if (cmp_pstates(pnom, pmax) > 0) { + /** + * @fwts-label OCCInvalidNominalPState + * @fwts-advice The nominal pstate is greater than the + * max pstate, this could be due to corrupted/invalid + * data in OCC-OPAL shared memory region. So OPAL has + * limited the nominal pstate to max pstate. + */ + prerror("OCC: Clipping nominal pstate(%d) to Pmax(%d)\n", + pnom, pmax); + pnom = pmax; + } + + nr_pstates = labs(pmax - pmin) + 1; + prlog(PR_DEBUG, "OCC: Version %x Min %d Nom %d Max %d Nr States %d\n", + occ_data->version, pmin, pnom, pmax, nr_pstates); + if (((major == 0x9 || major == 0xA) && nr_pstates <= 1) || + (major == 0 && (nr_pstates <= 1 || nr_pstates > 128))) { + /** + * @fwts-label OCCInvalidPStateRange + * @fwts-advice The number of pstates is outside the valid + * range (currently <=1 or > 128 on p8, >255 on P9), so OPAL + * has not added pstates to the device tree. This means that + * OCC (On Chip Controller) will be non-functional. This means + * that CPU idle states and CPU frequency scaling + * will not be functional. + */ + prerror("OCC: OCC range is not valid; No of pstates = %d\n", + nr_pstates); + return false; + } + + dt_id = malloc(nr_pstates * sizeof(__be32)); + assert(dt_id); + dt_freq = malloc(nr_pstates * sizeof(__be32)); + assert(dt_freq); + + switch (major) { + case 0: + parse_pstates_v2(occ_data, dt_id, dt_freq, nr_pstates, + pmax, pmin); + break; + case 0x9: + parse_pstates_v9(occ_data, dt_id, dt_freq, nr_pstates, + pmax, pmin); + break; + case 0xA: + parse_pstates_v10(occ_data, dt_id, dt_freq, nr_pstates, + pmax, pmin); + break; + default: + return false; + } + + /* Add the device-tree entries */ + dt_add_property(power_mgt, "ibm,pstate-ids", dt_id, + nr_pstates * sizeof(__be32)); + dt_add_property(power_mgt, "ibm,pstate-frequencies-mhz", dt_freq, + nr_pstates * sizeof(__be32)); + dt_add_property_cells(power_mgt, "ibm,pstate-min", pmin); + dt_add_property_cells(power_mgt, "ibm,pstate-nominal", pnom); + dt_add_property_cells(power_mgt, "ibm,pstate-max", pmax); + + free(dt_freq); + free(dt_id); + + /* + * Parse and add WOF properties: turbo, ultra-turbo and core_max array. + * core_max[1..n] array provides the max sustainable pstate that can be + * achieved with i active cores in the chip. 
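+	 * When ultra-turbo/WOF is supported, the array is exported below
+	 * as the "ibm,pstate-core-max" property with one cell per
+	 * available core in the chip.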
+ */ + if (ultra_turbo_supported) { + int pturbo, pultra_turbo; + u8 nr_cores = get_available_nr_cores_in_chip(chip->id); + __be32 *dt_cmax; + + dt_cmax = malloc(nr_cores * sizeof(u32)); + assert(dt_cmax); + switch (major) { + case 0: + pturbo = occ_data->v2.pstate_turbo; + pultra_turbo = occ_data->v2.pstate_ultra_turbo; + for (i = 0; i < nr_cores; i++) + dt_cmax[i] = cpu_to_be32(occ_data->v2.core_max[i]); + break; + case 0x9: + pturbo = occ_data->v9.pstate_turbo; + pultra_turbo = occ_data->v9.pstate_ultra_turbo; + for (i = 0; i < nr_cores; i++) + dt_cmax[i] = cpu_to_be32(occ_data->v9.core_max[i]); + break; + case 0xA: + pturbo = occ_data->v10.pstate_base; + pultra_turbo = occ_data->v10.pstate_ultra_turbo; + for (i = 0; i < nr_cores; i++) + dt_cmax[i] = cpu_to_be32(occ_data->v10.core_max[i]); + break; + default: + return false; + } + + if (cmp_pstates(pturbo, pmax) > 0) { + prerror("OCC: Clipping turbo pstate(%d) to Pmax(%d)\n", + pturbo, pmax); + dt_add_property_cells(power_mgt, "ibm,pstate-turbo", + pmax); + } else { + dt_add_property_cells(power_mgt, "ibm,pstate-turbo", + pturbo); + } + + dt_add_property_cells(power_mgt, "ibm,pstate-ultra-turbo", + pultra_turbo); + dt_add_property(power_mgt, "ibm,pstate-core-max", dt_cmax, + nr_cores * sizeof(u32)); + + dt_add_property_cells(power_mgt, "ibm,pstate-base", pturbo); + free(dt_cmax); + } + + if (major == 0x9 || major == 0xA) + goto out; + + dt_add_property_cells(power_mgt, "#address-cells", 2); + dt_add_property_cells(power_mgt, "#size-cells", 1); + + /* Add chip specific pstate properties */ + for_each_chip(chip) { + struct dt_node *occ_node; + + occ_data = get_occ_pstate_table(chip); + occ_node = dt_new_addr(power_mgt, "occ", (uint64_t)occ_data); + if (!occ_node) { + /** + * @fwts-label OCCDTFailedNodeCreation + * @fwts-advice Failed to create + * /ibm,opal/power-mgt/occ. Per-chip pstate properties + * are not added to Device Tree. + */ + prerror("OCC: Failed to create /ibm,opal/power-mgt/occ@%llx\n", + (uint64_t)occ_data); + return false; + } + + dt_add_property_cells(occ_node, "reg", + hi32((uint64_t)occ_data), + lo32((uint64_t)occ_data), + OPAL_DYNAMIC_DATA_OFFSET + + sizeof(struct occ_dynamic_data)); + dt_add_property_cells(occ_node, "ibm,chip-id", chip->id); + + /* + * Parse and add pstate Voltage Identifiers (VID) to DT which + * are provided by OCC in version 0x01 and 0x02 + */ + parse_vid(occ_data, occ_node, nr_pstates, pmax, pmin); + } +out: + /* Return pstate to set for each core */ + *pstate_nom = pnom; + return true; +} + +/* + * Prepare chip for pstate transitions + */ + +static bool cpu_pstates_prepare_core(struct proc_chip *chip, + struct cpu_thread *c, + int pstate_nom) +{ + uint32_t core = pir_to_core_id(c->pir); + uint64_t tmp, pstate; + int rc; + + /* + * Currently Fastsleep init clears EX_PM_SPR_OVERRIDE_EN. + * Need to ensure only relevant bits are inited + */ + + /* Init PM GP1 for SCOM based PSTATE control to set nominal freq + * + * Use the OR SCOM to set the required bits in PM_GP1 register + * since the OCC might be mainpulating the PM_GP1 register as well. 
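+	 * EX_PM_SET_GP1 is the OR form of the register; the AND form
+	 * (EX_PM_CLEAR_GP1) is used further down to clear the same
+	 * override bit after the nominal pstate has been written.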
+ */ + rc = xscom_write(chip->id, XSCOM_ADDR_P8_EX_SLAVE(core, EX_PM_SET_GP1), + EX_PM_SETUP_GP1_PM_SPR_OVERRIDE_EN); + if (rc) { + log_simple_error(&e_info(OPAL_RC_OCC_PSTATE_INIT), + "OCC: Failed to write PM_GP1 in pstates init\n"); + return false; + } + + /* Set new pstate to core */ + rc = xscom_read(chip->id, XSCOM_ADDR_P8_EX_SLAVE(core, EX_PM_PPMCR), &tmp); + if (rc) { + log_simple_error(&e_info(OPAL_RC_OCC_PSTATE_INIT), + "OCC: Failed to read PM_PPMCR from OCC in pstates init\n"); + return false; + } + tmp = tmp & ~0xFFFF000000000000ULL; + pstate = ((uint64_t) pstate_nom) & 0xFF; + tmp = tmp | (pstate << 56) | (pstate << 48); + rc = xscom_write(chip->id, XSCOM_ADDR_P8_EX_SLAVE(core, EX_PM_PPMCR), tmp); + if (rc) { + log_simple_error(&e_info(OPAL_RC_OCC_PSTATE_INIT), + "OCC: Failed to write PM_PPMCR in pstates init\n"); + return false; + } + time_wait_ms(1); /* Wait for PState to change */ + /* + * Init PM GP1 for SPR based PSTATE control. + * Once OCC is active EX_PM_SETUP_GP1_DPLL_FREQ_OVERRIDE_EN will be + * cleared by OCC. Sapphire need not clear. + * However wait for DVFS state machine to become idle after min->nominal + * transition initiated above. If not switch over to SPR control could fail. + * + * Use the AND SCOM to clear the required bits in PM_GP1 register + * since the OCC might be mainpulating the PM_GP1 register as well. + */ + tmp = ~EX_PM_SETUP_GP1_PM_SPR_OVERRIDE_EN; + rc = xscom_write(chip->id, XSCOM_ADDR_P8_EX_SLAVE(core, EX_PM_CLEAR_GP1), + tmp); + if (rc) { + log_simple_error(&e_info(OPAL_RC_OCC_PSTATE_INIT), + "OCC: Failed to write PM_GP1 in pstates init\n"); + return false; + } + + /* Just debug */ + rc = xscom_read(chip->id, XSCOM_ADDR_P8_EX_SLAVE(core, EX_PM_PPMSR), &tmp); + if (rc) { + log_simple_error(&e_info(OPAL_RC_OCC_PSTATE_INIT), + "OCC: Failed to read PM_PPMSR from OCC" + "in pstates init\n"); + return false; + } + prlog(PR_DEBUG, "OCC: Chip %x Core %x PPMSR %016llx\n", + chip->id, core, tmp); + + /* + * If PMSR is still in transition at this point due to PState change + * initiated above, then the switchover to SPR may not work. + * ToDo: Check for DVFS state machine idle before change. + */ + + return true; +} + +static bool occ_opal_msg_outstanding = false; +static void occ_msg_consumed(void *data __unused, int status __unused) +{ + lock(&occ_lock); + occ_opal_msg_outstanding = false; + unlock(&occ_lock); +} + +static inline u8 get_cpu_throttle(struct proc_chip *chip) +{ + struct occ_pstate_table *pdata = get_occ_pstate_table(chip); + struct occ_dynamic_data *data; + + switch (pdata->version >> 4) { + case 0: + return pdata->v2.throttle; + case 0x9: + case 0xA: + data = get_occ_dynamic_data(chip); + return data->cpu_throttle; + default: + return 0; + }; +} + +bool is_occ_reset(void) +{ + return occ_reset; +} + +static void occ_throttle_poll(void *data __unused) +{ + struct proc_chip *chip; + struct occ_pstate_table *occ_data; + struct opal_occ_msg occ_msg; + int rc; + + if (!try_lock(&occ_lock)) + return; + if (occ_reset) { + int inactive = 0; + + for_each_chip(chip) { + occ_data = get_occ_pstate_table(chip); + if (occ_data->valid != 1) { + inactive = 1; + break; + } + } + if (!inactive) { + /* + * Queue OCC_THROTTLE with throttle status as 0 to + * indicate all OCCs are active after a reset. 
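+			 * occ_reset is cleared only if the message is queued
+			 * successfully, so the notification is retried on a
+			 * later poll otherwise.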
+ */ + occ_msg.type = cpu_to_be64(OCC_THROTTLE); + occ_msg.chip = 0; + occ_msg.throttle_status = 0; + rc = _opal_queue_msg(OPAL_MSG_OCC, NULL, NULL, + sizeof(struct opal_occ_msg), + &occ_msg); + if (!rc) + occ_reset = false; + } + } else { + if (occ_opal_msg_outstanding) + goto done; + for_each_chip(chip) { + u8 throttle; + + occ_data = get_occ_pstate_table(chip); + throttle = get_cpu_throttle(chip); + if ((occ_data->valid == 1) && + (chip->throttle != throttle) && + (throttle <= OCC_MAX_THROTTLE_STATUS)) { + occ_msg.type = cpu_to_be64(OCC_THROTTLE); + occ_msg.chip = cpu_to_be64(chip->id); + occ_msg.throttle_status = cpu_to_be64(throttle); + rc = _opal_queue_msg(OPAL_MSG_OCC, NULL, + occ_msg_consumed, + sizeof(struct opal_occ_msg), + &occ_msg); + if (!rc) { + chip->throttle = throttle; + occ_opal_msg_outstanding = true; + break; + } + } + } + } +done: + unlock(&occ_lock); +} + +/* OPAL-OCC Command/Response Interface */ + +enum occ_state { + OCC_STATE_NOT_RUNNING = 0x00, + OCC_STATE_STANDBY = 0x01, + OCC_STATE_OBSERVATION = 0x02, + OCC_STATE_ACTIVE = 0x03, + OCC_STATE_SAFE = 0x04, + OCC_STATE_CHARACTERIZATION = 0x05, +}; + +enum occ_role { + OCC_ROLE_SLAVE = 0x0, + OCC_ROLE_MASTER = 0x1, +}; + +enum occ_cmd { + OCC_CMD_CLEAR_SENSOR_DATA, + OCC_CMD_SET_POWER_CAP, + OCC_CMD_SET_POWER_SHIFTING_RATIO, + OCC_CMD_SELECT_SENSOR_GROUP, +}; + +struct opal_occ_cmd_info { + enum occ_cmd cmd; + u8 cmd_value; + u16 cmd_size; + u16 rsp_size; + int timeout_ms; + u16 state_mask; + u8 role_mask; +}; + +static struct opal_occ_cmd_info occ_cmds[] = { + { OCC_CMD_CLEAR_SENSOR_DATA, + 0xD0, 4, 4, 1000, + PPC_BIT16(OCC_STATE_OBSERVATION) | + PPC_BIT16(OCC_STATE_ACTIVE) | + PPC_BIT16(OCC_STATE_CHARACTERIZATION), + PPC_BIT8(OCC_ROLE_MASTER) | PPC_BIT8(OCC_ROLE_SLAVE) + }, + { OCC_CMD_SET_POWER_CAP, + 0xD1, 2, 2, 1000, + PPC_BIT16(OCC_STATE_OBSERVATION) | + PPC_BIT16(OCC_STATE_ACTIVE) | + PPC_BIT16(OCC_STATE_CHARACTERIZATION), + PPC_BIT8(OCC_ROLE_MASTER) + }, + { OCC_CMD_SET_POWER_SHIFTING_RATIO, + 0xD2, 1, 1, 1000, + PPC_BIT16(OCC_STATE_OBSERVATION) | + PPC_BIT16(OCC_STATE_ACTIVE) | + PPC_BIT16(OCC_STATE_CHARACTERIZATION), + PPC_BIT8(OCC_ROLE_MASTER) | PPC_BIT8(OCC_ROLE_SLAVE) + }, + { OCC_CMD_SELECT_SENSOR_GROUP, + 0xD3, 2, 2, 1000, + PPC_BIT16(OCC_STATE_OBSERVATION) | + PPC_BIT16(OCC_STATE_ACTIVE) | + PPC_BIT16(OCC_STATE_CHARACTERIZATION), + PPC_BIT8(OCC_ROLE_MASTER) | PPC_BIT8(OCC_ROLE_SLAVE) + }, +}; + +enum occ_response_status { + OCC_RSP_SUCCESS = 0x00, + OCC_RSP_INVALID_COMMAND = 0x11, + OCC_RSP_INVALID_CMD_DATA_LENGTH = 0x12, + OCC_RSP_INVALID_DATA = 0x13, + OCC_RSP_INTERNAL_ERROR = 0x15, +}; + +#define OCC_FLAG_RSP_READY 0x01 +#define OCC_FLAG_CMD_IN_PROGRESS 0x02 +#define OPAL_FLAG_CMD_READY 0x80 + +struct opal_occ_cmd_data { + u8 *data; + enum occ_cmd cmd; +}; + +static struct cmd_interface { + struct lock queue_lock; + struct timer timeout; + struct opal_occ_cmd_data *cdata; + struct opal_command_buffer *cmd; + struct occ_response_buffer *rsp; + u8 *occ_state; + u8 *valid; + u32 chip_id; + u32 token; + u16 enabled_sensor_mask; + u8 occ_role; + u8 request_id; + bool cmd_in_progress; + bool retry; +} *chips; + +static int nr_occs; + +static inline struct cmd_interface *get_chip_cmd_interface(int chip_id) +{ + int i; + + for (i = 0; i < nr_occs; i++) + if (chips[i].chip_id == chip_id) + return &chips[i]; + + return NULL; +} + +static inline bool occ_in_progress(struct cmd_interface *chip) +{ + return (chip->rsp->flag == OCC_FLAG_CMD_IN_PROGRESS); +} + +static int write_occ_cmd(struct cmd_interface *chip) 
+{ + struct opal_command_buffer *cmd = chip->cmd; + enum occ_cmd ocmd = chip->cdata->cmd; + + if (!chip->retry && occ_in_progress(chip)) { + chip->cmd_in_progress = false; + return OPAL_BUSY; + } + + cmd->flag = chip->rsp->flag = 0; + cmd->cmd = occ_cmds[ocmd].cmd_value; + cmd->request_id = chip->request_id++; + cmd->data_size = occ_cmds[ocmd].cmd_size; + memcpy(&cmd->data, chip->cdata->data, cmd->data_size); + cmd->flag = OPAL_FLAG_CMD_READY; + + schedule_timer(&chip->timeout, + msecs_to_tb(occ_cmds[ocmd].timeout_ms)); + + return OPAL_ASYNC_COMPLETION; +} + +static int64_t opal_occ_command(struct cmd_interface *chip, int token, + struct opal_occ_cmd_data *cdata) +{ + int rc; + + if (!(*chip->valid) || + (!(PPC_BIT16(*chip->occ_state) & occ_cmds[cdata->cmd].state_mask))) + return OPAL_HARDWARE; + + if (!(PPC_BIT8(chip->occ_role) & occ_cmds[cdata->cmd].role_mask)) + return OPAL_PERMISSION; + + lock(&chip->queue_lock); + if (chip->cmd_in_progress) { + rc = OPAL_BUSY; + goto out; + } + + chip->cdata = cdata; + chip->token = token; + chip->cmd_in_progress = true; + chip->retry = false; + rc = write_occ_cmd(chip); +out: + unlock(&chip->queue_lock); + return rc; +} + +static inline bool sanity_check_opal_cmd(struct opal_command_buffer *cmd, + struct cmd_interface *chip) +{ + return ((cmd->cmd == occ_cmds[chip->cdata->cmd].cmd_value) && + (cmd->request_id == chip->request_id - 1) && + (cmd->data_size == occ_cmds[chip->cdata->cmd].cmd_size)); +} + +static inline bool check_occ_rsp(struct opal_command_buffer *cmd, + struct occ_response_buffer *rsp) +{ + if (cmd->cmd != rsp->cmd) { + prlog(PR_DEBUG, "OCC: Command value mismatch in OCC response" + "rsp->cmd = %d cmd->cmd = %d\n", rsp->cmd, cmd->cmd); + return false; + } + + if (cmd->request_id != rsp->request_id) { + prlog(PR_DEBUG, "OCC: Request ID mismatch in OCC response" + "rsp->request_id = %d cmd->request_id = %d\n", + rsp->request_id, cmd->request_id); + return false; + } + + return true; +} + +static inline void queue_occ_rsp_msg(int token, int rc) +{ + int ret; + + ret = opal_queue_msg(OPAL_MSG_ASYNC_COMP, NULL, NULL, + cpu_to_be64(token), + cpu_to_be64(rc)); + if (ret) + prerror("OCC: Failed to queue OCC response status message\n"); +} + +static void occ_cmd_timeout_handler(struct timer *t __unused, void *data, + uint64_t now __unused) +{ + struct cmd_interface *chip = data; + + lock(&chip->queue_lock); + if (!chip->cmd_in_progress) + goto exit; + + if (!chip->retry) { + prlog(PR_DEBUG, "OCC: Command timeout, retrying\n"); + chip->retry = true; + write_occ_cmd(chip); + } else { + chip->cmd_in_progress = false; + queue_occ_rsp_msg(chip->token, OPAL_TIMEOUT); + prlog(PR_DEBUG, "OCC: Command timeout after retry\n"); + } +exit: + unlock(&chip->queue_lock); +} + +static int read_occ_rsp(struct occ_response_buffer *rsp) +{ + switch (rsp->status) { + case OCC_RSP_SUCCESS: + return OPAL_SUCCESS; + case OCC_RSP_INVALID_COMMAND: + prlog(PR_DEBUG, "OCC: Rsp status: Invalid command\n"); + break; + case OCC_RSP_INVALID_CMD_DATA_LENGTH: + prlog(PR_DEBUG, "OCC: Rsp status: Invalid command data length\n"); + break; + case OCC_RSP_INVALID_DATA: + prlog(PR_DEBUG, "OCC: Rsp status: Invalid command data\n"); + break; + case OCC_RSP_INTERNAL_ERROR: + prlog(PR_DEBUG, "OCC: Rsp status: OCC internal error\n"); + break; + default: + break; + } + + /* Clear the OCC response flag */ + rsp->flag = 0; + return OPAL_INTERNAL_ERROR; +} + +static void handle_occ_rsp(uint32_t chip_id) +{ + struct cmd_interface *chip; + struct opal_command_buffer *cmd; + struct 
occ_response_buffer *rsp; + + chip = get_chip_cmd_interface(chip_id); + if (!chip) + return; + + cmd = chip->cmd; + rsp = chip->rsp; + + /*Read rsp*/ + if (rsp->flag != OCC_FLAG_RSP_READY) + return; + lock(&chip->queue_lock); + if (!chip->cmd_in_progress) + goto exit; + + cancel_timer(&chip->timeout); + if (!sanity_check_opal_cmd(cmd, chip) || + !check_occ_rsp(cmd, rsp)) { + if (!chip->retry) { + prlog(PR_DEBUG, "OCC: Command-response mismatch, retrying\n"); + chip->retry = true; + write_occ_cmd(chip); + } else { + chip->cmd_in_progress = false; + queue_occ_rsp_msg(chip->token, OPAL_INTERNAL_ERROR); + prlog(PR_DEBUG, "OCC: Command-response mismatch\n"); + } + goto exit; + } + + if (rsp->cmd == occ_cmds[OCC_CMD_SELECT_SENSOR_GROUP].cmd_value && + rsp->status == OCC_RSP_SUCCESS) + chip->enabled_sensor_mask = *(u16 *)chip->cdata->data; + + chip->cmd_in_progress = false; + queue_occ_rsp_msg(chip->token, read_occ_rsp(chip->rsp)); +exit: + unlock(&chip->queue_lock); +} + +bool occ_get_gpu_presence(struct proc_chip *chip, int gpu_num) +{ + struct occ_dynamic_data *ddata; + static int max_retries = 20; + static bool found = false; + + assert(gpu_num <= 2); + + ddata = get_occ_dynamic_data(chip); + while (!found && max_retries) { + if (ddata->major_version == 0 && ddata->minor_version >= 1) { + found = true; + break; + } + time_wait_ms(100); + max_retries--; + ddata = get_occ_dynamic_data(chip); + } + + if (!found) { + prlog(PR_INFO, "OCC: No GPU slot presence, assuming GPU present\n"); + return true; + } + + return (bool)(ddata->gpus_present & 1 << gpu_num); +} + +static void occ_add_powercap_sensors(struct dt_node *power_mgt); +static void occ_add_psr_sensors(struct dt_node *power_mgt); + +static void occ_cmd_interface_init(void) +{ + struct occ_dynamic_data *data; + struct occ_pstate_table *pdata; + struct dt_node *power_mgt; + struct proc_chip *chip; + int i = 0, major; + + /* Check if the OCC data is valid */ + for_each_chip(chip) { + pdata = get_occ_pstate_table(chip); + if (!pdata->valid) + return; + } + + chip = next_chip(NULL); + pdata = get_occ_pstate_table(chip); + major = pdata->version >> 4; + if (major != 0x9 || major != 0xA) + return; + + for_each_chip(chip) + nr_occs++; + + chips = malloc(sizeof(*chips) * nr_occs); + assert(chips); + + for_each_chip(chip) { + pdata = get_occ_pstate_table(chip); + data = get_occ_dynamic_data(chip); + chips[i].chip_id = chip->id; + chips[i].occ_state = &data->occ_state; + chips[i].valid = &pdata->valid; + chips[i].cmd = &data->cmd; + chips[i].rsp = &data->rsp; + switch (major) { + case 0x9: + chips[i].occ_role = pdata->v9.occ_role; + break; + case 0xA: + chips[i].occ_role = pdata->v10.occ_role; + break; + } + init_lock(&chips[i].queue_lock); + chips[i].cmd_in_progress = false; + chips[i].request_id = 0; + chips[i].enabled_sensor_mask = OCC_ENABLED_SENSOR_MASK; + init_timer(&chips[i].timeout, occ_cmd_timeout_handler, + &chips[i]); + i++; + } + + power_mgt = dt_find_by_path(dt_root, "/ibm,opal/power-mgt"); + if (!power_mgt) { + prerror("OCC: dt node /ibm,opal/power-mgt not found\n"); + return; + } + + /* Add powercap sensors to DT */ + occ_add_powercap_sensors(power_mgt); + + /* Add power-shifting-ratio CPU-GPU sensors to DT */ + occ_add_psr_sensors(power_mgt); +} + +/* Powercap interface */ +enum sensor_powercap_occ_attr { + POWERCAP_OCC_SOFT_MIN, + POWERCAP_OCC_MAX, + POWERCAP_OCC_CUR, + POWERCAP_OCC_HARD_MIN, +}; + +static void occ_add_powercap_sensors(struct dt_node *power_mgt) +{ + struct dt_node *pcap, *node; + u32 handle; + + pcap = 
dt_new(power_mgt, "powercap"); + if (!pcap) { + prerror("OCC: Failed to create powercap node\n"); + return; + } + + dt_add_property_string(pcap, "compatible", "ibm,opal-powercap"); + node = dt_new(pcap, "system-powercap"); + if (!node) { + prerror("OCC: Failed to create system powercap node\n"); + return; + } + + handle = powercap_make_handle(POWERCAP_CLASS_OCC, POWERCAP_OCC_CUR); + dt_add_property_cells(node, "powercap-current", handle); + + handle = powercap_make_handle(POWERCAP_CLASS_OCC, + POWERCAP_OCC_SOFT_MIN); + dt_add_property_cells(node, "powercap-min", handle); + + handle = powercap_make_handle(POWERCAP_CLASS_OCC, POWERCAP_OCC_MAX); + dt_add_property_cells(node, "powercap-max", handle); + + handle = powercap_make_handle(POWERCAP_CLASS_OCC, + POWERCAP_OCC_HARD_MIN); + dt_add_property_cells(node, "powercap-hard-min", handle); + +} + +int occ_get_powercap(u32 handle, u32 *pcap) +{ + struct occ_pstate_table *pdata; + struct occ_dynamic_data *ddata; + struct proc_chip *chip; + + chip = next_chip(NULL); + pdata = get_occ_pstate_table(chip); + ddata = get_occ_dynamic_data(chip); + + if (!pdata->valid) + return OPAL_HARDWARE; + + switch (powercap_get_attr(handle)) { + case POWERCAP_OCC_SOFT_MIN: + *pcap = ddata->soft_min_pwr_cap; + break; + case POWERCAP_OCC_MAX: + *pcap = ddata->max_pwr_cap; + break; + case POWERCAP_OCC_CUR: + *pcap = ddata->cur_pwr_cap; + break; + case POWERCAP_OCC_HARD_MIN: + *pcap = ddata->hard_min_pwr_cap; + break; + default: + *pcap = 0; + return OPAL_UNSUPPORTED; + } + + return OPAL_SUCCESS; +} + +static u16 pcap_cdata; +static struct opal_occ_cmd_data pcap_data = { + .data = (u8 *)&pcap_cdata, + .cmd = OCC_CMD_SET_POWER_CAP, +}; + +int __attribute__((__const__)) occ_set_powercap(u32 handle, int token, u32 pcap) +{ + struct occ_dynamic_data *ddata; + struct proc_chip *chip; + int i; + + if (powercap_get_attr(handle) != POWERCAP_OCC_CUR) + return OPAL_PERMISSION; + + if (!chips) + return OPAL_HARDWARE; + + for (i = 0; i < nr_occs; i++) + if (chips[i].occ_role == OCC_ROLE_MASTER) + break; + + if (!(*chips[i].valid)) + return OPAL_HARDWARE; + + chip = get_chip(chips[i].chip_id); + ddata = get_occ_dynamic_data(chip); + + if (pcap == ddata->cur_pwr_cap) + return OPAL_SUCCESS; + + if (pcap && (pcap > ddata->max_pwr_cap || + pcap < ddata->soft_min_pwr_cap)) + return OPAL_PARAMETER; + + pcap_cdata = pcap; + return opal_occ_command(&chips[i], token, &pcap_data); +}; + +/* Power-Shifting Ratio */ +enum psr_type { + PSR_TYPE_CPU_TO_GPU, /* 0% Cap GPU first, 100% Cap CPU first */ +}; + +int occ_get_psr(u32 handle, u32 *ratio) +{ + struct occ_dynamic_data *ddata; + struct proc_chip *chip; + u8 i = psr_get_rid(handle); + + if (psr_get_type(handle) != PSR_TYPE_CPU_TO_GPU) + return OPAL_UNSUPPORTED; + + if (i > nr_occs) + return OPAL_UNSUPPORTED; + + if (!(*chips[i].valid)) + return OPAL_HARDWARE; + + chip = get_chip(chips[i].chip_id); + ddata = get_occ_dynamic_data(chip); + *ratio = ddata->pwr_shifting_ratio; + return OPAL_SUCCESS; +} + +static u8 psr_cdata; +static struct opal_occ_cmd_data psr_data = { + .data = &psr_cdata, + .cmd = OCC_CMD_SET_POWER_SHIFTING_RATIO, +}; + +int occ_set_psr(u32 handle, int token, u32 ratio) +{ + struct occ_dynamic_data *ddata; + struct proc_chip *chip; + u8 i = psr_get_rid(handle); + + if (psr_get_type(handle) != PSR_TYPE_CPU_TO_GPU) + return OPAL_UNSUPPORTED; + + if (ratio > 100) + return OPAL_PARAMETER; + + if (i > nr_occs) + return OPAL_UNSUPPORTED; + + if (!(*chips[i].valid)) + return OPAL_HARDWARE; + + chip = get_chip(chips[i].chip_id); + 
ddata = get_occ_dynamic_data(chip); + if (ratio == ddata->pwr_shifting_ratio) + return OPAL_SUCCESS; + + psr_cdata = ratio; + return opal_occ_command(&chips[i], token, &psr_data); +} + +static void occ_add_psr_sensors(struct dt_node *power_mgt) +{ + struct dt_node *node; + int i; + + node = dt_new(power_mgt, "psr"); + if (!node) { + prerror("OCC: Failed to create power-shifting-ratio node\n"); + return; + } + + dt_add_property_string(node, "compatible", + "ibm,opal-power-shift-ratio"); + dt_add_property_cells(node, "#address-cells", 1); + dt_add_property_cells(node, "#size-cells", 0); + for (i = 0; i < nr_occs; i++) { + struct dt_node *cnode; + char name[20]; + u32 handle = psr_make_handle(PSR_CLASS_OCC, i, + PSR_TYPE_CPU_TO_GPU); + + cnode = dt_new_addr(node, "cpu-to-gpu", handle); + if (!cnode) { + prerror("OCC: Failed to create power-shifting-ratio node\n"); + return; + } + + snprintf(name, 20, "cpu_to_gpu_%d", chips[i].chip_id); + dt_add_property_string(cnode, "label", name); + dt_add_property_cells(cnode, "handle", handle); + dt_add_property_cells(cnode, "reg", chips[i].chip_id); + } +} + +/* OCC clear sensor limits CSM/Profiler/Job-scheduler */ + +enum occ_sensor_limit_group { + OCC_SENSOR_LIMIT_GROUP_CSM = 0x10, + OCC_SENSOR_LIMIT_GROUP_PROFILER = 0x20, + OCC_SENSOR_LIMIT_GROUP_JOB_SCHED = 0x40, +}; + +static u32 sensor_limit; +static struct opal_occ_cmd_data slimit_data = { + .data = (u8 *)&sensor_limit, + .cmd = OCC_CMD_CLEAR_SENSOR_DATA, +}; + +int occ_sensor_group_clear(u32 group_hndl, int token) +{ + u32 limit = sensor_get_rid(group_hndl); + u8 i = sensor_get_attr(group_hndl); + + if (i > nr_occs) + return OPAL_UNSUPPORTED; + + switch (limit) { + case OCC_SENSOR_LIMIT_GROUP_CSM: + case OCC_SENSOR_LIMIT_GROUP_PROFILER: + case OCC_SENSOR_LIMIT_GROUP_JOB_SCHED: + break; + default: + return OPAL_UNSUPPORTED; + } + + if (!(*chips[i].valid)) + return OPAL_HARDWARE; + + sensor_limit = limit << 24; + return opal_occ_command(&chips[i], token, &slimit_data); +} + +static u16 sensor_enable; +static struct opal_occ_cmd_data sensor_mask_data = { + .data = (u8 *)&sensor_enable, + .cmd = OCC_CMD_SELECT_SENSOR_GROUP, +}; + +int occ_sensor_group_enable(u32 group_hndl, int token, bool enable) +{ + u16 type = sensor_get_rid(group_hndl); + u8 i = sensor_get_attr(group_hndl); + + if (i > nr_occs) + return OPAL_UNSUPPORTED; + + switch (type) { + case OCC_SENSOR_TYPE_GENERIC: + case OCC_SENSOR_TYPE_CURRENT: + case OCC_SENSOR_TYPE_VOLTAGE: + case OCC_SENSOR_TYPE_TEMPERATURE: + case OCC_SENSOR_TYPE_UTILIZATION: + case OCC_SENSOR_TYPE_TIME: + case OCC_SENSOR_TYPE_FREQUENCY: + case OCC_SENSOR_TYPE_POWER: + case OCC_SENSOR_TYPE_PERFORMANCE: + break; + default: + return OPAL_UNSUPPORTED; + } + + if (!(*chips[i].valid)) + return OPAL_HARDWARE; + + if (enable && (type & chips[i].enabled_sensor_mask)) + return OPAL_SUCCESS; + else if (!enable && !(type & chips[i].enabled_sensor_mask)) + return OPAL_SUCCESS; + + sensor_enable = enable ? 
type | chips[i].enabled_sensor_mask : + ~type & chips[i].enabled_sensor_mask; + + return opal_occ_command(&chips[i], token, &sensor_mask_data); +} + +void occ_add_sensor_groups(struct dt_node *sg, __be32 *phandles, u32 *ptype, + int nr_phandles, int chipid) +{ + struct group_info { + int type; + const char *str; + u32 ops; + } groups[] = { + { OCC_SENSOR_LIMIT_GROUP_CSM, "csm", + OPAL_SENSOR_GROUP_CLEAR + }, + { OCC_SENSOR_LIMIT_GROUP_PROFILER, "profiler", + OPAL_SENSOR_GROUP_CLEAR + }, + { OCC_SENSOR_LIMIT_GROUP_JOB_SCHED, "js", + OPAL_SENSOR_GROUP_CLEAR + }, + { OCC_SENSOR_TYPE_GENERIC, "generic", + OPAL_SENSOR_GROUP_ENABLE + }, + { OCC_SENSOR_TYPE_CURRENT, "curr", + OPAL_SENSOR_GROUP_ENABLE + }, + { OCC_SENSOR_TYPE_VOLTAGE, "in", + OPAL_SENSOR_GROUP_ENABLE + }, + { OCC_SENSOR_TYPE_TEMPERATURE, "temp", + OPAL_SENSOR_GROUP_ENABLE + }, + { OCC_SENSOR_TYPE_UTILIZATION, "utilization", + OPAL_SENSOR_GROUP_ENABLE + }, + { OCC_SENSOR_TYPE_TIME, "time", + OPAL_SENSOR_GROUP_ENABLE + }, + { OCC_SENSOR_TYPE_FREQUENCY, "frequency", + OPAL_SENSOR_GROUP_ENABLE + }, + { OCC_SENSOR_TYPE_POWER, "power", + OPAL_SENSOR_GROUP_ENABLE + }, + { OCC_SENSOR_TYPE_PERFORMANCE, "performance", + OPAL_SENSOR_GROUP_ENABLE + }, + }; + int i, j; + + /* + * Dont add sensor groups if cmd-interface is not intialized + */ + if (!chips) + return; + + for (i = 0; i < nr_occs; i++) + if (chips[i].chip_id == chipid) + break; + + for (j = 0; j < ARRAY_SIZE(groups); j++) { + struct dt_node *node; + char name[20]; + u32 handle; + + snprintf(name, 20, "occ-%s", groups[j].str); + handle = sensor_make_handler(SENSOR_OCC, 0, + groups[j].type, i); + node = dt_new_addr(sg, name, handle); + if (!node) { + prerror("Failed to create sensor group nodes\n"); + return; + } + + dt_add_property_cells(node, "sensor-group-id", handle); + dt_add_property_string(node, "type", groups[j].str); + + if (groups[j].type == OCC_SENSOR_TYPE_CURRENT || + groups[j].type == OCC_SENSOR_TYPE_VOLTAGE || + groups[j].type == OCC_SENSOR_TYPE_TEMPERATURE || + groups[j].type == OCC_SENSOR_TYPE_POWER) { + dt_add_property_string(node, "sensor-type", + groups[j].str); + dt_add_property_string(node, "compatible", + "ibm,opal-sensor"); + } + + dt_add_property_cells(node, "ibm,chip-id", chipid); + dt_add_property_cells(node, "reg", handle); + if (groups[j].ops == OPAL_SENSOR_GROUP_ENABLE) { + __be32 *_phandles; + int k, pcount = 0; + + _phandles = malloc(sizeof(u32) * nr_phandles); + assert(_phandles); + for (k = 0; k < nr_phandles; k++) + if (ptype[k] == groups[j].type) + _phandles[pcount++] = phandles[k]; + if (pcount) + dt_add_property(node, "sensors", _phandles, + pcount * sizeof(u32)); + free(_phandles); + } else { + dt_add_property(node, "sensors", phandles, + nr_phandles * sizeof(u32)); + } + dt_add_property_cells(node, "ops", groups[j].ops); + } +} + +/* CPU-OCC PState init */ +/* Called after OCC init on P8 and P9 */ +void occ_pstates_init(void) +{ + struct proc_chip *chip; + struct cpu_thread *c; + struct dt_node *power_mgt; + int pstate_nom; + u32 freq_domain_mask; + u8 domain_runs_at; + static bool occ_pstates_initialized; + + power_mgt = dt_find_by_path(dt_root, "/ibm,opal/power-mgt"); + if (!power_mgt) { + /** + * @fwts-label OCCDTNodeNotFound + * @fwts-advice Device tree node /ibm,opal/power-mgt not + * found. OPAL didn't add pstate information to device tree. + * Probably a firmware bug. 
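Stepping back to occ_sensor_group_enable() above: each OCC_SENSOR_TYPE_* acts as a bit flag in enabled_sensor_mask, and the updated mask only takes effect on the skiboot side once the OCC acknowledges the OCC_CMD_SELECT_SENSOR_GROUP command (see the response handling near the top of this file). The mask arithmetic, spelled out as a sketch:

	u16 mask = chips[i].enabled_sensor_mask;

	/* enable: OR the group's type bit into the current mask */
	sensor_enable = type | mask;

	/* disable: clear only that bit, keeping the other groups */
	sensor_enable = ~type & mask;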
+ */ + prlog(PR_ERR, "OCC: dt node /ibm,opal/power-mgt not found\n"); + return; + } + + /* Handle fast reboots */ + if (occ_pstates_initialized) { + struct dt_node *child; + int i; + const char *props[] = { + "ibm,pstate-core-max", + "ibm,pstate-frequencies-mhz", + "ibm,pstate-ids", + "ibm,pstate-max", + "ibm,pstate-min", + "ibm,pstate-nominal", + "ibm,pstate-turbo", + "ibm,pstate-ultra-turbo", + "ibm,pstate-base", + "#address-cells", + "#size-cells", + }; + + for (i = 0; i < ARRAY_SIZE(props); i++) + dt_check_del_prop(power_mgt, props[i]); + + dt_for_each_child(power_mgt, child) + if (!strncmp(child->name, "occ", 3)) + dt_free(child); + } + + switch (proc_gen) { + case proc_gen_p8: + homer_opal_data_offset = P8_HOMER_OPAL_DATA_OFFSET; + break; + case proc_gen_p9: + case proc_gen_p10: + homer_opal_data_offset = P9_HOMER_OPAL_DATA_OFFSET; + break; + default: + return; + } + + chip = next_chip(NULL); + if (!chip->homer_base) { + log_simple_error(&e_info(OPAL_RC_OCC_PSTATE_INIT), + "OCC: No HOMER detected, assuming no pstates\n"); + return; + } + + /* Wait for all OCC to boot up */ + if(!wait_for_all_occ_init()) { + log_simple_error(&e_info(OPAL_RC_OCC_TIMEOUT), + "OCC: Initialization on all chips did not complete" + "(timed out)\n"); + return; + } + + /* + * Check boundary conditions and add device tree nodes + * and return nominal pstate to set for the core + */ + if (!add_cpu_pstate_properties(power_mgt, &pstate_nom)) { + log_simple_error(&e_info(OPAL_RC_OCC_PSTATE_INIT), + "Skiping core cpufreq init due to OCC error\n"); + } else if (proc_gen == proc_gen_p8) { + /* + * Setup host based pstates and set nominal frequency only in + * P8. + */ + for_each_chip(chip) + for_each_available_core_in_chip(c, chip->id) + cpu_pstates_prepare_core(chip, c, pstate_nom); + } + + if (occ_pstates_initialized) + return; + + /* Add opal_poller to poll OCC throttle status of each chip */ + for_each_chip(chip) + chip->throttle = 0; + opal_add_poller(occ_throttle_poll, NULL); + occ_pstates_initialized = true; + + /* Init OPAL-OCC command-response interface */ + occ_cmd_interface_init(); + + /* TODO Firmware plumbing required so as to have two modes to set + * PMCR based on max in domain or most recently used. As of today, + * it is always max in domain for P9. + */ + domain_runs_at = 0; + freq_domain_mask = 0; + if (proc_gen == proc_gen_p8) { + freq_domain_mask = P8_PIR_CORE_MASK; + domain_runs_at = FREQ_MOST_RECENTLY_SET; + } else if (proc_gen == proc_gen_p9) { + freq_domain_mask = P9_PIR_QUAD_MASK; + domain_runs_at = FREQ_MAX_IN_DOMAIN; + } else if (proc_gen == proc_gen_p10) { + freq_domain_mask = P10_PIR_CHIP_MASK; + domain_runs_at = FREQ_MAX_IN_DOMAIN; + } else { + assert(0); + } + + dt_add_property_cells(power_mgt, "freq-domain-mask", freq_domain_mask); + dt_add_property_cells(power_mgt, "domain-runs-at", domain_runs_at); +} + +int find_master_and_slave_occ(uint64_t **master, uint64_t **slave, + int *nr_masters, int *nr_slaves) +{ + struct proc_chip *chip; + int nr_chips = 0, i; + uint64_t chipids[MAX_CHIPS]; + + for_each_chip(chip) { + chipids[nr_chips++] = chip->id; + } + + chip = next_chip(NULL); + /* + * Proc0 is the master OCC for Tuleta/Alpine boxes. + * Hostboot expects the pair of chips for MURANO, so pass the sibling + * chip id along with proc0 to hostboot. + */ + *nr_masters = (chip->type == PROC_CHIP_P8_MURANO) ? 
2 : 1; + *master = (uint64_t *)malloc(*nr_masters * sizeof(uint64_t)); + + if (!*master) { + printf("OCC: master array alloc failure\n"); + return -ENOMEM; + } + + if (nr_chips - *nr_masters > 0) { + *nr_slaves = nr_chips - *nr_masters; + *slave = (uint64_t *)malloc(*nr_slaves * sizeof(uint64_t)); + if (!*slave) { + printf("OCC: slave array alloc failure\n"); + return -ENOMEM; + } + } + + for (i = 0; i < nr_chips; i++) { + if (i < *nr_masters) { + *(*master + i) = chipids[i]; + continue; + } + *(*slave + i - *nr_masters) = chipids[i]; + } + return 0; +} + + +int occ_msg_queue_occ_reset(void) +{ + struct opal_occ_msg occ_msg = { CPU_TO_BE64(OCC_RESET), 0, 0 }; + struct proc_chip *chip; + int rc; + + lock(&occ_lock); + rc = _opal_queue_msg(OPAL_MSG_OCC, NULL, NULL, + sizeof(struct opal_occ_msg), &occ_msg); + if (rc) { + prlog(PR_INFO, "OCC: Failed to queue OCC_RESET message\n"); + goto out; + } + /* + * Set 'valid' byte of occ_pstate_table to 0 since OCC + * may not clear this byte on a reset. + * OCC will set the 'valid' byte to 1 when it becomes + * active again. + */ + for_each_chip(chip) { + struct occ_pstate_table *occ_data; + + occ_data = get_occ_pstate_table(chip); + occ_data->valid = 0; + chip->throttle = 0; + } + occ_reset = true; +out: + unlock(&occ_lock); + return rc; +} + +#define PV_OCC_GP0 0x01000000 +#define PV_OCC_GP0_AND 0x01000004 +#define PV_OCC_GP0_OR 0x01000005 +#define PV_OCC_GP0_PNOR_OWNER PPC_BIT(18) /* 1 = OCC / Host, 0 = BMC */ + +static void occ_pnor_set_one_owner(uint32_t chip_id, enum pnor_owner owner) +{ + uint64_t reg, mask; + + if (owner == PNOR_OWNER_HOST) { + reg = PV_OCC_GP0_OR; + mask = PV_OCC_GP0_PNOR_OWNER; + } else { + reg = PV_OCC_GP0_AND; + mask = ~PV_OCC_GP0_PNOR_OWNER; + } + + xscom_write(chip_id, reg, mask); +} + +void occ_pnor_set_owner(enum pnor_owner owner) +{ + struct proc_chip *chip; + + for_each_chip(chip) + occ_pnor_set_one_owner(chip->id, owner); +} + + +#define P8_OCB_OCI_OCCMISC 0x6a020 +#define P8_OCB_OCI_OCCMISC_AND 0x6a021 +#define P8_OCB_OCI_OCCMISC_OR 0x6a022 + +#define P9_OCB_OCI_OCCMISC 0x6c080 +#define P9_OCB_OCI_OCCMISC_CLEAR 0x6c081 +#define P9_OCB_OCI_OCCMISC_OR 0x6c082 + +#define OCB_OCI_OCIMISC_IRQ PPC_BIT(0) +#define OCB_OCI_OCIMISC_IRQ_TMGT PPC_BIT(1) +#define OCB_OCI_OCIMISC_IRQ_SLW_TMR PPC_BIT(14) +#define OCB_OCI_OCIMISC_IRQ_OPAL_DUMMY PPC_BIT(15) + +#define P8_OCB_OCI_OCIMISC_MASK (OCB_OCI_OCIMISC_IRQ_TMGT | \ + OCB_OCI_OCIMISC_IRQ_OPAL_DUMMY | \ + OCB_OCI_OCIMISC_IRQ_SLW_TMR) + +#define OCB_OCI_OCIMISC_IRQ_I2C PPC_BIT(2) +#define OCB_OCI_OCIMISC_IRQ_SHMEM PPC_BIT(3) +#define P9_OCB_OCI_OCIMISC_MASK (OCB_OCI_OCIMISC_IRQ_TMGT | \ + OCB_OCI_OCIMISC_IRQ_I2C | \ + OCB_OCI_OCIMISC_IRQ_SHMEM | \ + OCB_OCI_OCIMISC_IRQ_OPAL_DUMMY) + +void occ_send_dummy_interrupt(void) +{ + struct psi *psi; + struct proc_chip *chip = get_chip(this_cpu()->chip_id); + + /* Emulators don't do this */ + if (chip_quirk(QUIRK_NO_OCC_IRQ)) + return; + + /* Find a functional PSI. 
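A note on the register idiom in occ_pnor_set_one_owner() above: GP0 has write-only AND/OR companion addresses, so ownership is flipped with a single SCOM write instead of a read-modify-write of GP0 itself. In effect:

	/* hand the PNOR to the host: set bit 18 via the OR register */
	xscom_write(chip_id, PV_OCC_GP0_OR, PV_OCC_GP0_PNOR_OWNER);

	/* hand it back to the BMC: clear bit 18 via the AND register */
	xscom_write(chip_id, PV_OCC_GP0_AND, ~PV_OCC_GP0_PNOR_OWNER);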
This ensures an interrupt even if + * the psihb on the current chip is not configured */ + if (chip->psi) + psi = chip->psi; + else + psi = psi_find_functional_chip(); + + if (!psi) { + prlog_once(PR_WARNING, "PSI: no functional PSI HB found, " + "no self interrupts delivered\n"); + return; + } + + switch (proc_gen) { + case proc_gen_p8: + xscom_write(psi->chip_id, P8_OCB_OCI_OCCMISC_OR, + OCB_OCI_OCIMISC_IRQ | + OCB_OCI_OCIMISC_IRQ_OPAL_DUMMY); + break; + case proc_gen_p9: + xscom_write(psi->chip_id, P9_OCB_OCI_OCCMISC_OR, + OCB_OCI_OCIMISC_IRQ | + OCB_OCI_OCIMISC_IRQ_OPAL_DUMMY); + break; + case proc_gen_p10: + xscom_write(psi->chip_id, P9_OCB_OCI_OCCMISC_OR, + OCB_OCI_OCIMISC_IRQ | + OCB_OCI_OCIMISC_IRQ_OPAL_DUMMY); + break; + default: + break; + } +} + +void occ_p8_interrupt(uint32_t chip_id) +{ + uint64_t ireg; + int64_t rc; + + /* The OCC interrupt is used to mux up to 15 different sources */ + rc = xscom_read(chip_id, P8_OCB_OCI_OCCMISC, &ireg); + if (rc) { + prerror("OCC: Failed to read interrupt status !\n"); + /* Should we mask it in the XIVR ? */ + return; + } + prlog(PR_TRACE, "OCC: IRQ received: %04llx\n", ireg >> 48); + + /* Clear the bits */ + xscom_write(chip_id, P8_OCB_OCI_OCCMISC_AND, ~ireg); + + /* Dispatch */ + if (ireg & OCB_OCI_OCIMISC_IRQ_TMGT) + prd_tmgt_interrupt(chip_id); + if (ireg & OCB_OCI_OCIMISC_IRQ_SLW_TMR) + check_timers(true); + + /* We may have masked-out OCB_OCI_OCIMISC_IRQ in the previous + * OCCMISC_AND write. Check if there are any new source bits set, + * and trigger another interrupt if so. + */ + rc = xscom_read(chip_id, P8_OCB_OCI_OCCMISC, &ireg); + if (!rc && (ireg & P8_OCB_OCI_OCIMISC_MASK)) + xscom_write(chip_id, P8_OCB_OCI_OCCMISC_OR, + OCB_OCI_OCIMISC_IRQ); +} + +void occ_p9_interrupt(uint32_t chip_id) +{ + u64 ireg; + s64 rc; + + /* The OCC interrupt is used to mux up to 15 different sources */ + rc = xscom_read(chip_id, P9_OCB_OCI_OCCMISC, &ireg); + if (rc) { + prerror("OCC: Failed to read interrupt status !\n"); + return; + } + prlog(PR_TRACE, "OCC: IRQ received: %04llx\n", ireg >> 48); + + /* Clear the bits */ + xscom_write(chip_id, P9_OCB_OCI_OCCMISC_CLEAR, ireg); + + /* Dispatch */ + if (ireg & OCB_OCI_OCIMISC_IRQ_TMGT) + prd_tmgt_interrupt(chip_id); + + if (ireg & OCB_OCI_OCIMISC_IRQ_SHMEM) { + occ_throttle_poll(NULL); + handle_occ_rsp(chip_id); + } + + if (ireg & OCB_OCI_OCIMISC_IRQ_I2C) + p9_i2c_bus_owner_change(chip_id); + + /* We may have masked-out OCB_OCI_OCIMISC_IRQ in the previous + * OCCMISC_AND write. Check if there are any new source bits set, + * and trigger another interrupt if so. + */ + rc = xscom_read(chip_id, P9_OCB_OCI_OCCMISC, &ireg); + if (!rc && (ireg & P9_OCB_OCI_OCIMISC_MASK)) + xscom_write(chip_id, P9_OCB_OCI_OCCMISC_OR, + OCB_OCI_OCIMISC_IRQ); +} diff --git a/roms/skiboot/hw/ocmb.c b/roms/skiboot/hw/ocmb.c new file mode 100644 index 000000000..bc470d0ab --- /dev/null +++ b/roms/skiboot/hw/ocmb.c @@ -0,0 +1,167 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* + * Open Capi Memory Buffer chip + * + * Copyright 2020 IBM Corp. 
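For readers decoding the "IRQ received" trace in occ_p8_interrupt()/occ_p9_interrupt() above: the interesting OCCMISC bits all sit in the top 16 bits of the register, so the printed value maps directly onto the PPC_BIT() definitions. An illustrative case (values made up):

	uint64_t ireg = OCB_OCI_OCIMISC_IRQ | OCB_OCI_OCIMISC_IRQ_SHMEM;

	/* PPC_BIT(0) >> 48 == 0x8000, PPC_BIT(3) >> 48 == 0x1000 */
	prlog(PR_TRACE, "OCC: IRQ received: %04llx\n", ireg >> 48);	/* prints 9000 */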
+ */ + + +#define pr_fmt(fmt) "OCMB: " fmt + +#include <skiboot.h> +#include <xscom.h> +#include <device.h> +#include <ocmb.h> +#include <io.h> +#include <inttypes.h> + +struct ocmb_range { + uint64_t start; + uint64_t end; + uint64_t flags; + + /* flags come from hdat */ +#define ACCESS_8B PPC_BIT(0) +#define ACCESS_4B PPC_BIT(1) +#define ACCESS_SIZE_MASK (ACCESS_8B | ACCESS_4B) +}; + +struct ocmb { + struct scom_controller scom; + int range_count; + struct ocmb_range ranges[]; +}; + +static const struct ocmb_range *find_range(const struct ocmb *o, uint64_t offset) +{ + int i; + uint64_t addr = offset & ~(HRMOR_BIT); + + for (i = 0; i < o->range_count; i++) { + uint64_t start = o->ranges[i].start; + uint64_t end = o->ranges[i].end; + + if (addr >= start && addr <= end) + return &o->ranges[i]; + } + + return NULL; +} + +static int64_t ocmb_fake_scom_write(struct scom_controller *f, + uint32_t __unused chip_id, + uint64_t offset, uint64_t val) +{ + const struct ocmb *o = f->private; + const struct ocmb_range *r; + + r = find_range(o, offset); + if (!r) { + prerror("no matching address range!\n"); + return OPAL_XSCOM_ADDR_ERROR; + } + + switch (r->flags & ACCESS_SIZE_MASK) { + case ACCESS_8B: + if (offset & 0x7) + return OPAL_XSCOM_ADDR_ERROR; + out_be64((void *) offset, val); + break; + + case ACCESS_4B: + if (offset & 0x3) + return OPAL_XSCOM_ADDR_ERROR; + out_be32((void *) offset, val); + break; + default: + prerror("bad flags? %llx\n", r->flags); + return OPAL_XSCOM_ADDR_ERROR; + } + + return OPAL_SUCCESS; +} + +static int64_t ocmb_fake_scom_read(struct scom_controller *f, + uint32_t chip_id __unused, + uint64_t offset, uint64_t *val) +{ + const struct ocmb *o = f->private; + const struct ocmb_range *r = NULL; + + r = find_range(o, offset); + if (!r) { + prerror("no matching address range!\n"); + return OPAL_XSCOM_ADDR_ERROR; + } + + + switch (r->flags & ACCESS_SIZE_MASK) { + case ACCESS_8B: + if (offset & 0x7) + return OPAL_XSCOM_ADDR_ERROR; + *val = in_be64((void *) offset); + break; + + case ACCESS_4B: + if (offset & 0x3) + return OPAL_XSCOM_ADDR_ERROR; + *val = in_be32((void *) offset); + break; + default: + prerror("bad flags? 
%llx\n", r->flags); + return OPAL_XSCOM_ADDR_ERROR; + } + + return OPAL_SUCCESS; +} + +static bool ocmb_probe_one(struct dt_node *ocmb_node) +{ + uint64_t chip_id = dt_prop_get_u32(ocmb_node, "ibm,chip-id"); + const struct dt_property *flags; + int i = 0, num = 0; + struct ocmb *ocmb; + + num = dt_count_addresses(ocmb_node); + + ocmb = zalloc(sizeof(*ocmb) + sizeof(*ocmb->ranges) * num); + if (!ocmb) + return false; + + ocmb->scom.private = ocmb; + ocmb->scom.part_id = chip_id; + ocmb->scom.write = ocmb_fake_scom_write; + ocmb->scom.read = ocmb_fake_scom_read; + ocmb->range_count = num; + + flags = dt_require_property(ocmb_node, "flags", sizeof(u64) * num); + + for (i = 0; i < num; i++) { + uint64_t start, size; + + start = dt_get_address(ocmb_node, i, &size); + + ocmb->ranges[i].start = start; + ocmb->ranges[i].end = start + size - 1; + ocmb->ranges[i].flags = dt_property_get_u64(flags, i); + + prlog(PR_DEBUG, "Added range: %" PRIx64 " - [%llx - %llx]\n", + chip_id, start, start + size - 1); + } + + if (scom_register(&ocmb->scom)) + prerror("Error registering fake scom\n"); + + dt_add_property(ocmb_node, "scom-controller", NULL, 0); + prlog(PR_NOTICE, "Added scom controller for %s\n", ocmb_node->name); + + return true; +} + +void ocmb_init(void) +{ + struct dt_node *dn; + + dt_for_each_compatible(dt_root, dn, "ibm,explorer") + ocmb_probe_one(dn); +} diff --git a/roms/skiboot/hw/p8-i2c.c b/roms/skiboot/hw/p8-i2c.c new file mode 100644 index 000000000..45815858e --- /dev/null +++ b/roms/skiboot/hw/p8-i2c.c @@ -0,0 +1,1688 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* + * P8 i2c master + * + * Copyright 2013-2019 IBM Corp. + */ + +#undef DEBUG + +#include <opal.h> +#include <skiboot.h> +#include <mem_region-malloc.h> +#include <lock.h> +#include <chip.h> +#include <i2c.h> +#include <xscom.h> +#include <timebase.h> +#include <timer.h> +#include <opal-msg.h> +#include <errorlog.h> +#include <centaur.h> +#include <debug_descriptor.h> + +DEFINE_LOG_ENTRY(OPAL_RC_I2C_INIT, OPAL_PLATFORM_ERR_EVT, OPAL_I2C, + OPAL_IO_SUBSYSTEM, OPAL_PREDICTIVE_ERR_DEGRADED_PERF, + OPAL_NA); +DEFINE_LOG_ENTRY(OPAL_RC_I2C_START_REQ, OPAL_INPUT_OUTPUT_ERR_EVT, OPAL_I2C, + OPAL_IO_SUBSYSTEM, OPAL_INFO, OPAL_NA); +DEFINE_LOG_ENTRY(OPAL_RC_I2C_TIMEOUT, OPAL_INPUT_OUTPUT_ERR_EVT, OPAL_I2C, + OPAL_IO_SUBSYSTEM, OPAL_INFO, OPAL_NA); +DEFINE_LOG_ENTRY(OPAL_RC_I2C_TRANSFER, OPAL_INPUT_OUTPUT_ERR_EVT, OPAL_I2C, + OPAL_IO_SUBSYSTEM, OPAL_INFO, OPAL_NA); +DEFINE_LOG_ENTRY(OPAL_RC_I2C_RESET, OPAL_INPUT_OUTPUT_ERR_EVT, OPAL_I2C, + OPAL_IO_SUBSYSTEM, OPAL_INFO, OPAL_NA); + +#ifdef DEBUG +#define DBG(fmt...) prlog(PR_ERR, "I2C: " fmt) +#define I2C_TIMEOUT_IRQ_MS 100 /* 100ms/byte timeout */ +#define I2C_TIMEOUT_POLL_MS 4000 /* 4s/byte timeout */ +#else +#define DBG(fmt...) prlog(PR_TRACE, "I2C: " fmt) +#define I2C_TIMEOUT_IRQ_MS 1 /* 1ms/byte timeout */ +#define I2C_TIMEOUT_POLL_MS 4000 /* 4s/byte timeout */ +#endif + +/* How long to keep the sensor cache disabled after an access + * in milliseconds + */ +#define SENSOR_CACHE_EN_DELAY 10 + +#define USEC_PER_SEC 1000000 +#define USEC_PER_MSEC 1000 +#define I2C_RESET_DELAY_MS 5 /* 5 msecs */ +#define I2C_FIFO_HI_LVL 4 +#define I2C_FIFO_LO_LVL 4 + +/* + * I2C registers set. 
+ * Below is the offset of registers from base which is stored in the + * 'struct p8_i2c_master' + */ + +/* I2C FIFO register */ +#define I2C_FIFO_REG 0x4 +#define I2C_FIFO PPC_BITMASK(0, 7) + +/* I2C command register */ +#define I2C_CMD_REG 0x5 +#define I2C_CMD_WITH_START PPC_BIT(0) +#define I2C_CMD_WITH_ADDR PPC_BIT(1) +#define I2C_CMD_READ_CONT PPC_BIT(2) +#define I2C_CMD_WITH_STOP PPC_BIT(3) +#define I2C_CMD_INTR_STEERING PPC_BITMASK(6,7) /* P9 */ +#define I2C_CMD_INTR_STEER_HOST 1 +#define I2C_CMD_INTR_STEER_OCC 2 +#define I2C_CMD_DEV_ADDR PPC_BITMASK(8, 14) +#define I2C_CMD_READ_NOT_WRITE PPC_BIT(15) +#define I2C_CMD_LEN_BYTES PPC_BITMASK(16, 31) +#define I2C_MAX_TFR_LEN 0xfff0ull + +/* I2C mode register */ +#define I2C_MODE_REG 0x6 +#define I2C_MODE_BIT_RATE_DIV PPC_BITMASK(0, 15) +#define I2C_MODE_PORT_NUM PPC_BITMASK(16, 21) +#define I2C_MODE_ENHANCED PPC_BIT(28) +#define I2C_MODE_DIAGNOSTIC PPC_BIT(29) +#define I2C_MODE_PACING_ALLOW PPC_BIT(30) +#define I2C_MODE_WRAP PPC_BIT(31) + +/* I2C watermark register */ +#define I2C_WATERMARK_REG 0x7 +#define I2C_WATERMARK_HIGH PPC_BITMASK(16, 19) +#define I2C_WATERMARK_LOW PPC_BITMASK(24, 27) + +/* + * I2C interrupt mask and condition registers + * + * NB: The function of 0x9 and 0xa changes depending on whether you're reading + * or writing to them. When read they return the interrupt condition bits + * and on writes they update the interrupt mask register. + * + * The bit definitions are the same for all the interrupt registers. + */ +#define I2C_INTR_MASK_REG 0x8 + +#define I2C_INTR_RAW_COND_REG 0x9 /* read */ +#define I2C_INTR_MASK_OR_REG 0x9 /* write*/ + +#define I2C_INTR_COND_REG 0xa /* read */ +#define I2C_INTR_MASK_AND_REG 0xa /* write */ + +#define I2C_INTR_ALL PPC_BITMASK(16, 31) +#define I2C_INTR_INVALID_CMD PPC_BIT(16) +#define I2C_INTR_LBUS_PARITY_ERR PPC_BIT(17) +#define I2C_INTR_BKEND_OVERRUN_ERR PPC_BIT(18) +#define I2C_INTR_BKEND_ACCESS_ERR PPC_BIT(19) +#define I2C_INTR_ARBT_LOST_ERR PPC_BIT(20) +#define I2C_INTR_NACK_RCVD_ERR PPC_BIT(21) +#define I2C_INTR_DATA_REQ PPC_BIT(22) +#define I2C_INTR_CMD_COMP PPC_BIT(23) +#define I2C_INTR_STOP_ERR PPC_BIT(24) +#define I2C_INTR_I2C_BUSY PPC_BIT(25) +#define I2C_INTR_NOT_I2C_BUSY PPC_BIT(26) +#define I2C_INTR_SCL_EQ_1 PPC_BIT(28) +#define I2C_INTR_SCL_EQ_0 PPC_BIT(29) +#define I2C_INTR_SDA_EQ_1 PPC_BIT(30) +#define I2C_INTR_SDA_EQ_0 PPC_BIT(31) + +/* I2C status register */ +#define I2C_RESET_I2C_REG 0xb +#define I2C_RESET_ERRORS 0xc +#define I2C_STAT_REG 0xb +#define I2C_STAT_INVALID_CMD PPC_BIT(0) +#define I2C_STAT_LBUS_PARITY_ERR PPC_BIT(1) +#define I2C_STAT_BKEND_OVERRUN_ERR PPC_BIT(2) +#define I2C_STAT_BKEND_ACCESS_ERR PPC_BIT(3) +#define I2C_STAT_ARBT_LOST_ERR PPC_BIT(4) +#define I2C_STAT_NACK_RCVD_ERR PPC_BIT(5) +#define I2C_STAT_DATA_REQ PPC_BIT(6) +#define I2C_STAT_CMD_COMP PPC_BIT(7) +#define I2C_STAT_STOP_ERR PPC_BIT(8) +#define I2C_STAT_UPPER_THRS PPC_BITMASK(9, 15) +#define I2C_STAT_ANY_I2C_INTR PPC_BIT(16) +#define I2C_STAT_PORT_HISTORY_BUSY PPC_BIT(19) +#define I2C_STAT_SCL_INPUT_LEVEL PPC_BIT(20) +#define I2C_STAT_SDA_INPUT_LEVEL PPC_BIT(21) +#define I2C_STAT_PORT_BUSY PPC_BIT(22) +#define I2C_STAT_INTERFACE_BUSY PPC_BIT(23) +#define I2C_STAT_FIFO_ENTRY_COUNT PPC_BITMASK(24, 31) + +#define I2C_STAT_ANY_ERR (I2C_STAT_INVALID_CMD | I2C_STAT_LBUS_PARITY_ERR | \ + I2C_STAT_BKEND_OVERRUN_ERR | \ + I2C_STAT_BKEND_ACCESS_ERR | I2C_STAT_ARBT_LOST_ERR | \ + I2C_STAT_NACK_RCVD_ERR | I2C_STAT_STOP_ERR) + + +#define I2C_INTR_ACTIVE \ + ((I2C_STAT_ANY_ERR >> 16) | 
I2C_INTR_CMD_COMP | I2C_INTR_DATA_REQ) + +/* Pseudo-status used for timeouts */ +#define I2C_STAT_PSEUDO_TIMEOUT PPC_BIT(63) + + +/* I2C extended status register */ +#define I2C_EXTD_STAT_REG 0xc +#define I2C_EXTD_STAT_FIFO_SIZE PPC_BITMASK(0, 7) +#define I2C_EXTD_STAT_MSM_CURSTATE PPC_BITMASK(11, 15) +#define I2C_EXTD_STAT_SCL_IN_SYNC PPC_BIT(16) +#define I2C_EXTD_STAT_SDA_IN_SYNC PPC_BIT(17) +#define I2C_EXTD_STAT_S_SCL PPC_BIT(18) +#define I2C_EXTD_STAT_S_SDA PPC_BIT(19) +#define I2C_EXTD_STAT_M_SCL PPC_BIT(20) +#define I2C_EXTD_STAT_M_SDA PPC_BIT(21) +#define I2C_EXTD_STAT_HIGH_WATER PPC_BIT(22) +#define I2C_EXTD_STAT_LOW_WATER PPC_BIT(23) +#define I2C_EXTD_STAT_I2C_BUSY PPC_BIT(24) +#define I2C_EXTD_STAT_SELF_BUSY PPC_BIT(25) +#define I2C_EXTD_STAT_I2C_VERSION PPC_BITMASK(27, 31) + +/* I2C residual front end/back end length */ +#define I2C_RESIDUAL_LEN_REG 0xd +#define I2C_RESIDUAL_FRONT_END PPC_BITMASK(0, 15) +#define I2C_RESIDUAL_BACK_END PPC_BITMASK(16, 31) + +/* Port busy register */ +#define I2C_PORT_BUSY_REG 0xe +#define I2C_SET_S_SCL_REG 0xd +#define I2C_RESET_S_SCL_REG 0xf +#define I2C_SET_S_SDA_REG 0x10 +#define I2C_RESET_S_SDA_REG 0x11 + +enum p8_i2c_master_type { + I2C_POWER8, + I2C_CENTAUR, + MAX_I2C_TYPE, +}; + +struct p8_i2c_master { + struct dt_node *dt_node; + struct lock lock; /* Lock to guard the members */ + enum p8_i2c_master_type type; /* P8 vs. Centaur */ + uint64_t start_time; /* Request start time */ + uint64_t last_update; + uint64_t poll_interval; /* Polling interval */ + uint64_t xscom_base; /* xscom base of i2cm */ + uint32_t fifo_size; /* Maximum size of FIFO */ + uint32_t chip_id; /* Chip the i2cm sits on */ + uint32_t engine_id; /* Engine# on chip */ + uint8_t obuf[4]; /* Offset buffer */ + uint32_t bytes_sent; + bool irq_ok; /* Interrupt working ? */ + bool occ_cache_dis; /* I have disabled the cache */ + bool occ_lock_acquired; /* Acquired lock from OCC */ + enum request_state { + state_idle, + state_occache_dis, + state_offset, + state_data, + state_error, + state_recovery, + } state; + struct list_head req_list; /* Request queue head */ + struct timer poller; + struct timer timeout; + struct timer recovery; + struct timer sensor_cache; + uint8_t recovery_pass; + struct list_node link; + struct list_head ports; +}; + +struct p8_i2c_master_port { + struct i2c_bus bus; /* Abstract bus struct for the client */ + struct p8_i2c_master *master; + uint32_t port_num; + uint32_t bit_rate_div; /* Divisor to set bus speed*/ + uint64_t byte_timeout; /* Timeout per byte */ + uint64_t poll_interval; /* Polling interval */ + struct list_node link; +}; + +static int occ_i2c_unlock(struct p8_i2c_master *master); + +static int64_t i2cm_read_reg(struct p8_i2c_master *m, int reg, uint64_t *val) +{ + return xscom_read(m->chip_id, m->xscom_base + reg, val); +} + +static int64_t i2cm_write_reg(struct p8_i2c_master *m, int reg, uint64_t val) +{ + return xscom_write(m->chip_id, m->xscom_base + reg, val); +} + +static void p8_i2c_print_debug_info(struct p8_i2c_master_port *port, + struct i2c_request *req, uint64_t end_time) +{ + struct p8_i2c_master *master = port->master; + uint64_t cmd, mode, stat, estat, intm, intc; + + /* Print master and request structure bits */ + log_simple_error(&e_info(OPAL_RC_I2C_TRANSFER), + "I2C: Chip %08x Eng. 
%d Port %d--\n" + " xscom_base=0x%016llx\tstate=%d\tbytes_sent=%d\n", + master->chip_id, master->engine_id, port->port_num, + master->xscom_base, master->state, master->bytes_sent); + + log_simple_error(&e_info(OPAL_RC_I2C_TRANSFER), "I2C: Request info--\n" + " addr=0x%04x\toffset_bytes=%d\toffset=%d\tlen=%d\n", + req->dev_addr, req->offset_bytes, req->offset, + req->rw_len); + + log_simple_error(&e_info(OPAL_RC_I2C_TRANSFER), "I2C: " + " start_time=%016llx end_time=%016llx (duration=%016llx)\n", + master->start_time, end_time, end_time - master->start_time); + + /* initialise to some fake value in case of read errors */ + cmd = mode = stat = estat = intm = intc = 0xDEAD; + + /* Dump the current state of i2c registers */ + i2cm_read_reg(master, I2C_CMD_REG, &cmd); + i2cm_read_reg(master, I2C_MODE_REG, &mode); + i2cm_read_reg(master, I2C_MODE_REG, &mode); + i2cm_read_reg(master, I2C_STAT_REG, &stat); + i2cm_read_reg(master, I2C_EXTD_STAT_REG, &estat); + i2cm_read_reg(master, I2C_INTR_MASK_REG, &intm); + i2cm_read_reg(master, I2C_INTR_RAW_COND_REG, &intc); + + log_simple_error(&e_info(OPAL_RC_I2C_TRANSFER), "I2C: Register dump--\n" + " cmd:0x%016llx\tmode:0x%016llx\tstat:0x%016llx\n" + " estat:0x%016llx\tintm:0x%016llx\tintc:0x%016llx\n", + cmd, mode, stat, estat, intm, intc); + + log_simple_error(&e_info(OPAL_RC_I2C_TRANSFER), + "I2C: Error bits set: %s%s%s%s%s%s%s\n", + (stat & I2C_STAT_NACK_RCVD_ERR) ? "nack, " : "", + (stat & I2C_STAT_INVALID_CMD) ? "cmd invalid, " : "", + (stat & I2C_STAT_LBUS_PARITY_ERR) ? "interal parity, " : "", + (stat & I2C_STAT_BKEND_OVERRUN_ERR) ? "backend overrun, " : "", + (stat & I2C_STAT_BKEND_ACCESS_ERR) ? "backend access, " : "", + (stat & I2C_STAT_ARBT_LOST_ERR) ? "arbitration loss, " : "", + (stat & I2C_STAT_STOP_ERR) ? "stop error, " : ""); +} + +static bool p8_i2c_has_irqs(struct p8_i2c_master *master) +{ + struct proc_chip *chip; + + /* Centaur I2C doesn't have interrupts */ + if (master->type == I2C_CENTAUR) + return false; + + chip = get_chip(master->chip_id); + + /* The i2c interrupts was only added to Murano DD2.1 and Venice + * DD2.0. When operating without interrupts, we need to bump the + * timeouts as we rely solely on the polls from Linux which can + * be up to 2s apart ! 
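(For reference when reading the EC-level checks that follow: skiboot encodes the DD level with the major revision in the high nibble and the minor in the low nibble, so for example

	chip->ec_level == 0x21	/* DD2.1 */

which is why Murano needs >= 0x21 and Venice >= 0x20 before the driver trusts the i2c interrupt.)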
+ */ + if (proc_gen >= proc_gen_p9) + return true; + else if (chip->type == PROC_CHIP_P8_MURANO) + return chip->ec_level >= 0x21; + else if (chip->type == PROC_CHIP_P8_VENICE) + return chip->ec_level >= 0x20; + + return true; +} + +static int p8_i2c_enable_irqs(struct p8_i2c_master *master) +{ + int rc; + + /* enable interrupts we're interested in */ + rc = i2cm_write_reg(master, I2C_INTR_MASK_OR_REG, I2C_INTR_ACTIVE); + if (rc) + prlog(PR_ERR, "I2C: Failed to enable the interrupts\n"); + + return rc; +} + +static void p8_i2c_reset_timeout(struct p8_i2c_master *master, + struct i2c_request *req) +{ + uint64_t now = mftb(); + + master->last_update = now; + schedule_timer_at(&master->timeout, now + msecs_to_tb(req->timeout)); +} + +static int p8_i2c_prog_watermark(struct p8_i2c_master *master) +{ + uint64_t watermark; + int rc; + + rc = xscom_read(master->chip_id, master->xscom_base + I2C_WATERMARK_REG, + &watermark); + if (rc) { + prlog(PR_ERR, "I2C: Failed to read the WATERMARK_REG\n"); + return rc; + } + + /* Set the high/low watermark */ + watermark = SETFIELD(I2C_WATERMARK_HIGH, watermark, I2C_FIFO_HI_LVL); + watermark = SETFIELD(I2C_WATERMARK_LOW, watermark, I2C_FIFO_LO_LVL); + rc = xscom_write(master->chip_id, master->xscom_base + + I2C_WATERMARK_REG, watermark); + if (rc) + prlog(PR_ERR, "I2C: Failed to set high/low watermark level\n"); + + return rc; +} + +static int p8_i2c_prog_mode(struct p8_i2c_master_port *port, bool enhanced_mode) +{ + struct p8_i2c_master *master = port->master; + uint64_t mode, omode; + int rc; + + rc = xscom_read(master->chip_id, master->xscom_base + + I2C_MODE_REG, &mode); + if (rc) { + prlog(PR_ERR, "I2C: Failed to read the MODE_REG\n"); + return rc; + } + omode = mode; + mode = SETFIELD(I2C_MODE_PORT_NUM, mode, port->port_num); + mode = SETFIELD(I2C_MODE_BIT_RATE_DIV, mode, port->bit_rate_div); + if (enhanced_mode) + mode |= I2C_MODE_ENHANCED; + else + mode &= ~I2C_MODE_ENHANCED; + if (mode == omode) + return 0; + + rc = xscom_write(master->chip_id, master->xscom_base + I2C_MODE_REG, + mode); + if (rc) + prlog(PR_ERR, "I2C: Failed to write the MODE_REG\n"); + + return rc; +} + +static void p8_i2c_complete_request(struct p8_i2c_master *master, + struct i2c_request *req, int ret) +{ + /* We only complete the current top level request */ + assert(req == list_top(&master->req_list, struct i2c_request, link)); + + cancel_timer_async(&master->timeout); + + list_del(&req->link); + master->state = state_idle; + req->result = ret; + req->req_state = i2c_req_done; + + /* Schedule re-enabling of sensor cache */ + if (master->occ_cache_dis) + schedule_timer(&master->sensor_cache, + msecs_to_tb(SENSOR_CACHE_EN_DELAY)); + + /* If we're done with i2c master, allow OCC to use it */ + if (master->occ_lock_acquired && list_empty(&master->req_list)) + occ_i2c_unlock(master); + + unlock(&master->lock); + if (req->completion) + req->completion(ret, req); + /* req might have been freed at this point */ + lock(&master->lock); +} + + +static int p8_i2c_engine_reset(struct p8_i2c_master_port *port) +{ + struct p8_i2c_master *master = port->master; + int rc; + + /* Reset the i2c engine */ + rc = xscom_write(master->chip_id, master->xscom_base + + I2C_RESET_I2C_REG, 0); + if (rc) { + log_simple_error(&e_info(OPAL_RC_I2C_RESET), "I2C: Failed " + "to reset the i2c engine\n"); + return rc; + } + + /* Reprogram the watermark and mode */ + rc = p8_i2c_prog_watermark(port->master); + if (rc) { + log_simple_error(&e_info(OPAL_RC_I2C_RESET), "I2C: Failed to" + "program the 
WATERMARK_REG\n"); + return rc; + } + + rc = p8_i2c_prog_mode(port, false); + if (rc) + log_simple_error(&e_info(OPAL_RC_I2C_RESET), "I2C: Failed to" + "program the MODE_REG\n"); + + return rc; +} + +static void p8_i2c_translate_error(struct i2c_request *req, uint64_t status) +{ + /* Assuming there are not more than one type of error simultaneously */ + if (status & I2C_STAT_NACK_RCVD_ERR) + req->result = OPAL_I2C_NACK_RCVD; + else if (status & I2C_STAT_INVALID_CMD) + req->result = OPAL_I2C_INVALID_CMD; + else if (status & I2C_STAT_LBUS_PARITY_ERR) + req->result = OPAL_I2C_LBUS_PARITY; + else if (status & I2C_STAT_BKEND_OVERRUN_ERR) + req->result = OPAL_I2C_BKEND_OVERRUN; + else if (status & I2C_STAT_BKEND_ACCESS_ERR) + req->result = OPAL_I2C_BKEND_ACCESS; + else if (status & I2C_STAT_ARBT_LOST_ERR) + req->result = OPAL_I2C_ARBT_LOST; + else if (status & I2C_STAT_STOP_ERR) + req->result = OPAL_I2C_STOP_ERR; + else if (status & I2C_STAT_PSEUDO_TIMEOUT) + req->result = OPAL_I2C_TIMEOUT; +} + +static int p8_i2c_reset_port(struct p8_i2c_master_port *p) +{ + struct p8_i2c_master *master = p->master; + int reset_loops, rc; + uint64_t status; + + /* FIXME: this should per per-port rather than per-master */ + master->state = state_error; + + /* + * Put the master into enhanced STOP mode when recovering the + * port. This causes the master to send additional STOP conditions + * to work around some particularly stupid I2C devices and it's + * required on secure I2C masters since they will not send a bare + * stop condition. + */ + rc = p8_i2c_prog_mode(p, true); + if (rc) { + log_simple_error(&e_info(OPAL_RC_I2C_RESET), + "I2C: Failed to enable enhanced mode\n"); + return -1; + } + + rc = xscom_write(master->chip_id, master->xscom_base + + I2C_CMD_REG, I2C_CMD_WITH_STOP); + if (rc) + goto err; + + /* Wait for COMMAND COMPLETE */ + for (reset_loops = 0; reset_loops < 10; reset_loops++) { + time_wait_ms(10); + + rc = xscom_read(master->chip_id, + master->xscom_base + I2C_STAT_REG, + &status); + if (rc) + goto err; + + if (status & I2C_STAT_CMD_COMP) + break; + } + + if (status & I2C_STAT_CMD_COMP) + return 0; +err: + prerror("I2C: Failed to reset c%de%dp%d\n", + master->chip_id, master->engine_id, p->port_num); + return -1; +} + +static void p8_i2c_status_error(struct p8_i2c_master_port *port, + struct i2c_request *req, + uint64_t status, uint64_t end_time) +{ + struct p8_i2c_master *master = port->master; + int rc; + + /* Display any error other than I2C_INTR_NACK_RCVD_ERR or + * timeout since getting NACK's is normal if Linux is probing + * the bus and timeouts will have already logged something. 
+ */ + if (!(status & (I2C_STAT_NACK_RCVD_ERR | I2C_STAT_PSEUDO_TIMEOUT))) { + log_simple_error(&e_info(OPAL_RC_I2C_TRANSFER), + "I2C: Transfer error occurred\n"); + p8_i2c_print_debug_info(port, req, end_time); + } else if (status == I2C_STAT_PSEUDO_TIMEOUT) { + log_simple_error(&e_info(OPAL_RC_I2C_TIMEOUT), + "I2C: request timed out!\n"); + p8_i2c_print_debug_info(port, req, end_time); + } + + p8_i2c_translate_error(req, status); + + rc = p8_i2c_engine_reset(port); + if (rc) + goto exit; + + if (status & (I2C_STAT_LBUS_PARITY_ERR | I2C_STAT_ARBT_LOST_ERR | + I2C_STAT_STOP_ERR)) { + /* + * Don't bother issuing a STOP command for those errors + * just get rid of the current request and start off with + * the fresh one in the list + */ + p8_i2c_complete_request(master, req, req->result); + } else { + if (p8_i2c_reset_port(port)) + goto exit; + /* Enable the interrupt */ + p8_i2c_enable_irqs(master); + } + return; + +exit: + p8_i2c_complete_request(master, req, req->result); +} + +static int p8_i2c_fifo_read(struct p8_i2c_master *master, + uint8_t *buf, uint32_t count) +{ + uint64_t fifo; + uint32_t i; + int rc = 0; + + for (i = 0; i < count; i++, buf++) { + rc = xscom_read(master->chip_id, master->xscom_base + + I2C_FIFO_REG, &fifo); + if (rc) { + log_simple_error(&e_info(OPAL_RC_I2C_TRANSFER), + "I2C: Failed to read the fifo\n"); + break; + } + + *buf = GETFIELD(I2C_FIFO, fifo); + } + return rc; +} + +static int p8_i2c_fifo_write(struct p8_i2c_master *master, + uint8_t *buf, uint32_t count) +{ + uint64_t fifo; + uint32_t i; + int rc = 0; + + for (i = 0; i < count; i++, buf++) { + fifo = SETFIELD(I2C_FIFO, 0ull, *buf); + rc = xscom_write(master->chip_id, master->xscom_base + + I2C_FIFO_REG, fifo); + if (rc) { + log_simple_error(&e_info(OPAL_RC_I2C_TRANSFER), + "I2C: Failed to write the fifo\n"); + break; + } + } + return rc; +} + +static void p8_i2c_status_data_request(struct p8_i2c_master *master, + struct i2c_request *req, + uint64_t status) +{ + uint32_t fifo_count, fifo_free, count; + uint8_t *buf; + int rc = 0; + + fifo_count = GETFIELD(I2C_STAT_FIFO_ENTRY_COUNT, status); + fifo_free = master->fifo_size - fifo_count; + + DBG("Data request, state=%d fifo_count=%d/%d bytes_sent=%d\n", + master->state, fifo_count, master->fifo_size, master->bytes_sent); + + switch(master->state) { + case state_offset: + /* We assume the offset can always be written in one go */ + if (fifo_free < req->offset_bytes) { + log_simple_error(&e_info(OPAL_RC_I2C_TRANSFER), + "I2C: Fifo too small for offset !\n"); + rc = OPAL_HARDWARE; + } else { + rc = p8_i2c_fifo_write(master, master->obuf, + req->offset_bytes); + } + + /* For read, wait address phase to complete */ + if (rc || req->op != SMBUS_WRITE) + break; + + /* For writes, transition to data phase now */ + master->state = state_data; + fifo_free -= req->offset_bytes; + /* Fall through */ + case state_data: + /* Sanity check */ + if (master->bytes_sent >= req->rw_len) { + log_simple_error(&e_info(OPAL_RC_I2C_TRANSFER), "I2C: " + "Data req with no data to send sent=%d " + "req=%d\n", master->bytes_sent, + req->rw_len); + rc = OPAL_HARDWARE; + break; + } + + /* Get next chunk */ + buf = req->rw_buf + master->bytes_sent; + count = req->rw_len - master->bytes_sent; + + /* Check direction */ + if (req->op == I2C_READ || req->op == SMBUS_READ) { + if (count > fifo_count) + count = fifo_count; + rc = p8_i2c_fifo_read(master, buf, count); + } else { + if (count > fifo_free) + count = fifo_free; + rc = p8_i2c_fifo_write(master, buf, count); + } + if (rc == 0) + 
master->bytes_sent += count; + break; + default: + log_simple_error(&e_info(OPAL_RC_I2C_TRANSFER), "I2C: Invalid " + "state %d in data req !\n", master->state); + rc = OPAL_WRONG_STATE; + } + + if (rc) { + p8_i2c_complete_request(master, req, rc); + } else { + p8_i2c_enable_irqs(master); + p8_i2c_reset_timeout(master, req); + } +} + +static void p8_i2c_complete_offset(struct p8_i2c_master *master, + struct i2c_request *req) +{ + uint64_t cmd; + int rc = 0; + + DBG("Completing offset phase\n"); + + /* If it's a write, we should only get here for empty + * write commands + */ + if (req->op == SMBUS_WRITE && req->rw_len != 0) { + log_simple_error(&e_info(OPAL_RC_I2C_TRANSFER), "I2C: Write " + "completion in offset state !\n"); + rc = OPAL_HARDWARE; + goto complete; + } + + /* Switch to data phase */ + master->state = state_data; + + /* If it's not a read command, or there are no data to read, + * then we complete the command + */ + if (req->op != SMBUS_READ || req->rw_len == 0) + goto complete; + + /* Otherwise, let's start the data phase */ + cmd = I2C_CMD_WITH_START | I2C_CMD_WITH_ADDR | + I2C_CMD_WITH_STOP | I2C_CMD_READ_NOT_WRITE; + cmd = SETFIELD(I2C_CMD_DEV_ADDR, cmd, req->dev_addr); + cmd = SETFIELD(I2C_CMD_LEN_BYTES, cmd, req->rw_len); + cmd = SETFIELD(I2C_CMD_INTR_STEERING, cmd, I2C_CMD_INTR_STEER_HOST); + + DBG("Command: %016llx, state: %d\n", cmd, master->state); + + /* Send command */ + rc = xscom_write(master->chip_id, master->xscom_base + I2C_CMD_REG, + cmd); + if (rc) { + log_simple_error(&e_info(OPAL_RC_I2C_TRANSFER), "I2C: Failed " + "to write the CMD_REG\n"); + goto complete; + } + + /* Enable the interrupts */ + p8_i2c_enable_irqs(master); + p8_i2c_reset_timeout(master, req); + return; + + complete: + p8_i2c_complete_request(master, req, rc); +} + +static void p8_i2c_status_cmd_completion(struct p8_i2c_master *master, + struct i2c_request *req, + uint64_t end_time __unused) +{ + int rc; + + DBG("Command completion, state=%d bytes_sent=%d\n", + master->state, master->bytes_sent); + DBG(" start_time=%016llx end_time=%016llx (duration=%016llx)\n", + master->start_time, end_time, end_time - master->start_time); + + /* If we complete an offset, we probably need to transition + * do a data read, check if that all makes sense + */ + if (master->state == state_offset) { + p8_i2c_complete_offset(master, req); + return; + } + + /* If we are not already in error state, check if we have + * completed our data transfer properly + */ + if (master->state != state_error && master->bytes_sent != req->rw_len) { + log_simple_error(&e_info(OPAL_RC_I2C_TRANSFER), "I2C: Request " + "complete with residual data req=%d done=%d\n", + req->rw_len, master->bytes_sent); + /* Should we error out here ? */ + } + rc = master->state == state_error ? req->result : OPAL_SUCCESS; + p8_i2c_complete_request(master, req, rc); +} + +static void p8_i2c_check_status(struct p8_i2c_master *master) +{ + struct p8_i2c_master_port *port; + uint64_t status, deadline, now; + struct i2c_request *req; + int rc; + + /* + * When idle or waiting for the occ to release the bus there's + * nothing to check. Also ignore recovery state, as the bus + * can be reset in that state, and a request can think it's + * complete when it just means the reset is complete. + * Error states are handled when starting a new request. 
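To make the offset/data split concrete, here is how a 4-byte SMBUS_READ at offset 0x10 from device address 0x50 plays out (the values are only an example). p8_i2c_start_request() first sends the offset byte with an address phase and no STOP:

	cmd = I2C_CMD_WITH_START | I2C_CMD_WITH_ADDR;
	cmd = SETFIELD(I2C_CMD_DEV_ADDR, cmd, 0x50);
	cmd = SETFIELD(I2C_CMD_INTR_STEERING, cmd, I2C_CMD_INTR_STEER_HOST);
	cmd = SETFIELD(I2C_CMD_LEN_BYTES, cmd, 1);	/* offset_bytes */

and once that command completes, p8_i2c_complete_offset() above issues the data phase as a repeated-START read:

	cmd = I2C_CMD_WITH_START | I2C_CMD_WITH_ADDR |
	      I2C_CMD_WITH_STOP | I2C_CMD_READ_NOT_WRITE;
	cmd = SETFIELD(I2C_CMD_DEV_ADDR, cmd, 0x50);
	cmd = SETFIELD(I2C_CMD_LEN_BYTES, cmd, 4);	/* rw_len */
	cmd = SETFIELD(I2C_CMD_INTR_STEERING, cmd, I2C_CMD_INTR_STEER_HOST);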
+ */ + if (master->state == state_idle || master->state == state_occache_dis || + master->state == state_recovery) + return; + + /* A non-idle master should always have a pending request */ + req = list_top(&master->req_list, struct i2c_request, link); + if (!req) { + prerror("I2C: Master is not idle and has no pending request\n"); + return; + } + + rc = i2cm_read_reg(master, I2C_STAT_REG, &status); + if (rc) { + log_simple_error(&e_info(OPAL_RC_I2C_TRANSFER), + "I2C: Failed to read the STAT_REG\n"); + return; + } + + /* mask interrupts while we're mucking with the master */ + rc = i2cm_write_reg(master, I2C_INTR_MASK_AND_REG, ~I2C_INTR_ALL); + if (rc) { + log_simple_error(&e_info(OPAL_RC_I2C_TRANSFER), + "I2C: Failed to disable the interrupts\n"); + return; + } + + /* Get port for current request */ + port = container_of(req->bus, struct p8_i2c_master_port, bus); + now = mftb(); + + deadline = master->last_update + msecs_to_tb(req->timeout); + + if (status & I2C_STAT_ANY_ERR) + p8_i2c_status_error(port, req, status & I2C_STAT_ANY_ERR, now); + else if (status & I2C_STAT_DATA_REQ) + p8_i2c_status_data_request(master, req, status); + else if (status & I2C_STAT_CMD_COMP) + p8_i2c_status_cmd_completion(master, req, now); + else if (tb_compare(now, deadline) == TB_AAFTERB) + p8_i2c_status_error(port, req, I2C_STAT_PSEUDO_TIMEOUT, now); + else + p8_i2c_enable_irqs(master); +} + +static int p8_i2c_check_initial_status(struct p8_i2c_master_port *port) +{ + struct p8_i2c_master *master = port->master; + uint64_t status, estat; + int rc; + + master->recovery_pass++; + + /* Read status register */ + rc = xscom_read(master->chip_id, master->xscom_base + I2C_STAT_REG, + &status); + if (rc) { + log_simple_error(&e_info(OPAL_RC_I2C_START_REQ), "I2C: Failed " + "to read the STAT_REG\n"); + return rc; + } + + rc = xscom_read(master->chip_id, + master->xscom_base + I2C_EXTD_STAT_REG, + &estat); + if (rc) { + log_simple_error(&e_info(OPAL_RC_I2C_START_REQ), "I2C: Failed " + "to read the EXTD_STAT_REG\n"); + return rc; + } + if (estat & (I2C_EXTD_STAT_I2C_BUSY | I2C_EXTD_STAT_SELF_BUSY)) { + DBG("Initial estat busy ! %016llx\n", estat); + /* Just a warning for now */ + } + + /* Nothing happened ? Go back */ + if (status & I2C_STAT_ANY_ERR) { + log_simple_error(&e_info(OPAL_RC_I2C_START_REQ), "I2C: " + "Initial error status 0x%016llx\n", status); + + if (master->recovery_pass > 1) { + log_simple_error(&e_info(OPAL_RC_I2C_START_REQ), "I2C: " + "Error stuck, aborting !!\n"); + return OPAL_HARDWARE; + } + + /* Mark state as "recovery" to block any other activity */ + master->state = state_recovery; + + /* Reset the engine */ + p8_i2c_engine_reset(port); + + /* Delay 5ms for bus to settle */ + schedule_timer(&master->recovery, msecs_to_tb(5)); + return OPAL_BUSY; + } + + /* Still busy ? */ + if (!(status & I2C_STAT_CMD_COMP)) { + log_simple_error(&e_info(OPAL_RC_I2C_START_REQ), "I2C: Initial " + "command complete not set\n"); + + if (master->recovery_pass > 5) { + log_simple_error(&e_info(OPAL_RC_I2C_START_REQ), "I2C: " + "Command stuck, aborting !!\n"); + return OPAL_HARDWARE; + } + + + master->state = state_recovery; + + /* Delay 5ms for bus to settle */ + schedule_timer(&master->recovery, msecs_to_tb(5)); + return OPAL_BUSY; + } + + master->recovery_pass = 0; + return 0; +} + +/* + * On POWER9, the I2C may also wish to use some of the i2cm engines, + * to do things like read sensor data. There's a couple of shared + * registers with the OCC to negotiate locking of the i2cm engines. 
+ * See occ/src/occ_405/lock/lock.c + */ +static bool occ_uses_master(struct p8_i2c_master *master) +{ + /* OCC uses I2CM Engines 1,2 and 3, only on POWER9/10 */ + if (master->type == I2C_POWER8 && proc_gen >= proc_gen_p9) + return master->engine_id >= 1; + + return false; +} + +static uint32_t occflg; +#define OCCFLG_BASE 0 +#define OCCFLG_CLEAR 1 +#define OCCFLG_SET 2 + +static int occ_i2c_lock(struct p8_i2c_master *master) +{ + u64 occflags, busflag; + int rc; + + if (!occ_uses_master(master) || !occflg) + return 0; + + if (master->occ_lock_acquired) + return 0; + + rc = xscom_read(master->chip_id, occflg, &occflags); + if (rc) { + prerror("I2C: Failed to read OCC FLAG register\n"); + return rc; + } + + assert(master->engine_id > 0); + + busflag = PPC_BIT(16 + (master->engine_id - 1) * 2); + + DBG("I2C: c%de%d: occflags = %llx (locks = %x:%x:%x)\n", + master->chip_id, master->engine_id, (u64) occflags, + (u32) GETFIELD(PPC_BITMASK(16, 17), occflags), + (u32) GETFIELD(PPC_BITMASK(18, 19), occflags), + (u32) GETFIELD(PPC_BITMASK(20, 21), occflags)); + + rc = xscom_write(master->chip_id, occflg + OCCFLG_SET, busflag); + if (rc) { + prerror("I2C: Failed to write OCC FLAG register\n"); + return rc; + } + + /* If the OCC also has this bus locked then wait for IRQ */ + if (occflags & (busflag >> 1)) { + DBG("I2C: c%de%d: Master in use by OCC\n", + master->chip_id, master->engine_id); + return 1; + } + + master->occ_lock_acquired = true; + + return 0; +} + +static int occ_i2c_unlock(struct p8_i2c_master *master) +{ + u64 busflag, occflags; + int rc; + + if (!occ_uses_master(master) || !occflg) + return 0; + + rc = xscom_read(master->chip_id, occflg, &occflags); + if (rc) { + prerror("I2C: Failed to read OCC Flag register\n"); + return rc; + } + + busflag = PPC_BIT(16 + (master->engine_id - 1) * 2); + + if (!(occflags & busflag)) { + DBG("I2C: spurious unlock for c%de%d already cleared (flags = %.16llx)", + master->chip_id, master->engine_id, occflags); + } + + rc = xscom_write(master->chip_id, occflg + OCCFLG_CLEAR, busflag); + if (rc) + prerror("I2C: Failed to write OCC Flag register\n"); + + master->occ_lock_acquired = false; + + return rc; +} + +static int p8_i2c_start_request(struct p8_i2c_master *master, + struct i2c_request *req) +{ + struct p8_i2c_master_port *port; + uint64_t cmd; + int64_t rc; + + DBG("Starting req %d len=%d addr=%02x (offset=%x)\n", + req->op, req->rw_len, req->dev_addr, req->offset); + + /* Get port */ + port = container_of(req->bus, struct p8_i2c_master_port, bus); + + /* Check if we need to disable the OCC cache first */ + if (master->type == I2C_CENTAUR && !master->occ_cache_dis) { + DBG("Disabling OCC cache...\n"); + rc = centaur_disable_sensor_cache(master->chip_id); + + if (rc < 0) { + log_simple_error(&e_info(OPAL_RC_I2C_START_REQ), + "I2C: Failed " + "to disable the sensor cache\n"); + return rc; + } + master->occ_cache_dis = true; + + /* Do we need to wait ? */ + if (rc > 0) { + DBG("Waiting %lld\n", rc); + master->state = state_occache_dis; + schedule_timer(&master->recovery, rc); + return 0; + } + } + + /* + * on P9 we need to set the "I2C master using bit" so we don't + * conflict with the OCC's use of the i2c master. 
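One small detail in the offset handling just below: the offset is serialised most-significant byte first into obuf, e.g. (illustrative values)

	/* req->offset = 0x0102, req->offset_bytes = 2 */
	master->obuf[0] = 0x01;	/* offset >> 8 */
	master->obuf[1] = 0x02;	/* offset >> 0 */

and for SMBUS_WRITE those offset bytes are counted in I2C_CMD_LEN_BYTES (rw_len + offset_bytes), since offset and data go out in a single command.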
+ */ + rc = occ_i2c_lock(master); + if (rc < 0) { + log_simple_error(&e_info(OPAL_RC_I2C_START_REQ), + "I2C: Failed to get I2CM lock from OCC\n"); + return rc; + } + if (rc > 0) { + /* Wait for OCC IRQ */ + master->state = state_occache_dis; + schedule_timer(&master->recovery, msecs_to_tb(10)); + return 0; + } + + /* Convert the offset if needed */ + if (req->offset_bytes) { + int i; + + for (i = 0; i < req->offset_bytes; i++) { + uint8_t b; + + b = req->offset >> (8 * (req->offset_bytes - i - 1)); + master->obuf[i] = b; + } + DBG("Offset %d bytes: %02x %02x %02x %02x\n", + req->offset_bytes, master->obuf[0], master->obuf[1], + master->obuf[2], master->obuf[3]); + } + + /* Program mode register */ + rc = p8_i2c_prog_mode(port, false); + if (rc) { + log_simple_error(&e_info(OPAL_RC_I2C_START_REQ), "I2C: Failed " + "to program the MODE_REG\n"); + return rc; + } + + /* Check status */ + rc = p8_i2c_check_initial_status(port); + if (rc != OPAL_BUSY) + master->recovery_pass = 0; + if (rc) + return rc; + + /* program the watermark register */ + rc = p8_i2c_prog_watermark(master); + if (rc) { + log_simple_error(&e_info(OPAL_RC_I2C_INIT), + "I2C: Failed to program the WATERMARK_REG\n"); + return rc; + } + + /* Initialize bytes_sent */ + master->bytes_sent = 0; + + /* Set up the command register */ + cmd = I2C_CMD_WITH_START | I2C_CMD_WITH_ADDR; + cmd = SETFIELD(I2C_CMD_DEV_ADDR, cmd, req->dev_addr); + cmd = SETFIELD(I2C_CMD_INTR_STEERING, cmd, I2C_CMD_INTR_STEER_HOST); + switch (req->op) { + case I2C_READ: + cmd |= I2C_CMD_READ_NOT_WRITE; + /* Fall through */ + case I2C_WRITE: + cmd |= I2C_CMD_WITH_STOP; + cmd = SETFIELD(I2C_CMD_LEN_BYTES, cmd, req->rw_len); + master->state = state_data; + break; + case SMBUS_READ: + cmd = SETFIELD(I2C_CMD_LEN_BYTES, cmd, req->offset_bytes); + master->state = state_offset; + break; + case SMBUS_WRITE: + cmd |= I2C_CMD_WITH_STOP; + cmd = SETFIELD(I2C_CMD_LEN_BYTES, cmd, + req->rw_len + req->offset_bytes); + master->state = state_offset; + break; + default: + return OPAL_PARAMETER; + } + DBG("Command: %016llx, state: %d\n", cmd, master->state); + + master->start_time = mftb(); + + /* Send command */ + rc = xscom_write(master->chip_id, master->xscom_base + I2C_CMD_REG, + cmd); + if (rc) { + log_simple_error(&e_info(OPAL_RC_I2C_START_REQ), "I2C: Failed " + "to write the CMD_REG\n"); + return rc; + } + + /* Enable the interrupts */ + p8_i2c_enable_irqs(master); + + /* Run a poll timer for boot cases or non-working interrupts + * cases + */ + if (!opal_booting() && master->irq_ok) + master->poll_interval = TIMER_POLL; + else + master->poll_interval = port->poll_interval; + schedule_timer(&master->poller, master->poll_interval); + + /* If we don't have a user-set timeout then use the master's default */ + if (!req->timeout) + req->timeout = port->byte_timeout; + + /* Start the timeout */ + p8_i2c_reset_timeout(master, req); + + return OPAL_SUCCESS; +} + +static void p8_i2c_check_work(struct p8_i2c_master *master) +{ + struct i2c_request *req; + int rc; + + while (master->state == state_idle && !list_empty(&master->req_list)) { + req = list_top(&master->req_list, struct i2c_request, link); + rc = p8_i2c_start_request(master, req); + if (rc) { + /* + * If it didn't work the first three times then + * odds are it's not going to work on the 4th. 
+ */ + if (rc && req->retries > 3) + p8_i2c_complete_request(master, req, rc); + else + req->retries++; + } + } +} + +/* OCC IRQ Handler for I2C Ownership Change*/ +void p9_i2c_bus_owner_change(u32 chip_id) +{ + struct proc_chip *chip = get_chip(chip_id); + struct p8_i2c_master *master = NULL; + + assert(chip); + list_for_each(&chip->i2cms, master, link) { + lock(&master->lock); + + /* spurious */ + if (master->state != state_occache_dis) + goto done; + + /* Can we now lock this master? */ + if (occ_i2c_lock(master)) + goto done; + + /* clear the existing wait timer */ + cancel_timer_async(&master->recovery); + + /* re-start the request now that we own the master */ + master->state = state_idle; + + p8_i2c_check_work(master); + p8_i2c_check_status(master); +done: + unlock(&master->lock); + } +} + +static int p8_i2c_queue_request(struct i2c_request *req) +{ + struct i2c_bus *bus = req->bus; + struct p8_i2c_master_port *port = + container_of(bus, struct p8_i2c_master_port, bus); + struct p8_i2c_master *master = port->master; + int rc = 0; + + /* Parameter check */ + if (req->rw_len > I2C_MAX_TFR_LEN) { + prlog(PR_ERR, "I2C: Too large transfer %d bytes\n", req->rw_len); + return OPAL_PARAMETER; + } + + if (req->offset_bytes > 4) { + prlog(PR_ERR, "I2C: Invalid offset size %d\n", req->offset_bytes); + return OPAL_PARAMETER; + } + lock(&master->lock); + list_add_tail(&master->req_list, &req->link); + p8_i2c_check_work(master); + unlock(&master->lock); + + return rc; +} + +static uint64_t p8_i2c_run_request(struct i2c_request *req) +{ + struct i2c_bus *bus = req->bus; + struct p8_i2c_master_port *port = + container_of(bus, struct p8_i2c_master_port, bus); + struct p8_i2c_master *master = port->master; + uint64_t poll_interval = 0; + + lock(&master->lock); + p8_i2c_check_status(master); + p8_i2c_check_work(master); + poll_interval = master->poll_interval; + unlock(&master->lock); + + return poll_interval; +} + +static inline uint32_t p8_i2c_get_bit_rate_divisor(uint32_t lb_freq, + uint32_t bus_speed) +{ + assert(bus_speed > 0); + return (((lb_freq / bus_speed) - 1) / 4); +} + +static inline uint64_t p8_i2c_get_poll_interval(uint32_t bus_speed) +{ + uint64_t usec; + + assert(bus_speed > 0); + + /* Polling Interval = 8 * (1/bus_speed) * (1/10) -> convert to uSec */ + usec = ((8 * USEC_PER_SEC) / (10 * bus_speed)); + return usecs_to_tb(usec); +} + +static void p8_i2c_timeout(struct timer *t __unused, void *data, + uint64_t __unused now) +{ + struct p8_i2c_master *master = data; + + lock(&master->lock); + + DBG("timeout on c%de%d\n", master->chip_id, master->engine_id); + + /* + * Run through the usual status checks. It's possible to get spurious + * timeouts due to races between the interrupt/poller paths and the + * timeout handler. So we do all the checking, all the time. + */ + p8_i2c_check_status(master); + p8_i2c_check_work(master); + + unlock(&master->lock); +} + +static void p8_i2c_recover(struct timer *t __unused, void *data, + uint64_t now __unused) +{ + struct p8_i2c_master *master = data; + + lock(&master->lock); + + /* + * The recovery timer can race with the OCC interrupt. If the interrupt + * comes in just before this is called, then we'll get a spurious + * timeout which we need to ignore. 
+ */ + if (master->state != state_recovery && + master->state != state_occache_dis) { + unlock(&master->lock); + return; + } + + master->state = state_idle; + + /* We may or may not still have work pending, re-enable the sensor cache + * immediately if we don't (we just waited the recovery time so there is + * little point waiting longer). + */ + if (master->occ_cache_dis && list_empty(&master->req_list)) { + DBG("Re-enabling OCC cache after recovery\n"); + centaur_enable_sensor_cache(master->chip_id); + master->occ_cache_dis = false; + } + + if (master->occ_lock_acquired && list_empty(&master->req_list)) + occ_i2c_unlock(master); + + /* Re-check for new work */ + p8_i2c_check_work(master); + unlock(&master->lock); +} + +static void p8_i2c_enable_scache(struct timer *t __unused, void *data, + uint64_t now __unused) +{ + struct p8_i2c_master *master = data; + + lock(&master->lock); + + /* Check if we are still idle */ + if (master->state == state_idle && master->occ_cache_dis) { + DBG("Re-enabling OCC cache\n"); + centaur_enable_sensor_cache(master->chip_id); + master->occ_cache_dis = false; + } + unlock(&master->lock); +} + +static void p8_i2c_poll(struct timer *t __unused, void *data, uint64_t now) +{ + struct p8_i2c_master *master = data; + + /* + * This is called when the interrupt isn't functional or + * generally from the opal pollers, so fast while booting + * and slowly when Linux is up. + */ + + /* Lockless fast bailout */ + if (master->state == state_idle) + return; + + lock(&master->lock); + p8_i2c_check_status(master); + if (master->state != state_idle) + schedule_timer_at(&master->poller, now + master->poll_interval); + p8_i2c_check_work(master); + unlock(&master->lock); +} + +void p8_i2c_interrupt(uint32_t chip_id) +{ + struct proc_chip *chip = get_chip(chip_id); + struct p8_i2c_master *master = NULL; + + assert(chip); + list_for_each(&chip->i2cms, master, link) { + + /* Lockless fast bailout (shared interrupt) */ + if (master->state == state_idle) + continue; + + lock(&master->lock); + + /* Run the state machine */ + p8_i2c_check_status(master); + + /* Check for new work */ + p8_i2c_check_work(master); + + unlock(&master->lock); + } +} + +static const char *compat[] = { + "ibm,power8-i2cm", + "ibm,centaur-i2cm" +}; + +static void p8_i2c_add_bus_prop(struct p8_i2c_master_port *port) +{ + const struct dt_property *c, *p; + struct dt_node *np = port->bus.dt_node; + char name[32]; + + c = dt_find_property(np, "compatible"); + p = dt_find_property(np, "ibm,port-name"); + + if (!c) { + if (port->master->type == I2C_POWER8) + dt_add_property_strings(np, "compatible", + "ibm,power8-i2c-port", + "ibm,opal-i2c"); + else if (port->master->type == I2C_CENTAUR) + dt_add_property_strings(np, "compatible", + "ibm,centaur-i2c-port", + "ibm,opal-i2c"); + } + + if (!p) { + if (port->master->type == I2C_POWER8) + snprintf(name, sizeof(name), "p8_%08x_e%dp%d", + port->master->chip_id, port->master->engine_id, + port->port_num); + else if (port->master->type == I2C_CENTAUR) + snprintf(name, sizeof(name), "cen_%08x_e%dp%d", + port->master->chip_id, port->master->engine_id, + port->port_num); + + dt_add_property_string(np, "ibm,port-name", name); + } +} + +static struct p8_i2c_master_port *p8_i2c_init_one_port(struct p8_i2c_master *m, + struct dt_node *n) +{ + struct p8_i2c_master_port *port; + uint64_t def_timeout, lb_freq; + uint32_t speed, div; + + port = zalloc(sizeof(*port)); + if (!port) + return NULL; + + def_timeout = m->irq_ok ? 
I2C_TIMEOUT_IRQ_MS : I2C_TIMEOUT_POLL_MS; + + lb_freq = dt_prop_get_u32_def(m->dt_node, "clock-frequency", 150000000); + speed = dt_prop_get_u32_def(n, "bus-frequency", 100000); + div = p8_i2c_get_bit_rate_divisor(lb_freq, speed); + + /* p8-i2c stuff */ + port->master = m; + port->bit_rate_div = div; + port->poll_interval = p8_i2c_get_poll_interval(speed); + port->port_num = dt_prop_get_u32(n, "reg"); + port->byte_timeout = dt_prop_get_u32_def(n, "timeout-ms", def_timeout); + list_add_tail(&m->ports, &port->link); + + /* core i2c stuff */ + port->bus.dt_node = n; + port->bus.queue_req = p8_i2c_queue_request; + port->bus.run_req = p8_i2c_run_request; + i2c_add_bus(&port->bus); + + /* add the bus name and compatible (if needed) */ + p8_i2c_add_bus_prop(port); + + prlog(PR_INFO, " P%d: <%s> %d kHz\n", port->port_num, + (char *) dt_prop_get(n, "ibm,port-name"), speed / 1000); + + return port; +} + +static struct p8_i2c_master *p8_i2c_init_one(struct dt_node *i2cm, + enum p8_i2c_master_type type) +{ + struct p8_i2c_master *master; + struct list_head *chip_list; + struct dt_node *i2cm_port; + uint64_t ex_stat; + uint32_t lb_freq; + int64_t rc; + + master = zalloc(sizeof(*master)); + if (!master) { + log_simple_error(&e_info(OPAL_RC_I2C_INIT), + "I2C: Failed to allocate master " + "structure\n"); + return NULL; + } + master->type = type; + + /* Local bus speed in Hz */ + lb_freq = dt_prop_get_u32(i2cm, "clock-frequency"); + + /* Initialise the i2c master structure */ + master->state = state_idle; + master->chip_id = dt_get_chip_id(i2cm); + master->engine_id = dt_prop_get_u32(i2cm, "chip-engine#"); + master->xscom_base = dt_get_address(i2cm, 0, NULL); + master->dt_node = i2cm; + if (master->type == I2C_CENTAUR) { + struct centaur_chip *centaur = get_centaur(master->chip_id); + if (centaur == NULL) { + log_simple_error(&e_info(OPAL_RC_I2C_INIT), + "I2C: Failed to get centaur 0x%x ", + master->chip_id); + free(master); + return NULL; + } + chip_list = ¢aur->i2cms; + + /* Detect bad device-tree from HostBoot giving us bogus + * i2c masters + */ + if (master->engine_id > 0) { + prlog(PR_ERR, "I2C: Skipping Centaur Master #1\n"); + free(master); + return NULL; + } + } else { + struct proc_chip *chip = get_chip(master->chip_id); + assert(chip); + chip_list = &chip->i2cms; + } + init_timer(&master->timeout, p8_i2c_timeout, master); + init_timer(&master->poller, p8_i2c_poll, master); + init_timer(&master->recovery, p8_i2c_recover, master); + init_timer(&master->sensor_cache, p8_i2c_enable_scache, master); + + master->irq_ok = p8_i2c_has_irqs(master); + + prlog(PR_INFO, "I2C: Chip %08x Eng. %d Clock %d Mhz %s\n", + master->chip_id, master->engine_id, lb_freq / 1000000, + master->irq_ok ? "" : "(no interrupt)"); + + /* Disable OCC cache during inits */ + if (master->type == I2C_CENTAUR) { + rc = centaur_disable_sensor_cache(master->chip_id); + if (rc < 0) { + log_simple_error(&e_info(OPAL_RC_I2C_INIT), "I2C: " + "Error %lld disabling sensor cache\n", + rc); + /* Ignore error and move on ... 
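+ * (on success the call returns a non-negative delay instead,
+ * which the time_wait(rc) in the else branch below honours,
+ * presumably to give the sensor cache time to quiesce)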
*/ + } else + time_wait(rc); + } + rc = xscom_read(master->chip_id, master->xscom_base + + I2C_EXTD_STAT_REG, &ex_stat); + if (rc) { + log_simple_error(&e_info(OPAL_RC_I2C_INIT), "I2C: " + "Failed to read EXTD_STAT_REG\n"); + if (master->type == I2C_CENTAUR) + centaur_enable_sensor_cache(master->chip_id); + + free(master); + return NULL; + } + + master->fifo_size = GETFIELD(I2C_EXTD_STAT_FIFO_SIZE, ex_stat); + list_head_init(&master->req_list); + list_head_init(&master->ports); + + /* Re-enable the sensor cache, we aren't touching HW anymore */ + if (master->type == I2C_CENTAUR) + centaur_enable_sensor_cache(master->chip_id); + + /* Add master to chip's list */ + list_add_tail(chip_list, &master->link); + + /* initialise ports */ + dt_for_each_child(i2cm, i2cm_port) + p8_i2c_init_one_port(master, i2cm_port); + + return master; +} + +void p8_i2c_init(void) +{ + struct dt_node *i2cm; + int i; + + /* setup the handshake reg */ + if (proc_gen <= proc_gen_p9) + occflg = 0x6C08A; + else if (proc_gen == proc_gen_p10) + occflg = 0x6C0AC; + else + return; + + prlog(PR_INFO, "I2C: OCC flag reg: %x\n", occflg); + + for (i = 0; i < MAX_I2C_TYPE; i++) { + dt_for_each_compatible(dt_root, i2cm, compat[i]) + p8_i2c_init_one(i2cm, i); + } +} + +struct i2c_bus *p8_i2c_find_bus_by_port(uint32_t chip_id, int eng, int port_num) +{ + struct proc_chip *chip = get_chip(chip_id); + struct p8_i2c_master *m, *master = NULL; + struct p8_i2c_master_port *port; + + if (!chip) + return NULL; + + list_for_each(&chip->i2cms, m, link) { + if (m->engine_id == eng) { + master = m; + break; + } + } + + if (!master) + return NULL; + + list_for_each(&master->ports, port, link) + if (port->port_num == port_num) + return &port->bus; + + return NULL; +} + +/* Adds a new i2c port to the DT and initialises it */ +struct i2c_bus *p8_i2c_add_bus(uint32_t chip_id, int eng_id, int port_id, + uint32_t bus_speed) +{ + struct proc_chip *c = get_chip(chip_id); + struct p8_i2c_master *m, *master = NULL; + struct p8_i2c_master_port *port; + struct dt_node *pn; + + if (!c) { + prerror("I2C: Unable to add i2c bus: c%de%dp%d: chip doesn't exist\n", + chip_id, eng_id, port_id); + return NULL; + } + + list_for_each(&c->i2cms, m, link) { + if (m->engine_id == eng_id) { + master = m; + break; + } + } + + if (!master) { + struct dt_node *mn; + + mn = p8_i2c_add_master_node(c->devnode, eng_id); + if (!mn) { + prerror("I2C: Unable to add DT node for I2CM c%xe%d\n", + chip_id, eng_id); + return NULL; + } + + master = p8_i2c_init_one(mn, I2C_POWER8); + if (!master) { + prerror("I2C: Unable to initialise I2CM c%xe%d\n", + chip_id, eng_id); + return NULL; + } + } + + list_for_each(&master->ports, port, link) + if (port->port_num == port_id) + return &port->bus; + + pn = __p8_i2c_add_port_node(master->dt_node, port_id, bus_speed); + if (!pn) { + prerror("I2C: Unable to add dt node for bus c%xe%dp%d\n", + chip_id, eng_id, port_id); + return NULL; + } + + port = p8_i2c_init_one_port(master, pn); + if (!port) { + prerror("I2C: Unable to init bus c%xe%dp%d\n", + chip_id, eng_id, port_id); + return NULL; + } + + return &port->bus; +} diff --git a/roms/skiboot/hw/phb3.c b/roms/skiboot/hw/phb3.c new file mode 100644 index 000000000..8af6b6164 --- /dev/null +++ b/roms/skiboot/hw/phb3.c @@ -0,0 +1,5052 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* + * PHB3: PCI Host Bridge 3, in POWER8 + * + * Copyright 2013-2019 IBM Corp. 
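+ *
+ * Broadly, this file implements:
+ *
+ *  - config space access through the PHB_CONFIG_ADDRESS/DATA pair,
+ *    with a fallback to indirect ASB accesses when the PHB is fenced
+ *  - IODA2 table management (LXIVT, TVT, M32DT, M64BT, PEST, ...)
+ *  - MSI handling via the in-memory IVT and the IVC cache
+ *  - EEH error detection and recovery driven by the PBCQ FIR and
+ *    the PHB LEM registers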
+ */ + +#include <skiboot.h> +#include <io.h> +#include <timebase.h> +#include <pci-cfg.h> +#include <pci.h> +#include <pci-slot.h> +#include <vpd.h> +#include <interrupts.h> +#include <opal.h> +#include <cpu.h> +#include <device.h> +#include <ccan/str/str.h> +#include <ccan/array_size/array_size.h> +#include <xscom.h> +#include <affinity.h> +#include <phb3.h> +#include <phb3-regs.h> +#include <phb3-capp.h> +#include <capp.h> +#include <fsp.h> +#include <chip.h> +#include <chiptod.h> + +/* Enable this to disable error interrupts for debug purposes */ +#undef DISABLE_ERR_INTS + +static void phb3_init_hw(struct phb3 *p, bool first_init); + +#define PHBDBG(p, fmt, a...) prlog(PR_DEBUG, "PHB#%04x: " fmt, \ + (p)->phb.opal_id, ## a) +#define PHBINF(p, fmt, a...) prlog(PR_INFO, "PHB#%04x: " fmt, \ + (p)->phb.opal_id, ## a) +#define PHBERR(p, fmt, a...) prlog(PR_ERR, "PHB#%04x: " fmt, \ + (p)->phb.opal_id, ## a) + +#define PE_CAPP_EN 0x9013c03 + +#define PE_REG_OFFSET(p) \ + ((PHB3_IS_NAPLES(p) && (p)->index) ? 0x40 : 0x0) + +/* Helper to select an IODA table entry */ +static inline void phb3_ioda_sel(struct phb3 *p, uint32_t table, + uint32_t addr, bool autoinc) +{ + out_be64(p->regs + PHB_IODA_ADDR, + (autoinc ? PHB_IODA_AD_AUTOINC : 0) | + SETFIELD(PHB_IODA_AD_TSEL, 0ul, table) | + SETFIELD(PHB_IODA_AD_TADR, 0ul, addr)); +} + +static void phb3_eeh_dump_regs(struct phb3 *p, + struct OpalIoPhb3ErrorData *regs); + +/* Check if AIB is fenced via PBCQ NFIR */ +static bool phb3_fenced(struct phb3 *p) +{ + uint64_t nfir; + + /* We still probably has crazy xscom */ + xscom_read(p->chip_id, p->pe_xscom + 0x0, &nfir); + if (nfir & PPC_BIT(16)) { + p->flags |= PHB3_AIB_FENCED; + + phb3_eeh_dump_regs(p, NULL); + return true; + } + return false; +} + +static int64_t phb3_pcicfg_rc_pref_window(void *dev __unused, + struct pci_cfg_reg_filter *pcrf, + uint32_t offset, uint32_t len, + uint32_t *data, bool write) +{ + uint8_t *pdata; + uint32_t i; + + /* Cache whatever we received */ + if (write) { + pdata = &pcrf->data[offset - pcrf->start]; + for (i = 0; i < len; i++, pdata++) + *pdata = (uint8_t)(*data >> (8 * i)); + return OPAL_SUCCESS; + } + + /* Return whatever we cached */ + *data = 0; + pdata = &pcrf->data[offset - pcrf->start + len - 1]; + for (i = len; i > 0; i--, pdata--) { + *data = (*data) << 8; + if (offset + i == PCI_CFG_PREF_MEM_BASE) { + *data |= ((*pdata & 0xf0) | 0x1); + continue; + } + + *data |= *pdata; + } + + return OPAL_SUCCESS; +} + +/* + * Configuration space access + * + * The PHB lock is assumed to be already held + */ +static int64_t phb3_pcicfg_check(struct phb3 *p, uint32_t bdfn, + uint32_t offset, uint32_t size, + uint8_t *pe) +{ + uint32_t sm = size - 1; + + if (offset > 0xfff || bdfn > 0xffff) + return OPAL_PARAMETER; + if (offset & sm) + return OPAL_PARAMETER; + + /* The root bus only has a device at 0 and we get into an + * error state if we try to probe beyond that, so let's + * avoid that and just return an error to Linux + */ + if (PCI_BUS_NUM(bdfn) == 0 && (bdfn & 0xff)) + return OPAL_HARDWARE; + + /* Check PHB state */ + if (p->broken) + return OPAL_HARDWARE; + + /* Fetch the PE# from cache */ + *pe = p->rte_cache[bdfn]; + + return OPAL_SUCCESS; +} + +static void phb3_link_update(struct phb *phb, uint16_t data) +{ + struct phb3 *p = phb_to_phb3(phb); + uint32_t new_spd, new_wid; + uint32_t old_spd, old_wid; + uint16_t old_data; + uint64_t lreg; + int i; + + /* Read the old speed and width */ + pci_cfg_read16(phb, 0, 0x5a, &old_data); + + /* Decode the register values */ 
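+ /* (the PCIe link status value keeps the current speed in bits
+  * 3:0 and the negotiated width in bits 9:4, hence the masks and
+  * the shift below)
+  */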
+ new_spd = data & PCICAP_EXP_LSTAT_SPEED; + new_wid = (data & PCICAP_EXP_LSTAT_WIDTH) >> 4; + old_spd = old_data & PCICAP_EXP_LSTAT_SPEED; + old_wid = (old_data & PCICAP_EXP_LSTAT_WIDTH) >> 4; + + /* Apply maximums */ + if (new_wid > 16) + new_wid = 16; + if (new_wid < 1) + new_wid = 1; + if (new_spd > 3) + new_spd = 3; + if (new_spd < 1) + new_spd = 1; + + PHBINF(p, "Link change request: speed %d->%d, width %d->%d\n", + old_spd, new_spd, old_wid, new_wid); + + /* Check if width needs to be changed */ + if (old_wid != new_wid) { + PHBINF(p, "Changing width...\n"); + lreg = in_be64(p->regs + PHB_PCIE_LINK_MANAGEMENT); + lreg = SETFIELD(PHB_PCIE_LM_TGT_LINK_WIDTH, lreg, new_wid); + lreg |= PHB_PCIE_LM_CHG_LINK_WIDTH; + out_be64(p->regs + PHB_PCIE_LINK_MANAGEMENT, lreg); + for (i=0; i<10;i++) { + lreg = in_be64(p->regs + PHB_PCIE_LINK_MANAGEMENT); + if (lreg & PHB_PCIE_LM_DL_WCHG_PENDING) + break; + time_wait_ms_nopoll(1); + } + if (!(lreg & PHB_PCIE_LM_DL_WCHG_PENDING)) + PHBINF(p, "Timeout waiting for speed change start\n"); + for (i=0; i<100;i++) { + lreg = in_be64(p->regs + PHB_PCIE_LINK_MANAGEMENT); + if (!(lreg & PHB_PCIE_LM_DL_WCHG_PENDING)) + break; + time_wait_ms_nopoll(1); + } + if (lreg & PHB_PCIE_LM_DL_WCHG_PENDING) + PHBINF(p, "Timeout waiting for speed change end\n"); + } + /* Check if speed needs to be changed */ + if (old_spd != new_spd) { + PHBINF(p, "Changing speed...\n"); + lreg = in_be64(p->regs + PHB_PCIE_LINK_MANAGEMENT); + if (lreg & PPC_BIT(19)) { + uint16_t lctl2; + PHBINF(p, " Bit19 set ! working around...\n"); + pci_cfg_read16(phb, 0, 0x78, &lctl2); + PHBINF(p, " LCTL2=%04x\n", lctl2); + lctl2 &= ~PCICAP_EXP_LCTL2_HWAUTSPDIS; + pci_cfg_write16(phb, 0, 0x78, lctl2); + } + lreg = in_be64(p->regs + PHB_PCIE_LINK_MANAGEMENT); + lreg = SETFIELD(PHB_PCIE_LM_TGT_SPEED, lreg, new_spd); + lreg |= PHB_PCIE_LM_CHG_SPEED; + out_be64(p->regs + PHB_PCIE_LINK_MANAGEMENT, lreg); + } +} + +static int64_t phb3_pcicfg_rc_link_speed(void *dev, + struct pci_cfg_reg_filter *pcrf __unused, + uint32_t offset, uint32_t len, + uint32_t *data, bool write) +{ + struct pci_device *pd = dev; + + /* Hack for link speed changes. 
We intercept attempts at writing + * the link control/status register + */ + if (write && len == 4 && offset == 0x58) { + phb3_link_update(pd->phb, (*data) >> 16); + return OPAL_SUCCESS; + } + if (write && len == 2 && offset == 0x5a) { + phb3_link_update(pd->phb, *(uint16_t *)data); + return OPAL_SUCCESS; + } + + return OPAL_PARTIAL; +} + +#define PHB3_PCI_CFG_READ(size, type) \ +static int64_t phb3_pcicfg_read##size(struct phb *phb, uint32_t bdfn, \ + uint32_t offset, type *data) \ +{ \ + struct phb3 *p = phb_to_phb3(phb); \ + uint64_t addr, val64; \ + int64_t rc; \ + uint8_t pe; \ + bool use_asb = false; \ + \ + /* Initialize data in case of error */ \ + *data = (type)0xffffffff; \ + \ + rc = phb3_pcicfg_check(p, bdfn, offset, sizeof(type), &pe); \ + if (rc) \ + return rc; \ + \ + if (p->flags & PHB3_AIB_FENCED) { \ + if (!(p->flags & PHB3_CFG_USE_ASB)) \ + return OPAL_HARDWARE; \ + use_asb = true; \ + } else if ((p->flags & PHB3_CFG_BLOCKED) && bdfn != 0) { \ + return OPAL_HARDWARE; \ + } \ + \ + rc = pci_handle_cfg_filters(phb, bdfn, offset, sizeof(type), \ + (uint32_t *)data, false); \ + if (rc != OPAL_PARTIAL) \ + return rc; \ + \ + addr = PHB_CA_ENABLE; \ + addr = SETFIELD(PHB_CA_BDFN, addr, bdfn); \ + addr = SETFIELD(PHB_CA_REG, addr, offset); \ + addr = SETFIELD(PHB_CA_PE, addr, pe); \ + if (use_asb) { \ + phb3_write_reg_asb(p, PHB_CONFIG_ADDRESS, addr); \ + sync(); \ + val64 = bswap_64(phb3_read_reg_asb(p, PHB_CONFIG_DATA)); \ + *data = (type)(val64 >> (8 * (offset & (4 - sizeof(type))))); \ + } else { \ + out_be64(p->regs + PHB_CONFIG_ADDRESS, addr); \ + *data = in_le##size(p->regs + PHB_CONFIG_DATA + \ + (offset & (4 - sizeof(type)))); \ + } \ + \ + return OPAL_SUCCESS; \ +} + +#define PHB3_PCI_CFG_WRITE(size, type) \ +static int64_t phb3_pcicfg_write##size(struct phb *phb, uint32_t bdfn, \ + uint32_t offset, type data) \ +{ \ + struct phb3 *p = phb_to_phb3(phb); \ + uint64_t addr, val64 = 0; \ + int64_t rc; \ + uint8_t pe; \ + bool use_asb = false; \ + \ + rc = phb3_pcicfg_check(p, bdfn, offset, sizeof(type), &pe); \ + if (rc) \ + return rc; \ + \ + if (p->flags & PHB3_AIB_FENCED) { \ + if (!(p->flags & PHB3_CFG_USE_ASB)) \ + return OPAL_HARDWARE; \ + use_asb = true; \ + } else if ((p->flags & PHB3_CFG_BLOCKED) && bdfn != 0) { \ + return OPAL_HARDWARE; \ + } \ + \ + rc = pci_handle_cfg_filters(phb, bdfn, offset, sizeof(type), \ + (uint32_t *)&data, true); \ + if (rc != OPAL_PARTIAL) \ + return rc; \ + \ + addr = PHB_CA_ENABLE; \ + addr = SETFIELD(PHB_CA_BDFN, addr, bdfn); \ + addr = SETFIELD(PHB_CA_REG, addr, offset); \ + addr = SETFIELD(PHB_CA_PE, addr, pe); \ + if (use_asb) { \ + val64 = data; \ + val64 = bswap_64(val64 << 8 * (offset & (4 - sizeof(type)))); \ + phb3_write_reg_asb(p, PHB_CONFIG_ADDRESS, addr); \ + sync(); \ + phb3_write_reg_asb(p, PHB_CONFIG_DATA, val64); \ + } else { \ + out_be64(p->regs + PHB_CONFIG_ADDRESS, addr); \ + out_le##size(p->regs + PHB_CONFIG_DATA + \ + (offset & (4 - sizeof(type))), data); \ + } \ + \ + return OPAL_SUCCESS; \ +} + +PHB3_PCI_CFG_READ(8, u8) +PHB3_PCI_CFG_READ(16, u16) +PHB3_PCI_CFG_READ(32, u32) +PHB3_PCI_CFG_WRITE(8, u8) +PHB3_PCI_CFG_WRITE(16, u16) +PHB3_PCI_CFG_WRITE(32, u32) + +static int64_t phb3_get_reserved_pe_number(struct phb *phb __unused) +{ + return PHB3_RESERVED_PE_NUM; +} + +static inline void phb3_enable_ecrc(struct phb *phb, bool enable) +{ + struct phb3 *p = phb_to_phb3(phb); + uint32_t ctl; + + if (p->aercap <= 0) + return; + + pci_cfg_read32(phb, 0, p->aercap + PCIECAP_AER_CAPCTL, &ctl); + if (enable) { + ctl 
|= (PCIECAP_AER_CAPCTL_ECRCG_EN | + PCIECAP_AER_CAPCTL_ECRCC_EN); + } else { + ctl &= ~(PCIECAP_AER_CAPCTL_ECRCG_EN | + PCIECAP_AER_CAPCTL_ECRCC_EN); + } + + pci_cfg_write32(phb, 0, p->aercap + PCIECAP_AER_CAPCTL, ctl); +} + +static void phb3_root_port_init(struct phb *phb, struct pci_device *dev, + int ecap, int aercap) +{ + struct phb3 *p = phb_to_phb3(phb); + uint16_t bdfn = dev->bdfn; + uint16_t val16; + uint32_t val32; + + /* Use PHB's callback so that the UTL events will be masked + * or unmasked when the link is down or up. + */ + if (dev->slot && dev->slot->ops.prepare_link_change && + phb->slot && phb->slot->ops.prepare_link_change) + dev->slot->ops.prepare_link_change = + phb->slot->ops.prepare_link_change; + + /* Mask UTL link down event if root slot supports surprise + * hotplug as the event should be handled by hotplug driver + * instead of EEH subsystem. + */ + if (dev->slot && dev->slot->surprise_pluggable) + out_be64(p->regs + UTL_PCIE_PORT_IRQ_EN, 0xad42800000000000UL); + + /* Enable SERR and parity checking */ + pci_cfg_read16(phb, bdfn, PCI_CFG_CMD, &val16); + val16 |= (PCI_CFG_CMD_SERR_EN | PCI_CFG_CMD_PERR_RESP); + pci_cfg_write16(phb, bdfn, PCI_CFG_CMD, val16); + + /* Enable reporting various errors */ + if (!ecap) return; + pci_cfg_read16(phb, bdfn, ecap + PCICAP_EXP_DEVCTL, &val16); + val16 |= (PCICAP_EXP_DEVCTL_CE_REPORT | + PCICAP_EXP_DEVCTL_NFE_REPORT | + PCICAP_EXP_DEVCTL_FE_REPORT | + PCICAP_EXP_DEVCTL_UR_REPORT); + pci_cfg_write16(phb, bdfn, ecap + PCICAP_EXP_DEVCTL, val16); + + if (!aercap) return; + + /* Mask various unrecoverable errors. The link surprise down + * event should be masked when its PCI slot support surprise + * hotplug. The link surprise down event should be handled by + * PCI hotplug driver instead of EEH subsystem. 
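+ *
+ * (Besides the conditional surprise-down bit, the mask written
+ * below also covers poisoned TLPs, completion timeout/abort and
+ * ECRC errors.)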
+ */ + pci_cfg_read32(phb, bdfn, aercap + PCIECAP_AER_UE_MASK, &val32); + val32 |= (PCIECAP_AER_UE_MASK_POISON_TLP | + PCIECAP_AER_UE_MASK_COMPL_TIMEOUT | + PCIECAP_AER_UE_MASK_COMPL_ABORT | + PCIECAP_AER_UE_MASK_ECRC); + if (dev->slot && dev->slot->surprise_pluggable) + val32 |= PCIECAP_AER_UE_MASK_SURPRISE_DOWN; + pci_cfg_write32(phb, bdfn, aercap + PCIECAP_AER_UE_MASK, val32); + + /* Report various unrecoverable errors as fatal errors */ + pci_cfg_read32(phb, bdfn, aercap + PCIECAP_AER_UE_SEVERITY, &val32); + val32 |= (PCIECAP_AER_UE_SEVERITY_DLLP | + PCIECAP_AER_UE_SEVERITY_SURPRISE_DOWN | + PCIECAP_AER_UE_SEVERITY_FLOW_CTL_PROT | + PCIECAP_AER_UE_SEVERITY_UNEXP_COMPL | + PCIECAP_AER_UE_SEVERITY_RECV_OVFLOW | + PCIECAP_AER_UE_SEVERITY_MALFORMED_TLP); + pci_cfg_write32(phb, bdfn, aercap + PCIECAP_AER_UE_SEVERITY, val32); + + /* Mask various recoverable errors */ + pci_cfg_read32(phb, bdfn, aercap + PCIECAP_AER_CE_MASK, &val32); + val32 |= PCIECAP_AER_CE_MASK_ADV_NONFATAL; + pci_cfg_write32(phb, bdfn, aercap + PCIECAP_AER_CE_MASK, val32); + + /* Enable ECRC check */ + phb3_enable_ecrc(phb, true); + + /* Enable all error reporting */ + pci_cfg_read32(phb, bdfn, aercap + PCIECAP_AER_RERR_CMD, &val32); + val32 |= (PCIECAP_AER_RERR_CMD_FE | + PCIECAP_AER_RERR_CMD_NFE | + PCIECAP_AER_RERR_CMD_CE); + pci_cfg_write32(phb, bdfn, aercap + PCIECAP_AER_RERR_CMD, val32); +} + +static void phb3_switch_port_init(struct phb *phb, + struct pci_device *dev, + int ecap, int aercap) +{ + struct phb3 *p = phb_to_phb3(phb); + uint16_t bdfn = dev->bdfn; + uint16_t val16; + uint32_t val32; + + /* Enable SERR and parity checking and disable INTx */ + pci_cfg_read16(phb, bdfn, PCI_CFG_CMD, &val16); + val16 |= (PCI_CFG_CMD_PERR_RESP | + PCI_CFG_CMD_SERR_EN | + PCI_CFG_CMD_INTx_DIS); + pci_cfg_write16(phb, bdfn, PCI_CFG_CMD, val16); + + /* Disable partity error and enable system error */ + pci_cfg_read16(phb, bdfn, PCI_CFG_BRCTL, &val16); + val16 &= ~PCI_CFG_BRCTL_PERR_RESP_EN; + val16 |= PCI_CFG_BRCTL_SERR_EN; + pci_cfg_write16(phb, bdfn, PCI_CFG_BRCTL, val16); + + /* Enable reporting various errors */ + if (!ecap) return; + pci_cfg_read16(phb, bdfn, ecap + PCICAP_EXP_DEVCTL, &val16); + val16 |= (PCICAP_EXP_DEVCTL_CE_REPORT | + PCICAP_EXP_DEVCTL_NFE_REPORT | + PCICAP_EXP_DEVCTL_FE_REPORT); + /* HW279570 - Disable reporting of correctable errors */ + val16 &= ~PCICAP_EXP_DEVCTL_CE_REPORT; + pci_cfg_write16(phb, bdfn, ecap + PCICAP_EXP_DEVCTL, val16); + + /* Unmask all unrecoverable errors for upstream port. For + * downstream port, the surprise link down is masked because + * it should be handled by hotplug driver instead of EEH + * subsystem. 
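+ *
+ * (Concretely: hot-pluggable downstream ports get a UE mask of
+ * just the surprise-down bit, every other switch port gets a UE
+ * mask of 0.)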
+ */ + if (!aercap) return; + if (dev->dev_type == PCIE_TYPE_SWITCH_DNPORT && + dev->slot && dev->slot->surprise_pluggable) + pci_cfg_write32(phb, bdfn, aercap + PCIECAP_AER_UE_MASK, + PCIECAP_AER_UE_MASK_SURPRISE_DOWN); + else + pci_cfg_write32(phb, bdfn, aercap + PCIECAP_AER_UE_MASK, 0x0); + + /* Severity of unrecoverable errors */ + if (dev->dev_type == PCIE_TYPE_SWITCH_UPPORT) + val32 = (PCIECAP_AER_UE_SEVERITY_DLLP | + PCIECAP_AER_UE_SEVERITY_SURPRISE_DOWN | + PCIECAP_AER_UE_SEVERITY_FLOW_CTL_PROT | + PCIECAP_AER_UE_SEVERITY_RECV_OVFLOW | + PCIECAP_AER_UE_SEVERITY_MALFORMED_TLP | + PCIECAP_AER_UE_SEVERITY_INTERNAL); + else + val32 = (PCIECAP_AER_UE_SEVERITY_FLOW_CTL_PROT | + PCIECAP_AER_UE_SEVERITY_INTERNAL); + pci_cfg_write32(phb, bdfn, aercap + PCIECAP_AER_UE_SEVERITY, val32); + + /* + * Mask various correctable errors + * + * On Murano and Venice DD1.0 we disable emission of corrected + * error messages to the PHB completely to workaround errata + * HW257476 causing the loss of tags. + */ + if (p->rev < PHB3_REV_MURANO_DD20) + val32 = 0xffffffff; + else + val32 = PCIECAP_AER_CE_MASK_ADV_NONFATAL; + pci_cfg_write32(phb, bdfn, aercap + PCIECAP_AER_CE_MASK, val32); + + /* Enable ECRC generation and disable ECRC check */ + pci_cfg_read32(phb, bdfn, aercap + PCIECAP_AER_CAPCTL, &val32); + val32 |= PCIECAP_AER_CAPCTL_ECRCG_EN; + val32 &= ~PCIECAP_AER_CAPCTL_ECRCC_EN; + pci_cfg_write32(phb, bdfn, aercap + PCIECAP_AER_CAPCTL, val32); +} + +static void phb3_endpoint_init(struct phb *phb, + struct pci_device *dev, + int ecap, int aercap) +{ + struct phb3 *p = phb_to_phb3(phb); + uint16_t bdfn = dev->bdfn; + uint16_t val16; + uint32_t val32; + + /* Enable SERR and parity checking */ + pci_cfg_read16(phb, bdfn, PCI_CFG_CMD, &val16); + val16 |= (PCI_CFG_CMD_PERR_RESP | + PCI_CFG_CMD_SERR_EN); + pci_cfg_write16(phb, bdfn, PCI_CFG_CMD, val16); + + /* Enable reporting various errors */ + if (!ecap) return; + pci_cfg_read16(phb, bdfn, ecap + PCICAP_EXP_DEVCTL, &val16); + val16 &= ~PCICAP_EXP_DEVCTL_CE_REPORT; + val16 |= (PCICAP_EXP_DEVCTL_NFE_REPORT | + PCICAP_EXP_DEVCTL_FE_REPORT | + PCICAP_EXP_DEVCTL_UR_REPORT); + /* HW279570 - Disable reporting of correctable errors */ + val16 &= ~PCICAP_EXP_DEVCTL_CE_REPORT; + pci_cfg_write16(phb, bdfn, ecap + PCICAP_EXP_DEVCTL, val16); + + /* + * On Murano and Venice DD1.0 we disable emission of corrected + * error messages to the PHB completely to workaround errata + * HW257476 causing the loss of tags. + */ + if (p->rev < PHB3_REV_MURANO_DD20) + pci_cfg_write32(phb, bdfn, aercap + PCIECAP_AER_CE_MASK, + 0xffffffff); + + /* Enable ECRC generation and check */ + pci_cfg_read32(phb, bdfn, aercap + PCIECAP_AER_CAPCTL, &val32); + val32 |= (PCIECAP_AER_CAPCTL_ECRCG_EN | + PCIECAP_AER_CAPCTL_ECRCC_EN); + pci_cfg_write32(phb, bdfn, aercap + PCIECAP_AER_CAPCTL, val32); +} + +static int64_t phb3_pcicfg_no_dstate(void *dev __unused, + struct pci_cfg_reg_filter *pcrf, + uint32_t offset, uint32_t len __unused, + uint32_t *data __unused, bool write) +{ + uint32_t loff = offset - pcrf->start; + + /* Disable D-state change on children of the PHB. 
For now we + * simply block all writes to the PM control/status + */ + if (write && loff >= 4 && loff < 6) + return OPAL_SUCCESS; + + return OPAL_PARTIAL; +} + +static void phb3_check_device_quirks(struct phb *phb, struct pci_device *dev) +{ + struct phb3 *p = phb_to_phb3(phb); + + if (dev->primary_bus != 0 && + dev->primary_bus != 1) + return; + + if (dev->primary_bus == 1) { + u64 modectl; + + /* + * For these adapters, if they are directly under the PHB, we + * adjust the disable_wr_scope_group bit for performances + * + * 15b3:1003 Mellanox Travis3-EN (CX3) + * 15b3:1011 Mellanox HydePark (ConnectIB) + * 15b3:1013 Mellanox GlacierPark (CX4) + */ + xscom_read(p->chip_id, p->pe_xscom + 0x0b, &modectl); + if (PCI_VENDOR_ID(dev->vdid) == 0x15b3 && + (PCI_DEVICE_ID(dev->vdid) == 0x1003 || + PCI_DEVICE_ID(dev->vdid) == 0x1011 || + PCI_DEVICE_ID(dev->vdid) == 0x1013)) + modectl |= PPC_BIT(14); + else + modectl &= ~PPC_BIT(14); + xscom_write(p->chip_id, p->pe_xscom + 0x0b, modectl); + + /* + * Naples has a problem with D-states at least on Mellanox CX4, + * disable changing D-state on Naples like we do it for PHB4. + */ + if (PHB3_IS_NAPLES(p) && + pci_has_cap(dev, PCI_CFG_CAP_ID_PM, false)) { + pci_add_cfg_reg_filter(dev, + pci_cap(dev, PCI_CFG_CAP_ID_PM, false), + 8, + PCI_REG_FLAG_WRITE, + phb3_pcicfg_no_dstate); + } + } else if (dev->primary_bus == 0) { + /* + * Emulate the prefetchable window of the root port + * when the corresponding HW registers are readonly. + * + * 1014:03dc Root port on P8/P8E/P8NVL + */ + if (PCI_VENDOR_ID(dev->vdid) == 0x1014 && + PCI_DEVICE_ID(dev->vdid) == 0x03dc) { + uint32_t pref_hi, tmp; + + pci_cfg_read32(phb, dev->bdfn, + PCI_CFG_PREF_MEM_BASE_U32, &pref_hi); + pci_cfg_write32(phb, dev->bdfn, + PCI_CFG_PREF_MEM_BASE_U32, ~pref_hi); + pci_cfg_read32(phb, dev->bdfn, + PCI_CFG_PREF_MEM_BASE_U32, &tmp); + pci_cfg_write32(phb, dev->bdfn, + PCI_CFG_PREF_MEM_BASE_U32, pref_hi); + if (tmp == pref_hi) + pci_add_cfg_reg_filter(dev, + PCI_CFG_PREF_MEM_BASE_U32, 12, + PCI_REG_FLAG_READ | PCI_REG_FLAG_WRITE, + phb3_pcicfg_rc_pref_window); + /* Add filter to control link speed */ + pci_add_cfg_reg_filter(dev, + 0x58, 4, + PCI_REG_FLAG_WRITE, + phb3_pcicfg_rc_link_speed); + } + } +} + +static inline int phb3_should_disable_ecrc(struct pci_device *pd) +{ + /* + * When we have PMC PCIe switch, we need disable ECRC on root port. + * Otherwise, the adapters behind the switch downstream ports might + * not probed successfully. + */ + if (pd->vdid == 0x854611f8) + return true; + + return false; +} + +static int phb3_device_init(struct phb *phb, + struct pci_device *dev, + void *data) +{ + struct phb3 *p = phb_to_phb3(phb); + int ecap, aercap; + + /* Some special adapter tweaks for devices directly under the PHB */ + phb3_check_device_quirks(phb, dev); + + /* Common initialization for the device */ + pci_device_init(phb, dev); + + ecap = pci_cap(dev, PCI_CFG_CAP_ID_EXP, false); + aercap = pci_cap(dev, PCIECAP_ID_AER, true); + if (dev->dev_type == PCIE_TYPE_ROOT_PORT) + phb3_root_port_init(phb, dev, ecap, aercap); + else if (dev->dev_type == PCIE_TYPE_SWITCH_UPPORT || + dev->dev_type == PCIE_TYPE_SWITCH_DNPORT) + phb3_switch_port_init(phb, dev, ecap, aercap); + else + phb3_endpoint_init(phb, dev, ecap, aercap); + + /* + * Check if we need disable ECRC functionality on root port. It + * only happens when PCI topology changes, meaning it's skipped + * when reinitializing PCI device after EEH reset. 
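+ *
+ * (The reinit path passes the pci_device itself as "data", see
+ * phb3_pci_reinit(), so the !data check below only fires on a
+ * fresh probe; no_ecrc_devs then refcounts the offending devices
+ * and phb3_device_remove() re-enables ECRC once the last one is
+ * gone.)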
+ */ + if (!data && phb3_should_disable_ecrc(dev)) { + if (p->no_ecrc_devs++ == 0) + phb3_enable_ecrc(phb, false); + } + + return 0; +} + +static void phb3_device_remove(struct phb *phb, struct pci_device *pd) +{ + struct phb3 *p = phb_to_phb3(phb); + + if (!phb3_should_disable_ecrc(pd) || p->no_ecrc_devs == 0) + return; + + if (--p->no_ecrc_devs == 0) + phb3_enable_ecrc(phb, true); +} + +static int64_t phb3_pci_reinit(struct phb *phb, uint64_t scope, uint64_t data) +{ + struct pci_device *pd; + uint16_t bdfn = data; + int ret; + + if (scope != OPAL_REINIT_PCI_DEV) + return OPAL_PARAMETER; + + pd = pci_find_dev(phb, bdfn); + if (!pd) + return OPAL_PARAMETER; + + ret = phb3_device_init(phb, pd, pd); + if (ret) + return OPAL_HARDWARE; + + return OPAL_SUCCESS; +} + +/* Clear IODA cache tables */ +static void phb3_init_ioda_cache(struct phb3 *p) +{ + uint32_t i; + uint64_t *data64; + + /* + * RTT and PELTV. RTE should be 0xFF's to indicate + * invalid PE# for the corresponding RID. + * + * Note: Instead we set all RTE entries to 0x00 to + * work around a problem where PE lookups might be + * done before Linux has established valid PE's + * (during PCI probing). We can revisit that once/if + * Linux has been fixed to always setup valid PEs. + * + * The value 0x00 corresponds to the default PE# Linux + * uses to check for config space freezes before it + * has assigned PE# to busses. + * + * WARNING: Additionally, we need to be careful, there's + * a HW issue, if we get an MSI on an RTT entry that is + * FF, things will go bad. We need to ensure we don't + * ever let a live FF RTT even temporarily when resetting + * for EEH etc... (HW278969). + */ + for (i = 0; i < ARRAY_SIZE(p->rte_cache); i++) + p->rte_cache[i] = PHB3_RESERVED_PE_NUM; + memset(p->peltv_cache, 0x0, sizeof(p->peltv_cache)); + + /* Disable all LSI */ + for (i = 0; i < ARRAY_SIZE(p->lxive_cache); i++) { + data64 = &p->lxive_cache[i]; + *data64 = SETFIELD(IODA2_LXIVT_PRIORITY, 0ul, 0xff); + *data64 = SETFIELD(IODA2_LXIVT_SERVER, *data64, 0x0); + } + + /* Diable all MSI */ + for (i = 0; i < ARRAY_SIZE(p->ive_cache); i++) { + data64 = &p->ive_cache[i]; + *data64 = SETFIELD(IODA2_IVT_PRIORITY, 0ul, 0xff); + *data64 = SETFIELD(IODA2_IVT_SERVER, *data64, 0x0); + } + + /* Clear TVT */ + memset(p->tve_cache, 0x0, sizeof(p->tve_cache)); + /* Clear M32 domain */ + memset(p->m32d_cache, 0x0, sizeof(p->m32d_cache)); + /* Clear M64 domain */ + memset(p->m64b_cache, 0x0, sizeof(p->m64b_cache)); +} + +/* phb3_ioda_reset - Reset the IODA tables + * + * @purge: If true, the cache is cleared and the cleared values + * are applied to HW. If false, the cached values are + * applied to HW + * + * This reset the IODA tables in the PHB. 
It is called at + * initialization time, on PHB reset, and can be called + * explicitly from OPAL + */ +static int64_t phb3_ioda_reset(struct phb *phb, bool purge) +{ + struct phb3 *p = phb_to_phb3(phb); + uint64_t server, prio; + uint64_t *pdata64, data64; + uint32_t i; + + if (purge) { + prlog(PR_DEBUG, "PHB%x: Purging all IODA tables...\n", + p->phb.opal_id); + phb3_init_ioda_cache(p); + } + + /* Init_27..28 - LIXVT */ + phb3_ioda_sel(p, IODA2_TBL_LXIVT, 0, true); + for (i = 0; i < ARRAY_SIZE(p->lxive_cache); i++) { + data64 = p->lxive_cache[i]; + server = GETFIELD(IODA2_LXIVT_SERVER, data64); + prio = GETFIELD(IODA2_LXIVT_PRIORITY, data64); + data64 = SETFIELD(IODA2_LXIVT_SERVER, data64, server); + data64 = SETFIELD(IODA2_LXIVT_PRIORITY, data64, prio); + out_be64(p->regs + PHB_IODA_DATA0, data64); + } + + /* Init_29..30 - MRT */ + phb3_ioda_sel(p, IODA2_TBL_MRT, 0, true); + for (i = 0; i < 8; i++) + out_be64(p->regs + PHB_IODA_DATA0, 0); + + /* Init_31..32 - TVT */ + phb3_ioda_sel(p, IODA2_TBL_TVT, 0, true); + for (i = 0; i < ARRAY_SIZE(p->tve_cache); i++) + out_be64(p->regs + PHB_IODA_DATA0, p->tve_cache[i]); + + /* Init_33..34 - M64BT */ + phb3_ioda_sel(p, IODA2_TBL_M64BT, 0, true); + for (i = 0; i < ARRAY_SIZE(p->m64b_cache); i++) + out_be64(p->regs + PHB_IODA_DATA0, p->m64b_cache[i]); + + /* Init_35..36 - M32DT */ + phb3_ioda_sel(p, IODA2_TBL_M32DT, 0, true); + for (i = 0; i < ARRAY_SIZE(p->m32d_cache); i++) + out_be64(p->regs + PHB_IODA_DATA0, p->m32d_cache[i]); + + /* Load RTE, PELTV */ + if (p->tbl_rtt) + memcpy((void *)p->tbl_rtt, p->rte_cache, RTT_TABLE_SIZE); + if (p->tbl_peltv) + memcpy((void *)p->tbl_peltv, p->peltv_cache, PELTV_TABLE_SIZE); + + /* Load IVT */ + if (p->tbl_ivt) { + pdata64 = (uint64_t *)p->tbl_ivt; + for (i = 0; i < IVT_TABLE_ENTRIES; i++) + pdata64[i * IVT_TABLE_STRIDE] = p->ive_cache[i]; + } + + /* Invalidate RTE, IVE, TCE cache */ + out_be64(p->regs + PHB_RTC_INVALIDATE, PHB_RTC_INVALIDATE_ALL); + out_be64(p->regs + PHB_IVC_INVALIDATE, PHB_IVC_INVALIDATE_ALL); + out_be64(p->regs + PHB_TCE_KILL, PHB_TCE_KILL_ALL); + + /* Clear RBA */ + if (p->rev >= PHB3_REV_MURANO_DD20) { + phb3_ioda_sel(p, IODA2_TBL_RBA, 0, true); + for (i = 0; i < 32; i++) + out_be64(p->regs + PHB_IODA_DATA0, 0x0ul); + } + + /* Clear PEST & PEEV */ + for (i = 0; i < PHB3_MAX_PE_NUM; i++) { + uint64_t pesta, pestb; + + phb3_ioda_sel(p, IODA2_TBL_PESTA, i, false); + pesta = in_be64(p->regs + PHB_IODA_DATA0); + out_be64(p->regs + PHB_IODA_DATA0, 0); + phb3_ioda_sel(p, IODA2_TBL_PESTB, i, false); + pestb = in_be64(p->regs + PHB_IODA_DATA0); + out_be64(p->regs + PHB_IODA_DATA0, 0); + + if ((pesta & IODA2_PESTA_MMIO_FROZEN) || + (pestb & IODA2_PESTB_DMA_STOPPED)) + PHBDBG(p, "Frozen PE#%x (%s - %s)\n", + i, (pesta & IODA2_PESTA_MMIO_FROZEN) ? "DMA" : "", + (pestb & IODA2_PESTB_DMA_STOPPED) ? "MMIO" : ""); + } + + phb3_ioda_sel(p, IODA2_TBL_PEEV, 0, true); + for (i = 0; i < 4; i++) + out_be64(p->regs + PHB_IODA_DATA0, 0); + + return OPAL_SUCCESS; +} + +/* + * Clear anything we have in PAPR Error Injection registers. Though + * the spec says the PAPR error injection should be one-shot without + * the "sticky" bit. However, that's false according to the experiments + * I had. So we have to clear it at appropriate point in kernel to + * avoid endless frozen PE. 
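+ *
+ * "Clearing" here simply means zeroing PHB_PAPR_ERR_INJ_CTL,
+ * _ADDR and _MASK, which is all phb3_papr_errinjct_reset() below
+ * does.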
+ */ +static int64_t phb3_papr_errinjct_reset(struct phb *phb) +{ + struct phb3 *p = phb_to_phb3(phb); + + out_be64(p->regs + PHB_PAPR_ERR_INJ_CTL, 0x0ul); + out_be64(p->regs + PHB_PAPR_ERR_INJ_ADDR, 0x0ul); + out_be64(p->regs + PHB_PAPR_ERR_INJ_MASK, 0x0ul); + + return OPAL_SUCCESS; +} + +static int64_t phb3_set_phb_mem_window(struct phb *phb, + uint16_t window_type, + uint16_t window_num, + uint64_t addr, + uint64_t __unused pci_addr, + uint64_t size) +{ + struct phb3 *p = phb_to_phb3(phb); + uint64_t data64; + + /* + * By design, PHB3 doesn't support IODT any more. + * Besides, we can't enable M32 BAR as well. So + * the function is used to do M64 mapping and each + * BAR is supposed to be shared by all PEs. + */ + switch (window_type) { + case OPAL_IO_WINDOW_TYPE: + case OPAL_M32_WINDOW_TYPE: + return OPAL_UNSUPPORTED; + case OPAL_M64_WINDOW_TYPE: + if (window_num >= 16) + return OPAL_PARAMETER; + + data64 = p->m64b_cache[window_num]; + if (data64 & IODA2_M64BT_SINGLE_PE) { + if ((addr & 0x1FFFFFFul) || + (size & 0x1FFFFFFul)) + return OPAL_PARAMETER; + } else { + if ((addr & 0xFFFFFul) || + (size & 0xFFFFFul)) + return OPAL_PARAMETER; + } + + /* size should be 2^N */ + if (!size || size & (size-1)) + return OPAL_PARAMETER; + + /* address should be size aligned */ + if (addr & (size - 1)) + return OPAL_PARAMETER; + + break; + default: + return OPAL_PARAMETER; + } + + if (data64 & IODA2_M64BT_SINGLE_PE) { + data64 = SETFIELD(IODA2_M64BT_SINGLE_BASE, data64, + addr >> 25); + data64 = SETFIELD(IODA2_M64BT_SINGLE_MASK, data64, + 0x20000000 - (size >> 25)); + } else { + data64 = SETFIELD(IODA2_M64BT_BASE, data64, + addr >> 20); + data64 = SETFIELD(IODA2_M64BT_MASK, data64, + 0x40000000 - (size >> 20)); + } + p->m64b_cache[window_num] = data64; + + return OPAL_SUCCESS; +} + +/* + * For one specific M64 BAR, it can be shared by all PEs, + * or owned by single PE exclusively. + */ +static int64_t phb3_phb_mmio_enable(struct phb *phb, + uint16_t window_type, + uint16_t window_num, + uint16_t enable) +{ + struct phb3 *p = phb_to_phb3(phb); + uint64_t data64, base, mask; + + /* + * By design, PHB3 doesn't support IODT any more. + * Besides, we can't enable M32 BAR as well. So + * the function is used to do M64 mapping and each + * BAR is supposed to be shared by all PEs. + */ + switch (window_type) { + case OPAL_IO_WINDOW_TYPE: + case OPAL_M32_WINDOW_TYPE: + return OPAL_UNSUPPORTED; + case OPAL_M64_WINDOW_TYPE: + if (window_num >= 16 || + enable > OPAL_ENABLE_M64_NON_SPLIT) + return OPAL_PARAMETER; + break; + default: + return OPAL_PARAMETER; + } + + /* + * We need check the base/mask while enabling + * the M64 BAR. 
Otherwise, invalid base/mask + * might cause fenced AIB unintentionally + */ + data64 = p->m64b_cache[window_num]; + switch (enable) { + case OPAL_DISABLE_M64: + data64 &= ~IODA2_M64BT_SINGLE_PE; + data64 &= ~IODA2_M64BT_ENABLE; + break; + case OPAL_ENABLE_M64_SPLIT: + if (data64 & IODA2_M64BT_SINGLE_PE) + return OPAL_PARAMETER; + base = GETFIELD(IODA2_M64BT_BASE, data64); + base = (base << 20); + mask = GETFIELD(IODA2_M64BT_MASK, data64); + if (base < p->mm0_base || !mask) + return OPAL_PARTIAL; + + data64 |= IODA2_M64BT_ENABLE; + break; + case OPAL_ENABLE_M64_NON_SPLIT: + if (!(data64 & IODA2_M64BT_SINGLE_PE)) + return OPAL_PARAMETER; + base = GETFIELD(IODA2_M64BT_SINGLE_BASE, data64); + base = (base << 25); + mask = GETFIELD(IODA2_M64BT_SINGLE_MASK, data64); + if (base < p->mm0_base || !mask) + return OPAL_PARTIAL; + + data64 |= IODA2_M64BT_SINGLE_PE; + data64 |= IODA2_M64BT_ENABLE; + break; + } + + /* Update HW and cache */ + phb3_ioda_sel(p, IODA2_TBL_M64BT, window_num, false); + out_be64(p->regs + PHB_IODA_DATA0, data64); + p->m64b_cache[window_num] = data64; + return OPAL_SUCCESS; +} + +static int64_t phb3_map_pe_mmio_window(struct phb *phb, + uint64_t pe_number, + uint16_t window_type, + uint16_t window_num, + uint16_t segment_num) +{ + struct phb3 *p = phb_to_phb3(phb); + uint64_t data64, *cache; + + if (pe_number >= PHB3_MAX_PE_NUM) + return OPAL_PARAMETER; + + /* + * PHB3 doesn't support IODT any more. On the other + * hand, PHB3 support M64DT with much more flexibility. + * we need figure it out later. At least, we never use + * M64DT in kernel. + */ + switch(window_type) { + case OPAL_IO_WINDOW_TYPE: + return OPAL_UNSUPPORTED; + case OPAL_M32_WINDOW_TYPE: + if (window_num != 0 || segment_num >= PHB3_MAX_PE_NUM) + return OPAL_PARAMETER; + + cache = &p->m32d_cache[segment_num]; + phb3_ioda_sel(p, IODA2_TBL_M32DT, segment_num, false); + out_be64(p->regs + PHB_IODA_DATA0, + SETFIELD(IODA2_M32DT_PE, 0ull, pe_number)); + *cache = SETFIELD(IODA2_M32DT_PE, 0ull, pe_number); + + break; + case OPAL_M64_WINDOW_TYPE: + if (window_num >= 16) + return OPAL_PARAMETER; + cache = &p->m64b_cache[window_num]; + data64 = *cache; + + /* The BAR shouldn't be enabled yet */ + if (data64 & IODA2_M64BT_ENABLE) + return OPAL_PARTIAL; + + data64 |= IODA2_M64BT_SINGLE_PE; + data64 = SETFIELD(IODA2_M64BT_PE_HI, data64, pe_number >> 5); + data64 = SETFIELD(IODA2_M64BT_PE_LOW, data64, pe_number); + *cache = data64; + + break; + default: + return OPAL_PARAMETER; + } + + return OPAL_SUCCESS; +} + +static int64_t phb3_map_pe_dma_window(struct phb *phb, + uint64_t pe_number, + uint16_t window_id, + uint16_t tce_levels, + uint64_t tce_table_addr, + uint64_t tce_table_size, + uint64_t tce_page_size) +{ + struct phb3 *p = phb_to_phb3(phb); + uint64_t tts_encoded; + uint64_t data64 = 0; + + /* + * Sanity check. 
We currently only support "2 window per PE" mode + * ie, only bit 59 of the PCI address is used to select the window + */ + if (pe_number >= PHB3_MAX_PE_NUM || + (window_id >> 1) != pe_number) + return OPAL_PARAMETER; + + /* + * tce_table_size == 0 is used to disable an entry, in this case + * we ignore other arguments + */ + if (tce_table_size == 0) { + phb3_ioda_sel(p, IODA2_TBL_TVT, window_id, false); + out_be64(p->regs + PHB_IODA_DATA0, 0); + p->tve_cache[window_id] = 0; + return OPAL_SUCCESS; + } + + /* Additional arguments validation */ + if (tce_levels < 1 || tce_levels > 5 || + !is_pow2(tce_table_size) || + tce_table_size < 0x1000) + return OPAL_PARAMETER; + + /* Encode TCE table size */ + data64 = SETFIELD(IODA2_TVT_TABLE_ADDR, 0ul, tce_table_addr >> 12); + tts_encoded = ilog2(tce_table_size) - 11; + if (tts_encoded > 31) + return OPAL_PARAMETER; + data64 = SETFIELD(IODA2_TVT_TCE_TABLE_SIZE, data64, tts_encoded); + + /* Encode TCE page size */ + switch (tce_page_size) { + case 0x1000: /* 4K */ + data64 = SETFIELD(IODA2_TVT_IO_PSIZE, data64, 1); + break; + case 0x10000: /* 64K */ + data64 = SETFIELD(IODA2_TVT_IO_PSIZE, data64, 5); + break; + case 0x1000000: /* 16M */ + data64 = SETFIELD(IODA2_TVT_IO_PSIZE, data64, 13); + break; + case 0x10000000: /* 256M */ + data64 = SETFIELD(IODA2_TVT_IO_PSIZE, data64, 17); + break; + default: + return OPAL_PARAMETER; + } + + /* Encode number of levels */ + data64 = SETFIELD(IODA2_TVT_NUM_LEVELS, data64, tce_levels - 1); + + phb3_ioda_sel(p, IODA2_TBL_TVT, window_id, false); + out_be64(p->regs + PHB_IODA_DATA0, data64); + p->tve_cache[window_id] = data64; + + return OPAL_SUCCESS; +} + +static int64_t phb3_map_pe_dma_window_real(struct phb *phb, + uint64_t pe_number, + uint16_t window_id, + uint64_t pci_start_addr, + uint64_t pci_mem_size) +{ + struct phb3 *p = phb_to_phb3(phb); + uint64_t end; + uint64_t tve; + + if (pe_number >= PHB3_MAX_PE_NUM || + (window_id >> 1) != pe_number) + return OPAL_PARAMETER; + + if (pci_mem_size) { + /* Enable */ + + /* + * Check that the start address has the right TVE index, + * we only support the 1 bit mode where each PE has 2 + * TVEs + */ + if ((pci_start_addr >> 59) != (window_id & 1)) + return OPAL_PARAMETER; + pci_start_addr &= ((1ull << 59) - 1); + end = pci_start_addr + pci_mem_size; + + /* We have to be 16M aligned */ + if ((pci_start_addr & 0x00ffffff) || + (pci_mem_size & 0x00ffffff)) + return OPAL_PARAMETER; + + /* + * It *looks* like this is the max we can support (we need + * to verify this. Also we are not checking for rollover, + * but then we aren't trying too hard to protect ourselves + * againt a completely broken OS. 
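+ *
+ * (0x0003ffffffffffffull is just (1ull << 50) - 1: the TVE
+ * encoding below only carries PCI address bits 49:24, i.e. a
+ * 50-bit range in 16M granules, so larger windows simply cannot
+ * be represented.)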
+ */ + if (end > 0x0003ffffffffffffull) + return OPAL_PARAMETER; + + /* + * Put start address bits 49:24 into TVE[52:53]||[0:23] + * and end address bits 49:24 into TVE[54:55]||[24:47] + * and set TVE[51] + */ + tve = (pci_start_addr << 16) & (0xffffffull << 48); + tve |= (pci_start_addr >> 38) & (3ull << 10); + tve |= (end >> 8) & (0xfffffful << 16); + tve |= (end >> 40) & (3ull << 8); + tve |= PPC_BIT(51); + } else { + /* Disable */ + tve = 0; + } + + phb3_ioda_sel(p, IODA2_TBL_TVT, window_id, false); + out_be64(p->regs + PHB_IODA_DATA0, tve); + p->tve_cache[window_id] = tve; + + return OPAL_SUCCESS; +} + +static bool phb3_pci_msi_check_q(struct phb3 *p, uint32_t ive_num) +{ + uint64_t ive, ivc, ffi, state; + uint8_t *q_byte; + + /* Each IVE has 16-bytes or 128-bytes */ + ive = p->tbl_ivt + (ive_num * IVT_TABLE_STRIDE * 8); + q_byte = (uint8_t *)(ive + 5); + + /* + * Handle Q bit. If the Q bit doesn't show up, + * we would have CI load to make that. + */ + if (!(*q_byte & 0x1)) { + /* Read from random PHB reg to force flush */ + in_be64(p->regs + PHB_IVC_UPDATE); + + /* Order with subsequent read of Q */ + sync(); + + /* Q still not set, bail out */ + if (!(*q_byte & 0x1)) + return false; + } + + /* Lock FFI and send interrupt */ + while (1) { + state = in_be64(p->regs + PHB_FFI_LOCK); + if (!state) + break; + if (state == ~0ULL) /* PHB Fenced */ + return false; + } + + /* Clear Q bit and update IVC */ + *q_byte = 0; + ivc = SETFIELD(PHB_IVC_UPDATE_SID, 0ul, ive_num) | + PHB_IVC_UPDATE_ENABLE_Q; + out_be64(p->regs + PHB_IVC_UPDATE, ivc); + + /* + * Resend interrupt. Note the lock clear bit isn't documented in + * the PHB3 spec and thus is probably unnecessary but it's in + * IODA2 so let's be safe here, it won't hurt to set it + */ + ffi = SETFIELD(PHB_FFI_REQUEST_ISN, 0ul, ive_num) | PHB_FFI_LOCK_CLEAR; + out_be64(p->regs + PHB_FFI_REQUEST, ffi); + + return true; +} + +static void phb3_pci_msi_flush_ive(struct phb3 *p, uint32_t ive_num) +{ + asm volatile("dcbf %0,%1" + : + : "b" (p->tbl_ivt), "r" (ive_num * IVT_TABLE_STRIDE * 8) + : "memory"); +} + +static int64_t phb3_pci_msi_eoi(struct phb *phb, + uint32_t hwirq) +{ + struct phb3 *p = phb_to_phb3(phb); + uint32_t ive_num = PHB3_IRQ_NUM(hwirq); + uint64_t ive, ivc; + uint8_t *p_byte, gp, gen, newgen; + + /* OS might not configure IVT yet */ + if (!p->tbl_ivt) + return OPAL_HARDWARE; + + /* Each IVE has 16-bytes or 128-bytes */ + ive = p->tbl_ivt + (ive_num * IVT_TABLE_STRIDE * 8); + p_byte = (uint8_t *)(ive + 4); + + /* Read generation and P */ + gp = *p_byte; + gen = (gp >> 1) & 3; + newgen = (gen + 1) & 3; + + /* Increment generation count and clear P */ + *p_byte = newgen << 1; + + /* If at this point: + * - the IVC is invalid (due to high IRQ load) and + * - we get a new interrupt on this hwirq. + * Due to the new interrupt, the IVC will fetch from the IVT. + * This IVC reload will result in P set and gen=n+1. This + * interrupt may not actually be delievered at this point + * though. + * + * Software will then try to clear P in the IVC (out_be64 + * below). This could cause an interrupt to be lost because P + * is cleared in the IVC without the new interrupt being + * delivered. + * + * To avoid this race, we increment the generation count in + * the IVT when we clear P. When software writes the IVC with + * P cleared but with gen=n, the IVC won't actually clear P + * because gen doesn't match what it just cached from the IVT. + * Hence we don't lose P being set. 
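+ *
+ * A rough timeline of the race being closed:
+ *
+ *   1. the IVC entry for this hwirq has been evicted (high load)
+ *   2. the EOI path above bumps gen in the IVT (n -> n+1) and
+ *      clears P there
+ *   3. a new MSI arrives; the IVC refetches from the IVT and
+ *      caches P=1, gen=n+1
+ *   4. the IVC update below carries GEN_MATCH=n, so the hardware
+ *      ignores the P-clear and the new interrupt is not lost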
+ */ + + /* Update the P bit in the IVC is gen count matches */ + ivc = SETFIELD(PHB_IVC_UPDATE_SID, 0ul, ive_num) | + PHB_IVC_UPDATE_ENABLE_P | + PHB_IVC_UPDATE_ENABLE_GEN | + PHB_IVC_UPDATE_ENABLE_CON | + SETFIELD(PHB_IVC_UPDATE_GEN_MATCH, 0ul, gen) | + SETFIELD(PHB_IVC_UPDATE_GEN, 0ul, newgen); + /* out_be64 has a sync to order with the IVT update above */ + out_be64(p->regs + PHB_IVC_UPDATE, ivc); + + /* Handle Q bit */ + phb3_pci_msi_check_q(p, ive_num); + + phb3_pci_msi_flush_ive(p, ive_num); + + return OPAL_SUCCESS; +} + +static int64_t phb3_set_ive_pe(struct phb *phb, + uint64_t pe_number, + uint32_t ive_num) +{ + struct phb3 *p = phb_to_phb3(phb); + uint64_t *cache, ivep, data64; + uint16_t *pe_word; + + /* OS should enable the BAR in advance */ + if (!p->tbl_ivt) + return OPAL_HARDWARE; + + /* Each IVE reserves 128 bytes */ + if (pe_number >= PHB3_MAX_PE_NUM || + ive_num >= IVT_TABLE_ENTRIES) + return OPAL_PARAMETER; + + /* Update IVE cache */ + cache = &p->ive_cache[ive_num]; + *cache = SETFIELD(IODA2_IVT_PE, *cache, pe_number); + + /* Update in-memory IVE without clobbering P and Q */ + ivep = p->tbl_ivt + (ive_num * IVT_TABLE_STRIDE * 8); + pe_word = (uint16_t *)(ivep + 6); + *pe_word = pe_number; + + /* Invalidate IVC */ + data64 = SETFIELD(PHB_IVC_INVALIDATE_SID, 0ul, ive_num); + out_be64(p->regs + PHB_IVC_INVALIDATE, data64); + + return OPAL_SUCCESS; +} + +static int64_t phb3_get_msi_32(struct phb *phb __unused, + uint64_t pe_number, + uint32_t ive_num, + uint8_t msi_range, + uint32_t *msi_address, + uint32_t *message_data) +{ + /* + * Sanity check. We needn't check on mve_number (PE#) + * on PHB3 since the interrupt source is purely determined + * by its DMA address and data, but the check isn't + * harmful. + */ + if (pe_number >= PHB3_MAX_PE_NUM || + ive_num >= IVT_TABLE_ENTRIES || + msi_range != 1 || !msi_address|| !message_data) + return OPAL_PARAMETER; + + /* + * DMA address and data will form the IVE index. + * For more details, please refer to IODA2 spec. + */ + *msi_address = 0xFFFF0000 | ((ive_num << 4) & 0xFFFFFE0F); + *message_data = ive_num & 0x1F; + + return OPAL_SUCCESS; +} + +static int64_t phb3_get_msi_64(struct phb *phb __unused, + uint64_t pe_number, + uint32_t ive_num, + uint8_t msi_range, + uint64_t *msi_address, + uint32_t *message_data) +{ + /* Sanity check */ + if (pe_number >= PHB3_MAX_PE_NUM || + ive_num >= IVT_TABLE_ENTRIES || + msi_range != 1 || !msi_address || !message_data) + return OPAL_PARAMETER; + + /* + * DMA address and data will form the IVE index. + * For more details, please refer to IODA2 spec. + */ + *msi_address = (0x1ul << 60) | ((ive_num << 4) & 0xFFFFFFFFFFFFFE0Ful); + *message_data = ive_num & 0x1F; + + return OPAL_SUCCESS; +} + +static bool phb3_err_check_pbcq(struct phb3 *p) +{ + uint64_t nfir, mask, wof, val64; + int32_t class, bit; + uint64_t severity[PHB3_ERR_CLASS_LAST] = { + 0x0000000000000000UL, /* NONE */ + 0x018000F800000000UL, /* DEAD */ + 0x7E7DC70000000000UL, /* FENCED */ + 0x0000000000000000UL, /* ER */ + 0x0000000000000000UL /* INF */ + }; + + /* + * Read on NFIR to see if XSCOM is working properly. + * If XSCOM doesn't work well, we need take the PHB + * into account any more. + */ + xscom_read(p->chip_id, p->pe_xscom + 0x0, &nfir); + if (nfir == 0xffffffffffffffffUL) { + p->err.err_src = PHB3_ERR_SRC_NONE; + p->err.err_class = PHB3_ERR_CLASS_DEAD; + phb3_set_err_pending(p, true); + return true; + } + + /* + * Check WOF. We need handle unmasked errors firstly. 
+ * We probably run into the situation (on simulator) + * where we have asserted FIR bits, but WOF has nothing. + * For that case, we should check FIR as well. + */ + xscom_read(p->chip_id, p->pe_xscom + 0x3, &mask); + xscom_read(p->chip_id, p->pe_xscom + 0x8, &wof); + if (wof & ~mask) + wof &= ~mask; + if (!wof) { + if (nfir & ~mask) + nfir &= ~mask; + if (!nfir) + return false; + wof = nfir; + } + + /* We shouldn't hit class PHB3_ERR_CLASS_NONE */ + for (class = PHB3_ERR_CLASS_NONE; + class < PHB3_ERR_CLASS_LAST; + class++) { + val64 = wof & severity[class]; + if (!val64) + continue; + + for (bit = 0; bit < 64; bit++) { + if (val64 & PPC_BIT(bit)) { + p->err.err_src = PHB3_ERR_SRC_PBCQ; + p->err.err_class = class; + p->err.err_bit = 63 - bit; + phb3_set_err_pending(p, true); + return true; + } + } + } + + return false; +} + +static bool phb3_err_check_lem(struct phb3 *p) +{ + uint64_t fir, wof, mask, val64; + int32_t class, bit; + uint64_t severity[PHB3_ERR_CLASS_LAST] = { + 0x0000000000000000UL, /* NONE */ + 0x0000000000000000UL, /* DEAD */ + 0xADB670C980ADD151UL, /* FENCED */ + 0x000800107F500A2CUL, /* ER */ + 0x42018E2200002482UL /* INF */ + }; + + /* + * Read FIR. If XSCOM or ASB is frozen, we needn't + * go forward and just mark the PHB with dead state + */ + fir = phb3_read_reg_asb(p, PHB_LEM_FIR_ACCUM); + if (fir == 0xffffffffffffffffUL) { + p->err.err_src = PHB3_ERR_SRC_PHB; + p->err.err_class = PHB3_ERR_CLASS_DEAD; + phb3_set_err_pending(p, true); + return true; + } + + /* + * Check on WOF for the unmasked errors firstly. Under + * some situation where we run skiboot on simulator, + * we already had FIR bits asserted, but WOF is still zero. + * For that case, we check FIR directly. + */ + wof = phb3_read_reg_asb(p, PHB_LEM_WOF); + mask = phb3_read_reg_asb(p, PHB_LEM_ERROR_MASK); + if (wof & ~mask) + wof &= ~mask; + if (!wof) { + if (fir & ~mask) + fir &= ~mask; + if (!fir) + return false; + wof = fir; + } + + /* We shouldn't hit PHB3_ERR_CLASS_NONE */ + for (class = PHB3_ERR_CLASS_NONE; + class < PHB3_ERR_CLASS_LAST; + class++) { + val64 = wof & severity[class]; + if (!val64) + continue; + + for (bit = 0; bit < 64; bit++) { + if (val64 & PPC_BIT(bit)) { + p->err.err_src = PHB3_ERR_SRC_PHB; + p->err.err_class = class; + p->err.err_bit = 63 - bit; + phb3_set_err_pending(p, true); + return true; + } + } + } + + return false; +} + +/* + * The function can be called during error recovery for INF + * and ER class. For INF case, it's expected to be called + * when grabbing the error log. We will call it explicitly + * when clearing frozen PE state for ER case. + */ +static void phb3_err_ER_clear(struct phb3 *p) +{ + uint32_t val32; + uint64_t val64; + uint64_t fir = in_be64(p->regs + PHB_LEM_FIR_ACCUM); + + /* Rec 1: Grab the PCI config lock */ + /* Removed... unnecessary. 
We have our own lock here */ + + /* Rec 2/3/4: Take all inbound transactions */ + out_be64(p->regs + PHB_CONFIG_ADDRESS, 0x8000001c00000000ul); + out_be32(p->regs + PHB_CONFIG_DATA, 0x10000000); + + /* Rec 5/6/7: Clear pending non-fatal errors */ + out_be64(p->regs + PHB_CONFIG_ADDRESS, 0x8000005000000000ul); + val32 = in_be32(p->regs + PHB_CONFIG_DATA); + out_be32(p->regs + PHB_CONFIG_DATA, (val32 & 0xe0700000) | 0x0f000f00); + + /* Rec 8/9/10: Clear pending fatal errors for AER */ + out_be64(p->regs + PHB_CONFIG_ADDRESS, 0x8000010400000000ul); + out_be32(p->regs + PHB_CONFIG_DATA, 0xffffffff); + + /* Rec 11/12/13: Clear pending non-fatal errors for AER */ + out_be64(p->regs + PHB_CONFIG_ADDRESS, 0x8000011000000000ul); + out_be32(p->regs + PHB_CONFIG_DATA, 0xffffffff); + + /* Rec 22/23/24: Clear root port errors */ + out_be64(p->regs + PHB_CONFIG_ADDRESS, 0x8000013000000000ul); + out_be32(p->regs + PHB_CONFIG_DATA, 0xffffffff); + + /* Rec 25/26/27: Enable IO and MMIO bar */ + out_be64(p->regs + PHB_CONFIG_ADDRESS, 0x8000004000000000ul); + out_be32(p->regs + PHB_CONFIG_DATA, 0x470100f8); + + /* Rec 28: Release the PCI config lock */ + /* Removed... unnecessary. We have our own lock here */ + + /* Rec 29...34: Clear UTL errors */ + val64 = in_be64(p->regs + UTL_SYS_BUS_AGENT_STATUS); + out_be64(p->regs + UTL_SYS_BUS_AGENT_STATUS, val64); + val64 = in_be64(p->regs + UTL_PCIE_PORT_STATUS); + out_be64(p->regs + UTL_PCIE_PORT_STATUS, val64); + val64 = in_be64(p->regs + UTL_RC_STATUS); + out_be64(p->regs + UTL_RC_STATUS, val64); + + /* Rec 39...66: Clear PHB error trap */ + val64 = in_be64(p->regs + PHB_ERR_STATUS); + out_be64(p->regs + PHB_ERR_STATUS, val64); + out_be64(p->regs + PHB_ERR1_STATUS, 0x0ul); + out_be64(p->regs + PHB_ERR_LOG_0, 0x0ul); + out_be64(p->regs + PHB_ERR_LOG_1, 0x0ul); + + val64 = in_be64(p->regs + PHB_OUT_ERR_STATUS); + out_be64(p->regs + PHB_OUT_ERR_STATUS, val64); + out_be64(p->regs + PHB_OUT_ERR1_STATUS, 0x0ul); + out_be64(p->regs + PHB_OUT_ERR_LOG_0, 0x0ul); + out_be64(p->regs + PHB_OUT_ERR_LOG_1, 0x0ul); + + val64 = in_be64(p->regs + PHB_INA_ERR_STATUS); + out_be64(p->regs + PHB_INA_ERR_STATUS, val64); + out_be64(p->regs + PHB_INA_ERR1_STATUS, 0x0ul); + out_be64(p->regs + PHB_INA_ERR_LOG_0, 0x0ul); + out_be64(p->regs + PHB_INA_ERR_LOG_1, 0x0ul); + + val64 = in_be64(p->regs + PHB_INB_ERR_STATUS); + out_be64(p->regs + PHB_INB_ERR_STATUS, val64); + out_be64(p->regs + PHB_INB_ERR1_STATUS, 0x0ul); + out_be64(p->regs + PHB_INB_ERR_LOG_0, 0x0ul); + out_be64(p->regs + PHB_INB_ERR_LOG_1, 0x0ul); + + /* Rec 67/68: Clear FIR/WOF */ + out_be64(p->regs + PHB_LEM_FIR_AND_MASK, ~fir); + out_be64(p->regs + PHB_LEM_WOF, 0x0ul); +} + +static void phb3_read_phb_status(struct phb3 *p, + struct OpalIoPhb3ErrorData *stat) +{ + uint16_t val; + uint64_t *pPEST; + uint64_t val64 = 0; + uint32_t i; + + memset(stat, 0, sizeof(struct OpalIoPhb3ErrorData)); + + /* Error data common part */ + stat->common.version = OPAL_PHB_ERROR_DATA_VERSION_1; + stat->common.ioType = OPAL_PHB_ERROR_DATA_TYPE_PHB3; + stat->common.len = sizeof(struct OpalIoPhb3ErrorData); + + /* + * We read some registers using config space through AIB. + * + * Get to other registers using ASB when possible to get to them + * through a fence if one is present. 
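+ *
+ * (Setting PHB3_CFG_USE_ASB below is what makes the
+ * phb3_pcicfg_read* helpers take the slow ASB path when the PHB
+ * is fenced, instead of returning OPAL_HARDWARE.)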
+ */ + + /* Use ASB to access PCICFG if the PHB has been fenced */ + p->flags |= PHB3_CFG_USE_ASB; + + /* Grab RC bridge control, make it 32-bit */ + phb3_pcicfg_read16(&p->phb, 0, PCI_CFG_BRCTL, &val); + stat->brdgCtl = val; + + /* Grab UTL status registers */ + stat->portStatusReg = hi32(phb3_read_reg_asb(p, UTL_PCIE_PORT_STATUS)); + stat->rootCmplxStatus = hi32(phb3_read_reg_asb(p, UTL_RC_STATUS)); + stat->busAgentStatus = hi32(phb3_read_reg_asb(p, UTL_SYS_BUS_AGENT_STATUS)); + + /* + * Grab various RC PCIe capability registers. All device, slot + * and link status are 16-bit, so we grab the pair control+status + * for each of them + */ + phb3_pcicfg_read32(&p->phb, 0, p->ecap + PCICAP_EXP_DEVCTL, + &stat->deviceStatus); + phb3_pcicfg_read32(&p->phb, 0, p->ecap + PCICAP_EXP_SLOTCTL, + &stat->slotStatus); + phb3_pcicfg_read32(&p->phb, 0, p->ecap + PCICAP_EXP_LCTL, + &stat->linkStatus); + + /* + * I assume those are the standard config space header, cmd & status + * together makes 32-bit. Secondary status is 16-bit so I'll clear + * the top on that one + */ + phb3_pcicfg_read32(&p->phb, 0, PCI_CFG_CMD, &stat->devCmdStatus); + phb3_pcicfg_read16(&p->phb, 0, PCI_CFG_SECONDARY_STATUS, &val); + stat->devSecStatus = val; + + /* Grab a bunch of AER regs */ + phb3_pcicfg_read32(&p->phb, 0, p->aercap + PCIECAP_AER_RERR_STA, + &stat->rootErrorStatus); + phb3_pcicfg_read32(&p->phb, 0, p->aercap + PCIECAP_AER_UE_STATUS, + &stat->uncorrErrorStatus); + phb3_pcicfg_read32(&p->phb, 0, p->aercap + PCIECAP_AER_CE_STATUS, + &stat->corrErrorStatus); + phb3_pcicfg_read32(&p->phb, 0, p->aercap + PCIECAP_AER_HDR_LOG0, + &stat->tlpHdr1); + phb3_pcicfg_read32(&p->phb, 0, p->aercap + PCIECAP_AER_HDR_LOG1, + &stat->tlpHdr2); + phb3_pcicfg_read32(&p->phb, 0, p->aercap + PCIECAP_AER_HDR_LOG2, + &stat->tlpHdr3); + phb3_pcicfg_read32(&p->phb, 0, p->aercap + PCIECAP_AER_HDR_LOG3, + &stat->tlpHdr4); + phb3_pcicfg_read32(&p->phb, 0, p->aercap + PCIECAP_AER_SRCID, + &stat->sourceId); + + /* Restore to AIB */ + p->flags &= ~PHB3_CFG_USE_ASB; + + /* PEC NFIR */ + xscom_read(p->chip_id, p->pe_xscom + 0x0, &stat->nFir); + xscom_read(p->chip_id, p->pe_xscom + 0x3, &stat->nFirMask); + xscom_read(p->chip_id, p->pe_xscom + 0x8, &stat->nFirWOF); + + /* PHB3 inbound and outbound error Regs */ + stat->phbPlssr = phb3_read_reg_asb(p, PHB_CPU_LOADSTORE_STATUS); + stat->phbCsr = phb3_read_reg_asb(p, PHB_DMA_CHAN_STATUS); + stat->lemFir = phb3_read_reg_asb(p, PHB_LEM_FIR_ACCUM); + stat->lemErrorMask = phb3_read_reg_asb(p, PHB_LEM_ERROR_MASK); + stat->lemWOF = phb3_read_reg_asb(p, PHB_LEM_WOF); + stat->phbErrorStatus = phb3_read_reg_asb(p, PHB_ERR_STATUS); + stat->phbFirstErrorStatus = phb3_read_reg_asb(p, PHB_ERR1_STATUS); + stat->phbErrorLog0 = phb3_read_reg_asb(p, PHB_ERR_LOG_0); + stat->phbErrorLog1 = phb3_read_reg_asb(p, PHB_ERR_LOG_1); + stat->mmioErrorStatus = phb3_read_reg_asb(p, PHB_OUT_ERR_STATUS); + stat->mmioFirstErrorStatus = phb3_read_reg_asb(p, PHB_OUT_ERR1_STATUS); + stat->mmioErrorLog0 = phb3_read_reg_asb(p, PHB_OUT_ERR_LOG_0); + stat->mmioErrorLog1 = phb3_read_reg_asb(p, PHB_OUT_ERR_LOG_1); + stat->dma0ErrorStatus = phb3_read_reg_asb(p, PHB_INA_ERR_STATUS); + stat->dma0FirstErrorStatus = phb3_read_reg_asb(p, PHB_INA_ERR1_STATUS); + stat->dma0ErrorLog0 = phb3_read_reg_asb(p, PHB_INA_ERR_LOG_0); + stat->dma0ErrorLog1 = phb3_read_reg_asb(p, PHB_INA_ERR_LOG_1); + stat->dma1ErrorStatus = phb3_read_reg_asb(p, PHB_INB_ERR_STATUS); + stat->dma1FirstErrorStatus = phb3_read_reg_asb(p, PHB_INB_ERR1_STATUS); + stat->dma1ErrorLog0 = 
phb3_read_reg_asb(p, PHB_INB_ERR_LOG_0); + stat->dma1ErrorLog1 = phb3_read_reg_asb(p, PHB_INB_ERR_LOG_1); + + /* + * Grab PESTA & B content. The error bit (bit#0) should + * be fetched from IODA and the left content from memory + * resident tables. + */ + pPEST = (uint64_t *)p->tbl_pest; + val64 = PHB_IODA_AD_AUTOINC; + val64 = SETFIELD(PHB_IODA_AD_TSEL, val64, IODA2_TBL_PESTA); + phb3_write_reg_asb(p, PHB_IODA_ADDR, val64); + for (i = 0; i < OPAL_PHB3_NUM_PEST_REGS; i++) { + stat->pestA[i] = phb3_read_reg_asb(p, PHB_IODA_DATA0); + stat->pestA[i] |= pPEST[2 * i]; + } + + val64 = PHB_IODA_AD_AUTOINC; + val64 = SETFIELD(PHB_IODA_AD_TSEL, val64, IODA2_TBL_PESTB); + phb3_write_reg_asb(p, PHB_IODA_ADDR, val64); + for (i = 0; i < OPAL_PHB3_NUM_PEST_REGS; i++) { + stat->pestB[i] = phb3_read_reg_asb(p, PHB_IODA_DATA0); + stat->pestB[i] |= pPEST[2 * i + 1]; + } +} + +static void phb3_eeh_dump_regs(struct phb3 *p, struct OpalIoPhb3ErrorData *regs) +{ + struct OpalIoPhb3ErrorData *s; + unsigned int i; + + if (!verbose_eeh) + return; + + if (!regs) { + s = zalloc(sizeof(struct OpalIoPhb3ErrorData)); + if (!s) { + PHBERR(p, "Failed to allocate error info !\n"); + return; + } + + phb3_read_phb_status(p, s); + } else { + s = regs; + } + + PHBERR(p, "Error detected!\n"); + + PHBERR(p, " portStatusReg = %08x\n", s->portStatusReg); + PHBERR(p, " rootCmplxStatus = %08x\n", s->rootCmplxStatus); + PHBERR(p, " busAgentStatus = %08x\n", s->busAgentStatus); + + PHBERR(p, " errorClass = %016llx\n", s->errorClass); + PHBERR(p, " correlator = %016llx\n", s->correlator); + + PHBERR(p, " brdgCtl = %08x\n", s->brdgCtl); + PHBERR(p, " deviceStatus = %08x\n", s->deviceStatus); + PHBERR(p, " slotStatus = %08x\n", s->slotStatus); + PHBERR(p, " linkStatus = %08x\n", s->linkStatus); + PHBERR(p, " devCmdStatus = %08x\n", s->devCmdStatus); + PHBERR(p, " devSecStatus = %08x\n", s->devSecStatus); + PHBERR(p, " rootErrorStatus = %08x\n", s->rootErrorStatus); + PHBERR(p, " corrErrorStatus = %08x\n", s->corrErrorStatus); + PHBERR(p, " uncorrErrorStatus = %08x\n", s->uncorrErrorStatus); + + /* Byte swap TLP headers so they are the same as the PCIe spec */ + PHBERR(p, " tlpHdr1 = %08x\n", bswap_32(s->tlpHdr1)); + PHBERR(p, " tlpHdr2 = %08x\n", bswap_32(s->tlpHdr2)); + PHBERR(p, " tlpHdr3 = %08x\n", bswap_32(s->tlpHdr3)); + PHBERR(p, " tlpHdr4 = %08x\n", bswap_32(s->tlpHdr4)); + PHBERR(p, " sourceId = %08x\n", s->sourceId); + + PHBERR(p, " nFir = %016llx\n", s->nFir); + PHBERR(p, " nFirMask = %016llx\n", s->nFirMask); + PHBERR(p, " nFirWOF = %016llx\n", s->nFirWOF); + PHBERR(p, " phbPlssr = %016llx\n", s->phbPlssr); + PHBERR(p, " phbCsr = %016llx\n", s->phbCsr); + PHBERR(p, " lemFir = %016llx\n", s->lemFir); + PHBERR(p, " lemErrorMask = %016llx\n", s->lemErrorMask); + PHBERR(p, " lemWOF = %016llx\n", s->lemWOF); + + PHBERR(p, " phbErrorStatus = %016llx\n", s->phbErrorStatus); + PHBERR(p, " phbFirstErrorStatus = %016llx\n", s->phbFirstErrorStatus); + PHBERR(p, " phbErrorLog0 = %016llx\n", s->phbErrorLog0); + PHBERR(p, " phbErrorLog1 = %016llx\n", s->phbErrorLog1); + + PHBERR(p, " mmioErrorStatus = %016llx\n", s->mmioErrorStatus); + PHBERR(p, "mmioFirstErrorStatus = %016llx\n", s->mmioFirstErrorStatus); + PHBERR(p, " mmioErrorLog0 = %016llx\n", s->mmioErrorLog0); + PHBERR(p, " mmioErrorLog1 = %016llx\n", s->mmioErrorLog1); + + PHBERR(p, " dma0ErrorStatus = %016llx\n", s->dma0ErrorStatus); + PHBERR(p, "dma0FirstErrorStatus = %016llx\n", s->dma0FirstErrorStatus); + PHBERR(p, " dma0ErrorLog0 = %016llx\n", s->dma0ErrorLog0); + PHBERR(p, " 
dma0ErrorLog1 = %016llx\n", s->dma0ErrorLog1); + + PHBERR(p, " dma1ErrorStatus = %016llx\n", s->dma1ErrorStatus); + PHBERR(p, "dma1FirstErrorStatus = %016llx\n", s->dma1FirstErrorStatus); + PHBERR(p, " dma1ErrorLog0 = %016llx\n", s->dma1ErrorLog0); + PHBERR(p, " dma1ErrorLog1 = %016llx\n", s->dma1ErrorLog1); + + for (i = 0; i < OPAL_PHB3_NUM_PEST_REGS; i++) { + if (!s->pestA[i] && !s->pestB[i]) + continue; + PHBERR(p, " PEST[%03x] = %016llx %016llx\n", + i, s->pestA[i], s->pestB[i]); + } + + if (s != regs) + free(s); +} + +static int64_t phb3_msi_get_xive(struct irq_source *is, uint32_t isn, + uint16_t *server, uint8_t *prio) +{ + struct phb3 *p = is->data; + uint32_t chip, index, irq; + uint64_t ive; + + chip = p8_irq_to_chip(isn); + index = p8_irq_to_phb(isn); + irq = PHB3_IRQ_NUM(isn); + + if (chip != p->chip_id || + index != p->index || + irq > PHB3_MSI_IRQ_MAX) + return OPAL_PARAMETER; + + /* + * Each IVE has 16 bytes in cache. Note that the kernel + * should strip the link bits from server field. + */ + ive = p->ive_cache[irq]; + *server = GETFIELD(IODA2_IVT_SERVER, ive); + *prio = GETFIELD(IODA2_IVT_PRIORITY, ive); + + return OPAL_SUCCESS; +} + +static int64_t phb3_msi_set_xive(struct irq_source *is, uint32_t isn, + uint16_t server, uint8_t prio) +{ + struct phb3 *p = is->data; + uint32_t chip, index; + uint64_t *cache, ive_num, data64, m_server, m_prio, ivc; + uint32_t *ive; + + chip = p8_irq_to_chip(isn); + index = p8_irq_to_phb(isn); + ive_num = PHB3_IRQ_NUM(isn); + + if (p->broken || !p->tbl_rtt) + return OPAL_HARDWARE; + if (chip != p->chip_id || + index != p->index || + ive_num > PHB3_MSI_IRQ_MAX) + return OPAL_PARAMETER; + + phb_lock(&p->phb); + + /* + * We need strip the link from server. As Milton told + * me, the server is assigned as follows and the left + * bits unused: node/chip/core/thread/link = 2/3/4/3/2 + * + * Note: the server has added the link bits to server. + */ + m_server = server; + m_prio = prio; + + cache = &p->ive_cache[ive_num]; + *cache = SETFIELD(IODA2_IVT_SERVER, *cache, m_server); + *cache = SETFIELD(IODA2_IVT_PRIORITY, *cache, m_prio); + + /* + * Update IVT and IVC. We need use IVC update register + * to do that. Each IVE in the table has 128 bytes + */ + ive = (uint32_t *)(p->tbl_ivt + ive_num * IVT_TABLE_STRIDE * 8); + data64 = PHB_IVC_UPDATE_ENABLE_SERVER | PHB_IVC_UPDATE_ENABLE_PRI; + data64 = SETFIELD(PHB_IVC_UPDATE_SID, data64, ive_num); + data64 = SETFIELD(PHB_IVC_UPDATE_SERVER, data64, m_server); + data64 = SETFIELD(PHB_IVC_UPDATE_PRI, data64, m_prio); + + /* + * We don't use SETFIELD because we are doing a 32-bit access + * in order to avoid touching the P and Q bits + */ + *ive = (m_server << 8) | m_prio; + out_be64(p->regs + PHB_IVC_UPDATE, data64); + + if (prio != 0xff) { + /* + * Handle Q bit if we're going to enable the + * interrupt. The OS should make sure the interrupt + * handler has been installed already. + */ + if (phb3_pci_msi_check_q(p, ive_num)) + phb3_pci_msi_flush_ive(p, ive_num); + } else { + /* Read from random PHB reg to force flush */ + in_be64(p->regs + PHB_IVC_UPDATE); + + /* Order with subsequent read of Q */ + sync(); + + /* Clear P, Q and Gen, preserve PE# */ + ive[1] &= 0x0000ffff; + + /* + * Update the IVC with a match against the old gen + * count. No need to worry about racing with P being + * set in the cache since IRQ is masked at this point. 
+		 */
+		ivc = SETFIELD(PHB_IVC_UPDATE_SID, 0ul, ive_num) |
+			PHB_IVC_UPDATE_ENABLE_P |
+			PHB_IVC_UPDATE_ENABLE_Q |
+			PHB_IVC_UPDATE_ENABLE_GEN;
+		out_be64(p->regs + PHB_IVC_UPDATE, ivc);
+	}
+
+	phb_unlock(&p->phb);
+
+	return OPAL_SUCCESS;
+}
+
+static int64_t phb3_lsi_get_xive(struct irq_source *is, uint32_t isn,
+				 uint16_t *server, uint8_t *prio)
+{
+	struct phb3 *p = is->data;
+	uint32_t chip, index, irq;
+	uint64_t lxive;
+
+	chip = p8_irq_to_chip(isn);
+	index = p8_irq_to_phb(isn);
+	irq = PHB3_IRQ_NUM(isn);
+
+	if (chip != p->chip_id ||
+	    index != p->index ||
+	    irq < PHB3_LSI_IRQ_MIN ||
+	    irq > PHB3_LSI_IRQ_MAX)
+		return OPAL_PARAMETER;
+
+	lxive = p->lxive_cache[irq - PHB3_LSI_IRQ_MIN];
+	*server = GETFIELD(IODA2_LXIVT_SERVER, lxive);
+	*prio = GETFIELD(IODA2_LXIVT_PRIORITY, lxive);
+
+	return OPAL_SUCCESS;
+}
+
+static int64_t phb3_lsi_set_xive(struct irq_source *is, uint32_t isn,
+				 uint16_t server, uint8_t prio)
+{
+	struct phb3 *p = is->data;
+	uint32_t chip, index, irq, entry;
+	uint64_t lxive;
+
+	chip = p8_irq_to_chip(isn);
+	index = p8_irq_to_phb(isn);
+	irq = PHB3_IRQ_NUM(isn);
+
+	if (p->broken)
+		return OPAL_HARDWARE;
+
+	if (chip != p->chip_id ||
+	    index != p->index ||
+	    irq < PHB3_LSI_IRQ_MIN ||
+	    irq > PHB3_LSI_IRQ_MAX)
+		return OPAL_PARAMETER;
+
+	lxive = SETFIELD(IODA2_LXIVT_SERVER, 0ul, server);
+	lxive = SETFIELD(IODA2_LXIVT_PRIORITY, lxive, prio);
+
+	phb_lock(&p->phb);
+
+	/*
+	 * We cache the arguments because we have to mangle
+	 * them in order to hijack 3 bits of priority to extend
+	 * the server number
+	 */
+	entry = irq - PHB3_LSI_IRQ_MIN;
+	p->lxive_cache[entry] = lxive;
+
+	/* We use HRT entry 0 always for now */
+	phb3_ioda_sel(p, IODA2_TBL_LXIVT, entry, false);
+	lxive = in_be64(p->regs + PHB_IODA_DATA0);
+	lxive = SETFIELD(IODA2_LXIVT_SERVER, lxive, server);
+	lxive = SETFIELD(IODA2_LXIVT_PRIORITY, lxive, prio);
+	out_be64(p->regs + PHB_IODA_DATA0, lxive);
+
+	phb_unlock(&p->phb);
+
+	return OPAL_SUCCESS;
+}
+
+static void phb3_err_interrupt(struct irq_source *is, uint32_t isn)
+{
+	struct phb3 *p = is->data;
+
+	PHBDBG(p, "Got interrupt 0x%08x\n", isn);
+
+	/* Update pending event */
+	opal_update_pending_evt(OPAL_EVENT_PCI_ERROR,
+				OPAL_EVENT_PCI_ERROR);
+
+	/* If the PHB is broken, go away */
+	if (p->broken)
+		return;
+
+	/*
+	 * Mark the PHB as having a pending error so that the OS
+	 * can handle it at a later point.
+ */ + phb3_set_err_pending(p, true); +} + +static uint64_t phb3_lsi_attributes(struct irq_source *is, uint32_t isn) +{ +#ifndef DISABLE_ERR_INTS + struct phb3 *p = is->data; + uint32_t idx = isn - p->base_lsi; + + if (idx == PHB3_LSI_PCIE_INF || idx == PHB3_LSI_PCIE_ER) + return IRQ_ATTR_TARGET_OPAL | IRQ_ATTR_TARGET_RARE | IRQ_ATTR_TYPE_LSI; +#endif + return IRQ_ATTR_TARGET_LINUX; +} + +/* MSIs (OS owned) */ +static const struct irq_source_ops phb3_msi_irq_ops = { + .get_xive = phb3_msi_get_xive, + .set_xive = phb3_msi_set_xive, +}; + +/* LSIs (OS owned) */ +static const struct irq_source_ops phb3_lsi_irq_ops = { + .get_xive = phb3_lsi_get_xive, + .set_xive = phb3_lsi_set_xive, + .attributes = phb3_lsi_attributes, + .interrupt = phb3_err_interrupt, +}; + +static int64_t phb3_set_pe(struct phb *phb, + uint64_t pe_number, + uint64_t bdfn, + uint8_t bcompare, + uint8_t dcompare, + uint8_t fcompare, + uint8_t action) +{ + struct phb3 *p = phb_to_phb3(phb); + uint64_t mask, val, tmp, idx; + int32_t all = 0; + uint16_t *rte; + + /* Sanity check */ + if (!p->tbl_rtt) + return OPAL_HARDWARE; + if (action != OPAL_MAP_PE && action != OPAL_UNMAP_PE) + return OPAL_PARAMETER; + if (pe_number >= PHB3_MAX_PE_NUM || bdfn > 0xffff || + bcompare > OpalPciBusAll || + dcompare > OPAL_COMPARE_RID_DEVICE_NUMBER || + fcompare > OPAL_COMPARE_RID_FUNCTION_NUMBER) + return OPAL_PARAMETER; + + /* Figure out the RID range */ + if (bcompare == OpalPciBusAny) { + mask = 0x0; + val = 0x0; + all = 0x1; + } else { + tmp = ((0x1 << (bcompare + 1)) - 1) << (15 - bcompare); + mask = tmp; + val = bdfn & tmp; + } + + if (dcompare == OPAL_IGNORE_RID_DEVICE_NUMBER) + all = (all << 1) | 0x1; + else { + mask |= 0xf8; + val |= (bdfn & 0xf8); + } + + if (fcompare == OPAL_IGNORE_RID_FUNCTION_NUMBER) + all = (all << 1) | 0x1; + else { + mask |= 0x7; + val |= PCI_FUNC(bdfn); + } + + /* Map or unmap the RTT range */ + if (all == 0x7) { + if (action == OPAL_MAP_PE) { + for (idx = 0; idx < RTT_TABLE_ENTRIES; idx++) + p->rte_cache[idx] = pe_number; + } else { + for ( idx = 0; idx < ARRAY_SIZE(p->rte_cache); idx++) + p->rte_cache[idx] = PHB3_RESERVED_PE_NUM; + } + memcpy((void *)p->tbl_rtt, p->rte_cache, RTT_TABLE_SIZE); + } else { + rte = (uint16_t *)p->tbl_rtt; + for (idx = 0; idx < RTT_TABLE_ENTRIES; idx++, rte++) { + if ((idx & mask) != val) + continue; + if (action == OPAL_MAP_PE) + p->rte_cache[idx] = pe_number; + else + p->rte_cache[idx] = PHB3_RESERVED_PE_NUM; + *rte = p->rte_cache[idx]; + } + } + + /* Invalidate the entire RTC */ + out_be64(p->regs + PHB_RTC_INVALIDATE, PHB_RTC_INVALIDATE_ALL); + + return OPAL_SUCCESS; +} + +static int64_t phb3_set_peltv(struct phb *phb, + uint32_t parent_pe, + uint32_t child_pe, + uint8_t state) +{ + struct phb3 *p = phb_to_phb3(phb); + uint8_t *peltv; + uint32_t idx, mask; + + /* Sanity check */ + if (!p->tbl_peltv) + return OPAL_HARDWARE; + if (parent_pe >= PHB3_MAX_PE_NUM || child_pe >= PHB3_MAX_PE_NUM) + return OPAL_PARAMETER; + + /* Find index for parent PE */ + idx = parent_pe * (PHB3_MAX_PE_NUM / 8); + idx += (child_pe / 8); + mask = 0x1 << (7 - (child_pe % 8)); + + peltv = (uint8_t *)p->tbl_peltv; + peltv += idx; + if (state) { + *peltv |= mask; + p->peltv_cache[idx] |= mask; + } else { + *peltv &= ~mask; + p->peltv_cache[idx] &= ~mask; + } + + return OPAL_SUCCESS; +} + +static void phb3_prepare_link_change(struct pci_slot *slot, + bool is_up) +{ + struct phb3 *p = phb_to_phb3(slot->phb); + struct pci_device *pd = slot->pd; + uint32_t reg32; + + p->has_link = is_up; + if (!is_up) { + if 
(!pd || !pd->slot || !pd->slot->surprise_pluggable) { + /* Mask PCIE port interrupts */ + out_be64(p->regs + UTL_PCIE_PORT_IRQ_EN, + 0xad42800000000000UL); + + pci_cfg_read32(&p->phb, 0, + p->aercap + PCIECAP_AER_UE_MASK, ®32); + reg32 |= PCIECAP_AER_UE_MASK_SURPRISE_DOWN; + pci_cfg_write32(&p->phb, 0, + p->aercap + PCIECAP_AER_UE_MASK, reg32); + } + + /* Mask AER receiver error */ + phb3_pcicfg_read32(&p->phb, 0, + p->aercap + PCIECAP_AER_CE_MASK, ®32); + reg32 |= PCIECAP_AER_CE_RECVR_ERR; + phb3_pcicfg_write32(&p->phb, 0, + p->aercap + PCIECAP_AER_CE_MASK, reg32); + + /* Block PCI-CFG access */ + p->flags |= PHB3_CFG_BLOCKED; + } else { + /* Clear AER receiver error status */ + phb3_pcicfg_write32(&p->phb, 0, + p->aercap + PCIECAP_AER_CE_STATUS, + PCIECAP_AER_CE_RECVR_ERR); + + /* Unmask receiver error status in AER */ + phb3_pcicfg_read32(&p->phb, 0, + p->aercap + PCIECAP_AER_CE_MASK, ®32); + reg32 &= ~PCIECAP_AER_CE_RECVR_ERR; + phb3_pcicfg_write32(&p->phb, 0, + p->aercap + PCIECAP_AER_CE_MASK, reg32); + + /* Clear spurrious errors and enable PCIE port interrupts */ + out_be64(p->regs + UTL_PCIE_PORT_STATUS, + 0xffdfffffffffffffUL); + + if (!pd || !pd->slot || !pd->slot->surprise_pluggable) { + out_be64(p->regs + UTL_PCIE_PORT_IRQ_EN, + 0xad52800000000000UL); + + pci_cfg_read32(&p->phb, 0, + p->aercap + PCIECAP_AER_UE_MASK, ®32); + reg32 &= ~PCIECAP_AER_UE_MASK_SURPRISE_DOWN; + pci_cfg_write32(&p->phb, 0, + p->aercap + PCIECAP_AER_UE_MASK, reg32); + } + + /* Don't block PCI-CFG */ + p->flags &= ~PHB3_CFG_BLOCKED; + + /* + * We might lose the bus numbers during the reset operation + * and we need to restore them. Otherwise, some adapters (e.g. + * IPR) can't be probed properly by the kernel. We don't need + * to restore bus numbers for every kind of reset, however, + * it's not harmful to always restore the bus numbers, which + * simplifies the logic. + */ + pci_restore_bridge_buses(slot->phb, slot->pd); + if (slot->phb->ops->device_init) + pci_walk_dev(slot->phb, slot->pd, + slot->phb->ops->device_init, NULL); + } +} + +static int64_t phb3_get_presence_state(struct pci_slot *slot, uint8_t *val) +{ + struct phb3 *p = phb_to_phb3(slot->phb); + uint64_t hp_override; + + if (p->broken) + return OPAL_HARDWARE; + + /* + * On P8, the slot status isn't wired up properly, we have + * to use the hotplug override A/B bits. 
+ */ + hp_override = in_be64(p->regs + PHB_HOTPLUG_OVERRIDE); + if ((hp_override & PHB_HPOVR_PRESENCE_A) && + (hp_override & PHB_HPOVR_PRESENCE_B)) + *val = OPAL_PCI_SLOT_EMPTY; + else + *val = OPAL_PCI_SLOT_PRESENT; + + return OPAL_SUCCESS; +} + +static int64_t phb3_get_link_state(struct pci_slot *slot, uint8_t *val) +{ + struct phb3 *p = phb_to_phb3(slot->phb); + uint64_t reg; + uint16_t state; + int64_t rc; + + /* Link is up, let's find the actual speed */ + reg = in_be64(p->regs + PHB_PCIE_DLP_TRAIN_CTL); + if (!(reg & PHB_PCIE_DLP_TC_DL_LINKACT)) { + *val = 0; + return OPAL_SUCCESS; + } + + rc = phb3_pcicfg_read16(&p->phb, 0, + p->ecap + PCICAP_EXP_LSTAT, &state); + if (rc != OPAL_SUCCESS) { + PHBERR(p, "%s: Error %lld getting link state\n", __func__, rc); + return OPAL_HARDWARE; + } + + if (state & PCICAP_EXP_LSTAT_DLLL_ACT) + *val = ((state & PCICAP_EXP_LSTAT_WIDTH) >> 4); + else + *val = 0; + + return OPAL_SUCCESS; +} + +static int64_t phb3_retry_state(struct pci_slot *slot) +{ + struct phb3 *p = phb_to_phb3(slot->phb); + + if (slot->retry_state == PCI_SLOT_STATE_NORMAL) + return OPAL_WRONG_STATE; + + PHBDBG(p, "Retry state %08x\n", slot->retry_state); + slot->delay_tgt_tb = 0; + pci_slot_set_state(slot, slot->retry_state); + slot->retry_state = PCI_SLOT_STATE_NORMAL; + return slot->ops.run_sm(slot); +} + +static int64_t phb3_poll_link(struct pci_slot *slot) +{ + struct phb3 *p = phb_to_phb3(slot->phb); + uint64_t reg; + int64_t rc; + + switch (slot->state) { + case PHB3_SLOT_NORMAL: + case PHB3_SLOT_LINK_START: + PHBDBG(p, "LINK: Start polling\n"); + slot->retries = PHB3_LINK_ELECTRICAL_RETRIES; + pci_slot_set_state(slot, PHB3_SLOT_LINK_WAIT_ELECTRICAL); + return pci_slot_set_sm_timeout(slot, msecs_to_tb(100)); + case PHB3_SLOT_LINK_WAIT_ELECTRICAL: + /* + * Wait for the link electrical connection to be + * established (shorter timeout). 
This allows us to
+		 * work around spurious presence detect on some machines
+		 * without waiting 10s each time.
+		 *
+		 * Note: We *also* check for the full link up bit here
+		 * because simics doesn't seem to implement the electrical
+		 * link bit at all
+		 */
+		reg = in_be64(p->regs + PHB_PCIE_DLP_TRAIN_CTL);
+		if (reg & (PHB_PCIE_DLP_INBAND_PRESENCE |
+			   PHB_PCIE_DLP_TC_DL_LINKACT)) {
+			PHBDBG(p, "LINK: Electrical link detected\n");
+			pci_slot_set_state(slot, PHB3_SLOT_LINK_WAIT);
+			slot->retries = PHB3_LINK_WAIT_RETRIES;
+			return pci_slot_set_sm_timeout(slot, msecs_to_tb(100));
+		}
+
+		if (slot->retries-- == 0) {
+			PHBDBG(p, "LINK: Timeout waiting for electrical link\n");
+			PHBDBG(p, "LINK: DLP train control: 0x%016llx\n", reg);
+			rc = phb3_retry_state(slot);
+			if (rc >= OPAL_SUCCESS)
+				return rc;
+
+			pci_slot_set_state(slot, PHB3_SLOT_NORMAL);
+			return OPAL_SUCCESS;
+		}
+		return pci_slot_set_sm_timeout(slot, msecs_to_tb(100));
+	case PHB3_SLOT_LINK_WAIT:
+		reg = in_be64(p->regs + PHB_PCIE_DLP_TRAIN_CTL);
+		if (reg & PHB_PCIE_DLP_TC_DL_LINKACT) {
+			PHBDBG(p, "LINK: Link is up\n");
+			if (slot->ops.prepare_link_change)
+				slot->ops.prepare_link_change(slot, true);
+			pci_slot_set_state(slot, PHB3_SLOT_NORMAL);
+			return OPAL_SUCCESS;
+		}
+
+		if (slot->retries-- == 0) {
+			PHBDBG(p, "LINK: Timeout waiting for link up\n");
+			PHBDBG(p, "LINK: DLP train control: 0x%016llx\n", reg);
+			rc = phb3_retry_state(slot);
+			if (rc >= OPAL_SUCCESS)
+				return rc;
+
+			pci_slot_set_state(slot, PHB3_SLOT_NORMAL);
+			return OPAL_SUCCESS;
+		}
+		return pci_slot_set_sm_timeout(slot, msecs_to_tb(100));
+	default:
+		PHBERR(p, "LINK: Unexpected slot state %08x\n",
+		       slot->state);
+	}
+
+	pci_slot_set_state(slot, PHB3_SLOT_NORMAL);
+	return OPAL_HARDWARE;
+}
+
+static int64_t phb3_hreset(struct pci_slot *slot)
+{
+	struct phb3 *p = phb_to_phb3(slot->phb);
+	uint16_t brctl;
+	uint8_t presence = 1;
+
+	switch (slot->state) {
+	case PHB3_SLOT_NORMAL:
+		PHBDBG(p, "HRESET: Starts\n");
+		if (slot->ops.get_presence_state)
+			slot->ops.get_presence_state(slot, &presence);
+		if (!presence) {
+			PHBDBG(p, "HRESET: No device\n");
+			return OPAL_SUCCESS;
+		}
+
+		PHBDBG(p, "HRESET: Prepare for link down\n");
+		if (slot->ops.prepare_link_change)
+			slot->ops.prepare_link_change(slot, false);
+		/* fall through */
+	case PHB3_SLOT_HRESET_START:
+		PHBDBG(p, "HRESET: Assert\n");
+
+		phb3_pcicfg_read16(&p->phb, 0, PCI_CFG_BRCTL, &brctl);
+		brctl |= PCI_CFG_BRCTL_SECONDARY_RESET;
+		phb3_pcicfg_write16(&p->phb, 0, PCI_CFG_BRCTL, brctl);
+		pci_slot_set_state(slot, PHB3_SLOT_HRESET_DELAY);
+
+		return pci_slot_set_sm_timeout(slot, secs_to_tb(1));
+	case PHB3_SLOT_HRESET_DELAY:
+		PHBDBG(p, "HRESET: Deassert\n");
+
+		phb3_pcicfg_read16(&p->phb, 0, PCI_CFG_BRCTL, &brctl);
+		brctl &= ~PCI_CFG_BRCTL_SECONDARY_RESET;
+		phb3_pcicfg_write16(&p->phb, 0, PCI_CFG_BRCTL, brctl);
+
+		/*
+		 * Due to some oddball adapters bouncing the link
+		 * training a couple of times, we wait for a full second
+		 * before we start checking the link status, otherwise
+		 * we can get a spurious link down interrupt which
+		 * causes us to EEH immediately.
+ */ + pci_slot_set_state(slot, PHB3_SLOT_HRESET_DELAY2); + return pci_slot_set_sm_timeout(slot, secs_to_tb(1)); + case PHB3_SLOT_HRESET_DELAY2: + pci_slot_set_state(slot, PHB3_SLOT_LINK_START); + return slot->ops.poll_link(slot); + default: + PHBERR(p, "Unexpected slot state %08x\n", slot->state); + } + + pci_slot_set_state(slot, PHB3_SLOT_NORMAL); + return OPAL_HARDWARE; +} + +static int64_t phb3_freset(struct pci_slot *slot) +{ + struct phb3 *p = phb_to_phb3(slot->phb); + uint8_t presence = 1; + uint64_t reg; + + switch(slot->state) { + case PHB3_SLOT_NORMAL: + PHBDBG(p, "FRESET: Starts\n"); + + /* Nothing to do without adapter connected */ + if (slot->ops.get_presence_state) + slot->ops.get_presence_state(slot, &presence); + if (!presence) { + PHBDBG(p, "FRESET: No device\n"); + return OPAL_SUCCESS; + } + + PHBDBG(p, "FRESET: Prepare for link down\n"); + slot->retry_state = PHB3_SLOT_FRESET_START; + if (slot->ops.prepare_link_change) + slot->ops.prepare_link_change(slot, false); + /* fall through */ + case PHB3_SLOT_FRESET_START: + if (!p->skip_perst) { + PHBDBG(p, "FRESET: Assert\n"); + reg = in_be64(p->regs + PHB_RESET); + reg &= ~0x2000000000000000ul; + out_be64(p->regs + PHB_RESET, reg); + pci_slot_set_state(slot, + PHB3_SLOT_FRESET_ASSERT_DELAY); + return pci_slot_set_sm_timeout(slot, secs_to_tb(1)); + } + + /* To skip the assert during boot time */ + PHBDBG(p, "FRESET: Assert skipped\n"); + pci_slot_set_state(slot, PHB3_SLOT_FRESET_ASSERT_DELAY); + p->skip_perst = false; + /* fall through */ + case PHB3_SLOT_FRESET_ASSERT_DELAY: + PHBDBG(p, "FRESET: Deassert\n"); + reg = in_be64(p->regs + PHB_RESET); + reg |= 0x2000000000000000ul; + out_be64(p->regs + PHB_RESET, reg); + pci_slot_set_state(slot, + PHB3_SLOT_FRESET_DEASSERT_DELAY); + + /* CAPP FPGA requires 1s to flash before polling link */ + return pci_slot_set_sm_timeout(slot, secs_to_tb(1)); + case PHB3_SLOT_FRESET_DEASSERT_DELAY: + pci_slot_set_state(slot, PHB3_SLOT_LINK_START); + return slot->ops.poll_link(slot); + default: + PHBERR(p, "Unexpected slot state %08x\n", slot->state); + } + + pci_slot_set_state(slot, PHB3_SLOT_NORMAL); + return OPAL_HARDWARE; +} + +static int64_t load_capp_ucode(struct phb3 *p) +{ + int64_t rc; + + if (p->index > PHB3_CAPP_MAX_PHB_INDEX(p)) + return OPAL_HARDWARE; + + /* 0x434150504c494448 = 'CAPPLIDH' in ASCII */ + rc = capp_load_ucode(p->chip_id, p->phb.opal_id, p->index, + 0x434150504c494448UL, PHB3_CAPP_REG_OFFSET(p), + CAPP_APC_MASTER_ARRAY_ADDR_REG, + CAPP_APC_MASTER_ARRAY_WRITE_REG, + CAPP_SNP_ARRAY_ADDR_REG, + CAPP_SNP_ARRAY_WRITE_REG); + return rc; +} + +static void do_capp_recovery_scoms(struct phb3 *p) +{ + uint64_t reg; + uint32_t offset; + + PHBDBG(p, "Doing CAPP recovery scoms\n"); + + offset = PHB3_CAPP_REG_OFFSET(p); + /* disable snoops */ + xscom_write(p->chip_id, SNOOP_CAPI_CONFIG + offset, 0); + load_capp_ucode(p); + /* clear err rpt reg*/ + xscom_write(p->chip_id, CAPP_ERR_RPT_CLR + offset, 0); + /* clear capp fir */ + xscom_write(p->chip_id, CAPP_FIR + offset, 0); + + xscom_read(p->chip_id, CAPP_ERR_STATUS_CTRL + offset, ®); + reg &= ~(PPC_BIT(0) | PPC_BIT(1)); + xscom_write(p->chip_id, CAPP_ERR_STATUS_CTRL + offset, reg); +} + +/* + * Disable CAPI mode on a PHB. + * + * Must be done while PHB is fenced and in recovery. Leaves CAPP in recovery - + * we can't come out of recovery until the PHB has been reinitialised. + * + * We don't reset generic error registers here - we rely on phb3_init_hw() to + * do that. + * + * Sets PHB3_CAPP_DISABLING flag when complete. 
+ */ +static void disable_capi_mode(struct phb3 *p) +{ + struct proc_chip *chip = get_chip(p->chip_id); + uint64_t reg; + uint32_t offset = PHB3_CAPP_REG_OFFSET(p); + + lock(&capi_lock); + + xscom_read(p->chip_id, PE_CAPP_EN + PE_REG_OFFSET(p), ®); + if (!(reg & PPC_BIT(0))) { + /* Not in CAPI mode, no action required */ + goto out; + } + + PHBDBG(p, "CAPP: Disabling CAPI mode\n"); + if (!(chip->capp_phb3_attached_mask & (1 << p->index))) + PHBERR(p, "CAPP: CAPP attached mask not set!\n"); + + xscom_read(p->chip_id, CAPP_ERR_STATUS_CTRL + offset, ®); + if (!(reg & PPC_BIT(0))) { + PHBERR(p, "CAPP: not in recovery, can't disable CAPI mode!\n"); + goto out; + } + + /* Snoop CAPI Configuration Register - disable snooping */ + xscom_write(p->chip_id, SNOOP_CAPI_CONFIG + offset, 0ull); + + /* APC Master PB Control Register - disable examining cResps */ + xscom_read(p->chip_id, APC_MASTER_PB_CTRL + offset, ®); + reg &= ~PPC_BIT(3); + xscom_write(p->chip_id, APC_MASTER_PB_CTRL + offset, reg); + + /* APC Master Config Register - de-select PHBs */ + xscom_read(p->chip_id, APC_MASTER_CAPI_CTRL + offset, ®); + reg &= ~PPC_BITMASK(1, 3); + xscom_write(p->chip_id, APC_MASTER_CAPI_CTRL + offset, reg); + + /* PE Bus AIB Mode Bits */ + xscom_read(p->chip_id, p->pci_xscom + 0xf, ®); + reg |= PPC_BITMASK(7, 8); /* Ch2 command credit */ + reg &= ~PPC_BITMASK(40, 42); /* Disable HOL blocking */ + xscom_write(p->chip_id, p->pci_xscom + 0xf, reg); + + /* PCI Hardware Configuration 0 Register - all store queues free */ + xscom_read(p->chip_id, p->pe_xscom + 0x18, ®); + reg &= ~PPC_BIT(14); + reg |= PPC_BIT(15); + xscom_write(p->chip_id, p->pe_xscom + 0x18, reg); + + /* + * PCI Hardware Configuration 1 Register - enable read response + * arrival/address request ordering + */ + xscom_read(p->chip_id, p->pe_xscom + 0x19, ®); + reg |= PPC_BITMASK(17,18); + xscom_write(p->chip_id, p->pe_xscom + 0x19, reg); + + /* + * AIB TX Command Credit Register - set AIB credit values back to + * normal + */ + xscom_read(p->chip_id, p->pci_xscom + 0xd, ®); + reg |= PPC_BIT(42); + reg &= ~PPC_BITMASK(43, 47); + xscom_write(p->chip_id, p->pci_xscom + 0xd, reg); + + /* AIB TX Credit Init Timer - reset timer */ + xscom_write(p->chip_id, p->pci_xscom + 0xc, 0xff00000000000000UL); + + /* + * PBCQ Mode Control Register - set dcache handling to normal, not CAPP + * mode + */ + xscom_read(p->chip_id, p->pe_xscom + 0xb, ®); + reg &= ~PPC_BIT(25); + xscom_write(p->chip_id, p->pe_xscom + 0xb, reg); + + /* Registers touched by phb3_init_capp_regs() */ + + /* CAPP Transport Control Register */ + xscom_write(p->chip_id, TRANSPORT_CONTROL + offset, 0x0001000000000000UL); + + /* Canned pResp Map Register 0/1/2 */ + xscom_write(p->chip_id, CANNED_PRESP_MAP0 + offset, 0); + xscom_write(p->chip_id, CANNED_PRESP_MAP1 + offset, 0); + xscom_write(p->chip_id, CANNED_PRESP_MAP2 + offset, 0); + + /* Flush SUE State Map Register */ + xscom_write(p->chip_id, FLUSH_SUE_STATE_MAP + offset, 0); + + /* CAPP Epoch and Recovery Timers Control Register */ + xscom_write(p->chip_id, CAPP_EPOCH_TIMER_CTRL + offset, 0); + + /* PE Secure CAPP Enable Register - we're all done! Disable CAPP mode! 
*/
+	xscom_write(p->chip_id, PE_CAPP_EN + PE_REG_OFFSET(p), 0ull);
+
+	/* Trigger CAPP recovery scoms after reinit */
+	p->flags |= PHB3_CAPP_DISABLING;
+
+	chip->capp_phb3_attached_mask &= ~(1 << p->index);
+
+out:
+	unlock(&capi_lock);
+}
+
+static int64_t phb3_creset(struct pci_slot *slot)
+{
+	struct phb3 *p = phb_to_phb3(slot->phb);
+	uint64_t cqsts, val;
+
+	switch (slot->state) {
+	case PHB3_SLOT_NORMAL:
+	case PHB3_SLOT_CRESET_START:
+		PHBDBG(p, "CRESET: Starts\n");
+
+		/* do steps 3-5 of capp recovery procedure */
+		if (p->flags & PHB3_CAPP_RECOVERY)
+			do_capp_recovery_scoms(p);
+
+		/*
+		 * The users might be doing error injection through the PBCQ
+		 * Error Inject Control Register. Without clearing that,
+		 * we will get recursive errors during recovery and it will
+		 * fail eventually.
+		 */
+		xscom_write(p->chip_id, p->pe_xscom + 0xa, 0x0ul);
+
+		/*
+		 * We might have escalated a frozen state on a non-existent PE
+		 * to a fenced PHB. In that case, the PHB isn't fenced at the
+		 * hardware level and it's not safe to do an ETU reset, so we
+		 * have to force a fenced PHB prior to the ETU reset.
+		 */
+		if (!phb3_fenced(p))
+			xscom_write(p->chip_id, p->pe_xscom + 0x2, 0x000000f000000000ull);
+
+		/* Now that we're guaranteed to be fenced, disable CAPI mode */
+		if (!(p->flags & PHB3_CAPP_RECOVERY))
+			disable_capi_mode(p);
+
+		/* Clear errors in NFIR and raise ETU reset */
+		xscom_read(p->chip_id, p->pe_xscom + 0x0, &p->nfir_cache);
+
+		xscom_read(p->chip_id, p->spci_xscom + 1, &val); /* HW275117 */
+		xscom_write(p->chip_id, p->pci_xscom + 0xa,
+			    0x8000000000000000UL);
+		pci_slot_set_state(slot, PHB3_SLOT_CRESET_WAIT_CQ);
+		slot->retries = 500;
+		return pci_slot_set_sm_timeout(slot, msecs_to_tb(10));
+	case PHB3_SLOT_CRESET_WAIT_CQ:
+		xscom_read(p->chip_id, p->pe_xscom + 0x1c, &val);
+		xscom_read(p->chip_id, p->pe_xscom + 0x1d, &val);
+		xscom_read(p->chip_id, p->pe_xscom + 0x1e, &val);
+		xscom_read(p->chip_id, p->pe_xscom + 0xf, &cqsts);
+		if (!(cqsts & 0xC000000000000000UL)) {
+			PHBDBG(p, "CRESET: No pending transactions\n");
+			xscom_write(p->chip_id, p->pe_xscom + 0x1, ~p->nfir_cache);
+
+			pci_slot_set_state(slot, PHB3_SLOT_CRESET_REINIT);
+			return pci_slot_set_sm_timeout(slot, msecs_to_tb(100));
+		}
+
+		if (slot->retries-- == 0) {
+			PHBERR(p, "Timeout waiting for pending transaction\n");
+			goto error;
+		}
+		return pci_slot_set_sm_timeout(slot, msecs_to_tb(10));
+	case PHB3_SLOT_CRESET_REINIT:
+		PHBDBG(p, "CRESET: Reinitialization\n");
+
+		/*
+		 * Clear AIB fenced state. Otherwise, we can't access the
+		 * PCI config space of the root complex when reinitializing
+		 * the PHB.
+		 */
+		p->flags &= ~PHB3_AIB_FENCED;
+		p->flags &= ~PHB3_CAPP_RECOVERY;
+		phb3_init_hw(p, false);
+
+		if (p->flags & PHB3_CAPP_DISABLING) {
+			do_capp_recovery_scoms(p);
+			p->flags &= ~PHB3_CAPP_DISABLING;
+		}
+
+		pci_slot_set_state(slot, PHB3_SLOT_CRESET_FRESET);
+		return pci_slot_set_sm_timeout(slot, msecs_to_tb(100));
+	case PHB3_SLOT_CRESET_FRESET:
+		pci_slot_set_state(slot, PHB3_SLOT_NORMAL);
+		return slot->ops.freset(slot);
+	default:
+		PHBERR(p, "CRESET: Unexpected slot state %08x\n",
+		       slot->state);
+	}
+
+error:
+	return OPAL_HARDWARE;
+}
+
+/*
+ * Initialize the root complex slot, which is mainly used to do a
+ * fundamental reset before PCI enumeration in the PCI core. When the
+ * root complex is probed and its real slot is built, these
+ * operations will be copied over.
+ */ +static struct pci_slot *phb3_slot_create(struct phb *phb) +{ + struct pci_slot *slot; + + slot = pci_slot_alloc(phb, NULL); + if (!slot) + return slot; + + /* Elementary functions */ + slot->ops.get_presence_state = phb3_get_presence_state; + slot->ops.get_link_state = phb3_get_link_state; + slot->ops.get_power_state = NULL; + slot->ops.get_attention_state = NULL; + slot->ops.get_latch_state = NULL; + slot->ops.set_power_state = NULL; + slot->ops.set_attention_state = NULL; + + /* + * For PHB slots, we have to split the fundamental reset + * into 2 steps. We might not have the first step which + * is to power off/on the slot, or it's controlled by + * individual platforms. + */ + slot->ops.prepare_link_change = phb3_prepare_link_change; + slot->ops.poll_link = phb3_poll_link; + slot->ops.hreset = phb3_hreset; + slot->ops.freset = phb3_freset; + slot->ops.creset = phb3_creset; + + return slot; +} + +static int64_t phb3_eeh_freeze_status(struct phb *phb, uint64_t pe_number, + uint8_t *freeze_state, + uint16_t *pci_error_type, + uint16_t *severity) +{ + struct phb3 *p = phb_to_phb3(phb); + uint64_t peev_bit = PPC_BIT(pe_number & 0x3f); + uint64_t peev, pesta, pestb; + + /* Defaults: not frozen */ + *freeze_state = OPAL_EEH_STOPPED_NOT_FROZEN; + *pci_error_type = OPAL_EEH_NO_ERROR; + + /* Check dead */ + if (p->broken) { + *freeze_state = OPAL_EEH_STOPPED_MMIO_DMA_FREEZE; + *pci_error_type = OPAL_EEH_PHB_ERROR; + if (severity) + *severity = OPAL_EEH_SEV_PHB_DEAD; + return OPAL_HARDWARE; + } + + /* Check fence and CAPP recovery */ + if (phb3_fenced(p) || (p->flags & PHB3_CAPP_RECOVERY)) { + *freeze_state = OPAL_EEH_STOPPED_MMIO_DMA_FREEZE; + *pci_error_type = OPAL_EEH_PHB_ERROR; + if (severity) + *severity = OPAL_EEH_SEV_PHB_FENCED; + return OPAL_SUCCESS; + } + + /* Check the PEEV */ + phb3_ioda_sel(p, IODA2_TBL_PEEV, pe_number / 64, false); + peev = in_be64(p->regs + PHB_IODA_DATA0); + if (!(peev & peev_bit)) + return OPAL_SUCCESS; + + /* Indicate that we have an ER pending */ + phb3_set_err_pending(p, true); + if (severity) + *severity = OPAL_EEH_SEV_PE_ER; + + /* Read the PESTA & PESTB */ + phb3_ioda_sel(p, IODA2_TBL_PESTA, pe_number, false); + pesta = in_be64(p->regs + PHB_IODA_DATA0); + phb3_ioda_sel(p, IODA2_TBL_PESTB, pe_number, false); + pestb = in_be64(p->regs + PHB_IODA_DATA0); + + /* Convert them */ + if (pesta & IODA2_PESTA_MMIO_FROZEN) + *freeze_state |= OPAL_EEH_STOPPED_MMIO_FREEZE; + if (pestb & IODA2_PESTB_DMA_STOPPED) + *freeze_state |= OPAL_EEH_STOPPED_DMA_FREEZE; + + return OPAL_SUCCESS; +} + +static int64_t phb3_eeh_freeze_clear(struct phb *phb, uint64_t pe_number, + uint64_t eeh_action_token) +{ + struct phb3 *p = phb_to_phb3(phb); + uint64_t err, peev[4]; + int32_t i; + bool frozen_pe = false; + + if (p->broken) + return OPAL_HARDWARE; + + /* Summary. If nothing, move to clearing the PESTs which can + * contain a freeze state from a previous error or simply set + * explicitely by the user + */ + err = in_be64(p->regs + PHB_ETU_ERR_SUMMARY); + if (err == 0xffffffffffffffffUL) { + if (phb3_fenced(p)) { + PHBERR(p, "eeh_freeze_clear on fenced PHB\n"); + return OPAL_HARDWARE; + } + } + if (err != 0) + phb3_err_ER_clear(p); + + /* + * We have PEEV in system memory. It would give more performance + * to access that directly. 
+ */ + if (eeh_action_token & OPAL_EEH_ACTION_CLEAR_FREEZE_MMIO) { + phb3_ioda_sel(p, IODA2_TBL_PESTA, pe_number, false); + out_be64(p->regs + PHB_IODA_DATA0, 0); + } + if (eeh_action_token & OPAL_EEH_ACTION_CLEAR_FREEZE_DMA) { + phb3_ioda_sel(p, IODA2_TBL_PESTB, pe_number, false); + out_be64(p->regs + PHB_IODA_DATA0, 0); + } + + + /* Update ER pending indication */ + phb3_ioda_sel(p, IODA2_TBL_PEEV, 0, true); + for (i = 0; i < ARRAY_SIZE(peev); i++) { + peev[i] = in_be64(p->regs + PHB_IODA_DATA0); + if (peev[i]) { + frozen_pe = true; + break; + } + } + if (frozen_pe) { + p->err.err_src = PHB3_ERR_SRC_PHB; + p->err.err_class = PHB3_ERR_CLASS_ER; + p->err.err_bit = -1; + phb3_set_err_pending(p, true); + } else + phb3_set_err_pending(p, false); + + return OPAL_SUCCESS; +} + +static int64_t phb3_eeh_freeze_set(struct phb *phb, uint64_t pe_number, + uint64_t eeh_action_token) +{ + struct phb3 *p = phb_to_phb3(phb); + uint64_t data; + + if (p->broken) + return OPAL_HARDWARE; + + if (pe_number >= PHB3_MAX_PE_NUM) + return OPAL_PARAMETER; + + if (eeh_action_token != OPAL_EEH_ACTION_SET_FREEZE_MMIO && + eeh_action_token != OPAL_EEH_ACTION_SET_FREEZE_DMA && + eeh_action_token != OPAL_EEH_ACTION_SET_FREEZE_ALL) + return OPAL_PARAMETER; + + if (eeh_action_token & OPAL_EEH_ACTION_SET_FREEZE_MMIO) { + phb3_ioda_sel(p, IODA2_TBL_PESTA, pe_number, false); + data = in_be64(p->regs + PHB_IODA_DATA0); + data |= IODA2_PESTA_MMIO_FROZEN; + out_be64(p->regs + PHB_IODA_DATA0, data); + } + + if (eeh_action_token & OPAL_EEH_ACTION_SET_FREEZE_DMA) { + phb3_ioda_sel(p, IODA2_TBL_PESTB, pe_number, false); + data = in_be64(p->regs + PHB_IODA_DATA0); + data |= IODA2_PESTB_DMA_STOPPED; + out_be64(p->regs + PHB_IODA_DATA0, data); + } + + return OPAL_SUCCESS; +} + +static int64_t phb3_eeh_next_error(struct phb *phb, + uint64_t *first_frozen_pe, + uint16_t *pci_error_type, + uint16_t *severity) +{ + struct phb3 *p = phb_to_phb3(phb); + uint64_t fir, peev[4]; + uint32_t cfg32; + int32_t i, j; + + /* If the PHB is broken, we needn't go forward */ + if (p->broken) { + *pci_error_type = OPAL_EEH_PHB_ERROR; + *severity = OPAL_EEH_SEV_PHB_DEAD; + return OPAL_SUCCESS; + } + + if ((p->flags & PHB3_CAPP_RECOVERY)) { + *pci_error_type = OPAL_EEH_PHB_ERROR; + *severity = OPAL_EEH_SEV_PHB_FENCED; + return OPAL_SUCCESS; + } + + /* + * Check if we already have pending errors. If that's + * the case, then to get more information about the + * pending errors. Here we try PBCQ prior to PHB. + */ + if (phb3_err_pending(p) && + !phb3_err_check_pbcq(p) && + !phb3_err_check_lem(p)) + phb3_set_err_pending(p, false); + + /* Clear result */ + *pci_error_type = OPAL_EEH_NO_ERROR; + *severity = OPAL_EEH_SEV_NO_ERROR; + *first_frozen_pe = (uint64_t)-1; + + /* Check frozen PEs */ + if (!phb3_err_pending(p)) { + phb3_ioda_sel(p, IODA2_TBL_PEEV, 0, true); + for (i = 0; i < ARRAY_SIZE(peev); i++) { + peev[i] = in_be64(p->regs + PHB_IODA_DATA0); + if (peev[i]) { + p->err.err_src = PHB3_ERR_SRC_PHB; + p->err.err_class = PHB3_ERR_CLASS_ER; + p->err.err_bit = -1; + phb3_set_err_pending(p, true); + break; + } + } + } + + /* Mapping errors */ + if (phb3_err_pending(p)) { + /* + * If the frozen PE is caused by a malfunctioning TLP, we + * need reset the PHB. So convert ER to PHB-fatal error + * for the case. 
+ */ + if (p->err.err_class == PHB3_ERR_CLASS_ER) { + fir = phb3_read_reg_asb(p, PHB_LEM_FIR_ACCUM); + if (fir & PPC_BIT(60)) { + phb3_pcicfg_read32(&p->phb, 0, + p->aercap + PCIECAP_AER_UE_STATUS, &cfg32); + if (cfg32 & PCIECAP_AER_UE_MALFORMED_TLP) + p->err.err_class = PHB3_ERR_CLASS_FENCED; + } + } + + switch (p->err.err_class) { + case PHB3_ERR_CLASS_DEAD: + *pci_error_type = OPAL_EEH_PHB_ERROR; + *severity = OPAL_EEH_SEV_PHB_DEAD; + break; + case PHB3_ERR_CLASS_FENCED: + *pci_error_type = OPAL_EEH_PHB_ERROR; + *severity = OPAL_EEH_SEV_PHB_FENCED; + break; + case PHB3_ERR_CLASS_ER: + *pci_error_type = OPAL_EEH_PE_ERROR; + *severity = OPAL_EEH_SEV_PE_ER; + + phb3_ioda_sel(p, IODA2_TBL_PEEV, 0, true); + for (i = 0; i < ARRAY_SIZE(peev); i++) + peev[i] = in_be64(p->regs + PHB_IODA_DATA0); + for (i = ARRAY_SIZE(peev) - 1; i >= 0; i--) { + for (j = 0; j < 64; j++) { + if (peev[i] & PPC_BIT(j)) { + *first_frozen_pe = i * 64 + j; + break; + } + } + + if (*first_frozen_pe != (uint64_t)(-1)) + break; + } + + /* No frozen PE ? */ + if (*first_frozen_pe == (uint64_t)-1) { + *pci_error_type = OPAL_EEH_NO_ERROR; + *severity = OPAL_EEH_SEV_NO_ERROR; + phb3_set_err_pending(p, false); + } + + break; + case PHB3_ERR_CLASS_INF: + *pci_error_type = OPAL_EEH_PHB_ERROR; + *severity = OPAL_EEH_SEV_INF; + break; + default: + *pci_error_type = OPAL_EEH_NO_ERROR; + *severity = OPAL_EEH_SEV_NO_ERROR; + phb3_set_err_pending(p, false); + } + } + + return OPAL_SUCCESS; +} + +static int64_t phb3_err_inject_finalize(struct phb3 *p, uint64_t addr, + uint64_t mask, uint64_t ctrl, + bool is_write) +{ + if (is_write) + ctrl |= PHB_PAPR_ERR_INJ_CTL_WR; + else + ctrl |= PHB_PAPR_ERR_INJ_CTL_RD; + + out_be64(p->regs + PHB_PAPR_ERR_INJ_ADDR, addr); + out_be64(p->regs + PHB_PAPR_ERR_INJ_MASK, mask); + out_be64(p->regs + PHB_PAPR_ERR_INJ_CTL, ctrl); + + return OPAL_SUCCESS; +} + +static int64_t phb3_err_inject_mem32(struct phb3 *p, uint64_t pe_number, + uint64_t addr, uint64_t mask, + bool is_write) +{ + uint64_t base, len, segstart, segsize; + uint64_t a, m; + uint64_t ctrl = PHB_PAPR_ERR_INJ_CTL_OUTB; + uint32_t index; + + segsize = (M32_PCI_SIZE / PHB3_MAX_PE_NUM); + a = base = len = 0x0ull; + + for (index = 0; index < PHB3_MAX_PE_NUM; index++) { + if (GETFIELD(IODA2_M32DT_PE, p->m32d_cache[index]) != pe_number) + continue; + + /* Obviously, we can't support discontiguous segments. 
+ * We have to pick the first batch of contiguous segments + * for that case + */ + segstart = p->mm1_base + segsize * index; + if (!len) { + base = segstart; + len = segsize; + } else if ((base + len) == segstart) { + len += segsize; + } + + /* Check the specified address is valid one */ + if (addr >= segstart && addr < (segstart + segsize)) { + a = addr; + break; + } + } + + /* No MM32 segments assigned to the PE */ + if (!len) + return OPAL_PARAMETER; + + /* Specified address is out of range */ + if (!a) { + a = base; + len = len & ~(len - 1); + m = ~(len - 1); + } else { + m = mask; + } + + a = SETFIELD(PHB_PAPR_ERR_INJ_ADDR_MMIO, 0x0ull, a); + m = SETFIELD(PHB_PAPR_ERR_INJ_MASK_MMIO, 0x0ull, m); + + return phb3_err_inject_finalize(p, a, m, ctrl, is_write); +} + +static int64_t phb3_err_inject_mem64(struct phb3 *p, uint64_t pe_number, + uint64_t addr, uint64_t mask, + bool is_write) +{ + uint64_t base, len, segstart, segsize; + uint64_t cache, a, m; + uint64_t ctrl = PHB_PAPR_ERR_INJ_CTL_OUTB; + uint32_t index, s_index, e_index; + + /* By default, the PE is PCI device dependent one */ + s_index = 0; + e_index = ARRAY_SIZE(p->m64b_cache) - 2; + for (index = 0; index < RTT_TABLE_ENTRIES; index++) { + if (p->rte_cache[index] != pe_number) + continue; + + if (index + 8 >= RTT_TABLE_ENTRIES) + break; + + /* PCI bus dependent PE */ + if (p->rte_cache[index + 8] == pe_number) { + s_index = e_index = ARRAY_SIZE(p->m64b_cache) - 1; + break; + } + } + + a = base = len = 0x0ull; + for (index = s_index; !len && index <= e_index; index++) { + cache = p->m64b_cache[index]; + if (!(cache & IODA2_M64BT_ENABLE)) + continue; + + if (cache & IODA2_M64BT_SINGLE_PE) { + if (GETFIELD(IODA2_M64BT_PE_HI, cache) != (pe_number >> 5) || + GETFIELD(IODA2_M64BT_PE_LOW, cache) != (pe_number & 0x1f)) + continue; + + segstart = GETFIELD(IODA2_M64BT_SINGLE_BASE, cache); + segstart <<= 25; /* 32MB aligned */ + segsize = GETFIELD(IODA2_M64BT_SINGLE_MASK, cache); + segsize = (0x2000000ull - segsize) << 25; + } else { + segstart = GETFIELD(IODA2_M64BT_BASE, cache); + segstart <<= 20; /* 1MB aligned */ + segsize = GETFIELD(IODA2_M64BT_MASK, cache); + segsize = (0x40000000ull - segsize) << 20; + + segsize /= PHB3_MAX_PE_NUM; + segstart = segstart + segsize * pe_number; + } + + /* First window always wins based on the ascending + * searching priority the 16 BARs have. We're using + * the feature to assign resource for SRIOV VFs. 
+ */ + if (!len) { + base = segstart; + len = segsize; + } + + /* Specified address is valid one */ + if (addr >= segstart && addr < (segstart + segsize)) { + a = addr; + } + } + + /* No MM64 segments assigned to the PE */ + if (!len) + return OPAL_PARAMETER; + + /* Address specified or calculated */ + if (!a) { + a = base; + len = len & ~(len - 1); + m = ~(len - 1); + } else { + m = mask; + } + + a = SETFIELD(PHB_PAPR_ERR_INJ_ADDR_MMIO, 0x0ull, a); + m = SETFIELD(PHB_PAPR_ERR_INJ_MASK_MMIO, 0x0ull, m); + + return phb3_err_inject_finalize(p, a, m, ctrl, is_write); +} + +static int64_t phb3_err_inject_cfg(struct phb3 *p, uint64_t pe_number, + uint64_t addr, uint64_t mask, + bool is_write) +{ + uint64_t a, m, prefer; + uint64_t ctrl = PHB_PAPR_ERR_INJ_CTL_CFG; + int bdfn; + bool is_bus_pe; + + a = 0xffffull; + prefer = 0xffffull; + m = PHB_PAPR_ERR_INJ_MASK_CFG_ALL; + for (bdfn = 0; bdfn < RTT_TABLE_ENTRIES; bdfn++) { + if (p->rte_cache[bdfn] != pe_number) + continue; + + /* The PE can be associated with PCI bus or device */ + is_bus_pe = false; + if ((bdfn + 8) < RTT_TABLE_ENTRIES && + p->rte_cache[bdfn + 8] == pe_number) + is_bus_pe = true; + + /* Figure out the PCI config address */ + if (prefer == 0xffffull) { + if (is_bus_pe) { + m = PHB_PAPR_ERR_INJ_MASK_CFG; + prefer = SETFIELD(m, 0x0ull, PCI_BUS_NUM(bdfn)); + } else { + m = PHB_PAPR_ERR_INJ_MASK_CFG_ALL; + prefer = SETFIELD(m, 0x0ull, bdfn); + } + } + + /* Check the input address is valid or not */ + if (!is_bus_pe && + GETFIELD(PHB_PAPR_ERR_INJ_MASK_CFG_ALL, addr) == bdfn) { + a = addr; + break; + } + + if (is_bus_pe && + GETFIELD(PHB_PAPR_ERR_INJ_MASK_CFG, addr) == PCI_BUS_NUM(bdfn)) { + a = addr; + break; + } + } + + /* Invalid PE number */ + if (prefer == 0xffffull) + return OPAL_PARAMETER; + + /* Specified address is out of range */ + if (a == 0xffffull) + a = prefer; + else + m = mask; + + return phb3_err_inject_finalize(p, a, m, ctrl, is_write); +} + +static int64_t phb3_err_inject_dma(struct phb3 *p, uint64_t pe_number, + uint64_t addr, uint64_t mask, + bool is_write, bool is_64bits) +{ + uint32_t index, page_size; + uint64_t tve, table_entries; + uint64_t base, start, end, len, a, m; + uint64_t ctrl = PHB_PAPR_ERR_INJ_CTL_INB; + + /* TVE index and base address */ + if (!is_64bits) { + index = (pe_number << 1); + base = 0x0ull; + } else { + index = ((pe_number << 1) + 1); + base = (0x1ull << 59); + } + + /* Raw data of table entries and page size */ + tve = p->tve_cache[index]; + table_entries = GETFIELD(IODA2_TVT_TCE_TABLE_SIZE, tve); + table_entries = (0x1ull << (table_entries + 8)); + page_size = GETFIELD(IODA2_TVT_IO_PSIZE, tve); + if (!page_size && !(tve & PPC_BIT(51))) + return OPAL_UNSUPPORTED; + + /* Check the page size */ + switch (page_size) { + case 0: /* bypass */ + start = ((tve & (0x3ull << 10)) << 14) | + ((tve & (0xffffffull << 40)) >> 40); + end = ((tve & (0x3ull << 8)) << 16) | + ((tve & (0xffffffull << 16)) >> 16); + + /* 16MB aligned size */ + len = (end - start) << 24; + break; + case 5: /* 64KB */ + len = table_entries * 0x10000ull; + break; + case 13: /* 16MB */ + len = table_entries * 0x1000000ull; + break; + case 17: /* 256MB */ + len = table_entries * 0x10000000ull; + break; + case 1: /* 4KB */ + default: + len = table_entries * 0x1000ull; + } + + /* The specified address is in range */ + if (addr && addr >= base && addr < (base + len)) { + a = addr; + m = mask; + } else { + a = base; + len = len & ~(len - 1); + m = ~(len - 1); + } + + return phb3_err_inject_finalize(p, a, m, ctrl, is_write); +} + 
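+/*
+ * Note: the 32-bit and 64-bit DMA injection wrappers below only differ
+ * in which TVE entry phb3_err_inject_dma() uses (even entries for the
+ * 32-bit window, odd entries for the 64-bit window at 1ull << 59).
+ */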
+static int64_t phb3_err_inject_dma32(struct phb3 *p, uint64_t pe_number, + uint64_t addr, uint64_t mask, + bool is_write) +{ + return phb3_err_inject_dma(p, pe_number, addr, mask, is_write, false); +} + +static int64_t phb3_err_inject_dma64(struct phb3 *p, uint64_t pe_number, + uint64_t addr, uint64_t mask, + bool is_write) +{ + return phb3_err_inject_dma(p, pe_number, addr, mask, is_write, true); +} + +static int64_t phb3_err_inject(struct phb *phb, uint64_t pe_number, + uint32_t type, uint32_t func, + uint64_t addr, uint64_t mask) +{ + struct phb3 *p = phb_to_phb3(phb); + int64_t (*handler)(struct phb3 *p, uint64_t pe_number, + uint64_t addr, uint64_t mask, bool is_write); + bool is_write; + + /* How could we get here without valid RTT? */ + if (!p->tbl_rtt) + return OPAL_HARDWARE; + + /* We can't inject error to the reserved PE */ + if (pe_number == PHB3_RESERVED_PE_NUM || pe_number >= PHB3_MAX_PE_NUM) + return OPAL_PARAMETER; + + /* Clear leftover from last time */ + out_be64(p->regs + PHB_PAPR_ERR_INJ_CTL, 0x0ul); + + switch (func) { + case OPAL_ERR_INJECT_FUNC_IOA_LD_MEM_ADDR: + case OPAL_ERR_INJECT_FUNC_IOA_LD_MEM_DATA: + is_write = false; + if (type == OPAL_ERR_INJECT_TYPE_IOA_BUS_ERR64) + handler = phb3_err_inject_mem64; + else + handler = phb3_err_inject_mem32; + break; + case OPAL_ERR_INJECT_FUNC_IOA_ST_MEM_ADDR: + case OPAL_ERR_INJECT_FUNC_IOA_ST_MEM_DATA: + is_write = true; + if (type == OPAL_ERR_INJECT_TYPE_IOA_BUS_ERR64) + handler = phb3_err_inject_mem64; + else + handler = phb3_err_inject_mem32; + break; + case OPAL_ERR_INJECT_FUNC_IOA_LD_CFG_ADDR: + case OPAL_ERR_INJECT_FUNC_IOA_LD_CFG_DATA: + is_write = false; + handler = phb3_err_inject_cfg; + break; + case OPAL_ERR_INJECT_FUNC_IOA_ST_CFG_ADDR: + case OPAL_ERR_INJECT_FUNC_IOA_ST_CFG_DATA: + is_write = true; + handler = phb3_err_inject_cfg; + break; + case OPAL_ERR_INJECT_FUNC_IOA_DMA_RD_ADDR: + case OPAL_ERR_INJECT_FUNC_IOA_DMA_RD_DATA: + case OPAL_ERR_INJECT_FUNC_IOA_DMA_RD_MASTER: + case OPAL_ERR_INJECT_FUNC_IOA_DMA_RD_TARGET: + is_write = false; + if (type == OPAL_ERR_INJECT_TYPE_IOA_BUS_ERR64) + handler = phb3_err_inject_dma64; + else + handler = phb3_err_inject_dma32; + break; + case OPAL_ERR_INJECT_FUNC_IOA_DMA_WR_ADDR: + case OPAL_ERR_INJECT_FUNC_IOA_DMA_WR_DATA: + case OPAL_ERR_INJECT_FUNC_IOA_DMA_WR_MASTER: + case OPAL_ERR_INJECT_FUNC_IOA_DMA_WR_TARGET: + is_write = true; + if (type == OPAL_ERR_INJECT_TYPE_IOA_BUS_ERR64) + handler = phb3_err_inject_dma64; + else + handler = phb3_err_inject_dma32; + break; + default: + return OPAL_PARAMETER; + } + + return handler(p, pe_number, addr, mask, is_write); +} + +static int64_t phb3_get_diag_data(struct phb *phb, + void *diag_buffer, + uint64_t diag_buffer_len) +{ + struct phb3 *p = phb_to_phb3(phb); + struct OpalIoPhb3ErrorData *data = diag_buffer; + bool fenced; + + if (diag_buffer_len < sizeof(struct OpalIoPhb3ErrorData)) + return OPAL_PARAMETER; + if (p->broken) + return OPAL_HARDWARE; + + /* + * Dummy check for fence so that phb3_read_phb_status knows + * whether to use ASB or AIB + */ + fenced = phb3_fenced(p); + phb3_read_phb_status(p, data); + + if (!fenced) + phb3_eeh_dump_regs(p, data); + + /* + * We're running to here probably because of errors + * (INF class). For that case, we need clear the error + * explicitly. 
+ */ + if (phb3_err_pending(p) && + p->err.err_class == PHB3_ERR_CLASS_INF && + p->err.err_src == PHB3_ERR_SRC_PHB) { + phb3_err_ER_clear(p); + phb3_set_err_pending(p, false); + } + + return OPAL_SUCCESS; +} + +static int64_t phb3_get_capp_info(int chip_id, struct phb *phb, + struct capp_info *info) +{ + struct phb3 *p = phb_to_phb3(phb); + struct proc_chip *chip = get_chip(p->chip_id); + uint32_t offset; + + if (chip_id != p->chip_id) + return OPAL_PARAMETER; + + if (!((1 << p->index) & chip->capp_phb3_attached_mask)) + return OPAL_PARAMETER; + + offset = PHB3_CAPP_REG_OFFSET(p); + + if (PHB3_IS_NAPLES(p)) { + if (p->index == 0) + info->capp_index = 0; + else + info->capp_index = 1; + } else + info->capp_index = 0; + info->phb_index = p->index; + info->capp_fir_reg = CAPP_FIR + offset; + info->capp_fir_mask_reg = CAPP_FIR_MASK + offset; + info->capp_fir_action0_reg = CAPP_FIR_ACTION0 + offset; + info->capp_fir_action1_reg = CAPP_FIR_ACTION1 + offset; + info->capp_err_status_ctrl_reg = CAPP_ERR_STATUS_CTRL + offset; + + return OPAL_SUCCESS; +} + +static void phb3_init_capp_regs(struct phb3 *p, bool dma_mode) +{ + uint64_t reg; + uint32_t offset; + uint64_t read_buffers = 0; + + offset = PHB3_CAPP_REG_OFFSET(p); + xscom_read(p->chip_id, APC_MASTER_PB_CTRL + offset, ®); + reg &= ~PPC_BITMASK(10, 11); + reg |= PPC_BIT(3); + if (dma_mode) { + /* In DMA mode, the CAPP only owns some of the PHB read buffers */ + read_buffers = 0x1; + + /* + * HW301991 - XSL sends PTE updates with nodal scope instead of + * group scope. The workaround is to force all commands to + * unlimited scope by setting bit 4. This may have a slight + * performance impact, but it would be negligible on the XSL. + * To avoid the possibility it might impact other cards, key it + * off DMA mode since the XSL based Mellanox CX4 is the only + * card to use this mode in P8 timeframe: + */ + reg |= PPC_BIT(4); + } + reg |= read_buffers << PPC_BITLSHIFT(11); + xscom_write(p->chip_id, APC_MASTER_PB_CTRL + offset, reg); + + /* Dynamically workout which PHB to connect to port 0 of the CAPP. + * Here is the table from the CAPP workbook: + * APC_MASTER CAPP CAPP + * bits 1:3 port0 port1 + * 000 disabled disabled + * * 001 PHB2 disabled + * * 010 PHB1 disabled + * 011 PHB1 PHB2 + * * 100 PHB0 disabled + * 101 PHB0 PHB2 + * 110 PHB0 PHB1 + * + * We don't use port1 so only those starred above are used. 
+ * Hence reduce table to: + * PHB0 -> APC MASTER(bits 1:3) = 0b100 + * PHB1 -> APC MASTER(bits 1:3) = 0b010 + * PHB2 -> APC MASTER(bits 1:3) = 0b001 + * + * Note: Naples has two CAPP units, statically mapped: + * CAPP0/PHB0 -> APC MASTER(bits 1:3) = 0b100 + * CAPP1/PHB1 -> APC MASTER(bits 1:3) = 0b010 + */ + reg = 0x4000000000000000ULL >> p->index; + reg |= 0x0070000000000000UL; + xscom_write(p->chip_id, APC_MASTER_CAPI_CTRL + offset, reg); + PHBINF(p, "CAPP: port attached\n"); + + /* tlb and mmio */ + xscom_write(p->chip_id, TRANSPORT_CONTROL + offset, 0x4028000104000000UL); + + xscom_write(p->chip_id, CANNED_PRESP_MAP0 + offset, 0); + xscom_write(p->chip_id, CANNED_PRESP_MAP1 + offset, 0xFFFFFFFF00000000UL); + xscom_write(p->chip_id, CANNED_PRESP_MAP2 + offset, 0); + + /* error recovery */ + xscom_write(p->chip_id, CAPP_ERR_STATUS_CTRL + offset, 0); + + xscom_write(p->chip_id, FLUSH_SUE_STATE_MAP + offset, + 0x1DC20B6600000000UL); + xscom_write(p->chip_id, CAPP_EPOCH_TIMER_CTRL + offset, + 0xC0000000FFF0FFE0UL); + xscom_write(p->chip_id, FLUSH_UOP_CONFIG1 + offset, + 0xB188280728000000UL); + xscom_write(p->chip_id, FLUSH_UOP_CONFIG2 + offset, 0xB188400F00000000UL); + + reg = 0xA1F0000000000000UL; + reg |= read_buffers << PPC_BITLSHIFT(39); + xscom_write(p->chip_id, SNOOP_CAPI_CONFIG + offset, reg); +} + +/* override some inits with CAPI defaults */ +static void phb3_init_capp_errors(struct phb3 *p) +{ + out_be64(p->regs + PHB_ERR_AIB_FENCE_ENABLE, 0xffffffdd8c80ffc0UL); + out_be64(p->regs + PHB_OUT_ERR_AIB_FENCE_ENABLE, 0x9cf3fe08f8dc700fUL); + out_be64(p->regs + PHB_INA_ERR_AIB_FENCE_ENABLE, 0xffff57fbff01ffdeUL); + out_be64(p->regs + PHB_INB_ERR_AIB_FENCE_ENABLE, 0xfcffe0fbff7ff0ecUL); + out_be64(p->regs + PHB_LEM_ERROR_MASK, 0x40018e2400022482UL); +} + +/* + * Enable CAPI mode on a PHB + * + * Changes to this init sequence may require updating disable_capi_mode(). 
+ */ +static int64_t enable_capi_mode(struct phb3 *p, uint64_t pe_number, bool dma_mode) +{ + uint64_t reg; + int i; + + xscom_read(p->chip_id, PE_CAPP_EN + PE_REG_OFFSET(p), ®); + if (reg & PPC_BIT(0)) { + PHBDBG(p, "Already in CAPP mode\n"); + } + + /* poll cqstat */ + for (i = 0; i < 500000; i++) { + xscom_read(p->chip_id, p->pe_xscom + 0xf, ®); + if (!(reg & 0xC000000000000000UL)) + break; + time_wait_us(10); + } + if (reg & 0xC000000000000000UL) { + PHBERR(p, "CAPP: Timeout waiting for pending transaction\n"); + return OPAL_HARDWARE; + } + + /* pb aib capp enable */ + reg = PPC_BIT(0); /* capp enable */ + if (dma_mode) + reg |= PPC_BIT(1); /* capp dma mode */ + xscom_write(p->chip_id, p->spci_xscom + 0x3, reg); + + /* FIXME security timer bar + xscom_write(p->chip_id, p->spci_xscom + 0x4, 0x8000000000000000ull); + */ + + /* aib mode */ + xscom_read(p->chip_id, p->pci_xscom + 0xf, ®); + reg &= ~PPC_BITMASK(6,7); + reg |= PPC_BIT(8); + reg |= PPC_BITMASK(40, 41); + reg &= ~PPC_BIT(42); + xscom_write(p->chip_id, p->pci_xscom + 0xf, reg); + + /* pci hwconf0 */ + xscom_read(p->chip_id, p->pe_xscom + 0x18, ®); + reg |= PPC_BIT(14); + reg &= ~PPC_BIT(15); + xscom_write(p->chip_id, p->pe_xscom + 0x18, reg); + + /* pci hwconf1 */ + xscom_read(p->chip_id, p->pe_xscom + 0x19, ®); + reg &= ~PPC_BITMASK(17,18); + xscom_write(p->chip_id, p->pe_xscom + 0x19, reg); + + /* aib tx cmd cred */ + xscom_read(p->chip_id, p->pci_xscom + 0xd, ®); + if (dma_mode) { + /* + * In DMA mode, increase AIB credit value for ch 2 (DMA read) + * for performance reasons + */ + reg &= ~PPC_BITMASK(42, 47); + reg |= PPC_BITMASK(43, 45); + } else { + reg &= ~PPC_BITMASK(42, 46); + reg |= PPC_BIT(47); + } + xscom_write(p->chip_id, p->pci_xscom + 0xd, reg); + + xscom_write(p->chip_id, p->pci_xscom + 0xc, 0xff00000000000000ull); + + /* pci mode ctl */ + xscom_read(p->chip_id, p->pe_xscom + 0xb, ®); + reg |= PPC_BIT(25); + xscom_write(p->chip_id, p->pe_xscom + 0xb, reg); + + /* set tve no translate mode allow mmio window */ + memset(p->tve_cache, 0x0, sizeof(p->tve_cache)); + if (dma_mode) { + /* + * CAPP DMA mode needs access to all of memory, set address + * range to 0x0000000000000000: 0x0002FFFFFFFFFFF + */ + p->tve_cache[pe_number * 2] = 0x000000FFFFFF0200ULL; + } else { + /* Allow address range 0x0002000000000000: 0x0002FFFFFFFFFFF */ + p->tve_cache[pe_number * 2] = 0x000000FFFFFF0a00ULL; + } + + phb3_ioda_sel(p, IODA2_TBL_TVT, 0, true); + for (i = 0; i < ARRAY_SIZE(p->tve_cache); i++) + out_be64(p->regs + PHB_IODA_DATA0, p->tve_cache[i]); + + /* set m64 bar to pass mmio window */ + memset(p->m64b_cache, 0x0, sizeof(p->m64b_cache)); + p->m64b_cache[0] = PPC_BIT(0); /*enable*/ + p->m64b_cache[0] |= PPC_BIT(1); /*single pe*/ + p->m64b_cache[0] |= (p->mm0_base << 12) | ((pe_number & 0x3e0) << 27); /*base and upper pe*/ + p->m64b_cache[0] |= 0x3fffc000 | (pe_number & 0x1f); /*mask and lower pe*/ + + p->m64b_cache[1] = PPC_BIT(0); /*enable*/ + p->m64b_cache[1] |= PPC_BIT(1); /*single pe*/ + p->m64b_cache[1] |= (0x0002000000000000ULL << 12) | ((pe_number & 0x3e0) << 27); /*base and upper pe*/ + p->m64b_cache[1] |= 0x3f000000 | (pe_number & 0x1f); /*mask and lower pe*/ + + phb3_ioda_sel(p, IODA2_TBL_M64BT, 0, true); + for (i = 0; i < ARRAY_SIZE(p->m64b_cache); i++) + out_be64(p->regs + PHB_IODA_DATA0, p->m64b_cache[i]); + + out_be64(p->regs + PHB_PHB3_CONFIG, PHB_PHB3C_64B_TCE_EN); + out_be64(p->regs + PHB_PHB3_CONFIG, PHB_PHB3C_64BIT_MSI_EN); + + phb3_init_capp_errors(p); + + phb3_init_capp_regs(p, dma_mode); + + if 
(!chiptod_capp_timebase_sync(p->chip_id, CAPP_TFMR, CAPP_TB, + PHB3_CAPP_REG_OFFSET(p))) { + PHBERR(p, "CAPP: Failed to sync timebase\n"); + return OPAL_HARDWARE; + } + + /* set callbacks to handle HMI events */ + capi_ops.get_capp_info = &phb3_get_capp_info; + + return OPAL_SUCCESS; +} + +static int64_t phb3_set_capi_mode(struct phb *phb, uint64_t mode, + uint64_t pe_number) +{ + struct phb3 *p = phb_to_phb3(phb); + struct proc_chip *chip = get_chip(p->chip_id); + uint64_t reg; + uint64_t read_buffers; + uint32_t offset; + u8 mask; + + if (!capp_ucode_loaded(chip, p->index)) { + PHBERR(p, "CAPP: ucode not loaded\n"); + return OPAL_RESOURCE; + } + + lock(&capi_lock); + if (PHB3_IS_NAPLES(p)) { + /* Naples has two CAPP units, statically mapped. */ + chip->capp_phb3_attached_mask |= 1 << p->index; + } else { + /* + * Check if CAPP port is being used by any another PHB. + * Check and set chip->capp_phb3_attached_mask atomically + * incase two phb3_set_capi_mode() calls race. + */ + mask = ~(1 << p->index); + if (chip->capp_phb3_attached_mask & mask) { + PHBERR(p, + "CAPP: port already in use by another PHB:%x\n", + chip->capp_phb3_attached_mask); + unlock(&capi_lock); + return false; + } + chip->capp_phb3_attached_mask = 1 << p->index; + } + unlock(&capi_lock); + + offset = PHB3_CAPP_REG_OFFSET(p); + xscom_read(p->chip_id, CAPP_ERR_STATUS_CTRL + offset, ®); + if ((reg & PPC_BIT(5))) { + PHBERR(p, "CAPP: recovery failed (%016llx)\n", reg); + return OPAL_HARDWARE; + } else if ((reg & PPC_BIT(0)) && (!(reg & PPC_BIT(1)))) { + PHBDBG(p, "CAPP: recovery in progress\n"); + return OPAL_BUSY; + } + + switch (mode) { + case OPAL_PHB_CAPI_MODE_PCIE: + /* Switching back to PCIe mode requires a creset */ + return OPAL_UNSUPPORTED; + + case OPAL_PHB_CAPI_MODE_CAPI: + return enable_capi_mode(p, pe_number, false); + + case OPAL_PHB_CAPI_MODE_DMA: + return enable_capi_mode(p, pe_number, true); + + case OPAL_PHB_CAPI_MODE_SNOOP_OFF: + xscom_write(p->chip_id, SNOOP_CAPI_CONFIG + offset, + 0x0000000000000000); + return OPAL_SUCCESS; + + case OPAL_PHB_CAPI_MODE_SNOOP_ON: + xscom_write(p->chip_id, CAPP_ERR_STATUS_CTRL + offset, + 0x0000000000000000); + /* + * Make sure the PHB read buffers being snooped match those + * being used so we don't need another mode to set SNOOP+DMA + */ + xscom_read(p->chip_id, APC_MASTER_PB_CTRL + offset, ®); + read_buffers = (reg >> PPC_BITLSHIFT(11)) & 0x3; + reg = 0xA1F0000000000000UL; + reg |= read_buffers << PPC_BITLSHIFT(39); + xscom_write(p->chip_id, SNOOP_CAPI_CONFIG + offset, reg); + + return OPAL_SUCCESS; + } + + return OPAL_UNSUPPORTED; +} + +static int64_t phb3_set_capp_recovery(struct phb *phb) +{ + struct phb3 *p = phb_to_phb3(phb); + + if (p->flags & PHB3_CAPP_RECOVERY) + return 0; + + /* set opal event flag to indicate eeh condition */ + opal_update_pending_evt(OPAL_EVENT_PCI_ERROR, + OPAL_EVENT_PCI_ERROR); + + p->flags |= PHB3_CAPP_RECOVERY; + + return 0; +} + +static const struct phb_ops phb3_ops = { + .cfg_read8 = phb3_pcicfg_read8, + .cfg_read16 = phb3_pcicfg_read16, + .cfg_read32 = phb3_pcicfg_read32, + .cfg_write8 = phb3_pcicfg_write8, + .cfg_write16 = phb3_pcicfg_write16, + .cfg_write32 = phb3_pcicfg_write32, + .get_reserved_pe_number = phb3_get_reserved_pe_number, + .device_init = phb3_device_init, + .device_remove = phb3_device_remove, + .ioda_reset = phb3_ioda_reset, + .papr_errinjct_reset = phb3_papr_errinjct_reset, + .pci_reinit = phb3_pci_reinit, + .set_phb_mem_window = phb3_set_phb_mem_window, + .phb_mmio_enable = phb3_phb_mmio_enable, + 
.map_pe_mmio_window = phb3_map_pe_mmio_window, + .map_pe_dma_window = phb3_map_pe_dma_window, + .map_pe_dma_window_real = phb3_map_pe_dma_window_real, + .pci_msi_eoi = phb3_pci_msi_eoi, + .set_xive_pe = phb3_set_ive_pe, + .get_msi_32 = phb3_get_msi_32, + .get_msi_64 = phb3_get_msi_64, + .set_pe = phb3_set_pe, + .set_peltv = phb3_set_peltv, + .eeh_freeze_status = phb3_eeh_freeze_status, + .eeh_freeze_clear = phb3_eeh_freeze_clear, + .eeh_freeze_set = phb3_eeh_freeze_set, + .next_error = phb3_eeh_next_error, + .err_inject = phb3_err_inject, + .get_diag_data2 = phb3_get_diag_data, + .set_capi_mode = phb3_set_capi_mode, + .set_capp_recovery = phb3_set_capp_recovery, +}; + +/* + * We should access those registers at the stage since the + * AIB isn't ready yet. + */ +static void phb3_setup_aib(struct phb3 *p) +{ + /* Init_2 - AIB TX Channel Mapping Register */ + phb3_write_reg_asb(p, PHB_AIB_TX_CHAN_MAPPING, 0x0211230000000000UL); + + /* Init_3 - AIB RX command credit register */ + if (p->rev >= PHB3_REV_VENICE_DD20) + phb3_write_reg_asb(p, PHB_AIB_RX_CMD_CRED, 0x0020000100020001UL); + else + phb3_write_reg_asb(p, PHB_AIB_RX_CMD_CRED, 0x0020000100010001UL); + + /* Init_4 - AIB rx data credit register */ + if (p->rev >= PHB3_REV_VENICE_DD20) + phb3_write_reg_asb(p, PHB_AIB_RX_DATA_CRED, 0x0020002000010001UL); + else + phb3_write_reg_asb(p, PHB_AIB_RX_DATA_CRED, 0x0020002000000001UL); + + /* Init_5 - AIB rx credit init timer register */ + phb3_write_reg_asb(p, PHB_AIB_RX_CRED_INIT_TIMER, 0x0f00000000000000UL); + + /* Init_6 - AIB Tag Enable register */ + phb3_write_reg_asb(p, PHB_AIB_TAG_ENABLE, 0xffffffff00000000UL); + + /* Init_7 - TCE Tag Enable register */ + phb3_write_reg_asb(p, PHB_TCE_TAG_ENABLE, 0xffffffff00000000UL); +} + +static void phb3_init_ioda2(struct phb3 *p) +{ + /* Init_14 - LSI Source ID */ + out_be64(p->regs + PHB_LSI_SOURCE_ID, + SETFIELD(PHB_LSI_SRC_ID, 0ul, 0xff)); + + /* Init_15 - IVT BAR / Length + * Init_16 - RBA BAR + * - RTT BAR + * Init_17 - PELT-V BAR + */ + out_be64(p->regs + PHB_RTT_BAR, + p->tbl_rtt | PHB_RTT_BAR_ENABLE); + out_be64(p->regs + PHB_PELTV_BAR, + p->tbl_peltv | PHB_PELTV_BAR_ENABLE); + out_be64(p->regs + PHB_IVT_BAR, + p->tbl_ivt | 0x800 | PHB_IVT_BAR_ENABLE); + + /* DD2.0 or the subsequent chips don't have memory + * resident RBA. + */ + if (p->rev >= PHB3_REV_MURANO_DD20) + out_be64(p->regs + PHB_RBA_BAR, 0x0ul); + else + out_be64(p->regs + PHB_RBA_BAR, + p->tbl_rba | PHB_RBA_BAR_ENABLE); + + /* Init_18..21 - Setup M32 */ + out_be64(p->regs + PHB_M32_BASE_ADDR, p->mm1_base); + out_be64(p->regs + PHB_M32_BASE_MASK, ~(M32_PCI_SIZE - 1)); + out_be64(p->regs + PHB_M32_START_ADDR, M32_PCI_START); + + /* Init_22 - Setup PEST BAR */ + out_be64(p->regs + PHB_PEST_BAR, + p->tbl_pest | PHB_PEST_BAR_ENABLE); + + /* Init_23 - PCIE Outbound upper address */ + out_be64(p->regs + PHB_M64_UPPER_BITS, 0); + + /* Init_24 - Interrupt represent timers + * The register doesn't take effect on Murano DD1.0 + */ + if (p->rev >= PHB3_REV_NAPLES_DD10) + out_be64(p->regs + PHB_INTREP_TIMER, 0x0014000000000000UL); + else if (p->rev >= PHB3_REV_MURANO_DD20) + out_be64(p->regs + PHB_INTREP_TIMER, 0x0004000000000000UL); + else + out_be64(p->regs + PHB_INTREP_TIMER, 0); + + /* Init_25 - PHB3 Configuration Register. 
Clear TCE cache then + * configure the PHB + */ + out_be64(p->regs + PHB_PHB3_CONFIG, PHB_PHB3C_64B_TCE_EN); + out_be64(p->regs + PHB_PHB3_CONFIG, + PHB_PHB3C_M32_EN | PHB_PHB3C_32BIT_MSI_EN | + PHB_PHB3C_64BIT_MSI_EN); + + /* Init_26 - At least 512ns delay according to spec */ + time_wait_us(2); + + /* Init_27..36 - On-chip IODA tables init */ + phb3_ioda_reset(&p->phb, false); +} + +static bool phb3_wait_dlp_reset(struct phb3 *p) +{ + unsigned int i; + uint64_t val; + + /* + * Firmware cannot access the UTL core regs or PCI config space + * until the cores are out of DL_PGRESET. + * DL_PGRESET should be polled until it is inactive with a value + * of '0'. The recommended polling frequency is once every 1ms. + * Firmware should poll at least 200 attempts before giving up. + * MMIO Stores to the link are silently dropped by the UTL core if + * the link is down. + * MMIO Loads to the link will be dropped by the UTL core and will + * eventually time-out and will return an all ones response if the + * link is down. + */ +#define DLP_RESET_ATTEMPTS 40000 + + PHBDBG(p, "Waiting for DLP PG reset to complete...\n"); + for (i = 0; i < DLP_RESET_ATTEMPTS; i++) { + val = in_be64(p->regs + PHB_PCIE_DLP_TRAIN_CTL); + if (!(val & PHB_PCIE_DLP_TC_DL_PGRESET)) + break; + time_wait_us(10); + } + if (val & PHB_PCIE_DLP_TC_DL_PGRESET) { + PHBERR(p, "Timeout waiting for DLP PG reset !\n"); + return false; + } + return true; +} + +/* phb3_init_rc - Initialize the Root Complex config space + */ +static bool phb3_init_rc_cfg(struct phb3 *p) +{ + int64_t ecap, aercap; + + /* XXX Handle errors ? */ + + /* Init_45..46: + * + * Set primary bus to 0, secondary to 1 and subordinate to 0xff + */ + phb3_pcicfg_write32(&p->phb, 0, PCI_CFG_PRIMARY_BUS, 0x00ff0100); + + /* Init_47..52 + * + * IO and Memory base & limits are set to base > limit, which + * allows all inbounds. + * + * XXX This has the potential of confusing the OS which might + * think that nothing is forwarded downstream. We probably need + * to fix this to match the IO and M32 PHB windows + */ + phb3_pcicfg_write16(&p->phb, 0, PCI_CFG_IO_BASE, 0x0010); + phb3_pcicfg_write32(&p->phb, 0, PCI_CFG_MEM_BASE, 0x00000010); + phb3_pcicfg_write32(&p->phb, 0, PCI_CFG_PREF_MEM_BASE, 0x00000010); + + /* Init_53..54 - Setup bridge control enable forwarding of CORR, FATAL, + * and NONFATAL errors + */ + phb3_pcicfg_write16(&p->phb, 0, PCI_CFG_BRCTL, PCI_CFG_BRCTL_SERR_EN); + + /* Init_55..56 + * + * PCIE Device control/status, enable error reporting, disable relaxed + * ordering, set MPS to 128 (see note), clear errors. + * + * Note: The doc recommends to set MPS to 4K. This has proved to have + * some issues as it requires specific claming of MRSS on devices and + * we've found devices in the field that misbehave when doing that. + * + * We currently leave it all to 128 bytes (minimum setting) at init + * time. 
The generic PCIe probing later on might apply a different + * value, or the kernel will, but we play it safe at early init + */ + if (p->ecap <= 0) { + ecap = pci_find_cap(&p->phb, 0, PCI_CFG_CAP_ID_EXP); + if (ecap < 0) { + PHBERR(p, "Can't locate PCI-E capability\n"); + return false; + } + p->ecap = ecap; + } else { + ecap = p->ecap; + } + + phb3_pcicfg_write16(&p->phb, 0, ecap + PCICAP_EXP_DEVSTAT, + PCICAP_EXP_DEVSTAT_CE | + PCICAP_EXP_DEVSTAT_NFE | + PCICAP_EXP_DEVSTAT_FE | + PCICAP_EXP_DEVSTAT_UE); + + phb3_pcicfg_write16(&p->phb, 0, ecap + PCICAP_EXP_DEVCTL, + PCICAP_EXP_DEVCTL_CE_REPORT | + PCICAP_EXP_DEVCTL_NFE_REPORT | + PCICAP_EXP_DEVCTL_FE_REPORT | + PCICAP_EXP_DEVCTL_UR_REPORT | + SETFIELD(PCICAP_EXP_DEVCTL_MPS, 0, PCIE_MPS_128B)); + + /* Init_57..58 + * + * Root Control Register. Enable error reporting + * + * Note: Added CRS visibility. + */ + phb3_pcicfg_write16(&p->phb, 0, ecap + PCICAP_EXP_RC, + PCICAP_EXP_RC_SYSERR_ON_CE | + PCICAP_EXP_RC_SYSERR_ON_NFE | + PCICAP_EXP_RC_SYSERR_ON_FE | + PCICAP_EXP_RC_CRS_VISIBLE); + + /* Init_59..60 + * + * Device Control 2. Enable ARI fwd, set timer to RTOS timer + */ + phb3_pcicfg_write16(&p->phb, 0, ecap + PCICAP_EXP_DCTL2, + SETFIELD(PCICAP_EXP_DCTL2_CMPTOUT, 0, 0xf) | + PCICAP_EXP_DCTL2_ARI_FWD); + + /* Init_61..76 + * + * AER inits + */ + if (p->aercap <= 0) { + aercap = pci_find_ecap(&p->phb, 0, PCIECAP_ID_AER, NULL); + if (aercap < 0) { + PHBERR(p, "Can't locate AER capability\n"); + return false; + } + p->aercap = aercap; + } else { + aercap = p->aercap; + } + + /* Clear all UE status */ + phb3_pcicfg_write32(&p->phb, 0, aercap + PCIECAP_AER_UE_STATUS, + 0xffffffff); + /* Disable some error reporting as per the PHB3 spec */ + phb3_pcicfg_write32(&p->phb, 0, aercap + PCIECAP_AER_UE_MASK, + PCIECAP_AER_UE_POISON_TLP | + PCIECAP_AER_UE_COMPL_TIMEOUT | + PCIECAP_AER_UE_COMPL_ABORT | + PCIECAP_AER_UE_ECRC); + /* Report some errors as fatal */ + phb3_pcicfg_write32(&p->phb, 0, aercap + PCIECAP_AER_UE_SEVERITY, + PCIECAP_AER_UE_DLP | + PCIECAP_AER_UE_SURPRISE_DOWN | + PCIECAP_AER_UE_FLOW_CTL_PROT | + PCIECAP_AER_UE_UNEXP_COMPL | + PCIECAP_AER_UE_RECV_OVFLOW | + PCIECAP_AER_UE_MALFORMED_TLP); + /* Clear all CE status */ + phb3_pcicfg_write32(&p->phb, 0, aercap + PCIECAP_AER_CE_STATUS, + 0xffffffff); + /* Disable some error reporting as per the PHB3 spec */ + /* Note: When link down, also disable rcvr errors */ + phb3_pcicfg_write32(&p->phb, 0, aercap + PCIECAP_AER_CE_MASK, + PCIECAP_AER_CE_ADV_NONFATAL | + (p->has_link ? 0 : PCIECAP_AER_CE_RECVR_ERR)); + + /* Enable or disable ECRC generation & checking */ + phb3_enable_ecrc(&p->phb, !p->no_ecrc_devs); + + /* Enable reporting in root error control */ + phb3_pcicfg_write32(&p->phb, 0, aercap + PCIECAP_AER_RERR_CMD, + PCIECAP_AER_RERR_CMD_FE | + PCIECAP_AER_RERR_CMD_NFE | + PCIECAP_AER_RERR_CMD_CE); + /* Clear root error status */ + phb3_pcicfg_write32(&p->phb, 0, aercap + PCIECAP_AER_RERR_STA, + 0xffffffff); + + return true; +} + +static void phb3_init_utl(struct phb3 *p) +{ + /* Init_77..79: Clear spurrious errors and assign errors to the + * right "interrupt" signal + */ + out_be64(p->regs + UTL_SYS_BUS_AGENT_STATUS, 0xffffffffffffffffUL); + out_be64(p->regs + UTL_SYS_BUS_AGENT_ERR_SEVERITY, 0x5000000000000000UL); + out_be64(p->regs + UTL_SYS_BUS_AGENT_IRQ_EN, 0xfcc0000000000000UL); + + /* Init_80..81: Setup tag allocations + * + * Stick to HW defaults. 
May differs between PHB implementations + */ + + /* Init_82: PCI Express port control + * SW283991: Set Outbound Non-Posted request timeout to 16ms (RTOS). + */ + out_be64(p->regs + UTL_PCIE_PORT_CONTROL, 0x8588007000000000UL); + + /* Init_83..85: Clean & setup port errors */ + out_be64(p->regs + UTL_PCIE_PORT_STATUS, 0xffdfffffffffffffUL); + out_be64(p->regs + UTL_PCIE_PORT_ERROR_SEV, 0x5039000000000000UL); + + if (p->has_link) + out_be64(p->regs + UTL_PCIE_PORT_IRQ_EN, 0xad52800000000000UL); + else + out_be64(p->regs + UTL_PCIE_PORT_IRQ_EN, 0xad42800000000000UL); + + /* Init_86 : Cleanup RC errors */ + out_be64(p->regs + UTL_RC_STATUS, 0xffffffffffffffffUL); +} + +static void phb3_init_errors(struct phb3 *p) +{ + /* Init_88: LEM Error Mask : Temporarily disable error interrupts */ + out_be64(p->regs + PHB_LEM_ERROR_MASK, 0xffffffffffffffffUL); + + /* Init_89..97: Disable all error interrupts until end of init */ + out_be64(p->regs + PHB_ERR_STATUS, 0xffffffffffffffffUL); + out_be64(p->regs + PHB_ERR1_STATUS, 0x0000000000000000UL); + out_be64(p->regs + PHB_ERR_LEM_ENABLE, 0xffffffffffffffffUL); + out_be64(p->regs + PHB_ERR_FREEZE_ENABLE, 0x0000000080800000UL); + out_be64(p->regs + PHB_ERR_AIB_FENCE_ENABLE, 0xffffffdd0c00ffc0UL); + out_be64(p->regs + PHB_ERR_LOG_0, 0x0000000000000000UL); + out_be64(p->regs + PHB_ERR_LOG_1, 0x0000000000000000UL); + out_be64(p->regs + PHB_ERR_STATUS_MASK, 0x0000000000000000UL); + out_be64(p->regs + PHB_ERR1_STATUS_MASK, 0x0000000000000000UL); + + /* Init_98_106: Configure MMIO error traps & clear old state + * + * Don't enable BAR multi-hit detection in bit 41. + */ + out_be64(p->regs + PHB_OUT_ERR_STATUS, 0xffffffffffffffffUL); + out_be64(p->regs + PHB_OUT_ERR1_STATUS, 0x0000000000000000UL); + out_be64(p->regs + PHB_OUT_ERR_LEM_ENABLE, 0xfdffffffffbfffffUL); + out_be64(p->regs + PHB_OUT_ERR_FREEZE_ENABLE, 0x0000420800000000UL); + out_be64(p->regs + PHB_OUT_ERR_AIB_FENCE_ENABLE, 0x9cf3bc00f89c700fUL); + out_be64(p->regs + PHB_OUT_ERR_LOG_0, 0x0000000000000000UL); + out_be64(p->regs + PHB_OUT_ERR_LOG_1, 0x0000000000000000UL); + out_be64(p->regs + PHB_OUT_ERR_STATUS_MASK, 0x0000000000400000UL); + out_be64(p->regs + PHB_OUT_ERR1_STATUS_MASK, 0x0000000000400000UL); + + /* Init_107_115: Configure DMA_A error traps & clear old state */ + out_be64(p->regs + PHB_INA_ERR_STATUS, 0xffffffffffffffffUL); + out_be64(p->regs + PHB_INA_ERR1_STATUS, 0x0000000000000000UL); + out_be64(p->regs + PHB_INA_ERR_LEM_ENABLE, 0xffffffffffffffffUL); + out_be64(p->regs + PHB_INA_ERR_FREEZE_ENABLE, 0xc00003a901006000UL); + out_be64(p->regs + PHB_INA_ERR_AIB_FENCE_ENABLE, 0x3fff5452fe019fdeUL); + out_be64(p->regs + PHB_INA_ERR_LOG_0, 0x0000000000000000UL); + out_be64(p->regs + PHB_INA_ERR_LOG_1, 0x0000000000000000UL); + out_be64(p->regs + PHB_INA_ERR_STATUS_MASK, 0x0000000000000000UL); + out_be64(p->regs + PHB_INA_ERR1_STATUS_MASK, 0x0000000000000000UL); + + /* Init_116_124: Configure DMA_B error traps & clear old state */ + out_be64(p->regs + PHB_INB_ERR_STATUS, 0xffffffffffffffffUL); + out_be64(p->regs + PHB_INB_ERR1_STATUS, 0x0000000000000000UL); + out_be64(p->regs + PHB_INB_ERR_LEM_ENABLE, 0xffffffffffffffffUL); + + /* + * Workaround for errata HW257476, turn correctable messages into + * ER freezes on Murano and Venice DD1.0 + */ + if (p->rev < PHB3_REV_MURANO_DD20) + out_be64(p->regs + PHB_INB_ERR_FREEZE_ENABLE, + 0x0000600000000070UL); + else + out_be64(p->regs + PHB_INB_ERR_FREEZE_ENABLE, + 0x0000600000000060UL); + + out_be64(p->regs + PHB_INB_ERR_AIB_FENCE_ENABLE, 
0xfcff80fbff7ff08cUL); + out_be64(p->regs + PHB_INB_ERR_LOG_0, 0x0000000000000000UL); + out_be64(p->regs + PHB_INB_ERR_LOG_1, 0x0000000000000000UL); + out_be64(p->regs + PHB_INB_ERR_STATUS_MASK, 0x0000000000000000UL); + out_be64(p->regs + PHB_INB_ERR1_STATUS_MASK, 0x0000000000000000UL); + + /* Init_125..128: Cleanup & configure LEM */ + out_be64(p->regs + PHB_LEM_FIR_ACCUM, 0x0000000000000000UL); + out_be64(p->regs + PHB_LEM_ACTION0, 0xffffffffffffffffUL); + out_be64(p->regs + PHB_LEM_ACTION1, 0xffffffffffffffffUL); + out_be64(p->regs + PHB_LEM_WOF, 0x0000000000000000UL); +} + +static int64_t phb3_fixup_pec_inits(struct phb3 *p) +{ + int64_t rc; + uint64_t val; + + /* These fixups handle some timer updates that HB doesn't yet do + * to work around problems with some adapters or external drawers + * (SW283991) + */ + + /* PCI Hardware Configuration 0 Register */ + rc = xscom_read(p->chip_id, p->pe_xscom + 0x18, &val); + if (rc) { + PHBERR(p, "Can't read CS0 !\n"); + return rc; + } + val = val & 0x0f0fffffffffffffull; + val = val | 0x1010000000000000ull; + rc = xscom_write(p->chip_id, p->pe_xscom + 0x18, val); + if (rc) { + PHBERR(p, "Can't write CS0 !\n"); + return rc; + } + return 0; +} + +static void phb3_init_hw(struct phb3 *p, bool first_init) +{ + uint64_t val; + + PHBDBG(p, "Initializing PHB...\n"); + + /* Fixups for PEC inits */ + if (phb3_fixup_pec_inits(p)) { + PHBERR(p, "Failed to init PEC, PHB appears broken\n"); + goto failed; + } + + /* Lift reset */ + xscom_read(p->chip_id, p->spci_xscom + 1, &val);/* HW275117 */ + xscom_write(p->chip_id, p->pci_xscom + 0xa, 0); + + /* XXX FIXME, turn that into a state machine or a worker thread */ + time_wait_ms(100); + + /* Grab version and fit it in an int */ + val = phb3_read_reg_asb(p, PHB_VERSION); + if (val == 0 || val == 0xffffffffffffffffUL) { + PHBERR(p, "Failed to read version, PHB appears broken\n"); + goto failed; + } + + p->rev = ((val >> 16) & 0x00ff0000) | (val & 0xffff); + PHBDBG(p, "Core revision 0x%x\n", p->rev); + + /* Setup AIB credits etc... */ + phb3_setup_aib(p); + + /* Init_8 - PCIE System Configuration Register + * + * Use default values, clear bit 15 (SYS_EC00_SLOT) to avoid incorrect + * slot power limit message and adjust max speed based on system + * config. Don't hard wire default value as some bits are different + * between implementations. + */ + val = in_be64(p->regs + PHB_PCIE_SYSTEM_CONFIG); + PHBDBG(p, "Default system config: 0x%016llx\n", val); + val = SETFIELD(PHB_PCIE_SCONF_SLOT, val, 0); + val = SETFIELD(PHB_PCIE_SCONF_MAXLINKSPEED, val, p->max_link_speed); + out_be64(p->regs + PHB_PCIE_SYSTEM_CONFIG, val); + PHBDBG(p, "New system config : 0x%016llx\n", + in_be64(p->regs + PHB_PCIE_SYSTEM_CONFIG)); + + /* Init_9..12 - PCIE DLP Lane EQ control */ + if (p->lane_eq) { + out_be64(p->regs + PHB_PCIE_LANE_EQ_CNTL0, + be64_to_cpu(p->lane_eq[0])); + out_be64(p->regs + PHB_PCIE_LANE_EQ_CNTL1, + be64_to_cpu(p->lane_eq[1])); + out_be64(p->regs + PHB_PCIE_LANE_EQ_CNTL2, + be64_to_cpu(p->lane_eq[2])); + out_be64(p->regs + PHB_PCIE_LANE_EQ_CNTL3, + be64_to_cpu(p->lane_eq[3])); + } + + /* Init_XX - (PHB2 errata) + * + * Set proper credits, needs adjustment due to wrong defaults + * on PHB2 before we lift the reset. 
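("PHB2" here means the PHB instance with index 2, matched by the p->index check below.)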
This only applies to Murano + * and Venice + */ + if (p->index == 2 && p->rev < PHB3_REV_NAPLES_DD10) + out_be64(p->regs + PHB_PCIE_SYS_LINK_INIT, 0x9008133332120000UL); + + /* Init_13 - PCIE Reset */ + /* + * Lift the PHB resets but not PERST, this will be lifted + * later by the initial PERST state machine + */ + PHBDBG(p, "PHB_RESET is 0x%016llx\n", in_be64(p->regs + PHB_RESET)); + out_be64(p->regs + PHB_RESET, 0xd000000000000000UL); + + /* Architected IODA2 inits */ + phb3_init_ioda2(p); + + /* Init_37..42 - Clear UTL & DLP error logs */ + out_be64(p->regs + PHB_PCIE_UTL_ERRLOG1, 0xffffffffffffffffUL); + out_be64(p->regs + PHB_PCIE_UTL_ERRLOG2, 0xffffffffffffffffUL); + out_be64(p->regs + PHB_PCIE_UTL_ERRLOG3, 0xffffffffffffffffUL); + out_be64(p->regs + PHB_PCIE_UTL_ERRLOG4, 0xffffffffffffffffUL); + out_be64(p->regs + PHB_PCIE_DLP_ERRLOG1, 0xffffffffffffffffUL); + out_be64(p->regs + PHB_PCIE_DLP_ERRLOG2, 0xffffffffffffffffUL); + + /* Init_43 - Wait for UTL core to come out of reset */ + if (!phb3_wait_dlp_reset(p)) + goto failed; + + /* Init_44 - Clear port status */ + out_be64(p->regs + UTL_PCIE_PORT_STATUS, 0xffffffffffffffffUL); + + /* Init_45..76: Init root complex config space */ + if (!phb3_init_rc_cfg(p)) + goto failed; + + /* Init_77..86 : Init UTL */ + phb3_init_utl(p); + + /* + * Init_87: PHB Control register. Various PHB settings + * Enable IVC for Murano DD2.0 or later one + */ +#ifdef IVT_TABLE_IVE_16B + val = 0xf3a80e4b00000000UL; +#else + val = 0xf3a80ecb00000000UL; +#endif + if (p->rev >= PHB3_REV_MURANO_DD20) + val |= 0x0000010000000000UL; + if (first_init && p->rev >= PHB3_REV_NAPLES_DD10) { + /* Enable 32-bit bypass support on Naples and tell the OS + * about it + */ + val |= 0x0010000000000000UL; + dt_add_property(p->phb.dt_node, + "ibm,32-bit-bypass-supported", NULL, 0); + } + out_be64(p->regs + PHB_CONTROL, val); + + /* Init_88..128 : Setup error registers */ + phb3_init_errors(p); + + /* Init_129: Read error summary */ + val = in_be64(p->regs + PHB_ETU_ERR_SUMMARY); + if (val) { + PHBERR(p, "Errors detected during PHB init: 0x%16llx\n", val); + goto failed; + } + + /* NOTE: At this point the spec waits for the link to come up. We + * don't bother as we are doing a PERST soon. + */ + + /* XXX I don't know why the spec does this now and not earlier, so + * to be sure to get it right we might want to move it to the freset + * state machine, though the generic PCI layer will probably do + * this anyway (ie, enable MEM, etc... in the RC) + * + * Note:The spec enables IO but PHB3 doesn't do IO space .... so we + * leave that clear. + */ + phb3_pcicfg_write16(&p->phb, 0, PCI_CFG_CMD, + PCI_CFG_CMD_MEM_EN | + PCI_CFG_CMD_BUS_MASTER_EN | + PCI_CFG_CMD_PERR_RESP | + PCI_CFG_CMD_SERR_EN); + + /* Clear errors */ + phb3_pcicfg_write16(&p->phb, 0, PCI_CFG_STAT, + PCI_CFG_STAT_SENT_TABORT | + PCI_CFG_STAT_RECV_TABORT | + PCI_CFG_STAT_RECV_MABORT | + PCI_CFG_STAT_SENT_SERR | + PCI_CFG_STAT_RECV_PERR); + + /* Init_136 - Re-enable error interrupts */ + + /* TBD: Should we mask any of these for PERST ? 
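The writes below restore the ERR/OUT/INA/INB interrupt enables and the LEM error mask that phb3_init_errors() masked off earlier in init.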
*/ + out_be64(p->regs + PHB_ERR_IRQ_ENABLE, 0x0000002280b80000UL); + out_be64(p->regs + PHB_OUT_ERR_IRQ_ENABLE, 0x600c42fc042080f0UL); + out_be64(p->regs + PHB_INA_ERR_IRQ_ENABLE, 0xc000a3a901826020UL); + out_be64(p->regs + PHB_INB_ERR_IRQ_ENABLE, 0x0000600000800070UL); + out_be64(p->regs + PHB_LEM_ERROR_MASK, 0x42498e367f502eaeUL); + + /* + * Init_141 - Enable DMA address speculation + * + * Errata#20131017: Disable speculation until Murano DD2.0 + * + * Note: We keep IVT speculation disabled (bit 4). It should work with + * Murano DD2.0 and later but lacks sufficient testing. We will re-enable + * it once that has been done. + */ + if (p->rev >= PHB3_REV_MURANO_DD20) + out_be64(p->regs + PHB_TCE_SPEC_CTL, 0xf000000000000000UL); + else + out_be64(p->regs + PHB_TCE_SPEC_CTL, 0x0ul); + + /* Errata#20131017: avoid TCE queue overflow */ + if (p->rev == PHB3_REV_MURANO_DD20) + phb3_write_reg_asb(p, PHB_TCE_WATERMARK, 0x0003000000030302UL); + + /* Init_142 - PHB3 - Timeout Control Register 1 + * SW283991: Increase timeouts + */ + out_be64(p->regs + PHB_TIMEOUT_CTRL1, 0x1715152016200000UL); + + /* Init_143 - PHB3 - Timeout Control Register 2 */ + out_be64(p->regs + PHB_TIMEOUT_CTRL2, 0x2320d71600000000UL); + + /* Mark the PHB as functional which enables all the various sequences */ + p->broken = false; + + PHBDBG(p, "Initialization complete\n"); + + return; + + failed: + PHBERR(p, "Initialization failed\n"); + p->broken = true; +} + +static void phb3_allocate_tables(struct phb3 *p) +{ + uint16_t *rte; + uint32_t i; + + /* XXX Our current memalign implementation sucks, + * + * It will do the job, however it doesn't support freeing + * the memory and wastes space by always allocating twice + * as much as requested (size + alignment) + */ + p->tbl_rtt = (uint64_t)local_alloc(p->chip_id, RTT_TABLE_SIZE, RTT_TABLE_SIZE); + assert(p->tbl_rtt); + rte = (uint16_t *)(p->tbl_rtt); + for (i = 0; i < RTT_TABLE_ENTRIES; i++, rte++) + *rte = PHB3_RESERVED_PE_NUM; + + p->tbl_peltv = (uint64_t)local_alloc(p->chip_id, PELTV_TABLE_SIZE, PELTV_TABLE_SIZE); + assert(p->tbl_peltv); + memset((void *)p->tbl_peltv, 0, PELTV_TABLE_SIZE); + + p->tbl_pest = (uint64_t)local_alloc(p->chip_id, PEST_TABLE_SIZE, PEST_TABLE_SIZE); + assert(p->tbl_pest); + memset((void *)p->tbl_pest, 0, PEST_TABLE_SIZE); + + p->tbl_ivt = (uint64_t)local_alloc(p->chip_id, IVT_TABLE_SIZE, IVT_TABLE_SIZE); + assert(p->tbl_ivt); + memset((void *)p->tbl_ivt, 0, IVT_TABLE_SIZE); + + p->tbl_rba = (uint64_t)local_alloc(p->chip_id, RBA_TABLE_SIZE, RBA_TABLE_SIZE); + assert(p->tbl_rba); + memset((void *)p->tbl_rba, 0, RBA_TABLE_SIZE); +} + +static void phb3_add_properties(struct phb3 *p) +{ + struct dt_node *np = p->phb.dt_node; + uint32_t lsibase, icsp = get_ics_phandle(); + uint64_t m32b, m64b, m64s, reg, tkill; + + reg = cleanup_addr((uint64_t)p->regs); + + /* Add various properties that HB doesn't have to + * add, some of them simply because they result from + * policy decisions made in skiboot rather than in HB + * such as the MMIO windows going to PCI, interrupts, + * etc... + */ + dt_add_property_cells(np, "#address-cells", 3); + dt_add_property_cells(np, "#size-cells", 2); + dt_add_property_cells(np, "#interrupt-cells", 1); + dt_add_property_cells(np, "bus-range", 0, 0xff); + dt_add_property_cells(np, "clock-frequency", 0x200, 0); /* ??? */ + + dt_add_property_cells(np, "interrupt-parent", icsp); + + /* XXX FIXME: add slot-name */ + //dt_property_cell("bus-width", 8); /* Figure it out from VPD ? 
*/ + + /* "ranges", we only expose M32 (PHB3 doesn't do IO) + * + * Note: The kernel expects us to have chopped of 64k from the + * M32 size (for the 32-bit MSIs). If we don't do that, it will + * get confused (OPAL does it) + */ + m32b = cleanup_addr(p->mm1_base); + m64b = cleanup_addr(p->mm0_base); + m64s = p->mm0_size; + dt_add_property_cells(np, "ranges", + /* M32 space */ + 0x02000000, 0x00000000, M32_PCI_START, + hi32(m32b), lo32(m32b), 0, M32_PCI_SIZE - 0x10000); + + /* XXX FIXME: add opal-memwin32, dmawins, etc... */ + dt_add_property_u64s(np, "ibm,opal-m64-window", m64b, m64b, m64s); + dt_add_property(np, "ibm,opal-single-pe", NULL, 0); + //dt_add_property_cells(np, "ibm,opal-msi-ports", 2048); + dt_add_property_cells(np, "ibm,opal-num-pes", 256); + dt_add_property_cells(np, "ibm,opal-reserved-pe", + PHB3_RESERVED_PE_NUM); + dt_add_property_cells(np, "ibm,opal-msi-ranges", + p->base_msi, PHB3_MSI_IRQ_COUNT); + tkill = reg + PHB_TCE_KILL; + dt_add_property_cells(np, "ibm,opal-tce-kill", + hi32(tkill), lo32(tkill)); + dt_add_property_cells(np, "ibm,supported-tce-sizes", + 12, // 4K + 16, // 64K + 24, // 16M + 28); // 256M + + /* + * Indicate to Linux that the architected IODA2 MSI EOI method + * is supported + */ + dt_add_property_string(np, "ibm,msi-eoi-method", "ioda2"); + + /* Indicate to Linux that CAPP timebase sync is supported */ + dt_add_property_string(np, "ibm,capp-timebase-sync", NULL); + + /* The interrupt maps will be generated in the RC node by the + * PCI code based on the content of this structure: + */ + lsibase = p->base_lsi; + p->phb.lstate.int_size = 2; + p->phb.lstate.int_val[0][0] = lsibase + PHB3_LSI_PCIE_INTA; + p->phb.lstate.int_val[0][1] = 1; + p->phb.lstate.int_val[1][0] = lsibase + PHB3_LSI_PCIE_INTB; + p->phb.lstate.int_val[1][1] = 1; + p->phb.lstate.int_val[2][0] = lsibase + PHB3_LSI_PCIE_INTC; + p->phb.lstate.int_val[2][1] = 1; + p->phb.lstate.int_val[3][0] = lsibase + PHB3_LSI_PCIE_INTD; + p->phb.lstate.int_val[3][1] = 1; + p->phb.lstate.int_parent[0] = icsp; + p->phb.lstate.int_parent[1] = icsp; + p->phb.lstate.int_parent[2] = icsp; + p->phb.lstate.int_parent[3] = icsp; + + /* Indicators for variable tables */ + dt_add_property_cells(np, "ibm,opal-rtt-table", + hi32(p->tbl_rtt), lo32(p->tbl_rtt), RTT_TABLE_SIZE); + dt_add_property_cells(np, "ibm,opal-peltv-table", + hi32(p->tbl_peltv), lo32(p->tbl_peltv), PELTV_TABLE_SIZE); + dt_add_property_cells(np, "ibm,opal-pest-table", + hi32(p->tbl_pest), lo32(p->tbl_pest), PEST_TABLE_SIZE); + dt_add_property_cells(np, "ibm,opal-ivt-table", + hi32(p->tbl_ivt), lo32(p->tbl_ivt), IVT_TABLE_SIZE); + dt_add_property_cells(np, "ibm,opal-ive-stride", + IVT_TABLE_STRIDE); + dt_add_property_cells(np, "ibm,opal-rba-table", + hi32(p->tbl_rba), lo32(p->tbl_rba), RBA_TABLE_SIZE); + + dt_add_property_cells(np, "ibm,phb-diag-data-size", + sizeof(struct OpalIoPhb3ErrorData)); +} + +static bool phb3_calculate_windows(struct phb3 *p) +{ + const struct dt_property *prop; + + /* Get PBCQ MMIO windows from device-tree */ + prop = dt_require_property(p->phb.dt_node, + "ibm,mmio-window", -1); + assert(prop->len >= (2 * sizeof(uint64_t))); + + p->mm0_base = ((const uint64_t *)prop->prop)[0]; + p->mm0_size = ((const uint64_t *)prop->prop)[1]; + if (prop->len > 16) { + p->mm1_base = ((const uint64_t *)prop->prop)[2]; + p->mm1_size = ((const uint64_t *)prop->prop)[3]; + } + + /* Sort them so that 0 is big and 1 is small */ + if (p->mm1_size && p->mm1_size > p->mm0_size) { + uint64_t b = p->mm0_base; + uint64_t s = p->mm0_size; + 
p->mm0_base = p->mm1_base; + p->mm0_size = p->mm1_size; + p->mm1_base = b; + p->mm1_size = s; + } + + /* If 1 is too small, ditch it */ + if (p->mm1_size < M32_PCI_SIZE) + p->mm1_size = 0; + + /* If 1 doesn't exist, carve it out of 0 */ + if (p->mm1_size == 0) { + p->mm0_size /= 2; + p->mm1_base = p->mm0_base + p->mm0_size; + p->mm1_size = p->mm0_size; + } + + /* Crop mm1 to our desired size */ + if (p->mm1_size > M32_PCI_SIZE) + p->mm1_size = M32_PCI_SIZE; + + return true; +} + +/* + * Trigger a creset to disable CAPI mode on kernel shutdown. + * + * This helper is called repeatedly by the host sync notifier mechanism, which + * relies on the kernel to regularly poll the OPAL_SYNC_HOST_REBOOT call as it + * shuts down. + * + * This is a somewhat hacky abuse of the host sync notifier mechanism, but the + * alternatives require a new API call which won't work for older kernels. + */ +static bool phb3_host_sync_reset(void *data) +{ + struct phb3 *p = (struct phb3 *)data; + struct pci_slot *slot = p->phb.slot; + struct proc_chip *chip = get_chip(p->chip_id); + int64_t rc; + + switch (slot->state) { + case PHB3_SLOT_NORMAL: + lock(&capi_lock); + rc = (chip->capp_phb3_attached_mask & (1 << p->index)) ? + OPAL_PHB_CAPI_MODE_CAPI : + OPAL_PHB_CAPI_MODE_PCIE; + unlock(&capi_lock); + + if (rc == OPAL_PHB_CAPI_MODE_PCIE) + return true; + + PHBINF(p, "PHB in CAPI mode, resetting\n"); + p->flags &= ~PHB3_CAPP_RECOVERY; + phb3_creset(slot); + return false; + default: + rc = slot->ops.run_sm(slot); + return rc <= OPAL_SUCCESS; + } +} + +static void phb3_create(struct dt_node *np) +{ + const struct dt_property *prop; + struct phb3 *p = zalloc(sizeof(struct phb3)); + struct pci_slot *slot; + size_t lane_eq_len; + struct dt_node *iplp; + struct proc_chip *chip; + int opal_id; + char *path; + + assert(p); + + /* Populate base stuff */ + p->index = dt_prop_get_u32(np, "ibm,phb-index"); + p->chip_id = dt_prop_get_u32(np, "ibm,chip-id"); + p->regs = (void *)dt_get_address(np, 0, NULL); + p->base_msi = PHB3_MSI_IRQ_BASE(p->chip_id, p->index); + p->base_lsi = PHB3_LSI_IRQ_BASE(p->chip_id, p->index); + p->phb.dt_node = np; + p->phb.ops = &phb3_ops; + p->phb.phb_type = phb_type_pcie_v3; + p->phb.scan_map = 0x1; /* Only device 0 to scan */ + + if (!phb3_calculate_windows(p)) + return; + + /* Get the various XSCOM register bases from the device-tree */ + prop = dt_require_property(np, "ibm,xscom-bases", 3 * sizeof(uint32_t)); + p->pe_xscom = ((const uint32_t *)prop->prop)[0]; + p->spci_xscom = ((const uint32_t *)prop->prop)[1]; + p->pci_xscom = ((const uint32_t *)prop->prop)[2]; + + /* + * We skip the initial PERST assertion requested by the generic code + * when doing a cold boot because we are coming out of cold boot already + * so we save boot time that way. The PERST state machine will still + * handle waiting for the link to come up, it will just avoid actually + * asserting & deasserting the PERST output + * + * For a hot IPL, we still do a PERST + * + * Note: In absence of property (ie, FSP-less), we stick to the old + * behaviour and set skip_perst to true + */ + p->skip_perst = true; /* Default */ + + iplp = dt_find_by_path(dt_root, "ipl-params/ipl-params"); + if (iplp) { + const char *ipl_type = dt_prop_get_def(iplp, "cec-major-type", NULL); + if (ipl_type && (!strcmp(ipl_type, "hot"))) + p->skip_perst = false; + } + + /* By default link is assumed down */ + p->has_link = false; + + /* We register the PHB before we initialize it so we + * get a useful OPAL ID for it. 
We use a different numbering here + * between Naples and Venice/Murano in order to leave room for the + * NPU on Naples. + */ + chip = next_chip(NULL); /* Just need any chip */ + if (chip && chip->type == PROC_CHIP_P8_NAPLES) + opal_id = p->chip_id * 8 + p->index; + else + opal_id = p->chip_id * 4 + p->index; + pci_register_phb(&p->phb, opal_id); + slot = phb3_slot_create(&p->phb); + if (!slot) + PHBERR(p, "Cannot create PHB slot\n"); + + /* Hello ! */ + path = dt_get_path(np); + PHBINF(p, "Found %s @[%d:%d]\n", path, p->chip_id, p->index); + PHBINF(p, " M32 [0x%016llx..0x%016llx]\n", + p->mm1_base, p->mm1_base + p->mm1_size - 1); + PHBINF(p, " M64 [0x%016llx..0x%016llx]\n", + p->mm0_base, p->mm0_base + p->mm0_size - 1); + free(path); + + /* Find base location code from root node */ + p->phb.base_loc_code = dt_prop_get_def(dt_root, + "ibm,io-base-loc-code", NULL); + if (!p->phb.base_loc_code) + PHBDBG(p, "Base location code not found !\n"); + + /* Priority order: NVRAM -> dt -> GEN3 */ + p->max_link_speed = 3; + if (dt_has_node_property(np, "ibm,max-link-speed", NULL)) + p->max_link_speed = dt_prop_get_u32(np, "ibm,max-link-speed"); + if (pcie_max_link_speed) + p->max_link_speed = pcie_max_link_speed; + if (p->max_link_speed > 3) /* clamp to 3 */ + p->max_link_speed = 3; + PHBINF(p, "Max link speed: GEN%i\n", p->max_link_speed); + + /* Check for lane equalization values from HB or HDAT */ + p->lane_eq = dt_prop_get_def_size(np, "ibm,lane-eq", NULL, &lane_eq_len); + if (p->lane_eq && lane_eq_len != (8 * 4)) { + PHBERR(p, "Device-tree has ibm,lane-eq with wrong len %ld\n", + lane_eq_len); + p->lane_eq = NULL; + } + if (p->lane_eq) { + PHBDBG(p, "Override lane equalization settings:\n"); + PHBDBG(p, " 0x%016llx 0x%016llx\n", + be64_to_cpu(p->lane_eq[0]), be64_to_cpu(p->lane_eq[1])); + PHBDBG(p, " 0x%016llx 0x%016llx\n", + be64_to_cpu(p->lane_eq[2]), be64_to_cpu(p->lane_eq[3])); + } + + /* + * Grab CEC IO VPD load info from the root of the device-tree, + * on P8 there's a single such VPD for the whole machine + */ + prop = dt_find_property(dt_root, "ibm,io-vpd"); + if (!prop) { + /* LX VPD Lid not already loaded */ + if (platform.vpd_iohub_load) + platform.vpd_iohub_load(dt_root); + } + + /* Allocate the SkiBoot internal in-memory tables for the PHB */ + phb3_allocate_tables(p); + + phb3_add_properties(p); + + /* Clear IODA2 cache */ + phb3_init_ioda_cache(p); + + /* Register interrupt sources */ + register_irq_source(&phb3_msi_irq_ops, p, p->base_msi, + PHB3_MSI_IRQ_COUNT); + register_irq_source(&phb3_lsi_irq_ops, p, p->base_lsi, 8); + + /* Get the HW up and running */ + phb3_init_hw(p, true); + + /* Load capp microcode into capp unit */ + load_capp_ucode(p); + + opal_add_host_sync_notifier(phb3_host_sync_reset, p); + + /* Platform additional setup */ + if (platform.pci_setup_phb) + platform.pci_setup_phb(&p->phb, p->index); +} + +static void phb3_probe_pbcq(struct dt_node *pbcq) +{ + uint32_t spci_xscom, pci_xscom, pe_xscom, gcid, pno; + uint64_t val, phb_bar, bar_en; + uint64_t mmio0_bar, mmio0_bmask, mmio0_sz; + uint64_t mmio1_bar, mmio1_bmask, mmio1_sz; + uint64_t reg[2]; + uint64_t mmio_win[4]; + unsigned int mmio_win_sz; + struct dt_node *np; + char *path; + uint64_t capp_ucode_base; + unsigned int max_link_speed; + + gcid = dt_get_chip_id(pbcq); + pno = dt_prop_get_u32(pbcq, "ibm,phb-index"); + path = dt_get_path(pbcq); + prlog(PR_NOTICE, "Chip %d Found PBCQ%d at %s\n", gcid, pno, path); + free(path); + + pe_xscom = dt_get_address(pbcq, 0, NULL); + pci_xscom = dt_get_address(pbcq, 1, 
NULL); + spci_xscom = dt_get_address(pbcq, 2, NULL); + prlog(PR_DEBUG, "PHB3[%x:%x]: X[PE]=0x%08x X[PCI]=0x%08x" + " X[SPCI]=0x%08x\n", + gcid, pno, pe_xscom, pci_xscom, spci_xscom); + + /* Check if CAPP mode */ + if (xscom_read(gcid, spci_xscom + 0x03, &val)) { + prerror("PHB3[%x:%x]: Cannot read AIB CAPP ENABLE\n", + gcid, pno); + return; + } + if (val >> 63) { + prerror("PHB3[%x:%x]: Ignoring bridge in CAPP mode\n", + gcid, pno); + return; + } + + /* Get PE BARs, assume only 0 and 2 are used for now */ + xscom_read(gcid, pe_xscom + 0x42, &phb_bar); + phb_bar >>= 14; + prlog(PR_DEBUG, "PHB3[%x:%x] REGS = 0x%016llx [4k]\n", + gcid, pno, phb_bar); + if (phb_bar == 0) { + prerror("PHB3[%x:%x]: No PHB BAR set !\n", gcid, pno); + return; + } + + /* Dbl check PHB BAR */ + xscom_read(gcid, spci_xscom + 1, &val);/* HW275117 */ + xscom_read(gcid, pci_xscom + 0x0b, &val); + val >>= 14; + prlog(PR_DEBUG, "PHB3[%x:%x] PCIBAR = 0x%016llx\n", gcid, pno, val); + if (phb_bar != val) { + prerror("PHB3[%x:%x] PCIBAR invalid, fixing up...\n", + gcid, pno); + xscom_read(gcid, spci_xscom + 1, &val);/* HW275117 */ + xscom_write(gcid, pci_xscom + 0x0b, phb_bar << 14); + } + + /* Check MMIO BARs */ + xscom_read(gcid, pe_xscom + 0x40, &mmio0_bar); + xscom_read(gcid, pe_xscom + 0x43, &mmio0_bmask); + mmio0_bmask &= 0xffffffffc0000000ull; + mmio0_sz = ((~mmio0_bmask) >> 14) + 1; + mmio0_bar >>= 14; + prlog(PR_DEBUG, "PHB3[%x:%x] MMIO0 = 0x%016llx [0x%016llx]\n", + gcid, pno, mmio0_bar, mmio0_sz); + xscom_read(gcid, pe_xscom + 0x41, &mmio1_bar); + xscom_read(gcid, pe_xscom + 0x44, &mmio1_bmask); + mmio1_bmask &= 0xffffffffc0000000ull; + mmio1_sz = ((~mmio1_bmask) >> 14) + 1; + mmio1_bar >>= 14; + prlog(PR_DEBUG, "PHB3[%x:%x] MMIO1 = 0x%016llx [0x%016llx]\n", + gcid, pno, mmio1_bar, mmio1_sz); + + /* Check BAR enable + * + * XXX BAR aren't always enabled by HB, we'll make assumptions + * that BARs are valid if they value is non-0 + */ + xscom_read(gcid, pe_xscom + 0x45, &bar_en); + prlog(PR_DEBUG, "PHB3[%x:%x] BAREN = 0x%016llx\n", + gcid, pno, bar_en); + + /* Always enable PHB BAR */ + bar_en |= 0x2000000000000000ull; + + /* Build MMIO windows list */ + mmio_win_sz = 0; + if (mmio0_bar) { + mmio_win[mmio_win_sz++] = mmio0_bar; + mmio_win[mmio_win_sz++] = mmio0_sz; + bar_en |= 0x8000000000000000ul; + } + if (mmio1_bar) { + mmio_win[mmio_win_sz++] = mmio1_bar; + mmio_win[mmio_win_sz++] = mmio1_sz; + bar_en |= 0x4000000000000000ul; + } + + /* No MMIO windows ? Barf ! 
*/ + if (mmio_win_sz == 0) { + prerror("PHB3[%x:%x]: No MMIO windows enabled !\n", + gcid, pno); + return; + } + + /* Set the interrupt routing stuff, 8 relevant bits in mask + * (11 bits per PHB) + */ + val = p8_chip_irq_phb_base(gcid, pno); + val = (val << 45); + xscom_write(gcid, pe_xscom + 0x1a, val); + xscom_write(gcid, pe_xscom + 0x1b, 0xff00000000000000ul); + + /* Configure LSI location to the top of the map */ + xscom_write(gcid, pe_xscom + 0x1f, 0xff00000000000000ul); + + /* Now add IRSN message bits to BAR enable and write it */ + bar_en |= 0x1800000000000000ul; + xscom_write(gcid, pe_xscom + 0x45, bar_en); + + prlog(PR_DEBUG, "PHB3[%x:%x] NEWBAREN = 0x%016llx\n", + gcid, pno, bar_en); + + xscom_read(gcid, pe_xscom + 0x1a, &val); + prlog(PR_DEBUG, "PHB3[%x:%x] IRSNC = 0x%016llx\n", + gcid, pno, val); + xscom_read(gcid, pe_xscom + 0x1b, &val); + prlog(PR_DEBUG, "PHB3[%x:%x] IRSNM = 0x%016llx\n", + gcid, pno, val); + prlog(PR_DEBUG, "PHB3[%x:%x] LSI = 0x%016llx\n", + gcid, pno, val); + + /* Create PHB node */ + reg[0] = phb_bar; + reg[1] = 0x1000; + + np = dt_new_addr(dt_root, "pciex", reg[0]); + if (!np) + return; + + dt_add_property_strings(np, "compatible", "ibm,power8-pciex", + "ibm,ioda2-phb"); + dt_add_property_strings(np, "device_type", "pciex"); + dt_add_property(np, "reg", reg, sizeof(reg)); + + /* Everything else is handled later by skiboot, we just + * stick a few hints here + */ + dt_add_property_cells(np, "ibm,xscom-bases", + pe_xscom, spci_xscom, pci_xscom); + dt_add_property(np, "ibm,mmio-window", mmio_win, 8 * mmio_win_sz); + dt_add_property_cells(np, "ibm,phb-index", pno); + dt_add_property_cells(np, "ibm,pbcq", pbcq->phandle); + dt_add_property_cells(np, "ibm,chip-id", gcid); + if (dt_has_node_property(pbcq, "ibm,use-ab-detect", NULL)) + dt_add_property(np, "ibm,use-ab-detect", NULL, 0); + if (dt_has_node_property(pbcq, "ibm,hub-id", NULL)) + dt_add_property_cells(np, "ibm,hub-id", + dt_prop_get_u32(pbcq, "ibm,hub-id")); + if (dt_has_node_property(pbcq, "ibm,loc-code", NULL)) { + const char *lc = dt_prop_get(pbcq, "ibm,loc-code"); + dt_add_property_string(np, "ibm,loc-code", lc); + } + if (dt_has_node_property(pbcq, "ibm,lane-eq", NULL)) { + size_t leq_size; + const void *leq = dt_prop_get_def_size(pbcq, "ibm,lane-eq", + NULL, &leq_size); + if (leq != NULL && leq_size == 4 * 8) + dt_add_property(np, "ibm,lane-eq", leq, leq_size); + } + if (dt_has_node_property(pbcq, "ibm,capp-ucode", NULL)) { + capp_ucode_base = dt_prop_get_u32(pbcq, "ibm,capp-ucode"); + dt_add_property_cells(np, "ibm,capp-ucode", capp_ucode_base); + } + if (dt_has_node_property(pbcq, "ibm,max-link-speed", NULL)) { + max_link_speed = dt_prop_get_u32(pbcq, "ibm,max-link-speed"); + dt_add_property_cells(np, "ibm,max-link-speed", max_link_speed); + } + dt_add_property_cells(np, "ibm,capi-flags", + OPAL_PHB_CAPI_FLAG_SNOOP_CONTROL); + + add_chip_dev_associativity(np); +} + + +void probe_phb3(void) +{ + struct dt_node *np; + + /* Look for PBCQ XSCOM nodes */ + dt_for_each_compatible(dt_root, np, "ibm,power8-pbcq") + phb3_probe_pbcq(np); + + /* Look for newly created PHB nodes */ + dt_for_each_compatible(dt_root, np, "ibm,power8-pciex") + phb3_create(np); +} + + diff --git a/roms/skiboot/hw/phb4.c b/roms/skiboot/hw/phb4.c new file mode 100644 index 000000000..79083d4a1 --- /dev/null +++ b/roms/skiboot/hw/phb4.c @@ -0,0 +1,6400 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* + * PHB4: PCI Host Bridge 4, in POWER9 + * + * Copyright 2013-2019 IBM Corp. 
+ * Copyright 2018 Raptor Engineering, LLC + */ + +/* + * + * FIXME: + * More stuff for EEH support: + * - PBCQ error reporting interrupt + * - I2C-based power management (replacing SHPC) + * - Directly detect fenced PHB through one dedicated HW reg + */ + +/* + * This is a simplified view of the PHB4 reset and link training steps + * + * Step 1: + * - Check for hotplug status: + * o PHB_PCIE_HOTPLUG_STATUS bit PHB_PCIE_HPSTAT_PRESENCE + * o If not set -> Bail out (Slot is empty) + * + * Step 2: + * - Do complete PHB reset: + * o PHB/ETU reset procedure + * + * Step 3: + * - Drive PERST active (skip if already asserted. ie. after cold reboot) + * - Wait 250ms (for cards to reset) + * o powervm have used 250ms for a long time without any problems + * + * Step 4: + * - Drive PERST inactive + * + * Step 5: + * - Look for inband presence: + * o From PERST we have two stages to get inband presence detected + * 1) Devices must enter Detect state within 20 ms of the end of + * Fundamental Reset + * 2) Receiver detect pulse are every 12ms + * - Hence minimum wait time 20 + 12 = 32ms + * o Unfortunatey, we've seen cards take 440ms + * o Hence we are conservative and poll here for 1000ms (> 440ms) + * - If no inband presence after 100ms -> Bail out (Slot is broken) + * o PHB_PCIE_DLP_TRAIN_CTL bit PHB_PCIE_DLP_INBAND_PRESENCE + * + * Step 6: + * - Look for link training done: + * o PHB_PCIE_DLP_TRAIN_CTL bit PHB_PCIE_DLP_TL_LINKACT + * - If not set after 2000ms, Retry (3 times) -> Goto Step 2 + * o phy lockup could link training failure, hence going back to a + * complete PHB reset on retry + * o not expect to happen very often + * + * Step 7: + * - Wait for 1 sec (before touching device config space): + * - From PCIe spec: + * Root Complex and/or system software must allow at least 1.0 s after + * a Conventional Reset of a device, before it may determine that a + * device which fails to return a Successful Completion status for a + * valid Configuration Request is a broken device. + * + * Step 8: + * - Sanity check for fence and link still up: + * o If fenced or link down, Retry (3 times) -> Goto Step 2 + * o This is not nessary but takes no time and can be useful + * o Once we leave here, much harder to recover from errors + * + * Step 9: + * - Check for optimised link for directly attached devices: + * o Wait for CRS (so we can read device config space) + * o Check chip and device are in allowlist. if not, Goto Step 10 + * o If trained link speed is degraded, retry -> Goto Step 2 + * o If trained link width is degraded, retry -> Goto Step 2 + * o If still degraded after 3 retries. Give up, Goto Step 10. + * + * Step 10: + * - PHB good, start probing config space. + * o core/pci.c: pci_reset_phb() -> pci_scan_phb() + */ + + +#undef NO_ASB +#undef LOG_CFG + +#include <skiboot.h> +#include <io.h> +#include <timebase.h> +#include <pci.h> +#include <pci-cfg.h> +#include <pci-slot.h> +#include <vpd.h> +#include <interrupts.h> +#include <opal.h> +#include <cpu.h> +#include <device.h> +#include <ccan/str/str.h> +#include <ccan/array_size/array_size.h> +#include <xscom.h> +#include <affinity.h> +#include <phb4.h> +#include <phb4-regs.h> +#include <phb4-capp.h> +#include <capp.h> +#include <fsp.h> +#include <chip.h> +#include <chiptod.h> +#include <xive.h> +#include <xscom-p9-regs.h> +#include <phys-map.h> +#include <nvram.h> + +/* Enable this to disable error interrupts for debug purposes */ +#undef DISABLE_ERR_INTS + +static void phb4_init_hw(struct phb4 *p); + +#define PHBDBG(p, fmt, a...) 
prlog(PR_DEBUG, "PHB#%04x[%d:%d]: " fmt, \ + (p)->phb.opal_id, (p)->chip_id, \ + (p)->index, ## a) +#define PHBINF(p, fmt, a...) prlog(PR_INFO, "PHB#%04x[%d:%d]: " fmt, \ + (p)->phb.opal_id, (p)->chip_id, \ + (p)->index, ## a) +#define PHBNOTICE(p, fmt, a...) prlog(PR_NOTICE, "PHB#%04x[%d:%d]: " fmt, \ + (p)->phb.opal_id, (p)->chip_id, \ + (p)->index, ## a) +#define PHBERR(p, fmt, a...) prlog(PR_ERR, "PHB#%04x[%d:%d]: " fmt, \ + (p)->phb.opal_id, (p)->chip_id, \ + (p)->index, ## a) +#ifdef LOG_CFG +#define PHBLOGCFG(p, fmt, a...) PHBDBG(p, fmt, ## a) +#else +#define PHBLOGCFG(p, fmt, a...) do {} while (0) +#endif + +static bool pci_eeh_mmio; +static bool pci_retry_all; +static int rx_err_max = PHB4_RX_ERR_MAX; + +static inline bool is_phb4(void) +{ + return (proc_gen == proc_gen_p9); +} + +static inline bool is_phb5(void) +{ + return (proc_gen == proc_gen_p10); +} + +/* PQ offloading on the XIVE IC. */ +static inline bool phb_pq_disable(struct phb4 *p __unused) +{ + if (is_phb5()) + return xive2_cap_phb_pq_disable(); + + return false; +} + +/* + * Use the ESB page of the XIVE IC for event notification. Latency + * improvement. + */ +static inline bool phb_abt_mode(struct phb4 *p __unused) +{ + if (is_phb5()) + return xive2_cap_phb_abt(); + + return false; +} + +static inline bool phb_can_store_eoi(struct phb4 *p) +{ + if (is_phb5()) + /* PQ offloading is required for StoreEOI */ + return XIVE2_STORE_EOI_ENABLED && phb_pq_disable(p); + + return XIVE_STORE_EOI_ENABLED; +} + +/* Note: The "ASB" name is historical, practically this means access via + * the XSCOM backdoor + */ +static inline uint64_t phb4_read_reg_asb(struct phb4 *p, uint32_t offset) +{ +#ifdef NO_ASB + return in_be64(p->regs + offset); +#else + int64_t rc; + uint64_t addr, val; + + /* Address register: must use 4 bytes for built-in config space. + * + * This path isn't usable for outbound configuration space + */ + if (((offset & 0xfffffffc) == PHB_CONFIG_DATA) && (offset & 3)) { + PHBERR(p, "XSCOM unaligned access to CONFIG_DATA unsupported\n"); + return -1ull; + } + addr = XETU_HV_IND_ADDR_VALID | offset; + if ((offset >= 0x1000 && offset < 0x1800) || (offset == PHB_CONFIG_DATA)) + addr |= XETU_HV_IND_ADDR_4B; + rc = xscom_write(p->chip_id, p->etu_xscom + XETU_HV_IND_ADDRESS, addr); + if (rc != 0) { + PHBERR(p, "XSCOM error addressing register 0x%x\n", offset); + return -1ull; + } + rc = xscom_read(p->chip_id, p->etu_xscom + XETU_HV_IND_DATA, &val); + if (rc != 0) { + PHBERR(p, "XSCOM error reading register 0x%x\n", offset); + return -1ull; + } + return val; +#endif +} + +static inline void phb4_write_reg_asb(struct phb4 *p, + uint32_t offset, uint64_t val) +{ +#ifdef NO_ASB + out_be64(p->regs + offset, val); +#else + int64_t rc; + uint64_t addr; + + /* Address register: must use 4 bytes for built-in config space. 
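+ * (The 0x1000..0x17ff window and PHB_CONFIG_DATA are the accesses that + * get XETU_HV_IND_ADDR_4B set in the indirect address register below.)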
+ * + * This path isn't usable for outbound configuration space + */ + if (((offset & 0xfffffffc) == PHB_CONFIG_DATA) && (offset & 3)) { + PHBERR(p, "XSCOM access to CONFIG_DATA unsupported\n"); + return; + } + addr = XETU_HV_IND_ADDR_VALID | offset; + if ((offset >= 0x1000 && offset < 0x1800) || (offset == PHB_CONFIG_DATA)) + addr |= XETU_HV_IND_ADDR_4B; + rc = xscom_write(p->chip_id, p->etu_xscom + XETU_HV_IND_ADDRESS, addr); + if (rc != 0) { + PHBERR(p, "XSCOM error addressing register 0x%x\n", offset); + return; + } + rc = xscom_write(p->chip_id, p->etu_xscom + XETU_HV_IND_DATA, val); + if (rc != 0) { + PHBERR(p, "XSCOM error writing register 0x%x\n", offset); + return; + } +#endif +} + +static uint64_t phb4_read_reg(struct phb4 *p, uint32_t offset) +{ + /* No register accesses are permitted while in reset */ + if (p->flags & PHB4_ETU_IN_RESET) + return -1ull; + + if (p->flags & PHB4_CFG_USE_ASB) + return phb4_read_reg_asb(p, offset); + else + return in_be64(p->regs + offset); +} + +static void phb4_write_reg(struct phb4 *p, uint32_t offset, uint64_t val) +{ + /* No register accesses are permitted while in reset */ + if (p->flags & PHB4_ETU_IN_RESET) + return; + + if (p->flags & PHB4_CFG_USE_ASB) + phb4_write_reg_asb(p, offset, val); + else + return out_be64(p->regs + offset, val); +} + +/* Helper to select an IODA table entry */ +static inline void phb4_ioda_sel(struct phb4 *p, uint32_t table, + uint32_t addr, bool autoinc) +{ + phb4_write_reg(p, PHB_IODA_ADDR, + (autoinc ? PHB_IODA_AD_AUTOINC : 0) | + SETFIELD(PHB_IODA_AD_TSEL, 0ul, table) | + SETFIELD(PHB_IODA_AD_TADR, 0ul, addr)); +} + +/* + * Configuration space access + * + * The PHB lock is assumed to be already held + */ +static int64_t phb4_pcicfg_check(struct phb4 *p, uint32_t bdfn, + uint32_t offset, uint32_t size, + uint16_t *pe) +{ + uint32_t sm = size - 1; + + if (offset > 0xfff || bdfn > 0xffff) + return OPAL_PARAMETER; + if (offset & sm) + return OPAL_PARAMETER; + + /* The root bus only has a device at 0 and we get into an + * error state if we try to probe beyond that, so let's + * avoid that and just return an error to Linux + */ + if (PCI_BUS_NUM(bdfn) == 0 && (bdfn & 0xff)) + return OPAL_HARDWARE; + + /* Check PHB state */ + if (p->broken) + return OPAL_HARDWARE; + + /* Fetch the PE# from cache */ + *pe = be16_to_cpu(p->tbl_rtt[bdfn]); + + return OPAL_SUCCESS; +} + +static int64_t phb4_rc_read(struct phb4 *p, uint32_t offset, uint8_t sz, + void *data, bool use_asb) +{ + uint32_t reg = offset & ~3; + uint32_t oval; + + /* Some registers are handled locally */ + switch (reg) { + /* Bridge base/limit registers are cached here as HW + * doesn't implement them (it hard codes values that + * will confuse a proper PCI implementation). 
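+ * Writes are captured into p->rc_cache by phb4_rc_write() and the reads + * below hand back those cached values, e.g. PCI_CFG_MEM_BASE returns the + * cached word masked with 0xfff0fff0.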
+ */ + case PCI_CFG_MEM_BASE: /* Includes PCI_CFG_MEM_LIMIT */ + oval = p->rc_cache[(reg - 0x20) >> 2] & 0xfff0fff0; + break; + case PCI_CFG_PREF_MEM_BASE: /* Includes PCI_CFG_PREF_MEM_LIMIT */ + oval = p->rc_cache[(reg - 0x20) >> 2] & 0xfff0fff0; + oval |= 0x00010001; + break; + case PCI_CFG_IO_BASE_U16: /* Includes PCI_CFG_IO_LIMIT_U16 */ + oval = 0; + break; + case PCI_CFG_PREF_MEM_BASE_U32: + case PCI_CFG_PREF_MEM_LIMIT_U32: + oval = p->rc_cache[(reg - 0x20) >> 2]; + break; + default: + oval = 0xffffffff; /* default if offset too big */ + if (reg < PHB_RC_CONFIG_SIZE) { + if (use_asb) + oval = bswap_32(phb4_read_reg_asb(p, PHB_RC_CONFIG_BASE + + reg)); + else + oval = in_le32(p->regs + PHB_RC_CONFIG_BASE + reg); + } + } + + /* Apply any post-read fixups */ + switch (reg) { + case PCI_CFG_IO_BASE: + oval |= 0x01f1; /* Set IO base < limit to disable the window */ + break; + } + + switch (sz) { + case 1: + offset &= 3; + *((uint8_t *)data) = (oval >> (offset << 3)) & 0xff; + PHBLOGCFG(p, "000 CFG08 Rd %02x=%02x\n", + offset, *((uint8_t *)data)); + break; + case 2: + offset &= 2; + *((uint16_t *)data) = (oval >> (offset << 3)) & 0xffff; + PHBLOGCFG(p, "000 CFG16 Rd %02x=%04x\n", + offset, *((uint16_t *)data)); + break; + case 4: + *((uint32_t *)data) = oval; + PHBLOGCFG(p, "000 CFG32 Rd %02x=%08x\n", + offset, *((uint32_t *)data)); + break; + default: + assert(false); + } + return OPAL_SUCCESS; +} + +static int64_t phb4_rc_write(struct phb4 *p, uint32_t offset, uint8_t sz, + uint32_t val, bool use_asb) +{ + uint32_t reg = offset & ~3; + uint32_t old, mask, shift, oldold; + int64_t rc; + + if (reg > PHB_RC_CONFIG_SIZE) + return OPAL_SUCCESS; + + /* If size isn't 4-bytes, do a RMW cycle */ + if (sz < 4) { + rc = phb4_rc_read(p, reg, 4, &old, use_asb); + if (rc != OPAL_SUCCESS) + return rc; + + /* + * Since we have to Read-Modify-Write here, we need to filter + * out registers that have write-1-to-clear bits to prevent + * clearing stuff we shouldn't be. So for any register this + * applies to, mask out those bits. + */ + oldold = old; + switch(reg) { + case 0x1C: /* Secondary status */ + old &= 0x00ffffff; /* mask out 24-31 */ + break; + case 0x50: /* EC - Device status */ + old &= 0xfff0ffff; /* mask out 16-19 */ + break; + case 0x58: /* EC - Link status */ + old &= 0x3fffffff; /* mask out 30-31 */ + break; + case 0x78: /* EC - Link status 2 */ + old &= 0xf000ffff; /* mask out 16-27 */ + break; + /* These registers *only* have write-1-to-clear bits */ + case 0x104: /* AER - Uncorr. error status */ + case 0x110: /* AER - Corr. 
error status */ + case 0x130: /* AER - Root error status */ + case 0x180: /* P16 - status */ + case 0x184: /* P16 - LDPM status */ + case 0x188: /* P16 - FRDPM status */ + case 0x18C: /* P16 - SRDPM status */ + old &= 0x00000000; + break; + } + + if (old != oldold) { + PHBLOGCFG(p, "Rewrote %x to %x for reg %x for W1C\n", + oldold, old, reg); + } + + if (sz == 1) { + shift = (offset & 3) << 3; + mask = 0xff << shift; + val = (old & ~mask) | ((val & 0xff) << shift); + } else { + shift = (offset & 2) << 3; + mask = 0xffff << shift; + val = (old & ~mask) | ((val & 0xffff) << shift); + } + } + + /* Some registers are handled locally */ + switch (reg) { + /* See comment in phb4_rc_read() */ + case PCI_CFG_MEM_BASE: /* Includes PCI_CFG_MEM_LIMIT */ + case PCI_CFG_PREF_MEM_BASE: /* Includes PCI_CFG_PREF_MEM_LIMIT */ + case PCI_CFG_PREF_MEM_BASE_U32: + case PCI_CFG_PREF_MEM_LIMIT_U32: + p->rc_cache[(reg - 0x20) >> 2] = val; + break; + case PCI_CFG_IO_BASE_U16: /* Includes PCI_CFG_IO_LIMIT_U16 */ + break; + default: + /* Workaround PHB config space enable */ + PHBLOGCFG(p, "000 CFG%02d Wr %02x=%08x\n", 8 * sz, reg, val); + if (use_asb) + phb4_write_reg_asb(p, PHB_RC_CONFIG_BASE + reg, val); + else + out_le32(p->regs + PHB_RC_CONFIG_BASE + reg, val); + } + return OPAL_SUCCESS; +} + +static int64_t phb4_pcicfg_read(struct phb4 *p, uint32_t bdfn, + uint32_t offset, uint32_t size, + void *data) +{ + uint64_t addr, val64; + int64_t rc; + uint16_t pe; + bool use_asb = false; + + rc = phb4_pcicfg_check(p, bdfn, offset, size, &pe); + if (rc) + return rc; + + if (p->flags & PHB4_AIB_FENCED) { + if (!(p->flags & PHB4_CFG_USE_ASB)) + return OPAL_HARDWARE; + if (bdfn != 0) + return OPAL_HARDWARE; + use_asb = true; + } else if ((p->flags & PHB4_CFG_BLOCKED) && bdfn != 0) { + return OPAL_HARDWARE; + } + + /* Handle per-device filters */ + rc = pci_handle_cfg_filters(&p->phb, bdfn, offset, size, + (uint32_t *)data, false); + if (rc != OPAL_PARTIAL) + return rc; + + /* Handle root complex MMIO based config space */ + if (bdfn == 0) + return phb4_rc_read(p, offset, size, data, use_asb); + + addr = PHB_CA_ENABLE; + addr = SETFIELD(PHB_CA_BDFN, addr, bdfn); + addr = SETFIELD(PHB_CA_REG, addr, offset & ~3u); + addr = SETFIELD(PHB_CA_PE, addr, pe); + if (use_asb) { + phb4_write_reg_asb(p, PHB_CONFIG_ADDRESS, addr); + sync(); + val64 = bswap_64(phb4_read_reg_asb(p, PHB_CONFIG_DATA)); + switch(size) { + case 1: + *((uint8_t *)data) = val64 >> (8 * (offset & 3)); + break; + case 2: + *((uint16_t *)data) = val64 >> (8 * (offset & 2)); + break; + case 4: + *((uint32_t *)data) = val64; + break; + default: + return OPAL_PARAMETER; + } + } else { + out_be64(p->regs + PHB_CONFIG_ADDRESS, addr); + switch(size) { + case 1: + *((uint8_t *)data) = + in_8(p->regs + PHB_CONFIG_DATA + (offset & 3)); + PHBLOGCFG(p, "%03x CFG08 Rd %02x=%02x\n", + bdfn, offset, *((uint8_t *)data)); + break; + case 2: + *((uint16_t *)data) = + in_le16(p->regs + PHB_CONFIG_DATA + (offset & 2)); + PHBLOGCFG(p, "%03x CFG16 Rd %02x=%04x\n", + bdfn, offset, *((uint16_t *)data)); + break; + case 4: + *((uint32_t *)data) = in_le32(p->regs + PHB_CONFIG_DATA); + PHBLOGCFG(p, "%03x CFG32 Rd %02x=%08x\n", + bdfn, offset, *((uint32_t *)data)); + break; + default: + return OPAL_PARAMETER; + } + } + return OPAL_SUCCESS; +} + + +#define PHB4_PCI_CFG_READ(size, type) \ +static int64_t phb4_pcicfg_read##size(struct phb *phb, uint32_t bdfn, \ + uint32_t offset, type *data) \ +{ \ + struct phb4 *p = phb_to_phb4(phb); \ + \ + /* Initialize data in case of error */ \ + *data 
= (type)0xffffffff; \ + return phb4_pcicfg_read(p, bdfn, offset, sizeof(type), data); \ +} + +static int64_t phb4_pcicfg_write(struct phb4 *p, uint32_t bdfn, + uint32_t offset, uint32_t size, + uint32_t data) +{ + uint64_t addr; + int64_t rc; + uint16_t pe; + bool use_asb = false; + + rc = phb4_pcicfg_check(p, bdfn, offset, size, &pe); + if (rc) + return rc; + + if (p->flags & PHB4_AIB_FENCED) { + if (!(p->flags & PHB4_CFG_USE_ASB)) + return OPAL_HARDWARE; + if (bdfn != 0) + return OPAL_HARDWARE; + use_asb = true; + } else if ((p->flags & PHB4_CFG_BLOCKED) && bdfn != 0) { + return OPAL_HARDWARE; + } + + /* Handle per-device filters */ + rc = pci_handle_cfg_filters(&p->phb, bdfn, offset, size, + (uint32_t *)&data, true); + if (rc != OPAL_PARTIAL) + return rc; + + /* Handle root complex MMIO based config space */ + if (bdfn == 0) + return phb4_rc_write(p, offset, size, data, use_asb); + + addr = PHB_CA_ENABLE; + addr = SETFIELD(PHB_CA_BDFN, addr, bdfn); + addr = SETFIELD(PHB_CA_REG, addr, offset & ~3u); + addr = SETFIELD(PHB_CA_PE, addr, pe); + if (use_asb) { + /* We don't support ASB config space writes */ + return OPAL_UNSUPPORTED; + } else { + out_be64(p->regs + PHB_CONFIG_ADDRESS, addr); + switch(size) { + case 1: + out_8(p->regs + PHB_CONFIG_DATA + (offset & 3), data); + break; + case 2: + out_le16(p->regs + PHB_CONFIG_DATA + (offset & 2), data); + break; + case 4: + out_le32(p->regs + PHB_CONFIG_DATA, data); + break; + default: + return OPAL_PARAMETER; + } + } + PHBLOGCFG(p, "%03x CFG%d Wr %02x=%08x\n", bdfn, 8 * size, offset, data); + return OPAL_SUCCESS; +} + +#define PHB4_PCI_CFG_WRITE(size, type) \ +static int64_t phb4_pcicfg_write##size(struct phb *phb, uint32_t bdfn, \ + uint32_t offset, type data) \ +{ \ + struct phb4 *p = phb_to_phb4(phb); \ + \ + return phb4_pcicfg_write(p, bdfn, offset, sizeof(type), data); \ +} + +PHB4_PCI_CFG_READ(8, u8) +PHB4_PCI_CFG_READ(16, u16) +PHB4_PCI_CFG_READ(32, u32) +PHB4_PCI_CFG_WRITE(8, u8) +PHB4_PCI_CFG_WRITE(16, u16) +PHB4_PCI_CFG_WRITE(32, u32) + +static int64_t phb4_get_reserved_pe_number(struct phb *phb) +{ + struct phb4 *p = phb_to_phb4(phb); + + return PHB4_RESERVED_PE_NUM(p); +} + + +static void phb4_root_port_init(struct phb *phb, struct pci_device *dev, + int ecap, int aercap) +{ + struct phb4 *p = phb_to_phb4(phb); + struct pci_slot *slot = dev->slot; + uint16_t bdfn = dev->bdfn; + uint16_t val16; + uint32_t val32; + + /* + * Use the PHB's callback so that UTL events will be masked or + * unmasked when the link is down or up. + */ + if (dev->slot && dev->slot->ops.prepare_link_change && + phb->slot && phb->slot->ops.prepare_link_change) + dev->slot->ops.prepare_link_change = + phb->slot->ops.prepare_link_change; + + // FIXME: check recommended init values for phb4 + + /* + * Enable the bridge slot capability in the root port's config + * space. 
This should probably be done *before* we start + * scanning config space, but we need a pci_device struct to + * exist before we do a slot lookup so *faaaaaaaaaaaaaart* + */ + if (slot && slot->pluggable && slot->power_limit) { + uint64_t val; + + val = in_be64(p->regs + PHB_PCIE_SCR); + val |= PHB_PCIE_SCR_SLOT_CAP; + out_be64(p->regs + PHB_PCIE_SCR, val); + + /* update the cached slotcap */ + pci_cfg_read32(phb, bdfn, ecap + PCICAP_EXP_SLOTCAP, + &slot->slot_cap); + } + + /* Enable SERR and parity checking */ + pci_cfg_read16(phb, bdfn, PCI_CFG_CMD, &val16); + val16 |= (PCI_CFG_CMD_SERR_EN | PCI_CFG_CMD_PERR_RESP | + PCI_CFG_CMD_MEM_EN); + pci_cfg_write16(phb, bdfn, PCI_CFG_CMD, val16); + + /* Enable reporting various errors */ + if (!ecap) return; + pci_cfg_read16(phb, bdfn, ecap + PCICAP_EXP_DEVCTL, &val16); + val16 |= (PCICAP_EXP_DEVCTL_CE_REPORT | + PCICAP_EXP_DEVCTL_NFE_REPORT | + PCICAP_EXP_DEVCTL_FE_REPORT | + PCICAP_EXP_DEVCTL_UR_REPORT); + pci_cfg_write16(phb, bdfn, ecap + PCICAP_EXP_DEVCTL, val16); + + if (!aercap) return; + + /* Mask various unrecoverable errors */ + pci_cfg_read32(phb, bdfn, aercap + PCIECAP_AER_UE_MASK, &val32); + val32 |= (PCIECAP_AER_UE_MASK_POISON_TLP | + PCIECAP_AER_UE_MASK_COMPL_TIMEOUT | + PCIECAP_AER_UE_MASK_COMPL_ABORT | + PCIECAP_AER_UE_MASK_ECRC); + pci_cfg_write32(phb, bdfn, aercap + PCIECAP_AER_UE_MASK, val32); + + /* Report various unrecoverable errors as fatal errors */ + pci_cfg_read32(phb, bdfn, aercap + PCIECAP_AER_UE_SEVERITY, &val32); + val32 |= (PCIECAP_AER_UE_SEVERITY_DLLP | + PCIECAP_AER_UE_SEVERITY_SURPRISE_DOWN | + PCIECAP_AER_UE_SEVERITY_FLOW_CTL_PROT | + PCIECAP_AER_UE_SEVERITY_UNEXP_COMPL | + PCIECAP_AER_UE_SEVERITY_RECV_OVFLOW | + PCIECAP_AER_UE_SEVERITY_MALFORMED_TLP); + pci_cfg_write32(phb, bdfn, aercap + PCIECAP_AER_UE_SEVERITY, val32); + + /* Mask various recoverable errors */ + pci_cfg_read32(phb, bdfn, aercap + PCIECAP_AER_CE_MASK, &val32); + val32 |= PCIECAP_AER_CE_MASK_ADV_NONFATAL; + pci_cfg_write32(phb, bdfn, aercap + PCIECAP_AER_CE_MASK, val32); + + /* Enable ECRC check */ + pci_cfg_read32(phb, bdfn, aercap + PCIECAP_AER_CAPCTL, &val32); + val32 |= (PCIECAP_AER_CAPCTL_ECRCG_EN | + PCIECAP_AER_CAPCTL_ECRCC_EN); + pci_cfg_write32(phb, bdfn, aercap + PCIECAP_AER_CAPCTL, val32); + + /* Enable all error reporting */ + pci_cfg_read32(phb, bdfn, aercap + PCIECAP_AER_RERR_CMD, &val32); + val32 |= (PCIECAP_AER_RERR_CMD_FE | + PCIECAP_AER_RERR_CMD_NFE | + PCIECAP_AER_RERR_CMD_CE); + pci_cfg_write32(phb, bdfn, aercap + PCIECAP_AER_RERR_CMD, val32); +} + +static void phb4_switch_port_init(struct phb *phb, + struct pci_device *dev, + int ecap, int aercap) +{ + uint16_t bdfn = dev->bdfn; + uint16_t val16; + uint32_t val32; + + // FIXME: update AER settings for phb4 + + /* Enable SERR and parity checking and disable INTx */ + pci_cfg_read16(phb, bdfn, PCI_CFG_CMD, &val16); + val16 |= (PCI_CFG_CMD_PERR_RESP | + PCI_CFG_CMD_SERR_EN | + PCI_CFG_CMD_INTx_DIS); + pci_cfg_write16(phb, bdfn, PCI_CFG_CMD, val16); + + /* Disable partity error and enable system error */ + pci_cfg_read16(phb, bdfn, PCI_CFG_BRCTL, &val16); + val16 &= ~PCI_CFG_BRCTL_PERR_RESP_EN; + val16 |= PCI_CFG_BRCTL_SERR_EN; + pci_cfg_write16(phb, bdfn, PCI_CFG_BRCTL, val16); + + /* Enable reporting various errors */ + if (!ecap) return; + pci_cfg_read16(phb, bdfn, ecap + PCICAP_EXP_DEVCTL, &val16); + val16 |= (PCICAP_EXP_DEVCTL_CE_REPORT | + PCICAP_EXP_DEVCTL_NFE_REPORT | + PCICAP_EXP_DEVCTL_FE_REPORT); + /* HW279570 - Disable reporting of correctable errors */ + val16 &= 
~PCICAP_EXP_DEVCTL_CE_REPORT; + pci_cfg_write16(phb, bdfn, ecap + PCICAP_EXP_DEVCTL, val16); + + /* Unmask all unrecoverable errors */ + if (!aercap) return; + pci_cfg_write32(phb, bdfn, aercap + PCIECAP_AER_UE_MASK, 0x0); + + /* Severity of unrecoverable errors */ + if (dev->dev_type == PCIE_TYPE_SWITCH_UPPORT) + val32 = (PCIECAP_AER_UE_SEVERITY_DLLP | + PCIECAP_AER_UE_SEVERITY_SURPRISE_DOWN | + PCIECAP_AER_UE_SEVERITY_FLOW_CTL_PROT | + PCIECAP_AER_UE_SEVERITY_RECV_OVFLOW | + PCIECAP_AER_UE_SEVERITY_MALFORMED_TLP | + PCIECAP_AER_UE_SEVERITY_INTERNAL); + else + val32 = (PCIECAP_AER_UE_SEVERITY_FLOW_CTL_PROT | + PCIECAP_AER_UE_SEVERITY_INTERNAL); + pci_cfg_write32(phb, bdfn, aercap + PCIECAP_AER_UE_SEVERITY, val32); + + /* + * Mask various correctable errors + */ + val32 = PCIECAP_AER_CE_MASK_ADV_NONFATAL; + pci_cfg_write32(phb, bdfn, aercap + PCIECAP_AER_CE_MASK, val32); + + /* Enable ECRC generation and disable ECRC check */ + pci_cfg_read32(phb, bdfn, aercap + PCIECAP_AER_CAPCTL, &val32); + val32 |= PCIECAP_AER_CAPCTL_ECRCG_EN; + val32 &= ~PCIECAP_AER_CAPCTL_ECRCC_EN; + pci_cfg_write32(phb, bdfn, aercap + PCIECAP_AER_CAPCTL, val32); +} + +static void phb4_endpoint_init(struct phb *phb, + struct pci_device *dev, + int ecap, int aercap) +{ + uint16_t bdfn = dev->bdfn; + uint16_t val16; + uint32_t val32; + + /* Enable SERR and parity checking */ + pci_cfg_read16(phb, bdfn, PCI_CFG_CMD, &val16); + val16 |= (PCI_CFG_CMD_PERR_RESP | + PCI_CFG_CMD_SERR_EN); + pci_cfg_write16(phb, bdfn, PCI_CFG_CMD, val16); + + /* Enable reporting various errors */ + if (!ecap) return; + pci_cfg_read16(phb, bdfn, ecap + PCICAP_EXP_DEVCTL, &val16); + val16 &= ~PCICAP_EXP_DEVCTL_CE_REPORT; + val16 |= (PCICAP_EXP_DEVCTL_NFE_REPORT | + PCICAP_EXP_DEVCTL_FE_REPORT | + PCICAP_EXP_DEVCTL_UR_REPORT); + pci_cfg_write16(phb, bdfn, ecap + PCICAP_EXP_DEVCTL, val16); + + /* Enable ECRC generation and check */ + if (!aercap) + return; + + pci_cfg_read32(phb, bdfn, aercap + PCIECAP_AER_CAPCTL, &val32); + val32 |= (PCIECAP_AER_CAPCTL_ECRCG_EN | + PCIECAP_AER_CAPCTL_ECRCC_EN); + pci_cfg_write32(phb, bdfn, aercap + PCIECAP_AER_CAPCTL, val32); +} + +static int64_t phb4_pcicfg_no_dstate(void *dev __unused, + struct pci_cfg_reg_filter *pcrf, + uint32_t offset, uint32_t len __unused, + uint32_t *data __unused, bool write) +{ + uint32_t loff = offset - pcrf->start; + + /* Disable D-state change on children of the PHB. For now we + * simply block all writes to the PM control/status + */ + if (write && loff >= 4 && loff < 6) + return OPAL_SUCCESS; + + return OPAL_PARTIAL; +} + +void phb4_pec2_dma_engine_realloc(struct phb4 *p) +{ + uint64_t reg; + + /* + * Allocate 16 extra dma read engines to stack 0, to boost dma + * performance for devices on stack 0 of PEC2, i.e PHB3. + * It comes at a price of reduced read engine allocation for + * devices on stack 1 and 2. The engine allocation becomes + * 48/8/8 instead of the default 32/16/16. + * + * The reallocation magic value should be 0xffff0000ff008000, + * but per the PCI designers, dma engine 32 (bit 0) has a + * quirk, and 0x7fff80007F008000 has the same effect (engine + * 32 goes to PHB4). 
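+ *
+ * (Editor's note, descriptive only: the code below writes this single
+ * override value to both the PCI and nest stack override registers,
+ * and only does so for PHB index 3, i.e. stack 0 of PEC2.)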
+ */ + if (p->index != 3) /* shared slot on PEC2 */ + return; + + PHBINF(p, "Allocating an extra 16 dma read engines on PEC2 stack0\n"); + reg = 0x7fff80007F008000ULL; + xscom_write(p->chip_id, + p->pci_xscom + XPEC_PCI_PRDSTKOVR, reg); + xscom_write(p->chip_id, + p->pe_xscom + XPEC_NEST_READ_STACK_OVERRIDE, reg); +} + +static void phb4_check_device_quirks(struct pci_device *dev) +{ + /* Some special adapter tweaks for devices directly under the PHB */ + if (dev->primary_bus != 1) + return; + + /* PM quirk */ + if (!pci_has_cap(dev, PCI_CFG_CAP_ID_PM, false)) + return; + + pci_add_cfg_reg_filter(dev, + pci_cap(dev, PCI_CFG_CAP_ID_PM, false), 8, + PCI_REG_FLAG_WRITE, + phb4_pcicfg_no_dstate); +} + +static int phb4_device_init(struct phb *phb, struct pci_device *dev, + void *data __unused) +{ + int ecap, aercap; + + /* Setup special device quirks */ + phb4_check_device_quirks(dev); + + /* Common initialization for the device */ + pci_device_init(phb, dev); + + ecap = pci_cap(dev, PCI_CFG_CAP_ID_EXP, false); + aercap = pci_cap(dev, PCIECAP_ID_AER, true); + if (dev->dev_type == PCIE_TYPE_ROOT_PORT) + phb4_root_port_init(phb, dev, ecap, aercap); + else if (dev->dev_type == PCIE_TYPE_SWITCH_UPPORT || + dev->dev_type == PCIE_TYPE_SWITCH_DNPORT) + phb4_switch_port_init(phb, dev, ecap, aercap); + else + phb4_endpoint_init(phb, dev, ecap, aercap); + + return 0; +} + +static int64_t phb4_pci_reinit(struct phb *phb, uint64_t scope, uint64_t data) +{ + struct pci_device *pd; + uint16_t bdfn = data; + int ret; + + if (scope != OPAL_REINIT_PCI_DEV) + return OPAL_PARAMETER; + + pd = pci_find_dev(phb, bdfn); + if (!pd) + return OPAL_PARAMETER; + + ret = phb4_device_init(phb, pd, NULL); + if (ret) + return OPAL_HARDWARE; + + return OPAL_SUCCESS; +} + +/* Default value for MBT0, see comments in init_ioda_cache() */ +static uint64_t phb4_default_mbt0(struct phb4 *p, unsigned int bar_idx) +{ + uint64_t mbt0; + + switch (p->mbt_size - bar_idx - 1) { + case 0: + mbt0 = SETFIELD(IODA3_MBT0_MODE, 0ull, IODA3_MBT0_MODE_MDT); + mbt0 = SETFIELD(IODA3_MBT0_MDT_COLUMN, mbt0, 3); + break; + case 1: + mbt0 = SETFIELD(IODA3_MBT0_MODE, 0ull, IODA3_MBT0_MODE_MDT); + mbt0 = SETFIELD(IODA3_MBT0_MDT_COLUMN, mbt0, 2); + break; + case 2: + mbt0 = SETFIELD(IODA3_MBT0_MODE, 0ull, IODA3_MBT0_MODE_MDT); + mbt0 = SETFIELD(IODA3_MBT0_MDT_COLUMN, mbt0, 1); + break; + default: + mbt0 = SETFIELD(IODA3_MBT0_MODE, 0ull, IODA3_MBT0_MODE_PE_SEG); + } + return mbt0; +} + +/* + * Clear the saved (cached) IODA state. + * + * The caches here are used to save the configuration of the IODA tables + * done by the OS. When the PHB is reset it loses all of its internal state + * so we need to keep a copy to restore from. This function re-initialises + * the saved state to sane defaults. + */ +static void phb4_init_ioda_cache(struct phb4 *p) +{ + uint32_t i; + + /* + * The RTT entries (RTE) are supposed to be initialised to + * 0xFF which indicates an invalid PE# for that RTT index + * (the bdfn). However, we set them to 0x00 since Linux + * needs to find the devices first by scanning config space + * and this occurs before PEs have been assigned. + */ + for (i = 0; i < RTT_TABLE_ENTRIES; i++) + p->tbl_rtt[i] = cpu_to_be16(PHB4_RESERVED_PE_NUM(p)); + memset(p->tbl_peltv, 0x0, p->tbl_peltv_size); + memset(p->tve_cache, 0x0, sizeof(p->tve_cache)); + + /* XXX Should we mask them ? */ + memset(p->mist_cache, 0x0, sizeof(p->mist_cache)); + + /* Configure MBT entries 1...N */ + + /* Column 0 is left 0 and will be used fo M32 and configured + * by the OS. 
We use MDT column 1..3 for the last 3 BARs, thus + * allowing Linux to remap those, and setup all the other ones + * for now in mode 00 (segment# == PE#). By default those + * columns are set to map the same way. + */ + for (i = 0; i < p->max_num_pes; i++) { + p->mdt_cache[i] = SETFIELD(IODA3_MDT_PE_B, 0ull, i); + p->mdt_cache[i] |= SETFIELD(IODA3_MDT_PE_C, 0ull, i); + p->mdt_cache[i] |= SETFIELD(IODA3_MDT_PE_D, 0ull, i); + } + + /* Initialize MBT entries for BARs 1...N */ + for (i = 1; i < p->mbt_size; i++) { + p->mbt_cache[i][0] = phb4_default_mbt0(p, i); + p->mbt_cache[i][1] = 0; + } + + /* Initialize M32 bar using MBT entry 0, MDT colunm A */ + p->mbt_cache[0][0] = SETFIELD(IODA3_MBT0_MODE, 0ull, IODA3_MBT0_MODE_MDT); + p->mbt_cache[0][0] |= SETFIELD(IODA3_MBT0_MDT_COLUMN, 0ull, 0); + p->mbt_cache[0][0] |= IODA3_MBT0_TYPE_M32 | (p->mm1_base & IODA3_MBT0_BASE_ADDR); + p->mbt_cache[0][1] = IODA3_MBT1_ENABLE | ((~(M32_PCI_SIZE - 1)) & IODA3_MBT1_MASK); +} + +static int64_t phb4_wait_bit(struct phb4 *p, uint32_t reg, + uint64_t mask, uint64_t want_val) +{ + uint64_t val; + + /* Wait for all pending TCE kills to complete + * + * XXX Add timeout... + */ + /* XXX SIMICS is nasty... */ + if ((reg == PHB_TCE_KILL || reg == PHB_DMA_READ_WRITE_SYNC) && + chip_quirk(QUIRK_SIMICS)) + return OPAL_SUCCESS; + + for (;;) { + val = in_be64(p->regs + reg); + if (val == 0xffffffffffffffffull) { + /* XXX Fenced ? */ + return OPAL_HARDWARE; + } + if ((val & mask) == want_val) + break; + + } + return OPAL_SUCCESS; +} + +static int64_t phb4_tce_kill(struct phb *phb, uint32_t kill_type, + uint64_t pe_number, uint32_t tce_size, + uint64_t dma_addr, uint32_t npages) +{ + struct phb4 *p = phb_to_phb4(phb); + uint64_t val; + int64_t rc; + + sync(); + switch(kill_type) { + case OPAL_PCI_TCE_KILL_PAGES: + while (npages--) { + /* Wait for a slot in the HW kill queue */ + rc = phb4_wait_bit(p, PHB_TCE_KILL, + PHB_TCE_KILL_ALL | + PHB_TCE_KILL_PE | + PHB_TCE_KILL_ONE, 0); + if (rc) + return rc; + val = SETFIELD(PHB_TCE_KILL_PENUM, dma_addr, pe_number); + + /* Set appropriate page size */ + switch(tce_size) { + case 0x1000: + if (dma_addr & 0xf000000000000fffull) + return OPAL_PARAMETER; + break; + case 0x10000: + if (dma_addr & 0xf00000000000ffffull) + return OPAL_PARAMETER; + val |= PHB_TCE_KILL_PSEL | PHB_TCE_KILL_64K; + break; + case 0x200000: + if (dma_addr & 0xf0000000001fffffull) + return OPAL_PARAMETER; + val |= PHB_TCE_KILL_PSEL | PHB_TCE_KILL_2M; + break; + case 0x40000000: + if (dma_addr & 0xf00000003fffffffull) + return OPAL_PARAMETER; + val |= PHB_TCE_KILL_PSEL | PHB_TCE_KILL_1G; + break; + default: + return OPAL_PARAMETER; + } + /* Perform kill */ + out_be64(p->regs + PHB_TCE_KILL, PHB_TCE_KILL_ONE | val); + /* Next page */ + dma_addr += tce_size; + } + break; + case OPAL_PCI_TCE_KILL_PE: + /* Wait for a slot in the HW kill queue */ + rc = phb4_wait_bit(p, PHB_TCE_KILL, + PHB_TCE_KILL_ALL | + PHB_TCE_KILL_PE | + PHB_TCE_KILL_ONE, 0); + if (rc) + return rc; + /* Perform kill */ + out_be64(p->regs + PHB_TCE_KILL, PHB_TCE_KILL_PE | + SETFIELD(PHB_TCE_KILL_PENUM, 0ull, pe_number)); + break; + case OPAL_PCI_TCE_KILL_ALL: + /* Wait for a slot in the HW kill queue */ + rc = phb4_wait_bit(p, PHB_TCE_KILL, + PHB_TCE_KILL_ALL | + PHB_TCE_KILL_PE | + PHB_TCE_KILL_ONE, 0); + if (rc) + return rc; + /* Perform kill */ + out_be64(p->regs + PHB_TCE_KILL, PHB_TCE_KILL_ALL); + break; + default: + return OPAL_PARAMETER; + } + + /* Start DMA sync process */ + if (is_phb5()){ + val = in_be64(p->regs + PHB_DMA_READ_WRITE_SYNC) 
& + (PHB_DMA_READ_SYNC_COMPLETE | + PHB_DMA_WRITE_SYNC_COMPLETE); + out_be64(p->regs + PHB_DMA_READ_WRITE_SYNC, + val | PHB_DMA_READ_SYNC_START); + + } else { + out_be64(p->regs + PHB_DMA_READ_WRITE_SYNC, + PHB_DMA_READ_SYNC_START); + } + + /* Wait for kill to complete */ + rc = phb4_wait_bit(p, PHB_Q_DMA_R, PHB_Q_DMA_R_TCE_KILL_STATUS, 0); + if (rc) + return rc; + + /* Wait for DMA sync to complete */ + return phb4_wait_bit(p, PHB_DMA_READ_WRITE_SYNC, + PHB_DMA_READ_SYNC_COMPLETE, + PHB_DMA_READ_SYNC_COMPLETE); +} + +/* phb4_ioda_reset - Reset the IODA tables + * + * @purge: If true, the cache is cleared and the cleared values + * are applied to HW. If false, the cached values are + * applied to HW + * + * This reset the IODA tables in the PHB. It is called at + * initialization time, on PHB reset, and can be called + * explicitly from OPAL + */ +static int64_t phb4_ioda_reset(struct phb *phb, bool purge) +{ + struct phb4 *p = phb_to_phb4(phb); + uint32_t i; + uint64_t val; + + if (purge) { + PHBDBG(p, "Purging all IODA tables...\n"); + if (phb->slot) + phb->slot->link_retries = PHB4_LINK_LINK_RETRIES; + phb4_init_ioda_cache(p); + } + + /* Init_30..31 - Errata workaround, clear PESTA entry 0 */ + phb4_ioda_sel(p, IODA3_TBL_PESTA, 0, false); + out_be64(p->regs + PHB_IODA_DATA0, 0); + + /* Init_32..33 - MIST */ + phb4_ioda_sel(p, IODA3_TBL_MIST, 0, true); + val = in_be64(p->regs + PHB_IODA_ADDR); + val = SETFIELD(PHB_IODA_AD_MIST_PWV, val, 0xf); + out_be64(p->regs + PHB_IODA_ADDR, val); + for (i = 0; i < (p->num_irqs/4); i++) + out_be64(p->regs + PHB_IODA_DATA0, p->mist_cache[i]); + + /* Init_34..35 - MRT */ + phb4_ioda_sel(p, IODA3_TBL_MRT, 0, true); + for (i = 0; i < p->mrt_size; i++) + out_be64(p->regs + PHB_IODA_DATA0, 0); + + /* Init_36..37 - TVT */ + phb4_ioda_sel(p, IODA3_TBL_TVT, 0, true); + for (i = 0; i < p->tvt_size; i++) + out_be64(p->regs + PHB_IODA_DATA0, p->tve_cache[i]); + + /* Init_38..39 - MBT */ + phb4_ioda_sel(p, IODA3_TBL_MBT, 0, true); + for (i = 0; i < p->mbt_size; i++) { + out_be64(p->regs + PHB_IODA_DATA0, p->mbt_cache[i][0]); + out_be64(p->regs + PHB_IODA_DATA0, p->mbt_cache[i][1]); + } + + /* Init_40..41 - MDT */ + phb4_ioda_sel(p, IODA3_TBL_MDT, 0, true); + for (i = 0; i < p->max_num_pes; i++) + out_be64(p->regs + PHB_IODA_DATA0, p->mdt_cache[i]); + + /* Additional OPAL specific inits */ + + /* Clear PEST & PEEV */ + for (i = 0; i < p->max_num_pes; i++) { + phb4_ioda_sel(p, IODA3_TBL_PESTA, i, false); + out_be64(p->regs + PHB_IODA_DATA0, 0); + phb4_ioda_sel(p, IODA3_TBL_PESTB, i, false); + out_be64(p->regs + PHB_IODA_DATA0, 0); + } + + phb4_ioda_sel(p, IODA3_TBL_PEEV, 0, true); + for (i = 0; i < p->max_num_pes/64; i++) + out_be64(p->regs + PHB_IODA_DATA0, 0); + + /* Invalidate RTE, TCE cache */ + out_be64(p->regs + PHB_RTC_INVALIDATE, PHB_RTC_INVALIDATE_ALL); + + return phb4_tce_kill(&p->phb, OPAL_PCI_TCE_KILL_ALL, 0, 0, 0, 0); +} + +/* + * Clear anything we have in PAPR Error Injection registers. Though + * the spec says the PAPR error injection should be one-shot without + * the "sticky" bit. However, that's false according to the experiments + * I had. So we have to clear it at appropriate point in kernel to + * avoid endless frozen PE. 
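+ *
+ * (Editor's note, descriptive only: the reset below simply zeroes the
+ * PAPR error injection control, address and mask registers so that no
+ * stale injection setup is left armed.)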
+ */ +static int64_t phb4_papr_errinjct_reset(struct phb *phb) +{ + struct phb4 *p = phb_to_phb4(phb); + + out_be64(p->regs + PHB_PAPR_ERR_INJ_CTL, 0x0ul); + out_be64(p->regs + PHB_PAPR_ERR_INJ_ADDR, 0x0ul); + out_be64(p->regs + PHB_PAPR_ERR_INJ_MASK, 0x0ul); + + return OPAL_SUCCESS; +} + +static int64_t phb4_set_phb_mem_window(struct phb *phb, + uint16_t window_type, + uint16_t window_num, + uint64_t addr, + uint64_t pci_addr __unused, + uint64_t size) +{ + struct phb4 *p = phb_to_phb4(phb); + uint64_t mbt0, mbt1; + + /* + * We have a unified MBT for all BARs on PHB4. + * + * So we use it as follow: + * + * - M32 is hard wired to be MBT[0] and uses MDT column 0 + * for remapping. + * + * - MBT[1..n] are available to the OS, currently only as + * fully segmented or single PE (we don't yet expose the + * new segmentation modes). + * + * - We configure the 3 last BARs to columnt 1..3 initially + * set to segment# == PE#. We will need to provide some + * extensions to the existing APIs to enable remapping of + * segments on those BARs (and only those) as the current + * API forces single segment mode. + */ + switch (window_type) { + case OPAL_IO_WINDOW_TYPE: + case OPAL_M32_WINDOW_TYPE: + return OPAL_UNSUPPORTED; + case OPAL_M64_WINDOW_TYPE: + if (window_num == 0 || window_num >= p->mbt_size) { + PHBERR(p, "%s: Invalid window %d\n", + __func__, window_num); + return OPAL_PARAMETER; + } + + mbt0 = p->mbt_cache[window_num][0]; + mbt1 = p->mbt_cache[window_num][1]; + + /* XXX For now we assume the 4K minimum alignment, + * todo: check with the HW folks what the exact limits + * are based on the segmentation model. + */ + if ((addr & 0xFFFul) || (size & 0xFFFul)) { + PHBERR(p, "%s: Bad addr/size alignment %llx/%llx\n", + __func__, addr, size); + return OPAL_PARAMETER; + } + + /* size should be 2^N */ + if (!size || size & (size-1)) { + PHBERR(p, "%s: size not a power of 2: %llx\n", + __func__, size); + return OPAL_PARAMETER; + } + + /* address should be size aligned */ + if (addr & (size - 1)) { + PHBERR(p, "%s: addr not size aligned %llx/%llx\n", + __func__, addr, size); + return OPAL_PARAMETER; + } + + break; + default: + return OPAL_PARAMETER; + } + + /* The BAR shouldn't be enabled yet */ + if (mbt0 & IODA3_MBT0_ENABLE) + return OPAL_PARTIAL; + + /* Apply the settings */ + mbt0 = SETFIELD(IODA3_MBT0_BASE_ADDR, mbt0, addr >> 12); + mbt1 = SETFIELD(IODA3_MBT1_MASK, mbt1, ~((size >> 12) -1)); + p->mbt_cache[window_num][0] = mbt0; + p->mbt_cache[window_num][1] = mbt1; + + return OPAL_SUCCESS; +} + +/* + * For one specific M64 BAR, it can be shared by all PEs, + * or owned by single PE exclusively. + */ +static int64_t phb4_phb_mmio_enable(struct phb __unused *phb, + uint16_t window_type, + uint16_t window_num, + uint16_t enable) +{ + struct phb4 *p = phb_to_phb4(phb); + uint64_t mbt0, mbt1, base, mask; + + /* + * By design, PHB4 doesn't support IODT any more. + * Besides, we can't enable M32 BAR as well. So + * the function is used to do M64 mapping and each + * BAR is supposed to be shared by all PEs. 
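+ *
+ * (Editor's note, descriptive only: window 0 stays reserved for M32, so
+ * only windows 1..mbt_size-1 can be enabled here.  OPAL_DISABLE_M64
+ * resets the window to its default disabled mode, OPAL_ENABLE_M64_SPLIT
+ * requires a PE-segmented or MDT-mapped window, and
+ * OPAL_ENABLE_M64_NON_SPLIT requires a single-PE window, as checked
+ * below.)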
+ * + * TODO: Add support for some of the new PHB4 split modes + */ + switch (window_type) { + case OPAL_IO_WINDOW_TYPE: + case OPAL_M32_WINDOW_TYPE: + return OPAL_UNSUPPORTED; + case OPAL_M64_WINDOW_TYPE: + /* Window 0 is reserved for M32 */ + if (window_num == 0 || window_num >= p->mbt_size || + enable > OPAL_ENABLE_M64_NON_SPLIT) { + PHBDBG(p, + "phb4_phb_mmio_enable wrong args (window %d enable %d)\n", + window_num, enable); + return OPAL_PARAMETER; + } + break; + default: + return OPAL_PARAMETER; + } + + /* + * We need check the base/mask while enabling + * the M64 BAR. Otherwise, invalid base/mask + * might cause fenced AIB unintentionally + */ + mbt0 = p->mbt_cache[window_num][0]; + mbt1 = p->mbt_cache[window_num][1]; + + if (enable == OPAL_DISABLE_M64) { + /* Reset the window to disabled & default mode */ + mbt0 = phb4_default_mbt0(p, window_num); + mbt1 = 0; + } else { + /* Verify that the mode is valid and consistent */ + if (enable == OPAL_ENABLE_M64_SPLIT) { + uint64_t mode = GETFIELD(IODA3_MBT0_MODE, mbt0); + if (mode != IODA3_MBT0_MODE_PE_SEG && + mode != IODA3_MBT0_MODE_MDT) + return OPAL_PARAMETER; + } else if (enable == OPAL_ENABLE_M64_NON_SPLIT) { + if (GETFIELD(IODA3_MBT0_MODE, mbt0) != + IODA3_MBT0_MODE_SINGLE_PE) + return OPAL_PARAMETER; + } else + return OPAL_PARAMETER; + + base = GETFIELD(IODA3_MBT0_BASE_ADDR, mbt0); + base = (base << 12); + mask = GETFIELD(IODA3_MBT1_MASK, mbt1); + if (base < p->mm0_base || !mask) + return OPAL_PARTIAL; + + mbt0 |= IODA3_MBT0_ENABLE; + mbt1 |= IODA3_MBT1_ENABLE; + } + + /* Update HW and cache */ + p->mbt_cache[window_num][0] = mbt0; + p->mbt_cache[window_num][1] = mbt1; + phb4_ioda_sel(p, IODA3_TBL_MBT, window_num << 1, true); + out_be64(p->regs + PHB_IODA_DATA0, mbt0); + out_be64(p->regs + PHB_IODA_DATA0, mbt1); + + return OPAL_SUCCESS; +} + +static int64_t phb4_map_pe_mmio_window(struct phb *phb, + uint64_t pe_number, + uint16_t window_type, + uint16_t window_num, + uint16_t segment_num) +{ + struct phb4 *p = phb_to_phb4(phb); + uint64_t mbt0, mbt1, mdt0; + + if (pe_number >= p->num_pes) + return OPAL_PARAMETER; + + /* + * We support a combined MDT that has 4 columns. We let the OS + * use kernel 0 for M32. + * + * We configure the 3 last BARs to map column 3..1 which by default + * are set to map segment# == pe#, but can be remapped here if we + * extend this function. + * + * The problem is that the current API was "hijacked" so that an + * attempt at remapping any segment of an M64 has the effect of + * turning it into a single-PE mode BAR. So if we want to support + * remapping we'll have to play around this for example by creating + * a new API or a new window type... 
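+ *
+ * (Editor's note, descriptive only: as implemented below, an M32 mapping
+ * rewrites MDT column A for the given segment, while mapping a segment
+ * of an M64 window switches that whole window into single-PE mode with
+ * the PE number cached in MBT1.)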
+ */ + switch(window_type) { + case OPAL_IO_WINDOW_TYPE: + return OPAL_UNSUPPORTED; + case OPAL_M32_WINDOW_TYPE: + if (window_num != 0 || segment_num >= p->num_pes) + return OPAL_PARAMETER; + + mdt0 = p->mdt_cache[segment_num]; + mdt0 = SETFIELD(IODA3_MDT_PE_A, mdt0, pe_number); + phb4_ioda_sel(p, IODA3_TBL_MDT, segment_num, false); + out_be64(p->regs + PHB_IODA_DATA0, mdt0); + break; + case OPAL_M64_WINDOW_TYPE: + if (window_num == 0 || window_num >= p->mbt_size) + return OPAL_PARAMETER; + + mbt0 = p->mbt_cache[window_num][0]; + mbt1 = p->mbt_cache[window_num][1]; + + /* The BAR shouldn't be enabled yet */ + if (mbt0 & IODA3_MBT0_ENABLE) + return OPAL_PARTIAL; + + /* Set to single PE mode and configure the PE */ + mbt0 = SETFIELD(IODA3_MBT0_MODE, mbt0, + IODA3_MBT0_MODE_SINGLE_PE); + mbt1 = SETFIELD(IODA3_MBT1_SINGLE_PE_NUM, mbt1, pe_number); + p->mbt_cache[window_num][0] = mbt0; + p->mbt_cache[window_num][1] = mbt1; + break; + default: + return OPAL_PARAMETER; + } + + return OPAL_SUCCESS; +} + +static int64_t phb4_map_pe_dma_window(struct phb *phb, + uint64_t pe_number, + uint16_t window_id, + uint16_t tce_levels, + uint64_t tce_table_addr, + uint64_t tce_table_size, + uint64_t tce_page_size) +{ + struct phb4 *p = phb_to_phb4(phb); + uint64_t tts_encoded; + uint64_t data64 = 0; + + /* + * We configure the PHB in 2 TVE per PE mode to match phb3. + * Current Linux implementation *requires* the two windows per + * PE. + * + * Note: On DD2.0 this is the normal mode of operation. + */ + + /* + * Sanity check. We currently only support "2 window per PE" mode + * ie, only bit 59 of the PCI address is used to select the window + */ + if (pe_number >= p->num_pes || (window_id >> 1) != pe_number) + return OPAL_PARAMETER; + + /* + * tce_table_size == 0 is used to disable an entry, in this case + * we ignore other arguments + */ + if (tce_table_size == 0) { + phb4_ioda_sel(p, IODA3_TBL_TVT, window_id, false); + out_be64(p->regs + PHB_IODA_DATA0, 0); + p->tve_cache[window_id] = 0; + return OPAL_SUCCESS; + } + + /* Additional arguments validation */ + if (tce_levels < 1 || tce_levels > 5 || + !is_pow2(tce_table_size) || + tce_table_size < 0x1000) + return OPAL_PARAMETER; + + /* Encode TCE table size */ + data64 = SETFIELD(IODA3_TVT_TABLE_ADDR, 0ul, tce_table_addr >> 12); + tts_encoded = ilog2(tce_table_size) - 11; + if (tts_encoded > 31) + return OPAL_PARAMETER; + data64 = SETFIELD(IODA3_TVT_TCE_TABLE_SIZE, data64, tts_encoded); + + /* Encode TCE page size */ + switch (tce_page_size) { + case 0x1000: /* 4K */ + data64 = SETFIELD(IODA3_TVT_IO_PSIZE, data64, 1); + break; + case 0x10000: /* 64K */ + data64 = SETFIELD(IODA3_TVT_IO_PSIZE, data64, 5); + break; + case 0x200000: /* 2M */ + data64 = SETFIELD(IODA3_TVT_IO_PSIZE, data64, 10); + break; + case 0x40000000: /* 1G */ + data64 = SETFIELD(IODA3_TVT_IO_PSIZE, data64, 19); + break; + default: + return OPAL_PARAMETER; + } + + /* Encode number of levels */ + data64 = SETFIELD(IODA3_TVT_NUM_LEVELS, data64, tce_levels - 1); + + phb4_ioda_sel(p, IODA3_TBL_TVT, window_id, false); + out_be64(p->regs + PHB_IODA_DATA0, data64); + p->tve_cache[window_id] = data64; + + return OPAL_SUCCESS; +} + +static int64_t phb4_map_pe_dma_window_real(struct phb *phb, + uint64_t pe_number, + uint16_t window_id, + uint64_t pci_start_addr, + uint64_t pci_mem_size) +{ + struct phb4 *p = phb_to_phb4(phb); + uint64_t end = pci_start_addr + pci_mem_size; + uint64_t tve; + + if (pe_number >= p->num_pes || + (window_id >> 1) != pe_number) + return OPAL_PARAMETER; + + if (pci_mem_size) { 
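+ /*
+ * (Editor's note, descriptive only: window_id bit 0 selects which of
+ * the PE's two TVEs is written and must match PCI address bit 59,
+ * which is what the first check below enforces before the rest of
+ * the address is encoded.)
+ */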
+ /* Enable */ + + /* + * Check that the start address has the right TVE index, + * we only support the 1 bit mode where each PE has 2 + * TVEs + */ + if ((pci_start_addr >> 59) != (window_id & 1)) + return OPAL_PARAMETER; + pci_start_addr &= ((1ull << 59) - 1); + end = pci_start_addr + pci_mem_size; + + /* We have to be 16M aligned */ + if ((pci_start_addr & 0x00ffffff) || + (pci_mem_size & 0x00ffffff)) + return OPAL_PARAMETER; + + /* + * It *looks* like this is the max we can support (we need + * to verify this. Also we are not checking for rollover, + * but then we aren't trying too hard to protect ourselves + * againt a completely broken OS. + */ + if (end > 0x0003ffffffffffffull) + return OPAL_PARAMETER; + + /* + * Put start address bits 49:24 into TVE[52:53]||[0:23] + * and end address bits 49:24 into TVE[54:55]||[24:47] + * and set TVE[51] + */ + tve = (pci_start_addr << 16) & (0xffffffull << 40); + tve |= (pci_start_addr >> 38) & (3ull << 10); + tve |= (end >> 8) & (0xfffffful << 16); + tve |= (end >> 40) & (3ull << 8); + tve |= PPC_BIT(51) | IODA3_TVT_NON_TRANSLATE_50; + } else { + /* Disable */ + tve = 0; + } + + phb4_ioda_sel(p, IODA3_TBL_TVT, window_id, false); + out_be64(p->regs + PHB_IODA_DATA0, tve); + p->tve_cache[window_id] = tve; + + return OPAL_SUCCESS; +} + +static int64_t phb4_set_option(struct phb *phb, enum OpalPhbOption opt, + uint64_t setting) +{ + struct phb4 *p = phb_to_phb4(phb); + uint64_t data64; + + data64 = phb4_read_reg(p, PHB_CTRLR); + switch (opt) { + case OPAL_PHB_OPTION_TVE1_4GB: + if (setting > 1) + return OPAL_PARAMETER; + + PHBDBG(p, "4GB bypass mode = %lld\n", setting); + if (setting) + data64 |= PPC_BIT(24); + else + data64 &= ~PPC_BIT(24); + break; + case OPAL_PHB_OPTION_MMIO_EEH_DISABLE: + if (setting > 1) + return OPAL_PARAMETER; + + PHBDBG(p, "MMIO EEH Disable = %lld\n", setting); + if (setting) + data64 |= PPC_BIT(14); + else + data64 &= ~PPC_BIT(14); + break; + default: + return OPAL_UNSUPPORTED; + } + phb4_write_reg(p, PHB_CTRLR, data64); + + return OPAL_SUCCESS; +} + +static int64_t phb4_get_option(struct phb *phb, enum OpalPhbOption opt, + __be64 *setting) +{ + struct phb4 *p = phb_to_phb4(phb); + uint64_t data64; + + data64 = phb4_read_reg(p, PHB_CTRLR); + switch (opt) { + case OPAL_PHB_OPTION_TVE1_4GB: + *setting = cpu_to_be64((data64 & PPC_BIT(24)) ? 1 : 0); + break; + case OPAL_PHB_OPTION_MMIO_EEH_DISABLE: + *setting = cpu_to_be64((data64 & PPC_BIT(14)) ? 
1 : 0); + break; + default: + return OPAL_UNSUPPORTED; + } + + return OPAL_SUCCESS; +} + +static int64_t phb4_set_ive_pe(struct phb *phb, + uint64_t pe_number, + uint32_t ive_num) +{ + struct phb4 *p = phb_to_phb4(phb); + uint32_t mist_idx; + uint32_t mist_quad; + uint32_t mist_shift; + uint64_t val; + + if (pe_number >= p->num_pes || ive_num >= (p->num_irqs - 8)) + return OPAL_PARAMETER; + + mist_idx = ive_num >> 2; + mist_quad = ive_num & 3; + mist_shift = (3 - mist_quad) << 4; + p->mist_cache[mist_idx] &= ~(0x0fffull << mist_shift); + p->mist_cache[mist_idx] |= ((uint64_t)pe_number) << mist_shift; + + /* Note: This has the side effect of clearing P/Q, so this + * shouldn't be called while the interrupt is "hot" + */ + + phb4_ioda_sel(p, IODA3_TBL_MIST, mist_idx, false); + + /* We need to inject the appropriate MIST write enable bit + * in the IODA table address register + */ + val = in_be64(p->regs + PHB_IODA_ADDR); + val = SETFIELD(PHB_IODA_AD_MIST_PWV, val, 8 >> mist_quad); + out_be64(p->regs + PHB_IODA_ADDR, val); + + /* Write entry */ + out_be64(p->regs + PHB_IODA_DATA0, p->mist_cache[mist_idx]); + + return OPAL_SUCCESS; +} + +static int64_t phb4_get_msi_32(struct phb *phb, + uint64_t pe_number, + uint32_t ive_num, + uint8_t msi_range, + uint32_t *msi_address, + uint32_t *message_data) +{ + struct phb4 *p = phb_to_phb4(phb); + + /* + * Sanity check. We needn't check on mve_number (PE#) + * on PHB3 since the interrupt source is purely determined + * by its DMA address and data, but the check isn't + * harmful. + */ + if (pe_number >= p->num_pes || + ive_num >= (p->num_irqs - 8) || + msi_range != 1 || !msi_address|| !message_data) + return OPAL_PARAMETER; + + /* + * DMA address and data will form the IVE index. + * For more details, please refer to IODA2 spec. + */ + *msi_address = 0xFFFF0000 | ((ive_num << 4) & 0xFFFFFE0F); + *message_data = ive_num & 0x1F; + + return OPAL_SUCCESS; +} + +static int64_t phb4_get_msi_64(struct phb *phb, + uint64_t pe_number, + uint32_t ive_num, + uint8_t msi_range, + uint64_t *msi_address, + uint32_t *message_data) +{ + struct phb4 *p = phb_to_phb4(phb); + + /* Sanity check */ + if (pe_number >= p->num_pes || + ive_num >= (p->num_irqs - 8) || + msi_range != 1 || !msi_address || !message_data) + return OPAL_PARAMETER; + + /* + * DMA address and data will form the IVE index. + * For more details, please refer to IODA2 spec. 
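+ *
+ * (Editor's note, illustrative only: with the encoding below, bits 4:0
+ * of the IVE number become the message data and the remaining bits are
+ * folded into the address.  For example, ive_num = 0x123 yields
+ * msi_address = 0x1000000000001200 and message_data = 0x03; the 32-bit
+ * variant above returns 0xFFFF1200 / 0x03 for the same IVE.)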
+ */ + *msi_address = (0x1ul << 60) | ((ive_num << 4) & 0xFFFFFFFFFFFFFE0Ful); + *message_data = ive_num & 0x1F; + + return OPAL_SUCCESS; +} + +static void phb4_rc_err_clear(struct phb4 *p) +{ + /* Init_47 - Clear errors */ + phb4_pcicfg_write16(&p->phb, 0, PCI_CFG_SECONDARY_STATUS, 0xffff); + + if (p->ecap <= 0) + return; + + phb4_pcicfg_write16(&p->phb, 0, p->ecap + PCICAP_EXP_DEVSTAT, + PCICAP_EXP_DEVSTAT_CE | + PCICAP_EXP_DEVSTAT_NFE | + PCICAP_EXP_DEVSTAT_FE | + PCICAP_EXP_DEVSTAT_UE); + + if (p->aercap <= 0) + return; + + /* Clear all UE status */ + phb4_pcicfg_write32(&p->phb, 0, p->aercap + PCIECAP_AER_UE_STATUS, + 0xffffffff); + /* Clear all CE status */ + phb4_pcicfg_write32(&p->phb, 0, p->aercap + PCIECAP_AER_CE_STATUS, + 0xffffffff); + /* Clear root error status */ + phb4_pcicfg_write32(&p->phb, 0, p->aercap + PCIECAP_AER_RERR_STA, + 0xffffffff); +} + +static void phb4_err_clear_regb(struct phb4 *p) +{ + uint64_t val64; + + val64 = phb4_read_reg(p, PHB_REGB_ERR_STATUS); + phb4_write_reg(p, PHB_REGB_ERR_STATUS, val64); + phb4_write_reg(p, PHB_REGB_ERR1_STATUS, 0x0ul); + phb4_write_reg(p, PHB_REGB_ERR_LOG_0, 0x0ul); + phb4_write_reg(p, PHB_REGB_ERR_LOG_1, 0x0ul); +} + +/* + * The function can be called during error recovery for all classes of + * errors. This is new to PHB4; previous revisions had separate + * sequences for INF/ER/Fatal errors. + * + * "Rec #" in this function refer to "Recov_#" steps in the + * PHB4 INF recovery sequence. + */ +static void phb4_err_clear(struct phb4 *p) +{ + uint64_t val64; + uint64_t fir = phb4_read_reg(p, PHB_LEM_FIR_ACCUM); + + /* Rec 1: Acquire the PCI config lock (we don't need to do this) */ + + /* Rec 2...15: Clear error status in RC config space */ + phb4_rc_err_clear(p); + + /* Rec 16...23: Clear PBL errors */ + val64 = phb4_read_reg(p, PHB_PBL_ERR_STATUS); + phb4_write_reg(p, PHB_PBL_ERR_STATUS, val64); + phb4_write_reg(p, PHB_PBL_ERR1_STATUS, 0x0ul); + phb4_write_reg(p, PHB_PBL_ERR_LOG_0, 0x0ul); + phb4_write_reg(p, PHB_PBL_ERR_LOG_1, 0x0ul); + + /* Rec 24...31: Clear REGB errors */ + phb4_err_clear_regb(p); + + /* Rec 32...59: Clear PHB error trap */ + val64 = phb4_read_reg(p, PHB_TXE_ERR_STATUS); + phb4_write_reg(p, PHB_TXE_ERR_STATUS, val64); + phb4_write_reg(p, PHB_TXE_ERR1_STATUS, 0x0ul); + phb4_write_reg(p, PHB_TXE_ERR_LOG_0, 0x0ul); + phb4_write_reg(p, PHB_TXE_ERR_LOG_1, 0x0ul); + + val64 = phb4_read_reg(p, PHB_RXE_ARB_ERR_STATUS); + phb4_write_reg(p, PHB_RXE_ARB_ERR_STATUS, val64); + phb4_write_reg(p, PHB_RXE_ARB_ERR1_STATUS, 0x0ul); + phb4_write_reg(p, PHB_RXE_ARB_ERR_LOG_0, 0x0ul); + phb4_write_reg(p, PHB_RXE_ARB_ERR_LOG_1, 0x0ul); + + val64 = phb4_read_reg(p, PHB_RXE_MRG_ERR_STATUS); + phb4_write_reg(p, PHB_RXE_MRG_ERR_STATUS, val64); + phb4_write_reg(p, PHB_RXE_MRG_ERR1_STATUS, 0x0ul); + phb4_write_reg(p, PHB_RXE_MRG_ERR_LOG_0, 0x0ul); + phb4_write_reg(p, PHB_RXE_MRG_ERR_LOG_1, 0x0ul); + + val64 = phb4_read_reg(p, PHB_RXE_TCE_ERR_STATUS); + phb4_write_reg(p, PHB_RXE_TCE_ERR_STATUS, val64); + phb4_write_reg(p, PHB_RXE_TCE_ERR1_STATUS, 0x0ul); + phb4_write_reg(p, PHB_RXE_TCE_ERR_LOG_0, 0x0ul); + phb4_write_reg(p, PHB_RXE_TCE_ERR_LOG_1, 0x0ul); + + val64 = phb4_read_reg(p, PHB_ERR_STATUS); + phb4_write_reg(p, PHB_ERR_STATUS, val64); + phb4_write_reg(p, PHB_ERR1_STATUS, 0x0ul); + phb4_write_reg(p, PHB_ERR_LOG_0, 0x0ul); + phb4_write_reg(p, PHB_ERR_LOG_1, 0x0ul); + + /* Rec 61/62: Clear FIR/WOF */ + phb4_write_reg(p, PHB_LEM_FIR_AND_MASK, ~fir); + phb4_write_reg(p, PHB_LEM_WOF, 0x0ul); + + /* Rec 63: Update LEM mask to its 
initial value */ + phb4_write_reg(p, PHB_LEM_ERROR_MASK, 0x0ul); + + /* Rec 64: Clear the PCI config lock (we don't need to do this) */ +} + +static void phb4_read_phb_status(struct phb4 *p, + struct OpalIoPhb4ErrorData *stat) +{ + uint32_t i; + __be64 *pPEST; + uint16_t __16; + uint32_t __32; + uint64_t __64; + + memset(stat, 0, sizeof(struct OpalIoPhb4ErrorData)); + + /* Error data common part */ + stat->common.version = cpu_to_be32(OPAL_PHB_ERROR_DATA_VERSION_1); + stat->common.ioType = cpu_to_be32(OPAL_PHB_ERROR_DATA_TYPE_PHB4); + stat->common.len = cpu_to_be32(sizeof(struct OpalIoPhb4ErrorData)); + + /* Use ASB for config space if the PHB is fenced */ + if (p->flags & PHB4_AIB_FENCED) + p->flags |= PHB4_CFG_USE_ASB; + + /* Grab RC bridge control, make it 32-bit */ + phb4_pcicfg_read16(&p->phb, 0, PCI_CFG_BRCTL, &__16); + stat->brdgCtl = cpu_to_be32(__16); + + /* + * Grab various RC PCIe capability registers. All device, slot + * and link status are 16-bit, so we grab the pair control+status + * for each of them + */ + phb4_pcicfg_read32(&p->phb, 0, p->ecap + PCICAP_EXP_DEVCTL, &__32); + stat->deviceStatus = cpu_to_be32(__32); + phb4_pcicfg_read32(&p->phb, 0, p->ecap + PCICAP_EXP_SLOTCTL, &__32); + stat->slotStatus = cpu_to_be32(__32); + phb4_pcicfg_read32(&p->phb, 0, p->ecap + PCICAP_EXP_LCTL, &__32); + stat->linkStatus = cpu_to_be32(__32); + + /* + * I assume those are the standard config space header, cmd & status + * together makes 32-bit. Secondary status is 16-bit so I'll clear + * the top on that one + */ + phb4_pcicfg_read32(&p->phb, 0, PCI_CFG_CMD, &__32); + stat->devCmdStatus = cpu_to_be32(__32); + phb4_pcicfg_read16(&p->phb, 0, PCI_CFG_SECONDARY_STATUS, &__16); + stat->devSecStatus = cpu_to_be32(__16); + + /* Grab a bunch of AER regs */ + phb4_pcicfg_read32(&p->phb, 0, p->aercap + PCIECAP_AER_RERR_STA, &__32); + stat->rootErrorStatus = cpu_to_be32(__32); + phb4_pcicfg_read32(&p->phb, 0, p->aercap + PCIECAP_AER_UE_STATUS, &__32); + stat->uncorrErrorStatus = cpu_to_be32(__32); + + phb4_pcicfg_read32(&p->phb, 0, p->aercap + PCIECAP_AER_CE_STATUS, &__32); + stat->corrErrorStatus = cpu_to_be32(__32); + + phb4_pcicfg_read32(&p->phb, 0, p->aercap + PCIECAP_AER_HDR_LOG0, &__32); + stat->tlpHdr1 = cpu_to_be32(__32); + + phb4_pcicfg_read32(&p->phb, 0, p->aercap + PCIECAP_AER_HDR_LOG1, &__32); + stat->tlpHdr2 = cpu_to_be32(__32); + + phb4_pcicfg_read32(&p->phb, 0, p->aercap + PCIECAP_AER_HDR_LOG2, &__32); + stat->tlpHdr3 = cpu_to_be32(__32); + + phb4_pcicfg_read32(&p->phb, 0, p->aercap + PCIECAP_AER_HDR_LOG3, &__32); + stat->tlpHdr4 = cpu_to_be32(__32); + + phb4_pcicfg_read32(&p->phb, 0, p->aercap + PCIECAP_AER_SRCID, &__32); + stat->sourceId = cpu_to_be32(__32); + + + /* PEC NFIR, same as P8/PHB3 */ + xscom_read(p->chip_id, p->pe_stk_xscom + 0x0, &__64); + stat->nFir = cpu_to_be64(__64); + xscom_read(p->chip_id, p->pe_stk_xscom + 0x3, &__64); + stat->nFirMask = cpu_to_be64(__64); + xscom_read(p->chip_id, p->pe_stk_xscom + 0x8, &__64); + stat->nFirWOF = cpu_to_be64(__64); + + /* PHB4 inbound and outbound error Regs */ + stat->phbPlssr = cpu_to_be64(phb4_read_reg_asb(p, PHB_CPU_LOADSTORE_STATUS)); + stat->phbCsr = cpu_to_be64(phb4_read_reg_asb(p, PHB_DMA_CHAN_STATUS)); + stat->lemFir = cpu_to_be64(phb4_read_reg_asb(p, PHB_LEM_FIR_ACCUM)); + stat->lemErrorMask = cpu_to_be64(phb4_read_reg_asb(p, PHB_LEM_ERROR_MASK)); + stat->lemWOF = cpu_to_be64(phb4_read_reg_asb(p, PHB_LEM_WOF)); + stat->phbErrorStatus = cpu_to_be64(phb4_read_reg_asb(p, PHB_ERR_STATUS)); + stat->phbFirstErrorStatus = 
cpu_to_be64(phb4_read_reg_asb(p, PHB_ERR1_STATUS)); + stat->phbErrorLog0 = cpu_to_be64(phb4_read_reg_asb(p, PHB_ERR_LOG_0)); + stat->phbErrorLog1 = cpu_to_be64(phb4_read_reg_asb(p, PHB_ERR_LOG_1)); + stat->phbTxeErrorStatus = cpu_to_be64(phb4_read_reg_asb(p, PHB_TXE_ERR_STATUS)); + stat->phbTxeFirstErrorStatus = cpu_to_be64(phb4_read_reg_asb(p, PHB_TXE_ERR1_STATUS)); + stat->phbTxeErrorLog0 = cpu_to_be64(phb4_read_reg_asb(p, PHB_TXE_ERR_LOG_0)); + stat->phbTxeErrorLog1 = cpu_to_be64(phb4_read_reg_asb(p, PHB_TXE_ERR_LOG_1)); + stat->phbRxeArbErrorStatus = cpu_to_be64(phb4_read_reg_asb(p, PHB_RXE_ARB_ERR_STATUS)); + stat->phbRxeArbFirstErrorStatus = cpu_to_be64(phb4_read_reg_asb(p, PHB_RXE_ARB_ERR1_STATUS)); + stat->phbRxeArbErrorLog0 = cpu_to_be64(phb4_read_reg_asb(p, PHB_RXE_ARB_ERR_LOG_0)); + stat->phbRxeArbErrorLog1 = cpu_to_be64(phb4_read_reg_asb(p, PHB_RXE_ARB_ERR_LOG_1)); + stat->phbRxeMrgErrorStatus = cpu_to_be64(phb4_read_reg_asb(p, PHB_RXE_MRG_ERR_STATUS)); + stat->phbRxeMrgFirstErrorStatus = cpu_to_be64(phb4_read_reg_asb(p, PHB_RXE_MRG_ERR1_STATUS)); + stat->phbRxeMrgErrorLog0 = cpu_to_be64(phb4_read_reg_asb(p, PHB_RXE_MRG_ERR_LOG_0)); + stat->phbRxeMrgErrorLog1 = cpu_to_be64(phb4_read_reg_asb(p, PHB_RXE_MRG_ERR_LOG_1)); + stat->phbRxeTceErrorStatus = cpu_to_be64(phb4_read_reg_asb(p, PHB_RXE_TCE_ERR_STATUS)); + stat->phbRxeTceFirstErrorStatus = cpu_to_be64(phb4_read_reg_asb(p, PHB_RXE_TCE_ERR1_STATUS)); + stat->phbRxeTceErrorLog0 = cpu_to_be64(phb4_read_reg_asb(p, PHB_RXE_TCE_ERR_LOG_0)); + stat->phbRxeTceErrorLog1 = cpu_to_be64(phb4_read_reg_asb(p, PHB_RXE_TCE_ERR_LOG_1)); + + /* PHB4 REGB error registers */ + stat->phbPblErrorStatus = cpu_to_be64(phb4_read_reg_asb(p, PHB_PBL_ERR_STATUS)); + stat->phbPblFirstErrorStatus = cpu_to_be64(phb4_read_reg_asb(p, PHB_PBL_ERR1_STATUS)); + stat->phbPblErrorLog0 = cpu_to_be64(phb4_read_reg_asb(p, PHB_PBL_ERR_LOG_0)); + stat->phbPblErrorLog1 = cpu_to_be64(phb4_read_reg_asb(p, PHB_PBL_ERR_LOG_1)); + + stat->phbPcieDlpErrorStatus = cpu_to_be64(phb4_read_reg_asb(p, PHB_PCIE_DLP_ERR_STATUS)); + stat->phbPcieDlpErrorLog1 = cpu_to_be64(phb4_read_reg_asb(p, PHB_PCIE_DLP_ERRLOG1)); + stat->phbPcieDlpErrorLog2 = cpu_to_be64(phb4_read_reg_asb(p, PHB_PCIE_DLP_ERRLOG2)); + + stat->phbRegbErrorStatus = cpu_to_be64(phb4_read_reg_asb(p, PHB_REGB_ERR_STATUS)); + stat->phbRegbFirstErrorStatus = cpu_to_be64(phb4_read_reg_asb(p, PHB_REGB_ERR1_STATUS)); + stat->phbRegbErrorLog0 = cpu_to_be64(phb4_read_reg_asb(p, PHB_REGB_ERR_LOG_0)); + stat->phbRegbErrorLog1 = cpu_to_be64(phb4_read_reg_asb(p, PHB_REGB_ERR_LOG_1)); + + /* + * Grab PESTA & B content. The error bit (bit#0) should + * be fetched from IODA and the left content from memory + * resident tables. 
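+ *
+ * (Editor's note, descriptive only: the loops below therefore OR the
+ * per-PE word read back through the IODA window with the matching
+ * in-memory entry, pPEST[2 * i] for PESTA and pPEST[2 * i + 1] for
+ * PESTB.)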
+ */ + pPEST = (__be64 *)p->tbl_pest; + phb4_ioda_sel(p, IODA3_TBL_PESTA, 0, true); + for (i = 0; i < p->max_num_pes; i++) { + stat->pestA[i] = cpu_to_be64(phb4_read_reg_asb(p, PHB_IODA_DATA0)); + stat->pestA[i] |= pPEST[2 * i]; + } + + phb4_ioda_sel(p, IODA3_TBL_PESTB, 0, true); + for (i = 0; i < p->max_num_pes; i++) { + stat->pestB[i] = cpu_to_be64(phb4_read_reg_asb(p, PHB_IODA_DATA0)); + stat->pestB[i] |= pPEST[2 * i + 1]; + } +} + +static void __unused phb4_dump_peltv(struct phb4 *p) +{ + int stride = p->max_num_pes / 64; + uint64_t *tbl = (void *) p->tbl_peltv; + unsigned int pe; + + PHBERR(p, "PELT-V: base addr: %p size: %llx (%d PEs, stride = %d)\n", + tbl, p->tbl_peltv_size, p->max_num_pes, stride); + + for (pe = 0; pe < p->max_num_pes; pe++) { + unsigned int i, j; + uint64_t sum = 0; + + i = pe * stride; + + /* + * Only print an entry if there's bits set in the PE's + * PELT-V entry. There's a few hundred possible PEs and + * generally only a handful will be in use. + */ + + for (j = 0; j < stride; j++) + sum |= tbl[i + j]; + if (!sum) + continue; /* unused PE, skip it */ + + if (p->max_num_pes == 512) { + PHBERR(p, "PELT-V[%03x] = " + "%016llx %016llx %016llx %016llx" + "%016llx %016llx %016llx %016llx\n", pe, + tbl[i + 0], tbl[i + 1], tbl[i + 2], tbl[i + 3], + tbl[i + 4], tbl[i + 5], tbl[i + 6], tbl[i + 7]); + } else if (p->max_num_pes == 256) { + PHBERR(p, "PELT-V[%03x] = " + "%016llx %016llx %016llx %016llx\n", pe, + tbl[i + 0], tbl[i + 1], tbl[i + 2], tbl[i + 3]); + } + } +} + +static void __unused phb4_dump_ioda_table(struct phb4 *p, int table) +{ + const char *name; + int entries, i; + + switch (table) { + case IODA3_TBL_LIST: + name = "LIST"; + entries = 8; + break; + case IODA3_TBL_MIST: + name = "MIST"; + entries = 1024; + break; + case IODA3_TBL_RCAM: + name = "RCAM"; + entries = 128; + break; + case IODA3_TBL_MRT: + name = "MRT"; + entries = 16; + break; + case IODA3_TBL_PESTA: + name = "PESTA"; + entries = 512; + break; + case IODA3_TBL_PESTB: + name = "PESTB"; + entries = 512; + break; + case IODA3_TBL_TVT: + name = "TVT"; + entries = 512; + break; + case IODA3_TBL_TCAM: + name = "TCAM"; + entries = 1024; + break; + case IODA3_TBL_TDR: + name = "TDR"; + entries = 1024; + break; + case IODA3_TBL_MBT: /* special case, see below */ + name = "MBT"; + entries = 64; + break; + case IODA3_TBL_MDT: + name = "MDT"; + entries = 512; + break; + case IODA3_TBL_PEEV: + name = "PEEV"; + entries = 8; + break; + default: + PHBERR(p, "Invalid IODA table %d!\n", table); + return; + } + + PHBERR(p, "Start %s dump (only non-zero entries are printed):\n", name); + + phb4_ioda_sel(p, table, 0, true); + + /* + * Each entry in the MBT is 16 bytes. Every other table has 8 byte + * entries so we special case the MDT to keep the output readable. 
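+ *
+ * (Editor's note: the special case below is the MBT path, which reads
+ * two 64-bit words per entry; every other table, including the MDT,
+ * takes the single-word path.)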
+ */ + if (table == IODA3_TBL_MBT) { + for (i = 0; i < 32; i++) { + uint64_t v1 = phb4_read_reg_asb(p, PHB_IODA_DATA0); + uint64_t v2 = phb4_read_reg_asb(p, PHB_IODA_DATA0); + + if (!v1 && !v2) + continue; + PHBERR(p, "MBT[%03x] = %016llx %016llx\n", i, v1, v2); + } + } else { + for (i = 0; i < entries; i++) { + uint64_t v = phb4_read_reg_asb(p, PHB_IODA_DATA0); + + if (!v) + continue; + PHBERR(p, "%s[%03x] = %016llx\n", name, i, v); + } + } + + PHBERR(p, "End %s dump\n", name); +} + +static void phb4_eeh_dump_regs(struct phb4 *p) +{ + struct OpalIoPhb4ErrorData *s; + uint16_t reg; + unsigned int i; + + if (!verbose_eeh) + return; + + s = zalloc(sizeof(struct OpalIoPhb4ErrorData)); + if (!s) { + PHBERR(p, "Failed to allocate error info !\n"); + return; + } + phb4_read_phb_status(p, s); + + PHBERR(p, " brdgCtl = %08x\n", be32_to_cpu(s->brdgCtl)); + + /* PHB4 cfg regs */ + PHBERR(p, " deviceStatus = %08x\n", be32_to_cpu(s->deviceStatus)); + PHBERR(p, " slotStatus = %08x\n", be32_to_cpu(s->slotStatus)); + PHBERR(p, " linkStatus = %08x\n", be32_to_cpu(s->linkStatus)); + PHBERR(p, " devCmdStatus = %08x\n", be32_to_cpu(s->devCmdStatus)); + PHBERR(p, " devSecStatus = %08x\n", be32_to_cpu(s->devSecStatus)); + PHBERR(p, " rootErrorStatus = %08x\n", be32_to_cpu(s->rootErrorStatus)); + PHBERR(p, " corrErrorStatus = %08x\n", be32_to_cpu(s->corrErrorStatus)); + PHBERR(p, " uncorrErrorStatus = %08x\n", be32_to_cpu(s->uncorrErrorStatus)); + + /* Two non OPAL API registers that are useful */ + phb4_pcicfg_read16(&p->phb, 0, p->ecap + PCICAP_EXP_DEVCTL, ®); + PHBERR(p, " devctl = %08x\n", reg); + phb4_pcicfg_read16(&p->phb, 0, p->ecap + PCICAP_EXP_DEVSTAT, + ®); + PHBERR(p, " devStat = %08x\n", reg); + + /* Byte swap TLP headers so they are the same as the PCIe spec */ + PHBERR(p, " tlpHdr1 = %08x\n", cpu_to_le32(be32_to_cpu(s->tlpHdr1))); + PHBERR(p, " tlpHdr2 = %08x\n", cpu_to_le32(be32_to_cpu(s->tlpHdr2))); + PHBERR(p, " tlpHdr3 = %08x\n", cpu_to_le32(be32_to_cpu(s->tlpHdr3))); + PHBERR(p, " tlpHdr4 = %08x\n", cpu_to_le32(be32_to_cpu(s->tlpHdr4))); + PHBERR(p, " sourceId = %08x\n", be32_to_cpu(s->sourceId)); + PHBERR(p, " nFir = %016llx\n", be64_to_cpu(s->nFir)); + PHBERR(p, " nFirMask = %016llx\n", be64_to_cpu(s->nFirMask)); + PHBERR(p, " nFirWOF = %016llx\n", be64_to_cpu(s->nFirWOF)); + PHBERR(p, " phbPlssr = %016llx\n", be64_to_cpu(s->phbPlssr)); + PHBERR(p, " phbCsr = %016llx\n", be64_to_cpu(s->phbCsr)); + PHBERR(p, " lemFir = %016llx\n", be64_to_cpu(s->lemFir)); + PHBERR(p, " lemErrorMask = %016llx\n", be64_to_cpu(s->lemErrorMask)); + PHBERR(p, " lemWOF = %016llx\n", be64_to_cpu(s->lemWOF)); + PHBERR(p, " phbErrorStatus = %016llx\n", be64_to_cpu(s->phbErrorStatus)); + PHBERR(p, " phbFirstErrorStatus = %016llx\n", be64_to_cpu(s->phbFirstErrorStatus)); + PHBERR(p, " phbErrorLog0 = %016llx\n", be64_to_cpu(s->phbErrorLog0)); + PHBERR(p, " phbErrorLog1 = %016llx\n", be64_to_cpu(s->phbErrorLog1)); + PHBERR(p, " phbTxeErrorStatus = %016llx\n", be64_to_cpu(s->phbTxeErrorStatus)); + PHBERR(p, " phbTxeFirstErrorStatus = %016llx\n", be64_to_cpu(s->phbTxeFirstErrorStatus)); + PHBERR(p, " phbTxeErrorLog0 = %016llx\n", be64_to_cpu(s->phbTxeErrorLog0)); + PHBERR(p, " phbTxeErrorLog1 = %016llx\n", be64_to_cpu(s->phbTxeErrorLog1)); + PHBERR(p, " phbRxeArbErrorStatus = %016llx\n", be64_to_cpu(s->phbRxeArbErrorStatus)); + PHBERR(p, "phbRxeArbFrstErrorStatus = %016llx\n", be64_to_cpu(s->phbRxeArbFirstErrorStatus)); + PHBERR(p, " phbRxeArbErrorLog0 = %016llx\n", be64_to_cpu(s->phbRxeArbErrorLog0)); + PHBERR(p, " 
phbRxeArbErrorLog1 = %016llx\n", be64_to_cpu(s->phbRxeArbErrorLog1)); + PHBERR(p, " phbRxeMrgErrorStatus = %016llx\n", be64_to_cpu(s->phbRxeMrgErrorStatus)); + PHBERR(p, "phbRxeMrgFrstErrorStatus = %016llx\n", be64_to_cpu(s->phbRxeMrgFirstErrorStatus)); + PHBERR(p, " phbRxeMrgErrorLog0 = %016llx\n", be64_to_cpu(s->phbRxeMrgErrorLog0)); + PHBERR(p, " phbRxeMrgErrorLog1 = %016llx\n", be64_to_cpu(s->phbRxeMrgErrorLog1)); + PHBERR(p, " phbRxeTceErrorStatus = %016llx\n", be64_to_cpu(s->phbRxeTceErrorStatus)); + PHBERR(p, "phbRxeTceFrstErrorStatus = %016llx\n", be64_to_cpu(s->phbRxeTceFirstErrorStatus)); + PHBERR(p, " phbRxeTceErrorLog0 = %016llx\n", be64_to_cpu(s->phbRxeTceErrorLog0)); + PHBERR(p, " phbRxeTceErrorLog1 = %016llx\n", be64_to_cpu(s->phbRxeTceErrorLog1)); + PHBERR(p, " phbPblErrorStatus = %016llx\n", be64_to_cpu(s->phbPblErrorStatus)); + PHBERR(p, " phbPblFirstErrorStatus = %016llx\n", be64_to_cpu(s->phbPblFirstErrorStatus)); + PHBERR(p, " phbPblErrorLog0 = %016llx\n", be64_to_cpu(s->phbPblErrorLog0)); + PHBERR(p, " phbPblErrorLog1 = %016llx\n", be64_to_cpu(s->phbPblErrorLog1)); + PHBERR(p, " phbPcieDlpErrorLog1 = %016llx\n", be64_to_cpu(s->phbPcieDlpErrorLog1)); + PHBERR(p, " phbPcieDlpErrorLog2 = %016llx\n", be64_to_cpu(s->phbPcieDlpErrorLog2)); + PHBERR(p, " phbPcieDlpErrorStatus = %016llx\n", be64_to_cpu(s->phbPcieDlpErrorStatus)); + + PHBERR(p, " phbRegbErrorStatus = %016llx\n", be64_to_cpu(s->phbRegbErrorStatus)); + PHBERR(p, " phbRegbFirstErrorStatus = %016llx\n", be64_to_cpu(s->phbRegbFirstErrorStatus)); + PHBERR(p, " phbRegbErrorLog0 = %016llx\n", be64_to_cpu(s->phbRegbErrorLog0)); + PHBERR(p, " phbRegbErrorLog1 = %016llx\n", be64_to_cpu(s->phbRegbErrorLog1)); + + for (i = 0; i < p->max_num_pes; i++) { + if (!s->pestA[i] && !s->pestB[i]) + continue; + PHBERR(p, " PEST[%03x] = %016llx %016llx\n", + i, be64_to_cpu(s->pestA[i]), be64_to_cpu(s->pestB[i])); + } + free(s); +} + +static int64_t phb4_set_pe(struct phb *phb, + uint64_t pe_number, + uint64_t bdfn, + uint8_t bcompare, + uint8_t dcompare, + uint8_t fcompare, + uint8_t action) +{ + struct phb4 *p = phb_to_phb4(phb); + uint64_t mask, idx; + + /* Sanity check */ + if (action != OPAL_MAP_PE && action != OPAL_UNMAP_PE) + return OPAL_PARAMETER; + if (pe_number >= p->num_pes || bdfn > 0xffff || + bcompare > OpalPciBusAll || + dcompare > OPAL_COMPARE_RID_DEVICE_NUMBER || + fcompare > OPAL_COMPARE_RID_FUNCTION_NUMBER) + return OPAL_PARAMETER; + + /* match everything by default */ + mask = 0; + + /* Figure out the RID range */ + if (bcompare != OpalPciBusAny) + mask = ((0x1 << (bcompare + 1)) - 1) << (15 - bcompare); + + if (dcompare == OPAL_COMPARE_RID_DEVICE_NUMBER) + mask |= 0xf8; + + if (fcompare == OPAL_COMPARE_RID_FUNCTION_NUMBER) + mask |= 0x7; + + if (action == OPAL_UNMAP_PE) + pe_number = PHB4_RESERVED_PE_NUM(p); + + /* Map or unmap the RTT range */ + for (idx = 0; idx < RTT_TABLE_ENTRIES; idx++) + if ((idx & mask) == (bdfn & mask)) + p->tbl_rtt[idx] = cpu_to_be16(pe_number); + + /* Invalidate the RID Translation Cache (RTC) inside the PHB */ + out_be64(p->regs + PHB_RTC_INVALIDATE, PHB_RTC_INVALIDATE_ALL); + + return OPAL_SUCCESS; +} + +static int64_t phb4_set_peltv(struct phb *phb, + uint32_t parent_pe, + uint32_t child_pe, + uint8_t state) +{ + struct phb4 *p = phb_to_phb4(phb); + uint32_t idx, mask; + + /* Sanity check */ + if (parent_pe >= p->num_pes || child_pe >= p->num_pes) + return OPAL_PARAMETER; + + /* Find index for parent PE */ + idx = parent_pe * (p->max_num_pes / 8); + idx += (child_pe / 8); + mask = 0x1 
<< (7 - (child_pe % 8)); + + if (state) + p->tbl_peltv[idx] |= mask; + else + p->tbl_peltv[idx] &= ~mask; + + return OPAL_SUCCESS; +} + +static void phb4_prepare_link_change(struct pci_slot *slot, bool is_up) +{ + struct phb4 *p = phb_to_phb4(slot->phb); + uint32_t reg32; + + p->has_link = is_up; + + if (is_up) { + /* Clear AER receiver error status */ + phb4_pcicfg_write32(&p->phb, 0, p->aercap + + PCIECAP_AER_CE_STATUS, + PCIECAP_AER_CE_RECVR_ERR); + /* Unmask receiver error status in AER */ + phb4_pcicfg_read32(&p->phb, 0, p->aercap + + PCIECAP_AER_CE_MASK, ®32); + reg32 &= ~PCIECAP_AER_CE_RECVR_ERR; + phb4_pcicfg_write32(&p->phb, 0, p->aercap + + PCIECAP_AER_CE_MASK, reg32); + + /* Don't block PCI-CFG */ + p->flags &= ~PHB4_CFG_BLOCKED; + + /* Re-enable link down errors */ + out_be64(p->regs + PHB_PCIE_MISC_STRAP, + 0x0000060000000000ull); + + /* Re-enable error status indicators that trigger irqs */ + out_be64(p->regs + PHB_REGB_ERR_INF_ENABLE, + 0x2130006efca8bc00ull); + out_be64(p->regs + PHB_REGB_ERR_ERC_ENABLE, + 0x0080000000000000ull); + out_be64(p->regs + PHB_REGB_ERR_FAT_ENABLE, + 0xde0fff91035743ffull); + + } else { + /* Mask AER receiver error */ + phb4_pcicfg_read32(&p->phb, 0, p->aercap + + PCIECAP_AER_CE_MASK, ®32); + reg32 |= PCIECAP_AER_CE_RECVR_ERR; + phb4_pcicfg_write32(&p->phb, 0, p->aercap + + PCIECAP_AER_CE_MASK, reg32); + + /* Clear error link enable & error link down kill enable */ + out_be64(p->regs + PHB_PCIE_MISC_STRAP, 0); + + /* Disable all error status indicators that trigger irqs */ + out_be64(p->regs + PHB_REGB_ERR_INF_ENABLE, 0); + out_be64(p->regs + PHB_REGB_ERR_ERC_ENABLE, 0); + out_be64(p->regs + PHB_REGB_ERR_FAT_ENABLE, 0); + + /* Block PCI-CFG access */ + p->flags |= PHB4_CFG_BLOCKED; + } +} + +static int64_t phb4_get_presence_state(struct pci_slot *slot, uint8_t *val) +{ + struct phb4 *p = phb_to_phb4(slot->phb); + uint64_t hps, dtctl; + + /* Test for PHB in error state ? */ + if (p->broken) + return OPAL_HARDWARE; + + /* Check hotplug status */ + hps = in_be64(p->regs + PHB_PCIE_HOTPLUG_STATUS); + if (!(hps & PHB_PCIE_HPSTAT_PRESENCE)) { + *val = OPAL_PCI_SLOT_PRESENT; + } else { + /* + * If it says not present but link is up, then we assume + * we are on a broken simulation environment and still + * return a valid presence. Otherwise, not present. 
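+ *
+ * (Editor's note, descriptive only: "link is up" here means the
+ * PHB_PCIE_DLP_TL_LINKACT bit in the DLP training control register,
+ * which is checked just below once the hotplug status has signalled
+ * no presence.)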
+ */ + dtctl = in_be64(p->regs + PHB_PCIE_DLP_TRAIN_CTL); + if (dtctl & PHB_PCIE_DLP_TL_LINKACT) { + PHBERR(p, "Presence detect 0 but link set !\n"); + *val = OPAL_PCI_SLOT_PRESENT; + } else { + *val = OPAL_PCI_SLOT_EMPTY; + } + } + + return OPAL_SUCCESS; +} + +static int64_t phb4_get_link_info(struct pci_slot *slot, uint8_t *speed, + uint8_t *width) +{ + struct phb4 *p = phb_to_phb4(slot->phb); + uint64_t reg; + uint16_t state; + int64_t rc; + uint8_t s; + + /* Link is up, let's find the actual speed */ + reg = in_be64(p->regs + PHB_PCIE_DLP_TRAIN_CTL); + if (!(reg & PHB_PCIE_DLP_TL_LINKACT)) { + *width = 0; + if (speed) + *speed = 0; + return OPAL_SUCCESS; + } + + rc = phb4_pcicfg_read16(&p->phb, 0, + p->ecap + PCICAP_EXP_LSTAT, &state); + if (rc != OPAL_SUCCESS) { + PHBERR(p, "%s: Error %lld getting link state\n", __func__, rc); + return OPAL_HARDWARE; + } + + if (state & PCICAP_EXP_LSTAT_DLLL_ACT) { + *width = ((state & PCICAP_EXP_LSTAT_WIDTH) >> 4); + s = state & PCICAP_EXP_LSTAT_SPEED; + } else { + *width = 0; + s = 0; + } + + if (speed) + *speed = s; + + return OPAL_SUCCESS; +} + +static int64_t phb4_get_link_state(struct pci_slot *slot, uint8_t *val) +{ + return phb4_get_link_info(slot, NULL, val); +} + +static int64_t phb4_retry_state(struct pci_slot *slot) +{ + struct phb4 *p = phb_to_phb4(slot->phb); + + /* Mark link as down */ + phb4_prepare_link_change(slot, false); + + /* Last attempt to activate link */ + if (slot->link_retries == 1) { + if (slot->state == PHB4_SLOT_LINK_WAIT) { + PHBERR(p, "Falling back to GEN1 training\n"); + p->max_link_speed = 1; + } + } + + if (!slot->link_retries--) { + switch (slot->state) { + case PHB4_SLOT_LINK_WAIT_ELECTRICAL: + PHBERR(p, "Presence detected but no electrical link\n"); + break; + case PHB4_SLOT_LINK_WAIT: + PHBERR(p, "Electrical link detected but won't train\n"); + break; + case PHB4_SLOT_LINK_STABLE: + PHBERR(p, "Linked trained but was degraded or unstable\n"); + break; + default: + PHBERR(p, "Unknown link issue\n"); + } + return OPAL_HARDWARE; + } + + pci_slot_set_state(slot, PHB4_SLOT_CRESET_START); + return pci_slot_set_sm_timeout(slot, msecs_to_tb(1)); +} + +static uint64_t phb4_train_info(struct phb4 *p, uint64_t reg, unsigned long dt) +{ + uint64_t ltssm_state = GETFIELD(PHB_PCIE_DLP_LTSSM_TRC, reg); + char s[80]; + + snprintf(s, sizeof(s), "TRACE:0x%016llx % 2lims", + reg, tb_to_msecs(dt)); + + if (reg & PHB_PCIE_DLP_TL_LINKACT) + snprintf(s, sizeof(s), "%s trained ", s); + else if (reg & PHB_PCIE_DLP_TRAINING) + snprintf(s, sizeof(s), "%s training", s); + else if (reg & PHB_PCIE_DLP_INBAND_PRESENCE) + snprintf(s, sizeof(s), "%s presence", s); + else + snprintf(s, sizeof(s), "%s ", s); + + snprintf(s, sizeof(s), "%s GEN%lli:x%02lli:", s, + GETFIELD(PHB_PCIE_DLP_LINK_SPEED, reg), + GETFIELD(PHB_PCIE_DLP_LINK_WIDTH, reg)); + + switch (ltssm_state) { + case PHB_PCIE_DLP_LTSSM_RESET: + snprintf(s, sizeof(s), "%sreset", s); + break; + case PHB_PCIE_DLP_LTSSM_DETECT: + snprintf(s, sizeof(s), "%sdetect", s); + break; + case PHB_PCIE_DLP_LTSSM_POLLING: + snprintf(s, sizeof(s), "%spolling", s); + break; + case PHB_PCIE_DLP_LTSSM_CONFIG: + snprintf(s, sizeof(s), "%sconfig", s); + break; + case PHB_PCIE_DLP_LTSSM_L0: + snprintf(s, sizeof(s), "%sL0", s); + break; + case PHB_PCIE_DLP_LTSSM_REC: + snprintf(s, sizeof(s), "%srecovery", s); + break; + case PHB_PCIE_DLP_LTSSM_L1: + snprintf(s, sizeof(s), "%sL1", s); + break; + case PHB_PCIE_DLP_LTSSM_L2: + snprintf(s, sizeof(s), "%sL2", s); + break; + case PHB_PCIE_DLP_LTSSM_HOTRESET: + 
snprintf(s, sizeof(s), "%shotreset", s); + break; + case PHB_PCIE_DLP_LTSSM_DISABLED: + snprintf(s, sizeof(s), "%sdisabled", s); + break; + case PHB_PCIE_DLP_LTSSM_LOOPBACK: + snprintf(s, sizeof(s), "%sloopback", s); + break; + default: + snprintf(s, sizeof(s), "%sunvalid", s); + } + PHBNOTICE(p, "%s\n", s); + + return ltssm_state; +} + +static void phb4_dump_pec_err_regs(struct phb4 *p) +{ + uint64_t nfir_p_wof, nfir_n_wof, err_aib; + uint64_t err_rpt0, err_rpt1; + + /* Read the PCI and NEST FIRs and dump them. Also cache PCI/NEST FIRs */ + xscom_read(p->chip_id, + p->pci_stk_xscom + XPEC_PCI_STK_PCI_FIR, &p->pfir_cache); + xscom_read(p->chip_id, + p->pci_stk_xscom + XPEC_PCI_STK_PCI_FIR_WOF, &nfir_p_wof); + xscom_read(p->chip_id, + p->pe_stk_xscom + XPEC_NEST_STK_PCI_NFIR, &p->nfir_cache); + xscom_read(p->chip_id, + p->pe_stk_xscom + XPEC_NEST_STK_PCI_NFIR_WOF, &nfir_n_wof); + xscom_read(p->chip_id, + p->pe_stk_xscom + XPEC_NEST_STK_ERR_RPT0, &err_rpt0); + xscom_read(p->chip_id, + p->pe_stk_xscom + XPEC_NEST_STK_ERR_RPT1, &err_rpt1); + xscom_read(p->chip_id, + p->pci_stk_xscom + XPEC_PCI_STK_PBAIB_ERR_REPORT, &err_aib); + + PHBERR(p, " PCI FIR=%016llx\n", p->pfir_cache); + PHBERR(p, " PCI FIR WOF=%016llx\n", nfir_p_wof); + PHBERR(p, " NEST FIR=%016llx\n", p->nfir_cache); + PHBERR(p, " NEST FIR WOF=%016llx\n", nfir_n_wof); + PHBERR(p, " ERR RPT0=%016llx\n", err_rpt0); + PHBERR(p, " ERR RPT1=%016llx\n", err_rpt1); + PHBERR(p, " AIB ERR=%016llx\n", err_aib); +} + +static void phb4_dump_capp_err_regs(struct phb4 *p) +{ + uint64_t fir, apc_master_err, snoop_err, transport_err; + uint64_t tlbi_err, capp_err_status; + uint64_t offset = PHB4_CAPP_REG_OFFSET(p); + + xscom_read(p->chip_id, CAPP_FIR + offset, &fir); + xscom_read(p->chip_id, CAPP_APC_MASTER_ERR_RPT + offset, + &apc_master_err); + xscom_read(p->chip_id, CAPP_SNOOP_ERR_RTP + offset, &snoop_err); + xscom_read(p->chip_id, CAPP_TRANSPORT_ERR_RPT + offset, &transport_err); + xscom_read(p->chip_id, CAPP_TLBI_ERR_RPT + offset, &tlbi_err); + xscom_read(p->chip_id, CAPP_ERR_STATUS_CTRL + offset, &capp_err_status); + + PHBERR(p, " CAPP FIR=%016llx\n", fir); + PHBERR(p, "CAPP APC MASTER ERR=%016llx\n", apc_master_err); + PHBERR(p, " CAPP SNOOP ERR=%016llx\n", snoop_err); + PHBERR(p, " CAPP TRANSPORT ERR=%016llx\n", transport_err); + PHBERR(p, " CAPP TLBI ERR=%016llx\n", tlbi_err); + PHBERR(p, " CAPP ERR STATUS=%016llx\n", capp_err_status); +} + +/* Check if AIB is fenced via PBCQ NFIR */ +static bool phb4_fenced(struct phb4 *p) +{ + + /* Already fenced ? */ + if (p->flags & PHB4_AIB_FENCED) + return true; + + /* + * An all 1's from the PHB indicates a PHB freeze/fence. We + * don't really differenciate them at this point. + */ + if (in_be64(p->regs + PHB_CPU_LOADSTORE_STATUS)!= 0xfffffffffffffffful) + return false; + + /* Mark ourselves fenced */ + p->flags |= PHB4_AIB_FENCED; + + PHBERR(p, "PHB Freeze/Fence detected !\n"); + phb4_dump_pec_err_regs(p); + + /* + * dump capp error registers in case phb was fenced due to capp. 
+ * Expect p->nfir_cache already updated in phb4_dump_pec_err_regs() + */ + if (p->nfir_cache & XPEC_NEST_STK_PCI_NFIR_CXA_PE_CAPP) + phb4_dump_capp_err_regs(p); + + phb4_eeh_dump_regs(p); + + return true; +} + +static bool phb4_check_reg(struct phb4 *p, uint64_t reg) +{ + if (reg == 0xffffffffffffffffUL) + return !phb4_fenced(p); + return true; +} + +static void phb4_get_info(struct phb *phb, uint16_t bdfn, uint8_t *speed, + uint8_t *width) +{ + int32_t ecap; + uint32_t cap; + + ecap = pci_find_cap(phb, bdfn, PCI_CFG_CAP_ID_EXP); + pci_cfg_read32(phb, bdfn, ecap + PCICAP_EXP_LCAP, &cap); + *width = (cap & PCICAP_EXP_LCAP_MAXWDTH) >> 4; + *speed = cap & PCICAP_EXP_LCAP_MAXSPD; +} + +#define PVR_POWER9_CUMULUS 0x00002000 + +static bool phb4_chip_retry_workaround(void) +{ + unsigned int pvr; + + if (pci_retry_all) + return true; + + /* Chips that need this retry are: + * - CUMULUS DD1.0 + * - NIMBUS DD2.0 (and DD1.0, but it is unsupported so no check). + */ + pvr = mfspr(SPR_PVR); + if (pvr & PVR_POWER9_CUMULUS) { + if ((PVR_VERS_MAJ(pvr) == 1) && (PVR_VERS_MIN(pvr) == 0)) + return true; + } else { /* NIMBUS */ + if ((PVR_VERS_MAJ(pvr) == 2) && (PVR_VERS_MIN(pvr) == 0)) + return true; + } + return false; +} + +struct pci_card_id { + uint16_t vendor; + uint16_t device; +}; + +static struct pci_card_id retry_allowlist[] = { + { 0x1000, 0x005d }, /* LSI Logic MegaRAID SAS-3 3108 */ + { 0x1000, 0x00c9 }, /* LSI MPT SAS-3 */ + { 0x104c, 0x8241 }, /* TI xHCI USB */ + { 0x1077, 0x2261 }, /* QLogic ISP2722-based 16/32Gb FC */ + { 0x10b5, 0x8725 }, /* PLX Switch: p9dsu, witherspoon */ + { 0x10b5, 0x8748 }, /* PLX Switch: ZZ */ + { 0x11f8, 0xf117 }, /* PMC-Sierra/MicroSemi NV1604 */ + { 0x15b3, 0x1013 }, /* Mellanox ConnectX-4 */ + { 0x15b3, 0x1017 }, /* Mellanox ConnectX-5 */ + { 0x15b3, 0x1019 }, /* Mellanox ConnectX-5 Ex */ + { 0x1a03, 0x1150 }, /* ASPEED AST2500 Switch */ + { 0x8086, 0x10fb }, /* Intel x520 10G Eth */ + { 0x9005, 0x028d }, /* MicroSemi PM8069 */ +}; + +#define VENDOR(vdid) ((vdid) & 0xffff) +#define DEVICE(vdid) (((vdid) >> 16) & 0xffff) + +static bool phb4_adapter_in_allowlist(uint32_t vdid) +{ + int i; + + if (pci_retry_all) + return true; + + for (i = 0; i < ARRAY_SIZE(retry_allowlist); i++) + if ((retry_allowlist[i].vendor == VENDOR(vdid)) && + (retry_allowlist[i].device == DEVICE(vdid))) + return true; + + return false; +} + +static struct pci_card_id lane_eq_disable[] = { + { 0x10de, 0x17fd }, /* Nvidia GM200GL [Tesla M40] */ + { 0x10de, 0x1db4 }, /* Nvidia GV100 */ +}; + +static bool phb4_lane_eq_retry_allowlist(uint32_t vdid) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(lane_eq_disable); i++) + if ((lane_eq_disable[i].vendor == VENDOR(vdid)) && + (lane_eq_disable[i].device == DEVICE(vdid))) + return true; + return false; +} + +static void phb4_lane_eq_change(struct phb4 *p, uint32_t vdid) +{ + p->lane_eq_en = !phb4_lane_eq_retry_allowlist(vdid); +} + +static bool phb4_link_optimal(struct pci_slot *slot, uint32_t *vdid) +{ + struct phb4 *p = phb_to_phb4(slot->phb); + uint64_t reg; + uint32_t id; + uint16_t bdfn, lane_errs; + uint8_t trained_speed, dev_speed, target_speed, rx_errs; + uint8_t trained_width, dev_width, target_width; + bool optimal_speed, optimal_width, optimal, retry_enabled, rx_err_ok; + + + /* Current trained state */ + phb4_get_link_info(slot, &trained_speed, &trained_width); + + /* Get device capability */ + bdfn = 0x0100; /* bus=1 dev=0 device=0 */ + /* Since this is the first access, we need to wait for CRS */ + if (!pci_wait_crs(slot->phb, bdfn , &id)) + 
return true; + phb4_get_info(slot->phb, bdfn, &dev_speed, &dev_width); + + /* Work out if we are optimally trained */ + target_speed = MIN(p->max_link_speed, dev_speed); + optimal_speed = (trained_speed >= target_speed); + target_width = MIN(p->max_link_width, dev_width); + optimal_width = (trained_width >= target_width); + optimal = optimal_width && optimal_speed; + retry_enabled = (phb4_chip_retry_workaround() && + phb4_adapter_in_allowlist(id)) || + phb4_lane_eq_retry_allowlist(id); + reg = in_be64(p->regs + PHB_PCIE_DLP_ERR_COUNTERS); + rx_errs = GETFIELD(PHB_PCIE_DLP_RX_ERR_CNT, reg); + rx_err_ok = (rx_errs < rx_err_max); + reg = in_be64(p->regs + PHB_PCIE_DLP_ERR_STATUS); + lane_errs = GETFIELD(PHB_PCIE_DLP_LANE_ERR, reg); + + PHBDBG(p, "LINK: Card [%04x:%04x] %s Retry:%s\n", VENDOR(id), + DEVICE(id), optimal ? "Optimal" : "Degraded", + retry_enabled ? "enabled" : "disabled"); + PHBDBG(p, "LINK: Speed Train:GEN%i PHB:GEN%i DEV:GEN%i%s\n", + trained_speed, p->max_link_speed, dev_speed, + optimal_speed ? "" : " *"); + PHBDBG(p, "LINK: Width Train:x%02i PHB:x%02i DEV:x%02i%s\n", + trained_width, p->max_link_width, dev_width, + optimal_width ? "" : " *"); + PHBDBG(p, "LINK: RX Errors Now:%i Max:%i Lane:0x%04x%s\n", + rx_errs, rx_err_max, lane_errs, rx_err_ok ? "" : " *"); + + if (vdid) + *vdid = id; + + /* Always do RX error retry irrespective of chip and card */ + if (!rx_err_ok) + return false; + + if (!retry_enabled) + return true; + + return optimal; +} + +/* + * This is a trace function to watch what's happening duing pcie link + * training. If any errors are detected it simply returns so the + * normal code can deal with it. + */ +static void phb4_link_trace(struct phb4 *p, uint64_t target_state, int max_ms) +{ + unsigned long now, end, start = mftb(), state = 0; + uint64_t trwctl, reg, reglast = -1; + bool enabled; + + /* + * Enable the DLP trace outputs. If we don't the LTSSM state in + * PHB_PCIE_DLP_TRAIN_CTL won't be updated and always reads zero. + */ + trwctl = phb4_read_reg(p, PHB_PCIE_DLP_TRWCTL); + enabled = !!(trwctl & PHB_PCIE_DLP_TRWCTL_EN); + if (!enabled) { + phb4_write_reg(p, PHB_PCIE_DLP_TRWCTL, + trwctl | PHB_PCIE_DLP_TRWCTL_EN); + } + + end = start + msecs_to_tb(max_ms); + now = start; + + do { + reg = in_be64(p->regs + PHB_PCIE_DLP_TRAIN_CTL); + if (reg != reglast) + state = phb4_train_info(p, reg, now - start); + reglast = reg; + + if (!phb4_check_reg(p, reg)) { + PHBNOTICE(p, "TRACE: PHB fenced.\n"); + goto out; + } + + if (tb_compare(now, end) == TB_AAFTERB) { + PHBNOTICE(p, "TRACE: Timed out after %dms\n", max_ms); + goto out; + } + + now = mftb(); + } while (state != target_state); + + PHBNOTICE(p, "TRACE: Reached target state\n"); + +out: + /* + * The trace enable bit is a clock gate for the tracing logic. Turn + * it off to save power if we're not using it otherwise. + */ + if (!enabled) + phb4_write_reg(p, PHB_PCIE_DLP_TRWCTL, trwctl); +} + +/* + * This helper is called repeatedly by the host sync notifier mechanism, which + * relies on the kernel to regularly poll the OPAL_SYNC_HOST_REBOOT call as it + * shuts down. 
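+ * Returning true tells that machinery this notifier is finished;
+ * while the slot state machine below still reports an in-progress
+ * (positive) value we keep returning false so we get polled again
+ * on the next OPAL_SYNC_HOST_REBOOT call.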
+ */ +static bool phb4_host_sync_reset(void *data) +{ + struct phb4 *p = (struct phb4 *)data; + struct phb *phb = &p->phb; + int64_t rc = 0; + + /* Make sure no-one modifies the phb flags while we are active */ + phb_lock(phb); + + /* Make sure CAPP is attached to the PHB */ + if (p->capp) + /* Call phb ops to disable capi */ + rc = phb->ops->set_capi_mode(phb, OPAL_PHB_CAPI_MODE_PCIE, + p->capp->attached_pe); + else + rc = OPAL_SUCCESS; + + /* Continue kicking state-machine if in middle of a mode transition */ + if (rc == OPAL_BUSY) + rc = phb->slot->ops.run_sm(phb->slot); + + phb_unlock(phb); + + return rc <= OPAL_SUCCESS; +} + +/* + * Notification from the pci-core that a pci slot state machine completed. + * We use this callback to mark the CAPP disabled if we were waiting for it. + */ +static int64_t phb4_slot_sm_run_completed(struct pci_slot *slot, uint64_t err) +{ + struct phb4 *p = phb_to_phb4(slot->phb); + + /* Check if we are disabling the capp */ + if (p->flags & PHB4_CAPP_DISABLE) { + + /* Unset struct capp so that we dont fall into a creset loop */ + p->flags &= ~(PHB4_CAPP_DISABLE); + p->capp->phb = NULL; + p->capp->attached_pe = phb4_get_reserved_pe_number(&p->phb); + + /* Remove the host sync notifier is we are done.*/ + opal_del_host_sync_notifier(phb4_host_sync_reset, p); + if (err) { + /* Force a CEC ipl reboot */ + disable_fast_reboot("CAPP: reset failed"); + PHBERR(p, "CAPP: Unable to reset. Error=%lld\n", err); + } else { + PHBINF(p, "CAPP: reset complete\n"); + } + } + + return OPAL_SUCCESS; +} + +static int64_t phb4_poll_link(struct pci_slot *slot) +{ + struct phb4 *p = phb_to_phb4(slot->phb); + uint64_t reg; + uint32_t vdid; + + switch (slot->state) { + case PHB4_SLOT_NORMAL: + case PHB4_SLOT_LINK_START: + PHBDBG(p, "LINK: Start polling\n"); + slot->retries = PHB4_LINK_ELECTRICAL_RETRIES; + pci_slot_set_state(slot, PHB4_SLOT_LINK_WAIT_ELECTRICAL); + /* Polling early here has no chance of a false positive */ + return pci_slot_set_sm_timeout(slot, msecs_to_tb(1)); + case PHB4_SLOT_LINK_WAIT_ELECTRICAL: + /* + * Wait for the link electrical connection to be + * established (shorter timeout). 
This allows us to + * workaround spurrious presence detect on some machines + * without waiting 10s each time + * + * Note: We *also* check for the full link up bit here + * because simics doesn't seem to implement the electrical + * link bit at all + */ + reg = in_be64(p->regs + PHB_PCIE_DLP_TRAIN_CTL); + if (!phb4_check_reg(p, reg)) { + PHBERR(p, "PHB fence waiting for electrical link\n"); + return phb4_retry_state(slot); + } + + if (reg & (PHB_PCIE_DLP_INBAND_PRESENCE | + PHB_PCIE_DLP_TL_LINKACT)) { + PHBDBG(p, "LINK: Electrical link detected\n"); + pci_slot_set_state(slot, PHB4_SLOT_LINK_WAIT); + slot->retries = PHB4_LINK_WAIT_RETRIES; + /* No wait here since already have an elec link */ + return pci_slot_set_sm_timeout(slot, msecs_to_tb(1)); + } + + if (slot->retries-- == 0) { + PHBDBG(p, "LINK: No in-band presence\n"); + return OPAL_SUCCESS; + } + /* Retry */ + return pci_slot_set_sm_timeout(slot, msecs_to_tb(10)); + case PHB4_SLOT_LINK_WAIT: + reg = in_be64(p->regs + PHB_PCIE_DLP_TRAIN_CTL); + if (!phb4_check_reg(p, reg)) { + PHBERR(p, "LINK: PHB fence waiting for link training\n"); + return phb4_retry_state(slot); + } + if (reg & PHB_PCIE_DLP_TL_LINKACT) { + PHBDBG(p, "LINK: Link is up\n"); + phb4_prepare_link_change(slot, true); + pci_slot_set_state(slot, PHB4_SLOT_LINK_STABLE); + return pci_slot_set_sm_timeout(slot, secs_to_tb(1)); + } + + if (slot->retries-- == 0) { + PHBERR(p, "LINK: Timeout waiting for link up\n"); + PHBDBG(p, "LINK: DLP train control: 0x%016llx\n", reg); + return phb4_retry_state(slot); + } + /* Retry */ + return pci_slot_set_sm_timeout(slot, msecs_to_tb(10)); + case PHB4_SLOT_LINK_STABLE: + /* Sanity check link */ + if (phb4_fenced(p)) { + PHBERR(p, "LINK: PHB fenced waiting for stabilty\n"); + return phb4_retry_state(slot); + } + reg = in_be64(p->regs + PHB_PCIE_DLP_TRAIN_CTL); + if (!phb4_check_reg(p, reg)) { + PHBERR(p, "LINK: PHB fence reading training control\n"); + return phb4_retry_state(slot); + } + if (reg & PHB_PCIE_DLP_TL_LINKACT) { + PHBDBG(p, "LINK: Link is stable\n"); + if (!phb4_link_optimal(slot, &vdid)) { + PHBDBG(p, "LINK: Link degraded\n"); + if (slot->link_retries) { + phb4_lane_eq_change(p, vdid); + return phb4_retry_state(slot); + } + /* + * Link is degraded but no more retries, so + * settle for what we have :-( + */ + PHBERR(p, "LINK: Degraded but no more retries\n"); + } + pci_restore_slot_bus_configs(slot); + pci_slot_set_state(slot, PHB4_SLOT_NORMAL); + return OPAL_SUCCESS; + } + PHBERR(p, "LINK: Went down waiting for stabilty\n"); + PHBDBG(p, "LINK: DLP train control: 0x%016llx\n", reg); + return phb4_retry_state(slot); + default: + PHBERR(p, "LINK: Unexpected slot state %08x\n", + slot->state); + } + + pci_slot_set_state(slot, PHB4_SLOT_NORMAL); + return OPAL_HARDWARE; +} + +static unsigned int phb4_get_max_link_speed(struct phb4 *p, struct dt_node *np) +{ + unsigned int max_link_speed, hw_max_link_speed; + struct proc_chip *chip; + chip = get_chip(p->chip_id); + + hw_max_link_speed = 4; + if (is_phb5() && (p->index == 0 || p->index == 3)) + hw_max_link_speed = 5; + + /* Priority order: NVRAM -> dt -> GEN3 dd2.00 -> hw default */ + max_link_speed = hw_max_link_speed; + if (p->rev == PHB4_REV_NIMBUS_DD20 && + ((0xf & chip->ec_level) == 0) && chip->ec_rev == 0) + max_link_speed = 3; + if (np) { + if (dt_has_node_property(np, "ibm,max-link-speed", NULL)) { + max_link_speed = dt_prop_get_u32(np, "ibm,max-link-speed"); + p->dt_max_link_speed = max_link_speed; + } + else { + p->dt_max_link_speed = 0; + } + } + else { + if 
(p->dt_max_link_speed > 0) { + max_link_speed = p->dt_max_link_speed; + } + } + if (pcie_max_link_speed) + max_link_speed = pcie_max_link_speed; + if (max_link_speed > hw_max_link_speed) + max_link_speed = hw_max_link_speed; + + return max_link_speed; +} + +static unsigned int __phb4_get_max_link_width(struct phb4 *p) +{ + uint64_t addr, reg; + unsigned int lane_config, width = 16; + + /* + * On P9, only PEC2 is configurable (no-/bi-/tri-furcation) + */ + switch (p->pec) { + case 0: + width = 16; + break; + case 1: + width = 8; + break; + case 2: + addr = XPEC_P9_PCI_CPLT_CONF1 + 2 * XPEC_PCI_CPLT_OFFSET; + xscom_read(p->chip_id, addr, ®); + lane_config = GETFIELD(XPEC_P9_PCI_LANE_CFG, reg); + + if (lane_config == 0b10 && p->index >= 4) + width = 4; + else + width = 8; + } + return width; +} + +static unsigned int __phb5_get_max_link_width(struct phb4 *p) +{ + uint64_t addr, reg; + unsigned int lane_config, width = 16; + + /* + * On P10, the 2 PECs are identical and each can have a + * different furcation, so we always need to check the PEC + * config + */ + addr = XPEC_P10_PCI_CPLT_CONF1 + p->pec * XPEC_PCI_CPLT_OFFSET; + xscom_read(p->chip_id, addr, ®); + lane_config = GETFIELD(XPEC_P10_PCI_LANE_CFG, reg); + + switch (lane_config) { + case 0b00: + width = 16; + break; + case 0b01: + width = 8; + break; + case 0b10: + if (p->index == 0 || p->index == 3) + width = 8; + else + width = 4; + break; + default: + PHBERR(p, "Unexpected PEC lane config value %#x\n", + lane_config); + } + return width; +} + +static unsigned int phb4_get_max_link_width(struct phb4 *p) +{ + if (is_phb5()) + return __phb5_get_max_link_width(p); + else + return __phb4_get_max_link_width(p); +} + +static void phb4_assert_perst(struct pci_slot *slot, bool assert) +{ + struct phb4 *p = phb_to_phb4(slot->phb); + uint16_t linkctl; + uint64_t reg; + + /* + * Disable the link before asserting PERST. The Cursed RAID card + * in ozrom1 (9005:028c) has problems coming back if PERST is asserted + * while link is active. To work around the problem we assert the link + * disable bit before asserting PERST. Asserting the secondary reset + * bit in the btctl register also works. 
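+	 * In the code below the link-disable bit lives in the root port's
+	 * PCIe Link Control register (PCICAP_EXP_LCTL_LINK_DIS) while
+	 * PERST_N itself is driven from PHB_PCIE_CRESET, so both are
+	 * updated together on every assert/deassert.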
+ */ + phb4_pcicfg_read16(&p->phb, 0, p->ecap + PCICAP_EXP_LCTL, &linkctl); + reg = phb4_read_reg(p, PHB_PCIE_CRESET); + + if (assert) { + linkctl |= PCICAP_EXP_LCTL_LINK_DIS; + reg &= ~PHB_PCIE_CRESET_PERST_N; + } else { + linkctl &= ~PCICAP_EXP_LCTL_LINK_DIS; + reg |= PHB_PCIE_CRESET_PERST_N; + } + + phb4_write_reg(p, PHB_PCIE_CRESET, reg); + phb4_pcicfg_write16(&p->phb, 0, p->ecap + PCICAP_EXP_LCTL, linkctl); +} + +static void set_sys_disable_detect(struct phb4 *p, bool set) +{ + uint64_t val; + + val = in_be64(p->regs + PHB_PCIE_DLP_TRAIN_CTL); + if (set) + val |= PHB_PCIE_DLP_SYS_DISABLEDETECT; + else + val &= ~PHB_PCIE_DLP_SYS_DISABLEDETECT; + out_be64(p->regs + PHB_PCIE_DLP_TRAIN_CTL, val); +} + +static int64_t phb4_hreset(struct pci_slot *slot) +{ + struct phb4 *p = phb_to_phb4(slot->phb); + uint16_t brctl; + uint8_t presence = 1; + + switch (slot->state) { + case PHB4_SLOT_NORMAL: + PHBDBG(p, "HRESET: Starts\n"); + if (slot->ops.get_presence_state) + slot->ops.get_presence_state(slot, &presence); + if (!presence) { + PHBDBG(p, "HRESET: No device\n"); + return OPAL_SUCCESS; + } + + /* circumvention for HW551382 */ + if (is_phb5()) { + PHBINF(p, "HRESET: Workaround for HW551382\n"); + set_sys_disable_detect(p, true); + } + + PHBDBG(p, "HRESET: Prepare for link down\n"); + phb4_prepare_link_change(slot, false); + /* fall through */ + case PHB4_SLOT_HRESET_START: + PHBDBG(p, "HRESET: Assert\n"); + + phb4_pcicfg_read16(&p->phb, 0, PCI_CFG_BRCTL, &brctl); + brctl |= PCI_CFG_BRCTL_SECONDARY_RESET; + phb4_pcicfg_write16(&p->phb, 0, PCI_CFG_BRCTL, brctl); + pci_slot_set_state(slot, PHB4_SLOT_HRESET_DELAY); + + return pci_slot_set_sm_timeout(slot, secs_to_tb(1)); + case PHB4_SLOT_HRESET_DELAY: + PHBDBG(p, "HRESET: Deassert\n"); + + /* Clear link errors before we deassert reset */ + phb4_err_clear_regb(p); + + phb4_pcicfg_read16(&p->phb, 0, PCI_CFG_BRCTL, &brctl); + brctl &= ~PCI_CFG_BRCTL_SECONDARY_RESET; + phb4_pcicfg_write16(&p->phb, 0, PCI_CFG_BRCTL, brctl); + + /* + * Due to some oddball adapters bouncing the link + * training a couple of times, we wait for a full second + * before we start checking the link status, otherwise + * we can get a spurrious link down interrupt which + * causes us to EEH immediately. 
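+		 * That is what the one second HRESET_DELAY2 timeout below
+		 * implements: only once it expires do we hand control back
+		 * to the normal link polling state machine.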
+ */ + pci_slot_set_state(slot, PHB4_SLOT_HRESET_DELAY2); + return pci_slot_set_sm_timeout(slot, secs_to_tb(1)); + case PHB4_SLOT_HRESET_DELAY2: + if (is_phb5()) + set_sys_disable_detect(p, false); + pci_slot_set_state(slot, PHB4_SLOT_LINK_START); + return slot->ops.poll_link(slot); + default: + PHBERR(p, "Unexpected slot state %08x\n", slot->state); + } + + pci_slot_set_state(slot, PHB4_SLOT_NORMAL); + return OPAL_HARDWARE; +} + +static int64_t phb4_freset(struct pci_slot *slot) +{ + struct phb4 *p = phb_to_phb4(slot->phb); + + switch(slot->state) { + case PHB4_SLOT_NORMAL: + case PHB4_SLOT_FRESET_START: + PHBDBG(p, "FRESET: Starts\n"); + + /* Reset max link speed for training */ + p->max_link_speed = phb4_get_max_link_speed(p, NULL); + + PHBDBG(p, "FRESET: Prepare for link down\n"); + phb4_prepare_link_change(slot, false); + + if (!p->skip_perst) { + /* circumvention for HW551382 */ + if (is_phb5()) { + PHBINF(p, "FRESET: Workaround for HW551382\n"); + set_sys_disable_detect(p, true); + } + + PHBDBG(p, "FRESET: Assert\n"); + phb4_assert_perst(slot, true); + pci_slot_set_state(slot, PHB4_SLOT_FRESET_ASSERT_DELAY); + + /* 250ms assert time aligns with powernv */ + return pci_slot_set_sm_timeout(slot, msecs_to_tb(250)); + } + + /* To skip the assert during boot time */ + PHBDBG(p, "FRESET: Assert skipped\n"); + pci_slot_set_state(slot, PHB4_SLOT_FRESET_ASSERT_DELAY); + p->skip_perst = false; + /* fall through */ + case PHB4_SLOT_FRESET_ASSERT_DELAY: + /* Clear link errors before we deassert PERST */ + phb4_err_clear_regb(p); + + PHBDBG(p, "FRESET: Deassert\n"); + phb4_assert_perst(slot, false); + + if (pci_tracing) + phb4_link_trace(p, PHB_PCIE_DLP_LTSSM_L0, 3000); + + if (is_phb5()) + set_sys_disable_detect(p, false); + + pci_slot_set_state(slot, PHB4_SLOT_LINK_START); + return slot->ops.poll_link(slot); + default: + PHBERR(p, "Unexpected slot state %08x\n", slot->state); + } + + pci_slot_set_state(slot, PHB4_SLOT_NORMAL); + return OPAL_HARDWARE; +} + +static int64_t load_capp_ucode(struct phb4 *p) +{ + int64_t rc; + + if (p->index != CAPP0_PHB_INDEX && p->index != CAPP1_PHB_INDEX) + return OPAL_HARDWARE; + + /* 0x434150504c494448 = 'CAPPLIDH' in ASCII */ + rc = capp_load_ucode(p->chip_id, p->phb.opal_id, p->index, + 0x434150504c494448UL, PHB4_CAPP_REG_OFFSET(p), + CAPP_APC_MASTER_ARRAY_ADDR_REG, + CAPP_APC_MASTER_ARRAY_WRITE_REG, + CAPP_SNP_ARRAY_ADDR_REG, + CAPP_SNP_ARRAY_WRITE_REG); + return rc; +} + +static int do_capp_recovery_scoms(struct phb4 *p) +{ + uint64_t rc, reg, end; + uint64_t offset = PHB4_CAPP_REG_OFFSET(p); + + + /* Get the status of CAPP recovery */ + xscom_read(p->chip_id, CAPP_ERR_STATUS_CTRL + offset, ®); + + /* No recovery in progress ignore */ + if ((reg & PPC_BIT(0)) == 0) { + PHBDBG(p, "CAPP: No recovery in progress\n"); + return OPAL_SUCCESS; + } + + PHBDBG(p, "CAPP: Waiting for recovery to complete\n"); + /* recovery timer failure period 168ms */ + end = mftb() + msecs_to_tb(168); + while ((reg & (PPC_BIT(1) | PPC_BIT(5) | PPC_BIT(9))) == 0) { + + time_wait_ms(5); + xscom_read(p->chip_id, CAPP_ERR_STATUS_CTRL + offset, ®); + + if (tb_compare(mftb(), end) != TB_ABEFOREB) { + PHBERR(p, "CAPP: Capp recovery Timed-out.\n"); + end = 0; + break; + } + } + + /* Check if the recovery failed or passed */ + if (reg & PPC_BIT(1)) { + uint64_t act0, act1, mask, fir; + + /* Use the Action0/1 and mask to only clear the bits + * that cause local checkstop. Other bits needs attention + * of the PRD daemon. 
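+		 * Concretely, the value written to the FIR clear register
+		 * below is ~(fir & ~mask & act0 & act1): assuming the usual
+		 * AND-style clear semantics, every set FIR bit that is
+		 * unmasked and has both action bits set gets a zero written
+		 * to it (and is cleared), while all other bits are left
+		 * untouched.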
+ */ + xscom_read(p->chip_id, CAPP_FIR_ACTION0 + offset, &act0); + xscom_read(p->chip_id, CAPP_FIR_ACTION1 + offset, &act1); + xscom_read(p->chip_id, CAPP_FIR_MASK + offset, &mask); + xscom_read(p->chip_id, CAPP_FIR + offset, &fir); + + fir = ~(fir & ~mask & act0 & act1); + PHBDBG(p, "Doing CAPP recovery scoms\n"); + + /* update capp fir clearing bits causing local checkstop */ + PHBDBG(p, "Resetting CAPP Fir with mask 0x%016llX\n", fir); + xscom_write(p->chip_id, CAPP_FIR_CLEAR + offset, fir); + + /* disable snoops */ + xscom_write(p->chip_id, SNOOP_CAPI_CONFIG + offset, 0); + load_capp_ucode(p); + + /* clear err rpt reg*/ + xscom_write(p->chip_id, CAPP_ERR_RPT_CLR + offset, 0); + + /* clear capp fir */ + xscom_write(p->chip_id, CAPP_FIR + offset, 0); + + /* Just reset Bit-0,1 and dont touch any other bit */ + xscom_read(p->chip_id, CAPP_ERR_STATUS_CTRL + offset, ®); + reg &= ~(PPC_BIT(0) | PPC_BIT(1)); + xscom_write(p->chip_id, CAPP_ERR_STATUS_CTRL + offset, reg); + + PHBDBG(p, "CAPP recovery complete\n"); + rc = OPAL_SUCCESS; + + } else { + /* Most likely will checkstop here due to FIR ACTION for + * failed recovery. So this message would never be logged. + * But if we still enter here then return an error forcing a + * fence of the PHB. + */ + if (reg & PPC_BIT(5)) + PHBERR(p, "CAPP: Capp recovery Failed\n"); + else if (reg & PPC_BIT(9)) + PHBERR(p, "CAPP: Capp recovery hang detected\n"); + else if (end != 0) + PHBERR(p, "CAPP: Unknown recovery failure\n"); + + PHBDBG(p, "CAPP: Err/Status-reg=0x%016llx\n", reg); + rc = OPAL_HARDWARE; + } + + return rc; +} + +/* + * Disable CAPI mode on a PHB. Must be done while PHB is fenced and + * not in recovery. + */ +static void disable_capi_mode(struct phb4 *p) +{ + uint64_t reg; + struct capp *capp = p->capp; + + PHBINF(p, "CAPP: Deactivating\n"); + + /* Check if CAPP attached to the PHB and active */ + if (!capp || capp->phb != &p->phb) { + PHBDBG(p, "CAPP: Not attached to this PHB!\n"); + return; + } + + xscom_read(p->chip_id, p->pe_xscom + XPEC_NEST_CAPP_CNTL, ®); + if (!(reg & PPC_BIT(0))) { + /* Not in CAPI mode, no action required */ + PHBERR(p, "CAPP: Not enabled!\n"); + return; + } + + /* CAPP should already be out of recovery in this function */ + capp_xscom_read(capp, CAPP_ERR_STATUS_CTRL, ®); + if (reg & PPC_BIT(0)) { + PHBERR(p, "CAPP: Can't disable while still in recovery!\n"); + return; + } + + PHBINF(p, "CAPP: Disabling CAPI mode\n"); + + /* First Phase Reset CAPP Registers */ + /* CAPP about to be disabled mark TLBI_FENCED and tlbi_psl_is_dead */ + capp_xscom_write(capp, CAPP_ERR_STATUS_CTRL, PPC_BIT(3) | PPC_BIT(4)); + + /* Flush SUE uOP1 Register */ + if (p->rev != PHB4_REV_NIMBUS_DD10) + capp_xscom_write(capp, FLUSH_SUE_UOP1, 0); + + /* Release DMA/STQ engines */ + capp_xscom_write(capp, APC_FSM_READ_MASK, 0ull); + capp_xscom_write(capp, XPT_FSM_RMM, 0ull); + + /* Disable snoop */ + capp_xscom_write(capp, SNOOP_CAPI_CONFIG, 0); + + /* Clear flush SUE state map register */ + capp_xscom_write(capp, FLUSH_SUE_STATE_MAP, 0); + + /* Disable epoch timer */ + capp_xscom_write(capp, EPOCH_RECOVERY_TIMERS_CTRL, 0); + + /* CAPP Transport Control Register */ + capp_xscom_write(capp, TRANSPORT_CONTROL, PPC_BIT(15)); + + /* Disable snooping */ + capp_xscom_write(capp, SNOOP_CONTROL, 0); + capp_xscom_write(capp, SNOOP_CAPI_CONFIG, 0); + + /* APC Master PB Control Register - disable examining cResps */ + capp_xscom_write(capp, APC_MASTER_PB_CTRL, 0); + + /* APC Master Config Register - de-select PHBs */ + xscom_write_mask(p->chip_id, 
capp->capp_xscom_offset + + APC_MASTER_CAPI_CTRL, 0, PPC_BITMASK(2, 3)); + + /* Clear all error registers */ + capp_xscom_write(capp, CAPP_ERR_RPT_CLR, 0); + capp_xscom_write(capp, CAPP_FIR, 0); + capp_xscom_write(capp, CAPP_FIR_ACTION0, 0); + capp_xscom_write(capp, CAPP_FIR_ACTION1, 0); + capp_xscom_write(capp, CAPP_FIR_MASK, 0); + + /* Second Phase Reset PEC/PHB Registers */ + + /* Reset the stack overrides if any */ + xscom_write(p->chip_id, p->pci_xscom + XPEC_PCI_PRDSTKOVR, 0); + xscom_write(p->chip_id, p->pe_xscom + + XPEC_NEST_READ_STACK_OVERRIDE, 0); + + /* PE Bus AIB Mode Bits. Disable Tracing. Leave HOL Blocking as it is */ + if (!(p->rev == PHB4_REV_NIMBUS_DD10) && p->index == CAPP1_PHB_INDEX) + xscom_write_mask(p->chip_id, + p->pci_xscom + XPEC_PCI_PBAIB_HW_CONFIG, 0, + PPC_BIT(30)); + + /* Reset for PCI to PB data movement */ + xscom_write_mask(p->chip_id, p->pe_xscom + XPEC_NEST_PBCQ_HW_CONFIG, + 0, XPEC_NEST_PBCQ_HW_CONFIG_PBINIT); + + /* Disable CAPP mode in PEC CAPP Control Register */ + xscom_write(p->chip_id, p->pe_xscom + XPEC_NEST_CAPP_CNTL, 0ull); +} + +static int64_t phb4_creset(struct pci_slot *slot) +{ + struct phb4 *p = phb_to_phb4(slot->phb); + struct capp *capp = p->capp; + uint64_t pbcq_status; + uint64_t creset_time, wait_time; + + /* Don't even try fixing a broken PHB */ + if (p->broken) + return OPAL_HARDWARE; + + switch (slot->state) { + case PHB4_SLOT_NORMAL: + case PHB4_SLOT_CRESET_START: + PHBDBG(p, "CRESET: Starts\n"); + + p->creset_start_time = mftb(); + + /* circumvention for HW551382 */ + if (is_phb5()) { + PHBINF(p, "CRESET: Workaround for HW551382\n"); + set_sys_disable_detect(p, true); + } + + phb4_prepare_link_change(slot, false); + /* Clear error inject register, preventing recursive errors */ + xscom_write(p->chip_id, p->pe_xscom + 0x2, 0x0); + + /* Prevent HMI when PHB gets fenced as we are disabling CAPP */ + if (p->flags & PHB4_CAPP_DISABLE && + capp && capp->phb == slot->phb) { + /* Since no HMI, So set the recovery flag manually. */ + p->flags |= PHB4_CAPP_RECOVERY; + xscom_write_mask(p->chip_id, capp->capp_xscom_offset + + CAPP_FIR_MASK, + PPC_BIT(31), PPC_BIT(31)); + } + + /* Force fence on the PHB to work around a non-existent PE */ + if (!phb4_fenced(p)) + xscom_write(p->chip_id, p->pe_stk_xscom + 0x2, + 0x0000002000000000UL); + + /* + * Force use of ASB for register access until the PHB has + * been fully reset. 
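+		 * The PHB4_CFG_USE_ASB / PHB4_AIB_FENCED flags set just
+		 * below are what steer subsequent register accesses onto
+		 * the ASB path; they are dropped again in the CRESET_REINIT
+		 * step once the ETU has been taken back out of reset.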
+ */ + p->flags |= PHB4_CFG_USE_ASB | PHB4_AIB_FENCED; + + /* Assert PREST before clearing errors */ + phb4_assert_perst(slot, true); + + /* Clear errors, following the proper sequence */ + phb4_err_clear(p); + + /* Actual reset */ + p->flags |= PHB4_ETU_IN_RESET; + xscom_write(p->chip_id, p->pci_stk_xscom + XPEC_PCI_STK_ETU_RESET, + 0x8000000000000000UL); + + /* Read errors in PFIR and NFIR */ + xscom_read(p->chip_id, p->pci_stk_xscom + 0x0, &p->pfir_cache); + xscom_read(p->chip_id, p->pe_stk_xscom + 0x0, &p->nfir_cache); + + pci_slot_set_state(slot, PHB4_SLOT_CRESET_WAIT_CQ); + slot->retries = 500; + return pci_slot_set_sm_timeout(slot, msecs_to_tb(10)); + case PHB4_SLOT_CRESET_WAIT_CQ: + + // Wait until operations are complete + xscom_read(p->chip_id, p->pe_stk_xscom + 0xc, &pbcq_status); + if (!(pbcq_status & 0xC000000000000000UL)) { + PHBDBG(p, "CRESET: No pending transactions\n"); + + /* capp recovery */ + if ((p->flags & PHB4_CAPP_RECOVERY) && + (do_capp_recovery_scoms(p) != OPAL_SUCCESS)) + goto error; + + if (p->flags & PHB4_CAPP_DISABLE) + disable_capi_mode(p); + + /* Clear errors in PFIR and NFIR */ + xscom_write(p->chip_id, p->pci_stk_xscom + 0x1, + ~p->pfir_cache); + xscom_write(p->chip_id, p->pe_stk_xscom + 0x1, + ~p->nfir_cache); + + /* Re-read errors in PFIR and NFIR and reset any new + * error reported. + */ + xscom_read(p->chip_id, p->pci_stk_xscom + + XPEC_PCI_STK_PCI_FIR, &p->pfir_cache); + xscom_read(p->chip_id, p->pe_stk_xscom + + XPEC_NEST_STK_PCI_NFIR, &p->nfir_cache); + + if (p->pfir_cache || p->nfir_cache) { + PHBERR(p, "CRESET: PHB still fenced !!\n"); + phb4_dump_pec_err_regs(p); + + /* Reset the PHB errors */ + xscom_write(p->chip_id, p->pci_stk_xscom + + XPEC_PCI_STK_PCI_FIR, 0); + xscom_write(p->chip_id, p->pe_stk_xscom + + XPEC_NEST_STK_PCI_NFIR, 0); + } + + /* Clear PHB from reset */ + xscom_write(p->chip_id, + p->pci_stk_xscom + XPEC_PCI_STK_ETU_RESET, 0x0); + p->flags &= ~PHB4_ETU_IN_RESET; + + pci_slot_set_state(slot, PHB4_SLOT_CRESET_REINIT); + /* After lifting PHB reset, wait while logic settles */ + return pci_slot_set_sm_timeout(slot, msecs_to_tb(10)); + } + + if (slot->retries-- == 0) { + PHBERR(p, "Timeout waiting for pending transaction\n"); + goto error; + } + return pci_slot_set_sm_timeout(slot, msecs_to_tb(100)); + case PHB4_SLOT_CRESET_REINIT: + PHBDBG(p, "CRESET: Reinitialization\n"); + p->flags &= ~PHB4_AIB_FENCED; + p->flags &= ~PHB4_CAPP_RECOVERY; + p->flags &= ~PHB4_CFG_USE_ASB; + phb4_init_hw(p); + pci_slot_set_state(slot, PHB4_SLOT_CRESET_FRESET); + + /* + * The PERST is sticky across resets, but LINK_DIS isn't. + * Re-assert it here now that we've reset the PHB. + */ + phb4_assert_perst(slot, true); + + /* + * wait either 100ms (for the ETU logic) or until we've had + * PERST asserted for 250ms. + */ + creset_time = tb_to_msecs(mftb() - p->creset_start_time); + if (creset_time < 250) + wait_time = MAX(100, 250 - creset_time); + else + wait_time = 100; + PHBDBG(p, "CRESET: wait_time = %lld\n", wait_time); + return pci_slot_set_sm_timeout(slot, msecs_to_tb(wait_time)); + + case PHB4_SLOT_CRESET_FRESET: + /* + * We asserted PERST at the beginning of the CRESET and we + * have waited long enough, so we can skip it in the freset + * procedure. 
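+		 * That is what the skip_perst flag below is for:
+		 * phb4_freset() sees it, skips its own PERST assert and
+		 * 250ms delay, and goes straight to the deassert step.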
+ */ + p->skip_perst = true; + pci_slot_set_state(slot, PHB4_SLOT_NORMAL); + return slot->ops.freset(slot); + default: + PHBERR(p, "CRESET: Unexpected slot state %08x, resetting...\n", + slot->state); + pci_slot_set_state(slot, PHB4_SLOT_NORMAL); + return slot->ops.creset(slot); + + } + +error: + /* Mark the PHB as dead and expect it to be removed */ + p->broken = true; + return OPAL_HARDWARE; +} + +/* + * Initialize root complex slot, which is mainly used to + * do fundamental reset before PCI enumeration in PCI core. + * When probing root complex and building its real slot, + * the operations will be copied over. + */ +static struct pci_slot *phb4_slot_create(struct phb *phb) +{ + struct pci_slot *slot; + + slot = pci_slot_alloc(phb, NULL); + if (!slot) + return slot; + + /* Elementary functions */ + slot->ops.get_presence_state = phb4_get_presence_state; + slot->ops.get_link_state = phb4_get_link_state; + slot->ops.get_power_state = NULL; + slot->ops.get_attention_state = NULL; + slot->ops.get_latch_state = NULL; + slot->ops.set_power_state = NULL; + slot->ops.set_attention_state = NULL; + + /* + * For PHB slots, we have to split the fundamental reset + * into 2 steps. We might not have the first step which + * is to power off/on the slot, or it's controlled by + * individual platforms. + */ + slot->ops.prepare_link_change = phb4_prepare_link_change; + slot->ops.poll_link = phb4_poll_link; + slot->ops.hreset = phb4_hreset; + slot->ops.freset = phb4_freset; + slot->ops.creset = phb4_creset; + slot->ops.completed_sm_run = phb4_slot_sm_run_completed; + slot->link_retries = PHB4_LINK_LINK_RETRIES; + + return slot; +} + +static void phb4_int_unmask_all(struct phb4 *p) +{ + /* Init_126..130 - Re-enable error interrupts */ + out_be64(p->regs + PHB_ERR_IRQ_ENABLE, 0xca8880cc00000000ull); + + if (is_phb5()) + out_be64(p->regs + PHB_TXE_ERR_IRQ_ENABLE, 0x200850be08200020ull); + else + out_be64(p->regs + PHB_TXE_ERR_IRQ_ENABLE, 0x2008400e08200000ull); + out_be64(p->regs + PHB_RXE_ARB_ERR_IRQ_ENABLE, 0xc40038fc01804070ull); + out_be64(p->regs + PHB_RXE_MRG_ERR_IRQ_ENABLE, 0x00006100008000a8ull); + out_be64(p->regs + PHB_RXE_TCE_ERR_IRQ_ENABLE, 0x60510050c0000000ull); +} + +/* + * Mask the IRQ for any currently set error bits. This prevents the PHB's ERR + * and INF interrupts from being re-fired before the kernel can handle the + * underlying condition. 
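+ * Each status register listed below has its IRQ enable/mask register
+ * at status + 0x20, so masking amounts to clearing the enable bits
+ * that line up with the currently set status bits; the enables are
+ * put back wholesale by phb4_int_unmask_all() once the pending
+ * errors have been processed.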
+ */ +static void phb4_int_mask_active(struct phb4 *p) +{ + const uint64_t error_regs[] = { + PHB_ERR_STATUS, + PHB_TXE_ERR_STATUS, + PHB_RXE_ARB_ERR_STATUS, + PHB_RXE_MRG_ERR_STATUS, + PHB_RXE_TCE_ERR_STATUS + }; + int i; + + for (i = 0; i < ARRAY_SIZE(error_regs); i++) { + uint64_t stat, mask; + + /* The IRQ mask reg is always offset 0x20 from the status reg */ + stat = phb4_read_reg(p, error_regs[i]); + mask = phb4_read_reg(p, error_regs[i] + 0x20); + + phb4_write_reg(p, error_regs[i] + 0x20, mask & ~stat); + } +} + +static uint64_t phb4_get_pesta(struct phb4 *p, uint64_t pe_number) +{ + uint64_t pesta; + __be64 *pPEST; + + pPEST = (__be64 *)p->tbl_pest; + + phb4_ioda_sel(p, IODA3_TBL_PESTA, pe_number, false); + pesta = phb4_read_reg(p, PHB_IODA_DATA0); + if (pesta & IODA3_PESTA_MMIO_FROZEN) + pesta |= be64_to_cpu(pPEST[2*pe_number]); + + return pesta; +} + +/* Check if the chip requires escalating a freeze to fence on MMIO loads */ +static bool phb4_escalation_required(void) +{ + uint64_t pvr = mfspr(SPR_PVR); + + /* Only on Power9 */ + if (proc_gen != proc_gen_p9) + return false; + + /* + * Escalation is required on the following chip versions: + * - Cumulus DD1.0 + * - Nimbus DD2.0, DD2.1 (and DD1.0, but it is unsupported so no check). + */ + if (pvr & PVR_POWER9_CUMULUS) { + if (PVR_VERS_MAJ(pvr) == 1 && PVR_VERS_MIN(pvr) == 0) + return true; + } else { /* Nimbus */ + if (PVR_VERS_MAJ(pvr) == 2 && PVR_VERS_MIN(pvr) < 2) + return true; + } + + return false; +} + +static bool phb4_freeze_escalate(uint64_t pesta) +{ + if ((GETFIELD(IODA3_PESTA_TRANS_TYPE, pesta) == + IODA3_PESTA_TRANS_TYPE_MMIOLOAD) && + (pesta & (IODA3_PESTA_CA_CMPLT_TMT | IODA3_PESTA_UR))) + return true; + return false; +} + +static int64_t phb4_eeh_freeze_status(struct phb *phb, uint64_t pe_number, + uint8_t *freeze_state, + uint16_t *pci_error_type, + uint16_t *severity) +{ + struct phb4 *p = phb_to_phb4(phb); + uint64_t peev_bit = PPC_BIT(pe_number & 0x3f); + uint64_t peev, pesta, pestb; + + /* Defaults: not frozen */ + *freeze_state = OPAL_EEH_STOPPED_NOT_FROZEN; + *pci_error_type = OPAL_EEH_NO_ERROR; + + /* Check dead */ + if (p->broken) { + *freeze_state = OPAL_EEH_STOPPED_MMIO_DMA_FREEZE; + *pci_error_type = OPAL_EEH_PHB_ERROR; + if (severity) + *severity = OPAL_EEH_SEV_PHB_DEAD; + return OPAL_HARDWARE; + } + + /* Check fence and CAPP recovery */ + if (phb4_fenced(p) || (p->flags & PHB4_CAPP_RECOVERY)) { + *freeze_state = OPAL_EEH_STOPPED_MMIO_DMA_FREEZE; + *pci_error_type = OPAL_EEH_PHB_ERROR; + if (severity) + *severity = OPAL_EEH_SEV_PHB_FENCED; + return OPAL_SUCCESS; + } + + /* Check the PEEV */ + phb4_ioda_sel(p, IODA3_TBL_PEEV, pe_number / 64, false); + peev = in_be64(p->regs + PHB_IODA_DATA0); + if (!(peev & peev_bit)) + return OPAL_SUCCESS; + + /* Indicate that we have an ER pending */ + phb4_set_err_pending(p, true); + if (severity) + *severity = OPAL_EEH_SEV_PE_ER; + + /* Read the full PESTA */ + pesta = phb4_get_pesta(p, pe_number); + /* Check if we need to escalate to fence */ + if (phb4_escalation_required() && phb4_freeze_escalate(pesta)) { + PHBERR(p, "Escalating freeze to fence PESTA[%lli]=%016llx\n", + pe_number, pesta); + *severity = OPAL_EEH_SEV_PHB_FENCED; + *pci_error_type = OPAL_EEH_PHB_ERROR; + } + + /* Read the PESTB in the PHB */ + phb4_ioda_sel(p, IODA3_TBL_PESTB, pe_number, false); + pestb = phb4_read_reg(p, PHB_IODA_DATA0); + + /* Convert PESTA/B to freeze_state */ + if (pesta & IODA3_PESTA_MMIO_FROZEN) + *freeze_state |= OPAL_EEH_STOPPED_MMIO_FREEZE; + if (pestb & 
IODA3_PESTB_DMA_STOPPED) + *freeze_state |= OPAL_EEH_STOPPED_DMA_FREEZE; + + return OPAL_SUCCESS; +} + +static int64_t phb4_eeh_freeze_clear(struct phb *phb, uint64_t pe_number, + uint64_t eeh_action_token) +{ + struct phb4 *p = phb_to_phb4(phb); + uint64_t err, peev; + int32_t i; + bool frozen_pe = false; + + if (p->broken) + return OPAL_HARDWARE; + + /* Summary. If nothing, move to clearing the PESTs which can + * contain a freeze state from a previous error or simply set + * explicitely by the user + */ + err = in_be64(p->regs + PHB_ETU_ERR_SUMMARY); + if (err == 0xffffffffffffffffUL) { + if (phb4_fenced(p)) { + PHBERR(p, "eeh_freeze_clear on fenced PHB\n"); + return OPAL_HARDWARE; + } + } + if (err != 0) + phb4_err_clear(p); + + /* + * We have PEEV in system memory. It would give more performance + * to access that directly. + */ + if (eeh_action_token & OPAL_EEH_ACTION_CLEAR_FREEZE_MMIO) { + phb4_ioda_sel(p, IODA3_TBL_PESTA, pe_number, false); + out_be64(p->regs + PHB_IODA_DATA0, 0); + } + if (eeh_action_token & OPAL_EEH_ACTION_CLEAR_FREEZE_DMA) { + phb4_ioda_sel(p, IODA3_TBL_PESTB, pe_number, false); + out_be64(p->regs + PHB_IODA_DATA0, 0); + } + + + /* Update ER pending indication */ + phb4_ioda_sel(p, IODA3_TBL_PEEV, 0, true); + for (i = 0; i < p->num_pes/64; i++) { + peev = in_be64(p->regs + PHB_IODA_DATA0); + if (peev) { + frozen_pe = true; + break; + } + } + if (frozen_pe) { + p->err.err_src = PHB4_ERR_SRC_PHB; + p->err.err_class = PHB4_ERR_CLASS_ER; + p->err.err_bit = -1; + phb4_set_err_pending(p, true); + } else + phb4_set_err_pending(p, false); + + return OPAL_SUCCESS; +} + +static int64_t phb4_eeh_freeze_set(struct phb *phb, uint64_t pe_number, + uint64_t eeh_action_token) +{ + struct phb4 *p = phb_to_phb4(phb); + uint64_t data; + + if (p->broken) + return OPAL_HARDWARE; + + if (pe_number >= p->num_pes) + return OPAL_PARAMETER; + + if (eeh_action_token != OPAL_EEH_ACTION_SET_FREEZE_MMIO && + eeh_action_token != OPAL_EEH_ACTION_SET_FREEZE_DMA && + eeh_action_token != OPAL_EEH_ACTION_SET_FREEZE_ALL) + return OPAL_PARAMETER; + + if (eeh_action_token & OPAL_EEH_ACTION_SET_FREEZE_MMIO) { + phb4_ioda_sel(p, IODA3_TBL_PESTA, pe_number, false); + data = in_be64(p->regs + PHB_IODA_DATA0); + data |= IODA3_PESTA_MMIO_FROZEN; + out_be64(p->regs + PHB_IODA_DATA0, data); + } + + if (eeh_action_token & OPAL_EEH_ACTION_SET_FREEZE_DMA) { + phb4_ioda_sel(p, IODA3_TBL_PESTB, pe_number, false); + data = in_be64(p->regs + PHB_IODA_DATA0); + data |= IODA3_PESTB_DMA_STOPPED; + out_be64(p->regs + PHB_IODA_DATA0, data); + } + + return OPAL_SUCCESS; +} + +static int64_t phb4_eeh_next_error(struct phb *phb, + uint64_t *first_frozen_pe, + uint16_t *pci_error_type, + uint16_t *severity) +{ + struct phb4 *p = phb_to_phb4(phb); + uint64_t peev, pesta; + uint32_t peev_size = p->num_pes/64; + int32_t i, j; + + /* If the PHB is broken, we needn't go forward */ + if (p->broken) { + *pci_error_type = OPAL_EEH_PHB_ERROR; + *severity = OPAL_EEH_SEV_PHB_DEAD; + return OPAL_SUCCESS; + } + + if ((p->flags & PHB4_CAPP_RECOVERY)) { + *pci_error_type = OPAL_EEH_PHB_ERROR; + *severity = OPAL_EEH_SEV_PHB_FENCED; + return OPAL_SUCCESS; + } + + /* + * Check if we already have pending errors. If that's + * the case, then to get more information about the + * pending errors. Here we try PBCQ prior to PHB. 
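+	 * (The PBCQ/LEM re-checks in the condition below are currently
+	 * commented out, so in practice a previously pending error is
+	 * simply cleared here and re-derived from the PEEV scan that
+	 * follows.)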
+ */ + if (phb4_err_pending(p) /*&& + !phb4_err_check_pbcq(p) && + !phb4_err_check_lem(p) */) + phb4_set_err_pending(p, false); + + /* Clear result */ + *pci_error_type = OPAL_EEH_NO_ERROR; + *severity = OPAL_EEH_SEV_NO_ERROR; + *first_frozen_pe = (uint64_t)-1; + + /* Check frozen PEs */ + if (!phb4_err_pending(p)) { + phb4_ioda_sel(p, IODA3_TBL_PEEV, 0, true); + for (i = 0; i < peev_size; i++) { + peev = in_be64(p->regs + PHB_IODA_DATA0); + if (peev) { + p->err.err_src = PHB4_ERR_SRC_PHB; + p->err.err_class = PHB4_ERR_CLASS_ER; + p->err.err_bit = -1; + phb4_set_err_pending(p, true); + break; + } + } + } + + if (!phb4_err_pending(p)) + return OPAL_SUCCESS; + /* + * If the frozen PE is caused by a malfunctioning TLP, we + * need reset the PHB. So convert ER to PHB-fatal error + * for the case. + */ + if (p->err.err_class == PHB4_ERR_CLASS_ER) { + for (i = peev_size - 1; i >= 0; i--) { + phb4_ioda_sel(p, IODA3_TBL_PEEV, i, false); + peev = in_be64(p->regs + PHB_IODA_DATA0); + for (j = 0; j < 64; j++) { + if (peev & PPC_BIT(j)) { + *first_frozen_pe = i * 64 + j; + break; + } + } + if (*first_frozen_pe != (uint64_t)(-1)) + break; + } + } + + if (*first_frozen_pe != (uint64_t)(-1)) { + pesta = phb4_get_pesta(p, *first_frozen_pe); + if (phb4_escalation_required() && phb4_freeze_escalate(pesta)) { + PHBINF(p, "Escalating freeze to fence. PESTA[%lli]=%016llx\n", + *first_frozen_pe, pesta); + p->err.err_class = PHB4_ERR_CLASS_FENCED; + } + } + + switch (p->err.err_class) { + case PHB4_ERR_CLASS_DEAD: + *pci_error_type = OPAL_EEH_PHB_ERROR; + *severity = OPAL_EEH_SEV_PHB_DEAD; + break; + case PHB4_ERR_CLASS_FENCED: + *pci_error_type = OPAL_EEH_PHB_ERROR; + *severity = OPAL_EEH_SEV_PHB_FENCED; + break; + case PHB4_ERR_CLASS_ER: + *pci_error_type = OPAL_EEH_PE_ERROR; + *severity = OPAL_EEH_SEV_PE_ER; + + /* No frozen PE ? */ + if (*first_frozen_pe == (uint64_t)-1) { + *pci_error_type = OPAL_EEH_NO_ERROR; + *severity = OPAL_EEH_SEV_NO_ERROR; + phb4_set_err_pending(p, false); + } + + break; + case PHB4_ERR_CLASS_INF: + *pci_error_type = OPAL_EEH_PHB_ERROR; + *severity = OPAL_EEH_SEV_INF; + break; + default: + *pci_error_type = OPAL_EEH_NO_ERROR; + *severity = OPAL_EEH_SEV_NO_ERROR; + phb4_set_err_pending(p, false); + } + + /* + * Unmask all our error interrupts once all pending errors + * have been handled. 
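+	 * This effectively undoes any masking that phb4_int_mask_active()
+	 * applied while the errors were outstanding.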
+ */ + if (!phb4_err_pending(p)) + phb4_int_unmask_all(p); + + return OPAL_SUCCESS; +} + +static int64_t phb4_err_inject_finalize(struct phb4 *phb, uint64_t addr, + uint64_t mask, uint64_t ctrl, + bool is_write) +{ + if (is_write) + ctrl |= PHB_PAPR_ERR_INJ_CTL_WR; + else + ctrl |= PHB_PAPR_ERR_INJ_CTL_RD; + + out_be64(phb->regs + PHB_PAPR_ERR_INJ_ADDR, addr); + out_be64(phb->regs + PHB_PAPR_ERR_INJ_MASK, mask); + out_be64(phb->regs + PHB_PAPR_ERR_INJ_CTL, ctrl); + + return OPAL_SUCCESS; +} + +static int64_t phb4_err_inject_mem32(struct phb4 *phb __unused, + uint64_t pe_number __unused, + uint64_t addr __unused, + uint64_t mask __unused, + bool is_write __unused) +{ + return OPAL_UNSUPPORTED; +} + +static int64_t phb4_err_inject_mem64(struct phb4 *phb __unused, + uint64_t pe_number __unused, + uint64_t addr __unused, + uint64_t mask __unused, + bool is_write __unused) +{ + return OPAL_UNSUPPORTED; +} + +static int64_t phb4_err_inject_cfg(struct phb4 *phb, uint64_t pe_number, + uint64_t addr, uint64_t mask, + bool is_write) +{ + uint64_t a, m, prefer, ctrl; + int bdfn; + bool is_bus_pe = false; + + a = 0xffffull; + prefer = 0xffffull; + m = PHB_PAPR_ERR_INJ_MASK_CFG_ALL; + ctrl = PHB_PAPR_ERR_INJ_CTL_CFG; + + for (bdfn = 0; bdfn < RTT_TABLE_ENTRIES; bdfn++) { + if (be16_to_cpu(phb->tbl_rtt[bdfn]) != pe_number) + continue; + + /* The PE can be associated with PCI bus or device */ + is_bus_pe = false; + if ((bdfn + 8) < RTT_TABLE_ENTRIES && + be16_to_cpu(phb->tbl_rtt[bdfn + 8]) == pe_number) + is_bus_pe = true; + + /* Figure out the PCI config address */ + if (prefer == 0xffffull) { + if (is_bus_pe) { + m = PHB_PAPR_ERR_INJ_MASK_CFG; + prefer = SETFIELD(m, 0x0ull, PCI_BUS_NUM(bdfn)); + } else { + m = PHB_PAPR_ERR_INJ_MASK_CFG_ALL; + prefer = SETFIELD(m, 0x0ull, bdfn); + } + } + + /* Check the input address is valid or not */ + if (!is_bus_pe && + GETFIELD(PHB_PAPR_ERR_INJ_MASK_CFG_ALL, addr) == bdfn) { + a = addr; + break; + } + + if (is_bus_pe && + GETFIELD(PHB_PAPR_ERR_INJ_MASK_CFG, addr) == PCI_BUS_NUM(bdfn)) { + a = addr; + break; + } + } + + /* Invalid PE number */ + if (prefer == 0xffffull) + return OPAL_PARAMETER; + + /* Specified address is out of range */ + if (a == 0xffffull) + a = prefer; + else + m = mask; + + return phb4_err_inject_finalize(phb, a, m, ctrl, is_write); +} + +static int64_t phb4_err_inject_dma(struct phb4 *phb __unused, + uint64_t pe_number __unused, + uint64_t addr __unused, + uint64_t mask __unused, + bool is_write __unused, + bool is_64bits __unused) +{ + return OPAL_UNSUPPORTED; +} + +static int64_t phb4_err_inject_dma32(struct phb4 *phb, uint64_t pe_number, + uint64_t addr, uint64_t mask, + bool is_write) +{ + return phb4_err_inject_dma(phb, pe_number, addr, mask, is_write, false); +} + +static int64_t phb4_err_inject_dma64(struct phb4 *phb, uint64_t pe_number, + uint64_t addr, uint64_t mask, + bool is_write) +{ + return phb4_err_inject_dma(phb, pe_number, addr, mask, is_write, true); +} + + +static int64_t phb4_err_inject(struct phb *phb, uint64_t pe_number, + uint32_t type, uint32_t func, + uint64_t addr, uint64_t mask) +{ + struct phb4 *p = phb_to_phb4(phb); + int64_t (*handler)(struct phb4 *p, uint64_t pe_number, + uint64_t addr, uint64_t mask, bool is_write); + bool is_write; + + /* We can't inject error to the reserved PE */ + if (pe_number == PHB4_RESERVED_PE_NUM(p) || pe_number >= p->num_pes) + return OPAL_PARAMETER; + + /* Clear leftover from last time */ + out_be64(p->regs + PHB_PAPR_ERR_INJ_CTL, 0x0ul); + + switch (func) { + case 
OPAL_ERR_INJECT_FUNC_IOA_LD_MEM_ADDR: + case OPAL_ERR_INJECT_FUNC_IOA_LD_MEM_DATA: + is_write = false; + if (type == OPAL_ERR_INJECT_TYPE_IOA_BUS_ERR64) + handler = phb4_err_inject_mem64; + else + handler = phb4_err_inject_mem32; + break; + case OPAL_ERR_INJECT_FUNC_IOA_ST_MEM_ADDR: + case OPAL_ERR_INJECT_FUNC_IOA_ST_MEM_DATA: + is_write = true; + if (type == OPAL_ERR_INJECT_TYPE_IOA_BUS_ERR64) + handler = phb4_err_inject_mem64; + else + handler = phb4_err_inject_mem32; + break; + case OPAL_ERR_INJECT_FUNC_IOA_LD_CFG_ADDR: + case OPAL_ERR_INJECT_FUNC_IOA_LD_CFG_DATA: + is_write = false; + handler = phb4_err_inject_cfg; + break; + case OPAL_ERR_INJECT_FUNC_IOA_ST_CFG_ADDR: + case OPAL_ERR_INJECT_FUNC_IOA_ST_CFG_DATA: + is_write = true; + handler = phb4_err_inject_cfg; + break; + case OPAL_ERR_INJECT_FUNC_IOA_DMA_RD_ADDR: + case OPAL_ERR_INJECT_FUNC_IOA_DMA_RD_DATA: + case OPAL_ERR_INJECT_FUNC_IOA_DMA_RD_MASTER: + case OPAL_ERR_INJECT_FUNC_IOA_DMA_RD_TARGET: + is_write = false; + if (type == OPAL_ERR_INJECT_TYPE_IOA_BUS_ERR64) + handler = phb4_err_inject_dma64; + else + handler = phb4_err_inject_dma32; + break; + case OPAL_ERR_INJECT_FUNC_IOA_DMA_WR_ADDR: + case OPAL_ERR_INJECT_FUNC_IOA_DMA_WR_DATA: + case OPAL_ERR_INJECT_FUNC_IOA_DMA_WR_MASTER: + case OPAL_ERR_INJECT_FUNC_IOA_DMA_WR_TARGET: + is_write = true; + if (type == OPAL_ERR_INJECT_TYPE_IOA_BUS_ERR64) + handler = phb4_err_inject_dma64; + else + handler = phb4_err_inject_dma32; + break; + default: + return OPAL_PARAMETER; + } + + return handler(p, pe_number, addr, mask, is_write); +} + +static int64_t phb4_get_diag_data(struct phb *phb, + void *diag_buffer, + uint64_t diag_buffer_len) +{ + bool fenced; + struct phb4 *p = phb_to_phb4(phb); + struct OpalIoPhb4ErrorData *data = diag_buffer; + + if (diag_buffer_len < sizeof(struct OpalIoPhb4ErrorData)) + return OPAL_PARAMETER; + if (p->broken) + return OPAL_HARDWARE; + + /* + * Dummy check for fence so that phb4_read_phb_status knows + * whether to use ASB or AIB + */ + fenced = phb4_fenced(p); + phb4_read_phb_status(p, data); + + if (!fenced) + phb4_eeh_dump_regs(p); + + /* + * We're running to here probably because of errors + * (INF class). For that case, we need clear the error + * explicitly. 
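+	 * (Only INF-class errors whose source is the PHB are cleared
+	 * below; ER-class errors are left pending, presumably so the EEH
+	 * freeze/next-error paths above report and clear them instead.)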
+ */ + if (phb4_err_pending(p) && + p->err.err_class == PHB4_ERR_CLASS_INF && + p->err.err_src == PHB4_ERR_SRC_PHB) { + phb4_err_clear(p); + phb4_set_err_pending(p, false); + } + + return OPAL_SUCCESS; +} + +static uint64_t tve_encode_50b_noxlate(uint64_t start_addr, uint64_t end_addr) +{ + uint64_t tve; + + /* + * Put start address bits 49:24 into TVE[52:53]||[0:23] + * and end address bits 49:24 into TVE[54:55]||[24:47] + * and set TVE[51] + */ + tve = (start_addr << 16) & (0xffffffull << 40); + tve |= (start_addr >> 38) & (3ull << 10); + tve |= (end_addr >> 8) & (0xfffffful << 16); + tve |= (end_addr >> 40) & (3ull << 8); + tve |= PPC_BIT(51) | IODA3_TVT_NON_TRANSLATE_50; + return tve; +} + +static bool phb4_is_dd20(struct phb4 *p) +{ + struct proc_chip *chip = get_chip(p->chip_id); + + if (p->rev == PHB4_REV_NIMBUS_DD20 && ((0xf & chip->ec_level) == 0)) + return true; + return false; +} + +static int64_t phb4_get_capp_info(int chip_id, struct phb *phb, + struct capp_info *info) +{ + struct phb4 *p = phb_to_phb4(phb); + uint32_t offset; + + /* Not even supposed to be here on P10, but doesn't hurt */ + if (is_phb5()) + return OPAL_UNSUPPORTED; + + if (chip_id != p->chip_id) + return OPAL_PARAMETER; + + /* Check is CAPP is attached to the PHB */ + if (p->capp == NULL || p->capp->phb != phb) + return OPAL_PARAMETER; + + offset = PHB4_CAPP_REG_OFFSET(p); + + if (p->index == CAPP0_PHB_INDEX) + info->capp_index = 0; + if (p->index == CAPP1_PHB_INDEX) + info->capp_index = 1; + info->phb_index = p->index; + info->capp_fir_reg = CAPP_FIR + offset; + info->capp_fir_mask_reg = CAPP_FIR_MASK + offset; + info->capp_fir_action0_reg = CAPP_FIR_ACTION0 + offset; + info->capp_fir_action1_reg = CAPP_FIR_ACTION1 + offset; + info->capp_err_status_ctrl_reg = CAPP_ERR_STATUS_CTRL + offset; + + return OPAL_SUCCESS; +} + +static void phb4_init_capp_regs(struct phb4 *p, uint32_t capp_eng) +{ + uint64_t addr, reg; + uint32_t offset; + uint8_t link_width_x16 = 1; + + offset = PHB4_CAPP_REG_OFFSET(p); + + /* Calculate the phb link width if card is attached to PEC2 */ + if (p->index == CAPP1_PHB_INDEX) { + /* Check if PEC2 is in x8 or x16 mode. + * PEC0 is always in x16 + */ + addr = XPEC_P9_PCI_CPLT_CONF1 + 2 * XPEC_PCI_CPLT_OFFSET; + xscom_read(p->chip_id, addr, ®); + link_width_x16 = ((reg & XPEC_P9_PCI_IOVALID_MASK) == + XPEC_P9_PCI_IOVALID_X16); + } + + /* APC Master PowerBus Control Register */ + xscom_read(p->chip_id, APC_MASTER_PB_CTRL + offset, ®); + reg |= PPC_BIT(0); /* enable cResp exam */ + reg |= PPC_BIT(3); /* disable vg not sys */ + reg |= PPC_BIT(12);/* HW417025: disable capp virtual machines */ + reg |= PPC_BIT(2); /* disable nn rn */ + reg |= PPC_BIT(4); /* disable g */ + reg |= PPC_BIT(5); /* disable ln */ + xscom_write(p->chip_id, APC_MASTER_PB_CTRL + offset, reg); + + /* Set PHB mode, HPC Dir State and P9 mode */ + xscom_write(p->chip_id, APC_MASTER_CAPI_CTRL + offset, + 0x1772000000000000UL); + PHBINF(p, "CAPP: port attached\n"); + + /* Set snoop ttype decoding , dir size to 512K */ + xscom_write(p->chip_id, SNOOP_CAPI_CONFIG + offset, 0x9000000000000000UL); + + /* Use Read Epsilon Tier2 for all scopes. + * Set Tier2 Read Epsilon. 
+ */ + xscom_read(p->chip_id, SNOOP_CONTROL + offset, ®); + reg |= PPC_BIT(0); + reg |= PPC_BIT(35); + reg |= PPC_BIT(45); + reg |= PPC_BIT(46); + reg |= PPC_BIT(47); + reg |= PPC_BIT(50); + xscom_write(p->chip_id, SNOOP_CONTROL + offset, reg); + + /* Transport Control Register */ + xscom_read(p->chip_id, TRANSPORT_CONTROL + offset, ®); + if (p->index == CAPP0_PHB_INDEX) { + reg |= PPC_BIT(1); /* Send Packet Timer Value */ + reg |= PPC_BITMASK(10, 13); /* Send Packet Timer Value */ + reg &= ~PPC_BITMASK(14, 17); /* Set Max LPC CI store buffer to zeros */ + reg &= ~PPC_BITMASK(18, 21); /* Set Max tlbi divider */ + if (capp_eng & CAPP_MIN_STQ_ENGINES) { + /* 2 CAPP msg engines */ + reg |= PPC_BIT(58); + reg |= PPC_BIT(59); + reg |= PPC_BIT(60); + } + if (capp_eng & CAPP_MAX_STQ_ENGINES) { + /* 14 CAPP msg engines */ + reg |= PPC_BIT(60); + } + reg |= PPC_BIT(62); + } + if (p->index == CAPP1_PHB_INDEX) { + reg |= PPC_BIT(4); /* Send Packet Timer Value */ + reg &= ~PPC_BIT(10); /* Set CI Store Buffer Threshold=5 */ + reg |= PPC_BIT(11); /* Set CI Store Buffer Threshold=5 */ + reg &= ~PPC_BIT(12); /* Set CI Store Buffer Threshold=5 */ + reg |= PPC_BIT(13); /* Set CI Store Buffer Threshold=5 */ + reg &= ~PPC_BITMASK(14, 17); /* Set Max LPC CI store buffer to zeros */ + reg &= ~PPC_BITMASK(18, 21); /* Set Max tlbi divider */ + if (capp_eng & CAPP_MIN_STQ_ENGINES) { + /* 2 CAPP msg engines */ + reg |= PPC_BIT(59); + reg |= PPC_BIT(60); + + } else if (capp_eng & CAPP_MAX_STQ_ENGINES) { + + if (link_width_x16) + /* 14 CAPP msg engines */ + reg |= PPC_BIT(60) | PPC_BIT(62); + else + /* 6 CAPP msg engines */ + reg |= PPC_BIT(60); + } + } + xscom_write(p->chip_id, TRANSPORT_CONTROL + offset, reg); + + /* The transport control register needs to be loaded in two + * steps. Once the register values have been set, we have to + * write bit 63 to a '1', which loads the register values into + * the ci store buffer logic. 
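+	 * (Bit 63 is in IBM bit ordering here, i.e. PPC_BIT(63) is the
+	 * least significant bit of the register.)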
+ */ + xscom_read(p->chip_id, TRANSPORT_CONTROL + offset, ®); + reg |= PPC_BIT(63); + xscom_write(p->chip_id, TRANSPORT_CONTROL + offset, reg); + + /* Enable epoch timer */ + xscom_write(p->chip_id, EPOCH_RECOVERY_TIMERS_CTRL + offset, + 0xC0000000FFF8FFE0UL); + + /* Flush SUE State Map Register */ + xscom_write(p->chip_id, FLUSH_SUE_STATE_MAP + offset, + 0x08020A0000000000UL); + + /* Flush SUE uOP1 Register */ + xscom_write(p->chip_id, FLUSH_SUE_UOP1 + offset, + 0xDCE0280428000000); + + /* capp owns PHB read buffers */ + if (p->index == CAPP0_PHB_INDEX) { + /* max PHB read buffers 0-47 */ + reg = 0xFFFFFFFFFFFF0000UL; + if (capp_eng & CAPP_MAX_DMA_READ_ENGINES) + reg = 0xF000000000000000UL; + xscom_write(p->chip_id, APC_FSM_READ_MASK + offset, reg); + xscom_write(p->chip_id, XPT_FSM_RMM + offset, reg); + } + if (p->index == CAPP1_PHB_INDEX) { + + if (capp_eng & CAPP_MAX_DMA_READ_ENGINES) { + reg = 0xF000000000000000ULL; + } else if (link_width_x16) { + /* 0-47 (Read machines) are available for + * capp use + */ + reg = 0x0000FFFFFFFFFFFFULL; + } else { + /* Set 30 Read machines for CAPP Minus + * 20-27 for DMA + */ + reg = 0xFFFFF00E00000000ULL; + } + xscom_write(p->chip_id, APC_FSM_READ_MASK + offset, reg); + xscom_write(p->chip_id, XPT_FSM_RMM + offset, reg); + } + + /* CAPP FIR Action 0 */ + xscom_write(p->chip_id, CAPP_FIR_ACTION0 + offset, 0x0b1c000104060000UL); + + /* CAPP FIR Action 1 */ + xscom_write(p->chip_id, CAPP_FIR_ACTION1 + offset, 0x2b9c0001240E0000UL); + + /* CAPP FIR MASK */ + xscom_write(p->chip_id, CAPP_FIR_MASK + offset, 0x80031f98d8717000UL); + + /* Mask the CAPP PSL Credit Timeout Register error */ + xscom_write_mask(p->chip_id, CAPP_FIR_MASK + offset, + PPC_BIT(46), PPC_BIT(46)); + + /* Deassert TLBI_FENCED and tlbi_psl_is_dead */ + xscom_write(p->chip_id, CAPP_ERR_STATUS_CTRL + offset, 0); +} + +/* override some inits with CAPI defaults */ +static void phb4_init_capp_errors(struct phb4 *p) +{ + /* Init_77: TXE Error AIB Fence Enable Register */ + if (phb4_is_dd20(p)) + out_be64(p->regs + 0x0d30, 0xdfffbf0ff7ddfff0ull); + else + out_be64(p->regs + 0x0d30, 0xdff7bf0ff7ddfff0ull); + /* Init_86: RXE_ARB Error AIB Fence Enable Register */ + out_be64(p->regs + 0x0db0, 0xfbffd7bbfb7fbfefull); + + /* Init_95: RXE_MRG Error AIB Fence Enable Register */ + out_be64(p->regs + 0x0e30, 0xfffffeffff7fff57ull); + + /* Init_104: RXE_TCE Error AIB Fence Enable Register */ + out_be64(p->regs + 0x0eb0, 0xffaeffafffffffffull); + + /* Init_113: PHB Error AIB Fence Enable Register */ + out_be64(p->regs + 0x0cb0, 0x35777073ff000000ull); +} + +/* + * The capi, NBW and ASN indicators are used only on P9 to flag some + * types of incoming traffic for the PHB and have been removed on P10. + * + * The capi indicator is over the 8 most significant bits (and + * not 16). We stay away from bits 59 (TVE select), 60 and 61 (MSI) + * + * For the mask, we keep bit 59 in, as capi messages must hit TVE#0. + * Bit 56 is not part of the mask, so that a NBW message (see below) + * is also considered a capi message. + */ +#define CAPIIND 0x0200 +#define CAPIMASK 0xFE00 + +/* + * Non-Blocking Write messages are a subset of capi messages, so the + * indicator is the same as capi + an extra bit (56) to differentiate. + * Mask is the same as capi + the extra bit + */ +#define NBWIND 0x0300 +#define NBWMASK 0xFF00 + +/* + * The ASN indicator is used for tunneled operations (as_notify and + * atomics). Tunneled operation messages can be sent in PCI mode as + * well as CAPI mode. 
+ * + * The format of those messages is specific and, for as_notify + * messages, the address field is hijacked to encode the LPID/PID/TID + * of the target thread, so those messages should not go through + * translation. They must hit TVE#1. Therefore bit 59 is part of the + * indicator. + */ +#define ASNIND 0x0C00 +#define ASNMASK 0xFF00 + +/* Power Bus Common Queue Registers + * All PBCQ and PBAIB registers are accessed via SCOM + * NestBase = 4010C00 for PEC0 + * 4011000 for PEC1 + * 4011400 for PEC2 + * PCIBase = D010800 for PE0 + * E010800 for PE1 + * F010800 for PE2 + * + * Some registers are shared amongst all of the stacks and will only + * have 1 copy. Other registers are implemented one per stack. + * Registers that are duplicated will have an additional offset + * of “StackBase” so that they have a unique address. + * Stackoffset = 00000040 for Stack0 + * = 00000080 for Stack1 + * = 000000C0 for Stack2 + */ +static int64_t enable_capi_mode(struct phb4 *p, uint64_t pe_number, + uint32_t capp_eng) +{ + uint64_t addr, reg, start_addr, end_addr, stq_eng, dma_eng; + uint64_t mbt0, mbt1; + int i, window_num = -1; + + /* CAPP Control Register */ + xscom_read(p->chip_id, p->pe_xscom + XPEC_NEST_CAPP_CNTL, ®); + if (reg & PPC_BIT(0)) { + PHBDBG(p, "Already in CAPP mode\n"); + } + + for (i = 0; i < 500000; i++) { + /* PBCQ General Status Register */ + xscom_read(p->chip_id, + p->pe_stk_xscom + XPEC_NEST_STK_PBCQ_STAT, + ®); + if (!(reg & 0xC000000000000000UL)) + break; + time_wait_us(10); + } + if (reg & 0xC000000000000000UL) { + PHBERR(p, "CAPP: Timeout waiting for pending transaction\n"); + return OPAL_HARDWARE; + } + + stq_eng = 0x0000000000000000ULL; + dma_eng = 0x0000000000000000ULL; + if (p->index == CAPP0_PHB_INDEX) { + /* PBCQ is operating as a x16 stack + * - The maximum number of engines give to CAPP will be + * 14 and will be assigned in the order of STQ 15 to 2. + * - 0-47 (Read machines) are available for capp use. + */ + stq_eng = 0x000E000000000000ULL; /* 14 CAPP msg engines */ + dma_eng = 0x0000FFFFFFFFFFFFULL; /* 48 CAPP Read machines */ + } + + if (p->index == CAPP1_PHB_INDEX) { + /* Check if PEC is in x8 or x16 mode */ + addr = XPEC_P9_PCI_CPLT_CONF1 + 2 * XPEC_PCI_CPLT_OFFSET; + xscom_read(p->chip_id, addr, ®); + if ((reg & XPEC_P9_PCI_IOVALID_MASK) == XPEC_P9_PCI_IOVALID_X16) { + /* PBCQ is operating as a x16 stack + * - The maximum number of engines give to CAPP will be + * 14 and will be assigned in the order of STQ 15 to 2. + * - 0-47 (Read machines) are available for capp use. + */ + stq_eng = 0x000E000000000000ULL; + dma_eng = 0x0000FFFFFFFFFFFFULL; + } else { + + /* PBCQ is operating as a x8 stack + * - The maximum number of engines given to CAPP should + * be 6 and will be assigned in the order of 7 to 2. + * - 0-30 (Read machines) are available for capp use. + */ + stq_eng = 0x0006000000000000ULL; + /* 30 Read machines for CAPP Minus 20-27 for DMA */ + dma_eng = 0x0000FFFFF00E0000ULL; + } + } + + if (capp_eng & CAPP_MIN_STQ_ENGINES) + stq_eng = 0x0002000000000000ULL; /* 2 capp msg engines */ + + /* CAPP Control Register. Enable CAPP Mode */ + reg = 0x8000000000000000ULL; /* PEC works in CAPP Mode */ + reg |= stq_eng; + if (capp_eng & CAPP_MAX_DMA_READ_ENGINES) + dma_eng = 0x0000F00000000000ULL; /* 4 CAPP Read machines */ + reg |= dma_eng; + xscom_write(p->chip_id, p->pe_xscom + XPEC_NEST_CAPP_CNTL, reg); + + /* PEC2 has 3 ETU's + 16 pci lanes that can operate as x16, + * x8+x8 (bifurcated) or x8+x4+x4 (trifurcated) mode. 
When + * Mellanox CX5 card is attached to stack0 of this PEC, indicated by + * request to allocate CAPP_MAX_DMA_READ_ENGINES; we tweak the default + * dma-read engines allocations to maximize the DMA read performance + */ + if ((p->index == CAPP1_PHB_INDEX) && + (capp_eng & CAPP_MAX_DMA_READ_ENGINES)) + phb4_pec2_dma_engine_realloc(p); + + /* PCI to PB data movement ignores the PB init signal. */ + xscom_write_mask(p->chip_id, p->pe_xscom + XPEC_NEST_PBCQ_HW_CONFIG, + XPEC_NEST_PBCQ_HW_CONFIG_PBINIT, + XPEC_NEST_PBCQ_HW_CONFIG_PBINIT); + + /* If pump mode is enabled don't do nodal broadcasts. + */ + xscom_read(p->chip_id, PB_CENT_HP_MODE_CURR, ®); + if (reg & PB_CFG_PUMP_MODE) { + reg = XPEC_NEST_PBCQ_HW_CONFIG_DIS_NODAL; + reg |= XPEC_NEST_PBCQ_HW_CONFIG_DIS_RNNN; + xscom_write_mask(p->chip_id, + p->pe_xscom + XPEC_NEST_PBCQ_HW_CONFIG, + reg, reg); + } + + /* PEC Phase 4 (PHB) registers adjustment + * Inbound CAPP traffic: The CAPI can send both CAPP packets and + * I/O packets. A PCIe packet is indentified as a CAPP packet in + * the PHB if the PCIe address matches either the CAPI + * Compare/Mask register or its NBW Compare/Mask register. + */ + + /* + * Bit [0:7] XSL_DSNCTL[capiind] + * Init_26 - CAPI Compare/Mask + */ + out_be64(p->regs + PHB_CAPI_CMPM, + ((u64)CAPIIND << 48) | + ((u64)CAPIMASK << 32) | PHB_CAPI_CMPM_ENABLE); + + /* PB AIB Hardware Control Register + * Wait 32 PCI clocks for a credit to become available + * before rejecting. + */ + xscom_read(p->chip_id, p->pci_xscom + XPEC_PCI_PBAIB_HW_CONFIG, ®); + reg |= PPC_BITMASK(40, 42); + if (p->index == CAPP1_PHB_INDEX) + reg |= PPC_BIT(30); + xscom_write(p->chip_id, p->pci_xscom + XPEC_PCI_PBAIB_HW_CONFIG, reg); + + /* non-translate/50-bit mode */ + out_be64(p->regs + PHB_NXLATE_PREFIX, 0x0000000000000000Ull); + + /* set tve no translate mode allow mmio window */ + memset(p->tve_cache, 0x0, sizeof(p->tve_cache)); + + /* + * In 50-bit non-translate mode, the fields of the TVE are + * used to perform an address range check. 
In this mode TCE + * Table Size(0) must be a '1' (TVE[51] = 1) + * PCI Addr(49:24) >= TVE[52:53]+TVE[0:23] and + * PCI Addr(49:24) < TVE[54:55]+TVE[24:47] + * + * TVE[51] = 1 + * TVE[56] = 1: 50-bit Non-Translate Mode Enable + * TVE[0:23] = 0x000000 + * TVE[24:47] = 0xFFFFFF + * + * capi dma mode: CAPP DMA mode needs access to all of memory + * capi mode: Allow address range (bit 14 = 1) + * 0x0002000000000000: 0x0002FFFFFFFFFFFF + * TVE[52:53] = '10' and TVE[54:55] = '10' + */ + + /* TVT#0: CAPI window + DMA, all memory */ + start_addr = 0ull; + end_addr = 0x0003ffffffffffffull; + p->tve_cache[pe_number * 2] = + tve_encode_50b_noxlate(start_addr, end_addr); + + /* TVT#1: CAPI window + DMA, all memory, in bypass mode */ + start_addr = (1ull << 59); + end_addr = start_addr + 0x0003ffffffffffffull; + p->tve_cache[pe_number * 2 + 1] = + tve_encode_50b_noxlate(start_addr, end_addr); + + phb4_ioda_sel(p, IODA3_TBL_TVT, 0, true); + for (i = 0; i < p->tvt_size; i++) + out_be64(p->regs + PHB_IODA_DATA0, p->tve_cache[i]); + + /* + * Since TVT#0 is in by-pass mode, disable 32-bit MSI, as a + * DMA write targeting 0x00000000FFFFxxxx would be interpreted + * as a 32-bit MSI + */ + reg = in_be64(p->regs + PHB_PHB4_CONFIG); + reg &= ~PHB_PHB4C_32BIT_MSI_EN; + out_be64(p->regs + PHB_PHB4_CONFIG, reg); + + /* set mbt bar to pass capi mmio window and keep the other + * mmio values + */ + mbt0 = IODA3_MBT0_ENABLE | IODA3_MBT0_TYPE_M64 | + SETFIELD(IODA3_MBT0_MODE, 0ull, IODA3_MBT0_MODE_SINGLE_PE) | + SETFIELD(IODA3_MBT0_MDT_COLUMN, 0ull, 0) | + (0x0002000000000000ULL & IODA3_MBT0_BASE_ADDR); + + mbt1 = IODA3_MBT1_ENABLE | + (0x00ff000000000000ULL & IODA3_MBT1_MASK) | + SETFIELD(IODA3_MBT1_SINGLE_PE_NUM, 0ull, pe_number); + + for (i = 0; i < p->mbt_size; i++) { + /* search if the capi mmio window is already present */ + if ((p->mbt_cache[i][0] == mbt0) && + (p->mbt_cache[i][1] == mbt1)) + break; + + /* search a free entry */ + if ((window_num == -1) && + ((!(p->mbt_cache[i][0] & IODA3_MBT0_ENABLE)) && + (!(p->mbt_cache[i][1] & IODA3_MBT1_ENABLE)))) + window_num = i; + } + + if (window_num >= 0 && i == p->mbt_size) { + /* no capi mmio window found, so add it */ + p->mbt_cache[window_num][0] = mbt0; + p->mbt_cache[window_num][1] = mbt1; + + phb4_ioda_sel(p, IODA3_TBL_MBT, window_num << 1, true); + out_be64(p->regs + PHB_IODA_DATA0, mbt0); + out_be64(p->regs + PHB_IODA_DATA0, mbt1); + } else if (i == p->mbt_size) { + /* mbt cache full, this case should never happen */ + PHBERR(p, "CAPP: Failed to add CAPI mmio window\n"); + } else { + /* duplicate entry. 
Nothing to do */ + } + + phb4_init_capp_errors(p); + + phb4_init_capp_regs(p, capp_eng); + + if (!chiptod_capp_timebase_sync(p->chip_id, CAPP_TFMR, + CAPP_TB, + PHB4_CAPP_REG_OFFSET(p))) + PHBERR(p, "CAPP: Failed to sync timebase\n"); + + /* set callbacks to handle HMI events */ + capi_ops.get_capp_info = &phb4_get_capp_info; + + return OPAL_SUCCESS; +} + + +static int64_t phb4_init_capp(struct phb4 *p) +{ + struct capp *capp; + int rc; + + if (p->index != CAPP0_PHB_INDEX && + p->index != CAPP1_PHB_INDEX) + return OPAL_UNSUPPORTED; + + capp = zalloc(sizeof(struct capp)); + if (capp == NULL) + return OPAL_NO_MEM; + + if (p->index == CAPP0_PHB_INDEX) { + capp->capp_index = 0; + capp->capp_xscom_offset = 0; + + } else if (p->index == CAPP1_PHB_INDEX) { + capp->capp_index = 1; + capp->capp_xscom_offset = CAPP1_REG_OFFSET; + } + + capp->attached_pe = phb4_get_reserved_pe_number(&p->phb); + capp->chip_id = p->chip_id; + + /* Load capp microcode into the capp unit */ + rc = load_capp_ucode(p); + + if (rc == OPAL_SUCCESS) + p->capp = capp; + else + free(capp); + + return rc; +} + +static int64_t phb4_set_capi_mode(struct phb *phb, uint64_t mode, + uint64_t pe_number) +{ + struct phb4 *p = phb_to_phb4(phb); + struct proc_chip *chip = get_chip(p->chip_id); + struct capp *capp = p->capp; + uint64_t reg, ret; + + /* No CAPI on P10. OpenCAPI only */ + if (is_phb5()) + return OPAL_UNSUPPORTED; + + /* cant do a mode switch when capp is in recovery mode */ + ret = capp_xscom_read(capp, CAPP_ERR_STATUS_CTRL, ®); + if (ret != OPAL_SUCCESS) + return ret; + + if ((reg & PPC_BIT(0)) && (!(reg & PPC_BIT(1)))) { + PHBDBG(p, "CAPP: recovery in progress\n"); + return OPAL_BUSY; + } + + + switch (mode) { + + case OPAL_PHB_CAPI_MODE_DMA: /* Enabled by default on p9 */ + case OPAL_PHB_CAPI_MODE_SNOOP_ON: + /* nothing to do on P9 if CAPP is already enabled */ + ret = p->capp->phb ? OPAL_SUCCESS : OPAL_UNSUPPORTED; + break; + + case OPAL_PHB_CAPI_MODE_SNOOP_OFF: + ret = p->capp->phb ? OPAL_UNSUPPORTED : OPAL_SUCCESS; + break; + + case OPAL_PHB_CAPI_MODE_PCIE: + if (p->flags & PHB4_CAPP_DISABLE) { + /* We are in middle of a CAPP disable */ + ret = OPAL_BUSY; + + } else if (capp->phb) { + /* Kick start a creset */ + p->flags |= PHB4_CAPP_DISABLE; + PHBINF(p, "CAPP: PCIE mode needs a cold-reset\n"); + /* Kick off the pci state machine */ + ret = phb4_creset(phb->slot); + ret = ret > 0 ? OPAL_BUSY : ret; + + } else { + /* PHB already in PCI mode */ + ret = OPAL_SUCCESS; + } + break; + + case OPAL_PHB_CAPI_MODE_CAPI: /* Fall Through */ + case OPAL_PHB_CAPI_MODE_DMA_TVT1: + /* Make sure that PHB is not disabling CAPP */ + if (p->flags & PHB4_CAPP_DISABLE) { + PHBERR(p, "CAPP: Disable in progress\n"); + ret = OPAL_BUSY; + break; + } + + /* Check if ucode is available */ + if (!capp_ucode_loaded(chip, p->index)) { + PHBERR(p, "CAPP: ucode not loaded\n"); + ret = OPAL_RESOURCE; + break; + } + + /* + * Mark the CAPP attached to the PHB right away so that + * if a MCE happens during CAPP init we can handle it. + * In case of an error in CAPP init we remove the PHB + * from the attached_mask later. 
+ */ + capp->phb = phb; + capp->attached_pe = pe_number; + + if (mode == OPAL_PHB_CAPI_MODE_DMA_TVT1) + ret = enable_capi_mode(p, pe_number, + CAPP_MIN_STQ_ENGINES | + CAPP_MAX_DMA_READ_ENGINES); + + else + ret = enable_capi_mode(p, pe_number, + CAPP_MAX_STQ_ENGINES | + CAPP_MIN_DMA_READ_ENGINES); + if (ret == OPAL_SUCCESS) { + /* register notification on system shutdown */ + opal_add_host_sync_notifier(&phb4_host_sync_reset, p); + + } else { + /* In case of an error mark the PHB detached */ + capp->phb = NULL; + capp->attached_pe = phb4_get_reserved_pe_number(phb); + } + break; + + default: + ret = OPAL_UNSUPPORTED; + break; + }; + + return ret; +} + +static void phb4_p2p_set_initiator(struct phb4 *p, uint16_t pe_number) +{ + uint64_t tve; + uint16_t window_id = (pe_number << 1) + 1; + + /* + * Initiator needs access to the MMIO space of the target, + * which is well beyond the 'normal' memory area. Set its TVE + * with no range checking. + */ + PHBDBG(p, "Setting TVE#1 for peer-to-peer for pe %d\n", pe_number); + tve = PPC_BIT(51); + phb4_ioda_sel(p, IODA3_TBL_TVT, window_id, false); + out_be64(p->regs + PHB_IODA_DATA0, tve); + p->tve_cache[window_id] = tve; +} + +static void phb4_p2p_set_target(struct phb4 *p, bool enable) +{ + uint64_t val; + + /* + * Enabling p2p on a target PHB reserves an outbound (as seen + * from the CPU) store queue for p2p + */ + PHBDBG(p, "%s peer-to-peer\n", (enable ? "Enabling" : "Disabling")); + xscom_read(p->chip_id, + p->pe_stk_xscom + XPEC_NEST_STK_PBCQ_MODE, &val); + if (enable) + val |= XPEC_NEST_STK_PBCQ_MODE_P2P; + else + val &= ~XPEC_NEST_STK_PBCQ_MODE_P2P; + xscom_write(p->chip_id, + p->pe_stk_xscom + XPEC_NEST_STK_PBCQ_MODE, val); +} + +static void phb4_set_p2p(struct phb *phb, uint64_t mode, uint64_t flags, + uint16_t pe_number) +{ + struct phb4 *p = phb_to_phb4(phb); + + switch (mode) { + case OPAL_PCI_P2P_INITIATOR: + if (flags & OPAL_PCI_P2P_ENABLE) + phb4_p2p_set_initiator(p, pe_number); + /* + * When disabling p2p on the initiator, we should + * reset the TVE to its default bypass setting, but it + * is more easily done from the OS, as it knows the + * the start and end address and there's already an + * opal call for it, so let linux handle it. + */ + break; + case OPAL_PCI_P2P_TARGET: + phb4_p2p_set_target(p, !!(flags & OPAL_PCI_P2P_ENABLE)); + break; + default: + assert(0); + } +} + +static int64_t phb4_set_capp_recovery(struct phb *phb) +{ + struct phb4 *p = phb_to_phb4(phb); + + if (p->flags & PHB4_CAPP_RECOVERY) + return 0; + + /* set opal event flag to indicate eeh condition */ + opal_update_pending_evt(OPAL_EVENT_PCI_ERROR, + OPAL_EVENT_PCI_ERROR); + + p->flags |= PHB4_CAPP_RECOVERY; + + return 0; +} + +/* + * Return the address out of a PBCQ Tunnel Bar register. + */ +static void phb4_get_tunnel_bar(struct phb *phb, uint64_t *addr) +{ + struct phb4 *p = phb_to_phb4(phb); + uint64_t val; + + xscom_read(p->chip_id, p->pe_stk_xscom + XPEC_NEST_STK_TUNNEL_BAR, + &val); + *addr = val >> 8; +} + +/* + * Set PBCQ Tunnel Bar register. + * Store addr bits [8:50] in PBCQ Tunnel Bar register bits [0:42]. + * Note that addr bits [8:50] must also match PSL_TNR_ADDR[8:50]. + * Reset register if val == 0. + * + * This interface is required to let device drivers set the Tunnel Bar + * value of their choice. + * + * Compatibility with older versions of linux, that do not set the + * Tunnel Bar with phb4_set_tunnel_bar(), is ensured by enable_capi_mode(), + * that will set the default value that used to be assumed. 
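+ *
+ * As a worked example of the packing done below: the mask
+ * 0x00FFFFFFFFFFE000 selects exactly addr bits [8:50] (IBM bit
+ * numbering, 0 = MSB), and (addr & mask) << 8 then lands those bits in
+ * register bits [0:42], matching the layout described above.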
+ */ +static int64_t phb4_set_tunnel_bar(struct phb *phb, uint64_t addr) +{ + struct phb4 *p = phb_to_phb4(phb); + uint64_t mask = 0x00FFFFFFFFFFE000ULL; + + if (!addr) { + /* Reset register */ + xscom_write(p->chip_id, + p->pe_stk_xscom + XPEC_NEST_STK_TUNNEL_BAR, addr); + return OPAL_SUCCESS; + } + if ((addr & ~mask)) + return OPAL_PARAMETER; + if (!(addr & mask)) + return OPAL_PARAMETER; + + xscom_write(p->chip_id, p->pe_stk_xscom + XPEC_NEST_STK_TUNNEL_BAR, + (addr & mask) << 8); + return OPAL_SUCCESS; +} + +static const struct phb_ops phb4_ops = { + .cfg_read8 = phb4_pcicfg_read8, + .cfg_read16 = phb4_pcicfg_read16, + .cfg_read32 = phb4_pcicfg_read32, + .cfg_write8 = phb4_pcicfg_write8, + .cfg_write16 = phb4_pcicfg_write16, + .cfg_write32 = phb4_pcicfg_write32, + .get_reserved_pe_number = phb4_get_reserved_pe_number, + .device_init = phb4_device_init, + .device_remove = NULL, + .ioda_reset = phb4_ioda_reset, + .papr_errinjct_reset = phb4_papr_errinjct_reset, + .pci_reinit = phb4_pci_reinit, + .set_phb_mem_window = phb4_set_phb_mem_window, + .phb_mmio_enable = phb4_phb_mmio_enable, + .map_pe_mmio_window = phb4_map_pe_mmio_window, + .map_pe_dma_window = phb4_map_pe_dma_window, + .map_pe_dma_window_real = phb4_map_pe_dma_window_real, + .set_option = phb4_set_option, + .get_option = phb4_get_option, + .set_xive_pe = phb4_set_ive_pe, + .get_msi_32 = phb4_get_msi_32, + .get_msi_64 = phb4_get_msi_64, + .set_pe = phb4_set_pe, + .set_peltv = phb4_set_peltv, + .eeh_freeze_status = phb4_eeh_freeze_status, + .eeh_freeze_clear = phb4_eeh_freeze_clear, + .eeh_freeze_set = phb4_eeh_freeze_set, + .next_error = phb4_eeh_next_error, + .err_inject = phb4_err_inject, + .get_diag_data2 = phb4_get_diag_data, + .tce_kill = phb4_tce_kill, + .set_capi_mode = phb4_set_capi_mode, + .set_p2p = phb4_set_p2p, + .set_capp_recovery = phb4_set_capp_recovery, + .get_tunnel_bar = phb4_get_tunnel_bar, + .set_tunnel_bar = phb4_set_tunnel_bar, +}; + +static void phb4_init_ioda3(struct phb4 *p) +{ + if (is_phb5()) { + /* + * When ABT is on, the MSIs on the PHB use the PQ state bits + * of the IC and MSI triggers from the PHB are forwarded + * directly to the IC ESB page. However, the LSIs are still + * controlled locally on the PHB and LSI triggers use a + * special offset for trigger injection. + */ + if (phb_abt_mode(p)) { + uint64_t mmio_base = xive2_get_esb_base(p->base_msi); + + PHBDBG(p, "Using ABT mode. 
ESB: 0x%016llx\n", mmio_base); + + /* Init_18 - Interrupt Notify Base Address */ + out_be64(p->regs + PHB_INT_NOTIFY_ADDR, + PHB_INT_NOTIFY_ADDR_64K | mmio_base); + + /* Interrupt Notify Base Index is unused */ + } else { + p->irq_port = xive2_get_notify_port(p->chip_id, + XIVE_HW_SRC_PHBn(p->index)); + + PHBDBG(p, "Using IC notif page at 0x%016llx\n", + p->irq_port); + + /* Init_18 - Interrupt Notify Base Address */ + out_be64(p->regs + PHB_INT_NOTIFY_ADDR, p->irq_port); + + /* Init_19 - Interrupt Notify Base Index */ + out_be64(p->regs + PHB_INT_NOTIFY_INDEX, + xive2_get_notify_base(p->base_msi)); + } + + } else { /* p9 */ + p->irq_port = xive_get_notify_port(p->chip_id, + XIVE_HW_SRC_PHBn(p->index)); + /* Init_18 - Interrupt Notify Base Address */ + out_be64(p->regs + PHB_INT_NOTIFY_ADDR, p->irq_port); + + /* Init_19 - Interrupt Notify Base Index */ + out_be64(p->regs + PHB_INT_NOTIFY_INDEX, + xive_get_notify_base(p->base_msi)); + } + + /* Init_19x - Not in spec: Initialize source ID */ + PHBDBG(p, "Reset state SRC_ID: %016llx\n", + in_be64(p->regs + PHB_LSI_SOURCE_ID)); + out_be64(p->regs + PHB_LSI_SOURCE_ID, + SETFIELD(PHB_LSI_SRC_ID, 0ull, (p->num_irqs - 1) >> 3)); + + /* Init_20 - RTT BAR */ + out_be64(p->regs + PHB_RTT_BAR, (u64) p->tbl_rtt | PHB_RTT_BAR_ENABLE); + + /* Init_21 - PELT-V BAR */ + out_be64(p->regs + PHB_PELTV_BAR, + (u64) p->tbl_peltv | PHB_PELTV_BAR_ENABLE); + + /* Init_22 - Setup M32 starting address */ + out_be64(p->regs + PHB_M32_START_ADDR, M32_PCI_START); + + /* Init_23 - Setup PEST BAR */ + out_be64(p->regs + PHB_PEST_BAR, + p->tbl_pest | PHB_PEST_BAR_ENABLE); + + /* Init_24 - CRW Base Address Reg */ + /* See enable_capi_mode() */ + + if (is_phb4()) { + /* Init_25 - ASN Compare/Mask - P9 only */ + out_be64(p->regs + PHB_ASN_CMPM, ((u64)ASNIND << 48) | + ((u64)ASNMASK << 32) | PHB_ASN_CMPM_ENABLE); + } + + /* Init_26 - CAPI Compare/Mask */ + /* See enable_capi_mode() */ + /* if CAPP being disabled then reset CAPI Compare/Mask Register */ + if (p->flags & PHB4_CAPP_DISABLE) + out_be64(p->regs + PHB_CAPI_CMPM, 0); + + /* Init_27 - PCIE Outbound upper address */ + out_be64(p->regs + PHB_M64_UPPER_BITS, 0); + + /* Init_28 - PHB4 Configuration */ + out_be64(p->regs + PHB_PHB4_CONFIG, + PHB_PHB4C_32BIT_MSI_EN | + PHB_PHB4C_64BIT_MSI_EN); + + /* Init_29 - At least 256ns delay according to spec. Do a dummy + * read first to flush posted writes + */ + in_be64(p->regs + PHB_PHB4_CONFIG); + time_wait_us(2); + + /* Init_30..41 - On-chip IODA tables init */ + phb4_ioda_reset(&p->phb, false); +} + +/* phb4_init_rc - Initialize the Root Complex config space + */ +static bool phb4_init_rc_cfg(struct phb4 *p) +{ + int64_t ecap, aercap; + + /* XXX Handle errors ? */ + + /* Init_46: + * + * Set primary bus to 0, secondary to 1 and subordinate to 0xff + */ + phb4_pcicfg_write32(&p->phb, 0, PCI_CFG_PRIMARY_BUS, 0x00ff0100); + + /* Init_47 - Clear errors */ + /* see phb4_rc_err_clear() called below */ + + /* Init_48 + * + * PCIE Device control/status, enable error reporting, disable relaxed + * ordering, set MPS to 128 (see note), clear errors. + * + * Note: The doc recommends to set MPS to 512. This has proved to have + * some issues as it requires specific clamping of MRSS on devices and + * we've found devices in the field that misbehave when doing that. + * + * We currently leave it all to 128 bytes (minimum setting) at init + * time. 
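+ * (128 bytes is the smallest payload size the PCIe MPS field can encode;
+ * the 3-bit field value 0 means 128B, which is presumably what
+ * PCIE_MPS_128B expands to in the SETFIELD() below.)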
The generic PCIe probing later on might apply a different + * value, or the kernel will, but we play it safe at early init + */ + if (p->ecap <= 0) { + ecap = pci_find_cap(&p->phb, 0, PCI_CFG_CAP_ID_EXP); + if (ecap < 0) { + PHBERR(p, "Can't locate PCI-E capability\n"); + return false; + } + p->ecap = ecap; + } else { + ecap = p->ecap; + } + + phb4_pcicfg_write16(&p->phb, 0, ecap + PCICAP_EXP_DEVCTL, + PCICAP_EXP_DEVCTL_CE_REPORT | + PCICAP_EXP_DEVCTL_NFE_REPORT | + PCICAP_EXP_DEVCTL_FE_REPORT | + PCICAP_EXP_DEVCTL_UR_REPORT | + SETFIELD(PCICAP_EXP_DEVCTL_MPS, 0, PCIE_MPS_128B)); + + /* Init_49 - Device Control/Status 2 */ + phb4_pcicfg_write16(&p->phb, 0, ecap + PCICAP_EXP_DCTL2, + SETFIELD(PCICAP_EXP_DCTL2_CMPTOUT, 0, 0x5) | + PCICAP_EXP_DCTL2_ARI_FWD); + + /* Init_50..54 + * + * AER inits + */ + if (p->aercap <= 0) { + aercap = pci_find_ecap(&p->phb, 0, PCIECAP_ID_AER, NULL); + if (aercap < 0) { + PHBERR(p, "Can't locate AER capability\n"); + return false; + } + p->aercap = aercap; + } else { + aercap = p->aercap; + } + + /* Disable some error reporting as per the PHB4 spec */ + phb4_pcicfg_write32(&p->phb, 0, aercap + PCIECAP_AER_UE_MASK, + PCIECAP_AER_UE_POISON_TLP | + PCIECAP_AER_UE_COMPL_TIMEOUT | + PCIECAP_AER_UE_COMPL_ABORT); + + /* Enable ECRC generation & checking */ + phb4_pcicfg_write32(&p->phb, 0, aercap + PCIECAP_AER_CAPCTL, + PCIECAP_AER_CAPCTL_ECRCG_EN | + PCIECAP_AER_CAPCTL_ECRCC_EN); + + phb4_rc_err_clear(p); + + return true; +} + +static void phb4_init_errors(struct phb4 *p) +{ + /* Init_55..63 - PBL errors */ + out_be64(p->regs + 0x1900, 0xffffffffffffffffull); + out_be64(p->regs + 0x1908, 0x0000000000000000ull); + out_be64(p->regs + 0x1920, 0x000000004d1780f8ull); + out_be64(p->regs + 0x1928, 0x0000000000000000ull); + out_be64(p->regs + 0x1930, 0xffffffffb2f87f07ull); + out_be64(p->regs + 0x1940, 0x0000000000000000ull); + out_be64(p->regs + 0x1948, 0x0000000000000000ull); + out_be64(p->regs + 0x1950, 0x0000000000000000ull); + out_be64(p->regs + 0x1958, 0x0000000000000000ull); + + /* Init_64..72 - REGB errors */ + out_be64(p->regs + 0x1c00, 0xffffffffffffffffull); + out_be64(p->regs + 0x1c08, 0x0000000000000000ull); + /* Enable/disable error status indicators that trigger irqs */ + if (p->has_link) { + out_be64(p->regs + 0x1c20, 0x2130006efca8bc00ull); + out_be64(p->regs + 0x1c30, 0xde1fff91035743ffull); + } else { + out_be64(p->regs + 0x1c20, 0x0000000000000000ull); + out_be64(p->regs + 0x1c30, 0x0000000000000000ull); + } + out_be64(p->regs + 0x1c28, 0x0080000000000000ull); + out_be64(p->regs + 0x1c40, 0x0000000000000000ull); + out_be64(p->regs + 0x1c48, 0x0000000000000000ull); + out_be64(p->regs + 0x1c50, 0x0000000000000000ull); + out_be64(p->regs + 0x1c58, 0x0040000000000000ull); + + /* Init_73..81 - TXE errors */ + out_be64(p->regs + 0x0d08, 0x0000000000000000ull); + + /* Errata: Clear bit 17, otherwise a CFG write UR/CA will incorrectly + * freeze a "random" PE (whatever last PE did an MMIO) + */ + if (is_phb5()) { + out_be64(p->regs + 0x0d28, 0x0000500a00000000ull); + out_be64(p->regs + 0x0d00, 0xffffffffffffffffull); + out_be64(p->regs + 0x0d18, 0xffffff0fffffffffull); + out_be64(p->regs + 0x0d30, 0xdff7af41f7ddffdfull); + } else { + out_be64(p->regs + 0x0d28, 0x0000000a00000000ull); + if (phb4_is_dd20(p)) { + out_be64(p->regs + 0x0d00, 0xf3acff0ff7ddfff0ull); + out_be64(p->regs + 0x0d18, 0xf3acff0ff7ddfff0ull); + out_be64(p->regs + 0x0d30, 0xdfffbd05f7ddfff0ull); /* XXX CAPI has diff. 
value */ + } else { + out_be64(p->regs + 0x0d00, 0xffffffffffffffffull); + out_be64(p->regs + 0x0d18, 0xffffff0fffffffffull); + out_be64(p->regs + 0x0d30, 0xdff7bd05f7ddfff0ull); + } + } + + out_be64(p->regs + 0x0d40, 0x0000000000000000ull); + out_be64(p->regs + 0x0d48, 0x0000000000000000ull); + out_be64(p->regs + 0x0d50, 0x0000000000000000ull); + out_be64(p->regs + 0x0d58, 0x0000000000000000ull); + + /* Init_82..90 - RXE_ARB errors */ + out_be64(p->regs + 0x0d80, 0xffffffffffffffffull); + out_be64(p->regs + 0x0d88, 0x0000000000000000ull); + out_be64(p->regs + 0x0d98, 0xfffffffffbffffffull); + out_be64(p->regs + 0x0da8, 0xc00018b801000060ull); + /* + * Errata ER20161123 says we should set the top two bits in + * 0x0db0 but this causes config space accesses which don't + * get a response to fence the PHB. This breaks probing, + * hence we don't set them here. + */ + out_be64(p->regs + 0x0db0, 0x3bffd703fa7fbf8full); /* XXX CAPI has diff. value */ + out_be64(p->regs + 0x0dc0, 0x0000000000000000ull); + out_be64(p->regs + 0x0dc8, 0x0000000000000000ull); + out_be64(p->regs + 0x0dd0, 0x0000000000000000ull); + out_be64(p->regs + 0x0dd8, 0x0000000004000000ull); + + /* Init_91..99 - RXE_MRG errors */ + out_be64(p->regs + 0x0e00, 0xffffffffffffffffull); + out_be64(p->regs + 0x0e08, 0x0000000000000000ull); + out_be64(p->regs + 0x0e18, 0xffffffffffffffffull); + out_be64(p->regs + 0x0e28, 0x0000600000000000ull); + out_be64(p->regs + 0x0e30, 0xfffffeffff7fff57ull); + out_be64(p->regs + 0x0e40, 0x0000000000000000ull); + out_be64(p->regs + 0x0e48, 0x0000000000000000ull); + out_be64(p->regs + 0x0e50, 0x0000000000000000ull); + out_be64(p->regs + 0x0e58, 0x0000000000000000ull); + + /* Init_100..108 - RXE_TCE errors */ + out_be64(p->regs + 0x0e80, 0xffffffffffffffffull); + out_be64(p->regs + 0x0e88, 0x0000000000000000ull); + out_be64(p->regs + 0x0e98, 0xffffffffffffffffull); + out_be64(p->regs + 0x0ea8, 0x60000000c0000000ull); + out_be64(p->regs + 0x0eb0, 0x9faeffaf3fffffffull); /* XXX CAPI has diff. value */ + out_be64(p->regs + 0x0ec0, 0x0000000000000000ull); + out_be64(p->regs + 0x0ec8, 0x0000000000000000ull); + out_be64(p->regs + 0x0ed0, 0x0000000000000000ull); + out_be64(p->regs + 0x0ed8, 0x0000000000000000ull); + + /* Init_109..117 - RXPHB errors */ + out_be64(p->regs + 0x0c80, 0xffffffffffffffffull); + out_be64(p->regs + 0x0c88, 0x0000000000000000ull); + out_be64(p->regs + 0x0c98, 0xffffffffffffffffull); + out_be64(p->regs + 0x0ca8, 0x0000004000000000ull); + out_be64(p->regs + 0x0cb0, 0x35777033ff000000ull); /* XXX CAPI has diff. value */ + out_be64(p->regs + 0x0cc0, 0x0000000000000000ull); + out_be64(p->regs + 0x0cc8, 0x0000000000000000ull); + out_be64(p->regs + 0x0cd0, 0x0000000000000000ull); + out_be64(p->regs + 0x0cd8, 0x0000000000000000ull); + + /* Init_118..121 - LEM */ + out_be64(p->regs + 0x0c00, 0x0000000000000000ull); + if (phb4_is_dd20(p)) { + out_be64(p->regs + 0x0c30, 0xf3ffffffffffffffull); + out_be64(p->regs + 0x0c38, 0xf3ffffffffffffffull); + } else { + out_be64(p->regs + 0x0c30, 0xffffffffffffffffull); + out_be64(p->regs + 0x0c38, 0xffffffffffffffffull); + } + out_be64(p->regs + 0x0c40, 0x0000000000000000ull); +} + + +static bool phb4_wait_dlp_reset(struct phb4 *p) +{ + unsigned int i; + uint64_t val; + + /* + * Firmware cannot access the UTL core regs or PCI config space + * until the cores are out of DL_PGRESET. + * DL_PGRESET should be polled until it is inactive with a value + * of '0'. The recommended polling frequency is once every 1ms. 
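+ * (With that 1ms polling interval and the DLP_RESET_ATTEMPTS limit used
+ * below, the wait is bounded at roughly 200 x 1ms = 200ms.)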
+ * Firmware should poll at least 200 attempts before giving up. + * MMIO Stores to the link are silently dropped by the UTL core if + * the link is down. + * MMIO Loads to the link will be dropped by the UTL core and will + * eventually time-out and will return an all ones response if the + * link is down. + */ +#define DLP_RESET_ATTEMPTS 200 + + PHBDBG(p, "Waiting for DLP PG reset to complete...\n"); + for (i = 0; i < DLP_RESET_ATTEMPTS; i++) { + val = in_be64(p->regs + PHB_PCIE_DLP_TRAIN_CTL); + if (!(val & PHB_PCIE_DLP_DL_PGRESET)) + break; + time_wait_ms(1); + } + if (val & PHB_PCIE_DLP_DL_PGRESET) { + PHBERR(p, "Timeout waiting for DLP PG reset !\n"); + return false; + } + return true; +} +static void phb4_init_hw(struct phb4 *p) +{ + uint64_t val, creset; + + PHBDBG(p, "Initializing PHB...\n"); + + /* Init_1 - Sync reset + * + * At this point we assume the PHB has already been reset. + */ + + /* Init_2 - Mask FIRs */ + out_be64(p->regs + PHB_LEM_ERROR_MASK, 0xffffffffffffffffull); + + /* Init_3 - TCE tag enable */ + out_be64(p->regs + PHB_TCE_TAG_ENABLE, 0xffffffffffffffffull); + + /* Init_4 - PCIE System Configuration Register + * + * Adjust max speed based on system config + */ + val = in_be64(p->regs + PHB_PCIE_SCR); + PHBDBG(p, "Default system config: 0x%016llx\n", val); + val = SETFIELD(PHB_PCIE_SCR_MAXLINKSPEED, val, p->max_link_speed); + out_be64(p->regs + PHB_PCIE_SCR, val); + PHBDBG(p, "New system config : 0x%016llx\n", + in_be64(p->regs + PHB_PCIE_SCR)); + + /* Init_5 - deassert CFG reset */ + creset = in_be64(p->regs + PHB_PCIE_CRESET); + PHBDBG(p, "Initial PHB CRESET is 0x%016llx\n", creset); + creset &= ~PHB_PCIE_CRESET_CFG_CORE; + out_be64(p->regs + PHB_PCIE_CRESET, creset); + + /* Init_6..13 - PCIE DLP Lane EQ control */ + if (p->lane_eq) { + out_be64(p->regs + PHB_PCIE_LANE_EQ_CNTL0, be64_to_cpu(p->lane_eq[0])); + out_be64(p->regs + PHB_PCIE_LANE_EQ_CNTL1, be64_to_cpu(p->lane_eq[1])); + out_be64(p->regs + PHB_PCIE_LANE_EQ_CNTL2, be64_to_cpu(p->lane_eq[2])); + out_be64(p->regs + PHB_PCIE_LANE_EQ_CNTL3, be64_to_cpu(p->lane_eq[3])); + out_be64(p->regs + PHB_PCIE_LANE_EQ_CNTL40, be64_to_cpu(p->lane_eq[4])); + out_be64(p->regs + PHB_PCIE_LANE_EQ_CNTL41, be64_to_cpu(p->lane_eq[5])); + if (is_phb5()) { + out_be64(p->regs + PHB_PCIE_LANE_EQ_CNTL50, be64_to_cpu(p->lane_eq[6])); + out_be64(p->regs + PHB_PCIE_LANE_EQ_CNTL51, be64_to_cpu(p->lane_eq[7])); + } + } + if (!p->lane_eq_en) { + /* Read modify write and set to 2 bits */ + PHBDBG(p, "LINK: Disabling Lane EQ\n"); + val = in_be64(p->regs + PHB_PCIE_DLP_CTL); + val |= PHB_PCIE_DLP_CTL_BYPASS_PH2 | PHB_PCIE_DLP_CTL_BYPASS_PH3; + out_be64(p->regs + PHB_PCIE_DLP_CTL, val); + } + + if (is_phb5()) { + /* disable scaled flow control for now. 
SW527785 */ + PHBDBG(p, "LINK: Disabling scaled flow control\n"); + val = in_be64(p->regs + PHB_PCIE_DLP_CTL); + val |= PHB_PCIE_DLP_CTL_SFC_DISABLE; + out_be64(p->regs + PHB_PCIE_DLP_CTL, val); + + /* lane equalization settings need to be tuned on P10 */ + out_be64(p->regs + PHB_PCIE_PDL_PHY_EQ_CNTL, + 0x80F4FFFFFF0F9C00); + } + + /* Init_14 - Clear link training */ + phb4_pcicfg_write32(&p->phb, 0, 0x78, + 0x07FE0000 | p->max_link_speed); + + /* Init_15 - deassert cores reset */ + /* + * Lift the PHB resets but not PERST, this will be lifted + * later by the initial PERST state machine + */ + creset &= ~(PHB_PCIE_CRESET_TLDLP | PHB_PCIE_CRESET_PBL); + creset |= PHB_PCIE_CRESET_PIPE_N; + out_be64(p->regs + PHB_PCIE_CRESET, creset); + + /* Init_16 - Wait for DLP PGRESET to clear */ + if (!phb4_wait_dlp_reset(p)) + goto failed; + + /* Init_17 - PHB Control */ + val = PHB_CTRLR_IRQ_PGSZ_64K; + val |= PHB_CTRLR_TCE_CLB_DISABLE; // HW557787 circumvention + val |= SETFIELD(PHB_CTRLR_TVT_ADDR_SEL, 0ull, TVT_2_PER_PE); + if (phb_pq_disable(p)) + val |= PHB_CTRLR_IRQ_PQ_DISABLE; + if (phb_abt_mode(p)) + val |= PHB_CTRLR_IRQ_ABT_MODE; + if (phb_can_store_eoi(p)) { + val |= PHB_CTRLR_IRQ_STORE_EOI; + PHBDBG(p, "store EOI is enabled\n"); + } + + if (!pci_eeh_mmio) + val |= PHB_CTRLR_MMIO_EEH_DISABLE; + + out_be64(p->regs + PHB_CTRLR, val); + + /* Init_18..41 - Architected IODA3 inits */ + phb4_init_ioda3(p); + + /* Init_42..45 - Clear DLP error logs */ + out_be64(p->regs + 0x1aa0, 0xffffffffffffffffull); + out_be64(p->regs + 0x1aa8, 0xffffffffffffffffull); + out_be64(p->regs + 0x1ab0, 0xffffffffffffffffull); + out_be64(p->regs + 0x1ab8, 0x0); + + + /* Init_46..54 : Init root complex config space */ + if (!phb4_init_rc_cfg(p)) + goto failed; + + /* Init_55..121 : Setup error registers */ + phb4_init_errors(p); + + /* Init_122..123 : Wait for link + * NOTE: At this point the spec waits for the link to come up. We + * don't bother as we are doing a PERST soon. + */ + + /* Init_124 : NBW. XXX TODO */ + /* See enable_capi_mode() */ + + /* Init_125 : Setup PCI command/status on root complex + * I don't know why the spec does this now and not earlier, so + * to be sure to get it right we might want to move it to the freset + * state machine, though the generic PCI layer will probably do + * this anyway (ie, enable MEM, etc... 
in the RC) + + */ + phb4_pcicfg_write16(&p->phb, 0, PCI_CFG_CMD, + PCI_CFG_CMD_MEM_EN | + PCI_CFG_CMD_BUS_MASTER_EN); + + /* Clear errors */ + phb4_pcicfg_write16(&p->phb, 0, PCI_CFG_STAT, + PCI_CFG_STAT_SENT_TABORT | + PCI_CFG_STAT_RECV_TABORT | + PCI_CFG_STAT_RECV_MABORT | + PCI_CFG_STAT_SENT_SERR | + PCI_CFG_STAT_RECV_PERR); + + /* Init_126..130 - Re-enable error interrupts */ + phb4_int_unmask_all(p); + + /* Init_131 - Re-enable LEM error mask */ + out_be64(p->regs + PHB_LEM_ERROR_MASK, 0x0000000000000000ull); + + + /* Init_132 - Enable DMA address speculation */ + out_be64(p->regs + PHB_TCE_SPEC_CTL, 0x0000000000000000ull); + + /* Init_133 - Timeout Control Register 1 */ + out_be64(p->regs + PHB_TIMEOUT_CTRL1, 0x0015150000150000ull); + + /* Init_134 - Timeout Control Register 2 */ + out_be64(p->regs + PHB_TIMEOUT_CTRL2, 0x0000151500000000ull); + + /* Init_135 - PBL Timeout Control Register */ + out_be64(p->regs + PHB_PBL_TIMEOUT_CTRL, 0x2013000000000000ull); + + /* Mark the PHB as functional which enables all the various sequences */ + p->broken = false; + + PHBDBG(p, "Initialization complete\n"); + + return; + + failed: + PHBERR(p, "Initialization failed\n"); + p->broken = true; +} + +/* FIXME: Use scoms rather than MMIO incase we are fenced */ +static bool phb4_read_capabilities(struct phb4 *p) +{ + uint64_t val; + + /* XXX Should make sure ETU is out of reset ! */ + + /* Grab version and fit it in an int */ + val = phb4_read_reg_asb(p, PHB_VERSION); + if (val == 0 || val == 0xffffffffffffffffUL) { + PHBERR(p, "Failed to read version, PHB appears broken\n"); + return false; + } + + p->rev = ((val >> 16) & 0x00ff0000) | (val & 0xffff); + PHBDBG(p, "Core revision 0x%x\n", p->rev); + + /* Read EEH capabilities */ + val = in_be64(p->regs + PHB_PHB4_EEH_CAP); + if (val == 0xffffffffffffffffUL) { + PHBERR(p, "Failed to read EEH cap, PHB appears broken\n"); + return false; + } + p->max_num_pes = val >> 52; + if (p->max_num_pes >= 512) { + p->mrt_size = 16; + p->mbt_size = 32; + p->tvt_size = 1024; + } else { + p->mrt_size = 8; + p->mbt_size = 16; + p->tvt_size = 512; + } + + val = in_be64(p->regs + PHB_PHB4_IRQ_CAP); + if (val == 0xffffffffffffffffUL) { + PHBERR(p, "Failed to read IRQ cap, PHB appears broken\n"); + return false; + } + p->num_irqs = val & 0xffff; + + /* This works for 512 PEs. 
FIXME calculate for any hardware + * size returned above + */ + p->tbl_peltv_size = PELTV_TABLE_SIZE_MAX; + + p->tbl_pest_size = p->max_num_pes*16; + + PHBDBG(p, "Found %d max PEs and %d IRQs \n", + p->max_num_pes, p->num_irqs); + + return true; +} + +static void phb4_allocate_tables(struct phb4 *p) +{ + uint32_t i; + + /* XXX Our current memalign implementation sucks, + * + * It will do the job, however it doesn't support freeing + * the memory and wastes space by always allocating twice + * as much as requested (size + alignment) + */ + p->tbl_rtt = local_alloc(p->chip_id, RTT_TABLE_SIZE, RTT_TABLE_SIZE); + assert(p->tbl_rtt); + for (i = 0; i < RTT_TABLE_ENTRIES; i++) + p->tbl_rtt[i] = cpu_to_be16(PHB4_RESERVED_PE_NUM(p)); + + p->tbl_peltv = local_alloc(p->chip_id, p->tbl_peltv_size, p->tbl_peltv_size); + assert(p->tbl_peltv); + memset(p->tbl_peltv, 0, p->tbl_peltv_size); + + p->tbl_pest = (uint64_t)local_alloc(p->chip_id, p->tbl_pest_size, p->tbl_pest_size); + assert(p->tbl_pest); + memset((void *)p->tbl_pest, 0, p->tbl_pest_size); +} + +static void phb4_add_properties(struct phb4 *p) +{ + struct dt_node *np = p->phb.dt_node; + uint32_t lsibase, icsp = get_ics_phandle(); + uint64_t m32b, m64b, m64s; + + /* Add various properties that HB doesn't have to + * add, some of them simply because they result from + * policy decisions made in skiboot rather than in HB + * such as the MMIO windows going to PCI, interrupts, + * etc... + */ + dt_add_property_cells(np, "#address-cells", 3); + dt_add_property_cells(np, "#size-cells", 2); + dt_add_property_cells(np, "#interrupt-cells", 1); + dt_add_property_cells(np, "bus-range", 0, 0xff); + dt_add_property_cells(np, "clock-frequency", 0x200, 0); /* ??? */ + + dt_add_property_cells(np, "interrupt-parent", icsp); + + /* XXX FIXME: add slot-name */ + //dt_property_cell("bus-width", 8); /* Figure it out from VPD ? */ + + /* "ranges", we only expose M32 (PHB4 doesn't do IO) + * + * Note: The kernel expects us to have chopped of 64k from the + * M32 size (for the 32-bit MSIs). If we don't do that, it will + * get confused (OPAL does it) + */ + m32b = cleanup_addr(p->mm1_base); + m64b = cleanup_addr(p->mm0_base); + m64s = p->mm0_size; + dt_add_property_cells(np, "ranges", + /* M32 space */ + 0x02000000, 0x00000000, M32_PCI_START, + hi32(m32b), lo32(m32b), 0, M32_PCI_SIZE - 0x10000); + + /* XXX FIXME: add opal-memwin32, dmawins, etc... */ + dt_add_property_u64s(np, "ibm,opal-m64-window", m64b, m64b, m64s); + dt_add_property(np, "ibm,opal-single-pe", NULL, 0); + dt_add_property_cells(np, "ibm,opal-num-pes", p->num_pes); + dt_add_property_cells(np, "ibm,opal-reserved-pe", + PHB4_RESERVED_PE_NUM(p)); + dt_add_property_cells(np, "ibm,opal-msi-ranges", + p->base_msi, p->num_irqs - 8); + /* M64 ranges start at 1 as MBT0 is used for M32 */ + dt_add_property_cells(np, "ibm,opal-available-m64-ranges", + 1, p->mbt_size - 1); + dt_add_property_cells(np, "ibm,supported-tce-sizes", + 12, // 4K + 16, // 64K + 21, // 2M + 30); // 1G + + /* Tell Linux about alignment limits for segment splits. 
+ * + * XXX We currently only expose splits of 1 and "num PEs", + */ + dt_add_property_cells(np, "ibm,opal-m64-segment-splits", + /* Full split, number of segments: */ + p->num_pes, + /* Encoding passed to the enable call */ + OPAL_ENABLE_M64_SPLIT, + /* Alignement/size restriction in #bits*/ + /* XXX VERIFY VALUE */ + 12, + /* Unused */ + 0, + /* single PE, number of segments: */ + 1, + /* Encoding passed to the enable call */ + OPAL_ENABLE_M64_NON_SPLIT, + /* Alignement/size restriction in #bits*/ + /* XXX VERIFY VALUE */ + 12, + /* Unused */ + 0); + + /* The interrupt maps will be generated in the RC node by the + * PCI code based on the content of this structure: + */ + lsibase = p->base_lsi; + p->phb.lstate.int_size = 2; + p->phb.lstate.int_val[0][0] = lsibase + PHB4_LSI_PCIE_INTA; + p->phb.lstate.int_val[0][1] = 1; + p->phb.lstate.int_val[1][0] = lsibase + PHB4_LSI_PCIE_INTB; + p->phb.lstate.int_val[1][1] = 1; + p->phb.lstate.int_val[2][0] = lsibase + PHB4_LSI_PCIE_INTC; + p->phb.lstate.int_val[2][1] = 1; + p->phb.lstate.int_val[3][0] = lsibase + PHB4_LSI_PCIE_INTD; + p->phb.lstate.int_val[3][1] = 1; + p->phb.lstate.int_parent[0] = icsp; + p->phb.lstate.int_parent[1] = icsp; + p->phb.lstate.int_parent[2] = icsp; + p->phb.lstate.int_parent[3] = icsp; + + /* Indicators for variable tables */ + dt_add_property_cells(np, "ibm,opal-rtt-table", + hi32((u64) p->tbl_rtt), lo32((u64) p->tbl_rtt), RTT_TABLE_SIZE); + + dt_add_property_cells(np, "ibm,opal-peltv-table", + hi32((u64) p->tbl_peltv), lo32((u64) p->tbl_peltv), + p->tbl_peltv_size); + + dt_add_property_cells(np, "ibm,opal-pest-table", + hi32(p->tbl_pest), lo32(p->tbl_pest), p->tbl_pest_size); + + dt_add_property_cells(np, "ibm,phb-diag-data-size", + sizeof(struct OpalIoPhb4ErrorData)); + + /* Indicate to Linux that CAPP timebase sync is supported */ + dt_add_property_string(np, "ibm,capp-timebase-sync", NULL); + + /* Tell Linux Compare/Mask indication values */ + dt_add_property_cells(np, "ibm,phb-indications", CAPIIND, ASNIND, + NBWIND); +} + +static bool phb4_calculate_windows(struct phb4 *p) +{ + const struct dt_property *prop; + + /* Get PBCQ MMIO windows from device-tree */ + prop = dt_require_property(p->phb.dt_node, + "ibm,mmio-windows", -1); + assert(prop->len >= (2 * sizeof(uint64_t))); + + p->mm0_base = dt_property_get_u64(prop, 0); + p->mm0_size = dt_property_get_u64(prop, 1); + if (prop->len > 16) { + p->mm1_base = dt_property_get_u64(prop, 2); + p->mm1_size = dt_property_get_u64(prop, 3); + } + + /* Sort them so that 0 is big and 1 is small */ + if (p->mm1_size && p->mm1_size > p->mm0_size) { + uint64_t b = p->mm0_base; + uint64_t s = p->mm0_size; + p->mm0_base = p->mm1_base; + p->mm0_size = p->mm1_size; + p->mm1_base = b; + p->mm1_size = s; + } + + /* If 1 is too small, ditch it */ + if (p->mm1_size < M32_PCI_SIZE) + p->mm1_size = 0; + + /* If 1 doesn't exist, carve it out of 0 */ + if (p->mm1_size == 0) { + p->mm0_size /= 2; + p->mm1_base = p->mm0_base + p->mm0_size; + p->mm1_size = p->mm0_size; + } + + /* Crop mm1 to our desired size */ + if (p->mm1_size > M32_PCI_SIZE) + p->mm1_size = M32_PCI_SIZE; + + return true; +} + +static void phb4_err_interrupt(struct irq_source *is, uint32_t isn) +{ + struct phb4 *p = is->data; + + PHBDBG(p, "Got interrupt 0x%08x\n", isn); + + /* mask the interrupt conditions to prevent it from re-firing */ + phb4_int_mask_active(p); + + /* Update pending event */ + opal_update_pending_evt(OPAL_EVENT_PCI_ERROR, + OPAL_EVENT_PCI_ERROR); + + /* If the PHB is broken, go away */ + if (p->broken) + 
return; + + /* + * Mark the PHB has pending error so that the OS + * can handle it at late point. + */ + phb4_set_err_pending(p, true); +} + +static uint64_t phb4_lsi_attributes(struct irq_source *is __unused, + uint32_t isn __unused) +{ +#ifndef DISABLE_ERR_INTS + struct phb4 *p = is->data; + uint32_t idx = isn - p->base_lsi; + + if (idx == PHB4_LSI_PCIE_INF || idx == PHB4_LSI_PCIE_ER) + return IRQ_ATTR_TARGET_OPAL | IRQ_ATTR_TARGET_RARE | IRQ_ATTR_TYPE_LSI; +#endif + return IRQ_ATTR_TARGET_LINUX; +} + +static char *phb4_lsi_name(struct irq_source *is, uint32_t isn) +{ + struct phb4 *p = is->data; + uint32_t idx = isn - p->base_lsi; + char buf[32]; + + if (idx == PHB4_LSI_PCIE_INF) + snprintf(buf, 32, "phb#%04x-inf", p->phb.opal_id); + else if (idx == PHB4_LSI_PCIE_ER) + snprintf(buf, 32, "phb#%04x-err", p->phb.opal_id); + else + assert(0); /* PCIe LSIs should never be directed to OPAL */ + + return strdup(buf); +} + +static const struct irq_source_ops phb4_lsi_ops = { + .interrupt = phb4_err_interrupt, + .attributes = phb4_lsi_attributes, + .name = phb4_lsi_name, +}; + +static __be64 lane_eq_default[8] = { + CPU_TO_BE64(0x5454545454545454UL), CPU_TO_BE64(0x5454545454545454UL), + CPU_TO_BE64(0x5454545454545454UL), CPU_TO_BE64(0x5454545454545454UL), + CPU_TO_BE64(0x7777777777777777UL), CPU_TO_BE64(0x7777777777777777UL), + CPU_TO_BE64(0x7777777777777777UL), CPU_TO_BE64(0x7777777777777777UL), +}; + +static __be64 lane_eq_phb5_default[8] = { + CPU_TO_BE64(0x4444444444444444UL), CPU_TO_BE64(0x4444444444444444UL), + CPU_TO_BE64(0x4444444444444444UL), CPU_TO_BE64(0x4444444444444444UL), + CPU_TO_BE64(0x4444444444444444UL), CPU_TO_BE64(0x4444444444444444UL), + CPU_TO_BE64(0x9999999999999999UL), CPU_TO_BE64(0x9999999999999999UL), +}; + +static void phb4_create(struct dt_node *np) +{ + const struct dt_property *prop; + struct phb4 *p; + struct pci_slot *slot; + size_t lane_eq_len, lane_eq_len_req; + struct dt_node *iplp; + char *path; + uint32_t irq_base, irq_flags; + int i, eq_reg_count; + int chip_id; + + chip_id = dt_prop_get_u32(np, "ibm,chip-id"); + p = local_alloc(chip_id, sizeof(struct phb4), 8); + assert(p); + memset(p, 0x0, sizeof(struct phb4)); + + /* Populate base stuff */ + p->index = dt_prop_get_u32(np, "ibm,phb-index"); + p->chip_id = chip_id; + p->pec = dt_prop_get_u32(np, "ibm,phb-pec-index"); + p->regs = (void *)dt_get_address(np, 0, NULL); + p->int_mmio = (void *)dt_get_address(np, 1, NULL); + p->phb.dt_node = np; + p->phb.ops = &phb4_ops; + p->phb.phb_type = phb_type_pcie_v4; + p->phb.scan_map = 0x1; /* Only device 0 to scan */ + + if (!phb4_calculate_windows(p)) + return; + + /* Get the various XSCOM register bases from the device-tree */ + prop = dt_require_property(np, "ibm,xscom-bases", 5 * sizeof(uint32_t)); + p->pe_xscom = dt_property_get_cell(prop, 0); + p->pe_stk_xscom = dt_property_get_cell(prop, 1); + p->pci_xscom = dt_property_get_cell(prop, 2); + p->pci_stk_xscom = dt_property_get_cell(prop, 3); + p->etu_xscom = dt_property_get_cell(prop, 4); + + /* + * We skip the initial PERST assertion requested by the generic code + * when doing a cold boot because we are coming out of cold boot already + * so we save boot time that way. 
The PERST state machine will still + * handle waiting for the link to come up, it will just avoid actually + * asserting & deasserting the PERST output + * + * For a hot IPL, we still do a PERST + * + * Note: In absence of property (ie, FSP-less), we stick to the old + * behaviour and set skip_perst to true + */ + p->skip_perst = true; /* Default */ + + iplp = dt_find_by_path(dt_root, "ipl-params/ipl-params"); + if (iplp) { + const char *ipl_type = dt_prop_get_def(iplp, "cec-major-type", NULL); + if (ipl_type && (!strcmp(ipl_type, "hot"))) + p->skip_perst = false; + } + + /* By default link is assumed down */ + p->has_link = false; + + /* We register the PHB before we initialize it so we + * get a useful OPAL ID for it + */ + pci_register_phb(&p->phb, phb4_get_opal_id(p->chip_id, p->index)); + + /* Create slot structure */ + slot = phb4_slot_create(&p->phb); + if (!slot) + PHBERR(p, "Cannot create PHB slot\n"); + + /* Hello ! */ + path = dt_get_path(np); + PHBINF(p, "Found %s @%p\n", path, p->regs); + PHBINF(p, " M32 [0x%016llx..0x%016llx]\n", + p->mm1_base, p->mm1_base + p->mm1_size - 1); + PHBINF(p, " M64 [0x%016llx..0x%016llx]\n", + p->mm0_base, p->mm0_base + p->mm0_size - 1); + free(path); + + /* Find base location code from root node */ + p->phb.base_loc_code = dt_prop_get_def(dt_root, + "ibm,io-base-loc-code", NULL); + if (!p->phb.base_loc_code) + PHBDBG(p, "Base location code not found !\n"); + + /* + * Grab CEC IO VPD load info from the root of the device-tree, + * on P8 there's a single such VPD for the whole machine + */ + prop = dt_find_property(dt_root, "ibm,io-vpd"); + if (!prop) { + /* LX VPD Lid not already loaded */ + if (platform.vpd_iohub_load) + platform.vpd_iohub_load(dt_root); + } + + /* Obtain informatin about the PHB from the hardware directly */ + if (!phb4_read_capabilities(p)) + goto failed; + + p->max_link_speed = phb4_get_max_link_speed(p, np); + p->max_link_width = phb4_get_max_link_width(p); + PHBINF(p, "Max link speed: GEN%i, max link width %i\n", + p->max_link_speed, p->max_link_width); + + /* Check for lane equalization values from HB or HDAT */ + p->lane_eq_en = true; + p->lane_eq = dt_prop_get_def_size(np, "ibm,lane-eq", NULL, &lane_eq_len); + if (is_phb5()) + eq_reg_count = 8; + else + eq_reg_count = 6; + lane_eq_len_req = eq_reg_count * 8; + if (p->lane_eq) { + if (lane_eq_len < lane_eq_len_req) { + PHBERR(p, "Device-tree has ibm,lane-eq too short: %ld" + " (want %ld)\n", lane_eq_len, lane_eq_len_req); + p->lane_eq = NULL; + } + } else { + PHBDBG(p, "Using default lane equalization settings\n"); + if (is_phb5()) + p->lane_eq = lane_eq_phb5_default; + else + p->lane_eq = lane_eq_default; + } + if (p->lane_eq) { + PHBDBG(p, "Override lane equalization settings:\n"); + for (i = 0 ; i < lane_eq_len_req/(8 * 2) ; i++) + PHBDBG(p, " 0x%016llx 0x%016llx\n", + be64_to_cpu(p->lane_eq[2 * i]), + be64_to_cpu(p->lane_eq[2 * i + 1])); + } + + /* Allocate a block of interrupts. We need to know if it needs + * 2K or 4K interrupts ... 
for now we just use 4K but that + * needs to be fixed + */ + if (is_phb5()) + irq_base = xive2_alloc_hw_irqs(p->chip_id, p->num_irqs, p->num_irqs); + else + irq_base = xive_alloc_hw_irqs(p->chip_id, p->num_irqs, p->num_irqs); + if (irq_base == XIVE_IRQ_ERROR) { + PHBERR(p, "Failed to allocate %d interrupt sources\n", + p->num_irqs); + goto failed; + } + p->base_msi = irq_base; + p->base_lsi = irq_base + p->num_irqs - 8; + p->num_pes = p->max_num_pes; + + /* Allocate the SkiBoot internal in-memory tables for the PHB */ + phb4_allocate_tables(p); + + phb4_add_properties(p); + + /* Clear IODA3 cache */ + phb4_init_ioda_cache(p); + + /* Get the HW up and running */ + phb4_init_hw(p); + + /* init capp that might get attached to the phb */ + if (is_phb4()) + phb4_init_capp(p); + + /* Compute XIVE source flags depending on PHB revision */ + irq_flags = 0; + if (phb_can_store_eoi(p)) + irq_flags |= XIVE_SRC_STORE_EOI; + else + irq_flags |= XIVE_SRC_TRIGGER_PAGE; + + if (is_phb5()) { + /* + * Register sources with XIVE. If offloading is on, use the + * ESB pages of the XIVE IC for the MSI sources instead of the + * ESB pages of the PHB. + */ + if (phb_pq_disable(p) || phb_abt_mode(p)) { + xive2_register_esb_source(p->base_msi, p->num_irqs - 8); + } else { + xive2_register_hw_source(p->base_msi, + p->num_irqs - 8, 16, + p->int_mmio, irq_flags, + NULL, NULL); + } + + /* + * LSI sources always use the ESB pages of the PHB. + */ + xive2_register_hw_source(p->base_lsi, 8, 16, + p->int_mmio + ((p->num_irqs - 8) << 16), + XIVE_SRC_LSI | irq_flags, p, &phb4_lsi_ops); + } else { + /* Register all interrupt sources with XIVE */ + xive_register_hw_source(p->base_msi, p->num_irqs - 8, 16, + p->int_mmio, irq_flags, NULL, NULL); + + xive_register_hw_source(p->base_lsi, 8, 16, + p->int_mmio + ((p->num_irqs - 8) << 16), + XIVE_SRC_LSI, p, &phb4_lsi_ops); + } + + /* Platform additional setup */ + if (platform.pci_setup_phb) + platform.pci_setup_phb(&p->phb, p->index); + + dt_add_property_string(np, "status", "okay"); + + return; + + failed: + p->broken = true; + + /* Tell Linux it's broken */ + dt_add_property_string(np, "status", "error"); +} + +static void phb4_probe_stack(struct dt_node *stk_node, uint32_t pec_index, + uint32_t nest_base, uint32_t pci_base) +{ + enum phys_map_type phys_mmio64, phys_mmio32, phys_xive_esb, phys_reg_spc; + uint32_t pci_stack, nest_stack, etu_base, gcid, phb_num, stk_index; + uint64_t val, phb_bar = 0, irq_bar = 0, bar_en; + uint64_t mmio0_bar = 0, mmio0_bmask, mmio0_sz; + uint64_t mmio1_bar = 0, mmio1_bmask, mmio1_sz; + void *foo; + __be64 mmio_win[4]; + unsigned int mmio_win_sz; + struct dt_node *np; + char *path; + uint64_t capp_ucode_base; + unsigned int max_link_speed; + int rc; + + assert(is_phb5() || is_phb4()); /* Sanity check */ + + gcid = dt_get_chip_id(stk_node); + stk_index = dt_prop_get_u32(stk_node, "reg"); + phb_num = dt_prop_get_u32(stk_node, "ibm,phb-index"); + path = dt_get_path(stk_node); + if (is_phb5()) { + phys_mmio64 = PHB5_64BIT_MMIO; + phys_mmio32 = PHB5_32BIT_MMIO; + phys_xive_esb = PHB5_XIVE_ESB; + phys_reg_spc = PHB5_REG_SPC; + prlog(PR_INFO, "PHB: Chip %d Found PHB5 PBCQ%d Stack %d at %s\n", + gcid, pec_index, stk_index, path); + } else { + phys_mmio64 = PHB4_64BIT_MMIO; + phys_mmio32 = PHB4_32BIT_MMIO; + phys_xive_esb = PHB4_XIVE_ESB; + phys_reg_spc = PHB4_REG_SPC; + prlog(PR_INFO, "PHB: Chip %d Found PHB4 PBCQ%d Stack %d at %s\n", + gcid, pec_index, stk_index, path); + } + free(path); + + pci_stack = pci_base + 0x40 * (stk_index + 1); + nest_stack = 
nest_base + 0x40 * (stk_index + 1); + etu_base = pci_base + 0x100 + 0x40 * stk_index; + + prlog(PR_DEBUG, "PHB[%d:%d] X[PE]=0x%08x/0x%08x X[PCI]=0x%08x/0x%08x X[ETU]=0x%08x\n", + gcid, phb_num, nest_base, nest_stack, pci_base, pci_stack, etu_base); + + /* Default BAR enables */ + bar_en = 0; + + /* Initialize PHB register BAR */ + phys_map_get(gcid, phys_reg_spc, phb_num, &phb_bar, NULL); + rc = xscom_write(gcid, nest_stack + XPEC_NEST_STK_PHB_REG_BAR, + phb_bar << 8); + + /* A scom error here probably indicates a defective/garded PHB */ + if (rc != OPAL_SUCCESS) { + prerror("PHB[%d:%d] Unable to set PHB BAR. Error=%d\n", + gcid, phb_num, rc); + return; + } + + bar_en |= XPEC_NEST_STK_BAR_EN_PHB; + + /* Same with INT BAR (ESB) */ + phys_map_get(gcid, phys_xive_esb, phb_num, &irq_bar, NULL); + xscom_write(gcid, nest_stack + XPEC_NEST_STK_IRQ_BAR, irq_bar << 8); + bar_en |= XPEC_NEST_STK_BAR_EN_INT; + + + /* Same with MMIO windows */ + phys_map_get(gcid, phys_mmio64, phb_num, &mmio0_bar, &mmio0_sz); + mmio0_bmask = (~(mmio0_sz - 1)) & 0x00FFFFFFFFFFFFFFULL; + xscom_write(gcid, nest_stack + XPEC_NEST_STK_MMIO_BAR0, mmio0_bar << 8); + xscom_write(gcid, nest_stack + XPEC_NEST_STK_MMIO_BAR0_MASK, mmio0_bmask << 8); + + phys_map_get(gcid, phys_mmio32, phb_num, &mmio1_bar, &mmio1_sz); + mmio1_bmask = (~(mmio1_sz - 1)) & 0x00FFFFFFFFFFFFFFULL; + xscom_write(gcid, nest_stack + XPEC_NEST_STK_MMIO_BAR1, mmio1_bar << 8); + xscom_write(gcid, nest_stack + XPEC_NEST_STK_MMIO_BAR1_MASK, mmio1_bmask << 8); + + /* Build MMIO windows list */ + mmio_win_sz = 0; + if (mmio0_bar) { + mmio_win[mmio_win_sz++] = cpu_to_be64(mmio0_bar); + mmio_win[mmio_win_sz++] = cpu_to_be64(mmio0_sz); + bar_en |= XPEC_NEST_STK_BAR_EN_MMIO0; + } + if (mmio1_bar) { + mmio_win[mmio_win_sz++] = cpu_to_be64(mmio1_bar); + mmio_win[mmio_win_sz++] = cpu_to_be64(mmio1_sz); + bar_en |= XPEC_NEST_STK_BAR_EN_MMIO1; + } + + /* Set the appropriate enables */ + xscom_read(gcid, nest_stack + XPEC_NEST_STK_BAR_EN, &val); + val |= bar_en; + xscom_write(gcid, nest_stack + XPEC_NEST_STK_BAR_EN, val); + + /* No MMIO windows ? Barf ! 
*/ + if (mmio_win_sz == 0) { + prerror("PHB[%d:%d] No MMIO windows enabled !\n", gcid, phb_num); + return; + } + + /* Clear errors in PFIR and NFIR */ + xscom_write(gcid, pci_stack + XPEC_PCI_STK_PCI_FIR, 0); + xscom_write(gcid, nest_stack + XPEC_NEST_STK_PCI_NFIR, 0); + + /* Check ETU reset */ + xscom_read(gcid, pci_stack + XPEC_PCI_STK_ETU_RESET, &val); + prlog_once(PR_DEBUG, "ETU reset: %llx\n", val); + xscom_write(gcid, pci_stack + XPEC_PCI_STK_ETU_RESET, 0); + time_wait_ms(1); + + // show we can read phb mmio space + foo = (void *)(phb_bar + 0x800); // phb version register + prlog_once(PR_DEBUG, "Version reg: 0x%016llx\n", in_be64(foo)); + + /* Create PHB node */ + np = dt_new_addr(dt_root, "pciex", phb_bar); + if (!np) + return; + + if (is_phb5()) + dt_add_property_strings(np, "compatible", "ibm,power10-pciex", "ibm,ioda3-phb"); + else + dt_add_property_strings(np, "compatible", "ibm,power9-pciex", "ibm,ioda3-phb"); + dt_add_property_strings(np, "device_type", "pciex"); + dt_add_property_u64s(np, "reg", + phb_bar, 0x1000, + irq_bar, 0x10000000); + + /* Everything else is handled later by skiboot, we just + * stick a few hints here + */ + dt_add_property_cells(np, "ibm,xscom-bases", + nest_base, nest_stack, pci_base, pci_stack, etu_base); + dt_add_property(np, "ibm,mmio-windows", mmio_win, 8 * mmio_win_sz); + dt_add_property_cells(np, "ibm,phb-index", phb_num); + dt_add_property_cells(np, "ibm,phb-pec-index", pec_index); + dt_add_property_cells(np, "ibm,phb-stack", stk_node->phandle); + dt_add_property_cells(np, "ibm,phb-stack-index", stk_index); + dt_add_property_cells(np, "ibm,chip-id", gcid); + + /* read the hub-id out of the pbcq node */ + if (dt_has_node_property(stk_node->parent, "ibm,hub-id", NULL)) { + uint32_t hub_id; + + hub_id = dt_prop_get_u32(stk_node->parent, "ibm,hub-id"); + dt_add_property_cells(np, "ibm,hub-id", hub_id); + } + + if (dt_has_node_property(stk_node->parent, "ibm,loc-code", NULL)) { + const char *lc = dt_prop_get(stk_node->parent, "ibm,loc-code"); + dt_add_property_string(np, "ibm,loc-code", lc); + } + if (dt_has_node_property(stk_node, "ibm,lane-eq", NULL)) { + size_t leq_size; + const void *leq = dt_prop_get_def_size(stk_node, "ibm,lane-eq", + NULL, &leq_size); + if (leq != NULL && leq_size >= 6 * 8) + dt_add_property(np, "ibm,lane-eq", leq, leq_size); + } + if (dt_has_node_property(stk_node, "ibm,capp-ucode", NULL)) { + capp_ucode_base = dt_prop_get_u32(stk_node, "ibm,capp-ucode"); + dt_add_property_cells(np, "ibm,capp-ucode", capp_ucode_base); + } + if (dt_has_node_property(stk_node, "ibm,max-link-speed", NULL)) { + max_link_speed = dt_prop_get_u32(stk_node, "ibm,max-link-speed"); + dt_add_property_cells(np, "ibm,max-link-speed", max_link_speed); + } + dt_add_property_cells(np, "ibm,capi-flags", + OPAL_PHB_CAPI_FLAG_SNOOP_CONTROL); + + add_chip_dev_associativity(np); +} + +static void phb4_probe_pbcq(struct dt_node *pbcq) +{ + uint32_t nest_base, pci_base, pec_index; + struct dt_node *stk; + + /* REMOVEME: force this for now until we stabalise PCIe */ + verbose_eeh = 1; + + nest_base = dt_get_address(pbcq, 0, NULL); + pci_base = dt_get_address(pbcq, 1, NULL); + pec_index = dt_prop_get_u32(pbcq, "ibm,pec-index"); + + dt_for_each_child(pbcq, stk) { + if (dt_node_is_enabled(stk)) + phb4_probe_stack(stk, pec_index, nest_base, pci_base); + } +} + +void probe_phb4(void) +{ + struct dt_node *np; + const char *s; + + pci_eeh_mmio = !nvram_query_eq_dangerous("pci-eeh-mmio", "disabled"); + pci_retry_all = nvram_query_eq_dangerous("pci-retry-all", "true"); + s = 
nvram_query_dangerous("phb-rx-err-max"); + if (s) { + rx_err_max = atoi(s); + + /* Clip to uint8_t used by hardware */ + rx_err_max = MAX(rx_err_max, 0); + rx_err_max = MIN(rx_err_max, 255); + } + + if (is_phb5()) { + prlog(PR_DEBUG, "PHB5: Maximum RX errors during training: %d\n", rx_err_max); + /* Look for PBCQ XSCOM nodes */ + dt_for_each_compatible(dt_root, np, "ibm,power10-pbcq") + phb4_probe_pbcq(np); + + /* Look for newly created PHB nodes */ + dt_for_each_compatible(dt_root, np, "ibm,power10-pciex") + phb4_create(np); + } else { + prlog(PR_DEBUG, "PHB4: Maximum RX errors during training: %d\n", rx_err_max); + /* Look for PBCQ XSCOM nodes */ + dt_for_each_compatible(dt_root, np, "ibm,power9-pbcq") + phb4_probe_pbcq(np); + + /* Look for newly created PHB nodes */ + dt_for_each_compatible(dt_root, np, "ibm,power9-pciex") + phb4_create(np); + } +} diff --git a/roms/skiboot/hw/phys-map.c b/roms/skiboot/hw/phys-map.c new file mode 100644 index 000000000..d6ff99fd8 --- /dev/null +++ b/roms/skiboot/hw/phys-map.c @@ -0,0 +1,445 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* + * Physical memory map + * + * Copyright 2017-2019 IBM Corp. + */ + +#include <phys-map.h> +#include <chip.h> +#include <skiboot.h> +#include <opal-api.h> +#include <stack.h> +#include <inttypes.h> + +struct phys_map_entry { + enum phys_map_type type; + int index; + uint64_t addr; + uint64_t size; +}; + +struct phys_map_info { + int chip_select_shift; + const struct phys_map_entry *table; +}; + +static const struct phys_map_info *phys_map; + +static const struct phys_map_entry phys_map_table_p10[] = { + /* System memory upto 4TB minus GPU memory */ + { SYSTEM_MEM, 0, 0x0000000000000000ull, 0x0000034000000000ull }, + + /* TODO: Figure out GPU memory */ + + /* 0 TB offset @ MMIO 0x0006000000000000ull */ + { PHB5_64BIT_MMIO, 0, 0x0006000000000000ull, 0x0000004000000000ull }, + { PHB5_64BIT_MMIO, 1, 0x0006004000000000ull, 0x0000004000000000ull }, + { PHB5_64BIT_MMIO, 2, 0x0006008000000000ull, 0x0000004000000000ull }, + { PHB5_32BIT_MMIO, 0, 0x000600c000000000ull, 0x0000000080000000ull }, + { PHB5_32BIT_MMIO, 1, 0x000600c080000000ull, 0x0000000080000000ull }, + { PHB5_32BIT_MMIO, 2, 0x000600c100000000ull, 0x0000000080000000ull }, + { PHB5_32BIT_MMIO, 3, 0x000600c180000000ull, 0x0000000080000000ull }, + { PHB5_32BIT_MMIO, 4, 0x000600c200000000ull, 0x0000000080000000ull }, + { PHB5_32BIT_MMIO, 5, 0x000600c280000000ull, 0x0000000080000000ull }, + { PHB5_XIVE_ESB , 0, 0x000600c300000000ull, 0x0000000020000000ull }, + { PHB5_XIVE_ESB , 1, 0x000600c320000000ull, 0x0000000020000000ull }, + { PHB5_XIVE_ESB , 2, 0x000600c340000000ull, 0x0000000020000000ull }, + { PHB5_XIVE_ESB , 3, 0x000600c360000000ull, 0x0000000020000000ull }, + { PHB5_XIVE_ESB , 4, 0x000600c380000000ull, 0x0000000020000000ull }, + { PHB5_XIVE_ESB , 5, 0x000600c3a0000000ull, 0x0000000020000000ull }, + { PHB5_REG_SPC , 0, 0x000600c3c0000000ull, 0x0000000000100000ull }, + { PHB5_REG_SPC , 1, 0x000600c3c0100000ull, 0x0000000000100000ull }, + { PHB5_REG_SPC , 2, 0x000600c3c0200000ull, 0x0000000000100000ull }, + { PHB5_REG_SPC , 3, 0x000600c3c0300000ull, 0x0000000000100000ull }, + { PHB5_REG_SPC , 4, 0x000600c3c0400000ull, 0x0000000000100000ull }, + { PHB5_REG_SPC , 5, 0x000600c3c0500000ull, 0x0000000000100000ull }, + { RESV , 0, 0x000600c3c0600000ull, 0x0000003c3fa00000ull }, + + /* 1 TB offset */ + { RESV , 1, 0x0006010000000000ull, 0x0000010000000000ull }, + + /* 2 TB offset */ + { PHB5_64BIT_MMIO, 3, 0x0006020000000000ull, 0x0000004000000000ull 
}, + { PHB5_64BIT_MMIO, 4, 0x0006024000000000ull, 0x0000004000000000ull }, + { PHB5_64BIT_MMIO, 5, 0x0006028000000000ull, 0x0000004000000000ull }, + { RESV , 2, 0x000602c000000000ull, 0x0000004000000000ull }, + + /* 3 TB offset */ + { LPC_BUS , 0, 0x0006030000000000ull, 0x0000000100000000ull }, + { FSP_MMIO , 0, 0x0006030100000000ull, 0x0000000100000000ull }, + { XIVE_IC , 0, 0x0006030200000000ull, 0x0000000002000000ull }, + { PSIHB_ESB , 0, 0x0006030202000000ull, 0x0000000000100000ull }, + { RESV , 3, 0x0006030202100000ull, 0x0000000000f00000ull }, + { PSIHB_REG , 0, 0x0006030203000000ull, 0x0000000000100000ull }, + { RESV , 4, 0x0006030203100000ull, 0x0000000000080000ull }, + { XIVE_TM , 0, 0x0006030203180000ull, 0x0000000000040000ull }, + { RESV , 5, 0x00060302031c0000ull, 0x0000000000010000ull }, + { NX_RNG , 0, 0x00060302031d0000ull, 0x0000000000010000ull }, + { RESV , 6, 0x00060302031e0000ull, 0x0000000004e20000ull }, + { XIVE_NVC , 0, 0x0006030208000000ull, 0x0000000008000000ull }, + { RESV , 7, 0x0006030210000000ull, 0x00000000ee000000ull }, + { VAS_HYP_WIN , 0, 0x00060302fe000000ull, 0x0000000002000000ull }, + { VAS_USER_WIN , 0, 0x0006030300000000ull, 0x0000000100000000ull }, + + /* TODO: MC, OCMB, PAU */ + { RESV , 8, 0x0006030400000000ull, 0x000000f800000000ull }, + { XSCOM , 0, 0x000603fc00000000ull, 0x0000000400000000ull }, + + /* 4 TB offset */ + { XIVE_NVPG , 0, 0x0006040000000000ull, 0x0000010000000000ull }, + + /* 5 - 7 TB offset */ + /* for P10 the END and ESB regions are separate in the MMIO + * table */ + { XIVE_ESB , 0, 0x0006050000000000ull, 0x0000010000000000ull }, + { XIVE_END , 0, 0x0006060000000000ull, 0x0000020000000000ull }, + + /* 8 - 13 TB offset */ + { RESV , 9, 0x0006080000000000ull, 0x0000060000000000ull }, + + /* 14 TB offset */ + { RESV ,10, 0x00060e0000000000ull, 0x0000008000000000ull }, + + { NULL_MAP, 0, 0, 0 }, +}; + +static const struct phys_map_entry phys_map_table_nimbus[] = { + + /* System memory upto 4TB minus GPU memory */ + { SYSTEM_MEM, 0, 0x0000000000000000ull, 0x0000034000000000ull }, + /* GPU memory from 4TB - 128GB*GPU */ + { GPU_MEM_4T_DOWN, 5, 0x0000034000000000ull, 0x0000002000000000ull }, + { GPU_MEM_4T_DOWN, 4, 0x0000036000000000ull, 0x0000002000000000ull }, + { GPU_MEM_4T_DOWN, 3, 0x0000038000000000ull, 0x0000002000000000ull }, + { GPU_MEM_4T_DOWN, 2, 0x000003a000000000ull, 0x0000002000000000ull }, + { GPU_MEM_4T_DOWN, 1, 0x000003c000000000ull, 0x0000002000000000ull }, + { GPU_MEM_4T_DOWN, 0, 0x000003e000000000ull, 0x0000002000000000ull }, + /* GPU memory from 4TB + 128GB*GPU. 4 GPUs only */ + { GPU_MEM_4T_UP, 0, 0x0000040000000000ull, 0x0000002000000000ull }, + { GPU_MEM_4T_UP, 1, 0x0000042000000000ull, 0x0000002000000000ull }, + { GPU_MEM_4T_UP, 2, 0x0000044000000000ull, 0x0000002000000000ull }, + { GPU_MEM_4T_UP, 3, 0x0000046000000000ull, 0x0000002000000000ull }, + + /* + * OpenCAPI LPC Memory + * + * With chip address extension enabled, we allocate 4TB ranges + * (in the second non-mirrored region) for each OpenCAPI link + * by varying the upper 2 bits of the group ID. + * + * We don't currently support >4TB ranges. 
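+ *
+ * For illustration only (the numbers can be read straight off the four
+ * OCAPI_MEM entries that follow), the base of each link works out to
+ *
+ *   base = 0x0002000000000000ull + ((uint64_t)link << 47);
+ *
+ * i.e. 0x0002000000000000, 0x0002800000000000, 0x0003000000000000 and
+ * 0x0003800000000000 for links 0 to 3, each covering a 4TB range.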
+ */ + { OCAPI_MEM, 0, 0x0002000000000000ull, 0x0000040000000000ull }, + { OCAPI_MEM, 1, 0x0002800000000000ull, 0x0000040000000000ull }, + { OCAPI_MEM, 2, 0x0003000000000000ull, 0x0000040000000000ull }, + { OCAPI_MEM, 3, 0x0003800000000000ull, 0x0000040000000000ull }, + + /* 0 TB offset @ MMIO 0x0006000000000000ull */ + { PHB4_64BIT_MMIO, 0, 0x0006000000000000ull, 0x0000004000000000ull }, + { PHB4_64BIT_MMIO, 1, 0x0006004000000000ull, 0x0000004000000000ull }, + { PHB4_64BIT_MMIO, 2, 0x0006008000000000ull, 0x0000004000000000ull }, + { PHB4_32BIT_MMIO, 0, 0x000600c000000000ull, 0x0000000080000000ull }, + { PHB4_32BIT_MMIO, 1, 0x000600c080000000ull, 0x0000000080000000ull }, + { PHB4_32BIT_MMIO, 2, 0x000600c100000000ull, 0x0000000080000000ull }, + { PHB4_32BIT_MMIO, 3, 0x000600c180000000ull, 0x0000000080000000ull }, + { PHB4_32BIT_MMIO, 4, 0x000600c200000000ull, 0x0000000080000000ull }, + { PHB4_32BIT_MMIO, 5, 0x000600c280000000ull, 0x0000000080000000ull }, + { PHB4_XIVE_ESB , 0, 0x000600c300000000ull, 0x0000000020000000ull }, + { PHB4_XIVE_ESB , 1, 0x000600c320000000ull, 0x0000000020000000ull }, + { PHB4_XIVE_ESB , 2, 0x000600c340000000ull, 0x0000000020000000ull }, + { PHB4_XIVE_ESB , 3, 0x000600c360000000ull, 0x0000000020000000ull }, + { PHB4_XIVE_ESB , 4, 0x000600c380000000ull, 0x0000000020000000ull }, + { PHB4_XIVE_ESB , 5, 0x000600c3a0000000ull, 0x0000000020000000ull }, + { PHB4_REG_SPC , 0, 0x000600c3c0000000ull, 0x0000000000100000ull }, + { PHB4_REG_SPC , 1, 0x000600c3c0100000ull, 0x0000000000100000ull }, + { PHB4_REG_SPC , 2, 0x000600c3c0200000ull, 0x0000000000100000ull }, + { PHB4_REG_SPC , 3, 0x000600c3c0300000ull, 0x0000000000100000ull }, + { PHB4_REG_SPC , 4, 0x000600c3c0400000ull, 0x0000000000100000ull }, + { PHB4_REG_SPC , 5, 0x000600c3c0500000ull, 0x0000000000100000ull }, + { RESV , 0, 0x000600c3c0600000ull, 0x0000000c3fa00000ull }, + { NPU_OCAPI_MMIO , 0, 0x000600d000000000ull, 0x0000000800000000ull }, + { NPU_OCAPI_MMIO , 1, 0x000600d800000000ull, 0x0000000800000000ull }, + { NPU_OCAPI_MMIO , 2, 0x000600e000000000ull, 0x0000000800000000ull }, + { NPU_OCAPI_MMIO , 3, 0x000600e800000000ull, 0x0000000800000000ull }, + { NPU_OCAPI_MMIO , 4, 0x000600f000000000ull, 0x0000000800000000ull }, + { NPU_OCAPI_MMIO , 5, 0x000600f800000000ull, 0x0000000800000000ull }, + + /* 1 TB offset @ MMIO 0x0006000000000000ull */ + { XIVE_VC , 0, 0x0006010000000000ull, 0x0000008000000000ull }, + { XIVE_PC , 0, 0x0006018000000000ull, 0x0000001000000000ull }, + { VAS_USER_WIN , 0, 0x0006019000000000ull, 0x0000000100000000ull }, + { VAS_HYP_WIN , 0, 0x0006019100000000ull, 0x0000000002000000ull }, + { RESV , 1, 0x0006019102000000ull, 0x000000001e000000ull }, + { OCAB_XIVE_ESB , 0, 0x0006019120000000ull, 0x0000000020000000ull }, + { RESV , 3, 0x0006019140000000ull, 0x0000006ec0000000ull }, + + /* 2 TB offset @ MMIO 0x0006000000000000ull */ + { PHB4_64BIT_MMIO, 3, 0x0006020000000000ull, 0x0000004000000000ull }, + { PHB4_64BIT_MMIO, 4, 0x0006024000000000ull, 0x0000004000000000ull }, + { PHB4_64BIT_MMIO, 5, 0x0006028000000000ull, 0x0000004000000000ull }, + { RESV , 4, 0x000602c000000000ull, 0x0000004000000000ull }, + + /* 3 TB offset @ MMIO 0x0006000000000000ull */ + { LPC_BUS , 0, 0x0006030000000000ull, 0x0000000100000000ull }, + { FSP_MMIO , 0, 0x0006030100000000ull, 0x0000000100000000ull }, + { NPU_REGS , 0, 0x0006030200000000ull, 0x0000000001000000ull }, + { NPU_USR , 0, 0x0006030201000000ull, 0x0000000000200000ull }, + { NPU_PHY , 0, 0x0006030201200000ull, 0x0000000000200000ull }, + { NPU_PHY , 1, 
0x0006030201400000ull, 0x0000000000200000ull }, + { NPU_NTL , 0, 0x0006030201600000ull, 0x0000000000020000ull }, + { NPU_NTL , 1, 0x0006030201620000ull, 0x0000000000020000ull }, + { NPU_NTL , 2, 0x0006030201640000ull, 0x0000000000020000ull }, + { NPU_NTL , 3, 0x0006030201660000ull, 0x0000000000020000ull }, + { NPU_NTL , 4, 0x0006030201680000ull, 0x0000000000020000ull }, + { NPU_NTL , 5, 0x00060302016a0000ull, 0x0000000000020000ull }, + { NPU_GENID , 0, 0x00060302016c0000ull, 0x0000000000020000ull }, + { NPU_GENID , 1, 0x00060302016e0000ull, 0x0000000000020000ull }, + { NPU_GENID , 2, 0x0006030201700000ull, 0x0000000000020000ull }, + { RESV , 5, 0x0006030201720000ull, 0x00000000018e0000ull }, + { PSIHB_REG , 0, 0x0006030203000000ull, 0x0000000000100000ull }, + { XIVE_IC , 0, 0x0006030203100000ull, 0x0000000000080000ull }, + { XIVE_TM , 0, 0x0006030203180000ull, 0x0000000000040000ull }, + { PSIHB_ESB , 0, 0x00060302031c0000ull, 0x0000000000010000ull }, + { NX_RNG , 0, 0x00060302031d0000ull, 0x0000000000010000ull }, + { RESV , 6, 0x00060302031e0000ull, 0x000000001ce20000ull }, + { CENTAUR_SCOM , 0, 0x0006030220000000ull, 0x0000000020000000ull }, + { RESV , 7, 0x0006030240000000ull, 0x000000f9c0000000ull }, + { XSCOM , 0, 0x000603fc00000000ull, 0x0000000400000000ull }, + + /* NULL entry at end */ + { NULL_MAP, 0, 0, 0 }, +}; + +static const struct phys_map_info phys_map_nimbus = { + .chip_select_shift = 42, + .table = phys_map_table_nimbus, +}; + +static const struct phys_map_entry phys_map_table_axone[] = { + + /* System memory up to 4TB minus GPU memory */ + { SYSTEM_MEM, 0, 0x0000000000000000ull, 0x0000034000000000ull }, + /* GPU memory from 4TB - 128GB*GPU */ + { GPU_MEM_4T_DOWN, 5, 0x0000034000000000ull, 0x0000002000000000ull }, + { GPU_MEM_4T_DOWN, 4, 0x0000036000000000ull, 0x0000002000000000ull }, + { GPU_MEM_4T_DOWN, 3, 0x0000038000000000ull, 0x0000002000000000ull }, + { GPU_MEM_4T_DOWN, 2, 0x000003a000000000ull, 0x0000002000000000ull }, + { GPU_MEM_4T_DOWN, 1, 0x000003c000000000ull, 0x0000002000000000ull }, + { GPU_MEM_4T_DOWN, 0, 0x000003e000000000ull, 0x0000002000000000ull }, + + /* 0 TB offset @ MMIO 0x0006000000000000ull */ + { PHB4_64BIT_MMIO, 0, 0x0006000000000000ull, 0x0000004000000000ull }, + { PHB4_64BIT_MMIO, 1, 0x0006004000000000ull, 0x0000004000000000ull }, + { PHB4_64BIT_MMIO, 2, 0x0006008000000000ull, 0x0000004000000000ull }, + { PHB4_32BIT_MMIO, 0, 0x000600c000000000ull, 0x0000000080000000ull }, + { PHB4_32BIT_MMIO, 1, 0x000600c080000000ull, 0x0000000080000000ull }, + { PHB4_32BIT_MMIO, 2, 0x000600c100000000ull, 0x0000000080000000ull }, + { PHB4_32BIT_MMIO, 3, 0x000600c180000000ull, 0x0000000080000000ull }, + { PHB4_32BIT_MMIO, 4, 0x000600c200000000ull, 0x0000000080000000ull }, + { PHB4_32BIT_MMIO, 5, 0x000600c280000000ull, 0x0000000080000000ull }, + { PHB4_XIVE_ESB, 0, 0x000600c300000000ull, 0x0000000020000000ull }, + { PHB4_XIVE_ESB, 1, 0x000600c320000000ull, 0x0000000020000000ull }, + { PHB4_XIVE_ESB, 2, 0x000600c340000000ull, 0x0000000020000000ull }, + { PHB4_XIVE_ESB, 3, 0x000600c360000000ull, 0x0000000020000000ull }, + { PHB4_XIVE_ESB, 4, 0x000600c380000000ull, 0x0000000020000000ull }, + { PHB4_XIVE_ESB, 5, 0x000600c3a0000000ull, 0x0000000020000000ull }, + { PHB4_REG_SPC, 0, 0x000600c3c0000000ull, 0x0000000000100000ull }, + { PHB4_REG_SPC, 1, 0x000600c3c0100000ull, 0x0000000000100000ull }, + { PHB4_REG_SPC, 2, 0x000600c3c0200000ull, 0x0000000000100000ull }, + { PHB4_REG_SPC, 3, 0x000600c3c0300000ull, 0x0000000000100000ull }, + { PHB4_REG_SPC, 4, 
0x000600c3c0400000ull, 0x0000000000100000ull }, + { PHB4_REG_SPC, 5, 0x000600c3c0500000ull, 0x0000000000100000ull }, + { RESV, 0, 0x000600c3c0600000ull, 0x0000000c3fa00000ull }, + { NPU_OCAPI_MMIO, 0, 0x000600d000000000ull, 0x0000000800000000ull }, + { NPU_OCAPI_MMIO, 1, 0x000600d800000000ull, 0x0000000800000000ull }, + { NPU_OCAPI_MMIO, 2, 0x000600e000000000ull, 0x0000000800000000ull }, + { NPU_OCAPI_MMIO, 3, 0x000600e800000000ull, 0x0000000800000000ull }, + { NPU_OCAPI_MMIO, 4, 0x000600f000000000ull, 0x0000000800000000ull }, + { NPU_OCAPI_MMIO, 5, 0x000600f800000000ull, 0x0000000800000000ull }, + + /* 1 TB offset @ MMIO 0x0006000000000000ull */ + { XIVE_VC, 0, 0x0006010000000000ull, 0x0000008000000000ull }, + { XIVE_PC, 0, 0x0006018000000000ull, 0x0000004000000000ull }, + { VAS_USER_WIN, 0, 0x000601c000000000ull, 0x0000000100000000ull }, + { VAS_HYP_WIN, 0, 0x000601c100000000ull, 0x0000000002000000ull }, + { RESV, 1, 0x000601c102000000ull, 0x0000003efe000000ull }, + + /* 2 TB offset @ MMIO 0x0006000000000000ull */ + { PHB4_64BIT_MMIO, 3, 0x0006020000000000ull, 0x0000004000000000ull }, + { PHB4_64BIT_MMIO, 4, 0x0006024000000000ull, 0x0000004000000000ull }, + { PHB4_64BIT_MMIO, 5, 0x0006028000000000ull, 0x0000004000000000ull }, + { RESV, 2, 0x000602c000000000ull, 0x0000004000000000ull }, + + /* 3 TB offset @ MMIO 0x0006000000000000ull */ + { LPC_BUS, 0, 0x0006030000000000ull, 0x0000000100000000ull }, + { FSP_MMIO, 0, 0x0006030100000000ull, 0x0000000100000000ull }, + { RESV, 3, 0x0006030200000000ull, 0x0000000003000000ull }, + { PSIHB_REG, 0, 0x0006030203000000ull, 0x0000000000100000ull }, + { XIVE_IC, 0, 0x0006030203100000ull, 0x0000000000080000ull }, + { XIVE_TM, 0, 0x0006030203180000ull, 0x0000000000040000ull }, + { PSIHB_ESB, 0, 0x00060302031c0000ull, 0x0000000000010000ull }, + { NX_RNG, 0, 0x00060302031d0000ull, 0x0000000000010000ull }, + { RESV, 4, 0x00060302031e0000ull, 0x00000001fce20000ull }, + { MC_OCMB_CFG, 0, 0x0006030400000000ull, 0x0000000080000000ull }, + { MC_OCMB_CFG, 1, 0x0006030480000000ull, 0x0000000080000000ull }, + { MC_OCMB_MMIO, 0, 0x0006030500000000ull, 0x0000000080000000ull }, + { MC_OCMB_MMIO, 1, 0x0006030580000000ull, 0x0000000080000000ull }, + { MC_OCMB_CFG, 2, 0x0006030600000000ull, 0x0000000080000000ull }, + { MC_OCMB_CFG, 3, 0x0006030680000000ull, 0x0000000080000000ull }, + { MC_OCMB_MMIO, 2, 0x0006030700000000ull, 0x0000000080000000ull }, + { MC_OCMB_MMIO, 3, 0x0006030780000000ull, 0x0000000080000000ull }, + { MC_OCMB_CFG, 4, 0x0006030800000000ull, 0x0000000080000000ull }, + { MC_OCMB_CFG, 5, 0x0006030880000000ull, 0x0000000080000000ull }, + { MC_OCMB_MMIO, 4, 0x0006030900000000ull, 0x0000000080000000ull }, + { MC_OCMB_MMIO, 5, 0x0006030980000000ull, 0x0000000080000000ull }, + { MC_OCMB_CFG, 6, 0x0006030a00000000ull, 0x0000000080000000ull }, + { MC_OCMB_CFG, 7, 0x0006030a80000000ull, 0x0000000080000000ull }, + { MC_OCMB_MMIO, 6, 0x0006030b00000000ull, 0x0000000080000000ull }, + { MC_OCMB_MMIO, 7, 0x0006030b80000000ull, 0x0000000080000000ull }, + { MC_OCMB_CFG, 8, 0x0006030c00000000ull, 0x0000000080000000ull }, + { MC_OCMB_CFG, 9, 0x0006030c80000000ull, 0x0000000080000000ull }, + { MC_OCMB_MMIO, 8, 0x0006030d00000000ull, 0x0000000080000000ull }, + { MC_OCMB_MMIO, 9, 0x0006030d80000000ull, 0x0000000080000000ull }, + { MC_OCMB_CFG, 10, 0x0006030e00000000ull, 0x0000000080000000ull }, + { MC_OCMB_CFG, 11, 0x0006030e80000000ull, 0x0000000080000000ull }, + { MC_OCMB_MMIO, 10, 0x0006030f00000000ull, 0x0000000080000000ull }, + { MC_OCMB_MMIO, 11, 
0x0006030f80000000ull, 0x0000000080000000ull }, + { MC_OCMB_CFG, 12, 0x0006031000000000ull, 0x0000000080000000ull }, + { MC_OCMB_CFG, 13, 0x0006031080000000ull, 0x0000000080000000ull }, + { MC_OCMB_MMIO, 12, 0x0006031100000000ull, 0x0000000080000000ull }, + { MC_OCMB_MMIO, 13, 0x0006031180000000ull, 0x0000000080000000ull }, + { MC_OCMB_CFG, 14, 0x0006031200000000ull, 0x0000000080000000ull }, + { MC_OCMB_CFG, 15, 0x0006031280000000ull, 0x0000000080000000ull }, + { MC_OCMB_MMIO, 14, 0x0006031300000000ull, 0x0000000080000000ull }, + { MC_OCMB_MMIO, 15, 0x0006031380000000ull, 0x0000000080000000ull }, + { RESV, 5, 0x0006031400000000ull, 0x000000d800000000ull }, + { NPU_REGS, 0, 0x000603ec00000000ull, 0x0000000001000000ull }, + { NPU_REGS, 1, 0x000603ec01000000ull, 0x0000000001000000ull }, + { NPU_REGS, 2, 0x000603ec02000000ull, 0x0000000001000000ull }, + { NPU_NTL, 0, 0x000603ec03000000ull, 0x0000000000020000ull }, + { NPU_NTL, 1, 0x000603ec03020000ull, 0x0000000000020000ull }, + { NPU_NTL, 2, 0x000603ec03040000ull, 0x0000000000020000ull }, + { NPU_NTL, 3, 0x000603ec03060000ull, 0x0000000000020000ull }, + { NPU_GENID, 0, 0x000603ec03080000ull, 0x0000000000080000ull }, + { NPU_NTL, 4, 0x000603ec03100000ull, 0x0000000000020000ull }, + { NPU_NTL, 5, 0x000603ec03120000ull, 0x0000000000020000ull }, + { NPU_NTL, 6, 0x000603ec03140000ull, 0x0000000000020000ull }, + { NPU_NTL, 7, 0x000603ec03160000ull, 0x0000000000020000ull }, + { NPU_GENID, 1, 0x000603ec03180000ull, 0x0000000000080000ull }, + { NPU_NTL, 8, 0x000603ec03200000ull, 0x0000000000020000ull }, + { NPU_NTL, 9, 0x000603ec03220000ull, 0x0000000000020000ull }, + { NPU_NTL, 10, 0x000603ec03240000ull, 0x0000000000020000ull }, + { NPU_NTL, 11, 0x000603ec03260000ull, 0x0000000000020000ull }, + { NPU_GENID, 2, 0x000603ec03280000ull, 0x0000000000080000ull }, + { RESV, 6, 0x000603ec03300000ull, 0x0000000ffcd00000ull }, + { XSCOM, 0, 0x000603fc00000000ull, 0x0000000400000000ull }, + + /* NULL entry at end */ + { NULL_MAP, 0, 0, 0 }, +}; + +static const struct phys_map_info phys_map_axone = { + .chip_select_shift = 42, + .table = phys_map_table_axone, +}; + +static const struct phys_map_info phys_map_p10 = { + .chip_select_shift = 44, + .table = phys_map_table_p10, +}; + +static inline bool phys_map_entry_null(const struct phys_map_entry *e) +{ + if (e->type == NULL_MAP) + return true; + return false; +} + + +/* This crashes skiboot on error as any bad calls here are almost + * certainly a developer error + */ +void __phys_map_get(uint64_t topology_idx, uint64_t gcid, enum phys_map_type type, + int index, uint64_t *addr, uint64_t *size) { + const struct phys_map_entry *e; + uint64_t a; + + if (!phys_map) + goto error; + + /* Find entry in table */ + for (e = phys_map->table; ; e++) { + + /* End of table */ + if (phys_map_entry_null(e)) + goto error; + + /* Is this our entry? */ + if (e->type != type) + continue; + if (e->index != index) + continue; + + /* Found entry! 
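+ * The address returned below is the table entry's base offset by the
+ * caller's topology index:
+ *
+ *   a = e->addr + (topology_idx << chip_select_shift);
+ *
+ * Worked example (illustrative, using the nimbus table above where the
+ * shift is 42): phys_map_get(1, XSCOM, 0, &addr, &size) gives
+ * addr = 0x000603fc00000000 + (1ull << 42) = 0x000607fc00000000 and
+ * size = 0x0000000400000000.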
*/ + break; + } + a = e->addr; + a += topology_idx << (phys_map->chip_select_shift); + + if (addr) + *addr = a; + if (size) + *size = e->size; + + prlog(PR_TRACE, "Assigning BAR [%"PRIx64"] type:%02i index:%x " + "0x%016"PRIx64" for 0x%016"PRIx64"\n", + gcid, type, index, a, e->size); + + return; + +error: + /* Something has gone really wrong */ + prlog(PR_EMERG, "ERROR: Failed to lookup BAR type:%i index:%i\n", + type, index); + assert(0); +} + +void phys_map_get(uint64_t gcid, enum phys_map_type type, + int index, uint64_t *addr, uint64_t *size) +{ + struct proc_chip *chip; + uint64_t topology_idx = gcid; + + if (proc_gen >= proc_gen_p10) { + chip = get_chip(gcid); + topology_idx = chip->primary_topology; + } + + return __phys_map_get(topology_idx, gcid, type, index, addr, size); +} + +void phys_map_init(unsigned long pvr) +{ + const char *name = "unused"; + + phys_map = NULL; + + if (proc_gen == proc_gen_p9) { + switch(PVR_TYPE(pvr)) { + case PVR_TYPE_P9P: + name = "axone"; + phys_map = &phys_map_axone; + break; + default: + name = "nimbus"; + phys_map = &phys_map_nimbus; + } + } else if (proc_gen == proc_gen_p10) { + name = "p10"; + phys_map = &phys_map_p10; + } + + prlog(PR_DEBUG, "Assigning physical memory map table for %s\n", name); + +} diff --git a/roms/skiboot/hw/prd.c b/roms/skiboot/hw/prd.c new file mode 100644 index 000000000..45d765457 --- /dev/null +++ b/roms/skiboot/hw/prd.c @@ -0,0 +1,789 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* + * PRD: Processor Runtime Diagnostics + * + * Copyright 2014-2019 IBM Corp. + */ + +#include <skiboot.h> +#include <opal.h> +#include <lock.h> +#include <xscom.h> +#include <chip.h> +#include <opal-msg.h> +#include <fsp.h> +#include <mem_region.h> +#include <prd-fw-msg.h> +#include <hostservices.h> + +enum events { + EVENT_ATTN = 1 << 0, + EVENT_OCC_ERROR = 1 << 1, + EVENT_OCC_RESET = 1 << 2, + EVENT_SBE_PASSTHROUGH = 1 << 3, + EVENT_FSP_OCC_RESET = 1 << 4, + EVENT_FSP_OCC_LOAD_START = 1 << 5, +}; + +static uint8_t events[MAX_CHIPS]; +static uint64_t ipoll_status[MAX_CHIPS]; +static uint8_t _prd_msg_buf[sizeof(struct opal_prd_msg) + + sizeof(struct prd_fw_msg)]; +static struct opal_prd_msg *prd_msg = (struct opal_prd_msg *)&_prd_msg_buf; +static struct opal_prd_msg *prd_msg_fsp_req; +static struct opal_prd_msg *prd_msg_fsp_notify; +static bool prd_msg_inuse, prd_active; +static struct dt_node *prd_node; +static bool prd_enabled = false; + +/* Locking: + * + * The events lock serialises access to the events, ipoll_status, + * prd_msg_inuse, and prd_active variables. + * + * The ipoll_lock protects against concurrent updates to the ipoll registers. + * + * The ipoll_lock may be acquired with events_lock held. This order must + * be preserved. 
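+ *
+ * In other words the only legal nesting is (as used by
+ * prd_psi_interrupt() below):
+ *
+ *   lock(&events_lock);
+ *   lock(&ipoll_lock);
+ *   ...
+ *   unlock(&ipoll_lock);
+ *   unlock(&events_lock);
+ *
+ * never the other way around.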
+ */ +static struct lock events_lock = LOCK_UNLOCKED; +static struct lock ipoll_lock = LOCK_UNLOCKED; + +static uint64_t prd_ipoll_mask_reg; +static uint64_t prd_ipoll_status_reg; +static uint64_t prd_ipoll_mask; + +/* PRD registers */ +#define PRD_P8_IPOLL_REG_MASK 0x01020013 +#define PRD_P8_IPOLL_REG_STATUS 0x01020014 +#define PRD_P8_IPOLL_XSTOP PPC_BIT(0) /* Xstop for host/core/millicode */ +#define PRD_P8_IPOLL_RECOV PPC_BIT(1) /* Recoverable */ +#define PRD_P8_IPOLL_SPEC_ATTN PPC_BIT(2) /* Special attention */ +#define PRD_P8_IPOLL_HOST_ATTN PPC_BIT(3) /* Host attention */ +#define PRD_P8_IPOLL_MASK PPC_BITMASK(0, 3) + +#define PRD_P9_IPOLL_REG_MASK 0x000F0033 +#define PRD_P9_IPOLL_REG_STATUS 0x000F0034 +#define PRD_P9_IPOLL_XSTOP PPC_BIT(0) /* Xstop for host/core/millicode */ +#define PRD_P9_IPOLL_RECOV PPC_BIT(1) /* Recoverable */ +#define PRD_P9_IPOLL_SPEC_ATTN PPC_BIT(2) /* Special attention */ +#define PRD_P9_IPOLL_UNIT_CS PPC_BIT(3) /* Unit Xstop */ +#define PRD_P9_IPOLL_HOST_ATTN PPC_BIT(4) /* Host attention */ +#define PRD_P9_IPOLL_MASK_INTR PPC_BIT(5) /* Host interrupt */ +#define PRD_P9_IPOLL_MASK PPC_BITMASK(0, 5) + +static void send_next_pending_event(void); + +static void prd_msg_consumed(void *data, int status) +{ + struct opal_prd_msg *msg = data; + uint32_t proc; + int notify_status = OPAL_SUCCESS; + uint8_t event = 0; + + lock(&events_lock); + switch (msg->hdr.type) { + case OPAL_PRD_MSG_TYPE_ATTN: + proc = be64_to_cpu(msg->attn.proc); + + /* If other ipoll events have been received in the time + * between prd_msg creation and consumption, we'll need to + * raise a separate ATTN message for those. So, we only + * clear the event if we don't have any further ipoll_status + * bits. + */ + ipoll_status[proc] &= ~be64_to_cpu(msg->attn.ipoll_status); + if (!ipoll_status[proc]) + event = EVENT_ATTN; + + break; + case OPAL_PRD_MSG_TYPE_OCC_ERROR: + proc = be64_to_cpu(msg->occ_error.chip); + event = EVENT_OCC_ERROR; + break; + case OPAL_PRD_MSG_TYPE_OCC_RESET: + proc = be64_to_cpu(msg->occ_reset.chip); + event = EVENT_OCC_RESET; + break; + case OPAL_PRD_MSG_TYPE_FIRMWARE_RESPONSE: + if (prd_msg_fsp_req) { + free(prd_msg_fsp_req); + prd_msg_fsp_req = NULL; + } + break; + case OPAL_PRD_MSG_TYPE_FIRMWARE_NOTIFY: + if (prd_msg_fsp_notify) { + free(prd_msg_fsp_notify); + prd_msg_fsp_notify = NULL; + } + if (status != 0) { + prlog(PR_DEBUG, + "PRD: Failed to send FSP -> HBRT message\n"); + notify_status = FSP_STATUS_GENERIC_ERROR; + } + if (platform.prd && platform.prd->msg_response) + platform.prd->msg_response(notify_status); + break; + case OPAL_PRD_MSG_TYPE_SBE_PASSTHROUGH: + proc = be64_to_cpu(msg->sbe_passthrough.chip); + event = EVENT_SBE_PASSTHROUGH; + break; + case OPAL_PRD_MSG_TYPE_FSP_OCC_RESET: + proc = be64_to_cpu(msg->occ_reset.chip); + event = EVENT_FSP_OCC_RESET; + break; + case OPAL_PRD_MSG_TYPE_FSP_OCC_LOAD_START: + proc = be64_to_cpu(msg->occ_reset.chip); + event = EVENT_FSP_OCC_LOAD_START; + break; + default: + prlog(PR_ERR, "PRD: invalid msg consumed, type: 0x%x\n", + msg->hdr.type); + } + + if (event) + events[proc] &= ~event; + prd_msg_inuse = false; + send_next_pending_event(); + unlock(&events_lock); +} + +/* + * OPAL_MSG_PRD interface can handle message size <= OPAL_MSG_FIXED_PARAMS_SIZE. + * But kernel prd driver had a bug where it will not copy partial data to user + * space. Use OPAL_MSG_PRD interface only if size is <= sizeof(opal_prg_msg). 
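+ *
+ * In practice opal_queue_prd_msg() below picks OPAL_MSG_PRD for messages
+ * of up to 0x20 bytes and OPAL_MSG_PRD2 for anything larger. Every
+ * caller in this file then follows the same pattern (sketch):
+ *
+ *   rc = opal_queue_prd_msg(msg);
+ *   if (!rc)
+ *           prd_msg_inuse = true;
+ *
+ * so the in-use flag is only set once the message really got queued.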
+ */ +static inline int opal_queue_prd_msg(struct opal_prd_msg *msg) +{ + enum opal_msg_type msg_type = OPAL_MSG_PRD2; + + if (be16_to_cpu(msg->hdr.size) <= 0x20) + msg_type = OPAL_MSG_PRD; + + return _opal_queue_msg(msg_type, msg, prd_msg_consumed, + be16_to_cpu(msg->hdr.size), msg); +} + +static int populate_ipoll_msg(struct opal_prd_msg *msg, uint32_t proc) +{ + uint64_t ipoll_mask; + int rc; + + lock(&ipoll_lock); + rc = xscom_read(proc, prd_ipoll_mask_reg, &ipoll_mask); + unlock(&ipoll_lock); + + if (rc) { + prlog(PR_ERR, "PRD: Unable to read ipoll status (chip %d)!\n", + proc); + return -1; + } + + msg->attn.proc = cpu_to_be64(proc); + msg->attn.ipoll_status = cpu_to_be64(ipoll_status[proc]); + msg->attn.ipoll_mask = cpu_to_be64(ipoll_mask); + return 0; +} + +static void send_next_pending_event(void) +{ + struct proc_chip *chip; + uint32_t proc; + int rc; + uint8_t event; + + assert(!prd_msg_inuse); + + if (!prd_active) + return; + + event = 0; + + for_each_chip(chip) { + proc = chip->id; + if (events[proc]) { + event = events[proc]; + break; + } + } + + if (!event) + return; + + prd_msg->token = 0; + prd_msg->hdr.size = cpu_to_be16(sizeof(*prd_msg)); + + if (event & EVENT_ATTN) { + prd_msg->hdr.type = OPAL_PRD_MSG_TYPE_ATTN; + populate_ipoll_msg(prd_msg, proc); + } else if (event & EVENT_OCC_ERROR) { + prd_msg->hdr.type = OPAL_PRD_MSG_TYPE_OCC_ERROR; + prd_msg->occ_error.chip = cpu_to_be64(proc); + } else if (event & EVENT_OCC_RESET) { + prd_msg->hdr.type = OPAL_PRD_MSG_TYPE_OCC_RESET; + prd_msg->occ_reset.chip = cpu_to_be64(proc); + occ_msg_queue_occ_reset(); + } else if (event & EVENT_SBE_PASSTHROUGH) { + prd_msg->hdr.type = OPAL_PRD_MSG_TYPE_SBE_PASSTHROUGH; + prd_msg->sbe_passthrough.chip = cpu_to_be64(proc); + } else if (event & EVENT_FSP_OCC_RESET) { + prd_msg->hdr.type = OPAL_PRD_MSG_TYPE_FSP_OCC_RESET; + prd_msg->occ_reset.chip = cpu_to_be64(proc); + } else if (event & EVENT_FSP_OCC_LOAD_START) { + prd_msg->hdr.type = OPAL_PRD_MSG_TYPE_FSP_OCC_LOAD_START; + prd_msg->occ_reset.chip = cpu_to_be64(proc); + } + + /* + * We always need to handle PSI interrupts, but if the is PRD is + * disabled then we shouldn't propagate PRD events to the host. 
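+ * The event itself is still latched in events[] by __prd_event(), and
+ * send_next_pending_event() already returned above if the daemon has not
+ * yet sent its INIT message (prd_active == false); the check below only
+ * stops a disabled PRD from queueing messages to the host.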
+ */ + if (prd_enabled) { + rc = opal_queue_prd_msg(prd_msg); + if (!rc) + prd_msg_inuse = true; + } +} + +static void __prd_event(uint32_t proc, uint8_t event) +{ + events[proc] |= event; + if (!prd_msg_inuse) + send_next_pending_event(); +} + +static void prd_event(uint32_t proc, uint8_t event) +{ + lock(&events_lock); + __prd_event(proc, event); + unlock(&events_lock); +} + +static int __ipoll_update_mask(uint32_t proc, bool set, uint64_t bits) +{ + uint64_t mask; + int rc; + + rc = xscom_read(proc, prd_ipoll_mask_reg, &mask); + if (rc) + return rc; + + if (set) + mask |= bits; + else + mask &= ~bits; + + return xscom_write(proc, prd_ipoll_mask_reg, mask); +} + +static int ipoll_record_and_mask_pending(uint32_t proc) +{ + uint64_t status; + int rc; + + lock(&ipoll_lock); + rc = xscom_read(proc, prd_ipoll_status_reg, &status); + status &= prd_ipoll_mask; + if (!rc) + __ipoll_update_mask(proc, true, status); + unlock(&ipoll_lock); + + if (!rc) + ipoll_status[proc] |= status; + + return rc; +} + +/* Entry point for interrupts */ +void prd_psi_interrupt(uint32_t proc) +{ + int rc; + + lock(&events_lock); + + rc = ipoll_record_and_mask_pending(proc); + if (rc) + prlog(PR_ERR, "PRD: Failed to update IPOLL mask\n"); + + __prd_event(proc, EVENT_ATTN); + + unlock(&events_lock); +} + +void prd_tmgt_interrupt(uint32_t proc) +{ + prd_event(proc, EVENT_OCC_ERROR); +} + +void prd_occ_reset(uint32_t proc) +{ + prd_event(proc, EVENT_OCC_RESET); +} + +void prd_fsp_occ_reset(uint32_t proc) +{ + prd_event(proc, EVENT_FSP_OCC_RESET); +} + +void prd_sbe_passthrough(uint32_t proc) +{ + prd_event(proc, EVENT_SBE_PASSTHROUGH); +} + +void prd_fsp_occ_load_start(uint32_t proc) +{ + prd_event(proc, EVENT_FSP_OCC_LOAD_START); +} + +void prd_fw_resp_fsp_response(int status) +{ + struct prd_fw_msg *fw_resp; + uint64_t fw_resp_len_old; + int rc; + uint16_t hdr_size; + + lock(&events_lock); + + /* In case of failure, return code is passed via generic_resp */ + if (status != 0) { + fw_resp = (struct prd_fw_msg *)prd_msg_fsp_req->fw_resp.data; + fw_resp->type = cpu_to_be64(PRD_FW_MSG_TYPE_RESP_GENERIC); + fw_resp->generic_resp.status = cpu_to_be64(status); + + fw_resp_len_old = be64_to_cpu(prd_msg_fsp_req->fw_resp.len); + prd_msg_fsp_req->fw_resp.len = cpu_to_be64(PRD_FW_MSG_BASE_SIZE + + sizeof(fw_resp->generic_resp)); + + /* Update prd message size */ + hdr_size = be16_to_cpu(prd_msg_fsp_req->hdr.size); + hdr_size -= fw_resp_len_old; + hdr_size += be64_to_cpu(prd_msg_fsp_req->fw_resp.len); + prd_msg_fsp_req->hdr.size = cpu_to_be16(hdr_size); + } + + rc = opal_queue_prd_msg(prd_msg_fsp_req); + if (!rc) + prd_msg_inuse = true; + unlock(&events_lock); +} + +int prd_hbrt_fsp_msg_notify(void *data, u32 dsize) +{ + struct prd_fw_msg *fw_notify; + int size, fw_notify_size; + int rc = FSP_STATUS_GENERIC_ERROR; + + if (!prd_enabled) { + prlog(PR_NOTICE, "PRD: %s: PRD daemon is not ready\n", + __func__); + return rc; + } + + /* Calculate prd message size */ + fw_notify_size = PRD_FW_MSG_BASE_SIZE + dsize; + size = sizeof(prd_msg->hdr) + sizeof(prd_msg->token) + + sizeof(prd_msg->fw_notify) + fw_notify_size; + + if (size > OPAL_PRD_MSG_SIZE_MAX) { + prlog(PR_DEBUG, "PRD: FSP - HBRT notify message size (0x%x)" + " is bigger than prd interface can handle\n", size); + return rc; + } + + lock(&events_lock); + + /* FSP - HBRT messages are serialized */ + if (prd_msg_fsp_notify) { + prlog(PR_DEBUG, "PRD: FSP - HBRT notify message is busy\n"); + goto unlock_events; + } + + /* Handle message allocation */ + prd_msg_fsp_notify = 
zalloc(size); + if (!prd_msg_fsp_notify) { + prlog(PR_DEBUG, + "PRD: %s: Failed to allocate memory.\n", __func__); + goto unlock_events; + } + + prd_msg_fsp_notify->hdr.type = OPAL_PRD_MSG_TYPE_FIRMWARE_NOTIFY; + prd_msg_fsp_notify->hdr.size = cpu_to_be16(size); + prd_msg_fsp_notify->token = 0; + prd_msg_fsp_notify->fw_notify.len = cpu_to_be64(fw_notify_size); + fw_notify = (void *)prd_msg_fsp_notify->fw_notify.data; + fw_notify->type = cpu_to_be64(PRD_FW_MSG_TYPE_HBRT_FSP); + memcpy(&(fw_notify->mbox_msg), data, dsize); + + if (!prd_active) { + // save the message, we'll deliver it when prd starts + rc = FSP_STATUS_BUSY; + goto unlock_events; + } + + rc = opal_queue_prd_msg(prd_msg_fsp_notify); + if (!rc) + prd_msg_inuse = true; + +unlock_events: + unlock(&events_lock); + return rc; +} + +/* incoming message handlers */ +static int prd_msg_handle_attn_ack(struct opal_prd_msg *msg) +{ + int rc; + + lock(&ipoll_lock); + rc = __ipoll_update_mask(be64_to_cpu(msg->attn_ack.proc), false, + be64_to_cpu(msg->attn_ack.ipoll_ack) & prd_ipoll_mask); + unlock(&ipoll_lock); + + if (rc) + prlog(PR_ERR, "PRD: Unable to unmask ipoll!\n"); + + return rc; +} + +static int prd_msg_handle_init(struct opal_prd_msg *msg) +{ + struct proc_chip *chip; + + lock(&ipoll_lock); + for_each_chip(chip) { + __ipoll_update_mask(chip->id, false, + be64_to_cpu(msg->init.ipoll) & prd_ipoll_mask); + } + unlock(&ipoll_lock); + + /* we're transitioning from inactive to active; send any pending tmgt + * interrupts */ + lock(&events_lock); + prd_active = true; + + if (prd_msg_fsp_notify) { + if (!opal_queue_prd_msg(prd_msg_fsp_notify)) + prd_msg_inuse = true; + } + if (!prd_msg_inuse) + send_next_pending_event(); + unlock(&events_lock); + + return OPAL_SUCCESS; +} + +static int prd_msg_handle_fini(void) +{ + struct proc_chip *chip; + + lock(&events_lock); + prd_active = false; + unlock(&events_lock); + + lock(&ipoll_lock); + for_each_chip(chip) { + __ipoll_update_mask(chip->id, true, prd_ipoll_mask); + } + unlock(&ipoll_lock); + + return OPAL_SUCCESS; +} + +static int prd_msg_handle_firmware_req(struct opal_prd_msg *msg) +{ + unsigned long fw_req_len, fw_resp_len, data_len; + struct prd_fw_msg *fw_req, *fw_resp; + int rc; + uint64_t resp_msg_size; + + fw_req_len = be64_to_cpu(msg->fw_req.req_len); + fw_resp_len = be64_to_cpu(msg->fw_req.resp_len); + fw_req = (struct prd_fw_msg *)msg->fw_req.data; + + /* do we have a full firmware message? */ + if (fw_req_len < sizeof(struct prd_fw_msg)) + return -EINVAL; + + /* does the total (outer) PRD message len provide enough data for the + * claimed (inner) FW message? + */ + if (be16_to_cpu(msg->hdr.size) < fw_req_len + + offsetof(struct opal_prd_msg, fw_req.data)) + return -EINVAL; + + /* is there enough response buffer for a base response? Type-specific + * responses may be larger, but anything less than BASE_SIZE is + * invalid. */ + if (fw_resp_len < PRD_FW_MSG_BASE_SIZE) + return -EINVAL; + + /* prepare a response message. 
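+ * For PRD_FW_MSG_TYPE_REQ_NOP and PRD_FW_MSG_TYPE_ERROR_LOG the response
+ * is built and queued directly below; for PRD_FW_MSG_TYPE_HBRT_FSP the
+ * request is copied into prd_msg_fsp_req, forwarded to the FSP, and the
+ * response is only queued later from prd_fw_resp_fsp_response() once the
+ * FSP answers.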
*/ + lock(&events_lock); + prd_msg_inuse = true; + prd_msg->token = 0; + prd_msg->hdr.type = OPAL_PRD_MSG_TYPE_FIRMWARE_RESPONSE; + fw_resp = (void *)prd_msg->fw_resp.data; + + switch (be64_to_cpu(fw_req->type)) { + case PRD_FW_MSG_TYPE_REQ_NOP: + fw_resp->type = cpu_to_be64(PRD_FW_MSG_TYPE_RESP_NOP); + prd_msg->fw_resp.len = cpu_to_be64(PRD_FW_MSG_BASE_SIZE); + prd_msg->hdr.size = cpu_to_be16(sizeof(*prd_msg)); + rc = 0; + break; + case PRD_FW_MSG_TYPE_ERROR_LOG: + if (platform.prd == NULL || + platform.prd->send_error_log == NULL) { + rc = OPAL_UNSUPPORTED; + break; + } + + rc = platform.prd->send_error_log(be32_to_cpu(fw_req->errorlog.plid), + be32_to_cpu(fw_req->errorlog.size), + fw_req->errorlog.data); + /* Return generic response to HBRT */ + fw_resp->type = cpu_to_be64(PRD_FW_MSG_TYPE_RESP_GENERIC); + fw_resp->generic_resp.status = cpu_to_be64(rc); + prd_msg->fw_resp.len = cpu_to_be64(PRD_FW_MSG_BASE_SIZE + + sizeof(fw_resp->generic_resp)); + prd_msg->hdr.size = cpu_to_be16(sizeof(*prd_msg)); + rc = 0; + break; + case PRD_FW_MSG_TYPE_HBRT_FSP: + if (platform.prd == NULL || + platform.prd->send_hbrt_msg == NULL) { + rc = OPAL_UNSUPPORTED; + break; + } + + /* + * HBRT -> FSP messages are serialized. Just to be sure check + * whether fsp_req message is free or not. + */ + if (prd_msg_fsp_req) { + prlog(PR_DEBUG, "PRD: HBRT - FSP message is busy\n"); + rc = OPAL_BUSY; + break; + } + + /* + * FSP interface doesn't tell us the response data size. + * Hence pass response length = request length. + */ + resp_msg_size = sizeof(msg->hdr) + sizeof(msg->token) + + sizeof(msg->fw_resp) + fw_req_len; + + if (resp_msg_size > OPAL_PRD_MSG_SIZE_MAX) { + prlog(PR_DEBUG, "PRD: HBRT - FSP response size (0x%llx)" + " is bigger than prd interface can handle\n", + resp_msg_size); + rc = OPAL_INTERNAL_ERROR; + break; + } + + /* + * We will use fsp_queue_msg() to pass HBRT data to FSP. + * We cannot directly map kernel passed data as kernel + * will release the memory as soon as we return the control. + * Also FSP uses same memory to pass response to HBRT. Hence + * lets copy data to local memory. Then pass this memory to + * FSP via TCE mapping. + */ + prd_msg_fsp_req = zalloc(resp_msg_size); + if (!prd_msg_fsp_req) { + prlog(PR_DEBUG, "PRD: Failed to allocate memory " + "for HBRT - FSP message\n"); + rc = OPAL_RESOURCE; + break; + } + + /* Update message header */ + prd_msg_fsp_req->hdr.type = OPAL_PRD_MSG_TYPE_FIRMWARE_RESPONSE; + prd_msg_fsp_req->hdr.size = cpu_to_be16(resp_msg_size); + prd_msg_fsp_req->token = 0; + prd_msg_fsp_req->fw_resp.len = cpu_to_be64(fw_req_len); + + /* copy HBRT data to local memory */ + fw_resp = (struct prd_fw_msg *)prd_msg_fsp_req->fw_resp.data; + memcpy(fw_resp, fw_req, fw_req_len); + + /* Update response type */ + fw_resp->type = cpu_to_be64(PRD_FW_MSG_TYPE_HBRT_FSP); + + /* Get MBOX message size */ + data_len = fw_req_len - PRD_FW_MSG_BASE_SIZE; + + /* We have to wait until FSP responds */ + prd_msg_inuse = false; + /* Unlock to avoid recursive lock issue */ + unlock(&events_lock); + + /* Send message to FSP */ + rc = platform.prd->send_hbrt_msg(&(fw_resp->mbox_msg), data_len); + + /* + * Callback handler from hservice_send_hbrt_msg will take + * care of sending response to HBRT. So just send return + * code to Linux. 
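+ * On failure we fall through instead: re-take events_lock, free
+ * prd_msg_fsp_req, clear prd_msg_inuse and return the error so that a
+ * later HBRT_FSP request can be retried.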
+ */ + if (rc == OPAL_SUCCESS) + return rc; + + lock(&events_lock); + if (prd_msg_fsp_req) { + free(prd_msg_fsp_req); + prd_msg_fsp_req = NULL; + } + break; + default: + prlog(PR_DEBUG, "PRD: Unsupported fw_request type : 0x%llx\n", + be64_to_cpu(fw_req->type)); + rc = -ENOSYS; + } + + if (!rc) { + rc = opal_queue_prd_msg(prd_msg); + if (rc) + prd_msg_inuse = false; + } else { + prd_msg_inuse = false; + } + + unlock(&events_lock); + + return rc; +} + +/* Entry from the host above */ +static int64_t opal_prd_msg(struct opal_prd_msg *msg) +{ + int rc; + + /* fini is a little special: the kernel (which may not have the entire + * opal_prd_msg definition) can send a FINI message, so we don't check + * the full size */ + if (be16_to_cpu(msg->hdr.size) >= sizeof(struct opal_prd_msg_header) && + msg->hdr.type == OPAL_PRD_MSG_TYPE_FINI) + return prd_msg_handle_fini(); + + if (be16_to_cpu(msg->hdr.size) < sizeof(*msg)) + return OPAL_PARAMETER; + + switch (msg->hdr.type) { + case OPAL_PRD_MSG_TYPE_INIT: + rc = prd_msg_handle_init(msg); + break; + case OPAL_PRD_MSG_TYPE_ATTN_ACK: + rc = prd_msg_handle_attn_ack(msg); + break; + case OPAL_PRD_MSG_TYPE_OCC_RESET_NOTIFY: + rc = occ_msg_queue_occ_reset(); + break; + case OPAL_PRD_MSG_TYPE_FIRMWARE_REQUEST: + rc = prd_msg_handle_firmware_req(msg); + break; + case OPAL_PRD_MSG_TYPE_FSP_OCC_RESET_STATUS: + if (platform.prd == NULL || + platform.prd->fsp_occ_reset_status == NULL) { + rc = OPAL_UNSUPPORTED; + break; + } + rc = platform.prd->fsp_occ_reset_status( + be64_to_cpu(msg->fsp_occ_reset_status.chip), + be64_to_cpu(msg->fsp_occ_reset_status.status)); + break; + case OPAL_PRD_MSG_TYPE_CORE_SPECIAL_WAKEUP: + if (platform.prd == NULL || + platform.prd->wakeup == NULL) { + rc = OPAL_UNSUPPORTED; + break; + } + rc = platform.prd->wakeup(be32_to_cpu(msg->spl_wakeup.core), + be32_to_cpu(msg->spl_wakeup.mode)); + break; + case OPAL_PRD_MSG_TYPE_FSP_OCC_LOAD_START_STATUS: + if (platform.prd == NULL || + platform.prd->fsp_occ_load_start_status == NULL) { + rc = OPAL_UNSUPPORTED; + break; + } + rc = platform.prd->fsp_occ_load_start_status( + be64_to_cpu(msg->fsp_occ_reset_status.chip), + be64_to_cpu(msg->fsp_occ_reset_status.status)); + break; + default: + prlog(PR_DEBUG, "PRD: Unsupported prd message type : 0x%x\n", + msg->hdr.type); + rc = OPAL_UNSUPPORTED; + } + + return rc; +} + + +/* + * Initialise the Opal backend for the PRD daemon. This must be called from + * platform probe or init function. 
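+ *
+ * A platform would typically just call it once from its hook, e.g.
+ * (hypothetical platform, not code from this tree):
+ *
+ *   static void my_platform_init(void)
+ *   {
+ *           prd_init();
+ *   }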
+ */ +void prd_init(void) +{ + struct proc_chip *chip; + + switch (proc_gen) { + case proc_gen_p8: + prd_ipoll_mask_reg = PRD_P8_IPOLL_REG_MASK; + prd_ipoll_status_reg = PRD_P8_IPOLL_REG_STATUS; + prd_ipoll_mask = PRD_P8_IPOLL_MASK; + break; + case proc_gen_p9: + prd_ipoll_mask_reg = PRD_P9_IPOLL_REG_MASK; + prd_ipoll_status_reg = PRD_P9_IPOLL_REG_STATUS; + prd_ipoll_mask = PRD_P9_IPOLL_MASK; + break; + case proc_gen_p10: /* IPOLL regs are the same for p9 and p10 */ + prd_ipoll_mask_reg = PRD_P9_IPOLL_REG_MASK; + prd_ipoll_status_reg = PRD_P9_IPOLL_REG_STATUS; + prd_ipoll_mask = PRD_P9_IPOLL_MASK; + break; + default: + assert(0); + } + + /* mask everything */ + lock(&ipoll_lock); + for_each_chip(chip) { + __ipoll_update_mask(chip->id, true, prd_ipoll_mask); + } + unlock(&ipoll_lock); + + prd_enabled = true; + opal_register(OPAL_PRD_MSG, opal_prd_msg, 1); + + prd_node = dt_new(opal_node, "diagnostics"); + dt_add_property_strings(prd_node, "compatible", "ibm,opal-prd"); +} + +void prd_register_reserved_memory(void) +{ + struct mem_region *region; + + if (!prd_node) + return; + + lock(&mem_region_lock); + for (region = mem_region_next(NULL); region; + region = mem_region_next(region)) { + + if (region->type != REGION_FW_RESERVED) + continue; + + if (!region->node) + continue; + + if (!dt_find_property(region->node, "ibm,prd-label")) { + dt_add_property_string(region->node, "ibm,prd-label", + region->name); + } + } + unlock(&mem_region_lock); +} diff --git a/roms/skiboot/hw/psi.c b/roms/skiboot/hw/psi.c new file mode 100644 index 000000000..de074ce4a --- /dev/null +++ b/roms/skiboot/hw/psi.c @@ -0,0 +1,1079 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* + * Service Processor serial console handling code + * + * Copyright 2013-2019 IBM Corp. + */ + +#include <io.h> +#include <psi.h> +#include <fsp.h> +#include <opal.h> +#include <interrupts.h> +#include <cpu.h> +#include <dio-p9.h> +#include <trace.h> +#include <xscom.h> +#include <chip.h> +#include <lpc.h> +#include <i2c.h> +#include <timebase.h> +#include <platform.h> +#include <errorlog.h> +#include <xive.h> +#include <sbe-p9.h> +#include <phys-map.h> +#include <occ.h> + +static LIST_HEAD(psis); +static u64 psi_link_timer; +static u64 psi_link_timeout; +static bool psi_link_poll_active; + +static void psi_activate_phb(struct psi *psi); + +struct lock psi_lock = LOCK_UNLOCKED; + +DEFINE_LOG_ENTRY(OPAL_RC_PSI_TIMEOUT, OPAL_PLATFORM_ERR_EVT, OPAL_PSI, + OPAL_PLATFORM_FIRMWARE, + OPAL_UNRECOVERABLE_ERR_LOSS_OF_FUNCTION, OPAL_NA); + +void psi_set_link_polling(bool active) +{ + printf("PSI: %sing link polling\n", + active ? "start" : "stopp"); + psi_link_poll_active = active; +} + +void psi_disable_link(struct psi *psi) +{ + lock(&psi_lock); + + /* + * Note: This can be called with the link already down but + * not detected as such yet by this layer since psi_check_link_active() + * operates locklessly and thus won't update the PSI structure. This + * is a non-issue, the only consequence is the messages in the log + * mentioning first the link having gone down then being disabled. 
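+ * (psi_check_link_active() below deliberately reads PSIHB_CR without
+ * taking psi_lock so that it can be used from the FSP poke path without
+ * risking re-entrancy or deadlocks.)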
+ */ + if (psi->active) { + u64 reg; + psi->active = false; + + /* Mask errors in SEMR */ + reg = in_be64(psi->regs + PSIHB_SEMR); + reg &= ((0xfffull << 36) | (0xfffull << 20)); + out_be64(psi->regs + PSIHB_SEMR, reg); + printf("PSI: SEMR set to %llx\n", reg); + + /* Reset all the error bits in PSIHB_CR and + * disable FSP interrupts + */ + reg = in_be64(psi->regs + PSIHB_CR); + reg &= ~(0x7ffull << 20); + reg &= ~PSIHB_CR_PSI_LINK_ENABLE; /* flip link enable */ + /* + * Ensure no commands/spurious interrupts reach + * the processor, by flipping the command enable. + */ + reg &= ~PSIHB_CR_FSP_CMD_ENABLE; + reg &= ~PSIHB_CR_FSP_IRQ_ENABLE; + reg &= ~PSIHB_CR_FSP_IRQ; /* Clear interrupt state too */ + printf("PSI[0x%03x]: Disabling link!\n", psi->chip_id); + out_be64(psi->regs + PSIHB_CR, reg); + printf("PSI: PSIHB_CR (error bits) set to %llx\n", + in_be64(psi->regs + PSIHB_CR)); + psi_set_link_polling(true); + } + + unlock(&psi_lock); +} + +/* + * Resetting the FSP is a multi step sequence: + * 1. Read the PSIHBCR + * 2. Set the PSIHBCR[6] -- write register back. + * 3. Read PSIHBCR again + * 4. Reset PSIHBCR[6] -- write register back. + */ +void psi_reset_fsp(struct psi *psi) +{ + lock(&psi_lock); + + if (psi->active) { + u64 reg; + + printf("PSI: Driving FSP reset via PSI\n"); + reg = in_be64(psi->regs + PSIHB_CR); + reg &= ~(0xfffull << 20); /* Reset error bits */ + reg |= PSIHB_CR_FSP_RESET; /* FSP reset trigger start */ + out_be64(psi->regs + PSIHB_CR, reg); + printf("PSI[0x%03x]: FSP reset start PSIHBCR set to %llx\n", + psi->chip_id, in_be64(psi->regs + PSIHB_CR)); + + reg = in_be64(psi->regs + PSIHB_CR); + reg &= ~PSIHB_CR_FSP_RESET; /* Clear FSP reset bit */ + out_be64(psi->regs + PSIHB_CR, reg); /* Complete reset */ + printf("PSI[0x%03x]: FSP reset complete. PSIHBCR set to %llx\n", + psi->chip_id, in_be64(psi->regs + PSIHB_CR)); + } + unlock(&psi_lock); + + /* Now bring down the PSI link too... 
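+ * psi_disable_link() masks the error sources in SEMR, clears the link,
+ * command and interrupt enables in PSIHB_CR, and turns link polling back
+ * on so psi_link_poll() can notice when the FSP comes back.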
*/ + psi_disable_link(psi); +} + +bool psi_check_link_active(struct psi *psi) +{ + u64 val = in_be64(psi->regs + PSIHB_CR); + + /* + * Unlocked, used during fsp_poke_msg so we really want + * to avoid fancy link re-entrancy and deadlocks here + */ + if (!psi->active) + return false; + return (val & PSIHB_CR_PSI_LINK_ENABLE) && + (val & PSIHB_CR_FSP_LINK_ACTIVE); +} + +struct psi *psi_find_link(uint32_t chip_id) +{ + struct psi *psi; + + list_for_each(&psis, psi, list) { + if (psi->chip_id == chip_id) + return psi; + } + return NULL; +} + +#define PSI_LINK_CHECK_INTERVAL 10 /* Interval in secs */ +#define PSI_LINK_RECOVERY_TIMEOUT 1800 /* 30 minutes */ + +static void psi_link_poll(void *data __unused) +{ + struct psi *psi; + u64 now; + + if (!psi_link_poll_active) + return; + + now = mftb(); + if (psi_link_timer == 0 || + (tb_compare(now, psi_link_timer) == TB_AAFTERB) || + (tb_compare(now, psi_link_timer) == TB_AEQUALB)) { + + lock(&psi_lock); + + list_for_each(&psis, psi, list) { + u64 val; + + if (psi->active) + continue; + + val = in_be64(psi->regs + PSIHB_CR); + + printf("PSI[0x%03x]: Poll CR=0x%016llx\n", + psi->chip_id, val); + + if ((val & PSIHB_CR_PSI_LINK_ENABLE) && + (val & PSIHB_CR_FSP_LINK_ACTIVE)) { + printf("PSI[0x%03x]: Found active link!\n", + psi->chip_id); + psi_link_timeout = 0; + psi->active = true; + psi_activate_phb(psi); + psi_set_link_polling(false); + unlock(&psi_lock); + if (platform.psi && platform.psi->link_established) + platform.psi->link_established(); + return; + } + } + if (!psi_link_timeout) + psi_link_timeout = + now + secs_to_tb(PSI_LINK_RECOVERY_TIMEOUT); + + if (tb_compare(now, psi_link_timeout) == TB_AAFTERB) { + log_simple_error(&e_info(OPAL_RC_PSI_TIMEOUT), + "PSI: Link timeout -- loss of FSP\n"); + /* Reset the link timeout and continue looking */ + psi_link_timeout = 0; + } + + /* Poll every 10 seconds */ + psi_link_timer = now + secs_to_tb(PSI_LINK_CHECK_INTERVAL); + + unlock(&psi_lock); + } +} + +void psi_enable_fsp_interrupt(struct psi *psi) +{ + /* Enable FSP interrupts in the GXHB */ + lock(&psi_lock); + out_be64(psi->regs + PSIHB_CR, + in_be64(psi->regs + PSIHB_CR) | PSIHB_CR_FSP_IRQ_ENABLE); + unlock(&psi_lock); +} + +/* Multiple bits can be set on errors */ +static void decode_psihb_error(u64 val) +{ + if (val & PSIHB_CR_PSI_ERROR) + printf("PSI: PSI Reported Error\n"); + if (val & PSIHB_CR_PSI_LINK_INACTIVE) + printf("PSI: PSI Link Inactive Transition\n"); + if (val & PSIHB_CR_FSP_ACK_TIMEOUT) + printf("PSI: FSP Ack Timeout\n"); + if (val & PSIHB_CR_MMIO_LOAD_TIMEOUT) + printf("PSI: MMIO Load Timeout\n"); + if (val & PSIHB_CR_MMIO_LENGTH_ERROR) + printf("PSI: MMIO Length Error\n"); + if (val & PSIHB_CR_MMIO_ADDRESS_ERROR) + printf("PSI: MMIO Address Error\n"); + if (val & PSIHB_CR_MMIO_TYPE_ERROR) + printf("PSI: MMIO Type Error\n"); + if (val & PSIHB_CR_UE) + printf("PSI: UE Detected\n"); + if (val & PSIHB_CR_PARITY_ERROR) + printf("PSI: Internal Parity Error\n"); + if (val & PSIHB_CR_SYNC_ERR_ALERT1) + printf("PSI: Sync Error Alert1\n"); + if (val & PSIHB_CR_SYNC_ERR_ALERT2) + printf("PSI: Sync Error Alert2\n"); + if (val & PSIHB_CR_FSP_COMMAND_ERROR) + printf("PSI: FSP Command Error\n"); +} + + +static void handle_psi_interrupt(struct psi *psi, u64 val) +{ + printf("PSI[0x%03x]: PSI mgmnt interrupt CR=0x%016llx\n", + psi->chip_id, val); + + if (val & (0xfffull << 20)) { + decode_psihb_error(val); + psi_disable_link(psi); + } else if (val & (0x1full << 11)) + printf("PSI: FSP error detected\n"); +} + +static void 
psi_spurious_fsp_irq(struct psi *psi) +{ + u64 reg, bit; + + prlog(PR_NOTICE, "PSI: Spurious interrupt, attempting clear\n"); + + if (proc_gen == proc_gen_p10) { + reg = PSIHB_XSCOM_P10_HBCSR_CLR; + bit = PSIHB_XSCOM_P10_HBSCR_FSP_IRQ; + } else if (proc_gen == proc_gen_p9) { + reg = PSIHB_XSCOM_P9_HBCSR_CLR; + bit = PSIHB_XSCOM_P9_HBSCR_FSP_IRQ; + } else if (proc_gen == proc_gen_p8) { + reg = PSIHB_XSCOM_P8_HBCSR_CLR; + bit = PSIHB_XSCOM_P8_HBSCR_FSP_IRQ; + } else { + assert(false); + } + xscom_write(psi->chip_id, psi->xscom_base + reg, bit); +} + +bool psi_poll_fsp_interrupt(struct psi *psi) +{ + return !!(in_be64(psi->regs + PSIHB_CR) & PSIHB_CR_FSP_IRQ); +} + +static void psihb_interrupt(struct irq_source *is, uint32_t isn __unused) +{ + struct psi *psi = is->data; + u64 val; + + val = in_be64(psi->regs + PSIHB_CR); + + if (psi_link_poll_active) { + printf("PSI[0x%03x]: PSI interrupt CR=0x%016llx (A=%d)\n", + psi->chip_id, val, psi->active); + } + + /* Handle PSI interrupts first in case it's a link down */ + if (val & PSIHB_CR_PSI_IRQ) { + handle_psi_interrupt(psi, val); + + /* + * If the link went down, re-read PSIHB_CR as + * the FSP interrupt might have been cleared. + */ + if (!psi->active) + val = in_be64(psi->regs + PSIHB_CR); + } + + + /* + * We avoid forwarding FSP interrupts if the link isn't + * active. They should be masked anyway but it looks + * like the CR bit can remain set. + */ + if (val & PSIHB_CR_FSP_IRQ) { + /* + * We have a case a flood with FSP mailbox interrupts + * when the link is down, see if we manage to clear + * the condition + */ + if (!psi->active) + psi_spurious_fsp_irq(psi); + else { + if (platform.psi && platform.psi->fsp_interrupt) + platform.psi->fsp_interrupt(); + } + } + + if (platform.psi && platform.psi->psihb_interrupt) + platform.psi->psihb_interrupt(); +} + + +static const uint32_t psi_p8_irq_to_xivr[P8_IRQ_PSI_IRQ_COUNT] = { + [P8_IRQ_PSI_FSP] = PSIHB_XIVR_FSP, + [P8_IRQ_PSI_OCC] = PSIHB_XIVR_OCC, + [P8_IRQ_PSI_FSI] = PSIHB_XIVR_FSI, + [P8_IRQ_PSI_LPC] = PSIHB_XIVR_LPC, + [P8_IRQ_PSI_LOCAL_ERR] = PSIHB_XIVR_LOCAL_ERR, + [P8_IRQ_PSI_EXTERNAL]= PSIHB_XIVR_HOST_ERR, +}; + +static void psi_cleanup_irq(struct psi *psi) +{ + uint32_t irq; + uint64_t xivr, xivr_p; + + for (irq = 0; irq < P8_IRQ_PSI_IRQ_COUNT; irq++) { + prlog(PR_DEBUG, "PSI[0x%03x]: Cleaning up IRQ %d\n", + psi->chip_id, irq); + + xivr_p = psi_p8_irq_to_xivr[irq]; + xivr = in_be64(psi->regs + xivr_p); + xivr |= (0xffull << 32); + out_be64(psi->regs + xivr_p, xivr); + time_wait_ms_nopoll(10); + xivr = in_be64(psi->regs + xivr_p); + if (xivr & PPC_BIT(39)) { + printf(" Need EOI !\n"); + icp_send_eoi(psi->interrupt + irq); + } + } +} + +/* Called on a fast reset, make sure we aren't stuck with + * an accepted and never EOId PSI interrupt + */ +void psi_irq_reset(void) +{ + struct psi *psi; + + printf("PSI: Hot reset!\n"); + + assert(proc_gen == proc_gen_p8); + + list_for_each(&psis, psi, list) { + psi_cleanup_irq(psi); + } +} + +static int64_t psi_p8_set_xive(struct irq_source *is, uint32_t isn, + uint16_t server, uint8_t priority) +{ + struct psi *psi = is->data; + uint64_t xivr_p, xivr; + uint32_t irq_idx = isn & 7; + + if (irq_idx >= P8_IRQ_PSI_IRQ_COUNT) + return OPAL_PARAMETER; + xivr_p = psi_p8_irq_to_xivr[irq_idx]; + + /* Populate the XIVR */ + xivr = (uint64_t)server << 40; + xivr |= (uint64_t)priority << 32; + xivr |= (uint64_t)(isn & 7) << 29; + + out_be64(psi->regs + xivr_p, xivr); + + return OPAL_SUCCESS; +} + +static int64_t psi_p8_get_xive(struct irq_source *is, uint32_t 
isn __unused, + uint16_t *server, uint8_t *priority) +{ + struct psi *psi = is->data; + uint64_t xivr_p, xivr; + uint32_t irq_idx = isn & 7; + + if (irq_idx >= P8_IRQ_PSI_IRQ_COUNT) + return OPAL_PARAMETER; + + xivr_p = psi_p8_irq_to_xivr[irq_idx]; + + /* Read & decode the XIVR */ + xivr = in_be64(psi->regs + xivr_p); + + *server = (xivr >> 40) & 0xffff; + *priority = (xivr >> 32) & 0xff; + + return OPAL_SUCCESS; +} + +static void psihb_p8_interrupt(struct irq_source *is, uint32_t isn) +{ + struct psi *psi = is->data; + uint32_t idx = isn - psi->interrupt; + + switch (idx) { + case P8_IRQ_PSI_FSP: + psihb_interrupt(is, isn); + break; + case P8_IRQ_PSI_OCC: + occ_p8_interrupt(psi->chip_id); + break; + case P8_IRQ_PSI_FSI: + printf("PSI: FSI irq received\n"); + break; + case P8_IRQ_PSI_LPC: + lpc_interrupt(psi->chip_id); + + /* + * i2c interrupts are ORed with the LPC ones on + * Murano DD2.1 and Venice DD2.0 + */ + p8_i2c_interrupt(psi->chip_id); + break; + case P8_IRQ_PSI_LOCAL_ERR: + prd_psi_interrupt(psi->chip_id); + break; + case P8_IRQ_PSI_EXTERNAL: + if (platform.external_irq) + platform.external_irq(psi->chip_id); + break; + } + + /* + * TODO: Per Vicente Chung, CRESPs don't generate interrupts, + * and are just informational. Need to define the policy + * to handle them. + */ +} + +static uint64_t psi_p8_irq_attributes(struct irq_source *is, uint32_t isn) +{ + struct psi *psi = is->data; + uint32_t idx = isn - psi->interrupt; + uint64_t attr; + + if (psi->no_lpc_irqs && idx == P8_IRQ_PSI_LPC) + return IRQ_ATTR_TARGET_LINUX; + + /* Only direct external interrupts to OPAL if we have a handler */ + if (idx == P8_IRQ_PSI_EXTERNAL && !platform.external_irq) + return IRQ_ATTR_TARGET_LINUX; + + attr = IRQ_ATTR_TARGET_OPAL | IRQ_ATTR_TYPE_LSI; + if (idx == P8_IRQ_PSI_EXTERNAL || idx == P8_IRQ_PSI_LPC || + idx == P8_IRQ_PSI_FSP) + attr |= IRQ_ATTR_TARGET_FREQUENT; + return attr; +} + +static char *psi_p8_irq_name(struct irq_source *is, uint32_t isn) +{ + struct psi *psi = is->data; + uint32_t idx = isn - psi->interrupt; + char tmp[30]; + + static const char *names[P8_IRQ_PSI_IRQ_COUNT] = { + "fsp", + "occ", + "fsi", + "lpchc", + "local_err", + "external", + }; + + if (idx >= P8_IRQ_PSI_IRQ_COUNT) + return NULL; + + snprintf(tmp, sizeof(tmp), "psi#%x:%s", + psi->chip_id, names[idx]); + + return strdup(tmp); +} + +static const struct irq_source_ops psi_p8_irq_ops = { + .get_xive = psi_p8_get_xive, + .set_xive = psi_p8_set_xive, + .interrupt = psihb_p8_interrupt, + .attributes = psi_p8_irq_attributes, + .name = psi_p8_irq_name, +}; + +static const char *psi_p9_irq_names[P9_PSI_NUM_IRQS] = { + "fsp", + "occ", + "fsi", + "lpchc", + "local_err", + "global_err", + "external", + "lpc_serirq_mux0", /* Have a callback to get name ? */ + "lpc_serirq_mux1", /* Have a callback to get name ? */ + "lpc_serirq_mux2", /* Have a callback to get name ? */ + "lpc_serirq_mux3", /* Have a callback to get name ? 
*/ + "i2c", + "dio", + "psu" +}; + +static void psi_p9_mask_all(struct psi *psi) +{ + struct irq_source *is; + int isn; + + /* Mask all sources */ + is = irq_find_source(psi->interrupt); + for (isn = is->start; isn < is->end; isn++) + xive_source_mask(is, isn); +} + +static void psi_p9_mask_unhandled_irq(struct irq_source *is, uint32_t isn) +{ + struct psi *psi = is->data; + int idx = isn - psi->interrupt; + const char *name; + + if (idx < ARRAY_SIZE(psi_p9_irq_names)) + name = psi_p9_irq_names[idx]; + else + name = "unknown!"; + + prerror("PSI[0x%03x]: Masking unhandled LSI %d (%s)\n", + psi->chip_id, idx, name); + + /* + * All the PSI interrupts are LSIs and will be constantly re-fired + * unless the underlying interrupt condition is cleared. If we don't + * have a handler for the interrupt then it needs to be masked to + * prevent the IRQ from locking up the thread which handles it. + */ + switch (proc_gen) { + case proc_gen_p9: + xive_source_mask(is, isn); + break; + case proc_gen_p10: + xive2_source_mask(is, isn); + return; + default: + assert(false); + } + +} + +static void psihb_p9_interrupt(struct irq_source *is, uint32_t isn) +{ + struct psi *psi = is->data; + uint32_t idx = isn - psi->interrupt; + + switch (idx) { + case P9_PSI_IRQ_PSI: + psihb_interrupt(is, isn); + break; + case P9_PSI_IRQ_OCC: + occ_p9_interrupt(psi->chip_id); + break; + case P9_PSI_IRQ_LPCHC: + lpc_interrupt(psi->chip_id); + break; + case P9_PSI_IRQ_LOCAL_ERR: + prd_psi_interrupt(psi->chip_id); + break; + case P9_PSI_IRQ_EXTERNAL: + if (platform.external_irq) + platform.external_irq(psi->chip_id); + else + psi_p9_mask_unhandled_irq(is, isn); + break; + case P9_PSI_IRQ_LPC_SIRQ0: + case P9_PSI_IRQ_LPC_SIRQ1: + case P9_PSI_IRQ_LPC_SIRQ2: + case P9_PSI_IRQ_LPC_SIRQ3: + lpc_serirq(psi->chip_id, idx - P9_PSI_IRQ_LPC_SIRQ0); + break; + case P9_PSI_IRQ_SBE_I2C: + p8_i2c_interrupt(psi->chip_id); + break; + case P9_PSI_IRQ_DIO: + printf("PSI: DIO irq received\n"); + dio_interrupt_handler(psi->chip_id); + break; + case P9_PSI_IRQ_PSU: + p9_sbe_interrupt(psi->chip_id); + break; + + default: + psi_p9_mask_unhandled_irq(is, isn); + } +} + +static uint64_t psi_p9_irq_attributes(struct irq_source *is __unused, + uint32_t isn) +{ + struct psi *psi = is->data; + unsigned int idx = isn & 0xf; + bool is_lpc_serirq; + + is_lpc_serirq = + (idx == P9_PSI_IRQ_LPC_SIRQ0 || + idx == P9_PSI_IRQ_LPC_SIRQ1 || + idx == P9_PSI_IRQ_LPC_SIRQ2 || + idx == P9_PSI_IRQ_LPC_SIRQ3); + + /* If LPC interrupts are disabled, route them to Linux + * (who will not request them since they aren't referenced + * in the device tree) + */ + if (is_lpc_serirq && psi->no_lpc_irqs) + return IRQ_ATTR_TARGET_LINUX; + + /* For serirq, check the LPC layer for policy */ + if (is_lpc_serirq) + return lpc_get_irq_policy(psi->chip_id, idx - P9_PSI_IRQ_LPC_SIRQ0); + + /* Only direct external interrupts to OPAL if we have a handler */ + if (idx == P9_PSI_IRQ_EXTERNAL && !platform.external_irq) + return IRQ_ATTR_TARGET_LINUX | IRQ_ATTR_TYPE_LSI; + + return IRQ_ATTR_TARGET_OPAL | IRQ_ATTR_TYPE_LSI; +} + +static char *psi_p9_irq_name(struct irq_source *is, uint32_t isn) +{ + struct psi *psi = is->data; + uint32_t idx = isn - psi->interrupt; + char tmp[30]; + + if (idx >= ARRAY_SIZE(psi_p9_irq_names)) + return NULL; + + snprintf(tmp, sizeof(tmp), "psi#%x:%s", + psi->chip_id, psi_p9_irq_names[idx]); + + return strdup(tmp); +} + +static const struct irq_source_ops psi_p9_irq_ops = { + .interrupt = psihb_p9_interrupt, + .attributes = psi_p9_irq_attributes, + .name = 
psi_p9_irq_name, +}; + +static void psi_init_p8_interrupts(struct psi *psi) +{ + uint32_t irq; + uint64_t xivr_p; + + /* On P8 we get a block of 8, set up the base/mask + * and mask all the sources for now + */ + out_be64(psi->regs + PSIHB_IRSN, + SETFIELD(PSIHB_IRSN_COMP, 0ul, psi->interrupt) | + SETFIELD(PSIHB_IRSN_MASK, 0ul, 0x7fff8ul) | + PSIHB_IRSN_DOWNSTREAM_EN | + PSIHB_IRSN_UPSTREAM_EN); + + for (irq = 0; irq < P8_IRQ_PSI_IRQ_COUNT; irq++) { + xivr_p = psi_p8_irq_to_xivr[irq]; + out_be64(psi->regs + xivr_p, (0xffull << 32) | (irq << 29)); + } + + /* + * Register the IRQ sources FSP, OCC, FSI, LPC + * and Local Error. Host Error is actually the + * external interrupt and the policy for that comes + * from the platform + */ + register_irq_source(&psi_p8_irq_ops, psi, + psi->interrupt, P8_IRQ_PSI_IRQ_COUNT); +} + +static void psi_init_p9_interrupts(struct psi *psi) +{ + struct proc_chip *chip; + u64 val; + + /* Grab chip */ + chip = get_chip(psi->chip_id); + if (!chip) + return; + + /* Configure the CI BAR */ + phys_map_get(chip->id, PSIHB_ESB, 0, &val, NULL); + val |= PSIHB_ESB_CI_VALID; + out_be64(psi->regs + PSIHB_ESB_CI_BASE, val); + + val = in_be64(psi->regs + PSIHB_ESB_CI_BASE); + psi->esb_mmio = (void *)(val & ~PSIHB_ESB_CI_VALID); + prlog(PR_DEBUG, "PSI[0x%03x]: ESB MMIO at @%p\n", + psi->chip_id, psi->esb_mmio); + + /* Register sources */ + prlog(PR_DEBUG, + "PSI[0x%03x]: Interrupts sources registered for P9 DD2.x\n", + psi->chip_id); + xive_register_hw_source(psi->interrupt, P9_PSI_NUM_IRQS, + 12, psi->esb_mmio, XIVE_SRC_LSI, + psi, &psi_p9_irq_ops); + + psi_p9_mask_all(psi); + + /* Setup interrupt offset */ + val = xive_get_notify_base(psi->interrupt); + val <<= 32; + out_be64(psi->regs + PSIHB_IVT_OFFSET, val); + + /* Grab and configure the notification port */ + val = xive_get_notify_port(psi->chip_id, XIVE_HW_SRC_PSI); + val |= PSIHB_ESB_NOTIF_VALID; + out_be64(psi->regs + PSIHB_ESB_NOTIF_ADDR, val); + + /* Reset irq handling and switch to ESB mode */ + out_be64(psi->regs + PSIHB_INTERRUPT_CONTROL, PSIHB_IRQ_RESET); + out_be64(psi->regs + PSIHB_INTERRUPT_CONTROL, 0); +} + +/* + * P9 and P10 have the same PSIHB interface + */ +static const struct irq_source_ops psi_p10_irq_ops = { + .interrupt = psihb_p9_interrupt, + .attributes = psi_p9_irq_attributes, + .name = psi_p9_irq_name, +}; + +#define PSIHB10_CAN_STORE_EOI(x) XIVE2_STORE_EOI_ENABLED + +static void psi_init_p10_interrupts(struct psi *psi) +{ + struct proc_chip *chip; + u64 val; + uint32_t esb_shift = 16; + uint32_t flags = XIVE_SRC_LSI; + struct irq_source *is; + int isn; + + /* Grab chip */ + chip = get_chip(psi->chip_id); + if (!chip) + return; + + /* Configure the CI BAR */ + phys_map_get(chip->id, PSIHB_ESB, 0, &val, NULL); + val |= PSIHB_ESB_CI_VALID; + if (esb_shift == 16) + val |= PSIHB10_ESB_CI_64K; + out_be64(psi->regs + PSIHB_ESB_CI_BASE, val); + + val = in_be64(psi->regs + PSIHB_ESB_CI_BASE); + psi->esb_mmio = (void *)(val & ~(PSIHB_ESB_CI_VALID|PSIHB10_ESB_CI_64K)); + prlog(PR_DEBUG, "PSI[0x%03x]: ESB MMIO at @%p\n", + psi->chip_id, psi->esb_mmio); + + /* Store EOI */ + if (PSIHB10_CAN_STORE_EOI(psi)) { + val = in_be64(psi->regs + PSIHB_CR); + val |= PSIHB10_CR_STORE_EOI; + out_be64(psi->regs + PSIHB_CR, val); + prlog(PR_DEBUG, "PSI[0x%03x]: store EOI is enabled\n", + psi->chip_id); + flags |= XIVE_SRC_STORE_EOI; + } + + /* Register sources */ + prlog(PR_DEBUG, + "PSI[0x%03x]: Interrupts sources registered for P10 DD%i.%i\n", + psi->chip_id, 0xf & (chip->ec_level >> 4), chip->ec_level & 0xf); + + 
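/* Hand the P9_PSI_NUM_IRQS LSIs starting at psi->interrupt over to XIVE2: 64k ESB pages (esb_shift == 16), LSI sources, and store EOI when it was enabled above. */ +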
xive2_register_hw_source(psi->interrupt, P9_PSI_NUM_IRQS, + esb_shift, psi->esb_mmio, flags, + psi, &psi_p10_irq_ops); + + /* Mask all sources */ + is = irq_find_source(psi->interrupt); + for (isn = is->start; isn < is->end; isn++) + xive2_source_mask(is, isn); + + /* Setup interrupt offset */ + val = xive2_get_notify_base(psi->interrupt); + val <<= 32; + out_be64(psi->regs + PSIHB_IVT_OFFSET, val); + + /* Grab and configure the notification port */ + val = xive2_get_notify_port(psi->chip_id, XIVE_HW_SRC_PSI); + val |= PSIHB_ESB_NOTIF_VALID; + out_be64(psi->regs + PSIHB_ESB_NOTIF_ADDR, val); + + /* Reset irq handling and switch to ESB mode */ + out_be64(psi->regs + PSIHB_INTERRUPT_CONTROL, PSIHB_IRQ_RESET); + out_be64(psi->regs + PSIHB_INTERRUPT_CONTROL, 0); +} + +static void psi_init_interrupts(struct psi *psi) +{ + /* Configure the interrupt BUID and mask it */ + switch (proc_gen) { + case proc_gen_p8: + psi_init_p8_interrupts(psi); + break; + case proc_gen_p9: + psi_init_p9_interrupts(psi); + break; + case proc_gen_p10: + psi_init_p10_interrupts(psi); + break; + default: + /* Unknown: just no interrupts */ + prerror("PSI: Unknown interrupt type\n"); + } +} + +static void psi_activate_phb(struct psi *psi) +{ + u64 reg; + + /* + * Disable interrupt emission in the control register, + * it will be re-enabled later, after the mailbox one + * will have been enabled. + */ + reg = in_be64(psi->regs + PSIHB_CR); + reg &= ~PSIHB_CR_FSP_IRQ_ENABLE; + out_be64(psi->regs + PSIHB_CR, reg); + + /* Enable interrupts in the mask register. We enable everything + * except for bit "FSP command error detected" which the doc + * (P7 BookIV) says should be masked for normal ops. It also + * seems to be masked under OPAL. + */ + reg = 0x0000010000100000ull; + out_be64(psi->regs + PSIHB_SEMR, reg); + +#if 0 + /* Dump the GXHB registers */ + printf(" PSIHB_BBAR : %llx\n", + in_be64(psi->regs + PSIHB_BBAR)); + printf(" PSIHB_FSPBAR : %llx\n", + in_be64(psi->regs + PSIHB_FSPBAR)); + printf(" PSIHB_FSPMMR : %llx\n", + in_be64(psi->regs + PSIHB_FSPMMR)); + printf(" PSIHB_TAR : %llx\n", + in_be64(psi->regs + PSIHB_TAR)); + printf(" PSIHB_CR : %llx\n", + in_be64(psi->regs + PSIHB_CR)); + printf(" PSIHB_SEMR : %llx\n", + in_be64(psi->regs + PSIHB_SEMR)); + printf(" PSIHB_XIVR : %llx\n", + in_be64(psi->regs + PSIHB_XIVR)); +#endif +} + +static void psi_create_p9_int_map(struct psi *psi, struct dt_node *np) +{ + __be32 map[P9_PSI_NUM_IRQS][4]; + int i; + + for (i = 0; i < P9_PSI_NUM_IRQS; i++) { + map[i][0] = cpu_to_be32(i); + map[i][1] = cpu_to_be32(get_ics_phandle()); + map[i][2] = cpu_to_be32(psi->interrupt + i); + map[i][3] = cpu_to_be32(1); + } + dt_add_property(np, "interrupt-map", map, sizeof(map)); + dt_add_property_cells(np, "#address-cells", 0); + dt_add_property_cells(np, "#interrupt-cells", 1); +} + +static void psi_create_mm_dtnode(struct psi *psi) +{ + struct dt_node *np; + uint64_t addr = (uint64_t)psi->regs; + + np = dt_new_addr(dt_root, "psi", addr); + if (!np) + return; + + /* Hard wire size to 4G */ + dt_add_property_u64s(np, "reg", addr, 0x100000000ull); + switch (proc_gen) { + case proc_gen_p8: + dt_add_property_strings(np, "compatible", "ibm,psi", + "ibm,power8-psi"); + break; + case proc_gen_p9: + case proc_gen_p10: + dt_add_property_strings(np, "compatible", "ibm,psi", + "ibm,power9-psi"); + psi_create_p9_int_map(psi, np); + break; + default: + assert(0); + break; + } + dt_add_property_cells(np, "interrupt-parent", get_ics_phandle()); + dt_add_property_cells(np, "interrupts", psi->interrupt, 1); 
+ dt_add_property_cells(np, "ibm,chip-id", psi->chip_id); + psi->node = np; +} + +static struct psi *alloc_psi(struct proc_chip *chip, uint64_t base) +{ + struct psi *psi; + + psi = zalloc(sizeof(struct psi)); + if (!psi) { + prerror("PSI: Could not allocate memory\n"); + return NULL; + } + psi->xscom_base = base; + psi->chip_id = chip->id; + return psi; +} + +static struct psi *psi_probe_p8(struct proc_chip *chip, u64 base) +{ + struct psi *psi = NULL; + uint64_t rc, val; + + rc = xscom_read(chip->id, base + PSIHB_XSCOM_P8_BASE, &val); + if (rc) { + prerror("PSI[0x%03x]: Error %llx reading PSIHB BAR\n", + chip->id, rc); + return NULL; + } + if (val & PSIHB_XSCOM_P8_HBBAR_EN) { + psi = alloc_psi(chip, base); + if (!psi) + return NULL; + psi->regs = (void *)(val & ~PSIHB_XSCOM_P8_HBBAR_EN); + psi->interrupt = get_psi_interrupt(chip->id); + } else + printf("PSI[0x%03x]: Working chip not found\n", chip->id); + + return psi; +} + +static struct psi *psi_probe_p9(struct proc_chip *chip, u64 base) +{ + struct psi *psi = NULL; + uint64_t addr; + + phys_map_get(chip->id, PSIHB_REG, 0, &addr, NULL); + xscom_write(chip->id, base + PSIHB_XSCOM_P9_BASE, + addr | PSIHB_XSCOM_P9_HBBAR_EN); + + psi = alloc_psi(chip, base); + if (!psi) + return NULL; + psi->regs = (void *)addr; + psi->interrupt = xive_alloc_hw_irqs(chip->id, P9_PSI_NUM_IRQS, 16); + return psi; +} + +static struct psi *psi_probe_p10(struct proc_chip *chip, u64 base) +{ + struct psi *psi = NULL; + uint64_t addr; + + phys_map_get(chip->id, PSIHB_REG, 0, &addr, NULL); + xscom_write(chip->id, base + PSIHB_XSCOM_P9_BASE, + addr | PSIHB_XSCOM_P9_HBBAR_EN); + + psi = alloc_psi(chip, base); + if (!psi) + return NULL; + psi->regs = (void *)addr; + psi->interrupt = xive2_alloc_hw_irqs(chip->id, P9_PSI_NUM_IRQS, 16); + return psi; +} + +static bool psi_init_psihb(struct dt_node *psihb) +{ + uint32_t chip_id = dt_get_chip_id(psihb); + struct proc_chip *chip = get_chip(chip_id); + struct psi *psi = NULL; + u64 base, val; + + if (!chip) { + prerror("PSI: Can't find chip!\n"); + return false; + } + + base = dt_get_address(psihb, 0, NULL); + + if (dt_node_is_compatible(psihb, "ibm,power8-psihb-x")) + psi = psi_probe_p8(chip, base); + else if (dt_node_is_compatible(psihb, "ibm,power9-psihb-x")) + psi = psi_probe_p9(chip, base); + else if (dt_node_is_compatible(psihb, "ibm,power10-psihb-x")) + psi = psi_probe_p10(chip, base); + else { + prerror("PSI: Unknown processor type\n"); + return false; + } + if (!psi) + return false; + + list_add(&psis, &psi->list); + + val = in_be64(psi->regs + PSIHB_CR); + if (val & PSIHB_CR_FSP_LINK_ACTIVE) { + lock(&psi_lock); + psi->active = true; + unlock(&psi_lock); + } + chip->psi = psi; + + if (dt_has_node_property(psihb, "no-lpc-interrupts", NULL)) + psi->no_lpc_irqs = true; + + psi_activate_phb(psi); + psi_init_interrupts(psi); + psi_create_mm_dtnode(psi); + + prlog(PR_INFO, "PSI[0x%03x]: Found PSI bridge [active=%d]\n", + psi->chip_id, psi->active); + return true; +} + +void psi_fsp_link_in_use(struct psi *psi __unused) +{ + static bool poller_created = false; + + /* Do this once only */ + if (!poller_created) { + poller_created = true; + opal_add_poller(psi_link_poll, NULL); + } +} + +struct psi *psi_find_functional_chip(void) +{ + return list_top(&psis, struct psi, list); +} + +void psi_init(void) +{ + struct dt_node *np; + + dt_for_each_compatible(dt_root, np, "ibm,psihb-x") + psi_init_psihb(np); +} + + diff --git a/roms/skiboot/hw/sbe-p8.c b/roms/skiboot/hw/sbe-p8.c new file mode 100644 index 000000000..73fa5f1f2 --- 
/dev/null +++ b/roms/skiboot/hw/sbe-p8.c @@ -0,0 +1,195 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* + * POWER8 Self Boot Engine (SLW - SLeep/Winkle) + * + * Copyright 2013-2018 IBM Corp. + */ + +#include <device.h> +#include <sbe-p8.h> +#include <skiboot.h> +#include <timebase.h> +#include <xscom.h> + +/* SLW timer related stuff */ +static bool sbe_has_timer; +static uint64_t sbe_timer_inc; +static uint64_t sbe_timer_target; +static uint32_t sbe_timer_chip; +static uint64_t sbe_last_gen; +static uint64_t sbe_last_gen_stamp; + +static void p8_sbe_dump_timer_ffdc(void) +{ + uint64_t i, val; + int64_t rc; + + static const uint32_t dump_regs[] = { + 0xe0000, 0xe0001, 0xe0002, 0xe0003, + 0xe0004, 0xe0005, 0xe0006, 0xe0007, + 0xe0008, 0xe0009, 0xe000a, 0xe000b, + 0xe000c, 0xe000d, 0xe000e, 0xe000f, + 0xe0010, 0xe0011, 0xe0012, 0xe0013, + 0xe0014, 0xe0015, 0xe0016, 0xe0017, + 0xe0018, 0xe0019, + 0x5001c, + 0x50038, 0x50039, 0x5003a, 0x5003b + }; + + /** + * @fwts-label SLWRegisterDump + * @fwts-advice An error condition occurred in sleep/winkle + * engines timer state machine. Dumping debug information to + * root-cause. OPAL/skiboot may be stuck on some operation that + * requires SLW timer state machine (e.g. core powersaving) + */ + prlog(PR_DEBUG, "SLW: Register state:\n"); + + for (i = 0; i < ARRAY_SIZE(dump_regs); i++) { + uint32_t reg = dump_regs[i]; + rc = xscom_read(sbe_timer_chip, reg, &val); + if (rc) { + prlog(PR_DEBUG, "SLW: XSCOM error %lld reading" + " reg 0x%x\n", rc, reg); + break; + } + prlog(PR_DEBUG, "SLW: %5x = %016llx\n", reg, val); + } +} + +/* This is called with the timer lock held, so there is no + * issue with re-entrancy or concurrence + */ +void p8_sbe_update_timer_expiry(uint64_t new_target) +{ + uint64_t count, gen, gen2, req, now; + int64_t rc; + + if (!sbe_has_timer || new_target == sbe_timer_target) + return; + + sbe_timer_target = new_target; + + _xscom_lock(); + now = mftb(); + /* Calculate how many increments from now, rounded up */ + if (now < new_target) + count = (new_target - now + sbe_timer_inc - 1) / sbe_timer_inc; + else + count = 1; + + /* Max counter is 24-bit */ + if (count > 0xffffff) + count = 0xffffff; + /* Fabricate update request */ + req = (1ull << 63) | (count << 32); + + prlog(PR_TRACE, "SLW: TMR expiry: 0x%llx, req: %016llx\n", count, req); + + do { + /* Grab generation and spin if odd */ + for (;;) { + rc = _xscom_read(sbe_timer_chip, 0xE0006, &gen, false); + if (rc) { + prerror("SLW: Error %lld reading tmr gen " + " count\n", rc); + _xscom_unlock(); + return; + } + if (!(gen & 1)) + break; + if (tb_compare(now + msecs_to_tb(1), mftb()) == TB_ABEFOREB) { + /** + * @fwts-label SLWTimerStuck + * @fwts-advice The SLeep/Winkle Engine (SLW) + * failed to increment the generation number + * within our timeout period (it *should* have + * done so within ~10us, not >1ms. OPAL uses + * the SLW timer to schedule some operations, + * but can fall back to the (much less frequent + * OPAL poller, which although does not affect + * functionality, runs *much* less frequently. + * This could have the effect of slow I2C + * operations (for example). It may also mean + * that you *had* an increase in jitter, due + * to slow interactions with SLW. + * This error may also occur if the machine + * is connected to via soft FSI. + */ + prerror("SLW: timer stuck, falling back to OPAL pollers. 
You will likely have slower I2C and may have experienced increased jitter.\n"); + prlog(PR_DEBUG, "SLW: Stuck with odd generation !\n"); + _xscom_unlock(); + sbe_has_timer = false; + p8_sbe_dump_timer_ffdc(); + return; + } + } + + rc = _xscom_write(sbe_timer_chip, 0x5003A, req, false); + if (rc) { + prerror("SLW: Error %lld writing tmr request\n", rc); + _xscom_unlock(); + return; + } + + /* Re-check gen count */ + rc = _xscom_read(sbe_timer_chip, 0xE0006, &gen2, false); + if (rc) { + prerror("SLW: Error %lld re-reading tmr gen " + " count\n", rc); + _xscom_unlock(); + return; + } + } while(gen != gen2); + _xscom_unlock(); + + /* Check if the timer is working. If at least 1ms has elapsed + * since the last call to this function, check that the gen + * count has changed + */ + if (tb_compare(sbe_last_gen_stamp + msecs_to_tb(1), now) + == TB_ABEFOREB) { + if (sbe_last_gen == gen) { + prlog(PR_ERR, + "SLW: Timer appears to not be running !\n"); + sbe_has_timer = false; + p8_sbe_dump_timer_ffdc(); + } + sbe_last_gen = gen; + sbe_last_gen_stamp = mftb(); + } + + prlog(PR_TRACE, "SLW: gen: %llx\n", gen); +} + +bool p8_sbe_timer_ok(void) +{ + return sbe_has_timer; +} + +void p8_sbe_init_timer(void) +{ + struct dt_node *np; + int64_t rc; + uint32_t tick_us; + + np = dt_find_compatible_node(dt_root, NULL, "ibm,power8-sbe-timer"); + if (!np) + return; + + sbe_timer_chip = dt_get_chip_id(np); + tick_us = dt_prop_get_u32(np, "tick-time-us"); + sbe_timer_inc = usecs_to_tb(tick_us); + sbe_timer_target = ~0ull; + + rc = xscom_read(sbe_timer_chip, 0xE0006, &sbe_last_gen); + if (rc) { + prerror("SLW: Error %lld reading tmr gen count\n", rc); + return; + } + sbe_last_gen_stamp = mftb(); + + prlog(PR_INFO, "SLW: Timer facility on chip %d, resolution %dus\n", + sbe_timer_chip, tick_us); + sbe_has_timer = true; +} diff --git a/roms/skiboot/hw/sbe-p9.c b/roms/skiboot/hw/sbe-p9.c new file mode 100644 index 000000000..898a1fb56 --- /dev/null +++ b/roms/skiboot/hw/sbe-p9.c @@ -0,0 +1,1040 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* + * + * P9 OPAL - SBE communication driver + * + * SBE firmware at https://github.com/open-power/sbe + * + * P9 chip has Self Boot Engine (SBE). OPAL uses SBE for various purpose like + * timer, scom, MPIPL, etc,. Every chip has SBE. OPAL can communicate to SBE + * on all chips. Based on message type it selects appropriate SBE (ex: schedule + * timer on any chip). + * + * OPAL communicates to SBE via a set of data and control registers provided by + * the PSU block in P9 chip. + * - Four 8 byte registers for Host to send command packets to SBE. + * - Four 8 byte registers for SBE to send response packets to Host. + * - Two doorbell registers (1 on each side) to alert either party + * when data is placed in above mentioned data registers. Once Host/SBE reads + * incoming data, it should clear doorbell register. Interrupt is disabled + * as soon as doorbell register is cleared. + * + * OPAL - SBE message format: + * - OPAL communicates to SBE via set of well defined commands. + * - Reg0 contains message header (command class, subclass, flags etc). + * - Reg1-3 contains actual data. If data is big then it uses indirect method + * (data is passed via memory and memory address/size is passed in Reg1-3). + * - Every message has defined timeout. SBE must respond within specified + * time. Otherwise OPAL discards message and sends error message to caller. 
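+ *
+ * For reference, this driver packs Reg0 as follows (see p9_sbe_fillmsg(),
+ * p9_sbe_queue_msg() and the SBE_STATUS_PRI_SHIFT definition below):
+ *   bits 15:0  - command (class / subclass)
+ *   bits 31:16 - sequence number, filled in when the message is queued
+ *   bits 47:32 - control flags (ack required, response required)
+ *   bits 63:48 - primary status code in the SBE's response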
+ * + * Constraints: + * - Only one command is accepted in the command buffer until the response for + * the command is enqueued in the response buffer by SBE. + * + * Copyright 2017-2019 IBM Corp. + */ + +#define pr_fmt(fmt) "SBE: " fmt + +#include <chip.h> +#include <errorlog.h> +#include <lock.h> +#include <opal.h> +#include <opal-dump.h> +#include <sbe-p9.h> +#include <skiboot.h> +#include <timebase.h> +#include <timer.h> +#include <trace.h> +#include <xscom.h> + +enum p9_sbe_mbox_state { + sbe_mbox_idle = 0, /* Ready to send message */ + sbe_mbox_send, /* Message sent, waiting for ack/response */ + sbe_mbox_rr, /* SBE in R/R */ +}; + +struct p9_sbe { + /* Chip ID to send message */ + u32 chip_id; + + /* List to hold SBE queue messages */ + struct list_head msg_list; + + struct lock lock; + + enum p9_sbe_mbox_state state; + + /* SBE MBOX message sequence number */ + u16 cur_seq; +}; + +/* Default SBE chip ID */ +static int sbe_default_chip_id = -1; + +/* Is SBE timer running? */ +static bool sbe_has_timer = false; +static bool sbe_timer_in_progress = false; +static bool has_new_target = false; + +/* Inflight and next timer in TB */ +static uint64_t sbe_last_gen_stamp; +static uint64_t sbe_timer_target; + +/* Timer lock */ +static struct lock sbe_timer_lock; + +/* + * Minimum timeout value for P9 is 500 microseconds. After that + * SBE timer can handle granularity of 1 microsecond. + */ +#define SBE_TIMER_DEFAULT_US 500 +static uint64_t sbe_timer_def_tb; + +/* + * Rate limit continuous timer update. + * We can update inflight timer if new timer request is lesser than inflight + * one. Limit such updates so that SBE gets time to handle FIFO side requests. + */ +#define SBE_TIMER_UPDATE_MAX 2 +static uint32_t timer_update_cnt = 0; + +/* Timer control message */ +static struct p9_sbe_msg *timer_ctrl_msg; + +#define SBE_STATUS_PRI_SHIFT 0x30 +#define SBE_STATUS_SEC_SHIFT 0x20 + +/* Forward declaration */ +static void p9_sbe_timeout_poll_one(struct p9_sbe *sbe); +static void p9_sbe_timer_schedule(void); + +/* bit 0-15 : Primary status code */ +static inline u16 p9_sbe_get_primary_rc(struct p9_sbe_msg *resp) +{ + return (resp->reg[0] >> SBE_STATUS_PRI_SHIFT); +} + +static inline void p9_sbe_set_primary_rc(struct p9_sbe_msg *resp, u64 rc) +{ + resp->reg[0] |= (rc << SBE_STATUS_PRI_SHIFT); +} + +static u64 p9_sbe_rreg(u32 chip_id, u64 reg) +{ + u64 data = 0; + int rc; + + rc = xscom_read(chip_id, reg, &data); + if (rc != OPAL_SUCCESS) { + prlog(PR_DEBUG, "XSCOM error %d reading reg 0x%llx\n", rc, reg); + return 0xffffffff; + } + + return data; +} + +static void p9_sbe_reg_dump(u32 chip_id) +{ +#define SBE_DUMP_REG_ONE(chip_id, x) \ + prlog(PR_DEBUG, " %20s: %016llx\n", #x, p9_sbe_rreg(chip_id, x)) + + prlog(PR_DEBUG, "MBOX register dump for chip : %x\n", chip_id); + SBE_DUMP_REG_ONE(chip_id, PSU_SBE_DOORBELL_REG_RW); + SBE_DUMP_REG_ONE(chip_id, PSU_HOST_SBE_MBOX_REG0); + SBE_DUMP_REG_ONE(chip_id, PSU_HOST_SBE_MBOX_REG1); + SBE_DUMP_REG_ONE(chip_id, PSU_HOST_SBE_MBOX_REG2); + SBE_DUMP_REG_ONE(chip_id, PSU_HOST_SBE_MBOX_REG3); + SBE_DUMP_REG_ONE(chip_id, PSU_HOST_DOORBELL_REG_RW); + SBE_DUMP_REG_ONE(chip_id, PSU_HOST_SBE_MBOX_REG4); + SBE_DUMP_REG_ONE(chip_id, PSU_HOST_SBE_MBOX_REG5); + SBE_DUMP_REG_ONE(chip_id, PSU_HOST_SBE_MBOX_REG6); + SBE_DUMP_REG_ONE(chip_id, PSU_HOST_SBE_MBOX_REG7); +} + +void p9_sbe_freemsg(struct p9_sbe_msg *msg) +{ + if (msg && msg->resp) + free(msg->resp); + free(msg); +} + +static void p9_sbe_fillmsg(struct p9_sbe_msg *msg, u16 cmd, + u16 ctrl_flag, u64 reg1, u64 reg2, 
u64 reg3) +{ + bool response = !!(ctrl_flag & SBE_CMD_CTRL_RESP_REQ); + u16 flag; + + /* + * Always set ack required flag. SBE will interrupt OPAL once it read + * message from mailbox register. If OPAL is expecting response, then + * it will update message timeout, otherwise it will send next message. + */ + flag = ctrl_flag | SBE_CMD_CTRL_ACK_REQ; + + /* Seqence ID is filled by p9_sbe_queue_msg() */ + msg->reg[0] = ((u64)flag << 32) | cmd; + msg->reg[1] = reg1; + msg->reg[2] = reg2; + msg->reg[3] = reg3; + msg->state = sbe_msg_unused; + msg->response = response; +} + +static struct p9_sbe_msg *p9_sbe_allocmsg(bool alloc_resp) +{ + struct p9_sbe_msg *msg; + + msg = zalloc(sizeof(struct p9_sbe_msg)); + if (!msg) { + prlog(PR_ERR, "Failed to allocate SBE message\n"); + return NULL; + } + if (alloc_resp) { + msg->resp = zalloc(sizeof(struct p9_sbe_msg)); + if (!msg->resp) { + prlog(PR_ERR, "Failed to allocate SBE resp message\n"); + free(msg); + return NULL; + } + } + + return msg; +} + +/* + * Handles "command with direct data" format only. + * + * Note: All mbox messages of our interest uses direct data format. If we need + * indirect data format then we may have to enhance this function. + */ +struct p9_sbe_msg *p9_sbe_mkmsg(u16 cmd, u16 ctrl_flag, + u64 reg1, u64 reg2, u64 reg3) +{ + struct p9_sbe_msg *msg; + + msg = p9_sbe_allocmsg(!!(ctrl_flag & SBE_CMD_CTRL_RESP_REQ)); + if (!msg) + return NULL; + + p9_sbe_fillmsg(msg, cmd, ctrl_flag, reg1, reg2, reg3); + return msg; +} + +static inline bool p9_sbe_mbox_busy(struct p9_sbe *sbe) +{ + return (sbe->state != sbe_mbox_idle); +} + +static inline bool p9_sbe_msg_busy(struct p9_sbe_msg *msg) +{ + switch (msg->state) { + case sbe_msg_queued: + /* fall through */ + case sbe_msg_sent: + case sbe_msg_wresp: + return true; + default: /* + sbe_msg_unused, sbe_msg_done, + sbe_msg_timeout, sbe_msg_error */ + break; + } + return false; +} + +static inline struct p9_sbe *p9_sbe_get_sbe(u32 chip_id) +{ + struct proc_chip *chip; + + /* Default to SBE on master chip */ + if (chip_id == -1) { + if (sbe_default_chip_id == -1) + return NULL; + + chip = get_chip(sbe_default_chip_id); + } else { + chip = get_chip(chip_id); + } + if (chip == NULL || chip->sbe == NULL) + return NULL; + + return chip->sbe; +} + +static int p9_sbe_msg_send(struct p9_sbe *sbe, struct p9_sbe_msg *msg) +{ + int rc, i; + u64 addr, *data; + + addr = PSU_HOST_SBE_MBOX_REG0; + data = &msg->reg[0]; + + for (i = 0; i < NR_HOST_SBE_MBOX_REG; i++) { + rc = xscom_write(sbe->chip_id, addr, *data); + if (rc) + return rc; + + addr++; + data++; + } + + rc = xscom_write(sbe->chip_id, PSU_SBE_DOORBELL_REG_OR, + HOST_SBE_MSG_WAITING); + if (rc != OPAL_SUCCESS) + return rc; + + prlog(PR_TRACE, "Message queued [chip id = 0x%x]:\n", sbe->chip_id); + for (i = 0; i < 4; i++) + prlog(PR_TRACE, " Reg%d : %016llx\n", i, msg->reg[i]); + + msg->timeout = mftb() + msecs_to_tb(SBE_CMD_TIMEOUT_MAX); + sbe->state = sbe_mbox_send; + msg->state = sbe_msg_sent; + return rc; +} + +static int p9_sbe_msg_receive(u32 chip_id, struct p9_sbe_msg *resp) +{ + int i; + int rc = OPAL_SUCCESS; + u64 addr, *data; + + addr = PSU_HOST_SBE_MBOX_REG4; + data = &resp->reg[0]; + + for (i = 0; i < NR_HOST_SBE_MBOX_REG; i++) { + rc = xscom_read(chip_id, addr, data); + if (rc) + return rc; + + addr++; + data++; + } + return rc; +} + +/* WARNING: This will drop sbe->lock */ +static void p9_sbe_msg_complete(struct p9_sbe *sbe, struct p9_sbe_msg *msg, + enum p9_sbe_msg_state msg_state) +{ + void (*comp)(struct p9_sbe_msg *msg); + + 
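/*
+	 * Note: the completion callback is invoked further down with
+	 * sbe->lock dropped. A completer may queue a follow-up message
+	 * (the timer response handler does) and p9_sbe_queue_msg() takes
+	 * sbe->lock itself, so calling back with the lock held would
+	 * deadlock.
+	 */
+	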
prlog(PR_TRACE, "Completing msg [chip id = %x], reg0 : 0x%llx\n", + sbe->chip_id, msg->reg[0]); + + comp = msg->complete; + list_del(&msg->link); + sync(); + msg->state = msg_state; + + if (comp) { + unlock(&sbe->lock); + comp(msg); + lock(&sbe->lock); + } +} + +/* WARNING: This will drop sbe->lock */ +static void p9_sbe_send_complete(struct p9_sbe *sbe) +{ + struct p9_sbe_msg *msg; + + if (list_empty(&sbe->msg_list)) + return; + + msg = list_top(&sbe->msg_list, struct p9_sbe_msg, link); + /* Need response */ + if (msg->response) { + msg->state = sbe_msg_wresp; + } else { + sbe->state = sbe_mbox_idle; + p9_sbe_msg_complete(sbe, msg, sbe_msg_done); + } +} + +/* WARNING: This will drop sbe->lock */ +static void p9_sbe_process_queue(struct p9_sbe *sbe) +{ + int rc, retry_cnt = 0; + struct p9_sbe_msg *msg = NULL; + + if (p9_sbe_mbox_busy(sbe)) + return; + + while (!list_empty(&sbe->msg_list)) { + msg = list_top(&sbe->msg_list, struct p9_sbe_msg, link); + /* Send message */ + rc = p9_sbe_msg_send(sbe, msg); + if (rc == OPAL_SUCCESS) + return; + + prlog(PR_ERR, "Failed to send message to SBE [chip id = %x]\n", + sbe->chip_id); + if (msg->resp) { + p9_sbe_set_primary_rc(msg->resp, + SBE_STATUS_PRI_GENERIC_ERR); + } + p9_sbe_msg_complete(sbe, msg, sbe_msg_error); + + /* + * Repeatedly failed to send message to SBE. Lets stop + * sending message. + */ + if (retry_cnt++ >= 3) { + prlog(PR_ERR, "Temporarily stopped sending " + "message to SBE\n"); + return; + } + } +} + +/* + * WARNING: + * Only one command is accepted in the command buffer until response + * to the command is enqueued in the response buffer by SBE. + * + * Head of msg_list contains in-flight message. Hence we should always + * add new message to tail of the list. + */ +int p9_sbe_queue_msg(u32 chip_id, struct p9_sbe_msg *msg, + void (*comp)(struct p9_sbe_msg *msg)) +{ + struct p9_sbe *sbe; + + if (!msg) + return OPAL_PARAMETER; + + sbe = p9_sbe_get_sbe(chip_id); + if (!sbe) + return OPAL_HARDWARE; + + lock(&sbe->lock); + /* Set completion and update sequence number */ + msg->complete = comp; + msg->state = sbe_msg_queued; + msg->reg[0] = msg->reg[0] | ((u64)sbe->cur_seq << 16); + sbe->cur_seq++; + + /* Reset sequence number */ + if (sbe->cur_seq == 0xffff) + sbe->cur_seq = 1; + + /* Add message to queue */ + list_add_tail(&sbe->msg_list, &msg->link); + p9_sbe_process_queue(sbe); + unlock(&sbe->lock); + + return OPAL_SUCCESS; +} + +int p9_sbe_sync_msg(u32 chip_id, struct p9_sbe_msg *msg, bool autofree) +{ + int rc; + struct p9_sbe *sbe; + + rc = p9_sbe_queue_msg(chip_id, msg, NULL); + if (rc) + goto free_msg; + + sbe = p9_sbe_get_sbe(chip_id); + if (!sbe) { + rc = OPAL_HARDWARE; + goto free_msg; + } + + while (p9_sbe_msg_busy(msg)) { + cpu_relax(); + p9_sbe_timeout_poll_one(sbe); + } + + if (msg->state == sbe_msg_done) + rc = SBE_STATUS_PRI_SUCCESS; + else + rc = SBE_STATUS_PRI_GENERIC_ERR; + + if (msg->response && msg->resp) + rc = p9_sbe_get_primary_rc(msg->resp); + +free_msg: + if (autofree) + p9_sbe_freemsg(msg); + + return rc; +} + +/* Remove SBE message from queue. 
It will not remove inflight message */ +int p9_sbe_cancelmsg(u32 chip_id, struct p9_sbe_msg *msg) +{ + struct p9_sbe *sbe; + + sbe = p9_sbe_get_sbe(chip_id); + if (!sbe) + return OPAL_PARAMETER; + + lock(&sbe->lock); + if (msg->state != sbe_msg_queued) { + unlock(&sbe->lock); + return OPAL_BUSY; + } + + list_del(&msg->link); + msg->state = sbe_msg_done; + unlock(&sbe->lock); + return OPAL_SUCCESS; +} + +static void p9_sbe_handle_response(u32 chip_id, struct p9_sbe_msg *msg) +{ + u16 send_seq, resp_seq; + int rc; + + if (msg == NULL || msg->resp == NULL) + return; + + memset(msg->resp, 0, sizeof(struct p9_sbe_msg)); + + rc = p9_sbe_msg_receive(chip_id, msg->resp); + if (rc != OPAL_SUCCESS) { + prlog(PR_ERR, "Failed to read response message " + "[chip id = %x]\n", chip_id); + p9_sbe_set_primary_rc(msg->resp, SBE_STATUS_PRI_GENERIC_ERR); + return; + } + + /* Validate sequence number */ + send_seq = (msg->reg[0] >> 16) & 0xffff; + resp_seq = (msg->resp->reg[0] >> 16) & 0xffff; + if (send_seq != resp_seq) { + /* + * XXX Handle SBE R/R. + * Lets send sequence error to caller until SBE reset works. + */ + prlog(PR_ERR, "Invalid sequence id [chip id = %x]\n", chip_id); + p9_sbe_set_primary_rc(msg->resp, SBE_STATUS_PRI_SEQ_ERR); + return; + } +} + +static int p9_sbe_clear_interrupt(struct p9_sbe *sbe, u64 bits) +{ + int rc; + u64 val; + + /* Clear doorbell register */ + val = SBE_HOST_RESPONSE_MASK & ~bits; + rc = xscom_write(sbe->chip_id, PSU_HOST_DOORBELL_REG_AND, val); + if (rc) { + prlog(PR_ERR, "Failed to clear SBE to Host doorbell " + "interrupt [chip id = %x]\n", sbe->chip_id); + } + return rc; +} + +/* WARNING: This will drop sbe->lock */ +static void p9_sbe_timer_response(struct p9_sbe *sbe) +{ + if (sbe->chip_id != sbe_default_chip_id) + return; + + sbe_timer_in_progress = false; + /* Drop lock and call timers */ + unlock(&sbe->lock); + + lock(&sbe_timer_lock); + /* + * Once we get timer expiry interrupt (even if its suprious interrupt) + * we can schedule next timer request. 
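+	 * Resetting timer_update_cnt also lifts the SBE_TIMER_UPDATE_MAX
+	 * rate limit that p9_sbe_timer_schedule() applies to updates of an
+	 * in-flight timer.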
+ */ + timer_update_cnt = 0; + unlock(&sbe_timer_lock); + + check_timers(true); + lock(&sbe->lock); +} + +/* WARNING: This will drop sbe->lock */ +static void __p9_sbe_interrupt(struct p9_sbe *sbe) +{ + bool has_response; + int rc; + u64 data = 0, val; + struct p9_sbe_msg *msg = NULL; + +again: + /* Read doorbell register */ + rc = xscom_read(sbe->chip_id, PSU_HOST_DOORBELL_REG_RW, &data); + if (rc) { + prlog(PR_ERR, "Failed to read SBE to Host doorbell register " + "[chip id = %x]\n", sbe->chip_id); + p9_sbe_reg_dump(sbe->chip_id); + return; + } + + /* Completed processing all the bits */ + if (!data) + return; + + /* SBE came back from reset */ + if (data & SBE_HOST_RESET) { + /* Clear all bits and restart sending message */ + rc = p9_sbe_clear_interrupt(sbe, data); + if (rc) + return; + + prlog(PR_NOTICE, + "Back from reset [chip id = %x]\n", sbe->chip_id); + /* Reset SBE MBOX state */ + sbe->state = sbe_mbox_idle; + + /* Reset message state */ + if (!list_empty(&sbe->msg_list)) { + msg = list_top(&sbe->msg_list, struct p9_sbe_msg, link); + msg->state = sbe_msg_queued; + } + return; + } + + /* Process ACK message before response */ + if (data & SBE_HOST_MSG_READ) { + rc = p9_sbe_clear_interrupt(sbe, SBE_HOST_MSG_READ); + if (rc) + return; + p9_sbe_send_complete(sbe); + goto again; + } + + /* Read SBE response before clearing doorbell register */ + if (data & SBE_HOST_RESPONSE_WAITING) { + if (!list_empty(&sbe->msg_list)) { + msg = list_top(&sbe->msg_list, struct p9_sbe_msg, link); + p9_sbe_handle_response(sbe->chip_id, msg); + has_response = true; + } else { + has_response = false; + prlog(PR_DEBUG, + "Got response with no pending message\n"); + } + + rc = p9_sbe_clear_interrupt(sbe, SBE_HOST_RESPONSE_WAITING); + if (rc) + return; + + /* Reset SBE MBOX state */ + sbe->state = sbe_mbox_idle; + if (has_response) + p9_sbe_msg_complete(sbe, msg, sbe_msg_done); + + goto again; + } + + /* SBE passthrough command, call prd handler */ + if (data & SBE_HOST_PASSTHROUGH) { + rc = p9_sbe_clear_interrupt(sbe, SBE_HOST_PASSTHROUGH); + if (rc) + return; + prd_sbe_passthrough(sbe->chip_id); + goto again; + } + + /* Timer expired */ + if (data & SBE_HOST_TIMER_EXPIRY) { + rc = p9_sbe_clear_interrupt(sbe, SBE_HOST_TIMER_EXPIRY); + if (rc) + return; + p9_sbe_timer_response(sbe); + goto again; + } + + /* Unhandled bits */ + val = data & ~(SBE_HOST_RESPONSE_MASK); + if (val) { + prlog(PR_ERR, "Unhandled interrupt bit [chip id = %x] : " + " %016llx\n", sbe->chip_id, val); + rc = p9_sbe_clear_interrupt(sbe, data); + if (rc) + return; + goto again; + } +} + +void p9_sbe_interrupt(uint32_t chip_id) +{ + struct proc_chip *chip; + struct p9_sbe *sbe; + + chip = get_chip(chip_id); + if (chip == NULL || chip->sbe == NULL) + return; + + sbe = chip->sbe; + lock(&sbe->lock); + __p9_sbe_interrupt(sbe); + p9_sbe_process_queue(sbe); + unlock(&sbe->lock); +} + +/* + * Check if the timer is working. If at least 10ms elapsed since + * last scheduled timer expiry. 
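+ * without the expiry interrupt arriving, assume the SBE timer is stuck,
+ * dump the MBOX registers and fall back to the OPAL pollers.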
+ */ +static void p9_sbe_timer_poll(struct p9_sbe *sbe) +{ + if (sbe->chip_id != sbe_default_chip_id) + return; + + if (!sbe_has_timer || !sbe_timer_in_progress) + return; + + if (tb_compare(mftb(), sbe_last_gen_stamp + msecs_to_tb(10)) + != TB_AAFTERB) + return; + + prlog(PR_ERR, "Timer stuck, falling back to OPAL pollers.\n"); + prlog(PR_ERR, "You will likely have slower I2C and may have " + "experienced increased jitter.\n"); + p9_sbe_reg_dump(sbe->chip_id); + sbe_has_timer = false; + sbe_timer_in_progress = false; +} + +static void p9_sbe_timeout_poll_one(struct p9_sbe *sbe) +{ + struct p9_sbe_msg *msg; + + if (sbe->chip_id == sbe_default_chip_id) { + if (list_empty_nocheck(&sbe->msg_list) && + !sbe_timer_in_progress) + return; + } else { + if (list_empty_nocheck(&sbe->msg_list)) + return; + } + + lock(&sbe->lock); + + /* + * In some cases there will be a delay in calling OPAL interrupt + * handler routine (opal_handle_interrupt). In such cases its + * possible that SBE has responded, but OPAL didn't act on that. + * Hence check for SBE response. + */ + __p9_sbe_interrupt(sbe); + p9_sbe_timer_poll(sbe); + + if (list_empty(&sbe->msg_list)) + goto out; + + /* + * For some reason OPAL didn't sent message to SBE. + * Lets try to send message again. + */ + if (!p9_sbe_mbox_busy(sbe)) { + p9_sbe_process_queue(sbe); + goto out; + } + + msg = list_top(&sbe->msg_list, struct p9_sbe_msg, link); + if (tb_compare(mftb(), msg->timeout) != TB_AAFTERB) + goto out; + + /* Message timeout */ + prlog(PR_ERR, "Message timeout [chip id = %x], cmd = %llx, " + "subcmd = %llx\n", sbe->chip_id, + (msg->reg[0] >> 8) & 0xff, msg->reg[0] & 0xff); + p9_sbe_reg_dump(sbe->chip_id); + if (msg->resp) { + p9_sbe_set_primary_rc(msg->resp, + SBE_STATUS_PRI_GENERIC_ERR); + } + + /* XXX Handle SBE R/R. Reset SBE state until SBE R/R works. 
*/ + sbe->state = sbe_mbox_idle; + p9_sbe_msg_complete(sbe, msg, sbe_msg_timeout); + p9_sbe_process_queue(sbe); + +out: + unlock(&sbe->lock); +} + +static void p9_sbe_timeout_poll(void *user_data __unused) +{ + struct p9_sbe *sbe; + struct proc_chip *chip; + + for_each_chip(chip) { + if (chip->sbe == NULL) + continue; + sbe = chip->sbe; + p9_sbe_timeout_poll_one(sbe); + } +} + +static void p9_sbe_timer_resp(struct p9_sbe_msg *msg) +{ + if (msg->state != sbe_msg_done) { + prlog(PR_DEBUG, "Failed to schedule timer [chip id %x]\n", + sbe_default_chip_id); + } else { + /* Update last scheduled timer value */ + sbe_last_gen_stamp = mftb() + + usecs_to_tb(timer_ctrl_msg->reg[1]); + sbe_timer_in_progress = true; + } + + if (!has_new_target) + return; + + lock(&sbe_timer_lock); + if (has_new_target) { + if (!p9_sbe_msg_busy(timer_ctrl_msg)) { + has_new_target = false; + p9_sbe_timer_schedule(); + } + } + unlock(&sbe_timer_lock); +} + +static void p9_sbe_timer_schedule(void) +{ + int rc; + u32 tick_us = SBE_TIMER_DEFAULT_US; + u64 tb_cnt, now = mftb(); + + if (sbe_timer_in_progress) { + if (sbe_timer_target >= sbe_last_gen_stamp) + return; + + if (now >= sbe_last_gen_stamp) + return; + + /* Remaining time of inflight timer <= sbe_timer_def_tb */ + if ((sbe_last_gen_stamp - now) <= sbe_timer_def_tb) + return; + } + + /* Stop sending timer update chipop until inflight timer expires */ + if (timer_update_cnt > SBE_TIMER_UPDATE_MAX) + return; + timer_update_cnt++; + + if (now < sbe_timer_target) { + /* Calculate how many microseconds from now, rounded up */ + if ((sbe_timer_target - now) > sbe_timer_def_tb) { + tb_cnt = sbe_timer_target - now + usecs_to_tb(1) - 1; + tick_us = tb_to_usecs(tb_cnt); + } + } + + /* Clear sequence number. p9_sbe_queue_msg will add new sequene ID */ + timer_ctrl_msg->reg[0] &= ~(PPC_BITMASK(32, 47)); + /* Update timeout value */ + timer_ctrl_msg->reg[1] = tick_us; + rc = p9_sbe_queue_msg(sbe_default_chip_id, timer_ctrl_msg, + p9_sbe_timer_resp); + if (rc != OPAL_SUCCESS) { + prlog(PR_ERR, "Failed to start timer [chip id = %x]\n", + sbe_default_chip_id); + return; + } +} + +/* + * This is called with the timer lock held, so there is no + * issue with re-entrancy or concurrence + */ +void p9_sbe_update_timer_expiry(uint64_t new_target) +{ + if (!sbe_has_timer || new_target == sbe_timer_target) + return; + + lock(&sbe_timer_lock); + /* Timer message is in flight. 
Record new timer and schedule later */ + if (p9_sbe_msg_busy(timer_ctrl_msg) || has_new_target) { + if (new_target < sbe_timer_target) { + sbe_timer_target = new_target; + has_new_target = true; + } + } else { + sbe_timer_target = new_target; + p9_sbe_timer_schedule(); + } + unlock(&sbe_timer_lock); +} + +/* Initialize SBE timer */ +static void p9_sbe_timer_init(void) +{ + timer_ctrl_msg = p9_sbe_mkmsg(SBE_CMD_CONTROL_TIMER, + CONTROL_TIMER_START, 0, 0, 0); + assert(timer_ctrl_msg); + init_lock(&sbe_timer_lock); + sbe_has_timer = true; + sbe_timer_target = mftb(); + sbe_last_gen_stamp = ~0ull; + sbe_timer_def_tb = usecs_to_tb(SBE_TIMER_DEFAULT_US); + prlog(PR_INFO, "Timer facility on chip %x\n", sbe_default_chip_id); +} + +bool p9_sbe_timer_ok(void) +{ + return sbe_has_timer; +} + +static void p9_sbe_stash_chipop_resp(struct p9_sbe_msg *msg) +{ + int rc = p9_sbe_get_primary_rc(msg->resp); + struct p9_sbe *sbe = (void *)msg->user_data; + + if (rc == SBE_STATUS_PRI_SUCCESS) { + prlog(PR_DEBUG, "Sent stash MPIPL config [chip id =0x%x]\n", + sbe->chip_id); + } else { + prlog(PR_ERR, "Failed to send stash MPIPL config " + "[chip id = 0x%x, rc = %d]\n", sbe->chip_id, rc); + } + + p9_sbe_freemsg(msg); +} + +static void p9_sbe_send_relocated_base_single(struct p9_sbe *sbe, u64 reloc_base) +{ + u8 key = SBE_STASH_KEY_SKIBOOT_BASE; + u16 cmd = SBE_CMD_STASH_MPIPL_CONFIG; + u16 flag = SBE_CMD_CTRL_RESP_REQ; + struct p9_sbe_msg *msg; + + msg = p9_sbe_mkmsg(cmd, flag, key, reloc_base, 0); + if (!msg) { + prlog(PR_ERR, "Message allocation failed\n"); + return; + } + + msg->user_data = (void *)sbe; + if (p9_sbe_queue_msg(sbe->chip_id, msg, p9_sbe_stash_chipop_resp)) { + prlog(PR_ERR, "Failed to queue stash MPIPL config message\n"); + } +} + +/* Send relocated skiboot base address to all SBE */ +void p9_sbe_send_relocated_base(uint64_t reloc_base) +{ + struct proc_chip *chip; + + for_each_chip(chip) { + if (chip->sbe == NULL) + continue; + + p9_sbe_send_relocated_base_single(chip->sbe, reloc_base); + } +} + +void p9_sbe_init(void) +{ + struct dt_node *xn; + struct proc_chip *chip; + struct p9_sbe *sbe; + + if (proc_gen < proc_gen_p9) + return; + + dt_for_each_compatible(dt_root, xn, "ibm,xscom") { + sbe = zalloc(sizeof(struct p9_sbe)); + assert(sbe); + sbe->chip_id = dt_get_chip_id(xn); + sbe->cur_seq = 1; + sbe->state = sbe_mbox_idle; + list_head_init(&sbe->msg_list); + init_lock(&sbe->lock); + + chip = get_chip(sbe->chip_id); + assert(chip); + chip->sbe = sbe; + + if (dt_has_node_property(xn, "primary", NULL)) { + sbe_default_chip_id = sbe->chip_id; + prlog(PR_DEBUG, "Master chip id : %x\n", sbe->chip_id); + } + } + + if (sbe_default_chip_id == -1) { + prlog(PR_ERR, "Master chip ID not found.\n"); + return; + } + + /* Initiate SBE timer */ + p9_sbe_timer_init(); + + /* Initiate SBE timeout poller */ + opal_add_poller(p9_sbe_timeout_poll, NULL); +} + +/* Terminate and initiate MPIPL */ +void p9_sbe_terminate(void) +{ + uint32_t primary_chip = -1; + int rc; + u64 wait_tb; + struct proc_chip *chip; + + /* Return if MPIPL is not supported */ + if (!is_mpipl_enabled()) + return; + + /* Save crashing CPU details */ + opal_mpipl_save_crashing_pir(); + + /* Unregister flash. It will request BMC MBOX reset */ + if (!flash_unregister()) { + prlog(PR_DEBUG, "Failed to reset BMC MBOX\n"); + return; + } + + /* + * Send S0 interrupt to all SBE. 
Sequence: + * - S0 interrupt on secondary chip SBE + * - S0 interrupt on Primary chip SBE + */ + for_each_chip(chip) { + if (dt_has_node_property(chip->devnode, "primary", NULL)) { + primary_chip = chip->id; + continue; + } + + rc = xscom_write(chip->id, + SBE_CONTROL_REG_RW, SBE_CONTROL_REG_S0); + /* Initiate normal reboot */ + if (rc) { + prlog(PR_ERR, "Failed to write S0 interrupt [chip id = %x]\n", + chip->id); + return; + } + } + + /* Initiate normal reboot */ + if (primary_chip == -1) { + prlog(PR_ERR, "Primary chip ID not found.\n"); + return; + } + + rc = xscom_write(primary_chip, + SBE_CONTROL_REG_RW, SBE_CONTROL_REG_S0); + if (rc) { + prlog(PR_ERR, "Failed to write S0 interrupt [chip id = %x]\n", + primary_chip); + return; + } + + /* XXX We expect SBE to act on interrupt, quiesce the system and start + * MPIPL flow. Currently we do not have a way to detect SBE state. + * Hence wait for max time SBE takes to respond and then trigger + * normal reboot. + */ + prlog(PR_NOTICE, "Initiated MPIPL, waiting for SBE to respond...\n"); + wait_tb = mftb() + msecs_to_tb(SBE_CMD_TIMEOUT_MAX); + while (mftb() < wait_tb) { + cpu_relax(); + } + + prlog(PR_ERR, "SBE did not respond within timeout period (%d secs).\n", + SBE_CMD_TIMEOUT_MAX / 1000); + prlog(PR_ERR, "Falling back to normal reboot\n"); +} diff --git a/roms/skiboot/hw/sfc-ctrl.c b/roms/skiboot/hw/sfc-ctrl.c new file mode 100644 index 000000000..34b5b8e20 --- /dev/null +++ b/roms/skiboot/hw/sfc-ctrl.c @@ -0,0 +1,510 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* Copyright 2013-2014 IBM Corp. */ + +#include <stdint.h> +#include <stdbool.h> +#include <stdlib.h> +#include <errno.h> +#include <stdio.h> +#include <string.h> +#include <time.h> +#include <lpc.h> +#include <sfc-ctrl.h> + +#include <libflash/libflash.h> +#include <libflash/libflash-priv.h> + +/* Offset of SFC registers in FW space */ +#define SFC_CMDREG_OFFSET 0x00000c00 +/* Offset of SFC command buffer in FW space */ +#define SFC_CMDBUF_OFFSET 0x00000d00 +/* Offset of flash MMIO mapping in FW space */ +#define SFC_MMIO_OFFSET 0x0c000000 + + +/* + * Register definitions + */ +#define SFC_REG_CONF 0x10 /* CONF: Direct Access Configuration */ +#define SFC_REG_CONF_FRZE (1 << 3) +#define SFC_REG_CONF_ECCEN (1 << 2) +#define SFC_REG_CONF_DRCD (1 << 1) +#define SFC_REG_CONF_FLRLD (1 << 0) + +#define SFC_REG_STATUS 0x0C /* STATUS : Status Reg */ +#define SFC_REG_STATUS_NX_ON_SHFT 28 +#define SFC_REG_STATUS_RWP (1 << 27) +#define SFC_REG_STATUS_FOURBYTEAD (1 << 26) +#define SFC_REG_STATUS_ILLEGAL (1 << 4) +#define SFC_REG_STATUS_ECCERRCNTN (1 << 3) +#define SFC_REG_STATUS_ECCUEN (1 << 2) +#define SFC_REG_STATUS_DONE (1 << 0) + +#define SFC_REG_CMD 0x40 /* CMD : Command */ +#define SFC_REG_CMD_OPCODE_SHFT 9 +#define SFC_REG_CMD_LENGTH_SHFT 0 + +#define SFC_REG_SPICLK 0x3C /* SPICLK: SPI clock rate config */ +#define SFC_REG_SPICLK_OUTDLY_SHFT 24 +#define SFC_REG_SPICLK_INSAMPDLY_SHFT 16 +#define SFC_REG_SPICLK_CLKHI_SHFT 8 +#define SFC_REG_SPICLK_CLKLO_SHFT 0 + +#define SFC_REG_ADR 0x44 /* ADR : Address */ +#define SFC_REG_ERASMS 0x48 /* ERASMS : Small Erase Block Size */ +#define SFC_REG_ERASLGS 0x4C /* ERALGS : Large Erase Block Size */ +#define SFC_REG_CONF4 0x54 /* CONF4 : SPI Op Code for Small Erase */ +#define SFC_REG_CONF5 0x58 /* CONF5 : Small Erase Size config reg */ + +#define SFC_REG_CONF8 0x64 /* CONF8 : Read Command */ +#define SFC_REG_CONF8_CSINACTIVERD_SHFT 18 +#define SFC_REG_CONF8_DUMMY_SHFT 8 +#define SFC_REG_CONF8_READOP_SHFT 0 + +#define 
SFC_REG_ADRCBF 0x80 /* ADRCBF : First Intf NOR Addr Offset */ +#define SFC_REG_ADRCMF 0x84 /* ADRCMF : First Intf NOR Allocation */ +#define SFC_REG_ADRCBS 0x88 /* ADRCBS : Second Intf NOR Addr Offset */ +#define SFC_REG_ADRCMS 0x8C /* ADRCMS : Second Intf NOR Allocation */ +#define SFC_REG_OADRNB 0x90 /* OADRNB : Direct Access OBP Window Base Address */ +#define SFC_REG_OADRNS 0x94 /* OADRNS : DIrect Access OPB Window Size */ + +#define SFC_REG_CHIPIDCONF 0x9C /* CHIPIDCONF : config ChipId CMD */ +#define SFC_REG_CHIPIDCONF_OPCODE_SHFT 24 +#define SFC_REG_CHIPIDCONF_READ (1 << 23) +#define SFC_REG_CHIPIDCONF_WRITE (1 << 22) +#define SFC_REG_CHIPIDCONF_USE_ADDR (1 << 21) +#define SFC_REG_CHIPIDCONF_DUMMY_SHFT 16 +#define SFC_REG_CHIPIDCONF_LEN_SHFT 0 + +/* + * SFC Opcodes + */ +#define SFC_OP_READRAW 0x03 /* Read Raw */ +#define SFC_OP_WRITERAW 0x02 /* Write Raw */ +#define SFC_OP_ERASM 0x32 /* Erase Small */ +#define SFC_OP_ERALG 0x34 /* Erase Large */ +#define SFC_OP_ENWRITPROT 0x53 /* Enable WRite Protect */ +#define SFC_OP_CHIPID 0x1F /* Get Chip ID */ +#define SFC_OP_STATUS 0x05 /* Get Status */ +#define SFC_OP_TURNOFF 0x5E /* Turn Off */ +#define SFC_OP_TURNON 0x50 /* Turn On */ +#define SFC_OP_ABORT 0x6F /* Super-Abort */ +#define SFC_OP_START4BA 0x37 /* Start 4BA */ +#define SFC_OP_END4BA 0x69 /* End 4BA */ + +/* Command buffer size */ +#define SFC_CMDBUF_SIZE 256 + +struct sfc_ctrl { + /* Erase sizes */ + uint32_t small_er_size; + uint32_t large_er_size; + + /* Current 4b mode */ + bool mode_4b; + + /* Callbacks */ + struct spi_flash_ctrl ops; +}; + +/* Command register support */ +static inline int sfc_reg_read(uint8_t reg, uint32_t *val) +{ + int rc; + + *val = 0xffffffff; + rc = lpc_fw_read32(val, SFC_CMDREG_OFFSET + reg); + if (rc) + return rc; + return 0; +} + +static inline int sfc_reg_write(uint8_t reg, uint32_t val) +{ + return lpc_fw_write32(val, SFC_CMDREG_OFFSET + reg); +} + +static int sfc_buf_write(uint32_t len, const void *data) +{ + __be32 tmp; + uint32_t off = 0; + int rc; + + if (len > SFC_CMDBUF_SIZE) + return FLASH_ERR_PARM_ERROR; + + while (len >= 4) { + tmp = cpu_to_be32(*(const uint32_t *)data); + rc = lpc_fw_write32((u32)tmp, SFC_CMDBUF_OFFSET + off); + if (rc) + return rc; + off += 4; + len -= 4; + data += 4; + } + if (!len) + return 0; + + /* lpc_fw_write operates on BE values so that's what we layout + * in memory with memcpy. The swap in the register on LE doesn't + * matter, the result in memory will be in the right order. + */ + tmp = cpu_to_be32(-1); + memcpy(&tmp, data, len); /* XXX: is this right? */ + return lpc_fw_write32((u32)tmp, SFC_CMDBUF_OFFSET + off); +} + +static int sfc_buf_read(uint32_t len, void *data) +{ + uint32_t tmp, off = 0; + int rc; + + if (len > SFC_CMDBUF_SIZE) + return FLASH_ERR_PARM_ERROR; + + while (len >= 4) { + rc = lpc_fw_read32(data, SFC_CMDBUF_OFFSET + off); + if (rc) + return rc; + off += 4; + len -= 4; + data += 4; + } + if (!len) + return 0; + + rc = lpc_fw_read32(&tmp, SFC_CMDBUF_OFFSET + off); + if (rc) + return rc; + /* We know tmp contains a big endian value, so memcpy is + * our friend here + */ + memcpy(data, &tmp, len); + return 0; +} + +/* Polls until SFC indicates command is complete */ +static int sfc_poll_complete(void) +{ + uint32_t status, timeout; + struct timespec ts; + + /* + * A full 256 bytes read/write command will take at least + * 126us. Smaller commands are faster but we use less of + * them. 
So let's sleep in increments of 100us + */ + ts.tv_sec = 0; + ts.tv_nsec = 100000; + + /* + * Use a 1s timeout which should be sufficient for the + * commands we use + */ + timeout = 10000; + + do { + int rc; + + rc = sfc_reg_read(SFC_REG_STATUS, &status); + if (rc) + return rc; + if (status & SFC_REG_STATUS_DONE) + break; + if (--timeout == 0) + return FLASH_ERR_CTRL_TIMEOUT; + nanosleep(&ts, NULL); + } while (true); + + return 0; +} + +static int sfc_exec_command(uint8_t opcode, uint32_t length) +{ + int rc = 0; + uint32_t cmd_reg = 0; + + if (opcode > 0x7f || length > 0x1ff) + return FLASH_ERR_PARM_ERROR; + + /* Write command register to start execution */ + cmd_reg |= (opcode << SFC_REG_CMD_OPCODE_SHFT); + cmd_reg |= (length << SFC_REG_CMD_LENGTH_SHFT); + rc = sfc_reg_write(SFC_REG_CMD, cmd_reg); + if (rc) + return rc; + + /* Wait for command to complete */ + return sfc_poll_complete(); +} + +static int sfc_chip_id(struct spi_flash_ctrl *ctrl, uint8_t *id_buf, + uint32_t *id_size) +{ + uint32_t idconf; + int rc; + + (void)ctrl; + + if ((*id_size) < 3) + return FLASH_ERR_PARM_ERROR; + + /* + * XXX This will not work in locked down mode but we assume that + * in this case, the chip ID command is already properly programmed + * and the SFC will ignore this. However I haven't verified... + */ + idconf = ((uint64_t)CMD_RDID) << SFC_REG_CHIPIDCONF_OPCODE_SHFT; + idconf |= SFC_REG_CHIPIDCONF_READ; + idconf |= (3ul << SFC_REG_CHIPIDCONF_LEN_SHFT); + (void)sfc_reg_write(SFC_REG_CHIPIDCONF, idconf); + + /* Perform command */ + rc = sfc_exec_command(SFC_OP_CHIPID, 0); + if (rc) + return rc; + + /* Read chip ID */ + rc = sfc_buf_read(3, id_buf); + if (rc) + return rc; + *id_size = 3; + + return 0; +} + + +static int sfc_read(struct spi_flash_ctrl *ctrl, uint32_t pos, + void *buf, uint32_t len) +{ + (void)ctrl; + + while(len) { + uint32_t chunk = len; + int rc; + + if (chunk > SFC_CMDBUF_SIZE) + chunk = SFC_CMDBUF_SIZE; + rc = sfc_reg_write(SFC_REG_ADR, pos); + if (rc) + return rc; + rc = sfc_exec_command(SFC_OP_READRAW, chunk); + if (rc) + return rc; + rc = sfc_buf_read(chunk, buf); + if (rc) + return rc; + len -= chunk; + pos += chunk; + buf += chunk; + } + return 0; +} + +static int sfc_write(struct spi_flash_ctrl *ctrl, uint32_t addr, + const void *buf, uint32_t size) +{ + uint32_t chunk; + int rc; + + (void)ctrl; + + while(size) { + /* We shall not cross a page boundary */ + chunk = 0x100 - (addr & 0xff); + if (chunk > size) + chunk = size; + + /* Write to SFC write buffer */ + rc = sfc_buf_write(chunk, buf); + if (rc) + return rc; + + /* Program address */ + rc = sfc_reg_write(SFC_REG_ADR, addr); + if (rc) + return rc; + + /* Send command */ + rc = sfc_exec_command(SFC_OP_WRITERAW, chunk); + if (rc) + return rc; + + addr += chunk; + buf += chunk; + size -= chunk; + } + return 0; +} + +static int sfc_erase(struct spi_flash_ctrl *ctrl, uint32_t addr, + uint32_t size) +{ + struct sfc_ctrl *ct = container_of(ctrl, struct sfc_ctrl, ops); + uint32_t sm_mask = ct->small_er_size - 1; + uint32_t lg_mask = ct->large_er_size - 1; + uint32_t chunk; + uint8_t cmd; + int rc; + + while(size) { + /* Choose erase size for this chunk */ + if (((addr | size) & lg_mask) == 0) { + chunk = ct->large_er_size; + cmd = SFC_OP_ERALG; + } else if (((addr | size) & sm_mask) == 0) { + chunk = ct->small_er_size; + cmd = SFC_OP_ERASM; + } else + return FLASH_ERR_ERASE_BOUNDARY; + + rc = sfc_reg_write(SFC_REG_ADR, addr); + if (rc) + return rc; + rc = sfc_exec_command(cmd, 0); + if (rc) + return rc; + addr += chunk; + size 
-= chunk; + } + return 0; +} + +static int sfc_setup(struct spi_flash_ctrl *ctrl, uint32_t *tsize) +{ + struct sfc_ctrl *ct = container_of(ctrl, struct sfc_ctrl, ops); + struct flash_info *info = ctrl->finfo; + uint32_t er_flags; + + (void)tsize; + + /* Keep non-erase related flags */ + er_flags = ~FL_ERASE_ALL; + + /* Add supported erase sizes */ + if (ct->small_er_size == 0x1000 || ct->large_er_size == 0x1000) + er_flags |= FL_ERASE_4K; + if (ct->small_er_size == 0x8000 || ct->large_er_size == 0x8000) + er_flags |= FL_ERASE_32K; + if (ct->small_er_size == 0x10000 || ct->large_er_size == 0x10000) + er_flags |= FL_ERASE_64K; + + /* Mask the flags out */ + info->flags &= er_flags; + + return 0; +} + +static int sfc_set_4b(struct spi_flash_ctrl *ctrl, bool enable) +{ + struct sfc_ctrl *ct = container_of(ctrl, struct sfc_ctrl, ops); + int rc; + + rc = sfc_exec_command(enable ? SFC_OP_START4BA : SFC_OP_END4BA, 0); + if (rc) + return rc; + ct->mode_4b = enable; + return 0; +} + +static void sfc_validate_er_size(uint32_t *size) +{ + if (*size == 0) + return; + + /* We only support 4k, 32k and 64k */ + if (*size != 0x1000 && *size != 0x8000 && *size != 0x10000) { + FL_ERR("SFC: Erase size %d bytes unsupported\n", *size); + *size = 0; + } +} + +static int sfc_init(struct sfc_ctrl *ct) +{ + int rc; + uint32_t status; + + /* + * Assumptions: The controller has been fully initialized + * by an earlier FW layer setting the chip ID command, the + * erase sizes, and configuring the timings for reads and + * writes. + * + * This driver is meant to be usable if the configuration + * is in lock down. + * + * If that wasn't the case, we could configure some sane + * defaults here and tuned values in setup() after the + * chip has been identified. + */ + + /* Read erase sizes from flash */ + rc = sfc_reg_read(SFC_REG_ERASMS, &ct->small_er_size); + if (rc) + return rc; + sfc_validate_er_size(&ct->small_er_size); + rc = sfc_reg_read(SFC_REG_ERASLGS, &ct->large_er_size); + if (rc) + return rc; + sfc_validate_er_size(&ct->large_er_size); + + /* No erase sizes we can cope with ? Ouch... 
*/ + if ((ct->small_er_size == 0 && ct->large_er_size == 0) || + (ct->large_er_size && (ct->small_er_size > ct->large_er_size))) { + FL_ERR("SFC: No supported erase sizes !\n"); + return FLASH_ERR_CTRL_CONFIG_MISMATCH; + } + + FL_INF("SFC: Suppored erase sizes:"); + if (ct->small_er_size) + FL_INF(" %dKB", ct->small_er_size >> 10); + if (ct->large_er_size) + FL_INF(" %dKB", ct->large_er_size >> 10); + FL_INF("\n"); + + /* Read current state of 4 byte addressing */ + rc = sfc_reg_read(SFC_REG_STATUS, &status); + if (rc) + return rc; + ct->mode_4b = !!(status & SFC_REG_STATUS_FOURBYTEAD); + + return 0; +} + +int sfc_open(struct spi_flash_ctrl **ctrl) +{ + struct sfc_ctrl *ct; + int rc; + + *ctrl = NULL; + ct = malloc(sizeof(*ct)); + if (!ct) { + FL_ERR("SFC: Failed to allocate\n"); + return FLASH_ERR_MALLOC_FAILED; + } + memset(ct, 0, sizeof(*ct)); + ct->ops.chip_id = sfc_chip_id; + ct->ops.setup = sfc_setup; + ct->ops.set_4b = sfc_set_4b; + ct->ops.read = sfc_read; + ct->ops.write = sfc_write; + ct->ops.erase = sfc_erase; + + rc = sfc_init(ct); + if (rc) + goto fail; + *ctrl = &ct->ops; + return 0; + fail: + free(ct); + return rc; +} + +void sfc_close(struct spi_flash_ctrl *ctrl) +{ + struct sfc_ctrl *ct = container_of(ctrl, struct sfc_ctrl, ops); + + /* Free the whole lot */ + free(ct); +} + diff --git a/roms/skiboot/hw/slw.c b/roms/skiboot/hw/slw.c new file mode 100644 index 000000000..56ba05b0a --- /dev/null +++ b/roms/skiboot/hw/slw.c @@ -0,0 +1,1731 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* + * Everything to do with deep power saving (stop) states + * SLeep/Winkle, Handle ChipTOD chip & configure core timebases + * + * Copyright 2013-2019 IBM Corp. + */ + +#include <skiboot.h> +#include <xscom.h> +#include <xscom-p8-regs.h> +#include <xscom-p9-regs.h> +#include <xscom-p10-regs.h> +#include <io.h> +#include <cpu.h> +#include <chip.h> +#include <mem_region.h> +#include <chiptod.h> +#include <interrupts.h> +#include <timebase.h> +#include <errorlog.h> +#include <libfdt/libfdt.h> +#include <opal-api.h> +#include <nvram.h> +#include <sbe-p8.h> +#include <xive.h> + +#include <p10_stop_api.H> +#include <p8_pore_table_gen_api.H> +#include <sbe_xip_image.h> + +static uint32_t slw_saved_reset[0x100]; + +static bool slw_current_le = false; + +enum wakeup_engine_states wakeup_engine_state = WAKEUP_ENGINE_NOT_PRESENT; +bool has_deep_states = false; + +DEFINE_LOG_ENTRY(OPAL_RC_SLW_INIT, OPAL_PLATFORM_ERR_EVT, OPAL_SLW, + OPAL_PLATFORM_FIRMWARE, OPAL_PREDICTIVE_ERR_GENERAL, + OPAL_NA); + +DEFINE_LOG_ENTRY(OPAL_RC_SLW_SET, OPAL_PLATFORM_ERR_EVT, OPAL_SLW, + OPAL_PLATFORM_FIRMWARE, OPAL_INFO, + OPAL_NA); + +DEFINE_LOG_ENTRY(OPAL_RC_SLW_GET, OPAL_PLATFORM_ERR_EVT, OPAL_SLW, + OPAL_PLATFORM_FIRMWARE, OPAL_INFO, + OPAL_NA); + +DEFINE_LOG_ENTRY(OPAL_RC_SLW_REG, OPAL_PLATFORM_ERR_EVT, OPAL_SLW, + OPAL_PLATFORM_FIRMWARE, OPAL_INFO, + OPAL_NA); + +static void slw_do_rvwinkle(void *data) +{ + struct cpu_thread *cpu = this_cpu(); + struct cpu_thread *master = data; + uint64_t lpcr = mfspr(SPR_LPCR); + struct proc_chip *chip; + + /* Setup our ICP to receive IPIs */ + icp_prep_for_pm(); + + /* Setup LPCR to wakeup on external interrupts only */ + mtspr(SPR_LPCR, ((lpcr & ~SPR_LPCR_P8_PECE) | SPR_LPCR_P8_PECE2)); + isync(); + + prlog(PR_DEBUG, "SLW: CPU PIR 0x%04x going to rvwinkle...\n", + cpu->pir); + + /* Tell that we got it */ + cpu->state = cpu_state_rvwinkle; + + enter_p8_pm_state(1); + + /* Restore SPRs */ + init_shared_sprs(); + init_replicated_sprs(); + + /* Ok, it's ours again */ 
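+	/*
+	 * rvwinkle loses the full core state, hence the SPR re-init just
+	 * above; below we mark the thread active again, clean up the ICP,
+	 * resync the timebase and restore LPCR.
+	 */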
+ cpu->state = cpu_state_active; + + prlog(PR_DEBUG, "SLW: CPU PIR 0x%04x woken up !\n", cpu->pir); + + /* Cleanup our ICP */ + reset_cpu_icp(); + + /* Resync timebase */ + chiptod_wakeup_resync(); + + /* Restore LPCR */ + mtspr(SPR_LPCR, lpcr); + isync(); + + /* If we are passed a master pointer we are the designated + * waker, let's proceed. If not, return, we are finished. + */ + if (!master) + return; + + prlog(PR_DEBUG, "SLW: CPU PIR 0x%04x waiting for master...\n", + cpu->pir); + + /* Allriiiight... now wait for master to go down */ + while(master->state != cpu_state_rvwinkle) + sync(); + + /* XXX Wait one second ! (should check xscom state ? ) */ + time_wait_ms(1000); + + for_each_chip(chip) { + struct cpu_thread *c; + uint64_t tmp; + for_each_available_core_in_chip(c, chip->id) { + xscom_read(chip->id, + XSCOM_ADDR_P8_EX_SLAVE(pir_to_core_id(c->pir), + EX_PM_IDLE_STATE_HISTORY_PHYP), + &tmp); + prlog(PR_TRACE, "SLW: core %x:%x" + " history: 0x%016llx (mid2)\n", + chip->id, pir_to_core_id(c->pir), + tmp); + } + } + + prlog(PR_DEBUG, "SLW: Waking master (PIR 0x%04x)...\n", master->pir); + + /* Now poke all the secondary threads on the master's core */ + for_each_cpu(cpu) { + if (!cpu_is_sibling(cpu, master) || (cpu == master)) + continue; + icp_kick_cpu(cpu); + + /* Wait for it to claim to be back (XXX ADD TIMEOUT) */ + while(cpu->state != cpu_state_active) + sync(); + } + + /* Now poke the master and be gone */ + icp_kick_cpu(master); +} + +static void slw_patch_reset(void) +{ + uint32_t *src, *dst, *sav; + + src = &reset_patch_start; + dst = (uint32_t *)0x100; + sav = slw_saved_reset; + while(src < &reset_patch_end) { + *(sav++) = *(dst); + *(dst++) = *(src++); + } + sync_icache(); +} + +static void slw_unpatch_reset(void) +{ + extern uint32_t reset_patch_start; + extern uint32_t reset_patch_end; + uint32_t *src, *dst, *sav; + + src = &reset_patch_start; + dst = (uint32_t *)0x100; + sav = slw_saved_reset; + while(src < &reset_patch_end) { + *(dst++) = *(sav++); + src++; + } + sync_icache(); +} + +static bool slw_general_init(struct proc_chip *chip, struct cpu_thread *c) +{ + uint32_t core = pir_to_core_id(c->pir); + uint64_t tmp; + int rc; + + /* PowerManagement GP0 clear PM_DISABLE */ + rc = xscom_read(chip->id, XSCOM_ADDR_P8_EX_SLAVE(core, EX_PM_GP0), &tmp); + if (rc) { + log_simple_error(&e_info(OPAL_RC_SLW_INIT), + "SLW: Failed to read PM_GP0\n"); + return false; + } + tmp = tmp & ~0x8000000000000000ULL; + rc = xscom_write(chip->id, XSCOM_ADDR_P8_EX_SLAVE(core, EX_PM_GP0), tmp); + if (rc) { + log_simple_error(&e_info(OPAL_RC_SLW_INIT), + "SLW: Failed to write PM_GP0\n"); + return false; + } + prlog(PR_TRACE, "SLW: PMGP0 set to 0x%016llx\n", tmp); + + /* Read back for debug */ + rc = xscom_read(chip->id, XSCOM_ADDR_P8_EX_SLAVE(core, EX_PM_GP0), &tmp); + if (rc) + log_simple_error(&e_info(OPAL_RC_SLW_INIT), + "SLW: Failed to re-read PM_GP0. 
Continuing...\n"); + + prlog(PR_TRACE, "SLW: PMGP0 read 0x%016llx\n", tmp); + + return true; +} + +static bool slw_set_overrides(struct proc_chip *chip, struct cpu_thread *c) +{ + uint32_t core = pir_to_core_id(c->pir); + int rc; + + rc = xscom_write(chip->id, + XSCOM_ADDR_P8_EX_SLAVE(core, EX_PM_SPECIAL_WAKEUP_PHYP), + 0); + if (rc) { + log_simple_error(&e_info(OPAL_RC_SLW_SET), + "SLW: Failed to write PM_SPECIAL_WAKEUP_PHYP\n"); + return false; + } + + return true; +} + +static bool slw_set_overrides_p10(struct proc_chip *chip, struct cpu_thread *c) +{ + uint64_t tmp; + int rc; + uint32_t core = pir_to_core_id(c->pir); + + /* Special wakeup bits that could hold power mgt */ + rc = xscom_read(chip->id, + XSCOM_ADDR_P10_QME_CORE(core, P10_QME_SPWU_HYP), + &tmp); + if (rc) { + log_simple_error(&e_info(OPAL_RC_SLW_SET), + "SLW: Failed to read P10_QME_SPWU_HYP\n"); + return false; + } + if (tmp & P10_SPWU_REQ) + prlog(PR_WARNING, + "SLW: core %d P10_QME_SPWU_HYP requested 0x%016llx\n", + core, tmp); + + return true; +} + + +static bool slw_set_overrides_p9(struct proc_chip *chip, struct cpu_thread *c) +{ + uint64_t tmp; + int rc; + uint32_t core = pir_to_core_id(c->pir); + + /* Special wakeup bits that could hold power mgt */ + rc = xscom_read(chip->id, + XSCOM_ADDR_P9_EC_SLAVE(core, EC_PPM_SPECIAL_WKUP_HYP), + &tmp); + if (rc) { + log_simple_error(&e_info(OPAL_RC_SLW_SET), + "SLW: Failed to read EC_PPM_SPECIAL_WKUP_HYP\n"); + return false; + } + if (tmp) + prlog(PR_WARNING, + "SLW: core %d EC_PPM_SPECIAL_WKUP_HYP read 0x%016llx\n", + core, tmp); + rc = xscom_read(chip->id, + XSCOM_ADDR_P9_EC_SLAVE(core, EC_PPM_SPECIAL_WKUP_OTR), + &tmp); + if (tmp) + prlog(PR_WARNING, + "SLW: core %d EC_PPM_SPECIAL_WKUP_OTR read 0x%016llx\n", + core, tmp); + return true; +} + +static bool slw_unset_overrides(struct proc_chip *chip, struct cpu_thread *c) +{ + uint32_t core = pir_to_core_id(c->pir); + + /* XXX FIXME: Save and restore the overrides */ + prlog(PR_DEBUG, "SLW: slw_unset_overrides %x:%x\n", chip->id, core); + return true; +} + +static bool slw_set_idle_mode(struct proc_chip *chip, struct cpu_thread *c) +{ + uint32_t core = pir_to_core_id(c->pir); + uint64_t tmp; + int rc; + + /* + * PM GP1 allows fast/deep mode to be selected independently for sleep + * and winkle. Init PM GP1 so that sleep happens in fast mode and + * winkle happens in deep mode. + * Make use of the OR XSCOM for this since the OCC might be manipulating + * the PM_GP1 register as well. Before doing this ensure that the bits + * managing idle states are cleared so as to override any bits set at + * init time. 
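+	 * Concretely, the code below writes ~EX_PM_GP1_SLEEP_WINKLE_MASK
+	 * through EX_PM_CLEAR_GP1 first, then applies
+	 * EX_PM_SETUP_GP1_FAST_SLEEP_DEEP_WINKLE through EX_PM_SET_GP1
+	 * (the OR form).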
+ */ + + tmp = ~EX_PM_GP1_SLEEP_WINKLE_MASK; + rc = xscom_write(chip->id, XSCOM_ADDR_P8_EX_SLAVE(core, EX_PM_CLEAR_GP1), + tmp); + if (rc) { + log_simple_error(&e_info(OPAL_RC_SLW_SET), + "SLW: Failed to write PM_GP1\n"); + return false; + } + + rc = xscom_write(chip->id, XSCOM_ADDR_P8_EX_SLAVE(core, EX_PM_SET_GP1), + EX_PM_SETUP_GP1_FAST_SLEEP_DEEP_WINKLE); + if (rc) { + log_simple_error(&e_info(OPAL_RC_SLW_SET), + "SLW: Failed to write PM_GP1\n"); + return false; + } + + /* Read back for debug */ + xscom_read(chip->id, XSCOM_ADDR_P8_EX_SLAVE(core, EX_PM_GP1), &tmp); + prlog(PR_TRACE, "SLW: PMGP1 read 0x%016llx\n", tmp); + return true; +} + +static bool slw_get_idle_state_history(struct proc_chip *chip, struct cpu_thread *c) +{ + uint32_t core = pir_to_core_id(c->pir); + uint64_t tmp; + int rc; + + /* Cleanup history */ + rc = xscom_read(chip->id, + XSCOM_ADDR_P8_EX_SLAVE(core, EX_PM_IDLE_STATE_HISTORY_PHYP), + &tmp); + if (rc) { + log_simple_error(&e_info(OPAL_RC_SLW_GET), + "SLW: Failed to read PM_IDLE_STATE_HISTORY\n"); + return false; + } + + prlog(PR_TRACE, "SLW: core %x:%x history: 0x%016llx (old1)\n", + chip->id, core, tmp); + + rc = xscom_read(chip->id, + XSCOM_ADDR_P8_EX_SLAVE(core, EX_PM_IDLE_STATE_HISTORY_PHYP), + &tmp); + + if (rc) { + log_simple_error(&e_info(OPAL_RC_SLW_GET), + "SLW: Failed to read PM_IDLE_STATE_HISTORY\n"); + return false; + } + + prlog(PR_TRACE, "SLW: core %x:%x history: 0x%016llx (old2)\n", + chip->id, core, tmp); + + return true; +} + +static bool idle_prepare_core(struct proc_chip *chip, struct cpu_thread *c) +{ + prlog(PR_TRACE, "FASTSLEEP: Prepare core %x:%x\n", + chip->id, pir_to_core_id(c->pir)); + + if(!slw_general_init(chip, c)) + return false; + if(!slw_set_overrides(chip, c)) + return false; + if(!slw_set_idle_mode(chip, c)) + return false; + if(!slw_get_idle_state_history(chip, c)) + return false; + + return true; + +} + +/* Define device-tree fields */ +#define MAX_NAME_LEN 16 +struct cpu_idle_states { + char name[MAX_NAME_LEN]; + u32 latency_ns; + u32 residency_ns; + /* + * Register value/mask used to select different idle states. 
+ * PMICR in POWER8 and PSSCR in POWER9 + */ + u64 pm_ctrl_reg_val; + u64 pm_ctrl_reg_mask; + u32 flags; +}; + +static struct cpu_idle_states nap_only_cpu_idle_states[] = { + { /* nap */ + .name = "nap", + .latency_ns = 4000, + .residency_ns = 100000, + .flags = 0*OPAL_PM_DEC_STOP \ + | 0*OPAL_PM_TIMEBASE_STOP \ + | 1*OPAL_PM_LOSE_USER_CONTEXT \ + | 0*OPAL_PM_LOSE_HYP_CONTEXT \ + | 0*OPAL_PM_LOSE_FULL_CONTEXT \ + | 1*OPAL_PM_NAP_ENABLED \ + | 0*OPAL_PM_SLEEP_ENABLED \ + | 0*OPAL_PM_WINKLE_ENABLED \ + | 0*OPAL_USE_PMICR, + .pm_ctrl_reg_val = 0, + .pm_ctrl_reg_mask = 0 }, +}; + +static struct cpu_idle_states power8_cpu_idle_states[] = { + { /* nap */ + .name = "nap", + .latency_ns = 4000, + .residency_ns = 100000, + .flags = 0*OPAL_PM_DEC_STOP \ + | 0*OPAL_PM_TIMEBASE_STOP \ + | 1*OPAL_PM_LOSE_USER_CONTEXT \ + | 0*OPAL_PM_LOSE_HYP_CONTEXT \ + | 0*OPAL_PM_LOSE_FULL_CONTEXT \ + | 1*OPAL_PM_NAP_ENABLED \ + | 0*OPAL_USE_PMICR, + .pm_ctrl_reg_val = 0, + .pm_ctrl_reg_mask = 0 }, + { /* fast sleep (with workaround) */ + .name = "fastsleep_", + .latency_ns = 40000, + .residency_ns = 300000000, + .flags = 1*OPAL_PM_DEC_STOP \ + | 1*OPAL_PM_TIMEBASE_STOP \ + | 1*OPAL_PM_LOSE_USER_CONTEXT \ + | 0*OPAL_PM_LOSE_HYP_CONTEXT \ + | 0*OPAL_PM_LOSE_FULL_CONTEXT \ + | 1*OPAL_PM_SLEEP_ENABLED_ER1 \ + | 0*OPAL_USE_PMICR, /* Not enabled until deep + states are available */ + .pm_ctrl_reg_val = OPAL_PM_FASTSLEEP_PMICR, + .pm_ctrl_reg_mask = OPAL_PM_SLEEP_PMICR_MASK }, + { /* Winkle */ + .name = "winkle", + .latency_ns = 10000000, + .residency_ns = 1000000000, /* Educated guess (not measured). + * Winkle is not currently used by + * linux cpuidle subsystem so we + * don't have real world user. + * However, this should be roughly + * accurate for when linux does + * use it. */ + .flags = 1*OPAL_PM_DEC_STOP \ + | 1*OPAL_PM_TIMEBASE_STOP \ + | 1*OPAL_PM_LOSE_USER_CONTEXT \ + | 1*OPAL_PM_LOSE_HYP_CONTEXT \ + | 1*OPAL_PM_LOSE_FULL_CONTEXT \ + | 1*OPAL_PM_WINKLE_ENABLED \ + | 0*OPAL_USE_PMICR, /* Currently choosing deep vs + fast via EX_PM_GP1 reg */ + .pm_ctrl_reg_val = 0, + .pm_ctrl_reg_mask = 0 }, +}; + +/* + * cpu_idle_states for key idle states of POWER9 that we want to + * exploit. + * Note latency_ns and residency_ns are estimated values for now. 
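+ * As a quick orientation for the table below: stop0_lite requests
+ * RL=0 with ESL/EC left clear and is flagged as losing no context,
+ * while stop4 and deeper set ESL/EC, carry OPAL_PM_STOP_INST_DEEP and
+ * advertise full hypervisor context loss.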
+ */ +static struct cpu_idle_states power9_cpu_idle_states[] = { + { + .name = "stop0_lite", /* Enter stop0 with no state loss */ + .latency_ns = 1000, + .residency_ns = 10000, + .flags = 0*OPAL_PM_DEC_STOP \ + | 0*OPAL_PM_TIMEBASE_STOP \ + | 0*OPAL_PM_LOSE_USER_CONTEXT \ + | 0*OPAL_PM_LOSE_HYP_CONTEXT \ + | 0*OPAL_PM_LOSE_FULL_CONTEXT \ + | 1*OPAL_PM_STOP_INST_FAST, + .pm_ctrl_reg_val = OPAL_PM_PSSCR_RL(0) \ + | OPAL_PM_PSSCR_MTL(3) \ + | OPAL_PM_PSSCR_TR(3), + .pm_ctrl_reg_mask = OPAL_PM_PSSCR_MASK }, + { + .name = "stop0", + .latency_ns = 2000, + .residency_ns = 20000, + .flags = 0*OPAL_PM_DEC_STOP \ + | 0*OPAL_PM_TIMEBASE_STOP \ + | 1*OPAL_PM_LOSE_USER_CONTEXT \ + | 0*OPAL_PM_LOSE_HYP_CONTEXT \ + | 0*OPAL_PM_LOSE_FULL_CONTEXT \ + | 1*OPAL_PM_STOP_INST_FAST, + .pm_ctrl_reg_val = OPAL_PM_PSSCR_RL(0) \ + | OPAL_PM_PSSCR_MTL(3) \ + | OPAL_PM_PSSCR_TR(3) \ + | OPAL_PM_PSSCR_ESL \ + | OPAL_PM_PSSCR_EC, + .pm_ctrl_reg_mask = OPAL_PM_PSSCR_MASK }, + + /* stop1_lite has been removed since it adds no additional benefit over stop0_lite */ + + { + .name = "stop1", + .latency_ns = 5000, + .residency_ns = 50000, + .flags = 0*OPAL_PM_DEC_STOP \ + | 0*OPAL_PM_TIMEBASE_STOP \ + | 1*OPAL_PM_LOSE_USER_CONTEXT \ + | 0*OPAL_PM_LOSE_HYP_CONTEXT \ + | 0*OPAL_PM_LOSE_FULL_CONTEXT \ + | 1*OPAL_PM_STOP_INST_FAST, + .pm_ctrl_reg_val = OPAL_PM_PSSCR_RL(1) \ + | OPAL_PM_PSSCR_MTL(3) \ + | OPAL_PM_PSSCR_TR(3) \ + | OPAL_PM_PSSCR_ESL \ + | OPAL_PM_PSSCR_EC, + .pm_ctrl_reg_mask = OPAL_PM_PSSCR_MASK }, + /* + * stop2_lite has been removed since currently it adds minimal benefit over stop2. + * However, the benefit is eclipsed by the time required to ungate the clocks + */ + + { + .name = "stop2", + .latency_ns = 10000, + .residency_ns = 100000, + .flags = 0*OPAL_PM_DEC_STOP \ + | 0*OPAL_PM_TIMEBASE_STOP \ + | 1*OPAL_PM_LOSE_USER_CONTEXT \ + | 0*OPAL_PM_LOSE_HYP_CONTEXT \ + | 0*OPAL_PM_LOSE_FULL_CONTEXT \ + | 1*OPAL_PM_STOP_INST_FAST, + .pm_ctrl_reg_val = OPAL_PM_PSSCR_RL(2) \ + | OPAL_PM_PSSCR_MTL(3) \ + | OPAL_PM_PSSCR_TR(3) \ + | OPAL_PM_PSSCR_ESL \ + | OPAL_PM_PSSCR_EC, + .pm_ctrl_reg_mask = OPAL_PM_PSSCR_MASK }, + { + .name = "stop4", + .latency_ns = 100000, + .residency_ns = 10000000, + .flags = 0*OPAL_PM_DEC_STOP \ + | 0*OPAL_PM_TIMEBASE_STOP \ + | 1*OPAL_PM_LOSE_USER_CONTEXT \ + | 1*OPAL_PM_LOSE_HYP_CONTEXT \ + | 1*OPAL_PM_LOSE_FULL_CONTEXT \ + | 1*OPAL_PM_STOP_INST_DEEP, + .pm_ctrl_reg_val = OPAL_PM_PSSCR_RL(4) \ + | OPAL_PM_PSSCR_MTL(7) \ + | OPAL_PM_PSSCR_TR(3) \ + | OPAL_PM_PSSCR_ESL \ + | OPAL_PM_PSSCR_EC, + .pm_ctrl_reg_mask = OPAL_PM_PSSCR_MASK }, + { + .name = "stop5", + .latency_ns = 200000, + .residency_ns = 20000000, + .flags = 0*OPAL_PM_DEC_STOP \ + | 0*OPAL_PM_TIMEBASE_STOP \ + | 1*OPAL_PM_LOSE_USER_CONTEXT \ + | 1*OPAL_PM_LOSE_HYP_CONTEXT \ + | 1*OPAL_PM_LOSE_FULL_CONTEXT \ + | 1*OPAL_PM_STOP_INST_DEEP, + .pm_ctrl_reg_val = OPAL_PM_PSSCR_RL(5) \ + | OPAL_PM_PSSCR_MTL(7) \ + | OPAL_PM_PSSCR_TR(3) \ + | OPAL_PM_PSSCR_ESL \ + | OPAL_PM_PSSCR_EC, + .pm_ctrl_reg_mask = OPAL_PM_PSSCR_MASK }, + + { + .name = "stop8", + .latency_ns = 2000000, + .residency_ns = 20000000, + .flags = 1*OPAL_PM_DEC_STOP \ + | 1*OPAL_PM_TIMEBASE_STOP \ + | 1*OPAL_PM_LOSE_USER_CONTEXT \ + | 1*OPAL_PM_LOSE_HYP_CONTEXT \ + | 1*OPAL_PM_LOSE_FULL_CONTEXT \ + | 1*OPAL_PM_STOP_INST_DEEP, + .pm_ctrl_reg_val = OPAL_PM_PSSCR_RL(8) \ + | OPAL_PM_PSSCR_MTL(11) \ + | OPAL_PM_PSSCR_TR(3) \ + | OPAL_PM_PSSCR_ESL \ + | OPAL_PM_PSSCR_EC, + .pm_ctrl_reg_mask = OPAL_PM_PSSCR_MASK }, + + { + .name = "stop11", + .latency_ns = 10000000, + 
.residency_ns = 100000000, + .flags = 1*OPAL_PM_DEC_STOP \ + | 1*OPAL_PM_TIMEBASE_STOP \ + | 1*OPAL_PM_LOSE_USER_CONTEXT \ + | 1*OPAL_PM_LOSE_HYP_CONTEXT \ + | 1*OPAL_PM_LOSE_FULL_CONTEXT \ + | 1*OPAL_PM_STOP_INST_DEEP, + .pm_ctrl_reg_val = OPAL_PM_PSSCR_RL(11) \ + | OPAL_PM_PSSCR_MTL(11) \ + | OPAL_PM_PSSCR_TR(3) \ + | OPAL_PM_PSSCR_ESL \ + | OPAL_PM_PSSCR_EC, + .pm_ctrl_reg_mask = OPAL_PM_PSSCR_MASK }, + +}; + +/* + * Prior to Mambo.7.8.21, mambo did set the MSR correctly for lite stop + * states, so disable them for now. + */ +static struct cpu_idle_states power9_mambo_cpu_idle_states[] = { + { + .name = "stop0", + .latency_ns = 2000, + .residency_ns = 20000, + .flags = 0*OPAL_PM_DEC_STOP \ + | 0*OPAL_PM_TIMEBASE_STOP \ + | 1*OPAL_PM_LOSE_USER_CONTEXT \ + | 0*OPAL_PM_LOSE_HYP_CONTEXT \ + | 0*OPAL_PM_LOSE_FULL_CONTEXT \ + | 1*OPAL_PM_STOP_INST_FAST, + .pm_ctrl_reg_val = OPAL_PM_PSSCR_RL(0) \ + | OPAL_PM_PSSCR_MTL(3) \ + | OPAL_PM_PSSCR_TR(3) \ + | OPAL_PM_PSSCR_ESL \ + | OPAL_PM_PSSCR_EC, + .pm_ctrl_reg_mask = OPAL_PM_PSSCR_MASK }, + { + .name = "stop1", + .latency_ns = 5000, + .residency_ns = 50000, + .flags = 0*OPAL_PM_DEC_STOP \ + | 0*OPAL_PM_TIMEBASE_STOP \ + | 1*OPAL_PM_LOSE_USER_CONTEXT \ + | 0*OPAL_PM_LOSE_HYP_CONTEXT \ + | 0*OPAL_PM_LOSE_FULL_CONTEXT \ + | 1*OPAL_PM_STOP_INST_FAST, + .pm_ctrl_reg_val = OPAL_PM_PSSCR_RL(1) \ + | OPAL_PM_PSSCR_MTL(3) \ + | OPAL_PM_PSSCR_TR(3) \ + | OPAL_PM_PSSCR_ESL \ + | OPAL_PM_PSSCR_EC, + .pm_ctrl_reg_mask = OPAL_PM_PSSCR_MASK }, + { + .name = "stop2", + .latency_ns = 10000, + .residency_ns = 100000, + .flags = 0*OPAL_PM_DEC_STOP \ + | 0*OPAL_PM_TIMEBASE_STOP \ + | 1*OPAL_PM_LOSE_USER_CONTEXT \ + | 0*OPAL_PM_LOSE_HYP_CONTEXT \ + | 0*OPAL_PM_LOSE_FULL_CONTEXT \ + | 1*OPAL_PM_STOP_INST_FAST, + .pm_ctrl_reg_val = OPAL_PM_PSSCR_RL(2) \ + | OPAL_PM_PSSCR_MTL(3) \ + | OPAL_PM_PSSCR_TR(3) \ + | OPAL_PM_PSSCR_ESL \ + | OPAL_PM_PSSCR_EC, + .pm_ctrl_reg_mask = OPAL_PM_PSSCR_MASK }, + { + .name = "stop4", + .latency_ns = 100000, + .residency_ns = 1000000, + .flags = 1*OPAL_PM_DEC_STOP \ + | 1*OPAL_PM_TIMEBASE_STOP \ + | 1*OPAL_PM_LOSE_USER_CONTEXT \ + | 1*OPAL_PM_LOSE_HYP_CONTEXT \ + | 1*OPAL_PM_LOSE_FULL_CONTEXT \ + | 1*OPAL_PM_STOP_INST_DEEP, + .pm_ctrl_reg_val = OPAL_PM_PSSCR_RL(4) \ + | OPAL_PM_PSSCR_MTL(7) \ + | OPAL_PM_PSSCR_TR(3) \ + | OPAL_PM_PSSCR_ESL \ + | OPAL_PM_PSSCR_EC, + .pm_ctrl_reg_mask = OPAL_PM_PSSCR_MASK }, + + { + .name = "stop8", + .latency_ns = 2000000, + .residency_ns = 20000000, + .flags = 1*OPAL_PM_DEC_STOP \ + | 1*OPAL_PM_TIMEBASE_STOP \ + | 1*OPAL_PM_LOSE_USER_CONTEXT \ + | 1*OPAL_PM_LOSE_HYP_CONTEXT \ + | 1*OPAL_PM_LOSE_FULL_CONTEXT \ + | 1*OPAL_PM_STOP_INST_DEEP, + .pm_ctrl_reg_val = OPAL_PM_PSSCR_RL(8) \ + | OPAL_PM_PSSCR_MTL(11) \ + | OPAL_PM_PSSCR_TR(3) \ + | OPAL_PM_PSSCR_ESL \ + | OPAL_PM_PSSCR_EC, + .pm_ctrl_reg_mask = OPAL_PM_PSSCR_MASK }, + + { + .name = "stop11", + .latency_ns = 10000000, + .residency_ns = 100000000, + .flags = 1*OPAL_PM_DEC_STOP \ + | 1*OPAL_PM_TIMEBASE_STOP \ + | 1*OPAL_PM_LOSE_USER_CONTEXT \ + | 1*OPAL_PM_LOSE_HYP_CONTEXT \ + | 1*OPAL_PM_LOSE_FULL_CONTEXT \ + | 1*OPAL_PM_STOP_INST_DEEP, + .pm_ctrl_reg_val = OPAL_PM_PSSCR_RL(11) \ + | OPAL_PM_PSSCR_MTL(11) \ + | OPAL_PM_PSSCR_TR(3) \ + | OPAL_PM_PSSCR_ESL \ + | OPAL_PM_PSSCR_EC, + .pm_ctrl_reg_mask = OPAL_PM_PSSCR_MASK }, + +}; + +/* + * cpu_idle_states for fused core configuration + * These will be a subset of power9 idle states. 
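+ * Concretely, only the shallow entries (stop0_lite through stop2)
+ * are listed; the deep states from power9_cpu_idle_states (stop4 and
+ * up) are not offered on fused cores.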
+ */ +static struct cpu_idle_states power9_fusedcore_cpu_idle_states[] = { + { + .name = "stop0_lite", /* Enter stop0 with no state loss */ + .latency_ns = 1000, + .residency_ns = 10000, + .flags = 0*OPAL_PM_DEC_STOP \ + | 0*OPAL_PM_TIMEBASE_STOP \ + | 0*OPAL_PM_LOSE_USER_CONTEXT \ + | 0*OPAL_PM_LOSE_HYP_CONTEXT \ + | 0*OPAL_PM_LOSE_FULL_CONTEXT \ + | 1*OPAL_PM_STOP_INST_FAST, + .pm_ctrl_reg_val = OPAL_PM_PSSCR_RL(0) \ + | OPAL_PM_PSSCR_MTL(3) \ + | OPAL_PM_PSSCR_TR(3), + .pm_ctrl_reg_mask = OPAL_PM_PSSCR_MASK }, + { + .name = "stop0", + .latency_ns = 2000, + .residency_ns = 20000, + .flags = 0*OPAL_PM_DEC_STOP \ + | 0*OPAL_PM_TIMEBASE_STOP \ + | 1*OPAL_PM_LOSE_USER_CONTEXT \ + | 0*OPAL_PM_LOSE_HYP_CONTEXT \ + | 0*OPAL_PM_LOSE_FULL_CONTEXT \ + | 1*OPAL_PM_STOP_INST_FAST, + .pm_ctrl_reg_val = OPAL_PM_PSSCR_RL(0) \ + | OPAL_PM_PSSCR_MTL(3) \ + | OPAL_PM_PSSCR_TR(3) \ + | OPAL_PM_PSSCR_ESL \ + | OPAL_PM_PSSCR_EC, + .pm_ctrl_reg_mask = OPAL_PM_PSSCR_MASK }, + + /* stop1_lite has been removed since it adds no additional benefit over stop0_lite */ + + { + .name = "stop1", + .latency_ns = 5000, + .residency_ns = 50000, + .flags = 0*OPAL_PM_DEC_STOP \ + | 0*OPAL_PM_TIMEBASE_STOP \ + | 1*OPAL_PM_LOSE_USER_CONTEXT \ + | 0*OPAL_PM_LOSE_HYP_CONTEXT \ + | 0*OPAL_PM_LOSE_FULL_CONTEXT \ + | 1*OPAL_PM_STOP_INST_FAST, + .pm_ctrl_reg_val = OPAL_PM_PSSCR_RL(1) \ + | OPAL_PM_PSSCR_MTL(3) \ + | OPAL_PM_PSSCR_TR(3) \ + | OPAL_PM_PSSCR_ESL \ + | OPAL_PM_PSSCR_EC, + .pm_ctrl_reg_mask = OPAL_PM_PSSCR_MASK }, + /* + * stop2_lite has been removed since currently it adds minimal benefit over stop2. + * However, the benefit is eclipsed by the time required to ungate the clocks + */ + + { + .name = "stop2", + .latency_ns = 10000, + .residency_ns = 100000, + .flags = 0*OPAL_PM_DEC_STOP \ + | 0*OPAL_PM_TIMEBASE_STOP \ + | 1*OPAL_PM_LOSE_USER_CONTEXT \ + | 0*OPAL_PM_LOSE_HYP_CONTEXT \ + | 0*OPAL_PM_LOSE_FULL_CONTEXT \ + | 1*OPAL_PM_STOP_INST_FAST, + .pm_ctrl_reg_val = OPAL_PM_PSSCR_RL(2) \ + | OPAL_PM_PSSCR_MTL(3) \ + | OPAL_PM_PSSCR_TR(3) \ + | OPAL_PM_PSSCR_ESL \ + | OPAL_PM_PSSCR_EC, + .pm_ctrl_reg_mask = OPAL_PM_PSSCR_MASK }, +}; + +/* + * Note latency_ns and residency_ns are estimated values for now. 
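+ * Only the shallow states up to stop3 are advertised at this point;
+ * the stop11 entry further down is compiled out under "#if 0".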
+ */ +static struct cpu_idle_states power10_cpu_idle_states[] = { + { + .name = "stop0_lite", /* Enter stop0 with no state loss */ + .latency_ns = 1000, + .residency_ns = 10000, + .flags = 0*OPAL_PM_DEC_STOP \ + | 0*OPAL_PM_TIMEBASE_STOP \ + | 0*OPAL_PM_LOSE_USER_CONTEXT \ + | 0*OPAL_PM_LOSE_HYP_CONTEXT \ + | 0*OPAL_PM_LOSE_FULL_CONTEXT \ + | 1*OPAL_PM_STOP_INST_FAST, + .pm_ctrl_reg_val = OPAL_PM_PSSCR_RL(0) \ + | OPAL_PM_PSSCR_MTL(0) \ + | OPAL_PM_PSSCR_TR(3), + .pm_ctrl_reg_mask = OPAL_PM_PSSCR_MASK }, + { + .name = "stop0", + .latency_ns = 10000, + .residency_ns = 100000, + .flags = 0*OPAL_PM_DEC_STOP \ + | 0*OPAL_PM_TIMEBASE_STOP \ + | 1*OPAL_PM_LOSE_USER_CONTEXT \ + | 0*OPAL_PM_LOSE_HYP_CONTEXT \ + | 0*OPAL_PM_LOSE_FULL_CONTEXT \ + | 1*OPAL_PM_STOP_INST_FAST, + .pm_ctrl_reg_val = OPAL_PM_PSSCR_RL(0) \ + | OPAL_PM_PSSCR_MTL(0) \ + | OPAL_PM_PSSCR_TR(3) \ + | OPAL_PM_PSSCR_ESL \ + | OPAL_PM_PSSCR_EC, + .pm_ctrl_reg_mask = OPAL_PM_PSSCR_MASK }, + { + .name = "stop2", + .latency_ns = 20000, + .residency_ns = 200000, + .flags = 0*OPAL_PM_DEC_STOP \ + | 0*OPAL_PM_TIMEBASE_STOP \ + | 1*OPAL_PM_LOSE_USER_CONTEXT \ + | 0*OPAL_PM_LOSE_HYP_CONTEXT \ + | 0*OPAL_PM_LOSE_FULL_CONTEXT \ + | 1*OPAL_PM_STOP_INST_FAST, + .pm_ctrl_reg_val = OPAL_PM_PSSCR_RL(2) \ + | OPAL_PM_PSSCR_MTL(2) \ + | OPAL_PM_PSSCR_TR(3) \ + | OPAL_PM_PSSCR_ESL \ + | OPAL_PM_PSSCR_EC, + .pm_ctrl_reg_mask = OPAL_PM_PSSCR_MASK }, + { + .name = "stop3", + .latency_ns = 45000, + .residency_ns = 450000, + .flags = 0*OPAL_PM_DEC_STOP \ + | 0*OPAL_PM_TIMEBASE_STOP \ + | 1*OPAL_PM_LOSE_USER_CONTEXT \ + | 0*OPAL_PM_LOSE_HYP_CONTEXT \ + | 0*OPAL_PM_LOSE_FULL_CONTEXT \ + | 1*OPAL_PM_STOP_INST_FAST, + .pm_ctrl_reg_val = OPAL_PM_PSSCR_RL(3) \ + | OPAL_PM_PSSCR_MTL(3) \ + | OPAL_PM_PSSCR_TR(3) \ + | OPAL_PM_PSSCR_ESL \ + | OPAL_PM_PSSCR_EC, + .pm_ctrl_reg_mask = OPAL_PM_PSSCR_MASK }, +#if 0 + { + .name = "stop11", + .latency_ns = 10000000, + .residency_ns = 100000000, + .flags = 1*OPAL_PM_DEC_STOP \ + | 1*OPAL_PM_TIMEBASE_STOP \ + | 1*OPAL_PM_LOSE_USER_CONTEXT \ + | 1*OPAL_PM_LOSE_HYP_CONTEXT \ + | 1*OPAL_PM_LOSE_FULL_CONTEXT \ + | 1*OPAL_PM_STOP_INST_DEEP, + .pm_ctrl_reg_val = OPAL_PM_PSSCR_RL(11) \ + | OPAL_PM_PSSCR_MTL(11) \ + | OPAL_PM_PSSCR_TR(3) \ + | OPAL_PM_PSSCR_ESL \ + | OPAL_PM_PSSCR_EC, + .pm_ctrl_reg_mask = OPAL_PM_PSSCR_MASK }, +#endif +}; + +static void slw_late_init_p9(struct proc_chip *chip) +{ + struct cpu_thread *c; + int rc; + + prlog(PR_INFO, "SLW: Configuring self-restore for HRMOR\n"); + for_each_available_cpu(c) { + if (c->chip_id != chip->id) + continue; + /* + * Clear HRMOR. Need to update only for thread + * 0 of each core. Doing it anyway for all threads + */ + rc = p9_stop_save_cpureg((void *)chip->homer_base, + P9_STOP_SPR_HRMOR, 0, + c->pir); + if (rc) { + log_simple_error(&e_info(OPAL_RC_SLW_REG), + "SLW: Failed to set HRMOR for CPU %x,RC=0x%x\n", + c->pir, rc); + prlog(PR_ERR, "Disabling deep stop states\n"); + } + } +} + +static void slw_late_init_p10(struct proc_chip *chip) +{ + struct cpu_thread *c; + int rc; + + prlog(PR_INFO, "SLW: Configuring self-restore for HRMOR\n"); + for_each_available_cpu(c) { + if (c->chip_id != chip->id) + continue; + /* + * Clear HRMOR. Need to update only for thread + * 0 of each core. 
Doing it anyway for all threads + */ + rc = proc_stop_save_cpureg((void *)chip->homer_base, + PROC_STOP_SPR_HRMOR, 0, + c->pir); + if (rc) { + log_simple_error(&e_info(OPAL_RC_SLW_REG), + "SLW: Failed to set HRMOR for CPU %x,RC=0x%x\n", + c->pir, rc); + prlog(PR_ERR, "Disabling deep stop states\n"); + } + } +} + +/* Add device tree properties to describe idle states */ +void add_cpu_idle_state_properties(void) +{ + struct dt_node *power_mgt; + struct cpu_idle_states *states; + struct proc_chip *chip; + int nr_states; + + bool can_sleep = true; + bool has_stop_inst = false; + u8 i; + + fdt64_t *pm_ctrl_reg_val_buf; + fdt64_t *pm_ctrl_reg_mask_buf; + u32 supported_states_mask; + u32 opal_disabled_states_mask = ~0xFC000000; /* all but stop11 */ + const char* nvram_disable_str; + u32 nvram_disabled_states_mask = 0x00; + u32 stop_levels; + + /* Variables to track buffer length */ + u8 name_buf_len; + u8 num_supported_idle_states; + + /* Buffers to hold idle state properties */ + char *name_buf, *alloced_name_buf; + fdt32_t *latency_ns_buf; + fdt32_t *residency_ns_buf; + fdt32_t *flags_buf; + + prlog(PR_DEBUG, "CPU idle state device tree init\n"); + + /* Create /ibm,opal/power-mgt if it doesn't exist already */ + power_mgt = dt_new_check(opal_node, "power-mgt"); + if (!power_mgt) { + /** + * @fwts-label CreateDTPowerMgtNodeFail + * @fwts-advice OPAL failed to add the power-mgt device tree + * node. This could mean that firmware ran out of memory, + * or there's a bug somewhere. + */ + prlog(PR_ERR, "creating dt node /ibm,opal/power-mgt failed\n"); + return; + } + + /* + * Chose the right state table for the chip + * + * XXX We use the first chip version, we should probably look + * for the smaller of all chips instead.. + */ + chip = next_chip(NULL); + assert(chip); + if (proc_gen >= proc_gen_p9) { + if (chip->type == PROC_CHIP_P9_NIMBUS || + chip->type == PROC_CHIP_P9_CUMULUS || + chip->type == PROC_CHIP_P9P) { + if (proc_chip_quirks & QUIRK_MAMBO_CALLOUTS) { + states = power9_mambo_cpu_idle_states; + nr_states = ARRAY_SIZE(power9_mambo_cpu_idle_states); + } else if (this_cpu()->is_fused_core) { + states = power9_fusedcore_cpu_idle_states; + nr_states = ARRAY_SIZE(power9_fusedcore_cpu_idle_states); + } else { + states = power9_cpu_idle_states; + nr_states = ARRAY_SIZE(power9_cpu_idle_states); + } + } else if (chip->type == PROC_CHIP_P10) { + states = power10_cpu_idle_states; + nr_states = ARRAY_SIZE(power10_cpu_idle_states); + } else { + prlog(PR_ERR, "determining chip type\n"); + return; + } + + has_stop_inst = true; + stop_levels = dt_prop_get_u32_def(power_mgt, + "ibm,enabled-stop-levels", 0); + if (!stop_levels) { + prerror("SLW: No stop levels available. 
Power saving is disabled!\n"); + has_deep_states = false; + } else { + /* Iterate to see if we have deep states enabled */ + for (i = 0; i < nr_states; i++) { + u32 level = 31 - (states[i].pm_ctrl_reg_val & + OPAL_PM_PSSCR_RL_MASK); + + if ((stop_levels & (1ul << level)) && + (states[i].flags & OPAL_PM_STOP_INST_DEEP)) + has_deep_states = true; + } + } + if ((wakeup_engine_state == WAKEUP_ENGINE_PRESENT) && has_deep_states) { + if (chip->type == PROC_CHIP_P9_NIMBUS || + chip->type == PROC_CHIP_P9_CUMULUS) { + slw_late_init_p9(chip); + xive_late_init(); + nx_p9_rng_late_init(); + } else if (chip->type == PROC_CHIP_P10) { + slw_late_init_p10(chip); + xive2_late_init(); + } + } + if (wakeup_engine_state != WAKEUP_ENGINE_PRESENT) + has_deep_states = false; + } else if (chip->type == PROC_CHIP_P8_MURANO || + chip->type == PROC_CHIP_P8_VENICE || + chip->type == PROC_CHIP_P8_NAPLES) { + const struct dt_property *p; + + p = dt_find_property(dt_root, "ibm,enabled-idle-states"); + if (p) + prlog(PR_NOTICE, + "SLW: HB-provided idle states property found\n"); + states = power8_cpu_idle_states; + nr_states = ARRAY_SIZE(power8_cpu_idle_states); + + /* Check if hostboot say we can sleep */ + if (!p || !dt_prop_find_string(p, "fast-sleep")) { + prlog(PR_WARNING, "SLW: Sleep not enabled by HB" + " on this platform\n"); + can_sleep = false; + } + + /* Clip to NAP only on Murano and Venice DD1.x */ + if ((chip->type == PROC_CHIP_P8_MURANO || + chip->type == PROC_CHIP_P8_VENICE) && + chip->ec_level < 0x20) { + prlog(PR_NOTICE, "SLW: Sleep not enabled on P8 DD1.x\n"); + can_sleep = false; + } + + } else { + states = nap_only_cpu_idle_states; + nr_states = ARRAY_SIZE(nap_only_cpu_idle_states); + } + + + /* + * Currently we can't append strings and cells to dt properties. + * So create buffers to which you can append values, then create + * dt properties with this buffer content. + */ + + /* Allocate memory to idle state property buffers. */ + alloced_name_buf= malloc(nr_states * sizeof(char) * MAX_NAME_LEN); + name_buf = alloced_name_buf; + latency_ns_buf = malloc(nr_states * sizeof(u32)); + residency_ns_buf= malloc(nr_states * sizeof(u32)); + flags_buf = malloc(nr_states * sizeof(u32)); + pm_ctrl_reg_val_buf = malloc(nr_states * sizeof(u64)); + pm_ctrl_reg_mask_buf = malloc(nr_states * sizeof(u64)); + + name_buf_len = 0; + num_supported_idle_states = 0; + + /* + * Create a mask with the flags of all supported idle states + * set. Use this to only add supported idle states to the + * device-tree + */ + if (has_stop_inst) { + /* Power 9/10 / POWER ISA 3.0 and above */ + supported_states_mask = OPAL_PM_STOP_INST_FAST; + if (wakeup_engine_state == WAKEUP_ENGINE_PRESENT) + supported_states_mask |= OPAL_PM_STOP_INST_DEEP; + } else { + /* Power 7 and Power 8 */ + supported_states_mask = OPAL_PM_NAP_ENABLED; + if (can_sleep) + supported_states_mask |= OPAL_PM_SLEEP_ENABLED | + OPAL_PM_SLEEP_ENABLED_ER1; + if (wakeup_engine_state == WAKEUP_ENGINE_PRESENT) + supported_states_mask |= OPAL_PM_WINKLE_ENABLED; + } + nvram_disable_str = nvram_query_dangerous("opal-stop-state-disable-mask"); + if (nvram_disable_str) + nvram_disabled_states_mask = strtol(nvram_disable_str, NULL, 0); + prlog(PR_DEBUG, "NVRAM stop disable mask: %x\n", nvram_disabled_states_mask); + for (i = 0; i < nr_states; i++) { + /* For each state, check if it is one of the supported states. 
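+ * A state is kept only when at least one of its flag bits is also
+ * set in supported_states_mask built above: with the stop
+ * instruction and a present wakeup engine both _FAST and _DEEP stop
+ * states pass, otherwise only the _FAST ones (and on P8, nap plus
+ * whichever of sleep/winkle were enabled).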
*/ + if (!(states[i].flags & supported_states_mask)) + continue; + + /* We can only use the stop levels that HB has made available */ + if (has_stop_inst) { + u32 level = 31 - (states[i].pm_ctrl_reg_val & + OPAL_PM_PSSCR_RL_MASK); + + if (!(stop_levels & (1ul << level))) + continue; + + if ((opal_disabled_states_mask | + nvram_disabled_states_mask) & + (1ul << level)) { + if (nvram_disable_str && + !(nvram_disabled_states_mask & (1ul << level))) { + prlog(PR_NOTICE, "SLW: Enabling: %s " + "(disabled in OPAL, forced by " + "NVRAM)\n",states[i].name); + } else { + prlog(PR_NOTICE, "SLW: Disabling: %s in OPAL\n", + states[i].name); + continue; + } + } + } + + prlog(PR_INFO, "SLW: Enabling: %s\n", states[i].name); + + /* + * If a state is supported add each of its property + * to its corresponding property buffer. + */ + strncpy(name_buf, states[i].name, MAX_NAME_LEN); + name_buf = name_buf + strlen(states[i].name) + 1; + + *latency_ns_buf = cpu_to_fdt32(states[i].latency_ns); + latency_ns_buf++; + + *residency_ns_buf = cpu_to_fdt32(states[i].residency_ns); + residency_ns_buf++; + + *flags_buf = cpu_to_fdt32(states[i].flags); + flags_buf++; + + *pm_ctrl_reg_val_buf = cpu_to_fdt64(states[i].pm_ctrl_reg_val); + pm_ctrl_reg_val_buf++; + + *pm_ctrl_reg_mask_buf = cpu_to_fdt64(states[i].pm_ctrl_reg_mask); + pm_ctrl_reg_mask_buf++; + + /* Increment buffer length trackers */ + name_buf_len += strlen(states[i].name) + 1; + num_supported_idle_states++; + + } + + /* Point buffer pointers back to beginning of the buffer */ + name_buf -= name_buf_len; + latency_ns_buf -= num_supported_idle_states; + residency_ns_buf -= num_supported_idle_states; + flags_buf -= num_supported_idle_states; + pm_ctrl_reg_val_buf -= num_supported_idle_states; + pm_ctrl_reg_mask_buf -= num_supported_idle_states; + /* Create dt properties with the buffer content */ + dt_add_property(power_mgt, "ibm,cpu-idle-state-names", name_buf, + name_buf_len* sizeof(char)); + dt_add_property(power_mgt, "ibm,cpu-idle-state-latencies-ns", + latency_ns_buf, num_supported_idle_states * sizeof(u32)); + dt_add_property(power_mgt, "ibm,cpu-idle-state-residency-ns", + residency_ns_buf, num_supported_idle_states * sizeof(u32)); + dt_add_property(power_mgt, "ibm,cpu-idle-state-flags", flags_buf, + num_supported_idle_states * sizeof(u32)); + + if (has_stop_inst) { + dt_add_property(power_mgt, "ibm,cpu-idle-state-psscr", + pm_ctrl_reg_val_buf, + num_supported_idle_states * sizeof(u64)); + dt_add_property(power_mgt, "ibm,cpu-idle-state-psscr-mask", + pm_ctrl_reg_mask_buf, + num_supported_idle_states * sizeof(u64)); + } else { + dt_add_property(power_mgt, "ibm,cpu-idle-state-pmicr", + pm_ctrl_reg_val_buf, + num_supported_idle_states * sizeof(u64)); + dt_add_property(power_mgt, "ibm,cpu-idle-state-pmicr-mask", + pm_ctrl_reg_mask_buf, + num_supported_idle_states * sizeof(u64)); + } + assert(alloced_name_buf == name_buf); + free(alloced_name_buf); + free(latency_ns_buf); + free(residency_ns_buf); + free(flags_buf); + free(pm_ctrl_reg_val_buf); + free(pm_ctrl_reg_mask_buf); +} + +static void slw_cleanup_core(struct proc_chip *chip, struct cpu_thread *c) +{ + uint64_t tmp; + int rc; + + /* Display history to check transition */ + rc = xscom_read(chip->id, + XSCOM_ADDR_P8_EX_SLAVE(pir_to_core_id(c->pir), + EX_PM_IDLE_STATE_HISTORY_PHYP), + &tmp); + if (rc) { + log_simple_error(&e_info(OPAL_RC_SLW_GET), + "SLW: Failed to read PM_IDLE_STATE_HISTORY\n"); + /* XXX error handling ? 
return false; */ + } + + prlog(PR_DEBUG, "SLW: core %x:%x history: 0x%016llx (new1)\n", + chip->id, pir_to_core_id(c->pir), tmp); + + rc = xscom_read(chip->id, + XSCOM_ADDR_P8_EX_SLAVE(pir_to_core_id(c->pir), + EX_PM_IDLE_STATE_HISTORY_PHYP), + &tmp); + if (rc) { + log_simple_error(&e_info(OPAL_RC_SLW_GET), + "SLW: Failed to read PM_IDLE_STATE_HISTORY\n"); + /* XXX error handling ? return false; */ + } + + prlog(PR_DEBUG, "SLW: core %x:%x history: 0x%016llx (new2)\n", + chip->id, pir_to_core_id(c->pir), tmp); + + /* + * XXX FIXME: Error out if the transition didn't reach rvwinkle ? + */ + + /* + * XXX FIXME: We should restore a bunch of the EX bits we + * overwrite to sane values here + */ + slw_unset_overrides(chip, c); +} + +static void slw_cleanup_chip(struct proc_chip *chip) +{ + struct cpu_thread *c; + + for_each_available_core_in_chip(c, chip->id) + slw_cleanup_core(chip, c); +} + +static void slw_patch_scans(struct proc_chip *chip, bool le_mode) +{ + int64_t rc; + uint64_t old_val, new_val; + + rc = sbe_xip_get_scalar((void *)chip->slw_base, + "skip_ex_override_ring_scans", &old_val); + if (rc) { + log_simple_error(&e_info(OPAL_RC_SLW_REG), + "SLW: Failed to read scan override on chip %d\n", + chip->id); + return; + } + + new_val = le_mode ? 0 : 1; + + prlog(PR_TRACE, "SLW: Chip %d, LE value was: %lld, setting to %lld\n", + chip->id, old_val, new_val); + + rc = sbe_xip_set_scalar((void *)chip->slw_base, + "skip_ex_override_ring_scans", new_val); + if (rc) { + log_simple_error(&e_info(OPAL_RC_SLW_REG), + "SLW: Failed to set LE mode on chip %d\n", chip->id); + return; + } +} + +int64_t slw_reinit(uint64_t flags) +{ + struct proc_chip *chip; + struct cpu_thread *cpu; + bool has_waker = false; + bool target_le = slw_current_le; + + if (flags & OPAL_REINIT_CPUS_HILE_BE) + target_le = false; + if (flags & OPAL_REINIT_CPUS_HILE_LE) + target_le = true; + + prlog(PR_TRACE, "SLW Reinit from CPU PIR 0x%04x," + " HILE set to %s endian...\n", + this_cpu()->pir, + target_le ? "little" : "big"); + + /* Prepare chips/cores for rvwinkle */ + for_each_chip(chip) { + if (!chip->slw_base) { + log_simple_error(&e_info(OPAL_RC_SLW_INIT), + "SLW: Not found on chip %d\n", chip->id); + return OPAL_HARDWARE; + } + + slw_patch_scans(chip, target_le); + } + slw_current_le = target_le; + + /* XXX Save HIDs ? Or do that in head.S ... */ + + slw_patch_reset(); + + /* rvwinkle everybody and pick one to wake me once I rvwinkle myself */ + for_each_available_cpu(cpu) { + struct cpu_thread *master = NULL; + + if (cpu == this_cpu()) + continue; + + /* Pick up a waker for myself: it must not be a sibling of + * the current CPU and must be a thread 0 (so it gets to + * sync its timebase before doing time_wait_ms() + */ + if (!has_waker && !cpu_is_sibling(cpu, this_cpu()) && + cpu_is_thread0(cpu)) { + has_waker = true; + master = this_cpu(); + } + __cpu_queue_job(cpu, "slw_do_rvwinkle", + slw_do_rvwinkle, master, true); + + /* Wait for it to claim to be down */ + while(cpu->state != cpu_state_rvwinkle) + sync(); + } + + /* XXX Wait one second ! (should check xscom state ? 
) */ + prlog(PR_TRACE, "SLW: Waiting one second...\n"); + time_wait_ms(1000); + prlog(PR_TRACE, "SLW: Done.\n"); + + for_each_chip(chip) { + struct cpu_thread *c; + uint64_t tmp; + for_each_available_core_in_chip(c, chip->id) { + xscom_read(chip->id, + XSCOM_ADDR_P8_EX_SLAVE(pir_to_core_id(c->pir), + EX_PM_IDLE_STATE_HISTORY_PHYP), + &tmp); + prlog(PR_DEBUG, "SLW: core %x:%x" + " history: 0x%016llx (mid)\n", + chip->id, pir_to_core_id(c->pir), tmp); + } + } + + + /* Wake everybody except on my core */ + for_each_cpu(cpu) { + if (cpu->state != cpu_state_rvwinkle || + cpu_is_sibling(cpu, this_cpu())) + continue; + icp_kick_cpu(cpu); + + /* Wait for it to claim to be back (XXX ADD TIMEOUT) */ + while(cpu->state != cpu_state_active) + sync(); + } + + /* Did we find a waker ? If we didn't, that means we had no + * other core in the system, we can't do it + */ + if (!has_waker) { + prlog(PR_TRACE, "SLW: No candidate waker, giving up !\n"); + return OPAL_HARDWARE; + } + + /* Our siblings are rvwinkling, and our waker is waiting for us + * so let's just go down now + */ + slw_do_rvwinkle(NULL); + + slw_unpatch_reset(); + + for_each_chip(chip) + slw_cleanup_chip(chip); + + prlog(PR_TRACE, "SLW Reinit complete !\n"); + + return OPAL_SUCCESS; +} + +static void slw_patch_regs(struct proc_chip *chip) +{ + struct cpu_thread *c; + void *image = (void *)chip->slw_base; + int rc; + + for_each_available_cpu(c) { + if (c->chip_id != chip->id) + continue; + + /* Clear HRMOR */ + rc = p8_pore_gen_cpureg_fixed(image, P8_SLW_MODEBUILD_SRAM, + P8_SPR_HRMOR, 0, + cpu_get_core_index(c), + cpu_get_thread_index(c)); + if (rc) { + log_simple_error(&e_info(OPAL_RC_SLW_REG), + "SLW: Failed to set HRMOR for CPU %x\n", + c->pir); + } + + /* XXX Add HIDs etc... */ + } +} + +static void slw_init_chip_p9(struct proc_chip *chip) +{ + struct cpu_thread *c; + + prlog(PR_DEBUG, "SLW: Init chip 0x%x\n", chip->id); + + /* At power ON setup inits for power-mgt */ + for_each_available_core_in_chip(c, chip->id) + slw_set_overrides_p9(chip, c); + + +} + +static void slw_init_chip_p10(struct proc_chip *chip) +{ + struct cpu_thread *c; + + prlog(PR_DEBUG, "SLW: Init chip 0x%x\n", chip->id); + + /* At power ON setup inits for power-mgt */ + for_each_available_core_in_chip(c, chip->id) + slw_set_overrides_p10(chip, c); + + +} + + +static bool slw_image_check_p9(struct proc_chip *chip) +{ + + if (!chip->homer_base) { + log_simple_error(&e_info(OPAL_RC_SLW_REG), + "SLW: HOMER base not set %x\n", + chip->id); + return false; + } else + return true; + + +} + +static bool slw_image_check_p8(struct proc_chip *chip) +{ + int64_t rc; + + prlog(PR_DEBUG, "SLW: slw_check chip 0x%x\n", chip->id); + if (!chip->slw_base) { + prerror("SLW: No image found !\n"); + return false; + } + + /* Check actual image size */ + rc = sbe_xip_get_scalar((void *)chip->slw_base, "image_size", + &chip->slw_image_size); + if (rc != 0) { + log_simple_error(&e_info(OPAL_RC_SLW_INIT), + "SLW: Error %lld reading SLW image size\n", rc); + /* XXX Panic ? */ + chip->slw_base = 0; + chip->slw_bar_size = 0; + chip->slw_image_size = 0; + return false; + } + prlog(PR_DEBUG, "SLW: Image size from image: 0x%llx\n", + chip->slw_image_size); + + if (chip->slw_image_size > chip->slw_bar_size) { + log_simple_error(&e_info(OPAL_RC_SLW_INIT), + "SLW: Built-in image size larger than BAR size !\n"); + /* XXX Panic ? 
*/ + return false; + } + return true; + +} + +static void slw_late_init_p8(struct proc_chip *chip) +{ + + prlog(PR_DEBUG, "SLW: late Init chip 0x%x\n", chip->id); + + /* Patch SLW image */ + slw_patch_regs(chip); + +} +static void slw_init_chip_p8(struct proc_chip *chip) +{ + struct cpu_thread *c; + + prlog(PR_DEBUG, "SLW: Init chip 0x%x\n", chip->id); + /* At power ON setup inits for fast-sleep */ + for_each_available_core_in_chip(c, chip->id) { + idle_prepare_core(chip, c); + } +} + +/* Workarounds while entering fast-sleep */ + +static void fast_sleep_enter(void) +{ + uint32_t core = pir_to_core_id(this_cpu()->pir); + uint32_t chip_id = this_cpu()->chip_id; + struct cpu_thread *primary_thread; + uint64_t tmp; + int rc; + + primary_thread = this_cpu()->primary; + + rc = xscom_read(chip_id, XSCOM_ADDR_P8_EX(core, L2_FIR_ACTION1), + &tmp); + if (rc) { + prlog(PR_WARNING, "fast_sleep_enter XSCOM failed(1):" + " rc=%d chip_id=%d core=%d\n", + rc, chip_id, core); + return; + } + + primary_thread->save_l2_fir_action1 = tmp; + primary_thread->in_fast_sleep = true; + + tmp = tmp & ~0x0200000000000000ULL; + rc = xscom_write(chip_id, XSCOM_ADDR_P8_EX(core, L2_FIR_ACTION1), + tmp); + if (rc) { + prlog(PR_WARNING, "fast_sleep_enter XSCOM failed(2):" + " rc=%d chip_id=%d core=%d\n", + rc, chip_id, core); + return; + } + rc = xscom_read(chip_id, XSCOM_ADDR_P8_EX(core, L2_FIR_ACTION1), + &tmp); + if (rc) { + prlog(PR_WARNING, "fast_sleep_enter XSCOM failed(3):" + " rc=%d chip_id=%d core=%d\n", + rc, chip_id, core); + return; + } + +} + +/* Workarounds while exiting fast-sleep */ + +void fast_sleep_exit(void) +{ + uint32_t core = pir_to_core_id(this_cpu()->pir); + uint32_t chip_id = this_cpu()->chip_id; + struct cpu_thread *primary_thread; + int rc; + + primary_thread = this_cpu()->primary; + primary_thread->in_fast_sleep = false; + + rc = xscom_write(chip_id, XSCOM_ADDR_P8_EX(core, L2_FIR_ACTION1), + primary_thread->save_l2_fir_action1); + if (rc) { + prlog(PR_WARNING, "fast_sleep_exit XSCOM failed:" + " rc=%d chip_id=%d core=%d\n", + rc, chip_id, core); + return; + } +} + +/* + * Setup and cleanup method for fast-sleep workarounds + * state = 1 fast-sleep + * enter = 1 Enter state + * exit = 0 Exit state + */ + +static int64_t opal_config_cpu_idle_state(uint64_t state, uint64_t enter) +{ + /* Only fast-sleep for now */ + if (state != 1) + return OPAL_PARAMETER; + + switch(enter) { + case 1: + fast_sleep_enter(); + break; + case 0: + fast_sleep_exit(); + break; + default: + return OPAL_PARAMETER; + } + + return OPAL_SUCCESS; +} + +opal_call(OPAL_CONFIG_CPU_IDLE_STATE, opal_config_cpu_idle_state, 2); + +int64_t opal_slw_set_reg(uint64_t cpu_pir, uint64_t sprn, uint64_t val) +{ + + struct cpu_thread *c = find_cpu_by_pir(cpu_pir); + struct proc_chip *chip; + int rc; + + if (!c) { + prerror("SLW: Unknown thread with pir %x\n", (u32) cpu_pir); + return OPAL_PARAMETER; + } + + chip = get_chip(c->chip_id); + if (!chip) { + prerror("SLW: Unknown chip for thread with pir %x\n", + (u32) cpu_pir); + return OPAL_PARAMETER; + } + + if (proc_gen >= proc_gen_p9) { + if (!has_deep_states) { + prlog(PR_INFO, "SLW: Deep states not enabled\n"); + return OPAL_SUCCESS; + } + + if (wakeup_engine_state != WAKEUP_ENGINE_PRESENT) { + log_simple_error(&e_info(OPAL_RC_SLW_REG), + "SLW: wakeup_engine in bad state=%d chip=%x\n", + wakeup_engine_state,chip->id); + return OPAL_INTERNAL_ERROR; + } + if (proc_gen == proc_gen_p9) { + rc = p9_stop_save_cpureg((void *)chip->homer_base, + sprn, val, cpu_pir); + } else { + rc = 
proc_stop_save_cpureg((void *)chip->homer_base, + sprn, val, cpu_pir); + } + + } else if (proc_gen == proc_gen_p8) { + int spr_is_supported = 0; + void *image; + int i; + + /* Check of the SPR is supported by libpore */ + for (i = 0; i < SLW_SPR_REGS_SIZE ; i++) { + if (sprn == SLW_SPR_REGS[i].value) { + spr_is_supported = 1; + break; + } + } + if (!spr_is_supported) { + log_simple_error(&e_info(OPAL_RC_SLW_REG), + "SLW: Trying to set unsupported spr for CPU %x\n", + c->pir); + return OPAL_UNSUPPORTED; + } + image = (void *)chip->slw_base; + rc = p8_pore_gen_cpureg_fixed(image, P8_SLW_MODEBUILD_SRAM, + sprn, val, + cpu_get_core_index(c), + cpu_get_thread_index(c)); + } else { + log_simple_error(&e_info(OPAL_RC_SLW_REG), + "SLW: proc_gen not supported\n"); + return OPAL_UNSUPPORTED; + + } + + if (rc) { + log_simple_error(&e_info(OPAL_RC_SLW_REG), + "SLW: Failed to set spr %llx for CPU %x, RC=0x%x\n", + sprn, c->pir, rc); + return OPAL_INTERNAL_ERROR; + } + prlog(PR_DEBUG, "SLW: restore spr:0x%llx on c:0x%x with 0x%llx\n", + sprn, c->pir, val); + return OPAL_SUCCESS; + +} + +opal_call(OPAL_SLW_SET_REG, opal_slw_set_reg, 3); + +void slw_init(void) +{ + struct proc_chip *chip; + + if (proc_chip_quirks & QUIRK_MAMBO_CALLOUTS) { + wakeup_engine_state = WAKEUP_ENGINE_NOT_PRESENT; + add_cpu_idle_state_properties(); + return; + } + if (proc_gen == proc_gen_p8) { + for_each_chip(chip) { + slw_init_chip_p8(chip); + if(slw_image_check_p8(chip)) + wakeup_engine_state = WAKEUP_ENGINE_PRESENT; + if (wakeup_engine_state == WAKEUP_ENGINE_PRESENT) + slw_late_init_p8(chip); + } + p8_sbe_init_timer(); + } else if (proc_gen == proc_gen_p9) { + for_each_chip(chip) { + slw_init_chip_p9(chip); + if(slw_image_check_p9(chip)) + wakeup_engine_state = WAKEUP_ENGINE_PRESENT; + if (wakeup_engine_state == WAKEUP_ENGINE_PRESENT) + slw_late_init_p9(chip); + } + } else if (proc_gen == proc_gen_p10) { + for_each_chip(chip) { + slw_init_chip_p10(chip); + if(slw_image_check_p9(chip)) + wakeup_engine_state = WAKEUP_ENGINE_PRESENT; + if (wakeup_engine_state == WAKEUP_ENGINE_PRESENT) { + slw_late_init_p10(chip); + } + } + } + add_cpu_idle_state_properties(); +} diff --git a/roms/skiboot/hw/test/Makefile.check b/roms/skiboot/hw/test/Makefile.check new file mode 100644 index 000000000..45eb8072f --- /dev/null +++ b/roms/skiboot/hw/test/Makefile.check @@ -0,0 +1,29 @@ +# -*-Makefile-*- +SUBDIRS += hw/test/ +HW_TEST := hw/test/phys-map-test hw/test/run-port80h + +.PHONY : hw-check +hw-check: $(HW_TEST:%=%-check) + +.PHONY : hw-coverage +hw-coverage: $(HW_TEST:%=%-gcov-run) + +check: hw-check +coverage: hw-coverage + +$(HW_TEST:%=%-gcov-run) : %-run: % + $(call QTEST, TEST-COVERAGE ,$< , $<) + +$(HW_TEST:%=%-check) : %-check: % + $(call QTEST, RUN-TEST ,$(VALGRIND) $<, $<) + +$(HW_TEST) : % : %.c hw/phys-map.o + $(call Q, HOSTCC ,$(HOSTCC) $(HOSTCFLAGS) -O0 -g -I include -I . -o $@ $<, $<) + +$(HW_TEST:%=%-gcov): %-gcov : %.c % + $(call QTEST, HOSTCC ,$(HOSTCC) $(HOSTCFLAGS) $(HOSTGCOVCFLAGS) -I include -I . -lgcov -o $@ $<, $<) + +clean: hw-clean + +hw-clean: + $(RM) -f hw/test/*.[od] $(HW_TEST) $(HW_TEST:%=%-gcov) diff --git a/roms/skiboot/hw/test/phys-map-test.c b/roms/skiboot/hw/test/phys-map-test.c new file mode 100644 index 000000000..d507175fe --- /dev/null +++ b/roms/skiboot/hw/test/phys-map-test.c @@ -0,0 +1,203 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* + * Physical memory map test + * + * Copyright 2013-2017 IBM Corp. 
+ */ + +#include "../../core/test/stubs.c" +#include "../phys-map.c" + +enum proc_gen proc_gen; + +static inline void print_entry(const struct phys_map_entry *e) +{ + printf("type:%i index:%i addr:%016lx size:%016lx", + e->type, e->index, e->addr, e->size); +} + +/* Check table directly for overlaps */ +static void check_table_directly(void) +{ + const struct phys_map_entry *e, *prev; + uint64_t start, end, pstart, pend; + bool passed; + + /* Loop over table entries ... */ + for (e = phys_map->table; !phys_map_entry_null(e); e++) { + + start = e->addr; + end = e->addr + e->size; + /* ... see if they overlap with previous entries */ + for (prev = phys_map->table; prev != e; prev++) { + passed = true; + /* Check for overlaping regions */ + pstart = prev->addr; + pend = prev->addr + prev->size; + if ((start > pstart) && (start < pend)) + passed = false; + if ((end > pstart) && (end < pend)) + passed = false; + + /* Check for duplicate entries */ + if ((e->type == prev->type) && + (e->index == prev->index)) + passed = false; + + if (passed) + continue; + + printf("Phys map direct test FAILED: Entry overlaps\n"); + printf("First: "); + print_entry(prev); + printf("\n"); + printf("Second: "); + print_entry(e); + printf("\n"); + assert(0); + } + } +} + +struct map_call_entry { + uint64_t start; + uint64_t end; +}; + +static inline bool map_call_entry_null(const struct map_call_entry *t) +{ + if ((t->start == 0) && + (t->end == 0)) + return true; + return false; +} + +/* Check calls to map to see if they overlap. + * Creates a new table for each of the entries it gets to check against + */ + +/* Pick a chip ID, any ID. */ +#define FAKE_CHIP_ID 8 + +struct proc_chip *get_chip(uint32_t chip_id __unused) +{ + return NULL; +} + +static void check_map_call(void) +{ + uint64_t start, size, end; + const struct phys_map_entry *e; + struct map_call_entry *tbl, *t, *tnext; + int tbl_size = 0; + bool passed; + + for (e = phys_map->table; !phys_map_entry_null(e); e++) + tbl_size++; + + tbl_size++; /* allow for null entry at end */ + tbl_size *= sizeof(struct map_call_entry); + tbl = malloc(tbl_size); + assert(tbl != NULL); + memset(tbl, 0, tbl_size); + + /* Loop over table entries ... */ + for (e = phys_map->table; !phys_map_entry_null(e); e++) { + __phys_map_get(FAKE_CHIP_ID, FAKE_CHIP_ID, e->type, e->index, &start, &size); + + /* Check for alignment */ + if ((e->type != SYSTEM_MEM) && (e->type != RESV)) { + /* Size is power of 2? */ + assert(__builtin_popcountl(size) == 1); + /* Start is aligned to size? 
*/ + assert((start % size) == 0); + } + + end = start + size; + for (t = tbl; !map_call_entry_null(t); t++) { + passed = true; + + /* Check for overlaping regions */ + if ((start > t->start) && (start < t->end)) + passed = false; + if ((end > t->start) && (end < t->end)) + passed = false; + + if (passed) + continue; + + printf("Phys map call test FAILED: Entry overlaps\n"); + printf("First: addr:%016lx size:%016lx\n", + t->start, t->end - t->start); + printf("Second: addr:%016lx size:%016lx\n ", + start, size); + print_entry(e); + printf("\n"); + assert(0); + } + /* Insert entry at end of table */ + t->start = start; + t->end = end; + } + + for (t = tbl; !map_call_entry_null(t + 1); t++) { + tnext = t + 1; + /* Make sure the table is sorted */ + if (t->start > tnext->start) { + printf("Phys map test FAILED: Entry not sorted\n"); + printf("First: addr:%016lx size:%016lx\n", + t->start, t->end - t->start); + printf("Second: addr:%016lx size:%016lx\n", + tnext->start, tnext->end - tnext->start); + assert(0); + } + + /* Look for holes in the table in MMIO region */ + /* We assume over 1PB is MMIO. */ + if ((t->end != tnext->start) && + (t->start > 0x0004000000000000)) { + printf("Phys map test FAILED: Hole in map\n"); + printf("First: addr:%016lx size:%016lx\n", + t->start, t->end - t->start); + printf("Second: addr:%016lx size:%016lx\n", + tnext->start, tnext->end - tnext->start); + assert(0); + } + } + + free(tbl); +} + +/* Fake PVR definitions. See include/processor.h */ +unsigned long fake_pvr[] = { + 0x004e0200, /* PVR_P9 */ + 0x004f0100, /* PVR_P9P */ + 0x00800100, /* PVR_P10 */ +}; + +int main(void) +{ + for (int i = 0; i < ARRAY_SIZE(fake_pvr); i++) { + switch(PVR_TYPE(fake_pvr[i])) { + case PVR_TYPE_P9: + case PVR_TYPE_P9P: + proc_gen = proc_gen_p9; + break; + case PVR_TYPE_P10: + proc_gen = proc_gen_p10; + break; + default: + printf("Unknown PVR 0x%lx\n", fake_pvr[i]); + return 1; + break; + } + + phys_map_init(fake_pvr[i]); + + /* Run tests */ + check_table_directly(); + check_map_call(); + } + + return(0); +} diff --git a/roms/skiboot/hw/test/run-port80h.c b/roms/skiboot/hw/test/run-port80h.c new file mode 100644 index 000000000..860a4244d --- /dev/null +++ b/roms/skiboot/hw/test/run-port80h.c @@ -0,0 +1,99 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* + * Test result of our LPC port 80h boot progress code + * + * Copyright 2018-2019 IBM Corp. 
+ */ + +#include <stdio.h> +#include <stdarg.h> +#include <stdint.h> +#include <assert.h> + +#define __unused __attribute__((unused)) + +#define __LPC_H + +uint8_t port80; +uint16_t port8x; + +static int64_t lpc_probe_write(int addr_type __unused, uint32_t addr, + uint32_t data, uint32_t sz) +{ + assert((addr - 0x80) <= 2); + assert(sz == 1); + if (addr == 0x80) + port80 = data; + if (addr == 0x81) + port8x = data << 8 | (port8x & 0xff); + if (addr == 0x82) + port8x = (port8x & 0xff00) | data; + return 0; +} + +#include "op-panel.h" + +void op_display_lpc(enum op_severity s, enum op_module m, uint16_t c); + +#include "../lpc-port80h.c" +#include "../../core/test/stubs.c" + +enum proc_chip_quirks proc_chip_quirks; + +int main(void) +{ + op_display_lpc(OP_LOG, OP_MOD_INIT, 0x00); + assert(port80 == 0x80); + assert(port8x == 0x8000); + op_display_lpc(OP_WARN, OP_MOD_INIT, 0x00); + assert(port80 == 0x82); + assert(port8x == 0x8002); + op_display_lpc(OP_ERROR, OP_MOD_INIT, 0x00); + assert(port80 == 0x81); + assert(port8x == 0x8001); + op_display_lpc(OP_FATAL, OP_MOD_INIT, 0x00); + assert(port80 == 0x83); + assert(port8x == 0x8003); + op_display_lpc(OP_FATAL, OP_MOD_INIT, 0x0f); + assert(port80 == 0xBF); + assert(port8x == 0x803F); + op_display_lpc(OP_LOG, OP_MOD_INIT, 0x0f); + assert(port80 == 0xBC); + assert(port8x == 0x803C); + op_display_lpc(OP_FATAL, OP_MOD_CORE, 0x6666); + assert(port80 == 0xBF); + assert(port8x == 0x803F); + op_display_lpc(OP_LOG, OP_MOD_INIT, 0x01); + assert(port80 == 0x84); + assert(port8x == 0x8004); + op_display_lpc(OP_LOG, OP_MOD_CPU, 0x05); + assert(port80 == 0xC4); + assert(port8x == 0xC014); + op_display_lpc(OP_LOG, OP_MOD_LOCK, 0x07); + assert(port80 == 0xDC); + assert(port8x == 0xD01C); + op_display_lpc(OP_FATAL, OP_MOD_LOCK, 0x07); + assert(port80 == 0xDF); + assert(port8x == 0xD01F); + op_display_lpc(OP_FATAL, OP_MOD_MEM, 0x07); + assert(port80 == 0xEF); + assert(port8x == 0xE01F); + op_display_lpc(OP_WARN, OP_MOD_MEM, 0x02); + assert(port80 == 0xEA); + assert(port8x == 0xE00A); + op_display_lpc(OP_WARN, OP_MOD_CHIPTOD, 0x02); + assert(port80 == 0xFA); + assert(port8x == 0xF00A); + + /* + * We can't assert that OP_MOD_FSP is invalid as we'd end up + * trying to set port80 in the assert parth + */ + op_display_lpc(OP_LOG, OP_MOD_FSP, 0x00); + assert(port80 == 0x80); + assert(port8x == 0x8000); + op_display_lpc(OP_LOG, OP_MOD_FSPCON, 0x00); + assert(port80 == 0x80); + assert(port8x == 0x8000); + return 0; +} diff --git a/roms/skiboot/hw/vas.c b/roms/skiboot/hw/vas.c new file mode 100644 index 000000000..0dbe0bcda --- /dev/null +++ b/roms/skiboot/hw/vas.c @@ -0,0 +1,639 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* Copyright 2013-2018 IBM Corp. */ + +#include <skiboot.h> +#include <chip.h> +#include <phys-map.h> +#include <xscom.h> +#include <io.h> +#include <xive.h> +#include <interrupts.h> +#include <nvram.h> +#include <vas.h> + +#define vas_err(__fmt,...) prlog(PR_ERR,"VAS: " __fmt, ##__VA_ARGS__) + +#ifdef VAS_VERBOSE_DEBUG +#define vas_vdbg(__x,__fmt,...) prlog(PR_DEBUG,"VAS: " __fmt, ##__VA_ARGS__) +#else +#define vas_vdbg(__x,__fmt,...) 
do { } while (0) +#endif + +static int vas_initialized; + +struct vas { + uint32_t chip_id; + uint32_t vas_id; + uint64_t xscom_base; + uint64_t wcbs; + uint32_t vas_irq; + uint64_t vas_port; +}; + +static inline void get_hvwc_mmio_bar(int chipid, uint64_t *start, uint64_t *len) +{ + phys_map_get(chipid, VAS_HYP_WIN, 0, start, len); +} + +static inline void get_uwc_mmio_bar(int chipid, uint64_t *start, uint64_t *len) +{ + phys_map_get(chipid, VAS_USER_WIN, 0, start, len); +} + +static inline uint64_t compute_vas_scom_addr(struct vas *vas, uint64_t reg) +{ + return vas->xscom_base + reg; +} + +static int vas_scom_write(struct proc_chip *chip, uint64_t reg, uint64_t val) +{ + int rc; + uint64_t addr; + + addr = compute_vas_scom_addr(chip->vas, reg); + + rc = xscom_write(chip->id, addr, val); + if (rc != OPAL_SUCCESS) { + vas_err("Error writing 0x%llx to 0x%llx, rc %d\n", val, addr, + rc); + } + + return rc; +} + +/* + * Return true if NX crypto/compression is enabled on this processor. + * + * On POWER8, NX-842 crypto and compression are allowed, but they do not + * use VAS (return true). + * + * On POWER9, NX 842 and GZIP compressions use VAS but the PASTE instruction + * and hence VAS is not enabled in following revisions: + * + * - Nimbus DD1.X, DD2.01, DD2.1 + * - Cumulus DD1.0 + * + * Return false for these revisions. Return true otherwise. + */ +__attrconst inline bool vas_nx_enabled(void) +{ + uint32_t pvr; + int major, minor; + struct proc_chip *chip; + + chip = next_chip(NULL); + + pvr = mfspr(SPR_PVR); + major = PVR_VERS_MAJ(pvr); + minor = PVR_VERS_MIN(pvr); + + switch (chip->type) { + case PROC_CHIP_P9_NIMBUS: + return (major > 2 || (major == 2 && minor > 1)); + case PROC_CHIP_P9_CUMULUS: + return (major > 1 || minor > 0); + default: + return true; + } +} + +/* Interface for NX - make sure VAS is fully initialized first */ +__attrconst inline uint64_t vas_get_hvwc_mmio_bar(const int chipid) +{ + uint64_t addr; + + if (!vas_initialized) + return 0ULL; + + get_hvwc_mmio_bar(chipid, &addr, NULL); + + return addr; +} + +/* Interface for NX - make sure VAS is fully initialized first */ +__attrconst uint64_t vas_get_wcbs_bar(int chipid) +{ + struct proc_chip *chip; + + if (!vas_initialized) + return 0ULL; + + chip = get_chip(chipid); + if (!chip) + return 0ULL; + + return chip->vas->wcbs; +} + +static int init_north_ctl(struct proc_chip *chip) +{ + uint64_t val = 0ULL; + + val = SETFIELD(VAS_64K_MODE_MASK, val, true); + val = SETFIELD(VAS_ACCEPT_PASTE_MASK, val, true); + val = SETFIELD(VAS_ENABLE_WC_MMIO_BAR, val, true); + val = SETFIELD(VAS_ENABLE_UWC_MMIO_BAR, val, true); + val = SETFIELD(VAS_ENABLE_RMA_MMIO_BAR, val, true); + + return vas_scom_write(chip, VAS_MISC_N_CTL, val); +} + +/* + * Ensure paste instructions are not accepted and MMIO BARs are disabled. 
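+ * Writing 0 to VAS_MISC_N_CTL drops every enable bit that
+ * init_north_ctl() sets (64K mode, paste accept, and the WC/UWC/RMA
+ * MMIO BARs).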
+ */ +static inline int reset_north_ctl(struct proc_chip *chip) +{ + return vas_scom_write(chip, VAS_MISC_N_CTL, 0ULL); +} + +static void reset_fir(struct proc_chip *chip) +{ + vas_scom_write(chip, VAS_FIR0, 0x0000000000000000ULL); + /* From VAS workbook */ + vas_scom_write(chip, VAS_FIR_MASK, 0x000001000001ffffULL); + vas_scom_write(chip, VAS_FIR_ACTION0, 0xf800fdfc0001ffffull); + vas_scom_write(chip, VAS_FIR_ACTION1, 0xf8fffefffffc8000ull); +} + +/* VAS workbook: Section 1.3.3.1: Send Message w/ Paste Commands (cl_rma_w) */ +/* P9 paste base address format */ +#define P9_RMA_LSMP_64K_SYS_ID PPC_BITMASK(8, 12) +#define P9_RMA_LSMP_64K_NODE_ID PPC_BITMASK(15, 18) +#define P9_RMA_LSMP_64K_CHIP_ID PPC_BITMASK(19, 21) + +/* Paste base address format (on P10 or later) */ +#define RMA_FOREIGN_ADDR_ENABLE PPC_BITMASK(8, 11) +#define RMA_TOPOLOGY_INDEX PPC_BITMASK(15, 19) + +#define RMA_LSMP_WINID_START_BIT 32 +#define RMA_LSMP_WINID_NUM_BITS 16 + +/* + * The start/base of the paste BAR is computed using the tables 1.1 through + * 1.4 in Section 1.3.3.1 (Send Message w/Paste Commands (cl_rma_w)) of VAS + * P9 Workbook. + * + * With 64K mode and Large SMP Mode the bits are used as follows: + * + * Bits Values Comments + * -------------------------------------- + * 0:7 0b 0000_0000 Reserved + * 8:12 0b 0000_1 System id/Foreign Index 0:4 + * 13:14 0b 00 Foreign Index 5:6 + * + * 15:18 0 throuh 15 Node id (0 through 15) + * 19:21 0 through 7 Chip id (0 throuh 7) + * 22:23 0b 00 Unused, Foreign index 7:8 + * + * 24:31 0b 0000_0000 RPN 0:7, Reserved + * 32:47 0 through 64K Send Window Id + * 48:51 0b 0000 Spare + * + * 52 0b 0 Reserved + * 53 0b 1 Report Enable (Set to 1 for NX). + * 54 0b 0 Reserved + * + * 55:56 0b 00 Snoop Bus + * 57:63 0b 0000_000 Reserved + * + * Except for a few bits, the small SMP mode computation is similar. + * + * TODO: Detect and compute address for small SMP mode. + * + * Example: For Node 0, Chip 0, Window id 4, Report Enable 1: + * + * Byte0 Byte1 Byte2 Byte3 Byte4 Byte5 Byte6 Byte7 + * 00000000 00001000 00000000 00000000 00000000 00000100 00000100 00000000 + * | || | | | | + * +-+-++++ +-------+-------+ v + * | | | Report Enable + * v v v + * Node Chip Window id 4 + * + * Thus the paste address for window id 4 is 0x00080000_00040400 and + * the _base_ paste address for Node 0 Chip 0 is 0x00080000_00000000. + */ + +static void p9_get_rma_bar(int chipid, uint64_t *val) +{ + uint64_t v; + + v = 0ULL; + v = SETFIELD(P9_RMA_LSMP_64K_SYS_ID, v, 1); + v = SETFIELD(P9_RMA_LSMP_64K_NODE_ID, v, P9_GCID2NODEID(chipid)); + v = SETFIELD(P9_RMA_LSMP_64K_CHIP_ID, v, P9_GCID2CHIPID(chipid)); + + *val = v; +} + +/* + * The start/base of the paste BAR is computed using the tables 1.1 through + * 1.3 in Section 1.3.3.1 (Send Message w/Paste Commands (cl_rma_w)) of VAS + * P10 Workbook. + * + * With 64K mode and Large SMP Mode the bits are used as follows: + * + * Bits Values Comments + * -------------------------------------- + * 0:7 0b 0000_0000 Reserved + * 8:11 0b 0001 Foreign Address Enable + * 12 0b 0 SMF + * 13:14 0b 00 Memory Select + * + * 15:19 0 throuh 16 Topology Index + * 20:23 0b 0000 Chip Internal Address + * + * 24:31 0b 0000_0000 RPN 0:7, Reserved + * 32:47 0 through 64K Send Window Id + * 48:51 0b 0000 Spare + * + * 52 0b 0 Reserved + * 53 0b 1 Report Enable (Set to 1 for NX). 
+ * 54 0b 0 Reserved + * + * 55:56 0b 00 Snoop Bus + * 57:63 0b 0000_000 Reserved + * + * Example: For Node 0, Chip 0, Window id 4, Report Enable 1: + * + * Byte0 Byte1 Byte2 Byte3 Byte4 Byte5 Byte6 Byte7 + * 00000000 00010000 00000000 00000000 00000000 00000100 00000100 00000000 + * | | | | | + * +---+ +-------+-------+ v + * | | Report Enable + * v v + * Topology Index Window id 4 + * + * Thus the paste address for window id 4 is 0x00100000_00040400 and + * the _base_ paste address for Node 0 Chip 0 is 0x00100000_00000000. + * + * Note: Bit 11 (Foreign Address Enable) is set only for paste base address. + * Not for VAS/NX RMA BAR. RA(0:12) = 0 for VAS/NX RMA BAR. + */ + +static void get_rma_bar(struct proc_chip *chip, uint64_t *val) +{ + uint64_t v; + + v = 0ULL; + v = SETFIELD(RMA_TOPOLOGY_INDEX, v, chip->primary_topology); + + *val = v; +} + +/* Interface for NX - make sure VAS is fully initialized first */ +__attrconst uint64_t vas_get_rma_bar(int chipid) +{ + struct proc_chip *chip; + uint64_t addr; + + if (!vas_initialized) + return 0ULL; + + chip = get_chip(chipid); + if (!chip) + return 0ULL; + + get_rma_bar(chip, &addr); + + return addr; +} + +/* + * Initialize RMA BAR on this chip to correspond to its node/chip id. + * This will cause VAS to accept paste commands to targeted for this chip. + * Initialize RMA Base Address Mask Register (BAMR) to its default value. + */ +static int init_rma(struct proc_chip *chip) +{ + int rc; + uint64_t val; + + if (proc_gen == proc_gen_p9) + p9_get_rma_bar(chip->id, &val); + else + get_rma_bar(chip, &val); + + rc = vas_scom_write(chip, VAS_RMA_BAR, val); + if (rc) + return rc; + + val = SETFIELD(VAS_RMA_BAMR_ADDR_MASK, 0ULL, 0xFFFC0000000ULL); + + return vas_scom_write(chip, VAS_RMA_BAMR, val); +} + +/* + * get_paste_bar(): + * + * Compute and return the "paste base address region" for @chipid. This + * BAR contains the "paste" addreses for all windows on the chip. Linux + * uses this paste BAR to compute the hardware paste address of a (send) + * window using: + * + * paste_addr = base + (winid << shift) + * + * where winid is the window index and shift is computed as: + * + * start = RMA_LSMP_WINID_START_BIT; + * nbits = RMA_LSMP_WINID_NUM_BITS; + * shift = 63 - (start + nbits - 1); + * + * See also get_paste_bitfield() below, which is used to export the 'start' + * and 'nbits' to Linux through the DT. + * + * Each chip supports VAS_WINDOWS_PER_CHIP (64K on Power9) windows. To + * provide proper isolation, the paste address for each window is on a + * separate page. Thus with a page size of 64K, the length of the paste + * BAR for a chip is VAS_WINDOWS_PER_CHIP times 64K (or 4GB for Power9). + * + */ +#define VAS_PASTE_BAR_LEN (1ULL << 32) /* 4GB - see above */ + +static inline void get_paste_bar(int chipid, uint64_t *start, uint64_t *len) +{ + struct proc_chip *chip; + uint64_t val; + + if (proc_gen == proc_gen_p9) + p9_get_rma_bar(chipid, &val); + else { + chip = get_chip(chipid); + if (!chip) + return; + + get_rma_bar(chip, &val); + + /* + * RA(11) (Foreign Address Enable) is set only for paste + * base address. + */ + val = SETFIELD(RMA_FOREIGN_ADDR_ENABLE, val, 1); + } + + *start = val; + *len = VAS_PASTE_BAR_LEN; +} + +/* + * get_paste_bitfield(): + * + * As explained in the function header for get_paste_bar(), the window + * id is encoded in bits 32:47 of the paste address. Export this bitfield + * to Linux via the device tree as a reg property (with start bit and + * number of bits). 
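+ * Worked through with the values below (start bit 32, 16 bits), the
+ * kernel-side shift from the get_paste_bar() comment comes to
+ * 63 - (32 + 16 - 1) = 16, i.e. paste_addr = base + (winid << 16):
+ * one 64K page per send window, as described above.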
+ */ +static inline void get_paste_bitfield(uint64_t *start, uint64_t *n_bits) +{ + *start = (uint64_t)RMA_LSMP_WINID_START_BIT; + *n_bits = (uint64_t)RMA_LSMP_WINID_NUM_BITS; +} + +/* + * Window Context MMIO (WCM) Region for each chip is assigned in the P9 + * MMIO MAP spreadsheet. Write this value to the SCOM address associated + * with WCM_BAR. + */ +static int init_wcm(struct proc_chip *chip) +{ + uint64_t wcmbar; + + get_hvwc_mmio_bar(chip->id, &wcmbar, NULL); + + /* + * Write the entire WCMBAR address to the SCOM address. VAS will + * extract bits that it thinks are relevant i.e bits 8..38 + */ + return vas_scom_write(chip, VAS_WCM_BAR, wcmbar); +} + +/* + * OS/User Window Context MMIO (UWCM) Region for each is assigned in the + * P9 MMIO MAP spreadsheet. Write this value to the SCOM address associated + * with UWCM_BAR. + */ +static int init_uwcm(struct proc_chip *chip) +{ + uint64_t uwcmbar; + + get_uwc_mmio_bar(chip->id, &uwcmbar, NULL); + + /* + * Write the entire UWCMBAR address to the SCOM address. VAS will + * extract bits that it thinks are relevant i.e bits 8..35. + */ + return vas_scom_write(chip, VAS_UWCM_BAR, uwcmbar); +} + +static inline void free_wcbs(struct proc_chip *chip) +{ + if (chip->vas->wcbs) { + free((void *)chip->vas->wcbs); + chip->vas->wcbs = 0ULL; + } +} + +/* + * VAS needs a backing store for the 64K window contexts on a chip. + * (64K times 512 = 8MB). This region needs to be contiguous, so + * allocate during early boot. Then write the allocated address to + * the SCOM address for the Backing store BAR. + */ +static int alloc_init_wcbs(struct proc_chip *chip) +{ + int rc; + uint64_t wcbs; + size_t size; + + /* align to the backing store size */ + size = (size_t)VAS_WCBS_SIZE; + wcbs = (uint64_t)local_alloc(chip->id, size, size); + if (!wcbs) { + vas_err("Unable to allocate memory for backing store\n"); + return -ENOMEM; + } + memset((void *)wcbs, 0ULL, size); + + /* + * Write entire WCBS_BAR address to the SCOM address. VAS will extract + * relevant bits. 
+ */ + rc = vas_scom_write(chip, VAS_WCBS_BAR, wcbs); + if (rc != OPAL_SUCCESS) + goto out; + + chip->vas->wcbs = wcbs; + return OPAL_SUCCESS; + +out: + free((void *)wcbs); + return rc; +} + +static struct vas *alloc_vas(uint32_t chip_id, uint32_t vas_id, uint64_t base) +{ + struct vas *vas; + + vas = zalloc(sizeof(struct vas)); + assert(vas); + + vas->chip_id = chip_id; + vas->vas_id = vas_id; + vas->xscom_base = base; + + return vas; +} + +static void create_mm_dt_node(struct proc_chip *chip) +{ + struct dt_node *dn; + struct vas *vas; + const char *compat; + uint64_t hvwc_start, hvwc_len; + uint64_t uwc_start, uwc_len; + uint64_t pbf_start, pbf_nbits; + uint64_t pbar_start = 0, pbar_len = 0; + + vas = chip->vas; + get_hvwc_mmio_bar(chip->id, &hvwc_start, &hvwc_len); + get_uwc_mmio_bar(chip->id, &uwc_start, &uwc_len); + get_paste_bar(chip->id, &pbar_start, &pbar_len); + get_paste_bitfield(&pbf_start, &pbf_nbits); + + if (proc_gen == proc_gen_p9) + compat = "ibm,power9-vas"; + else + compat = "ibm,power10-vas"; + + dn = dt_new_addr(dt_root, "vas", hvwc_start); + + dt_add_property_strings(dn, "compatible", compat, + "ibm,vas"); + + dt_add_property_u64s(dn, "reg", hvwc_start, hvwc_len, + uwc_start, uwc_len, + pbar_start, pbar_len, + pbf_start, pbf_nbits); + + dt_add_property_cells(dn, "ibm,vas-id", vas->vas_id); + dt_add_property_cells(dn, "ibm,chip-id", chip->id); + if (vas->vas_irq) { + dt_add_property_cells(dn, "interrupts", vas->vas_irq, 0); + dt_add_property_cells(dn, "interrupt-parent", + get_ics_phandle()); + dt_add_property_u64(dn, "ibm,vas-port", vas->vas_port); + } +} + +/* + * Disable one VAS instance. + * + * Free memory and ensure chip does not accept paste instructions. + */ +static void disable_vas_inst(struct dt_node *np) +{ + struct proc_chip *chip; + + chip = get_chip(dt_get_chip_id(np)); + + if (!chip->vas) + return; + + free_wcbs(chip); + + reset_north_ctl(chip); +} + +static void vas_setup_irq(struct proc_chip *chip) +{ + uint64_t port; + uint32_t irq; + + irq = xive_alloc_ipi_irqs(chip->id, 1, 64); + if (irq == XIVE_IRQ_ERROR) { + vas_err("Failed to allocate interrupt sources for chipID %d\n", + chip->id); + return; + } + + vas_vdbg("trigger port: 0x%p\n", xive_get_trigger_port(irq)); + + port = (uint64_t)xive_get_trigger_port(irq); + + chip->vas->vas_irq = irq; + chip->vas->vas_port = port; +} + +/* + * Initialize one VAS instance and enable it if @enable is true. + */ +static int init_vas_inst(struct dt_node *np, bool enable) +{ + uint32_t vas_id; + uint64_t xscom_base; + struct proc_chip *chip; + + chip = get_chip(dt_get_chip_id(np)); + vas_id = dt_prop_get_u32(np, "ibm,vas-id"); + xscom_base = dt_get_address(np, 0, NULL); + + chip->vas = alloc_vas(chip->id, vas_id, xscom_base); + + if (!enable) { + reset_north_ctl(chip); + return 0; + } + + if (alloc_init_wcbs(chip)) + return -1; + + reset_fir(chip); + + if (init_wcm(chip) || init_uwcm(chip) || init_north_ctl(chip) || + init_rma(chip)) + return -1; + + /* + * Use NVRAM 'vas-user-space' config for backward compatibility + * to older kernels. Remove this option in future if not needed. 
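+ *
+ * For illustration only: on a running system this would typically be
+ * set from the host with something like
+ *   nvram -p ibm,skiboot --update-config vas-user-space=enable
+ * (exact tooling may differ; see the skiboot NVRAM documentation).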
+ */ + if (nvram_query_eq_dangerous("vas-user-space", "enable")) + vas_setup_irq(chip); + + create_mm_dt_node(chip); + + prlog(PR_INFO, "VAS: Initialized chip %d\n", chip->id); + return 0; + +} + +void vas_init(void) +{ + bool enabled; + struct dt_node *np; + const char *compat; + + if (proc_gen == proc_gen_p9) + compat = "ibm,power9-vas-x"; + else if (proc_gen == proc_gen_p10) + compat = "ibm,power10-vas-x"; + else + return; + + enabled = vas_nx_enabled(); + + dt_for_each_compatible(dt_root, np, compat) { + if (init_vas_inst(np, enabled)) + goto out; + } + + vas_initialized = enabled; + return; + +out: + dt_for_each_compatible(dt_root, np, compat) + disable_vas_inst(np); + + vas_err("Disabled (failed initialization)\n"); + return; +} diff --git a/roms/skiboot/hw/xive.c b/roms/skiboot/hw/xive.c new file mode 100644 index 000000000..51b03549a --- /dev/null +++ b/roms/skiboot/hw/xive.c @@ -0,0 +1,5234 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* + * XIVE: eXternal Interrupt Virtualization Engine. POWER9 interrupt + * controller + * + * Copyright (c) 2016-2019, IBM Corporation. + */ + +#include <skiboot.h> +#include <xscom.h> +#include <chip.h> +#include <io.h> +#include <xive.h> +#include <xive-p9-regs.h> +#include <xscom-p9-regs.h> +#include <interrupts.h> +#include <timebase.h> +#include <bitmap.h> +#include <buddy.h> +#include <phys-map.h> +#include <p9_stop_api.H> + +/* Always notify from EQ to VP (no EOI on EQs). Will speed up + * EOIs at the expense of potentially higher powerbus traffic. + */ +#define EQ_ALWAYS_NOTIFY + +/* Verbose debug */ +#undef XIVE_VERBOSE_DEBUG + +/* Extra debug options used in debug builds */ +#ifdef DEBUG +#define XIVE_DEBUG_DUPLICATES +#define XIVE_PERCPU_LOG +#define XIVE_DEBUG_INIT_CACHE_UPDATES +#define XIVE_EXTRA_CHECK_INIT_CACHE +#undef XIVE_CHECK_MISROUTED_IPI +#define XIVE_CHECK_LOCKS +#else +#undef XIVE_DEBUG_DUPLICATES +#undef XIVE_PERCPU_LOG +#undef XIVE_DEBUG_INIT_CACHE_UPDATES +#undef XIVE_EXTRA_CHECK_INIT_CACHE +#undef XIVE_CHECK_MISROUTED_IPI +#undef XIVE_CHECK_LOCKS +#endif + +/* + * + * VSDs, blocks, set translation etc... + * + * This stuff confused me to no end so here's an attempt at explaining + * my understanding of it and how I use it in OPAL & Linux + * + * For the following data structures, the XIVE use a mechanism called + * Virtualization Structure Tables (VST) to manage the memory layout + * and access: ESBs (Event State Buffers, aka IPI sources), EAS/IVT + * (Event assignment structures), END/EQs (Notification descriptors + * aka event queues) and NVT/VPD (Notification Virtual Targets). + * + * These structures divide those tables into 16 "blocks". Each XIVE + * instance has a definition for all 16 blocks that can either represent + * an actual table in memory or a remote XIVE MMIO port to access a + * block that is owned by that remote XIVE. + * + * Our SW design will consist of allocating one block per chip (and thus + * per XIVE instance) for now, thus giving us up to 16 supported chips in + * the system. We may have to revisit that if we ever support systems with + * more than 16 chips but that isn't on our radar at the moment or if we + * want to do like pHyp on some machines and dedicate 2 blocks per chip + * for some structures. + * + * Thus we need to be careful that we never expose to Linux the concept + * of block and block boundaries, but instead we provide full number ranges + * so that consecutive blocks can be supported. 
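+ *
+ * (Illustrative example: with one block per chip and the 20-bit
+ * index encoding used further down, block/chip 1 owns the global
+ * interrupt range 0x100000..0x1fffff; Linux is only ever given such
+ * full ranges.)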
+ * + * We will pre-allocate some of the tables in order to support a "fallback" + * mode operations where an old-style XICS is emulated via OPAL calls. This + * is achieved by having a default of one VP per physical thread associated + * with one EQ and one IPI. There is also enought EATs to cover all the PHBs. + * + * Similarily, for MMIO access, the BARs support what is called "set + * translation" which allows the BAR to be divided into a certain + * number of sets. The VC BAR (ESBs, ENDs, ...) supports 64 sets and + * the PC BAR supports 16. Each "set" can be routed to a specific + * block and offset within a block. + * + * For now, we will not use much of that functionality. We will use a + * fixed split between ESB and ENDs for the VC BAR as defined by the + * constants below and we will allocate all the PC BARs set to the + * local block of that chip + */ + +#define XIVE_VSD_SIZE sizeof(u64) + +/* VC BAR contains set translations for the ESBs and the EQs. + * + * It's divided in 64 sets, each of which can be either ESB pages or EQ pages. + * The table configuring this is the EDT + * + * Additionally, the ESB pages come in pair of Linux_Trig_Mode isn't enabled + * (which we won't enable for now as it assumes write-only permission which + * the MMU doesn't support). + * + * To get started we just hard wire the following setup: + * + * VC_BAR size is 512G. We split it into 384G of ESBs (48 sets) and 128G + * of ENDs (16 sets) for the time being. IE. Each set is thus 8GB + */ + +#define VC_ESB_SETS 48 +#define VC_END_SETS 16 +#define VC_MAX_SETS 64 + +/* The table configuring the PC set translation (16 sets) is the VDT */ +#define PC_MAX_SETS 16 + +/* XXX This is the currently top limit of number of ESB/SBE entries + * and EAS/IVT entries pre-allocated per chip. This should probably + * turn into a device-tree property or NVRAM setting, or maybe + * calculated from the amount of system RAM... + * + * This is currently set to 1M + * + * This is independent of the sizing of the MMIO space. + * + * WARNING: Due to how XICS emulation works, we cannot support more + * interrupts per chip at this stage as the full interrupt number + * (block + index) has to fit in a 24-bit number. + * + * That gives us a pre-allocated space of 256KB per chip for the state + * bits and 8M per chip for the EAS/IVT. + * + * Note: The HW interrupts from PCIe and similar other entities that + * use their own state bit array will have to share that IVT space, + * so we could potentially make the IVT size twice as big, but for now + * we will simply share it and ensure we don't hand out IPIs that + * overlap the HW interrupts. + * + * TODO: adjust the VC BAR range for IPI ESBs on this value + */ + +#define XIVE_INT_ORDER 20 /* 1M interrupts */ +#define XIVE_INT_COUNT (1ul << XIVE_INT_ORDER) + +/* + * First interrupt number, also the first logical interrupt number + * allocated by Linux (the first numbers are reserved for ISA) + */ +#define XIVE_INT_FIRST 0x10 + +/* Corresponding direct table sizes */ + +#define SBE_PER_BYTE 4 /* PQ bits couples */ +#define SBE_SIZE (XIVE_INT_COUNT / SBE_PER_BYTE) +#define IVT_SIZE (XIVE_INT_COUNT * sizeof(struct xive_ive)) + +/* Use 64K for everything by default */ +#define XIVE_ESB_SHIFT (16 + 1) /* trigger + mgmt pages */ +#define XIVE_ESB_PAGE_SIZE (1ul << XIVE_ESB_SHIFT) /* 2 pages */ + +/* Max number of EQs. We allocate an indirect table big enough so + * that when fully populated we can have that many EQs. 
+ * + * The max number of EQs we support in our MMIO space is 128G/128K + * ie. 1M. Since one EQ is 8 words (32 bytes), a 64K page can hold + * 2K EQs. We need 512 pointers, ie, 4K of memory for the indirect + * table. + * + * TODO: adjust the VC BAR range for END ESBs on this value + */ +#define EQ_PER_PAGE (PAGE_SIZE / sizeof(struct xive_eq)) + +#define XIVE_EQ_ORDER 20 /* 1M ENDs */ +#define XIVE_EQ_COUNT (1ul << XIVE_EQ_ORDER) +#define XIVE_EQ_TABLE_SIZE ((XIVE_EQ_COUNT / EQ_PER_PAGE) * XIVE_VSD_SIZE) + +#define XIVE_EQ_SHIFT (16 + 1) /* ESn + ESe pages */ + +/* Number of priorities (and thus EQDs) we allocate for each VP */ +#define NUM_INT_PRIORITIES 8 + +/* Max priority number */ +#define XIVE_MAX_PRIO 7 + +/* Priority used for the one queue in XICS emulation */ +#define XIVE_EMULATION_PRIO 7 + +/* Priority used for gather/silent escalation (KVM) */ +#define XIVE_ESCALATION_PRIO 7 + +/* Max number of VPs. We allocate an indirect table big enough so + * that when fully populated we can have that many VPs. + * + * The max number of VPs we support in our MMIO space is 64G/64K + * ie. 1M. Since one VP is 16 words (64 bytes), a 64K page can hold + * 1K EQ. We need 1024 pointers, ie, 8K of memory for the indirect + * table. + * + * HOWEVER: A block supports only up to 512K VPs (19 bits of target + * in the EQ). Since we currently only support 1 block per chip, + * we will allocate half of the above. We might add support for + * 2 blocks per chip later if necessary. + * + * TODO: adjust the PC BAR range + */ +#define VP_PER_PAGE (PAGE_SIZE / sizeof(struct xive_vp)) + +#define NVT_SHIFT 19 /* in sync with EQ_W6_NVT_INDEX */ + +/* + * We use 8 priorities per VP and the number of EQs is configured to + * 1M. Therefore, our VP space is limited to 128k. + */ +#define XIVE_VP_ORDER (XIVE_EQ_ORDER - 3) /* 128k */ +#define XIVE_VP_COUNT (1ul << XIVE_VP_ORDER) +#define XIVE_VP_TABLE_SIZE ((XIVE_VP_COUNT / VP_PER_PAGE) * XIVE_VSD_SIZE) + +/* + * VP ids for HW threads. + * + * These values are hardcoded in the CAM line of the HW context and + * they depend on the thread id bits of the chip, 7bit for p9. + * + * HW CAM Line |chip|000000000001|thrdid | + * 23bits 4 12 7 + */ +#define XIVE_THREADID_SHIFT 7 +#define XIVE_HW_VP_BASE (1 << XIVE_THREADID_SHIFT) +#define XIVE_HW_VP_COUNT (1 << XIVE_THREADID_SHIFT) + +/* The xive operation mode indicates the active "API" and corresponds + * to the "mode" parameter of the opal_xive_reset() call + */ +static enum { + XIVE_MODE_EMU = OPAL_XIVE_MODE_EMU, + XIVE_MODE_EXPL = OPAL_XIVE_MODE_EXPL, + XIVE_MODE_NONE, +} xive_mode = XIVE_MODE_NONE; + + +/* Each source controller has one of these. 
There's one embedded + * in the XIVE struct for IPIs + */ +struct xive_src { + struct irq_source is; + const struct irq_source_ops *orig_ops; + struct xive *xive; + void *esb_mmio; + uint32_t esb_base; + uint32_t esb_shift; + uint32_t flags; +}; + +#define LOG_TYPE_XIRR 0 +#define LOG_TYPE_XIRR2 1 +#define LOG_TYPE_POPQ 2 +#define LOG_TYPE_EOI 3 +#define LOG_TYPE_EQD 4 + +struct xive_log_ent { + uint8_t type; + uint8_t cnt; + uint64_t tb; +#define MAX_LOG_DATA 8 + uint32_t data[MAX_LOG_DATA]; +}; +#define MAX_LOG_ENT 32 + +struct xive_cpu_state { + struct xive *xive; + void *tm_ring1; + +#ifdef XIVE_PERCPU_LOG + struct xive_log_ent log[MAX_LOG_ENT]; + uint32_t log_pos; +#endif + /* Base HW VP and associated queues */ + uint32_t vp_blk; + uint32_t vp_idx; + uint32_t eq_blk; + uint32_t eq_idx; /* Base eq index of a block of 8 */ + void *eq_page; + + /* Pre-allocated IPI */ + uint32_t ipi_irq; + + /* Use for XICS emulation */ + struct lock lock; + uint8_t cppr; + uint8_t mfrr; + uint8_t pending; + uint8_t prev_cppr; + uint32_t *eqbuf; + uint32_t eqptr; + uint32_t eqmsk; + uint8_t eqgen; + void *eqmmio; + uint64_t total_irqs; +}; + +#ifdef XIVE_PERCPU_LOG + +static void log_add(struct xive_cpu_state *xs, uint8_t type, + uint8_t count, ...) +{ + struct xive_log_ent *e = &xs->log[xs->log_pos]; + va_list args; + int i; + + e->type = type; + e->cnt = count; + e->tb = mftb(); + va_start(args, count); + for (i = 0; i < count; i++) + e->data[i] = va_arg(args, u32); + va_end(args); + xs->log_pos = xs->log_pos + 1; + if (xs->log_pos == MAX_LOG_ENT) + xs->log_pos = 0; +} + +static void log_print(struct xive_cpu_state *xs) +{ + uint32_t pos = xs->log_pos; + uint8_t buf[256]; + int i, j; + static const char *lts[] = { + ">XIRR", + "<XIRR", + " POPQ", + " EOI", + " EQD" + }; + for (i = 0; i < MAX_LOG_ENT; i++) { + struct xive_log_ent *e = &xs->log[pos]; + uint8_t *b = buf, *eb = &buf[255]; + + b += snprintf(b, eb-b, "%08llx %s ", e->tb, + lts[e->type]); + for (j = 0; j < e->cnt && b < eb; j++) + b += snprintf(b, eb-b, "%08x ", e->data[j]); + printf("%s\n", buf); + pos = pos + 1; + if (pos == MAX_LOG_ENT) + pos = 0; + } +} + +#else /* XIVE_PERCPU_LOG */ + +static inline void log_add(struct xive_cpu_state *xs __unused, + uint8_t type __unused, + uint8_t count __unused, ...) { } +static inline void log_print(struct xive_cpu_state *xs __unused) { } + +#endif /* XIVE_PERCPU_LOG */ + +struct xive { + uint32_t chip_id; + uint32_t block_id; + struct dt_node *x_node; + + uint64_t xscom_base; + + /* MMIO regions */ + void *ic_base; + uint64_t ic_size; + uint32_t ic_shift; + void *tm_base; + uint64_t tm_size; + uint32_t tm_shift; + void *pc_base; + uint64_t pc_size; + void *vc_base; + uint64_t vc_size; + + void *esb_mmio; + void *eq_mmio; + + /* Set on XSCOM register access error */ + bool last_reg_error; + + /* Per-XIVE mutex */ + struct lock lock; + + /* Pre-allocated tables. + * + * We setup all the VDS for actual tables (ie, by opposition to + * forwarding ports) as either direct pre-allocated or indirect + * and partially populated. + * + * Currently, the ESB/SBE and the EAS/IVT tables are direct and + * fully pre-allocated based on XIVE_INT_COUNT. + * + * The other tables are indirect, we thus pre-allocate the indirect + * table (ie, pages of pointers) and populate enough of the pages + * for our basic setup using 64K pages. + * + * The size of the indirect tables are driven by XIVE_VP_COUNT and + * XIVE_EQ_COUNT. 
The number of pre-allocated ones are driven by + * XIVE_HW_VP_COUNT (number of EQ depends on number of VP) in block + * mode, otherwise we only preallocate INITIAL_BLK0_VP_COUNT on + * block 0. + */ + + /* Direct SBE and IVT tables */ + void *sbe_base; + void *ivt_base; + + /* Indirect END/EQ table. NULL entries are unallocated, count is + * the numbre of pointers (ie, sub page placeholders). + */ + __be64 *eq_ind_base; + uint32_t eq_ind_count; + + /* EQ allocation bitmap. Each bit represent 8 EQs */ + bitmap_t *eq_map; + + /* Indirect NVT/VP table. NULL entries are unallocated, count is + * the numbre of pointers (ie, sub page placeholders). + */ + __be64 *vp_ind_base; + uint32_t vp_ind_count; + + /* Pool of donated pages for provisioning indirect EQ and VP pages */ + struct list_head donated_pages; + + /* To ease a possible change to supporting more than one block of + * interrupts per chip, we store here the "base" global number + * and max number of interrupts for this chip. The global number + * encompass the block number and index. + */ + uint32_t int_base; + uint32_t int_max; + + /* Due to the overlap between IPIs and HW sources in the IVT table, + * we keep some kind of top-down allocator. It is used for HW sources + * to "allocate" interrupt entries and will limit what can be handed + * out as IPIs. Of course this assumes we "allocate" all HW sources + * before we start handing out IPIs. + * + * Note: The numbers here are global interrupt numbers so that we can + * potentially handle more than one block per chip in the future. + */ + uint32_t int_hw_bot; /* Bottom of HW allocation */ + uint32_t int_ipi_top; /* Highest IPI handed out so far + 1 */ + + /* The IPI allocation bitmap */ + bitmap_t *ipi_alloc_map; + + /* We keep track of which interrupts were ever enabled to + * speed up xive_reset + */ + bitmap_t *int_enabled_map; + + /* Embedded source IPIs */ + struct xive_src ipis; + + /* Embedded escalation interrupts */ + struct xive_src esc_irqs; + + /* In memory queue overflow */ + void *q_ovf; +}; + +#define XIVE_CAN_STORE_EOI(x) XIVE_STORE_EOI_ENABLED + +/* Global DT node */ +static struct dt_node *xive_dt_node; + + +/* Block <-> Chip conversions. + * + * As chipIDs may not be within the range of 16 block IDs supported by XIVE, + * we have a 2 way conversion scheme. + * + * From block to chip, use the global table below. + * + * From chip to block, a field in struct proc_chip contains the first block + * of that chip. For now we only support one block per chip but that might + * change in the future + */ +#define XIVE_INVALID_CHIP 0xffffffff +#define XIVE_MAX_CHIPS 16 +static uint32_t xive_block_to_chip[XIVE_MAX_CHIPS]; +static uint32_t xive_block_count; + +static uint32_t xive_chip_to_block(uint32_t chip_id) +{ + struct proc_chip *c = get_chip(chip_id); + + assert(c); + assert(c->xive); + return c->xive->block_id; +} + +/* Conversion between GIRQ and block/index. + * + * ------------------------------------ + * |0000000E|BLOC| INDEX| + * ------------------------------------ + * 8 4 20 + * + * the E bit indicates that this is an escalation interrupt, in + * that case, the BLOCK/INDEX points to the EQ descriptor associated + * with the escalation. + * + * Global interrupt numbers for non-escalation interrupts are thus + * limited to 24 bits because the XICS emulation encodes the CPPR + * value in the top (MSB) 8 bits. Hence, 4 bits are left for the XIVE + * block number and the remaining 20 bits for the interrupt index + * number. 
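+ *
+ * Example (illustrative numbers only): GIRQ 0x123456 decodes to
+ * block 1, index 0x23456, while 0x1123456 would be the escalation
+ * interrupt for that same block/index (E bit = 1 << 24).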
+ */ +#define INT_SHIFT 20 +#define INT_ESC_SHIFT (INT_SHIFT + 4) /* 4bits block id */ + +#if XIVE_INT_ORDER > INT_SHIFT +#error "Too many ESBs for IRQ encoding" +#endif + +#if XIVE_EQ_ORDER > INT_SHIFT +#error "Too many EQs for escalation IRQ number encoding" +#endif + +#define GIRQ_TO_BLK(__g) (((__g) >> INT_SHIFT) & 0xf) +#define GIRQ_TO_IDX(__g) ((__g) & ((1 << INT_SHIFT) - 1)) +#define BLKIDX_TO_GIRQ(__b,__i) (((uint32_t)(__b)) << INT_SHIFT | (__i)) +#define GIRQ_IS_ESCALATION(__g) ((__g) & (1 << INT_ESC_SHIFT)) +#define MAKE_ESCALATION_GIRQ(__b,__i)(BLKIDX_TO_GIRQ(__b,__i) | (1 << INT_ESC_SHIFT)) + +/* Block/IRQ to chip# conversions */ +#define PC_BLK_TO_CHIP(__b) (xive_block_to_chip[__b]) +#define VC_BLK_TO_CHIP(__b) (xive_block_to_chip[__b]) +#define GIRQ_TO_CHIP(__isn) (VC_BLK_TO_CHIP(GIRQ_TO_BLK(__isn))) + +/* Routing of physical processors to VPs */ +#define PIR2VP_IDX(__pir) (XIVE_HW_VP_BASE | P9_PIR2LOCALCPU(__pir)) +#define PIR2VP_BLK(__pir) (xive_chip_to_block(P9_PIR2GCID(__pir))) +#define VP2PIR(__blk, __idx) (P9_PIRFROMLOCALCPU(VC_BLK_TO_CHIP(__blk), (__idx) & 0x7f)) + +/* Decoding of OPAL API VP IDs. The VP IDs are encoded as follow + * + * Block group mode: + * + * ----------------------------------- + * |GVEOOOOO| INDEX| + * ----------------------------------- + * || | + * || Order + * |Virtual + * Group + * + * G (Group) : Set to 1 for a group VP (not currently supported) + * V (Virtual) : Set to 1 for an allocated VP (vs. a physical processor ID) + * E (Error) : Should never be 1, used internally for errors + * O (Order) : Allocation order of the VP block + * + * The conversion is thus done as follow (groups aren't implemented yet) + * + * If V=0, O must be 0 and 24-bit INDEX value is the PIR + * If V=1, the order O group is allocated such that if N is the number of + * chip bits considered for allocation (*) + * then the INDEX is constructed as follow (bit numbers such as 0=LSB) + * - bottom O-N bits is the index within the "VP block" + * - next N bits is the XIVE blockID of the VP + * - the remaining bits is the per-chip "base" + * so the conversion consists of "extracting" the block ID and moving + * down the upper bits by N bits. 
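+ *
+ * A hypothetical example: on a 2-chip system (N = 1), the order-9
+ * VP number 0x49000123 has V=1, O=9, INDEX=0x123; it decodes to
+ * block 1 (bit 8 of INDEX) and index 0x23 (bottom 8 bits), and
+ * xive_encode_vp(1, 0x23, 9) rebuilds the same 0x49000123.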
+ * + * In non-block-group mode, the difference is that the blockID is + * on the left of the index (the entire VP block is in a single + * block ID) + */ + +/* VP allocation */ +static uint32_t xive_chips_alloc_bits = 0; +static struct buddy *xive_vp_buddy; +static struct lock xive_buddy_lock = LOCK_UNLOCKED; + +/* VP# decoding/encoding */ +static bool xive_decode_vp(uint32_t vp, uint32_t *blk, uint32_t *idx, + uint8_t *order, bool *group) +{ + uint32_t o = (vp >> 24) & 0x1f; + uint32_t n = xive_chips_alloc_bits; + uint32_t index = vp & 0x00ffffff; + uint32_t imask = (1 << (o - n)) - 1; + + /* Groups not supported yet */ + if ((vp >> 31) & 1) + return false; + if (group) + *group = false; + + /* PIR case */ + if (((vp >> 30) & 1) == 0) { + if (find_cpu_by_pir(index) == NULL) + return false; + if (blk) + *blk = PIR2VP_BLK(index); + if (idx) + *idx = PIR2VP_IDX(index); + return true; + } + + /* Ensure o > n, we have *at least* 2 VPs per block */ + if (o <= n) + return false; + + /* Combine the index base and index */ + if (idx) + *idx = ((index >> n) & ~imask) | (index & imask); + /* Extract block ID */ + if (blk) + *blk = (index >> (o - n)) & ((1 << n) - 1); + + /* Return order as well if asked for */ + if (order) + *order = o; + + return true; +} + +static uint32_t xive_encode_vp(uint32_t blk, uint32_t idx, uint32_t order) +{ + uint32_t vp = 0x40000000 | (order << 24); + uint32_t n = xive_chips_alloc_bits; + uint32_t imask = (1 << (order - n)) - 1; + + vp |= (idx & ~imask) << n; + vp |= blk << (order - n); + vp |= idx & imask; + return vp; +} + +#define xive_regw(__x, __r, __v) \ + __xive_regw(__x, __r, X_##__r, __v, #__r) +#define xive_regr(__x, __r) \ + __xive_regr(__x, __r, X_##__r, #__r) +#define xive_regwx(__x, __r, __v) \ + __xive_regw(__x, 0, X_##__r, __v, #__r) +#define xive_regrx(__x, __r) \ + __xive_regr(__x, 0, X_##__r, #__r) + +#ifdef XIVE_VERBOSE_DEBUG +#define xive_vdbg(__x,__fmt,...) prlog(PR_DEBUG,"XIVE[ IC %02x ] " __fmt, (__x)->chip_id, ##__VA_ARGS__) +#define xive_cpu_vdbg(__c,__fmt,...) prlog(PR_DEBUG,"XIVE[CPU %04x] " __fmt, (__c)->pir, ##__VA_ARGS__) +#else +#define xive_vdbg(x,fmt,...) do { } while(0) +#define xive_cpu_vdbg(x,fmt,...) do { } while(0) +#endif + +#define xive_dbg(__x,__fmt,...) prlog(PR_DEBUG,"XIVE[ IC %02x ] " __fmt, (__x)->chip_id, ##__VA_ARGS__) +#define xive_cpu_dbg(__c,__fmt,...) prlog(PR_DEBUG,"XIVE[CPU %04x] " __fmt, (__c)->pir, ##__VA_ARGS__) +#define xive_warn(__x,__fmt,...) prlog(PR_WARNING,"XIVE[ IC %02x ] " __fmt, (__x)->chip_id, ##__VA_ARGS__) +#define xive_cpu_warn(__c,__fmt,...) prlog(PR_WARNING,"XIVE[CPU %04x] " __fmt, (__c)->pir, ##__VA_ARGS__) +#define xive_err(__x,__fmt,...) prlog(PR_ERR,"XIVE[ IC %02x ] " __fmt, (__x)->chip_id, ##__VA_ARGS__) +#define xive_cpu_err(__c,__fmt,...) prlog(PR_ERR,"XIVE[CPU %04x] " __fmt, (__c)->pir, ##__VA_ARGS__) + +static void __xive_regw(struct xive *x, uint32_t m_reg, uint32_t x_reg, uint64_t v, + const char *rname) +{ + bool use_xscom = (m_reg == 0) || !x->ic_base; + int64_t rc; + + x->last_reg_error = false; + + if (use_xscom) { + assert(x_reg != 0); + rc = xscom_write(x->chip_id, x->xscom_base + x_reg, v); + if (rc) { + if (!rname) + rname = "???"; + xive_err(x, "Error writing register %s\n", rname); + /* Anything else we can do here ? 
*/ + x->last_reg_error = true; + } + } else { + out_be64(x->ic_base + m_reg, v); + } +} + +static uint64_t __xive_regr(struct xive *x, uint32_t m_reg, uint32_t x_reg, + const char *rname) +{ + bool use_xscom = (m_reg == 0) || !x->ic_base; + int64_t rc; + uint64_t val; + + x->last_reg_error = false; + + if (use_xscom) { + assert(x_reg != 0); + rc = xscom_read(x->chip_id, x->xscom_base + x_reg, &val); + if (rc) { + if (!rname) + rname = "???"; + xive_err(x, "Error reading register %s\n", rname); + /* Anything else we can do here ? */ + x->last_reg_error = true; + return -1ull; + } + } else { + val = in_be64(x->ic_base + m_reg); + } + return val; +} + +/* Locate a controller from an IRQ number */ +static struct xive *xive_from_isn(uint32_t isn) +{ + uint32_t chip_id = GIRQ_TO_CHIP(isn); + struct proc_chip *c = get_chip(chip_id); + + if (!c) + return NULL; + return c->xive; +} + +static struct xive *xive_from_pc_blk(uint32_t blk) +{ + uint32_t chip_id = PC_BLK_TO_CHIP(blk); + struct proc_chip *c = get_chip(chip_id); + + if (!c) + return NULL; + return c->xive; +} + +static struct xive *xive_from_vc_blk(uint32_t blk) +{ + uint32_t chip_id = VC_BLK_TO_CHIP(blk); + struct proc_chip *c = get_chip(chip_id); + + if (!c) + return NULL; + return c->xive; +} + +static struct xive_eq *xive_get_eq(struct xive *x, unsigned int idx) +{ + struct xive_eq *p; + + if (idx >= (x->eq_ind_count * EQ_PER_PAGE)) + return NULL; + p = (struct xive_eq *)(be64_to_cpu(x->eq_ind_base[idx / EQ_PER_PAGE]) & + VSD_ADDRESS_MASK); + if (!p) + return NULL; + + return &p[idx % EQ_PER_PAGE]; +} + +static struct xive_ive *xive_get_ive(struct xive *x, unsigned int isn) +{ + struct xive_ive *ivt; + uint32_t idx = GIRQ_TO_IDX(isn); + + if (GIRQ_IS_ESCALATION(isn)) { + /* All right, an escalation IVE is buried inside an EQ, let's + * try to find it + */ + struct xive_eq *eq; + + if (x->chip_id != VC_BLK_TO_CHIP(GIRQ_TO_BLK(isn))) { + xive_err(x, "xive_get_ive, ESC ISN 0x%x not on right chip\n", isn); + return NULL; + } + eq = xive_get_eq(x, idx); + if (!eq) { + xive_err(x, "xive_get_ive, ESC ISN 0x%x EQ not found\n", isn); + return NULL; + } + + /* If using single-escalation, don't let anybody get to the individual + * escalation interrupts + */ + if (xive_get_field32(EQ_W0_UNCOND_ESCALATE, eq->w0)) + return NULL; + + /* Grab the buried IVE */ + return (struct xive_ive *)(char *)&eq->w4; + } else { + /* Check the block matches */ + if (isn < x->int_base || isn >= x->int_max) { + xive_err(x, "xive_get_ive, ISN 0x%x not on right chip\n", isn); + return NULL; + } + assert (idx < XIVE_INT_COUNT); + + /* If we support >1 block per chip, this should still work as + * we are likely to make the table contiguous anyway + */ + ivt = x->ivt_base; + assert(ivt); + + return ivt + idx; + } +} + +static struct xive_vp *xive_get_vp(struct xive *x, unsigned int idx) +{ + struct xive_vp *p; + + assert(idx < (x->vp_ind_count * VP_PER_PAGE)); + p = (struct xive_vp *)(be64_to_cpu(x->vp_ind_base[idx / VP_PER_PAGE]) & + VSD_ADDRESS_MASK); + if (!p) + return NULL; + + return &p[idx % VP_PER_PAGE]; +} + +static void xive_init_default_vp(struct xive_vp *vp, + uint32_t eq_blk, uint32_t eq_idx) +{ + memset(vp, 0, sizeof(struct xive_vp)); + + /* Stash the EQ base in the pressure relief interrupt field */ + vp->w1 = cpu_to_be32((eq_blk << 28) | eq_idx); + vp->w0 = xive_set_field32(VP_W0_VALID, 0, 1); +} + +static void xive_init_emu_eq(uint32_t vp_blk, uint32_t vp_idx, + struct xive_eq *eq, void *backing_page, + uint8_t prio) +{ + memset(eq, 0, sizeof(struct 
xive_eq)); + + eq->w1 = xive_set_field32(EQ_W1_GENERATION, 0, 1); + eq->w3 = cpu_to_be32(((uint64_t)backing_page) & EQ_W3_OP_DESC_LO); + eq->w2 = cpu_to_be32((((uint64_t)backing_page) >> 32) & EQ_W2_OP_DESC_HI); + eq->w6 = xive_set_field32(EQ_W6_NVT_BLOCK, 0, vp_blk) | + xive_set_field32(EQ_W6_NVT_INDEX, 0, vp_idx); + eq->w7 = xive_set_field32(EQ_W7_F0_PRIORITY, 0, prio); + eq->w0 = xive_set_field32(EQ_W0_VALID, 0, 1) | + xive_set_field32(EQ_W0_ENQUEUE, 0, 1) | + xive_set_field32(EQ_W0_FIRMWARE, 0, 1) | + xive_set_field32(EQ_W0_QSIZE, 0, EQ_QSIZE_64K) | +#ifdef EQ_ALWAYS_NOTIFY + xive_set_field32(EQ_W0_UCOND_NOTIFY, 0, 1) | +#endif + 0 ; +} + +static uint32_t *xive_get_eq_buf(uint32_t eq_blk, uint32_t eq_idx) +{ + struct xive *x = xive_from_vc_blk(eq_blk); + struct xive_eq *eq; + uint64_t addr; + + assert(x); + eq = xive_get_eq(x, eq_idx); + assert(eq); + assert(xive_get_field32(EQ_W0_VALID, eq->w0)); + addr = ((((uint64_t)be32_to_cpu(eq->w2)) & 0x0fffffff) << 32) | be32_to_cpu(eq->w3); + + return (uint32_t *)addr; +} + +static void *xive_get_donated_page(struct xive *x) +{ + return (void *)list_pop_(&x->donated_pages, 0); +} + +#define XIVE_ALLOC_IS_ERR(_idx) ((_idx) >= 0xfffffff0) + +#define XIVE_ALLOC_NO_SPACE 0xffffffff /* No possible space */ +#define XIVE_ALLOC_NO_IND 0xfffffffe /* Indirect need provisioning */ +#define XIVE_ALLOC_NO_MEM 0xfffffffd /* Local allocation failed */ + +static uint32_t xive_alloc_eq_set(struct xive *x, bool alloc_indirect) +{ + uint32_t ind_idx; + int idx; + int eq_base_idx; + + xive_vdbg(x, "Allocating EQ set...\n"); + + assert(x->eq_map); + + /* Allocate from the EQ bitmap. Each bit is 8 EQs */ + idx = bitmap_find_zero_bit(*x->eq_map, 0, XIVE_EQ_COUNT >> 3); + if (idx < 0) { + xive_dbg(x, "Allocation from EQ bitmap failed !\n"); + return XIVE_ALLOC_NO_SPACE; + } + + eq_base_idx = idx << 3; + + xive_vdbg(x, "Got EQs 0x%x..0x%x\n", eq_base_idx, + eq_base_idx + XIVE_MAX_PRIO); + + /* Calculate the indirect page where the EQs reside */ + ind_idx = eq_base_idx / EQ_PER_PAGE; + + /* Is there an indirect page ? If not, check if we can provision it */ + if (!x->eq_ind_base[ind_idx]) { + /* Default flags */ + uint64_t vsd_flags = SETFIELD(VSD_TSIZE, 0ull, 4) | + SETFIELD(VSD_MODE, 0ull, VSD_MODE_EXCLUSIVE); + void *page; + + /* If alloc_indirect is set, allocate the memory from OPAL own, + * otherwise try to provision from the donated pool + */ + if (alloc_indirect) { + /* Allocate/provision indirect page during boot only */ + xive_vdbg(x, "Indirect empty, provisioning from local pool\n"); + page = local_alloc(x->chip_id, PAGE_SIZE, PAGE_SIZE); + if (!page) { + xive_dbg(x, "provisioning failed !\n"); + return XIVE_ALLOC_NO_MEM; + } + vsd_flags |= VSD_FIRMWARE; + } else { + xive_vdbg(x, "Indirect empty, provisioning from donated pages\n"); + page = xive_get_donated_page(x); + if (!page) { + xive_vdbg(x, "no idirect pages available !\n"); + return XIVE_ALLOC_NO_IND; + } + } + memset(page, 0, PAGE_SIZE); + x->eq_ind_base[ind_idx] = cpu_to_be64(vsd_flags | + (((uint64_t)page) & VSD_ADDRESS_MASK)); + /* Any cache scrub needed ? 
*/ + } + + bitmap_set_bit(*x->eq_map, idx); + return eq_base_idx; +} + +static void xive_free_eq_set(struct xive *x, uint32_t eqs) +{ + uint32_t idx; + + xive_vdbg(x, "Freeing EQ 0x%x..0x%x\n", eqs, eqs + XIVE_MAX_PRIO); + + assert((eqs & 7) == 0); + assert(x->eq_map); + + idx = eqs >> 3; + bitmap_clr_bit(*x->eq_map, idx); +} + +static bool xive_provision_vp_ind(struct xive *x, uint32_t vp_idx, uint32_t order) +{ + uint32_t pbase, pend, i; + + pbase = vp_idx / VP_PER_PAGE; + pend = (vp_idx + (1 << order)) / VP_PER_PAGE; + + for (i = pbase; i <= pend; i++) { + void *page; + u64 vsd; + + /* Already provisioned ? */ + if (x->vp_ind_base[i]) + continue; + + /* Try to grab a donated page */ + page = xive_get_donated_page(x); + if (!page) + return false; + + /* Install the page */ + memset(page, 0, PAGE_SIZE); + vsd = ((uint64_t)page) & VSD_ADDRESS_MASK; + vsd |= SETFIELD(VSD_TSIZE, 0ull, 4); + vsd |= SETFIELD(VSD_MODE, 0ull, VSD_MODE_EXCLUSIVE); + x->vp_ind_base[i] = cpu_to_be64(vsd); + } + return true; +} + +static void xive_init_vp_allocator(void) +{ + /* Initialize chip alloc bits */ + xive_chips_alloc_bits = ilog2(xive_block_count); + + prlog(PR_INFO, "XIVE: %d chips considered for VP allocations\n", + 1 << xive_chips_alloc_bits); + + /* Allocate a buddy big enough for XIVE_VP_ORDER allocations. + * + * each bit in the buddy represents 1 << xive_chips_alloc_bits + * VPs. + */ + xive_vp_buddy = buddy_create(XIVE_VP_ORDER); + assert(xive_vp_buddy); + + /* We reserve the whole range of VPs representing HW chips. + * + * These are 0x80..0xff, so order 7 starting at 0x80. This will + * reserve that range on each chip. + */ + assert(buddy_reserve(xive_vp_buddy, XIVE_HW_VP_BASE, + XIVE_THREADID_SHIFT)); +} + +static uint32_t xive_alloc_vps(uint32_t order) +{ + uint32_t local_order, i; + int vp; + + /* The minimum order is 2 VPs per chip */ + if (order < (xive_chips_alloc_bits + 1)) + order = xive_chips_alloc_bits + 1; + + /* We split the allocation */ + local_order = order - xive_chips_alloc_bits; + + /* We grab that in the global buddy */ + assert(xive_vp_buddy); + lock(&xive_buddy_lock); + vp = buddy_alloc(xive_vp_buddy, local_order); + unlock(&xive_buddy_lock); + if (vp < 0) + return XIVE_ALLOC_NO_SPACE; + + /* Provision on every chip considered for allocation */ + for (i = 0; i < (1 << xive_chips_alloc_bits); i++) { + struct xive *x = xive_from_pc_blk(i); + bool success; + + /* Return internal error & log rather than assert ? */ + assert(x); + lock(&x->lock); + success = xive_provision_vp_ind(x, vp, local_order); + unlock(&x->lock); + if (!success) { + lock(&xive_buddy_lock); + buddy_free(xive_vp_buddy, vp, local_order); + unlock(&xive_buddy_lock); + return XIVE_ALLOC_NO_IND; + } + } + + /* Encode the VP number. 
"blk" is 0 as this represents + * all blocks and the allocation always starts at 0 + */ + return xive_encode_vp(0, vp, order); +} + +static void xive_free_vps(uint32_t vp) +{ + uint32_t idx; + uint8_t order, local_order; + + assert(xive_decode_vp(vp, NULL, &idx, &order, NULL)); + + /* We split the allocation */ + local_order = order - xive_chips_alloc_bits; + + /* Free that in the buddy */ + lock(&xive_buddy_lock); + buddy_free(xive_vp_buddy, idx, local_order); + unlock(&xive_buddy_lock); +} + +enum xive_cache_type { + xive_cache_ivc, + xive_cache_sbc, + xive_cache_eqc, + xive_cache_vpc, +}; + +static int64_t __xive_cache_watch(struct xive *x, enum xive_cache_type ctype, + uint64_t block, uint64_t idx, + uint32_t start_dword, uint32_t dword_count, + __be64 *new_data, bool light_watch, + bool synchronous); + +static void xive_scrub_workaround_vp(struct xive *x, uint32_t block, uint32_t idx __unused) +{ + /* VP variant of the workaround described in __xive_cache_scrub(), + * we need to be careful to use for that workaround an NVT that + * sits on the same xive but isn NOT part of a donated indirect + * entry. + * + * The reason is that the dummy cache watch will re-create a + * dirty entry in the cache, even if the entry is marked + * invalid. + * + * Thus if we are about to dispose of the indirect entry backing + * it, we'll cause a checkstop later on when trying to write it + * out. + * + * Note: This means the workaround only works for block group + * mode. + */ + __xive_cache_watch(x, xive_cache_vpc, block, XIVE_HW_VP_BASE, 0, + 0, NULL, true, false); +} + +static void xive_scrub_workaround_eq(struct xive *x, uint32_t block __unused, uint32_t idx) +{ + void *mmio; + + /* EQ variant of the workaround described in __xive_cache_scrub(), + * a simple non-side effect load from ESn will do + */ + mmio = x->eq_mmio + idx * XIVE_ESB_PAGE_SIZE; + + /* Ensure the above has returned before we do anything else + * the XIVE store queue is completely empty + */ + load_wait(in_be64(mmio + XIVE_ESB_GET)); +} + +static int64_t __xive_cache_scrub(struct xive *x, enum xive_cache_type ctype, + uint64_t block, uint64_t idx, + bool want_inval, bool want_disable) +{ + uint64_t sreg, sregx, mreg, mregx; + uint64_t mval, sval; + +#ifdef XIVE_CHECK_LOCKS + assert(lock_held_by_me(&x->lock)); +#endif + + /* Workaround a HW bug in XIVE where the scrub completion + * isn't ordered by loads, thus the data might still be + * in a queue and may not have reached coherency. + * + * The workaround is two folds: We force the scrub to also + * invalidate, then after the scrub, we do a dummy cache + * watch which will make the HW read the data back, which + * should be ordered behind all the preceding stores. + * + * Update: For EQs we can do a non-side effect ESB load instead + * which is faster. 
+ */ + want_inval = true; + + switch (ctype) { + case xive_cache_ivc: + sreg = VC_IVC_SCRUB_TRIG; + sregx = X_VC_IVC_SCRUB_TRIG; + mreg = VC_IVC_SCRUB_MASK; + mregx = X_VC_IVC_SCRUB_MASK; + break; + case xive_cache_sbc: + sreg = VC_SBC_SCRUB_TRIG; + sregx = X_VC_SBC_SCRUB_TRIG; + mreg = VC_SBC_SCRUB_MASK; + mregx = X_VC_SBC_SCRUB_MASK; + break; + case xive_cache_eqc: + sreg = VC_EQC_SCRUB_TRIG; + sregx = X_VC_EQC_SCRUB_TRIG; + mreg = VC_EQC_SCRUB_MASK; + mregx = X_VC_EQC_SCRUB_MASK; + break; + case xive_cache_vpc: + sreg = PC_VPC_SCRUB_TRIG; + sregx = X_PC_VPC_SCRUB_TRIG; + mreg = PC_VPC_SCRUB_MASK; + mregx = X_PC_VPC_SCRUB_MASK; + break; + default: + return OPAL_INTERNAL_ERROR; + } + if (ctype == xive_cache_vpc) { + mval = PC_SCRUB_BLOCK_ID | PC_SCRUB_OFFSET; + sval = SETFIELD(PC_SCRUB_BLOCK_ID, idx, block) | + PC_SCRUB_VALID; + } else { + mval = VC_SCRUB_BLOCK_ID | VC_SCRUB_OFFSET; + sval = SETFIELD(VC_SCRUB_BLOCK_ID, idx, block) | + VC_SCRUB_VALID; + } + if (want_inval) + sval |= PC_SCRUB_WANT_INVAL; + if (want_disable) + sval |= PC_SCRUB_WANT_DISABLE; + + __xive_regw(x, mreg, mregx, mval, NULL); + __xive_regw(x, sreg, sregx, sval, NULL); + + /* XXX Add timeout !!! */ + for (;;) { + sval = __xive_regr(x, sreg, sregx, NULL); + if (!(sval & VC_SCRUB_VALID)) + break; + /* Small delay */ + time_wait(100); + } + sync(); + + /* Workaround for HW bug described above (only applies to + * EQC and VPC + */ + if (ctype == xive_cache_eqc) + xive_scrub_workaround_eq(x, block, idx); + else if (ctype == xive_cache_vpc) + xive_scrub_workaround_vp(x, block, idx); + + return 0; +} + +static int64_t xive_ivc_scrub(struct xive *x, uint64_t block, uint64_t idx) +{ + /* IVC has no "want_inval" bit, it always invalidates */ + return __xive_cache_scrub(x, xive_cache_ivc, block, idx, false, false); +} + +static int64_t xive_vpc_scrub(struct xive *x, uint64_t block, uint64_t idx) +{ + return __xive_cache_scrub(x, xive_cache_vpc, block, idx, false, false); +} + +static int64_t xive_vpc_scrub_clean(struct xive *x, uint64_t block, uint64_t idx) +{ + return __xive_cache_scrub(x, xive_cache_vpc, block, idx, true, false); +} + +static int64_t xive_eqc_scrub(struct xive *x, uint64_t block, uint64_t idx) +{ + return __xive_cache_scrub(x, xive_cache_eqc, block, idx, false, false); +} + +#define XIVE_CACHE_WATCH_MAX_RETRIES 10 + +static int64_t __xive_cache_watch(struct xive *x, enum xive_cache_type ctype, + uint64_t block, uint64_t idx, + uint32_t start_dword, uint32_t dword_count, + __be64 *new_data, bool light_watch, + bool synchronous) +{ + uint64_t sreg, sregx, dreg0, dreg0x; + uint64_t dval0, sval, status; + int64_t i; + int retries = 0; + +#ifdef XIVE_CHECK_LOCKS + assert(lock_held_by_me(&x->lock)); +#endif + switch (ctype) { + case xive_cache_eqc: + sreg = VC_EQC_CWATCH_SPEC; + sregx = X_VC_EQC_CWATCH_SPEC; + dreg0 = VC_EQC_CWATCH_DAT0; + dreg0x = X_VC_EQC_CWATCH_DAT0; + sval = SETFIELD(VC_EQC_CWATCH_BLOCKID, idx, block); + break; + case xive_cache_vpc: + sreg = PC_VPC_CWATCH_SPEC; + sregx = X_PC_VPC_CWATCH_SPEC; + dreg0 = PC_VPC_CWATCH_DAT0; + dreg0x = X_PC_VPC_CWATCH_DAT0; + sval = SETFIELD(PC_VPC_CWATCH_BLOCKID, idx, block); + break; + default: + return OPAL_INTERNAL_ERROR; + } + + /* The full bit is in the same position for EQC and VPC */ + if (!light_watch) + sval |= VC_EQC_CWATCH_FULL; + + for (;;) { + /* Write the cache watch spec */ + __xive_regw(x, sreg, sregx, sval, NULL); + + /* Load data0 register to populate the watch */ + dval0 = __xive_regr(x, dreg0, dreg0x, NULL); + + /* If new_data is NULL, this 
is a dummy watch used as a + * workaround for a HW bug + */ + if (!new_data) { + __xive_regw(x, dreg0, dreg0x, dval0, NULL); + return 0; + } + + /* Write the words into the watch facility. We write in reverse + * order in case word 0 is part of it as it must be the last + * one written. + */ + for (i = start_dword + dword_count - 1; i >= start_dword ;i--) { + uint64_t dw = be64_to_cpu(new_data[i - start_dword]); + __xive_regw(x, dreg0 + i * 8, dreg0x + i, dw, NULL); + } + + /* Write data0 register to trigger the update if word 0 wasn't + * written above + */ + if (start_dword > 0) + __xive_regw(x, dreg0, dreg0x, dval0, NULL); + + /* This may not be necessary for light updates (it's possible + * that a sync in sufficient, TBD). Ensure the above is + * complete and check the status of the watch. + */ + status = __xive_regr(x, sreg, sregx, NULL); + + /* Bits FULL and CONFLICT are in the same position in + * EQC and VPC + */ + if (!(status & VC_EQC_CWATCH_FULL) || + !(status & VC_EQC_CWATCH_CONFLICT)) + break; + if (!synchronous) + return OPAL_BUSY; + + if (++retries == XIVE_CACHE_WATCH_MAX_RETRIES) { + xive_err(x, "Reached maximum retries %d when doing " + "a %s cache update\n", retries, + ctype == xive_cache_eqc ? "EQC" : "VPC"); + return OPAL_BUSY; + } + } + + /* Perform a scrub with "want_invalidate" set to false to push the + * cache updates to memory as well + */ + return __xive_cache_scrub(x, ctype, block, idx, false, false); +} + +static int64_t xive_escalation_ive_cache_update(struct xive *x, uint64_t block, + uint64_t idx, struct xive_ive *ive, + bool synchronous) +{ + return __xive_cache_watch(x, xive_cache_eqc, block, idx, + 2, 1, &ive->w, true, synchronous); +} + +static int64_t xive_eqc_cache_update(struct xive *x, uint64_t block, + uint64_t idx, struct xive_eq *eq, + bool synchronous) +{ + return __xive_cache_watch(x, xive_cache_eqc, block, idx, + 0, 4, (__be64 *)eq, false, synchronous); +} + +static int64_t xive_vpc_cache_update(struct xive *x, uint64_t block, + uint64_t idx, struct xive_vp *vp, + bool synchronous) +{ + return __xive_cache_watch(x, xive_cache_vpc, block, idx, + 0, 8, (__be64 *)vp, false, synchronous); +} + +static bool xive_set_vsd(struct xive *x, uint32_t tbl, uint32_t idx, uint64_t v) +{ + /* Set VC version */ + xive_regw(x, VC_VSD_TABLE_ADDR, + SETFIELD(VST_TABLE_SELECT, 0ull, tbl) | + SETFIELD(VST_TABLE_OFFSET, 0ull, idx)); + if (x->last_reg_error) + return false; + xive_regw(x, VC_VSD_TABLE_DATA, v); + if (x->last_reg_error) + return false; + + /* Except for IRQ table, also set PC version */ + if (tbl == VST_TSEL_IRQ) + return true; + + xive_regw(x, PC_VSD_TABLE_ADDR, + SETFIELD(VST_TABLE_SELECT, 0ull, tbl) | + SETFIELD(VST_TABLE_OFFSET, 0ull, idx)); + if (x->last_reg_error) + return false; + xive_regw(x, PC_VSD_TABLE_DATA, v); + if (x->last_reg_error) + return false; + return true; +} + +static bool xive_set_local_tables(struct xive *x) +{ + uint64_t base, i; + + /* These have to be power of 2 sized */ + assert(is_pow2(SBE_SIZE)); + assert(is_pow2(IVT_SIZE)); + + /* All tables set as exclusive */ + base = SETFIELD(VSD_MODE, 0ull, VSD_MODE_EXCLUSIVE); + + /* Set IVT as direct mode */ + if (!xive_set_vsd(x, VST_TSEL_IVT, x->block_id, base | + (((uint64_t)x->ivt_base) & VSD_ADDRESS_MASK) | + SETFIELD(VSD_TSIZE, 0ull, ilog2(IVT_SIZE) - 12))) + return false; + + /* Set SBE as direct mode */ + if (!xive_set_vsd(x, VST_TSEL_SBE, x->block_id, base | + (((uint64_t)x->sbe_base) & VSD_ADDRESS_MASK) | + SETFIELD(VSD_TSIZE, 0ull, ilog2(SBE_SIZE) - 12))) + return 
false; + + /* Set EQDT as indirect mode with 64K subpages */ + if (!xive_set_vsd(x, VST_TSEL_EQDT, x->block_id, base | + (((uint64_t)x->eq_ind_base) & VSD_ADDRESS_MASK) | + VSD_INDIRECT | SETFIELD(VSD_TSIZE, 0ull, 4))) + return false; + + /* Set VPDT as indirect mode with 64K subpages */ + if (!xive_set_vsd(x, VST_TSEL_VPDT, x->block_id, base | + (((uint64_t)x->vp_ind_base) & VSD_ADDRESS_MASK) | + VSD_INDIRECT | SETFIELD(VSD_TSIZE, 0ull, 4))) + return false; + + /* Setup queue overflows */ + for (i = 0; i < VC_QUEUE_OVF_COUNT; i++) { + u64 addr = ((uint64_t)x->q_ovf) + i * PAGE_SIZE; + u64 cfg, sreg, sregx; + + if (!xive_set_vsd(x, VST_TSEL_IRQ, i, base | + (addr & VSD_ADDRESS_MASK) | + SETFIELD(VSD_TSIZE, 0ull, 4))) + return false; + sreg = VC_IRQ_CONFIG_IPI + i * 8; + sregx = X_VC_IRQ_CONFIG_IPI + i; + cfg = __xive_regr(x, sreg, sregx, NULL); + cfg |= VC_IRQ_CONFIG_MEMB_EN; + cfg = SETFIELD(VC_IRQ_CONFIG_MEMB_SZ, cfg, 4); + __xive_regw(x, sreg, sregx, cfg, NULL); + } + + return true; +} + +static bool xive_configure_bars(struct xive *x) +{ + uint64_t chip_id = x->chip_id; + uint64_t val; + + /* IC BAR */ + phys_map_get(chip_id, XIVE_IC, 0, (uint64_t *)&x->ic_base, &x->ic_size); + val = (uint64_t)x->ic_base | CQ_IC_BAR_VALID | CQ_IC_BAR_64K; + x->ic_shift = 16; + + xive_regwx(x, CQ_IC_BAR, val); + if (x->last_reg_error) + return false; + + /* TM BAR, only configure TM1. Note that this has the same address + * for each chip !!! Hence we create a fake chip 0 and use that for + * all phys_map_get(XIVE_TM) calls. + */ + phys_map_get(0, XIVE_TM, 0, (uint64_t *)&x->tm_base, &x->tm_size); + val = (uint64_t)x->tm_base | CQ_TM_BAR_VALID | CQ_TM_BAR_64K; + x->tm_shift = 16; + + xive_regwx(x, CQ_TM1_BAR, val); + if (x->last_reg_error) + return false; + xive_regwx(x, CQ_TM2_BAR, 0); + if (x->last_reg_error) + return false; + + /* PC BAR. Clear first, write mask, then write value */ + phys_map_get(chip_id, XIVE_PC, 0, (uint64_t *)&x->pc_base, &x->pc_size); + xive_regwx(x, CQ_PC_BAR, 0); + if (x->last_reg_error) + return false; + val = ~(x->pc_size - 1) & CQ_PC_BARM_MASK; + xive_regwx(x, CQ_PC_BARM, val); + if (x->last_reg_error) + return false; + val = (uint64_t)x->pc_base | CQ_PC_BAR_VALID; + xive_regwx(x, CQ_PC_BAR, val); + if (x->last_reg_error) + return false; + + /* VC BAR. 
Clear first, write mask, then write value */ + phys_map_get(chip_id, XIVE_VC, 0, (uint64_t *)&x->vc_base, &x->vc_size); + xive_regwx(x, CQ_VC_BAR, 0); + if (x->last_reg_error) + return false; + val = ~(x->vc_size - 1) & CQ_VC_BARM_MASK; + xive_regwx(x, CQ_VC_BARM, val); + if (x->last_reg_error) + return false; + val = (uint64_t)x->vc_base | CQ_VC_BAR_VALID; + xive_regwx(x, CQ_VC_BAR, val); + if (x->last_reg_error) + return false; + + /* Calculate some MMIO bases in the VC BAR */ + x->esb_mmio = x->vc_base; + x->eq_mmio = x->vc_base + (x->vc_size / VC_MAX_SETS) * VC_ESB_SETS; + + /* Print things out */ + xive_dbg(x, "IC: %14p [0x%012llx/%d]\n", x->ic_base, x->ic_size, + x->ic_shift); + xive_dbg(x, "TM: %14p [0x%012llx/%d]\n", x->tm_base, x->tm_size, + x->tm_shift); + xive_dbg(x, "PC: %14p [0x%012llx]\n", x->pc_base, x->pc_size); + xive_dbg(x, "VC: %14p [0x%012llx]\n", x->vc_base, x->vc_size); + + return true; +} + +static void xive_dump_mmio(struct xive *x) +{ + prlog(PR_DEBUG, " CQ_CFG_PB_GEN = %016llx\n", + in_be64(x->ic_base + CQ_CFG_PB_GEN)); + prlog(PR_DEBUG, " CQ_MSGSND = %016llx\n", + in_be64(x->ic_base + CQ_MSGSND)); +} + +static bool xive_config_init(struct xive *x) +{ + uint64_t val; + + /* Configure PC and VC page sizes and disable Linux trigger mode */ + xive_regwx(x, CQ_PBI_CTL, CQ_PBI_PC_64K | CQ_PBI_VC_64K | CQ_PBI_FORCE_TM_LOCAL); + if (x->last_reg_error) + return false; + + /*** The rest can use MMIO ***/ + + /* Enable indirect mode in VC config */ + val = xive_regr(x, VC_GLOBAL_CONFIG); + val |= VC_GCONF_INDIRECT; + xive_regw(x, VC_GLOBAL_CONFIG, val); + + /* Enable indirect mode in PC config */ + val = xive_regr(x, PC_GLOBAL_CONFIG); + val |= PC_GCONF_INDIRECT; + val |= PC_GCONF_CHIPID_OVR; + val = SETFIELD(PC_GCONF_CHIPID, val, x->block_id); + xive_regw(x, PC_GLOBAL_CONFIG, val); + xive_dbg(x, "PC_GLOBAL_CONFIG=%016llx\n", val); + + val = xive_regr(x, PC_TCTXT_CFG); + val |= PC_TCTXT_CFG_BLKGRP_EN | PC_TCTXT_CFG_HARD_CHIPID_BLK; + val |= PC_TCTXT_CHIPID_OVERRIDE; + val |= PC_TCTXT_CFG_TARGET_EN; + val = SETFIELD(PC_TCTXT_CHIPID, val, x->block_id); + val = SETFIELD(PC_TCTXT_INIT_AGE, val, 0x2); + val |= PC_TCTXT_CFG_LGS_EN; + /* Disable pressure relief as we hijack the field in the VPs */ + val &= ~PC_TCTXT_CFG_STORE_ACK; + if (this_cpu()->is_fused_core) + val |= PC_TCTXT_CFG_FUSE_CORE_EN; + else + val &= ~PC_TCTXT_CFG_FUSE_CORE_EN; + xive_regw(x, PC_TCTXT_CFG, val); + xive_dbg(x, "PC_TCTXT_CFG=%016llx\n", val); + + val = xive_regr(x, CQ_CFG_PB_GEN); + /* 1-block-per-chip mode */ + val = SETFIELD(CQ_INT_ADDR_OPT, val, 2); + xive_regw(x, CQ_CFG_PB_GEN, val); + + /* Enable StoreEOI */ + val = xive_regr(x, VC_SBC_CONFIG); + if (XIVE_CAN_STORE_EOI(x)) + val |= VC_SBC_CONF_CPLX_CIST | VC_SBC_CONF_CIST_BOTH; + else + xive_dbg(x, "store EOI is disabled\n"); + + val |= VC_SBC_CONF_NO_UPD_PRF; + xive_regw(x, VC_SBC_CONFIG, val); + + /* Disable block tracking on Nimbus (we may want to enable + * it on Cumulus later). HW Erratas. 
+ */ + val = xive_regr(x, PC_TCTXT_TRACK); + val &= ~PC_TCTXT_TRACK_EN; + xive_regw(x, PC_TCTXT_TRACK, val); + + /* Enable relaxed ordering of trigger forwarding */ + val = xive_regr(x, VC_AIB_TX_ORDER_TAG2); + val |= VC_AIB_TX_ORDER_TAG2_REL_TF; + xive_regw(x, VC_AIB_TX_ORDER_TAG2, val); + + /* Enable new END s and u bits for silent escalate */ + val = xive_regr(x, VC_EQC_CONFIG); + val |= VC_EQC_CONF_ENABLE_END_s_BIT; + val |= VC_EQC_CONF_ENABLE_END_u_BIT; + xive_regw(x, VC_EQC_CONFIG, val); + + /* Disable error reporting in the FIR for info errors + * from the VC. + */ + xive_regw(x, CQ_FIRMASK_OR, CQ_FIR_VC_INFO_ERROR_0_1); + + /* Mask CI Load and Store to bad location, as IPI trigger + * pages may be mapped to user space, and a read on the + * trigger page causes a checkstop + */ + xive_regw(x, CQ_FIRMASK_OR, CQ_FIR_PB_RCMDX_CI_ERR1); + + return true; +} + +static bool xive_setup_set_xlate(struct xive *x) +{ + unsigned int i; + + /* Configure EDT for ESBs (aka IPIs) */ + xive_regw(x, CQ_TAR, CQ_TAR_TBL_AUTOINC | CQ_TAR_TSEL_EDT); + if (x->last_reg_error) + return false; + for (i = 0; i < VC_ESB_SETS; i++) { + xive_regw(x, CQ_TDR, + /* IPI type */ + (1ull << 62) | + /* block ID */ + (((uint64_t)x->block_id) << 48) | + /* offset */ + (((uint64_t)i) << 32)); + if (x->last_reg_error) + return false; + } + + /* Configure EDT for ENDs (aka EQs) */ + for (i = 0; i < VC_END_SETS; i++) { + xive_regw(x, CQ_TDR, + /* EQ type */ + (2ull << 62) | + /* block ID */ + (((uint64_t)x->block_id) << 48) | + /* offset */ + (((uint64_t)i) << 32)); + if (x->last_reg_error) + return false; + } + + /* Configure VDT */ + xive_regw(x, CQ_TAR, CQ_TAR_TBL_AUTOINC | CQ_TAR_TSEL_VDT); + if (x->last_reg_error) + return false; + for (i = 0; i < PC_MAX_SETS; i++) { + xive_regw(x, CQ_TDR, + /* Valid bit */ + (1ull << 63) | + /* block ID */ + (((uint64_t)x->block_id) << 48) | + /* offset */ + (((uint64_t)i) << 32)); + if (x->last_reg_error) + return false; + } + return true; +} + +static bool xive_prealloc_tables(struct xive *x) +{ + uint32_t i, vp_init_count, vp_init_base; + uint32_t pbase, pend; + uint64_t al; + + /* ESB/SBE has 4 entries per byte */ + x->sbe_base = local_alloc(x->chip_id, SBE_SIZE, SBE_SIZE); + if (!x->sbe_base) { + xive_err(x, "Failed to allocate SBE\n"); + return false; + } + /* SBEs are initialized to 0b01 which corresponds to "ints off" */ + memset(x->sbe_base, 0x55, SBE_SIZE); + xive_dbg(x, "SBE at %p size 0x%lx\n", x->sbe_base, SBE_SIZE); + + /* EAS/IVT entries are 8 bytes */ + x->ivt_base = local_alloc(x->chip_id, IVT_SIZE, IVT_SIZE); + if (!x->ivt_base) { + xive_err(x, "Failed to allocate IVT\n"); + return false; + } + /* We clear the entries (non-valid). They will be initialized + * when actually used + */ + memset(x->ivt_base, 0, IVT_SIZE); + xive_dbg(x, "IVT at %p size 0x%lx\n", x->ivt_base, IVT_SIZE); + + /* Indirect EQ table. Limited to one top page. */ + al = ALIGN_UP(XIVE_EQ_TABLE_SIZE, PAGE_SIZE); + if (al > PAGE_SIZE) { + xive_err(x, "EQ indirect table is too big !\n"); + return false; + } + x->eq_ind_base = local_alloc(x->chip_id, al, al); + if (!x->eq_ind_base) { + xive_err(x, "Failed to allocate EQ indirect table\n"); + return false; + } + memset(x->eq_ind_base, 0, al); + xive_dbg(x, "EQi at %p size 0x%llx\n", x->eq_ind_base, al); + x->eq_ind_count = XIVE_EQ_TABLE_SIZE / XIVE_VSD_SIZE; + + /* Indirect VP table. Limited to one top page. 
*/ + al = ALIGN_UP(XIVE_VP_TABLE_SIZE, PAGE_SIZE); + if (al > PAGE_SIZE) { + xive_err(x, "VP indirect table is too big !\n"); + return false; + } + x->vp_ind_base = local_alloc(x->chip_id, al, al); + if (!x->vp_ind_base) { + xive_err(x, "Failed to allocate VP indirect table\n"); + return false; + } + xive_dbg(x, "VPi at %p size 0x%llx\n", x->vp_ind_base, al); + x->vp_ind_count = XIVE_VP_TABLE_SIZE / XIVE_VSD_SIZE; + memset(x->vp_ind_base, 0, al); + + /* Populate/initialize VP/EQs indirect backing */ + vp_init_count = XIVE_HW_VP_COUNT; + vp_init_base = XIVE_HW_VP_BASE; + + /* Allocate pages for some VPs in indirect mode */ + pbase = vp_init_base / VP_PER_PAGE; + pend = (vp_init_base + vp_init_count) / VP_PER_PAGE; + + xive_dbg(x, "Allocating pages %d to %d of VPs (for %d VPs)\n", + pbase, pend, vp_init_count); + for (i = pbase; i <= pend; i++) { + void *page; + u64 vsd; + + /* Indirect entries have a VSD format */ + page = local_alloc(x->chip_id, PAGE_SIZE, PAGE_SIZE); + if (!page) { + xive_err(x, "Failed to allocate VP page\n"); + return false; + } + xive_dbg(x, "VP%d at %p size 0x%x\n", i, page, PAGE_SIZE); + memset(page, 0, PAGE_SIZE); + vsd = ((uint64_t)page) & VSD_ADDRESS_MASK; + + vsd |= SETFIELD(VSD_TSIZE, 0ull, 4); + vsd |= SETFIELD(VSD_MODE, 0ull, VSD_MODE_EXCLUSIVE); + vsd |= VSD_FIRMWARE; + x->vp_ind_base[i] = cpu_to_be64(vsd); + } + + /* Allocate the queue overflow pages */ + x->q_ovf = local_alloc(x->chip_id, VC_QUEUE_OVF_COUNT * PAGE_SIZE, PAGE_SIZE); + if (!x->q_ovf) { + xive_err(x, "Failed to allocate queue overflow\n"); + return false; + } + return true; +} + +static void xive_add_provisioning_properties(void) +{ + __be32 chips[XIVE_MAX_CHIPS]; + uint32_t i, count; + + dt_add_property_cells(xive_dt_node, + "ibm,xive-provision-page-size", PAGE_SIZE); + + count = 1 << xive_chips_alloc_bits; + for (i = 0; i < count; i++) + chips[i] = cpu_to_be32(xive_block_to_chip[i]); + dt_add_property(xive_dt_node, "ibm,xive-provision-chips", + chips, 4 * count); +} + +static void xive_create_mmio_dt_node(struct xive *x) +{ + uint64_t tb = (uint64_t)x->tm_base; + uint32_t stride = 1u << x->tm_shift; + + xive_dt_node = dt_new_addr(dt_root, "interrupt-controller", tb); + assert(xive_dt_node); + + dt_add_property_u64s(xive_dt_node, "reg", + tb + 0 * stride, stride, + tb + 1 * stride, stride, + tb + 2 * stride, stride, + tb + 3 * stride, stride); + + dt_add_property_strings(xive_dt_node, "compatible", + "ibm,opal-xive-pe", "ibm,opal-intc"); + + dt_add_property_cells(xive_dt_node, "ibm,xive-eq-sizes", + 12, 16, 21, 24); + + dt_add_property_cells(xive_dt_node, "ibm,xive-#priorities", + NUM_INT_PRIORITIES); + dt_add_property(xive_dt_node, "single-escalation-support", NULL, 0); + + xive_add_provisioning_properties(); +} + +static void xive_setup_forward_ports(struct xive *x, struct proc_chip *remote_chip) +{ + struct xive *remote_xive = remote_chip->xive; + uint64_t base = SETFIELD(VSD_MODE, 0ull, VSD_MODE_FORWARD); + uint32_t remote_id = remote_xive->block_id; + uint64_t nport; + + /* ESB(SBE), EAS(IVT) and END(EQ) point to the notify port */ + nport = ((uint64_t)remote_xive->ic_base) + (1ul << remote_xive->ic_shift); + if (!xive_set_vsd(x, VST_TSEL_IVT, remote_id, base | nport)) + goto error; + if (!xive_set_vsd(x, VST_TSEL_SBE, remote_id, base | nport)) + goto error; + if (!xive_set_vsd(x, VST_TSEL_EQDT, remote_id, base | nport)) + goto error; + + /* NVT/VPD points to the remote NVT MMIO sets */ + if (!xive_set_vsd(x, VST_TSEL_VPDT, remote_id, + base | ((uint64_t)remote_xive->pc_base) | + 
SETFIELD(VSD_TSIZE, 0ull, ilog2(x->pc_size) - 12))) + goto error; + + return; + + error: + xive_err(x, "Failure configuring forwarding ports\n"); +} + +static void late_init_one_xive(struct xive *x) +{ + struct proc_chip *chip; + + /* We need to setup the cross-chip forward ports. Let's + * iterate all chip and set them up accordingly + */ + for_each_chip(chip) { + /* We skip ourselves or chips without a xive */ + if (chip->xive == x || !chip->xive) + continue; + + /* Setup our forward ports to that chip */ + xive_setup_forward_ports(x, chip); + } +} + +static bool xive_check_ipi_free(struct xive *x, uint32_t irq, uint32_t count) +{ + uint32_t i, idx = GIRQ_TO_IDX(irq); + + for (i = 0; i < count; i++) + if (bitmap_tst_bit(*x->ipi_alloc_map, idx + i)) + return false; + return true; +} + +uint32_t xive_alloc_hw_irqs(uint32_t chip_id, uint32_t count, uint32_t align) +{ + struct proc_chip *chip = get_chip(chip_id); + struct xive *x; + uint32_t base, i; + + assert(chip); + assert(is_pow2(align)); + + x = chip->xive; + assert(x); + + lock(&x->lock); + + /* Allocate the HW interrupts */ + base = x->int_hw_bot - count; + base &= ~(align - 1); + if (base < x->int_ipi_top) { + xive_err(x, + "HW alloc request for %d interrupts aligned to %d failed\n", + count, align); + unlock(&x->lock); + return XIVE_IRQ_ERROR; + } + if (!xive_check_ipi_free(x, base, count)) { + xive_err(x, "HWIRQ boot allocator request overlaps dynamic allocator\n"); + unlock(&x->lock); + return XIVE_IRQ_ERROR; + } + + x->int_hw_bot = base; + + /* Initialize the corresponding IVT entries to sane defaults, + * IE entry is valid, not routed and masked, EQ data is set + * to the GIRQ number. + */ + for (i = 0; i < count; i++) { + struct xive_ive *ive = xive_get_ive(x, base + i); + + ive->w = xive_set_field64(IVE_VALID, 0ul, 1) | + xive_set_field64(IVE_MASKED, 0ul, 1) | + xive_set_field64(IVE_EQ_DATA, 0ul, base + i); + } + + unlock(&x->lock); + return base; +} + +uint32_t xive_alloc_ipi_irqs(uint32_t chip_id, uint32_t count, uint32_t align) +{ + struct proc_chip *chip = get_chip(chip_id); + struct xive *x; + uint32_t base, i; + + assert(chip); + assert(is_pow2(align)); + + x = chip->xive; + assert(x); + + lock(&x->lock); + + /* Allocate the IPI interrupts */ + base = x->int_ipi_top + (align - 1); + base &= ~(align - 1); + if (base >= x->int_hw_bot) { + xive_err(x, + "IPI alloc request for %d interrupts aligned to %d failed\n", + count, align); + unlock(&x->lock); + return XIVE_IRQ_ERROR; + } + if (!xive_check_ipi_free(x, base, count)) { + xive_err(x, "IPI boot allocator request overlaps dynamic allocator\n"); + unlock(&x->lock); + return XIVE_IRQ_ERROR; + } + + x->int_ipi_top = base + count; + + /* Initialize the corresponding IVT entries to sane defaults, + * IE entry is valid, not routed and masked, EQ data is set + * to the GIRQ number. 
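 + * (concretely: IVE_VALID and IVE_MASKED are set and IVE_EQ_DATA holds
 + * the GIRQ, while the EQ block/index stay zero until the interrupt is
 + * actually targetted)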
+ */ + for (i = 0; i < count; i++) { + struct xive_ive *ive = xive_get_ive(x, base + i); + + ive->w = xive_set_field64(IVE_VALID, 0ul, 1) | + xive_set_field64(IVE_MASKED, 0ul, 1) | + xive_set_field64(IVE_EQ_DATA, 0ul, base + i); + } + + unlock(&x->lock); + return base; +} + +void *xive_get_trigger_port(uint32_t girq) +{ + uint32_t idx = GIRQ_TO_IDX(girq); + struct xive *x; + + /* Find XIVE on which the IVE resides */ + x = xive_from_isn(girq); + if (!x) + return NULL; + + if (GIRQ_IS_ESCALATION(girq)) { + /* There is no trigger page for escalation interrupts */ + return NULL; + } else { + /* Make sure it's an IPI on that chip */ + if (girq < x->int_base || + girq >= x->int_ipi_top) + return NULL; + + return x->esb_mmio + idx * XIVE_ESB_PAGE_SIZE; + } +} + +uint64_t xive_get_notify_port(uint32_t chip_id, uint32_t ent) +{ + struct proc_chip *chip = get_chip(chip_id); + struct xive *x; + uint32_t offset = 0; + + assert(chip); + x = chip->xive; + assert(x); + + /* This is where we can assign a different HW queue to a different + * source by offsetting into the cache lines of the notify port + * + * For now we keep it very basic, this will have to be looked at + * again on real HW with some proper performance analysis. + * + * Here's what Florian says on the matter: + * + * << + * The first 2k of the notify port page can all be used for PCIe triggers + * + * However the idea would be that we try to use the first 4 cache lines to + * balance the PCIe Interrupt requests to use the least used snoop buses + * (we went from 2 to 4 snoop buses for P9). snoop 0 is heavily used + * (I think TLBIs are using that in addition to the normal addresses), + * snoop 3 is used for all Int commands, so I think snoop 2 (CL 2 in the + * page) is the least used overall. So we probably should that one for + * the Int commands from PCIe. + * + * In addition, our EAS cache supports hashing to provide "private" cache + * areas for the PHBs in the shared 1k EAS cache. This allows e.g. to avoid + * that one "thrashing" PHB thrashes the EAS cache for everyone, or provide + * a PHB with a private area that would allow high cache hits in case of a + * device using very few interrupts. The hashing is based on the offset within + * the cache line. So using that, you can e.g. set the EAS cache up so that + * IPIs use 512 entries, the x16 PHB uses 256 entries and the x8 PHBs 128 + * entries each - or IPIs using all entries and sharing with PHBs, so PHBs + * would use 512 entries and 256 entries respectively. + * + * This is a tuning we would probably do later in the lab, but as a "prep" + * we should set up the different PHBs such that they are using different + * 8B-aligned offsets within the cache line, so e.g. + * PH4_0 addr 0x100 (CL 2 DW0 + * PH4_1 addr 0x108 (CL 2 DW1) + * PH4_2 addr 0x110 (CL 2 DW2) + * etc. + * >> + * + * I'm using snoop1 for PHB0 and snoop2 for everybody else. 
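 + * Concretely that means PHB0 triggers at offset 0x100 while the other
 + * PHBs and the PSI source each get an 8-byte aligned slot from 0x208
 + * up to 0x230, which is what the switch below implements.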
+ */ + switch(ent) { + case XIVE_HW_SRC_PHBn(0): + offset = 0x100; + break; + case XIVE_HW_SRC_PHBn(1): + offset = 0x208; + break; + case XIVE_HW_SRC_PHBn(2): + offset = 0x210; + break; + case XIVE_HW_SRC_PHBn(3): + offset = 0x218; + break; + case XIVE_HW_SRC_PHBn(4): + offset = 0x220; + break; + case XIVE_HW_SRC_PHBn(5): + offset = 0x228; + break; + case XIVE_HW_SRC_PSI: + offset = 0x230; + break; + default: + assert(false); + return 0; + } + + /* Notify port is the second page of the IC BAR */ + return ((uint64_t)x->ic_base) + (1ul << x->ic_shift) + offset; +} + +/* Manufacture the powerbus packet bits 32:63 */ +__attrconst uint32_t xive_get_notify_base(uint32_t girq) +{ + return (GIRQ_TO_BLK(girq) << 28) | GIRQ_TO_IDX(girq); +} + +static bool xive_get_irq_targetting(uint32_t isn, uint32_t *out_target, + uint8_t *out_prio, uint32_t *out_lirq) +{ + struct xive_ive *ive; + struct xive *x, *eq_x; + struct xive_eq *eq; + uint32_t eq_blk, eq_idx; + uint32_t vp_blk __unused, vp_idx; + uint32_t prio, server; + bool is_escalation = GIRQ_IS_ESCALATION(isn); + + /* Find XIVE on which the IVE resides */ + x = xive_from_isn(isn); + if (!x) + return false; + /* Grab the IVE */ + ive = xive_get_ive(x, isn); + if (!ive) + return false; + if (!xive_get_field64(IVE_VALID, ive->w) && !is_escalation) { + xive_err(x, "ISN %x lead to invalid IVE !\n", isn); + return false; + } + + if (out_lirq) + *out_lirq = xive_get_field64(IVE_EQ_DATA, ive->w); + + /* Find the EQ and its xive instance */ + eq_blk = xive_get_field64(IVE_EQ_BLOCK, ive->w); + eq_idx = xive_get_field64(IVE_EQ_INDEX, ive->w); + eq_x = xive_from_vc_blk(eq_blk); + + /* This can fail if the interrupt hasn't been initialized yet + * but it should also be masked, so fail silently + */ + if (!eq_x) + goto pick_default; + eq = xive_get_eq(eq_x, eq_idx); + if (!eq) + goto pick_default; + + /* XXX Check valid and format 0 */ + + /* No priority conversion, return the actual one ! */ + if (xive_get_field64(IVE_MASKED, ive->w)) + prio = 0xff; + else + prio = xive_get_field32(EQ_W7_F0_PRIORITY, eq->w7); + if (out_prio) + *out_prio = prio; + + vp_blk = xive_get_field32(EQ_W6_NVT_BLOCK, eq->w6); + vp_idx = xive_get_field32(EQ_W6_NVT_INDEX, eq->w6); + server = VP2PIR(vp_blk, vp_idx); + + if (out_target) + *out_target = server; + + xive_vdbg(eq_x, "EQ info for ISN %x: prio=%d, server=0x%x (VP %x/%x)\n", + isn, prio, server, vp_blk, vp_idx); + return true; + +pick_default: + xive_vdbg(eq_x, "EQ info for ISN %x: Using masked defaults\n", isn); + + if (out_prio) + *out_prio = 0xff; + /* Pick a random default, me will be fine ... */ + if (out_target) + *out_target = mfspr(SPR_PIR); + return true; +} + +static inline bool xive_eq_for_target(uint32_t target, uint8_t prio, + uint32_t *out_eq_blk, + uint32_t *out_eq_idx) +{ + struct xive *x; + struct xive_vp *vp; + uint32_t vp_blk, vp_idx; + uint32_t eq_blk, eq_idx; + + if (prio > XIVE_MAX_PRIO) + return false; + + /* Get the VP block/index from the target word */ + if (!xive_decode_vp(target, &vp_blk, &vp_idx, NULL, NULL)) + return false; + + /* Grab the target VP's XIVE */ + x = xive_from_pc_blk(vp_blk); + if (!x) + return false; + + /* Find the VP structrure where we stashed the EQ number */ + vp = xive_get_vp(x, vp_idx); + if (!vp) + return false; + + /* Grab it, it's in the pressure relief interrupt field, + * top 4 bits are the block (word 1). 
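 + * The remaining 28 bits of w1 hold the EQ index, hence the shift by
 + * 28 and the 0x0fffffff mask below.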
+ */ + eq_blk = be32_to_cpu(vp->w1) >> 28; + eq_idx = be32_to_cpu(vp->w1) & 0x0fffffff; + + /* Currently the EQ block and VP block should be the same */ + if (eq_blk != vp_blk) { + xive_err(x, "eq_blk != vp_blk (%d vs. %d) for target 0x%08x/%d\n", + eq_blk, vp_blk, target, prio); + return false; + } + + if (out_eq_blk) + *out_eq_blk = eq_blk; + if (out_eq_idx) + *out_eq_idx = eq_idx + prio; + + return true; +} + +static int64_t xive_set_irq_targetting(uint32_t isn, uint32_t target, + uint8_t prio, uint32_t lirq, + bool synchronous) +{ + struct xive *x; + struct xive_ive *ive, new_ive; + uint32_t eq_blk, eq_idx; + bool is_escalation = GIRQ_IS_ESCALATION(isn); + int64_t rc; + + /* Find XIVE on which the IVE resides */ + x = xive_from_isn(isn); + if (!x) + return OPAL_PARAMETER; + /* Grab the IVE */ + ive = xive_get_ive(x, isn); + if (!ive) + return OPAL_PARAMETER; + if (!xive_get_field64(IVE_VALID, ive->w) && !is_escalation) { + xive_err(x, "ISN %x lead to invalid IVE !\n", isn); + return OPAL_PARAMETER; + } + + lock(&x->lock); + + /* If using emulation mode, fixup prio to the only supported one */ + if (xive_mode == XIVE_MODE_EMU && prio != 0xff) + prio = XIVE_EMULATION_PRIO; + + /* Read existing IVE */ + new_ive = *ive; + + /* Are we masking ? */ + if (prio == 0xff && !is_escalation) { + new_ive.w = xive_set_field64(IVE_MASKED, new_ive.w, 1); + xive_vdbg(x, "ISN %x masked !\n", isn); + + /* Put prio 7 in the EQ */ + prio = XIVE_MAX_PRIO; + } else { + /* Unmasking */ + new_ive.w = xive_set_field64(IVE_MASKED, new_ive.w, 0); + xive_vdbg(x, "ISN %x unmasked !\n", isn); + + /* For normal interrupt sources, keep track of which ones + * we ever enabled since the last reset + */ + if (!is_escalation) + bitmap_set_bit(*x->int_enabled_map, GIRQ_TO_IDX(isn)); + } + + /* If prio isn't 0xff, re-target the IVE. First find the EQ + * correponding to the target + */ + if (prio != 0xff) { + if (!xive_eq_for_target(target, prio, &eq_blk, &eq_idx)) { + xive_err(x, "Can't find EQ for target/prio 0x%x/%d\n", + target, prio); + unlock(&x->lock); + return OPAL_PARAMETER; + } + + /* Try to update it atomically to avoid an intermediary + * stale state + */ + new_ive.w = xive_set_field64(IVE_EQ_BLOCK, new_ive.w, eq_blk); + new_ive.w = xive_set_field64(IVE_EQ_INDEX, new_ive.w, eq_idx); + } + new_ive.w = xive_set_field64(IVE_EQ_DATA, new_ive.w, lirq); + + xive_vdbg(x,"ISN %x routed to eq %x/%x lirq=%08x IVE=%016llx !\n", + isn, eq_blk, eq_idx, lirq, be64_to_cpu(new_ive.w)); + + /* Updating the cache differs between real IVEs and escalation + * IVEs inside an EQ + */ + if (is_escalation) { + rc = xive_escalation_ive_cache_update(x, x->block_id, + GIRQ_TO_IDX(isn), &new_ive, synchronous); + } else { + sync(); + *ive = new_ive; + rc = xive_ivc_scrub(x, x->block_id, GIRQ_TO_IDX(isn)); + } + + unlock(&x->lock); + return rc; +} + +static int64_t xive_source_get_xive(struct irq_source *is __unused, + uint32_t isn, uint16_t *server, + uint8_t *prio) +{ + uint32_t target_id; + + if (xive_get_irq_targetting(isn, &target_id, prio, NULL)) { + *server = target_id << 2; + return OPAL_SUCCESS; + } else + return OPAL_PARAMETER; +} + +static void xive_update_irq_mask(struct xive_src *s, uint32_t idx, bool masked) +{ + void *mmio_base = s->esb_mmio + (1ul << s->esb_shift) * idx; + uint32_t offset; + + /* XXX FIXME: A quick mask/umask can make us shoot an interrupt + * more than once to a queue. 
We need to keep track better + */ + if (s->flags & XIVE_SRC_EOI_PAGE1) + mmio_base += 1ull << (s->esb_shift - 1); + if (masked) + offset = XIVE_ESB_SET_PQ_01; + else + offset = XIVE_ESB_SET_PQ_00; + + in_be64(mmio_base + offset); +} + +static int64_t xive_sync(struct xive *x) +{ + uint64_t r; + void *p; + + lock(&x->lock); + + /* Second 2K range of second page */ + p = x->ic_base + (1 << x->ic_shift) + 0x800; + + /* TODO: Make this more fine grained */ + out_be64(p + (10 << 7), 0); /* Sync OS escalations */ + out_be64(p + (11 << 7), 0); /* Sync Hyp escalations */ + out_be64(p + (12 << 7), 0); /* Sync Redistribution */ + out_be64(p + ( 8 << 7), 0); /* Sync IPI */ + out_be64(p + ( 9 << 7), 0); /* Sync HW */ + +#define SYNC_MASK \ + (VC_EQC_CONF_SYNC_IPI | \ + VC_EQC_CONF_SYNC_HW | \ + VC_EQC_CONF_SYNC_ESC1 | \ + VC_EQC_CONF_SYNC_ESC2 | \ + VC_EQC_CONF_SYNC_REDI) + + /* XXX Add timeout */ + for (;;) { + r = xive_regr(x, VC_EQC_CONFIG); + if ((r & SYNC_MASK) == SYNC_MASK) + break; + cpu_relax(); + } + xive_regw(x, VC_EQC_CONFIG, r & ~SYNC_MASK); + + /* Workaround HW issue, read back before allowing a new sync */ + xive_regr(x, VC_GLOBAL_CONFIG); + + unlock(&x->lock); + + return 0; +} + +static int64_t __xive_set_irq_config(struct irq_source *is, uint32_t girq, + uint64_t vp, uint8_t prio, uint32_t lirq, + bool update_esb, bool sync) +{ + struct xive_src *s = container_of(is, struct xive_src, is); + uint32_t old_target, vp_blk; + u8 old_prio; + int64_t rc; + + /* Grab existing target */ + if (!xive_get_irq_targetting(girq, &old_target, &old_prio, NULL)) + return OPAL_PARAMETER; + + /* Let XIVE configure the EQ. We do the update without the + * synchronous flag, thus a cache update failure will result + * in us returning OPAL_BUSY + */ + rc = xive_set_irq_targetting(girq, vp, prio, lirq, false); + if (rc) + return rc; + + /* Do we need to update the mask ? */ + if (old_prio != prio && (old_prio == 0xff || prio == 0xff)) { + /* The source has special variants of masking/unmasking */ + if (s->orig_ops && s->orig_ops->set_xive) { + /* We don't pass as server on source ops ! Targetting + * is handled by the XIVE + */ + rc = s->orig_ops->set_xive(is, girq, 0, prio); + } else if (update_esb) { + /* Ensure it's enabled/disabled in the source + * controller + */ + xive_update_irq_mask(s, girq - s->esb_base, + prio == 0xff); + } + } + + /* + * Synchronize the source and old target XIVEs to ensure that + * all pending interrupts to the old target have reached their + * respective queue. + * + * WARNING: This assumes the VP and it's queues are on the same + * XIVE instance ! + */ + if (!sync) + return OPAL_SUCCESS; + xive_sync(s->xive); + if (xive_decode_vp(old_target, &vp_blk, NULL, NULL, NULL)) { + struct xive *x = xive_from_pc_blk(vp_blk); + if (x) + xive_sync(x); + } + + return OPAL_SUCCESS; +} + +static int64_t xive_set_irq_config(uint32_t girq, uint64_t vp, uint8_t prio, + uint32_t lirq, bool update_esb) +{ + struct irq_source *is = irq_find_source(girq); + + return __xive_set_irq_config(is, girq, vp, prio, lirq, update_esb, + true); +} + +static int64_t xive_source_set_xive(struct irq_source *is, + uint32_t isn, uint16_t server, uint8_t prio) +{ + /* + * WARNING: There is an inherent race with the use of the + * mask bit in the EAS/IVT. When masked, interrupts are "lost" + * but their P/Q bits are still set. So when unmasking, one has + * to check the P bit and possibly trigger a resend. + * + * We "deal" with it by relying on the fact that the OS will + * lazy disable MSIs. 
Thus mask will only be called if the + * interrupt occurred while already logically masked. Thus + * losing subsequent occurrences is of no consequences, we just + * need to "cleanup" P and Q when unmasking. + * + * This needs to be documented in the OPAL APIs + */ + + /* Unmangle server */ + server >>= 2; + + /* Set logical irq to match isn */ + return __xive_set_irq_config(is, isn, server, prio, isn, true, true); +} + +static void __xive_source_eoi(struct irq_source *is, uint32_t isn) +{ + struct xive_src *s = container_of(is, struct xive_src, is); + uint32_t idx = isn - s->esb_base; + struct xive_ive *ive; + void *mmio_base; + uint64_t eoi_val; + + /* Grab the IVE */ + ive = s->xive->ivt_base; + if (!ive) + return; + ive += GIRQ_TO_IDX(isn); + + /* XXX To fix the races with mask/unmask potentially causing + * multiple queue entries, we need to keep track of EOIs here, + * before the masked test below + */ + + /* If it's invalid or masked, don't do anything */ + if (xive_get_field64(IVE_MASKED, ive->w) || !xive_get_field64(IVE_VALID, ive->w)) + return; + + /* Grab MMIO control address for that ESB */ + mmio_base = s->esb_mmio + (1ull << s->esb_shift) * idx; + + /* If the XIVE supports the new "store EOI facility, use it */ + if (s->flags & XIVE_SRC_STORE_EOI) + out_be64(mmio_base + XIVE_ESB_STORE_EOI, 0); + else { + uint64_t offset; + + /* Otherwise for EOI, we use the special MMIO that does + * a clear of both P and Q and returns the old Q. + * + * This allows us to then do a re-trigger if Q was set + * rather than synthetizing an interrupt in software + */ + if (s->flags & XIVE_SRC_EOI_PAGE1) + mmio_base += 1ull << (s->esb_shift - 1); + + /* LSIs don't need anything special, just EOI */ + if (s->flags & XIVE_SRC_LSI) + in_be64(mmio_base); + else { + offset = XIVE_ESB_SET_PQ_00; + eoi_val = in_be64(mmio_base + offset); + xive_vdbg(s->xive, "ISN: %08x EOI=%llx\n", + isn, eoi_val); + if (!(eoi_val & 1)) + return; + + /* Re-trigger always on page0 or page1 ? 
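 + * (note that mmio_base may already have been bumped to page 1 above
 + * when XIVE_SRC_EOI_PAGE1 is set, so for now the re-trigger store
 + * goes to whichever page the EOI used)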
*/ + out_be64(mmio_base + XIVE_ESB_STORE_TRIGGER, 0); + } + } +} + +static void xive_source_eoi(struct irq_source *is, uint32_t isn) +{ + struct xive_src *s = container_of(is, struct xive_src, is); + + if (s->orig_ops && s->orig_ops->eoi) + s->orig_ops->eoi(is, isn); + else + __xive_source_eoi(is, isn); +} + +static void xive_source_interrupt(struct irq_source *is, uint32_t isn) +{ + struct xive_src *s = container_of(is, struct xive_src, is); + + if (!s->orig_ops || !s->orig_ops->interrupt) + return; + s->orig_ops->interrupt(is, isn); +} + +static uint64_t xive_source_attributes(struct irq_source *is, uint32_t isn) +{ + struct xive_src *s = container_of(is, struct xive_src, is); + + if (!s->orig_ops || !s->orig_ops->attributes) + return IRQ_ATTR_TARGET_LINUX; + return s->orig_ops->attributes(is, isn); +} + +static char *xive_source_name(struct irq_source *is, uint32_t isn) +{ + struct xive_src *s = container_of(is, struct xive_src, is); + + if (!s->orig_ops || !s->orig_ops->name) + return NULL; + return s->orig_ops->name(is, isn); +} + +void xive_source_mask(struct irq_source *is, uint32_t isn) +{ + struct xive_src *s = container_of(is, struct xive_src, is); + + xive_update_irq_mask(s, isn - s->esb_base, true); +} + +static const struct irq_source_ops xive_irq_source_ops = { + .get_xive = xive_source_get_xive, + .set_xive = xive_source_set_xive, + .eoi = xive_source_eoi, + .interrupt = xive_source_interrupt, + .attributes = xive_source_attributes, + .name = xive_source_name, +}; + +static void __xive_register_source(struct xive *x, struct xive_src *s, + uint32_t base, uint32_t count, + uint32_t shift, void *mmio, uint32_t flags, + bool secondary, void *data, + const struct irq_source_ops *orig_ops) +{ + s->esb_base = base; + s->esb_shift = shift; + s->esb_mmio = mmio; + s->flags = flags; + s->orig_ops = orig_ops; + s->xive = x; + s->is.start = base; + s->is.end = base + count; + s->is.ops = &xive_irq_source_ops; + s->is.data = data; + + __register_irq_source(&s->is, secondary); +} + +void xive_register_hw_source(uint32_t base, uint32_t count, uint32_t shift, + void *mmio, uint32_t flags, void *data, + const struct irq_source_ops *ops) +{ + struct xive_src *s; + struct xive *x = xive_from_isn(base); + + assert(x); + + s = malloc(sizeof(struct xive_src)); + assert(s); + __xive_register_source(x, s, base, count, shift, mmio, flags, + false, data, ops); +} + +void xive_register_ipi_source(uint32_t base, uint32_t count, void *data, + const struct irq_source_ops *ops) +{ + struct xive_src *s; + struct xive *x = xive_from_isn(base); + uint32_t base_idx = GIRQ_TO_IDX(base); + void *mmio_base; + uint32_t flags = XIVE_SRC_EOI_PAGE1 | XIVE_SRC_TRIGGER_PAGE; + + assert(x); + assert(base >= x->int_base && (base + count) <= x->int_ipi_top); + + s = malloc(sizeof(struct xive_src)); + assert(s); + + /* Store EOI supported on DD2.0 */ + if (XIVE_CAN_STORE_EOI(x)) + flags |= XIVE_SRC_STORE_EOI; + + /* Callbacks assume the MMIO base corresponds to the first + * interrupt of that source structure so adjust it + */ + mmio_base = x->esb_mmio + (1ul << XIVE_ESB_SHIFT) * base_idx; + __xive_register_source(x, s, base, count, XIVE_ESB_SHIFT, mmio_base, + flags, false, data, ops); +} + +static struct xive *init_one_xive(struct dt_node *np) +{ + struct xive *x; + struct proc_chip *chip; + uint32_t flags; + + x = zalloc(sizeof(struct xive)); + assert(x); + x->x_node = np; + x->xscom_base = dt_get_address(np, 0, NULL); + x->chip_id = dt_get_chip_id(np); + + /* "Allocate" a new block ID for the chip */ + x->block_id = 
xive_block_count++; + assert (x->block_id < XIVE_MAX_CHIPS); + xive_block_to_chip[x->block_id] = x->chip_id; + init_lock(&x->lock); + + chip = get_chip(x->chip_id); + assert(chip); + + /* All supported P9 are revision 2 (Nimbus DD2) */ + switch (chip->type) { + case PROC_CHIP_P9_NIMBUS: + /* We should not be able to boot a P9N DD1 */ + assert((chip->ec_level & 0xf0) != 0x10); + /* Fallthrough */ + case PROC_CHIP_P9_CUMULUS: + case PROC_CHIP_P9P: + break; + default: + assert(0); + } + + xive_dbg(x, "Initializing block ID %d...\n", x->block_id); + chip->xive = x; + + list_head_init(&x->donated_pages); + + /* Base interrupt numbers and allocator init */ + /* XXX Consider allocating half as many ESBs than MMIO space + * so that HW sources land outside of ESB space... + */ + x->int_base = BLKIDX_TO_GIRQ(x->block_id, 0); + x->int_max = x->int_base + XIVE_INT_COUNT; + x->int_hw_bot = x->int_max; + x->int_ipi_top = x->int_base; + + /* Make sure we never hand out "2" as it's reserved for XICS emulation + * IPI returns. Generally start handing out at 0x10 + */ + if (x->int_ipi_top < XIVE_INT_FIRST) + x->int_ipi_top = XIVE_INT_FIRST; + + /* Allocate a few bitmaps */ + x->eq_map = local_alloc(x->chip_id, BITMAP_BYTES(XIVE_EQ_COUNT >> 3), PAGE_SIZE); + assert(x->eq_map); + memset(x->eq_map, 0, BITMAP_BYTES(XIVE_EQ_COUNT >> 3)); + + /* Make sure we don't hand out 0 */ + bitmap_set_bit(*x->eq_map, 0); + + x->int_enabled_map = local_alloc(x->chip_id, BITMAP_BYTES(XIVE_INT_COUNT), PAGE_SIZE); + assert(x->int_enabled_map); + memset(x->int_enabled_map, 0, BITMAP_BYTES(XIVE_INT_COUNT)); + x->ipi_alloc_map = local_alloc(x->chip_id, BITMAP_BYTES(XIVE_INT_COUNT), PAGE_SIZE); + assert(x->ipi_alloc_map); + memset(x->ipi_alloc_map, 0, BITMAP_BYTES(XIVE_INT_COUNT)); + + xive_dbg(x, "Handling interrupts [%08x..%08x]\n", + x->int_base, x->int_max - 1); + + /* Setup the BARs */ + if (!xive_configure_bars(x)) + goto fail; + + /* Some basic global inits such as page sizes etc... */ + if (!xive_config_init(x)) + goto fail; + + /* Configure the set translations for MMIO */ + if (!xive_setup_set_xlate(x)) + goto fail; + + /* Dump some MMIO registers for diagnostics */ + xive_dump_mmio(x); + + /* Pre-allocate a number of tables */ + if (!xive_prealloc_tables(x)) + goto fail; + + /* Configure local tables in VSDs (forward ports will be + * handled later) + */ + if (!xive_set_local_tables(x)) + goto fail; + + /* Register built-in source controllers (aka IPIs) */ + flags = XIVE_SRC_EOI_PAGE1 | XIVE_SRC_TRIGGER_PAGE; + if (XIVE_CAN_STORE_EOI(x)) + flags |= XIVE_SRC_STORE_EOI; + __xive_register_source(x, &x->ipis, x->int_base, + x->int_hw_bot - x->int_base, XIVE_ESB_SHIFT, + x->esb_mmio, flags, true, NULL, NULL); + + /* Register escalation sources */ + __xive_register_source(x, &x->esc_irqs, + MAKE_ESCALATION_GIRQ(x->block_id, 0), + XIVE_EQ_COUNT, XIVE_EQ_SHIFT, + x->eq_mmio, XIVE_SRC_EOI_PAGE1, + false, NULL, NULL); + + + return x; + fail: + xive_err(x, "Initialization failed...\n"); + + /* Should this be fatal ? 
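 + * (for now the assert stays commented out and we simply hand NULL
 + * back to the caller)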
*/ + //assert(false); + return NULL; +} + +/* + * XICS emulation + */ +static void xive_ipi_init(struct xive *x, struct cpu_thread *cpu) +{ + struct xive_cpu_state *xs = cpu->xstate; + + assert(xs); + + __xive_set_irq_config(&x->ipis.is, xs->ipi_irq, cpu->pir, + XIVE_EMULATION_PRIO, xs->ipi_irq, + true, true); +} + +static void xive_ipi_eoi(struct xive *x, uint32_t idx) +{ + uint8_t *mm = x->esb_mmio + idx * XIVE_ESB_PAGE_SIZE; + uint8_t eoi_val; + + /* For EOI, we use the special MMIO that does a clear of both + * P and Q and returns the old Q. + * + * This allows us to then do a re-trigger if Q was set rather + * than synthetizing an interrupt in software + */ + eoi_val = in_8(mm + PAGE_SIZE + XIVE_ESB_SET_PQ_00); + if (eoi_val & 1) { + out_8(mm + XIVE_ESB_STORE_TRIGGER, 0); + } +} + +static void xive_ipi_trigger(struct xive *x, uint32_t idx) +{ + uint8_t *mm = x->esb_mmio + idx * XIVE_ESB_PAGE_SIZE; + + xive_vdbg(x, "Trigger IPI 0x%x\n", idx); + + out_8(mm + XIVE_ESB_STORE_TRIGGER, 0); +} + + +static void xive_reset_enable_thread(struct cpu_thread *c) +{ + struct proc_chip *chip = get_chip(c->chip_id); + struct xive *x = chip->xive; + uint32_t fc, bit; + uint64_t enable; + + /* Get fused core number */ + fc = (c->pir >> 3) & 0xf; + + /* Get bit in register */ + bit = c->pir & 0x3f; + + /* Get which register to access */ + if (fc < 8) { + xive_regw(x, PC_THREAD_EN_REG0_CLR, PPC_BIT(bit)); + xive_regw(x, PC_THREAD_EN_REG0_SET, PPC_BIT(bit)); + + /* + * To guarantee that the TIMA accesses will see the + * latest state of the enable register, add an extra + * load on PC_THREAD_EN_REG. + */ + enable = xive_regr(x, PC_THREAD_EN_REG0); + if (!(enable & PPC_BIT(bit))) + xive_cpu_err(c, "Failed to enable thread\n"); + } else { + xive_regw(x, PC_THREAD_EN_REG1_CLR, PPC_BIT(bit)); + xive_regw(x, PC_THREAD_EN_REG1_SET, PPC_BIT(bit)); + + /* Same as above */ + enable = xive_regr(x, PC_THREAD_EN_REG1); + if (!(enable & PPC_BIT(bit))) + xive_cpu_err(c, "Failed to enable thread\n"); + } +} + +void xive_cpu_callin(struct cpu_thread *cpu) +{ + struct xive_cpu_state *xs = cpu->xstate; + uint8_t old_w2 __unused, w2 __unused; + + if (!xs) + return; + + /* Reset the HW thread context and enable it */ + xive_reset_enable_thread(cpu); + + /* Set VT to 1 */ + old_w2 = in_8(xs->tm_ring1 + TM_QW3_HV_PHYS + TM_WORD2); + out_8(xs->tm_ring1 + TM_QW3_HV_PHYS + TM_WORD2, 0x80); + w2 = in_8(xs->tm_ring1 + TM_QW3_HV_PHYS + TM_WORD2); + + xive_cpu_vdbg(cpu, "Initialized TIMA VP=%x/%x W01=%016llx W2=%02x->%02x\n", + xs->vp_blk, xs->vp_idx, + in_be64(xs->tm_ring1 + TM_QW3_HV_PHYS), + old_w2, w2); +} + +#ifdef XIVE_DEBUG_INIT_CACHE_UPDATES +static bool xive_check_eq_update(struct xive *x, uint32_t idx, struct xive_eq *eq) +{ + struct xive_eq *eq_p = xive_get_eq(x, idx); + struct xive_eq eq2; + + assert(eq_p); + eq2 = *eq_p; + if (memcmp(eq, &eq2, sizeof(struct xive_eq)) != 0) { + xive_err(x, "EQ update mismatch idx %d\n", idx); + xive_err(x, "want: %08x %08x %08x %08x\n", + be32_to_cpu(eq->w0), be32_to_cpu(eq->w1), + be32_to_cpu(eq->w2), be32_to_cpu(eq->w3)); + xive_err(x, " %08x %08x %08x %08x\n", + be32_to_cpu(eq->w4), be32_to_cpu(eq->w5), + be32_to_cpu(eq->w6), be32_to_cpu(eq->w7)); + xive_err(x, "got : %08x %08x %08x %08x\n", + be32_to_cpu(eq2.w0), be32_to_cpu(eq2.w1), + be32_to_cpu(eq2.w2), be32_to_cpu(eq2.w3)); + xive_err(x, " %08x %08x %08x %08x\n", + be32_to_cpu(eq2.w4), be32_to_cpu(eq2.w5), + be32_to_cpu(eq2.w6), be32_to_cpu(eq2.w7)); + return false; + } + return true; +} + +static bool xive_check_vpc_update(struct 
xive *x, uint32_t idx, struct xive_vp *vp) +{ + struct xive_vp *vp_p = xive_get_vp(x, idx); + struct xive_vp vp2; + + assert(vp_p); + vp2 = *vp_p; + if (memcmp(vp, &vp2, sizeof(struct xive_vp)) != 0) { + xive_err(x, "VP update mismatch idx %d\n", idx); + xive_err(x, "want: %08x %08x %08x %08x\n", + be32_to_cpu(vp->w0), be32_to_cpu(vp->w1), + be32_to_cpu(vp->w2), be32_to_cpu(vp->w3)); + xive_err(x, " %08x %08x %08x %08x\n", + be32_to_cpu(vp->w4), be32_to_cpu(vp->w5), + be32_to_cpu(vp->w6), be32_to_cpu(vp->w7)); + xive_err(x, "got : %08x %08x %08x %08x\n", + be32_to_cpu(vp2.w0), be32_to_cpu(vp2.w1), + be32_to_cpu(vp2.w2), be32_to_cpu(vp2.w3)); + xive_err(x, " %08x %08x %08x %08x\n", + be32_to_cpu(vp2.w4), be32_to_cpu(vp2.w5), + be32_to_cpu(vp2.w6), be32_to_cpu(vp2.w7)); + return false; + } + return true; +} +#else +static inline bool xive_check_eq_update(struct xive *x __unused, + uint32_t idx __unused, + struct xive_eq *eq __unused) +{ + return true; +} + +static inline bool xive_check_vpc_update(struct xive *x __unused, + uint32_t idx __unused, + struct xive_vp *vp __unused) +{ + return true; +} +#endif + +#ifdef XIVE_EXTRA_CHECK_INIT_CACHE +static void xive_special_cache_check(struct xive *x, uint32_t blk, uint32_t idx) +{ + struct xive_vp vp = {0}; + uint32_t i; + + for (i = 0; i < 1000; i++) { + struct xive_vp *vp_m = xive_get_vp(x, idx); + + memset(vp_m, (~i) & 0xff, sizeof(*vp_m)); + sync(); + vp.w1 = cpu_to_be32((i << 16) | i); + xive_vpc_cache_update(x, blk, idx, &vp, true); + if (!xive_check_vpc_update(x, idx, &vp)) { + xive_dbg(x, "Test failed at %d iterations\n", i); + return; + } + } + xive_dbg(x, "1000 iterations test success at %d/0x%x\n", blk, idx); +} +#else +static inline void xive_special_cache_check(struct xive *x __unused, + uint32_t blk __unused, + uint32_t idx __unused) +{ +} +#endif + +static void xive_setup_hw_for_emu(struct xive_cpu_state *xs) +{ + struct xive_eq eq; + struct xive_vp vp; + struct xive *x_eq, *x_vp; + + /* Grab the XIVE where the VP resides. It could be different from + * the local chip XIVE if not using block group mode + */ + x_vp = xive_from_pc_blk(xs->vp_blk); + assert(x_vp); + + /* Grab the XIVE where the EQ resides. It will be the same as the + * VP one with the current provisioning but I prefer not making + * this code depend on it. 
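 + * (xive_provision_cpu() below currently sets eq_blk = vp_blk, so
 + * today x_eq and x_vp end up being the same instance)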
+ */ + x_eq = xive_from_vc_blk(xs->eq_blk); + assert(x_eq); + + /* Initialize the structure */ + xive_init_emu_eq(xs->vp_blk, xs->vp_idx, &eq, + xs->eq_page, XIVE_EMULATION_PRIO); + + /* Use the cache watch to write it out */ + lock(&x_eq->lock); + xive_eqc_cache_update(x_eq, xs->eq_blk, xs->eq_idx + XIVE_EMULATION_PRIO, &eq, true); + xive_check_eq_update(x_eq, xs->eq_idx + XIVE_EMULATION_PRIO, &eq); + + /* Extra testing of cache watch & scrub facilities */ + xive_special_cache_check(x_vp, xs->vp_blk, xs->vp_idx); + unlock(&x_eq->lock); + + /* Initialize/enable the VP */ + xive_init_default_vp(&vp, xs->eq_blk, xs->eq_idx); + + /* Use the cache watch to write it out */ + lock(&x_vp->lock); + xive_vpc_cache_update(x_vp, xs->vp_blk, xs->vp_idx, &vp, true); + xive_check_vpc_update(x_vp, xs->vp_idx, &vp); + unlock(&x_vp->lock); +} + +static void xive_init_cpu_emulation(struct xive_cpu_state *xs, + struct cpu_thread *cpu) +{ + struct xive *x; + + /* Setup HW EQ and VP */ + xive_setup_hw_for_emu(xs); + + /* Setup and unmask the IPI */ + xive_ipi_init(xs->xive, cpu); + + /* Initialize remaining state */ + xs->cppr = 0; + xs->mfrr = 0xff; + xs->eqbuf = xive_get_eq_buf(xs->vp_blk, + xs->eq_idx + XIVE_EMULATION_PRIO); + assert(xs->eqbuf); + memset(xs->eqbuf, 0, PAGE_SIZE); + + xs->eqptr = 0; + xs->eqmsk = (PAGE_SIZE / 4) - 1; + xs->eqgen = 0; + x = xive_from_vc_blk(xs->eq_blk); + assert(x); + xs->eqmmio = x->eq_mmio + (xs->eq_idx + XIVE_EMULATION_PRIO) * XIVE_ESB_PAGE_SIZE; +} + +static void xive_init_cpu_exploitation(struct xive_cpu_state *xs) +{ + struct xive_vp vp; + struct xive *x_vp; + + /* Grab the XIVE where the VP resides. It could be different from + * the local chip XIVE if not using block group mode + */ + x_vp = xive_from_pc_blk(xs->vp_blk); + assert(x_vp); + + /* Initialize/enable the VP */ + xive_init_default_vp(&vp, xs->eq_blk, xs->eq_idx); + + /* Use the cache watch to write it out */ + lock(&x_vp->lock); + xive_vpc_cache_update(x_vp, xs->vp_blk, xs->vp_idx, &vp, true); + unlock(&x_vp->lock); + + /* Clenaup remaining state */ + xs->cppr = 0; + xs->mfrr = 0xff; + xs->eqbuf = NULL; + xs->eqptr = 0; + xs->eqmsk = 0; + xs->eqgen = 0; + xs->eqmmio = NULL; +} + +static void xive_configure_ex_special_bar(struct xive *x, struct cpu_thread *c) +{ + uint64_t xa, val; + int64_t rc; + + xive_cpu_vdbg(c, "Setting up special BAR\n"); + xa = XSCOM_ADDR_P9_EX(pir_to_core_id(c->pir), P9X_EX_NCU_SPEC_BAR); + val = (uint64_t)x->tm_base | P9X_EX_NCU_SPEC_BAR_ENABLE; + if (x->tm_shift == 16) + val |= P9X_EX_NCU_SPEC_BAR_256K; + xive_cpu_vdbg(c, "NCU_SPEC_BAR_XA[%08llx]=%016llx\n", xa, val); + rc = xscom_write(c->chip_id, xa, val); + if (rc) { + xive_cpu_err(c, "Failed to setup NCU_SPEC_BAR\n"); + /* XXXX what do do now ? 
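 + * (we just log the error and keep going; the core will presumably be
 + * unable to reach the TIMA through the special BAR)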
*/ + } +} + +void xive_late_init(void) +{ + struct cpu_thread *c; + + prlog(PR_INFO, "SLW: Configuring self-restore for NCU_SPEC_BAR\n"); + for_each_present_cpu(c) { + if(cpu_is_thread0(c)) { + struct proc_chip *chip = get_chip(c->chip_id); + struct xive *x = chip->xive; + uint64_t xa, val, rc; + xa = XSCOM_ADDR_P9_EX(pir_to_core_id(c->pir), + P9X_EX_NCU_SPEC_BAR); + val = (uint64_t)x->tm_base | P9X_EX_NCU_SPEC_BAR_ENABLE; + /* Bail out if wakeup engine has already failed */ + if ( wakeup_engine_state != WAKEUP_ENGINE_PRESENT) { + prlog(PR_ERR, "XIVE p9_stop_api fail detected\n"); + break; + } + rc = p9_stop_save_scom((void *)chip->homer_base, xa, val, + P9_STOP_SCOM_REPLACE, P9_STOP_SECTION_EQ_SCOM); + if (rc) { + xive_cpu_err(c, "p9_stop_api failed for NCU_SPEC_BAR rc=%lld\n", + rc); + wakeup_engine_state = WAKEUP_ENGINE_FAILED; + } + } + } + +} +static void xive_provision_cpu(struct xive_cpu_state *xs, struct cpu_thread *c) +{ + struct xive *x; + void *p; + + /* Physical VPs are pre-allocated */ + xs->vp_blk = PIR2VP_BLK(c->pir); + xs->vp_idx = PIR2VP_IDX(c->pir); + + /* For now we use identical block IDs for VC and PC but that might + * change. We allocate the EQs on the same XIVE as the VP. + */ + xs->eq_blk = xs->vp_blk; + + /* Grab the XIVE where the EQ resides. It could be different from + * the local chip XIVE if not using block group mode + */ + x = xive_from_vc_blk(xs->eq_blk); + assert(x); + + /* Allocate a set of EQs for that VP */ + xs->eq_idx = xive_alloc_eq_set(x, true); + assert(!XIVE_ALLOC_IS_ERR(xs->eq_idx)); + + /* Provision one of the queues. Allocate the memory on the + * chip where the CPU resides + */ + p = local_alloc(c->chip_id, PAGE_SIZE, PAGE_SIZE); + if (!p) { + xive_err(x, "Failed to allocate EQ backing store\n"); + assert(false); + } + xs->eq_page = p; +} + +static void xive_init_cpu(struct cpu_thread *c) +{ + struct proc_chip *chip = get_chip(c->chip_id); + struct xive *x = chip->xive; + struct xive_cpu_state *xs; + + if (!x) + return; + + /* + * Each core pair (EX) needs this special BAR setup to have the + * right powerbus cycle for the TM area (as it has the same address + * on all chips so it's somewhat special). + * + * Because we don't want to bother trying to figure out which core + * of a pair is present we just do the setup for each of them, which + * is harmless. + */ + if (cpu_is_thread0(c) || cpu_is_core_chiplet_primary(c)) + xive_configure_ex_special_bar(x, c); + + /* Initialize the state structure */ + c->xstate = xs = local_alloc(c->chip_id, sizeof(struct xive_cpu_state), 1); + assert(xs); + memset(xs, 0, sizeof(struct xive_cpu_state)); + xs->xive = x; + + init_lock(&xs->lock); + + /* Shortcut to TM HV ring */ + xs->tm_ring1 = x->tm_base + (1u << x->tm_shift); + + /* Allocate an IPI */ + xs->ipi_irq = xive_alloc_ipi_irqs(c->chip_id, 1, 1); + + xive_cpu_vdbg(c, "CPU IPI is irq %08x\n", xs->ipi_irq); + + /* Provision a VP and some EQDs for a physical CPU */ + xive_provision_cpu(xs, c); + + /* Initialize the XICS emulation related fields */ + xive_init_cpu_emulation(xs, c); +} + +static void xive_init_cpu_properties(struct cpu_thread *cpu) +{ + struct cpu_thread *t; + __be32 iprop[8][2] = { }; + uint32_t i; + + assert(cpu_thread_count <= 8); + + if (!cpu->node) + return; + for (i = 0; i < cpu_thread_count; i++) { + t = (i == 0) ? 
cpu : find_cpu_by_pir(cpu->pir + i); + if (!t) + continue; + iprop[i][0] = cpu_to_be32(t->xstate->ipi_irq); + iprop[i][1] = 0; /* Edge */ + } + dt_add_property(cpu->node, "interrupts", iprop, cpu_thread_count * 8); + dt_add_property_cells(cpu->node, "interrupt-parent", get_ics_phandle()); +} + +#ifdef XIVE_DEBUG_DUPLICATES +static uint32_t xive_count_irq_copies(struct xive_cpu_state *xs, uint32_t ref) +{ + uint32_t i, irq; + uint32_t cnt = 0; + uint32_t pos = xs->eqptr; + uint32_t gen = xs->eqgen; + + for (i = 0; i < 0x3fff; i++) { + irq = xs->eqbuf[pos]; + if ((irq >> 31) == gen) + break; + if (irq == ref) + cnt++; + pos = (pos + 1) & xs->eqmsk; + if (!pos) + gen ^= 1; + } + return cnt; +} +#else +static inline uint32_t xive_count_irq_copies(struct xive_cpu_state *xs __unused, + uint32_t ref __unused) +{ + return 1; +} +#endif + +static uint32_t xive_read_eq(struct xive_cpu_state *xs, bool just_peek) +{ + uint32_t cur, copies; + + xive_cpu_vdbg(this_cpu(), " EQ %s... IDX=%x MSK=%x G=%d\n", + just_peek ? "peek" : "read", + xs->eqptr, xs->eqmsk, xs->eqgen); + cur = xs->eqbuf[xs->eqptr]; + xive_cpu_vdbg(this_cpu(), " cur: %08x [%08x %08x %08x ...]\n", cur, + xs->eqbuf[(xs->eqptr + 1) & xs->eqmsk], + xs->eqbuf[(xs->eqptr + 2) & xs->eqmsk], + xs->eqbuf[(xs->eqptr + 3) & xs->eqmsk]); + if ((cur >> 31) == xs->eqgen) + return 0; + + /* Debug: check for duplicate interrupts in the queue */ + copies = xive_count_irq_copies(xs, cur); + if (copies > 1) { + struct xive_eq *eq; + + prerror("Wow ! Dups of irq %x, found %d copies !\n", + cur & 0x7fffffff, copies); + prerror("[%08x > %08x %08x %08x %08x ...] eqgen=%x eqptr=%x jp=%d\n", + xs->eqbuf[(xs->eqptr - 1) & xs->eqmsk], + xs->eqbuf[(xs->eqptr + 0) & xs->eqmsk], + xs->eqbuf[(xs->eqptr + 1) & xs->eqmsk], + xs->eqbuf[(xs->eqptr + 2) & xs->eqmsk], + xs->eqbuf[(xs->eqptr + 3) & xs->eqmsk], + xs->eqgen, xs->eqptr, just_peek); + lock(&xs->xive->lock); + __xive_cache_scrub(xs->xive, xive_cache_eqc, xs->eq_blk, + xs->eq_idx + XIVE_EMULATION_PRIO, + false, false); + unlock(&xs->xive->lock); + eq = xive_get_eq(xs->xive, xs->eq_idx + XIVE_EMULATION_PRIO); + prerror("EQ @%p W0=%08x W1=%08x qbuf @%p\n", + eq, be32_to_cpu(eq->w0), be32_to_cpu(eq->w1), xs->eqbuf); + } + log_add(xs, LOG_TYPE_POPQ, 7, cur, + xs->eqbuf[(xs->eqptr + 1) & xs->eqmsk], + xs->eqbuf[(xs->eqptr + 2) & xs->eqmsk], + copies, + xs->eqptr, xs->eqgen, just_peek); + if (!just_peek) { + xs->eqptr = (xs->eqptr + 1) & xs->eqmsk; + if (xs->eqptr == 0) + xs->eqgen ^= 1; + xs->total_irqs++; + } + return cur & 0x00ffffff; +} + +static uint8_t xive_sanitize_cppr(uint8_t cppr) +{ + if (cppr == 0xff || cppr == 0) + return cppr; + else + return XIVE_EMULATION_PRIO; +} + +static inline uint8_t opal_xive_check_pending(struct xive_cpu_state *xs, + uint8_t cppr) +{ + uint8_t mask = (cppr > 7) ? 
0xff : ~((0x100 >> cppr) - 1); + + return xs->pending & mask; +} + +static void opal_xive_update_cppr(struct xive_cpu_state *xs, u8 cppr) +{ + /* Peform the update */ + xs->cppr = cppr; + out_8(xs->tm_ring1 + TM_QW3_HV_PHYS + TM_CPPR, cppr); + + /* Trigger the IPI if it's still more favored than the CPPR + * + * This can lead to a bunch of spurrious retriggers if the + * IPI is queued up behind other interrupts but that's not + * a big deal and keeps the code simpler + */ + if (xs->mfrr < cppr) + xive_ipi_trigger(xs->xive, GIRQ_TO_IDX(xs->ipi_irq)); +} + +static int64_t opal_xive_eoi(uint32_t xirr) +{ + struct cpu_thread *c = this_cpu(); + struct xive_cpu_state *xs = c->xstate; + uint32_t isn = xirr & 0x00ffffff; + struct xive *src_x; + bool special_ipi = false; + uint8_t cppr; + + /* + * In exploitation mode, this is supported as a way to perform + * an EOI via a FW calls. This can be needed to workaround HW + * implementation bugs for example. In this case interrupts will + * have the OPAL_XIVE_IRQ_EOI_VIA_FW flag set. + * + * In that mode the entire "xirr" argument is interpreterd as + * a global IRQ number (including the escalation bit), ther is + * no split between the top 8 bits for CPPR and bottom 24 for + * the interrupt number. + */ + if (xive_mode != XIVE_MODE_EMU) + return irq_source_eoi(xirr) ? OPAL_SUCCESS : OPAL_PARAMETER; + + if (!xs) + return OPAL_INTERNAL_ERROR; + + xive_cpu_vdbg(c, "EOI xirr=%08x cur_cppr=%d\n", xirr, xs->cppr); + + /* Limit supported CPPR values from OS */ + cppr = xive_sanitize_cppr(xirr >> 24); + + lock(&xs->lock); + + log_add(xs, LOG_TYPE_EOI, 3, isn, xs->eqptr, xs->eqgen); + + /* If this was our magic IPI, convert to IRQ number */ + if (isn == 2) { + isn = xs->ipi_irq; + special_ipi = true; + xive_cpu_vdbg(c, "User EOI for IPI !\n"); + } + + /* First check if we have stuff in that queue. If we do, don't bother with + * doing an EOI on the EQ. Just mark that priority pending, we'll come + * back later. + * + * If/when supporting multiple queues we would have to check them all + * in ascending prio order up to the passed-in CPPR value (exclusive). + */ + if (xive_read_eq(xs, true)) { + xive_cpu_vdbg(c, " isn %08x, skip, queue non empty\n", xirr); + xs->pending |= 1 << XIVE_EMULATION_PRIO; + } +#ifndef EQ_ALWAYS_NOTIFY + else { + uint8_t eoi_val; + + /* Perform EQ level EOI. Only one EQ for now ... + * + * Note: We aren't doing an actual EOI. Instead we are clearing + * both P and Q and will re-check the queue if Q was set. + */ + eoi_val = in_8(xs->eqmmio + XIVE_ESB_SET_PQ_00); + xive_cpu_vdbg(c, " isn %08x, eoi_val=%02x\n", xirr, eoi_val); + + /* Q was set ? Check EQ again after doing a sync to ensure + * ordering. + */ + if (eoi_val & 1) { + sync(); + if (xive_read_eq(xs, true)) + xs->pending |= 1 << XIVE_EMULATION_PRIO; + } + } +#endif + + /* Perform source level EOI if it's not our emulated MFRR IPI + * otherwise EOI ourselves + */ + src_x = xive_from_isn(isn); + if (src_x) { + uint32_t idx = GIRQ_TO_IDX(isn); + + /* Is it an IPI ? 
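 + * (i.e. our emulated MFRR IPI, flagged above when the OS passed the
 + * magic ISN 2)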
*/ + if (special_ipi) { + xive_ipi_eoi(src_x, idx); + } else { + /* Otherwise go through the source mechanism */ + xive_vdbg(src_x, "EOI of IDX %x in EXT range\n", idx); + irq_source_eoi(isn); + } + } else { + xive_cpu_err(c, " EOI unknown ISN %08x\n", isn); + } + + /* Finally restore CPPR */ + opal_xive_update_cppr(xs, cppr); + + xive_cpu_vdbg(c, " pending=0x%x cppr=%d\n", xs->pending, cppr); + + unlock(&xs->lock); + + /* Return whether something is pending that is suitable for + * delivery considering the new CPPR value. This can be done + * without lock as these fields are per-cpu. + */ + return opal_xive_check_pending(xs, cppr) ? 1 : 0; +} + +#ifdef XIVE_CHECK_MISROUTED_IPI +static void xive_dump_eq(uint32_t eq_blk, uint32_t eq_idx) +{ + struct cpu_thread *me = this_cpu(); + struct xive *x; + struct xive_eq *eq; + + x = xive_from_vc_blk(eq_blk); + if (!x) + return; + eq = xive_get_eq(x, eq_idx); + if (!eq) + return; + xive_cpu_err(me, "EQ: %08x %08x %08x %08x (@%p)\n", + eq->w0, eq->w1, eq->w2, eq->w3, eq); + xive_cpu_err(me, " %08x %08x %08x %08x\n", + eq->w4, eq->w5, eq->w6, eq->w7); +} +static int64_t __opal_xive_dump_emu(struct xive_cpu_state *xs, uint32_t pir); + +static bool check_misrouted_ipi(struct cpu_thread *me, uint32_t irq) +{ + struct cpu_thread *c; + + for_each_present_cpu(c) { + struct xive_cpu_state *xs = c->xstate; + struct xive_ive *ive; + uint32_t ipi_target, i, eq_blk, eq_idx; + struct proc_chip *chip; + struct xive *x; + + if (!xs) + continue; + if (irq == xs->ipi_irq) { + xive_cpu_err(me, "misrouted IPI 0x%x, should" + " be aimed at CPU 0x%x\n", + irq, c->pir); + xive_cpu_err(me, " my eq_page=%p eqbuff=%p eq=0x%x/%x\n", + me->xstate->eq_page, me->xstate->eqbuf, + me->xstate->eq_blk, me->xstate->eq_idx + XIVE_EMULATION_PRIO); + xive_cpu_err(me, "tgt eq_page=%p eqbuff=%p eq=0x%x/%x\n", + c->xstate->eq_page, c->xstate->eqbuf, + c->xstate->eq_blk, c->xstate->eq_idx + XIVE_EMULATION_PRIO); + __opal_xive_dump_emu(me->xstate, me->pir); + __opal_xive_dump_emu(c->xstate, c->pir); + if (xive_get_irq_targetting(xs->ipi_irq, &ipi_target, NULL, NULL)) + xive_cpu_err(me, "target=%08x\n", ipi_target); + else + xive_cpu_err(me, "target=???\n"); + /* Find XIVE on which the IVE resides */ + x = xive_from_isn(irq); + if (!x) { + xive_cpu_err(me, "no xive attached\n"); + return true; + } + ive = xive_get_ive(x, irq); + if (!ive) { + xive_cpu_err(me, "no ive attached\n"); + return true; + } + xive_cpu_err(me, "ive=%016llx\n", be64_to_cpu(ive->w)); + for_each_chip(chip) { + x = chip->xive; + if (!x) + continue; + ive = x->ivt_base; + for (i = 0; i < XIVE_INT_COUNT; i++) { + if (xive_get_field64(IVE_EQ_DATA, ive[i].w) == irq) { + eq_blk = xive_get_field64(IVE_EQ_BLOCK, ive[i].w); + eq_idx = xive_get_field64(IVE_EQ_INDEX, ive[i].w); + xive_cpu_err(me, "Found source: 0x%x ive=%016llx\n" + " eq 0x%x/%x", + BLKIDX_TO_GIRQ(x->block_id, i), + be64_to_cpu(ive[i].w), eq_blk, eq_idx); + xive_dump_eq(eq_blk, eq_idx); + } + } + } + return true; + } + } + return false; +} +#else +static inline bool check_misrouted_ipi(struct cpu_thread *c __unused, + uint32_t irq __unused) +{ + return false; +} +#endif + +static int64_t opal_xive_get_xirr(__be32 *out_xirr, bool just_poll) +{ + struct cpu_thread *c = this_cpu(); + struct xive_cpu_state *xs = c->xstate; + uint16_t ack; + uint8_t active, old_cppr; + + if (xive_mode != XIVE_MODE_EMU) + return OPAL_WRONG_STATE; + if (!xs) + return OPAL_INTERNAL_ERROR; + if (!out_xirr) + return OPAL_PARAMETER; + + *out_xirr = 0; + + lock(&xs->lock); + + /* + * Due to 
the need to fetch multiple interrupts from the EQ, we + * need to play some tricks. + * + * The "pending" byte in "xs" keeps track of the priorities that + * are known to have stuff to read (currently we only use one). + * + * It is set in EOI and cleared when consumed here. We don't bother + * looking ahead here, EOI will do it. + * + * We do need to still do an ACK every time in case a higher prio + * exception occurred (though we don't do prio yet... right ? still + * let's get the basic design right !). + * + * Note that if we haven't found anything via ack, but did find + * something in the queue, we must also raise CPPR back. + */ + + xive_cpu_vdbg(c, "get_xirr W01=%016llx W2=%08x\n", + __in_be64(xs->tm_ring1 + TM_QW3_HV_PHYS), + __in_be32(xs->tm_ring1 + TM_QW3_HV_PHYS + 8)); + + /* Perform the HV Ack cycle */ + if (just_poll) + ack = __in_be64(xs->tm_ring1 + TM_QW3_HV_PHYS) >> 48; + else + ack = __in_be16(xs->tm_ring1 + TM_SPC_ACK_HV_REG); + sync(); + xive_cpu_vdbg(c, "get_xirr,%s=%04x\n", just_poll ? "POLL" : "ACK", ack); + + /* Capture the old CPPR which we will return with the interrupt */ + old_cppr = xs->cppr; + + switch(GETFIELD(TM_QW3_NSR_HE, (ack >> 8))) { + case TM_QW3_NSR_HE_NONE: + break; + case TM_QW3_NSR_HE_POOL: + break; + case TM_QW3_NSR_HE_PHYS: + /* Mark pending and keep track of the CPPR update */ + if (!just_poll && (ack & 0xff) != 0xff) { + xs->cppr = ack & 0xff; + xs->pending |= 1 << xs->cppr; + } + break; + case TM_QW3_NSR_HE_LSI: + break; + } + + /* Calculate "active" lines as being the pending interrupts + * masked by the "old" CPPR + */ + active = opal_xive_check_pending(xs, old_cppr); + + log_add(xs, LOG_TYPE_XIRR, 6, old_cppr, xs->cppr, xs->pending, active, + xs->eqptr, xs->eqgen); + +#ifdef XIVE_PERCPU_LOG + { + struct xive_eq *eq; + lock(&xs->xive->lock); + __xive_cache_scrub(xs->xive, xive_cache_eqc, xs->eq_blk, + xs->eq_idx + XIVE_EMULATION_PRIO, + false, false); + unlock(&xs->xive->lock); + eq = xive_get_eq(xs->xive, xs->eq_idx + XIVE_EMULATION_PRIO); + log_add(xs, LOG_TYPE_EQD, 2, be32_to_cpu(eq->w0), be32_to_cpu(eq->w1)); + } +#endif /* XIVE_PERCPU_LOG */ + + xive_cpu_vdbg(c, " cppr=%d->%d pending=0x%x active=%x\n", + old_cppr, xs->cppr, xs->pending, active); + if (active) { + /* Find highest pending */ + uint8_t prio = ffs(active) - 1; + uint32_t val; + + /* XXX Use "p" to select queue */ + val = xive_read_eq(xs, just_poll); + + if (val && val < XIVE_INT_FIRST) + xive_cpu_err(c, "Bogus interrupt 0x%x received !\n", val); + + /* Convert to magic IPI if needed */ + if (val == xs->ipi_irq) + val = 2; + if (check_misrouted_ipi(c, val)) + val = 2; + + *out_xirr = cpu_to_be32((old_cppr << 24) | val); + + /* If we are polling, that's it */ + if (just_poll) + goto skip; + + /* Clear the pending bit. EOI will set it again if needed. We + * could check the queue but that's not really critical here. + */ + xs->pending &= ~(1 << prio); + + /* Spurrious IPB bit, nothing to fetch, bring CPPR back */ + if (!val) + prio = old_cppr; + + /* We could have fetched a pending interrupt left over + * by a previous EOI, so the CPPR might need adjusting + * Also if we had a spurrious one as well. 
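 + * (in the spurious case prio was reset to old_cppr above, so the
 + * single compare below covers both situations)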
+ */ + if (xs->cppr != prio) { + xs->cppr = prio; + out_8(xs->tm_ring1 + TM_QW3_HV_PHYS + TM_CPPR, prio); + xive_cpu_vdbg(c, " adjusted CPPR to %d\n", prio); + } + + if (val) + xive_cpu_vdbg(c, " found irq, prio=%d\n", prio); + + } else { + /* Nothing was active, this is a fluke, restore CPPR */ + opal_xive_update_cppr(xs, old_cppr); + xive_cpu_vdbg(c, " nothing active, restored CPPR to %d\n", + old_cppr); + } + skip: + + log_add(xs, LOG_TYPE_XIRR2, 5, xs->cppr, xs->pending, + be32_to_cpu(*out_xirr), xs->eqptr, xs->eqgen); + xive_cpu_vdbg(c, " returning XIRR=%08x, pending=0x%x\n", + be32_to_cpu(*out_xirr), xs->pending); + + unlock(&xs->lock); + + return OPAL_SUCCESS; +} + +static int64_t opal_xive_set_cppr(uint8_t cppr) +{ + struct cpu_thread *c = this_cpu(); + struct xive_cpu_state *xs = c->xstate; + + if (xive_mode != XIVE_MODE_EMU) + return OPAL_WRONG_STATE; + + /* Limit supported CPPR values */ + cppr = xive_sanitize_cppr(cppr); + + if (!xs) + return OPAL_INTERNAL_ERROR; + xive_cpu_vdbg(c, "CPPR setting to %d\n", cppr); + + lock(&xs->lock); + opal_xive_update_cppr(xs, cppr); + unlock(&xs->lock); + + return OPAL_SUCCESS; +} + +static int64_t opal_xive_set_mfrr(uint32_t cpu, uint8_t mfrr) +{ + struct cpu_thread *c = find_cpu_by_server(cpu); + struct xive_cpu_state *xs; + uint8_t old_mfrr; + + if (xive_mode != XIVE_MODE_EMU) + return OPAL_WRONG_STATE; + if (!c) + return OPAL_PARAMETER; + xs = c->xstate; + if (!xs) + return OPAL_INTERNAL_ERROR; + + lock(&xs->lock); + old_mfrr = xs->mfrr; + xive_cpu_vdbg(c, " Setting MFRR to %x, old is %x\n", mfrr, old_mfrr); + xs->mfrr = mfrr; + if (old_mfrr > mfrr && mfrr < xs->cppr) + xive_ipi_trigger(xs->xive, GIRQ_TO_IDX(xs->ipi_irq)); + unlock(&xs->lock); + + return OPAL_SUCCESS; +} + +static uint64_t xive_convert_irq_flags(uint64_t iflags) +{ + uint64_t oflags = 0; + + if (iflags & XIVE_SRC_STORE_EOI) + oflags |= OPAL_XIVE_IRQ_STORE_EOI; + + /* OPAL_XIVE_IRQ_TRIGGER_PAGE is only meant to be set if + * the interrupt has a *separate* trigger page. 
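 + * When EOIs go to page 1, page 0 of the ESB is free to act as a
 + * dedicated trigger page, hence the check for both flags below.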
+ */ + if ((iflags & XIVE_SRC_EOI_PAGE1) && + (iflags & XIVE_SRC_TRIGGER_PAGE)) + oflags |= OPAL_XIVE_IRQ_TRIGGER_PAGE; + + if (iflags & XIVE_SRC_LSI) + oflags |= OPAL_XIVE_IRQ_LSI; + return oflags; +} + +static int64_t opal_xive_get_irq_info(uint32_t girq, + __be64 *out_flags, + __be64 *out_eoi_page, + __be64 *out_trig_page, + __be32 *out_esb_shift, + __be32 *out_src_chip) +{ + struct irq_source *is = irq_find_source(girq); + struct xive_src *s = container_of(is, struct xive_src, is); + uint32_t idx; + uint64_t mm_base; + uint64_t eoi_page = 0, trig_page = 0; + + if (xive_mode != XIVE_MODE_EXPL) + return OPAL_WRONG_STATE; + if (is == NULL || out_flags == NULL) + return OPAL_PARAMETER; + assert(is->ops == &xive_irq_source_ops); + + if (out_flags) + *out_flags = cpu_to_be64(xive_convert_irq_flags(s->flags)); + + idx = girq - s->esb_base; + + if (out_esb_shift) + *out_esb_shift = cpu_to_be32(s->esb_shift); + + mm_base = (uint64_t)s->esb_mmio + (1ull << s->esb_shift) * idx; + + /* The EOI page can either be the first or second page */ + if (s->flags & XIVE_SRC_EOI_PAGE1) { + uint64_t p1off = 1ull << (s->esb_shift - 1); + eoi_page = mm_base + p1off; + } else + eoi_page = mm_base; + + /* The trigger page, if it exists, is always the first page */ + if (s->flags & XIVE_SRC_TRIGGER_PAGE) + trig_page = mm_base; + + if (out_eoi_page) + *out_eoi_page = cpu_to_be64(eoi_page); + if (out_trig_page) + *out_trig_page = cpu_to_be64(trig_page); + if (out_src_chip) + *out_src_chip = cpu_to_be32(GIRQ_TO_CHIP(girq)); + + return OPAL_SUCCESS; +} + +static int64_t opal_xive_get_irq_config(uint32_t girq, + __be64 *out_vp, + uint8_t *out_prio, + __be32 *out_lirq) +{ + uint32_t vp; + uint32_t lirq; + uint8_t prio; + + if (xive_mode != XIVE_MODE_EXPL) + return OPAL_WRONG_STATE; + + if (xive_get_irq_targetting(girq, &vp, &prio, &lirq)) { + *out_vp = cpu_to_be64(vp); + *out_prio = prio; + *out_lirq = cpu_to_be32(lirq); + return OPAL_SUCCESS; + } else + return OPAL_PARAMETER; +} + +static int64_t opal_xive_set_irq_config(uint32_t girq, + uint64_t vp, + uint8_t prio, + uint32_t lirq) +{ + /* + * This variant is meant for a XIVE-aware OS, thus it will + * *not* affect the ESB state of the interrupt. If used with + * a prio of FF, the IVT/EAS will be mased. In that case the + * races have to be handled by the OS. + * + * The exception to this rule is interrupts for which masking + * and unmasking is handled by firmware. In that case the ESB + * state isn't under OS control and will be dealt here. This + * is currently only the case of LSIs and on P9 DD1.0 only so + * isn't an issue. + */ + + if (xive_mode != XIVE_MODE_EXPL) + return OPAL_WRONG_STATE; + + return xive_set_irq_config(girq, vp, prio, lirq, false); +} + +static int64_t opal_xive_get_queue_info(uint64_t vp, uint32_t prio, + __be64 *out_qpage, + __be64 *out_qsize, + __be64 *out_qeoi_page, + __be32 *out_escalate_irq, + __be64 *out_qflags) +{ + uint32_t blk, idx; + struct xive *x; + struct xive_eq *eq; + + if (xive_mode != XIVE_MODE_EXPL) + return OPAL_WRONG_STATE; + + if (!xive_eq_for_target(vp, prio, &blk, &idx)) + return OPAL_PARAMETER; + + x = xive_from_vc_blk(blk); + if (!x) + return OPAL_PARAMETER; + + eq = xive_get_eq(x, idx); + if (!eq) + return OPAL_PARAMETER; + + if (out_escalate_irq) { + uint32_t esc_idx = idx; + + /* If escalations are routed to a single queue, fix up + * the escalation interrupt number here. 
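 + * (EQ_W0_UNCOND_ESCALATE is the "u" bit set by
 + * xive_setup_silent_gather(); with it set every priority escalates
 + * through the prio 7 gather queue, so the escalation interrupt must
 + * point at that queue's index)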
+ */ + if (xive_get_field32(EQ_W0_UNCOND_ESCALATE, eq->w0)) + esc_idx |= XIVE_ESCALATION_PRIO; + + *out_escalate_irq = + cpu_to_be32(MAKE_ESCALATION_GIRQ(blk, esc_idx)); + } + + /* If this is a single-escalation gather queue, that's all + * there is to return + */ + if (xive_get_field32(EQ_W0_SILENT_ESCALATE, eq->w0)) { + if (out_qflags) + *out_qflags = 0; + if (out_qpage) + *out_qpage = 0; + if (out_qsize) + *out_qsize = 0; + if (out_qeoi_page) + *out_qeoi_page = 0; + return OPAL_SUCCESS; + } + + if (out_qpage) { + if (xive_get_field32(EQ_W0_ENQUEUE, eq->w0)) + *out_qpage = cpu_to_be64(((uint64_t)xive_get_field32(EQ_W2_OP_DESC_HI, eq->w2) << 32) | be32_to_cpu(eq->w3)); + else + *out_qpage = 0; + } + if (out_qsize) { + if (xive_get_field32(EQ_W0_ENQUEUE, eq->w0)) + *out_qsize = cpu_to_be64(xive_get_field32(EQ_W0_QSIZE, eq->w0) + 12); + else + *out_qsize = 0; + } + if (out_qeoi_page) { + *out_qeoi_page = + cpu_to_be64((uint64_t)x->eq_mmio + idx * XIVE_ESB_PAGE_SIZE); + } + if (out_qflags) { + *out_qflags = 0; + if (xive_get_field32(EQ_W0_VALID, eq->w0)) + *out_qflags |= cpu_to_be64(OPAL_XIVE_EQ_ENABLED); + if (xive_get_field32(EQ_W0_UCOND_NOTIFY, eq->w0)) + *out_qflags |= cpu_to_be64(OPAL_XIVE_EQ_ALWAYS_NOTIFY); + if (xive_get_field32(EQ_W0_ESCALATE_CTL, eq->w0)) + *out_qflags |= cpu_to_be64(OPAL_XIVE_EQ_ESCALATE); + } + + return OPAL_SUCCESS; +} + +static void xive_cleanup_eq(struct xive_eq *eq) +{ + eq->w0 = xive_set_field32(EQ_W0_FIRMWARE, 0, xive_get_field32(EQ_W0_FIRMWARE, eq->w0)); + eq->w1 = cpu_to_be32(EQ_W1_ESe_Q | EQ_W1_ESn_Q); + eq->w2 = eq->w3 = eq->w4 = eq->w5 = eq->w6 = eq->w7 = 0; +} + +static int64_t opal_xive_set_queue_info(uint64_t vp, uint32_t prio, + uint64_t qpage, + uint64_t qsize, + uint64_t qflags) +{ + uint32_t blk, idx; + struct xive *x; + struct xive_eq *old_eq; + struct xive_eq eq; + uint32_t vp_blk, vp_idx; + bool group; + int64_t rc; + + if (xive_mode != XIVE_MODE_EXPL) + return OPAL_WRONG_STATE; + if (!xive_eq_for_target(vp, prio, &blk, &idx)) + return OPAL_PARAMETER; + + x = xive_from_vc_blk(blk); + if (!x) + return OPAL_PARAMETER; + + old_eq = xive_get_eq(x, idx); + if (!old_eq) + return OPAL_PARAMETER; + + /* If this is a silent escalation queue, it cannot be + * configured directly + */ + if (xive_get_field32(EQ_W0_SILENT_ESCALATE, old_eq->w0)) + return OPAL_PARAMETER; + + /* This shouldn't fail or xive_eq_for_target would have + * failed already + */ + if (!xive_decode_vp(vp, &vp_blk, &vp_idx, NULL, &group)) + return OPAL_PARAMETER; + + /* + * Make a local copy which we will later try to commit using + * the cache watch facility + */ + eq = *old_eq; + + if (qflags & OPAL_XIVE_EQ_ENABLED) { + switch(qsize) { + /* Supported sizes */ + case 12: + case 16: + case 21: + case 24: + eq.w3 = cpu_to_be32(((uint64_t)qpage) & EQ_W3_OP_DESC_LO); + eq.w2 = cpu_to_be32((((uint64_t)qpage) >> 32) & EQ_W2_OP_DESC_HI); + eq.w0 = xive_set_field32(EQ_W0_ENQUEUE, eq.w0, 1); + eq.w0 = xive_set_field32(EQ_W0_QSIZE, eq.w0, qsize - 12); + break; + case 0: + eq.w2 = eq.w3 = 0; + eq.w0 = xive_set_field32(EQ_W0_ENQUEUE, eq.w0, 0); + break; + default: + return OPAL_PARAMETER; + } + + /* Ensure the priority and target are correctly set (they will + * not be right after allocation + */ + eq.w6 = xive_set_field32(EQ_W6_NVT_BLOCK, 0, vp_blk) | + xive_set_field32(EQ_W6_NVT_INDEX, 0, vp_idx); + eq.w7 = xive_set_field32(EQ_W7_F0_PRIORITY, 0, prio); + /* XXX Handle group i bit when needed */ + + /* Always notify flag */ + if (qflags & OPAL_XIVE_EQ_ALWAYS_NOTIFY) + eq.w0 = 
xive_set_field32(EQ_W0_UCOND_NOTIFY, eq.w0, 1); + else + eq.w0 = xive_set_field32(EQ_W0_UCOND_NOTIFY, eq.w0, 0); + + /* Escalation flag */ + if (qflags & OPAL_XIVE_EQ_ESCALATE) + eq.w0 = xive_set_field32(EQ_W0_ESCALATE_CTL, eq.w0, 1); + else + eq.w0 = xive_set_field32(EQ_W0_ESCALATE_CTL, eq.w0, 0); + + /* Unconditionally clear the current queue pointer, set + * generation to 1 and disable escalation interrupts. + */ + eq.w1 = xive_set_field32(EQ_W1_GENERATION, 0, 1) | + xive_set_field32(EQ_W1_ES, 0, xive_get_field32(EQ_W1_ES, old_eq->w1)); + + /* Enable. We always enable backlog for an enabled queue + * otherwise escalations won't work. + */ + eq.w0 = xive_set_field32(EQ_W0_VALID, eq.w0, 1); + eq.w0 = xive_set_field32(EQ_W0_BACKLOG, eq.w0, 1); + } else + xive_cleanup_eq(&eq); + + /* Update EQ, non-synchronous */ + lock(&x->lock); + rc = xive_eqc_cache_update(x, blk, idx, &eq, false); + unlock(&x->lock); + + return rc; +} + +static int64_t opal_xive_get_queue_state(uint64_t vp, uint32_t prio, + __be32 *out_qtoggle, + __be32 *out_qindex) +{ + uint32_t blk, idx; + struct xive *x; + struct xive_eq *eq; + int64_t rc; + + if (xive_mode != XIVE_MODE_EXPL) + return OPAL_WRONG_STATE; + + if (!out_qtoggle || !out_qindex || + !xive_eq_for_target(vp, prio, &blk, &idx)) + return OPAL_PARAMETER; + + x = xive_from_vc_blk(blk); + if (!x) + return OPAL_PARAMETER; + + eq = xive_get_eq(x, idx); + if (!eq) + return OPAL_PARAMETER; + + /* Scrub the queue */ + lock(&x->lock); + rc = xive_eqc_scrub(x, blk, idx); + unlock(&x->lock); + if (rc) + return rc; + + /* We don't do disable queues */ + if (!xive_get_field32(EQ_W0_VALID, eq->w0)) + return OPAL_WRONG_STATE; + + *out_qtoggle = cpu_to_be32(xive_get_field32(EQ_W1_GENERATION, eq->w1)); + *out_qindex = cpu_to_be32(xive_get_field32(EQ_W1_PAGE_OFF, eq->w1)); + + return OPAL_SUCCESS; +} + +static int64_t opal_xive_set_queue_state(uint64_t vp, uint32_t prio, + uint32_t qtoggle, uint32_t qindex) +{ + uint32_t blk, idx; + struct xive *x; + struct xive_eq *eq, new_eq; + int64_t rc; + + if (xive_mode != XIVE_MODE_EXPL) + return OPAL_WRONG_STATE; + + if (!xive_eq_for_target(vp, prio, &blk, &idx)) + return OPAL_PARAMETER; + + x = xive_from_vc_blk(blk); + if (!x) + return OPAL_PARAMETER; + + eq = xive_get_eq(x, idx); + if (!eq) + return OPAL_PARAMETER; + + /* We don't do disable queues */ + if (!xive_get_field32(EQ_W0_VALID, eq->w0)) + return OPAL_WRONG_STATE; + + new_eq = *eq; + + new_eq.w1 = xive_set_field32(EQ_W1_GENERATION, new_eq.w1, qtoggle); + new_eq.w1 = xive_set_field32(EQ_W1_PAGE_OFF, new_eq.w1, qindex); + + lock(&x->lock); + rc = xive_eqc_cache_update(x, blk, idx, &new_eq, false); + unlock(&x->lock); + + return rc; +} + +static int64_t opal_xive_donate_page(uint32_t chip_id, uint64_t addr) +{ + struct proc_chip *c = get_chip(chip_id); + struct list_node *n; + + if (xive_mode != XIVE_MODE_EXPL) + return OPAL_WRONG_STATE; + if (!c) + return OPAL_PARAMETER; + if (!c->xive) + return OPAL_PARAMETER; + if (addr & 0xffff) + return OPAL_PARAMETER; + + n = (struct list_node *)addr; + lock(&c->xive->lock); + list_add(&c->xive->donated_pages, n); + unlock(&c->xive->lock); + + return OPAL_SUCCESS; +} + +static int64_t opal_xive_get_vp_info(uint64_t vp_id, + __be64 *out_flags, + __be64 *out_cam_value, + __be64 *out_report_cl_pair, + __be32 *out_chip_id) +{ + struct xive *x; + struct xive_vp *vp; + uint32_t blk, idx; + bool group; + + if (!xive_decode_vp(vp_id, &blk, &idx, NULL, &group)) + return OPAL_PARAMETER; + /* We don't do groups yet */ + if (group) + return 
OPAL_PARAMETER; + x = xive_from_pc_blk(blk); + if (!x) + return OPAL_PARAMETER; + vp = xive_get_vp(x, idx); + if (!vp) + return OPAL_PARAMETER; + + if (out_flags) { + uint32_t eq_blk, eq_idx; + struct xive_eq *eq; + struct xive *eq_x; + *out_flags = 0; + + /* We would like to a way to stash a SW bit in the VP to + * know whether silent escalation is enabled or not, but + * unlike what happens with EQs, the PC cache watch doesn't + * implement the reserved bit in the VPs... so we have to go + * look at EQ 7 instead. + */ + /* Grab EQ for prio 7 to check for silent escalation */ + if (!xive_eq_for_target(vp_id, XIVE_ESCALATION_PRIO, + &eq_blk, &eq_idx)) + return OPAL_PARAMETER; + + eq_x = xive_from_vc_blk(eq_blk); + if (!eq_x) + return OPAL_PARAMETER; + + eq = xive_get_eq(x, eq_idx); + if (!eq) + return OPAL_PARAMETER; + if (xive_get_field32(VP_W0_VALID, vp->w0)) + *out_flags |= cpu_to_be64(OPAL_XIVE_VP_ENABLED); + if (xive_get_field32(EQ_W0_SILENT_ESCALATE, eq->w0)) + *out_flags |= cpu_to_be64(OPAL_XIVE_VP_SINGLE_ESCALATION); + } + + if (out_cam_value) + *out_cam_value = cpu_to_be64((blk << NVT_SHIFT) | idx); + + if (out_report_cl_pair) { + *out_report_cl_pair = cpu_to_be64(((uint64_t)(be32_to_cpu(vp->w6) & 0x0fffffff)) << 32); + *out_report_cl_pair |= cpu_to_be64(be32_to_cpu(vp->w7) & 0xffffff00); + } + + if (out_chip_id) + *out_chip_id = cpu_to_be32(xive_block_to_chip[blk]); + + return OPAL_SUCCESS; +} + +static int64_t xive_setup_silent_gather(uint64_t vp_id, bool enable) +{ + uint32_t blk, idx, i; + struct xive_eq *eq_orig; + struct xive_eq eq; + struct xive *x; + int64_t rc; + + /* Get base EQ block */ + if (!xive_eq_for_target(vp_id, 0, &blk, &idx)) + return OPAL_PARAMETER; + x = xive_from_vc_blk(blk); + if (!x) + return OPAL_PARAMETER; + + /* Grab prio 7 */ + eq_orig = xive_get_eq(x, idx + XIVE_ESCALATION_PRIO); + if (!eq_orig) + return OPAL_PARAMETER; + + /* If trying to enable silent gather, make sure prio 7 is not + * already enabled as a normal queue + */ + if (enable && xive_get_field32(EQ_W0_VALID, eq_orig->w0) && + !xive_get_field32(EQ_W0_SILENT_ESCALATE, eq_orig->w0)) { + xive_dbg(x, "Attempt at enabling silent gather but" + " prio 7 queue already in use\n"); + return OPAL_PARAMETER; + } + + eq = *eq_orig; + + if (enable) { + /* W0: Enabled and "s" set, no other bit */ + eq.w0 = xive_set_field32(EQ_W0_FIRMWARE, 0, xive_get_field32(EQ_W0_FIRMWARE, eq.w0)) | + xive_set_field32(EQ_W0_VALID, 0, 1) | + xive_set_field32(EQ_W0_SILENT_ESCALATE, 0, 1) | + xive_set_field32(EQ_W0_ESCALATE_CTL, 0, 1) | + xive_set_field32(EQ_W0_BACKLOG, 0, 1); + + /* W1: Mark ESn as 01, ESe as 00 */ + eq.w1 = xive_set_field32(EQ_W1_ESn_P, eq.w1, 0); + eq.w1 = xive_set_field32(EQ_W1_ESn_Q, eq.w1, 1); + eq.w1 = xive_set_field32(EQ_W1_ESe, eq.w1, 0); + } else if (xive_get_field32(EQ_W0_SILENT_ESCALATE, eq.w0)) + xive_cleanup_eq(&eq); + + if (!memcmp(eq_orig, &eq, sizeof(eq))) + rc = 0; + else + rc = xive_eqc_cache_update(x, blk, idx + XIVE_ESCALATION_PRIO, + &eq, false); + if (rc) + return rc; + + /* Mark/unmark all other prios with the new "u" bit and update + * escalation + */ + for (i = 0; i < NUM_INT_PRIORITIES; i++) { + if (i == XIVE_ESCALATION_PRIO) + continue; + eq_orig = xive_get_eq(x, idx + i); + if (!eq_orig) + continue; + eq = *eq_orig; + if (enable) { + /* Set new "u" bit */ + eq.w0 = xive_set_field32(EQ_W0_UNCOND_ESCALATE, eq.w0, 1); + + /* Re-route escalation interrupt (previous + * route is lost !) 
to the gather queue + */ + eq.w4 = xive_set_field32(EQ_W4_ESC_EQ_BLOCK, eq.w4, blk); + eq.w4 = xive_set_field32(EQ_W4_ESC_EQ_INDEX, eq.w4, idx + XIVE_ESCALATION_PRIO); + } else if (xive_get_field32(EQ_W0_UNCOND_ESCALATE, eq.w0)) { + /* Clear the "u" bit, disable escalations if it was set */ + eq.w0 = xive_set_field32(EQ_W0_UNCOND_ESCALATE, eq.w0, 0); + eq.w0 = xive_set_field32(EQ_W0_ESCALATE_CTL, eq.w0, 0); + } + if (!memcmp(eq_orig, &eq, sizeof(eq))) + continue; + rc = xive_eqc_cache_update(x, blk, idx + i, &eq, false); + if (rc) + break; + } + + return rc; +} + +static int64_t opal_xive_set_vp_info(uint64_t vp_id, + uint64_t flags, + uint64_t report_cl_pair) +{ + struct xive *x; + struct xive_vp *vp, vp_new; + uint32_t blk, idx; + bool group; + int64_t rc; + + if (!xive_decode_vp(vp_id, &blk, &idx, NULL, &group)) + return OPAL_PARAMETER; + /* We don't do groups yet */ + if (group) + return OPAL_PARAMETER; + if (report_cl_pair & 0xff) + return OPAL_PARAMETER; + x = xive_from_pc_blk(blk); + if (!x) + return OPAL_PARAMETER; + vp = xive_get_vp(x, idx); + if (!vp) + return OPAL_PARAMETER; + + lock(&x->lock); + + vp_new = *vp; + if (flags & OPAL_XIVE_VP_ENABLED) { + vp_new.w0 = xive_set_field32(VP_W0_VALID, vp_new.w0, 1); + vp_new.w6 = cpu_to_be32(report_cl_pair >> 32); + vp_new.w7 = cpu_to_be32(report_cl_pair & 0xffffffff); + + if (flags & OPAL_XIVE_VP_SINGLE_ESCALATION) + rc = xive_setup_silent_gather(vp_id, true); + else + rc = xive_setup_silent_gather(vp_id, false); + } else { + vp_new.w0 = vp_new.w6 = vp_new.w7 = 0; + rc = xive_setup_silent_gather(vp_id, false); + } + + if (rc) { + if (rc != OPAL_BUSY) + xive_dbg(x, "Silent gather setup failed with err %lld\n", rc); + goto bail; + } + + rc = xive_vpc_cache_update(x, blk, idx, &vp_new, false); + if (rc) + goto bail; + + /* When disabling, we scrub clean (invalidate the entry) so + * we can avoid cache ops in alloc/free + */ + if (!(flags & OPAL_XIVE_VP_ENABLED)) + xive_vpc_scrub_clean(x, blk, idx); + +bail: + unlock(&x->lock); + return rc; +} + +static int64_t opal_xive_get_vp_state(uint64_t vp_id, __be64 *out_state) +{ + struct xive *x; + struct xive_vp *vp; + uint32_t blk, idx; + int64_t rc; + bool group; + + if (!out_state || !xive_decode_vp(vp_id, &blk, &idx, NULL, &group)) + return OPAL_PARAMETER; + if (group) + return OPAL_PARAMETER; + x = xive_from_pc_blk(blk); + if (!x) + return OPAL_PARAMETER; + vp = xive_get_vp(x, idx); + if (!vp) + return OPAL_PARAMETER; + + /* Scrub the vp */ + lock(&x->lock); + rc = xive_vpc_scrub(x, blk, idx); + unlock(&x->lock); + if (rc) + return rc; + + if (!xive_get_field32(VP_W0_VALID, vp->w0)) + return OPAL_WRONG_STATE; + + /* + * Return word4 and word5 which contain the saved HW thread + * context. The IPB register is all we care for now on P9. 
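+ *
+ * The 64-bit value built below places w4 in the upper 32 bits
+ * and w5 in the lower 32 bits (both converted from their
+ * big-endian in-memory form first).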
+ */ + *out_state = cpu_to_be64((((uint64_t)be32_to_cpu(vp->w4)) << 32) | be32_to_cpu(vp->w5)); + + return OPAL_SUCCESS; +} + +static void xive_cleanup_cpu_tima(struct cpu_thread *c) +{ + struct xive_cpu_state *xs = c->xstate; + struct xive *x = xs->xive; + void *ind_tm_base = x->ic_base + (4 << x->ic_shift); + uint8_t old_w2 __unused, w2 __unused; + + /* Reset the HW context */ + xive_reset_enable_thread(c); + + /* Setup indirect access to the corresponding thread */ + xive_regw(x, PC_TCTXT_INDIR0, + PC_TCTXT_INDIR_VALID | + SETFIELD(PC_TCTXT_INDIR_THRDID, 0ull, c->pir & 0xff)); + + /* Workaround for HW issue: Need to read the above register + * back before doing the subsequent accesses + */ + xive_regr(x, PC_TCTXT_INDIR0); + + /* Set VT to 1 */ + old_w2 = in_8(ind_tm_base + TM_QW3_HV_PHYS + TM_WORD2); + out_8(ind_tm_base + TM_QW3_HV_PHYS + TM_WORD2, 0x80); + w2 = in_8(ind_tm_base + TM_QW3_HV_PHYS + TM_WORD2); + + /* Dump HV state */ + xive_cpu_vdbg(c, "[reset] VP TIMA VP=%x/%x W01=%016llx W2=%02x->%02x\n", + xs->vp_blk, xs->vp_idx, + in_be64(ind_tm_base + TM_QW3_HV_PHYS), + old_w2, w2); + + /* Reset indirect access */ + xive_regw(x, PC_TCTXT_INDIR0, 0); +} + +static int64_t xive_vc_ind_cache_kill(struct xive *x, uint64_t type) +{ + uint64_t val; + + /* We clear the whole thing */ + xive_regw(x, VC_AT_MACRO_KILL_MASK, 0); + xive_regw(x, VC_AT_MACRO_KILL, VC_KILL_VALID | + SETFIELD(VC_KILL_TYPE, 0ull, type)); + + /* XXX SIMICS problem ? */ + if (chip_quirk(QUIRK_SIMICS)) + return 0; + + /* XXX Add timeout */ + for (;;) { + val = xive_regr(x, VC_AT_MACRO_KILL); + if (!(val & VC_KILL_VALID)) + break; + } + return 0; +} + +static int64_t xive_pc_ind_cache_kill(struct xive *x) +{ + uint64_t val; + + /* We clear the whole thing */ + xive_regw(x, PC_AT_KILL_MASK, 0); + xive_regw(x, PC_AT_KILL, PC_AT_KILL_VALID); + + /* XXX SIMICS problem ? */ + if (chip_quirk(QUIRK_SIMICS)) + return 0; + + /* XXX Add timeout */ + for (;;) { + val = xive_regr(x, PC_AT_KILL); + if (!(val & PC_AT_KILL_VALID)) + break; + } + return 0; +} + +static void xive_cleanup_vp_ind(struct xive *x) +{ + int i; + + xive_dbg(x, "Cleaning up %d VP ind entries...\n", x->vp_ind_count); + for (i = 0; i < x->vp_ind_count; i++) { + if (be64_to_cpu(x->vp_ind_base[i]) & VSD_FIRMWARE) { + xive_dbg(x, " %04x ... skip (firmware)\n", i); + continue; + } + if (x->vp_ind_base[i] != 0) { + x->vp_ind_base[i] = 0; + xive_dbg(x, " %04x ... cleaned\n", i); + } + } + xive_pc_ind_cache_kill(x); +} + +static void xive_cleanup_eq_ind(struct xive *x) +{ + int i; + + xive_dbg(x, "Cleaning up %d EQ ind entries...\n", x->eq_ind_count); + for (i = 0; i < x->eq_ind_count; i++) { + if (be64_to_cpu(x->eq_ind_base[i]) & VSD_FIRMWARE) { + xive_dbg(x, " %04x ... skip (firmware)\n", i); + continue; + } + if (x->eq_ind_base[i] != 0) { + x->eq_ind_base[i] = 0; + xive_dbg(x, " %04x ... 
cleaned\n", i); + } + } + xive_vc_ind_cache_kill(x, VC_KILL_EQD); +} + +static void xive_reset_one(struct xive *x) +{ + struct cpu_thread *c; + bool eq_firmware; + int i; + + xive_dbg(x, "Resetting one xive...\n"); + + lock(&x->lock); + + /* Check all interrupts are disabled */ + i = bitmap_find_one_bit(*x->int_enabled_map, 0, XIVE_INT_COUNT); + if (i >= 0) + xive_warn(x, "Interrupt %d (and maybe more) not disabled" + " at reset !\n", i); + + /* Reset IPI allocation */ + xive_dbg(x, "freeing alloc map %p/%p\n", + x->ipi_alloc_map, *x->ipi_alloc_map); + memset(x->ipi_alloc_map, 0, BITMAP_BYTES(XIVE_INT_COUNT)); + + xive_dbg(x, "Resetting EQs...\n"); + + /* Reset all allocated EQs and free the user ones */ + bitmap_for_each_one(*x->eq_map, XIVE_EQ_COUNT >> 3, i) { + struct xive_eq eq0; + struct xive_eq *eq; + int j; + + if (i == 0) + continue; + eq_firmware = false; + for (j = 0; j < NUM_INT_PRIORITIES; j++) { + uint32_t idx = (i << 3) | j; + + eq = xive_get_eq(x, idx); + if (!eq) + continue; + + /* We need to preserve the firmware bit, otherwise + * we will incorrectly free the EQs that are reserved + * for the physical CPUs + */ + if (xive_get_field32(EQ_W0_VALID, eq->w0)) { + if (!xive_get_field32(EQ_W0_FIRMWARE, eq->w0)) + xive_dbg(x, "EQ 0x%x:0x%x is valid at reset: %08x %08x\n", + x->block_id, idx, be32_to_cpu(eq->w0), be32_to_cpu(eq->w1)); + eq0 = *eq; + xive_cleanup_eq(&eq0); + xive_eqc_cache_update(x, x->block_id, idx, &eq0, true); + } + if (xive_get_field32(EQ_W0_FIRMWARE, eq->w0)) + eq_firmware = true; + } + if (!eq_firmware) + bitmap_clr_bit(*x->eq_map, i); + } + + /* Take out all VPs from HW and reset all CPPRs to 0 */ + for_each_present_cpu(c) { + if (c->chip_id != x->chip_id) + continue; + if (!c->xstate) + continue; + xive_cleanup_cpu_tima(c); + } + + /* Reset all user-allocated VPs. This is inefficient, we should + * either keep a bitmap of allocated VPs or add an iterator to + * the buddy which is trickier but doable. + */ + for (i = 0; i < XIVE_VP_COUNT; i++) { + struct xive_vp *vp; + struct xive_vp vp0 = {0}; + + /* Ignore the physical CPU VPs */ + if (i >= XIVE_HW_VP_BASE && + i < (XIVE_HW_VP_BASE + XIVE_HW_VP_COUNT)) + continue; + + /* Is the VP valid ? */ + vp = xive_get_vp(x, i); + if (!vp || !xive_get_field32(VP_W0_VALID, vp->w0)) + continue; + + /* Clear it */ + xive_dbg(x, "VP 0x%x:0x%x is valid at reset\n", x->block_id, i); + xive_vpc_cache_update(x, x->block_id, i, &vp0, true); + } + + /* Forget about remaining donated pages */ + list_head_init(&x->donated_pages); + + /* And cleanup donated indirect VP and EQ pages */ + xive_cleanup_vp_ind(x); + xive_cleanup_eq_ind(x); + + /* The rest must not be called with the lock held */ + unlock(&x->lock); + + /* Re-configure VPs and emulation */ + for_each_present_cpu(c) { + struct xive_cpu_state *xs = c->xstate; + + if (c->chip_id != x->chip_id || !xs) + continue; + + if (xive_mode == XIVE_MODE_EMU) + xive_init_cpu_emulation(xs, c); + else + xive_init_cpu_exploitation(xs); + } +} + +static void xive_reset_mask_source_cb(struct irq_source *is, + void *data __unused) +{ + struct xive_src *s = container_of(is, struct xive_src, is); + struct xive *x; + uint32_t isn; + + if (is->ops != &xive_irq_source_ops) + return; + + /* Skip escalation sources */ + if (GIRQ_IS_ESCALATION(is->start)) + return; + + x = s->xive; + + /* Iterate all interrupts */ + for (isn = is->start; isn < is->end; isn++) { + /* Has it ever been enabled ? 
*/ + if (!bitmap_tst_bit(*x->int_enabled_map, GIRQ_TO_IDX(isn))) + continue; + /* Mask it and clear the enabled map bit */ + xive_vdbg(x, "[reset] disabling source 0x%x\n", isn); + __xive_set_irq_config(is, isn, 0, 0xff, isn, true, false); + bitmap_clr_bit(*x->int_enabled_map, GIRQ_TO_IDX(isn)); + } +} + +void xive_cpu_reset(void) +{ + struct cpu_thread *c = this_cpu(); + struct xive_cpu_state *xs = c->xstate; + + xs->cppr = 0; + out_8(xs->tm_ring1 + TM_QW3_HV_PHYS + TM_CPPR, 0); + + in_be64(xs->tm_ring1 + TM_SPC_PULL_POOL_CTX); +} + +static int64_t __xive_reset(uint64_t version) +{ + struct proc_chip *chip; + + xive_mode = version; + + /* Mask all interrupt sources */ + irq_for_each_source(xive_reset_mask_source_cb, NULL); + + /* For each XIVE do a sync... */ + for_each_chip(chip) { + if (!chip->xive) + continue; + xive_sync(chip->xive); + } + + /* For each XIVE reset everything else... */ + for_each_chip(chip) { + if (!chip->xive) + continue; + xive_reset_one(chip->xive); + } + + /* Cleanup global VP allocator */ + buddy_reset(xive_vp_buddy); + + /* We reserve the whole range of VPs representing HW chips. + * + * These are 0x80..0xff, so order 7 starting at 0x80. This will + * reserve that range on each chip. + */ + assert(buddy_reserve(xive_vp_buddy, XIVE_HW_VP_BASE, + XIVE_THREADID_SHIFT)); + + return OPAL_SUCCESS; +} + +/* Called by fast reboot */ +int64_t xive_reset(void) +{ + if (xive_mode == XIVE_MODE_NONE) + return OPAL_SUCCESS; + return __xive_reset(XIVE_MODE_EMU); +} + +static int64_t opal_xive_reset(uint64_t version) +{ + prlog(PR_DEBUG, "XIVE reset, version: %d...\n", (int)version); + + if (version > 1) + return OPAL_PARAMETER; + + return __xive_reset(version); +} + +static int64_t opal_xive_free_vp_block(uint64_t vp_base) +{ + uint32_t blk, idx, i, j, count; + uint8_t order; + bool group; + + if (xive_mode != XIVE_MODE_EXPL) + return OPAL_WRONG_STATE; + + if (!xive_decode_vp(vp_base, &blk, &idx, &order, &group)) + return OPAL_PARAMETER; + if (group) + return OPAL_PARAMETER; + if (blk) + return OPAL_PARAMETER; + if (order < (xive_chips_alloc_bits + 1)) + return OPAL_PARAMETER; + if (idx & ((1 << (order - xive_chips_alloc_bits)) - 1)) + return OPAL_PARAMETER; + + count = 1 << order; + for (i = 0; i < count; i++) { + uint32_t vp_id = vp_base + i; + uint32_t blk, idx, eq_blk, eq_idx; + struct xive *x; + struct xive_vp *vp; + + if (!xive_decode_vp(vp_id, &blk, &idx, NULL, NULL)) { + prerror("XIVE: Couldn't decode VP id %u\n", vp_id); + return OPAL_INTERNAL_ERROR; + } + x = xive_from_pc_blk(blk); + if (!x) { + prerror("XIVE: Instance not found for deallocated VP" + " block %d\n", blk); + return OPAL_INTERNAL_ERROR; + } + vp = xive_get_vp(x, idx); + if (!vp) { + prerror("XIVE: VP not found for deallocation !"); + return OPAL_INTERNAL_ERROR; + } + + /* VP must be disabled */ + if (xive_get_field32(VP_W0_VALID, vp->w0)) { + prlog(PR_ERR, "XIVE: freeing active VP %d\n", vp_id); + return OPAL_XIVE_FREE_ACTIVE; + } + + /* Not populated */ + if (vp->w1 == 0) + continue; + eq_blk = be32_to_cpu(vp->w1) >> 28; + eq_idx = be32_to_cpu(vp->w1) & 0x0fffffff; + + lock(&x->lock); + + /* Ensure EQs are disabled and cleaned up. 
Ideally the caller + * should have done it but we double check it here + */ + for (j = 0; j < NUM_INT_PRIORITIES; j++) { + struct xive *eq_x = xive_from_vc_blk(eq_blk); + struct xive_eq eq, *orig_eq = xive_get_eq(eq_x, eq_idx + j); + + if (!xive_get_field32(EQ_W0_VALID, orig_eq->w0)) + continue; + + prlog(PR_WARNING, "XIVE: freeing VP %d with queue %d active\n", + vp_id, j); + eq = *orig_eq; + xive_cleanup_eq(&eq); + xive_eqc_cache_update(x, eq_blk, eq_idx + j, &eq, true); + } + + /* Mark it not populated so we don't try to free it again */ + vp->w1 = 0; + + if (eq_blk != blk) { + prerror("XIVE: Block mismatch trying to free EQs\n"); + unlock(&x->lock); + return OPAL_INTERNAL_ERROR; + } + + xive_free_eq_set(x, eq_idx); + unlock(&x->lock); + } + + xive_free_vps(vp_base); + + return OPAL_SUCCESS; +} + +static int64_t opal_xive_alloc_vp_block(uint32_t alloc_order) +{ + uint32_t vp_base, eqs, count, i; + int64_t rc; + + if (xive_mode != XIVE_MODE_EXPL) + return OPAL_WRONG_STATE; + + prlog(PR_TRACE, "opal_xive_alloc_vp_block(%d)\n", alloc_order); + + vp_base = xive_alloc_vps(alloc_order); + if (XIVE_ALLOC_IS_ERR(vp_base)) { + if (vp_base == XIVE_ALLOC_NO_IND) + return OPAL_XIVE_PROVISIONING; + return OPAL_RESOURCE; + } + + /* Allocate EQs and initialize VPs */ + count = 1 << alloc_order; + for (i = 0; i < count; i++) { + uint32_t vp_id = vp_base + i; + uint32_t blk, idx; + struct xive *x; + struct xive_vp *vp; + + if (!xive_decode_vp(vp_id, &blk, &idx, NULL, NULL)) { + prerror("XIVE: Couldn't decode VP id %u\n", vp_id); + return OPAL_INTERNAL_ERROR; + } + x = xive_from_pc_blk(blk); + if (!x) { + prerror("XIVE: Instance not found for allocated VP" + " block %d\n", blk); + rc = OPAL_INTERNAL_ERROR; + goto fail; + } + vp = xive_get_vp(x, idx); + if (!vp) { + prerror("XIVE: VP not found after allocation !"); + rc = OPAL_INTERNAL_ERROR; + goto fail; + } + + /* Allocate EQs, if fails, free the VPs and return */ + lock(&x->lock); + eqs = xive_alloc_eq_set(x, false); + unlock(&x->lock); + if (XIVE_ALLOC_IS_ERR(eqs)) { + if (eqs == XIVE_ALLOC_NO_IND) + rc = OPAL_XIVE_PROVISIONING; + else + rc = OPAL_RESOURCE; + goto fail; + } + + /* Initialize the VP structure. We don't use a cache watch + * as we have made sure when freeing the entries to scrub + * it out of the cache. + */ + memset(vp, 0, sizeof(*vp)); + vp->w1 = cpu_to_be32((blk << 28) | eqs); + } + return vp_base; + fail: + opal_xive_free_vp_block(vp_base); + + return rc; +} + +static int64_t xive_try_allocate_irq(struct xive *x) +{ + int idx, base_idx, max_count, girq; + struct xive_ive *ive; + + lock(&x->lock); + + base_idx = x->int_ipi_top - x->int_base; + max_count = x->int_hw_bot - x->int_ipi_top; + + idx = bitmap_find_zero_bit(*x->ipi_alloc_map, base_idx, max_count); + if (idx < 0) { + unlock(&x->lock); + return OPAL_RESOURCE; + } + bitmap_set_bit(*x->ipi_alloc_map, idx); + girq = x->int_base + idx; + + /* Mark the IVE valid. Don't bother with the HW cache, it's + * still masked anyway, the cache will be updated when unmasked + * and configured. 
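+ *
+ * The EAS data (IVE_EQ_DATA) is seeded with the girq itself,
+ * so the entry carries its own interrupt number until the OS
+ * retargets it with opal_xive_set_irq_config().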
+ */ + ive = xive_get_ive(x, girq); + if (!ive) { + bitmap_clr_bit(*x->ipi_alloc_map, idx); + unlock(&x->lock); + return OPAL_PARAMETER; + } + ive->w = xive_set_field64(IVE_VALID, 0ul, 1) | + xive_set_field64(IVE_MASKED, 0ul, 1) | + xive_set_field64(IVE_EQ_DATA, 0ul, girq); + unlock(&x->lock); + + return girq; +} + +static int64_t opal_xive_allocate_irq(uint32_t chip_id) +{ + struct proc_chip *chip; + bool try_all = false; + int64_t rc; + + if (xive_mode != XIVE_MODE_EXPL) + return OPAL_WRONG_STATE; + + if (chip_id == OPAL_XIVE_ANY_CHIP) { + try_all = true; + chip_id = this_cpu()->chip_id; + } + chip = get_chip(chip_id); + if (!chip) + return OPAL_PARAMETER; + + /* Try initial target chip */ + if (!chip->xive) + rc = OPAL_PARAMETER; + else + rc = xive_try_allocate_irq(chip->xive); + if (rc >= 0 || !try_all) + return rc; + + /* Failed and we try all... do so */ + for_each_chip(chip) { + if (!chip->xive) + continue; + rc = xive_try_allocate_irq(chip->xive); + if (rc >= 0) + break; + } + return rc; +} + +static int64_t opal_xive_free_irq(uint32_t girq) +{ + struct irq_source *is = irq_find_source(girq); + struct xive_src *s = container_of(is, struct xive_src, is); + struct xive *x = xive_from_isn(girq); + struct xive_ive *ive; + uint32_t idx; + + if (xive_mode != XIVE_MODE_EXPL) + return OPAL_WRONG_STATE; + if (!x || !is) + return OPAL_PARAMETER; + + idx = GIRQ_TO_IDX(girq); + + lock(&x->lock); + + ive = xive_get_ive(x, girq); + if (!ive) { + unlock(&x->lock); + return OPAL_PARAMETER; + } + + /* Mask the interrupt source */ + xive_update_irq_mask(s, girq - s->esb_base, true); + + /* Mark the IVE masked and invalid */ + ive->w = xive_set_field64(IVE_VALID, 0ul, 1) | + xive_set_field64(IVE_MASKED, 0ul, 1); + xive_ivc_scrub(x, x->block_id, idx); + + /* Free it */ + if (!bitmap_tst_bit(*x->ipi_alloc_map, idx)) { + unlock(&x->lock); + return OPAL_PARAMETER; + } + bitmap_clr_bit(*x->ipi_alloc_map, idx); + bitmap_clr_bit(*x->int_enabled_map, idx); + unlock(&x->lock); + + return OPAL_SUCCESS; +} + +static int64_t opal_xive_dump_tm(uint32_t offset, const char *n, uint32_t pir) +{ + struct cpu_thread *c = find_cpu_by_pir(pir); + struct xive_cpu_state *xs; + struct xive *x; + void *ind_tm_base; + uint64_t v0,v1; + + if (!c) + return OPAL_PARAMETER; + xs = c->xstate; + if (!xs || !xs->tm_ring1) + return OPAL_INTERNAL_ERROR; + x = xs->xive; + ind_tm_base = x->ic_base + (4 << x->ic_shift); + + lock(&x->lock); + + /* Setup indirect access to the corresponding thread */ + xive_regw(x, PC_TCTXT_INDIR0, + PC_TCTXT_INDIR_VALID | + SETFIELD(PC_TCTXT_INDIR_THRDID, 0ull, pir & 0xff)); + + /* Workaround for HW issue: Need to read the above register + * back before doing the subsequent accesses + */ + xive_regr(x, PC_TCTXT_INDIR0); + + v0 = in_be64(ind_tm_base + offset); + if (offset == TM_QW3_HV_PHYS) { + v1 = in_8(ind_tm_base + offset + 8); + v1 <<= 56; + } else { + v1 = in_be32(ind_tm_base + offset + 8); + v1 <<= 32; + } + prlog(PR_INFO, "CPU[%04x]: TM state for QW %s\n", pir, n); + prlog(PR_INFO, "CPU[%04x]: NSR CPPR IPB LSMFB ACK# INC AGE PIPR" + " W2 W3\n", pir); + prlog(PR_INFO, "CPU[%04x]: %02x %02x %02x %02x %02x " + "%02x %02x %02x %08x %08x\n", pir, + (uint8_t)(v0 >> 58) & 0xff, (uint8_t)(v0 >> 48) & 0xff, + (uint8_t)(v0 >> 40) & 0xff, (uint8_t)(v0 >> 32) & 0xff, + (uint8_t)(v0 >> 24) & 0xff, (uint8_t)(v0 >> 16) & 0xff, + (uint8_t)(v0 >> 8) & 0xff, (uint8_t)(v0 ) & 0xff, + (uint32_t)(v1 >> 32) & 0xffffffff, + (uint32_t)(v1 & 0xffffffff)); + + + xive_regw(x, PC_TCTXT_INDIR0, 0); + unlock(&x->lock); + + 
return OPAL_SUCCESS; +} + +static int64_t opal_xive_dump_vp(uint32_t vp_id) +{ + uint32_t blk, idx; + uint8_t order; + bool group; + struct xive *x; + struct xive_vp *vp; + uint32_t *vpw; + + if (!xive_decode_vp(vp_id, &blk, &idx, &order, &group)) + return OPAL_PARAMETER; + + x = xive_from_vc_blk(blk); + if (!x) + return OPAL_PARAMETER; + vp = xive_get_vp(x, idx); + if (!vp) + return OPAL_PARAMETER; + lock(&x->lock); + + xive_vpc_scrub_clean(x, blk, idx); + + vpw = ((uint32_t *)vp) + (group ? 8 : 0); + prlog(PR_INFO, "VP[%08x]: 0..3: %08x %08x %08x %08x\n", vp_id, + vpw[0], vpw[1], vpw[2], vpw[3]); + prlog(PR_INFO, "VP[%08x]: 4..7: %08x %08x %08x %08x\n", vp_id, + vpw[4], vpw[5], vpw[6], vpw[7]); + unlock(&x->lock); + + return OPAL_SUCCESS; +} + +static int64_t __opal_xive_dump_emu(struct xive_cpu_state *xs, uint32_t pir) +{ + struct xive_eq *eq; + uint32_t ipi_target; + uint8_t *mm, pq; + + prlog(PR_INFO, "CPU[%04x]: XIVE emulation state\n", pir); + + prlog(PR_INFO, "CPU[%04x]: cppr=%02x mfrr=%02x pend=%02x" + " prev_cppr=%02x total_irqs=%llx\n", pir, + xs->cppr, xs->mfrr, xs->pending, xs->prev_cppr, xs->total_irqs); + + prlog(PR_INFO, "CPU[%04x]: EQ IDX=%x MSK=%x G=%d [%08x %08x %08x > %08x %08x %08x %08x ...]\n", + pir, xs->eqptr, xs->eqmsk, xs->eqgen, + xs->eqbuf[(xs->eqptr - 3) & xs->eqmsk], + xs->eqbuf[(xs->eqptr - 2) & xs->eqmsk], + xs->eqbuf[(xs->eqptr - 1) & xs->eqmsk], + xs->eqbuf[(xs->eqptr + 0) & xs->eqmsk], + xs->eqbuf[(xs->eqptr + 1) & xs->eqmsk], + xs->eqbuf[(xs->eqptr + 2) & xs->eqmsk], + xs->eqbuf[(xs->eqptr + 3) & xs->eqmsk]); + + mm = xs->xive->esb_mmio + GIRQ_TO_IDX(xs->ipi_irq) * XIVE_ESB_PAGE_SIZE; + pq = in_8(mm + 0x10800); + if (xive_get_irq_targetting(xs->ipi_irq, &ipi_target, NULL, NULL)) + prlog(PR_INFO, "CPU[%04x]: IPI #%08x PQ=%x target=%08x\n", + pir, xs->ipi_irq, pq, ipi_target); + else + prlog(PR_INFO, "CPU[%04x]: IPI #%08x PQ=%x target=??\n", + pir, xs->ipi_irq, pq); + + + + __xive_cache_scrub(xs->xive, xive_cache_eqc, xs->eq_blk, + xs->eq_idx + XIVE_EMULATION_PRIO, + false, false); + eq = xive_get_eq(xs->xive, xs->eq_idx + XIVE_EMULATION_PRIO); + prlog(PR_INFO, "CPU[%04x]: EQ @%p W0=%08x W1=%08x qbuf @%p\n", + pir, eq, be32_to_cpu(eq->w0), be32_to_cpu(eq->w1), xs->eqbuf); + + return OPAL_SUCCESS; +} + +static int64_t opal_xive_dump_emu(uint32_t pir) +{ + struct cpu_thread *c = find_cpu_by_pir(pir); + struct xive_cpu_state *xs; + int64_t rc; + + if (!c) + return OPAL_PARAMETER; + + xs = c->xstate; + if (!xs) { + prlog(PR_INFO, " <none>\n"); + return OPAL_SUCCESS; + } + lock(&xs->lock); + rc = __opal_xive_dump_emu(xs, pir); + log_print(xs); + unlock(&xs->lock); + + return rc; +} + +static int64_t opal_xive_sync_irq_src(uint32_t girq) +{ + struct xive *x = xive_from_isn(girq); + + if (!x) + return OPAL_PARAMETER; + return xive_sync(x); +} + +static int64_t opal_xive_sync_irq_target(uint32_t girq) +{ + uint32_t target, vp_blk; + struct xive *x; + + if (!xive_get_irq_targetting(girq, &target, NULL, NULL)) + return OPAL_PARAMETER; + if (!xive_decode_vp(target, &vp_blk, NULL, NULL, NULL)) + return OPAL_PARAMETER; + x = xive_from_pc_blk(vp_blk); + if (!x) + return OPAL_PARAMETER; + return xive_sync(x); +} + +static int64_t opal_xive_sync(uint32_t type, uint32_t id) +{ + int64_t rc = OPAL_SUCCESS;; + + if (type & XIVE_SYNC_EAS) + rc = opal_xive_sync_irq_src(id); + if (rc) + return rc; + if (type & XIVE_SYNC_QUEUE) + rc = opal_xive_sync_irq_target(id); + if (rc) + return rc; + + /* Add more ... 
*/ + + return rc; +} + +static int64_t opal_xive_dump(uint32_t type, uint32_t id) +{ + switch (type) { + case XIVE_DUMP_TM_HYP: + return opal_xive_dump_tm(TM_QW3_HV_PHYS, "PHYS", id); + case XIVE_DUMP_TM_POOL: + return opal_xive_dump_tm(TM_QW2_HV_POOL, "POOL", id); + case XIVE_DUMP_TM_OS: + return opal_xive_dump_tm(TM_QW1_OS, "OS ", id); + case XIVE_DUMP_TM_USER: + return opal_xive_dump_tm(TM_QW0_USER, "USER", id); + case XIVE_DUMP_VP: + return opal_xive_dump_vp(id); + case XIVE_DUMP_EMU_STATE: + return opal_xive_dump_emu(id); + default: + return OPAL_PARAMETER; + } +} + +static void xive_init_globals(void) +{ + uint32_t i; + + for (i = 0; i < XIVE_MAX_CHIPS; i++) + xive_block_to_chip[i] = XIVE_INVALID_CHIP; +} + +void init_xive(void) +{ + struct dt_node *np; + struct proc_chip *chip; + struct cpu_thread *cpu; + struct xive *one_xive; + bool first = true; + + /* Look for xive nodes and do basic inits */ + dt_for_each_compatible(dt_root, np, "ibm,power9-xive-x") { + struct xive *x; + + /* Initialize some global stuff */ + if (first) + xive_init_globals(); + + /* Create/initialize the xive instance */ + x = init_one_xive(np); + if (first) + one_xive = x; + first = false; + } + if (first) + return; + + xive_mode = XIVE_MODE_EMU; + + /* Init VP allocator */ + xive_init_vp_allocator(); + + /* Create a device-tree node for Linux use */ + xive_create_mmio_dt_node(one_xive); + + /* Some inits must be done after all xive have been created + * such as setting up the forwarding ports + */ + for_each_chip(chip) { + if (chip->xive) + late_init_one_xive(chip->xive); + } + + /* Initialize XICS emulation per-cpu structures */ + for_each_present_cpu(cpu) { + xive_init_cpu(cpu); + } + /* Add interrupts propertie to each CPU node */ + for_each_present_cpu(cpu) { + if (cpu_is_thread0(cpu)) + xive_init_cpu_properties(cpu); + } + + /* Calling boot CPU */ + xive_cpu_callin(this_cpu()); + + /* Register XICS emulation calls */ + opal_register(OPAL_INT_GET_XIRR, opal_xive_get_xirr, 2); + opal_register(OPAL_INT_SET_CPPR, opal_xive_set_cppr, 1); + opal_register(OPAL_INT_EOI, opal_xive_eoi, 1); + opal_register(OPAL_INT_SET_MFRR, opal_xive_set_mfrr, 2); + + /* Register XIVE exploitation calls */ + opal_register(OPAL_XIVE_RESET, opal_xive_reset, 1); + opal_register(OPAL_XIVE_GET_IRQ_INFO, opal_xive_get_irq_info, 6); + opal_register(OPAL_XIVE_GET_IRQ_CONFIG, opal_xive_get_irq_config, 4); + opal_register(OPAL_XIVE_SET_IRQ_CONFIG, opal_xive_set_irq_config, 4); + opal_register(OPAL_XIVE_GET_QUEUE_INFO, opal_xive_get_queue_info, 7); + opal_register(OPAL_XIVE_SET_QUEUE_INFO, opal_xive_set_queue_info, 5); + opal_register(OPAL_XIVE_DONATE_PAGE, opal_xive_donate_page, 2); + opal_register(OPAL_XIVE_ALLOCATE_IRQ, opal_xive_allocate_irq, 1); + opal_register(OPAL_XIVE_FREE_IRQ, opal_xive_free_irq, 1); + opal_register(OPAL_XIVE_ALLOCATE_VP_BLOCK, opal_xive_alloc_vp_block, 1); + opal_register(OPAL_XIVE_FREE_VP_BLOCK, opal_xive_free_vp_block, 1); + opal_register(OPAL_XIVE_GET_VP_INFO, opal_xive_get_vp_info, 5); + opal_register(OPAL_XIVE_SET_VP_INFO, opal_xive_set_vp_info, 3); + opal_register(OPAL_XIVE_SYNC, opal_xive_sync, 2); + opal_register(OPAL_XIVE_DUMP, opal_xive_dump, 2); + opal_register(OPAL_XIVE_GET_QUEUE_STATE, opal_xive_get_queue_state, 4); + opal_register(OPAL_XIVE_SET_QUEUE_STATE, opal_xive_set_queue_state, 4); + opal_register(OPAL_XIVE_GET_VP_STATE, opal_xive_get_vp_state, 2); +} + diff --git a/roms/skiboot/hw/xive2.c b/roms/skiboot/hw/xive2.c new file mode 100644 index 000000000..d5814bcbf --- /dev/null +++ 
b/roms/skiboot/hw/xive2.c @@ -0,0 +1,4666 @@ +// SPDX-License-Identifier: Apache-2.0 +/* + * XIVE2: eXternal Interrupt Virtualization Engine. POWER10 interrupt + * controller + * + * Copyright (c) 2016-2019, IBM Corporation. + */ + +#define pr_fmt(fmt) "XIVE: " fmt + +#include <skiboot.h> +#include <xscom.h> +#include <chip.h> +#include <io.h> +#include <xive.h> +#include <xive2-regs.h> +#include <xscom-p10-regs.h> +#include <interrupts.h> +#include <timebase.h> +#include <bitmap.h> +#include <buddy.h> +#include <phys-map.h> +#include <p10_stop_api.H> + + +/* Verbose debug */ +#undef XIVE_VERBOSE_DEBUG +#undef DEBUG + +/* Extra debug options used in debug builds */ +#ifdef DEBUG +#define XIVE_CHECK_LOCKS +#define XIVE_DEBUG_INIT_CACHE_UPDATES +#define XIVE_EXTRA_CHECK_INIT_CACHE +#else +#undef XIVE_CHECK_LOCKS +#undef XIVE_DEBUG_INIT_CACHE_UPDATES +#undef XIVE_EXTRA_CHECK_INIT_CACHE +#endif + +/* + * VSDs, blocks, set translation etc... + * + * For the following data structures, the XIVE use a mechanism called + * Virtualization Structure Tables (VST) to manage the memory layout + * and access: ESBs (Event State Buffers), EAS (Event assignment + * structures), ENDs (Event Notification Descriptors) and NVT/NVP + * (Notification Virtual Targets/Processors). + * + * These structures divide those tables into 16 "blocks". Each XIVE + * instance has a definition for all 16 blocks that can either represent + * an actual table in memory or a remote XIVE MMIO port to access a + * block that is owned by that remote XIVE. + * + * Our SW design will consist of allocating one block per chip (and thus + * per XIVE instance) for now, thus giving us up to 16 supported chips in + * the system. We may have to revisit that if we ever support systems with + * more than 16 chips but that isn't on our radar at the moment or if we + * want to do like pHyp on some machines and dedicate 2 blocks per chip + * for some structures. + * + * Thus we need to be careful that we never expose to Linux the concept + * of block and block boundaries, but instead we provide full number ranges + * so that consecutive blocks can be supported. + * + * Similarily, for MMIO access, the BARs support what is called "set + * translation" which allows the BAR to be devided into a certain + * number of sets. Each "set" can be routed to a specific block and + * offset within a block. + */ + +#define XIVE_MAX_BLOCKS 16 +#define XIVE_VSD_SIZE 8 + +/* + * Max number of ESBs. (direct table) + * + * The max number of ESBs supported in the P10 MMIO space is 1TB/128K: 8M. + * + * 1M is our current top limit of ESB entries and EAS entries + * pre-allocated per chip. That allocates 256KB per chip for the state + * bits and 8M per chip for the EAS. + */ + +#define XIVE_INT_ORDER 20 /* 1M interrupts */ +#define XIVE_INT_COUNT (1ul << XIVE_INT_ORDER) + +/* + * First interrupt number, also the first logical interrupt number + * allocated by Linux (maximum ISA interrupt number + 1) + */ +#define XIVE_INT_FIRST 0x10 + +/* Corresponding direct table sizes */ +#define XIVE_ESB_SIZE (XIVE_INT_COUNT / 4) +#define XIVE_EAT_SIZE (XIVE_INT_COUNT * 8) + +/* Use 64K for everything by default */ +#define XIVE_ESB_SHIFT (16 + 1) /* trigger + mgmt pages */ +#define XIVE_ESB_PAGE_SIZE (1ul << XIVE_ESB_SHIFT) /* 2 pages */ + +/* + * Max number of ENDs. (indirect table) + * + * The max number of ENDs supported in the P10 MMIO space is 2TB/128K: 16M. + * Since one END is 32 bytes, a 64K indirect subpage can hold 2K ENDs. 
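+ * (Each subpage needs one XIVE_VSD_SIZE = 8 byte pointer in the
+ * indirect table: 16M / 2K = 8192 pointers, and 8192 * 8 = 64K.)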
+ * We need 8192 subpages, ie, 64K of memory for the indirect table. + */ +#define END_PER_PAGE (PAGE_SIZE / sizeof(struct xive_end)) + +#define XIVE_END_ORDER 23 /* 8M ENDs */ +#define XIVE_END_COUNT (1ul << XIVE_END_ORDER) +#define XIVE_END_TABLE_SIZE ((XIVE_END_COUNT / END_PER_PAGE) * XIVE_VSD_SIZE) + +#define XIVE_END_SHIFT (16 + 1) /* ESn + ESe pages */ + +/* One bit per number of priorities configured */ +#define xive_end_bitmap_size(x) (XIVE_END_COUNT >> xive_cfg_vp_prio_shift(x)) + +/* Number of priorities (and thus ENDs) we allocate for each VP */ +#define xive_cfg_vp_prio_shift(x) GETFIELD(CQ_XIVE_CFG_VP_INT_PRIO, (x)->config) +#define xive_cfg_vp_prio(x) (1 << xive_cfg_vp_prio_shift(x)) + +/* Max priority number */ +#define xive_max_prio(x) (xive_cfg_vp_prio(x) - 1) + +/* Priority used for gather/silent escalation (KVM) */ +#define xive_escalation_prio(x) xive_max_prio(x) + +/* + * Max number of VPs. (indirect table) + * + * The max number of NVPs we support in our MMIO space is 1TB/128K: 8M. + * Since one NVP is 32 bytes, a 64K indirect subpage can hold 2K NVPs. + * We need 4096 pointers, ie, 32K of memory for the indirect table. + * + * However, we use 8 priorities (by default) per NVP and the number of + * ENDs is configured to 8M. Therefore, our VP space is limited to 1M. + */ +#define VP_PER_PAGE (PAGE_SIZE / sizeof(struct xive_nvp)) + +#define XIVE_VP_ORDER(x) (XIVE_END_ORDER - xive_cfg_vp_prio_shift(x)) +#define XIVE_VP_COUNT(x) (1ul << XIVE_VP_ORDER(x)) +#define XIVE_VP_TABLE_SIZE(x) ((XIVE_VP_COUNT(x) / VP_PER_PAGE) * XIVE_VSD_SIZE) + +#define XIVE_NVP_SHIFT 17 /* NVPG BAR: two pages, even NVP, odd NVG */ + +/* VP Space maximums in Gen1 and Gen2 modes */ +#define VP_SHIFT_GEN1 19 /* in sync with END_W6_VP_OFFSET_GEN1 */ +#define VP_SHIFT_GEN2 24 /* in sync with END_W6_VP_OFFSET */ + +/* + * VP ids for HW threads. + * + * Depends on the thread id bits configuration of the IC. 8bit is the + * default for P10 and 7bit for p9. + * + * These values are global because they should be common to all chips + */ +static uint32_t xive_threadid_shift; +static uint32_t xive_hw_vp_base; +static uint32_t xive_hw_vp_count; + +/* + * The XIVE operation mode indicates the active "API" and corresponds + * to the "version/mode" parameter of the opal_xive_reset() call + */ +static enum { + /* No XICS emulation */ + XIVE_MODE_EXPL = OPAL_XIVE_MODE_EXPL, /* default */ + XIVE_MODE_NONE, +} xive_mode = XIVE_MODE_NONE; + +/* + * The XIVE exploitation mode options indicates the active features and + * is part of the mode parameter of the opal_xive_reset() call + */ +static uint64_t xive_expl_options; + +#define XIVE_EXPL_ALL_OPTIONS 0 + +/* + * Each source controller has one of these. 
There's one embedded in + * the XIVE struct for IPIs + */ +struct xive_src { + struct irq_source is; + const struct irq_source_ops *orig_ops; + struct xive *xive; + void *esb_mmio; + uint32_t esb_base; + uint32_t esb_shift; + uint32_t flags; +}; + +struct xive_cpu_state { + struct xive *xive; + void *tm_ring1; + + /* Base HW VP and associated queues */ + uint32_t vp_blk; + uint32_t vp_idx; + uint32_t end_blk; + uint32_t end_idx; /* Base end index of a block of 8 */ + + struct lock lock; +}; + +enum xive_generation { + XIVE_GEN1 = 1, /* P9 compat mode */ + XIVE_GEN2 = 2, /* P10 default */ +}; + +enum xive_quirks { + /* HW527671 - 8bits Hardwired Thread Id range not implemented */ + XIVE_QUIRK_THREADID_7BITS = 0x00000001, + /* HW542974 - interrupt command priority checker not working properly */ + XIVE_QUIRK_BROKEN_PRIO_CHECK = 0x00000002, +}; + +struct xive { + uint32_t chip_id; + uint32_t block_id; + struct dt_node *x_node; + + enum xive_generation generation; + uint64_t capabilities; + uint64_t config; + + uint64_t xscom_base; + + /* MMIO regions */ + void *ic_base; + uint64_t ic_size; + uint32_t ic_shift; + void *ic_tm_direct_base; + + void *tm_base; + uint64_t tm_size; + uint32_t tm_shift; + void *nvp_base; + uint64_t nvp_size; + void *esb_base; + uint64_t esb_size; + void *end_base; + uint64_t end_size; + + /* Set on XSCOM register access error */ + bool last_reg_error; + + /* Per-XIVE mutex */ + struct lock lock; + + /* Pre-allocated tables. + * + * We setup all the VDS for actual tables (ie, by opposition to + * forwarding ports) as either direct pre-allocated or indirect + * and partially populated. + * + * Currently, the ESB and the EAS tables are direct and fully + * pre-allocated based on XIVE_INT_COUNT. + * + * The other tables are indirect, we thus pre-allocate the indirect + * table (ie, pages of pointers) and populate enough of the pages + * for our basic setup using 64K subpages. + * + * The size of the indirect tables are driven by XIVE_VP_COUNT + * and XIVE_END_COUNT. The number of pre-allocated ones are + * driven by xive_hw_vp_count for the HW threads. The number + * of END depends on number of VP. + */ + + /* Direct SBE and EAT tables */ + void *sbe_base; + void *eat_base; + + /* Indirect END table. NULL entries are unallocated, count is + * the numbre of pointers (ie, sub page placeholders). + */ + beint64_t *end_ind_base; + uint32_t end_ind_count; + uint64_t end_ind_size; + + /* END allocation bitmap. Each bit represent #priority ENDs */ + bitmap_t *end_map; + + /* Indirect NVT/VP table. NULL entries are unallocated, count is + * the numbre of pointers (ie, sub page placeholders). + */ + beint64_t *vp_ind_base; + uint32_t vp_ind_count; + uint64_t vp_ind_size; + + /* VP space size. Depends on Gen1/2 mode */ + uint32_t vp_shift; + + /* Pool of donated pages for provisioning indirect END and VP pages */ + struct list_head donated_pages; + + /* To ease a possible change to supporting more than one block of + * interrupts per chip, we store here the "base" global number + * and max number of interrupts for this chip. The global number + * encompass the block number and index. + */ + uint32_t int_base; + uint32_t int_count; + + /* Due to the overlap between IPIs and HW sources in the EAS table, + * we keep some kind of top-down allocator. It is used for HW sources + * to "allocate" interrupt entries and will limit what can be handed + * out as IPIs. Of course this assumes we "allocate" all HW sources + * before we start handing out IPIs. 
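+ * In practice HW sources are carved from the top of the range
+ * (tracked by int_hw_bot) while IPIs grow up from the bottom
+ * (tracked by int_ipi_top); whatever is left between the two
+ * is the space still available for IPIs.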
+ * + * Note: The numbers here are global interrupt numbers so that we can + * potentially handle more than one block per chip in the future. + */ + uint32_t int_hw_bot; /* Bottom of HW allocation */ + uint32_t int_ipi_top; /* Highest IPI handed out so far + 1 */ + + /* The IPI allocation bitmap */ + bitmap_t *ipi_alloc_map; + + /* We keep track of which interrupts were ever enabled to + * speed up xive_reset + */ + bitmap_t *int_enabled_map; + + /* Embedded source IPIs */ + struct xive_src ipis; + + /* Embedded escalation interrupts */ + struct xive_src esc_irqs; + + /* In memory queue overflow */ + void *q_ovf; + + /* Cache/sync injection */ + uint64_t sync_inject_size; + void *sync_inject; + + /* INT HW Errata */ + uint64_t quirks; +}; + +/* First XIVE unit configured on the system */ +static struct xive *one_xive; + +/* Global DT node */ +static struct dt_node *xive_dt_node; + +/* Block <-> Chip conversions. + * + * As chipIDs may not be within the range of 16 block IDs supported by XIVE, + * we have a 2 way conversion scheme. + * + * From block to chip, use the global table below. + * + * From chip to block, a field in struct proc_chip contains the first block + * of that chip. For now we only support one block per chip but that might + * change in the future + */ +#define XIVE_INVALID_CHIP 0xffffffff +#define XIVE_MAX_CHIPS 16 +static uint32_t xive_block_to_chip[XIVE_MAX_CHIPS]; +static uint32_t xive_block_count; + +static uint32_t xive_chip_to_block(uint32_t chip_id) +{ + struct proc_chip *c = get_chip(chip_id); + + assert(c); + assert(c->xive); + return c->xive->block_id; +} + +/* + * Conversion between GIRQ and block/index. + * + * ------------------------------------ + * |000E|BLOC| INDEX| + * ------------------------------------ + * 4 4 24 + * + * the E bit indicates that this is an escalation interrupt, in + * that case, the BLOC/INDEX represents the END containing the + * corresponding escalation descriptor. + * + * Global interrupt numbers for non-escalation interrupts are thus + * limited to 28 bits. + */ + +#define INT_SHIFT 24 +#define INT_ESC_SHIFT (INT_SHIFT + 4) /* 4bits block id */ + +#if XIVE_INT_ORDER > INT_SHIFT +#error "Too many ESBs for IRQ encoding" +#endif + +#if XIVE_END_ORDER > INT_SHIFT +#error "Too many ENDs for escalation IRQ number encoding" +#endif + +#define GIRQ_TO_BLK(__g) (((__g) >> INT_SHIFT) & 0xf) +#define GIRQ_TO_IDX(__g) ((__g) & ((1 << INT_SHIFT) - 1)) +#define BLKIDX_TO_GIRQ(__b,__i) (((uint32_t)(__b)) << INT_SHIFT | (__i)) + +#define GIRQ_IS_ESCALATION(__g) ((__g) & (1 << INT_ESC_SHIFT)) +#define MAKE_ESCALATION_GIRQ(__b,__i)(BLKIDX_TO_GIRQ(__b,__i) | (1 << INT_ESC_SHIFT)) + + +/* Block/IRQ to chip# conversions */ +#define PC_BLK_TO_CHIP(__b) (xive_block_to_chip[__b]) +#define VC_BLK_TO_CHIP(__b) (xive_block_to_chip[__b]) +#define GIRQ_TO_CHIP(__isn) (VC_BLK_TO_CHIP(GIRQ_TO_BLK(__isn))) + +/* Routing of physical processors to VPs */ +#define PIR2VP_IDX( __pir) (xive_hw_vp_base | P10_PIR2LOCALCPU(__pir)) +#define PIR2VP_BLK(__pir) (xive_chip_to_block(P10_PIR2GCID(__pir))) +#define VP2PIR(__blk, __idx) (P10_PIRFROMLOCALCPU(VC_BLK_TO_CHIP(__blk), (__idx) & 0xff)) + +/* Decoding of OPAL API VP IDs. The VP IDs are encoded as follow + * + * Block group mode: + * + * ----------------------------------- + * |GVEOOOOO| INDEX| + * ----------------------------------- + * || | + * || Order + * |Virtual + * Group + * + * G (Group) : Set to 1 for a group VP (not currently supported) + * V (Virtual) : Set to 1 for an allocated VP (vs. 
a physical processor ID) + * E (Error) : Should never be 1, used internally for errors + * O (Order) : Allocation order of the VP block + * + * The conversion is thus done as follow (groups aren't implemented yet) + * + * If V=0, O must be 0 and 24-bit INDEX value is the PIR + * If V=1, the order O group is allocated such that if N is the number of + * chip bits considered for allocation (*) + * then the INDEX is constructed as follow (bit numbers such as 0=LSB) + * - bottom O-N bits is the index within the "VP block" + * - next N bits is the XIVE blockID of the VP + * - the remaining bits is the per-chip "base" + * so the conversion consists of "extracting" the block ID and moving + * down the upper bits by N bits. + * + * In non-block-group mode, the difference is that the blockID is + * on the left of the index (the entire VP block is in a single + * block ID) + */ + +#define VP_GROUP_SHIFT 31 +#define VP_VIRTUAL_SHIFT 30 +#define VP_ERROR_SHIFT 29 +#define VP_ORDER_SHIFT 24 + +#define vp_group(vp) (((vp) >> VP_GROUP_SHIFT) & 1) +#define vp_virtual(vp) (((vp) >> VP_VIRTUAL_SHIFT) & 1) +#define vp_order(vp) (((vp) >> VP_ORDER_SHIFT) & 0x1f) +#define vp_index(vp) ((vp) & ((1 << VP_ORDER_SHIFT) - 1)) + +/* VP allocation */ +static uint32_t xive_chips_alloc_bits = 0; +static struct buddy *xive_vp_buddy; +static struct lock xive_buddy_lock = LOCK_UNLOCKED; + +/* VP# decoding/encoding */ +static bool xive_decode_vp(uint32_t vp, uint32_t *blk, uint32_t *idx, + uint8_t *order, bool *group) +{ + uint32_t o = vp_order(vp); + uint32_t n = xive_chips_alloc_bits; + uint32_t index = vp_index(vp); + uint32_t imask = (1 << (o - n)) - 1; + + /* Groups not supported yet */ + if (vp_group(vp)) + return false; + if (group) + *group = false; + + /* PIR case */ + if (!vp_virtual(vp)) { + if (find_cpu_by_pir(index) == NULL) + return false; + if (blk) + *blk = PIR2VP_BLK(index); + if (idx) + *idx = PIR2VP_IDX(index); + return true; + } + + /* Ensure o > n, we have *at least* 2 VPs per block */ + if (o <= n) + return false; + + /* Combine the index base and index */ + if (idx) + *idx = ((index >> n) & ~imask) | (index & imask); + /* Extract block ID */ + if (blk) + *blk = (index >> (o - n)) & ((1 << n) - 1); + + /* Return order as well if asked for */ + if (order) + *order = o; + + return true; +} + +static uint32_t xive_encode_vp(uint32_t blk, uint32_t idx, uint32_t order) +{ + uint32_t vp = (1 << VP_VIRTUAL_SHIFT) | (order << VP_ORDER_SHIFT); + uint32_t n = xive_chips_alloc_bits; + uint32_t imask = (1 << (order - n)) - 1; + + vp |= (idx & ~imask) << n; + vp |= blk << (order - n); + vp |= idx & imask; + return vp; +} + +/* + * XSCOM/MMIO helpers + */ +#define XIVE_NO_MMIO -1 + +#define xive_regw(__x, __r, __v) \ + __xive_regw(__x, __r, X_##__r, __v, #__r) +#define xive_regr(__x, __r) \ + __xive_regr(__x, __r, X_##__r, #__r) +#define xive_regwx(__x, __r, __v) \ + __xive_regw(__x, XIVE_NO_MMIO, X_##__r, __v, #__r) +#define xive_regrx(__x, __r) \ + __xive_regr(__x, XIVE_NO_MMIO, X_##__r, #__r) + +#ifdef XIVE_VERBOSE_DEBUG +#define xive_vdbg(__x,__fmt,...) prlog(PR_DEBUG,"[ IC %02x ] " __fmt, (__x)->chip_id, ##__VA_ARGS__) +#define xive_cpu_vdbg(__c,__fmt,...) prlog(PR_DEBUG,"[CPU %04x] " __fmt, (__c)->pir, ##__VA_ARGS__) +#else +#define xive_vdbg(x,fmt,...) do { } while(0) +#define xive_cpu_vdbg(x,fmt,...) do { } while(0) +#endif + +#define xive_dbg(__x,__fmt,...) prlog(PR_DEBUG,"[ IC %02x ] " __fmt, (__x)->chip_id, ##__VA_ARGS__) +#define xive_cpu_dbg(__c,__fmt,...) 
prlog(PR_DEBUG,"[CPU %04x] " __fmt, (__c)->pir, ##__VA_ARGS__) +#define xive_notice(__x,__fmt,...) prlog(PR_NOTICE,"[ IC %02x ] " __fmt, (__x)->chip_id, ##__VA_ARGS__) +#define xive_cpu_notice(__c,__fmt,...) prlog(PR_NOTICE,"[CPU %04x] " __fmt, (__c)->pir, ##__VA_ARGS__) +#define xive_warn(__x,__fmt,...) prlog(PR_WARNING,"[ IC %02x ] " __fmt, (__x)->chip_id, ##__VA_ARGS__) +#define xive_cpu_warn(__c,__fmt,...) prlog(PR_WARNING,"[CPU %04x] " __fmt, (__c)->pir, ##__VA_ARGS__) +#define xive_err(__x,__fmt,...) prlog(PR_ERR,"[ IC %02x ] " __fmt, (__x)->chip_id, ##__VA_ARGS__) +#define xive_cpu_err(__c,__fmt,...) prlog(PR_ERR,"[CPU %04x] " __fmt, (__c)->pir, ##__VA_ARGS__) + +/* + * The XIVE subengine being accessed can be deduced from the XSCOM + * reg, and from there, the page offset in the IC BAR. + */ +static void* xive_ic_page(struct xive *x, uint32_t x_reg) +{ + uint64_t pgoff = (x_reg >> 8) & 0x3; + + return x->ic_base + (pgoff << x->ic_shift); +} + +static void __xive_regw(struct xive *x, uint32_t m_reg, uint32_t x_reg, uint64_t v, + const char *rname) +{ + bool use_xscom = (m_reg == XIVE_NO_MMIO) || !x->ic_base; + int64_t rc; + + x->last_reg_error = false; + + assert(x_reg != 0); + + if (use_xscom) { + rc = xscom_write(x->chip_id, x->xscom_base + x_reg, v); + if (rc) { + if (!rname) + rname = "???"; + xive_err(x, "Error writing register %s\n", rname); + /* Anything else we can do here ? */ + x->last_reg_error = true; + } + } else { + out_be64(xive_ic_page(x, x_reg) + m_reg, v); + } +} + +static uint64_t __xive_regr(struct xive *x, uint32_t m_reg, uint32_t x_reg, + const char *rname) +{ + bool use_xscom = (m_reg == XIVE_NO_MMIO) || !x->ic_base; + int64_t rc; + uint64_t val; + + x->last_reg_error = false; + + assert(x_reg != 0); + + if (use_xscom) { + rc = xscom_read(x->chip_id, x->xscom_base + x_reg, &val); + if (rc) { + if (!rname) + rname = "???"; + xive_err(x, "Error reading register %s\n", rname); + /* Anything else we can do here ? 
*/ + x->last_reg_error = true; + return -1ull; + } + } else { + val = in_be64(xive_ic_page(x, x_reg) + m_reg); + } + return val; +} + +/* Locate a controller from an IRQ number */ +static struct xive *xive_from_isn(uint32_t isn) +{ + uint32_t chip_id = GIRQ_TO_CHIP(isn); + struct proc_chip *c = get_chip(chip_id); + + if (!c) + return NULL; + return c->xive; +} + +static struct xive *xive_from_pc_blk(uint32_t blk) +{ + uint32_t chip_id = PC_BLK_TO_CHIP(blk); + struct proc_chip *c = get_chip(chip_id); + + if (!c) + return NULL; + return c->xive; +} + +static struct xive *xive_from_vc_blk(uint32_t blk) +{ + uint32_t chip_id = VC_BLK_TO_CHIP(blk); + struct proc_chip *c = get_chip(chip_id); + + if (!c) + return NULL; + return c->xive; +} + +static struct xive_end *xive_get_end(struct xive *x, unsigned int idx) +{ + struct xive_end *p; + + if (idx >= (x->end_ind_count * END_PER_PAGE)) + return NULL; + p = (struct xive_end *)(be64_to_cpu(x->end_ind_base[idx / END_PER_PAGE]) & + VSD_ADDRESS_MASK); + if (!p) + return NULL; + + return &p[idx % END_PER_PAGE]; +} + +static struct xive_eas *xive_get_eas(struct xive *x, unsigned int isn) +{ + struct xive_eas *eat; + uint32_t idx = GIRQ_TO_IDX(isn); + + if (GIRQ_IS_ESCALATION(isn)) { + /* Allright, an escalation EAS is buried inside an END, let's + * try to find it + */ + struct xive_end *end; + + if (x->chip_id != VC_BLK_TO_CHIP(GIRQ_TO_BLK(isn))) { + xive_err(x, "%s, ESC ISN 0x%x not on right chip\n", + __func__, isn); + return NULL; + } + end = xive_get_end(x, idx); + if (!end) { + xive_err(x, "%s, ESC ISN 0x%x END not found\n", + __func__, isn); + return NULL; + } + + /* If using single-escalation, don't let anybody get + * to the individual escalation interrupts + */ + if (xive_get_field32(END_W0_UNCOND_ESCALATE, end->w0)) + return NULL; + + /* Grab the escalation END */ + return (struct xive_eas *)(char *)&end->w4; + } else { + /* Check the block matches */ + if (isn < x->int_base || isn >= x->int_count) { + xive_err(x, "%s, ISN 0x%x not on right chip\n", + __func__, isn); + return NULL; + } + assert (idx < XIVE_INT_COUNT); + + /* If we support >1 block per chip, this should still + * work as we are likely to make the table contiguous + * anyway + */ + eat = x->eat_base; + assert(eat); + + return eat + idx; + } +} + +static struct xive_nvp *xive_get_vp(struct xive *x, unsigned int idx) +{ + struct xive_nvp *p; + + assert(idx < (x->vp_ind_count * VP_PER_PAGE)); + p = (struct xive_nvp *)(be64_to_cpu(x->vp_ind_base[idx / VP_PER_PAGE]) & + VSD_ADDRESS_MASK); + if (!p) + return NULL; + + return &p[idx % VP_PER_PAGE]; +} + +/* + * Store the END base of the VP in W5, using the new architected field + * in P10. Used to be the pressure relief interrupt field on P9. + */ +static void xive_vp_set_end_base(struct xive_nvp *vp, + uint32_t end_blk, uint32_t end_idx) +{ + vp->w5 = xive_set_field32(NVP_W5_VP_END_BLOCK, 0, end_blk) | + xive_set_field32(NVP_W5_VP_END_INDEX, 0, end_idx); + + /* This is the criteria to know if a VP was allocated */ + assert(vp->w5 != 0); +} + +static void xive_init_default_vp(struct xive_nvp *vp, + uint32_t end_blk, uint32_t end_idx) +{ + memset(vp, 0, sizeof(struct xive_nvp)); + + xive_vp_set_end_base(vp, end_blk, end_idx); + + vp->w0 = xive_set_field32(NVP_W0_VALID, 0, 1); +} + +/* + * VPs of the HW threads have their own set of ENDs which is allocated + * when XIVE is initialized. These are tagged with a FIRMWARE bit so + * that they can be identified when the driver is reset (kexec). 
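+ *
+ * xive_init_hw_end() below does the tagging by setting
+ * END_W0_FIRMWARE1 in word 0 of each such END.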
+ */ +static void xive_init_hw_end(struct xive_end *end) +{ + memset(end, 0, sizeof(struct xive_end)); + end->w0 = xive_set_field32(END_W0_FIRMWARE1, 0, 1); +} + +static void *xive_get_donated_page(struct xive *x) +{ + return (void *)list_pop_(&x->donated_pages, 0); +} + +#define XIVE_ALLOC_IS_ERR(_idx) ((_idx) >= 0xfffffff0) + +#define XIVE_ALLOC_NO_SPACE 0xffffffff /* No possible space */ +#define XIVE_ALLOC_NO_IND 0xfffffffe /* Indirect need provisioning */ +#define XIVE_ALLOC_NO_MEM 0xfffffffd /* Local allocation failed */ + +static uint32_t xive_alloc_end_set(struct xive *x, bool alloc_indirect) +{ + uint32_t ind_idx; + int idx; + int end_base_idx; + + xive_vdbg(x, "Allocating END set...\n"); + + assert(x->end_map); + + /* Allocate from the END bitmap. Each bit is 8 ENDs */ + idx = bitmap_find_zero_bit(*x->end_map, 0, xive_end_bitmap_size(x)); + if (idx < 0) { + xive_dbg(x, "Allocation from END bitmap failed !\n"); + return XIVE_ALLOC_NO_SPACE; + } + + end_base_idx = idx << xive_cfg_vp_prio_shift(x); + + xive_vdbg(x, "Got ENDs 0x%x..0x%x\n", end_base_idx, + end_base_idx + xive_max_prio(x)); + + /* Calculate the indirect page where the ENDs reside */ + ind_idx = end_base_idx / END_PER_PAGE; + + /* Is there an indirect page ? If not, check if we can provision it */ + if (!x->end_ind_base[ind_idx]) { + /* Default flags */ + uint64_t vsd_flags = SETFIELD(VSD_TSIZE, 0ull, 4) | + SETFIELD(VSD_MODE, 0ull, VSD_MODE_EXCLUSIVE); + void *page; + + /* If alloc_indirect is set, allocate the memory from OPAL own, + * otherwise try to provision from the donated pool + */ + if (alloc_indirect) { + /* Allocate/provision indirect page during boot only */ + xive_vdbg(x, "Indirect empty, provisioning from local pool\n"); + page = local_alloc(x->chip_id, PAGE_SIZE, PAGE_SIZE); + if (!page) { + xive_dbg(x, "provisioning failed !\n"); + return XIVE_ALLOC_NO_MEM; + } + vsd_flags |= VSD_FIRMWARE; + } else { + xive_vdbg(x, "Indirect empty, provisioning from donated pages\n"); + page = xive_get_donated_page(x); + if (!page) { + xive_vdbg(x, "no idirect pages available !\n"); + return XIVE_ALLOC_NO_IND; + } + } + memset(page, 0, PAGE_SIZE); + x->end_ind_base[ind_idx] = cpu_to_be64(vsd_flags | + (((uint64_t)page) & VSD_ADDRESS_MASK)); + /* Any cache scrub needed ? */ + } + + bitmap_set_bit(*x->end_map, idx); + return end_base_idx; +} + +static void xive_free_end_set(struct xive *x, uint32_t ends) +{ + uint32_t idx; + uint8_t prio_mask = xive_max_prio(x); + + xive_vdbg(x, "Freeing END 0x%x..0x%x\n", ends, ends + xive_max_prio(x)); + + assert((ends & prio_mask) == 0); + assert(x->end_map); + + idx = ends >> xive_cfg_vp_prio_shift(x); + bitmap_clr_bit(*x->end_map, idx); +} + +static bool xive_provision_vp_ind(struct xive *x, uint32_t vp_idx, uint32_t order) +{ + uint32_t pbase, pend, i; + + pbase = vp_idx / VP_PER_PAGE; + pend = (vp_idx + (1 << order)) / VP_PER_PAGE; + + for (i = pbase; i <= pend; i++) { + void *page; + u64 vsd; + + /* Already provisioned ? 
*/ + if (x->vp_ind_base[i]) + continue; + + /* Try to grab a donated page */ + page = xive_get_donated_page(x); + if (!page) + return false; + + /* Install the page */ + memset(page, 0, PAGE_SIZE); + vsd = ((uint64_t)page) & VSD_ADDRESS_MASK; + vsd |= SETFIELD(VSD_TSIZE, 0ull, 4); + vsd |= SETFIELD(VSD_MODE, 0ull, VSD_MODE_EXCLUSIVE); + x->vp_ind_base[i] = cpu_to_be64(vsd); + } + return true; +} + +static void xive_init_vp_allocator(void) +{ + /* Initialize chip alloc bits */ + xive_chips_alloc_bits = ilog2(xive_block_count); + + prlog(PR_INFO, "%d chips considered for VP allocations\n", + 1 << xive_chips_alloc_bits); + + /* Allocate a buddy big enough for XIVE_VP_ORDER allocations. + * + * each bit in the buddy represents 1 << xive_chips_alloc_bits + * VPs. + */ + xive_vp_buddy = buddy_create(XIVE_VP_ORDER(one_xive)); + assert(xive_vp_buddy); + + /* + * We reserve the whole range of VP ids representing HW threads. + */ + assert(buddy_reserve(xive_vp_buddy, xive_hw_vp_base, + xive_threadid_shift)); +} + +static uint32_t xive_alloc_vps(uint32_t order) +{ + uint32_t local_order, i; + int vp; + + /* The minimum order is 2 VPs per chip */ + if (order < (xive_chips_alloc_bits + 1)) + order = xive_chips_alloc_bits + 1; + + /* We split the allocation */ + local_order = order - xive_chips_alloc_bits; + + /* We grab that in the global buddy */ + assert(xive_vp_buddy); + lock(&xive_buddy_lock); + vp = buddy_alloc(xive_vp_buddy, local_order); + unlock(&xive_buddy_lock); + if (vp < 0) + return XIVE_ALLOC_NO_SPACE; + + /* Provision on every chip considered for allocation */ + for (i = 0; i < (1 << xive_chips_alloc_bits); i++) { + struct xive *x = xive_from_pc_blk(i); + bool success; + + /* Return internal error & log rather than assert ? */ + assert(x); + lock(&x->lock); + success = xive_provision_vp_ind(x, vp, local_order); + unlock(&x->lock); + if (!success) { + lock(&xive_buddy_lock); + buddy_free(xive_vp_buddy, vp, local_order); + unlock(&xive_buddy_lock); + return XIVE_ALLOC_NO_IND; + } + } + + /* Encode the VP number. 
"blk" is 0 as this represents + * all blocks and the allocation always starts at 0 + */ + return xive_encode_vp(0, vp, order); +} + +static void xive_free_vps(uint32_t vp) +{ + uint32_t idx; + uint8_t order, local_order; + + assert(xive_decode_vp(vp, NULL, &idx, &order, NULL)); + + /* We split the allocation */ + local_order = order - xive_chips_alloc_bits; + + /* Free that in the buddy */ + lock(&xive_buddy_lock); + buddy_free(xive_vp_buddy, idx, local_order); + unlock(&xive_buddy_lock); +} + +enum xive_cache_type { + xive_cache_easc, + xive_cache_esbc, + xive_cache_endc, + xive_cache_nxc, +}; + +/* + * Cache update + */ + +#define FLUSH_CTRL_POLL_VALID PPC_BIT(0) /* POLL bit is the same for all */ + +static int64_t __xive_cache_scrub(struct xive *x, + enum xive_cache_type ctype, + uint64_t block, uint64_t idx, + bool want_inval __unused, bool want_disable __unused) +{ + uint64_t ctrl_reg, x_ctrl_reg; + uint64_t poll_val, ctrl_val; + +#ifdef XIVE_CHECK_LOCKS + assert(lock_held_by_me(&x->lock)); +#endif + switch (ctype) { + case xive_cache_easc: + poll_val = + SETFIELD(VC_EASC_FLUSH_POLL_BLOCK_ID, 0ll, block) | + SETFIELD(VC_EASC_FLUSH_POLL_OFFSET, 0ll, idx) | + VC_EASC_FLUSH_POLL_BLOCK_ID_MASK | + VC_EASC_FLUSH_POLL_OFFSET_MASK; + xive_regw(x, VC_EASC_FLUSH_POLL, poll_val); + ctrl_reg = VC_EASC_FLUSH_CTRL; + x_ctrl_reg = X_VC_EASC_FLUSH_CTRL; + break; + case xive_cache_esbc: + poll_val = + SETFIELD(VC_ESBC_FLUSH_POLL_BLOCK_ID, 0ll, block) | + SETFIELD(VC_ESBC_FLUSH_POLL_OFFSET, 0ll, idx) | + VC_ESBC_FLUSH_POLL_BLOCK_ID_MASK | + VC_ESBC_FLUSH_POLL_OFFSET_MASK; + xive_regw(x, VC_ESBC_FLUSH_POLL, poll_val); + ctrl_reg = VC_ESBC_FLUSH_CTRL; + x_ctrl_reg = X_VC_ESBC_FLUSH_CTRL; + break; + case xive_cache_endc: + poll_val = + SETFIELD(VC_ENDC_FLUSH_POLL_BLOCK_ID, 0ll, block) | + SETFIELD(VC_ENDC_FLUSH_POLL_OFFSET, 0ll, idx) | + VC_ENDC_FLUSH_POLL_BLOCK_ID_MASK | + VC_ENDC_FLUSH_POLL_OFFSET_MASK; + xive_regw(x, VC_ENDC_FLUSH_POLL, poll_val); + ctrl_reg = VC_ENDC_FLUSH_CTRL; + x_ctrl_reg = X_VC_ENDC_FLUSH_CTRL; + break; + case xive_cache_nxc: + poll_val = + SETFIELD(PC_NXC_FLUSH_POLL_BLOCK_ID, 0ll, block) | + SETFIELD(PC_NXC_FLUSH_POLL_OFFSET, 0ll, idx) | + PC_NXC_FLUSH_POLL_BLOCK_ID_MASK | + PC_NXC_FLUSH_POLL_OFFSET_MASK; + xive_regw(x, PC_NXC_FLUSH_POLL, poll_val); + ctrl_reg = PC_NXC_FLUSH_CTRL; + x_ctrl_reg = X_PC_NXC_FLUSH_CTRL; + break; + default: + return OPAL_INTERNAL_ERROR; + } + + /* XXX Add timeout !!! 
*/ + for (;;) { + ctrl_val = __xive_regr(x, ctrl_reg, x_ctrl_reg, NULL); + if (!(ctrl_val & FLUSH_CTRL_POLL_VALID)) + break; + /* Small delay */ + time_wait(100); + } + sync(); + return 0; +} + +static int64_t xive_easc_scrub(struct xive *x, uint64_t block, uint64_t idx) +{ + return __xive_cache_scrub(x, xive_cache_easc, block, idx, false, false); +} + +static int64_t xive_nxc_scrub(struct xive *x, uint64_t block, uint64_t idx) +{ + return __xive_cache_scrub(x, xive_cache_nxc, block, idx, false, false); +} + +static int64_t xive_nxc_scrub_clean(struct xive *x, uint64_t block, uint64_t idx) +{ + return __xive_cache_scrub(x, xive_cache_nxc, block, idx, true, false); +} + +static int64_t xive_endc_scrub(struct xive *x, uint64_t block, uint64_t idx) +{ + return __xive_cache_scrub(x, xive_cache_endc, block, idx, false, false); +} + +#define XIVE_CACHE_WATCH_MAX_RETRIES 10 + +static int64_t __xive_cache_watch(struct xive *x, enum xive_cache_type ctype, + uint64_t block, uint64_t idx, + uint32_t start_dword, uint32_t dword_count, + beint64_t *new_data, bool light_watch, + bool synchronous) +{ + uint64_t sreg, sregx, dreg0, dreg0x; + uint64_t dval0, sval, status; + int64_t i; + int retries = 0; + +#ifdef XIVE_CHECK_LOCKS + assert(lock_held_by_me(&x->lock)); +#endif + switch (ctype) { + case xive_cache_endc: + sreg = VC_ENDC_WATCH0_SPEC; + sregx = X_VC_ENDC_WATCH0_SPEC; + dreg0 = VC_ENDC_WATCH0_DATA0; + dreg0x = X_VC_ENDC_WATCH0_DATA0; + sval = SETFIELD(VC_ENDC_WATCH_BLOCK_ID, idx, block); + break; + case xive_cache_nxc: + sreg = PC_NXC_WATCH0_SPEC; + sregx = X_PC_NXC_WATCH0_SPEC; + dreg0 = PC_NXC_WATCH0_DATA0; + dreg0x = X_PC_NXC_WATCH0_DATA0; + sval = SETFIELD(PC_NXC_WATCH_BLOCK_ID, idx, block); + break; + default: + return OPAL_INTERNAL_ERROR; + } + + /* The full bit is in the same position for ENDC and NXC */ + if (!light_watch) + sval |= VC_ENDC_WATCH_FULL; + + for (;;) { + /* Write the cache watch spec */ + __xive_regw(x, sreg, sregx, sval, NULL); + + /* Load data0 register to populate the watch */ + dval0 = __xive_regr(x, dreg0, dreg0x, NULL); + + /* If new_data is NULL, this is a dummy watch used as a + * workaround for a HW bug + */ + if (!new_data) { + __xive_regw(x, dreg0, dreg0x, dval0, NULL); + return 0; + } + + /* Write the words into the watch facility. We write in reverse + * order in case word 0 is part of it as it must be the last + * one written. + */ + for (i = start_dword + dword_count - 1; i >= start_dword ;i--) { + uint64_t dw = be64_to_cpu(new_data[i - start_dword]); + __xive_regw(x, dreg0 + i * 8, dreg0x + i, dw, NULL); + } + + /* Write data0 register to trigger the update if word 0 wasn't + * written above + */ + if (start_dword > 0) + __xive_regw(x, dreg0, dreg0x, dval0, NULL); + + /* This may not be necessary for light updates (it's possible + * that a sync in sufficient, TBD). Ensure the above is + * complete and check the status of the watch. + */ + status = __xive_regr(x, sreg, sregx, NULL); + + /* Bits FULL and CONFLICT are in the same position in + * ENDC and NXC + */ + if (!(status & VC_ENDC_WATCH_FULL) || + !(status & VC_ENDC_WATCH_CONFLICT)) + break; + if (!synchronous) + return OPAL_BUSY; + + if (++retries == XIVE_CACHE_WATCH_MAX_RETRIES) { + xive_err(x, "Reached maximum retries %d when doing " + "a %s cache update\n", retries, + ctype == xive_cache_endc ? 
"ENDC" : "NXC"); + return OPAL_BUSY; + } + } + + /* Perform a scrub with "want_invalidate" set to false to push the + * cache updates to memory as well + */ + return __xive_cache_scrub(x, ctype, block, idx, false, false); +} + +#ifdef XIVE_DEBUG_INIT_CACHE_UPDATES +static bool xive_check_endc_update(struct xive *x, uint32_t idx, struct xive_end *end) +{ + struct xive_end *end_p = xive_get_end(x, idx); + struct xive_end end2; + + assert(end_p); + end2 = *end_p; + if (memcmp(end, &end2, sizeof(struct xive_end)) != 0) { + xive_err(x, "END update mismatch idx %d\n", idx); + xive_err(x, "want: %08x %08x %08x %08x\n", + end->w0, end->w1, end->w2, end->w3); + xive_err(x, " %08x %08x %08x %08x\n", + end->w4, end->w5, end->w6, end->w7); + xive_err(x, "got : %08x %08x %08x %08x\n", + end2.w0, end2.w1, end2.w2, end2.w3); + xive_err(x, " %08x %08x %08x %08x\n", + end2.w4, end2.w5, end2.w6, end2.w7); + return false; + } + return true; +} + +static bool xive_check_nxc_update(struct xive *x, uint32_t idx, struct xive_nvp *vp) +{ + struct xive_nvp *vp_p = xive_get_vp(x, idx); + struct xive_nvp vp2; + + assert(vp_p); + vp2 = *vp_p; + if (memcmp(vp, &vp2, sizeof(struct xive_nvp)) != 0) { + xive_err(x, "VP update mismatch idx %d\n", idx); + xive_err(x, "want: %08x %08x %08x %08x\n", + vp->w0, vp->w1, vp->w2, vp->w3); + xive_err(x, " %08x %08x %08x %08x\n", + vp->w4, vp->w5, vp->w6, vp->w7); + xive_err(x, "got : %08x %08x %08x %08x\n", + vp2.w0, vp2.w1, vp2.w2, vp2.w3); + xive_err(x, " %08x %08x %08x %08x\n", + vp2.w4, vp2.w5, vp2.w6, vp2.w7); + return false; + } + return true; +} +#else +static inline bool xive_check_endc_update(struct xive *x __unused, + uint32_t idx __unused, + struct xive_end *end __unused) +{ + return true; +} + +static inline bool xive_check_nxc_update(struct xive *x __unused, + uint32_t idx __unused, + struct xive_nvp *vp __unused) +{ + return true; +} +#endif + +static int64_t xive_escalation_ive_cache_update(struct xive *x, uint64_t block, + uint64_t idx, struct xive_eas *eas, + bool synchronous) +{ + return __xive_cache_watch(x, xive_cache_endc, block, idx, + 2, 1, &eas->w, true, synchronous); +} + +static int64_t xive_endc_cache_update(struct xive *x, uint64_t block, + uint64_t idx, struct xive_end *end, + bool synchronous) +{ + int64_t ret; + + ret = __xive_cache_watch(x, xive_cache_endc, block, idx, + 0, 4, (beint64_t *)end, false, synchronous); + xive_check_endc_update(x, idx, end); + return ret; +} + +static int64_t xive_nxc_cache_update(struct xive *x, uint64_t block, + uint64_t idx, struct xive_nvp *vp, + bool synchronous) +{ + int64_t ret; + + ret = __xive_cache_watch(x, xive_cache_nxc, block, idx, + 0, 4, (beint64_t *)vp, false, synchronous); + xive_check_nxc_update(x, idx, vp); + return ret; +} + +/* + * VSD + */ +static bool xive_set_vsd(struct xive *x, uint32_t tbl, uint32_t idx, uint64_t v) +{ + /* Set VC subengine */ + xive_regw(x, VC_VSD_TABLE_ADDR, + SETFIELD(VC_VSD_TABLE_SELECT, 0ull, tbl) | + SETFIELD(VC_VSD_TABLE_ADDRESS, 0ull, idx)); + if (x->last_reg_error) + return false; + xive_regw(x, VC_VSD_TABLE_DATA, v); + if (x->last_reg_error) + return false; + + /* also set PC subengine if table is used */ + if (tbl == VST_EAS || tbl == VST_ERQ || tbl == VST_IC) + return true; + + xive_regw(x, PC_VSD_TABLE_ADDR, + SETFIELD(PC_VSD_TABLE_SELECT, 0ull, tbl) | + SETFIELD(PC_VSD_TABLE_ADDRESS, 0ull, idx)); + if (x->last_reg_error) + return false; + xive_regw(x, PC_VSD_TABLE_DATA, v); + if (x->last_reg_error) + return false; + return true; +} + +static bool 
xive_set_local_tables(struct xive *x) +{ + uint64_t base, i; + + /* These have to be power of 2 sized */ + assert(is_pow2(XIVE_ESB_SIZE)); + assert(is_pow2(XIVE_EAT_SIZE)); + + /* All tables set as exclusive */ + base = SETFIELD(VSD_MODE, 0ull, VSD_MODE_EXCLUSIVE); + + /* ESB: direct mode */ + if (!xive_set_vsd(x, VST_ESB, x->block_id, base | + (((uint64_t)x->sbe_base) & VSD_ADDRESS_MASK) | + SETFIELD(VSD_TSIZE, 0ull, ilog2(XIVE_ESB_SIZE) - 12))) + return false; + + /* EAS: direct mode */ + if (!xive_set_vsd(x, VST_EAS, x->block_id, base | + (((uint64_t)x->eat_base) & VSD_ADDRESS_MASK) | + SETFIELD(VSD_TSIZE, 0ull, ilog2(XIVE_EAT_SIZE) - 12))) + return false; + + /* END: indirect mode with 64K subpages */ + if (!xive_set_vsd(x, VST_END, x->block_id, base | + (((uint64_t)x->end_ind_base) & VSD_ADDRESS_MASK) | + VSD_INDIRECT | SETFIELD(VSD_TSIZE, 0ull, + ilog2(x->end_ind_size) - 12))) + return false; + + /* NVP: indirect mode with 64K subpages */ + if (!xive_set_vsd(x, VST_NVP, x->block_id, base | + (((uint64_t)x->vp_ind_base) & VSD_ADDRESS_MASK) | + VSD_INDIRECT | SETFIELD(VSD_TSIZE, 0ull, + ilog2(x->vp_ind_size) - 12))) + return false; + + /* NVG: not used */ + /* NVC: not used */ + + /* INT and SYNC: indexed with the Topology# */ + if (!xive_set_vsd(x, VST_IC, x->chip_id, base | + (((uint64_t)x->ic_base) & VSD_ADDRESS_MASK) | + SETFIELD(VSD_TSIZE, 0ull, ilog2(x->ic_size) - 12))) + return false; + + if (!xive_set_vsd(x, VST_SYNC, x->chip_id, base | + (((uint64_t)x->sync_inject) & VSD_ADDRESS_MASK) | + SETFIELD(VSD_TSIZE, 0ull, ilog2(x->sync_inject_size) - 12))) + return false; + + /* + * ERQ: one 64K page for each queue overflow. Indexed with : + * + * 0:IPI, 1:HWD, 2:NxC, 3:INT, 4:OS-Queue, 5:Pool-Queue, 6:Hard-Queue + */ + for (i = 0; i < VC_QUEUE_COUNT; i++) { + u64 addr = ((uint64_t)x->q_ovf) + i * PAGE_SIZE; + u64 cfg, sreg, sregx; + + if (!xive_set_vsd(x, VST_ERQ, i, base | + (addr & VSD_ADDRESS_MASK) | + SETFIELD(VSD_TSIZE, 0ull, 4))) + return false; + + sreg = VC_QUEUES_CFG_REM0 + i * 8; + sregx = X_VC_QUEUES_CFG_REM0 + i; + cfg = __xive_regr(x, sreg, sregx, NULL); + cfg |= VC_QUEUES_CFG_MEMB_EN; + cfg = SETFIELD(VC_QUEUES_CFG_MEMB_SZ, cfg, 4); + __xive_regw(x, sreg, sregx, cfg, NULL); + } + + return true; +} + + +/* + * IC BAR layout + * + * Page 0: Internal CQ register accesses (reads & writes) + * Page 1: Internal PC register accesses (reads & writes) + * Page 2: Internal VC register accesses (reads & writes) + * Page 3: Internal TCTXT (TIMA) reg accesses (read & writes) + * Page 4: Notify Port page (writes only, w/data), + * Page 5: Reserved + * Page 6: Sync Poll page (writes only, dataless) + * Page 7: Sync Inject page (writes only, dataless) + * Page 8: LSI Trigger page (writes only, dataless) + * Page 9: LSI SB Management page (reads & writes dataless) + * Pages 10-255: Reserved + * Pages 256-383: Direct mapped Thread Context Area (reads & writes) + * covering the 128 threads in P10. 
+ * Pages 384-511: Reserved + */ + +#define XIVE_IC_CQ_PGOFF 0 +#define XIVE_IC_PC_PGOFF 1 +#define XIVE_IC_VC_PGOFF 2 +#define XIVE_IC_TCTXT_PGOFF 3 +#define XIVE_NOTIFY_PGOFF 4 +#define XIVE_SYNC_POLL_PGOFF 6 +#define XIVE_SYNC_INJECT_PGOFF 7 +#define XIVE_LSI_TRIGGER_PGOFF 8 +#define XIVE_LSI_MGMT_PGOFF 9 +#define XIVE_IC_TM_DIRECT_PGOFF 256 + +static bool xive_configure_ic_bars(struct xive *x) +{ + uint64_t chip_id = x->chip_id; + uint64_t val; + + /* Reset all bars to zero */ + xive_regwx(x, CQ_RST_CTL, CQ_RST_PB_BAR_RESET); + + /* IC BAR */ + phys_map_get(chip_id, XIVE_IC, 0, (uint64_t *)&x->ic_base, &x->ic_size); + val = (uint64_t)x->ic_base | CQ_IC_BAR_VALID | CQ_IC_BAR_64K; + x->ic_shift = 16; + + xive_regwx(x, CQ_IC_BAR, val); + if (x->last_reg_error) + return false; + + /* + * TM BAR, same address for each chip. Hence we create a fake + * chip 0 and use that for all phys_map_get(XIVE_TM) calls. + */ + phys_map_get(0, XIVE_TM, 0, (uint64_t *)&x->tm_base, &x->tm_size); + val = (uint64_t)x->tm_base | CQ_TM_BAR_VALID | CQ_TM_BAR_64K; + x->tm_shift = 16; + + xive_regwx(x, CQ_TM_BAR, val); + if (x->last_reg_error) + return false; + + /* IC BAR sub-pages shortcuts */ + x->ic_tm_direct_base = x->ic_base + + (XIVE_IC_TM_DIRECT_PGOFF << x->ic_shift); + + return true; +} + +/* + * NVPG, NVC, ESB, END BARs have common attributes: 64k page and only + * one set covering the whole BAR. + */ +static bool xive_configure_bars(struct xive *x) +{ + uint64_t chip_id = x->chip_id; + uint64_t val; + uint64_t esb_size; + uint64_t end_size; + uint64_t nvp_size; + + x->nvp_size = XIVE_VP_COUNT(x) << XIVE_NVP_SHIFT; + x->esb_size = XIVE_INT_COUNT << XIVE_ESB_SHIFT; + x->end_size = XIVE_END_COUNT << XIVE_END_SHIFT; + + /* + * NVC BAR is not configured because we do not use the XIVE2 + * Crowd capability. 
+ */ + + /* NVPG BAR: two pages, even NVP, odd NVG */ + phys_map_get(chip_id, XIVE_NVPG, 0, (uint64_t *)&x->nvp_base, &nvp_size); + if (x->nvp_size > nvp_size) { + xive_err(x, "NVP table is larger than default: " + "0x%012llx > 0x%012llx\n", x->nvp_size, nvp_size); + return false; + } + + val = (uint64_t)x->nvp_base | CQ_BAR_VALID | CQ_BAR_64K | + SETFIELD(CQ_BAR_RANGE, 0ull, ilog2(x->nvp_size) - 24); + xive_regwx(x, CQ_NVPG_BAR, val); + if (x->last_reg_error) + return false; + + /* ESB BAR */ + phys_map_get(chip_id, XIVE_ESB, 0, (uint64_t *)&x->esb_base, &esb_size); + if (x->esb_size > esb_size) { + xive_err(x, "ESB table is larger than default: " + "0x%012llx > 0x%012llx\n", x->esb_size, esb_size); + return false; + } + + val = (uint64_t)x->esb_base | CQ_BAR_VALID | CQ_BAR_64K | + SETFIELD(CQ_BAR_RANGE, 0ull, ilog2(x->esb_size) - 24); + xive_regwx(x, CQ_ESB_BAR, val); + if (x->last_reg_error) + return false; + + /* END BAR */ + phys_map_get(chip_id, XIVE_END, 0, (uint64_t *)&x->end_base, &end_size); + if (x->end_size > end_size) { + xive_err(x, "END table is larger than default: " + "0x%012llx > 0x%012llx\n", x->end_size, end_size); + return false; + } + + val = (uint64_t)x->end_base | CQ_BAR_VALID | CQ_BAR_64K | + SETFIELD(CQ_BAR_RANGE, 0ull, ilog2(x->end_size) - 24); + xive_regwx(x, CQ_END_BAR, val); + if (x->last_reg_error) + return false; + + xive_dbg(x, "IC: %14p [0x%012llx]\n", x->ic_base, x->ic_size); + xive_dbg(x, "TM: %14p [0x%012llx]\n", x->tm_base, x->tm_size); + xive_dbg(x, "NVP: %14p [0x%012llx]\n", x->nvp_base, x->nvp_size); + xive_dbg(x, "ESB: %14p [0x%012llx]\n", x->esb_base, x->esb_size); + xive_dbg(x, "END: %14p [0x%012llx]\n", x->end_base, x->end_size); + xive_dbg(x, "OVF: %14p [0x%012x]\n", x->q_ovf, + VC_QUEUE_COUNT * PAGE_SIZE); + + return true; +} + +static void xive_dump_mmio(struct xive *x) +{ + prlog(PR_DEBUG, " CQ_CFG_PB_GEN = %016llx\n", + in_be64(x->ic_base + CQ_CFG_PB_GEN)); + prlog(PR_DEBUG, " CQ_MSGSND = %016llx\n", + in_be64(x->ic_base + CQ_MSGSND)); +} + +static const struct { + uint64_t bitmask; + const char *name; +} xive_capabilities[] = { + { CQ_XIVE_CAP_PHB_PQ_DISABLE, "PHB PQ disable mode support" }, + { CQ_XIVE_CAP_PHB_ABT, "PHB address based trigger mode support" }, + { CQ_XIVE_CAP_EXPLOITATION_MODE, "Exploitation mode" }, + { CQ_XIVE_CAP_STORE_EOI, "StoreEOI mode support" }, + { CQ_XIVE_CAP_VP_SAVE_RESTORE, "VP Context Save and Restore" }, +}; + +static void xive_dump_capabilities(struct xive *x, uint64_t cap_val) +{ + int i; + + xive_dbg(x, "capabilities: %016llx\n", cap_val); + xive_dbg(x, "\tVersion: %lld\n", + GETFIELD(CQ_XIVE_CAP_VERSION, cap_val)); + xive_dbg(x, "\tUser interrupt priorities: [ 1 - %d ]\n", + 1 << GETFIELD(CQ_XIVE_CAP_USER_INT_PRIO, cap_val)); + xive_dbg(x, "\tVP interrupt priorities: [ %d - 8 ]\n", + 1 << GETFIELD(CQ_XIVE_CAP_VP_INT_PRIO, cap_val)); + xive_dbg(x, "\tExtended Blockid bits: %lld\n", + 4 + GETFIELD(CQ_XIVE_CAP_BLOCK_ID_WIDTH, cap_val)); + + for (i = 0; i < ARRAY_SIZE(xive_capabilities); i++) { + if (xive_capabilities[i].bitmask & cap_val) + xive_dbg(x, "\t%s\n", xive_capabilities[i].name); + } +} + +static const struct { + uint64_t bitmask; + const char *name; +} xive_configs[] = { + { CQ_XIVE_CFG_GEN1_TIMA_OS, "Gen1 mode TIMA OS" }, + { CQ_XIVE_CFG_GEN1_TIMA_HYP, "Gen1 mode TIMA Hyp" }, + { CQ_XIVE_CFG_GEN1_TIMA_HYP_BLK0, "Gen1 mode TIMA General Hypervisor Block0" }, + { CQ_XIVE_CFG_GEN1_TIMA_CROWD_DIS, "Gen1 mode TIMA Crowd disable" }, + { CQ_XIVE_CFG_GEN1_END_ESX, "Gen1 mode END ESx" }, + { 
CQ_XIVE_CFG_EN_VP_SAVE_RESTORE, "VP Context Save and Restore" }, + { CQ_XIVE_CFG_EN_VP_SAVE_REST_STRICT, "VP Context Save and Restore strict" }, +}; + +static void xive_dump_configuration(struct xive *x, const char *prefix, + uint64_t cfg_val) +{ + int i ; + + xive_dbg(x, "%s configuration: %016llx\n", prefix, cfg_val); + xive_dbg(x, "\tHardwired Thread Id range: %lld bits\n", + 7 + GETFIELD(CQ_XIVE_CFG_HYP_HARD_RANGE, cfg_val)); + xive_dbg(x, "\tUser Interrupt priorities: [ 1 - %d ]\n", + 1 << GETFIELD(CQ_XIVE_CFG_USER_INT_PRIO, cfg_val)); + xive_dbg(x, "\tVP Interrupt priorities: [ 0 - %d ]\n", xive_max_prio(x)); + xive_dbg(x, "\tBlockId bits: %lld bits\n", + 4 + GETFIELD(CQ_XIVE_CFG_BLOCK_ID_WIDTH, cfg_val)); + if (CQ_XIVE_CFG_HYP_HARD_BLKID_OVERRIDE & cfg_val) + xive_dbg(x, "\tHardwired BlockId: %lld\n", + GETFIELD(CQ_XIVE_CFG_HYP_HARD_BLOCK_ID, cfg_val)); + + for (i = 0; i < ARRAY_SIZE(xive_configs); i++) { + if (xive_configs[i].bitmask & cfg_val) + xive_dbg(x, "\t%s\n", xive_configs[i].name); + } +} + +/* + * Default XIVE configuration + */ +#define XIVE_CONFIGURATION \ + (SETFIELD(CQ_XIVE_CFG_HYP_HARD_RANGE, 0ull, CQ_XIVE_CFG_THREADID_8BITS) | \ + SETFIELD(CQ_XIVE_CFG_VP_INT_PRIO, 0ull, CQ_XIVE_CFG_INT_PRIO_8)) + +/* + * Gen1 configuration for tests (QEMU) + */ +#define XIVE_CONFIGURATION_GEN1 \ + (SETFIELD(CQ_XIVE_CFG_HYP_HARD_RANGE, 0ull, CQ_XIVE_CFG_THREADID_7BITS) | \ + SETFIELD(CQ_XIVE_CFG_VP_INT_PRIO, 0ull, CQ_XIVE_CFG_INT_PRIO_8) | \ + CQ_XIVE_CFG_GEN1_TIMA_OS | \ + CQ_XIVE_CFG_GEN1_TIMA_HYP | \ + CQ_XIVE_CFG_GEN1_TIMA_HYP_BLK0 | \ + CQ_XIVE_CFG_GEN1_TIMA_CROWD_DIS | \ + CQ_XIVE_CFG_GEN1_END_ESX) + +static bool xive_has_cap(struct xive *x, uint64_t cap) +{ + return !!x && !!(x->capabilities & cap); +} + +#define XIVE_CAN_STORE_EOI(x) xive_has_cap(x, CQ_XIVE_CAP_STORE_EOI) + +static bool xive_cfg_save_restore(struct xive *x) +{ + return !!(x->config & CQ_XIVE_CFG_EN_VP_SAVE_RESTORE); +} + +/* + * When PQ_disable is available, configure the ESB cache to improve + * performance for PHB ESBs. + * + * split_mode : + * 1/3rd of the cache is reserved for PHB ESBs and the rest to + * IPIs. This is sufficient to keep all the PHB ESBs in cache and + * avoid ESB cache misses during IO interrupt processing. + * + * hash_array_enable : + * Internal cache hashing optimization. The hash_array tracks for + * ESBs where the original trigger came from so that we avoid + * getting the EAS into the cache twice. + */ +static void xive_config_esb_cache(struct xive *x) +{ + uint64_t val = xive_regr(x, VC_ESBC_CFG); + + if (xive_has_cap(x, CQ_XIVE_CAP_PHB_PQ_DISABLE)) { + val |= VC_ESBC_CFG_SPLIT_MODE | VC_ESBC_CFG_HASH_ARRAY_ENABLE; + val = SETFIELD(VC_ESBC_CFG_MAX_ENTRIES_IN_MODIFIED, val, 0xE); + xive_dbg(x, "ESB cache configured with split mode " + "and hash array. VC_ESBC_CFG=%016llx\n", val); + } else + val &= ~VC_ESBC_CFG_SPLIT_MODE; + + xive_regw(x, VC_ESBC_CFG, val); +} + +static void xive_config_fused_core(struct xive *x) +{ + uint64_t val = xive_regr(x, TCTXT_CFG); + + if (this_cpu()->is_fused_core) { + val |= TCTXT_CFG_FUSE_CORE_EN; + xive_dbg(x, "configured for fused cores. 
" + "PC_TCTXT_CFG=%016llx\n", val); + } else + val &= ~TCTXT_CFG_FUSE_CORE_EN; + xive_regw(x, TCTXT_CFG, val); +} + +static void xive_config_reduced_priorities_fixup(struct xive *x) +{ + if (xive_cfg_vp_prio_shift(x) < CQ_XIVE_CFG_INT_PRIO_8 && + x->quirks & XIVE_QUIRK_BROKEN_PRIO_CHECK) { + uint64_t val = xive_regr(x, PC_ERR1_CFG1); + + val &= ~PC_ERR1_CFG1_INTERRUPT_INVALID_PRIO; + xive_dbg(x, "workaround for reduced priorities. " + "PC_ERR1_CFG1=%016llx\n", val); + xive_regw(x, PC_ERR1_CFG1, val); + } +} + +static bool xive_config_init(struct xive *x) +{ + x->capabilities = xive_regr(x, CQ_XIVE_CAP); + xive_dump_capabilities(x, x->capabilities); + + x->generation = GETFIELD(CQ_XIVE_CAP_VERSION, x->capabilities); + + /* + * Allow QEMU to override version for tests + */ + if (x->generation != XIVE_GEN2 && !chip_quirk(QUIRK_QEMU)) { + xive_err(x, "Invalid XIVE controller version %d\n", + x->generation); + return false; + } + + x->config = xive_regr(x, CQ_XIVE_CFG); + xive_dump_configuration(x, "default", x->config); + + /* Start with default settings */ + x->config = x->generation == XIVE_GEN1 ? XIVE_CONFIGURATION_GEN1 : + XIVE_CONFIGURATION; + + if (x->quirks & XIVE_QUIRK_THREADID_7BITS) + x->config = SETFIELD(CQ_XIVE_CFG_HYP_HARD_RANGE, x->config, + CQ_XIVE_CFG_THREADID_7BITS); + + /* + * Hardwire the block ID. The default value is the topology ID + * of the chip which is different from the block. + */ + x->config |= CQ_XIVE_CFG_HYP_HARD_BLKID_OVERRIDE | + SETFIELD(CQ_XIVE_CFG_HYP_HARD_BLOCK_ID, 0ull, x->block_id); + + /* + * Enable "VP Context Save and Restore" by default. it is + * compatible with KVM which currently does the context + * save&restore in the entry/exit path of the vCPU + */ + if (x->capabilities & CQ_XIVE_CAP_VP_SAVE_RESTORE) + x->config |= CQ_XIVE_CFG_EN_VP_SAVE_RESTORE; + + xive_dump_configuration(x, "new", x->config); + xive_regw(x, CQ_XIVE_CFG, x->config); + if (xive_regr(x, CQ_XIVE_CFG) != x->config) { + xive_err(x, "configuration setting failed\n"); + } + + /* + * Disable error reporting in the FIR for info errors from the VC. + */ + xive_regw(x, CQ_FIRMASK_OR, CQ_FIR_VC_INFO_ERROR_0_2); + + /* + * Mask CI Load and Store to bad location, as IPI trigger + * pages may be mapped to user space, and a read on the + * trigger page causes a checkstop + */ + xive_regw(x, CQ_FIRMASK_OR, CQ_FIR_PB_RCMDX_CI_ERR1); + + /* + * VP space settings. P9 mode is 19bits. + */ + x->vp_shift = x->generation == XIVE_GEN1 ? + VP_SHIFT_GEN1 : VP_SHIFT_GEN2; + + /* + * VP ids for HW threads. These values are hardcoded in the + * CAM line of the HW context + * + * POWER10 |chip|0000000000000001|threadid| + * 28bits 4 16 8 + * + * POWER9 |chip|000000000001|thrdid | + * 23bits 4 12 7 + */ + + /* TODO (cosmetic): set VP ids for HW threads only once */ + xive_threadid_shift = 7 + GETFIELD(CQ_XIVE_CFG_HYP_HARD_RANGE, + x->config); + + xive_hw_vp_base = 1 << xive_threadid_shift; + xive_hw_vp_count = 1 << xive_threadid_shift; + + xive_dbg(x, "store EOI is %savailable\n", + XIVE_CAN_STORE_EOI(x) ? 
"" : "not "); + + xive_config_fused_core(x); + + xive_config_esb_cache(x); + + xive_config_reduced_priorities_fixup(x); + + return true; +} + +/* Set Translation tables : 1 block per chip */ +static bool xive_setup_set_xlate(struct xive *x) +{ + unsigned int i; + + /* Configure ESBs */ + xive_regw(x, CQ_TAR, + CQ_TAR_AUTOINC | SETFIELD(CQ_TAR_SELECT, 0ull, CQ_TAR_ESB)); + if (x->last_reg_error) + return false; + for (i = 0; i < XIVE_MAX_BLOCKS; i++) { + xive_regw(x, CQ_TDR, CQ_TDR_VALID | + SETFIELD(CQ_TDR_BLOCK_ID, 0ull, x->block_id)); + if (x->last_reg_error) + return false; + } + + /* Configure ENDs */ + xive_regw(x, CQ_TAR, + CQ_TAR_AUTOINC | SETFIELD(CQ_TAR_SELECT, 0ull, CQ_TAR_END)); + if (x->last_reg_error) + return false; + for (i = 0; i < XIVE_MAX_BLOCKS; i++) { + xive_regw(x, CQ_TDR, CQ_TDR_VALID | + SETFIELD(CQ_TDR_BLOCK_ID, 0ull, x->block_id)); + if (x->last_reg_error) + return false; + } + + /* Configure NVPs */ + xive_regw(x, CQ_TAR, + CQ_TAR_AUTOINC | SETFIELD(CQ_TAR_SELECT, 0ull, CQ_TAR_NVPG)); + if (x->last_reg_error) + return false; + for (i = 0; i < XIVE_MAX_BLOCKS; i++) { + xive_regw(x, CQ_TDR, CQ_TDR_VALID | + SETFIELD(CQ_TDR_BLOCK_ID, 0ull, x->block_id)); + if (x->last_reg_error) + return false; + } + return true; +} + +static bool xive_prealloc_tables(struct xive *x) +{ + uint32_t i; + uint32_t pbase, pend; + + /* ESB has 4 entries per byte */ + x->sbe_base = local_alloc(x->chip_id, XIVE_ESB_SIZE, XIVE_ESB_SIZE); + if (!x->sbe_base) { + xive_err(x, "Failed to allocate SBE\n"); + return false; + } + + /* PQs are initialized to 0b01 which corresponds to "ints off" */ + memset(x->sbe_base, 0x55, XIVE_ESB_SIZE); + xive_dbg(x, "SBE at %p size 0x%lx\n", x->sbe_base, XIVE_ESB_SIZE); + + /* EAS entries are 8 bytes */ + x->eat_base = local_alloc(x->chip_id, XIVE_EAT_SIZE, XIVE_EAT_SIZE); + if (!x->eat_base) { + xive_err(x, "Failed to allocate EAS\n"); + return false; + } + + /* + * We clear the entries (non-valid). They will be initialized + * when actually used + */ + memset(x->eat_base, 0, XIVE_EAT_SIZE); + xive_dbg(x, "EAT at %p size 0x%lx\n", x->eat_base, XIVE_EAT_SIZE); + + /* Indirect END table. Limited to one top page. */ + x->end_ind_size = ALIGN_UP(XIVE_END_TABLE_SIZE, PAGE_SIZE); + if (x->end_ind_size > PAGE_SIZE) { + xive_err(x, "END indirect table is too big !\n"); + return false; + } + x->end_ind_base = local_alloc(x->chip_id, x->end_ind_size, + x->end_ind_size); + if (!x->end_ind_base) { + xive_err(x, "Failed to allocate END indirect table\n"); + return false; + } + memset(x->end_ind_base, 0, x->end_ind_size); + xive_dbg(x, "ENDi at %p size 0x%llx #%ld entries\n", x->end_ind_base, + x->end_ind_size, XIVE_END_COUNT); + x->end_ind_count = XIVE_END_TABLE_SIZE / XIVE_VSD_SIZE; + + /* Indirect VP table. Limited to one top page. 
*/ + x->vp_ind_size = ALIGN_UP(XIVE_VP_TABLE_SIZE(x), PAGE_SIZE); + if (x->vp_ind_size > PAGE_SIZE) { + xive_err(x, "VP indirect table is too big !\n"); + return false; + } + x->vp_ind_base = local_alloc(x->chip_id, x->vp_ind_size, + x->vp_ind_size); + if (!x->vp_ind_base) { + xive_err(x, "Failed to allocate VP indirect table\n"); + return false; + } + xive_dbg(x, "VPi at %p size 0x%llx #%ld entries\n", x->vp_ind_base, + x->vp_ind_size, XIVE_VP_COUNT(x)); + x->vp_ind_count = XIVE_VP_TABLE_SIZE(x) / XIVE_VSD_SIZE; + memset(x->vp_ind_base, 0, x->vp_ind_size); + + /* Allocate pages for the VP ids representing HW threads */ + pbase = xive_hw_vp_base / VP_PER_PAGE; + pend = (xive_hw_vp_base + xive_hw_vp_count) / VP_PER_PAGE; + + xive_dbg(x, "Allocating pages %d to %d of VPs (for %d VPs)\n", + pbase, pend, xive_hw_vp_count); + for (i = pbase; i <= pend; i++) { + void *page; + u64 vsd; + + /* Indirect entries have a VSD format */ + page = local_alloc(x->chip_id, PAGE_SIZE, PAGE_SIZE); + if (!page) { + xive_err(x, "Failed to allocate VP page\n"); + return false; + } + xive_dbg(x, "VP%d at %p size 0x%x\n", i, page, PAGE_SIZE); + memset(page, 0, PAGE_SIZE); + vsd = ((uint64_t)page) & VSD_ADDRESS_MASK; + + vsd |= SETFIELD(VSD_TSIZE, 0ull, 4); + vsd |= SETFIELD(VSD_MODE, 0ull, VSD_MODE_EXCLUSIVE); + vsd |= VSD_FIRMWARE; + x->vp_ind_base[i] = cpu_to_be64(vsd); + } + + /* + * Allocate page for cache and sync injection (512 * 128 hw + * threads) + one extra page for future use + */ + x->sync_inject_size = PAGE_SIZE + PAGE_SIZE; + x->sync_inject = local_alloc(x->chip_id, x->sync_inject_size, + x->sync_inject_size); + if (!x->sync_inject) { + xive_err(x, "Failed to allocate sync pages\n"); + return false; + } + + /* + * The Memory Coherence Directory uses 16M "granule" to track + * shared copies of a cache line. If any cache line within the + * 16M range gets touched by someone outside of the group, the + * MCD forces accesses to any cache line within the range to + * include everyone that might have a shared copy. + */ +#define QUEUE_OVF_ALIGN (16 << 20) /* MCD granule size */ + + /* + * Allocate the queue overflow pages and use a 16M alignment + * to avoid sharing with other structures and reduce traffic + * on the PowerBus. 
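+ * With one 64K page per VC queue, the handful of overflow pages + * stays well inside a single 16M granule.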
+ */ + x->q_ovf = local_alloc(x->chip_id, VC_QUEUE_COUNT * PAGE_SIZE, + QUEUE_OVF_ALIGN); + if (!x->q_ovf) { + xive_err(x, "Failed to allocate queue overflow\n"); + return false; + } + return true; +} + +static void xive_add_provisioning_properties(void) +{ + beint32_t chips[XIVE_MAX_CHIPS]; + uint32_t i, count; + + dt_add_property_cells(xive_dt_node, + "ibm,xive-provision-page-size", PAGE_SIZE); + + count = 1 << xive_chips_alloc_bits; + for (i = 0; i < count; i++) + chips[i] = cpu_to_be32(xive_block_to_chip[i]); + dt_add_property(xive_dt_node, "ibm,xive-provision-chips", + chips, 4 * count); +} + +static void xive_create_mmio_dt_node(struct xive *x) +{ + uint64_t tb = (uint64_t)x->tm_base; + uint32_t stride = 1u << x->tm_shift; + + xive_dt_node = dt_new_addr(dt_root, "interrupt-controller", tb); + assert(xive_dt_node); + + dt_add_property_u64s(xive_dt_node, "reg", + tb + 0 * stride, stride, + tb + 1 * stride, stride, + tb + 2 * stride, stride, + tb + 3 * stride, stride); + + dt_add_property_strings(xive_dt_node, "compatible", + "ibm,opal-xive-pe", "ibm,opal-intc"); + + dt_add_property_cells(xive_dt_node, "ibm,xive-eq-sizes", + 12, 16, 21, 24); + + dt_add_property_cells(xive_dt_node, "ibm,xive-#priorities", + xive_cfg_vp_prio(x)); + + dt_add_property(xive_dt_node, "single-escalation-support", NULL, 0); + + if (XIVE_CAN_STORE_EOI(x)) + dt_add_property(xive_dt_node, "store-eoi", NULL, 0); + + if (xive_cfg_save_restore(x)) + dt_add_property(xive_dt_node, "vp-save-restore", NULL, 0); + + xive_add_provisioning_properties(); + +} + +static void xive_setup_forward_ports(struct xive *x, struct proc_chip *remote_chip) +{ + struct xive *remote_xive = remote_chip->xive; + uint64_t base = SETFIELD(VSD_MODE, 0ull, VSD_MODE_FORWARD); + + if (!xive_set_vsd(x, VST_ESB, remote_xive->block_id, + base | ((uint64_t)remote_xive->esb_base) | + SETFIELD(VSD_TSIZE, 0ull, ilog2(x->esb_size) - 12))) + goto error; + + /* EAS: No remote */ + + if (!xive_set_vsd(x, VST_END, remote_xive->block_id, + base | ((uint64_t)remote_xive->end_base) | + SETFIELD(VSD_TSIZE, 0ull, ilog2(x->end_size) - 12))) + goto error; + + if (!xive_set_vsd(x, VST_NVP, remote_xive->block_id, + base | ((uint64_t)remote_xive->nvp_base) | + SETFIELD(VSD_TSIZE, 0ull, ilog2(x->nvp_size) - 12))) + goto error; + + /* NVG: not used */ + /* NVC: not used */ + + if (!xive_set_vsd(x, VST_IC, remote_xive->chip_id, + base | ((uint64_t)remote_xive->ic_base) | + SETFIELD(VSD_TSIZE, 0ull, ilog2(x->ic_size) - 12))) + goto error; + + if (!xive_set_vsd(x, VST_SYNC, remote_xive->chip_id, + base | ((uint64_t)remote_xive->sync_inject) | + SETFIELD(VSD_TSIZE, 0ull, ilog2(x->sync_inject_size) - 12))) + goto error; + + /* ERQ: No remote */ + + return; + + error: + xive_err(x, "Failure configuring forwarding ports\n"); +} + +static void late_init_one_xive(struct xive *x) +{ + struct proc_chip *chip; + + /* We need to setup the cross-chip forward ports. 
Let's + * iterate all chip and set them up accordingly + */ + for_each_chip(chip) { + /* We skip ourselves or chips without a xive */ + if (chip->xive == x || !chip->xive) + continue; + + /* Setup our forward ports to that chip */ + xive_setup_forward_ports(x, chip); + } +} + +static bool xive_check_ipi_free(struct xive *x, uint32_t irq, uint32_t count) +{ + uint32_t i, idx = GIRQ_TO_IDX(irq); + + for (i = 0; i < count; i++) + if (bitmap_tst_bit(*x->ipi_alloc_map, idx + i)) + return false; + return true; +} + +uint32_t xive2_alloc_hw_irqs(uint32_t chip_id, uint32_t count, + uint32_t align) +{ + struct proc_chip *chip = get_chip(chip_id); + struct xive *x; + uint32_t base, i; + + assert(chip); + assert(is_pow2(align)); + + x = chip->xive; + assert(x); + + lock(&x->lock); + + /* Allocate the HW interrupts */ + base = x->int_hw_bot - count; + base &= ~(align - 1); + if (base < x->int_ipi_top) { + xive_err(x, + "HW alloc request for %d interrupts aligned to %d failed\n", + count, align); + unlock(&x->lock); + return XIVE_IRQ_ERROR; + } + if (!xive_check_ipi_free(x, base, count)) { + xive_err(x, "HWIRQ boot allocator request overlaps dynamic allocator\n"); + unlock(&x->lock); + return XIVE_IRQ_ERROR; + } + + x->int_hw_bot = base; + + /* Initialize the corresponding EAS entries to sane defaults, + * IE entry is valid, not routed and masked, EQ data is set + * to the GIRQ number. + */ + for (i = 0; i < count; i++) { + struct xive_eas *eas = xive_get_eas(x, base + i); + + eas->w = xive_set_field64(EAS_VALID, 0, 1) | + xive_set_field64(EAS_MASKED, 0, 1) | + xive_set_field64(EAS_END_DATA, 0, base + i); + } + + unlock(&x->lock); + return base; +} + +uint32_t xive2_alloc_ipi_irqs(uint32_t chip_id, uint32_t count, + uint32_t align) +{ + struct proc_chip *chip = get_chip(chip_id); + struct xive *x; + uint32_t base, i; + + assert(chip); + assert(is_pow2(align)); + + x = chip->xive; + assert(x); + + lock(&x->lock); + + /* Allocate the IPI interrupts */ + base = x->int_ipi_top + (align - 1); + base &= ~(align - 1); + if (base >= x->int_hw_bot) { + xive_err(x, + "IPI alloc request for %d interrupts aligned to %d failed\n", + count, align); + unlock(&x->lock); + return XIVE_IRQ_ERROR; + } + if (!xive_check_ipi_free(x, base, count)) { + xive_err(x, "IPI boot allocator request overlaps dynamic allocator\n"); + unlock(&x->lock); + return XIVE_IRQ_ERROR; + } + + x->int_ipi_top = base + count; + + /* Initialize the corresponding EAS entries to sane defaults, + * IE entry is valid, not routed and masked, END data is set + * to the GIRQ number. 
+ */ + for (i = 0; i < count; i++) { + struct xive_eas *eas = xive_get_eas(x, base + i); + + eas->w = xive_set_field64(EAS_VALID, 0, 1) | + xive_set_field64(EAS_MASKED, 0, 1) | + xive_set_field64(EAS_END_DATA, 0, base + i); + } + + unlock(&x->lock); + return base; +} + +void *xive2_get_trigger_port(uint32_t girq) +{ + uint32_t idx = GIRQ_TO_IDX(girq); + struct xive *x; + + /* Find XIVE on which the EAS resides */ + x = xive_from_isn(girq); + if (!x) + return NULL; + + if (GIRQ_IS_ESCALATION(girq)) { + /* There is no trigger page for escalation interrupts */ + return NULL; + } else { + /* Make sure it's an IPI on that chip */ + if (girq < x->int_base || + girq >= x->int_ipi_top) + return NULL; + + return x->esb_base + idx * XIVE_ESB_PAGE_SIZE; + } +} + +/* + * Notify Port page (writes only, w/data), separated into two + * categories, both sent to VC: + * - IPI queue (Addr bit 52 = 0) (for NPU) + * - HW queue (Addr bit 52 = 1) + */ +uint64_t xive2_get_notify_port(uint32_t chip_id, uint32_t ent) +{ + struct proc_chip *chip = get_chip(chip_id); + struct xive *x; + uint32_t offset = 0; + + assert(chip); + x = chip->xive; + assert(x); + + /* This is where we can assign a different HW queue to a different + * source by offsetting into the cache lines of the notify port + * + * For now we keep it very basic, this will have to be looked at + * again on real HW with some proper performance analysis. + * + * Here's what Florian says on the matter: + * + * << + * The first 2k of the notify port page can all be used for PCIe triggers + * + * However the idea would be that we try to use the first 4 cache lines to + * balance the PCIe Interrupt requests to use the least used snoop buses + * (we went from 2 to 4 snoop buses for P9). snoop 0 is heavily used + * (I think TLBIs are using that in addition to the normal addresses), + * snoop 3 is used for all Int commands, so I think snoop 2 (CL 2 in the + * page) is the least used overall. So we probably should that one for + * the Int commands from PCIe. + * + * In addition, our EAS cache supports hashing to provide "private" cache + * areas for the PHBs in the shared 1k EAS cache. This allows e.g. to avoid + * that one "thrashing" PHB thrashes the EAS cache for everyone, or provide + * a PHB with a private area that would allow high cache hits in case of a + * device using very few interrupts. The hashing is based on the offset within + * the cache line. So using that, you can e.g. set the EAS cache up so that + * IPIs use 512 entries, the x16 PHB uses 256 entries and the x8 PHBs 128 + * entries each - or IPIs using all entries and sharing with PHBs, so PHBs + * would use 512 entries and 256 entries respectively. + * + * This is a tuning we would probably do later in the lab, but as a "prep" + * we should set up the different PHBs such that they are using different + * 8B-aligned offsets within the cache line, so e.g. + * PH4_0 addr 0x100 (CL 2 DW0 + * PH4_1 addr 0x108 (CL 2 DW1) + * PH4_2 addr 0x110 (CL 2 DW2) + * etc. + * >> + * + * I'm using snoop1 for PHB0 and snoop2 for everybody else. 
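+ * + * Concretely, the switch below gives PHB0 its own offset (0x800) in one + * cache line and spreads PHB1..PHB5 and the PSI source over distinct + * 8B-aligned offsets (0x908..0x930) of another.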
+ */ + + /* Florian adds : + * + * we just set them up for a start to have different offsets + * within the cache line so that we could use the allocation + * restrictions that can be enforced in the interrupt + * controller + * + * P10 might now be randomizing the cache line bits in HW to + * balance snoop bus usage + */ + switch(ent) { + case XIVE_HW_SRC_PHBn(0): + offset = 0x800; + break; + case XIVE_HW_SRC_PHBn(1): + offset = 0x908; + break; + case XIVE_HW_SRC_PHBn(2): + offset = 0x910; + break; + case XIVE_HW_SRC_PHBn(3): + offset = 0x918; + break; + case XIVE_HW_SRC_PHBn(4): + offset = 0x920; + break; + case XIVE_HW_SRC_PHBn(5): + offset = 0x928; + break; + case XIVE_HW_SRC_PSI: + offset = 0x930; + break; + default: + assert(false); + return 0; + } + + return ((uint64_t)x->ic_base) + + (XIVE_NOTIFY_PGOFF << x->ic_shift) + offset; +} + +/* Manufacture the powerbus packet bits 32:63 */ +__attrconst uint32_t xive2_get_notify_base(uint32_t girq) +{ + return (GIRQ_TO_BLK(girq) << 28) | GIRQ_TO_IDX(girq); +} + +static bool xive_get_irq_targetting(uint32_t isn, uint32_t *out_target, + uint8_t *out_prio, uint32_t *out_lirq) +{ + struct xive_eas *eas; + struct xive *x, *end_x; + struct xive_end *end; + uint32_t end_blk, end_idx; + uint32_t vp_blk, vp_idx; + uint32_t prio, server; + bool is_escalation = GIRQ_IS_ESCALATION(isn); + + /* Find XIVE on which the EAS resides */ + x = xive_from_isn(isn); + if (!x) + return false; + /* Grab the EAS */ + eas = xive_get_eas(x, isn); + if (!eas) + return false; + if (!xive_get_field64(EAS_VALID, eas->w) && !is_escalation) { + xive_err(x, "ISN %x lead to invalid EAS !\n", isn); + return false; + } + + if (out_lirq) + *out_lirq = xive_get_field64(EAS_END_DATA, eas->w); + + /* Find the END and its xive instance */ + end_blk = xive_get_field64(EAS_END_BLOCK, eas->w); + end_idx = xive_get_field64(EAS_END_INDEX, eas->w); + end_x = xive_from_vc_blk(end_blk); + + /* This can fail if the interrupt hasn't been initialized yet + * but it should also be masked, so fail silently + */ + if (!end_x) + goto pick_default; + end = xive_get_end(end_x, end_idx); + if (!end) + goto pick_default; + + /* XXX Check valid and format 0 */ + + /* No priority conversion, return the actual one ! */ + if (xive_get_field64(EAS_MASKED, eas->w)) + prio = 0xff; + else + prio = xive_get_field32(END_W7_F0_PRIORITY, end->w7); + if (out_prio) + *out_prio = prio; + + vp_blk = xive_get_field32(END_W6_VP_BLOCK, end->w6); + vp_idx = xive_get_field32(END_W6_VP_OFFSET, end->w6); + server = VP2PIR(vp_blk, vp_idx); + + if (out_target) + *out_target = server; + + xive_vdbg(end_x, "END info for ISN %x: prio=%d, server=0x%x (VP %x/%x)\n", + isn, prio, server, vp_blk, vp_idx); + return true; + +pick_default: + xive_vdbg(end_x, "END info for ISN %x: Using masked defaults\n", isn); + + if (out_prio) + *out_prio = 0xff; + /* Pick a random default, me will be fine ... 
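+ * The current thread (SPR_PIR) is as good a target as any, since the + * priority reported above is 0xff, i.e. masked.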
*/ + if (out_target) + *out_target = mfspr(SPR_PIR); + return true; +} + +static inline bool xive_end_for_target(uint32_t target, uint8_t prio, + uint32_t *out_end_blk, + uint32_t *out_end_idx) +{ + struct xive *x; + struct xive_nvp *vp; + uint32_t vp_blk, vp_idx; + uint32_t end_blk, end_idx; + + if (prio > xive_max_prio(one_xive)) + return false; + + /* Get the VP block/index from the target word */ + if (!xive_decode_vp(target, &vp_blk, &vp_idx, NULL, NULL)) + return false; + + /* Grab the target VP's XIVE */ + x = xive_from_pc_blk(vp_blk); + if (!x) + return false; + + /* Find the VP structrure where we stashed the END number */ + vp = xive_get_vp(x, vp_idx); + if (!vp) + return false; + + end_blk = xive_get_field32(NVP_W5_VP_END_BLOCK, vp->w5); + end_idx = xive_get_field32(NVP_W5_VP_END_INDEX, vp->w5); + + /* Currently the END block and VP block should be the same */ + if (end_blk != vp_blk) { + xive_err(x, "end_blk != vp_blk (%d vs. %d) for target 0x%08x/%d\n", + end_blk, vp_blk, target, prio); + assert(false); + } + + if (out_end_blk) + *out_end_blk = end_blk; + if (out_end_idx) + *out_end_idx = end_idx + prio; + + return true; +} + +static int64_t xive_set_irq_targetting(uint32_t isn, uint32_t target, + uint8_t prio, uint32_t lirq, + bool synchronous) +{ + struct xive *x; + struct xive_eas *eas, new_eas; + uint32_t end_blk, end_idx; + bool is_escalation = GIRQ_IS_ESCALATION(isn); + int64_t rc; + + /* Find XIVE on which the EAS resides */ + x = xive_from_isn(isn); + if (!x) + return OPAL_PARAMETER; + /* Grab the EAS */ + eas = xive_get_eas(x, isn); + if (!eas) + return OPAL_PARAMETER; + if (!xive_get_field64(EAS_VALID, eas->w) && !is_escalation) { + xive_err(x, "ISN %x lead to invalid EAS !\n", isn); + return OPAL_PARAMETER; + } + + lock(&x->lock); + + /* Read existing EAS */ + new_eas = *eas; + + /* Are we masking ? */ + if (prio == 0xff && !is_escalation) { + new_eas.w = xive_set_field64(EAS_MASKED, new_eas.w, 1); + xive_vdbg(x, "ISN %x masked !\n", isn); + + /* Put prio 7 in the END */ + prio = xive_max_prio(x); + } else { + /* Unmasking */ + new_eas.w = xive_set_field64(EAS_MASKED, new_eas.w, 0); + xive_vdbg(x, "ISN %x unmasked !\n", isn); + + /* For normal interrupt sources, keep track of which ones + * we ever enabled since the last reset + */ + if (!is_escalation) + bitmap_set_bit(*x->int_enabled_map, GIRQ_TO_IDX(isn)); + } + + /* If prio isn't 0xff, re-target the EAS. 
First find the END + * correponding to the target + */ + if (prio != 0xff) { + if (!xive_end_for_target(target, prio, &end_blk, &end_idx)) { + xive_err(x, "Can't find END for target/prio 0x%x/%d\n", + target, prio); + unlock(&x->lock); + return OPAL_PARAMETER; + } + + /* Try to update it atomically to avoid an intermediary + * stale state + */ + new_eas.w = xive_set_field64(EAS_END_BLOCK, new_eas.w, end_blk); + new_eas.w = xive_set_field64(EAS_END_INDEX, new_eas.w, end_idx); + } + new_eas.w = xive_set_field64(EAS_END_DATA, new_eas.w, lirq); + + xive_vdbg(x,"ISN %x routed to end %x/%x lirq=%08x EAS=%016llx !\n", + isn, end_blk, end_idx, lirq, new_eas.w); + + /* Updating the cache differs between real EAS and escalation + * EAS inside an END + */ + if (is_escalation) { + rc = xive_escalation_ive_cache_update(x, x->block_id, + GIRQ_TO_IDX(isn), &new_eas, synchronous); + } else { + sync(); + *eas = new_eas; + rc = xive_easc_scrub(x, x->block_id, GIRQ_TO_IDX(isn)); + } + + unlock(&x->lock); + return rc; +} + +static void xive_update_irq_mask(struct xive_src *s, uint32_t idx, bool masked) +{ + void *mmio_base = s->esb_mmio + (1ul << s->esb_shift) * idx; + uint32_t offset; + + /* XXX FIXME: A quick mask/umask can make us shoot an interrupt + * more than once to a queue. We need to keep track better + */ + if (s->flags & XIVE_SRC_EOI_PAGE1) + mmio_base += 1ull << (s->esb_shift - 1); + if (masked) + offset = XIVE_ESB_SET_PQ_01; + else + offset = XIVE_ESB_SET_PQ_00; + + in_be64(mmio_base + offset); +} + +#define XIVE_SYNC_IPI 0x000 +#define XIVE_SYNC_HW 0x080 +#define XIVE_SYNC_NxC 0x100 +#define XIVE_SYNC_INT 0x180 +#define XIVE_SYNC_OS_ESC 0x200 +#define XIVE_SYNC_POOL_ESC 0x280 +#define XIVE_SYNC_HARD_ESC 0x300 + +static int64_t xive_sync(struct xive *x __unused) +{ + uint64_t r; + void *sync_base; + + lock(&x->lock); + + sync_base = x->ic_base + (XIVE_SYNC_POLL_PGOFF << x->ic_shift); + + out_be64(sync_base + XIVE_SYNC_IPI, 0); + out_be64(sync_base + XIVE_SYNC_HW, 0); + out_be64(sync_base + XIVE_SYNC_NxC, 0); + out_be64(sync_base + XIVE_SYNC_INT, 0); + out_be64(sync_base + XIVE_SYNC_OS_ESC, 0); + out_be64(sync_base + XIVE_SYNC_POOL_ESC, 0); + out_be64(sync_base + XIVE_SYNC_HARD_ESC, 0); + + /* XXX Add timeout */ + for (;;) { + r = xive_regr(x, VC_ENDC_SYNC_DONE); + if ((r & VC_ENDC_SYNC_POLL_DONE) == VC_ENDC_SYNC_POLL_DONE) + break; + cpu_relax(); + } + xive_regw(x, VC_ENDC_SYNC_DONE, r & ~VC_ENDC_SYNC_POLL_DONE); + + /* + * Do a read after clearing the sync done bit to prevent any + * race between CI write and next sync command + */ + xive_regr(x, VC_ENDC_SYNC_DONE); + + unlock(&x->lock); + return 0; +} + +static int64_t __xive_set_irq_config(struct irq_source *is, uint32_t girq, + uint64_t vp, uint8_t prio, uint32_t lirq, + bool update_esb, bool sync) +{ + struct xive_src *s = container_of(is, struct xive_src, is); + uint32_t old_target, vp_blk; + u8 old_prio; + int64_t rc; + + /* Grab existing target */ + if (!xive_get_irq_targetting(girq, &old_target, &old_prio, NULL)) + return OPAL_PARAMETER; + + /* Let XIVE configure the END. We do the update without the + * synchronous flag, thus a cache update failure will result + * in us returning OPAL_BUSY + */ + rc = xive_set_irq_targetting(girq, vp, prio, lirq, false); + if (rc) + return rc; + + /* Do we need to update the mask ? 
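+ * Only a transition to or from the masked priority 0xff needs to touch + * the source ESB; moving between two valid priorities leaves the PQ + * bits alone.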
*/ + if (old_prio != prio && (old_prio == 0xff || prio == 0xff)) { + /* The source has special variants of masking/unmasking */ + if (update_esb) { + /* Ensure it's enabled/disabled in the source + * controller + */ + xive_update_irq_mask(s, girq - s->esb_base, + prio == 0xff); + } + } + + /* + * Synchronize the source and old target XIVEs to ensure that + * all pending interrupts to the old target have reached their + * respective queue. + * + * WARNING: This assumes the VP and it's queues are on the same + * XIVE instance ! + */ + if (!sync) + return OPAL_SUCCESS; + xive_sync(s->xive); + if (xive_decode_vp(old_target, &vp_blk, NULL, NULL, NULL)) { + struct xive *x = xive_from_pc_blk(vp_blk); + if (x) + xive_sync(x); + } + + return OPAL_SUCCESS; +} + +static int64_t xive_set_irq_config(uint32_t girq, uint64_t vp, uint8_t prio, + uint32_t lirq, bool update_esb) +{ + struct irq_source *is = irq_find_source(girq); + + return __xive_set_irq_config(is, girq, vp, prio, lirq, update_esb, + true); +} + +static void xive_source_interrupt(struct irq_source *is, uint32_t isn) +{ + struct xive_src *s = container_of(is, struct xive_src, is); + + if (!s->orig_ops || !s->orig_ops->interrupt) + return; + s->orig_ops->interrupt(is, isn); +} + +static uint64_t xive_source_attributes(struct irq_source *is, uint32_t isn) +{ + struct xive_src *s = container_of(is, struct xive_src, is); + + if (!s->orig_ops || !s->orig_ops->attributes) + return IRQ_ATTR_TARGET_LINUX; + return s->orig_ops->attributes(is, isn); +} + +static char *xive_source_name(struct irq_source *is, uint32_t isn) +{ + struct xive_src *s = container_of(is, struct xive_src, is); + + if (!s->orig_ops || !s->orig_ops->name) + return NULL; + return s->orig_ops->name(is, isn); +} + +void xive2_source_mask(struct irq_source *is, uint32_t isn) +{ + struct xive_src *s = container_of(is, struct xive_src, is); + + xive_update_irq_mask(s, isn - s->esb_base, true); +} + +static const struct irq_source_ops xive_irq_source_ops = { + .interrupt = xive_source_interrupt, + .attributes = xive_source_attributes, + .name = xive_source_name, +}; + +static void __xive_register_source(struct xive *x, struct xive_src *s, + uint32_t base, uint32_t count, + uint32_t shift, void *mmio, uint32_t flags, + bool secondary, void *data, + const struct irq_source_ops *orig_ops) +{ + s->esb_base = base; + s->esb_shift = shift; + s->esb_mmio = mmio; + s->flags = flags; + s->orig_ops = orig_ops; + s->xive = x; + s->is.start = base; + s->is.end = base + count; + s->is.ops = &xive_irq_source_ops; + s->is.data = data; + + __register_irq_source(&s->is, secondary); +} + +void xive2_register_hw_source(uint32_t base, uint32_t count, uint32_t shift, + void *mmio, uint32_t flags, void *data, + const struct irq_source_ops *ops) +{ + struct xive_src *s; + struct xive *x = xive_from_isn(base); + + assert(x); + + s = malloc(sizeof(struct xive_src)); + assert(s); + __xive_register_source(x, s, base, count, shift, mmio, flags, + false, data, ops); +} + +static void __xive2_register_esb_source(uint32_t base, uint32_t count, + void *data, const struct irq_source_ops *ops) +{ + struct xive_src *s; + struct xive *x = xive_from_isn(base); + uint32_t base_idx = GIRQ_TO_IDX(base); + void *mmio_base; + uint32_t flags = XIVE_SRC_EOI_PAGE1 | XIVE_SRC_TRIGGER_PAGE; + + assert(x); + + s = malloc(sizeof(struct xive_src)); + assert(s); + + if (XIVE_CAN_STORE_EOI(x)) + flags |= XIVE_SRC_STORE_EOI; + + /* Callbacks assume the MMIO base corresponds to the first + * interrupt of that source structure so adjust 
it + */ + mmio_base = x->esb_base + (1ul << XIVE_ESB_SHIFT) * base_idx; + __xive_register_source(x, s, base, count, XIVE_ESB_SHIFT, mmio_base, + flags, false, data, ops); +} + +/* + * Check that IPI sources have interrupt numbers in the IPI interrupt + * number range + */ +void xive2_register_ipi_source(uint32_t base, uint32_t count, void *data, + const struct irq_source_ops *ops) +{ + struct xive *x = xive_from_isn(base); + + assert(x); + assert(base >= x->int_base && (base + count) <= x->int_ipi_top); + + __xive2_register_esb_source(base, count, data, ops); +} + +/* + * Some HW sources (PHB) can disable the use of their own ESB pages + * and offload all the checks on ESB pages of the IC. The interrupt + * numbers are not necessarily in the IPI range. + */ +void xive2_register_esb_source(uint32_t base, uint32_t count) +{ + __xive2_register_esb_source(base, count, NULL, NULL); +} + +uint64_t xive2_get_esb_base(uint32_t base) +{ + struct xive *x = xive_from_isn(base); + uint32_t base_idx = GIRQ_TO_IDX(base); + + assert(x); + + return (uint64_t) x->esb_base + (1ul << XIVE_ESB_SHIFT) * base_idx; +} + +static void xive_set_quirks(struct xive *x, struct proc_chip *chip __unused) +{ + uint64_t quirks = 0; + + /* This extension is dropped for P10 */ + if (proc_gen == proc_gen_p10) + quirks |= XIVE_QUIRK_THREADID_7BITS; + + /* Broken check on invalid priority when reduced priorities is in use */ + if (proc_gen == proc_gen_p10) + quirks |= XIVE_QUIRK_BROKEN_PRIO_CHECK; + + xive_dbg(x, "setting XIVE quirks to %016llx\n", quirks); + x->quirks = quirks; +} + +static struct xive *init_one_xive(struct dt_node *np) +{ + struct xive *x; + struct proc_chip *chip; + uint32_t flags; + + x = zalloc(sizeof(struct xive)); + assert(x); + x->x_node = np; + x->xscom_base = dt_get_address(np, 0, NULL); + x->chip_id = dt_get_chip_id(np); + + /* "Allocate" a new block ID for the chip */ + x->block_id = xive_block_count++; + assert (x->block_id < XIVE_MAX_CHIPS); + xive_block_to_chip[x->block_id] = x->chip_id; + init_lock(&x->lock); + + chip = get_chip(x->chip_id); + assert(chip); + + xive_notice(x, "Initializing XIVE block ID %d...\n", x->block_id); + chip->xive = x; + + xive_set_quirks(x, chip); + + list_head_init(&x->donated_pages); + + /* Base interrupt numbers and allocator init */ + + x->int_base = BLKIDX_TO_GIRQ(x->block_id, 0); + x->int_count = x->int_base + XIVE_INT_COUNT; + x->int_hw_bot = x->int_count; + x->int_ipi_top = x->int_base; + + if (x->int_ipi_top < XIVE_INT_FIRST) + x->int_ipi_top = XIVE_INT_FIRST; + + /* Allocate a few bitmaps */ + x->end_map = local_alloc(x->chip_id, BITMAP_BYTES(xive_end_bitmap_size(x)), PAGE_SIZE); + assert(x->end_map); + memset(x->end_map, 0, BITMAP_BYTES(xive_end_bitmap_size(x))); + + /* + * Allocate END index 0 to make sure it can not be used as an + * END base for a VP. This is the criteria to know if a VP was + * allocated. + */ + bitmap_set_bit(*x->end_map, 0); + + x->int_enabled_map = local_alloc(x->chip_id, BITMAP_BYTES(XIVE_INT_COUNT), PAGE_SIZE); + assert(x->int_enabled_map); + memset(x->int_enabled_map, 0, BITMAP_BYTES(XIVE_INT_COUNT)); + x->ipi_alloc_map = local_alloc(x->chip_id, BITMAP_BYTES(XIVE_INT_COUNT), PAGE_SIZE); + assert(x->ipi_alloc_map); + memset(x->ipi_alloc_map, 0, BITMAP_BYTES(XIVE_INT_COUNT)); + + xive_dbg(x, "Handling interrupts [%08x..%08x]\n", + x->int_base, x->int_count - 1); + + /* Setup the IC BARs */ + if (!xive_configure_ic_bars(x)) + goto fail; + + /* Some basic global inits such as page sizes etc... 
*/ + if (!xive_config_init(x)) + goto fail; + + /* Configure the set translations for MMIO */ + if (!xive_setup_set_xlate(x)) + goto fail; + + /* Dump some MMIO registers for diagnostics */ + xive_dump_mmio(x); + + /* Pre-allocate a number of tables */ + if (!xive_prealloc_tables(x)) + goto fail; + + /* Setup the XIVE structures BARs */ + if (!xive_configure_bars(x)) + goto fail; + + /* + * Configure local tables in VSDs (forward ports will be + * handled later) + */ + if (!xive_set_local_tables(x)) + goto fail; + + /* Register built-in source controllers (aka IPIs) */ + flags = XIVE_SRC_EOI_PAGE1 | XIVE_SRC_TRIGGER_PAGE; + if (XIVE_CAN_STORE_EOI(x)) + flags |= XIVE_SRC_STORE_EOI; + __xive_register_source(x, &x->ipis, x->int_base, + x->int_hw_bot - x->int_base, XIVE_ESB_SHIFT, + x->esb_base, flags, true, NULL, NULL); + + /* Register escalation sources (ENDs) + * + * The ESe PQ bits are used for coalescing and the END ESB for + * interrupt management. The word 4&5 of the END is the EAS + * for the escalation source and the indexing is the same as + * the END. + * + * This is an OPAL primary source, IPIs are secondary. + */ + __xive_register_source(x, &x->esc_irqs, + MAKE_ESCALATION_GIRQ(x->block_id, 0), + XIVE_END_COUNT, XIVE_END_SHIFT, + x->end_base, XIVE_SRC_EOI_PAGE1, + false, NULL, NULL); + + + return x; + fail: + xive_err(x, "Initialization failed...\n"); + + /* Should this be fatal ? */ + //assert(false); + return NULL; +} + +static void xive_reset_enable_thread(struct cpu_thread *c) +{ + struct proc_chip *chip = get_chip(c->chip_id); + struct xive *x = chip->xive; + uint32_t fc, bit; + uint64_t enable; + + /* Get fused core number */ + fc = (c->pir >> 3) & 0xf; + + /* Get bit in register */ + bit = c->pir & 0x3f; + + /* Get which register to access */ + if (fc < 8) { + xive_regw(x, TCTXT_EN0_RESET, PPC_BIT(bit)); + xive_regw(x, TCTXT_EN0_SET, PPC_BIT(bit)); + + enable = xive_regr(x, TCTXT_EN0); + if (!(enable & PPC_BIT(bit))) + xive_cpu_err(c, "Failed to enable thread\n"); + } else { + xive_regw(x, TCTXT_EN1_RESET, PPC_BIT(bit)); + xive_regw(x, TCTXT_EN1_SET, PPC_BIT(bit)); + + enable = xive_regr(x, TCTXT_EN1); + if (!(enable & PPC_BIT(bit))) + xive_cpu_err(c, "Failed to enable thread\n"); + } +} + +void xive2_cpu_callin(struct cpu_thread *cpu) +{ + struct xive_cpu_state *xs = cpu->xstate; + uint8_t old_w2 __unused, w2 __unused; + + if (!xs) + return; + + /* Reset the HW thread context and enable it */ + xive_reset_enable_thread(cpu); + + /* Set VT to 1 */ + old_w2 = in_8(xs->tm_ring1 + TM_QW3_HV_PHYS + TM_WORD2); + out_8(xs->tm_ring1 + TM_QW3_HV_PHYS + TM_WORD2, 0x80); + w2 = in_8(xs->tm_ring1 + TM_QW3_HV_PHYS + TM_WORD2); + + xive_cpu_vdbg(cpu, "Initialized TIMA VP=%x/%x W01=%016llx W2=%02x->%02x\n", + xs->vp_blk, xs->vp_idx, + in_be64(xs->tm_ring1 + TM_QW3_HV_PHYS), + old_w2, w2); +} + +#ifdef XIVE_EXTRA_CHECK_INIT_CACHE +#define CHECK_INIT_CACHE_LOOP 0x100 +static void xive_special_cache_check(struct xive *x, uint32_t blk, uint32_t idx) +{ + struct xive_nvp vp = {0}; + uint32_t i; + + /* + * SIMICS checks the value of reserved fields + */ + if (chip_quirk(QUIRK_SIMICS)) + return; + + for (i = 0; i < CHECK_INIT_CACHE_LOOP; i++) { + struct xive_nvp *vp_m = xive_get_vp(x, idx); + + memset(vp_m, (~i) & 0xff, sizeof(*vp_m)); + sync(); + vp.w1 = (i << 16) | i; + assert(!xive_nxc_cache_update(x, blk, idx, &vp, true)); + if (!xive_check_nxc_update(x, idx, &vp)) { + xive_dbg(x, "NXC update test failed at %d iterations\n", i); + return; + } + } + xive_dbg(x, "NXC update test passed for 
%d/0x%x\n", blk, idx); +} +#else +static inline void xive_special_cache_check(struct xive *x __unused, + uint32_t blk __unused, + uint32_t idx __unused) +{ +} +#endif + +static void xive_init_cpu_exploitation(struct xive_cpu_state *xs) +{ + struct xive_end end; + struct xive_nvp vp; + struct xive *x_vp, *x_end; + int i; + + /* Grab the XIVE where the VP resides. It could be different from + * the local chip XIVE if not using block group mode + */ + x_vp = xive_from_pc_blk(xs->vp_blk); + assert(x_vp); + + /* Grab the XIVE where the END resides. It should be the same + * as the VP. + */ + x_end = xive_from_vc_blk(xs->end_blk); + assert(x_end); + + xive_init_hw_end(&end); + + /* Use the cache watch to update all ENDs reserved for HW VPs */ + lock(&x_end->lock); + for (i = 0; i < xive_cfg_vp_prio(x_end); i++) + xive_endc_cache_update(x_end, xs->end_blk, xs->end_idx + i, + &end, true); + unlock(&x_end->lock); + + /* Initialize/enable the VP */ + xive_init_default_vp(&vp, xs->end_blk, xs->end_idx); + + /* Use the cache watch to write it out */ + lock(&x_vp->lock); + xive_special_cache_check(x_vp, xs->vp_blk, xs->vp_idx); + xive_nxc_cache_update(x_vp, xs->vp_blk, xs->vp_idx, &vp, true); + unlock(&x_vp->lock); +} + +static void xive_configure_ex_special_bar(struct xive *x, struct cpu_thread *c) +{ + uint64_t xa, val; + int64_t rc; + + xive_cpu_vdbg(c, "Setting up special BAR\n"); + xa = XSCOM_ADDR_P10_NCU(pir_to_core_id(c->pir), P10_NCU_SPEC_BAR); + val = (uint64_t)x->tm_base | P10_NCU_SPEC_BAR_ENABLE; + if (x->tm_shift == 16) + val |= P10_NCU_SPEC_BAR_256K; + xive_cpu_vdbg(c, "NCU_SPEC_BAR_XA[%08llx]=%016llx\n", xa, val); + rc = xscom_write(c->chip_id, xa, val); + if (rc) { + xive_cpu_err(c, "Failed to setup NCU_SPEC_BAR\n"); + /* XXXX what do do now ? */ + } +} + +void xive2_late_init(void) +{ + struct cpu_thread *c; + + prlog(PR_INFO, "SLW: Configuring self-restore for NCU_SPEC_BAR\n"); + for_each_present_cpu(c) { + if(cpu_is_thread0(c)) { + struct proc_chip *chip = get_chip(c->chip_id); + struct xive *x = chip->xive; + uint64_t xa, val, rc; + xa = XSCOM_ADDR_P10_NCU(pir_to_core_id(c->pir), P10_NCU_SPEC_BAR); + val = (uint64_t)x->tm_base | P10_NCU_SPEC_BAR_ENABLE; + /* Bail out if wakeup engine has already failed */ + if (wakeup_engine_state != WAKEUP_ENGINE_PRESENT) { + prlog(PR_ERR, "XIVE proc_stop_api fail detected\n"); + break; + } + rc = proc_stop_save_scom((void *)chip->homer_base, xa, val, + PROC_STOP_SCOM_REPLACE, PROC_STOP_SECTION_L3); + if (rc) { + xive_cpu_err(c, "proc_stop_save_scom failed for NCU_SPEC_BAR rc=%lld\n", + rc); + wakeup_engine_state = WAKEUP_ENGINE_FAILED; + } + } + } +} + +static void xive_provision_cpu(struct xive_cpu_state *xs, struct cpu_thread *c) +{ + struct xive *x; + + /* VP ids for HW threads are pre-allocated */ + xs->vp_blk = PIR2VP_BLK(c->pir); + xs->vp_idx = PIR2VP_IDX(c->pir); + + /* For now we use identical block IDs for VC and PC but that might + * change. We allocate the ENDs on the same XIVE as the VP. + */ + xs->end_blk = xs->vp_blk; + + /* Grab the XIVE where the END resides. 
It could be different from + * the local chip XIVE if not using block group mode + */ + x = xive_from_vc_blk(xs->end_blk); + assert(x); + + /* Allocate a set of ENDs for that VP */ + xs->end_idx = xive_alloc_end_set(x, true); + assert(!XIVE_ALLOC_IS_ERR(xs->end_idx)); +} + +static void xive_init_cpu(struct cpu_thread *c) +{ + struct proc_chip *chip = get_chip(c->chip_id); + struct xive *x = chip->xive; + struct xive_cpu_state *xs; + + if (!x) + return; + + /* + * Each core pair (EX) needs this special BAR setup to have the + * right powerbus cycle for the TM area (as it has the same address + * on all chips so it's somewhat special). + * + * Because we don't want to bother trying to figure out which core + * of a pair is present we just do the setup for each of them, which + * is harmless. + */ + if (cpu_is_thread0(c) || cpu_is_core_chiplet_primary(c)) + xive_configure_ex_special_bar(x, c); + + /* Initialize the state structure */ + c->xstate = xs = local_alloc(c->chip_id, sizeof(struct xive_cpu_state), 1); + assert(xs); + memset(xs, 0, sizeof(struct xive_cpu_state)); + xs->xive = x; + + init_lock(&xs->lock); + + /* Shortcut to TM HV ring */ + xs->tm_ring1 = x->tm_base + (1u << x->tm_shift); + + /* Provision a VP id and some ENDs for a HW thread */ + xive_provision_cpu(xs, c); + + xive_init_cpu_exploitation(xs); +} + +static uint64_t xive_convert_irq_flags(uint64_t iflags) +{ + uint64_t oflags = 0; + + if (iflags & XIVE_SRC_STORE_EOI) + oflags |= OPAL_XIVE_IRQ_STORE_EOI2; + + /* OPAL_XIVE_IRQ_TRIGGER_PAGE is only meant to be set if + * the interrupt has a *separate* trigger page. + */ + if ((iflags & XIVE_SRC_EOI_PAGE1) && + (iflags & XIVE_SRC_TRIGGER_PAGE)) + oflags |= OPAL_XIVE_IRQ_TRIGGER_PAGE; + + if (iflags & XIVE_SRC_LSI) + oflags |= OPAL_XIVE_IRQ_LSI; + + return oflags; +} + +static int64_t opal_xive_get_irq_info(uint32_t girq, + beint64_t *out_flags, + beint64_t *out_eoi_page, + beint64_t *out_trig_page, + beint32_t *out_esb_shift, + beint32_t *out_src_chip) +{ + struct irq_source *is = irq_find_source(girq); + struct xive_src *s = container_of(is, struct xive_src, is); + uint32_t idx; + uint64_t mm_base; + uint64_t eoi_page = 0, trig_page = 0; + + if (xive_mode != XIVE_MODE_EXPL) + return OPAL_WRONG_STATE; + if (is == NULL || out_flags == NULL) + return OPAL_PARAMETER; + assert(is->ops == &xive_irq_source_ops); + + if (out_flags) + *out_flags = cpu_to_be64(xive_convert_irq_flags(s->flags)); + + idx = girq - s->esb_base; + + if (out_esb_shift) + *out_esb_shift = cpu_to_be32(s->esb_shift); + + mm_base = (uint64_t)s->esb_mmio + (1ull << s->esb_shift) * idx; + + /* The EOI page can either be the first or second page */ + if (s->flags & XIVE_SRC_EOI_PAGE1) { + uint64_t p1off = 1ull << (s->esb_shift - 1); + eoi_page = mm_base + p1off; + } else + eoi_page = mm_base; + + /* The trigger page, if it exists, is always the first page */ + if (s->flags & XIVE_SRC_TRIGGER_PAGE) + trig_page = mm_base; + + if (out_eoi_page) + *out_eoi_page = cpu_to_be64(eoi_page); + if (out_trig_page) + *out_trig_page = cpu_to_be64(trig_page); + if (out_src_chip) + *out_src_chip = cpu_to_be32(GIRQ_TO_CHIP(girq)); + + return OPAL_SUCCESS; +} + +static int64_t opal_xive_get_irq_config(uint32_t girq, + beint64_t *out_vp, + uint8_t *out_prio, + beint32_t *out_lirq) +{ + uint32_t vp; + uint32_t lirq; + uint8_t prio; + + if (xive_mode != XIVE_MODE_EXPL) + return OPAL_WRONG_STATE; + + if (xive_get_irq_targetting(girq, &vp, &prio, &lirq)) { + *out_vp = cpu_to_be64(vp); + *out_prio = prio; + *out_lirq = cpu_to_be32(lirq); 
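+		/*
+		 * Out parameters of OPAL calls are stored big-endian
+		 * (beint64_t/beint32_t), hence the cpu_to_be*()
+		 * conversions above.
+		 */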
+ return OPAL_SUCCESS; + } else + return OPAL_PARAMETER; +} + +static int64_t opal_xive_set_irq_config(uint32_t girq, + uint64_t vp, + uint8_t prio, + uint32_t lirq) +{ + /* + * This variant is meant for a XIVE-aware OS, thus it will + * *not* affect the ESB state of the interrupt. If used with + * a prio of FF, the EAS will be masked. In that case the + * races have to be handled by the OS. + */ + if (xive_mode != XIVE_MODE_EXPL) + return OPAL_WRONG_STATE; + + return xive_set_irq_config(girq, vp, prio, lirq, false); +} + +static int64_t opal_xive_get_queue_info(uint64_t vp, uint32_t prio, + beint64_t *out_qpage, + beint64_t *out_qsize, + beint64_t *out_qeoi_page, + beint32_t *out_escalate_irq, + beint64_t *out_qflags) +{ + uint32_t blk, idx; + struct xive *x; + struct xive_end *end; + + if (xive_mode != XIVE_MODE_EXPL) + return OPAL_WRONG_STATE; + + if (!xive_end_for_target(vp, prio, &blk, &idx)) + return OPAL_PARAMETER; + + x = xive_from_vc_blk(blk); + if (!x) + return OPAL_PARAMETER; + + end = xive_get_end(x, idx); + if (!end) + return OPAL_PARAMETER; + + if (out_escalate_irq) { + uint32_t esc_idx = idx; + + /* If escalations are routed to a single queue, fix up + * the escalation interrupt number here. + */ + if (xive_get_field32(END_W0_UNCOND_ESCALATE, end->w0)) + esc_idx |= xive_escalation_prio(x); + *out_escalate_irq = + cpu_to_be32(MAKE_ESCALATION_GIRQ(blk, esc_idx)); + } + + /* If this is a single-escalation gather queue, that's all + * there is to return + */ + if (xive_get_field32(END_W0_SILENT_ESCALATE, end->w0)) { + if (out_qflags) + *out_qflags = 0; + if (out_qpage) + *out_qpage = 0; + if (out_qsize) + *out_qsize = 0; + if (out_qeoi_page) + *out_qeoi_page = 0; + return OPAL_SUCCESS; + } + + if (out_qpage) { + if (xive_get_field32(END_W0_ENQUEUE, end->w0)) + *out_qpage = cpu_to_be64( + ((uint64_t)xive_get_field32(END_W2_EQ_ADDR_HI, end->w2) << 32) | + xive_get_field32(END_W3_EQ_ADDR_LO, end->w3)); + else + *out_qpage = 0; + } + if (out_qsize) { + if (xive_get_field32(END_W0_ENQUEUE, end->w0)) + *out_qsize = cpu_to_be64(xive_get_field32(END_W3_QSIZE, end->w3) + 12); + else + *out_qsize = 0; + } + if (out_qeoi_page) { + *out_qeoi_page = cpu_to_be64( + (uint64_t)x->end_base + idx * XIVE_ESB_PAGE_SIZE); + } + if (out_qflags) { + *out_qflags = 0; + if (xive_get_field32(END_W0_VALID, end->w0)) + *out_qflags |= cpu_to_be64(OPAL_XIVE_EQ_ENABLED); + if (xive_get_field32(END_W0_UCOND_NOTIFY, end->w0)) + *out_qflags |= cpu_to_be64(OPAL_XIVE_EQ_ALWAYS_NOTIFY); + if (xive_get_field32(END_W0_ESCALATE_CTL, end->w0)) + *out_qflags |= cpu_to_be64(OPAL_XIVE_EQ_ESCALATE); + } + + return OPAL_SUCCESS; +} + +static void xive_cleanup_end(struct xive_end *end) +{ + end->w0 = xive_set_field32(END_W0_FIRMWARE1, 0, xive_end_is_firmware1(end)); + end->w1 = xive_set_field32(END_W1_ESe_Q, 0, 1) | + xive_set_field32(END_W1_ESn_Q, 0, 1); + end->w2 = end->w3 = end->w4 = end->w5 = end->w6 = end->w7 = 0; +} + +static int64_t opal_xive_set_queue_info(uint64_t vp, uint32_t prio, + uint64_t qpage, + uint64_t qsize, + uint64_t qflags) +{ + uint32_t blk, idx; + struct xive *x; + struct xive_end *old_end; + struct xive_end end; + uint32_t vp_blk, vp_idx; + bool group; + int64_t rc; + + if (!xive_end_for_target(vp, prio, &blk, &idx)) + return OPAL_PARAMETER; + + x = xive_from_vc_blk(blk); + if (!x) + return OPAL_PARAMETER; + + old_end = xive_get_end(x, idx); + if (!old_end) + return OPAL_PARAMETER; + + /* If this is a silent escalation queue, it cannot be + * configured directly + */ + if 
(xive_get_field32(END_W0_SILENT_ESCALATE, old_end->w0)) + return OPAL_PARAMETER; + + /* This shouldn't fail or xive_end_for_target would have + * failed already + */ + if (!xive_decode_vp(vp, &vp_blk, &vp_idx, NULL, &group)) + return OPAL_PARAMETER; + + /* + * Make a local copy which we will later try to commit using + * the cache watch facility + */ + end = *old_end; + + if (qflags & OPAL_XIVE_EQ_ENABLED) { + switch(qsize) { + /* Supported sizes */ + case 12: + case 16: + case 21: + case 24: + end.w3 = cpu_to_be32(qpage & END_W3_EQ_ADDR_LO); + end.w2 = cpu_to_be32((qpage >> 32) & END_W2_EQ_ADDR_HI); + end.w3 = xive_set_field32(END_W3_QSIZE, end.w3, qsize - 12); + end.w0 = xive_set_field32(END_W0_ENQUEUE, end.w0, 1); + break; + case 0: + end.w2 = end.w3 = 0; + end.w0 = xive_set_field32(END_W0_ENQUEUE, end.w0, 0); + break; + default: + return OPAL_PARAMETER; + } + + /* Ensure the priority and target are correctly set (they will + * not be right after allocation + */ + end.w6 = xive_set_field32(END_W6_VP_BLOCK, 0, vp_blk) | + xive_set_field32(END_W6_VP_OFFSET, 0, vp_idx); + end.w7 = xive_set_field32(END_W7_F0_PRIORITY, 0, prio); + /* XXX Handle group i bit when needed */ + + /* Always notify flag */ + if (qflags & OPAL_XIVE_EQ_ALWAYS_NOTIFY) + end.w0 = xive_set_field32(END_W0_UCOND_NOTIFY, end.w0, 1); + else + end.w0 = xive_set_field32(END_W0_UCOND_NOTIFY, end.w0, 0); + + /* Escalation flag */ + if (qflags & OPAL_XIVE_EQ_ESCALATE) + end.w0 = xive_set_field32(END_W0_ESCALATE_CTL, end.w0, 1); + else + end.w0 = xive_set_field32(END_W0_ESCALATE_CTL, end.w0, 0); + + /* Unconditionally clear the current queue pointer, set + * generation to 1 and disable escalation interrupts. + */ + end.w1 = xive_set_field32(END_W1_GENERATION, 0, 1) | + xive_set_field32(END_W1_ES, 0, xive_get_field32(END_W1_ES, old_end->w1)); + + /* Enable. We always enable backlog for an enabled queue + * otherwise escalations won't work. 
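+	 * (presumably because events that cannot be presented right
+	 * away are what end up in the backlog, and the escalation
+	 * logic acts on that state)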
+ */ + end.w0 = xive_set_field32(END_W0_VALID, end.w0, 1); + end.w0 = xive_set_field32(END_W0_BACKLOG, end.w0, 1); + } else + xive_cleanup_end(&end); + + /* Update END, non-synchronous */ + lock(&x->lock); + rc = xive_endc_cache_update(x, blk, idx, &end, false); + unlock(&x->lock); + + return rc; +} + +static int64_t opal_xive_get_queue_state(uint64_t vp, uint32_t prio, + beint32_t *out_qtoggle, + beint32_t *out_qindex) +{ + uint32_t blk, idx; + struct xive *x; + struct xive_end *end; + int64_t rc; + + if (xive_mode != XIVE_MODE_EXPL) + return OPAL_WRONG_STATE; + + if (!out_qtoggle || !out_qindex || + !xive_end_for_target(vp, prio, &blk, &idx)) + return OPAL_PARAMETER; + + x = xive_from_vc_blk(blk); + if (!x) + return OPAL_PARAMETER; + + end = xive_get_end(x, idx); + if (!end) + return OPAL_PARAMETER; + + /* Scrub the queue */ + lock(&x->lock); + rc = xive_endc_scrub(x, blk, idx); + unlock(&x->lock); + if (rc) + return rc; + + /* We don't do disable queues */ + if (!xive_get_field32(END_W0_VALID, end->w0)) + return OPAL_WRONG_STATE; + + *out_qtoggle = cpu_to_be32(xive_get_field32(END_W1_GENERATION, end->w1)); + *out_qindex = cpu_to_be32(xive_get_field32(END_W1_PAGE_OFF, end->w1)); + + return OPAL_SUCCESS; +} + +static int64_t opal_xive_set_queue_state(uint64_t vp, uint32_t prio, + uint32_t qtoggle, uint32_t qindex) +{ + uint32_t blk, idx; + struct xive *x; + struct xive_end *end, new_end; + int64_t rc; + + if (xive_mode != XIVE_MODE_EXPL) + return OPAL_WRONG_STATE; + + if (!xive_end_for_target(vp, prio, &blk, &idx)) + return OPAL_PARAMETER; + + x = xive_from_vc_blk(blk); + if (!x) + return OPAL_PARAMETER; + + end = xive_get_end(x, idx); + if (!end) + return OPAL_PARAMETER; + + /* We don't do disable queues */ + if (!xive_get_field32(END_W0_VALID, end->w0)) + return OPAL_WRONG_STATE; + + new_end = *end; + + new_end.w1 = xive_set_field32(END_W1_GENERATION, new_end.w1, qtoggle); + new_end.w1 = xive_set_field32(END_W1_PAGE_OFF, new_end.w1, qindex); + + lock(&x->lock); + rc = xive_endc_cache_update(x, blk, idx, &new_end, false); + unlock(&x->lock); + + return rc; +} + +static int64_t opal_xive_donate_page(uint32_t chip_id, uint64_t addr) +{ + struct proc_chip *c = get_chip(chip_id); + struct list_node *n; + + if (xive_mode != XIVE_MODE_EXPL) + return OPAL_WRONG_STATE; + if (!c) + return OPAL_PARAMETER; + if (!c->xive) + return OPAL_PARAMETER; + if (addr & 0xffff) + return OPAL_PARAMETER; + + n = (struct list_node *)addr; + lock(&c->xive->lock); + list_add(&c->xive->donated_pages, n); + unlock(&c->xive->lock); + return OPAL_SUCCESS; +} + +static int64_t opal_xive_get_vp_info(uint64_t vp_id, + beint64_t *out_flags, + beint64_t *out_cam_value, + beint64_t *out_report_cl_pair, + beint32_t *out_chip_id) +{ + struct xive *x; + struct xive_nvp *vp; + uint32_t blk, idx; + bool group; + + if (!xive_decode_vp(vp_id, &blk, &idx, NULL, &group)) + return OPAL_PARAMETER; + /* We don't do groups yet */ + if (group) + return OPAL_PARAMETER; + x = xive_from_pc_blk(blk); + if (!x) + return OPAL_PARAMETER; + vp = xive_get_vp(x, idx); + if (!vp) + return OPAL_PARAMETER; + + if (out_flags) { + uint32_t end_blk, end_idx; + struct xive_end *end; + struct xive *end_x; + *out_flags = 0; + + /* + * We would like to a way to stash a SW bit in the VP + * to know whether silent escalation is enabled or + * not, but unlike what happens with ENDs, the PC + * cache watch doesn't implement the reserved bit in + * the VPs... so we have to go look at END 7 instead. 
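+		 * (xive_setup_silent_gather() below is what sets the
+		 * "s" bit, END_W0_SILENT_ESCALATE, on the END at
+		 * xive_escalation_prio() when single escalation is
+		 * enabled, hence the check on that END here.)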
+ */ + + /* Grab END for prio 7 to check for silent escalation */ + if (!xive_end_for_target(vp_id, xive_escalation_prio(x), + &end_blk, &end_idx)) + return OPAL_PARAMETER; + + end_x = xive_from_vc_blk(end_blk); + if (!end_x) + return OPAL_PARAMETER; + + end = xive_get_end(x, end_idx); + if (!end) + return OPAL_PARAMETER; + if (xive_get_field32(NVP_W0_VALID, vp->w0)) + *out_flags |= cpu_to_be64(OPAL_XIVE_VP_ENABLED); + if (xive_cfg_save_restore(x)) + *out_flags |= cpu_to_be64(OPAL_XIVE_VP_SAVE_RESTORE); + if (xive_get_field32(END_W0_SILENT_ESCALATE, end->w0)) + *out_flags |= cpu_to_be64(OPAL_XIVE_VP_SINGLE_ESCALATION); + } + + if (out_cam_value) { + uint64_t cam_value; + + cam_value = (blk << x->vp_shift) | idx; + + /* + * If save-restore is enabled, force the CAM line + * value with the H bit. + */ + if (xive_cfg_save_restore(x)) + cam_value |= TM10_QW1W2_HO; + + *out_cam_value = cpu_to_be64(cam_value); + } + + if (out_report_cl_pair) { + uint64_t report_cl_pair; + + report_cl_pair = ((uint64_t)(be32_to_cpu(vp->w6) & 0x0fffffff)) << 32; + report_cl_pair |= be32_to_cpu(vp->w7) & 0xffffff00; + + *out_report_cl_pair = cpu_to_be64(report_cl_pair); + } + + if (out_chip_id) + *out_chip_id = cpu_to_be32(xive_block_to_chip[blk]); + + return OPAL_SUCCESS; +} + +static int64_t xive_setup_silent_gather(uint64_t vp_id, bool enable) +{ + uint32_t blk, idx, i; + struct xive_end *end_orig; + struct xive_end end; + struct xive *x; + int64_t rc; + + /* Get base END block */ + if (!xive_end_for_target(vp_id, 0, &blk, &idx)) { + prlog(PR_ERR, "%s: Invalid VP 0x%08llx\n", __func__, vp_id); + return OPAL_PARAMETER; + } + x = xive_from_vc_blk(blk); + if (!x) { + prlog(PR_ERR, "%s: VP 0x%08llx has invalid block %d\n", __func__, + vp_id, blk); + return OPAL_PARAMETER; + } + + /* Grab prio 7 */ + end_orig = xive_get_end(x, idx + xive_escalation_prio(x)); + if (!end_orig) { + xive_err(x, "Failed to get silent gather END 0x%x for VP 0x%08llx\n", + idx + xive_escalation_prio(x), vp_id); + return OPAL_PARAMETER; + } + + /* If trying to enable silent gather, make sure prio 7 is not + * already enabled as a normal queue + */ + if (enable && xive_get_field32(END_W0_VALID, end_orig->w0) && + !xive_get_field32(END_W0_SILENT_ESCALATE, end_orig->w0)) { + xive_err(x, "silent gather END 0x%x already in use\n", + idx + xive_escalation_prio(x)); + return OPAL_PARAMETER; + } + + end = *end_orig; + + if (enable) { + /* W0: Enabled and "s" set, no other bit */ + end.w0 = xive_set_field32(END_W0_FIRMWARE1, end.w0, 0); + end.w0 = xive_set_field32(END_W0_VALID, end.w0, 1); + end.w0 = xive_set_field32(END_W0_SILENT_ESCALATE, end.w0, 1); + end.w0 = xive_set_field32(END_W0_ESCALATE_CTL, end.w0, 1); + end.w0 = xive_set_field32(END_W0_BACKLOG, end.w0, 1); + + /* Set new "N" for END escalation (vs. 
ESB) */ + end.w0 = xive_set_field32(END_W0_ESCALATE_END, end.w0, 1); + + /* W1: Mark ESn as 01, ESe as 00 */ + end.w1 = xive_set_field32(END_W1_ESn_P, end.w1, 0); + end.w1 = xive_set_field32(END_W1_ESn_Q, end.w1, 1); + end.w1 = xive_set_field32(END_W1_ESe, end.w1, 0); + } else if (xive_get_field32(END_W0_SILENT_ESCALATE, end.w0)) + xive_cleanup_end(&end); + + if (!memcmp(end_orig, &end, sizeof(end))) + rc = 0; + else + rc = xive_endc_cache_update(x, blk, idx + xive_escalation_prio(x), + &end, false); + if (rc) + return rc; + + /* Mark/unmark all other prios with the new "u" bit and update + * escalation + */ + for (i = 0; i < xive_cfg_vp_prio(x); i++) { + if (i == xive_escalation_prio(x)) + continue; + end_orig = xive_get_end(x, idx + i); + if (!end_orig) + continue; + end = *end_orig; + if (enable) { + /* Set "u" bit */ + end.w0 = xive_set_field32(END_W0_UNCOND_ESCALATE, end.w0, 1); + + /* Set new "N" for END escalation (vs. ESB) */ + /* TODO (Gen2+) : use ESB escalation configuration */ + end.w0 = xive_set_field32(END_W0_ESCALATE_END, end.w0, 1); + + /* Re-route escalation interrupt (previous + * route is lost !) to the gather queue + */ + end.w4 = xive_set_field32(END_W4_END_BLOCK, end.w4, blk); + end.w4 = xive_set_field32(END_W4_ESC_END_INDEX, + end.w4, idx + xive_escalation_prio(x)); + } else if (xive_get_field32(END_W0_UNCOND_ESCALATE, end.w0)) { + /* Clear the "u" bit, disable escalations if it was set */ + end.w0 = xive_set_field32(END_W0_UNCOND_ESCALATE, end.w0, 0); + end.w0 = xive_set_field32(END_W0_ESCALATE_CTL, end.w0, 0); + } + if (!memcmp(end_orig, &end, sizeof(end))) + continue; + rc = xive_endc_cache_update(x, blk, idx + i, &end, false); + if (rc) + break; + } + + return rc; +} + +static int64_t opal_xive_set_vp_info(uint64_t vp_id, + uint64_t flags, + uint64_t report_cl_pair) +{ + struct xive *x; + struct xive_nvp *vp, vp_new; + uint32_t blk, idx; + bool group; + int64_t rc; + + if (!xive_decode_vp(vp_id, &blk, &idx, NULL, &group)) + return OPAL_PARAMETER; + /* We don't do groups yet */ + if (group) + return OPAL_PARAMETER; + if (report_cl_pair & 0xff) + return OPAL_PARAMETER; + x = xive_from_pc_blk(blk); + if (!x) + return OPAL_PARAMETER; + vp = xive_get_vp(x, idx); + if (!vp) + return OPAL_PARAMETER; + + /* Consistency check. */ + if ((flags & OPAL_XIVE_VP_SAVE_RESTORE) && !xive_cfg_save_restore(x)) + return OPAL_PARAMETER; + + lock(&x->lock); + + vp_new = *vp; + if (flags & OPAL_XIVE_VP_ENABLED) { + vp_new.w0 = xive_set_field32(NVP_W0_VALID, vp_new.w0, 1); + vp_new.w6 = cpu_to_be32(report_cl_pair >> 32); + vp_new.w7 = cpu_to_be32(report_cl_pair & 0xffffffff); + + if (flags & OPAL_XIVE_VP_SINGLE_ESCALATION) + rc = xive_setup_silent_gather(vp_id, true); + else + rc = xive_setup_silent_gather(vp_id, false); + + /* + * Prepare NVP to be HW owned for automatic save-restore + */ + if (xive_cfg_save_restore(x)) { + /* + * Set NVP privilege level. Default to OS. + * This check only makes sense for KVM guests + * currently. We would need an extra flag to + * distinguish from pool level. + */ + vp_new.w0 = xive_set_field32(NVP_W0_VPRIV, vp_new.w0, 0); + + vp_new.w2 = xive_set_field32(NVP_W2_CPPR, vp_new.w2, 0xFF); + vp_new.w0 = xive_set_field32(NVP_W0_HW, vp_new.w0, 1); + } + } else { + /* + * TODO (kvm): disabling a VP invalidates the associated ENDs. + * + * The loads then return all 1s which can be an issue for the + * Linux code to handle. 
+ */ + + vp_new.w0 = vp_new.w6 = vp_new.w7 = 0; + rc = xive_setup_silent_gather(vp_id, false); + } + + if (rc) { + if (rc != OPAL_BUSY) + xive_dbg(x, "Silent gather setup failed with err %lld\n", rc); + goto bail; + } + + rc = xive_nxc_cache_update(x, blk, idx, &vp_new, false); + if (rc) + goto bail; + + /* When disabling, we scrub clean (invalidate the entry) so + * we can avoid cache ops in alloc/free + */ + if (!(flags & OPAL_XIVE_VP_ENABLED)) + xive_nxc_scrub_clean(x, blk, idx); + +bail: + unlock(&x->lock); + return rc; +} + +static int64_t opal_xive_get_vp_state(uint64_t vp_id, beint64_t *out_state) +{ + struct xive *x; + struct xive_nvp *vp; + uint32_t blk, idx; + int64_t rc; + bool group; + + if (!out_state || !xive_decode_vp(vp_id, &blk, &idx, NULL, &group)) + return OPAL_PARAMETER; + if (group) + return OPAL_PARAMETER; + x = xive_from_pc_blk(blk); + if (!x) + return OPAL_PARAMETER; + vp = xive_get_vp(x, idx); + if (!vp) + return OPAL_PARAMETER; + + /* Scrub the vp */ + lock(&x->lock); + rc = xive_nxc_scrub(x, blk, idx); + unlock(&x->lock); + if (rc) + return rc; + + if (!xive_get_field32(NVP_W0_VALID, vp->w0)) + return OPAL_WRONG_STATE; + + /* + * return a state matching the layout of WORD 0-1 of the TIMA + * as this is expected by current implementation. + */ + *out_state = cpu_to_be64(((uint64_t) 0x0) << 54 | + (uint64_t)xive_get_field32(NVP_W2_CPPR, vp->w2) << 48 | + (uint64_t)xive_get_field32(NVP_W2_IPB, vp->w2) << 40 | + (uint64_t)xive_get_field32(NVP_W2_LSMFB, vp->w2) << 32); + + return OPAL_SUCCESS; +} + +static void *xive_cpu_get_tima(struct cpu_thread *c) +{ + struct xive_cpu_state *xs = c->xstate; + struct xive *x = xs->xive; + + return x->ic_tm_direct_base + ((c->pir & 0xff) << x->ic_shift); +} + +static void xive_cleanup_cpu_tima(struct cpu_thread *c) +{ + struct xive_cpu_state *xs __unused = c->xstate; + void *cpu_tm_base = xive_cpu_get_tima(c); + uint8_t old_w2 __unused, w2 __unused; + + /* Reset the HW context */ + xive_reset_enable_thread(c); + + /* Set VT to 1 */ + old_w2 = in_8(cpu_tm_base + TM_QW3_HV_PHYS + TM_WORD2); + out_8(cpu_tm_base + TM_QW3_HV_PHYS + TM_WORD2, 0x80); + w2 = in_8(cpu_tm_base + TM_QW3_HV_PHYS + TM_WORD2); + + /* Dump HV state */ + xive_cpu_vdbg(c, "[reset] VP TIMA VP=%x/%x W01=%016llx W2=%02x->%02x\n", + xs->vp_blk, xs->vp_idx, + in_be64(cpu_tm_base + TM_QW3_HV_PHYS), + old_w2, w2); +} + +static int64_t xive_vc_ind_cache_kill(struct xive *x, uint64_t type) +{ + uint64_t val; + + /* We clear the whole thing */ + xive_regw(x, VC_AT_MACRO_KILL_MASK, 0); + xive_regw(x, VC_AT_MACRO_KILL, VC_AT_MACRO_KILL_VALID | + SETFIELD(VC_AT_MACRO_KILL_VSD, 0ull, type)); + + /* XXX Add timeout */ + for (;;) { + val = xive_regr(x, VC_AT_MACRO_KILL); + if (!(val & VC_AT_MACRO_KILL_VALID)) + break; + } + return 0; +} + +static int64_t xive_pc_ind_cache_kill(struct xive *x) +{ + uint64_t val; + + /* We clear the whole thing */ + xive_regw(x, PC_AT_KILL_MASK, 0); + xive_regw(x, PC_AT_KILL, PC_AT_KILL_VALID | + SETFIELD(VC_AT_MACRO_KILL_VSD, 0ull, VST_NVP)); + + /* XXX Add timeout */ + for (;;) { + val = xive_regr(x, PC_AT_KILL); + if (!(val & PC_AT_KILL_VALID)) + break; + } + return 0; +} + +static void xive_cleanup_vp_ind(struct xive *x) +{ + int i; + + xive_dbg(x, "Cleaning up %d VP ind entries...\n", x->vp_ind_count); + for (i = 0; i < x->vp_ind_count; i++) { + if (be64_to_cpu(x->vp_ind_base[i]) & VSD_FIRMWARE) { + xive_dbg(x, " %04x ... skip (firmware)\n", i); + continue; + } + if (x->vp_ind_base[i] != 0) { + x->vp_ind_base[i] = 0; + xive_dbg(x, " %04x ... 
cleaned\n", i); + } + } + xive_pc_ind_cache_kill(x); +} + +static void xive_cleanup_end_ind(struct xive *x) +{ + int i; + + xive_dbg(x, "Cleaning up %d END ind entries...\n", x->end_ind_count); + for (i = 0; i < x->end_ind_count; i++) { + if (be64_to_cpu(x->end_ind_base[i]) & VSD_FIRMWARE) { + xive_dbg(x, " %04x ... skip (firmware)\n", i); + continue; + } + if (x->end_ind_base[i] != 0) { + x->end_ind_base[i] = 0; + xive_dbg(x, " %04x ... cleaned\n", i); + } + } + xive_vc_ind_cache_kill(x, VST_END); +} + +static void xive_reset_one(struct xive *x) +{ + struct cpu_thread *c; + bool end_firmware; + int i; + + xive_notice(x, "Resetting one xive...\n"); + + lock(&x->lock); + + /* Check all interrupts are disabled */ + i = bitmap_find_one_bit(*x->int_enabled_map, 0, XIVE_INT_COUNT); + if (i >= 0) + xive_warn(x, "Interrupt %d (and maybe more) not disabled" + " at reset !\n", i); + + /* Reset IPI allocation */ + xive_dbg(x, "freeing alloc map %p/%p\n", + x->ipi_alloc_map, *x->ipi_alloc_map); + memset(x->ipi_alloc_map, 0, BITMAP_BYTES(XIVE_INT_COUNT)); + + xive_dbg(x, "Resetting ENDs...\n"); + + /* Reset all allocated ENDs and free the user ones */ + bitmap_for_each_one(*x->end_map, xive_end_bitmap_size(x), i) { + struct xive_end end0; + struct xive_end *end; + int j; + + if (i == 0) + continue; + end_firmware = false; + for (j = 0; j < xive_cfg_vp_prio(x); j++) { + uint32_t idx = (i << xive_cfg_vp_prio_shift(x)) | j; + + end = xive_get_end(x, idx); + if (!end) + continue; + + /* We need to preserve the firmware bit, otherwise + * we will incorrectly free the ENDs that are reserved + * for the physical CPUs + */ + if (xive_get_field32(END_W0_VALID, end->w0)) { + if (!xive_end_is_firmware1(end)) + xive_dbg(x, "END 0x%x:0x%x is valid at reset: %08x %08x\n", + x->block_id, idx, end->w0, end->w1); + end0 = *end; + xive_cleanup_end(&end0); + xive_endc_cache_update(x, x->block_id, idx, &end0, true); + } + if (xive_end_is_firmware1(end)) + end_firmware = true; + } + if (!end_firmware) + bitmap_clr_bit(*x->end_map, i); + } + + /* Take out all VPs from HW and reset all CPPRs to 0 */ + for_each_present_cpu(c) { + if (c->chip_id != x->chip_id) + continue; + if (!c->xstate) + continue; + xive_cleanup_cpu_tima(c); + } + + /* Reset all user-allocated VPs. This is inefficient, we should + * either keep a bitmap of allocated VPs or add an iterator to + * the buddy which is trickier but doable. + */ + for (i = 0; i < XIVE_VP_COUNT(x); i++) { + struct xive_nvp *vp; + struct xive_nvp vp0 = {0}; + + /* Ignore the physical CPU VPs */ + if (i >= xive_hw_vp_count && + i < (xive_hw_vp_base + xive_hw_vp_count)) + continue; + + /* Is the VP valid ? 
*/ + vp = xive_get_vp(x, i); + if (!vp || !xive_get_field32(NVP_W0_VALID, vp->w0)) + continue; + + /* Clear it */ + xive_dbg(x, "VP 0x%x:0x%x is valid at reset\n", x->block_id, i); + xive_nxc_cache_update(x, x->block_id, i, &vp0, true); + } + + /* Forget about remaining donated pages */ + list_head_init(&x->donated_pages); + + /* And cleanup donated indirect VP and END pages */ + xive_cleanup_vp_ind(x); + xive_cleanup_end_ind(x); + + /* The rest must not be called with the lock held */ + unlock(&x->lock); + + /* Re-configure VPs */ + for_each_present_cpu(c) { + struct xive_cpu_state *xs = c->xstate; + + if (c->chip_id != x->chip_id || !xs) + continue; + + xive_init_cpu_exploitation(xs); + } +} + +static void xive_reset_mask_source_cb(struct irq_source *is, + void *data __unused) +{ + struct xive_src *s = container_of(is, struct xive_src, is); + struct xive *x; + uint32_t isn; + + if (is->ops != &xive_irq_source_ops) + return; + + /* Skip escalation sources */ + if (GIRQ_IS_ESCALATION(is->start)) + return; + + x = s->xive; + + /* Iterate all interrupts */ + for (isn = is->start; isn < is->end; isn++) { + /* Has it ever been enabled ? */ + if (!bitmap_tst_bit(*x->int_enabled_map, GIRQ_TO_IDX(isn))) + continue; + /* Mask it and clear the enabled map bit */ + xive_vdbg(x, "[reset] disabling source 0x%x\n", isn); + __xive_set_irq_config(is, isn, 0, 0xff, isn, true, false); + bitmap_clr_bit(*x->int_enabled_map, GIRQ_TO_IDX(isn)); + } +} + +void xive2_cpu_reset(void) +{ + struct cpu_thread *c = this_cpu(); + struct xive_cpu_state *xs = c->xstate; + + out_8(xs->tm_ring1 + TM_QW3_HV_PHYS + TM_CPPR, 0); + + in_be64(xs->tm_ring1 + TM_SPC_PULL_POOL_CTX); +} + +static int64_t __xive_reset(uint64_t mode) +{ + struct proc_chip *chip; + + xive_mode = mode; + + /* Mask all interrupt sources */ + irq_for_each_source(xive_reset_mask_source_cb, NULL); + + /* For each XIVE do a sync... */ + for_each_chip(chip) { + if (!chip->xive) + continue; + xive_sync(chip->xive); + } + + /* For each XIVE reset everything else... */ + for_each_chip(chip) { + if (!chip->xive) + continue; + xive_reset_one(chip->xive); + } + + /* Cleanup global VP allocator */ + buddy_reset(xive_vp_buddy); + + /* + * We reserve the whole range of VP ids for HW threads. + */ + assert(buddy_reserve(xive_vp_buddy, xive_hw_vp_base, xive_threadid_shift)); + + return OPAL_SUCCESS; +} + +/* Called by fast reboot */ +int64_t xive2_reset(void) +{ + if (xive_mode == XIVE_MODE_NONE) + return OPAL_SUCCESS; + return __xive_reset(XIVE_MODE_EXPL); +} + +static int64_t opal_xive_reset(uint64_t mode) +{ + prlog(PR_DEBUG, "XIVE reset. mode = %llx\n", mode); + + if (!(mode & XIVE_MODE_EXPL)) { + prlog(PR_NOTICE, "No emulation mode. 
XIVE exploitation mode " + "is the default\n"); + } + + xive_expl_options = mode & ~XIVE_MODE_EXPL; + if (xive_expl_options & ~XIVE_EXPL_ALL_OPTIONS) { + prerror("invalid XIVE exploitation mode option %016llx\n", + xive_expl_options); + return OPAL_PARAMETER; + } + + return __xive_reset(XIVE_MODE_EXPL); +} + +static int64_t opal_xive_free_vp_block(uint64_t vp_base) +{ + uint32_t blk, idx, i, j, count; + uint8_t order; + bool group; + + if (xive_mode != XIVE_MODE_EXPL) + return OPAL_WRONG_STATE; + + if (!xive_decode_vp(vp_base, &blk, &idx, &order, &group)) + return OPAL_PARAMETER; + if (group) + return OPAL_PARAMETER; + if (blk) + return OPAL_PARAMETER; + if (order < (xive_chips_alloc_bits + 1)) + return OPAL_PARAMETER; + if (idx & ((1 << (order - xive_chips_alloc_bits)) - 1)) + return OPAL_PARAMETER; + + count = 1 << order; + for (i = 0; i < count; i++) { + uint32_t vp_id = vp_base + i; + uint32_t blk, idx, end_blk, end_idx; + struct xive *x; + struct xive_nvp *vp; + + if (!xive_decode_vp(vp_id, &blk, &idx, NULL, NULL)) { + prerror("Couldn't decode VP id %u\n", vp_id); + return OPAL_INTERNAL_ERROR; + } + x = xive_from_pc_blk(blk); + if (!x) { + prerror("Instance not found for deallocated VP" + " block %d\n", blk); + return OPAL_INTERNAL_ERROR; + } + vp = xive_get_vp(x, idx); + if (!vp) { + prerror("VP not found for deallocation !"); + return OPAL_INTERNAL_ERROR; + } + + /* VP must be disabled */ + if (xive_get_field32(NVP_W0_VALID, vp->w0)) { + prlog(PR_ERR, "freeing active VP %d\n", vp_id); + return OPAL_XIVE_FREE_ACTIVE; + } + + /* Not populated */ + if (vp->w5 == 0) + continue; + + end_blk = xive_get_field32(NVP_W5_VP_END_BLOCK, vp->w5); + end_idx = xive_get_field32(NVP_W5_VP_END_INDEX, vp->w5); + + lock(&x->lock); + + /* Ensure ENDs are disabled and cleaned up. 
Ideally the caller + * should have done it but we double check it here + */ + for (j = 0; j < xive_cfg_vp_prio(x); j++) { + struct xive *end_x = xive_from_vc_blk(end_blk); + struct xive_end end, *orig_end = xive_get_end(end_x, end_idx + j); + + if (!xive_get_field32(END_W0_VALID, orig_end->w0)) + continue; + + prlog(PR_WARNING, "freeing VP %d with queue %d active\n", + vp_id, j); + end = *orig_end; + xive_cleanup_end(&end); + xive_endc_cache_update(x, end_blk, end_idx + j, &end, true); + } + + /* Mark it not populated so we don't try to free it again */ + vp->w5 = 0; + + if (end_blk != blk) { + prerror("Block mismatch trying to free ENDs\n"); + unlock(&x->lock); + return OPAL_INTERNAL_ERROR; + } + + xive_free_end_set(x, end_idx); + unlock(&x->lock); + } + + xive_free_vps(vp_base); + + return OPAL_SUCCESS; +} + +static int64_t opal_xive_alloc_vp_block(uint32_t alloc_order) +{ + uint32_t vp_base, ends, count, i; + int64_t rc; + + if (xive_mode != XIVE_MODE_EXPL) + return OPAL_WRONG_STATE; + + prlog(PR_TRACE, "opal_xive_alloc_vp_block(%d)\n", alloc_order); + + vp_base = xive_alloc_vps(alloc_order); + if (XIVE_ALLOC_IS_ERR(vp_base)) { + if (vp_base == XIVE_ALLOC_NO_IND) + return OPAL_XIVE_PROVISIONING; + return OPAL_RESOURCE; + } + + /* Allocate ENDs and initialize VPs */ + count = 1 << alloc_order; + for (i = 0; i < count; i++) { + uint32_t vp_id = vp_base + i; + uint32_t blk, idx; + struct xive *x; + struct xive_nvp *vp; + + if (!xive_decode_vp(vp_id, &blk, &idx, NULL, NULL)) { + prerror("Couldn't decode VP id %u\n", vp_id); + return OPAL_INTERNAL_ERROR; + } + x = xive_from_pc_blk(blk); + if (!x) { + prerror("Instance not found for allocated VP" + " block %d\n", blk); + rc = OPAL_INTERNAL_ERROR; + goto fail; + } + vp = xive_get_vp(x, idx); + if (!vp) { + prerror("VP not found after allocation !"); + rc = OPAL_INTERNAL_ERROR; + goto fail; + } + + /* Allocate ENDs, if fails, free the VPs and return */ + lock(&x->lock); + ends = xive_alloc_end_set(x, false); + unlock(&x->lock); + if (XIVE_ALLOC_IS_ERR(ends)) { + if (ends == XIVE_ALLOC_NO_IND) + rc = OPAL_XIVE_PROVISIONING; + else + rc = OPAL_RESOURCE; + goto fail; + } + + /* Initialize the VP structure. We don't use a cache watch + * as we have made sure when freeing the entries to scrub + * it out of the cache. + */ + memset(vp, 0, sizeof(*vp)); + + /* Store the END base of the VP in W5 (new in p10) */ + xive_vp_set_end_base(vp, blk, ends); + } + return vp_base; + fail: + opal_xive_free_vp_block(vp_base); + + return rc; +} + +static int64_t xive_try_allocate_irq(struct xive *x) +{ + int idx, base_idx, max_count, girq; + struct xive_eas *eas; + + lock(&x->lock); + + base_idx = x->int_ipi_top - x->int_base; + max_count = x->int_hw_bot - x->int_ipi_top; + + idx = bitmap_find_zero_bit(*x->ipi_alloc_map, base_idx, max_count); + if (idx < 0) { + unlock(&x->lock); + return OPAL_RESOURCE; + } + bitmap_set_bit(*x->ipi_alloc_map, idx); + girq = x->int_base + idx; + + /* Mark the EAS valid. Don't bother with the HW cache, it's + * still masked anyway, the cache will be updated when unmasked + * and configured. 
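+	 * (EAS_END_DATA is seeded with the girq number; the OS is
+	 * expected to replace it with its own logical irq number
+	 * through opal_xive_set_irq_config().)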
+ */ + eas = xive_get_eas(x, girq); + if (!eas) { + bitmap_clr_bit(*x->ipi_alloc_map, idx); + unlock(&x->lock); + return OPAL_PARAMETER; + } + eas->w = xive_set_field64(EAS_VALID, 0, 1) | + xive_set_field64(EAS_MASKED, 0, 1) | + xive_set_field64(EAS_END_DATA, 0, girq); + unlock(&x->lock); + + return girq; +} + +static int64_t opal_xive_allocate_irq(uint32_t chip_id) +{ + struct proc_chip *chip; + bool try_all = false; + int64_t rc; + + if (xive_mode != XIVE_MODE_EXPL) + return OPAL_WRONG_STATE; + + if (chip_id == OPAL_XIVE_ANY_CHIP) { + try_all = true; + chip_id = this_cpu()->chip_id; + } + chip = get_chip(chip_id); + if (!chip) + return OPAL_PARAMETER; + + /* Try initial target chip */ + if (!chip->xive) + rc = OPAL_PARAMETER; + else + rc = xive_try_allocate_irq(chip->xive); + if (rc >= 0 || !try_all) + return rc; + + /* Failed and we try all... do so */ + for_each_chip(chip) { + if (!chip->xive) + continue; + rc = xive_try_allocate_irq(chip->xive); + if (rc >= 0) + break; + } + return rc; +} + +static int64_t opal_xive_free_irq(uint32_t girq) +{ + struct irq_source *is = irq_find_source(girq); + struct xive_src *s = container_of(is, struct xive_src, is); + struct xive *x = xive_from_isn(girq); + struct xive_eas *eas; + uint32_t idx; + + if (xive_mode != XIVE_MODE_EXPL) + return OPAL_WRONG_STATE; + if (!x || !is) + return OPAL_PARAMETER; + + idx = GIRQ_TO_IDX(girq); + + lock(&x->lock); + + eas = xive_get_eas(x, girq); + if (!eas) { + unlock(&x->lock); + return OPAL_PARAMETER; + } + + /* Mask the interrupt source */ + xive_update_irq_mask(s, girq - s->esb_base, true); + + /* Mark the EAS masked and invalid */ + eas->w = xive_set_field64(EAS_VALID, 0, 1) | + xive_set_field64(EAS_MASKED, 0, 1); + xive_easc_scrub(x, x->block_id, idx); + + /* Free it */ + if (!bitmap_tst_bit(*x->ipi_alloc_map, idx)) { + unlock(&x->lock); + return OPAL_PARAMETER; + } + bitmap_clr_bit(*x->ipi_alloc_map, idx); + bitmap_clr_bit(*x->int_enabled_map, idx); + unlock(&x->lock); + + return OPAL_SUCCESS; +} + +static int64_t opal_xive_dump_tm(uint32_t offset, const char *n, uint32_t pir) +{ + struct cpu_thread *c = find_cpu_by_pir(pir); + struct xive_cpu_state *xs; + struct xive *x; + void *cpu_tm_base; + uint64_t v0,v1; + + if (!c) + return OPAL_PARAMETER; + xs = c->xstate; + if (!xs || !xs->tm_ring1) + return OPAL_INTERNAL_ERROR; + x = xs->xive; + cpu_tm_base = xive_cpu_get_tima(c); + + lock(&x->lock); + v0 = in_be64(cpu_tm_base + offset); + if (offset == TM_QW3_HV_PHYS) { + v1 = in_8(cpu_tm_base + offset + 8); + v1 <<= 56; + } else { + v1 = in_be32(cpu_tm_base + offset + 8); + v1 <<= 32; + } + prlog(PR_INFO, "CPU[%04x]: TM state for QW %s\n", pir, n); + prlog(PR_INFO, "CPU[%04x]: NSR CPPR IPB LSMFB ACK# INC AGE PIPR" + " W2 W3\n", pir); + prlog(PR_INFO, "CPU[%04x]: %02x %02x %02x %02x %02x " + "%02x %02x %02x %08x %08x\n", pir, + (uint8_t)(v0 >> 58) & 0xff, (uint8_t)(v0 >> 48) & 0xff, + (uint8_t)(v0 >> 40) & 0xff, (uint8_t)(v0 >> 32) & 0xff, + (uint8_t)(v0 >> 24) & 0xff, (uint8_t)(v0 >> 16) & 0xff, + (uint8_t)(v0 >> 8) & 0xff, (uint8_t)(v0 ) & 0xff, + (uint32_t)(v1 >> 32) & 0xffffffff, + (uint32_t)(v1 & 0xffffffff)); + unlock(&x->lock); + + return OPAL_SUCCESS; +} + +static int64_t opal_xive_dump_vp(uint32_t vp_id) +{ + uint32_t blk, idx; + uint8_t order; + bool group; + struct xive *x; + struct xive_nvp *vp; + uint32_t *vpw; + + if (!xive_decode_vp(vp_id, &blk, &idx, &order, &group)) + return OPAL_PARAMETER; + + x = xive_from_vc_blk(blk); + if (!x) + return OPAL_PARAMETER; + vp = xive_get_vp(x, idx); + if (!vp) + 
return OPAL_PARAMETER; + lock(&x->lock); + + xive_nxc_scrub_clean(x, blk, idx); + + vpw = ((uint32_t *)vp) + (group ? 8 : 0); + prlog(PR_INFO, "VP[%08x]: 0..3: %08x %08x %08x %08x\n", vp_id, + vpw[0], vpw[1], vpw[2], vpw[3]); + prlog(PR_INFO, "VP[%08x]: 4..7: %08x %08x %08x %08x\n", vp_id, + vpw[4], vpw[5], vpw[6], vpw[7]); + unlock(&x->lock); + + return OPAL_SUCCESS; +} + +static int64_t opal_xive_sync_irq_src(uint32_t girq) +{ + struct xive *x = xive_from_isn(girq); + + if (!x) + return OPAL_PARAMETER; + return xive_sync(x); +} + +static int64_t opal_xive_sync_irq_target(uint32_t girq) +{ + uint32_t target, vp_blk; + struct xive *x; + + if (!xive_get_irq_targetting(girq, &target, NULL, NULL)) + return OPAL_PARAMETER; + if (!xive_decode_vp(target, &vp_blk, NULL, NULL, NULL)) + return OPAL_PARAMETER; + x = xive_from_pc_blk(vp_blk); + if (!x) + return OPAL_PARAMETER; + return xive_sync(x); +} + +static int64_t opal_xive_sync(uint32_t type, uint32_t id) +{ + int64_t rc = OPAL_SUCCESS;; + + if (type & XIVE_SYNC_EAS) + rc = opal_xive_sync_irq_src(id); + if (rc) + return rc; + if (type & XIVE_SYNC_QUEUE) + rc = opal_xive_sync_irq_target(id); + if (rc) + return rc; + + /* Add more ... */ + + return rc; +} + +static int64_t opal_xive_dump(uint32_t type, uint32_t id) +{ + switch (type) { + case XIVE_DUMP_TM_HYP: + return opal_xive_dump_tm(TM_QW3_HV_PHYS, "PHYS", id); + case XIVE_DUMP_TM_POOL: + return opal_xive_dump_tm(TM_QW2_HV_POOL, "POOL", id); + case XIVE_DUMP_TM_OS: + return opal_xive_dump_tm(TM_QW1_OS, "OS ", id); + case XIVE_DUMP_TM_USER: + return opal_xive_dump_tm(TM_QW0_USER, "USER", id); + case XIVE_DUMP_VP: + return opal_xive_dump_vp(id); + default: + return OPAL_PARAMETER; + } +} + +static void xive_init_globals(void) +{ + uint32_t i; + + for (i = 0; i < XIVE_MAX_CHIPS; i++) + xive_block_to_chip[i] = XIVE_INVALID_CHIP; +} + +/* + * The global availability of some capabilities used in other drivers + * (PHB, PSI) is deduced from the capabilities of the first XIVE chip + * of the system. It should be common to all chips. + */ +bool xive2_cap_phb_pq_disable(void) +{ + return xive_has_cap(one_xive, CQ_XIVE_CAP_PHB_PQ_DISABLE); +} + +bool xive2_cap_phb_abt(void) +{ + if (!xive_has_cap(one_xive, CQ_XIVE_CAP_PHB_ABT)) + return false; + + /* + * We need 'PQ disable' to use ABT mode, else the OS will use + * two different sets of ESB pages (PHB and IC) to control the + * interrupt sources. Can not work. + */ + if (!xive2_cap_phb_pq_disable()) { + prlog_once(PR_ERR, "ABT mode is set without PQ disable. " + "Ignoring bogus configuration\n"); + return false; + } + + return true; +} + +bool xive2_cap_store_eoi(void) +{ + return xive_has_cap(one_xive, CQ_XIVE_CAP_STORE_EOI); +} + +void xive2_init(void) +{ + struct dt_node *np; + struct proc_chip *chip; + struct cpu_thread *cpu; + bool first = true; + + /* Look for xive nodes and do basic inits */ + dt_for_each_compatible(dt_root, np, "ibm,power10-xive-x") { + struct xive *x; + + /* Initialize some global stuff */ + if (first) + xive_init_globals(); + + /* Create/initialize the xive instance */ + x = init_one_xive(np); + if (first) + one_xive = x; + first = false; + } + if (first) + return; + + /* + * P8 emulation is not supported on P10 anymore. Exploitation + * is the default XIVE mode. We might introduce a GEN2 mode. 
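+	 * (opal_xive_reset() above follows the same rule: a mode
+	 * without XIVE_MODE_EXPL only triggers a notice and is
+	 * handled as exploitation mode anyway.)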
+ */ + xive_mode = XIVE_MODE_EXPL; + + /* Init VP allocator */ + xive_init_vp_allocator(); + + /* Create a device-tree node for Linux use */ + xive_create_mmio_dt_node(one_xive); + + /* Some inits must be done after all xive have been created + * such as setting up the forwarding ports + */ + for_each_chip(chip) { + if (chip->xive) + late_init_one_xive(chip->xive); + } + + /* Initialize per-cpu structures */ + for_each_present_cpu(cpu) { + xive_init_cpu(cpu); + } + + /* Calling boot CPU */ + xive2_cpu_callin(this_cpu()); + + /* Register XIVE exploitation calls */ + opal_register(OPAL_XIVE_RESET, opal_xive_reset, 1); + opal_register(OPAL_XIVE_GET_IRQ_INFO, opal_xive_get_irq_info, 6); + opal_register(OPAL_XIVE_GET_IRQ_CONFIG, opal_xive_get_irq_config, 4); + opal_register(OPAL_XIVE_SET_IRQ_CONFIG, opal_xive_set_irq_config, 4); + opal_register(OPAL_XIVE_GET_QUEUE_INFO, opal_xive_get_queue_info, 7); + opal_register(OPAL_XIVE_SET_QUEUE_INFO, opal_xive_set_queue_info, 5); + opal_register(OPAL_XIVE_DONATE_PAGE, opal_xive_donate_page, 2); + opal_register(OPAL_XIVE_ALLOCATE_IRQ, opal_xive_allocate_irq, 1); + opal_register(OPAL_XIVE_FREE_IRQ, opal_xive_free_irq, 1); + opal_register(OPAL_XIVE_ALLOCATE_VP_BLOCK, opal_xive_alloc_vp_block, 1); + opal_register(OPAL_XIVE_FREE_VP_BLOCK, opal_xive_free_vp_block, 1); + opal_register(OPAL_XIVE_GET_VP_INFO, opal_xive_get_vp_info, 5); + opal_register(OPAL_XIVE_SET_VP_INFO, opal_xive_set_vp_info, 3); + opal_register(OPAL_XIVE_SYNC, opal_xive_sync, 2); + opal_register(OPAL_XIVE_DUMP, opal_xive_dump, 2); + opal_register(OPAL_XIVE_GET_QUEUE_STATE, opal_xive_get_queue_state, 4); + opal_register(OPAL_XIVE_SET_QUEUE_STATE, opal_xive_set_queue_state, 4); + opal_register(OPAL_XIVE_GET_VP_STATE, opal_xive_get_vp_state, 2); +} diff --git a/roms/skiboot/hw/xscom.c b/roms/skiboot/hw/xscom.c new file mode 100644 index 000000000..347457242 --- /dev/null +++ b/roms/skiboot/hw/xscom.c @@ -0,0 +1,1019 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* + * XSCOM driver + * + * Copyright 2013-2019 IBM Corp. + */ + +#include <skiboot.h> +#include <xscom.h> +#include <io.h> +#include <processor.h> +#include <device.h> +#include <chip.h> +#include <centaur.h> +#include <errorlog.h> +#include <opal-api.h> +#include <timebase.h> +#include <nvram.h> + +/* Mask of bits to clear in HMER before an access */ +#define HMER_CLR_MASK (~(SPR_HMER_XSCOM_FAIL | \ + SPR_HMER_XSCOM_DONE | \ + SPR_HMER_XSCOM_STATUS)) + +DEFINE_LOG_ENTRY(OPAL_RC_XSCOM_RW, OPAL_PLATFORM_ERR_EVT, OPAL_XSCOM, + OPAL_CEC_HARDWARE, OPAL_PREDICTIVE_ERR_GENERAL, + OPAL_NA); + +DEFINE_LOG_ENTRY(OPAL_RC_XSCOM_INDIRECT_RW, OPAL_PLATFORM_ERR_EVT, OPAL_XSCOM, + OPAL_CEC_HARDWARE, OPAL_PREDICTIVE_ERR_GENERAL, + OPAL_NA); + +DEFINE_LOG_ENTRY(OPAL_RC_XSCOM_RESET, OPAL_PLATFORM_ERR_EVT, OPAL_XSCOM, + OPAL_CEC_HARDWARE, OPAL_PREDICTIVE_ERR_GENERAL, + OPAL_NA); + +DEFINE_LOG_ENTRY(OPAL_RC_XSCOM_BUSY, OPAL_PLATFORM_ERR_EVT, OPAL_XSCOM, + OPAL_CEC_HARDWARE, OPAL_PREDICTIVE_ERR_GENERAL, + OPAL_NA); + +/* xscom details to trigger xstop */ +static struct { + uint64_t addr; + uint64_t fir_bit; +} xstop_xscom; + +/* + * Locking notes: + * + * We used to have a per-target lock. 
However due to errata HW822317 + * we can have issues on the issuer side if multiple threads try to + * send XSCOMs simultaneously (HMER responses get mixed up), so just + * use a global lock instead + */ +static struct lock xscom_lock = LOCK_UNLOCKED; + +static inline void *xscom_addr(uint32_t gcid, uint32_t pcb_addr) +{ + struct proc_chip *chip = get_chip(gcid); + uint64_t addr; + + assert(chip); + addr = chip->xscom_base; + if (proc_gen == proc_gen_p8) { + addr |= ((uint64_t)pcb_addr << 4) & ~0xfful; + addr |= (pcb_addr << 3) & 0x78; + } else + addr |= ((uint64_t)pcb_addr << 3); + return (void *)addr; +} + +static uint64_t xscom_wait_done(void) +{ + uint64_t hmer; + + do + hmer = mfspr(SPR_HMER); + while(!(hmer & SPR_HMER_XSCOM_DONE)); + + /* + * HW822317: We need to read a second time as the actual + * status can be delayed by 1 cycle after DONE + */ + return mfspr(SPR_HMER); +} + +static void xscom_reset(uint32_t gcid, bool need_delay) +{ + u64 hmer; + uint32_t recv_status_reg, log_reg, err_reg; + struct timespec ts; + + /* Clear errors in HMER */ + mtspr(SPR_HMER, HMER_CLR_MASK); + + /* Setup local and target scom addresses */ + if (proc_gen == proc_gen_p10) { + recv_status_reg = 0x00090018; + log_reg = 0x0090012; + err_reg = 0x0090013; + } else if (proc_gen == proc_gen_p9) { + recv_status_reg = 0x00090018; + log_reg = 0x0090012; + err_reg = 0x0090013; + } else { + recv_status_reg = 0x202000f; + log_reg = 0x2020007; + err_reg = 0x2020009; + } + + /* First we need to write 0 to a register on our chip */ + out_be64(xscom_addr(this_cpu()->chip_id, recv_status_reg), 0); + hmer = xscom_wait_done(); + if (hmer & SPR_HMER_XSCOM_FAIL) + goto fail; + + /* Then we need to clear those two other registers on the target */ + out_be64(xscom_addr(gcid, log_reg), 0); + hmer = xscom_wait_done(); + if (hmer & SPR_HMER_XSCOM_FAIL) + goto fail; + out_be64(xscom_addr(gcid, err_reg), 0); + hmer = xscom_wait_done(); + if (hmer & SPR_HMER_XSCOM_FAIL) + goto fail; + + if (need_delay) { + /* + * Its observed that sometimes immediate retry of + * XSCOM operation returns wrong data. Adding a + * delay for XSCOM reset to be effective. Delay of + * 10 ms is found to be working fine experimentally. + * FIXME: Replace 10ms delay by exact delay needed + * or other alternate method to confirm XSCOM reset + * completion, after checking from HW folks. + */ + ts.tv_sec = 0; + ts.tv_nsec = 10 * 1000; + nanosleep_nopoll(&ts, NULL); + } + return; + fail: + /* Fatal error resetting XSCOM */ + log_simple_error(&e_info(OPAL_RC_XSCOM_RESET), + "XSCOM: Fatal error resetting engine after failed access !\n"); + + /* XXX Generate error log ? attn ? panic ? + * If we decide to panic, change the above severity to PANIC + */ +} + +static int xscom_clear_error(uint32_t gcid, uint32_t pcb_addr) +{ + u64 hmer; + uint32_t base_xscom_addr; + uint32_t xscom_clear_reg = 0x20010800; + + /* only in case of p9 */ + if (proc_gen != proc_gen_p9) + return 0; + +/* xscom clear address range/mask */ +#define XSCOM_CLEAR_RANGE_START 0x20010A00 +#define XSCOM_CLEAR_RANGE_END 0x20010ABF +#define XSCOM_CLEAR_RANGE_MASK 0x200FFBFF + + /* + * Due to a hardware issue where core responding to scom was delayed + * due to thread reconfiguration, leaves the scom logic in a state + * where the subsequent scom to that core can get errors. This is + * affected for Core PC scom registers in the range of + * 20010A80-20010ABF. 
+ * + * The solution is if a xscom timeout occurs to one of Core PC scom + * registers in the range of 20010A80-20010ABF, a clearing scom + * write is done to 0x20010800 with data of '0x00000000' which will + * also get a timeout but clears the scom logic errors. After the + * clearing write is done the original scom operation can be retried. + * + * The scom timeout is reported as status 0x4 (Invalid address) + * in HMER[21-23]. + */ + + base_xscom_addr = pcb_addr & XSCOM_CLEAR_RANGE_MASK; + if (!((base_xscom_addr >= XSCOM_CLEAR_RANGE_START) && + (base_xscom_addr <= XSCOM_CLEAR_RANGE_END))) + return 0; + + /* + * Reset the XSCOM or next scom operation will fail. + * We also need a small delay before we go ahead with clearing write. + * We have observed that without a delay the clearing write has reported + * a wrong status. + */ + xscom_reset(gcid, true); + + /* Clear errors in HMER */ + mtspr(SPR_HMER, HMER_CLR_MASK); + + /* Write 0 to clear the xscom logic errors on target chip */ + out_be64(xscom_addr(gcid, xscom_clear_reg), 0); + hmer = xscom_wait_done(); + + /* + * Above clearing xscom write will timeout and error out with + * invalid access as there is no register at that address. This + * xscom operation just helps to clear the xscom logic error. + * + * On failure, reset the XSCOM or we'll hang on the next access + */ + if (hmer & SPR_HMER_XSCOM_FAIL) + xscom_reset(gcid, true); + + return 1; +} + +static int64_t xscom_handle_error(uint64_t hmer, uint32_t gcid, uint32_t pcb_addr, + bool is_write, int64_t retries, + int64_t *xscom_clear_retries) +{ + unsigned int stat = GETFIELD(SPR_HMER_XSCOM_STATUS, hmer); + int64_t rc = OPAL_HARDWARE; + + /* XXX Figure out error codes from doc and error + * recovery procedures + */ + switch(stat) { + case 1: + /* + * XSCOM engine is blocked, need to retry. Reset XSCOM + * engine after crossing retry threshold before + * retrying again. + */ + if (retries && !(retries % XSCOM_BUSY_RESET_THRESHOLD)) { + prlog(PR_NOTICE, "XSCOM: Busy even after %d retries, " + "resetting XSCOM now. Total retries = %lld\n", + XSCOM_BUSY_RESET_THRESHOLD, retries); + xscom_reset(gcid, true); + + } + + /* Log error if we have retried enough and its still busy */ + if (retries == XSCOM_BUSY_MAX_RETRIES) + log_simple_error(&e_info(OPAL_RC_XSCOM_BUSY), + "XSCOM: %s-busy error gcid=0x%x pcb_addr=0x%x " + "stat=0x%x\n", is_write ? "write" : "read", + gcid, pcb_addr, stat); + return OPAL_XSCOM_BUSY; + + case 2: /* CPU is asleep, reset XSCOM engine and return */ + xscom_reset(gcid, false); + return OPAL_XSCOM_CHIPLET_OFF; + case 3: /* Partial good */ + rc = OPAL_XSCOM_PARTIAL_GOOD; + break; + case 4: /* Invalid address / address error */ + rc = OPAL_XSCOM_ADDR_ERROR; + if (xscom_clear_error(gcid, pcb_addr)) { + /* return busy if retries still pending. 
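+			 * OPAL_XSCOM_BUSY is an alias for OPAL_BUSY, so the
+			 * retry loops in __xscom_read()/__xscom_write() will
+			 * re-issue the original access after the clearing
+			 * write done in xscom_clear_error().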
*/ + if ((*xscom_clear_retries)--) + return OPAL_XSCOM_BUSY; + + prlog(PR_DEBUG, "XSCOM: error recovery failed for " + "gcid=0x%x pcb_addr=0x%x\n", gcid, pcb_addr); + + } + break; + case 5: /* Clock error */ + rc = OPAL_XSCOM_CLOCK_ERROR; + break; + case 6: /* Parity error */ + rc = OPAL_XSCOM_PARITY_ERROR; + break; + case 7: /* Time out */ + rc = OPAL_XSCOM_TIMEOUT; + break; + } + + /* + * If we're in an XSCOM opal call then squash the error + * we assume that the caller (probably opal-prd) will + * handle logging it + */ + if (this_cpu()->current_token != OPAL_XSCOM_READ && + this_cpu()->current_token != OPAL_XSCOM_WRITE) { + log_simple_error(&e_info(OPAL_RC_XSCOM_RW), + "XSCOM: %s error gcid=0x%x pcb_addr=0x%x stat=0x%x\n", + is_write ? "write" : "read", gcid, pcb_addr, stat); + } + + /* We need to reset the XSCOM or we'll hang on the next access */ + xscom_reset(gcid, false); + + /* Non recovered ... just fail */ + return rc; +} + +static void xscom_handle_ind_error(uint64_t data, uint32_t gcid, + uint64_t pcb_addr, bool is_write) +{ + unsigned int stat = GETFIELD(XSCOM_DATA_IND_ERR, data); + bool timeout = !(data & XSCOM_DATA_IND_COMPLETE); + + /* XXX: Create error log entry ? */ + if (timeout) + log_simple_error(&e_info(OPAL_RC_XSCOM_INDIRECT_RW), + "XSCOM: indirect %s timeout, gcid=0x%x pcb_addr=0x%llx" + " stat=0x%x\n", + is_write ? "write" : "read", gcid, pcb_addr, stat); + else + log_simple_error(&e_info(OPAL_RC_XSCOM_INDIRECT_RW), + "XSCOM: indirect %s error, gcid=0x%x pcb_addr=0x%llx" + " stat=0x%x\n", + is_write ? "write" : "read", gcid, pcb_addr, stat); +} + +static bool xscom_gcid_ok(uint32_t gcid) +{ + return get_chip(gcid) != NULL; +} + +/* Determine if SCOM address is multicast */ +static inline bool xscom_is_multicast_addr(uint32_t addr) +{ + return (((addr >> 30) & 0x1) == 0x1); +} + +/* + * Low level XSCOM access functions, perform a single direct xscom + * access via MMIO + */ +static int __xscom_read(uint32_t gcid, uint32_t pcb_addr, uint64_t *val) +{ + uint64_t hmer; + int64_t ret, retries; + int64_t xscom_clear_retries = XSCOM_CLEAR_MAX_RETRIES; + + if (!xscom_gcid_ok(gcid)) { + prerror("%s: invalid XSCOM gcid 0x%x\n", __func__, gcid); + return OPAL_PARAMETER; + } + + for (retries = 0; retries <= XSCOM_BUSY_MAX_RETRIES; retries++) { + /* Clear status bits in HMER (HMER is special + * writing to it *ands* bits + */ + mtspr(SPR_HMER, HMER_CLR_MASK); + + /* Read value from SCOM */ + *val = in_be64(xscom_addr(gcid, pcb_addr)); + + /* Wait for done bit */ + hmer = xscom_wait_done(); + + /* Check for error */ + if (!(hmer & SPR_HMER_XSCOM_FAIL)) + return OPAL_SUCCESS; + + /* Handle error and possibly eventually retry */ + ret = xscom_handle_error(hmer, gcid, pcb_addr, false, retries, + &xscom_clear_retries); + if (ret != OPAL_BUSY) + break; + } + + /* Do not print error message for multicast SCOMS */ + if (xscom_is_multicast_addr(pcb_addr) && ret == OPAL_XSCOM_CHIPLET_OFF) + return ret; + + /* + * Workaround on P9: PRD does operations it *knows* will fail with this + * error to work around a hardware issue where accesses via the PIB + * (FSI or OCC) work as expected, accesses via the ADU (what xscom goes + * through) do not. The chip logic will always return all FFs if there + * is any error on the scom. + */ + if (proc_gen == proc_gen_p9 && ret == OPAL_XSCOM_CHIPLET_OFF) + return ret; + + /* + * If an OPAL call XSCOM read fails, then the OPAL-PRD will + * handle logging the error. Hence just print an + * informational message here. 
+ */ + if (this_cpu()->current_token == OPAL_XSCOM_READ) + prlog(PR_INFO, "XSCOM: Read failed, ret = %lld\n", ret); + else + prerror("XSCOM: Read failed, ret = %lld\n", ret); + + return ret; +} + +static int __xscom_write(uint32_t gcid, uint32_t pcb_addr, uint64_t val) +{ + uint64_t hmer; + int64_t ret, retries = 0; + int64_t xscom_clear_retries = XSCOM_CLEAR_MAX_RETRIES; + + if (!xscom_gcid_ok(gcid)) { + prerror("%s: invalid XSCOM gcid 0x%x\n", __func__, gcid); + return OPAL_PARAMETER; + } + + for (retries = 0; retries <= XSCOM_BUSY_MAX_RETRIES; retries++) { + /* Clear status bits in HMER (HMER is special + * writing to it *ands* bits + */ + mtspr(SPR_HMER, HMER_CLR_MASK); + + /* Write value to SCOM */ + out_be64(xscom_addr(gcid, pcb_addr), val); + + /* Wait for done bit */ + hmer = xscom_wait_done(); + + /* Check for error */ + if (!(hmer & SPR_HMER_XSCOM_FAIL)) + return OPAL_SUCCESS; + + /* Handle error and possibly eventually retry */ + ret = xscom_handle_error(hmer, gcid, pcb_addr, true, retries, + &xscom_clear_retries); + if (ret != OPAL_BUSY) + break; + } + + /* Do not print error message for multicast SCOMS */ + if (xscom_is_multicast_addr(pcb_addr) && ret == OPAL_XSCOM_CHIPLET_OFF) + return ret; + + /* + * Workaround on P9: PRD does operations it *knows* will fail with this + * error to work around a hardware issue where accesses via the PIB + * (FSI or OCC) work as expected, accesses via the ADU (what xscom goes + * through) do not. The chip logic will always return all FFs if there + * is any error on the scom. + */ + if (proc_gen == proc_gen_p9 && ret == OPAL_XSCOM_CHIPLET_OFF) + return ret; + /* + * If an OPAL call XSCOM write fails, then the OPAL-PRD will + * handle logging the error. Hence just print an + * informational message here. 
+ */ + if (this_cpu()->current_token == OPAL_XSCOM_WRITE) + prlog(PR_INFO, "XSCOM: Write failed, ret = %lld\n", ret); + else + prerror("XSCOM: Write failed, ret = %lld\n", ret); + + return ret; +} + +/* + * Indirect XSCOM access functions + */ +static int xscom_indirect_read_form0(uint32_t gcid, uint64_t pcb_addr, + uint64_t *val) +{ + uint32_t addr; + uint64_t data; + int rc, retries; + + /* Write indirect address */ + addr = pcb_addr & 0x7fffffff; + data = XSCOM_DATA_IND_READ | + (pcb_addr & XSCOM_ADDR_IND_ADDR); + rc = __xscom_write(gcid, addr, data); + if (rc) + goto bail; + + /* Wait for completion */ + for (retries = 0; retries < XSCOM_IND_MAX_RETRIES; retries++) { + rc = __xscom_read(gcid, addr, &data); + if (rc) + goto bail; + if ((data & XSCOM_DATA_IND_COMPLETE) && + ((data & XSCOM_DATA_IND_ERR) == 0)) { + *val = data & XSCOM_DATA_IND_DATA; + break; + } + if ((data & XSCOM_DATA_IND_COMPLETE) || + (retries >= XSCOM_IND_MAX_RETRIES)) { + xscom_handle_ind_error(data, gcid, pcb_addr, + false); + rc = OPAL_HARDWARE; + goto bail; + } + } + bail: + if (rc) + *val = (uint64_t)-1; + return rc; +} + +static int xscom_indirect_form(uint64_t pcb_addr) +{ + return (pcb_addr >> 60) & 1; +} + +static int xscom_indirect_read(uint32_t gcid, uint64_t pcb_addr, uint64_t *val) +{ + uint64_t form = xscom_indirect_form(pcb_addr); + + if ((proc_gen >= proc_gen_p9) && (form == 1)) + return OPAL_UNSUPPORTED; + + return xscom_indirect_read_form0(gcid, pcb_addr, val); +} + +static int xscom_indirect_write_form0(uint32_t gcid, uint64_t pcb_addr, + uint64_t val) +{ + uint32_t addr; + uint64_t data; + int rc, retries; + + /* Only 16 bit data with indirect */ + if (val & ~(XSCOM_ADDR_IND_DATA)) + return OPAL_PARAMETER; + + /* Write indirect address & data */ + addr = pcb_addr & 0x7fffffff; + data = pcb_addr & XSCOM_ADDR_IND_ADDR; + data |= val & XSCOM_ADDR_IND_DATA; + + rc = __xscom_write(gcid, addr, data); + if (rc) + goto bail; + + /* Wait for completion */ + for (retries = 0; retries < XSCOM_IND_MAX_RETRIES; retries++) { + rc = __xscom_read(gcid, addr, &data); + if (rc) + goto bail; + if ((data & XSCOM_DATA_IND_COMPLETE) && + ((data & XSCOM_DATA_IND_ERR) == 0)) + break; + if ((data & XSCOM_DATA_IND_COMPLETE) || + (retries >= XSCOM_IND_MAX_RETRIES)) { + xscom_handle_ind_error(data, gcid, pcb_addr, + true); + rc = OPAL_HARDWARE; + goto bail; + } + } + bail: + return rc; +} + +static int xscom_indirect_write_form1(uint32_t gcid, uint64_t pcb_addr, + uint64_t val) +{ + uint32_t addr; + uint64_t data; + + if (proc_gen < proc_gen_p9) + return OPAL_UNSUPPORTED; + if (val & ~(XSCOM_DATA_IND_FORM1_DATA)) + return OPAL_PARAMETER; + + /* Mangle address and data for form1 */ + addr = (pcb_addr & 0x000ffffffffUL); + data = (pcb_addr & 0xfff00000000UL) << 20; + data |= val; + return __xscom_write(gcid, addr, data); +} + +static int xscom_indirect_write(uint32_t gcid, uint64_t pcb_addr, uint64_t val) +{ + uint64_t form = xscom_indirect_form(pcb_addr); + + if ((proc_gen >= proc_gen_p9) && (form == 1)) + return xscom_indirect_write_form1(gcid, pcb_addr, val); + + return xscom_indirect_write_form0(gcid, pcb_addr, val); +} + +static uint32_t xscom_decode_chiplet(uint32_t partid, uint64_t *pcb_addr) +{ + uint32_t gcid = (partid & 0x0fffffff) >> 4; + uint32_t core = partid & 0xf; + + if (proc_gen >= proc_gen_p9) { + /* XXX Not supported */ + *pcb_addr = 0; + } else { + *pcb_addr |= P8_EX_PCB_SLAVE_BASE; + *pcb_addr |= core << 24; + } + + return gcid; +} + +void _xscom_lock(void) +{ + lock(&xscom_lock); +} + +void 
_xscom_unlock(void) +{ + unlock(&xscom_lock); +} + +/* sorted by the scom controller's partid */ +static LIST_HEAD(scom_list); + +int64_t scom_register(struct scom_controller *new) +{ + struct scom_controller *cur; + + list_for_each(&scom_list, cur, link) { + if (cur->part_id == new->part_id) { + prerror("Attempted to add duplicate scom, partid %x\n", + new->part_id); + return OPAL_BUSY; + } + + if (cur->part_id > new->part_id) { + list_add_before(&scom_list, &new->link, &cur->link); + return 0; + } + } + + /* if we never find a larger partid then this is the largest */ + list_add_tail(&scom_list, &new->link); + + return 0; +} + +static struct scom_controller *scom_find(uint32_t partid) +{ + struct scom_controller *cur; + + list_for_each(&scom_list, cur, link) + if (partid == cur->part_id) + return cur; + + return NULL; +} + +static int64_t scom_read(struct scom_controller *scom, uint32_t partid, + uint64_t pcbaddr, uint64_t *val) +{ + int64_t rc = scom->read(scom, partid, pcbaddr, val); + + if (rc) { + prerror("%s: to %x off: %llx rc = %lld\n", + __func__, partid, pcbaddr, rc); + } + + return rc; +} + +static int64_t scom_write(struct scom_controller *scom, uint32_t partid, + uint64_t pcbaddr, uint64_t val) +{ + int64_t rc = scom->write(scom, partid, pcbaddr, val); + + if (rc) { + prerror("%s: to %x off: %llx rc = %lld\n", + __func__, partid, pcbaddr, rc); + } + + return rc; +} + +/* + * External API + */ +int _xscom_read(uint32_t partid, uint64_t pcb_addr, uint64_t *val, bool take_lock) +{ + struct scom_controller *scom; + uint32_t gcid; + int rc; + + if (!opal_addr_valid(val)) + return OPAL_PARAMETER; + + /* Due to a bug in some versions of the PRD wrapper app, errors + * might not be properly forwarded to PRD, in which case the data + * set here will be used. Rather than a random value let's thus + * initialize the data to a known clean state. + */ + *val = 0xdeadbeefdeadbeefull; + + /* Handle part ID decoding */ + switch(partid >> 28) { + case 0: /* Normal processor chip */ + gcid = partid; + break; + case 4: /* EX chiplet */ + gcid = xscom_decode_chiplet(partid, &pcb_addr); + if (pcb_addr == 0) + return OPAL_UNSUPPORTED; + break; + default: + /* is it one of our hacks? */ + scom = scom_find(partid); + if (scom) + return scom_read(scom, partid, pcb_addr, val); + + /** + * @fwts-label XSCOMReadInvalidPartID + * @fwts-advice xscom_read was called with an invalid partid. + * There's likely a bug somewhere in the stack that's causing + * someone to try an xscom_read on something that isn't a + * processor, Centaur or EX chiplet. 
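+		 * The encodings handled above are: top nibble 0 for a plain
+		 * processor chip (partid is the chip id), top nibble 4 for
+		 * an EX chiplet (chip id in bits 27:4, core number in the
+		 * low nibble), or a partid matching a registered
+		 * scom_controller (e.g. Centaur).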
+ */ + prerror("%s: invalid XSCOM partid 0x%x\n", __func__, partid); + return OPAL_PARAMETER; + } + + /* HW822317 requires us to do global locking */ + if (take_lock) + lock(&xscom_lock); + + /* Direct vs indirect access */ + if (pcb_addr & XSCOM_ADDR_IND_FLAG) + rc = xscom_indirect_read(gcid, pcb_addr, val); + else + rc = __xscom_read(gcid, pcb_addr & 0x7fffffff, val); + + /* Unlock it */ + if (take_lock) + unlock(&xscom_lock); + return rc; +} + +static int64_t opal_xscom_read(uint32_t partid, uint64_t pcb_addr, __be64 *__val) +{ + uint64_t val; + int64_t rc; + + rc = xscom_read(partid, pcb_addr, &val); + *__val = cpu_to_be64(val); + + return rc; +} +opal_call(OPAL_XSCOM_READ, opal_xscom_read, 3); + +int _xscom_write(uint32_t partid, uint64_t pcb_addr, uint64_t val, bool take_lock) +{ + struct scom_controller *scom; + uint32_t gcid; + int rc; + + /* Handle part ID decoding */ + switch(partid >> 28) { + case 0: /* Normal processor chip */ + gcid = partid; + break; + case 4: /* EX chiplet */ + gcid = xscom_decode_chiplet(partid, &pcb_addr); + break; + default: + /* is it one of our hacks? */ + scom = scom_find(partid); + if (scom) + return scom_write(scom, partid, pcb_addr, val); + + /** + * @fwts-label XSCOMWriteInvalidPartID + * @fwts-advice xscom_write was called with an invalid partid. + * There's likely a bug somewhere in the stack that's causing + * someone to try an xscom_write on something that isn't a + * processor, Centaur or EX chiplet. + */ + prerror("%s: invalid XSCOM partid 0x%x\n", __func__, partid); + return OPAL_PARAMETER; + } + + /* HW822317 requires us to do global locking */ + if (take_lock) + lock(&xscom_lock); + + /* Direct vs indirect access */ + if (pcb_addr & XSCOM_ADDR_IND_FLAG) + rc = xscom_indirect_write(gcid, pcb_addr, val); + else + rc = __xscom_write(gcid, pcb_addr & 0x7fffffff, val); + + /* Unlock it */ + if (take_lock) + unlock(&xscom_lock); + return rc; +} + +static int64_t opal_xscom_write(uint32_t partid, uint64_t pcb_addr, uint64_t val) +{ + return xscom_write(partid, pcb_addr, val); +} +opal_call(OPAL_XSCOM_WRITE, opal_xscom_write, 3); + +/* + * Perform a xscom read-modify-write. 
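+ *
+ * Only the bits set in 'mask' are updated; every other bit keeps its
+ * current value. An illustrative call (placeholder arguments) that sets
+ * a single bit while preserving the rest of the register:
+ *
+ *   xscom_write_mask(chip_id, addr, PPC_BIT(bit), PPC_BIT(bit));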
+ */
+int xscom_write_mask(uint32_t partid, uint64_t pcb_addr, uint64_t val, uint64_t mask)
+{
+	int rc;
+	uint64_t old_val;
+
+	rc = xscom_read(partid, pcb_addr, &old_val);
+	if (rc)
+		return rc;
+	val = (old_val & ~mask) | (val & mask);
+	return xscom_write(partid, pcb_addr, val);
+}
+
+int xscom_readme(uint64_t pcb_addr, uint64_t *val)
+{
+	return xscom_read(this_cpu()->chip_id, pcb_addr, val);
+}
+
+int xscom_writeme(uint64_t pcb_addr, uint64_t val)
+{
+	return xscom_write(this_cpu()->chip_id, pcb_addr, val);
+}
+
+int64_t xscom_read_cfam_chipid(uint32_t partid, uint32_t *chip_id)
+{
+	uint64_t val;
+	int64_t rc = OPAL_SUCCESS;
+
+	/* Mambo chip model lacks the f000f register, just make
+	 * something up
+	 */
+	if (chip_quirk(QUIRK_NO_F000F)) {
+		if (proc_gen == proc_gen_p10)
+			val = 0x120DA04980000000UL; /* P10 DD1.0 */
+		else if (proc_gen == proc_gen_p9)
+			val = 0x203D104980000000UL; /* P9 Nimbus DD2.3 */
+		else
+			val = 0x221EF04980000000UL; /* P8 Murano DD2.1 */
+	} else
+		rc = xscom_read(partid, 0xf000f, &val);
+
+	/* Extract CFAM id */
+	if (rc == OPAL_SUCCESS)
+		*chip_id = (uint32_t)(val >> 44);
+
+	return rc;
+}
+
+static void xscom_init_chip_info(struct proc_chip *chip)
+{
+	uint32_t val;
+	int64_t rc;
+
+	rc = xscom_read_cfam_chipid(chip->id, &val);
+	if (rc) {
+		prerror("XSCOM: Error %lld reading 0xf000f register\n", rc);
+		/* We leave chip type to UNKNOWN */
+		return;
+	}
+
+	/* Identify chip */
+	switch(val & 0xff) {
+	case 0xef:
+		chip->type = PROC_CHIP_P8_MURANO;
+		assert(proc_gen == proc_gen_p8);
+		break;
+	case 0xea:
+		chip->type = PROC_CHIP_P8_VENICE;
+		assert(proc_gen == proc_gen_p8);
+		break;
+	case 0xd3:
+		chip->type = PROC_CHIP_P8_NAPLES;
+		assert(proc_gen == proc_gen_p8);
+		break;
+	case 0xd1:
+		chip->type = PROC_CHIP_P9_NIMBUS;
+		assert(proc_gen == proc_gen_p9);
+		break;
+	case 0xd4:
+		chip->type = PROC_CHIP_P9_CUMULUS;
+		assert(proc_gen == proc_gen_p9);
+		break;
+	case 0xd9:
+		chip->type = PROC_CHIP_P9P;
+		assert(proc_gen == proc_gen_p9);
+		break;
+	case 0xda:
+		chip->type = PROC_CHIP_P10;
+		assert(proc_gen == proc_gen_p10);
+		break;
+	default:
+		printf("CHIP: Unknown chip type 0x%02x !!!\n",
+		       (unsigned char)(val & 0xff));
+	}
+
+	/* Get EC level from CFAM ID */
+	chip->ec_level = ((val >> 16) & 0xf) << 4;
+	chip->ec_level |= (val >> 8) & 0xf;
+
+	/*
+	 * On P9, grab the ECID bits to differentiate
+	 * DD1.01, 1.02, 2.00, etc.
+	 */
+	if (chip_quirk(QUIRK_MAMBO_CALLOUTS)) {
+		chip->ec_rev = 0;
+	} else if (proc_gen == proc_gen_p9) {
+		uint64_t ecid2 = 0;
+		uint8_t rev;
+		xscom_read(chip->id, 0x18002, &ecid2);
+		switch((ecid2 >> 45) & 7) {
+		case 0:
+			rev = 0;
+			break;
+		case 1:
+			rev = 1;
+			break;
+		case 3:
+			rev = 2;
+			break;
+		case 7:
+			rev = 3;
+			break;
+		default:
+			rev = 0;
+		}
+		prlog(PR_INFO,"P9 DD%i.%i%d detected\n", 0xf & (chip->ec_level >> 4),
+		      chip->ec_level & 0xf, rev);
+		chip->ec_rev = rev;
+	} /* XXX P10 */
+}
+
+/*
+* This function triggers an xstop by writing to XSCOM.
+* The machine will enter the xstop state once this write completes.
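+* The FIR address and bit come from the "ibm,sw-checkstop-fir" device
+* tree property (parsed in xscom_init() below), and the write itself is:
+*
+*   xscom_writeme(xstop_xscom.addr, PPC_BIT(xstop_xscom.fir_bit));
+*
+* The "opal-sw-xstop=disable" nvram option (or a missing property)
+* makes this return OPAL_UNSUPPORTED instead.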
+*/ +int64_t xscom_trigger_xstop(void) +{ + int rc = OPAL_UNSUPPORTED; + bool xstop_disabled = false; + + if (nvram_query_eq_dangerous("opal-sw-xstop", "disable")) + xstop_disabled = true; + + if (xstop_disabled) { + prlog(PR_NOTICE, "Software initiated checkstop disabled.\n"); + return rc; + } + + if (xstop_xscom.addr) + rc = xscom_writeme(xstop_xscom.addr, + PPC_BIT(xstop_xscom.fir_bit)); + + return rc; +} + +void xscom_init(void) +{ + struct dt_node *xn; + const struct dt_property *p; + + dt_for_each_compatible(dt_root, xn, "ibm,xscom") { + uint32_t gcid = dt_get_chip_id(xn); + const struct dt_property *reg; + struct proc_chip *chip; + const char *chip_name; + static const char *chip_names[] = { + "UNKNOWN", "P8E", "P8", "P8NVL", "P9N", "P9C", "P9P", + "P10", + }; + + chip = get_chip(gcid); + assert(chip); + + /* XXX We need a proper address parsing. For now, we just + * "know" that we are looking at a u64 + */ + reg = dt_find_property(xn, "reg"); + assert(reg); + + chip->xscom_base = dt_translate_address(xn, 0, NULL); + + /* Grab processor type and EC level */ + xscom_init_chip_info(chip); + + if (chip->type >= ARRAY_SIZE(chip_names)) + chip_name = "INVALID"; + else + chip_name = chip_names[chip->type]; + + /* We keep a "CHIP" prefix to make the log more user-friendly */ + prlog(PR_NOTICE, "CHIP: Chip ID %04x type: %s DD%x.%x%d\n", + gcid, chip_name, chip->ec_level >> 4, + chip->ec_level & 0xf, chip->ec_rev); + prlog(PR_DEBUG, "XSCOM: Base address: 0x%llx\n", chip->xscom_base); + } + + /* Collect details to trigger xstop via XSCOM write */ + p = dt_find_property(dt_root, "ibm,sw-checkstop-fir"); + if (p) { + xstop_xscom.addr = dt_property_get_cell(p, 0); + xstop_xscom.fir_bit = dt_property_get_cell(p, 1); + prlog(PR_DEBUG, "XSTOP: XSCOM addr = 0x%llx, FIR bit = %lld\n", + xstop_xscom.addr, xstop_xscom.fir_bit); + } else + prlog(PR_DEBUG, "XSTOP: ibm,sw-checkstop-fir prop not found\n"); +} + +void xscom_used_by_console(void) +{ + xscom_lock.in_con_path = true; + + /* + * Some other processor might hold it without having + * disabled the console locally so let's make sure that + * is over by taking/releasing the lock ourselves + */ + lock(&xscom_lock); + unlock(&xscom_lock); +} + +bool xscom_ok(void) +{ + return !lock_held_by_me(&xscom_lock); +} |
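A minimal sketch of how calling code might use the helpers above, assuming the
usual skiboot includes; the FIR address parameter and bit 12 are hypothetical
placeholders chosen only for illustration:

static void example_xscom_usage(struct proc_chip *chip, uint64_t fir_addr)
{
	uint32_t cfam = 0;
	uint64_t val = 0;

	/* Identify the chip from its CFAM id register (0xf000f) */
	if (xscom_read_cfam_chipid(chip->id, &cfam) == OPAL_SUCCESS)
		prlog(PR_INFO, "CFAM id: 0x%08x\n", cfam);

	/* Direct read of the same register on the local chip */
	xscom_readme(0xf000f, &val);

	/* Set bit 12 of the caller-supplied FIR, leaving the rest intact */
	xscom_write_mask(chip->id, fir_addr, PPC_BIT(12), PPC_BIT(12));
}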