author     Angelos Mouzakitis <a.mouzakitis@virtualopensystems.com>   2023-10-10 14:33:42 +0000
committer  Angelos Mouzakitis <a.mouzakitis@virtualopensystems.com>   2023-10-10 14:33:42 +0000
commit     af1a266670d040d2f4083ff309d732d648afba2a (patch)
tree       2fc46203448ddcc6f81546d379abfaeb323575e9 /roms/skiboot/hw/npu-hw-procedures.c
parent     e02cda008591317b1625707ff8e115a4841aa889 (diff)
Change-Id: Iaf8d18082d3991dec7c0ebbea540f092188eb4ec
Diffstat (limited to 'roms/skiboot/hw/npu-hw-procedures.c')
-rw-r--r--  roms/skiboot/hw/npu-hw-procedures.c  608
1 file changed, 608 insertions, 0 deletions
diff --git a/roms/skiboot/hw/npu-hw-procedures.c b/roms/skiboot/hw/npu-hw-procedures.c
new file mode 100644
index 000000000..91bbb0f15
--- /dev/null
+++ b/roms/skiboot/hw/npu-hw-procedures.c
@@ -0,0 +1,608 @@
// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
/*
 * NPU (NVLink1, POWER8NVL) Hardware Procedures
 *
 * Copyright 2013-2019 IBM Corp.
 */

#include <skiboot.h>
#include <io.h>
#include <timebase.h>
#include <pci.h>
#include <pci-virt.h>
#include <interrupts.h>
#include <npu-regs.h>
#include <npu.h>
#include <xscom.h>

typedef uint32_t (*step)(struct npu_dev *);

struct procedure {
        const char *name;
        step steps[];
};

#define DEFINE_PROCEDURE(NAME, STEPS...)                \
        static struct procedure procedure_##NAME =      \
        {.name = #NAME, .steps = {NAME, ##STEPS}}

#define PROCEDURE_INPROGRESS    (1 << 31)
#define PROCEDURE_COMPLETE      (1 << 30)
#define PROCEDURE_NEXT          (1 << 29)
#define PROCEDURE_FAILED        2
#define PROCEDURE_ABORTED       3
#define PROCEDURE_UNSUPPORTED   4

/* Mask defining which status bits we want to expose */
#define PROCEDURE_STATUS_MASK   0xc000000f

/* Accessors for PHY registers. These accesses can be done either via
 * MMIO or SCOM. */
static bool pl_use_scom = true;

static void phy_write(struct npu_dev *npu_dev, uint64_t addr, uint32_t val)
{
        if (pl_use_scom)
                xscom_write(npu_dev->npu->chip_id, npu_dev->pl_xscom_base | addr, val);
        else
                out_be16((void *) npu_dev->pl_base + PL_MMIO_ADDR(addr), val);
}

static uint16_t phy_read(struct npu_dev *npu_dev, uint64_t addr)
{
        uint64_t val;

        if (pl_use_scom)
                xscom_read(npu_dev->npu->chip_id, npu_dev->pl_xscom_base + addr, &val);
        else
                val = in_be16((void *) npu_dev->pl_base + PL_MMIO_ADDR(addr));

        return val & 0xffff;
}

/* The DL registers can be accessed indirectly via the NTL */
static void dl_write(struct npu_dev *npu_dev, uint32_t addr, uint32_t val)
{
        xscom_write(npu_dev->npu->chip_id,
                    npu_dev->xscom + NX_DL_REG_ADDR, addr);
        xscom_write(npu_dev->npu->chip_id,
                    npu_dev->xscom + NX_DL_REG_DATA, val);
}

static uint64_t __unused dl_read(struct npu_dev *npu_dev, uint32_t addr)
{
        uint64_t val;

        xscom_write(npu_dev->npu->chip_id,
                    npu_dev->xscom + NX_DL_REG_ADDR, addr);
        xscom_read(npu_dev->npu->chip_id,
                   npu_dev->xscom + NX_DL_REG_DATA, &val);
        return val;
}

/* Our hardware bits are backwards here. The lane vectors are 16-bit
 * values represented in IBM bit ordering. This means lane 0 is
 * represented by bit 15 in most of the registers. Internally we keep
 * this sane (i.e. npu_dev->lane_mask[0] == lane 0) as we need sane
 * numbering for set_lane_reg() anyway.
 */
static uint32_t phy_lane_mask(struct npu_dev *npu_dev)
{
        /* We only train 8 lanes at a time so we don't do a full
         * bit-swap */
        assert(npu_dev->lane_mask == 0xff00 || npu_dev->lane_mask == 0xff);

        return ~npu_dev->lane_mask & 0xffff;
}
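/*
 * Worked example: with npu_dev->lane_mask == 0x00ff (lanes 0-7),
 * phy_lane_mask() returns 0xff00, placing lanes 0-7 in the top byte of
 * the hardware's IBM-bit-ordered lane vectors. Because the assert above
 * only permits the masks 0xff and 0xff00, a 16-bit one's complement is
 * equivalent to the full bit-reversal.
 */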
static void set_lane_reg(struct npu_dev *npu_dev, uint64_t base_reg,
                         uint64_t data, uint64_t mask)
{
        uint64_t val, i;
        uint32_t lane_mask = npu_dev->lane_mask;

        for (i = 0; i <= 23; i++) {
                if (lane_mask & (1ul << i)) {
                        uint64_t tx_rxcal_reg = base_reg + (i << 32);

                        val = phy_read(npu_dev, tx_rxcal_reg);
                        val = (val & ~mask) | data;
                        phy_write(npu_dev, tx_rxcal_reg, val);
                }
        }
}

static uint32_t stop(struct npu_dev *npu_dev __unused)
{
        return PROCEDURE_COMPLETE | PROCEDURE_ABORTED;
}
DEFINE_PROCEDURE(stop);

static uint32_t nop(struct npu_dev *npu_dev __unused)
{
        return PROCEDURE_COMPLETE;
}
DEFINE_PROCEDURE(nop);
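/*
 * For reference, DEFINE_PROCEDURE(nop) above expands to:
 *
 *      static struct procedure procedure_nop =
 *              {.name = "nop", .steps = {nop}};
 *
 * and a multi-step definition such as DEFINE_PROCEDURE(phy_reset,
 * phy_reset_wait, phy_reset_complete) below yields a three-entry
 * steps[] array; get_procedure_status() walks steps[] for as long as a
 * step returns PROCEDURE_NEXT.
 */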
/* Procedure 1.2.1 (RESET_NPU_DL) from opt_programmerguide.odt. Also
 * incorporates AT reset. */
static uint32_t reset_npu_dl(struct npu_dev *npu_dev)
{
        uint64_t val;

        /* Assert NPU reset */
        xscom_read(npu_dev->npu->chip_id, npu_dev->xscom + NX_NTL_CONTROL, &val);
        val |= NTL_CONTROL_RESET;
        xscom_write(npu_dev->npu->chip_id, npu_dev->xscom + NX_NTL_CONTROL, val);

        /* Put the Nvidia logic in reset */
        dl_write(npu_dev, NDL_CONTROL, 0xe8000000);

        /* Release Nvidia logic from reset */
        dl_write(npu_dev, NDL_CONTROL, 0);

        /* Release NPU from reset */
        val &= ~NTL_CONTROL_RESET;
        xscom_write(npu_dev->npu->chip_id, npu_dev->xscom + NX_NTL_CONTROL, val);

        /* Set up TL credits */
        xscom_write(npu_dev->npu->chip_id, npu_dev->xscom + NX_TL_CMD_CR, PPC_BIT(0));
        xscom_write(npu_dev->npu->chip_id, npu_dev->xscom + NX_TL_CMD_D_CR, PPC_BIT(0));
        xscom_write(npu_dev->npu->chip_id, npu_dev->xscom + NX_TL_RSP_CR, PPC_BIT(15));
        xscom_write(npu_dev->npu->chip_id, npu_dev->xscom + NX_TL_RSP_D_CR, PPC_BIT(15));

        /* Reset error registers. TODO: are there more we should clear here? */
        npu_ioda_sel(npu_dev->npu, NPU_IODA_TBL_PESTB, 0, true);
        for (val = 0; val < NPU_NUM_OF_PES; val++)
                out_be64(npu_dev->npu->at_regs + NPU_IODA_DATA0, 0);

        return PROCEDURE_COMPLETE;
}
DEFINE_PROCEDURE(reset_npu_dl);

/* Procedures 1.2.3 (reset_lanes) & 1.2.4
 * (io_register_write_reset_values) */
static uint32_t phy_reset(struct npu_dev *npu_dev)
{
        uint16_t val;

        /* Lower run_lane inputs for lanes to be reset */
        val = phy_read(npu_dev, RX_RUN_LANE_VEC_0_15);
        val &= ~phy_lane_mask(npu_dev);
        phy_write(npu_dev, RX_RUN_LANE_VEC_0_15, val);

        return PROCEDURE_NEXT;
}

static uint32_t phy_reset_wait(struct npu_dev *npu_dev)
{
        uint16_t val;

        /* Wait for lane busy outputs to go to zero for the lanes to be
         * reset */
        val = phy_read(npu_dev, RX_LANE_BUSY_VEC_0_15);
        if (val & phy_lane_mask(npu_dev))
                return PROCEDURE_INPROGRESS;

        return PROCEDURE_NEXT;
}

static uint32_t phy_reset_complete(struct npu_dev *npu_dev)
{
        uint16_t val;
        uint32_t lane_mask = phy_lane_mask(npu_dev);

        /* Set ioreset_vec for the desired lane bit positions */
        val = phy_read(npu_dev, RX_IORESET_VEC_0_15);
        phy_write(npu_dev, RX_IORESET_VEC_0_15, val | lane_mask);

        val = phy_read(npu_dev, TX_IORESET_VEC_0_15);
        phy_write(npu_dev, TX_IORESET_VEC_0_15, val | lane_mask);

        /* Clear ioreset_vec */
        val = phy_read(npu_dev, RX_IORESET_VEC_0_15);
        phy_write(npu_dev, RX_IORESET_VEC_0_15, val & ~lane_mask);

        val = phy_read(npu_dev, TX_IORESET_VEC_0_15);
        phy_write(npu_dev, TX_IORESET_VEC_0_15, val & ~lane_mask);

        /* Reset RX phase rotators */
        set_lane_reg(npu_dev, RX_PR_CNTL_PL, RX_PR_RESET, RX_PR_RESET);
        set_lane_reg(npu_dev, RX_PR_CNTL_PL, 0, RX_PR_RESET);

        /* Restore registers from scominit that may have changed */
        set_lane_reg(npu_dev, RX_PR_MODE, 0x8, RX_PR_PHASE_STEP);
        set_lane_reg(npu_dev, RX_A_DAC_CNTL,
                     0x7 << MASK_TO_LSH(RX_PR_IQ_RES_SEL),
                     RX_PR_IQ_RES_SEL);
        set_lane_reg(npu_dev, TX_MODE1_PL, 0, TX_LANE_PDWN);
        set_lane_reg(npu_dev, RX_BANK_CONTROLS, 0, RX_LANE_ANA_PDWN);
        set_lane_reg(npu_dev, RX_MODE, 0, RX_LANE_DIG_PDWN);

        return PROCEDURE_COMPLETE;
}
DEFINE_PROCEDURE(phy_reset, phy_reset_wait, phy_reset_complete);

/* Round a fixed-point number. frac is the number of fractional
 * bits */
static uint32_t round(uint32_t val, int frac)
{
        if (val >> (frac - 1) & 0x1)
                return (val >> frac) + 1;
        else
                return val >> frac;
}
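/*
 * Worked example: with frac = 1 the value 5 encodes 2.5 in fixed point,
 * and round(5, 1) tests the topmost fractional bit (set), returning
 * (5 >> 1) + 1 = 3. round(4, 1), i.e. 2.0, returns 4 >> 1 = 2.
 */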
#define ZCAL_MIN        (10 << 3)
#define ZCAL_MAX        (40 << 3)
#define ZCAL_K0         0x0
#define ZCAL_M          128

/* TODO: add a test case for the following values:

   Initial values:
   zcal_n = 0xda;
   zcal_p = 0xc7;

   Results:
   pre_p = 0x0
   pre_n = 0x0
   margin_p = 0x0
   margin_n = 0x0
   total_en_p = 0x32
   total_en_n = 0x37
 */

static uint32_t phy_tx_zcal(struct npu_dev *npu_dev)
{
        uint64_t val;

        if (npu_dev->index < 2 && npu_dev->npu->tx_zcal_complete[0])
                return PROCEDURE_COMPLETE;

        if (npu_dev->index >= 2 && npu_dev->npu->tx_zcal_complete[1])
                return PROCEDURE_COMPLETE;

        /* Start calibration */
        val = phy_read(npu_dev, TX_IMPCAL_SWO1_PB);
        val &= TX_ZCAL_SWO_EN;
        phy_write(npu_dev, TX_IMPCAL_SWO1_PB, val);
        phy_write(npu_dev, TX_IMPCAL_SWO2_PB, 0x50 << 2);
        val = phy_read(npu_dev, TX_IMPCAL_PB);
        val |= TX_ZCAL_REQ;
        phy_write(npu_dev, TX_IMPCAL_PB, val);

        return PROCEDURE_NEXT;
}

static uint32_t phy_tx_zcal_wait(struct npu_dev *npu_dev)
{
        uint64_t val;

        val = phy_read(npu_dev, TX_IMPCAL_PB);
        if (!(val & TX_ZCAL_DONE))
                return PROCEDURE_INPROGRESS;

        if (val & TX_ZCAL_ERROR)
                return PROCEDURE_COMPLETE | PROCEDURE_FAILED;

        return PROCEDURE_NEXT;
}

static uint32_t phy_tx_zcal_calculate(struct npu_dev *npu_dev)
{
        uint64_t val;
        uint64_t zcal_n;
        uint64_t zcal_p;
        uint64_t margin_n;
        uint64_t margin_p;
        uint64_t pre_n;
        uint64_t pre_p;
        uint64_t total_en_n;
        uint64_t total_en_p;

        val = phy_read(npu_dev, TX_IMPCAL_NVAL_PB);
        zcal_n = GETFIELD(TX_ZCAL_N, val);
        val = phy_read(npu_dev, TX_IMPCAL_PVAL_PB);
        zcal_p = GETFIELD(TX_ZCAL_P, val);

        if ((zcal_n < ZCAL_MIN) || (zcal_n > ZCAL_MAX) ||
            (zcal_p < ZCAL_MIN) || (zcal_p > ZCAL_MAX))
                return PROCEDURE_COMPLETE | PROCEDURE_FAILED;

        margin_n = (0x80 - ZCAL_M) * zcal_n / 2;
        margin_p = (0x80 - ZCAL_M) * zcal_p / 2;
        pre_n = (((0x80 * zcal_n) - (2 * margin_n)) * ZCAL_K0) / 0x80;
        pre_p = (((0x80 * zcal_p) - (2 * margin_p)) * ZCAL_K0) / 0x80;

        total_en_n = 0x80 * zcal_n - (2 * margin_n) - (pre_n & 1023);
        total_en_p = 0x80 * zcal_p - (2 * margin_p) - (pre_p & 1023);

        pre_p = round(pre_p, 9);
        pre_n = round(pre_n, 9);
        margin_p = round(margin_p, 9);
        margin_n = round(margin_n, 9);
        total_en_p = round(total_en_p, 9);
        total_en_n = round(total_en_n, 9);

        val = SETFIELD(TX_FFE_TOTAL_ENABLE_N_ENC, 0, total_en_n);
        val = SETFIELD(TX_FFE_TOTAL_ENABLE_P_ENC, val, total_en_p);
        phy_write(npu_dev, TX_FFE_TOTAL_2RSTEP_EN, val);

        val = SETFIELD(TX_FFE_PRE_N_SEL_ENC, 0, pre_n);
        val = SETFIELD(TX_FFE_PRE_P_SEL_ENC, val, pre_p);
        phy_write(npu_dev, TX_FFE_PRE_2RSTEP_SEL, val);

        val = SETFIELD(TX_FFE_MARGIN_PD_N_SEL_ENC, 0, margin_n);
        val = SETFIELD(TX_FFE_MARGIN_PU_P_SEL_ENC, val, margin_p);
        phy_write(npu_dev, TX_FFE_MARGIN_2RSTEP_SEL, val);

        if (npu_dev->index < 2)
                npu_dev->npu->tx_zcal_complete[0] = true;
        else
                npu_dev->npu->tx_zcal_complete[1] = true;

        return PROCEDURE_COMPLETE;
}
DEFINE_PROCEDURE(phy_tx_zcal, phy_tx_zcal_wait, phy_tx_zcal_calculate);
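/*
 * A minimal, standalone sketch of the test case requested in the TODO
 * above, runnable on the host with no skiboot headers or hardware
 * access. The round() helper is duplicated as test_round() and the
 * ZCAL_K0/ZCAL_M constants are inlined; the register reads and writes
 * of phy_tx_zcal_calculate() are deliberately omitted, so this only
 * checks the arithmetic against the values listed in the TODO.
 */
#include <assert.h>
#include <stdint.h>

static uint32_t test_round(uint32_t val, int frac)
{
        if (val >> (frac - 1) & 0x1)
                return (val >> frac) + 1;
        return val >> frac;
}

static void test_zcal_calculate(void)
{
        const uint64_t zcal_n = 0xda, zcal_p = 0xc7;
        uint64_t margin_n = (0x80 - 128) * zcal_n / 2;  /* ZCAL_M = 128 */
        uint64_t margin_p = (0x80 - 128) * zcal_p / 2;
        uint64_t pre_n = (((0x80 * zcal_n) - (2 * margin_n)) * 0x0) / 0x80; /* ZCAL_K0 = 0x0 */
        uint64_t pre_p = (((0x80 * zcal_p) - (2 * margin_p)) * 0x0) / 0x80;
        uint64_t total_en_n = 0x80 * zcal_n - (2 * margin_n) - (pre_n & 1023);
        uint64_t total_en_p = 0x80 * zcal_p - (2 * margin_p) - (pre_p & 1023);

        assert(test_round(pre_p, 9) == 0x0);
        assert(test_round(pre_n, 9) == 0x0);
        assert(test_round(margin_p, 9) == 0x0);
        assert(test_round(margin_n, 9) == 0x0);
        assert(test_round(total_en_p, 9) == 0x32);
        assert(test_round(total_en_n, 9) == 0x37);
}

int main(void)
{
        test_zcal_calculate();
        return 0;
}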
static uint32_t phy_enable_tx_rxcal(struct npu_dev *npu_dev)
{
        /* Turn common mode on */
        set_lane_reg(npu_dev, TX_MODE2_PL, TX_RXCAL, TX_RXCAL);

        return PROCEDURE_COMPLETE;
}
DEFINE_PROCEDURE(phy_enable_tx_rxcal);

static uint32_t phy_disable_tx_rxcal(struct npu_dev *npu_dev)
{
        /* Turn common mode off */
        set_lane_reg(npu_dev, TX_MODE2_PL, 0, TX_RXCAL);

        return PROCEDURE_COMPLETE;
}
DEFINE_PROCEDURE(phy_disable_tx_rxcal);

static uint32_t phy_rx_dccal(struct npu_dev *npu_dev)
{
        if (phy_read(npu_dev, RX_LANE_BUSY_VEC_0_15)
            & ~phy_read(npu_dev, RX_INIT_DONE_VEC_0_15))
                return PROCEDURE_INPROGRESS;

        return PROCEDURE_NEXT;
}

static uint32_t phy_rx_dccal_start(struct npu_dev *npu_dev)
{
        uint64_t val;

        /* Save EO step control */
        val = phy_read(npu_dev, RX_EO_STEP_CNTL_PG);
        npu_dev->procedure_data = val;

        phy_write(npu_dev, RX_EO_STEP_CNTL_PG,
                  RX_EO_ENABLE_LATCH_OFFSET_CAL
                  | RX_EO_ENABLE_CM_COARSE_CAL);

        val = phy_read(npu_dev, RX_RECAL_ABORT_VEC_0_15);
        val |= phy_lane_mask(npu_dev);
        phy_write(npu_dev, RX_RECAL_ABORT_VEC_0_15, val);

        val = phy_read(npu_dev, RX_RUN_LANE_VEC_0_15);
        val |= phy_lane_mask(npu_dev);
        phy_write(npu_dev, RX_RUN_LANE_VEC_0_15, val);

        return PROCEDURE_NEXT;
}

static uint32_t phy_rx_dccal_complete(struct npu_dev *npu_dev)
{
        /* Poll for completion on the relevant lanes */
        if ((phy_read(npu_dev, RX_INIT_DONE_VEC_0_15) & phy_lane_mask(npu_dev))
            != phy_lane_mask(npu_dev))
                return PROCEDURE_INPROGRESS;

        return PROCEDURE_NEXT;
}

static uint32_t phy_rx_dccal_fifo_init(struct npu_dev *npu_dev)
{
        uint64_t val;

        val = phy_read(npu_dev, RX_RUN_LANE_VEC_0_15);
        val &= ~phy_lane_mask(npu_dev);
        phy_write(npu_dev, RX_RUN_LANE_VEC_0_15, val);

        /* Turn off recal abort */
        val = phy_read(npu_dev, RX_RECAL_ABORT_VEC_0_15);
        val &= ~phy_lane_mask(npu_dev);
        phy_write(npu_dev, RX_RECAL_ABORT_VEC_0_15, val);

        /* Restore original settings */
        phy_write(npu_dev, RX_EO_STEP_CNTL_PG, npu_dev->procedure_data);

        /* FIFO init */
        set_lane_reg(npu_dev, TX_MODE2_PL, 0, TX_UNLOAD_CLK_DISABLE);
        set_lane_reg(npu_dev, TX_CNTL_STAT2, TX_FIFO_INIT, TX_FIFO_INIT);
        set_lane_reg(npu_dev, TX_MODE2_PL, TX_UNLOAD_CLK_DISABLE,
                     TX_UNLOAD_CLK_DISABLE);

        return PROCEDURE_COMPLETE;
}
DEFINE_PROCEDURE(phy_rx_dccal, phy_rx_dccal_start, phy_rx_dccal_complete,
                 phy_rx_dccal_fifo_init);

static uint32_t phy_rx_training(struct npu_dev *npu_dev)
{
        uint16_t val;

        if (!npu_dev->procedure_data) {
                val = phy_read(npu_dev, RX_RUN_LANE_VEC_0_15);
                val |= phy_lane_mask(npu_dev);
                phy_write(npu_dev, RX_RUN_LANE_VEC_0_15, val);
        }

        npu_dev->procedure_data++;
        if (npu_dev->procedure_data >= 1000000)
                return PROCEDURE_COMPLETE | PROCEDURE_FAILED;

        val = phy_read(npu_dev, RX_RUN_LANE_VEC_0_15);
        if ((val & phy_lane_mask(npu_dev)) != phy_lane_mask(npu_dev))
                return PROCEDURE_INPROGRESS;

        return PROCEDURE_COMPLETE;
}
DEFINE_PROCEDURE(phy_rx_training);

static struct procedure *npu_procedures[] = {
        &procedure_stop,
        &procedure_nop,
        NULL,
        NULL,
        &procedure_phy_reset,
        &procedure_phy_tx_zcal,
        &procedure_phy_rx_dccal,
        &procedure_phy_enable_tx_rxcal,
        &procedure_phy_disable_tx_rxcal,
        &procedure_phy_rx_training,
        &procedure_reset_npu_dl,

        /* Placeholders for pre-terminate and terminate procedures */
        &procedure_nop,
        &procedure_nop};

/* Run a procedure step(s) and return status */
static uint32_t get_procedure_status(struct npu_dev *dev)
{
        uint32_t result;
        uint16_t procedure = dev->procedure_number;
        uint16_t step = dev->procedure_step;
        const char *name = npu_procedures[procedure]->name;

        do {
                result = npu_procedures[procedure]->steps[step](dev);

                if (result & PROCEDURE_NEXT) {
                        step++;
                        NPUDEVINF(dev, "Running procedure %s step %d\n", name, step);
                }
        } while (result & PROCEDURE_NEXT);

        dev->procedure_step = step;

        if (result & PROCEDURE_COMPLETE)
                NPUDEVINF(dev, "Procedure %s complete\n", name);
        else if (mftb() > dev->procedure_tb + msecs_to_tb(100)) {
                NPUDEVINF(dev, "Procedure %s timed out\n", name);
                result = PROCEDURE_COMPLETE | PROCEDURE_FAILED;
        }

        /* Mask off internal state bits */
        dev->procedure_status = result & PROCEDURE_STATUS_MASK;

        return dev->procedure_status;
}
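/*
 * The two registers handled below sit in the device's vendor-specific
 * config space capability; offsets are relative to pcrf->start. A
 * hypothetical driver-side sequence for running procedure 5
 * (phy_tx_zcal), assuming generic pci_cfg_write32()/pci_cfg_read32()
 * accessors and cap as the capability base, would look like:
 *
 *      pci_cfg_write32(dev, cap + 4, 5);       // select and start procedure
 *      do {
 *              pci_cfg_read32(dev, cap + 0, &status);  // each read advances steps
 *      } while (!(status & PROCEDURE_COMPLETE));
 *      if ((status & 0xf) == PROCEDURE_FAILED)
 *              // handle the error
 */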
static int64_t npu_dev_procedure_read(struct npu_dev *dev, uint32_t offset,
                                      uint32_t size, uint32_t *data)
{
        int64_t rc = OPAL_SUCCESS;

        if (size != 4) {
                /* Short config reads are not supported */
                prlog(PR_ERR, "NPU%d: Short read of procedure register\n",
                      dev->npu->phb.opal_id);
                return OPAL_PARAMETER;
        }

        *data = 0;

        switch (offset) {
        case 0:
                /* Only run the procedure if not already complete */
                if (dev->procedure_status & PROCEDURE_COMPLETE)
                        *data = dev->procedure_status;
                else
                        *data = get_procedure_status(dev);

                break;

        case 4:
                *data = dev->procedure_number;
                break;

        default:
                prlog(PR_ERR, "NPU%d: Invalid vendor specific offset 0x%08x\n",
                      dev->npu->phb.opal_id, offset);
                rc = OPAL_PARAMETER;
        }

        return rc;
}

static int64_t npu_dev_procedure_write(struct npu_dev *dev, uint32_t offset,
                                       uint32_t size, uint32_t data)
{
        const char *name;
        int64_t rc = OPAL_SUCCESS;

        if (size != 4) {
                /* Short config writes are not supported */
                prlog(PR_ERR, "NPU%d: Short write of procedure register\n",
                      dev->npu->phb.opal_id);
                return OPAL_PARAMETER;
        }

        switch (offset) {
        case 0:
                /* We ignore writes to the status register */
                NPUDEVINF(dev, "Ignoring writes to status register\n");
                break;

        case 4:
                if (data >= ARRAY_SIZE(npu_procedures) ||
                    !npu_procedures[data]) {
                        NPUDEVINF(dev, "Unsupported procedure number %d\n", data);
                        dev->procedure_status = PROCEDURE_COMPLETE
                                | PROCEDURE_UNSUPPORTED;
                        break;
                }

                name = npu_procedures[data]->name;
                if (dev->procedure_number == data
                    && !(dev->procedure_status & PROCEDURE_COMPLETE))
                        NPUDEVINF(dev, "Restarting procedure %s\n", name);
                else
                        NPUDEVINF(dev, "Starting procedure %s\n", name);

                dev->procedure_status = PROCEDURE_INPROGRESS;
                dev->procedure_number = data;
                dev->procedure_step = 0;
                dev->procedure_data = 0;
                dev->procedure_tb = mftb();
                break;

        default:
                NPUDEVINF(dev, "Invalid vendor specific offset 0x%08x\n", offset);
                rc = OPAL_PARAMETER;
        }

        return rc;
}

int64_t npu_dev_procedure(void *dev, struct pci_cfg_reg_filter *pcrf,
                          uint32_t offset, uint32_t len, uint32_t *data,
                          bool write)
{
        struct pci_virt_device *pvd = dev;
        struct npu_dev *ndev = pvd->data;

        if (write)
                return npu_dev_procedure_write(ndev, offset - pcrf->start,
                                               len, *data);

        return npu_dev_procedure_read(ndev, offset - pcrf->start, len, data);
}

void npu_dev_procedure_reset(struct npu_dev *dev)
{
        dev->procedure_status = 0;
        dev->procedure_number = 0;
        dev->procedure_step = 0;
        dev->procedure_data = 0;
}