Diffstat (limited to 'roms/skiboot/hw/npu.c')
-rw-r--r-- | roms/skiboot/hw/npu.c | 1693
1 files changed, 1693 insertions, 0 deletions
diff --git a/roms/skiboot/hw/npu.c b/roms/skiboot/hw/npu.c new file mode 100644 index 000000000..dba7ee50f --- /dev/null +++ b/roms/skiboot/hw/npu.c @@ -0,0 +1,1693 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* + * NVLink1, supported by the NPU (POWER8) + * + * Copyright 2013-2019 IBM Corp. + */ + +#include <skiboot.h> +#include <io.h> +#include <timebase.h> +#include <pci.h> +#include <pci-cfg.h> +#include <pci-virt.h> +#include <pci-slot.h> +#include <interrupts.h> +#include <opal.h> +#include <opal-api.h> +#include <cpu.h> +#include <device.h> +#include <ccan/str/str.h> +#include <ccan/array_size/array_size.h> +#include <ccan/build_assert/build_assert.h> +#include <affinity.h> +#include <npu-regs.h> +#include <npu.h> +#include <xscom.h> +#include <string.h> + +/* + * Terminology: + * + * Brick - A group of either 8 TX or 8 RX lanes + * Link - A group of 8 TX and 8 RX lanes + * + * Each link is represented in system software as an emulated PCI + * device. Garrison has two chips each with 4 links, therefore there + * are 8 emulated PCI devices in total. + * + * +----------------------------------------------------------------+ + * | PBCQ3 (SCOM Base Address 0x2012c00) | + * | PHB3 (SCOM Base Address 0x9012c00) | + * +----------------------------------------------------------------+ + * |||||||| |||||||| + * |||||||| |||||||| + * |||||||| |||||||| + * |||||||| |||||||| + * +----------------------------------------------------------------+ + * | PCIe x8 | + * +----------------------------------------------------------------+ + * | GPU0 | + * +--------------------------------+-------------------------------+ + * | NV Link 1 | NV Link 0 | + * +---------------+----------------+---------------+---------------+ + * | RX | TX | RX | TX | + * +---------------+----------------+---------------+---------------+ + * |||||||| |||||||| |||||||| |||||||| + * |||||||| |||||||| |||||||| |||||||| + * |||||||| |||||||| |||||||| |||||||| + * |||||||| |||||||| |||||||| |||||||| + * +---------------+----------------+---------------+---------------+ + * | TX | RX | TX | RX | + * +---------------+----------------+---------------+---------------+ + * | Lanes [0:7] PHY 0 Lanes [8:15] | + * | SCOM Base Address 0x8000080008010c3f | + * +--------------------------------+-------------------------------+ + * | Link 0 NDL/NTL | Link 1 NTL/NDL | + * | SCOM Base Address 0x8013c00 | SCOM Base Address 0x8013c40 | + * +--------------------------------+-------------------------------+ + * | | + * | Address Translation/AT (shared for all links) | + * | SCOM Base Address 0x8013d80 | + * | | + * +--------------------------------+-------------------------------+ + * | Link 3 NDL/NTL | Link 4 NTL/NDL | + * | SCOM Base Address 0x8013d00 | SCOM Base Address 0x8013d40 | + * +--------------------------------+-------------------------------+ + * | Lanes [8:15] PHY 1 Lanes [0:7] | + * | SCOM Base Address 0x8000080008010c7f | + * +---------------+----------------+---------------+---------------+ + * | TX | RX | TX | RX | + * +---------------+----------------+---------------+---------------+ + * |||||||| |||||||| |||||||| |||||||| + * |||||||| |||||||| |||||||| |||||||| + * |||||||| |||||||| |||||||| |||||||| + * |||||||| |||||||| |||||||| |||||||| + * +---------------+----------------+---------------+---------------+ + * | RX | TX | RX | TX | + * +---------------+----------------+---------------+---------------+ + * | NV Link 2 | NV Link 3 | + * +--------------------------------+-------------------------------+ + * | 
GPU1 | + * +----------------------------------------------------------------+ + * | PCIe x8 | + * +----------------------------------------------------------------+ + * |||||||| |||||||| + * |||||||| |||||||| + * |||||||| |||||||| + * |||||||| |||||||| + * +----------------------------------------------------------------+ + * | PHB2 (SCOM Base Address 0x9012800) | + * | PBCQ2 (SCOM Base Address 0x2012800) | + * +----------------------------------------------------------------+ + * + */ + +static struct npu_dev_cap *npu_dev_find_capability(struct npu_dev *dev, + uint16_t id); + +#define OPAL_NPU_VERSION 0x02 + +#define PCIE_CAP_START 0x40 +#define PCIE_CAP_END 0x80 +#define VENDOR_CAP_START 0x80 +#define VENDOR_CAP_END 0x90 + +#define VENDOR_CAP_PCI_DEV_OFFSET 0x0d + +/* Returns the scom base for the given link index */ +static uint64_t npu_link_scom_base(struct dt_node *dn, uint32_t scom_base, + int index) +{ + struct dt_node *link; + uint32_t link_index; + char namebuf[32]; + + snprintf(namebuf, sizeof(namebuf), "link@%x", index); + link = dt_find_by_name(dn, namebuf); + assert(link); + link_index = dt_prop_get_u32(link, "ibm,npu-link-index"); + return scom_base + (link_index * NPU_LINK_SIZE); +} + +static uint64_t get_bar_size(uint64_t bar) +{ + return (1 << GETFIELD(NX_MMIO_BAR_SIZE, bar)) * 0x10000; +} + +/* Update the changes of the device BAR to link BARs */ +static void npu_dev_bar_update(uint32_t gcid, struct npu_dev_bar *bar, + bool enable) +{ + uint64_t val; + + if (!bar->xscom) + return; + + val = bar->base; + val = SETFIELD(NX_MMIO_BAR_SIZE, val, ilog2(bar->size / 0x10000)); + if (enable) + val |= NX_MMIO_BAR_ENABLE; + xscom_write(gcid, bar->xscom, val); +} + +/* Trap for PCI command (0x4) to enable or disable device's BARs */ +static int64_t npu_dev_cfg_write_cmd(void *dev, + struct pci_cfg_reg_filter *pcrf __unused, + uint32_t offset, uint32_t size, + uint32_t *data, bool write) +{ + struct pci_virt_device *pvd = dev; + struct npu_dev *ndev = pvd->data; + bool enable; + + if (!write) + return OPAL_PARTIAL; + + if (offset != PCI_CFG_CMD) + return OPAL_PARAMETER; + if (size != 1 && size != 2 && size != 4) + return OPAL_PARAMETER; + + /* Update device BARs and link BARs will be syncrhonized + * with hardware automatically. + */ + enable = !!(*data & PCI_CFG_CMD_MEM_EN); + npu_dev_bar_update(ndev->npu->chip_id, &ndev->bar, enable); + + /* Normal path to update PCI config buffer */ + return OPAL_PARTIAL; +} + +/* + * Trap for memory BARs: 0xFF's should be written to BAR register + * prior to getting its size. 
+ */ +static int64_t npu_dev_cfg_bar_read(struct npu_dev *dev __unused, + struct pci_cfg_reg_filter *pcrf, + uint32_t offset, uint32_t size, + uint32_t *data) +{ + struct npu_dev_bar *bar = (struct npu_dev_bar *)(pcrf->data); + + /* Revert to normal path if we weren't trapped for BAR size */ + if (!bar->trapped) + return OPAL_PARTIAL; + + if (offset != pcrf->start && + offset != pcrf->start + 4) + return OPAL_PARAMETER; + if (size != 4) + return OPAL_PARAMETER; + + bar->trapped = false; + *data = bar->bar_sz; + return OPAL_SUCCESS; +} + +static int64_t npu_dev_cfg_bar_write(struct npu_dev *dev, + struct pci_cfg_reg_filter *pcrf, + uint32_t offset, uint32_t size, + uint32_t data) +{ + struct pci_virt_device *pvd = dev->pvd; + struct npu_dev_bar *bar = (struct npu_dev_bar *)(pcrf->data); + uint32_t pci_cmd; + + if (offset != pcrf->start && + offset != pcrf->start + 4) + return OPAL_PARAMETER; + if (size != 4) + return OPAL_PARAMETER; + + /* Return BAR size on next read */ + if (data == 0xffffffff) { + bar->trapped = true; + if (offset == pcrf->start) + bar->bar_sz = (bar->size & 0xffffffff); + else + bar->bar_sz = (bar->size >> 32); + + return OPAL_SUCCESS; + } + + /* Update BAR base address */ + if (offset == pcrf->start) { + bar->base &= 0xffffffff00000000UL; + bar->base |= (data & 0xfffffff0); + } else { + bar->base &= 0x00000000ffffffffUL; + bar->base |= ((uint64_t)data << 32); + + PCI_VIRT_CFG_NORMAL_RD(pvd, PCI_CFG_CMD, 4, &pci_cmd); + npu_dev_bar_update(dev->npu->chip_id, bar, + !!(pci_cmd & PCI_CFG_CMD_MEM_EN)); + } + + /* We still depend on the normal path to update the + * cached config buffer. + */ + return OPAL_PARAMETER; +} + +static int64_t npu_dev_cfg_bar(void *dev, struct pci_cfg_reg_filter *pcrf, + uint32_t offset, uint32_t len, uint32_t *data, + bool write) +{ + struct pci_virt_device *pvd = dev; + struct npu_dev *ndev = pvd->data; + + if (write) + return npu_dev_cfg_bar_write(ndev, pcrf, offset, len, *data); + + return npu_dev_cfg_bar_read(ndev, pcrf, offset, len, data); +} + +static int64_t npu_dev_cfg_exp_devcap(void *dev, + struct pci_cfg_reg_filter *pcrf __unused, + uint32_t offset, uint32_t size, + uint32_t *data, bool write) +{ + struct pci_virt_device *pvd = dev; + struct npu_dev *ndev = pvd->data; + + assert(write); + + if ((size != 2) || (offset & 1)) { + /* Short config writes are not supported */ + prlog(PR_ERR, "NPU%d: Unsupported write to pcie control register\n", + ndev->phb->opal_id); + return OPAL_PARAMETER; + } + + if (*data & PCICAP_EXP_DEVCTL_FUNC_RESET) + npu_dev_procedure_reset(ndev); + + return OPAL_PARTIAL; +} + +static struct npu_dev *bdfn_to_npu_dev(struct npu *p, uint32_t bdfn) +{ + struct pci_virt_device *pvd; + + /* Sanity check */ + if (bdfn & ~0xff) + return NULL; + + pvd = pci_virt_find_device(&p->phb, bdfn); + if (pvd) + return pvd->data; + + return NULL; +} + +#define NPU_CFG_READ(size, type) \ +static int64_t npu_cfg_read##size(struct phb *phb, uint32_t bdfn, \ + uint32_t offset, type *data) \ +{ \ + uint32_t val; \ + int64_t ret; \ + \ + ret = pci_virt_cfg_read(phb, bdfn, offset, sizeof(*data), &val); \ + *data = (type)val; \ + return ret; \ +} +#define NPU_CFG_WRITE(size, type) \ +static int64_t npu_cfg_write##size(struct phb *phb, uint32_t bdfn, \ + uint32_t offset, type data) \ +{ \ + uint32_t val = data; \ + \ + return pci_virt_cfg_write(phb, bdfn, offset, sizeof(data), val); \ +} + +NPU_CFG_READ(8, u8); +NPU_CFG_READ(16, u16); +NPU_CFG_READ(32, u32); +NPU_CFG_WRITE(8, u8); +NPU_CFG_WRITE(16, u16); +NPU_CFG_WRITE(32, u32); + +static 
int __npu_dev_bind_pci_dev(struct phb *phb __unused, + struct pci_device *pd, + void *data) +{ + struct npu_dev *dev = data; + struct dt_node *pci_dt_node; + char *pcislot; + + /* Ignore non-nvidia PCI devices */ + if ((pd->vdid & 0xffff) != 0x10de) + return 0; + + /* Find the PCI device's slot location */ + for (pci_dt_node = pd->dn; + pci_dt_node && !dt_find_property(pci_dt_node, "ibm,slot-label"); + pci_dt_node = pci_dt_node->parent); + + if (!pci_dt_node) + return 0; + + pcislot = (char *)dt_prop_get(pci_dt_node, "ibm,slot-label"); + + prlog(PR_DEBUG, "NPU: comparing GPU %s and NPU %s\n", + pcislot, dev->slot_label); + + if (streq(pcislot, dev->slot_label)) + return 1; + + return 0; +} + +static void npu_dev_bind_pci_dev(struct npu_dev *dev) +{ + struct phb *phb; + uint32_t i; + + if (dev->pd) + return; + + for (i = 0; i < 64; i++) { + if (dev->npu->phb.opal_id == i) + continue; + + phb = pci_get_phb(i); + if (!phb) + continue; + + dev->pd = pci_walk_dev(phb, NULL, __npu_dev_bind_pci_dev, dev); + if (dev->pd) { + dev->phb = phb; + /* Found the device, set the bit in config space */ + PCI_VIRT_CFG_INIT_RO(dev->pvd, VENDOR_CAP_START + + VENDOR_CAP_PCI_DEV_OFFSET, 1, 0x01); + return; + } + } + + prlog(PR_INFO, "%s: No PCI device for NPU device %04x:%02x:%02x.%x to bind to. If you expect a GPU to be there, this is a problem.\n", + __func__, dev->npu->phb.opal_id, + dev->pvd->bdfn >> 8 & 0xff, + dev->pvd->bdfn >> 3 & 0x1f, + dev->pvd->bdfn & 0x7); + +} + +static struct lock pci_npu_phandle_lock = LOCK_UNLOCKED; + +/* Appends an NPU phandle to the given PCI device node ibm,npu + * property */ +static void npu_append_pci_phandle(struct dt_node *dn, u32 phandle) +{ + uint32_t *npu_phandles; + struct dt_property *pci_npu_phandle_prop; + size_t prop_len; + + /* Use a lock to make sure no one else has a reference to an + * ibm,npu property (this assumes this is the only function + * that holds a reference to it). */ + lock(&pci_npu_phandle_lock); + + /* This function shouldn't be called unless ibm,npu exists */ + pci_npu_phandle_prop = (struct dt_property *) + dt_require_property(dn, "ibm,npu", -1); + + /* Need to append to the properties */ + prop_len = pci_npu_phandle_prop->len; + prop_len += sizeof(*npu_phandles); + dt_resize_property(&pci_npu_phandle_prop, prop_len); + + npu_phandles = (uint32_t *) pci_npu_phandle_prop->prop; + npu_phandles[prop_len/sizeof(*npu_phandles) - 1] = phandle; + unlock(&pci_npu_phandle_lock); +} + +static int npu_dn_fixup(struct phb *phb, + struct pci_device *pd, + void *data __unused) +{ + struct npu *p = phb_to_npu(phb); + struct npu_dev *dev; + + dev = bdfn_to_npu_dev(p, pd->bdfn); + assert(dev); + + if (dev->phb || dev->pd) + return 0; + + /* NPU devices require a slot location to associate with GPUs */ + dev->slot_label = dt_prop_get(pd->dn, "ibm,slot-label"); + + /* Bind the emulated PCI device with the real one, which can't + * be done until the PCI devices are populated. 
Once the real + * PCI device is identified, we also need fix the device-tree + * for it + */ + npu_dev_bind_pci_dev(dev); + if (dev->phb && dev->pd && dev->pd->dn) { + if (dt_find_property(dev->pd->dn, "ibm,npu")) + npu_append_pci_phandle(dev->pd->dn, pd->dn->phandle); + else + dt_add_property_cells(dev->pd->dn, "ibm,npu", pd->dn->phandle); + + dt_add_property_cells(pd->dn, "ibm,gpu", dev->pd->dn->phandle); + } + + return 0; +} + +static void npu_phb_final_fixup(struct phb *phb) +{ + pci_walk_dev(phb, NULL, npu_dn_fixup, NULL); +} + +static void npu_ioda_init(struct npu *p) +{ + uint64_t *data64; + uint32_t i; + + /* LXIVT - Disable all LSIs */ + for (i = 0; i < ARRAY_SIZE(p->lxive_cache); i++) { + data64 = &p->lxive_cache[i]; + *data64 = SETFIELD(NPU_IODA_LXIVT_PRIORITY, 0ul, 0xff); + *data64 = SETFIELD(NPU_IODA_LXIVT_SERVER, *data64, 0); + } + + /* PCT - Reset to reserved PE# */ + for (i = 0; i < ARRAY_SIZE(p->pce_cache); i++) { + data64 = &p->pce_cache[i]; + *data64 = SETFIELD(NPU_IODA_PCT_PE, 0ul, 0ul); + *data64 |= NPU_IODA_PCT_LINK_ENABLED; + } + + /* Clear TVT */ + memset(p->tve_cache, 0, sizeof(p->tve_cache)); +} + +static int64_t npu_ioda_reset(struct phb *phb, bool purge) +{ + struct npu *p = phb_to_npu(phb); + uint32_t i; + + if (purge) { + NPUDBG(p, "Purging all IODA tables...\n"); + npu_ioda_init(p); + } + + /* LIST */ + npu_ioda_sel(p, NPU_IODA_TBL_LIST, 0, true); + for (i = 0; i < 8; i++) + out_be64(p->at_regs + NPU_IODA_DATA0, 0x1); + + /* LIXVT */ + npu_ioda_sel(p, NPU_IODA_TBL_LXIVT, 0, true); + for (i = 0; i < ARRAY_SIZE(p->lxive_cache); i++) + out_be64(p->at_regs + NPU_IODA_DATA0, p->lxive_cache[i]); + + /* PCT */ + npu_ioda_sel(p, NPU_IODA_TBL_PCT, 0, true); + for (i = 0; i < ARRAY_SIZE(p->pce_cache); i++) + out_be64(p->at_regs + NPU_IODA_DATA0, p->pce_cache[i]); + + /* TVT */ + npu_ioda_sel(p, NPU_IODA_TBL_TVT, 0, true); + for (i = 0; i < ARRAY_SIZE(p->tve_cache); i++) + out_be64(p->at_regs + NPU_IODA_DATA0, p->tve_cache[i]); + + return OPAL_SUCCESS; +} + +static int npu_isn_valid(struct npu *p, uint32_t isn) +{ + if (p->chip_id != p8_irq_to_chip(isn) || p->index != 0 || + NPU_IRQ_NUM(isn) < NPU_LSI_IRQ_MIN || + NPU_IRQ_NUM(isn) > NPU_LSI_IRQ_MAX) { + /** + * @fwts-label NPUisnInvalid + * @fwts-advice NVLink not functional + */ + prlog(PR_ERR, "NPU%d: isn 0x%x not valid for this NPU\n", + p->phb.opal_id, isn); + return false; + } + + return true; +} + +static int64_t npu_lsi_get_xive(struct irq_source *is, uint32_t isn, + uint16_t *server, uint8_t *prio) +{ + struct npu *p = is->data; + uint32_t irq = NPU_IRQ_NUM(isn); + uint64_t lxive; + + if (!npu_isn_valid(p, isn)) + return OPAL_PARAMETER; + + /* The content is fetched from the cache, which requires + * that the initial cache should be initialized with the + * default values + */ + irq -= NPU_LSI_IRQ_MIN; + lxive = p->lxive_cache[irq]; + *server = GETFIELD(NPU_IODA_LXIVT_SERVER, lxive); + *prio = GETFIELD(NPU_IODA_LXIVT_PRIORITY, lxive); + + return OPAL_SUCCESS; +} + +static int64_t npu_lsi_set_xive(struct irq_source *is, uint32_t isn, + uint16_t server, uint8_t prio) +{ + struct npu *p = is->data; + uint32_t irq = NPU_IRQ_NUM(isn); + uint64_t lxive; + + if (!npu_isn_valid(p, isn)) + return OPAL_PARAMETER; + + /* Figure out LXIVT entry */ + lxive = SETFIELD(NPU_IODA_LXIVT_SERVER, 0ul, server); + lxive = SETFIELD(NPU_IODA_LXIVT_PRIORITY, lxive, prio); + + /* Cache LXIVT entry */ + irq -= NPU_LSI_IRQ_MIN; + p->lxive_cache[irq] = lxive; + + /* Update to LXIVT entry */ + npu_ioda_sel(p, NPU_IODA_TBL_LXIVT, irq, false); 
+ lxive = in_be64(p->at_regs + NPU_IODA_DATA0); + lxive = SETFIELD(NPU_IODA_LXIVT_SERVER, lxive, server); + lxive = SETFIELD(NPU_IODA_LXIVT_PRIORITY, lxive, prio); + out_be64(p->at_regs + NPU_IODA_DATA0, lxive); + + return OPAL_SUCCESS; +} + +static void npu_err_interrupt(struct irq_source *is, uint32_t isn) +{ + struct npu *p = is->data; + uint32_t irq = NPU_IRQ_NUM(isn); + + if (!npu_isn_valid(p, isn)) + return; + + /* There're 4 LSIs used for error reporting: 4/5 for data + * link error reporting while 6/7 for frozen PE detection + */ + irq -= NPU_LSI_IRQ_MIN; + switch (irq) { + case 4 ... 5: + prerror("Invalid NPU error interrupt received\n"); + break; + case 6 ... 7: + opal_update_pending_evt(OPAL_EVENT_PCI_ERROR, + OPAL_EVENT_PCI_ERROR); + } +} + +static uint64_t npu_lsi_attributes(struct irq_source *is, uint32_t isn) +{ + struct npu *p = is->data; + uint32_t idx = isn - p->base_lsi; + + if (idx >= 4) + return IRQ_ATTR_TARGET_OPAL | IRQ_ATTR_TARGET_RARE | IRQ_ATTR_TYPE_LSI; + return IRQ_ATTR_TARGET_LINUX; +} + +/* Error LSIs (skiboot owned) */ +static const struct irq_source_ops npu_lsi_irq_ops = { + .get_xive = npu_lsi_get_xive, + .set_xive = npu_lsi_set_xive, + .attributes = npu_lsi_attributes, + .interrupt = npu_err_interrupt, +}; + +static void npu_register_irq(struct npu *p) +{ + register_irq_source(&npu_lsi_irq_ops, p, p->base_lsi, 8); +} + +static void npu_hw_init(struct npu *p) +{ + /* 3 MMIO setup for AT */ + out_be64(p->at_regs + NPU_LSI_SOURCE_ID, + SETFIELD(NPU_LSI_SRC_ID_BASE, 0ul, NPU_LSI_IRQ_MIN >> 4)); + BUILD_ASSERT((NPU_LSI_IRQ_MIN & 0x07F0) == NPU_LSI_IRQ_MIN); + out_be64(p->at_regs + NPU_INTREP_TIMER, 0x0ul); + npu_ioda_reset(&p->phb, false); +} + +static int64_t npu_map_pe_dma_window_real(struct phb *phb, + uint64_t pe_number, + uint16_t window_id, + uint64_t pci_start_addr, + uint64_t pci_mem_size) +{ + struct npu *p = phb_to_npu(phb); + uint64_t end; + uint64_t tve; + + /* Sanity check. Each PE has one corresponding TVE */ + if (pe_number >= NPU_NUM_OF_PES || + window_id != pe_number) + return OPAL_PARAMETER; + + if (pci_mem_size) { + /* Enable */ + + end = pci_start_addr + pci_mem_size; + + /* We have to be 16M aligned */ + if ((pci_start_addr & 0x00ffffff) || + (pci_mem_size & 0x00ffffff)) + return OPAL_PARAMETER; + + /* + * It *looks* like this is the max we can support (we need + * to verify this. Also we are not checking for rollover, + * but then we aren't trying too hard to protect ourselves + * againt a completely broken OS. + */ + if (end > 0x0003ffffffffffffull) + return OPAL_PARAMETER; + + /* + * Put start address bits 49:24 into TVE[52:53]||[0:23] + * and end address bits 49:24 into TVE[54:55]||[24:47] + * and set TVE[51] + */ + tve = (pci_start_addr << 16) & (0xffffffull << 48); + tve |= (pci_start_addr >> 38) & (3ull << 10); + tve |= (end >> 8) & (0xfffffful << 16); + tve |= (end >> 40) & (3ull << 8); + tve |= PPC_BIT(51); + } else { + /* Disable */ + tve = 0; + } + + npu_ioda_sel(p, NPU_IODA_TBL_TVT, window_id, false); + out_be64(p->at_regs + NPU_IODA_DATA0, tve); + p->tve_cache[window_id] = tve; + + return OPAL_SUCCESS; +} + +static int64_t npu_map_pe_dma_window(struct phb *phb, + uint64_t pe_number, + uint16_t window_id, + uint16_t tce_levels, + uint64_t tce_table_addr, + uint64_t tce_table_size, + uint64_t tce_page_size) +{ + struct npu *p = phb_to_npu(phb); + uint64_t tts_encoded; + uint64_t data64 = 0; + + /* Sanity check. 
Each PE has one corresponding TVE */ + if (pe_number >= NPU_NUM_OF_PES || + window_id != pe_number) + return OPAL_PARAMETER; + + /* Special condition, zero TCE table size used to disable + * the TVE. + */ + if (!tce_table_size) { + npu_ioda_sel(p, NPU_IODA_TBL_TVT, window_id, false); + out_be64(p->at_regs + NPU_IODA_DATA0, 0ul); + p->tve_cache[window_id] = 0ul; + return OPAL_SUCCESS; + } + + /* Additional arguments validation */ + if (tce_levels < 1 || + tce_levels > 4 || + !is_pow2(tce_table_size) || + tce_table_size < 0x1000) + return OPAL_PARAMETER; + + /* TCE table size */ + data64 = SETFIELD(NPU_IODA_TVT_TTA, 0ul, tce_table_addr >> 12); + tts_encoded = ilog2(tce_table_size) - 11; + if (tts_encoded > 39) + return OPAL_PARAMETER; + data64 = SETFIELD(NPU_IODA_TVT_SIZE, data64, tts_encoded); + + /* TCE page size */ + switch (tce_page_size) { + case 0x10000: /* 64K */ + data64 = SETFIELD(NPU_IODA_TVT_PSIZE, data64, 5); + break; + case 0x1000000: /* 16M */ + data64 = SETFIELD(NPU_IODA_TVT_PSIZE, data64, 13); + break; + case 0x10000000: /* 256M */ + data64 = SETFIELD(NPU_IODA_TVT_PSIZE, data64, 17); + break; + case 0x1000: /* 4K */ + default: + data64 = SETFIELD(NPU_IODA_TVT_PSIZE, data64, 1); + } + + /* Number of levels */ + data64 = SETFIELD(NPU_IODA_TVT_LEVELS, data64, tce_levels - 1); + + /* Update to hardware */ + npu_ioda_sel(p, NPU_IODA_TBL_TVT, window_id, false); + out_be64(p->at_regs + NPU_IODA_DATA0, data64); + p->tve_cache[window_id] = data64; + + return OPAL_SUCCESS; +} + +static int64_t npu_set_pe(struct phb *phb, + uint64_t pe_number, + uint64_t bdfn, + uint8_t bcompare, + uint8_t dcompare, + uint8_t fcompare, + uint8_t action) +{ + struct npu *p = phb_to_npu(phb); + struct npu_dev *dev; + uint32_t link_idx; + uint64_t *data64; + + /* Sanity check */ + if (action != OPAL_MAP_PE && + action != OPAL_UNMAP_PE) + return OPAL_PARAMETER; + if (pe_number >= NPU_NUM_OF_PES) + return OPAL_PARAMETER; + + /* All emulated PCI devices hooked to root bus, whose + * bus number is zero. + */ + dev = bdfn_to_npu_dev(p, bdfn); + if (PCI_BUS_NUM(bdfn) || !dev) + return OPAL_PARAMETER; + + link_idx = dev->index; + dev->pe_number = pe_number; + + /* Separate links will be mapped to different PEs */ + if (bcompare != OpalPciBusAll || + dcompare != OPAL_COMPARE_RID_DEVICE_NUMBER || + fcompare != OPAL_COMPARE_RID_FUNCTION_NUMBER) + return OPAL_UNSUPPORTED; + + /* Map the link to the corresponding PE */ + data64 = &p->pce_cache[link_idx]; + if (action == OPAL_MAP_PE) + *data64 = SETFIELD(NPU_IODA_PCT_PE, *data64, + pe_number); + else + *data64 = SETFIELD(NPU_IODA_PCT_PE, *data64, + NPU_NUM_OF_PES); + + *data64 |= NPU_IODA_PCT_LINK_ENABLED; + + npu_ioda_sel(p, NPU_IODA_TBL_PCT, link_idx, false); + out_be64(p->at_regs + NPU_IODA_DATA0, *data64); + + return OPAL_SUCCESS; +} + +static int64_t npu_get_link_state(struct pci_slot *slot __unused, uint8_t *val) +{ + /* As we're emulating all PCI stuff, the link bandwidth + * isn't big deal anyway. + */ + *val = OPAL_SHPC_LINK_UP_x1; + return OPAL_SUCCESS; +} + +static int64_t npu_get_power_state(struct pci_slot *slot __unused, uint8_t *val) +{ + *val = PCI_SLOT_POWER_ON; + return OPAL_SUCCESS; +} + +static int64_t npu_hreset(struct pci_slot *slot __unused) +{ + prlog(PR_DEBUG, "NPU: driver should call reset procedure here\n"); + + return OPAL_SUCCESS; +} + +static int64_t npu_freset(struct pci_slot *slot __unused) +{ + /* FIXME: PHB fundamental reset, which need to be + * figured out later. It's used by EEH recovery + * upon fenced AT. 
+ */ + return OPAL_SUCCESS; +} + +static struct pci_slot *npu_slot_create(struct phb *phb) +{ + struct pci_slot *slot; + + slot = pci_slot_alloc(phb, NULL); + if (!slot) + return slot; + + /* Elementary functions */ + slot->ops.get_presence_state = NULL; + slot->ops.get_link_state = npu_get_link_state; + slot->ops.get_power_state = npu_get_power_state; + slot->ops.get_attention_state = NULL; + slot->ops.get_latch_state = NULL; + slot->ops.set_power_state = NULL; + slot->ops.set_attention_state = NULL; + + slot->ops.prepare_link_change = NULL; + slot->ops.poll_link = NULL; + slot->ops.hreset = npu_hreset; + slot->ops.freset = npu_freset; + slot->ops.creset = NULL; + + return slot; +} + +static int64_t npu_freeze_status(struct phb *phb, + uint64_t pe_number __unused, + uint8_t *freeze_state, + uint16_t *pci_error_type __unused, + uint16_t *severity __unused) +{ + /* FIXME: When it's called by skiboot PCI config accessor, + * the PE number is fixed to 0, which is incorrect. We need + * introduce another PHB callback to translate it. For now, + * it keeps the skiboot PCI enumeration going. + */ + struct npu *p = phb_to_npu(phb); + if (p->fenced) + *freeze_state = OPAL_EEH_STOPPED_MMIO_DMA_FREEZE; + else + *freeze_state = OPAL_EEH_STOPPED_NOT_FROZEN; + return OPAL_SUCCESS; +} + +static int64_t npu_eeh_next_error(struct phb *phb, + uint64_t *first_frozen_pe, + uint16_t *pci_error_type, + uint16_t *severity) +{ + struct npu *p = phb_to_npu(phb); + int i; + uint64_t result = 0; + *first_frozen_pe = -1; + *pci_error_type = OPAL_EEH_NO_ERROR; + *severity = OPAL_EEH_SEV_NO_ERROR; + + if (p->fenced) { + *pci_error_type = OPAL_EEH_PHB_ERROR; + *severity = OPAL_EEH_SEV_PHB_FENCED; + return OPAL_SUCCESS; + } + + npu_ioda_sel(p, NPU_IODA_TBL_PESTB, 0, true); + for (i = 0; i < NPU_NUM_OF_PES; i++) { + result = in_be64(p->at_regs + NPU_IODA_DATA0); + if (result > 0) { + *first_frozen_pe = i; + *pci_error_type = OPAL_EEH_PE_ERROR; + *severity = OPAL_EEH_SEV_PE_ER; + break; + } + } + + return OPAL_SUCCESS; +} + +/* For use in error injection and handling. */ +void npu_set_fence_state(struct npu *p, bool fence) { + p->fenced = fence; + + if (fence) + prlog(PR_ERR, "NPU: Chip %x is fenced, reboot required.\n", + p->chip_id); + else + prlog(PR_WARNING, "NPU: un-fencing is dangerous and should \ + only be used for development purposes."); +} + +/* Sets the NPU to trigger an error when a DMA occurs */ +static int64_t npu_err_inject(struct phb *phb, uint64_t pe_number, + uint32_t type, uint32_t func __unused, + uint64_t addr __unused, uint64_t mask __unused) +{ + struct npu *p = phb_to_npu(phb); + struct npu_dev *dev = NULL; + int i; + + if (pe_number >= NPU_NUM_OF_PES) { + prlog(PR_ERR, "NPU: error injection failed, bad PE given\n"); + return OPAL_PARAMETER; + } + + for (i = 0; i < p->total_devices; i++) { + if (p->devices[i].pe_number == pe_number) { + dev = &p->devices[i]; + break; + } + } + + if (!dev) { + prlog(PR_ERR, "NPU: couldn't find device with PE%llx\n", pe_number); + return OPAL_PARAMETER; + } + + /* TODO: extend this to conform to OPAL injection standards */ + if (type > 1) { + prlog(PR_ERR, "NPU: invalid error injection type\n"); + return OPAL_PARAMETER; + } else if (type == 1) { + /* Emulate fence mode. */ + npu_set_fence_state(p, true); + } else { + /* Cause a freeze with an invalid MMIO read. If the BAR is not + * enabled, this will checkstop the machine. 
+ */ + npu_dev_bar_update(p->chip_id, &dev->bar, true); + in_be64((void *)dev->bar.base); + } + + return OPAL_SUCCESS; +} + +static const struct phb_ops npu_ops = { + .cfg_read8 = npu_cfg_read8, + .cfg_read16 = npu_cfg_read16, + .cfg_read32 = npu_cfg_read32, + .cfg_write8 = npu_cfg_write8, + .cfg_write16 = npu_cfg_write16, + .cfg_write32 = npu_cfg_write32, + .get_reserved_pe_number = NULL, + .device_init = NULL, + .phb_final_fixup = npu_phb_final_fixup, + .ioda_reset = npu_ioda_reset, + .papr_errinjct_reset = NULL, + .pci_reinit = NULL, + .set_phb_mem_window = NULL, + .phb_mmio_enable = NULL, + .map_pe_mmio_window = NULL, + .map_pe_dma_window = npu_map_pe_dma_window, + .map_pe_dma_window_real = npu_map_pe_dma_window_real, + .pci_msi_eoi = NULL, + .set_xive_pe = NULL, + .get_msi_32 = NULL, + .get_msi_64 = NULL, + .set_pe = npu_set_pe, + .set_peltv = NULL, + .eeh_freeze_status = npu_freeze_status, + .eeh_freeze_clear = NULL, + .eeh_freeze_set = NULL, + .next_error = npu_eeh_next_error, + .err_inject = npu_err_inject, + .get_diag_data2 = NULL, + .set_capi_mode = NULL, + .set_capp_recovery = NULL, +}; + +static void assign_mmio_bars(uint32_t gcid, uint32_t xscom, + struct dt_node *npu_dn, uint64_t mm_win[2], + uint64_t at_bar[2]) +{ + uint64_t mem_start, mem_end; + struct npu_dev_bar bar; + struct dt_node *link; + + /* Configure BAR selection. + * + * Currently, each PHY contains 2 links and each link has 2 + * BARs. The first BAR is assigned to the DLTL region which is + * what the kernel uses. The second BAR is either assigned to + * either the PL or AT region or unassigned. The PL0/PL1/AT + * MMIO regions are not exposed to the kernel so we assigned + * them at the start of the available memory area followed by + * the DLTL regions. So we end up with the following memory + * map (assuming we're given a memory region starting at + * 0x3fff000000000): + * + * Link#0-BAR#0: NTL/NDL BAR (128KB) - 0x3fff000420000 + * Link#0-BAR#1: PL0 BAR ( 2MB) - 0x3fff000000000 + * Link#1-BAR#0: NTL/NDL BAR (128KB) - 0x3fff000440000 + * Link#1-BAR#1: AT BAR ( 64KB) - 0x3fff000400000 + * Link#2-BAR#0: NTL/NDL BAR (128KB) - 0x3fff000460000 + * Link#2-BAR#1: PL1 BAR ( 2MB) - 0x3fff000200000 + * Link#3-BAR#0: NTL/NDL BAR (128KB) - 0x3fff000480000 + * Link#3-BAR#1: UNASSIGNED + */ + xscom_write(gcid, xscom + NPU_AT_SCOM_OFFSET + NX_BAR, + 0x0211000043500000UL); + + xscom_read(gcid, npu_link_scom_base(npu_dn, xscom, 0) + NX_MMIO_BAR_0, + &mem_start); + mem_start = GETFIELD(NX_MMIO_BAR_BASE, mem_start) << 12; + + xscom_read(gcid, npu_link_scom_base(npu_dn, xscom, 5) + NX_MMIO_BAR_0, + &mem_end); + mem_end = (GETFIELD(NX_MMIO_BAR_BASE, mem_end) << 12) + + get_bar_size(mem_end); + + /* PL0 BAR comes first at 0x3fff000000000 */ + bar.xscom = npu_link_scom_base(npu_dn, xscom, 0) + NX_MMIO_BAR_1; + bar.base = mem_start; + bar.size = NX_MMIO_PL_SIZE; + npu_dev_bar_update(gcid, &bar, true); + + /* PL1 BAR */ + bar.xscom = npu_link_scom_base(npu_dn, xscom, 4) + NX_MMIO_BAR_1; + bar.base += bar.size; + bar.size = NX_MMIO_PL_SIZE; + npu_dev_bar_update(gcid, &bar, true); + + /* Then the AT BAR */ + bar.xscom = npu_link_scom_base(npu_dn, xscom, 1) + NX_MMIO_BAR_1; + bar.base += bar.size; + bar.size = NX_MMIO_AT_SIZE; + at_bar[0] = bar.base; + at_bar[1] = NX_MMIO_AT_SIZE; + npu_dev_bar_update(gcid, &bar, true); + + /* Now we configure all the DLTL BARs. These are the ones + * actually exposed to the kernel. 
*/ + mm_win[0] = bar.base + bar.size; + dt_for_each_node(npu_dn, link) { + uint32_t index; + + index = dt_prop_get_u32(link, "ibm,npu-link-index"); + bar.xscom = npu_link_scom_base(npu_dn, xscom, index) + + NX_MMIO_BAR_0; + bar.base += bar.size; + bar.size = NX_MMIO_DL_SIZE; + bar.base = ALIGN_UP(bar.base, bar.size); + npu_dev_bar_update(gcid, &bar, false); + } + mm_win[1] = (bar.base + bar.size) - mm_win[0]; + + /* If we weren't given enough room to setup all the BARs we + * require it's better to crash here than risk creating + * overlapping BARs which will xstop the machine randomly in + * the future.*/ + assert(bar.base + bar.size <= mem_end); +} + +/* Probe NPU device node and create PCI root device node + * accordingly. The NPU deivce node should specify number + * of links and xscom base address to access links. + */ +static void npu_probe_phb(struct dt_node *dn) +{ + struct dt_node *np; + uint32_t gcid, index, phb_index, xscom; + uint64_t at_bar[2], mm_win[2]; + uint32_t links; + char *path; + + /* Retrieve chip id */ + path = dt_get_path(dn); + gcid = dt_get_chip_id(dn); + index = dt_prop_get_u32(dn, "ibm,npu-index"); + phb_index = dt_prop_get_u32(dn, "ibm,phb-index"); + links = dt_prop_get_u32(dn, "ibm,npu-links"); + prlog(PR_INFO, "Chip %d Found NPU%d (%d links) at %s\n", + gcid, index, links, path); + free(path); + + /* Retrieve xscom base addr */ + xscom = dt_get_address(dn, 0, NULL); + prlog(PR_INFO, " XSCOM Base: %08x\n", xscom); + + assign_mmio_bars(gcid, xscom, dn, mm_win, at_bar); + prlog(PR_INFO, " AT BAR: %016llx (%lldKB)\n", + at_bar[0], at_bar[1] / 0x400); + + /* Create PCI root device node */ + np = dt_new_addr(dt_root, "pciex", at_bar[0]); + assert(np); + + dt_add_property_strings(np, "compatible", + "ibm,power8-npu-pciex", "ibm,ioda2-npu-phb"); + dt_add_property_strings(np, "device_type", "pciex"); + dt_add_property(np, "reg", at_bar, sizeof(at_bar)); + + dt_add_property_cells(np, "ibm,phb-index", phb_index); + dt_add_property_cells(np, "ibm,npu-index", index); + dt_add_property_cells(np, "ibm,chip-id", gcid); + dt_add_property_cells(np, "ibm,xscom-base", xscom); + dt_add_property_cells(np, "ibm,npcq", dn->phandle); + dt_add_property_cells(np, "ibm,links", links); + dt_add_property(np, "ibm,mmio-window", mm_win, sizeof(mm_win)); + dt_add_property_cells(np, "ibm,phb-diag-data-size", 0); + + /* Disable fast reboot - not currently supported */ + disable_fast_reboot("NVLink device enabled"); +} + +static void npu_dev_populate_vendor_cap(struct npu_dev_cap *cap) +{ + struct npu_dev *dev = cap->dev; + struct pci_virt_device *pvd = dev->pvd; + uint32_t offset = cap->start; + uint8_t val; + + /* Add length and version information */ + val = cap->end - cap->start; + PCI_VIRT_CFG_INIT_RO(pvd, offset + 2, 1, val); + PCI_VIRT_CFG_INIT_RO(pvd, offset + 3, 1, OPAL_NPU_VERSION); + offset += 4; + + /* Defaults when the trap can't handle the read/write (eg. due + * to reading/writing less than 4 bytes). 
*/ + val = 0x0; + PCI_VIRT_CFG_INIT_RO(pvd, offset, 4, val); + PCI_VIRT_CFG_INIT_RO(pvd, offset + 4, 4, val); + + /* Create a trap for AT/PL procedures */ + pci_virt_add_filter(pvd, offset, 8, + PCI_REG_FLAG_READ | PCI_REG_FLAG_WRITE, + npu_dev_procedure, NULL); + offset += 8; + + PCI_VIRT_CFG_INIT_RO(pvd, offset, 1, dev->index); +} + +static void npu_dev_populate_pcie_cap(struct npu_dev_cap *cap) +{ + struct npu_dev *dev = cap->dev; + struct pci_virt_device *pvd = dev->pvd; + uint32_t base = cap->start; + uint32_t val; + + /* Sanity check on capability ID */ + if (cap->id != PCI_CFG_CAP_ID_EXP) { + prlog(PR_NOTICE, "%s: Invalid capability ID %d (%d)\n", + __func__, cap->id, PCI_CFG_CAP_ID_EXP); + return; + } + + /* Sanity check on spanned registers */ + if ((cap->end - cap->start) < PCIE_CAP_START) { + prlog(PR_NOTICE, "%s: Invalid reg region [%x, %x] for cap %d\n", + __func__, cap->start, cap->end, cap->id); + return; + } + + /* 0x00 - ID/PCIE capability */ + val = cap->id; + val |= ((0x2 << 16) | (PCIE_TYPE_ENDPOINT << 20)); + PCI_VIRT_CFG_INIT_RO(pvd, base, 4, val); + + /* 0x04 - Device capability + * + * We should support FLR. Otherwise, it might have + * problem passing it through to userland via Linux + * VFIO infrastructure + */ + val = ((PCIE_MPSS_128) | + (PCIE_PHANTOM_NONE << 3) | + (PCIE_L0SL_MAX_NO_LIMIT << 6) | + (PCIE_L1L_MAX_NO_LIMIT << 9) | + (PCICAP_EXP_DEVCAP_FUNC_RESET)); + PCI_VIRT_CFG_INIT_RO(pvd, base + PCICAP_EXP_DEVCAP, 4, val); + + pci_virt_add_filter(pvd, base + PCICAP_EXP_DEVCTL, 2, + PCI_REG_FLAG_WRITE, + npu_dev_cfg_exp_devcap, NULL); + + /* 0x08 - Device control and status */ + PCI_VIRT_CFG_INIT(pvd, base + PCICAP_EXP_DEVCTL, 4, 0x00002810, + 0xffff0000, 0x000f0000); + + /* 0x0c - Link capability */ + val = (PCIE_LSPEED_VECBIT_2 | (PCIE_LWIDTH_1X << 4)); + PCI_VIRT_CFG_INIT_RO(pvd, base + PCICAP_EXP_LCAP, 4, val); + + /* 0x10 - Link control and status */ + PCI_VIRT_CFG_INIT(pvd, base + PCICAP_EXP_LCTL, 4, 0x00130000, + 0xfffff000, 0xc0000000); + + /* 0x14 - Slot capability */ + PCI_VIRT_CFG_INIT_RO(pvd, base + PCICAP_EXP_SLOTCAP, 4, 0x00000000); + + /* 0x18 - Slot control and status */ + PCI_VIRT_CFG_INIT_RO(pvd, base + PCICAP_EXP_SLOTCTL, 4, 0x00000000); + + /* 0x1c - Root control and capability */ + PCI_VIRT_CFG_INIT(pvd, base + PCICAP_EXP_RC, 4, 0x00000000, + 0xffffffe0, 0x00000000); + + /* 0x20 - Root status */ + PCI_VIRT_CFG_INIT(pvd, base + PCICAP_EXP_RSTAT, 4, 0x00000000, + 0xffffffff, 0x00010000); + + /* 0x24 - Device capability 2 */ + PCI_VIRT_CFG_INIT_RO(pvd, base + PCIECAP_EXP_DCAP2, 4, 0x00000000); + + /* 0x28 - Device Control and status 2 */ + PCI_VIRT_CFG_INIT(pvd, base + PCICAP_EXP_DCTL2, 4, 0x00070000, + 0xffff0000, 0x00000000); + + /* 0x2c - Link capability 2 */ + PCI_VIRT_CFG_INIT_RO(pvd, base + PCICAP_EXP_LCAP2, 4, 0x00000007); + + /* 0x30 - Link control and status 2 */ + PCI_VIRT_CFG_INIT(pvd, base + PCICAP_EXP_LCTL2, 4, 0x00000003, + 0xffff0000, 0x00200000); + + /* 0x34 - Slot capability 2 */ + PCI_VIRT_CFG_INIT_RO(pvd, base + PCICAP_EXP_SCAP2, 4, 0x00000000); + + /* 0x38 - Slot control and status 2 */ + PCI_VIRT_CFG_INIT_RO(pvd, base + PCICAP_EXP_SCTL2, 4, 0x00000000); +} + +static struct npu_dev_cap *npu_dev_create_capability(struct npu_dev *dev, + void (*populate)(struct npu_dev_cap *), + uint16_t id, + uint16_t start, + uint16_t end) +{ + struct npu_dev_cap *cap; + + /* Check if the capability is existing */ + cap = npu_dev_find_capability(dev, id); + if (cap) + return cap; + + /* Allocate new one */ + cap = zalloc(sizeof(struct 
npu_dev_cap)); + assert(cap); + + /* Put it into the pool */ + cap->id = id; + cap->start = start; + cap->end = end; + cap->dev = dev; + cap->populate = populate; + list_add_tail(&dev->capabilities, &cap->link); + + return cap; +} + +static struct npu_dev_cap *npu_dev_find_capability(struct npu_dev *dev, + uint16_t id) +{ + struct npu_dev_cap *cap; + + list_for_each(&dev->capabilities, cap, link) { + if (cap->id == id) + return cap; + } + + return NULL; +} + +/* + * All capabilities should be put into the device capability + * list according to register offset in ascending order for + * easy access at later point. + */ +static void npu_dev_create_capabilities(struct npu_dev *dev) +{ + list_head_init(&dev->capabilities); + + /* PCI express capability */ + npu_dev_create_capability(dev, npu_dev_populate_pcie_cap, + PCI_CFG_CAP_ID_EXP, PCIE_CAP_START, + PCIE_CAP_END); + + /* Vendor specific capability */ + npu_dev_create_capability(dev, npu_dev_populate_vendor_cap, + PCI_CFG_CAP_ID_VENDOR, VENDOR_CAP_START, + VENDOR_CAP_END); +} + +static void npu_dev_create_cfg(struct npu_dev *dev) +{ + struct pci_virt_device *pvd = dev->pvd; + struct npu_dev_cap *cap; + uint32_t offset; + uint32_t last_cap_offset; + + /* 0x00 - Vendor/Device ID */ + PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_VENDOR_ID, 4, 0x04ea1014); + + /* 0x04 - Command/Status + * + * Create one trap to trace toggling memory BAR enable bit + */ + PCI_VIRT_CFG_INIT(pvd, PCI_CFG_CMD, 4, 0x00100000, 0xffb802b8, + 0xf9000000); + + pci_virt_add_filter(pvd, PCI_CFG_CMD, 1, PCI_REG_FLAG_WRITE, + npu_dev_cfg_write_cmd, NULL); + + /* 0x08 - Rev/Class/Cache */ + PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_REV_ID, 4, 0x06800100); + + /* 0x0c - CLS/Latency Timer/Header/BIST */ + PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_CACHE_LINE_SIZE, 4, 0x00800000); + + /* 0x10 - BARs, always 64-bits non-prefetchable + * + * Each emulated device represents one link and therefore + * there is one BAR for the associated DLTL region. + */ + + /* Low 32-bits */ + PCI_VIRT_CFG_INIT(pvd, PCI_CFG_BAR0, 4, + (dev->bar.base & 0xfffffff0) | dev->bar.flags, + 0x0000000f, 0x00000000); + + /* High 32-bits */ + PCI_VIRT_CFG_INIT(pvd, PCI_CFG_BAR1, 4, (dev->bar.base >> 32), + 0x00000000, 0x00000000); + + /* + * Create trap. Writting 0xFF's to BAR registers should be + * trapped and return size on next read + */ + pci_virt_add_filter(pvd, PCI_CFG_BAR0, 8, + PCI_REG_FLAG_READ | PCI_REG_FLAG_WRITE, + npu_dev_cfg_bar, &dev->bar); + + /* 0x18/1c/20/24 - Disabled BAR#2/3/4/5 + * + * Mark those BARs readonly so that 0x0 will be returned when + * probing the length and the BARs will be skipped. + */ + PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_BAR2, 4, 0x00000000); + PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_BAR3, 4, 0x00000000); + PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_BAR4, 4, 0x00000000); + PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_BAR5, 4, 0x00000000); + + /* 0x28 - Cardbus CIS pointer */ + PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_CARDBUS_CIS, 4, 0x00000000); + + /* 0x2c - Subsystem ID */ + PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_SUBSYS_VENDOR_ID, 4, 0x00000000); + + /* 0x30 - ROM BAR + * + * Force its size to be zero so that the kernel will skip + * probing the ROM BAR. We needn't emulate ROM BAR. 
+ */ + PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_ROMBAR, 4, 0xffffffff); + + /* 0x34 - PCI Capability + * + * By default, we don't have any capabilities + */ + PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_CAP, 4, 0x00000000); + + last_cap_offset = PCI_CFG_CAP - 1; + list_for_each(&dev->capabilities, cap, link) { + offset = cap->start; + + /* Initialize config space for the capability */ + if (cap->populate) + cap->populate(cap); + + /* Add capability header */ + PCI_VIRT_CFG_INIT_RO(pvd, offset, 2, cap->id); + + /* Update the next capability pointer */ + PCI_VIRT_CFG_NORMAL_WR(pvd, last_cap_offset + 1, 1, offset); + + last_cap_offset = offset; + } + + /* 0x38 - Reserved */ + PCI_VIRT_CFG_INIT_RO(pvd, 0x38, 4, 0x00000000); + + /* 0x3c - INT line/pin/Minimal grant/Maximal latency */ + if (!(dev->index % 2)) + PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_INT_LINE, 4, 0x00000100); + else + PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_INT_LINE, 4, 0x00000200); +} + +static uint32_t npu_allocate_bdfn(struct npu *p, uint32_t group) +{ + int i; + int bdfn = (group << 3); + + for (i = 0; i < p->total_devices; i++) { + if ((p->devices[i].pvd->bdfn & 0xf8) == (bdfn & 0xf8)) + bdfn++; + } + + return bdfn; +} + +static void npu_create_devices(struct dt_node *dn, struct npu *p) +{ + struct npu_dev *dev; + struct dt_node *npu_dn, *link; + uint32_t bdfn, npu_phandle, index = 0; + uint64_t buid_reg; + uint64_t lsisrcid; + uint64_t buid; + + + /* The bits in the LSI ID Base register are always compared and + * can be set to 0 in the buid base and mask fields. The + * buid (bus unit id) is the full irq minus the last 4 bits. */ + lsisrcid = GETFIELD(NPU_LSI_SRC_ID_BASE, NPU_LSI_SRC_ID_BASE); + buid = p8_chip_irq_block_base(p->chip_id, P8_IRQ_BLOCK_MISC) >> 4; + + buid_reg = SETFIELD(NP_IRQ_LEVELS, NP_BUID_ENABLE, ~0); + buid_reg = SETFIELD(NP_BUID_MASK, buid_reg, ~lsisrcid); + buid_reg = SETFIELD(NP_BUID_BASE, buid_reg, (buid & ~lsisrcid)); + + /* Get the npu node which has the links which we expand here + * into pci like devices attached to our emulated phb. 
*/ + npu_phandle = dt_prop_get_u32(dn, "ibm,npcq"); + npu_dn = dt_find_by_phandle(dt_root, npu_phandle); + assert(npu_dn); + + /* Walk the link@x nodes to initialize devices */ + p->total_devices = 0; + p->phb.scan_map = 0; + list_head_init(&p->phb.virt_devices); + dt_for_each_compatible(npu_dn, link, "ibm,npu-link") { + struct npu_dev_bar *bar; + uint32_t group_id; + uint64_t val; + + dev = &p->devices[index]; + dev->index = dt_prop_get_u32(link, "ibm,npu-link-index"); + dev->xscom = npu_link_scom_base(npu_dn, p->xscom_base, + dev->index); + + dev->npu = p; + dev->dt_node = link; + + /* We don't support MMIO PHY access yet */ + dev->pl_base = NULL; + + group_id = dt_prop_get_u32(link, "ibm,npu-group-id"); + bdfn = npu_allocate_bdfn(p, group_id); + + /* This must be done after calling + * npu_allocate_bdfn() */ + p->total_devices++; + p->phb.scan_map |= 0x1 << ((bdfn & 0xf8) >> 3); + + dev->pl_xscom_base = dt_prop_get_u64(link, "ibm,npu-phy"); + dev->lane_mask = dt_prop_get_u32(link, "ibm,npu-lane-mask"); + + /* Setup BUID/ISRN */ + xscom_write(p->chip_id, dev->xscom + NX_NP_BUID, buid_reg); + + /* Create PCI virtual device */ + dev->pvd = pci_virt_add_device(&p->phb, bdfn, NPU_DEV_CFG_SIZE, dev); + assert(dev->pvd); + bar = &dev->bar; + bar->flags = (PCI_CFG_BAR_TYPE_MEM | + PCI_CFG_BAR_MEM64); + + /* Update BAR info */ + bar->xscom = dev->xscom + NX_MMIO_BAR_0; + xscom_read(p->chip_id, bar->xscom, &val); + bar->base = GETFIELD(NX_MMIO_BAR_BASE, val) << 12; + bar->size = get_bar_size(val); + + /* + * The config space is initialised with the BARs + * disabled, so make sure it is actually disabled in + * hardware. + */ + npu_dev_bar_update(p->chip_id, bar, false); + + /* Initialize capabilities */ + npu_dev_create_capabilities(dev); + + /* Initialize config space */ + npu_dev_create_cfg(dev); + + index++; + } +} + +static void npu_add_phb_properties(struct npu *p) +{ + struct dt_node *np = p->phb.dt_node; + uint32_t icsp = get_ics_phandle(); + uint64_t tkill, mm_base, mm_size; + uint32_t base_lsi = p->base_lsi; + uint32_t map[] = { + /* Dev 0 INT#A (used by fn0) */ + 0x0000, 0x0, 0x0, 0x1, icsp, base_lsi + NPU_LSI_INT_DL0, 1, + /* Dev 0 INT#B (used by fn1) */ + 0x0000, 0x0, 0x0, 0x2, icsp, base_lsi + NPU_LSI_INT_DL1, 1, + /* Dev 1 INT#A (used by fn0) */ + 0x0800, 0x0, 0x0, 0x1, icsp, base_lsi + NPU_LSI_INT_DL2, 1, + /* Dev 1 INT#B (used by fn1) */ + 0x0800, 0x0, 0x0, 0x2, icsp, base_lsi + NPU_LSI_INT_DL3, 1, + }; + /* Mask is bus, device and INT# */ + uint32_t mask[] = {0xf800, 0x0, 0x0, 0x7}; + char slotbuf[32]; + + /* Add various properties that HB doesn't have to + * add, some of them simply because they result from + * policy decisions made in skiboot rather than in HB + * such as the MMIO windows going to PCI, interrupts, + * etc. + */ + dt_add_property_cells(np, "#address-cells", 3); + dt_add_property_cells(np, "#size-cells", 2); + dt_add_property_cells(np, "#interrupt-cells", 1); + dt_add_property_cells(np, "bus-range", 0, 0xff); + dt_add_property_cells(np, "clock-frequency", 0x200, 0); + dt_add_property_cells(np, "interrupt-parent", icsp); + + /* DLPL Interrupts, we don't use the standard swizzle */ + p->phb.lstate.int_size = 0; + dt_add_property(np, "interrupt-map", map, sizeof(map)); + dt_add_property(np, "interrupt-map-mask", mask, sizeof(mask)); + + /* NPU PHB properties */ + /* TODO: Due to an errata TCE KILL only works when DMA traffic + * has been stopped. We need to implement the work around + * which is to do a TCE kill all instead. 
*/ + tkill = cleanup_addr((uint64_t)p->at_regs) + NPU_TCE_KILL; + dt_add_property_cells(np, "ibm,opal-num-pes", + NPU_NUM_OF_PES); + dt_add_property_cells(np, "ibm,opal-reserved-pe", + 0); + dt_add_property_u64(np, "ibm,opal-tce-kill", tkill); + + /* Memory window is exposed as 32-bits non-prefetchable + * one because 64-bits prefetchable one is kind of special + * to kernel. + */ + mm_base = p->mm_base; + mm_size = p->mm_size; + dt_add_property_cells(np, "ranges", 0x02000000, + hi32(mm_base), lo32(mm_base), + hi32(mm_base), lo32(mm_base), + hi32(mm_size), lo32(mm_size)); + + /* Set the slot location on the NPU PHB. This PHB can contain + * devices that correlate with multiple physical slots, so + * present the chip ID instead. + */ + snprintf(slotbuf, sizeof(slotbuf), "NPU Chip %d", p->chip_id); + dt_add_property_string(np, "ibm,io-base-loc-code", slotbuf); +} + +static void npu_create_phb(struct dt_node *dn) +{ + const struct dt_property *prop; + struct npu *p; + struct pci_slot *slot; + uint32_t links; + void *pmem; + + /* Retrieve number of devices */ + links = dt_prop_get_u32(dn, "ibm,links"); + pmem = zalloc(sizeof(struct npu) + links * sizeof(struct npu_dev)); + assert(pmem); + + /* Populate PHB */ + p = pmem; + p->index = dt_prop_get_u32(dn, "ibm,npu-index"); + p->chip_id = dt_prop_get_u32(dn, "ibm,chip-id"); + p->xscom_base = dt_prop_get_u32(dn, "ibm,xscom-base"); + p->total_devices = links; + + /* TODO: When hardware fences are implemented, detect them here */ + p->fenced = false; + + /* This is the AT base */ + p->at_xscom = p->xscom_base + NPU_AT_SCOM_OFFSET; + p->at_regs = (void *)dt_get_address(dn, 0, NULL); + + prop = dt_require_property(dn, "ibm,mmio-window", -1); + assert(prop->len >= (2 * sizeof(uint64_t))); + p->mm_base = ((const uint64_t *)prop->prop)[0]; + p->mm_size = ((const uint64_t *)prop->prop)[1]; + + p->devices = pmem + sizeof(struct npu); + + /* Interrupt */ + p->base_lsi = p8_chip_irq_block_base(p->chip_id, P8_IRQ_BLOCK_MISC) + + NPU_LSI_IRQ_MIN; + + /* Generic PHB */ + p->phb.dt_node = dn; + p->phb.ops = &npu_ops; + p->phb.phb_type = phb_type_pcie_v3; + + /* Populate devices */ + npu_create_devices(dn, p); + + /* Populate extra properties */ + npu_add_phb_properties(p); + + /* Create PHB slot */ + slot = npu_slot_create(&p->phb); + if (!slot) + { + /** + * @fwts-label NPUCannotCreatePHBSlot + * @fwts-advice Firmware probably ran out of memory creating + * NPU slot. NVLink functionality could be broken. + */ + prlog(PR_ERR, "NPU: Cannot create PHB slot\n"); + } + + /* Register PHB */ + pci_register_phb(&p->phb, OPAL_DYNAMIC_PHB_ID); + + /* Initialize IODA cache */ + npu_ioda_init(p); + + /* Register interrupt source */ + npu_register_irq(p); + + /* Initialize hardware */ + npu_hw_init(p); +} + +void probe_npu(void) +{ + struct dt_node *np; + + /* Scan NPU XSCOM nodes */ + dt_for_each_compatible(dt_root, np, "ibm,power8-npu") + npu_probe_phb(np); + + /* Scan newly created PHB nodes */ + dt_for_each_compatible(dt_root, np, "ibm,power8-npu-pciex") + npu_create_phb(np); +} |