From af1a266670d040d2f4083ff309d732d648afba2a Mon Sep 17 00:00:00 2001
From: Angelos Mouzakitis
Date: Tue, 10 Oct 2023 14:33:42 +0000
Subject: Add submodule dependency files

Change-Id: Iaf8d18082d3991dec7c0ebbea540f092188eb4ec
---
 roms/skiboot/core/Makefile.inc | 28 +
 roms/skiboot/core/affinity.c | 125 ++
 roms/skiboot/core/bitmap.c | 44 +
 roms/skiboot/core/buddy.c | 292 +++
 roms/skiboot/core/chip.c | 190 ++
 roms/skiboot/core/console-log.c | 71 +
 roms/skiboot/core/console.c | 451 +++++
 roms/skiboot/core/cpu.c | 1785 ++++++++++++++++++
 roms/skiboot/core/cpufeatures.c | 1043 +++++++++++
 roms/skiboot/core/device.c | 1128 +++++++++++
 roms/skiboot/core/direct-controls.c | 1161 ++++++++++++
 roms/skiboot/core/errorlog.c | 223 +++
 roms/skiboot/core/exceptions.c | 233 +++
 roms/skiboot/core/fast-reboot.c | 467 +++++
 roms/skiboot/core/fdt.c | 258 +++
 roms/skiboot/core/flash-firmware-versions.c | 164 ++
 roms/skiboot/core/flash-subpartition.c | 110 ++
 roms/skiboot/core/flash.c | 1186 ++++++++++++
 roms/skiboot/core/gcov-profiling.c | 127 ++
 roms/skiboot/core/hmi.c | 1558 ++++++++++++++++
 roms/skiboot/core/i2c.c | 288 +++
 roms/skiboot/core/init.c | 1469 +++++++++++++++
 roms/skiboot/core/interrupts.c | 513 +++++
 roms/skiboot/core/ipmi-opal.c | 138 ++
 roms/skiboot/core/ipmi.c | 263 +++
 roms/skiboot/core/lock.c | 336 ++++
 roms/skiboot/core/malloc.c | 76 +
 roms/skiboot/core/mce.c | 309 +++
 roms/skiboot/core/mem_region.c | 1555 ++++++++++++++++
 roms/skiboot/core/nvram-format.c | 331 ++++
 roms/skiboot/core/nvram.c | 203 ++
 roms/skiboot/core/opal-dump.c | 582 ++++++
 roms/skiboot/core/opal-msg.c | 193 ++
 roms/skiboot/core/opal.c | 700 +++++++
 roms/skiboot/core/pci-dt-slot.c | 212 +++
 roms/skiboot/core/pci-opal.c | 1135 +++++++++++
 roms/skiboot/core/pci-quirk.c | 135 ++
 roms/skiboot/core/pci-slot.c | 241 +++
 roms/skiboot/core/pci-virt.c | 256 +++
 roms/skiboot/core/pci.c | 1962 ++++++++++++++++++++
 roms/skiboot/core/pcie-slot.c | 566 ++++++
 roms/skiboot/core/pel.c | 279 +++
 roms/skiboot/core/platform.c | 319 ++++
 roms/skiboot/core/pool.c | 68 +
 roms/skiboot/core/powercap.c | 37 +
 roms/skiboot/core/psr.c | 41 +
 roms/skiboot/core/relocate.c | 55 +
 roms/skiboot/core/rtc.c | 62 +
 roms/skiboot/core/sensor.c | 152 ++
 roms/skiboot/core/stack.c | 266 +++
 roms/skiboot/core/test/Makefile.check | 101 +
 roms/skiboot/core/test/dummy-cpu.h | 35 +
 .../core/test/firmware-versions-input/version-0 | Bin 0 -> 4096 bytes
 .../core/test/firmware-versions-input/version-1 | Bin 0 -> 4096 bytes
 .../core/test/firmware-versions-input/version-10 | Bin 0 -> 4096 bytes
 .../core/test/firmware-versions-input/version-11 | Bin 0 -> 4096 bytes
 .../core/test/firmware-versions-input/version-16 | Bin 0 -> 4096 bytes
 .../core/test/firmware-versions-input/version-2 | Bin 0 -> 4096 bytes
 .../core/test/firmware-versions-input/version-26 | Bin 0 -> 4096 bytes
 .../core/test/firmware-versions-input/version-27 | Bin 0 -> 4096 bytes
 .../core/test/firmware-versions-input/version-29 | Bin 0 -> 4096 bytes
 .../core/test/firmware-versions-input/version-long | 2 +
 .../test/firmware-versions-input/version-nodash | 2 +
 .../test/firmware-versions-input/version-trunc | 2 +
 roms/skiboot/core/test/run-api-test.c | 40 +
 roms/skiboot/core/test/run-bitmap.c | 80 +
 roms/skiboot/core/test/run-buddy.c | 73 +
 .../core/test/run-console-log-buf-overrun.c | 105 ++
 roms/skiboot/core/test/run-console-log-pr_fmt.c | 63 +
 roms/skiboot/core/test/run-console-log.c | 63 +
 roms/skiboot/core/test/run-cpufeatures.c | 144 ++
 roms/skiboot/core/test/run-device.c | 471 +++++
.../core/test/run-flash-firmware-versions.c | 154 ++ roms/skiboot/core/test/run-flash-subpartition.c | 48 + roms/skiboot/core/test/run-malloc-speed.c | 88 + roms/skiboot/core/test/run-malloc.c | 174 ++ roms/skiboot/core/test/run-mem_range_is_reserved.c | 207 +++ roms/skiboot/core/test/run-mem_region.c | 252 +++ roms/skiboot/core/test/run-mem_region_init.c | 175 ++ roms/skiboot/core/test/run-mem_region_next.c | 105 ++ .../core/test/run-mem_region_release_unused.c | 177 ++ .../test/run-mem_region_release_unused_noalloc.c | 156 ++ .../core/test/run-mem_region_reservations.c | 228 +++ roms/skiboot/core/test/run-msg.c | 281 +++ roms/skiboot/core/test/run-nvram-format.c | 167 ++ roms/skiboot/core/test/run-pci-quirk.c | 98 + roms/skiboot/core/test/run-pel.c | 120 ++ roms/skiboot/core/test/run-pool.c | 59 + roms/skiboot/core/test/run-time-utils.c | 52 + roms/skiboot/core/test/run-timebase.c | 47 + roms/skiboot/core/test/run-timer.c | 84 + roms/skiboot/core/test/run-trace.c | 397 ++++ roms/skiboot/core/test/stubs.c | 101 + roms/skiboot/core/time-utils.c | 64 + roms/skiboot/core/timebase.c | 141 ++ roms/skiboot/core/timer.c | 298 +++ roms/skiboot/core/trace.c | 265 +++ roms/skiboot/core/utils.c | 101 + roms/skiboot/core/vpd.c | 139 ++ 99 files changed, 28745 insertions(+) create mode 100644 roms/skiboot/core/Makefile.inc create mode 100644 roms/skiboot/core/affinity.c create mode 100644 roms/skiboot/core/bitmap.c create mode 100644 roms/skiboot/core/buddy.c create mode 100644 roms/skiboot/core/chip.c create mode 100644 roms/skiboot/core/console-log.c create mode 100644 roms/skiboot/core/console.c create mode 100644 roms/skiboot/core/cpu.c create mode 100644 roms/skiboot/core/cpufeatures.c create mode 100644 roms/skiboot/core/device.c create mode 100644 roms/skiboot/core/direct-controls.c create mode 100644 roms/skiboot/core/errorlog.c create mode 100644 roms/skiboot/core/exceptions.c create mode 100644 roms/skiboot/core/fast-reboot.c create mode 100644 roms/skiboot/core/fdt.c create mode 100644 roms/skiboot/core/flash-firmware-versions.c create mode 100644 roms/skiboot/core/flash-subpartition.c create mode 100644 roms/skiboot/core/flash.c create mode 100644 roms/skiboot/core/gcov-profiling.c create mode 100644 roms/skiboot/core/hmi.c create mode 100644 roms/skiboot/core/i2c.c create mode 100644 roms/skiboot/core/init.c create mode 100644 roms/skiboot/core/interrupts.c create mode 100644 roms/skiboot/core/ipmi-opal.c create mode 100644 roms/skiboot/core/ipmi.c create mode 100644 roms/skiboot/core/lock.c create mode 100644 roms/skiboot/core/malloc.c create mode 100644 roms/skiboot/core/mce.c create mode 100644 roms/skiboot/core/mem_region.c create mode 100644 roms/skiboot/core/nvram-format.c create mode 100644 roms/skiboot/core/nvram.c create mode 100644 roms/skiboot/core/opal-dump.c create mode 100644 roms/skiboot/core/opal-msg.c create mode 100644 roms/skiboot/core/opal.c create mode 100644 roms/skiboot/core/pci-dt-slot.c create mode 100644 roms/skiboot/core/pci-opal.c create mode 100644 roms/skiboot/core/pci-quirk.c create mode 100644 roms/skiboot/core/pci-slot.c create mode 100644 roms/skiboot/core/pci-virt.c create mode 100644 roms/skiboot/core/pci.c create mode 100644 roms/skiboot/core/pcie-slot.c create mode 100644 roms/skiboot/core/pel.c create mode 100644 roms/skiboot/core/platform.c create mode 100644 roms/skiboot/core/pool.c create mode 100644 roms/skiboot/core/powercap.c create mode 100644 roms/skiboot/core/psr.c create mode 100644 roms/skiboot/core/relocate.c create mode 100644 
roms/skiboot/core/rtc.c create mode 100644 roms/skiboot/core/sensor.c create mode 100644 roms/skiboot/core/stack.c create mode 100644 roms/skiboot/core/test/Makefile.check create mode 100644 roms/skiboot/core/test/dummy-cpu.h create mode 100644 roms/skiboot/core/test/firmware-versions-input/version-0 create mode 100644 roms/skiboot/core/test/firmware-versions-input/version-1 create mode 100644 roms/skiboot/core/test/firmware-versions-input/version-10 create mode 100644 roms/skiboot/core/test/firmware-versions-input/version-11 create mode 100644 roms/skiboot/core/test/firmware-versions-input/version-16 create mode 100644 roms/skiboot/core/test/firmware-versions-input/version-2 create mode 100644 roms/skiboot/core/test/firmware-versions-input/version-26 create mode 100644 roms/skiboot/core/test/firmware-versions-input/version-27 create mode 100644 roms/skiboot/core/test/firmware-versions-input/version-29 create mode 100644 roms/skiboot/core/test/firmware-versions-input/version-long create mode 100644 roms/skiboot/core/test/firmware-versions-input/version-nodash create mode 100644 roms/skiboot/core/test/firmware-versions-input/version-trunc create mode 100644 roms/skiboot/core/test/run-api-test.c create mode 100644 roms/skiboot/core/test/run-bitmap.c create mode 100644 roms/skiboot/core/test/run-buddy.c create mode 100644 roms/skiboot/core/test/run-console-log-buf-overrun.c create mode 100644 roms/skiboot/core/test/run-console-log-pr_fmt.c create mode 100644 roms/skiboot/core/test/run-console-log.c create mode 100644 roms/skiboot/core/test/run-cpufeatures.c create mode 100644 roms/skiboot/core/test/run-device.c create mode 100644 roms/skiboot/core/test/run-flash-firmware-versions.c create mode 100644 roms/skiboot/core/test/run-flash-subpartition.c create mode 100644 roms/skiboot/core/test/run-malloc-speed.c create mode 100644 roms/skiboot/core/test/run-malloc.c create mode 100644 roms/skiboot/core/test/run-mem_range_is_reserved.c create mode 100644 roms/skiboot/core/test/run-mem_region.c create mode 100644 roms/skiboot/core/test/run-mem_region_init.c create mode 100644 roms/skiboot/core/test/run-mem_region_next.c create mode 100644 roms/skiboot/core/test/run-mem_region_release_unused.c create mode 100644 roms/skiboot/core/test/run-mem_region_release_unused_noalloc.c create mode 100644 roms/skiboot/core/test/run-mem_region_reservations.c create mode 100644 roms/skiboot/core/test/run-msg.c create mode 100644 roms/skiboot/core/test/run-nvram-format.c create mode 100644 roms/skiboot/core/test/run-pci-quirk.c create mode 100644 roms/skiboot/core/test/run-pel.c create mode 100644 roms/skiboot/core/test/run-pool.c create mode 100644 roms/skiboot/core/test/run-time-utils.c create mode 100644 roms/skiboot/core/test/run-timebase.c create mode 100644 roms/skiboot/core/test/run-timer.c create mode 100644 roms/skiboot/core/test/run-trace.c create mode 100644 roms/skiboot/core/test/stubs.c create mode 100644 roms/skiboot/core/time-utils.c create mode 100644 roms/skiboot/core/timebase.c create mode 100644 roms/skiboot/core/timer.c create mode 100644 roms/skiboot/core/trace.c create mode 100644 roms/skiboot/core/utils.c create mode 100644 roms/skiboot/core/vpd.c (limited to 'roms/skiboot/core') diff --git a/roms/skiboot/core/Makefile.inc b/roms/skiboot/core/Makefile.inc new file mode 100644 index 000000000..829800e5b --- /dev/null +++ b/roms/skiboot/core/Makefile.inc @@ -0,0 +1,28 @@ +# SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +# Copyright 2012-2019 IBM Corp +# -*-Makefile-*- + +SUBDIRS += 
core +CORE_OBJS = relocate.o console.o stack.o init.o chip.o mem_region.o +CORE_OBJS += malloc.o lock.o cpu.o utils.o fdt.o opal.o interrupts.o timebase.o +CORE_OBJS += opal-msg.o pci.o pci-virt.o pci-slot.o pcie-slot.o +CORE_OBJS += pci-opal.o fast-reboot.o device.o exceptions.o trace.o affinity.o +CORE_OBJS += vpd.o platform.o nvram.o nvram-format.o hmi.o mce.o +CORE_OBJS += console-log.o ipmi.o time-utils.o pel.o pool.o errorlog.o +CORE_OBJS += timer.o i2c.o rtc.o flash.o sensor.o ipmi-opal.o +CORE_OBJS += flash-subpartition.o bitmap.o buddy.o pci-quirk.o powercap.o psr.o +CORE_OBJS += pci-dt-slot.o direct-controls.o cpufeatures.o +CORE_OBJS += flash-firmware-versions.o opal-dump.o + +ifeq ($(SKIBOOT_GCOV),1) +CORE_OBJS += gcov-profiling.o +CFLAGS_SKIP_core/gcov-profiling.o = -Wsuggest-attribute=const +endif + +CORE=core/built-in.a + +CFLAGS_SKIP_core/relocate.o = -pg -fstack-protector-all +CFLAGS_SKIP_core/relocate.o += -fstack-protector -fstack-protector-strong +CFLAGS_SKIP_core/relocate.o += -fprofile-arcs -ftest-coverage + +$(CORE): $(CORE_OBJS:%=core/%) diff --git a/roms/skiboot/core/affinity.c b/roms/skiboot/core/affinity.c new file mode 100644 index 000000000..0209d3cd9 --- /dev/null +++ b/roms/skiboot/core/affinity.c @@ -0,0 +1,125 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* Copyright 2013-2019 IBM Corp. */ + +/* + * + * We currently construct our associativity properties as such: + * + * - For "chip" devices (bridges, memory, ...), 4 entries: + * + * - CCM node ID + * - HW card ID + * - HW module ID + * - Chip ID + * + * The information is constructed based on the chip ID which (unlike + * pHyp) is our HW chip ID (aka "XSCOM" chip ID). We use it to retrieve + * the other properties from the corresponding chip/xscom node in the + * device-tree. If those properties are absent, 0 is used. + * + * - For "core" devices, we add a 5th entry: + * + * - Core ID + * + * Here too, we do not use the "cooked" HW processor ID from HDAT but + * instead use the real HW core ID which is basically the interrupt + * server number of thread 0 on that core. + * + * + * The ibm,associativity-reference-points property is currently set to + * 4,4 indicating that the chip ID is our only reference point. This + * should be extended to encompass the node IDs eventually. + */ +#include +#include +#include +#include +#include +#include +#include +#include + +static uint32_t get_chip_node_id(struct proc_chip *chip) +{ + /* If the xscom node has an ibm,ccm-node-id property, use it */ + if (dt_has_node_property(chip->devnode, "ibm,ccm-node-id", NULL)) + return dt_prop_get_u32(chip->devnode, "ibm,ccm-node-id"); + + /* + * Else use the 3 top bits of the chip ID which should be + * the node on P8 + */ + return chip->id >> 3; +} + +void add_associativity_ref_point(void) +{ + int ref2 = 0x4; + + /* + * Note about our use of reference points: + * + * Linux currently supports up to three levels of NUMA. We use the + * first reference point for the node ID and the second reference + * point for a second level of affinity. We always use the chip ID + * (4) for the first reference point. + * + * Choosing the second level of affinity is model specific + * unfortunately. Current POWER8E models should use the DCM + * as a second level of NUMA. + * + * If there is a way to obtain this information from the FSP + * that would be ideal, but for now hardwire our POWER8E setting. 
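/*
 * Worked example of the layouts described in the affinity.c header comment
 * above (values are hypothetical, chosen only for illustration): a P8 core
 * on chip 0x10 whose xscom node carries ibm,hw-card-id = 1 and
 * ibm,hw-module-id = 2, with no ibm,ccm-node-id property, gives
 * get_chip_node_id() = 0x10 >> 3 = 2.  Assuming the core decodes to core
 * ID 4, the helpers below would emit:
 *
 *   ibm,associativity = <4  2 1 2 0x10>       for a chip-level device
 *   ibm,associativity = <5  2 1 2 0x10 4>     for the core (extra entry)
 */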
+ * + * For GPU nodes we add a third level of NUMA, such that the + * distance of the GPU node from all other nodes is uniformly + * the highest. + */ + if (PVR_TYPE(mfspr(SPR_PVR)) == PVR_TYPE_P8E) + ref2 = 0x3; + + dt_add_property_cells(opal_node, "ibm,associativity-reference-points", + 0x4, ref2, 0x2); +} + +void add_chip_dev_associativity(struct dt_node *dev) +{ + uint32_t chip_id = dt_get_chip_id(dev); + struct proc_chip *chip = get_chip(chip_id); + uint32_t hw_cid, hw_mid; + + if (!chip) + return; + + hw_cid = dt_prop_get_u32_def(chip->devnode, "ibm,hw-card-id", 0); + hw_mid = dt_prop_get_u32_def(chip->devnode, "ibm,hw-module-id", 0); + + dt_add_property_cells(dev, "ibm,associativity", 4, + get_chip_node_id(chip), + hw_cid, hw_mid, chip_id); +} + +void add_core_associativity(struct cpu_thread *cpu) +{ + struct proc_chip *chip = get_chip(cpu->chip_id); + uint32_t hw_cid, hw_mid, core_id; + + if (!chip) + return; + + if (proc_gen == proc_gen_p8) + core_id = (cpu->pir >> 3) & 0xf; + else if (proc_gen == proc_gen_p9) + core_id = (cpu->pir >> 2) & 0x1f; + else if (proc_gen == proc_gen_p10) + core_id = (cpu->pir >> 2) & 0x1f; + else + return; + + hw_cid = dt_prop_get_u32_def(chip->devnode, "ibm,hw-card-id", 0); + hw_mid = dt_prop_get_u32_def(chip->devnode, "ibm,hw-module-id", 0); + + dt_add_property_cells(cpu->node, "ibm,associativity", 5, + get_chip_node_id(chip), + hw_cid, hw_mid, chip->id, core_id); +} diff --git a/roms/skiboot/core/bitmap.c b/roms/skiboot/core/bitmap.c new file mode 100644 index 000000000..8de1356c3 --- /dev/null +++ b/roms/skiboot/core/bitmap.c @@ -0,0 +1,44 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* Copyright 2016 IBM Corp. */ + +#include "bitmap.h" + +static int __bitmap_find_bit(bitmap_t map, unsigned int start, unsigned int count, + bool value) +{ + unsigned int el, first_bit; + unsigned int end = start + count; + bitmap_elem_t e, ev; + int b; + + ev = value ? -1ul : 0; + el = BITMAP_ELEM(start); + first_bit = BITMAP_BIT(start); + + while (start < end) { + e = map[el] ^ ev; + e |= ((1ul << first_bit) - 1); + if (~e) + break; + start = (start + BITMAP_ELSZ) & ~(BITMAP_ELSZ - 1); + first_bit = 0; + el++; + } + for (b = first_bit; b < BITMAP_ELSZ && start < end; b++,start++) { + if ((e & (1ull << b)) == 0) + return start; + } + + return -1; +} + +int bitmap_find_zero_bit(bitmap_t map, unsigned int start, unsigned int count) +{ + return __bitmap_find_bit(map, start, count, false); +} + +int bitmap_find_one_bit(bitmap_t map, unsigned int start, unsigned int count) +{ + return __bitmap_find_bit(map, start, count, true); +} + diff --git a/roms/skiboot/core/buddy.c b/roms/skiboot/core/buddy.c new file mode 100644 index 000000000..b36e407d1 --- /dev/null +++ b/roms/skiboot/core/buddy.c @@ -0,0 +1,292 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* Copyright 2016-2017 IBM Corp. */ + +#include +#include +#include +#include + +#include "buddy.h" + +#define BUDDY_DEBUG +#undef BUDDY_VERBOSE + +#ifdef BUDDY_VERBOSE +#define BUDDY_NOISE(fmt...) printf(fmt) +#else +#define BUDDY_NOISE(fmt...) 
do { } while(0) +#endif + +static inline unsigned int buddy_map_size(struct buddy *b) +{ + return 1u << (b->max_order + 1); +} + +static inline unsigned int buddy_order_start(struct buddy *b, + unsigned int order) +{ + unsigned int level = b->max_order - order; + + /* Starting bit of index for order */ + return 1u << level; +} + +static inline unsigned int buddy_index_to_node(struct buddy *b, + unsigned int index, + unsigned int order) +{ + /* Ensure the index is a multiple of the order */ + assert((index & ((1u << order) - 1)) == 0); + + return buddy_order_start(b, order) + (index >> order); +} + +static inline unsigned int buddy_node_to_index(struct buddy *b, + unsigned int node, + unsigned int order) +{ + unsigned int start = buddy_order_start(b, order); + + return (node - start) << order; +} + +#ifdef BUDDY_DEBUG +static void buddy_check_alloc(struct buddy *b, unsigned int node) +{ + assert(bitmap_tst_bit(b->map, node)); +} + +static void buddy_check_alloc_down(struct buddy *b, unsigned int node) +{ + unsigned int i, count = 1; + + while (node < buddy_map_size(b)) { + for (i = 0; i < count; i++) + buddy_check_alloc(b, node + i); + + /* Down one level */ + node <<= 1; + count <<= 1; + } +} +#else +static inline void buddy_check_alloc(struct buddy *b __unused, unsigned int node __unused) {} +static inline void buddy_check_alloc_down(struct buddy *b __unused, unsigned int node __unused) {} +#endif + +int buddy_alloc(struct buddy *b, unsigned int order) +{ + unsigned int o; + int node, index; + + BUDDY_NOISE("buddy_alloc(%d)\n", order); + /* + * Find the first order up the tree from our requested order that + * has at least one free node. + */ + for (o = order; o <= b->max_order; o++) { + if (b->freecounts[o] > 0) + break; + } + + /* Nothing found ? fail */ + if (o > b->max_order) { + BUDDY_NOISE(" no free nodes !\n"); + return -1; + } + + BUDDY_NOISE(" %d free node(s) at order %d, bits %d(%d)\n", + b->freecounts[o], o, + buddy_order_start(b, o), + 1u << (b->max_order - o)); + + /* Now find a free node */ + node = bitmap_find_zero_bit(b->map, buddy_order_start(b, o), + 1u << (b->max_order - o)); + + /* There should always be one */ + assert(node >= 0); + + /* Mark it allocated and decrease free count */ + bitmap_set_bit(b->map, node); + b->freecounts[o]--; + + /* We know that node was free which means all its children must have + * been marked "allocated". Double check. + */ + buddy_check_alloc_down(b, node); + + /* We have a node, we've marked it allocated, now we need to go down + * the tree until we reach "order" which is the order we need. For + * each level along the way, we mark the buddy free and leave the + * first child allocated. 
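/*
 * Node numbering, worked through for a small tree (max_order = 2, i.e.
 * four order-0 elements; the values follow directly from
 * buddy_order_start() and buddy_index_to_node() above):
 *
 *   node 1        : order 2, indices 0-3   (the root)
 *   nodes 2,3     : order 1, indices 0-1 and 2-3
 *   nodes 4,5,6,7 : order 0, indices 0,1,2,3
 *
 * e.g. buddy_index_to_node(b, 2, 1) = buddy_order_start(b, 1) + (2 >> 1)
 *                                   = 2 + 1 = 3.
 */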
+ */ + while (o > order) { + /* Next level down */ + o--; + node <<= 1; + + BUDDY_NOISE(" order %d, using %d marking %d free\n", + o, node, node ^ 1); + bitmap_clr_bit(b->map, node ^ 1); + b->freecounts[o]++; + assert(bitmap_tst_bit(b->map, node)); + } + + index = buddy_node_to_index(b, node, order); + + BUDDY_NOISE(" result is index %d (node %d)\n", index, node); + + /* We have a node, convert it to an element number */ + return index; +} + +bool buddy_reserve(struct buddy *b, unsigned int index, unsigned int order) +{ + unsigned int node, freenode, o; + + assert(index < (1u << b->max_order)); + + BUDDY_NOISE("buddy_reserve(%d,%d)\n", index, order); + + /* Get bit number for node */ + node = buddy_index_to_node(b, index, order); + + BUDDY_NOISE(" node=%d\n", node); + + /* Find something free */ + for (freenode = node, o = order; freenode > 0; freenode >>= 1, o++) + if (!bitmap_tst_bit(b->map, freenode)) + break; + + BUDDY_NOISE(" freenode=%d order %d\n", freenode, o); + + /* Nothing free, error out */ + if (!freenode) + return false; + + /* We sit on a free node, mark it busy */ + bitmap_set_bit(b->map, freenode); + assert(b->freecounts[o]); + b->freecounts[o]--; + + /* We know that node was free which means all its children must have + * been marked "allocated". Double check. + */ + buddy_check_alloc_down(b, freenode); + + /* Reverse-walk the path and break down nodes */ + while (o > order) { + /* Next level down */ + o--; + freenode <<= 1; + + /* Find the right one on the path to node */ + if (node & (1u << (o - order))) + freenode++; + + BUDDY_NOISE(" order %d, using %d marking %d free\n", + o, freenode, freenode ^ 1); + bitmap_clr_bit(b->map, freenode ^ 1); + b->freecounts[o]++; + assert(bitmap_tst_bit(b->map, node)); + } + assert(node == freenode); + + return true; +} + +void buddy_free(struct buddy *b, unsigned int index, unsigned int order) +{ + unsigned int node; + + assert(index < (1u << b->max_order)); + + BUDDY_NOISE("buddy_free(%d,%d)\n", index, order); + + /* Get bit number for node */ + node = buddy_index_to_node(b, index, order); + + BUDDY_NOISE(" node=%d\n", node); + + /* We assume that anything freed was fully allocated, ie, + * there is no child node of that allocation index/order + * that is already free. + * + * BUDDY_DEBUG will verify it at the cost of performances + */ + buddy_check_alloc_down(b, node); + + /* Propagate if buddy is free */ + while (order < b->max_order && !bitmap_tst_bit(b->map, node ^ 1)) { + BUDDY_NOISE(" order %d node %d buddy %d free, propagating\n", + order, node, node ^ 1); + + /* Mark buddy busy (we are already marked busy) */ + bitmap_set_bit(b->map, node ^ 1); + + /* Reduce free count */ + assert(b->freecounts[order] > 0); + b->freecounts[order]--; + + /* Get parent */ + node >>= 1; + order++; + + /* It must be busy already ! */ + buddy_check_alloc(b, node); + + BUDDY_NOISE(" testing order %d node %d\n", order, node ^ 1); + } + + /* No more coalescing, mark it free */ + bitmap_clr_bit(b->map, node); + + /* Increase the freelist count for that level */ + b->freecounts[order]++; + + BUDDY_NOISE(" free count at order %d is %d\n", + order, b->freecounts[order]); +} + +void buddy_reset(struct buddy *b) +{ + unsigned int bsize = BITMAP_BYTES(1u << (b->max_order + 1)); + + BUDDY_NOISE("buddy_reset()\n"); + /* We fill the bitmap with 1's to make it completely "busy" */ + memset(b->map, 0xff, bsize); + memset(b->freecounts, 0, sizeof(b->freecounts)); + + /* We mark the root of the tree free, this is entry 1 as entry 0 + * is unused. 
+ */ + buddy_free(b, 0, b->max_order); +} + +struct buddy *buddy_create(unsigned int max_order) +{ + struct buddy *b; + unsigned int bsize; + + assert(max_order <= BUDDY_MAX_ORDER); + + bsize = BITMAP_BYTES(1u << (max_order + 1)); + + b = zalloc(sizeof(struct buddy) + bsize); + if (!b) + return NULL; + b->max_order = max_order; + + BUDDY_NOISE("Map @%p, size: %d bytes\n", b->map, bsize); + + buddy_reset(b); + + return b; +} + +void buddy_destroy(struct buddy *b) +{ + free(b); +} + diff --git a/roms/skiboot/core/chip.c b/roms/skiboot/core/chip.c new file mode 100644 index 000000000..2d95b2e05 --- /dev/null +++ b/roms/skiboot/core/chip.c @@ -0,0 +1,190 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* Copyright 2013-2019 IBM Corp. */ + +#include +#include +#include +#include +#include +#include + +static struct proc_chip *chips[MAX_CHIPS]; +enum proc_chip_quirks proc_chip_quirks; + +uint32_t pir_to_chip_id(uint32_t pir) +{ + if (proc_gen == proc_gen_p10) + return P10_PIR2GCID(pir); + else if (proc_gen == proc_gen_p9) + return P9_PIR2GCID(pir); + else if (proc_gen == proc_gen_p8) + return P8_PIR2GCID(pir); + else + assert(false); +} + +uint32_t pir_to_core_id(uint32_t pir) +{ + if (proc_gen == proc_gen_p10) { + if (this_cpu()->is_fused_core) + return P10_PIRFUSED2NORMALCOREID(pir); + else + return P10_PIR2COREID(pir); + } else if (proc_gen == proc_gen_p9) { + if (this_cpu()->is_fused_core) + return P9_PIRFUSED2NORMALCOREID(pir); + else + return P9_PIR2COREID(pir); + } else if (proc_gen == proc_gen_p8) { + return P8_PIR2COREID(pir); + } else { + assert(false); + } +} + +uint32_t pir_to_fused_core_id(uint32_t pir) +{ + if (proc_gen == proc_gen_p10) { + if (this_cpu()->is_fused_core) + return P10_PIR2FUSEDCOREID(pir); + else + return P10_PIR2COREID(pir); + } else if (proc_gen == proc_gen_p9) { + if (this_cpu()->is_fused_core) + return P9_PIR2FUSEDCOREID(pir); + else + return P9_PIR2COREID(pir); + } else if (proc_gen == proc_gen_p8) { + return P8_PIR2COREID(pir); + } else { + assert(false); + } +} + +uint32_t pir_to_thread_id(uint32_t pir) +{ + if (proc_gen == proc_gen_p10) { + if (this_cpu()->is_fused_core) + return P10_PIRFUSED2NORMALTHREADID(pir); + else + return P10_PIR2THREADID(pir); + } else if (proc_gen == proc_gen_p9) { + if (this_cpu()->is_fused_core) + return P9_PIRFUSED2NORMALTHREADID(pir); + else + return P9_PIR2THREADID(pir); + } else if (proc_gen == proc_gen_p8) { + return P8_PIR2THREADID(pir); + } else { + assert(false); + } +} + +struct proc_chip *next_chip(struct proc_chip *chip) +{ + unsigned int i; + + for (i = chip ? (chip->id + 1) : 0; i < MAX_CHIPS; i++) + if (chips[i]) + return chips[i]; + return NULL; +} + + +struct proc_chip *get_chip(uint32_t chip_id) +{ + if (chip_id >= MAX_CHIPS) + return NULL; + return chips[chip_id]; +} + +static void init_chip(struct dt_node *dn) +{ + struct proc_chip *chip; + uint32_t id; + const char *lc = NULL; + + id = dt_get_chip_id(dn); + assert(id < MAX_CHIPS); + assert(chips[id] == NULL); + + chip = zalloc(sizeof(struct proc_chip)); + assert(chip); + + chip->id = id; + chip->devnode = dn; + + chip->dbob_id = dt_prop_get_u32_def(dn, "ibm,dbob-id", 0xffffffff); + chip->pcid = dt_prop_get_u32_def(dn, "ibm,proc-chip-id", 0xffffffff); + + if (dt_prop_get_u32_def(dn, "ibm,occ-functional-state", 0)) + chip->occ_functional = true; + else + chip->occ_functional = false; + + list_head_init(&chip->i2cms); + + /* Update the location code for this chip. 
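/*
 * Minimal usage sketch of the buddy API defined above (the function and
 * variable names here are made up for illustration; order and index values
 * are arbitrary).  A buddy of max_order N manages 2^N elements at order 0;
 * buddy_alloc() returns an element index or -1, and the same index/order
 * pair is handed back to buddy_free().
 */
static void __unused buddy_usage_example(void)
{
        struct buddy *b = buddy_create(4);      /* 16 order-0 elements */
        int idx;

        if (!b)
                return;

        /* Pin a specific range first: element 8, size 2^2 = 4 elements */
        if (!buddy_reserve(b, 8, 2))
                prlog(PR_DEBUG, "reserve failed\n");

        /* Allocate a 2-element (order 1) chunk anywhere */
        idx = buddy_alloc(b, 1);
        if (idx >= 0)
                buddy_free(b, idx, 1);          /* free with the same order */

        buddy_free(b, 8, 2);
        buddy_destroy(b);
}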
*/ + if (dt_has_node_property(dn, "ibm,loc-code", NULL)) + lc = dt_prop_get(dn, "ibm,loc-code"); + else if (dt_has_node_property(dn, "ibm,slot-location-code", NULL)) + lc = dt_prop_get(dn, "ibm,slot-location-code"); + + if (lc) + chip->loc_code = strdup(lc); + + chip->primary_topology = dt_prop_get_u32_def(dn, + "ibm,primary-topology-index", 0xffffffff); + + prlog(PR_INFO, "CHIP: Initialised chip %d from %s\n", id, dn->name); + chips[id] = chip; +} + +void init_chips(void) +{ + struct dt_node *xn; + + /* Detect mambo chip */ + if (dt_find_by_path(dt_root, "/mambo")) { + proc_chip_quirks |= QUIRK_NO_CHIPTOD | QUIRK_MAMBO_CALLOUTS + | QUIRK_NO_F000F | QUIRK_NO_PBA | QUIRK_NO_OCC_IRQ + | QUIRK_NO_RNG; + + enable_mambo_console(); + + prlog(PR_NOTICE, "CHIP: Detected Mambo simulator\n"); + + dt_for_each_compatible(dt_root, xn, "ibm,mambo-chip") + init_chip(xn); + } + + /* Detect simics */ + if (dt_find_by_path(dt_root, "/simics")) { + proc_chip_quirks |= QUIRK_SIMICS + | QUIRK_NO_PBA | QUIRK_NO_OCC_IRQ | QUIRK_SLOW_SIM; + tb_hz = 512000; + prlog(PR_NOTICE, "CHIP: Detected Simics simulator\n"); + } + /* Detect Awan emulator */ + if (dt_find_by_path(dt_root, "/awan")) { + proc_chip_quirks |= QUIRK_NO_CHIPTOD | QUIRK_NO_F000F + | QUIRK_NO_PBA | QUIRK_NO_OCC_IRQ | QUIRK_SLOW_SIM; + tb_hz = 512000; + prlog(PR_NOTICE, "CHIP: Detected Awan emulator\n"); + } + /* Detect Qemu */ + if (dt_node_is_compatible(dt_root, "qemu,powernv") || + dt_node_is_compatible(dt_root, "qemu,powernv8") || + dt_node_is_compatible(dt_root, "qemu,powernv9") || + dt_node_is_compatible(dt_root, "qemu,powernv10") || + dt_find_by_path(dt_root, "/qemu")) { + proc_chip_quirks |= QUIRK_QEMU | QUIRK_NO_CHIPTOD + | QUIRK_NO_DIRECT_CTL | QUIRK_NO_RNG; + prlog(PR_NOTICE, "CHIP: Detected QEMU simulator\n"); + } + + /* We walk the chips based on xscom nodes in the tree */ + dt_for_each_compatible(dt_root, xn, "ibm,xscom") { + init_chip(xn); + } +} diff --git a/roms/skiboot/core/console-log.c b/roms/skiboot/core/console-log.c new file mode 100644 index 000000000..21a1442bd --- /dev/null +++ b/roms/skiboot/core/console-log.c @@ -0,0 +1,71 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* + * Console Log routines + * Wraps libc and console lower level functions + * does fancy-schmancy things like timestamps and priorities + * Doesn't make waffles. + * + * Copyright 2013-2018 IBM Corp. + */ + +#include "skiboot.h" +#include "unistd.h" +#include "stdio.h" +#include "console.h" +#include "timebase.h" +#include + +static int vprlog(int log_level, const char *fmt, va_list ap) +{ + int count; + char buffer[320]; + bool flush_to_drivers = true; + unsigned long tb = mftb(); + + /* It's safe to return 0 when we "did" something here + * as only printf cares about how much we wrote, and + * if you change log_level to below PR_PRINTF then you + * get everything you deserve. + * By default, only PR_DEBUG and higher are stored in memory. + * PR_TRACE and PR_INSANE are for those having a bad day. + */ + if (log_level > (debug_descriptor.console_log_levels >> 4)) + return 0; + + count = snprintf(buffer, sizeof(buffer), "[%5lu.%09lu,%d] ", + tb_to_secs(tb), tb_remaining_nsecs(tb), log_level); + count+= vsnprintf(buffer+count, sizeof(buffer)-count, fmt, ap); + + if (log_level > (debug_descriptor.console_log_levels & 0x0f)) + flush_to_drivers = false; + + console_write(flush_to_drivers, buffer, count); + + return count; +} + +/* we don't return anything as what on earth are we going to do + * if we actually fail to print a log message? 
Print a log message about it? + * Callers shouldn't care, prlog and friends should do something generically + * sane in such crazy situations. + */ +void _prlog(int log_level, const char* fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + vprlog(log_level, fmt, ap); + va_end(ap); +} + +int _printf(const char* fmt, ...) +{ + int count; + va_list ap; + + va_start(ap, fmt); + count = vprlog(PR_PRINTF, fmt, ap); + va_end(ap); + + return count; +} diff --git a/roms/skiboot/core/console.c b/roms/skiboot/core/console.c new file mode 100644 index 000000000..2a1509025 --- /dev/null +++ b/roms/skiboot/core/console.c @@ -0,0 +1,451 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* + * Console IO routine for use by libc + * + * fd is the classic posix 0,1,2 (stdin, stdout, stderr) + * + * Copyright 2013-2018 IBM Corp. + */ + +#include +#include +#include +#include +#include +#include +#include + +static char *con_buf = (char *)INMEM_CON_START; +static size_t con_in; +static size_t con_out; +static bool con_wrapped; + +/* Internal console driver ops */ +static struct con_ops *con_driver; + +/* External (OPAL) console driver ops */ +static struct opal_con_ops *opal_con_driver = &dummy_opal_con; + +static struct lock con_lock = LOCK_UNLOCKED; + +/* This is mapped via TCEs so we keep it alone in a page */ +struct memcons memcons __section(".data.memcons") = { + .magic = CPU_TO_BE64(MEMCONS_MAGIC), + .obuf_phys = CPU_TO_BE64(INMEM_CON_START), + .ibuf_phys = CPU_TO_BE64(INMEM_CON_START + INMEM_CON_OUT_LEN), + .obuf_size = CPU_TO_BE32(INMEM_CON_OUT_LEN), + .ibuf_size = CPU_TO_BE32(INMEM_CON_IN_LEN), +}; + +static bool dummy_console_enabled(void) +{ +#ifdef FORCE_DUMMY_CONSOLE + return true; +#else + return dt_has_node_property(dt_chosen, + "sapphire,enable-dummy-console", NULL); +#endif +} + +/* + * Helper function for adding /ibm,opal/consoles/serial@ nodes + */ +struct dt_node *add_opal_console_node(int index, const char *type, + uint32_t write_buffer_size) +{ + struct dt_node *con, *consoles; + char buffer[32]; + + consoles = dt_find_by_name(opal_node, "consoles"); + if (!consoles) { + consoles = dt_new(opal_node, "consoles"); + assert(consoles); + dt_add_property_cells(consoles, "#address-cells", 1); + dt_add_property_cells(consoles, "#size-cells", 0); + } + + con = dt_new_addr(consoles, "serial", index); + assert(con); + + snprintf(buffer, sizeof(buffer), "ibm,opal-console-%s", type); + dt_add_property_string(con, "compatible", buffer); + + dt_add_property_cells(con, "#write-buffer-size", write_buffer_size); + dt_add_property_cells(con, "reg", index); + dt_add_property_string(con, "device_type", "serial"); + + return con; +} + +void clear_console(void) +{ + memset(con_buf, 0, INMEM_CON_LEN); +} + +/* + * Flush the console buffer into the driver, returns true + * if there is more to go. + * Optionally can skip flushing to drivers, leaving messages + * just in memory console. + */ +static bool __flush_console(bool flush_to_drivers, bool need_unlock) +{ + struct cpu_thread *cpu = this_cpu(); + size_t req, len = 0; + static bool in_flush, more_flush; + + /* Is there anything to flush ? Bail out early if not */ + if (con_in == con_out || !con_driver) + return false; + + /* + * Console flushing is suspended on this CPU, typically because + * some critical locks are held that would potentially cause a + * flush to deadlock + * + * Also if it recursed on con_lock (need_unlock is false). This + * can happen due to debug code firing (e.g., list or stack + * debugging). 
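/*
 * Restating the two checks in vprlog() above as a pair of hypothetical
 * helpers, to make the packing of console_log_levels explicit: the high
 * nibble is the "keep in the in-memory console" threshold, the low nibble
 * the "flush to the console driver" threshold.  The example value 0x75 is
 * an arbitrary illustration, not a default taken from this code.
 */
static inline bool log_kept_in_memory(uint8_t levels, int log_level)
{
        return log_level <= (levels >> 4);
}

static inline bool log_flushed_to_driver(uint8_t levels, int log_level)
{
        return log_level <= (levels & 0x0f);
}

/*
 * With levels = 0x75, a level-7 message is kept in memory only
 * (7 <= 7 but 7 > 5), while a level-5 message is kept and also flushed
 * to the driver.
 */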
+ */ + if (cpu->con_suspend || !need_unlock) { + cpu->con_need_flush = true; + return false; + } + cpu->con_need_flush = false; + + /* + * We must call the underlying driver with the console lock + * dropped otherwise we get some deadlocks if anything down + * that path tries to printf() something. + * + * So instead what we do is we keep a static in_flush flag + * set/released with the lock held, which is used to prevent + * concurrent attempts at flushing the same chunk of buffer + * by other processors. + */ + if (in_flush) { + more_flush = true; + return false; + } + in_flush = true; + + /* + * NB: this must appear after the in_flush check since it modifies + * con_out. + */ + if (!flush_to_drivers) { + con_out = con_in; + in_flush = false; + return false; + } + + do { + more_flush = false; + + if (con_out > con_in) { + req = INMEM_CON_OUT_LEN - con_out; + more_flush = true; + } else + req = con_in - con_out; + + unlock(&con_lock); + len = con_driver->write(con_buf + con_out, req); + lock(&con_lock); + + con_out = (con_out + len) % INMEM_CON_OUT_LEN; + + /* write error? */ + if (len < req) + break; + } while(more_flush); + + in_flush = false; + return con_out != con_in; +} + +bool flush_console(void) +{ + bool ret; + + lock(&con_lock); + ret = __flush_console(true, true); + unlock(&con_lock); + + return ret; +} + +static void inmem_write(char c) +{ + uint32_t opos; + + if (!c) + return; + con_buf[con_in++] = c; + if (con_in >= INMEM_CON_OUT_LEN) { + con_in = 0; + con_wrapped = true; + } + + /* + * We must always re-generate memcons.out_pos because + * under some circumstances, the console script will + * use a broken putmemproc that does RMW on the full + * 8 bytes containing out_pos and in_prod, thus corrupting + * out_pos + */ + opos = con_in; + if (con_wrapped) + opos |= MEMCONS_OUT_POS_WRAP; + lwsync(); + memcons.out_pos = cpu_to_be32(opos); + + /* If head reaches tail, push tail around & drop chars */ + if (con_in == con_out) + con_out = (con_in + 1) % INMEM_CON_OUT_LEN; +} + +static size_t inmem_read(char *buf, size_t req) +{ + size_t read = 0; + char *ibuf = (char *)be64_to_cpu(memcons.ibuf_phys); + + while (req && be32_to_cpu(memcons.in_prod) != be32_to_cpu(memcons.in_cons)) { + *(buf++) = ibuf[be32_to_cpu(memcons.in_cons)]; + lwsync(); + memcons.in_cons = cpu_to_be32((be32_to_cpu(memcons.in_cons) + 1) % INMEM_CON_IN_LEN); + req--; + read++; + } + return read; +} + +static void write_char(char c) +{ +#ifdef MAMBO_DEBUG_CONSOLE + mambo_console_write(&c, 1); +#endif + inmem_write(c); +} + +ssize_t console_write(bool flush_to_drivers, const void *buf, size_t count) +{ + /* We use recursive locking here as we can get called + * from fairly deep debug path + */ + bool need_unlock = lock_recursive(&con_lock); + const char *cbuf = buf; + + while(count--) { + char c = *(cbuf++); + if (c == '\n') + write_char('\r'); + write_char(c); + } + + __flush_console(flush_to_drivers, need_unlock); + + if (need_unlock) + unlock(&con_lock); + + return count; +} + +ssize_t write(int fd __unused, const void *buf, size_t count) +{ + return console_write(true, buf, count); +} + +ssize_t read(int fd __unused, void *buf, size_t req_count) +{ + bool need_unlock = lock_recursive(&con_lock); + size_t count = 0; + + if (con_driver && con_driver->read) + count = con_driver->read(buf, req_count); + if (!count) + count = inmem_read(buf, req_count); + if (need_unlock) + unlock(&con_lock); + return count; +} + +/* Helper function to perform a full synchronous flush */ +void console_complete_flush(void) +{ + /* + 
* Using term 0 here is a dumb hack that works because the UART + * only has term 0 and the FSP doesn't have an explicit flush method. + */ + int64_t ret = opal_con_driver->flush(0); + + if (ret == OPAL_UNSUPPORTED || ret == OPAL_PARAMETER) + return; + + while (ret != OPAL_SUCCESS) { + ret = opal_con_driver->flush(0); + } +} + +/* + * set_console() + * + * This sets the driver used internally by Skiboot. This is different to the + * OPAL console driver. + */ +void set_console(struct con_ops *driver) +{ + con_driver = driver; + if (driver) + flush_console(); +} + +/* + * set_opal_console() + * + * Configure the console driver to handle the console provided by the OPAL API. + * They are different to the above in that they are typically buffered, and used + * by the host OS rather than skiboot. + */ +static bool opal_cons_init = false; + +void set_opal_console(struct opal_con_ops *driver) +{ + assert(!opal_cons_init); + opal_con_driver = driver; +} + +void init_opal_console(void) +{ + assert(!opal_cons_init); + opal_cons_init = true; + + if (dummy_console_enabled() && opal_con_driver != &dummy_opal_con) { + prlog(PR_WARNING, "OPAL: Dummy console forced, %s ignored\n", + opal_con_driver->name); + + opal_con_driver = &dummy_opal_con; + } + + prlog(PR_INFO, "OPAL: Using %s\n", opal_con_driver->name); + + if (opal_con_driver->init) + opal_con_driver->init(); + + opal_register(OPAL_CONSOLE_READ, opal_con_driver->read, 3); + opal_register(OPAL_CONSOLE_WRITE, opal_con_driver->write, 3); + opal_register(OPAL_CONSOLE_FLUSH, opal_con_driver->flush, 1); + opal_register(OPAL_CONSOLE_WRITE_BUFFER_SPACE, + opal_con_driver->space, 2); +} + +void memcons_add_properties(void) +{ + dt_add_property_u64(opal_node, "ibm,opal-memcons", (u64) &memcons); +} + +/* + * The default OPAL console. + * + * In the absence of a "real" OPAL console driver we handle the OPAL_CONSOLE_* + * calls by writing into the skiboot log buffer. Reads are a little more + * complicated since they can come from the in-memory console (BML) or from the + * internal skiboot console driver. 
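/*
 * Sketch of how a consumer given the memcons address (e.g. via the
 * "ibm,opal-memcons" property exported above) could linearise the output
 * ring written by inmem_write().  The cast of obuf_phys to a pointer
 * mirrors inmem_read() above and assumes a flat mapping; the 0x00ffffff
 * position mask and the emit() callback are assumptions made for this
 * sketch, not definitions taken from this file.
 */
static void memcons_linearise(const struct memcons *mc,
                              void (*emit)(const char *buf, uint32_t len))
{
        const char *obuf = (const char *)be64_to_cpu(mc->obuf_phys);
        uint32_t size = be32_to_cpu(mc->obuf_size);
        uint32_t pos = be32_to_cpu(mc->out_pos);
        uint32_t out = pos & 0x00ffffff;        /* assumed position mask */

        if (pos & MEMCONS_OUT_POS_WRAP) {
                /* Buffer has wrapped: oldest data starts at the cursor */
                emit(obuf + out, size - out);
        }
        emit(obuf, out);
}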
+ */ +static int64_t dummy_console_write(int64_t term_number, __be64 *length, + const uint8_t *buffer) +{ + uint64_t l; + + if (term_number != 0) + return OPAL_PARAMETER; + + if (!opal_addr_valid(length) || !opal_addr_valid(buffer)) + return OPAL_PARAMETER; + + l = be64_to_cpu(*length); + write(0, buffer, l); + + return OPAL_SUCCESS; +} + +static int64_t dummy_console_write_buffer_space(int64_t term_number, + __be64 *length) +{ + if (term_number != 0) + return OPAL_PARAMETER; + + if (!opal_addr_valid(length)) + return OPAL_PARAMETER; + + if (length) + *length = cpu_to_be64(INMEM_CON_OUT_LEN); + + return OPAL_SUCCESS; +} + +static int64_t dummy_console_read(int64_t term_number, __be64 *length, + uint8_t *buffer) +{ + uint64_t l; + + if (term_number != 0) + return OPAL_PARAMETER; + + if (!opal_addr_valid(length) || !opal_addr_valid(buffer)) + return OPAL_PARAMETER; + + l = be64_to_cpu(*length); + l = read(0, buffer, l); + *length = cpu_to_be64(l); + opal_update_pending_evt(OPAL_EVENT_CONSOLE_INPUT, 0); + + return OPAL_SUCCESS; +} + +static int64_t dummy_console_flush(int64_t term_number __unused) +{ + return OPAL_UNSUPPORTED; +} + +static void dummy_console_poll(void *data __unused) +{ + bool has_data = false; + + lock(&con_lock); + if (con_driver && con_driver->poll_read) + has_data = con_driver->poll_read(); + if (memcons.in_prod != memcons.in_cons) + has_data = true; + if (has_data) + opal_update_pending_evt(OPAL_EVENT_CONSOLE_INPUT, + OPAL_EVENT_CONSOLE_INPUT); + else + opal_update_pending_evt(OPAL_EVENT_CONSOLE_INPUT, 0); + unlock(&con_lock); +} + +void dummy_console_add_nodes(void) +{ + struct dt_property *p; + + add_opal_console_node(0, "raw", be32_to_cpu(memcons.obuf_size)); + + /* Mambo might have left a crap one, clear it */ + p = __dt_find_property(dt_chosen, "linux,stdout-path"); + if (p) + dt_del_property(dt_chosen, p); + + dt_add_property_string(dt_chosen, "linux,stdout-path", + "/ibm,opal/consoles/serial@0"); + + opal_add_poller(dummy_console_poll, NULL); +} + +struct opal_con_ops dummy_opal_con = { + .name = "Dummy Console", + .init = dummy_console_add_nodes, + .read = dummy_console_read, + .write = dummy_console_write, + .space = dummy_console_write_buffer_space, + .flush = dummy_console_flush, +}; diff --git a/roms/skiboot/core/cpu.c b/roms/skiboot/core/cpu.c new file mode 100644 index 000000000..f58aeb27a --- /dev/null +++ b/roms/skiboot/core/cpu.c @@ -0,0 +1,1785 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* + * Code to manage and manipulate CPUs + * + * Copyright 2013-2019 IBM Corp. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* The cpu_threads array is static and indexed by PIR in + * order to speed up lookup from asm entry points + */ +struct cpu_stack { + union { + uint8_t stack[STACK_SIZE]; + struct cpu_thread cpu; + }; +} __align(STACK_SIZE); + +static struct cpu_stack * const cpu_stacks = (struct cpu_stack *)CPU_STACKS_BASE; +unsigned int cpu_thread_count; +unsigned int cpu_max_pir; +struct cpu_thread *boot_cpu; +static struct lock reinit_lock = LOCK_UNLOCKED; +static bool hile_supported; +static bool radix_supported; +static unsigned long hid0_hile; +static unsigned long hid0_attn; +static bool sreset_enabled; +static bool ipi_enabled; +static bool pm_enabled; +static bool current_hile_mode = HAVE_LITTLE_ENDIAN; +static bool current_radix_mode = true; +static bool tm_suspend_enabled; + +unsigned long cpu_secondary_start __force_data = 0; + +struct cpu_job { + struct list_node link; + void (*func)(void *data); + void *data; + const char *name; + bool complete; + bool no_return; +}; + +/* attribute const as cpu_stacks is constant. */ +unsigned long __attrconst cpu_stack_bottom(unsigned int pir) +{ + return ((unsigned long)&cpu_stacks[pir]) + + sizeof(struct cpu_thread) + STACK_SAFETY_GAP; +} + +unsigned long __attrconst cpu_stack_top(unsigned int pir) +{ + /* This is the top of the normal stack. */ + return ((unsigned long)&cpu_stacks[pir]) + + NORMAL_STACK_SIZE - STACK_TOP_GAP; +} + +unsigned long __attrconst cpu_emergency_stack_top(unsigned int pir) +{ + /* This is the top of the emergency stack, above the normal stack. */ + return ((unsigned long)&cpu_stacks[pir]) + + NORMAL_STACK_SIZE + EMERGENCY_STACK_SIZE - STACK_TOP_GAP; +} + +void __nomcount cpu_relax(void) +{ + /* Relax a bit to give sibling threads some breathing space */ + smt_lowest(); + asm volatile("nop; nop; nop; nop;\n" + "nop; nop; nop; nop;\n" + "nop; nop; nop; nop;\n" + "nop; nop; nop; nop;\n"); + smt_medium(); + barrier(); +} + +static void cpu_wake(struct cpu_thread *cpu) +{ + /* Is it idle ? If not, no need to wake */ + sync(); + if (!cpu->in_idle) + return; + + if (proc_gen == proc_gen_p8) { + /* Poke IPI */ + icp_kick_cpu(cpu); + } else if (proc_gen == proc_gen_p9 || proc_gen == proc_gen_p10) { + p9_dbell_send(cpu->pir); + } +} + +/* + * If chip_id is >= 0, schedule the job on that node. + * Otherwise schedule the job anywhere. + */ +static struct cpu_thread *cpu_find_job_target(int32_t chip_id) +{ + struct cpu_thread *cpu, *best, *me = this_cpu(); + uint32_t best_count; + + /* We try to find a target to run a job. We need to avoid + * a CPU that has a "no return" job on its queue as it might + * never be able to process anything. + * + * Additionally we don't check the list but the job count + * on the target CPUs, since that is decremented *after* + * a job has been completed. + */ + + + /* First we scan all available primary threads + */ + for_each_available_cpu(cpu) { + if (chip_id >= 0 && cpu->chip_id != chip_id) + continue; + if (cpu == me || !cpu_is_thread0(cpu) || cpu->job_has_no_return) + continue; + if (cpu->job_count) + continue; + lock(&cpu->job_lock); + if (!cpu->job_count) + return cpu; + unlock(&cpu->job_lock); + } + + /* Now try again with secondary threads included and keep + * track of the one with the less jobs queued up. This is + * done in a racy way, but it's just an optimization in case + * we are overcommitted on jobs. 
Could could also just pick + * a random one... + */ + best = NULL; + best_count = -1u; + for_each_available_cpu(cpu) { + if (chip_id >= 0 && cpu->chip_id != chip_id) + continue; + if (cpu == me || cpu->job_has_no_return) + continue; + if (!best || cpu->job_count < best_count) { + best = cpu; + best_count = cpu->job_count; + } + if (cpu->job_count) + continue; + lock(&cpu->job_lock); + if (!cpu->job_count) + return cpu; + unlock(&cpu->job_lock); + } + + /* We haven't found anybody, do we have a bestie ? */ + if (best) { + lock(&best->job_lock); + return best; + } + + /* Go away */ + return NULL; +} + +/* job_lock is held, returns with it released */ +static void queue_job_on_cpu(struct cpu_thread *cpu, struct cpu_job *job) +{ + /* That's bad, the job will never run */ + if (cpu->job_has_no_return) { + prlog(PR_WARNING, "WARNING ! Job %s scheduled on CPU 0x%x" + " which has a no-return job on its queue !\n", + job->name, cpu->pir); + backtrace(); + } + list_add_tail(&cpu->job_queue, &job->link); + if (job->no_return) + cpu->job_has_no_return = true; + else + cpu->job_count++; + if (pm_enabled) + cpu_wake(cpu); + unlock(&cpu->job_lock); +} + +struct cpu_job *__cpu_queue_job(struct cpu_thread *cpu, + const char *name, + void (*func)(void *data), void *data, + bool no_return) +{ + struct cpu_job *job; + +#ifdef DEBUG_SERIALIZE_CPU_JOBS + if (cpu == NULL) + cpu = this_cpu(); +#endif + + if (cpu && !cpu_is_available(cpu)) { + prerror("CPU: Tried to queue job on unavailable CPU 0x%04x\n", + cpu->pir); + return NULL; + } + + job = zalloc(sizeof(struct cpu_job)); + if (!job) + return NULL; + job->func = func; + job->data = data; + job->name = name; + job->complete = false; + job->no_return = no_return; + + /* Pick a candidate. Returns with target queue locked */ + if (cpu == NULL) + cpu = cpu_find_job_target(-1); + else if (cpu != this_cpu()) + lock(&cpu->job_lock); + else + cpu = NULL; + + /* Can't be scheduled, run it now */ + if (cpu == NULL) { + if (!this_cpu()->job_has_no_return) + this_cpu()->job_has_no_return = no_return; + func(data); + job->complete = true; + return job; + } + + queue_job_on_cpu(cpu, job); + + return job; +} + +struct cpu_job *cpu_queue_job_on_node(uint32_t chip_id, + const char *name, + void (*func)(void *data), void *data) +{ + struct cpu_thread *cpu; + struct cpu_job *job; + + job = zalloc(sizeof(struct cpu_job)); + if (!job) + return NULL; + job->func = func; + job->data = data; + job->name = name; + job->complete = false; + job->no_return = false; + + /* Pick a candidate. Returns with target queue locked */ + cpu = cpu_find_job_target(chip_id); + + /* Can't be scheduled... */ + if (cpu == NULL) { + cpu = this_cpu(); + if (cpu->chip_id == chip_id) { + /* Run it now if we're the right node. */ + func(data); + job->complete = true; + return job; + } + /* Otherwise fail. 
*/ + free(job); + return NULL; + } + + queue_job_on_cpu(cpu, job); + + return job; +} + +bool cpu_poll_job(struct cpu_job *job) +{ + lwsync(); + return job->complete; +} + +void cpu_wait_job(struct cpu_job *job, bool free_it) +{ + unsigned long time_waited = 0; + + if (!job) + return; + + while (!job->complete) { + /* This will call OPAL pollers for us */ + time_wait_ms(10); + time_waited += 10; + lwsync(); + if ((time_waited % 30000) == 0) { + prlog(PR_INFO, "cpu_wait_job(%s) for %lums\n", + job->name, time_waited); + backtrace(); + } + } + lwsync(); + + if (time_waited > 1000) + prlog(PR_DEBUG, "cpu_wait_job(%s) for %lums\n", + job->name, time_waited); + + if (free_it) + free(job); +} + +bool cpu_check_jobs(struct cpu_thread *cpu) +{ + return !list_empty_nocheck(&cpu->job_queue); +} + +void cpu_process_jobs(void) +{ + struct cpu_thread *cpu = this_cpu(); + struct cpu_job *job = NULL; + void (*func)(void *); + void *data; + + sync(); + if (!cpu_check_jobs(cpu)) + return; + + lock(&cpu->job_lock); + while (true) { + bool no_return; + + job = list_pop(&cpu->job_queue, struct cpu_job, link); + if (!job) + break; + + func = job->func; + data = job->data; + no_return = job->no_return; + unlock(&cpu->job_lock); + prlog(PR_TRACE, "running job %s on %x\n", job->name, cpu->pir); + if (no_return) + free(job); + func(data); + if (!list_empty(&cpu->locks_held)) { + if (no_return) + prlog(PR_ERR, "OPAL no-return job returned with" + "locks held!\n"); + else + prlog(PR_ERR, "OPAL job %s returning with locks held\n", + job->name); + drop_my_locks(true); + } + lock(&cpu->job_lock); + if (!no_return) { + cpu->job_count--; + lwsync(); + job->complete = true; + } + } + unlock(&cpu->job_lock); +} + +enum cpu_wake_cause { + cpu_wake_on_job, + cpu_wake_on_dec, +}; + +static unsigned int cpu_idle_p8(enum cpu_wake_cause wake_on) +{ + uint64_t lpcr = mfspr(SPR_LPCR) & ~SPR_LPCR_P8_PECE; + struct cpu_thread *cpu = this_cpu(); + unsigned int vec = 0; + + if (!pm_enabled) { + prlog_once(PR_DEBUG, "cpu_idle_p8 called pm disabled\n"); + return vec; + } + + /* Clean up ICP, be ready for IPIs */ + icp_prep_for_pm(); + + /* Synchronize with wakers */ + if (wake_on == cpu_wake_on_job) { + /* Mark ourselves in idle so other CPUs know to send an IPI */ + cpu->in_idle = true; + sync(); + + /* Check for jobs again */ + if (cpu_check_jobs(cpu) || !pm_enabled) + goto skip_sleep; + + /* Setup wakup cause in LPCR: EE (for IPI) */ + lpcr |= SPR_LPCR_P8_PECE2; + mtspr(SPR_LPCR, lpcr); + + } else { + /* Mark outselves sleeping so cpu_set_pm_enable knows to + * send an IPI + */ + cpu->in_sleep = true; + sync(); + + /* Check if PM got disabled */ + if (!pm_enabled) + goto skip_sleep; + + /* EE and DEC */ + lpcr |= SPR_LPCR_P8_PECE2 | SPR_LPCR_P8_PECE3; + mtspr(SPR_LPCR, lpcr); + } + isync(); + + /* Enter nap */ + vec = enter_p8_pm_state(false); + +skip_sleep: + /* Restore */ + sync(); + cpu->in_idle = false; + cpu->in_sleep = false; + reset_cpu_icp(); + + return vec; +} + +static unsigned int cpu_idle_p9(enum cpu_wake_cause wake_on) +{ + uint64_t lpcr = mfspr(SPR_LPCR) & ~SPR_LPCR_P9_PECE; + uint64_t psscr; + struct cpu_thread *cpu = this_cpu(); + unsigned int vec = 0; + + if (!pm_enabled) { + prlog(PR_DEBUG, "cpu_idle_p9 called on cpu 0x%04x with pm disabled\n", cpu->pir); + return vec; + } + + /* Synchronize with wakers */ + if (wake_on == cpu_wake_on_job) { + /* Mark ourselves in idle so other CPUs know to send an IPI */ + cpu->in_idle = true; + sync(); + + /* Check for jobs again */ + if (cpu_check_jobs(cpu) || !pm_enabled) + goto 
skip_sleep; + + /* HV DBELL for IPI */ + lpcr |= SPR_LPCR_P9_PECEL1; + } else { + /* Mark outselves sleeping so cpu_set_pm_enable knows to + * send an IPI + */ + cpu->in_sleep = true; + sync(); + + /* Check if PM got disabled */ + if (!pm_enabled) + goto skip_sleep; + + /* HV DBELL and DEC */ + lpcr |= SPR_LPCR_P9_PECEL1 | SPR_LPCR_P9_PECEL3; + } + + mtspr(SPR_LPCR, lpcr); + isync(); + + if (sreset_enabled) { + /* stop with EC=1 (sreset) and ESL=1 (enable thread switch). */ + /* PSSCR SD=0 ESL=1 EC=1 PSSL=0 TR=3 MTL=0 RL=1 */ + psscr = PPC_BIT(42) | PPC_BIT(43) | + PPC_BITMASK(54, 55) | PPC_BIT(63); + vec = enter_p9_pm_state(psscr); + } else { + /* stop with EC=0 (resumes) which does not require sreset. */ + /* PSSCR SD=0 ESL=0 EC=0 PSSL=0 TR=3 MTL=0 RL=1 */ + psscr = PPC_BITMASK(54, 55) | PPC_BIT(63); + enter_p9_pm_lite_state(psscr); + } + + /* Clear doorbell */ + p9_dbell_receive(); + + skip_sleep: + /* Restore */ + sync(); + cpu->in_idle = false; + cpu->in_sleep = false; + + return vec; +} + +static void cpu_idle_pm(enum cpu_wake_cause wake_on) +{ + unsigned int vec; + + switch(proc_gen) { + case proc_gen_p8: + vec = cpu_idle_p8(wake_on); + break; + case proc_gen_p9: + vec = cpu_idle_p9(wake_on); + break; + case proc_gen_p10: + vec = cpu_idle_p9(wake_on); + break; + default: + vec = 0; + prlog_once(PR_DEBUG, "cpu_idle_pm called with bad processor type\n"); + break; + } + + if (vec == 0x100) { + unsigned long srr1 = mfspr(SPR_SRR1); + + switch (srr1 & SPR_SRR1_PM_WAKE_MASK) { + case SPR_SRR1_PM_WAKE_SRESET: + exception_entry_pm_sreset(); + break; + default: + break; + } + mtmsrd(MSR_RI, 1); + + } else if (vec == 0x200) { + exception_entry_pm_mce(); + enable_machine_check(); + mtmsrd(MSR_RI, 1); + } +} + +void cpu_idle_job(void) +{ + if (pm_enabled) { + cpu_idle_pm(cpu_wake_on_job); + } else { + struct cpu_thread *cpu = this_cpu(); + + smt_lowest(); + /* Check for jobs again */ + while (!cpu_check_jobs(cpu)) { + if (pm_enabled) + break; + cpu_relax(); + barrier(); + } + smt_medium(); + } +} + +void cpu_idle_delay(unsigned long delay) +{ + unsigned long now = mftb(); + unsigned long end = now + delay; + unsigned long min_pm = usecs_to_tb(10); + + if (pm_enabled && delay > min_pm) { +pm: + for (;;) { + if (delay >= 0x7fffffff) + delay = 0x7fffffff; + mtspr(SPR_DEC, delay); + + cpu_idle_pm(cpu_wake_on_dec); + + now = mftb(); + if (tb_compare(now, end) == TB_AAFTERB) + break; + delay = end - now; + if (!(pm_enabled && delay > min_pm)) + goto no_pm; + } + } else { +no_pm: + smt_lowest(); + for (;;) { + now = mftb(); + if (tb_compare(now, end) == TB_AAFTERB) + break; + delay = end - now; + if (pm_enabled && delay > min_pm) { + smt_medium(); + goto pm; + } + } + smt_medium(); + } +} + +static void cpu_pm_disable(void) +{ + struct cpu_thread *cpu; + unsigned int timeout; + + pm_enabled = false; + sync(); + + if (proc_gen == proc_gen_p8) { + for_each_available_cpu(cpu) { + while (cpu->in_sleep || cpu->in_idle) { + icp_kick_cpu(cpu); + cpu_relax(); + } + } + } else if (proc_gen == proc_gen_p9 || proc_gen == proc_gen_p10) { + for_each_available_cpu(cpu) { + if (cpu->in_sleep || cpu->in_idle) + p9_dbell_send(cpu->pir); + } + + /* This code is racy with cpus entering idle, late ones miss the dbell */ + + smt_lowest(); + for_each_available_cpu(cpu) { + timeout = 0x08000000; + while ((cpu->in_sleep || cpu->in_idle) && --timeout) + barrier(); + if (!timeout) { + prlog(PR_DEBUG, "cpu_pm_disable TIMEOUT on cpu 0x%04x to exit idle\n", + cpu->pir); + p9_dbell_send(cpu->pir); + } + } + smt_medium(); + } +} + 
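/*
 * Minimal usage sketch of the job API above: queue a function on another
 * available CPU (or on a CPU of a given chip) and wait for completion.
 * The payload and helper names, and the chip id 0, are made up for
 * illustration.
 */
static void say_hello(void *data)
{
        prlog(PR_INFO, "hello from CPU 0x%04x (arg %p)\n",
              this_cpu()->pir, data);
}

static void __unused job_api_example(void)
{
        struct cpu_job *job;

        /* cpu == NULL lets cpu_find_job_target() pick any available CPU */
        job = __cpu_queue_job(NULL, "say-hello", say_hello, NULL, false);
        if (job)
                cpu_wait_job(job, true);        /* waits, then frees the job */

        /* Or prefer a CPU on a particular chip/node */
        job = cpu_queue_job_on_node(0, "say-hello", say_hello, NULL);
        if (job)
                cpu_wait_job(job, true);
}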
+void cpu_set_sreset_enable(bool enabled) +{ + if (sreset_enabled == enabled) + return; + + if (proc_gen == proc_gen_p8) { + /* Public P8 Mambo has broken NAP */ + if (chip_quirk(QUIRK_MAMBO_CALLOUTS)) + return; + + sreset_enabled = enabled; + sync(); + + if (!enabled) { + cpu_pm_disable(); + } else { + if (ipi_enabled) + pm_enabled = true; + } + + } else if (proc_gen == proc_gen_p9 || proc_gen == proc_gen_p10) { + sreset_enabled = enabled; + sync(); + /* + * Kick everybody out of PM so they can adjust the PM + * mode they are using (EC=0/1). + */ + cpu_pm_disable(); + if (ipi_enabled) + pm_enabled = true; + } +} + +void cpu_set_ipi_enable(bool enabled) +{ + if (ipi_enabled == enabled) + return; + + if (proc_gen == proc_gen_p8) { + ipi_enabled = enabled; + sync(); + if (!enabled) { + cpu_pm_disable(); + } else { + if (sreset_enabled) + pm_enabled = true; + } + + } else if (proc_gen == proc_gen_p9 || proc_gen == proc_gen_p10) { + ipi_enabled = enabled; + sync(); + if (!enabled) + cpu_pm_disable(); + else + pm_enabled = true; + } +} + +void cpu_process_local_jobs(void) +{ + struct cpu_thread *cpu = first_available_cpu(); + + while (cpu) { + if (cpu != this_cpu()) + return; + + cpu = next_available_cpu(cpu); + } + + if (!cpu) + cpu = first_available_cpu(); + + /* No CPU to run on, just run synchro */ + if (cpu == this_cpu()) { + prlog_once(PR_DEBUG, "Processing jobs synchronously\n"); + cpu_process_jobs(); + opal_run_pollers(); + } +} + + +struct dt_node *get_cpu_node(u32 pir) +{ + struct cpu_thread *t = find_cpu_by_pir(pir); + + return t ? t->node : NULL; +} + +/* This only covers primary, active cpus */ +struct cpu_thread *find_cpu_by_chip_id(u32 chip_id) +{ + struct cpu_thread *t; + + for_each_available_cpu(t) { + if (t->is_secondary) + continue; + if (t->chip_id == chip_id) + return t; + } + return NULL; +} + +struct cpu_thread *find_cpu_by_node(struct dt_node *cpu) +{ + struct cpu_thread *t; + + for_each_available_cpu(t) { + if (t->node == cpu) + return t; + } + return NULL; +} + +struct cpu_thread *find_cpu_by_pir(u32 pir) +{ + if (pir > cpu_max_pir) + return NULL; + return &cpu_stacks[pir].cpu; +} + +struct cpu_thread __nomcount *find_cpu_by_pir_nomcount(u32 pir) +{ + if (pir > cpu_max_pir) + return NULL; + return &cpu_stacks[pir].cpu; +} + +struct cpu_thread *find_cpu_by_server(u32 server_no) +{ + struct cpu_thread *t; + + for_each_cpu(t) { + if (t->server_no == server_no) + return t; + } + return NULL; +} + +struct cpu_thread *next_cpu(struct cpu_thread *cpu) +{ + struct cpu_stack *s; + unsigned int index = 0; + + if (cpu != NULL) { + s = container_of(cpu, struct cpu_stack, cpu); + index = s - cpu_stacks + 1; + } + for (; index <= cpu_max_pir; index++) { + cpu = &cpu_stacks[index].cpu; + if (cpu->state != cpu_state_no_cpu) + return cpu; + } + return NULL; +} + +struct cpu_thread *first_cpu(void) +{ + return next_cpu(NULL); +} + +struct cpu_thread *next_available_cpu(struct cpu_thread *cpu) +{ + do { + cpu = next_cpu(cpu); + } while(cpu && !cpu_is_available(cpu)); + + return cpu; +} + +struct cpu_thread *first_available_cpu(void) +{ + return next_available_cpu(NULL); +} + +struct cpu_thread *next_present_cpu(struct cpu_thread *cpu) +{ + do { + cpu = next_cpu(cpu); + } while(cpu && !cpu_is_present(cpu)); + + return cpu; +} + +struct cpu_thread *first_present_cpu(void) +{ + return next_present_cpu(NULL); +} + +struct cpu_thread *next_ungarded_cpu(struct cpu_thread *cpu) +{ + do { + cpu = next_cpu(cpu); + } while(cpu && cpu->state == cpu_state_unavailable); + + return cpu; +} + +struct 
cpu_thread *first_ungarded_cpu(void) +{ + return next_ungarded_cpu(NULL); +} + +struct cpu_thread *next_ungarded_primary(struct cpu_thread *cpu) +{ + do { + cpu = next_ungarded_cpu(cpu); + } while (cpu && !(cpu == cpu->primary || cpu == cpu->ec_primary)); + + return cpu; +} + +struct cpu_thread *first_ungarded_primary(void) +{ + return next_ungarded_primary(NULL); +} + +u8 get_available_nr_cores_in_chip(u32 chip_id) +{ + struct cpu_thread *core; + u8 nr_cores = 0; + + for_each_available_core_in_chip(core, chip_id) + nr_cores++; + + return nr_cores; +} + +struct cpu_thread *next_available_core_in_chip(struct cpu_thread *core, + u32 chip_id) +{ + do { + core = next_cpu(core); + } while(core && (!cpu_is_available(core) || + core->chip_id != chip_id || + core->is_secondary)); + return core; +} + +struct cpu_thread *first_available_core_in_chip(u32 chip_id) +{ + return next_available_core_in_chip(NULL, chip_id); +} + +uint32_t cpu_get_core_index(struct cpu_thread *cpu) +{ + return pir_to_fused_core_id(cpu->pir); +} + +void cpu_remove_node(const struct cpu_thread *t) +{ + struct dt_node *i; + + /* Find this cpu node */ + dt_for_each_node(dt_root, i) { + const struct dt_property *p; + + if (!dt_has_node_property(i, "device_type", "cpu")) + continue; + p = dt_find_property(i, "ibm,pir"); + if (!p) + continue; + if (dt_property_get_cell(p, 0) == t->pir) { + dt_free(i); + return; + } + } + prerror("CPU: Could not find cpu node %i to remove!\n", t->pir); + abort(); +} + +void cpu_disable_all_threads(struct cpu_thread *cpu) +{ + unsigned int i; + struct dt_property *p; + + for (i = 0; i <= cpu_max_pir; i++) { + struct cpu_thread *t = &cpu_stacks[i].cpu; + + if (t->primary == cpu->primary) + t->state = cpu_state_disabled; + + } + + /* Mark this core as bad so that Linux kernel don't use this CPU. */ + prlog(PR_DEBUG, "CPU: Mark CPU bad (PIR 0x%04x)...\n", cpu->pir); + p = __dt_find_property(cpu->node, "status"); + if (p) + dt_del_property(cpu->node, p); + + dt_add_property_string(cpu->node, "status", "bad"); + + /* XXX Do something to actually stop the core */ +} + +static void init_cpu_thread(struct cpu_thread *t, + enum cpu_thread_state state, + unsigned int pir) +{ + /* offset within cpu_thread to prevent stack_guard clobber */ + const size_t guard_skip = container_off_var(t, stack_guard) + + sizeof(t->stack_guard); + + memset(((void *)t) + guard_skip, 0, sizeof(struct cpu_thread) - guard_skip); + init_lock(&t->dctl_lock); + init_lock(&t->job_lock); + list_head_init(&t->job_queue); + list_head_init(&t->locks_held); + t->stack_guard = STACK_CHECK_GUARD_BASE ^ pir; + t->state = state; + t->pir = pir; +#ifdef STACK_CHECK_ENABLED + t->stack_bot_mark = LONG_MAX; +#endif + t->is_fused_core = is_fused_core(mfspr(SPR_PVR)); + assert(pir == container_of(t, struct cpu_stack, cpu) - cpu_stacks); +} + +static void enable_attn(void) +{ + unsigned long hid0; + + hid0 = mfspr(SPR_HID0); + hid0 |= hid0_attn; + set_hid0(hid0); +} + +static void disable_attn(void) +{ + unsigned long hid0; + + hid0 = mfspr(SPR_HID0); + hid0 &= ~hid0_attn; + set_hid0(hid0); +} + +extern void __trigger_attn(void); +void trigger_attn(void) +{ + enable_attn(); + __trigger_attn(); +} + +static void init_hid(void) +{ + /* attn is enabled even when HV=0, so make sure it's off */ + disable_attn(); +} + +void __nomcount pre_init_boot_cpu(void) +{ + struct cpu_thread *cpu = this_cpu(); + + /* We skip the stack guard ! 
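 * (stack_guard is assumed to occupy the first 8 bytes of struct cpu_thread;
 * its canary value must survive this early clear, and init_cpu_thread()
 * skips it the same way via container_off_var())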
*/ + memset(((void *)cpu) + 8, 0, sizeof(struct cpu_thread) - 8); +} + +void init_boot_cpu(void) +{ + unsigned int pir, pvr; + + pir = mfspr(SPR_PIR); + pvr = mfspr(SPR_PVR); + + /* Get CPU family and other flags based on PVR */ + switch(PVR_TYPE(pvr)) { + case PVR_TYPE_P8E: + case PVR_TYPE_P8: + proc_gen = proc_gen_p8; + hile_supported = PVR_VERS_MAJ(mfspr(SPR_PVR)) >= 2; + hid0_hile = SPR_HID0_POWER8_HILE; + hid0_attn = SPR_HID0_POWER8_ENABLE_ATTN; + break; + case PVR_TYPE_P8NVL: + proc_gen = proc_gen_p8; + hile_supported = true; + hid0_hile = SPR_HID0_POWER8_HILE; + hid0_attn = SPR_HID0_POWER8_ENABLE_ATTN; + break; + case PVR_TYPE_P9: + case PVR_TYPE_P9P: + proc_gen = proc_gen_p9; + hile_supported = true; + radix_supported = true; + hid0_hile = SPR_HID0_POWER9_HILE; + hid0_attn = SPR_HID0_POWER9_ENABLE_ATTN; + break; + case PVR_TYPE_P10: + proc_gen = proc_gen_p10; + hile_supported = true; + radix_supported = true; + hid0_hile = SPR_HID0_POWER10_HILE; + hid0_attn = SPR_HID0_POWER10_ENABLE_ATTN; + break; + default: + proc_gen = proc_gen_unknown; + } + + /* Get a CPU thread count based on family */ + switch(proc_gen) { + case proc_gen_p8: + cpu_thread_count = 8; + prlog(PR_INFO, "CPU: P8 generation processor" + " (max %d threads/core)\n", cpu_thread_count); + break; + case proc_gen_p9: + if (is_fused_core(pvr)) + cpu_thread_count = 8; + else + cpu_thread_count = 4; + prlog(PR_INFO, "CPU: P9 generation processor" + " (max %d threads/core)\n", cpu_thread_count); + break; + case proc_gen_p10: + if (is_fused_core(pvr)) + cpu_thread_count = 8; + else + cpu_thread_count = 4; + prlog(PR_INFO, "CPU: P10 generation processor" + " (max %d threads/core)\n", cpu_thread_count); + break; + default: + prerror("CPU: Unknown PVR, assuming 1 thread\n"); + cpu_thread_count = 1; + } + + if (is_power9n(pvr) && (PVR_VERS_MAJ(pvr) == 1)) { + prerror("CPU: POWER9N DD1 is not supported\n"); + abort(); + } + + prlog(PR_DEBUG, "CPU: Boot CPU PIR is 0x%04x PVR is 0x%08x\n", + pir, pvr); + + /* + * Adjust top of RAM to include the boot CPU stack. If we have less + * RAM than this, it's not possible to boot. + */ + cpu_max_pir = pir; + top_of_ram += (cpu_max_pir + 1) * STACK_SIZE; + + /* Setup boot CPU state */ + boot_cpu = &cpu_stacks[pir].cpu; + init_cpu_thread(boot_cpu, cpu_state_active, pir); + init_boot_tracebuf(boot_cpu); + assert(this_cpu() == boot_cpu); + init_hid(); +} + +static void enable_large_dec(bool on) +{ + u64 lpcr = mfspr(SPR_LPCR); + + if (on) + lpcr |= SPR_LPCR_P9_LD; + else + lpcr &= ~SPR_LPCR_P9_LD; + + mtspr(SPR_LPCR, lpcr); + isync(); +} + +#define HIGH_BIT (1ull << 63) + +static int find_dec_bits(void) +{ + int bits = 65; /* we always decrement once */ + u64 mask = ~0ull; + + if (proc_gen < proc_gen_p9) + return 32; + + /* The ISA doesn't specify the width of the decrementer register so we + * need to discover it. When in large mode (LPCR.LD = 1) reads from the + * DEC SPR are sign extended to 64 bits and writes are truncated to the + * physical register width. We can use this behaviour to detect the + * width by starting from an all 1s value and left shifting until we + * read a value from the DEC with it's high bit cleared. 
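 *
 * As a worked example, assuming an implementation with a 56-bit decrementer:
 * the loop exits once the written mask no longer sets the register's own
 * sign bit (bit 55), which happens on the ninth pass, leaving bits == 56.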
+ */ + + enable_large_dec(true); + + do { + bits--; + mask = mask >> 1; + mtspr(SPR_DEC, mask); + } while (mfspr(SPR_DEC) & HIGH_BIT); + + enable_large_dec(false); + + prlog(PR_DEBUG, "CPU: decrementer bits %d\n", bits); + return bits; +} + +static void init_tm_suspend_mode_property(void) +{ + struct dt_node *node; + + /* If we don't find anything, assume TM suspend is enabled */ + tm_suspend_enabled = true; + + node = dt_find_by_path(dt_root, "/ibm,opal/fw-features/tm-suspend-mode"); + if (!node) + return; + + if (dt_find_property(node, "disabled")) + tm_suspend_enabled = false; +} + +void init_cpu_max_pir(void) +{ + struct dt_node *cpus, *cpu; + + cpus = dt_find_by_path(dt_root, "/cpus"); + assert(cpus); + + /* Iterate all CPUs in the device-tree */ + dt_for_each_child(cpus, cpu) { + unsigned int pir, server_no; + + /* Skip cache nodes */ + if (strcmp(dt_prop_get(cpu, "device_type"), "cpu")) + continue; + + server_no = dt_prop_get_u32(cpu, "reg"); + + /* If PIR property is absent, assume it's the same as the + * server number + */ + pir = dt_prop_get_u32_def(cpu, "ibm,pir", server_no); + + if (cpu_max_pir < pir + cpu_thread_count - 1) + cpu_max_pir = pir + cpu_thread_count - 1; + } + + prlog(PR_DEBUG, "CPU: New max PIR set to 0x%x\n", cpu_max_pir); +} + +/* + * Set cpu->state to cpu_state_no_cpu for all secondaries, before the dt is + * parsed and they will be flipped to present as populated CPUs are found. + * + * Some configurations (e.g., with memory encryption) will not zero system + * memory at boot, so can't rely on cpu->state to be zero (== cpu_state_no_cpu). + */ +static void mark_all_secondary_cpus_absent(void) +{ + unsigned int pir; + struct cpu_thread *cpu; + + for (pir = 0; pir <= cpu_max_pir; pir++) { + cpu = &cpu_stacks[pir].cpu; + if (cpu == boot_cpu) + continue; + cpu->state = cpu_state_no_cpu; + } +} + +void init_all_cpus(void) +{ + struct dt_node *cpus, *cpu; + unsigned int pir, thread; + int dec_bits = find_dec_bits(); + + cpus = dt_find_by_path(dt_root, "/cpus"); + assert(cpus); + + init_tm_suspend_mode_property(); + + mark_all_secondary_cpus_absent(); + + /* Iterate all CPUs in the device-tree */ + dt_for_each_child(cpus, cpu) { + unsigned int server_no, chip_id, threads; + enum cpu_thread_state state; + const struct dt_property *p; + struct cpu_thread *t, *pt0, *pt1; + + /* Skip cache nodes */ + if (strcmp(dt_prop_get(cpu, "device_type"), "cpu")) + continue; + + server_no = dt_prop_get_u32(cpu, "reg"); + + /* If PIR property is absent, assume it's the same as the + * server number + */ + pir = dt_prop_get_u32_def(cpu, "ibm,pir", server_no); + + /* We should always have an ibm,chip-id property */ + chip_id = dt_get_chip_id(cpu); + + /* Only use operational CPUs */ + if (!strcmp(dt_prop_get(cpu, "status"), "okay")) { + state = cpu_state_present; + get_chip(chip_id)->ex_present = true; + } else { + state = cpu_state_unavailable; + } + + prlog(PR_INFO, "CPU: CPU from DT PIR=0x%04x Server#=0x%x" + " State=%d\n", pir, server_no, state); + + /* Check max PIR */ + if (cpu_max_pir < (pir + cpu_thread_count - 1)) { + prlog(PR_WARNING, "CPU: CPU potentially out of range" + "PIR=0x%04x MAX=0x%04x !\n", + pir, cpu_max_pir); + continue; + } + + /* Setup thread 0 */ + assert(pir <= cpu_max_pir); + t = pt0 = &cpu_stacks[pir].cpu; + if (t != boot_cpu) { + init_cpu_thread(t, state, pir); + /* Each cpu gets its own later in init_trace_buffers */ + t->trace = boot_cpu->trace; + } + if (t->is_fused_core) + pt1 = &cpu_stacks[pir + 1].cpu; + else + pt1 = pt0; + t->server_no = server_no; + 
t->primary = t->ec_primary = t; + t->node = cpu; + t->chip_id = chip_id; + t->icp_regs = NULL; /* Will be set later */ +#ifdef DEBUG_LOCKS + t->requested_lock = NULL; +#endif + t->core_hmi_state = 0; + t->core_hmi_state_ptr = &t->core_hmi_state; + + /* Add associativity properties */ + add_core_associativity(t); + + /* Add the decrementer width property */ + dt_add_property_cells(cpu, "ibm,dec-bits", dec_bits); + + if (t->is_fused_core) + dt_add_property(t->node, "ibm,fused-core", NULL, 0); + + /* Iterate threads */ + p = dt_find_property(cpu, "ibm,ppc-interrupt-server#s"); + if (!p) + continue; + threads = p->len / 4; + if (threads > cpu_thread_count) { + prlog(PR_WARNING, "CPU: Threads out of range for PIR 0x%04x" + " threads=%d max=%d\n", + pir, threads, cpu_thread_count); + threads = cpu_thread_count; + } + for (thread = 1; thread < threads; thread++) { + prlog(PR_TRACE, "CPU: secondary thread %d found\n", + thread); + t = &cpu_stacks[pir + thread].cpu; + init_cpu_thread(t, state, pir + thread); + t->trace = boot_cpu->trace; + t->server_no = dt_property_get_cell(p, thread); + t->is_secondary = true; + t->is_fused_core = pt0->is_fused_core; + t->primary = pt0; + t->ec_primary = (thread & 1) ? pt1 : pt0; + t->node = cpu; + t->chip_id = chip_id; + t->core_hmi_state_ptr = &pt0->core_hmi_state; + } + prlog(PR_INFO, "CPU: %d secondary threads\n", thread); + } +} + +void cpu_bringup(void) +{ + struct cpu_thread *t; + uint32_t count = 0; + + prlog(PR_INFO, "CPU: Setting up secondary CPU state\n"); + + op_display(OP_LOG, OP_MOD_CPU, 0x0000); + + /* Tell everybody to chime in ! */ + prlog(PR_INFO, "CPU: Calling in all processors...\n"); + cpu_secondary_start = 1; + sync(); + + op_display(OP_LOG, OP_MOD_CPU, 0x0002); + + for_each_cpu(t) { + if (t->state != cpu_state_present && + t->state != cpu_state_active) + continue; + + /* Add a callin timeout ? If so, call cpu_remove_node(t). 
*/ + while (t->state != cpu_state_active) { + smt_lowest(); + sync(); + } + smt_medium(); + count++; + } + + prlog(PR_NOTICE, "CPU: All %d processors called in...\n", count); + + op_display(OP_LOG, OP_MOD_CPU, 0x0003); +} + +void cpu_callin(struct cpu_thread *cpu) +{ + sync(); + cpu->state = cpu_state_active; + sync(); + + cpu->job_has_no_return = false; + if (cpu_is_thread0(cpu)) + init_hid(); +} + +static void opal_start_thread_job(void *data) +{ + cpu_give_self_os(); + + /* We do not return, so let's mark the job as + * complete + */ + start_kernel_secondary((uint64_t)data); +} + +static int64_t opal_start_cpu_thread(uint64_t server_no, uint64_t start_address) +{ + struct cpu_thread *cpu; + struct cpu_job *job; + + if (!opal_addr_valid((void *)start_address)) + return OPAL_PARAMETER; + + cpu = find_cpu_by_server(server_no); + if (!cpu) { + prerror("OPAL: Start invalid CPU 0x%04llx !\n", server_no); + return OPAL_PARAMETER; + } + prlog(PR_DEBUG, "OPAL: Start CPU 0x%04llx (PIR 0x%04x) -> 0x%016llx\n", + server_no, cpu->pir, start_address); + + lock(&reinit_lock); + if (!cpu_is_available(cpu)) { + unlock(&reinit_lock); + prerror("OPAL: CPU not active in OPAL !\n"); + return OPAL_WRONG_STATE; + } + if (cpu->in_reinit) { + unlock(&reinit_lock); + prerror("OPAL: CPU being reinitialized !\n"); + return OPAL_WRONG_STATE; + } + job = __cpu_queue_job(cpu, "start_thread", + opal_start_thread_job, (void *)start_address, + true); + unlock(&reinit_lock); + if (!job) { + prerror("OPAL: Failed to create CPU start job !\n"); + return OPAL_INTERNAL_ERROR; + } + return OPAL_SUCCESS; +} +opal_call(OPAL_START_CPU, opal_start_cpu_thread, 2); + +static int64_t opal_query_cpu_status(uint64_t server_no, uint8_t *thread_status) +{ + struct cpu_thread *cpu; + + if (!opal_addr_valid(thread_status)) + return OPAL_PARAMETER; + + cpu = find_cpu_by_server(server_no); + if (!cpu) { + prerror("OPAL: Query invalid CPU 0x%04llx !\n", server_no); + return OPAL_PARAMETER; + } + if (!cpu_is_available(cpu) && cpu->state != cpu_state_os) { + prerror("OPAL: CPU not active in OPAL nor OS !\n"); + return OPAL_PARAMETER; + } + switch(cpu->state) { + case cpu_state_os: + *thread_status = OPAL_THREAD_STARTED; + break; + case cpu_state_active: + /* Active in skiboot -> inactive in OS */ + *thread_status = OPAL_THREAD_INACTIVE; + break; + default: + *thread_status = OPAL_THREAD_UNAVAILABLE; + } + + return OPAL_SUCCESS; +} +opal_call(OPAL_QUERY_CPU_STATUS, opal_query_cpu_status, 2); + +static int64_t opal_return_cpu(void) +{ + prlog(PR_DEBUG, "OPAL: Returning CPU 0x%04x\n", this_cpu()->pir); + + this_cpu()->in_opal_call--; + if (this_cpu()->in_opal_call != 0) { + printf("OPAL in_opal_call=%u\n", this_cpu()->in_opal_call); + } + + __secondary_cpu_entry(); + + return OPAL_HARDWARE; /* Should not happen */ +} +opal_call(OPAL_RETURN_CPU, opal_return_cpu, 0); + +struct hid0_change_req { + uint64_t clr_bits; + uint64_t set_bits; +}; + +static void cpu_change_hid0(void *__req) +{ + struct hid0_change_req *req = __req; + unsigned long hid0, new_hid0; + + hid0 = new_hid0 = mfspr(SPR_HID0); + new_hid0 &= ~req->clr_bits; + new_hid0 |= req->set_bits; + prlog(PR_DEBUG, "CPU: [%08x] HID0 change 0x%016lx -> 0x%016lx\n", + this_cpu()->pir, hid0, new_hid0); + set_hid0(new_hid0); +} + +static int64_t cpu_change_all_hid0(struct hid0_change_req *req) +{ + struct cpu_thread *cpu; + struct cpu_job **jobs; + + jobs = zalloc(sizeof(struct cpu_job *) * (cpu_max_pir + 1)); + assert(jobs); + + for_each_available_cpu(cpu) { + if (!cpu_is_thread0(cpu) && 
!cpu_is_core_chiplet_primary(cpu)) + continue; + if (cpu == this_cpu()) + continue; + jobs[cpu->pir] = cpu_queue_job(cpu, "cpu_change_hid0", + cpu_change_hid0, req); + } + + /* this cpu */ + cpu_change_hid0(req); + + for_each_available_cpu(cpu) { + if (jobs[cpu->pir]) + cpu_wait_job(jobs[cpu->pir], true); + } + + free(jobs); + + return OPAL_SUCCESS; +} + +void cpu_set_hile_mode(bool hile) +{ + struct hid0_change_req req; + + if (hile == current_hile_mode) + return; + + if (hile) { + req.clr_bits = 0; + req.set_bits = hid0_hile; + } else { + req.clr_bits = hid0_hile; + req.set_bits = 0; + } + cpu_change_all_hid0(&req); + current_hile_mode = hile; +} + +static void cpu_cleanup_one(void *param __unused) +{ + mtspr(SPR_AMR, 0); + mtspr(SPR_IAMR, 0); + mtspr(SPR_PCR, 0); +} + +static int64_t cpu_cleanup_all(void) +{ + struct cpu_thread *cpu; + struct cpu_job **jobs; + + jobs = zalloc(sizeof(struct cpu_job *) * (cpu_max_pir + 1)); + assert(jobs); + + for_each_available_cpu(cpu) { + if (cpu == this_cpu()) + continue; + jobs[cpu->pir] = cpu_queue_job(cpu, "cpu_cleanup", + cpu_cleanup_one, NULL); + } + + /* this cpu */ + cpu_cleanup_one(NULL); + + for_each_available_cpu(cpu) { + if (jobs[cpu->pir]) + cpu_wait_job(jobs[cpu->pir], true); + } + + free(jobs); + + + return OPAL_SUCCESS; +} + +void cpu_fast_reboot_complete(void) +{ + /* Fast reboot will have set HID0:HILE to skiboot endian */ + current_hile_mode = HAVE_LITTLE_ENDIAN; + + /* and set HID0:RADIX */ + if (proc_gen == proc_gen_p9) + current_radix_mode = true; +} + +static int64_t opal_reinit_cpus(uint64_t flags) +{ + struct hid0_change_req req = { 0, 0 }; + struct cpu_thread *cpu; + int64_t rc = OPAL_SUCCESS; + int i; + + prlog(PR_DEBUG, "OPAL: CPU re-init with flags: 0x%llx\n", flags); + + if (flags & OPAL_REINIT_CPUS_HILE_LE) + prlog(PR_INFO, "OPAL: Switch to little-endian OS\n"); + else if (flags & OPAL_REINIT_CPUS_HILE_BE) + prlog(PR_INFO, "OPAL: Switch to big-endian OS\n"); + + again: + lock(&reinit_lock); + + for (cpu = first_cpu(); cpu; cpu = next_cpu(cpu)) { + if (cpu == this_cpu() || cpu->in_reinit) + continue; + if (cpu->state == cpu_state_os) { + unlock(&reinit_lock); + /* + * That might be a race with return CPU during kexec + * where we are still, wait a bit and try again + */ + for (i = 0; (i < 1000) && + (cpu->state == cpu_state_os); i++) { + time_wait_ms(1); + } + if (cpu->state == cpu_state_os) { + prerror("OPAL: CPU 0x%x not in OPAL !\n", cpu->pir); + return OPAL_WRONG_STATE; + } + goto again; + } + cpu->in_reinit = true; + } + /* + * Now we need to mark ourselves "active" or we'll be skipped + * by the various "for_each_active_..." calls done by slw_reinit() + */ + this_cpu()->state = cpu_state_active; + this_cpu()->in_reinit = true; + unlock(&reinit_lock); + + /* + * This cleans up a few things left over by Linux + * that can cause problems in cases such as radix->hash + * transitions. Ideally Linux should do it but doing it + * here works around existing broken kernels. + */ + cpu_cleanup_all(); + + /* If HILE change via HID0 is supported ... 
*/ + if (hile_supported && + (flags & (OPAL_REINIT_CPUS_HILE_BE | + OPAL_REINIT_CPUS_HILE_LE))) { + bool hile = !!(flags & OPAL_REINIT_CPUS_HILE_LE); + + flags &= ~(OPAL_REINIT_CPUS_HILE_BE | OPAL_REINIT_CPUS_HILE_LE); + if (hile != current_hile_mode) { + if (hile) + req.set_bits |= hid0_hile; + else + req.clr_bits |= hid0_hile; + current_hile_mode = hile; + } + } + + /* If MMU mode change is supported */ + if (radix_supported && + (flags & (OPAL_REINIT_CPUS_MMU_HASH | + OPAL_REINIT_CPUS_MMU_RADIX))) { + bool radix = !!(flags & OPAL_REINIT_CPUS_MMU_RADIX); + + flags &= ~(OPAL_REINIT_CPUS_MMU_HASH | + OPAL_REINIT_CPUS_MMU_RADIX); + + if (proc_gen == proc_gen_p9 && radix != current_radix_mode) { + if (radix) + req.set_bits |= SPR_HID0_POWER9_RADIX; + else + req.clr_bits |= SPR_HID0_POWER9_RADIX; + + current_radix_mode = radix; + } + } + + /* Cleanup the TLB. We do that unconditionally, this works + * around issues where OSes fail to invalidate the PWC in Radix + * mode for example. This only works on P9 and later, but we + * also know we don't have a problem with Linux cleanups on + * P8 so this isn't a problem. If we wanted to cleanup the + * TLB on P8 as well, we'd have to use jobs to do it locally + * on each CPU. + */ + cleanup_global_tlb(); + + /* Apply HID bits changes if any */ + if (req.set_bits || req.clr_bits) + cpu_change_all_hid0(&req); + + if (flags & OPAL_REINIT_CPUS_TM_SUSPEND_DISABLED) { + flags &= ~OPAL_REINIT_CPUS_TM_SUSPEND_DISABLED; + + if (tm_suspend_enabled) + rc = OPAL_UNSUPPORTED; + else + rc = OPAL_SUCCESS; + } + + /* Handle P8 DD1 SLW reinit */ + if (flags != 0 && proc_gen == proc_gen_p8 && !hile_supported) + rc = slw_reinit(flags); + else if (flags != 0) + rc = OPAL_UNSUPPORTED; + + /* And undo the above */ + lock(&reinit_lock); + this_cpu()->state = cpu_state_os; + for (cpu = first_cpu(); cpu; cpu = next_cpu(cpu)) + cpu->in_reinit = false; + unlock(&reinit_lock); + + return rc; +} +opal_call(OPAL_REINIT_CPUS, opal_reinit_cpus, 1); + +#define NMMU_XLAT_CTL_PTCR 0xb +static int64_t nmmu_set_ptcr(uint64_t chip_id, struct dt_node *node, uint64_t ptcr) +{ + uint32_t nmmu_base_addr; + + nmmu_base_addr = dt_get_address(node, 0, NULL); + return xscom_write(chip_id, nmmu_base_addr + NMMU_XLAT_CTL_PTCR, ptcr); +} + +/* + * Setup the the Nest MMU PTCR register for all chips in the system or + * the specified chip id. + * + * The PTCR value may be overwritten so long as all users have been + * quiesced. If it is set to an invalid memory address the system will + * checkstop if anything attempts to use it. + * + * Returns OPAL_UNSUPPORTED if no nest mmu was found. 
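 *
 * Passing chip_id == -1ULL (as checked below) applies the PTCR to every
 * nest MMU in the system rather than to a single chip.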
+ */ +static int64_t opal_nmmu_set_ptcr(uint64_t chip_id, uint64_t ptcr) +{ + struct dt_node *node; + int64_t rc = OPAL_UNSUPPORTED; + + if (chip_id == -1ULL) + dt_for_each_compatible(dt_root, node, "ibm,power9-nest-mmu") { + chip_id = dt_get_chip_id(node); + if ((rc = nmmu_set_ptcr(chip_id, node, ptcr))) + return rc; + } + else + dt_for_each_compatible_on_chip(dt_root, node, "ibm,power9-nest-mmu", chip_id) + if ((rc = nmmu_set_ptcr(chip_id, node, ptcr))) + return rc; + + return rc; +} +opal_call(OPAL_NMMU_SET_PTCR, opal_nmmu_set_ptcr, 2); + +static void _exit_uv_mode(void *data __unused) +{ + prlog(PR_DEBUG, "Exit uv mode on cpu pir 0x%04x\n", this_cpu()->pir); + /* HW has smfctrl shared between threads but on Mambo it is per-thread */ + if (chip_quirk(QUIRK_MAMBO_CALLOUTS)) + exit_uv_mode(1); + else + exit_uv_mode(cpu_is_thread0(this_cpu())); +} + +void cpu_disable_pef(void) +{ + struct cpu_thread *cpu; + struct cpu_job **jobs; + + if (!(mfmsr() & MSR_S)) { + prlog(PR_DEBUG, "UV mode off on cpu pir 0x%04x\n", this_cpu()->pir); + return; + } + + jobs = zalloc(sizeof(struct cpu_job *) * (cpu_max_pir + 1)); + assert(jobs); + + /* Exit uv mode on all secondary threads before touching + * smfctrl on thread 0 */ + for_each_available_cpu(cpu) { + if (cpu == this_cpu()) + continue; + + if (!cpu_is_thread0(cpu)) + jobs[cpu->pir] = cpu_queue_job(cpu, "exit_uv_mode", + _exit_uv_mode, NULL); + } + + for_each_available_cpu(cpu) + if (jobs[cpu->pir]) { + cpu_wait_job(jobs[cpu->pir], true); + jobs[cpu->pir] = NULL; + } + + /* Exit uv mode and disable smfctrl on primary threads */ + for_each_available_cpu(cpu) { + if (cpu == this_cpu()) + continue; + + if (cpu_is_thread0(cpu)) + jobs[cpu->pir] = cpu_queue_job(cpu, "exit_uv_mode", + _exit_uv_mode, NULL); + } + + for_each_available_cpu(cpu) + if (jobs[cpu->pir]) + cpu_wait_job(jobs[cpu->pir], true); + + free(jobs); + + _exit_uv_mode(NULL); +} diff --git a/roms/skiboot/core/cpufeatures.c b/roms/skiboot/core/cpufeatures.c new file mode 100644 index 000000000..5620b741d --- /dev/null +++ b/roms/skiboot/core/cpufeatures.c @@ -0,0 +1,1043 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* + * This file deals with setup of /cpus/ibm,powerpc-cpu-features dt + * + * Copyright 2017-2019 IBM Corp. + */ + +#include +#include +#include +#include +#include + +#ifdef DEBUG +#define DBG(fmt, a...) prlog(PR_DEBUG, "CPUFT: " fmt, ##a) +#else +#define DBG(fmt, a...) 
+#endif + +/* Device-tree visible constants follow */ +#define ISA_V2_07B 2070 +#define ISA_V3_0B 3000 +#define ISA_V3_1 3100 + +#define USABLE_PR (1U << 0) +#define USABLE_OS (1U << 1) +#define USABLE_HV (1U << 2) + +#define HV_SUPPORT_HFSCR (1U << 0) +#define OS_SUPPORT_FSCR (1U << 0) + +/* Following are definitions for the match tables, not the DT binding itself */ +#define ISA_BASE 0 + +#define HV_NONE 0 +#define HV_CUSTOM 1 +#define HV_HFSCR 2 + +#define OS_NONE 0 +#define OS_CUSTOM 1 +#define OS_FSCR 2 + +/* CPU bitmasks for match table */ +#define CPU_P8_DD1 (1U << 0) +#define CPU_P8_DD2 (1U << 1) +#define CPU_P9_DD1 (1U << 2) +#define CPU_P9_DD2_0_1 (1U << 3) // 2.01 or 2.1 +#define CPU_P9P (1U << 4) +#define CPU_P9_DD2_2 (1U << 5) +#define CPU_P9_DD2_3 (1U << 6) +#define CPU_P10 (1U << 7) + +#define CPU_P9_DD2 (CPU_P9_DD2_0_1|CPU_P9_DD2_2|CPU_P9_DD2_3|CPU_P9P) + +#define CPU_P8 (CPU_P8_DD1|CPU_P8_DD2) +#define CPU_P9 (CPU_P9_DD1|CPU_P9_DD2|CPU_P9P) +#define CPU_ALL (CPU_P8|CPU_P9|CPU_P10) + +struct cpu_feature { + const char *name; + uint32_t cpus_supported; + uint32_t isa; + uint32_t usable_privilege; + uint32_t hv_support; + uint32_t os_support; + uint32_t hfscr_bit_nr; + uint32_t fscr_bit_nr; + uint32_t hwcap_bit_nr; + const char *dependencies_names; /* space-delimited names */ +}; + +/* + * The base (or NULL) cpu feature set is the CPU features available + * when no child nodes of the /cpus/ibm,powerpc-cpu-features node exist. The + * base feature set is POWER8 (ISAv2.07B), less features that are listed + * explicitly. + * + * XXX: currently, the feature dependencies are not necessarily captured + * exactly or completely. This is somewhat acceptable because all + * implementations must be aware of all these features. + */ +static const struct cpu_feature cpu_features_table[] = { + /* + * Big endian as in ISAv2.07B, MSR_LE=0 + */ + { "big-endian", + CPU_ALL, + ISA_BASE, USABLE_HV|USABLE_OS|USABLE_PR, + HV_CUSTOM, OS_CUSTOM, + -1, -1, -1, + NULL, }, + + /* + * Little endian as in ISAv2.07B, MSR_LE=1. + * + * When both big and little endian are defined, there is an LPCR ILE + * bit and implementation specific way to switch HILE mode, MSR_SLE, + * etc. + */ + { "little-endian", + CPU_ALL, + ISA_BASE, USABLE_HV|USABLE_OS|USABLE_PR, + HV_CUSTOM, OS_CUSTOM, + -1, -1, -1, + NULL, }, + + /* + * MSR_HV=1 mode as in ISAv2.07B (i.e., hypervisor privileged + * instructions and registers). + */ + { "hypervisor", + CPU_ALL, + ISA_BASE, USABLE_HV, + HV_CUSTOM, OS_NONE, + -1, -1, -1, + NULL, }, + + /* + * ISAv2.07B interrupt vectors, registers, and control registers + * (e.g., AIL, ILE, HV, etc LPCR bits). + * + * This does not necessarily specify all possible interrupt types. + * floating-point, for example requires some ways to handle floating + * point exceptions, but the low level details of interrupt handler + * is not a dependency there. There will always be *some* interrupt + * handler, (and some way to provide memory magagement, etc.). + */ + { "interrupt-facilities", + CPU_ALL, + ISA_BASE, USABLE_HV|USABLE_OS, + HV_CUSTOM, OS_CUSTOM, + -1, -1, -1, + NULL, }, + + { "smt", + CPU_ALL, + ISA_BASE, USABLE_HV|USABLE_OS|USABLE_PR, + HV_CUSTOM, OS_CUSTOM, + -1, -1, 14, + NULL, }, + + /* + * ISAv2.07B Program Priority Registers (PPR) + * PPR and associated control registers (e.g. RPR, PSPB), + * priority "or" instructions, etc. 
+ */ + { "program-priority-register", + CPU_ALL, + ISA_BASE, USABLE_HV|USABLE_OS|USABLE_PR, + HV_NONE, OS_NONE, + -1, -1, -1, + NULL, }, + + /* + * ISAv2.07B Book3S Chapter 5.7.9.1. Virtual Page Class Key Protecion + * AMR, IAMR, AMOR, UAMOR, etc registers and MMU key bits. + */ + { "virtual-page-class-key-protection", + CPU_ALL, + ISA_BASE, USABLE_HV|USABLE_OS|USABLE_PR, + HV_CUSTOM, OS_CUSTOM, + -1, -1, -1, + NULL, }, + + /* + * ISAv2.07B SAO storage control attribute + */ + { "strong-access-ordering", + CPU_ALL & ~CPU_P9_DD1, + ISA_BASE, USABLE_HV|USABLE_OS|USABLE_PR, + HV_CUSTOM, OS_CUSTOM, + -1, -1, -1, + NULL, }, + + /* + * ISAv2.07B no-execute storage control attribute + */ + { "no-execute", + CPU_ALL, + ISA_BASE, USABLE_HV|USABLE_OS, + HV_CUSTOM, OS_CUSTOM, + -1, -1, -1, + NULL, }, + + /* + * Cache inhibited attribute supported on large pages. + */ + { "cache-inhibited-large-page", + CPU_ALL, + ISA_BASE, USABLE_HV|USABLE_OS, + HV_CUSTOM, OS_CUSTOM, + -1, -1, -1, + NULL, }, + + /* + * ISAv2.07B Book3S Chapter 8. Debug Facilities + * CIEA, CIABR, DEAW, MEte, trace interrupt, etc. + * Except CFAR, branch tracing. + */ + { "debug-facilities", + CPU_ALL, + ISA_BASE, USABLE_HV|USABLE_OS, + HV_CUSTOM, OS_CUSTOM, + -1, -1, -1, + NULL, }, + + /* + * DAWR1, DAWRX1 etc. + */ + { "debug-facilities-v31", + CPU_P10, + ISA_V3_1, USABLE_HV|USABLE_OS, + HV_CUSTOM, OS_CUSTOM, + -1, -1, -1, + NULL, }, + + /* + * ISAv2.07B CFAR + */ + { "come-from-address-register", + CPU_ALL, + ISA_BASE, USABLE_HV|USABLE_OS, + HV_CUSTOM, OS_CUSTOM, + -1, -1, -1, + "debug-facilities", }, + + /* + * ISAv2.07B Branch tracing (optional in ISA) + */ + { "branch-tracing", + CPU_ALL, + ISA_BASE, USABLE_HV|USABLE_OS, + HV_CUSTOM, OS_CUSTOM, + -1, -1, -1, + "debug-facilities", }, + + /* + * ISAv2.07B Floating-point Facility + */ + { "floating-point", + CPU_ALL, + ISA_BASE, USABLE_HV|USABLE_OS|USABLE_PR, + HV_CUSTOM, OS_CUSTOM, + PPC_BITLSHIFT(63), -1, 27, + NULL, }, + + /* + * ISAv2.07B Vector Facility (VMX) + */ + { "vector", + CPU_ALL, + ISA_BASE, USABLE_HV|USABLE_OS|USABLE_PR, + HV_CUSTOM, OS_CUSTOM, + PPC_BITLSHIFT(62), -1, 28, + "floating-point", }, + + /* + * ISAv2.07B Vector-scalar Facility (VSX) + */ + { "vector-scalar", + CPU_ALL, + ISA_BASE, USABLE_HV|USABLE_OS|USABLE_PR, + HV_CUSTOM, OS_CUSTOM, + -1, -1, 7, + "vector", }, + + { "vector-crypto", + CPU_ALL, + ISA_BASE, USABLE_HV|USABLE_OS|USABLE_PR, + HV_NONE, OS_NONE, + -1, -1, 57, + "vector", }, + + /* + * ISAv2.07B Quadword Load and Store instructions + * including lqarx/stdqcx. instructions. 
+ */ + { "quadword-load-store", + CPU_ALL, + ISA_BASE, USABLE_HV|USABLE_OS|USABLE_PR, + HV_NONE, OS_NONE, + -1, -1, -1, + NULL, }, + + /* + * ISAv2.07B Binary Coded Decimal (BCD) + * BCD fixed point instructions + */ + { "decimal-integer", + CPU_ALL, + ISA_BASE, USABLE_HV|USABLE_OS|USABLE_PR, + HV_NONE, OS_NONE, + -1, -1, -1, + NULL, }, + + /* + * ISAv2.07B Decimal floating-point Facility (DFP) + */ + { "decimal-floating-point", + CPU_ALL, + ISA_BASE, USABLE_HV|USABLE_OS|USABLE_PR, + HV_NONE, OS_NONE, + -1, -1, 10, + "floating-point", }, + + /* + * ISAv2.07B + * DSCR, default data prefetch LPCR, etc + */ + { "data-stream-control-register", + CPU_ALL, + ISA_BASE, USABLE_HV|USABLE_OS|USABLE_PR, + HV_CUSTOM, OS_CUSTOM, + PPC_BITLSHIFT(61), PPC_BITLSHIFT(61), 61, + NULL, }, + + /* + * ISAv2.07B Branch History Rolling Buffer (BHRB) + */ + { "branch-history-rolling-buffer", + CPU_ALL, + ISA_BASE, USABLE_HV|USABLE_OS|USABLE_PR, + HV_CUSTOM, OS_CUSTOM, + PPC_BITLSHIFT(59), -1, -1, + NULL, }, + + /* + * ISAv2.07B Transactional Memory Facility (TM or HTM) + */ + { "transactional-memory", + CPU_P8, /* P9 support is not enabled yet */ + ISA_BASE, USABLE_HV|USABLE_OS|USABLE_PR, + HV_CUSTOM, OS_CUSTOM, + PPC_BITLSHIFT(58), -1, 62, + NULL, }, + + /* + * ISAv3.0B TM additions + * TEXASR bit 17, self-induced vs external footprint overflow + */ + { "transactional-memory-v3", + 0, + ISA_V3_0B, USABLE_HV|USABLE_OS|USABLE_PR, + HV_NONE, OS_NONE, + -1, -1, -1, + "transactional-memory", }, + + /* + * ISAv2.07B Event-Based Branch Facility (EBB) + */ + { "event-based-branch", + CPU_ALL, + ISA_BASE, USABLE_HV|USABLE_OS|USABLE_PR, + HV_CUSTOM, OS_CUSTOM, + PPC_BITLSHIFT(56), PPC_BITLSHIFT(56), 60, + NULL, }, + + /* + * ISAv2.07B Target Address Register (TAR) + */ + { "target-address-register", + CPU_ALL, + ISA_BASE, USABLE_HV|USABLE_OS|USABLE_PR, + HV_CUSTOM, OS_CUSTOM, + PPC_BITLSHIFT(55), PPC_BITLSHIFT(55), 58, + NULL, }, + + /* + * ISAv2.07B Control Register (CTRL) + */ + { "control-register", + CPU_ALL, + ISA_BASE, USABLE_HV|USABLE_OS, + HV_CUSTOM, OS_CUSTOM, + -1, -1, -1, + NULL, }, + + /* + * ISAv2.07B Book3S Chapter 11. Processor Control. + * msgsnd, msgsndp, doorbell, etc. + * + * ISAv3.0B is not compatible (different addressing, HFSCR required + * for msgsndp). + */ + { "processor-control-facility", + CPU_P8_DD2, /* P8 DD1 has no dbell */ + ISA_BASE, USABLE_HV|USABLE_OS, + HV_CUSTOM, OS_CUSTOM, + -1, -1, -1, + NULL, }, + + /* + * ISAv2.07B PURR, SPURR registers + */ + { "processor-utilization-of-resources-register", + CPU_ALL, + ISA_BASE, USABLE_HV|USABLE_OS, + HV_CUSTOM, OS_CUSTOM, + -1, -1, -1, + NULL, }, + + /* + * POWER8 initiate coprocessor store word indexed (icswx) instruction + */ + { "coprocessor-icswx", + CPU_P8, + ISA_BASE, USABLE_HV|USABLE_OS, + HV_CUSTOM, OS_CUSTOM, + -1, -1, -1, + NULL, }, + + /* + * ISAv2.07B hash based MMU and all instructions, registers, + * data structures, exceptions, etc. + */ + { "mmu-hash", + CPU_P8, + ISA_BASE, USABLE_HV|USABLE_OS, + HV_CUSTOM, OS_CUSTOM, + -1, -1, -1, + NULL, }, + + /* + * POWER8 MCE / machine check exception. + */ + { "machine-check-power8", + CPU_P8, + ISA_BASE, USABLE_HV|USABLE_OS, + HV_CUSTOM, OS_CUSTOM, + -1, -1, -1, + NULL, }, + + /* + * POWER8 PMU / performance monitor unit. + */ + { "performance-monitor-power8", + CPU_P8, + ISA_BASE, USABLE_HV|USABLE_OS, + HV_CUSTOM, OS_CUSTOM, + -1, -1, -1, + NULL, }, + + /* + * ISAv2.07B alignment interrupts set DSISR register + * + * POWER CPUs do not used this, and it's removed from ISAv3.0B. 
+ */ + { "alignment-interrupt-dsisr", + 0, + ISA_BASE, USABLE_HV|USABLE_OS, + HV_NONE, OS_NONE, + -1, -1, -1, + NULL, }, + + /* + * ISAv2.07B / POWER8 doze, nap, sleep, winkle instructions + * XXX: is Linux we using some BookIV specific implementation details + * in nap handling? We have no POWER8 specific key here. + */ + { "idle-nap", + CPU_P8, + ISA_BASE, USABLE_HV, + HV_CUSTOM, OS_NONE, + -1, -1, -1, + NULL, }, + + /* + * ISAv2.07B wait instruction + */ + { "wait", + CPU_P8, + ISA_BASE, USABLE_HV|USABLE_OS|USABLE_PR, + HV_NONE, OS_NONE, + -1, -1, -1, + NULL, }, + + { "subcore", + CPU_P8, + ISA_BASE, USABLE_HV|USABLE_OS, + HV_CUSTOM, OS_CUSTOM, + -1, -1, -1, + "smt", }, + + /* + * ISAv3.0B radix based MMU + */ + { "mmu-radix", + CPU_P9|CPU_P10, + ISA_V3_0B, USABLE_HV|USABLE_OS, + HV_CUSTOM, OS_CUSTOM, + -1, -1, -1, + NULL, }, + + /* + * ISAv3.0B hash based MMU, new hash pte format, PCTR, etc + */ + { "mmu-hash-v3", + CPU_P9|CPU_P10, + ISA_V3_0B, USABLE_HV|USABLE_OS, + HV_CUSTOM, OS_CUSTOM, + -1, -1, -1, + NULL, }, + + /* + * ISAv3.0B wait instruction + */ + { "wait-v3", + CPU_P9|CPU_P10, + ISA_V3_0B, USABLE_HV|USABLE_OS|USABLE_PR, + HV_NONE, OS_NONE, + -1, -1, -1, + NULL, }, + + /* + * ISAv3.0B stop idle instructions and registers + * XXX: Same question as for idle-nap + */ + { "idle-stop", + CPU_P9|CPU_P10, + ISA_V3_0B, USABLE_HV|USABLE_OS, + HV_CUSTOM, OS_CUSTOM, + -1, -1, -1, + NULL, }, + + /* + * ISAv3.0B Hypervisor Virtualization Interrupt + * Also associated system registers, LPCR EE, HEIC, HVICE, + * system reset SRR1 reason, etc. + */ + { "hypervisor-virtualization-interrupt", + CPU_P9|CPU_P10, + ISA_V3_0B, USABLE_HV, + HV_CUSTOM, OS_NONE, + -1, -1, -1, + NULL, }, + + /* + * POWER9 MCE / machine check exception. + */ + { "machine-check-power9", + CPU_P9, + ISA_V3_0B, USABLE_HV|USABLE_OS, + HV_CUSTOM, OS_CUSTOM, + -1, -1, -1, + NULL, }, + + /* + * POWER10 MCE / machine check exception. + */ + { "machine-check-power10", + CPU_P10, + ISA_V3_0B, USABLE_HV|USABLE_OS, + HV_CUSTOM, OS_CUSTOM, + -1, -1, -1, + NULL, }, + + /* + * POWER9 PMU / performance monitor unit. + */ + { "performance-monitor-power9", + CPU_P9, + ISA_V3_0B, USABLE_HV|USABLE_OS, + HV_CUSTOM, OS_CUSTOM, + -1, -1, -1, + NULL, }, + + /* + * POWER10 PMU / performance monitor unit. + */ + { "performance-monitor-power10", + CPU_P10, + ISA_V3_1, USABLE_HV|USABLE_OS, + HV_CUSTOM, OS_CUSTOM, + -1, -1, -1, + NULL, }, + + /* + * ISAv3.0B scv/rfscv system call instructions and exceptions, fscr bit + * etc. + */ + { "system-call-vectored", + CPU_P9|CPU_P10, + ISA_V3_0B, USABLE_OS|USABLE_PR, + HV_NONE, OS_CUSTOM, + -1, PPC_BITLSHIFT(51), 52, + NULL, }, + + /* + * ISAv3.0B Book3S Chapter 10. Processor Control. + * global msgsnd, msgsndp, msgsync, doorbell, etc. + */ + { "processor-control-facility-v3", + CPU_P9|CPU_P10, + ISA_V3_0B, USABLE_HV|USABLE_OS, + HV_CUSTOM, OS_NONE, + PPC_BITLSHIFT(53), -1, -1, + NULL, }, + + /* + * ISAv3.0B addpcis instruction + */ + { "pc-relative-addressing", + CPU_P9|CPU_P10, + ISA_V3_0B, USABLE_HV|USABLE_OS|USABLE_PR, + HV_NONE, OS_NONE, + -1, -1, -1, + NULL, }, + + /* + * ISAv2.07B Book3S Chapter 7. Timer Facilities + * TB, VTB, DEC, HDEC, IC, etc registers and exceptions. + * Not including PURR or SPURR registers. + */ + { "timer-facilities", + CPU_ALL, + ISA_BASE, USABLE_HV|USABLE_OS, + HV_NONE, OS_NONE, + -1, -1, -1, + NULL, }, + + /* + * ISAv3.0B Book3S Chapter 7. 
Timer Facilities + * Large decrementer and hypervisor decrementer + */ + { "timer-facilities-v3", + CPU_P9|CPU_P10, + ISA_V3_0B, USABLE_HV|USABLE_OS, + HV_NONE, OS_NONE, + -1, -1, -1, + "timer-facilities", }, + + /* + * ISAv3.0B deliver a random number instruction (darn) + */ + { "random-number-generator", + CPU_P9|CPU_P10, + ISA_V3_0B, USABLE_HV|USABLE_OS|USABLE_PR, + HV_NONE, OS_NONE, + -1, -1, 53, + NULL, }, + + /* + * ISAv3.0B fixed point instructions and registers + * multiply-add, modulo, count trailing zeroes, cmprb, cmpeqb, + * extswsli, mfvsrld, mtvsrdd, mtvsrws, addex, CA32, OV32, + * mcrxrx, setb + */ + { "fixed-point-v3", + CPU_P9|CPU_P10, + ISA_V3_0B, USABLE_HV|USABLE_OS|USABLE_PR, + HV_NONE, OS_NONE, + -1, -1, -1, + NULL, }, + + { "decimal-integer-v3", + CPU_P9|CPU_P10, + ISA_V3_0B, USABLE_HV|USABLE_OS|USABLE_PR, + HV_NONE, OS_NONE, + -1, -1, -1, + "fixed-point-v3 decimal-integer", }, + + /* + * ISAv3.0B lightweight mffs + */ + { "floating-point-v3", + CPU_P9|CPU_P10, + ISA_V3_0B, USABLE_HV|USABLE_OS|USABLE_PR, + HV_NONE, OS_NONE, + -1, -1, -1, + "floating-point", }, + + { "decimal-floating-point-v3", + CPU_P9|CPU_P10, + ISA_V3_0B, USABLE_HV|USABLE_OS|USABLE_PR, + HV_NONE, OS_NONE, + -1, -1, -1, + "floating-point-v3 decimal-floating-point", }, + + { "vector-v3", + CPU_P9|CPU_P10, + ISA_V3_0B, USABLE_HV|USABLE_OS|USABLE_PR, + HV_NONE, OS_NONE, + -1, -1, -1, + "vector", }, + + { "vector-scalar-v3", + CPU_P9|CPU_P10, + ISA_V3_0B, USABLE_HV|USABLE_OS|USABLE_PR, + HV_NONE, OS_NONE, + -1, -1, -1, + "vector-v3 vector-scalar" }, + + { "vector-binary128", + CPU_P9|CPU_P10, + ISA_V3_0B, USABLE_HV|USABLE_OS|USABLE_PR, + HV_NONE, OS_NONE, + -1, -1, 54, + "vector-scalar-v3", }, + + { "vector-binary16", + CPU_P9|CPU_P10, + ISA_V3_0B, USABLE_HV|USABLE_OS|USABLE_PR, + HV_NONE, OS_NONE, + -1, -1, -1, + "vector-v3", }, + + /* + * ISAv3.0B external exception for EBB + */ + { "event-based-branch-v3", + CPU_P9|CPU_P10, + ISA_V3_0B, USABLE_HV|USABLE_OS|USABLE_PR, + HV_NONE, OS_NONE, + -1, -1, -1, + "event-based-branch", }, + + /* + * ISAv3.0B Atomic Memory Operations (AMO) + */ + { "atomic-memory-operations", + CPU_P9|CPU_P10, + ISA_V3_0B, USABLE_HV|USABLE_OS|USABLE_PR, + HV_NONE, OS_NONE, + -1, -1, -1, + NULL, }, + + /* + * ISAv3.0B Copy-Paste Facility + */ + { "copy-paste", + CPU_P9|CPU_P10, + ISA_V3_0B, USABLE_HV|USABLE_OS|USABLE_PR, + HV_NONE, OS_NONE, + -1, -1, -1, + NULL, }, + + /* + * ISAv3.0B GSR SPR register + * POWER9 does not implement it + */ + { "group-start-register", + 0, + ISA_V3_0B, USABLE_HV|USABLE_OS, + HV_NONE, OS_NONE, + -1, -1, -1, + NULL, }, + + /* + * Enable matrix multiply accumulate. + */ + { "matrix-multiply-accumulate", + CPU_P10, + ISA_V3_1, USABLE_PR, + HV_CUSTOM, OS_CUSTOM, + -1, -1, 49, + NULL, }, + + /* + * Enable prefix instructions. Toolchains assume this is + * enabled for when compiling for ISA 3.1. + */ + { "prefix-instructions", + CPU_P10, + ISA_V3_1, USABLE_HV|USABLE_OS|USABLE_PR, + HV_HFSCR, OS_FSCR, + 13, 13, -1, + NULL, }, + + /* + * Due to hardware bugs in POWER9, the hypervisor needs to assist + * guests. + * + * Presence of this feature indicates presence of the bug. 
+ * + * See linux kernel commit 4bb3c7a0208f + * and linux Documentation/powerpc/transactional_memory.txt + */ + { "tm-suspend-hypervisor-assist", + CPU_P9_DD2_2|CPU_P9_DD2_3|CPU_P9P, + ISA_V3_0B, USABLE_HV, + HV_CUSTOM, OS_NONE, + -1, -1, -1, + NULL, }, + + /* + * Due to hardware bugs in POWER9, the hypervisor can hit + * CPU bugs in the operations it needs to do for + * tm-suspend-hypervisor-assist. + * + * Presence of this "feature" means processor is affected by the bug. + * + * See linux kernel commit 4bb3c7a0208f + * and linux Documentation/powerpc/transactional_memory.txt + */ + { "tm-suspend-xer-so-bug", + CPU_P9_DD2_2, + ISA_V3_0B, USABLE_HV, + HV_CUSTOM, OS_NONE, + -1, -1, -1, + NULL, }, +}; + +static void add_cpu_feature_nodeps(struct dt_node *features, + const struct cpu_feature *f) +{ + struct dt_node *feature; + + feature = dt_new(features, f->name); + assert(feature); + + dt_add_property_cells(feature, "isa", f->isa); + dt_add_property_cells(feature, "usable-privilege", f->usable_privilege); + + if (f->usable_privilege & USABLE_HV) { + if (f->hv_support != HV_NONE) { + uint32_t s = 0; + if (f->hv_support == HV_HFSCR) + s |= HV_SUPPORT_HFSCR; + + dt_add_property_cells(feature, "hv-support", s); + if (f->hfscr_bit_nr != -1) + dt_add_property_cells(feature, "hfscr-bit-nr", f->hfscr_bit_nr); + } else { + assert(f->hfscr_bit_nr == -1); + } + } + + if (f->usable_privilege & USABLE_OS) { + if (f->os_support != OS_NONE) { + uint32_t s = 0; + if (f->os_support == OS_FSCR) + s |= OS_SUPPORT_FSCR; + dt_add_property_cells(feature, "os-support", s); + if (f->fscr_bit_nr != -1) + dt_add_property_cells(feature, "fscr-bit-nr", f->fscr_bit_nr); + } else { + assert(f->fscr_bit_nr == -1); + } + } + + if (f->usable_privilege & USABLE_PR) { + if (f->hwcap_bit_nr != -1) + dt_add_property_cells(feature, "hwcap-bit-nr", f->hwcap_bit_nr); + } + + if (f->dependencies_names) + dt_add_property(feature, "dependencies", NULL, 0); +} + +static void add_cpufeatures_dependencies(struct dt_node *features) +{ + struct dt_node *feature; + + dt_for_each_node(features, feature) { + const struct cpu_feature *f = NULL; + const char *deps_names; + struct dt_property *deps; + int nr_deps; + int i; + + /* Find features with dependencies */ + + deps = __dt_find_property(feature, "dependencies"); + if (!deps) + continue; + + /* Find the matching cpu table */ + for (i = 0; i < ARRAY_SIZE(cpu_features_table); i++) { + f = &cpu_features_table[i]; + if (!strcmp(f->name, feature->name)) + break; + } + assert(f); + assert(f->dependencies_names); + + /* + * Count number of depended features and allocate space + * for phandles in the property. + */ + deps_names = f->dependencies_names; + nr_deps = strcount(deps_names, " ") + 1; + dt_resize_property(&deps, nr_deps * sizeof(u32)); + + DBG("feature %s has %d dependencies (%s)\n", f->name, nr_deps, deps_names); + /* + * For each one, find the depended feature then advance to + * next name. 
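 * For instance, the "vector-scalar-v3" entry above lists
 * "vector-v3 vector-scalar", giving nr_deps == 2 and two phandle cells
 * in its "dependencies" property.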
+ */ + for (i = 0; i < nr_deps; i++) { + struct dt_node *dep; + int len; + + if (nr_deps - i == 1) + len = strlen(deps_names); + else + len = strchr(deps_names, ' ') - deps_names; + + dt_for_each_node(features, dep) { + if (!strncmp(deps_names, dep->name, len)) + goto found_dep; + } + + prlog(PR_ERR, "CPUFT: feature %s dependencies not found\n", f->name); + break; +found_dep: + DBG(" %s found dep (%s)\n", f->name, dep->name); + dt_property_set_cell(deps, i, dep->phandle); + + /* Advance over the name + delimiter */ + deps_names += len + 1; + } + } +} + +static void add_cpufeatures(struct dt_node *cpus, + uint32_t cpu_feature_isa, uint32_t cpu_feature_cpu, + const char *cpu_name) +{ + struct dt_node *features; + int i; + + DBG("creating cpufeatures for cpu:%d isa:%d\n", cpu_feature_cpu, cpu_feature_isa); + + features = dt_new(cpus, "ibm,powerpc-cpu-features"); + assert(features); + + dt_add_property_cells(features, "isa", cpu_feature_isa); + + dt_add_property_string(features, "device_type", "cpu-features"); + dt_add_property_string(features, "compatible", "ibm,powerpc-cpu-features"); + dt_add_property_string(features, "display-name", cpu_name); + + /* add without dependencies */ + for (i = 0; i < ARRAY_SIZE(cpu_features_table); i++) { + const struct cpu_feature *f = &cpu_features_table[i]; + + if (f->cpus_supported & cpu_feature_cpu) { + DBG(" '%s'\n", f->name); + add_cpu_feature_nodeps(features, f); + } + } + + /* dependency construction pass */ + add_cpufeatures_dependencies(features); +} + +void dt_add_cpufeatures(struct dt_node *root) +{ + int version; + uint32_t cpu_feature_isa = 0; + uint32_t cpu_feature_cpu = 0; + struct dt_node *cpus; + const char *cpu_name = NULL; + + version = mfspr(SPR_PVR); + switch(PVR_TYPE(version)) { + case PVR_TYPE_P8: + if (!cpu_name) + cpu_name = "POWER8"; + /* fallthrough */ + case PVR_TYPE_P8E: + if (!cpu_name) + cpu_name = "POWER8E"; + /* fallthrough */ + cpu_feature_isa = ISA_V2_07B; + if (PVR_VERS_MAJ(version) == 1) + cpu_feature_cpu = CPU_P8_DD1; + else + cpu_feature_cpu = CPU_P8_DD2; + break; + case PVR_TYPE_P8NVL: + cpu_name = "POWER8NVL"; + cpu_feature_isa = ISA_V2_07B; + cpu_feature_cpu = CPU_P8_DD2; + break; + case PVR_TYPE_P9: + if (!cpu_name) + cpu_name = "POWER9"; + + cpu_feature_isa = ISA_V3_0B; + if (is_power9n(version) && + (PVR_VERS_MAJ(version) == 2)) { + /* P9N DD2.x */ + switch (PVR_VERS_MIN(version)) { + case 0: + case 1: + cpu_feature_cpu = CPU_P9_DD2_0_1; + break; + case 2: + cpu_feature_cpu = CPU_P9_DD2_2; + break; + case 3: + cpu_feature_cpu = CPU_P9_DD2_3; + break; + default: + assert(0); + } + } else if (is_power9c(version) && + (PVR_VERS_MAJ(version) == 1)) { + /* P9C DD1.x */ + switch (PVR_VERS_MIN(version)) { + case 1: + /* Cumulus DD1.1 => Nimbus DD2.1 */ + cpu_feature_cpu = CPU_P9_DD2_0_1; + break; + case 2: + /* Cumulus DD1.2 */ + cpu_feature_cpu = CPU_P9_DD2_2; + break; + case 3: + /* Cumulus DD1.3 */ + cpu_feature_cpu = CPU_P9_DD2_3; + break; + default: + assert(0); + } + } else { + assert(0); + } + + break; + case PVR_TYPE_P9P: + if (!cpu_name) + cpu_name = "POWER9P"; + + cpu_feature_isa = ISA_V3_0B; + cpu_feature_cpu = CPU_P9P; + break; + case PVR_TYPE_P10: + if (!cpu_name) + cpu_name = "POWER10"; + + cpu_feature_isa = ISA_V3_1; + cpu_feature_cpu = CPU_P10; + break; + default: + return; + } + + cpus = dt_new_check(root, "cpus"); + + add_cpufeatures(cpus, cpu_feature_isa, cpu_feature_cpu, cpu_name); +} diff --git a/roms/skiboot/core/device.c b/roms/skiboot/core/device.c new file mode 100644 index 000000000..b102dd973 
--- /dev/null +++ b/roms/skiboot/core/device.c @@ -0,0 +1,1128 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* + * Manipulate the device tree + * + * Copyright 2013-2019 IBM Corp. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Used to give unique handles. */ +u32 last_phandle = 0; + +struct dt_node *dt_root; +struct dt_node *dt_chosen; + +static const char *take_name(const char *name) +{ + if (!is_rodata(name) && !(name = strdup(name))) { + prerror("Failed to allocate copy of name"); + abort(); + } + return name; +} + +static void free_name(const char *name) +{ + if (!is_rodata(name)) + free((char *)name); +} + +static struct dt_node *new_node(const char *name) +{ + struct dt_node *node = malloc(sizeof *node); + if (!node) { + prerror("Failed to allocate node\n"); + abort(); + } + + node->name = take_name(name); + node->parent = NULL; + list_head_init(&node->properties); + list_head_init(&node->children); + /* FIXME: locking? */ + node->phandle = new_phandle(); + return node; +} + +struct dt_node *dt_new_root(const char *name) +{ + return new_node(name); +} + +static const char *get_unitname(const struct dt_node *node) +{ + const char *c = strchr(node->name, '@'); + + if (!c) + return NULL; + + return c + 1; +} + +int dt_cmp_subnodes(const struct dt_node *a, const struct dt_node *b) +{ + const char *a_unit = get_unitname(a); + const char *b_unit = get_unitname(b); + + ptrdiff_t basenamelen = a_unit - a->name; + + /* sort hex unit addresses by number */ + if (a_unit && b_unit && !strncmp(a->name, b->name, basenamelen)) { + unsigned long long a_num, b_num; + char *a_end, *b_end; + + a_num = strtoul(a_unit, &a_end, 16); + b_num = strtoul(b_unit, &b_end, 16); + + /* only compare if the unit addr parsed correctly */ + if (*a_end == 0 && *b_end == 0) + return (a_num > b_num) - (a_num < b_num); + } + + return strcmp(a->name, b->name); +} + +bool dt_attach_root(struct dt_node *parent, struct dt_node *root) +{ + struct dt_node *node; + + assert(!root->parent); + + if (list_empty(&parent->children)) { + list_add(&parent->children, &root->list); + root->parent = parent; + + return true; + } + + dt_for_each_child(parent, node) { + int cmp = dt_cmp_subnodes(node, root); + + /* Look for duplicates */ + if (cmp == 0) { + prerror("DT: %s failed, duplicate %s\n", + __func__, root->name); + return false; + } + + /* insert before the first node that's larger + * the the node we're inserting */ + if (cmp > 0) + break; + } + + list_add_before(&parent->children, &root->list, &node->list); + root->parent = parent; + + return true; +} + +static inline void dt_destroy(struct dt_node *dn) +{ + if (!dn) + return; + + free_name(dn->name); + free(dn); +} + +struct dt_node *dt_new(struct dt_node *parent, const char *name) +{ + struct dt_node *new; + assert(parent); + + new = new_node(name); + if (!dt_attach_root(parent, new)) { + dt_destroy(new); + return NULL; + } + return new; +} + +/* + * low level variant, we export this because there are "weird" address + * formats, such as LPC/ISA bus addresses which have a letter to identify + * which bus space the address is inside of. 
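 * For example, an LPC UART node might be named something like "serial@i3f8",
 * with the leading "i" marking the ISA I/O space; such a node would be
 * matched with __dt_find_by_name_addr(parent, "serial", "i3f8").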
+ */ +struct dt_node *__dt_find_by_name_addr(struct dt_node *parent, const char *name, + const char *addr) +{ + struct dt_node *node; + + if (list_empty(&parent->children)) + return NULL; + + dt_for_each_child(parent, node) { + const char *unit = get_unitname(node); + int len; + + if (!unit) + continue; + + /* match the name */ + len = (int) (unit - node->name) - 1; + if (strncmp(node->name, name, len)) + continue; + + /* match the unit */ + if (strcmp(unit, addr) == 0) + return node; + } + + dt_for_each_child(parent, node) { + struct dt_node *ret = __dt_find_by_name_addr(node, name, addr); + + if (ret) + return ret; + } + + return NULL; +} + +struct dt_node *dt_find_by_name_addr(struct dt_node *parent, const char *name, + uint64_t addr) +{ + char addr_str[16 + 1]; /* max size of a 64bit int */ + snprintf(addr_str, sizeof(addr_str), "%" PRIx64, addr); + + return __dt_find_by_name_addr(parent, name, addr_str); +} + +struct dt_node *dt_new_addr(struct dt_node *parent, const char *name, + uint64_t addr) +{ + char *lname; + struct dt_node *new; + size_t len; + + assert(parent); + len = strlen(name) + STR_MAX_CHARS(addr) + 2; + lname = malloc(len); + if (!lname) + return NULL; + snprintf(lname, len, "%s@%llx", name, (long long)addr); + new = new_node(lname); + free(lname); + if (!dt_attach_root(parent, new)) { + dt_destroy(new); + return NULL; + } + return new; +} + +struct dt_node *dt_new_2addr(struct dt_node *parent, const char *name, + uint64_t addr0, uint64_t addr1) +{ + char *lname; + struct dt_node *new; + size_t len; + assert(parent); + + len = strlen(name) + 2*STR_MAX_CHARS(addr0) + 3; + lname = malloc(len); + if (!lname) + return NULL; + snprintf(lname, len, "%s@%llx,%llx", + name, (long long)addr0, (long long)addr1); + new = new_node(lname); + free(lname); + if (!dt_attach_root(parent, new)) { + dt_destroy(new); + return NULL; + } + return new; +} + +static struct dt_node *__dt_copy(struct dt_node *node, struct dt_node *parent, + bool root) +{ + struct dt_property *prop, *new_prop; + struct dt_node *new_node, *child; + + new_node = dt_new(parent, node->name); + if (!new_node) + return NULL; + + list_for_each(&node->properties, prop, list) { + new_prop = dt_add_property(new_node, prop->name, prop->prop, + prop->len); + if (!new_prop) + goto fail; + } + + list_for_each(&node->children, child, list) { + child = __dt_copy(child, new_node, false); + if (!child) + goto fail; + } + + return new_node; + +fail: + /* dt_free will recurse for us, so only free when we unwind to the + * top-level failure */ + if (root) + dt_free(new_node); + return NULL; +} + +struct dt_node *dt_copy(struct dt_node *node, struct dt_node *parent) +{ + return __dt_copy(node, parent, true); +} + +char *dt_get_path(const struct dt_node *node) +{ + unsigned int len = 0; + const struct dt_node *n; + char *path, *p; + + /* Dealing with NULL is for test/debug purposes */ + if (!node) + return strdup(""); + + for (n = node; n; n = n->parent) { + len += strlen(n->name); + if (n->parent || n == node) + len++; + } + path = zalloc(len + 1); + assert(path); + p = path + len; + for (n = node; n; n = n->parent) { + len = strlen(n->name); + p -= len; + memcpy(p, n->name, len); + if (n->parent || n == node) + *(--p) = '/'; + } + assert(p == path); + + return p; +} + +static const char *__dt_path_split(const char *p, + const char **namep, unsigned int *namel, + const char **addrp, unsigned int *addrl) +{ + const char *at, *sl; + + *namel = *addrl = 0; + + /* Skip initial '/' */ + while (*p == '/') + p++; + + /* Check empty path */ + 
if (*p == 0) + return p; + + at = strchr(p, '@'); + sl = strchr(p, '/'); + if (sl == NULL) + sl = p + strlen(p); + if (sl < at) + at = NULL; + if (at) { + *addrp = at + 1; + *addrl = sl - at - 1; + } + *namep = p; + *namel = at ? (at - p) : (sl - p); + + return sl; +} + +struct dt_node *dt_find_by_path(struct dt_node *root, const char *path) +{ + struct dt_node *n; + const char *pn, *pa, *p = path, *nn, *na; + unsigned int pnl, pal, nnl, nal; + bool match; + + /* Walk path components */ + while (*p) { + /* Extract next path component */ + p = __dt_path_split(p, &pn, &pnl, &pa, &pal); + if (pnl == 0 && pal == 0) + break; + + /* Compare with each child node */ + match = false; + list_for_each(&root->children, n, list) { + match = true; + __dt_path_split(n->name, &nn, &nnl, &na, &nal); + if (pnl && (pnl != nnl || strncmp(pn, nn, pnl))) + match = false; + if (pal && (pal != nal || strncmp(pa, na, pal))) + match = false; + if (match) { + root = n; + break; + } + } + + /* No child match */ + if (!match) + return NULL; + } + return root; +} + +struct dt_node *dt_find_by_name(struct dt_node *root, const char *name) +{ + struct dt_node *child, *match; + + list_for_each(&root->children, child, list) { + if (!strcmp(child->name, name)) + return child; + + match = dt_find_by_name(child, name); + if (match) + return match; + } + + return NULL; +} + + +struct dt_node *dt_new_check(struct dt_node *parent, const char *name) +{ + struct dt_node *node = dt_find_by_name(parent, name); + + if (!node) { + node = dt_new(parent, name); + assert(node); + } + + return node; +} + + +struct dt_node *dt_find_by_phandle(struct dt_node *root, u32 phandle) +{ + struct dt_node *node; + + dt_for_each_node(root, node) + if (node->phandle == phandle) + return node; + return NULL; +} + +static struct dt_property *new_property(struct dt_node *node, + const char *name, size_t size) +{ + struct dt_property *p = malloc(sizeof(*p) + size); + char *path; + + if (!p) { + path = dt_get_path(node); + prerror("Failed to allocate property \"%s\" for %s of %zu bytes\n", + name, path, size); + free(path); + abort(); + } + if (dt_find_property(node, name)) { + path = dt_get_path(node); + prerror("Duplicate property \"%s\" in node %s\n", + name, path); + free(path); + abort(); + + } + + p->name = take_name(name); + p->len = size; + list_add_tail(&node->properties, &p->list); + return p; +} + +struct dt_property *dt_add_property(struct dt_node *node, + const char *name, + const void *val, size_t size) +{ + struct dt_property *p; + + /* + * Filter out phandle properties, we re-generate them + * when flattening + */ + if (strcmp(name, "linux,phandle") == 0 || + strcmp(name, "phandle") == 0) { + assert(size == 4); + node->phandle = *(const u32 *)val; + if (node->phandle >= last_phandle) + set_last_phandle(node->phandle); + return NULL; + } + + p = new_property(node, name, size); + if (size) + memcpy(p->prop, val, size); + return p; +} + +void dt_resize_property(struct dt_property **prop, size_t len) +{ + size_t new_len = sizeof(**prop) + len; + + *prop = realloc(*prop, new_len); + (*prop)->len = len; + + /* Fix up linked lists in case we moved. (note: not an empty list). 
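 * realloc() may have moved the allocation, but the neighbouring nodes still
 * point at the old embedded list_node, so re-aim their next/prev pointers
 * at its new address.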
*/ + (*prop)->list.next->prev = &(*prop)->list; + (*prop)->list.prev->next = &(*prop)->list; +} + +struct dt_property *dt_add_property_string(struct dt_node *node, + const char *name, + const char *value) +{ + size_t len = 0; + if (value) + len = strlen(value) + 1; + return dt_add_property(node, name, value, len); +} + +struct dt_property *dt_add_property_nstr(struct dt_node *node, + const char *name, + const char *value, unsigned int vlen) +{ + struct dt_property *p; + char *tmp = zalloc(vlen + 1); + + if (!tmp) + return NULL; + + strncpy(tmp, value, vlen); + p = dt_add_property(node, name, tmp, strlen(tmp)+1); + free(tmp); + + return p; +} + +struct dt_property *__dt_add_property_cells(struct dt_node *node, + const char *name, + int count, ...) +{ + struct dt_property *p; + fdt32_t *val; + unsigned int i; + va_list args; + + p = new_property(node, name, count * sizeof(u32)); + val = (fdt32_t *)p->prop; + va_start(args, count); + for (i = 0; i < count; i++) + val[i] = cpu_to_fdt32(va_arg(args, u32)); + va_end(args); + return p; +} + +struct dt_property *__dt_add_property_u64s(struct dt_node *node, + const char *name, + int count, ...) +{ + struct dt_property *p; + fdt64_t *val; + unsigned int i; + va_list args; + + p = new_property(node, name, count * sizeof(u64)); + val = (fdt64_t *)p->prop; + va_start(args, count); + for (i = 0; i < count; i++) + val[i] = cpu_to_fdt64(va_arg(args, u64)); + va_end(args); + return p; +} + +struct dt_property *__dt_add_property_strings(struct dt_node *node, + const char *name, + int count, ...) +{ + struct dt_property *p; + unsigned int i, size; + va_list args; + const char *sstr; + char *s; + + va_start(args, count); + for (i = size = 0; i < count; i++) { + sstr = va_arg(args, const char *); + if (sstr) + size += strlen(sstr) + 1; + } + va_end(args); + if (!size) + size = 1; + p = new_property(node, name, size); + s = (char *)p->prop; + *s = 0; + va_start(args, count); + for (i = 0; i < count; i++) { + sstr = va_arg(args, const char *); + if (sstr) { + strcpy(s, sstr); + s = s + strlen(sstr) + 1; + } + } + va_end(args); + return p; +} + +void dt_del_property(struct dt_node *node, struct dt_property *prop) +{ + list_del_from(&node->properties, &prop->list); + free_name(prop->name); + free(prop); +} + +u32 dt_property_get_cell(const struct dt_property *prop, u32 index) +{ + assert(prop->len >= (index+1)*sizeof(u32)); + /* Always aligned, so this works. */ + return fdt32_to_cpu(((const fdt32_t *)prop->prop)[index]); +} + +u64 dt_property_get_u64(const struct dt_property *prop, u32 index) +{ + assert(prop->len >= (index+1)*sizeof(u64)); + /* Always aligned, so this works. */ + return fdt64_to_cpu(((const fdt64_t *)prop->prop)[index]); +} + +void dt_property_set_cell(struct dt_property *prop, u32 index, u32 val) +{ + assert(prop->len >= (index+1)*sizeof(u32)); + /* Always aligned, so this works. */ + ((fdt32_t *)prop->prop)[index] = cpu_to_fdt32(val); +} + +/* First child of this node. */ +struct dt_node *dt_first(const struct dt_node *root) +{ + return list_top(&root->children, struct dt_node, list); +} + +/* Return next node, or NULL. */ +struct dt_node *dt_next(const struct dt_node *root, + const struct dt_node *prev) +{ + if (!prev) { + struct dt_node *first = dt_first(root); + + if (!first) + return NULL; + else + return first; + } + + /* Children? */ + if (!list_empty(&prev->children)) + return dt_first(prev); + + do { + /* More siblings? 
*/ + if (prev->list.next != &prev->parent->children.n) + return list_entry(prev->list.next, struct dt_node,list); + + /* No more siblings, move up to parent. */ + prev = prev->parent; + } while (prev != root); + + return NULL; +} + +struct dt_property *__dt_find_property(struct dt_node *node, const char *name) +{ + struct dt_property *i; + + list_for_each(&node->properties, i, list) + if (strcmp(i->name, name) == 0) + return i; + return NULL; +} + +const struct dt_property *dt_find_property(const struct dt_node *node, + const char *name) +{ + const struct dt_property *i; + + list_for_each(&node->properties, i, list) + if (strcmp(i->name, name) == 0) + return i; + return NULL; +} + +void dt_check_del_prop(struct dt_node *node, const char *name) +{ + struct dt_property *p; + + p = __dt_find_property(node, name); + if (p) + dt_del_property(node, p); +} +const struct dt_property *dt_require_property(const struct dt_node *node, + const char *name, int wanted_len) +{ + const struct dt_property *p = dt_find_property(node, name); + + if (!p) { + const char *path = dt_get_path(node); + + prerror("DT: Missing required property %s/%s\n", + path, name); + assert(false); + } + if (wanted_len >= 0 && p->len != wanted_len) { + const char *path = dt_get_path(node); + + prerror("DT: Unexpected property length %s/%s\n", + path, name); + prerror("DT: Expected len: %d got len: %zu\n", + wanted_len, p->len); + assert(false); + } + + return p; +} + +bool dt_has_node_property(const struct dt_node *node, + const char *name, const char *val) +{ + const struct dt_property *p = dt_find_property(node, name); + + if (!p) + return false; + if (!val) + return true; + + return p->len == strlen(val) + 1 && memcmp(p->prop, val, p->len) == 0; +} + +bool dt_prop_find_string(const struct dt_property *p, const char *s) +{ + const char *c, *end; + + if (!p) + return false; + c = p->prop; + end = c + p->len; + + while(c < end) { + if (!strcasecmp(s, c)) + return true; + c += strlen(c) + 1; + } + return false; +} + +bool dt_node_is_compatible(const struct dt_node *node, const char *compat) +{ + const struct dt_property *p = dt_find_property(node, "compatible"); + + return dt_prop_find_string(p, compat); +} + +struct dt_node *dt_find_compatible_node(struct dt_node *root, + struct dt_node *prev, + const char *compat) +{ + struct dt_node *node = prev; + + while ((node = dt_next(root, node))) + if (dt_node_is_compatible(node, compat)) + return node; + return NULL; +} + +u64 dt_prop_get_u64(const struct dt_node *node, const char *prop) +{ + const struct dt_property *p = dt_require_property(node, prop, 8); + + return ((u64)dt_property_get_cell(p, 0) << 32) + | dt_property_get_cell(p, 1); +} + +u64 dt_prop_get_u64_def(const struct dt_node *node, const char *prop, u64 def) +{ + const struct dt_property *p = dt_find_property(node, prop); + + if (!p) + return def; + + return ((u64)dt_property_get_cell(p, 0) << 32) + | dt_property_get_cell(p, 1); +} + +u32 dt_prop_get_u32(const struct dt_node *node, const char *prop) +{ + const struct dt_property *p = dt_require_property(node, prop, 4); + + return dt_property_get_cell(p, 0); +} + +u32 dt_prop_get_u32_def(const struct dt_node *node, const char *prop, u32 def) +{ + const struct dt_property *p = dt_find_property(node, prop); + + if (!p) + return def; + + return dt_property_get_cell(p, 0); +} + +const void *dt_prop_get(const struct dt_node *node, const char *prop) +{ + const struct dt_property *p = dt_require_property(node, prop, -1); + + return p->prop; +} + +const void *dt_prop_get_def(const 
struct dt_node *node, const char *prop, + void *def) +{ + const struct dt_property *p = dt_find_property(node, prop); + + return p ? p->prop : def; +} + +const void *dt_prop_get_def_size(const struct dt_node *node, const char *prop, + void *def, size_t *len) +{ + const struct dt_property *p = dt_find_property(node, prop); + *len = 0; + if (p) + *len = p->len; + + return p ? p->prop : def; +} + +u32 dt_prop_get_cell(const struct dt_node *node, const char *prop, u32 cell) +{ + const struct dt_property *p = dt_require_property(node, prop, -1); + + return dt_property_get_cell(p, cell); +} + +u32 dt_prop_get_cell_def(const struct dt_node *node, const char *prop, + u32 cell, u32 def) +{ + const struct dt_property *p = dt_find_property(node, prop); + + if (!p) + return def; + + return dt_property_get_cell(p, cell); +} + +void dt_free(struct dt_node *node) +{ + struct dt_node *child; + struct dt_property *p; + + while ((child = list_top(&node->children, struct dt_node, list))) + dt_free(child); + + while ((p = list_pop(&node->properties, struct dt_property, list))) { + free_name(p->name); + free(p); + } + + if (node->parent) + list_del_from(&node->parent->children, &node->list); + dt_destroy(node); +} + +int dt_expand_node(struct dt_node *node, const void *fdt, int fdt_node) +{ + const struct fdt_property *prop; + int offset, nextoffset, err; + struct dt_node *child; + const char *name; + uint32_t tag; + + if (((err = fdt_check_header(fdt)) != 0) + || ((err = fdt_check_node_offset_(fdt, fdt_node)) < 0)) { + prerror("FDT: Error %d parsing node 0x%x\n", err, fdt_node); + return -1; + } + + nextoffset = err; + do { + offset = nextoffset; + + tag = fdt_next_tag(fdt, offset, &nextoffset); + switch (tag) { + case FDT_PROP: + prop = fdt_offset_ptr_(fdt, offset); + name = fdt_string(fdt, fdt32_to_cpu(prop->nameoff)); + dt_add_property(node, name, prop->data, + fdt32_to_cpu(prop->len)); + break; + case FDT_BEGIN_NODE: + name = fdt_get_name(fdt, offset, NULL); + child = dt_new_root(name); + assert(child); + nextoffset = dt_expand_node(child, fdt, offset); + + /* + * This may fail in case of duplicate, keep it + * going for now, we may ultimately want to + * assert + */ + if (!dt_attach_root(node, child)) + /** + * @fwts-label DTHasDuplicateNodeID + * @fwts-advice OPAL will parse the Flattened + * Device Tree(FDT), which can be generated + * from different firmware sources. During + * expansion of FDT, OPAL observed a node + * assigned multiple times (a duplicate). This + * indicates either a Hostboot bug *OR*, more + * likely, a bug in the platform XML. Check + * the platform XML for duplicate IDs for + * this type of device. Because of this + * duplicate node, OPAL won't add the hardware + * device found with a duplicate node ID into + * DT, rendering the corresponding device not + * functional. 
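For illustration, a standalone sketch of how a "compatible" property is matched: the value is a sequence of NUL-terminated strings packed back to back, and dt_prop_find_string() above walks it in the same way. Standard C plus POSIX strcasecmp(); stringlist_contains() is an invented name.

#include <assert.h>
#include <stdbool.h>
#include <string.h>
#include <strings.h>

static bool stringlist_contains(const char *prop, size_t len, const char *s)
{
	const char *c = prop, *end = prop + len;

	while (c < end) {
		if (!strcasecmp(s, c))
			return true;
		c += strlen(c) + 1;	/* skip the string and its NUL */
	}
	return false;
}

int main(void)
{
	/* two compatible strings packed into one property value */
	static const char compat[] = "ibm,power9-xscom\0ibm,xscom";

	assert(stringlist_contains(compat, sizeof(compat), "ibm,xscom"));
	assert(!stringlist_contains(compat, sizeof(compat), "ibm,opb"));
	return 0;
}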
+ */ + prlog(PR_ERR, "DT: Found duplicate node: %s\n", + child->name); + break; + case FDT_END: + return -1; + } + } while (tag != FDT_END_NODE); + + return nextoffset; +} + +void dt_expand(const void *fdt) +{ + prlog(PR_DEBUG, "FDT: Parsing fdt @%p\n", fdt); + + if (dt_expand_node(dt_root, fdt, 0) < 0) + abort(); +} + +u64 dt_get_number(const void *pdata, unsigned int cells) +{ + const __be32 *p = pdata; + u64 ret = 0; + + while(cells--) + ret = (ret << 32) | be32_to_cpu(*(p++)); + return ret; +} + +u32 dt_n_address_cells(const struct dt_node *node) +{ + if (!node->parent) + return 0; + return dt_prop_get_u32_def(node->parent, "#address-cells", 2); +} + +u32 dt_n_size_cells(const struct dt_node *node) +{ + if (!node->parent) + return 0; + return dt_prop_get_u32_def(node->parent, "#size-cells", 1); +} + +u64 dt_get_address(const struct dt_node *node, unsigned int index, + u64 *out_size) +{ + const struct dt_property *p; + u32 na = dt_n_address_cells(node); + u32 ns = dt_n_size_cells(node); + u32 pos, n; + + p = dt_require_property(node, "reg", -1); + n = (na + ns) * sizeof(u32); + pos = n * index; + assert((pos + n) <= p->len); + if (out_size) + *out_size = dt_get_number(p->prop + pos + na * sizeof(u32), ns); + return dt_get_number(p->prop + pos, na); +} + +u32 __dt_get_chip_id(const struct dt_node *node) +{ + const struct dt_property *prop; + + for (; node; node = node->parent) { + prop = dt_find_property(node, "ibm,chip-id"); + if (prop) + return dt_property_get_cell(prop, 0); + } + return 0xffffffff; +} + +u32 dt_get_chip_id(const struct dt_node *node) +{ + u32 id = __dt_get_chip_id(node); + assert(id != 0xffffffff); + return id; +} + +struct dt_node *dt_find_compatible_node_on_chip(struct dt_node *root, + struct dt_node *prev, + const char *compat, + uint32_t chip_id) +{ + struct dt_node *node = prev; + + while ((node = dt_next(root, node))) { + u32 cid = __dt_get_chip_id(node); + if (cid == chip_id && + dt_node_is_compatible(node, compat)) + return node; + } + return NULL; +} + +unsigned int dt_count_addresses(const struct dt_node *node) +{ + const struct dt_property *p; + u32 na = dt_n_address_cells(node); + u32 ns = dt_n_size_cells(node); + u32 n; + + p = dt_require_property(node, "reg", -1); + n = (na + ns) * sizeof(u32); + + if (n == 0) + return 0; + + return p->len / n; +} + +/* Translates an address from the given bus into its parent's address space */ +static u64 dt_translate_one(const struct dt_node *bus, u64 addr) +{ + u32 ranges_count, na, ns, parent_na; + const struct dt_property *p; + const u32 *ranges; + int i, stride; + + assert(bus->parent); + + na = dt_prop_get_u32_def(bus, "#address-cells", 2); + ns = dt_prop_get_u32_def(bus, "#size-cells", 2); + parent_na = dt_n_address_cells(bus); + + stride = na + ns + parent_na; + + /* + * FIXME: We should handle arbitrary length addresses, rather than + * limiting it to 64bit. 
If someone wants/needs that they + * can implement the bignum math for it :) + */ + assert(na <= 2); + assert(parent_na <= 2); + + /* We should never be trying to translate an address without a ranges */ + p = dt_require_property(bus, "ranges", -1); + + ranges = (u32 *) &p->prop; + ranges_count = (p->len / 4) / (na + parent_na + ns); + + /* An empty ranges property implies 1-1 translation */ + if (ranges_count == 0) + return addr; + + for (i = 0; i < ranges_count; i++, ranges += stride) { + /* ranges format: */ + u64 child_base = dt_get_number(ranges, na); + u64 parent_base = dt_get_number(ranges + na, parent_na); + u64 size = dt_get_number(ranges + na + parent_na, ns); + + if (addr >= child_base && addr < child_base + size) + return (addr - child_base) + parent_base; + } + + /* input address was outside the any of our mapped ranges */ + return 0; +} + +u64 dt_translate_address(const struct dt_node *node, unsigned int index, + u64 *out_size) +{ + u64 addr = dt_get_address(node, index, NULL); + struct dt_node *bus = node->parent; + + /* FIXME: One day we will probably want to use this, but for now just + * force it it to be zero since we only support returning a u64 or u32 + */ + assert(!out_size); + + /* apply each translation until we hit the root bus */ + while (bus->parent) { + addr = dt_translate_one(bus, addr); + bus = bus->parent; + } + + return addr; +} + +bool dt_node_is_enabled(struct dt_node *node) +{ + const struct dt_property *p = dt_find_property(node, "status"); + + if (!p) + return true; + + return p->len > 1 && p->prop[0] == 'o' && p->prop[1] == 'k'; +} + +/* + * Function to fixup the phandle in the subtree. + */ +void dt_adjust_subtree_phandle(struct dt_node *dev, + const char** (get_properties_to_fix)(struct dt_node *n)) +{ + struct dt_node *node; + struct dt_property *prop; + u32 phandle, max_phandle = 0, import_phandle = new_phandle(); + __be32 p; + const char **name; + + dt_for_each_node(dev, node) { + const char **props_to_update; + node->phandle += import_phandle; + + /* + * calculate max_phandle(new_tree), needed to update + * last_phandle. + */ + if (node->phandle >= max_phandle) + max_phandle = node->phandle; + + props_to_update = get_properties_to_fix(node); + if (!props_to_update) + continue; + for (name = props_to_update; *name != NULL; name++) { + prop = __dt_find_property(node, *name); + if (!prop) + continue; + phandle = dt_prop_get_u32(node, *name); + phandle += import_phandle; + p = cpu_to_be32(phandle); + memcpy((char *)&prop->prop, &p, prop->len); + } + } + + set_last_phandle(max_phandle); +} diff --git a/roms/skiboot/core/direct-controls.c b/roms/skiboot/core/direct-controls.c new file mode 100644 index 000000000..37bcf9826 --- /dev/null +++ b/roms/skiboot/core/direct-controls.c @@ -0,0 +1,1161 @@ +// SPDX-License-Identifier: Apache-2.0 +/* + * Directly control CPU cores/threads. SRESET, special wakeup, etc + * + * Copyright 2017-2019 IBM Corp. 
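A minimal standalone model of the "ranges" translation performed by dt_translate_one() above, with the variable-width address/size cells collapsed into plain 64-bit fields (an assumption made for brevity; the real code reads #address-cells and #size-cells). An address inside a child window is rebased into the parent's space, an empty table means identity translation, and an unmapped address yields 0, matching the code above. The example addresses are made up.

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

struct range {
	uint64_t child_base;	/* address in the child (bus) space */
	uint64_t parent_base;	/* where that window sits in the parent */
	uint64_t size;
};

static uint64_t translate_one(const struct range *r, size_t count,
			      uint64_t addr)
{
	size_t i;

	if (count == 0)		/* empty "ranges": identity translation */
		return addr;

	for (i = 0; i < count; i++)
		if (addr >= r[i].child_base &&
		    addr < r[i].child_base + r[i].size)
			return (addr - r[i].child_base) + r[i].parent_base;

	return 0;		/* outside every window, as above */
}

int main(void)
{
	const struct range bus[] = {
		{ 0x0, 0x60300000000ULL, 0x10000 },
	};

	assert(translate_one(bus, 1, 0x2000) == 0x60300002000ULL);
	assert(translate_one(NULL, 0, 0x2000) == 0x2000);
	assert(translate_one(bus, 1, 0x20000) == 0);
	return 0;
}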
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +/**************** mambo direct controls ****************/ + +extern unsigned long callthru_tcl(const char *str, int len); + +static void mambo_sreset_cpu(struct cpu_thread *cpu) +{ + uint32_t chip_id = pir_to_chip_id(cpu->pir); + uint32_t core_id = pir_to_core_id(cpu->pir); + uint32_t thread_id = pir_to_thread_id(cpu->pir); + char tcl_cmd[50]; + + snprintf(tcl_cmd, sizeof(tcl_cmd), + "mysim cpu %i:%i:%i start_thread 0x100", + chip_id, core_id, thread_id); + callthru_tcl(tcl_cmd, strlen(tcl_cmd)); +} + +static void mambo_stop_cpu(struct cpu_thread *cpu) +{ + uint32_t chip_id = pir_to_chip_id(cpu->pir); + uint32_t core_id = pir_to_core_id(cpu->pir); + uint32_t thread_id = pir_to_thread_id(cpu->pir); + char tcl_cmd[50]; + + snprintf(tcl_cmd, sizeof(tcl_cmd), + "mysim cpu %i:%i:%i stop_thread", + chip_id, core_id, thread_id); + callthru_tcl(tcl_cmd, strlen(tcl_cmd)); +} + +/**************** POWER8 direct controls ****************/ + +static int p8_core_set_special_wakeup(struct cpu_thread *cpu) +{ + uint64_t val, poll_target, stamp; + uint32_t core_id; + int rc; + + /* + * Note: HWP checks for checkstops, but I assume we don't need to + * as we wouldn't be running if one was present + */ + + /* Grab core ID once */ + core_id = pir_to_core_id(cpu->pir); + + prlog(PR_DEBUG, "RESET Waking up core 0x%x\n", core_id); + + /* + * The original HWp reads the XSCOM first but ignores the result + * and error, let's do the same until I know for sure that is + * not necessary + */ + xscom_read(cpu->chip_id, + XSCOM_ADDR_P8_EX_SLAVE(core_id, EX_PM_SPECIAL_WAKEUP_PHYP), + &val); + + /* Then we write special wakeup */ + rc = xscom_write(cpu->chip_id, + XSCOM_ADDR_P8_EX_SLAVE(core_id, + EX_PM_SPECIAL_WAKEUP_PHYP), + PPC_BIT(0)); + if (rc) { + prerror("RESET: XSCOM error %d asserting special" + " wakeup on 0x%x\n", rc, cpu->pir); + return rc; + } + + /* + * HWP uses the history for Perf register here, dunno why it uses + * that one instead of the pHyp one, maybe to avoid clobbering it... + * + * In any case, it does that to check for run/nap vs.sleep/winkle/other + * to decide whether to poll on checkstop or not. Since we don't deal + * with checkstop conditions here, we ignore that part. + */ + + /* + * Now poll for completion of special wakeup. The HWP is nasty here, + * it will poll at 5ms intervals for up to 200ms. This is not quite + * acceptable for us at runtime, at least not until we have the + * ability to "context switch" HBRT. In practice, because we don't + * winkle, it will never take that long, so we increase the polling + * frequency to 1us per poll. However we do have to keep the same + * timeout. + * + * We don't use time_wait_ms() either for now as we don't want to + * poll the FSP here. + */ + stamp = mftb(); + poll_target = stamp + msecs_to_tb(200); + val = 0; + while (!(val & EX_PM_GP0_SPECIAL_WAKEUP_DONE)) { + /* Wait 1 us */ + time_wait_us(1); + + /* Read PM state */ + rc = xscom_read(cpu->chip_id, + XSCOM_ADDR_P8_EX_SLAVE(core_id, EX_PM_GP0), + &val); + if (rc) { + prerror("RESET: XSCOM error %d reading PM state on" + " 0x%x\n", rc, cpu->pir); + return rc; + } + /* Check timeout */ + if (mftb() > poll_target) + break; + } + + /* Success ? */ + if (val & EX_PM_GP0_SPECIAL_WAKEUP_DONE) { + uint64_t now = mftb(); + prlog(PR_TRACE, "RESET: Special wakeup complete after %ld us\n", + tb_to_usecs(now - stamp)); + return 0; + } + + /* + * We timed out ... 
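The wakeup-completion loop above is a poll-until-done-or-timeout pattern: a roughly 200ms budget measured on the timebase, polled at about 1us intervals. A rough standalone sketch of that shape, with CLOCK_MONOTONIC standing in for the timebase and a stub standing in for the XSCOM read; all names here are invented for the example.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <time.h>

static uint64_t now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * 1000000000ULL + ts.tv_nsec;
}

/* Stub for the XSCOM read of the PM state; completes after a few polls. */
static bool wakeup_done(int *fake_hw)
{
	return --(*fake_hw) <= 0;
}

static int wait_for_special_wakeup(void)
{
	uint64_t deadline = now_ns() + 200ULL * 1000 * 1000;	/* 200ms */
	struct timespec poll = { 0, 1000 };			/* ~1us */
	int fake_hw = 5;

	while (!wakeup_done(&fake_hw)) {
		nanosleep(&poll, NULL);
		if (now_ns() > deadline)
			return -1;				/* timed out */
	}
	return 0;
}

int main(void)
{
	printf("special wakeup %s\n",
	       wait_for_special_wakeup() ? "timed out" : "done");
	return 0;
}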
+ * + * HWP has a complex workaround for HW255321 which affects + * Murano DD1 and Venice DD1. Ignore that for now + * + * Instead we just dump some XSCOMs for error logging + */ + prerror("RESET: Timeout on special wakeup of 0x%0x\n", cpu->pir); + prerror("RESET: PM0 = 0x%016llx\n", val); + val = -1; + xscom_read(cpu->chip_id, + XSCOM_ADDR_P8_EX_SLAVE(core_id, EX_PM_SPECIAL_WAKEUP_PHYP), + &val); + prerror("RESET: SPC_WKUP = 0x%016llx\n", val); + val = -1; + xscom_read(cpu->chip_id, + XSCOM_ADDR_P8_EX_SLAVE(core_id, + EX_PM_IDLE_STATE_HISTORY_PHYP), + &val); + prerror("RESET: HISTORY = 0x%016llx\n", val); + + return OPAL_HARDWARE; +} + +static int p8_core_clear_special_wakeup(struct cpu_thread *cpu) +{ + uint64_t val; + uint32_t core_id; + int rc; + + /* + * Note: HWP checks for checkstops, but I assume we don't need to + * as we wouldn't be running if one was present + */ + + /* Grab core ID once */ + core_id = pir_to_core_id(cpu->pir); + + prlog(PR_DEBUG, "RESET: Releasing core 0x%x wakeup\n", core_id); + + /* + * The original HWp reads the XSCOM first but ignores the result + * and error, let's do the same until I know for sure that is + * not necessary + */ + xscom_read(cpu->chip_id, + XSCOM_ADDR_P8_EX_SLAVE(core_id, EX_PM_SPECIAL_WAKEUP_PHYP), + &val); + + /* Then we write special wakeup */ + rc = xscom_write(cpu->chip_id, + XSCOM_ADDR_P8_EX_SLAVE(core_id, + EX_PM_SPECIAL_WAKEUP_PHYP), 0); + if (rc) { + prerror("RESET: XSCOM error %d deasserting" + " special wakeup on 0x%x\n", rc, cpu->pir); + return rc; + } + + /* + * The original HWp reads the XSCOM again with the comment + * "This puts an inherent delay in the propagation of the reset + * transition" + */ + xscom_read(cpu->chip_id, + XSCOM_ADDR_P8_EX_SLAVE(core_id, EX_PM_SPECIAL_WAKEUP_PHYP), + &val); + + return 0; +} + +static int p8_stop_thread(struct cpu_thread *cpu) +{ + uint32_t core_id = pir_to_core_id(cpu->pir); + uint32_t chip_id = pir_to_chip_id(cpu->pir); + uint32_t thread_id = pir_to_thread_id(cpu->pir); + uint32_t xscom_addr; + + xscom_addr = XSCOM_ADDR_P8_EX(core_id, + P8_EX_TCTL_DIRECT_CONTROLS(thread_id)); + + if (xscom_write(chip_id, xscom_addr, P8_DIRECT_CTL_STOP)) { + prlog(PR_ERR, "Could not stop thread %u:%u:%u:" + " Unable to write EX_TCTL_DIRECT_CONTROLS.\n", + chip_id, core_id, thread_id); + return OPAL_HARDWARE; + } + + return OPAL_SUCCESS; +} + +static int p8_sreset_thread(struct cpu_thread *cpu) +{ + uint32_t core_id = pir_to_core_id(cpu->pir); + uint32_t chip_id = pir_to_chip_id(cpu->pir); + uint32_t thread_id = pir_to_thread_id(cpu->pir); + uint32_t xscom_addr; + + xscom_addr = XSCOM_ADDR_P8_EX(core_id, + P8_EX_TCTL_DIRECT_CONTROLS(thread_id)); + + if (xscom_write(chip_id, xscom_addr, P8_DIRECT_CTL_PRENAP)) { + prlog(PR_ERR, "Could not prenap thread %u:%u:%u:" + " Unable to write EX_TCTL_DIRECT_CONTROLS.\n", + chip_id, core_id, thread_id); + return OPAL_HARDWARE; + } + if (xscom_write(chip_id, xscom_addr, P8_DIRECT_CTL_SRESET)) { + prlog(PR_ERR, "Could not sreset thread %u:%u:%u:" + " Unable to write EX_TCTL_DIRECT_CONTROLS.\n", + chip_id, core_id, thread_id); + return OPAL_HARDWARE; + } + + return OPAL_SUCCESS; +} + + +/**************** POWER9 direct controls ****************/ + +/* Long running instructions may take time to complete. Timeout 100ms */ +#define P9_QUIESCE_POLL_INTERVAL 100 +#define P9_QUIESCE_TIMEOUT 100000 + +/* Waking may take up to 5ms for deepest sleep states. 
Set timeout to 100ms */ +#define P9_SPWKUP_POLL_INTERVAL 100 +#define P9_SPWKUP_TIMEOUT 100000 + +/* + * This implements direct control facilities of processor cores and threads + * using scom registers. + */ + +static int p9_core_is_gated(struct cpu_thread *cpu) +{ + uint32_t chip_id = pir_to_chip_id(cpu->pir); + uint32_t core_id = pir_to_core_id(cpu->pir); + uint32_t sshhyp_addr; + uint64_t val; + + sshhyp_addr = XSCOM_ADDR_P9_EC_SLAVE(core_id, P9_EC_PPM_SSHHYP); + + if (xscom_read(chip_id, sshhyp_addr, &val)) { + prlog(PR_ERR, "Could not query core gated on %u:%u:" + " Unable to read PPM_SSHHYP.\n", + chip_id, core_id); + return OPAL_HARDWARE; + } + + return !!(val & P9_CORE_GATED); +} + +static int p9_core_set_special_wakeup(struct cpu_thread *cpu) +{ + uint32_t chip_id = pir_to_chip_id(cpu->pir); + uint32_t core_id = pir_to_core_id(cpu->pir); + uint32_t swake_addr; + uint32_t sshhyp_addr; + uint64_t val; + int i; + + swake_addr = XSCOM_ADDR_P9_EC_SLAVE(core_id, EC_PPM_SPECIAL_WKUP_HYP); + sshhyp_addr = XSCOM_ADDR_P9_EC_SLAVE(core_id, P9_EC_PPM_SSHHYP); + + if (xscom_write(chip_id, swake_addr, P9_SPWKUP_SET)) { + prlog(PR_ERR, "Could not set special wakeup on %u:%u:" + " Unable to write PPM_SPECIAL_WKUP_HYP.\n", + chip_id, core_id); + goto out_fail; + } + + for (i = 0; i < P9_SPWKUP_TIMEOUT / P9_SPWKUP_POLL_INTERVAL; i++) { + if (xscom_read(chip_id, sshhyp_addr, &val)) { + prlog(PR_ERR, "Could not set special wakeup on %u:%u:" + " Unable to read PPM_SSHHYP.\n", + chip_id, core_id); + goto out_fail; + } + if (val & P9_SPECIAL_WKUP_DONE) { + /* + * CORE_GATED will be unset on a successful special + * wakeup of the core which indicates that the core is + * out of stop state. If CORE_GATED is still set then + * raise error. + */ + if (p9_core_is_gated(cpu)) { + /* Deassert spwu for this strange error */ + xscom_write(chip_id, swake_addr, 0); + prlog(PR_ERR, "Failed special wakeup on %u:%u" + " as CORE_GATED is set\n", + chip_id, core_id); + goto out_fail; + } else { + return 0; + } + } + time_wait_us(P9_SPWKUP_POLL_INTERVAL); + } + + prlog(PR_ERR, "Could not set special wakeup on %u:%u:" + " timeout waiting for SPECIAL_WKUP_DONE.\n", + chip_id, core_id); + +out_fail: + /* + * As per the special wakeup protocol we should not de-assert + * the special wakeup on the core until WAKEUP_DONE is set. + * So even on error do not de-assert. + */ + return OPAL_HARDWARE; +} + +static int p9_core_clear_special_wakeup(struct cpu_thread *cpu) +{ + uint32_t chip_id = pir_to_chip_id(cpu->pir); + uint32_t core_id = pir_to_core_id(cpu->pir); + uint32_t swake_addr; + + swake_addr = XSCOM_ADDR_P9_EC_SLAVE(core_id, EC_PPM_SPECIAL_WKUP_HYP); + + /* + * De-assert special wakeup after a small delay. + * The delay may help avoid problems setting and clearing special + * wakeup back-to-back. This should be confirmed. + */ + time_wait_us(1); + if (xscom_write(chip_id, swake_addr, 0)) { + prlog(PR_ERR, "Could not clear special wakeup on %u:%u:" + " Unable to write PPM_SPECIAL_WKUP_HYP.\n", + chip_id, core_id); + return OPAL_HARDWARE; + } + + /* + * Don't wait for de-assert to complete as other components + * could have requested for special wkeup. 
Wait for 10ms to + * avoid back-to-back asserts + */ + time_wait_us(10000); + return 0; +} + +static int p9_thread_quiesced(struct cpu_thread *cpu) +{ + uint32_t chip_id = pir_to_chip_id(cpu->pir); + uint32_t core_id = pir_to_core_id(cpu->pir); + uint32_t thread_id = pir_to_thread_id(cpu->pir); + uint32_t ras_addr; + uint64_t ras_status; + + ras_addr = XSCOM_ADDR_P9_EC(core_id, P9_RAS_STATUS); + if (xscom_read(chip_id, ras_addr, &ras_status)) { + prlog(PR_ERR, "Could not check thread state on %u:%u:" + " Unable to read RAS_STATUS.\n", + chip_id, core_id); + return OPAL_HARDWARE; + } + + /* + * This returns true when the thread is quiesced and all + * instructions completed. For sreset this may not be necessary, + * but we may want to use instruction ramming or stepping + * direct controls where it is important. + */ + if ((ras_status & P9_THREAD_QUIESCED(thread_id)) + == P9_THREAD_QUIESCED(thread_id)) + return 1; + + return 0; +} + +static int p9_cont_thread(struct cpu_thread *cpu) +{ + uint32_t chip_id = pir_to_chip_id(cpu->pir); + uint32_t core_id = pir_to_core_id(cpu->pir); + uint32_t thread_id = pir_to_thread_id(cpu->pir); + uint32_t cts_addr; + uint32_t ti_addr; + uint32_t dctl_addr; + uint64_t core_thread_state; + uint64_t thread_info; + bool active, stop; + int rc; + + rc = p9_thread_quiesced(cpu); + if (rc < 0) + return rc; + if (!rc) { + prlog(PR_ERR, "Could not cont thread %u:%u:%u:" + " Thread is not quiesced.\n", + chip_id, core_id, thread_id); + return OPAL_BUSY; + } + + cts_addr = XSCOM_ADDR_P9_EC(core_id, P9_CORE_THREAD_STATE); + ti_addr = XSCOM_ADDR_P9_EC(core_id, P9_THREAD_INFO); + dctl_addr = XSCOM_ADDR_P9_EC(core_id, P9_EC_DIRECT_CONTROLS); + + if (xscom_read(chip_id, cts_addr, &core_thread_state)) { + prlog(PR_ERR, "Could not resume thread %u:%u:%u:" + " Unable to read CORE_THREAD_STATE.\n", + chip_id, core_id, thread_id); + return OPAL_HARDWARE; + } + if (core_thread_state & PPC_BIT(56 + thread_id)) + stop = true; + else + stop = false; + + if (xscom_read(chip_id, ti_addr, &thread_info)) { + prlog(PR_ERR, "Could not resume thread %u:%u:%u:" + " Unable to read THREAD_INFO.\n", + chip_id, core_id, thread_id); + return OPAL_HARDWARE; + } + if (thread_info & PPC_BIT(thread_id)) + active = true; + else + active = false; + + if (!active || stop) { + if (xscom_write(chip_id, dctl_addr, P9_THREAD_CLEAR_MAINT(thread_id))) { + prlog(PR_ERR, "Could not resume thread %u:%u:%u:" + " Unable to write EC_DIRECT_CONTROLS.\n", + chip_id, core_id, thread_id); + } + } else { + if (xscom_write(chip_id, dctl_addr, P9_THREAD_CONT(thread_id))) { + prlog(PR_ERR, "Could not resume thread %u:%u:%u:" + " Unable to write EC_DIRECT_CONTROLS.\n", + chip_id, core_id, thread_id); + } + } + + return 0; +} + +static int p9_stop_thread(struct cpu_thread *cpu) +{ + uint32_t chip_id = pir_to_chip_id(cpu->pir); + uint32_t core_id = pir_to_core_id(cpu->pir); + uint32_t thread_id = pir_to_thread_id(cpu->pir); + uint32_t dctl_addr; + int rc; + int i; + + dctl_addr = XSCOM_ADDR_P9_EC(core_id, P9_EC_DIRECT_CONTROLS); + + rc = p9_thread_quiesced(cpu); + if (rc < 0) + return rc; + if (rc) { + prlog(PR_ERR, "Could not stop thread %u:%u:%u:" + " Thread is quiesced already.\n", + chip_id, core_id, thread_id); + return OPAL_BUSY; + } + + if (xscom_write(chip_id, dctl_addr, P9_THREAD_STOP(thread_id))) { + prlog(PR_ERR, "Could not stop thread %u:%u:%u:" + " Unable to write EC_DIRECT_CONTROLS.\n", + chip_id, core_id, thread_id); + return OPAL_HARDWARE; + } + + for (i = 0; i < P9_QUIESCE_TIMEOUT / 
P9_QUIESCE_POLL_INTERVAL; i++) { + int rc = p9_thread_quiesced(cpu); + if (rc < 0) + break; + if (rc) + return 0; + + time_wait_us(P9_QUIESCE_POLL_INTERVAL); + } + + prlog(PR_ERR, "Could not stop thread %u:%u:%u:" + " Unable to quiesce thread.\n", + chip_id, core_id, thread_id); + + return OPAL_HARDWARE; +} + +static int p9_sreset_thread(struct cpu_thread *cpu) +{ + uint32_t chip_id = pir_to_chip_id(cpu->pir); + uint32_t core_id = pir_to_core_id(cpu->pir); + uint32_t thread_id = pir_to_thread_id(cpu->pir); + uint32_t dctl_addr; + + dctl_addr = XSCOM_ADDR_P9_EC(core_id, P9_EC_DIRECT_CONTROLS); + + if (xscom_write(chip_id, dctl_addr, P9_THREAD_SRESET(thread_id))) { + prlog(PR_ERR, "Could not sreset thread %u:%u:%u:" + " Unable to write EC_DIRECT_CONTROLS.\n", + chip_id, core_id, thread_id); + return OPAL_HARDWARE; + } + + return 0; +} + +/**************** POWER10 direct controls ****************/ + +/* Long running instructions may take time to complete. Timeout 100ms */ +#define P10_QUIESCE_POLL_INTERVAL 100 +#define P10_QUIESCE_TIMEOUT 100000 + +/* Waking may take up to 5ms for deepest sleep states. Set timeout to 100ms */ +#define P10_SPWU_POLL_INTERVAL 100 +#define P10_SPWU_TIMEOUT 100000 + +/* + * This implements direct control facilities of processor cores and threads + * using scom registers. + */ +static int p10_core_is_gated(struct cpu_thread *cpu) +{ + uint32_t chip_id = pir_to_chip_id(cpu->pir); + uint32_t core_id = pir_to_core_id(cpu->pir); + uint32_t ssh_addr; + uint64_t val; + + ssh_addr = XSCOM_ADDR_P10_QME_CORE(core_id, P10_QME_SSH_HYP); + + if (xscom_read(chip_id, ssh_addr, &val)) { + prlog(PR_ERR, "Could not query core gated on %u:%u:" + " Unable to read QME_SSH_HYP.\n", + chip_id, core_id); + return OPAL_HARDWARE; + } + + return !!(val & P10_SSH_CORE_GATED); +} + + +static int p10_core_set_special_wakeup(struct cpu_thread *cpu) +{ + uint32_t chip_id = pir_to_chip_id(cpu->pir); + uint32_t core_id = pir_to_core_id(cpu->pir); + uint32_t spwu_addr, ssh_addr; + uint64_t val; + int i; + + /* P10 could use SPWU_HYP done bit instead of SSH? */ + spwu_addr = XSCOM_ADDR_P10_QME_CORE(core_id, P10_QME_SPWU_HYP); + ssh_addr = XSCOM_ADDR_P10_QME_CORE(core_id, P10_QME_SSH_HYP); + + if (xscom_write(chip_id, spwu_addr, P10_SPWU_REQ)) { + prlog(PR_ERR, "Could not set special wakeup on %u:%u:" + " Unable to write QME_SPWU_HYP.\n", + chip_id, core_id); + return OPAL_HARDWARE; + } + + for (i = 0; i < P10_SPWU_TIMEOUT / P10_SPWU_POLL_INTERVAL; i++) { + if (xscom_read(chip_id, ssh_addr, &val)) { + prlog(PR_ERR, "Could not set special wakeup on %u:%u:" + " Unable to read QME_SSH_HYP.\n", + chip_id, core_id); + return OPAL_HARDWARE; + } + if (val & P10_SSH_SPWU_DONE) { + /* + * CORE_GATED will be unset on a successful special + * wakeup of the core which indicates that the core is + * out of stop state. If CORE_GATED is still set then + * check SPWU register and raise error only if SPWU_DONE + * is not set, else print a warning and consider SPWU + * operation as successful. + * This is in conjunction with a micocode bug, which + * calls out the fact that SPW can succeed in the case + * the core is gated but SPWU_HYP bit is set. 
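The comment above describes a three-way decision once SSH reports special wakeup done: not gated means success; gated with the SPWU register's own DONE bit set is tolerated because of the quirk described above; gated without it is an error. A tiny standalone truth-table sketch of that decision, implemented just below in the real code (spwu_result() is an invented name and the return codes are simplified to 0/-1):

#include <assert.h>
#include <stdbool.h>

/* 0 = wakeup succeeded, -1 = failure (the real code also deasserts the
 * wakeup and logs in the failure cases). */
static int spwu_result(bool ssh_done, bool core_gated, bool spwu_done)
{
	if (!ssh_done)
		return -1;	/* still waiting or timed out */
	if (!core_gated)
		return 0;	/* normal case: core left its stop state */
	/* Core still gated: tolerated only when the SPWU register itself
	 * reports DONE (the quirk described above). */
	return spwu_done ? 0 : -1;
}

int main(void)
{
	assert(spwu_result(true, false, false) == 0);	/* clean wakeup */
	assert(spwu_result(true, true, true) == 0);	/* gated, SPWU DONE */
	assert(spwu_result(true, true, false) == -1);	/* gated, no DONE */
	assert(spwu_result(false, false, false) == -1);	/* not done yet */
	return 0;
}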
+ */ + if (p10_core_is_gated(cpu)) { + if(xscom_read(chip_id, spwu_addr, &val)) { + prlog(PR_ERR, "Core %u:%u:" + " unable to read QME_SPWU_HYP\n", + chip_id, core_id); + return OPAL_HARDWARE; + } + if (val & P10_SPWU_DONE) { + /* + * If SPWU DONE bit is set then + * SPWU operation is complete + */ + prlog(PR_DEBUG, "Special wakeup on " + "%u:%u: core remains gated while" + " SPWU_HYP DONE set\n", + chip_id, core_id); + return 0; + } + /* Deassert spwu for this strange error */ + xscom_write(chip_id, spwu_addr, 0); + prlog(PR_ERR, + "Failed special wakeup on %u:%u" + " core remains gated.\n", + chip_id, core_id); + return OPAL_HARDWARE; + } else { + return 0; + } + } + time_wait_us(P10_SPWU_POLL_INTERVAL); + } + + prlog(PR_ERR, "Could not set special wakeup on %u:%u:" + " operation timeout.\n", + chip_id, core_id); + /* + * As per the special wakeup protocol we should not de-assert + * the special wakeup on the core until WAKEUP_DONE is set. + * So even on error do not de-assert. + */ + + return OPAL_HARDWARE; +} + +static int p10_core_clear_special_wakeup(struct cpu_thread *cpu) +{ + uint32_t chip_id = pir_to_chip_id(cpu->pir); + uint32_t core_id = pir_to_core_id(cpu->pir); + uint32_t spwu_addr; + + spwu_addr = XSCOM_ADDR_P10_QME_CORE(core_id, P10_QME_SPWU_HYP); + + /* Add a small delay here if spwu problems time_wait_us(1); */ + if (xscom_write(chip_id, spwu_addr, 0)) { + prlog(PR_ERR, "Could not clear special wakeup on %u:%u:" + " Unable to write QME_SPWU_HYP.\n", + chip_id, core_id); + return OPAL_HARDWARE; + } + + return 0; +} + +static int p10_thread_quiesced(struct cpu_thread *cpu) +{ + uint32_t chip_id = pir_to_chip_id(cpu->pir); + uint32_t core_id = pir_to_core_id(cpu->pir); + uint32_t thread_id = pir_to_thread_id(cpu->pir); + uint32_t ras_addr; + uint64_t ras_status; + + ras_addr = XSCOM_ADDR_P10_EC(core_id, P10_EC_RAS_STATUS); + if (xscom_read(chip_id, ras_addr, &ras_status)) { + prlog(PR_ERR, "Could not check thread state on %u:%u:" + " Unable to read EC_RAS_STATUS.\n", + chip_id, core_id); + return OPAL_HARDWARE; + } + + /* + * p10_thread_stop for the purpose of sreset wants QUIESCED + * and MAINT bits set. Step, RAM, etc. need more, but we don't + * use those in skiboot. + * + * P10 could try wait for more here in case of errors. 
+ */ + if (!(ras_status & P10_THREAD_QUIESCED(thread_id))) + return 0; + + if (!(ras_status & P10_THREAD_MAINT(thread_id))) + return 0; + + return 1; +} + +static int p10_cont_thread(struct cpu_thread *cpu) +{ + uint32_t chip_id = pir_to_chip_id(cpu->pir); + uint32_t core_id = pir_to_core_id(cpu->pir); + uint32_t thread_id = pir_to_thread_id(cpu->pir); + uint32_t cts_addr; + uint32_t ti_addr; + uint32_t dctl_addr; + uint64_t core_thread_state; + uint64_t thread_info; + bool active, stop; + int rc; + int i; + + rc = p10_thread_quiesced(cpu); + if (rc < 0) + return rc; + if (!rc) { + prlog(PR_ERR, "Could not cont thread %u:%u:%u:" + " Thread is not quiesced.\n", + chip_id, core_id, thread_id); + return OPAL_BUSY; + } + + cts_addr = XSCOM_ADDR_P10_EC(core_id, P10_EC_CORE_THREAD_STATE); + ti_addr = XSCOM_ADDR_P10_EC(core_id, P10_EC_THREAD_INFO); + dctl_addr = XSCOM_ADDR_P10_EC(core_id, P10_EC_DIRECT_CONTROLS); + + if (xscom_read(chip_id, cts_addr, &core_thread_state)) { + prlog(PR_ERR, "Could not resume thread %u:%u:%u:" + " Unable to read EC_CORE_THREAD_STATE.\n", + chip_id, core_id, thread_id); + return OPAL_HARDWARE; + } + if (core_thread_state & P10_THREAD_STOPPED(thread_id)) + stop = true; + else + stop = false; + + if (xscom_read(chip_id, ti_addr, &thread_info)) { + prlog(PR_ERR, "Could not resume thread %u:%u:%u:" + " Unable to read EC_THREAD_INFO.\n", + chip_id, core_id, thread_id); + return OPAL_HARDWARE; + } + if (thread_info & P10_THREAD_ACTIVE(thread_id)) + active = true; + else + active = false; + + if (!active || stop) { + if (xscom_write(chip_id, dctl_addr, P10_THREAD_CLEAR_MAINT(thread_id))) { + prlog(PR_ERR, "Could not resume thread %u:%u:%u:" + " Unable to write EC_DIRECT_CONTROLS.\n", + chip_id, core_id, thread_id); + } + } else { + if (xscom_write(chip_id, dctl_addr, P10_THREAD_START(thread_id))) { + prlog(PR_ERR, "Could not resume thread %u:%u:%u:" + " Unable to write EC_DIRECT_CONTROLS.\n", + chip_id, core_id, thread_id); + } + } + + for (i = 0; i < P10_QUIESCE_TIMEOUT / P10_QUIESCE_POLL_INTERVAL; i++) { + int rc = p10_thread_quiesced(cpu); + if (rc < 0) + break; + if (!rc) + return 0; + + time_wait_us(P10_QUIESCE_POLL_INTERVAL); + } + + prlog(PR_ERR, "Could not start thread %u:%u:%u:" + " Unable to start thread.\n", + chip_id, core_id, thread_id); + + return OPAL_HARDWARE; +} + +static int p10_stop_thread(struct cpu_thread *cpu) +{ + uint32_t chip_id = pir_to_chip_id(cpu->pir); + uint32_t core_id = pir_to_core_id(cpu->pir); + uint32_t thread_id = pir_to_thread_id(cpu->pir); + uint32_t dctl_addr; + int rc; + int i; + + dctl_addr = XSCOM_ADDR_P10_EC(core_id, P10_EC_DIRECT_CONTROLS); + + rc = p10_thread_quiesced(cpu); + if (rc < 0) + return rc; + if (rc) { + prlog(PR_ERR, "Could not stop thread %u:%u:%u:" + " Thread is quiesced already.\n", + chip_id, core_id, thread_id); + return OPAL_BUSY; + } + + if (xscom_write(chip_id, dctl_addr, P10_THREAD_STOP(thread_id))) { + prlog(PR_ERR, "Could not stop thread %u:%u:%u:" + " Unable to write EC_DIRECT_CONTROLS.\n", + chip_id, core_id, thread_id); + return OPAL_HARDWARE; + } + + for (i = 0; i < P10_QUIESCE_TIMEOUT / P10_QUIESCE_POLL_INTERVAL; i++) { + int rc = p10_thread_quiesced(cpu); + if (rc < 0) + break; + if (rc) + return 0; + + time_wait_us(P10_QUIESCE_POLL_INTERVAL); + } + + prlog(PR_ERR, "Could not stop thread %u:%u:%u:" + " Unable to quiesce thread.\n", + chip_id, core_id, thread_id); + + return OPAL_HARDWARE; +} + +static int p10_sreset_thread(struct cpu_thread *cpu) +{ + uint32_t chip_id = pir_to_chip_id(cpu->pir); + 
uint32_t core_id = pir_to_core_id(cpu->pir); + uint32_t thread_id = pir_to_thread_id(cpu->pir); + uint32_t dctl_addr; + + dctl_addr = XSCOM_ADDR_P10_EC(core_id, P10_EC_DIRECT_CONTROLS); + + if (xscom_write(chip_id, dctl_addr, P10_THREAD_SRESET(thread_id))) { + prlog(PR_ERR, "Could not sreset thread %u:%u:%u:" + " Unable to write EC_DIRECT_CONTROLS.\n", + chip_id, core_id, thread_id); + return OPAL_HARDWARE; + } + + return 0; +} + +/**************** generic direct controls ****************/ + +int dctl_set_special_wakeup(struct cpu_thread *t) +{ + struct cpu_thread *c = t->ec_primary; + int rc = OPAL_SUCCESS; + + if (proc_gen == proc_gen_unknown) + return OPAL_UNSUPPORTED; + + lock(&c->dctl_lock); + if (c->special_wakeup_count == 0) { + if (proc_gen == proc_gen_p10) + rc = p10_core_set_special_wakeup(c); + else if (proc_gen == proc_gen_p9) + rc = p9_core_set_special_wakeup(c); + else /* (proc_gen == proc_gen_p8) */ + rc = p8_core_set_special_wakeup(c); + } + if (!rc) + c->special_wakeup_count++; + unlock(&c->dctl_lock); + + return rc; +} + +int dctl_clear_special_wakeup(struct cpu_thread *t) +{ + struct cpu_thread *c = t->ec_primary; + int rc = OPAL_SUCCESS; + + if (proc_gen == proc_gen_unknown) + return OPAL_UNSUPPORTED; + + lock(&c->dctl_lock); + if (!c->special_wakeup_count) + goto out; + if (c->special_wakeup_count == 1) { + if (proc_gen == proc_gen_p10) + rc = p10_core_clear_special_wakeup(c); + else if (proc_gen == proc_gen_p9) + rc = p9_core_clear_special_wakeup(c); + else /* (proc_gen == proc_gen_p8) */ + rc = p8_core_clear_special_wakeup(c); + } + if (!rc) + c->special_wakeup_count--; +out: + unlock(&c->dctl_lock); + + return rc; +} + +int dctl_core_is_gated(struct cpu_thread *t) +{ + struct cpu_thread *c = t->primary; + + if (proc_gen == proc_gen_p10) + return p10_core_is_gated(c); + else if (proc_gen == proc_gen_p9) + return p9_core_is_gated(c); + else + return OPAL_UNSUPPORTED; +} + +static int dctl_stop(struct cpu_thread *t) +{ + struct cpu_thread *c = t->ec_primary; + int rc; + + lock(&c->dctl_lock); + if (t->dctl_stopped) { + unlock(&c->dctl_lock); + return OPAL_BUSY; + } + if (proc_gen == proc_gen_p10) + rc = p10_stop_thread(t); + else if (proc_gen == proc_gen_p9) + rc = p9_stop_thread(t); + else /* (proc_gen == proc_gen_p8) */ + rc = p8_stop_thread(t); + if (!rc) + t->dctl_stopped = true; + unlock(&c->dctl_lock); + + return rc; +} + +static int dctl_cont(struct cpu_thread *t) +{ + struct cpu_thread *c = t->primary; + int rc; + + if (proc_gen != proc_gen_p10 && proc_gen != proc_gen_p9) + return OPAL_UNSUPPORTED; + + lock(&c->dctl_lock); + if (!t->dctl_stopped) { + unlock(&c->dctl_lock); + return OPAL_BUSY; + } + if (proc_gen == proc_gen_p10) + rc = p10_cont_thread(t); + else /* (proc_gen == proc_gen_p9) */ + rc = p9_cont_thread(t); + if (!rc) + t->dctl_stopped = false; + unlock(&c->dctl_lock); + + return rc; +} + +/* + * NOTE: + * The POWER8 sreset does not provide SRR registers, so it can be used + * for fast reboot, but not OPAL_SIGNAL_SYSTEM_RESET or anywhere that is + * expected to return. For now, callers beware. 
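dctl_set_special_wakeup()/dctl_clear_special_wakeup() above implement a refcounted assert/release: the hardware is only touched on the 0->1 and 1->0 transitions of special_wakeup_count, under the per-core dctl_lock. A standalone sketch of that pattern with a pthread mutex in place of skiboot's lock; core_spwu_get()/core_spwu_put() and the hw_asserted field are invented for the example, and the hardware-error path is omitted.

#include <assert.h>
#include <pthread.h>

struct core {
	pthread_mutex_t lock;
	int spwu_count;
	int hw_asserted;	/* stands in for the real SPWU scom write */
};

static void core_spwu_get(struct core *c)
{
	pthread_mutex_lock(&c->lock);
	if (c->spwu_count == 0)
		c->hw_asserted = 1;	/* first user: really assert it */
	c->spwu_count++;
	pthread_mutex_unlock(&c->lock);
}

static void core_spwu_put(struct core *c)
{
	pthread_mutex_lock(&c->lock);
	assert(c->spwu_count > 0);
	if (c->spwu_count == 1)
		c->hw_asserted = 0;	/* last user: really release it */
	c->spwu_count--;
	pthread_mutex_unlock(&c->lock);
}

int main(void)
{
	struct core c = { PTHREAD_MUTEX_INITIALIZER, 0, 0 };

	core_spwu_get(&c);	/* asserts on "hardware" */
	core_spwu_get(&c);	/* nested user: count only */
	core_spwu_put(&c);	/* one user still left */
	assert(c.hw_asserted == 1);
	core_spwu_put(&c);	/* last user: released */
	assert(c.hw_asserted == 0 && c.spwu_count == 0);
	return 0;
}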
+ */ +static int dctl_sreset(struct cpu_thread *t) +{ + struct cpu_thread *c = t->ec_primary; + int rc; + + lock(&c->dctl_lock); + if (!t->dctl_stopped) { + unlock(&c->dctl_lock); + return OPAL_BUSY; + } + if (proc_gen == proc_gen_p10) + rc = p10_sreset_thread(t); + else if (proc_gen == proc_gen_p9) + rc = p9_sreset_thread(t); + else /* (proc_gen == proc_gen_p8) */ + rc = p8_sreset_thread(t); + if (!rc) + t->dctl_stopped = false; + unlock(&c->dctl_lock); + + return rc; +} + + +/**************** fast reboot API ****************/ + +int sreset_all_prepare(void) +{ + struct cpu_thread *cpu; + + if (proc_gen == proc_gen_unknown) + return OPAL_UNSUPPORTED; + + prlog(PR_DEBUG, "RESET: Resetting from cpu: 0x%x (core 0x%x)\n", + this_cpu()->pir, pir_to_core_id(this_cpu()->pir)); + + if (chip_quirk(QUIRK_MAMBO_CALLOUTS)) { + for_each_ungarded_cpu(cpu) { + if (cpu == this_cpu()) + continue; + mambo_stop_cpu(cpu); + } + return OPAL_SUCCESS; + } + + /* Assert special wakup on all cores. Only on operational cores. */ + for_each_ungarded_primary(cpu) { + if (dctl_set_special_wakeup(cpu) != OPAL_SUCCESS) + return OPAL_HARDWARE; + } + + prlog(PR_DEBUG, "RESET: Stopping the world...\n"); + + /* Put everybody in stop except myself */ + for_each_ungarded_cpu(cpu) { + if (cpu == this_cpu()) + continue; + if (dctl_stop(cpu) != OPAL_SUCCESS) + return OPAL_HARDWARE; + + } + + return OPAL_SUCCESS; +} + +void sreset_all_finish(void) +{ + struct cpu_thread *cpu; + + if (chip_quirk(QUIRK_MAMBO_CALLOUTS)) + return; + + for_each_ungarded_primary(cpu) + dctl_clear_special_wakeup(cpu); +} + +int sreset_all_others(void) +{ + struct cpu_thread *cpu; + + prlog(PR_DEBUG, "RESET: Resetting all threads but self...\n"); + + /* + * mambo should actually implement stop as well, and implement + * the dctl_ helpers properly. Currently it's racy just sresetting. + */ + if (chip_quirk(QUIRK_MAMBO_CALLOUTS)) { + for_each_ungarded_cpu(cpu) { + if (cpu == this_cpu()) + continue; + mambo_sreset_cpu(cpu); + } + return OPAL_SUCCESS; + } + + for_each_ungarded_cpu(cpu) { + if (cpu == this_cpu()) + continue; + if (dctl_sreset(cpu) != OPAL_SUCCESS) + return OPAL_HARDWARE; + } + + return OPAL_SUCCESS; +} + + +/**************** OPAL_SIGNAL_SYSTEM_RESET API ****************/ + +/* + * This provides a way for the host to raise system reset exceptions + * on other threads using direct control scoms on POWER9. + * + * We assert special wakeup on the core first. + * Then stop target thread and wait for it to quiesce. + * Then sreset the target thread, which resumes execution on that thread. + * Then de-assert special wakeup on the core. + */ +static int64_t do_sreset_cpu(struct cpu_thread *cpu) +{ + int rc; + + if (this_cpu() == cpu) { + prlog(PR_ERR, "SRESET: Unable to reset self\n"); + return OPAL_PARAMETER; + } + + rc = dctl_set_special_wakeup(cpu); + if (rc) + return rc; + + rc = dctl_stop(cpu); + if (rc) + goto out_spwk; + + rc = dctl_sreset(cpu); + if (rc) + goto out_cont; + + dctl_clear_special_wakeup(cpu); + + return 0; + +out_cont: + dctl_cont(cpu); +out_spwk: + dctl_clear_special_wakeup(cpu); + + return rc; +} + +static struct lock sreset_lock = LOCK_UNLOCKED; + +int64_t opal_signal_system_reset(int cpu_nr) +{ + struct cpu_thread *cpu; + int64_t ret; + + if (proc_gen != proc_gen_p9 && proc_gen != proc_gen_p10) + return OPAL_UNSUPPORTED; + + /* + * Broadcasts unsupported. Not clear what threads should be + * signaled, so it's better for the OS to perform one-at-a-time + * for now. 
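do_sreset_cpu() above applies its steps in a fixed order (special wakeup, stop, sreset) and unwinds the completed steps in reverse on failure. A standalone sketch of that goto-based unwinding with every hardware step replaced by a stub; all function names are invented, and sreset_thread() is made to fail so the unwind path runs.

#include <stdio.h>

static int set_special_wakeup(void)	{ return 0; }
static int stop_thread(void)		{ return 0; }
static int sreset_thread(void)		{ return -1; }	/* pretend failure */
static void cont_thread(void)		{ puts("unwind: resume thread"); }
static void clear_special_wakeup(void)	{ puts("unwind: drop wakeup"); }

static int sreset_cpu(void)
{
	int rc;

	rc = set_special_wakeup();
	if (rc)
		return rc;

	rc = stop_thread();
	if (rc)
		goto out_spwk;

	rc = sreset_thread();
	if (rc)
		goto out_cont;

	clear_special_wakeup();	/* success path also drops the wakeup */
	return 0;

out_cont:
	cont_thread();
out_spwk:
	clear_special_wakeup();
	return rc;
}

int main(void)
{
	printf("sreset_cpu() = %d\n", sreset_cpu());
	return 0;
}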
+ */ + if (cpu_nr < 0) + return OPAL_CONSTRAINED; + + /* Reset a single CPU */ + cpu = find_cpu_by_server(cpu_nr); + if (!cpu) { + prlog(PR_ERR, "SRESET: could not find cpu by server %d\n", cpu_nr); + return OPAL_PARAMETER; + } + + lock(&sreset_lock); + ret = do_sreset_cpu(cpu); + unlock(&sreset_lock); + + return ret; +} + +void direct_controls_init(void) +{ + if (chip_quirk(QUIRK_MAMBO_CALLOUTS)) + return; + + if (proc_gen != proc_gen_p9 && proc_gen != proc_gen_p10) + return; + + opal_register(OPAL_SIGNAL_SYSTEM_RESET, opal_signal_system_reset, 1); +} diff --git a/roms/skiboot/core/errorlog.c b/roms/skiboot/core/errorlog.c new file mode 100644 index 000000000..f64ac3f23 --- /dev/null +++ b/roms/skiboot/core/errorlog.c @@ -0,0 +1,223 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* This file contains the front end for OPAL error logging. It is used + * to construct a struct errorlog representing the event/error to be + * logged which is then passed to the platform specific backend to log + * the actual errors. + * + * Copyright 2013-2017 IBM Corp. + */ + +#include +#include +#include +#include + +/* + * Maximum number buffers that are pre-allocated + * to hold elogs that are reported on Sapphire and + * PowerNV. + */ +#define ELOG_WRITE_MAX_RECORD 64 +/* Platform log id as per the spec */ +static uint32_t sapphire_elog_id = 0xB0000000; + +/* Reserved for future use */ +/* static uint32_t powernv_elog_id = 0xB1000000; */ + +/* Pool to allocate elog messages from */ +static struct pool elog_pool; +static struct lock elog_lock = LOCK_UNLOCKED; + +static bool elog_available = false; + +static struct errorlog *get_write_buffer(int opal_event_severity) +{ + struct errorlog *buf; + + if (!elog_available) + return NULL; + + lock(&elog_lock); + if (opal_event_severity == OPAL_ERROR_PANIC) + buf = pool_get(&elog_pool, POOL_HIGH); + else + buf = pool_get(&elog_pool, POOL_NORMAL); + + unlock(&elog_lock); + return buf; +} + +/* Reporting of error via struct errorlog */ +struct errorlog *opal_elog_create(struct opal_err_info *e_info, uint32_t tag) +{ + struct errorlog *buf; + + buf = get_write_buffer(e_info->sev); + if (buf) { + buf->error_event_type = e_info->err_type; + buf->component_id = e_info->cmp_id; + buf->subsystem_id = e_info->subsystem; + buf->event_severity = e_info->sev; + buf->event_subtype = e_info->event_subtype; + buf->reason_code = e_info->reason_code; + buf->elog_origin = ORG_SAPPHIRE; + + lock(&elog_lock); + buf->plid = ++sapphire_elog_id; + unlock(&elog_lock); + + /* Initialise the first user dump section */ + log_add_section(buf, tag); + } + + return buf; +} + +/* Add a new user data section to an existing error log */ +void log_add_section(struct errorlog *buf, uint32_t tag) +{ + size_t size = sizeof(struct elog_user_data_section) - 1; + struct elog_user_data_section *tmp; + + if (!buf) { + prerror("ELOG: Cannot add user data section. " + "Buffer is invalid\n"); + return; + } + + if ((buf->user_section_size + size) > OPAL_LOG_MAX_DUMP) { + prerror("ELOG: Size of dump data overruns buffer\n"); + return; + } + + tmp = (struct elog_user_data_section *)(buf->user_data_dump + + buf->user_section_size); + /* Use DESC if no other tag provided */ + tmp->tag = tag ? 
cpu_to_be32(tag) : cpu_to_be32(OPAL_ELOG_SEC_DESC); + tmp->size = cpu_to_be16(size); + + buf->user_section_size += size; + buf->user_section_count++; +} + +void opal_elog_complete(struct errorlog *buf, bool success) +{ + if (!success) + printf("Unable to log error\n"); + + lock(&elog_lock); + pool_free_object(&elog_pool, buf); + unlock(&elog_lock); +} + +void log_commit(struct errorlog *elog) +{ + int rc; + + if (!elog) + return; + + if (platform.elog_commit) { + rc = platform.elog_commit(elog); + if (rc) + prerror("ELOG: Platform commit error %d\n", rc); + + return; + } + + opal_elog_complete(elog, false); +} + +void log_append_data(struct errorlog *buf, unsigned char *data, uint16_t size) +{ + struct elog_user_data_section *section; + uint8_t n_sections; + char *buffer; + uint16_t ssize; + + if (!buf) { + prerror("ELOG: Cannot update user data. Buffer is invalid\n"); + return; + } + + if ((buf->user_section_size + size) > OPAL_LOG_MAX_DUMP) { + prerror("ELOG: Size of dump data overruns buffer\n"); + return; + } + + /* Step through user sections to find latest dump section */ + buffer = buf->user_data_dump; + n_sections = buf->user_section_count; + if (!n_sections) { + prerror("ELOG: User section invalid\n"); + return; + } + + while (--n_sections) { + section = (struct elog_user_data_section *)buffer; + buffer += be16_to_cpu(section->size); + } + + section = (struct elog_user_data_section *)buffer; + ssize = be16_to_cpu(section->size); + buffer += ssize; + memcpy(buffer, data, size); + section->size = cpu_to_be16(ssize + size); + buf->user_section_size += size; +} + +void log_append_msg(struct errorlog *buf, const char *fmt, ...) +{ + char err_msg[250]; + va_list list; + + if (!buf) { + prerror("Tried to append log to NULL buffer\n"); + return; + } + + va_start(list, fmt); + vsnprintf(err_msg, sizeof(err_msg), fmt, list); + va_end(list); + + /* Log the error on to Sapphire console */ + prerror("%s", err_msg); + + log_append_data(buf, err_msg, strlen(err_msg)); +} + +uint32_t log_simple_error(struct opal_err_info *e_info, const char *fmt, ...) +{ + struct errorlog *buf; + va_list list; + char err_msg[250]; + + va_start(list, fmt); + vsnprintf(err_msg, sizeof(err_msg), fmt, list); + va_end(list); + + /* Log the error on to Sapphire console */ + prerror("%s", err_msg); + + buf = opal_elog_create(e_info, 0); + if (buf == NULL) { + prerror("ELOG: Error getting buffer to log error\n"); + return -1; + } + + log_append_data(buf, err_msg, strlen(err_msg)); + log_commit(buf); + + return buf->plid; +} + +int elog_init(void) +{ + /* Pre-allocate memory for records */ + if (pool_init(&elog_pool, sizeof(struct errorlog), + ELOG_WRITE_MAX_RECORD, 1)) + return OPAL_RESOURCE; + + elog_available = true; + return 0; +} diff --git a/roms/skiboot/core/exceptions.c b/roms/skiboot/core/exceptions.c new file mode 100644 index 000000000..389548d16 --- /dev/null +++ b/roms/skiboot/core/exceptions.c @@ -0,0 +1,233 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* + * Deal with exceptions when in OPAL. + * + * Copyright 2013-2014 IBM Corp. 
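log_add_section()/log_append_data() above maintain a packed buffer of variable-size user-data sections: each section is a small (tag, size) header followed by its payload, and appending means walking to the last section and growing it. A simplified standalone model of that layout, with host-endian sizes instead of the big-endian fields used above, invented struct and function names, and only a single assert for overflow.

#include <assert.h>
#include <stdint.h>
#include <string.h>

struct udsec {			/* models struct elog_user_data_section */
	uint32_t tag;
	uint16_t size;		/* header plus payload, in bytes */
};				/* payload bytes follow immediately */

#define DUMP_MAX 256

struct elog {
	unsigned int nsections;
	unsigned int used;
	char dump[DUMP_MAX];
};

static void add_section(struct elog *e, uint32_t tag)
{
	struct udsec *s = (struct udsec *)(e->dump + e->used);

	s->tag = tag;
	s->size = sizeof(*s);
	e->used += sizeof(*s);
	e->nsections++;
}

static void append_data(struct elog *e, const void *data, uint16_t len)
{
	char *p = e->dump;
	struct udsec *s;
	unsigned int i;

	assert(e->nsections && e->used + len <= DUMP_MAX);

	/* step over every section but the last one */
	for (i = 0; i + 1 < e->nsections; i++)
		p += ((struct udsec *)p)->size;

	s = (struct udsec *)p;
	memcpy(p + s->size, data, len);	/* append after existing payload */
	s->size += len;
	e->used += len;
}

int main(void)
{
	struct elog e = { 0, 0, { 0 } };

	add_section(&e, 0x1);		/* arbitrary tag */
	append_data(&e, "boom", 4);
	append_data(&e, "!", 1);
	assert(((struct udsec *)e.dump)->size == sizeof(struct udsec) + 5);
	return 0;
}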
+ */ + +#include +#include +#include +#include +#include +#include + +#define REG "%016llx" +#define REG32 "%08x" +#define REGS_PER_LINE 4 + +static void dump_regs(struct stack_frame *stack) +{ + unsigned int i; + + prerror("CFAR : "REG" MSR : "REG"\n", stack->cfar, stack->msr); + prerror("SRR0 : "REG" SRR1 : "REG"\n", stack->srr0, stack->srr1); + prerror("HSRR0: "REG" HSRR1: "REG"\n", stack->hsrr0, stack->hsrr1); + prerror("DSISR: "REG32" DAR : "REG"\n", stack->dsisr, stack->dar); + prerror("LR : "REG" CTR : "REG"\n", stack->lr, stack->ctr); + prerror("CR : "REG32" XER : "REG32"\n", stack->cr, stack->xer); + for (i = 0; i < 16; i++) + prerror("GPR%02d: "REG" GPR%02d: "REG"\n", + i, stack->gpr[i], i + 16, stack->gpr[i + 16]); +} + +#define EXCEPTION_MAX_STR 320 + +static void handle_mce(struct stack_frame *stack, uint64_t nip, uint64_t msr, bool *fatal) +{ + uint64_t mce_flags, mce_addr; + const char *mce_err; + const char *mce_fix = NULL; + char buf[EXCEPTION_MAX_STR]; + size_t l; + + decode_mce(stack->srr0, stack->srr1, stack->dsisr, stack->dar, + &mce_flags, &mce_err, &mce_addr); + + /* Try to recover. */ + if (mce_flags & MCE_ERAT_ERROR) { + /* Real-mode still uses ERAT, flush transient bitflips */ + flush_erat(); + mce_fix = "ERAT flush"; + + } else { + *fatal = true; + } + + prerror("***********************************************\n"); + l = 0; + l += snprintf(buf + l, EXCEPTION_MAX_STR - l, + "%s MCE at "REG" ", *fatal ? "Fatal" : "Non-fatal", nip); + l += snprintf_symbol(buf + l, EXCEPTION_MAX_STR - l, nip); + l += snprintf(buf + l, EXCEPTION_MAX_STR - l, " MSR "REG, msr); + prerror("%s\n", buf); + + l = 0; + l += snprintf(buf + l, EXCEPTION_MAX_STR - l, + "Cause: %s", mce_err); + prerror("%s\n", buf); + if (mce_flags & MCE_INVOLVED_EA) { + l = 0; + l += snprintf(buf + l, EXCEPTION_MAX_STR - l, + "Effective address: 0x%016llx", mce_addr); + prerror("%s\n", buf); + } + + if (!*fatal) { + l = 0; + l += snprintf(buf + l, EXCEPTION_MAX_STR - l, + "Attempting recovery: %s", mce_fix); + prerror("%s\n", buf); + } +} + +void exception_entry(struct stack_frame *stack) +{ + bool fatal = false; + bool hv; + uint64_t nip; + uint64_t msr; + char buf[EXCEPTION_MAX_STR]; + size_t l; + + switch (stack->type) { + case 0x500: + case 0x980: + case 0xe00: + case 0xe20: + case 0xe40: + case 0xe60: + case 0xe80: + case 0xea0: + case 0xf80: + hv = true; + break; + default: + hv = false; + break; + } + + if (hv) { + nip = stack->hsrr0; + msr = stack->hsrr1; + } else { + nip = stack->srr0; + msr = stack->srr1; + } + stack->msr = msr; + stack->pc = nip; + + if (!(msr & MSR_RI)) + fatal = true; + + l = 0; + switch (stack->type) { + case 0x100: + prerror("***********************************************\n"); + if (fatal) { + l += snprintf(buf + l, EXCEPTION_MAX_STR - l, + "Fatal System Reset at "REG" ", nip); + } else { + l += snprintf(buf + l, EXCEPTION_MAX_STR - l, + "System Reset at "REG" ", nip); + } + break; + + case 0x200: + handle_mce(stack, nip, msr, &fatal); + goto no_symbol; + + case 0x700: { + struct trap_table_entry *tte; + + fatal = true; + prerror("***********************************************\n"); + for (tte = __trap_table_start; tte < __trap_table_end; tte++) { + if (tte->address == nip) { + prerror("< %s >\n", tte->message); + prerror(" .\n"); + prerror(" .\n"); + prerror(" .\n"); + prerror(" OO__)\n"); + prerror(" <\"__/\n"); + prerror(" ^ ^\n"); + break; + } + } + l += snprintf(buf + l, EXCEPTION_MAX_STR - l, + "Fatal TRAP at "REG" ", nip); + l += snprintf_symbol(buf + l, EXCEPTION_MAX_STR 
- l, nip); + l += snprintf(buf + l, EXCEPTION_MAX_STR - l, " MSR "REG, msr); + prerror("%s\n", buf); + dump_regs(stack); + backtrace_r1((uint64_t)stack); + if (platform.terminate) + platform.terminate(buf); + for (;;) ; + break; } + + default: + fatal = true; + prerror("***********************************************\n"); + l += snprintf(buf + l, EXCEPTION_MAX_STR - l, + "Fatal Exception 0x%llx at "REG" ", stack->type, nip); + break; + } + l += snprintf_symbol(buf + l, EXCEPTION_MAX_STR - l, nip); + l += snprintf(buf + l, EXCEPTION_MAX_STR - l, " MSR "REG, msr); + prerror("%s\n", buf); +no_symbol: + dump_regs(stack); + backtrace_r1((uint64_t)stack); + if (fatal) { + if (platform.terminate) + platform.terminate(buf); + for (;;) ; + } + + if (hv) { + /* Set up for SRR return */ + stack->srr0 = nip; + stack->srr1 = msr; + } +} + +void exception_entry_pm_sreset(void) +{ + char buf[EXCEPTION_MAX_STR]; + size_t l; + + prerror("***********************************************\n"); + l = 0; + l += snprintf(buf + l, EXCEPTION_MAX_STR - l, + "System Reset in sleep"); + prerror("%s\n", buf); + backtrace(); +} + +void __noreturn exception_entry_pm_mce(void) +{ + char buf[EXCEPTION_MAX_STR]; + size_t l; + + prerror("***********************************************\n"); + l = 0; + l += snprintf(buf + l, EXCEPTION_MAX_STR - l, + "Fatal MCE in sleep"); + prerror("%s\n", buf); + prerror("SRR0 : "REG" SRR1 : "REG"\n", + (uint64_t)mfspr(SPR_SRR0), (uint64_t)mfspr(SPR_SRR1)); + prerror("DSISR: "REG32" DAR : "REG"\n", + (uint32_t)mfspr(SPR_DSISR), (uint64_t)mfspr(SPR_DAR)); + abort(); +} + +static int64_t opal_register_exc_handler(uint64_t opal_exception __unused, + uint64_t handler_address __unused, + uint64_t glue_cache_line __unused) +{ + /* This interface is deprecated */ + return OPAL_UNSUPPORTED; +} +opal_call(OPAL_REGISTER_OPAL_EXCEPTION_HANDLER, opal_register_exc_handler, 3); + diff --git a/roms/skiboot/core/fast-reboot.c b/roms/skiboot/core/fast-reboot.c new file mode 100644 index 000000000..9f92525a9 --- /dev/null +++ b/roms/skiboot/core/fast-reboot.c @@ -0,0 +1,467 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* + * Full IPL is slow, let's cheat! + * + * Copyright 2013-2019 IBM Corp. 
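The 0x700 (trap) case above looks the faulting address up in a table of (address, message) pairs that the linker collects between __trap_table_start and __trap_table_end. A standalone sketch of that lookup, with the linker-provided table replaced by a static array of made-up addresses and messages.

#include <stdint.h>
#include <stdio.h>

struct trap_entry {
	uint64_t address;	/* address of the trap instruction */
	const char *message;	/* human-readable reason */
};

static const struct trap_entry trap_table[] = {
	{ 0x30001234, "Unexpected lock state" },
	{ 0x30005678, "Corrupt free list" },
};

static const char *trap_message(uint64_t nip)
{
	size_t i;

	for (i = 0; i < sizeof(trap_table) / sizeof(trap_table[0]); i++)
		if (trap_table[i].address == nip)
			return trap_table[i].message;
	return "unknown trap";
}

int main(void)
{
	printf("< %s >\n", trap_message(0x30005678));
	printf("< %s >\n", trap_message(0x3000dead));
	return 0;
}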
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Flag tested by the OPAL entry code */ +static volatile bool fast_boot_release; +static volatile bool spr_set_release; +static volatile bool nmi_mce_release; + +static void wait_on(volatile bool *cond) +{ + sync(); + if (!*cond) { + smt_lowest(); + while (!*cond) + barrier(); + smt_medium(); + } + sync(); +} + +static bool cpu_state_wait_all_others(enum cpu_thread_state state, + unsigned long timeout_tb) +{ + struct cpu_thread *cpu; + unsigned long end = mftb() + timeout_tb; + + sync(); + for_each_ungarded_cpu(cpu) { + if (cpu == this_cpu()) + continue; + + if (cpu->state != state) { + smt_lowest(); + while (cpu->state != state) { + barrier(); + + if (timeout_tb && (tb_compare(mftb(), end) == TB_AAFTERB)) { + smt_medium(); + return false; + } + } + smt_medium(); + } + } + sync(); + + return true; +} + +static const char *fast_reboot_disabled = NULL; + +void disable_fast_reboot(const char *reason) +{ + if (fast_reboot_disabled) + return; + + prlog(PR_NOTICE, "RESET: Fast reboot disabled: %s\n", reason); + fast_reboot_disabled = reason; +} + +void add_fast_reboot_dt_entries(void) +{ + dt_check_del_prop(opal_node, "fast-reboot"); + + if (fast_reboot_disabled) { + dt_add_property_string(opal_node, "fast-reboot", fast_reboot_disabled); + } else { + dt_add_property_string(opal_node, "fast-reboot", "okay"); + } +} + +/* + * This is called by the reboot CPU after all other CPUs have been + * quiesced and stopped, to perform various sanity checks on firmware + * data (and potentially hardware), to determine whether the fast + * reboot should go ahead. + */ +static bool fast_reboot_sanity_check(void) +{ + if (!mem_check_all()) { + disable_fast_reboot("Inconsistent firmware data"); + return false; + } + + if (!verify_romem()) { + disable_fast_reboot("Inconsistent firmware romem checksum"); + return false; + } + + return true; +} + +void fast_reboot(void) +{ + static int fast_reboot_count = 0; + + if (chip_quirk(QUIRK_NO_DIRECT_CTL)) { + prlog(PR_DEBUG, + "RESET: Fast reboot disabled by quirk\n"); + return; + } + + /* + * Ensure all other CPUs have left OPAL calls. + */ + if (!opal_quiesce(QUIESCE_HOLD, -1)) { + disable_fast_reboot("OPAL quiesce timeout"); + return; + } + + if (fast_reboot_disabled && + nvram_query_eq_dangerous("force-fast-reset", "1")) { + /* Do fast reboot even if it's been disabled */ + prlog(PR_NOTICE, "RESET: Ignoring fast reboot disabled: %s\n", + fast_reboot_disabled); + } else if (fast_reboot_disabled) { + prlog(PR_NOTICE, "RESET: Fast reboot disabled: %s\n", + fast_reboot_disabled); + opal_quiesce(QUIESCE_RESUME, -1); + return; + } + + prlog(PR_NOTICE, "RESET: Initiating fast reboot %d...\n", ++fast_reboot_count); + fast_boot_release = false; + spr_set_release = false; + nmi_mce_release = false; + sync(); + + /* Put everybody in stop except myself */ + if (sreset_all_prepare()) { + prlog(PR_NOTICE, "RESET: Fast reboot failed to prepare " + "secondaries for system reset\n"); + opal_quiesce(QUIESCE_RESUME, -1); + return; + } + + if (!fast_reboot_sanity_check()) { + opal_quiesce(QUIESCE_RESUME, -1); + return; + } + + cpu_set_sreset_enable(false); + cpu_set_ipi_enable(false); + + /* + * The fast reboot sreset vector has FIXUP_ENDIAN, so secondaries can + * cope with a wrong HILE setting. 
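fast_boot_release and wait_on() above form a release-flag rendezvous: secondaries spin at low SMT priority on a volatile flag until the boot CPU flips it, with sync() barriers on both sides. A rough standalone model of the same shape using C11 atomics and POSIX threads in place of sync()/smt_lowest(); the names are invented for the example.

#include <pthread.h>
#include <sched.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_bool release_flag;

static void *secondary(void *arg)
{
	(void)arg;
	/* spin until the boot thread releases us (acquire pairs with release) */
	while (!atomic_load_explicit(&release_flag, memory_order_acquire))
		sched_yield();	/* stands in for smt_lowest()/barrier() */
	printf("secondary released\n");
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, secondary, NULL);

	/* ... the boot CPU finishes its own cleanup here ... */

	atomic_store_explicit(&release_flag, true, memory_order_release);
	pthread_join(t, NULL);
	return 0;
}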
+ */ + copy_sreset_vector_fast_reboot(); + + /* + * There is no point clearing special wakeup or un-quiesce due to + * failure after this point, because we will be going to full IPL. + * Less cleanup work means less opportunity to fail. + */ + + /* Send everyone else to 0x100 */ + if (sreset_all_others() != OPAL_SUCCESS) { + prlog(PR_NOTICE, "RESET: Fast reboot failed to system reset " + "secondaries\n"); + return; + } + + /* Ensure all the sresets get through */ + if (!cpu_state_wait_all_others(cpu_state_fast_reboot_entry, msecs_to_tb(1000))) { + prlog(PR_NOTICE, "RESET: Fast reboot timed out waiting for " + "secondaries to call in\n"); + return; + } + + prlog(PR_DEBUG, "RESET: Releasing special wakeups...\n"); + sreset_all_finish(); + + /* This resets our quiesce state ready to enter the new kernel. */ + opal_quiesce(QUIESCE_RESUME_FAST_REBOOT, -1); + + console_complete_flush(); + + mtmsrd(0, 1); /* Clear MSR[RI] for 0x100 reset */ + asm volatile("ba 0x100\n\t" : : : "memory"); + for (;;) + ; +} + +void __noreturn enter_nap(void); + +static void check_split_core(void) +{ + struct cpu_thread *cpu; + u64 mask, hid0; + + hid0 = mfspr(SPR_HID0); + mask = SPR_HID0_POWER8_4LPARMODE | SPR_HID0_POWER8_2LPARMODE; + + if ((hid0 & mask) == 0) + return; + + prlog(PR_INFO, "RESET: CPU 0x%04x is split !\n", this_cpu()->pir); + + /* If it's a secondary thread, just send it to nap */ + if (this_cpu()->pir & 7) { + /* Prepare to be woken up */ + icp_prep_for_pm(); + /* Setup LPCR to wakeup on external interrupts only */ + mtspr(SPR_LPCR, ((mfspr(SPR_LPCR) & ~SPR_LPCR_P8_PECE) | + SPR_LPCR_P8_PECE2)); + isync(); + /* Go to nap (doesn't return) */ + enter_nap(); + } + + prlog(PR_INFO, "RESET: Primary, unsplitting... \n"); + + /* Trigger unsplit operation and update SLW image */ + hid0 &= ~SPR_HID0_POWER8_DYNLPARDIS; + set_hid0(hid0); + opal_slw_set_reg(this_cpu()->pir, SPR_HID0, hid0); + + /* Wait for unsplit */ + while (mfspr(SPR_HID0) & mask) + cpu_relax(); + + /* Now the guys are sleeping, wake'em up. They will come back + * via reset and continue the fast reboot process normally. + * No need to wait. + */ + prlog(PR_INFO, "RESET: Waking unsplit secondaries... \n"); + + for_each_cpu(cpu) { + if (!cpu_is_sibling(cpu, this_cpu()) || (cpu == this_cpu())) + continue; + icp_kick_cpu(cpu); + } +} + +static void cleanup_cpu_state(void) +{ + struct cpu_thread *cpu = this_cpu(); + + if (proc_gen == proc_gen_p9) + xive_cpu_reset(); + else if (proc_gen == proc_gen_p10) + xive2_cpu_reset(); + + /* Per core cleanup */ + if (cpu_is_thread0(cpu) || cpu_is_core_chiplet_primary(cpu)) { + /* Shared SPRs whacked back to normal */ + + /* XXX Update the SLW copies ! Also dbl check HIDs etc... */ + init_shared_sprs(); + + if (proc_gen == proc_gen_p8) { + /* If somebody was in fast_sleep, we may have a + * workaround to undo + */ + if (cpu->in_fast_sleep) { + prlog(PR_DEBUG, "RESET: CPU 0x%04x in fast sleep" + " undoing workarounds...\n", cpu->pir); + fast_sleep_exit(); + } + + /* The TLB surely contains garbage. + * P9 clears TLBs in cpu_fast_reboot_complete + */ + cleanup_local_tlb(); + } + + /* And we might have lost TB sync */ + chiptod_wakeup_resync(); + } + + /* Per-thread additional cleanup */ + init_replicated_sprs(); + + // XXX Cleanup SLW, check HIDs ... +} + +/* Entry from asm after a fast reset */ +void __noreturn fast_reboot_entry(void); + +void __noreturn fast_reboot_entry(void) +{ + struct cpu_thread *cpu = this_cpu(); + + if (proc_gen == proc_gen_p8) { + /* We reset our ICP first ! 
Otherwise we might get stray + * interrupts when unsplitting + */ + reset_cpu_icp(); + + /* If we are split, we need to unsplit. Since that can send us + * to NAP, which will come back via reset, we do it now + */ + check_split_core(); + } + + /* Until SPRs (notably HID[HILE]) are set and new exception vectors + * installed, nobody should take machine checks. Try to do minimal + * work between these points. + */ + disable_machine_check(); + mtmsrd(0, 1); /* Clear RI */ + + sync(); + cpu->state = cpu_state_fast_reboot_entry; + sync(); + if (cpu == boot_cpu) { + cpu_state_wait_all_others(cpu_state_fast_reboot_entry, 0); + spr_set_release = true; + } else { + wait_on(&spr_set_release); + } + + + /* Reset SPRs */ + if (cpu_is_thread0(cpu)) + init_shared_sprs(); + init_replicated_sprs(); + + if (cpu == boot_cpu) { + /* Restore skiboot vectors */ + copy_exception_vectors(); + copy_sreset_vector(); + patch_traps(true); + } + + /* Must wait for others to because shared SPRs like HID0 are only set + * by thread0, so can't enable machine checks until those have been + * set. + */ + sync(); + cpu->state = cpu_state_present; + sync(); + if (cpu == boot_cpu) { + cpu_state_wait_all_others(cpu_state_present, 0); + nmi_mce_release = true; + } else { + wait_on(&nmi_mce_release); + } + + /* At this point skiboot exception vectors are in place and all + * cores/threads have SPRs set for running skiboot. + */ + enable_machine_check(); + mtmsrd(MSR_RI, 1); + + cleanup_cpu_state(); + + prlog(PR_DEBUG, "RESET: CPU 0x%04x reset in\n", cpu->pir); + + /* The original boot CPU (not the fast reboot initiator) takes + * command. Secondaries wait for the signal then go to their secondary + * entry point. + */ + if (cpu != boot_cpu) { + wait_on(&fast_boot_release); + + __secondary_cpu_entry(); + } + + if (proc_gen == proc_gen_p9) + xive_reset(); + else if (proc_gen == proc_gen_p10) + xive2_reset(); + + /* Let the CPU layer do some last minute global cleanups */ + cpu_fast_reboot_complete(); + + /* We can now do NAP mode */ + cpu_set_sreset_enable(true); + cpu_set_ipi_enable(true); + + prlog(PR_INFO, "RESET: Releasing secondaries...\n"); + + /* Release everybody */ + sync(); + fast_boot_release = true; + sync(); + cpu->state = cpu_state_active; + sync(); + + /* Wait for them to respond */ + cpu_state_wait_all_others(cpu_state_active, 0); + + sync(); + + prlog(PR_INFO, "RESET: All done, cleaning up...\n"); + + /* Clear release flag for next time */ + fast_boot_release = false; + + if (!chip_quirk(QUIRK_MAMBO_CALLOUTS)) { + /* + * mem_region_clear_unused avoids these preload regions + * so it can run along side image preloading. Clear these + * regions now to catch anything not overwritten by + * preload. + * + * Mambo may have embedded payload here, so don't clear + * it at all. + */ + memset(KERNEL_LOAD_BASE, 0, KERNEL_LOAD_SIZE); + memset(INITRAMFS_LOAD_BASE, 0, INITRAMFS_LOAD_SIZE); + } + + /* Start preloading kernel and ramdisk */ + start_preload_kernel(); + + /* Start clearing memory */ + start_mem_region_clear_unused(); + + if (platform.fast_reboot_init) + platform.fast_reboot_init(); + + if (proc_gen == proc_gen_p8) { + /* XXX */ + /* Reset/EOI the PSI interrupt */ + psi_irq_reset(); + } + + /* update pci nvram settings */ + pci_nvram_init(); + + /* Remove all PCI devices */ + if (pci_reset()) { + prlog(PR_NOTICE, "RESET: Fast reboot failed to reset PCI\n"); + + /* + * Can't return to caller here because we're past no-return. + * Attempt an IPL here which is what the caller would do. 
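+ * (platform.cec_reboot(), when implemented, asks the service processor
+ * for a full reboot/IPL; if it is absent or returns, all we can do is
+ * spin.)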
+ */ + if (platform.cec_reboot) + platform.cec_reboot(); + for (;;) + ; + } + + ipmi_set_fw_progress_sensor(IPMI_FW_PCI_INIT); + + wait_mem_region_clear_unused(); + + /* Load and boot payload */ + load_and_boot_kernel(true); +} diff --git a/roms/skiboot/core/fdt.c b/roms/skiboot/core/fdt.c new file mode 100644 index 000000000..463dc6912 --- /dev/null +++ b/roms/skiboot/core/fdt.c @@ -0,0 +1,258 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* + * Produce and consume flattened device trees + * + * Copyright 2013-2019 IBM Corp. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int fdt_error; + +#undef DEBUG_FDT +#ifdef DEBUG_FDT +#define FDT_DBG(fmt, a...) prlog(PR_DEBUG, "FDT: " fmt, ##a) +#else +#define FDT_DBG(fmt, a...) +#endif + +static void __save_err(int err, const char *str) +{ + FDT_DBG("rc: %d from \"%s\"\n", err, str); + if (err && !fdt_error) { + prerror("FDT: Error %d from \"%s\"\n", err, str); + fdt_error = err; + } +} + +#define save_err(...) __save_err(__VA_ARGS__, #__VA_ARGS__) + +static void dt_property_cell(void *fdt, const char *name, u32 cell) +{ + save_err(fdt_property_cell(fdt, name, cell)); +} + +static void dt_begin_node(void *fdt, const struct dt_node *dn) +{ + save_err(fdt_begin_node(fdt, dn->name)); + + dt_property_cell(fdt, "phandle", dn->phandle); +} + +static void dt_property(void *fdt, const struct dt_property *p) +{ + save_err(fdt_property(fdt, p->name, p->prop, p->len)); +} + +static void dt_end_node(void *fdt) +{ + save_err(fdt_end_node(fdt)); +} + +#ifdef DEBUG_FDT +static void dump_fdt(void *fdt) +{ + int i, off, depth, err; + + prlog(PR_INFO, "Device tree %u@%p\n", fdt_totalsize(fdt), fdt); + err = fdt_check_header(fdt); + if (err) { + prerror("fdt_check_header: %s\n", fdt_strerror(err)); + return; + } + prlog(PR_INFO, "fdt_check_header passed\n"); + + prlog(PR_INFO, "fdt_num_mem_rsv = %u\n", fdt_num_mem_rsv(fdt)); + for (i = 0; i < fdt_num_mem_rsv(fdt); i++) { + u64 addr, size; + + err = fdt_get_mem_rsv(fdt, i, &addr, &size); + if (err) { + prlog(PR_INFO, " ERR %s\n", fdt_strerror(err)); + return; + } + prlog(PR_INFO, " mem_rsv[%i] = %lu@%#lx\n", + i, (long)addr, (long)size); + } + + for (off = fdt_next_node(fdt, 0, &depth); + off > 0; + off = fdt_next_node(fdt, off, &depth)) { + int len; + const char *name; + + name = fdt_get_name(fdt, off, &len); + if (!name) { + prerror("fdt: offset %i no name!\n", off); + return; + } + prlog(PR_INFO, "name: %s [%u]\n", name, off); + } +} +#endif + +static void flatten_dt_properties(void *fdt, const struct dt_node *dn) +{ + const struct dt_property *p; + + list_for_each(&dn->properties, p, list) { + if (strstarts(p->name, DT_PRIVATE)) + continue; + + FDT_DBG(" prop: %s size: %ld\n", p->name, p->len); + dt_property(fdt, p); + } +} + +static void flatten_dt_node(void *fdt, const struct dt_node *root, + bool exclusive) +{ + const struct dt_node *i; + + if (!exclusive) { + FDT_DBG("node: %s\n", root->name); + dt_begin_node(fdt, root); + flatten_dt_properties(fdt, root); + } + + list_for_each(&root->children, i, list) + flatten_dt_node(fdt, i, false); + + if (!exclusive) + dt_end_node(fdt); +} + +static void create_dtb_reservemap(void *fdt, const struct dt_node *root) +{ + uint64_t base, size; + const __be64 *ranges; + const struct dt_property *prop; + int i; + + /* Duplicate the reserved-ranges property into the fdt reservemap */ + prop = dt_find_property(root, "reserved-ranges"); + if (prop) { + ranges = (const void 
*)prop->prop; + + for (i = 0; i < prop->len / (sizeof(uint64_t) * 2); i++) { + base = be64_to_cpu(*(ranges++)); + size = be64_to_cpu(*(ranges++)); + save_err(fdt_add_reservemap_entry(fdt, base, size)); + } + } + + save_err(fdt_finish_reservemap(fdt)); +} + +static int __create_dtb(void *fdt, size_t len, + const struct dt_node *root, + bool exclusive) +{ + if (chip_quirk(QUIRK_SLOW_SIM)) + save_err(fdt_create_with_flags(fdt, len, FDT_CREATE_FLAG_NO_NAME_DEDUP)); + else + save_err(fdt_create_with_flags(fdt, len, 0)); + if (fdt_error) + goto err; + + if (root == dt_root && !exclusive) + create_dtb_reservemap(fdt, root); + else + save_err(fdt_finish_reservemap(fdt)); + + flatten_dt_node(fdt, root, exclusive); + + save_err(fdt_finish(fdt)); + if (fdt_error) { +err: + prerror("dtb: error %s\n", fdt_strerror(fdt_error)); + return fdt_error; + } + +#ifdef DEBUG_FDT + dump_fdt(fdt); +#endif + return 0; +} + +void *create_dtb(const struct dt_node *root, bool exclusive) +{ + void *fdt = NULL; + size_t len = DEVICE_TREE_MAX_SIZE; + uint32_t old_last_phandle = get_last_phandle(); + int ret; + + do { + set_last_phandle(old_last_phandle); + fdt_error = 0; + fdt = malloc(len); + if (!fdt) { + prerror("dtb: could not malloc %lu\n", (long)len); + return NULL; + } + + ret = __create_dtb(fdt, len, root, exclusive); + if (ret) { + free(fdt); + fdt = NULL; + } + + len *= 2; + } while (ret == -FDT_ERR_NOSPACE); + + return fdt; +} + +static int64_t opal_get_device_tree(uint32_t phandle, + uint64_t buf, uint64_t len) +{ + struct dt_node *root; + void *fdt = (void *)buf; + uint32_t old_last_phandle; + int64_t totalsize; + int ret; + + if (!opal_addr_valid(fdt)) + return OPAL_PARAMETER; + + root = dt_find_by_phandle(dt_root, phandle); + if (!root) + return OPAL_PARAMETER; + + if (!fdt) { + fdt = create_dtb(root, true); + if (!fdt) + return OPAL_INTERNAL_ERROR; + totalsize = fdt_totalsize(fdt); + free(fdt); + return totalsize; + } + + if (!len) + return OPAL_PARAMETER; + + fdt_error = 0; + old_last_phandle = get_last_phandle(); + ret = __create_dtb(fdt, len, root, true); + if (ret) { + set_last_phandle(old_last_phandle); + if (ret == -FDT_ERR_NOSPACE) + return OPAL_NO_MEM; + + return OPAL_EMPTY; + } + + return OPAL_SUCCESS; +} +opal_call(OPAL_GET_DEVICE_TREE, opal_get_device_tree, 3); diff --git a/roms/skiboot/core/flash-firmware-versions.c b/roms/skiboot/core/flash-firmware-versions.c new file mode 100644 index 000000000..975ac6aff --- /dev/null +++ b/roms/skiboot/core/flash-firmware-versions.c @@ -0,0 +1,164 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* + * Parse VERSION partition, add to device tree + * + * Copyright 2013-2018 IBM Corp. 
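+ *
+ * The parsed result ends up as an ibm,firmware-versions node under the
+ * device tree root, one string property per component, e.g. (illustrative
+ * values only):
+ *
+ *     ibm,firmware-versions {
+ *         version = "firestone-v1.8";
+ *         skiboot = "5.4.0";
+ *         linux = "4.4.6-openpower1-8420e0f";
+ *     };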
+ */ + +#include +#include +#include +#include +#include + +/* ibm,firmware-versions support */ +static char *version_buf; +static size_t version_buf_size = 0x2000; + +static void __flash_dt_add_fw_version(struct dt_node *fw_version, char* data) +{ + static bool first = true; + char *prop; + int version_len, i; + int len = strlen(data); + const char *skiboot_version; + const char * version_str[] = {"open-power", "buildroot", "skiboot", + "hostboot-binaries", "hostboot", "linux", + "petitboot", "occ", "capp-ucode", "sbe", + "machine-xml", "hcode"}; + + if (first) { + first = false; + + /* Increment past "key-" */ + if (memcmp(data, "open-power", strlen("open-power")) == 0) + prop = data + strlen("open-power"); + else + prop = strchr(data, '-'); + if (!prop) { + prlog(PR_DEBUG, + "FLASH: Invalid fw version format (%s)\n", data); + return; + } + prop++; + + dt_add_property_string(fw_version, "version", prop); + return; + } + + /* + * PNOR version strings are not easily consumable. Split them into + * property, value. + * + * Example input from PNOR : + * "open-power-firestone-v1.8" + * "linux-4.4.6-openpower1-8420e0f" + * + * Desired output in device tree: + * open-power = "firestone-v1.8"; + * linux = "4.4.6-openpower1-8420e0f"; + */ + for(i = 0; i < ARRAY_SIZE(version_str); i++) + { + version_len = strlen(version_str[i]); + if (len < version_len) + continue; + + if (memcmp(data, version_str[i], version_len) != 0) + continue; + + /* Found a match, add property */ + if (dt_find_property(fw_version, version_str[i])) + continue; + + /* Increment past "key-" */ + prop = data + version_len + 1; + dt_add_property_string(fw_version, version_str[i], prop); + + /* Sanity check against what Skiboot thinks its version is. */ + if (strncmp(version_str[i], "skiboot", + strlen("skiboot")) == 0) { + /* + * If Skiboot was built with Buildroot its version may + * include a 'skiboot-' prefix; ignore it. + */ + if (strncmp(version, "skiboot-", + strlen("skiboot-")) == 0) + skiboot_version = version + strlen("skiboot-"); + else + skiboot_version = version; + if (strncmp(prop, skiboot_version, + strlen(skiboot_version)) != 0) + prlog(PR_WARNING, "WARNING! 
Skiboot version does not match VERSION partition!\n"); + } + } +} + +void flash_dt_add_fw_version(void) +{ + uint8_t version_data[80]; + int rc; + int numbytes = 0, i = 0; + struct dt_node *fw_version; + + if (version_buf == NULL) + return; + + rc = wait_for_resource_loaded(RESOURCE_ID_VERSION, RESOURCE_SUBID_NONE); + if (rc != OPAL_SUCCESS) { + prlog(PR_WARNING, "FLASH: Failed to load VERSION data\n"); + free(version_buf); + return; + } + + fw_version = dt_new(dt_root, "ibm,firmware-versions"); + assert(fw_version); + + if (stb_is_container(version_buf, version_buf_size)) + numbytes += SECURE_BOOT_HEADERS_SIZE; + for ( ; (numbytes < version_buf_size) && version_buf[numbytes]; numbytes++) { + if (version_buf[numbytes] == '\n') { + version_data[i] = '\0'; + __flash_dt_add_fw_version(fw_version, version_data); + memset(version_data, 0, sizeof(version_data)); + i = 0; + continue; + } else if (version_buf[numbytes] == '\t') { + continue; /* skip tabs */ + } + + version_data[i++] = version_buf[numbytes]; + if (i == sizeof(version_data)) { + prlog(PR_WARNING, "VERSION item >%lu chars, skipping\n", + sizeof(version_data)); + break; + } + } + + free(version_buf); +} + +void flash_fw_version_preload(void) +{ + int rc; + + if (proc_gen < proc_gen_p9) + return; + + prlog(PR_INFO, "FLASH: Loading VERSION section\n"); + + version_buf = malloc(version_buf_size); + if (!version_buf) { + prlog(PR_WARNING, "FLASH: Failed to allocate memory\n"); + return; + } + + rc = start_preload_resource(RESOURCE_ID_VERSION, RESOURCE_SUBID_NONE, + version_buf, &version_buf_size); + if (rc != OPAL_SUCCESS) { + prlog(PR_WARNING, + "FLASH: Failed to start loading VERSION data\n"); + free(version_buf); + version_buf = NULL; + } +} diff --git a/roms/skiboot/core/flash-subpartition.c b/roms/skiboot/core/flash-subpartition.c new file mode 100644 index 000000000..6e0fec6c3 --- /dev/null +++ b/roms/skiboot/core/flash-subpartition.c @@ -0,0 +1,110 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* + * Parse flash sub-partitions + * + * Copyright 2013-2018 IBM Corp. + */ + +#include +#include + +struct flash_hostboot_toc { + be32 ec; + be32 offset; /* From start of header. 
4K aligned */ + be32 size; +}; +#define FLASH_HOSTBOOT_TOC_MAX_ENTRIES ((FLASH_SUBPART_HEADER_SIZE - 8) \ + /sizeof(struct flash_hostboot_toc)) + +struct flash_hostboot_header { + char eyecatcher[4]; + be32 version; + struct flash_hostboot_toc toc[FLASH_HOSTBOOT_TOC_MAX_ENTRIES]; +}; + +int flash_subpart_info(void *part_header, uint32_t header_len, + uint32_t part_size, uint32_t *part_actualp, + uint32_t subid, uint32_t *offset, uint32_t *size) +{ + struct flash_hostboot_header *header; + char eyecatcher[5]; + uint32_t i, ec, o, s; + uint32_t part_actual; + bool subpart_found; + + if (!part_header || ( !offset && !size && !part_actualp)) { + prlog(PR_ERR, "FLASH: invalid parameters: ph %p of %p sz %p " + "tsz %p\n", part_header, offset, size, part_actualp); + return OPAL_PARAMETER; + } + + if (header_len < FLASH_SUBPART_HEADER_SIZE) { + prlog(PR_ERR, "FLASH: subpartition header too small 0x%x\n", + header_len); + return OPAL_PARAMETER; + } + + header = (struct flash_hostboot_header*) part_header; + + /* Perform sanity */ + i = be32_to_cpu(header->version); + if (i != 1) { + prerror("FLASH: flash subpartition TOC version unknown %i\n", i); + return OPAL_RESOURCE; + } + + /* NULL terminate eyecatcher */ + strncpy(eyecatcher, header->eyecatcher, 4); + eyecatcher[4] = '\0'; + prlog(PR_DEBUG, "FLASH: flash subpartition eyecatcher %s\n", + eyecatcher); + + subpart_found = false; + part_actual = 0; + for (i = 0; i < FLASH_HOSTBOOT_TOC_MAX_ENTRIES; i++) { + + ec = be32_to_cpu(header->toc[i].ec); + o = be32_to_cpu(header->toc[i].offset); + s = be32_to_cpu(header->toc[i].size); + + /* Check for null terminating entry */ + if (!ec && !o && !s) + break; + + /* Sanity check the offset and size. */ + if (o + s > part_size) { + prerror("FLASH: flash subpartition too big: %i\n", i); + return OPAL_RESOURCE; + } + if (!s) { + prerror("FLASH: flash subpartition zero size: %i\n", i); + return OPAL_RESOURCE; + } + if (o < FLASH_SUBPART_HEADER_SIZE) { + prerror("FLASH: flash subpartition offset too small: " + "%i\n", i); + return OPAL_RESOURCE; + } + /* + * Subpartitions content are different, but multiple toc entries + * may point to the same subpartition. + */ + if (ALIGN_UP(o + s, FLASH_SUBPART_HEADER_SIZE) > part_actual) + part_actual = ALIGN_UP(o + s, FLASH_SUBPART_HEADER_SIZE); + + if (ec == subid) { + if (offset) + *offset += o; + if (size) + *size = s; + subpart_found = true; + } + } + if (!subpart_found && (offset || size)) { + prerror("FLASH: flash subpartition not found.\n"); + return OPAL_RESOURCE; + } + if (part_actualp) + *part_actualp = part_actual; + return OPAL_SUCCESS; +} diff --git a/roms/skiboot/core/flash.c b/roms/skiboot/core/flash.c new file mode 100644 index 000000000..8c1e788c4 --- /dev/null +++ b/roms/skiboot/core/flash.c @@ -0,0 +1,1186 @@ +// SPDX-License-Identifier: Apache-2.0 +/* + * Init, manage, read, write, and load resources from flash + * + * Copyright 2013-2019 IBM Corp. 
+ * Copyright 2018-2019 Raptor Engineering, LLC + */ + +#define pr_fmt(fmt) "FLASH: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct flash { + struct list_node list; + bool busy; + bool no_erase; + struct blocklevel_device *bl; + uint64_t size; + uint32_t block_size; + int id; +}; + +static struct { + enum resource_id id; + uint32_t subid; + char name[PART_NAME_MAX+1]; +} part_name_map[] = { + { RESOURCE_ID_KERNEL, RESOURCE_SUBID_NONE, "BOOTKERNEL" }, + { RESOURCE_ID_INITRAMFS,RESOURCE_SUBID_NONE, "ROOTFS" }, + { RESOURCE_ID_CAPP, RESOURCE_SUBID_SUPPORTED, "CAPP" }, + { RESOURCE_ID_IMA_CATALOG, RESOURCE_SUBID_SUPPORTED, "IMA_CATALOG" }, + { RESOURCE_ID_VERSION, RESOURCE_SUBID_NONE, "VERSION" }, + { RESOURCE_ID_KERNEL_FW, RESOURCE_SUBID_NONE, "BOOTKERNFW" }, +}; + +static LIST_HEAD(flashes); +static struct flash *system_flash; + +/* Using a single lock as we only have one flash at present. */ +static struct lock flash_lock; + +/* nvram-on-flash support */ +static struct flash *nvram_flash; +static u32 nvram_offset, nvram_size; + +/* secboot-on-flash support */ +static struct flash *secboot_flash; +static u32 secboot_offset, secboot_size; + +bool flash_reserve(void) +{ + bool rc = false; + + if (!try_lock(&flash_lock)) + return false; + + if (!system_flash->busy) { + system_flash->busy = true; + rc = true; + } + unlock(&flash_lock); + + return rc; +} + +void flash_release(void) +{ + lock(&flash_lock); + system_flash->busy = false; + unlock(&flash_lock); +} + +bool flash_unregister(void) +{ + struct blocklevel_device *bl = system_flash->bl; + + if (bl->exit) + return bl->exit(bl); + + prlog(PR_NOTICE, "Unregister flash device is not supported\n"); + return true; +} + +int flash_secboot_info(uint32_t *total_size) +{ + int rc; + + lock(&flash_lock); + if (!secboot_flash) { + rc = OPAL_HARDWARE; + } else if (secboot_flash->busy) { + rc = OPAL_BUSY; + } else { + *total_size = secboot_size; + rc = OPAL_SUCCESS; + } + unlock(&flash_lock); + + return rc; +} + +int flash_secboot_read(void *dst, uint32_t src, uint32_t len) +{ + int rc; + + if (!try_lock(&flash_lock)) + return OPAL_BUSY; + + if (!secboot_flash) { + rc = OPAL_HARDWARE; + goto out; + } + + if (secboot_flash->busy) { + rc = OPAL_BUSY; + goto out; + } + + if ((src + len) > secboot_size) { + prerror("FLASH_SECBOOT: read out of bound (0x%x,0x%x)\n", + src, len); + rc = OPAL_PARAMETER; + goto out; + } + + secboot_flash->busy = true; + unlock(&flash_lock); + + rc = blocklevel_read(secboot_flash->bl, secboot_offset + src, dst, len); + + lock(&flash_lock); + secboot_flash->busy = false; +out: + unlock(&flash_lock); + return rc; +} + +int flash_secboot_write(uint32_t dst, void *src, uint32_t len) +{ + int rc; + + if (!try_lock(&flash_lock)) + return OPAL_BUSY; + + if (secboot_flash->busy) { + rc = OPAL_BUSY; + goto out; + } + + if ((dst + len) > secboot_size) { + prerror("FLASH_SECBOOT: write out of bound (0x%x,0x%x)\n", + dst, len); + rc = OPAL_PARAMETER; + goto out; + } + + secboot_flash->busy = true; + unlock(&flash_lock); + + rc = blocklevel_write(secboot_flash->bl, secboot_offset + dst, src, len); + + lock(&flash_lock); + secboot_flash->busy = false; +out: + unlock(&flash_lock); + return rc; +} + +static int flash_nvram_info(uint32_t *total_size) +{ + int rc; + + lock(&flash_lock); + if (!nvram_flash) { + rc = OPAL_HARDWARE; + } else if (nvram_flash->busy) { + rc = OPAL_BUSY; + } else { + *total_size = 
nvram_size; + rc = OPAL_SUCCESS; + } + unlock(&flash_lock); + + return rc; +} + +static int flash_nvram_start_read(void *dst, uint32_t src, uint32_t len) +{ + int rc; + + if (!try_lock(&flash_lock)) + return OPAL_BUSY; + + if (!nvram_flash) { + rc = OPAL_HARDWARE; + goto out; + } + + if (nvram_flash->busy) { + rc = OPAL_BUSY; + goto out; + } + + if ((src + len) > nvram_size) { + prerror("NVRAM: read out of bound (0x%x,0x%x)\n", + src, len); + rc = OPAL_PARAMETER; + goto out; + } + + nvram_flash->busy = true; + unlock(&flash_lock); + + rc = blocklevel_read(nvram_flash->bl, nvram_offset + src, dst, len); + + lock(&flash_lock); + nvram_flash->busy = false; +out: + unlock(&flash_lock); + if (!rc) + nvram_read_complete(true); + return rc; +} + +static int flash_nvram_write(uint32_t dst, void *src, uint32_t len) +{ + int rc; + + if (!try_lock(&flash_lock)) + return OPAL_BUSY; + + if (nvram_flash->busy) { + rc = OPAL_BUSY; + goto out; + } + + /* TODO: When we have async jobs for PRD, turn this into one */ + + if ((dst + len) > nvram_size) { + prerror("NVRAM: write out of bound (0x%x,0x%x)\n", + dst, len); + rc = OPAL_PARAMETER; + goto out; + } + + nvram_flash->busy = true; + unlock(&flash_lock); + + rc = blocklevel_write(nvram_flash->bl, nvram_offset + dst, src, len); + + lock(&flash_lock); + nvram_flash->busy = false; +out: + unlock(&flash_lock); + return rc; +} + + +static int flash_secboot_probe(struct flash *flash, struct ffs_handle *ffs) +{ + uint32_t start, size, part; + bool ecc; + int rc; + + prlog(PR_DEBUG, "FLASH: probing for SECBOOT\n"); + + rc = ffs_lookup_part(ffs, "SECBOOT", &part); + if (rc) { + prlog(PR_WARNING, "FLASH: no SECBOOT partition found\n"); + return OPAL_HARDWARE; + } + + rc = ffs_part_info(ffs, part, NULL, + &start, &size, NULL, &ecc); + if (rc) { + /** + * @fwts-label SECBOOTNoPartition + * @fwts-advice OPAL could not find an SECBOOT partition + * on the system flash. Check that the system flash + * has a valid partition table, and that the firmware + * build process has added a SECBOOT partition. + */ + prlog(PR_ERR, "FLASH: Can't parse ffs info for SECBOOT\n"); + return OPAL_HARDWARE; + } + + secboot_flash = flash; + secboot_offset = start; + secboot_size = ecc ? ecc_buffer_size_minus_ecc(size) : size; + + return 0; +} + +static int flash_nvram_probe(struct flash *flash, struct ffs_handle *ffs) +{ + uint32_t start, size, part; + bool ecc; + int rc; + + prlog(PR_INFO, "probing for NVRAM\n"); + + rc = ffs_lookup_part(ffs, "NVRAM", &part); + if (rc) { + prlog(PR_WARNING, "no NVRAM partition found\n"); + return OPAL_HARDWARE; + } + + rc = ffs_part_info(ffs, part, NULL, + &start, &size, NULL, &ecc); + if (rc) { + /** + * @fwts-label NVRAMNoPartition + * @fwts-advice OPAL could not find an NVRAM partition + * on the system flash. Check that the system flash + * has a valid partition table, and that the firmware + * build process has added a NVRAM partition. + */ + prlog(PR_ERR, "Can't parse ffs info for NVRAM\n"); + return OPAL_HARDWARE; + } + + nvram_flash = flash; + nvram_offset = start; + nvram_size = ecc ? 
ecc_buffer_size_minus_ecc(size) : size; + + platform.nvram_info = flash_nvram_info; + platform.nvram_start_read = flash_nvram_start_read; + platform.nvram_write = flash_nvram_write; + + return 0; +} + +/* core flash support */ + +static struct dt_node *flash_add_dt_node(struct flash *flash, int id) +{ + int i; + int rc; + const char *name; + bool ecc; + struct ffs_handle *ffs; + int ffs_part_num, ffs_part_start, ffs_part_size; + struct dt_node *flash_node; + struct dt_node *partition_container_node; + struct dt_node *partition_node; + + flash_node = dt_new_addr(opal_node, "flash", id); + dt_add_property_strings(flash_node, "compatible", "ibm,opal-flash"); + dt_add_property_cells(flash_node, "ibm,opal-id", id); + dt_add_property_u64(flash_node, "reg", flash->size); + dt_add_property_cells(flash_node, "ibm,flash-block-size", + flash->block_size); + if (flash->no_erase) + dt_add_property(flash_node, "no-erase", NULL, 0); + + /* we fix to 32-bits */ + dt_add_property_cells(flash_node, "#address-cells", 1); + dt_add_property_cells(flash_node, "#size-cells", 1); + + /* Add partition container node */ + partition_container_node = dt_new(flash_node, "partitions"); + dt_add_property_strings(partition_container_node, "compatible", "fixed-partitions"); + + /* we fix to 32-bits */ + dt_add_property_cells(partition_container_node, "#address-cells", 1); + dt_add_property_cells(partition_container_node, "#size-cells", 1); + + /* Add partitions */ + for (i = 0, name = NULL; i < ARRAY_SIZE(part_name_map); i++) { + name = part_name_map[i].name; + + rc = ffs_init(0, flash->size, flash->bl, &ffs, 1); + if (rc) { + prerror("Can't open ffs handle\n"); + continue; + } + + rc = ffs_lookup_part(ffs, name, &ffs_part_num); + if (rc) { + /* This is not an error per-se, some partitions + * are purposefully absent, don't spam the logs + */ + prlog(PR_DEBUG, "No %s partition\n", name); + continue; + } + rc = ffs_part_info(ffs, ffs_part_num, NULL, + &ffs_part_start, NULL, &ffs_part_size, &ecc); + if (rc) { + prerror("Failed to get %s partition info\n", name); + continue; + } + + partition_node = dt_new_addr(partition_container_node, "partition", ffs_part_start); + dt_add_property_strings(partition_node, "label", name); + dt_add_property_cells(partition_node, "reg", ffs_part_start, ffs_part_size); + if (part_name_map[i].id != RESOURCE_ID_KERNEL_FW) { + /* Mark all partitions other than the full PNOR and the boot kernel + * firmware as read only. These two partitions are the only partitions + * that are properly erase block aligned at this time. + */ + dt_add_property(partition_node, "read-only", NULL, 0); + } + } + + partition_node = dt_new_addr(partition_container_node, "partition", 0); + dt_add_property_strings(partition_node, "label", "PNOR"); + dt_add_property_cells(partition_node, "reg", 0, flash->size); + + return flash_node; +} + +static void setup_system_flash(struct flash *flash, struct dt_node *node, + const char *name, struct ffs_handle *ffs) +{ + char *path; + + if (!ffs) + return; + + if (system_flash) { + /** + * @fwts-label SystemFlashMultiple + * @fwts-advice OPAL Found multiple system flash. + * Since we've already found a system flash we are + * going to use that one but this ordering is not + * guaranteed so may change in future. 
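+ *
+ * Note that only the first flash registered via flash_register() becomes
+ * the "system" flash used for NVRAM, SECBOOT and resource loading; any
+ * additional flash is still exposed through the OPAL_FLASH_* calls, it
+ * is just not used for those roles.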
+ */ + prlog(PR_WARNING, "Attempted to register multiple system " + "flash: %s\n", name); + return; + } + + prlog(PR_NOTICE, "Found system flash: %s id:%i\n", + name, flash->id); + + system_flash = flash; + path = dt_get_path(node); + dt_add_property_string(dt_chosen, "ibm,system-flash", path); + free(path); + + prlog(PR_INFO, "registered system flash device %s\n", name); + + flash_nvram_probe(flash, ffs); + flash_secboot_probe(flash, ffs); +} + +static int num_flashes(void) +{ + struct flash *flash; + int i = 0; + + list_for_each(&flashes, flash, list) + i++; + + return i; +} + +int flash_register(struct blocklevel_device *bl) +{ + uint64_t size; + uint32_t block_size; + struct ffs_handle *ffs; + struct dt_node *node; + struct flash *flash; + const char *name; + int rc; + + rc = blocklevel_get_info(bl, &name, &size, &block_size); + if (rc) + return rc; + + if (!name) + name = "(unnamed)"; + + prlog(PR_INFO, "registering flash device %s " + "(size 0x%llx, blocksize 0x%x)\n", + name, size, block_size); + + flash = malloc(sizeof(struct flash)); + if (!flash) { + prlog(PR_ERR, "Error allocating flash structure\n"); + return OPAL_RESOURCE; + } + + flash->busy = false; + flash->bl = bl; + flash->no_erase = !(bl->flags & WRITE_NEED_ERASE); + flash->size = size; + flash->block_size = block_size; + flash->id = num_flashes(); + + rc = ffs_init(0, flash->size, bl, &ffs, 1); + if (rc) { + /** + * @fwts-label NoFFS + * @fwts-advice System flash isn't formatted as expected. + * This could mean several OPAL utilities do not function + * as expected. e.g. gard, pflash. + */ + prlog(PR_WARNING, "No ffs info; " + "using raw device only\n"); + ffs = NULL; + } + + node = flash_add_dt_node(flash, flash->id); + + setup_system_flash(flash, node, name, ffs); + + if (ffs) + ffs_close(ffs); + + lock(&flash_lock); + list_add(&flashes, &flash->list); + unlock(&flash_lock); + + return OPAL_SUCCESS; +} + +enum flash_op { + FLASH_OP_READ, + FLASH_OP_WRITE, + FLASH_OP_ERASE, +}; + +static int64_t opal_flash_op(enum flash_op op, uint64_t id, uint64_t offset, + uint64_t buf, uint64_t size, uint64_t token) +{ + struct flash *flash = NULL; + int rc; + + if (!try_lock(&flash_lock)) + return OPAL_BUSY; + + list_for_each(&flashes, flash, list) + if (flash->id == id) + break; + + if (flash->id != id) { + /* Couldn't find the flash */ + rc = OPAL_PARAMETER; + goto err; + } + + if (flash->busy) { + rc = OPAL_BUSY; + goto err; + } + + if (size >= flash->size || offset >= flash->size + || offset + size > flash->size) { + rc = OPAL_PARAMETER; + goto err; + } + + /* + * These ops intentionally have no smarts (ecc correction or erase + * before write) to them. + * Skiboot is simply exposing the PNOR flash to the host. + * The host is expected to understand that this is a raw flash + * device and treat it as such. 
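+ *
+ * To the host these calls appear asynchronous: once the blocklevel
+ * operation has been performed, an OPAL_MSG_ASYNC_COMP message carrying
+ * the caller's token is queued with the result and OPAL_ASYNC_COMPLETION
+ * is returned. Errors detected up front (bad id, busy flash, out of
+ * range request) are returned synchronously instead.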
+ */ + switch (op) { + case FLASH_OP_READ: + rc = blocklevel_raw_read(flash->bl, offset, (void *)buf, size); + break; + case FLASH_OP_WRITE: + rc = blocklevel_raw_write(flash->bl, offset, (void *)buf, size); + break; + case FLASH_OP_ERASE: + rc = blocklevel_erase(flash->bl, offset, size); + break; + default: + assert(0); + } + + if (rc) { + rc = OPAL_HARDWARE; + goto err; + } + + unlock(&flash_lock); + + opal_queue_msg(OPAL_MSG_ASYNC_COMP, NULL, NULL, + cpu_to_be64(token), + cpu_to_be64(rc)); + + return OPAL_ASYNC_COMPLETION; + +err: + unlock(&flash_lock); + return rc; +} + +static int64_t opal_flash_read(uint64_t id, uint64_t offset, uint64_t buf, + uint64_t size, uint64_t token) +{ + if (!opal_addr_valid((void *)buf)) + return OPAL_PARAMETER; + + return opal_flash_op(FLASH_OP_READ, id, offset, buf, size, token); +} + +static int64_t opal_flash_write(uint64_t id, uint64_t offset, uint64_t buf, + uint64_t size, uint64_t token) +{ + if (!opal_addr_valid((void *)buf)) + return OPAL_PARAMETER; + + return opal_flash_op(FLASH_OP_WRITE, id, offset, buf, size, token); +} + +static int64_t opal_flash_erase(uint64_t id, uint64_t offset, uint64_t size, + uint64_t token) +{ + return opal_flash_op(FLASH_OP_ERASE, id, offset, 0L, size, token); +} + +opal_call(OPAL_FLASH_READ, opal_flash_read, 5); +opal_call(OPAL_FLASH_WRITE, opal_flash_write, 5); +opal_call(OPAL_FLASH_ERASE, opal_flash_erase, 4); + +/* flash resource API */ +const char *flash_map_resource_name(enum resource_id id) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(part_name_map); i++) { + if (part_name_map[i].id == id) + return part_name_map[i].name; + } + return NULL; +} + +static size_t sizeof_elf_from_hdr(void *buf) +{ + struct elf_hdr *elf = (struct elf_hdr *)buf; + size_t sz = 0; + + BUILD_ASSERT(SECURE_BOOT_HEADERS_SIZE > sizeof(struct elf_hdr)); + BUILD_ASSERT(SECURE_BOOT_HEADERS_SIZE > sizeof(struct elf64be_hdr)); + BUILD_ASSERT(SECURE_BOOT_HEADERS_SIZE > sizeof(struct elf32be_hdr)); + + if (elf->ei_ident == ELF_IDENT) { + if (elf->ei_class == ELF_CLASS_64) { + if (elf->ei_data == ELF_DATA_LSB) { + struct elf64le_hdr *kh = (struct elf64le_hdr *)buf; + sz = le64_to_cpu(kh->e_shoff) + + ((uint32_t)le16_to_cpu(kh->e_shentsize) * + (uint32_t)le16_to_cpu(kh->e_shnum)); + } else { + struct elf64be_hdr *kh = (struct elf64be_hdr *)buf; + sz = be64_to_cpu(kh->e_shoff) + + ((uint32_t)be16_to_cpu(kh->e_shentsize) * + (uint32_t)be16_to_cpu(kh->e_shnum)); + } + } else if (elf->ei_class == ELF_CLASS_32) { + if (elf->ei_data == ELF_DATA_LSB) { + struct elf32le_hdr *kh = (struct elf32le_hdr *)buf; + sz = le32_to_cpu(kh->e_shoff) + + (le16_to_cpu(kh->e_shentsize) * + le16_to_cpu(kh->e_shnum)); + } else { + struct elf32be_hdr *kh = (struct elf32be_hdr *)buf; + sz = be32_to_cpu(kh->e_shoff) + + (be16_to_cpu(kh->e_shentsize) * + be16_to_cpu(kh->e_shnum)); + } + } + } + + return sz; +} + +/* + * load a resource from FLASH + * buf and len shouldn't account for ECC even if partition is ECCed. + * + * The API here is a bit strange. + * If resource has a STB container, buf will contain it + * If loading subpartition with STB container, buff will *NOT* contain it + * For trusted boot, the whole partition containing the subpart is measured. + * + * Additionally, the logic to work out how much to read from flash is insane. 
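+ *
+ * Roughly, the cases below are:
+ * - signed (STB container) partitions: read the container header, take
+ *   the payload size from it and read that much content;
+ * - unsigned BOOTKERNEL/ROOTFS: size the read from the ELF headers found
+ *   in the first SECURE_BOOT_HEADERS_SIZE bytes (other unsigned
+ *   partitions are read whole);
+ * - sub-partitions: parse the sub-partition TOC to compute offset/size,
+ *   read the whole computed partition (so it can be measured), then
+ *   memmove() the requested sub-partition to the start of the buffer.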
+ */ +static int flash_load_resource(enum resource_id id, uint32_t subid, + void *buf, size_t *len) +{ + int i; + int rc = OPAL_RESOURCE; + struct ffs_handle *ffs; + struct flash *flash; + const char *name; + bool status = false; + bool ecc; + bool part_signed = false; + void *bufp = buf; + size_t bufsz = *len; + int ffs_part_num, ffs_part_start, ffs_part_size; + int content_size = 0; + int offset = 0; + + lock(&flash_lock); + + if (!system_flash) { + /** + * @fwts-label SystemFlashNotFound + * @fwts-advice No system flash was found. Check for missing + * calls flash_register(...). + */ + prlog(PR_WARNING, "Can't load resource id:%i. " + "No system flash found\n", id); + goto out_unlock; + } + + flash = system_flash; + + if (flash->busy) + goto out_unlock; + + for (i = 0, name = NULL; i < ARRAY_SIZE(part_name_map); i++) { + if (part_name_map[i].id == id) { + name = part_name_map[i].name; + break; + } + } + if (!name) { + prerror("Couldn't find partition for id %d\n", id); + goto out_unlock; + } + /* + * If partition doesn't have a subindex but the caller specifies one, + * we fail. eg. kernel partition doesn't have a subindex + */ + if ((part_name_map[i].subid == RESOURCE_SUBID_NONE) && + (subid != RESOURCE_SUBID_NONE)) { + prerror("PLAT: Partition %s doesn't have subindex\n", name); + goto out_unlock; + } + + rc = ffs_init(0, flash->size, flash->bl, &ffs, 1); + if (rc) { + prerror("Can't open ffs handle: %d\n", rc); + goto out_unlock; + } + + rc = ffs_lookup_part(ffs, name, &ffs_part_num); + if (rc) { + /* This is not an error per-se, some partitions + * are purposefully absent, don't spam the logs + */ + prlog(PR_DEBUG, "No %s partition\n", name); + goto out_free_ffs; + } + rc = ffs_part_info(ffs, ffs_part_num, NULL, + &ffs_part_start, NULL, &ffs_part_size, &ecc); + if (rc) { + prerror("Failed to get %s partition info\n", name); + goto out_free_ffs; + } + prlog(PR_DEBUG,"%s partition %s ECC\n", + name, ecc ? "has" : "doesn't have"); + + /* + * FIXME: Make the fact we don't support partitions smaller than 4K + * more explicit. + */ + if (ffs_part_size < SECURE_BOOT_HEADERS_SIZE) { + prerror("secboot headers bigger than " + "partition size 0x%x\n", ffs_part_size); + goto out_free_ffs; + } + + rc = blocklevel_read(flash->bl, ffs_part_start, bufp, + SECURE_BOOT_HEADERS_SIZE); + if (rc) { + prerror("failed to read the first 0x%x from " + "%s partition, rc %d\n", SECURE_BOOT_HEADERS_SIZE, + name, rc); + goto out_free_ffs; + } + + part_signed = stb_is_container(bufp, SECURE_BOOT_HEADERS_SIZE); + + prlog(PR_DEBUG, "%s partition %s signed\n", name, + part_signed ? "is" : "isn't"); + + /* + * part_start/size are raw pointers into the partition. + * ie. they will account for ECC if included. + */ + + if (part_signed) { + bufp += SECURE_BOOT_HEADERS_SIZE; + bufsz -= SECURE_BOOT_HEADERS_SIZE; + content_size = stb_sw_payload_size(buf, SECURE_BOOT_HEADERS_SIZE); + *len = content_size + SECURE_BOOT_HEADERS_SIZE; + + if (content_size > bufsz) { + prerror("content size > buffer size\n"); + rc = OPAL_PARAMETER; + goto out_free_ffs; + } + + if (*len > ffs_part_size) { + prerror("FLASH: Cannot load %s. 
Content is larger than the partition\n", + name); + rc = OPAL_PARAMETER; + goto out_free_ffs; + } + + ffs_part_start += SECURE_BOOT_HEADERS_SIZE; + + rc = blocklevel_read(flash->bl, ffs_part_start, bufp, + content_size); + if (rc) { + prerror("failed to read content size %d" + " %s partition, rc %d\n", + content_size, name, rc); + goto out_free_ffs; + } + + if (subid == RESOURCE_SUBID_NONE) + goto done_reading; + + rc = flash_subpart_info(bufp, content_size, ffs_part_size, + NULL, subid, &offset, &content_size); + if (rc) { + prerror("Failed to parse subpart info for %s\n", + name); + goto out_free_ffs; + } + bufp += offset; + goto done_reading; + } else /* stb_signed */ { + /* + * Back to the old way of doing things, no STB header. + */ + if (subid == RESOURCE_SUBID_NONE) { + if (id == RESOURCE_ID_KERNEL || + id == RESOURCE_ID_INITRAMFS) { + /* + * Because actualSize is a lie, we compute the + * size of the BOOTKERNEL based on what the ELF + * headers say. Otherwise we end up reading more + * than we should + */ + content_size = sizeof_elf_from_hdr(buf); + if (!content_size) { + prerror("Invalid ELF header part" + " %s\n", name); + rc = OPAL_RESOURCE; + goto out_free_ffs; + } + } else { + content_size = ffs_part_size; + } + if (content_size > bufsz) { + prerror("%s content size %d > " + " buffer size %lu\n", name, + content_size, bufsz); + rc = OPAL_PARAMETER; + goto out_free_ffs; + } + prlog(PR_DEBUG, "computed %s size %u\n", + name, content_size); + rc = blocklevel_read(flash->bl, ffs_part_start, + buf, content_size); + if (rc) { + prerror("failed to read content size %d" + " %s partition, rc %d\n", + content_size, name, rc); + goto out_free_ffs; + } + *len = content_size; + goto done_reading; + } + BUILD_ASSERT(FLASH_SUBPART_HEADER_SIZE <= SECURE_BOOT_HEADERS_SIZE); + rc = flash_subpart_info(bufp, SECURE_BOOT_HEADERS_SIZE, + ffs_part_size, &ffs_part_size, subid, + &offset, &content_size); + if (rc) { + prerror("FAILED reading subpart info. rc=%d\n", + rc); + goto out_free_ffs; + } + + *len = ffs_part_size; + prlog(PR_DEBUG, "Computed %s partition size: %u " + "(subpart %u size %u offset %u)\n", name, ffs_part_size, + subid, content_size, offset); + /* + * For a sub partition, we read the whole (computed) + * partition, and then measure that. + * Afterwards, we memmove() things back into place for + * the caller. + */ + rc = blocklevel_read(flash->bl, ffs_part_start, + buf, ffs_part_size); + + bufp += offset; + } + +done_reading: + /* + * Verify and measure the retrieved PNOR partition as part of the + * secure boot and trusted boot requirements + */ + secureboot_verify(id, buf, *len); + trustedboot_measure(id, buf, *len); + + /* Find subpartition */ + if (subid != RESOURCE_SUBID_NONE) { + memmove(buf, bufp, content_size); + *len = content_size; + } + + status = true; + +out_free_ffs: + ffs_close(ffs); +out_unlock: + unlock(&flash_lock); + return status ? 
OPAL_SUCCESS : rc; +} + + +struct flash_load_resource_item { + enum resource_id id; + uint32_t subid; + int result; + void *buf; + size_t *len; + struct list_node link; +}; + +static LIST_HEAD(flash_load_resource_queue); +static LIST_HEAD(flash_loaded_resources); +static struct lock flash_load_resource_lock = LOCK_UNLOCKED; +static struct cpu_job *flash_load_job = NULL; + +int flash_resource_loaded(enum resource_id id, uint32_t subid) +{ + struct flash_load_resource_item *resource = NULL; + struct flash_load_resource_item *r; + int rc = OPAL_BUSY; + + lock(&flash_load_resource_lock); + list_for_each(&flash_loaded_resources, r, link) { + if (r->id == id && r->subid == subid) { + resource = r; + break; + } + } + + if (resource) { + rc = resource->result; + list_del(&resource->link); + free(resource); + } + + if (list_empty(&flash_load_resource_queue) && flash_load_job) { + cpu_wait_job(flash_load_job, true); + flash_load_job = NULL; + } + + unlock(&flash_load_resource_lock); + + return rc; +} + +/* + * Retry for 10 minutes in 5 second intervals: allow 5 minutes for a BMC reboot + * (need the BMC if we're using HIOMAP flash access), then 2x for some margin. + */ +#define FLASH_LOAD_WAIT_MS 5000 +#define FLASH_LOAD_RETRIES (2 * 5 * (60 / (FLASH_LOAD_WAIT_MS / 1000))) + +static void flash_load_resources(void *data __unused) +{ + struct flash_load_resource_item *r; + int retries = FLASH_LOAD_RETRIES; + int result = OPAL_RESOURCE; + + lock(&flash_load_resource_lock); + do { + if (list_empty(&flash_load_resource_queue)) { + break; + } + r = list_top(&flash_load_resource_queue, + struct flash_load_resource_item, link); + if (r->result != OPAL_EMPTY) + prerror("flash_load_resources() list_top unexpected " + " result %d\n", r->result); + r->result = OPAL_BUSY; + unlock(&flash_load_resource_lock); + + while (retries) { + result = flash_load_resource(r->id, r->subid, r->buf, + r->len); + if (result == OPAL_SUCCESS) { + retries = FLASH_LOAD_RETRIES; + break; + } + + if (result != FLASH_ERR_AGAIN && + result != FLASH_ERR_DEVICE_GONE) + break; + + time_wait_ms(FLASH_LOAD_WAIT_MS); + + retries--; + + prlog(PR_WARNING, + "Retrying load of %d:%d, %d attempts remain\n", + r->id, r->subid, retries); + } + + lock(&flash_load_resource_lock); + r = list_pop(&flash_load_resource_queue, + struct flash_load_resource_item, link); + /* Will reuse the result from when we hit retries == 0 */ + r->result = result; + list_add_tail(&flash_loaded_resources, &r->link); + } while(true); + unlock(&flash_load_resource_lock); +} + +static void start_flash_load_resource_job(void) +{ + if (flash_load_job) + cpu_wait_job(flash_load_job, true); + + flash_load_job = cpu_queue_job(NULL, "flash_load_resources", + flash_load_resources, NULL); + + cpu_process_local_jobs(); +} + +int flash_start_preload_resource(enum resource_id id, uint32_t subid, + void *buf, size_t *len) +{ + struct flash_load_resource_item *r; + bool start_thread = false; + + r = malloc(sizeof(struct flash_load_resource_item)); + + assert(r != NULL); + r->id = id; + r->subid = subid; + r->buf = buf; + r->len = len; + r->result = OPAL_EMPTY; + + prlog(PR_DEBUG, "Queueing preload of %x/%x\n", + r->id, r->subid); + + lock(&flash_load_resource_lock); + if (list_empty(&flash_load_resource_queue)) { + start_thread = true; + } + list_add_tail(&flash_load_resource_queue, &r->link); + unlock(&flash_load_resource_lock); + + if (start_thread) + start_flash_load_resource_job(); + + return OPAL_SUCCESS; +} + +/* + * The `libxz` decompression routines are blocking; the new 
decompression + * routines, wrapper around `libxz` functions, provide support for asynchronous + * decompression. There are two routines, which start the decompression, and one + * which waits for the decompression to complete. + * + * The decompressed image will be present in the `dst` parameter of + * `xz_decompress` structure. + * + * When the decompression is successful, the xz_decompress->status will be + * `OPAL_SUCCESS` else OPAL_PARAMETER, see definition of xz_decompress structure + * for details. + */ +static void xz_decompress(void *data) +{ + struct xz_decompress *xz = (struct xz_decompress *)data; + struct xz_dec *s; + struct xz_buf b; + + /* Initialize the xz library first */ + xz_crc32_init(); + s = xz_dec_init(XZ_SINGLE, 0); + if (s == NULL) { + prerror("initialization error for xz\n"); + xz->status = OPAL_NO_MEM; + return; + } + + xz->xz_error = XZ_DATA_ERROR; + xz->status = OPAL_PARTIAL; + + b.in = xz->src; + b.in_pos = 0; + b.in_size = xz->src_size; + b.out = xz->dst; + b.out_pos = 0; + b.out_size = xz->dst_size; + + /* Start decompressing */ + xz->xz_error = xz_dec_run(s, &b); + if (xz->xz_error != XZ_STREAM_END) { + prerror("failed to decompress subpartition\n"); + xz->status = OPAL_PARAMETER; + } else + xz->status = OPAL_SUCCESS; + + xz_dec_end(s); +} + +/* + * xz_start_decompress: start the decompression job and return. + * + * struct xz_decompress *xz, should be populated by the caller with + * - the starting address of the compressed binary + * - the address where the decompressed image should be placed + * - the sizes of the source and the destination + * + * xz->src: Source address (The compressed binary) + * xz->src_size: Source size + * xz->dst: Destination address (The memory area where the `src` will be + * decompressed) + * xz->dst_size: Destination size + * + * The `status` value will be OPAL_PARTIAL till the job completes (successfully + * or not) + */ +void xz_start_decompress(struct xz_decompress *xz) +{ + struct cpu_job *job; + + if (!xz) + return; + + if (!xz->dst || !xz->dst_size || !xz->src || !xz->src_size) { + xz->status = OPAL_PARAMETER; + return; + } + + job = cpu_queue_job(NULL, "xz_decompress", xz_decompress, + (void *) xz); + if (!job) { + xz->status = OPAL_NO_MEM; + return; + } + + xz->job = job; +} + +/* + * This function waits for the decompression job to complete. The `ret` + * structure member in `xz_decompress` will have the status code. + * + * status == OPAL_SUCCESS on success, else the corresponding error code. + */ +void wait_xz_decompress(struct xz_decompress *xz) +{ + if (!xz) + return; + + cpu_wait_job(xz->job, true); +} diff --git a/roms/skiboot/core/gcov-profiling.c b/roms/skiboot/core/gcov-profiling.c new file mode 100644 index 000000000..fdad51ed9 --- /dev/null +++ b/roms/skiboot/core/gcov-profiling.c @@ -0,0 +1,127 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* + * libgcov skeleton reimplementation to build skiboot with gcov support + * + * Copyright 2015-2018 IBM Corp. + */ + +#include +#include +#include + +typedef long gcov_type; + +/* + * This is GCC internal data structure. See GCC libgcc/libgcov.h for + * details. + * + * If gcc changes this, we have to change it. 
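+ *
+ * Note that skiboot never writes .gcda files itself: the merge/flush
+ * entry points below are stubs, and the counters are expected to be
+ * pulled straight out of skiboot memory (starting from gcov_info_list)
+ * by an external tool after the run.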
+ */ + +typedef unsigned int gcov_unsigned_int; + +#if __GNUC__ == 4 && __GNUC_MINOR__ >= 9 +#define GCOV_COUNTERS 9 +#else +#define GCOV_COUNTERS 8 +#endif + +struct gcov_info +{ + gcov_unsigned_int version; + struct gcov_info *next; + gcov_unsigned_int stamp; + const char *filename; + void (*merge[GCOV_COUNTERS])(gcov_type *, unsigned int); + unsigned int n_functions; + struct gcov_fn_info **functions; +}; + +/* We have a list of all gcov info set up at startup */ +struct gcov_info *gcov_info_list; + +void __gcov_init(struct gcov_info* f); +void skiboot_gcov_done(void); +void __gcov_flush(void); +void __gcov_merge_add(gcov_type *counters, unsigned int n_counters); +void __gcov_merge_single(gcov_type *counters, unsigned int n_counters); +void __gcov_merge_delta(gcov_type *counters, unsigned int n_counters); +void __gcov_merge_ior(gcov_type *counters, unsigned int n_counters); +void __gcov_merge_time_profile(gcov_type *counters, unsigned int n_counters); +void __gcov_exit(void); + +void __gcov_init(struct gcov_info* f) +{ + static gcov_unsigned_int version = 0; + + if (version == 0) { + printf("GCOV version: %u\n", f->version); + version = f->version; + } + + if (gcov_info_list) + f->next = gcov_info_list; + + gcov_info_list = f; + return; +} + +void skiboot_gcov_done(void) +{ + struct gcov_info *i = gcov_info_list; + + if (i->filename) + printf("GCOV: gcov_info_list looks sane (first file: %s)\n", + i->filename); + else + prlog(PR_WARNING, "GCOV: gcov_info_list doesn't look sane. " + "i->filename == NULL."); + + printf("GCOV: gcov_info_list at 0x%p\n", gcov_info_list); +} + +void __gcov_merge_add(gcov_type *counters, unsigned int n_counters) +{ + (void)counters; + (void)n_counters; + + return; +} + +void __gcov_flush(void) +{ + return; +} + +void __gcov_merge_single(gcov_type *counters, unsigned int n_counters) +{ + (void)counters; + (void)n_counters; + + return; +} + +void __gcov_merge_delta(gcov_type *counters, unsigned int n_counters) +{ + (void)counters; + (void)n_counters; + + return; +} + +void __gcov_merge_ior(gcov_type *counters, unsigned int n_counters) +{ + (void)counters; + (void)n_counters; + return; +} + +void __gcov_merge_time_profile(gcov_type *counters, unsigned int n_counters) +{ + (void)counters; + (void)n_counters; +} + +void __gcov_exit(void) +{ +} diff --git a/roms/skiboot/core/hmi.c b/roms/skiboot/core/hmi.c new file mode 100644 index 000000000..9363cc5fb --- /dev/null +++ b/roms/skiboot/core/hmi.c @@ -0,0 +1,1558 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* + * Deal with Hypervisor Maintenance Interrupts + * + * Copyright 2013-2019 IBM Corp. + */ + +#define pr_fmt(fmt) "HMI: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * P9 HMER register layout: + * +===+==========+============================+========+===================+ + * |Bit|Name |Description |PowerKVM|Action | + * | | | |HMI | | + * | | | |enabled | | + * | | | |for this| | + * | | | |bit ? | | + * +===+==========+============================+========+===================+ + * |0 |malfunctio|A processor core in the |Yes |Raise attn from | + * | |n_allert |system has checkstopped | |sapphire resulting | + * | | |(failed recovery) and has | |xstop | + * | | |requested a CP Sparing | | | + * | | |to occur. 
This is | | | + * | | |broadcasted to every | | | + * | | |processor in the system | | | + * |---+----------+----------------------------+--------+-------------------| + * |1 |Reserved |reserved |n/a | | + * |---+----------+----------------------------+--------+-------------------| + * |2 |proc_recv_|Processor recovery occurred |Yes |Log message and | + * | |done |error-bit in fir not masked | |continue working. | + * | | |(see bit 11) | | | + * |---+----------+----------------------------+--------+-------------------| + * |3 |proc_recv_|Processor went through |Yes |Log message and | + * | |error_mask|recovery for an error which | |continue working. | + * | |ed |is actually masked for | | | + * | | |reporting | | | + * |---+----------+----------------------------+--------+-------------------| + * |4 | |Timer facility experienced |Yes |Raise attn from | + * | |tfac_error|an error. | |sapphire resulting | + * | | |TB, DEC, HDEC, PURR or SPURR| |xstop | + * | | |may be corrupted (details in| | | + * | | |TFMR) | | | + * |---+----------+----------------------------+--------+-------------------| + * |5 | |TFMR SPR itself is |Yes |Raise attn from | + * | |tfmr_parit|corrupted. | |sapphire resulting | + * | |y_error |Entire timing facility may | |xstop | + * | | |be compromised. | | | + * |---+----------+----------------------------+--------+-------------------| + * |6 |ha_overflo| UPS (Uniterrupted Power |No |N/A | + * | |w_warning |System) Overflow indication | | | + * | | |indicating that the UPS | | | + * | | |DirtyAddrTable has | | | + * | | |reached a limit where it | | | + * | | |requires PHYP unload support| | | + * |---+----------+----------------------------+--------+-------------------| + * |7 |reserved |reserved |n/a |n/a | + * |---+----------+----------------------------+--------+-------------------| + * |8 |xscom_fail|An XSCOM operation caused by|No |We handle it by | + * | | |a cache inhibited load/store| |manually reading | + * | | |from this thread failed. A | |HMER register. | + * | | |trap register is | | | + * | | |available. | | | + * | | | | | | + * |---+----------+----------------------------+--------+-------------------| + * |9 |xscom_done|An XSCOM operation caused by|No |We handle it by | + * | | |a cache inhibited load/store| |manually reading | + * | | |from this thread completed. | |HMER register. | + * | | |If hypervisor | | | + * | | |intends to use this bit, it | | | + * | | |is responsible for clearing | | | + * | | |it before performing the | | | + * | | |xscom operation. | | | + * | | |NOTE: this bit should always| | | + * | | |be masked in HMEER | | | + * |---+----------+----------------------------+--------+-------------------| + * |10 |reserved |reserved |n/a |n/a | + * |---+----------+----------------------------+--------+-------------------| + * |11 |proc_recv_|Processor recovery occurred |y |Log message and | + * | |again |again before bit2 or bit3 | |continue working. 
| + * | | |was cleared | | | + * |---+----------+----------------------------+--------+-------------------| + * |12-|reserved |was temperature sensor |n/a |n/a | + * |15 | |passed the critical point on| | | + * | | |the way up | | | + * |---+----------+----------------------------+--------+-------------------| + * |16 | |SCOM has set a reserved FIR |No |n/a | + * | |scom_fir_h|bit to cause recovery | | | + * | |m | | | | + * |---+----------+----------------------------+--------+-------------------| + * |17 |trig_fir_h|Debug trigger has set a |No |n/a | + * | |mi |reserved FIR bit to cause | | | + * | | |recovery | | | + * |---+----------+----------------------------+--------+-------------------| + * |18 |reserved |reserved |n/a |n/a | + * |---+----------+----------------------------+--------+-------------------| + * |19 |reserved |reserved |n/a |n/a | + * |---+----------+----------------------------+--------+-------------------| + * |20 |hyp_resour|A hypervisor resource error |y |Raise attn from | + * | |ce_err |occurred: data parity error | |sapphire resulting | + * | | |on, SPRC0:3; SPR_Modereg or | |xstop. | + * | | |HMEER. | | | + * | | |Note: this bit will cause an| | | + * | | |check_stop when (HV=1, PR=0 | | | + * | | |and EE=0) | | | + * |---+----------+----------------------------+--------+-------------------| + * |21-| |if bit 8 is active, the |No |We handle it by | + * |23 |xscom_stat|reason will be detailed in | |Manually reading | + * | |us |these bits. see chapter 11.1| |HMER register. | + * | | |This bits are information | | | + * | | |only and always masked | | | + * | | |(mask = '0') | | | + * | | |If hypervisor intends to use| | | + * | | |this bit, it is responsible | | | + * | | |for clearing it before | | | + * | | |performing the xscom | | | + * | | |operation. | | | + * |---+----------+----------------------------+--------+-------------------| + * |24-|Not |Not implemented |n/a |n/a | + * |63 |implemente| | | | + * | |d | | | | + * +-- +----------+----------------------------+--------+-------------------+ + * + * Above HMER bits can be enabled/disabled by modifying + * SPR_HMEER_HMI_ENABLE_MASK #define in include/processor.h + * If you modify support for any of the bits listed above, please make sure + * you change the above table to refelct that. + * + * NOTE: Per Dave Larson, never enable 8,9,21-23 + */ + +/* + * P10 HMER register layout: + * Bit Name Description + * 0 malfunction_alert A processor core in the system has checkstopped + * (failed recovery). This is broadcasted to every + * processor in the system + * + * 1 reserved reserved + * + * 2 proc_rcvy_done Processor recovery occurred error-bit in fir not + * masked (see bit 11) + * + * 3 reserved reserved + * + * 4 tfac_error Timer facility experienced an error. TB, DEC, + * HDEC, PURR or SPURR may be corrupted (details in + * TFMR) + * + * 5 tfx_error Error occurred on transfer from tfac shadow to + * core + * + * 6 spurr_scale_limit Nominal frequency exceeded 399 percent + * + * 7 reserved reserved + * + * 8 xscom_fail An XSCOM operation caused by a cache inhibited + * load/store from this thread failed. A trap + * register is available. + * + * 9 xscom_done An XSCOM operation caused by a cache inhibited + * load/store from this thread completed. If + * hypervisor intends to use this bit, it is + * responsible for clearing it before performing the + * xscom operation. 
NOTE: this bit should always be + * masked in HMEER + * + * 10 reserved reserved + * + * 11 proc_rcvy_again Processor recovery occurred again before bit 2 + * was cleared + * + * 12-15 reserved reserved + * + * 16 scom_fir_hmi An error inject to PC FIR has occurred to set HMI. + * This error inject can also set FIR(61) to cause + * recovery. + * + * 17 reserved reserved + * + * 18 trig_fir_hmi Debug trigger has occurred to set HMI. This + * trigger can also set FIR(60) to cause recovery + * + * 19-20 reserved reserved + * + * 21-23 xscom_status If bit 8 is active, the reason will be detailed in + * these bits. These bits are information only and + * always masked (mask = ‘0’) If hypervisor intends + * to use this field, it is responsible for clearing + * it before performing the xscom operation. + * + * 24:63 Not implemented Not implemented. + * + * P10 HMEER enabled bits: + * Name Action + * malfunction_alert Decode and log FIR bits. + * proc_rcvy_done Log and continue. + * tfac_error Log and attempt to recover time facilities. + * tfx_error Log and attempt to recover time facilities. + * spurr_scale_limit Log and continue. XXX? + * proc_rcvy_again Log and continue. + */ + +/* Used for tracking cpu threads inside hmi handling. */ +#define HMI_STATE_CLEANUP_DONE 0x100 +#define CORE_THREAD_MASK 0x0ff +#define SUBCORE_THREAD_MASK(s_id, t_count) \ + ((((1UL) << (t_count)) - 1) << ((s_id) * (t_count))) +#define SINGLE_THREAD_MASK(t_id) ((1UL) << (t_id)) + +/* + * Number of iterations for the various timeouts. We can't use the timebase + * as it might be broken. We measured experimentally that 40 millions loops + * of cpu_relax() gives us more than 1s. The margin is comfortable enough. + */ +#define TIMEOUT_LOOPS 40000000 + +/* TFMR other errors. (other than bit 26 and 45) */ +#define SPR_TFMR_OTHER_ERRORS \ + (SPR_TFMR_TBST_CORRUPT | SPR_TFMR_TB_MISSING_SYNC | \ + SPR_TFMR_TB_MISSING_STEP | SPR_TFMR_FW_CONTROL_ERR | \ + SPR_TFMR_PURR_PARITY_ERR | SPR_TFMR_SPURR_PARITY_ERR | \ + SPR_TFMR_DEC_PARITY_ERR | SPR_TFMR_TFMR_CORRUPT | \ + SPR_TFMR_CHIP_TOD_INTERRUPT) + +/* TFMR "all core" errors (sent to all threads) */ +#define SPR_TFMR_CORE_ERRORS \ + (SPR_TFMR_TBST_CORRUPT | SPR_TFMR_TB_MISSING_SYNC | \ + SPR_TFMR_TB_MISSING_STEP | SPR_TFMR_FW_CONTROL_ERR | \ + SPR_TFMR_TFMR_CORRUPT | SPR_TFMR_TB_RESIDUE_ERR | \ + SPR_TFMR_HDEC_PARITY_ERROR | SPR_TFMR_TFAC_XFER_ERROR) + +/* TFMR "thread" errors */ +#define SPR_TFMR_THREAD_ERRORS \ + (SPR_TFMR_PURR_PARITY_ERR | SPR_TFMR_SPURR_PARITY_ERR | \ + SPR_TFMR_DEC_PARITY_ERR) + +/* + * Starting from p9, core inits are setup to escalate all core + * local checkstop to system checkstop. Review this list when that changes. 
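Because an HMI may mean the timebase itself is unreliable, the timeouts above are expressed as raw spin counts rather than as time. A minimal sketch of how such a bounded wait looks (hypothetical helper, not part of the upstream file; it only assumes the TIMEOUT_LOOPS define and the cpu_relax() call used by the rendez-vous code later in this file):

static bool example_bounded_wait(volatile uint32_t *flag)
{
	uint64_t timeout = TIMEOUT_LOOPS;

	/* ~40M cpu_relax() iterations were measured to exceed one second */
	while (!*flag && --timeout)
		cpu_relax();

	/* false means we gave up, mirroring the rendez-vous timeout paths */
	return timeout != 0;
}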
+ */ +static const struct core_xstop_bit_info { + uint8_t bit; /* CORE FIR bit number */ + enum OpalHMI_CoreXstopReason reason; +} xstop_bits[] = { + { 3, CORE_CHECKSTOP_IFU_REGFILE }, + { 5, CORE_CHECKSTOP_IFU_LOGIC }, + { 8, CORE_CHECKSTOP_PC_DURING_RECOV }, + { 10, CORE_CHECKSTOP_ISU_REGFILE }, + { 12, CORE_CHECKSTOP_ISU_LOGIC }, + { 21, CORE_CHECKSTOP_FXU_LOGIC }, + { 25, CORE_CHECKSTOP_VSU_LOGIC }, + { 26, CORE_CHECKSTOP_PC_RECOV_IN_MAINT_MODE }, + { 32, CORE_CHECKSTOP_LSU_REGFILE }, + { 36, CORE_CHECKSTOP_PC_FWD_PROGRESS }, + { 38, CORE_CHECKSTOP_LSU_LOGIC }, + { 45, CORE_CHECKSTOP_PC_LOGIC }, + { 48, CORE_CHECKSTOP_PC_HYP_RESOURCE }, + { 52, CORE_CHECKSTOP_PC_HANG_RECOV_FAILED }, + { 54, CORE_CHECKSTOP_PC_AMBI_HANG_DETECTED }, + { 63, CORE_CHECKSTOP_PC_SPRD_HYP_ERR_INJ }, +}; + +struct core_fir_bit_info { + uint8_t bit; /* CORE FIR bit number */ + const char *reason; +}; + +static const struct core_fir_bit_info p9_recoverable_bits[] = { + { 0, "IFU - SRAM (ICACHE parity, etc)" }, + { 2, "IFU - RegFile" }, + { 4, "IFU - Logic" }, + { 9, "ISU - RegFile" }, + { 11, "ISU - Logic" }, + { 13, "ISU - Recoverable due to not in MT window" }, + { 24, "VSU - Logic" }, + { 27, "VSU - DFU logic" }, + { 29, "LSU - SRAM (DCACHE parity, etc)" }, + { 31, "LSU - RegFile" }, + /* The following 3 bits may be set by SRAM errors. */ + { 33, "LSU - TLB multi hit" }, + { 34, "LSU - SLB multi hit" }, + { 35, "LSU - ERAT multi hit" }, + { 37, "LSU - Logic" }, + { 39, "LSU - Recoverable due to not in MT window" }, + { 43, "PC - Thread hang recovery" }, +}; + +static const struct core_fir_bit_info p10_core_fir_bits[] = { + { 0, "IFU - SRAM recoverable error (ICACHE parity error, etc.)" }, + { 1, "PC - TC checkstop" }, + { 2, "IFU - RegFile recoverable error" }, + { 3, "IFU - RegFile core checkstop" }, + { 4, "IFU - Logic recoverable error" }, + { 5, "IFU - Logic core checkstop" }, + { 7, "VSU - Inference accumulator recoverable error" }, + { 8, "PC - Recovery core checkstop" }, + { 9, "VSU - Slice Target File (STF) recoverable error" }, + { 11, "ISU - Logic recoverable error" }, + { 12, "ISU - Logic core checkstop" }, + { 14, "ISU - Machine check received while ME=0 checkstop" }, + { 15, "ISU - UE from L2" }, + { 16, "ISU - Number of UEs from L2 above threshold" }, + { 17, "ISU - UE on CI load" }, + { 18, "MMU - TLB recoverable error" }, + { 19, "MMU - SLB error" }, + { 21, "MMU - CXT recoverable error" }, + { 22, "MMU - Logic core checkstop" }, + { 23, "MMU - MMU system checkstop" }, + { 24, "VSU - Logic recoverable error" }, + { 25, "VSU - Logic core checkstop" }, + { 26, "PC - In maint mode and recovery in progress" }, + { 28, "PC - PC system checkstop" }, + { 29, "LSU - SRAM recoverable error (DCACHE parity error, etc.)" }, + { 30, "LSU - Set deleted" }, + { 31, "LSU - RegFile recoverable error" }, + { 32, "LSU - RegFile core checkstop" }, + { 33, "MMU - TLB multi hit error occurred" }, + { 34, "MMU - SLB multi hit error occurred" }, + { 35, "LSU - ERAT multi hit error occurred" }, + { 36, "PC - Forward progress error" }, + { 37, "LSU - Logic recoverable error" }, + { 38, "LSU - Logic core checkstop" }, + { 41, "LSU - System checkstop" }, + { 43, "PC - Thread hang recoverable error" }, + { 45, "PC - Logic core checkstop" }, + { 47, "PC - TimeBase facility checkstop" }, + { 52, "PC - Hang recovery failed core checkstop" }, + { 53, "PC - Core internal hang detected" }, + { 55, "PC - Nest hang detected" }, + { 56, "PC - Other core chiplet recoverable error" }, + { 57, "PC - Other core chiplet core checkstop" 
}, + { 58, "PC - Other core chiplet system checkstop" }, + { 59, "PC - SCOM satellite error detected" }, + { 60, "PC - Debug trigger error inject" }, + { 61, "PC - SCOM or firmware recoverable error inject" }, + { 62, "PC - Firmware checkstop error inject" }, + { 63, "PC - Firmware SPRC / SPRD checkstop" }, +}; + +static const struct nx_xstop_bit_info { + uint8_t bit; /* NX FIR bit number */ + enum OpalHMI_NestAccelXstopReason reason; +} nx_dma_xstop_bits[] = { + { 1, NX_CHECKSTOP_SHM_INVAL_STATE_ERR }, + { 15, NX_CHECKSTOP_DMA_INVAL_STATE_ERR_1 }, + { 16, NX_CHECKSTOP_DMA_INVAL_STATE_ERR_2 }, + { 20, NX_CHECKSTOP_DMA_CH0_INVAL_STATE_ERR }, + { 21, NX_CHECKSTOP_DMA_CH1_INVAL_STATE_ERR }, + { 22, NX_CHECKSTOP_DMA_CH2_INVAL_STATE_ERR }, + { 23, NX_CHECKSTOP_DMA_CH3_INVAL_STATE_ERR }, + { 24, NX_CHECKSTOP_DMA_CH4_INVAL_STATE_ERR }, + { 25, NX_CHECKSTOP_DMA_CH5_INVAL_STATE_ERR }, + { 26, NX_CHECKSTOP_DMA_CH6_INVAL_STATE_ERR }, + { 27, NX_CHECKSTOP_DMA_CH7_INVAL_STATE_ERR }, + { 31, NX_CHECKSTOP_DMA_CRB_UE }, + { 32, NX_CHECKSTOP_DMA_CRB_SUE }, +}; + +static const struct nx_xstop_bit_info nx_pbi_xstop_bits[] = { + { 12, NX_CHECKSTOP_PBI_ISN_UE }, +}; + +static struct lock hmi_lock = LOCK_UNLOCKED; +static uint32_t malf_alert_scom; +static uint32_t nx_status_reg; +static uint32_t nx_dma_engine_fir; +static uint32_t nx_pbi_fir; + +static int setup_scom_addresses(void) +{ + switch (proc_gen) { + case proc_gen_p8: + malf_alert_scom = P8_MALFUNC_ALERT; + nx_status_reg = P8_NX_STATUS_REG; + nx_dma_engine_fir = P8_NX_DMA_ENGINE_FIR; + nx_pbi_fir = P8_NX_PBI_FIR; + return 1; + case proc_gen_p9: + malf_alert_scom = P9_MALFUNC_ALERT; + nx_status_reg = P9_NX_STATUS_REG; + nx_dma_engine_fir = P9_NX_DMA_ENGINE_FIR; + nx_pbi_fir = P9_NX_PBI_FIR; + return 1; + case proc_gen_p10: + malf_alert_scom = P10_MALFUNC_ALERT; + nx_status_reg = P10_NX_STATUS_REG; + nx_dma_engine_fir = P10_NX_DMA_ENGINE_FIR; + nx_pbi_fir = P10_NX_PBI_FIR; + return 1; + default: + prerror("%s: Unknown CPU type\n", __func__); + break; + } + return 0; +} + +static int queue_hmi_event(struct OpalHMIEvent *hmi_evt, int recover, uint64_t *out_flags) +{ + size_t size; + + /* Don't queue up event if recover == -1 */ + if (recover == -1) + return 0; + + /* set disposition */ + if (recover == 1) + hmi_evt->disposition = OpalHMI_DISPOSITION_RECOVERED; + else if (recover == 0) + hmi_evt->disposition = OpalHMI_DISPOSITION_NOT_RECOVERED; + + /* + * V2 of struct OpalHMIEvent is of (5 * 64 bits) size and well packed + * structure. Hence use uint64_t pointer to pass entire structure + * using 5 params in generic message format. Instead of hard coding + * num_params divide the struct size by 8 bytes to get exact + * num_params value. + */ + size = ALIGN_UP(sizeof(*hmi_evt), sizeof(u64)); + + *out_flags |= OPAL_HMI_FLAGS_NEW_EVENT; + + /* queue up for delivery to host. 
*/ + return _opal_queue_msg(OPAL_MSG_HMI_EVT, NULL, NULL, + size, hmi_evt); +} + +static int read_core_fir(uint32_t chip_id, uint32_t core_id, uint64_t *core_fir) +{ + int rc; + + switch (proc_gen) { + case proc_gen_p8: + rc = xscom_read(chip_id, + XSCOM_ADDR_P8_EX(core_id, P8_CORE_FIR), core_fir); + break; + case proc_gen_p9: + rc = xscom_read(chip_id, + XSCOM_ADDR_P9_EC(core_id, P9_CORE_FIR), core_fir); + break; + case proc_gen_p10: + rc = xscom_read(chip_id, + XSCOM_ADDR_P10_EC(core_id, P10_CORE_FIR), core_fir); + break; + default: + rc = OPAL_HARDWARE; + } + return rc; +} + +static int read_core_wof(uint32_t chip_id, uint32_t core_id, uint64_t *core_wof) +{ + int rc; + + switch (proc_gen) { + case proc_gen_p9: + rc = xscom_read(chip_id, + XSCOM_ADDR_P9_EC(core_id, P9_CORE_WOF), core_wof); + break; + case proc_gen_p10: + rc = xscom_read(chip_id, + XSCOM_ADDR_P10_EC(core_id, P10_CORE_WOF), core_wof); + break; + default: + rc = OPAL_HARDWARE; + } + return rc; +} + +static bool decode_core_fir(struct cpu_thread *cpu, + struct OpalHMIEvent *hmi_evt) +{ + uint64_t core_fir; + uint32_t core_id; + int i, swkup_rc; + bool found = false; + int64_t ret; + const char *loc; + + /* Sanity check */ + if (!cpu || !hmi_evt) + return false; + + core_id = pir_to_core_id(cpu->pir); + + /* Force the core to wakeup, otherwise reading core_fir is unrealiable + * if stop-state 5 is enabled. + */ + swkup_rc = dctl_set_special_wakeup(cpu); + + /* Get CORE FIR register value. */ + ret = read_core_fir(cpu->chip_id, core_id, &core_fir); + + if (!swkup_rc) + dctl_clear_special_wakeup(cpu); + + + if (ret == OPAL_WRONG_STATE) { + /* + * CPU is asleep, so it probably didn't cause the checkstop. + * If no other HMI cause is found a "catchall" checkstop + * will be raised, so if this CPU should've been awake the + * error will be handled appropriately. + */ + prlog(PR_DEBUG, + "FIR read failed, chip %d core %d asleep\n", + cpu->chip_id, core_id); + return false; + } else if (ret != OPAL_SUCCESS) { + prerror("XSCOM error reading CORE FIR\n"); + /* If the FIR can't be read, we should checkstop. */ + return true; + } + + if (!core_fir) + return false; + + loc = chip_loc_code(cpu->chip_id); + prlog(PR_INFO, "[Loc: %s]: CHIP ID: %x, CORE ID: %x, FIR: %016llx\n", + loc ? loc : "Not Available", + cpu->chip_id, core_id, core_fir); + + if (proc_gen == proc_gen_p10) { + for (i = 0; i < ARRAY_SIZE(p10_core_fir_bits); i++) { + if (core_fir & PPC_BIT(p10_core_fir_bits[i].bit)) + prlog(PR_INFO, " %s\n", p10_core_fir_bits[i].reason); + } + } + + /* Check CORE FIR bits and populate HMI event with error info. */ + for (i = 0; i < ARRAY_SIZE(xstop_bits); i++) { + if (core_fir & PPC_BIT(xstop_bits[i].bit)) { + found = true; + hmi_evt->u.xstop_error.xstop_reason + |= cpu_to_be32(xstop_bits[i].reason); + } + } + return found; +} + +static void find_core_checkstop_reason(struct OpalHMIEvent *hmi_evt, + uint64_t *out_flags) +{ + struct cpu_thread *cpu; + + /* Initialize HMI event */ + hmi_evt->severity = OpalHMI_SEV_FATAL; + hmi_evt->type = OpalHMI_ERROR_MALFUNC_ALERT; + hmi_evt->u.xstop_error.xstop_type = CHECKSTOP_TYPE_CORE; + + /* + * Check CORE FIRs and find the reason for core checkstop. + * Send a separate HMI event for each core that has checkstopped. + */ + for_each_cpu(cpu) { + /* GARDed CPUs are marked unavailable. Skip them. */ + if (cpu->state == cpu_state_unavailable) + continue; + + /* Only check on primaries (ie. core), not threads */ + if (cpu->is_secondary) + continue; + + /* Initialize xstop_error fields. 
*/ + hmi_evt->u.xstop_error.xstop_reason = 0; + hmi_evt->u.xstop_error.u.pir = cpu_to_be32(cpu->pir); + + if (decode_core_fir(cpu, hmi_evt)) + queue_hmi_event(hmi_evt, 0, out_flags); + } +} + +static void find_capp_checkstop_reason(int flat_chip_id, + struct OpalHMIEvent *hmi_evt, + uint64_t *out_flags) +{ + struct capp_info info; + struct phb *phb; + uint64_t capp_fir; + uint64_t capp_fir_mask; + uint64_t capp_fir_action0; + uint64_t capp_fir_action1; + uint64_t reg; + int64_t rc; + + /* CAPP exists on P8 and P9 only */ + if (proc_gen != proc_gen_p8 && proc_gen != proc_gen_p9) + return; + + /* Find the CAPP on the chip associated with the HMI. */ + for_each_phb(phb) { + /* get the CAPP info */ + rc = capp_get_info(flat_chip_id, phb, &info); + if (rc == OPAL_PARAMETER) + continue; + + if (xscom_read(flat_chip_id, info.capp_fir_reg, &capp_fir) || + xscom_read(flat_chip_id, info.capp_fir_mask_reg, + &capp_fir_mask) || + xscom_read(flat_chip_id, info.capp_fir_action0_reg, + &capp_fir_action0) || + xscom_read(flat_chip_id, info.capp_fir_action1_reg, + &capp_fir_action1)) { + prerror("CAPP: Couldn't read CAPP#%d (PHB:#%x) FIR registers by XSCOM!\n", + info.capp_index, info.phb_index); + continue; + } + + if (!(capp_fir & ~capp_fir_mask)) + continue; + + prlog(PR_DEBUG, "CAPP#%d (PHB:#%x): FIR 0x%016llx mask 0x%016llx\n", + info.capp_index, info.phb_index, capp_fir, + capp_fir_mask); + prlog(PR_DEBUG, "CAPP#%d (PHB:#%x): ACTION0 0x%016llx, ACTION1 0x%016llx\n", + info.capp_index, info.phb_index, capp_fir_action0, + capp_fir_action1); + + /* + * If this bit is set (=1) a Recoverable Error has been + * detected + */ + xscom_read(flat_chip_id, info.capp_err_status_ctrl_reg, ®); + if ((reg & PPC_BIT(0)) != 0) { + phb_lock(phb); + phb->ops->set_capp_recovery(phb); + phb_unlock(phb); + + hmi_evt->severity = OpalHMI_SEV_NO_ERROR; + hmi_evt->type = OpalHMI_ERROR_CAPP_RECOVERY; + queue_hmi_event(hmi_evt, 1, out_flags); + + return; + } + } +} + +static void find_nx_checkstop_reason(int flat_chip_id, + struct OpalHMIEvent *hmi_evt, + uint64_t *out_flags) +{ + uint64_t nx_status; + uint64_t nx_dma_fir; + uint64_t nx_pbi_fir_val; + int i; + + /* Get NX status register value. */ + if (xscom_read(flat_chip_id, nx_status_reg, &nx_status) != 0) { + prerror("XSCOM error reading NX_STATUS_REG\n"); + return; + } + + /* Check if NX has driven an HMI interrupt. */ + if (!(nx_status & NX_HMI_ACTIVE)) + return; + + /* Initialize HMI event */ + hmi_evt->severity = OpalHMI_SEV_FATAL; + hmi_evt->type = OpalHMI_ERROR_MALFUNC_ALERT; + hmi_evt->u.xstop_error.xstop_type = CHECKSTOP_TYPE_NX; + hmi_evt->u.xstop_error.u.chip_id = cpu_to_be32(flat_chip_id); + + /* Get DMA & Engine FIR data register value. */ + if (xscom_read(flat_chip_id, nx_dma_engine_fir, &nx_dma_fir) != 0) { + prerror("XSCOM error reading NX_DMA_ENGINE_FIR\n"); + return; + } + + /* Get PowerBus Interface FIR data register value. */ + if (xscom_read(flat_chip_id, nx_pbi_fir, &nx_pbi_fir_val) != 0) { + prerror("XSCOM error reading NX_PBI_FIR\n"); + return; + } + + /* Find NX checkstop reason and populate HMI event with error info. 
*/ + for (i = 0; i < ARRAY_SIZE(nx_dma_xstop_bits); i++) + if (nx_dma_fir & PPC_BIT(nx_dma_xstop_bits[i].bit)) + hmi_evt->u.xstop_error.xstop_reason + |= cpu_to_be32(nx_dma_xstop_bits[i].reason); + + for (i = 0; i < ARRAY_SIZE(nx_pbi_xstop_bits); i++) + if (nx_pbi_fir_val & PPC_BIT(nx_pbi_xstop_bits[i].bit)) + hmi_evt->u.xstop_error.xstop_reason + |= cpu_to_be32(nx_pbi_xstop_bits[i].reason); + + /* + * Set NXDMAENGFIR[38] to signal PRD that service action is required. + * Without this inject, PRD will not be able to do NX unit checkstop + * error analysis. NXDMAENGFIR[38] is a spare bit and used to report + * a software initiated attention. + * + * The behavior of this bit and all FIR bits are documented in + * RAS spreadsheet. + */ + xscom_write(flat_chip_id, nx_dma_engine_fir, PPC_BIT(38)); + + /* Send an HMI event. */ + queue_hmi_event(hmi_evt, 0, out_flags); +} + +static bool phb_is_npu2(struct dt_node *dn) +{ + return (dt_node_is_compatible(dn, "ibm,power9-npu-pciex") || + dt_node_is_compatible(dn, "ibm,power9-npu-opencapi-pciex")); +} + +static void add_npu2_xstop_reason(uint32_t *xstop_reason, uint8_t reason) +{ + int i, reason_count; + uint8_t *ptr; + + reason_count = sizeof(*xstop_reason) / sizeof(reason); + ptr = (uint8_t *) xstop_reason; + for (i = 0; i < reason_count; i++) { + if (*ptr == 0) { + *ptr = reason; + break; + } + ptr++; + } +} + +static void encode_npu2_xstop_reason(uint32_t *xstop_reason, + uint64_t fir, int fir_number) +{ + int bit; + uint8_t reason; + + /* + * There are three 64-bit FIRs but the xstop reason field of + * the hmi event is only 32-bit. Encode which FIR bit is set as: + * - 2 bits for the FIR number + * - 6 bits for the bit number (0 -> 63) + * + * So we could even encode up to 4 reasons for the HMI, if + * that can ever happen + */ + while (fir) { + bit = ilog2(fir); + reason = fir_number << 6; + reason |= (63 - bit); // IBM numbering + add_npu2_xstop_reason(xstop_reason, reason); + fir ^= 1ULL << bit; + } +} + +static void find_npu2_checkstop_reason(int flat_chip_id, + struct OpalHMIEvent *hmi_evt, + uint64_t *out_flags) +{ + struct phb *phb; + int i; + bool npu2_hmi_verbose = false, found = false; + uint64_t npu2_fir; + uint64_t npu2_fir_mask; + uint64_t npu2_fir_action0; + uint64_t npu2_fir_action1; + uint64_t npu2_fir_addr; + uint64_t npu2_fir_mask_addr; + uint64_t npu2_fir_action0_addr; + uint64_t npu2_fir_action1_addr; + uint64_t fatal_errors; + uint32_t xstop_reason = 0; + int total_errors = 0; + const char *loc; + + /* NPU2 only */ + if (PVR_TYPE(mfspr(SPR_PVR)) != PVR_TYPE_P9) + return; + + /* Find the NPU on the chip associated with the HMI. */ + for_each_phb(phb) { + /* NOTE: if a chip ever has >1 NPU this will need adjusting */ + if (phb_is_npu2(phb->dt_node) && + (dt_get_chip_id(phb->dt_node) == flat_chip_id)) { + found = true; + break; + } + } + + /* If we didn't find a NPU on the chip, it's not our checkstop. */ + if (!found) + return; + + npu2_fir_addr = NPU2_FIR_REGISTER_0; + npu2_fir_mask_addr = NPU2_FIR_REGISTER_0 + NPU2_FIR_MASK_OFFSET; + npu2_fir_action0_addr = NPU2_FIR_REGISTER_0 + NPU2_FIR_ACTION0_OFFSET; + npu2_fir_action1_addr = NPU2_FIR_REGISTER_0 + NPU2_FIR_ACTION1_OFFSET; + + for (i = 0; i < NPU2_TOTAL_FIR_REGISTERS; i++) { + /* Read all the registers necessary to find a checkstop condition. 
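The xstop_reason packing described above (top two bits select which of the three FIRs, low six bits carry the IBM bit number, one byte per reason) can be unpacked the same way it was built, by walking the four bytes of the reason word. A hypothetical inverse of encode_npu2_xstop_reason(), for illustration only:

static void example_decode_npu2_xstop_reason(uint32_t xstop_reason)
{
	uint8_t *ptr = (uint8_t *)&xstop_reason;
	unsigned int i;

	for (i = 0; i < sizeof(xstop_reason); i++, ptr++) {
		if (!*ptr)
			continue;
		/* Top 2 bits: FIR number; low 6 bits: IBM bit 0..63 */
		prlog(PR_DEBUG, "NPU2: FIR#%d, bit %d set\n",
		      *ptr >> 6, *ptr & 0x3f);
	}
}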
*/ + if (xscom_read(flat_chip_id, npu2_fir_addr, &npu2_fir) || + xscom_read(flat_chip_id, npu2_fir_mask_addr, &npu2_fir_mask) || + xscom_read(flat_chip_id, npu2_fir_action0_addr, &npu2_fir_action0) || + xscom_read(flat_chip_id, npu2_fir_action1_addr, &npu2_fir_action1)) { + prerror("HMI: Couldn't read NPU FIR register%d with XSCOM\n", i); + continue; + } + + fatal_errors = npu2_fir & ~npu2_fir_mask & npu2_fir_action0 & npu2_fir_action1; + + if (fatal_errors) { + loc = chip_loc_code(flat_chip_id); + if (!loc) + loc = "Not Available"; + prlog(PR_ERR, "NPU: [Loc: %s] P:%d FIR#%d FIR 0x%016llx mask 0x%016llx\n", + loc, flat_chip_id, i, npu2_fir, npu2_fir_mask); + prlog(PR_ERR, "NPU: [Loc: %s] P:%d ACTION0 0x%016llx, ACTION1 0x%016llx\n", + loc, flat_chip_id, npu2_fir_action0, npu2_fir_action1); + total_errors++; + + encode_npu2_xstop_reason(&xstop_reason, fatal_errors, i); + } + + /* Can't do a fence yet, we are just logging fir information for now */ + npu2_fir_addr += NPU2_FIR_OFFSET; + npu2_fir_mask_addr += NPU2_FIR_OFFSET; + npu2_fir_action0_addr += NPU2_FIR_OFFSET; + npu2_fir_action1_addr += NPU2_FIR_OFFSET; + + } + + if (!total_errors) + return; + + npu2_hmi_verbose = nvram_query_eq_safe("npu2-hmi-verbose", "true"); + /* Force this for now until we sort out something better */ + npu2_hmi_verbose = true; + + if (npu2_hmi_verbose) { + npu2_dump_scoms(flat_chip_id); + prlog(PR_ERR, " _________________________ \n"); + prlog(PR_ERR, "< It's Debug time! >\n"); + prlog(PR_ERR, " ------------------------- \n"); + prlog(PR_ERR, " \\ ,__, \n"); + prlog(PR_ERR, " \\ (oo)____ \n"); + prlog(PR_ERR, " (__) )\\ \n"); + prlog(PR_ERR, " ||--|| * \n"); + } + + /* Set up the HMI event */ + hmi_evt->severity = OpalHMI_SEV_WARNING; + hmi_evt->type = OpalHMI_ERROR_MALFUNC_ALERT; + hmi_evt->u.xstop_error.xstop_type = CHECKSTOP_TYPE_NPU; + hmi_evt->u.xstop_error.xstop_reason = cpu_to_be32(xstop_reason); + hmi_evt->u.xstop_error.u.chip_id = cpu_to_be32(flat_chip_id); + + /* Marking the event as recoverable so that we don't crash */ + queue_hmi_event(hmi_evt, 1, out_flags); +} + +static void find_npu_checkstop_reason(int flat_chip_id, + struct OpalHMIEvent *hmi_evt, + uint64_t *out_flags) +{ + struct phb *phb; + struct npu *p = NULL; + + uint64_t npu_fir; + uint64_t npu_fir_mask; + uint64_t npu_fir_action0; + uint64_t npu_fir_action1; + uint64_t fatal_errors; + + /* Only check for NPU errors if the chip has a NPU */ + if (PVR_TYPE(mfspr(SPR_PVR)) != PVR_TYPE_P8NVL) + return find_npu2_checkstop_reason(flat_chip_id, hmi_evt, out_flags); + + /* Find the NPU on the chip associated with the HMI. */ + for_each_phb(phb) { + /* NOTE: if a chip ever has >1 NPU this will need adjusting */ + if (dt_node_is_compatible(phb->dt_node, "ibm,power8-npu-pciex") && + (dt_get_chip_id(phb->dt_node) == flat_chip_id)) { + p = phb_to_npu(phb); + break; + } + } + + /* If we didn't find a NPU on the chip, it's not our checkstop. */ + if (p == NULL) + return; + + /* Read all the registers necessary to find a checkstop condition. */ + if (xscom_read(flat_chip_id, + p->at_xscom + NX_FIR, &npu_fir) || + xscom_read(flat_chip_id, + p->at_xscom + NX_FIR_MASK, &npu_fir_mask) || + xscom_read(flat_chip_id, + p->at_xscom + NX_FIR_ACTION0, &npu_fir_action0) || + xscom_read(flat_chip_id, + p->at_xscom + NX_FIR_ACTION1, &npu_fir_action1)) { + prerror("Couldn't read NPU registers with XSCOM\n"); + return; + } + + fatal_errors = npu_fir & ~npu_fir_mask & npu_fir_action0 & npu_fir_action1; + + /* If there's no errors, we don't need to do anything. 
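The filtering idiom used above when scanning the NPU FIRs: a bit only counts as a fatal error if it is raised in the FIR, not masked, and flagged in both ACTION registers. A hypothetical one-liner form of the same expression, shown only to spell the idiom out:

static inline uint64_t example_fatal_fir_bits(uint64_t fir, uint64_t mask,
					      uint64_t action0, uint64_t action1)
{
	/* raised, unmasked, and marked fatal by ACTION0 and ACTION1 */
	return fir & ~mask & action0 & action1;
}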
*/ + if (!fatal_errors) + return; + + prlog(PR_DEBUG, "NPU: FIR 0x%016llx mask 0x%016llx\n", + npu_fir, npu_fir_mask); + prlog(PR_DEBUG, "NPU: ACTION0 0x%016llx, ACTION1 0x%016llx\n", + npu_fir_action0, npu_fir_action1); + + /* Set the NPU to fenced since it can't recover. */ + npu_set_fence_state(p, true); + + /* Set up the HMI event */ + hmi_evt->severity = OpalHMI_SEV_WARNING; + hmi_evt->type = OpalHMI_ERROR_MALFUNC_ALERT; + hmi_evt->u.xstop_error.xstop_type = CHECKSTOP_TYPE_NPU; + hmi_evt->u.xstop_error.u.chip_id = cpu_to_be32(flat_chip_id); + + /* The HMI is "recoverable" because it shouldn't crash the system */ + queue_hmi_event(hmi_evt, 1, out_flags); +} + +static void decode_malfunction(struct OpalHMIEvent *hmi_evt, uint64_t *out_flags) +{ + int i; + uint64_t malf_alert, flags; + + flags = 0; + + if (!setup_scom_addresses()) { + prerror("Failed to setup scom addresses\n"); + /* Send an unknown HMI event. */ + hmi_evt->u.xstop_error.xstop_type = CHECKSTOP_TYPE_UNKNOWN; + hmi_evt->u.xstop_error.xstop_reason = 0; + queue_hmi_event(hmi_evt, false, out_flags); + return; + } + + xscom_read(this_cpu()->chip_id, malf_alert_scom, &malf_alert); + + if (!malf_alert) + return; + + for (i = 0; i < 64; i++) { + if (malf_alert & PPC_BIT(i)) { + xscom_write(this_cpu()->chip_id, malf_alert_scom, + ~PPC_BIT(i)); + find_capp_checkstop_reason(i, hmi_evt, &flags); + find_nx_checkstop_reason(i, hmi_evt, &flags); + find_npu_checkstop_reason(i, hmi_evt, &flags); + } + } + + find_core_checkstop_reason(hmi_evt, &flags); + + /* + * If we fail to find checkstop reason, send an unknown HMI event. + */ + if (!(flags & OPAL_HMI_FLAGS_NEW_EVENT)) { + hmi_evt->u.xstop_error.xstop_type = CHECKSTOP_TYPE_UNKNOWN; + hmi_evt->u.xstop_error.xstop_reason = 0; + queue_hmi_event(hmi_evt, false, &flags); + } + *out_flags |= flags; +} + +/* + * This will "rendez-vous" all threads on the core to the rendez-vous + * id "sig". You need to make sure that "sig" is different from the + * previous rendez vous. The sig value must be between 0 and 7 with + * boot time being set to 0. + * + * Note: in theory, we could just use a flip flop "sig" in the thread + * structure (binary rendez-vous with no argument). This is a bit more + * debuggable and better at handling timeouts (arguably). + * + * This should be called with the no lock held + */ +static void hmi_rendez_vous(uint32_t sig) +{ + struct cpu_thread *t = this_cpu(); + uint32_t my_id = cpu_get_thread_index(t); + uint32_t my_shift = my_id << 2; + uint32_t *sptr = t->core_hmi_state_ptr; + uint32_t val, prev, shift, i; + uint64_t timeout; + + assert(sig <= 0x7); + + /* + * Mark ourselves as having reached the rendez vous point with + * the exit bit cleared + */ + do { + val = prev = *sptr; + val &= ~(0xfu << my_shift); + val |= sig << my_shift; + } while (cmpxchg32(sptr, prev, val) != prev); + + /* + * Wait for everybody else to reach that point, ignore the + * exit bit as another thread could have already set it. + */ + for (i = 0; i < cpu_thread_count; i++) { + shift = i << 2; + + timeout = TIMEOUT_LOOPS; + while (((*sptr >> shift) & 0x7) != sig && --timeout) + cpu_relax(); + if (!timeout) + prlog(PR_ERR, "Rendez-vous stage 1 timeout, CPU 0x%x" + " waiting for thread %d (sptr=%08x)\n", + t->pir, i, *sptr); + } + + /* Set the exit bit */ + do { + val = prev = *sptr; + val &= ~(0xfu << my_shift); + val |= (sig | 8) << my_shift; + } while (cmpxchg32(sptr, prev, val) != prev); + + /* At this point, we need to wait for everybody else to have a value + * that is *not* sig. IE. 
they either have set the exit bit *or* they + * have changed the rendez-vous (meaning they have moved on to another + * rendez vous point). + */ + for (i = 0; i < cpu_thread_count; i++) { + shift = i << 2; + + timeout = TIMEOUT_LOOPS; + while (((*sptr >> shift) & 0xf) == sig && --timeout) + cpu_relax(); + if (!timeout) + prlog(PR_ERR, "Rendez-vous stage 2 timeout, CPU 0x%x" + " waiting for thread %d (sptr=%08x)\n", + t->pir, i, *sptr); + } +} + +static void hmi_print_debug(const uint8_t *msg, uint64_t hmer) +{ + const char *loc; + uint32_t core_id, thread_index; + + core_id = pir_to_core_id(this_cpu()->pir); + thread_index = cpu_get_thread_index(this_cpu()); + + loc = chip_loc_code(this_cpu()->chip_id); + if (!loc) + loc = "Not Available"; + + /* Also covers P10 SPR_HMER_TFAC_SHADOW_XFER_ERROR */ + if (hmer & (SPR_HMER_TFAC_ERROR | SPR_HMER_TFMR_PARITY_ERROR)) { + prlog(PR_DEBUG, "[Loc: %s]: P:%d C:%d T:%d: TFMR(%016lx) %s\n", + loc, this_cpu()->chip_id, core_id, thread_index, + mfspr(SPR_TFMR), msg); + } else { + prlog(PR_DEBUG, "[Loc: %s]: P:%d C:%d T:%d: %s\n", + loc, this_cpu()->chip_id, core_id, thread_index, + msg); + } +} + +static int handle_thread_tfac_error(uint64_t tfmr, uint64_t *out_flags) +{ + int recover = 1; + + if (tfmr & SPR_TFMR_DEC_PARITY_ERR) + *out_flags |= OPAL_HMI_FLAGS_DEC_LOST; + if (!tfmr_recover_local_errors(tfmr)) + recover = 0; + tfmr &= ~(SPR_TFMR_PURR_PARITY_ERR | + SPR_TFMR_SPURR_PARITY_ERR | + SPR_TFMR_DEC_PARITY_ERR); + return recover; +} + +static int64_t opal_handle_hmi(void); + +static void opal_handle_hmi_job(void *data __unused) +{ + opal_handle_hmi(); +} + +/* + * Queue hmi handling job If secondaries are still in OPAL + * This function is called by thread 0. + */ +static struct cpu_job **hmi_kick_secondaries(void) +{ + struct cpu_thread *ts = this_cpu(); + struct cpu_job **hmi_jobs = NULL; + int job_sz = sizeof(struct cpu_job *) * cpu_thread_count; + int i; + + for (i = 1; i < cpu_thread_count; i++) { + ts = next_cpu(ts); + + /* Is this thread still in OPAL ? */ + if (ts->state == cpu_state_active) { + if (!hmi_jobs) { + hmi_jobs = zalloc(job_sz); + assert(hmi_jobs); + } + + prlog(PR_DEBUG, "Sending hmi job to thread %d\n", i); + hmi_jobs[i] = cpu_queue_job(ts, "handle_hmi_job", + opal_handle_hmi_job, NULL); + } + } + return hmi_jobs; +} + +static int handle_all_core_tfac_error(uint64_t tfmr, uint64_t *out_flags) +{ + struct cpu_thread *t, *t0; + int recover = -1; + struct cpu_job **hmi_jobs = NULL; + + t = this_cpu(); + t0 = find_cpu_by_pir(cpu_get_thread0(t)); + + if (t == t0 && t0->state == cpu_state_os) + hmi_jobs = hmi_kick_secondaries(); + + /* Rendez vous all threads */ + hmi_rendez_vous(1); + + /* We use a lock here as some of the TFMR bits are shared and I + * prefer avoiding doing the cleanup simultaneously. + */ + lock(&hmi_lock); + + /* First handle corrupt TFMR otherwise we can't trust anything. + * We'll use a lock here so that the threads don't try to do it at + * the same time + */ + if (tfmr & SPR_TFMR_TFMR_CORRUPT) { + /* Check if it's still in error state */ + if (mfspr(SPR_TFMR) & SPR_TFMR_TFMR_CORRUPT) + if (!recover_corrupt_tfmr()) { + unlock(&hmi_lock); + recover = 0; + goto error_out; + } + + tfmr = mfspr(SPR_TFMR); + + /* We could have got new thread errors in the meantime */ + if (tfmr & SPR_TFMR_THREAD_ERRORS) { + recover = handle_thread_tfac_error(tfmr, out_flags); + tfmr &= ~SPR_TFMR_THREAD_ERRORS; + } + if (!recover) { + unlock(&hmi_lock); + goto error_out; + } + } + + /* Tell the OS ... 
*/ + if (tfmr & SPR_TFMR_HDEC_PARITY_ERROR) + *out_flags |= OPAL_HMI_FLAGS_HDEC_LOST; + + /* Cleanup bad HDEC or TB on all threads or subcures before we clear + * the error conditions + */ + tfmr_cleanup_core_errors(tfmr); + + /* Unlock before next rendez-vous */ + unlock(&hmi_lock); + + /* Second rendez vous, ensure the above cleanups are all done before + * we proceed further + */ + hmi_rendez_vous(2); + + /* We can now clear the error conditions in the core. */ + recover = tfmr_clear_core_errors(tfmr); + if (recover == 0) + goto error_out; + + /* Third rendez-vous. We could in theory do the timebase resync as + * part of the previous one, but I prefer having all the error + * conditions cleared before we start trying. + */ + hmi_rendez_vous(3); + + /* Now perform the actual TB recovery on thread 0 */ + if (t == t0) + recover = chiptod_recover_tb_errors(&this_cpu()->tb_resynced); + +error_out: + /* Last rendez-vous */ + hmi_rendez_vous(4); + + /* Now all threads have gone past rendez-vous 3 and not yet past another + * rendez-vous 1, so the value of tb_resynced of thread 0 of the core + * contains an accurate indication as to whether the timebase was lost. + */ + if (t0->tb_resynced) + *out_flags |= OPAL_HMI_FLAGS_TB_RESYNC; + + if (t == t0 && hmi_jobs) { + int i; + for (i = 1; i < cpu_thread_count; i++) + if (hmi_jobs[i]) + cpu_wait_job(hmi_jobs[i], true); + free(hmi_jobs); + } + + return recover; +} + +static uint64_t read_tfmr_t0(void) +{ + uint64_t tfmr_t0; + uint32_t chip_id = this_cpu()->chip_id; + uint32_t core_id = pir_to_core_id(this_cpu()->pir); + + lock(&hmi_lock); + + xscom_write(chip_id, XSCOM_ADDR_P9_EC(core_id, P9_SCOM_SPRC), + SETFIELD(P9_SCOMC_SPR_SELECT, 0, P9_SCOMC_TFMR_T0)); + xscom_read(chip_id, XSCOM_ADDR_P9_EC(core_id, P9_SCOM_SPRD), + &tfmr_t0); + unlock(&hmi_lock); + return tfmr_t0; +} + +/* P9 errata: In theory, an HDEC error is sent to all threads. However, + * due to an errata on P9 where TFMR bit 26 (HDEC parity) cannot be + * cleared on thread 1..3, I am not confident we can do a rendez-vous + * in all cases. + * + * Our current approach is to ignore that error unless it is present + * on thread 0 TFMR. Also, ignore TB residue error due to a similar + * errata as above. 
+ */ +static void validate_latched_errors(uint64_t *tfmr) +{ + if ((*tfmr & (SPR_TFMR_HDEC_PARITY_ERROR | SPR_TFMR_TB_RESIDUE_ERR)) + && this_cpu()->is_secondary) { + uint64_t tfmr_t0 = read_tfmr_t0(); + + if (!(tfmr_t0 & SPR_TFMR_HDEC_PARITY_ERROR)) + *tfmr &= ~SPR_TFMR_HDEC_PARITY_ERROR; + + if (!(tfmr_t0 & SPR_TFMR_TB_RESIDUE_ERR)) + *tfmr &= ~SPR_TFMR_TB_RESIDUE_ERR; + } +} + +static int handle_tfac_errors(struct OpalHMIEvent *hmi_evt, uint64_t *out_flags) +{ + int recover = -1; + uint64_t tfmr = mfspr(SPR_TFMR); + + /* Initialize the hmi event with old value of TFMR */ + hmi_evt->tfmr = cpu_to_be64(tfmr); + + /* A TFMR parity/corrupt error makes us ignore all the local stuff.*/ + if (tfmr & SPR_TFMR_TFMR_CORRUPT) { + /* Mark TB as invalid for now as we don't trust TFMR, we'll fix + * it up later + */ + this_cpu()->tb_invalid = true; + goto bad_tfmr; + } + + this_cpu()->tb_invalid = !(tfmr & SPR_TFMR_TB_VALID); + + if (proc_gen == proc_gen_p9) + validate_latched_errors(&tfmr); + + /* First, handle thread local errors */ + if (tfmr & SPR_TFMR_THREAD_ERRORS) { + recover = handle_thread_tfac_error(tfmr, out_flags); + tfmr &= ~SPR_TFMR_THREAD_ERRORS; + } + + bad_tfmr: + + /* Let's see if we still have a all-core error to deal with, if + * not, we just bail out + */ + if (tfmr & SPR_TFMR_CORE_ERRORS) { + int recover2; + + /* Only update "recover" if it's not already 0 (non-recovered) + */ + recover2 = handle_all_core_tfac_error(tfmr, out_flags); + if (recover != 0) + recover = recover2; + } else if (tfmr & SPR_TFMR_CHIP_TOD_INTERRUPT) { + int recover2; + + /* + * There are some TOD errors which do not affect working of + * TOD and TB. They stay in valid state. Hence we don't need + * rendez vous. + * + * TOD errors that affects TOD/TB will report a global error + * on TFMR alongwith bit 51, and they will go in rendez vous. + */ + recover2 = chiptod_recover_tod_errors(); + if (recover != 0) + recover = recover2; + } else if (this_cpu()->tb_invalid) { + /* This shouldn't happen, TB is invalid and no global error + * was reported. We just return for now assuming one will + * be. We can't do a rendez vous without a core-global HMI. + */ + prlog(PR_ERR, "HMI: TB invalid without core error reported ! " + "CPU=%x, TFMR=0x%016lx\n", this_cpu()->pir, + mfspr(SPR_TFMR)); + } + + if (recover != -1 && hmi_evt) { + hmi_evt->severity = OpalHMI_SEV_ERROR_SYNC; + hmi_evt->type = OpalHMI_ERROR_TFAC; + queue_hmi_event(hmi_evt, recover, out_flags); + } + + /* Set the TB state looking at TFMR register before we head out. */ + this_cpu()->tb_invalid = !(mfspr(SPR_TFMR) & SPR_TFMR_TB_VALID); + + if (this_cpu()->tb_invalid) { + *out_flags |= OPAL_HMI_FLAGS_TOD_TB_FAIL; + prlog(PR_WARNING, "Failed to get TB in running state! 
" + "CPU=%x, TFMR=%016lx\n", this_cpu()->pir, + mfspr(SPR_TFMR)); + } + + return recover; +} + +static int handle_hmi_exception(uint64_t hmer, struct OpalHMIEvent *hmi_evt, + uint64_t *out_flags) +{ + struct cpu_thread *cpu = this_cpu(); + int recover = 1; + uint64_t handled = 0; + + prlog(PR_DEBUG, "Received HMI interrupt: HMER = 0x%016llx\n", hmer); + /* Initialize the hmi event with old value of HMER */ + if (hmi_evt) + hmi_evt->hmer = cpu_to_be64(hmer); + + /* Handle Timer/TOD errors separately */ + if (hmer & (SPR_HMER_TFAC_ERROR | SPR_HMER_TFMR_PARITY_ERROR)) { + hmi_print_debug("Timer Facility Error", hmer); + handled = hmer & (SPR_HMER_TFAC_ERROR | SPR_HMER_TFMR_PARITY_ERROR); + mtspr(SPR_HMER, ~handled); + recover = handle_tfac_errors(hmi_evt, out_flags); + handled = 0; + } + + lock(&hmi_lock); + /* + * Not all HMIs would move TB into invalid state. Set the TB state + * looking at TFMR register. TFMR will tell us correct state of + * TB register. + */ + if (hmer & SPR_HMER_PROC_RECV_DONE) { + uint32_t chip_id = pir_to_chip_id(cpu->pir); + uint32_t core_id = pir_to_core_id(cpu->pir); + uint64_t core_wof; + + hmi_print_debug("Processor recovery occurred.", hmer); + if (!read_core_wof(chip_id, core_id, &core_wof)) { + int i; + + prlog(PR_DEBUG, "Core WOF = 0x%016llx recovered error:\n", core_wof); + if (proc_gen <= proc_gen_p9) { + for (i = 0; i < ARRAY_SIZE(p9_recoverable_bits); i++) { + if (core_wof & PPC_BIT(p9_recoverable_bits[i].bit)) + prlog(PR_DEBUG, " %s\n", p9_recoverable_bits[i].reason); + } + } else if (proc_gen == proc_gen_p10) { + for (i = 0; i < ARRAY_SIZE(p10_core_fir_bits); i++) { + if (core_wof & PPC_BIT(p10_core_fir_bits[i].bit)) + prlog(PR_DEBUG, " %s\n", p10_core_fir_bits[i].reason); + } + } + } + + handled |= SPR_HMER_PROC_RECV_DONE; + if (cpu_is_thread0(cpu) && hmi_evt) { + hmi_evt->severity = OpalHMI_SEV_NO_ERROR; + hmi_evt->type = OpalHMI_ERROR_PROC_RECOV_DONE; + queue_hmi_event(hmi_evt, recover, out_flags); + } + } + + if ((proc_gen <= proc_gen_p9) && (hmer & SPR_HMER_PROC_RECV_ERROR_MASKED)) { + handled |= SPR_HMER_PROC_RECV_ERROR_MASKED; + if (cpu_is_thread0(cpu) && hmi_evt) { + hmi_evt->severity = OpalHMI_SEV_NO_ERROR; + hmi_evt->type = OpalHMI_ERROR_PROC_RECOV_MASKED; + queue_hmi_event(hmi_evt, recover, out_flags); + } + hmi_print_debug("Processor recovery Done (masked).", hmer); + } + + if (hmer & SPR_HMER_PROC_RECV_AGAIN) { + handled |= SPR_HMER_PROC_RECV_AGAIN; + if (cpu_is_thread0(cpu) && hmi_evt) { + hmi_evt->severity = OpalHMI_SEV_NO_ERROR; + hmi_evt->type = OpalHMI_ERROR_PROC_RECOV_DONE_AGAIN; + queue_hmi_event(hmi_evt, recover, out_flags); + } + hmi_print_debug("Processor recovery occurred again before" + "bit2 was cleared\n", hmer); + } + + /* XXX: what to do with this? */ + if (hmer & SPR_HMER_SPURR_SCALE_LIMIT) { + handled |= SPR_HMER_SPURR_SCALE_LIMIT; + if (cpu_is_thread0(cpu) && hmi_evt) { + hmi_evt->severity = OpalHMI_SEV_NO_ERROR; + hmi_evt->type = OpalHMI_ERROR_PROC_RECOV_DONE; + queue_hmi_event(hmi_evt, recover, out_flags); + } + hmi_print_debug("Turbo versus nominal frequency exceeded limit.", hmer); + } + + /* Assert if we see malfunction alert, we can not continue. */ + if (hmer & SPR_HMER_MALFUNCTION_ALERT) { + handled |= SPR_HMER_MALFUNCTION_ALERT; + + hmi_print_debug("Malfunction Alert", hmer); + recover = 0; + if (hmi_evt) + decode_malfunction(hmi_evt, out_flags); + } + + /* Assert if we see Hypervisor resource error, we can not continue. 
*/ + if ((proc_gen <= proc_gen_p9) && (hmer & SPR_HMER_HYP_RESOURCE_ERR)) { + handled |= SPR_HMER_HYP_RESOURCE_ERR; + + hmi_print_debug("Hypervisor resource error", hmer); + recover = 0; + if (hmi_evt) { + hmi_evt->severity = OpalHMI_SEV_FATAL; + hmi_evt->type = OpalHMI_ERROR_HYP_RESOURCE; + queue_hmi_event(hmi_evt, recover, out_flags); + } + } + + /* XXX: what to do with this? */ + if ((proc_gen <= proc_gen_p9) && (hmer & SPR_HMER_THD_WAKE_BLOCKED_TM_SUSPEND)) { + handled |= SPR_HMER_THD_WAKE_BLOCKED_TM_SUSPEND; + hmer &= ~SPR_HMER_THD_WAKE_BLOCKED_TM_SUSPEND; + + hmi_print_debug("Attempted to wake thread when threads in TM suspend mode.", hmer); + if (hmi_evt) { + hmi_evt->severity = OpalHMI_SEV_NO_ERROR; + hmi_evt->type = OpalHMI_ERROR_PROC_RECOV_DONE, + queue_hmi_event(hmi_evt, recover, out_flags); + } + } + + if ((proc_gen <= proc_gen_p9) && (hmer & SPR_HMER_TRIG_FIR_HMI)) { + handled |= SPR_HMER_TRIG_FIR_HMI; + hmer &= ~SPR_HMER_TRIG_FIR_HMI; + + hmi_print_debug("Clearing unknown debug trigger", hmer); + if (hmi_evt) { + hmi_evt->severity = OpalHMI_SEV_NO_ERROR; + hmi_evt->type = OpalHMI_ERROR_DEBUG_TRIG_FIR, + queue_hmi_event(hmi_evt, recover, out_flags); + } + } + if ((proc_gen == proc_gen_p10) && (hmer & SPR_HMER_P10_TRIG_FIR_HMI)) { + handled |= SPR_HMER_P10_TRIG_FIR_HMI; + hmer &= ~SPR_HMER_P10_TRIG_FIR_HMI; + + hmi_print_debug("Clearing unknown debug trigger", hmer); + if (hmi_evt) { + hmi_evt->severity = OpalHMI_SEV_NO_ERROR; + hmi_evt->type = OpalHMI_ERROR_DEBUG_TRIG_FIR, + queue_hmi_event(hmi_evt, recover, out_flags); + } + } + + if (recover == 0) + disable_fast_reboot("Unrecoverable HMI"); + /* + * HMER bits are sticky, once set to 1 they remain set to 1 until + * they are set to 0. Reset the error source bit to 0, otherwise + * we keep getting HMI interrupt again and again. Writing to HMER + * acts as an AND, so we write mask of all 1's except for the bits + * we want to clear. + */ + mtspr(SPR_HMER, ~handled); + unlock(&hmi_lock); + return recover; +} + +static int64_t opal_handle_hmi(void) +{ + uint64_t hmer, dummy_flags; + struct OpalHMIEvent hmi_evt; + + /* + * Compiled time check to see size of OpalHMIEvent do not exceed + * that of struct opal_msg. + */ + BUILD_ASSERT(sizeof(struct opal_msg) >= sizeof(struct OpalHMIEvent)); + + memset(&hmi_evt, 0, sizeof(struct OpalHMIEvent)); + hmi_evt.version = OpalHMIEvt_V2; + + hmer = mfspr(SPR_HMER); /* Get HMER register value */ + handle_hmi_exception(hmer, &hmi_evt, &dummy_flags); + + return OPAL_SUCCESS; +} +opal_call(OPAL_HANDLE_HMI, opal_handle_hmi, 0); + +static int64_t opal_handle_hmi2(__be64 *out_flags) +{ + uint64_t hmer, flags = 0; + struct OpalHMIEvent hmi_evt; + + /* + * Compiled time check to see size of OpalHMIEvent do not exceed + * that of struct opal_msg. + */ + BUILD_ASSERT(sizeof(struct opal_msg) >= sizeof(struct OpalHMIEvent)); + + memset(&hmi_evt, 0, sizeof(struct OpalHMIEvent)); + hmi_evt.version = OpalHMIEvt_V2; + + hmer = mfspr(SPR_HMER); /* Get HMER register value */ + handle_hmi_exception(hmer, &hmi_evt, &flags); + *out_flags = cpu_to_be64(flags); + + return OPAL_SUCCESS; +} +opal_call(OPAL_HANDLE_HMI2, opal_handle_hmi2, 1); diff --git a/roms/skiboot/core/i2c.c b/roms/skiboot/core/i2c.c new file mode 100644 index 000000000..b4313d430 --- /dev/null +++ b/roms/skiboot/core/i2c.c @@ -0,0 +1,288 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* + * I2C + * + * Copyright 2013-2019 IBM Corp. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static LIST_HEAD(i2c_bus_list); + +/* Used to assign OPAL IDs */ +static uint32_t i2c_next_bus; + +void i2c_add_bus(struct i2c_bus *bus) +{ + bus->opal_id = ++i2c_next_bus; + dt_add_property_cells(bus->dt_node, "ibm,opal-id", bus->opal_id); + + list_add_tail(&i2c_bus_list, &bus->link); +} + +struct i2c_bus *i2c_find_bus_by_id(uint32_t opal_id) +{ + struct i2c_bus *bus; + + list_for_each(&i2c_bus_list, bus, link) { + if (bus->opal_id == opal_id) + return bus; + } + return NULL; +} + +static inline void i2c_trace_req(struct i2c_request *req, int rc) +{ + struct trace_i2c t; + + memset(&t, 0, sizeof(t)); + + t.bus = req->bus->opal_id; + t.type = req->op | (req->offset_bytes << 4); + t.i2c_addr = req->dev_addr; + t.smbus_reg = req->offset & 0xffff; // FIXME: log whole offset + t.size = req->rw_len; + t.rc = rc; + + /* FIXME: trace should not be a union... */ + trace_add((void *)&t, TRACE_I2C, sizeof(t)); +} + +int64_t i2c_queue_req(struct i2c_request *req) +{ + int64_t ret = req->bus->queue_req(req); + + i2c_trace_req(req, OPAL_ASYNC_COMPLETION); + + if (!ret) + req->req_state = i2c_req_queued; + return ret; +} + +static void opal_i2c_request_complete(int rc, struct i2c_request *req) +{ + uint64_t token = (uint64_t)(unsigned long)req->user_data; + + opal_queue_msg(OPAL_MSG_ASYNC_COMP, NULL, NULL, + cpu_to_be64(token), + cpu_to_be64(rc)); + i2c_trace_req(req, rc); + + free(req); +} + +static int opal_i2c_request(uint64_t async_token, uint32_t bus_id, + struct opal_i2c_request *oreq) +{ + struct i2c_bus *bus = NULL; + struct i2c_request *req; + int rc; + + if (!opal_addr_valid(oreq)) + return OPAL_PARAMETER; + + if (oreq->flags & OPAL_I2C_ADDR_10) + return OPAL_UNSUPPORTED; + + bus = i2c_find_bus_by_id(bus_id); + if (!bus) { + /** + * @fwts-label I2CInvalidBusID + * @fwts-advice opal_i2c_request was passed an invalid bus + * ID. This has likely come from the OS rather than OPAL + * and thus could indicate an OS bug rather than an OPAL + * bug. + */ + prlog(PR_ERR, "I2C: Invalid 'bus_id' passed to the OPAL\n"); + return OPAL_PARAMETER; + } + + req = zalloc(sizeof(*req)); + if (!req) { + /** + * @fwts-label I2CFailedAllocation + * @fwts-advice OPAL failed to allocate memory for an + * i2c_request. This points to an OPAL bug as OPAL ran + * out of memory and this should never happen. 
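For reference, the asynchronous path behind OPAL_I2C_REQUEST in this file boils down to filling in a struct i2c_request and handing it to i2c_queue_req() with a completion callback. A hypothetical minimal caller is sketched below; the device address and offset size are made-up illustrative values:

static void example_i2c_done(int rc, struct i2c_request *req)
{
	prlog(PR_DEBUG, "I2C: async request completed, rc=%d\n", rc);
	free(req);
}

static int64_t example_i2c_async_read(struct i2c_bus *bus, void *buf, size_t len)
{
	struct i2c_request *req = zalloc(sizeof(*req));

	if (!req)
		return OPAL_NO_MEM;

	req->bus = bus;
	req->dev_addr = 0x50;	/* made-up device address */
	req->op = SMBUS_READ;
	req->offset = 0;	/* made-up register offset */
	req->offset_bytes = 1;
	req->rw_buf = buf;
	req->rw_len = len;
	req->completion = example_i2c_done;

	return i2c_queue_req(req);
}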
+ */ + prlog(PR_ERR, "I2C: Failed to allocate 'i2c_request'\n"); + return OPAL_NO_MEM; + } + + switch(oreq->type) { + case OPAL_I2C_RAW_READ: + req->op = I2C_READ; + break; + case OPAL_I2C_RAW_WRITE: + req->op = I2C_WRITE; + break; + case OPAL_I2C_SM_READ: + req->op = SMBUS_READ; + req->offset = be32_to_cpu(oreq->subaddr); + req->offset_bytes = oreq->subaddr_sz; + break; + case OPAL_I2C_SM_WRITE: + req->op = SMBUS_WRITE; + req->offset = be32_to_cpu(oreq->subaddr); + req->offset_bytes = oreq->subaddr_sz; + break; + default: + free(req); + return OPAL_PARAMETER; + } + req->dev_addr = be16_to_cpu(oreq->addr); + req->rw_len = be32_to_cpu(oreq->size); + req->rw_buf = (void *)be64_to_cpu(oreq->buffer_ra); + req->completion = opal_i2c_request_complete; + req->user_data = (void *)(unsigned long)async_token; + req->bus = bus; + + if (i2c_check_quirk(req, &rc)) { + free(req); + return rc; + } + + /* Finally, queue the OPAL i2c request and return */ + rc = i2c_queue_req(req); + if (rc) { + free(req); + return rc; + } + + return OPAL_ASYNC_COMPLETION; +} +opal_call(OPAL_I2C_REQUEST, opal_i2c_request, 3); + +#define MAX_NACK_RETRIES 2 +#define REQ_COMPLETE_POLLING 5 /* Check if req is complete + in 5ms interval */ +int64_t i2c_request_sync(struct i2c_request *req) +{ + uint64_t timer_period = msecs_to_tb(5), timer_count; + uint64_t time_to_wait = 0; + int64_t rc, waited, retries; + size_t i, count; + char buf[17]; /* 8 bytes in hex + NUL */ + + for (retries = 0; retries <= MAX_NACK_RETRIES; retries++) { + waited = 0; + timer_count = 0; + + i2c_queue_req(req); + + do { + time_to_wait = i2c_run_req(req); + if (!time_to_wait) + time_to_wait = REQ_COMPLETE_POLLING; + time_wait(time_to_wait); + waited += time_to_wait; + timer_count += time_to_wait; + if (timer_count > timer_period) { + /* + * The above request may be relying on + * timers to complete, yet there may + * not be called, especially during + * opal init. We could be looping here + * forever. So explicitly check the + * timers once in a while + */ + check_timers(false); + timer_count = 0; + } + } while (req->req_state != i2c_req_done); + + lwsync(); + rc = req->result; + + /* retry on NACK, otherwise exit */ + if (rc != OPAL_I2C_NACK_RCVD) + break; + req->req_state = i2c_req_new; + } + + i2c_trace_req(req, rc); + count = 0; + for (i = 0; i < req->rw_len && count < sizeof(buf); i++) { + count += snprintf(buf+count, sizeof(buf)-count, "%02x", + *(unsigned char *)(req->rw_buf+i)); + } + + prlog(PR_DEBUG, "I2C: %s req op=%x offset=%x buf=%s buflen=%d " + "delay=%lu/%lld rc=%lld\n", + (rc) ? "!!!!" 
: "----", req->op, req->offset, + buf, req->rw_len, tb_to_msecs(waited), req->timeout, rc); + + return rc; +} + +/** + * i2c_request_send - send request to i2c bus synchronously + * @bus_id: i2c bus id + * @dev_addr: address of the device + * @read_write: SMBUS_READ or SMBUS_WRITE + * @offset: any of the I2C interface offset defined + * @offset_bytes: offset size in bytes + * @buf: data to be read or written + * @buflen: buf length + * @timeout: request timeout in milliseconds + * + * Send an I2C request to a device synchronously + * + * Returns: Zero on success otherwise a negative error code + */ +int64_t i2c_request_send(int bus_id, int dev_addr, int read_write, + uint32_t offset, uint32_t offset_bytes, void* buf, + size_t buflen, int timeout) +{ + struct i2c_request *req; + struct i2c_bus *bus; + int64_t rc; + + bus = i2c_find_bus_by_id(bus_id); + if (!bus) { + /** + * @fwts-label I2CInvalidBusID + * @fwts-advice i2c_request_send was passed an invalid bus + * ID. This indicates a bug. + */ + prlog(PR_ERR, "I2C: Invalid bus_id=%x\n", bus_id); + return OPAL_PARAMETER; + } + + req = zalloc(sizeof(*req)); + if (!req) { + /** + * @fwts-label I2CAllocationFailed + * @fwts-advice OPAL failed to allocate memory for an + * i2c_request. This points to an OPAL bug as OPAL run out of + * memory and this should never happen. + */ + prlog(PR_ERR, "I2C: allocating i2c_request failed\n"); + return OPAL_INTERNAL_ERROR; + } + + req->bus = bus; + req->dev_addr = dev_addr; + req->op = read_write; + req->offset = offset; + req->offset_bytes = offset_bytes; + req->rw_buf = (void*) buf; + req->rw_len = buflen; + req->timeout = timeout; + + rc = i2c_request_sync(req); + + free(req); + if (rc) + return OPAL_HARDWARE; + + return OPAL_SUCCESS; +} diff --git a/roms/skiboot/core/init.c b/roms/skiboot/core/init.c new file mode 100644 index 000000000..a8bac28a8 --- /dev/null +++ b/roms/skiboot/core/init.c @@ -0,0 +1,1469 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* + * skiboot C entry point + * + * Copyright 2013-2019 IBM Corp. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +enum proc_gen proc_gen; +unsigned int pcie_max_link_speed; +bool pci_tracing; +bool verbose_eeh; +extern const char version[]; + +static uint64_t kernel_entry; +static size_t kernel_size; +static bool kernel_32bit; + +/* We backup the previous vectors here before copying our own */ +static uint8_t old_vectors[EXCEPTION_VECTORS_END]; + +#ifdef DEBUG +#define DEBUG_STR "-debug" +#else +#define DEBUG_STR "" +#endif + +#ifdef SKIBOOT_GCOV +void skiboot_gcov_done(void); +#endif + +struct debug_descriptor debug_descriptor = { + .eye_catcher = "OPALdbug", + .version = CPU_TO_BE32(DEBUG_DESC_VERSION), + .state_flags = 0, + .memcons_phys = 0, /* cpu_to_be64(&memcons) can't init constant */ + .trace_mask = 0, /* All traces disabled by default */ + /* console log level: + * high 4 bits in memory, low 4 bits driver (e.g. uart). 
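A minimal usage sketch for i2c_request_send() as documented in i2c.c above; the bus id, device address, register offset and timeout are made-up values chosen only for illustration:

static int64_t example_i2c_sync_read(void)
{
	uint8_t val;
	int64_t rc;

	/* Made-up values: bus 2, device 0x50, one-byte offset 0x10, 100ms */
	rc = i2c_request_send(2, 0x50, SMBUS_READ, 0x10, 1,
			      &val, sizeof(val), 100);
	if (rc)
		return rc;

	prlog(PR_DEBUG, "I2C: read back 0x%02x\n", val);
	return OPAL_SUCCESS;
}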
*/ +#ifdef DEBUG + .console_log_levels = (PR_TRACE << 4) | PR_DEBUG, +#else + .console_log_levels = (PR_DEBUG << 4) | PR_NOTICE, +#endif +}; + +static void checksum_romem(void); + +static bool try_load_elf64_le(struct elf_hdr *header) +{ + struct elf64le_hdr *kh = (struct elf64le_hdr *)header; + uint64_t load_base = (uint64_t)kh; + struct elf64le_phdr *ph; + unsigned int i; + + printf("INIT: 64-bit LE kernel discovered\n"); + + /* Look for a loadable program header that has our entry in it + * + * Note that we execute the kernel in-place, we don't actually + * obey the load informations in the headers. This is expected + * to work for the Linux Kernel because it's a fairly dumb ELF + * but it will not work for any ELF binary. + */ + ph = (struct elf64le_phdr *)(load_base + le64_to_cpu(kh->e_phoff)); + for (i = 0; i < le16_to_cpu(kh->e_phnum); i++, ph++) { + if (le32_to_cpu(ph->p_type) != ELF_PTYPE_LOAD) + continue; + if (le64_to_cpu(ph->p_vaddr) > le64_to_cpu(kh->e_entry) || + (le64_to_cpu(ph->p_vaddr) + le64_to_cpu(ph->p_memsz)) < + le64_to_cpu(kh->e_entry)) + continue; + + /* Get our entry */ + kernel_entry = le64_to_cpu(kh->e_entry) - + le64_to_cpu(ph->p_vaddr) + le64_to_cpu(ph->p_offset); + break; + } + + if (!kernel_entry) { + prerror("INIT: Failed to find kernel entry !\n"); + return false; + } + kernel_entry += load_base; + kernel_32bit = false; + + kernel_size = le64_to_cpu(kh->e_shoff) + + ((uint32_t)le16_to_cpu(kh->e_shentsize) * + (uint32_t)le16_to_cpu(kh->e_shnum)); + + prlog(PR_DEBUG, "INIT: 64-bit kernel entry at 0x%llx, size 0x%lx\n", + kernel_entry, kernel_size); + + return true; +} + +static bool try_load_elf64(struct elf_hdr *header) +{ + struct elf64be_hdr *kh = (struct elf64be_hdr *)header; + struct elf64le_hdr *khle = (struct elf64le_hdr *)header; + uint64_t load_base = (uint64_t)kh; + struct elf64be_phdr *ph; + struct elf64be_shdr *sh; + unsigned int i; + + /* Check it's a ppc64 LE ELF */ + if (khle->ei_ident == ELF_IDENT && + khle->ei_data == ELF_DATA_LSB && + le16_to_cpu(khle->e_machine) == ELF_MACH_PPC64) { + return try_load_elf64_le(header); + } + + /* Check it's a ppc64 ELF */ + if (kh->ei_ident != ELF_IDENT || + kh->ei_data != ELF_DATA_MSB || + be16_to_cpu(kh->e_machine) != ELF_MACH_PPC64) { + prerror("INIT: Kernel doesn't look like an ppc64 ELF\n"); + return false; + } + + /* Look for a loadable program header that has our entry in it + * + * Note that we execute the kernel in-place, we don't actually + * obey the load informations in the headers. This is expected + * to work for the Linux Kernel because it's a fairly dumb ELF + * but it will not work for any ELF binary. + */ + ph = (struct elf64be_phdr *)(load_base + be64_to_cpu(kh->e_phoff)); + for (i = 0; i < be16_to_cpu(kh->e_phnum); i++, ph++) { + if (be32_to_cpu(ph->p_type) != ELF_PTYPE_LOAD) + continue; + if (be64_to_cpu(ph->p_vaddr) > be64_to_cpu(kh->e_entry) || + (be64_to_cpu(ph->p_vaddr) + be64_to_cpu(ph->p_memsz)) < + be64_to_cpu(kh->e_entry)) + continue; + + /* Get our entry */ + kernel_entry = be64_to_cpu(kh->e_entry) - + be64_to_cpu(ph->p_vaddr) + be64_to_cpu(ph->p_offset); + break; + } + + if (!kernel_entry) { + prerror("INIT: Failed to find kernel entry !\n"); + return false; + } + + /* For the normal big-endian ELF ABI, the kernel entry points + * to a function descriptor in the data section. Linux instead + * has it point directly to code. Test whether it is pointing + * into an executable section or not to figure this out. Default + * to assuming it obeys the ABI. 
+ */ + sh = (struct elf64be_shdr *)(load_base + be64_to_cpu(kh->e_shoff)); + for (i = 0; i < be16_to_cpu(kh->e_shnum); i++, sh++) { + if (be64_to_cpu(sh->sh_addr) <= be64_to_cpu(kh->e_entry) && + (be64_to_cpu(sh->sh_addr) + be64_to_cpu(sh->sh_size)) > + be64_to_cpu(kh->e_entry)) + break; + } + + if (i == be16_to_cpu(kh->e_shnum) || + !(be64_to_cpu(sh->sh_flags) & ELF_SFLAGS_X)) { + kernel_entry = *(uint64_t *)(kernel_entry + load_base); + kernel_entry = kernel_entry - + be64_to_cpu(ph->p_vaddr) + be64_to_cpu(ph->p_offset); + } + + kernel_entry += load_base; + kernel_32bit = false; + + kernel_size = be64_to_cpu(kh->e_shoff) + + ((uint32_t)be16_to_cpu(kh->e_shentsize) * + (uint32_t)be16_to_cpu(kh->e_shnum)); + + printf("INIT: 64-bit kernel entry at 0x%llx, size 0x%lx\n", + kernel_entry, kernel_size); + + return true; +} + +static bool try_load_elf32_le(struct elf_hdr *header) +{ + struct elf32le_hdr *kh = (struct elf32le_hdr *)header; + uint64_t load_base = (uint64_t)kh; + struct elf32le_phdr *ph; + unsigned int i; + + printf("INIT: 32-bit LE kernel discovered\n"); + + /* Look for a loadable program header that has our entry in it + * + * Note that we execute the kernel in-place, we don't actually + * obey the load informations in the headers. This is expected + * to work for the Linux Kernel because it's a fairly dumb ELF + * but it will not work for any ELF binary. + */ + ph = (struct elf32le_phdr *)(load_base + le32_to_cpu(kh->e_phoff)); + for (i = 0; i < le16_to_cpu(kh->e_phnum); i++, ph++) { + if (le32_to_cpu(ph->p_type) != ELF_PTYPE_LOAD) + continue; + if (le32_to_cpu(ph->p_vaddr) > le32_to_cpu(kh->e_entry) || + (le32_to_cpu(ph->p_vaddr) + le32_to_cpu(ph->p_memsz)) < + le32_to_cpu(kh->e_entry)) + continue; + + /* Get our entry */ + kernel_entry = le32_to_cpu(kh->e_entry) - + le32_to_cpu(ph->p_vaddr) + le32_to_cpu(ph->p_offset); + break; + } + + if (!kernel_entry) { + prerror("INIT: Failed to find kernel entry !\n"); + return false; + } + + kernel_entry += load_base; + kernel_32bit = true; + + printf("INIT: 32-bit kernel entry at 0x%llx\n", kernel_entry); + + return true; +} + +static bool try_load_elf32(struct elf_hdr *header) +{ + struct elf32be_hdr *kh = (struct elf32be_hdr *)header; + struct elf32le_hdr *khle = (struct elf32le_hdr *)header; + uint64_t load_base = (uint64_t)kh; + struct elf32be_phdr *ph; + unsigned int i; + + /* Check it's a ppc32 LE ELF */ + if (khle->ei_ident == ELF_IDENT && + khle->ei_data == ELF_DATA_LSB && + le16_to_cpu(khle->e_machine) == ELF_MACH_PPC32) { + return try_load_elf32_le(header); + } + + /* Check it's a ppc32 ELF */ + if (kh->ei_ident != ELF_IDENT || + kh->ei_data != ELF_DATA_MSB || + be16_to_cpu(kh->e_machine) != ELF_MACH_PPC32) { + prerror("INIT: Kernel doesn't look like an ppc32 ELF\n"); + return false; + } + + /* Look for a loadable program header that has our entry in it + * + * Note that we execute the kernel in-place, we don't actually + * obey the load informations in the headers. This is expected + * to work for the Linux Kernel because it's a fairly dumb ELF + * but it will not work for any ELF binary. 
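All of the try_load_elf* variants translate the ELF entry point the same way: the virtual entry address is rebased through the program header that contains it into a file offset, then added to the in-place load address. A hypothetical helper spelling out that arithmetic, under the same assumptions as the loaders above:

static uint64_t example_entry_to_load_addr(uint64_t load_base, uint64_t e_entry,
					   uint64_t p_vaddr, uint64_t p_offset)
{
	/* virtual entry -> offset inside the image -> runtime address */
	return load_base + (e_entry - p_vaddr) + p_offset;
}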
+ */ + ph = (struct elf32be_phdr *)(load_base + be32_to_cpu(kh->e_phoff)); + for (i = 0; i < be16_to_cpu(kh->e_phnum); i++, ph++) { + if (be32_to_cpu(ph->p_type) != ELF_PTYPE_LOAD) + continue; + if (be32_to_cpu(ph->p_vaddr) > be32_to_cpu(kh->e_entry) || + (be32_to_cpu(ph->p_vaddr) + be32_to_cpu(ph->p_memsz)) < + be32_to_cpu(kh->e_entry)) + continue; + + /* Get our entry */ + kernel_entry = be32_to_cpu(kh->e_entry) - + be32_to_cpu(ph->p_vaddr) + be32_to_cpu(ph->p_offset); + break; + } + + if (!kernel_entry) { + prerror("INIT: Failed to find kernel entry !\n"); + return false; + } + + kernel_entry += load_base; + kernel_32bit = true; + + printf("INIT: 32-bit kernel entry at 0x%llx\n", kernel_entry); + + return true; +} + +extern char __builtin_kernel_start[]; +extern char __builtin_kernel_end[]; +extern uint64_t boot_offset; + +static size_t initramfs_size; + +bool start_preload_kernel(void) +{ + int loaded; + + /* Try to load an external kernel payload through the platform hooks */ + kernel_size = KERNEL_LOAD_SIZE; + loaded = start_preload_resource(RESOURCE_ID_KERNEL, + RESOURCE_SUBID_NONE, + KERNEL_LOAD_BASE, + &kernel_size); + if (loaded != OPAL_SUCCESS) { + printf("INIT: platform start load kernel failed\n"); + kernel_size = 0; + return false; + } + + initramfs_size = INITRAMFS_LOAD_SIZE; + loaded = start_preload_resource(RESOURCE_ID_INITRAMFS, + RESOURCE_SUBID_NONE, + INITRAMFS_LOAD_BASE, &initramfs_size); + if (loaded != OPAL_SUCCESS) { + printf("INIT: platform start load initramfs failed\n"); + initramfs_size = 0; + return false; + } + + return true; +} + +static bool load_kernel(void) +{ + void *stb_container = NULL; + struct elf_hdr *kh; + int loaded; + + prlog(PR_NOTICE, "INIT: Waiting for kernel...\n"); + + loaded = wait_for_resource_loaded(RESOURCE_ID_KERNEL, + RESOURCE_SUBID_NONE); + + if (loaded != OPAL_SUCCESS) { + printf("INIT: platform wait for kernel load failed\n"); + kernel_size = 0; + } + + /* Try embedded kernel payload */ + if (!kernel_size) { + kernel_size = __builtin_kernel_end - __builtin_kernel_start; + if (kernel_size) { + /* Move the built-in kernel up */ + uint64_t builtin_base = + ((uint64_t)__builtin_kernel_start) - + SKIBOOT_BASE + boot_offset; + printf("Using built-in kernel\n"); + memmove(KERNEL_LOAD_BASE, (void*)builtin_base, + kernel_size); + } + } + + if (dt_has_node_property(dt_chosen, "kernel-base-address", NULL)) { + kernel_entry = dt_prop_get_u64(dt_chosen, + "kernel-base-address"); + prlog(PR_DEBUG, "INIT: Kernel image at 0x%llx\n", kernel_entry); + kh = (struct elf_hdr *)kernel_entry; + /* + * If the kernel is at 0, restore it as it was overwritten + * by our vectors. 
+ */ + if (kernel_entry < EXCEPTION_VECTORS_END) { + cpu_set_sreset_enable(false); + memcpy_null(NULL, old_vectors, EXCEPTION_VECTORS_END); + sync_icache(); + } else { + /* Hack for STB in Mambo, assume at least 4kb in mem */ + if (!kernel_size) + kernel_size = SECURE_BOOT_HEADERS_SIZE; + if (stb_is_container((void*)kernel_entry, kernel_size)) { + stb_container = (void*)kernel_entry; + kh = (struct elf_hdr *) (kernel_entry + SECURE_BOOT_HEADERS_SIZE); + } else + kh = (struct elf_hdr *) (kernel_entry); + } + } else { + if (!kernel_size) { + printf("INIT: Assuming kernel at %p\n", + KERNEL_LOAD_BASE); + /* Hack for STB in Mambo, assume at least 4kb in mem */ + kernel_size = SECURE_BOOT_HEADERS_SIZE; + kernel_entry = (uint64_t)KERNEL_LOAD_BASE; + } + if (stb_is_container(KERNEL_LOAD_BASE, kernel_size)) { + stb_container = KERNEL_LOAD_BASE; + kh = (struct elf_hdr *) (KERNEL_LOAD_BASE + SECURE_BOOT_HEADERS_SIZE); + } else + kh = (struct elf_hdr *) (KERNEL_LOAD_BASE); + + } + + prlog(PR_DEBUG, + "INIT: Kernel loaded, size: %zu bytes (0 = unknown preload)\n", + kernel_size); + + if (kh->ei_ident != ELF_IDENT) { + prerror("INIT: ELF header not found. Assuming raw binary.\n"); + return true; + } + + if (kh->ei_class == ELF_CLASS_64) { + if (!try_load_elf64(kh)) + return false; + } else if (kh->ei_class == ELF_CLASS_32) { + if (!try_load_elf32(kh)) + return false; + } else { + prerror("INIT: Neither ELF32 not ELF64 ?\n"); + return false; + } + + if (chip_quirk(QUIRK_MAMBO_CALLOUTS)) { + secureboot_verify(RESOURCE_ID_KERNEL, + stb_container, + SECURE_BOOT_HEADERS_SIZE + kernel_size); + trustedboot_measure(RESOURCE_ID_KERNEL, + stb_container, + SECURE_BOOT_HEADERS_SIZE + kernel_size); + } + + return true; +} + +static void load_initramfs(void) +{ + uint64_t *initramfs_start; + void *stb_container = NULL; + int loaded; + + loaded = wait_for_resource_loaded(RESOURCE_ID_INITRAMFS, + RESOURCE_SUBID_NONE); + + if (loaded != OPAL_SUCCESS || !initramfs_size) + return; + + if (stb_is_container(INITRAMFS_LOAD_BASE, initramfs_size)) { + stb_container = INITRAMFS_LOAD_BASE; + initramfs_start = INITRAMFS_LOAD_BASE + SECURE_BOOT_HEADERS_SIZE; + } else { + initramfs_start = INITRAMFS_LOAD_BASE; + } + + dt_check_del_prop(dt_chosen, "linux,initrd-start"); + dt_check_del_prop(dt_chosen, "linux,initrd-end"); + + printf("INIT: Initramfs loaded, size: %zu bytes\n", initramfs_size); + + dt_add_property_u64(dt_chosen, "linux,initrd-start", + (uint64_t)initramfs_start); + dt_add_property_u64(dt_chosen, "linux,initrd-end", + (uint64_t)initramfs_start + initramfs_size); + + if (chip_quirk(QUIRK_MAMBO_CALLOUTS)) { + secureboot_verify(RESOURCE_ID_INITRAMFS, + stb_container, + SECURE_BOOT_HEADERS_SIZE + initramfs_size); + trustedboot_measure(RESOURCE_ID_INITRAMFS, + stb_container, + SECURE_BOOT_HEADERS_SIZE + initramfs_size); + } +} + +static void cpu_disable_ME_RI_one(void *param __unused) +{ + disable_machine_check(); + mtmsrd(0, 1); +} + +static int64_t cpu_disable_ME_RI_all(void) +{ + struct cpu_thread *cpu; + struct cpu_job **jobs; + + jobs = zalloc(sizeof(struct cpu_job *) * (cpu_max_pir + 1)); + assert(jobs); + + for_each_available_cpu(cpu) { + if (cpu == this_cpu()) + continue; + jobs[cpu->pir] = cpu_queue_job(cpu, "cpu_disable_ME_RI", + cpu_disable_ME_RI_one, NULL); + } + + /* this cpu */ + cpu_disable_ME_RI_one(NULL); + + for_each_available_cpu(cpu) { + if (jobs[cpu->pir]) + cpu_wait_job(jobs[cpu->pir], true); + } + + free(jobs); + + return OPAL_SUCCESS; +} + +static void *fdt; + +void __noreturn 
load_and_boot_kernel(bool is_reboot) +{ + const struct dt_property *memprop; + const char *cmdline, *stdoutp; + uint64_t mem_top; + + memprop = dt_find_property(dt_root, DT_PRIVATE "maxmem"); + if (memprop) + mem_top = (u64)dt_property_get_cell(memprop, 0) << 32 + | dt_property_get_cell(memprop, 1); + else /* XXX HB hack, might want to calc it */ + mem_top = 0x40000000; + + op_display(OP_LOG, OP_MOD_INIT, 0x000A); + + /* Load kernel LID */ + if (!load_kernel()) { + op_display(OP_FATAL, OP_MOD_INIT, 1); + abort(); + } + + load_initramfs(); + + trustedboot_exit_boot_services(); + + ipmi_set_fw_progress_sensor(IPMI_FW_OS_BOOT); + + + if (!is_reboot) { + /* We wait for the nvram read to complete here so we can + * grab stuff from there such as the kernel arguments + */ + nvram_wait_for_load(); + + if (!occ_sensors_init()) + dts_sensor_create_nodes(sensor_node); + + } else { + /* fdt will be rebuilt */ + free(fdt); + fdt = NULL; + + nvram_reinit(); + occ_pstates_init(); + } + + /* Use nvram bootargs over device tree */ + cmdline = nvram_query_safe("bootargs"); + if (cmdline) { + dt_check_del_prop(dt_chosen, "bootargs"); + dt_add_property_string(dt_chosen, "bootargs", cmdline); + prlog(PR_DEBUG, "INIT: Command line from NVRAM: %s\n", + cmdline); + } + + op_display(OP_LOG, OP_MOD_INIT, 0x000B); + + add_fast_reboot_dt_entries(); + + if (platform.finalise_dt) + platform.finalise_dt(is_reboot); + + /* Create the device tree blob to boot OS. */ + fdt = create_dtb(dt_root, false); + if (!fdt) { + op_display(OP_FATAL, OP_MOD_INIT, 2); + abort(); + } + + op_display(OP_LOG, OP_MOD_INIT, 0x000C); + + mem_dump_free(); + + /* Dump the selected console */ + stdoutp = dt_prop_get_def(dt_chosen, "linux,stdout-path", NULL); + prlog(PR_DEBUG, "INIT: stdout-path: %s\n", stdoutp ? stdoutp : ""); + + fdt_set_boot_cpuid_phys(fdt, this_cpu()->pir); + + /* Check there is something there before we branch to it */ + if (*(uint32_t *)kernel_entry == 0) { + prlog(PR_EMERG, "FATAL: Kernel is zeros, can't execute!\n"); + assert(0); + } + + if (platform.exit) + platform.exit(); + + /* Take processors out of nap */ + cpu_set_sreset_enable(false); + cpu_set_ipi_enable(false); + + printf("INIT: Starting kernel at 0x%llx, fdt at %p %u bytes\n", + kernel_entry, fdt, fdt_totalsize(fdt)); + + /* Disable machine checks on all */ + cpu_disable_ME_RI_all(); + + patch_traps(false); + cpu_set_hile_mode(false); /* Clear HILE on all CPUs */ + + /* init MPIPL */ + if (!is_reboot) + opal_mpipl_init(); + + checksum_romem(); + + debug_descriptor.state_flags |= OPAL_BOOT_COMPLETE; + + cpu_give_self_os(); + + if (kernel_32bit) + start_kernel32(kernel_entry, fdt, mem_top); + start_kernel(kernel_entry, fdt, mem_top); +} + +static void storage_keys_fixup(void) +{ + struct dt_node *cpus, *n; + + cpus = dt_find_by_path(dt_root, "/cpus"); + assert(cpus); + + if (proc_gen == proc_gen_unknown) + return; + + dt_for_each_child(cpus, n) { + /* There may be cache nodes in /cpus. */ + if (!dt_has_node_property(n, "device_type", "cpu") || + dt_has_node_property(n, "ibm,processor-storage-keys", NULL)) + continue; + + /* + * skiboot supports p8 & p9, both of which support the IAMR, and + * both of which support 32 keys. So advertise 32 keys for data + * accesses and 32 for instruction accesses. + */ + dt_add_property_cells(n, "ibm,processor-storage-keys", 32, 32); + } +} + +static void dt_fixups(void) +{ + struct dt_node *n; + struct dt_node *primary_lpc = NULL; + + /* lpc node missing #address/size cells. 
Also pick one as + * primary for now (TBD: How to convey that from HB) + */ + dt_for_each_compatible(dt_root, n, "ibm,power8-lpc") { + if (!primary_lpc || dt_has_node_property(n, "primary", NULL)) + primary_lpc = n; + if (dt_has_node_property(n, "#address-cells", NULL)) + break; + dt_add_property_cells(n, "#address-cells", 2); + dt_add_property_cells(n, "#size-cells", 1); + dt_add_property_strings(n, "status", "ok"); + } + + /* Missing "primary" property in LPC bus */ + if (primary_lpc && !dt_has_node_property(primary_lpc, "primary", NULL)) + dt_add_property(primary_lpc, "primary", NULL, 0); + + /* Missing "scom-controller" */ + dt_for_each_compatible(dt_root, n, "ibm,xscom") { + if (!dt_has_node_property(n, "scom-controller", NULL)) + dt_add_property(n, "scom-controller", NULL, 0); + } + + storage_keys_fixup(); +} + +static void add_arch_vector(void) +{ + /** + * vec5 = a PVR-list : Number-of-option-vectors : + * option-vectors[Number-of-option-vectors + 1] + */ + uint8_t vec5[] = {0x05, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00}; + + if (dt_has_node_property(dt_chosen, "ibm,architecture-vec-5", NULL)) + return; + + dt_add_property(dt_chosen, "ibm,architecture-vec-5", + vec5, sizeof(vec5)); +} + +static void dt_init_misc(void) +{ + /* Check if there's a /chosen node, if not, add one */ + dt_chosen = dt_find_by_path(dt_root, "/chosen"); + if (!dt_chosen) + dt_chosen = dt_new(dt_root, "chosen"); + assert(dt_chosen); + + /* Add IBM architecture vectors if needed */ + add_arch_vector(); + + /* Add the "OPAL virtual ICS*/ + add_ics_node(); + + /* Additional fixups. TODO: Move into platform */ + dt_fixups(); +} + +static u8 console_get_level(const char *s) +{ + if (strcmp(s, "emerg") == 0) + return PR_EMERG; + if (strcmp(s, "alert") == 0) + return PR_ALERT; + if (strcmp(s, "crit") == 0) + return PR_CRIT; + if (strcmp(s, "err") == 0) + return PR_ERR; + if (strcmp(s, "warning") == 0) + return PR_WARNING; + if (strcmp(s, "notice") == 0) + return PR_NOTICE; + if (strcmp(s, "printf") == 0) + return PR_PRINTF; + if (strcmp(s, "info") == 0) + return PR_INFO; + if (strcmp(s, "debug") == 0) + return PR_DEBUG; + if (strcmp(s, "trace") == 0) + return PR_TRACE; + if (strcmp(s, "insane") == 0) + return PR_INSANE; + /* Assume it's a number instead */ + return atoi(s); +} + +static void console_log_level(void) +{ + const char *s; + u8 level; + + /* console log level: + * high 4 bits in memory, low 4 bits driver (e.g. uart). 
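As the comment above notes, both console log levels share a single byte in the debug descriptor: the in-memory console level lives in the high nibble and the driver (e.g. UART) level in the low nibble. A small sketch of that packing; the helper names are illustrative only.

#include <stdint.h>

/* Pack/unpack the two console log levels the way console_log_level()
 * does: high nibble = in-memory console level, low nibble = driver. */
static inline uint8_t pack_log_levels(uint8_t memory, uint8_t driver)
{
	return (uint8_t)(((memory & 0x0f) << 4) | (driver & 0x0f));
}

static inline uint8_t memory_log_level(uint8_t packed)
{
	return packed >> 4;
}

static inline uint8_t driver_log_level(uint8_t packed)
{
	return packed & 0x0f;
}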
*/ + s = nvram_query_safe("log-level-driver"); + if (s) { + level = console_get_level(s); + debug_descriptor.console_log_levels = + (debug_descriptor.console_log_levels & 0xf0 ) | + (level & 0x0f); + prlog(PR_NOTICE, "console: Setting driver log level to %i\n", + level & 0x0f); + } + s = nvram_query_safe("log-level-memory"); + if (s) { + level = console_get_level(s); + debug_descriptor.console_log_levels = + (debug_descriptor.console_log_levels & 0x0f ) | + ((level & 0x0f) << 4); + prlog(PR_NOTICE, "console: Setting memory log level to %i\n", + level & 0x0f); + } +} + +typedef void (*ctorcall_t)(void); + +static void __nomcount do_ctors(void) +{ + extern ctorcall_t __ctors_start[], __ctors_end[]; + ctorcall_t *call; + + for (call = __ctors_start; call < __ctors_end; call++) + (*call)(); +} + +#ifdef ELF_ABI_v2 +static void setup_branch_null_catcher(void) +{ + asm volatile( \ + ".section .rodata" "\n\t" \ + "3: .string \"branch to NULL\"" "\n\t" \ + ".previous" "\n\t" \ + ".section .trap_table,\"aw\"" "\n\t" \ + ".llong 0" "\n\t" \ + ".llong 3b" "\n\t" \ + ".previous" "\n\t" \ + ); +} +#else +static void branch_null(void) +{ + assert(0); +} + +static void setup_branch_null_catcher(void) +{ + void (*bn)(void) = branch_null; + + /* + * FIXME: This copies the function descriptor (16 bytes) for + * ABI v1 (ie. big endian). This will be broken if we ever + * move to ABI v2 (ie little endian) + */ + memcpy_null((void *)0, bn, 16); +} +#endif + +void copy_sreset_vector(void) +{ + uint32_t *src, *dst; + + /* Copy the reset code over the entry point. */ + src = &reset_patch_start; + dst = (uint32_t *)0x100; + while(src < &reset_patch_end) + *(dst++) = *(src++); + sync_icache(); +} + +void copy_sreset_vector_fast_reboot(void) +{ + uint32_t *src, *dst; + + /* Copy the reset code over the entry point. */ + src = &reset_fast_reboot_patch_start; + dst = (uint32_t *)0x100; + while(src < &reset_fast_reboot_patch_end) + *(dst++) = *(src++); + sync_icache(); +} + +void copy_exception_vectors(void) +{ + /* Copy from 0x100 to EXCEPTION_VECTORS_END, avoid below 0x100 as + * this is the boot flag used by CPUs still potentially entering + * skiboot. + */ + memcpy((void *)0x100, (void *)(SKIBOOT_BASE + 0x100), + EXCEPTION_VECTORS_END - 0x100); + sync_icache(); +} + +/* + * When skiboot owns the exception vectors, patch in 'trap' for assert fails. + * Otherwise use assert_fail() + */ +void patch_traps(bool enable) +{ + struct trap_table_entry *tte; + + for (tte = __trap_table_start; tte < __trap_table_end; tte++) { + uint32_t *insn; + + insn = (uint32_t *)tte->address; + if (enable) { + *insn = PPC_INST_TRAP; + } else { + *insn = PPC_INST_NOP; + } + } + + sync_icache(); +} + +static void per_thread_sanity_checks(void) +{ + struct cpu_thread *cpu = this_cpu(); + + /** + * @fwts-label NonZeroHRMOR + * @fwts-advice The contents of the hypervisor real mode offset register + * (HRMOR) is bitwise orded with the address of any hypervisor real mode + * (i.e Skiboot) memory accesses. Skiboot does not support operating + * with a non-zero HRMOR and setting it will break some things (e.g + * XSCOMs) in hard-to-debug ways. + */ + assert(mfspr(SPR_HRMOR) == 0); + + /** + * @fwts-label UnknownSecondary + * @fwts-advice The boot CPU attampted to call in a secondary thread + * without initialising the corresponding cpu_thread structure. This may + * happen if the HDAT or devicetree reports too few threads or cores for + * this processor. 
+ */ + assert(cpu->state != cpu_state_no_cpu); +} + +void pci_nvram_init(void) +{ + const char *nvram_speed; + + verbose_eeh = nvram_query_eq_safe("pci-eeh-verbose", "true"); + if (verbose_eeh) + prlog(PR_INFO, "PHB: Verbose EEH enabled\n"); + + pcie_max_link_speed = 0; + + nvram_speed = nvram_query_dangerous("pcie-max-link-speed"); + if (nvram_speed) { + pcie_max_link_speed = atoi(nvram_speed); + prlog(PR_NOTICE, "PHB: NVRAM set max link speed to GEN%i\n", + pcie_max_link_speed); + } + + pci_tracing = nvram_query_eq_safe("pci-tracing", "true"); +} + +static uint32_t mem_csum(void *_p, void *_e) +{ + size_t len = _e - _p; + uint32_t *p = _p; + uint32_t v1 = 0, v2 = 0; + uint32_t csum; + unsigned int i; + + for (i = 0; i < len; i += 4) { + uint32_t v = *p++; + v1 += v; + v2 += v1; + } + + csum = v1 ^ v2; + + return csum; +} + +static uint32_t romem_csum; + +static void checksum_romem(void) +{ + uint32_t csum; + + romem_csum = 0; + if (chip_quirk(QUIRK_SLOW_SIM)) + return; + + csum = mem_csum(_start, _head_end); + romem_csum ^= csum; + + csum = mem_csum(_stext, _romem_end); + romem_csum ^= csum; + + csum = mem_csum(__builtin_kernel_start, __builtin_kernel_end); + romem_csum ^= csum; +} + +bool verify_romem(void) +{ + uint32_t old = romem_csum; + checksum_romem(); + if (old != romem_csum) { + romem_csum = old; + prlog(PR_NOTICE, "OPAL checksums did not match\n"); + return false; + } + return true; +} + +static void mask_pc_system_xstop(void) +{ + struct cpu_thread *cpu; + uint32_t chip_id, core_id; + int rc; + + if (proc_gen != proc_gen_p10) + return; + + if (chip_quirk(QUIRK_MAMBO_CALLOUTS)) + return; + + /* + * On P10 Mask PC system checkstop (bit 28). This is needed + * for HW570622. We keep processor recovery disabled via + * HID[5] and mask the checkstop that it can cause. CME does + * the recovery handling for us. + */ + for_each_cpu(cpu) { + chip_id = cpu->chip_id; + core_id = pir_to_core_id(cpu->pir); + + rc = xscom_write(chip_id, + XSCOM_ADDR_P10_EC(core_id, P10_CORE_FIRMASK_OR), + PPC_BIT(28)); + if (rc) + prerror("Error setting FIR MASK rc:%d on PIR:%x\n", + rc, cpu->pir); + } +} + + +/* Called from head.S, thus no prototype. */ +void __noreturn __nomcount main_cpu_entry(const void *fdt); + +void __noreturn __nomcount main_cpu_entry(const void *fdt) +{ + /* + * WARNING: At this point. the timebases have + * *not* been synchronized yet. Do not use any timebase + * related functions for timeouts etc... unless you can cope + * with the speed being some random core clock divider and + * the value jumping backward when the synchronization actually + * happens (in chiptod_init() below). + * + * Also the current cpu_thread() struct is not initialized + * either so we need to clear it out first thing first (without + * putting any other useful info in there jus yet) otherwise + * printf an locks are going to play funny games with "con_suspend" + */ + pre_init_boot_cpu(); + + /* + * Point to our mem console + */ + debug_descriptor.memcons_phys = cpu_to_be64((uint64_t)&memcons); + + /* + * Before first printk, ensure console buffer is clear or + * reading tools might think it has wrapped + */ + clear_console(); + + /* Backup previous vectors as this could contain a kernel + * image. + */ + memcpy_null(old_vectors, NULL, EXCEPTION_VECTORS_END); + + /* + * Some boot firmwares enter OPAL with MSR[ME]=1, as they presumably + * handle machine checks until we take over. As we overwrite the + * previous exception vectors with our own handlers, disable MSR[ME]. 
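mem_csum() above is a Fletcher-style rolling checksum over 32-bit words: v1 accumulates the words, v2 accumulates the running totals, and the two are XORed at the end. checksum_romem() combines the per-region results so verify_romem() can later detect corruption of read-only memory. A standalone sketch of the same sum, with an illustrative function name.

#include <stdint.h>
#include <stddef.h>

/* Fletcher-style checksum over a word-aligned region, as in mem_csum():
 * v1 sums the words, v2 sums the running totals, result is v1 ^ v2. */
static uint32_t region_csum(const uint32_t *p, size_t len_bytes)
{
	uint32_t v1 = 0, v2 = 0;
	size_t i;

	for (i = 0; i < len_bytes; i += 4) {
		v1 += *p++;
		v2 += v1;
	}
	return v1 ^ v2;
}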
+ * This could be done atomically by patching in a branch then patching + * it out last, but that's a lot of effort. + */ + disable_machine_check(); + + /* Copy all vectors down to 0 */ + copy_exception_vectors(); + + /* Enable trap based asserts */ + patch_traps(true); + + /* + * Enable MSR[ME] bit so we can take MCEs. We don't currently + * recover, but we print some useful information. + */ + enable_machine_check(); + mtmsrd(MSR_RI, 1); + + /* Setup a NULL catcher to catch accidental NULL ptr calls */ + setup_branch_null_catcher(); + + /* Call library constructors */ + do_ctors(); + + prlog(PR_NOTICE, "OPAL %s%s starting...\n", version, DEBUG_STR); + + prlog(PR_DEBUG, "initial console log level: memory %d, driver %d\n", + (debug_descriptor.console_log_levels >> 4), + (debug_descriptor.console_log_levels & 0x0f)); + prlog(PR_TRACE, "OPAL is Powered By Linked-List Technology.\n"); + +#ifdef SKIBOOT_GCOV + skiboot_gcov_done(); +#endif + + /* Initialize boot cpu's cpu_thread struct */ + init_boot_cpu(); + + /* Now locks can be used */ + init_locks(); + + /* Create the OPAL call table early on, entries can be overridden + * later on (FSP console code for example) + */ + opal_table_init(); + + /* Init the physical map table so we can start mapping things */ + phys_map_init(mfspr(SPR_PVR)); + + /* + * If we are coming in with a flat device-tree, we expand it + * now. Else look for HDAT and create a device-tree from them + * + * Hack alert: When entering via the OPAL entry point, fdt + * is set to -1, we record that and pass it to parse_hdat + */ + + dt_root = dt_new_root(""); + + if (fdt == (void *)-1ul) { + if (parse_hdat(true) < 0) + abort(); + } else if (fdt == NULL) { + if (parse_hdat(false) < 0) + abort(); + } else { + dt_expand(fdt); + } + dt_add_cpufeatures(dt_root); + + /* Now that we have a full devicetree, verify that we aren't on fire. */ + per_thread_sanity_checks(); + + /* + * From there, we follow a fairly strict initialization order. + * + * First we need to build up our chip data structures and initialize + * XSCOM which will be needed for a number of susbequent things. + * + * We want XSCOM available as early as the platform probe in case the + * probe requires some HW accesses. + * + * We also initialize the FSI master at that point in case we need + * to access chips via that path early on. + */ + init_chips(); + + xscom_init(); + mfsi_init(); + + /* + * Direct controls facilities provides some controls over CPUs + * using scoms. + */ + direct_controls_init(); + + /* + * Put various bits & pieces in device-tree that might not + * already be there such as the /chosen node if not there yet, + * the ICS node, etc... This can potentially use XSCOM + */ + dt_init_misc(); + + /* + * Initialize LPC (P8 and beyond) so we can get to UART, BMC and + * other system controller. This is done before probe_platform + * so that the platform probing code can access an external + * BMC if needed. + */ + lpc_init(); + + /* + * This should be done before mem_region_init, so the stack + * region length can be set according to the maximum PIR. + */ + init_cpu_max_pir(); + + /* + * Now, we init our memory map from the device-tree, and immediately + * reserve areas which we know might contain data coming from + * HostBoot. We need to do these things before we start doing + * allocations outside of our heap, such as chip local allocs, + * otherwise we might clobber those data. + */ + mem_region_init(); + + /* + * Reserve memory required to capture OPAL dump. 
This should be done + * immediately after mem_region_init to avoid any clash with local + * memory allocation. + */ + opal_mpipl_reserve_mem(); + + /* Reserve HOMER and OCC area */ + homer_init(); + + /* Initialize the rest of the cpu thread structs */ + init_all_cpus(); + if (proc_gen == proc_gen_p9 || proc_gen == proc_gen_p10) + cpu_set_ipi_enable(true); + + /* Once all CPU are up apply this workaround */ + mask_pc_system_xstop(); + + /* Add the /opal node to the device-tree */ + add_opal_node(); + + /* + * We probe the platform now. This means the platform probe gets + * the opportunity to reserve additional areas of memory if needed. + * + * Note: Timebases still not synchronized. + */ + probe_platform(); + + /* Allocate our split trace buffers now. Depends add_opal_node() */ + init_trace_buffers(); + + /* On P8, get the ICPs and make sure they are in a sane state */ + init_interrupts(); + if (proc_gen == proc_gen_p8) + cpu_set_ipi_enable(true); + + /* On P9 and P10, initialize XIVE */ + if (proc_gen == proc_gen_p9) + init_xive(); + else if (proc_gen == proc_gen_p10) + xive2_init(); + + /* Grab centaurs from device-tree if present (only on FSP-less) */ + centaur_init(); + + /* initialize ocmb scom-controller */ + ocmb_init(); + + /* Initialize PSI (depends on probe_platform being called) */ + psi_init(); + + /* Initialize/enable LPC interrupts. This must be done after the + * PSI interface has been initialized since it serves as an interrupt + * source for LPC interrupts. + */ + lpc_init_interrupts(); + + /* Call in secondary CPUs */ + cpu_bringup(); + + /* We can now overwrite the 0x100 vector as we are no longer being + * entered there. + */ + copy_sreset_vector(); + + /* We can now do NAP mode */ + cpu_set_sreset_enable(true); + + /* + * Synchronize time bases. Prior to chiptod_init() the timebase + * is free-running at a frequency based on the core clock rather + * than being synchronised to the ChipTOD network. This means + * that the timestamps in early boot might be a little off compared + * to wall clock time. + */ + chiptod_init(); + + /* Initialize P9 DIO */ + p9_dio_init(); + + /* + * SBE uses TB value for scheduling timer. Hence init after + * chiptod init + */ + p9_sbe_init(); + + /* Initialize i2c */ + p8_i2c_init(); + + /* Register routine to dispatch and read sensors */ + sensor_init(); + + /* + * Initialize the opal messaging before platform.init as we are + * getting request to queue occ load opal message when host services + * got load occ request from FSP + */ + opal_init_msg(); + + /* + * We have initialized the basic HW, we can now call into the + * platform to perform subsequent inits, such as establishing + * communication with the FSP or starting IPMI. + */ + if (platform.init) + platform.init(); + + /* Read in NVRAM and set it up */ + nvram_init(); + + /* Set the console level */ + console_log_level(); + + /* Secure/Trusted Boot init. We look for /ibm,secureboot in DT */ + secureboot_init(); + trustedboot_init(); + + /* Secure variables init, handled by platform */ + if (platform.secvar_init && is_fw_secureboot()) + platform.secvar_init(); + + /* + * BMC platforms load version information from flash after + * secure/trustedboot init. + */ + if (platform.bmc) + flash_fw_version_preload(); + + /* preload the IMC catalog dtb */ + imc_catalog_preload(); + + /* Install the OPAL Console handlers */ + init_opal_console(); + + /* + * Some platforms set a flag to wait for SBE validation to be + * performed by the BMC. 
If this occurs it leaves the SBE in a + * bad state and the system will reboot at this point. + */ + if (platform.seeprom_update) + platform.seeprom_update(); + + /* Init SLW related stuff, including fastsleep */ + slw_init(); + + op_display(OP_LOG, OP_MOD_INIT, 0x0002); + + /* + * On some POWER9 BMC systems, we need to initialise the OCC + * before the NPU to facilitate NVLink/OpenCAPI presence + * detection, so we set it up as early as possible. On FSP + * systems, Hostboot starts booting the OCC later, so we delay + * OCC initialisation as late as possible to give it the + * maximum time to boot up. + */ + if (platform.bmc) + occ_pstates_init(); + + pci_nvram_init(); + + preload_capp_ucode(); + start_preload_kernel(); + + /* Catalog decompression routine */ + imc_decompress_catalog(); + + /* Virtual Accelerator Switchboard */ + vas_init(); + + /* NX init */ + nx_init(); + + /* Probe PHB3 on P8 */ + probe_phb3(); + + /* Probe PHB4 on P9 and PHB5 on P10 */ + probe_phb4(); + + /* Probe NPUs */ + probe_npu(); + probe_npu2(); + probe_npu3(); + + /* Initialize PCI */ + pci_init_slots(); + + /* Add OPAL timer related properties */ + late_init_timers(); + + /* Setup ibm,firmware-versions if able */ + if (platform.bmc) { + flash_dt_add_fw_version(); + ipmi_dt_add_bmc_info(); + } + + ipmi_set_fw_progress_sensor(IPMI_FW_PCI_INIT); + + /* + * These last few things must be done as late as possible + * because they rely on various other things having been setup, + * for example, add_opal_interrupts() will add all the interrupt + * sources that are going to the firmware. We can't add a new one + * after that call. Similarly, the mem_region calls will construct + * the reserve maps in the DT so we shouldn't affect the memory + * regions after that + */ + + /* Create the LPC bus interrupt-map on P9 */ + lpc_finalize_interrupts(); + + /* Add the list of interrupts going to OPAL */ + add_opal_interrupts(); + + /* Init In-Memory Collection related stuff (load the IMC dtb into memory) */ + imc_init(); + + /* Disable protected execution facility in BML */ + cpu_disable_pef(); + + /* export the trace buffers */ + trace_add_dt_props(); + + /* Now release parts of memory nodes we haven't used ourselves... */ + mem_region_release_unused(); + + /* ... and add remaining reservations to the DT */ + mem_region_add_dt_reserved(); + + /* + * Update /ibm,secureboot/ibm,cvc/memory-region to point to + * /reserved-memory/secure-crypt-algo-code instead of + * /ibm,hostboot/reserved-memory/secure-crypt-algo-code. + */ + cvc_update_reserved_memory_phandle(); + + prd_register_reserved_memory(); + + load_and_boot_kernel(false); +} + +void __noreturn __secondary_cpu_entry(void) +{ + struct cpu_thread *cpu = this_cpu(); + + /* Secondary CPU called in */ + cpu_callin(cpu); + + enable_machine_check(); + mtmsrd(MSR_RI, 1); + + /* Some XIVE setup */ + if (proc_gen == proc_gen_p9) + xive_cpu_callin(cpu); + else if (proc_gen == proc_gen_p10) + xive2_cpu_callin(cpu); + + /* Wait for work to do */ + while(true) { + if (cpu_check_jobs(cpu)) + cpu_process_jobs(); + else + cpu_idle_job(); + } +} + +/* Called from head.S, thus no prototype. 
*/ +void __noreturn __nomcount secondary_cpu_entry(void); + +void __noreturn __nomcount secondary_cpu_entry(void) +{ + struct cpu_thread *cpu = this_cpu(); + + per_thread_sanity_checks(); + + prlog(PR_DEBUG, "INIT: CPU PIR 0x%04x called in\n", cpu->pir); + + __secondary_cpu_entry(); +} diff --git a/roms/skiboot/core/interrupts.c b/roms/skiboot/core/interrupts.c new file mode 100644 index 000000000..0a617d385 --- /dev/null +++ b/roms/skiboot/core/interrupts.c @@ -0,0 +1,513 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* + * Excuse me, you do work for me now? + * + * Copyright 2013-2019 IBM Corp. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* ICP registers */ +#define ICP_XIRR 0x4 /* 32-bit access */ +#define ICP_CPPR 0x4 /* 8-bit access */ +#define ICP_MFRR 0xc /* 8-bit access */ + +static LIST_HEAD(irq_sources); +static LIST_HEAD(irq_sources2); +static struct lock irq_lock = LOCK_UNLOCKED; + +void __register_irq_source(struct irq_source *is, bool secondary) +{ + struct irq_source *is1; + struct list_head *list = secondary ? &irq_sources2 : &irq_sources; + + prlog(PR_DEBUG, "IRQ: Registering %04x..%04x ops @%p (data %p)%s\n", + is->start, is->end - 1, is->ops, is->data, + secondary ? " [secondary]" : ""); + + lock(&irq_lock); + list_for_each(list, is1, link) { + if (is->end > is1->start && is->start < is1->end) { + prerror("register IRQ source overlap !\n"); + prerror(" new: %x..%x old: %x..%x\n", + is->start, is->end - 1, + is1->start, is1->end - 1); + assert(0); + } + } + list_add_tail(list, &is->link); + unlock(&irq_lock); +} + +void register_irq_source(const struct irq_source_ops *ops, void *data, + uint32_t start, uint32_t count) +{ + struct irq_source *is; + + is = zalloc(sizeof(struct irq_source)); + assert(is); + is->start = start; + is->end = start + count; + is->ops = ops; + is->data = data; + + __register_irq_source(is, false); +} + +void unregister_irq_source(uint32_t start, uint32_t count) +{ + struct irq_source *is; + + /* Note: We currently only unregister from the primary sources */ + lock(&irq_lock); + list_for_each(&irq_sources, is, link) { + if (start >= is->start && start < is->end) { + if (start != is->start || + count != (is->end - is->start)) { + prerror("unregister IRQ source mismatch !\n"); + prerror("start:%x, count: %x match: %x..%x\n", + start, count, is->start, is->end); + assert(0); + } + list_del(&is->link); + unlock(&irq_lock); + /* XXX Add synchronize / RCU */ + free(is); + return; + } + } + unlock(&irq_lock); + prerror("unregister IRQ source not found !\n"); + prerror("start:%x, count: %x\n", start, count); + assert(0); +} + +struct irq_source *irq_find_source(uint32_t isn) +{ + struct irq_source *is; + + lock(&irq_lock); + /* + * XXX This really needs some kind of caching ! + */ + list_for_each(&irq_sources, is, link) { + if (isn >= is->start && isn < is->end) { + unlock(&irq_lock); + return is; + } + } + list_for_each(&irq_sources2, is, link) { + if (isn >= is->start && isn < is->end) { + unlock(&irq_lock); + return is; + } + } + unlock(&irq_lock); + + return NULL; +} + +void irq_for_each_source(void (*cb)(struct irq_source *, void *), void *data) +{ + struct irq_source *is; + + lock(&irq_lock); + list_for_each(&irq_sources, is, link) + cb(is, data); + list_for_each(&irq_sources2, is, link) + cb(is, data); + unlock(&irq_lock); +} + +/* + * This takes a 6-bit chip id and returns a 20 bit value representing + * the PSI interrupt. 
This includes all the fields above, ie, is a + * global interrupt number. + * + * For P8, this returns the base of the 8-interrupts block for PSI + */ +uint32_t get_psi_interrupt(uint32_t chip_id) +{ + uint32_t irq; + + switch(proc_gen) { + case proc_gen_p8: + irq = p8_chip_irq_block_base(chip_id, P8_IRQ_BLOCK_MISC); + irq += P8_IRQ_MISC_PSI_BASE; + break; + default: + assert(false); + }; + + return irq; +} + + +struct dt_node *add_ics_node(void) +{ + struct dt_node *ics = dt_new_addr(dt_root, "interrupt-controller", 0); + bool has_xive; + + if (!ics) + return NULL; + + has_xive = proc_gen >= proc_gen_p9; + + dt_add_property_cells(ics, "reg", 0, 0, 0, 0); + dt_add_property_strings(ics, "compatible", + has_xive ? "ibm,opal-xive-vc" : "IBM,ppc-xics", + "IBM,opal-xics"); + dt_add_property_cells(ics, "#address-cells", 0); + dt_add_property_cells(ics, "#interrupt-cells", 2); + dt_add_property_string(ics, "device_type", + "PowerPC-Interrupt-Source-Controller"); + dt_add_property(ics, "interrupt-controller", NULL, 0); + + return ics; +} + +uint32_t get_ics_phandle(void) +{ + struct dt_node *i; + + for (i = dt_first(dt_root); i; i = dt_next(dt_root, i)) { + if (streq(i->name, "interrupt-controller@0")) { + return i->phandle; + } + } + abort(); +} + +void add_opal_interrupts(void) +{ + struct irq_source *is; + unsigned int i, ns, tns = 0, count = 0; + uint32_t isn; + __be32 *irqs = NULL; + char *names = NULL; + + lock(&irq_lock); + list_for_each(&irq_sources, is, link) { + /* + * Don't even consider sources that don't have an interrupts + * callback or don't have an attributes one. + */ + if (!is->ops->interrupt || !is->ops->attributes) + continue; + for (isn = is->start; isn < is->end; isn++) { + uint64_t attr = is->ops->attributes(is, isn); + uint32_t iflags; + char *name; + + if (attr & IRQ_ATTR_TARGET_LINUX) + continue; + if (attr & IRQ_ATTR_TYPE_MSI) + iflags = 0; + else + iflags = 1; + name = is->ops->name ? is->ops->name(is, isn) : NULL; + ns = name ? strlen(name) : 0; + prlog(PR_DEBUG, "irq %x name: %s %s\n", + isn, + name ? name : "", + iflags ? "[level]" : "[edge]"); + names = realloc(names, tns + ns + 1); + if (name) { + strcpy(names + tns, name); + tns += (ns + 1); + free(name); + } else + names[tns++] = 0; + i = count++; + irqs = realloc(irqs, 8 * count); + irqs[i*2] = cpu_to_be32(isn); + irqs[i*2+1] = cpu_to_be32(iflags); + } + } + unlock(&irq_lock); + + /* First create the standard "interrupts" property and the + * corresponding names property + */ + dt_add_property_cells(opal_node, "interrupt-parent", get_ics_phandle()); + dt_add_property(opal_node, "interrupts", irqs, count * 8); + dt_add_property(opal_node, "opal-interrupts-names", names, tns); + dt_add_property(opal_node, "interrupt-names", names, tns); + + /* Now "reduce" it to the old style "opal-interrupts" property + * format by stripping out the flags. The "opal-interrupts" + * property has one cell per interrupt, it is not a standard + * "interrupt" property. + * + * Note: Even if empty, create it, otherwise some bogus error + * handling in Linux can cause problems. + */ + for (i = 1; i < count; i++) + irqs[i] = irqs[i * 2]; + dt_add_property(opal_node, "opal-interrupts", irqs, count * 4); + + free(irqs); + free(names); +} + +/* + * This is called at init time (and one fast reboot) to sanitize the + * ICP. We set our priority to 0 to mask all interrupts and make sure + * no IPI is on the way. 
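add_opal_interrupts() above emits the standard "interrupts" property as big-endian (number, flags) cell pairs, then reuses the same buffer to produce the legacy one-cell-per-interrupt "opal-interrupts" property by compacting out the flag cells in place. A simplified sketch of that compaction step; the helper name and plain uint32_t cells are illustrative, the real code keeps the values big-endian.

#include <stdint.h>
#include <stddef.h>

/* Compact interleaved (isn, flags) pairs down to just the isn values,
 * reusing the same buffer. The buffer must hold 2 * count cells on
 * entry; entry 0 is already in place, so start from index 1. */
static void strip_irq_flags(uint32_t *cells, size_t count)
{
	size_t i;

	for (i = 1; i < count; i++)
		cells[i] = cells[i * 2];
}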
This is also called on wakeup from nap + */ +void reset_cpu_icp(void) +{ + void *icp = this_cpu()->icp_regs; + + if (!icp) + return; + + /* Dummy fetch */ + in_be32(icp + ICP_XIRR); + + /* Clear pending IPIs */ + out_8(icp + ICP_MFRR, 0xff); + + /* Set priority to max, ignore all incoming interrupts, EOI IPIs */ + out_be32(icp + ICP_XIRR, 2); +} + +/* Used by the PSI code to send an EOI during reset. This will also + * set the CPPR to 0 which should already be the case anyway + */ +void icp_send_eoi(uint32_t interrupt) +{ + void *icp = this_cpu()->icp_regs; + + if (!icp) + return; + + /* Set priority to max, ignore all incoming interrupts */ + out_be32(icp + ICP_XIRR, interrupt & 0xffffff); +} + +/* This is called before winkle or nap, we clear pending IPIs and + * set our priority to 1 to mask all but the IPI. + */ +void icp_prep_for_pm(void) +{ + void *icp = this_cpu()->icp_regs; + + if (!icp) + return; + + /* Clear pending IPIs */ + out_8(icp + ICP_MFRR, 0xff); + + /* Set priority to 1, ignore all incoming interrupts, EOI IPIs */ + out_be32(icp + ICP_XIRR, 0x01000002); +} + +/* This is called to wakeup somebody from winkle */ +void icp_kick_cpu(struct cpu_thread *cpu) +{ + void *icp = cpu->icp_regs; + + if (!icp) + return; + + /* Send high priority IPI */ + out_8(icp + ICP_MFRR, 0); +} + +/* Returns the number of chip ID bits used for interrupt numbers */ +static uint32_t p8_chip_id_bits(uint32_t chip) +{ + struct proc_chip *proc_chip = get_chip(chip); + + assert(proc_chip); + switch (proc_chip->type) { + case PROC_CHIP_P8_MURANO: + case PROC_CHIP_P8_VENICE: + return 6; + break; + + case PROC_CHIP_P8_NAPLES: + return 5; + break; + + default: + /* This shouldn't be called on non-P8 based systems */ + assert(0); + return 0; + break; + } +} + +/* The chip id mask is the upper p8_chip_id_bits of the irq number */ +static uint32_t chip_id_mask(uint32_t chip) +{ + uint32_t chip_id_bits = p8_chip_id_bits(chip); + uint32_t chip_id_mask; + + chip_id_mask = ((1 << chip_id_bits) - 1); + chip_id_mask <<= P8_IRQ_BITS - chip_id_bits; + return chip_id_mask; +} + +/* The block mask is what remains of the 19 bit irq number after + * removing the upper 5 or 6 bits for the chip# and the lower 11 bits + * for the number of bits per block. */ +static uint32_t block_mask(uint32_t chip) +{ + uint32_t chip_id_bits = p8_chip_id_bits(chip); + uint32_t irq_block_mask; + + irq_block_mask = P8_IRQ_BITS - chip_id_bits - P8_IVE_BITS; + irq_block_mask = ((1 << irq_block_mask) - 1) << P8_IVE_BITS; + return irq_block_mask; +} + +uint32_t p8_chip_irq_block_base(uint32_t chip, uint32_t block) +{ + uint32_t irq; + + assert(chip < (1 << p8_chip_id_bits(chip))); + irq = SETFIELD(chip_id_mask(chip), 0, chip); + irq = SETFIELD(block_mask(chip), irq, block); + + return irq; +} + +uint32_t p8_chip_irq_phb_base(uint32_t chip, uint32_t phb) +{ + assert(chip < (1 << p8_chip_id_bits(chip))); + + return p8_chip_irq_block_base(chip, phb + P8_IRQ_BLOCK_PHB_BASE); +} + +uint32_t p8_irq_to_chip(uint32_t irq) +{ + /* This assumes we only have one type of cpu in a system, + * which should be ok. 
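Per the comments and masks above, a P8 global interrupt number is 19 bits: the top 5 or 6 bits select the chip (p8_chip_id_bits() depends on the chip type), the low 11 bits index an interrupt inside a block, and the remaining middle bits select the block. A sketch of that packing under those assumptions; the macro and function names are illustrative.

#include <stdint.h>

#define IRQ_BITS	19	/* total bits in a P8 global IRQ number */
#define IVE_BITS	11	/* low bits: interrupt within a block   */

/* Compose a global IRQ number laid out as | chip | block | ive |.
 * chip_bits is 6 on Murano/Venice and 5 on Naples, per
 * p8_chip_id_bits() above. */
static uint32_t p8_irq(uint32_t chip, uint32_t block, uint32_t ive,
		       uint32_t chip_bits)
{
	uint32_t block_bits = IRQ_BITS - chip_bits - IVE_BITS;

	return ((chip & ((1u << chip_bits) - 1)) << (IRQ_BITS - chip_bits)) |
	       ((block & ((1u << block_bits) - 1)) << IVE_BITS) |
	       (ive & ((1u << IVE_BITS) - 1));
}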
*/ + return GETFIELD(chip_id_mask(this_cpu()->chip_id), irq); +} + +uint32_t p8_irq_to_block(uint32_t irq) +{ + return GETFIELD(block_mask(this_cpu()->chip_id), irq); +} + +uint32_t p8_irq_to_phb(uint32_t irq) +{ + return p8_irq_to_block(irq) - P8_IRQ_BLOCK_PHB_BASE; +} + +bool __irq_source_eoi(struct irq_source *is, uint32_t isn) +{ + if (!is->ops->eoi) + return false; + + is->ops->eoi(is, isn); + return true; +} + +bool irq_source_eoi(uint32_t isn) +{ + struct irq_source *is = irq_find_source(isn); + + if (!is) + return false; + + return __irq_source_eoi(is, isn); +} + +static int64_t opal_set_xive(uint32_t isn, uint16_t server, uint8_t priority) +{ + struct irq_source *is = irq_find_source(isn); + + if (!is || !is->ops->set_xive) + return OPAL_PARAMETER; + + return is->ops->set_xive(is, isn, server, priority); +} +opal_call(OPAL_SET_XIVE, opal_set_xive, 3); + +static int64_t opal_get_xive(uint32_t isn, __be16 *server, uint8_t *priority) +{ + struct irq_source *is = irq_find_source(isn); + uint16_t s; + int64_t ret; + + if (!opal_addr_valid(server)) + return OPAL_PARAMETER; + + if (!is || !is->ops->get_xive) + return OPAL_PARAMETER; + + ret = is->ops->get_xive(is, isn, &s, priority); + *server = cpu_to_be16(s); + return ret; +} +opal_call(OPAL_GET_XIVE, opal_get_xive, 3); + +static int64_t opal_handle_interrupt(uint32_t isn, __be64 *outstanding_event_mask) +{ + struct irq_source *is = irq_find_source(isn); + int64_t rc = OPAL_SUCCESS; + + if (!opal_addr_valid(outstanding_event_mask)) + return OPAL_PARAMETER; + + /* No source ? return */ + if (!is || !is->ops->interrupt) { + rc = OPAL_PARAMETER; + goto bail; + } + + /* Run it */ + is->ops->interrupt(is, isn); + + /* Check timers if SBE timer isn't working */ + if (!p8_sbe_timer_ok() && !p9_sbe_timer_ok()) + check_timers(true); + + /* Update output events */ + bail: + if (outstanding_event_mask) + *outstanding_event_mask = cpu_to_be64(opal_pending_events); + + return rc; +} +opal_call(OPAL_HANDLE_INTERRUPT, opal_handle_interrupt, 2); + +void init_interrupts(void) +{ + struct dt_node *icp; + const struct dt_property *sranges; + struct cpu_thread *cpu; + u32 base, count, i; + u64 addr, size; + + dt_for_each_compatible(dt_root, icp, "ibm,ppc-xicp") { + sranges = dt_require_property(icp, + "ibm,interrupt-server-ranges", + -1); + base = dt_get_number(sranges->prop, 1); + count = dt_get_number(sranges->prop + 4, 1); + for (i = 0; i < count; i++) { + addr = dt_get_address(icp, i, &size); + cpu = find_cpu_by_server(base + i); + if (cpu) + cpu->icp_regs = (void *)addr; + } + } +} + diff --git a/roms/skiboot/core/ipmi-opal.c b/roms/skiboot/core/ipmi-opal.c new file mode 100644 index 000000000..cc45b409b --- /dev/null +++ b/roms/skiboot/core/ipmi-opal.c @@ -0,0 +1,138 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* + * IPMI OPAL calls + * + * Copyright 2013-2018 IBM Corp. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +static struct lock msgq_lock = LOCK_UNLOCKED; +static struct list_head msgq = LIST_HEAD_INIT(msgq); + +static void opal_send_complete(struct ipmi_msg *msg) +{ + lock(&msgq_lock); + list_add_tail(&msgq, &msg->link); + opal_update_pending_evt(ipmi_backend->opal_event_ipmi_recv, + ipmi_backend->opal_event_ipmi_recv); + unlock(&msgq_lock); +} + +static int64_t opal_ipmi_send(uint64_t interface, + struct opal_ipmi_msg *opal_ipmi_msg, uint64_t msg_len) +{ + struct ipmi_msg *msg; + + if (opal_ipmi_msg->version != OPAL_IPMI_MSG_FORMAT_VERSION_1) { + prerror("OPAL IPMI: Incorrect version\n"); + return OPAL_UNSUPPORTED; + } + + msg_len -= sizeof(struct opal_ipmi_msg); + if (msg_len > IPMI_MAX_REQ_SIZE) { + prerror("OPAL IPMI: Invalid request length\n"); + return OPAL_PARAMETER; + } + + prlog(PR_TRACE, "opal_ipmi_send(cmd: 0x%02x netfn: 0x%02x len: 0x%02llx)\n", + opal_ipmi_msg->cmd, opal_ipmi_msg->netfn >> 2, msg_len); + + msg = ipmi_mkmsg(interface, + IPMI_CODE(opal_ipmi_msg->netfn >> 2, opal_ipmi_msg->cmd), + opal_send_complete, NULL, opal_ipmi_msg->data, + msg_len, IPMI_MAX_RESP_SIZE); + if (!msg) + return OPAL_RESOURCE; + + msg->complete = opal_send_complete; + msg->error = opal_send_complete; + return ipmi_queue_msg(msg); +} + +static int64_t opal_ipmi_recv(uint64_t interface, + struct opal_ipmi_msg *opal_ipmi_msg, __be64 *msg_len) +{ + struct ipmi_msg *msg; + int64_t rc; + + lock(&msgq_lock); + msg = list_top(&msgq, struct ipmi_msg, link); + + if (!msg) { + rc = OPAL_EMPTY; + goto out_unlock; + } + + if (opal_ipmi_msg->version != OPAL_IPMI_MSG_FORMAT_VERSION_1) { + prerror("OPAL IPMI: Incorrect version\n"); + rc = OPAL_UNSUPPORTED; + goto out_del_msg; + } + + if (interface != IPMI_DEFAULT_INTERFACE) { + prerror("IPMI: Invalid interface 0x%llx in opal_ipmi_recv\n", interface); + rc = OPAL_PARAMETER; + goto out_del_msg; + } + + if (be64_to_cpu(*msg_len) - sizeof(struct opal_ipmi_msg) < msg->resp_size + 1) { + rc = OPAL_RESOURCE; + goto out_del_msg; + } + + list_del(&msg->link); + if (list_empty(&msgq)) + opal_update_pending_evt(ipmi_backend->opal_event_ipmi_recv, 0); + unlock(&msgq_lock); + + opal_ipmi_msg->cmd = msg->cmd; + opal_ipmi_msg->netfn = msg->netfn; + opal_ipmi_msg->data[0] = msg->cc; + memcpy(&opal_ipmi_msg->data[1], msg->data, msg->resp_size); + + prlog(PR_TRACE, "opal_ipmi_recv(cmd: 0x%02x netfn: 0x%02x resp_size: 0x%02x)\n", + msg->cmd, msg->netfn >> 2, msg->resp_size); + + /* Add one as the completion code is returned in the message data */ + *msg_len = cpu_to_be64(msg->resp_size + sizeof(struct opal_ipmi_msg) + 1); + ipmi_free_msg(msg); + + return OPAL_SUCCESS; + +out_del_msg: + list_del(&msg->link); + if (list_empty(&msgq)) + opal_update_pending_evt(ipmi_backend->opal_event_ipmi_recv, 0); + ipmi_free_msg(msg); +out_unlock: + unlock(&msgq_lock); + return rc; +} + +void ipmi_opal_init(void) +{ + struct dt_node *opal_ipmi, *opal_event = NULL; + + opal_ipmi = dt_new(opal_node, "ipmi"); + dt_add_property_strings(opal_ipmi, "compatible", "ibm,opal-ipmi"); + dt_add_property_cells(opal_ipmi, "ibm,ipmi-interface-id", + IPMI_DEFAULT_INTERFACE); + dt_add_property_cells(opal_ipmi, "interrupts", + ilog2(ipmi_backend->opal_event_ipmi_recv)); + + if (proc_gen >= proc_gen_p9) + opal_event = dt_find_by_name(opal_node, "event"); + if (opal_event) + dt_add_property_cells(opal_ipmi, "interrupt-parent", + opal_event->phandle); + + opal_register(OPAL_IPMI_SEND, opal_ipmi_send, 3); + opal_register(OPAL_IPMI_RECV, 
opal_ipmi_recv, 3); +} diff --git a/roms/skiboot/core/ipmi.c b/roms/skiboot/core/ipmi.c new file mode 100644 index 000000000..bbc1a7b69 --- /dev/null +++ b/roms/skiboot/core/ipmi.c @@ -0,0 +1,263 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* + * in-band IPMI, probably over bt (or via FSP mbox on FSP) + * + * Copyright 2013-2019 IBM Corp. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct ipmi_backend *ipmi_backend = NULL; +static struct lock sync_lock = LOCK_UNLOCKED; +static struct ipmi_msg *sync_msg = NULL; + +void ipmi_free_msg(struct ipmi_msg *msg) +{ + /* ipmi_free_msg frees messages allocated by the + * backend. Without a backend we couldn't have allocated + * messages to free (we don't support removing backends + * yet). */ + if (!ipmi_present()) { + prerror("IPMI: Trying to free message without backend\n"); + return; + } + + msg->backend->free_msg(msg); +} + +void ipmi_init_msg(struct ipmi_msg *msg, int interface, + uint32_t code, void (*complete)(struct ipmi_msg *), + void *user_data, size_t req_size, size_t resp_size) +{ + /* We don't actually support multiple interfaces at the moment. */ + assert(interface == IPMI_DEFAULT_INTERFACE); + + msg->backend = ipmi_backend; + msg->cmd = IPMI_CMD(code); + msg->netfn = IPMI_NETFN(code) << 2; + msg->req_size = req_size; + msg->resp_size = resp_size; + msg->complete = complete; + msg->user_data = user_data; +} + +struct ipmi_msg *ipmi_mkmsg_simple(uint32_t code, void *req_data, size_t req_size) +{ + return ipmi_mkmsg(IPMI_DEFAULT_INTERFACE, code, ipmi_free_msg, NULL, + req_data, req_size, 0); +} + +struct ipmi_msg *ipmi_mkmsg(int interface, uint32_t code, + void (*complete)(struct ipmi_msg *), + void *user_data, void *req_data, size_t req_size, + size_t resp_size) +{ + struct ipmi_msg *msg; + + if (!ipmi_present()) + return NULL; + + msg = ipmi_backend->alloc_msg(req_size, resp_size); + if (!msg) + return NULL; + + ipmi_init_msg(msg, interface, code, complete, user_data, req_size, + resp_size); + + /* Commands are free to over ride this if they want to handle errors */ + msg->error = ipmi_free_msg; + + if (req_data) + memcpy(msg->data, req_data, req_size); + + return msg; +} + +int ipmi_queue_msg_head(struct ipmi_msg *msg) +{ + if (!ipmi_present()) + return OPAL_HARDWARE; + + if (!msg) { + prerror("%s: Attempting to queue NULL message\n", __func__); + return OPAL_PARAMETER; + } + + return msg->backend->queue_msg_head(msg); +} + +int ipmi_queue_msg(struct ipmi_msg *msg) +{ + /* Here we could choose which interface to use if we want to support + multiple interfaces. 
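ipmi_init_msg() above stores the network function shifted left by two because, on the wire, the NetFn occupies the top six bits of that byte and the LUN the low two; the IPMI convention is that a response carries the request NetFn plus one, which is what ipmi_cmd_done() below checks. A small sketch of those conventions; the macro and helper names here are illustrative rather than skiboot's.

#include <stdint.h>
#include <stdbool.h>

/* Pack a NetFn/command pair into one code and pull it apart again. */
#define MKCODE(netfn, cmd)	(((uint32_t)(netfn) << 8) | (cmd))
#define CODE_CMD(code)		((uint8_t)((code) & 0xff))
#define CODE_NETFN(code)	((uint8_t)((code) >> 8))

/* On the wire the NetFn lives in the top 6 bits of the byte, the LUN
 * in the low 2 bits. */
static inline uint8_t netfn_lun(uint8_t netfn, uint8_t lun)
{
	return (uint8_t)((netfn << 2) | (lun & 0x3));
}

/* A response NetFn is always the request NetFn + 1. */
static inline bool is_matching_response(uint8_t req_netfn, uint8_t rsp_netfn)
{
	return rsp_netfn == (uint8_t)(req_netfn + 1);
}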
*/ + if (!ipmi_present()) + return OPAL_HARDWARE; + + if (!msg) { + prerror("%s: Attempting to queue NULL message\n", __func__); + return OPAL_PARAMETER; + } + + return msg->backend->queue_msg(msg); +} + +int ipmi_dequeue_msg(struct ipmi_msg *msg) +{ + if (!ipmi_present()) + return OPAL_HARDWARE; + + if (!msg) { + prerror("%s: Attempting to dequeue NULL message\n", __func__); + return OPAL_PARAMETER; + } + + return msg->backend->dequeue_msg(msg); +} + +void ipmi_cmd_done(uint8_t cmd, uint8_t netfn, uint8_t cc, struct ipmi_msg *msg) +{ + msg->cc = cc; + if (msg->cmd != cmd) { + prerror("IPMI: Incorrect cmd 0x%02x in response\n", cmd); + cc = IPMI_ERR_UNSPECIFIED; + } + + if ((msg->netfn >> 2) + 1 != (netfn >> 2)) { + prerror("IPMI: Incorrect netfn 0x%02x in response\n", netfn >> 2); + cc = IPMI_ERR_UNSPECIFIED; + } + msg->netfn = netfn; + + if (cc != IPMI_CC_NO_ERROR) { + prlog(PR_DEBUG, "IPMI: Got error response. cmd=0x%x, netfn=0x%x," + " rc=0x%02x\n", msg->cmd, msg->netfn >> 2, msg->cc); + + assert(msg->error); + msg->error(msg); + } else if (msg->complete) + msg->complete(msg); + + /* At this point the message has should have been freed by the + completion functions. */ + + /* If this is a synchronous message flag that we are done */ + if (msg == sync_msg) { + sync_msg = NULL; + barrier(); + } +} + +void ipmi_queue_msg_sync(struct ipmi_msg *msg) +{ + void (*poll)(void) = msg->backend->poll; + + if (!ipmi_present()) + return; + + if (!msg) { + prerror("%s: Attempting to queue NULL message\n", __func__); + return; + } + + lock(&sync_lock); + while (sync_msg); + sync_msg = msg; + if (msg->backend->disable_retry && !opal_booting()) + msg->backend->disable_retry(msg); + ipmi_queue_msg_head(msg); + unlock(&sync_lock); + + /* + * BT response handling relies on a timer. We can't just run all + * timers because we may have been called with a lock that a timer + * wants, and they're generally not written to cope with that. + * So, just run whatever the IPMI backend needs to make forward + * progress. + */ + while (sync_msg == msg) { + if (poll) + poll(); + time_wait_ms(10); + } +} + +static void ipmi_read_event_complete(struct ipmi_msg *msg) +{ + prlog(PR_DEBUG, "IPMI read event %02x complete: %d bytes. cc: %02x\n", + msg->cmd, msg->resp_size, msg->cc); + + /* Handle power control & PNOR handshake events */ + ipmi_parse_sel(msg); + + ipmi_free_msg(msg); +} + +static void ipmi_get_message_flags_complete(struct ipmi_msg *msg) +{ + uint8_t flags = msg->data[0]; + + ipmi_free_msg(msg); + + prlog(PR_DEBUG, "IPMI Get Message Flags: %02x\n", flags); + + /* Once we see an interrupt we assume the payload has + * booted. We disable the wdt and let the OS setup its own + * wdt. + * + * This is also where we consider the OS to be booted, so we set + * the boot count sensor */ + if (flags & IPMI_MESSAGE_FLAGS_WATCHDOG_PRE_TIMEOUT) { + ipmi_wdt_stop(); + ipmi_set_boot_count(); + } + + /* Message available in the event buffer? Queue a Read Event command + * to retrieve it. The flag is cleared by performing a read */ + if (flags & IPMI_MESSAGE_FLAGS_EVENT_BUFFER) { + msg = ipmi_mkmsg(IPMI_DEFAULT_INTERFACE, IPMI_READ_EVENT, + ipmi_read_event_complete, NULL, NULL, 0, 16); + ipmi_queue_msg(msg); + } +} + +void ipmi_sms_attention(void) +{ + struct ipmi_msg *msg; + + if (!ipmi_present()) + return; + + /* todo: when we handle multiple IPMI interfaces, we'll need to + * ensure that this message is associated with the appropriate + * backend. 
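ipmi_queue_msg_sync() above turns the asynchronous queue/complete interface into a blocking call: the message is parked in sync_msg, the completion path clears that pointer, and the caller spins driving only the owning backend's poll hook until it does. A self-contained sketch of the pattern with hypothetical names.

#include <stdbool.h>
#include <stddef.h>

/* All names here are illustrative, not skiboot's. */
struct request { bool done; };

static struct request *volatile pending;	/* cleared on completion */

/* Completion callback, invoked from the backend's poll path. */
static void complete(struct request *req)
{
	req->done = true;
	if (req == pending)
		pending = NULL;			/* releases the waiter */
}

/* Stand-in for the backend poll hook that drives the hardware and
 * eventually calls the completion callback. */
static void backend_poll(void)
{
	if (pending)
		complete(pending);		/* pretend the reply arrived */
}

/* Blocking submit: park the request, then poll until the completion
 * path clears it, as ipmi_queue_msg_sync() does. */
static void submit_and_wait(struct request *req)
{
	pending = req;
	while (pending == req)
		backend_poll();
}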
*/ + msg = ipmi_mkmsg(IPMI_DEFAULT_INTERFACE, IPMI_GET_MESSAGE_FLAGS, + ipmi_get_message_flags_complete, NULL, NULL, 0, 1); + + ipmi_queue_msg(msg); +} + +void ipmi_register_backend(struct ipmi_backend *backend) +{ + /* We only support one backend at the moment */ + assert(backend->alloc_msg); + assert(backend->free_msg); + assert(backend->queue_msg); + assert(backend->dequeue_msg); + ipmi_backend = backend; + ipmi_backend->opal_event_ipmi_recv = opal_dynamic_event_alloc(); +} + +bool ipmi_present(void) +{ + return ipmi_backend != NULL; +} diff --git a/roms/skiboot/core/lock.c b/roms/skiboot/core/lock.c new file mode 100644 index 000000000..f0ab595b1 --- /dev/null +++ b/roms/skiboot/core/lock.c @@ -0,0 +1,336 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* + * Simple spinlock + * + * Copyright 2013-2019 IBM Corp. + */ + +#include +#include +#include +#include +#include +#include +#include + +/* Set to bust locks. Note, this is initialized to true because our + * lock debugging code is not going to work until we have the per + * CPU data initialized + */ +bool bust_locks = true; + +#define LOCK_TIMEOUT_MS 5000 + +#ifdef DEBUG_LOCKS + +static void __nomcount lock_error(struct lock *l, const char *reason, uint16_t err) +{ + fprintf(stderr, "LOCK ERROR: %s @%p (state: 0x%016llx)\n", + reason, l, l->lock_val); + op_display(OP_FATAL, OP_MOD_LOCK, err); + + abort(); +} + +static inline void __nomcount lock_check(struct lock *l) +{ + if ((l->lock_val & 1) && (l->lock_val >> 32) == this_cpu()->pir) + lock_error(l, "Invalid recursive lock", 0); +} + +static inline void __nomcount unlock_check(struct lock *l) +{ + if (!(l->lock_val & 1)) + lock_error(l, "Unlocking unlocked lock", 1); + + if ((l->lock_val >> 32) != this_cpu()->pir) + lock_error(l, "Unlocked non-owned lock", 2); + + if (l->in_con_path && this_cpu()->con_suspend == 0) + lock_error(l, "Unlock con lock with console not suspended", 3); + + if (list_empty(&this_cpu()->locks_held)) + lock_error(l, "Releasing lock we don't hold depth", 4); +} + +static inline bool __nomcount __try_lock(struct cpu_thread *cpu, struct lock *l) +{ + uint64_t val; + + val = cpu->pir; + val <<= 32; + val |= 1; + + barrier(); + if (__cmpxchg64(&l->lock_val, 0, val) == 0) { + sync(); + return true; + } + return false; +} + +static inline bool lock_timeout(unsigned long start) +{ + /* Print warning if lock has been spinning for more than TIMEOUT_MS */ + unsigned long wait = tb_to_msecs(mftb()); + + if (wait - start > LOCK_TIMEOUT_MS) { + /* + * If the timebase is invalid, we shouldn't + * throw an error. This is possible with pending HMIs + * that need to recover TB. + */ + if( !(mfspr(SPR_TFMR) & SPR_TFMR_TB_VALID)) + return false; + return true; + } + + return false; +} +#else +static inline void lock_check(struct lock *l) { }; +static inline void unlock_check(struct lock *l) { }; +static inline bool lock_timeout(unsigned long s) { return false; } +#endif /* DEBUG_LOCKS */ + +#if defined(DEADLOCK_CHECKER) && defined(DEBUG_LOCKS) + +static struct lock dl_lock = { + .lock_val = 0, + .in_con_path = true, + .owner = LOCK_CALLER +}; + +/* Find circular dependencies in the lock requests. 
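The lock word used above encodes ownership directly: zero means free, otherwise the value is (owner PIR << 32) | 1, which is what lets lock_check()/unlock_check() detect recursion and foreign unlocks, and __try_lock() claims the word with a 64-bit compare-and-swap against zero. A hedged sketch of the same encoding using C11 atomics in place of skiboot's __cmpxchg64() and explicit barriers.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

/* Lock word layout: 0 = free, otherwise (owner_pir << 32) | 1. */
static inline uint64_t lock_word(uint32_t pir)
{
	return ((uint64_t)pir << 32) | 1;
}

/* Try to take the lock: a single CAS from 0 to the owner's word.
 * Acquire ordering stands in for the barrier()/sync() pair above. */
static inline bool try_lock_word(_Atomic uint64_t *val, uint32_t pir)
{
	uint64_t expected = 0;

	return atomic_compare_exchange_strong_explicit(val, &expected,
						       lock_word(pir),
						       memory_order_acquire,
						       memory_order_relaxed);
}

/* Ownership test used by the recursion/unlock checks. */
static inline bool lock_word_held_by(uint64_t val, uint32_t pir)
{
	return val == lock_word(pir);
}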
*/ +static __nomcount inline bool check_deadlock(void) +{ + uint32_t lock_owner, start, i; + struct cpu_thread *next_cpu; + struct lock *next; + + next = this_cpu()->requested_lock; + start = this_cpu()->pir; + i = 0; + + while (i < cpu_max_pir) { + + if (!next) + return false; + + if (!(next->lock_val & 1) || next->in_con_path) + return false; + + lock_owner = next->lock_val >> 32; + + if (lock_owner == start) + return true; + + next_cpu = find_cpu_by_pir_nomcount(lock_owner); + + if (!next_cpu) + return false; + + next = next_cpu->requested_lock; + i++; + } + + return false; +} + +static void add_lock_request(struct lock *l) +{ + struct cpu_thread *curr = this_cpu(); + bool dead; + + if (curr->state != cpu_state_active && + curr->state != cpu_state_os) + return; + + /* + * For deadlock detection we must keep the lock states constant + * while doing the deadlock check. However we need to avoid + * clashing with the stack checker, so no mcount and use an + * inline implementation of the lock for the dl_lock + */ + for (;;) { + if (__try_lock(curr, &dl_lock)) + break; + smt_lowest(); + while (dl_lock.lock_val) + barrier(); + smt_medium(); + } + + curr->requested_lock = l; + + dead = check_deadlock(); + + lwsync(); + dl_lock.lock_val = 0; + + if (dead) + lock_error(l, "Deadlock detected", 0); +} + +static void remove_lock_request(void) +{ + this_cpu()->requested_lock = NULL; +} +#else +static inline void add_lock_request(struct lock *l) { }; +static inline void remove_lock_request(void) { }; +#endif /* #if defined(DEADLOCK_CHECKER) && defined(DEBUG_LOCKS) */ + +bool lock_held_by_me(struct lock *l) +{ + uint64_t pir64 = this_cpu()->pir; + + return l->lock_val == ((pir64 << 32) | 1); +} + +bool try_lock_caller(struct lock *l, const char *owner) +{ + struct cpu_thread *cpu = this_cpu(); + + if (bust_locks) + return true; + + if (l->in_con_path) + cpu->con_suspend++; + if (__try_lock(cpu, l)) { + l->owner = owner; + +#ifdef DEBUG_LOCKS_BACKTRACE + backtrace_create(l->bt_buf, LOCKS_BACKTRACE_MAX_ENTS, + &l->bt_metadata); +#endif + + list_add(&cpu->locks_held, &l->list); + return true; + } + if (l->in_con_path) + cpu->con_suspend--; + return false; +} + +void lock_caller(struct lock *l, const char *owner) +{ + bool timeout_warn = false; + unsigned long start = 0; + + if (bust_locks) + return; + + lock_check(l); + + if (try_lock_caller(l, owner)) + return; + add_lock_request(l); + +#ifdef DEBUG_LOCKS + /* + * Ensure that we get a valid start value + * as we may be handling TFMR errors and taking + * a lock to do so, so timebase could be garbage + */ + if( (mfspr(SPR_TFMR) & SPR_TFMR_TB_VALID)) + start = tb_to_msecs(mftb()); +#endif + + for (;;) { + if (try_lock_caller(l, owner)) + break; + smt_lowest(); + while (l->lock_val) + barrier(); + smt_medium(); + + if (start && !timeout_warn && lock_timeout(start)) { + /* + * Holding the lock request while printing a + * timeout and taking console locks can result + * in deadlock fals positive if the lock owner + * tries to take the console lock. So drop it. 
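check_deadlock() above walks the chain "lock I am waiting for, CPU that owns it, lock that CPU is waiting for, ..." and reports a deadlock if the chain loops back to the requesting CPU, bounding the walk by the number of CPUs. A simplified sketch of that cycle walk with hypothetical structures.

#include <stdbool.h>
#include <stdint.h>

/* Simplified stand-ins for cpu_thread / lock, for illustration only. */
struct xlock;
struct xcpu {
	uint32_t	pir;
	struct xlock	*requested_lock;
};
struct xlock {
	bool		held;
	struct xcpu	*owner;		/* valid only while held */
};

/* Follow want -> owner -> want -> ... from 'self'; a cycle back to
 * self means the requested lock can never be granted. The walk is
 * bounded by ncpus steps, mirroring the cpu_max_pir bound above. */
static bool would_deadlock(const struct xcpu *self, unsigned int ncpus)
{
	const struct xlock *next = self->requested_lock;
	unsigned int i;

	for (i = 0; i < ncpus && next; i++) {
		if (!next->held)
			return false;	/* chain ends at a free lock */
		if (next->owner == self)
			return true;	/* cycle back to the requester */
		next = next->owner->requested_lock;
	}
	return false;
}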
+ */ + remove_lock_request(); + prlog(PR_WARNING, "WARNING: Lock has been spinning for over %dms\n", LOCK_TIMEOUT_MS); + backtrace(); + add_lock_request(l); + timeout_warn = true; + } + } + + remove_lock_request(); +} + +void unlock(struct lock *l) +{ + struct cpu_thread *cpu = this_cpu(); + + if (bust_locks) + return; + + unlock_check(l); + + l->owner = NULL; + list_del(&l->list); + lwsync(); + l->lock_val = 0; + + /* WARNING: On fast reboot, we can be reset right at that + * point, so the reset_lock in there cannot be in the con path + */ + if (l->in_con_path) { + cpu->con_suspend--; + if (cpu->con_suspend == 0 && cpu->con_need_flush) + flush_console(); + } +} + +bool lock_recursive_caller(struct lock *l, const char *caller) +{ + if (bust_locks) + return false; + + if (lock_held_by_me(l)) + return false; + + lock_caller(l, caller); + return true; +} + +void init_locks(void) +{ + bust_locks = false; +} + +void dump_locks_list(void) +{ + struct lock *l; + + prlog(PR_ERR, "Locks held:\n"); + list_for_each(&this_cpu()->locks_held, l, list) { + prlog(PR_ERR, " %s\n", l->owner); +#ifdef DEBUG_LOCKS_BACKTRACE + backtrace_print(l->bt_buf, &l->bt_metadata, NULL, NULL, true); +#endif + } +} + +void drop_my_locks(bool warn) +{ + struct lock *l; + + disable_fast_reboot("Lock corruption"); + while((l = list_top(&this_cpu()->locks_held, struct lock, list)) != NULL) { + if (warn) { + prlog(PR_ERR, " %s\n", l->owner); +#ifdef DEBUG_LOCKS_BACKTRACE + backtrace_print(l->bt_buf, &l->bt_metadata, NULL, NULL, + true); +#endif + } + unlock(l); + } +} + diff --git a/roms/skiboot/core/malloc.c b/roms/skiboot/core/malloc.c new file mode 100644 index 000000000..76996fff4 --- /dev/null +++ b/roms/skiboot/core/malloc.c @@ -0,0 +1,76 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* + * Implement malloc()/free() etc on top of our memory region allocator, + * which provides mem_alloc()/mem_free(). + * + * Copyright 2013-2015 IBM Corp. + */ + +#include +#include +#include +#include + +#define DEFAULT_ALIGN __alignof__(long) + +void *__memalign(size_t blocksize, size_t bytes, const char *location) +{ + void *p; + + lock(&skiboot_heap.free_list_lock); + p = mem_alloc(&skiboot_heap, bytes, blocksize, location); + unlock(&skiboot_heap.free_list_lock); + + return p; +} + +void *__malloc(size_t bytes, const char *location) +{ + return __memalign(DEFAULT_ALIGN, bytes, location); +} + +void __free(void *p, const char *location) +{ + lock(&skiboot_heap.free_list_lock); + mem_free(&skiboot_heap, p, location); + unlock(&skiboot_heap.free_list_lock); +} + +void *__realloc(void *ptr, size_t size, const char *location) +{ + void *newptr; + + /* Two classic malloc corner cases. 
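The "two classic malloc corner cases" referred to above are the standard realloc() rules that __realloc() reproduces: a zero size behaves like free() and yields NULL, and a NULL pointer behaves like malloc(). A short usage sketch against the C library equivalents.

#include <stdlib.h>
#include <string.h>

/* The two corner cases skiboot's __realloc() mirrors. */
static void realloc_corner_cases(void)
{
	char *p = realloc(NULL, 16);	/* NULL pointer: acts as malloc(16) */
	if (!p)
		return;
	strcpy(p, "hello");

	p = realloc(p, 0);		/* zero size: frees, result unusable;
					 * the wrapper above returns NULL here */
	free(p);			/* safe either way: free(NULL) is a no-op */
}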
*/ + if (!size) { + __free(ptr, location); + return NULL; + } + if (!ptr) + return __malloc(size, location); + + lock(&skiboot_heap.free_list_lock); + if (mem_resize(&skiboot_heap, ptr, size, location)) { + newptr = ptr; + } else { + newptr = mem_alloc(&skiboot_heap, size, DEFAULT_ALIGN, + location); + if (newptr) { + size_t copy = mem_allocated_size(ptr); + if (copy > size) + copy = size; + memcpy(newptr, ptr, copy); + mem_free(&skiboot_heap, ptr, location); + } + } + unlock(&skiboot_heap.free_list_lock); + return newptr; +} + +void *__zalloc(size_t bytes, const char *location) +{ + void *p = __malloc(bytes, location); + + if (p) + memset(p, 0, bytes); + return p; +} diff --git a/roms/skiboot/core/mce.c b/roms/skiboot/core/mce.c new file mode 100644 index 000000000..47674abcb --- /dev/null +++ b/roms/skiboot/core/mce.c @@ -0,0 +1,309 @@ +// SPDX-License-Identifier: Apache-2.0 +/* + * Machine Check Exceptions + * + * Copyright 2020 IBM Corp. + */ + +#define pr_fmt(fmt) "MCE: " fmt + +#include +#include +#include + +#define SRR1_MC_LOADSTORE(srr1) ((srr1) & PPC_BIT(42)) + +struct mce_ierror_table { + unsigned long srr1_mask; + unsigned long srr1_value; + uint64_t type; + const char *error_str; +}; + +static const struct mce_ierror_table mce_p9_ierror_table[] = { +{ 0x00000000081c0000, 0x0000000000040000, + MCE_INSNFETCH | MCE_MEMORY_ERROR | MCE_INVOLVED_EA, + "instruction fetch memory uncorrectable error", }, +{ 0x00000000081c0000, 0x0000000000080000, + MCE_INSNFETCH | MCE_SLB_ERROR | MCE_INVOLVED_EA, + "instruction fetch SLB parity error", }, +{ 0x00000000081c0000, 0x00000000000c0000, + MCE_INSNFETCH | MCE_SLB_ERROR | MCE_INVOLVED_EA, + "instruction fetch SLB multi-hit error", }, +{ 0x00000000081c0000, 0x0000000000100000, + MCE_INSNFETCH | MCE_INVOLVED_EA | MCE_ERAT_ERROR, + "instruction fetch ERAT multi-hit error", }, +{ 0x00000000081c0000, 0x0000000000140000, + MCE_INSNFETCH | MCE_INVOLVED_EA | MCE_TLB_ERROR, + "instruction fetch TLB multi-hit error", }, +{ 0x00000000081c0000, 0x0000000000180000, + MCE_INSNFETCH | MCE_MEMORY_ERROR | MCE_TABLE_WALK | MCE_INVOLVED_EA, + "instruction fetch page table access memory uncorrectable error", }, +{ 0x00000000081c0000, 0x00000000001c0000, + MCE_INSNFETCH | MCE_INVOLVED_EA, + "instruction fetch to foreign address", }, +{ 0x00000000081c0000, 0x0000000008000000, + MCE_INSNFETCH | MCE_INVOLVED_EA, + "instruction fetch foreign link time-out", }, +{ 0x00000000081c0000, 0x0000000008040000, + MCE_INSNFETCH | MCE_TABLE_WALK | MCE_INVOLVED_EA, + "instruction fetch page table access foreign link time-out", }, +{ 0x00000000081c0000, 0x00000000080c0000, + MCE_INSNFETCH | MCE_INVOLVED_EA, + "instruction fetch real address error", }, +{ 0x00000000081c0000, 0x0000000008100000, + MCE_INSNFETCH | MCE_TABLE_WALK | MCE_INVOLVED_EA, + "instruction fetch page table access real address error", }, +{ 0x00000000081c0000, 0x0000000008140000, + MCE_LOADSTORE | MCE_IMPRECISE, + "store real address asynchronous error", }, +{ 0x00000000081c0000, 0x0000000008180000, + MCE_LOADSTORE | MCE_IMPRECISE, + "store foreign link time-out asynchronous error", }, +{ 0x00000000081c0000, 0x00000000081c0000, + MCE_INSNFETCH | MCE_TABLE_WALK | MCE_INVOLVED_EA, + "instruction fetch page table access to foreign address", }, +{ 0 } }; + +static const struct mce_ierror_table mce_p10_ierror_table[] = { +{ 0x00000000081c0000, 0x0000000000040000, + MCE_INSNFETCH | MCE_MEMORY_ERROR | MCE_INVOLVED_EA, + "instruction fetch memory uncorrectable error", }, +{ 0x00000000081c0000, 0x0000000000080000, + 
MCE_INSNFETCH | MCE_SLB_ERROR | MCE_INVOLVED_EA, + "instruction fetch SLB parity error", }, +{ 0x00000000081c0000, 0x00000000000c0000, + MCE_INSNFETCH | MCE_SLB_ERROR | MCE_INVOLVED_EA, + "instruction fetch SLB multi-hit error", }, +{ 0x00000000081c0000, 0x0000000000100000, + MCE_INSNFETCH | MCE_INVOLVED_EA | MCE_ERAT_ERROR, + "instruction fetch ERAT multi-hit error", }, +{ 0x00000000081c0000, 0x0000000000140000, + MCE_INSNFETCH | MCE_INVOLVED_EA | MCE_TLB_ERROR, + "instruction fetch TLB multi-hit error", }, +{ 0x00000000081c0000, 0x0000000000180000, + MCE_INSNFETCH | MCE_MEMORY_ERROR | MCE_TABLE_WALK | MCE_INVOLVED_EA, + "instruction fetch page table access memory uncorrectable error", }, +{ 0x00000000081c0000, 0x00000000001c0000, + MCE_INSNFETCH | MCE_INVOLVED_EA, + "instruction fetch to control real address", }, +{ 0x00000000081c0000, 0x00000000080c0000, + MCE_INSNFETCH | MCE_INVOLVED_EA, + "instruction fetch real address error", }, +{ 0x00000000081c0000, 0x0000000008100000, + MCE_INSNFETCH | MCE_TABLE_WALK | MCE_INVOLVED_EA, + "instruction fetch page table access real address error", }, +{ 0x00000000081c0000, 0x0000000008140000, + MCE_LOADSTORE | MCE_IMPRECISE, + "store real address asynchronous error", }, +{ 0x00000000081c0000, 0x00000000081c0000, + MCE_INSNFETCH | MCE_TABLE_WALK | MCE_INVOLVED_EA, + "instruction fetch page table access to control real address", }, +{ 0 } }; + +struct mce_derror_table { + unsigned long dsisr_value; + uint64_t type; + const char *error_str; +}; + +static const struct mce_derror_table mce_p9_derror_table[] = { +{ 0x00008000, + MCE_LOADSTORE | MCE_MEMORY_ERROR, + "load/store memory uncorrectable error", }, +{ 0x00004000, + MCE_LOADSTORE | MCE_MEMORY_ERROR | MCE_TABLE_WALK | MCE_INVOLVED_EA, + "load/store page table access memory uncorrectable error", }, +{ 0x00002000, + MCE_LOADSTORE | MCE_INVOLVED_EA, + "load/store foreign link time-out", }, +{ 0x00001000, + MCE_LOADSTORE | MCE_TABLE_WALK | MCE_INVOLVED_EA, + "load/store page table access foreign link time-out", }, +{ 0x00000800, + MCE_LOADSTORE | MCE_INVOLVED_EA | MCE_ERAT_ERROR, + "load/store ERAT multi-hit error", }, +{ 0x00000400, + MCE_LOADSTORE | MCE_INVOLVED_EA | MCE_TLB_ERROR, + "load/store TLB multi-hit error", }, +{ 0x00000200, + MCE_LOADSTORE | MCE_TLBIE_ERROR, + "TLBIE or TLBIEL instruction programming error", }, +{ 0x00000100, + MCE_LOADSTORE | MCE_INVOLVED_EA | MCE_SLB_ERROR, + "load/store SLB parity error", }, +{ 0x00000080, + MCE_LOADSTORE | MCE_INVOLVED_EA | MCE_SLB_ERROR, + "load/store SLB multi-hit error", }, +{ 0x00000040, + MCE_LOADSTORE | MCE_INVOLVED_EA, + "load real address error", }, +{ 0x00000020, + MCE_LOADSTORE | MCE_TABLE_WALK, + "load/store page table access real address error", }, +{ 0x00000010, + MCE_LOADSTORE | MCE_TABLE_WALK, + "load/store page table access to foreign address", }, +{ 0x00000008, + MCE_LOADSTORE, + "load/store to foreign address", }, +{ 0 } }; + +static const struct mce_derror_table mce_p10_derror_table[] = { +{ 0x00008000, + MCE_LOADSTORE | MCE_MEMORY_ERROR, + "load/store memory uncorrectable error", }, +{ 0x00004000, + MCE_LOADSTORE | MCE_MEMORY_ERROR | MCE_TABLE_WALK | MCE_INVOLVED_EA, + "load/store page table access memory uncorrectable error", }, +{ 0x00000800, + MCE_LOADSTORE | MCE_INVOLVED_EA | MCE_ERAT_ERROR, + "load/store ERAT multi-hit error", }, +{ 0x00000400, + MCE_LOADSTORE | MCE_INVOLVED_EA | MCE_TLB_ERROR, + "load/store TLB multi-hit error", }, +{ 0x00000200, + MCE_TLBIE_ERROR, + "TLBIE or TLBIEL instruction programming error", }, +{ 
0x00000100, + MCE_LOADSTORE | MCE_INVOLVED_EA | MCE_SLB_ERROR, + "load/store SLB parity error", }, +{ 0x00000080, + MCE_LOADSTORE | MCE_INVOLVED_EA | MCE_SLB_ERROR, + "load/store SLB multi-hit error", }, +{ 0x00000040, + MCE_LOADSTORE | MCE_INVOLVED_EA, + "load real address error", }, +{ 0x00000020, + MCE_LOADSTORE | MCE_TABLE_WALK, + "load/store page table access real address error", }, +{ 0x00000010, + MCE_LOADSTORE | MCE_TABLE_WALK, + "load/store page table access to control real address", }, +{ 0x00000008, + MCE_LOADSTORE, + "load/store to control real address", }, +{ 0 } }; + +static void decode_ierror(const struct mce_ierror_table table[], + uint64_t srr1, + uint64_t *type, + const char **error_str) +{ + int i; + + for (i = 0; table[i].srr1_mask; i++) { + if ((srr1 & table[i].srr1_mask) != table[i].srr1_value) + continue; + + *type = table[i].type; + *error_str = table[i].error_str; + } +} + +static void decode_derror(const struct mce_derror_table table[], + uint32_t dsisr, + uint64_t *type, + const char **error_str) +{ + int i; + + for (i = 0; table[i].dsisr_value; i++) { + if (!(dsisr & table[i].dsisr_value)) + continue; + + *type = table[i].type; + *error_str = table[i].error_str; + } +} + +static void decode_mce_p9(uint64_t srr0, uint64_t srr1, + uint32_t dsisr, uint64_t dar, + uint64_t *type, const char **error_str, + uint64_t *address) +{ + /* + * On POWER9 DD2.1 and below, it's possible to get a machine check + * caused by a paste instruction where only DSISR bit 25 is set. This + * will result in the MCE handler seeing an unknown event and the + * kernel crashing. An MCE that occurs like this is spurious, so we + * don't need to do anything in terms of servicing it. If there is + * something that needs to be serviced, the CPU will raise the MCE + * again with the correct DSISR so that it can be serviced properly. + * So detect this case and mark it as handled. + */ + if (SRR1_MC_LOADSTORE(srr1) && dsisr == 0x02000000) { + *type = MCE_NO_ERROR; + *error_str = "no error (superfluous machine check)"; + return; + } + + /* + * Async machine check due to bad real address from store or foreign + * link time out comes with the load/store bit (PPC bit 42) set in + * SRR1, but the cause comes in SRR1 not DSISR. Clear bit 42 so we're + * directed to the ierror table so it will find the cause (which + * describes it correctly as a store error). + */ + if (SRR1_MC_LOADSTORE(srr1) && + ((srr1 & 0x081c0000) == 0x08140000 || + (srr1 & 0x081c0000) == 0x08180000)) { + srr1 &= ~PPC_BIT(42); + } + + if (SRR1_MC_LOADSTORE(srr1)) { + decode_derror(mce_p9_derror_table, dsisr, type, error_str); + if (*type & MCE_INVOLVED_EA) + *address = dar; + } else { + decode_ierror(mce_p9_ierror_table, srr1, type, error_str); + if (*type & MCE_INVOLVED_EA) + *address = srr0; + } +} + +static void decode_mce_p10(uint64_t srr0, uint64_t srr1, + uint32_t dsisr, uint64_t dar, + uint64_t *type, const char **error_str, + uint64_t *address) +{ + /* + * Async machine check due to bad real address from store or foreign + * link time out comes with the load/store bit (PPC bit 42) set in + * SRR1, but the cause comes in SRR1 not DSISR. Clear bit 42 so we're + * directed to the ierror table so it will find the cause (which + * describes it correctly as a store error). 
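decode_ierror() and decode_derror() above are plain table walks: each entry carries an SRR1 mask/value (or a DSISR bit), plus a type and a description, and any matching entry overwrites the result. A trimmed standalone version with a hypothetical two-entry table shows the same mechanics:

#include <stdint.h>
#include <stdio.h>

struct ierror_ent {
	uint64_t srr1_mask;
	uint64_t srr1_value;
	const char *str;
};

/* Hypothetical two-entry table in the same mask/value style. */
static const struct ierror_ent table[] = {
	{ 0x00000000081c0000ull, 0x0000000000040000ull,
	  "instruction fetch memory uncorrectable error" },
	{ 0x00000000081c0000ull, 0x0000000000080000ull,
	  "instruction fetch SLB parity error" },
	{ 0, 0, NULL }
};

static const char *decode(uint64_t srr1)
{
	const char *str = "unknown error";
	int i;

	for (i = 0; table[i].srr1_mask; i++) {
		if ((srr1 & table[i].srr1_mask) != table[i].srr1_value)
			continue;
		str = table[i].str;
	}
	return str;
}

int main(void)
{
	printf("%s\n", decode(0x0000000000080000ull));
	return 0;
}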
+ */ + if (SRR1_MC_LOADSTORE(srr1) && + (srr1 & 0x081c0000) == 0x08140000) { + srr1 &= ~PPC_BIT(42); + } + + if (SRR1_MC_LOADSTORE(srr1)) { + decode_derror(mce_p10_derror_table, dsisr, type, error_str); + if (*type & MCE_INVOLVED_EA) + *address = dar; + } else { + decode_ierror(mce_p10_ierror_table, srr1, type, error_str); + if (*type & MCE_INVOLVED_EA) + *address = srr0; + } +} + +void decode_mce(uint64_t srr0, uint64_t srr1, + uint32_t dsisr, uint64_t dar, + uint64_t *type, const char **error_str, + uint64_t *address) +{ + *type = MCE_UNKNOWN; + *error_str = "unknown error"; + *address = 0; + + if (proc_gen == proc_gen_p9) { + decode_mce_p9(srr0, srr1, dsisr, dar, type, error_str, address); + } else if (proc_gen == proc_gen_p10) { + decode_mce_p10(srr0, srr1, dsisr, dar, type, error_str, address); + } else { + *error_str = "unknown error (processor not supported)"; + } +} diff --git a/roms/skiboot/core/mem_region.c b/roms/skiboot/core/mem_region.c new file mode 100644 index 000000000..36de2d094 --- /dev/null +++ b/roms/skiboot/core/mem_region.c @@ -0,0 +1,1555 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* + * Simple memory allocator + * + * Copyright 2013-2018 IBM Corp. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Memory poisoning on free (if POISON_MEM_REGION set to 1) */ +#ifdef DEBUG +#define POISON_MEM_REGION 1 +#else +#define POISON_MEM_REGION 0 +#endif +#define POISON_MEM_REGION_WITH 0x99 +#define POISON_MEM_REGION_LIMIT 1*1024*1024*1024 + +/* Locking: The mem_region_lock protects the regions list from concurrent + * updates. Additions to, or removals from, the region list must be done + * with this lock held. This is typically done when we're establishing + * the memory & reserved regions. + * + * Each region has a lock (region->free_list_lock) to protect the free list + * from concurrent modification. This lock is used when we're allocating + * memory out of a specific region. + * + * If both locks are needed (eg, __local_alloc, where we need to find a region, + * then allocate from it), the mem_region_lock must be acquired before (and + * released after) the per-region lock. 
+ */ +struct lock mem_region_lock = LOCK_UNLOCKED; + +static struct list_head regions = LIST_HEAD_INIT(regions); +static struct list_head early_reserves = LIST_HEAD_INIT(early_reserves); + +static bool mem_region_init_done = false; +static bool mem_regions_finalised = false; + +unsigned long top_of_ram = SKIBOOT_BASE + SKIBOOT_SIZE; + +static struct mem_region skiboot_os_reserve = { + .name = "ibm,os-reserve", + .start = 0, + .len = SKIBOOT_BASE, + .type = REGION_OS, +}; + +struct mem_region skiboot_heap = { + .name = "ibm,firmware-heap", + .start = HEAP_BASE, + .len = HEAP_SIZE, + .type = REGION_SKIBOOT_HEAP, +}; + +static struct mem_region skiboot_code_and_text = { + .name = "ibm,firmware-code", + .start = SKIBOOT_BASE, + .len = HEAP_BASE - SKIBOOT_BASE, + .type = REGION_SKIBOOT_FIRMWARE, +}; + +static struct mem_region skiboot_after_heap = { + .name = "ibm,firmware-data", + .start = HEAP_BASE + HEAP_SIZE, + .len = SKIBOOT_BASE + SKIBOOT_SIZE - (HEAP_BASE + HEAP_SIZE), + .type = REGION_SKIBOOT_FIRMWARE, +}; + +static struct mem_region skiboot_cpu_stacks = { + .name = "ibm,firmware-stacks", + .start = CPU_STACKS_BASE, + .len = 0, /* TBA */ + .type = REGION_SKIBOOT_FIRMWARE, +}; + +static struct mem_region skiboot_mambo_kernel = { + .name = "ibm,firmware-mambo-kernel", + .start = (unsigned long)KERNEL_LOAD_BASE, + .len = KERNEL_LOAD_SIZE, + .type = REGION_SKIBOOT_FIRMWARE, +}; + +static struct mem_region skiboot_mambo_initramfs = { + .name = "ibm,firmware-mambo-initramfs", + .start = (unsigned long)INITRAMFS_LOAD_BASE, + .len = INITRAMFS_LOAD_SIZE, + .type = REGION_SKIBOOT_FIRMWARE, +}; + + +struct alloc_hdr { + bool free : 1; + bool prev_free : 1; + bool printed : 1; + unsigned long num_longs : BITS_PER_LONG-3; /* Including header. */ + const char *location; +}; + +struct free_hdr { + struct alloc_hdr hdr; + struct list_node list; + /* ... unsigned long tailer; */ +}; + +#define ALLOC_HDR_LONGS (sizeof(struct alloc_hdr) / sizeof(long)) +#define ALLOC_MIN_LONGS (sizeof(struct free_hdr) / sizeof(long) + 1) + +/* Avoid ugly casts. */ +static void *region_start(const struct mem_region *region) +{ + return (void *)(unsigned long)region->start; +} + +/* Each free block has a tailer, so we can walk backwards. */ +static unsigned long *tailer(struct free_hdr *f) +{ + return (unsigned long *)f + f->hdr.num_longs - 1; +} + +/* This walks forward to the next hdr (or NULL if at the end). */ +static struct alloc_hdr *next_hdr(const struct mem_region *region, + const struct alloc_hdr *hdr) +{ + void *next; + + next = ((unsigned long *)hdr + hdr->num_longs); + if (next >= region_start(region) + region->len) + next = NULL; + return next; +} + +#if POISON_MEM_REGION == 1 +static void mem_poison(struct free_hdr *f) +{ + size_t poison_size = (void*)tailer(f) - (void*)(f+1); + + /* We only poison up to a limit, as otherwise boot is + * kinda slow */ + if (poison_size > POISON_MEM_REGION_LIMIT) + poison_size = POISON_MEM_REGION_LIMIT; + + memset(f+1, POISON_MEM_REGION_WITH, poison_size); +} +#endif + +/* Creates free block covering entire region. 
*/ +static void init_allocatable_region(struct mem_region *region) +{ + struct free_hdr *f = region_start(region); + assert(region->type == REGION_SKIBOOT_HEAP || + region->type == REGION_MEMORY); + f->hdr.num_longs = region->len / sizeof(long); + f->hdr.free = true; + f->hdr.prev_free = false; + *tailer(f) = f->hdr.num_longs; + list_head_init(®ion->free_list); + list_add(®ion->free_list, &f->list); +#if POISON_MEM_REGION == 1 + mem_poison(f); +#endif +} + +static void make_free(struct mem_region *region, struct free_hdr *f, + const char *location, bool skip_poison) +{ + struct alloc_hdr *next; + +#if POISON_MEM_REGION == 1 + if (!skip_poison) + mem_poison(f); +#else + (void)skip_poison; +#endif + + if (f->hdr.prev_free) { + struct free_hdr *prev; + unsigned long *prev_tailer = (unsigned long *)f - 1; + + assert(*prev_tailer); + prev = (void *)((unsigned long *)f - *prev_tailer); + assert(prev->hdr.free); + assert(!prev->hdr.prev_free); + + /* Expand to cover the one we just freed. */ + prev->hdr.num_longs += f->hdr.num_longs; + f = prev; + } else { + f->hdr.free = true; + f->hdr.location = location; + list_add(®ion->free_list, &f->list); + } + + /* Fix up tailer. */ + *tailer(f) = f->hdr.num_longs; + + /* If next is free, coalesce it */ + next = next_hdr(region, &f->hdr); + if (next) { + next->prev_free = true; + if (next->free) { + struct free_hdr *next_free = (void *)next; + list_del_from(®ion->free_list, &next_free->list); + /* Maximum of one level of recursion */ + make_free(region, next_free, location, true); + } + } +} + +/* Can we fit this many longs with this alignment in this free block? */ +static bool fits(struct free_hdr *f, size_t longs, size_t align, size_t *offset) +{ + *offset = 0; + + while (f->hdr.num_longs >= *offset + longs) { + size_t addr; + + addr = (unsigned long)f + + (*offset + ALLOC_HDR_LONGS) * sizeof(long); + if ((addr & (align - 1)) == 0) + return true; + + /* Don't make tiny chunks! */ + if (*offset == 0) + *offset = ALLOC_MIN_LONGS; + else + (*offset)++; + } + return false; +} + +static void discard_excess(struct mem_region *region, + struct alloc_hdr *hdr, size_t alloc_longs, + const char *location, bool skip_poison) +{ + /* Do we have excess? */ + if (hdr->num_longs > alloc_longs + ALLOC_MIN_LONGS) { + struct free_hdr *post; + + /* Set up post block. */ + post = (void *)hdr + alloc_longs * sizeof(long); + post->hdr.num_longs = hdr->num_longs - alloc_longs; + post->hdr.prev_free = false; + + /* Trim our block. */ + hdr->num_longs = alloc_longs; + + /* This coalesces as required. */ + make_free(region, post, location, skip_poison); + } +} + +static const char *hdr_location(const struct alloc_hdr *hdr) +{ + /* Corrupt: step carefully! */ + if (is_rodata(hdr->location)) + return hdr->location; + return "*CORRUPT*"; +} + +static void bad_header(const struct mem_region *region, + const struct alloc_hdr *hdr, + const char *during, + const char *location) +{ + /* Corrupt: step carefully! 
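The coalescing in make_free() above depends on the boundary-tag layout: a free block repeats its length (in longs) in its final long, so the block that follows it can step back over that tailer to find where the previous free block starts. A toy standalone illustration of the arithmetic, using a bare array of longs instead of the real alloc_hdr/free_hdr structures:

#include <assert.h>
#include <stdio.h>

/*
 * Toy layout: blocks are measured in longs; a free block writes its
 * length into both its first long (header) and its last long (tailer).
 */
#define HEAP_LONGS 16
static unsigned long heap[HEAP_LONGS];

static void mark_free(unsigned long off, unsigned long num_longs)
{
	heap[off] = num_longs;			/* header */
	heap[off + num_longs - 1] = num_longs;	/* tailer */
}

/* Given a block starting at 'off', find the start of the free block before it. */
static unsigned long prev_free_start(unsigned long off)
{
	unsigned long prev_len = heap[off - 1];	/* previous block's tailer */
	return off - prev_len;
}

int main(void)
{
	mark_free(0, 6);
	mark_free(6, 10);
	assert(prev_free_start(6) == 0);
	printf("free block before offset 6 starts at offset %lu\n",
	       prev_free_start(6));
	return 0;
}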
*/ + if (is_rodata(hdr->location)) + prerror("%p (in %s) %s at %s, previously %s\n", + hdr-1, region->name, during, location, hdr->location); + else + prerror("%p (in %s) %s at %s, previously %p\n", + hdr-1, region->name, during, location, hdr->location); + abort(); +} + +static bool region_is_reservable(struct mem_region *region) +{ + return region->type != REGION_OS; +} + +static bool region_is_reserved(struct mem_region *region) +{ + return region->type != REGION_OS && region->type != REGION_MEMORY; +} + +void mem_dump_allocs(void) +{ + struct mem_region *region; + struct alloc_hdr *h, *i; + + /* Second pass: populate property data */ + prlog(PR_INFO, "Memory regions:\n"); + list_for_each(®ions, region, list) { + if (!(region->type == REGION_SKIBOOT_HEAP || + region->type == REGION_MEMORY)) + continue; + prlog(PR_INFO, " 0x%012llx..%012llx : %s\n", + (long long)region->start, + (long long)(region->start + region->len - 1), + region->name); + if (region->free_list.n.next == NULL) { + prlog(PR_INFO, " no allocs\n"); + continue; + } + + /* + * XXX: When dumping the allocation list we coalase allocations + * with the same location and size into a single line. This is + * quadratic, but it makes the dump human-readable and the raw + * dump sometimes causes the log buffer to wrap. + */ + for (h = region_start(region); h; h = next_hdr(region, h)) + h->printed = false; + + for (h = region_start(region); h; h = next_hdr(region, h)) { + unsigned long bytes; + int count = 0; + + if (h->free) + continue; + if (h->printed) + continue; + + for (i = h; i; i = next_hdr(region, i)) { + if (i->free) + continue; + if (i->num_longs != h->num_longs) + continue; + if (strcmp(i->location, h->location)) + continue; + + i->printed = true; + count++; + } + + bytes = h->num_longs * sizeof(long); + prlog(PR_NOTICE, " % 8d allocs of 0x%.8lx bytes at %s (total 0x%lx)\n", + count, bytes, hdr_location(h), bytes * count); + } + } +} + +int64_t mem_dump_free(void) +{ + struct mem_region *region; + struct alloc_hdr *hdr; + int64_t total_free; + int64_t region_free; + + total_free = 0; + + prlog(PR_INFO, "Free space in HEAP memory regions:\n"); + list_for_each(®ions, region, list) { + if (!(region->type == REGION_SKIBOOT_HEAP || + region->type == REGION_MEMORY)) + continue; + region_free = 0; + + if (region->free_list.n.next == NULL) { + continue; + } + for (hdr = region_start(region); hdr; hdr = next_hdr(region, hdr)) { + if (!hdr->free) + continue; + + region_free+= hdr->num_longs * sizeof(long); + } + prlog(PR_INFO, "Region %s free: %"PRIx64"\n", + region->name, region_free); + total_free += region_free; + } + + prlog(PR_INFO, "Total free: %"PRIu64"\n", total_free); + + return total_free; +} + +static void *__mem_alloc(struct mem_region *region, size_t size, size_t align, + const char *location) +{ + size_t alloc_longs, offset; + struct free_hdr *f; + struct alloc_hdr *next; + + /* Align must be power of 2. */ + assert(!((align - 1) & align)); + + /* This should be a constant. */ + assert(is_rodata(location)); + + /* Unallocatable region? */ + if (!(region->type == REGION_SKIBOOT_HEAP || + region->type == REGION_MEMORY)) + return NULL; + + /* First allocation? */ + if (region->free_list.n.next == NULL) + init_allocatable_region(region); + + /* Don't do screwy sizes. */ + if (size > region->len) + return NULL; + + /* Don't do tiny alignments, we deal in long increments. */ + if (align < sizeof(long)) + align = sizeof(long); + + /* Convert size to number of longs, too. 
*/ + alloc_longs = (size + sizeof(long)-1) / sizeof(long) + ALLOC_HDR_LONGS; + + /* Can't be too small for when we free it, either. */ + if (alloc_longs < ALLOC_MIN_LONGS) + alloc_longs = ALLOC_MIN_LONGS; + + /* Walk free list. */ + list_for_each(®ion->free_list, f, list) { + /* We may have to skip some to meet alignment. */ + if (fits(f, alloc_longs, align, &offset)) + goto found; + } + + return NULL; + +found: + assert(f->hdr.free); + assert(!f->hdr.prev_free); + + /* This block is no longer free. */ + list_del_from(®ion->free_list, &f->list); + f->hdr.free = false; + f->hdr.location = location; + + next = next_hdr(region, &f->hdr); + if (next) { + assert(next->prev_free); + next->prev_free = false; + } + + if (offset != 0) { + struct free_hdr *pre = f; + + f = (void *)f + offset * sizeof(long); + assert(f >= pre + 1); + + /* Set up new header. */ + f->hdr.num_longs = pre->hdr.num_longs - offset; + /* f->hdr.prev_free will be set by make_free below. */ + f->hdr.free = false; + f->hdr.location = location; + + /* Fix up old header. */ + pre->hdr.num_longs = offset; + pre->hdr.prev_free = false; + + /* This coalesces as required. */ + make_free(region, pre, location, true); + } + + /* We might be too long; put the rest back. */ + discard_excess(region, &f->hdr, alloc_longs, location, true); + + /* Clear tailer for debugging */ + *tailer(f) = 0; + + /* Their pointer is immediately after header. */ + return &f->hdr + 1; +} + +void *mem_alloc(struct mem_region *region, size_t size, size_t align, + const char *location) +{ + static bool dumped = false; + void *r; + + assert(lock_held_by_me(®ion->free_list_lock)); + + r = __mem_alloc(region, size, align, location); + if (r) + return r; + + prerror("mem_alloc(0x%lx, 0x%lx, \"%s\", %s) failed !\n", + size, align, location, region->name); + if (!dumped) { + mem_dump_allocs(); + dumped = true; + } + + return NULL; +} + +void mem_free(struct mem_region *region, void *mem, const char *location) +{ + struct alloc_hdr *hdr; + + /* This should be a constant. */ + assert(is_rodata(location)); + + assert(lock_held_by_me(®ion->free_list_lock)); + + /* Freeing NULL is always a noop. */ + if (!mem) + return; + + /* Your memory is in the region, right? */ + assert(mem >= region_start(region) + sizeof(*hdr)); + assert(mem < region_start(region) + region->len); + + /* Grab header. */ + hdr = mem - sizeof(*hdr); + + if (hdr->free) + bad_header(region, hdr, "re-freed", location); + + make_free(region, (struct free_hdr *)hdr, location, false); +} + +size_t mem_allocated_size(const void *ptr) +{ + const struct alloc_hdr *hdr = ptr - sizeof(*hdr); + return hdr->num_longs * sizeof(long) - sizeof(struct alloc_hdr); +} + +bool mem_resize(struct mem_region *region, void *mem, size_t len, + const char *location) +{ + struct alloc_hdr *hdr, *next; + struct free_hdr *f; + + /* This should be a constant. */ + assert(is_rodata(location)); + + assert(lock_held_by_me(®ion->free_list_lock)); + + /* Get header. */ + hdr = mem - sizeof(*hdr); + if (hdr->free) + bad_header(region, hdr, "resize", location); + + /* Round up size to multiple of longs. */ + len = (sizeof(*hdr) + len + sizeof(long) - 1) / sizeof(long); + + /* Can't be too small for when we free it, either. */ + if (len < ALLOC_MIN_LONGS) + len = ALLOC_MIN_LONGS; + + /* Shrinking is simple. */ + if (len <= hdr->num_longs) { + hdr->location = location; + discard_excess(region, hdr, len, location, false); + return true; + } + + /* Check if we can expand. 
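Two small pieces of arithmetic in __mem_alloc() above are worth spelling out: the power-of-two test behind the alignment assert, and the round-up from a byte count to whole longs plus the header. A standalone sketch (ALLOC_HDR_LONGS is assumed to be 2 here purely for the example):

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

#define ALLOC_HDR_LONGS 2	/* assumed header size in longs, for illustration */

/* align is a power of two iff clearing the lowest set bit leaves nothing. */
static bool is_pow2(size_t align)
{
	return align && !((align - 1) & align);
}

/* Round a byte count up to whole longs and include the header. */
static size_t alloc_longs(size_t bytes)
{
	return (bytes + sizeof(long) - 1) / sizeof(long) + ALLOC_HDR_LONGS;
}

int main(void)
{
	printf("is_pow2(16)=%d is_pow2(24)=%d\n", is_pow2(16), is_pow2(24));
	printf("13-byte request -> %zu longs (sizeof(long)=%zu on this host)\n",
	       alloc_longs(13), sizeof(long));
	return 0;
}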
*/ + next = next_hdr(region, hdr); + if (!next || !next->free || hdr->num_longs + next->num_longs < len) + return false; + + /* OK, it's free and big enough, absorb it. */ + f = (struct free_hdr *)next; + list_del_from(®ion->free_list, &f->list); + hdr->num_longs += next->num_longs; + hdr->location = location; + + /* Update next prev_free */ + next = next_hdr(region, &f->hdr); + if (next) { + assert(next->prev_free); + next->prev_free = false; + } + + /* Clear tailer for debugging */ + *tailer(f) = 0; + + /* Now we might have *too* much. */ + discard_excess(region, hdr, len, location, true); + return true; +} + +bool mem_check(const struct mem_region *region) +{ + size_t frees = 0; + struct alloc_hdr *hdr, *prev_free = NULL; + struct free_hdr *f; + + /* Check it's sanely aligned. */ + if (region->start % sizeof(long)) { + prerror("Region '%s' not sanely aligned (%llx)\n", + region->name, (unsigned long long)region->start); + return false; + } + if ((long)region->len % sizeof(long)) { + prerror("Region '%s' not sane length (%llu)\n", + region->name, (unsigned long long)region->len); + return false; + } + + /* Not ours to play with, or empty? Don't do anything. */ + if (!(region->type == REGION_MEMORY || + region->type == REGION_SKIBOOT_HEAP) || + region->free_list.n.next == NULL) + return true; + + /* Walk linearly. */ + for (hdr = region_start(region); hdr; hdr = next_hdr(region, hdr)) { + if (hdr->num_longs < ALLOC_MIN_LONGS) { + prerror("Region '%s' %s %p (%s) size %zu\n", + region->name, hdr->free ? "free" : "alloc", + hdr, hdr_location(hdr), + hdr->num_longs * sizeof(long)); + return false; + } + if ((unsigned long)hdr + hdr->num_longs * sizeof(long) > + region->start + region->len) { + prerror("Region '%s' %s %p (%s) oversize %zu\n", + region->name, hdr->free ? "free" : "alloc", + hdr, hdr_location(hdr), + hdr->num_longs * sizeof(long)); + return false; + } + if (hdr->free) { + if (hdr->prev_free || prev_free) { + prerror("Region '%s' free %p (%s) has prev_free" + " %p (%s) %sset?\n", + region->name, hdr, hdr_location(hdr), + prev_free, + prev_free ? hdr_location(prev_free) + : "NULL", + hdr->prev_free ? "" : "un"); + return false; + } + prev_free = hdr; + frees ^= (unsigned long)hdr - region->start; + } else { + if (hdr->prev_free != (bool)prev_free) { + prerror("Region '%s' alloc %p (%s) has" + " prev_free %p %sset?\n", + region->name, hdr, hdr_location(hdr), + prev_free, hdr->prev_free ? "" : "un"); + return false; + } + prev_free = NULL; + } + } + + /* Now walk free list. */ + list_for_each(®ion->free_list, f, list) + frees ^= (unsigned long)f - region->start; + + if (frees) { + prerror("Region '%s' free list and walk do not match!\n", + region->name); + return false; + } + return true; +} + +bool mem_check_all(void) +{ + struct mem_region *r; + + list_for_each(®ions, r, list) { + if (!mem_check(r)) + return false; + } + + return true; +} + +static struct mem_region *new_region(const char *name, + uint64_t start, uint64_t len, + struct dt_node *node, + enum mem_region_type type) +{ + struct mem_region *region; + + region = malloc(sizeof(*region)); + if (!region) + return NULL; + + region->name = name; + region->start = start; + region->len = len; + region->node = node; + region->type = type; + region->free_list.n.next = NULL; + init_lock(®ion->free_list_lock); + + return region; +} + +/* We always split regions, so we only have to replace one. 
*/ +static struct mem_region *split_region(struct mem_region *head, + uint64_t split_at, + enum mem_region_type type) +{ + struct mem_region *tail; + uint64_t end = head->start + head->len; + + tail = new_region(head->name, split_at, end - split_at, + head->node, type); + /* Original region becomes head. */ + if (tail) + head->len -= tail->len; + + return tail; +} + +static bool intersects(const struct mem_region *region, uint64_t addr) +{ + return addr > region->start && + addr < region->start + region->len; +} + +static bool maybe_split(struct mem_region *r, uint64_t split_at) +{ + struct mem_region *tail; + + if (!intersects(r, split_at)) + return true; + + tail = split_region(r, split_at, r->type); + if (!tail) + return false; + + /* Tail add is important: we may need to split again! */ + list_add_after(®ions, &tail->list, &r->list); + return true; +} + +static bool overlaps(const struct mem_region *r1, const struct mem_region *r2) +{ + return (r1->start + r1->len > r2->start + && r1->start < r2->start + r2->len); +} + +static bool contains(const struct mem_region *r1, const struct mem_region *r2) +{ + u64 r1_end = r1->start + r1->len; + u64 r2_end = r2->start + r2->len; + + return (r1->start <= r2->start && r2_end <= r1_end); +} + +static struct mem_region *get_overlap(const struct mem_region *region) +{ + struct mem_region *i; + + list_for_each(®ions, i, list) { + if (overlaps(region, i)) + return i; + } + return NULL; +} + +static void add_region_to_regions(struct mem_region *region) +{ + struct mem_region *r; + + list_for_each(®ions, r, list) { + if (r->start < region->start) + continue; + + list_add_before(®ions, ®ion->list, &r->list); + return; + } + list_add_tail(®ions, ®ion->list); +} + +static bool add_region(struct mem_region *region) +{ + struct mem_region *r; + + if (mem_regions_finalised) { + prerror("MEM: add_region(%s@0x%"PRIx64") called after finalise!\n", + region->name, region->start); + return false; + } + + /* First split any regions which intersect. */ + list_for_each(®ions, r, list) { + /* + * The new region should be fully contained by an existing one. + * If it's not then we have a problem where reservations + * partially overlap which is probably broken. + * + * NB: There *might* be situations where this is legitimate, + * but the region handling does not currently support this. + */ + if (overlaps(r, region) && !contains(r, region)) { + prerror("MEM: Partial overlap detected between regions:\n"); + prerror("MEM: %s [0x%"PRIx64"-0x%"PRIx64"] (new)\n", + region->name, region->start, + region->start + region->len); + prerror("MEM: %s [0x%"PRIx64"-0x%"PRIx64"]\n", + r->name, r->start, r->start + r->len); + return false; + } + + if (!maybe_split(r, region->start) || + !maybe_split(r, region->start + region->len)) + return false; + } + + /* Now we have only whole overlaps, if any. */ + while ((r = get_overlap(region)) != NULL) { + assert(r->start == region->start); + assert(r->len == region->len); + list_del_from(®ions, &r->list); + free(r); + } + + /* Finally, add in our own region. 
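add_region() above only tolerates a new reservation that is fully contained in an existing region; a partial overlap is rejected as broken. The two interval predicates it relies on are easy to check in isolation, treating regions as half-open ranges (the values below are hypothetical):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct range { uint64_t start, len; };

/* Same half-open interval tests as overlaps()/contains() above. */
static bool range_overlaps(struct range a, struct range b)
{
	return a.start + a.len > b.start && a.start < b.start + b.len;
}

static bool range_contains(struct range outer, struct range inner)
{
	return outer.start <= inner.start &&
	       inner.start + inner.len <= outer.start + outer.len;
}

int main(void)
{
	struct range mem = { 0x1000, 0x1000 };	/* hypothetical memory region */
	struct range res = { 0x1800, 0x100 };	/* reservation fully inside it */
	struct range bad = { 0x1f00, 0x200 };	/* partial overlap: rejected */

	printf("res: overlaps=%d contains=%d\n",
	       range_overlaps(mem, res), range_contains(mem, res));
	printf("bad: overlaps=%d contains=%d\n",
	       range_overlaps(mem, bad), range_contains(mem, bad));
	return 0;
}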
*/ + add_region_to_regions(region); + return true; +} + +static void mem_reserve(enum mem_region_type type, const char *name, + uint64_t start, uint64_t len) +{ + struct mem_region *region; + bool added = true; + + lock(&mem_region_lock); + region = new_region(name, start, len, NULL, type); + assert(region); + + if (!mem_region_init_done) + list_add(&early_reserves, ®ion->list); + else + added = add_region(region); + + assert(added); + unlock(&mem_region_lock); +} + +void mem_reserve_fw(const char *name, uint64_t start, uint64_t len) +{ + mem_reserve(REGION_FW_RESERVED, name, start, len); +} + +void mem_reserve_hwbuf(const char *name, uint64_t start, uint64_t len) +{ + mem_reserve(REGION_RESERVED, name, start, len); +} + +static bool matches_chip_id(const __be32 ids[], size_t num, u32 chip_id) +{ + size_t i; + + for (i = 0; i < num; i++) + if (be32_to_cpu(ids[i]) == chip_id) + return true; + + return false; +} + +void *__local_alloc(unsigned int chip_id, size_t size, size_t align, + const char *location) +{ + struct mem_region *region; + void *p = NULL; + bool use_local = true; + + lock(&mem_region_lock); + +restart: + list_for_each(®ions, region, list) { + const struct dt_property *prop; + const __be32 *ids; + + if (!(region->type == REGION_SKIBOOT_HEAP || + region->type == REGION_MEMORY)) + continue; + + /* Don't allocate from normal heap. */ + if (region == &skiboot_heap) + continue; + + /* First pass, only match node local regions */ + if (use_local) { + if (!region->node) + continue; + prop = dt_find_property(region->node, "ibm,chip-id"); + ids = (const __be32 *)prop->prop; + if (!matches_chip_id(ids, prop->len/sizeof(u32), + chip_id)) + continue; + } + + /* Second pass, match anything */ + lock(®ion->free_list_lock); + p = mem_alloc(region, size, align, location); + unlock(®ion->free_list_lock); + if (p) + break; + } + + /* + * If we can't allocate the memory block from the expected + * node, we bail to any one that can accommodate our request. + */ + if (!p && use_local) { + use_local = false; + goto restart; + } + + unlock(&mem_region_lock); + + return p; +} + +struct mem_region *find_mem_region(const char *name) +{ + struct mem_region *region; + + list_for_each(®ions, region, list) { + if (streq(region->name, name)) + return region; + } + return NULL; +} + +bool mem_range_is_reserved(uint64_t start, uint64_t size) +{ + uint64_t end = start + size; + struct mem_region *region; + struct list_head *search; + + /* We may have the range covered by a number of regions, which could + * appear in any order. So, we look for a region that covers the + * start address, and bump start up to the end of that region. + * + * We repeat until we've either bumped past the end of the range, + * or we didn't find a matching region. + * + * This has a worst-case of O(n^2), but n is well bounded by the + * small number of reservations. + */ + + if (!mem_region_init_done) + search = &early_reserves; + else + search = ®ions; + + for (;;) { + bool found = false; + + list_for_each(search, region, list) { + if (!region_is_reserved(region)) + continue; + + /* does this region overlap the start address, and + * have a non-zero size? 
*/ + if (region->start <= start && + region->start + region->len > start && + region->len) { + start = region->start + region->len; + found = true; + } + } + + /* 'end' is the first byte outside of the range */ + if (start >= end) + return true; + + if (!found) + break; + } + + return false; +} + +static void mem_region_parse_reserved_properties(void) +{ + const struct dt_property *names, *ranges; + struct mem_region *region; + + prlog(PR_DEBUG, "MEM: parsing reserved memory from " + "reserved-names/-ranges properties\n"); + + names = dt_find_property(dt_root, "reserved-names"); + ranges = dt_find_property(dt_root, "reserved-ranges"); + if (names && ranges) { + const uint64_t *range; + int n, len; + + range = (const void *)ranges->prop; + + for (n = 0; n < names->len; n += len, range += 2) { + char *name; + + len = strlen(names->prop + n) + 1; + name = strdup(names->prop + n); + + region = new_region(name, + dt_get_number(range, 2), + dt_get_number(range + 1, 2), + NULL, REGION_FW_RESERVED); + if (!add_region(region)) { + prerror("Couldn't add mem_region %s\n", name); + abort(); + } + } + } else if (names || ranges) { + prerror("Invalid properties: reserved-names=%p " + "with reserved-ranges=%p\n", + names, ranges); + abort(); + } else { + return; + } +} + +static bool mem_region_parse_reserved_nodes(const char *path) +{ + struct dt_node *parent, *node; + + parent = dt_find_by_path(dt_root, path); + if (!parent) + return false; + + prlog(PR_INFO, "MEM: parsing reserved memory from node %s\n", path); + + dt_for_each_child(parent, node) { + const struct dt_property *reg; + struct mem_region *region; + int type; + + reg = dt_find_property(node, "reg"); + if (!reg) { + char *nodepath = dt_get_path(node); + prerror("node %s has no reg property, ignoring\n", + nodepath); + free(nodepath); + continue; + } + + if (dt_has_node_property(node, "no-map", NULL)) + type = REGION_RESERVED; + else + type = REGION_FW_RESERVED; + + region = new_region(strdup(node->name), + dt_get_number(reg->prop, 2), + dt_get_number(reg->prop + sizeof(u64), 2), + node, type); + if (!add_region(region)) { + char *nodepath = dt_get_path(node); + prerror("node %s failed to add_region()\n", nodepath); + free(nodepath); + } + } + + return true; +} + +/* Trawl through device tree, create memory regions from nodes. */ +void mem_region_init(void) +{ + struct mem_region *region, *next; + struct dt_node *i; + bool rc; + + /* + * Add associativity properties outside of the lock + * to avoid recursive locking caused by allocations + * done by add_chip_dev_associativity() + */ + dt_for_each_node(dt_root, i) { + if (!dt_has_node_property(i, "device_type", "memory") && + !dt_has_node_property(i, "compatible", "pmem-region")) + continue; + + /* Add associativity properties */ + add_chip_dev_associativity(i); + } + + /* Add each memory node. 
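mem_range_is_reserved() above keeps advancing 'start' to the end of whichever reservation covers it, in whatever order the reservations appear, until either the whole range has been covered or nothing covers the current point. The same walk over a plain array of ranges (hypothetical data) looks like this:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct range { uint64_t start, len; };

/* Reservations may appear in any order and may abut one another. */
static const struct range reserved[] = {
	{ 0x2000, 0x1000 },
	{ 0x1000, 0x1000 },
};

static bool range_is_reserved(uint64_t start, uint64_t size)
{
	uint64_t end = start + size;
	unsigned int i;

	for (;;) {
		bool found = false;

		for (i = 0; i < sizeof(reserved) / sizeof(reserved[0]); i++) {
			const struct range *r = &reserved[i];

			/* covers the current start and has non-zero size? */
			if (r->start <= start && r->start + r->len > start &&
			    r->len) {
				start = r->start + r->len;
				found = true;
			}
		}
		if (start >= end)
			return true;
		if (!found)
			return false;
	}
}

int main(void)
{
	printf("[0x1800,0x2800) reserved? %d\n", range_is_reserved(0x1800, 0x1000));
	printf("[0x1800,0x3800) reserved? %d\n", range_is_reserved(0x1800, 0x2000));
	return 0;
}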
*/ + dt_for_each_node(dt_root, i) { + uint64_t start, len; + char *rname; +#define NODE_REGION_PREFIX "ibm,firmware-allocs-" + + if (!dt_has_node_property(i, "device_type", "memory")) + continue; + rname = zalloc(strlen(i->name) + strlen(NODE_REGION_PREFIX) + 1); + assert(rname); + strcat(rname, NODE_REGION_PREFIX); + strcat(rname, i->name); + start = dt_get_address(i, 0, &len); + lock(&mem_region_lock); + region = new_region(rname, start, len, i, REGION_MEMORY); + if (!region) { + prerror("MEM: Could not add mem region %s!\n", i->name); + abort(); + } + add_region_to_regions(region); + if ((start + len) > top_of_ram) + top_of_ram = start + len; + unlock(&mem_region_lock); + } + + /* + * This is called after we know the maximum PIR of all CPUs, + * so we can dynamically set the stack length. + */ + skiboot_cpu_stacks.len = (cpu_max_pir + 1) * STACK_SIZE; + + lock(&mem_region_lock); + + /* Now carve out our own reserved areas. */ + if (!add_region(&skiboot_os_reserve) || + !add_region(&skiboot_code_and_text) || + !add_region(&skiboot_heap) || + !add_region(&skiboot_after_heap) || + !add_region(&skiboot_cpu_stacks)) { + prerror("Out of memory adding skiboot reserved areas\n"); + abort(); + } + + if (chip_quirk(QUIRK_MAMBO_CALLOUTS)) { + if (!add_region(&skiboot_mambo_kernel) || + !add_region(&skiboot_mambo_initramfs)) { + prerror("Out of memory adding mambo payload\n"); + abort(); + } + } + + /* Add reserved reanges from HDAT */ + list_for_each_safe(&early_reserves, region, next, list) { + bool added; + + list_del(®ion->list); + added = add_region(region); + assert(added); + } + + /* Add reserved ranges from the DT */ + rc = mem_region_parse_reserved_nodes("/reserved-memory"); + if (!rc) + rc = mem_region_parse_reserved_nodes( + "/ibm,hostboot/reserved-memory"); + if (!rc) + mem_region_parse_reserved_properties(); + + mem_region_init_done = true; + unlock(&mem_region_lock); +} + +static uint64_t allocated_length(const struct mem_region *r) +{ + struct free_hdr *f, *last = NULL; + + /* No allocations at all? */ + if (r->free_list.n.next == NULL) + return 0; + + /* Find last free block. */ + list_for_each(&r->free_list, f, list) + if (f > last) + last = f; + + /* No free blocks? */ + if (!last) + return r->len; + + /* Last free block isn't at end? */ + if (next_hdr(r, &last->hdr)) + return r->len; + return (unsigned long)last - r->start; +} + +/* Separate out allocated sections into their own region. */ +void mem_region_release_unused(void) +{ + struct mem_region *r; + + lock(&mem_region_lock); + assert(!mem_regions_finalised); + + prlog(PR_INFO, "Releasing unused memory:\n"); + list_for_each(®ions, r, list) { + uint64_t used_len; + + /* If it's not allocatable, ignore it. */ + if (!(r->type == REGION_SKIBOOT_HEAP || + r->type == REGION_MEMORY)) + continue; + + used_len = allocated_length(r); + + prlog(PR_INFO, " %s: %llu/%llu used\n", + r->name, (long long)used_len, (long long)r->len); + + /* We keep the skiboot heap. */ + if (r == &skiboot_heap) + continue; + + /* Nothing used? Whole thing is for Linux. */ + if (used_len == 0) + r->type = REGION_OS; + /* Partially used? Split region. */ + else if (used_len != r->len) { + struct mem_region *for_linux; + struct free_hdr *last = region_start(r) + used_len; + + /* Remove the final free block. 
*/ + list_del_from(&r->free_list, &last->list); + + for_linux = split_region(r, r->start + used_len, + REGION_OS); + if (!for_linux) { + prerror("OOM splitting mem node %s for linux\n", + r->name); + abort(); + } + list_add(®ions, &for_linux->list); + } + } + unlock(&mem_region_lock); +} + +static void mem_clear_range(uint64_t s, uint64_t e) +{ + uint64_t res_start, res_end; + + /* Skip exception vectors */ + if (s < EXCEPTION_VECTORS_END) + s = EXCEPTION_VECTORS_END; + + /* Skip kernel preload area */ + res_start = (uint64_t)KERNEL_LOAD_BASE; + res_end = res_start + KERNEL_LOAD_SIZE; + + if (s >= res_start && s < res_end) + s = res_end; + if (e > res_start && e <= res_end) + e = res_start; + if (e <= s) + return; + if (s < res_start && e > res_end) { + mem_clear_range(s, res_start); + mem_clear_range(res_end, e); + return; + } + + /* Skip initramfs preload area */ + res_start = (uint64_t)INITRAMFS_LOAD_BASE; + res_end = res_start + INITRAMFS_LOAD_SIZE; + + if (s >= res_start && s < res_end) + s = res_end; + if (e > res_start && e <= res_end) + e = res_start; + if (e <= s) + return; + if (s < res_start && e > res_end) { + mem_clear_range(s, res_start); + mem_clear_range(res_end, e); + return; + } + + prlog(PR_DEBUG, "Clearing region %llx-%llx\n", + (long long)s, (long long)e); + memset((void *)s, 0, e - s); +} + +struct mem_region_clear_job_args { + char *job_name; + uint64_t s,e; +}; + +static void mem_region_clear_job(void *data) +{ + struct mem_region_clear_job_args *arg = (struct mem_region_clear_job_args*)data; + mem_clear_range(arg->s, arg->e); +} + +#define MEM_REGION_CLEAR_JOB_SIZE (16ULL*(1<<30)) + +static struct cpu_job **mem_clear_jobs; +static struct mem_region_clear_job_args *mem_clear_job_args; +static int mem_clear_njobs = 0; + +void start_mem_region_clear_unused(void) +{ + struct mem_region *r; + uint64_t s,l; + uint64_t total = 0; + uint32_t chip_id; + char *path; + int i; + struct cpu_job **jobs; + struct mem_region_clear_job_args *job_args; + + lock(&mem_region_lock); + assert(mem_regions_finalised); + + mem_clear_njobs = 0; + + list_for_each(®ions, r, list) { + if (!(r->type == REGION_OS)) + continue; + mem_clear_njobs++; + /* One job per 16GB */ + mem_clear_njobs += r->len / MEM_REGION_CLEAR_JOB_SIZE; + } + + jobs = malloc(mem_clear_njobs * sizeof(struct cpu_job*)); + job_args = malloc(mem_clear_njobs * sizeof(struct mem_region_clear_job_args)); + mem_clear_jobs = jobs; + mem_clear_job_args = job_args; + + prlog(PR_NOTICE, "Clearing unused memory:\n"); + i = 0; + list_for_each(®ions, r, list) { + /* If it's not unused, ignore it. 
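mem_clear_range() above has to skip the exception vectors and the kernel/initramfs preload windows: a range that starts or ends inside such a window is clamped, and a range that straddles one is split into two recursive calls. A standalone sketch of that carve-out for a single hypothetical window:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical reserved window that must not be cleared. */
#define RES_START	0x2000ull
#define RES_END		0x3000ull

static void clear_range(uint64_t s, uint64_t e)
{
	if (s >= RES_START && s < RES_END)	/* starts inside: clamp up */
		s = RES_END;
	if (e > RES_START && e <= RES_END)	/* ends inside: clamp down */
		e = RES_START;
	if (e <= s)
		return;
	if (s < RES_START && e > RES_END) {	/* straddles: split */
		clear_range(s, RES_START);
		clear_range(RES_END, e);
		return;
	}
	printf("would clear [0x%" PRIx64 ", 0x%" PRIx64 ")\n", s, e);
}

int main(void)
{
	clear_range(0x1000, 0x4000);	/* straddles the window: two pieces */
	clear_range(0x2800, 0x2900);	/* entirely inside: nothing to do */
	return 0;
}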
*/ + if (!(r->type == REGION_OS)) + continue; + + assert(r != &skiboot_heap); + + s = r->start; + l = r->len; + while(l > MEM_REGION_CLEAR_JOB_SIZE) { + job_args[i].s = s+l - MEM_REGION_CLEAR_JOB_SIZE; + job_args[i].e = s+l; + l-=MEM_REGION_CLEAR_JOB_SIZE; + job_args[i].job_name = malloc(sizeof(char)*100); + total+=MEM_REGION_CLEAR_JOB_SIZE; + chip_id = __dt_get_chip_id(r->node); + if (chip_id == -1) + chip_id = 0; + path = dt_get_path(r->node); + snprintf(job_args[i].job_name, 100, + "clear %s, %s 0x%"PRIx64" len: %"PRIx64" on %d", + r->name, path, + job_args[i].s, + (job_args[i].e - job_args[i].s), + chip_id); + free(path); + jobs[i] = cpu_queue_job_on_node(chip_id, + job_args[i].job_name, + mem_region_clear_job, + &job_args[i]); + if (!jobs[i]) + jobs[i] = cpu_queue_job(NULL, + job_args[i].job_name, + mem_region_clear_job, + &job_args[i]); + assert(jobs[i]); + i++; + } + job_args[i].s = s; + job_args[i].e = s+l; + job_args[i].job_name = malloc(sizeof(char)*100); + total+=l; + chip_id = __dt_get_chip_id(r->node); + if (chip_id == -1) + chip_id = 0; + path = dt_get_path(r->node); + snprintf(job_args[i].job_name,100, + "clear %s, %s 0x%"PRIx64" len: 0x%"PRIx64" on %d", + r->name, path, + job_args[i].s, + (job_args[i].e - job_args[i].s), + chip_id); + free(path); + jobs[i] = cpu_queue_job_on_node(chip_id, + job_args[i].job_name, + mem_region_clear_job, + &job_args[i]); + if (!jobs[i]) + jobs[i] = cpu_queue_job(NULL, + job_args[i].job_name, + mem_region_clear_job, + &job_args[i]); + assert(jobs[i]); + i++; + } + unlock(&mem_region_lock); + cpu_process_local_jobs(); +} + +void wait_mem_region_clear_unused(void) +{ + uint64_t l; + uint64_t total = 0; + int i; + + for(i=0; i < mem_clear_njobs; i++) { + total += (mem_clear_job_args[i].e - mem_clear_job_args[i].s); + } + + l = 0; + for(i=0; i < mem_clear_njobs; i++) { + cpu_wait_job(mem_clear_jobs[i], true); + l += (mem_clear_job_args[i].e - mem_clear_job_args[i].s); + printf("Clearing memory... %"PRIu64"/%"PRIu64"GB done\n", + l>>30, total>>30); + free(mem_clear_job_args[i].job_name); + } + free(mem_clear_jobs); + free(mem_clear_job_args); +} + +static void mem_region_add_dt_reserved_node(struct dt_node *parent, + struct mem_region *region) +{ + char *name, *p; + + /* If a reserved region was established before skiboot, it may be + * referenced by a device-tree node with extra data. In that case, + * copy the node to /reserved-memory/, unless it's already there. + * + * We update region->node to the new copy here, as the prd code may + * update regions' device-tree nodes, and we want those updates to + * apply to the nodes in /reserved-memory/. + */ + if (region->type == REGION_FW_RESERVED && region->node) { + if (region->node->parent != parent) + region->node = dt_copy(region->node, parent); + return; + } + + name = strdup(region->name); + assert(name); + + /* remove any cell addresses in the region name; we have our own cell + * addresses here */ + p = strchr(name, '@'); + if (p) + *p = '\0'; + + region->node = dt_new_addr(parent, name, region->start); + assert(region->node); + dt_add_property_u64s(region->node, "reg", region->start, region->len); + + /* + * This memory is used by hardware and may need special handling. Ask + * the host kernel not to map it by default. 
+ */ + if (region->type == REGION_RESERVED) + dt_add_property(region->node, "no-map", NULL, 0); + + free(name); +} + +void mem_region_add_dt_reserved(void) +{ + int names_len, ranges_len, len; + const struct dt_property *prop; + struct mem_region *region; + void *names, *ranges; + struct dt_node *node; + fdt64_t *range; + char *name; + + names_len = 0; + ranges_len = 0; + + /* Finalise the region list, so we know that the regions list won't be + * altered after this point. The regions' free lists may change after + * we drop the lock, but we don't access those. */ + lock(&mem_region_lock); + mem_regions_finalised = true; + + /* establish top-level reservation node */ + node = dt_find_by_path(dt_root, "reserved-memory"); + if (!node) { + node = dt_new(dt_root, "reserved-memory"); + dt_add_property_cells(node, "#address-cells", 2); + dt_add_property_cells(node, "#size-cells", 2); + dt_add_property(node, "ranges", NULL, 0); + } + + prlog(PR_INFO, "Reserved regions:\n"); + + /* First pass, create /reserved-memory/ nodes for each reservation, + * and calculate the length for the /reserved-names and + * /reserved-ranges properties */ + list_for_each(®ions, region, list) { + if (!region_is_reservable(region)) + continue; + + prlog(PR_INFO, " 0x%012llx..%012llx : %s\n", + (long long)region->start, + (long long)(region->start + region->len - 1), + region->name); + + mem_region_add_dt_reserved_node(node, region); + + /* calculate the size of the properties populated later */ + names_len += strlen(region->node->name) + 1; + ranges_len += 2 * sizeof(uint64_t); + } + + name = names = malloc(names_len); + range = ranges = malloc(ranges_len); + + /* Second pass: populate the old-style reserved-names and + * reserved-regions arrays based on the node data */ + list_for_each(®ions, region, list) { + if (!region_is_reservable(region)) + continue; + + len = strlen(region->node->name) + 1; + memcpy(name, region->node->name, len); + name += len; + + range[0] = cpu_to_fdt64(region->start); + range[1] = cpu_to_fdt64(region->len); + range += 2; + } + unlock(&mem_region_lock); + + prop = dt_find_property(dt_root, "reserved-names"); + if (prop) + dt_del_property(dt_root, (struct dt_property *)prop); + + prop = dt_find_property(dt_root, "reserved-ranges"); + if (prop) + dt_del_property(dt_root, (struct dt_property *)prop); + + dt_add_property(dt_root, "reserved-names", names, names_len); + dt_add_property(dt_root, "reserved-ranges", ranges, ranges_len); + + free(names); + free(ranges); +} + +struct mem_region *mem_region_next(struct mem_region *region) +{ + struct list_node *node; + + assert(lock_held_by_me(&mem_region_lock)); + + node = region ? ®ion->list : ®ions.n; + + if (node->next == ®ions.n) + return NULL; + + return list_entry(node->next, struct mem_region, list); +} diff --git a/roms/skiboot/core/nvram-format.c b/roms/skiboot/core/nvram-format.c new file mode 100644 index 000000000..8aa5abf22 --- /dev/null +++ b/roms/skiboot/core/nvram-format.c @@ -0,0 +1,331 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* + * NVRAM Format as specified in PAPR + * + * Copyright 2013-2019 IBM Corp. 
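The legacy reserved-names / reserved-ranges properties assembled in mem_region_add_dt_reserved() above are simply a blob of back-to-back NUL-terminated names plus an array of (start, length) pairs. A small standalone sketch of that packing, with hypothetical regions and host-endian values in place of fdt64 cells:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct region { const char *name; uint64_t start, len; };

int main(void)
{
	/* Hypothetical reservations. */
	const struct region regs[] = {
		{ "ibm,firmware-heap", 0x30400000ull, 0x00c00000ull },
		{ "ibm,firmware-code", 0x30000000ull, 0x00400000ull },
	};
	const unsigned int n = sizeof(regs) / sizeof(regs[0]);
	size_t names_len = 0;
	unsigned int i;
	uint64_t *ranges;
	char *names, *p;

	for (i = 0; i < n; i++)
		names_len += strlen(regs[i].name) + 1;

	names = malloc(names_len);
	ranges = malloc(n * 2 * sizeof(uint64_t));
	if (!names || !ranges)
		return 1;

	p = names;
	for (i = 0; i < n; i++) {
		size_t l = strlen(regs[i].name) + 1;

		memcpy(p, regs[i].name, l);	/* names packed back to back */
		p += l;
		ranges[2 * i] = regs[i].start;	/* (start, len) pairs */
		ranges[2 * i + 1] = regs[i].len;
	}

	printf("reserved-names: %zu bytes, reserved-ranges: %zu bytes\n",
	       names_len, (size_t)(n * 2 * sizeof(uint64_t)));
	free(names);
	free(ranges);
	return 0;
}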
+ */ + +#include +#include + +struct chrp_nvram_hdr { + uint8_t sig; + uint8_t cksum; + be16 len; + char name[12]; +}; + +static struct chrp_nvram_hdr *skiboot_part_hdr; + +#define NVRAM_SIG_FW_PRIV 0x51 +#define NVRAM_SIG_SYSTEM 0x70 +#define NVRAM_SIG_FREE 0x7f + +#define NVRAM_NAME_COMMON "common" +#define NVRAM_NAME_FW_PRIV "ibm,skiboot" +#define NVRAM_NAME_FREE "wwwwwwwwwwww" + +/* 64k should be enough, famous last words... */ +#define NVRAM_SIZE_COMMON 0x10000 + +/* 4k should be enough, famous last words... */ +#define NVRAM_SIZE_FW_PRIV 0x1000 + +static uint8_t chrp_nv_cksum(struct chrp_nvram_hdr *hdr) +{ + struct chrp_nvram_hdr h_copy = *hdr; + uint8_t b_data, i_sum, c_sum; + uint8_t *p = (uint8_t *)&h_copy; + unsigned int nbytes = sizeof(h_copy); + + h_copy.cksum = 0; + for (c_sum = 0; nbytes; nbytes--) { + b_data = *(p++); + i_sum = c_sum + b_data; + if (i_sum < c_sum) + i_sum++; + c_sum = i_sum; + } + return c_sum; +} + +int nvram_format(void *nvram_image, uint32_t nvram_size) +{ + struct chrp_nvram_hdr *h; + unsigned int offset = 0; + + prerror("NVRAM: Re-initializing (size: 0x%08x)\n", nvram_size); + memset(nvram_image, 0, nvram_size); + + /* Create private partition */ + if (nvram_size - offset < NVRAM_SIZE_FW_PRIV) + return -1; + h = nvram_image + offset; + h->sig = NVRAM_SIG_FW_PRIV; + h->len = cpu_to_be16(NVRAM_SIZE_FW_PRIV >> 4); + strcpy(h->name, NVRAM_NAME_FW_PRIV); + h->cksum = chrp_nv_cksum(h); + prlog(PR_DEBUG, "NVRAM: Created '%s' partition at 0x%08x" + " for size 0x%08x with cksum 0x%02x\n", + NVRAM_NAME_FW_PRIV, offset, + be16_to_cpu(h->len), h->cksum); + offset += NVRAM_SIZE_FW_PRIV; + + /* Create common partition */ + if (nvram_size - offset < NVRAM_SIZE_COMMON) + return -1; + h = nvram_image + offset; + h->sig = NVRAM_SIG_SYSTEM; + h->len = cpu_to_be16(NVRAM_SIZE_COMMON >> 4); + strcpy(h->name, NVRAM_NAME_COMMON); + h->cksum = chrp_nv_cksum(h); + prlog(PR_DEBUG, "NVRAM: Created '%s' partition at 0x%08x" + " for size 0x%08x with cksum 0x%02x\n", + NVRAM_NAME_COMMON, offset, + be16_to_cpu(h->len), h->cksum); + offset += NVRAM_SIZE_COMMON; + + /* Create free space partition */ + if (nvram_size - offset < sizeof(struct chrp_nvram_hdr)) + return -1; + h = nvram_image + offset; + h->sig = NVRAM_SIG_FREE; + h->len = cpu_to_be16((nvram_size - offset) >> 4); + /* We have the full 12 bytes here */ + memcpy(h->name, NVRAM_NAME_FREE, 12); + h->cksum = chrp_nv_cksum(h); + prlog(PR_DEBUG, "NVRAM: Created '%s' partition at 0x%08x" + " for size 0x%08x with cksum 0x%02x\n", + NVRAM_NAME_FREE, offset, be16_to_cpu(h->len), h->cksum); + return 0; +} + +/* + * Check that the nvram partition layout is sane and that it + * contains our required partitions. 
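chrp_nv_cksum() above is the usual CHRP header checksum: sum the header bytes with the cksum field zeroed, folding the carry back in after every add (end-around carry). A standalone version over an equivalent plain 16-byte struct (field endianness ignored here, since only the byte values matter for the sum):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Same 16-byte shape as the CHRP partition header. */
struct nv_hdr {
	uint8_t  sig;
	uint8_t  cksum;
	uint16_t len;		/* big-endian in real NVRAM */
	char     name[12];
};

static uint8_t nv_cksum(const struct nv_hdr *hdr)
{
	struct nv_hdr copy = *hdr;
	const uint8_t *p = (const uint8_t *)&copy;
	unsigned int i;
	uint8_t sum = 0;

	copy.cksum = 0;			/* checksum field excluded from the sum */
	for (i = 0; i < sizeof(copy); i++) {
		uint8_t next = sum + p[i];

		if (next < sum)		/* wrapped: end-around carry */
			next++;
		sum = next;
	}
	return sum;
}

int main(void)
{
	struct nv_hdr h;

	memset(&h, 0, sizeof(h));
	h.sig = 0x70;
	h.len = 0x1000;
	strcpy(h.name, "common");
	h.cksum = nv_cksum(&h);
	printf("checksum: 0x%02x, verifies: %d\n",
	       h.cksum, nv_cksum(&h) == h.cksum);
	return 0;
}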
If not, we re-format the + * lot of it + */ +int nvram_check(void *nvram_image, const uint32_t nvram_size) +{ + unsigned int offset = 0; + bool found_common = false; + + skiboot_part_hdr = NULL; + + while (offset + sizeof(struct chrp_nvram_hdr) < nvram_size) { + struct chrp_nvram_hdr *h = nvram_image + offset; + + if (chrp_nv_cksum(h) != h->cksum) { + prerror("NVRAM: Partition at offset 0x%x" + " has bad checksum: 0x%02x vs 0x%02x\n", + offset, h->cksum, chrp_nv_cksum(h)); + goto failed; + } + if (be16_to_cpu(h->len) < 1) { + prerror("NVRAM: Partition at offset 0x%x" + " has incorrect 0 length\n", offset); + goto failed; + } + + if (h->sig == NVRAM_SIG_SYSTEM && + strcmp(h->name, NVRAM_NAME_COMMON) == 0) + found_common = true; + + if (h->sig == NVRAM_SIG_FW_PRIV && + strcmp(h->name, NVRAM_NAME_FW_PRIV) == 0) + skiboot_part_hdr = h; + + offset += be16_to_cpu(h->len) << 4; + if (offset > nvram_size) { + prerror("NVRAM: Partition at offset 0x%x" + " extends beyond end of nvram !\n", offset); + goto failed; + } + } + if (!found_common) { + prlog_once(PR_ERR, "NVRAM: Common partition not found !\n"); + goto failed; + } + + if (!skiboot_part_hdr) { + prlog_once(PR_ERR, "NVRAM: Skiboot private partition not found !\n"); + goto failed; + } else { + /* + * The OF NVRAM format requires config strings to be NUL + * terminated and unused memory to be set to zero. Well behaved + * software should ensure this is done for us, but we should + * always check. + */ + const char *last_byte = (const char *) skiboot_part_hdr + + be16_to_cpu(skiboot_part_hdr->len) * 16 - 1; + + if (*last_byte != 0) { + prerror("NVRAM: Skiboot private partition is not NUL terminated"); + goto failed; + } + } + + prlog(PR_INFO, "NVRAM: Layout appears sane\n"); + assert(skiboot_part_hdr); + return 0; + failed: + return -1; +} + +static const char *find_next_key(const char *start, const char *end) +{ + /* + * Unused parts of the partition are set to NUL. If we hit two + * NULs in a row then we assume that we have hit the end of the + * partition. + */ + if (*start == 0) + return NULL; + + while (start < end) { + if (*start == 0) + return start + 1; + + start++; + } + + return NULL; +} + +static void nvram_dangerous(const char *key) +{ + prlog(PR_ERR, " ___________________________________________________________\n"); + prlog(PR_ERR, "< Dangerous NVRAM option: %s\n", key); + prlog(PR_ERR, " -----------------------------------------------------------\n"); + prlog(PR_ERR, " \\ \n"); + prlog(PR_ERR, " \\ WW \n"); + prlog(PR_ERR, " <^ \\___/| \n"); + prlog(PR_ERR, " \\ / \n"); + prlog(PR_ERR, " \\_ _/ \n"); + prlog(PR_ERR, " }{ \n"); +} + + +/* + * nvram_query_safe/dangerous() - Searches skiboot NVRAM partition + * for a key=value pair. + * + * Dangerous means it should only be used for testing as it may + * mask issues. Safe is ok for long term use. + * + * Returns a pointer to a NUL terminated string that contains the value + * associated with the given key. + */ +static const char *__nvram_query(const char *key, bool dangerous) +{ + const char *part_end, *start; + int key_len = strlen(key); + + assert(key); + + if (!nvram_has_loaded()) { + prlog(PR_DEBUG, + "NVRAM: Query for '%s' must wait for NVRAM to load\n", + key); + if (!nvram_wait_for_load()) { + prlog(PR_CRIT, "NVRAM: Failed to load\n"); + return NULL; + } + } + + /* + * The running OS can modify the NVRAM as it pleases so we need to be + * a little paranoid and check that it's ok before we try parse it. 
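find_next_key() above and the query loop built around it treat the skiboot partition as a sequence of back-to-back NUL-terminated "key=value" strings, with an empty string marking the end of the used space, and a hit being a key match followed by '='. The same scan over a plain in-memory buffer with hypothetical contents:

#include <stdio.h>
#include <string.h>

/* Hypothetical partition body: "key=value" strings back to back,
 * each NUL terminated, with an empty string marking the end. */
static const char part[] = "fast-reset=1\0log-level-driver=7\0\0";

static const char *query(const char *key)
{
	const char *p = part;
	const char *end = part + sizeof(part);
	size_t key_len = strlen(key);

	while (p < end && *p) {
		if (!strncmp(p, key, key_len) && p[key_len] == '=')
			return p + key_len + 1;
		p += strlen(p) + 1;	/* hop to the next key=value string */
	}
	return NULL;
}

int main(void)
{
	const char *v = query("log-level-driver");

	printf("log-level-driver = %s\n", v ? v : "(not found)");
	printf("missing-key      = %s\n",
	       query("missing-key") ? "?" : "(not found)");
	return 0;
}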
+ * + * NB: nvram_validate() can update skiboot_part_hdr + */ + if (!nvram_validate()) + return NULL; + + assert(skiboot_part_hdr); + + part_end = (const char *) skiboot_part_hdr + + be16_to_cpu(skiboot_part_hdr->len) * 16 - 1; + + start = (const char *) skiboot_part_hdr + + sizeof(*skiboot_part_hdr); + + if (!key_len) { + prlog(PR_WARNING, "NVRAM: search key is empty!\n"); + return NULL; + } + + if (key_len > 32) + prlog(PR_WARNING, "NVRAM: search key '%s' is longer than 32 chars\n", key); + + while (start) { + int remaining = part_end - start; + + prlog(PR_TRACE, "NVRAM: '%s' (%lu)\n", + start, strlen(start)); + + if (key_len + 1 > remaining) + return NULL; + + if (!strncmp(key, start, key_len) && start[key_len] == '=') { + const char *value = &start[key_len + 1]; + + prlog(PR_DEBUG, "NVRAM: Searched for '%s' found '%s'\n", + key, value); + + if (dangerous) + nvram_dangerous(start); + return value; + } + + start = find_next_key(start, part_end); + } + + prlog(PR_DEBUG, "NVRAM: '%s' not found\n", key); + + return NULL; +} + +const char *nvram_query_safe(const char *key) +{ + return __nvram_query(key, false); +} + +const char *nvram_query_dangerous(const char *key) +{ + return __nvram_query(key, true); +} + +/* + * nvram_query_eq_safe/dangerous() - Check if the given 'key' exists + * and is set to 'value'. + * + * Dangerous means it should only be used for testing as it may + * mask issues. Safe is ok for long term use. + * + * Note: Its an error to check for non-existence of a key + * by passing 'value == NULL' as a key's value can never be + * NULL in nvram. + */ +static bool __nvram_query_eq(const char *key, const char *value, bool dangerous) +{ + const char *s = __nvram_query(key, dangerous); + + if (!s) + return false; + + assert(value != NULL); + return !strcmp(s, value); +} + +bool nvram_query_eq_safe(const char *key, const char *value) +{ + return __nvram_query_eq(key, value, false); +} + +bool nvram_query_eq_dangerous(const char *key, const char *value) +{ + return __nvram_query_eq(key, value, true); +} + diff --git a/roms/skiboot/core/nvram.c b/roms/skiboot/core/nvram.c new file mode 100644 index 000000000..773d20280 --- /dev/null +++ b/roms/skiboot/core/nvram.c @@ -0,0 +1,203 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* + * NVRAM support + * + * Copyright 2013-2018 IBM Corp. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +static void *nvram_image; +static uint32_t nvram_size; + +static bool nvram_ready; /* has the nvram been loaded? */ +static bool nvram_valid; /* is the nvram format ok? */ + +static int64_t opal_read_nvram(uint64_t buffer, uint64_t size, uint64_t offset) +{ + if (!nvram_ready) + return OPAL_HARDWARE; + + if (!opal_addr_valid((void *)buffer)) + return OPAL_PARAMETER; + + if (offset >= nvram_size || (offset + size) > nvram_size) + return OPAL_PARAMETER; + + memcpy((void *)buffer, nvram_image + offset, size); + return OPAL_SUCCESS; +} +opal_call(OPAL_READ_NVRAM, opal_read_nvram, 3); + +static int64_t opal_write_nvram(uint64_t buffer, uint64_t size, uint64_t offset) +{ + if (!nvram_ready) + return OPAL_HARDWARE; + + if (!opal_addr_valid((void *)buffer)) + return OPAL_PARAMETER; + + if (offset >= nvram_size || (offset + size) > nvram_size) + return OPAL_PARAMETER; + memcpy(nvram_image + offset, (void *)buffer, size); + if (platform.nvram_write) + platform.nvram_write(offset, nvram_image + offset, size); + + /* The host OS has written to the NVRAM so we can't be sure that it's + * well formatted. 
+ */ + nvram_valid = false; + + return OPAL_SUCCESS; +} +opal_call(OPAL_WRITE_NVRAM, opal_write_nvram, 3); + +bool nvram_validate(void) +{ + if (!nvram_valid) { + if (!nvram_check(nvram_image, nvram_size)) + nvram_valid = true; + } + + return nvram_valid; +} + +static void nvram_reformat(void) +{ + if (nvram_format(nvram_image, nvram_size)) { + prerror("NVRAM: Failed to format NVRAM!\n"); + nvram_valid = false; + return; + } + + /* Write the whole thing back */ + if (platform.nvram_write) + platform.nvram_write(0, nvram_image, nvram_size); + + nvram_validate(); +} + +void nvram_reinit(void) +{ + /* It's possible we failed to load nvram at boot. */ + if (!nvram_ready) + nvram_init(); + else if (!nvram_validate()) + nvram_reformat(); +} + +void nvram_read_complete(bool success) +{ + struct dt_node *np; + + /* Read not successful, error out and free the buffer */ + if (!success) { + free(nvram_image); + nvram_size = 0; + return; + } + + if (!nvram_validate()) + nvram_reformat(); + + /* Add nvram node */ + np = dt_new(opal_node, "nvram"); + dt_add_property_cells(np, "#bytes", nvram_size); + dt_add_property_string(np, "compatible", "ibm,opal-nvram"); + + /* Mark ready */ + nvram_ready = true; +} + +bool nvram_wait_for_load(void) +{ + uint64_t started; + + /* Short cut */ + if (nvram_ready) + return true; + + /* Tell the caller it will never happen */ + if (!platform.nvram_info) + return false; + + /* + * One of two things has happened here. + * 1. nvram_wait_for_load() was called before nvram_init() + * 2. The read of NVRAM failed. + * Either way, this is quite a bad event. + */ + if (!nvram_image && !nvram_size) { + prlog(PR_CRIT, "NVRAM: Possible wait before nvram_init()!\n"); + return false; + } + + started = mftb(); + + while (!nvram_ready) { + opal_run_pollers(); + /* If the read fails, tell the caller */ + if (!nvram_image && !nvram_size) + return false; + } + + prlog(PR_DEBUG, "NVRAM: Waited %lums for nvram to load\n", + tb_to_msecs(mftb() - started)); + + return true; +} + +bool nvram_has_loaded(void) +{ + return nvram_ready; +} + +void nvram_init(void) +{ + int rc; + + if (!platform.nvram_info) + return; + rc = platform.nvram_info(&nvram_size); + if (rc) { + prerror("NVRAM: Error %d retrieving nvram info\n", rc); + return; + } + prlog(PR_INFO, "NVRAM: Size is %d KB\n", nvram_size >> 10); + if (nvram_size > 0x100000) { + prlog(PR_WARNING, "NVRAM: Cropping to 1MB !\n"); + nvram_size = 0x100000; + } + + /* + * We allocate the nvram image with 4k alignment to make the + * FSP backend job's easier + */ + nvram_image = memalign(0x1000, nvram_size); + if (!nvram_image) { + prerror("NVRAM: Failed to allocate nvram image\n"); + nvram_size = 0; + return; + } + + /* Read it in */ + rc = platform.nvram_start_read(nvram_image, 0, nvram_size); + if (rc) { + prerror("NVRAM: Failed to read NVRAM from FSP !\n"); + nvram_size = 0; + free(nvram_image); + return; + } + + /* + * We'll get called back later (or recursively from + * nvram_start_read) in nvram_read_complete() + */ +} diff --git a/roms/skiboot/core/opal-dump.c b/roms/skiboot/core/opal-dump.c new file mode 100644 index 000000000..4f54a3ef1 --- /dev/null +++ b/roms/skiboot/core/opal-dump.c @@ -0,0 +1,582 @@ +/* Copyright 2019 IBM Corp. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#define pr_fmt(fmt) "DUMP: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "hdata/spira.h" + +/* XXX Ideally we should use HDAT provided data (proc_dump_area->thread_size). + * But we are not getting this data durig boot. Hence lets reserve fixed + * memory for architected registers data collection. + */ +#define ARCH_REGS_DATA_SIZE_PER_CHIP (512 * 1024) + +/* Actual address of MDST and MDDT table */ +#define MDST_TABLE_BASE (SKIBOOT_BASE + MDST_TABLE_OFF) +#define MDDT_TABLE_BASE (SKIBOOT_BASE + MDDT_TABLE_OFF) +#define PROC_DUMP_AREA_BASE (SKIBOOT_BASE + PROC_DUMP_AREA_OFF) + +static struct spira_ntuple *ntuple_mdst; +static struct spira_ntuple *ntuple_mddt; +static struct spira_ntuple *ntuple_mdrt; + +static struct mpipl_metadata *mpipl_metadata; + +/* Dump metadata area */ +static struct opal_mpipl_fadump *opal_mpipl_data; +static struct opal_mpipl_fadump *opal_mpipl_cpu_data; + +/* + * Number of tags passed by OPAL to kernel after MPIPL boot. + * Currently it supports below tags: + * - CPU register data area + * - OPAL metadata area address + * - Kernel passed tag during MPIPL registration + * - Post MPIPL boot memory size + */ +#define MAX_OPAL_MPIPL_TAGS 0x04 +static u64 opal_mpipl_tags[MAX_OPAL_MPIPL_TAGS]; +static int opal_mpipl_max_tags = MAX_OPAL_MPIPL_TAGS; + +static u64 opal_dump_addr, opal_dump_size; + +static bool mpipl_enabled; + +static int opal_mpipl_add_entry(u8 region, u64 src, u64 dest, u64 size) +{ + int i; + int mdst_cnt = be16_to_cpu(ntuple_mdst->act_cnt); + int mddt_cnt = be16_to_cpu(ntuple_mddt->act_cnt); + struct mdst_table *mdst; + struct mddt_table *mddt; + + if (mdst_cnt >= MDST_TABLE_SIZE / sizeof(struct mdst_table)) { + prlog(PR_DEBUG, "MDST table is full\n"); + return OPAL_RESOURCE; + } + + if (mddt_cnt >= MDDT_TABLE_SIZE / sizeof(struct mddt_table)) { + prlog(PR_DEBUG, "MDDT table is full\n"); + return OPAL_RESOURCE; + } + + /* Use relocated memory address */ + mdst = (void *)(MDST_TABLE_BASE); + mddt = (void *)(MDDT_TABLE_BASE); + + /* Check for duplicate entry */ + for (i = 0; i < mdst_cnt; i++) { + if (be64_to_cpu(mdst->addr) == (src | HRMOR_BIT)) { + prlog(PR_DEBUG, + "Duplicate source address : 0x%llx", src); + return OPAL_PARAMETER; + } + mdst++; + } + for (i = 0; i < mddt_cnt; i++) { + if (be64_to_cpu(mddt->addr) == (dest | HRMOR_BIT)) { + prlog(PR_DEBUG, + "Duplicate destination address : 0x%llx", dest); + return OPAL_PARAMETER; + } + mddt++; + } + + /* Add OPAL source address to MDST entry */ + mdst->addr = cpu_to_be64(src | HRMOR_BIT); + mdst->data_region = region; + mdst->size = cpu_to_be32(size); + ntuple_mdst->act_cnt = cpu_to_be16(mdst_cnt + 1); + + /* Add OPAL destination address to MDDT entry */ + mddt->addr = cpu_to_be64(dest | HRMOR_BIT); + mddt->data_region = region; + mddt->size = cpu_to_be32(size); + ntuple_mddt->act_cnt = cpu_to_be16(mddt_cnt + 1); + + prlog(PR_TRACE, "Added new entry. 
src : 0x%llx, dest : 0x%llx," + " size : 0x%llx\n", src, dest, size); + return OPAL_SUCCESS; +} + +/* Remove entry from source (MDST) table */ +static int opal_mpipl_remove_entry_mdst(bool remove_all, u8 region, u64 src) +{ + bool found = false; + int i, j; + int mdst_cnt = be16_to_cpu(ntuple_mdst->act_cnt); + struct mdst_table *tmp_mdst; + struct mdst_table *mdst = (void *)(MDST_TABLE_BASE); + + for (i = 0; i < mdst_cnt;) { + if (mdst->data_region != region) { + mdst++; + i++; + continue; + } + + if (remove_all != true && + be64_to_cpu(mdst->addr) != (src | HRMOR_BIT)) { + mdst++; + i++; + continue; + } + + tmp_mdst = mdst; + memset(tmp_mdst, 0, sizeof(struct mdst_table)); + + for (j = i; j < mdst_cnt - 1; j++) { + memcpy((void *)tmp_mdst, + (void *)(tmp_mdst + 1), sizeof(struct mdst_table)); + tmp_mdst++; + memset(tmp_mdst, 0, sizeof(struct mdst_table)); + } + + mdst_cnt--; + + if (remove_all == false) { + found = true; + break; + } + } /* end - for loop */ + + ntuple_mdst->act_cnt = cpu_to_be16((u16)mdst_cnt); + + if (remove_all == false && found == false) { + prlog(PR_DEBUG, + "Source address [0x%llx] not found in MDST table\n", src); + return OPAL_PARAMETER; + } + + return OPAL_SUCCESS; +} + +/* Remove entry from destination (MDDT) table */ +static int opal_mpipl_remove_entry_mddt(bool remove_all, u8 region, u64 dest) +{ + bool found = false; + int i, j; + int mddt_cnt = be16_to_cpu(ntuple_mddt->act_cnt); + struct mddt_table *tmp_mddt; + struct mddt_table *mddt = (void *)(MDDT_TABLE_BASE); + + for (i = 0; i < mddt_cnt;) { + if (mddt->data_region != region) { + mddt++; + i++; + continue; + } + + if (remove_all != true && + be64_to_cpu(mddt->addr) != (dest | HRMOR_BIT)) { + mddt++; + i++; + continue; + } + + tmp_mddt = mddt; + memset(tmp_mddt, 0, sizeof(struct mddt_table)); + + for (j = i; j < mddt_cnt - 1; j++) { + memcpy((void *)tmp_mddt, + (void *)(tmp_mddt + 1), sizeof(struct mddt_table)); + tmp_mddt++; + memset(tmp_mddt, 0, sizeof(struct mddt_table)); + } + + mddt_cnt--; + + if (remove_all == false) { + found = true; + break; + } + } /* end - for loop */ + + ntuple_mddt->act_cnt = cpu_to_be16((u16)mddt_cnt); + + if (remove_all == false && found == false) { + prlog(PR_DEBUG, + "Dest address [0x%llx] not found in MDDT table\n", dest); + return OPAL_PARAMETER; + } + + return OPAL_SUCCESS; +} + +/* Register for OPAL dump. 
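+ *
+ * Rough sketch of the result (illustrative, the details live in
+ * opal_mpipl_add_entry() above): one DUMP_REGION_OPAL_MEMORY pair is
+ * added, i.e.
+ *   MDST entry: addr = SKIBOOT_BASE   | HRMOR_BIT, size = opal_dump_size
+ *   MDDT entry: addr = opal_dump_addr | HRMOR_BIT, size = opal_dump_size
+ * and the range right after the dump destination is advertised via the
+ * proc dump area for architected register data.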
*/ +static void opal_mpipl_register(void) +{ + u64 arch_regs_dest, arch_regs_size; + struct proc_dump_area *proc_dump = (void *)(PROC_DUMP_AREA_BASE); + + /* Add OPAL reservation detail to MDST/MDDT table */ + opal_mpipl_add_entry(DUMP_REGION_OPAL_MEMORY, + SKIBOOT_BASE, opal_dump_addr, opal_dump_size); + + /* Thread size check */ + if (proc_dump->thread_size != 0) { + prlog(PR_INFO, "Thread register entry size is available, " + "but not supported.\n"); + } + + /* Reserve memory used to capture architected register state */ + arch_regs_dest = opal_dump_addr + opal_dump_size; + arch_regs_size = nr_chips() * ARCH_REGS_DATA_SIZE_PER_CHIP; + proc_dump->alloc_addr = cpu_to_be64(arch_regs_dest | HRMOR_BIT); + proc_dump->alloc_size = cpu_to_be32(arch_regs_size); + prlog(PR_NOTICE, "Architected register dest addr : 0x%llx, " + "size : 0x%llx\n", arch_regs_dest, arch_regs_size); +} + +static int payload_mpipl_register(u64 src, u64 dest, u64 size) +{ + if (!opal_addr_valid((void *)src)) { + prlog(PR_DEBUG, "Invalid source address [0x%llx]\n", src); + return OPAL_PARAMETER; + } + + if (!opal_addr_valid((void *)dest)) { + prlog(PR_DEBUG, "Invalid dest address [0x%llx]\n", dest); + return OPAL_PARAMETER; + } + + if (size <= 0) { + prlog(PR_DEBUG, "Invalid size [0x%llx]\n", size); + return OPAL_PARAMETER; + } + + return opal_mpipl_add_entry(DUMP_REGION_KERNEL, src, dest, size); +} + +static int payload_mpipl_unregister(u64 src, u64 dest) +{ + int rc; + + /* Remove src from MDST table */ + rc = opal_mpipl_remove_entry_mdst(false, DUMP_REGION_KERNEL, src); + if (rc) + return rc; + + /* Remove dest from MDDT table */ + rc = opal_mpipl_remove_entry_mddt(false, DUMP_REGION_KERNEL, dest); + return rc; +} + +static int payload_mpipl_unregister_all(void) +{ + opal_mpipl_remove_entry_mdst(true, DUMP_REGION_KERNEL, 0); + opal_mpipl_remove_entry_mddt(true, DUMP_REGION_KERNEL, 0); + + return OPAL_SUCCESS; +} + +static int64_t opal_mpipl_update(enum opal_mpipl_ops ops, + u64 src, u64 dest, u64 size) +{ + int rc; + + switch (ops) { + case OPAL_MPIPL_ADD_RANGE: + rc = payload_mpipl_register(src, dest, size); + if (!rc) + prlog(PR_NOTICE, "Payload registered for MPIPL\n"); + break; + case OPAL_MPIPL_REMOVE_RANGE: + rc = payload_mpipl_unregister(src, dest); + if (!rc) { + prlog(PR_NOTICE, "Payload removed entry from MPIPL." 
+ "[src : 0x%llx, dest : 0x%llx]\n", src, dest); + } + break; + case OPAL_MPIPL_REMOVE_ALL: + rc = payload_mpipl_unregister_all(); + if (!rc) + prlog(PR_NOTICE, "Payload unregistered for MPIPL\n"); + break; + case OPAL_MPIPL_FREE_PRESERVED_MEMORY: + /* Clear tags */ + memset(&opal_mpipl_tags, 0, (sizeof(u64) * MAX_OPAL_MPIPL_TAGS)); + opal_mpipl_max_tags = 0; + /* Release memory */ + free(opal_mpipl_data); + opal_mpipl_data = NULL; + free(opal_mpipl_cpu_data); + opal_mpipl_cpu_data = NULL; + /* Clear MDRT table */ + memset((void *)MDRT_TABLE_BASE, 0, MDRT_TABLE_SIZE); + /* Set MDRT count to max allocated count */ + ntuple_mdrt->act_cnt = cpu_to_be16(MDRT_TABLE_SIZE / sizeof(struct mdrt_table)); + rc = OPAL_SUCCESS; + prlog(PR_NOTICE, "Payload Invalidated MPIPL\n"); + break; + default: + prlog(PR_DEBUG, "Unsupported MPIPL update operation : 0x%x\n", ops); + rc = OPAL_PARAMETER; + break; + } + + return rc; +} + +static int64_t opal_mpipl_register_tag(enum opal_mpipl_tags tag, + uint64_t tag_val) +{ + int rc = OPAL_SUCCESS; + + switch (tag) { + case OPAL_MPIPL_TAG_BOOT_MEM: + if (tag_val <= 0 || tag_val > top_of_ram) { + prlog(PR_DEBUG, "Payload sent invalid boot mem size" + " : 0x%llx\n", tag_val); + rc = OPAL_PARAMETER; + } else { + mpipl_metadata->boot_mem_size = tag_val; + prlog(PR_NOTICE, "Boot mem size : 0x%llx\n", tag_val); + } + break; + case OPAL_MPIPL_TAG_KERNEL: + mpipl_metadata->kernel_tag = tag_val; + prlog(PR_NOTICE, "Payload sent metadata tag : 0x%llx\n", tag_val); + break; + default: + prlog(PR_DEBUG, "Payload sent unsupported tag : 0x%x\n", tag); + rc = OPAL_PARAMETER; + break; + } + return rc; +} + +static uint64_t opal_mpipl_query_tag(enum opal_mpipl_tags tag, __be64 *tag_val) +{ + if (!opal_addr_valid(tag_val)) { + prlog(PR_DEBUG, "Invalid tag address\n"); + return OPAL_PARAMETER; + } + + if (tag >= opal_mpipl_max_tags) + return OPAL_PARAMETER; + + *tag_val = cpu_to_be64(opal_mpipl_tags[tag]); + return OPAL_SUCCESS; +} + +static inline void post_mpipl_get_preserved_tags(void) +{ + if (mpipl_metadata->kernel_tag) + opal_mpipl_tags[OPAL_MPIPL_TAG_KERNEL] = mpipl_metadata->kernel_tag; + if (mpipl_metadata->boot_mem_size) + opal_mpipl_tags[OPAL_MPIPL_TAG_BOOT_MEM] = mpipl_metadata->boot_mem_size; +} + +static void post_mpipl_arch_regs_data(void) +{ + struct proc_dump_area *proc_dump = (void *)(PROC_DUMP_AREA_BASE); + + if (proc_dump->dest_addr == 0) { + prlog(PR_DEBUG, "Invalid CPU registers destination address\n"); + return; + } + + if (proc_dump->act_size == 0) { + prlog(PR_DEBUG, "Invalid CPU registers destination size\n"); + return; + } + + opal_mpipl_cpu_data = zalloc(sizeof(struct opal_mpipl_fadump) + + sizeof(struct opal_mpipl_region)); + if (!opal_mpipl_cpu_data) { + prlog(PR_ERR, "Failed to allocate memory\n"); + return; + } + + /* Fill CPU register details */ + opal_mpipl_cpu_data->version = OPAL_MPIPL_VERSION; + opal_mpipl_cpu_data->cpu_data_version = cpu_to_be32((u32)proc_dump->version); + opal_mpipl_cpu_data->cpu_data_size = proc_dump->thread_size; + opal_mpipl_cpu_data->region_cnt = cpu_to_be32(1); + + opal_mpipl_cpu_data->region[0].src = proc_dump->dest_addr & ~(cpu_to_be64(HRMOR_BIT)); + opal_mpipl_cpu_data->region[0].dest = proc_dump->dest_addr & ~(cpu_to_be64(HRMOR_BIT)); + opal_mpipl_cpu_data->region[0].size = cpu_to_be64(be32_to_cpu(proc_dump->act_size)); + + /* Update tag */ + opal_mpipl_tags[OPAL_MPIPL_TAG_CPU] = (u64)opal_mpipl_cpu_data; +} + +static void post_mpipl_get_opal_data(void) +{ + struct mdrt_table *mdrt = (void *)(MDRT_TABLE_BASE); + int i, j 
= 0, count = 0; + int mdrt_cnt = be16_to_cpu(ntuple_mdrt->act_cnt); + struct opal_mpipl_region *region; + + /* Count OPAL dump regions */ + for (i = 0; i < mdrt_cnt; i++) { + if (mdrt->data_region == DUMP_REGION_OPAL_MEMORY) + count++; + mdrt++; + } + + if (count == 0) { + prlog(PR_INFO, "OPAL dump is not available\n"); + return; + } + + opal_mpipl_data = zalloc(sizeof(struct opal_mpipl_fadump) + + count * sizeof(struct opal_mpipl_region)); + if (!opal_mpipl_data) { + prlog(PR_ERR, "Failed to allocate memory\n"); + return; + } + + /* Fill OPAL dump details */ + opal_mpipl_data->version = OPAL_MPIPL_VERSION; + opal_mpipl_data->crashing_pir = cpu_to_be32(mpipl_metadata->crashing_pir); + opal_mpipl_data->region_cnt = cpu_to_be32(count); + region = opal_mpipl_data->region; + + mdrt = (void *)(MDRT_TABLE_BASE); + for (i = 0; i < mdrt_cnt; i++) { + if (mdrt->data_region != DUMP_REGION_OPAL_MEMORY) { + mdrt++; + continue; + } + + region[j].src = mdrt->src_addr & ~(cpu_to_be64(HRMOR_BIT)); + region[j].dest = mdrt->dest_addr & ~(cpu_to_be64(HRMOR_BIT)); + region[j].size = cpu_to_be64(be32_to_cpu(mdrt->size)); + + prlog(PR_NOTICE, "OPAL reserved region %d - src : 0x%llx, " + "dest : 0x%llx, size : 0x%llx\n", j, + be64_to_cpu(region[j].src), be64_to_cpu(region[j].dest), + be64_to_cpu(region[j].size)); + + mdrt++; + j++; + if (j == count) + break; + } + + opal_mpipl_tags[OPAL_MPIPL_TAG_OPAL] = (u64)opal_mpipl_data; +} + +void opal_mpipl_save_crashing_pir(void) +{ + if (!is_mpipl_enabled()) + return; + + mpipl_metadata->crashing_pir = this_cpu()->pir; + prlog(PR_NOTICE, "Crashing PIR = 0x%x\n", this_cpu()->pir); +} + +void opal_mpipl_reserve_mem(void) +{ + struct dt_node *opal_node, *dump_node; + u64 arch_regs_dest, arch_regs_size; + + opal_node = dt_find_by_path(dt_root, "ibm,opal"); + if (!opal_node) + return; + + dump_node = dt_find_by_path(opal_node, "dump"); + if (!dump_node) + return; + + /* Calculcate and Reserve OPAL dump destination memory */ + opal_dump_size = SKIBOOT_SIZE + (cpu_max_pir + 1) * STACK_SIZE; + opal_dump_addr = SKIBOOT_BASE + opal_dump_size; + mem_reserve_fw("ibm,firmware-dump", + opal_dump_addr, opal_dump_size); + + /* Reserve memory to capture CPU register data */ + arch_regs_dest = opal_dump_addr + opal_dump_size; + arch_regs_size = nr_chips() * ARCH_REGS_DATA_SIZE_PER_CHIP; + mem_reserve_fw("ibm,firmware-arch-registers", + arch_regs_dest, arch_regs_size); +} + +bool is_mpipl_enabled(void) +{ + return mpipl_enabled; +} + +void opal_mpipl_init(void) +{ + void *mdst_base = (void *)MDST_TABLE_BASE; + void *mddt_base = (void *)MDDT_TABLE_BASE; + struct dt_node *dump_node; + + dump_node = dt_find_by_path(opal_node, "dump"); + if (!dump_node) + return; + + /* Get MDST and MDDT ntuple from SPIRAH */ + ntuple_mdst = &(spirah.ntuples.mdump_src); + ntuple_mddt = &(spirah.ntuples.mdump_dst); + ntuple_mdrt = &(spirah.ntuples.mdump_res); + + /* Get metadata area pointer */ + mpipl_metadata = (void *)(DUMP_METADATA_AREA_BASE); + + if (dt_find_property(dump_node, "mpipl-boot")) { + disable_fast_reboot("MPIPL Boot"); + + post_mpipl_get_preserved_tags(); + post_mpipl_get_opal_data(); + post_mpipl_arch_regs_data(); + } + + /* Clear OPAL metadata area */ + if (sizeof(struct mpipl_metadata) > DUMP_METADATA_AREA_SIZE) { + prlog(PR_ERR, "INSUFFICIENT OPAL METADATA AREA\n"); + prlog(PR_ERR, "INCREASE OPAL MEDTADATA AREA SIZE\n"); + assert(false); + } + memset(mpipl_metadata, 0, sizeof(struct mpipl_metadata)); + + /* Clear MDST and MDDT table */ + memset(mdst_base, 0, MDST_TABLE_SIZE); + 
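/* Keep the ntuple active entry counts in sync with the freshly cleared tables */ +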
ntuple_mdst->act_cnt = 0; + memset(mddt_base, 0, MDDT_TABLE_SIZE); + ntuple_mddt->act_cnt = 0; + + opal_mpipl_register(); + + /* Send OPAL relocated base address to SBE */ + p9_sbe_send_relocated_base(SKIBOOT_BASE); + + /* OPAL API for MPIPL update */ + opal_register(OPAL_MPIPL_UPDATE, opal_mpipl_update, 4); + opal_register(OPAL_MPIPL_REGISTER_TAG, opal_mpipl_register_tag, 2); + opal_register(OPAL_MPIPL_QUERY_TAG, opal_mpipl_query_tag, 2); + + /* Enable MPIPL */ + mpipl_enabled = true; +} diff --git a/roms/skiboot/core/opal-msg.c b/roms/skiboot/core/opal-msg.c new file mode 100644 index 000000000..65a2476b2 --- /dev/null +++ b/roms/skiboot/core/opal-msg.c @@ -0,0 +1,193 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* + * OPAL Message queue between host and skiboot + * + * Copyright 2013-2019 IBM Corp. + */ + +#define pr_fmt(fmt) "opalmsg: " fmt +#include +#include +#include +#include + +#define OPAL_MAX_MSGS (OPAL_MSG_TYPE_MAX + OPAL_MAX_ASYNC_COMP - 1) + +struct opal_msg_entry { + struct list_node link; + void (*consumed)(void *data, int status); + bool extended; + void *data; + struct opal_msg msg; +}; + +static LIST_HEAD(msg_free_list); +static LIST_HEAD(msg_pending_list); + +static struct lock opal_msg_lock = LOCK_UNLOCKED; + +int _opal_queue_msg(enum opal_msg_type msg_type, void *data, + void (*consumed)(void *data, int status), + size_t params_size, const void *params) +{ + struct opal_msg_entry *entry; + uint64_t entry_size; + + if ((params_size + OPAL_MSG_HDR_SIZE) > OPAL_MSG_SIZE) { + prlog(PR_DEBUG, "param_size (0x%x) > opal_msg param size (0x%x)\n", + (u32)params_size, (u32)(OPAL_MSG_SIZE - OPAL_MSG_HDR_SIZE)); + return OPAL_PARAMETER; + } + + lock(&opal_msg_lock); + + if (params_size > OPAL_MSG_FIXED_PARAMS_SIZE) { + entry_size = sizeof(struct opal_msg_entry) + params_size; + entry_size -= OPAL_MSG_FIXED_PARAMS_SIZE; + entry = zalloc(entry_size); + if (entry) + entry->extended = true; + } else { + entry = list_pop(&msg_free_list, struct opal_msg_entry, link); + if (!entry) { + prerror("No available node in the free list, allocating\n"); + entry = zalloc(sizeof(struct opal_msg_entry)); + } + } + if (!entry) { + prerror("Allocation failed\n"); + unlock(&opal_msg_lock); + return OPAL_RESOURCE; + } + + entry->consumed = consumed; + entry->data = data; + entry->msg.msg_type = cpu_to_be32(msg_type); + entry->msg.size = cpu_to_be32(params_size); + memcpy(entry->msg.params, params, params_size); + + list_add_tail(&msg_pending_list, &entry->link); + opal_update_pending_evt(OPAL_EVENT_MSG_PENDING, + OPAL_EVENT_MSG_PENDING); + unlock(&opal_msg_lock); + + return OPAL_SUCCESS; +} + +static int64_t opal_get_msg(uint64_t *buffer, uint64_t size) +{ + struct opal_msg_entry *entry; + void (*callback)(void *data, int status); + void *data; + uint64_t msg_size; + int rc = OPAL_SUCCESS; + + if (size < sizeof(struct opal_msg) || !buffer) + return OPAL_PARAMETER; + + if (!opal_addr_valid(buffer)) + return OPAL_PARAMETER; + + lock(&opal_msg_lock); + + entry = list_pop(&msg_pending_list, struct opal_msg_entry, link); + if (!entry) { + unlock(&opal_msg_lock); + return OPAL_RESOURCE; + } + + msg_size = OPAL_MSG_HDR_SIZE + be32_to_cpu(entry->msg.size); + if (size < msg_size) { + /* Send partial data to Linux */ + prlog(PR_NOTICE, "Sending partial data [msg_type : 0x%x, " + "msg_size : 0x%x, buf_size : 0x%x]\n", + be32_to_cpu(entry->msg.msg_type), + (u32)msg_size, (u32)size); + + entry->msg.size = cpu_to_be32(size - OPAL_MSG_HDR_SIZE); + msg_size = size; + rc = OPAL_PARTIAL; + } + + 
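/* Hand the caller a copy of the message before recycling the entry */ +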
memcpy((void *)buffer, (void *)&entry->msg, msg_size); + callback = entry->consumed; + data = entry->data; + + if (entry->extended) + free(entry); + else + list_add(&msg_free_list, &entry->link); + + if (list_empty(&msg_pending_list)) + opal_update_pending_evt(OPAL_EVENT_MSG_PENDING, 0); + + unlock(&opal_msg_lock); + + if (callback) + callback(data, rc); + + return rc; +} +opal_call(OPAL_GET_MSG, opal_get_msg, 2); + +static int64_t opal_check_completion(uint64_t *buffer, uint64_t size, + uint64_t token) +{ + struct opal_msg_entry *entry, *next_entry; + void (*callback)(void *data, int status) = NULL; + int rc = OPAL_BUSY; + void *data = NULL; + + if (!opal_addr_valid(buffer)) + return OPAL_PARAMETER; + + lock(&opal_msg_lock); + list_for_each_safe(&msg_pending_list, entry, next_entry, link) { + if (be32_to_cpu(entry->msg.msg_type) == OPAL_MSG_ASYNC_COMP && + be64_to_cpu(entry->msg.params[0]) == token) { + list_del(&entry->link); + callback = entry->consumed; + data = entry->data; + list_add(&msg_free_list, &entry->link); + if (list_empty(&msg_pending_list)) + opal_update_pending_evt(OPAL_EVENT_MSG_PENDING, + 0); + rc = OPAL_SUCCESS; + break; + } + } + + if (rc == OPAL_SUCCESS && size >= sizeof(struct opal_msg)) + memcpy(buffer, &entry->msg, sizeof(entry->msg)); + + unlock(&opal_msg_lock); + + if (callback) + callback(data, OPAL_SUCCESS); + + return rc; + +} +opal_call(OPAL_CHECK_ASYNC_COMPLETION, opal_check_completion, 3); + +void opal_init_msg(void) +{ + struct opal_msg_entry *entry; + int i; + + for (i = 0; i < OPAL_MAX_MSGS; i++, entry++) { + entry = zalloc(sizeof(*entry)); + if (!entry) + goto err; + list_add_tail(&msg_free_list, &entry->link); + } + return; + +err: + for (; i > 0; i--) { + entry = list_pop(&msg_free_list, struct opal_msg_entry, link); + if (entry) + free(entry); + } +} + diff --git a/roms/skiboot/core/opal.c b/roms/skiboot/core/opal.c new file mode 100644 index 000000000..2898a45ce --- /dev/null +++ b/roms/skiboot/core/opal.c @@ -0,0 +1,700 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* + * Base support for OPAL calls + * + * Copyright 2013-2019 IBM Corp. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Pending events to signal via opal_poll_events */ +uint64_t opal_pending_events; + +/* OPAL dispatch table defined in head.S */ +extern const uint64_t opal_branch_table[]; + +/* Number of args expected for each call. */ +static const u8 opal_num_args[OPAL_LAST+1]; + +/* OPAL anchor node */ +struct dt_node *opal_node; + +/* mask of dynamic vs fixed events; opal_allocate_dynamic_event will + * only allocate from this range */ +static const uint64_t opal_dynamic_events_mask = 0xffffffff00000000ul; +static uint64_t opal_dynamic_events; + +extern uint32_t attn_trigger; +extern uint32_t hir_trigger; + + +void opal_table_init(void) +{ + struct opal_table_entry *s = __opal_table_start; + struct opal_table_entry *e = __opal_table_end; + + prlog(PR_DEBUG, "OPAL table: %p .. %p, branch table: %p\n", + s, e, opal_branch_table); + while(s < e) { + ((uint64_t *)opal_branch_table)[s->token] = function_entry_address(s->func); + ((u8 *)opal_num_args)[s->token] = s->nargs; + s++; + } +} + +/* Called from head.S, thus no prototype */ +long opal_bad_token(uint64_t token); + +long opal_bad_token(uint64_t token) +{ + /** + * @fwts-label OPALBadToken + * @fwts-advice OPAL was called with a bad token. 
On POWER8 and + * earlier, Linux kernels had a bug where they wouldn't check + * if firmware supported particular OPAL calls before making them. + * It is, in fact, harmless for these cases. On systems newer than + * POWER8, this should never happen and indicates a kernel bug + * where OPAL_CHECK_TOKEN isn't being called where it should be. + */ + prlog(PR_ERR, "OPAL: Called with bad token %lld !\n", token); + + return OPAL_PARAMETER; +} + +#ifdef OPAL_TRACE_ENTRY +static void opal_trace_entry(struct stack_frame *eframe __unused) +{ + union trace t; + unsigned nargs, i; + + if (eframe->gpr[0] > OPAL_LAST) + nargs = 0; + else + nargs = opal_num_args[eframe->gpr[0]]; + + t.opal.token = cpu_to_be64(eframe->gpr[0]); + t.opal.lr = cpu_to_be64(eframe->lr); + t.opal.sp = cpu_to_be64(eframe->gpr[1]); + for(i=0; igpr[3+i]); + + trace_add(&t, TRACE_OPAL, offsetof(struct trace_opal, r3_to_11[nargs])); +} +#endif + +/* + * opal_quiesce_state is used as a lock. Don't use an actual lock to avoid + * lock busting. + */ +static uint32_t opal_quiesce_state; /* 0 or QUIESCE_HOLD/QUIESCE_REJECT */ +static int32_t opal_quiesce_owner; /* PIR */ +static int32_t opal_quiesce_target; /* -1 or PIR */ + +static int64_t opal_check_token(uint64_t token); + +/* Called from head.S, thus no prototype */ +int64_t opal_entry_check(struct stack_frame *eframe); + +int64_t opal_entry_check(struct stack_frame *eframe) +{ + struct cpu_thread *cpu = this_cpu(); + uint64_t token = eframe->gpr[0]; + + if (cpu->pir != mfspr(SPR_PIR)) { + printf("CPU MISMATCH ! PIR=%04lx cpu @%p -> pir=%04x token=%llu\n", + mfspr(SPR_PIR), cpu, cpu->pir, token); + abort(); + } + +#ifdef OPAL_TRACE_ENTRY + opal_trace_entry(eframe); +#endif + + if (!opal_check_token(token)) + return opal_bad_token(token); + + if (!opal_quiesce_state && cpu->in_opal_call > 1) { + disable_fast_reboot("Kernel re-entered OPAL"); + switch (token) { + case OPAL_CONSOLE_READ: + case OPAL_CONSOLE_WRITE: + case OPAL_CONSOLE_WRITE_BUFFER_SPACE: + case OPAL_CONSOLE_FLUSH: + case OPAL_POLL_EVENTS: + case OPAL_CHECK_TOKEN: + case OPAL_CEC_REBOOT: + case OPAL_CEC_REBOOT2: + case OPAL_SIGNAL_SYSTEM_RESET: + break; + default: + printf("CPU ATTEMPT TO RE-ENTER FIRMWARE! PIR=%04lx cpu @%p -> pir=%04x token=%llu\n", + mfspr(SPR_PIR), cpu, cpu->pir, token); + if (cpu->in_opal_call > 2) { + printf("Emergency stack is destroyed, can't continue.\n"); + abort(); + } + return OPAL_INTERNAL_ERROR; + } + } + + cpu->entered_opal_call_at = mftb(); + return OPAL_SUCCESS; +} + +int64_t opal_exit_check(int64_t retval, struct stack_frame *eframe); + +int64_t opal_exit_check(int64_t retval, struct stack_frame *eframe) +{ + struct cpu_thread *cpu = this_cpu(); + uint64_t token = eframe->gpr[0]; + uint64_t now = mftb(); + uint64_t call_time = tb_to_msecs(now - cpu->entered_opal_call_at); + + if (!cpu->in_opal_call) { + disable_fast_reboot("Un-accounted firmware entry"); + printf("CPU UN-ACCOUNTED FIRMWARE ENTRY! PIR=%04lx cpu @%p -> pir=%04x token=%llu retval=%lld\n", + mfspr(SPR_PIR), cpu, cpu->pir, token, retval); + cpu->in_opal_call++; /* avoid exit path underflowing */ + } else { + if (cpu->in_opal_call > 2) { + printf("Emergency stack is destroyed, can't continue.\n"); + abort(); + } + if (!list_empty(&cpu->locks_held)) { + prlog(PR_ERR, "OPAL exiting with locks held, pir=%04x token=%llu retval=%lld\n", + cpu->pir, token, retval); + drop_my_locks(true); + } + } + + if (call_time > 100 && token != OPAL_RESYNC_TIMEBASE) { + prlog((call_time < 1000) ? 
PR_DEBUG : PR_WARNING, + "Spent %llu msecs in OPAL call %llu!\n", + call_time, token); + } + + cpu->current_token = 0; + + return retval; +} + +int64_t opal_quiesce(uint32_t quiesce_type, int32_t cpu_target) +{ + struct cpu_thread *cpu = this_cpu(); + struct cpu_thread *target = NULL; + struct cpu_thread *c; + uint64_t end; + bool stuck = false; + + if (cpu_target >= 0) { + target = find_cpu_by_server(cpu_target); + if (!target) + return OPAL_PARAMETER; + } else if (cpu_target != -1) { + return OPAL_PARAMETER; + } + + if (quiesce_type == QUIESCE_HOLD || quiesce_type == QUIESCE_REJECT) { + if (cmpxchg32(&opal_quiesce_state, 0, quiesce_type) != 0) { + if (opal_quiesce_owner != cpu->pir) { + /* + * Nested is allowed for now just for + * internal uses, so an error is returned + * for OS callers, but no error message + * printed if we are nested. + */ + printf("opal_quiesce already quiescing\n"); + } + return OPAL_BUSY; + } + opal_quiesce_owner = cpu->pir; + opal_quiesce_target = cpu_target; + } + + if (opal_quiesce_owner != cpu->pir) { + printf("opal_quiesce CPU does not own quiesce state (must call QUIESCE_HOLD or QUIESCE_REJECT)\n"); + return OPAL_BUSY; + } + + /* Okay now we own the quiesce state */ + + if (quiesce_type == QUIESCE_RESUME || + quiesce_type == QUIESCE_RESUME_FAST_REBOOT) { + bust_locks = false; + sync(); /* release barrier vs opal entry */ + if (target) { + target->quiesce_opal_call = 0; + } else { + for_each_cpu(c) { + if (quiesce_type == QUIESCE_RESUME_FAST_REBOOT) + c->in_opal_call = 0; + + if (c == cpu) { + assert(!c->quiesce_opal_call); + continue; + } + c->quiesce_opal_call = 0; + } + } + sync(); + opal_quiesce_state = 0; + return OPAL_SUCCESS; + } + + if (quiesce_type == QUIESCE_LOCK_BREAK) { + if (opal_quiesce_target != -1) { + printf("opal_quiesce has not quiesced all CPUs (must target -1)\n"); + return OPAL_BUSY; + } + bust_locks = true; + return OPAL_SUCCESS; + } + + if (target) { + target->quiesce_opal_call = quiesce_type; + } else { + for_each_cpu(c) { + if (c == cpu) + continue; + c->quiesce_opal_call = quiesce_type; + } + } + + sync(); /* Order stores to quiesce_opal_call vs loads of in_opal_call */ + + end = mftb() + msecs_to_tb(1000); + + smt_lowest(); + if (target) { + while (target->in_opal_call) { + if (tb_compare(mftb(), end) == TB_AAFTERB) { + printf("OPAL quiesce CPU:%04x stuck in OPAL\n", target->pir); + stuck = true; + break; + } + barrier(); + } + } else { + for_each_cpu(c) { + if (c == cpu) + continue; + while (c->in_opal_call) { + if (tb_compare(mftb(), end) == TB_AAFTERB) { + printf("OPAL quiesce CPU:%04x stuck in OPAL\n", c->pir); + stuck = true; + break; + } + barrier(); + } + } + } + smt_medium(); + sync(); /* acquire barrier vs opal entry */ + + if (stuck) { + printf("OPAL quiesce could not kick all CPUs out of OPAL\n"); + return OPAL_PARTIAL; + } + + return OPAL_SUCCESS; +} +opal_call(OPAL_QUIESCE, opal_quiesce, 2); + +void __opal_register(uint64_t token, void *func, unsigned int nargs) +{ + assert(token <= OPAL_LAST); + + ((uint64_t *)opal_branch_table)[token] = function_entry_address(func); + ((u8 *)opal_num_args)[token] = nargs; +} + +/* + * add_opal_firmware_exports_node: adds properties to the device-tree which + * the OS will then change into sysfs nodes. + * The properties must be placed under /ibm,opal/firmware/exports. + * The new sysfs nodes are created under /opal/exports. 
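+ * (For example, the "symbol_map" property added below is typically
+ * surfaced by Linux as /sys/firmware/opal/exports/symbol_map.)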
+ * To be correctly exported the properties must contain: + * name + * base memory location (u64) + * size (u64) + */ +static void add_opal_firmware_exports_node(struct dt_node *node) +{ + struct dt_node *exports = dt_new(node, "exports"); + uint64_t sym_start = (uint64_t)__sym_map_start; + uint64_t sym_size = (uint64_t)__sym_map_end - sym_start; + + /* + * These property names will be used by Linux as the user-visible file + * name, so make them meaningful if possible. We use _ as the separator + * here to remain consistent with existing file names in /sys/opal. + */ + dt_add_property_u64s(exports, "symbol_map", sym_start, sym_size); + dt_add_property_u64s(exports, "hdat_map", SPIRA_HEAP_BASE, + SPIRA_HEAP_SIZE); +#ifdef SKIBOOT_GCOV + dt_add_property_u64s(exports, "gcov", SKIBOOT_BASE, + HEAP_BASE - SKIBOOT_BASE); +#endif +} + +static void add_opal_firmware_node(void) +{ + struct dt_node *firmware = dt_new(opal_node, "firmware"); + uint64_t sym_start = (uint64_t)__sym_map_start; + uint64_t sym_size = (uint64_t)__sym_map_end - sym_start; + + dt_add_property_string(firmware, "compatible", "ibm,opal-firmware"); + dt_add_property_string(firmware, "name", "firmware"); + dt_add_property_string(firmware, "version", version); + /* + * As previous OS versions use symbol-map located at + * /ibm,opal/firmware we will keep a copy of symbol-map here + * for backwards compatibility + */ + dt_add_property_u64s(firmware, "symbol-map", sym_start, sym_size); + + add_opal_firmware_exports_node(firmware); +} + +void add_opal_node(void) +{ + uint64_t base, entry, size; + extern uint32_t opal_entry; + extern uint32_t boot_entry; + struct dt_node *opal_event; + + /* XXX TODO: Reorg this. We should create the base OPAL + * node early on, and have the various sub modules populate + * their own entries (console etc...) 
+ * + * The logic of which console backend to use should be + * extracted + */ + + entry = (uint64_t)&opal_entry; + base = SKIBOOT_BASE; + size = (CPU_STACKS_BASE + + (uint64_t)(cpu_max_pir + 1) * STACK_SIZE) - SKIBOOT_BASE; + + opal_node = dt_new_check(dt_root, "ibm,opal"); + dt_add_property_cells(opal_node, "#address-cells", 0); + dt_add_property_cells(opal_node, "#size-cells", 0); + + if (proc_gen < proc_gen_p9) + dt_add_property_strings(opal_node, "compatible", "ibm,opal-v2", + "ibm,opal-v3"); + else + dt_add_property_strings(opal_node, "compatible", "ibm,opal-v3"); + + dt_add_property_cells(opal_node, "opal-msg-async-num", OPAL_MAX_ASYNC_COMP); + dt_add_property_cells(opal_node, "opal-msg-size", OPAL_MSG_SIZE); + dt_add_property_u64(opal_node, "opal-base-address", base); + dt_add_property_u64(opal_node, "opal-entry-address", entry); + dt_add_property_u64(opal_node, "opal-boot-address", (uint64_t)&boot_entry); + dt_add_property_u64(opal_node, "opal-runtime-size", size); + + /* Add irqchip interrupt controller */ + opal_event = dt_new(opal_node, "event"); + dt_add_property_strings(opal_event, "compatible", "ibm,opal-event"); + dt_add_property_cells(opal_event, "#interrupt-cells", 0x1); + dt_add_property(opal_event, "interrupt-controller", NULL, 0); + + add_opal_firmware_node(); + add_associativity_ref_point(); + memcons_add_properties(); +} + +static struct lock evt_lock = LOCK_UNLOCKED; + +void opal_update_pending_evt(uint64_t evt_mask, uint64_t evt_values) +{ + uint64_t new_evts; + + lock(&evt_lock); + new_evts = (opal_pending_events & ~evt_mask) | evt_values; + if (opal_pending_events != new_evts) { + uint64_t tok; + +#ifdef OPAL_TRACE_EVT_CHG + printf("OPAL: Evt change: 0x%016llx -> 0x%016llx\n", + opal_pending_events, new_evts); +#endif + /* + * If an event gets *set* while we are in a different call chain + * than opal_handle_interrupt() or opal_handle_hmi(), then we + * artificially generate an interrupt (OCC interrupt specifically) + * to ensure that Linux properly broadcast the event change internally + */ + if ((new_evts & ~opal_pending_events) != 0) { + tok = this_cpu()->current_token; + if (tok != OPAL_HANDLE_INTERRUPT && tok != OPAL_HANDLE_HMI) + occ_send_dummy_interrupt(); + } + opal_pending_events = new_evts; + } + unlock(&evt_lock); +} + +uint64_t opal_dynamic_event_alloc(void) +{ + uint64_t new_event; + int n; + + lock(&evt_lock); + + /* Create the event mask. This set-bit will be within the event mask + * iff there are free events, or out of the mask if there are no free + * events. If opal_dynamic_events is all ones (ie, all events are + * dynamic, and allocated), then ilog2 will return -1, and we'll have a + * zero mask. 
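+ *
+ * Worked example (illustrative): with no dynamic events allocated,
+ * ~opal_dynamic_events is all ones, ilog2() returns 63 and we hand out
+ * bit 63; the next caller gets bit 62, and so on. Once bits 63..32 are
+ * all taken, ilog2() returns 31, the candidate bit falls outside
+ * opal_dynamic_events_mask and we return 0 to signal exhaustion.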
+ */ + n = ilog2(~opal_dynamic_events); + new_event = 1ull << n; + + /* Ensure we're still within the allocatable dynamic events range */ + if (new_event & opal_dynamic_events_mask) + opal_dynamic_events |= new_event; + else + new_event = 0; + + unlock(&evt_lock); + return new_event; +} + +void opal_dynamic_event_free(uint64_t event) +{ + lock(&evt_lock); + opal_dynamic_events &= ~event; + unlock(&evt_lock); +} + +static uint64_t opal_test_func(uint64_t arg) +{ + printf("OPAL: Test function called with arg 0x%llx\n", arg); + + return 0xfeedf00d; +} +opal_call(OPAL_TEST, opal_test_func, 1); + +struct opal_poll_entry { + struct list_node link; + void (*poller)(void *data); + void *data; +}; + +static struct list_head opal_pollers = LIST_HEAD_INIT(opal_pollers); +static struct lock opal_poll_lock = LOCK_UNLOCKED; + +void opal_add_poller(void (*poller)(void *data), void *data) +{ + struct opal_poll_entry *ent; + + ent = zalloc(sizeof(struct opal_poll_entry)); + assert(ent); + ent->poller = poller; + ent->data = data; + lock(&opal_poll_lock); + list_add_tail(&opal_pollers, &ent->link); + unlock(&opal_poll_lock); +} + +void opal_del_poller(void (*poller)(void *data)) +{ + struct opal_poll_entry *ent; + + /* XXX This is currently unused. To solve various "interesting" + * locking issues, the pollers are run locklessly, so if we were + * to free them, we would have to be careful, using something + * akin to RCU to synchronize with other OPAL entries. For now + * if anybody uses it, print a warning and leak the entry, don't + * free it. + */ + /** + * @fwts-label UnsupportedOPALdelpoller + * @fwts-advice Currently removing a poller is DANGEROUS and + * MUST NOT be done in production firmware. + */ + prlog(PR_ALERT, "WARNING: Unsupported opal_del_poller." + " Interesting locking issues, don't call this.\n"); + + lock(&opal_poll_lock); + list_for_each(&opal_pollers, ent, link) { + if (ent->poller == poller) { + list_del(&ent->link); + /* free(ent); */ + break; + } + } + unlock(&opal_poll_lock); +} + +void opal_run_pollers(void) +{ + static int pollers_with_lock_warnings = 0; + static int poller_recursion = 0; + struct opal_poll_entry *poll_ent; + bool was_in_poller; + + /* Don't re-enter on this CPU, unless it was an OPAL re-entry */ + if (this_cpu()->in_opal_call == 1 && this_cpu()->in_poller) { + + /** + * @fwts-label OPALPollerRecursion + * @fwts-advice Recursion detected in opal_run_pollers(). This + * indicates a bug in OPAL where a poller ended up running + * pollers, which doesn't lead anywhere good. + */ + poller_recursion++; + if (poller_recursion <= 16) { + disable_fast_reboot("Poller recursion detected."); + prlog(PR_ERR, "OPAL: Poller recursion detected.\n"); + backtrace(); + + } + + if (poller_recursion == 16) + prlog(PR_ERR, "OPAL: Squashing future poller recursion warnings (>16).\n"); + + return; + } + was_in_poller = this_cpu()->in_poller; + this_cpu()->in_poller = true; + + if (!list_empty(&this_cpu()->locks_held) && pollers_with_lock_warnings < 64) { + /** + * @fwts-label OPALPollerWithLock + * @fwts-advice opal_run_pollers() was called with a lock + * held, which could lead to deadlock if not excessively + * lucky/careful. + */ + prlog(PR_ERR, "Running pollers with lock held !\n"); + dump_locks_list(); + backtrace(); + pollers_with_lock_warnings++; + if (pollers_with_lock_warnings == 64) { + /** + * @fwts-label OPALPollerWithLock64 + * @fwts-advice Your firmware is buggy, see the 64 + * messages complaining about opal_run_pollers with + * lock held. 
+ */ + prlog(PR_ERR, "opal_run_pollers with lock run 64 " + "times, disabling warning.\n"); + } + } + + /* We run the timers first */ + check_timers(false); + + /* The pollers are run lokelessly, see comment in opal_del_poller */ + list_for_each(&opal_pollers, poll_ent, link) + poll_ent->poller(poll_ent->data); + + /* Disable poller flag */ + this_cpu()->in_poller = was_in_poller; + + /* On debug builds, print max stack usage */ + check_stacks(); +} + +static int64_t opal_poll_events(__be64 *outstanding_event_mask) +{ + + if (!opal_addr_valid(outstanding_event_mask)) + return OPAL_PARAMETER; + + /* Check if we need to trigger an attn for test use */ + if (attn_trigger == 0xdeadbeef) { + prlog(PR_EMERG, "Triggering attn\n"); + assert(false); + } + + opal_run_pollers(); + + if (outstanding_event_mask) + *outstanding_event_mask = cpu_to_be64(opal_pending_events); + + return OPAL_SUCCESS; +} +opal_call(OPAL_POLL_EVENTS, opal_poll_events, 1); + +static int64_t opal_check_token(uint64_t token) +{ + if (token > OPAL_LAST) + return OPAL_TOKEN_ABSENT; + + if (opal_branch_table[token]) + return OPAL_TOKEN_PRESENT; + + return OPAL_TOKEN_ABSENT; +} +opal_call(OPAL_CHECK_TOKEN, opal_check_token, 1); + +struct opal_sync_entry { + struct list_node link; + bool (*notify)(void *data); + void *data; +}; + +static struct list_head opal_syncers = LIST_HEAD_INIT(opal_syncers); + +void opal_add_host_sync_notifier(bool (*notify)(void *data), void *data) +{ + struct opal_sync_entry *ent; + + ent = zalloc(sizeof(struct opal_sync_entry)); + assert(ent); + ent->notify = notify; + ent->data = data; + list_add_tail(&opal_syncers, &ent->link); +} + +/* + * Remove a host sync notifier for given callback and data + */ +void opal_del_host_sync_notifier(bool (*notify)(void *data), void *data) +{ + struct opal_sync_entry *ent; + + list_for_each(&opal_syncers, ent, link) { + if (ent->notify == notify && ent->data == data) { + list_del(&ent->link); + free(ent); + return; + } + } +} + +/* + * OPAL call to handle host kexec'ing scenario + */ +static int64_t opal_sync_host_reboot(void) +{ + struct opal_sync_entry *ent, *nxt; + int ret = OPAL_SUCCESS; + + list_for_each_safe(&opal_syncers, ent, nxt, link) + if (! ent->notify(ent->data)) + ret = OPAL_BUSY_EVENT; + + return ret; +} +opal_call(OPAL_SYNC_HOST_REBOOT, opal_sync_host_reboot, 0); diff --git a/roms/skiboot/core/pci-dt-slot.c b/roms/skiboot/core/pci-dt-slot.c new file mode 100644 index 000000000..2441bf940 --- /dev/null +++ b/roms/skiboot/core/pci-dt-slot.c @@ -0,0 +1,212 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* + * PCI slots in the device tree. + * + * Copyright 2017-2018 IBM Corp. 
+ */ + +#include +#include +#include + +#include +#include + +#include +#include +#include +#include + +#undef pr_fmt +#define pr_fmt(fmt) "DT-SLOT: " fmt + +struct dt_node *dt_slots; + +static struct dt_node *map_phb_to_slot(struct phb *phb) +{ + uint32_t chip_id = dt_get_chip_id(phb->dt_node); + uint32_t phb_idx = dt_prop_get_u32_def(phb->dt_node, + "ibm,phb-index", 0); + struct dt_node *slot_node; + + if (!dt_slots) + dt_slots = dt_find_by_path(dt_root, "/ibm,pcie-slots"); + + if (!dt_slots) + return NULL; + + dt_for_each_child(dt_slots, slot_node) { + u32 reg[2]; + + if (!dt_node_is_compatible(slot_node, "ibm,pcie-root-port")) + continue; + + reg[0] = dt_prop_get_cell(slot_node, "reg", 0); + reg[1] = dt_prop_get_cell(slot_node, "reg", 1); + + if (reg[0] == chip_id && reg[1] == phb_idx) + return slot_node; + } + + return NULL; +} + +static struct dt_node *find_devfn(struct dt_node *bus, uint32_t bdfn) +{ + uint32_t port_dev_id = PCI_DEV(bdfn); + struct dt_node *child; + + dt_for_each_child(bus, child) + if (dt_prop_get_u32_def(child, "reg", ~0u) == port_dev_id) + return child; + + return NULL; +} + +/* Looks for a device device under this slot. */ +static struct dt_node *find_dev_under_slot(struct dt_node *slot, + struct pci_device *pd) +{ + struct dt_node *child, *wildcard = NULL; + + /* find the device in the parent bus node */ + dt_for_each_child(slot, child) { + u32 vdid; + + /* "pluggable" and "builtin" without unit addrs are wildcards */ + if (!dt_has_node_property(child, "reg", NULL)) { + if (wildcard) + prerror("Duplicate wildcard entry! Already have %s, found %s", + wildcard->name, child->name); + + wildcard = child; + continue; + } + + /* NB: the pci_device vdid is did,vid rather than vid,did */ + vdid = dt_prop_get_cell(child, "reg", 1) << 16 | + dt_prop_get_cell(child, "reg", 0); + + if (vdid == pd->vdid) + return child; + } + + if (!wildcard) + PCIDBG(pd->phb, pd->bdfn, + "Unable to find a slot for device %.4x:%.4x\n", + (pd->vdid & 0xffff0000) >> 16, pd->vdid & 0xffff); + + return wildcard; +} + +/* + * If the `pd` is a bridge this returns a node with a compatible of + * ibm,pcie-port to indicate it's a "slot node". + */ +static struct dt_node *find_node_for_dev(struct phb *phb, + struct pci_device *pd) +{ + struct dt_node *sw_slot, *sw_up; + + assert(pd); + + if (pd->slot && pd->slot->data) + return pd->slot->data; + + /* + * Example DT: + * /root-complex@8,5/switch-up@10b5,8725/down-port@4 + */ + switch (pd->dev_type) { + case PCIE_TYPE_ROOT_PORT: // find the root-complex@, node + return map_phb_to_slot(phb); + + case PCIE_TYPE_SWITCH_DNPORT: // grab the down-port@ + /* + * Walk up the topology to find the slot that contains + * the switch upstream port is connected to. In the example + * this would be the root-complex@8,5 node. + */ + sw_slot = find_node_for_dev(phb, pd->parent->parent); + if (!sw_slot) + return NULL; + + /* find the per-device node for this switch */ + sw_up = find_dev_under_slot(sw_slot, pd->parent); + if (!sw_up) + return NULL; + + /* find this down port */ + return find_devfn(sw_up, pd->bdfn); + + default: + PCIDBG(phb, pd->bdfn, + "Trying to find a slot for non-pcie bridge type %d\n", + pd->dev_type); + assert(0); + } + + return NULL; +} + +struct dt_node *map_pci_dev_to_slot(struct phb *phb, struct pci_device *pd) +{ + struct dt_node *n; + char *path; + + assert(pd); + + /* + * Having a slot only makes sense for root and switch downstream ports. + * We don't care about PCI-X. 
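+ *
+ * Illustrative outcome: a root port resolves via map_phb_to_slot() to
+ * its root-complex@... node under /ibm,pcie-slots, while a switch
+ * downstream port walks up through its parent switch, e.g. something
+ * like /root-complex@8,5/switch-up@10b5,8725/down-port@4 as in the
+ * example above.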
+ */ + if (pd->dev_type != PCIE_TYPE_SWITCH_DNPORT && + pd->dev_type != PCIE_TYPE_ROOT_PORT) + return NULL; + + PCIDBG(phb, pd->bdfn, "Finding slot\n"); + + n = find_node_for_dev(phb, pd); + if (!n) { + PCIDBG(phb, pd->bdfn, "No slot found!\n"); + } else { + path = dt_get_path(n); + PCIDBG(phb, pd->bdfn, "Slot found %s\n", path); + free(path); + } + + return n; +} + +int __print_slot(struct phb *phb, struct pci_device *pd, void *userdata); +int __print_slot(struct phb *phb, struct pci_device *pd, + void __unused *userdata) +{ + struct dt_node *node; + struct dt_node *pnode; + char *c = NULL; + u32 phandle = 0; + + if (!pd) + return 0; + + node = map_pci_dev_to_slot(phb, pd); + + /* at this point all node associations should be done */ + if (pd->dn && dt_has_node_property(pd->dn, "ibm,pcie-slot", NULL)) { + phandle = dt_prop_get_u32(pd->dn, "ibm,pcie-slot"); + pnode = dt_find_by_phandle(dt_root, phandle); + + assert(node == pnode); + } + + if (node) + c = dt_get_path(node); + + PCIDBG(phb, pd->bdfn, "Mapped to slot %s (%x)\n", + c ? c : "", phandle); + + free(c); + + return 0; +} diff --git a/roms/skiboot/core/pci-opal.c b/roms/skiboot/core/pci-opal.c new file mode 100644 index 000000000..aa375c6aa --- /dev/null +++ b/roms/skiboot/core/pci-opal.c @@ -0,0 +1,1135 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* + * PCIe OPAL Calls + * + * Copyright 2013-2019 IBM Corp. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#define OPAL_PCICFG_ACCESS_READ(op, cb, type) \ +static int64_t opal_pci_config_##op(uint64_t phb_id, \ + uint64_t bus_dev_func, \ + uint64_t offset, type data) \ +{ \ + struct phb *phb = pci_get_phb(phb_id); \ + int64_t rc; \ + \ + if (!opal_addr_valid((void *)data)) \ + return OPAL_PARAMETER; \ + \ + if (!phb) \ + return OPAL_PARAMETER; \ + phb_lock(phb); \ + rc = phb->ops->cfg_##cb(phb, bus_dev_func, offset, data); \ + phb_unlock(phb); \ + \ + return rc; \ +} + +#define OPAL_PCICFG_ACCESS_WRITE(op, cb, type) \ +static int64_t opal_pci_config_##op(uint64_t phb_id, \ + uint64_t bus_dev_func, \ + uint64_t offset, type data) \ +{ \ + struct phb *phb = pci_get_phb(phb_id); \ + int64_t rc; \ + \ + if (!phb) \ + return OPAL_PARAMETER; \ + phb_lock(phb); \ + rc = phb->ops->cfg_##cb(phb, bus_dev_func, offset, data); \ + phb_unlock(phb); \ + \ + return rc; \ +} + +OPAL_PCICFG_ACCESS_READ(read_byte, read8, uint8_t *) +OPAL_PCICFG_ACCESS_READ(read_half_word, read16, uint16_t *) +OPAL_PCICFG_ACCESS_READ(read_word, read32, uint32_t *) +OPAL_PCICFG_ACCESS_WRITE(write_byte, write8, uint8_t) +OPAL_PCICFG_ACCESS_WRITE(write_half_word, write16, uint16_t) +OPAL_PCICFG_ACCESS_WRITE(write_word, write32, uint32_t) + +static int64_t opal_pci_config_read_half_word_be(uint64_t phb_id, + uint64_t bus_dev_func, + uint64_t offset, + __be16 *__data) +{ + uint16_t data; + int64_t rc; + + rc = opal_pci_config_read_half_word(phb_id, bus_dev_func, offset, &data); + *__data = cpu_to_be16(data); + + return rc; +} + +static int64_t opal_pci_config_read_word_be(uint64_t phb_id, + uint64_t bus_dev_func, + uint64_t offset, + __be32 *__data) +{ + uint32_t data; + int64_t rc; + + rc = opal_pci_config_read_word(phb_id, bus_dev_func, offset, &data); + *__data = cpu_to_be32(data); + + return rc; +} + + +opal_call(OPAL_PCI_CONFIG_READ_BYTE, opal_pci_config_read_byte, 4); +opal_call(OPAL_PCI_CONFIG_READ_HALF_WORD, opal_pci_config_read_half_word_be, 4); +opal_call(OPAL_PCI_CONFIG_READ_WORD, opal_pci_config_read_word_be, 4); +opal_call(OPAL_PCI_CONFIG_WRITE_BYTE, 
opal_pci_config_write_byte, 4); +opal_call(OPAL_PCI_CONFIG_WRITE_HALF_WORD, opal_pci_config_write_half_word, 4); +opal_call(OPAL_PCI_CONFIG_WRITE_WORD, opal_pci_config_write_word, 4); + +static struct lock opal_eeh_evt_lock = LOCK_UNLOCKED; +static uint64_t opal_eeh_evt = 0; + +void opal_pci_eeh_set_evt(uint64_t phb_id) +{ + lock(&opal_eeh_evt_lock); + opal_eeh_evt |= 1ULL << phb_id; + opal_update_pending_evt(OPAL_EVENT_PCI_ERROR, OPAL_EVENT_PCI_ERROR); + unlock(&opal_eeh_evt_lock); +} + +void opal_pci_eeh_clear_evt(uint64_t phb_id) +{ + lock(&opal_eeh_evt_lock); + opal_eeh_evt &= ~(1ULL << phb_id); + if (!opal_eeh_evt) + opal_update_pending_evt(OPAL_EVENT_PCI_ERROR, 0); + unlock(&opal_eeh_evt_lock); +} + +static int64_t opal_pci_eeh_freeze_status(uint64_t phb_id, uint64_t pe_number, + uint8_t *freeze_state, + __be16 *__pci_error_type, + __be64 *__phb_status) +{ + struct phb *phb = pci_get_phb(phb_id); + uint16_t pci_error_type; + int64_t rc; + + if (!opal_addr_valid(freeze_state) || !opal_addr_valid(__pci_error_type) + || !opal_addr_valid(__phb_status)) + return OPAL_PARAMETER; + + if (!phb) + return OPAL_PARAMETER; + if (!phb->ops->eeh_freeze_status) + return OPAL_UNSUPPORTED; + phb_lock(phb); + + if (__phb_status) + prlog(PR_ERR, "PHB#%04llx: %s: deprecated PHB status\n", + phb_id, __func__); + + rc = phb->ops->eeh_freeze_status(phb, pe_number, freeze_state, + &pci_error_type, NULL); + *__pci_error_type = cpu_to_be16(pci_error_type); + phb_unlock(phb); + + return rc; +} +opal_call(OPAL_PCI_EEH_FREEZE_STATUS, opal_pci_eeh_freeze_status, 5); + +static int64_t opal_pci_eeh_freeze_clear(uint64_t phb_id, uint64_t pe_number, + uint64_t eeh_action_token) +{ + struct phb *phb = pci_get_phb(phb_id); + int64_t rc; + + if (!phb) + return OPAL_PARAMETER; + if (!phb->ops->eeh_freeze_clear) + return OPAL_UNSUPPORTED; + phb_lock(phb); + rc = phb->ops->eeh_freeze_clear(phb, pe_number, eeh_action_token); + phb_unlock(phb); + + return rc; +} +opal_call(OPAL_PCI_EEH_FREEZE_CLEAR, opal_pci_eeh_freeze_clear, 3); + +static int64_t opal_pci_eeh_freeze_set(uint64_t phb_id, uint64_t pe_number, + uint64_t eeh_action_token) +{ + struct phb *phb = pci_get_phb(phb_id); + int64_t rc; + + if (!phb) + return OPAL_PARAMETER; + if (!phb->ops->eeh_freeze_set) + return OPAL_UNSUPPORTED; + phb_lock(phb); + rc = phb->ops->eeh_freeze_set(phb, pe_number, eeh_action_token); + phb_unlock(phb); + + return rc; +} +opal_call(OPAL_PCI_EEH_FREEZE_SET, opal_pci_eeh_freeze_set, 3); + +static int64_t opal_pci_err_inject(uint64_t phb_id, uint64_t pe_number, + uint32_t type, uint32_t func, + uint64_t addr, uint64_t mask) +{ + struct phb *phb = pci_get_phb(phb_id); + int64_t rc; + + if (!phb) + return OPAL_PARAMETER; + if (!phb->ops || !phb->ops->err_inject) + return OPAL_UNSUPPORTED; + + if (type != OPAL_ERR_INJECT_TYPE_IOA_BUS_ERR && + type != OPAL_ERR_INJECT_TYPE_IOA_BUS_ERR64) + return OPAL_PARAMETER; + + phb_lock(phb); + rc = phb->ops->err_inject(phb, pe_number, type, func, addr, mask); + phb_unlock(phb); + + return rc; +} +opal_call(OPAL_PCI_ERR_INJECT, opal_pci_err_inject, 6); + +static int64_t opal_pci_phb_mmio_enable(uint64_t phb_id, uint16_t window_type, + uint16_t window_num, uint16_t enable) +{ + struct phb *phb = pci_get_phb(phb_id); + int64_t rc; + + if (!phb) + return OPAL_PARAMETER; + if (!phb->ops->phb_mmio_enable) + return OPAL_UNSUPPORTED; + phb_lock(phb); + rc = phb->ops->phb_mmio_enable(phb, window_type, window_num, enable); + phb_unlock(phb); + + return rc; +} +opal_call(OPAL_PCI_PHB_MMIO_ENABLE, 
opal_pci_phb_mmio_enable, 4); + +static int64_t opal_pci_set_phb_mem_window(uint64_t phb_id, + uint16_t window_type, + uint16_t window_num, + uint64_t addr, + uint64_t pci_addr, + uint64_t size) +{ + struct phb *phb = pci_get_phb(phb_id); + int64_t rc; + + if (!phb) + return OPAL_PARAMETER; + if (!phb->ops->set_phb_mem_window) + return OPAL_UNSUPPORTED; + phb_lock(phb); + rc = phb->ops->set_phb_mem_window(phb, window_type, window_num, + addr, pci_addr, size); + phb_unlock(phb); + + return rc; +} +opal_call(OPAL_PCI_SET_PHB_MEM_WINDOW, opal_pci_set_phb_mem_window, 6); + +static int64_t opal_pci_map_pe_mmio_window(uint64_t phb_id, uint64_t pe_number, + uint16_t window_type, + uint16_t window_num, + uint16_t segment_num) +{ + struct phb *phb = pci_get_phb(phb_id); + int64_t rc; + + if (!phb) + return OPAL_PARAMETER; + if (!phb->ops->map_pe_mmio_window) + return OPAL_UNSUPPORTED; + phb_lock(phb); + rc = phb->ops->map_pe_mmio_window(phb, pe_number, window_type, + window_num, segment_num); + phb_unlock(phb); + + return rc; +} +opal_call(OPAL_PCI_MAP_PE_MMIO_WINDOW, opal_pci_map_pe_mmio_window, 5); + +static int64_t opal_pci_set_pe(uint64_t phb_id, uint64_t pe_number, + uint64_t bus_dev_func, uint8_t bus_compare, + uint8_t dev_compare, uint8_t func_compare, + uint8_t pe_action) +{ + struct phb *phb = pci_get_phb(phb_id); + int64_t rc; + + if (!phb) + return OPAL_PARAMETER; + if (!phb->ops->set_pe) + return OPAL_UNSUPPORTED; + phb_lock(phb); + rc = phb->ops->set_pe(phb, pe_number, bus_dev_func, bus_compare, + dev_compare, func_compare, pe_action); + phb_unlock(phb); + + return rc; +} +opal_call(OPAL_PCI_SET_PE, opal_pci_set_pe, 7); + +static int64_t opal_pci_set_peltv(uint64_t phb_id, uint32_t parent_pe, + uint32_t child_pe, uint8_t state) +{ + struct phb *phb = pci_get_phb(phb_id); + int64_t rc; + + if (!phb) + return OPAL_PARAMETER; + if (!phb->ops->set_peltv) + return OPAL_UNSUPPORTED; + phb_lock(phb); + rc = phb->ops->set_peltv(phb, parent_pe, child_pe, state); + phb_unlock(phb); + + return rc; +} +opal_call(OPAL_PCI_SET_PELTV, opal_pci_set_peltv, 4); + +static int64_t opal_pci_set_mve(uint64_t phb_id, uint32_t mve_number, + uint64_t pe_number) +{ + struct phb *phb = pci_get_phb(phb_id); + int64_t rc; + + if (!phb) + return OPAL_PARAMETER; + if (!phb->ops->set_mve) + return OPAL_UNSUPPORTED; + phb_lock(phb); + rc = phb->ops->set_mve(phb, mve_number, pe_number); + phb_unlock(phb); + + return rc; +} +opal_call(OPAL_PCI_SET_MVE, opal_pci_set_mve, 3); + +static int64_t opal_pci_set_mve_enable(uint64_t phb_id, uint32_t mve_number, + uint32_t state) +{ + struct phb *phb = pci_get_phb(phb_id); + int64_t rc; + + if (!phb) + return OPAL_PARAMETER; + if (!phb->ops->set_mve_enable) + return OPAL_UNSUPPORTED; + phb_lock(phb); + rc = phb->ops->set_mve_enable(phb, mve_number, state); + phb_unlock(phb); + + return rc; +} +opal_call(OPAL_PCI_SET_MVE_ENABLE, opal_pci_set_mve_enable, 3); + +static int64_t opal_pci_msi_eoi(uint64_t phb_id, + uint32_t hwirq) +{ + struct phb *phb = pci_get_phb(phb_id); + int64_t rc; + + if (!phb) + return OPAL_PARAMETER; + if (!phb->ops->pci_msi_eoi) + return OPAL_UNSUPPORTED; + phb_lock(phb); + rc = phb->ops->pci_msi_eoi(phb, hwirq); + phb_unlock(phb); + + return rc; +} +opal_call(OPAL_PCI_MSI_EOI, opal_pci_msi_eoi, 2); + +static int64_t opal_pci_tce_kill(uint64_t phb_id, + uint32_t kill_type, + uint64_t pe_number, uint32_t tce_size, + uint64_t dma_addr, uint32_t npages) +{ + struct phb *phb = pci_get_phb(phb_id); + int64_t rc; + + if (!phb) + return OPAL_PARAMETER; + if 
(!phb->ops->tce_kill) + return OPAL_UNSUPPORTED; + phb_lock(phb); + rc = phb->ops->tce_kill(phb, kill_type, pe_number, tce_size, + dma_addr, npages); + phb_unlock(phb); + + return rc; +} +opal_call(OPAL_PCI_TCE_KILL, opal_pci_tce_kill, 6); + +static int64_t opal_pci_set_xive_pe(uint64_t phb_id, uint64_t pe_number, + uint32_t xive_num) +{ + struct phb *phb = pci_get_phb(phb_id); + int64_t rc; + + if (!phb) + return OPAL_PARAMETER; + if (!phb->ops->set_xive_pe) + return OPAL_UNSUPPORTED; + phb_lock(phb); + rc = phb->ops->set_xive_pe(phb, pe_number, xive_num); + phb_unlock(phb); + + return rc; +} +opal_call(OPAL_PCI_SET_XIVE_PE, opal_pci_set_xive_pe, 3); + +static int64_t opal_get_msi_32(uint64_t phb_id, uint32_t mve_number, + uint32_t xive_num, uint8_t msi_range, + __be32 *__msi_address, __be32 *__message_data) +{ + struct phb *phb = pci_get_phb(phb_id); + uint32_t msi_address; + uint32_t message_data; + int64_t rc; + + if (!opal_addr_valid(__msi_address) || !opal_addr_valid(__message_data)) + return OPAL_PARAMETER; + + if (!phb) + return OPAL_PARAMETER; + if (!phb->ops->get_msi_32) + return OPAL_UNSUPPORTED; + phb_lock(phb); + rc = phb->ops->get_msi_32(phb, mve_number, xive_num, msi_range, + &msi_address, &message_data); + phb_unlock(phb); + + *__msi_address = cpu_to_be32(msi_address); + *__message_data = cpu_to_be32(message_data); + + return rc; +} +opal_call(OPAL_GET_MSI_32, opal_get_msi_32, 6); + +static int64_t opal_get_msi_64(uint64_t phb_id, uint32_t mve_number, + uint32_t xive_num, uint8_t msi_range, + __be64 *__msi_address, __be32 *__message_data) +{ + struct phb *phb = pci_get_phb(phb_id); + uint64_t msi_address; + uint32_t message_data; + int64_t rc; + + if (!opal_addr_valid(__msi_address) || !opal_addr_valid(__message_data)) + return OPAL_PARAMETER; + + if (!phb) + return OPAL_PARAMETER; + if (!phb->ops->get_msi_64) + return OPAL_UNSUPPORTED; + phb_lock(phb); + rc = phb->ops->get_msi_64(phb, mve_number, xive_num, msi_range, + &msi_address, &message_data); + phb_unlock(phb); + + *__msi_address = cpu_to_be64(msi_address); + *__message_data = cpu_to_be32(message_data); + + return rc; +} +opal_call(OPAL_GET_MSI_64, opal_get_msi_64, 6); + +static int64_t opal_pci_map_pe_dma_window(uint64_t phb_id, uint64_t pe_number, + uint16_t window_id, + uint16_t tce_levels, + uint64_t tce_table_addr, + uint64_t tce_table_size, + uint64_t tce_page_size) +{ + struct phb *phb = pci_get_phb(phb_id); + int64_t rc; + + if (!phb) + return OPAL_PARAMETER; + if (!phb->ops->map_pe_dma_window) + return OPAL_UNSUPPORTED; + phb_lock(phb); + rc = phb->ops->map_pe_dma_window(phb, pe_number, window_id, + tce_levels, tce_table_addr, + tce_table_size, tce_page_size); + phb_unlock(phb); + + return rc; +} +opal_call(OPAL_PCI_MAP_PE_DMA_WINDOW, opal_pci_map_pe_dma_window, 7); + +static int64_t opal_pci_map_pe_dma_window_real(uint64_t phb_id, + uint64_t pe_number, + uint16_t window_id, + uint64_t pci_start_addr, + uint64_t pci_mem_size) +{ + struct phb *phb = pci_get_phb(phb_id); + int64_t rc; + + if (!phb) + return OPAL_PARAMETER; + if (!phb->ops->map_pe_dma_window_real) + return OPAL_UNSUPPORTED; + phb_lock(phb); + rc = phb->ops->map_pe_dma_window_real(phb, pe_number, window_id, + pci_start_addr, pci_mem_size); + phb_unlock(phb); + + return rc; +} +opal_call(OPAL_PCI_MAP_PE_DMA_WINDOW_REAL, opal_pci_map_pe_dma_window_real, 5); + +static int64_t opal_phb_set_option(uint64_t phb_id, uint64_t opt, + uint64_t setting) +{ + struct phb *phb = pci_get_phb(phb_id); + int64_t rc; + + if (!phb) + return OPAL_PARAMETER; + + if 
(!phb->ops->set_option) + return OPAL_UNSUPPORTED; + + phb_lock(phb); + rc = phb->ops->set_option(phb, opt, setting); + phb_unlock(phb); + + return rc; +} +opal_call(OPAL_PHB_SET_OPTION, opal_phb_set_option, 3); + +static int64_t opal_phb_get_option(uint64_t phb_id, uint64_t opt, + __be64 *setting) +{ + struct phb *phb = pci_get_phb(phb_id); + int64_t rc; + + if (!phb || !setting) + return OPAL_PARAMETER; + + if (!phb->ops->get_option) + return OPAL_UNSUPPORTED; + + phb_lock(phb); + rc = phb->ops->get_option(phb, opt, setting); + phb_unlock(phb); + + return rc; +} +opal_call(OPAL_PHB_GET_OPTION, opal_phb_get_option, 3); + +static int64_t opal_pci_reset(uint64_t id, uint8_t reset_scope, + uint8_t assert_state) +{ + struct pci_slot *slot = pci_slot_find(id); + struct phb *phb = slot ? slot->phb : NULL; + int64_t rc = OPAL_SUCCESS; + + if (!slot || !phb) + return OPAL_PARAMETER; + if (assert_state != OPAL_ASSERT_RESET && + assert_state != OPAL_DEASSERT_RESET) + return OPAL_PARAMETER; + + phb_lock(phb); + + switch(reset_scope) { + case OPAL_RESET_PHB_COMPLETE: + /* Complete reset is applicable to PHB slot only */ + if (!slot->ops.creset || slot->pd) { + rc = OPAL_UNSUPPORTED; + break; + } + + if (assert_state != OPAL_ASSERT_RESET) + break; + + rc = slot->ops.creset(slot); + if (rc < 0) + prlog(PR_ERR, "SLOT-%016llx: Error %lld on complete reset\n", + slot->id, rc); + break; + case OPAL_RESET_PCI_FUNDAMENTAL: + if (!slot->ops.freset) { + rc = OPAL_UNSUPPORTED; + break; + } + + /* We need do nothing on deassert time */ + if (assert_state != OPAL_ASSERT_RESET) + break; + + rc = slot->ops.freset(slot); + if (rc < 0) + prlog(PR_ERR, "SLOT-%016llx: Error %lld on fundamental reset\n", + slot->id, rc); + break; + case OPAL_RESET_PCI_HOT: + if (!slot->ops.hreset) { + rc = OPAL_UNSUPPORTED; + break; + } + + /* We need do nothing on deassert time */ + if (assert_state != OPAL_ASSERT_RESET) + break; + + rc = slot->ops.hreset(slot); + if (rc < 0) + prlog(PR_ERR, "SLOT-%016llx: Error %lld on hot reset\n", + slot->id, rc); + break; + case OPAL_RESET_PCI_IODA_TABLE: + /* It's allowed on PHB slot only */ + if (slot->pd || !phb->ops || !phb->ops->ioda_reset) { + rc = OPAL_UNSUPPORTED; + break; + } + + if (assert_state != OPAL_ASSERT_RESET) + break; + + rc = phb->ops->ioda_reset(phb, true); + break; + case OPAL_RESET_PHB_ERROR: + /* It's allowed on PHB slot only */ + if (slot->pd || !phb->ops || !phb->ops->papr_errinjct_reset) { + rc = OPAL_UNSUPPORTED; + break; + } + + if (assert_state != OPAL_ASSERT_RESET) + break; + + rc = phb->ops->papr_errinjct_reset(phb); + break; + default: + rc = OPAL_UNSUPPORTED; + } + phb_unlock(phb); + + return (rc > 0) ? tb_to_msecs(rc) : rc; +} +opal_call(OPAL_PCI_RESET, opal_pci_reset, 3); + +static int64_t opal_pci_reinit(uint64_t phb_id, + uint64_t reinit_scope, + uint64_t data) +{ + struct phb *phb = pci_get_phb(phb_id); + int64_t rc; + + if (!phb) + return OPAL_PARAMETER; + if (!phb->ops || !phb->ops->pci_reinit) + return OPAL_UNSUPPORTED; + + phb_lock(phb); + rc = phb->ops->pci_reinit(phb, reinit_scope, data); + phb_unlock(phb); + + return rc; +} +opal_call(OPAL_PCI_REINIT, opal_pci_reinit, 3); + +static int64_t opal_pci_poll(uint64_t id) +{ + struct pci_slot *slot = pci_slot_find(id); + struct phb *phb = slot ? 
slot->phb : NULL; + int64_t rc; + + if (!slot || !phb) + return OPAL_PARAMETER; + if (!slot->ops.run_sm) + return OPAL_UNSUPPORTED; + + phb_lock(phb); + rc = slot->ops.run_sm(slot); + phb_unlock(phb); + + /* Return milliseconds for caller to sleep: round up */ + if (rc > 0) { + rc = tb_to_msecs(rc); + if (rc == 0) + rc = 1; + } + + return rc; +} +opal_call(OPAL_PCI_POLL, opal_pci_poll, 1); + +static int64_t opal_pci_get_presence_state(uint64_t id, uint64_t data) +{ + struct pci_slot *slot = pci_slot_find(id); + struct phb *phb = slot ? slot->phb : NULL; + uint8_t *presence = (uint8_t *)data; + int64_t rc; + + if (!opal_addr_valid(presence)) + return OPAL_PARAMETER; + + if (!slot || !phb) + return OPAL_PARAMETER; + if (!slot->ops.get_presence_state) + return OPAL_UNSUPPORTED; + + phb_lock(phb); + rc = slot->ops.get_presence_state(slot, presence); + phb_unlock(phb); + + return rc; +} +opal_call(OPAL_PCI_GET_PRESENCE_STATE, opal_pci_get_presence_state, 2); + +static int64_t opal_pci_get_power_state(uint64_t id, uint64_t data) +{ + struct pci_slot *slot = pci_slot_find(id); + struct phb *phb = slot ? slot->phb : NULL; + uint8_t *power_state = (uint8_t *)data; + int64_t rc; + + if (!opal_addr_valid(power_state)) + return OPAL_PARAMETER; + + if (!slot || !phb) + return OPAL_PARAMETER; + if (!slot->ops.get_power_state) + return OPAL_UNSUPPORTED; + + phb_lock(phb); + rc = slot->ops.get_power_state(slot, power_state); + phb_unlock(phb); + + return rc; +} +opal_call(OPAL_PCI_GET_POWER_STATE, opal_pci_get_power_state, 2); + +static u32 get_slot_phandle(struct pci_slot *slot) +{ + struct phb *phb = slot->phb; + struct pci_device *pd = slot->pd; + + if (pd) + return pd->dn->phandle; + else + return phb->dt_node->phandle; +} + +static void rescan_slot_devices(struct pci_slot *slot) +{ + struct phb *phb = slot->phb; + struct pci_device *pd = slot->pd; + + /* + * prepare_link_change() is called (if needed) by the state + * machine during the slot reset or link polling + */ + if (phb->phb_type != phb_type_npu_v2_opencapi) { + pci_scan_bus(phb, pd->secondary_bus, + pd->subordinate_bus, &pd->children, pd, true); + pci_add_device_nodes(phb, &pd->children, pd->dn, + &phb->lstate, 0); + } else { + pci_scan_bus(phb, 0, 0xff, &phb->devices, NULL, true); + pci_add_device_nodes(phb, &phb->devices, + phb->dt_node, &phb->lstate, 0); + phb->ops->phb_final_fixup(phb); + } +} + +static void remove_slot_devices(struct pci_slot *slot) +{ + struct phb *phb = slot->phb; + struct pci_device *pd = slot->pd; + + if (phb->phb_type != phb_type_npu_v2_opencapi) + pci_remove_bus(phb, &pd->children); + else + pci_remove_bus(phb, &phb->devices); +} + +static void link_up_timer(struct timer *t, void *data, + uint64_t now __unused) +{ + struct pci_slot *slot = data; + struct phb *phb = slot->phb; + uint8_t link; + int64_t rc = 0; + + if (!phb_try_lock(phb)) { + schedule_timer(&slot->timer, msecs_to_tb(10)); + return; + } + + rc = slot->ops.run_sm(slot); + if (rc < 0) + goto out; + if (rc > 0) { + schedule_timer(t, rc); + phb_unlock(phb); + return; + } + + if (slot->ops.get_link_state(slot, &link) != OPAL_SUCCESS) + link = 0; + if (!link) { + rc = OPAL_HARDWARE; + goto out; + } + + rescan_slot_devices(slot); +out: + opal_queue_msg(OPAL_MSG_ASYNC_COMP, NULL, NULL, + cpu_to_be64(slot->async_token), + cpu_to_be64(get_slot_phandle(slot)), + cpu_to_be64(slot->power_state), + rc <= 0 ? 
cpu_to_be64(rc) : cpu_to_be64(OPAL_BUSY)); + phb_unlock(phb); +} + +static bool training_needed(struct pci_slot *slot) +{ + struct phb *phb = slot->phb; + struct pci_device *pd = slot->pd; + + /* only for opencapi slots for now */ + if (!pd && phb->phb_type == phb_type_npu_v2_opencapi) + return true; + return false; +} + +static void wait_for_link_up_and_rescan(struct pci_slot *slot) +{ + int64_t rc = 1; + + /* + * Links for PHB slots need to be retrained by triggering a + * fundamental reset. Other slots also need to be tested for + * readiness + */ + if (training_needed(slot)) { + pci_slot_set_state(slot, PCI_SLOT_STATE_NORMAL); + rc = slot->ops.freset(slot); + if (rc < 0) { + opal_queue_msg(OPAL_MSG_ASYNC_COMP, NULL, NULL, + cpu_to_be64(slot->async_token), + cpu_to_be64(get_slot_phandle(slot)), + cpu_to_be64(slot->power_state), + cpu_to_be64(rc)); + return; + } + } else { + pci_slot_set_state(slot, PCI_SLOT_STATE_LINK_START_POLL); + rc = msecs_to_tb(20); + } + init_timer(&slot->timer, link_up_timer, slot); + schedule_timer(&slot->timer, rc); +} + +static void set_power_timer(struct timer *t __unused, void *data, + uint64_t now __unused) +{ + struct pci_slot *slot = data; + struct phb *phb = slot->phb; + + if (!phb_try_lock(phb)) { + schedule_timer(&slot->timer, msecs_to_tb(10)); + return; + } + + switch (slot->state) { + case PCI_SLOT_STATE_SPOWER_START: + if (slot->retries-- == 0) { + pci_slot_set_state(slot, PCI_SLOT_STATE_NORMAL); + opal_queue_msg(OPAL_MSG_ASYNC_COMP, NULL, NULL, + cpu_to_be64(slot->async_token), + cpu_to_be64(get_slot_phandle(slot)), + cpu_to_be64(slot->power_state), + cpu_to_be64(OPAL_BUSY)); + } else { + schedule_timer(&slot->timer, msecs_to_tb(10)); + } + + break; + case PCI_SLOT_STATE_SPOWER_DONE: + if (slot->power_state == OPAL_PCI_SLOT_POWER_OFF) { + remove_slot_devices(slot); + pci_slot_set_state(slot, PCI_SLOT_STATE_NORMAL); + opal_queue_msg(OPAL_MSG_ASYNC_COMP, NULL, NULL, + cpu_to_be64(slot->async_token), + cpu_to_be64(get_slot_phandle(slot)), + cpu_to_be64(OPAL_PCI_SLOT_POWER_OFF), + cpu_to_be64(OPAL_SUCCESS)); + break; + } + + /* Power on */ + wait_for_link_up_and_rescan(slot); + break; + default: + prlog(PR_ERR, "PCI SLOT %016llx: Unexpected state 0x%08x\n", + slot->id, slot->state); + } + phb_unlock(phb); +} + +static int64_t opal_pci_set_power_state(uint64_t async_token, + uint64_t id, + uint64_t data) +{ + struct pci_slot *slot = pci_slot_find(id); + struct phb *phb = slot ? slot->phb : NULL; + struct pci_device *pd = slot ? 
slot->pd : NULL; + uint8_t *state = (uint8_t *)data; + int64_t rc; + + if (!slot || !phb) + return OPAL_PARAMETER; + + if (!opal_addr_valid(state)) + return OPAL_PARAMETER; + + phb_lock(phb); + switch (*state) { + case OPAL_PCI_SLOT_POWER_OFF: + if (!slot->ops.prepare_link_change || + !slot->ops.set_power_state) { + phb_unlock(phb); + return OPAL_UNSUPPORTED; + } + + slot->async_token = async_token; + slot->ops.prepare_link_change(slot, false); + rc = slot->ops.set_power_state(slot, PCI_SLOT_POWER_OFF); + break; + case OPAL_PCI_SLOT_POWER_ON: + if (!slot->ops.set_power_state || + !slot->ops.get_link_state) { + phb_unlock(phb); + return OPAL_UNSUPPORTED; + } + + slot->async_token = async_token; + rc = slot->ops.set_power_state(slot, PCI_SLOT_POWER_ON); + break; + case OPAL_PCI_SLOT_OFFLINE: + if (!pd) { + phb_unlock(phb); + return OPAL_PARAMETER; + } + + pci_remove_bus(phb, &pd->children); + phb_unlock(phb); + return OPAL_SUCCESS; + case OPAL_PCI_SLOT_ONLINE: + if (!pd) { + phb_unlock(phb); + return OPAL_PARAMETER; + } + pci_scan_bus(phb, pd->secondary_bus, pd->subordinate_bus, + &pd->children, pd, true); + pci_add_device_nodes(phb, &pd->children, pd->dn, + &phb->lstate, 0); + phb_unlock(phb); + return OPAL_SUCCESS; + default: + rc = OPAL_PARAMETER; + } + + /* + * OPAL_ASYNC_COMPLETION is returned when delay is needed to change + * the power state in the backend. When it can be finished without + * delay, OPAL_SUCCESS is returned. The PCI topology needs to be + * updated in both cases. + */ + if (rc == OPAL_ASYNC_COMPLETION) { + slot->retries = 500; + init_timer(&slot->timer, set_power_timer, slot); + schedule_timer(&slot->timer, msecs_to_tb(10)); + } else if (rc == OPAL_SUCCESS) { + if (*state == OPAL_PCI_SLOT_POWER_OFF) { + remove_slot_devices(slot); + } else { + wait_for_link_up_and_rescan(slot); + rc = OPAL_ASYNC_COMPLETION; + } + } + + phb_unlock(phb); + return rc; +} +opal_call(OPAL_PCI_SET_POWER_STATE, opal_pci_set_power_state, 3); + +static int64_t opal_pci_get_phb_diag_data2(uint64_t phb_id, + void *diag_buffer, + uint64_t diag_buffer_len) +{ + struct phb *phb = pci_get_phb(phb_id); + int64_t rc; + + if (!opal_addr_valid(diag_buffer)) + return OPAL_PARAMETER; + + if (!phb) + return OPAL_PARAMETER; + if (!phb->ops->get_diag_data2) + return OPAL_UNSUPPORTED; + phb_lock(phb); + rc = phb->ops->get_diag_data2(phb, diag_buffer, diag_buffer_len); + phb_unlock(phb); + + return rc; +} +opal_call(OPAL_PCI_GET_PHB_DIAG_DATA2, opal_pci_get_phb_diag_data2, 3); + +static int64_t opal_pci_next_error(uint64_t phb_id, __be64 *__first_frozen_pe, + __be16 *__pci_error_type, __be16 *__severity) +{ + struct phb *phb = pci_get_phb(phb_id); + uint64_t first_frozen_pe; + uint16_t pci_error_type; + uint16_t severity; + int64_t rc; + + if (!opal_addr_valid(__first_frozen_pe) || + !opal_addr_valid(__pci_error_type) || !opal_addr_valid(__severity)) + return OPAL_PARAMETER; + + if (!phb) + return OPAL_PARAMETER; + if (!phb->ops->next_error) + return OPAL_UNSUPPORTED; + phb_lock(phb); + + opal_pci_eeh_clear_evt(phb_id); + rc = phb->ops->next_error(phb, &first_frozen_pe, &pci_error_type, + &severity); + phb_unlock(phb); + + *__first_frozen_pe = cpu_to_be64(first_frozen_pe); + *__pci_error_type = cpu_to_be16(pci_error_type); + *__severity = cpu_to_be16(severity); + + return rc; +} +opal_call(OPAL_PCI_NEXT_ERROR, opal_pci_next_error, 4); + +static int64_t opal_pci_set_phb_capi_mode(uint64_t phb_id, uint64_t mode, uint64_t pe_number) +{ + struct phb *phb = pci_get_phb(phb_id); + int64_t rc; + + if (!phb) + return 
OPAL_PARAMETER; + if (!phb->ops->set_capi_mode) + return OPAL_UNSUPPORTED; + + phb_lock(phb); + rc = phb->ops->set_capi_mode(phb, mode, pe_number); + phb_unlock(phb); + return rc; +} +opal_call(OPAL_PCI_SET_PHB_CAPI_MODE, opal_pci_set_phb_capi_mode, 3); + +static int64_t opal_pci_set_p2p(uint64_t phbid_init, uint64_t phbid_target, + uint64_t desc, uint16_t pe_number) +{ + struct phb *phb_init = pci_get_phb(phbid_init); + struct phb *phb_target = pci_get_phb(phbid_target); + + if (!phb_init || !phb_target) + return OPAL_PARAMETER; + /* + * Having the 2 devices under the same PHB may require tuning + * the configuration of intermediate switch(es), more easily + * done from linux. And it shouldn't require a PHB config + * change. + * Return an error for the time being. + */ + if (phb_init == phb_target) + return OPAL_UNSUPPORTED; + if (!phb_init->ops->set_p2p || !phb_target->ops->set_p2p) + return OPAL_UNSUPPORTED; + /* + * Loads would be supported on p9 if the 2 devices are under + * the same PHB, but we ruled it out above. + */ + if (desc & OPAL_PCI_P2P_LOAD) + return OPAL_UNSUPPORTED; + + phb_lock(phb_init); + phb_init->ops->set_p2p(phb_init, OPAL_PCI_P2P_INITIATOR, desc, + pe_number); + phb_unlock(phb_init); + + phb_lock(phb_target); + phb_target->ops->set_p2p(phb_target, OPAL_PCI_P2P_TARGET, desc, + pe_number); + phb_unlock(phb_target); + return OPAL_SUCCESS; +} +opal_call(OPAL_PCI_SET_P2P, opal_pci_set_p2p, 4); + +static int64_t opal_pci_get_pbcq_tunnel_bar(uint64_t phb_id, __be64 *__addr) +{ + struct phb *phb = pci_get_phb(phb_id); + uint64_t addr; + + if (!opal_addr_valid(__addr)) + return OPAL_PARAMETER; + + if (!phb) + return OPAL_PARAMETER; + if (!phb->ops->get_tunnel_bar) + return OPAL_UNSUPPORTED; + + phb_lock(phb); + phb->ops->get_tunnel_bar(phb, &addr); + phb_unlock(phb); + + *__addr = cpu_to_be64(addr); + + return OPAL_SUCCESS; +} +opal_call(OPAL_PCI_GET_PBCQ_TUNNEL_BAR, opal_pci_get_pbcq_tunnel_bar, 2); + +static int64_t opal_pci_set_pbcq_tunnel_bar(uint64_t phb_id, uint64_t addr) +{ + struct phb *phb = pci_get_phb(phb_id); + int64_t rc; + + if (!phb) + return OPAL_PARAMETER; + if (!phb->ops->set_tunnel_bar) + return OPAL_UNSUPPORTED; + + phb_lock(phb); + rc = phb->ops->set_tunnel_bar(phb, addr); + phb_unlock(phb); + return rc; +} +opal_call(OPAL_PCI_SET_PBCQ_TUNNEL_BAR, opal_pci_set_pbcq_tunnel_bar, 2); diff --git a/roms/skiboot/core/pci-quirk.c b/roms/skiboot/core/pci-quirk.c new file mode 100644 index 000000000..5c8b091ea --- /dev/null +++ b/roms/skiboot/core/pci-quirk.c @@ -0,0 +1,135 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* + * Deal with PCI device quirks + * + * Copyright 2017-2018 IBM Corp. 
+ */ + +#define pr_fmt(fmt) "PCI-QUIRK: " fmt + +#include <skiboot.h> +#include <pci.h> +#include <pci-cfg.h> +#include <pci-quirk.h> +#include <platform.h> +#include <ast.h> + +static int64_t cfg_block_filter(void *dev __unused, + struct pci_cfg_reg_filter *pcrf __unused, + uint32_t offset __unused, uint32_t len, + uint32_t *data, bool write) +{ + if (write) + return OPAL_SUCCESS; + + switch (len) { + case 4: + *data = 0x0; + return OPAL_SUCCESS; + case 2: + *((uint16_t *)data) = 0x0; + return OPAL_SUCCESS; + case 1: + *((uint8_t *)data) = 0x0; + return OPAL_SUCCESS; + } + + return OPAL_PARAMETER; /* should never happen */ +} + +/* blocks config accesses to registers in the range: [start, end] */ +#define BLOCK_CFG_RANGE(pd, start, end) \ + pci_add_cfg_reg_filter(pd, start, end - start + 1, \ + PCI_REG_FLAG_WRITE | PCI_REG_FLAG_READ, \ + cfg_block_filter); + +static void quirk_microsemi_gen4_sw(struct phb *phb, struct pci_device *pd) +{ + uint8_t data; + bool frozen; + int offset; + int start; + + pci_check_clear_freeze(phb); + + /* + * Reading from 0xff should trigger a UR on the affected switches. + * If we don't get a freeze then we don't need the workaround + */ + pci_cfg_read8(phb, pd->bdfn, 0xff, &data); + frozen = pci_check_clear_freeze(phb); + if (!frozen) + return; + + for (start = -1, offset = 0; offset < 4096; offset++) { + pci_cfg_read8(phb, pd->bdfn, offset, &data); + frozen = pci_check_clear_freeze(phb); + + if (start < 0 && frozen) { /* new UR range */ + start = offset; + } else if (start >= 0 && !frozen) { /* end of range */ + BLOCK_CFG_RANGE(pd, start, offset - 1); + PCINOTICE(phb, pd->bdfn, "Applied UR workaround to [%03x..%03x]\n", start, offset - 1); + + start = -1; + } + } + + /* range lasted until the end of config space */ + if (start >= 0) { + BLOCK_CFG_RANGE(pd, start, 0xfff); + PCINOTICE(phb, pd->bdfn, "Applied UR workaround to [%03x..fff]\n", start); + } +} + +static void quirk_astbmc_vga(struct phb *phb __unused, + struct pci_device *pd) +{ + struct dt_node *np = pd->dn; + uint32_t revision, mcr_configuration, mcr_scu_mpll, mcr_scu_strap; + + if (ast_sio_is_enabled()) { + revision = ast_ahb_readl(SCU_REVISION_ID); + mcr_configuration = ast_ahb_readl(MCR_CONFIGURATION); + mcr_scu_mpll = ast_ahb_readl(MCR_SCU_MPLL); + mcr_scu_strap = ast_ahb_readl(MCR_SCU_STRAP); + } else { + /* Previously we would warn, now SIO disabled by design */ + prlog(PR_INFO, "Assumed platform default parameters for %s\n", + __func__); + revision = bmc_platform->hw->scu_revision_id; + mcr_configuration = bmc_platform->hw->mcr_configuration; + mcr_scu_mpll = bmc_platform->hw->mcr_scu_mpll; + mcr_scu_strap = bmc_platform->hw->mcr_scu_strap; + } + + dt_add_property_cells(np, "aspeed,scu-revision-id", revision); + dt_add_property_cells(np, "aspeed,mcr-configuration", mcr_configuration); + dt_add_property_cells(np, "aspeed,mcr-scu-mpll", mcr_scu_mpll); + dt_add_property_cells(np, "aspeed,mcr-scu-strap", mcr_scu_strap); +} + +/* Quirks are: {vendor ID, device ID or PCI_ANY_ID, fixup function} */ +static const struct pci_quirk quirk_table[] = { + /* ASPEED 2400 VGA device */ + { 0x1a03, 0x2000, &quirk_astbmc_vga }, + { 0x11f8, 0x4052, &quirk_microsemi_gen4_sw }, + { 0, 0, NULL } +}; + +static void __pci_handle_quirk(struct phb *phb, struct pci_device *pd, + const struct pci_quirk *quirks) +{ + while (quirks->vendor_id) { + if (quirks->vendor_id == PCI_VENDOR_ID(pd->vdid) && + (quirks->device_id == PCI_ANY_ID || + quirks->device_id == PCI_DEVICE_ID(pd->vdid))) + quirks->fixup(phb, pd); + quirks++; + } +} + +void pci_handle_quirk(struct phb *phb, struct 
pci_device *pd) +{ + __pci_handle_quirk(phb, pd, quirk_table); +} diff --git a/roms/skiboot/core/pci-slot.c b/roms/skiboot/core/pci-slot.c new file mode 100644 index 000000000..71d3d329c --- /dev/null +++ b/roms/skiboot/core/pci-slot.c @@ -0,0 +1,241 @@ +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +/* + * PCI Slots + * + * Copyright 2013-2019 IBM Corp. + */ + +#include <skiboot.h> +#include <opal-msg.h> +#include <pci-cfg.h> +#include <pci.h> +#include <pci-slot.h> + +/* Debugging options */ +#define PCI_SLOT_PREFIX "PCI-SLOT-%016llx " +#define PCI_SLOT_DBG(s, fmt, a...) \ + prlog(PR_DEBUG, PCI_SLOT_PREFIX fmt, (s)->id, ##a) + +static void pci_slot_prepare_link_change(struct pci_slot *slot, bool up) +{ + struct phb *phb = slot->phb; + struct pci_device *pd = slot->pd; + uint32_t aercap, mask; + + /* + * Mask the link down and receiver error before the link goes + * down. Otherwise, unmask the errors when the link is up. + */ + if (pci_has_cap(pd, PCIECAP_ID_AER, true)) { + aercap = pci_cap(pd, PCIECAP_ID_AER, true); + + /* Mask link surprise down event. The event is always + * masked when the associated PCI slot supports PCI + * surprise hotplug. We needn't toggle it when the link + * bounces due to a reset; just keep it always masked. + */ + if (!pd->slot || !pd->slot->surprise_pluggable) { + pci_cfg_read32(phb, pd->bdfn, + aercap + PCIECAP_AER_UE_MASK, &mask); + if (up) + mask &= ~PCIECAP_AER_UE_MASK_SURPRISE_DOWN; + else + mask |= PCIECAP_AER_UE_MASK_SURPRISE_DOWN; + pci_cfg_write32(phb, pd->bdfn, + aercap + PCIECAP_AER_UE_MASK, mask); + } + + /* Receiver error */ + pci_cfg_read32(phb, pd->bdfn, aercap + PCIECAP_AER_CE_MASK, + &mask); + if (up) + mask &= ~PCIECAP_AER_CE_RECVR_ERR; + else + mask |= PCIECAP_AER_CE_RECVR_ERR; + pci_cfg_write32(phb, pd->bdfn, aercap + PCIECAP_AER_CE_MASK, + mask); + } + + /* + * We're coming back from reset. We need to restore the bus ranges + * and reinitialize the affected bridges and devices. 
+ */ + if (up) { + pci_restore_bridge_buses(phb, pd); + if (phb->ops->device_init) + pci_walk_dev(phb, pd, phb->ops->device_init, NULL); + } +} + +static int64_t pci_slot_run_sm(struct pci_slot *slot) +{ + uint64_t now = mftb(); + int64_t ret; + + /* Return remaining timeout if we're still waiting */ + if (slot->delay_tgt_tb && + tb_compare(now, slot->delay_tgt_tb) == TB_ABEFOREB) + return slot->delay_tgt_tb - now; + + slot->delay_tgt_tb = 0; + switch (slot->state & PCI_SLOT_STATE_MASK) { + case PCI_SLOT_STATE_LINK: + ret = slot->ops.poll_link(slot); + break; + case PCI_SLOT_STATE_HRESET: + ret = slot->ops.hreset(slot); + break; + case PCI_SLOT_STATE_FRESET: + ret = slot->ops.freset(slot); + break; + case PCI_SLOT_STATE_CRESET: + ret = slot->ops.creset(slot); + break; + default: + prlog(PR_ERR, PCI_SLOT_PREFIX + "Invalid state %08x\n", slot->id, slot->state); + pci_slot_set_state(slot, PCI_SLOT_STATE_NORMAL); + ret = OPAL_HARDWARE; + } + + /* Notify about the pci slot state machine completion */ + if (ret <= 0 && slot->ops.completed_sm_run) + slot->ops.completed_sm_run(slot, ret); + + return ret; +} + +void pci_slot_add_dt_properties(struct pci_slot *slot, + struct dt_node *np) +{ + /* Bail without device node */ + if (!np) + return; + + dt_add_property_cells(np, "ibm,reset-by-firmware", 1); + dt_add_property_cells(np, "ibm,slot-pluggable", slot->pluggable); + dt_add_property_cells(np, "ibm,slot-surprise-pluggable", + slot->surprise_pluggable); + if (pci_slot_has_flags(slot, PCI_SLOT_FLAG_BROKEN_PDC)) + dt_add_property_cells(np, "ibm,slot-broken-pdc", 1); + + dt_add_property_cells(np, "ibm,slot-power-ctl", slot->power_ctl); + dt_add_property_cells(np, "ibm,slot-power-led-ctlled", + slot->power_led_ctl); + dt_add_property_cells(np, "ibm,slot-attn-led", slot->attn_led_ctl); + dt_add_property_cells(np, "ibm,slot-connector-type", + slot->connector_type); + dt_add_property_cells(np, "ibm,slot-card-desc", slot->card_desc); + dt_add_property_cells(np, "ibm,slot-card-mech", slot->card_mech); + dt_add_property_cells(np, "ibm,slot-wired-lanes", slot->wired_lanes); + dt_add_property_cells(np, "ibm,power-limit", slot->power_limit); + + if (slot->ops.add_properties) + slot->ops.add_properties(slot, np); +} + +struct pci_slot *pci_slot_alloc(struct phb *phb, + struct pci_device *pd) +{ + struct pci_slot *slot = NULL; + + /* + * The function can be used to allocate either PHB slot or normal + * one. For both cases, the @phb should be always valid. + */ + if (!phb) + return NULL; + + /* + * When @pd is NULL, we're going to create a PHB slot. Otherwise, + * a normal slot will be created. Check if the specified slot + * already exists or not. + */ + slot = pd ? 
pd->slot : phb->slot; + if (slot) { + prlog(PR_ERR, PCI_SLOT_PREFIX "Already exists\n", slot->id); + return slot; + } + + /* Allocate memory chunk */ + slot = zalloc(sizeof(struct pci_slot)); + if (!slot) { + prlog(PR_ERR, "%s: Out of memory\n", __func__); + return NULL; + } + + /* + * The polling function shouldn't be overridden by individual + * platforms + */ + slot->phb = phb; + slot->pd = pd; + pci_slot_set_state(slot, PCI_SLOT_STATE_NORMAL); + slot->power_state = PCI_SLOT_POWER_ON; + slot->ops.run_sm = pci_slot_run_sm; + slot->ops.prepare_link_change = pci_slot_prepare_link_change; + slot->peer_slot = NULL; + if (!pd) { + slot->id = PCI_PHB_SLOT_ID(phb); + phb->slot = slot; + } else { + slot->id = PCI_SLOT_ID(phb, pd->bdfn); + pd->slot = slot; + } + + return slot; +} + +struct pci_slot *pci_slot_find(uint64_t id) +{ + struct phb *phb; + struct pci_device *pd; + struct pci_slot *slot; + uint64_t index; + uint16_t bdfn; + + index = PCI_SLOT_PHB_INDEX(id); + phb = pci_get_phb(index); + + /* PHB slot */ + if (!(id & PCI_SLOT_ID_PREFIX)) { + slot = phb ? phb->slot : NULL; + return slot; + } + + /* Normal PCI slot */ + bdfn = PCI_SLOT_BDFN(id); + pd = phb ? pci_find_dev(phb, bdfn) : NULL; + slot = pd ? pd->slot : NULL; + return slot; + } + +void pci_slot_add_loc(struct pci_slot *slot, + struct dt_node *np, const char *label) +{ + char tmp[8], loc_code[LOC_CODE_SIZE]; + struct pci_device *pd = slot->pd; + struct phb *phb = slot->phb; + + if (!np) + return; + + /* didn't get a real slot label? generate one! */ + if (!label) { + snprintf(tmp, sizeof(tmp), "S%04x%02x", phb->opal_id, + pd->secondary_bus); + label = tmp; + } + + /* Make a -