Diffstat (limited to 'roms/skiboot/core')
-rw-r--r--  roms/skiboot/core/Makefile.inc | 28
-rw-r--r--  roms/skiboot/core/affinity.c | 125
-rw-r--r--  roms/skiboot/core/bitmap.c | 44
-rw-r--r--  roms/skiboot/core/buddy.c | 292
-rw-r--r--  roms/skiboot/core/chip.c | 190
-rw-r--r--  roms/skiboot/core/console-log.c | 71
-rw-r--r--  roms/skiboot/core/console.c | 451
-rw-r--r--  roms/skiboot/core/cpu.c | 1785
-rw-r--r--  roms/skiboot/core/cpufeatures.c | 1043
-rw-r--r--  roms/skiboot/core/device.c | 1128
-rw-r--r--  roms/skiboot/core/direct-controls.c | 1161
-rw-r--r--  roms/skiboot/core/errorlog.c | 223
-rw-r--r--  roms/skiboot/core/exceptions.c | 233
-rw-r--r--  roms/skiboot/core/fast-reboot.c | 467
-rw-r--r--  roms/skiboot/core/fdt.c | 258
-rw-r--r--  roms/skiboot/core/flash-firmware-versions.c | 164
-rw-r--r--  roms/skiboot/core/flash-subpartition.c | 110
-rw-r--r--  roms/skiboot/core/flash.c | 1186
-rw-r--r--  roms/skiboot/core/gcov-profiling.c | 127
-rw-r--r--  roms/skiboot/core/hmi.c | 1558
-rw-r--r--  roms/skiboot/core/i2c.c | 288
-rw-r--r--  roms/skiboot/core/init.c | 1469
-rw-r--r--  roms/skiboot/core/interrupts.c | 513
-rw-r--r--  roms/skiboot/core/ipmi-opal.c | 138
-rw-r--r--  roms/skiboot/core/ipmi.c | 263
-rw-r--r--  roms/skiboot/core/lock.c | 336
-rw-r--r--  roms/skiboot/core/malloc.c | 76
-rw-r--r--  roms/skiboot/core/mce.c | 309
-rw-r--r--  roms/skiboot/core/mem_region.c | 1555
-rw-r--r--  roms/skiboot/core/nvram-format.c | 331
-rw-r--r--  roms/skiboot/core/nvram.c | 203
-rw-r--r--  roms/skiboot/core/opal-dump.c | 582
-rw-r--r--  roms/skiboot/core/opal-msg.c | 193
-rw-r--r--  roms/skiboot/core/opal.c | 700
-rw-r--r--  roms/skiboot/core/pci-dt-slot.c | 212
-rw-r--r--  roms/skiboot/core/pci-opal.c | 1135
-rw-r--r--  roms/skiboot/core/pci-quirk.c | 135
-rw-r--r--  roms/skiboot/core/pci-slot.c | 241
-rw-r--r--  roms/skiboot/core/pci-virt.c | 256
-rw-r--r--  roms/skiboot/core/pci.c | 1962
-rw-r--r--  roms/skiboot/core/pcie-slot.c | 566
-rw-r--r--  roms/skiboot/core/pel.c | 279
-rw-r--r--  roms/skiboot/core/platform.c | 319
-rw-r--r--  roms/skiboot/core/pool.c | 68
-rw-r--r--  roms/skiboot/core/powercap.c | 37
-rw-r--r--  roms/skiboot/core/psr.c | 41
-rw-r--r--  roms/skiboot/core/relocate.c | 55
-rw-r--r--  roms/skiboot/core/rtc.c | 62
-rw-r--r--  roms/skiboot/core/sensor.c | 152
-rw-r--r--  roms/skiboot/core/stack.c | 266
-rw-r--r--  roms/skiboot/core/test/Makefile.check | 101
-rw-r--r--  roms/skiboot/core/test/dummy-cpu.h | 35
-rw-r--r--  roms/skiboot/core/test/firmware-versions-input/version-0 | Bin 0 -> 4096 bytes
-rw-r--r--  roms/skiboot/core/test/firmware-versions-input/version-1 | Bin 0 -> 4096 bytes
-rw-r--r--  roms/skiboot/core/test/firmware-versions-input/version-10 | Bin 0 -> 4096 bytes
-rw-r--r--  roms/skiboot/core/test/firmware-versions-input/version-11 | Bin 0 -> 4096 bytes
-rw-r--r--  roms/skiboot/core/test/firmware-versions-input/version-16 | Bin 0 -> 4096 bytes
-rw-r--r--  roms/skiboot/core/test/firmware-versions-input/version-2 | Bin 0 -> 4096 bytes
-rw-r--r--  roms/skiboot/core/test/firmware-versions-input/version-26 | Bin 0 -> 4096 bytes
-rw-r--r--  roms/skiboot/core/test/firmware-versions-input/version-27 | Bin 0 -> 4096 bytes
-rw-r--r--  roms/skiboot/core/test/firmware-versions-input/version-29 | Bin 0 -> 4096 bytes
-rw-r--r--  roms/skiboot/core/test/firmware-versions-input/version-long | 2
-rw-r--r--  roms/skiboot/core/test/firmware-versions-input/version-nodash | 2
-rw-r--r--  roms/skiboot/core/test/firmware-versions-input/version-trunc | 2
-rw-r--r--  roms/skiboot/core/test/run-api-test.c | 40
-rw-r--r--  roms/skiboot/core/test/run-bitmap.c | 80
-rw-r--r--  roms/skiboot/core/test/run-buddy.c | 73
-rw-r--r--  roms/skiboot/core/test/run-console-log-buf-overrun.c | 105
-rw-r--r--  roms/skiboot/core/test/run-console-log-pr_fmt.c | 63
-rw-r--r--  roms/skiboot/core/test/run-console-log.c | 63
-rw-r--r--  roms/skiboot/core/test/run-cpufeatures.c | 144
-rw-r--r--  roms/skiboot/core/test/run-device.c | 471
-rw-r--r--  roms/skiboot/core/test/run-flash-firmware-versions.c | 154
-rw-r--r--  roms/skiboot/core/test/run-flash-subpartition.c | 48
-rw-r--r--  roms/skiboot/core/test/run-malloc-speed.c | 88
-rw-r--r--  roms/skiboot/core/test/run-malloc.c | 174
-rw-r--r--  roms/skiboot/core/test/run-mem_range_is_reserved.c | 207
-rw-r--r--  roms/skiboot/core/test/run-mem_region.c | 252
-rw-r--r--  roms/skiboot/core/test/run-mem_region_init.c | 175
-rw-r--r--  roms/skiboot/core/test/run-mem_region_next.c | 105
-rw-r--r--  roms/skiboot/core/test/run-mem_region_release_unused.c | 177
-rw-r--r--  roms/skiboot/core/test/run-mem_region_release_unused_noalloc.c | 156
-rw-r--r--  roms/skiboot/core/test/run-mem_region_reservations.c | 228
-rw-r--r--  roms/skiboot/core/test/run-msg.c | 281
-rw-r--r--  roms/skiboot/core/test/run-nvram-format.c | 167
-rw-r--r--  roms/skiboot/core/test/run-pci-quirk.c | 98
-rw-r--r--  roms/skiboot/core/test/run-pel.c | 120
-rw-r--r--  roms/skiboot/core/test/run-pool.c | 59
-rw-r--r--  roms/skiboot/core/test/run-time-utils.c | 52
-rw-r--r--  roms/skiboot/core/test/run-timebase.c | 47
-rw-r--r--  roms/skiboot/core/test/run-timer.c | 84
-rw-r--r--  roms/skiboot/core/test/run-trace.c | 397
-rw-r--r--  roms/skiboot/core/test/stubs.c | 101
-rw-r--r--  roms/skiboot/core/time-utils.c | 64
-rw-r--r--  roms/skiboot/core/timebase.c | 141
-rw-r--r--  roms/skiboot/core/timer.c | 298
-rw-r--r--  roms/skiboot/core/trace.c | 265
-rw-r--r--  roms/skiboot/core/utils.c | 101
-rw-r--r--  roms/skiboot/core/vpd.c | 139
99 files changed, 28745 insertions, 0 deletions
diff --git a/roms/skiboot/core/Makefile.inc b/roms/skiboot/core/Makefile.inc
new file mode 100644
index 000000000..829800e5b
--- /dev/null
+++ b/roms/skiboot/core/Makefile.inc
@@ -0,0 +1,28 @@
+# SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+# Copyright 2012-2019 IBM Corp
+# -*-Makefile-*-
+
+SUBDIRS += core
+CORE_OBJS = relocate.o console.o stack.o init.o chip.o mem_region.o
+CORE_OBJS += malloc.o lock.o cpu.o utils.o fdt.o opal.o interrupts.o timebase.o
+CORE_OBJS += opal-msg.o pci.o pci-virt.o pci-slot.o pcie-slot.o
+CORE_OBJS += pci-opal.o fast-reboot.o device.o exceptions.o trace.o affinity.o
+CORE_OBJS += vpd.o platform.o nvram.o nvram-format.o hmi.o mce.o
+CORE_OBJS += console-log.o ipmi.o time-utils.o pel.o pool.o errorlog.o
+CORE_OBJS += timer.o i2c.o rtc.o flash.o sensor.o ipmi-opal.o
+CORE_OBJS += flash-subpartition.o bitmap.o buddy.o pci-quirk.o powercap.o psr.o
+CORE_OBJS += pci-dt-slot.o direct-controls.o cpufeatures.o
+CORE_OBJS += flash-firmware-versions.o opal-dump.o
+
+ifeq ($(SKIBOOT_GCOV),1)
+CORE_OBJS += gcov-profiling.o
+CFLAGS_SKIP_core/gcov-profiling.o = -Wsuggest-attribute=const
+endif
+
+CORE=core/built-in.a
+
+CFLAGS_SKIP_core/relocate.o = -pg -fstack-protector-all
+CFLAGS_SKIP_core/relocate.o += -fstack-protector -fstack-protector-strong
+CFLAGS_SKIP_core/relocate.o += -fprofile-arcs -ftest-coverage
+
+$(CORE): $(CORE_OBJS:%=core/%)
diff --git a/roms/skiboot/core/affinity.c b/roms/skiboot/core/affinity.c
new file mode 100644
index 000000000..0209d3cd9
--- /dev/null
+++ b/roms/skiboot/core/affinity.c
@@ -0,0 +1,125 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/* Copyright 2013-2019 IBM Corp. */
+
+/*
+ *
+ * We currently construct our associativity properties as such:
+ *
+ * - For "chip" devices (bridges, memory, ...), 4 entries:
+ *
+ * - CCM node ID
+ * - HW card ID
+ * - HW module ID
+ * - Chip ID
+ *
+ * The information is constructed based on the chip ID which (unlike
+ * pHyp) is our HW chip ID (aka "XSCOM" chip ID). We use it to retrieve
+ * the other properties from the corresponding chip/xscom node in the
+ * device-tree. If those properties are absent, 0 is used.
+ *
+ * - For "core" devices, we add a 5th entry:
+ *
+ * - Core ID
+ *
+ * Here too, we do not use the "cooked" HW processor ID from HDAT but
+ * instead use the real HW core ID which is basically the interrupt
+ * server number of thread 0 on that core.
+ *
+ *
+ * The ibm,associativity-reference-points property is currently set to
+ * 4,4 indicating that the chip ID is our only reference point. This
+ * should be extended to encompass the node IDs eventually.
+ */
+#include <skiboot.h>
+#include <opal.h>
+#include <device.h>
+#include <console.h>
+#include <trace.h>
+#include <chip.h>
+#include <cpu.h>
+#include <affinity.h>
+
+static uint32_t get_chip_node_id(struct proc_chip *chip)
+{
+ /* If the xscom node has an ibm,ccm-node-id property, use it */
+ if (dt_has_node_property(chip->devnode, "ibm,ccm-node-id", NULL))
+ return dt_prop_get_u32(chip->devnode, "ibm,ccm-node-id");
+
+ /*
+ * Else use the 3 top bits of the chip ID which should be
+ * the node on P8
+ */
+ return chip->id >> 3;
+}
+
+void add_associativity_ref_point(void)
+{
+ int ref2 = 0x4;
+
+ /*
+ * Note about our use of reference points:
+ *
+ * Linux currently supports up to three levels of NUMA. We use the
+ * first reference point for the node ID and the second reference
+ * point for a second level of affinity. We always use the chip ID
+ * (4) for the first reference point.
+ *
+ * Choosing the second level of affinity is model specific
+ * unfortunately. Current POWER8E models should use the DCM
+ * as a second level of NUMA.
+ *
+ * If there is a way to obtain this information from the FSP
+ * that would be ideal, but for now hardwire our POWER8E setting.
+ *
+ * For GPU nodes we add a third level of NUMA, such that the
+ * distance of the GPU node from all other nodes is uniformly
+ * the highest.
+ */
+ if (PVR_TYPE(mfspr(SPR_PVR)) == PVR_TYPE_P8E)
+ ref2 = 0x3;
+
+ dt_add_property_cells(opal_node, "ibm,associativity-reference-points",
+ 0x4, ref2, 0x2);
+}
+
+void add_chip_dev_associativity(struct dt_node *dev)
+{
+ uint32_t chip_id = dt_get_chip_id(dev);
+ struct proc_chip *chip = get_chip(chip_id);
+ uint32_t hw_cid, hw_mid;
+
+ if (!chip)
+ return;
+
+ hw_cid = dt_prop_get_u32_def(chip->devnode, "ibm,hw-card-id", 0);
+ hw_mid = dt_prop_get_u32_def(chip->devnode, "ibm,hw-module-id", 0);
+
+ dt_add_property_cells(dev, "ibm,associativity", 4,
+ get_chip_node_id(chip),
+ hw_cid, hw_mid, chip_id);
+}
+
+void add_core_associativity(struct cpu_thread *cpu)
+{
+ struct proc_chip *chip = get_chip(cpu->chip_id);
+ uint32_t hw_cid, hw_mid, core_id;
+
+ if (!chip)
+ return;
+
+ if (proc_gen == proc_gen_p8)
+ core_id = (cpu->pir >> 3) & 0xf;
+ else if (proc_gen == proc_gen_p9)
+ core_id = (cpu->pir >> 2) & 0x1f;
+ else if (proc_gen == proc_gen_p10)
+ core_id = (cpu->pir >> 2) & 0x1f;
+ else
+ return;
+
+ hw_cid = dt_prop_get_u32_def(chip->devnode, "ibm,hw-card-id", 0);
+ hw_mid = dt_prop_get_u32_def(chip->devnode, "ibm,hw-module-id", 0);
+
+ dt_add_property_cells(cpu->node, "ibm,associativity", 5,
+ get_chip_node_id(chip),
+ hw_cid, hw_mid, chip->id, core_id);
+}
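Note: the properties built above are flat cell lists: cell 0 holds the entry count (4 for chip devices, 5 for cores), followed by the CCM node, HW card, HW module, chip and, for cores, core IDs. A minimal consumer-side sketch of decoding one of these properties, using only device-tree helpers already used elsewhere in this series (dump_associativity is an illustrative name, not part of the patch):

    /* Sketch: decode { count, ccm-node, hw-card, hw-module, chip[, core] } */
    static void dump_associativity(struct dt_node *n)
    {
        const struct dt_property *p = dt_find_property(n, "ibm,associativity");
        uint32_t count, i;

        if (!p)
            return;
        count = dt_property_get_cell(p, 0);  /* 4 for chip devices, 5 for cores */
        for (i = 1; i <= count; i++)
            prlog(PR_DEBUG, "  level %u = 0x%x\n", i, dt_property_get_cell(p, i));
    }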
diff --git a/roms/skiboot/core/bitmap.c b/roms/skiboot/core/bitmap.c
new file mode 100644
index 000000000..8de1356c3
--- /dev/null
+++ b/roms/skiboot/core/bitmap.c
@@ -0,0 +1,44 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/* Copyright 2016 IBM Corp. */
+
+#include "bitmap.h"
+
+static int __bitmap_find_bit(bitmap_t map, unsigned int start, unsigned int count,
+ bool value)
+{
+ unsigned int el, first_bit;
+ unsigned int end = start + count;
+ bitmap_elem_t e, ev;
+ int b;
+
+ ev = value ? -1ul : 0;
+ el = BITMAP_ELEM(start);
+ first_bit = BITMAP_BIT(start);
+
+ while (start < end) {
+ e = map[el] ^ ev;
+ e |= ((1ul << first_bit) - 1);
+ if (~e)
+ break;
+ start = (start + BITMAP_ELSZ) & ~(BITMAP_ELSZ - 1);
+ first_bit = 0;
+ el++;
+ }
+ for (b = first_bit; b < BITMAP_ELSZ && start < end; b++,start++) {
+ if ((e & (1ull << b)) == 0)
+ return start;
+ }
+
+ return -1;
+}
+
+int bitmap_find_zero_bit(bitmap_t map, unsigned int start, unsigned int count)
+{
+ return __bitmap_find_bit(map, start, count, false);
+}
+
+int bitmap_find_one_bit(bitmap_t map, unsigned int start, unsigned int count)
+{
+ return __bitmap_find_bit(map, start, count, true);
+}
+
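Note: __bitmap_find_bit() scans whole bitmap words first (XORing with all-ones when looking for a set bit, so the wanted bit always appears as a zero), then finishes the last word bit by bit. A self-contained sketch of the same search over a plain uint64_t array, independent of bitmap.h (all names below are illustrative, not skiboot API):

    #include <stdint.h>
    #include <stdio.h>

    /* Return the first clear bit in [start, start + count) of a packed
     * uint64_t bitmap (bit i lives in word i / 64), or -1 if none is clear. */
    static int find_zero_bit(const uint64_t *map, unsigned int start,
                             unsigned int count)
    {
        unsigned int end = start + count, bit;

        for (bit = start; bit < end; bit++)
            if (!(map[bit / 64] & (1ull << (bit % 64))))
                return bit;
        return -1;
    }

    int main(void)
    {
        uint64_t map[2] = { ~0ull, 0xfull };        /* bits 0..67 set */

        printf("%d\n", find_zero_bit(map, 0, 128)); /* prints 68 */
        return 0;
    }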
diff --git a/roms/skiboot/core/buddy.c b/roms/skiboot/core/buddy.c
new file mode 100644
index 000000000..b36e407d1
--- /dev/null
+++ b/roms/skiboot/core/buddy.c
@@ -0,0 +1,292 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/* Copyright 2016-2017 IBM Corp. */
+
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+
+#include "buddy.h"
+
+#define BUDDY_DEBUG
+#undef BUDDY_VERBOSE
+
+#ifdef BUDDY_VERBOSE
+#define BUDDY_NOISE(fmt...) printf(fmt)
+#else
+#define BUDDY_NOISE(fmt...) do { } while(0)
+#endif
+
+static inline unsigned int buddy_map_size(struct buddy *b)
+{
+ return 1u << (b->max_order + 1);
+}
+
+static inline unsigned int buddy_order_start(struct buddy *b,
+ unsigned int order)
+{
+ unsigned int level = b->max_order - order;
+
+ /* Starting bit of index for order */
+ return 1u << level;
+}
+
+static inline unsigned int buddy_index_to_node(struct buddy *b,
+ unsigned int index,
+ unsigned int order)
+{
+ /* Ensure the index is a multiple of the order */
+ assert((index & ((1u << order) - 1)) == 0);
+
+ return buddy_order_start(b, order) + (index >> order);
+}
+
+static inline unsigned int buddy_node_to_index(struct buddy *b,
+ unsigned int node,
+ unsigned int order)
+{
+ unsigned int start = buddy_order_start(b, order);
+
+ return (node - start) << order;
+}
+
+#ifdef BUDDY_DEBUG
+static void buddy_check_alloc(struct buddy *b, unsigned int node)
+{
+ assert(bitmap_tst_bit(b->map, node));
+}
+
+static void buddy_check_alloc_down(struct buddy *b, unsigned int node)
+{
+ unsigned int i, count = 1;
+
+ while (node < buddy_map_size(b)) {
+ for (i = 0; i < count; i++)
+ buddy_check_alloc(b, node + i);
+
+ /* Down one level */
+ node <<= 1;
+ count <<= 1;
+ }
+}
+#else
+static inline void buddy_check_alloc(struct buddy *b __unused, unsigned int node __unused) {}
+static inline void buddy_check_alloc_down(struct buddy *b __unused, unsigned int node __unused) {}
+#endif
+
+int buddy_alloc(struct buddy *b, unsigned int order)
+{
+ unsigned int o;
+ int node, index;
+
+ BUDDY_NOISE("buddy_alloc(%d)\n", order);
+ /*
+ * Find the first order up the tree from our requested order that
+ * has at least one free node.
+ */
+ for (o = order; o <= b->max_order; o++) {
+ if (b->freecounts[o] > 0)
+ break;
+ }
+
+ /* Nothing found ? fail */
+ if (o > b->max_order) {
+ BUDDY_NOISE(" no free nodes !\n");
+ return -1;
+ }
+
+ BUDDY_NOISE(" %d free node(s) at order %d, bits %d(%d)\n",
+ b->freecounts[o], o,
+ buddy_order_start(b, o),
+ 1u << (b->max_order - o));
+
+ /* Now find a free node */
+ node = bitmap_find_zero_bit(b->map, buddy_order_start(b, o),
+ 1u << (b->max_order - o));
+
+ /* There should always be one */
+ assert(node >= 0);
+
+ /* Mark it allocated and decrease free count */
+ bitmap_set_bit(b->map, node);
+ b->freecounts[o]--;
+
+ /* We know that node was free which means all its children must have
+ * been marked "allocated". Double check.
+ */
+ buddy_check_alloc_down(b, node);
+
+ /* We have a node, we've marked it allocated, now we need to go down
+ * the tree until we reach "order" which is the order we need. For
+ * each level along the way, we mark the buddy free and leave the
+ * first child allocated.
+ */
+ while (o > order) {
+ /* Next level down */
+ o--;
+ node <<= 1;
+
+ BUDDY_NOISE(" order %d, using %d marking %d free\n",
+ o, node, node ^ 1);
+ bitmap_clr_bit(b->map, node ^ 1);
+ b->freecounts[o]++;
+ assert(bitmap_tst_bit(b->map, node));
+ }
+
+ index = buddy_node_to_index(b, node, order);
+
+ BUDDY_NOISE(" result is index %d (node %d)\n", index, node);
+
+ /* We have a node, convert it to an element number */
+ return index;
+}
+
+bool buddy_reserve(struct buddy *b, unsigned int index, unsigned int order)
+{
+ unsigned int node, freenode, o;
+
+ assert(index < (1u << b->max_order));
+
+ BUDDY_NOISE("buddy_reserve(%d,%d)\n", index, order);
+
+ /* Get bit number for node */
+ node = buddy_index_to_node(b, index, order);
+
+ BUDDY_NOISE(" node=%d\n", node);
+
+ /* Find something free */
+ for (freenode = node, o = order; freenode > 0; freenode >>= 1, o++)
+ if (!bitmap_tst_bit(b->map, freenode))
+ break;
+
+ BUDDY_NOISE(" freenode=%d order %d\n", freenode, o);
+
+ /* Nothing free, error out */
+ if (!freenode)
+ return false;
+
+ /* We sit on a free node, mark it busy */
+ bitmap_set_bit(b->map, freenode);
+ assert(b->freecounts[o]);
+ b->freecounts[o]--;
+
+ /* We know that node was free which means all its children must have
+ * been marked "allocated". Double check.
+ */
+ buddy_check_alloc_down(b, freenode);
+
+ /* Reverse-walk the path and break down nodes */
+ while (o > order) {
+ /* Next level down */
+ o--;
+ freenode <<= 1;
+
+ /* Find the right one on the path to node */
+ if (node & (1u << (o - order)))
+ freenode++;
+
+ BUDDY_NOISE(" order %d, using %d marking %d free\n",
+ o, freenode, freenode ^ 1);
+ bitmap_clr_bit(b->map, freenode ^ 1);
+ b->freecounts[o]++;
+ assert(bitmap_tst_bit(b->map, node));
+ }
+ assert(node == freenode);
+
+ return true;
+}
+
+void buddy_free(struct buddy *b, unsigned int index, unsigned int order)
+{
+ unsigned int node;
+
+ assert(index < (1u << b->max_order));
+
+ BUDDY_NOISE("buddy_free(%d,%d)\n", index, order);
+
+ /* Get bit number for node */
+ node = buddy_index_to_node(b, index, order);
+
+ BUDDY_NOISE(" node=%d\n", node);
+
+ /* We assume that anything freed was fully allocated, i.e.,
+ * there is no child node of that allocation index/order
+ * that is already free.
+ *
+ * BUDDY_DEBUG will verify it at the cost of performance
+ */
+ buddy_check_alloc_down(b, node);
+
+ /* Propagate if buddy is free */
+ while (order < b->max_order && !bitmap_tst_bit(b->map, node ^ 1)) {
+ BUDDY_NOISE(" order %d node %d buddy %d free, propagating\n",
+ order, node, node ^ 1);
+
+ /* Mark buddy busy (we are already marked busy) */
+ bitmap_set_bit(b->map, node ^ 1);
+
+ /* Reduce free count */
+ assert(b->freecounts[order] > 0);
+ b->freecounts[order]--;
+
+ /* Get parent */
+ node >>= 1;
+ order++;
+
+ /* It must be busy already ! */
+ buddy_check_alloc(b, node);
+
+ BUDDY_NOISE(" testing order %d node %d\n", order, node ^ 1);
+ }
+
+ /* No more coalescing, mark it free */
+ bitmap_clr_bit(b->map, node);
+
+ /* Increase the freelist count for that level */
+ b->freecounts[order]++;
+
+ BUDDY_NOISE(" free count at order %d is %d\n",
+ order, b->freecounts[order]);
+}
+
+void buddy_reset(struct buddy *b)
+{
+ unsigned int bsize = BITMAP_BYTES(1u << (b->max_order + 1));
+
+ BUDDY_NOISE("buddy_reset()\n");
+ /* We fill the bitmap with 1's to make it completely "busy" */
+ memset(b->map, 0xff, bsize);
+ memset(b->freecounts, 0, sizeof(b->freecounts));
+
+ /* We mark the root of the tree free; this is entry 1, as entry 0
+ * is unused.
+ */
+ buddy_free(b, 0, b->max_order);
+}
+
+struct buddy *buddy_create(unsigned int max_order)
+{
+ struct buddy *b;
+ unsigned int bsize;
+
+ assert(max_order <= BUDDY_MAX_ORDER);
+
+ bsize = BITMAP_BYTES(1u << (max_order + 1));
+
+ b = zalloc(sizeof(struct buddy) + bsize);
+ if (!b)
+ return NULL;
+ b->max_order = max_order;
+
+ BUDDY_NOISE("Map @%p, size: %d bytes\n", b->map, bsize);
+
+ buddy_reset(b);
+
+ return b;
+}
+
+void buddy_destroy(struct buddy *b)
+{
+ free(b);
+}
+
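Note: a hedged usage sketch of this allocator, in the spirit of the host-side core/test/run-buddy.c unit test listed in the diffstat above; it calls only functions defined in this file and assumes a userspace harness that provides zalloc()/free():

    #include <assert.h>
    #include "buddy.h"

    static void exercise_buddy(void)
    {
        struct buddy *b = buddy_create(4);  /* manages 2^4 = 16 units */
        int a, c;

        assert(b);
        a = buddy_alloc(b, 2);              /* a 4-unit block, index is 4-aligned */
        assert(a >= 0 && (a & 3) == 0);
        assert(buddy_reserve(b, 8, 3));     /* pin the 8-unit block at index 8 */
        c = buddy_alloc(b, 0);              /* a single unit */
        assert(c >= 0);

        buddy_free(b, a, 2);                /* order must match the allocation */
        buddy_free(b, 8, 3);
        buddy_free(b, c, 0);
        buddy_destroy(b);
    }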
diff --git a/roms/skiboot/core/chip.c b/roms/skiboot/core/chip.c
new file mode 100644
index 000000000..2d95b2e05
--- /dev/null
+++ b/roms/skiboot/core/chip.c
@@ -0,0 +1,190 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/* Copyright 2013-2019 IBM Corp. */
+
+#include <skiboot.h>
+#include <chip.h>
+#include <console.h>
+#include <device.h>
+#include <timebase.h>
+#include <cpu.h>
+
+static struct proc_chip *chips[MAX_CHIPS];
+enum proc_chip_quirks proc_chip_quirks;
+
+uint32_t pir_to_chip_id(uint32_t pir)
+{
+ if (proc_gen == proc_gen_p10)
+ return P10_PIR2GCID(pir);
+ else if (proc_gen == proc_gen_p9)
+ return P9_PIR2GCID(pir);
+ else if (proc_gen == proc_gen_p8)
+ return P8_PIR2GCID(pir);
+ else
+ assert(false);
+}
+
+uint32_t pir_to_core_id(uint32_t pir)
+{
+ if (proc_gen == proc_gen_p10) {
+ if (this_cpu()->is_fused_core)
+ return P10_PIRFUSED2NORMALCOREID(pir);
+ else
+ return P10_PIR2COREID(pir);
+ } else if (proc_gen == proc_gen_p9) {
+ if (this_cpu()->is_fused_core)
+ return P9_PIRFUSED2NORMALCOREID(pir);
+ else
+ return P9_PIR2COREID(pir);
+ } else if (proc_gen == proc_gen_p8) {
+ return P8_PIR2COREID(pir);
+ } else {
+ assert(false);
+ }
+}
+
+uint32_t pir_to_fused_core_id(uint32_t pir)
+{
+ if (proc_gen == proc_gen_p10) {
+ if (this_cpu()->is_fused_core)
+ return P10_PIR2FUSEDCOREID(pir);
+ else
+ return P10_PIR2COREID(pir);
+ } else if (proc_gen == proc_gen_p9) {
+ if (this_cpu()->is_fused_core)
+ return P9_PIR2FUSEDCOREID(pir);
+ else
+ return P9_PIR2COREID(pir);
+ } else if (proc_gen == proc_gen_p8) {
+ return P8_PIR2COREID(pir);
+ } else {
+ assert(false);
+ }
+}
+
+uint32_t pir_to_thread_id(uint32_t pir)
+{
+ if (proc_gen == proc_gen_p10) {
+ if (this_cpu()->is_fused_core)
+ return P10_PIRFUSED2NORMALTHREADID(pir);
+ else
+ return P10_PIR2THREADID(pir);
+ } else if (proc_gen == proc_gen_p9) {
+ if (this_cpu()->is_fused_core)
+ return P9_PIRFUSED2NORMALTHREADID(pir);
+ else
+ return P9_PIR2THREADID(pir);
+ } else if (proc_gen == proc_gen_p8) {
+ return P8_PIR2THREADID(pir);
+ } else {
+ assert(false);
+ }
+}
+
+struct proc_chip *next_chip(struct proc_chip *chip)
+{
+ unsigned int i;
+
+ for (i = chip ? (chip->id + 1) : 0; i < MAX_CHIPS; i++)
+ if (chips[i])
+ return chips[i];
+ return NULL;
+}
+
+
+struct proc_chip *get_chip(uint32_t chip_id)
+{
+ if (chip_id >= MAX_CHIPS)
+ return NULL;
+ return chips[chip_id];
+}
+
+static void init_chip(struct dt_node *dn)
+{
+ struct proc_chip *chip;
+ uint32_t id;
+ const char *lc = NULL;
+
+ id = dt_get_chip_id(dn);
+ assert(id < MAX_CHIPS);
+ assert(chips[id] == NULL);
+
+ chip = zalloc(sizeof(struct proc_chip));
+ assert(chip);
+
+ chip->id = id;
+ chip->devnode = dn;
+
+ chip->dbob_id = dt_prop_get_u32_def(dn, "ibm,dbob-id", 0xffffffff);
+ chip->pcid = dt_prop_get_u32_def(dn, "ibm,proc-chip-id", 0xffffffff);
+
+ if (dt_prop_get_u32_def(dn, "ibm,occ-functional-state", 0))
+ chip->occ_functional = true;
+ else
+ chip->occ_functional = false;
+
+ list_head_init(&chip->i2cms);
+
+ /* Update the location code for this chip. */
+ if (dt_has_node_property(dn, "ibm,loc-code", NULL))
+ lc = dt_prop_get(dn, "ibm,loc-code");
+ else if (dt_has_node_property(dn, "ibm,slot-location-code", NULL))
+ lc = dt_prop_get(dn, "ibm,slot-location-code");
+
+ if (lc)
+ chip->loc_code = strdup(lc);
+
+ chip->primary_topology = dt_prop_get_u32_def(dn,
+ "ibm,primary-topology-index", 0xffffffff);
+
+ prlog(PR_INFO, "CHIP: Initialised chip %d from %s\n", id, dn->name);
+ chips[id] = chip;
+}
+
+void init_chips(void)
+{
+ struct dt_node *xn;
+
+ /* Detect mambo chip */
+ if (dt_find_by_path(dt_root, "/mambo")) {
+ proc_chip_quirks |= QUIRK_NO_CHIPTOD | QUIRK_MAMBO_CALLOUTS
+ | QUIRK_NO_F000F | QUIRK_NO_PBA | QUIRK_NO_OCC_IRQ
+ | QUIRK_NO_RNG;
+
+ enable_mambo_console();
+
+ prlog(PR_NOTICE, "CHIP: Detected Mambo simulator\n");
+
+ dt_for_each_compatible(dt_root, xn, "ibm,mambo-chip")
+ init_chip(xn);
+ }
+
+ /* Detect simics */
+ if (dt_find_by_path(dt_root, "/simics")) {
+ proc_chip_quirks |= QUIRK_SIMICS
+ | QUIRK_NO_PBA | QUIRK_NO_OCC_IRQ | QUIRK_SLOW_SIM;
+ tb_hz = 512000;
+ prlog(PR_NOTICE, "CHIP: Detected Simics simulator\n");
+ }
+ /* Detect Awan emulator */
+ if (dt_find_by_path(dt_root, "/awan")) {
+ proc_chip_quirks |= QUIRK_NO_CHIPTOD | QUIRK_NO_F000F
+ | QUIRK_NO_PBA | QUIRK_NO_OCC_IRQ | QUIRK_SLOW_SIM;
+ tb_hz = 512000;
+ prlog(PR_NOTICE, "CHIP: Detected Awan emulator\n");
+ }
+ /* Detect Qemu */
+ if (dt_node_is_compatible(dt_root, "qemu,powernv") ||
+ dt_node_is_compatible(dt_root, "qemu,powernv8") ||
+ dt_node_is_compatible(dt_root, "qemu,powernv9") ||
+ dt_node_is_compatible(dt_root, "qemu,powernv10") ||
+ dt_find_by_path(dt_root, "/qemu")) {
+ proc_chip_quirks |= QUIRK_QEMU | QUIRK_NO_CHIPTOD
+ | QUIRK_NO_DIRECT_CTL | QUIRK_NO_RNG;
+ prlog(PR_NOTICE, "CHIP: Detected QEMU simulator\n");
+ }
+
+ /* We walk the chips based on xscom nodes in the tree */
+ dt_for_each_compatible(dt_root, xn, "ibm,xscom") {
+ init_chip(xn);
+ }
+}
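Note: since next_chip(NULL) returns the first populated slot, walking every chip discovered by init_chips() is a short loop. A sketch, assuming it runs after init_chips() (skiboot's chip.h also wraps this pattern in an iteration macro, not shown in this diff):

    struct proc_chip *chip;

    for (chip = next_chip(NULL); chip; chip = next_chip(chip))
        prlog(PR_INFO, "CHIP: id %d pcid 0x%x occ %s loc %s\n",
              chip->id, chip->pcid,
              chip->occ_functional ? "functional" : "not functional",
              chip->loc_code ? chip->loc_code : "unknown");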
diff --git a/roms/skiboot/core/console-log.c b/roms/skiboot/core/console-log.c
new file mode 100644
index 000000000..21a1442bd
--- /dev/null
+++ b/roms/skiboot/core/console-log.c
@@ -0,0 +1,71 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Console Log routines
+ * Wraps libc and lower-level console functions,
+ * does fancy-schmancy things like timestamps and priorities.
+ * Doesn't make waffles.
+ *
+ * Copyright 2013-2018 IBM Corp.
+ */
+
+#include "skiboot.h"
+#include "unistd.h"
+#include "stdio.h"
+#include "console.h"
+#include "timebase.h"
+#include <debug_descriptor.h>
+
+static int vprlog(int log_level, const char *fmt, va_list ap)
+{
+ int count;
+ char buffer[320];
+ bool flush_to_drivers = true;
+ unsigned long tb = mftb();
+
+ /* It's safe to return 0 when we "did" something here
+ * as only printf cares about how much we wrote, and
+ * if you change log_level to below PR_PRINTF then you
+ * get everything you deserve.
+ * By default, only PR_DEBUG and higher are stored in memory.
+ * PR_TRACE and PR_INSANE are for those having a bad day.
+ */
+ if (log_level > (debug_descriptor.console_log_levels >> 4))
+ return 0;
+
+ count = snprintf(buffer, sizeof(buffer), "[%5lu.%09lu,%d] ",
+ tb_to_secs(tb), tb_remaining_nsecs(tb), log_level);
+ count+= vsnprintf(buffer+count, sizeof(buffer)-count, fmt, ap);
+
+ if (log_level > (debug_descriptor.console_log_levels & 0x0f))
+ flush_to_drivers = false;
+
+ console_write(flush_to_drivers, buffer, count);
+
+ return count;
+}
+
+/* we don't return anything as what on earth are we going to do
+ * if we actually fail to print a log message? Print a log message about it?
+ * Callers shouldn't care, prlog and friends should do something generically
+ * sane in such crazy situations.
+ */
+void _prlog(int log_level, const char* fmt, ...)
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+ vprlog(log_level, fmt, ap);
+ va_end(ap);
+}
+
+int _printf(const char* fmt, ...)
+{
+ int count;
+ va_list ap;
+
+ va_start(ap, fmt);
+ count = vprlog(PR_PRINTF, fmt, ap);
+ va_end(ap);
+
+ return count;
+}
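Note: both thresholds in vprlog() come from one byte of debug_descriptor.console_log_levels: the high nibble caps what is kept in the in-memory console and the low nibble caps what is flushed to the console driver. A tiny sketch of that split (the packed value 0x75 is only an example):

    /* With levels == 0x75: keep messages with level <= 7 in memory,
     * push messages with level <= 5 out to the console driver. */
    static inline int memory_log_level(uint8_t levels) { return levels >> 4; }
    static inline int driver_log_level(uint8_t levels) { return levels & 0x0f; }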
diff --git a/roms/skiboot/core/console.c b/roms/skiboot/core/console.c
new file mode 100644
index 000000000..2a1509025
--- /dev/null
+++ b/roms/skiboot/core/console.c
@@ -0,0 +1,451 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Console IO routine for use by libc
+ *
+ * fd is the classic posix 0,1,2 (stdin, stdout, stderr)
+ *
+ * Copyright 2013-2018 IBM Corp.
+ */
+
+#include <skiboot.h>
+#include <unistd.h>
+#include <console.h>
+#include <opal.h>
+#include <device.h>
+#include <processor.h>
+#include <cpu.h>
+
+static char *con_buf = (char *)INMEM_CON_START;
+static size_t con_in;
+static size_t con_out;
+static bool con_wrapped;
+
+/* Internal console driver ops */
+static struct con_ops *con_driver;
+
+/* External (OPAL) console driver ops */
+static struct opal_con_ops *opal_con_driver = &dummy_opal_con;
+
+static struct lock con_lock = LOCK_UNLOCKED;
+
+/* This is mapped via TCEs so we keep it alone in a page */
+struct memcons memcons __section(".data.memcons") = {
+ .magic = CPU_TO_BE64(MEMCONS_MAGIC),
+ .obuf_phys = CPU_TO_BE64(INMEM_CON_START),
+ .ibuf_phys = CPU_TO_BE64(INMEM_CON_START + INMEM_CON_OUT_LEN),
+ .obuf_size = CPU_TO_BE32(INMEM_CON_OUT_LEN),
+ .ibuf_size = CPU_TO_BE32(INMEM_CON_IN_LEN),
+};
+
+static bool dummy_console_enabled(void)
+{
+#ifdef FORCE_DUMMY_CONSOLE
+ return true;
+#else
+ return dt_has_node_property(dt_chosen,
+ "sapphire,enable-dummy-console", NULL);
+#endif
+}
+
+/*
+ * Helper function for adding /ibm,opal/consoles/serial@<xyz> nodes
+ */
+struct dt_node *add_opal_console_node(int index, const char *type,
+ uint32_t write_buffer_size)
+{
+ struct dt_node *con, *consoles;
+ char buffer[32];
+
+ consoles = dt_find_by_name(opal_node, "consoles");
+ if (!consoles) {
+ consoles = dt_new(opal_node, "consoles");
+ assert(consoles);
+ dt_add_property_cells(consoles, "#address-cells", 1);
+ dt_add_property_cells(consoles, "#size-cells", 0);
+ }
+
+ con = dt_new_addr(consoles, "serial", index);
+ assert(con);
+
+ snprintf(buffer, sizeof(buffer), "ibm,opal-console-%s", type);
+ dt_add_property_string(con, "compatible", buffer);
+
+ dt_add_property_cells(con, "#write-buffer-size", write_buffer_size);
+ dt_add_property_cells(con, "reg", index);
+ dt_add_property_string(con, "device_type", "serial");
+
+ return con;
+}
+
+void clear_console(void)
+{
+ memset(con_buf, 0, INMEM_CON_LEN);
+}
+
+/*
+ * Flush the console buffer into the driver, returns true
+ * if there is more to go.
+ * Optionally can skip flushing to drivers, leaving messages
+ * just in memory console.
+ */
+static bool __flush_console(bool flush_to_drivers, bool need_unlock)
+{
+ struct cpu_thread *cpu = this_cpu();
+ size_t req, len = 0;
+ static bool in_flush, more_flush;
+
+ /* Is there anything to flush ? Bail out early if not */
+ if (con_in == con_out || !con_driver)
+ return false;
+
+ /*
+ * Console flushing is suspended on this CPU, typically because
+ * some critical locks are held that would potentially cause a
+ * flush to deadlock.
+ *
+ * The same applies if we recursed on con_lock (need_unlock is false). This
+ * can happen due to debug code firing (e.g., list or stack
+ * debugging).
+ */
+ if (cpu->con_suspend || !need_unlock) {
+ cpu->con_need_flush = true;
+ return false;
+ }
+ cpu->con_need_flush = false;
+
+ /*
+ * We must call the underlying driver with the console lock
+ * dropped otherwise we get some deadlocks if anything down
+ * that path tries to printf() something.
+ *
+ * So instead what we do is we keep a static in_flush flag
+ * set/released with the lock held, which is used to prevent
+ * concurrent attempts at flushing the same chunk of buffer
+ * by other processors.
+ */
+ if (in_flush) {
+ more_flush = true;
+ return false;
+ }
+ in_flush = true;
+
+ /*
+ * NB: this must appear after the in_flush check since it modifies
+ * con_out.
+ */
+ if (!flush_to_drivers) {
+ con_out = con_in;
+ in_flush = false;
+ return false;
+ }
+
+ do {
+ more_flush = false;
+
+ if (con_out > con_in) {
+ req = INMEM_CON_OUT_LEN - con_out;
+ more_flush = true;
+ } else
+ req = con_in - con_out;
+
+ unlock(&con_lock);
+ len = con_driver->write(con_buf + con_out, req);
+ lock(&con_lock);
+
+ con_out = (con_out + len) % INMEM_CON_OUT_LEN;
+
+ /* write error? */
+ if (len < req)
+ break;
+ } while(more_flush);
+
+ in_flush = false;
+ return con_out != con_in;
+}
+
+bool flush_console(void)
+{
+ bool ret;
+
+ lock(&con_lock);
+ ret = __flush_console(true, true);
+ unlock(&con_lock);
+
+ return ret;
+}
+
+static void inmem_write(char c)
+{
+ uint32_t opos;
+
+ if (!c)
+ return;
+ con_buf[con_in++] = c;
+ if (con_in >= INMEM_CON_OUT_LEN) {
+ con_in = 0;
+ con_wrapped = true;
+ }
+
+ /*
+ * We must always re-generate memcons.out_pos because
+ * under some circumstances, the console script will
+ * use a broken putmemproc that does RMW on the full
+ * 8 bytes containing out_pos and in_prod, thus corrupting
+ * out_pos
+ */
+ opos = con_in;
+ if (con_wrapped)
+ opos |= MEMCONS_OUT_POS_WRAP;
+ lwsync();
+ memcons.out_pos = cpu_to_be32(opos);
+
+ /* If head reaches tail, push tail around & drop chars */
+ if (con_in == con_out)
+ con_out = (con_in + 1) % INMEM_CON_OUT_LEN;
+}
+
+static size_t inmem_read(char *buf, size_t req)
+{
+ size_t read = 0;
+ char *ibuf = (char *)be64_to_cpu(memcons.ibuf_phys);
+
+ while (req && be32_to_cpu(memcons.in_prod) != be32_to_cpu(memcons.in_cons)) {
+ *(buf++) = ibuf[be32_to_cpu(memcons.in_cons)];
+ lwsync();
+ memcons.in_cons = cpu_to_be32((be32_to_cpu(memcons.in_cons) + 1) % INMEM_CON_IN_LEN);
+ req--;
+ read++;
+ }
+ return read;
+}
+
+static void write_char(char c)
+{
+#ifdef MAMBO_DEBUG_CONSOLE
+ mambo_console_write(&c, 1);
+#endif
+ inmem_write(c);
+}
+
+ssize_t console_write(bool flush_to_drivers, const void *buf, size_t count)
+{
+ /* We use recursive locking here as we can get called
+ * from a fairly deep debug path
+ */
+ bool need_unlock = lock_recursive(&con_lock);
+ const char *cbuf = buf;
+
+ while(count--) {
+ char c = *(cbuf++);
+ if (c == '\n')
+ write_char('\r');
+ write_char(c);
+ }
+
+ __flush_console(flush_to_drivers, need_unlock);
+
+ if (need_unlock)
+ unlock(&con_lock);
+
+ return count;
+}
+
+ssize_t write(int fd __unused, const void *buf, size_t count)
+{
+ return console_write(true, buf, count);
+}
+
+ssize_t read(int fd __unused, void *buf, size_t req_count)
+{
+ bool need_unlock = lock_recursive(&con_lock);
+ size_t count = 0;
+
+ if (con_driver && con_driver->read)
+ count = con_driver->read(buf, req_count);
+ if (!count)
+ count = inmem_read(buf, req_count);
+ if (need_unlock)
+ unlock(&con_lock);
+ return count;
+}
+
+/* Helper function to perform a full synchronous flush */
+void console_complete_flush(void)
+{
+ /*
+ * Using term 0 here is a dumb hack that works because the UART
+ * only has term 0 and the FSP doesn't have an explicit flush method.
+ */
+ int64_t ret = opal_con_driver->flush(0);
+
+ if (ret == OPAL_UNSUPPORTED || ret == OPAL_PARAMETER)
+ return;
+
+ while (ret != OPAL_SUCCESS) {
+ ret = opal_con_driver->flush(0);
+ }
+}
+
+/*
+ * set_console()
+ *
+ * This sets the driver used internally by Skiboot. This is different to the
+ * OPAL console driver.
+ */
+void set_console(struct con_ops *driver)
+{
+ con_driver = driver;
+ if (driver)
+ flush_console();
+}
+
+/*
+ * set_opal_console()
+ *
+ * Configure the console driver to handle the console provided by the OPAL API.
+ * These differ from the above in that they are typically buffered and used
+ * by the host OS rather than by skiboot.
+ */
+static bool opal_cons_init = false;
+
+void set_opal_console(struct opal_con_ops *driver)
+{
+ assert(!opal_cons_init);
+ opal_con_driver = driver;
+}
+
+void init_opal_console(void)
+{
+ assert(!opal_cons_init);
+ opal_cons_init = true;
+
+ if (dummy_console_enabled() && opal_con_driver != &dummy_opal_con) {
+ prlog(PR_WARNING, "OPAL: Dummy console forced, %s ignored\n",
+ opal_con_driver->name);
+
+ opal_con_driver = &dummy_opal_con;
+ }
+
+ prlog(PR_INFO, "OPAL: Using %s\n", opal_con_driver->name);
+
+ if (opal_con_driver->init)
+ opal_con_driver->init();
+
+ opal_register(OPAL_CONSOLE_READ, opal_con_driver->read, 3);
+ opal_register(OPAL_CONSOLE_WRITE, opal_con_driver->write, 3);
+ opal_register(OPAL_CONSOLE_FLUSH, opal_con_driver->flush, 1);
+ opal_register(OPAL_CONSOLE_WRITE_BUFFER_SPACE,
+ opal_con_driver->space, 2);
+}
+
+void memcons_add_properties(void)
+{
+ dt_add_property_u64(opal_node, "ibm,opal-memcons", (u64) &memcons);
+}
+
+/*
+ * The default OPAL console.
+ *
+ * In the absence of a "real" OPAL console driver we handle the OPAL_CONSOLE_*
+ * calls by writing into the skiboot log buffer. Reads are a little more
+ * complicated since they can come from the in-memory console (BML) or from the
+ * internal skiboot console driver.
+ */
+static int64_t dummy_console_write(int64_t term_number, __be64 *length,
+ const uint8_t *buffer)
+{
+ uint64_t l;
+
+ if (term_number != 0)
+ return OPAL_PARAMETER;
+
+ if (!opal_addr_valid(length) || !opal_addr_valid(buffer))
+ return OPAL_PARAMETER;
+
+ l = be64_to_cpu(*length);
+ write(0, buffer, l);
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t dummy_console_write_buffer_space(int64_t term_number,
+ __be64 *length)
+{
+ if (term_number != 0)
+ return OPAL_PARAMETER;
+
+ if (!opal_addr_valid(length))
+ return OPAL_PARAMETER;
+
+ if (length)
+ *length = cpu_to_be64(INMEM_CON_OUT_LEN);
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t dummy_console_read(int64_t term_number, __be64 *length,
+ uint8_t *buffer)
+{
+ uint64_t l;
+
+ if (term_number != 0)
+ return OPAL_PARAMETER;
+
+ if (!opal_addr_valid(length) || !opal_addr_valid(buffer))
+ return OPAL_PARAMETER;
+
+ l = be64_to_cpu(*length);
+ l = read(0, buffer, l);
+ *length = cpu_to_be64(l);
+ opal_update_pending_evt(OPAL_EVENT_CONSOLE_INPUT, 0);
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t dummy_console_flush(int64_t term_number __unused)
+{
+ return OPAL_UNSUPPORTED;
+}
+
+static void dummy_console_poll(void *data __unused)
+{
+ bool has_data = false;
+
+ lock(&con_lock);
+ if (con_driver && con_driver->poll_read)
+ has_data = con_driver->poll_read();
+ if (memcons.in_prod != memcons.in_cons)
+ has_data = true;
+ if (has_data)
+ opal_update_pending_evt(OPAL_EVENT_CONSOLE_INPUT,
+ OPAL_EVENT_CONSOLE_INPUT);
+ else
+ opal_update_pending_evt(OPAL_EVENT_CONSOLE_INPUT, 0);
+ unlock(&con_lock);
+}
+
+void dummy_console_add_nodes(void)
+{
+ struct dt_property *p;
+
+ add_opal_console_node(0, "raw", be32_to_cpu(memcons.obuf_size));
+
+ /* Mambo might have left a crap one, clear it */
+ p = __dt_find_property(dt_chosen, "linux,stdout-path");
+ if (p)
+ dt_del_property(dt_chosen, p);
+
+ dt_add_property_string(dt_chosen, "linux,stdout-path",
+ "/ibm,opal/consoles/serial@0");
+
+ opal_add_poller(dummy_console_poll, NULL);
+}
+
+struct opal_con_ops dummy_opal_con = {
+ .name = "Dummy Console",
+ .init = dummy_console_add_nodes,
+ .read = dummy_console_read,
+ .write = dummy_console_write,
+ .space = dummy_console_write_buffer_space,
+ .flush = dummy_console_flush,
+};
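Note: an internal backend plugs in through set_console(). Only the hooks this file actually invokes are sketched below; the exact struct con_ops field signatures live in console.h, and my_uart_putc() is a purely hypothetical hardware accessor:

    /* Hedged sketch of an internal console backend. */
    static size_t my_con_write(const char *buf, size_t len)
    {
        size_t i;

        for (i = 0; i < len; i++)
            my_uart_putc(buf[i]);       /* hypothetical hardware accessor */
        return i;                       /* bytes actually accepted */
    }

    static struct con_ops my_con_ops = {
        .write = my_con_write,
    };

    /* During platform probe: */
    set_console(&my_con_ops);           /* also flushes the memory console */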
diff --git a/roms/skiboot/core/cpu.c b/roms/skiboot/core/cpu.c
new file mode 100644
index 000000000..f58aeb27a
--- /dev/null
+++ b/roms/skiboot/core/cpu.c
@@ -0,0 +1,1785 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Code to manage and manipulate CPUs
+ *
+ * Copyright 2013-2019 IBM Corp.
+ */
+
+#include <skiboot.h>
+#include <cpu.h>
+#include <device.h>
+#include <mem_region.h>
+#include <opal.h>
+#include <stack.h>
+#include <trace.h>
+#include <affinity.h>
+#include <chip.h>
+#include <timebase.h>
+#include <interrupts.h>
+#include <ccan/str/str.h>
+#include <ccan/container_of/container_of.h>
+#include <xscom.h>
+
+/* The cpu_threads array is static and indexed by PIR in
+ * order to speed up lookup from asm entry points
+ */
+struct cpu_stack {
+ union {
+ uint8_t stack[STACK_SIZE];
+ struct cpu_thread cpu;
+ };
+} __align(STACK_SIZE);
+
+static struct cpu_stack * const cpu_stacks = (struct cpu_stack *)CPU_STACKS_BASE;
+unsigned int cpu_thread_count;
+unsigned int cpu_max_pir;
+struct cpu_thread *boot_cpu;
+static struct lock reinit_lock = LOCK_UNLOCKED;
+static bool hile_supported;
+static bool radix_supported;
+static unsigned long hid0_hile;
+static unsigned long hid0_attn;
+static bool sreset_enabled;
+static bool ipi_enabled;
+static bool pm_enabled;
+static bool current_hile_mode = HAVE_LITTLE_ENDIAN;
+static bool current_radix_mode = true;
+static bool tm_suspend_enabled;
+
+unsigned long cpu_secondary_start __force_data = 0;
+
+struct cpu_job {
+ struct list_node link;
+ void (*func)(void *data);
+ void *data;
+ const char *name;
+ bool complete;
+ bool no_return;
+};
+
+/* attribute const as cpu_stacks is constant. */
+unsigned long __attrconst cpu_stack_bottom(unsigned int pir)
+{
+ return ((unsigned long)&cpu_stacks[pir]) +
+ sizeof(struct cpu_thread) + STACK_SAFETY_GAP;
+}
+
+unsigned long __attrconst cpu_stack_top(unsigned int pir)
+{
+ /* This is the top of the normal stack. */
+ return ((unsigned long)&cpu_stacks[pir]) +
+ NORMAL_STACK_SIZE - STACK_TOP_GAP;
+}
+
+unsigned long __attrconst cpu_emergency_stack_top(unsigned int pir)
+{
+ /* This is the top of the emergency stack, above the normal stack. */
+ return ((unsigned long)&cpu_stacks[pir]) +
+ NORMAL_STACK_SIZE + EMERGENCY_STACK_SIZE - STACK_TOP_GAP;
+}
+
+void __nomcount cpu_relax(void)
+{
+ /* Relax a bit to give sibling threads some breathing space */
+ smt_lowest();
+ asm volatile("nop; nop; nop; nop;\n"
+ "nop; nop; nop; nop;\n"
+ "nop; nop; nop; nop;\n"
+ "nop; nop; nop; nop;\n");
+ smt_medium();
+ barrier();
+}
+
+static void cpu_wake(struct cpu_thread *cpu)
+{
+ /* Is it idle ? If not, no need to wake */
+ sync();
+ if (!cpu->in_idle)
+ return;
+
+ if (proc_gen == proc_gen_p8) {
+ /* Poke IPI */
+ icp_kick_cpu(cpu);
+ } else if (proc_gen == proc_gen_p9 || proc_gen == proc_gen_p10) {
+ p9_dbell_send(cpu->pir);
+ }
+}
+
+/*
+ * If chip_id is >= 0, schedule the job on that node.
+ * Otherwise schedule the job anywhere.
+ */
+static struct cpu_thread *cpu_find_job_target(int32_t chip_id)
+{
+ struct cpu_thread *cpu, *best, *me = this_cpu();
+ uint32_t best_count;
+
+ /* We try to find a target to run a job. We need to avoid
+ * a CPU that has a "no return" job on its queue as it might
+ * never be able to process anything.
+ *
+ * Additionally we don't check the list but the job count
+ * on the target CPUs, since that is decremented *after*
+ * a job has been completed.
+ */
+
+
+ /* First we scan all available primary threads
+ */
+ for_each_available_cpu(cpu) {
+ if (chip_id >= 0 && cpu->chip_id != chip_id)
+ continue;
+ if (cpu == me || !cpu_is_thread0(cpu) || cpu->job_has_no_return)
+ continue;
+ if (cpu->job_count)
+ continue;
+ lock(&cpu->job_lock);
+ if (!cpu->job_count)
+ return cpu;
+ unlock(&cpu->job_lock);
+ }
+
+ /* Now try again with secondary threads included and keep
+ * track of the one with the fewest jobs queued up. This is
+ * done in a racy way, but it's just an optimization in case
+ * we are overcommitted on jobs. We could also just pick
+ * a random one...
+ */
+ best = NULL;
+ best_count = -1u;
+ for_each_available_cpu(cpu) {
+ if (chip_id >= 0 && cpu->chip_id != chip_id)
+ continue;
+ if (cpu == me || cpu->job_has_no_return)
+ continue;
+ if (!best || cpu->job_count < best_count) {
+ best = cpu;
+ best_count = cpu->job_count;
+ }
+ if (cpu->job_count)
+ continue;
+ lock(&cpu->job_lock);
+ if (!cpu->job_count)
+ return cpu;
+ unlock(&cpu->job_lock);
+ }
+
+ /* We haven't found anybody, do we have a bestie ? */
+ if (best) {
+ lock(&best->job_lock);
+ return best;
+ }
+
+ /* Go away */
+ return NULL;
+}
+
+/* job_lock is held, returns with it released */
+static void queue_job_on_cpu(struct cpu_thread *cpu, struct cpu_job *job)
+{
+ /* That's bad, the job will never run */
+ if (cpu->job_has_no_return) {
+ prlog(PR_WARNING, "WARNING ! Job %s scheduled on CPU 0x%x"
+ " which has a no-return job on its queue !\n",
+ job->name, cpu->pir);
+ backtrace();
+ }
+ list_add_tail(&cpu->job_queue, &job->link);
+ if (job->no_return)
+ cpu->job_has_no_return = true;
+ else
+ cpu->job_count++;
+ if (pm_enabled)
+ cpu_wake(cpu);
+ unlock(&cpu->job_lock);
+}
+
+struct cpu_job *__cpu_queue_job(struct cpu_thread *cpu,
+ const char *name,
+ void (*func)(void *data), void *data,
+ bool no_return)
+{
+ struct cpu_job *job;
+
+#ifdef DEBUG_SERIALIZE_CPU_JOBS
+ if (cpu == NULL)
+ cpu = this_cpu();
+#endif
+
+ if (cpu && !cpu_is_available(cpu)) {
+ prerror("CPU: Tried to queue job on unavailable CPU 0x%04x\n",
+ cpu->pir);
+ return NULL;
+ }
+
+ job = zalloc(sizeof(struct cpu_job));
+ if (!job)
+ return NULL;
+ job->func = func;
+ job->data = data;
+ job->name = name;
+ job->complete = false;
+ job->no_return = no_return;
+
+ /* Pick a candidate. Returns with target queue locked */
+ if (cpu == NULL)
+ cpu = cpu_find_job_target(-1);
+ else if (cpu != this_cpu())
+ lock(&cpu->job_lock);
+ else
+ cpu = NULL;
+
+ /* Can't be scheduled, run it now */
+ if (cpu == NULL) {
+ if (!this_cpu()->job_has_no_return)
+ this_cpu()->job_has_no_return = no_return;
+ func(data);
+ job->complete = true;
+ return job;
+ }
+
+ queue_job_on_cpu(cpu, job);
+
+ return job;
+}
+
+struct cpu_job *cpu_queue_job_on_node(uint32_t chip_id,
+ const char *name,
+ void (*func)(void *data), void *data)
+{
+ struct cpu_thread *cpu;
+ struct cpu_job *job;
+
+ job = zalloc(sizeof(struct cpu_job));
+ if (!job)
+ return NULL;
+ job->func = func;
+ job->data = data;
+ job->name = name;
+ job->complete = false;
+ job->no_return = false;
+
+ /* Pick a candidate. Returns with target queue locked */
+ cpu = cpu_find_job_target(chip_id);
+
+ /* Can't be scheduled... */
+ if (cpu == NULL) {
+ cpu = this_cpu();
+ if (cpu->chip_id == chip_id) {
+ /* Run it now if we're the right node. */
+ func(data);
+ job->complete = true;
+ return job;
+ }
+ /* Otherwise fail. */
+ free(job);
+ return NULL;
+ }
+
+ queue_job_on_cpu(cpu, job);
+
+ return job;
+}
+
+bool cpu_poll_job(struct cpu_job *job)
+{
+ lwsync();
+ return job->complete;
+}
+
+void cpu_wait_job(struct cpu_job *job, bool free_it)
+{
+ unsigned long time_waited = 0;
+
+ if (!job)
+ return;
+
+ while (!job->complete) {
+ /* This will call OPAL pollers for us */
+ time_wait_ms(10);
+ time_waited += 10;
+ lwsync();
+ if ((time_waited % 30000) == 0) {
+ prlog(PR_INFO, "cpu_wait_job(%s) for %lums\n",
+ job->name, time_waited);
+ backtrace();
+ }
+ }
+ lwsync();
+
+ if (time_waited > 1000)
+ prlog(PR_DEBUG, "cpu_wait_job(%s) for %lums\n",
+ job->name, time_waited);
+
+ if (free_it)
+ free(job);
+}
+
+bool cpu_check_jobs(struct cpu_thread *cpu)
+{
+ return !list_empty_nocheck(&cpu->job_queue);
+}
+
+void cpu_process_jobs(void)
+{
+ struct cpu_thread *cpu = this_cpu();
+ struct cpu_job *job = NULL;
+ void (*func)(void *);
+ void *data;
+
+ sync();
+ if (!cpu_check_jobs(cpu))
+ return;
+
+ lock(&cpu->job_lock);
+ while (true) {
+ bool no_return;
+
+ job = list_pop(&cpu->job_queue, struct cpu_job, link);
+ if (!job)
+ break;
+
+ func = job->func;
+ data = job->data;
+ no_return = job->no_return;
+ unlock(&cpu->job_lock);
+ prlog(PR_TRACE, "running job %s on %x\n", job->name, cpu->pir);
+ if (no_return)
+ free(job);
+ func(data);
+ if (!list_empty(&cpu->locks_held)) {
+ if (no_return)
+ prlog(PR_ERR, "OPAL no-return job returned with"
+ "locks held!\n");
+ else
+ prlog(PR_ERR, "OPAL job %s returning with locks held\n",
+ job->name);
+ drop_my_locks(true);
+ }
+ lock(&cpu->job_lock);
+ if (!no_return) {
+ cpu->job_count--;
+ lwsync();
+ job->complete = true;
+ }
+ }
+ unlock(&cpu->job_lock);
+}
+
+enum cpu_wake_cause {
+ cpu_wake_on_job,
+ cpu_wake_on_dec,
+};
+
+static unsigned int cpu_idle_p8(enum cpu_wake_cause wake_on)
+{
+ uint64_t lpcr = mfspr(SPR_LPCR) & ~SPR_LPCR_P8_PECE;
+ struct cpu_thread *cpu = this_cpu();
+ unsigned int vec = 0;
+
+ if (!pm_enabled) {
+ prlog_once(PR_DEBUG, "cpu_idle_p8 called pm disabled\n");
+ return vec;
+ }
+
+ /* Clean up ICP, be ready for IPIs */
+ icp_prep_for_pm();
+
+ /* Synchronize with wakers */
+ if (wake_on == cpu_wake_on_job) {
+ /* Mark ourselves in idle so other CPUs know to send an IPI */
+ cpu->in_idle = true;
+ sync();
+
+ /* Check for jobs again */
+ if (cpu_check_jobs(cpu) || !pm_enabled)
+ goto skip_sleep;
+
+ /* Set up wakeup cause in LPCR: EE (for IPI) */
+ lpcr |= SPR_LPCR_P8_PECE2;
+ mtspr(SPR_LPCR, lpcr);
+
+ } else {
+ /* Mark ourselves sleeping so cpu_set_pm_enable knows to
+ * send an IPI
+ */
+ cpu->in_sleep = true;
+ sync();
+
+ /* Check if PM got disabled */
+ if (!pm_enabled)
+ goto skip_sleep;
+
+ /* EE and DEC */
+ lpcr |= SPR_LPCR_P8_PECE2 | SPR_LPCR_P8_PECE3;
+ mtspr(SPR_LPCR, lpcr);
+ }
+ isync();
+
+ /* Enter nap */
+ vec = enter_p8_pm_state(false);
+
+skip_sleep:
+ /* Restore */
+ sync();
+ cpu->in_idle = false;
+ cpu->in_sleep = false;
+ reset_cpu_icp();
+
+ return vec;
+}
+
+static unsigned int cpu_idle_p9(enum cpu_wake_cause wake_on)
+{
+ uint64_t lpcr = mfspr(SPR_LPCR) & ~SPR_LPCR_P9_PECE;
+ uint64_t psscr;
+ struct cpu_thread *cpu = this_cpu();
+ unsigned int vec = 0;
+
+ if (!pm_enabled) {
+ prlog(PR_DEBUG, "cpu_idle_p9 called on cpu 0x%04x with pm disabled\n", cpu->pir);
+ return vec;
+ }
+
+ /* Synchronize with wakers */
+ if (wake_on == cpu_wake_on_job) {
+ /* Mark ourselves in idle so other CPUs know to send an IPI */
+ cpu->in_idle = true;
+ sync();
+
+ /* Check for jobs again */
+ if (cpu_check_jobs(cpu) || !pm_enabled)
+ goto skip_sleep;
+
+ /* HV DBELL for IPI */
+ lpcr |= SPR_LPCR_P9_PECEL1;
+ } else {
+ /* Mark ourselves sleeping so cpu_set_pm_enable knows to
+ * send an IPI
+ */
+ cpu->in_sleep = true;
+ sync();
+
+ /* Check if PM got disabled */
+ if (!pm_enabled)
+ goto skip_sleep;
+
+ /* HV DBELL and DEC */
+ lpcr |= SPR_LPCR_P9_PECEL1 | SPR_LPCR_P9_PECEL3;
+ }
+
+ mtspr(SPR_LPCR, lpcr);
+ isync();
+
+ if (sreset_enabled) {
+ /* stop with EC=1 (sreset) and ESL=1 (enable thread switch). */
+ /* PSSCR SD=0 ESL=1 EC=1 PSSL=0 TR=3 MTL=0 RL=1 */
+ psscr = PPC_BIT(42) | PPC_BIT(43) |
+ PPC_BITMASK(54, 55) | PPC_BIT(63);
+ vec = enter_p9_pm_state(psscr);
+ } else {
+ /* stop with EC=0 (resumes) which does not require sreset. */
+ /* PSSCR SD=0 ESL=0 EC=0 PSSL=0 TR=3 MTL=0 RL=1 */
+ psscr = PPC_BITMASK(54, 55) | PPC_BIT(63);
+ enter_p9_pm_lite_state(psscr);
+ }
+
+ /* Clear doorbell */
+ p9_dbell_receive();
+
+ skip_sleep:
+ /* Restore */
+ sync();
+ cpu->in_idle = false;
+ cpu->in_sleep = false;
+
+ return vec;
+}
+
+static void cpu_idle_pm(enum cpu_wake_cause wake_on)
+{
+ unsigned int vec;
+
+ switch(proc_gen) {
+ case proc_gen_p8:
+ vec = cpu_idle_p8(wake_on);
+ break;
+ case proc_gen_p9:
+ vec = cpu_idle_p9(wake_on);
+ break;
+ case proc_gen_p10:
+ vec = cpu_idle_p9(wake_on);
+ break;
+ default:
+ vec = 0;
+ prlog_once(PR_DEBUG, "cpu_idle_pm called with bad processor type\n");
+ break;
+ }
+
+ if (vec == 0x100) {
+ unsigned long srr1 = mfspr(SPR_SRR1);
+
+ switch (srr1 & SPR_SRR1_PM_WAKE_MASK) {
+ case SPR_SRR1_PM_WAKE_SRESET:
+ exception_entry_pm_sreset();
+ break;
+ default:
+ break;
+ }
+ mtmsrd(MSR_RI, 1);
+
+ } else if (vec == 0x200) {
+ exception_entry_pm_mce();
+ enable_machine_check();
+ mtmsrd(MSR_RI, 1);
+ }
+}
+
+void cpu_idle_job(void)
+{
+ if (pm_enabled) {
+ cpu_idle_pm(cpu_wake_on_job);
+ } else {
+ struct cpu_thread *cpu = this_cpu();
+
+ smt_lowest();
+ /* Check for jobs again */
+ while (!cpu_check_jobs(cpu)) {
+ if (pm_enabled)
+ break;
+ cpu_relax();
+ barrier();
+ }
+ smt_medium();
+ }
+}
+
+void cpu_idle_delay(unsigned long delay)
+{
+ unsigned long now = mftb();
+ unsigned long end = now + delay;
+ unsigned long min_pm = usecs_to_tb(10);
+
+ if (pm_enabled && delay > min_pm) {
+pm:
+ for (;;) {
+ if (delay >= 0x7fffffff)
+ delay = 0x7fffffff;
+ mtspr(SPR_DEC, delay);
+
+ cpu_idle_pm(cpu_wake_on_dec);
+
+ now = mftb();
+ if (tb_compare(now, end) == TB_AAFTERB)
+ break;
+ delay = end - now;
+ if (!(pm_enabled && delay > min_pm))
+ goto no_pm;
+ }
+ } else {
+no_pm:
+ smt_lowest();
+ for (;;) {
+ now = mftb();
+ if (tb_compare(now, end) == TB_AAFTERB)
+ break;
+ delay = end - now;
+ if (pm_enabled && delay > min_pm) {
+ smt_medium();
+ goto pm;
+ }
+ }
+ smt_medium();
+ }
+}
+
+static void cpu_pm_disable(void)
+{
+ struct cpu_thread *cpu;
+ unsigned int timeout;
+
+ pm_enabled = false;
+ sync();
+
+ if (proc_gen == proc_gen_p8) {
+ for_each_available_cpu(cpu) {
+ while (cpu->in_sleep || cpu->in_idle) {
+ icp_kick_cpu(cpu);
+ cpu_relax();
+ }
+ }
+ } else if (proc_gen == proc_gen_p9 || proc_gen == proc_gen_p10) {
+ for_each_available_cpu(cpu) {
+ if (cpu->in_sleep || cpu->in_idle)
+ p9_dbell_send(cpu->pir);
+ }
+
+ /* This code is racy with cpus entering idle, late ones miss the dbell */
+
+ smt_lowest();
+ for_each_available_cpu(cpu) {
+ timeout = 0x08000000;
+ while ((cpu->in_sleep || cpu->in_idle) && --timeout)
+ barrier();
+ if (!timeout) {
+ prlog(PR_DEBUG, "cpu_pm_disable TIMEOUT on cpu 0x%04x to exit idle\n",
+ cpu->pir);
+ p9_dbell_send(cpu->pir);
+ }
+ }
+ smt_medium();
+ }
+}
+
+void cpu_set_sreset_enable(bool enabled)
+{
+ if (sreset_enabled == enabled)
+ return;
+
+ if (proc_gen == proc_gen_p8) {
+ /* Public P8 Mambo has broken NAP */
+ if (chip_quirk(QUIRK_MAMBO_CALLOUTS))
+ return;
+
+ sreset_enabled = enabled;
+ sync();
+
+ if (!enabled) {
+ cpu_pm_disable();
+ } else {
+ if (ipi_enabled)
+ pm_enabled = true;
+ }
+
+ } else if (proc_gen == proc_gen_p9 || proc_gen == proc_gen_p10) {
+ sreset_enabled = enabled;
+ sync();
+ /*
+ * Kick everybody out of PM so they can adjust the PM
+ * mode they are using (EC=0/1).
+ */
+ cpu_pm_disable();
+ if (ipi_enabled)
+ pm_enabled = true;
+ }
+}
+
+void cpu_set_ipi_enable(bool enabled)
+{
+ if (ipi_enabled == enabled)
+ return;
+
+ if (proc_gen == proc_gen_p8) {
+ ipi_enabled = enabled;
+ sync();
+ if (!enabled) {
+ cpu_pm_disable();
+ } else {
+ if (sreset_enabled)
+ pm_enabled = true;
+ }
+
+ } else if (proc_gen == proc_gen_p9 || proc_gen == proc_gen_p10) {
+ ipi_enabled = enabled;
+ sync();
+ if (!enabled)
+ cpu_pm_disable();
+ else
+ pm_enabled = true;
+ }
+}
+
+void cpu_process_local_jobs(void)
+{
+ struct cpu_thread *cpu = first_available_cpu();
+
+ while (cpu) {
+ if (cpu != this_cpu())
+ return;
+
+ cpu = next_available_cpu(cpu);
+ }
+
+ if (!cpu)
+ cpu = first_available_cpu();
+
+ /* No CPU to run on, just run synchro */
+ if (cpu == this_cpu()) {
+ prlog_once(PR_DEBUG, "Processing jobs synchronously\n");
+ cpu_process_jobs();
+ opal_run_pollers();
+ }
+}
+
+
+struct dt_node *get_cpu_node(u32 pir)
+{
+ struct cpu_thread *t = find_cpu_by_pir(pir);
+
+ return t ? t->node : NULL;
+}
+
+/* This only covers primary, active cpus */
+struct cpu_thread *find_cpu_by_chip_id(u32 chip_id)
+{
+ struct cpu_thread *t;
+
+ for_each_available_cpu(t) {
+ if (t->is_secondary)
+ continue;
+ if (t->chip_id == chip_id)
+ return t;
+ }
+ return NULL;
+}
+
+struct cpu_thread *find_cpu_by_node(struct dt_node *cpu)
+{
+ struct cpu_thread *t;
+
+ for_each_available_cpu(t) {
+ if (t->node == cpu)
+ return t;
+ }
+ return NULL;
+}
+
+struct cpu_thread *find_cpu_by_pir(u32 pir)
+{
+ if (pir > cpu_max_pir)
+ return NULL;
+ return &cpu_stacks[pir].cpu;
+}
+
+struct cpu_thread __nomcount *find_cpu_by_pir_nomcount(u32 pir)
+{
+ if (pir > cpu_max_pir)
+ return NULL;
+ return &cpu_stacks[pir].cpu;
+}
+
+struct cpu_thread *find_cpu_by_server(u32 server_no)
+{
+ struct cpu_thread *t;
+
+ for_each_cpu(t) {
+ if (t->server_no == server_no)
+ return t;
+ }
+ return NULL;
+}
+
+struct cpu_thread *next_cpu(struct cpu_thread *cpu)
+{
+ struct cpu_stack *s;
+ unsigned int index = 0;
+
+ if (cpu != NULL) {
+ s = container_of(cpu, struct cpu_stack, cpu);
+ index = s - cpu_stacks + 1;
+ }
+ for (; index <= cpu_max_pir; index++) {
+ cpu = &cpu_stacks[index].cpu;
+ if (cpu->state != cpu_state_no_cpu)
+ return cpu;
+ }
+ return NULL;
+}
+
+struct cpu_thread *first_cpu(void)
+{
+ return next_cpu(NULL);
+}
+
+struct cpu_thread *next_available_cpu(struct cpu_thread *cpu)
+{
+ do {
+ cpu = next_cpu(cpu);
+ } while(cpu && !cpu_is_available(cpu));
+
+ return cpu;
+}
+
+struct cpu_thread *first_available_cpu(void)
+{
+ return next_available_cpu(NULL);
+}
+
+struct cpu_thread *next_present_cpu(struct cpu_thread *cpu)
+{
+ do {
+ cpu = next_cpu(cpu);
+ } while(cpu && !cpu_is_present(cpu));
+
+ return cpu;
+}
+
+struct cpu_thread *first_present_cpu(void)
+{
+ return next_present_cpu(NULL);
+}
+
+struct cpu_thread *next_ungarded_cpu(struct cpu_thread *cpu)
+{
+ do {
+ cpu = next_cpu(cpu);
+ } while(cpu && cpu->state == cpu_state_unavailable);
+
+ return cpu;
+}
+
+struct cpu_thread *first_ungarded_cpu(void)
+{
+ return next_ungarded_cpu(NULL);
+}
+
+struct cpu_thread *next_ungarded_primary(struct cpu_thread *cpu)
+{
+ do {
+ cpu = next_ungarded_cpu(cpu);
+ } while (cpu && !(cpu == cpu->primary || cpu == cpu->ec_primary));
+
+ return cpu;
+}
+
+struct cpu_thread *first_ungarded_primary(void)
+{
+ return next_ungarded_primary(NULL);
+}
+
+u8 get_available_nr_cores_in_chip(u32 chip_id)
+{
+ struct cpu_thread *core;
+ u8 nr_cores = 0;
+
+ for_each_available_core_in_chip(core, chip_id)
+ nr_cores++;
+
+ return nr_cores;
+}
+
+struct cpu_thread *next_available_core_in_chip(struct cpu_thread *core,
+ u32 chip_id)
+{
+ do {
+ core = next_cpu(core);
+ } while(core && (!cpu_is_available(core) ||
+ core->chip_id != chip_id ||
+ core->is_secondary));
+ return core;
+}
+
+struct cpu_thread *first_available_core_in_chip(u32 chip_id)
+{
+ return next_available_core_in_chip(NULL, chip_id);
+}
+
+uint32_t cpu_get_core_index(struct cpu_thread *cpu)
+{
+ return pir_to_fused_core_id(cpu->pir);
+}
+
+void cpu_remove_node(const struct cpu_thread *t)
+{
+ struct dt_node *i;
+
+ /* Find this cpu node */
+ dt_for_each_node(dt_root, i) {
+ const struct dt_property *p;
+
+ if (!dt_has_node_property(i, "device_type", "cpu"))
+ continue;
+ p = dt_find_property(i, "ibm,pir");
+ if (!p)
+ continue;
+ if (dt_property_get_cell(p, 0) == t->pir) {
+ dt_free(i);
+ return;
+ }
+ }
+ prerror("CPU: Could not find cpu node %i to remove!\n", t->pir);
+ abort();
+}
+
+void cpu_disable_all_threads(struct cpu_thread *cpu)
+{
+ unsigned int i;
+ struct dt_property *p;
+
+ for (i = 0; i <= cpu_max_pir; i++) {
+ struct cpu_thread *t = &cpu_stacks[i].cpu;
+
+ if (t->primary == cpu->primary)
+ t->state = cpu_state_disabled;
+
+ }
+
+ /* Mark this core as bad so that the Linux kernel doesn't use this CPU. */
+ prlog(PR_DEBUG, "CPU: Mark CPU bad (PIR 0x%04x)...\n", cpu->pir);
+ p = __dt_find_property(cpu->node, "status");
+ if (p)
+ dt_del_property(cpu->node, p);
+
+ dt_add_property_string(cpu->node, "status", "bad");
+
+ /* XXX Do something to actually stop the core */
+}
+
+static void init_cpu_thread(struct cpu_thread *t,
+ enum cpu_thread_state state,
+ unsigned int pir)
+{
+ /* offset within cpu_thread to prevent stack_guard clobber */
+ const size_t guard_skip = container_off_var(t, stack_guard) +
+ sizeof(t->stack_guard);
+
+ memset(((void *)t) + guard_skip, 0, sizeof(struct cpu_thread) - guard_skip);
+ init_lock(&t->dctl_lock);
+ init_lock(&t->job_lock);
+ list_head_init(&t->job_queue);
+ list_head_init(&t->locks_held);
+ t->stack_guard = STACK_CHECK_GUARD_BASE ^ pir;
+ t->state = state;
+ t->pir = pir;
+#ifdef STACK_CHECK_ENABLED
+ t->stack_bot_mark = LONG_MAX;
+#endif
+ t->is_fused_core = is_fused_core(mfspr(SPR_PVR));
+ assert(pir == container_of(t, struct cpu_stack, cpu) - cpu_stacks);
+}
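+
+/*
+ * The memset above intentionally starts just past stack_guard: every field
+ * after the guard word is zeroed while the guard itself is only rewritten
+ * explicitly (STACK_CHECK_GUARD_BASE ^ pir). pre_init_boot_cpu() below
+ * relies on the same layout when it skips the first 8 bytes of the struct.
+ */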
+
+static void enable_attn(void)
+{
+ unsigned long hid0;
+
+ hid0 = mfspr(SPR_HID0);
+ hid0 |= hid0_attn;
+ set_hid0(hid0);
+}
+
+static void disable_attn(void)
+{
+ unsigned long hid0;
+
+ hid0 = mfspr(SPR_HID0);
+ hid0 &= ~hid0_attn;
+ set_hid0(hid0);
+}
+
+extern void __trigger_attn(void);
+void trigger_attn(void)
+{
+ enable_attn();
+ __trigger_attn();
+}
+
+static void init_hid(void)
+{
+ /* attn is enabled even when HV=0, so make sure it's off */
+ disable_attn();
+}
+
+void __nomcount pre_init_boot_cpu(void)
+{
+ struct cpu_thread *cpu = this_cpu();
+
+ /* We skip the stack guard ! */
+ memset(((void *)cpu) + 8, 0, sizeof(struct cpu_thread) - 8);
+}
+
+void init_boot_cpu(void)
+{
+ unsigned int pir, pvr;
+
+ pir = mfspr(SPR_PIR);
+ pvr = mfspr(SPR_PVR);
+
+ /* Get CPU family and other flags based on PVR */
+ switch(PVR_TYPE(pvr)) {
+ case PVR_TYPE_P8E:
+ case PVR_TYPE_P8:
+ proc_gen = proc_gen_p8;
+ hile_supported = PVR_VERS_MAJ(mfspr(SPR_PVR)) >= 2;
+ hid0_hile = SPR_HID0_POWER8_HILE;
+ hid0_attn = SPR_HID0_POWER8_ENABLE_ATTN;
+ break;
+ case PVR_TYPE_P8NVL:
+ proc_gen = proc_gen_p8;
+ hile_supported = true;
+ hid0_hile = SPR_HID0_POWER8_HILE;
+ hid0_attn = SPR_HID0_POWER8_ENABLE_ATTN;
+ break;
+ case PVR_TYPE_P9:
+ case PVR_TYPE_P9P:
+ proc_gen = proc_gen_p9;
+ hile_supported = true;
+ radix_supported = true;
+ hid0_hile = SPR_HID0_POWER9_HILE;
+ hid0_attn = SPR_HID0_POWER9_ENABLE_ATTN;
+ break;
+ case PVR_TYPE_P10:
+ proc_gen = proc_gen_p10;
+ hile_supported = true;
+ radix_supported = true;
+ hid0_hile = SPR_HID0_POWER10_HILE;
+ hid0_attn = SPR_HID0_POWER10_ENABLE_ATTN;
+ break;
+ default:
+ proc_gen = proc_gen_unknown;
+ }
+
+ /* Get a CPU thread count based on family */
+ switch(proc_gen) {
+ case proc_gen_p8:
+ cpu_thread_count = 8;
+ prlog(PR_INFO, "CPU: P8 generation processor"
+ " (max %d threads/core)\n", cpu_thread_count);
+ break;
+ case proc_gen_p9:
+ if (is_fused_core(pvr))
+ cpu_thread_count = 8;
+ else
+ cpu_thread_count = 4;
+ prlog(PR_INFO, "CPU: P9 generation processor"
+ " (max %d threads/core)\n", cpu_thread_count);
+ break;
+ case proc_gen_p10:
+ if (is_fused_core(pvr))
+ cpu_thread_count = 8;
+ else
+ cpu_thread_count = 4;
+ prlog(PR_INFO, "CPU: P10 generation processor"
+ " (max %d threads/core)\n", cpu_thread_count);
+ break;
+ default:
+ prerror("CPU: Unknown PVR, assuming 1 thread\n");
+ cpu_thread_count = 1;
+ }
+
+ if (is_power9n(pvr) && (PVR_VERS_MAJ(pvr) == 1)) {
+ prerror("CPU: POWER9N DD1 is not supported\n");
+ abort();
+ }
+
+ prlog(PR_DEBUG, "CPU: Boot CPU PIR is 0x%04x PVR is 0x%08x\n",
+ pir, pvr);
+
+ /*
+ * Adjust top of RAM to include the boot CPU stack. If we have less
+ * RAM than this, it's not possible to boot.
+ */
+ cpu_max_pir = pir;
+ top_of_ram += (cpu_max_pir + 1) * STACK_SIZE;
+
+ /* Setup boot CPU state */
+ boot_cpu = &cpu_stacks[pir].cpu;
+ init_cpu_thread(boot_cpu, cpu_state_active, pir);
+ init_boot_tracebuf(boot_cpu);
+ assert(this_cpu() == boot_cpu);
+ init_hid();
+}
+
+static void enable_large_dec(bool on)
+{
+ u64 lpcr = mfspr(SPR_LPCR);
+
+ if (on)
+ lpcr |= SPR_LPCR_P9_LD;
+ else
+ lpcr &= ~SPR_LPCR_P9_LD;
+
+ mtspr(SPR_LPCR, lpcr);
+ isync();
+}
+
+#define HIGH_BIT (1ull << 63)
+
+static int find_dec_bits(void)
+{
+ int bits = 65; /* we always decrement once */
+ u64 mask = ~0ull;
+
+ if (proc_gen < proc_gen_p9)
+ return 32;
+
+ /* The ISA doesn't specify the width of the decrementer register so we
+ * need to discover it. When in large mode (LPCR.LD = 1) reads from the
+ * DEC SPR are sign extended to 64 bits and writes are truncated to the
+ * physical register width. We can use this behaviour to detect the
+ * width by starting from an all 1s value and left shifting until we
+ * read a value from the DEC with its high bit cleared.
+ */
+
+ enable_large_dec(true);
+
+ do {
+ bits--;
+ mask = mask >> 1;
+ mtspr(SPR_DEC, mask);
+ } while (mfspr(SPR_DEC) & HIGH_BIT);
+
+ enable_large_dec(false);
+
+ prlog(PR_DEBUG, "CPU: decrementer bits %d\n", bits);
+ return bits;
+}
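+
+/*
+ * Worked example of the loop above: with a 56-bit decrementer the write of
+ * "mask" is truncated to 56 bits, and the sign-extended read only loses the
+ * high bit once mask has been shifted right 64 - 56 + 1 = 9 times, leaving
+ * bits == 65 - 9 == 56.
+ */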
+
+static void init_tm_suspend_mode_property(void)
+{
+ struct dt_node *node;
+
+ /* If we don't find anything, assume TM suspend is enabled */
+ tm_suspend_enabled = true;
+
+ node = dt_find_by_path(dt_root, "/ibm,opal/fw-features/tm-suspend-mode");
+ if (!node)
+ return;
+
+ if (dt_find_property(node, "disabled"))
+ tm_suspend_enabled = false;
+}
+
+void init_cpu_max_pir(void)
+{
+ struct dt_node *cpus, *cpu;
+
+ cpus = dt_find_by_path(dt_root, "/cpus");
+ assert(cpus);
+
+ /* Iterate all CPUs in the device-tree */
+ dt_for_each_child(cpus, cpu) {
+ unsigned int pir, server_no;
+
+ /* Skip cache nodes */
+ if (strcmp(dt_prop_get(cpu, "device_type"), "cpu"))
+ continue;
+
+ server_no = dt_prop_get_u32(cpu, "reg");
+
+ /* If PIR property is absent, assume it's the same as the
+ * server number
+ */
+ pir = dt_prop_get_u32_def(cpu, "ibm,pir", server_no);
+
+ if (cpu_max_pir < pir + cpu_thread_count - 1)
+ cpu_max_pir = pir + cpu_thread_count - 1;
+ }
+
+ prlog(PR_DEBUG, "CPU: New max PIR set to 0x%x\n", cpu_max_pir);
+}
+
+/*
+ * Set cpu->state to cpu_state_no_cpu for all secondaries before the dt is
+ * parsed; they will be flipped to present as populated CPUs are found.
+ *
+ * Some configurations (e.g., with memory encryption) will not zero system
+ * memory at boot, so we can't rely on cpu->state being zero (== cpu_state_no_cpu).
+ */
+static void mark_all_secondary_cpus_absent(void)
+{
+ unsigned int pir;
+ struct cpu_thread *cpu;
+
+ for (pir = 0; pir <= cpu_max_pir; pir++) {
+ cpu = &cpu_stacks[pir].cpu;
+ if (cpu == boot_cpu)
+ continue;
+ cpu->state = cpu_state_no_cpu;
+ }
+}
+
+void init_all_cpus(void)
+{
+ struct dt_node *cpus, *cpu;
+ unsigned int pir, thread;
+ int dec_bits = find_dec_bits();
+
+ cpus = dt_find_by_path(dt_root, "/cpus");
+ assert(cpus);
+
+ init_tm_suspend_mode_property();
+
+ mark_all_secondary_cpus_absent();
+
+ /* Iterate all CPUs in the device-tree */
+ dt_for_each_child(cpus, cpu) {
+ unsigned int server_no, chip_id, threads;
+ enum cpu_thread_state state;
+ const struct dt_property *p;
+ struct cpu_thread *t, *pt0, *pt1;
+
+ /* Skip cache nodes */
+ if (strcmp(dt_prop_get(cpu, "device_type"), "cpu"))
+ continue;
+
+ server_no = dt_prop_get_u32(cpu, "reg");
+
+ /* If PIR property is absent, assume it's the same as the
+ * server number
+ */
+ pir = dt_prop_get_u32_def(cpu, "ibm,pir", server_no);
+
+ /* We should always have an ibm,chip-id property */
+ chip_id = dt_get_chip_id(cpu);
+
+ /* Only use operational CPUs */
+ if (!strcmp(dt_prop_get(cpu, "status"), "okay")) {
+ state = cpu_state_present;
+ get_chip(chip_id)->ex_present = true;
+ } else {
+ state = cpu_state_unavailable;
+ }
+
+ prlog(PR_INFO, "CPU: CPU from DT PIR=0x%04x Server#=0x%x"
+ " State=%d\n", pir, server_no, state);
+
+ /* Check max PIR */
+ if (cpu_max_pir < (pir + cpu_thread_count - 1)) {
+ prlog(PR_WARNING, "CPU: CPU potentially out of range"
+ "PIR=0x%04x MAX=0x%04x !\n",
+ pir, cpu_max_pir);
+ continue;
+ }
+
+ /* Setup thread 0 */
+ assert(pir <= cpu_max_pir);
+ t = pt0 = &cpu_stacks[pir].cpu;
+ if (t != boot_cpu) {
+ init_cpu_thread(t, state, pir);
+ /* Each cpu gets its own later in init_trace_buffers */
+ t->trace = boot_cpu->trace;
+ }
+ if (t->is_fused_core)
+ pt1 = &cpu_stacks[pir + 1].cpu;
+ else
+ pt1 = pt0;
+ t->server_no = server_no;
+ t->primary = t->ec_primary = t;
+ t->node = cpu;
+ t->chip_id = chip_id;
+ t->icp_regs = NULL; /* Will be set later */
+#ifdef DEBUG_LOCKS
+ t->requested_lock = NULL;
+#endif
+ t->core_hmi_state = 0;
+ t->core_hmi_state_ptr = &t->core_hmi_state;
+
+ /* Add associativity properties */
+ add_core_associativity(t);
+
+ /* Add the decrementer width property */
+ dt_add_property_cells(cpu, "ibm,dec-bits", dec_bits);
+
+ if (t->is_fused_core)
+ dt_add_property(t->node, "ibm,fused-core", NULL, 0);
+
+ /* Iterate threads */
+ p = dt_find_property(cpu, "ibm,ppc-interrupt-server#s");
+ if (!p)
+ continue;
+ threads = p->len / 4;
+ if (threads > cpu_thread_count) {
+ prlog(PR_WARNING, "CPU: Threads out of range for PIR 0x%04x"
+ " threads=%d max=%d\n",
+ pir, threads, cpu_thread_count);
+ threads = cpu_thread_count;
+ }
+ for (thread = 1; thread < threads; thread++) {
+ prlog(PR_TRACE, "CPU: secondary thread %d found\n",
+ thread);
+ t = &cpu_stacks[pir + thread].cpu;
+ init_cpu_thread(t, state, pir + thread);
+ t->trace = boot_cpu->trace;
+ t->server_no = dt_property_get_cell(p, thread);
+ t->is_secondary = true;
+ t->is_fused_core = pt0->is_fused_core;
+ t->primary = pt0;
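+ /*
+ * On a fused core the two EC halves interleave by thread
+ * parity: even threads get pt0 and odd threads pt1 as their
+ * ec_primary (pt1 == pt0 on a normal core).
+ */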
+ t->ec_primary = (thread & 1) ? pt1 : pt0;
+ t->node = cpu;
+ t->chip_id = chip_id;
+ t->core_hmi_state_ptr = &pt0->core_hmi_state;
+ }
+ prlog(PR_INFO, "CPU: %d secondary threads\n", thread);
+ }
+}
+
+void cpu_bringup(void)
+{
+ struct cpu_thread *t;
+ uint32_t count = 0;
+
+ prlog(PR_INFO, "CPU: Setting up secondary CPU state\n");
+
+ op_display(OP_LOG, OP_MOD_CPU, 0x0000);
+
+ /* Tell everybody to chime in ! */
+ prlog(PR_INFO, "CPU: Calling in all processors...\n");
+ cpu_secondary_start = 1;
+ sync();
+
+ op_display(OP_LOG, OP_MOD_CPU, 0x0002);
+
+ for_each_cpu(t) {
+ if (t->state != cpu_state_present &&
+ t->state != cpu_state_active)
+ continue;
+
+ /* Add a callin timeout ? If so, call cpu_remove_node(t). */
+ while (t->state != cpu_state_active) {
+ smt_lowest();
+ sync();
+ }
+ smt_medium();
+ count++;
+ }
+
+ prlog(PR_NOTICE, "CPU: All %d processors called in...\n", count);
+
+ op_display(OP_LOG, OP_MOD_CPU, 0x0003);
+}
+
+void cpu_callin(struct cpu_thread *cpu)
+{
+ sync();
+ cpu->state = cpu_state_active;
+ sync();
+
+ cpu->job_has_no_return = false;
+ if (cpu_is_thread0(cpu))
+ init_hid();
+}
+
+static void opal_start_thread_job(void *data)
+{
+ cpu_give_self_os();
+
+ /* We do not return, so let's mark the job as
+ * complete
+ */
+ start_kernel_secondary((uint64_t)data);
+}
+
+static int64_t opal_start_cpu_thread(uint64_t server_no, uint64_t start_address)
+{
+ struct cpu_thread *cpu;
+ struct cpu_job *job;
+
+ if (!opal_addr_valid((void *)start_address))
+ return OPAL_PARAMETER;
+
+ cpu = find_cpu_by_server(server_no);
+ if (!cpu) {
+ prerror("OPAL: Start invalid CPU 0x%04llx !\n", server_no);
+ return OPAL_PARAMETER;
+ }
+ prlog(PR_DEBUG, "OPAL: Start CPU 0x%04llx (PIR 0x%04x) -> 0x%016llx\n",
+ server_no, cpu->pir, start_address);
+
+ lock(&reinit_lock);
+ if (!cpu_is_available(cpu)) {
+ unlock(&reinit_lock);
+ prerror("OPAL: CPU not active in OPAL !\n");
+ return OPAL_WRONG_STATE;
+ }
+ if (cpu->in_reinit) {
+ unlock(&reinit_lock);
+ prerror("OPAL: CPU being reinitialized !\n");
+ return OPAL_WRONG_STATE;
+ }
+ job = __cpu_queue_job(cpu, "start_thread",
+ opal_start_thread_job, (void *)start_address,
+ true);
+ unlock(&reinit_lock);
+ if (!job) {
+ prerror("OPAL: Failed to create CPU start job !\n");
+ return OPAL_INTERNAL_ERROR;
+ }
+ return OPAL_SUCCESS;
+}
+opal_call(OPAL_START_CPU, opal_start_cpu_thread, 2);
+
+static int64_t opal_query_cpu_status(uint64_t server_no, uint8_t *thread_status)
+{
+ struct cpu_thread *cpu;
+
+ if (!opal_addr_valid(thread_status))
+ return OPAL_PARAMETER;
+
+ cpu = find_cpu_by_server(server_no);
+ if (!cpu) {
+ prerror("OPAL: Query invalid CPU 0x%04llx !\n", server_no);
+ return OPAL_PARAMETER;
+ }
+ if (!cpu_is_available(cpu) && cpu->state != cpu_state_os) {
+ prerror("OPAL: CPU not active in OPAL nor OS !\n");
+ return OPAL_PARAMETER;
+ }
+ switch(cpu->state) {
+ case cpu_state_os:
+ *thread_status = OPAL_THREAD_STARTED;
+ break;
+ case cpu_state_active:
+ /* Active in skiboot -> inactive in OS */
+ *thread_status = OPAL_THREAD_INACTIVE;
+ break;
+ default:
+ *thread_status = OPAL_THREAD_UNAVAILABLE;
+ }
+
+ return OPAL_SUCCESS;
+}
+opal_call(OPAL_QUERY_CPU_STATUS, opal_query_cpu_status, 2);
+
+static int64_t opal_return_cpu(void)
+{
+ prlog(PR_DEBUG, "OPAL: Returning CPU 0x%04x\n", this_cpu()->pir);
+
+ this_cpu()->in_opal_call--;
+ if (this_cpu()->in_opal_call != 0) {
+ printf("OPAL in_opal_call=%u\n", this_cpu()->in_opal_call);
+ }
+
+ __secondary_cpu_entry();
+
+ return OPAL_HARDWARE; /* Should not happen */
+}
+opal_call(OPAL_RETURN_CPU, opal_return_cpu, 0);
+
+struct hid0_change_req {
+ uint64_t clr_bits;
+ uint64_t set_bits;
+};
+
+static void cpu_change_hid0(void *__req)
+{
+ struct hid0_change_req *req = __req;
+ unsigned long hid0, new_hid0;
+
+ hid0 = new_hid0 = mfspr(SPR_HID0);
+ new_hid0 &= ~req->clr_bits;
+ new_hid0 |= req->set_bits;
+ prlog(PR_DEBUG, "CPU: [%08x] HID0 change 0x%016lx -> 0x%016lx\n",
+ this_cpu()->pir, hid0, new_hid0);
+ set_hid0(new_hid0);
+}
+
+static int64_t cpu_change_all_hid0(struct hid0_change_req *req)
+{
+ struct cpu_thread *cpu;
+ struct cpu_job **jobs;
+
+ jobs = zalloc(sizeof(struct cpu_job *) * (cpu_max_pir + 1));
+ assert(jobs);
+
+ for_each_available_cpu(cpu) {
+ if (!cpu_is_thread0(cpu) && !cpu_is_core_chiplet_primary(cpu))
+ continue;
+ if (cpu == this_cpu())
+ continue;
+ jobs[cpu->pir] = cpu_queue_job(cpu, "cpu_change_hid0",
+ cpu_change_hid0, req);
+ }
+
+ /* this cpu */
+ cpu_change_hid0(req);
+
+ for_each_available_cpu(cpu) {
+ if (jobs[cpu->pir])
+ cpu_wait_job(jobs[cpu->pir], true);
+ }
+
+ free(jobs);
+
+ return OPAL_SUCCESS;
+}
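+
+/*
+ * HID0 is shared by all threads of a core, so cpu_change_all_hid0() only
+ * queues the update on threads for which cpu_is_thread0() or
+ * cpu_is_core_chiplet_primary() is true, applies it locally on the calling
+ * CPU, then waits for every queued job before returning.
+ */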
+
+void cpu_set_hile_mode(bool hile)
+{
+ struct hid0_change_req req;
+
+ if (hile == current_hile_mode)
+ return;
+
+ if (hile) {
+ req.clr_bits = 0;
+ req.set_bits = hid0_hile;
+ } else {
+ req.clr_bits = hid0_hile;
+ req.set_bits = 0;
+ }
+ cpu_change_all_hid0(&req);
+ current_hile_mode = hile;
+}
+
+static void cpu_cleanup_one(void *param __unused)
+{
+ mtspr(SPR_AMR, 0);
+ mtspr(SPR_IAMR, 0);
+ mtspr(SPR_PCR, 0);
+}
+
+static int64_t cpu_cleanup_all(void)
+{
+ struct cpu_thread *cpu;
+ struct cpu_job **jobs;
+
+ jobs = zalloc(sizeof(struct cpu_job *) * (cpu_max_pir + 1));
+ assert(jobs);
+
+ for_each_available_cpu(cpu) {
+ if (cpu == this_cpu())
+ continue;
+ jobs[cpu->pir] = cpu_queue_job(cpu, "cpu_cleanup",
+ cpu_cleanup_one, NULL);
+ }
+
+ /* this cpu */
+ cpu_cleanup_one(NULL);
+
+ for_each_available_cpu(cpu) {
+ if (jobs[cpu->pir])
+ cpu_wait_job(jobs[cpu->pir], true);
+ }
+
+ free(jobs);
+
+ return OPAL_SUCCESS;
+}
+
+void cpu_fast_reboot_complete(void)
+{
+ /* Fast reboot will have set HID0:HILE to skiboot endian */
+ current_hile_mode = HAVE_LITTLE_ENDIAN;
+
+ /* and set HID0:RADIX */
+ if (proc_gen == proc_gen_p9)
+ current_radix_mode = true;
+}
+
+static int64_t opal_reinit_cpus(uint64_t flags)
+{
+ struct hid0_change_req req = { 0, 0 };
+ struct cpu_thread *cpu;
+ int64_t rc = OPAL_SUCCESS;
+ int i;
+
+ prlog(PR_DEBUG, "OPAL: CPU re-init with flags: 0x%llx\n", flags);
+
+ if (flags & OPAL_REINIT_CPUS_HILE_LE)
+ prlog(PR_INFO, "OPAL: Switch to little-endian OS\n");
+ else if (flags & OPAL_REINIT_CPUS_HILE_BE)
+ prlog(PR_INFO, "OPAL: Switch to big-endian OS\n");
+
+ again:
+ lock(&reinit_lock);
+
+ for (cpu = first_cpu(); cpu; cpu = next_cpu(cpu)) {
+ if (cpu == this_cpu() || cpu->in_reinit)
+ continue;
+ if (cpu->state == cpu_state_os) {
+ unlock(&reinit_lock);
+ /*
+ * This might be a race with OPAL_RETURN_CPU during kexec while
+ * the CPU is still transitioning; wait a bit and try again.
+ */
+ for (i = 0; (i < 1000) &&
+ (cpu->state == cpu_state_os); i++) {
+ time_wait_ms(1);
+ }
+ if (cpu->state == cpu_state_os) {
+ prerror("OPAL: CPU 0x%x not in OPAL !\n", cpu->pir);
+ return OPAL_WRONG_STATE;
+ }
+ goto again;
+ }
+ cpu->in_reinit = true;
+ }
+ /*
+ * Now we need to mark ourselves "active" or we'll be skipped
+ * by the various "for_each_active_..." calls done by slw_reinit()
+ */
+ this_cpu()->state = cpu_state_active;
+ this_cpu()->in_reinit = true;
+ unlock(&reinit_lock);
+
+ /*
+ * This cleans up a few things left over by Linux
+ * that can cause problems in cases such as radix->hash
+ * transitions. Ideally Linux should do it but doing it
+ * here works around existing broken kernels.
+ */
+ cpu_cleanup_all();
+
+ /* If HILE change via HID0 is supported ... */
+ if (hile_supported &&
+ (flags & (OPAL_REINIT_CPUS_HILE_BE |
+ OPAL_REINIT_CPUS_HILE_LE))) {
+ bool hile = !!(flags & OPAL_REINIT_CPUS_HILE_LE);
+
+ flags &= ~(OPAL_REINIT_CPUS_HILE_BE | OPAL_REINIT_CPUS_HILE_LE);
+ if (hile != current_hile_mode) {
+ if (hile)
+ req.set_bits |= hid0_hile;
+ else
+ req.clr_bits |= hid0_hile;
+ current_hile_mode = hile;
+ }
+ }
+
+ /* If MMU mode change is supported */
+ if (radix_supported &&
+ (flags & (OPAL_REINIT_CPUS_MMU_HASH |
+ OPAL_REINIT_CPUS_MMU_RADIX))) {
+ bool radix = !!(flags & OPAL_REINIT_CPUS_MMU_RADIX);
+
+ flags &= ~(OPAL_REINIT_CPUS_MMU_HASH |
+ OPAL_REINIT_CPUS_MMU_RADIX);
+
+ if (proc_gen == proc_gen_p9 && radix != current_radix_mode) {
+ if (radix)
+ req.set_bits |= SPR_HID0_POWER9_RADIX;
+ else
+ req.clr_bits |= SPR_HID0_POWER9_RADIX;
+
+ current_radix_mode = radix;
+ }
+ }
+
+ /* Clean up the TLB. We do that unconditionally; this works
+ * around issues where OSes fail to invalidate the PWC in Radix
+ * mode, for example. This only works on P9 and later, but Linux
+ * is known to clean up properly on P8, so that isn't a concern
+ * there. If we wanted to clean up the TLB on P8 as well, we'd
+ * have to use jobs to do it locally on each CPU.
+ */
+ cleanup_global_tlb();
+
+ /* Apply HID bits changes if any */
+ if (req.set_bits || req.clr_bits)
+ cpu_change_all_hid0(&req);
+
+ if (flags & OPAL_REINIT_CPUS_TM_SUSPEND_DISABLED) {
+ flags &= ~OPAL_REINIT_CPUS_TM_SUSPEND_DISABLED;
+
+ if (tm_suspend_enabled)
+ rc = OPAL_UNSUPPORTED;
+ else
+ rc = OPAL_SUCCESS;
+ }
+
+ /* Handle P8 DD1 SLW reinit */
+ if (flags != 0 && proc_gen == proc_gen_p8 && !hile_supported)
+ rc = slw_reinit(flags);
+ else if (flags != 0)
+ rc = OPAL_UNSUPPORTED;
+
+ /* And undo the above */
+ lock(&reinit_lock);
+ this_cpu()->state = cpu_state_os;
+ for (cpu = first_cpu(); cpu; cpu = next_cpu(cpu))
+ cpu->in_reinit = false;
+ unlock(&reinit_lock);
+
+ return rc;
+}
+opal_call(OPAL_REINIT_CPUS, opal_reinit_cpus, 1);
+
+#define NMMU_XLAT_CTL_PTCR 0xb
+static int64_t nmmu_set_ptcr(uint64_t chip_id, struct dt_node *node, uint64_t ptcr)
+{
+ uint32_t nmmu_base_addr;
+
+ nmmu_base_addr = dt_get_address(node, 0, NULL);
+ return xscom_write(chip_id, nmmu_base_addr + NMMU_XLAT_CTL_PTCR, ptcr);
+}
+
+/*
+ * Set up the Nest MMU PTCR register for all chips in the system or
+ * the specified chip id.
+ *
+ * The PTCR value may be overwritten so long as all users have been
+ * quiesced. If it is set to an invalid memory address the system will
+ * checkstop if anything attempts to use it.
+ *
+ * Returns OPAL_UNSUPPORTED if no nest mmu was found.
+ */
+static int64_t opal_nmmu_set_ptcr(uint64_t chip_id, uint64_t ptcr)
+{
+ struct dt_node *node;
+ int64_t rc = OPAL_UNSUPPORTED;
+
+ if (chip_id == -1ULL)
+ dt_for_each_compatible(dt_root, node, "ibm,power9-nest-mmu") {
+ chip_id = dt_get_chip_id(node);
+ if ((rc = nmmu_set_ptcr(chip_id, node, ptcr)))
+ return rc;
+ }
+ else
+ dt_for_each_compatible_on_chip(dt_root, node, "ibm,power9-nest-mmu", chip_id)
+ if ((rc = nmmu_set_ptcr(chip_id, node, ptcr)))
+ return rc;
+
+ return rc;
+}
+opal_call(OPAL_NMMU_SET_PTCR, opal_nmmu_set_ptcr, 2);
+
+static void _exit_uv_mode(void *data __unused)
+{
+ prlog(PR_DEBUG, "Exit uv mode on cpu pir 0x%04x\n", this_cpu()->pir);
+ /* HW has smfctrl shared between threads but on Mambo it is per-thread */
+ if (chip_quirk(QUIRK_MAMBO_CALLOUTS))
+ exit_uv_mode(1);
+ else
+ exit_uv_mode(cpu_is_thread0(this_cpu()));
+}
+
+void cpu_disable_pef(void)
+{
+ struct cpu_thread *cpu;
+ struct cpu_job **jobs;
+
+ if (!(mfmsr() & MSR_S)) {
+ prlog(PR_DEBUG, "UV mode off on cpu pir 0x%04x\n", this_cpu()->pir);
+ return;
+ }
+
+ jobs = zalloc(sizeof(struct cpu_job *) * (cpu_max_pir + 1));
+ assert(jobs);
+
+ /* Exit uv mode on all secondary threads before touching
+ * smfctrl on thread 0 */
+ for_each_available_cpu(cpu) {
+ if (cpu == this_cpu())
+ continue;
+
+ if (!cpu_is_thread0(cpu))
+ jobs[cpu->pir] = cpu_queue_job(cpu, "exit_uv_mode",
+ _exit_uv_mode, NULL);
+ }
+
+ for_each_available_cpu(cpu)
+ if (jobs[cpu->pir]) {
+ cpu_wait_job(jobs[cpu->pir], true);
+ jobs[cpu->pir] = NULL;
+ }
+
+ /* Exit uv mode and disable smfctrl on primary threads */
+ for_each_available_cpu(cpu) {
+ if (cpu == this_cpu())
+ continue;
+
+ if (cpu_is_thread0(cpu))
+ jobs[cpu->pir] = cpu_queue_job(cpu, "exit_uv_mode",
+ _exit_uv_mode, NULL);
+ }
+
+ for_each_available_cpu(cpu)
+ if (jobs[cpu->pir])
+ cpu_wait_job(jobs[cpu->pir], true);
+
+ free(jobs);
+
+ _exit_uv_mode(NULL);
+}
diff --git a/roms/skiboot/core/cpufeatures.c b/roms/skiboot/core/cpufeatures.c
new file mode 100644
index 000000000..5620b741d
--- /dev/null
+++ b/roms/skiboot/core/cpufeatures.c
@@ -0,0 +1,1043 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * This file deals with setup of /cpus/ibm,powerpc-cpu-features dt
+ *
+ * Copyright 2017-2019 IBM Corp.
+ */
+
+#include <skiboot.h>
+#include <cpu.h>
+#include <processor.h>
+#include <ccan/str/str.h>
+#include <device.h>
+
+#ifdef DEBUG
+#define DBG(fmt, a...) prlog(PR_DEBUG, "CPUFT: " fmt, ##a)
+#else
+#define DBG(fmt, a...)
+#endif
+
+/* Device-tree visible constants follow */
+#define ISA_V2_07B 2070
+#define ISA_V3_0B 3000
+#define ISA_V3_1 3100
+
+#define USABLE_PR (1U << 0)
+#define USABLE_OS (1U << 1)
+#define USABLE_HV (1U << 2)
+
+#define HV_SUPPORT_HFSCR (1U << 0)
+#define OS_SUPPORT_FSCR (1U << 0)
+
+/* Following are definitions for the match tables, not the DT binding itself */
+#define ISA_BASE 0
+
+#define HV_NONE 0
+#define HV_CUSTOM 1
+#define HV_HFSCR 2
+
+#define OS_NONE 0
+#define OS_CUSTOM 1
+#define OS_FSCR 2
+
+/* CPU bitmasks for match table */
+#define CPU_P8_DD1 (1U << 0)
+#define CPU_P8_DD2 (1U << 1)
+#define CPU_P9_DD1 (1U << 2)
+#define CPU_P9_DD2_0_1 (1U << 3) // 2.01 or 2.1
+#define CPU_P9P (1U << 4)
+#define CPU_P9_DD2_2 (1U << 5)
+#define CPU_P9_DD2_3 (1U << 6)
+#define CPU_P10 (1U << 7)
+
+#define CPU_P9_DD2 (CPU_P9_DD2_0_1|CPU_P9_DD2_2|CPU_P9_DD2_3|CPU_P9P)
+
+#define CPU_P8 (CPU_P8_DD1|CPU_P8_DD2)
+#define CPU_P9 (CPU_P9_DD1|CPU_P9_DD2|CPU_P9P)
+#define CPU_ALL (CPU_P8|CPU_P9|CPU_P10)
+
+struct cpu_feature {
+ const char *name;
+ uint32_t cpus_supported;
+ uint32_t isa;
+ uint32_t usable_privilege;
+ uint32_t hv_support;
+ uint32_t os_support;
+ uint32_t hfscr_bit_nr;
+ uint32_t fscr_bit_nr;
+ uint32_t hwcap_bit_nr;
+ const char *dependencies_names; /* space-delimited names */
+};
+
+/*
+ * The base (or NULL) cpu feature set is the CPU features available
+ * when no child nodes of the /cpus/ibm,powerpc-cpu-features node exist. The
+ * base feature set is POWER8 (ISAv2.07B), minus the features that are
+ * listed explicitly.
+ *
+ * XXX: currently, the feature dependencies are not necessarily captured
+ * exactly or completely. This is somewhat acceptable because all
+ * implementations must be aware of all these features.
+ */
+static const struct cpu_feature cpu_features_table[] = {
+ /*
+ * Big endian as in ISAv2.07B, MSR_LE=0
+ */
+ { "big-endian",
+ CPU_ALL,
+ ISA_BASE, USABLE_HV|USABLE_OS|USABLE_PR,
+ HV_CUSTOM, OS_CUSTOM,
+ -1, -1, -1,
+ NULL, },
+
+ /*
+ * Little endian as in ISAv2.07B, MSR_LE=1.
+ *
+ * When both big and little endian are defined, there is an LPCR ILE
+ * bit and implementation specific way to switch HILE mode, MSR_SLE,
+ * etc.
+ */
+ { "little-endian",
+ CPU_ALL,
+ ISA_BASE, USABLE_HV|USABLE_OS|USABLE_PR,
+ HV_CUSTOM, OS_CUSTOM,
+ -1, -1, -1,
+ NULL, },
+
+ /*
+ * MSR_HV=1 mode as in ISAv2.07B (i.e., hypervisor privileged
+ * instructions and registers).
+ */
+ { "hypervisor",
+ CPU_ALL,
+ ISA_BASE, USABLE_HV,
+ HV_CUSTOM, OS_NONE,
+ -1, -1, -1,
+ NULL, },
+
+ /*
+ * ISAv2.07B interrupt vectors, registers, and control registers
+ * (e.g., AIL, ILE, HV, etc LPCR bits).
+ *
+ * This does not necessarily specify all possible interrupt types.
+ * floating-point, for example, requires some way to handle floating
+ * point exceptions, but the low level details of the interrupt handler
+ * are not a dependency there. There will always be *some* interrupt
+ * handler (and some way to provide memory management, etc.).
+ */
+ { "interrupt-facilities",
+ CPU_ALL,
+ ISA_BASE, USABLE_HV|USABLE_OS,
+ HV_CUSTOM, OS_CUSTOM,
+ -1, -1, -1,
+ NULL, },
+
+ { "smt",
+ CPU_ALL,
+ ISA_BASE, USABLE_HV|USABLE_OS|USABLE_PR,
+ HV_CUSTOM, OS_CUSTOM,
+ -1, -1, 14,
+ NULL, },
+
+ /*
+ * ISAv2.07B Program Priority Registers (PPR)
+ * PPR and associated control registers (e.g. RPR, PSPB),
+ * priority "or" instructions, etc.
+ */
+ { "program-priority-register",
+ CPU_ALL,
+ ISA_BASE, USABLE_HV|USABLE_OS|USABLE_PR,
+ HV_NONE, OS_NONE,
+ -1, -1, -1,
+ NULL, },
+
+ /*
+ * ISAv2.07B Book3S Chapter 5.7.9.1. Virtual Page Class Key Protection
+ * AMR, IAMR, AMOR, UAMOR, etc registers and MMU key bits.
+ */
+ { "virtual-page-class-key-protection",
+ CPU_ALL,
+ ISA_BASE, USABLE_HV|USABLE_OS|USABLE_PR,
+ HV_CUSTOM, OS_CUSTOM,
+ -1, -1, -1,
+ NULL, },
+
+ /*
+ * ISAv2.07B SAO storage control attribute
+ */
+ { "strong-access-ordering",
+ CPU_ALL & ~CPU_P9_DD1,
+ ISA_BASE, USABLE_HV|USABLE_OS|USABLE_PR,
+ HV_CUSTOM, OS_CUSTOM,
+ -1, -1, -1,
+ NULL, },
+
+ /*
+ * ISAv2.07B no-execute storage control attribute
+ */
+ { "no-execute",
+ CPU_ALL,
+ ISA_BASE, USABLE_HV|USABLE_OS,
+ HV_CUSTOM, OS_CUSTOM,
+ -1, -1, -1,
+ NULL, },
+
+ /*
+ * Cache inhibited attribute supported on large pages.
+ */
+ { "cache-inhibited-large-page",
+ CPU_ALL,
+ ISA_BASE, USABLE_HV|USABLE_OS,
+ HV_CUSTOM, OS_CUSTOM,
+ -1, -1, -1,
+ NULL, },
+
+ /*
+ * ISAv2.07B Book3S Chapter 8. Debug Facilities
+ * CIEA, CIABR, DEAW, MEte, trace interrupt, etc.
+ * Except CFAR, branch tracing.
+ */
+ { "debug-facilities",
+ CPU_ALL,
+ ISA_BASE, USABLE_HV|USABLE_OS,
+ HV_CUSTOM, OS_CUSTOM,
+ -1, -1, -1,
+ NULL, },
+
+ /*
+ * DAWR1, DAWRX1 etc.
+ */
+ { "debug-facilities-v31",
+ CPU_P10,
+ ISA_V3_1, USABLE_HV|USABLE_OS,
+ HV_CUSTOM, OS_CUSTOM,
+ -1, -1, -1,
+ NULL, },
+
+ /*
+ * ISAv2.07B CFAR
+ */
+ { "come-from-address-register",
+ CPU_ALL,
+ ISA_BASE, USABLE_HV|USABLE_OS,
+ HV_CUSTOM, OS_CUSTOM,
+ -1, -1, -1,
+ "debug-facilities", },
+
+ /*
+ * ISAv2.07B Branch tracing (optional in ISA)
+ */
+ { "branch-tracing",
+ CPU_ALL,
+ ISA_BASE, USABLE_HV|USABLE_OS,
+ HV_CUSTOM, OS_CUSTOM,
+ -1, -1, -1,
+ "debug-facilities", },
+
+ /*
+ * ISAv2.07B Floating-point Facility
+ */
+ { "floating-point",
+ CPU_ALL,
+ ISA_BASE, USABLE_HV|USABLE_OS|USABLE_PR,
+ HV_CUSTOM, OS_CUSTOM,
+ PPC_BITLSHIFT(63), -1, 27,
+ NULL, },
+
+ /*
+ * ISAv2.07B Vector Facility (VMX)
+ */
+ { "vector",
+ CPU_ALL,
+ ISA_BASE, USABLE_HV|USABLE_OS|USABLE_PR,
+ HV_CUSTOM, OS_CUSTOM,
+ PPC_BITLSHIFT(62), -1, 28,
+ "floating-point", },
+
+ /*
+ * ISAv2.07B Vector-scalar Facility (VSX)
+ */
+ { "vector-scalar",
+ CPU_ALL,
+ ISA_BASE, USABLE_HV|USABLE_OS|USABLE_PR,
+ HV_CUSTOM, OS_CUSTOM,
+ -1, -1, 7,
+ "vector", },
+
+ { "vector-crypto",
+ CPU_ALL,
+ ISA_BASE, USABLE_HV|USABLE_OS|USABLE_PR,
+ HV_NONE, OS_NONE,
+ -1, -1, 57,
+ "vector", },
+
+ /*
+ * ISAv2.07B Quadword Load and Store instructions
+ * including lqarx/stqcx. instructions.
+ */
+ { "quadword-load-store",
+ CPU_ALL,
+ ISA_BASE, USABLE_HV|USABLE_OS|USABLE_PR,
+ HV_NONE, OS_NONE,
+ -1, -1, -1,
+ NULL, },
+
+ /*
+ * ISAv2.07B Binary Coded Decimal (BCD)
+ * BCD fixed point instructions
+ */
+ { "decimal-integer",
+ CPU_ALL,
+ ISA_BASE, USABLE_HV|USABLE_OS|USABLE_PR,
+ HV_NONE, OS_NONE,
+ -1, -1, -1,
+ NULL, },
+
+ /*
+ * ISAv2.07B Decimal floating-point Facility (DFP)
+ */
+ { "decimal-floating-point",
+ CPU_ALL,
+ ISA_BASE, USABLE_HV|USABLE_OS|USABLE_PR,
+ HV_NONE, OS_NONE,
+ -1, -1, 10,
+ "floating-point", },
+
+ /*
+ * ISAv2.07B
+ * DSCR, default data prefetch LPCR, etc
+ */
+ { "data-stream-control-register",
+ CPU_ALL,
+ ISA_BASE, USABLE_HV|USABLE_OS|USABLE_PR,
+ HV_CUSTOM, OS_CUSTOM,
+ PPC_BITLSHIFT(61), PPC_BITLSHIFT(61), 61,
+ NULL, },
+
+ /*
+ * ISAv2.07B Branch History Rolling Buffer (BHRB)
+ */
+ { "branch-history-rolling-buffer",
+ CPU_ALL,
+ ISA_BASE, USABLE_HV|USABLE_OS|USABLE_PR,
+ HV_CUSTOM, OS_CUSTOM,
+ PPC_BITLSHIFT(59), -1, -1,
+ NULL, },
+
+ /*
+ * ISAv2.07B Transactional Memory Facility (TM or HTM)
+ */
+ { "transactional-memory",
+ CPU_P8, /* P9 support is not enabled yet */
+ ISA_BASE, USABLE_HV|USABLE_OS|USABLE_PR,
+ HV_CUSTOM, OS_CUSTOM,
+ PPC_BITLSHIFT(58), -1, 62,
+ NULL, },
+
+ /*
+ * ISAv3.0B TM additions
+ * TEXASR bit 17, self-induced vs external footprint overflow
+ */
+ { "transactional-memory-v3",
+ 0,
+ ISA_V3_0B, USABLE_HV|USABLE_OS|USABLE_PR,
+ HV_NONE, OS_NONE,
+ -1, -1, -1,
+ "transactional-memory", },
+
+ /*
+ * ISAv2.07B Event-Based Branch Facility (EBB)
+ */
+ { "event-based-branch",
+ CPU_ALL,
+ ISA_BASE, USABLE_HV|USABLE_OS|USABLE_PR,
+ HV_CUSTOM, OS_CUSTOM,
+ PPC_BITLSHIFT(56), PPC_BITLSHIFT(56), 60,
+ NULL, },
+
+ /*
+ * ISAv2.07B Target Address Register (TAR)
+ */
+ { "target-address-register",
+ CPU_ALL,
+ ISA_BASE, USABLE_HV|USABLE_OS|USABLE_PR,
+ HV_CUSTOM, OS_CUSTOM,
+ PPC_BITLSHIFT(55), PPC_BITLSHIFT(55), 58,
+ NULL, },
+
+ /*
+ * ISAv2.07B Control Register (CTRL)
+ */
+ { "control-register",
+ CPU_ALL,
+ ISA_BASE, USABLE_HV|USABLE_OS,
+ HV_CUSTOM, OS_CUSTOM,
+ -1, -1, -1,
+ NULL, },
+
+ /*
+ * ISAv2.07B Book3S Chapter 11. Processor Control.
+ * msgsnd, msgsndp, doorbell, etc.
+ *
+ * ISAv3.0B is not compatible (different addressing, HFSCR required
+ * for msgsndp).
+ */
+ { "processor-control-facility",
+ CPU_P8_DD2, /* P8 DD1 has no dbell */
+ ISA_BASE, USABLE_HV|USABLE_OS,
+ HV_CUSTOM, OS_CUSTOM,
+ -1, -1, -1,
+ NULL, },
+
+ /*
+ * ISAv2.07B PURR, SPURR registers
+ */
+ { "processor-utilization-of-resources-register",
+ CPU_ALL,
+ ISA_BASE, USABLE_HV|USABLE_OS,
+ HV_CUSTOM, OS_CUSTOM,
+ -1, -1, -1,
+ NULL, },
+
+ /*
+ * POWER8 initiate coprocessor store word indexed (icswx) instruction
+ */
+ { "coprocessor-icswx",
+ CPU_P8,
+ ISA_BASE, USABLE_HV|USABLE_OS,
+ HV_CUSTOM, OS_CUSTOM,
+ -1, -1, -1,
+ NULL, },
+
+ /*
+ * ISAv2.07B hash based MMU and all instructions, registers,
+ * data structures, exceptions, etc.
+ */
+ { "mmu-hash",
+ CPU_P8,
+ ISA_BASE, USABLE_HV|USABLE_OS,
+ HV_CUSTOM, OS_CUSTOM,
+ -1, -1, -1,
+ NULL, },
+
+ /*
+ * POWER8 MCE / machine check exception.
+ */
+ { "machine-check-power8",
+ CPU_P8,
+ ISA_BASE, USABLE_HV|USABLE_OS,
+ HV_CUSTOM, OS_CUSTOM,
+ -1, -1, -1,
+ NULL, },
+
+ /*
+ * POWER8 PMU / performance monitor unit.
+ */
+ { "performance-monitor-power8",
+ CPU_P8,
+ ISA_BASE, USABLE_HV|USABLE_OS,
+ HV_CUSTOM, OS_CUSTOM,
+ -1, -1, -1,
+ NULL, },
+
+ /*
+ * ISAv2.07B alignment interrupts set DSISR register
+ *
+ * POWER CPUs do not use this, and it was removed in ISAv3.0B.
+ */
+ { "alignment-interrupt-dsisr",
+ 0,
+ ISA_BASE, USABLE_HV|USABLE_OS,
+ HV_NONE, OS_NONE,
+ -1, -1, -1,
+ NULL, },
+
+ /*
+ * ISAv2.07B / POWER8 doze, nap, sleep, winkle instructions
+ * XXX: is Linux using some BookIV specific implementation details
+ * in nap handling? We have no POWER8 specific key here.
+ */
+ { "idle-nap",
+ CPU_P8,
+ ISA_BASE, USABLE_HV,
+ HV_CUSTOM, OS_NONE,
+ -1, -1, -1,
+ NULL, },
+
+ /*
+ * ISAv2.07B wait instruction
+ */
+ { "wait",
+ CPU_P8,
+ ISA_BASE, USABLE_HV|USABLE_OS|USABLE_PR,
+ HV_NONE, OS_NONE,
+ -1, -1, -1,
+ NULL, },
+
+ { "subcore",
+ CPU_P8,
+ ISA_BASE, USABLE_HV|USABLE_OS,
+ HV_CUSTOM, OS_CUSTOM,
+ -1, -1, -1,
+ "smt", },
+
+ /*
+ * ISAv3.0B radix based MMU
+ */
+ { "mmu-radix",
+ CPU_P9|CPU_P10,
+ ISA_V3_0B, USABLE_HV|USABLE_OS,
+ HV_CUSTOM, OS_CUSTOM,
+ -1, -1, -1,
+ NULL, },
+
+ /*
+ * ISAv3.0B hash based MMU, new hash pte format, PCTR, etc
+ */
+ { "mmu-hash-v3",
+ CPU_P9|CPU_P10,
+ ISA_V3_0B, USABLE_HV|USABLE_OS,
+ HV_CUSTOM, OS_CUSTOM,
+ -1, -1, -1,
+ NULL, },
+
+ /*
+ * ISAv3.0B wait instruction
+ */
+ { "wait-v3",
+ CPU_P9|CPU_P10,
+ ISA_V3_0B, USABLE_HV|USABLE_OS|USABLE_PR,
+ HV_NONE, OS_NONE,
+ -1, -1, -1,
+ NULL, },
+
+ /*
+ * ISAv3.0B stop idle instructions and registers
+ * XXX: Same question as for idle-nap
+ */
+ { "idle-stop",
+ CPU_P9|CPU_P10,
+ ISA_V3_0B, USABLE_HV|USABLE_OS,
+ HV_CUSTOM, OS_CUSTOM,
+ -1, -1, -1,
+ NULL, },
+
+ /*
+ * ISAv3.0B Hypervisor Virtualization Interrupt
+ * Also associated system registers, LPCR EE, HEIC, HVICE,
+ * system reset SRR1 reason, etc.
+ */
+ { "hypervisor-virtualization-interrupt",
+ CPU_P9|CPU_P10,
+ ISA_V3_0B, USABLE_HV,
+ HV_CUSTOM, OS_NONE,
+ -1, -1, -1,
+ NULL, },
+
+ /*
+ * POWER9 MCE / machine check exception.
+ */
+ { "machine-check-power9",
+ CPU_P9,
+ ISA_V3_0B, USABLE_HV|USABLE_OS,
+ HV_CUSTOM, OS_CUSTOM,
+ -1, -1, -1,
+ NULL, },
+
+ /*
+ * POWER10 MCE / machine check exception.
+ */
+ { "machine-check-power10",
+ CPU_P10,
+ ISA_V3_0B, USABLE_HV|USABLE_OS,
+ HV_CUSTOM, OS_CUSTOM,
+ -1, -1, -1,
+ NULL, },
+
+ /*
+ * POWER9 PMU / performance monitor unit.
+ */
+ { "performance-monitor-power9",
+ CPU_P9,
+ ISA_V3_0B, USABLE_HV|USABLE_OS,
+ HV_CUSTOM, OS_CUSTOM,
+ -1, -1, -1,
+ NULL, },
+
+ /*
+ * POWER10 PMU / performance monitor unit.
+ */
+ { "performance-monitor-power10",
+ CPU_P10,
+ ISA_V3_1, USABLE_HV|USABLE_OS,
+ HV_CUSTOM, OS_CUSTOM,
+ -1, -1, -1,
+ NULL, },
+
+ /*
+ * ISAv3.0B scv/rfscv system call instructions and exceptions, fscr bit
+ * etc.
+ */
+ { "system-call-vectored",
+ CPU_P9|CPU_P10,
+ ISA_V3_0B, USABLE_OS|USABLE_PR,
+ HV_NONE, OS_CUSTOM,
+ -1, PPC_BITLSHIFT(51), 52,
+ NULL, },
+
+ /*
+ * ISAv3.0B Book3S Chapter 10. Processor Control.
+ * global msgsnd, msgsndp, msgsync, doorbell, etc.
+ */
+ { "processor-control-facility-v3",
+ CPU_P9|CPU_P10,
+ ISA_V3_0B, USABLE_HV|USABLE_OS,
+ HV_CUSTOM, OS_NONE,
+ PPC_BITLSHIFT(53), -1, -1,
+ NULL, },
+
+ /*
+ * ISAv3.0B addpcis instruction
+ */
+ { "pc-relative-addressing",
+ CPU_P9|CPU_P10,
+ ISA_V3_0B, USABLE_HV|USABLE_OS|USABLE_PR,
+ HV_NONE, OS_NONE,
+ -1, -1, -1,
+ NULL, },
+
+ /*
+ * ISAv2.07B Book3S Chapter 7. Timer Facilities
+ * TB, VTB, DEC, HDEC, IC, etc registers and exceptions.
+ * Not including PURR or SPURR registers.
+ */
+ { "timer-facilities",
+ CPU_ALL,
+ ISA_BASE, USABLE_HV|USABLE_OS,
+ HV_NONE, OS_NONE,
+ -1, -1, -1,
+ NULL, },
+
+ /*
+ * ISAv3.0B Book3S Chapter 7. Timer Facilities
+ * Large decrementer and hypervisor decrementer
+ */
+ { "timer-facilities-v3",
+ CPU_P9|CPU_P10,
+ ISA_V3_0B, USABLE_HV|USABLE_OS,
+ HV_NONE, OS_NONE,
+ -1, -1, -1,
+ "timer-facilities", },
+
+ /*
+ * ISAv3.0B deliver a random number instruction (darn)
+ */
+ { "random-number-generator",
+ CPU_P9|CPU_P10,
+ ISA_V3_0B, USABLE_HV|USABLE_OS|USABLE_PR,
+ HV_NONE, OS_NONE,
+ -1, -1, 53,
+ NULL, },
+
+ /*
+ * ISAv3.0B fixed point instructions and registers
+ * multiply-add, modulo, count trailing zeroes, cmprb, cmpeqb,
+ * extswsli, mfvsrld, mtvsrdd, mtvsrws, addex, CA32, OV32,
+ * mcrxrx, setb
+ */
+ { "fixed-point-v3",
+ CPU_P9|CPU_P10,
+ ISA_V3_0B, USABLE_HV|USABLE_OS|USABLE_PR,
+ HV_NONE, OS_NONE,
+ -1, -1, -1,
+ NULL, },
+
+ { "decimal-integer-v3",
+ CPU_P9|CPU_P10,
+ ISA_V3_0B, USABLE_HV|USABLE_OS|USABLE_PR,
+ HV_NONE, OS_NONE,
+ -1, -1, -1,
+ "fixed-point-v3 decimal-integer", },
+
+ /*
+ * ISAv3.0B lightweight mffs
+ */
+ { "floating-point-v3",
+ CPU_P9|CPU_P10,
+ ISA_V3_0B, USABLE_HV|USABLE_OS|USABLE_PR,
+ HV_NONE, OS_NONE,
+ -1, -1, -1,
+ "floating-point", },
+
+ { "decimal-floating-point-v3",
+ CPU_P9|CPU_P10,
+ ISA_V3_0B, USABLE_HV|USABLE_OS|USABLE_PR,
+ HV_NONE, OS_NONE,
+ -1, -1, -1,
+ "floating-point-v3 decimal-floating-point", },
+
+ { "vector-v3",
+ CPU_P9|CPU_P10,
+ ISA_V3_0B, USABLE_HV|USABLE_OS|USABLE_PR,
+ HV_NONE, OS_NONE,
+ -1, -1, -1,
+ "vector", },
+
+ { "vector-scalar-v3",
+ CPU_P9|CPU_P10,
+ ISA_V3_0B, USABLE_HV|USABLE_OS|USABLE_PR,
+ HV_NONE, OS_NONE,
+ -1, -1, -1,
+ "vector-v3 vector-scalar" },
+
+ { "vector-binary128",
+ CPU_P9|CPU_P10,
+ ISA_V3_0B, USABLE_HV|USABLE_OS|USABLE_PR,
+ HV_NONE, OS_NONE,
+ -1, -1, 54,
+ "vector-scalar-v3", },
+
+ { "vector-binary16",
+ CPU_P9|CPU_P10,
+ ISA_V3_0B, USABLE_HV|USABLE_OS|USABLE_PR,
+ HV_NONE, OS_NONE,
+ -1, -1, -1,
+ "vector-v3", },
+
+ /*
+ * ISAv3.0B external exception for EBB
+ */
+ { "event-based-branch-v3",
+ CPU_P9|CPU_P10,
+ ISA_V3_0B, USABLE_HV|USABLE_OS|USABLE_PR,
+ HV_NONE, OS_NONE,
+ -1, -1, -1,
+ "event-based-branch", },
+
+ /*
+ * ISAv3.0B Atomic Memory Operations (AMO)
+ */
+ { "atomic-memory-operations",
+ CPU_P9|CPU_P10,
+ ISA_V3_0B, USABLE_HV|USABLE_OS|USABLE_PR,
+ HV_NONE, OS_NONE,
+ -1, -1, -1,
+ NULL, },
+
+ /*
+ * ISAv3.0B Copy-Paste Facility
+ */
+ { "copy-paste",
+ CPU_P9|CPU_P10,
+ ISA_V3_0B, USABLE_HV|USABLE_OS|USABLE_PR,
+ HV_NONE, OS_NONE,
+ -1, -1, -1,
+ NULL, },
+
+ /*
+ * ISAv3.0B GSR SPR register
+ * POWER9 does not implement it
+ */
+ { "group-start-register",
+ 0,
+ ISA_V3_0B, USABLE_HV|USABLE_OS,
+ HV_NONE, OS_NONE,
+ -1, -1, -1,
+ NULL, },
+
+ /*
+ * Enable matrix multiply accumulate.
+ */
+ { "matrix-multiply-accumulate",
+ CPU_P10,
+ ISA_V3_1, USABLE_PR,
+ HV_CUSTOM, OS_CUSTOM,
+ -1, -1, 49,
+ NULL, },
+
+ /*
+ * Enable prefix instructions. Toolchains assume this is
+ * enabled when compiling for ISA 3.1.
+ */
+ { "prefix-instructions",
+ CPU_P10,
+ ISA_V3_1, USABLE_HV|USABLE_OS|USABLE_PR,
+ HV_HFSCR, OS_FSCR,
+ 13, 13, -1,
+ NULL, },
+
+ /*
+ * Due to hardware bugs in POWER9, the hypervisor needs to assist
+ * guests.
+ *
+ * Presence of this feature indicates presence of the bug.
+ *
+ * See linux kernel commit 4bb3c7a0208f
+ * and linux Documentation/powerpc/transactional_memory.txt
+ */
+ { "tm-suspend-hypervisor-assist",
+ CPU_P9_DD2_2|CPU_P9_DD2_3|CPU_P9P,
+ ISA_V3_0B, USABLE_HV,
+ HV_CUSTOM, OS_NONE,
+ -1, -1, -1,
+ NULL, },
+
+ /*
+ * Due to hardware bugs in POWER9, the hypervisor can hit
+ * CPU bugs in the operations it needs to do for
+ * tm-suspend-hypervisor-assist.
+ *
+ * Presence of this "feature" means processor is affected by the bug.
+ *
+ * See linux kernel commit 4bb3c7a0208f
+ * and linux Documentation/powerpc/transactional_memory.txt
+ */
+ { "tm-suspend-xer-so-bug",
+ CPU_P9_DD2_2,
+ ISA_V3_0B, USABLE_HV,
+ HV_CUSTOM, OS_NONE,
+ -1, -1, -1,
+ NULL, },
+};
+
+static void add_cpu_feature_nodeps(struct dt_node *features,
+ const struct cpu_feature *f)
+{
+ struct dt_node *feature;
+
+ feature = dt_new(features, f->name);
+ assert(feature);
+
+ dt_add_property_cells(feature, "isa", f->isa);
+ dt_add_property_cells(feature, "usable-privilege", f->usable_privilege);
+
+ if (f->usable_privilege & USABLE_HV) {
+ if (f->hv_support != HV_NONE) {
+ uint32_t s = 0;
+ if (f->hv_support == HV_HFSCR)
+ s |= HV_SUPPORT_HFSCR;
+
+ dt_add_property_cells(feature, "hv-support", s);
+ if (f->hfscr_bit_nr != -1)
+ dt_add_property_cells(feature, "hfscr-bit-nr", f->hfscr_bit_nr);
+ } else {
+ assert(f->hfscr_bit_nr == -1);
+ }
+ }
+
+ if (f->usable_privilege & USABLE_OS) {
+ if (f->os_support != OS_NONE) {
+ uint32_t s = 0;
+ if (f->os_support == OS_FSCR)
+ s |= OS_SUPPORT_FSCR;
+ dt_add_property_cells(feature, "os-support", s);
+ if (f->fscr_bit_nr != -1)
+ dt_add_property_cells(feature, "fscr-bit-nr", f->fscr_bit_nr);
+ } else {
+ assert(f->fscr_bit_nr == -1);
+ }
+ }
+
+ if (f->usable_privilege & USABLE_PR) {
+ if (f->hwcap_bit_nr != -1)
+ dt_add_property_cells(feature, "hwcap-bit-nr", f->hwcap_bit_nr);
+ }
+
+ if (f->dependencies_names)
+ dt_add_property(feature, "dependencies", NULL, 0);
+}
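+
+/*
+ * Illustrative sketch (not generated output): with the table above, the
+ * "mmu-radix" entry ends up as roughly
+ *
+ * mmu-radix {
+ * isa = <3000>;
+ * usable-privilege = <0x6>; // HV | OS
+ * hv-support = <0x0>; // HV_CUSTOM, no HFSCR bit
+ * os-support = <0x0>; // OS_CUSTOM, no FSCR bit
+ * };
+ *
+ * under /cpus/ibm,powerpc-cpu-features, with a "dependencies" phandle list
+ * added by add_cpufeatures_dependencies() only for entries that name
+ * dependencies.
+ */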
+
+static void add_cpufeatures_dependencies(struct dt_node *features)
+{
+ struct dt_node *feature;
+
+ dt_for_each_node(features, feature) {
+ const struct cpu_feature *f = NULL;
+ const char *deps_names;
+ struct dt_property *deps;
+ int nr_deps;
+ int i;
+
+ /* Find features with dependencies */
+
+ deps = __dt_find_property(feature, "dependencies");
+ if (!deps)
+ continue;
+
+ /* Find the matching cpu table */
+ for (i = 0; i < ARRAY_SIZE(cpu_features_table); i++) {
+ f = &cpu_features_table[i];
+ if (!strcmp(f->name, feature->name))
+ break;
+ }
+ assert(f);
+ assert(f->dependencies_names);
+
+ /*
+ * Count the number of dependencies and allocate space
+ * for their phandles in the property.
+ */
+ deps_names = f->dependencies_names;
+ nr_deps = strcount(deps_names, " ") + 1;
+ dt_resize_property(&deps, nr_deps * sizeof(u32));
+
+ DBG("feature %s has %d dependencies (%s)\n", f->name, nr_deps, deps_names);
+ /*
+ * For each one, find the feature it depends on, then advance
+ * to the next name.
+ */
+ for (i = 0; i < nr_deps; i++) {
+ struct dt_node *dep;
+ int len;
+
+ if (nr_deps - i == 1)
+ len = strlen(deps_names);
+ else
+ len = strchr(deps_names, ' ') - deps_names;
+
+ dt_for_each_node(features, dep) {
+ if (!strncmp(deps_names, dep->name, len))
+ goto found_dep;
+ }
+
+ prlog(PR_ERR, "CPUFT: feature %s dependencies not found\n", f->name);
+ break;
+found_dep:
+ DBG(" %s found dep (%s)\n", f->name, dep->name);
+ dt_property_set_cell(deps, i, dep->phandle);
+
+ /* Advance over the name + delimiter */
+ deps_names += len + 1;
+ }
+ }
+}
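+
+/*
+ * Example of the pass above: "decimal-integer-v3" names
+ * "fixed-point-v3 decimal-integer" as its dependencies, so its
+ * "dependencies" property is resized to two cells and filled with the
+ * phandles of those two feature nodes.
+ */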
+
+static void add_cpufeatures(struct dt_node *cpus,
+ uint32_t cpu_feature_isa, uint32_t cpu_feature_cpu,
+ const char *cpu_name)
+{
+ struct dt_node *features;
+ int i;
+
+ DBG("creating cpufeatures for cpu:%d isa:%d\n", cpu_feature_cpu, cpu_feature_isa);
+
+ features = dt_new(cpus, "ibm,powerpc-cpu-features");
+ assert(features);
+
+ dt_add_property_cells(features, "isa", cpu_feature_isa);
+
+ dt_add_property_string(features, "device_type", "cpu-features");
+ dt_add_property_string(features, "compatible", "ibm,powerpc-cpu-features");
+ dt_add_property_string(features, "display-name", cpu_name);
+
+ /* add without dependencies */
+ for (i = 0; i < ARRAY_SIZE(cpu_features_table); i++) {
+ const struct cpu_feature *f = &cpu_features_table[i];
+
+ if (f->cpus_supported & cpu_feature_cpu) {
+ DBG(" '%s'\n", f->name);
+ add_cpu_feature_nodeps(features, f);
+ }
+ }
+
+ /* dependency construction pass */
+ add_cpufeatures_dependencies(features);
+}
+
+void dt_add_cpufeatures(struct dt_node *root)
+{
+ int version;
+ uint32_t cpu_feature_isa = 0;
+ uint32_t cpu_feature_cpu = 0;
+ struct dt_node *cpus;
+ const char *cpu_name = NULL;
+
+ version = mfspr(SPR_PVR);
+ switch(PVR_TYPE(version)) {
+ case PVR_TYPE_P8:
+ if (!cpu_name)
+ cpu_name = "POWER8";
+ /* fallthrough */
+ case PVR_TYPE_P8E:
+ if (!cpu_name)
+ cpu_name = "POWER8E";
+ cpu_feature_isa = ISA_V2_07B;
+ if (PVR_VERS_MAJ(version) == 1)
+ cpu_feature_cpu = CPU_P8_DD1;
+ else
+ cpu_feature_cpu = CPU_P8_DD2;
+ break;
+ case PVR_TYPE_P8NVL:
+ cpu_name = "POWER8NVL";
+ cpu_feature_isa = ISA_V2_07B;
+ cpu_feature_cpu = CPU_P8_DD2;
+ break;
+ case PVR_TYPE_P9:
+ if (!cpu_name)
+ cpu_name = "POWER9";
+
+ cpu_feature_isa = ISA_V3_0B;
+ if (is_power9n(version) &&
+ (PVR_VERS_MAJ(version) == 2)) {
+ /* P9N DD2.x */
+ switch (PVR_VERS_MIN(version)) {
+ case 0:
+ case 1:
+ cpu_feature_cpu = CPU_P9_DD2_0_1;
+ break;
+ case 2:
+ cpu_feature_cpu = CPU_P9_DD2_2;
+ break;
+ case 3:
+ cpu_feature_cpu = CPU_P9_DD2_3;
+ break;
+ default:
+ assert(0);
+ }
+ } else if (is_power9c(version) &&
+ (PVR_VERS_MAJ(version) == 1)) {
+ /* P9C DD1.x */
+ switch (PVR_VERS_MIN(version)) {
+ case 1:
+ /* Cumulus DD1.1 => Nimbus DD2.1 */
+ cpu_feature_cpu = CPU_P9_DD2_0_1;
+ break;
+ case 2:
+ /* Cumulus DD1.2 */
+ cpu_feature_cpu = CPU_P9_DD2_2;
+ break;
+ case 3:
+ /* Cumulus DD1.3 */
+ cpu_feature_cpu = CPU_P9_DD2_3;
+ break;
+ default:
+ assert(0);
+ }
+ } else {
+ assert(0);
+ }
+
+ break;
+ case PVR_TYPE_P9P:
+ if (!cpu_name)
+ cpu_name = "POWER9P";
+
+ cpu_feature_isa = ISA_V3_0B;
+ cpu_feature_cpu = CPU_P9P;
+ break;
+ case PVR_TYPE_P10:
+ if (!cpu_name)
+ cpu_name = "POWER10";
+
+ cpu_feature_isa = ISA_V3_1;
+ cpu_feature_cpu = CPU_P10;
+ break;
+ default:
+ return;
+ }
+
+ cpus = dt_new_check(root, "cpus");
+
+ add_cpufeatures(cpus, cpu_feature_isa, cpu_feature_cpu, cpu_name);
+}
diff --git a/roms/skiboot/core/device.c b/roms/skiboot/core/device.c
new file mode 100644
index 000000000..b102dd973
--- /dev/null
+++ b/roms/skiboot/core/device.c
@@ -0,0 +1,1128 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Manipulate the device tree
+ *
+ * Copyright 2013-2019 IBM Corp.
+ */
+
+#include <stdarg.h>
+#include <device.h>
+#include <stdlib.h>
+#include <skiboot.h>
+#include <libfdt/libfdt.h>
+#include <libfdt/libfdt_internal.h>
+#include <ccan/str/str.h>
+#include <ccan/endian/endian.h>
+#include <inttypes.h>
+
+/* Used to give unique handles. */
+u32 last_phandle = 0;
+
+struct dt_node *dt_root;
+struct dt_node *dt_chosen;
+
+static const char *take_name(const char *name)
+{
+ if (!is_rodata(name) && !(name = strdup(name))) {
+ prerror("Failed to allocate copy of name");
+ abort();
+ }
+ return name;
+}
+
+static void free_name(const char *name)
+{
+ if (!is_rodata(name))
+ free((char *)name);
+}
+
+static struct dt_node *new_node(const char *name)
+{
+ struct dt_node *node = malloc(sizeof *node);
+ if (!node) {
+ prerror("Failed to allocate node\n");
+ abort();
+ }
+
+ node->name = take_name(name);
+ node->parent = NULL;
+ list_head_init(&node->properties);
+ list_head_init(&node->children);
+ /* FIXME: locking? */
+ node->phandle = new_phandle();
+ return node;
+}
+
+struct dt_node *dt_new_root(const char *name)
+{
+ return new_node(name);
+}
+
+static const char *get_unitname(const struct dt_node *node)
+{
+ const char *c = strchr(node->name, '@');
+
+ if (!c)
+ return NULL;
+
+ return c + 1;
+}
+
+int dt_cmp_subnodes(const struct dt_node *a, const struct dt_node *b)
+{
+ const char *a_unit = get_unitname(a);
+ const char *b_unit = get_unitname(b);
+
+ ptrdiff_t basenamelen = a_unit - a->name;
+
+ /* sort hex unit addresses by number */
+ if (a_unit && b_unit && !strncmp(a->name, b->name, basenamelen)) {
+ unsigned long long a_num, b_num;
+ char *a_end, *b_end;
+
+ a_num = strtoul(a_unit, &a_end, 16);
+ b_num = strtoul(b_unit, &b_end, 16);
+
+ /* only compare if the unit addr parsed correctly */
+ if (*a_end == 0 && *b_end == 0)
+ return (a_num > b_num) - (a_num < b_num);
+ }
+
+ return strcmp(a->name, b->name);
+}
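+
+/*
+ * For example, "core@9" sorts before "core@10" here (0x9 < 0x10), whereas a
+ * plain strcmp() of the full names would put "core@10" first.
+ */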
+
+bool dt_attach_root(struct dt_node *parent, struct dt_node *root)
+{
+ struct dt_node *node;
+
+ assert(!root->parent);
+
+ if (list_empty(&parent->children)) {
+ list_add(&parent->children, &root->list);
+ root->parent = parent;
+
+ return true;
+ }
+
+ dt_for_each_child(parent, node) {
+ int cmp = dt_cmp_subnodes(node, root);
+
+ /* Look for duplicates */
+ if (cmp == 0) {
+ prerror("DT: %s failed, duplicate %s\n",
+ __func__, root->name);
+ return false;
+ }
+
+ /* insert before the first node that's larger than
+ * the node we're inserting */
+ if (cmp > 0)
+ break;
+ }
+
+ list_add_before(&parent->children, &root->list, &node->list);
+ root->parent = parent;
+
+ return true;
+}
+
+static inline void dt_destroy(struct dt_node *dn)
+{
+ if (!dn)
+ return;
+
+ free_name(dn->name);
+ free(dn);
+}
+
+struct dt_node *dt_new(struct dt_node *parent, const char *name)
+{
+ struct dt_node *new;
+ assert(parent);
+
+ new = new_node(name);
+ if (!dt_attach_root(parent, new)) {
+ dt_destroy(new);
+ return NULL;
+ }
+ return new;
+}
+
+/*
+ * Low-level variant. We export this because there are "weird" address
+ * formats, such as LPC/ISA bus addresses, which have a letter to identify
+ * which bus space the address is in.
+ */
+struct dt_node *__dt_find_by_name_addr(struct dt_node *parent, const char *name,
+ const char *addr)
+{
+ struct dt_node *node;
+
+ if (list_empty(&parent->children))
+ return NULL;
+
+ dt_for_each_child(parent, node) {
+ const char *unit = get_unitname(node);
+ int len;
+
+ if (!unit)
+ continue;
+
+ /* match the name */
+ len = (int) (unit - node->name) - 1;
+ if (strncmp(node->name, name, len))
+ continue;
+
+ /* match the unit */
+ if (strcmp(unit, addr) == 0)
+ return node;
+ }
+
+ dt_for_each_child(parent, node) {
+ struct dt_node *ret = __dt_find_by_name_addr(node, name, addr);
+
+ if (ret)
+ return ret;
+ }
+
+ return NULL;
+}
+
+struct dt_node *dt_find_by_name_addr(struct dt_node *parent, const char *name,
+ uint64_t addr)
+{
+ char addr_str[16 + 1]; /* 16 hex digits of a 64-bit value + NUL */
+ snprintf(addr_str, sizeof(addr_str), "%" PRIx64, addr);
+
+ return __dt_find_by_name_addr(parent, name, addr_str);
+}
+
+struct dt_node *dt_new_addr(struct dt_node *parent, const char *name,
+ uint64_t addr)
+{
+ char *lname;
+ struct dt_node *new;
+ size_t len;
+
+ assert(parent);
+ len = strlen(name) + STR_MAX_CHARS(addr) + 2;
+ lname = malloc(len);
+ if (!lname)
+ return NULL;
+ snprintf(lname, len, "%s@%llx", name, (long long)addr);
+ new = new_node(lname);
+ free(lname);
+ if (!dt_attach_root(parent, new)) {
+ dt_destroy(new);
+ return NULL;
+ }
+ return new;
+}
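+
+/*
+ * e.g. dt_new_addr(parent, "memory", 0x100000000ull) creates a child node
+ * named "memory@100000000" (the unit address is printed as lower-case hex
+ * with no leading zeroes).
+ */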
+
+struct dt_node *dt_new_2addr(struct dt_node *parent, const char *name,
+ uint64_t addr0, uint64_t addr1)
+{
+ char *lname;
+ struct dt_node *new;
+ size_t len;
+ assert(parent);
+
+ len = strlen(name) + 2*STR_MAX_CHARS(addr0) + 3;
+ lname = malloc(len);
+ if (!lname)
+ return NULL;
+ snprintf(lname, len, "%s@%llx,%llx",
+ name, (long long)addr0, (long long)addr1);
+ new = new_node(lname);
+ free(lname);
+ if (!dt_attach_root(parent, new)) {
+ dt_destroy(new);
+ return NULL;
+ }
+ return new;
+}
+
+static struct dt_node *__dt_copy(struct dt_node *node, struct dt_node *parent,
+ bool root)
+{
+ struct dt_property *prop, *new_prop;
+ struct dt_node *new_node, *child;
+
+ new_node = dt_new(parent, node->name);
+ if (!new_node)
+ return NULL;
+
+ list_for_each(&node->properties, prop, list) {
+ new_prop = dt_add_property(new_node, prop->name, prop->prop,
+ prop->len);
+ if (!new_prop)
+ goto fail;
+ }
+
+ list_for_each(&node->children, child, list) {
+ child = __dt_copy(child, new_node, false);
+ if (!child)
+ goto fail;
+ }
+
+ return new_node;
+
+fail:
+ /* dt_free will recurse for us, so only free when we unwind to the
+ * top-level failure */
+ if (root)
+ dt_free(new_node);
+ return NULL;
+}
+
+struct dt_node *dt_copy(struct dt_node *node, struct dt_node *parent)
+{
+ return __dt_copy(node, parent, true);
+}
+
+char *dt_get_path(const struct dt_node *node)
+{
+ unsigned int len = 0;
+ const struct dt_node *n;
+ char *path, *p;
+
+ /* Dealing with NULL is for test/debug purposes */
+ if (!node)
+ return strdup("<NULL>");
+
+ for (n = node; n; n = n->parent) {
+ len += strlen(n->name);
+ if (n->parent || n == node)
+ len++;
+ }
+ path = zalloc(len + 1);
+ assert(path);
+ p = path + len;
+ for (n = node; n; n = n->parent) {
+ len = strlen(n->name);
+ p -= len;
+ memcpy(p, n->name, len);
+ if (n->parent || n == node)
+ *(--p) = '/';
+ }
+ assert(p == path);
+
+ return p;
+}
+
+static const char *__dt_path_split(const char *p,
+ const char **namep, unsigned int *namel,
+ const char **addrp, unsigned int *addrl)
+{
+ const char *at, *sl;
+
+ *namel = *addrl = 0;
+
+ /* Skip initial '/' */
+ while (*p == '/')
+ p++;
+
+ /* Check empty path */
+ if (*p == 0)
+ return p;
+
+ at = strchr(p, '@');
+ sl = strchr(p, '/');
+ if (sl == NULL)
+ sl = p + strlen(p);
+ if (sl < at)
+ at = NULL;
+ if (at) {
+ *addrp = at + 1;
+ *addrl = sl - at - 1;
+ }
+ *namep = p;
+ *namel = at ? (at - p) : (sl - p);
+
+ return sl;
+}
+
+struct dt_node *dt_find_by_path(struct dt_node *root, const char *path)
+{
+ struct dt_node *n;
+ const char *pn, *pa, *p = path, *nn, *na;
+ unsigned int pnl, pal, nnl, nal;
+ bool match;
+
+ /* Walk path components */
+ while (*p) {
+ /* Extract next path component */
+ p = __dt_path_split(p, &pn, &pnl, &pa, &pal);
+ if (pnl == 0 && pal == 0)
+ break;
+
+ /* Compare with each child node */
+ match = false;
+ list_for_each(&root->children, n, list) {
+ match = true;
+ __dt_path_split(n->name, &nn, &nnl, &na, &nal);
+ if (pnl && (pnl != nnl || strncmp(pn, nn, pnl)))
+ match = false;
+ if (pal && (pal != nal || strncmp(pa, na, pal)))
+ match = false;
+ if (match) {
+ root = n;
+ break;
+ }
+ }
+
+ /* No child match */
+ if (!match)
+ return NULL;
+ }
+ return root;
+}
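+
+/*
+ * Note: a path component without a unit address matches on the name part
+ * only, so e.g. "/cpus/cpu" resolves to the first "cpu@N" child found,
+ * while "/cpus/cpu@10" must match both name and unit address.
+ */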
+
+struct dt_node *dt_find_by_name(struct dt_node *root, const char *name)
+{
+ struct dt_node *child, *match;
+
+ list_for_each(&root->children, child, list) {
+ if (!strcmp(child->name, name))
+ return child;
+
+ match = dt_find_by_name(child, name);
+ if (match)
+ return match;
+ }
+
+ return NULL;
+}
+
+
+struct dt_node *dt_new_check(struct dt_node *parent, const char *name)
+{
+ struct dt_node *node = dt_find_by_name(parent, name);
+
+ if (!node) {
+ node = dt_new(parent, name);
+ assert(node);
+ }
+
+ return node;
+}
+
+
+struct dt_node *dt_find_by_phandle(struct dt_node *root, u32 phandle)
+{
+ struct dt_node *node;
+
+ dt_for_each_node(root, node)
+ if (node->phandle == phandle)
+ return node;
+ return NULL;
+}
+
+static struct dt_property *new_property(struct dt_node *node,
+ const char *name, size_t size)
+{
+ struct dt_property *p = malloc(sizeof(*p) + size);
+ char *path;
+
+ if (!p) {
+ path = dt_get_path(node);
+ prerror("Failed to allocate property \"%s\" for %s of %zu bytes\n",
+ name, path, size);
+ free(path);
+ abort();
+ }
+ if (dt_find_property(node, name)) {
+ path = dt_get_path(node);
+ prerror("Duplicate property \"%s\" in node %s\n",
+ name, path);
+ free(path);
+ abort();
+
+ }
+
+ p->name = take_name(name);
+ p->len = size;
+ list_add_tail(&node->properties, &p->list);
+ return p;
+}
+
+struct dt_property *dt_add_property(struct dt_node *node,
+ const char *name,
+ const void *val, size_t size)
+{
+ struct dt_property *p;
+
+ /*
+ * Filter out phandle properties, we re-generate them
+ * when flattening
+ */
+ if (strcmp(name, "linux,phandle") == 0 ||
+ strcmp(name, "phandle") == 0) {
+ assert(size == 4);
+ node->phandle = *(const u32 *)val;
+ if (node->phandle >= last_phandle)
+ set_last_phandle(node->phandle);
+ return NULL;
+ }
+
+ p = new_property(node, name, size);
+ if (size)
+ memcpy(p->prop, val, size);
+ return p;
+}
+
+void dt_resize_property(struct dt_property **prop, size_t len)
+{
+ size_t new_len = sizeof(**prop) + len;
+
+ *prop = realloc(*prop, new_len);
+ (*prop)->len = len;
+
+ /* Fix up linked lists in case we moved. (note: not an empty list). */
+ (*prop)->list.next->prev = &(*prop)->list;
+ (*prop)->list.prev->next = &(*prop)->list;
+}
+
+struct dt_property *dt_add_property_string(struct dt_node *node,
+ const char *name,
+ const char *value)
+{
+ size_t len = 0;
+ if (value)
+ len = strlen(value) + 1;
+ return dt_add_property(node, name, value, len);
+}
+
+struct dt_property *dt_add_property_nstr(struct dt_node *node,
+ const char *name,
+ const char *value, unsigned int vlen)
+{
+ struct dt_property *p;
+ char *tmp = zalloc(vlen + 1);
+
+ if (!tmp)
+ return NULL;
+
+ strncpy(tmp, value, vlen);
+ p = dt_add_property(node, name, tmp, strlen(tmp)+1);
+ free(tmp);
+
+ return p;
+}
+
+struct dt_property *__dt_add_property_cells(struct dt_node *node,
+ const char *name,
+ int count, ...)
+{
+ struct dt_property *p;
+ fdt32_t *val;
+ unsigned int i;
+ va_list args;
+
+ p = new_property(node, name, count * sizeof(u32));
+ val = (fdt32_t *)p->prop;
+ va_start(args, count);
+ for (i = 0; i < count; i++)
+ val[i] = cpu_to_fdt32(va_arg(args, u32));
+ va_end(args);
+ return p;
+}
+
+struct dt_property *__dt_add_property_u64s(struct dt_node *node,
+ const char *name,
+ int count, ...)
+{
+ struct dt_property *p;
+ fdt64_t *val;
+ unsigned int i;
+ va_list args;
+
+ p = new_property(node, name, count * sizeof(u64));
+ val = (fdt64_t *)p->prop;
+ va_start(args, count);
+ for (i = 0; i < count; i++)
+ val[i] = cpu_to_fdt64(va_arg(args, u64));
+ va_end(args);
+ return p;
+}
+
+struct dt_property *__dt_add_property_strings(struct dt_node *node,
+ const char *name,
+ int count, ...)
+{
+ struct dt_property *p;
+ unsigned int i, size;
+ va_list args;
+ const char *sstr;
+ char *s;
+
+ va_start(args, count);
+ for (i = size = 0; i < count; i++) {
+ sstr = va_arg(args, const char *);
+ if (sstr)
+ size += strlen(sstr) + 1;
+ }
+ va_end(args);
+ if (!size)
+ size = 1;
+ p = new_property(node, name, size);
+ s = (char *)p->prop;
+ *s = 0;
+ va_start(args, count);
+ for (i = 0; i < count; i++) {
+ sstr = va_arg(args, const char *);
+ if (sstr) {
+ strcpy(s, sstr);
+ s = s + strlen(sstr) + 1;
+ }
+ }
+ va_end(args);
+ return p;
+}
+
+void dt_del_property(struct dt_node *node, struct dt_property *prop)
+{
+ list_del_from(&node->properties, &prop->list);
+ free_name(prop->name);
+ free(prop);
+}
+
+u32 dt_property_get_cell(const struct dt_property *prop, u32 index)
+{
+ assert(prop->len >= (index+1)*sizeof(u32));
+ /* Always aligned, so this works. */
+ return fdt32_to_cpu(((const fdt32_t *)prop->prop)[index]);
+}
+
+u64 dt_property_get_u64(const struct dt_property *prop, u32 index)
+{
+ assert(prop->len >= (index+1)*sizeof(u64));
+ /* Always aligned, so this works. */
+ return fdt64_to_cpu(((const fdt64_t *)prop->prop)[index]);
+}
+
+void dt_property_set_cell(struct dt_property *prop, u32 index, u32 val)
+{
+ assert(prop->len >= (index+1)*sizeof(u32));
+ /* Always aligned, so this works. */
+ ((fdt32_t *)prop->prop)[index] = cpu_to_fdt32(val);
+}
+
+/* First child of this node. */
+struct dt_node *dt_first(const struct dt_node *root)
+{
+ return list_top(&root->children, struct dt_node, list);
+}
+
+/* Return next node, or NULL. */
+struct dt_node *dt_next(const struct dt_node *root,
+ const struct dt_node *prev)
+{
+ if (!prev) {
+ struct dt_node *first = dt_first(root);
+
+ if (!first)
+ return NULL;
+ else
+ return first;
+ }
+
+ /* Children? */
+ if (!list_empty(&prev->children))
+ return dt_first(prev);
+
+ do {
+ /* More siblings? */
+ if (prev->list.next != &prev->parent->children.n)
+			return list_entry(prev->list.next, struct dt_node, list);
+
+ /* No more siblings, move up to parent. */
+ prev = prev->parent;
+ } while (prev != root);
+
+ return NULL;
+}
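+
+/*
+ * A minimal traversal sketch: starting from NULL, dt_next() visits every
+ * node below root in depth-first order, e.g.
+ *
+ *	struct dt_node *n = NULL;
+ *
+ *	while ((n = dt_next(root, n)))
+ *		do_something(n);
+ *
+ * (do_something() is a placeholder for whatever per-node work is needed.)
+ */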
+
+struct dt_property *__dt_find_property(struct dt_node *node, const char *name)
+{
+ struct dt_property *i;
+
+ list_for_each(&node->properties, i, list)
+ if (strcmp(i->name, name) == 0)
+ return i;
+ return NULL;
+}
+
+const struct dt_property *dt_find_property(const struct dt_node *node,
+ const char *name)
+{
+ const struct dt_property *i;
+
+ list_for_each(&node->properties, i, list)
+ if (strcmp(i->name, name) == 0)
+ return i;
+ return NULL;
+}
+
+void dt_check_del_prop(struct dt_node *node, const char *name)
+{
+ struct dt_property *p;
+
+ p = __dt_find_property(node, name);
+ if (p)
+ dt_del_property(node, p);
+}
+
+const struct dt_property *dt_require_property(const struct dt_node *node,
+ const char *name, int wanted_len)
+{
+ const struct dt_property *p = dt_find_property(node, name);
+
+ if (!p) {
+ const char *path = dt_get_path(node);
+
+ prerror("DT: Missing required property %s/%s\n",
+ path, name);
+ assert(false);
+ }
+ if (wanted_len >= 0 && p->len != wanted_len) {
+ const char *path = dt_get_path(node);
+
+ prerror("DT: Unexpected property length %s/%s\n",
+ path, name);
+ prerror("DT: Expected len: %d got len: %zu\n",
+ wanted_len, p->len);
+ assert(false);
+ }
+
+ return p;
+}
+
+bool dt_has_node_property(const struct dt_node *node,
+ const char *name, const char *val)
+{
+ const struct dt_property *p = dt_find_property(node, name);
+
+ if (!p)
+ return false;
+ if (!val)
+ return true;
+
+ return p->len == strlen(val) + 1 && memcmp(p->prop, val, p->len) == 0;
+}
+
+bool dt_prop_find_string(const struct dt_property *p, const char *s)
+{
+ const char *c, *end;
+
+ if (!p)
+ return false;
+ c = p->prop;
+ end = c + p->len;
+
+ while(c < end) {
+ if (!strcasecmp(s, c))
+ return true;
+ c += strlen(c) + 1;
+ }
+ return false;
+}
+
+bool dt_node_is_compatible(const struct dt_node *node, const char *compat)
+{
+ const struct dt_property *p = dt_find_property(node, "compatible");
+
+ return dt_prop_find_string(p, compat);
+}
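+
+/*
+ * "compatible" is a list of NUL-terminated strings packed back to back.
+ * For example, a property whose bytes are "ibm,power9-xscom\0ibm,xscom\0"
+ * matches dt_node_is_compatible(node, "ibm,power9-xscom") as well as
+ * dt_node_is_compatible(node, "ibm,xscom"); the match is case-insensitive.
+ */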
+
+struct dt_node *dt_find_compatible_node(struct dt_node *root,
+ struct dt_node *prev,
+ const char *compat)
+{
+ struct dt_node *node = prev;
+
+ while ((node = dt_next(root, node)))
+ if (dt_node_is_compatible(node, compat))
+ return node;
+ return NULL;
+}
+
+u64 dt_prop_get_u64(const struct dt_node *node, const char *prop)
+{
+ const struct dt_property *p = dt_require_property(node, prop, 8);
+
+ return ((u64)dt_property_get_cell(p, 0) << 32)
+ | dt_property_get_cell(p, 1);
+}
+
+u64 dt_prop_get_u64_def(const struct dt_node *node, const char *prop, u64 def)
+{
+ const struct dt_property *p = dt_find_property(node, prop);
+
+ if (!p)
+ return def;
+
+ return ((u64)dt_property_get_cell(p, 0) << 32)
+ | dt_property_get_cell(p, 1);
+}
+
+u32 dt_prop_get_u32(const struct dt_node *node, const char *prop)
+{
+ const struct dt_property *p = dt_require_property(node, prop, 4);
+
+ return dt_property_get_cell(p, 0);
+}
+
+u32 dt_prop_get_u32_def(const struct dt_node *node, const char *prop, u32 def)
+{
+ const struct dt_property *p = dt_find_property(node, prop);
+
+ if (!p)
+ return def;
+
+ return dt_property_get_cell(p, 0);
+}
+
+const void *dt_prop_get(const struct dt_node *node, const char *prop)
+{
+ const struct dt_property *p = dt_require_property(node, prop, -1);
+
+ return p->prop;
+}
+
+const void *dt_prop_get_def(const struct dt_node *node, const char *prop,
+ void *def)
+{
+ const struct dt_property *p = dt_find_property(node, prop);
+
+ return p ? p->prop : def;
+}
+
+const void *dt_prop_get_def_size(const struct dt_node *node, const char *prop,
+ void *def, size_t *len)
+{
+ const struct dt_property *p = dt_find_property(node, prop);
+ *len = 0;
+ if (p)
+ *len = p->len;
+
+ return p ? p->prop : def;
+}
+
+u32 dt_prop_get_cell(const struct dt_node *node, const char *prop, u32 cell)
+{
+ const struct dt_property *p = dt_require_property(node, prop, -1);
+
+ return dt_property_get_cell(p, cell);
+}
+
+u32 dt_prop_get_cell_def(const struct dt_node *node, const char *prop,
+ u32 cell, u32 def)
+{
+ const struct dt_property *p = dt_find_property(node, prop);
+
+ if (!p)
+ return def;
+
+ return dt_property_get_cell(p, cell);
+}
+
+void dt_free(struct dt_node *node)
+{
+ struct dt_node *child;
+ struct dt_property *p;
+
+ while ((child = list_top(&node->children, struct dt_node, list)))
+ dt_free(child);
+
+ while ((p = list_pop(&node->properties, struct dt_property, list))) {
+ free_name(p->name);
+ free(p);
+ }
+
+ if (node->parent)
+ list_del_from(&node->parent->children, &node->list);
+ dt_destroy(node);
+}
+
+int dt_expand_node(struct dt_node *node, const void *fdt, int fdt_node)
+{
+ const struct fdt_property *prop;
+ int offset, nextoffset, err;
+ struct dt_node *child;
+ const char *name;
+ uint32_t tag;
+
+ if (((err = fdt_check_header(fdt)) != 0)
+ || ((err = fdt_check_node_offset_(fdt, fdt_node)) < 0)) {
+ prerror("FDT: Error %d parsing node 0x%x\n", err, fdt_node);
+ return -1;
+ }
+
+ nextoffset = err;
+ do {
+ offset = nextoffset;
+
+ tag = fdt_next_tag(fdt, offset, &nextoffset);
+ switch (tag) {
+ case FDT_PROP:
+ prop = fdt_offset_ptr_(fdt, offset);
+ name = fdt_string(fdt, fdt32_to_cpu(prop->nameoff));
+ dt_add_property(node, name, prop->data,
+ fdt32_to_cpu(prop->len));
+ break;
+ case FDT_BEGIN_NODE:
+ name = fdt_get_name(fdt, offset, NULL);
+ child = dt_new_root(name);
+ assert(child);
+ nextoffset = dt_expand_node(child, fdt, offset);
+
+ /*
+			 * This may fail in the case of a duplicate; keep
+			 * going for now, though we may ultimately want to
+			 * assert here.
+ */
+ if (!dt_attach_root(node, child))
+ /**
+ * @fwts-label DTHasDuplicateNodeID
+ * @fwts-advice OPAL will parse the Flattened
+				 * Device Tree (FDT), which can be generated
+ * from different firmware sources. During
+ * expansion of FDT, OPAL observed a node
+ * assigned multiple times (a duplicate). This
+ * indicates either a Hostboot bug *OR*, more
+ * likely, a bug in the platform XML. Check
+ * the platform XML for duplicate IDs for
+ * this type of device. Because of this
+ * duplicate node, OPAL won't add the hardware
+ * device found with a duplicate node ID into
+ * DT, rendering the corresponding device not
+ * functional.
+ */
+ prlog(PR_ERR, "DT: Found duplicate node: %s\n",
+ child->name);
+ break;
+ case FDT_END:
+ return -1;
+ }
+ } while (tag != FDT_END_NODE);
+
+ return nextoffset;
+}
+
+void dt_expand(const void *fdt)
+{
+ prlog(PR_DEBUG, "FDT: Parsing fdt @%p\n", fdt);
+
+ if (dt_expand_node(dt_root, fdt, 0) < 0)
+ abort();
+}
+
+u64 dt_get_number(const void *pdata, unsigned int cells)
+{
+ const __be32 *p = pdata;
+ u64 ret = 0;
+
+ while(cells--)
+ ret = (ret << 32) | be32_to_cpu(*(p++));
+ return ret;
+}
+
+u32 dt_n_address_cells(const struct dt_node *node)
+{
+ if (!node->parent)
+ return 0;
+ return dt_prop_get_u32_def(node->parent, "#address-cells", 2);
+}
+
+u32 dt_n_size_cells(const struct dt_node *node)
+{
+ if (!node->parent)
+ return 0;
+ return dt_prop_get_u32_def(node->parent, "#size-cells", 1);
+}
+
+u64 dt_get_address(const struct dt_node *node, unsigned int index,
+ u64 *out_size)
+{
+ const struct dt_property *p;
+ u32 na = dt_n_address_cells(node);
+ u32 ns = dt_n_size_cells(node);
+ u32 pos, n;
+
+ p = dt_require_property(node, "reg", -1);
+ n = (na + ns) * sizeof(u32);
+ pos = n * index;
+ assert((pos + n) <= p->len);
+ if (out_size)
+ *out_size = dt_get_number(p->prop + pos + na * sizeof(u32), ns);
+ return dt_get_number(p->prop + pos, na);
+}
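+
+/*
+ * Worked example: for a node whose parent has #address-cells = 2 and
+ * #size-cells = 2, a property
+ *
+ *	reg = <0x6 0x00100000 0x0 0x2000>;
+ *
+ * makes dt_get_address(node, 0, &size) return 0x600100000 with
+ * size set to 0x2000.
+ */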
+
+u32 __dt_get_chip_id(const struct dt_node *node)
+{
+ const struct dt_property *prop;
+
+ for (; node; node = node->parent) {
+ prop = dt_find_property(node, "ibm,chip-id");
+ if (prop)
+ return dt_property_get_cell(prop, 0);
+ }
+ return 0xffffffff;
+}
+
+u32 dt_get_chip_id(const struct dt_node *node)
+{
+ u32 id = __dt_get_chip_id(node);
+ assert(id != 0xffffffff);
+ return id;
+}
+
+struct dt_node *dt_find_compatible_node_on_chip(struct dt_node *root,
+ struct dt_node *prev,
+ const char *compat,
+ uint32_t chip_id)
+{
+ struct dt_node *node = prev;
+
+ while ((node = dt_next(root, node))) {
+ u32 cid = __dt_get_chip_id(node);
+ if (cid == chip_id &&
+ dt_node_is_compatible(node, compat))
+ return node;
+ }
+ return NULL;
+}
+
+unsigned int dt_count_addresses(const struct dt_node *node)
+{
+ const struct dt_property *p;
+ u32 na = dt_n_address_cells(node);
+ u32 ns = dt_n_size_cells(node);
+ u32 n;
+
+ p = dt_require_property(node, "reg", -1);
+ n = (na + ns) * sizeof(u32);
+
+ if (n == 0)
+ return 0;
+
+ return p->len / n;
+}
+
+/* Translates an address from the given bus into its parent's address space */
+static u64 dt_translate_one(const struct dt_node *bus, u64 addr)
+{
+ u32 ranges_count, na, ns, parent_na;
+ const struct dt_property *p;
+ const u32 *ranges;
+ int i, stride;
+
+ assert(bus->parent);
+
+ na = dt_prop_get_u32_def(bus, "#address-cells", 2);
+ ns = dt_prop_get_u32_def(bus, "#size-cells", 2);
+ parent_na = dt_n_address_cells(bus);
+
+ stride = na + ns + parent_na;
+
+ /*
+ * FIXME: We should handle arbitrary length addresses, rather than
+ * limiting it to 64bit. If someone wants/needs that they
+ * can implement the bignum math for it :)
+ */
+ assert(na <= 2);
+ assert(parent_na <= 2);
+
+	/* We should never translate an address without a "ranges" property */
+ p = dt_require_property(bus, "ranges", -1);
+
+ ranges = (u32 *) &p->prop;
+ ranges_count = (p->len / 4) / (na + parent_na + ns);
+
+ /* An empty ranges property implies 1-1 translation */
+ if (ranges_count == 0)
+ return addr;
+
+ for (i = 0; i < ranges_count; i++, ranges += stride) {
+ /* ranges format: <child base> <parent base> <size> */
+ u64 child_base = dt_get_number(ranges, na);
+ u64 parent_base = dt_get_number(ranges + na, parent_na);
+ u64 size = dt_get_number(ranges + na + parent_na, ns);
+
+ if (addr >= child_base && addr < child_base + size)
+ return (addr - child_base) + parent_base;
+ }
+
+	/* Input address was outside any of our mapped ranges */
+ return 0;
+}
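+
+/*
+ * Worked example: for a bus with #address-cells = 1 and #size-cells = 1
+ * under a parent with #address-cells = 1, a property
+ *
+ *	ranges = <0x0 0x80000000 0x10000>;
+ *
+ * translates child address 0x100 to parent address 0x80000100, while a
+ * child address at or above 0x10000 falls outside the range and yields 0.
+ */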
+
+u64 dt_translate_address(const struct dt_node *node, unsigned int index,
+ u64 *out_size)
+{
+ u64 addr = dt_get_address(node, index, NULL);
+ struct dt_node *bus = node->parent;
+
+ /* FIXME: One day we will probably want to use this, but for now just
+	 * force it to be zero since we only support returning a u64 or u32
+ */
+ assert(!out_size);
+
+ /* apply each translation until we hit the root bus */
+ while (bus->parent) {
+ addr = dt_translate_one(bus, addr);
+ bus = bus->parent;
+ }
+
+ return addr;
+}
+
+bool dt_node_is_enabled(struct dt_node *node)
+{
+ const struct dt_property *p = dt_find_property(node, "status");
+
+ if (!p)
+ return true;
+
+ return p->len > 1 && p->prop[0] == 'o' && p->prop[1] == 'k';
+}
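+
+/*
+ * Example: a node with status = "okay" (or "ok") is enabled, a node with
+ * status = "disabled" is not, and a node with no status property at all
+ * is treated as enabled.
+ */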
+
+/*
+ * Function to fix up the phandles in the subtree.
+ */
+void dt_adjust_subtree_phandle(struct dt_node *dev,
+ const char** (get_properties_to_fix)(struct dt_node *n))
+{
+ struct dt_node *node;
+ struct dt_property *prop;
+ u32 phandle, max_phandle = 0, import_phandle = new_phandle();
+ __be32 p;
+ const char **name;
+
+ dt_for_each_node(dev, node) {
+ const char **props_to_update;
+ node->phandle += import_phandle;
+
+ /*
+ * calculate max_phandle(new_tree), needed to update
+ * last_phandle.
+ */
+ if (node->phandle >= max_phandle)
+ max_phandle = node->phandle;
+
+ props_to_update = get_properties_to_fix(node);
+ if (!props_to_update)
+ continue;
+ for (name = props_to_update; *name != NULL; name++) {
+ prop = __dt_find_property(node, *name);
+ if (!prop)
+ continue;
+ phandle = dt_prop_get_u32(node, *name);
+ phandle += import_phandle;
+ p = cpu_to_be32(phandle);
+ memcpy((char *)&prop->prop, &p, prop->len);
+ }
+ }
+
+ set_last_phandle(max_phandle);
+}
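+
+/*
+ * A minimal callback sketch (the property name below is purely
+ * hypothetical): the callback returns a NULL-terminated list of property
+ * names in the given node whose values are phandles to be rewritten, or
+ * NULL if the node has nothing to fix up.
+ *
+ *	static const char *fixup_props[] = { "ibm,some-phandle-ref", NULL };
+ *
+ *	static const char **props_to_fix(struct dt_node *n)
+ *	{
+ *		return fixup_props;
+ *	}
+ */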
diff --git a/roms/skiboot/core/direct-controls.c b/roms/skiboot/core/direct-controls.c
new file mode 100644
index 000000000..37bcf9826
--- /dev/null
+++ b/roms/skiboot/core/direct-controls.c
@@ -0,0 +1,1161 @@
+// SPDX-License-Identifier: Apache-2.0
+/*
+ * Directly control CPU cores/threads. SRESET, special wakeup, etc
+ *
+ * Copyright 2017-2019 IBM Corp.
+ */
+
+#include <direct-controls.h>
+#include <skiboot.h>
+#include <opal.h>
+#include <cpu.h>
+#include <xscom.h>
+#include <xscom-p8-regs.h>
+#include <xscom-p9-regs.h>
+#include <xscom-p10-regs.h>
+#include <timebase.h>
+#include <chip.h>
+
+
+/**************** mambo direct controls ****************/
+
+extern unsigned long callthru_tcl(const char *str, int len);
+
+static void mambo_sreset_cpu(struct cpu_thread *cpu)
+{
+ uint32_t chip_id = pir_to_chip_id(cpu->pir);
+ uint32_t core_id = pir_to_core_id(cpu->pir);
+ uint32_t thread_id = pir_to_thread_id(cpu->pir);
+ char tcl_cmd[50];
+
+ snprintf(tcl_cmd, sizeof(tcl_cmd),
+ "mysim cpu %i:%i:%i start_thread 0x100",
+ chip_id, core_id, thread_id);
+ callthru_tcl(tcl_cmd, strlen(tcl_cmd));
+}
+
+static void mambo_stop_cpu(struct cpu_thread *cpu)
+{
+ uint32_t chip_id = pir_to_chip_id(cpu->pir);
+ uint32_t core_id = pir_to_core_id(cpu->pir);
+ uint32_t thread_id = pir_to_thread_id(cpu->pir);
+ char tcl_cmd[50];
+
+ snprintf(tcl_cmd, sizeof(tcl_cmd),
+ "mysim cpu %i:%i:%i stop_thread",
+ chip_id, core_id, thread_id);
+ callthru_tcl(tcl_cmd, strlen(tcl_cmd));
+}
+
+/**************** POWER8 direct controls ****************/
+
+static int p8_core_set_special_wakeup(struct cpu_thread *cpu)
+{
+ uint64_t val, poll_target, stamp;
+ uint32_t core_id;
+ int rc;
+
+ /*
+ * Note: HWP checks for checkstops, but I assume we don't need to
+ * as we wouldn't be running if one was present
+ */
+
+ /* Grab core ID once */
+ core_id = pir_to_core_id(cpu->pir);
+
+ prlog(PR_DEBUG, "RESET Waking up core 0x%x\n", core_id);
+
+ /*
+	 * The original HWP reads the XSCOM first but ignores the result
+	 * and error; let's do the same until I know for sure that it is
+	 * not necessary.
+ */
+ xscom_read(cpu->chip_id,
+ XSCOM_ADDR_P8_EX_SLAVE(core_id, EX_PM_SPECIAL_WAKEUP_PHYP),
+ &val);
+
+ /* Then we write special wakeup */
+ rc = xscom_write(cpu->chip_id,
+ XSCOM_ADDR_P8_EX_SLAVE(core_id,
+ EX_PM_SPECIAL_WAKEUP_PHYP),
+ PPC_BIT(0));
+ if (rc) {
+ prerror("RESET: XSCOM error %d asserting special"
+ " wakeup on 0x%x\n", rc, cpu->pir);
+ return rc;
+ }
+
+ /*
+ * HWP uses the history for Perf register here, dunno why it uses
+ * that one instead of the pHyp one, maybe to avoid clobbering it...
+ *
+	 * In any case, it does that to check for run/nap vs. sleep/winkle/other
+ * to decide whether to poll on checkstop or not. Since we don't deal
+ * with checkstop conditions here, we ignore that part.
+ */
+
+ /*
+	 * Now poll for completion of special wakeup. The HWP is nasty here:
+	 * it will poll at 5ms intervals for up to 200ms. This is not quite
+	 * acceptable for us at runtime, at least not until we have the
+	 * ability to "context switch" HBRT. In practice, because we don't
+	 * winkle, it will never take that long, so we shorten the polling
+	 * interval to 1us. However, we do have to keep the same timeout.
+ *
+ * We don't use time_wait_ms() either for now as we don't want to
+ * poll the FSP here.
+ */
+ stamp = mftb();
+ poll_target = stamp + msecs_to_tb(200);
+ val = 0;
+ while (!(val & EX_PM_GP0_SPECIAL_WAKEUP_DONE)) {
+ /* Wait 1 us */
+ time_wait_us(1);
+
+ /* Read PM state */
+ rc = xscom_read(cpu->chip_id,
+ XSCOM_ADDR_P8_EX_SLAVE(core_id, EX_PM_GP0),
+ &val);
+ if (rc) {
+ prerror("RESET: XSCOM error %d reading PM state on"
+ " 0x%x\n", rc, cpu->pir);
+ return rc;
+ }
+ /* Check timeout */
+ if (mftb() > poll_target)
+ break;
+ }
+
+ /* Success ? */
+ if (val & EX_PM_GP0_SPECIAL_WAKEUP_DONE) {
+ uint64_t now = mftb();
+ prlog(PR_TRACE, "RESET: Special wakeup complete after %ld us\n",
+ tb_to_usecs(now - stamp));
+ return 0;
+ }
+
+ /*
+ * We timed out ...
+ *
+ * HWP has a complex workaround for HW255321 which affects
+ * Murano DD1 and Venice DD1. Ignore that for now
+ *
+ * Instead we just dump some XSCOMs for error logging
+ */
+ prerror("RESET: Timeout on special wakeup of 0x%0x\n", cpu->pir);
+ prerror("RESET: PM0 = 0x%016llx\n", val);
+ val = -1;
+ xscom_read(cpu->chip_id,
+ XSCOM_ADDR_P8_EX_SLAVE(core_id, EX_PM_SPECIAL_WAKEUP_PHYP),
+ &val);
+ prerror("RESET: SPC_WKUP = 0x%016llx\n", val);
+ val = -1;
+ xscom_read(cpu->chip_id,
+ XSCOM_ADDR_P8_EX_SLAVE(core_id,
+ EX_PM_IDLE_STATE_HISTORY_PHYP),
+ &val);
+ prerror("RESET: HISTORY = 0x%016llx\n", val);
+
+ return OPAL_HARDWARE;
+}
+
+static int p8_core_clear_special_wakeup(struct cpu_thread *cpu)
+{
+ uint64_t val;
+ uint32_t core_id;
+ int rc;
+
+ /*
+ * Note: HWP checks for checkstops, but I assume we don't need to
+ * as we wouldn't be running if one was present
+ */
+
+ /* Grab core ID once */
+ core_id = pir_to_core_id(cpu->pir);
+
+ prlog(PR_DEBUG, "RESET: Releasing core 0x%x wakeup\n", core_id);
+
+ /*
+	 * The original HWP reads the XSCOM first but ignores the result
+	 * and error; let's do the same until I know for sure that it is
+	 * not necessary.
+ */
+ xscom_read(cpu->chip_id,
+ XSCOM_ADDR_P8_EX_SLAVE(core_id, EX_PM_SPECIAL_WAKEUP_PHYP),
+ &val);
+
+ /* Then we write special wakeup */
+ rc = xscom_write(cpu->chip_id,
+ XSCOM_ADDR_P8_EX_SLAVE(core_id,
+ EX_PM_SPECIAL_WAKEUP_PHYP), 0);
+ if (rc) {
+ prerror("RESET: XSCOM error %d deasserting"
+ " special wakeup on 0x%x\n", rc, cpu->pir);
+ return rc;
+ }
+
+ /*
+	 * The original HWP reads the XSCOM again with the comment
+ * "This puts an inherent delay in the propagation of the reset
+ * transition"
+ */
+ xscom_read(cpu->chip_id,
+ XSCOM_ADDR_P8_EX_SLAVE(core_id, EX_PM_SPECIAL_WAKEUP_PHYP),
+ &val);
+
+ return 0;
+}
+
+static int p8_stop_thread(struct cpu_thread *cpu)
+{
+ uint32_t core_id = pir_to_core_id(cpu->pir);
+ uint32_t chip_id = pir_to_chip_id(cpu->pir);
+ uint32_t thread_id = pir_to_thread_id(cpu->pir);
+ uint32_t xscom_addr;
+
+ xscom_addr = XSCOM_ADDR_P8_EX(core_id,
+ P8_EX_TCTL_DIRECT_CONTROLS(thread_id));
+
+ if (xscom_write(chip_id, xscom_addr, P8_DIRECT_CTL_STOP)) {
+ prlog(PR_ERR, "Could not stop thread %u:%u:%u:"
+ " Unable to write EX_TCTL_DIRECT_CONTROLS.\n",
+ chip_id, core_id, thread_id);
+ return OPAL_HARDWARE;
+ }
+
+ return OPAL_SUCCESS;
+}
+
+static int p8_sreset_thread(struct cpu_thread *cpu)
+{
+ uint32_t core_id = pir_to_core_id(cpu->pir);
+ uint32_t chip_id = pir_to_chip_id(cpu->pir);
+ uint32_t thread_id = pir_to_thread_id(cpu->pir);
+ uint32_t xscom_addr;
+
+ xscom_addr = XSCOM_ADDR_P8_EX(core_id,
+ P8_EX_TCTL_DIRECT_CONTROLS(thread_id));
+
+ if (xscom_write(chip_id, xscom_addr, P8_DIRECT_CTL_PRENAP)) {
+ prlog(PR_ERR, "Could not prenap thread %u:%u:%u:"
+ " Unable to write EX_TCTL_DIRECT_CONTROLS.\n",
+ chip_id, core_id, thread_id);
+ return OPAL_HARDWARE;
+ }
+ if (xscom_write(chip_id, xscom_addr, P8_DIRECT_CTL_SRESET)) {
+ prlog(PR_ERR, "Could not sreset thread %u:%u:%u:"
+ " Unable to write EX_TCTL_DIRECT_CONTROLS.\n",
+ chip_id, core_id, thread_id);
+ return OPAL_HARDWARE;
+ }
+
+ return OPAL_SUCCESS;
+}
+
+
+/**************** POWER9 direct controls ****************/
+
+/* Long running instructions may take time to complete. Timeout 100ms */
+#define P9_QUIESCE_POLL_INTERVAL 100
+#define P9_QUIESCE_TIMEOUT 100000
+
+/* Waking may take up to 5ms for deepest sleep states. Set timeout to 100ms */
+#define P9_SPWKUP_POLL_INTERVAL 100
+#define P9_SPWKUP_TIMEOUT 100000
+
+/*
+ * This implements direct control facilities of processor cores and threads
+ * using scom registers.
+ */
+
+static int p9_core_is_gated(struct cpu_thread *cpu)
+{
+ uint32_t chip_id = pir_to_chip_id(cpu->pir);
+ uint32_t core_id = pir_to_core_id(cpu->pir);
+ uint32_t sshhyp_addr;
+ uint64_t val;
+
+ sshhyp_addr = XSCOM_ADDR_P9_EC_SLAVE(core_id, P9_EC_PPM_SSHHYP);
+
+ if (xscom_read(chip_id, sshhyp_addr, &val)) {
+ prlog(PR_ERR, "Could not query core gated on %u:%u:"
+ " Unable to read PPM_SSHHYP.\n",
+ chip_id, core_id);
+ return OPAL_HARDWARE;
+ }
+
+ return !!(val & P9_CORE_GATED);
+}
+
+static int p9_core_set_special_wakeup(struct cpu_thread *cpu)
+{
+ uint32_t chip_id = pir_to_chip_id(cpu->pir);
+ uint32_t core_id = pir_to_core_id(cpu->pir);
+ uint32_t swake_addr;
+ uint32_t sshhyp_addr;
+ uint64_t val;
+ int i;
+
+ swake_addr = XSCOM_ADDR_P9_EC_SLAVE(core_id, EC_PPM_SPECIAL_WKUP_HYP);
+ sshhyp_addr = XSCOM_ADDR_P9_EC_SLAVE(core_id, P9_EC_PPM_SSHHYP);
+
+ if (xscom_write(chip_id, swake_addr, P9_SPWKUP_SET)) {
+ prlog(PR_ERR, "Could not set special wakeup on %u:%u:"
+ " Unable to write PPM_SPECIAL_WKUP_HYP.\n",
+ chip_id, core_id);
+ goto out_fail;
+ }
+
+ for (i = 0; i < P9_SPWKUP_TIMEOUT / P9_SPWKUP_POLL_INTERVAL; i++) {
+ if (xscom_read(chip_id, sshhyp_addr, &val)) {
+ prlog(PR_ERR, "Could not set special wakeup on %u:%u:"
+ " Unable to read PPM_SSHHYP.\n",
+ chip_id, core_id);
+ goto out_fail;
+ }
+ if (val & P9_SPECIAL_WKUP_DONE) {
+ /*
+ * CORE_GATED will be unset on a successful special
+ * wakeup of the core which indicates that the core is
+ * out of stop state. If CORE_GATED is still set then
+ * raise error.
+ */
+ if (p9_core_is_gated(cpu)) {
+ /* Deassert spwu for this strange error */
+ xscom_write(chip_id, swake_addr, 0);
+ prlog(PR_ERR, "Failed special wakeup on %u:%u"
+ " as CORE_GATED is set\n",
+ chip_id, core_id);
+ goto out_fail;
+ } else {
+ return 0;
+ }
+ }
+ time_wait_us(P9_SPWKUP_POLL_INTERVAL);
+ }
+
+ prlog(PR_ERR, "Could not set special wakeup on %u:%u:"
+ " timeout waiting for SPECIAL_WKUP_DONE.\n",
+ chip_id, core_id);
+
+out_fail:
+ /*
+ * As per the special wakeup protocol we should not de-assert
+ * the special wakeup on the core until WAKEUP_DONE is set.
+ * So even on error do not de-assert.
+ */
+ return OPAL_HARDWARE;
+}
+
+static int p9_core_clear_special_wakeup(struct cpu_thread *cpu)
+{
+ uint32_t chip_id = pir_to_chip_id(cpu->pir);
+ uint32_t core_id = pir_to_core_id(cpu->pir);
+ uint32_t swake_addr;
+
+ swake_addr = XSCOM_ADDR_P9_EC_SLAVE(core_id, EC_PPM_SPECIAL_WKUP_HYP);
+
+ /*
+ * De-assert special wakeup after a small delay.
+ * The delay may help avoid problems setting and clearing special
+ * wakeup back-to-back. This should be confirmed.
+ */
+ time_wait_us(1);
+ if (xscom_write(chip_id, swake_addr, 0)) {
+ prlog(PR_ERR, "Could not clear special wakeup on %u:%u:"
+ " Unable to write PPM_SPECIAL_WKUP_HYP.\n",
+ chip_id, core_id);
+ return OPAL_HARDWARE;
+ }
+
+ /*
+	 * Don't wait for the de-assert to complete, as other components
+	 * could have requested special wakeup. Wait for 10ms to
+	 * avoid back-to-back asserts.
+ */
+ time_wait_us(10000);
+ return 0;
+}
+
+static int p9_thread_quiesced(struct cpu_thread *cpu)
+{
+ uint32_t chip_id = pir_to_chip_id(cpu->pir);
+ uint32_t core_id = pir_to_core_id(cpu->pir);
+ uint32_t thread_id = pir_to_thread_id(cpu->pir);
+ uint32_t ras_addr;
+ uint64_t ras_status;
+
+ ras_addr = XSCOM_ADDR_P9_EC(core_id, P9_RAS_STATUS);
+ if (xscom_read(chip_id, ras_addr, &ras_status)) {
+ prlog(PR_ERR, "Could not check thread state on %u:%u:"
+ " Unable to read RAS_STATUS.\n",
+ chip_id, core_id);
+ return OPAL_HARDWARE;
+ }
+
+ /*
+ * This returns true when the thread is quiesced and all
+ * instructions completed. For sreset this may not be necessary,
+ * but we may want to use instruction ramming or stepping
+ * direct controls where it is important.
+ */
+ if ((ras_status & P9_THREAD_QUIESCED(thread_id))
+ == P9_THREAD_QUIESCED(thread_id))
+ return 1;
+
+ return 0;
+}
+
+static int p9_cont_thread(struct cpu_thread *cpu)
+{
+ uint32_t chip_id = pir_to_chip_id(cpu->pir);
+ uint32_t core_id = pir_to_core_id(cpu->pir);
+ uint32_t thread_id = pir_to_thread_id(cpu->pir);
+ uint32_t cts_addr;
+ uint32_t ti_addr;
+ uint32_t dctl_addr;
+ uint64_t core_thread_state;
+ uint64_t thread_info;
+ bool active, stop;
+ int rc;
+
+ rc = p9_thread_quiesced(cpu);
+ if (rc < 0)
+ return rc;
+ if (!rc) {
+ prlog(PR_ERR, "Could not cont thread %u:%u:%u:"
+ " Thread is not quiesced.\n",
+ chip_id, core_id, thread_id);
+ return OPAL_BUSY;
+ }
+
+ cts_addr = XSCOM_ADDR_P9_EC(core_id, P9_CORE_THREAD_STATE);
+ ti_addr = XSCOM_ADDR_P9_EC(core_id, P9_THREAD_INFO);
+ dctl_addr = XSCOM_ADDR_P9_EC(core_id, P9_EC_DIRECT_CONTROLS);
+
+ if (xscom_read(chip_id, cts_addr, &core_thread_state)) {
+ prlog(PR_ERR, "Could not resume thread %u:%u:%u:"
+ " Unable to read CORE_THREAD_STATE.\n",
+ chip_id, core_id, thread_id);
+ return OPAL_HARDWARE;
+ }
+ if (core_thread_state & PPC_BIT(56 + thread_id))
+ stop = true;
+ else
+ stop = false;
+
+ if (xscom_read(chip_id, ti_addr, &thread_info)) {
+ prlog(PR_ERR, "Could not resume thread %u:%u:%u:"
+ " Unable to read THREAD_INFO.\n",
+ chip_id, core_id, thread_id);
+ return OPAL_HARDWARE;
+ }
+ if (thread_info & PPC_BIT(thread_id))
+ active = true;
+ else
+ active = false;
+
+ if (!active || stop) {
+ if (xscom_write(chip_id, dctl_addr, P9_THREAD_CLEAR_MAINT(thread_id))) {
+ prlog(PR_ERR, "Could not resume thread %u:%u:%u:"
+ " Unable to write EC_DIRECT_CONTROLS.\n",
+ chip_id, core_id, thread_id);
+ }
+ } else {
+ if (xscom_write(chip_id, dctl_addr, P9_THREAD_CONT(thread_id))) {
+ prlog(PR_ERR, "Could not resume thread %u:%u:%u:"
+ " Unable to write EC_DIRECT_CONTROLS.\n",
+ chip_id, core_id, thread_id);
+ }
+ }
+
+ return 0;
+}
+
+static int p9_stop_thread(struct cpu_thread *cpu)
+{
+ uint32_t chip_id = pir_to_chip_id(cpu->pir);
+ uint32_t core_id = pir_to_core_id(cpu->pir);
+ uint32_t thread_id = pir_to_thread_id(cpu->pir);
+ uint32_t dctl_addr;
+ int rc;
+ int i;
+
+ dctl_addr = XSCOM_ADDR_P9_EC(core_id, P9_EC_DIRECT_CONTROLS);
+
+ rc = p9_thread_quiesced(cpu);
+ if (rc < 0)
+ return rc;
+ if (rc) {
+ prlog(PR_ERR, "Could not stop thread %u:%u:%u:"
+ " Thread is quiesced already.\n",
+ chip_id, core_id, thread_id);
+ return OPAL_BUSY;
+ }
+
+ if (xscom_write(chip_id, dctl_addr, P9_THREAD_STOP(thread_id))) {
+ prlog(PR_ERR, "Could not stop thread %u:%u:%u:"
+ " Unable to write EC_DIRECT_CONTROLS.\n",
+ chip_id, core_id, thread_id);
+ return OPAL_HARDWARE;
+ }
+
+ for (i = 0; i < P9_QUIESCE_TIMEOUT / P9_QUIESCE_POLL_INTERVAL; i++) {
+ int rc = p9_thread_quiesced(cpu);
+ if (rc < 0)
+ break;
+ if (rc)
+ return 0;
+
+ time_wait_us(P9_QUIESCE_POLL_INTERVAL);
+ }
+
+ prlog(PR_ERR, "Could not stop thread %u:%u:%u:"
+ " Unable to quiesce thread.\n",
+ chip_id, core_id, thread_id);
+
+ return OPAL_HARDWARE;
+}
+
+static int p9_sreset_thread(struct cpu_thread *cpu)
+{
+ uint32_t chip_id = pir_to_chip_id(cpu->pir);
+ uint32_t core_id = pir_to_core_id(cpu->pir);
+ uint32_t thread_id = pir_to_thread_id(cpu->pir);
+ uint32_t dctl_addr;
+
+ dctl_addr = XSCOM_ADDR_P9_EC(core_id, P9_EC_DIRECT_CONTROLS);
+
+ if (xscom_write(chip_id, dctl_addr, P9_THREAD_SRESET(thread_id))) {
+ prlog(PR_ERR, "Could not sreset thread %u:%u:%u:"
+ " Unable to write EC_DIRECT_CONTROLS.\n",
+ chip_id, core_id, thread_id);
+ return OPAL_HARDWARE;
+ }
+
+ return 0;
+}
+
+/**************** POWER10 direct controls ****************/
+
+/* Long running instructions may take time to complete. Timeout 100ms */
+#define P10_QUIESCE_POLL_INTERVAL 100
+#define P10_QUIESCE_TIMEOUT 100000
+
+/* Waking may take up to 5ms for deepest sleep states. Set timeout to 100ms */
+#define P10_SPWU_POLL_INTERVAL 100
+#define P10_SPWU_TIMEOUT 100000
+
+/*
+ * This implements direct control facilities of processor cores and threads
+ * using scom registers.
+ */
+static int p10_core_is_gated(struct cpu_thread *cpu)
+{
+ uint32_t chip_id = pir_to_chip_id(cpu->pir);
+ uint32_t core_id = pir_to_core_id(cpu->pir);
+ uint32_t ssh_addr;
+ uint64_t val;
+
+ ssh_addr = XSCOM_ADDR_P10_QME_CORE(core_id, P10_QME_SSH_HYP);
+
+ if (xscom_read(chip_id, ssh_addr, &val)) {
+ prlog(PR_ERR, "Could not query core gated on %u:%u:"
+ " Unable to read QME_SSH_HYP.\n",
+ chip_id, core_id);
+ return OPAL_HARDWARE;
+ }
+
+ return !!(val & P10_SSH_CORE_GATED);
+}
+
+
+static int p10_core_set_special_wakeup(struct cpu_thread *cpu)
+{
+ uint32_t chip_id = pir_to_chip_id(cpu->pir);
+ uint32_t core_id = pir_to_core_id(cpu->pir);
+ uint32_t spwu_addr, ssh_addr;
+ uint64_t val;
+ int i;
+
+ /* P10 could use SPWU_HYP done bit instead of SSH? */
+ spwu_addr = XSCOM_ADDR_P10_QME_CORE(core_id, P10_QME_SPWU_HYP);
+ ssh_addr = XSCOM_ADDR_P10_QME_CORE(core_id, P10_QME_SSH_HYP);
+
+ if (xscom_write(chip_id, spwu_addr, P10_SPWU_REQ)) {
+ prlog(PR_ERR, "Could not set special wakeup on %u:%u:"
+ " Unable to write QME_SPWU_HYP.\n",
+ chip_id, core_id);
+ return OPAL_HARDWARE;
+ }
+
+ for (i = 0; i < P10_SPWU_TIMEOUT / P10_SPWU_POLL_INTERVAL; i++) {
+ if (xscom_read(chip_id, ssh_addr, &val)) {
+ prlog(PR_ERR, "Could not set special wakeup on %u:%u:"
+ " Unable to read QME_SSH_HYP.\n",
+ chip_id, core_id);
+ return OPAL_HARDWARE;
+ }
+ if (val & P10_SSH_SPWU_DONE) {
+ /*
+ * CORE_GATED will be unset on a successful special
+ * wakeup of the core which indicates that the core is
+ * out of stop state. If CORE_GATED is still set then
+ * check SPWU register and raise error only if SPWU_DONE
+ * is not set, else print a warning and consider SPWU
+ * operation as successful.
+			 * This is in conjunction with a microcode bug, which
+			 * calls out the fact that SPWU can succeed in the case
+			 * where the core is gated but the SPWU_HYP bit is set.
+ */
+ if (p10_core_is_gated(cpu)) {
+				if (xscom_read(chip_id, spwu_addr, &val)) {
+ prlog(PR_ERR, "Core %u:%u:"
+ " unable to read QME_SPWU_HYP\n",
+ chip_id, core_id);
+ return OPAL_HARDWARE;
+ }
+ if (val & P10_SPWU_DONE) {
+ /*
+ * If SPWU DONE bit is set then
+ * SPWU operation is complete
+ */
+ prlog(PR_DEBUG, "Special wakeup on "
+ "%u:%u: core remains gated while"
+ " SPWU_HYP DONE set\n",
+ chip_id, core_id);
+ return 0;
+ }
+ /* Deassert spwu for this strange error */
+ xscom_write(chip_id, spwu_addr, 0);
+ prlog(PR_ERR,
+ "Failed special wakeup on %u:%u"
+ " core remains gated.\n",
+ chip_id, core_id);
+ return OPAL_HARDWARE;
+ } else {
+ return 0;
+ }
+ }
+ time_wait_us(P10_SPWU_POLL_INTERVAL);
+ }
+
+ prlog(PR_ERR, "Could not set special wakeup on %u:%u:"
+ " operation timeout.\n",
+ chip_id, core_id);
+ /*
+ * As per the special wakeup protocol we should not de-assert
+ * the special wakeup on the core until WAKEUP_DONE is set.
+ * So even on error do not de-assert.
+ */
+
+ return OPAL_HARDWARE;
+}
+
+static int p10_core_clear_special_wakeup(struct cpu_thread *cpu)
+{
+ uint32_t chip_id = pir_to_chip_id(cpu->pir);
+ uint32_t core_id = pir_to_core_id(cpu->pir);
+ uint32_t spwu_addr;
+
+ spwu_addr = XSCOM_ADDR_P10_QME_CORE(core_id, P10_QME_SPWU_HYP);
+
+	/* If SPWU problems show up, add a small delay here: time_wait_us(1); */
+ if (xscom_write(chip_id, spwu_addr, 0)) {
+ prlog(PR_ERR, "Could not clear special wakeup on %u:%u:"
+ " Unable to write QME_SPWU_HYP.\n",
+ chip_id, core_id);
+ return OPAL_HARDWARE;
+ }
+
+ return 0;
+}
+
+static int p10_thread_quiesced(struct cpu_thread *cpu)
+{
+ uint32_t chip_id = pir_to_chip_id(cpu->pir);
+ uint32_t core_id = pir_to_core_id(cpu->pir);
+ uint32_t thread_id = pir_to_thread_id(cpu->pir);
+ uint32_t ras_addr;
+ uint64_t ras_status;
+
+ ras_addr = XSCOM_ADDR_P10_EC(core_id, P10_EC_RAS_STATUS);
+ if (xscom_read(chip_id, ras_addr, &ras_status)) {
+ prlog(PR_ERR, "Could not check thread state on %u:%u:"
+ " Unable to read EC_RAS_STATUS.\n",
+ chip_id, core_id);
+ return OPAL_HARDWARE;
+ }
+
+ /*
+ * p10_thread_stop for the purpose of sreset wants QUIESCED
+ * and MAINT bits set. Step, RAM, etc. need more, but we don't
+ * use those in skiboot.
+ *
+	 * P10 could try waiting for more here in case of errors.
+ */
+ if (!(ras_status & P10_THREAD_QUIESCED(thread_id)))
+ return 0;
+
+ if (!(ras_status & P10_THREAD_MAINT(thread_id)))
+ return 0;
+
+ return 1;
+}
+
+static int p10_cont_thread(struct cpu_thread *cpu)
+{
+ uint32_t chip_id = pir_to_chip_id(cpu->pir);
+ uint32_t core_id = pir_to_core_id(cpu->pir);
+ uint32_t thread_id = pir_to_thread_id(cpu->pir);
+ uint32_t cts_addr;
+ uint32_t ti_addr;
+ uint32_t dctl_addr;
+ uint64_t core_thread_state;
+ uint64_t thread_info;
+ bool active, stop;
+ int rc;
+ int i;
+
+ rc = p10_thread_quiesced(cpu);
+ if (rc < 0)
+ return rc;
+ if (!rc) {
+ prlog(PR_ERR, "Could not cont thread %u:%u:%u:"
+ " Thread is not quiesced.\n",
+ chip_id, core_id, thread_id);
+ return OPAL_BUSY;
+ }
+
+ cts_addr = XSCOM_ADDR_P10_EC(core_id, P10_EC_CORE_THREAD_STATE);
+ ti_addr = XSCOM_ADDR_P10_EC(core_id, P10_EC_THREAD_INFO);
+ dctl_addr = XSCOM_ADDR_P10_EC(core_id, P10_EC_DIRECT_CONTROLS);
+
+ if (xscom_read(chip_id, cts_addr, &core_thread_state)) {
+ prlog(PR_ERR, "Could not resume thread %u:%u:%u:"
+ " Unable to read EC_CORE_THREAD_STATE.\n",
+ chip_id, core_id, thread_id);
+ return OPAL_HARDWARE;
+ }
+ if (core_thread_state & P10_THREAD_STOPPED(thread_id))
+ stop = true;
+ else
+ stop = false;
+
+ if (xscom_read(chip_id, ti_addr, &thread_info)) {
+ prlog(PR_ERR, "Could not resume thread %u:%u:%u:"
+ " Unable to read EC_THREAD_INFO.\n",
+ chip_id, core_id, thread_id);
+ return OPAL_HARDWARE;
+ }
+ if (thread_info & P10_THREAD_ACTIVE(thread_id))
+ active = true;
+ else
+ active = false;
+
+ if (!active || stop) {
+ if (xscom_write(chip_id, dctl_addr, P10_THREAD_CLEAR_MAINT(thread_id))) {
+ prlog(PR_ERR, "Could not resume thread %u:%u:%u:"
+ " Unable to write EC_DIRECT_CONTROLS.\n",
+ chip_id, core_id, thread_id);
+ }
+ } else {
+ if (xscom_write(chip_id, dctl_addr, P10_THREAD_START(thread_id))) {
+ prlog(PR_ERR, "Could not resume thread %u:%u:%u:"
+ " Unable to write EC_DIRECT_CONTROLS.\n",
+ chip_id, core_id, thread_id);
+ }
+ }
+
+ for (i = 0; i < P10_QUIESCE_TIMEOUT / P10_QUIESCE_POLL_INTERVAL; i++) {
+ int rc = p10_thread_quiesced(cpu);
+ if (rc < 0)
+ break;
+ if (!rc)
+ return 0;
+
+ time_wait_us(P10_QUIESCE_POLL_INTERVAL);
+ }
+
+ prlog(PR_ERR, "Could not start thread %u:%u:%u:"
+ " Unable to start thread.\n",
+ chip_id, core_id, thread_id);
+
+ return OPAL_HARDWARE;
+}
+
+static int p10_stop_thread(struct cpu_thread *cpu)
+{
+ uint32_t chip_id = pir_to_chip_id(cpu->pir);
+ uint32_t core_id = pir_to_core_id(cpu->pir);
+ uint32_t thread_id = pir_to_thread_id(cpu->pir);
+ uint32_t dctl_addr;
+ int rc;
+ int i;
+
+ dctl_addr = XSCOM_ADDR_P10_EC(core_id, P10_EC_DIRECT_CONTROLS);
+
+ rc = p10_thread_quiesced(cpu);
+ if (rc < 0)
+ return rc;
+ if (rc) {
+ prlog(PR_ERR, "Could not stop thread %u:%u:%u:"
+ " Thread is quiesced already.\n",
+ chip_id, core_id, thread_id);
+ return OPAL_BUSY;
+ }
+
+ if (xscom_write(chip_id, dctl_addr, P10_THREAD_STOP(thread_id))) {
+ prlog(PR_ERR, "Could not stop thread %u:%u:%u:"
+ " Unable to write EC_DIRECT_CONTROLS.\n",
+ chip_id, core_id, thread_id);
+ return OPAL_HARDWARE;
+ }
+
+ for (i = 0; i < P10_QUIESCE_TIMEOUT / P10_QUIESCE_POLL_INTERVAL; i++) {
+ int rc = p10_thread_quiesced(cpu);
+ if (rc < 0)
+ break;
+ if (rc)
+ return 0;
+
+ time_wait_us(P10_QUIESCE_POLL_INTERVAL);
+ }
+
+ prlog(PR_ERR, "Could not stop thread %u:%u:%u:"
+ " Unable to quiesce thread.\n",
+ chip_id, core_id, thread_id);
+
+ return OPAL_HARDWARE;
+}
+
+static int p10_sreset_thread(struct cpu_thread *cpu)
+{
+ uint32_t chip_id = pir_to_chip_id(cpu->pir);
+ uint32_t core_id = pir_to_core_id(cpu->pir);
+ uint32_t thread_id = pir_to_thread_id(cpu->pir);
+ uint32_t dctl_addr;
+
+ dctl_addr = XSCOM_ADDR_P10_EC(core_id, P10_EC_DIRECT_CONTROLS);
+
+ if (xscom_write(chip_id, dctl_addr, P10_THREAD_SRESET(thread_id))) {
+ prlog(PR_ERR, "Could not sreset thread %u:%u:%u:"
+ " Unable to write EC_DIRECT_CONTROLS.\n",
+ chip_id, core_id, thread_id);
+ return OPAL_HARDWARE;
+ }
+
+ return 0;
+}
+
+/**************** generic direct controls ****************/
+
+int dctl_set_special_wakeup(struct cpu_thread *t)
+{
+ struct cpu_thread *c = t->ec_primary;
+ int rc = OPAL_SUCCESS;
+
+ if (proc_gen == proc_gen_unknown)
+ return OPAL_UNSUPPORTED;
+
+ lock(&c->dctl_lock);
+ if (c->special_wakeup_count == 0) {
+ if (proc_gen == proc_gen_p10)
+ rc = p10_core_set_special_wakeup(c);
+ else if (proc_gen == proc_gen_p9)
+ rc = p9_core_set_special_wakeup(c);
+ else /* (proc_gen == proc_gen_p8) */
+ rc = p8_core_set_special_wakeup(c);
+ }
+ if (!rc)
+ c->special_wakeup_count++;
+ unlock(&c->dctl_lock);
+
+ return rc;
+}
+
+int dctl_clear_special_wakeup(struct cpu_thread *t)
+{
+ struct cpu_thread *c = t->ec_primary;
+ int rc = OPAL_SUCCESS;
+
+ if (proc_gen == proc_gen_unknown)
+ return OPAL_UNSUPPORTED;
+
+ lock(&c->dctl_lock);
+ if (!c->special_wakeup_count)
+ goto out;
+ if (c->special_wakeup_count == 1) {
+ if (proc_gen == proc_gen_p10)
+ rc = p10_core_clear_special_wakeup(c);
+ else if (proc_gen == proc_gen_p9)
+ rc = p9_core_clear_special_wakeup(c);
+ else /* (proc_gen == proc_gen_p8) */
+ rc = p8_core_clear_special_wakeup(c);
+ }
+ if (!rc)
+ c->special_wakeup_count--;
+out:
+ unlock(&c->dctl_lock);
+
+ return rc;
+}
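+
+/*
+ * A minimal usage sketch: set/clear calls nest per core, so only the
+ * first dctl_set_special_wakeup() touches the hardware and only the
+ * matching last dctl_clear_special_wakeup() releases it. Callers must
+ * keep the two balanced, e.g.
+ *
+ *	if (dctl_set_special_wakeup(t) == OPAL_SUCCESS) {
+ *		(do the XSCOM accesses that need the core awake)
+ *		dctl_clear_special_wakeup(t);
+ *	}
+ */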
+
+int dctl_core_is_gated(struct cpu_thread *t)
+{
+ struct cpu_thread *c = t->primary;
+
+ if (proc_gen == proc_gen_p10)
+ return p10_core_is_gated(c);
+ else if (proc_gen == proc_gen_p9)
+ return p9_core_is_gated(c);
+ else
+ return OPAL_UNSUPPORTED;
+}
+
+static int dctl_stop(struct cpu_thread *t)
+{
+ struct cpu_thread *c = t->ec_primary;
+ int rc;
+
+ lock(&c->dctl_lock);
+ if (t->dctl_stopped) {
+ unlock(&c->dctl_lock);
+ return OPAL_BUSY;
+ }
+ if (proc_gen == proc_gen_p10)
+ rc = p10_stop_thread(t);
+ else if (proc_gen == proc_gen_p9)
+ rc = p9_stop_thread(t);
+ else /* (proc_gen == proc_gen_p8) */
+ rc = p8_stop_thread(t);
+ if (!rc)
+ t->dctl_stopped = true;
+ unlock(&c->dctl_lock);
+
+ return rc;
+}
+
+static int dctl_cont(struct cpu_thread *t)
+{
+ struct cpu_thread *c = t->primary;
+ int rc;
+
+ if (proc_gen != proc_gen_p10 && proc_gen != proc_gen_p9)
+ return OPAL_UNSUPPORTED;
+
+ lock(&c->dctl_lock);
+ if (!t->dctl_stopped) {
+ unlock(&c->dctl_lock);
+ return OPAL_BUSY;
+ }
+ if (proc_gen == proc_gen_p10)
+ rc = p10_cont_thread(t);
+ else /* (proc_gen == proc_gen_p9) */
+ rc = p9_cont_thread(t);
+ if (!rc)
+ t->dctl_stopped = false;
+ unlock(&c->dctl_lock);
+
+ return rc;
+}
+
+/*
+ * NOTE:
+ * The POWER8 sreset does not provide SRR registers, so it can be used
+ * for fast reboot, but not OPAL_SIGNAL_SYSTEM_RESET or anywhere that is
+ * expected to return. For now, callers beware.
+ */
+static int dctl_sreset(struct cpu_thread *t)
+{
+ struct cpu_thread *c = t->ec_primary;
+ int rc;
+
+ lock(&c->dctl_lock);
+ if (!t->dctl_stopped) {
+ unlock(&c->dctl_lock);
+ return OPAL_BUSY;
+ }
+ if (proc_gen == proc_gen_p10)
+ rc = p10_sreset_thread(t);
+ else if (proc_gen == proc_gen_p9)
+ rc = p9_sreset_thread(t);
+ else /* (proc_gen == proc_gen_p8) */
+ rc = p8_sreset_thread(t);
+ if (!rc)
+ t->dctl_stopped = false;
+ unlock(&c->dctl_lock);
+
+ return rc;
+}
+
+
+/**************** fast reboot API ****************/
+
+int sreset_all_prepare(void)
+{
+ struct cpu_thread *cpu;
+
+ if (proc_gen == proc_gen_unknown)
+ return OPAL_UNSUPPORTED;
+
+ prlog(PR_DEBUG, "RESET: Resetting from cpu: 0x%x (core 0x%x)\n",
+ this_cpu()->pir, pir_to_core_id(this_cpu()->pir));
+
+ if (chip_quirk(QUIRK_MAMBO_CALLOUTS)) {
+ for_each_ungarded_cpu(cpu) {
+ if (cpu == this_cpu())
+ continue;
+ mambo_stop_cpu(cpu);
+ }
+ return OPAL_SUCCESS;
+ }
+
+	/* Assert special wakeup on all cores. Only on operational cores. */
+ for_each_ungarded_primary(cpu) {
+ if (dctl_set_special_wakeup(cpu) != OPAL_SUCCESS)
+ return OPAL_HARDWARE;
+ }
+
+ prlog(PR_DEBUG, "RESET: Stopping the world...\n");
+
+ /* Put everybody in stop except myself */
+ for_each_ungarded_cpu(cpu) {
+ if (cpu == this_cpu())
+ continue;
+ if (dctl_stop(cpu) != OPAL_SUCCESS)
+ return OPAL_HARDWARE;
+
+ }
+
+ return OPAL_SUCCESS;
+}
+
+void sreset_all_finish(void)
+{
+ struct cpu_thread *cpu;
+
+ if (chip_quirk(QUIRK_MAMBO_CALLOUTS))
+ return;
+
+ for_each_ungarded_primary(cpu)
+ dctl_clear_special_wakeup(cpu);
+}
+
+int sreset_all_others(void)
+{
+ struct cpu_thread *cpu;
+
+ prlog(PR_DEBUG, "RESET: Resetting all threads but self...\n");
+
+ /*
+ * mambo should actually implement stop as well, and implement
+ * the dctl_ helpers properly. Currently it's racy just sresetting.
+ */
+ if (chip_quirk(QUIRK_MAMBO_CALLOUTS)) {
+ for_each_ungarded_cpu(cpu) {
+ if (cpu == this_cpu())
+ continue;
+ mambo_sreset_cpu(cpu);
+ }
+ return OPAL_SUCCESS;
+ }
+
+ for_each_ungarded_cpu(cpu) {
+ if (cpu == this_cpu())
+ continue;
+ if (dctl_sreset(cpu) != OPAL_SUCCESS)
+ return OPAL_HARDWARE;
+ }
+
+ return OPAL_SUCCESS;
+}
+
+
+/**************** OPAL_SIGNAL_SYSTEM_RESET API ****************/
+
+/*
+ * This provides a way for the host to raise system reset exceptions
+ * on other threads using direct control scoms on POWER9 and POWER10.
+ *
+ * We assert special wakeup on the core first.
+ * Then stop target thread and wait for it to quiesce.
+ * Then sreset the target thread, which resumes execution on that thread.
+ * Then de-assert special wakeup on the core.
+ */
+static int64_t do_sreset_cpu(struct cpu_thread *cpu)
+{
+ int rc;
+
+ if (this_cpu() == cpu) {
+ prlog(PR_ERR, "SRESET: Unable to reset self\n");
+ return OPAL_PARAMETER;
+ }
+
+ rc = dctl_set_special_wakeup(cpu);
+ if (rc)
+ return rc;
+
+ rc = dctl_stop(cpu);
+ if (rc)
+ goto out_spwk;
+
+ rc = dctl_sreset(cpu);
+ if (rc)
+ goto out_cont;
+
+ dctl_clear_special_wakeup(cpu);
+
+ return 0;
+
+out_cont:
+ dctl_cont(cpu);
+out_spwk:
+ dctl_clear_special_wakeup(cpu);
+
+ return rc;
+}
+
+static struct lock sreset_lock = LOCK_UNLOCKED;
+
+int64_t opal_signal_system_reset(int cpu_nr)
+{
+ struct cpu_thread *cpu;
+ int64_t ret;
+
+ if (proc_gen != proc_gen_p9 && proc_gen != proc_gen_p10)
+ return OPAL_UNSUPPORTED;
+
+ /*
+ * Broadcasts unsupported. Not clear what threads should be
+ * signaled, so it's better for the OS to perform one-at-a-time
+ * for now.
+ */
+ if (cpu_nr < 0)
+ return OPAL_CONSTRAINED;
+
+ /* Reset a single CPU */
+ cpu = find_cpu_by_server(cpu_nr);
+ if (!cpu) {
+ prlog(PR_ERR, "SRESET: could not find cpu by server %d\n", cpu_nr);
+ return OPAL_PARAMETER;
+ }
+
+ lock(&sreset_lock);
+ ret = do_sreset_cpu(cpu);
+ unlock(&sreset_lock);
+
+ return ret;
+}
+
+void direct_controls_init(void)
+{
+ if (chip_quirk(QUIRK_MAMBO_CALLOUTS))
+ return;
+
+ if (proc_gen != proc_gen_p9 && proc_gen != proc_gen_p10)
+ return;
+
+ opal_register(OPAL_SIGNAL_SYSTEM_RESET, opal_signal_system_reset, 1);
+}
diff --git a/roms/skiboot/core/errorlog.c b/roms/skiboot/core/errorlog.c
new file mode 100644
index 000000000..f64ac3f23
--- /dev/null
+++ b/roms/skiboot/core/errorlog.c
@@ -0,0 +1,223 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/* This file contains the front end for OPAL error logging. It is used
+ * to construct a struct errorlog representing the event/error to be
+ * logged, which is then passed to the platform-specific backend to log
+ * the actual errors.
+ *
+ * Copyright 2013-2017 IBM Corp.
+ */
+
+#include <skiboot.h>
+#include <lock.h>
+#include <errorlog.h>
+#include <pool.h>
+
+/*
+ * Maximum number of buffers that are pre-allocated
+ * to hold elogs that are reported on Sapphire and
+ * PowerNV.
+ */
+#define ELOG_WRITE_MAX_RECORD 64
+/* Platform log id as per the spec */
+static uint32_t sapphire_elog_id = 0xB0000000;
+
+/* Reserved for future use */
+/* static uint32_t powernv_elog_id = 0xB1000000; */
+
+/* Pool to allocate elog messages from */
+static struct pool elog_pool;
+static struct lock elog_lock = LOCK_UNLOCKED;
+
+static bool elog_available = false;
+
+static struct errorlog *get_write_buffer(int opal_event_severity)
+{
+ struct errorlog *buf;
+
+ if (!elog_available)
+ return NULL;
+
+ lock(&elog_lock);
+ if (opal_event_severity == OPAL_ERROR_PANIC)
+ buf = pool_get(&elog_pool, POOL_HIGH);
+ else
+ buf = pool_get(&elog_pool, POOL_NORMAL);
+
+ unlock(&elog_lock);
+ return buf;
+}
+
+/* Reporting of error via struct errorlog */
+struct errorlog *opal_elog_create(struct opal_err_info *e_info, uint32_t tag)
+{
+ struct errorlog *buf;
+
+ buf = get_write_buffer(e_info->sev);
+ if (buf) {
+ buf->error_event_type = e_info->err_type;
+ buf->component_id = e_info->cmp_id;
+ buf->subsystem_id = e_info->subsystem;
+ buf->event_severity = e_info->sev;
+ buf->event_subtype = e_info->event_subtype;
+ buf->reason_code = e_info->reason_code;
+ buf->elog_origin = ORG_SAPPHIRE;
+
+ lock(&elog_lock);
+ buf->plid = ++sapphire_elog_id;
+ unlock(&elog_lock);
+
+ /* Initialise the first user dump section */
+ log_add_section(buf, tag);
+ }
+
+ return buf;
+}
+
+/* Add a new user data section to an existing error log */
+void log_add_section(struct errorlog *buf, uint32_t tag)
+{
+ size_t size = sizeof(struct elog_user_data_section) - 1;
+ struct elog_user_data_section *tmp;
+
+ if (!buf) {
+ prerror("ELOG: Cannot add user data section. "
+ "Buffer is invalid\n");
+ return;
+ }
+
+ if ((buf->user_section_size + size) > OPAL_LOG_MAX_DUMP) {
+ prerror("ELOG: Size of dump data overruns buffer\n");
+ return;
+ }
+
+ tmp = (struct elog_user_data_section *)(buf->user_data_dump +
+ buf->user_section_size);
+ /* Use DESC if no other tag provided */
+ tmp->tag = tag ? cpu_to_be32(tag) : cpu_to_be32(OPAL_ELOG_SEC_DESC);
+ tmp->size = cpu_to_be16(size);
+
+ buf->user_section_size += size;
+ buf->user_section_count++;
+}
+
+void opal_elog_complete(struct errorlog *buf, bool success)
+{
+ if (!success)
+ printf("Unable to log error\n");
+
+ lock(&elog_lock);
+ pool_free_object(&elog_pool, buf);
+ unlock(&elog_lock);
+}
+
+void log_commit(struct errorlog *elog)
+{
+ int rc;
+
+ if (!elog)
+ return;
+
+ if (platform.elog_commit) {
+ rc = platform.elog_commit(elog);
+ if (rc)
+ prerror("ELOG: Platform commit error %d\n", rc);
+
+ return;
+ }
+
+ opal_elog_complete(elog, false);
+}
+
+void log_append_data(struct errorlog *buf, unsigned char *data, uint16_t size)
+{
+ struct elog_user_data_section *section;
+ uint8_t n_sections;
+ char *buffer;
+ uint16_t ssize;
+
+ if (!buf) {
+ prerror("ELOG: Cannot update user data. Buffer is invalid\n");
+ return;
+ }
+
+ if ((buf->user_section_size + size) > OPAL_LOG_MAX_DUMP) {
+ prerror("ELOG: Size of dump data overruns buffer\n");
+ return;
+ }
+
+ /* Step through user sections to find latest dump section */
+ buffer = buf->user_data_dump;
+ n_sections = buf->user_section_count;
+ if (!n_sections) {
+ prerror("ELOG: User section invalid\n");
+ return;
+ }
+
+ while (--n_sections) {
+ section = (struct elog_user_data_section *)buffer;
+ buffer += be16_to_cpu(section->size);
+ }
+
+ section = (struct elog_user_data_section *)buffer;
+ ssize = be16_to_cpu(section->size);
+ buffer += ssize;
+ memcpy(buffer, data, size);
+ section->size = cpu_to_be16(ssize + size);
+ buf->user_section_size += size;
+}
+
+void log_append_msg(struct errorlog *buf, const char *fmt, ...)
+{
+ char err_msg[250];
+ va_list list;
+
+ if (!buf) {
+ prerror("Tried to append log to NULL buffer\n");
+ return;
+ }
+
+ va_start(list, fmt);
+ vsnprintf(err_msg, sizeof(err_msg), fmt, list);
+ va_end(list);
+
+ /* Log the error on to Sapphire console */
+ prerror("%s", err_msg);
+
+ log_append_data(buf, err_msg, strlen(err_msg));
+}
+
+uint32_t log_simple_error(struct opal_err_info *e_info, const char *fmt, ...)
+{
+ struct errorlog *buf;
+ va_list list;
+ char err_msg[250];
+
+ va_start(list, fmt);
+ vsnprintf(err_msg, sizeof(err_msg), fmt, list);
+ va_end(list);
+
+ /* Log the error on to Sapphire console */
+ prerror("%s", err_msg);
+
+ buf = opal_elog_create(e_info, 0);
+ if (buf == NULL) {
+ prerror("ELOG: Error getting buffer to log error\n");
+ return -1;
+ }
+
+ log_append_data(buf, err_msg, strlen(err_msg));
+ log_commit(buf);
+
+ return buf->plid;
+}
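+
+/*
+ * A minimal usage sketch (the reason code and the e_info() helper below
+ * are assumptions about the errorlog header, not definitions made here):
+ *
+ *	log_simple_error(&e_info(OPAL_RC_SOMETHING), "XYZ: it broke\n");
+ *
+ * or, for richer data:
+ *
+ *	buf = opal_elog_create(&e_info(OPAL_RC_SOMETHING), 0);
+ *	log_append_data(buf, dump, dump_len);
+ *	log_commit(buf);
+ */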
+
+int elog_init(void)
+{
+ /* Pre-allocate memory for records */
+ if (pool_init(&elog_pool, sizeof(struct errorlog),
+ ELOG_WRITE_MAX_RECORD, 1))
+ return OPAL_RESOURCE;
+
+ elog_available = true;
+ return 0;
+}
diff --git a/roms/skiboot/core/exceptions.c b/roms/skiboot/core/exceptions.c
new file mode 100644
index 000000000..389548d16
--- /dev/null
+++ b/roms/skiboot/core/exceptions.c
@@ -0,0 +1,233 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Deal with exceptions when in OPAL.
+ *
+ * Copyright 2013-2014 IBM Corp.
+ */
+
+#include <skiboot.h>
+#include <stack.h>
+#include <opal.h>
+#include <processor.h>
+#include <cpu.h>
+#include <ras.h>
+
+#define REG "%016llx"
+#define REG32 "%08x"
+#define REGS_PER_LINE 4
+
+static void dump_regs(struct stack_frame *stack)
+{
+ unsigned int i;
+
+ prerror("CFAR : "REG" MSR : "REG"\n", stack->cfar, stack->msr);
+ prerror("SRR0 : "REG" SRR1 : "REG"\n", stack->srr0, stack->srr1);
+ prerror("HSRR0: "REG" HSRR1: "REG"\n", stack->hsrr0, stack->hsrr1);
+ prerror("DSISR: "REG32" DAR : "REG"\n", stack->dsisr, stack->dar);
+ prerror("LR : "REG" CTR : "REG"\n", stack->lr, stack->ctr);
+ prerror("CR : "REG32" XER : "REG32"\n", stack->cr, stack->xer);
+ for (i = 0; i < 16; i++)
+ prerror("GPR%02d: "REG" GPR%02d: "REG"\n",
+ i, stack->gpr[i], i + 16, stack->gpr[i + 16]);
+}
+
+#define EXCEPTION_MAX_STR 320
+
+static void handle_mce(struct stack_frame *stack, uint64_t nip, uint64_t msr, bool *fatal)
+{
+ uint64_t mce_flags, mce_addr;
+ const char *mce_err;
+ const char *mce_fix = NULL;
+ char buf[EXCEPTION_MAX_STR];
+ size_t l;
+
+ decode_mce(stack->srr0, stack->srr1, stack->dsisr, stack->dar,
+ &mce_flags, &mce_err, &mce_addr);
+
+ /* Try to recover. */
+ if (mce_flags & MCE_ERAT_ERROR) {
+ /* Real-mode still uses ERAT, flush transient bitflips */
+ flush_erat();
+ mce_fix = "ERAT flush";
+
+ } else {
+ *fatal = true;
+ }
+
+ prerror("***********************************************\n");
+ l = 0;
+ l += snprintf(buf + l, EXCEPTION_MAX_STR - l,
+ "%s MCE at "REG" ", *fatal ? "Fatal" : "Non-fatal", nip);
+ l += snprintf_symbol(buf + l, EXCEPTION_MAX_STR - l, nip);
+ l += snprintf(buf + l, EXCEPTION_MAX_STR - l, " MSR "REG, msr);
+ prerror("%s\n", buf);
+
+ l = 0;
+ l += snprintf(buf + l, EXCEPTION_MAX_STR - l,
+ "Cause: %s", mce_err);
+ prerror("%s\n", buf);
+ if (mce_flags & MCE_INVOLVED_EA) {
+ l = 0;
+ l += snprintf(buf + l, EXCEPTION_MAX_STR - l,
+ "Effective address: 0x%016llx", mce_addr);
+ prerror("%s\n", buf);
+ }
+
+ if (!*fatal) {
+ l = 0;
+ l += snprintf(buf + l, EXCEPTION_MAX_STR - l,
+ "Attempting recovery: %s", mce_fix);
+ prerror("%s\n", buf);
+ }
+}
+
+void exception_entry(struct stack_frame *stack)
+{
+ bool fatal = false;
+ bool hv;
+ uint64_t nip;
+ uint64_t msr;
+ char buf[EXCEPTION_MAX_STR];
+ size_t l;
+
+ switch (stack->type) {
+ case 0x500:
+ case 0x980:
+ case 0xe00:
+ case 0xe20:
+ case 0xe40:
+ case 0xe60:
+ case 0xe80:
+ case 0xea0:
+ case 0xf80:
+ hv = true;
+ break;
+ default:
+ hv = false;
+ break;
+ }
+
+ if (hv) {
+ nip = stack->hsrr0;
+ msr = stack->hsrr1;
+ } else {
+ nip = stack->srr0;
+ msr = stack->srr1;
+ }
+ stack->msr = msr;
+ stack->pc = nip;
+
+ if (!(msr & MSR_RI))
+ fatal = true;
+
+ l = 0;
+ switch (stack->type) {
+ case 0x100:
+ prerror("***********************************************\n");
+ if (fatal) {
+ l += snprintf(buf + l, EXCEPTION_MAX_STR - l,
+ "Fatal System Reset at "REG" ", nip);
+ } else {
+ l += snprintf(buf + l, EXCEPTION_MAX_STR - l,
+ "System Reset at "REG" ", nip);
+ }
+ break;
+
+ case 0x200:
+ handle_mce(stack, nip, msr, &fatal);
+ goto no_symbol;
+
+ case 0x700: {
+ struct trap_table_entry *tte;
+
+ fatal = true;
+ prerror("***********************************************\n");
+ for (tte = __trap_table_start; tte < __trap_table_end; tte++) {
+ if (tte->address == nip) {
+ prerror("< %s >\n", tte->message);
+ prerror(" .\n");
+ prerror(" .\n");
+ prerror(" .\n");
+ prerror(" OO__)\n");
+ prerror(" <\"__/\n");
+ prerror(" ^ ^\n");
+ break;
+ }
+ }
+ l += snprintf(buf + l, EXCEPTION_MAX_STR - l,
+ "Fatal TRAP at "REG" ", nip);
+ l += snprintf_symbol(buf + l, EXCEPTION_MAX_STR - l, nip);
+ l += snprintf(buf + l, EXCEPTION_MAX_STR - l, " MSR "REG, msr);
+ prerror("%s\n", buf);
+ dump_regs(stack);
+ backtrace_r1((uint64_t)stack);
+ if (platform.terminate)
+ platform.terminate(buf);
+ for (;;) ;
+ break; }
+
+ default:
+ fatal = true;
+ prerror("***********************************************\n");
+ l += snprintf(buf + l, EXCEPTION_MAX_STR - l,
+ "Fatal Exception 0x%llx at "REG" ", stack->type, nip);
+ break;
+ }
+ l += snprintf_symbol(buf + l, EXCEPTION_MAX_STR - l, nip);
+ l += snprintf(buf + l, EXCEPTION_MAX_STR - l, " MSR "REG, msr);
+ prerror("%s\n", buf);
+no_symbol:
+ dump_regs(stack);
+ backtrace_r1((uint64_t)stack);
+ if (fatal) {
+ if (platform.terminate)
+ platform.terminate(buf);
+ for (;;) ;
+ }
+
+ if (hv) {
+ /* Set up for SRR return */
+ stack->srr0 = nip;
+ stack->srr1 = msr;
+ }
+}
+
+void exception_entry_pm_sreset(void)
+{
+ char buf[EXCEPTION_MAX_STR];
+ size_t l;
+
+ prerror("***********************************************\n");
+ l = 0;
+ l += snprintf(buf + l, EXCEPTION_MAX_STR - l,
+ "System Reset in sleep");
+ prerror("%s\n", buf);
+ backtrace();
+}
+
+void __noreturn exception_entry_pm_mce(void)
+{
+ char buf[EXCEPTION_MAX_STR];
+ size_t l;
+
+ prerror("***********************************************\n");
+ l = 0;
+ l += snprintf(buf + l, EXCEPTION_MAX_STR - l,
+ "Fatal MCE in sleep");
+ prerror("%s\n", buf);
+ prerror("SRR0 : "REG" SRR1 : "REG"\n",
+ (uint64_t)mfspr(SPR_SRR0), (uint64_t)mfspr(SPR_SRR1));
+ prerror("DSISR: "REG32" DAR : "REG"\n",
+ (uint32_t)mfspr(SPR_DSISR), (uint64_t)mfspr(SPR_DAR));
+ abort();
+}
+
+static int64_t opal_register_exc_handler(uint64_t opal_exception __unused,
+ uint64_t handler_address __unused,
+ uint64_t glue_cache_line __unused)
+{
+ /* This interface is deprecated */
+ return OPAL_UNSUPPORTED;
+}
+opal_call(OPAL_REGISTER_OPAL_EXCEPTION_HANDLER, opal_register_exc_handler, 3);
+
diff --git a/roms/skiboot/core/fast-reboot.c b/roms/skiboot/core/fast-reboot.c
new file mode 100644
index 000000000..9f92525a9
--- /dev/null
+++ b/roms/skiboot/core/fast-reboot.c
@@ -0,0 +1,467 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Full IPL is slow, let's cheat!
+ *
+ * Copyright 2013-2019 IBM Corp.
+ */
+
+#include <skiboot.h>
+#include <cpu.h>
+#include <console.h>
+#include <fsp.h>
+#include <psi.h>
+#include <opal.h>
+#include <mem_region.h>
+#include <xscom.h>
+#include <interrupts.h>
+#include <cec.h>
+#include <timebase.h>
+#include <pci.h>
+#include <xive.h>
+#include <chip.h>
+#include <chiptod.h>
+#include <ipmi.h>
+#include <direct-controls.h>
+#include <nvram.h>
+
+/* Flag tested by the OPAL entry code */
+static volatile bool fast_boot_release;
+static volatile bool spr_set_release;
+static volatile bool nmi_mce_release;
+
+static void wait_on(volatile bool *cond)
+{
+ sync();
+ if (!*cond) {
+ smt_lowest();
+ while (!*cond)
+ barrier();
+ smt_medium();
+ }
+ sync();
+}
+
+static bool cpu_state_wait_all_others(enum cpu_thread_state state,
+ unsigned long timeout_tb)
+{
+ struct cpu_thread *cpu;
+ unsigned long end = mftb() + timeout_tb;
+
+ sync();
+ for_each_ungarded_cpu(cpu) {
+ if (cpu == this_cpu())
+ continue;
+
+ if (cpu->state != state) {
+ smt_lowest();
+ while (cpu->state != state) {
+ barrier();
+
+ if (timeout_tb && (tb_compare(mftb(), end) == TB_AAFTERB)) {
+ smt_medium();
+ return false;
+ }
+ }
+ smt_medium();
+ }
+ }
+ sync();
+
+ return true;
+}
+
+static const char *fast_reboot_disabled = NULL;
+
+void disable_fast_reboot(const char *reason)
+{
+ if (fast_reboot_disabled)
+ return;
+
+ prlog(PR_NOTICE, "RESET: Fast reboot disabled: %s\n", reason);
+ fast_reboot_disabled = reason;
+}
+
+void add_fast_reboot_dt_entries(void)
+{
+ dt_check_del_prop(opal_node, "fast-reboot");
+
+ if (fast_reboot_disabled) {
+ dt_add_property_string(opal_node, "fast-reboot", fast_reboot_disabled);
+ } else {
+ dt_add_property_string(opal_node, "fast-reboot", "okay");
+ }
+}
+
+/*
+ * This is called by the reboot CPU after all other CPUs have been
+ * quiesced and stopped, to perform various sanity checks on firmware
+ * data (and potentially hardware), to determine whether the fast
+ * reboot should go ahead.
+ */
+static bool fast_reboot_sanity_check(void)
+{
+ if (!mem_check_all()) {
+ disable_fast_reboot("Inconsistent firmware data");
+ return false;
+ }
+
+ if (!verify_romem()) {
+ disable_fast_reboot("Inconsistent firmware romem checksum");
+ return false;
+ }
+
+ return true;
+}
+
+void fast_reboot(void)
+{
+ static int fast_reboot_count = 0;
+
+ if (chip_quirk(QUIRK_NO_DIRECT_CTL)) {
+ prlog(PR_DEBUG,
+ "RESET: Fast reboot disabled by quirk\n");
+ return;
+ }
+
+ /*
+ * Ensure all other CPUs have left OPAL calls.
+ */
+ if (!opal_quiesce(QUIESCE_HOLD, -1)) {
+ disable_fast_reboot("OPAL quiesce timeout");
+ return;
+ }
+
+ if (fast_reboot_disabled &&
+ nvram_query_eq_dangerous("force-fast-reset", "1")) {
+ /* Do fast reboot even if it's been disabled */
+ prlog(PR_NOTICE, "RESET: Ignoring fast reboot disabled: %s\n",
+ fast_reboot_disabled);
+ } else if (fast_reboot_disabled) {
+ prlog(PR_NOTICE, "RESET: Fast reboot disabled: %s\n",
+ fast_reboot_disabled);
+ opal_quiesce(QUIESCE_RESUME, -1);
+ return;
+ }
+
+ prlog(PR_NOTICE, "RESET: Initiating fast reboot %d...\n", ++fast_reboot_count);
+ fast_boot_release = false;
+ spr_set_release = false;
+ nmi_mce_release = false;
+ sync();
+
+ /* Put everybody in stop except myself */
+ if (sreset_all_prepare()) {
+ prlog(PR_NOTICE, "RESET: Fast reboot failed to prepare "
+ "secondaries for system reset\n");
+ opal_quiesce(QUIESCE_RESUME, -1);
+ return;
+ }
+
+ if (!fast_reboot_sanity_check()) {
+ opal_quiesce(QUIESCE_RESUME, -1);
+ return;
+ }
+
+ cpu_set_sreset_enable(false);
+ cpu_set_ipi_enable(false);
+
+ /*
+ * The fast reboot sreset vector has FIXUP_ENDIAN, so secondaries can
+ * cope with a wrong HILE setting.
+ */
+ copy_sreset_vector_fast_reboot();
+
+ /*
+ * There is no point clearing special wakeup or un-quiesce due to
+ * failure after this point, because we will be going to full IPL.
+ * Less cleanup work means less opportunity to fail.
+ */
+
+ /* Send everyone else to 0x100 */
+ if (sreset_all_others() != OPAL_SUCCESS) {
+ prlog(PR_NOTICE, "RESET: Fast reboot failed to system reset "
+ "secondaries\n");
+ return;
+ }
+
+ /* Ensure all the sresets get through */
+ if (!cpu_state_wait_all_others(cpu_state_fast_reboot_entry, msecs_to_tb(1000))) {
+ prlog(PR_NOTICE, "RESET: Fast reboot timed out waiting for "
+ "secondaries to call in\n");
+ return;
+ }
+
+ prlog(PR_DEBUG, "RESET: Releasing special wakeups...\n");
+ sreset_all_finish();
+
+ /* This resets our quiesce state ready to enter the new kernel. */
+ opal_quiesce(QUIESCE_RESUME_FAST_REBOOT, -1);
+
+ console_complete_flush();
+
+ mtmsrd(0, 1); /* Clear MSR[RI] for 0x100 reset */
+ asm volatile("ba 0x100\n\t" : : : "memory");
+ for (;;)
+ ;
+}
+
+void __noreturn enter_nap(void);
+
+static void check_split_core(void)
+{
+ struct cpu_thread *cpu;
+ u64 mask, hid0;
+
+ hid0 = mfspr(SPR_HID0);
+ mask = SPR_HID0_POWER8_4LPARMODE | SPR_HID0_POWER8_2LPARMODE;
+
+ if ((hid0 & mask) == 0)
+ return;
+
+ prlog(PR_INFO, "RESET: CPU 0x%04x is split !\n", this_cpu()->pir);
+
+ /* If it's a secondary thread, just send it to nap */
+ if (this_cpu()->pir & 7) {
+ /* Prepare to be woken up */
+ icp_prep_for_pm();
+ /* Setup LPCR to wakeup on external interrupts only */
+ mtspr(SPR_LPCR, ((mfspr(SPR_LPCR) & ~SPR_LPCR_P8_PECE) |
+ SPR_LPCR_P8_PECE2));
+ isync();
+ /* Go to nap (doesn't return) */
+ enter_nap();
+ }
+
+ prlog(PR_INFO, "RESET: Primary, unsplitting... \n");
+
+ /* Trigger unsplit operation and update SLW image */
+ hid0 &= ~SPR_HID0_POWER8_DYNLPARDIS;
+ set_hid0(hid0);
+ opal_slw_set_reg(this_cpu()->pir, SPR_HID0, hid0);
+
+ /* Wait for unsplit */
+ while (mfspr(SPR_HID0) & mask)
+ cpu_relax();
+
+ /* Now the guys are sleeping, wake'em up. They will come back
+ * via reset and continue the fast reboot process normally.
+ * No need to wait.
+ */
+ prlog(PR_INFO, "RESET: Waking unsplit secondaries... \n");
+
+ for_each_cpu(cpu) {
+ if (!cpu_is_sibling(cpu, this_cpu()) || (cpu == this_cpu()))
+ continue;
+ icp_kick_cpu(cpu);
+ }
+}
+
+static void cleanup_cpu_state(void)
+{
+ struct cpu_thread *cpu = this_cpu();
+
+ if (proc_gen == proc_gen_p9)
+ xive_cpu_reset();
+ else if (proc_gen == proc_gen_p10)
+ xive2_cpu_reset();
+
+ /* Per core cleanup */
+ if (cpu_is_thread0(cpu) || cpu_is_core_chiplet_primary(cpu)) {
+ /* Shared SPRs whacked back to normal */
+
+ /* XXX Update the SLW copies ! Also dbl check HIDs etc... */
+ init_shared_sprs();
+
+ if (proc_gen == proc_gen_p8) {
+ /* If somebody was in fast_sleep, we may have a
+ * workaround to undo
+ */
+ if (cpu->in_fast_sleep) {
+ prlog(PR_DEBUG, "RESET: CPU 0x%04x in fast sleep"
+ " undoing workarounds...\n", cpu->pir);
+ fast_sleep_exit();
+ }
+
+ /* The TLB surely contains garbage.
+ * P9 clears TLBs in cpu_fast_reboot_complete
+ */
+ cleanup_local_tlb();
+ }
+
+ /* And we might have lost TB sync */
+ chiptod_wakeup_resync();
+ }
+
+ /* Per-thread additional cleanup */
+ init_replicated_sprs();
+
+ // XXX Cleanup SLW, check HIDs ...
+}
+
+/* Entry from asm after a fast reset */
+void __noreturn fast_reboot_entry(void);
+
+void __noreturn fast_reboot_entry(void)
+{
+ struct cpu_thread *cpu = this_cpu();
+
+ if (proc_gen == proc_gen_p8) {
+ /* We reset our ICP first ! Otherwise we might get stray
+ * interrupts when unsplitting
+ */
+ reset_cpu_icp();
+
+ /* If we are split, we need to unsplit. Since that can send us
+ * to NAP, which will come back via reset, we do it now
+ */
+ check_split_core();
+ }
+
+ /* Until SPRs (notably HID[HILE]) are set and new exception vectors
+ * installed, nobody should take machine checks. Try to do minimal
+ * work between these points.
+ */
+ disable_machine_check();
+ mtmsrd(0, 1); /* Clear RI */
+
+ sync();
+ cpu->state = cpu_state_fast_reboot_entry;
+ sync();
+ if (cpu == boot_cpu) {
+ cpu_state_wait_all_others(cpu_state_fast_reboot_entry, 0);
+ spr_set_release = true;
+ } else {
+ wait_on(&spr_set_release);
+ }
+
+
+ /* Reset SPRs */
+ if (cpu_is_thread0(cpu))
+ init_shared_sprs();
+ init_replicated_sprs();
+
+ if (cpu == boot_cpu) {
+ /* Restore skiboot vectors */
+ copy_exception_vectors();
+ copy_sreset_vector();
+ patch_traps(true);
+ }
+
+ /* Must wait for the others too, because shared SPRs like HID0 are only
+ * set by thread0, so machine checks can't be enabled until those have
+ * been set.
+ */
+ sync();
+ cpu->state = cpu_state_present;
+ sync();
+ if (cpu == boot_cpu) {
+ cpu_state_wait_all_others(cpu_state_present, 0);
+ nmi_mce_release = true;
+ } else {
+ wait_on(&nmi_mce_release);
+ }
+
+ /* At this point skiboot exception vectors are in place and all
+ * cores/threads have SPRs set for running skiboot.
+ */
+ enable_machine_check();
+ mtmsrd(MSR_RI, 1);
+
+ cleanup_cpu_state();
+
+ prlog(PR_DEBUG, "RESET: CPU 0x%04x reset in\n", cpu->pir);
+
+ /* The original boot CPU (not the fast reboot initiator) takes
+ * command. Secondaries wait for the signal then go to their secondary
+ * entry point.
+ */
+ if (cpu != boot_cpu) {
+ wait_on(&fast_boot_release);
+
+ __secondary_cpu_entry();
+ }
+
+ if (proc_gen == proc_gen_p9)
+ xive_reset();
+ else if (proc_gen == proc_gen_p10)
+ xive2_reset();
+
+ /* Let the CPU layer do some last minute global cleanups */
+ cpu_fast_reboot_complete();
+
+ /* We can now do NAP mode */
+ cpu_set_sreset_enable(true);
+ cpu_set_ipi_enable(true);
+
+ prlog(PR_INFO, "RESET: Releasing secondaries...\n");
+
+ /* Release everybody */
+ sync();
+ fast_boot_release = true;
+ sync();
+ cpu->state = cpu_state_active;
+ sync();
+
+ /* Wait for them to respond */
+ cpu_state_wait_all_others(cpu_state_active, 0);
+
+ sync();
+
+ prlog(PR_INFO, "RESET: All done, cleaning up...\n");
+
+ /* Clear release flag for next time */
+ fast_boot_release = false;
+
+ if (!chip_quirk(QUIRK_MAMBO_CALLOUTS)) {
+ /*
+ * mem_region_clear_unused avoids these preload regions
+ * so it can run alongside image preloading. Clear these
+ * regions now to catch anything not overwritten by
+ * preload.
+ *
+ * Mambo may have embedded payload here, so don't clear
+ * it at all.
+ */
+ memset(KERNEL_LOAD_BASE, 0, KERNEL_LOAD_SIZE);
+ memset(INITRAMFS_LOAD_BASE, 0, INITRAMFS_LOAD_SIZE);
+ }
+
+ /* Start preloading kernel and ramdisk */
+ start_preload_kernel();
+
+ /* Start clearing memory */
+ start_mem_region_clear_unused();
+
+ if (platform.fast_reboot_init)
+ platform.fast_reboot_init();
+
+ if (proc_gen == proc_gen_p8) {
+ /* XXX */
+ /* Reset/EOI the PSI interrupt */
+ psi_irq_reset();
+ }
+
+ /* update pci nvram settings */
+ pci_nvram_init();
+
+ /* Remove all PCI devices */
+ if (pci_reset()) {
+ prlog(PR_NOTICE, "RESET: Fast reboot failed to reset PCI\n");
+
+ /*
+ * Can't return to caller here because we're past no-return.
+ * Attempt an IPL here which is what the caller would do.
+ */
+ if (platform.cec_reboot)
+ platform.cec_reboot();
+ for (;;)
+ ;
+ }
+
+ ipmi_set_fw_progress_sensor(IPMI_FW_PCI_INIT);
+
+ wait_mem_region_clear_unused();
+
+ /* Load and boot payload */
+ load_and_boot_kernel(true);
+}
diff --git a/roms/skiboot/core/fdt.c b/roms/skiboot/core/fdt.c
new file mode 100644
index 000000000..463dc6912
--- /dev/null
+++ b/roms/skiboot/core/fdt.c
@@ -0,0 +1,258 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Produce and consume flattened device trees
+ *
+ * Copyright 2013-2019 IBM Corp.
+ */
+
+#include <skiboot.h>
+#include <stdarg.h>
+#include <libfdt.h>
+#include <device.h>
+#include <chip.h>
+#include <cpu.h>
+#include <opal.h>
+#include <interrupts.h>
+#include <fsp.h>
+#include <cec.h>
+#include <vpd.h>
+#include <ccan/str/str.h>
+
+static int fdt_error;
+
+#undef DEBUG_FDT
+#ifdef DEBUG_FDT
+#define FDT_DBG(fmt, a...) prlog(PR_DEBUG, "FDT: " fmt, ##a)
+#else
+#define FDT_DBG(fmt, a...)
+#endif
+
+static void __save_err(int err, const char *str)
+{
+ FDT_DBG("rc: %d from \"%s\"\n", err, str);
+ if (err && !fdt_error) {
+ prerror("FDT: Error %d from \"%s\"\n", err, str);
+ fdt_error = err;
+ }
+}
+
+#define save_err(...) __save_err(__VA_ARGS__, #__VA_ARGS__)
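+
+/*
+ * For illustration: save_err() stringifies its argument list, so a call
+ * such as
+ *     save_err(fdt_begin_node(fdt, dn->name));
+ * expands to
+ *     __save_err(fdt_begin_node(fdt, dn->name),
+ *                "fdt_begin_node(fdt, dn->name)");
+ * which lets __save_err() quote the exact libfdt call that failed.
+ */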
+
+static void dt_property_cell(void *fdt, const char *name, u32 cell)
+{
+ save_err(fdt_property_cell(fdt, name, cell));
+}
+
+static void dt_begin_node(void *fdt, const struct dt_node *dn)
+{
+ save_err(fdt_begin_node(fdt, dn->name));
+
+ dt_property_cell(fdt, "phandle", dn->phandle);
+}
+
+static void dt_property(void *fdt, const struct dt_property *p)
+{
+ save_err(fdt_property(fdt, p->name, p->prop, p->len));
+}
+
+static void dt_end_node(void *fdt)
+{
+ save_err(fdt_end_node(fdt));
+}
+
+#ifdef DEBUG_FDT
+static void dump_fdt(void *fdt)
+{
+ int i, off, depth, err;
+
+ prlog(PR_INFO, "Device tree %u@%p\n", fdt_totalsize(fdt), fdt);
+ err = fdt_check_header(fdt);
+ if (err) {
+ prerror("fdt_check_header: %s\n", fdt_strerror(err));
+ return;
+ }
+ prlog(PR_INFO, "fdt_check_header passed\n");
+
+ prlog(PR_INFO, "fdt_num_mem_rsv = %u\n", fdt_num_mem_rsv(fdt));
+ for (i = 0; i < fdt_num_mem_rsv(fdt); i++) {
+ u64 addr, size;
+
+ err = fdt_get_mem_rsv(fdt, i, &addr, &size);
+ if (err) {
+ prlog(PR_INFO, " ERR %s\n", fdt_strerror(err));
+ return;
+ }
+ prlog(PR_INFO, " mem_rsv[%i] = %lu@%#lx\n",
+ i, (long)addr, (long)size);
+ }
+
+ for (off = fdt_next_node(fdt, 0, &depth);
+ off > 0;
+ off = fdt_next_node(fdt, off, &depth)) {
+ int len;
+ const char *name;
+
+ name = fdt_get_name(fdt, off, &len);
+ if (!name) {
+ prerror("fdt: offset %i no name!\n", off);
+ return;
+ }
+ prlog(PR_INFO, "name: %s [%u]\n", name, off);
+ }
+}
+#endif
+
+static void flatten_dt_properties(void *fdt, const struct dt_node *dn)
+{
+ const struct dt_property *p;
+
+ list_for_each(&dn->properties, p, list) {
+ if (strstarts(p->name, DT_PRIVATE))
+ continue;
+
+ FDT_DBG(" prop: %s size: %ld\n", p->name, p->len);
+ dt_property(fdt, p);
+ }
+}
+
+static void flatten_dt_node(void *fdt, const struct dt_node *root,
+ bool exclusive)
+{
+ const struct dt_node *i;
+
+ if (!exclusive) {
+ FDT_DBG("node: %s\n", root->name);
+ dt_begin_node(fdt, root);
+ flatten_dt_properties(fdt, root);
+ }
+
+ list_for_each(&root->children, i, list)
+ flatten_dt_node(fdt, i, false);
+
+ if (!exclusive)
+ dt_end_node(fdt);
+}
+
+static void create_dtb_reservemap(void *fdt, const struct dt_node *root)
+{
+ uint64_t base, size;
+ const __be64 *ranges;
+ const struct dt_property *prop;
+ int i;
+
+ /* Duplicate the reserved-ranges property into the fdt reservemap */
+ prop = dt_find_property(root, "reserved-ranges");
+ if (prop) {
+ ranges = (const void *)prop->prop;
+
+ for (i = 0; i < prop->len / (sizeof(uint64_t) * 2); i++) {
+ base = be64_to_cpu(*(ranges++));
+ size = be64_to_cpu(*(ranges++));
+ save_err(fdt_add_reservemap_entry(fdt, base, size));
+ }
+ }
+
+ save_err(fdt_finish_reservemap(fdt));
+}
+
+static int __create_dtb(void *fdt, size_t len,
+ const struct dt_node *root,
+ bool exclusive)
+{
+ if (chip_quirk(QUIRK_SLOW_SIM))
+ save_err(fdt_create_with_flags(fdt, len, FDT_CREATE_FLAG_NO_NAME_DEDUP));
+ else
+ save_err(fdt_create_with_flags(fdt, len, 0));
+ if (fdt_error)
+ goto err;
+
+ if (root == dt_root && !exclusive)
+ create_dtb_reservemap(fdt, root);
+ else
+ save_err(fdt_finish_reservemap(fdt));
+
+ flatten_dt_node(fdt, root, exclusive);
+
+ save_err(fdt_finish(fdt));
+ if (fdt_error) {
+err:
+ prerror("dtb: error %s\n", fdt_strerror(fdt_error));
+ return fdt_error;
+ }
+
+#ifdef DEBUG_FDT
+ dump_fdt(fdt);
+#endif
+ return 0;
+}
+
+void *create_dtb(const struct dt_node *root, bool exclusive)
+{
+ void *fdt = NULL;
+ size_t len = DEVICE_TREE_MAX_SIZE;
+ uint32_t old_last_phandle = get_last_phandle();
+ int ret;
+
+ do {
+ set_last_phandle(old_last_phandle);
+ fdt_error = 0;
+ fdt = malloc(len);
+ if (!fdt) {
+ prerror("dtb: could not malloc %lu\n", (long)len);
+ return NULL;
+ }
+
+ ret = __create_dtb(fdt, len, root, exclusive);
+ if (ret) {
+ free(fdt);
+ fdt = NULL;
+ }
+
+ len *= 2;
+ } while (ret == -FDT_ERR_NOSPACE);
+
+ return fdt;
+}
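+
+/*
+ * A minimal usage sketch (illustrative only): the buffer returned by
+ * create_dtb() is malloc()ed, grown by retrying with a doubled length on
+ * FDT_ERR_NOSPACE, and owned by the caller:
+ *
+ *     void *fdt = create_dtb(dt_root, false);
+ *     if (fdt) {
+ *             // hand fdt_totalsize(fdt) bytes to the consumer ...
+ *             free(fdt);
+ *     }
+ */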
+
+static int64_t opal_get_device_tree(uint32_t phandle,
+ uint64_t buf, uint64_t len)
+{
+ struct dt_node *root;
+ void *fdt = (void *)buf;
+ uint32_t old_last_phandle;
+ int64_t totalsize;
+ int ret;
+
+ if (!opal_addr_valid(fdt))
+ return OPAL_PARAMETER;
+
+ root = dt_find_by_phandle(dt_root, phandle);
+ if (!root)
+ return OPAL_PARAMETER;
+
+ if (!fdt) {
+ fdt = create_dtb(root, true);
+ if (!fdt)
+ return OPAL_INTERNAL_ERROR;
+ totalsize = fdt_totalsize(fdt);
+ free(fdt);
+ return totalsize;
+ }
+
+ if (!len)
+ return OPAL_PARAMETER;
+
+ fdt_error = 0;
+ old_last_phandle = get_last_phandle();
+ ret = __create_dtb(fdt, len, root, true);
+ if (ret) {
+ set_last_phandle(old_last_phandle);
+ if (ret == -FDT_ERR_NOSPACE)
+ return OPAL_NO_MEM;
+
+ return OPAL_EMPTY;
+ }
+
+ return OPAL_SUCCESS;
+}
+opal_call(OPAL_GET_DEVICE_TREE, opal_get_device_tree, 3);
diff --git a/roms/skiboot/core/flash-firmware-versions.c b/roms/skiboot/core/flash-firmware-versions.c
new file mode 100644
index 000000000..975ac6aff
--- /dev/null
+++ b/roms/skiboot/core/flash-firmware-versions.c
@@ -0,0 +1,164 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Parse VERSION partition, add to device tree
+ *
+ * Copyright 2013-2018 IBM Corp.
+ */
+
+#include <skiboot.h>
+#include <device.h>
+#include <opal.h>
+#include <libstb/secureboot.h>
+#include <libstb/trustedboot.h>
+
+/* ibm,firmware-versions support */
+static char *version_buf;
+static size_t version_buf_size = 0x2000;
+
+static void __flash_dt_add_fw_version(struct dt_node *fw_version, char* data)
+{
+ static bool first = true;
+ char *prop;
+ int version_len, i;
+ int len = strlen(data);
+ const char *skiboot_version;
+ const char * version_str[] = {"open-power", "buildroot", "skiboot",
+ "hostboot-binaries", "hostboot", "linux",
+ "petitboot", "occ", "capp-ucode", "sbe",
+ "machine-xml", "hcode"};
+
+ if (first) {
+ first = false;
+
+ /* Increment past "key-" */
+ if (memcmp(data, "open-power", strlen("open-power")) == 0)
+ prop = data + strlen("open-power");
+ else
+ prop = strchr(data, '-');
+ if (!prop) {
+ prlog(PR_DEBUG,
+ "FLASH: Invalid fw version format (%s)\n", data);
+ return;
+ }
+ prop++;
+
+ dt_add_property_string(fw_version, "version", prop);
+ return;
+ }
+
+ /*
+ * PNOR version strings are not easily consumable. Split them into
+ * property, value.
+ *
+ * Example input from PNOR :
+ * "open-power-firestone-v1.8"
+ * "linux-4.4.6-openpower1-8420e0f"
+ *
+ * Desired output in device tree:
+ * open-power = "firestone-v1.8";
+ * linux = "4.4.6-openpower1-8420e0f";
+ */
+ for(i = 0; i < ARRAY_SIZE(version_str); i++)
+ {
+ version_len = strlen(version_str[i]);
+ if (len < version_len)
+ continue;
+
+ if (memcmp(data, version_str[i], version_len) != 0)
+ continue;
+
+ /* Found a match, add property */
+ if (dt_find_property(fw_version, version_str[i]))
+ continue;
+
+ /* Increment past "key-" */
+ prop = data + version_len + 1;
+ dt_add_property_string(fw_version, version_str[i], prop);
+
+ /* Sanity check against what Skiboot thinks its version is. */
+ if (strncmp(version_str[i], "skiboot",
+ strlen("skiboot")) == 0) {
+ /*
+ * If Skiboot was built with Buildroot its version may
+ * include a 'skiboot-' prefix; ignore it.
+ */
+ if (strncmp(version, "skiboot-",
+ strlen("skiboot-")) == 0)
+ skiboot_version = version + strlen("skiboot-");
+ else
+ skiboot_version = version;
+ if (strncmp(prop, skiboot_version,
+ strlen(skiboot_version)) != 0)
+ prlog(PR_WARNING, "WARNING! Skiboot version does not match VERSION partition!\n");
+ }
+ }
+}
+
+void flash_dt_add_fw_version(void)
+{
+ uint8_t version_data[80];
+ int rc;
+ int numbytes = 0, i = 0;
+ struct dt_node *fw_version;
+
+ if (version_buf == NULL)
+ return;
+
+ rc = wait_for_resource_loaded(RESOURCE_ID_VERSION, RESOURCE_SUBID_NONE);
+ if (rc != OPAL_SUCCESS) {
+ prlog(PR_WARNING, "FLASH: Failed to load VERSION data\n");
+ free(version_buf);
+ return;
+ }
+
+ fw_version = dt_new(dt_root, "ibm,firmware-versions");
+ assert(fw_version);
+
+ if (stb_is_container(version_buf, version_buf_size))
+ numbytes += SECURE_BOOT_HEADERS_SIZE;
+ for ( ; (numbytes < version_buf_size) && version_buf[numbytes]; numbytes++) {
+ if (version_buf[numbytes] == '\n') {
+ version_data[i] = '\0';
+ __flash_dt_add_fw_version(fw_version, version_data);
+ memset(version_data, 0, sizeof(version_data));
+ i = 0;
+ continue;
+ } else if (version_buf[numbytes] == '\t') {
+ continue; /* skip tabs */
+ }
+
+ version_data[i++] = version_buf[numbytes];
+ if (i == sizeof(version_data)) {
+ prlog(PR_WARNING, "VERSION item >%lu chars, skipping\n",
+ sizeof(version_data));
+ break;
+ }
+ }
+
+ free(version_buf);
+}
+
+void flash_fw_version_preload(void)
+{
+ int rc;
+
+ if (proc_gen < proc_gen_p9)
+ return;
+
+ prlog(PR_INFO, "FLASH: Loading VERSION section\n");
+
+ version_buf = malloc(version_buf_size);
+ if (!version_buf) {
+ prlog(PR_WARNING, "FLASH: Failed to allocate memory\n");
+ return;
+ }
+
+ rc = start_preload_resource(RESOURCE_ID_VERSION, RESOURCE_SUBID_NONE,
+ version_buf, &version_buf_size);
+ if (rc != OPAL_SUCCESS) {
+ prlog(PR_WARNING,
+ "FLASH: Failed to start loading VERSION data\n");
+ free(version_buf);
+ version_buf = NULL;
+ }
+}
diff --git a/roms/skiboot/core/flash-subpartition.c b/roms/skiboot/core/flash-subpartition.c
new file mode 100644
index 000000000..6e0fec6c3
--- /dev/null
+++ b/roms/skiboot/core/flash-subpartition.c
@@ -0,0 +1,110 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Parse flash sub-partitions
+ *
+ * Copyright 2013-2018 IBM Corp.
+ */
+
+#include <skiboot.h>
+#include <opal-api.h>
+
+struct flash_hostboot_toc {
+ be32 ec;
+ be32 offset; /* From start of header. 4K aligned */
+ be32 size;
+};
+#define FLASH_HOSTBOOT_TOC_MAX_ENTRIES ((FLASH_SUBPART_HEADER_SIZE - 8) \
+ / sizeof(struct flash_hostboot_toc))
+
+struct flash_hostboot_header {
+ char eyecatcher[4];
+ be32 version;
+ struct flash_hostboot_toc toc[FLASH_HOSTBOOT_TOC_MAX_ENTRIES];
+};
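+
+/*
+ * Layout summary (derived from the parsing code below): the header starts
+ * with a 4-byte eyecatcher and a be32 version (only version 1 is
+ * accepted), followed by TOC entries of (ec, offset, size) terminated by
+ * an all-zero entry. Offsets are relative to the start of the header.
+ */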
+
+int flash_subpart_info(void *part_header, uint32_t header_len,
+ uint32_t part_size, uint32_t *part_actualp,
+ uint32_t subid, uint32_t *offset, uint32_t *size)
+{
+ struct flash_hostboot_header *header;
+ char eyecatcher[5];
+ uint32_t i, ec, o, s;
+ uint32_t part_actual;
+ bool subpart_found;
+
+ if (!part_header || ( !offset && !size && !part_actualp)) {
+ prlog(PR_ERR, "FLASH: invalid parameters: ph %p of %p sz %p "
+ "tsz %p\n", part_header, offset, size, part_actualp);
+ return OPAL_PARAMETER;
+ }
+
+ if (header_len < FLASH_SUBPART_HEADER_SIZE) {
+ prlog(PR_ERR, "FLASH: subpartition header too small 0x%x\n",
+ header_len);
+ return OPAL_PARAMETER;
+ }
+
+ header = (struct flash_hostboot_header*) part_header;
+
+ /* Sanity check the TOC version */
+ i = be32_to_cpu(header->version);
+ if (i != 1) {
+ prerror("FLASH: flash subpartition TOC version unknown %i\n", i);
+ return OPAL_RESOURCE;
+ }
+
+ /* NULL terminate eyecatcher */
+ strncpy(eyecatcher, header->eyecatcher, 4);
+ eyecatcher[4] = '\0';
+ prlog(PR_DEBUG, "FLASH: flash subpartition eyecatcher %s\n",
+ eyecatcher);
+
+ subpart_found = false;
+ part_actual = 0;
+ for (i = 0; i < FLASH_HOSTBOOT_TOC_MAX_ENTRIES; i++) {
+
+ ec = be32_to_cpu(header->toc[i].ec);
+ o = be32_to_cpu(header->toc[i].offset);
+ s = be32_to_cpu(header->toc[i].size);
+
+ /* Check for null terminating entry */
+ if (!ec && !o && !s)
+ break;
+
+ /* Sanity check the offset and size. */
+ if (o + s > part_size) {
+ prerror("FLASH: flash subpartition too big: %i\n", i);
+ return OPAL_RESOURCE;
+ }
+ if (!s) {
+ prerror("FLASH: flash subpartition zero size: %i\n", i);
+ return OPAL_RESOURCE;
+ }
+ if (o < FLASH_SUBPART_HEADER_SIZE) {
+ prerror("FLASH: flash subpartition offset too small: "
+ "%i\n", i);
+ return OPAL_RESOURCE;
+ }
+ /*
+ * Subpartition contents differ, but multiple TOC entries
+ * may point to the same subpartition.
+ */
+ if (ALIGN_UP(o + s, FLASH_SUBPART_HEADER_SIZE) > part_actual)
+ part_actual = ALIGN_UP(o + s, FLASH_SUBPART_HEADER_SIZE);
+
+ if (ec == subid) {
+ if (offset)
+ *offset += o;
+ if (size)
+ *size = s;
+ subpart_found = true;
+ }
+ }
+ if (!subpart_found && (offset || size)) {
+ prerror("FLASH: flash subpartition not found.\n");
+ return OPAL_RESOURCE;
+ }
+ if (part_actualp)
+ *part_actualp = part_actual;
+ return OPAL_SUCCESS;
+}
diff --git a/roms/skiboot/core/flash.c b/roms/skiboot/core/flash.c
new file mode 100644
index 000000000..8c1e788c4
--- /dev/null
+++ b/roms/skiboot/core/flash.c
@@ -0,0 +1,1186 @@
+// SPDX-License-Identifier: Apache-2.0
+/*
+ * Init, manage, read, write, and load resources from flash
+ *
+ * Copyright 2013-2019 IBM Corp.
+ * Copyright 2018-2019 Raptor Engineering, LLC
+ */
+
+#define pr_fmt(fmt) "FLASH: " fmt
+
+#include <skiboot.h>
+#include <cpu.h>
+#include <lock.h>
+#include <opal.h>
+#include <opal-msg.h>
+#include <platform.h>
+#include <device.h>
+#include <libflash/libflash.h>
+#include <libflash/libffs.h>
+#include <libflash/ipmi-hiomap.h>
+#include <libflash/blocklevel.h>
+#include <libflash/ecc.h>
+#include <libstb/secureboot.h>
+#include <libstb/trustedboot.h>
+#include <libxz/xz.h>
+#include <elf.h>
+#include <timebase.h>
+
+struct flash {
+ struct list_node list;
+ bool busy;
+ bool no_erase;
+ struct blocklevel_device *bl;
+ uint64_t size;
+ uint32_t block_size;
+ int id;
+};
+
+static struct {
+ enum resource_id id;
+ uint32_t subid;
+ char name[PART_NAME_MAX+1];
+} part_name_map[] = {
+ { RESOURCE_ID_KERNEL, RESOURCE_SUBID_NONE, "BOOTKERNEL" },
+ { RESOURCE_ID_INITRAMFS,RESOURCE_SUBID_NONE, "ROOTFS" },
+ { RESOURCE_ID_CAPP, RESOURCE_SUBID_SUPPORTED, "CAPP" },
+ { RESOURCE_ID_IMA_CATALOG, RESOURCE_SUBID_SUPPORTED, "IMA_CATALOG" },
+ { RESOURCE_ID_VERSION, RESOURCE_SUBID_NONE, "VERSION" },
+ { RESOURCE_ID_KERNEL_FW, RESOURCE_SUBID_NONE, "BOOTKERNFW" },
+};
+
+static LIST_HEAD(flashes);
+static struct flash *system_flash;
+
+/* Using a single lock as we only have one flash at present. */
+static struct lock flash_lock;
+
+/* nvram-on-flash support */
+static struct flash *nvram_flash;
+static u32 nvram_offset, nvram_size;
+
+/* secboot-on-flash support */
+static struct flash *secboot_flash;
+static u32 secboot_offset, secboot_size;
+
+bool flash_reserve(void)
+{
+ bool rc = false;
+
+ if (!try_lock(&flash_lock))
+ return false;
+
+ if (!system_flash->busy) {
+ system_flash->busy = true;
+ rc = true;
+ }
+ unlock(&flash_lock);
+
+ return rc;
+}
+
+void flash_release(void)
+{
+ lock(&flash_lock);
+ system_flash->busy = false;
+ unlock(&flash_lock);
+}
+
+bool flash_unregister(void)
+{
+ struct blocklevel_device *bl = system_flash->bl;
+
+ if (bl->exit)
+ return bl->exit(bl);
+
+ prlog(PR_NOTICE, "Unregister flash device is not supported\n");
+ return true;
+}
+
+int flash_secboot_info(uint32_t *total_size)
+{
+ int rc;
+
+ lock(&flash_lock);
+ if (!secboot_flash) {
+ rc = OPAL_HARDWARE;
+ } else if (secboot_flash->busy) {
+ rc = OPAL_BUSY;
+ } else {
+ *total_size = secboot_size;
+ rc = OPAL_SUCCESS;
+ }
+ unlock(&flash_lock);
+
+ return rc;
+}
+
+int flash_secboot_read(void *dst, uint32_t src, uint32_t len)
+{
+ int rc;
+
+ if (!try_lock(&flash_lock))
+ return OPAL_BUSY;
+
+ if (!secboot_flash) {
+ rc = OPAL_HARDWARE;
+ goto out;
+ }
+
+ if (secboot_flash->busy) {
+ rc = OPAL_BUSY;
+ goto out;
+ }
+
+ if ((src + len) > secboot_size) {
+ prerror("FLASH_SECBOOT: read out of bound (0x%x,0x%x)\n",
+ src, len);
+ rc = OPAL_PARAMETER;
+ goto out;
+ }
+
+ secboot_flash->busy = true;
+ unlock(&flash_lock);
+
+ rc = blocklevel_read(secboot_flash->bl, secboot_offset + src, dst, len);
+
+ lock(&flash_lock);
+ secboot_flash->busy = false;
+out:
+ unlock(&flash_lock);
+ return rc;
+}
+
+int flash_secboot_write(uint32_t dst, void *src, uint32_t len)
+{
+ int rc;
+
+ if (!try_lock(&flash_lock))
+ return OPAL_BUSY;
+
+ if (secboot_flash->busy) {
+ rc = OPAL_BUSY;
+ goto out;
+ }
+
+ if ((dst + len) > secboot_size) {
+ prerror("FLASH_SECBOOT: write out of bound (0x%x,0x%x)\n",
+ dst, len);
+ rc = OPAL_PARAMETER;
+ goto out;
+ }
+
+ secboot_flash->busy = true;
+ unlock(&flash_lock);
+
+ rc = blocklevel_write(secboot_flash->bl, secboot_offset + dst, src, len);
+
+ lock(&flash_lock);
+ secboot_flash->busy = false;
+out:
+ unlock(&flash_lock);
+ return rc;
+}
+
+static int flash_nvram_info(uint32_t *total_size)
+{
+ int rc;
+
+ lock(&flash_lock);
+ if (!nvram_flash) {
+ rc = OPAL_HARDWARE;
+ } else if (nvram_flash->busy) {
+ rc = OPAL_BUSY;
+ } else {
+ *total_size = nvram_size;
+ rc = OPAL_SUCCESS;
+ }
+ unlock(&flash_lock);
+
+ return rc;
+}
+
+static int flash_nvram_start_read(void *dst, uint32_t src, uint32_t len)
+{
+ int rc;
+
+ if (!try_lock(&flash_lock))
+ return OPAL_BUSY;
+
+ if (!nvram_flash) {
+ rc = OPAL_HARDWARE;
+ goto out;
+ }
+
+ if (nvram_flash->busy) {
+ rc = OPAL_BUSY;
+ goto out;
+ }
+
+ if ((src + len) > nvram_size) {
+ prerror("NVRAM: read out of bound (0x%x,0x%x)\n",
+ src, len);
+ rc = OPAL_PARAMETER;
+ goto out;
+ }
+
+ nvram_flash->busy = true;
+ unlock(&flash_lock);
+
+ rc = blocklevel_read(nvram_flash->bl, nvram_offset + src, dst, len);
+
+ lock(&flash_lock);
+ nvram_flash->busy = false;
+out:
+ unlock(&flash_lock);
+ if (!rc)
+ nvram_read_complete(true);
+ return rc;
+}
+
+static int flash_nvram_write(uint32_t dst, void *src, uint32_t len)
+{
+ int rc;
+
+ if (!try_lock(&flash_lock))
+ return OPAL_BUSY;
+
+ if (nvram_flash->busy) {
+ rc = OPAL_BUSY;
+ goto out;
+ }
+
+ /* TODO: When we have async jobs for PRD, turn this into one */
+
+ if ((dst + len) > nvram_size) {
+ prerror("NVRAM: write out of bound (0x%x,0x%x)\n",
+ dst, len);
+ rc = OPAL_PARAMETER;
+ goto out;
+ }
+
+ nvram_flash->busy = true;
+ unlock(&flash_lock);
+
+ rc = blocklevel_write(nvram_flash->bl, nvram_offset + dst, src, len);
+
+ lock(&flash_lock);
+ nvram_flash->busy = false;
+out:
+ unlock(&flash_lock);
+ return rc;
+}
+
+
+static int flash_secboot_probe(struct flash *flash, struct ffs_handle *ffs)
+{
+ uint32_t start, size, part;
+ bool ecc;
+ int rc;
+
+ prlog(PR_DEBUG, "FLASH: probing for SECBOOT\n");
+
+ rc = ffs_lookup_part(ffs, "SECBOOT", &part);
+ if (rc) {
+ prlog(PR_WARNING, "FLASH: no SECBOOT partition found\n");
+ return OPAL_HARDWARE;
+ }
+
+ rc = ffs_part_info(ffs, part, NULL,
+ &start, &size, NULL, &ecc);
+ if (rc) {
+ /**
+ * @fwts-label SECBOOTNoPartition
+ * @fwts-advice OPAL could not find an SECBOOT partition
+ * on the system flash. Check that the system flash
+ * has a valid partition table, and that the firmware
+ * build process has added a SECBOOT partition.
+ */
+ prlog(PR_ERR, "FLASH: Can't parse ffs info for SECBOOT\n");
+ return OPAL_HARDWARE;
+ }
+
+ secboot_flash = flash;
+ secboot_offset = start;
+ secboot_size = ecc ? ecc_buffer_size_minus_ecc(size) : size;
+
+ return 0;
+}
+
+static int flash_nvram_probe(struct flash *flash, struct ffs_handle *ffs)
+{
+ uint32_t start, size, part;
+ bool ecc;
+ int rc;
+
+ prlog(PR_INFO, "probing for NVRAM\n");
+
+ rc = ffs_lookup_part(ffs, "NVRAM", &part);
+ if (rc) {
+ prlog(PR_WARNING, "no NVRAM partition found\n");
+ return OPAL_HARDWARE;
+ }
+
+ rc = ffs_part_info(ffs, part, NULL,
+ &start, &size, NULL, &ecc);
+ if (rc) {
+ /**
+ * @fwts-label NVRAMNoPartition
+ * @fwts-advice OPAL could not find an NVRAM partition
+ * on the system flash. Check that the system flash
+ * has a valid partition table, and that the firmware
+ * build process has added a NVRAM partition.
+ */
+ prlog(PR_ERR, "Can't parse ffs info for NVRAM\n");
+ return OPAL_HARDWARE;
+ }
+
+ nvram_flash = flash;
+ nvram_offset = start;
+ nvram_size = ecc ? ecc_buffer_size_minus_ecc(size) : size;
+
+ platform.nvram_info = flash_nvram_info;
+ platform.nvram_start_read = flash_nvram_start_read;
+ platform.nvram_write = flash_nvram_write;
+
+ return 0;
+}
+
+/* core flash support */
+
+static struct dt_node *flash_add_dt_node(struct flash *flash, int id)
+{
+ int i;
+ int rc;
+ const char *name;
+ bool ecc;
+ struct ffs_handle *ffs;
+ int ffs_part_num, ffs_part_start, ffs_part_size;
+ struct dt_node *flash_node;
+ struct dt_node *partition_container_node;
+ struct dt_node *partition_node;
+
+ flash_node = dt_new_addr(opal_node, "flash", id);
+ dt_add_property_strings(flash_node, "compatible", "ibm,opal-flash");
+ dt_add_property_cells(flash_node, "ibm,opal-id", id);
+ dt_add_property_u64(flash_node, "reg", flash->size);
+ dt_add_property_cells(flash_node, "ibm,flash-block-size",
+ flash->block_size);
+ if (flash->no_erase)
+ dt_add_property(flash_node, "no-erase", NULL, 0);
+
+ /* we fix to 32-bits */
+ dt_add_property_cells(flash_node, "#address-cells", 1);
+ dt_add_property_cells(flash_node, "#size-cells", 1);
+
+ /* Add partition container node */
+ partition_container_node = dt_new(flash_node, "partitions");
+ dt_add_property_strings(partition_container_node, "compatible", "fixed-partitions");
+
+ /* we fix to 32-bits */
+ dt_add_property_cells(partition_container_node, "#address-cells", 1);
+ dt_add_property_cells(partition_container_node, "#size-cells", 1);
+
+ /* Add partitions */
+ for (i = 0, name = NULL; i < ARRAY_SIZE(part_name_map); i++) {
+ name = part_name_map[i].name;
+
+ rc = ffs_init(0, flash->size, flash->bl, &ffs, 1);
+ if (rc) {
+ prerror("Can't open ffs handle\n");
+ continue;
+ }
+
+ rc = ffs_lookup_part(ffs, name, &ffs_part_num);
+ if (rc) {
+ /* This is not an error per se, some partitions
+ * are purposefully absent, don't spam the logs
+ */
+ prlog(PR_DEBUG, "No %s partition\n", name);
+ continue;
+ }
+ rc = ffs_part_info(ffs, ffs_part_num, NULL,
+ &ffs_part_start, NULL, &ffs_part_size, &ecc);
+ if (rc) {
+ prerror("Failed to get %s partition info\n", name);
+ continue;
+ }
+
+ partition_node = dt_new_addr(partition_container_node, "partition", ffs_part_start);
+ dt_add_property_strings(partition_node, "label", name);
+ dt_add_property_cells(partition_node, "reg", ffs_part_start, ffs_part_size);
+ if (part_name_map[i].id != RESOURCE_ID_KERNEL_FW) {
+ /* Mark all partitions other than the full PNOR and the boot kernel
+ * firmware as read only. These two partitions are the only partitions
+ * that are properly erase block aligned at this time.
+ */
+ dt_add_property(partition_node, "read-only", NULL, 0);
+ }
+ }
+
+ partition_node = dt_new_addr(partition_container_node, "partition", 0);
+ dt_add_property_strings(partition_node, "label", "PNOR");
+ dt_add_property_cells(partition_node, "reg", 0, flash->size);
+
+ return flash_node;
+}
+
+static void setup_system_flash(struct flash *flash, struct dt_node *node,
+ const char *name, struct ffs_handle *ffs)
+{
+ char *path;
+
+ if (!ffs)
+ return;
+
+ if (system_flash) {
+ /**
+ * @fwts-label SystemFlashMultiple
+ * @fwts-advice OPAL Found multiple system flash.
+ * Since we've already found a system flash we are
+ * going to use that one but this ordering is not
+ * guaranteed so may change in future.
+ */
+ prlog(PR_WARNING, "Attempted to register multiple system "
+ "flash: %s\n", name);
+ return;
+ }
+
+ prlog(PR_NOTICE, "Found system flash: %s id:%i\n",
+ name, flash->id);
+
+ system_flash = flash;
+ path = dt_get_path(node);
+ dt_add_property_string(dt_chosen, "ibm,system-flash", path);
+ free(path);
+
+ prlog(PR_INFO, "registered system flash device %s\n", name);
+
+ flash_nvram_probe(flash, ffs);
+ flash_secboot_probe(flash, ffs);
+}
+
+static int num_flashes(void)
+{
+ struct flash *flash;
+ int i = 0;
+
+ list_for_each(&flashes, flash, list)
+ i++;
+
+ return i;
+}
+
+int flash_register(struct blocklevel_device *bl)
+{
+ uint64_t size;
+ uint32_t block_size;
+ struct ffs_handle *ffs;
+ struct dt_node *node;
+ struct flash *flash;
+ const char *name;
+ int rc;
+
+ rc = blocklevel_get_info(bl, &name, &size, &block_size);
+ if (rc)
+ return rc;
+
+ if (!name)
+ name = "(unnamed)";
+
+ prlog(PR_INFO, "registering flash device %s "
+ "(size 0x%llx, blocksize 0x%x)\n",
+ name, size, block_size);
+
+ flash = malloc(sizeof(struct flash));
+ if (!flash) {
+ prlog(PR_ERR, "Error allocating flash structure\n");
+ return OPAL_RESOURCE;
+ }
+
+ flash->busy = false;
+ flash->bl = bl;
+ flash->no_erase = !(bl->flags & WRITE_NEED_ERASE);
+ flash->size = size;
+ flash->block_size = block_size;
+ flash->id = num_flashes();
+
+ rc = ffs_init(0, flash->size, bl, &ffs, 1);
+ if (rc) {
+ /**
+ * @fwts-label NoFFS
+ * @fwts-advice System flash isn't formatted as expected.
+ * This could mean several OPAL utilities do not function
+ * as expected. e.g. gard, pflash.
+ */
+ prlog(PR_WARNING, "No ffs info; "
+ "using raw device only\n");
+ ffs = NULL;
+ }
+
+ node = flash_add_dt_node(flash, flash->id);
+
+ setup_system_flash(flash, node, name, ffs);
+
+ if (ffs)
+ ffs_close(ffs);
+
+ lock(&flash_lock);
+ list_add(&flashes, &flash->list);
+ unlock(&flash_lock);
+
+ return OPAL_SUCCESS;
+}
+
+enum flash_op {
+ FLASH_OP_READ,
+ FLASH_OP_WRITE,
+ FLASH_OP_ERASE,
+};
+
+static int64_t opal_flash_op(enum flash_op op, uint64_t id, uint64_t offset,
+ uint64_t buf, uint64_t size, uint64_t token)
+{
+ struct flash *flash = NULL;
+ int rc;
+
+ if (!try_lock(&flash_lock))
+ return OPAL_BUSY;
+
+ list_for_each(&flashes, flash, list)
+ if (flash->id == id)
+ break;
+
+ if (flash->id != id) {
+ /* Couldn't find the flash */
+ rc = OPAL_PARAMETER;
+ goto err;
+ }
+
+ if (flash->busy) {
+ rc = OPAL_BUSY;
+ goto err;
+ }
+
+ if (size >= flash->size || offset >= flash->size
+ || offset + size > flash->size) {
+ rc = OPAL_PARAMETER;
+ goto err;
+ }
+
+ /*
+ * These ops intentionally have no smarts (ecc correction or erase
+ * before write) to them.
+ * Skiboot is simply exposing the PNOR flash to the host.
+ * The host is expected to understand that this is a raw flash
+ * device and treat it as such.
+ */
+ switch (op) {
+ case FLASH_OP_READ:
+ rc = blocklevel_raw_read(flash->bl, offset, (void *)buf, size);
+ break;
+ case FLASH_OP_WRITE:
+ rc = blocklevel_raw_write(flash->bl, offset, (void *)buf, size);
+ break;
+ case FLASH_OP_ERASE:
+ rc = blocklevel_erase(flash->bl, offset, size);
+ break;
+ default:
+ assert(0);
+ }
+
+ if (rc) {
+ rc = OPAL_HARDWARE;
+ goto err;
+ }
+
+ unlock(&flash_lock);
+
+ opal_queue_msg(OPAL_MSG_ASYNC_COMP, NULL, NULL,
+ cpu_to_be64(token),
+ cpu_to_be64(rc));
+
+ return OPAL_ASYNC_COMPLETION;
+
+err:
+ unlock(&flash_lock);
+ return rc;
+}
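+
+/*
+ * Completion semantics of opal_flash_op() (a summary of the code above):
+ * the blocklevel_* operation runs synchronously, but the result is still
+ * reported to the host via an OPAL_MSG_ASYNC_COMP message carrying the
+ * caller's token, and the call itself returns OPAL_ASYNC_COMPLETION.
+ */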
+
+static int64_t opal_flash_read(uint64_t id, uint64_t offset, uint64_t buf,
+ uint64_t size, uint64_t token)
+{
+ if (!opal_addr_valid((void *)buf))
+ return OPAL_PARAMETER;
+
+ return opal_flash_op(FLASH_OP_READ, id, offset, buf, size, token);
+}
+
+static int64_t opal_flash_write(uint64_t id, uint64_t offset, uint64_t buf,
+ uint64_t size, uint64_t token)
+{
+ if (!opal_addr_valid((void *)buf))
+ return OPAL_PARAMETER;
+
+ return opal_flash_op(FLASH_OP_WRITE, id, offset, buf, size, token);
+}
+
+static int64_t opal_flash_erase(uint64_t id, uint64_t offset, uint64_t size,
+ uint64_t token)
+{
+ return opal_flash_op(FLASH_OP_ERASE, id, offset, 0L, size, token);
+}
+
+opal_call(OPAL_FLASH_READ, opal_flash_read, 5);
+opal_call(OPAL_FLASH_WRITE, opal_flash_write, 5);
+opal_call(OPAL_FLASH_ERASE, opal_flash_erase, 4);
+
+/* flash resource API */
+const char *flash_map_resource_name(enum resource_id id)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(part_name_map); i++) {
+ if (part_name_map[i].id == id)
+ return part_name_map[i].name;
+ }
+ return NULL;
+}
+
+static size_t sizeof_elf_from_hdr(void *buf)
+{
+ struct elf_hdr *elf = (struct elf_hdr *)buf;
+ size_t sz = 0;
+
+ BUILD_ASSERT(SECURE_BOOT_HEADERS_SIZE > sizeof(struct elf_hdr));
+ BUILD_ASSERT(SECURE_BOOT_HEADERS_SIZE > sizeof(struct elf64be_hdr));
+ BUILD_ASSERT(SECURE_BOOT_HEADERS_SIZE > sizeof(struct elf32be_hdr));
+
+ if (elf->ei_ident == ELF_IDENT) {
+ if (elf->ei_class == ELF_CLASS_64) {
+ if (elf->ei_data == ELF_DATA_LSB) {
+ struct elf64le_hdr *kh = (struct elf64le_hdr *)buf;
+ sz = le64_to_cpu(kh->e_shoff) +
+ ((uint32_t)le16_to_cpu(kh->e_shentsize) *
+ (uint32_t)le16_to_cpu(kh->e_shnum));
+ } else {
+ struct elf64be_hdr *kh = (struct elf64be_hdr *)buf;
+ sz = be64_to_cpu(kh->e_shoff) +
+ ((uint32_t)be16_to_cpu(kh->e_shentsize) *
+ (uint32_t)be16_to_cpu(kh->e_shnum));
+ }
+ } else if (elf->ei_class == ELF_CLASS_32) {
+ if (elf->ei_data == ELF_DATA_LSB) {
+ struct elf32le_hdr *kh = (struct elf32le_hdr *)buf;
+ sz = le32_to_cpu(kh->e_shoff) +
+ (le16_to_cpu(kh->e_shentsize) *
+ le16_to_cpu(kh->e_shnum));
+ } else {
+ struct elf32be_hdr *kh = (struct elf32be_hdr *)buf;
+ sz = be32_to_cpu(kh->e_shoff) +
+ (be16_to_cpu(kh->e_shentsize) *
+ be16_to_cpu(kh->e_shnum));
+ }
+ }
+ }
+
+ return sz;
+}
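+
+/*
+ * In short: the image size is taken to be e_shoff + e_shentsize * e_shnum,
+ * i.e. the ELF is assumed to end right after its section header table.
+ * This bounds the BOOTKERNEL/ROOTFS read below when there is no STB
+ * container to provide a payload size.
+ */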
+
+/*
+ * load a resource from FLASH
+ * buf and len shouldn't account for ECC even if partition is ECCed.
+ *
+ * The API here is a bit strange.
+ * If resource has a STB container, buf will contain it
+ * If loading a subpartition with an STB container, buf will *NOT* contain it
+ * For trusted boot, the whole partition containing the subpart is measured.
+ *
+ * Additionally, the logic to work out how much to read from flash is insane.
+ */
+static int flash_load_resource(enum resource_id id, uint32_t subid,
+ void *buf, size_t *len)
+{
+ int i;
+ int rc = OPAL_RESOURCE;
+ struct ffs_handle *ffs;
+ struct flash *flash;
+ const char *name;
+ bool status = false;
+ bool ecc;
+ bool part_signed = false;
+ void *bufp = buf;
+ size_t bufsz = *len;
+ int ffs_part_num, ffs_part_start, ffs_part_size;
+ int content_size = 0;
+ int offset = 0;
+
+ lock(&flash_lock);
+
+ if (!system_flash) {
+ /**
+ * @fwts-label SystemFlashNotFound
+ * @fwts-advice No system flash was found. Check for missing
+ * calls flash_register(...).
+ */
+ prlog(PR_WARNING, "Can't load resource id:%i. "
+ "No system flash found\n", id);
+ goto out_unlock;
+ }
+
+ flash = system_flash;
+
+ if (flash->busy)
+ goto out_unlock;
+
+ for (i = 0, name = NULL; i < ARRAY_SIZE(part_name_map); i++) {
+ if (part_name_map[i].id == id) {
+ name = part_name_map[i].name;
+ break;
+ }
+ }
+ if (!name) {
+ prerror("Couldn't find partition for id %d\n", id);
+ goto out_unlock;
+ }
+ /*
+ * If partition doesn't have a subindex but the caller specifies one,
+ * we fail. eg. kernel partition doesn't have a subindex
+ */
+ if ((part_name_map[i].subid == RESOURCE_SUBID_NONE) &&
+ (subid != RESOURCE_SUBID_NONE)) {
+ prerror("PLAT: Partition %s doesn't have subindex\n", name);
+ goto out_unlock;
+ }
+
+ rc = ffs_init(0, flash->size, flash->bl, &ffs, 1);
+ if (rc) {
+ prerror("Can't open ffs handle: %d\n", rc);
+ goto out_unlock;
+ }
+
+ rc = ffs_lookup_part(ffs, name, &ffs_part_num);
+ if (rc) {
+ /* This is not an error per se, some partitions
+ * are purposefully absent, don't spam the logs
+ */
+ prlog(PR_DEBUG, "No %s partition\n", name);
+ goto out_free_ffs;
+ }
+ rc = ffs_part_info(ffs, ffs_part_num, NULL,
+ &ffs_part_start, NULL, &ffs_part_size, &ecc);
+ if (rc) {
+ prerror("Failed to get %s partition info\n", name);
+ goto out_free_ffs;
+ }
+ prlog(PR_DEBUG,"%s partition %s ECC\n",
+ name, ecc ? "has" : "doesn't have");
+
+ /*
+ * FIXME: Make the fact we don't support partitions smaller than 4K
+ * more explicit.
+ */
+ if (ffs_part_size < SECURE_BOOT_HEADERS_SIZE) {
+ prerror("secboot headers bigger than "
+ "partition size 0x%x\n", ffs_part_size);
+ goto out_free_ffs;
+ }
+
+ rc = blocklevel_read(flash->bl, ffs_part_start, bufp,
+ SECURE_BOOT_HEADERS_SIZE);
+ if (rc) {
+ prerror("failed to read the first 0x%x from "
+ "%s partition, rc %d\n", SECURE_BOOT_HEADERS_SIZE,
+ name, rc);
+ goto out_free_ffs;
+ }
+
+ part_signed = stb_is_container(bufp, SECURE_BOOT_HEADERS_SIZE);
+
+ prlog(PR_DEBUG, "%s partition %s signed\n", name,
+ part_signed ? "is" : "isn't");
+
+ /*
+ * part_start/size are raw pointers into the partition.
+ * ie. they will account for ECC if included.
+ */
+
+ if (part_signed) {
+ bufp += SECURE_BOOT_HEADERS_SIZE;
+ bufsz -= SECURE_BOOT_HEADERS_SIZE;
+ content_size = stb_sw_payload_size(buf, SECURE_BOOT_HEADERS_SIZE);
+ *len = content_size + SECURE_BOOT_HEADERS_SIZE;
+
+ if (content_size > bufsz) {
+ prerror("content size > buffer size\n");
+ rc = OPAL_PARAMETER;
+ goto out_free_ffs;
+ }
+
+ if (*len > ffs_part_size) {
+ prerror("FLASH: Cannot load %s. Content is larger than the partition\n",
+ name);
+ rc = OPAL_PARAMETER;
+ goto out_free_ffs;
+ }
+
+ ffs_part_start += SECURE_BOOT_HEADERS_SIZE;
+
+ rc = blocklevel_read(flash->bl, ffs_part_start, bufp,
+ content_size);
+ if (rc) {
+ prerror("failed to read content size %d"
+ " %s partition, rc %d\n",
+ content_size, name, rc);
+ goto out_free_ffs;
+ }
+
+ if (subid == RESOURCE_SUBID_NONE)
+ goto done_reading;
+
+ rc = flash_subpart_info(bufp, content_size, ffs_part_size,
+ NULL, subid, &offset, &content_size);
+ if (rc) {
+ prerror("Failed to parse subpart info for %s\n",
+ name);
+ goto out_free_ffs;
+ }
+ bufp += offset;
+ goto done_reading;
+ } else /* !part_signed */ {
+ /*
+ * Back to the old way of doing things, no STB header.
+ */
+ if (subid == RESOURCE_SUBID_NONE) {
+ if (id == RESOURCE_ID_KERNEL ||
+ id == RESOURCE_ID_INITRAMFS) {
+ /*
+ * Because actualSize is a lie, we compute the
+ * size of the BOOTKERNEL based on what the ELF
+ * headers say. Otherwise we end up reading more
+ * than we should
+ */
+ content_size = sizeof_elf_from_hdr(buf);
+ if (!content_size) {
+ prerror("Invalid ELF header part"
+ " %s\n", name);
+ rc = OPAL_RESOURCE;
+ goto out_free_ffs;
+ }
+ } else {
+ content_size = ffs_part_size;
+ }
+ if (content_size > bufsz) {
+ prerror("%s content size %d > "
+ " buffer size %lu\n", name,
+ content_size, bufsz);
+ rc = OPAL_PARAMETER;
+ goto out_free_ffs;
+ }
+ prlog(PR_DEBUG, "computed %s size %u\n",
+ name, content_size);
+ rc = blocklevel_read(flash->bl, ffs_part_start,
+ buf, content_size);
+ if (rc) {
+ prerror("failed to read content size %d"
+ " %s partition, rc %d\n",
+ content_size, name, rc);
+ goto out_free_ffs;
+ }
+ *len = content_size;
+ goto done_reading;
+ }
+ BUILD_ASSERT(FLASH_SUBPART_HEADER_SIZE <= SECURE_BOOT_HEADERS_SIZE);
+ rc = flash_subpart_info(bufp, SECURE_BOOT_HEADERS_SIZE,
+ ffs_part_size, &ffs_part_size, subid,
+ &offset, &content_size);
+ if (rc) {
+ prerror("FAILED reading subpart info. rc=%d\n",
+ rc);
+ goto out_free_ffs;
+ }
+
+ *len = ffs_part_size;
+ prlog(PR_DEBUG, "Computed %s partition size: %u "
+ "(subpart %u size %u offset %u)\n", name, ffs_part_size,
+ subid, content_size, offset);
+ /*
+ * For a sub partition, we read the whole (computed)
+ * partition, and then measure that.
+ * Afterwards, we memmove() things back into place for
+ * the caller.
+ */
+ rc = blocklevel_read(flash->bl, ffs_part_start,
+ buf, ffs_part_size);
+
+ bufp += offset;
+ }
+
+done_reading:
+ /*
+ * Verify and measure the retrieved PNOR partition as part of the
+ * secure boot and trusted boot requirements
+ */
+ secureboot_verify(id, buf, *len);
+ trustedboot_measure(id, buf, *len);
+
+ /* Find subpartition */
+ if (subid != RESOURCE_SUBID_NONE) {
+ memmove(buf, bufp, content_size);
+ *len = content_size;
+ }
+
+ status = true;
+
+out_free_ffs:
+ ffs_close(ffs);
+out_unlock:
+ unlock(&flash_lock);
+ return status ? OPAL_SUCCESS : rc;
+}
+
+
+struct flash_load_resource_item {
+ enum resource_id id;
+ uint32_t subid;
+ int result;
+ void *buf;
+ size_t *len;
+ struct list_node link;
+};
+
+static LIST_HEAD(flash_load_resource_queue);
+static LIST_HEAD(flash_loaded_resources);
+static struct lock flash_load_resource_lock = LOCK_UNLOCKED;
+static struct cpu_job *flash_load_job = NULL;
+
+int flash_resource_loaded(enum resource_id id, uint32_t subid)
+{
+ struct flash_load_resource_item *resource = NULL;
+ struct flash_load_resource_item *r;
+ int rc = OPAL_BUSY;
+
+ lock(&flash_load_resource_lock);
+ list_for_each(&flash_loaded_resources, r, link) {
+ if (r->id == id && r->subid == subid) {
+ resource = r;
+ break;
+ }
+ }
+
+ if (resource) {
+ rc = resource->result;
+ list_del(&resource->link);
+ free(resource);
+ }
+
+ if (list_empty(&flash_load_resource_queue) && flash_load_job) {
+ cpu_wait_job(flash_load_job, true);
+ flash_load_job = NULL;
+ }
+
+ unlock(&flash_load_resource_lock);
+
+ return rc;
+}
+
+/*
+ * Retry for 10 minutes in 5 second intervals: allow 5 minutes for a BMC reboot
+ * (need the BMC if we're using HIOMAP flash access), then 2x for some margin.
+ */
+#define FLASH_LOAD_WAIT_MS 5000
+#define FLASH_LOAD_RETRIES (2 * 5 * (60 / (FLASH_LOAD_WAIT_MS / 1000)))
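+
+/*
+ * Worked out: FLASH_LOAD_RETRIES = 2 * 5 * (60 / 5) = 120 attempts, and
+ * 120 attempts * 5 seconds per wait = 600 seconds, i.e. the 10 minutes
+ * described above.
+ */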
+
+static void flash_load_resources(void *data __unused)
+{
+ struct flash_load_resource_item *r;
+ int retries = FLASH_LOAD_RETRIES;
+ int result = OPAL_RESOURCE;
+
+ lock(&flash_load_resource_lock);
+ do {
+ if (list_empty(&flash_load_resource_queue)) {
+ break;
+ }
+ r = list_top(&flash_load_resource_queue,
+ struct flash_load_resource_item, link);
+ if (r->result != OPAL_EMPTY)
+ prerror("flash_load_resources() list_top unexpected "
+ " result %d\n", r->result);
+ r->result = OPAL_BUSY;
+ unlock(&flash_load_resource_lock);
+
+ while (retries) {
+ result = flash_load_resource(r->id, r->subid, r->buf,
+ r->len);
+ if (result == OPAL_SUCCESS) {
+ retries = FLASH_LOAD_RETRIES;
+ break;
+ }
+
+ if (result != FLASH_ERR_AGAIN &&
+ result != FLASH_ERR_DEVICE_GONE)
+ break;
+
+ time_wait_ms(FLASH_LOAD_WAIT_MS);
+
+ retries--;
+
+ prlog(PR_WARNING,
+ "Retrying load of %d:%d, %d attempts remain\n",
+ r->id, r->subid, retries);
+ }
+
+ lock(&flash_load_resource_lock);
+ r = list_pop(&flash_load_resource_queue,
+ struct flash_load_resource_item, link);
+ /* Will reuse the result from when we hit retries == 0 */
+ r->result = result;
+ list_add_tail(&flash_loaded_resources, &r->link);
+ } while(true);
+ unlock(&flash_load_resource_lock);
+}
+
+static void start_flash_load_resource_job(void)
+{
+ if (flash_load_job)
+ cpu_wait_job(flash_load_job, true);
+
+ flash_load_job = cpu_queue_job(NULL, "flash_load_resources",
+ flash_load_resources, NULL);
+
+ cpu_process_local_jobs();
+}
+
+int flash_start_preload_resource(enum resource_id id, uint32_t subid,
+ void *buf, size_t *len)
+{
+ struct flash_load_resource_item *r;
+ bool start_thread = false;
+
+ r = malloc(sizeof(struct flash_load_resource_item));
+
+ assert(r != NULL);
+ r->id = id;
+ r->subid = subid;
+ r->buf = buf;
+ r->len = len;
+ r->result = OPAL_EMPTY;
+
+ prlog(PR_DEBUG, "Queueing preload of %x/%x\n",
+ r->id, r->subid);
+
+ lock(&flash_load_resource_lock);
+ if (list_empty(&flash_load_resource_queue)) {
+ start_thread = true;
+ }
+ list_add_tail(&flash_load_resource_queue, &r->link);
+ unlock(&flash_load_resource_lock);
+
+ if (start_thread)
+ start_flash_load_resource_job();
+
+ return OPAL_SUCCESS;
+}
+
+/*
+ * The `libxz` decompression routines are blocking; the wrappers below, built
+ * around the `libxz` functions, provide support for asynchronous
+ * decompression. There are two routines: one starts the decompression, and
+ * the other waits for it to complete.
+ *
+ * The decompressed image will be present in the `dst` parameter of
+ * `xz_decompress` structure.
+ *
+ * When the decompression is successful, the xz_decompress->status will be
+ * `OPAL_SUCCESS` else OPAL_PARAMETER, see definition of xz_decompress structure
+ * for details.
+ */
+static void xz_decompress(void *data)
+{
+ struct xz_decompress *xz = (struct xz_decompress *)data;
+ struct xz_dec *s;
+ struct xz_buf b;
+
+ /* Initialize the xz library first */
+ xz_crc32_init();
+ s = xz_dec_init(XZ_SINGLE, 0);
+ if (s == NULL) {
+ prerror("initialization error for xz\n");
+ xz->status = OPAL_NO_MEM;
+ return;
+ }
+
+ xz->xz_error = XZ_DATA_ERROR;
+ xz->status = OPAL_PARTIAL;
+
+ b.in = xz->src;
+ b.in_pos = 0;
+ b.in_size = xz->src_size;
+ b.out = xz->dst;
+ b.out_pos = 0;
+ b.out_size = xz->dst_size;
+
+ /* Start decompressing */
+ xz->xz_error = xz_dec_run(s, &b);
+ if (xz->xz_error != XZ_STREAM_END) {
+ prerror("failed to decompress subpartition\n");
+ xz->status = OPAL_PARAMETER;
+ } else
+ xz->status = OPAL_SUCCESS;
+
+ xz_dec_end(s);
+}
+
+/*
+ * xz_start_decompress: start the decompression job and return.
+ *
+ * struct xz_decompress *xz, should be populated by the caller with
+ * - the starting address of the compressed binary
+ * - the address where the decompressed image should be placed
+ * - the sizes of the source and the destination
+ *
+ * xz->src: Source address (The compressed binary)
+ * xz->src_size: Source size
+ * xz->dst: Destination address (The memory area where the `src` will be
+ * decompressed)
+ * xz->dst_size: Destination size
+ *
+ * The `status` value will be OPAL_PARTIAL till the job completes (successfully
+ * or not)
+ */
+void xz_start_decompress(struct xz_decompress *xz)
+{
+ struct cpu_job *job;
+
+ if (!xz)
+ return;
+
+ if (!xz->dst || !xz->dst_size || !xz->src || !xz->src_size) {
+ xz->status = OPAL_PARAMETER;
+ return;
+ }
+
+ job = cpu_queue_job(NULL, "xz_decompress", xz_decompress,
+ (void *) xz);
+ if (!job) {
+ xz->status = OPAL_NO_MEM;
+ return;
+ }
+
+ xz->job = job;
+}
+
+/*
+ * This function waits for the decompression job to complete. The `status`
+ * member of `xz_decompress` will hold the result code.
+ *
+ * status == OPAL_SUCCESS on success, else the corresponding error code.
+ */
+void wait_xz_decompress(struct xz_decompress *xz)
+{
+ if (!xz)
+ return;
+
+ cpu_wait_job(xz->job, true);
+}
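+
+/*
+ * A minimal usage sketch (illustrative; the buffer names are hypothetical):
+ *
+ *     struct xz_decompress xz = {
+ *             .src = comp_buf, .src_size = comp_len,
+ *             .dst = image_buf, .dst_size = image_len,
+ *     };
+ *     xz_start_decompress(&xz);
+ *     // ... overlap other work here ...
+ *     wait_xz_decompress(&xz);
+ *     if (xz.status != OPAL_SUCCESS)
+ *             prerror("decompression failed\n");
+ */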
diff --git a/roms/skiboot/core/gcov-profiling.c b/roms/skiboot/core/gcov-profiling.c
new file mode 100644
index 000000000..fdad51ed9
--- /dev/null
+++ b/roms/skiboot/core/gcov-profiling.c
@@ -0,0 +1,127 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * libgcov skeleton reimplementation to build skiboot with gcov support
+ *
+ * Copyright 2015-2018 IBM Corp.
+ */
+
+#include <skiboot.h>
+#include <compiler.h>
+#include <stdio.h>
+
+typedef long gcov_type;
+
+/*
+ * This is a GCC-internal data structure. See GCC libgcc/libgcov.h for
+ * details.
+ *
+ * If gcc changes this, we have to change it.
+ */
+
+typedef unsigned int gcov_unsigned_int;
+
+#if __GNUC__ == 4 && __GNUC_MINOR__ >= 9
+#define GCOV_COUNTERS 9
+#else
+#define GCOV_COUNTERS 8
+#endif
+
+struct gcov_info
+{
+ gcov_unsigned_int version;
+ struct gcov_info *next;
+ gcov_unsigned_int stamp;
+ const char *filename;
+ void (*merge[GCOV_COUNTERS])(gcov_type *, unsigned int);
+ unsigned int n_functions;
+ struct gcov_fn_info **functions;
+};
+
+/* We have a list of all gcov info set up at startup */
+struct gcov_info *gcov_info_list;
+
+void __gcov_init(struct gcov_info* f);
+void skiboot_gcov_done(void);
+void __gcov_flush(void);
+void __gcov_merge_add(gcov_type *counters, unsigned int n_counters);
+void __gcov_merge_single(gcov_type *counters, unsigned int n_counters);
+void __gcov_merge_delta(gcov_type *counters, unsigned int n_counters);
+void __gcov_merge_ior(gcov_type *counters, unsigned int n_counters);
+void __gcov_merge_time_profile(gcov_type *counters, unsigned int n_counters);
+void __gcov_exit(void);
+
+void __gcov_init(struct gcov_info* f)
+{
+ static gcov_unsigned_int version = 0;
+
+ if (version == 0) {
+ printf("GCOV version: %u\n", f->version);
+ version = f->version;
+ }
+
+ if (gcov_info_list)
+ f->next = gcov_info_list;
+
+ gcov_info_list = f;
+ return;
+}
+
+void skiboot_gcov_done(void)
+{
+ struct gcov_info *i = gcov_info_list;
+
+ if (i->filename)
+ printf("GCOV: gcov_info_list looks sane (first file: %s)\n",
+ i->filename);
+ else
+ prlog(PR_WARNING, "GCOV: gcov_info_list doesn't look sane. "
+ "i->filename == NULL.\n");
+
+ printf("GCOV: gcov_info_list at 0x%p\n", gcov_info_list);
+}
+
+void __gcov_merge_add(gcov_type *counters, unsigned int n_counters)
+{
+ (void)counters;
+ (void)n_counters;
+
+ return;
+}
+
+void __gcov_flush(void)
+{
+ return;
+}
+
+void __gcov_merge_single(gcov_type *counters, unsigned int n_counters)
+{
+ (void)counters;
+ (void)n_counters;
+
+ return;
+}
+
+void __gcov_merge_delta(gcov_type *counters, unsigned int n_counters)
+{
+ (void)counters;
+ (void)n_counters;
+
+ return;
+}
+
+void __gcov_merge_ior(gcov_type *counters, unsigned int n_counters)
+{
+ (void)counters;
+ (void)n_counters;
+ return;
+}
+
+void __gcov_merge_time_profile(gcov_type *counters, unsigned int n_counters)
+{
+ (void)counters;
+ (void)n_counters;
+}
+
+void __gcov_exit(void)
+{
+}
diff --git a/roms/skiboot/core/hmi.c b/roms/skiboot/core/hmi.c
new file mode 100644
index 000000000..9363cc5fb
--- /dev/null
+++ b/roms/skiboot/core/hmi.c
@@ -0,0 +1,1558 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Deal with Hypervisor Maintenance Interrupts
+ *
+ * Copyright 2013-2019 IBM Corp.
+ */
+
+#define pr_fmt(fmt) "HMI: " fmt
+
+#include <skiboot.h>
+#include <opal.h>
+#include <opal-msg.h>
+#include <processor.h>
+#include <chiptod.h>
+#include <xscom.h>
+#include <xscom-p8-regs.h>
+#include <xscom-p9-regs.h>
+#include <xscom-p10-regs.h>
+#include <pci.h>
+#include <cpu.h>
+#include <chip.h>
+#include <npu-regs.h>
+#include <npu2-regs.h>
+#include <npu2.h>
+#include <npu.h>
+#include <capp.h>
+#include <nvram.h>
+#include <cpu.h>
+
+/*
+ * P9 HMER register layout:
+ * +===+==========+============================+========+===================+
+ * |Bit|Name |Description |PowerKVM|Action |
+ * | | | |HMI | |
+ * | | | |enabled | |
+ * | | | |for this| |
+ * | | | |bit ? | |
+ * +===+==========+============================+========+===================+
+ * |0 |malfunctio|A processor core in the |Yes |Raise attn from |
+ * | |n_alert |system has checkstopped | |sapphire resulting |
+ * | | |(failed recovery) and has | |xstop |
+ * | | |requested a CP Sparing | | |
+ * | | |to occur. This is | | |
+ * | | |broadcasted to every | | |
+ * | | |processor in the system | | |
+ * |---+----------+----------------------------+--------+-------------------|
+ * |1 |Reserved |reserved |n/a | |
+ * |---+----------+----------------------------+--------+-------------------|
+ * |2 |proc_recv_|Processor recovery occurred |Yes |Log message and |
+ * | |done |error-bit in fir not masked | |continue working. |
+ * | | |(see bit 11) | | |
+ * |---+----------+----------------------------+--------+-------------------|
+ * |3 |proc_recv_|Processor went through |Yes |Log message and |
+ * | |error_mask|recovery for an error which | |continue working. |
+ * | |ed |is actually masked for | | |
+ * | | |reporting | | |
+ * |---+----------+----------------------------+--------+-------------------|
+ * |4 | |Timer facility experienced |Yes |Raise attn from |
+ * | |tfac_error|an error. | |sapphire resulting |
+ * | | |TB, DEC, HDEC, PURR or SPURR| |xstop |
+ * | | |may be corrupted (details in| | |
+ * | | |TFMR) | | |
+ * |---+----------+----------------------------+--------+-------------------|
+ * |5 | |TFMR SPR itself is |Yes |Raise attn from |
+ * | |tfmr_parit|corrupted. | |sapphire resulting |
+ * | |y_error |Entire timing facility may | |xstop |
+ * | | |be compromised. | | |
+ * |---+----------+----------------------------+--------+-------------------|
+ * |6 |ha_overflo| UPS (Uninterrupted Power |No |N/A |
+ * | |w_warning |System) Overflow indication | | |
+ * | | |indicating that the UPS | | |
+ * | | |DirtyAddrTable has | | |
+ * | | |reached a limit where it | | |
+ * | | |requires PHYP unload support| | |
+ * |---+----------+----------------------------+--------+-------------------|
+ * |7 |reserved |reserved |n/a |n/a |
+ * |---+----------+----------------------------+--------+-------------------|
+ * |8 |xscom_fail|An XSCOM operation caused by|No |We handle it by |
+ * | | |a cache inhibited load/store| |manually reading |
+ * | | |from this thread failed. A | |HMER register. |
+ * | | |trap register is | | |
+ * | | |available. | | |
+ * | | | | | |
+ * |---+----------+----------------------------+--------+-------------------|
+ * |9 |xscom_done|An XSCOM operation caused by|No |We handle it by |
+ * | | |a cache inhibited load/store| |manually reading |
+ * | | |from this thread completed. | |HMER register. |
+ * | | |If hypervisor | | |
+ * | | |intends to use this bit, it | | |
+ * | | |is responsible for clearing | | |
+ * | | |it before performing the | | |
+ * | | |xscom operation. | | |
+ * | | |NOTE: this bit should always| | |
+ * | | |be masked in HMEER | | |
+ * |---+----------+----------------------------+--------+-------------------|
+ * |10 |reserved |reserved |n/a |n/a |
+ * |---+----------+----------------------------+--------+-------------------|
+ * |11 |proc_recv_|Processor recovery occurred |y |Log message and |
+ * | |again |again before bit2 or bit3 | |continue working. |
+ * | | |was cleared | | |
+ * |---+----------+----------------------------+--------+-------------------|
+ * |12-|reserved |was temperature sensor |n/a |n/a |
+ * |15 | |passed the critical point on| | |
+ * | | |the way up | | |
+ * |---+----------+----------------------------+--------+-------------------|
+ * |16 | |SCOM has set a reserved FIR |No |n/a |
+ * | |scom_fir_h|bit to cause recovery | | |
+ * | |m | | | |
+ * |---+----------+----------------------------+--------+-------------------|
+ * |17 |trig_fir_h|Debug trigger has set a |No |n/a |
+ * | |mi |reserved FIR bit to cause | | |
+ * | | |recovery | | |
+ * |---+----------+----------------------------+--------+-------------------|
+ * |18 |reserved |reserved |n/a |n/a |
+ * |---+----------+----------------------------+--------+-------------------|
+ * |19 |reserved |reserved |n/a |n/a |
+ * |---+----------+----------------------------+--------+-------------------|
+ * |20 |hyp_resour|A hypervisor resource error |y |Raise attn from |
+ * | |ce_err |occurred: data parity error | |sapphire resulting |
+ * |   |          |on SPRC0:3, SPR_Modereg or  |        |xstop.             |
+ * | | |HMEER. | | |
+ * |   |          |Note: this bit will cause a |        |                   |
+ * | | |check_stop when (HV=1, PR=0 | | |
+ * | | |and EE=0) | | |
+ * |---+----------+----------------------------+--------+-------------------|
+ * |21-| |if bit 8 is active, the |No |We handle it by |
 * |23 |xscom_stat|reason will be detailed in  |        |manually reading   |
 * |   |us        |these bits. See chapter 11.1|        |HMER register.     |
 * |   |          |These bits are information  |        |                   |
+ * | | |only and always masked | | |
+ * | | |(mask = '0') | | |
+ * | | |If hypervisor intends to use| | |
+ * | | |this bit, it is responsible | | |
+ * | | |for clearing it before | | |
+ * | | |performing the xscom | | |
+ * | | |operation. | | |
+ * |---+----------+----------------------------+--------+-------------------|
+ * |24-|Not |Not implemented |n/a |n/a |
+ * |63 |implemente| | | |
+ * | |d | | | |
+ * +-- +----------+----------------------------+--------+-------------------+
+ *
+ * The above HMER bits can be enabled/disabled by modifying the
+ * SPR_HMEER_HMI_ENABLE_MASK #define in include/processor.h.
+ * If you modify support for any of the bits listed above, please make sure
+ * you change the above table to reflect that.
+ *
+ * NOTE: Per Dave Larson, never enable 8,9,21-23
+ */
+
+/*
+ * P10 HMER register layout:
+ * Bit Name Description
+ * 0 malfunction_alert A processor core in the system has checkstopped
+ * (failed recovery). This is broadcasted to every
+ * processor in the system
+ *
+ * 1 reserved reserved
+ *
+ * 2 proc_rcvy_done Processor recovery occurred error-bit in fir not
+ * masked (see bit 11)
+ *
+ * 3 reserved reserved
+ *
+ * 4 tfac_error Timer facility experienced an error. TB, DEC,
+ * HDEC, PURR or SPURR may be corrupted (details in
+ * TFMR)
+ *
+ * 5 tfx_error Error occurred on transfer from tfac shadow to
+ * core
+ *
+ * 6 spurr_scale_limit Nominal frequency exceeded 399 percent
+ *
+ * 7 reserved reserved
+ *
+ * 8 xscom_fail An XSCOM operation caused by a cache inhibited
+ * load/store from this thread failed. A trap
+ * register is available.
+ *
+ * 9 xscom_done An XSCOM operation caused by a cache inhibited
+ * load/store from this thread completed. If
+ * hypervisor intends to use this bit, it is
+ * responsible for clearing it before performing the
+ * xscom operation. NOTE: this bit should always be
+ * masked in HMEER
+ *
+ * 10 reserved reserved
+ *
+ * 11 proc_rcvy_again Processor recovery occurred again before bit 2
+ * was cleared
+ *
+ * 12-15 reserved reserved
+ *
+ * 16 scom_fir_hmi An error inject to PC FIR has occurred to set HMI.
+ * This error inject can also set FIR(61) to cause
+ * recovery.
+ *
+ * 17 reserved reserved
+ *
+ * 18 trig_fir_hmi Debug trigger has occurred to set HMI. This
+ * trigger can also set FIR(60) to cause recovery
+ *
+ * 19-20 reserved reserved
+ *
+ * 21-23 xscom_status If bit 8 is active, the reason will be detailed in
+ * these bits. These bits are information only and
+ * always masked (mask = ‘0’) If hypervisor intends
+ * to use this field, it is responsible for clearing
+ * it before performing the xscom operation.
+ *
+ * 24:63 Not implemented Not implemented.
+ *
+ * P10 HMEER enabled bits:
+ * Name Action
+ * malfunction_alert Decode and log FIR bits.
+ * proc_rcvy_done Log and continue.
+ * tfac_error Log and attempt to recover time facilities.
+ * tfx_error Log and attempt to recover time facilities.
+ * spurr_scale_limit Log and continue. XXX?
+ * proc_rcvy_again Log and continue.
+ */
+
+/* Used for tracking cpu threads inside hmi handling. */
+#define HMI_STATE_CLEANUP_DONE 0x100
+#define CORE_THREAD_MASK 0x0ff
+#define SUBCORE_THREAD_MASK(s_id, t_count) \
+ ((((1UL) << (t_count)) - 1) << ((s_id) * (t_count)))
+#define SINGLE_THREAD_MASK(t_id) ((1UL) << (t_id))
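+
+/*
+ * Illustrative examples of the masks above: SUBCORE_THREAD_MASK(1, 4)
+ * evaluates to ((1 << 4) - 1) << (1 * 4) = 0x0f0, i.e. the four threads of
+ * subcore 1, while SINGLE_THREAD_MASK(5) is 1 << 5 = 0x020.
+ */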
+
+/*
+ * Number of iterations for the various timeouts. We can't use the timebase
+ * as it might be broken. We measured experimentally that 40 million loops
+ * of cpu_relax() give us more than 1s. The margin is comfortable enough.
+ */
+#define TIMEOUT_LOOPS 40000000
+
+/* TFMR other errors. (other than bit 26 and 45) */
+#define SPR_TFMR_OTHER_ERRORS \
+ (SPR_TFMR_TBST_CORRUPT | SPR_TFMR_TB_MISSING_SYNC | \
+ SPR_TFMR_TB_MISSING_STEP | SPR_TFMR_FW_CONTROL_ERR | \
+ SPR_TFMR_PURR_PARITY_ERR | SPR_TFMR_SPURR_PARITY_ERR | \
+ SPR_TFMR_DEC_PARITY_ERR | SPR_TFMR_TFMR_CORRUPT | \
+ SPR_TFMR_CHIP_TOD_INTERRUPT)
+
+/* TFMR "all core" errors (sent to all threads) */
+#define SPR_TFMR_CORE_ERRORS \
+ (SPR_TFMR_TBST_CORRUPT | SPR_TFMR_TB_MISSING_SYNC | \
+ SPR_TFMR_TB_MISSING_STEP | SPR_TFMR_FW_CONTROL_ERR | \
+ SPR_TFMR_TFMR_CORRUPT | SPR_TFMR_TB_RESIDUE_ERR | \
+ SPR_TFMR_HDEC_PARITY_ERROR | SPR_TFMR_TFAC_XFER_ERROR)
+
+/* TFMR "thread" errors */
+#define SPR_TFMR_THREAD_ERRORS \
+ (SPR_TFMR_PURR_PARITY_ERR | SPR_TFMR_SPURR_PARITY_ERR | \
+ SPR_TFMR_DEC_PARITY_ERR)
+
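+/*
+ * The "thread" error group is handled locally by the reporting thread
+ * (see handle_thread_tfac_error() below), while the "core" group requires
+ * all threads of the core to rendez-vous before the shared timebase state
+ * is cleaned up (see handle_all_core_tfac_error() below).
+ */
+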
+/*
+ * Starting from P9, core inits are set up to escalate all core-local
+ * checkstops to system checkstops. Review this list when that changes.
+ */
+static const struct core_xstop_bit_info {
+ uint8_t bit; /* CORE FIR bit number */
+ enum OpalHMI_CoreXstopReason reason;
+} xstop_bits[] = {
+ { 3, CORE_CHECKSTOP_IFU_REGFILE },
+ { 5, CORE_CHECKSTOP_IFU_LOGIC },
+ { 8, CORE_CHECKSTOP_PC_DURING_RECOV },
+ { 10, CORE_CHECKSTOP_ISU_REGFILE },
+ { 12, CORE_CHECKSTOP_ISU_LOGIC },
+ { 21, CORE_CHECKSTOP_FXU_LOGIC },
+ { 25, CORE_CHECKSTOP_VSU_LOGIC },
+ { 26, CORE_CHECKSTOP_PC_RECOV_IN_MAINT_MODE },
+ { 32, CORE_CHECKSTOP_LSU_REGFILE },
+ { 36, CORE_CHECKSTOP_PC_FWD_PROGRESS },
+ { 38, CORE_CHECKSTOP_LSU_LOGIC },
+ { 45, CORE_CHECKSTOP_PC_LOGIC },
+ { 48, CORE_CHECKSTOP_PC_HYP_RESOURCE },
+ { 52, CORE_CHECKSTOP_PC_HANG_RECOV_FAILED },
+ { 54, CORE_CHECKSTOP_PC_AMBI_HANG_DETECTED },
+ { 63, CORE_CHECKSTOP_PC_SPRD_HYP_ERR_INJ },
+};
+
+struct core_fir_bit_info {
+ uint8_t bit; /* CORE FIR bit number */
+ const char *reason;
+};
+
+static const struct core_fir_bit_info p9_recoverable_bits[] = {
+ { 0, "IFU - SRAM (ICACHE parity, etc)" },
+ { 2, "IFU - RegFile" },
+ { 4, "IFU - Logic" },
+ { 9, "ISU - RegFile" },
+ { 11, "ISU - Logic" },
+ { 13, "ISU - Recoverable due to not in MT window" },
+ { 24, "VSU - Logic" },
+ { 27, "VSU - DFU logic" },
+ { 29, "LSU - SRAM (DCACHE parity, etc)" },
+ { 31, "LSU - RegFile" },
+ /* The following 3 bits may be set by SRAM errors. */
+ { 33, "LSU - TLB multi hit" },
+ { 34, "LSU - SLB multi hit" },
+ { 35, "LSU - ERAT multi hit" },
+ { 37, "LSU - Logic" },
+ { 39, "LSU - Recoverable due to not in MT window" },
+ { 43, "PC - Thread hang recovery" },
+};
+
+static const struct core_fir_bit_info p10_core_fir_bits[] = {
+ { 0, "IFU - SRAM recoverable error (ICACHE parity error, etc.)" },
+ { 1, "PC - TC checkstop" },
+ { 2, "IFU - RegFile recoverable error" },
+ { 3, "IFU - RegFile core checkstop" },
+ { 4, "IFU - Logic recoverable error" },
+ { 5, "IFU - Logic core checkstop" },
+ { 7, "VSU - Inference accumulator recoverable error" },
+ { 8, "PC - Recovery core checkstop" },
+ { 9, "VSU - Slice Target File (STF) recoverable error" },
+ { 11, "ISU - Logic recoverable error" },
+ { 12, "ISU - Logic core checkstop" },
+ { 14, "ISU - Machine check received while ME=0 checkstop" },
+ { 15, "ISU - UE from L2" },
+ { 16, "ISU - Number of UEs from L2 above threshold" },
+ { 17, "ISU - UE on CI load" },
+ { 18, "MMU - TLB recoverable error" },
+ { 19, "MMU - SLB error" },
+ { 21, "MMU - CXT recoverable error" },
+ { 22, "MMU - Logic core checkstop" },
+ { 23, "MMU - MMU system checkstop" },
+ { 24, "VSU - Logic recoverable error" },
+ { 25, "VSU - Logic core checkstop" },
+ { 26, "PC - In maint mode and recovery in progress" },
+ { 28, "PC - PC system checkstop" },
+ { 29, "LSU - SRAM recoverable error (DCACHE parity error, etc.)" },
+ { 30, "LSU - Set deleted" },
+ { 31, "LSU - RegFile recoverable error" },
+ { 32, "LSU - RegFile core checkstop" },
+ { 33, "MMU - TLB multi hit error occurred" },
+ { 34, "MMU - SLB multi hit error occurred" },
+ { 35, "LSU - ERAT multi hit error occurred" },
+ { 36, "PC - Forward progress error" },
+ { 37, "LSU - Logic recoverable error" },
+ { 38, "LSU - Logic core checkstop" },
+ { 41, "LSU - System checkstop" },
+ { 43, "PC - Thread hang recoverable error" },
+ { 45, "PC - Logic core checkstop" },
+ { 47, "PC - TimeBase facility checkstop" },
+ { 52, "PC - Hang recovery failed core checkstop" },
+ { 53, "PC - Core internal hang detected" },
+ { 55, "PC - Nest hang detected" },
+ { 56, "PC - Other core chiplet recoverable error" },
+ { 57, "PC - Other core chiplet core checkstop" },
+ { 58, "PC - Other core chiplet system checkstop" },
+ { 59, "PC - SCOM satellite error detected" },
+ { 60, "PC - Debug trigger error inject" },
+ { 61, "PC - SCOM or firmware recoverable error inject" },
+ { 62, "PC - Firmware checkstop error inject" },
+ { 63, "PC - Firmware SPRC / SPRD checkstop" },
+};
+
+static const struct nx_xstop_bit_info {
+ uint8_t bit; /* NX FIR bit number */
+ enum OpalHMI_NestAccelXstopReason reason;
+} nx_dma_xstop_bits[] = {
+ { 1, NX_CHECKSTOP_SHM_INVAL_STATE_ERR },
+ { 15, NX_CHECKSTOP_DMA_INVAL_STATE_ERR_1 },
+ { 16, NX_CHECKSTOP_DMA_INVAL_STATE_ERR_2 },
+ { 20, NX_CHECKSTOP_DMA_CH0_INVAL_STATE_ERR },
+ { 21, NX_CHECKSTOP_DMA_CH1_INVAL_STATE_ERR },
+ { 22, NX_CHECKSTOP_DMA_CH2_INVAL_STATE_ERR },
+ { 23, NX_CHECKSTOP_DMA_CH3_INVAL_STATE_ERR },
+ { 24, NX_CHECKSTOP_DMA_CH4_INVAL_STATE_ERR },
+ { 25, NX_CHECKSTOP_DMA_CH5_INVAL_STATE_ERR },
+ { 26, NX_CHECKSTOP_DMA_CH6_INVAL_STATE_ERR },
+ { 27, NX_CHECKSTOP_DMA_CH7_INVAL_STATE_ERR },
+ { 31, NX_CHECKSTOP_DMA_CRB_UE },
+ { 32, NX_CHECKSTOP_DMA_CRB_SUE },
+};
+
+static const struct nx_xstop_bit_info nx_pbi_xstop_bits[] = {
+ { 12, NX_CHECKSTOP_PBI_ISN_UE },
+};
+
+static struct lock hmi_lock = LOCK_UNLOCKED;
+static uint32_t malf_alert_scom;
+static uint32_t nx_status_reg;
+static uint32_t nx_dma_engine_fir;
+static uint32_t nx_pbi_fir;
+
+static int setup_scom_addresses(void)
+{
+ switch (proc_gen) {
+ case proc_gen_p8:
+ malf_alert_scom = P8_MALFUNC_ALERT;
+ nx_status_reg = P8_NX_STATUS_REG;
+ nx_dma_engine_fir = P8_NX_DMA_ENGINE_FIR;
+ nx_pbi_fir = P8_NX_PBI_FIR;
+ return 1;
+ case proc_gen_p9:
+ malf_alert_scom = P9_MALFUNC_ALERT;
+ nx_status_reg = P9_NX_STATUS_REG;
+ nx_dma_engine_fir = P9_NX_DMA_ENGINE_FIR;
+ nx_pbi_fir = P9_NX_PBI_FIR;
+ return 1;
+ case proc_gen_p10:
+ malf_alert_scom = P10_MALFUNC_ALERT;
+ nx_status_reg = P10_NX_STATUS_REG;
+ nx_dma_engine_fir = P10_NX_DMA_ENGINE_FIR;
+ nx_pbi_fir = P10_NX_PBI_FIR;
+ return 1;
+ default:
+ prerror("%s: Unknown CPU type\n", __func__);
+ break;
+ }
+ return 0;
+}
+
+static int queue_hmi_event(struct OpalHMIEvent *hmi_evt, int recover, uint64_t *out_flags)
+{
+ size_t size;
+
+ /* Don't queue up event if recover == -1 */
+ if (recover == -1)
+ return 0;
+
+ /* set disposition */
+ if (recover == 1)
+ hmi_evt->disposition = OpalHMI_DISPOSITION_RECOVERED;
+ else if (recover == 0)
+ hmi_evt->disposition = OpalHMI_DISPOSITION_NOT_RECOVERED;
+
+ /*
+	 * V2 of struct OpalHMIEvent is (5 * 64 bits) in size and well packed.
+	 * Hence we use a uint64_t pointer to pass the entire structure as
+	 * 5 params in the generic message format. Instead of hard coding
+	 * num_params, divide the struct size by 8 bytes to get the exact
+	 * num_params value.
+ */
+ size = ALIGN_UP(sizeof(*hmi_evt), sizeof(u64));
+
+ *out_flags |= OPAL_HMI_FLAGS_NEW_EVENT;
+
+ /* queue up for delivery to host. */
+ return _opal_queue_msg(OPAL_MSG_HMI_EVT, NULL, NULL,
+ size, hmi_evt);
+}
+
+static int read_core_fir(uint32_t chip_id, uint32_t core_id, uint64_t *core_fir)
+{
+ int rc;
+
+ switch (proc_gen) {
+ case proc_gen_p8:
+ rc = xscom_read(chip_id,
+ XSCOM_ADDR_P8_EX(core_id, P8_CORE_FIR), core_fir);
+ break;
+ case proc_gen_p9:
+ rc = xscom_read(chip_id,
+ XSCOM_ADDR_P9_EC(core_id, P9_CORE_FIR), core_fir);
+ break;
+ case proc_gen_p10:
+ rc = xscom_read(chip_id,
+ XSCOM_ADDR_P10_EC(core_id, P10_CORE_FIR), core_fir);
+ break;
+ default:
+ rc = OPAL_HARDWARE;
+ }
+ return rc;
+}
+
+static int read_core_wof(uint32_t chip_id, uint32_t core_id, uint64_t *core_wof)
+{
+ int rc;
+
+ switch (proc_gen) {
+ case proc_gen_p9:
+ rc = xscom_read(chip_id,
+ XSCOM_ADDR_P9_EC(core_id, P9_CORE_WOF), core_wof);
+ break;
+ case proc_gen_p10:
+ rc = xscom_read(chip_id,
+ XSCOM_ADDR_P10_EC(core_id, P10_CORE_WOF), core_wof);
+ break;
+ default:
+ rc = OPAL_HARDWARE;
+ }
+ return rc;
+}
+
+static bool decode_core_fir(struct cpu_thread *cpu,
+ struct OpalHMIEvent *hmi_evt)
+{
+ uint64_t core_fir;
+ uint32_t core_id;
+ int i, swkup_rc;
+ bool found = false;
+ int64_t ret;
+ const char *loc;
+
+ /* Sanity check */
+ if (!cpu || !hmi_evt)
+ return false;
+
+ core_id = pir_to_core_id(cpu->pir);
+
+	/* Force the core to wake up, otherwise reading core_fir is unreliable
+ * if stop-state 5 is enabled.
+ */
+ swkup_rc = dctl_set_special_wakeup(cpu);
+
+ /* Get CORE FIR register value. */
+ ret = read_core_fir(cpu->chip_id, core_id, &core_fir);
+
+ if (!swkup_rc)
+ dctl_clear_special_wakeup(cpu);
+
+
+ if (ret == OPAL_WRONG_STATE) {
+ /*
+ * CPU is asleep, so it probably didn't cause the checkstop.
+ * If no other HMI cause is found a "catchall" checkstop
+ * will be raised, so if this CPU should've been awake the
+ * error will be handled appropriately.
+ */
+ prlog(PR_DEBUG,
+ "FIR read failed, chip %d core %d asleep\n",
+ cpu->chip_id, core_id);
+ return false;
+ } else if (ret != OPAL_SUCCESS) {
+ prerror("XSCOM error reading CORE FIR\n");
+ /* If the FIR can't be read, we should checkstop. */
+ return true;
+ }
+
+ if (!core_fir)
+ return false;
+
+ loc = chip_loc_code(cpu->chip_id);
+ prlog(PR_INFO, "[Loc: %s]: CHIP ID: %x, CORE ID: %x, FIR: %016llx\n",
+ loc ? loc : "Not Available",
+ cpu->chip_id, core_id, core_fir);
+
+ if (proc_gen == proc_gen_p10) {
+ for (i = 0; i < ARRAY_SIZE(p10_core_fir_bits); i++) {
+ if (core_fir & PPC_BIT(p10_core_fir_bits[i].bit))
+ prlog(PR_INFO, " %s\n", p10_core_fir_bits[i].reason);
+ }
+ }
+
+ /* Check CORE FIR bits and populate HMI event with error info. */
+ for (i = 0; i < ARRAY_SIZE(xstop_bits); i++) {
+ if (core_fir & PPC_BIT(xstop_bits[i].bit)) {
+ found = true;
+ hmi_evt->u.xstop_error.xstop_reason
+ |= cpu_to_be32(xstop_bits[i].reason);
+ }
+ }
+ return found;
+}
+
+static void find_core_checkstop_reason(struct OpalHMIEvent *hmi_evt,
+ uint64_t *out_flags)
+{
+ struct cpu_thread *cpu;
+
+ /* Initialize HMI event */
+ hmi_evt->severity = OpalHMI_SEV_FATAL;
+ hmi_evt->type = OpalHMI_ERROR_MALFUNC_ALERT;
+ hmi_evt->u.xstop_error.xstop_type = CHECKSTOP_TYPE_CORE;
+
+ /*
+ * Check CORE FIRs and find the reason for core checkstop.
+ * Send a separate HMI event for each core that has checkstopped.
+ */
+ for_each_cpu(cpu) {
+ /* GARDed CPUs are marked unavailable. Skip them. */
+ if (cpu->state == cpu_state_unavailable)
+ continue;
+
+ /* Only check on primaries (ie. core), not threads */
+ if (cpu->is_secondary)
+ continue;
+
+ /* Initialize xstop_error fields. */
+ hmi_evt->u.xstop_error.xstop_reason = 0;
+ hmi_evt->u.xstop_error.u.pir = cpu_to_be32(cpu->pir);
+
+ if (decode_core_fir(cpu, hmi_evt))
+ queue_hmi_event(hmi_evt, 0, out_flags);
+ }
+}
+
+static void find_capp_checkstop_reason(int flat_chip_id,
+ struct OpalHMIEvent *hmi_evt,
+ uint64_t *out_flags)
+{
+ struct capp_info info;
+ struct phb *phb;
+ uint64_t capp_fir;
+ uint64_t capp_fir_mask;
+ uint64_t capp_fir_action0;
+ uint64_t capp_fir_action1;
+ uint64_t reg;
+ int64_t rc;
+
+ /* CAPP exists on P8 and P9 only */
+ if (proc_gen != proc_gen_p8 && proc_gen != proc_gen_p9)
+ return;
+
+ /* Find the CAPP on the chip associated with the HMI. */
+ for_each_phb(phb) {
+ /* get the CAPP info */
+ rc = capp_get_info(flat_chip_id, phb, &info);
+ if (rc == OPAL_PARAMETER)
+ continue;
+
+ if (xscom_read(flat_chip_id, info.capp_fir_reg, &capp_fir) ||
+ xscom_read(flat_chip_id, info.capp_fir_mask_reg,
+ &capp_fir_mask) ||
+ xscom_read(flat_chip_id, info.capp_fir_action0_reg,
+ &capp_fir_action0) ||
+ xscom_read(flat_chip_id, info.capp_fir_action1_reg,
+ &capp_fir_action1)) {
+ prerror("CAPP: Couldn't read CAPP#%d (PHB:#%x) FIR registers by XSCOM!\n",
+ info.capp_index, info.phb_index);
+ continue;
+ }
+
+ if (!(capp_fir & ~capp_fir_mask))
+ continue;
+
+ prlog(PR_DEBUG, "CAPP#%d (PHB:#%x): FIR 0x%016llx mask 0x%016llx\n",
+ info.capp_index, info.phb_index, capp_fir,
+ capp_fir_mask);
+ prlog(PR_DEBUG, "CAPP#%d (PHB:#%x): ACTION0 0x%016llx, ACTION1 0x%016llx\n",
+ info.capp_index, info.phb_index, capp_fir_action0,
+ capp_fir_action1);
+
+ /*
+ * If this bit is set (=1) a Recoverable Error has been
+ * detected
+ */
+ xscom_read(flat_chip_id, info.capp_err_status_ctrl_reg, &reg);
+ if ((reg & PPC_BIT(0)) != 0) {
+ phb_lock(phb);
+ phb->ops->set_capp_recovery(phb);
+ phb_unlock(phb);
+
+ hmi_evt->severity = OpalHMI_SEV_NO_ERROR;
+ hmi_evt->type = OpalHMI_ERROR_CAPP_RECOVERY;
+ queue_hmi_event(hmi_evt, 1, out_flags);
+
+ return;
+ }
+ }
+}
+
+static void find_nx_checkstop_reason(int flat_chip_id,
+ struct OpalHMIEvent *hmi_evt,
+ uint64_t *out_flags)
+{
+ uint64_t nx_status;
+ uint64_t nx_dma_fir;
+ uint64_t nx_pbi_fir_val;
+ int i;
+
+ /* Get NX status register value. */
+ if (xscom_read(flat_chip_id, nx_status_reg, &nx_status) != 0) {
+ prerror("XSCOM error reading NX_STATUS_REG\n");
+ return;
+ }
+
+ /* Check if NX has driven an HMI interrupt. */
+ if (!(nx_status & NX_HMI_ACTIVE))
+ return;
+
+ /* Initialize HMI event */
+ hmi_evt->severity = OpalHMI_SEV_FATAL;
+ hmi_evt->type = OpalHMI_ERROR_MALFUNC_ALERT;
+ hmi_evt->u.xstop_error.xstop_type = CHECKSTOP_TYPE_NX;
+ hmi_evt->u.xstop_error.u.chip_id = cpu_to_be32(flat_chip_id);
+
+ /* Get DMA & Engine FIR data register value. */
+ if (xscom_read(flat_chip_id, nx_dma_engine_fir, &nx_dma_fir) != 0) {
+ prerror("XSCOM error reading NX_DMA_ENGINE_FIR\n");
+ return;
+ }
+
+ /* Get PowerBus Interface FIR data register value. */
+ if (xscom_read(flat_chip_id, nx_pbi_fir, &nx_pbi_fir_val) != 0) {
+ prerror("XSCOM error reading NX_PBI_FIR\n");
+ return;
+ }
+
+ /* Find NX checkstop reason and populate HMI event with error info. */
+ for (i = 0; i < ARRAY_SIZE(nx_dma_xstop_bits); i++)
+ if (nx_dma_fir & PPC_BIT(nx_dma_xstop_bits[i].bit))
+ hmi_evt->u.xstop_error.xstop_reason
+ |= cpu_to_be32(nx_dma_xstop_bits[i].reason);
+
+ for (i = 0; i < ARRAY_SIZE(nx_pbi_xstop_bits); i++)
+ if (nx_pbi_fir_val & PPC_BIT(nx_pbi_xstop_bits[i].bit))
+ hmi_evt->u.xstop_error.xstop_reason
+ |= cpu_to_be32(nx_pbi_xstop_bits[i].reason);
+
+ /*
+ * Set NXDMAENGFIR[38] to signal PRD that service action is required.
+ * Without this inject, PRD will not be able to do NX unit checkstop
+ * error analysis. NXDMAENGFIR[38] is a spare bit and used to report
+ * a software initiated attention.
+ *
+	 * The behavior of this bit and all other FIR bits is documented in
+	 * the RAS spreadsheet.
+ */
+ xscom_write(flat_chip_id, nx_dma_engine_fir, PPC_BIT(38));
+
+ /* Send an HMI event. */
+ queue_hmi_event(hmi_evt, 0, out_flags);
+}
+
+static bool phb_is_npu2(struct dt_node *dn)
+{
+ return (dt_node_is_compatible(dn, "ibm,power9-npu-pciex") ||
+ dt_node_is_compatible(dn, "ibm,power9-npu-opencapi-pciex"));
+}
+
+static void add_npu2_xstop_reason(uint32_t *xstop_reason, uint8_t reason)
+{
+ int i, reason_count;
+ uint8_t *ptr;
+
+ reason_count = sizeof(*xstop_reason) / sizeof(reason);
+ ptr = (uint8_t *) xstop_reason;
+ for (i = 0; i < reason_count; i++) {
+ if (*ptr == 0) {
+ *ptr = reason;
+ break;
+ }
+ ptr++;
+ }
+}
+
+static void encode_npu2_xstop_reason(uint32_t *xstop_reason,
+ uint64_t fir, int fir_number)
+{
+ int bit;
+ uint8_t reason;
+
+ /*
+ * There are three 64-bit FIRs but the xstop reason field of
+ * the hmi event is only 32-bit. Encode which FIR bit is set as:
+ * - 2 bits for the FIR number
+ * - 6 bits for the bit number (0 -> 63)
+ *
+ * So we could even encode up to 4 reasons for the HMI, if
+ * that can ever happen
+ */
+ while (fir) {
+ bit = ilog2(fir);
+ reason = fir_number << 6;
+ reason |= (63 - bit); // IBM numbering
+ add_npu2_xstop_reason(xstop_reason, reason);
+ fir ^= 1ULL << bit;
+ }
+}
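+
+/*
+ * Worked example (illustrative): if FIR#1 has IBM bit 3 set, ilog2() of
+ * PPC_BIT(3) returns 60 and the encoded byte is (1 << 6) | (63 - 60) = 0x43,
+ * i.e. "FIR 1, bit 3". add_npu2_xstop_reason() packs up to four such bytes
+ * into the 32-bit xstop_reason.
+ */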
+
+static void find_npu2_checkstop_reason(int flat_chip_id,
+ struct OpalHMIEvent *hmi_evt,
+ uint64_t *out_flags)
+{
+ struct phb *phb;
+ int i;
+ bool npu2_hmi_verbose = false, found = false;
+ uint64_t npu2_fir;
+ uint64_t npu2_fir_mask;
+ uint64_t npu2_fir_action0;
+ uint64_t npu2_fir_action1;
+ uint64_t npu2_fir_addr;
+ uint64_t npu2_fir_mask_addr;
+ uint64_t npu2_fir_action0_addr;
+ uint64_t npu2_fir_action1_addr;
+ uint64_t fatal_errors;
+ uint32_t xstop_reason = 0;
+ int total_errors = 0;
+ const char *loc;
+
+ /* NPU2 only */
+ if (PVR_TYPE(mfspr(SPR_PVR)) != PVR_TYPE_P9)
+ return;
+
+ /* Find the NPU on the chip associated with the HMI. */
+ for_each_phb(phb) {
+ /* NOTE: if a chip ever has >1 NPU this will need adjusting */
+ if (phb_is_npu2(phb->dt_node) &&
+ (dt_get_chip_id(phb->dt_node) == flat_chip_id)) {
+ found = true;
+ break;
+ }
+ }
+
+ /* If we didn't find a NPU on the chip, it's not our checkstop. */
+ if (!found)
+ return;
+
+ npu2_fir_addr = NPU2_FIR_REGISTER_0;
+ npu2_fir_mask_addr = NPU2_FIR_REGISTER_0 + NPU2_FIR_MASK_OFFSET;
+ npu2_fir_action0_addr = NPU2_FIR_REGISTER_0 + NPU2_FIR_ACTION0_OFFSET;
+ npu2_fir_action1_addr = NPU2_FIR_REGISTER_0 + NPU2_FIR_ACTION1_OFFSET;
+
+ for (i = 0; i < NPU2_TOTAL_FIR_REGISTERS; i++) {
+ /* Read all the registers necessary to find a checkstop condition. */
+ if (xscom_read(flat_chip_id, npu2_fir_addr, &npu2_fir) ||
+ xscom_read(flat_chip_id, npu2_fir_mask_addr, &npu2_fir_mask) ||
+ xscom_read(flat_chip_id, npu2_fir_action0_addr, &npu2_fir_action0) ||
+ xscom_read(flat_chip_id, npu2_fir_action1_addr, &npu2_fir_action1)) {
+ prerror("HMI: Couldn't read NPU FIR register%d with XSCOM\n", i);
+ continue;
+ }
+
+ fatal_errors = npu2_fir & ~npu2_fir_mask & npu2_fir_action0 & npu2_fir_action1;
+
+ if (fatal_errors) {
+ loc = chip_loc_code(flat_chip_id);
+ if (!loc)
+ loc = "Not Available";
+ prlog(PR_ERR, "NPU: [Loc: %s] P:%d FIR#%d FIR 0x%016llx mask 0x%016llx\n",
+ loc, flat_chip_id, i, npu2_fir, npu2_fir_mask);
+ prlog(PR_ERR, "NPU: [Loc: %s] P:%d ACTION0 0x%016llx, ACTION1 0x%016llx\n",
+ loc, flat_chip_id, npu2_fir_action0, npu2_fir_action1);
+ total_errors++;
+
+ encode_npu2_xstop_reason(&xstop_reason, fatal_errors, i);
+ }
+
+ /* Can't do a fence yet, we are just logging fir information for now */
+ npu2_fir_addr += NPU2_FIR_OFFSET;
+ npu2_fir_mask_addr += NPU2_FIR_OFFSET;
+ npu2_fir_action0_addr += NPU2_FIR_OFFSET;
+ npu2_fir_action1_addr += NPU2_FIR_OFFSET;
+
+ }
+
+ if (!total_errors)
+ return;
+
+ npu2_hmi_verbose = nvram_query_eq_safe("npu2-hmi-verbose", "true");
+ /* Force this for now until we sort out something better */
+ npu2_hmi_verbose = true;
+
+ if (npu2_hmi_verbose) {
+ npu2_dump_scoms(flat_chip_id);
+ prlog(PR_ERR, " _________________________ \n");
+ prlog(PR_ERR, "< It's Debug time! >\n");
+ prlog(PR_ERR, " ------------------------- \n");
+ prlog(PR_ERR, " \\ ,__, \n");
+ prlog(PR_ERR, " \\ (oo)____ \n");
+ prlog(PR_ERR, " (__) )\\ \n");
+ prlog(PR_ERR, " ||--|| * \n");
+ }
+
+ /* Set up the HMI event */
+ hmi_evt->severity = OpalHMI_SEV_WARNING;
+ hmi_evt->type = OpalHMI_ERROR_MALFUNC_ALERT;
+ hmi_evt->u.xstop_error.xstop_type = CHECKSTOP_TYPE_NPU;
+ hmi_evt->u.xstop_error.xstop_reason = cpu_to_be32(xstop_reason);
+ hmi_evt->u.xstop_error.u.chip_id = cpu_to_be32(flat_chip_id);
+
+ /* Marking the event as recoverable so that we don't crash */
+ queue_hmi_event(hmi_evt, 1, out_flags);
+}
+
+static void find_npu_checkstop_reason(int flat_chip_id,
+ struct OpalHMIEvent *hmi_evt,
+ uint64_t *out_flags)
+{
+ struct phb *phb;
+ struct npu *p = NULL;
+
+ uint64_t npu_fir;
+ uint64_t npu_fir_mask;
+ uint64_t npu_fir_action0;
+ uint64_t npu_fir_action1;
+ uint64_t fatal_errors;
+
+ /* Only check for NPU errors if the chip has a NPU */
+ if (PVR_TYPE(mfspr(SPR_PVR)) != PVR_TYPE_P8NVL)
+ return find_npu2_checkstop_reason(flat_chip_id, hmi_evt, out_flags);
+
+ /* Find the NPU on the chip associated with the HMI. */
+ for_each_phb(phb) {
+ /* NOTE: if a chip ever has >1 NPU this will need adjusting */
+ if (dt_node_is_compatible(phb->dt_node, "ibm,power8-npu-pciex") &&
+ (dt_get_chip_id(phb->dt_node) == flat_chip_id)) {
+ p = phb_to_npu(phb);
+ break;
+ }
+ }
+
+ /* If we didn't find a NPU on the chip, it's not our checkstop. */
+ if (p == NULL)
+ return;
+
+ /* Read all the registers necessary to find a checkstop condition. */
+ if (xscom_read(flat_chip_id,
+ p->at_xscom + NX_FIR, &npu_fir) ||
+ xscom_read(flat_chip_id,
+ p->at_xscom + NX_FIR_MASK, &npu_fir_mask) ||
+ xscom_read(flat_chip_id,
+ p->at_xscom + NX_FIR_ACTION0, &npu_fir_action0) ||
+ xscom_read(flat_chip_id,
+ p->at_xscom + NX_FIR_ACTION1, &npu_fir_action1)) {
+ prerror("Couldn't read NPU registers with XSCOM\n");
+ return;
+ }
+
+ fatal_errors = npu_fir & ~npu_fir_mask & npu_fir_action0 & npu_fir_action1;
+
+ /* If there's no errors, we don't need to do anything. */
+ if (!fatal_errors)
+ return;
+
+ prlog(PR_DEBUG, "NPU: FIR 0x%016llx mask 0x%016llx\n",
+ npu_fir, npu_fir_mask);
+ prlog(PR_DEBUG, "NPU: ACTION0 0x%016llx, ACTION1 0x%016llx\n",
+ npu_fir_action0, npu_fir_action1);
+
+ /* Set the NPU to fenced since it can't recover. */
+ npu_set_fence_state(p, true);
+
+ /* Set up the HMI event */
+ hmi_evt->severity = OpalHMI_SEV_WARNING;
+ hmi_evt->type = OpalHMI_ERROR_MALFUNC_ALERT;
+ hmi_evt->u.xstop_error.xstop_type = CHECKSTOP_TYPE_NPU;
+ hmi_evt->u.xstop_error.u.chip_id = cpu_to_be32(flat_chip_id);
+
+ /* The HMI is "recoverable" because it shouldn't crash the system */
+ queue_hmi_event(hmi_evt, 1, out_flags);
+}
+
+static void decode_malfunction(struct OpalHMIEvent *hmi_evt, uint64_t *out_flags)
+{
+ int i;
+ uint64_t malf_alert, flags;
+
+ flags = 0;
+
+ if (!setup_scom_addresses()) {
+ prerror("Failed to setup scom addresses\n");
+ /* Send an unknown HMI event. */
+ hmi_evt->u.xstop_error.xstop_type = CHECKSTOP_TYPE_UNKNOWN;
+ hmi_evt->u.xstop_error.xstop_reason = 0;
+ queue_hmi_event(hmi_evt, false, out_flags);
+ return;
+ }
+
+ xscom_read(this_cpu()->chip_id, malf_alert_scom, &malf_alert);
+
+ if (!malf_alert)
+ return;
+
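+	/*
+	 * Each set bit in the malfunction alert SCOM identifies the flat
+	 * chip id that raised the alert: clear it, then scan that chip's
+	 * CAPP, NX and NPU FIRs for the checkstop reason.
+	 */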
+ for (i = 0; i < 64; i++) {
+ if (malf_alert & PPC_BIT(i)) {
+ xscom_write(this_cpu()->chip_id, malf_alert_scom,
+ ~PPC_BIT(i));
+ find_capp_checkstop_reason(i, hmi_evt, &flags);
+ find_nx_checkstop_reason(i, hmi_evt, &flags);
+ find_npu_checkstop_reason(i, hmi_evt, &flags);
+ }
+ }
+
+ find_core_checkstop_reason(hmi_evt, &flags);
+
+ /*
+ * If we fail to find checkstop reason, send an unknown HMI event.
+ */
+ if (!(flags & OPAL_HMI_FLAGS_NEW_EVENT)) {
+ hmi_evt->u.xstop_error.xstop_type = CHECKSTOP_TYPE_UNKNOWN;
+ hmi_evt->u.xstop_error.xstop_reason = 0;
+ queue_hmi_event(hmi_evt, false, &flags);
+ }
+ *out_flags |= flags;
+}
+
+/*
+ * This will "rendez-vous" all threads on the core to the rendez-vous
+ * id "sig". You need to make sure that "sig" is different from the
+ * previous rendez vous. The sig value must be between 0 and 7 with
+ * boot time being set to 0.
+ *
+ * Note: in theory, we could just use a flip flop "sig" in the thread
+ * structure (binary rendez-vous with no argument). This is a bit more
+ * debuggable and better at handling timeouts (arguably).
+ *
+ * This should be called with no lock held.
+ */
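+
+/*
+ * Per-thread state layout in *core_hmi_state_ptr (as used below): each
+ * thread owns a 4-bit nibble at (thread_index * 4); the low 3 bits hold
+ * the current rendez-vous signature and bit 3 is the "exit" flag set in
+ * the second stage.
+ */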
+static void hmi_rendez_vous(uint32_t sig)
+{
+ struct cpu_thread *t = this_cpu();
+ uint32_t my_id = cpu_get_thread_index(t);
+ uint32_t my_shift = my_id << 2;
+ uint32_t *sptr = t->core_hmi_state_ptr;
+ uint32_t val, prev, shift, i;
+ uint64_t timeout;
+
+ assert(sig <= 0x7);
+
+ /*
+ * Mark ourselves as having reached the rendez vous point with
+ * the exit bit cleared
+ */
+ do {
+ val = prev = *sptr;
+ val &= ~(0xfu << my_shift);
+ val |= sig << my_shift;
+ } while (cmpxchg32(sptr, prev, val) != prev);
+
+ /*
+ * Wait for everybody else to reach that point, ignore the
+ * exit bit as another thread could have already set it.
+ */
+ for (i = 0; i < cpu_thread_count; i++) {
+ shift = i << 2;
+
+ timeout = TIMEOUT_LOOPS;
+ while (((*sptr >> shift) & 0x7) != sig && --timeout)
+ cpu_relax();
+ if (!timeout)
+ prlog(PR_ERR, "Rendez-vous stage 1 timeout, CPU 0x%x"
+ " waiting for thread %d (sptr=%08x)\n",
+ t->pir, i, *sptr);
+ }
+
+ /* Set the exit bit */
+ do {
+ val = prev = *sptr;
+ val &= ~(0xfu << my_shift);
+ val |= (sig | 8) << my_shift;
+ } while (cmpxchg32(sptr, prev, val) != prev);
+
+ /* At this point, we need to wait for everybody else to have a value
+ * that is *not* sig. IE. they either have set the exit bit *or* they
+ * have changed the rendez-vous (meaning they have moved on to another
+ * rendez vous point).
+ */
+ for (i = 0; i < cpu_thread_count; i++) {
+ shift = i << 2;
+
+ timeout = TIMEOUT_LOOPS;
+ while (((*sptr >> shift) & 0xf) == sig && --timeout)
+ cpu_relax();
+ if (!timeout)
+ prlog(PR_ERR, "Rendez-vous stage 2 timeout, CPU 0x%x"
+ " waiting for thread %d (sptr=%08x)\n",
+ t->pir, i, *sptr);
+ }
+}
+
+static void hmi_print_debug(const uint8_t *msg, uint64_t hmer)
+{
+ const char *loc;
+ uint32_t core_id, thread_index;
+
+ core_id = pir_to_core_id(this_cpu()->pir);
+ thread_index = cpu_get_thread_index(this_cpu());
+
+ loc = chip_loc_code(this_cpu()->chip_id);
+ if (!loc)
+ loc = "Not Available";
+
+ /* Also covers P10 SPR_HMER_TFAC_SHADOW_XFER_ERROR */
+ if (hmer & (SPR_HMER_TFAC_ERROR | SPR_HMER_TFMR_PARITY_ERROR)) {
+ prlog(PR_DEBUG, "[Loc: %s]: P:%d C:%d T:%d: TFMR(%016lx) %s\n",
+ loc, this_cpu()->chip_id, core_id, thread_index,
+ mfspr(SPR_TFMR), msg);
+ } else {
+ prlog(PR_DEBUG, "[Loc: %s]: P:%d C:%d T:%d: %s\n",
+ loc, this_cpu()->chip_id, core_id, thread_index,
+ msg);
+ }
+}
+
+static int handle_thread_tfac_error(uint64_t tfmr, uint64_t *out_flags)
+{
+ int recover = 1;
+
+ if (tfmr & SPR_TFMR_DEC_PARITY_ERR)
+ *out_flags |= OPAL_HMI_FLAGS_DEC_LOST;
+ if (!tfmr_recover_local_errors(tfmr))
+ recover = 0;
+ tfmr &= ~(SPR_TFMR_PURR_PARITY_ERR |
+ SPR_TFMR_SPURR_PARITY_ERR |
+ SPR_TFMR_DEC_PARITY_ERR);
+ return recover;
+}
+
+static int64_t opal_handle_hmi(void);
+
+static void opal_handle_hmi_job(void *data __unused)
+{
+ opal_handle_hmi();
+}
+
+/*
+ * Queue HMI handling jobs if secondaries are still in OPAL.
+ * This function is called by thread 0.
+ */
+static struct cpu_job **hmi_kick_secondaries(void)
+{
+ struct cpu_thread *ts = this_cpu();
+ struct cpu_job **hmi_jobs = NULL;
+ int job_sz = sizeof(struct cpu_job *) * cpu_thread_count;
+ int i;
+
+ for (i = 1; i < cpu_thread_count; i++) {
+ ts = next_cpu(ts);
+
+ /* Is this thread still in OPAL ? */
+ if (ts->state == cpu_state_active) {
+ if (!hmi_jobs) {
+ hmi_jobs = zalloc(job_sz);
+ assert(hmi_jobs);
+ }
+
+ prlog(PR_DEBUG, "Sending hmi job to thread %d\n", i);
+ hmi_jobs[i] = cpu_queue_job(ts, "handle_hmi_job",
+ opal_handle_hmi_job, NULL);
+ }
+ }
+ return hmi_jobs;
+}
+
+static int handle_all_core_tfac_error(uint64_t tfmr, uint64_t *out_flags)
+{
+ struct cpu_thread *t, *t0;
+ int recover = -1;
+ struct cpu_job **hmi_jobs = NULL;
+
+ t = this_cpu();
+ t0 = find_cpu_by_pir(cpu_get_thread0(t));
+
+ if (t == t0 && t0->state == cpu_state_os)
+ hmi_jobs = hmi_kick_secondaries();
+
+ /* Rendez vous all threads */
+ hmi_rendez_vous(1);
+
+ /* We use a lock here as some of the TFMR bits are shared and I
+	 * prefer to avoid doing the cleanup simultaneously.
+ */
+ lock(&hmi_lock);
+
+ /* First handle corrupt TFMR otherwise we can't trust anything.
+ * We'll use a lock here so that the threads don't try to do it at
+ * the same time
+ */
+ if (tfmr & SPR_TFMR_TFMR_CORRUPT) {
+ /* Check if it's still in error state */
+ if (mfspr(SPR_TFMR) & SPR_TFMR_TFMR_CORRUPT)
+ if (!recover_corrupt_tfmr()) {
+ unlock(&hmi_lock);
+ recover = 0;
+ goto error_out;
+ }
+
+ tfmr = mfspr(SPR_TFMR);
+
+ /* We could have got new thread errors in the meantime */
+ if (tfmr & SPR_TFMR_THREAD_ERRORS) {
+ recover = handle_thread_tfac_error(tfmr, out_flags);
+ tfmr &= ~SPR_TFMR_THREAD_ERRORS;
+ }
+ if (!recover) {
+ unlock(&hmi_lock);
+ goto error_out;
+ }
+ }
+
+ /* Tell the OS ... */
+ if (tfmr & SPR_TFMR_HDEC_PARITY_ERROR)
+ *out_flags |= OPAL_HMI_FLAGS_HDEC_LOST;
+
+	/* Clean up bad HDEC or TB on all threads or subcores before we clear
+ * the error conditions
+ */
+ tfmr_cleanup_core_errors(tfmr);
+
+ /* Unlock before next rendez-vous */
+ unlock(&hmi_lock);
+
+ /* Second rendez vous, ensure the above cleanups are all done before
+ * we proceed further
+ */
+ hmi_rendez_vous(2);
+
+ /* We can now clear the error conditions in the core. */
+ recover = tfmr_clear_core_errors(tfmr);
+ if (recover == 0)
+ goto error_out;
+
+ /* Third rendez-vous. We could in theory do the timebase resync as
+ * part of the previous one, but I prefer having all the error
+ * conditions cleared before we start trying.
+ */
+ hmi_rendez_vous(3);
+
+ /* Now perform the actual TB recovery on thread 0 */
+ if (t == t0)
+ recover = chiptod_recover_tb_errors(&this_cpu()->tb_resynced);
+
+error_out:
+ /* Last rendez-vous */
+ hmi_rendez_vous(4);
+
+ /* Now all threads have gone past rendez-vous 3 and not yet past another
+ * rendez-vous 1, so the value of tb_resynced of thread 0 of the core
+ * contains an accurate indication as to whether the timebase was lost.
+ */
+ if (t0->tb_resynced)
+ *out_flags |= OPAL_HMI_FLAGS_TB_RESYNC;
+
+ if (t == t0 && hmi_jobs) {
+ int i;
+ for (i = 1; i < cpu_thread_count; i++)
+ if (hmi_jobs[i])
+ cpu_wait_job(hmi_jobs[i], true);
+ free(hmi_jobs);
+ }
+
+ return recover;
+}
+
+static uint64_t read_tfmr_t0(void)
+{
+ uint64_t tfmr_t0;
+ uint32_t chip_id = this_cpu()->chip_id;
+ uint32_t core_id = pir_to_core_id(this_cpu()->pir);
+
+ lock(&hmi_lock);
+
+ xscom_write(chip_id, XSCOM_ADDR_P9_EC(core_id, P9_SCOM_SPRC),
+ SETFIELD(P9_SCOMC_SPR_SELECT, 0, P9_SCOMC_TFMR_T0));
+ xscom_read(chip_id, XSCOM_ADDR_P9_EC(core_id, P9_SCOM_SPRD),
+ &tfmr_t0);
+ unlock(&hmi_lock);
+ return tfmr_t0;
+}
+
+/* P9 errata: In theory, an HDEC error is sent to all threads. However,
+ * due to an errata on P9 where TFMR bit 26 (HDEC parity) cannot be
+ * cleared on thread 1..3, I am not confident we can do a rendez-vous
+ * in all cases.
+ *
+ * Our current approach is to ignore that error unless it is present
+ * on thread 0 TFMR. Also, ignore TB residue error due to a similar
+ * errata as above.
+ */
+static void validate_latched_errors(uint64_t *tfmr)
+{
+ if ((*tfmr & (SPR_TFMR_HDEC_PARITY_ERROR | SPR_TFMR_TB_RESIDUE_ERR))
+ && this_cpu()->is_secondary) {
+ uint64_t tfmr_t0 = read_tfmr_t0();
+
+ if (!(tfmr_t0 & SPR_TFMR_HDEC_PARITY_ERROR))
+ *tfmr &= ~SPR_TFMR_HDEC_PARITY_ERROR;
+
+ if (!(tfmr_t0 & SPR_TFMR_TB_RESIDUE_ERR))
+ *tfmr &= ~SPR_TFMR_TB_RESIDUE_ERR;
+ }
+}
+
+static int handle_tfac_errors(struct OpalHMIEvent *hmi_evt, uint64_t *out_flags)
+{
+ int recover = -1;
+ uint64_t tfmr = mfspr(SPR_TFMR);
+
+ /* Initialize the hmi event with old value of TFMR */
+ hmi_evt->tfmr = cpu_to_be64(tfmr);
+
+ /* A TFMR parity/corrupt error makes us ignore all the local stuff.*/
+ if (tfmr & SPR_TFMR_TFMR_CORRUPT) {
+ /* Mark TB as invalid for now as we don't trust TFMR, we'll fix
+ * it up later
+ */
+ this_cpu()->tb_invalid = true;
+ goto bad_tfmr;
+ }
+
+ this_cpu()->tb_invalid = !(tfmr & SPR_TFMR_TB_VALID);
+
+ if (proc_gen == proc_gen_p9)
+ validate_latched_errors(&tfmr);
+
+ /* First, handle thread local errors */
+ if (tfmr & SPR_TFMR_THREAD_ERRORS) {
+ recover = handle_thread_tfac_error(tfmr, out_flags);
+ tfmr &= ~SPR_TFMR_THREAD_ERRORS;
+ }
+
+ bad_tfmr:
+
+	/* Let's see if we still have an all-core error to deal with, if
+ * not, we just bail out
+ */
+ if (tfmr & SPR_TFMR_CORE_ERRORS) {
+ int recover2;
+
+ /* Only update "recover" if it's not already 0 (non-recovered)
+ */
+ recover2 = handle_all_core_tfac_error(tfmr, out_flags);
+ if (recover != 0)
+ recover = recover2;
+ } else if (tfmr & SPR_TFMR_CHIP_TOD_INTERRUPT) {
+ int recover2;
+
+ /*
+		 * There are some TOD errors which do not affect the working of
+		 * TOD and TB. They stay in a valid state, hence we don't need
+		 * a rendez-vous.
+		 *
+		 * TOD errors that affect TOD/TB will report a global error
+		 * in TFMR along with bit 51, and they will go through rendez-vous.
+ */
+ recover2 = chiptod_recover_tod_errors();
+ if (recover != 0)
+ recover = recover2;
+ } else if (this_cpu()->tb_invalid) {
+ /* This shouldn't happen, TB is invalid and no global error
+ * was reported. We just return for now assuming one will
+ * be. We can't do a rendez vous without a core-global HMI.
+ */
+ prlog(PR_ERR, "HMI: TB invalid without core error reported ! "
+ "CPU=%x, TFMR=0x%016lx\n", this_cpu()->pir,
+ mfspr(SPR_TFMR));
+ }
+
+ if (recover != -1 && hmi_evt) {
+ hmi_evt->severity = OpalHMI_SEV_ERROR_SYNC;
+ hmi_evt->type = OpalHMI_ERROR_TFAC;
+ queue_hmi_event(hmi_evt, recover, out_flags);
+ }
+
+ /* Set the TB state looking at TFMR register before we head out. */
+ this_cpu()->tb_invalid = !(mfspr(SPR_TFMR) & SPR_TFMR_TB_VALID);
+
+ if (this_cpu()->tb_invalid) {
+ *out_flags |= OPAL_HMI_FLAGS_TOD_TB_FAIL;
+ prlog(PR_WARNING, "Failed to get TB in running state! "
+ "CPU=%x, TFMR=%016lx\n", this_cpu()->pir,
+ mfspr(SPR_TFMR));
+ }
+
+ return recover;
+}
+
+static int handle_hmi_exception(uint64_t hmer, struct OpalHMIEvent *hmi_evt,
+ uint64_t *out_flags)
+{
+ struct cpu_thread *cpu = this_cpu();
+ int recover = 1;
+ uint64_t handled = 0;
+
+ prlog(PR_DEBUG, "Received HMI interrupt: HMER = 0x%016llx\n", hmer);
+ /* Initialize the hmi event with old value of HMER */
+ if (hmi_evt)
+ hmi_evt->hmer = cpu_to_be64(hmer);
+
+ /* Handle Timer/TOD errors separately */
+ if (hmer & (SPR_HMER_TFAC_ERROR | SPR_HMER_TFMR_PARITY_ERROR)) {
+ hmi_print_debug("Timer Facility Error", hmer);
+ handled = hmer & (SPR_HMER_TFAC_ERROR | SPR_HMER_TFMR_PARITY_ERROR);
+ mtspr(SPR_HMER, ~handled);
+ recover = handle_tfac_errors(hmi_evt, out_flags);
+ handled = 0;
+ }
+
+ lock(&hmi_lock);
+ /*
+ * Not all HMIs would move TB into invalid state. Set the TB state
+ * looking at TFMR register. TFMR will tell us correct state of
+ * TB register.
+ */
+ if (hmer & SPR_HMER_PROC_RECV_DONE) {
+ uint32_t chip_id = pir_to_chip_id(cpu->pir);
+ uint32_t core_id = pir_to_core_id(cpu->pir);
+ uint64_t core_wof;
+
+ hmi_print_debug("Processor recovery occurred.", hmer);
+ if (!read_core_wof(chip_id, core_id, &core_wof)) {
+ int i;
+
+ prlog(PR_DEBUG, "Core WOF = 0x%016llx recovered error:\n", core_wof);
+ if (proc_gen <= proc_gen_p9) {
+ for (i = 0; i < ARRAY_SIZE(p9_recoverable_bits); i++) {
+ if (core_wof & PPC_BIT(p9_recoverable_bits[i].bit))
+ prlog(PR_DEBUG, " %s\n", p9_recoverable_bits[i].reason);
+ }
+ } else if (proc_gen == proc_gen_p10) {
+ for (i = 0; i < ARRAY_SIZE(p10_core_fir_bits); i++) {
+ if (core_wof & PPC_BIT(p10_core_fir_bits[i].bit))
+ prlog(PR_DEBUG, " %s\n", p10_core_fir_bits[i].reason);
+ }
+ }
+ }
+
+ handled |= SPR_HMER_PROC_RECV_DONE;
+ if (cpu_is_thread0(cpu) && hmi_evt) {
+ hmi_evt->severity = OpalHMI_SEV_NO_ERROR;
+ hmi_evt->type = OpalHMI_ERROR_PROC_RECOV_DONE;
+ queue_hmi_event(hmi_evt, recover, out_flags);
+ }
+ }
+
+ if ((proc_gen <= proc_gen_p9) && (hmer & SPR_HMER_PROC_RECV_ERROR_MASKED)) {
+ handled |= SPR_HMER_PROC_RECV_ERROR_MASKED;
+ if (cpu_is_thread0(cpu) && hmi_evt) {
+ hmi_evt->severity = OpalHMI_SEV_NO_ERROR;
+ hmi_evt->type = OpalHMI_ERROR_PROC_RECOV_MASKED;
+ queue_hmi_event(hmi_evt, recover, out_flags);
+ }
+ hmi_print_debug("Processor recovery Done (masked).", hmer);
+ }
+
+ if (hmer & SPR_HMER_PROC_RECV_AGAIN) {
+ handled |= SPR_HMER_PROC_RECV_AGAIN;
+ if (cpu_is_thread0(cpu) && hmi_evt) {
+ hmi_evt->severity = OpalHMI_SEV_NO_ERROR;
+ hmi_evt->type = OpalHMI_ERROR_PROC_RECOV_DONE_AGAIN;
+ queue_hmi_event(hmi_evt, recover, out_flags);
+ }
+ hmi_print_debug("Processor recovery occurred again before"
+ "bit2 was cleared\n", hmer);
+ }
+
+ /* XXX: what to do with this? */
+ if (hmer & SPR_HMER_SPURR_SCALE_LIMIT) {
+ handled |= SPR_HMER_SPURR_SCALE_LIMIT;
+ if (cpu_is_thread0(cpu) && hmi_evt) {
+ hmi_evt->severity = OpalHMI_SEV_NO_ERROR;
+ hmi_evt->type = OpalHMI_ERROR_PROC_RECOV_DONE;
+ queue_hmi_event(hmi_evt, recover, out_flags);
+ }
+ hmi_print_debug("Turbo versus nominal frequency exceeded limit.", hmer);
+ }
+
+ /* Assert if we see malfunction alert, we can not continue. */
+ if (hmer & SPR_HMER_MALFUNCTION_ALERT) {
+ handled |= SPR_HMER_MALFUNCTION_ALERT;
+
+ hmi_print_debug("Malfunction Alert", hmer);
+ recover = 0;
+ if (hmi_evt)
+ decode_malfunction(hmi_evt, out_flags);
+ }
+
+ /* Assert if we see Hypervisor resource error, we can not continue. */
+ if ((proc_gen <= proc_gen_p9) && (hmer & SPR_HMER_HYP_RESOURCE_ERR)) {
+ handled |= SPR_HMER_HYP_RESOURCE_ERR;
+
+ hmi_print_debug("Hypervisor resource error", hmer);
+ recover = 0;
+ if (hmi_evt) {
+ hmi_evt->severity = OpalHMI_SEV_FATAL;
+ hmi_evt->type = OpalHMI_ERROR_HYP_RESOURCE;
+ queue_hmi_event(hmi_evt, recover, out_flags);
+ }
+ }
+
+ /* XXX: what to do with this? */
+ if ((proc_gen <= proc_gen_p9) && (hmer & SPR_HMER_THD_WAKE_BLOCKED_TM_SUSPEND)) {
+ handled |= SPR_HMER_THD_WAKE_BLOCKED_TM_SUSPEND;
+ hmer &= ~SPR_HMER_THD_WAKE_BLOCKED_TM_SUSPEND;
+
+ hmi_print_debug("Attempted to wake thread when threads in TM suspend mode.", hmer);
+ if (hmi_evt) {
+ hmi_evt->severity = OpalHMI_SEV_NO_ERROR;
+			hmi_evt->type = OpalHMI_ERROR_PROC_RECOV_DONE;
+ queue_hmi_event(hmi_evt, recover, out_flags);
+ }
+ }
+
+ if ((proc_gen <= proc_gen_p9) && (hmer & SPR_HMER_TRIG_FIR_HMI)) {
+ handled |= SPR_HMER_TRIG_FIR_HMI;
+ hmer &= ~SPR_HMER_TRIG_FIR_HMI;
+
+ hmi_print_debug("Clearing unknown debug trigger", hmer);
+ if (hmi_evt) {
+ hmi_evt->severity = OpalHMI_SEV_NO_ERROR;
+			hmi_evt->type = OpalHMI_ERROR_DEBUG_TRIG_FIR;
+ queue_hmi_event(hmi_evt, recover, out_flags);
+ }
+ }
+ if ((proc_gen == proc_gen_p10) && (hmer & SPR_HMER_P10_TRIG_FIR_HMI)) {
+ handled |= SPR_HMER_P10_TRIG_FIR_HMI;
+ hmer &= ~SPR_HMER_P10_TRIG_FIR_HMI;
+
+ hmi_print_debug("Clearing unknown debug trigger", hmer);
+ if (hmi_evt) {
+ hmi_evt->severity = OpalHMI_SEV_NO_ERROR;
+			hmi_evt->type = OpalHMI_ERROR_DEBUG_TRIG_FIR;
+ queue_hmi_event(hmi_evt, recover, out_flags);
+ }
+ }
+
+ if (recover == 0)
+ disable_fast_reboot("Unrecoverable HMI");
+ /*
+ * HMER bits are sticky, once set to 1 they remain set to 1 until
+ * they are set to 0. Reset the error source bit to 0, otherwise
+ * we keep getting HMI interrupt again and again. Writing to HMER
+ * acts as an AND, so we write mask of all 1's except for the bits
+ * we want to clear.
+ */
+ mtspr(SPR_HMER, ~handled);
+ unlock(&hmi_lock);
+ return recover;
+}
+
+static int64_t opal_handle_hmi(void)
+{
+ uint64_t hmer, dummy_flags;
+ struct OpalHMIEvent hmi_evt;
+
+ /*
+	 * Compile-time check to ensure the size of struct OpalHMIEvent does
+	 * not exceed that of struct opal_msg.
+ */
+ BUILD_ASSERT(sizeof(struct opal_msg) >= sizeof(struct OpalHMIEvent));
+
+ memset(&hmi_evt, 0, sizeof(struct OpalHMIEvent));
+ hmi_evt.version = OpalHMIEvt_V2;
+
+ hmer = mfspr(SPR_HMER); /* Get HMER register value */
+ handle_hmi_exception(hmer, &hmi_evt, &dummy_flags);
+
+ return OPAL_SUCCESS;
+}
+opal_call(OPAL_HANDLE_HMI, opal_handle_hmi, 0);
+
+static int64_t opal_handle_hmi2(__be64 *out_flags)
+{
+ uint64_t hmer, flags = 0;
+ struct OpalHMIEvent hmi_evt;
+
+ /*
+	 * Compile-time check to ensure the size of struct OpalHMIEvent does
+	 * not exceed that of struct opal_msg.
+ */
+ BUILD_ASSERT(sizeof(struct opal_msg) >= sizeof(struct OpalHMIEvent));
+
+ memset(&hmi_evt, 0, sizeof(struct OpalHMIEvent));
+ hmi_evt.version = OpalHMIEvt_V2;
+
+ hmer = mfspr(SPR_HMER); /* Get HMER register value */
+ handle_hmi_exception(hmer, &hmi_evt, &flags);
+ *out_flags = cpu_to_be64(flags);
+
+ return OPAL_SUCCESS;
+}
+opal_call(OPAL_HANDLE_HMI2, opal_handle_hmi2, 1);
diff --git a/roms/skiboot/core/i2c.c b/roms/skiboot/core/i2c.c
new file mode 100644
index 000000000..b4313d430
--- /dev/null
+++ b/roms/skiboot/core/i2c.c
@@ -0,0 +1,288 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * I2C
+ *
+ * Copyright 2013-2019 IBM Corp.
+ */
+
+#include <skiboot.h>
+#include <i2c.h>
+#include <opal.h>
+#include <device.h>
+#include <opal-msg.h>
+#include <timebase.h>
+#include <processor.h>
+#include <timer.h>
+#include <trace.h>
+
+static LIST_HEAD(i2c_bus_list);
+
+/* Used to assign OPAL IDs */
+static uint32_t i2c_next_bus;
+
+void i2c_add_bus(struct i2c_bus *bus)
+{
+ bus->opal_id = ++i2c_next_bus;
+ dt_add_property_cells(bus->dt_node, "ibm,opal-id", bus->opal_id);
+
+ list_add_tail(&i2c_bus_list, &bus->link);
+}
+
+struct i2c_bus *i2c_find_bus_by_id(uint32_t opal_id)
+{
+ struct i2c_bus *bus;
+
+ list_for_each(&i2c_bus_list, bus, link) {
+ if (bus->opal_id == opal_id)
+ return bus;
+ }
+ return NULL;
+}
+
+static inline void i2c_trace_req(struct i2c_request *req, int rc)
+{
+ struct trace_i2c t;
+
+ memset(&t, 0, sizeof(t));
+
+ t.bus = req->bus->opal_id;
+ t.type = req->op | (req->offset_bytes << 4);
+ t.i2c_addr = req->dev_addr;
+ t.smbus_reg = req->offset & 0xffff; // FIXME: log whole offset
+ t.size = req->rw_len;
+ t.rc = rc;
+
+ /* FIXME: trace should not be a union... */
+ trace_add((void *)&t, TRACE_I2C, sizeof(t));
+}
+
+int64_t i2c_queue_req(struct i2c_request *req)
+{
+ int64_t ret = req->bus->queue_req(req);
+
+ i2c_trace_req(req, OPAL_ASYNC_COMPLETION);
+
+ if (!ret)
+ req->req_state = i2c_req_queued;
+ return ret;
+}
+
+static void opal_i2c_request_complete(int rc, struct i2c_request *req)
+{
+ uint64_t token = (uint64_t)(unsigned long)req->user_data;
+
+ opal_queue_msg(OPAL_MSG_ASYNC_COMP, NULL, NULL,
+ cpu_to_be64(token),
+ cpu_to_be64(rc));
+ i2c_trace_req(req, rc);
+
+ free(req);
+}
+
+static int opal_i2c_request(uint64_t async_token, uint32_t bus_id,
+ struct opal_i2c_request *oreq)
+{
+ struct i2c_bus *bus = NULL;
+ struct i2c_request *req;
+ int rc;
+
+ if (!opal_addr_valid(oreq))
+ return OPAL_PARAMETER;
+
+ if (oreq->flags & OPAL_I2C_ADDR_10)
+ return OPAL_UNSUPPORTED;
+
+ bus = i2c_find_bus_by_id(bus_id);
+ if (!bus) {
+ /**
+ * @fwts-label I2CInvalidBusID
+ * @fwts-advice opal_i2c_request was passed an invalid bus
+ * ID. This has likely come from the OS rather than OPAL
+ * and thus could indicate an OS bug rather than an OPAL
+ * bug.
+ */
+ prlog(PR_ERR, "I2C: Invalid 'bus_id' passed to the OPAL\n");
+ return OPAL_PARAMETER;
+ }
+
+ req = zalloc(sizeof(*req));
+ if (!req) {
+ /**
+ * @fwts-label I2CFailedAllocation
+ * @fwts-advice OPAL failed to allocate memory for an
+ * i2c_request. This points to an OPAL bug as OPAL ran
+ * out of memory and this should never happen.
+ */
+ prlog(PR_ERR, "I2C: Failed to allocate 'i2c_request'\n");
+ return OPAL_NO_MEM;
+ }
+
+ switch(oreq->type) {
+ case OPAL_I2C_RAW_READ:
+ req->op = I2C_READ;
+ break;
+ case OPAL_I2C_RAW_WRITE:
+ req->op = I2C_WRITE;
+ break;
+ case OPAL_I2C_SM_READ:
+ req->op = SMBUS_READ;
+ req->offset = be32_to_cpu(oreq->subaddr);
+ req->offset_bytes = oreq->subaddr_sz;
+ break;
+ case OPAL_I2C_SM_WRITE:
+ req->op = SMBUS_WRITE;
+ req->offset = be32_to_cpu(oreq->subaddr);
+ req->offset_bytes = oreq->subaddr_sz;
+ break;
+ default:
+ free(req);
+ return OPAL_PARAMETER;
+ }
+ req->dev_addr = be16_to_cpu(oreq->addr);
+ req->rw_len = be32_to_cpu(oreq->size);
+ req->rw_buf = (void *)be64_to_cpu(oreq->buffer_ra);
+ req->completion = opal_i2c_request_complete;
+ req->user_data = (void *)(unsigned long)async_token;
+ req->bus = bus;
+
+ if (i2c_check_quirk(req, &rc)) {
+ free(req);
+ return rc;
+ }
+
+ /* Finally, queue the OPAL i2c request and return */
+ rc = i2c_queue_req(req);
+ if (rc) {
+ free(req);
+ return rc;
+ }
+
+ return OPAL_ASYNC_COMPLETION;
+}
+opal_call(OPAL_I2C_REQUEST, opal_i2c_request, 3);
+
+#define MAX_NACK_RETRIES 2
+#define REQ_COMPLETE_POLLING 5 /* Check if req is complete
+ in 5ms interval */
+int64_t i2c_request_sync(struct i2c_request *req)
+{
+ uint64_t timer_period = msecs_to_tb(5), timer_count;
+ uint64_t time_to_wait = 0;
+ int64_t rc, waited, retries;
+ size_t i, count;
+ char buf[17]; /* 8 bytes in hex + NUL */
+
+ for (retries = 0; retries <= MAX_NACK_RETRIES; retries++) {
+ waited = 0;
+ timer_count = 0;
+
+ i2c_queue_req(req);
+
+ do {
+ time_to_wait = i2c_run_req(req);
+ if (!time_to_wait)
+ time_to_wait = REQ_COMPLETE_POLLING;
+ time_wait(time_to_wait);
+ waited += time_to_wait;
+ timer_count += time_to_wait;
+ if (timer_count > timer_period) {
+ /*
+ * The above request may be relying on
+			 * timers to complete, yet they may
+			 * not be called, especially during
+ * opal init. We could be looping here
+ * forever. So explicitly check the
+ * timers once in a while
+ */
+ check_timers(false);
+ timer_count = 0;
+ }
+ } while (req->req_state != i2c_req_done);
+
+ lwsync();
+ rc = req->result;
+
+ /* retry on NACK, otherwise exit */
+ if (rc != OPAL_I2C_NACK_RCVD)
+ break;
+ req->req_state = i2c_req_new;
+ }
+
+ i2c_trace_req(req, rc);
+ count = 0;
+ for (i = 0; i < req->rw_len && count < sizeof(buf); i++) {
+ count += snprintf(buf+count, sizeof(buf)-count, "%02x",
+ *(unsigned char *)(req->rw_buf+i));
+ }
+
+ prlog(PR_DEBUG, "I2C: %s req op=%x offset=%x buf=%s buflen=%d "
+ "delay=%lu/%lld rc=%lld\n",
+ (rc) ? "!!!!" : "----", req->op, req->offset,
+ buf, req->rw_len, tb_to_msecs(waited), req->timeout, rc);
+
+ return rc;
+}
+
+/**
+ * i2c_request_send - send request to i2c bus synchronously
+ * @bus_id: i2c bus id
+ * @dev_addr: address of the device
+ * @read_write: SMBUS_READ or SMBUS_WRITE
+ * @offset: any of the I2C interface offset defined
+ * @offset_bytes: offset size in bytes
+ * @buf: data to be read or written
+ * @buflen: buf length
+ * @timeout: request timeout in milliseconds
+ *
+ * Send an I2C request to a device synchronously
+ *
+ * Returns: Zero on success otherwise a negative error code
+ */
+int64_t i2c_request_send(int bus_id, int dev_addr, int read_write,
+ uint32_t offset, uint32_t offset_bytes, void* buf,
+ size_t buflen, int timeout)
+{
+ struct i2c_request *req;
+ struct i2c_bus *bus;
+ int64_t rc;
+
+ bus = i2c_find_bus_by_id(bus_id);
+ if (!bus) {
+ /**
+ * @fwts-label I2CInvalidBusID
+ * @fwts-advice i2c_request_send was passed an invalid bus
+ * ID. This indicates a bug.
+ */
+ prlog(PR_ERR, "I2C: Invalid bus_id=%x\n", bus_id);
+ return OPAL_PARAMETER;
+ }
+
+ req = zalloc(sizeof(*req));
+ if (!req) {
+ /**
+ * @fwts-label I2CAllocationFailed
+ * @fwts-advice OPAL failed to allocate memory for an
+		 * i2c_request. This points to an OPAL bug as OPAL ran out of
+ * memory and this should never happen.
+ */
+ prlog(PR_ERR, "I2C: allocating i2c_request failed\n");
+ return OPAL_INTERNAL_ERROR;
+ }
+
+ req->bus = bus;
+ req->dev_addr = dev_addr;
+ req->op = read_write;
+ req->offset = offset;
+ req->offset_bytes = offset_bytes;
+ req->rw_buf = (void*) buf;
+ req->rw_len = buflen;
+ req->timeout = timeout;
+
+ rc = i2c_request_sync(req);
+
+ free(req);
+ if (rc)
+ return OPAL_HARDWARE;
+
+ return OPAL_SUCCESS;
+}
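+
+/*
+ * Usage sketch (hypothetical bus and device IDs): read one byte from
+ * register 0x02 of a device at address 0x50 on OPAL bus 3, with a 100ms
+ * timeout:
+ *
+ *	uint8_t val;
+ *	int64_t rc = i2c_request_send(3, 0x50, SMBUS_READ, 0x02, 1,
+ *				      &val, 1, 100);
+ */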
diff --git a/roms/skiboot/core/init.c b/roms/skiboot/core/init.c
new file mode 100644
index 000000000..a8bac28a8
--- /dev/null
+++ b/roms/skiboot/core/init.c
@@ -0,0 +1,1469 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * skiboot C entry point
+ *
+ * Copyright 2013-2019 IBM Corp.
+ */
+
+#include <skiboot.h>
+#include <psi.h>
+#include <chiptod.h>
+#include <nx.h>
+#include <cpu.h>
+#include <processor.h>
+#include <xscom.h>
+#include <opal.h>
+#include <opal-msg.h>
+#include <elf.h>
+#include <elf-abi.h>
+#include <io.h>
+#include <cec.h>
+#include <device.h>
+#include <pci.h>
+#include <lpc.h>
+#include <i2c.h>
+#include <chip.h>
+#include <interrupts.h>
+#include <mem_region.h>
+#include <trace.h>
+#include <console.h>
+#include <fsi-master.h>
+#include <centaur.h>
+#include <ocmb.h>
+#include <libfdt/libfdt.h>
+#include <timer.h>
+#include <ipmi.h>
+#include <sensor.h>
+#include <xive.h>
+#include <nvram.h>
+#include <vas.h>
+#include <libstb/secureboot.h>
+#include <libstb/trustedboot.h>
+#include <phys-map.h>
+#include <imc.h>
+#include <dts.h>
+#include <dio-p9.h>
+#include <sbe-p9.h>
+#include <debug_descriptor.h>
+#include <occ.h>
+#include <opal-dump.h>
+#include <xscom-p10-regs.h>
+
+enum proc_gen proc_gen;
+unsigned int pcie_max_link_speed;
+bool pci_tracing;
+bool verbose_eeh;
+extern const char version[];
+
+static uint64_t kernel_entry;
+static size_t kernel_size;
+static bool kernel_32bit;
+
+/* We backup the previous vectors here before copying our own */
+static uint8_t old_vectors[EXCEPTION_VECTORS_END];
+
+#ifdef DEBUG
+#define DEBUG_STR "-debug"
+#else
+#define DEBUG_STR ""
+#endif
+
+#ifdef SKIBOOT_GCOV
+void skiboot_gcov_done(void);
+#endif
+
+struct debug_descriptor debug_descriptor = {
+ .eye_catcher = "OPALdbug",
+ .version = CPU_TO_BE32(DEBUG_DESC_VERSION),
+ .state_flags = 0,
+ .memcons_phys = 0, /* cpu_to_be64(&memcons) can't init constant */
+ .trace_mask = 0, /* All traces disabled by default */
+ /* console log level:
+ * high 4 bits in memory, low 4 bits driver (e.g. uart). */
+#ifdef DEBUG
+ .console_log_levels = (PR_TRACE << 4) | PR_DEBUG,
+#else
+ .console_log_levels = (PR_DEBUG << 4) | PR_NOTICE,
+#endif
+};
+
+static void checksum_romem(void);
+
+static bool try_load_elf64_le(struct elf_hdr *header)
+{
+ struct elf64le_hdr *kh = (struct elf64le_hdr *)header;
+ uint64_t load_base = (uint64_t)kh;
+ struct elf64le_phdr *ph;
+ unsigned int i;
+
+ printf("INIT: 64-bit LE kernel discovered\n");
+
+ /* Look for a loadable program header that has our entry in it
+ *
+ * Note that we execute the kernel in-place, we don't actually
+	 * obey the load information in the headers. This is expected
+	 * to work for the Linux kernel because it's a fairly dumb ELF
+	 * but it will not work for arbitrary ELF binaries.
+ */
+ ph = (struct elf64le_phdr *)(load_base + le64_to_cpu(kh->e_phoff));
+ for (i = 0; i < le16_to_cpu(kh->e_phnum); i++, ph++) {
+ if (le32_to_cpu(ph->p_type) != ELF_PTYPE_LOAD)
+ continue;
+ if (le64_to_cpu(ph->p_vaddr) > le64_to_cpu(kh->e_entry) ||
+ (le64_to_cpu(ph->p_vaddr) + le64_to_cpu(ph->p_memsz)) <
+ le64_to_cpu(kh->e_entry))
+ continue;
+
+ /* Get our entry */
+ kernel_entry = le64_to_cpu(kh->e_entry) -
+ le64_to_cpu(ph->p_vaddr) + le64_to_cpu(ph->p_offset);
+ break;
+ }
+
+ if (!kernel_entry) {
+ prerror("INIT: Failed to find kernel entry !\n");
+ return false;
+ }
+ kernel_entry += load_base;
+ kernel_32bit = false;
+
+ kernel_size = le64_to_cpu(kh->e_shoff) +
+ ((uint32_t)le16_to_cpu(kh->e_shentsize) *
+ (uint32_t)le16_to_cpu(kh->e_shnum));
+
+ prlog(PR_DEBUG, "INIT: 64-bit kernel entry at 0x%llx, size 0x%lx\n",
+ kernel_entry, kernel_size);
+
+ return true;
+}
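+
+/*
+ * Note: the kernel_entry computation above translates e_entry from a
+ * virtual address to an offset within the image (e_entry - p_vaddr +
+ * p_offset) and then rebases it onto load_base, since the kernel is
+ * executed in place rather than copied to its link address.
+ */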
+
+static bool try_load_elf64(struct elf_hdr *header)
+{
+ struct elf64be_hdr *kh = (struct elf64be_hdr *)header;
+ struct elf64le_hdr *khle = (struct elf64le_hdr *)header;
+ uint64_t load_base = (uint64_t)kh;
+ struct elf64be_phdr *ph;
+ struct elf64be_shdr *sh;
+ unsigned int i;
+
+ /* Check it's a ppc64 LE ELF */
+ if (khle->ei_ident == ELF_IDENT &&
+ khle->ei_data == ELF_DATA_LSB &&
+ le16_to_cpu(khle->e_machine) == ELF_MACH_PPC64) {
+ return try_load_elf64_le(header);
+ }
+
+ /* Check it's a ppc64 ELF */
+ if (kh->ei_ident != ELF_IDENT ||
+ kh->ei_data != ELF_DATA_MSB ||
+ be16_to_cpu(kh->e_machine) != ELF_MACH_PPC64) {
+		prerror("INIT: Kernel doesn't look like a ppc64 ELF\n");
+ return false;
+ }
+
+ /* Look for a loadable program header that has our entry in it
+ *
+ * Note that we execute the kernel in-place, we don't actually
+	 * obey the load information in the headers. This is expected
+	 * to work for the Linux kernel because it's a fairly dumb ELF,
+	 * but it will not work for arbitrary ELF binaries.
+ */
+ ph = (struct elf64be_phdr *)(load_base + be64_to_cpu(kh->e_phoff));
+ for (i = 0; i < be16_to_cpu(kh->e_phnum); i++, ph++) {
+ if (be32_to_cpu(ph->p_type) != ELF_PTYPE_LOAD)
+ continue;
+ if (be64_to_cpu(ph->p_vaddr) > be64_to_cpu(kh->e_entry) ||
+ (be64_to_cpu(ph->p_vaddr) + be64_to_cpu(ph->p_memsz)) <
+ be64_to_cpu(kh->e_entry))
+ continue;
+
+ /* Get our entry */
+ kernel_entry = be64_to_cpu(kh->e_entry) -
+ be64_to_cpu(ph->p_vaddr) + be64_to_cpu(ph->p_offset);
+ break;
+ }
+
+ if (!kernel_entry) {
+ prerror("INIT: Failed to find kernel entry !\n");
+ return false;
+ }
+
+ /* For the normal big-endian ELF ABI, the kernel entry points
+ * to a function descriptor in the data section. Linux instead
+ * has it point directly to code. Test whether it is pointing
+ * into an executable section or not to figure this out. Default
+ * to assuming it obeys the ABI.
+ */
+ sh = (struct elf64be_shdr *)(load_base + be64_to_cpu(kh->e_shoff));
+ for (i = 0; i < be16_to_cpu(kh->e_shnum); i++, sh++) {
+ if (be64_to_cpu(sh->sh_addr) <= be64_to_cpu(kh->e_entry) &&
+ (be64_to_cpu(sh->sh_addr) + be64_to_cpu(sh->sh_size)) >
+ be64_to_cpu(kh->e_entry))
+ break;
+ }
+
+ if (i == be16_to_cpu(kh->e_shnum) ||
+ !(be64_to_cpu(sh->sh_flags) & ELF_SFLAGS_X)) {
+ kernel_entry = *(uint64_t *)(kernel_entry + load_base);
+ kernel_entry = kernel_entry -
+ be64_to_cpu(ph->p_vaddr) + be64_to_cpu(ph->p_offset);
+ }
+
+ kernel_entry += load_base;
+ kernel_32bit = false;
+
+ kernel_size = be64_to_cpu(kh->e_shoff) +
+ ((uint32_t)be16_to_cpu(kh->e_shentsize) *
+ (uint32_t)be16_to_cpu(kh->e_shnum));
+
+ printf("INIT: 64-bit kernel entry at 0x%llx, size 0x%lx\n",
+ kernel_entry, kernel_size);
+
+ return true;
+}
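+
+/*
+ * Background note (informational, not part of the original file): under the
+ * big-endian ELFv1 ABI a function symbol such as e_entry refers to a function
+ * descriptor -- a small data-section record (typically in .opd) whose first
+ * doubleword is the real code address and whose second is the TOC pointer --
+ * which is why try_load_elf64() dereferences the entry value when it does not
+ * fall inside an executable section.
+ */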
+
+static bool try_load_elf32_le(struct elf_hdr *header)
+{
+ struct elf32le_hdr *kh = (struct elf32le_hdr *)header;
+ uint64_t load_base = (uint64_t)kh;
+ struct elf32le_phdr *ph;
+ unsigned int i;
+
+ printf("INIT: 32-bit LE kernel discovered\n");
+
+ /* Look for a loadable program header that has our entry in it
+ *
+ * Note that we execute the kernel in-place, we don't actually
+	 * obey the load information in the headers. This is expected
+	 * to work for the Linux kernel because it's a fairly dumb ELF,
+	 * but it will not work for arbitrary ELF binaries.
+ */
+ ph = (struct elf32le_phdr *)(load_base + le32_to_cpu(kh->e_phoff));
+ for (i = 0; i < le16_to_cpu(kh->e_phnum); i++, ph++) {
+ if (le32_to_cpu(ph->p_type) != ELF_PTYPE_LOAD)
+ continue;
+ if (le32_to_cpu(ph->p_vaddr) > le32_to_cpu(kh->e_entry) ||
+ (le32_to_cpu(ph->p_vaddr) + le32_to_cpu(ph->p_memsz)) <
+ le32_to_cpu(kh->e_entry))
+ continue;
+
+ /* Get our entry */
+ kernel_entry = le32_to_cpu(kh->e_entry) -
+ le32_to_cpu(ph->p_vaddr) + le32_to_cpu(ph->p_offset);
+ break;
+ }
+
+ if (!kernel_entry) {
+ prerror("INIT: Failed to find kernel entry !\n");
+ return false;
+ }
+
+ kernel_entry += load_base;
+ kernel_32bit = true;
+
+ printf("INIT: 32-bit kernel entry at 0x%llx\n", kernel_entry);
+
+ return true;
+}
+
+static bool try_load_elf32(struct elf_hdr *header)
+{
+ struct elf32be_hdr *kh = (struct elf32be_hdr *)header;
+ struct elf32le_hdr *khle = (struct elf32le_hdr *)header;
+ uint64_t load_base = (uint64_t)kh;
+ struct elf32be_phdr *ph;
+ unsigned int i;
+
+ /* Check it's a ppc32 LE ELF */
+ if (khle->ei_ident == ELF_IDENT &&
+ khle->ei_data == ELF_DATA_LSB &&
+ le16_to_cpu(khle->e_machine) == ELF_MACH_PPC32) {
+ return try_load_elf32_le(header);
+ }
+
+ /* Check it's a ppc32 ELF */
+ if (kh->ei_ident != ELF_IDENT ||
+ kh->ei_data != ELF_DATA_MSB ||
+ be16_to_cpu(kh->e_machine) != ELF_MACH_PPC32) {
+		prerror("INIT: Kernel doesn't look like a ppc32 ELF\n");
+ return false;
+ }
+
+ /* Look for a loadable program header that has our entry in it
+ *
+ * Note that we execute the kernel in-place, we don't actually
+	 * obey the load information in the headers. This is expected
+	 * to work for the Linux kernel because it's a fairly dumb ELF,
+	 * but it will not work for arbitrary ELF binaries.
+ */
+ ph = (struct elf32be_phdr *)(load_base + be32_to_cpu(kh->e_phoff));
+ for (i = 0; i < be16_to_cpu(kh->e_phnum); i++, ph++) {
+ if (be32_to_cpu(ph->p_type) != ELF_PTYPE_LOAD)
+ continue;
+ if (be32_to_cpu(ph->p_vaddr) > be32_to_cpu(kh->e_entry) ||
+ (be32_to_cpu(ph->p_vaddr) + be32_to_cpu(ph->p_memsz)) <
+ be32_to_cpu(kh->e_entry))
+ continue;
+
+ /* Get our entry */
+ kernel_entry = be32_to_cpu(kh->e_entry) -
+ be32_to_cpu(ph->p_vaddr) + be32_to_cpu(ph->p_offset);
+ break;
+ }
+
+ if (!kernel_entry) {
+ prerror("INIT: Failed to find kernel entry !\n");
+ return false;
+ }
+
+ kernel_entry += load_base;
+ kernel_32bit = true;
+
+ printf("INIT: 32-bit kernel entry at 0x%llx\n", kernel_entry);
+
+ return true;
+}
+
+extern char __builtin_kernel_start[];
+extern char __builtin_kernel_end[];
+extern uint64_t boot_offset;
+
+static size_t initramfs_size;
+
+bool start_preload_kernel(void)
+{
+ int loaded;
+
+ /* Try to load an external kernel payload through the platform hooks */
+ kernel_size = KERNEL_LOAD_SIZE;
+ loaded = start_preload_resource(RESOURCE_ID_KERNEL,
+ RESOURCE_SUBID_NONE,
+ KERNEL_LOAD_BASE,
+ &kernel_size);
+ if (loaded != OPAL_SUCCESS) {
+ printf("INIT: platform start load kernel failed\n");
+ kernel_size = 0;
+ return false;
+ }
+
+ initramfs_size = INITRAMFS_LOAD_SIZE;
+ loaded = start_preload_resource(RESOURCE_ID_INITRAMFS,
+ RESOURCE_SUBID_NONE,
+ INITRAMFS_LOAD_BASE, &initramfs_size);
+ if (loaded != OPAL_SUCCESS) {
+ printf("INIT: platform start load initramfs failed\n");
+ initramfs_size = 0;
+ return false;
+ }
+
+ return true;
+}
+
+static bool load_kernel(void)
+{
+ void *stb_container = NULL;
+ struct elf_hdr *kh;
+ int loaded;
+
+ prlog(PR_NOTICE, "INIT: Waiting for kernel...\n");
+
+ loaded = wait_for_resource_loaded(RESOURCE_ID_KERNEL,
+ RESOURCE_SUBID_NONE);
+
+ if (loaded != OPAL_SUCCESS) {
+ printf("INIT: platform wait for kernel load failed\n");
+ kernel_size = 0;
+ }
+
+ /* Try embedded kernel payload */
+ if (!kernel_size) {
+ kernel_size = __builtin_kernel_end - __builtin_kernel_start;
+ if (kernel_size) {
+ /* Move the built-in kernel up */
+ uint64_t builtin_base =
+ ((uint64_t)__builtin_kernel_start) -
+ SKIBOOT_BASE + boot_offset;
+ printf("Using built-in kernel\n");
+ memmove(KERNEL_LOAD_BASE, (void*)builtin_base,
+ kernel_size);
+ }
+ }
+
+ if (dt_has_node_property(dt_chosen, "kernel-base-address", NULL)) {
+ kernel_entry = dt_prop_get_u64(dt_chosen,
+ "kernel-base-address");
+ prlog(PR_DEBUG, "INIT: Kernel image at 0x%llx\n", kernel_entry);
+ kh = (struct elf_hdr *)kernel_entry;
+ /*
+ * If the kernel is at 0, restore it as it was overwritten
+ * by our vectors.
+ */
+ if (kernel_entry < EXCEPTION_VECTORS_END) {
+ cpu_set_sreset_enable(false);
+ memcpy_null(NULL, old_vectors, EXCEPTION_VECTORS_END);
+ sync_icache();
+ } else {
+ /* Hack for STB in Mambo, assume at least 4kb in mem */
+ if (!kernel_size)
+ kernel_size = SECURE_BOOT_HEADERS_SIZE;
+ if (stb_is_container((void*)kernel_entry, kernel_size)) {
+ stb_container = (void*)kernel_entry;
+ kh = (struct elf_hdr *) (kernel_entry + SECURE_BOOT_HEADERS_SIZE);
+ } else
+ kh = (struct elf_hdr *) (kernel_entry);
+ }
+ } else {
+ if (!kernel_size) {
+ printf("INIT: Assuming kernel at %p\n",
+ KERNEL_LOAD_BASE);
+ /* Hack for STB in Mambo, assume at least 4kb in mem */
+ kernel_size = SECURE_BOOT_HEADERS_SIZE;
+ kernel_entry = (uint64_t)KERNEL_LOAD_BASE;
+ }
+ if (stb_is_container(KERNEL_LOAD_BASE, kernel_size)) {
+ stb_container = KERNEL_LOAD_BASE;
+ kh = (struct elf_hdr *) (KERNEL_LOAD_BASE + SECURE_BOOT_HEADERS_SIZE);
+ } else
+ kh = (struct elf_hdr *) (KERNEL_LOAD_BASE);
+
+ }
+
+ prlog(PR_DEBUG,
+ "INIT: Kernel loaded, size: %zu bytes (0 = unknown preload)\n",
+ kernel_size);
+
+ if (kh->ei_ident != ELF_IDENT) {
+ prerror("INIT: ELF header not found. Assuming raw binary.\n");
+ return true;
+ }
+
+ if (kh->ei_class == ELF_CLASS_64) {
+ if (!try_load_elf64(kh))
+ return false;
+ } else if (kh->ei_class == ELF_CLASS_32) {
+ if (!try_load_elf32(kh))
+ return false;
+ } else {
+		prerror("INIT: Neither ELF32 nor ELF64 ?\n");
+ return false;
+ }
+
+ if (chip_quirk(QUIRK_MAMBO_CALLOUTS)) {
+ secureboot_verify(RESOURCE_ID_KERNEL,
+ stb_container,
+ SECURE_BOOT_HEADERS_SIZE + kernel_size);
+ trustedboot_measure(RESOURCE_ID_KERNEL,
+ stb_container,
+ SECURE_BOOT_HEADERS_SIZE + kernel_size);
+ }
+
+ return true;
+}
+
+static void load_initramfs(void)
+{
+ uint64_t *initramfs_start;
+ void *stb_container = NULL;
+ int loaded;
+
+ loaded = wait_for_resource_loaded(RESOURCE_ID_INITRAMFS,
+ RESOURCE_SUBID_NONE);
+
+ if (loaded != OPAL_SUCCESS || !initramfs_size)
+ return;
+
+ if (stb_is_container(INITRAMFS_LOAD_BASE, initramfs_size)) {
+ stb_container = INITRAMFS_LOAD_BASE;
+ initramfs_start = INITRAMFS_LOAD_BASE + SECURE_BOOT_HEADERS_SIZE;
+ } else {
+ initramfs_start = INITRAMFS_LOAD_BASE;
+ }
+
+ dt_check_del_prop(dt_chosen, "linux,initrd-start");
+ dt_check_del_prop(dt_chosen, "linux,initrd-end");
+
+ printf("INIT: Initramfs loaded, size: %zu bytes\n", initramfs_size);
+
+ dt_add_property_u64(dt_chosen, "linux,initrd-start",
+ (uint64_t)initramfs_start);
+ dt_add_property_u64(dt_chosen, "linux,initrd-end",
+ (uint64_t)initramfs_start + initramfs_size);
+
+ if (chip_quirk(QUIRK_MAMBO_CALLOUTS)) {
+ secureboot_verify(RESOURCE_ID_INITRAMFS,
+ stb_container,
+ SECURE_BOOT_HEADERS_SIZE + initramfs_size);
+ trustedboot_measure(RESOURCE_ID_INITRAMFS,
+ stb_container,
+ SECURE_BOOT_HEADERS_SIZE + initramfs_size);
+ }
+}
+
+static void cpu_disable_ME_RI_one(void *param __unused)
+{
+ disable_machine_check();
+ mtmsrd(0, 1);
+}
+
+static int64_t cpu_disable_ME_RI_all(void)
+{
+ struct cpu_thread *cpu;
+ struct cpu_job **jobs;
+
+ jobs = zalloc(sizeof(struct cpu_job *) * (cpu_max_pir + 1));
+ assert(jobs);
+
+ for_each_available_cpu(cpu) {
+ if (cpu == this_cpu())
+ continue;
+ jobs[cpu->pir] = cpu_queue_job(cpu, "cpu_disable_ME_RI",
+ cpu_disable_ME_RI_one, NULL);
+ }
+
+ /* this cpu */
+ cpu_disable_ME_RI_one(NULL);
+
+ for_each_available_cpu(cpu) {
+ if (jobs[cpu->pir])
+ cpu_wait_job(jobs[cpu->pir], true);
+ }
+
+ free(jobs);
+
+ return OPAL_SUCCESS;
+}
+
+static void *fdt;
+
+void __noreturn load_and_boot_kernel(bool is_reboot)
+{
+ const struct dt_property *memprop;
+ const char *cmdline, *stdoutp;
+ uint64_t mem_top;
+
+ memprop = dt_find_property(dt_root, DT_PRIVATE "maxmem");
+ if (memprop)
+ mem_top = (u64)dt_property_get_cell(memprop, 0) << 32
+ | dt_property_get_cell(memprop, 1);
+ else /* XXX HB hack, might want to calc it */
+ mem_top = 0x40000000;
+
+ op_display(OP_LOG, OP_MOD_INIT, 0x000A);
+
+ /* Load kernel LID */
+ if (!load_kernel()) {
+ op_display(OP_FATAL, OP_MOD_INIT, 1);
+ abort();
+ }
+
+ load_initramfs();
+
+ trustedboot_exit_boot_services();
+
+ ipmi_set_fw_progress_sensor(IPMI_FW_OS_BOOT);
+
+
+ if (!is_reboot) {
+ /* We wait for the nvram read to complete here so we can
+ * grab stuff from there such as the kernel arguments
+ */
+ nvram_wait_for_load();
+
+ if (!occ_sensors_init())
+ dts_sensor_create_nodes(sensor_node);
+
+ } else {
+ /* fdt will be rebuilt */
+ free(fdt);
+ fdt = NULL;
+
+ nvram_reinit();
+ occ_pstates_init();
+ }
+
+ /* Use nvram bootargs over device tree */
+ cmdline = nvram_query_safe("bootargs");
+ if (cmdline) {
+ dt_check_del_prop(dt_chosen, "bootargs");
+ dt_add_property_string(dt_chosen, "bootargs", cmdline);
+ prlog(PR_DEBUG, "INIT: Command line from NVRAM: %s\n",
+ cmdline);
+ }
+
+ op_display(OP_LOG, OP_MOD_INIT, 0x000B);
+
+ add_fast_reboot_dt_entries();
+
+ if (platform.finalise_dt)
+ platform.finalise_dt(is_reboot);
+
+ /* Create the device tree blob to boot OS. */
+ fdt = create_dtb(dt_root, false);
+ if (!fdt) {
+ op_display(OP_FATAL, OP_MOD_INIT, 2);
+ abort();
+ }
+
+ op_display(OP_LOG, OP_MOD_INIT, 0x000C);
+
+ mem_dump_free();
+
+ /* Dump the selected console */
+ stdoutp = dt_prop_get_def(dt_chosen, "linux,stdout-path", NULL);
+ prlog(PR_DEBUG, "INIT: stdout-path: %s\n", stdoutp ? stdoutp : "");
+
+ fdt_set_boot_cpuid_phys(fdt, this_cpu()->pir);
+
+ /* Check there is something there before we branch to it */
+ if (*(uint32_t *)kernel_entry == 0) {
+ prlog(PR_EMERG, "FATAL: Kernel is zeros, can't execute!\n");
+ assert(0);
+ }
+
+ if (platform.exit)
+ platform.exit();
+
+ /* Take processors out of nap */
+ cpu_set_sreset_enable(false);
+ cpu_set_ipi_enable(false);
+
+ printf("INIT: Starting kernel at 0x%llx, fdt at %p %u bytes\n",
+ kernel_entry, fdt, fdt_totalsize(fdt));
+
+ /* Disable machine checks on all */
+ cpu_disable_ME_RI_all();
+
+ patch_traps(false);
+ cpu_set_hile_mode(false); /* Clear HILE on all CPUs */
+
+ /* init MPIPL */
+ if (!is_reboot)
+ opal_mpipl_init();
+
+ checksum_romem();
+
+ debug_descriptor.state_flags |= OPAL_BOOT_COMPLETE;
+
+ cpu_give_self_os();
+
+ if (kernel_32bit)
+ start_kernel32(kernel_entry, fdt, mem_top);
+ start_kernel(kernel_entry, fdt, mem_top);
+}
+
+static void storage_keys_fixup(void)
+{
+ struct dt_node *cpus, *n;
+
+ cpus = dt_find_by_path(dt_root, "/cpus");
+ assert(cpus);
+
+ if (proc_gen == proc_gen_unknown)
+ return;
+
+ dt_for_each_child(cpus, n) {
+ /* There may be cache nodes in /cpus. */
+ if (!dt_has_node_property(n, "device_type", "cpu") ||
+ dt_has_node_property(n, "ibm,processor-storage-keys", NULL))
+ continue;
+
+ /*
+ * skiboot supports p8 & p9, both of which support the IAMR, and
+ * both of which support 32 keys. So advertise 32 keys for data
+ * accesses and 32 for instruction accesses.
+ */
+ dt_add_property_cells(n, "ibm,processor-storage-keys", 32, 32);
+ }
+}
+
+static void dt_fixups(void)
+{
+ struct dt_node *n;
+ struct dt_node *primary_lpc = NULL;
+
+ /* lpc node missing #address/size cells. Also pick one as
+ * primary for now (TBD: How to convey that from HB)
+ */
+ dt_for_each_compatible(dt_root, n, "ibm,power8-lpc") {
+ if (!primary_lpc || dt_has_node_property(n, "primary", NULL))
+ primary_lpc = n;
+ if (dt_has_node_property(n, "#address-cells", NULL))
+ break;
+ dt_add_property_cells(n, "#address-cells", 2);
+ dt_add_property_cells(n, "#size-cells", 1);
+ dt_add_property_strings(n, "status", "ok");
+ }
+
+ /* Missing "primary" property in LPC bus */
+ if (primary_lpc && !dt_has_node_property(primary_lpc, "primary", NULL))
+ dt_add_property(primary_lpc, "primary", NULL, 0);
+
+ /* Missing "scom-controller" */
+ dt_for_each_compatible(dt_root, n, "ibm,xscom") {
+ if (!dt_has_node_property(n, "scom-controller", NULL))
+ dt_add_property(n, "scom-controller", NULL, 0);
+ }
+
+ storage_keys_fixup();
+}
+
+static void add_arch_vector(void)
+{
+ /**
+ * vec5 = a PVR-list : Number-of-option-vectors :
+ * option-vectors[Number-of-option-vectors + 1]
+ */
+ uint8_t vec5[] = {0x05, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00};
+
+ if (dt_has_node_property(dt_chosen, "ibm,architecture-vec-5", NULL))
+ return;
+
+ dt_add_property(dt_chosen, "ibm,architecture-vec-5",
+ vec5, sizeof(vec5));
+}
+
+static void dt_init_misc(void)
+{
+ /* Check if there's a /chosen node, if not, add one */
+ dt_chosen = dt_find_by_path(dt_root, "/chosen");
+ if (!dt_chosen)
+ dt_chosen = dt_new(dt_root, "chosen");
+ assert(dt_chosen);
+
+ /* Add IBM architecture vectors if needed */
+ add_arch_vector();
+
+	/* Add the "OPAL virtual ICS" node */
+ add_ics_node();
+
+ /* Additional fixups. TODO: Move into platform */
+ dt_fixups();
+}
+
+static u8 console_get_level(const char *s)
+{
+ if (strcmp(s, "emerg") == 0)
+ return PR_EMERG;
+ if (strcmp(s, "alert") == 0)
+ return PR_ALERT;
+ if (strcmp(s, "crit") == 0)
+ return PR_CRIT;
+ if (strcmp(s, "err") == 0)
+ return PR_ERR;
+ if (strcmp(s, "warning") == 0)
+ return PR_WARNING;
+ if (strcmp(s, "notice") == 0)
+ return PR_NOTICE;
+ if (strcmp(s, "printf") == 0)
+ return PR_PRINTF;
+ if (strcmp(s, "info") == 0)
+ return PR_INFO;
+ if (strcmp(s, "debug") == 0)
+ return PR_DEBUG;
+ if (strcmp(s, "trace") == 0)
+ return PR_TRACE;
+ if (strcmp(s, "insane") == 0)
+ return PR_INSANE;
+ /* Assume it's a number instead */
+ return atoi(s);
+}
+
+static void console_log_level(void)
+{
+ const char *s;
+ u8 level;
+
+ /* console log level:
+ * high 4 bits in memory, low 4 bits driver (e.g. uart). */
+ s = nvram_query_safe("log-level-driver");
+ if (s) {
+ level = console_get_level(s);
+ debug_descriptor.console_log_levels =
+ (debug_descriptor.console_log_levels & 0xf0 ) |
+ (level & 0x0f);
+ prlog(PR_NOTICE, "console: Setting driver log level to %i\n",
+ level & 0x0f);
+ }
+ s = nvram_query_safe("log-level-memory");
+ if (s) {
+ level = console_get_level(s);
+ debug_descriptor.console_log_levels =
+ (debug_descriptor.console_log_levels & 0x0f ) |
+ ((level & 0x0f) << 4);
+ prlog(PR_NOTICE, "console: Setting memory log level to %i\n",
+ level & 0x0f);
+ }
+}
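+
+/*
+ * Usage sketch (illustrative, values are only an example): the two keys read
+ * above live in the "ibm,skiboot" NVRAM partition, so from a booted host
+ * something like
+ *
+ *	nvram -p ibm,skiboot --update-config log-level-driver=7
+ *	nvram -p ibm,skiboot --update-config log-level-memory=insane
+ *
+ * would raise the driver (UART) level to PR_DEBUG and the in-memory level to
+ * PR_INSANE on the next boot, using either the numeric values or the symbolic
+ * names accepted by console_get_level().
+ */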
+
+typedef void (*ctorcall_t)(void);
+
+static void __nomcount do_ctors(void)
+{
+ extern ctorcall_t __ctors_start[], __ctors_end[];
+ ctorcall_t *call;
+
+ for (call = __ctors_start; call < __ctors_end; call++)
+ (*call)();
+}
+
+#ifdef ELF_ABI_v2
+static void setup_branch_null_catcher(void)
+{
+ asm volatile( \
+ ".section .rodata" "\n\t" \
+ "3: .string \"branch to NULL\"" "\n\t" \
+ ".previous" "\n\t" \
+ ".section .trap_table,\"aw\"" "\n\t" \
+ ".llong 0" "\n\t" \
+ ".llong 3b" "\n\t" \
+ ".previous" "\n\t" \
+ );
+}
+#else
+static void branch_null(void)
+{
+ assert(0);
+}
+
+static void setup_branch_null_catcher(void)
+{
+ void (*bn)(void) = branch_null;
+
+ /*
+ * FIXME: This copies the function descriptor (16 bytes) for
+ * ABI v1 (ie. big endian). This will be broken if we ever
+ * move to ABI v2 (ie little endian)
+ */
+ memcpy_null((void *)0, bn, 16);
+}
+#endif
+
+void copy_sreset_vector(void)
+{
+ uint32_t *src, *dst;
+
+ /* Copy the reset code over the entry point. */
+ src = &reset_patch_start;
+ dst = (uint32_t *)0x100;
+ while(src < &reset_patch_end)
+ *(dst++) = *(src++);
+ sync_icache();
+}
+
+void copy_sreset_vector_fast_reboot(void)
+{
+ uint32_t *src, *dst;
+
+ /* Copy the reset code over the entry point. */
+ src = &reset_fast_reboot_patch_start;
+ dst = (uint32_t *)0x100;
+ while(src < &reset_fast_reboot_patch_end)
+ *(dst++) = *(src++);
+ sync_icache();
+}
+
+void copy_exception_vectors(void)
+{
+	/* Copy from 0x100 to EXCEPTION_VECTORS_END, avoiding the area below
+	 * 0x100 as it holds the boot flag used by CPUs still potentially entering
+ * skiboot.
+ */
+ memcpy((void *)0x100, (void *)(SKIBOOT_BASE + 0x100),
+ EXCEPTION_VECTORS_END - 0x100);
+ sync_icache();
+}
+
+/*
+ * When skiboot owns the exception vectors, patch in 'trap' for assert fails.
+ * Otherwise use assert_fail()
+ */
+void patch_traps(bool enable)
+{
+ struct trap_table_entry *tte;
+
+ for (tte = __trap_table_start; tte < __trap_table_end; tte++) {
+ uint32_t *insn;
+
+ insn = (uint32_t *)tte->address;
+ if (enable) {
+ *insn = PPC_INST_TRAP;
+ } else {
+ *insn = PPC_INST_NOP;
+ }
+ }
+
+ sync_icache();
+}
+
+static void per_thread_sanity_checks(void)
+{
+ struct cpu_thread *cpu = this_cpu();
+
+ /**
+ * @fwts-label NonZeroHRMOR
+ * @fwts-advice The contents of the hypervisor real mode offset register
+	 * (HRMOR) are bitwise ORed with the address of any hypervisor real mode
+	 * (i.e. skiboot) memory access. Skiboot does not support operating
+	 * with a non-zero HRMOR and setting it will break some things (e.g.
+ * XSCOMs) in hard-to-debug ways.
+ */
+ assert(mfspr(SPR_HRMOR) == 0);
+
+ /**
+ * @fwts-label UnknownSecondary
+	 * @fwts-advice The boot CPU attempted to call in a secondary thread
+ * without initialising the corresponding cpu_thread structure. This may
+ * happen if the HDAT or devicetree reports too few threads or cores for
+ * this processor.
+ */
+ assert(cpu->state != cpu_state_no_cpu);
+}
+
+void pci_nvram_init(void)
+{
+ const char *nvram_speed;
+
+ verbose_eeh = nvram_query_eq_safe("pci-eeh-verbose", "true");
+ if (verbose_eeh)
+ prlog(PR_INFO, "PHB: Verbose EEH enabled\n");
+
+ pcie_max_link_speed = 0;
+
+ nvram_speed = nvram_query_dangerous("pcie-max-link-speed");
+ if (nvram_speed) {
+ pcie_max_link_speed = atoi(nvram_speed);
+ prlog(PR_NOTICE, "PHB: NVRAM set max link speed to GEN%i\n",
+ pcie_max_link_speed);
+ }
+
+ pci_tracing = nvram_query_eq_safe("pci-tracing", "true");
+}
+
+static uint32_t mem_csum(void *_p, void *_e)
+{
+ size_t len = _e - _p;
+ uint32_t *p = _p;
+ uint32_t v1 = 0, v2 = 0;
+ uint32_t csum;
+ unsigned int i;
+
+ for (i = 0; i < len; i += 4) {
+ uint32_t v = *p++;
+ v1 += v;
+ v2 += v1;
+ }
+
+ csum = v1 ^ v2;
+
+ return csum;
+}
+
+static uint32_t romem_csum;
+
+static void checksum_romem(void)
+{
+ uint32_t csum;
+
+ romem_csum = 0;
+ if (chip_quirk(QUIRK_SLOW_SIM))
+ return;
+
+ csum = mem_csum(_start, _head_end);
+ romem_csum ^= csum;
+
+ csum = mem_csum(_stext, _romem_end);
+ romem_csum ^= csum;
+
+ csum = mem_csum(__builtin_kernel_start, __builtin_kernel_end);
+ romem_csum ^= csum;
+}
+
+bool verify_romem(void)
+{
+ uint32_t old = romem_csum;
+ checksum_romem();
+ if (old != romem_csum) {
+ romem_csum = old;
+ prlog(PR_NOTICE, "OPAL checksums did not match\n");
+ return false;
+ }
+ return true;
+}
+
+static void mask_pc_system_xstop(void)
+{
+ struct cpu_thread *cpu;
+ uint32_t chip_id, core_id;
+ int rc;
+
+ if (proc_gen != proc_gen_p10)
+ return;
+
+ if (chip_quirk(QUIRK_MAMBO_CALLOUTS))
+ return;
+
+ /*
+	 * On P10, mask the PC system checkstop (bit 28). This is needed
+ * for HW570622. We keep processor recovery disabled via
+ * HID[5] and mask the checkstop that it can cause. CME does
+ * the recovery handling for us.
+ */
+ for_each_cpu(cpu) {
+ chip_id = cpu->chip_id;
+ core_id = pir_to_core_id(cpu->pir);
+
+ rc = xscom_write(chip_id,
+ XSCOM_ADDR_P10_EC(core_id, P10_CORE_FIRMASK_OR),
+ PPC_BIT(28));
+ if (rc)
+ prerror("Error setting FIR MASK rc:%d on PIR:%x\n",
+ rc, cpu->pir);
+ }
+}
+
+
+/* Called from head.S, thus no prototype. */
+void __noreturn __nomcount main_cpu_entry(const void *fdt);
+
+void __noreturn __nomcount main_cpu_entry(const void *fdt)
+{
+ /*
+	 * WARNING: At this point, the timebases have
+ * *not* been synchronized yet. Do not use any timebase
+ * related functions for timeouts etc... unless you can cope
+ * with the speed being some random core clock divider and
+ * the value jumping backward when the synchronization actually
+ * happens (in chiptod_init() below).
+ *
+ * Also the current cpu_thread() struct is not initialized
+	 * either, so we need to clear it out first thing (without
+	 * putting any other useful info in there just yet), otherwise
+	 * printf and locks are going to play funny games with "con_suspend".
+ */
+ pre_init_boot_cpu();
+
+ /*
+ * Point to our mem console
+ */
+ debug_descriptor.memcons_phys = cpu_to_be64((uint64_t)&memcons);
+
+ /*
+ * Before first printk, ensure console buffer is clear or
+ * reading tools might think it has wrapped
+ */
+ clear_console();
+
+	/* Back up the previous vectors as this area could contain a kernel
+ * image.
+ */
+ memcpy_null(old_vectors, NULL, EXCEPTION_VECTORS_END);
+
+ /*
+ * Some boot firmwares enter OPAL with MSR[ME]=1, as they presumably
+ * handle machine checks until we take over. As we overwrite the
+ * previous exception vectors with our own handlers, disable MSR[ME].
+ * This could be done atomically by patching in a branch then patching
+ * it out last, but that's a lot of effort.
+ */
+ disable_machine_check();
+
+ /* Copy all vectors down to 0 */
+ copy_exception_vectors();
+
+ /* Enable trap based asserts */
+ patch_traps(true);
+
+ /*
+ * Enable MSR[ME] bit so we can take MCEs. We don't currently
+ * recover, but we print some useful information.
+ */
+ enable_machine_check();
+ mtmsrd(MSR_RI, 1);
+
+ /* Setup a NULL catcher to catch accidental NULL ptr calls */
+ setup_branch_null_catcher();
+
+ /* Call library constructors */
+ do_ctors();
+
+ prlog(PR_NOTICE, "OPAL %s%s starting...\n", version, DEBUG_STR);
+
+ prlog(PR_DEBUG, "initial console log level: memory %d, driver %d\n",
+ (debug_descriptor.console_log_levels >> 4),
+ (debug_descriptor.console_log_levels & 0x0f));
+ prlog(PR_TRACE, "OPAL is Powered By Linked-List Technology.\n");
+
+#ifdef SKIBOOT_GCOV
+ skiboot_gcov_done();
+#endif
+
+ /* Initialize boot cpu's cpu_thread struct */
+ init_boot_cpu();
+
+ /* Now locks can be used */
+ init_locks();
+
+ /* Create the OPAL call table early on, entries can be overridden
+ * later on (FSP console code for example)
+ */
+ opal_table_init();
+
+ /* Init the physical map table so we can start mapping things */
+ phys_map_init(mfspr(SPR_PVR));
+
+ /*
+ * If we are coming in with a flat device-tree, we expand it
+ * now. Else look for HDAT and create a device-tree from them
+ *
+ * Hack alert: When entering via the OPAL entry point, fdt
+ * is set to -1, we record that and pass it to parse_hdat
+ */
+
+ dt_root = dt_new_root("");
+
+ if (fdt == (void *)-1ul) {
+ if (parse_hdat(true) < 0)
+ abort();
+ } else if (fdt == NULL) {
+ if (parse_hdat(false) < 0)
+ abort();
+ } else {
+ dt_expand(fdt);
+ }
+ dt_add_cpufeatures(dt_root);
+
+ /* Now that we have a full devicetree, verify that we aren't on fire. */
+ per_thread_sanity_checks();
+
+ /*
+ * From there, we follow a fairly strict initialization order.
+ *
+ * First we need to build up our chip data structures and initialize
+	 * XSCOM which will be needed for a number of subsequent things.
+ *
+ * We want XSCOM available as early as the platform probe in case the
+ * probe requires some HW accesses.
+ *
+ * We also initialize the FSI master at that point in case we need
+ * to access chips via that path early on.
+ */
+ init_chips();
+
+ xscom_init();
+ mfsi_init();
+
+ /*
+ * Direct controls facilities provides some controls over CPUs
+ * using scoms.
+ */
+ direct_controls_init();
+
+ /*
+ * Put various bits & pieces in device-tree that might not
+ * already be there such as the /chosen node if not there yet,
+ * the ICS node, etc... This can potentially use XSCOM
+ */
+ dt_init_misc();
+
+ /*
+ * Initialize LPC (P8 and beyond) so we can get to UART, BMC and
+	 * other system controllers. This is done before probe_platform
+ * so that the platform probing code can access an external
+ * BMC if needed.
+ */
+ lpc_init();
+
+ /*
+ * This should be done before mem_region_init, so the stack
+ * region length can be set according to the maximum PIR.
+ */
+ init_cpu_max_pir();
+
+ /*
+ * Now, we init our memory map from the device-tree, and immediately
+ * reserve areas which we know might contain data coming from
+ * HostBoot. We need to do these things before we start doing
+ * allocations outside of our heap, such as chip local allocs,
+ * otherwise we might clobber those data.
+ */
+ mem_region_init();
+
+ /*
+ * Reserve memory required to capture OPAL dump. This should be done
+ * immediately after mem_region_init to avoid any clash with local
+ * memory allocation.
+ */
+ opal_mpipl_reserve_mem();
+
+ /* Reserve HOMER and OCC area */
+ homer_init();
+
+ /* Initialize the rest of the cpu thread structs */
+ init_all_cpus();
+ if (proc_gen == proc_gen_p9 || proc_gen == proc_gen_p10)
+ cpu_set_ipi_enable(true);
+
+ /* Once all CPU are up apply this workaround */
+ mask_pc_system_xstop();
+
+ /* Add the /opal node to the device-tree */
+ add_opal_node();
+
+ /*
+ * We probe the platform now. This means the platform probe gets
+ * the opportunity to reserve additional areas of memory if needed.
+ *
+ * Note: Timebases still not synchronized.
+ */
+ probe_platform();
+
+	/* Allocate our split trace buffers now. Depends on add_opal_node() */
+ init_trace_buffers();
+
+ /* On P8, get the ICPs and make sure they are in a sane state */
+ init_interrupts();
+ if (proc_gen == proc_gen_p8)
+ cpu_set_ipi_enable(true);
+
+ /* On P9 and P10, initialize XIVE */
+ if (proc_gen == proc_gen_p9)
+ init_xive();
+ else if (proc_gen == proc_gen_p10)
+ xive2_init();
+
+ /* Grab centaurs from device-tree if present (only on FSP-less) */
+ centaur_init();
+
+ /* initialize ocmb scom-controller */
+ ocmb_init();
+
+ /* Initialize PSI (depends on probe_platform being called) */
+ psi_init();
+
+ /* Initialize/enable LPC interrupts. This must be done after the
+ * PSI interface has been initialized since it serves as an interrupt
+ * source for LPC interrupts.
+ */
+ lpc_init_interrupts();
+
+ /* Call in secondary CPUs */
+ cpu_bringup();
+
+ /* We can now overwrite the 0x100 vector as we are no longer being
+ * entered there.
+ */
+ copy_sreset_vector();
+
+ /* We can now do NAP mode */
+ cpu_set_sreset_enable(true);
+
+ /*
+ * Synchronize time bases. Prior to chiptod_init() the timebase
+ * is free-running at a frequency based on the core clock rather
+ * than being synchronised to the ChipTOD network. This means
+ * that the timestamps in early boot might be a little off compared
+ * to wall clock time.
+ */
+ chiptod_init();
+
+ /* Initialize P9 DIO */
+ p9_dio_init();
+
+ /*
+ * SBE uses TB value for scheduling timer. Hence init after
+ * chiptod init
+ */
+ p9_sbe_init();
+
+ /* Initialize i2c */
+ p8_i2c_init();
+
+ /* Register routine to dispatch and read sensors */
+ sensor_init();
+
+ /*
+ * Initialize the opal messaging before platform.init as we are
+	 * Initialize the OPAL messaging before platform.init, as we may get
+	 * a request to queue an OCC load OPAL message when host services
+	 * receives a load OCC request from the FSP.
+ opal_init_msg();
+
+ /*
+ * We have initialized the basic HW, we can now call into the
+ * platform to perform subsequent inits, such as establishing
+ * communication with the FSP or starting IPMI.
+ */
+ if (platform.init)
+ platform.init();
+
+ /* Read in NVRAM and set it up */
+ nvram_init();
+
+ /* Set the console level */
+ console_log_level();
+
+ /* Secure/Trusted Boot init. We look for /ibm,secureboot in DT */
+ secureboot_init();
+ trustedboot_init();
+
+ /* Secure variables init, handled by platform */
+ if (platform.secvar_init && is_fw_secureboot())
+ platform.secvar_init();
+
+ /*
+ * BMC platforms load version information from flash after
+ * secure/trustedboot init.
+ */
+ if (platform.bmc)
+ flash_fw_version_preload();
+
+ /* preload the IMC catalog dtb */
+ imc_catalog_preload();
+
+ /* Install the OPAL Console handlers */
+ init_opal_console();
+
+ /*
+ * Some platforms set a flag to wait for SBE validation to be
+ * performed by the BMC. If this occurs it leaves the SBE in a
+ * bad state and the system will reboot at this point.
+ */
+ if (platform.seeprom_update)
+ platform.seeprom_update();
+
+ /* Init SLW related stuff, including fastsleep */
+ slw_init();
+
+ op_display(OP_LOG, OP_MOD_INIT, 0x0002);
+
+ /*
+ * On some POWER9 BMC systems, we need to initialise the OCC
+ * before the NPU to facilitate NVLink/OpenCAPI presence
+ * detection, so we set it up as early as possible. On FSP
+ * systems, Hostboot starts booting the OCC later, so we delay
+ * OCC initialisation as late as possible to give it the
+ * maximum time to boot up.
+ */
+ if (platform.bmc)
+ occ_pstates_init();
+
+ pci_nvram_init();
+
+ preload_capp_ucode();
+ start_preload_kernel();
+
+ /* Catalog decompression routine */
+ imc_decompress_catalog();
+
+ /* Virtual Accelerator Switchboard */
+ vas_init();
+
+ /* NX init */
+ nx_init();
+
+ /* Probe PHB3 on P8 */
+ probe_phb3();
+
+ /* Probe PHB4 on P9 and PHB5 on P10 */
+ probe_phb4();
+
+ /* Probe NPUs */
+ probe_npu();
+ probe_npu2();
+ probe_npu3();
+
+ /* Initialize PCI */
+ pci_init_slots();
+
+ /* Add OPAL timer related properties */
+ late_init_timers();
+
+ /* Setup ibm,firmware-versions if able */
+ if (platform.bmc) {
+ flash_dt_add_fw_version();
+ ipmi_dt_add_bmc_info();
+ }
+
+ ipmi_set_fw_progress_sensor(IPMI_FW_PCI_INIT);
+
+ /*
+ * These last few things must be done as late as possible
+ * because they rely on various other things having been setup,
+ * for example, add_opal_interrupts() will add all the interrupt
+ * sources that are going to the firmware. We can't add a new one
+ * after that call. Similarly, the mem_region calls will construct
+ * the reserve maps in the DT so we shouldn't affect the memory
+ * regions after that
+ */
+
+ /* Create the LPC bus interrupt-map on P9 */
+ lpc_finalize_interrupts();
+
+ /* Add the list of interrupts going to OPAL */
+ add_opal_interrupts();
+
+ /* Init In-Memory Collection related stuff (load the IMC dtb into memory) */
+ imc_init();
+
+ /* Disable protected execution facility in BML */
+ cpu_disable_pef();
+
+ /* export the trace buffers */
+ trace_add_dt_props();
+
+ /* Now release parts of memory nodes we haven't used ourselves... */
+ mem_region_release_unused();
+
+ /* ... and add remaining reservations to the DT */
+ mem_region_add_dt_reserved();
+
+ /*
+ * Update /ibm,secureboot/ibm,cvc/memory-region to point to
+ * /reserved-memory/secure-crypt-algo-code instead of
+ * /ibm,hostboot/reserved-memory/secure-crypt-algo-code.
+ */
+ cvc_update_reserved_memory_phandle();
+
+ prd_register_reserved_memory();
+
+ load_and_boot_kernel(false);
+}
+
+void __noreturn __secondary_cpu_entry(void)
+{
+ struct cpu_thread *cpu = this_cpu();
+
+ /* Secondary CPU called in */
+ cpu_callin(cpu);
+
+ enable_machine_check();
+ mtmsrd(MSR_RI, 1);
+
+ /* Some XIVE setup */
+ if (proc_gen == proc_gen_p9)
+ xive_cpu_callin(cpu);
+ else if (proc_gen == proc_gen_p10)
+ xive2_cpu_callin(cpu);
+
+ /* Wait for work to do */
+ while(true) {
+ if (cpu_check_jobs(cpu))
+ cpu_process_jobs();
+ else
+ cpu_idle_job();
+ }
+}
+
+/* Called from head.S, thus no prototype. */
+void __noreturn __nomcount secondary_cpu_entry(void);
+
+void __noreturn __nomcount secondary_cpu_entry(void)
+{
+ struct cpu_thread *cpu = this_cpu();
+
+ per_thread_sanity_checks();
+
+ prlog(PR_DEBUG, "INIT: CPU PIR 0x%04x called in\n", cpu->pir);
+
+ __secondary_cpu_entry();
+}
diff --git a/roms/skiboot/core/interrupts.c b/roms/skiboot/core/interrupts.c
new file mode 100644
index 000000000..0a617d385
--- /dev/null
+++ b/roms/skiboot/core/interrupts.c
@@ -0,0 +1,513 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Excuse me, you do work for me now?
+ *
+ * Copyright 2013-2019 IBM Corp.
+ */
+
+#include <skiboot.h>
+#include <chip.h>
+#include <cpu.h>
+#include <fsp.h>
+#include <interrupts.h>
+#include <opal.h>
+#include <io.h>
+#include <cec.h>
+#include <device.h>
+#include <ccan/str/str.h>
+#include <timer.h>
+#include <sbe-p8.h>
+#include <sbe-p9.h>
+
+/* ICP registers */
+#define ICP_XIRR 0x4 /* 32-bit access */
+#define ICP_CPPR 0x4 /* 8-bit access */
+#define ICP_MFRR 0xc /* 8-bit access */
+
+static LIST_HEAD(irq_sources);
+static LIST_HEAD(irq_sources2);
+static struct lock irq_lock = LOCK_UNLOCKED;
+
+void __register_irq_source(struct irq_source *is, bool secondary)
+{
+ struct irq_source *is1;
+ struct list_head *list = secondary ? &irq_sources2 : &irq_sources;
+
+ prlog(PR_DEBUG, "IRQ: Registering %04x..%04x ops @%p (data %p)%s\n",
+ is->start, is->end - 1, is->ops, is->data,
+ secondary ? " [secondary]" : "");
+
+ lock(&irq_lock);
+ list_for_each(list, is1, link) {
+ if (is->end > is1->start && is->start < is1->end) {
+ prerror("register IRQ source overlap !\n");
+ prerror(" new: %x..%x old: %x..%x\n",
+ is->start, is->end - 1,
+ is1->start, is1->end - 1);
+ assert(0);
+ }
+ }
+ list_add_tail(list, &is->link);
+ unlock(&irq_lock);
+}
+
+void register_irq_source(const struct irq_source_ops *ops, void *data,
+ uint32_t start, uint32_t count)
+{
+ struct irq_source *is;
+
+ is = zalloc(sizeof(struct irq_source));
+ assert(is);
+ is->start = start;
+ is->end = start + count;
+ is->ops = ops;
+ is->data = data;
+
+ __register_irq_source(is, false);
+}
+
+void unregister_irq_source(uint32_t start, uint32_t count)
+{
+ struct irq_source *is;
+
+ /* Note: We currently only unregister from the primary sources */
+ lock(&irq_lock);
+ list_for_each(&irq_sources, is, link) {
+ if (start >= is->start && start < is->end) {
+ if (start != is->start ||
+ count != (is->end - is->start)) {
+ prerror("unregister IRQ source mismatch !\n");
+ prerror("start:%x, count: %x match: %x..%x\n",
+ start, count, is->start, is->end);
+ assert(0);
+ }
+ list_del(&is->link);
+ unlock(&irq_lock);
+ /* XXX Add synchronize / RCU */
+ free(is);
+ return;
+ }
+ }
+ unlock(&irq_lock);
+ prerror("unregister IRQ source not found !\n");
+ prerror("start:%x, count: %x\n", start, count);
+ assert(0);
+}
+
+struct irq_source *irq_find_source(uint32_t isn)
+{
+ struct irq_source *is;
+
+ lock(&irq_lock);
+ /*
+ * XXX This really needs some kind of caching !
+ */
+ list_for_each(&irq_sources, is, link) {
+ if (isn >= is->start && isn < is->end) {
+ unlock(&irq_lock);
+ return is;
+ }
+ }
+ list_for_each(&irq_sources2, is, link) {
+ if (isn >= is->start && isn < is->end) {
+ unlock(&irq_lock);
+ return is;
+ }
+ }
+ unlock(&irq_lock);
+
+ return NULL;
+}
+
+void irq_for_each_source(void (*cb)(struct irq_source *, void *), void *data)
+{
+ struct irq_source *is;
+
+ lock(&irq_lock);
+ list_for_each(&irq_sources, is, link)
+ cb(is, data);
+ list_for_each(&irq_sources2, is, link)
+ cb(is, data);
+ unlock(&irq_lock);
+}
+
+/*
+ * This takes a 6-bit chip id and returns a 20 bit value representing
+ * the PSI interrupt. This includes all the fields above, ie, is a
+ * global interrupt number.
+ *
+ * For P8, this returns the base of the 8-interrupts block for PSI
+ */
+uint32_t get_psi_interrupt(uint32_t chip_id)
+{
+ uint32_t irq;
+
+ switch(proc_gen) {
+ case proc_gen_p8:
+ irq = p8_chip_irq_block_base(chip_id, P8_IRQ_BLOCK_MISC);
+ irq += P8_IRQ_MISC_PSI_BASE;
+ break;
+ default:
+ assert(false);
+ };
+
+ return irq;
+}
+
+
+struct dt_node *add_ics_node(void)
+{
+ struct dt_node *ics = dt_new_addr(dt_root, "interrupt-controller", 0);
+ bool has_xive;
+
+ if (!ics)
+ return NULL;
+
+ has_xive = proc_gen >= proc_gen_p9;
+
+ dt_add_property_cells(ics, "reg", 0, 0, 0, 0);
+ dt_add_property_strings(ics, "compatible",
+ has_xive ? "ibm,opal-xive-vc" : "IBM,ppc-xics",
+ "IBM,opal-xics");
+ dt_add_property_cells(ics, "#address-cells", 0);
+ dt_add_property_cells(ics, "#interrupt-cells", 2);
+ dt_add_property_string(ics, "device_type",
+ "PowerPC-Interrupt-Source-Controller");
+ dt_add_property(ics, "interrupt-controller", NULL, 0);
+
+ return ics;
+}
+
+uint32_t get_ics_phandle(void)
+{
+ struct dt_node *i;
+
+ for (i = dt_first(dt_root); i; i = dt_next(dt_root, i)) {
+ if (streq(i->name, "interrupt-controller@0")) {
+ return i->phandle;
+ }
+ }
+ abort();
+}
+
+void add_opal_interrupts(void)
+{
+ struct irq_source *is;
+ unsigned int i, ns, tns = 0, count = 0;
+ uint32_t isn;
+ __be32 *irqs = NULL;
+ char *names = NULL;
+
+ lock(&irq_lock);
+ list_for_each(&irq_sources, is, link) {
+ /*
+ * Don't even consider sources that don't have an interrupts
+ * callback or don't have an attributes one.
+ */
+ if (!is->ops->interrupt || !is->ops->attributes)
+ continue;
+ for (isn = is->start; isn < is->end; isn++) {
+ uint64_t attr = is->ops->attributes(is, isn);
+ uint32_t iflags;
+ char *name;
+
+ if (attr & IRQ_ATTR_TARGET_LINUX)
+ continue;
+ if (attr & IRQ_ATTR_TYPE_MSI)
+ iflags = 0;
+ else
+ iflags = 1;
+ name = is->ops->name ? is->ops->name(is, isn) : NULL;
+ ns = name ? strlen(name) : 0;
+ prlog(PR_DEBUG, "irq %x name: %s %s\n",
+ isn,
+ name ? name : "<null>",
+ iflags ? "[level]" : "[edge]");
+ names = realloc(names, tns + ns + 1);
+ if (name) {
+ strcpy(names + tns, name);
+ tns += (ns + 1);
+ free(name);
+ } else
+ names[tns++] = 0;
+ i = count++;
+ irqs = realloc(irqs, 8 * count);
+ irqs[i*2] = cpu_to_be32(isn);
+ irqs[i*2+1] = cpu_to_be32(iflags);
+ }
+ }
+ unlock(&irq_lock);
+
+ /* First create the standard "interrupts" property and the
+ * corresponding names property
+ */
+ dt_add_property_cells(opal_node, "interrupt-parent", get_ics_phandle());
+ dt_add_property(opal_node, "interrupts", irqs, count * 8);
+ dt_add_property(opal_node, "opal-interrupts-names", names, tns);
+ dt_add_property(opal_node, "interrupt-names", names, tns);
+
+ /* Now "reduce" it to the old style "opal-interrupts" property
+ * format by stripping out the flags. The "opal-interrupts"
+ * property has one cell per interrupt, it is not a standard
+ * "interrupt" property.
+ *
+ * Note: Even if empty, create it, otherwise some bogus error
+ * handling in Linux can cause problems.
+ */
+ for (i = 1; i < count; i++)
+ irqs[i] = irqs[i * 2];
+ dt_add_property(opal_node, "opal-interrupts", irqs, count * 4);
+
+ free(irqs);
+ free(names);
+}
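+
+/*
+ * For illustration only (the interrupt numbers and names below are
+ * hypothetical), the properties built above end up in the device tree
+ * roughly as:
+ *
+ *	ibm,opal {
+ *		interrupt-parent = <&ics_phandle>;
+ *		interrupts = <0x10 1  0x11 0>;        // <isn flags> pairs
+ *		opal-interrupts = <0x10 0x11>;        // legacy form, flags stripped
+ *		opal-interrupts-names = "psi", ...;   // NUL-separated names
+ *		interrupt-names = "psi", ...;
+ *	};
+ */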
+
+/*
+ * This is called at init time (and one fast reboot) to sanitize the
+ * ICP. We set our priority to 0 to mask all interrupts and make sure
+ * no IPI is on the way. This is also called on wakeup from nap
+ */
+void reset_cpu_icp(void)
+{
+ void *icp = this_cpu()->icp_regs;
+
+ if (!icp)
+ return;
+
+ /* Dummy fetch */
+ in_be32(icp + ICP_XIRR);
+
+ /* Clear pending IPIs */
+ out_8(icp + ICP_MFRR, 0xff);
+
+ /* Set priority to max, ignore all incoming interrupts, EOI IPIs */
+ out_be32(icp + ICP_XIRR, 2);
+}
+
+/* Used by the PSI code to send an EOI during reset. This will also
+ * set the CPPR to 0 which should already be the case anyway
+ */
+void icp_send_eoi(uint32_t interrupt)
+{
+ void *icp = this_cpu()->icp_regs;
+
+ if (!icp)
+ return;
+
+ /* Set priority to max, ignore all incoming interrupts */
+ out_be32(icp + ICP_XIRR, interrupt & 0xffffff);
+}
+
+/* This is called before winkle or nap, we clear pending IPIs and
+ * set our priority to 1 to mask all but the IPI.
+ */
+void icp_prep_for_pm(void)
+{
+ void *icp = this_cpu()->icp_regs;
+
+ if (!icp)
+ return;
+
+ /* Clear pending IPIs */
+ out_8(icp + ICP_MFRR, 0xff);
+
+ /* Set priority to 1, ignore all incoming interrupts, EOI IPIs */
+ out_be32(icp + ICP_XIRR, 0x01000002);
+}
+
+/* This is called to wakeup somebody from winkle */
+void icp_kick_cpu(struct cpu_thread *cpu)
+{
+ void *icp = cpu->icp_regs;
+
+ if (!icp)
+ return;
+
+ /* Send high priority IPI */
+ out_8(icp + ICP_MFRR, 0);
+}
+
+/* Returns the number of chip ID bits used for interrupt numbers */
+static uint32_t p8_chip_id_bits(uint32_t chip)
+{
+ struct proc_chip *proc_chip = get_chip(chip);
+
+ assert(proc_chip);
+ switch (proc_chip->type) {
+ case PROC_CHIP_P8_MURANO:
+ case PROC_CHIP_P8_VENICE:
+ return 6;
+ break;
+
+ case PROC_CHIP_P8_NAPLES:
+ return 5;
+ break;
+
+ default:
+ /* This shouldn't be called on non-P8 based systems */
+ assert(0);
+ return 0;
+ break;
+ }
+}
+
+/* The chip id mask is the upper p8_chip_id_bits of the irq number */
+static uint32_t chip_id_mask(uint32_t chip)
+{
+ uint32_t chip_id_bits = p8_chip_id_bits(chip);
+ uint32_t chip_id_mask;
+
+ chip_id_mask = ((1 << chip_id_bits) - 1);
+ chip_id_mask <<= P8_IRQ_BITS - chip_id_bits;
+ return chip_id_mask;
+}
+
+/* The block mask is what remains of the 19 bit irq number after
+ * removing the upper 5 or 6 bits for the chip# and the lower 11 bits
+ * for the number of bits per block. */
+static uint32_t block_mask(uint32_t chip)
+{
+ uint32_t chip_id_bits = p8_chip_id_bits(chip);
+ uint32_t irq_block_mask;
+
+ irq_block_mask = P8_IRQ_BITS - chip_id_bits - P8_IVE_BITS;
+ irq_block_mask = ((1 << irq_block_mask) - 1) << P8_IVE_BITS;
+ return irq_block_mask;
+}
+
+uint32_t p8_chip_irq_block_base(uint32_t chip, uint32_t block)
+{
+ uint32_t irq;
+
+ assert(chip < (1 << p8_chip_id_bits(chip)));
+ irq = SETFIELD(chip_id_mask(chip), 0, chip);
+ irq = SETFIELD(block_mask(chip), irq, block);
+
+ return irq;
+}
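+
+/*
+ * Worked example (illustrative, derived from the masks above): on a chip
+ * with 6 chip-id bits, the 19-bit interrupt number splits into
+ * | chip (6 bits) | block (2 bits) | ive (11 bits) |, so chip_id_mask()
+ * is 0x7e000, block_mask() is 0x01800, and p8_chip_irq_block_base(1, 2)
+ * yields (1 << 13) | (2 << 11) == 0x3000.
+ */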
+
+uint32_t p8_chip_irq_phb_base(uint32_t chip, uint32_t phb)
+{
+ assert(chip < (1 << p8_chip_id_bits(chip)));
+
+ return p8_chip_irq_block_base(chip, phb + P8_IRQ_BLOCK_PHB_BASE);
+}
+
+uint32_t p8_irq_to_chip(uint32_t irq)
+{
+ /* This assumes we only have one type of cpu in a system,
+ * which should be ok. */
+ return GETFIELD(chip_id_mask(this_cpu()->chip_id), irq);
+}
+
+uint32_t p8_irq_to_block(uint32_t irq)
+{
+ return GETFIELD(block_mask(this_cpu()->chip_id), irq);
+}
+
+uint32_t p8_irq_to_phb(uint32_t irq)
+{
+ return p8_irq_to_block(irq) - P8_IRQ_BLOCK_PHB_BASE;
+}
+
+bool __irq_source_eoi(struct irq_source *is, uint32_t isn)
+{
+ if (!is->ops->eoi)
+ return false;
+
+ is->ops->eoi(is, isn);
+ return true;
+}
+
+bool irq_source_eoi(uint32_t isn)
+{
+ struct irq_source *is = irq_find_source(isn);
+
+ if (!is)
+ return false;
+
+ return __irq_source_eoi(is, isn);
+}
+
+static int64_t opal_set_xive(uint32_t isn, uint16_t server, uint8_t priority)
+{
+ struct irq_source *is = irq_find_source(isn);
+
+ if (!is || !is->ops->set_xive)
+ return OPAL_PARAMETER;
+
+ return is->ops->set_xive(is, isn, server, priority);
+}
+opal_call(OPAL_SET_XIVE, opal_set_xive, 3);
+
+static int64_t opal_get_xive(uint32_t isn, __be16 *server, uint8_t *priority)
+{
+ struct irq_source *is = irq_find_source(isn);
+ uint16_t s;
+ int64_t ret;
+
+ if (!opal_addr_valid(server))
+ return OPAL_PARAMETER;
+
+ if (!is || !is->ops->get_xive)
+ return OPAL_PARAMETER;
+
+ ret = is->ops->get_xive(is, isn, &s, priority);
+ *server = cpu_to_be16(s);
+ return ret;
+}
+opal_call(OPAL_GET_XIVE, opal_get_xive, 3);
+
+static int64_t opal_handle_interrupt(uint32_t isn, __be64 *outstanding_event_mask)
+{
+ struct irq_source *is = irq_find_source(isn);
+ int64_t rc = OPAL_SUCCESS;
+
+ if (!opal_addr_valid(outstanding_event_mask))
+ return OPAL_PARAMETER;
+
+ /* No source ? return */
+ if (!is || !is->ops->interrupt) {
+ rc = OPAL_PARAMETER;
+ goto bail;
+ }
+
+ /* Run it */
+ is->ops->interrupt(is, isn);
+
+ /* Check timers if SBE timer isn't working */
+ if (!p8_sbe_timer_ok() && !p9_sbe_timer_ok())
+ check_timers(true);
+
+ /* Update output events */
+ bail:
+ if (outstanding_event_mask)
+ *outstanding_event_mask = cpu_to_be64(opal_pending_events);
+
+ return rc;
+}
+opal_call(OPAL_HANDLE_INTERRUPT, opal_handle_interrupt, 2);
+
+void init_interrupts(void)
+{
+ struct dt_node *icp;
+ const struct dt_property *sranges;
+ struct cpu_thread *cpu;
+ u32 base, count, i;
+ u64 addr, size;
+
+ dt_for_each_compatible(dt_root, icp, "ibm,ppc-xicp") {
+ sranges = dt_require_property(icp,
+ "ibm,interrupt-server-ranges",
+ -1);
+ base = dt_get_number(sranges->prop, 1);
+ count = dt_get_number(sranges->prop + 4, 1);
+ for (i = 0; i < count; i++) {
+ addr = dt_get_address(icp, i, &size);
+ cpu = find_cpu_by_server(base + i);
+ if (cpu)
+ cpu->icp_regs = (void *)addr;
+ }
+ }
+}
+
diff --git a/roms/skiboot/core/ipmi-opal.c b/roms/skiboot/core/ipmi-opal.c
new file mode 100644
index 000000000..cc45b409b
--- /dev/null
+++ b/roms/skiboot/core/ipmi-opal.c
@@ -0,0 +1,138 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * IPMI OPAL calls
+ *
+ * Copyright 2013-2018 IBM Corp.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+#include <ipmi.h>
+#include <lock.h>
+#include <opal.h>
+#include <device.h>
+#include <ccan/list/list.h>
+
+static struct lock msgq_lock = LOCK_UNLOCKED;
+static struct list_head msgq = LIST_HEAD_INIT(msgq);
+
+static void opal_send_complete(struct ipmi_msg *msg)
+{
+ lock(&msgq_lock);
+ list_add_tail(&msgq, &msg->link);
+ opal_update_pending_evt(ipmi_backend->opal_event_ipmi_recv,
+ ipmi_backend->opal_event_ipmi_recv);
+ unlock(&msgq_lock);
+}
+
+static int64_t opal_ipmi_send(uint64_t interface,
+ struct opal_ipmi_msg *opal_ipmi_msg, uint64_t msg_len)
+{
+ struct ipmi_msg *msg;
+
+ if (opal_ipmi_msg->version != OPAL_IPMI_MSG_FORMAT_VERSION_1) {
+ prerror("OPAL IPMI: Incorrect version\n");
+ return OPAL_UNSUPPORTED;
+ }
+
+ msg_len -= sizeof(struct opal_ipmi_msg);
+ if (msg_len > IPMI_MAX_REQ_SIZE) {
+ prerror("OPAL IPMI: Invalid request length\n");
+ return OPAL_PARAMETER;
+ }
+
+ prlog(PR_TRACE, "opal_ipmi_send(cmd: 0x%02x netfn: 0x%02x len: 0x%02llx)\n",
+ opal_ipmi_msg->cmd, opal_ipmi_msg->netfn >> 2, msg_len);
+
+ msg = ipmi_mkmsg(interface,
+ IPMI_CODE(opal_ipmi_msg->netfn >> 2, opal_ipmi_msg->cmd),
+ opal_send_complete, NULL, opal_ipmi_msg->data,
+ msg_len, IPMI_MAX_RESP_SIZE);
+ if (!msg)
+ return OPAL_RESOURCE;
+
+ msg->complete = opal_send_complete;
+ msg->error = opal_send_complete;
+ return ipmi_queue_msg(msg);
+}
+
+static int64_t opal_ipmi_recv(uint64_t interface,
+ struct opal_ipmi_msg *opal_ipmi_msg, __be64 *msg_len)
+{
+ struct ipmi_msg *msg;
+ int64_t rc;
+
+ lock(&msgq_lock);
+ msg = list_top(&msgq, struct ipmi_msg, link);
+
+ if (!msg) {
+ rc = OPAL_EMPTY;
+ goto out_unlock;
+ }
+
+ if (opal_ipmi_msg->version != OPAL_IPMI_MSG_FORMAT_VERSION_1) {
+ prerror("OPAL IPMI: Incorrect version\n");
+ rc = OPAL_UNSUPPORTED;
+ goto out_del_msg;
+ }
+
+ if (interface != IPMI_DEFAULT_INTERFACE) {
+ prerror("IPMI: Invalid interface 0x%llx in opal_ipmi_recv\n", interface);
+ rc = OPAL_PARAMETER;
+ goto out_del_msg;
+ }
+
+ if (be64_to_cpu(*msg_len) - sizeof(struct opal_ipmi_msg) < msg->resp_size + 1) {
+ rc = OPAL_RESOURCE;
+ goto out_del_msg;
+ }
+
+ list_del(&msg->link);
+ if (list_empty(&msgq))
+ opal_update_pending_evt(ipmi_backend->opal_event_ipmi_recv, 0);
+ unlock(&msgq_lock);
+
+ opal_ipmi_msg->cmd = msg->cmd;
+ opal_ipmi_msg->netfn = msg->netfn;
+ opal_ipmi_msg->data[0] = msg->cc;
+ memcpy(&opal_ipmi_msg->data[1], msg->data, msg->resp_size);
+
+ prlog(PR_TRACE, "opal_ipmi_recv(cmd: 0x%02x netfn: 0x%02x resp_size: 0x%02x)\n",
+ msg->cmd, msg->netfn >> 2, msg->resp_size);
+
+ /* Add one as the completion code is returned in the message data */
+ *msg_len = cpu_to_be64(msg->resp_size + sizeof(struct opal_ipmi_msg) + 1);
+ ipmi_free_msg(msg);
+
+ return OPAL_SUCCESS;
+
+out_del_msg:
+ list_del(&msg->link);
+ if (list_empty(&msgq))
+ opal_update_pending_evt(ipmi_backend->opal_event_ipmi_recv, 0);
+ ipmi_free_msg(msg);
+out_unlock:
+ unlock(&msgq_lock);
+ return rc;
+}
+
+void ipmi_opal_init(void)
+{
+ struct dt_node *opal_ipmi, *opal_event = NULL;
+
+ opal_ipmi = dt_new(opal_node, "ipmi");
+ dt_add_property_strings(opal_ipmi, "compatible", "ibm,opal-ipmi");
+ dt_add_property_cells(opal_ipmi, "ibm,ipmi-interface-id",
+ IPMI_DEFAULT_INTERFACE);
+ dt_add_property_cells(opal_ipmi, "interrupts",
+ ilog2(ipmi_backend->opal_event_ipmi_recv));
+
+ if (proc_gen >= proc_gen_p9)
+ opal_event = dt_find_by_name(opal_node, "event");
+ if (opal_event)
+ dt_add_property_cells(opal_ipmi, "interrupt-parent",
+ opal_event->phandle);
+
+ opal_register(OPAL_IPMI_SEND, opal_ipmi_send, 3);
+ opal_register(OPAL_IPMI_RECV, opal_ipmi_recv, 3);
+}
diff --git a/roms/skiboot/core/ipmi.c b/roms/skiboot/core/ipmi.c
new file mode 100644
index 000000000..bbc1a7b69
--- /dev/null
+++ b/roms/skiboot/core/ipmi.c
@@ -0,0 +1,263 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * in-band IPMI, probably over bt (or via FSP mbox on FSP)
+ *
+ * Copyright 2013-2019 IBM Corp.
+ */
+
+#include <stdio.h>
+#include <string.h>
+#include <bt.h>
+#include <ipmi.h>
+#include <opal.h>
+#include <device.h>
+#include <skiboot.h>
+#include <lock.h>
+#include <cpu.h>
+#include <timebase.h>
+#include <debug_descriptor.h>
+
+struct ipmi_backend *ipmi_backend = NULL;
+static struct lock sync_lock = LOCK_UNLOCKED;
+static struct ipmi_msg *sync_msg = NULL;
+
+void ipmi_free_msg(struct ipmi_msg *msg)
+{
+ /* ipmi_free_msg frees messages allocated by the
+ * backend. Without a backend we couldn't have allocated
+ * messages to free (we don't support removing backends
+ * yet). */
+ if (!ipmi_present()) {
+ prerror("IPMI: Trying to free message without backend\n");
+ return;
+ }
+
+ msg->backend->free_msg(msg);
+}
+
+void ipmi_init_msg(struct ipmi_msg *msg, int interface,
+ uint32_t code, void (*complete)(struct ipmi_msg *),
+ void *user_data, size_t req_size, size_t resp_size)
+{
+ /* We don't actually support multiple interfaces at the moment. */
+ assert(interface == IPMI_DEFAULT_INTERFACE);
+
+ msg->backend = ipmi_backend;
+ msg->cmd = IPMI_CMD(code);
+ msg->netfn = IPMI_NETFN(code) << 2;
+ msg->req_size = req_size;
+ msg->resp_size = resp_size;
+ msg->complete = complete;
+ msg->user_data = user_data;
+}
+
+struct ipmi_msg *ipmi_mkmsg_simple(uint32_t code, void *req_data, size_t req_size)
+{
+ return ipmi_mkmsg(IPMI_DEFAULT_INTERFACE, code, ipmi_free_msg, NULL,
+ req_data, req_size, 0);
+}
+
+struct ipmi_msg *ipmi_mkmsg(int interface, uint32_t code,
+ void (*complete)(struct ipmi_msg *),
+ void *user_data, void *req_data, size_t req_size,
+ size_t resp_size)
+{
+ struct ipmi_msg *msg;
+
+ if (!ipmi_present())
+ return NULL;
+
+ msg = ipmi_backend->alloc_msg(req_size, resp_size);
+ if (!msg)
+ return NULL;
+
+ ipmi_init_msg(msg, interface, code, complete, user_data, req_size,
+ resp_size);
+
+	/* Commands are free to override this if they want to handle errors */
+ msg->error = ipmi_free_msg;
+
+ if (req_data)
+ memcpy(msg->data, req_data, req_size);
+
+ return msg;
+}
+
+int ipmi_queue_msg_head(struct ipmi_msg *msg)
+{
+ if (!ipmi_present())
+ return OPAL_HARDWARE;
+
+ if (!msg) {
+ prerror("%s: Attempting to queue NULL message\n", __func__);
+ return OPAL_PARAMETER;
+ }
+
+ return msg->backend->queue_msg_head(msg);
+}
+
+int ipmi_queue_msg(struct ipmi_msg *msg)
+{
+ /* Here we could choose which interface to use if we want to support
+ multiple interfaces. */
+ if (!ipmi_present())
+ return OPAL_HARDWARE;
+
+ if (!msg) {
+ prerror("%s: Attempting to queue NULL message\n", __func__);
+ return OPAL_PARAMETER;
+ }
+
+ return msg->backend->queue_msg(msg);
+}
+
+int ipmi_dequeue_msg(struct ipmi_msg *msg)
+{
+ if (!ipmi_present())
+ return OPAL_HARDWARE;
+
+ if (!msg) {
+ prerror("%s: Attempting to dequeue NULL message\n", __func__);
+ return OPAL_PARAMETER;
+ }
+
+ return msg->backend->dequeue_msg(msg);
+}
+
+void ipmi_cmd_done(uint8_t cmd, uint8_t netfn, uint8_t cc, struct ipmi_msg *msg)
+{
+ msg->cc = cc;
+ if (msg->cmd != cmd) {
+ prerror("IPMI: Incorrect cmd 0x%02x in response\n", cmd);
+ cc = IPMI_ERR_UNSPECIFIED;
+ }
+
+ if ((msg->netfn >> 2) + 1 != (netfn >> 2)) {
+ prerror("IPMI: Incorrect netfn 0x%02x in response\n", netfn >> 2);
+ cc = IPMI_ERR_UNSPECIFIED;
+ }
+ msg->netfn = netfn;
+
+ if (cc != IPMI_CC_NO_ERROR) {
+ prlog(PR_DEBUG, "IPMI: Got error response. cmd=0x%x, netfn=0x%x,"
+ " rc=0x%02x\n", msg->cmd, msg->netfn >> 2, msg->cc);
+
+ assert(msg->error);
+ msg->error(msg);
+ } else if (msg->complete)
+ msg->complete(msg);
+
+	/* At this point the message should have been freed by the
+	 * completion functions. */
+
+ /* If this is a synchronous message flag that we are done */
+ if (msg == sync_msg) {
+ sync_msg = NULL;
+ barrier();
+ }
+}
+
+void ipmi_queue_msg_sync(struct ipmi_msg *msg)
+{
+	void (*poll)(void);
+
+	if (!ipmi_present())
+		return;
+
+	if (!msg) {
+		prerror("%s: Attempting to queue NULL message\n", __func__);
+		return;
+	}
+
+	/* Only dereference the message once we know it is non-NULL */
+	poll = msg->backend->poll;
+
+ lock(&sync_lock);
+ while (sync_msg);
+ sync_msg = msg;
+ if (msg->backend->disable_retry && !opal_booting())
+ msg->backend->disable_retry(msg);
+ ipmi_queue_msg_head(msg);
+ unlock(&sync_lock);
+
+ /*
+ * BT response handling relies on a timer. We can't just run all
+ * timers because we may have been called with a lock that a timer
+ * wants, and they're generally not written to cope with that.
+ * So, just run whatever the IPMI backend needs to make forward
+ * progress.
+ */
+ while (sync_msg == msg) {
+ if (poll)
+ poll();
+ time_wait_ms(10);
+ }
+}
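+
+/*
+ * Usage sketch (illustrative only; my_done_cb, resp_len and the command code
+ * below are made-up placeholders): a caller needing a blocking round trip
+ * would do something like
+ *
+ *	msg = ipmi_mkmsg(IPMI_DEFAULT_INTERFACE, IPMI_CODE(0x06, 0x01),
+ *			 my_done_cb, NULL, NULL, 0, resp_len);
+ *	ipmi_queue_msg_sync(msg);
+ *
+ * which only returns once ipmi_cmd_done() has run the completion (or error)
+ * handler and cleared sync_msg.
+ */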
+
+static void ipmi_read_event_complete(struct ipmi_msg *msg)
+{
+ prlog(PR_DEBUG, "IPMI read event %02x complete: %d bytes. cc: %02x\n",
+ msg->cmd, msg->resp_size, msg->cc);
+
+ /* Handle power control & PNOR handshake events */
+ ipmi_parse_sel(msg);
+
+ ipmi_free_msg(msg);
+}
+
+static void ipmi_get_message_flags_complete(struct ipmi_msg *msg)
+{
+ uint8_t flags = msg->data[0];
+
+ ipmi_free_msg(msg);
+
+ prlog(PR_DEBUG, "IPMI Get Message Flags: %02x\n", flags);
+
+ /* Once we see an interrupt we assume the payload has
+	 * booted. We disable the wdt and let the OS set up its own
+ * wdt.
+ *
+ * This is also where we consider the OS to be booted, so we set
+ * the boot count sensor */
+ if (flags & IPMI_MESSAGE_FLAGS_WATCHDOG_PRE_TIMEOUT) {
+ ipmi_wdt_stop();
+ ipmi_set_boot_count();
+ }
+
+ /* Message available in the event buffer? Queue a Read Event command
+ * to retrieve it. The flag is cleared by performing a read */
+ if (flags & IPMI_MESSAGE_FLAGS_EVENT_BUFFER) {
+ msg = ipmi_mkmsg(IPMI_DEFAULT_INTERFACE, IPMI_READ_EVENT,
+ ipmi_read_event_complete, NULL, NULL, 0, 16);
+ ipmi_queue_msg(msg);
+ }
+}
+
+void ipmi_sms_attention(void)
+{
+ struct ipmi_msg *msg;
+
+ if (!ipmi_present())
+ return;
+
+ /* todo: when we handle multiple IPMI interfaces, we'll need to
+ * ensure that this message is associated with the appropriate
+ * backend. */
+ msg = ipmi_mkmsg(IPMI_DEFAULT_INTERFACE, IPMI_GET_MESSAGE_FLAGS,
+ ipmi_get_message_flags_complete, NULL, NULL, 0, 1);
+
+ ipmi_queue_msg(msg);
+}
+
+void ipmi_register_backend(struct ipmi_backend *backend)
+{
+ /* We only support one backend at the moment */
+ assert(backend->alloc_msg);
+ assert(backend->free_msg);
+ assert(backend->queue_msg);
+ assert(backend->dequeue_msg);
+ ipmi_backend = backend;
+ ipmi_backend->opal_event_ipmi_recv = opal_dynamic_event_alloc();
+}
+
+bool ipmi_present(void)
+{
+ return ipmi_backend != NULL;
+}
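Backend registration sketch: ipmi_register_backend() asserts that the four message callbacks are populated, and poll (optional) is what lets ipmi_queue_msg_sync() make forward progress. The my_* helpers below are invented; their exact prototypes are whatever struct ipmi_backend in ipmi.h declares:

        static struct ipmi_backend my_ipmi_backend = {
                .alloc_msg   = my_alloc_msg,    /* invented helpers */
                .free_msg    = my_free_msg,
                .queue_msg   = my_queue_msg,
                .dequeue_msg = my_dequeue_msg,
                .poll        = my_poll,         /* drives ipmi_queue_msg_sync() */
        };

        void my_ipmi_init(void)
        {
                ipmi_register_backend(&my_ipmi_backend);
        }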
diff --git a/roms/skiboot/core/lock.c b/roms/skiboot/core/lock.c
new file mode 100644
index 000000000..f0ab595b1
--- /dev/null
+++ b/roms/skiboot/core/lock.c
@@ -0,0 +1,336 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Simple spinlock
+ *
+ * Copyright 2013-2019 IBM Corp.
+ */
+
+#include <skiboot.h>
+#include <lock.h>
+#include <assert.h>
+#include <processor.h>
+#include <cpu.h>
+#include <console.h>
+#include <timebase.h>
+
+/* Set to bust locks. Note that this is initialized to true because our
+ * lock debugging code will not work until the per-CPU data has been
+ * initialized.
+ */
+bool bust_locks = true;
+
+#define LOCK_TIMEOUT_MS 5000
+
+#ifdef DEBUG_LOCKS
+
+static void __nomcount lock_error(struct lock *l, const char *reason, uint16_t err)
+{
+ fprintf(stderr, "LOCK ERROR: %s @%p (state: 0x%016llx)\n",
+ reason, l, l->lock_val);
+ op_display(OP_FATAL, OP_MOD_LOCK, err);
+
+ abort();
+}
+
+static inline void __nomcount lock_check(struct lock *l)
+{
+ if ((l->lock_val & 1) && (l->lock_val >> 32) == this_cpu()->pir)
+ lock_error(l, "Invalid recursive lock", 0);
+}
+
+static inline void __nomcount unlock_check(struct lock *l)
+{
+ if (!(l->lock_val & 1))
+ lock_error(l, "Unlocking unlocked lock", 1);
+
+ if ((l->lock_val >> 32) != this_cpu()->pir)
+ lock_error(l, "Unlocked non-owned lock", 2);
+
+ if (l->in_con_path && this_cpu()->con_suspend == 0)
+ lock_error(l, "Unlock con lock with console not suspended", 3);
+
+ if (list_empty(&this_cpu()->locks_held))
+ lock_error(l, "Releasing lock we don't hold depth", 4);
+}
+
+static inline bool __nomcount __try_lock(struct cpu_thread *cpu, struct lock *l)
+{
+ uint64_t val;
+
+ val = cpu->pir;
+ val <<= 32;
+ val |= 1;
+
+ barrier();
+ if (__cmpxchg64(&l->lock_val, 0, val) == 0) {
+ sync();
+ return true;
+ }
+ return false;
+}
+
+static inline bool lock_timeout(unsigned long start)
+{
+ /* Return true if the lock has been spinning for more than LOCK_TIMEOUT_MS */
+ unsigned long wait = tb_to_msecs(mftb());
+
+ if (wait - start > LOCK_TIMEOUT_MS) {
+ /*
+ * If the timebase is invalid, we shouldn't
+ * throw an error. This is possible with pending HMIs
+ * that need to recover TB.
+ */
+ if (!(mfspr(SPR_TFMR) & SPR_TFMR_TB_VALID))
+ return false;
+ return true;
+ }
+
+ return false;
+}
+#else
+static inline void lock_check(struct lock *l) { };
+static inline void unlock_check(struct lock *l) { };
+static inline bool lock_timeout(unsigned long s) { return false; }
+#endif /* DEBUG_LOCKS */
+
+#if defined(DEADLOCK_CHECKER) && defined(DEBUG_LOCKS)
+
+static struct lock dl_lock = {
+ .lock_val = 0,
+ .in_con_path = true,
+ .owner = LOCK_CALLER
+};
+
+/* Find circular dependencies in the lock requests. */
+static __nomcount inline bool check_deadlock(void)
+{
+ uint32_t lock_owner, start, i;
+ struct cpu_thread *next_cpu;
+ struct lock *next;
+
+ next = this_cpu()->requested_lock;
+ start = this_cpu()->pir;
+ i = 0;
+
+ while (i < cpu_max_pir) {
+
+ if (!next)
+ return false;
+
+ if (!(next->lock_val & 1) || next->in_con_path)
+ return false;
+
+ lock_owner = next->lock_val >> 32;
+
+ if (lock_owner == start)
+ return true;
+
+ next_cpu = find_cpu_by_pir_nomcount(lock_owner);
+
+ if (!next_cpu)
+ return false;
+
+ next = next_cpu->requested_lock;
+ i++;
+ }
+
+ return false;
+}
+
+static void add_lock_request(struct lock *l)
+{
+ struct cpu_thread *curr = this_cpu();
+ bool dead;
+
+ if (curr->state != cpu_state_active &&
+ curr->state != cpu_state_os)
+ return;
+
+ /*
+ * For deadlock detection we must keep the lock states constant
+ * while doing the deadlock check. However we need to avoid
+ * clashing with the stack checker, so no mcount and use an
+ * inline implementation of the lock for the dl_lock
+ */
+ for (;;) {
+ if (__try_lock(curr, &dl_lock))
+ break;
+ smt_lowest();
+ while (dl_lock.lock_val)
+ barrier();
+ smt_medium();
+ }
+
+ curr->requested_lock = l;
+
+ dead = check_deadlock();
+
+ lwsync();
+ dl_lock.lock_val = 0;
+
+ if (dead)
+ lock_error(l, "Deadlock detected", 0);
+}
+
+static void remove_lock_request(void)
+{
+ this_cpu()->requested_lock = NULL;
+}
+#else
+static inline void add_lock_request(struct lock *l) { };
+static inline void remove_lock_request(void) { };
+#endif /* #if defined(DEADLOCK_CHECKER) && defined(DEBUG_LOCKS) */
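For reference, the pattern check_deadlock() is designed to catch is the classic two-CPU lock inversion; lock_a/lock_b below are invented names and the sequence is only an illustration:

        /*
         *   CPU0: lock(&lock_a); lock(&lock_b);  <- spins, requested_lock = &lock_b
         *   CPU1: lock(&lock_b); lock(&lock_a);  <- spins, requested_lock = &lock_a
         *
         * Following lock owner -> requested_lock from either CPU leads back to the
         * starting PIR, so add_lock_request() reports "Deadlock detected".
         */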
+
+bool lock_held_by_me(struct lock *l)
+{
+ uint64_t pir64 = this_cpu()->pir;
+
+ return l->lock_val == ((pir64 << 32) | 1);
+}
+
+bool try_lock_caller(struct lock *l, const char *owner)
+{
+ struct cpu_thread *cpu = this_cpu();
+
+ if (bust_locks)
+ return true;
+
+ if (l->in_con_path)
+ cpu->con_suspend++;
+ if (__try_lock(cpu, l)) {
+ l->owner = owner;
+
+#ifdef DEBUG_LOCKS_BACKTRACE
+ backtrace_create(l->bt_buf, LOCKS_BACKTRACE_MAX_ENTS,
+ &l->bt_metadata);
+#endif
+
+ list_add(&cpu->locks_held, &l->list);
+ return true;
+ }
+ if (l->in_con_path)
+ cpu->con_suspend--;
+ return false;
+}
+
+void lock_caller(struct lock *l, const char *owner)
+{
+ bool timeout_warn = false;
+ unsigned long start = 0;
+
+ if (bust_locks)
+ return;
+
+ lock_check(l);
+
+ if (try_lock_caller(l, owner))
+ return;
+ add_lock_request(l);
+
+#ifdef DEBUG_LOCKS
+ /*
+ * Ensure that we get a valid start value
+ * as we may be handling TFMR errors and taking
+ * a lock to do so, so timebase could be garbage
+ */
+ if (mfspr(SPR_TFMR) & SPR_TFMR_TB_VALID)
+ start = tb_to_msecs(mftb());
+#endif
+
+ for (;;) {
+ if (try_lock_caller(l, owner))
+ break;
+ smt_lowest();
+ while (l->lock_val)
+ barrier();
+ smt_medium();
+
+ if (start && !timeout_warn && lock_timeout(start)) {
+ /*
+ * Holding the lock request while printing a
+ * timeout warning and taking console locks can
+ * result in a deadlock false positive if the lock
+ * owner tries to take the console lock, so drop it.
+ */
+ remove_lock_request();
+ prlog(PR_WARNING, "WARNING: Lock has been spinning for over %dms\n", LOCK_TIMEOUT_MS);
+ backtrace();
+ add_lock_request(l);
+ timeout_warn = true;
+ }
+ }
+
+ remove_lock_request();
+}
+
+void unlock(struct lock *l)
+{
+ struct cpu_thread *cpu = this_cpu();
+
+ if (bust_locks)
+ return;
+
+ unlock_check(l);
+
+ l->owner = NULL;
+ list_del(&l->list);
+ lwsync();
+ l->lock_val = 0;
+
+ /* WARNING: On fast reboot, we can be reset right at that
+ * point, so the reset_lock in there cannot be in the con path
+ */
+ if (l->in_con_path) {
+ cpu->con_suspend--;
+ if (cpu->con_suspend == 0 && cpu->con_need_flush)
+ flush_console();
+ }
+}
+
+bool lock_recursive_caller(struct lock *l, const char *caller)
+{
+ if (bust_locks)
+ return false;
+
+ if (lock_held_by_me(l))
+ return false;
+
+ lock_caller(l, caller);
+ return true;
+}
+
+void init_locks(void)
+{
+ bust_locks = false;
+}
+
+void dump_locks_list(void)
+{
+ struct lock *l;
+
+ prlog(PR_ERR, "Locks held:\n");
+ list_for_each(&this_cpu()->locks_held, l, list) {
+ prlog(PR_ERR, " %s\n", l->owner);
+#ifdef DEBUG_LOCKS_BACKTRACE
+ backtrace_print(l->bt_buf, &l->bt_metadata, NULL, NULL, true);
+#endif
+ }
+}
+
+void drop_my_locks(bool warn)
+{
+ struct lock *l;
+
+ disable_fast_reboot("Lock corruption");
+ while((l = list_top(&this_cpu()->locks_held, struct lock, list)) != NULL) {
+ if (warn) {
+ prlog(PR_ERR, " %s\n", l->owner);
+#ifdef DEBUG_LOCKS_BACKTRACE
+ backtrace_print(l->bt_buf, &l->bt_metadata, NULL, NULL,
+ true);
+#endif
+ }
+ unlock(l);
+ }
+}
+
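A minimal usage sketch of this API, assuming the usual lock()/unlock() wrappers that pass LOCK_CALLER through to lock_caller(); my_lock and the counter are invented:

        static struct lock my_lock = LOCK_UNLOCKED;
        static uint64_t counter;

        static void bump(void)
        {
                lock(&my_lock);         /* spins; warns after LOCK_TIMEOUT_MS */
                counter++;
                unlock(&my_lock);
        }

        static void bump_maybe_locked(void)
        {
                /* Safe whether or not the caller already holds my_lock */
                bool took = lock_recursive_caller(&my_lock, LOCK_CALLER);

                counter++;
                if (took)
                        unlock(&my_lock);
        }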
diff --git a/roms/skiboot/core/malloc.c b/roms/skiboot/core/malloc.c
new file mode 100644
index 000000000..76996fff4
--- /dev/null
+++ b/roms/skiboot/core/malloc.c
@@ -0,0 +1,76 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Implement malloc()/free() etc on top of our memory region allocator,
+ * which provides mem_alloc()/mem_free().
+ *
+ * Copyright 2013-2015 IBM Corp.
+ */
+
+#include <mem_region.h>
+#include <lock.h>
+#include <string.h>
+#include <mem_region-malloc.h>
+
+#define DEFAULT_ALIGN __alignof__(long)
+
+void *__memalign(size_t blocksize, size_t bytes, const char *location)
+{
+ void *p;
+
+ lock(&skiboot_heap.free_list_lock);
+ p = mem_alloc(&skiboot_heap, bytes, blocksize, location);
+ unlock(&skiboot_heap.free_list_lock);
+
+ return p;
+}
+
+void *__malloc(size_t bytes, const char *location)
+{
+ return __memalign(DEFAULT_ALIGN, bytes, location);
+}
+
+void __free(void *p, const char *location)
+{
+ lock(&skiboot_heap.free_list_lock);
+ mem_free(&skiboot_heap, p, location);
+ unlock(&skiboot_heap.free_list_lock);
+}
+
+void *__realloc(void *ptr, size_t size, const char *location)
+{
+ void *newptr;
+
+ /* Two classic malloc corner cases. */
+ if (!size) {
+ __free(ptr, location);
+ return NULL;
+ }
+ if (!ptr)
+ return __malloc(size, location);
+
+ lock(&skiboot_heap.free_list_lock);
+ if (mem_resize(&skiboot_heap, ptr, size, location)) {
+ newptr = ptr;
+ } else {
+ newptr = mem_alloc(&skiboot_heap, size, DEFAULT_ALIGN,
+ location);
+ if (newptr) {
+ size_t copy = mem_allocated_size(ptr);
+ if (copy > size)
+ copy = size;
+ memcpy(newptr, ptr, copy);
+ mem_free(&skiboot_heap, ptr, location);
+ }
+ }
+ unlock(&skiboot_heap.free_list_lock);
+ return newptr;
+}
+
+void *__zalloc(size_t bytes, const char *location)
+{
+ void *p = __malloc(bytes, location);
+
+ if (p)
+ memset(p, 0, bytes);
+ return p;
+}
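The __-prefixed entry points carry a location string for the allocation tracking in mem_region.c; callers normally reach them through the malloc()/free()/zalloc() wrappers in mem_region-malloc.h (a realloc() wrapper is assumed to exist alongside them). A sketch of the corner cases handled above:

        char *p = zalloc(64);           /* zeroed allocation */
        p = realloc(p, 128);            /* grows in place if the next block is free */
        p = realloc(p, 0);              /* size 0 frees p and returns NULL */
        void *q = realloc(NULL, 32);    /* NULL pointer behaves like malloc(32) */
        free(q);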
diff --git a/roms/skiboot/core/mce.c b/roms/skiboot/core/mce.c
new file mode 100644
index 000000000..47674abcb
--- /dev/null
+++ b/roms/skiboot/core/mce.c
@@ -0,0 +1,309 @@
+// SPDX-License-Identifier: Apache-2.0
+/*
+ * Machine Check Exceptions
+ *
+ * Copyright 2020 IBM Corp.
+ */
+
+#define pr_fmt(fmt) "MCE: " fmt
+
+#include <ras.h>
+#include <opal.h>
+#include <cpu.h>
+
+#define SRR1_MC_LOADSTORE(srr1) ((srr1) & PPC_BIT(42))
+
+struct mce_ierror_table {
+ unsigned long srr1_mask;
+ unsigned long srr1_value;
+ uint64_t type;
+ const char *error_str;
+};
+
+static const struct mce_ierror_table mce_p9_ierror_table[] = {
+{ 0x00000000081c0000, 0x0000000000040000,
+ MCE_INSNFETCH | MCE_MEMORY_ERROR | MCE_INVOLVED_EA,
+ "instruction fetch memory uncorrectable error", },
+{ 0x00000000081c0000, 0x0000000000080000,
+ MCE_INSNFETCH | MCE_SLB_ERROR | MCE_INVOLVED_EA,
+ "instruction fetch SLB parity error", },
+{ 0x00000000081c0000, 0x00000000000c0000,
+ MCE_INSNFETCH | MCE_SLB_ERROR | MCE_INVOLVED_EA,
+ "instruction fetch SLB multi-hit error", },
+{ 0x00000000081c0000, 0x0000000000100000,
+ MCE_INSNFETCH | MCE_INVOLVED_EA | MCE_ERAT_ERROR,
+ "instruction fetch ERAT multi-hit error", },
+{ 0x00000000081c0000, 0x0000000000140000,
+ MCE_INSNFETCH | MCE_INVOLVED_EA | MCE_TLB_ERROR,
+ "instruction fetch TLB multi-hit error", },
+{ 0x00000000081c0000, 0x0000000000180000,
+ MCE_INSNFETCH | MCE_MEMORY_ERROR | MCE_TABLE_WALK | MCE_INVOLVED_EA,
+ "instruction fetch page table access memory uncorrectable error", },
+{ 0x00000000081c0000, 0x00000000001c0000,
+ MCE_INSNFETCH | MCE_INVOLVED_EA,
+ "instruction fetch to foreign address", },
+{ 0x00000000081c0000, 0x0000000008000000,
+ MCE_INSNFETCH | MCE_INVOLVED_EA,
+ "instruction fetch foreign link time-out", },
+{ 0x00000000081c0000, 0x0000000008040000,
+ MCE_INSNFETCH | MCE_TABLE_WALK | MCE_INVOLVED_EA,
+ "instruction fetch page table access foreign link time-out", },
+{ 0x00000000081c0000, 0x00000000080c0000,
+ MCE_INSNFETCH | MCE_INVOLVED_EA,
+ "instruction fetch real address error", },
+{ 0x00000000081c0000, 0x0000000008100000,
+ MCE_INSNFETCH | MCE_TABLE_WALK | MCE_INVOLVED_EA,
+ "instruction fetch page table access real address error", },
+{ 0x00000000081c0000, 0x0000000008140000,
+ MCE_LOADSTORE | MCE_IMPRECISE,
+ "store real address asynchronous error", },
+{ 0x00000000081c0000, 0x0000000008180000,
+ MCE_LOADSTORE | MCE_IMPRECISE,
+ "store foreign link time-out asynchronous error", },
+{ 0x00000000081c0000, 0x00000000081c0000,
+ MCE_INSNFETCH | MCE_TABLE_WALK | MCE_INVOLVED_EA,
+ "instruction fetch page table access to foreign address", },
+{ 0 } };
+
+static const struct mce_ierror_table mce_p10_ierror_table[] = {
+{ 0x00000000081c0000, 0x0000000000040000,
+ MCE_INSNFETCH | MCE_MEMORY_ERROR | MCE_INVOLVED_EA,
+ "instruction fetch memory uncorrectable error", },
+{ 0x00000000081c0000, 0x0000000000080000,
+ MCE_INSNFETCH | MCE_SLB_ERROR | MCE_INVOLVED_EA,
+ "instruction fetch SLB parity error", },
+{ 0x00000000081c0000, 0x00000000000c0000,
+ MCE_INSNFETCH | MCE_SLB_ERROR | MCE_INVOLVED_EA,
+ "instruction fetch SLB multi-hit error", },
+{ 0x00000000081c0000, 0x0000000000100000,
+ MCE_INSNFETCH | MCE_INVOLVED_EA | MCE_ERAT_ERROR,
+ "instruction fetch ERAT multi-hit error", },
+{ 0x00000000081c0000, 0x0000000000140000,
+ MCE_INSNFETCH | MCE_INVOLVED_EA | MCE_TLB_ERROR,
+ "instruction fetch TLB multi-hit error", },
+{ 0x00000000081c0000, 0x0000000000180000,
+ MCE_INSNFETCH | MCE_MEMORY_ERROR | MCE_TABLE_WALK | MCE_INVOLVED_EA,
+ "instruction fetch page table access memory uncorrectable error", },
+{ 0x00000000081c0000, 0x00000000001c0000,
+ MCE_INSNFETCH | MCE_INVOLVED_EA,
+ "instruction fetch to control real address", },
+{ 0x00000000081c0000, 0x00000000080c0000,
+ MCE_INSNFETCH | MCE_INVOLVED_EA,
+ "instruction fetch real address error", },
+{ 0x00000000081c0000, 0x0000000008100000,
+ MCE_INSNFETCH | MCE_TABLE_WALK | MCE_INVOLVED_EA,
+ "instruction fetch page table access real address error", },
+{ 0x00000000081c0000, 0x0000000008140000,
+ MCE_LOADSTORE | MCE_IMPRECISE,
+ "store real address asynchronous error", },
+{ 0x00000000081c0000, 0x00000000081c0000,
+ MCE_INSNFETCH | MCE_TABLE_WALK | MCE_INVOLVED_EA,
+ "instruction fetch page table access to control real address", },
+{ 0 } };
+
+struct mce_derror_table {
+ unsigned long dsisr_value;
+ uint64_t type;
+ const char *error_str;
+};
+
+static const struct mce_derror_table mce_p9_derror_table[] = {
+{ 0x00008000,
+ MCE_LOADSTORE | MCE_MEMORY_ERROR,
+ "load/store memory uncorrectable error", },
+{ 0x00004000,
+ MCE_LOADSTORE | MCE_MEMORY_ERROR | MCE_TABLE_WALK | MCE_INVOLVED_EA,
+ "load/store page table access memory uncorrectable error", },
+{ 0x00002000,
+ MCE_LOADSTORE | MCE_INVOLVED_EA,
+ "load/store foreign link time-out", },
+{ 0x00001000,
+ MCE_LOADSTORE | MCE_TABLE_WALK | MCE_INVOLVED_EA,
+ "load/store page table access foreign link time-out", },
+{ 0x00000800,
+ MCE_LOADSTORE | MCE_INVOLVED_EA | MCE_ERAT_ERROR,
+ "load/store ERAT multi-hit error", },
+{ 0x00000400,
+ MCE_LOADSTORE | MCE_INVOLVED_EA | MCE_TLB_ERROR,
+ "load/store TLB multi-hit error", },
+{ 0x00000200,
+ MCE_LOADSTORE | MCE_TLBIE_ERROR,
+ "TLBIE or TLBIEL instruction programming error", },
+{ 0x00000100,
+ MCE_LOADSTORE | MCE_INVOLVED_EA | MCE_SLB_ERROR,
+ "load/store SLB parity error", },
+{ 0x00000080,
+ MCE_LOADSTORE | MCE_INVOLVED_EA | MCE_SLB_ERROR,
+ "load/store SLB multi-hit error", },
+{ 0x00000040,
+ MCE_LOADSTORE | MCE_INVOLVED_EA,
+ "load real address error", },
+{ 0x00000020,
+ MCE_LOADSTORE | MCE_TABLE_WALK,
+ "load/store page table access real address error", },
+{ 0x00000010,
+ MCE_LOADSTORE | MCE_TABLE_WALK,
+ "load/store page table access to foreign address", },
+{ 0x00000008,
+ MCE_LOADSTORE,
+ "load/store to foreign address", },
+{ 0 } };
+
+static const struct mce_derror_table mce_p10_derror_table[] = {
+{ 0x00008000,
+ MCE_LOADSTORE | MCE_MEMORY_ERROR,
+ "load/store memory uncorrectable error", },
+{ 0x00004000,
+ MCE_LOADSTORE | MCE_MEMORY_ERROR | MCE_TABLE_WALK | MCE_INVOLVED_EA,
+ "load/store page table access memory uncorrectable error", },
+{ 0x00000800,
+ MCE_LOADSTORE | MCE_INVOLVED_EA | MCE_ERAT_ERROR,
+ "load/store ERAT multi-hit error", },
+{ 0x00000400,
+ MCE_LOADSTORE | MCE_INVOLVED_EA | MCE_TLB_ERROR,
+ "load/store TLB multi-hit error", },
+{ 0x00000200,
+ MCE_TLBIE_ERROR,
+ "TLBIE or TLBIEL instruction programming error", },
+{ 0x00000100,
+ MCE_LOADSTORE | MCE_INVOLVED_EA | MCE_SLB_ERROR,
+ "load/store SLB parity error", },
+{ 0x00000080,
+ MCE_LOADSTORE | MCE_INVOLVED_EA | MCE_SLB_ERROR,
+ "load/store SLB multi-hit error", },
+{ 0x00000040,
+ MCE_LOADSTORE | MCE_INVOLVED_EA,
+ "load real address error", },
+{ 0x00000020,
+ MCE_LOADSTORE | MCE_TABLE_WALK,
+ "load/store page table access real address error", },
+{ 0x00000010,
+ MCE_LOADSTORE | MCE_TABLE_WALK,
+ "load/store page table access to control real address", },
+{ 0x00000008,
+ MCE_LOADSTORE,
+ "load/store to control real address", },
+{ 0 } };
+
+static void decode_ierror(const struct mce_ierror_table table[],
+ uint64_t srr1,
+ uint64_t *type,
+ const char **error_str)
+{
+ int i;
+
+ for (i = 0; table[i].srr1_mask; i++) {
+ if ((srr1 & table[i].srr1_mask) != table[i].srr1_value)
+ continue;
+
+ *type = table[i].type;
+ *error_str = table[i].error_str;
+ }
+}
+
+static void decode_derror(const struct mce_derror_table table[],
+ uint32_t dsisr,
+ uint64_t *type,
+ const char **error_str)
+{
+ int i;
+
+ for (i = 0; table[i].dsisr_value; i++) {
+ if (!(dsisr & table[i].dsisr_value))
+ continue;
+
+ *type = table[i].type;
+ *error_str = table[i].error_str;
+ }
+}
+
+static void decode_mce_p9(uint64_t srr0, uint64_t srr1,
+ uint32_t dsisr, uint64_t dar,
+ uint64_t *type, const char **error_str,
+ uint64_t *address)
+{
+ /*
+ * On POWER9 DD2.1 and below, it's possible to get a machine check
+ * caused by a paste instruction where only DSISR bit 25 is set. This
+ * will result in the MCE handler seeing an unknown event and the
+ * kernel crashing. An MCE that occurs like this is spurious, so we
+ * don't need to do anything in terms of servicing it. If there is
+ * something that needs to be serviced, the CPU will raise the MCE
+ * again with the correct DSISR so that it can be serviced properly.
+ * So detect this case and mark it as handled.
+ */
+ if (SRR1_MC_LOADSTORE(srr1) && dsisr == 0x02000000) {
+ *type = MCE_NO_ERROR;
+ *error_str = "no error (superfluous machine check)";
+ return;
+ }
+
+ /*
+ * Async machine check due to bad real address from store or foreign
+ * link time out comes with the load/store bit (PPC bit 42) set in
+ * SRR1, but the cause comes in SRR1 not DSISR. Clear bit 42 so we're
+ * directed to the ierror table so it will find the cause (which
+ * describes it correctly as a store error).
+ */
+ if (SRR1_MC_LOADSTORE(srr1) &&
+ ((srr1 & 0x081c0000) == 0x08140000 ||
+ (srr1 & 0x081c0000) == 0x08180000)) {
+ srr1 &= ~PPC_BIT(42);
+ }
+
+ if (SRR1_MC_LOADSTORE(srr1)) {
+ decode_derror(mce_p9_derror_table, dsisr, type, error_str);
+ if (*type & MCE_INVOLVED_EA)
+ *address = dar;
+ } else {
+ decode_ierror(mce_p9_ierror_table, srr1, type, error_str);
+ if (*type & MCE_INVOLVED_EA)
+ *address = srr0;
+ }
+}
+
+static void decode_mce_p10(uint64_t srr0, uint64_t srr1,
+ uint32_t dsisr, uint64_t dar,
+ uint64_t *type, const char **error_str,
+ uint64_t *address)
+{
+ /*
+ * Async machine check due to bad real address from store or foreign
+ * link time out comes with the load/store bit (PPC bit 42) set in
+ * SRR1, but the cause comes in SRR1 not DSISR. Clear bit 42 so we're
+ * directed to the ierror table so it will find the cause (which
+ * describes it correctly as a store error).
+ */
+ if (SRR1_MC_LOADSTORE(srr1) &&
+ (srr1 & 0x081c0000) == 0x08140000) {
+ srr1 &= ~PPC_BIT(42);
+ }
+
+ if (SRR1_MC_LOADSTORE(srr1)) {
+ decode_derror(mce_p10_derror_table, dsisr, type, error_str);
+ if (*type & MCE_INVOLVED_EA)
+ *address = dar;
+ } else {
+ decode_ierror(mce_p10_ierror_table, srr1, type, error_str);
+ if (*type & MCE_INVOLVED_EA)
+ *address = srr0;
+ }
+}
+
+void decode_mce(uint64_t srr0, uint64_t srr1,
+ uint32_t dsisr, uint64_t dar,
+ uint64_t *type, const char **error_str,
+ uint64_t *address)
+{
+ *type = MCE_UNKNOWN;
+ *error_str = "unknown error";
+ *address = 0;
+
+ if (proc_gen == proc_gen_p9) {
+ decode_mce_p9(srr0, srr1, dsisr, dar, type, error_str, address);
+ } else if (proc_gen == proc_gen_p10) {
+ decode_mce_p10(srr0, srr1, dsisr, dar, type, error_str, address);
+ } else {
+ *error_str = "unknown error (processor not supported)";
+ }
+}
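A sketch of how an exception handler might consume decode_mce(); the register values are assumed to come from the interrupt frame, only the call contract comes from the code above:

        uint64_t type, addr;
        const char *desc;

        decode_mce(srr0, srr1, dsisr, dar, &type, &desc, &addr);
        if (type & MCE_INVOLVED_EA)
                prlog(PR_ERR, "%s at 0x%016"PRIx64"\n", desc, addr);
        else
                prlog(PR_ERR, "%s\n", desc);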
diff --git a/roms/skiboot/core/mem_region.c b/roms/skiboot/core/mem_region.c
new file mode 100644
index 000000000..36de2d094
--- /dev/null
+++ b/roms/skiboot/core/mem_region.c
@@ -0,0 +1,1555 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Simple memory allocator
+ *
+ * Copyright 2013-2018 IBM Corp.
+ */
+
+#include <inttypes.h>
+#include <skiboot.h>
+#include <mem-map.h>
+#include <libfdt_env.h>
+#include <lock.h>
+#include <device.h>
+#include <cpu.h>
+#include <chip.h>
+#include <affinity.h>
+#include <types.h>
+#include <mem_region.h>
+#include <mem_region-malloc.h>
+
+/* Memory poisoning on free (if POISON_MEM_REGION set to 1) */
+#ifdef DEBUG
+#define POISON_MEM_REGION 1
+#else
+#define POISON_MEM_REGION 0
+#endif
+#define POISON_MEM_REGION_WITH 0x99
+#define POISON_MEM_REGION_LIMIT 1*1024*1024*1024
+
+/* Locking: The mem_region_lock protects the regions list from concurrent
+ * updates. Additions to, or removals from, the region list must be done
+ * with this lock held. This is typically done when we're establishing
+ * the memory & reserved regions.
+ *
+ * Each region has a lock (region->free_list_lock) to protect the free list
+ * from concurrent modification. This lock is used when we're allocating
+ * memory out of a specific region.
+ *
+ * If both locks are needed (eg, __local_alloc, where we need to find a region,
+ * then allocate from it), the mem_region_lock must be acquired before (and
+ * released after) the per-region lock.
+ */
+struct lock mem_region_lock = LOCK_UNLOCKED;
+
+static struct list_head regions = LIST_HEAD_INIT(regions);
+static struct list_head early_reserves = LIST_HEAD_INIT(early_reserves);
+
+static bool mem_region_init_done = false;
+static bool mem_regions_finalised = false;
+
+unsigned long top_of_ram = SKIBOOT_BASE + SKIBOOT_SIZE;
+
+static struct mem_region skiboot_os_reserve = {
+ .name = "ibm,os-reserve",
+ .start = 0,
+ .len = SKIBOOT_BASE,
+ .type = REGION_OS,
+};
+
+struct mem_region skiboot_heap = {
+ .name = "ibm,firmware-heap",
+ .start = HEAP_BASE,
+ .len = HEAP_SIZE,
+ .type = REGION_SKIBOOT_HEAP,
+};
+
+static struct mem_region skiboot_code_and_text = {
+ .name = "ibm,firmware-code",
+ .start = SKIBOOT_BASE,
+ .len = HEAP_BASE - SKIBOOT_BASE,
+ .type = REGION_SKIBOOT_FIRMWARE,
+};
+
+static struct mem_region skiboot_after_heap = {
+ .name = "ibm,firmware-data",
+ .start = HEAP_BASE + HEAP_SIZE,
+ .len = SKIBOOT_BASE + SKIBOOT_SIZE - (HEAP_BASE + HEAP_SIZE),
+ .type = REGION_SKIBOOT_FIRMWARE,
+};
+
+static struct mem_region skiboot_cpu_stacks = {
+ .name = "ibm,firmware-stacks",
+ .start = CPU_STACKS_BASE,
+ .len = 0, /* TBA */
+ .type = REGION_SKIBOOT_FIRMWARE,
+};
+
+static struct mem_region skiboot_mambo_kernel = {
+ .name = "ibm,firmware-mambo-kernel",
+ .start = (unsigned long)KERNEL_LOAD_BASE,
+ .len = KERNEL_LOAD_SIZE,
+ .type = REGION_SKIBOOT_FIRMWARE,
+};
+
+static struct mem_region skiboot_mambo_initramfs = {
+ .name = "ibm,firmware-mambo-initramfs",
+ .start = (unsigned long)INITRAMFS_LOAD_BASE,
+ .len = INITRAMFS_LOAD_SIZE,
+ .type = REGION_SKIBOOT_FIRMWARE,
+};
+
+
+struct alloc_hdr {
+ bool free : 1;
+ bool prev_free : 1;
+ bool printed : 1;
+ unsigned long num_longs : BITS_PER_LONG-3; /* Including header. */
+ const char *location;
+};
+
+struct free_hdr {
+ struct alloc_hdr hdr;
+ struct list_node list;
+ /* ... unsigned long tailer; */
+};
+
+#define ALLOC_HDR_LONGS (sizeof(struct alloc_hdr) / sizeof(long))
+#define ALLOC_MIN_LONGS (sizeof(struct free_hdr) / sizeof(long) + 1)
+
+/* Avoid ugly casts. */
+static void *region_start(const struct mem_region *region)
+{
+ return (void *)(unsigned long)region->start;
+}
+
+/* Each free block has a tailer, so we can walk backwards. */
+static unsigned long *tailer(struct free_hdr *f)
+{
+ return (unsigned long *)f + f->hdr.num_longs - 1;
+}
+
+/* This walks forward to the next hdr (or NULL if at the end). */
+static struct alloc_hdr *next_hdr(const struct mem_region *region,
+ const struct alloc_hdr *hdr)
+{
+ void *next;
+
+ next = ((unsigned long *)hdr + hdr->num_longs);
+ if (next >= region_start(region) + region->len)
+ next = NULL;
+ return next;
+}
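This is a classic boundary-tag layout: every block begins with an alloc_hdr sized in longs, and free blocks additionally carry a list node plus a tailer word so make_free() can coalesce backwards. The user pointer handed out by mem_alloc() sits immediately after the header, so a block's size can be recovered as in this sketch (essentially what mem_allocated_size() below does, without subtracting the header); block_bytes() is an invented name:

        static inline size_t block_bytes(const void *user_ptr)
        {
                /* Step back over the header, exactly as mem_free() does */
                const struct alloc_hdr *h = (const struct alloc_hdr *)user_ptr - 1;

                return h->num_longs * sizeof(long);     /* includes the header */
        }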
+
+#if POISON_MEM_REGION == 1
+static void mem_poison(struct free_hdr *f)
+{
+ size_t poison_size = (void*)tailer(f) - (void*)(f+1);
+
+ /* We only poison up to a limit, as otherwise boot is
+ * kinda slow */
+ if (poison_size > POISON_MEM_REGION_LIMIT)
+ poison_size = POISON_MEM_REGION_LIMIT;
+
+ memset(f+1, POISON_MEM_REGION_WITH, poison_size);
+}
+#endif
+
+/* Creates free block covering entire region. */
+static void init_allocatable_region(struct mem_region *region)
+{
+ struct free_hdr *f = region_start(region);
+ assert(region->type == REGION_SKIBOOT_HEAP ||
+ region->type == REGION_MEMORY);
+ f->hdr.num_longs = region->len / sizeof(long);
+ f->hdr.free = true;
+ f->hdr.prev_free = false;
+ *tailer(f) = f->hdr.num_longs;
+ list_head_init(&region->free_list);
+ list_add(&region->free_list, &f->list);
+#if POISON_MEM_REGION == 1
+ mem_poison(f);
+#endif
+}
+
+static void make_free(struct mem_region *region, struct free_hdr *f,
+ const char *location, bool skip_poison)
+{
+ struct alloc_hdr *next;
+
+#if POISON_MEM_REGION == 1
+ if (!skip_poison)
+ mem_poison(f);
+#else
+ (void)skip_poison;
+#endif
+
+ if (f->hdr.prev_free) {
+ struct free_hdr *prev;
+ unsigned long *prev_tailer = (unsigned long *)f - 1;
+
+ assert(*prev_tailer);
+ prev = (void *)((unsigned long *)f - *prev_tailer);
+ assert(prev->hdr.free);
+ assert(!prev->hdr.prev_free);
+
+ /* Expand to cover the one we just freed. */
+ prev->hdr.num_longs += f->hdr.num_longs;
+ f = prev;
+ } else {
+ f->hdr.free = true;
+ f->hdr.location = location;
+ list_add(&region->free_list, &f->list);
+ }
+
+ /* Fix up tailer. */
+ *tailer(f) = f->hdr.num_longs;
+
+ /* If next is free, coalesce it */
+ next = next_hdr(region, &f->hdr);
+ if (next) {
+ next->prev_free = true;
+ if (next->free) {
+ struct free_hdr *next_free = (void *)next;
+ list_del_from(&region->free_list, &next_free->list);
+ /* Maximum of one level of recursion */
+ make_free(region, next_free, location, true);
+ }
+ }
+}
+
+/* Can we fit this many longs with this alignment in this free block? */
+static bool fits(struct free_hdr *f, size_t longs, size_t align, size_t *offset)
+{
+ *offset = 0;
+
+ while (f->hdr.num_longs >= *offset + longs) {
+ size_t addr;
+
+ addr = (unsigned long)f
+ + (*offset + ALLOC_HDR_LONGS) * sizeof(long);
+ if ((addr & (align - 1)) == 0)
+ return true;
+
+ /* Don't make tiny chunks! */
+ if (*offset == 0)
+ *offset = ALLOC_MIN_LONGS;
+ else
+ (*offset)++;
+ }
+ return false;
+}
+
+static void discard_excess(struct mem_region *region,
+ struct alloc_hdr *hdr, size_t alloc_longs,
+ const char *location, bool skip_poison)
+{
+ /* Do we have excess? */
+ if (hdr->num_longs > alloc_longs + ALLOC_MIN_LONGS) {
+ struct free_hdr *post;
+
+ /* Set up post block. */
+ post = (void *)hdr + alloc_longs * sizeof(long);
+ post->hdr.num_longs = hdr->num_longs - alloc_longs;
+ post->hdr.prev_free = false;
+
+ /* Trim our block. */
+ hdr->num_longs = alloc_longs;
+
+ /* This coalesces as required. */
+ make_free(region, post, location, skip_poison);
+ }
+}
+
+static const char *hdr_location(const struct alloc_hdr *hdr)
+{
+ /* Corrupt: step carefully! */
+ if (is_rodata(hdr->location))
+ return hdr->location;
+ return "*CORRUPT*";
+}
+
+static void bad_header(const struct mem_region *region,
+ const struct alloc_hdr *hdr,
+ const char *during,
+ const char *location)
+{
+ /* Corrupt: step carefully! */
+ if (is_rodata(hdr->location))
+ prerror("%p (in %s) %s at %s, previously %s\n",
+ hdr-1, region->name, during, location, hdr->location);
+ else
+ prerror("%p (in %s) %s at %s, previously %p\n",
+ hdr-1, region->name, during, location, hdr->location);
+ abort();
+}
+
+static bool region_is_reservable(struct mem_region *region)
+{
+ return region->type != REGION_OS;
+}
+
+static bool region_is_reserved(struct mem_region *region)
+{
+ return region->type != REGION_OS && region->type != REGION_MEMORY;
+}
+
+void mem_dump_allocs(void)
+{
+ struct mem_region *region;
+ struct alloc_hdr *h, *i;
+
+ /* Walk every allocatable region and dump its allocations */
+ prlog(PR_INFO, "Memory regions:\n");
+ list_for_each(&regions, region, list) {
+ if (!(region->type == REGION_SKIBOOT_HEAP ||
+ region->type == REGION_MEMORY))
+ continue;
+ prlog(PR_INFO, " 0x%012llx..%012llx : %s\n",
+ (long long)region->start,
+ (long long)(region->start + region->len - 1),
+ region->name);
+ if (region->free_list.n.next == NULL) {
+ prlog(PR_INFO, " no allocs\n");
+ continue;
+ }
+
+ /*
+ * XXX: When dumping the allocation list we coalesce allocations
+ * with the same location and size into a single line. This is
+ * quadratic, but it makes the dump human-readable and the raw
+ * dump sometimes causes the log buffer to wrap.
+ */
+ for (h = region_start(region); h; h = next_hdr(region, h))
+ h->printed = false;
+
+ for (h = region_start(region); h; h = next_hdr(region, h)) {
+ unsigned long bytes;
+ int count = 0;
+
+ if (h->free)
+ continue;
+ if (h->printed)
+ continue;
+
+ for (i = h; i; i = next_hdr(region, i)) {
+ if (i->free)
+ continue;
+ if (i->num_longs != h->num_longs)
+ continue;
+ if (strcmp(i->location, h->location))
+ continue;
+
+ i->printed = true;
+ count++;
+ }
+
+ bytes = h->num_longs * sizeof(long);
+ prlog(PR_NOTICE, " % 8d allocs of 0x%.8lx bytes at %s (total 0x%lx)\n",
+ count, bytes, hdr_location(h), bytes * count);
+ }
+ }
+}
+
+int64_t mem_dump_free(void)
+{
+ struct mem_region *region;
+ struct alloc_hdr *hdr;
+ int64_t total_free;
+ int64_t region_free;
+
+ total_free = 0;
+
+ prlog(PR_INFO, "Free space in HEAP memory regions:\n");
+ list_for_each(&regions, region, list) {
+ if (!(region->type == REGION_SKIBOOT_HEAP ||
+ region->type == REGION_MEMORY))
+ continue;
+ region_free = 0;
+
+ if (region->free_list.n.next == NULL) {
+ continue;
+ }
+ for (hdr = region_start(region); hdr; hdr = next_hdr(region, hdr)) {
+ if (!hdr->free)
+ continue;
+
+ region_free += hdr->num_longs * sizeof(long);
+ }
+ prlog(PR_INFO, "Region %s free: %"PRIx64"\n",
+ region->name, region_free);
+ total_free += region_free;
+ }
+
+ prlog(PR_INFO, "Total free: %"PRIu64"\n", total_free);
+
+ return total_free;
+}
+
+static void *__mem_alloc(struct mem_region *region, size_t size, size_t align,
+ const char *location)
+{
+ size_t alloc_longs, offset;
+ struct free_hdr *f;
+ struct alloc_hdr *next;
+
+ /* Align must be power of 2. */
+ assert(!((align - 1) & align));
+
+ /* This should be a constant. */
+ assert(is_rodata(location));
+
+ /* Unallocatable region? */
+ if (!(region->type == REGION_SKIBOOT_HEAP ||
+ region->type == REGION_MEMORY))
+ return NULL;
+
+ /* First allocation? */
+ if (region->free_list.n.next == NULL)
+ init_allocatable_region(region);
+
+ /* Don't do screwy sizes. */
+ if (size > region->len)
+ return NULL;
+
+ /* Don't do tiny alignments, we deal in long increments. */
+ if (align < sizeof(long))
+ align = sizeof(long);
+
+ /* Convert size to number of longs, too. */
+ alloc_longs = (size + sizeof(long)-1) / sizeof(long) + ALLOC_HDR_LONGS;
+
+ /* Can't be too small for when we free it, either. */
+ if (alloc_longs < ALLOC_MIN_LONGS)
+ alloc_longs = ALLOC_MIN_LONGS;
+
+ /* Walk free list. */
+ list_for_each(&region->free_list, f, list) {
+ /* We may have to skip some to meet alignment. */
+ if (fits(f, alloc_longs, align, &offset))
+ goto found;
+ }
+
+ return NULL;
+
+found:
+ assert(f->hdr.free);
+ assert(!f->hdr.prev_free);
+
+ /* This block is no longer free. */
+ list_del_from(&region->free_list, &f->list);
+ f->hdr.free = false;
+ f->hdr.location = location;
+
+ next = next_hdr(region, &f->hdr);
+ if (next) {
+ assert(next->prev_free);
+ next->prev_free = false;
+ }
+
+ if (offset != 0) {
+ struct free_hdr *pre = f;
+
+ f = (void *)f + offset * sizeof(long);
+ assert(f >= pre + 1);
+
+ /* Set up new header. */
+ f->hdr.num_longs = pre->hdr.num_longs - offset;
+ /* f->hdr.prev_free will be set by make_free below. */
+ f->hdr.free = false;
+ f->hdr.location = location;
+
+ /* Fix up old header. */
+ pre->hdr.num_longs = offset;
+ pre->hdr.prev_free = false;
+
+ /* This coalesces as required. */
+ make_free(region, pre, location, true);
+ }
+
+ /* We might be too long; put the rest back. */
+ discard_excess(region, &f->hdr, alloc_longs, location, true);
+
+ /* Clear tailer for debugging */
+ *tailer(f) = 0;
+
+ /* Their pointer is immediately after header. */
+ return &f->hdr + 1;
+}
+
+void *mem_alloc(struct mem_region *region, size_t size, size_t align,
+ const char *location)
+{
+ static bool dumped = false;
+ void *r;
+
+ assert(lock_held_by_me(&region->free_list_lock));
+
+ r = __mem_alloc(region, size, align, location);
+ if (r)
+ return r;
+
+ prerror("mem_alloc(0x%lx, 0x%lx, \"%s\", %s) failed !\n",
+ size, align, location, region->name);
+ if (!dumped) {
+ mem_dump_allocs();
+ dumped = true;
+ }
+
+ return NULL;
+}
+
+void mem_free(struct mem_region *region, void *mem, const char *location)
+{
+ struct alloc_hdr *hdr;
+
+ /* This should be a constant. */
+ assert(is_rodata(location));
+
+ assert(lock_held_by_me(&region->free_list_lock));
+
+ /* Freeing NULL is always a noop. */
+ if (!mem)
+ return;
+
+ /* Your memory is in the region, right? */
+ assert(mem >= region_start(region) + sizeof(*hdr));
+ assert(mem < region_start(region) + region->len);
+
+ /* Grab header. */
+ hdr = mem - sizeof(*hdr);
+
+ if (hdr->free)
+ bad_header(region, hdr, "re-freed", location);
+
+ make_free(region, (struct free_hdr *)hdr, location, false);
+}
+
+size_t mem_allocated_size(const void *ptr)
+{
+ const struct alloc_hdr *hdr = ptr - sizeof(*hdr);
+ return hdr->num_longs * sizeof(long) - sizeof(struct alloc_hdr);
+}
+
+bool mem_resize(struct mem_region *region, void *mem, size_t len,
+ const char *location)
+{
+ struct alloc_hdr *hdr, *next;
+ struct free_hdr *f;
+
+ /* This should be a constant. */
+ assert(is_rodata(location));
+
+ assert(lock_held_by_me(&region->free_list_lock));
+
+ /* Get header. */
+ hdr = mem - sizeof(*hdr);
+ if (hdr->free)
+ bad_header(region, hdr, "resize", location);
+
+ /* Round up size to multiple of longs. */
+ len = (sizeof(*hdr) + len + sizeof(long) - 1) / sizeof(long);
+
+ /* Can't be too small for when we free it, either. */
+ if (len < ALLOC_MIN_LONGS)
+ len = ALLOC_MIN_LONGS;
+
+ /* Shrinking is simple. */
+ if (len <= hdr->num_longs) {
+ hdr->location = location;
+ discard_excess(region, hdr, len, location, false);
+ return true;
+ }
+
+ /* Check if we can expand. */
+ next = next_hdr(region, hdr);
+ if (!next || !next->free || hdr->num_longs + next->num_longs < len)
+ return false;
+
+ /* OK, it's free and big enough, absorb it. */
+ f = (struct free_hdr *)next;
+ list_del_from(&region->free_list, &f->list);
+ hdr->num_longs += next->num_longs;
+ hdr->location = location;
+
+ /* Update next prev_free */
+ next = next_hdr(region, &f->hdr);
+ if (next) {
+ assert(next->prev_free);
+ next->prev_free = false;
+ }
+
+ /* Clear tailer for debugging */
+ *tailer(f) = 0;
+
+ /* Now we might have *too* much. */
+ discard_excess(region, hdr, len, location, true);
+ return true;
+}
+
+bool mem_check(const struct mem_region *region)
+{
+ size_t frees = 0;
+ struct alloc_hdr *hdr, *prev_free = NULL;
+ struct free_hdr *f;
+
+ /* Check it's sanely aligned. */
+ if (region->start % sizeof(long)) {
+ prerror("Region '%s' not sanely aligned (%llx)\n",
+ region->name, (unsigned long long)region->start);
+ return false;
+ }
+ if ((long)region->len % sizeof(long)) {
+ prerror("Region '%s' not sane length (%llu)\n",
+ region->name, (unsigned long long)region->len);
+ return false;
+ }
+
+ /* Not ours to play with, or empty? Don't do anything. */
+ if (!(region->type == REGION_MEMORY ||
+ region->type == REGION_SKIBOOT_HEAP) ||
+ region->free_list.n.next == NULL)
+ return true;
+
+ /* Walk linearly. */
+ for (hdr = region_start(region); hdr; hdr = next_hdr(region, hdr)) {
+ if (hdr->num_longs < ALLOC_MIN_LONGS) {
+ prerror("Region '%s' %s %p (%s) size %zu\n",
+ region->name, hdr->free ? "free" : "alloc",
+ hdr, hdr_location(hdr),
+ hdr->num_longs * sizeof(long));
+ return false;
+ }
+ if ((unsigned long)hdr + hdr->num_longs * sizeof(long) >
+ region->start + region->len) {
+ prerror("Region '%s' %s %p (%s) oversize %zu\n",
+ region->name, hdr->free ? "free" : "alloc",
+ hdr, hdr_location(hdr),
+ hdr->num_longs * sizeof(long));
+ return false;
+ }
+ if (hdr->free) {
+ if (hdr->prev_free || prev_free) {
+ prerror("Region '%s' free %p (%s) has prev_free"
+ " %p (%s) %sset?\n",
+ region->name, hdr, hdr_location(hdr),
+ prev_free,
+ prev_free ? hdr_location(prev_free)
+ : "NULL",
+ hdr->prev_free ? "" : "un");
+ return false;
+ }
+ prev_free = hdr;
+ frees ^= (unsigned long)hdr - region->start;
+ } else {
+ if (hdr->prev_free != (bool)prev_free) {
+ prerror("Region '%s' alloc %p (%s) has"
+ " prev_free %p %sset?\n",
+ region->name, hdr, hdr_location(hdr),
+ prev_free, hdr->prev_free ? "" : "un");
+ return false;
+ }
+ prev_free = NULL;
+ }
+ }
+
+ /* Now walk free list. */
+ list_for_each(&region->free_list, f, list)
+ frees ^= (unsigned long)f - region->start;
+
+ if (frees) {
+ prerror("Region '%s' free list and walk do not match!\n",
+ region->name);
+ return false;
+ }
+ return true;
+}
+
+bool mem_check_all(void)
+{
+ struct mem_region *r;
+
+ list_for_each(&regions, r, list) {
+ if (!mem_check(r))
+ return false;
+ }
+
+ return true;
+}
+
+static struct mem_region *new_region(const char *name,
+ uint64_t start, uint64_t len,
+ struct dt_node *node,
+ enum mem_region_type type)
+{
+ struct mem_region *region;
+
+ region = malloc(sizeof(*region));
+ if (!region)
+ return NULL;
+
+ region->name = name;
+ region->start = start;
+ region->len = len;
+ region->node = node;
+ region->type = type;
+ region->free_list.n.next = NULL;
+ init_lock(&region->free_list_lock);
+
+ return region;
+}
+
+/* We always split regions, so we only have to replace one. */
+static struct mem_region *split_region(struct mem_region *head,
+ uint64_t split_at,
+ enum mem_region_type type)
+{
+ struct mem_region *tail;
+ uint64_t end = head->start + head->len;
+
+ tail = new_region(head->name, split_at, end - split_at,
+ head->node, type);
+ /* Original region becomes head. */
+ if (tail)
+ head->len -= tail->len;
+
+ return tail;
+}
+
+static bool intersects(const struct mem_region *region, uint64_t addr)
+{
+ return addr > region->start &&
+ addr < region->start + region->len;
+}
+
+static bool maybe_split(struct mem_region *r, uint64_t split_at)
+{
+ struct mem_region *tail;
+
+ if (!intersects(r, split_at))
+ return true;
+
+ tail = split_region(r, split_at, r->type);
+ if (!tail)
+ return false;
+
+ /* Tail add is important: we may need to split again! */
+ list_add_after(&regions, &tail->list, &r->list);
+ return true;
+}
+
+static bool overlaps(const struct mem_region *r1, const struct mem_region *r2)
+{
+ return (r1->start + r1->len > r2->start
+ && r1->start < r2->start + r2->len);
+}
+
+static bool contains(const struct mem_region *r1, const struct mem_region *r2)
+{
+ u64 r1_end = r1->start + r1->len;
+ u64 r2_end = r2->start + r2->len;
+
+ return (r1->start <= r2->start && r2_end <= r1_end);
+}
+
+static struct mem_region *get_overlap(const struct mem_region *region)
+{
+ struct mem_region *i;
+
+ list_for_each(&regions, i, list) {
+ if (overlaps(region, i))
+ return i;
+ }
+ return NULL;
+}
+
+static void add_region_to_regions(struct mem_region *region)
+{
+ struct mem_region *r;
+
+ list_for_each(&regions, r, list) {
+ if (r->start < region->start)
+ continue;
+
+ list_add_before(&regions, &region->list, &r->list);
+ return;
+ }
+ list_add_tail(&regions, &region->list);
+}
+
+static bool add_region(struct mem_region *region)
+{
+ struct mem_region *r;
+
+ if (mem_regions_finalised) {
+ prerror("MEM: add_region(%s@0x%"PRIx64") called after finalise!\n",
+ region->name, region->start);
+ return false;
+ }
+
+ /* First split any regions which intersect. */
+ list_for_each(&regions, r, list) {
+ /*
+ * The new region should be fully contained by an existing one.
+ * If it's not then we have a problem where reservations
+ * partially overlap which is probably broken.
+ *
+ * NB: There *might* be situations where this is legitimate,
+ * but the region handling does not currently support this.
+ */
+ if (overlaps(r, region) && !contains(r, region)) {
+ prerror("MEM: Partial overlap detected between regions:\n");
+ prerror("MEM: %s [0x%"PRIx64"-0x%"PRIx64"] (new)\n",
+ region->name, region->start,
+ region->start + region->len);
+ prerror("MEM: %s [0x%"PRIx64"-0x%"PRIx64"]\n",
+ r->name, r->start, r->start + r->len);
+ return false;
+ }
+
+ if (!maybe_split(r, region->start) ||
+ !maybe_split(r, region->start + region->len))
+ return false;
+ }
+
+ /* Now we have only whole overlaps, if any. */
+ while ((r = get_overlap(region)) != NULL) {
+ assert(r->start == region->start);
+ assert(r->len == region->len);
+ list_del_from(&regions, &r->list);
+ free(r);
+ }
+
+ /* Finally, add in our own region. */
+ add_region_to_regions(region);
+ return true;
+}
+
+static void mem_reserve(enum mem_region_type type, const char *name,
+ uint64_t start, uint64_t len)
+{
+ struct mem_region *region;
+ bool added = true;
+
+ lock(&mem_region_lock);
+ region = new_region(name, start, len, NULL, type);
+ assert(region);
+
+ if (!mem_region_init_done)
+ list_add(&early_reserves, &region->list);
+ else
+ added = add_region(region);
+
+ assert(added);
+ unlock(&mem_region_lock);
+}
+
+void mem_reserve_fw(const char *name, uint64_t start, uint64_t len)
+{
+ mem_reserve(REGION_FW_RESERVED, name, start, len);
+}
+
+void mem_reserve_hwbuf(const char *name, uint64_t start, uint64_t len)
+{
+ mem_reserve(REGION_RESERVED, name, start, len);
+}
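A caller that needs to keep a hardware buffer out of the OS memory map would use the hwbuf variant, which creates a REGION_RESERVED region and therefore gets a no-map property in the device tree later on. The name and range below are invented for illustration:

        /* Hypothetical 64KB hardware trace buffer */
        mem_reserve_hwbuf("ibm,example-trace-buf", 0x30000000ULL, 0x10000ULL);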
+
+static bool matches_chip_id(const __be32 ids[], size_t num, u32 chip_id)
+{
+ size_t i;
+
+ for (i = 0; i < num; i++)
+ if (be32_to_cpu(ids[i]) == chip_id)
+ return true;
+
+ return false;
+}
+
+void *__local_alloc(unsigned int chip_id, size_t size, size_t align,
+ const char *location)
+{
+ struct mem_region *region;
+ void *p = NULL;
+ bool use_local = true;
+
+ lock(&mem_region_lock);
+
+restart:
+ list_for_each(&regions, region, list) {
+ const struct dt_property *prop;
+ const __be32 *ids;
+
+ if (!(region->type == REGION_SKIBOOT_HEAP ||
+ region->type == REGION_MEMORY))
+ continue;
+
+ /* Don't allocate from normal heap. */
+ if (region == &skiboot_heap)
+ continue;
+
+ /* First pass, only match node local regions */
+ if (use_local) {
+ if (!region->node)
+ continue;
+ prop = dt_find_property(region->node, "ibm,chip-id");
+ ids = (const __be32 *)prop->prop;
+ if (!matches_chip_id(ids, prop->len/sizeof(u32),
+ chip_id))
+ continue;
+ }
+
+ /* Second pass, match anything */
+ lock(&region->free_list_lock);
+ p = mem_alloc(region, size, align, location);
+ unlock(&region->free_list_lock);
+ if (p)
+ break;
+ }
+
+ /*
+ * If we can't allocate the memory block from the expected
+ * node, we bail to any one that can accommodate our request.
+ */
+ if (!p && use_local) {
+ use_local = false;
+ goto restart;
+ }
+
+ unlock(&mem_region_lock);
+
+ return p;
+}
+
+struct mem_region *find_mem_region(const char *name)
+{
+ struct mem_region *region;
+
+ list_for_each(&regions, region, list) {
+ if (streq(region->name, name))
+ return region;
+ }
+ return NULL;
+}
+
+bool mem_range_is_reserved(uint64_t start, uint64_t size)
+{
+ uint64_t end = start + size;
+ struct mem_region *region;
+ struct list_head *search;
+
+ /* We may have the range covered by a number of regions, which could
+ * appear in any order. So, we look for a region that covers the
+ * start address, and bump start up to the end of that region.
+ *
+ * We repeat until we've either bumped past the end of the range,
+ * or we didn't find a matching region.
+ *
+ * This has a worst-case of O(n^2), but n is well bounded by the
+ * small number of reservations.
+ */
+
+ if (!mem_region_init_done)
+ search = &early_reserves;
+ else
+ search = &regions;
+
+ for (;;) {
+ bool found = false;
+
+ list_for_each(search, region, list) {
+ if (!region_is_reserved(region))
+ continue;
+
+ /* does this region overlap the start address, and
+ * have a non-zero size? */
+ if (region->start <= start &&
+ region->start + region->len > start &&
+ region->len) {
+ start = region->start + region->len;
+ found = true;
+ }
+ }
+
+ /* 'end' is the first byte outside of the range */
+ if (start >= end)
+ return true;
+
+ if (!found)
+ break;
+ }
+
+ return false;
+}
+
+static void mem_region_parse_reserved_properties(void)
+{
+ const struct dt_property *names, *ranges;
+ struct mem_region *region;
+
+ prlog(PR_DEBUG, "MEM: parsing reserved memory from "
+ "reserved-names/-ranges properties\n");
+
+ names = dt_find_property(dt_root, "reserved-names");
+ ranges = dt_find_property(dt_root, "reserved-ranges");
+ if (names && ranges) {
+ const uint64_t *range;
+ int n, len;
+
+ range = (const void *)ranges->prop;
+
+ for (n = 0; n < names->len; n += len, range += 2) {
+ char *name;
+
+ len = strlen(names->prop + n) + 1;
+ name = strdup(names->prop + n);
+
+ region = new_region(name,
+ dt_get_number(range, 2),
+ dt_get_number(range + 1, 2),
+ NULL, REGION_FW_RESERVED);
+ if (!add_region(region)) {
+ prerror("Couldn't add mem_region %s\n", name);
+ abort();
+ }
+ }
+ } else if (names || ranges) {
+ prerror("Invalid properties: reserved-names=%p "
+ "with reserved-ranges=%p\n",
+ names, ranges);
+ abort();
+ } else {
+ return;
+ }
+}
+
+static bool mem_region_parse_reserved_nodes(const char *path)
+{
+ struct dt_node *parent, *node;
+
+ parent = dt_find_by_path(dt_root, path);
+ if (!parent)
+ return false;
+
+ prlog(PR_INFO, "MEM: parsing reserved memory from node %s\n", path);
+
+ dt_for_each_child(parent, node) {
+ const struct dt_property *reg;
+ struct mem_region *region;
+ int type;
+
+ reg = dt_find_property(node, "reg");
+ if (!reg) {
+ char *nodepath = dt_get_path(node);
+ prerror("node %s has no reg property, ignoring\n",
+ nodepath);
+ free(nodepath);
+ continue;
+ }
+
+ if (dt_has_node_property(node, "no-map", NULL))
+ type = REGION_RESERVED;
+ else
+ type = REGION_FW_RESERVED;
+
+ region = new_region(strdup(node->name),
+ dt_get_number(reg->prop, 2),
+ dt_get_number(reg->prop + sizeof(u64), 2),
+ node, type);
+ if (!add_region(region)) {
+ char *nodepath = dt_get_path(node);
+ prerror("node %s failed to add_region()\n", nodepath);
+ free(nodepath);
+ }
+ }
+
+ return true;
+}
+
+/* Trawl through device tree, create memory regions from nodes. */
+void mem_region_init(void)
+{
+ struct mem_region *region, *next;
+ struct dt_node *i;
+ bool rc;
+
+ /*
+ * Add associativity properties outside of the lock
+ * to avoid recursive locking caused by allocations
+ * done by add_chip_dev_associativity()
+ */
+ dt_for_each_node(dt_root, i) {
+ if (!dt_has_node_property(i, "device_type", "memory") &&
+ !dt_has_node_property(i, "compatible", "pmem-region"))
+ continue;
+
+ /* Add associativity properties */
+ add_chip_dev_associativity(i);
+ }
+
+ /* Add each memory node. */
+ dt_for_each_node(dt_root, i) {
+ uint64_t start, len;
+ char *rname;
+#define NODE_REGION_PREFIX "ibm,firmware-allocs-"
+
+ if (!dt_has_node_property(i, "device_type", "memory"))
+ continue;
+ rname = zalloc(strlen(i->name) + strlen(NODE_REGION_PREFIX) + 1);
+ assert(rname);
+ strcat(rname, NODE_REGION_PREFIX);
+ strcat(rname, i->name);
+ start = dt_get_address(i, 0, &len);
+ lock(&mem_region_lock);
+ region = new_region(rname, start, len, i, REGION_MEMORY);
+ if (!region) {
+ prerror("MEM: Could not add mem region %s!\n", i->name);
+ abort();
+ }
+ add_region_to_regions(region);
+ if ((start + len) > top_of_ram)
+ top_of_ram = start + len;
+ unlock(&mem_region_lock);
+ }
+
+ /*
+ * This is called after we know the maximum PIR of all CPUs,
+ * so we can dynamically set the stack length.
+ */
+ skiboot_cpu_stacks.len = (cpu_max_pir + 1) * STACK_SIZE;
+
+ lock(&mem_region_lock);
+
+ /* Now carve out our own reserved areas. */
+ if (!add_region(&skiboot_os_reserve) ||
+ !add_region(&skiboot_code_and_text) ||
+ !add_region(&skiboot_heap) ||
+ !add_region(&skiboot_after_heap) ||
+ !add_region(&skiboot_cpu_stacks)) {
+ prerror("Out of memory adding skiboot reserved areas\n");
+ abort();
+ }
+
+ if (chip_quirk(QUIRK_MAMBO_CALLOUTS)) {
+ if (!add_region(&skiboot_mambo_kernel) ||
+ !add_region(&skiboot_mambo_initramfs)) {
+ prerror("Out of memory adding mambo payload\n");
+ abort();
+ }
+ }
+
+ /* Add reserved ranges from HDAT */
+ list_for_each_safe(&early_reserves, region, next, list) {
+ bool added;
+
+ list_del(&region->list);
+ added = add_region(region);
+ assert(added);
+ }
+
+ /* Add reserved ranges from the DT */
+ rc = mem_region_parse_reserved_nodes("/reserved-memory");
+ if (!rc)
+ rc = mem_region_parse_reserved_nodes(
+ "/ibm,hostboot/reserved-memory");
+ if (!rc)
+ mem_region_parse_reserved_properties();
+
+ mem_region_init_done = true;
+ unlock(&mem_region_lock);
+}
+
+static uint64_t allocated_length(const struct mem_region *r)
+{
+ struct free_hdr *f, *last = NULL;
+
+ /* No allocations at all? */
+ if (r->free_list.n.next == NULL)
+ return 0;
+
+ /* Find last free block. */
+ list_for_each(&r->free_list, f, list)
+ if (f > last)
+ last = f;
+
+ /* No free blocks? */
+ if (!last)
+ return r->len;
+
+ /* Last free block isn't at end? */
+ if (next_hdr(r, &last->hdr))
+ return r->len;
+ return (unsigned long)last - r->start;
+}
+
+/* Separate out allocated sections into their own region. */
+void mem_region_release_unused(void)
+{
+ struct mem_region *r;
+
+ lock(&mem_region_lock);
+ assert(!mem_regions_finalised);
+
+ prlog(PR_INFO, "Releasing unused memory:\n");
+ list_for_each(&regions, r, list) {
+ uint64_t used_len;
+
+ /* If it's not allocatable, ignore it. */
+ if (!(r->type == REGION_SKIBOOT_HEAP ||
+ r->type == REGION_MEMORY))
+ continue;
+
+ used_len = allocated_length(r);
+
+ prlog(PR_INFO, " %s: %llu/%llu used\n",
+ r->name, (long long)used_len, (long long)r->len);
+
+ /* We keep the skiboot heap. */
+ if (r == &skiboot_heap)
+ continue;
+
+ /* Nothing used? Whole thing is for Linux. */
+ if (used_len == 0)
+ r->type = REGION_OS;
+ /* Partially used? Split region. */
+ else if (used_len != r->len) {
+ struct mem_region *for_linux;
+ struct free_hdr *last = region_start(r) + used_len;
+
+ /* Remove the final free block. */
+ list_del_from(&r->free_list, &last->list);
+
+ for_linux = split_region(r, r->start + used_len,
+ REGION_OS);
+ if (!for_linux) {
+ prerror("OOM splitting mem node %s for linux\n",
+ r->name);
+ abort();
+ }
+ list_add(&regions, &for_linux->list);
+ }
+ }
+ unlock(&mem_region_lock);
+}
+
+static void mem_clear_range(uint64_t s, uint64_t e)
+{
+ uint64_t res_start, res_end;
+
+ /* Skip exception vectors */
+ if (s < EXCEPTION_VECTORS_END)
+ s = EXCEPTION_VECTORS_END;
+
+ /* Skip kernel preload area */
+ res_start = (uint64_t)KERNEL_LOAD_BASE;
+ res_end = res_start + KERNEL_LOAD_SIZE;
+
+ if (s >= res_start && s < res_end)
+ s = res_end;
+ if (e > res_start && e <= res_end)
+ e = res_start;
+ if (e <= s)
+ return;
+ if (s < res_start && e > res_end) {
+ mem_clear_range(s, res_start);
+ mem_clear_range(res_end, e);
+ return;
+ }
+
+ /* Skip initramfs preload area */
+ res_start = (uint64_t)INITRAMFS_LOAD_BASE;
+ res_end = res_start + INITRAMFS_LOAD_SIZE;
+
+ if (s >= res_start && s < res_end)
+ s = res_end;
+ if (e > res_start && e <= res_end)
+ e = res_start;
+ if (e <= s)
+ return;
+ if (s < res_start && e > res_end) {
+ mem_clear_range(s, res_start);
+ mem_clear_range(res_end, e);
+ return;
+ }
+
+ prlog(PR_DEBUG, "Clearing region %llx-%llx\n",
+ (long long)s, (long long)e);
+ memset((void *)s, 0, e - s);
+}
+
+struct mem_region_clear_job_args {
+ char *job_name;
+ uint64_t s,e;
+};
+
+static void mem_region_clear_job(void *data)
+{
+ struct mem_region_clear_job_args *arg = (struct mem_region_clear_job_args*)data;
+ mem_clear_range(arg->s, arg->e);
+}
+
+#define MEM_REGION_CLEAR_JOB_SIZE (16ULL*(1<<30))
+
+static struct cpu_job **mem_clear_jobs;
+static struct mem_region_clear_job_args *mem_clear_job_args;
+static int mem_clear_njobs = 0;
+
+void start_mem_region_clear_unused(void)
+{
+ struct mem_region *r;
+ uint64_t s,l;
+ uint64_t total = 0;
+ uint32_t chip_id;
+ char *path;
+ int i;
+ struct cpu_job **jobs;
+ struct mem_region_clear_job_args *job_args;
+
+ lock(&mem_region_lock);
+ assert(mem_regions_finalised);
+
+ mem_clear_njobs = 0;
+
+ list_for_each(&regions, r, list) {
+ if (!(r->type == REGION_OS))
+ continue;
+ mem_clear_njobs++;
+ /* One job per 16GB */
+ mem_clear_njobs += r->len / MEM_REGION_CLEAR_JOB_SIZE;
+ }
+
+ jobs = malloc(mem_clear_njobs * sizeof(struct cpu_job*));
+ job_args = malloc(mem_clear_njobs * sizeof(struct mem_region_clear_job_args));
+ mem_clear_jobs = jobs;
+ mem_clear_job_args = job_args;
+
+ prlog(PR_NOTICE, "Clearing unused memory:\n");
+ i = 0;
+ list_for_each(&regions, r, list) {
+ /* If it's not unused, ignore it. */
+ if (!(r->type == REGION_OS))
+ continue;
+
+ assert(r != &skiboot_heap);
+
+ s = r->start;
+ l = r->len;
+ while (l > MEM_REGION_CLEAR_JOB_SIZE) {
+ job_args[i].s = s + l - MEM_REGION_CLEAR_JOB_SIZE;
+ job_args[i].e = s + l;
+ l -= MEM_REGION_CLEAR_JOB_SIZE;
+ job_args[i].job_name = malloc(sizeof(char) * 100);
+ total += MEM_REGION_CLEAR_JOB_SIZE;
+ chip_id = __dt_get_chip_id(r->node);
+ if (chip_id == -1)
+ chip_id = 0;
+ path = dt_get_path(r->node);
+ snprintf(job_args[i].job_name, 100,
+ "clear %s, %s 0x%"PRIx64" len: %"PRIx64" on %d",
+ r->name, path,
+ job_args[i].s,
+ (job_args[i].e - job_args[i].s),
+ chip_id);
+ free(path);
+ jobs[i] = cpu_queue_job_on_node(chip_id,
+ job_args[i].job_name,
+ mem_region_clear_job,
+ &job_args[i]);
+ if (!jobs[i])
+ jobs[i] = cpu_queue_job(NULL,
+ job_args[i].job_name,
+ mem_region_clear_job,
+ &job_args[i]);
+ assert(jobs[i]);
+ i++;
+ }
+ job_args[i].s = s;
+ job_args[i].e = s+l;
+ job_args[i].job_name = malloc(sizeof(char) * 100);
+ total += l;
+ chip_id = __dt_get_chip_id(r->node);
+ if (chip_id == -1)
+ chip_id = 0;
+ path = dt_get_path(r->node);
+ snprintf(job_args[i].job_name, 100,
+ "clear %s, %s 0x%"PRIx64" len: 0x%"PRIx64" on %d",
+ r->name, path,
+ job_args[i].s,
+ (job_args[i].e - job_args[i].s),
+ chip_id);
+ free(path);
+ jobs[i] = cpu_queue_job_on_node(chip_id,
+ job_args[i].job_name,
+ mem_region_clear_job,
+ &job_args[i]);
+ if (!jobs[i])
+ jobs[i] = cpu_queue_job(NULL,
+ job_args[i].job_name,
+ mem_region_clear_job,
+ &job_args[i]);
+ assert(jobs[i]);
+ i++;
+ }
+ unlock(&mem_region_lock);
+ cpu_process_local_jobs();
+}
+
+void wait_mem_region_clear_unused(void)
+{
+ uint64_t l;
+ uint64_t total = 0;
+ int i;
+
+ for (i = 0; i < mem_clear_njobs; i++) {
+ total += (mem_clear_job_args[i].e - mem_clear_job_args[i].s);
+ }
+
+ l = 0;
+ for (i = 0; i < mem_clear_njobs; i++) {
+ cpu_wait_job(mem_clear_jobs[i], true);
+ l += (mem_clear_job_args[i].e - mem_clear_job_args[i].s);
+ printf("Clearing memory... %"PRIu64"/%"PRIu64"GB done\n",
+ l>>30, total>>30);
+ free(mem_clear_job_args[i].job_name);
+ }
+ free(mem_clear_jobs);
+ free(mem_clear_job_args);
+}
+
+static void mem_region_add_dt_reserved_node(struct dt_node *parent,
+ struct mem_region *region)
+{
+ char *name, *p;
+
+ /* If a reserved region was established before skiboot, it may be
+ * referenced by a device-tree node with extra data. In that case,
+ * copy the node to /reserved-memory/, unless it's already there.
+ *
+ * We update region->node to the new copy here, as the prd code may
+ * update regions' device-tree nodes, and we want those updates to
+ * apply to the nodes in /reserved-memory/.
+ */
+ if (region->type == REGION_FW_RESERVED && region->node) {
+ if (region->node->parent != parent)
+ region->node = dt_copy(region->node, parent);
+ return;
+ }
+
+ name = strdup(region->name);
+ assert(name);
+
+ /* remove any cell addresses in the region name; we have our own cell
+ * addresses here */
+ p = strchr(name, '@');
+ if (p)
+ *p = '\0';
+
+ region->node = dt_new_addr(parent, name, region->start);
+ assert(region->node);
+ dt_add_property_u64s(region->node, "reg", region->start, region->len);
+
+ /*
+ * This memory is used by hardware and may need special handling. Ask
+ * the host kernel not to map it by default.
+ */
+ if (region->type == REGION_RESERVED)
+ dt_add_property(region->node, "no-map", NULL, 0);
+
+ free(name);
+}
+
+void mem_region_add_dt_reserved(void)
+{
+ int names_len, ranges_len, len;
+ const struct dt_property *prop;
+ struct mem_region *region;
+ void *names, *ranges;
+ struct dt_node *node;
+ fdt64_t *range;
+ char *name;
+
+ names_len = 0;
+ ranges_len = 0;
+
+ /* Finalise the region list, so we know that the regions list won't be
+ * altered after this point. The regions' free lists may change after
+ * we drop the lock, but we don't access those. */
+ lock(&mem_region_lock);
+ mem_regions_finalised = true;
+
+ /* establish top-level reservation node */
+ node = dt_find_by_path(dt_root, "reserved-memory");
+ if (!node) {
+ node = dt_new(dt_root, "reserved-memory");
+ dt_add_property_cells(node, "#address-cells", 2);
+ dt_add_property_cells(node, "#size-cells", 2);
+ dt_add_property(node, "ranges", NULL, 0);
+ }
+
+ prlog(PR_INFO, "Reserved regions:\n");
+
+ /* First pass, create /reserved-memory/ nodes for each reservation,
+ * and calculate the length for the /reserved-names and
+ * /reserved-ranges properties */
+ list_for_each(&regions, region, list) {
+ if (!region_is_reservable(region))
+ continue;
+
+ prlog(PR_INFO, " 0x%012llx..%012llx : %s\n",
+ (long long)region->start,
+ (long long)(region->start + region->len - 1),
+ region->name);
+
+ mem_region_add_dt_reserved_node(node, region);
+
+ /* calculate the size of the properties populated later */
+ names_len += strlen(region->node->name) + 1;
+ ranges_len += 2 * sizeof(uint64_t);
+ }
+
+ name = names = malloc(names_len);
+ range = ranges = malloc(ranges_len);
+
+ /* Second pass: populate the old-style reserved-names and
+ * reserved-regions arrays based on the node data */
+ list_for_each(&regions, region, list) {
+ if (!region_is_reservable(region))
+ continue;
+
+ len = strlen(region->node->name) + 1;
+ memcpy(name, region->node->name, len);
+ name += len;
+
+ range[0] = cpu_to_fdt64(region->start);
+ range[1] = cpu_to_fdt64(region->len);
+ range += 2;
+ }
+ unlock(&mem_region_lock);
+
+ prop = dt_find_property(dt_root, "reserved-names");
+ if (prop)
+ dt_del_property(dt_root, (struct dt_property *)prop);
+
+ prop = dt_find_property(dt_root, "reserved-ranges");
+ if (prop)
+ dt_del_property(dt_root, (struct dt_property *)prop);
+
+ dt_add_property(dt_root, "reserved-names", names, names_len);
+ dt_add_property(dt_root, "reserved-ranges", ranges, ranges_len);
+
+ free(names);
+ free(ranges);
+}
+
+struct mem_region *mem_region_next(struct mem_region *region)
+{
+ struct list_node *node;
+
+ assert(lock_held_by_me(&mem_region_lock));
+
+ node = region ? &region->list : &regions.n;
+
+ if (node->next == &regions.n)
+ return NULL;
+
+ return list_entry(node->next, struct mem_region, list);
+}
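
Annotation (not part of the patch): mem_region_clear_unused() above carves each free region into fixed-size chunks and queues one clear job per chunk, with the leftover handled by a final job. Below is a minimal standalone sketch of that chunking under those assumptions; CHUNK_SIZE is a hypothetical stand-in for MEM_REGION_CLEAR_JOB_SIZE and the demo main() is made up.

#include <stdint.h>
#include <stdio.h>

#define CHUNK_SIZE (16ULL << 30)	/* hypothetical stand-in for MEM_REGION_CLEAR_JOB_SIZE */

/* Split [start, start + len) into CHUNK_SIZE pieces, highest offsets first,
 * mirroring the loop that queues one clear job per chunk. */
static void split_range(uint64_t start, uint64_t len)
{
	while (len > CHUNK_SIZE) {
		printf("job: clear 0x%016llx..0x%016llx\n",
		       (unsigned long long)(start + len - CHUNK_SIZE),
		       (unsigned long long)(start + len));
		len -= CHUNK_SIZE;
	}
	/* Final (possibly short) chunk covers whatever is left at the bottom. */
	printf("job: clear 0x%016llx..0x%016llx\n",
	       (unsigned long long)start,
	       (unsigned long long)(start + len));
}

int main(void)
{
	split_range(0x100000000ULL, 40ULL << 30);	/* 40GB -> two full chunks + 8GB tail */
	return 0;
}

Run on a 40GB range, this prints two full 16GB jobs plus an 8GB tail, mirroring the job list built above.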
diff --git a/roms/skiboot/core/nvram-format.c b/roms/skiboot/core/nvram-format.c
new file mode 100644
index 000000000..8aa5abf22
--- /dev/null
+++ b/roms/skiboot/core/nvram-format.c
@@ -0,0 +1,331 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * NVRAM Format as specified in PAPR
+ *
+ * Copyright 2013-2019 IBM Corp.
+ */
+
+#include <skiboot.h>
+#include <nvram.h>
+
+struct chrp_nvram_hdr {
+ uint8_t sig;
+ uint8_t cksum;
+ be16 len;
+ char name[12];
+};
+
+static struct chrp_nvram_hdr *skiboot_part_hdr;
+
+#define NVRAM_SIG_FW_PRIV 0x51
+#define NVRAM_SIG_SYSTEM 0x70
+#define NVRAM_SIG_FREE 0x7f
+
+#define NVRAM_NAME_COMMON "common"
+#define NVRAM_NAME_FW_PRIV "ibm,skiboot"
+#define NVRAM_NAME_FREE "wwwwwwwwwwww"
+
+/* 64k should be enough, famous last words... */
+#define NVRAM_SIZE_COMMON 0x10000
+
+/* 4k should be enough, famous last words... */
+#define NVRAM_SIZE_FW_PRIV 0x1000
+
+static uint8_t chrp_nv_cksum(struct chrp_nvram_hdr *hdr)
+{
+ struct chrp_nvram_hdr h_copy = *hdr;
+ uint8_t b_data, i_sum, c_sum;
+ uint8_t *p = (uint8_t *)&h_copy;
+ unsigned int nbytes = sizeof(h_copy);
+
+ h_copy.cksum = 0;
+ for (c_sum = 0; nbytes; nbytes--) {
+ b_data = *(p++);
+ i_sum = c_sum + b_data;
+ if (i_sum < c_sum)
+ i_sum++;
+ c_sum = i_sum;
+ }
+ return c_sum;
+}
+
+int nvram_format(void *nvram_image, uint32_t nvram_size)
+{
+ struct chrp_nvram_hdr *h;
+ unsigned int offset = 0;
+
+ prerror("NVRAM: Re-initializing (size: 0x%08x)\n", nvram_size);
+ memset(nvram_image, 0, nvram_size);
+
+ /* Create private partition */
+ if (nvram_size - offset < NVRAM_SIZE_FW_PRIV)
+ return -1;
+ h = nvram_image + offset;
+ h->sig = NVRAM_SIG_FW_PRIV;
+ h->len = cpu_to_be16(NVRAM_SIZE_FW_PRIV >> 4);
+ strcpy(h->name, NVRAM_NAME_FW_PRIV);
+ h->cksum = chrp_nv_cksum(h);
+ prlog(PR_DEBUG, "NVRAM: Created '%s' partition at 0x%08x"
+ " for size 0x%08x with cksum 0x%02x\n",
+ NVRAM_NAME_FW_PRIV, offset,
+ be16_to_cpu(h->len), h->cksum);
+ offset += NVRAM_SIZE_FW_PRIV;
+
+ /* Create common partition */
+ if (nvram_size - offset < NVRAM_SIZE_COMMON)
+ return -1;
+ h = nvram_image + offset;
+ h->sig = NVRAM_SIG_SYSTEM;
+ h->len = cpu_to_be16(NVRAM_SIZE_COMMON >> 4);
+ strcpy(h->name, NVRAM_NAME_COMMON);
+ h->cksum = chrp_nv_cksum(h);
+ prlog(PR_DEBUG, "NVRAM: Created '%s' partition at 0x%08x"
+ " for size 0x%08x with cksum 0x%02x\n",
+ NVRAM_NAME_COMMON, offset,
+ be16_to_cpu(h->len), h->cksum);
+ offset += NVRAM_SIZE_COMMON;
+
+ /* Create free space partition */
+ if (nvram_size - offset < sizeof(struct chrp_nvram_hdr))
+ return -1;
+ h = nvram_image + offset;
+ h->sig = NVRAM_SIG_FREE;
+ h->len = cpu_to_be16((nvram_size - offset) >> 4);
+ /* We have the full 12 bytes here */
+ memcpy(h->name, NVRAM_NAME_FREE, 12);
+ h->cksum = chrp_nv_cksum(h);
+ prlog(PR_DEBUG, "NVRAM: Created '%s' partition at 0x%08x"
+ " for size 0x%08x with cksum 0x%02x\n",
+ NVRAM_NAME_FREE, offset, be16_to_cpu(h->len), h->cksum);
+ return 0;
+}
+
+/*
+ * Check that the nvram partition layout is sane and that it
+ * contains our required partitions. If not, we re-format the
+ * lot of it
+ */
+int nvram_check(void *nvram_image, const uint32_t nvram_size)
+{
+ unsigned int offset = 0;
+ bool found_common = false;
+
+ skiboot_part_hdr = NULL;
+
+ while (offset + sizeof(struct chrp_nvram_hdr) < nvram_size) {
+ struct chrp_nvram_hdr *h = nvram_image + offset;
+
+ if (chrp_nv_cksum(h) != h->cksum) {
+ prerror("NVRAM: Partition at offset 0x%x"
+ " has bad checksum: 0x%02x vs 0x%02x\n",
+ offset, h->cksum, chrp_nv_cksum(h));
+ goto failed;
+ }
+ if (be16_to_cpu(h->len) < 1) {
+ prerror("NVRAM: Partition at offset 0x%x"
+ " has incorrect 0 length\n", offset);
+ goto failed;
+ }
+
+ if (h->sig == NVRAM_SIG_SYSTEM &&
+ strcmp(h->name, NVRAM_NAME_COMMON) == 0)
+ found_common = true;
+
+ if (h->sig == NVRAM_SIG_FW_PRIV &&
+ strcmp(h->name, NVRAM_NAME_FW_PRIV) == 0)
+ skiboot_part_hdr = h;
+
+ offset += be16_to_cpu(h->len) << 4;
+ if (offset > nvram_size) {
+ prerror("NVRAM: Partition at offset 0x%x"
+ " extends beyond end of nvram !\n", offset);
+ goto failed;
+ }
+ }
+ if (!found_common) {
+ prlog_once(PR_ERR, "NVRAM: Common partition not found !\n");
+ goto failed;
+ }
+
+ if (!skiboot_part_hdr) {
+ prlog_once(PR_ERR, "NVRAM: Skiboot private partition not found !\n");
+ goto failed;
+ } else {
+ /*
+ * The OF NVRAM format requires config strings to be NUL
+ * terminated and unused memory to be set to zero. Well behaved
+ * software should ensure this is done for us, but we should
+ * always check.
+ */
+ const char *last_byte = (const char *) skiboot_part_hdr +
+ be16_to_cpu(skiboot_part_hdr->len) * 16 - 1;
+
+ if (*last_byte != 0) {
+ prerror("NVRAM: Skiboot private partition is not NUL terminated");
+ goto failed;
+ }
+ }
+
+ prlog(PR_INFO, "NVRAM: Layout appears sane\n");
+ assert(skiboot_part_hdr);
+ return 0;
+ failed:
+ return -1;
+}
+
+static const char *find_next_key(const char *start, const char *end)
+{
+ /*
+ * Unused parts of the partition are set to NUL. If we hit two
+ * NULs in a row then we assume that we have hit the end of the
+ * partition.
+ */
+ if (*start == 0)
+ return NULL;
+
+ while (start < end) {
+ if (*start == 0)
+ return start + 1;
+
+ start++;
+ }
+
+ return NULL;
+}
+
+static void nvram_dangerous(const char *key)
+{
+ prlog(PR_ERR, " ___________________________________________________________\n");
+ prlog(PR_ERR, "< Dangerous NVRAM option: %s\n", key);
+ prlog(PR_ERR, " -----------------------------------------------------------\n");
+ prlog(PR_ERR, " \\ \n");
+ prlog(PR_ERR, " \\ WW \n");
+ prlog(PR_ERR, " <^ \\___/| \n");
+ prlog(PR_ERR, " \\ / \n");
+ prlog(PR_ERR, " \\_ _/ \n");
+ prlog(PR_ERR, " }{ \n");
+}
+
+
+/*
+ * nvram_query_safe/dangerous() - Searches skiboot NVRAM partition
+ * for a key=value pair.
+ *
+ * Dangerous means it should only be used for testing as it may
+ * mask issues. Safe is ok for long term use.
+ *
+ * Returns a pointer to a NUL terminated string that contains the value
+ * associated with the given key.
+ */
+static const char *__nvram_query(const char *key, bool dangerous)
+{
+ const char *part_end, *start;
+ int key_len = strlen(key);
+
+ assert(key);
+
+ if (!nvram_has_loaded()) {
+ prlog(PR_DEBUG,
+ "NVRAM: Query for '%s' must wait for NVRAM to load\n",
+ key);
+ if (!nvram_wait_for_load()) {
+ prlog(PR_CRIT, "NVRAM: Failed to load\n");
+ return NULL;
+ }
+ }
+
+ /*
+ * The running OS can modify the NVRAM as it pleases so we need to be
+	 * a little paranoid and check that it's ok before we try to parse it.
+ *
+ * NB: nvram_validate() can update skiboot_part_hdr
+ */
+ if (!nvram_validate())
+ return NULL;
+
+ assert(skiboot_part_hdr);
+
+ part_end = (const char *) skiboot_part_hdr
+ + be16_to_cpu(skiboot_part_hdr->len) * 16 - 1;
+
+ start = (const char *) skiboot_part_hdr
+ + sizeof(*skiboot_part_hdr);
+
+ if (!key_len) {
+ prlog(PR_WARNING, "NVRAM: search key is empty!\n");
+ return NULL;
+ }
+
+ if (key_len > 32)
+ prlog(PR_WARNING, "NVRAM: search key '%s' is longer than 32 chars\n", key);
+
+ while (start) {
+ int remaining = part_end - start;
+
+ prlog(PR_TRACE, "NVRAM: '%s' (%lu)\n",
+ start, strlen(start));
+
+ if (key_len + 1 > remaining)
+ return NULL;
+
+ if (!strncmp(key, start, key_len) && start[key_len] == '=') {
+ const char *value = &start[key_len + 1];
+
+ prlog(PR_DEBUG, "NVRAM: Searched for '%s' found '%s'\n",
+ key, value);
+
+ if (dangerous)
+ nvram_dangerous(start);
+ return value;
+ }
+
+ start = find_next_key(start, part_end);
+ }
+
+ prlog(PR_DEBUG, "NVRAM: '%s' not found\n", key);
+
+ return NULL;
+}
+
+const char *nvram_query_safe(const char *key)
+{
+ return __nvram_query(key, false);
+}
+
+const char *nvram_query_dangerous(const char *key)
+{
+ return __nvram_query(key, true);
+}
+
+/*
+ * nvram_query_eq_safe/dangerous() - Check if the given 'key' exists
+ * and is set to 'value'.
+ *
+ * Dangerous means it should only be used for testing as it may
+ * mask issues. Safe is ok for long term use.
+ *
+ * Note: It's an error to check for non-existence of a key
+ * by passing 'value == NULL' as a key's value can never be
+ * NULL in nvram.
+ */
+static bool __nvram_query_eq(const char *key, const char *value, bool dangerous)
+{
+ const char *s = __nvram_query(key, dangerous);
+
+ if (!s)
+ return false;
+
+ assert(value != NULL);
+ return !strcmp(s, value);
+}
+
+bool nvram_query_eq_safe(const char *key, const char *value)
+{
+ return __nvram_query_eq(key, value, false);
+}
+
+bool nvram_query_eq_dangerous(const char *key, const char *value)
+{
+ return __nvram_query_eq(key, value, true);
+}
+
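Annotation (not part of the patch): the partition checksum in chrp_nv_cksum() above is a byte-wise sum over the header with end-around carry, computed with the cksum field treated as zero. Here is a self-contained sketch under that assumption; the struct layout (plain big-endian byte pair instead of be16) and the sample field values are illustrative only.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct hdr {
	uint8_t  sig;
	uint8_t  cksum;
	uint8_t  len_be[2];	/* big-endian length, in 16-byte blocks */
	char     name[12];
};

/* Sum of the header bytes with end-around carry, cksum field zeroed. */
static uint8_t chrp_cksum(const struct hdr *h)
{
	struct hdr copy = *h;
	const uint8_t *p = (const uint8_t *)&copy;
	uint8_t sum = 0;
	unsigned int i;

	copy.cksum = 0;
	for (i = 0; i < sizeof(copy); i++) {
		uint8_t next = sum + p[i];

		if (next < sum)	/* 8-bit overflow: wrap the carry back in */
			next++;
		sum = next;
	}
	return sum;
}

int main(void)
{
	struct hdr h;

	memset(&h, 0, sizeof(h));
	h.sig = 0x70;		/* "system" partition signature */
	h.len_be[0] = 0x10;	/* 0x1000 blocks of 16 bytes, big-endian */
	h.len_be[1] = 0x00;
	strcpy(h.name, "common");
	h.cksum = chrp_cksum(&h);

	printf("cksum = 0x%02x\n", h.cksum);
	return 0;
}

The same routine both generates the checksum when a partition is created in nvram_format() and verifies it in nvram_check().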
diff --git a/roms/skiboot/core/nvram.c b/roms/skiboot/core/nvram.c
new file mode 100644
index 000000000..773d20280
--- /dev/null
+++ b/roms/skiboot/core/nvram.c
@@ -0,0 +1,203 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * NVRAM support
+ *
+ * Copyright 2013-2018 IBM Corp.
+ */
+
+#include <skiboot.h>
+#include <fsp.h>
+#include <opal.h>
+#include <lock.h>
+#include <device.h>
+#include <platform.h>
+#include <nvram.h>
+#include <timebase.h>
+
+static void *nvram_image;
+static uint32_t nvram_size;
+
+static bool nvram_ready; /* has the nvram been loaded? */
+static bool nvram_valid; /* is the nvram format ok? */
+
+static int64_t opal_read_nvram(uint64_t buffer, uint64_t size, uint64_t offset)
+{
+ if (!nvram_ready)
+ return OPAL_HARDWARE;
+
+ if (!opal_addr_valid((void *)buffer))
+ return OPAL_PARAMETER;
+
+ if (offset >= nvram_size || (offset + size) > nvram_size)
+ return OPAL_PARAMETER;
+
+ memcpy((void *)buffer, nvram_image + offset, size);
+ return OPAL_SUCCESS;
+}
+opal_call(OPAL_READ_NVRAM, opal_read_nvram, 3);
+
+static int64_t opal_write_nvram(uint64_t buffer, uint64_t size, uint64_t offset)
+{
+ if (!nvram_ready)
+ return OPAL_HARDWARE;
+
+ if (!opal_addr_valid((void *)buffer))
+ return OPAL_PARAMETER;
+
+ if (offset >= nvram_size || (offset + size) > nvram_size)
+ return OPAL_PARAMETER;
+ memcpy(nvram_image + offset, (void *)buffer, size);
+ if (platform.nvram_write)
+ platform.nvram_write(offset, nvram_image + offset, size);
+
+ /* The host OS has written to the NVRAM so we can't be sure that it's
+ * well formatted.
+ */
+ nvram_valid = false;
+
+ return OPAL_SUCCESS;
+}
+opal_call(OPAL_WRITE_NVRAM, opal_write_nvram, 3);
+
+bool nvram_validate(void)
+{
+ if (!nvram_valid) {
+ if (!nvram_check(nvram_image, nvram_size))
+ nvram_valid = true;
+ }
+
+ return nvram_valid;
+}
+
+static void nvram_reformat(void)
+{
+ if (nvram_format(nvram_image, nvram_size)) {
+ prerror("NVRAM: Failed to format NVRAM!\n");
+ nvram_valid = false;
+ return;
+ }
+
+ /* Write the whole thing back */
+ if (platform.nvram_write)
+ platform.nvram_write(0, nvram_image, nvram_size);
+
+ nvram_validate();
+}
+
+void nvram_reinit(void)
+{
+ /* It's possible we failed to load nvram at boot. */
+ if (!nvram_ready)
+ nvram_init();
+ else if (!nvram_validate())
+ nvram_reformat();
+}
+
+void nvram_read_complete(bool success)
+{
+ struct dt_node *np;
+
+ /* Read not successful, error out and free the buffer */
+ if (!success) {
+ free(nvram_image);
+ nvram_size = 0;
+ return;
+ }
+
+ if (!nvram_validate())
+ nvram_reformat();
+
+ /* Add nvram node */
+ np = dt_new(opal_node, "nvram");
+ dt_add_property_cells(np, "#bytes", nvram_size);
+ dt_add_property_string(np, "compatible", "ibm,opal-nvram");
+
+ /* Mark ready */
+ nvram_ready = true;
+}
+
+bool nvram_wait_for_load(void)
+{
+ uint64_t started;
+
+ /* Short cut */
+ if (nvram_ready)
+ return true;
+
+ /* Tell the caller it will never happen */
+ if (!platform.nvram_info)
+ return false;
+
+ /*
+ * One of two things has happened here.
+ * 1. nvram_wait_for_load() was called before nvram_init()
+ * 2. The read of NVRAM failed.
+ * Either way, this is quite a bad event.
+ */
+ if (!nvram_image && !nvram_size) {
+ prlog(PR_CRIT, "NVRAM: Possible wait before nvram_init()!\n");
+ return false;
+ }
+
+ started = mftb();
+
+ while (!nvram_ready) {
+ opal_run_pollers();
+ /* If the read fails, tell the caller */
+ if (!nvram_image && !nvram_size)
+ return false;
+ }
+
+ prlog(PR_DEBUG, "NVRAM: Waited %lums for nvram to load\n",
+ tb_to_msecs(mftb() - started));
+
+ return true;
+}
+
+bool nvram_has_loaded(void)
+{
+ return nvram_ready;
+}
+
+void nvram_init(void)
+{
+ int rc;
+
+ if (!platform.nvram_info)
+ return;
+ rc = platform.nvram_info(&nvram_size);
+ if (rc) {
+ prerror("NVRAM: Error %d retrieving nvram info\n", rc);
+ return;
+ }
+ prlog(PR_INFO, "NVRAM: Size is %d KB\n", nvram_size >> 10);
+ if (nvram_size > 0x100000) {
+ prlog(PR_WARNING, "NVRAM: Cropping to 1MB !\n");
+ nvram_size = 0x100000;
+ }
+
+ /*
+ * We allocate the nvram image with 4k alignment to make the
+	 * FSP backend's job easier
+ */
+ nvram_image = memalign(0x1000, nvram_size);
+ if (!nvram_image) {
+ prerror("NVRAM: Failed to allocate nvram image\n");
+ nvram_size = 0;
+ return;
+ }
+
+ /* Read it in */
+ rc = platform.nvram_start_read(nvram_image, 0, nvram_size);
+ if (rc) {
+ prerror("NVRAM: Failed to read NVRAM from FSP !\n");
+ nvram_size = 0;
+ free(nvram_image);
+ return;
+ }
+
+ /*
+ * We'll get called back later (or recursively from
+ * nvram_start_read) in nvram_read_complete()
+ */
+}
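
Annotation (not part of the patch): __nvram_query() in nvram-format.c walks the skiboot partition as a packed sequence of NUL-terminated "key=value" strings, treating a leading NUL (i.e. two NULs in a row) as the end of used space. Below is a standalone sketch of that scan; next_key(), lookup_key() and the sample buffer are hypothetical names used only for illustration.

#include <stdio.h>
#include <string.h>

/* Return a pointer to the string after the current NUL-terminated one,
 * or NULL if we hit the double-NUL end marker or run off the buffer. */
static const char *next_key(const char *start, const char *end)
{
	if (*start == '\0')
		return NULL;
	while (start < end) {
		if (*start == '\0')
			return start + 1;
		start++;
	}
	return NULL;
}

/* Scan a packed "key=value\0key=value\0\0" buffer for the given key. */
static const char *lookup_key(const char *buf, const char *end, const char *key)
{
	size_t key_len = strlen(key);
	const char *p = buf;

	while (p) {
		if ((size_t)(end - p) > key_len &&
		    !strncmp(p, key, key_len) && p[key_len] == '=')
			return p + key_len + 1;
		p = next_key(p, end);
	}
	return NULL;
}

int main(void)
{
	/* Two entries followed by the terminating empty string. */
	static const char part[] = "foo=bar\0answer=42\0";
	const char *end = part + sizeof(part);
	const char *val = lookup_key(part, end, "answer");

	printf("answer -> %s\n", val ? val : "(not found)");
	return 0;
}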
diff --git a/roms/skiboot/core/opal-dump.c b/roms/skiboot/core/opal-dump.c
new file mode 100644
index 000000000..4f54a3ef1
--- /dev/null
+++ b/roms/skiboot/core/opal-dump.c
@@ -0,0 +1,582 @@
+/* Copyright 2019 IBM Corp.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define pr_fmt(fmt) "DUMP: " fmt
+
+#include <chip.h>
+#include <cpu.h>
+#include <device.h>
+#include <mem-map.h>
+#include <mem_region.h>
+#include <mem_region-malloc.h>
+#include <opal.h>
+#include <opal-dump.h>
+#include <opal-internal.h>
+#include <sbe-p9.h>
+#include <skiboot.h>
+
+#include <ccan/endian/endian.h>
+
+#include "hdata/spira.h"
+
+/* XXX Ideally we should use HDAT provided data (proc_dump_area->thread_size).
+ * But we are not getting this data during boot. Hence let's reserve fixed
+ * memory for architected registers data collection.
+ */
+#define ARCH_REGS_DATA_SIZE_PER_CHIP (512 * 1024)
+
+/* Actual address of MDST and MDDT table */
+#define MDST_TABLE_BASE (SKIBOOT_BASE + MDST_TABLE_OFF)
+#define MDDT_TABLE_BASE (SKIBOOT_BASE + MDDT_TABLE_OFF)
+#define PROC_DUMP_AREA_BASE (SKIBOOT_BASE + PROC_DUMP_AREA_OFF)
+
+static struct spira_ntuple *ntuple_mdst;
+static struct spira_ntuple *ntuple_mddt;
+static struct spira_ntuple *ntuple_mdrt;
+
+static struct mpipl_metadata *mpipl_metadata;
+
+/* Dump metadata area */
+static struct opal_mpipl_fadump *opal_mpipl_data;
+static struct opal_mpipl_fadump *opal_mpipl_cpu_data;
+
+/*
+ * Number of tags passed by OPAL to kernel after MPIPL boot.
+ * Currently it supports the following tags:
+ * - CPU register data area
+ * - OPAL metadata area address
+ * - Kernel passed tag during MPIPL registration
+ * - Post MPIPL boot memory size
+ */
+#define MAX_OPAL_MPIPL_TAGS 0x04
+static u64 opal_mpipl_tags[MAX_OPAL_MPIPL_TAGS];
+static int opal_mpipl_max_tags = MAX_OPAL_MPIPL_TAGS;
+
+static u64 opal_dump_addr, opal_dump_size;
+
+static bool mpipl_enabled;
+
+static int opal_mpipl_add_entry(u8 region, u64 src, u64 dest, u64 size)
+{
+ int i;
+ int mdst_cnt = be16_to_cpu(ntuple_mdst->act_cnt);
+ int mddt_cnt = be16_to_cpu(ntuple_mddt->act_cnt);
+ struct mdst_table *mdst;
+ struct mddt_table *mddt;
+
+ if (mdst_cnt >= MDST_TABLE_SIZE / sizeof(struct mdst_table)) {
+ prlog(PR_DEBUG, "MDST table is full\n");
+ return OPAL_RESOURCE;
+ }
+
+ if (mddt_cnt >= MDDT_TABLE_SIZE / sizeof(struct mddt_table)) {
+ prlog(PR_DEBUG, "MDDT table is full\n");
+ return OPAL_RESOURCE;
+ }
+
+ /* Use relocated memory address */
+ mdst = (void *)(MDST_TABLE_BASE);
+ mddt = (void *)(MDDT_TABLE_BASE);
+
+ /* Check for duplicate entry */
+ for (i = 0; i < mdst_cnt; i++) {
+ if (be64_to_cpu(mdst->addr) == (src | HRMOR_BIT)) {
+ prlog(PR_DEBUG,
+ "Duplicate source address : 0x%llx", src);
+ return OPAL_PARAMETER;
+ }
+ mdst++;
+ }
+ for (i = 0; i < mddt_cnt; i++) {
+ if (be64_to_cpu(mddt->addr) == (dest | HRMOR_BIT)) {
+ prlog(PR_DEBUG,
+ "Duplicate destination address : 0x%llx", dest);
+ return OPAL_PARAMETER;
+ }
+ mddt++;
+ }
+
+ /* Add OPAL source address to MDST entry */
+ mdst->addr = cpu_to_be64(src | HRMOR_BIT);
+ mdst->data_region = region;
+ mdst->size = cpu_to_be32(size);
+ ntuple_mdst->act_cnt = cpu_to_be16(mdst_cnt + 1);
+
+ /* Add OPAL destination address to MDDT entry */
+ mddt->addr = cpu_to_be64(dest | HRMOR_BIT);
+ mddt->data_region = region;
+ mddt->size = cpu_to_be32(size);
+ ntuple_mddt->act_cnt = cpu_to_be16(mddt_cnt + 1);
+
+ prlog(PR_TRACE, "Added new entry. src : 0x%llx, dest : 0x%llx,"
+ " size : 0x%llx\n", src, dest, size);
+ return OPAL_SUCCESS;
+}
+
+/* Remove entry from source (MDST) table */
+static int opal_mpipl_remove_entry_mdst(bool remove_all, u8 region, u64 src)
+{
+ bool found = false;
+ int i, j;
+ int mdst_cnt = be16_to_cpu(ntuple_mdst->act_cnt);
+ struct mdst_table *tmp_mdst;
+ struct mdst_table *mdst = (void *)(MDST_TABLE_BASE);
+
+ for (i = 0; i < mdst_cnt;) {
+ if (mdst->data_region != region) {
+ mdst++;
+ i++;
+ continue;
+ }
+
+ if (remove_all != true &&
+ be64_to_cpu(mdst->addr) != (src | HRMOR_BIT)) {
+ mdst++;
+ i++;
+ continue;
+ }
+
+ tmp_mdst = mdst;
+ memset(tmp_mdst, 0, sizeof(struct mdst_table));
+
+ for (j = i; j < mdst_cnt - 1; j++) {
+ memcpy((void *)tmp_mdst,
+ (void *)(tmp_mdst + 1), sizeof(struct mdst_table));
+ tmp_mdst++;
+ memset(tmp_mdst, 0, sizeof(struct mdst_table));
+ }
+
+ mdst_cnt--;
+
+ if (remove_all == false) {
+ found = true;
+ break;
+ }
+ } /* end - for loop */
+
+ ntuple_mdst->act_cnt = cpu_to_be16((u16)mdst_cnt);
+
+ if (remove_all == false && found == false) {
+ prlog(PR_DEBUG,
+ "Source address [0x%llx] not found in MDST table\n", src);
+ return OPAL_PARAMETER;
+ }
+
+ return OPAL_SUCCESS;
+}
+
+/* Remove entry from destination (MDDT) table */
+static int opal_mpipl_remove_entry_mddt(bool remove_all, u8 region, u64 dest)
+{
+ bool found = false;
+ int i, j;
+ int mddt_cnt = be16_to_cpu(ntuple_mddt->act_cnt);
+ struct mddt_table *tmp_mddt;
+ struct mddt_table *mddt = (void *)(MDDT_TABLE_BASE);
+
+ for (i = 0; i < mddt_cnt;) {
+ if (mddt->data_region != region) {
+ mddt++;
+ i++;
+ continue;
+ }
+
+ if (remove_all != true &&
+ be64_to_cpu(mddt->addr) != (dest | HRMOR_BIT)) {
+ mddt++;
+ i++;
+ continue;
+ }
+
+ tmp_mddt = mddt;
+ memset(tmp_mddt, 0, sizeof(struct mddt_table));
+
+ for (j = i; j < mddt_cnt - 1; j++) {
+ memcpy((void *)tmp_mddt,
+ (void *)(tmp_mddt + 1), sizeof(struct mddt_table));
+ tmp_mddt++;
+ memset(tmp_mddt, 0, sizeof(struct mddt_table));
+ }
+
+ mddt_cnt--;
+
+ if (remove_all == false) {
+ found = true;
+ break;
+ }
+ } /* end - for loop */
+
+ ntuple_mddt->act_cnt = cpu_to_be16((u16)mddt_cnt);
+
+ if (remove_all == false && found == false) {
+ prlog(PR_DEBUG,
+ "Dest address [0x%llx] not found in MDDT table\n", dest);
+ return OPAL_PARAMETER;
+ }
+
+ return OPAL_SUCCESS;
+}
+
+/* Register for OPAL dump. */
+static void opal_mpipl_register(void)
+{
+ u64 arch_regs_dest, arch_regs_size;
+ struct proc_dump_area *proc_dump = (void *)(PROC_DUMP_AREA_BASE);
+
+ /* Add OPAL reservation detail to MDST/MDDT table */
+ opal_mpipl_add_entry(DUMP_REGION_OPAL_MEMORY,
+ SKIBOOT_BASE, opal_dump_addr, opal_dump_size);
+
+ /* Thread size check */
+ if (proc_dump->thread_size != 0) {
+ prlog(PR_INFO, "Thread register entry size is available, "
+ "but not supported.\n");
+ }
+
+ /* Reserve memory used to capture architected register state */
+ arch_regs_dest = opal_dump_addr + opal_dump_size;
+ arch_regs_size = nr_chips() * ARCH_REGS_DATA_SIZE_PER_CHIP;
+ proc_dump->alloc_addr = cpu_to_be64(arch_regs_dest | HRMOR_BIT);
+ proc_dump->alloc_size = cpu_to_be32(arch_regs_size);
+ prlog(PR_NOTICE, "Architected register dest addr : 0x%llx, "
+ "size : 0x%llx\n", arch_regs_dest, arch_regs_size);
+}
+
+static int payload_mpipl_register(u64 src, u64 dest, u64 size)
+{
+ if (!opal_addr_valid((void *)src)) {
+ prlog(PR_DEBUG, "Invalid source address [0x%llx]\n", src);
+ return OPAL_PARAMETER;
+ }
+
+ if (!opal_addr_valid((void *)dest)) {
+ prlog(PR_DEBUG, "Invalid dest address [0x%llx]\n", dest);
+ return OPAL_PARAMETER;
+ }
+
+ if (size <= 0) {
+ prlog(PR_DEBUG, "Invalid size [0x%llx]\n", size);
+ return OPAL_PARAMETER;
+ }
+
+ return opal_mpipl_add_entry(DUMP_REGION_KERNEL, src, dest, size);
+}
+
+static int payload_mpipl_unregister(u64 src, u64 dest)
+{
+ int rc;
+
+ /* Remove src from MDST table */
+ rc = opal_mpipl_remove_entry_mdst(false, DUMP_REGION_KERNEL, src);
+ if (rc)
+ return rc;
+
+ /* Remove dest from MDDT table */
+ rc = opal_mpipl_remove_entry_mddt(false, DUMP_REGION_KERNEL, dest);
+ return rc;
+}
+
+static int payload_mpipl_unregister_all(void)
+{
+ opal_mpipl_remove_entry_mdst(true, DUMP_REGION_KERNEL, 0);
+ opal_mpipl_remove_entry_mddt(true, DUMP_REGION_KERNEL, 0);
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t opal_mpipl_update(enum opal_mpipl_ops ops,
+ u64 src, u64 dest, u64 size)
+{
+ int rc;
+
+ switch (ops) {
+ case OPAL_MPIPL_ADD_RANGE:
+ rc = payload_mpipl_register(src, dest, size);
+ if (!rc)
+ prlog(PR_NOTICE, "Payload registered for MPIPL\n");
+ break;
+ case OPAL_MPIPL_REMOVE_RANGE:
+ rc = payload_mpipl_unregister(src, dest);
+ if (!rc) {
+			prlog(PR_NOTICE, "Payload removed entry from MPIPL. "
+ "[src : 0x%llx, dest : 0x%llx]\n", src, dest);
+ }
+ break;
+ case OPAL_MPIPL_REMOVE_ALL:
+ rc = payload_mpipl_unregister_all();
+ if (!rc)
+ prlog(PR_NOTICE, "Payload unregistered for MPIPL\n");
+ break;
+ case OPAL_MPIPL_FREE_PRESERVED_MEMORY:
+ /* Clear tags */
+ memset(&opal_mpipl_tags, 0, (sizeof(u64) * MAX_OPAL_MPIPL_TAGS));
+ opal_mpipl_max_tags = 0;
+ /* Release memory */
+ free(opal_mpipl_data);
+ opal_mpipl_data = NULL;
+ free(opal_mpipl_cpu_data);
+ opal_mpipl_cpu_data = NULL;
+ /* Clear MDRT table */
+ memset((void *)MDRT_TABLE_BASE, 0, MDRT_TABLE_SIZE);
+ /* Set MDRT count to max allocated count */
+ ntuple_mdrt->act_cnt = cpu_to_be16(MDRT_TABLE_SIZE / sizeof(struct mdrt_table));
+ rc = OPAL_SUCCESS;
+ prlog(PR_NOTICE, "Payload Invalidated MPIPL\n");
+ break;
+ default:
+ prlog(PR_DEBUG, "Unsupported MPIPL update operation : 0x%x\n", ops);
+ rc = OPAL_PARAMETER;
+ break;
+ }
+
+ return rc;
+}
+
+static int64_t opal_mpipl_register_tag(enum opal_mpipl_tags tag,
+ uint64_t tag_val)
+{
+ int rc = OPAL_SUCCESS;
+
+ switch (tag) {
+ case OPAL_MPIPL_TAG_BOOT_MEM:
+ if (tag_val <= 0 || tag_val > top_of_ram) {
+ prlog(PR_DEBUG, "Payload sent invalid boot mem size"
+ " : 0x%llx\n", tag_val);
+ rc = OPAL_PARAMETER;
+ } else {
+ mpipl_metadata->boot_mem_size = tag_val;
+ prlog(PR_NOTICE, "Boot mem size : 0x%llx\n", tag_val);
+ }
+ break;
+ case OPAL_MPIPL_TAG_KERNEL:
+ mpipl_metadata->kernel_tag = tag_val;
+ prlog(PR_NOTICE, "Payload sent metadata tag : 0x%llx\n", tag_val);
+ break;
+ default:
+ prlog(PR_DEBUG, "Payload sent unsupported tag : 0x%x\n", tag);
+ rc = OPAL_PARAMETER;
+ break;
+ }
+ return rc;
+}
+
+static uint64_t opal_mpipl_query_tag(enum opal_mpipl_tags tag, __be64 *tag_val)
+{
+ if (!opal_addr_valid(tag_val)) {
+ prlog(PR_DEBUG, "Invalid tag address\n");
+ return OPAL_PARAMETER;
+ }
+
+ if (tag >= opal_mpipl_max_tags)
+ return OPAL_PARAMETER;
+
+ *tag_val = cpu_to_be64(opal_mpipl_tags[tag]);
+ return OPAL_SUCCESS;
+}
+
+static inline void post_mpipl_get_preserved_tags(void)
+{
+ if (mpipl_metadata->kernel_tag)
+ opal_mpipl_tags[OPAL_MPIPL_TAG_KERNEL] = mpipl_metadata->kernel_tag;
+ if (mpipl_metadata->boot_mem_size)
+ opal_mpipl_tags[OPAL_MPIPL_TAG_BOOT_MEM] = mpipl_metadata->boot_mem_size;
+}
+
+static void post_mpipl_arch_regs_data(void)
+{
+ struct proc_dump_area *proc_dump = (void *)(PROC_DUMP_AREA_BASE);
+
+ if (proc_dump->dest_addr == 0) {
+ prlog(PR_DEBUG, "Invalid CPU registers destination address\n");
+ return;
+ }
+
+ if (proc_dump->act_size == 0) {
+ prlog(PR_DEBUG, "Invalid CPU registers destination size\n");
+ return;
+ }
+
+ opal_mpipl_cpu_data = zalloc(sizeof(struct opal_mpipl_fadump) +
+ sizeof(struct opal_mpipl_region));
+ if (!opal_mpipl_cpu_data) {
+ prlog(PR_ERR, "Failed to allocate memory\n");
+ return;
+ }
+
+ /* Fill CPU register details */
+ opal_mpipl_cpu_data->version = OPAL_MPIPL_VERSION;
+ opal_mpipl_cpu_data->cpu_data_version = cpu_to_be32((u32)proc_dump->version);
+ opal_mpipl_cpu_data->cpu_data_size = proc_dump->thread_size;
+ opal_mpipl_cpu_data->region_cnt = cpu_to_be32(1);
+
+ opal_mpipl_cpu_data->region[0].src = proc_dump->dest_addr & ~(cpu_to_be64(HRMOR_BIT));
+ opal_mpipl_cpu_data->region[0].dest = proc_dump->dest_addr & ~(cpu_to_be64(HRMOR_BIT));
+ opal_mpipl_cpu_data->region[0].size = cpu_to_be64(be32_to_cpu(proc_dump->act_size));
+
+ /* Update tag */
+ opal_mpipl_tags[OPAL_MPIPL_TAG_CPU] = (u64)opal_mpipl_cpu_data;
+}
+
+static void post_mpipl_get_opal_data(void)
+{
+ struct mdrt_table *mdrt = (void *)(MDRT_TABLE_BASE);
+ int i, j = 0, count = 0;
+ int mdrt_cnt = be16_to_cpu(ntuple_mdrt->act_cnt);
+ struct opal_mpipl_region *region;
+
+ /* Count OPAL dump regions */
+ for (i = 0; i < mdrt_cnt; i++) {
+ if (mdrt->data_region == DUMP_REGION_OPAL_MEMORY)
+ count++;
+ mdrt++;
+ }
+
+ if (count == 0) {
+ prlog(PR_INFO, "OPAL dump is not available\n");
+ return;
+ }
+
+ opal_mpipl_data = zalloc(sizeof(struct opal_mpipl_fadump) +
+ count * sizeof(struct opal_mpipl_region));
+ if (!opal_mpipl_data) {
+ prlog(PR_ERR, "Failed to allocate memory\n");
+ return;
+ }
+
+ /* Fill OPAL dump details */
+ opal_mpipl_data->version = OPAL_MPIPL_VERSION;
+ opal_mpipl_data->crashing_pir = cpu_to_be32(mpipl_metadata->crashing_pir);
+ opal_mpipl_data->region_cnt = cpu_to_be32(count);
+ region = opal_mpipl_data->region;
+
+ mdrt = (void *)(MDRT_TABLE_BASE);
+ for (i = 0; i < mdrt_cnt; i++) {
+ if (mdrt->data_region != DUMP_REGION_OPAL_MEMORY) {
+ mdrt++;
+ continue;
+ }
+
+ region[j].src = mdrt->src_addr & ~(cpu_to_be64(HRMOR_BIT));
+ region[j].dest = mdrt->dest_addr & ~(cpu_to_be64(HRMOR_BIT));
+ region[j].size = cpu_to_be64(be32_to_cpu(mdrt->size));
+
+ prlog(PR_NOTICE, "OPAL reserved region %d - src : 0x%llx, "
+ "dest : 0x%llx, size : 0x%llx\n", j,
+ be64_to_cpu(region[j].src), be64_to_cpu(region[j].dest),
+ be64_to_cpu(region[j].size));
+
+ mdrt++;
+ j++;
+ if (j == count)
+ break;
+ }
+
+ opal_mpipl_tags[OPAL_MPIPL_TAG_OPAL] = (u64)opal_mpipl_data;
+}
+
+void opal_mpipl_save_crashing_pir(void)
+{
+ if (!is_mpipl_enabled())
+ return;
+
+ mpipl_metadata->crashing_pir = this_cpu()->pir;
+ prlog(PR_NOTICE, "Crashing PIR = 0x%x\n", this_cpu()->pir);
+}
+
+void opal_mpipl_reserve_mem(void)
+{
+ struct dt_node *opal_node, *dump_node;
+ u64 arch_regs_dest, arch_regs_size;
+
+ opal_node = dt_find_by_path(dt_root, "ibm,opal");
+ if (!opal_node)
+ return;
+
+ dump_node = dt_find_by_path(opal_node, "dump");
+ if (!dump_node)
+ return;
+
+	/* Calculate and reserve OPAL dump destination memory */
+ opal_dump_size = SKIBOOT_SIZE + (cpu_max_pir + 1) * STACK_SIZE;
+ opal_dump_addr = SKIBOOT_BASE + opal_dump_size;
+ mem_reserve_fw("ibm,firmware-dump",
+ opal_dump_addr, opal_dump_size);
+
+ /* Reserve memory to capture CPU register data */
+ arch_regs_dest = opal_dump_addr + opal_dump_size;
+ arch_regs_size = nr_chips() * ARCH_REGS_DATA_SIZE_PER_CHIP;
+ mem_reserve_fw("ibm,firmware-arch-registers",
+ arch_regs_dest, arch_regs_size);
+}
+
+bool is_mpipl_enabled(void)
+{
+ return mpipl_enabled;
+}
+
+void opal_mpipl_init(void)
+{
+ void *mdst_base = (void *)MDST_TABLE_BASE;
+ void *mddt_base = (void *)MDDT_TABLE_BASE;
+ struct dt_node *dump_node;
+
+ dump_node = dt_find_by_path(opal_node, "dump");
+ if (!dump_node)
+ return;
+
+ /* Get MDST and MDDT ntuple from SPIRAH */
+ ntuple_mdst = &(spirah.ntuples.mdump_src);
+ ntuple_mddt = &(spirah.ntuples.mdump_dst);
+ ntuple_mdrt = &(spirah.ntuples.mdump_res);
+
+ /* Get metadata area pointer */
+ mpipl_metadata = (void *)(DUMP_METADATA_AREA_BASE);
+
+ if (dt_find_property(dump_node, "mpipl-boot")) {
+ disable_fast_reboot("MPIPL Boot");
+
+ post_mpipl_get_preserved_tags();
+ post_mpipl_get_opal_data();
+ post_mpipl_arch_regs_data();
+ }
+
+ /* Clear OPAL metadata area */
+ if (sizeof(struct mpipl_metadata) > DUMP_METADATA_AREA_SIZE) {
+ prlog(PR_ERR, "INSUFFICIENT OPAL METADATA AREA\n");
+		prlog(PR_ERR, "INCREASE OPAL METADATA AREA SIZE\n");
+ assert(false);
+ }
+ memset(mpipl_metadata, 0, sizeof(struct mpipl_metadata));
+
+ /* Clear MDST and MDDT table */
+ memset(mdst_base, 0, MDST_TABLE_SIZE);
+ ntuple_mdst->act_cnt = 0;
+ memset(mddt_base, 0, MDDT_TABLE_SIZE);
+ ntuple_mddt->act_cnt = 0;
+
+ opal_mpipl_register();
+
+ /* Send OPAL relocated base address to SBE */
+ p9_sbe_send_relocated_base(SKIBOOT_BASE);
+
+ /* OPAL API for MPIPL update */
+ opal_register(OPAL_MPIPL_UPDATE, opal_mpipl_update, 4);
+ opal_register(OPAL_MPIPL_REGISTER_TAG, opal_mpipl_register_tag, 2);
+ opal_register(OPAL_MPIPL_QUERY_TAG, opal_mpipl_query_tag, 2);
+
+ /* Enable MPIPL */
+ mpipl_enabled = true;
+}
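
Annotation (not part of the patch): opal_mpipl_remove_entry_mdst()/_mddt() above delete entries by shifting the tail of the table down so that active entries stay contiguous and the count can simply be decremented. Below is a minimal sketch of that remove-and-compact pattern; the entry layout and demo values are made up, and unlike the firmware code this version removes every matching entry.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct entry {
	uint8_t  region;
	uint64_t addr;
};

/* Remove every entry matching (region, addr) and shift the rest down.
 * Returns the new count. */
static int remove_entry(struct entry *tbl, int count, uint8_t region, uint64_t addr)
{
	int i = 0;

	while (i < count) {
		if (tbl[i].region != region || tbl[i].addr != addr) {
			i++;
			continue;
		}
		/* Shift the remaining entries down over the hole. */
		memmove(&tbl[i], &tbl[i + 1], (count - i - 1) * sizeof(*tbl));
		count--;
		memset(&tbl[count], 0, sizeof(*tbl));
	}
	return count;
}

int main(void)
{
	struct entry tbl[] = {
		{ 1, 0x1000 }, { 2, 0x2000 }, { 1, 0x3000 },
	};
	int n = remove_entry(tbl, 3, 2, 0x2000);
	int i;

	for (i = 0; i < n; i++)
		printf("entry %d: region %u addr 0x%llx\n",
		       i, (unsigned)tbl[i].region, (unsigned long long)tbl[i].addr);
	return 0;
}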
diff --git a/roms/skiboot/core/opal-msg.c b/roms/skiboot/core/opal-msg.c
new file mode 100644
index 000000000..65a2476b2
--- /dev/null
+++ b/roms/skiboot/core/opal-msg.c
@@ -0,0 +1,193 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * OPAL Message queue between host and skiboot
+ *
+ * Copyright 2013-2019 IBM Corp.
+ */
+
+#define pr_fmt(fmt) "opalmsg: " fmt
+#include <skiboot.h>
+#include <opal-msg.h>
+#include <opal-api.h>
+#include <lock.h>
+
+#define OPAL_MAX_MSGS (OPAL_MSG_TYPE_MAX + OPAL_MAX_ASYNC_COMP - 1)
+
+struct opal_msg_entry {
+ struct list_node link;
+ void (*consumed)(void *data, int status);
+ bool extended;
+ void *data;
+ struct opal_msg msg;
+};
+
+static LIST_HEAD(msg_free_list);
+static LIST_HEAD(msg_pending_list);
+
+static struct lock opal_msg_lock = LOCK_UNLOCKED;
+
+int _opal_queue_msg(enum opal_msg_type msg_type, void *data,
+ void (*consumed)(void *data, int status),
+ size_t params_size, const void *params)
+{
+ struct opal_msg_entry *entry;
+ uint64_t entry_size;
+
+ if ((params_size + OPAL_MSG_HDR_SIZE) > OPAL_MSG_SIZE) {
+ prlog(PR_DEBUG, "param_size (0x%x) > opal_msg param size (0x%x)\n",
+ (u32)params_size, (u32)(OPAL_MSG_SIZE - OPAL_MSG_HDR_SIZE));
+ return OPAL_PARAMETER;
+ }
+
+ lock(&opal_msg_lock);
+
+ if (params_size > OPAL_MSG_FIXED_PARAMS_SIZE) {
+ entry_size = sizeof(struct opal_msg_entry) + params_size;
+ entry_size -= OPAL_MSG_FIXED_PARAMS_SIZE;
+ entry = zalloc(entry_size);
+ if (entry)
+ entry->extended = true;
+ } else {
+ entry = list_pop(&msg_free_list, struct opal_msg_entry, link);
+ if (!entry) {
+ prerror("No available node in the free list, allocating\n");
+ entry = zalloc(sizeof(struct opal_msg_entry));
+ }
+ }
+ if (!entry) {
+ prerror("Allocation failed\n");
+ unlock(&opal_msg_lock);
+ return OPAL_RESOURCE;
+ }
+
+ entry->consumed = consumed;
+ entry->data = data;
+ entry->msg.msg_type = cpu_to_be32(msg_type);
+ entry->msg.size = cpu_to_be32(params_size);
+ memcpy(entry->msg.params, params, params_size);
+
+ list_add_tail(&msg_pending_list, &entry->link);
+ opal_update_pending_evt(OPAL_EVENT_MSG_PENDING,
+ OPAL_EVENT_MSG_PENDING);
+ unlock(&opal_msg_lock);
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t opal_get_msg(uint64_t *buffer, uint64_t size)
+{
+ struct opal_msg_entry *entry;
+ void (*callback)(void *data, int status);
+ void *data;
+ uint64_t msg_size;
+ int rc = OPAL_SUCCESS;
+
+ if (size < sizeof(struct opal_msg) || !buffer)
+ return OPAL_PARAMETER;
+
+ if (!opal_addr_valid(buffer))
+ return OPAL_PARAMETER;
+
+ lock(&opal_msg_lock);
+
+ entry = list_pop(&msg_pending_list, struct opal_msg_entry, link);
+ if (!entry) {
+ unlock(&opal_msg_lock);
+ return OPAL_RESOURCE;
+ }
+
+ msg_size = OPAL_MSG_HDR_SIZE + be32_to_cpu(entry->msg.size);
+ if (size < msg_size) {
+ /* Send partial data to Linux */
+ prlog(PR_NOTICE, "Sending partial data [msg_type : 0x%x, "
+ "msg_size : 0x%x, buf_size : 0x%x]\n",
+ be32_to_cpu(entry->msg.msg_type),
+ (u32)msg_size, (u32)size);
+
+ entry->msg.size = cpu_to_be32(size - OPAL_MSG_HDR_SIZE);
+ msg_size = size;
+ rc = OPAL_PARTIAL;
+ }
+
+ memcpy((void *)buffer, (void *)&entry->msg, msg_size);
+ callback = entry->consumed;
+ data = entry->data;
+
+ if (entry->extended)
+ free(entry);
+ else
+ list_add(&msg_free_list, &entry->link);
+
+ if (list_empty(&msg_pending_list))
+ opal_update_pending_evt(OPAL_EVENT_MSG_PENDING, 0);
+
+ unlock(&opal_msg_lock);
+
+ if (callback)
+ callback(data, rc);
+
+ return rc;
+}
+opal_call(OPAL_GET_MSG, opal_get_msg, 2);
+
+static int64_t opal_check_completion(uint64_t *buffer, uint64_t size,
+ uint64_t token)
+{
+ struct opal_msg_entry *entry, *next_entry;
+ void (*callback)(void *data, int status) = NULL;
+ int rc = OPAL_BUSY;
+ void *data = NULL;
+
+ if (!opal_addr_valid(buffer))
+ return OPAL_PARAMETER;
+
+ lock(&opal_msg_lock);
+ list_for_each_safe(&msg_pending_list, entry, next_entry, link) {
+ if (be32_to_cpu(entry->msg.msg_type) == OPAL_MSG_ASYNC_COMP &&
+ be64_to_cpu(entry->msg.params[0]) == token) {
+ list_del(&entry->link);
+ callback = entry->consumed;
+ data = entry->data;
+ list_add(&msg_free_list, &entry->link);
+ if (list_empty(&msg_pending_list))
+ opal_update_pending_evt(OPAL_EVENT_MSG_PENDING,
+ 0);
+ rc = OPAL_SUCCESS;
+ break;
+ }
+ }
+
+ if (rc == OPAL_SUCCESS && size >= sizeof(struct opal_msg))
+ memcpy(buffer, &entry->msg, sizeof(entry->msg));
+
+ unlock(&opal_msg_lock);
+
+ if (callback)
+ callback(data, OPAL_SUCCESS);
+
+ return rc;
+
+}
+opal_call(OPAL_CHECK_ASYNC_COMPLETION, opal_check_completion, 3);
+
+void opal_init_msg(void)
+{
+ struct opal_msg_entry *entry;
+ int i;
+
+ for (i = 0; i < OPAL_MAX_MSGS; i++, entry++) {
+ entry = zalloc(sizeof(*entry));
+ if (!entry)
+ goto err;
+ list_add_tail(&msg_free_list, &entry->link);
+ }
+ return;
+
+err:
+ for (; i > 0; i--) {
+ entry = list_pop(&msg_free_list, struct opal_msg_entry, link);
+ if (entry)
+ free(entry);
+ }
+}
+
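Annotation (not part of the patch): _opal_queue_msg()/opal_get_msg() above recycle fixed-size message entries through a free list, falling back to allocation only when the pool is exhausted. Below is a single-threaded sketch of that pattern using a plain singly linked list instead of skiboot's ccan lists, with no locking; all names are hypothetical.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct msg {
	struct msg *next;
	char payload[64];
};

static struct msg *free_list;
static struct msg *pending_head, *pending_tail;

static struct msg *pool_get(void)
{
	struct msg *m = free_list;

	if (m)
		free_list = m->next;
	else
		m = calloc(1, sizeof(*m));	/* fall back to allocation */
	return m;
}

static void queue_msg(const char *text)
{
	struct msg *m = pool_get();

	if (!m)
		return;
	snprintf(m->payload, sizeof(m->payload), "%s", text);
	m->next = NULL;
	if (pending_tail)
		pending_tail->next = m;
	else
		pending_head = m;
	pending_tail = m;
}

static void consume_msg(void)
{
	struct msg *m = pending_head;

	if (!m)
		return;
	pending_head = m->next;
	if (!pending_head)
		pending_tail = NULL;
	printf("got: %s\n", m->payload);
	m->next = free_list;	/* recycle instead of free() */
	free_list = m;
}

int main(void)
{
	queue_msg("hello");
	queue_msg("world");
	consume_msg();
	consume_msg();
	return 0;
}

opal_init_msg() corresponds to pre-populating free_list with OPAL_MAX_MSGS entries at boot.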
diff --git a/roms/skiboot/core/opal.c b/roms/skiboot/core/opal.c
new file mode 100644
index 000000000..2898a45ce
--- /dev/null
+++ b/roms/skiboot/core/opal.c
@@ -0,0 +1,700 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Base support for OPAL calls
+ *
+ * Copyright 2013-2019 IBM Corp.
+ */
+
+#include <skiboot.h>
+#include <opal.h>
+#include <stack.h>
+#include <lock.h>
+#include <fsp.h>
+#include <cpu.h>
+#include <interrupts.h>
+#include <op-panel.h>
+#include <device.h>
+#include <console.h>
+#include <trace.h>
+#include <timebase.h>
+#include <affinity.h>
+#include <opal-msg.h>
+#include <timer.h>
+#include <elf-abi.h>
+#include <errorlog.h>
+#include <occ.h>
+
+/* Pending events to signal via opal_poll_events */
+uint64_t opal_pending_events;
+
+/* OPAL dispatch table defined in head.S */
+extern const uint64_t opal_branch_table[];
+
+/* Number of args expected for each call. */
+static const u8 opal_num_args[OPAL_LAST+1];
+
+/* OPAL anchor node */
+struct dt_node *opal_node;
+
+/* mask of dynamic vs fixed events; opal_allocate_dynamic_event will
+ * only allocate from this range */
+static const uint64_t opal_dynamic_events_mask = 0xffffffff00000000ul;
+static uint64_t opal_dynamic_events;
+
+extern uint32_t attn_trigger;
+extern uint32_t hir_trigger;
+
+
+void opal_table_init(void)
+{
+ struct opal_table_entry *s = __opal_table_start;
+ struct opal_table_entry *e = __opal_table_end;
+
+ prlog(PR_DEBUG, "OPAL table: %p .. %p, branch table: %p\n",
+ s, e, opal_branch_table);
+ while(s < e) {
+ ((uint64_t *)opal_branch_table)[s->token] = function_entry_address(s->func);
+ ((u8 *)opal_num_args)[s->token] = s->nargs;
+ s++;
+ }
+}
+
+/* Called from head.S, thus no prototype */
+long opal_bad_token(uint64_t token);
+
+long opal_bad_token(uint64_t token)
+{
+ /**
+ * @fwts-label OPALBadToken
+ * @fwts-advice OPAL was called with a bad token. On POWER8 and
+ * earlier, Linux kernels had a bug where they wouldn't check
+ * if firmware supported particular OPAL calls before making them.
+ * It is, in fact, harmless for these cases. On systems newer than
+ * POWER8, this should never happen and indicates a kernel bug
+ * where OPAL_CHECK_TOKEN isn't being called where it should be.
+ */
+ prlog(PR_ERR, "OPAL: Called with bad token %lld !\n", token);
+
+ return OPAL_PARAMETER;
+}
+
+#ifdef OPAL_TRACE_ENTRY
+static void opal_trace_entry(struct stack_frame *eframe __unused)
+{
+ union trace t;
+ unsigned nargs, i;
+
+ if (eframe->gpr[0] > OPAL_LAST)
+ nargs = 0;
+ else
+ nargs = opal_num_args[eframe->gpr[0]];
+
+ t.opal.token = cpu_to_be64(eframe->gpr[0]);
+ t.opal.lr = cpu_to_be64(eframe->lr);
+ t.opal.sp = cpu_to_be64(eframe->gpr[1]);
+ for(i=0; i<nargs; i++)
+ t.opal.r3_to_11[i] = cpu_to_be64(eframe->gpr[3+i]);
+
+ trace_add(&t, TRACE_OPAL, offsetof(struct trace_opal, r3_to_11[nargs]));
+}
+#endif
+
+/*
+ * opal_quiesce_state is used as a lock. Don't use an actual lock to avoid
+ * lock busting.
+ */
+static uint32_t opal_quiesce_state; /* 0 or QUIESCE_HOLD/QUIESCE_REJECT */
+static int32_t opal_quiesce_owner; /* PIR */
+static int32_t opal_quiesce_target; /* -1 or PIR */
+
+static int64_t opal_check_token(uint64_t token);
+
+/* Called from head.S, thus no prototype */
+int64_t opal_entry_check(struct stack_frame *eframe);
+
+int64_t opal_entry_check(struct stack_frame *eframe)
+{
+ struct cpu_thread *cpu = this_cpu();
+ uint64_t token = eframe->gpr[0];
+
+ if (cpu->pir != mfspr(SPR_PIR)) {
+ printf("CPU MISMATCH ! PIR=%04lx cpu @%p -> pir=%04x token=%llu\n",
+ mfspr(SPR_PIR), cpu, cpu->pir, token);
+ abort();
+ }
+
+#ifdef OPAL_TRACE_ENTRY
+ opal_trace_entry(eframe);
+#endif
+
+ if (!opal_check_token(token))
+ return opal_bad_token(token);
+
+ if (!opal_quiesce_state && cpu->in_opal_call > 1) {
+ disable_fast_reboot("Kernel re-entered OPAL");
+ switch (token) {
+ case OPAL_CONSOLE_READ:
+ case OPAL_CONSOLE_WRITE:
+ case OPAL_CONSOLE_WRITE_BUFFER_SPACE:
+ case OPAL_CONSOLE_FLUSH:
+ case OPAL_POLL_EVENTS:
+ case OPAL_CHECK_TOKEN:
+ case OPAL_CEC_REBOOT:
+ case OPAL_CEC_REBOOT2:
+ case OPAL_SIGNAL_SYSTEM_RESET:
+ break;
+ default:
+ printf("CPU ATTEMPT TO RE-ENTER FIRMWARE! PIR=%04lx cpu @%p -> pir=%04x token=%llu\n",
+ mfspr(SPR_PIR), cpu, cpu->pir, token);
+ if (cpu->in_opal_call > 2) {
+ printf("Emergency stack is destroyed, can't continue.\n");
+ abort();
+ }
+ return OPAL_INTERNAL_ERROR;
+ }
+ }
+
+ cpu->entered_opal_call_at = mftb();
+ return OPAL_SUCCESS;
+}
+
+int64_t opal_exit_check(int64_t retval, struct stack_frame *eframe);
+
+int64_t opal_exit_check(int64_t retval, struct stack_frame *eframe)
+{
+ struct cpu_thread *cpu = this_cpu();
+ uint64_t token = eframe->gpr[0];
+ uint64_t now = mftb();
+ uint64_t call_time = tb_to_msecs(now - cpu->entered_opal_call_at);
+
+ if (!cpu->in_opal_call) {
+ disable_fast_reboot("Un-accounted firmware entry");
+ printf("CPU UN-ACCOUNTED FIRMWARE ENTRY! PIR=%04lx cpu @%p -> pir=%04x token=%llu retval=%lld\n",
+ mfspr(SPR_PIR), cpu, cpu->pir, token, retval);
+ cpu->in_opal_call++; /* avoid exit path underflowing */
+ } else {
+ if (cpu->in_opal_call > 2) {
+ printf("Emergency stack is destroyed, can't continue.\n");
+ abort();
+ }
+ if (!list_empty(&cpu->locks_held)) {
+ prlog(PR_ERR, "OPAL exiting with locks held, pir=%04x token=%llu retval=%lld\n",
+ cpu->pir, token, retval);
+ drop_my_locks(true);
+ }
+ }
+
+ if (call_time > 100 && token != OPAL_RESYNC_TIMEBASE) {
+ prlog((call_time < 1000) ? PR_DEBUG : PR_WARNING,
+ "Spent %llu msecs in OPAL call %llu!\n",
+ call_time, token);
+ }
+
+ cpu->current_token = 0;
+
+ return retval;
+}
+
+int64_t opal_quiesce(uint32_t quiesce_type, int32_t cpu_target)
+{
+ struct cpu_thread *cpu = this_cpu();
+ struct cpu_thread *target = NULL;
+ struct cpu_thread *c;
+ uint64_t end;
+ bool stuck = false;
+
+ if (cpu_target >= 0) {
+ target = find_cpu_by_server(cpu_target);
+ if (!target)
+ return OPAL_PARAMETER;
+ } else if (cpu_target != -1) {
+ return OPAL_PARAMETER;
+ }
+
+ if (quiesce_type == QUIESCE_HOLD || quiesce_type == QUIESCE_REJECT) {
+ if (cmpxchg32(&opal_quiesce_state, 0, quiesce_type) != 0) {
+ if (opal_quiesce_owner != cpu->pir) {
+ /*
+ * Nested is allowed for now just for
+ * internal uses, so an error is returned
+ * for OS callers, but no error message
+ * printed if we are nested.
+ */
+ printf("opal_quiesce already quiescing\n");
+ }
+ return OPAL_BUSY;
+ }
+ opal_quiesce_owner = cpu->pir;
+ opal_quiesce_target = cpu_target;
+ }
+
+ if (opal_quiesce_owner != cpu->pir) {
+ printf("opal_quiesce CPU does not own quiesce state (must call QUIESCE_HOLD or QUIESCE_REJECT)\n");
+ return OPAL_BUSY;
+ }
+
+ /* Okay now we own the quiesce state */
+
+ if (quiesce_type == QUIESCE_RESUME ||
+ quiesce_type == QUIESCE_RESUME_FAST_REBOOT) {
+ bust_locks = false;
+ sync(); /* release barrier vs opal entry */
+ if (target) {
+ target->quiesce_opal_call = 0;
+ } else {
+ for_each_cpu(c) {
+ if (quiesce_type == QUIESCE_RESUME_FAST_REBOOT)
+ c->in_opal_call = 0;
+
+ if (c == cpu) {
+ assert(!c->quiesce_opal_call);
+ continue;
+ }
+ c->quiesce_opal_call = 0;
+ }
+ }
+ sync();
+ opal_quiesce_state = 0;
+ return OPAL_SUCCESS;
+ }
+
+ if (quiesce_type == QUIESCE_LOCK_BREAK) {
+ if (opal_quiesce_target != -1) {
+ printf("opal_quiesce has not quiesced all CPUs (must target -1)\n");
+ return OPAL_BUSY;
+ }
+ bust_locks = true;
+ return OPAL_SUCCESS;
+ }
+
+ if (target) {
+ target->quiesce_opal_call = quiesce_type;
+ } else {
+ for_each_cpu(c) {
+ if (c == cpu)
+ continue;
+ c->quiesce_opal_call = quiesce_type;
+ }
+ }
+
+ sync(); /* Order stores to quiesce_opal_call vs loads of in_opal_call */
+
+ end = mftb() + msecs_to_tb(1000);
+
+ smt_lowest();
+ if (target) {
+ while (target->in_opal_call) {
+ if (tb_compare(mftb(), end) == TB_AAFTERB) {
+ printf("OPAL quiesce CPU:%04x stuck in OPAL\n", target->pir);
+ stuck = true;
+ break;
+ }
+ barrier();
+ }
+ } else {
+ for_each_cpu(c) {
+ if (c == cpu)
+ continue;
+ while (c->in_opal_call) {
+ if (tb_compare(mftb(), end) == TB_AAFTERB) {
+ printf("OPAL quiesce CPU:%04x stuck in OPAL\n", c->pir);
+ stuck = true;
+ break;
+ }
+ barrier();
+ }
+ }
+ }
+ smt_medium();
+ sync(); /* acquire barrier vs opal entry */
+
+ if (stuck) {
+ printf("OPAL quiesce could not kick all CPUs out of OPAL\n");
+ return OPAL_PARTIAL;
+ }
+
+ return OPAL_SUCCESS;
+}
+opal_call(OPAL_QUIESCE, opal_quiesce, 2);
+
+void __opal_register(uint64_t token, void *func, unsigned int nargs)
+{
+ assert(token <= OPAL_LAST);
+
+ ((uint64_t *)opal_branch_table)[token] = function_entry_address(func);
+ ((u8 *)opal_num_args)[token] = nargs;
+}
+
+/*
+ * add_opal_firmware_exports_node: adds properties to the device-tree which
+ * the OS will then change into sysfs nodes.
+ * The properties must be placed under /ibm,opal/firmware/exports.
+ * The new sysfs nodes are created under /opal/exports.
+ * To be correctly exported the properties must contain:
+ * name
+ * base memory location (u64)
+ * size (u64)
+ */
+static void add_opal_firmware_exports_node(struct dt_node *node)
+{
+ struct dt_node *exports = dt_new(node, "exports");
+ uint64_t sym_start = (uint64_t)__sym_map_start;
+ uint64_t sym_size = (uint64_t)__sym_map_end - sym_start;
+
+ /*
+ * These property names will be used by Linux as the user-visible file
+ * name, so make them meaningful if possible. We use _ as the separator
+ * here to remain consistent with existing file names in /sys/opal.
+ */
+ dt_add_property_u64s(exports, "symbol_map", sym_start, sym_size);
+ dt_add_property_u64s(exports, "hdat_map", SPIRA_HEAP_BASE,
+ SPIRA_HEAP_SIZE);
+#ifdef SKIBOOT_GCOV
+ dt_add_property_u64s(exports, "gcov", SKIBOOT_BASE,
+ HEAP_BASE - SKIBOOT_BASE);
+#endif
+}
+
+static void add_opal_firmware_node(void)
+{
+ struct dt_node *firmware = dt_new(opal_node, "firmware");
+ uint64_t sym_start = (uint64_t)__sym_map_start;
+ uint64_t sym_size = (uint64_t)__sym_map_end - sym_start;
+
+ dt_add_property_string(firmware, "compatible", "ibm,opal-firmware");
+ dt_add_property_string(firmware, "name", "firmware");
+ dt_add_property_string(firmware, "version", version);
+ /*
+ * As previous OS versions use symbol-map located at
+ * /ibm,opal/firmware we will keep a copy of symbol-map here
+ * for backwards compatibility
+ */
+ dt_add_property_u64s(firmware, "symbol-map", sym_start, sym_size);
+
+ add_opal_firmware_exports_node(firmware);
+}
+
+void add_opal_node(void)
+{
+ uint64_t base, entry, size;
+ extern uint32_t opal_entry;
+ extern uint32_t boot_entry;
+ struct dt_node *opal_event;
+
+ /* XXX TODO: Reorg this. We should create the base OPAL
+ * node early on, and have the various sub modules populate
+ * their own entries (console etc...)
+ *
+ * The logic of which console backend to use should be
+ * extracted
+ */
+
+ entry = (uint64_t)&opal_entry;
+ base = SKIBOOT_BASE;
+ size = (CPU_STACKS_BASE +
+ (uint64_t)(cpu_max_pir + 1) * STACK_SIZE) - SKIBOOT_BASE;
+
+ opal_node = dt_new_check(dt_root, "ibm,opal");
+ dt_add_property_cells(opal_node, "#address-cells", 0);
+ dt_add_property_cells(opal_node, "#size-cells", 0);
+
+ if (proc_gen < proc_gen_p9)
+ dt_add_property_strings(opal_node, "compatible", "ibm,opal-v2",
+ "ibm,opal-v3");
+ else
+ dt_add_property_strings(opal_node, "compatible", "ibm,opal-v3");
+
+ dt_add_property_cells(opal_node, "opal-msg-async-num", OPAL_MAX_ASYNC_COMP);
+ dt_add_property_cells(opal_node, "opal-msg-size", OPAL_MSG_SIZE);
+ dt_add_property_u64(opal_node, "opal-base-address", base);
+ dt_add_property_u64(opal_node, "opal-entry-address", entry);
+ dt_add_property_u64(opal_node, "opal-boot-address", (uint64_t)&boot_entry);
+ dt_add_property_u64(opal_node, "opal-runtime-size", size);
+
+ /* Add irqchip interrupt controller */
+ opal_event = dt_new(opal_node, "event");
+ dt_add_property_strings(opal_event, "compatible", "ibm,opal-event");
+ dt_add_property_cells(opal_event, "#interrupt-cells", 0x1);
+ dt_add_property(opal_event, "interrupt-controller", NULL, 0);
+
+ add_opal_firmware_node();
+ add_associativity_ref_point();
+ memcons_add_properties();
+}
+
+static struct lock evt_lock = LOCK_UNLOCKED;
+
+void opal_update_pending_evt(uint64_t evt_mask, uint64_t evt_values)
+{
+ uint64_t new_evts;
+
+ lock(&evt_lock);
+ new_evts = (opal_pending_events & ~evt_mask) | evt_values;
+ if (opal_pending_events != new_evts) {
+ uint64_t tok;
+
+#ifdef OPAL_TRACE_EVT_CHG
+ printf("OPAL: Evt change: 0x%016llx -> 0x%016llx\n",
+ opal_pending_events, new_evts);
+#endif
+ /*
+ * If an event gets *set* while we are in a different call chain
+ * than opal_handle_interrupt() or opal_handle_hmi(), then we
+ * artificially generate an interrupt (OCC interrupt specifically)
+		 * to ensure that Linux properly broadcasts the event change internally
+ */
+ if ((new_evts & ~opal_pending_events) != 0) {
+ tok = this_cpu()->current_token;
+ if (tok != OPAL_HANDLE_INTERRUPT && tok != OPAL_HANDLE_HMI)
+ occ_send_dummy_interrupt();
+ }
+ opal_pending_events = new_evts;
+ }
+ unlock(&evt_lock);
+}
+
+uint64_t opal_dynamic_event_alloc(void)
+{
+ uint64_t new_event;
+ int n;
+
+ lock(&evt_lock);
+
+ /* Create the event mask. This set-bit will be within the event mask
+ * iff there are free events, or out of the mask if there are no free
+ * events. If opal_dynamic_events is all ones (ie, all events are
+ * dynamic, and allocated), then ilog2 will return -1, and we'll have a
+ * zero mask.
+ */
+ n = ilog2(~opal_dynamic_events);
+ new_event = 1ull << n;
+
+ /* Ensure we're still within the allocatable dynamic events range */
+ if (new_event & opal_dynamic_events_mask)
+ opal_dynamic_events |= new_event;
+ else
+ new_event = 0;
+
+ unlock(&evt_lock);
+ return new_event;
+}
+
+void opal_dynamic_event_free(uint64_t event)
+{
+ lock(&evt_lock);
+ opal_dynamic_events &= ~event;
+ unlock(&evt_lock);
+}
+
+static uint64_t opal_test_func(uint64_t arg)
+{
+ printf("OPAL: Test function called with arg 0x%llx\n", arg);
+
+ return 0xfeedf00d;
+}
+opal_call(OPAL_TEST, opal_test_func, 1);
+
+struct opal_poll_entry {
+ struct list_node link;
+ void (*poller)(void *data);
+ void *data;
+};
+
+static struct list_head opal_pollers = LIST_HEAD_INIT(opal_pollers);
+static struct lock opal_poll_lock = LOCK_UNLOCKED;
+
+void opal_add_poller(void (*poller)(void *data), void *data)
+{
+ struct opal_poll_entry *ent;
+
+ ent = zalloc(sizeof(struct opal_poll_entry));
+ assert(ent);
+ ent->poller = poller;
+ ent->data = data;
+ lock(&opal_poll_lock);
+ list_add_tail(&opal_pollers, &ent->link);
+ unlock(&opal_poll_lock);
+}
+
+void opal_del_poller(void (*poller)(void *data))
+{
+ struct opal_poll_entry *ent;
+
+ /* XXX This is currently unused. To solve various "interesting"
+ * locking issues, the pollers are run locklessly, so if we were
+ * to free them, we would have to be careful, using something
+ * akin to RCU to synchronize with other OPAL entries. For now
+ * if anybody uses it, print a warning and leak the entry, don't
+ * free it.
+ */
+ /**
+ * @fwts-label UnsupportedOPALdelpoller
+ * @fwts-advice Currently removing a poller is DANGEROUS and
+ * MUST NOT be done in production firmware.
+ */
+ prlog(PR_ALERT, "WARNING: Unsupported opal_del_poller."
+ " Interesting locking issues, don't call this.\n");
+
+ lock(&opal_poll_lock);
+ list_for_each(&opal_pollers, ent, link) {
+ if (ent->poller == poller) {
+ list_del(&ent->link);
+ /* free(ent); */
+ break;
+ }
+ }
+ unlock(&opal_poll_lock);
+}
+
+void opal_run_pollers(void)
+{
+ static int pollers_with_lock_warnings = 0;
+ static int poller_recursion = 0;
+ struct opal_poll_entry *poll_ent;
+ bool was_in_poller;
+
+ /* Don't re-enter on this CPU, unless it was an OPAL re-entry */
+ if (this_cpu()->in_opal_call == 1 && this_cpu()->in_poller) {
+
+ /**
+ * @fwts-label OPALPollerRecursion
+ * @fwts-advice Recursion detected in opal_run_pollers(). This
+ * indicates a bug in OPAL where a poller ended up running
+ * pollers, which doesn't lead anywhere good.
+ */
+ poller_recursion++;
+ if (poller_recursion <= 16) {
+ disable_fast_reboot("Poller recursion detected.");
+ prlog(PR_ERR, "OPAL: Poller recursion detected.\n");
+ backtrace();
+
+ }
+
+ if (poller_recursion == 16)
+ prlog(PR_ERR, "OPAL: Squashing future poller recursion warnings (>16).\n");
+
+ return;
+ }
+ was_in_poller = this_cpu()->in_poller;
+ this_cpu()->in_poller = true;
+
+ if (!list_empty(&this_cpu()->locks_held) && pollers_with_lock_warnings < 64) {
+ /**
+ * @fwts-label OPALPollerWithLock
+ * @fwts-advice opal_run_pollers() was called with a lock
+ * held, which could lead to deadlock if not excessively
+ * lucky/careful.
+ */
+ prlog(PR_ERR, "Running pollers with lock held !\n");
+ dump_locks_list();
+ backtrace();
+ pollers_with_lock_warnings++;
+ if (pollers_with_lock_warnings == 64) {
+ /**
+ * @fwts-label OPALPollerWithLock64
+ * @fwts-advice Your firmware is buggy, see the 64
+ * messages complaining about opal_run_pollers with
+ * lock held.
+ */
+ prlog(PR_ERR, "opal_run_pollers with lock run 64 "
+ "times, disabling warning.\n");
+ }
+ }
+
+ /* We run the timers first */
+ check_timers(false);
+
+	/* The pollers are run locklessly, see comment in opal_del_poller */
+ list_for_each(&opal_pollers, poll_ent, link)
+ poll_ent->poller(poll_ent->data);
+
+ /* Disable poller flag */
+ this_cpu()->in_poller = was_in_poller;
+
+ /* On debug builds, print max stack usage */
+ check_stacks();
+}
+
+static int64_t opal_poll_events(__be64 *outstanding_event_mask)
+{
+
+ if (!opal_addr_valid(outstanding_event_mask))
+ return OPAL_PARAMETER;
+
+ /* Check if we need to trigger an attn for test use */
+ if (attn_trigger == 0xdeadbeef) {
+ prlog(PR_EMERG, "Triggering attn\n");
+ assert(false);
+ }
+
+ opal_run_pollers();
+
+ if (outstanding_event_mask)
+ *outstanding_event_mask = cpu_to_be64(opal_pending_events);
+
+ return OPAL_SUCCESS;
+}
+opal_call(OPAL_POLL_EVENTS, opal_poll_events, 1);
+
+static int64_t opal_check_token(uint64_t token)
+{
+ if (token > OPAL_LAST)
+ return OPAL_TOKEN_ABSENT;
+
+ if (opal_branch_table[token])
+ return OPAL_TOKEN_PRESENT;
+
+ return OPAL_TOKEN_ABSENT;
+}
+opal_call(OPAL_CHECK_TOKEN, opal_check_token, 1);
+
+struct opal_sync_entry {
+ struct list_node link;
+ bool (*notify)(void *data);
+ void *data;
+};
+
+static struct list_head opal_syncers = LIST_HEAD_INIT(opal_syncers);
+
+void opal_add_host_sync_notifier(bool (*notify)(void *data), void *data)
+{
+ struct opal_sync_entry *ent;
+
+ ent = zalloc(sizeof(struct opal_sync_entry));
+ assert(ent);
+ ent->notify = notify;
+ ent->data = data;
+ list_add_tail(&opal_syncers, &ent->link);
+}
+
+/*
+ * Remove a host sync notifier for given callback and data
+ */
+void opal_del_host_sync_notifier(bool (*notify)(void *data), void *data)
+{
+ struct opal_sync_entry *ent;
+
+ list_for_each(&opal_syncers, ent, link) {
+ if (ent->notify == notify && ent->data == data) {
+ list_del(&ent->link);
+ free(ent);
+ return;
+ }
+ }
+}
+
+/*
+ * OPAL call to handle host kexec'ing scenario
+ */
+static int64_t opal_sync_host_reboot(void)
+{
+ struct opal_sync_entry *ent, *nxt;
+ int ret = OPAL_SUCCESS;
+
+ list_for_each_safe(&opal_syncers, ent, nxt, link)
+ if (! ent->notify(ent->data))
+ ret = OPAL_BUSY_EVENT;
+
+ return ret;
+}
+opal_call(OPAL_SYNC_HOST_REBOOT, opal_sync_host_reboot, 0);
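
Annotation (not part of the patch): opal_dynamic_event_alloc() above picks the highest clear bit of the allocation word (ilog2 of the complement) and accepts it only if it lands inside the dynamic range. Below is a small sketch of that bit allocation; ilog2_u64() is open-coded here with a GCC builtin since skiboot's ilog2() is not available outside the tree.

#include <stdint.h>
#include <stdio.h>

/* Dynamic events live in the upper 32 bits, as in the code above. */
static const uint64_t dyn_mask = 0xffffffff00000000ull;
static uint64_t dyn_events;

/* Highest set bit index, or -1 if x is zero (mirrors skiboot's ilog2()). */
static int ilog2_u64(uint64_t x)
{
	if (!x)
		return -1;
	return 63 - __builtin_clzll(x);
}

static uint64_t event_alloc(void)
{
	int n = ilog2_u64(~dyn_events);
	uint64_t bit = (n < 0) ? 0 : 1ull << n;

	/* Only hand out bits from the dynamic range. */
	if (bit & dyn_mask) {
		dyn_events |= bit;
		return bit;
	}
	return 0;
}

int main(void)
{
	printf("first  = 0x%016llx\n", (unsigned long long)event_alloc());
	printf("second = 0x%016llx\n", (unsigned long long)event_alloc());
	return 0;
}

Run twice, this hands out 0x8000000000000000 and then 0x4000000000000000, working down from the top of the dynamic mask.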
diff --git a/roms/skiboot/core/pci-dt-slot.c b/roms/skiboot/core/pci-dt-slot.c
new file mode 100644
index 000000000..2441bf940
--- /dev/null
+++ b/roms/skiboot/core/pci-dt-slot.c
@@ -0,0 +1,212 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * PCI slots in the device tree.
+ *
+ * Copyright 2017-2018 IBM Corp.
+ */
+
+#include <stdarg.h>
+#include <stdbool.h>
+#include <stdint.h>
+
+#include <skiboot.h>
+#include <device.h>
+
+#include <pci.h>
+#include <pci-cfg.h>
+#include <pci-slot.h>
+#include <ccan/list/list.h>
+
+#undef pr_fmt
+#define pr_fmt(fmt) "DT-SLOT: " fmt
+
+struct dt_node *dt_slots;
+
+static struct dt_node *map_phb_to_slot(struct phb *phb)
+{
+ uint32_t chip_id = dt_get_chip_id(phb->dt_node);
+ uint32_t phb_idx = dt_prop_get_u32_def(phb->dt_node,
+ "ibm,phb-index", 0);
+ struct dt_node *slot_node;
+
+ if (!dt_slots)
+ dt_slots = dt_find_by_path(dt_root, "/ibm,pcie-slots");
+
+ if (!dt_slots)
+ return NULL;
+
+ dt_for_each_child(dt_slots, slot_node) {
+ u32 reg[2];
+
+ if (!dt_node_is_compatible(slot_node, "ibm,pcie-root-port"))
+ continue;
+
+ reg[0] = dt_prop_get_cell(slot_node, "reg", 0);
+ reg[1] = dt_prop_get_cell(slot_node, "reg", 1);
+
+ if (reg[0] == chip_id && reg[1] == phb_idx)
+ return slot_node;
+ }
+
+ return NULL;
+}
+
+static struct dt_node *find_devfn(struct dt_node *bus, uint32_t bdfn)
+{
+ uint32_t port_dev_id = PCI_DEV(bdfn);
+ struct dt_node *child;
+
+ dt_for_each_child(bus, child)
+ if (dt_prop_get_u32_def(child, "reg", ~0u) == port_dev_id)
+ return child;
+
+ return NULL;
+}
+
+/* Looks for a device under this slot. */
+static struct dt_node *find_dev_under_slot(struct dt_node *slot,
+ struct pci_device *pd)
+{
+ struct dt_node *child, *wildcard = NULL;
+
+ /* find the device in the parent bus node */
+ dt_for_each_child(slot, child) {
+ u32 vdid;
+
+ /* "pluggable" and "builtin" without unit addrs are wildcards */
+ if (!dt_has_node_property(child, "reg", NULL)) {
+ if (wildcard)
+ prerror("Duplicate wildcard entry! Already have %s, found %s",
+ wildcard->name, child->name);
+
+ wildcard = child;
+ continue;
+ }
+
+ /* NB: the pci_device vdid is did,vid rather than vid,did */
+ vdid = dt_prop_get_cell(child, "reg", 1) << 16 |
+ dt_prop_get_cell(child, "reg", 0);
+
+ if (vdid == pd->vdid)
+ return child;
+ }
+
+ if (!wildcard)
+ PCIDBG(pd->phb, pd->bdfn,
+ "Unable to find a slot for device %.4x:%.4x\n",
+ (pd->vdid & 0xffff0000) >> 16, pd->vdid & 0xffff);
+
+ return wildcard;
+}
+
+/*
+ * If the `pd` is a bridge this returns a node with a compatible of
+ * ibm,pcie-port to indicate it's a "slot node".
+ */
+static struct dt_node *find_node_for_dev(struct phb *phb,
+ struct pci_device *pd)
+{
+ struct dt_node *sw_slot, *sw_up;
+
+ assert(pd);
+
+ if (pd->slot && pd->slot->data)
+ return pd->slot->data;
+
+ /*
+ * Example DT:
+ * /root-complex@8,5/switch-up@10b5,8725/down-port@4
+ */
+ switch (pd->dev_type) {
+ case PCIE_TYPE_ROOT_PORT: // find the root-complex@<chip>,<phb> node
+ return map_phb_to_slot(phb);
+
+ case PCIE_TYPE_SWITCH_DNPORT: // grab the down-port@<devfn>
+ /*
+ * Walk up the topology to find the slot that the switch
+ * upstream port is connected to. In the example
+ * this would be the root-complex@8,5 node.
+ */
+ sw_slot = find_node_for_dev(phb, pd->parent->parent);
+ if (!sw_slot)
+ return NULL;
+
+ /* find the per-device node for this switch */
+ sw_up = find_dev_under_slot(sw_slot, pd->parent);
+ if (!sw_up)
+ return NULL;
+
+ /* find this down port */
+ return find_devfn(sw_up, pd->bdfn);
+
+ default:
+ PCIDBG(phb, pd->bdfn,
+ "Trying to find a slot for non-pcie bridge type %d\n",
+ pd->dev_type);
+ assert(0);
+ }
+
+ return NULL;
+}
+
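+/*
+ * For illustration, the device-tree shape these walkers expect looks
+ * roughly like the sketch below. The values are made up; only the
+ * structure and the "reg"/"compatible" usage match the code above, and
+ * real platform entries carry additional properties.
+ *
+ *     ibm,pcie-slots {
+ *             root-complex@8,5 {
+ *                     compatible = "ibm,pcie-root-port";
+ *                     reg = <0x8 0x5>;        // chip-id, ibm,phb-index
+ *                     switch-up@10b5,8725 {
+ *                             reg = <0x10b5 0x8725>;  // vendor-id, device-id
+ *                             down-port@4 {
+ *                                     reg = <4>;      // PCI device number
+ *                             };
+ *                     };
+ *             };
+ *     };
+ *
+ * map_phb_to_slot() matches the root-complex node, find_dev_under_slot()
+ * matches the switch-up node by vendor/device ID (or falls back to a
+ * wildcard child without "reg"), and find_devfn() matches the down-port
+ * by device number.
+ */
+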
+struct dt_node *map_pci_dev_to_slot(struct phb *phb, struct pci_device *pd)
+{
+ struct dt_node *n;
+ char *path;
+
+ assert(pd);
+
+ /*
+ * Having a slot only makes sense for root and switch downstream ports.
+ * We don't care about PCI-X.
+ */
+ if (pd->dev_type != PCIE_TYPE_SWITCH_DNPORT &&
+ pd->dev_type != PCIE_TYPE_ROOT_PORT)
+ return NULL;
+
+ PCIDBG(phb, pd->bdfn, "Finding slot\n");
+
+ n = find_node_for_dev(phb, pd);
+ if (!n) {
+ PCIDBG(phb, pd->bdfn, "No slot found!\n");
+ } else {
+ path = dt_get_path(n);
+ PCIDBG(phb, pd->bdfn, "Slot found %s\n", path);
+ free(path);
+ }
+
+ return n;
+}
+
+int __print_slot(struct phb *phb, struct pci_device *pd, void *userdata);
+int __print_slot(struct phb *phb, struct pci_device *pd,
+ void __unused *userdata)
+{
+ struct dt_node *node;
+ struct dt_node *pnode;
+ char *c = NULL;
+ u32 phandle = 0;
+
+ if (!pd)
+ return 0;
+
+ node = map_pci_dev_to_slot(phb, pd);
+
+ /* at this point all node associations should be done */
+ if (pd->dn && dt_has_node_property(pd->dn, "ibm,pcie-slot", NULL)) {
+ phandle = dt_prop_get_u32(pd->dn, "ibm,pcie-slot");
+ pnode = dt_find_by_phandle(dt_root, phandle);
+
+ assert(node == pnode);
+ }
+
+ if (node)
+ c = dt_get_path(node);
+
+ PCIDBG(phb, pd->bdfn, "Mapped to slot %s (%x)\n",
+ c ? c : "<null>", phandle);
+
+ free(c);
+
+ return 0;
+}
diff --git a/roms/skiboot/core/pci-opal.c b/roms/skiboot/core/pci-opal.c
new file mode 100644
index 000000000..aa375c6aa
--- /dev/null
+++ b/roms/skiboot/core/pci-opal.c
@@ -0,0 +1,1135 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * PCIe OPAL Calls
+ *
+ * Copyright 2013-2019 IBM Corp.
+ */
+
+#include <skiboot.h>
+#include <opal-api.h>
+#include <pci.h>
+#include <pci-cfg.h>
+#include <pci-slot.h>
+#include <opal-msg.h>
+#include <timebase.h>
+#include <timer.h>
+
+#define OPAL_PCICFG_ACCESS_READ(op, cb, type) \
+static int64_t opal_pci_config_##op(uint64_t phb_id, \
+ uint64_t bus_dev_func, \
+ uint64_t offset, type data) \
+{ \
+ struct phb *phb = pci_get_phb(phb_id); \
+ int64_t rc; \
+ \
+ if (!opal_addr_valid((void *)data)) \
+ return OPAL_PARAMETER; \
+ \
+ if (!phb) \
+ return OPAL_PARAMETER; \
+ phb_lock(phb); \
+ rc = phb->ops->cfg_##cb(phb, bus_dev_func, offset, data); \
+ phb_unlock(phb); \
+ \
+ return rc; \
+}
+
+#define OPAL_PCICFG_ACCESS_WRITE(op, cb, type) \
+static int64_t opal_pci_config_##op(uint64_t phb_id, \
+ uint64_t bus_dev_func, \
+ uint64_t offset, type data) \
+{ \
+ struct phb *phb = pci_get_phb(phb_id); \
+ int64_t rc; \
+ \
+ if (!phb) \
+ return OPAL_PARAMETER; \
+ phb_lock(phb); \
+ rc = phb->ops->cfg_##cb(phb, bus_dev_func, offset, data); \
+ phb_unlock(phb); \
+ \
+ return rc; \
+}
+
+OPAL_PCICFG_ACCESS_READ(read_byte, read8, uint8_t *)
+OPAL_PCICFG_ACCESS_READ(read_half_word, read16, uint16_t *)
+OPAL_PCICFG_ACCESS_READ(read_word, read32, uint32_t *)
+OPAL_PCICFG_ACCESS_WRITE(write_byte, write8, uint8_t)
+OPAL_PCICFG_ACCESS_WRITE(write_half_word, write16, uint16_t)
+OPAL_PCICFG_ACCESS_WRITE(write_word, write32, uint32_t)
+
+static int64_t opal_pci_config_read_half_word_be(uint64_t phb_id,
+ uint64_t bus_dev_func,
+ uint64_t offset,
+ __be16 *__data)
+{
+ uint16_t data;
+ int64_t rc;
+
+ rc = opal_pci_config_read_half_word(phb_id, bus_dev_func, offset, &data);
+ *__data = cpu_to_be16(data);
+
+ return rc;
+}
+
+static int64_t opal_pci_config_read_word_be(uint64_t phb_id,
+ uint64_t bus_dev_func,
+ uint64_t offset,
+ __be32 *__data)
+{
+ uint32_t data;
+ int64_t rc;
+
+ rc = opal_pci_config_read_word(phb_id, bus_dev_func, offset, &data);
+ *__data = cpu_to_be32(data);
+
+ return rc;
+}
+
+
+opal_call(OPAL_PCI_CONFIG_READ_BYTE, opal_pci_config_read_byte, 4);
+opal_call(OPAL_PCI_CONFIG_READ_HALF_WORD, opal_pci_config_read_half_word_be, 4);
+opal_call(OPAL_PCI_CONFIG_READ_WORD, opal_pci_config_read_word_be, 4);
+opal_call(OPAL_PCI_CONFIG_WRITE_BYTE, opal_pci_config_write_byte, 4);
+opal_call(OPAL_PCI_CONFIG_WRITE_HALF_WORD, opal_pci_config_write_half_word, 4);
+opal_call(OPAL_PCI_CONFIG_WRITE_WORD, opal_pci_config_write_word, 4);
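+
+/*
+ * For reference, OPAL_PCICFG_ACCESS_READ(read_byte, read8, uint8_t *)
+ * above expands to roughly the following; the write variants only differ
+ * in passing the data by value and skipping the opal_addr_valid() check:
+ *
+ *     static int64_t opal_pci_config_read_byte(uint64_t phb_id,
+ *                                              uint64_t bus_dev_func,
+ *                                              uint64_t offset, uint8_t *data)
+ *     {
+ *             struct phb *phb = pci_get_phb(phb_id);
+ *             int64_t rc;
+ *
+ *             if (!opal_addr_valid((void *)data))
+ *                     return OPAL_PARAMETER;
+ *             if (!phb)
+ *                     return OPAL_PARAMETER;
+ *             phb_lock(phb);
+ *             rc = phb->ops->cfg_read8(phb, bus_dev_func, offset, data);
+ *             phb_unlock(phb);
+ *
+ *             return rc;
+ *     }
+ *
+ * The _be wrappers above are registered for the 16- and 32-bit reads so
+ * the value handed back through the caller's pointer is in big-endian
+ * byte order.
+ */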
+
+static struct lock opal_eeh_evt_lock = LOCK_UNLOCKED;
+static uint64_t opal_eeh_evt = 0;
+
+void opal_pci_eeh_set_evt(uint64_t phb_id)
+{
+ lock(&opal_eeh_evt_lock);
+ opal_eeh_evt |= 1ULL << phb_id;
+ opal_update_pending_evt(OPAL_EVENT_PCI_ERROR, OPAL_EVENT_PCI_ERROR);
+ unlock(&opal_eeh_evt_lock);
+}
+
+void opal_pci_eeh_clear_evt(uint64_t phb_id)
+{
+ lock(&opal_eeh_evt_lock);
+ opal_eeh_evt &= ~(1ULL << phb_id);
+ if (!opal_eeh_evt)
+ opal_update_pending_evt(OPAL_EVENT_PCI_ERROR, 0);
+ unlock(&opal_eeh_evt_lock);
+}
+
+static int64_t opal_pci_eeh_freeze_status(uint64_t phb_id, uint64_t pe_number,
+ uint8_t *freeze_state,
+ __be16 *__pci_error_type,
+ __be64 *__phb_status)
+{
+ struct phb *phb = pci_get_phb(phb_id);
+ uint16_t pci_error_type;
+ int64_t rc;
+
+ if (!opal_addr_valid(freeze_state) || !opal_addr_valid(__pci_error_type)
+ || !opal_addr_valid(__phb_status))
+ return OPAL_PARAMETER;
+
+ if (!phb)
+ return OPAL_PARAMETER;
+ if (!phb->ops->eeh_freeze_status)
+ return OPAL_UNSUPPORTED;
+ phb_lock(phb);
+
+ if (__phb_status)
+ prlog(PR_ERR, "PHB#%04llx: %s: deprecated PHB status\n",
+ phb_id, __func__);
+
+ rc = phb->ops->eeh_freeze_status(phb, pe_number, freeze_state,
+ &pci_error_type, NULL);
+ *__pci_error_type = cpu_to_be16(pci_error_type);
+ phb_unlock(phb);
+
+ return rc;
+}
+opal_call(OPAL_PCI_EEH_FREEZE_STATUS, opal_pci_eeh_freeze_status, 5);
+
+static int64_t opal_pci_eeh_freeze_clear(uint64_t phb_id, uint64_t pe_number,
+ uint64_t eeh_action_token)
+{
+ struct phb *phb = pci_get_phb(phb_id);
+ int64_t rc;
+
+ if (!phb)
+ return OPAL_PARAMETER;
+ if (!phb->ops->eeh_freeze_clear)
+ return OPAL_UNSUPPORTED;
+ phb_lock(phb);
+ rc = phb->ops->eeh_freeze_clear(phb, pe_number, eeh_action_token);
+ phb_unlock(phb);
+
+ return rc;
+}
+opal_call(OPAL_PCI_EEH_FREEZE_CLEAR, opal_pci_eeh_freeze_clear, 3);
+
+static int64_t opal_pci_eeh_freeze_set(uint64_t phb_id, uint64_t pe_number,
+ uint64_t eeh_action_token)
+{
+ struct phb *phb = pci_get_phb(phb_id);
+ int64_t rc;
+
+ if (!phb)
+ return OPAL_PARAMETER;
+ if (!phb->ops->eeh_freeze_set)
+ return OPAL_UNSUPPORTED;
+ phb_lock(phb);
+ rc = phb->ops->eeh_freeze_set(phb, pe_number, eeh_action_token);
+ phb_unlock(phb);
+
+ return rc;
+}
+opal_call(OPAL_PCI_EEH_FREEZE_SET, opal_pci_eeh_freeze_set, 3);
+
+static int64_t opal_pci_err_inject(uint64_t phb_id, uint64_t pe_number,
+ uint32_t type, uint32_t func,
+ uint64_t addr, uint64_t mask)
+{
+ struct phb *phb = pci_get_phb(phb_id);
+ int64_t rc;
+
+ if (!phb)
+ return OPAL_PARAMETER;
+ if (!phb->ops || !phb->ops->err_inject)
+ return OPAL_UNSUPPORTED;
+
+ if (type != OPAL_ERR_INJECT_TYPE_IOA_BUS_ERR &&
+ type != OPAL_ERR_INJECT_TYPE_IOA_BUS_ERR64)
+ return OPAL_PARAMETER;
+
+ phb_lock(phb);
+ rc = phb->ops->err_inject(phb, pe_number, type, func, addr, mask);
+ phb_unlock(phb);
+
+ return rc;
+}
+opal_call(OPAL_PCI_ERR_INJECT, opal_pci_err_inject, 6);
+
+static int64_t opal_pci_phb_mmio_enable(uint64_t phb_id, uint16_t window_type,
+ uint16_t window_num, uint16_t enable)
+{
+ struct phb *phb = pci_get_phb(phb_id);
+ int64_t rc;
+
+ if (!phb)
+ return OPAL_PARAMETER;
+ if (!phb->ops->phb_mmio_enable)
+ return OPAL_UNSUPPORTED;
+ phb_lock(phb);
+ rc = phb->ops->phb_mmio_enable(phb, window_type, window_num, enable);
+ phb_unlock(phb);
+
+ return rc;
+}
+opal_call(OPAL_PCI_PHB_MMIO_ENABLE, opal_pci_phb_mmio_enable, 4);
+
+static int64_t opal_pci_set_phb_mem_window(uint64_t phb_id,
+ uint16_t window_type,
+ uint16_t window_num,
+ uint64_t addr,
+ uint64_t pci_addr,
+ uint64_t size)
+{
+ struct phb *phb = pci_get_phb(phb_id);
+ int64_t rc;
+
+ if (!phb)
+ return OPAL_PARAMETER;
+ if (!phb->ops->set_phb_mem_window)
+ return OPAL_UNSUPPORTED;
+ phb_lock(phb);
+ rc = phb->ops->set_phb_mem_window(phb, window_type, window_num,
+ addr, pci_addr, size);
+ phb_unlock(phb);
+
+ return rc;
+}
+opal_call(OPAL_PCI_SET_PHB_MEM_WINDOW, opal_pci_set_phb_mem_window, 6);
+
+static int64_t opal_pci_map_pe_mmio_window(uint64_t phb_id, uint64_t pe_number,
+ uint16_t window_type,
+ uint16_t window_num,
+ uint16_t segment_num)
+{
+ struct phb *phb = pci_get_phb(phb_id);
+ int64_t rc;
+
+ if (!phb)
+ return OPAL_PARAMETER;
+ if (!phb->ops->map_pe_mmio_window)
+ return OPAL_UNSUPPORTED;
+ phb_lock(phb);
+ rc = phb->ops->map_pe_mmio_window(phb, pe_number, window_type,
+ window_num, segment_num);
+ phb_unlock(phb);
+
+ return rc;
+}
+opal_call(OPAL_PCI_MAP_PE_MMIO_WINDOW, opal_pci_map_pe_mmio_window, 5);
+
+static int64_t opal_pci_set_pe(uint64_t phb_id, uint64_t pe_number,
+ uint64_t bus_dev_func, uint8_t bus_compare,
+ uint8_t dev_compare, uint8_t func_compare,
+ uint8_t pe_action)
+{
+ struct phb *phb = pci_get_phb(phb_id);
+ int64_t rc;
+
+ if (!phb)
+ return OPAL_PARAMETER;
+ if (!phb->ops->set_pe)
+ return OPAL_UNSUPPORTED;
+ phb_lock(phb);
+ rc = phb->ops->set_pe(phb, pe_number, bus_dev_func, bus_compare,
+ dev_compare, func_compare, pe_action);
+ phb_unlock(phb);
+
+ return rc;
+}
+opal_call(OPAL_PCI_SET_PE, opal_pci_set_pe, 7);
+
+static int64_t opal_pci_set_peltv(uint64_t phb_id, uint32_t parent_pe,
+ uint32_t child_pe, uint8_t state)
+{
+ struct phb *phb = pci_get_phb(phb_id);
+ int64_t rc;
+
+ if (!phb)
+ return OPAL_PARAMETER;
+ if (!phb->ops->set_peltv)
+ return OPAL_UNSUPPORTED;
+ phb_lock(phb);
+ rc = phb->ops->set_peltv(phb, parent_pe, child_pe, state);
+ phb_unlock(phb);
+
+ return rc;
+}
+opal_call(OPAL_PCI_SET_PELTV, opal_pci_set_peltv, 4);
+
+static int64_t opal_pci_set_mve(uint64_t phb_id, uint32_t mve_number,
+ uint64_t pe_number)
+{
+ struct phb *phb = pci_get_phb(phb_id);
+ int64_t rc;
+
+ if (!phb)
+ return OPAL_PARAMETER;
+ if (!phb->ops->set_mve)
+ return OPAL_UNSUPPORTED;
+ phb_lock(phb);
+ rc = phb->ops->set_mve(phb, mve_number, pe_number);
+ phb_unlock(phb);
+
+ return rc;
+}
+opal_call(OPAL_PCI_SET_MVE, opal_pci_set_mve, 3);
+
+static int64_t opal_pci_set_mve_enable(uint64_t phb_id, uint32_t mve_number,
+ uint32_t state)
+{
+ struct phb *phb = pci_get_phb(phb_id);
+ int64_t rc;
+
+ if (!phb)
+ return OPAL_PARAMETER;
+ if (!phb->ops->set_mve_enable)
+ return OPAL_UNSUPPORTED;
+ phb_lock(phb);
+ rc = phb->ops->set_mve_enable(phb, mve_number, state);
+ phb_unlock(phb);
+
+ return rc;
+}
+opal_call(OPAL_PCI_SET_MVE_ENABLE, opal_pci_set_mve_enable, 3);
+
+static int64_t opal_pci_msi_eoi(uint64_t phb_id,
+ uint32_t hwirq)
+{
+ struct phb *phb = pci_get_phb(phb_id);
+ int64_t rc;
+
+ if (!phb)
+ return OPAL_PARAMETER;
+ if (!phb->ops->pci_msi_eoi)
+ return OPAL_UNSUPPORTED;
+ phb_lock(phb);
+ rc = phb->ops->pci_msi_eoi(phb, hwirq);
+ phb_unlock(phb);
+
+ return rc;
+}
+opal_call(OPAL_PCI_MSI_EOI, opal_pci_msi_eoi, 2);
+
+static int64_t opal_pci_tce_kill(uint64_t phb_id,
+ uint32_t kill_type,
+ uint64_t pe_number, uint32_t tce_size,
+ uint64_t dma_addr, uint32_t npages)
+{
+ struct phb *phb = pci_get_phb(phb_id);
+ int64_t rc;
+
+ if (!phb)
+ return OPAL_PARAMETER;
+ if (!phb->ops->tce_kill)
+ return OPAL_UNSUPPORTED;
+ phb_lock(phb);
+ rc = phb->ops->tce_kill(phb, kill_type, pe_number, tce_size,
+ dma_addr, npages);
+ phb_unlock(phb);
+
+ return rc;
+}
+opal_call(OPAL_PCI_TCE_KILL, opal_pci_tce_kill, 6);
+
+static int64_t opal_pci_set_xive_pe(uint64_t phb_id, uint64_t pe_number,
+ uint32_t xive_num)
+{
+ struct phb *phb = pci_get_phb(phb_id);
+ int64_t rc;
+
+ if (!phb)
+ return OPAL_PARAMETER;
+ if (!phb->ops->set_xive_pe)
+ return OPAL_UNSUPPORTED;
+ phb_lock(phb);
+ rc = phb->ops->set_xive_pe(phb, pe_number, xive_num);
+ phb_unlock(phb);
+
+ return rc;
+}
+opal_call(OPAL_PCI_SET_XIVE_PE, opal_pci_set_xive_pe, 3);
+
+static int64_t opal_get_msi_32(uint64_t phb_id, uint32_t mve_number,
+ uint32_t xive_num, uint8_t msi_range,
+ __be32 *__msi_address, __be32 *__message_data)
+{
+ struct phb *phb = pci_get_phb(phb_id);
+ uint32_t msi_address;
+ uint32_t message_data;
+ int64_t rc;
+
+ if (!opal_addr_valid(__msi_address) || !opal_addr_valid(__message_data))
+ return OPAL_PARAMETER;
+
+ if (!phb)
+ return OPAL_PARAMETER;
+ if (!phb->ops->get_msi_32)
+ return OPAL_UNSUPPORTED;
+ phb_lock(phb);
+ rc = phb->ops->get_msi_32(phb, mve_number, xive_num, msi_range,
+ &msi_address, &message_data);
+ phb_unlock(phb);
+
+ *__msi_address = cpu_to_be32(msi_address);
+ *__message_data = cpu_to_be32(message_data);
+
+ return rc;
+}
+opal_call(OPAL_GET_MSI_32, opal_get_msi_32, 6);
+
+static int64_t opal_get_msi_64(uint64_t phb_id, uint32_t mve_number,
+ uint32_t xive_num, uint8_t msi_range,
+ __be64 *__msi_address, __be32 *__message_data)
+{
+ struct phb *phb = pci_get_phb(phb_id);
+ uint64_t msi_address;
+ uint32_t message_data;
+ int64_t rc;
+
+ if (!opal_addr_valid(__msi_address) || !opal_addr_valid(__message_data))
+ return OPAL_PARAMETER;
+
+ if (!phb)
+ return OPAL_PARAMETER;
+ if (!phb->ops->get_msi_64)
+ return OPAL_UNSUPPORTED;
+ phb_lock(phb);
+ rc = phb->ops->get_msi_64(phb, mve_number, xive_num, msi_range,
+ &msi_address, &message_data);
+ phb_unlock(phb);
+
+ *__msi_address = cpu_to_be64(msi_address);
+ *__message_data = cpu_to_be32(message_data);
+
+ return rc;
+}
+opal_call(OPAL_GET_MSI_64, opal_get_msi_64, 6);
+
+static int64_t opal_pci_map_pe_dma_window(uint64_t phb_id, uint64_t pe_number,
+ uint16_t window_id,
+ uint16_t tce_levels,
+ uint64_t tce_table_addr,
+ uint64_t tce_table_size,
+ uint64_t tce_page_size)
+{
+ struct phb *phb = pci_get_phb(phb_id);
+ int64_t rc;
+
+ if (!phb)
+ return OPAL_PARAMETER;
+ if (!phb->ops->map_pe_dma_window)
+ return OPAL_UNSUPPORTED;
+ phb_lock(phb);
+ rc = phb->ops->map_pe_dma_window(phb, pe_number, window_id,
+ tce_levels, tce_table_addr,
+ tce_table_size, tce_page_size);
+ phb_unlock(phb);
+
+ return rc;
+}
+opal_call(OPAL_PCI_MAP_PE_DMA_WINDOW, opal_pci_map_pe_dma_window, 7);
+
+static int64_t opal_pci_map_pe_dma_window_real(uint64_t phb_id,
+ uint64_t pe_number,
+ uint16_t window_id,
+ uint64_t pci_start_addr,
+ uint64_t pci_mem_size)
+{
+ struct phb *phb = pci_get_phb(phb_id);
+ int64_t rc;
+
+ if (!phb)
+ return OPAL_PARAMETER;
+ if (!phb->ops->map_pe_dma_window_real)
+ return OPAL_UNSUPPORTED;
+ phb_lock(phb);
+ rc = phb->ops->map_pe_dma_window_real(phb, pe_number, window_id,
+ pci_start_addr, pci_mem_size);
+ phb_unlock(phb);
+
+ return rc;
+}
+opal_call(OPAL_PCI_MAP_PE_DMA_WINDOW_REAL, opal_pci_map_pe_dma_window_real, 5);
+
+static int64_t opal_phb_set_option(uint64_t phb_id, uint64_t opt,
+ uint64_t setting)
+{
+ struct phb *phb = pci_get_phb(phb_id);
+ int64_t rc;
+
+ if (!phb)
+ return OPAL_PARAMETER;
+
+ if (!phb->ops->set_option)
+ return OPAL_UNSUPPORTED;
+
+ phb_lock(phb);
+ rc = phb->ops->set_option(phb, opt, setting);
+ phb_unlock(phb);
+
+ return rc;
+}
+opal_call(OPAL_PHB_SET_OPTION, opal_phb_set_option, 3);
+
+static int64_t opal_phb_get_option(uint64_t phb_id, uint64_t opt,
+ __be64 *setting)
+{
+ struct phb *phb = pci_get_phb(phb_id);
+ int64_t rc;
+
+ if (!phb || !setting)
+ return OPAL_PARAMETER;
+
+ if (!phb->ops->get_option)
+ return OPAL_UNSUPPORTED;
+
+ phb_lock(phb);
+ rc = phb->ops->get_option(phb, opt, setting);
+ phb_unlock(phb);
+
+ return rc;
+}
+opal_call(OPAL_PHB_GET_OPTION, opal_phb_get_option, 3);
+
+static int64_t opal_pci_reset(uint64_t id, uint8_t reset_scope,
+ uint8_t assert_state)
+{
+ struct pci_slot *slot = pci_slot_find(id);
+ struct phb *phb = slot ? slot->phb : NULL;
+ int64_t rc = OPAL_SUCCESS;
+
+ if (!slot || !phb)
+ return OPAL_PARAMETER;
+ if (assert_state != OPAL_ASSERT_RESET &&
+ assert_state != OPAL_DEASSERT_RESET)
+ return OPAL_PARAMETER;
+
+ phb_lock(phb);
+
+ switch(reset_scope) {
+ case OPAL_RESET_PHB_COMPLETE:
+ /* Complete reset is applicable to PHB slot only */
+ if (!slot->ops.creset || slot->pd) {
+ rc = OPAL_UNSUPPORTED;
+ break;
+ }
+
+ if (assert_state != OPAL_ASSERT_RESET)
+ break;
+
+ rc = slot->ops.creset(slot);
+ if (rc < 0)
+ prlog(PR_ERR, "SLOT-%016llx: Error %lld on complete reset\n",
+ slot->id, rc);
+ break;
+ case OPAL_RESET_PCI_FUNDAMENTAL:
+ if (!slot->ops.freset) {
+ rc = OPAL_UNSUPPORTED;
+ break;
+ }
+
+ /* Nothing to do at deassert time */
+ if (assert_state != OPAL_ASSERT_RESET)
+ break;
+
+ rc = slot->ops.freset(slot);
+ if (rc < 0)
+ prlog(PR_ERR, "SLOT-%016llx: Error %lld on fundamental reset\n",
+ slot->id, rc);
+ break;
+ case OPAL_RESET_PCI_HOT:
+ if (!slot->ops.hreset) {
+ rc = OPAL_UNSUPPORTED;
+ break;
+ }
+
+ /* Nothing to do at deassert time */
+ if (assert_state != OPAL_ASSERT_RESET)
+ break;
+
+ rc = slot->ops.hreset(slot);
+ if (rc < 0)
+ prlog(PR_ERR, "SLOT-%016llx: Error %lld on hot reset\n",
+ slot->id, rc);
+ break;
+ case OPAL_RESET_PCI_IODA_TABLE:
+ /* It's allowed on PHB slot only */
+ if (slot->pd || !phb->ops || !phb->ops->ioda_reset) {
+ rc = OPAL_UNSUPPORTED;
+ break;
+ }
+
+ if (assert_state != OPAL_ASSERT_RESET)
+ break;
+
+ rc = phb->ops->ioda_reset(phb, true);
+ break;
+ case OPAL_RESET_PHB_ERROR:
+ /* It's allowed on PHB slot only */
+ if (slot->pd || !phb->ops || !phb->ops->papr_errinjct_reset) {
+ rc = OPAL_UNSUPPORTED;
+ break;
+ }
+
+ if (assert_state != OPAL_ASSERT_RESET)
+ break;
+
+ rc = phb->ops->papr_errinjct_reset(phb);
+ break;
+ default:
+ rc = OPAL_UNSUPPORTED;
+ }
+ phb_unlock(phb);
+
+ return (rc > 0) ? tb_to_msecs(rc) : rc;
+}
+opal_call(OPAL_PCI_RESET, opal_pci_reset, 3);
+
+static int64_t opal_pci_reinit(uint64_t phb_id,
+ uint64_t reinit_scope,
+ uint64_t data)
+{
+ struct phb *phb = pci_get_phb(phb_id);
+ int64_t rc;
+
+ if (!phb)
+ return OPAL_PARAMETER;
+ if (!phb->ops || !phb->ops->pci_reinit)
+ return OPAL_UNSUPPORTED;
+
+ phb_lock(phb);
+ rc = phb->ops->pci_reinit(phb, reinit_scope, data);
+ phb_unlock(phb);
+
+ return rc;
+}
+opal_call(OPAL_PCI_REINIT, opal_pci_reinit, 3);
+
+static int64_t opal_pci_poll(uint64_t id)
+{
+ struct pci_slot *slot = pci_slot_find(id);
+ struct phb *phb = slot ? slot->phb : NULL;
+ int64_t rc;
+
+ if (!slot || !phb)
+ return OPAL_PARAMETER;
+ if (!slot->ops.run_sm)
+ return OPAL_UNSUPPORTED;
+
+ phb_lock(phb);
+ rc = slot->ops.run_sm(slot);
+ phb_unlock(phb);
+
+ /* Return milliseconds for caller to sleep: round up */
+ if (rc > 0) {
+ rc = tb_to_msecs(rc);
+ if (rc == 0)
+ rc = 1;
+ }
+
+ return rc;
+}
+opal_call(OPAL_PCI_POLL, opal_pci_poll, 1);
+
+static int64_t opal_pci_get_presence_state(uint64_t id, uint64_t data)
+{
+ struct pci_slot *slot = pci_slot_find(id);
+ struct phb *phb = slot ? slot->phb : NULL;
+ uint8_t *presence = (uint8_t *)data;
+ int64_t rc;
+
+ if (!opal_addr_valid(presence))
+ return OPAL_PARAMETER;
+
+ if (!slot || !phb)
+ return OPAL_PARAMETER;
+ if (!slot->ops.get_presence_state)
+ return OPAL_UNSUPPORTED;
+
+ phb_lock(phb);
+ rc = slot->ops.get_presence_state(slot, presence);
+ phb_unlock(phb);
+
+ return rc;
+}
+opal_call(OPAL_PCI_GET_PRESENCE_STATE, opal_pci_get_presence_state, 2);
+
+static int64_t opal_pci_get_power_state(uint64_t id, uint64_t data)
+{
+ struct pci_slot *slot = pci_slot_find(id);
+ struct phb *phb = slot ? slot->phb : NULL;
+ uint8_t *power_state = (uint8_t *)data;
+ int64_t rc;
+
+ if (!opal_addr_valid(power_state))
+ return OPAL_PARAMETER;
+
+ if (!slot || !phb)
+ return OPAL_PARAMETER;
+ if (!slot->ops.get_power_state)
+ return OPAL_UNSUPPORTED;
+
+ phb_lock(phb);
+ rc = slot->ops.get_power_state(slot, power_state);
+ phb_unlock(phb);
+
+ return rc;
+}
+opal_call(OPAL_PCI_GET_POWER_STATE, opal_pci_get_power_state, 2);
+
+static u32 get_slot_phandle(struct pci_slot *slot)
+{
+ struct phb *phb = slot->phb;
+ struct pci_device *pd = slot->pd;
+
+ if (pd)
+ return pd->dn->phandle;
+ else
+ return phb->dt_node->phandle;
+}
+
+static void rescan_slot_devices(struct pci_slot *slot)
+{
+ struct phb *phb = slot->phb;
+ struct pci_device *pd = slot->pd;
+
+ /*
+ * prepare_link_change() is called (if needed) by the state
+ * machine during the slot reset or link polling
+ */
+ if (phb->phb_type != phb_type_npu_v2_opencapi) {
+ pci_scan_bus(phb, pd->secondary_bus,
+ pd->subordinate_bus, &pd->children, pd, true);
+ pci_add_device_nodes(phb, &pd->children, pd->dn,
+ &phb->lstate, 0);
+ } else {
+ pci_scan_bus(phb, 0, 0xff, &phb->devices, NULL, true);
+ pci_add_device_nodes(phb, &phb->devices,
+ phb->dt_node, &phb->lstate, 0);
+ phb->ops->phb_final_fixup(phb);
+ }
+}
+
+static void remove_slot_devices(struct pci_slot *slot)
+{
+ struct phb *phb = slot->phb;
+ struct pci_device *pd = slot->pd;
+
+ if (phb->phb_type != phb_type_npu_v2_opencapi)
+ pci_remove_bus(phb, &pd->children);
+ else
+ pci_remove_bus(phb, &phb->devices);
+}
+
+static void link_up_timer(struct timer *t, void *data,
+ uint64_t now __unused)
+{
+ struct pci_slot *slot = data;
+ struct phb *phb = slot->phb;
+ uint8_t link;
+ int64_t rc = 0;
+
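+ /* Don't block in timer context if the PHB is busy; retry shortly */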
+ if (!phb_try_lock(phb)) {
+ schedule_timer(&slot->timer, msecs_to_tb(10));
+ return;
+ }
+
+ rc = slot->ops.run_sm(slot);
+ if (rc < 0)
+ goto out;
+ if (rc > 0) {
+ schedule_timer(t, rc);
+ phb_unlock(phb);
+ return;
+ }
+
+ if (slot->ops.get_link_state(slot, &link) != OPAL_SUCCESS)
+ link = 0;
+ if (!link) {
+ rc = OPAL_HARDWARE;
+ goto out;
+ }
+
+ rescan_slot_devices(slot);
+out:
+ opal_queue_msg(OPAL_MSG_ASYNC_COMP, NULL, NULL,
+ cpu_to_be64(slot->async_token),
+ cpu_to_be64(get_slot_phandle(slot)),
+ cpu_to_be64(slot->power_state),
+ rc <= 0 ? cpu_to_be64(rc) : cpu_to_be64(OPAL_BUSY));
+ phb_unlock(phb);
+}
+
+static bool training_needed(struct pci_slot *slot)
+{
+ struct phb *phb = slot->phb;
+ struct pci_device *pd = slot->pd;
+
+ /* only for opencapi slots for now */
+ if (!pd && phb->phb_type == phb_type_npu_v2_opencapi)
+ return true;
+ return false;
+}
+
+static void wait_for_link_up_and_rescan(struct pci_slot *slot)
+{
+ int64_t rc = 1;
+
+ /*
+ * Links for PHB slots need to be retrained by triggering a
+ * fundamental reset. Other slots also need to be tested for
+ * readiness
+ */
+ if (training_needed(slot)) {
+ pci_slot_set_state(slot, PCI_SLOT_STATE_NORMAL);
+ rc = slot->ops.freset(slot);
+ if (rc < 0) {
+ opal_queue_msg(OPAL_MSG_ASYNC_COMP, NULL, NULL,
+ cpu_to_be64(slot->async_token),
+ cpu_to_be64(get_slot_phandle(slot)),
+ cpu_to_be64(slot->power_state),
+ cpu_to_be64(rc));
+ return;
+ }
+ } else {
+ pci_slot_set_state(slot, PCI_SLOT_STATE_LINK_START_POLL);
+ rc = msecs_to_tb(20);
+ }
+ init_timer(&slot->timer, link_up_timer, slot);
+ schedule_timer(&slot->timer, rc);
+}
+
+static void set_power_timer(struct timer *t __unused, void *data,
+ uint64_t now __unused)
+{
+ struct pci_slot *slot = data;
+ struct phb *phb = slot->phb;
+
+ if (!phb_try_lock(phb)) {
+ schedule_timer(&slot->timer, msecs_to_tb(10));
+ return;
+ }
+
+ switch (slot->state) {
+ case PCI_SLOT_STATE_SPOWER_START:
+ if (slot->retries-- == 0) {
+ pci_slot_set_state(slot, PCI_SLOT_STATE_NORMAL);
+ opal_queue_msg(OPAL_MSG_ASYNC_COMP, NULL, NULL,
+ cpu_to_be64(slot->async_token),
+ cpu_to_be64(get_slot_phandle(slot)),
+ cpu_to_be64(slot->power_state),
+ cpu_to_be64(OPAL_BUSY));
+ } else {
+ schedule_timer(&slot->timer, msecs_to_tb(10));
+ }
+
+ break;
+ case PCI_SLOT_STATE_SPOWER_DONE:
+ if (slot->power_state == OPAL_PCI_SLOT_POWER_OFF) {
+ remove_slot_devices(slot);
+ pci_slot_set_state(slot, PCI_SLOT_STATE_NORMAL);
+ opal_queue_msg(OPAL_MSG_ASYNC_COMP, NULL, NULL,
+ cpu_to_be64(slot->async_token),
+ cpu_to_be64(get_slot_phandle(slot)),
+ cpu_to_be64(OPAL_PCI_SLOT_POWER_OFF),
+ cpu_to_be64(OPAL_SUCCESS));
+ break;
+ }
+
+ /* Power on */
+ wait_for_link_up_and_rescan(slot);
+ break;
+ default:
+ prlog(PR_ERR, "PCI SLOT %016llx: Unexpected state 0x%08x\n",
+ slot->id, slot->state);
+ }
+ phb_unlock(phb);
+}
+
+static int64_t opal_pci_set_power_state(uint64_t async_token,
+ uint64_t id,
+ uint64_t data)
+{
+ struct pci_slot *slot = pci_slot_find(id);
+ struct phb *phb = slot ? slot->phb : NULL;
+ struct pci_device *pd = slot ? slot->pd : NULL;
+ uint8_t *state = (uint8_t *)data;
+ int64_t rc;
+
+ if (!slot || !phb)
+ return OPAL_PARAMETER;
+
+ if (!opal_addr_valid(state))
+ return OPAL_PARAMETER;
+
+ phb_lock(phb);
+ switch (*state) {
+ case OPAL_PCI_SLOT_POWER_OFF:
+ if (!slot->ops.prepare_link_change ||
+ !slot->ops.set_power_state) {
+ phb_unlock(phb);
+ return OPAL_UNSUPPORTED;
+ }
+
+ slot->async_token = async_token;
+ slot->ops.prepare_link_change(slot, false);
+ rc = slot->ops.set_power_state(slot, PCI_SLOT_POWER_OFF);
+ break;
+ case OPAL_PCI_SLOT_POWER_ON:
+ if (!slot->ops.set_power_state ||
+ !slot->ops.get_link_state) {
+ phb_unlock(phb);
+ return OPAL_UNSUPPORTED;
+ }
+
+ slot->async_token = async_token;
+ rc = slot->ops.set_power_state(slot, PCI_SLOT_POWER_ON);
+ break;
+ case OPAL_PCI_SLOT_OFFLINE:
+ if (!pd) {
+ phb_unlock(phb);
+ return OPAL_PARAMETER;
+ }
+
+ pci_remove_bus(phb, &pd->children);
+ phb_unlock(phb);
+ return OPAL_SUCCESS;
+ case OPAL_PCI_SLOT_ONLINE:
+ if (!pd) {
+ phb_unlock(phb);
+ return OPAL_PARAMETER;
+ }
+ pci_scan_bus(phb, pd->secondary_bus, pd->subordinate_bus,
+ &pd->children, pd, true);
+ pci_add_device_nodes(phb, &pd->children, pd->dn,
+ &phb->lstate, 0);
+ phb_unlock(phb);
+ return OPAL_SUCCESS;
+ default:
+ rc = OPAL_PARAMETER;
+ }
+
+ /*
+ * OPAL_ASYNC_COMPLETION is returned when delay is needed to change
+ * the power state in the backend. When it can be finished without
+ * delay, OPAL_SUCCESS is returned. The PCI topology needs to be
+ * updated in both cases.
+ */
+ if (rc == OPAL_ASYNC_COMPLETION) {
+ slot->retries = 500;
+ init_timer(&slot->timer, set_power_timer, slot);
+ schedule_timer(&slot->timer, msecs_to_tb(10));
+ } else if (rc == OPAL_SUCCESS) {
+ if (*state == OPAL_PCI_SLOT_POWER_OFF) {
+ remove_slot_devices(slot);
+ } else {
+ wait_for_link_up_and_rescan(slot);
+ rc = OPAL_ASYNC_COMPLETION;
+ }
+ }
+
+ phb_unlock(phb);
+ return rc;
+}
+opal_call(OPAL_PCI_SET_POWER_STATE, opal_pci_set_power_state, 3);
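+
+/*
+ * For reference: when the request completes asynchronously, the
+ * OPAL_MSG_ASYNC_COMP message queued by the timer handlers above carries,
+ * in order, the async token passed in here, the phandle of the slot (or
+ * PHB) node, the resulting power state and the final return code.
+ */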
+
+static int64_t opal_pci_get_phb_diag_data2(uint64_t phb_id,
+ void *diag_buffer,
+ uint64_t diag_buffer_len)
+{
+ struct phb *phb = pci_get_phb(phb_id);
+ int64_t rc;
+
+ if (!opal_addr_valid(diag_buffer))
+ return OPAL_PARAMETER;
+
+ if (!phb)
+ return OPAL_PARAMETER;
+ if (!phb->ops->get_diag_data2)
+ return OPAL_UNSUPPORTED;
+ phb_lock(phb);
+ rc = phb->ops->get_diag_data2(phb, diag_buffer, diag_buffer_len);
+ phb_unlock(phb);
+
+ return rc;
+}
+opal_call(OPAL_PCI_GET_PHB_DIAG_DATA2, opal_pci_get_phb_diag_data2, 3);
+
+static int64_t opal_pci_next_error(uint64_t phb_id, __be64 *__first_frozen_pe,
+ __be16 *__pci_error_type, __be16 *__severity)
+{
+ struct phb *phb = pci_get_phb(phb_id);
+ uint64_t first_frozen_pe;
+ uint16_t pci_error_type;
+ uint16_t severity;
+ int64_t rc;
+
+ if (!opal_addr_valid(__first_frozen_pe) ||
+ !opal_addr_valid(__pci_error_type) || !opal_addr_valid(__severity))
+ return OPAL_PARAMETER;
+
+ if (!phb)
+ return OPAL_PARAMETER;
+ if (!phb->ops->next_error)
+ return OPAL_UNSUPPORTED;
+ phb_lock(phb);
+
+ opal_pci_eeh_clear_evt(phb_id);
+ rc = phb->ops->next_error(phb, &first_frozen_pe, &pci_error_type,
+ &severity);
+ phb_unlock(phb);
+
+ *__first_frozen_pe = cpu_to_be64(first_frozen_pe);
+ *__pci_error_type = cpu_to_be16(pci_error_type);
+ *__severity = cpu_to_be16(severity);
+
+ return rc;
+}
+opal_call(OPAL_PCI_NEXT_ERROR, opal_pci_next_error, 4);
+
+static int64_t opal_pci_set_phb_capi_mode(uint64_t phb_id, uint64_t mode, uint64_t pe_number)
+{
+ struct phb *phb = pci_get_phb(phb_id);
+ int64_t rc;
+
+ if (!phb)
+ return OPAL_PARAMETER;
+ if (!phb->ops->set_capi_mode)
+ return OPAL_UNSUPPORTED;
+
+ phb_lock(phb);
+ rc = phb->ops->set_capi_mode(phb, mode, pe_number);
+ phb_unlock(phb);
+ return rc;
+}
+opal_call(OPAL_PCI_SET_PHB_CAPI_MODE, opal_pci_set_phb_capi_mode, 3);
+
+static int64_t opal_pci_set_p2p(uint64_t phbid_init, uint64_t phbid_target,
+ uint64_t desc, uint16_t pe_number)
+{
+ struct phb *phb_init = pci_get_phb(phbid_init);
+ struct phb *phb_target = pci_get_phb(phbid_target);
+
+ if (!phb_init || !phb_target)
+ return OPAL_PARAMETER;
+ /*
+ * Having the 2 devices under the same PHB may require tuning
+ * the configuration of intermediate switch(es), more easily
+ * done from linux. And it shouldn't require a PHB config
+ * change.
+ * Return an error for the time being.
+ */
+ if (phb_init == phb_target)
+ return OPAL_UNSUPPORTED;
+ if (!phb_init->ops->set_p2p || !phb_target->ops->set_p2p)
+ return OPAL_UNSUPPORTED;
+ /*
+ * Loads would be supported on p9 if the 2 devices are under
+ * the same PHB, but we ruled it out above.
+ */
+ if (desc & OPAL_PCI_P2P_LOAD)
+ return OPAL_UNSUPPORTED;
+
+ phb_lock(phb_init);
+ phb_init->ops->set_p2p(phb_init, OPAL_PCI_P2P_INITIATOR, desc,
+ pe_number);
+ phb_unlock(phb_init);
+
+ phb_lock(phb_target);
+ phb_target->ops->set_p2p(phb_target, OPAL_PCI_P2P_TARGET, desc,
+ pe_number);
+ phb_unlock(phb_target);
+ return OPAL_SUCCESS;
+}
+opal_call(OPAL_PCI_SET_P2P, opal_pci_set_p2p, 4);
+
+static int64_t opal_pci_get_pbcq_tunnel_bar(uint64_t phb_id, __be64 *__addr)
+{
+ struct phb *phb = pci_get_phb(phb_id);
+ uint64_t addr;
+
+ if (!opal_addr_valid(__addr))
+ return OPAL_PARAMETER;
+
+ if (!phb)
+ return OPAL_PARAMETER;
+ if (!phb->ops->get_tunnel_bar)
+ return OPAL_UNSUPPORTED;
+
+ phb_lock(phb);
+ phb->ops->get_tunnel_bar(phb, &addr);
+ phb_unlock(phb);
+
+ *__addr = cpu_to_be64(addr);
+
+ return OPAL_SUCCESS;
+}
+opal_call(OPAL_PCI_GET_PBCQ_TUNNEL_BAR, opal_pci_get_pbcq_tunnel_bar, 2);
+
+static int64_t opal_pci_set_pbcq_tunnel_bar(uint64_t phb_id, uint64_t addr)
+{
+ struct phb *phb = pci_get_phb(phb_id);
+ int64_t rc;
+
+ if (!phb)
+ return OPAL_PARAMETER;
+ if (!phb->ops->set_tunnel_bar)
+ return OPAL_UNSUPPORTED;
+
+ phb_lock(phb);
+ rc = phb->ops->set_tunnel_bar(phb, addr);
+ phb_unlock(phb);
+ return rc;
+}
+opal_call(OPAL_PCI_SET_PBCQ_TUNNEL_BAR, opal_pci_set_pbcq_tunnel_bar, 2);
diff --git a/roms/skiboot/core/pci-quirk.c b/roms/skiboot/core/pci-quirk.c
new file mode 100644
index 000000000..5c8b091ea
--- /dev/null
+++ b/roms/skiboot/core/pci-quirk.c
@@ -0,0 +1,135 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Deal with PCI device quirks
+ *
+ * Copyright 2017-2018 IBM Corp.
+ */
+
+#define pr_fmt(fmt) "PCI-QUIRK: " fmt
+
+#include <skiboot.h>
+#include <pci.h>
+#include <pci-cfg.h>
+#include <pci-quirk.h>
+#include <platform.h>
+#include <ast.h>
+
+static int64_t cfg_block_filter(void *dev __unused,
+ struct pci_cfg_reg_filter *pcrf __unused,
+ uint32_t offset __unused, uint32_t len,
+ uint32_t *data, bool write)
+{
+ if (write)
+ return OPAL_SUCCESS;
+
+ switch (len) {
+ case 4:
+ *data = 0x0;
+ return OPAL_SUCCESS;
+ case 2:
+ *((uint16_t *)data) = 0x0;
+ return OPAL_SUCCESS;
+ case 1:
+ *((uint8_t *)data) = 0x0;
+ return OPAL_SUCCESS;
+ }
+
+ return OPAL_PARAMETER; /* should never happen */
+}
+
+/* blocks config accesses to registers in the range: [start, end] */
+#define BLOCK_CFG_RANGE(pd, start, end) \
+ pci_add_cfg_reg_filter(pd, start, end - start + 1, \
+ PCI_REG_FLAG_WRITE | PCI_REG_FLAG_READ, \
+ cfg_block_filter);
+
+static void quirk_microsemi_gen4_sw(struct phb *phb, struct pci_device *pd)
+{
+ uint8_t data;
+ bool frozen;
+ int offset;
+ int start;
+
+ pci_check_clear_freeze(phb);
+
+ /*
+ * Reading from 0xff should trigger a UR on the affected switches.
+ * If we don't get a freeze then we don't need the workaround
+ */
+ pci_cfg_read8(phb, pd->bdfn, 0xff, &data);
+ frozen = pci_check_clear_freeze(phb);
+ if (!frozen)
+ return;
+
+ for (start = -1, offset = 0; offset < 4096; offset++) {
+ pci_cfg_read8(phb, pd->bdfn, offset, &data);
+ frozen = pci_check_clear_freeze(phb);
+
+ if (start < 0 && frozen) { /* new UR range */
+ start = offset;
+ } else if (start >= 0 && !frozen) { /* end of range */
+ BLOCK_CFG_RANGE(pd, start, offset - 1);
+ PCINOTICE(phb, pd->bdfn, "Applied UR workaround to [%03x..%03x]\n", start, offset - 1);
+
+ start = -1;
+ }
+ }
+
+ /* range lasted until the end of config space */
+ if (start >= 0) {
+ BLOCK_CFG_RANGE(pd, start, 0xfff);
+ PCINOTICE(phb, pd->bdfn, "Applied UR workaround to [%03x..fff]\n", start);
+ }
+}
+
+static void quirk_astbmc_vga(struct phb *phb __unused,
+ struct pci_device *pd)
+{
+ struct dt_node *np = pd->dn;
+ uint32_t revision, mcr_configuration, mcr_scu_mpll, mcr_scu_strap;
+
+ if (ast_sio_is_enabled()) {
+ revision = ast_ahb_readl(SCU_REVISION_ID);
+ mcr_configuration = ast_ahb_readl(MCR_CONFIGURATION);
+ mcr_scu_mpll = ast_ahb_readl(MCR_SCU_MPLL);
+ mcr_scu_strap = ast_ahb_readl(MCR_SCU_STRAP);
+ } else {
+ /* Previously we would warn, now SIO is disabled by design */
+ prlog(PR_INFO, "Assumed platform default parameters for %s\n",
+ __func__);
+ revision = bmc_platform->hw->scu_revision_id;
+ mcr_configuration = bmc_platform->hw->mcr_configuration;
+ mcr_scu_mpll = bmc_platform->hw->mcr_scu_mpll;
+ mcr_scu_strap = bmc_platform->hw->mcr_scu_strap;
+ }
+
+ dt_add_property_cells(np, "aspeed,scu-revision-id", revision);
+ dt_add_property_cells(np, "aspeed,mcr-configuration", mcr_configuration);
+ dt_add_property_cells(np, "aspeed,mcr-scu-mpll", mcr_scu_mpll);
+ dt_add_property_cells(np, "aspeed,mcr-scu-strap", mcr_scu_strap);
+}
+
+/* Quirks are: {vendor ID, device ID (or PCI_ANY_ID), fixup function} */
+static const struct pci_quirk quirk_table[] = {
+ /* ASPEED 2400 VGA device */
+ { 0x1a03, 0x2000, &quirk_astbmc_vga },
+ { 0x11f8, 0x4052, &quirk_microsemi_gen4_sw },
+ { 0, 0, NULL }
+};
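+
+/*
+ * Adding a quirk is a matter of appending an entry above the NULL
+ * terminator. A hypothetical sketch (the vendor ID and fixup below are
+ * made up):
+ *
+ *     static void quirk_example(struct phb *phb, struct pci_device *pd)
+ *     {
+ *             PCINOTICE(phb, pd->bdfn, "Applying example quirk\n");
+ *     }
+ *
+ *     { 0xabcd, PCI_ANY_ID, &quirk_example }, // any device of vendor 0xabcd
+ */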
+
+static void __pci_handle_quirk(struct phb *phb, struct pci_device *pd,
+ const struct pci_quirk *quirks)
+{
+ while (quirks->vendor_id) {
+ if (quirks->vendor_id == PCI_VENDOR_ID(pd->vdid) &&
+ (quirks->device_id == PCI_ANY_ID ||
+ quirks->device_id == PCI_DEVICE_ID(pd->vdid)))
+ quirks->fixup(phb, pd);
+ quirks++;
+ }
+}
+
+void pci_handle_quirk(struct phb *phb, struct pci_device *pd)
+{
+ __pci_handle_quirk(phb, pd, quirk_table);
+}
diff --git a/roms/skiboot/core/pci-slot.c b/roms/skiboot/core/pci-slot.c
new file mode 100644
index 000000000..71d3d329c
--- /dev/null
+++ b/roms/skiboot/core/pci-slot.c
@@ -0,0 +1,241 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * PCI Slots
+ *
+ * Copyright 2013-2019 IBM Corp.
+ */
+
+#include <skiboot.h>
+#include <opal-msg.h>
+#include <pci-cfg.h>
+#include <pci.h>
+#include <pci-slot.h>
+
+/* Debugging options */
+#define PCI_SLOT_PREFIX "PCI-SLOT-%016llx "
+#define PCI_SLOT_DBG(s, fmt, a...) \
+ prlog(PR_DEBUG, PCI_SLOT_PREFIX fmt, (s)->id, ##a)
+
+static void pci_slot_prepare_link_change(struct pci_slot *slot, bool up)
+{
+ struct phb *phb = slot->phb;
+ struct pci_device *pd = slot->pd;
+ uint32_t aercap, mask;
+
+ /*
+ * Mask the link down and receiver error before the link becomes
+ * down. Otherwise, unmask the errors when the link is up.
+ */
+ if (pci_has_cap(pd, PCIECAP_ID_AER, true)) {
+ aercap = pci_cap(pd, PCIECAP_ID_AER, true);
+
+ /* Mask link surprise down event. The event is always
+ * masked when the associated PCI slot supports PCI
+ * surprise hotplug. We needn't toggle it when the link
+ * bounces because of a reset; just keep it always masked.
+ */
+ if (!pd->slot || !pd->slot->surprise_pluggable) {
+ pci_cfg_read32(phb, pd->bdfn,
+ aercap + PCIECAP_AER_UE_MASK, &mask);
+ if (up)
+ mask &= ~PCIECAP_AER_UE_MASK_SURPRISE_DOWN;
+ else
+ mask |= PCIECAP_AER_UE_MASK_SURPRISE_DOWN;
+ pci_cfg_write32(phb, pd->bdfn,
+ aercap + PCIECAP_AER_UE_MASK, mask);
+ }
+
+ /* Receiver error */
+ pci_cfg_read32(phb, pd->bdfn, aercap + PCIECAP_AER_CE_MASK,
+ &mask);
+ if (up)
+ mask &= ~PCIECAP_AER_CE_RECVR_ERR;
+ else
+ mask |= PCIECAP_AER_CE_RECVR_ERR;
+ pci_cfg_write32(phb, pd->bdfn, aercap + PCIECAP_AER_CE_MASK,
+ mask);
+ }
+
+ /*
+ * We're coming back from reset. We need to restore bus ranges
+ * and reinitialize the affected bridges and devices.
+ */
+ if (up) {
+ pci_restore_bridge_buses(phb, pd);
+ if (phb->ops->device_init)
+ pci_walk_dev(phb, pd, phb->ops->device_init, NULL);
+ }
+}
+
+static int64_t pci_slot_run_sm(struct pci_slot *slot)
+{
+ uint64_t now = mftb();
+ int64_t ret;
+
+ /* Return remaining timeout if we're still waiting */
+ if (slot->delay_tgt_tb &&
+ tb_compare(now, slot->delay_tgt_tb) == TB_ABEFOREB)
+ return slot->delay_tgt_tb - now;
+
+ slot->delay_tgt_tb = 0;
+ switch (slot->state & PCI_SLOT_STATE_MASK) {
+ case PCI_SLOT_STATE_LINK:
+ ret = slot->ops.poll_link(slot);
+ break;
+ case PCI_SLOT_STATE_HRESET:
+ ret = slot->ops.hreset(slot);
+ break;
+ case PCI_SLOT_STATE_FRESET:
+ ret = slot->ops.freset(slot);
+ break;
+ case PCI_SLOT_STATE_CRESET:
+ ret = slot->ops.creset(slot);
+ break;
+ default:
+ prlog(PR_ERR, PCI_SLOT_PREFIX
+ "Invalid state %08x\n", slot->id, slot->state);
+ pci_slot_set_state(slot, PCI_SLOT_STATE_NORMAL);
+ ret = OPAL_HARDWARE;
+ }
+
+ /* Notify about the pci slot state machine completion */
+ if (ret <= 0 && slot->ops.completed_sm_run)
+ slot->ops.completed_sm_run(slot, ret);
+
+ return ret;
+}
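+
+/*
+ * Return value contract for run_sm(): a positive value means "call again
+ * after this many timebase ticks", zero means the state machine has
+ * completed, and a negative value is an OPAL error code. For example,
+ * opal_pci_poll() in core/pci-opal.c converts the tick count into
+ * milliseconds for the host:
+ *
+ *     rc = slot->ops.run_sm(slot);
+ *     if (rc > 0) {
+ *             rc = tb_to_msecs(rc);   // round up to at least 1ms
+ *             if (rc == 0)
+ *                     rc = 1;
+ *     }
+ */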
+
+void pci_slot_add_dt_properties(struct pci_slot *slot,
+ struct dt_node *np)
+{
+ /* Bail without device node */
+ if (!np)
+ return;
+
+ dt_add_property_cells(np, "ibm,reset-by-firmware", 1);
+ dt_add_property_cells(np, "ibm,slot-pluggable", slot->pluggable);
+ dt_add_property_cells(np, "ibm,slot-surprise-pluggable",
+ slot->surprise_pluggable);
+ if (pci_slot_has_flags(slot, PCI_SLOT_FLAG_BROKEN_PDC))
+ dt_add_property_cells(np, "ibm,slot-broken-pdc", 1);
+
+ dt_add_property_cells(np, "ibm,slot-power-ctl", slot->power_ctl);
+ dt_add_property_cells(np, "ibm,slot-power-led-ctlled",
+ slot->power_led_ctl);
+ dt_add_property_cells(np, "ibm,slot-attn-led", slot->attn_led_ctl);
+ dt_add_property_cells(np, "ibm,slot-connector-type",
+ slot->connector_type);
+ dt_add_property_cells(np, "ibm,slot-card-desc", slot->card_desc);
+ dt_add_property_cells(np, "ibm,slot-card-mech", slot->card_mech);
+ dt_add_property_cells(np, "ibm,slot-wired-lanes", slot->wired_lanes);
+ dt_add_property_cells(np, "ibm,power-limit", slot->power_limit);
+
+ if (slot->ops.add_properties)
+ slot->ops.add_properties(slot, np);
+}
+
+struct pci_slot *pci_slot_alloc(struct phb *phb,
+ struct pci_device *pd)
+{
+ struct pci_slot *slot = NULL;
+
+ /*
+ * The function can be used to allocate either a PHB slot or a normal
+ * one. In both cases, @phb must be valid.
+ */
+ if (!phb)
+ return NULL;
+
+ /*
+ * When @pd is NULL, we're going to create a PHB slot. Otherwise,
+ * a normal slot will be created. Check if the specified slot
+ * already exists or not.
+ */
+ slot = pd ? pd->slot : phb->slot;
+ if (slot) {
+ prlog(PR_ERR, PCI_SLOT_PREFIX "Already exists\n", slot->id);
+ return slot;
+ }
+
+ /* Allocate memory chunk */
+ slot = zalloc(sizeof(struct pci_slot));
+ if (!slot) {
+ prlog(PR_ERR, "%s: Out of memory\n", __func__);
+ return NULL;
+ }
+
+ /*
+ * The polling function shouldn't be overridden by individual
+ * platforms
+ */
+ slot->phb = phb;
+ slot->pd = pd;
+ pci_slot_set_state(slot, PCI_SLOT_STATE_NORMAL);
+ slot->power_state = PCI_SLOT_POWER_ON;
+ slot->ops.run_sm = pci_slot_run_sm;
+ slot->ops.prepare_link_change = pci_slot_prepare_link_change;
+ slot->peer_slot = NULL;
+ if (!pd) {
+ slot->id = PCI_PHB_SLOT_ID(phb);
+ phb->slot = slot;
+ } else {
+ slot->id = PCI_SLOT_ID(phb, pd->bdfn);
+ pd->slot = slot;
+ }
+
+ return slot;
+}
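+
+/*
+ * Illustrative sketch of the usual caller pattern: a PHB backend
+ * allocates the slot and then fills in whichever reset/power hooks it
+ * implements (my_freset and friends below are hypothetical):
+ *
+ *     struct pci_slot *slot = pci_slot_alloc(phb, pd);
+ *
+ *     if (slot) {
+ *             slot->ops.freset = my_freset;
+ *             slot->ops.hreset = my_hreset;
+ *             slot->ops.get_link_state = my_get_link_state;
+ *     }
+ */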
+
+struct pci_slot *pci_slot_find(uint64_t id)
+{
+ struct phb *phb;
+ struct pci_device *pd;
+ struct pci_slot *slot;
+ uint64_t index;
+ uint16_t bdfn;
+
+ index = PCI_SLOT_PHB_INDEX(id);
+ phb = pci_get_phb(index);
+
+ /* PHB slot */
+ if (!(id & PCI_SLOT_ID_PREFIX)) {
+ slot = phb ? phb->slot : NULL;
+ return slot;
+ }
+
+ /* Normal PCI slot */
+ bdfn = PCI_SLOT_BDFN(id);
+ pd = phb ? pci_find_dev(phb, bdfn) : NULL;
+ slot = pd ? pd->slot : NULL;
+ return slot;
+}
+
+void pci_slot_add_loc(struct pci_slot *slot,
+ struct dt_node *np, const char *label)
+{
+ char tmp[8], loc_code[LOC_CODE_SIZE];
+ struct pci_device *pd = slot->pd;
+ struct phb *phb = slot->phb;
+
+ if (!np)
+ return;
+
+ /* didn't get a real slot label? generate one! */
+ if (!label) {
+ snprintf(tmp, sizeof(tmp), "S%04x%02x", phb->opal_id,
+ pd->secondary_bus);
+ label = tmp;
+ }
+
+ /* Make a <PHB_LOC_CODE>-<LABEL> pair if we have a PHB loc code */
+ if (phb->base_loc_code) {
+ snprintf(loc_code, sizeof(loc_code), "%s-%s",
+ phb->base_loc_code, label);
+ } else {
+ strncpy(loc_code, label, sizeof(loc_code) - 1);
+ loc_code[LOC_CODE_SIZE - 1] = '\0';
+ }
+
+ dt_add_property_string(np, "ibm,slot-label", label);
+ dt_add_property_string(np, "ibm,slot-location-code", loc_code);
+}
diff --git a/roms/skiboot/core/pci-virt.c b/roms/skiboot/core/pci-virt.c
new file mode 100644
index 000000000..e0cb9949c
--- /dev/null
+++ b/roms/skiboot/core/pci-virt.c
@@ -0,0 +1,256 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Support virtual PCI devices
+ *
+ * Copyright 2013-2016 IBM Corp.
+ */
+
+#include <skiboot.h>
+#include <pci.h>
+#include <pci-virt.h>
+
+void pci_virt_cfg_read_raw(struct pci_virt_device *pvd,
+ uint32_t space, uint32_t offset,
+ uint32_t size, uint32_t *data)
+{
+ uint32_t i;
+
+ if (space >= PCI_VIRT_CFG_MAX || !pvd->config[space])
+ return;
+
+ for (*data = 0, i = 0; i < size; i++)
+ *data |= ((uint32_t)(pvd->config[space][offset + i]) << (i * 8));
+}
+
+void pci_virt_cfg_write_raw(struct pci_virt_device *pvd,
+ uint32_t space, uint32_t offset,
+ uint32_t size, uint32_t data)
+{
+ int i;
+
+ if (space >= PCI_VIRT_CFG_MAX || !pvd->config[space])
+ return;
+
+ for (i = 0; i < size; i++) {
+ pvd->config[space][offset + i] = data;
+ data = (data >> 8);
+ }
+}
+
+static struct pci_cfg_reg_filter *pci_virt_find_filter(
+ struct pci_virt_device *pvd,
+ uint32_t start, uint32_t len)
+{
+ struct pci_cfg_reg_filter *pcrf;
+
+ if (!pvd || !len || start >= pvd->cfg_size)
+ return NULL;
+
+ /* Return the filter if there is an overlapping region. We don't
+ * require strict matching for more flexibility. It also
+ * means the associated handler should validate the register
+ * offset and length.
+ */
+ list_for_each(&pvd->pcrf, pcrf, link) {
+ if (start < (pcrf->start + pcrf->len) &&
+ (start + len) > pcrf->start)
+ return pcrf;
+ }
+
+ return NULL;
+}
+
+struct pci_cfg_reg_filter *pci_virt_add_filter(struct pci_virt_device *pvd,
+ uint32_t start,
+ uint32_t len,
+ uint32_t flags,
+ pci_cfg_reg_func func,
+ void *data)
+{
+ struct pci_cfg_reg_filter *pcrf;
+
+ if (!pvd || !len || (start + len) >= pvd->cfg_size)
+ return NULL;
+ if (!(flags & PCI_REG_FLAG_MASK))
+ return NULL;
+
+ pcrf = pci_virt_find_filter(pvd, start, len);
+ if (pcrf) {
+ prlog(PR_ERR, "%s: Filter [%x, %x] overlapped with [%x, %x]\n",
+ __func__, start, len, pcrf->start, pcrf->len);
+ return NULL;
+ }
+
+ pcrf = zalloc(sizeof(*pcrf));
+ if (!pcrf) {
+ prlog(PR_ERR, "%s: Out of memory!\n", __func__);
+ return NULL;
+ }
+
+ pcrf->start = start;
+ pcrf->len = len;
+ pcrf->flags = flags;
+ pcrf->func = func;
+ pcrf->data = data;
+ list_add_tail(&pvd->pcrf, &pcrf->link);
+
+ return pcrf;
+}
+
+struct pci_virt_device *pci_virt_find_device(struct phb *phb,
+ uint32_t bdfn)
+{
+ struct pci_virt_device *pvd;
+
+ list_for_each(&phb->virt_devices, pvd, node) {
+ if (pvd->bdfn == bdfn)
+ return pvd;
+ }
+
+ return NULL;
+}
+
+static inline bool pci_virt_cfg_valid(struct pci_virt_device *pvd,
+ uint32_t offset, uint32_t size)
+{
+ if ((offset + size) > pvd->cfg_size)
+ return false;
+
+ if (!size || (size > 4))
+ return false;
+
+ if ((size & (size - 1)) || (offset & (size - 1)))
+ return false;
+
+ return true;
+}
+
+int64_t pci_virt_cfg_read(struct phb *phb, uint32_t bdfn,
+ uint32_t offset, uint32_t size,
+ uint32_t *data)
+{
+ struct pci_virt_device *pvd;
+ struct pci_cfg_reg_filter *pcrf;
+ int64_t ret = OPAL_SUCCESS;
+
+ *data = 0xffffffff;
+
+ /* Search for PCI virtual device */
+ pvd = pci_virt_find_device(phb, bdfn);
+ if (!pvd)
+ return OPAL_PARAMETER;
+
+ /* Check if config address is valid or not */
+ if (!pci_virt_cfg_valid(pvd, offset, size))
+ return OPAL_PARAMETER;
+
+ /* The value is fetched from the normal config space when the
+ * trap handler returns OPAL_PARTIAL. Otherwise, the trap handler
+ * should provide the return value.
+ */
+ pcrf = pci_virt_find_filter(pvd, offset, size);
+ if (!pcrf || !pcrf->func || !(pcrf->flags & PCI_REG_FLAG_READ))
+ goto out;
+
+ ret = pcrf->func(pvd, pcrf, offset, size, data, false);
+ if (ret != OPAL_PARTIAL)
+ return ret;
+out:
+ pci_virt_cfg_read_raw(pvd, PCI_VIRT_CFG_NORMAL, offset, size, data);
+ return OPAL_SUCCESS;
+}
+
+int64_t pci_virt_cfg_write(struct phb *phb, uint32_t bdfn,
+ uint32_t offset, uint32_t size,
+ uint32_t data)
+{
+ struct pci_virt_device *pvd;
+ struct pci_cfg_reg_filter *pcrf;
+ uint32_t val, v, r, c, i;
+ int64_t ret = OPAL_SUCCESS;
+
+ /* Search for PCI virtual device */
+ pvd = pci_virt_find_device(phb, bdfn);
+ if (!pvd)
+ return OPAL_PARAMETER;
+
+ /* Check if config address is valid or not */
+ if (!pci_virt_cfg_valid(pvd, offset, size))
+ return OPAL_PARAMETER;
+
+ /* The value is written to the config space if the trap handler
+ * returns OPAL_PARTIAL. Otherwise, the value to be written is
+ * dropped.
+ */
+ pcrf = pci_virt_find_filter(pvd, offset, size);
+ if (!pcrf || !pcrf->func || !(pcrf->flags & PCI_REG_FLAG_WRITE))
+ goto out;
+
+ ret = pcrf->func(pvd, pcrf, offset, size, &data, true);
+ if (ret != OPAL_PARTIAL)
+ return ret;
+out:
+ val = data;
+ for (i = 0; i < size; i++) {
+ PCI_VIRT_CFG_NORMAL_RD(pvd, offset + i, 1, &v);
+ PCI_VIRT_CFG_RDONLY_RD(pvd, offset + i, 1, &r);
+ PCI_VIRT_CFG_W1CLR_RD(pvd, offset + i, 1, &c);
+
+ /* Drop read-only bits */
+ val &= ~(r << (i * 8));
+ val |= (r & v) << (i * 8);
+
+ /* Drop W1C bits */
+ val &= ~(val & ((c & v) << (i * 8)));
+ }
+
+ PCI_VIRT_CFG_NORMAL_WR(pvd, offset, size, val);
+ return OPAL_SUCCESS;
+}
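+
+/*
+ * Illustrative sketch of how a backend hooks a register of a virtual
+ * device (the 0x40 offset and the handler below are made up):
+ *
+ *     static int64_t my_reg_filter(void *dev, struct pci_cfg_reg_filter *pcrf,
+ *                                  uint32_t offset, uint32_t len,
+ *                                  uint32_t *data, bool write)
+ *     {
+ *             if (write)
+ *                     return OPAL_SUCCESS;    // swallow the write
+ *             return OPAL_PARTIAL;            // read falls back to config space
+ *     }
+ *
+ *     pci_virt_add_filter(pvd, 0x40, 4,
+ *                         PCI_REG_FLAG_READ | PCI_REG_FLAG_WRITE,
+ *                         my_reg_filter, NULL);
+ */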
+
+struct pci_virt_device *pci_virt_add_device(struct phb *phb, uint32_t bdfn,
+ uint32_t cfg_size, void *data)
+{
+ struct pci_virt_device *pvd;
+ uint8_t *cfg;
+ uint32_t i;
+
+ /* The standard config header size is 64 bytes */
+ if (!phb || (bdfn & 0xffff0000) || (cfg_size < 64))
+ return NULL;
+
+ /* Check if the bdfn is available */
+ pvd = pci_virt_find_device(phb, bdfn);
+ if (pvd) {
+ prlog(PR_ERR, "%s: bdfn 0x%x was reserved\n",
+ __func__, bdfn);
+ return NULL;
+ }
+
+ /* Populate the PCI virtual device */
+ pvd = zalloc(sizeof(*pvd));
+ if (!pvd) {
+ prlog(PR_ERR, "%s: Cannot alloate PCI virtual device (0x%x)\n",
+ __func__, bdfn);
+ return NULL;
+ }
+
+ cfg = zalloc(cfg_size * PCI_VIRT_CFG_MAX);
+ if (!cfg) {
+ prlog(PR_ERR, "%s: Cannot allocate config space (0x%x)\n",
+ __func__, bdfn);
+ free(pvd);
+ return NULL;
+ }
+
+ for (i = 0; i < PCI_VIRT_CFG_MAX; i++, cfg += cfg_size)
+ pvd->config[i] = cfg;
+
+ pvd->bdfn = bdfn;
+ pvd->cfg_size = cfg_size;
+ pvd->data = data;
+ list_head_init(&pvd->pcrf);
+ list_add_tail(&phb->virt_devices, &pvd->node);
+
+ return pvd;
+}
diff --git a/roms/skiboot/core/pci.c b/roms/skiboot/core/pci.c
new file mode 100644
index 000000000..e195ecbf4
--- /dev/null
+++ b/roms/skiboot/core/pci.c
@@ -0,0 +1,1962 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Base PCI support
+ *
+ * Copyright 2013-2019 IBM Corp.
+ */
+
+#include <skiboot.h>
+#include <cpu.h>
+#include <pci.h>
+#include <pci-cfg.h>
+#include <pci-slot.h>
+#include <pci-quirk.h>
+#include <timebase.h>
+#include <device.h>
+
+#define MAX_PHB_ID 256
+static struct phb *phbs[MAX_PHB_ID];
+int last_phb_id = 0;
+
+/*
+ * Generic PCI utilities
+ */
+
+static int64_t __pci_find_cap(struct phb *phb, uint16_t bdfn,
+ uint8_t want, bool check_cap_indicator)
+{
+ int64_t rc;
+ uint16_t stat, cap;
+ uint8_t pos, next;
+
+ rc = pci_cfg_read16(phb, bdfn, PCI_CFG_STAT, &stat);
+ if (rc)
+ return rc;
+ if (check_cap_indicator && !(stat & PCI_CFG_STAT_CAP))
+ return OPAL_UNSUPPORTED;
+ rc = pci_cfg_read8(phb, bdfn, PCI_CFG_CAP, &pos);
+ if (rc)
+ return rc;
+ pos &= 0xfc;
+ while(pos) {
+ rc = pci_cfg_read16(phb, bdfn, pos, &cap);
+ if (rc)
+ return rc;
+ if ((cap & 0xff) == want)
+ return pos;
+ next = (cap >> 8) & 0xfc;
+ if (next == pos) {
+ PCIERR(phb, bdfn, "pci_find_cap hit a loop !\n");
+ break;
+ }
+ pos = next;
+ }
+ return OPAL_UNSUPPORTED;
+}
+
+/* pci_find_cap - Find a PCI capability in a device config space
+ *
+ * This will return a config space offset (positive) or a negative
+ * error (OPAL error codes).
+ *
+ * OPAL_UNSUPPORTED is returned if the capability doesn't exist
+ */
+int64_t pci_find_cap(struct phb *phb, uint16_t bdfn, uint8_t want)
+{
+ return __pci_find_cap(phb, bdfn, want, true);
+}
+
+/* pci_find_ecap - Find a PCIe extended capability in a device
+ * config space
+ *
+ * This will return a config space offset (positive) or a negative
+ * error (OPAL error code). Additionally, if the "version" argument
+ * is non-NULL, the capability version will be returned there.
+ *
+ * OPAL_UNSUPPORTED is returned if the capability doesn't exist
+ */
+int64_t pci_find_ecap(struct phb *phb, uint16_t bdfn, uint16_t want,
+ uint8_t *version)
+{
+ int64_t rc;
+ uint32_t cap;
+ uint16_t off, prev = 0;
+
+ for (off = 0x100; off && off < 0x1000; off = (cap >> 20) & 0xffc ) {
+ if (off == prev) {
+ PCIERR(phb, bdfn, "pci_find_ecap hit a loop !\n");
+ break;
+ }
+ prev = off;
+ rc = pci_cfg_read32(phb, bdfn, off, &cap);
+ if (rc)
+ return rc;
+
+ /* no ecaps supported */
+ if (cap == 0 || (cap & 0xffff) == 0xffff)
+ return OPAL_UNSUPPORTED;
+
+ if ((cap & 0xffff) == want) {
+ if (version)
+ *version = (cap >> 16) & 0xf;
+ return off;
+ }
+ }
+ return OPAL_UNSUPPORTED;
+}
+
+static void pci_init_pcie_cap(struct phb *phb, struct pci_device *pd)
+{
+ int64_t ecap = 0;
+ uint16_t reg;
+ uint32_t val;
+
+ /* On the upstream port of PLX bridge 8724 (rev ba), PCI_STATUS
+ * register doesn't have the capability indicator even though it
+ * supports various PCI capabilities. So we need to ignore that bit
+ * when looking for PCI capabilities on the upstream port, which is
+ * limited to the one that sits directly under the root port.
+ */
+ if (pd->vdid == 0x872410b5 && pd->parent && !pd->parent->parent) {
+ uint8_t rev;
+
+ pci_cfg_read8(phb, pd->bdfn, PCI_CFG_REV_ID, &rev);
+ if (rev == 0xba)
+ ecap = __pci_find_cap(phb, pd->bdfn,
+ PCI_CFG_CAP_ID_EXP, false);
+ else
+ ecap = pci_find_cap(phb, pd->bdfn, PCI_CFG_CAP_ID_EXP);
+ } else {
+ ecap = pci_find_cap(phb, pd->bdfn, PCI_CFG_CAP_ID_EXP);
+ }
+
+ if (ecap <= 0) {
+ pd->dev_type = PCIE_TYPE_LEGACY;
+ return;
+ }
+
+ pci_set_cap(pd, PCI_CFG_CAP_ID_EXP, ecap, NULL, NULL, false);
+
+ /*
+ * XXX We observe a problem on some PLX switches where one
+ * of the downstream ports appears as an upstream port, we
+ * fix that up here, otherwise other code will misbehave
+ */
+ pci_cfg_read16(phb, pd->bdfn, ecap + PCICAP_EXP_CAPABILITY_REG, &reg);
+ pd->dev_type = GETFIELD(PCICAP_EXP_CAP_TYPE, reg);
+ if (pd->parent && pd->parent->dev_type == PCIE_TYPE_SWITCH_UPPORT &&
+ pd->vdid == 0x874810b5 && pd->dev_type == PCIE_TYPE_SWITCH_UPPORT) {
+ PCIDBG(phb, pd->bdfn, "Fixing up bad PLX downstream port !\n");
+ pd->dev_type = PCIE_TYPE_SWITCH_DNPORT;
+ }
+
+ /* XXX Handle ARI */
+ if (pd->dev_type == PCIE_TYPE_SWITCH_DNPORT ||
+ pd->dev_type == PCIE_TYPE_ROOT_PORT)
+ pd->scan_map = 0x1;
+
+ /* Read MPS capability, whose maximal size is 4096 */
+ pci_cfg_read32(phb, pd->bdfn, ecap + PCICAP_EXP_DEVCAP, &val);
+ pd->mps = (128 << GETFIELD(PCICAP_EXP_DEVCAP_MPSS, val));
+ if (pd->mps > 4096)
+ pd->mps = 4096;
+}
+
+static void pci_init_aer_cap(struct phb *phb, struct pci_device *pd)
+{
+ int64_t pos;
+
+ if (!pci_has_cap(pd, PCI_CFG_CAP_ID_EXP, false))
+ return;
+
+ pos = pci_find_ecap(phb, pd->bdfn, PCIECAP_ID_AER, NULL);
+ if (pos > 0)
+ pci_set_cap(pd, PCIECAP_ID_AER, pos, NULL, NULL, true);
+}
+
+static void pci_init_pm_cap(struct phb *phb, struct pci_device *pd)
+{
+ int64_t pos;
+
+ pos = pci_find_cap(phb, pd->bdfn, PCI_CFG_CAP_ID_PM);
+ if (pos > 0)
+ pci_set_cap(pd, PCI_CFG_CAP_ID_PM, pos, NULL, NULL, false);
+}
+
+void pci_init_capabilities(struct phb *phb, struct pci_device *pd)
+{
+ pci_init_pcie_cap(phb, pd);
+ pci_init_aer_cap(phb, pd);
+ pci_init_pm_cap(phb, pd);
+}
+
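+/*
+ * Wait for a device to exit Configuration Request Retry Status (CRS):
+ * while the device isn't ready yet, a vendor/device ID read returns the
+ * CRS sentinel 0xffff0001, so poll for up to 40 * 100ms before giving up.
+ */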
+bool pci_wait_crs(struct phb *phb, uint16_t bdfn, uint32_t *out_vdid)
+{
+ uint32_t retries, vdid;
+ int64_t rc;
+ bool had_crs = false;
+
+ for (retries = 0; retries < 40; retries++) {
+ rc = pci_cfg_read32(phb, bdfn, PCI_CFG_VENDOR_ID, &vdid);
+ if (rc)
+ return false;
+ if (vdid == 0xffffffff || vdid == 0x00000000)
+ return false;
+ if (vdid != 0xffff0001)
+ break;
+ had_crs = true;
+ time_wait_ms(100);
+ }
+ if (vdid == 0xffff0001) {
+ PCIERR(phb, bdfn, "CRS timeout !\n");
+ return false;
+ }
+ if (had_crs)
+ PCIDBG(phb, bdfn, "Probe success after %d CRS\n", retries);
+
+ if (out_vdid)
+ *out_vdid = vdid;
+ return true;
+}
+
+static struct pci_device *pci_scan_one(struct phb *phb, struct pci_device *parent,
+ uint16_t bdfn)
+{
+ struct pci_device *pd = NULL;
+ uint32_t vdid;
+ int64_t rc;
+ uint8_t htype;
+
+ if (!pci_wait_crs(phb, bdfn, &vdid))
+ return NULL;
+
+ /* Perform a dummy write to the device in order for it to
+ * capture its own bus number, so any subsequent error
+ * messages will be properly tagged
+ */
+ pci_cfg_write32(phb, bdfn, PCI_CFG_VENDOR_ID, vdid);
+
+ pd = zalloc(sizeof(struct pci_device));
+ if (!pd) {
+ PCIERR(phb, bdfn,"Failed to allocate structure pci_device !\n");
+ goto fail;
+ }
+ pd->phb = phb;
+ pd->bdfn = bdfn;
+ pd->vdid = vdid;
+ pci_cfg_read32(phb, bdfn, PCI_CFG_SUBSYS_VENDOR_ID, &pd->sub_vdid);
+ pci_cfg_read32(phb, bdfn, PCI_CFG_REV_ID, &pd->class);
+ pd->class >>= 8;
+
+ pd->parent = parent;
+ list_head_init(&pd->pcrf);
+ list_head_init(&pd->children);
+ rc = pci_cfg_read8(phb, bdfn, PCI_CFG_HDR_TYPE, &htype);
+ if (rc) {
+ PCIERR(phb, bdfn, "Failed to read header type !\n");
+ goto fail;
+ }
+ pd->is_multifunction = !!(htype & 0x80);
+ pd->is_bridge = (htype & 0x7f) != 0;
+ pd->is_vf = false;
+ pd->scan_map = 0xffffffff; /* Default */
+ pd->primary_bus = PCI_BUS_NUM(bdfn);
+
+ pci_init_capabilities(phb, pd);
+
+ /* If it's a bridge, sanitize the bus numbers to avoid forwarding
+ *
+ * This will help when walking down those bridges later on
+ */
+ if (pd->is_bridge) {
+ pci_cfg_write8(phb, bdfn, PCI_CFG_PRIMARY_BUS, pd->primary_bus);
+ pci_cfg_write8(phb, bdfn, PCI_CFG_SECONDARY_BUS, 0);
+ pci_cfg_write8(phb, bdfn, PCI_CFG_SUBORDINATE_BUS, 0);
+ }
+
+ /* XXX Need to do some basic setups, such as MPSS, MRS,
+ * RCB, etc...
+ */
+
+ PCIDBG(phb, bdfn, "Found VID:%04x DEV:%04x TYP:%d MF%s BR%s EX%s\n",
+ vdid & 0xffff, vdid >> 16, pd->dev_type,
+ pd->is_multifunction ? "+" : "-",
+ pd->is_bridge ? "+" : "-",
+ pci_has_cap(pd, PCI_CFG_CAP_ID_EXP, false) ? "+" : "-");
+
+ /* Try to get PCI slot behind the device */
+ if (platform.pci_get_slot_info)
+ platform.pci_get_slot_info(phb, pd);
+
+ /* Add it to the child device list of the PHB or the parent */
+ if (!parent)
+ list_add_tail(&phb->devices, &pd->link);
+ else
+ list_add_tail(&parent->children, &pd->link);
+
+ /*
+ * Call PHB hook
+ */
+ if (phb->ops->device_init)
+ phb->ops->device_init(phb, pd, NULL);
+
+ return pd;
+ fail:
+ if (pd)
+ free(pd);
+ return NULL;
+}
+
+/* pci_check_clear_freeze - Probing an empty slot will result in an EEH
+ * freeze. Currently we have a single PE mapping
+ * everything (default state of our backend) so
+ * we just check and clear the state of PE#0
+ *
+ * returns true if a freeze was detected
+ *
+ * NOTE: We currently only handle simple PE freeze, not PHB fencing
+ * (or rather our backend does)
+ */
+bool pci_check_clear_freeze(struct phb *phb)
+{
+ uint8_t freeze_state;
+ uint16_t pci_error_type, sev;
+ int64_t pe_number, rc;
+
+ /* Retrieve the reserved PE number */
+ pe_number = OPAL_PARAMETER;
+ if (phb->ops->get_reserved_pe_number)
+ pe_number = phb->ops->get_reserved_pe_number(phb);
+ if (pe_number < 0)
+ return false;
+
+ /* Retrieve the frozen state */
+ rc = phb->ops->eeh_freeze_status(phb, pe_number, &freeze_state,
+ &pci_error_type, &sev);
+ if (rc)
+ return true; /* phb fence? */
+
+ if (freeze_state == OPAL_EEH_STOPPED_NOT_FROZEN)
+ return false;
+ /* We can't handle anything worse than an ER here */
+ if (sev > OPAL_EEH_SEV_NO_ERROR &&
+ sev < OPAL_EEH_SEV_PE_ER) {
+ PCIERR(phb, 0, "Fatal probe in %s error !\n", __func__);
+ return true;
+ }
+
+ phb->ops->eeh_freeze_clear(phb, pe_number,
+ OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
+ return true;
+}
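+
+/*
+ * Minimal usage sketch (matching how pci_scan_bus() below drives it):
+ * probing the config space of an empty slot may freeze the reserved
+ * PE, so every probe is followed by a check-and-clear.
+ *
+ *     pd = pci_scan_one(phb, parent, bdfn);
+ *     if (pci_check_clear_freeze(phb))
+ *         PCIDBG(phb, bdfn, "Probe froze PE, cleared it\n");
+ */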
+
+/*
+ * Turn off the slot's power supply if nothing is connected, for two
+ * purposes: power saving, obviously, and putting the slot into its
+ * initial power-off state for hotplug.
+ *
+ * The power should be turned on if the downstream link of the slot
+ * isn't up.
+ */
+static void pci_slot_set_power_state(struct phb *phb,
+ struct pci_device *pd,
+ uint8_t state)
+{
+ struct pci_slot *slot;
+ uint8_t cur_state;
+ int32_t wait = 100;
+ int64_t rc;
+
+ if (!pd || !pd->slot)
+ return;
+
+ slot = pd->slot;
+ if (!slot->pluggable ||
+ !slot->ops.get_power_state ||
+ !slot->ops.set_power_state)
+ return;
+
+ if (state == PCI_SLOT_POWER_OFF) {
+ /* Bail if something is connected */
+ if (!list_empty(&pd->children)) {
+ PCIERR(phb, pd->bdfn, "Attempted to power off slot with attached devices!\n");
+ return;
+ }
+
+ pci_slot_add_flags(slot, PCI_SLOT_FLAG_BOOTUP);
+ rc = slot->ops.get_power_state(slot, &cur_state);
+ if (rc != OPAL_SUCCESS) {
+ PCINOTICE(phb, pd->bdfn, "Error %lld getting slot power state\n", rc);
+ cur_state = PCI_SLOT_POWER_OFF;
+ }
+
+ pci_slot_remove_flags(slot, PCI_SLOT_FLAG_BOOTUP);
+ if (cur_state == PCI_SLOT_POWER_OFF)
+ return;
+ }
+
+ pci_slot_add_flags(slot,
+ (PCI_SLOT_FLAG_BOOTUP | PCI_SLOT_FLAG_ENFORCE));
+ rc = slot->ops.set_power_state(slot, state);
+ if (rc == OPAL_SUCCESS)
+ goto success;
+ if (rc != OPAL_ASYNC_COMPLETION) {
+ PCINOTICE(phb, pd->bdfn, "Error %lld powering %s slot\n",
+ rc, state == PCI_SLOT_POWER_ON ? "on" : "off");
+ goto error;
+ }
+
+ /* Wait until the operation is completed */
+ do {
+ if (slot->state == PCI_SLOT_STATE_SPOWER_DONE)
+ break;
+
+ check_timers(false);
+ time_wait_ms(10);
+ } while (--wait >= 0);
+
+ if (wait < 0) {
+ PCINOTICE(phb, pd->bdfn, "Timeout powering %s slot\n",
+ state == PCI_SLOT_POWER_ON ? "on" : "off");
+ goto error;
+ }
+
+success:
+ PCIDBG(phb, pd->bdfn, "Powering %s hotpluggable slot\n",
+ state == PCI_SLOT_POWER_ON ? "on" : "off");
+error:
+ pci_slot_remove_flags(slot,
+ (PCI_SLOT_FLAG_BOOTUP | PCI_SLOT_FLAG_ENFORCE));
+ pci_slot_set_state(slot, PCI_SLOT_STATE_NORMAL);
+}
+
+static bool pci_bridge_power_on(struct phb *phb, struct pci_device *pd)
+{
+ int32_t ecap;
+ uint16_t pcie_cap, slot_sts, slot_ctl, link_ctl;
+ uint32_t slot_cap;
+ int64_t rc;
+
+ /*
+ * If there is a PCI slot associated with the bridge, use
+ * the PCI slot's facility to power it on.
+ */
+ if (pd->slot) {
+ struct pci_slot *slot = pd->slot;
+ uint8_t presence;
+
+ /*
+ * We assume the presence state is OPAL_PCI_SLOT_PRESENT
+ * by default. That way we won't miss anything when the
+ * operation isn't supported or an error is hit while
+ * retrieving it.
+ */
+ if (slot->ops.get_presence_state) {
+ rc = slot->ops.get_presence_state(slot, &presence);
+ if (rc == OPAL_SUCCESS &&
+ presence == OPAL_PCI_SLOT_EMPTY)
+ return false;
+ }
+
+ /* Power it on */
+ pci_slot_set_power_state(phb, pd, PCI_SLOT_POWER_ON);
+ return true;
+ }
+
+ if (!pci_has_cap(pd, PCI_CFG_CAP_ID_EXP, false))
+ return true;
+
+ /* Check if slot is supported */
+ ecap = pci_cap(pd, PCI_CFG_CAP_ID_EXP, false);
+ pci_cfg_read16(phb, pd->bdfn,
+ ecap + PCICAP_EXP_CAPABILITY_REG, &pcie_cap);
+ if (!(pcie_cap & PCICAP_EXP_CAP_SLOT))
+ return true;
+
+ /* Check presence */
+ pci_cfg_read16(phb, pd->bdfn,
+ ecap + PCICAP_EXP_SLOTSTAT, &slot_sts);
+ if (!(slot_sts & PCICAP_EXP_SLOTSTAT_PDETECTST))
+ return false;
+
+ /* Ensure that power control is supported */
+ pci_cfg_read32(phb, pd->bdfn,
+ ecap + PCICAP_EXP_SLOTCAP, &slot_cap);
+ if (!(slot_cap & PCICAP_EXP_SLOTCAP_PWCTRL))
+ return true;
+
+
+ /* Read the slot control register, check if the slot is off */
+ pci_cfg_read16(phb, pd->bdfn, ecap + PCICAP_EXP_SLOTCTL, &slot_ctl);
+ PCITRACE(phb, pd->bdfn, " SLOT_CTL=%04x\n", slot_ctl);
+ if (slot_ctl & PCICAP_EXP_SLOTCTL_PWRCTLR) {
+ PCIDBG(phb, pd->bdfn, "Bridge power is off, turning on ...\n");
+ slot_ctl &= ~PCICAP_EXP_SLOTCTL_PWRCTLR;
+ slot_ctl |= SETFIELD(PCICAP_EXP_SLOTCTL_PWRI, 0, PCIE_INDIC_ON);
+ pci_cfg_write16(phb, pd->bdfn,
+ ecap + PCICAP_EXP_SLOTCTL, slot_ctl);
+
+ /* Wait a couple of seconds */
+ time_wait_ms(2000);
+ }
+
+ /* Enable link */
+ pci_cfg_read16(phb, pd->bdfn, ecap + PCICAP_EXP_LCTL, &link_ctl);
+ PCITRACE(phb, pd->bdfn, " LINK_CTL=%04x\n", link_ctl);
+ link_ctl &= ~PCICAP_EXP_LCTL_LINK_DIS;
+ pci_cfg_write16(phb, pd->bdfn, ecap + PCICAP_EXP_LCTL, link_ctl);
+
+ return true;
+}
+
+static bool pci_bridge_wait_link(struct phb *phb,
+ struct pci_device *pd,
+ bool was_reset)
+{
+ int32_t ecap = 0;
+ uint32_t link_cap = 0, retries = 100;
+ uint16_t link_sts;
+
+ if (pci_has_cap(pd, PCI_CFG_CAP_ID_EXP, false)) {
+ ecap = pci_cap(pd, PCI_CFG_CAP_ID_EXP, false);
+ pci_cfg_read32(phb, pd->bdfn, ecap + PCICAP_EXP_LCAP, &link_cap);
+ }
+
+ /*
+ * If link state reporting isn't supported, wait 1 second
+ * if the downstream link was ever reset.
+ */
+ if (!(link_cap & PCICAP_EXP_LCAP_DL_ACT_REP)) {
+ if (was_reset)
+ time_wait_ms(1000);
+
+ return true;
+ }
+
+ /*
+ * Link state reporting is supported; wait for the link to
+ * come up, with a timeout.
+ */
+ PCIDBG(phb, pd->bdfn, "waiting for link... \n");
+ while (retries--) {
+ pci_cfg_read16(phb, pd->bdfn,
+ ecap + PCICAP_EXP_LSTAT, &link_sts);
+ if (link_sts & PCICAP_EXP_LSTAT_DLLL_ACT)
+ break;
+
+ time_wait_ms(100);
+ }
+
+ if (!(link_sts & PCICAP_EXP_LSTAT_DLLL_ACT)) {
+ PCIERR(phb, pd->bdfn, "Timeout waiting for downstream link\n");
+ return false;
+ }
+
+ /* Need another 100ms before touching the config space */
+ time_wait_ms(100);
+ PCIDBG(phb, pd->bdfn, "link is up\n");
+
+ return true;
+}
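+
+/*
+ * In short, the two paths above are (a sketch of the behaviour, not new
+ * logic): bridges advertising Data Link Layer Active Reporting in LCAP
+ * are polled on LSTAT's DLLL_ACT bit for up to 100 * 100ms; everything
+ * else just gets a fixed 1s settle time after a reset. Callers use it
+ * like:
+ *
+ *     if (!pci_bridge_wait_link(phb, pd, was_reset))
+ *         return false;    // link never came up, don't scan below it
+ */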
+
+/* pci_enable_bridge - Called before scanning a bridge
+ *
+ * Ensures error flags are clean, disables master abort reporting,
+ * and checks that the subordinate bus isn't held in reset, that the
+ * slot is powered on for PCIe, etc...
+ */
+static bool pci_enable_bridge(struct phb *phb, struct pci_device *pd)
+{
+ uint16_t bctl;
+ bool was_reset = false;
+
+ /* Disable master aborts, clear errors */
+ pci_cfg_read16(phb, pd->bdfn, PCI_CFG_BRCTL, &bctl);
+ bctl &= ~PCI_CFG_BRCTL_MABORT_REPORT;
+ pci_cfg_write16(phb, pd->bdfn, PCI_CFG_BRCTL, bctl);
+
+
+ /* PCI-E bridge, check the slot state. We don't do that on the
+ * root complex as this is handled separately and not all our
+ * RCs implement the standard register set.
+ */
+ if ((pd->dev_type == PCIE_TYPE_ROOT_PORT && pd->primary_bus > 0) ||
+ pd->dev_type == PCIE_TYPE_SWITCH_DNPORT) {
+ if (pci_has_cap(pd, PCI_CFG_CAP_ID_EXP, false)) {
+ int32_t ecap;
+ uint32_t link_cap = 0;
+ uint16_t link_sts = 0;
+
+ ecap = pci_cap(pd, PCI_CFG_CAP_ID_EXP, false);
+ pci_cfg_read32(phb, pd->bdfn,
+ ecap + PCICAP_EXP_LCAP, &link_cap);
+
+ /*
+ * No need to touch the power supply if the PCIe link is
+ * already up. Furthermore, the slot presence bit is lost while
+ * the PCIe link is up on certain PCI topologies. In that
+ * case, we need to ignore the slot presence bit and go ahead
+ * with probing; otherwise, the NVMe adapter won't be probed.
+ *
+ * Example topology: PHB3 root port, PLX switch 8748 (10b5:8748),
+ * PLX switch 9733 (10b5:9733), PMC 8546 switch (11f8:8546),
+ * NVMe adapter (1c58:0023).
+ */
+ ecap = pci_cap(pd, PCI_CFG_CAP_ID_EXP, false);
+ pci_cfg_read32(phb, pd->bdfn,
+ ecap + PCICAP_EXP_LCAP, &link_cap);
+ pci_cfg_read16(phb, pd->bdfn,
+ ecap + PCICAP_EXP_LSTAT, &link_sts);
+ if ((link_cap & PCICAP_EXP_LCAP_DL_ACT_REP) &&
+ (link_sts & PCICAP_EXP_LSTAT_DLLL_ACT))
+ return true;
+ }
+
+ /* Power on the downstream slot or link */
+ if (!pci_bridge_power_on(phb, pd))
+ return false;
+ }
+
+ /* Clear secondary reset */
+ if (bctl & PCI_CFG_BRCTL_SECONDARY_RESET) {
+ PCIDBG(phb, pd->bdfn,
+ "Bridge secondary reset is on, clearing it ...\n");
+ bctl &= ~PCI_CFG_BRCTL_SECONDARY_RESET;
+ pci_cfg_write16(phb, pd->bdfn, PCI_CFG_BRCTL, bctl);
+ time_wait_ms(1000);
+ was_reset = true;
+ }
+
+ /* PCI-E bridge, wait for link */
+ if (pd->dev_type == PCIE_TYPE_ROOT_PORT ||
+ pd->dev_type == PCIE_TYPE_SWITCH_DNPORT) {
+ if (!pci_bridge_wait_link(phb, pd, was_reset))
+ return false;
+ }
+
+ /* Clear error status */
+ pci_cfg_write16(phb, pd->bdfn, PCI_CFG_STAT, 0xffff);
+ return true;
+}
+
+/* Clear up bridge resources */
+static void pci_cleanup_bridge(struct phb *phb, struct pci_device *pd)
+{
+ uint16_t cmd;
+
+ pci_cfg_write16(phb, pd->bdfn, PCI_CFG_IO_BASE_U16, 0xffff);
+ pci_cfg_write8(phb, pd->bdfn, PCI_CFG_IO_BASE, 0xf0);
+ pci_cfg_write16(phb, pd->bdfn, PCI_CFG_IO_LIMIT_U16, 0);
+ pci_cfg_write8(phb, pd->bdfn, PCI_CFG_IO_LIMIT, 0);
+ pci_cfg_write16(phb, pd->bdfn, PCI_CFG_MEM_BASE, 0xfff0);
+ pci_cfg_write16(phb, pd->bdfn, PCI_CFG_MEM_LIMIT, 0);
+ pci_cfg_write32(phb, pd->bdfn, PCI_CFG_PREF_MEM_BASE_U32, 0xffffffff);
+ pci_cfg_write16(phb, pd->bdfn, PCI_CFG_PREF_MEM_BASE, 0xfff0);
+ pci_cfg_write32(phb, pd->bdfn, PCI_CFG_PREF_MEM_LIMIT_U32, 0);
+ pci_cfg_write16(phb, pd->bdfn, PCI_CFG_PREF_MEM_LIMIT, 0);
+
+ /* Note: This is a bit fishy but since we have closed all the
+ * bridge windows above, it shouldn't be a problem. Basically
+ * we enable Memory, IO and Bus Master on the bridge because
+ * some versions of Linux will fail to do it themselves.
+ */
+ pci_cfg_read16(phb, pd->bdfn, PCI_CFG_CMD, &cmd);
+ cmd |= PCI_CFG_CMD_IO_EN | PCI_CFG_CMD_MEM_EN;
+ cmd |= PCI_CFG_CMD_BUS_MASTER_EN;
+ pci_cfg_write16(phb, pd->bdfn, PCI_CFG_CMD, cmd);
+}
+
+/* Remove all subordinate PCI devices leading from the indicated
+ * PCI bus. It's used to remove all PCI devices behind one PCI
+ * slot at unplugging time
+ */
+void pci_remove_bus(struct phb *phb, struct list_head *list)
+{
+ struct pci_device *pd, *tmp;
+
+ list_for_each_safe(list, pd, tmp, link) {
+ pci_remove_bus(phb, &pd->children);
+
+ if (phb->ops->device_remove)
+ phb->ops->device_remove(phb, pd);
+
+ /* Release device node and PCI slot */
+ if (pd->dn)
+ dt_free(pd->dn);
+ if (pd->slot)
+ free(pd->slot);
+
+ /* Remove from parent list and release itself */
+ list_del(&pd->link);
+ free(pd);
+ }
+}
+
+static void pci_set_power_limit(struct pci_device *pd)
+{
+ uint32_t offset, val;
+ uint16_t caps;
+
+ offset = pci_cap(pd, PCI_CFG_CAP_ID_EXP, false);
+ if (!offset)
+ return; /* legacy dev */
+
+ pci_cfg_read16(pd->phb, pd->bdfn,
+ offset + PCICAP_EXP_CAPABILITY_REG, &caps);
+
+ if (!(caps & PCICAP_EXP_CAP_SLOT))
+ return; /* bridge has no slot capabilities */
+ if (!pd->slot || !pd->slot->power_limit)
+ return;
+
+ pci_cfg_read32(pd->phb, pd->bdfn, offset + PCICAP_EXP_SLOTCAP, &val);
+
+ val = SETFIELD(PCICAP_EXP_SLOTCAP_SPLSC, val, 0); /* 1W scale */
+ val = SETFIELD(PCICAP_EXP_SLOTCAP_SPLVA, val, pd->slot->power_limit);
+
+ pci_cfg_write32(pd->phb, pd->bdfn, offset + PCICAP_EXP_SLOTCAP, val);
+
+ /* update the cached copy in the slot */
+ pd->slot->slot_cap = val;
+
+ PCIDBG(pd->phb, pd->bdfn, "Slot power limit set to %dW\n",
+ pd->slot->power_limit);
+}
+
+/* Perform a recursive scan of the bus at bus_number populating
+ * the list passed as an argument. This also performs the bus
+ * numbering, so it returns the largest bus number that was
+ * assigned.
+ *
+ * Note: Eventually this might want to access some VPD information
+ * in order to know what slots to scan and what not etc..
+ *
+ * XXX NOTE: We might want to enable ARI along the way...
+ *
+ * XXX NOTE: We might also want to setup the PCIe MPS/MRSS properly
+ * here as Linux may or may not do it
+ */
+uint8_t pci_scan_bus(struct phb *phb, uint8_t bus, uint8_t max_bus,
+ struct list_head *list, struct pci_device *parent,
+ bool scan_downstream)
+{
+ struct pci_device *pd = NULL, *rc = NULL;
+ uint8_t dev, fn, next_bus, max_sub;
+ uint32_t scan_map;
+
+ /* Decide what to scan */
+ scan_map = parent ? parent->scan_map : phb->scan_map;
+
+ /* Do scan */
+ for (dev = 0; dev < 32; dev++) {
+ if (!(scan_map & (1ul << dev)))
+ continue;
+
+ /* Scan the device */
+ pd = pci_scan_one(phb, parent, (bus << 8) | (dev << 3));
+ pci_check_clear_freeze(phb);
+ if (!pd)
+ continue;
+
+ /* Record RC when its downstream link is down */
+ if (!scan_downstream && dev == 0 && !rc)
+ rc = pd;
+
+ /* XXX Handle ARI */
+ if (!pd->is_multifunction)
+ continue;
+ for (fn = 1; fn < 8; fn++) {
+ pd = pci_scan_one(phb, parent,
+ ((uint16_t)bus << 8) | (dev << 3) | fn);
+ pci_check_clear_freeze(phb);
+ }
+ }
+
+ /* Reserve all possible buses if the RC's downstream link is down
+  * and PCI hotplug is supported.
+ */
+ if (rc && rc->slot && rc->slot->pluggable) {
+ next_bus = bus + 1;
+ rc->secondary_bus = next_bus;
+ rc->subordinate_bus = max_bus;
+ pci_cfg_write8(phb, rc->bdfn, PCI_CFG_SECONDARY_BUS,
+ rc->secondary_bus);
+ pci_cfg_write8(phb, rc->bdfn, PCI_CFG_SUBORDINATE_BUS,
+ rc->subordinate_bus);
+ }
+
+ /* set the power limit for any downstream slots while we're here */
+ list_for_each(list, pd, link) {
+ if (pd->is_bridge)
+ pci_set_power_limit(pd);
+ }
+
+ /*
+ * We only scan downstream if instructed to do so by the
+ * caller. Typically we avoid the scan when we know the
+ * link is down already, which happens for the top level
+ * root complex, and avoids a long secondary timeout
+ */
+ if (!scan_downstream) {
+ list_for_each(list, pd, link)
+ pci_slot_set_power_state(phb, pd, PCI_SLOT_POWER_OFF);
+
+ return bus;
+ }
+
+ next_bus = bus + 1;
+ max_sub = bus;
+
+ /* Scan down bridges */
+ list_for_each(list, pd, link) {
+ bool do_scan;
+
+ if (!pd->is_bridge)
+ continue;
+
+ /* Configure the bridge with the returned values */
+ if (next_bus <= bus) {
+ PCIERR(phb, pd->bdfn, "Out of bus numbers !\n");
+ max_bus = next_bus = 0; /* Failure case */
+ }
+
+ pd->secondary_bus = next_bus;
+ pd->subordinate_bus = max_bus;
+ pci_cfg_write8(phb, pd->bdfn, PCI_CFG_SECONDARY_BUS, next_bus);
+ pci_cfg_write8(phb, pd->bdfn, PCI_CFG_SUBORDINATE_BUS, max_bus);
+ if (!next_bus)
+ break;
+
+ PCIDBG(phb, pd->bdfn, "Bus %02x..%02x scanning...\n",
+ next_bus, max_bus);
+
+ /* Clear up bridge resources */
+ pci_cleanup_bridge(phb, pd);
+
+ /* Configure the bridge. This will enable power to the slot
+ * if it's currently disabled, lift reset, etc...
+ *
+ * Return false if we know there's nothing behind the bridge
+ */
+ do_scan = pci_enable_bridge(phb, pd);
+
+ /* Perform recursive scan */
+ if (do_scan) {
+ max_sub = pci_scan_bus(phb, next_bus, max_bus,
+ &pd->children, pd, true);
+ } else {
+ /* Empty bridge. We leave room for hotplug
+ * slots if the downstream port is pluggable.
+ */
+ if (pd->slot && !pd->slot->pluggable)
+ max_sub = next_bus;
+ else {
+ max_sub = next_bus + 4;
+ if (max_sub > max_bus)
+ max_sub = max_bus;
+ }
+ }
+
+ pd->subordinate_bus = max_sub;
+ pci_cfg_write8(phb, pd->bdfn, PCI_CFG_SUBORDINATE_BUS, max_sub);
+ next_bus = max_sub + 1;
+
+ /* power off the slot if there's nothing below it */
+ if (list_empty(&pd->children))
+ pci_slot_set_power_state(phb, pd, PCI_SLOT_POWER_OFF);
+ }
+
+ return max_sub;
+}
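+
+/*
+ * Typical top-level invocation (a sketch; pci_scan_phb() further down
+ * does exactly this): scan root bus 0, allow the full range up to 0xff
+ * for subordinate buses, and only descend if the PHB link is up.
+ *
+ *     pci_scan_bus(phb, 0, 0xff, &phb->devices, NULL, link_up);
+ *
+ * where link_up is the boolean link state retrieved from the PHB slot.
+ */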
+
+static int pci_get_mps(struct phb *phb,
+ struct pci_device *pd, void *userdata)
+{
+ uint32_t *mps = (uint32_t *)userdata;
+
+ /* Only consider PCI devices that have an MPS capability */
+ if (phb && pd && pd->mps && *mps > pd->mps)
+ *mps = pd->mps;
+
+ return 0;
+}
+
+static int pci_configure_mps(struct phb *phb,
+ struct pci_device *pd,
+ void *userdata __unused)
+{
+ uint32_t ecap, aercap, mps;
+ uint16_t val;
+
+ assert(phb);
+ assert(pd);
+
+ /* If the MPS isn't an acceptable one, bail immediately */
+ mps = phb->mps;
+ if (mps < 128 || mps > 4096)
+ return 1;
+
+ /* Retrieve PCIe and AER capability */
+ ecap = pci_cap(pd, PCI_CFG_CAP_ID_EXP, false);
+ aercap = pci_cap(pd, PCIECAP_ID_AER, true);
+
+ /* A PCIe device always has an MPS capability */
+ if (pd->mps) {
+ mps = ilog2(mps) - 7;
+
+ pci_cfg_read16(phb, pd->bdfn, ecap + PCICAP_EXP_DEVCTL, &val);
+ val = SETFIELD(PCICAP_EXP_DEVCTL_MPS, val, mps);
+ pci_cfg_write16(phb, pd->bdfn, ecap + PCICAP_EXP_DEVCTL, val);
+ }
+
+ /* Changing the MPS on an upstream PCI bridge might set some error
+  * bits in the PCIe and AER capabilities. Clear them to avoid
+  * confusion.
+ */
+ if (aercap) {
+ pci_cfg_write32(phb, pd->bdfn, aercap + PCIECAP_AER_UE_STATUS,
+ 0xffffffff);
+ pci_cfg_write32(phb, pd->bdfn, aercap + PCIECAP_AER_CE_STATUS,
+ 0xffffffff);
+ }
+ if (ecap)
+ pci_cfg_write16(phb, pd->bdfn, ecap + PCICAP_EXP_DEVSTAT, 0xf);
+
+ return 0;
+}
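+
+/*
+ * The two walkers above are meant to be used together (this mirrors
+ * pci_scan_phb() further down): first find the smallest MPS supported
+ * by any device in the domain, then program that value everywhere.
+ *
+ *     uint32_t mps = 0xffffffff;
+ *
+ *     pci_walk_dev(phb, NULL, pci_get_mps, &mps);
+ *     phb->mps = mps;
+ *     pci_walk_dev(phb, NULL, pci_configure_mps, NULL);
+ */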
+
+static void pci_disable_completion_timeout(struct phb *phb, struct pci_device *pd)
+{
+ uint32_t ecap, val;
+ uint16_t pcie_cap;
+
+ /* PCIE capability required */
+ if (!pci_has_cap(pd, PCI_CFG_CAP_ID_EXP, false))
+ return;
+
+ /* Check PCIe capability version */
+ ecap = pci_cap(pd, PCI_CFG_CAP_ID_EXP, false);
+ pci_cfg_read16(phb, pd->bdfn,
+ ecap + PCICAP_EXP_CAPABILITY_REG, &pcie_cap);
+ if ((pcie_cap & PCICAP_EXP_CAP_VERSION) <= 1)
+ return;
+
+ /* Check if it has capability to disable completion timeout */
+ pci_cfg_read32(phb, pd->bdfn, ecap + PCIECAP_EXP_DCAP2, &val);
+ if (!(val & PCICAP_EXP_DCAP2_CMPTOUT_DIS))
+ return;
+
+ /* Disable the completion timeout without further checks */
+ pci_cfg_read32(phb, pd->bdfn, ecap + PCICAP_EXP_DCTL2, &val);
+ val |= PCICAP_EXP_DCTL2_CMPTOUT_DIS;
+ pci_cfg_write32(phb, pd->bdfn, ecap + PCICAP_EXP_DCTL2, val);
+}
+
+void pci_device_init(struct phb *phb, struct pci_device *pd)
+{
+ pci_configure_mps(phb, pd, NULL);
+ pci_disable_completion_timeout(phb, pd);
+}
+
+static void pci_reset_phb(void *data)
+{
+ struct phb *phb = data;
+ struct pci_slot *slot = phb->slot;
+ int64_t rc;
+
+ if (!slot || !slot->ops.run_sm) {
+ PCINOTICE(phb, 0, "Cannot issue reset\n");
+ return;
+ }
+
+ pci_slot_add_flags(slot, PCI_SLOT_FLAG_BOOTUP);
+ rc = slot->ops.run_sm(slot);
+ while (rc > 0) {
+ PCITRACE(phb, 0, "Waiting %ld ms\n", tb_to_msecs(rc));
+ time_wait(rc);
+ rc = slot->ops.run_sm(slot);
+ }
+ pci_slot_remove_flags(slot, PCI_SLOT_FLAG_BOOTUP);
+ if (rc < 0)
+ PCIDBG(phb, 0, "Error %lld resetting\n", rc);
+}
+
+static void pci_scan_phb(void *data)
+{
+ struct phb *phb = data;
+ struct pci_slot *slot = phb->slot;
+ uint8_t link;
+ uint32_t mps = 0xffffffff;
+ int64_t rc;
+
+ if (!slot || !slot->ops.get_link_state) {
+ PCIERR(phb, 0, "Cannot query link status\n");
+ link = 0;
+ } else {
+ rc = slot->ops.get_link_state(slot, &link);
+ if (rc != OPAL_SUCCESS) {
+ PCIERR(phb, 0, "Error %lld querying link status\n",
+ rc);
+ link = 0;
+ }
+ }
+
+ if (!link)
+ PCIDBG(phb, 0, "Link down\n");
+ else
+ PCIDBG(phb, 0, "Link up at x%d width\n", link);
+
+ /* Scan root port and downstream ports if applicable */
+ PCIDBG(phb, 0, "Scanning (upstream%s)...\n",
+ link ? "+downsteam" : " only");
+ pci_scan_bus(phb, 0, 0xff, &phb->devices, NULL, link);
+
+ /* Configure MPS (Max Payload Size) for PCIe domain */
+ pci_walk_dev(phb, NULL, pci_get_mps, &mps);
+ phb->mps = mps;
+ pci_walk_dev(phb, NULL, pci_configure_mps, NULL);
+}
+
+int64_t pci_register_phb(struct phb *phb, int opal_id)
+{
+ /* The user didn't specify an opal_id, allocate one */
+ if (opal_id == OPAL_DYNAMIC_PHB_ID) {
+ /* This is called at init time in non-concurrent way, so no lock needed */
+ for (opal_id = 0; opal_id < ARRAY_SIZE(phbs); opal_id++)
+ if (!phbs[opal_id])
+ break;
+ if (opal_id >= ARRAY_SIZE(phbs)) {
+ prerror("PHB: Failed to find a free ID slot\n");
+ return OPAL_RESOURCE;
+ }
+ } else {
+ if (opal_id >= ARRAY_SIZE(phbs)) {
+ prerror("PHB: ID %x out of range !\n", opal_id);
+ return OPAL_PARAMETER;
+ }
+ /* The user did specify an opal_id, check it's free */
+ if (phbs[opal_id]) {
+ prerror("PHB: Duplicate registration of ID %x\n", opal_id);
+ return OPAL_PARAMETER;
+ }
+ }
+
+ phbs[opal_id] = phb;
+ phb->opal_id = opal_id;
+ if (opal_id > last_phb_id)
+ last_phb_id = opal_id;
+ dt_add_property_cells(phb->dt_node, "ibm,opal-phbid", 0, phb->opal_id);
+ PCIDBG(phb, 0, "PCI: Registered PHB\n");
+
+ init_lock(&phb->lock);
+ list_head_init(&phb->devices);
+
+ phb->filter_map = zalloc(BITMAP_BYTES(0x10000));
+ assert(phb->filter_map);
+
+ return OPAL_SUCCESS;
+}
+
+int64_t pci_unregister_phb(struct phb *phb)
+{
+ /* XXX We want some kind of RCU or RWlock to make things
+ * like that happen while no OPAL callback is in progress,
+ * that way we avoid taking a lock in each of them.
+ *
+ * Right now we don't unregister so we are fine
+ */
+ phbs[phb->opal_id] = NULL;
+
+ return OPAL_SUCCESS;
+}
+
+struct phb *pci_get_phb(uint64_t phb_id)
+{
+ if (phb_id >= ARRAY_SIZE(phbs))
+ return NULL;
+
+ /* XXX See comment in pci_unregister_phb() about locking etc... */
+ return phbs[phb_id];
+}
+
+static const char *pci_class_name(uint32_t class_code)
+{
+ uint8_t class = class_code >> 16;
+ uint8_t sub = (class_code >> 8) & 0xff;
+ uint8_t pif = class_code & 0xff;
+
+ switch(class) {
+ case 0x00:
+ switch(sub) {
+ case 0x00: return "device";
+ case 0x01: return "vga";
+ }
+ break;
+ case 0x01:
+ switch(sub) {
+ case 0x00: return "scsi";
+ case 0x01: return "ide";
+ case 0x02: return "fdc";
+ case 0x03: return "ipi";
+ case 0x04: return "raid";
+ case 0x05: return "ata";
+ case 0x06: return "sata";
+ case 0x07: return "sas";
+ default: return "mass-storage";
+ }
+ case 0x02:
+ switch(sub) {
+ case 0x00: return "ethernet";
+ case 0x01: return "token-ring";
+ case 0x02: return "fddi";
+ case 0x03: return "atm";
+ case 0x04: return "isdn";
+ case 0x05: return "worldfip";
+ case 0x06: return "picmg";
+ default: return "network";
+ }
+ case 0x03:
+ switch(sub) {
+ case 0x00: return "vga";
+ case 0x01: return "xga";
+ case 0x02: return "3d-controller";
+ default: return "display";
+ }
+ case 0x04:
+ switch(sub) {
+ case 0x00: return "video";
+ case 0x01: return "sound";
+ case 0x02: return "telephony";
+ default: return "multimedia-device";
+ }
+ case 0x05:
+ switch(sub) {
+ case 0x00: return "memory";
+ case 0x01: return "flash";
+ default: return "memory-controller";
+ }
+ case 0x06:
+ switch(sub) {
+ case 0x00: return "host";
+ case 0x01: return "isa";
+ case 0x02: return "eisa";
+ case 0x03: return "mca";
+ case 0x04: return "pci";
+ case 0x05: return "pcmcia";
+ case 0x06: return "nubus";
+ case 0x07: return "cardbus";
+ case 0x08: return "raceway";
+ case 0x09: return "semi-transparent-pci";
+ case 0x0a: return "infiniband";
+ default: return "unknown-bridge";
+ }
+ case 0x07:
+ switch(sub) {
+ case 0x00:
+ switch(pif) {
+ case 0x01: return "16450-serial";
+ case 0x02: return "16550-serial";
+ case 0x03: return "16650-serial";
+ case 0x04: return "16750-serial";
+ case 0x05: return "16850-serial";
+ case 0x06: return "16950-serial";
+ default: return "serial";
+ }
+ case 0x01:
+ switch(pif) {
+ case 0x01: return "bi-directional-parallel";
+ case 0x02: return "ecp-1.x-parallel";
+ case 0x03: return "ieee1284-controller";
+ case 0xfe: return "ieee1284-device";
+ default: return "parallel";
+ }
+ case 0x02: return "multiport-serial";
+ case 0x03:
+ switch(pif) {
+ case 0x01: return "16450-modem";
+ case 0x02: return "16550-modem";
+ case 0x03: return "16650-modem";
+ case 0x04: return "16750-modem";
+ default: return "modem";
+ }
+ case 0x04: return "gpib";
+ case 0x05: return "smart-card";
+ default: return "communication-controller";
+ }
+ case 0x08:
+ switch(sub) {
+ case 0x00:
+ switch(pif) {
+ case 0x01: return "isa-pic";
+ case 0x02: return "eisa-pic";
+ case 0x10: return "io-apic";
+ case 0x20: return "iox-apic";
+ default: return "interrupt-controller";
+ }
+ case 0x01:
+ switch(pif) {
+ case 0x01: return "isa-dma";
+ case 0x02: return "eisa-dma";
+ default: return "dma-controller";
+ }
+ case 0x02:
+ switch(pif) {
+ case 0x01: return "isa-system-timer";
+ case 0x02: return "eisa-system-timer";
+ default: return "timer";
+ }
+ case 0x03:
+ switch(pif) {
+ case 0x01: return "isa-rtc";
+ default: return "rtc";
+ }
+ case 0x04: return "hotplug-controller";
+ case 0x05: return "sd-host-controller";
+ default: return "system-peripheral";
+ }
+ case 0x09:
+ switch(sub) {
+ case 0x00: return "keyboard";
+ case 0x01: return "pen";
+ case 0x02: return "mouse";
+ case 0x03: return "scanner";
+ case 0x04: return "gameport";
+ default: return "input-controller";
+ }
+ case 0x0a:
+ switch(sub) {
+ case 0x00: return "clock";
+ default: return "docking-station";
+ }
+ case 0x0b:
+ switch(sub) {
+ case 0x00: return "386";
+ case 0x01: return "486";
+ case 0x02: return "pentium";
+ case 0x10: return "alpha";
+ case 0x20: return "powerpc";
+ case 0x30: return "mips";
+ case 0x40: return "co-processor";
+ default: return "cpu";
+ }
+ case 0x0c:
+ switch(sub) {
+ case 0x00: return "firewire";
+ case 0x01: return "access-bus";
+ case 0x02: return "ssa";
+ case 0x03:
+ switch(pif) {
+ case 0x00: return "usb-uhci";
+ case 0x10: return "usb-ohci";
+ case 0x20: return "usb-ehci";
+ case 0x30: return "usb-xhci";
+ case 0xfe: return "usb-device";
+ default: return "usb";
+ }
+ case 0x04: return "fibre-channel";
+ case 0x05: return "smb";
+ case 0x06: return "infiniband";
+ case 0x07:
+ switch(pif) {
+ case 0x00: return "impi-smic";
+ case 0x01: return "impi-kbrd";
+ case 0x02: return "impi-bltr";
+ default: return "impi";
+ }
+ case 0x08: return "secos";
+ case 0x09: return "canbus";
+ default: return "serial-bus";
+ }
+ case 0x0d:
+ switch(sub) {
+ case 0x00: return "irda";
+ case 0x01: return "consumer-ir";
+ case 0x10: return "rf-controller";
+ case 0x11: return "bluetooth";
+ case 0x12: return "broadband";
+ case 0x20: return "enet-802.11a";
+ case 0x21: return "enet-802.11b";
+ default: return "wireless-controller";
+ }
+ case 0x0e: return "intelligent-controller";
+ case 0x0f:
+ switch(sub) {
+ case 0x01: return "satellite-tv";
+ case 0x02: return "satellite-audio";
+ case 0x03: return "satellite-voice";
+ case 0x04: return "satellite-data";
+ default: return "satellite-device";
+ }
+ case 0x10:
+ switch(sub) {
+ case 0x00: return "network-encryption";
+ case 0x01: return "entertainment-encryption";
+ default: return "encryption";
+ }
+ case 0x11:
+ switch(sub) {
+ case 0x00: return "dpio";
+ case 0x01: return "counter";
+ case 0x10: return "measurement";
+ case 0x20: return "management-card";
+ default: return "data-processing";
+ }
+ }
+ return "device";
+}
+
+void pci_std_swizzle_irq_map(struct dt_node *np,
+ struct pci_device *pd,
+ struct pci_lsi_state *lstate,
+ uint8_t swizzle)
+{
+ __be32 *p, *map;
+ int dev, irq, esize, edevcount;
+ size_t map_size;
+
+ /* Some emulated setups don't use the standard interrupt
+  * representation
+ */
+ if (lstate->int_size == 0)
+ return;
+
+ /* Calculate the size of a map entry:
+ *
+ * 3 cells : PCI Address
+ * 1 cell : PCI IRQ
+ * 1 cell : PIC phandle
+ * n cells : PIC irq (n = lstate->int_size)
+ *
+ * Assumption: PIC address is 0-size
+ */
+ esize = 3 + 1 + 1 + lstate->int_size;
+
+ /* Number of map "device" entries
+ *
+ * A PCI Express root or downstream port needs only one
+ * entry for device 0. Anything else will get a full map
+ * for all possible 32 child device numbers
+ *
+ * If we have been passed a host bridge (pd == NULL) we also
+ * do a simple per-pin map
+ */
+ if (!pd || (pd->dev_type == PCIE_TYPE_ROOT_PORT ||
+ pd->dev_type == PCIE_TYPE_SWITCH_DNPORT)) {
+ edevcount = 1;
+ dt_add_property_cells(np, "interrupt-map-mask", 0, 0, 0, 7);
+ } else {
+ edevcount = 32;
+ dt_add_property_cells(np, "interrupt-map-mask",
+ 0xf800, 0, 0, 7);
+ }
+ map_size = esize * edevcount * 4 * sizeof(u32);
+ map = p = zalloc(map_size);
+ if (!map) {
+ prerror("Failed to allocate interrupt-map-mask !\n");
+ return;
+ }
+
+ for (dev = 0; dev < edevcount; dev++) {
+ for (irq = 0; irq < 4; irq++) {
+ /* Calculate pin */
+ size_t i;
+ uint32_t new_irq = (irq + dev + swizzle) % 4;
+
+ /* PCI address portion */
+ *(p++) = cpu_to_be32(dev << (8 + 3));
+ *(p++) = 0;
+ *(p++) = 0;
+
+ /* PCI interrupt portion */
+ *(p++) = cpu_to_be32(irq + 1);
+
+ /* Parent phandle */
+ *(p++) = cpu_to_be32(lstate->int_parent[new_irq]);
+
+ /* Parent desc */
+ for (i = 0; i < lstate->int_size; i++)
+ *(p++) = cpu_to_be32(lstate->int_val[new_irq][i]);
+ }
+ }
+
+ dt_add_property(np, "interrupt-map", map, map_size);
+ free(map);
+}
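+
+/*
+ * Layout of one interrupt-map entry built above, shown for the common
+ * case of a 1-cell PIC interrupt (lstate->int_size == 1) as an example:
+ *
+ *     < dev<<11  0  0 >  < pin 1..4 >  < PIC phandle >  < PIC irq >
+ *      3 cells PCI addr     1 cell         1 cell         1 cell
+ *
+ * The pin is swizzled as (irq + dev + swizzle) % 4 so INTA..INTD rotate
+ * with the device number, which is the standard PCI swizzle.
+ */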
+
+static void pci_add_loc_code(struct dt_node *np)
+{
+ struct dt_node *p;
+ const char *lcode = NULL;
+
+ for (p = np->parent; p; p = p->parent) {
+ /* prefer slot-label by default */
+ lcode = dt_prop_get_def(p, "ibm,slot-label", NULL);
+ if (lcode)
+ break;
+
+ /* otherwise use the fully qualified location code */
+ lcode = dt_prop_get_def(p, "ibm,slot-location-code", NULL);
+ if (lcode)
+ break;
+ }
+
+ if (!lcode)
+ lcode = dt_prop_get_def(np, "ibm,slot-location-code", NULL);
+
+ if (!lcode) {
+ /* Fall back to finding a ibm,loc-code */
+ for (p = np->parent; p; p = p->parent) {
+ lcode = dt_prop_get_def(p, "ibm,loc-code", NULL);
+ if (lcode)
+ break;
+ }
+ }
+
+ if (!lcode)
+ return;
+
+ dt_add_property_string(np, "ibm,loc-code", lcode);
+}
+
+static void pci_print_summary_line(struct phb *phb, struct pci_device *pd,
+ struct dt_node *np, u32 rev_class,
+ const char *cname)
+{
+ const char *label, *dtype, *s;
+#define MAX_SLOTSTR 80
+ char slotstr[MAX_SLOTSTR + 1] = { 0, };
+
+ /* If it's a slot, it has a slot-label */
+ label = dt_prop_get_def(np, "ibm,slot-label", NULL);
+ if (label) {
+ u32 lanes = dt_prop_get_u32_def(np, "ibm,slot-wired-lanes", 0);
+ static const char *lanestrs[] = {
+ "", " x1", " x2", " x4", " x8", "x16", "x32", "32b", "64b"
+ };
+ const char *lstr = lanes > PCI_SLOT_WIRED_LANES_PCIX_64 ? "" : lanestrs[lanes];
+ snprintf(slotstr, MAX_SLOTSTR, "SLOT=%3s %s", label, lstr);
+ /* XXX Add more slot info */
+ } else {
+ /*
+ * No label; ignore downstream switch legs and root complexes,
+ * those would essentially be non-populated
+ */
+ if (pd->dev_type != PCIE_TYPE_ROOT_PORT &&
+ pd->dev_type != PCIE_TYPE_SWITCH_DNPORT) {
+ /* It's a mere device, get loc code */
+ s = dt_prop_get_def(np, "ibm,loc-code", NULL);
+ if (s)
+ snprintf(slotstr, MAX_SLOTSTR, "LOC_CODE=%s", s);
+ }
+ }
+
+ if (pci_has_cap(pd, PCI_CFG_CAP_ID_EXP, false)) {
+ static const char *pcie_types[] = {
+ "EP ", "LGCY", "????", "????", "ROOT", "SWUP", "SWDN",
+ "ETOX", "XTOE", "RINT", "EVTC" };
+ if (pd->dev_type >= ARRAY_SIZE(pcie_types))
+ dtype = "????";
+ else
+ dtype = pcie_types[pd->dev_type];
+ } else
+ dtype = pd->is_bridge ? "PCIB" : "PCID";
+
+ if (pd->is_bridge)
+ PCINOTICE(phb, pd->bdfn,
+ "[%s] %04x %04x R:%02x C:%06x B:%02x..%02x %s\n",
+ dtype, PCI_VENDOR_ID(pd->vdid),
+ PCI_DEVICE_ID(pd->vdid),
+ rev_class & 0xff, rev_class >> 8, pd->secondary_bus,
+ pd->subordinate_bus, slotstr);
+ else
+ PCINOTICE(phb, pd->bdfn,
+ "[%s] %04x %04x R:%02x C:%06x (%14s) %s\n",
+ dtype, PCI_VENDOR_ID(pd->vdid),
+ PCI_DEVICE_ID(pd->vdid),
+ rev_class & 0xff, rev_class >> 8, cname, slotstr);
+}
+
+static void __noinline pci_add_one_device_node(struct phb *phb,
+ struct pci_device *pd,
+ struct dt_node *parent_node,
+ struct pci_lsi_state *lstate,
+ uint8_t swizzle)
+{
+ struct dt_node *np;
+ const char *cname;
+#define MAX_NAME 256
+ char name[MAX_NAME];
+ char compat[MAX_NAME];
+ uint32_t rev_class;
+ uint8_t intpin;
+ bool is_pcie;
+
+ pci_cfg_read32(phb, pd->bdfn, PCI_CFG_REV_ID, &rev_class);
+ pci_cfg_read8(phb, pd->bdfn, PCI_CFG_INT_PIN, &intpin);
+ is_pcie = pci_has_cap(pd, PCI_CFG_CAP_ID_EXP, false);
+
+ /*
+ * Some IBM PHBs (p7ioc?) have an invalid PCI class code. Linux
+ * prefers to read the class code from the DT rather than
+ * re-reading config space, so we can hack around it here.
+ */
+ if (is_pcie && pd->dev_type == PCIE_TYPE_ROOT_PORT)
+ rev_class = (rev_class & 0xff) | 0x6040000;
+ cname = pci_class_name(rev_class >> 8);
+
+ if (PCI_FUNC(pd->bdfn))
+ snprintf(name, MAX_NAME - 1, "%s@%x,%x",
+ cname, PCI_DEV(pd->bdfn), PCI_FUNC(pd->bdfn));
+ else
+ snprintf(name, MAX_NAME - 1, "%s@%x",
+ cname, PCI_DEV(pd->bdfn));
+ pd->dn = np = dt_new(parent_node, name);
+
+ /*
+ * NB: ibm,pci-config-space-type is the PAPR way of indicating the
+ * device has a 4KB config space. It's got nothing to do with the
+ * standard Type 0/1 config spaces defined by PCI.
+ */
+ if (is_pcie || phb->phb_type == phb_type_npu_v2_opencapi) {
+ snprintf(compat, MAX_NAME, "pciex%x,%x",
+ PCI_VENDOR_ID(pd->vdid), PCI_DEVICE_ID(pd->vdid));
+ dt_add_property_cells(np, "ibm,pci-config-space-type", 1);
+ } else {
+ snprintf(compat, MAX_NAME, "pci%x,%x",
+ PCI_VENDOR_ID(pd->vdid), PCI_DEVICE_ID(pd->vdid));
+ dt_add_property_cells(np, "ibm,pci-config-space-type", 0);
+ }
+ dt_add_property_cells(np, "class-code", rev_class >> 8);
+ dt_add_property_cells(np, "revision-id", rev_class & 0xff);
+ dt_add_property_cells(np, "vendor-id", PCI_VENDOR_ID(pd->vdid));
+ dt_add_property_cells(np, "device-id", PCI_DEVICE_ID(pd->vdid));
+ if (intpin)
+ dt_add_property_cells(np, "interrupts", intpin);
+
+ pci_handle_quirk(phb, pd);
+
+ /* XXX FIXME: Add a few missing ones such as
+ *
+ * - devsel-speed (!express)
+ * - max-latency
+ * - min-grant
+ * - subsystem-id
+ * - subsystem-vendor-id
+ * - ...
+ */
+
+ /* Add slot properties if needed and iff this is a bridge */
+ if (pd->slot)
+ pci_slot_add_dt_properties(pd->slot, np);
+
+ /*
+ * Use the phb base location code for root ports if the platform
+ * doesn't provide one via slot->add_properties() operation.
+ */
+ if (pd->dev_type == PCIE_TYPE_ROOT_PORT && phb->base_loc_code &&
+ !dt_has_node_property(np, "ibm,slot-location-code", NULL))
+ dt_add_property_string(np, "ibm,slot-location-code",
+ phb->base_loc_code);
+
+ /* Make up location code */
+ if (platform.pci_add_loc_code)
+ platform.pci_add_loc_code(np, pd);
+ else
+ pci_add_loc_code(np);
+
+ /* XXX FIXME: We don't look for BARs, we only put the config space
+ * entry in the "reg" property. That's enough for Linux and we might
+ * even want to make this legit in future ePAPR
+ */
+ dt_add_property_cells(np, "reg", pd->bdfn << 8, 0, 0, 0, 0);
+
+ /* Print summary info about the device */
+ pci_print_summary_line(phb, pd, np, rev_class, cname);
+ if (!pd->is_bridge)
+ return;
+
+ dt_add_property_cells(np, "#address-cells", 3);
+ dt_add_property_cells(np, "#size-cells", 2);
+ dt_add_property_cells(np, "#interrupt-cells", 1);
+
+ /* We want "device_type" for bridges */
+ if (is_pcie)
+ dt_add_property_string(np, "device_type", "pciex");
+ else
+ dt_add_property_string(np, "device_type", "pci");
+
+ /* Update the current interrupt swizzling level based on our own
+ * device number
+ */
+ swizzle = (swizzle + PCI_DEV(pd->bdfn)) & 3;
+
+ /* We generate a standard-swizzling interrupt map. This is pretty
+ * big, we *could* try to be smarter for things that aren't hotplug
+ * slots at least and only populate those entries for which there's
+ * an actual children (especially on PCI Express), but for now that
+ * will do
+ */
+ pci_std_swizzle_irq_map(np, pd, lstate, swizzle);
+
+ /* Parts of the OF address translation in the kernel will fail to
+ * correctly translate a PCI address if translating a 1:1 mapping
+ * (ie. an empty ranges property).
+ * Instead add a ranges property that explicitly translates 1:1.
+ */
+ dt_add_property_cells(np, "ranges",
+ /* 64-bit direct mapping. We know the bridges
+ * don't cover the entire address space so
+ * use 0xf00... as a good compromise. */
+ 0x02000000, 0x0, 0x0,
+ 0x02000000, 0x0, 0x0,
+ 0xf0000000, 0x0);
+}
+
+void __noinline pci_add_device_nodes(struct phb *phb,
+ struct list_head *list,
+ struct dt_node *parent_node,
+ struct pci_lsi_state *lstate,
+ uint8_t swizzle)
+{
+ struct pci_device *pd;
+
+ /* Add all child devices */
+ list_for_each(list, pd, link) {
+ pci_add_one_device_node(phb, pd, parent_node,
+ lstate, swizzle);
+ if (list_empty(&pd->children))
+ continue;
+
+ pci_add_device_nodes(phb, &pd->children,
+ pd->dn, lstate, swizzle);
+ }
+}
+
+static void pci_do_jobs(void (*fn)(void *))
+{
+ struct cpu_job **jobs;
+ int i;
+
+ jobs = zalloc(sizeof(struct cpu_job *) * ARRAY_SIZE(phbs));
+ assert(jobs);
+ for (i = 0; i < ARRAY_SIZE(phbs); i++) {
+ if (!phbs[i]) {
+ jobs[i] = NULL;
+ continue;
+ }
+
+ jobs[i] = __cpu_queue_job(NULL, phbs[i]->dt_node->name,
+ fn, phbs[i], false);
+ assert(jobs[i]);
+
+ }
+
+ /* If no secondary CPUs, do everything sync */
+ cpu_process_local_jobs();
+
+ /* Wait until all tasks are done */
+ for (i = 0; i < ARRAY_SIZE(phbs); i++) {
+ if (!jobs[i])
+ continue;
+
+ cpu_wait_job(jobs[i], true);
+ }
+ free(jobs);
+}
+
+static void __pci_init_slots(void)
+{
+ unsigned int i;
+
+ /* Some PHBs may need a short debounce delay for the presence detect
+  * signal after HW initialization.
+ */
+ for (i = 0; i < ARRAY_SIZE(phbs); i++) {
+ if (phbs[i]) {
+ time_wait_ms(20);
+ break;
+ }
+ }
+
+ if (platform.pre_pci_fixup)
+ platform.pre_pci_fixup();
+
+ prlog(PR_NOTICE, "PCI: Resetting PHBs and training links...\n");
+ pci_do_jobs(pci_reset_phb);
+
+ prlog(PR_NOTICE, "PCI: Probing slots...\n");
+ pci_do_jobs(pci_scan_phb);
+
+ if (platform.pci_probe_complete)
+ platform.pci_probe_complete();
+
+ prlog(PR_NOTICE, "PCI Summary:\n");
+
+ for (i = 0; i < ARRAY_SIZE(phbs); i++) {
+ if (!phbs[i])
+ continue;
+
+ pci_add_device_nodes(phbs[i], &phbs[i]->devices,
+ phbs[i]->dt_node, &phbs[i]->lstate, 0);
+ }
+
+ /* PHB final fixup */
+ for (i = 0; i < ARRAY_SIZE(phbs); i++) {
+ if (!phbs[i] || !phbs[i]->ops || !phbs[i]->ops->phb_final_fixup)
+ continue;
+
+ phbs[i]->ops->phb_final_fixup(phbs[i]);
+ }
+}
+
+static void __pci_reset(struct list_head *list)
+{
+ struct pci_device *pd;
+ struct pci_cfg_reg_filter *pcrf;
+ int i;
+
+ while ((pd = list_pop(list, struct pci_device, link)) != NULL) {
+ __pci_reset(&pd->children);
+ dt_free(pd->dn);
+ free(pd->slot);
+ while ((pcrf = list_pop(&pd->pcrf, struct pci_cfg_reg_filter, link)) != NULL) {
+ free(pcrf);
+ }
+ for (i = 0; i < 64; i++)
+ if (pd->cap[i].free_func)
+ pd->cap[i].free_func(pd->cap[i].data);
+ free(pd);
+ }
+}
+
+int64_t pci_reset(void)
+{
+ unsigned int i;
+
+ prlog(PR_NOTICE, "PCI: Clearing all devices...\n");
+
+ for (i = 0; i < ARRAY_SIZE(phbs); i++) {
+ struct phb *phb = phbs[i];
+ if (!phb)
+ continue;
+ __pci_reset(&phb->devices);
+
+ pci_slot_set_state(phb->slot, PCI_SLOT_STATE_CRESET_START);
+ }
+
+ /* Do init and discovery of PCI slots in parallel */
+ __pci_init_slots();
+
+ return 0;
+}
+
+void pci_init_slots(void)
+{
+ unsigned int i;
+
+ for (i = 0; i < ARRAY_SIZE(phbs); i++) {
+ struct phb *phb = phbs[i];
+ if (!phb)
+ continue;
+ pci_slot_set_state(phb->slot, PCI_SLOT_STATE_FRESET_POWER_OFF);
+ }
+ __pci_init_slots();
+}
+
+/*
+ * Complete iteration on current level before switching to
+ * child level, which is the proper order for restoring
+ * PCI bus range on bridges.
+ */
+static struct pci_device *__pci_walk_dev(struct phb *phb,
+ struct list_head *l,
+ int (*cb)(struct phb *,
+ struct pci_device *,
+ void *),
+ void *userdata)
+{
+ struct pci_device *pd, *child;
+
+ if (list_empty(l))
+ return NULL;
+
+ list_for_each(l, pd, link) {
+ if (cb && cb(phb, pd, userdata))
+ return pd;
+ }
+
+ list_for_each(l, pd, link) {
+ child = __pci_walk_dev(phb, &pd->children, cb, userdata);
+ if (child)
+ return child;
+ }
+
+ return NULL;
+}
+
+struct pci_device *pci_walk_dev(struct phb *phb,
+ struct pci_device *pd,
+ int (*cb)(struct phb *,
+ struct pci_device *,
+ void *),
+ void *userdata)
+{
+ if (pd)
+ return __pci_walk_dev(phb, &pd->children, cb, userdata);
+
+ return __pci_walk_dev(phb, &phb->devices, cb, userdata);
+}
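+
+/*
+ * Callback-based traversal sketch: the walk visits a whole level before
+ * descending, and a callback returning non-zero stops the walk with
+ * that device returned. A (hypothetical) device counter could look
+ * like:
+ *
+ *     static int count_cb(struct phb *phb __unused,
+ *                         struct pci_device *pd __unused, void *data)
+ *     {
+ *         (*(unsigned int *)data)++;
+ *         return 0;    // keep walking
+ *     }
+ *
+ *     unsigned int count = 0;
+ *     pci_walk_dev(phb, NULL, count_cb, &count);
+ */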
+
+static int __pci_find_dev(struct phb *phb,
+ struct pci_device *pd, void *userdata)
+{
+ uint16_t bdfn = *((uint16_t *)userdata);
+
+ if (!phb || !pd)
+ return 0;
+
+ if (pd->bdfn == bdfn)
+ return 1;
+
+ return 0;
+}
+
+struct pci_device *pci_find_dev(struct phb *phb, uint16_t bdfn)
+{
+ return pci_walk_dev(phb, NULL, __pci_find_dev, &bdfn);
+}
+
+static int __pci_restore_bridge_buses(struct phb *phb,
+ struct pci_device *pd,
+ void *data __unused)
+{
+ uint32_t vdid;
+
+ /* If the device is behind a switch, wait for the switch */
+ if (!pd->is_vf && !(pd->bdfn & 7) && pd->parent != NULL &&
+ pd->parent->dev_type == PCIE_TYPE_SWITCH_DNPORT) {
+ if (!pci_bridge_wait_link(phb, pd->parent, true)) {
+ PCIERR(phb, pd->bdfn, "Timeout waiting for switch\n");
+ return -1;
+ }
+ }
+
+ /* Wait for config space to stop returning CRS */
+ if (!pci_wait_crs(phb, pd->bdfn, &vdid))
+ return -1;
+
+ /* Make all devices below a bridge "re-capture" the bdfn */
+ pci_cfg_write32(phb, pd->bdfn, PCI_CFG_VENDOR_ID, vdid);
+
+ if (!pd->is_bridge)
+ return 0;
+
+ pci_cfg_write8(phb, pd->bdfn, PCI_CFG_PRIMARY_BUS,
+ pd->primary_bus);
+ pci_cfg_write8(phb, pd->bdfn, PCI_CFG_SECONDARY_BUS,
+ pd->secondary_bus);
+ pci_cfg_write8(phb, pd->bdfn, PCI_CFG_SUBORDINATE_BUS,
+ pd->subordinate_bus);
+ return 0;
+}
+
+void pci_restore_bridge_buses(struct phb *phb, struct pci_device *pd)
+{
+ pci_walk_dev(phb, pd, __pci_restore_bridge_buses, NULL);
+}
+
+void pci_restore_slot_bus_configs(struct pci_slot *slot)
+{
+ /*
+ * We might lose the bus numbers during the reset operation
+ * and we need to restore them. Otherwise, some adapters (e.g.
+ * IPR) can't be probed properly by the kernel. We don't need
+ * to restore bus numbers for every kind of reset, however,
+ * it's not harmful to always restore the bus numbers, which
+ * simplifies the logic.
+ */
+ pci_restore_bridge_buses(slot->phb, slot->pd);
+ if (slot->phb->ops->device_init)
+ pci_walk_dev(slot->phb, slot->pd,
+ slot->phb->ops->device_init, NULL);
+}
+
+struct pci_cfg_reg_filter *pci_find_cfg_reg_filter(struct pci_device *pd,
+ uint32_t start, uint32_t len)
+{
+ struct pci_cfg_reg_filter *pcrf;
+
+ /* Check on the cached range, which contains holes */
+ if ((start + len) <= pd->pcrf_start ||
+ pd->pcrf_end <= start)
+ return NULL;
+
+ list_for_each(&pd->pcrf, pcrf, link) {
+ if (start >= pcrf->start &&
+ (start + len) <= (pcrf->start + pcrf->len))
+ return pcrf;
+ }
+
+ return NULL;
+}
+
+static bool pci_device_has_cfg_reg_filters(struct phb *phb, uint16_t bdfn)
+{
+ return bitmap_tst_bit(*phb->filter_map, bdfn);
+}
+
+int64_t pci_handle_cfg_filters(struct phb *phb, uint32_t bdfn,
+ uint32_t offset, uint32_t len,
+ uint32_t *data, bool write)
+{
+ struct pci_device *pd;
+ struct pci_cfg_reg_filter *pcrf;
+ uint32_t flags;
+
+ if (!pci_device_has_cfg_reg_filters(phb, bdfn))
+ return OPAL_PARTIAL;
+ pd = pci_find_dev(phb, bdfn);
+ pcrf = pd ? pci_find_cfg_reg_filter(pd, offset, len) : NULL;
+ if (!pcrf || !pcrf->func)
+ return OPAL_PARTIAL;
+
+ flags = write ? PCI_REG_FLAG_WRITE : PCI_REG_FLAG_READ;
+ if ((pcrf->flags & flags) != flags)
+ return OPAL_PARTIAL;
+
+ return pcrf->func(pd, pcrf, offset, len, data, write);
+}
+
+struct pci_cfg_reg_filter *pci_add_cfg_reg_filter(struct pci_device *pd,
+ uint32_t start, uint32_t len,
+ uint32_t flags,
+ pci_cfg_reg_func func)
+{
+ struct pci_cfg_reg_filter *pcrf;
+
+ pcrf = pci_find_cfg_reg_filter(pd, start, len);
+ if (pcrf)
+ return pcrf;
+
+ pcrf = zalloc(sizeof(*pcrf) + ((len + 0x4) & ~0x3));
+ if (!pcrf)
+ return NULL;
+
+ /* Don't validate the flags so that the private flags
+ * can be supported for debugging purposes.
+ */
+ pcrf->flags = flags;
+ pcrf->start = start;
+ pcrf->len = len;
+ pcrf->func = func;
+ pcrf->data = (uint8_t *)(pcrf + 1);
+
+ if (start < pd->pcrf_start)
+ pd->pcrf_start = start;
+ if (pd->pcrf_end < (start + len))
+ pd->pcrf_end = start + len;
+ list_add_tail(&pd->pcrf, &pcrf->link);
+ bitmap_set_bit(*pd->phb->filter_map, pd->bdfn);
+
+ return pcrf;
+}
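+
+/*
+ * Registration sketch (my_cfg_handler is a hypothetical function with
+ * the pci_cfg_reg_func signature used above): trap 4 bytes of config
+ * space at a given offset for both reads and writes.
+ *
+ *     pcrf = pci_add_cfg_reg_filter(pd, offset, 4,
+ *                                   PCI_REG_FLAG_READ | PCI_REG_FLAG_WRITE,
+ *                                   my_cfg_handler);
+ *
+ * Config accesses then go through pci_handle_cfg_filters(), which calls
+ * my_cfg_handler(pd, pcrf, offset, len, data, write) when the filter
+ * matches and returns OPAL_PARTIAL when it doesn't, so the caller knows
+ * to fall back to real config space.
+ */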
diff --git a/roms/skiboot/core/pcie-slot.c b/roms/skiboot/core/pcie-slot.c
new file mode 100644
index 000000000..03326e58f
--- /dev/null
+++ b/roms/skiboot/core/pcie-slot.c
@@ -0,0 +1,566 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * PCIe Slots
+ *
+ * Copyright 2013-2019 IBM Corp.
+ */
+
+#include <skiboot.h>
+#include <opal-msg.h>
+#include <pci-cfg.h>
+#include <pci.h>
+#include <pci-slot.h>
+
+/* Debugging options */
+#define PCIE_SLOT_PREFIX "PCIE-SLOT-%016llx "
+#define PCIE_SLOT_DBG(s, fmt, a...) \
+ prlog(PR_DEBUG, PCIE_SLOT_PREFIX fmt, (s)->id, ##a)
+
+static int64_t pcie_slot_get_presence_state(struct pci_slot *slot, uint8_t *val)
+{
+ struct phb *phb = slot->phb;
+ struct pci_device *pd = slot->pd;
+ uint32_t ecap;
+ uint16_t state;
+
+ /* The presence is always on if it's a switch upstream port */
+ if (pd->dev_type == PCIE_TYPE_SWITCH_UPPORT) {
+ *val = OPAL_PCI_SLOT_PRESENT;
+ return OPAL_SUCCESS;
+ }
+
+ /*
+ * Presence is always reported if a switch downstream port
+ * doesn't implement the slot capability, per the PCIe spec.
+ */
+ if (pd->dev_type == PCIE_TYPE_SWITCH_DNPORT &&
+ !(slot->pcie_cap & PCICAP_EXP_CAP_SLOT)) {
+ *val = OPAL_PCI_SLOT_PRESENT;
+ return OPAL_SUCCESS;
+ }
+
+ /* Retrieve presence status */
+ ecap = pci_cap(pd, PCI_CFG_CAP_ID_EXP, false);
+ pci_cfg_read16(phb, pd->bdfn, ecap + PCICAP_EXP_SLOTSTAT, &state);
+ if (state & PCICAP_EXP_SLOTSTAT_PDETECTST)
+ *val = OPAL_PCI_SLOT_PRESENT;
+ else
+ *val = OPAL_PCI_SLOT_EMPTY;
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t pcie_slot_get_link_state(struct pci_slot *slot,
+ uint8_t *val)
+{
+ struct phb *phb = slot->phb;
+ struct pci_device *pd = slot->pd;
+ uint32_t ecap;
+ int16_t state;
+
+ /*
+ * The link behind a switch upstream port is always on
+ * since it doesn't have a valid link indicator.
+ */
+ if (pd->dev_type == PCIE_TYPE_SWITCH_UPPORT) {
+ *val = 1;
+ return OPAL_SUCCESS;
+ }
+
+ /* Retrieve link width */
+ ecap = pci_cap(pd, PCI_CFG_CAP_ID_EXP, false);
+ pci_cfg_read16(phb, pd->bdfn, ecap + PCICAP_EXP_LSTAT, &state);
+ if (state & PCICAP_EXP_LSTAT_DLLL_ACT)
+ *val = ((state & PCICAP_EXP_LSTAT_WIDTH) >> 4);
+ else
+ *val = 0;
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t pcie_slot_get_power_state(struct pci_slot *slot __unused,
+ uint8_t *val)
+{
+ /* Return the cached power state, which matches the PCI slot
+  * hotplug state (added/removed). Otherwise, the OS would see
+  * mismatched states and the adapter behind the slot couldn't
+  * be probed successfully on a hot-add request; for instance,
+  * the OS could see power-off while the slot is actually on
+  * in hardware.
+ */
+ *val = slot->power_state;
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t pcie_slot_get_attention_state(struct pci_slot *slot,
+ uint8_t *val)
+{
+ struct phb *phb = slot->phb;
+ struct pci_device *pd = slot->pd;
+ uint32_t ecap;
+ uint16_t state;
+
+ /* Attention is off if the capability is missing */
+ if (!(slot->slot_cap & PCICAP_EXP_SLOTCAP_ATTNI)) {
+ *val = 0;
+ return OPAL_SUCCESS;
+ }
+
+ /* Retrieve attention state */
+ ecap = pci_cap(pd, PCI_CFG_CAP_ID_EXP, false);
+ pci_cfg_read16(phb, pd->bdfn, ecap + PCICAP_EXP_SLOTCTL, &state);
+ state = (state & PCICAP_EXP_SLOTCTL_ATTNI) >> 6;
+ switch (state) {
+ case PCIE_INDIC_ON:
+ *val = PCI_SLOT_ATTN_LED_ON;
+ break;
+ case PCIE_INDIC_BLINK:
+ *val = PCI_SLOT_ATTN_LED_BLINK;
+ break;
+ case PCIE_INDIC_OFF:
+ default:
+ *val = PCI_SLOT_ATTN_LED_OFF;
+ }
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t pcie_slot_get_latch_state(struct pci_slot *slot,
+ uint8_t *val)
+{
+ struct phb *phb = slot->phb;
+ struct pci_device *pd = slot->pd;
+ uint32_t ecap;
+ uint16_t state;
+
+ /* Latch is off if MRL sensor doesn't exist */
+ if (!(slot->slot_cap & PCICAP_EXP_SLOTCAP_MRLSENS)) {
+ *val = 0;
+ return OPAL_SUCCESS;
+ }
+
+ /* Retrieve MRL sensor state */
+ ecap = pci_cap(pd, PCI_CFG_CAP_ID_EXP, false);
+ pci_cfg_read16(phb, pd->bdfn, ecap + PCICAP_EXP_SLOTSTAT, &state);
+ if (state & PCICAP_EXP_SLOTSTAT_MRLSENSST)
+ *val = 1;
+ else
+ *val = 0;
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t pcie_slot_set_attention_state(struct pci_slot *slot,
+ uint8_t val)
+{
+ struct phb *phb = slot->phb;
+ struct pci_device *pd = slot->pd;
+ uint32_t ecap;
+ uint16_t state;
+
+ /* Drop the request if functionality doesn't exist */
+ if (!(slot->slot_cap & PCICAP_EXP_SLOTCAP_ATTNI))
+ return OPAL_SUCCESS;
+
+ /* Update with the requested state */
+ ecap = pci_cap(pd, PCI_CFG_CAP_ID_EXP, false);
+ pci_cfg_read16(phb, pd->bdfn, ecap + PCICAP_EXP_SLOTCTL, &state);
+ state &= ~PCICAP_EXP_SLOTCTL_ATTNI;
+ switch (val) {
+ case PCI_SLOT_ATTN_LED_ON:
+ state |= (PCIE_INDIC_ON << 6);
+ break;
+ case PCI_SLOT_ATTN_LED_BLINK:
+ state |= (PCIE_INDIC_BLINK << 6);
+ break;
+ case PCI_SLOT_ATTN_LED_OFF:
+ state |= (PCIE_INDIC_OFF << 6);
+ break;
+ default:
+ prlog(PR_ERR, PCIE_SLOT_PREFIX
+ "Invalid attention state (0x%x)\n", slot->id, val);
+ return OPAL_PARAMETER;
+ }
+
+ pci_cfg_write16(phb, pd->bdfn, ecap + PCICAP_EXP_SLOTCTL, state);
+ return OPAL_SUCCESS;
+}
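+
+/*
+ * Note on the << 6 shifts above (and the << 8 used for power below):
+ * the attention indicator control field sits in bits 7:6 of SLOTCTL
+ * and the power indicator control field in bits 9:8, so for example:
+ *
+ *     state |= (PCIE_INDIC_ON << 6);    // attention LED on
+ *     state |= (PCIE_INDIC_ON << 8);    // power LED on
+ */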
+
+static int64_t pcie_slot_set_power_state_ext(struct pci_slot *slot, uint8_t val,
+ bool surprise_check)
+{
+ struct phb *phb = slot->phb;
+ struct pci_device *pd = slot->pd;
+ uint32_t ecap;
+ uint16_t state;
+
+ if (slot->power_state == val)
+ return OPAL_SUCCESS;
+
+ /* Update the power state and return immediately if the power
+ * control functionality isn't supported on the PCI slot.
+ */
+ if (!(slot->slot_cap & PCICAP_EXP_SLOTCAP_PWCTRL)) {
+ slot->power_state = val;
+ return OPAL_SUCCESS;
+ }
+
+ /*
+ * Surprise hotpluggable slots need to be handled with care since
+ * many systems do not implement the presence detect side-band
+ * signal. Instead, they rely on in-band presence to report the
+ * existence of a hotplugged card.
+ *
+ * This is problematic because:
+ * a) When PERST is asserted in-band presence doesn't work, and
+ * b) Switches assert PERST as a part of the "slot power down" sequence
+ *
+ * To work around the problem we leave the slot physically powered on
+ * and exit early here. This way when a new card is inserted, the switch
+ * will raise an interrupt due to the PresDet status changing.
+ */
+ if (surprise_check && slot->surprise_pluggable) {
+ slot->power_state = val;
+ if (val == PCI_SLOT_POWER_OFF)
+ return OPAL_SUCCESS;
+
+ /*
+ * Some systems have the slot power disabled by default
+ * so we always perform the power-on step. This is not
+ * *strictly* required, but it's probably a good idea.
+ */
+ }
+
+ pci_slot_set_state(slot, PCI_SLOT_STATE_SPOWER_START);
+ slot->power_state = val;
+ ecap = pci_cap(pd, PCI_CFG_CAP_ID_EXP, false);
+ pci_cfg_read16(phb, pd->bdfn, ecap + PCICAP_EXP_SLOTCTL, &state);
+ state &= ~(PCICAP_EXP_SLOTCTL_PWRCTLR | PCICAP_EXP_SLOTCTL_PWRI);
+ switch (val) {
+ case PCI_SLOT_POWER_OFF:
+ state |= (PCICAP_EXP_SLOTCTL_PWRCTLR | (PCIE_INDIC_OFF << 8));
+ break;
+ case PCI_SLOT_POWER_ON:
+ state |= (PCIE_INDIC_ON << 8);
+ break;
+ default:
+ pci_slot_set_state(slot, PCI_SLOT_STATE_NORMAL);
+ prlog(PR_ERR, PCIE_SLOT_PREFIX
+ "Invalid power state (0x%x)\n", slot->id, val);
+ return OPAL_PARAMETER;
+ }
+
+ pci_cfg_write16(phb, pd->bdfn, ecap + PCICAP_EXP_SLOTCTL, state);
+ pci_slot_set_state(slot, PCI_SLOT_STATE_SPOWER_DONE);
+
+ return OPAL_ASYNC_COMPLETION;
+}
+
+static int64_t pcie_slot_set_power_state(struct pci_slot *slot, uint8_t val)
+{
+ return pcie_slot_set_power_state_ext(slot, val, true);
+}
+
+static int64_t pcie_slot_sm_poll_link(struct pci_slot *slot)
+{
+ struct phb *phb = slot->phb;
+ struct pci_device *pd = slot->pd;
+ uint32_t ecap = pci_cap(pd, PCI_CFG_CAP_ID_EXP, false);
+ uint16_t val;
+ uint8_t presence = 0;
+
+ switch (slot->state) {
+ case PCI_SLOT_STATE_LINK_START_POLL:
+ PCIE_SLOT_DBG(slot, "LINK: Start polling\n");
+
+ /* The link stays down forever if no device is attached */
+ if (slot->ops.get_presence_state)
+ slot->ops.get_presence_state(slot, &presence);
+ if (!presence) {
+ PCIE_SLOT_DBG(slot, "LINK: No adapter, end polling\n");
+ pci_slot_set_state(slot, PCI_SLOT_STATE_NORMAL);
+ return OPAL_SUCCESS;
+ }
+
+ /* Enable the link without check */
+ pci_cfg_read16(phb, pd->bdfn, ecap + PCICAP_EXP_LCTL, &val);
+ val &= ~PCICAP_EXP_LCTL_LINK_DIS;
+ pci_cfg_write16(phb, pd->bdfn, ecap + PCICAP_EXP_LCTL, val);
+
+ /*
+ * If link state change reporting isn't supported, we expect
+ * the link to be up and stabilized after one second.
+ */
+ if (!(slot->link_cap & PCICAP_EXP_LCAP_DL_ACT_REP)) {
+ pci_slot_set_state(slot,
+ PCI_SLOT_STATE_LINK_DELAY_FINALIZED);
+ return pci_slot_set_sm_timeout(slot, secs_to_tb(1));
+ }
+
+ /*
+ * Poll the link state if link state change reporting is
+ * supported on the link.
+ */
+ pci_slot_set_state(slot, PCI_SLOT_STATE_LINK_POLLING);
+ slot->retries = 250;
+ return pci_slot_set_sm_timeout(slot, msecs_to_tb(20));
+ case PCI_SLOT_STATE_LINK_DELAY_FINALIZED:
+ PCIE_SLOT_DBG(slot, "LINK: No link report, end polling\n");
+ if (slot->ops.prepare_link_change)
+ slot->ops.prepare_link_change(slot, true);
+ pci_slot_set_state(slot, PCI_SLOT_STATE_NORMAL);
+ return OPAL_SUCCESS;
+ case PCI_SLOT_STATE_LINK_POLLING:
+ pci_cfg_read16(phb, pd->bdfn, ecap + PCICAP_EXP_LSTAT, &val);
+ if (val & PCICAP_EXP_LSTAT_DLLL_ACT) {
+ PCIE_SLOT_DBG(slot, "LINK: Link is up, end polling\n");
+ if (slot->ops.prepare_link_change)
+ slot->ops.prepare_link_change(slot, true);
+ pci_slot_set_state(slot, PCI_SLOT_STATE_NORMAL);
+ return OPAL_SUCCESS;
+ }
+
+ /* Check link state again until timeout */
+ if (slot->retries-- == 0) {
+ prlog(PR_ERR, PCIE_SLOT_PREFIX
+ "LINK: Timeout waiting for up (%04x)\n",
+ slot->id, val);
+ pci_slot_set_state(slot, PCI_SLOT_STATE_NORMAL);
+ return OPAL_SUCCESS;
+ }
+
+ return pci_slot_set_sm_timeout(slot, msecs_to_tb(20));
+ default:
+ prlog(PR_ERR, PCIE_SLOT_PREFIX
+ "Link: Unexpected slot state %08x\n",
+ slot->id, slot->state);
+ }
+
+ pci_slot_set_state(slot, PCI_SLOT_STATE_NORMAL);
+ return OPAL_HARDWARE;
+}
+
+static void pcie_slot_reset(struct pci_slot *slot, bool assert)
+{
+ struct phb *phb = slot->phb;
+ struct pci_device *pd = slot->pd;
+ uint16_t ctl;
+
+ pci_cfg_read16(phb, pd->bdfn, PCI_CFG_BRCTL, &ctl);
+ if (assert)
+ ctl |= PCI_CFG_BRCTL_SECONDARY_RESET;
+ else
+ ctl &= ~PCI_CFG_BRCTL_SECONDARY_RESET;
+ pci_cfg_write16(phb, pd->bdfn, PCI_CFG_BRCTL, ctl);
+}
+
+static int64_t pcie_slot_sm_hreset(struct pci_slot *slot)
+{
+ switch (slot->state) {
+ case PCI_SLOT_STATE_NORMAL:
+ PCIE_SLOT_DBG(slot, "HRESET: Starts\n");
+ if (slot->ops.prepare_link_change) {
+ PCIE_SLOT_DBG(slot, "HRESET: Prepare for link down\n");
+ slot->ops.prepare_link_change(slot, false);
+ }
+ /* fall through */
+ case PCI_SLOT_STATE_HRESET_START:
+ PCIE_SLOT_DBG(slot, "HRESET: Assert\n");
+ pcie_slot_reset(slot, true);
+ pci_slot_set_state(slot, PCI_SLOT_STATE_HRESET_HOLD);
+ return pci_slot_set_sm_timeout(slot, msecs_to_tb(250));
+ case PCI_SLOT_STATE_HRESET_HOLD:
+ PCIE_SLOT_DBG(slot, "HRESET: Deassert\n");
+ pcie_slot_reset(slot, false);
+ pci_slot_set_state(slot, PCI_SLOT_STATE_LINK_START_POLL);
+ return pci_slot_set_sm_timeout(slot, msecs_to_tb(1800));
+ default:
+ PCIE_SLOT_DBG(slot, "HRESET: Unexpected slot state %08x\n",
+ slot->state);
+ }
+
+ pci_slot_set_state(slot, PCI_SLOT_STATE_NORMAL);
+ return OPAL_HARDWARE;
+}
+
+/*
+ * Usually, individual platforms need to override the power
+ * management methods for fundamental reset, but the hot
+ * reset method is commonly shared.
+ */
+static int64_t pcie_slot_sm_freset(struct pci_slot *slot)
+{
+ uint8_t power_state = PCI_SLOT_POWER_ON;
+
+ switch (slot->state) {
+ case PCI_SLOT_STATE_NORMAL:
+ PCIE_SLOT_DBG(slot, "FRESET: Starts\n");
+ if (slot->ops.prepare_link_change)
+ slot->ops.prepare_link_change(slot, false);
+
+ /* Retrieve power state */
+ if (slot->ops.get_power_state) {
+ PCIE_SLOT_DBG(slot, "FRESET: Retrieve power state\n");
+ slot->ops.get_power_state(slot, &power_state);
+ }
+
+ /* In power on state, power it off */
+ if (power_state == PCI_SLOT_POWER_ON) {
+ PCIE_SLOT_DBG(slot, "FRESET: Power is on, turn off\n");
+ pcie_slot_set_power_state_ext(slot,
+ PCI_SLOT_POWER_OFF, false);
+ pci_slot_set_state(slot,
+ PCI_SLOT_STATE_FRESET_POWER_OFF);
+ return pci_slot_set_sm_timeout(slot, msecs_to_tb(50));
+ }
+		/* Power is already off, no state change needed */
+ /* fallthrough */
+ case PCI_SLOT_STATE_FRESET_POWER_OFF:
+ PCIE_SLOT_DBG(slot, "FRESET: Power is off, turn on\n");
+ pcie_slot_set_power_state_ext(slot, PCI_SLOT_POWER_ON, false);
+
+ pci_slot_set_state(slot, PCI_SLOT_STATE_LINK_START_POLL);
+ return pci_slot_set_sm_timeout(slot, msecs_to_tb(50));
+ default:
+ prlog(PR_ERR, PCIE_SLOT_PREFIX
+ "FRESET: Unexpected slot state %08x\n",
+ slot->id, slot->state);
+ }
+
+ pci_slot_set_state(slot, PCI_SLOT_STATE_NORMAL);
+ return OPAL_HARDWARE;
+}
+
+struct pci_slot *pcie_slot_create(struct phb *phb, struct pci_device *pd)
+{
+ struct pci_slot *slot;
+ uint32_t ecap;
+ uint16_t slot_ctl;
+
+ /* Allocate PCI slot */
+ slot = pci_slot_alloc(phb, pd);
+ if (!slot)
+ return NULL;
+
+ /* Cache the link and slot capabilities */
+ ecap = pci_cap(pd, PCI_CFG_CAP_ID_EXP, false);
+ pci_cfg_read16(phb, pd->bdfn, ecap + PCICAP_EXP_CAPABILITY_REG,
+ &slot->pcie_cap);
+ pci_cfg_read32(phb, pd->bdfn, ecap + PCICAP_EXP_LCAP,
+ &slot->link_cap);
+
+ /* Leave PCI slot capability blank if PCI slot isn't supported */
+ if (slot->pcie_cap & PCICAP_EXP_CAP_SLOT)
+ pci_cfg_read32(phb, pd->bdfn, ecap + PCICAP_EXP_SLOTCAP,
+ &slot->slot_cap);
+ else
+ slot->slot_cap = 0;
+
+ if (slot->slot_cap & PCICAP_EXP_SLOTCAP_HPLUG_CAP)
+ slot->pluggable = 1;
+
+ /* Assume the slot is powered on by default */
+ slot->power_state = PCI_SLOT_POWER_ON;
+ if (slot->slot_cap & PCICAP_EXP_SLOTCAP_PWCTRL) {
+ slot->power_ctl = 1;
+
+ pci_cfg_read16(phb, pd->bdfn, ecap + PCICAP_EXP_SLOTCTL,
+ &slot_ctl);
+ if (slot_ctl & PCICAP_EXP_SLOTCTL_PWRCTLR)
+ slot->power_state = PCI_SLOT_POWER_OFF;
+ }
+
+ if (slot->slot_cap & PCICAP_EXP_SLOTCAP_PWRI)
+ slot->power_led_ctl = PCI_SLOT_PWR_LED_CTL_KERNEL;
+ if (slot->slot_cap & PCICAP_EXP_SLOTCAP_ATTNI)
+ slot->attn_led_ctl = PCI_SLOT_ATTN_LED_CTL_KERNEL;
+ slot->wired_lanes = ((slot->link_cap & PCICAP_EXP_LCAP_MAXWDTH) >> 4);
+
+ /* The surprise hotplug capability is claimed when it's supported
+ * in the slot's capability bits or link state change reporting is
+ * supported in PCIe link capability. It means the surprise hotplug
+ * relies on presence or link state change events. In order for the
+ * link state change event to be properly raised during surprise hot
+ * add/remove, the power supply to the slot should be always on.
+ *
+	 * For PCI slots that don't claim surprise hotplug capability
+	 * explicitly, the PDC (Presence Detection Change) event isn't
+	 * reliable, so we mark it as broken on them.
+ */
+ if (slot->pcie_cap & PCICAP_EXP_CAP_SLOT) {
+ if (slot->slot_cap & PCICAP_EXP_SLOTCAP_HPLUG_SURP) {
+ slot->surprise_pluggable = 1;
+ } else if (slot->link_cap & PCICAP_EXP_LCAP_DL_ACT_REP) {
+ slot->surprise_pluggable = 1;
+
+ pci_slot_add_flags(slot, PCI_SLOT_FLAG_BROKEN_PDC);
+ }
+ }
+
+ /* Standard slot operations */
+ slot->ops.get_presence_state = pcie_slot_get_presence_state;
+ slot->ops.get_link_state = pcie_slot_get_link_state;
+ slot->ops.get_power_state = pcie_slot_get_power_state;
+ slot->ops.get_attention_state = pcie_slot_get_attention_state;
+ slot->ops.get_latch_state = pcie_slot_get_latch_state;
+ slot->ops.set_power_state = pcie_slot_set_power_state;
+ slot->ops.set_attention_state = pcie_slot_set_attention_state;
+
+ /*
+	 * State machine (SM) based reset operations. The same poll
+	 * function is shared by all cases.
+ */
+ slot->ops.poll_link = pcie_slot_sm_poll_link;
+ slot->ops.hreset = pcie_slot_sm_hreset;
+ slot->ops.freset = pcie_slot_sm_freset;
+
+ slot->wired_lanes = PCI_SLOT_WIRED_LANES_UNKNOWN;
+ slot->connector_type = PCI_SLOT_CONNECTOR_PCIE_NS;
+ slot->card_desc = PCI_SLOT_DESC_NON_STANDARD;
+ slot->card_mech = PCI_SLOT_MECH_NONE;
+ slot->power_led_ctl = PCI_SLOT_PWR_LED_CTL_NONE;
+ slot->attn_led_ctl = PCI_SLOT_ATTN_LED_CTL_NONE;
+
+ return slot;
+}
+
+/* FIXME: this is kind of insane */
+struct pci_slot *pcie_slot_create_dynamic(struct phb *phb,
+ struct pci_device *pd)
+{
+ uint32_t ecap, val;
+ struct pci_slot *slot;
+
+ if (!phb || !pd || pd->slot)
+ return NULL;
+
+	/* Try to create a slot whose details aren't provided by the platform */
+ if (pd->dev_type != PCIE_TYPE_SWITCH_DNPORT)
+ return NULL;
+
+ ecap = pci_cap(pd, PCI_CFG_CAP_ID_EXP, false);
+ pci_cfg_read32(phb, pd->bdfn, ecap + PCICAP_EXP_SLOTCAP, &val);
+ if (!(val & PCICAP_EXP_SLOTCAP_HPLUG_CAP))
+ return NULL;
+
+ slot = pcie_slot_create(phb, pd);
+
+	/* On Supermicro's "p8dnu" platform, we create dynamic PCI slots
+	 * for all downstream ports of the PEX9733 that is connected to
+	 * the PHB direct slot. The power supply to the PCI slot is lost
+	 * after the PCI adapter is removed from it. The power supply
+	 * can't be turned on while the slot is empty, and it isn't
+	 * turned on automatically when a PCI adapter is inserted into
+	 * the slot at a later point. We set a flag on the slot here to
+	 * turn on the power supply in the (surprise or managed) hot-add
+	 * path.
+	 *
+	 * We have the same issue with the PEX8718 on the "p8dnu" platform.
+ if (dt_node_is_compatible(dt_root, "supermicro,p8dnu") && slot &&
+ slot->pd && (slot->pd->vdid == 0x973310b5 ||
+ slot->pd->vdid == 0x871810b5))
+ pci_slot_add_flags(slot, PCI_SLOT_FLAG_FORCE_POWERON);
+
+ return slot;
+}
diff --git a/roms/skiboot/core/pel.c b/roms/skiboot/core/pel.c
new file mode 100644
index 000000000..ec13e5590
--- /dev/null
+++ b/roms/skiboot/core/pel.c
@@ -0,0 +1,279 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Platform Error Log (PEL) generation
+ *
+ * Copyright 2014-2016 IBM Corp
+ */
+
+#include <string.h>
+#include <errorlog.h>
+#include <device.h>
+#include <fsp.h>
+#include <pel.h>
+#include <rtc.h>
+
+/* Create MTMS section for sapphire log */
+static void create_mtms_section(struct errorlog *elog_data,
+ char *pel_buffer, int *pel_offset)
+{
+ const struct dt_property *p;
+
+ struct opal_mtms_section *mtms = (struct opal_mtms_section *)
+ (pel_buffer + *pel_offset);
+
+ mtms->v6header.id = cpu_to_be16(ELOG_SID_MACHINE_TYPE);
+ mtms->v6header.length = cpu_to_be16(MTMS_SECTION_SIZE);
+ mtms->v6header.version = OPAL_EXT_HRD_VER;
+ mtms->v6header.subtype = 0;
+ mtms->v6header.component_id = cpu_to_be16(elog_data->component_id);
+
+ memset(mtms->model, 0x00, sizeof(mtms->model));
+ memcpy(mtms->model, dt_prop_get(dt_root, "model"), OPAL_SYS_MODEL_LEN);
+
+ memset(mtms->serial_no, 0x00, sizeof(mtms->serial_no));
+ p = dt_find_property(dt_root, "system-id");
+ if (p)
+ memcpy(mtms->serial_no, p->prop, OPAL_SYS_SERIAL_LEN);
+ else
+ memset(mtms->serial_no, 0, OPAL_SYS_SERIAL_LEN);
+
+ *pel_offset += MTMS_SECTION_SIZE;
+}
+
+/* Create extended header section */
+static void create_extended_header_section(struct errorlog *elog_data,
+ char *pel_buffer, int *pel_offset)
+{
+ const char *opalmodel = NULL;
+ const struct dt_property *p;
+ uint64_t extd_time;
+ uint32_t extd_date;
+
+ struct opal_extended_header_section *extdhdr =
+ (struct opal_extended_header_section *)
+ (pel_buffer + *pel_offset);
+
+ extdhdr->v6header.id = cpu_to_be16(ELOG_SID_EXTENDED_HEADER);
+ extdhdr->v6header.length = cpu_to_be16(EXTENDED_HEADER_SECTION_SIZE);
+ extdhdr->v6header.version = OPAL_EXT_HRD_VER;
+ extdhdr->v6header.subtype = 0;
+ extdhdr->v6header.component_id = cpu_to_be16(elog_data->component_id);
+
+ memset(extdhdr->model, 0x00, sizeof(extdhdr->model));
+ opalmodel = dt_prop_get(dt_root, "model");
+ memcpy(extdhdr->model, opalmodel, OPAL_SYS_MODEL_LEN);
+
+ memset(extdhdr->serial_no, 0x00, sizeof(extdhdr->serial_no));
+ p = dt_find_property(dt_root, "system-id");
+ if (p)
+ memcpy(extdhdr->serial_no, p->prop, OPAL_SYS_SERIAL_LEN);
+ else
+ memset(extdhdr->serial_no, 0, OPAL_SYS_SERIAL_LEN);
+
+ memset(extdhdr->opal_release_version, 0x00,
+ sizeof(extdhdr->opal_release_version));
+ memset(extdhdr->opal_subsys_version, 0x00,
+ sizeof(extdhdr->opal_subsys_version));
+
+ rtc_cache_get_datetime(&extd_date, &extd_time);
+ extdhdr->extended_header_date = cpu_to_be32(extd_date);
+ extdhdr->extended_header_time = cpu_to_be32(extd_time >> 32);
+ extdhdr->opal_symid_len = 0;
+
+ *pel_offset += EXTENDED_HEADER_SECTION_SIZE;
+}
+
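+/*
+ * The SRC string is plain ASCII hex: characters 0-1 hold the SRC type,
+ * characters 2-3 the failing subsystem and characters 4-7 the reason code.
+ */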
+/* Set SRC type */
+static void settype(struct opal_src_section *src, uint8_t src_type)
+{
+ char type[4];
+ snprintf(type, sizeof(type), "%02X", src_type);
+ memcpy(src->srcstring, type, 2);
+}
+
+/* Set SRC subsystem type */
+static void setsubsys(struct opal_src_section *src, uint8_t src_subsys)
+{
+ char subsys[4];
+ snprintf(subsys, sizeof(subsys), "%02X", src_subsys);
+ memcpy(src->srcstring+2, subsys, 2);
+}
+
+/* Set reason code of SRC */
+static void setrefcode(struct opal_src_section *src, uint16_t src_refcode)
+{
+ char refcode[8];
+ snprintf(refcode, sizeof(refcode), "%04X", src_refcode);
+ memcpy(src->srcstring+4, refcode, 4);
+}
+
+/* Create SRC section of OPAL log */
+static void create_src_section(struct errorlog *elog_data,
+ char *pel_buffer, int *pel_offset)
+{
+ struct opal_src_section *src = (struct opal_src_section *)
+ (pel_buffer + *pel_offset);
+
+ src->v6header.id = cpu_to_be16(ELOG_SID_PRIMARY_SRC);
+ src->v6header.length = cpu_to_be16(SRC_SECTION_SIZE);
+ src->v6header.version = OPAL_ELOG_VERSION;
+ src->v6header.subtype = OPAL_ELOG_SST;
+ src->v6header.component_id = cpu_to_be16(elog_data->component_id);
+
+ src->version = OPAL_SRC_SEC_VER;
+ src->flags = 0;
+ src->wordcount = OPAL_SRC_MAX_WORD_COUNT;
+ src->srclength = cpu_to_be16(SRC_LENGTH);
+ settype(src, OPAL_SRC_TYPE_ERROR);
+ setsubsys(src, OPAL_FAILING_SUBSYSTEM);
+ setrefcode(src, elog_data->reason_code);
+	memset(src->hexwords, 0, (8 * 4));
+ src->hexwords[0] = cpu_to_be32(OPAL_SRC_FORMAT);
+ src->hexwords[4] = cpu_to_be32(elog_data->additional_info[0]);
+ src->hexwords[5] = cpu_to_be32(elog_data->additional_info[1]);
+ src->hexwords[6] = cpu_to_be32(elog_data->additional_info[2]);
+ src->hexwords[7] = cpu_to_be32(elog_data->additional_info[3]);
+ *pel_offset += SRC_SECTION_SIZE;
+}
+
+/* Create user header section */
+static void create_user_header_section(struct errorlog *elog_data,
+ char *pel_buffer, int *pel_offset)
+{
+ struct opal_user_header_section *usrhdr =
+ (struct opal_user_header_section *)
+ (pel_buffer + *pel_offset);
+
+ usrhdr->v6header.id = cpu_to_be16(ELOG_SID_USER_HEADER);
+ usrhdr->v6header.length = cpu_to_be16(USER_HEADER_SECTION_SIZE);
+ usrhdr->v6header.version = OPAL_ELOG_VERSION;
+ usrhdr->v6header.subtype = OPAL_ELOG_SST;
+ usrhdr->v6header.component_id = cpu_to_be16(elog_data->component_id);
+
+ usrhdr->subsystem_id = elog_data->subsystem_id;
+ usrhdr->event_scope = 0;
+ usrhdr->event_severity = elog_data->event_severity;
+ usrhdr->event_type = elog_data->event_subtype;
+
+ if (elog_data->elog_origin == ORG_SAPPHIRE)
+ usrhdr->action_flags = cpu_to_be16(ERRL_ACTION_REPORT);
+ else
+ usrhdr->action_flags = cpu_to_be16(ERRL_ACTION_NONE);
+
+ *pel_offset += USER_HEADER_SECTION_SIZE;
+}
+
+/* Create private header section */
+static void create_private_header_section(struct errorlog *elog_data,
+ char *pel_buffer, int *pel_offset)
+{
+ uint64_t ctime;
+ uint32_t cdate;
+ struct opal_private_header_section *privhdr =
+ (struct opal_private_header_section *)
+ pel_buffer;
+
+ privhdr->v6header.id = cpu_to_be16(ELOG_SID_PRIVATE_HEADER);
+ privhdr->v6header.length = cpu_to_be16(PRIVATE_HEADER_SECTION_SIZE);
+ privhdr->v6header.version = OPAL_ELOG_VERSION;
+ privhdr->v6header.subtype = OPAL_ELOG_SST;
+ privhdr->v6header.component_id = cpu_to_be16(elog_data->component_id);
+ privhdr->plid = cpu_to_be32(elog_data->plid);
+
+ rtc_cache_get_datetime(&cdate, &ctime);
+ privhdr->create_date = cpu_to_be32(cdate);
+ privhdr->create_time = cpu_to_be32(ctime >> 32);
+ privhdr->section_count = 5;
+
+ privhdr->creator_subid_hi = 0x00;
+ privhdr->creator_subid_lo = 0x00;
+
+ if (elog_data->elog_origin == ORG_SAPPHIRE)
+ privhdr->creator_id = OPAL_CID_SAPPHIRE;
+ else
+ privhdr->creator_id = OPAL_CID_POWERNV;
+
+	privhdr->log_entry_id = cpu_to_be32(elog_data->plid); /* entry id is updated by FSP */
+
+ *pel_offset += PRIVATE_HEADER_SECTION_SIZE;
+}
+
+static void create_user_defined_section(struct errorlog *elog_data,
+ char *pel_buffer, int *pel_offset)
+{
+ char *dump = (char *)pel_buffer + *pel_offset;
+ char *opal_buf = (char *)elog_data->user_data_dump;
+ struct opal_user_section *usrhdr;
+ struct elog_user_data_section *opal_usr_data;
+ struct opal_private_header_section *privhdr =
+ (struct opal_private_header_section *)pel_buffer;
+ int i;
+
+ for (i = 0; i < elog_data->user_section_count; i++) {
+
+ usrhdr = (struct opal_user_section *)dump;
+ opal_usr_data = (struct elog_user_data_section *)opal_buf;
+
+ usrhdr->v6header.id = cpu_to_be16(ELOG_SID_USER_DEFINED);
+ usrhdr->v6header.length = cpu_to_be16(
+ sizeof(struct opal_v6_header) +
+ be16_to_cpu(opal_usr_data->size));
+ usrhdr->v6header.version = OPAL_ELOG_VERSION;
+ usrhdr->v6header.subtype = OPAL_ELOG_SST;
+ usrhdr->v6header.component_id = cpu_to_be16(elog_data->component_id);
+
+ memcpy(usrhdr->dump, opal_buf, be16_to_cpu(opal_usr_data->size));
+ *pel_offset += be16_to_cpu(usrhdr->v6header.length);
+ dump += be16_to_cpu(usrhdr->v6header.length);
+ opal_buf += be16_to_cpu(opal_usr_data->size);
+ privhdr->section_count++;
+ }
+}
+
+static size_t pel_user_section_size(struct errorlog *elog_data)
+{
+ int i;
+ size_t total = 0;
+ char *opal_buf = (char *)elog_data->user_data_dump;
+ struct elog_user_data_section *opal_usr_data;
+
+ for (i = 0; i < elog_data->user_section_count; i++) {
+ u16 s;
+
+ opal_usr_data = (struct elog_user_data_section *)opal_buf;
+ s = be16_to_cpu(opal_usr_data->size);
+ total += sizeof(struct opal_v6_header) + s;
+ opal_buf += s;
+ }
+
+ return total;
+}
+
+size_t pel_size(struct errorlog *elog_data)
+{
+ return PEL_MIN_SIZE + pel_user_section_size(elog_data);
+}
+
+/* Converts an OPAL errorlog into a PEL formatted log */
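+/* Returns the number of bytes written, or 0 if the buffer is too small. */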
+int create_pel_log(struct errorlog *elog_data, char *pel_buffer,
+ size_t pel_buffer_size)
+{
+ int pel_offset = 0;
+
+ if (pel_buffer_size < pel_size(elog_data)) {
+ prerror("PEL buffer too small to create record\n");
+ return 0;
+ }
+
+ memset(pel_buffer, 0, pel_buffer_size);
+
+ create_private_header_section(elog_data, pel_buffer, &pel_offset);
+ create_user_header_section(elog_data, pel_buffer, &pel_offset);
+ create_src_section(elog_data, pel_buffer, &pel_offset);
+ create_extended_header_section(elog_data, pel_buffer, &pel_offset);
+ create_mtms_section(elog_data, pel_buffer, &pel_offset);
+ if (elog_data->user_section_count)
+ create_user_defined_section(elog_data, pel_buffer, &pel_offset);
+
+ return pel_offset;
+}
diff --git a/roms/skiboot/core/platform.c b/roms/skiboot/core/platform.c
new file mode 100644
index 000000000..320fdea03
--- /dev/null
+++ b/roms/skiboot/core/platform.c
@@ -0,0 +1,319 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * OPAL Platform abstraction
+ *
+ * Some OPAL calls forward into the struct platform that's probed
+ * during boot, when the platform provides the corresponding hook.
+ * This file also carries the generic platform probing, init and
+ * configuration glue.
+ *
+ * Copyright 2013-2019 IBM Corp.
+ */
+
+#include <stdlib.h>
+#include <skiboot.h>
+#include <opal.h>
+#include <console.h>
+#include <timebase.h>
+#include <cpu.h>
+#include <chip.h>
+#include <xscom.h>
+#include <errorlog.h>
+#include <bt.h>
+#include <nvram.h>
+#include <npu2.h>
+#include <platforms/astbmc/astbmc.h>
+
+bool manufacturing_mode = false;
+struct platform platform;
+
+DEFINE_LOG_ENTRY(OPAL_RC_ABNORMAL_REBOOT, OPAL_PLATFORM_ERR_EVT, OPAL_CEC,
+ OPAL_CEC_HARDWARE, OPAL_ERROR_PANIC,
+ OPAL_ABNORMAL_POWER_OFF);
+
+/*
+ * Various wrappers for platform functions
+ */
+static int64_t opal_cec_power_down(uint64_t request)
+{
+ prlog(PR_NOTICE, "OPAL: Shutdown request type 0x%llx...\n", request);
+
+ opal_quiesce(QUIESCE_HOLD, -1);
+
+ console_complete_flush();
+
+ if (platform.cec_power_down)
+ return platform.cec_power_down(request);
+
+ return OPAL_SUCCESS;
+}
+opal_call(OPAL_CEC_POWER_DOWN, opal_cec_power_down, 1);
+
+static int64_t full_reboot(void)
+{
+ prlog(PR_NOTICE, "OPAL: Reboot request...\n");
+
+ console_complete_flush();
+
+ if (platform.cec_reboot)
+ return platform.cec_reboot();
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t opal_cec_reboot(void)
+{
+ opal_quiesce(QUIESCE_HOLD, -1);
+
+ /*
+ * Fast-reset was enabled by default for a long time in an attempt to
+ * make it more stable by exercising it more frequently. This resulted
+ * in a fair amount of pain due to mis-behaving hardware and confusion
+ * about what a "reset" is supposed to do exactly. Additionally,
+ * secure variables require a full reboot to work at all.
+ *
+ * Due to all that fast-reset should only be used if it's explicitly
+ * enabled. It started life as a debug hack and should remain one.
+ */
+ if (nvram_query_eq_safe("fast-reset", "1"))
+ fast_reboot();
+
+ return full_reboot();
+}
+opal_call(OPAL_CEC_REBOOT, opal_cec_reboot, 0);
+
+static int64_t opal_cec_reboot2(uint32_t reboot_type, char *diag)
+{
+ struct errorlog *buf;
+
+ opal_quiesce(QUIESCE_HOLD, -1);
+
+ switch (reboot_type) {
+ case OPAL_REBOOT_NORMAL:
+ return opal_cec_reboot();
+ case OPAL_REBOOT_PLATFORM_ERROR:
+ prlog(PR_EMERG,
+ "OPAL: Reboot requested due to Platform error.\n");
+ buf = opal_elog_create(&e_info(OPAL_RC_ABNORMAL_REBOOT), 0);
+ if (buf) {
+ log_append_msg(buf,
+ "OPAL: Reboot requested due to Platform error.");
+ if (diag) {
+ /* Add user section "DESC" */
+ log_add_section(buf, OPAL_ELOG_SEC_DESC);
+ log_append_data(buf, diag, strlen(diag));
+ }
+ log_commit(buf);
+ } else {
+ prerror("OPAL: failed to log an error\n");
+ }
+ disable_fast_reboot("Reboot due to Platform Error");
+ console_complete_flush();
+ return xscom_trigger_xstop();
+ case OPAL_REBOOT_FULL_IPL:
+		prlog(PR_NOTICE, "Reboot: Full reboot requested\n");
+ return full_reboot();
+ case OPAL_REBOOT_MPIPL:
+ prlog(PR_NOTICE, "Reboot: OS reported error. Performing MPIPL\n");
+ console_complete_flush();
+ if (platform.terminate)
+ platform.terminate("OS reported error. Performing MPIPL\n");
+ else
+ full_reboot();
+ for (;;);
+ break;
+ case OPAL_REBOOT_FAST:
+ prlog(PR_NOTICE, "Reboot: Fast reboot requested by OS\n");
+ fast_reboot();
+ prlog(PR_NOTICE, "Reboot: Fast reboot failed\n");
+ return OPAL_UNSUPPORTED;
+ default:
+ prlog(PR_NOTICE, "OPAL: Unsupported reboot request %d\n", reboot_type);
+		return OPAL_UNSUPPORTED;
+ }
+ return OPAL_SUCCESS;
+}
+opal_call(OPAL_CEC_REBOOT2, opal_cec_reboot2, 2);
+
+static bool generic_platform_probe(void)
+{
+ if (dt_find_by_path(dt_root, "bmc")) {
+ /* We appear to have a BMC... so let's cross our fingers
+ * and see if we can do anything!
+ */
+ prlog(PR_ERR, "GENERIC BMC PLATFORM: **GUESSING** that there's "
+ "*maybe* a BMC we can talk to.\n");
+ prlog(PR_ERR, "THIS IS ****UNSUPPORTED****, BRINGUP USE ONLY.\n");
+ astbmc_early_init();
+ } else {
+ uart_init();
+ }
+
+ return true;
+}
+
+static void generic_platform_init(void)
+{
+ if (uart_enabled())
+ set_opal_console(&uart_opal_con);
+
+ if (dt_find_by_path(dt_root, "bmc")) {
+ prlog(PR_ERR, "BMC-GUESSWORK: Here be dragons with a taste for human flesh\n");
+ astbmc_init();
+ } else {
+ /* Otherwise we go down the ultra-minimal path */
+
+ /* Enable a BT interface if we find one too */
+ bt_init();
+ }
+
+ /* Fake a real time clock */
+ fake_rtc_init();
+}
+
+static int64_t generic_cec_power_down(uint64_t request __unused)
+{
+ return OPAL_UNSUPPORTED;
+}
+
+static int generic_resource_loaded(enum resource_id id, uint32_t subid)
+{
+ if (dt_find_by_path(dt_root, "bmc"))
+ return flash_resource_loaded(id, subid);
+
+ return OPAL_EMPTY;
+}
+
+static int generic_start_preload_resource(enum resource_id id, uint32_t subid,
+ void *buf, size_t *len)
+{
+ if (dt_find_by_path(dt_root, "bmc"))
+ return flash_start_preload_resource(id, subid, buf, len);
+
+ return OPAL_EMPTY;
+}
+
+/* These values will work for a ZZ booted using BML */
+static const struct platform_ocapi generic_ocapi = {
+ .i2c_engine = 1,
+ .i2c_port = 4,
+ .i2c_reset_addr = 0x20,
+ .i2c_reset_brick2 = (1 << 1),
+ .i2c_reset_brick3 = (1 << 6),
+ .i2c_reset_brick4 = 0, /* unused */
+ .i2c_reset_brick5 = 0, /* unused */
+ .i2c_presence_addr = 0x20,
+ .i2c_presence_brick2 = (1 << 2), /* bottom connector */
+ .i2c_presence_brick3 = (1 << 7), /* top connector */
+ .i2c_presence_brick4 = 0, /* unused */
+ .i2c_presence_brick5 = 0, /* unused */
+ .odl_phy_swap = true,
+};
+
+static struct bmc_platform generic_bmc = {
+ .name = "generic",
+};
+
+static struct platform generic_platform = {
+ .name = "generic",
+ .bmc = &generic_bmc,
+ .probe = generic_platform_probe,
+ .init = generic_platform_init,
+ .nvram_info = fake_nvram_info,
+ .nvram_start_read = fake_nvram_start_read,
+ .nvram_write = fake_nvram_write,
+ .cec_power_down = generic_cec_power_down,
+ .start_preload_resource = generic_start_preload_resource,
+ .resource_loaded = generic_resource_loaded,
+ .ocapi = &generic_ocapi,
+ .npu2_device_detect = npu2_i2c_presence_detect, /* Assumes ZZ */
+};
+
+const struct bmc_platform *bmc_platform = &generic_bmc;
+
+void set_bmc_platform(const struct bmc_platform *bmc)
+{
+ if (bmc)
+ prlog(PR_NOTICE, "PLAT: Detected BMC platform %s\n", bmc->name);
+ else
+ bmc = &generic_bmc;
+
+ bmc_platform = bmc;
+}
+
+void probe_platform(void)
+{
+ struct platform *platforms = &__platforms_start;
+ unsigned int i;
+
+ /* Detect Manufacturing mode */
+ if (dt_find_property(dt_root, "ibm,manufacturing-mode")) {
+ /**
+ * @fwts-label ManufacturingMode
+ * @fwts-advice You are running in manufacturing mode.
+ * This mode should only be enabled in a factory during
+ * manufacturing.
+ */
+ prlog(PR_NOTICE, "PLAT: Manufacturing mode ON\n");
+ manufacturing_mode = true;
+ }
+
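+	/* Walk the built-in platform table; the first probe() that claims the machine wins */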
+ for (i = 0; &platforms[i] < &__platforms_end; i++) {
+ if (platforms[i].probe && platforms[i].probe()) {
+ platform = platforms[i];
+ break;
+ }
+ }
+ if (!platform.name) {
+ platform = generic_platform;
+ if (platform.probe)
+ platform.probe();
+ }
+
+ prlog(PR_NOTICE, "PLAT: Detected %s platform\n", platform.name);
+
+ set_bmc_platform(platform.bmc);
+}
+
+int start_preload_resource(enum resource_id id, uint32_t subid,
+ void *buf, size_t *len)
+{
+ if (!platform.start_preload_resource)
+ return OPAL_UNSUPPORTED;
+
+ return platform.start_preload_resource(id, subid, buf, len);
+}
+
+int resource_loaded(enum resource_id id, uint32_t idx)
+{
+ if (!platform.resource_loaded)
+ return OPAL_SUCCESS;
+
+ return platform.resource_loaded(id, idx);
+}
+
+int wait_for_resource_loaded(enum resource_id id, uint32_t idx)
+{
+ int r = resource_loaded(id, idx);
+ int waited = 0;
+
+	while (r == OPAL_BUSY) {
+ opal_run_pollers();
+ r = resource_loaded(id, idx);
+ if (r != OPAL_BUSY)
+ break;
+ time_wait_ms_nopoll(5);
+		waited += 5;
+ }
+
+ prlog(PR_TRACE, "PLATFORM: wait_for_resource_loaded %x/%x %u ms\n",
+ id, idx, waited);
+ return r;
+}
+
+void op_display(enum op_severity sev, enum op_module mod, uint16_t code)
+{
+ if (platform.op_display)
+ platform.op_display(sev, mod, code);
+}
diff --git a/roms/skiboot/core/pool.c b/roms/skiboot/core/pool.c
new file mode 100644
index 000000000..a0283199a
--- /dev/null
+++ b/roms/skiboot/core/pool.c
@@ -0,0 +1,68 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * This file provides some functions to manage a pool of pre-allocated
+ * objects. It also provides a method to reserve a pre-defined number
+ * of objects for higher priority requests. Allocations obey the
+ * following rules:
+ *
+ * 1. An allocation will succeed at any priority if there are more than
+ *    the reserved number of objects free.
+ * 2. Only high priority allocations will succeed when there are fewer
+ *    than the reserved number of objects free.
+ * 3. When an allocation is freed it is always added back to the high
+ *    priority pool if there are fewer than the reserved number of
+ *    objects available.
+ *
+ * Copyright 2013-2014 IBM Corp.
+ */
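+
+/*
+ * Illustrative usage (a sketch, not part of the original file); "struct foo"
+ * is a hypothetical object type:
+ *
+ *	struct pool foo_pool;
+ *
+ *	pool_init(&foo_pool, sizeof(struct foo), 16, 4);
+ *	struct foo *f = pool_get(&foo_pool, POOL_NORMAL);
+ *	if (f)
+ *		pool_free_object(&foo_pool, f);
+ */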
+
+#include <pool.h>
+#include <string.h>
+#include <stdlib.h>
+#include <ccan/list/list.h>
+
+void *pool_get(struct pool *pool, enum pool_priority priority)
+{
+ void *obj;
+
+ if (!pool->free_count ||
+ ((pool->free_count <= pool->reserved) && priority == POOL_NORMAL))
+ return NULL;
+
+ pool->free_count--;
+ obj = (void *) list_pop_(&pool->free_list, 0);
+ assert(obj);
+ memset(obj, 0, pool->obj_size);
+ return obj;
+}
+
+void pool_free_object(struct pool *pool, void *obj)
+{
+ pool->free_count++;
+ list_add_tail(&pool->free_list,
+ (struct list_node *) (obj));
+}
+
+int pool_init(struct pool *pool, size_t obj_size, int count, int reserved)
+{
+ int i;
+
+ if (obj_size < sizeof(struct list_node))
+ obj_size = sizeof(struct list_node);
+
+ assert(count >= reserved);
+ pool->buf = malloc(obj_size*count);
+ if (!pool->buf)
+ return -1;
+
+ pool->obj_size = obj_size;
+ pool->free_count = count;
+ pool->reserved = reserved;
+ list_head_init(&pool->free_list);
+
+	for (i = 0; i < count; i++)
+ list_add_tail(&pool->free_list,
+ (struct list_node *) (pool->buf + obj_size*i));
+
+ return 0;
+}
diff --git a/roms/skiboot/core/powercap.c b/roms/skiboot/core/powercap.c
new file mode 100644
index 000000000..6ae58eb86
--- /dev/null
+++ b/roms/skiboot/core/powercap.c
@@ -0,0 +1,37 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * OPAL calls to get/set power caps
+ *
+ * Copyright 2017 IBM Corp.
+ */
+
+#include <powercap.h>
+
+static int opal_get_powercap(u32 handle, int token __unused, __be32 *__pcap)
+{
+ if (!__pcap || !opal_addr_valid(__pcap))
+ return OPAL_PARAMETER;
+
+ if (powercap_get_class(handle) == POWERCAP_CLASS_OCC) {
+ u32 pcap;
+ int rc;
+
+ rc = occ_get_powercap(handle, &pcap);
+ *__pcap = cpu_to_be32(pcap);
+ return rc;
+ }
+
+ return OPAL_UNSUPPORTED;
+};
+
+opal_call(OPAL_GET_POWERCAP, opal_get_powercap, 3);
+
+static int opal_set_powercap(u32 handle, int token, u32 pcap)
+{
+ if (powercap_get_class(handle) == POWERCAP_CLASS_OCC)
+ return occ_set_powercap(handle, token, pcap);
+
+ return OPAL_UNSUPPORTED;
+};
+
+opal_call(OPAL_SET_POWERCAP, opal_set_powercap, 3);
diff --git a/roms/skiboot/core/psr.c b/roms/skiboot/core/psr.c
new file mode 100644
index 000000000..75ccc6617
--- /dev/null
+++ b/roms/skiboot/core/psr.c
@@ -0,0 +1,41 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * OPAL calls to get/set Power Shift Ratio (PSR)
+ *
+ * i.e. when something has to be throttled, what gets throttled?
+ *
+ * Copyright 2017 IBM Corp.
+ */
+
+#include <psr.h>
+
+static int opal_get_power_shift_ratio(u32 handle, int token __unused,
+ __be32 *__ratio)
+{
+ if (!__ratio || !opal_addr_valid(__ratio))
+ return OPAL_PARAMETER;
+
+ if (psr_get_class(handle) == PSR_CLASS_OCC) {
+ u32 ratio;
+ int rc;
+
+ rc = occ_get_psr(handle, &ratio);
+ *__ratio = cpu_to_be32(ratio);
+ return rc;
+ }
+
+ return OPAL_UNSUPPORTED;
+};
+
+opal_call(OPAL_GET_POWER_SHIFT_RATIO, opal_get_power_shift_ratio, 3);
+
+static int opal_set_power_shift_ratio(u32 handle, int token,
+ u32 ratio)
+{
+ if (psr_get_class(handle) == PSR_CLASS_OCC)
+ return occ_set_psr(handle, token, ratio);
+
+ return OPAL_UNSUPPORTED;
+};
+
+opal_call(OPAL_SET_POWER_SHIFT_RATIO, opal_set_power_shift_ratio, 3);
diff --git a/roms/skiboot/core/relocate.c b/roms/skiboot/core/relocate.c
new file mode 100644
index 000000000..6295927e2
--- /dev/null
+++ b/roms/skiboot/core/relocate.c
@@ -0,0 +1,55 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Relocate ourselves
+ *
+ * WARNING: This code is used to self-relocate, it cannot have any
+ * global reference nor TOC reference. It's also called before BSS
+ * is cleared.
+ *
+ * Copyright 2013-2015 IBM Corp.
+ */
+
+#include <stdbool.h>
+#include <elf.h>
+
+/* Called from head.S, thus no header. */
+int relocate(uint64_t offset, struct elf64_dyn *dyn, struct elf64_rela *rela);
+
+/* Note: This code relies on the assumptions that our link address
+ * is 0 and that we are already running at the target address.
+ */
+int relocate(uint64_t offset, struct elf64_dyn *dyn, struct elf64_rela *rela)
+{
+ uint64_t dt_rela = 0;
+ uint64_t dt_relacount = 0;
+ unsigned int i;
+
+ /* Look for relocation table */
+ for (; dyn->d_tag != DT_NULL; dyn++) {
+ if (dyn->d_tag == DT_RELA)
+ dt_rela = dyn->d_val;
+ else if (dyn->d_tag == DT_RELACOUNT)
+ dt_relacount = dyn->d_val;
+ }
+
+ /* If we miss either rela or relacount, bail */
+ if (!dt_rela || !dt_relacount)
+ return -1;
+
+ /* Check if the offset is consistent */
+ if ((offset + dt_rela) != (uint64_t)rela)
+ return -2;
+
+ /* Perform relocations */
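+	/* Each R_PPC64_RELATIVE entry means: *(offset + r_offset) = offset + r_addend */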
+ for (i = 0; i < dt_relacount; i++, rela++) {
+ uint64_t *t;
+
+ if (ELF64_R_TYPE(rela->r_info) != R_PPC64_RELATIVE)
+ return -3;
+ t = (uint64_t *)(rela->r_offset + offset);
+ *t = rela->r_addend + offset;
+ }
+
+ return 0;
+}
diff --git a/roms/skiboot/core/rtc.c b/roms/skiboot/core/rtc.c
new file mode 100644
index 000000000..3c0dda71e
--- /dev/null
+++ b/roms/skiboot/core/rtc.c
@@ -0,0 +1,62 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Real Time Clock (RTC) Cache
+ *
+ * Copyright 2013-2014 IBM Corp.
+ */
+
+#include <skiboot.h>
+#include <lock.h>
+#include <rtc.h>
+#include <timebase.h>
+
+static struct lock rtc_tod_lock = LOCK_UNLOCKED;
+
+static struct {
+ struct tm tm;
+ unsigned long tb;
+ bool valid;
+} rtc_tod_cache;
+
+void rtc_cache_update(struct tm *tm)
+{
+ lock(&rtc_tod_lock);
+ rtc_tod_cache.tb = mftb();
+ rtc_tod_cache.tm = *tm;
+ rtc_tod_cache.valid = true;
+ unlock(&rtc_tod_lock);
+}
+
+int rtc_cache_get(struct tm *tm)
+{
+ unsigned long cache_age_sec;
+
+ lock(&rtc_tod_lock);
+
+ if (!rtc_tod_cache.valid) {
+ unlock(&rtc_tod_lock);
+ return -1;
+ }
+
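+	/* Age the cached time by the seconds elapsed since it was snapshotted */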
+ cache_age_sec = tb_to_msecs(mftb() - rtc_tod_cache.tb) / 1000;
+ *tm = rtc_tod_cache.tm;
+ unlock(&rtc_tod_lock);
+
+ tm->tm_sec += cache_age_sec;
+ mktime(tm);
+
+ return 0;
+}
+
+int rtc_cache_get_datetime(uint32_t *year_month_day,
+ uint64_t *hour_minute_second_millisecond)
+{
+ struct tm tm;
+
+ if (rtc_cache_get(&tm) < 0)
+ return -1;
+
+ tm_to_datetime(&tm, year_month_day, hour_minute_second_millisecond);
+
+ return 0;
+}
diff --git a/roms/skiboot/core/sensor.c b/roms/skiboot/core/sensor.c
new file mode 100644
index 000000000..303d867e2
--- /dev/null
+++ b/roms/skiboot/core/sensor.c
@@ -0,0 +1,152 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * OPAL Sensor APIs
+ *
+ * Copyright 2013-2018 IBM Corp.
+ */
+
+#include <sensor.h>
+#include <skiboot.h>
+#include <device.h>
+#include <opal.h>
+#include <dts.h>
+#include <lock.h>
+#include <occ.h>
+
+struct dt_node *sensor_node;
+
+static struct lock async_read_list_lock = LOCK_UNLOCKED;
+static LIST_HEAD(async_read_list);
+
+struct sensor_async_read {
+ struct list_node link;
+ __be64 *val;
+ __be32 *opal_data;
+ int token;
+};
+
+static int add_to_async_read_list(int token, __be32 *opal_data, __be64 *val)
+{
+ struct sensor_async_read *req;
+
+ req = zalloc(sizeof(*req));
+ if (!req)
+ return OPAL_NO_MEM;
+
+ req->token = token;
+ req->val = val;
+ req->opal_data = opal_data;
+
+ lock(&async_read_list_lock);
+ list_add_tail(&async_read_list, &req->link);
+ unlock(&async_read_list_lock);
+
+ return OPAL_ASYNC_COMPLETION;
+}
+
+void check_sensor_read(int token)
+{
+ struct sensor_async_read *req = NULL;
+
+ lock(&async_read_list_lock);
+ if (list_empty(&async_read_list))
+ goto out;
+
+ list_for_each(&async_read_list, req, link) {
+ if (req->token == token)
+ break;
+ }
+ if (!req)
+ goto out;
+
+ *req->opal_data = cpu_to_be32(be64_to_cpu(*req->val));
+ free(req->val);
+ list_del(&req->link);
+ free(req);
+out:
+ unlock(&async_read_list_lock);
+}
+
+static s64 opal_sensor_read_64(u32 sensor_hndl, int token, __be64 *data)
+{
+ s64 rc;
+
+ switch (sensor_get_family(sensor_hndl)) {
+ case SENSOR_DTS:
+ rc = dts_sensor_read(sensor_hndl, token, data);
+ return rc;
+
+ case SENSOR_OCC:
+ rc = occ_sensor_read(sensor_hndl, data);
+ return rc;
+
+ default:
+ break;
+ }
+
+ if (platform.sensor_read) {
+ rc = platform.sensor_read(sensor_hndl, token, data);
+ return rc;
+ }
+
+ return OPAL_UNSUPPORTED;
+}
+
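+/*
+ * Legacy 32-bit read: wraps the 64-bit read above. Synchronous results
+ * are folded into the caller's 32-bit buffer here; asynchronous ones are
+ * parked on async_read_list and completed from check_sensor_read().
+ */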
+static int64_t opal_sensor_read(uint32_t sensor_hndl, int token,
+ __be32 *data)
+{
+ __be64 *val;
+ s64 rc;
+
+ val = zalloc(sizeof(*val));
+ if (!val)
+ return OPAL_NO_MEM;
+
+ rc = opal_sensor_read_64(sensor_hndl, token, val);
+ if (rc == OPAL_SUCCESS) {
+ *data = cpu_to_be32(be64_to_cpu(*val));
+ free(val);
+ } else if (rc == OPAL_ASYNC_COMPLETION) {
+ rc = add_to_async_read_list(token, data, val);
+ }
+
+ return rc;
+}
+
+static int opal_sensor_group_clear(u32 group_hndl, int token)
+{
+ switch (sensor_get_family(group_hndl)) {
+ case SENSOR_OCC:
+ return occ_sensor_group_clear(group_hndl, token);
+ default:
+ break;
+ }
+
+ return OPAL_UNSUPPORTED;
+}
+
+static int opal_sensor_group_enable(u32 group_hndl, int token, bool enable)
+{
+ switch (sensor_get_family(group_hndl)) {
+ case SENSOR_OCC:
+ return occ_sensor_group_enable(group_hndl, token, enable);
+ default:
+ break;
+ }
+
+ return OPAL_UNSUPPORTED;
+}
+
+void sensor_init(void)
+{
+ sensor_node = dt_new(opal_node, "sensors");
+
+ dt_add_property_string(sensor_node, "compatible", "ibm,opal-sensor");
+ dt_add_property_cells(sensor_node, "#address-cells", 1);
+ dt_add_property_cells(sensor_node, "#size-cells", 0);
+
+ /* Register OPAL interface */
+ opal_register(OPAL_SENSOR_READ, opal_sensor_read, 3);
+ opal_register(OPAL_SENSOR_GROUP_CLEAR, opal_sensor_group_clear, 2);
+ opal_register(OPAL_SENSOR_READ_U64, opal_sensor_read_64, 3);
+ opal_register(OPAL_SENSOR_GROUP_ENABLE, opal_sensor_group_enable, 3);
+}
diff --git a/roms/skiboot/core/stack.c b/roms/skiboot/core/stack.c
new file mode 100644
index 000000000..3edf98411
--- /dev/null
+++ b/roms/skiboot/core/stack.c
@@ -0,0 +1,266 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Create/Print backtraces, check stack usage etc.
+ *
+ * Copyright 2013-2019 IBM Corp.
+ */
+
+#include <skiboot.h>
+#include <processor.h>
+#include <cpu.h>
+#include <stack.h>
+#include <mem_region.h>
+#include <unistd.h>
+#include <lock.h>
+
+#define STACK_BUF_ENTRIES 60
+static struct bt_entry bt_buf[STACK_BUF_ENTRIES];
+
+/* Dumps backtrace to buffer */
+static void __nomcount __backtrace_create(struct bt_entry *entries,
+ unsigned int max_ents,
+ struct bt_metadata *metadata,
+ struct stack_frame *eframe)
+{
+ unsigned long *fp = (unsigned long *)eframe;
+ unsigned long top_adj = top_of_ram;
+
+ /* Assume one stack for early backtraces */
+ if (top_of_ram == SKIBOOT_BASE + SKIBOOT_SIZE)
+ top_adj = top_of_ram + STACK_SIZE;
+
+ metadata->ents = 0;
+ while (max_ents) {
+ fp = (unsigned long *)fp[0];
+ if (!fp || (unsigned long)fp > top_adj)
+ break;
+ eframe = (struct stack_frame *)fp;
+ if (eframe->magic == STACK_INT_MAGIC) {
+ entries->exception_type = eframe->type;
+ entries->exception_pc = eframe->pc;
+ } else {
+ entries->exception_type = 0;
+ }
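+		/* Frame layout (ppc64 ABI): fp[0] is the back chain, fp[2] the LR save area */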
+ entries->sp = (unsigned long)fp;
+ entries->pc = fp[2];
+ entries++;
+ metadata->ents++;
+ max_ents--;
+ }
+
+ metadata->r1_caller = eframe->gpr[1];
+
+ if (fp)
+ metadata->token = eframe->gpr[0];
+ else
+ metadata->token = -1UL;
+
+ metadata->pir = mfspr(SPR_PIR);
+}
+
+void __nomcount backtrace_create(struct bt_entry *entries,
+ unsigned int max_ents,
+ struct bt_metadata *metadata)
+{
+ unsigned long *fp = __builtin_frame_address(0);
+ struct stack_frame *eframe = (struct stack_frame *)fp;
+
+ __backtrace_create(entries, max_ents, metadata, eframe);
+}
+
+void backtrace_print(struct bt_entry *entries, struct bt_metadata *metadata,
+ char *out_buf, unsigned int *len, bool symbols)
+{
+ static char bt_text_buf[4096];
+ int i, l = 0, max;
+ char *buf = out_buf;
+ unsigned long bottom, top, normal_top, tbot, ttop;
+ char mark;
+
+ if (!out_buf) {
+ buf = bt_text_buf;
+ max = sizeof(bt_text_buf) - 16;
+ } else
+ max = *len - 1;
+
+ bottom = cpu_stack_bottom(metadata->pir);
+ normal_top = cpu_stack_top(metadata->pir);
+ top = cpu_emergency_stack_top(metadata->pir);
+ tbot = SKIBOOT_BASE;
+ ttop = (unsigned long)&_etext;
+
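+	/* Markers: '!' SP outside the stack, 'E' on the emergency stack, '*' PC outside skiboot text */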
+ l += snprintf(buf, max, "CPU %04lx Backtrace:\n", metadata->pir);
+ for (i = 0; i < metadata->ents && l < max; i++) {
+ if (entries->sp < bottom || entries->sp > top)
+ mark = '!';
+ else if (entries->sp > normal_top)
+ mark = 'E';
+ else if (entries->pc < tbot || entries->pc > ttop)
+ mark = '*';
+ else
+ mark = ' ';
+ l += snprintf(buf + l, max - l,
+ " S: %016lx R: %016lx %c ",
+ entries->sp, entries->pc, mark);
+ if (symbols)
+ l += snprintf_symbol(buf + l, max - l, entries->pc);
+ l += snprintf(buf + l, max - l, "\n");
+ if (entries->exception_type) {
+ l += snprintf(buf + l, max - l,
+ " --- Interrupt 0x%lx at %016lx ---\n",
+ entries->exception_type, entries->exception_pc);
+ }
+ entries++;
+ }
+ if (metadata->token <= OPAL_LAST)
+ l += snprintf(buf + l, max - l,
+ " --- OPAL call token: 0x%lx caller R1: 0x%016lx ---\n",
+ metadata->token, metadata->r1_caller);
+ else if (metadata->token == -1UL)
+ l += snprintf(buf + l, max - l, " --- OPAL boot ---\n");
+ if (!out_buf)
+ write(stdout->fd, bt_text_buf, l);
+ buf[l++] = 0;
+ if (len)
+ *len = l;
+}
+
+/*
+ * To ensure that we always get backtrace output we bypass the usual console
+ * locking paths. The downside is that when multiple threads need to print
+ * a backtrace they can garble each other's output. To prevent this we
+ * use a separate lock to serialise printing of the dumps.
+ */
+static struct lock bt_lock = LOCK_UNLOCKED;
+
+void backtrace(void)
+{
+ struct bt_metadata metadata;
+
+ lock(&bt_lock);
+
+ backtrace_create(bt_buf, STACK_BUF_ENTRIES, &metadata);
+ backtrace_print(bt_buf, &metadata, NULL, NULL, true);
+
+ unlock(&bt_lock);
+}
+
+void backtrace_r1(uint64_t r1)
+{
+ struct bt_metadata metadata;
+
+ lock(&bt_lock);
+
+ __backtrace_create(bt_buf, STACK_BUF_ENTRIES, &metadata, (struct stack_frame *)r1);
+ backtrace_print(bt_buf, &metadata, NULL, NULL, true);
+
+ unlock(&bt_lock);
+}
+
+void __nomcount __stack_chk_fail(void);
+void __nomcount __stack_chk_fail(void)
+{
+ static bool failed_once;
+
+ if (failed_once)
+ return;
+ failed_once = true;
+ prlog(PR_EMERG, "Stack corruption detected !\n");
+ abort();
+}
+
+#ifdef STACK_CHECK_ENABLED
+
+static int64_t lowest_stack_mark = LONG_MAX;
+static struct lock stack_check_lock = LOCK_UNLOCKED;
+
+void __nomcount __mcount_stack_check(uint64_t sp, uint64_t lr);
+void __nomcount __mcount_stack_check(uint64_t sp, uint64_t lr)
+{
+ struct cpu_thread *c = this_cpu();
+ uint64_t base = (uint64_t)c;
+ uint64_t bot = base + sizeof(struct cpu_thread);
+ int64_t mark = sp - bot;
+ uint64_t top = base + NORMAL_STACK_SIZE;
+
+ /*
+ * Don't check the emergency stack just yet.
+ */
+ if (c->in_opal_call > 1)
+ return;
+
+ /*
+ * Don't re-enter on this CPU or don't enter at all if somebody
+ * has spotted an overflow
+ */
+ if (c->in_mcount)
+ return;
+ c->in_mcount = true;
+
+ /* Capture lowest stack for this thread */
+ if (mark < c->stack_bot_mark) {
+ lock(&stack_check_lock);
+ c->stack_bot_mark = mark;
+ c->stack_bot_pc = lr;
+ c->stack_bot_tok = c->current_token;
+ backtrace_create(c->stack_bot_bt, CPU_BACKTRACE_SIZE,
+ &c->stack_bot_bt_metadata);
+ unlock(&stack_check_lock);
+
+ if (mark < STACK_WARNING_GAP) {
+ prlog(PR_EMERG, "CPU %04x Stack usage danger !"
+ " pc=%08llx sp=%08llx (gap=%lld) token=%lld\n",
+ c->pir, lr, sp, mark, c->current_token);
+ }
+ }
+
+ /* Stack is within bounds? */
+ if (sp >= (bot + STACK_SAFETY_GAP) && sp < top) {
+ c->in_mcount = false;
+ return;
+ }
+
+ prlog(PR_EMERG, "CPU %04x Stack overflow detected !"
+ " pc=%08llx sp=%08llx (gap=%lld) token=%lld\n",
+ c->pir, lr, sp, mark, c->current_token);
+ abort();
+}
+
+void check_stacks(void)
+{
+ struct cpu_thread *c, *lowest = NULL;
+
+ /* We should never call that from mcount */
+ assert(!this_cpu()->in_mcount);
+
+ /* Mark ourselves "in_mcount" to avoid deadlock on stack
+ * check lock
+ */
+ this_cpu()->in_mcount = true;
+
+ for_each_cpu(c) {
+ if (!c->stack_bot_mark ||
+ c->stack_bot_mark >= lowest_stack_mark)
+ continue;
+ lock(&stack_check_lock);
+ if (c->stack_bot_mark < lowest_stack_mark) {
+ lowest = c;
+ lowest_stack_mark = c->stack_bot_mark;
+ }
+ unlock(&stack_check_lock);
+ }
+ if (lowest) {
+ lock(&bt_lock);
+ prlog(PR_NOTICE, "CPU %04x lowest stack mark %lld bytes left"
+ " pc=%08llx token=%lld\n",
+ lowest->pir, lowest->stack_bot_mark, lowest->stack_bot_pc,
+ lowest->stack_bot_tok);
+ backtrace_print(lowest->stack_bot_bt,
+ &lowest->stack_bot_bt_metadata,
+ NULL, NULL, true);
+ unlock(&bt_lock);
+ }
+
+ this_cpu()->in_mcount = false;
+}
+#endif /* STACK_CHECK_ENABLED */
diff --git a/roms/skiboot/core/test/Makefile.check b/roms/skiboot/core/test/Makefile.check
new file mode 100644
index 000000000..7c347bea2
--- /dev/null
+++ b/roms/skiboot/core/test/Makefile.check
@@ -0,0 +1,101 @@
+# -*-Makefile-*-
+CORE_TEST := \
+ core/test/run-bitmap \
+ core/test/run-cpufeatures \
+ core/test/run-device \
+ core/test/run-flash-subpartition \
+ core/test/run-flash-firmware-versions \
+ core/test/run-mem_region \
+ core/test/run-malloc \
+ core/test/run-malloc-speed \
+ core/test/run-mem_region_init \
+ core/test/run-mem_region_next \
+ core/test/run-mem_region_release_unused \
+ core/test/run-mem_region_release_unused_noalloc \
+ core/test/run-mem_region_reservations \
+ core/test/run-mem_range_is_reserved \
+ core/test/run-nvram-format \
+ core/test/run-trace core/test/run-msg \
+ core/test/run-pel \
+ core/test/run-pool \
+ core/test/run-time-utils \
+ core/test/run-timebase \
+ core/test/run-timer \
+ core/test/run-buddy \
+ core/test/run-pci-quirk
+
+HOSTCFLAGS+=-I . -I include -Wno-error=attributes
+
+CORE_TEST_NOSTUB := core/test/run-console-log
+CORE_TEST_NOSTUB += core/test/run-console-log-buf-overrun
+CORE_TEST_NOSTUB += core/test/run-console-log-pr_fmt
+CORE_TEST_NOSTUB += core/test/run-api-test
+
+LCOV_EXCLUDE += $(CORE_TEST:%=%.c) core/test/stubs.c
+LCOV_EXCLUDE += $(CORE_TEST_NOSTUB:%=%.c) /usr/include/*
+
+.PHONY : core-check
+core-check: $(CORE_TEST:%=%-check) $(CORE_TEST_NOSTUB:%=%-check)
+
+.PHONY : core-coverage
+core-coverage: $(CORE_TEST:%=%-gcov-run)
+core-coverage: $(CORE_TEST_NOSTUB:%=%-gcov-run)
+
+check: core-check
+coverage: core-coverage
+
+$(CORE_TEST:%=%-gcov-run) : %-run: %
+ $(call QTEST, TEST-COVERAGE ,$< , $<)
+
+$(CORE_TEST_NOSTUB:%=%-gcov-run) : %-run: %
+ $(call QTEST, TEST-COVERAGE ,$< , $<)
+
+$(CORE_TEST:%=%-check) : %-check: %
+ $(call QTEST, RUN-TEST ,$(VALGRIND) $<, $<)
+
+$(CORE_TEST_NOSTUB:%=%-check) : %-check: %
+ $(call QTEST, RUN-TEST ,$(VALGRIND) $<, $<)
+
+core/test/stubs.o: core/test/stubs.c
+ $(call Q, HOSTCC ,$(HOSTCC) $(HOSTCFLAGS) -g -c -o $@ $<, $<)
+
+$(CORE_TEST) : core/test/stubs.o
+
+$(CORE_TEST) : % : %.c
+ $(call Q, HOSTCC ,$(HOSTCC) $(HOSTCFLAGS) -O0 -g -I include -I . -I libfdt -o $@ $< core/test/stubs.o, $<)
+
+$(CORE_TEST_NOSTUB) : % : %.c
+ $(call Q, HOSTCC ,$(HOSTCC) $(HOSTCFLAGS) -O0 -g -I include -I . -I libfdt -o $@ $< , $<)
+
+$(CORE_TEST:%=%-gcov): %-gcov : %.c %
+ $(call Q, HOSTCC ,$(HOSTCC) $(HOSTCFLAGS) $(HOSTGCOVCFLAGS) -I include -I . -I libfdt -lgcov -o $@ $< core/test/stubs.o, $<)
+
+$(CORE_TEST_NOSTUB:%=%-gcov) : %-gcov : %.c %
+ $(call Q, HOSTCC ,$(HOSTCC) $(HOSTCFLAGS) $(HOSTGCOVCFLAGS) -I include -I . -I libfdt -lgcov -o $@ $< , $<)
+
+core/test/run-flash-firmware-versions-gcov-run: core/test/run-flash-firmware-versions-inputs-gcov-run
+
+core/test/run-flash-firmware-versions-inputs-gcov-run: core/test/run-flash-firmware-versions-gcov
+ $(call Q, TEST-COVERAGE , ./core/test/run-flash-firmware-versions-gcov core/test/firmware-versions-input/version-0 > /dev/null, $< version-0)
+ $(call Q, TEST-COVERAGE , ./core/test/run-flash-firmware-versions-gcov core/test/firmware-versions-input/version-1 > /dev/null, $< version-1)
+ $(call Q, TEST-COVERAGE , ./core/test/run-flash-firmware-versions-gcov core/test/firmware-versions-input/version-2 > /dev/null, $< version-2)
+ $(call Q, TEST-COVERAGE , ./core/test/run-flash-firmware-versions-gcov core/test/firmware-versions-input/version-10 > /dev/null, $< version-10)
+ $(call Q, TEST-COVERAGE , ./core/test/run-flash-firmware-versions-gcov core/test/firmware-versions-input/version-11 > /dev/null, $< version-11)
+ $(call Q, TEST-COVERAGE , ./core/test/run-flash-firmware-versions-gcov core/test/firmware-versions-input/version-16 > /dev/null, $< version-16)
+ $(call Q, TEST-COVERAGE , ./core/test/run-flash-firmware-versions-gcov core/test/firmware-versions-input/version-26 > /dev/null, $< version-26)
+ $(call Q, TEST-COVERAGE , ./core/test/run-flash-firmware-versions-gcov core/test/firmware-versions-input/version-27 > /dev/null, $< version-27)
+ $(call Q, TEST-COVERAGE , ./core/test/run-flash-firmware-versions-gcov core/test/firmware-versions-input/version-29 > /dev/null, $< version-29)
+ $(call Q, TEST-COVERAGE , ./core/test/run-flash-firmware-versions-gcov core/test/firmware-versions-input/version-trunc > /dev/null, $< version-trunc)
+ $(call Q, TEST-COVERAGE , ./core/test/run-flash-firmware-versions-gcov core/test/firmware-versions-input/version-long > /dev/null, $< version-long)
+ $(call Q, TEST-COVERAGE , ./core/test/run-flash-firmware-versions-gcov core/test/firmware-versions-input/version-nodash > /dev/null, $< version-nodash)
+
+
+-include $(wildcard core/test/*.d)
+
+clean: core-test-clean
+
+core-test-clean:
+ $(RM) -f core/test/*.[od] $(CORE_TEST) $(CORE_TEST:%=%-gcov)
+ $(RM) -f $(CORE_TEST_NOSTUB) $(CORE_TEST_NOSTUB:%=%-gcov)
+ $(RM) -f *.gcda *.gcno skiboot.info
+ $(RM) -rf coverage-report
diff --git a/roms/skiboot/core/test/dummy-cpu.h b/roms/skiboot/core/test/dummy-cpu.h
new file mode 100644
index 000000000..64fb71bce
--- /dev/null
+++ b/roms/skiboot/core/test/dummy-cpu.h
@@ -0,0 +1,35 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Copyright 2013-2018 IBM Corp.
+ *
+ * A dummy cpu.h for tests.
+ * We don't want to include the real skiboot cpu.h, it's PPC-specific
+ */
+
+#ifndef __CPU_H
+#define __CPU_H
+
+#include <stdint.h>
+#include <stdbool.h>
+
+static unsigned int cpu_max_pir = 1;
+struct cpu_thread {
+ unsigned int chip_id;
+};
+struct cpu_job *__cpu_queue_job(struct cpu_thread *cpu,
+ const char *name,
+ void (*func)(void *data), void *data,
+ bool no_return);
+static inline struct cpu_job *cpu_queue_job(struct cpu_thread *cpu,
+ const char *name,
+ void (*func)(void *data),
+ void *data)
+{
+ return __cpu_queue_job(cpu, name, func, data, false);
+}
+void cpu_wait_job(struct cpu_job *job, bool free_it);
+void cpu_process_local_jobs(void);
+struct cpu_job *cpu_queue_job_on_node(uint32_t chip_id,
+ const char *name,
+ void (*func)(void *data), void *data);
+#endif /* __CPU_H */
diff --git a/roms/skiboot/core/test/firmware-versions-input/version-0 b/roms/skiboot/core/test/firmware-versions-input/version-0
new file mode 100644
index 000000000..2ab241af5
--- /dev/null
+++ b/roms/skiboot/core/test/firmware-versions-input/version-0
Binary files differ
diff --git a/roms/skiboot/core/test/firmware-versions-input/version-1 b/roms/skiboot/core/test/firmware-versions-input/version-1
new file mode 100644
index 000000000..746327a8b
--- /dev/null
+++ b/roms/skiboot/core/test/firmware-versions-input/version-1
Binary files differ
diff --git a/roms/skiboot/core/test/firmware-versions-input/version-10 b/roms/skiboot/core/test/firmware-versions-input/version-10
new file mode 100644
index 000000000..013af6089
--- /dev/null
+++ b/roms/skiboot/core/test/firmware-versions-input/version-10
Binary files differ
diff --git a/roms/skiboot/core/test/firmware-versions-input/version-11 b/roms/skiboot/core/test/firmware-versions-input/version-11
new file mode 100644
index 000000000..55e835321
--- /dev/null
+++ b/roms/skiboot/core/test/firmware-versions-input/version-11
Binary files differ
diff --git a/roms/skiboot/core/test/firmware-versions-input/version-16 b/roms/skiboot/core/test/firmware-versions-input/version-16
new file mode 100644
index 000000000..8906af4e9
--- /dev/null
+++ b/roms/skiboot/core/test/firmware-versions-input/version-16
Binary files differ
diff --git a/roms/skiboot/core/test/firmware-versions-input/version-2 b/roms/skiboot/core/test/firmware-versions-input/version-2
new file mode 100644
index 000000000..f012ffd23
--- /dev/null
+++ b/roms/skiboot/core/test/firmware-versions-input/version-2
Binary files differ
diff --git a/roms/skiboot/core/test/firmware-versions-input/version-26 b/roms/skiboot/core/test/firmware-versions-input/version-26
new file mode 100644
index 000000000..adfd5bbcf
--- /dev/null
+++ b/roms/skiboot/core/test/firmware-versions-input/version-26
Binary files differ
diff --git a/roms/skiboot/core/test/firmware-versions-input/version-27 b/roms/skiboot/core/test/firmware-versions-input/version-27
new file mode 100644
index 000000000..d7ade9863
--- /dev/null
+++ b/roms/skiboot/core/test/firmware-versions-input/version-27
Binary files differ
diff --git a/roms/skiboot/core/test/firmware-versions-input/version-29 b/roms/skiboot/core/test/firmware-versions-input/version-29
new file mode 100644
index 000000000..b1476a3a5
--- /dev/null
+++ b/roms/skiboot/core/test/firmware-versions-input/version-29
Binary files differ
diff --git a/roms/skiboot/core/test/firmware-versions-input/version-long b/roms/skiboot/core/test/firmware-versions-input/version-long
new file mode 100644
index 000000000..f814fa6f4
--- /dev/null
+++ b/roms/skiboot/core/test/firmware-versions-input/version-long
@@ -0,0 +1,2 @@
+open-power-whatever-v2.0-10-g1cec21d-dirty
+ Well, I wonder what a short essay here will mean for parsing everything. I hope it is all okay, but we want to get greater than 80 chars.
diff --git a/roms/skiboot/core/test/firmware-versions-input/version-nodash b/roms/skiboot/core/test/firmware-versions-input/version-nodash
new file mode 100644
index 000000000..139aa9350
--- /dev/null
+++ b/roms/skiboot/core/test/firmware-versions-input/version-nodash
@@ -0,0 +1,2 @@
+no_dashes_in_version
+ this_is_wrong
diff --git a/roms/skiboot/core/test/firmware-versions-input/version-trunc b/roms/skiboot/core/test/firmware-versions-input/version-trunc
new file mode 100644
index 000000000..c9c92a01f
--- /dev/null
+++ b/roms/skiboot/core/test/firmware-versions-input/version-trunc
@@ -0,0 +1,2 @@
+open-power-SUPERMICRO-P8DTU-V2.00.GA2-20161028
+ op
diff --git a/roms/skiboot/core/test/run-api-test.c b/roms/skiboot/core/test/run-api-test.c
new file mode 100644
index 000000000..35e8135d4
--- /dev/null
+++ b/roms/skiboot/core/test/run-api-test.c
@@ -0,0 +1,40 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Copyright 2014-2016 IBM Corp.
+ *
+ * For now it just validates that addresses passed are sane and tests the
+ * wrapper that validates addresses.
+ *
+ * Copyright 2016 IBM Corp.
+ */
+
+#include <config.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <stdarg.h>
+#include <compiler.h>
+#include <opal-internal.h>
+
+#define __TEST__
+unsigned long top_of_ram; /* Fake it here */
+int main(void)
+{
+ unsigned long addr = 0xd000000000000000;
+
+ top_of_ram = 16ULL * 1024 * 1024 * 1024; /* 16 GB */
+ assert(opal_addr_valid((void *)addr) == false);
+
+ addr = 0xc000000000000000;
+ assert(opal_addr_valid((void *)addr) == true);
+
+ addr = 0x0;
+ assert(opal_addr_valid((void *)addr) == true);
+
+ addr = ~0;
+ assert(opal_addr_valid((void *)addr) == false);
+
+ addr = top_of_ram + 1;
+ assert(opal_addr_valid((void *)addr) == false);
+ return 0;
+}
diff --git a/roms/skiboot/core/test/run-bitmap.c b/roms/skiboot/core/test/run-bitmap.c
new file mode 100644
index 000000000..e474915b8
--- /dev/null
+++ b/roms/skiboot/core/test/run-bitmap.c
@@ -0,0 +1,80 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Copyright 2017 IBM Corp.
+ */
+
+#include "../bitmap.c"
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+
+int main(void)
+{
+ bitmap_t *map = malloc(sizeof(bitmap_elem_t));
+ int i;
+ memset(map, 0, sizeof(bitmap_elem_t));
+
+ assert(BITMAP_ELEMS(16) == (BITMAP_ELEMS(8)));
+ assert(BITMAP_ELEMS(128) == (BITMAP_ELEMS(64)*2));
+
+ assert(BITMAP_BYTES(64) == 8);
+ assert(BITMAP_BYTES(128) == 16);
+
+ assert(BITMAP_BIT(1) == 0x1);
+ assert(BITMAP_BIT(2) == 0x2);
+ assert(BITMAP_BIT(3) == 0x3);
+ assert(BITMAP_BIT(8) == 0x8);
+
+ assert(BITMAP_MASK(0) == 0x1);
+ assert(BITMAP_MASK(1) == 0x2);
+ assert(BITMAP_MASK(8) == 0x100);
+ assert(BITMAP_MASK(9) == 0x200);
+
+ assert(BITMAP_ELEM(1) == 0);
+ assert(BITMAP_ELEM(128) == BITMAP_ELEMS(128));
+
+ bitmap_set_bit(*map, 0);
+ assert(*(unsigned long*)map == 0x1);
+ assert(bitmap_tst_bit(*map, 0) == true);
+ bitmap_clr_bit(*map, 0);
+ assert(*(unsigned long*)map == 0x00);
+
+ bitmap_set_bit(*map, 8);
+ assert(*(unsigned long*)map == 0x100);
+ assert(bitmap_tst_bit(*map, 0) == false);
+ assert(bitmap_tst_bit(*map, 1) == false);
+ assert(bitmap_tst_bit(*map, 2) == false);
+ assert(bitmap_tst_bit(*map, 3) == false);
+ assert(bitmap_tst_bit(*map, 4) == false);
+ assert(bitmap_tst_bit(*map, 5) == false);
+ assert(bitmap_tst_bit(*map, 6) == false);
+ assert(bitmap_tst_bit(*map, 7) == false);
+ assert(bitmap_tst_bit(*map, 8) == true);
+ assert(bitmap_tst_bit(*map, 9) == false);
+ assert(bitmap_tst_bit(*map, 10) == false);
+ assert(bitmap_tst_bit(*map, 11) == false);
+ assert(bitmap_tst_bit(*map, 12) == false);
+ assert(bitmap_tst_bit(*map, 13) == false);
+ assert(bitmap_tst_bit(*map, 14) == false);
+ assert(bitmap_tst_bit(*map, 15) == false);
+ assert(bitmap_find_one_bit(*map, 0, 16) == 8);
+ bitmap_clr_bit(*map, 8);
+ assert(bitmap_find_one_bit(*map, 0, 16) == -1);
+ assert(*(unsigned long*)map == 0x00);
+ assert(bitmap_tst_bit(*map, 8) == false);
+
+ bitmap_for_each_zero(*map, 7, i) {
+ bitmap_set_bit(*map, i);
+ }
+
+ for (i = 0; i < 7; i++)
+ assert(bitmap_tst_bit(*map, i) == true);
+
+ assert(bitmap_tst_bit(*map, 8) == false);
+
+
+ free(map);
+
+ return 0;
+}
diff --git a/roms/skiboot/core/test/run-buddy.c b/roms/skiboot/core/test/run-buddy.c
new file mode 100644
index 000000000..8ae26cb6c
--- /dev/null
+++ b/roms/skiboot/core/test/run-buddy.c
@@ -0,0 +1,73 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Copyright 2016-2017 IBM Corp.
+ */
+
+#include <buddy.h>
+#include <assert.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+static void *zalloc(size_t size)
+{
+ return calloc(size, 1);
+}
+
+#include "../buddy.c"
+#include "../bitmap.c"
+
+#define BUDDY_ORDER 8
+
+int main(void)
+{
+ struct buddy *b;
+ int i, a[10];
+
+ b = buddy_create(BUDDY_ORDER);
+ assert(b);
+
+ buddy_reserve(b, 127, 0);
+ buddy_reserve(b, 0, 4);
+ assert(buddy_reserve(b, 0, 4) == false);
+
+ a[0] = buddy_alloc(b, 0);
+ assert(a[0] >= 0);
+ a[1] = buddy_alloc(b, 0);
+ assert(a[1] >= 0);
+ a[2] = buddy_alloc(b, 3);
+ assert(a[2] >= 0);
+ a[3] = buddy_alloc(b, 4);
+ assert(a[3] >= 0);
+ a[4] = buddy_alloc(b, 5);
+ assert(a[4] >= 0);
+ a[5] = buddy_alloc(b, 4);
+ assert(a[5] >= 0);
+ a[6] = buddy_alloc(b, 3);
+ assert(a[6] >= 0);
+ a[7] = buddy_alloc(b, 2);
+ assert(a[7] >= 0);
+ a[8] = buddy_alloc(b, 1);
+ assert(a[8] >= 0);
+ a[9] = buddy_alloc(b, 8);
+ assert(a[9] < 0);
+
+ buddy_free(b, a[0], 0);
+ buddy_free(b, a[8], 1);
+ buddy_free(b, a[1], 0);
+ buddy_free(b, a[7], 2);
+ buddy_free(b, a[2], 3);
+ buddy_free(b, a[6], 3);
+ buddy_free(b, a[3], 4);
+ buddy_free(b, a[5], 4);
+ buddy_free(b, a[4], 5);
+
+ buddy_free(b, 127, 0);
+ buddy_free(b, 0, 4);
+
+ for (i = 2; i < buddy_map_size(b); i++)
+ assert(bitmap_tst_bit(b->map, i));
+ assert(!bitmap_tst_bit(b->map, 1));
+
+ buddy_destroy(b);
+ return 0;
+}
diff --git a/roms/skiboot/core/test/run-console-log-buf-overrun.c b/roms/skiboot/core/test/run-console-log-buf-overrun.c
new file mode 100644
index 000000000..83774c4c9
--- /dev/null
+++ b/roms/skiboot/core/test/run-console-log-buf-overrun.c
@@ -0,0 +1,105 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Copyright 2015-2016 IBM Corp.
+ */
+
+#include <config.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <stdarg.h>
+#include <compiler.h>
+
+unsigned long tb_hz = 512000000;
+
+#define __TEST__
+
+#define CHECK_BUF_ASSERT(buf, str) \
+ assert(memcmp(buf, str, strlen(str)) == 0)
+
+#define CHECK_ASSERT(str) \
+ CHECK_BUF_ASSERT(console_buffer, str)
+
+int huge_tb;
+
+static inline unsigned long mftb(void)
+{
+ /*
+ * return huge value for TB that overrun tmp[16] buffer defined
+ * in print_itoa().
+ */
+ if (huge_tb)
+ return 1223372515963611388;
+ else
+ return 42;
+}
+
+#include "../../libc/include/stdio.h"
+#include "../console-log.c"
+#include "../../libc/stdio/snprintf.c"
+#include "../../libc/stdio/vsnprintf.c"
+
+char console_buffer[4096];
+struct debug_descriptor debug_descriptor;
+
+bool flushed_to_drivers;
+
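+/*
+ * Stub console_write(): capture the formatted output in console_buffer so
+ * the CHECK_ASSERT macros can compare against it.
+ */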
+ssize_t console_write(bool flush_to_drivers, const void *buf, size_t count)
+{
+ flushed_to_drivers = flush_to_drivers;
+ memcpy(console_buffer, buf, count);
+ return count;
+}
+
+int main(void)
+{
+ unsigned long value = 0xffffffffffffffff;
+ char *ptr = console_buffer;
+
+ debug_descriptor.console_log_levels = 0x75;
+
+ /* Test for huge TB value. */
+ huge_tb = 1;
+
+ prlog(PR_EMERG, "Hello World");
+ CHECK_ASSERT("[2389399445.123611388,0] Hello World");
+
+ memset(console_buffer, 0, sizeof(console_buffer));
+
+ /* Test for normal TB with huge unsigned long value */
+ huge_tb = 0;
+
+ prlog(PR_EMERG, "Hello World %lu", value);
+ CHECK_ASSERT("[ 0.000000042,0] Hello World 18446744073709551615");
+
+ printf("Hello World %lu", value);
+ CHECK_ASSERT("[ 0.000000042,5] Hello World 18446744073709551615");
+
+ /*
+ * Test a string of size > 320.
+ *
+ * core/console-log.c:vprlog() formats the message into a buffer[320].
+ * Printing more than 320 bytes exercises that limit; stack corruption
+ * would show up here as a segmentation fault.
+ */
+ prlog(PR_EMERG, "%330s", "Hello World");
+
+ memset(console_buffer, 0, sizeof(console_buffer));
+
+ /*
+ * Test boundary condition.
+ *
+ * Print a string that formats to exactly 320 bytes. The output should be
+ * truncated, with console_buffer[319] == '\0'.
+ */
+ memset(console_buffer, 0, sizeof(console_buffer));
+
+ prlog(PR_EMERG, "%300s", "Hello World");
+ assert(console_buffer[319] == 0);
+
+ /* compare truncated string */
+ ptr += 320 - strlen("Hello World");
+ CHECK_BUF_ASSERT(ptr, "Hello Worl");
+
+ return 0;
+}
diff --git a/roms/skiboot/core/test/run-console-log-pr_fmt.c b/roms/skiboot/core/test/run-console-log-pr_fmt.c
new file mode 100644
index 000000000..457de03fb
--- /dev/null
+++ b/roms/skiboot/core/test/run-console-log-pr_fmt.c
@@ -0,0 +1,63 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Copyright 2015-2016 IBM Corp.
+ */
+
+#include <config.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <stdarg.h>
+
+#define __TEST__
+
+unsigned long tb_hz = 512000000;
+
+static inline unsigned long mftb(void)
+{
+ return 42;
+}
+
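+/*
+ * pr_fmt() is defined before including console-log.c, so every message
+ * logged below is expected to carry the "PREFIX: " prefix.
+ */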
+#define pr_fmt(f) "PREFIX: " f
+#include "../../libc/include/stdio.h"
+#include "../console-log.c"
+#include "../../libc/stdio/snprintf.c"
+#include "../../libc/stdio/vsnprintf.c"
+
+struct debug_descriptor debug_descriptor;
+
+bool flushed_to_drivers;
+char console_buffer[4096];
+
+ssize_t console_write(bool flush_to_drivers, const void *buf, size_t count)
+{
+ flushed_to_drivers = flush_to_drivers;
+ memcpy(console_buffer, buf, count);
+ return count;
+}
+
+int main(void)
+{
+ debug_descriptor.console_log_levels = 0x75;
+
+ prlog(PR_EMERG, "Hello World");
+ assert(strcmp(console_buffer, "[ 0.000000042,0] PREFIX: Hello World") == 0);
+ assert(flushed_to_drivers==true);
+
+ memset(console_buffer, 0, sizeof(console_buffer));
+
+ // Below log level
+ prlog(PR_TRACE, "Hello World");
+ assert(console_buffer[0] == 0);
+
+ // Should not be flushed to console
+ prlog(PR_DEBUG, "Hello World");
+ assert(strcmp(console_buffer, "[ 0.000000042,7] PREFIX: Hello World") == 0);
+ assert(flushed_to_drivers==false);
+
+ printf("Hello World");
+ assert(strcmp(console_buffer, "[ 0.000000042,5] PREFIX: Hello World") == 0);
+ assert(flushed_to_drivers==true);
+
+ return 0;
+}
diff --git a/roms/skiboot/core/test/run-console-log.c b/roms/skiboot/core/test/run-console-log.c
new file mode 100644
index 000000000..bec281b6e
--- /dev/null
+++ b/roms/skiboot/core/test/run-console-log.c
@@ -0,0 +1,63 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Copyright 2014-2016 IBM Corp.
+ */
+
+#include <config.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <stdarg.h>
+
+#define __TEST__
+
+#define _printf printf
+
+unsigned long tb_hz = 512000000;
+
+static inline unsigned long mftb(void)
+{
+ return 42;
+}
+
+int _printf(const char* fmt, ...);
+
+#include "../console-log.c"
+
+struct debug_descriptor debug_descriptor;
+
+bool flushed_to_drivers;
+char console_buffer[4096];
+
+ssize_t console_write(bool flush_to_drivers, const void *buf, size_t count)
+{
+ flushed_to_drivers = flush_to_drivers;
+ memcpy(console_buffer, buf, count);
+ return count;
+}
+
+int main(void)
+{
+ debug_descriptor.console_log_levels = 0x75;
+
+ prlog(PR_EMERG, "Hello World");
+ assert(strcmp(console_buffer, "[ 0.000000042,0] Hello World") == 0);
+ assert(flushed_to_drivers==true);
+
+ memset(console_buffer, 0, sizeof(console_buffer));
+
+ // Below log level
+ prlog(PR_TRACE, "Hello World");
+ assert(console_buffer[0] == 0);
+
+ // Should not be flushed to console
+ prlog(PR_DEBUG, "Hello World");
+ assert(strcmp(console_buffer, "[ 0.000000042,7] Hello World") == 0);
+ assert(flushed_to_drivers==false);
+
+ printf("Hello World");
+ assert(strcmp(console_buffer, "[ 0.000000042,5] Hello World") == 0);
+ assert(flushed_to_drivers==true);
+
+ return 0;
+}
diff --git a/roms/skiboot/core/test/run-cpufeatures.c b/roms/skiboot/core/test/run-cpufeatures.c
new file mode 100644
index 000000000..bb89b2573
--- /dev/null
+++ b/roms/skiboot/core/test/run-cpufeatures.c
@@ -0,0 +1,144 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Copyright 2019 IBM Corp.
+ */
+
+#include <skiboot.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+
+/* Override this for testing. */
+#define is_rodata(p) fake_is_rodata(p)
+
+char __rodata_start[16];
+#define __rodata_end (__rodata_start + sizeof(__rodata_start))
+
+static inline bool fake_is_rodata(const void *p)
+{
+ return ((char *)p >= __rodata_start && (char *)p < __rodata_end);
+}
+
+#define zalloc(bytes) calloc((bytes), 1)
+
+#include "../device.c"
+#include <assert.h>
+#include "../../test/dt_common.c"
+
+#define __TEST__
+
+static inline unsigned long mfspr(unsigned int spr);
+
+#include <ccan/str/str.c>
+
+#include "../cpufeatures.c"
+
+static unsigned long fake_pvr = PVR_TYPE_P8;
+
+static inline unsigned long mfspr(unsigned int spr)
+{
+ assert(spr == SPR_PVR);
+ return fake_pvr;
+}
+
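+/*
+ * Each block in main() sets fake_pvr to a (PVR type << 16 | DD level) value
+ * and checks which ibm,powerpc-cpu-features nodes get created for that chip.
+ */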
+int main(void)
+{
+ struct dt_node *dt_root;
+
+ dt_root = dt_new_root("");
+ dt_add_cpufeatures(dt_root);
+ dump_dt(dt_root, 0, true);
+ dt_free(dt_root);
+
+ fake_pvr = (PVR_TYPE_P8E << 16) | 0x100; // P8E DD1.0
+ dt_root = dt_new_root("");
+ dt_add_cpufeatures(dt_root);
+ dump_dt(dt_root, 0, false);
+ assert(dt_find_by_path(dt_root, "cpus/ibm,powerpc-cpu-features/mmu-radix") == 0);
+ assert(dt_find_by_path(dt_root, "cpus/ibm,powerpc-cpu-features/tm-suspend-hypervisor-assist") == 0);
+ assert(dt_find_by_path(dt_root, "cpus/ibm,powerpc-cpu-features/tm-suspend-xer-so-bug") == 0);
+ dt_free(dt_root);
+
+ fake_pvr = (PVR_TYPE_P8E << 16) | 0x200; // P8E DD2.0
+ dt_root = dt_new_root("");
+ dt_add_cpufeatures(dt_root);
+ dump_dt(dt_root, 0, false);
+ assert(dt_find_by_path(dt_root, "cpus/ibm,powerpc-cpu-features/mmu-radix") == 0);
+ assert(dt_find_by_path(dt_root, "cpus/ibm,powerpc-cpu-features/tm-suspend-hypervisor-assist") == 0);
+ assert(dt_find_by_path(dt_root, "cpus/ibm,powerpc-cpu-features/tm-suspend-xer-so-bug") == 0);
+ dt_free(dt_root);
+
+ fake_pvr = (PVR_TYPE_P8 << 16) | 0x100; // P8 DD1.0
+ dt_root = dt_new_root("");
+ dt_add_cpufeatures(dt_root);
+ dump_dt(dt_root, 0, false);
+ assert(dt_find_by_path(dt_root, "cpus/ibm,powerpc-cpu-features/mmu-radix") == 0);
+ assert(dt_find_by_path(dt_root, "cpus/ibm,powerpc-cpu-features/tm-suspend-hypervisor-assist") == 0);
+ assert(dt_find_by_path(dt_root, "cpus/ibm,powerpc-cpu-features/tm-suspend-xer-so-bug") == 0);
+ dt_free(dt_root);
+
+ fake_pvr = (PVR_TYPE_P8 << 16) | 0x200; // P8 DD2.0
+ dt_root = dt_new_root("");
+ dt_add_cpufeatures(dt_root);
+ dump_dt(dt_root, 0, false);
+ assert(dt_find_by_path(dt_root, "cpus/ibm,powerpc-cpu-features/mmu-radix") == 0);
+ assert(dt_find_by_path(dt_root, "cpus/ibm,powerpc-cpu-features/tm-suspend-hypervisor-assist") == 0);
+ assert(dt_find_by_path(dt_root, "cpus/ibm,powerpc-cpu-features/tm-suspend-xer-so-bug") == 0);
+ dt_free(dt_root);
+
+ fake_pvr = (PVR_TYPE_P8NVL << 16) | 0x100; // P8NVL DD1.0
+ dt_root = dt_new_root("");
+ dt_add_cpufeatures(dt_root);
+ dump_dt(dt_root, 0, false);
+ assert(dt_find_by_path(dt_root, "cpus/ibm,powerpc-cpu-features/mmu-radix") == 0);
+ assert(dt_find_by_path(dt_root, "cpus/ibm,powerpc-cpu-features/tm-suspend-hypervisor-assist") == 0);
+ assert(dt_find_by_path(dt_root, "cpus/ibm,powerpc-cpu-features/tm-suspend-xer-so-bug") == 0);
+ dt_free(dt_root);
+
+ fake_pvr = (PVR_TYPE_P9 << 16) | 0x200; // P9 DD2.0
+ dt_root = dt_new_root("");
+ dt_add_cpufeatures(dt_root);
+ dump_dt(dt_root, 0, false);
+ assert(dt_find_by_path(dt_root, "cpus/ibm,powerpc-cpu-features/mmu-radix"));
+ assert(dt_find_by_path(dt_root, "cpus/ibm,powerpc-cpu-features/tm-suspend-hypervisor-assist") == 0);
+ assert(dt_find_by_path(dt_root, "cpus/ibm,powerpc-cpu-features/tm-suspend-xer-so-bug") == 0);
+ dt_free(dt_root);
+
+ fake_pvr = (PVR_TYPE_P9 << 16) | 0x201; // P9 DD2.1
+ dt_root = dt_new_root("");
+ dt_add_cpufeatures(dt_root);
+ dump_dt(dt_root, 0, false);
+ assert(dt_find_by_path(dt_root, "cpus/ibm,powerpc-cpu-features/mmu-radix"));
+ assert(dt_find_by_path(dt_root, "cpus/ibm,powerpc-cpu-features/tm-suspend-hypervisor-assist") == 0);
+ assert(dt_find_by_path(dt_root, "cpus/ibm,powerpc-cpu-features/tm-suspend-xer-so-bug") == 0);
+ dt_free(dt_root);
+
+ fake_pvr = (PVR_TYPE_P9 << 16) | 0x202; // P9 DD2.2
+ dt_root = dt_new_root("");
+ dt_add_cpufeatures(dt_root);
+ dump_dt(dt_root, 0, false);
+ assert(dt_find_by_path(dt_root, "cpus/ibm,powerpc-cpu-features/mmu-radix"));
+ assert(dt_find_by_path(dt_root, "cpus/ibm,powerpc-cpu-features/tm-suspend-hypervisor-assist") != 0);
+ assert(dt_find_by_path(dt_root, "cpus/ibm,powerpc-cpu-features/tm-suspend-xer-so-bug") != 0);
+ dt_free(dt_root);
+
+ fake_pvr = (PVR_TYPE_P9 << 16) | 0x203; // P9 DD2.3
+ dt_root = dt_new_root("");
+ dt_add_cpufeatures(dt_root);
+ dump_dt(dt_root, 0, false);
+ assert(dt_find_by_path(dt_root, "cpus/ibm,powerpc-cpu-features/mmu-radix"));
+ assert(dt_find_by_path(dt_root, "cpus/ibm,powerpc-cpu-features/tm-suspend-hypervisor-assist") != 0);
+ assert(dt_find_by_path(dt_root, "cpus/ibm,powerpc-cpu-features/tm-suspend-xer-so-bug") == 0);
+ dt_free(dt_root);
+
+ fake_pvr = (PVR_TYPE_P9P << 16) | 0x100; // P9P DD1.0
+ dt_root = dt_new_root("");
+ dt_add_cpufeatures(dt_root);
+ dump_dt(dt_root, 0, false);
+ assert(dt_find_by_path(dt_root, "cpus/ibm,powerpc-cpu-features/mmu-radix"));
+ assert(dt_find_by_path(dt_root, "cpus/ibm,powerpc-cpu-features/tm-suspend-hypervisor-assist") != 0);
+ assert(dt_find_by_path(dt_root, "cpus/ibm,powerpc-cpu-features/tm-suspend-xer-so-bug") == 0);
+ dt_free(dt_root);
+
+ exit(EXIT_SUCCESS);
+}
diff --git a/roms/skiboot/core/test/run-device.c b/roms/skiboot/core/test/run-device.c
new file mode 100644
index 000000000..4a12382bb
--- /dev/null
+++ b/roms/skiboot/core/test/run-device.c
@@ -0,0 +1,471 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Copyright 2012-2018 IBM Corp.
+ */
+
+#include <skiboot.h>
+#include <stdlib.h>
+
+/* Override this for testing. */
+#define is_rodata(p) fake_is_rodata(p)
+
+char __rodata_start[16];
+#define __rodata_end (__rodata_start + sizeof(__rodata_start))
+
+static inline bool fake_is_rodata(const void *p)
+{
+ return ((char *)p >= __rodata_start && (char *)p < __rodata_end);
+}
+
+#define zalloc(bytes) calloc((bytes), 1)
+
+#include "../device.c"
+#include <assert.h>
+#include "../../test/dt_common.c"
+const char *prop_to_fix[] = {"something", NULL};
+const char **props_to_fix(struct dt_node *node);
+
+static void check_path(const struct dt_node *node, const char * expected_path)
+{
+ char * path;
+ path = dt_get_path(node);
+ if (strcmp(path, expected_path) != 0) {
+ printf("check_path: expected %s, got %s\n", expected_path, path);
+ }
+ assert(strcmp(path, expected_path) == 0);
+ free(path);
+}
+
+/* constructs a random, nodes-only device tree */
+static void build_tree(int max_depth, int min_depth, struct dt_node *parent)
+{
+ char name[64];
+ int i;
+
+ for (i = 0; i < max_depth; i++) {
+ struct dt_node *new;
+
+ snprintf(name, sizeof name, "prefix@%.8x", rand());
+
+ new = dt_new(parent, name);
+
+ if (max_depth > min_depth)
+ build_tree(max_depth - 1, min_depth, new);
+ }
+}
+
+static bool is_sorted(const struct dt_node *root)
+{
+ struct dt_node *end = list_tail(&root->children, struct dt_node, list);
+ struct dt_node *node;
+
+ dt_for_each_child(root, node) {
+ struct dt_node *next =
+ list_entry(node->list.next, struct dt_node, list);
+
+ /* current node must be "less than" the next node */
+ if (node != end && dt_cmp_subnodes(node, next) != -1) {
+ printf("nodes '%s' and '%s' out of order\n",
+ node->name, next->name);
+
+ return false;
+ }
+
+ if (!is_sorted(node))
+ return false;
+ }
+
+ return true;
+}
+
+/* handler for the phandle fixup test */
+const char **props_to_fix(struct dt_node *node)
+{
+ const struct dt_property *prop;
+
+ prop = dt_find_property(node, "something");
+ if (prop)
+ return prop_to_fix;
+
+ return NULL;
+}
+
+int main(void)
+{
+ struct dt_node *root, *other_root, *c1, *c2, *c2_c, *gc1, *gc2, *gc3, *ggc1, *ggc2;
+ struct dt_node *addrs, *addr1, *addr2;
+ struct dt_node *i, *subtree, *ev1, *ut1, *ut2;
+ const struct dt_property *p;
+ struct dt_property *p2;
+ unsigned int n;
+ char *s;
+ size_t sz;
+ u32 phandle, ev1_ph, new_prop_ph;
+
+ root = dt_new_root("");
+ assert(!list_top(&root->properties, struct dt_property, list));
+ check_path(root, "/");
+
+ c1 = dt_new_check(root, "c1");
+ assert(!list_top(&c1->properties, struct dt_property, list));
+ check_path(c1, "/c1");
+ assert(dt_find_by_name(root, "c1") == c1);
+ assert(dt_find_by_path(root, "/c1") == c1);
+ assert(dt_new(root, "c1") == NULL);
+
+ c2 = dt_new(root, "c2");
+ c2_c = dt_new_check(root, "c2");
+ assert(c2 == c2_c);
+ assert(!list_top(&c2->properties, struct dt_property, list));
+ check_path(c2, "/c2");
+ assert(dt_find_by_name(root, "c2") == c2);
+ assert(dt_find_by_path(root, "/c2") == c2);
+
+ gc1 = dt_new(c1, "gc1");
+ assert(!list_top(&gc1->properties, struct dt_property, list));
+ check_path(gc1, "/c1/gc1");
+ assert(dt_find_by_name(root, "gc1") == gc1);
+ assert(dt_find_by_path(root, "/c1/gc1") == gc1);
+
+ gc2 = dt_new(c1, "gc2");
+ assert(!list_top(&gc2->properties, struct dt_property, list));
+ check_path(gc2, "/c1/gc2");
+ assert(dt_find_by_name(root, "gc2") == gc2);
+ assert(dt_find_by_path(root, "/c1/gc2") == gc2);
+
+ gc3 = dt_new(c1, "gc3");
+ assert(!list_top(&gc3->properties, struct dt_property, list));
+ check_path(gc3, "/c1/gc3");
+ assert(dt_find_by_name(root, "gc3") == gc3);
+ assert(dt_find_by_path(root, "/c1/gc3") == gc3);
+
+ ggc1 = dt_new(gc1, "ggc1");
+ assert(!list_top(&ggc1->properties, struct dt_property, list));
+ check_path(ggc1, "/c1/gc1/ggc1");
+ assert(dt_find_by_name(root, "ggc1") == ggc1);
+ assert(dt_find_by_path(root, "/c1/gc1/ggc1") == ggc1);
+
+ addrs = dt_new(root, "addrs");
+ assert(!list_top(&addrs->properties, struct dt_property, list));
+ check_path(addrs, "/addrs");
+ assert(dt_find_by_name(root, "addrs") == addrs);
+ assert(dt_find_by_path(root, "/addrs") == addrs);
+
+ addr1 = dt_new_addr(addrs, "addr", 0x1337);
+ assert(!list_top(&addr1->properties, struct dt_property, list));
+ check_path(addr1, "/addrs/addr@1337");
+ assert(dt_find_by_name(root, "addr@1337") == addr1);
+ assert(dt_find_by_name_addr(root, "addr", 0x1337) == addr1);
+ assert(dt_find_by_path(root, "/addrs/addr@1337") == addr1);
+ assert(dt_new_addr(addrs, "addr", 0x1337) == NULL);
+
+ addr2 = dt_new_2addr(addrs, "2addr", 0xdead, 0xbeef);
+ assert(!list_top(&addr2->properties, struct dt_property, list));
+ check_path(addr2, "/addrs/2addr@dead,beef");
+ assert(dt_find_by_name(root, "2addr@dead,beef") == addr2);
+ assert(dt_find_by_path(root, "/addrs/2addr@dead,beef") == addr2);
+ assert(dt_new_2addr(addrs, "2addr", 0xdead, 0xbeef) == NULL);
+
+ /* Test walking the tree, checking and setting values */
+ for (n = 0, i = dt_first(root); i; i = dt_next(root, i), n++) {
+ assert(!list_top(&i->properties, struct dt_property, list));
+ dt_add_property_cells(i, "visited", 1);
+ }
+ assert(n == 9);
+
+ for (n = 0, i = dt_first(root); i; i = dt_next(root, i), n++) {
+ p = list_top(&i->properties, struct dt_property, list);
+ assert(strcmp(p->name, "visited") == 0);
+ assert(p->len == sizeof(u32));
+ assert(fdt32_to_cpu(*(u32 *)p->prop) == 1);
+ }
+ assert(n == 9);
+
+ /* Test cells */
+ dt_add_property_cells(c1, "some-property", 1, 2, 3);
+ p = dt_find_property(c1, "some-property");
+ assert(p);
+ assert(strcmp(p->name, "some-property") == 0);
+ assert(p->len == sizeof(u32) * 3);
+ assert(fdt32_to_cpu(*(u32 *)p->prop) == 1);
+ assert(dt_prop_get_cell(c1, "some-property", 0) == 1);
+ assert(fdt32_to_cpu(*((u32 *)p->prop + 1)) == 2);
+ assert(dt_prop_get_cell(c1, "some-property", 1) == 2);
+ assert(fdt32_to_cpu(*((u32 *)p->prop + 2)) == 3);
+ assert(dt_prop_get_cell_def(c1, "some-property", 2, 42) == 3);
+
+ assert(dt_prop_get_cell_def(c1, "not-a-property", 2, 42) == 42);
+
+ /* Test u64s */
+ dt_add_property_u64s(c2, "some-property", (2LL << 33), (3LL << 33), (4LL << 33));
+ p = dt_find_property(c2, "some-property");
+ assert(p);
+ assert(p->len == sizeof(u64) * 3);
+ assert(fdt64_to_cpu(*(u64 *)p->prop) == (2LL << 33));
+ assert(fdt64_to_cpu(*((u64 *)p->prop + 1)) == (3LL << 33));
+ assert(fdt64_to_cpu(*((u64 *)p->prop + 2)) == (4LL << 33));
+
+ /* Test u32/u64 get defaults */
+ assert(dt_prop_get_u32_def(c1, "u32", 42) == 42);
+ dt_add_property_cells(c1, "u32", 1337);
+ assert(dt_prop_get_u32_def(c1, "u32", 42) == 1337);
+ assert(dt_prop_get_u32(c1, "u32") == 1337);
+
+ assert(dt_prop_get_u64_def(c1, "u64", (42LL << 42)) == (42LL << 42));
+ dt_add_property_u64s(c1, "u64", (1337LL << 42));
+ assert(dt_prop_get_u64_def(c1, "u64", (42LL << 42)) == (1337LL << 42));
+ assert(dt_prop_get_u64(c1, "u64") == (1337LL << 42));
+
+ /* Test freeing a single node */
+ assert(!list_empty(&gc1->children));
+ dt_free(ggc1);
+ assert(list_empty(&gc1->children));
+
+ /* Test rodata logic. */
+ assert(!is_rodata("hello"));
+ assert(is_rodata(__rodata_start));
+ strcpy(__rodata_start, "name");
+ ggc1 = dt_new(root, __rodata_start);
+ assert(ggc1->name == __rodata_start);
+
+ /* Test string node. */
+ dt_add_property_string(ggc1, "somestring", "someval");
+ assert(dt_has_node_property(ggc1, "somestring", "someval"));
+ assert(!dt_has_node_property(ggc1, "somestrin", "someval"));
+ assert(!dt_has_node_property(ggc1, "somestring", "someva"));
+ assert(!dt_has_node_property(ggc1, "somestring", "somevale"));
+
+ /* Test nstr, which allows for non-null-terminated inputs */
+ dt_add_property_nstr(ggc1, "nstring", "somevalue_long", 7);
+ assert(dt_has_node_property(ggc1, "nstring", "someval"));
+ assert(!dt_has_node_property(ggc1, "nstring", "someva"));
+ assert(!dt_has_node_property(ggc1, "nstring", "somevalue_long"));
+
+ /* Test multiple strings */
+ dt_add_property_strings(ggc1, "somestrings",
+ "These", "are", "strings!");
+ p = dt_find_property(ggc1, "somestrings");
+ assert(p);
+ assert(p->len == sizeof(char) * (6 + 4 + 9));
+ s = (char *)p->prop;
+ assert(strcmp(s, "These") == 0);
+ assert(strlen(s) == 5);
+ s += 6;
+ assert(strcmp(s, "are") == 0);
+ assert(strlen(s) == 3);
+ s += 4;
+ assert(strcmp(s, "strings!") == 0);
+ assert(strlen(s) == 8);
+ s += 9;
+ assert(s == (char *)p->prop + p->len);
+ assert(dt_prop_find_string(p, "These"));
+ /* dt_prop_find_string is case insensitive */
+ assert(dt_prop_find_string(p, "ARE"));
+ assert(!dt_prop_find_string(p, "integers!"));
+ /* And always returns false for NULL properties */
+ assert(!dt_prop_find_string(NULL, "anything!"));
+
+ /* Test more get/get_def varieties */
+ assert(dt_prop_get_def(c1, "does-not-exist", NULL) == NULL);
+ sz = 0xbad;
+ assert(dt_prop_get_def_size(c1, "does-not-exist", NULL, &sz) == NULL);
+ assert(sz == 0);
+ dt_add_property_string(c1, "another-property", "xyzzy");
+ assert(dt_prop_get_def(c1, "another-property", NULL) != NULL);
+ assert(strcmp(dt_prop_get(c1, "another-property"), "xyzzy") == 0);
+ n = 0xbad;
+ assert(dt_prop_get_def_size(c1, "another-property", NULL, &sz) != NULL);
+ assert(sz == strlen("xyzzy") + 1);
+
+ /* Test resizing property. */
+ p = p2 = __dt_find_property(c1, "some-property");
+ assert(p);
+ n = p2->len;
+ while (p2 == p) {
+ n *= 2;
+ dt_resize_property(&p2, n);
+ }
+
+ assert(dt_find_property(c1, "some-property") == p2);
+ list_check(&c1->properties, "properties after resizing");
+
+ dt_del_property(c1, p2);
+ list_check(&c1->properties, "properties after delete");
+
+ /* No leaks for valgrind! */
+ dt_free(root);
+
+ /* Test compatible and chip id. */
+ root = dt_new_root("");
+
+ c1 = dt_new(root, "chip1");
+ dt_add_property_cells(c1, "ibm,chip-id", 0xcafe);
+ assert(dt_get_chip_id(c1) == 0xcafe);
+ dt_add_property_strings(c1, "compatible",
+ "specific-fake-chip",
+ "generic-fake-chip");
+ assert(dt_node_is_compatible(c1, "specific-fake-chip"));
+ assert(dt_node_is_compatible(c1, "generic-fake-chip"));
+
+ c2 = dt_new(root, "chip2");
+ dt_add_property_cells(c2, "ibm,chip-id", 0xbeef);
+ assert(dt_get_chip_id(c2) == 0xbeef);
+ dt_add_property_strings(c2, "compatible",
+ "specific-fake-bus",
+ "generic-fake-bus");
+
+ gc1 = dt_new(c1, "coprocessor1");
+ dt_add_property_strings(gc1, "compatible",
+ "specific-fake-coprocessor");
+ gc2 = dt_new(gc1, "coprocessor2");
+ dt_add_property_strings(gc2, "compatible",
+ "specific-fake-coprocessor");
+ gc3 = dt_new(c1, "coprocessor3");
+ dt_add_property_strings(gc3, "compatible",
+ "specific-fake-coprocessor");
+
+
+ assert(dt_find_compatible_node(root, NULL, "generic-fake-bus") == c2);
+ assert(dt_find_compatible_node(root, c2, "generic-fake-bus") == NULL);
+
+ /* we can find all compatible nodes */
+ assert(dt_find_compatible_node(c1, NULL, "specific-fake-coprocessor") == gc1);
+ assert(dt_find_compatible_node(c1, gc1, "specific-fake-coprocessor") == gc2);
+ assert(dt_find_compatible_node(c1, gc2, "specific-fake-coprocessor") == gc3);
+ assert(dt_find_compatible_node(c1, gc3, "specific-fake-coprocessor") == NULL);
+ assert(dt_find_compatible_node(root, NULL, "specific-fake-coprocessor") == gc1);
+ assert(dt_find_compatible_node(root, gc1, "specific-fake-coprocessor") == gc2);
+ assert(dt_find_compatible_node(root, gc2, "specific-fake-coprocessor") == gc3);
+ assert(dt_find_compatible_node(root, gc3, "specific-fake-coprocessor") == NULL);
+
+ /* we can find the coprocessor once on the cpu */
+ assert(dt_find_compatible_node_on_chip(root,
+ NULL,
+ "specific-fake-coprocessor",
+ 0xcafe) == gc1);
+ assert(dt_find_compatible_node_on_chip(root,
+ gc1,
+ "specific-fake-coprocessor",
+ 0xcafe) == gc2);
+ assert(dt_find_compatible_node_on_chip(root,
+ gc2,
+ "specific-fake-coprocessor",
+ 0xcafe) == gc3);
+ assert(dt_find_compatible_node_on_chip(root,
+ gc3,
+ "specific-fake-coprocessor",
+ 0xcafe) == NULL);
+
+ /* we can't find the coprocessor on the bus */
+ assert(dt_find_compatible_node_on_chip(root,
+ NULL,
+ "specific-fake-coprocessor",
+ 0xbeef) == NULL);
+
+ /* Test phandles. We override the automatically generated one. */
+ phandle = 0xf00;
+ dt_add_property(gc3, "phandle", (const void *)&phandle, 4);
+ assert(last_phandle == 0xf00);
+ assert(dt_find_by_phandle(root, 0xf00) == gc3);
+ assert(dt_find_by_phandle(root, 0xf0f) == NULL);
+
+ dt_free(root);
+
+ /* basic sorting */
+ root = dt_new_root("rewt");
+ dt_new(root, "a@1");
+ dt_new(root, "a@2");
+ dt_new(root, "a@3");
+ dt_new(root, "a@4");
+ dt_new(root, "b@4");
+ dt_new(root, "c@4");
+
+ assert(is_sorted(root));
+
+ /* Now test dt_attach_root */
+ other_root = dt_new_root("other_root");
+ dt_new(other_root, "d@1");
+
+ assert(dt_attach_root(root, other_root));
+ other_root = dt_new_root("other_root");
+ assert(!dt_attach_root(root, other_root));
+ dt_free(root);
+
+ /* Test child node sorting */
+ root = dt_new_root("test root");
+ build_tree(5, 3, root);
+
+ if (!is_sorted(root)) {
+ dump_dt(root, 1, false);
+ }
+ assert(is_sorted(root));
+
+ dt_free(root);
+
+ /* check dt_translate_address */
+
+ /* NB: the root bus has two address cells */
+ root = dt_new_root("");
+
+ c1 = dt_new_addr(root, "some-32bit-bus", 0x80000000);
+ dt_add_property_cells(c1, "#address-cells", 1);
+ dt_add_property_cells(c1, "#size-cells", 1);
+ dt_add_property_cells(c1, "ranges", 0x0, 0x8, 0x0, 0x1000);
+
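+ /*
+ * The ranges entry maps child address 0x0 to parent address 0x800000000,
+ * so the child reg 0x500 should translate to 0x800000500.
+ */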
+ gc1 = dt_new_addr(c1, "test", 0x0500);
+ dt_add_property_cells(gc1, "reg", 0x0500, 0x10);
+
+ assert(dt_translate_address(gc1, 0, NULL) == 0x800000500ul);
+
+ /* try three level translation */
+
+ gc2 = dt_new_addr(c1, "another-32bit-bus", 0x40000000);
+ dt_add_property_cells(gc2, "#address-cells", 1);
+ dt_add_property_cells(gc2, "#size-cells", 1);
+ dt_add_property_cells(gc2, "ranges", 0x0, 0x600, 0x100,
+ 0x100, 0x800, 0x100);
+
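+ /*
+ * Two ranges entries: child 0x0 maps to 0x600 and child 0x100 maps to
+ * 0x800 on the parent bus, each 0x100 long.
+ */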
+ ggc1 = dt_new_addr(gc2, "test", 0x50);
+ dt_add_property_cells(ggc1, "reg", 0x50, 0x10);
+ assert(dt_translate_address(ggc1, 0, NULL) == 0x800000650ul);
+
+ /* test multiple ranges work */
+ ggc2 = dt_new_addr(gc2, "test", 0x150);
+ dt_add_property_cells(ggc2, "reg", 0x150, 0x10);
+ assert(dt_translate_address(ggc2, 0, NULL) == 0x800000850ul);
+
+ /* try 64bit -> 64bit */
+
+ c2 = dt_new_addr(root, "some-64bit-bus", 0xe00000000);
+ dt_add_property_cells(c2, "#address-cells", 2);
+ dt_add_property_cells(c2, "#size-cells", 2);
+ dt_add_property_cells(c2, "ranges", 0x0, 0x0, 0xe, 0x0, 0x2, 0x0);
+
+ gc2 = dt_new_addr(c2, "test", 0x100000000ul);
+ dt_add_property_u64s(gc2, "reg", 0x100000000ul, 0x10ul);
+ assert(dt_translate_address(gc2, 0, NULL) == 0xf00000000ul);
+
+ dt_free(root);
+
+ /* phandle fixup test */
+ subtree = dt_new_root("subtree");
+ ev1 = dt_new(subtree, "ev@1");
+ ev1_ph = ev1->phandle;
+ dt_new(ev1, "a@1");
+ dt_new(ev1, "a@2");
+ dt_new(ev1, "a@3");
+ ut1 = dt_new(subtree, "ut@1");
+ dt_add_property(ut1, "something", (const void *)&ev1->phandle, 4);
+ ut2 = dt_new(subtree, "ut@2");
+ dt_add_property(ut2, "something", (const void *)&ev1->phandle, 4);
+
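+ /*
+ * After the fixup, ev@1 should get a fresh phandle and neither
+ * "something" property should still hold the old value.
+ */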
+ dt_adjust_subtree_phandle(subtree, props_to_fix);
+ assert(!(ev1->phandle == ev1_ph));
+ new_prop_ph = dt_prop_get_u32(ut1, "something");
+ assert(!(new_prop_ph == ev1_ph));
+ new_prop_ph = dt_prop_get_u32(ut2, "something");
+ assert(!(new_prop_ph == ev1_ph));
+ dt_free(subtree);
+ return 0;
+}
+
diff --git a/roms/skiboot/core/test/run-flash-firmware-versions.c b/roms/skiboot/core/test/run-flash-firmware-versions.c
new file mode 100644
index 000000000..9f96f5c19
--- /dev/null
+++ b/roms/skiboot/core/test/run-flash-firmware-versions.c
@@ -0,0 +1,154 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Copyright 2018-2019 IBM Corp.
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdarg.h>
+#include <string.h>
+#include <malloc.h>
+#include <stdint.h>
+
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <fcntl.h>
+#include <assert.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <stdint.h>
+
+
+#include <interrupts.h>
+#include <bitutils.h>
+
+#include <compiler.h>
+
+/*
+ * Skiboot malloc stubs
+ *
+ * The actual prototypes for these are defined in mem_region-malloc.h,
+ * but that file also #defines malloc and friends, so we don't pull it in
+ * directly.
+ */
+
+#define DEFAULT_ALIGN __alignof__(long)
+
+void *__memalign(size_t blocksize, size_t bytes, const char *location __unused);
+void *__memalign(size_t blocksize, size_t bytes, const char *location __unused)
+{
+ return memalign(blocksize, bytes);
+}
+
+void *__malloc(size_t bytes, const char *location);
+void *__malloc(size_t bytes, const char *location)
+{
+ return __memalign(DEFAULT_ALIGN, bytes, location);
+}
+
+void __free(void *p, const char *location __unused);
+void __free(void *p, const char *location __unused)
+{
+ free(p);
+}
+
+void *__realloc(void *ptr, size_t size, const char *location __unused);
+void *__realloc(void *ptr, size_t size, const char *location __unused)
+{
+ return realloc(ptr, size);
+}
+
+void *__zalloc(size_t bytes, const char *location);
+void *__zalloc(size_t bytes, const char *location)
+{
+ void *p = __malloc(bytes, location);
+
+ if (p)
+ memset(p, 0, bytes);
+ return p;
+}
+
+#include <mem_region-malloc.h>
+
+#include <opal-api.h>
+
+#include "../../libfdt/fdt.c"
+#include "../../libfdt/fdt_ro.c"
+#include "../../libfdt/fdt_sw.c"
+#include "../../libfdt/fdt_strerror.c"
+
+#include "../../core/device.c"
+
+#include "../../libstb/container-utils.h"
+#include "../../libstb/container.h"
+#include "../../libstb/container.c"
+
+#include "../flash-firmware-versions.c"
+#include <assert.h>
+
+char __rodata_start[1], __rodata_end[1];
+
+const char version[]="Hello world!";
+
+enum proc_gen proc_gen = proc_gen_p8;
+
+static char *loaded_version_buf;
+static size_t loaded_version_buf_size;
+
+#define min(x, y) ((x) < (y) ? (x) : (y))
+
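+/*
+ * Stubbed resource loading: hand back the version image mmap'd in main()
+ * (when a file is given on the command line) instead of reading flash.
+ */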
+int start_preload_resource(enum resource_id id, uint32_t subid,
+ void *buf, size_t *len)
+{
+ (void)id;
+ (void)subid;
+ (void)buf;
+ if (loaded_version_buf) {
+ *len = min(*len, loaded_version_buf_size);
+ memcpy(buf, loaded_version_buf, *len);
+ } else {
+ *len = 0;
+ }
+
+ return 0;
+}
+
+int wait_for_resource_loaded(enum resource_id id, uint32_t idx)
+{
+ (void)id;
+ (void)idx;
+ return 0;
+}
+
+int main(int argc, char *argv[])
+{
+ int fd;
+ struct stat ver_st;
+ int r;
+
+ dt_root = dt_new_root("");
+
+ if (argc > 1) {
+ fd = open(argv[1], O_RDONLY);
+
+ assert(fd > 0);
+ r = fstat(fd, &ver_st);
+ assert(r == 0);
+
+ loaded_version_buf = mmap(NULL, ver_st.st_size,
+ PROT_READ, MAP_PRIVATE, fd, 0);
+ assert(loaded_version_buf != (char*)-1);
+ loaded_version_buf_size = ver_st.st_size;
+ }
+
+ flash_fw_version_preload();
+
+ proc_gen = proc_gen_p9;
+ flash_fw_version_preload();
+ flash_dt_add_fw_version();
+
+ return 0;
+}
+
diff --git a/roms/skiboot/core/test/run-flash-subpartition.c b/roms/skiboot/core/test/run-flash-subpartition.c
new file mode 100644
index 000000000..5b6df87f2
--- /dev/null
+++ b/roms/skiboot/core/test/run-flash-subpartition.c
@@ -0,0 +1,48 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Copyright 2013-2016 IBM Corp.
+ */
+
+#include <skiboot.h>
+#include <opal-api.h>
+#include <stdlib.h>
+
+#include "../flash-subpartition.c"
+#include <assert.h>
+
+/* This is a straight dump of the CAPP ucode partition header */
+char capp[4096] = {0x43, 0x41, 0x50, 0x50, 0x00, 0x00, 0x00, 0x01,
+ 0x00, 0x01, 0x00, 0xea, 0x00, 0x00, 0x10, 0x00,
+ 0x00, 0x00, 0x8e, 0x50, 0x00, 0x02, 0x00, 0xea,
+ 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x8e, 0x50,
+ 0x00, 0x02, 0x00, 0xef, 0x00, 0x00, 0x10, 0x00,
+ 0x00, 0x00, 0x8e, 0x50, 0x00, 0x02, 0x01, 0xef,
+ 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x8e, 0x50,
+ 0x00, 0x01, 0x00, 0xd3, 0x00, 0x00, 0x10, 0x00,
+ 0x00, 0x00, 0x8e, 0x50, 0x00, 0x00, 0x00, 0x00 };
+
+int main(void)
+{
+ int rc;
+ uint32_t part_actual;
+ uint32_t offset;
+ uint32_t size;
+ uint32_t subids[] = { 0x100ea, 0x200ea, 0x200ef, 0x201ef, 0x100d3 };
+
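+ /*
+ * Each sub-partition ID from the header above should resolve to
+ * offset 0x1000 and size 0x8e50.
+ */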
+ for (int i = 0; i < sizeof(subids)/sizeof(uint32_t); i++) {
+ offset = 0;
+ rc = flash_subpart_info(capp, sizeof(capp), 0x24000,
+ &part_actual, subids[i],
+ &offset, &size);
+ printf("\nsubid %x\n", subids[i]);
+ printf("part_actual %u\n", part_actual);
+ printf("offset %u\n", offset);
+ printf("size %u\n", size);
+ assert (rc == 0);
+ assert (size == 36432);
+ assert (offset == 4096);
+ assert (part_actual == 40960);
+ }
+
+ return 0;
+}
diff --git a/roms/skiboot/core/test/run-malloc-speed.c b/roms/skiboot/core/test/run-malloc-speed.c
new file mode 100644
index 000000000..39a24f9cb
--- /dev/null
+++ b/roms/skiboot/core/test/run-malloc-speed.c
@@ -0,0 +1,88 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Copyright 2013-2018 IBM Corp.
+ */
+
+#include <config.h>
+
+#define BITS_PER_LONG (sizeof(long) * 8)
+#include "dummy-cpu.h"
+
+#include <stdlib.h>
+
+/* Use these before we undefine them below. */
+static inline void *real_malloc(size_t size)
+{
+ return malloc(size);
+}
+
+static inline void real_free(void *p)
+{
+ return free(p);
+}
+
+#include <skiboot.h>
+
+/* We need mem_region to accept __location__ */
+#define is_rodata(p) true
+#include "../malloc.c"
+#include "../mem_region.c"
+#include "../device.c"
+
+#undef malloc
+#undef free
+#undef realloc
+
+#include <assert.h>
+#include <stdio.h>
+
+char __rodata_start[1], __rodata_end[1];
+struct dt_node *dt_root;
+enum proc_chip_quirks proc_chip_quirks;
+
+void lock_caller(struct lock *l, const char *caller)
+{
+ (void)caller;
+ assert(!l->lock_val);
+ l->lock_val = 1;
+}
+
+void unlock(struct lock *l)
+{
+ assert(l->lock_val);
+ l->lock_val = 0;
+}
+
+bool lock_held_by_me(struct lock *l)
+{
+ return l->lock_val;
+}
+
+#define TEST_HEAP_ORDER 27
+#define TEST_HEAP_SIZE (1ULL << TEST_HEAP_ORDER)
+
+#define NUM_ALLOCS 4096
+
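+/*
+ * Carve the test heap into NUM_ALLOCS equal-sized allocations and check
+ * that every pointer lands inside the heap region.
+ */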
+int main(void)
+{
+ uint64_t i, len;
+ void **p = real_malloc(sizeof(void*)*NUM_ALLOCS);
+
+ assert(p);
+
+ /* Use malloc for the heap, so valgrind can find issues. */
+ skiboot_heap.start = (unsigned long)real_malloc(skiboot_heap.len);
+
+ len = skiboot_heap.len / NUM_ALLOCS - sizeof(struct alloc_hdr);
+ for (i = 0; i < NUM_ALLOCS; i++) {
+ p[i] = __malloc(len, __location__);
+ assert(p[i] > region_start(&skiboot_heap));
+ assert(p[i] + len <= region_start(&skiboot_heap)
+ + skiboot_heap.len);
+ }
+ assert(mem_check(&skiboot_heap));
+ assert(skiboot_heap.free_list_lock.lock_val == 0);
+ free(region_start(&skiboot_heap));
+ real_free(p);
+ return 0;
+}
diff --git a/roms/skiboot/core/test/run-malloc.c b/roms/skiboot/core/test/run-malloc.c
new file mode 100644
index 000000000..10cc64e86
--- /dev/null
+++ b/roms/skiboot/core/test/run-malloc.c
@@ -0,0 +1,174 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Copyright 2013-2018 IBM Corp.
+ */
+
+#include <config.h>
+
+#define BITS_PER_LONG (sizeof(long) * 8)
+
+#include "dummy-cpu.h"
+
+#include <stdlib.h>
+
+/* Use these before we undefine them below. */
+static inline void *real_malloc(size_t size)
+{
+ return malloc(size);
+}
+
+static inline void real_free(void *p)
+{
+ return free(p);
+}
+
+#undef malloc
+#undef free
+#undef realloc
+
+#include <skiboot.h>
+
+#define is_rodata(p) true
+
+#include "../mem_region.c"
+#include "../malloc.c"
+#include "../device.c"
+
+#include "mem_region-malloc.h"
+
+#define TEST_HEAP_ORDER 16
+#define TEST_HEAP_SIZE (1ULL << TEST_HEAP_ORDER)
+
+struct dt_node *dt_root;
+enum proc_chip_quirks proc_chip_quirks;
+
+void lock_caller(struct lock *l, const char *caller)
+{
+ (void)caller;
+ assert(!l->lock_val);
+ l->lock_val = 1;
+}
+
+void unlock(struct lock *l)
+{
+ assert(l->lock_val);
+ l->lock_val = 0;
+}
+
+bool lock_held_by_me(struct lock *l)
+{
+ return l->lock_val;
+}
+
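+/*
+ * The heap is back to empty when its first header spans the whole region
+ * as a single block.
+ */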
+static bool heap_empty(void)
+{
+ const struct alloc_hdr *h = region_start(&skiboot_heap);
+ return h->num_longs == skiboot_heap.len / sizeof(long);
+}
+
+int main(void)
+{
+ char *test_heap = real_malloc(TEST_HEAP_SIZE);
+ char *p, *p2, *p3, *p4;
+ char *pr;
+ size_t i;
+
+ /* Use malloc for the heap, so valgrind can find issues. */
+ skiboot_heap.start = (unsigned long)test_heap;
+ skiboot_heap.len = TEST_HEAP_SIZE;
+
+ /* Allocations of various sizes. */
+ for (i = 0; i < TEST_HEAP_ORDER; i++) {
+ p = malloc(1ULL << i);
+ assert(p);
+ assert(p > (char *)test_heap);
+ assert(p + (1ULL << i) <= (char *)test_heap + TEST_HEAP_SIZE);
+ assert(!skiboot_heap.free_list_lock.lock_val);
+ free(p);
+ assert(!skiboot_heap.free_list_lock.lock_val);
+ assert(heap_empty());
+ }
+
+ /* Realloc as malloc. */
+ skiboot_heap.free_list_lock.lock_val = 0;
+ p = realloc(NULL, 100);
+ assert(p);
+ assert(!skiboot_heap.free_list_lock.lock_val);
+
+ /* Realloc as free. */
+ p = realloc(p, 0);
+ assert(!p);
+ assert(!skiboot_heap.free_list_lock.lock_val);
+ assert(heap_empty());
+
+ /* Realloc longer. */
+ p = realloc(NULL, 100);
+ assert(p);
+ assert(!skiboot_heap.free_list_lock.lock_val);
+ p2 = realloc(p, 200);
+ assert(p2 == p);
+ assert(!skiboot_heap.free_list_lock.lock_val);
+ free(p2);
+ assert(!skiboot_heap.free_list_lock.lock_val);
+ assert(heap_empty());
+
+ /* Realloc shorter. */
+ skiboot_heap.free_list_lock.lock_val = 0;
+ p = realloc(NULL, 100);
+ assert(!skiboot_heap.free_list_lock.lock_val);
+ assert(p);
+ p2 = realloc(p, 1);
+ assert(!skiboot_heap.free_list_lock.lock_val);
+ assert(p2 == p);
+ free(p2);
+ assert(!skiboot_heap.free_list_lock.lock_val);
+ assert(heap_empty());
+
+ /* zalloc failure */
+ p2 = zalloc(TEST_HEAP_SIZE * 2);
+ assert(p2 == NULL);
+
+ /* Realloc with move. */
+ p2 = malloc(TEST_HEAP_SIZE - 64 - sizeof(struct alloc_hdr)*2);
+ memset(p2, 'a', TEST_HEAP_SIZE - 64 - sizeof(struct alloc_hdr)*2);
+ assert(p2);
+ p = malloc(64);
+ memset(p, 'b', 64);
+ p[63] = 'c';
+ assert(p);
+ free(p2);
+
+ p2 = realloc(p, 128);
+ assert(p2 != p);
+ assert(p2[63] == 'c');
+ free(p2);
+ assert(heap_empty());
+ assert(!skiboot_heap.free_list_lock.lock_val);
+
+ /* Realloc with failure to allocate new size */
+ p2 = malloc(TEST_HEAP_SIZE - sizeof(struct alloc_hdr)*2);
+ assert(p2);
+ memset(p2, 'a', TEST_HEAP_SIZE - sizeof(struct alloc_hdr)*2);
+ p = p2;
+ p2 = realloc(p, TEST_HEAP_SIZE*2);
+ assert(p2==NULL);
+ memset(p, 'b', TEST_HEAP_SIZE - sizeof(struct alloc_hdr)*2);
+ free(p);
+
+ /* Reproduce bug BZ109128/SW257364 */
+ p = malloc(100);
+ p2 = malloc(100);
+ p3 = malloc(100);
+ p4 = malloc(100);
+ free(p2);
+ pr = realloc(p, 216);
+ assert(pr);
+ free(p3);
+ free(pr);
+ free(p4);
+ assert(heap_empty());
+ assert(!skiboot_heap.free_list_lock.lock_val);
+
+ real_free(test_heap);
+ return 0;
+}
diff --git a/roms/skiboot/core/test/run-mem_range_is_reserved.c b/roms/skiboot/core/test/run-mem_range_is_reserved.c
new file mode 100644
index 000000000..9891dbd9a
--- /dev/null
+++ b/roms/skiboot/core/test/run-mem_range_is_reserved.c
@@ -0,0 +1,207 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Copyright 2015-2019 IBM Corp.
+ */
+
+#include <config.h>
+
+/* The lock backtrace structures consume too much room on the skiboot heap */
+#undef DEBUG_LOCKS_BACKTRACE
+
+#define BITS_PER_LONG (sizeof(long) * 8)
+
+#include "dummy-cpu.h"
+
+#include <stdlib.h>
+
+static void *real_malloc(size_t size)
+{
+ return malloc(size);
+}
+
+static void real_free(void *p)
+{
+ return free(p);
+}
+
+#undef malloc
+#undef free
+#undef realloc
+
+#include <skiboot.h>
+#include <mem_region-malloc.h>
+
+/* We need mem_region to accept __location__ */
+#define is_rodata(p) true
+#include "../mem_region.c"
+#include "../malloc.c"
+
+/* But we need device tree to make copies of names. */
+#undef is_rodata
+#define is_rodata(p) false
+#include "../../libc/string/strdup.c"
+
+#include "../device.c"
+#include <assert.h>
+#include <stdio.h>
+
+enum proc_chip_quirks proc_chip_quirks;
+
+void lock_caller(struct lock *l, const char *caller)
+{
+ (void)caller;
+ assert(!l->lock_val);
+ l->lock_val++;
+}
+
+void unlock(struct lock *l)
+{
+ assert(l->lock_val);
+ l->lock_val--;
+}
+
+bool lock_held_by_me(struct lock *l)
+{
+ return l->lock_val;
+}
+
+#define TEST_HEAP_ORDER 16
+#define TEST_HEAP_SIZE (1ULL << TEST_HEAP_ORDER)
+
+static void add_mem_node(uint64_t start, uint64_t len)
+{
+ struct dt_node *mem;
+ u64 reg[2];
+ char *name;
+
+ name = (char*)malloc(sizeof("memory@") + STR_MAX_CHARS(reg[0]));
+ assert(name);
+
+ /* reg contains start and length */
+ reg[0] = cpu_to_be64(start);
+ reg[1] = cpu_to_be64(len);
+
+ sprintf(name, "memory@%llx", (long long)start);
+
+ mem = dt_new(dt_root, name);
+ dt_add_property_string(mem, "device_type", "memory");
+ dt_add_property(mem, "reg", reg, sizeof(reg));
+ free(name);
+}
+
+void add_chip_dev_associativity(struct dt_node *dev __attribute__((unused)))
+{
+}
+
+struct test_region {
+ uint64_t start;
+ uint64_t end;
+};
+
+static struct test {
+ struct test_region regions[3];
+ bool reserved;
+} tests[] = {
+ /* empty region set */
+ { { { 0 } }, false },
+
+ /* single exact match */
+ { { { 0x1000, 0x2000 }, }, true },
+
+ /* overlap downwards */
+ { { { 0x0fff, 0x2000 }, }, true },
+
+ /* overlap upwards */
+ { { { 0x1000, 0x2001 }, }, true },
+
+ /* missing first byte */
+ { { { 0x1001, 0x2000 }, }, false },
+
+ /* missing last byte */
+ { { { 0x1000, 0x1fff }, }, false },
+
+ /* two regions, full coverage, split before start of range */
+ { { { 0x0500, 0x1000 }, { 0x1000, 0x2500 } }, true },
+
+ /* two regions, full coverage, split after start of range */
+ { { { 0x0500, 0x1001 }, { 0x1001, 0x2500 } }, true },
+
+ /* two regions, full coverage, split at middle of range */
+ { { { 0x0500, 0x1500 }, { 0x1500, 0x2500 } }, true },
+
+ /* two regions, full coverage, split before end of range */
+ { { { 0x0500, 0x1fff }, { 0x1fff, 0x2500 } }, true },
+
+ /* two regions, full coverage, split after end of range */
+ { { { 0x0500, 0x2000 }, { 0x2000, 0x2500 } }, true },
+
+ /* two regions, missing byte in middle of range */
+ { { { 0x0500, 0x14ff }, { 0x1500, 0x2500 } }, false },
+
+ /* two regions, missing byte after start of range */
+ { { { 0x0500, 0x1000 }, { 0x1001, 0x2500 } }, false },
+
+ /* two regions, missing byte before end of range */
+ { { { 0x0500, 0x1fff }, { 0x2000, 0x2500 } }, false },
+};
+
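+/*
+ * Apply one test case's reservations and check whether the 0x1000-0x2000
+ * range is reported as fully reserved.
+ */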
+static void run_test(struct test *test)
+{
+ struct test_region *r;
+ bool reserved;
+
+ list_head_init(&regions);
+
+ mem_region_init();
+
+ /* create our reservations */
+ for (r = test->regions; r->start; r++)
+ mem_reserve_fw("r", r->start, r->end - r->start);
+
+ reserved = mem_range_is_reserved(0x1000, 0x1000);
+
+ if (reserved != test->reserved) {
+ struct mem_region *r;
+ fprintf(stderr, "test failed; got %s, expected %s\n",
+ reserved ? "reserved" : "unreserved",
+ test->reserved ? "reserved" : "unreserved");
+
+ fprintf(stderr, "reserved regions:\n");
+
+ list_for_each(&regions, r, list) {
+ fprintf(stderr, "\t: %08"PRIx64"[%08"PRIx64"] %s\n",
+ r->start, r->len, r->name);
+ }
+ exit(EXIT_FAILURE);
+ }
+}
+
+
+int main(void)
+{
+ unsigned int i;
+ void *buf;
+
+ /* Use malloc for the heap, so valgrind can find issues. */
+ skiboot_heap.start = (long)real_malloc(TEST_HEAP_SIZE);
+ skiboot_heap.len = TEST_HEAP_SIZE;
+
+ /* shift the OS reserve area out of the way of our playground */
+ skiboot_os_reserve.start = 0x100000;
+ skiboot_os_reserve.len = 0x1000;
+
+ dt_root = dt_new_root("");
+ dt_add_property_cells(dt_root, "#address-cells", 2);
+ dt_add_property_cells(dt_root, "#size-cells", 2);
+
+ buf = real_malloc(1024*1024);
+ add_mem_node((unsigned long)buf, 1024*1024);
+
+ for (i = 0; i < ARRAY_SIZE(tests); i++)
+ run_test(&tests[i]);
+
+ dt_free(dt_root);
+ real_free(buf);
+ real_free((void *)(long)skiboot_heap.start);
+ return 0;
+}
diff --git a/roms/skiboot/core/test/run-mem_region.c b/roms/skiboot/core/test/run-mem_region.c
new file mode 100644
index 000000000..50da8033c
--- /dev/null
+++ b/roms/skiboot/core/test/run-mem_region.c
@@ -0,0 +1,252 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Copyright 2013-2019 IBM Corp.
+ */
+
+#include <config.h>
+#include <stdbool.h>
+#include <stdint.h>
+
+/* The lock backtrace structures consume too much room on the skiboot heap */
+#undef DEBUG_LOCKS_BACKTRACE
+
+#define BITS_PER_LONG (sizeof(long) * 8)
+
+#include "dummy-cpu.h"
+
+#include <stdlib.h>
+#include <string.h>
+
+/* Use these before we override definitions below. */
+static void *real_malloc(size_t size)
+{
+ return malloc(size);
+}
+
+static inline void real_free(void *p)
+{
+ return free(p);
+}
+
+#undef malloc
+#undef free
+#undef realloc
+
+#include <skiboot.h>
+
+#define is_rodata(p) true
+
+#include "../mem_region.c"
+#include "../malloc.c"
+#include "../device.c"
+
+#include <assert.h>
+#include <stdio.h>
+
+struct dt_node *dt_root;
+enum proc_chip_quirks proc_chip_quirks;
+
+void lock_caller(struct lock *l, const char *caller)
+{
+ (void)caller;
+ assert(!l->lock_val);
+ l->lock_val++;
+}
+
+void unlock(struct lock *l)
+{
+ assert(l->lock_val);
+ l->lock_val--;
+}
+
+bool lock_held_by_me(struct lock *l)
+{
+ return l->lock_val;
+}
+
+#define TEST_HEAP_ORDER 16
+#define TEST_HEAP_SIZE (1ULL << TEST_HEAP_ORDER)
+
+static bool heap_empty(void)
+{
+ const struct alloc_hdr *h = region_start(&skiboot_heap);
+ return h->num_longs == skiboot_heap.len / sizeof(long);
+}
+
+int main(void)
+{
+ char *test_heap;
+ void *p, *ptrs[100];
+ size_t i;
+ struct mem_region *r;
+
+ /* Use malloc for the heap, so valgrind can find issues. */
+ test_heap = real_malloc(TEST_HEAP_SIZE);
+ skiboot_heap.start = (unsigned long)test_heap;
+ skiboot_heap.len = TEST_HEAP_SIZE;
+
+ lock(&skiboot_heap.free_list_lock);
+
+ /* Allocations of various sizes. */
+ for (i = 0; i < TEST_HEAP_ORDER; i++) {
+ p = mem_alloc(&skiboot_heap, 1ULL << i, 1, "here");
+ assert(p);
+ assert(mem_check(&skiboot_heap));
+ assert(!strcmp(((struct alloc_hdr *)p)[-1].location, "here"));
+ assert(p > (void *)test_heap);
+ assert(p + (1ULL << i) <= (void *)test_heap + TEST_HEAP_SIZE);
+ assert(mem_allocated_size(p) >= 1ULL << i);
+ mem_free(&skiboot_heap, p, "freed");
+ assert(heap_empty());
+ assert(mem_check(&skiboot_heap));
+ assert(!strcmp(((struct alloc_hdr *)p)[-1].location, "freed"));
+ }
+ p = mem_alloc(&skiboot_heap, 1ULL << i, 1, "here");
+ assert(!p);
+ mem_free(&skiboot_heap, p, "freed");
+ assert(heap_empty());
+ assert(mem_check(&skiboot_heap));
+
+ /* Allocations of various alignments: use small alloc first. */
+ ptrs[0] = mem_alloc(&skiboot_heap, 1, 1, "small");
+ for (i = 0; ; i++) {
+ p = mem_alloc(&skiboot_heap, 1, 1ULL << i, "here");
+ assert(mem_check(&skiboot_heap));
+ /* We will eventually fail... */
+ if (!p) {
+ assert(i >= TEST_HEAP_ORDER);
+ break;
+ }
+ assert(p);
+ assert((long)p % (1ULL << i) == 0);
+ assert(p > (void *)test_heap);
+ assert(p + 1 <= (void *)test_heap + TEST_HEAP_SIZE);
+ mem_free(&skiboot_heap, p, "freed");
+ assert(mem_check(&skiboot_heap));
+ }
+ mem_free(&skiboot_heap, ptrs[0], "small freed");
+ assert(heap_empty());
+ assert(mem_check(&skiboot_heap));
+
+ /* Many little allocations, freed in reverse order. */
+ for (i = 0; i < 100; i++) {
+ ptrs[i] = mem_alloc(&skiboot_heap, sizeof(long), 1, "here");
+ assert(ptrs[i]);
+ assert(ptrs[i] > (void *)test_heap);
+ assert(ptrs[i] + sizeof(long)
+ <= (void *)test_heap + TEST_HEAP_SIZE);
+ assert(mem_check(&skiboot_heap));
+ }
+ mem_dump_free();
+ for (i = 0; i < 100; i++)
+ mem_free(&skiboot_heap, ptrs[100 - 1 - i], "freed");
+
+ assert(heap_empty());
+ assert(mem_check(&skiboot_heap));
+
+ /* Check the prev_free gets updated properly. */
+ ptrs[0] = mem_alloc(&skiboot_heap, sizeof(long), 1, "ptrs[0]");
+ ptrs[1] = mem_alloc(&skiboot_heap, sizeof(long), 1, "ptrs[1]");
+ assert(ptrs[1] > ptrs[0]);
+ mem_free(&skiboot_heap, ptrs[0], "ptrs[0] free");
+ assert(mem_check(&skiboot_heap));
+ ptrs[0] = mem_alloc(&skiboot_heap, sizeof(long), 1, "ptrs[0] again");
+ assert(mem_check(&skiboot_heap));
+ mem_free(&skiboot_heap, ptrs[1], "ptrs[1] free");
+ mem_free(&skiboot_heap, ptrs[0], "ptrs[0] free");
+ assert(mem_check(&skiboot_heap));
+ assert(heap_empty());
+
+#if 0
+ printf("Heap map:\n");
+ for (i = 0; i < TEST_HEAP_SIZE / sizeof(long); i++) {
+ printf("%u", test_bit(skiboot_heap.bitmap, i));
+ if (i % 64 == 63)
+ printf("\n");
+ else if (i % 8 == 7)
+ printf(" ");
+ }
+#endif
+
+ /* Simple enlargement, then free */
+ p = mem_alloc(&skiboot_heap, 1, 1, "one byte");
+ assert(p);
+ assert(mem_resize(&skiboot_heap, p, 100, "hundred bytes"));
+ assert(mem_allocated_size(p) >= 100);
+ assert(mem_check(&skiboot_heap));
+ assert(!strcmp(((struct alloc_hdr *)p)[-1].location, "hundred bytes"));
+ mem_free(&skiboot_heap, p, "freed");
+
+ /* Simple shrink, then free */
+ p = mem_alloc(&skiboot_heap, 100, 1, "100 bytes");
+ assert(p);
+ assert(mem_resize(&skiboot_heap, p, 1, "1 byte"));
+ assert(mem_allocated_size(p) < 100);
+ assert(mem_check(&skiboot_heap));
+ assert(!strcmp(((struct alloc_hdr *)p)[-1].location, "1 byte"));
+ mem_free(&skiboot_heap, p, "freed");
+
+ /* Lots of resizing (enlarge). */
+ p = mem_alloc(&skiboot_heap, 1, 1, "one byte");
+ assert(p);
+ for (i = 1; i <= TEST_HEAP_SIZE - sizeof(struct alloc_hdr); i++) {
+ assert(mem_resize(&skiboot_heap, p, i, "enlarge"));
+ assert(mem_allocated_size(p) >= i);
+ assert(mem_check(&skiboot_heap));
+ }
+
+ /* Can't make it larger though. */
+ assert(!mem_resize(&skiboot_heap, p, i, "enlarge"));
+
+ for (i = TEST_HEAP_SIZE - sizeof(struct alloc_hdr); i > 0; i--) {
+ assert(mem_resize(&skiboot_heap, p, i, "shrink"));
+ assert(mem_check(&skiboot_heap));
+ }
+
+ mem_free(&skiboot_heap, p, "freed");
+ assert(mem_check(&skiboot_heap));
+
+ unlock(&skiboot_heap.free_list_lock);
+
+ /* lock the regions list */
+ lock(&mem_region_lock);
+ /* Test splitting of a region. */
+ r = new_region("base", (unsigned long)test_heap,
+ TEST_HEAP_SIZE, NULL, REGION_SKIBOOT_HEAP);
+ assert(add_region(r));
+ r = new_region("splitter", (unsigned long)test_heap + TEST_HEAP_SIZE/4,
+ TEST_HEAP_SIZE/2, NULL, REGION_RESERVED);
+ assert(add_region(r));
+ /* Now we should have *three* regions. */
+ i = 0;
+ list_for_each(&regions, r, list) {
+ if (region_start(r) == test_heap) {
+ assert(r->len == TEST_HEAP_SIZE/4);
+ assert(strcmp(r->name, "base") == 0);
+ assert(r->type == REGION_SKIBOOT_HEAP);
+ } else if (region_start(r) == test_heap + TEST_HEAP_SIZE / 4) {
+ assert(r->len == TEST_HEAP_SIZE/2);
+ assert(strcmp(r->name, "splitter") == 0);
+ assert(r->type == REGION_RESERVED);
+ assert(!r->free_list.n.next);
+ } else if (region_start(r) == test_heap + TEST_HEAP_SIZE/4*3) {
+ assert(r->len == TEST_HEAP_SIZE/4);
+ assert(strcmp(r->name, "base") == 0);
+ assert(r->type == REGION_SKIBOOT_HEAP);
+ } else
+ abort();
+ assert(mem_check(r));
+ i++;
+ }
+ mem_dump_free();
+ assert(i == 3);
+ while ((r = list_pop(&regions, struct mem_region, list)) != NULL) {
+ lock(&skiboot_heap.free_list_lock);
+ mem_free(&skiboot_heap, r, __location__);
+ unlock(&skiboot_heap.free_list_lock);
+ }
+ unlock(&mem_region_lock);
+ assert(skiboot_heap.free_list_lock.lock_val == 0);
+ real_free(test_heap);
+ return 0;
+}
diff --git a/roms/skiboot/core/test/run-mem_region_init.c b/roms/skiboot/core/test/run-mem_region_init.c
new file mode 100644
index 000000000..e96282de8
--- /dev/null
+++ b/roms/skiboot/core/test/run-mem_region_init.c
@@ -0,0 +1,175 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Copyright 2013-2018 IBM Corp.
+ */
+
+#include <config.h>
+
+#define BITS_PER_LONG (sizeof(long) * 8)
+
+#include "dummy-cpu.h"
+
+#include <stdlib.h>
+
+/* Use these before we undefine them below. */
+static inline void *real_malloc(size_t size)
+{
+ return malloc(size);
+}
+
+static inline void real_free(void *p)
+{
+ return free(p);
+}
+
+#include "../malloc.c"
+
+#include <skiboot.h>
+/* We need mem_region to accept __location__ */
+#define is_rodata(p) true
+#include "../mem_region.c"
+
+/* But we need device tree to make copies of names. */
+#undef is_rodata
+#define is_rodata(p) false
+
+static inline char *skiboot_strdup(const char *str)
+{
+ char *ret = __malloc(strlen(str) + 1, "");
+ return memcpy(ret, str, strlen(str) + 1);
+}
+#undef strdup
+#define strdup skiboot_strdup
+
+#include "../device.c"
+
+#include <skiboot.h>
+
+#include <assert.h>
+#include <stdio.h>
+
+enum proc_chip_quirks proc_chip_quirks;
+
+void lock_caller(struct lock *l, const char *caller)
+{
+ (void)caller;
+ assert(!l->lock_val);
+ l->lock_val = 1;
+}
+
+void unlock(struct lock *l)
+{
+ assert(l->lock_val);
+ l->lock_val = 0;
+}
+
+bool lock_held_by_me(struct lock *l)
+{
+ return l->lock_val;
+}
+
+/* We actually need a lot of room for the bitmaps! */
+#define TEST_HEAP_ORDER 27
+#define TEST_HEAP_SIZE (1ULL << TEST_HEAP_ORDER)
+
+static void add_mem_node(uint64_t start, uint64_t len)
+{
+ struct dt_node *mem;
+ u64 reg[2];
+ char *name = (char *)malloc(sizeof("memory@") + STR_MAX_CHARS(reg[0]));
+
+ assert(name);
+
+ /* reg contains start and length */
+ reg[0] = cpu_to_be64(start);
+ reg[1] = cpu_to_be64(len);
+
+ sprintf(name, "memory@%llx", (unsigned long long)start);
+
+ mem = dt_new(dt_root, name);
+ assert(mem);
+ dt_add_property_string(mem, "device_type", "memory");
+ dt_add_property(mem, "reg", reg, sizeof(reg));
+ free(name);
+}
+
+void add_chip_dev_associativity(struct dt_node *dev __attribute__((unused)))
+{
+}
+
+int main(void)
+{
+ uint64_t end;
+ int builtins;
+ struct mem_region *r;
+ char *heap = real_malloc(TEST_HEAP_SIZE);
+
+ /* Use malloc for the heap, so valgrind can find issues. */
+ skiboot_heap.start = (unsigned long)heap;
+ skiboot_heap.len = TEST_HEAP_SIZE;
+ skiboot_os_reserve.len = 16384;
+
+ dt_root = dt_new_root("");
+ dt_add_property_cells(dt_root, "#address-cells", 2);
+ dt_add_property_cells(dt_root, "#size-cells", 2);
+
+ /* Make sure we overlap the heap, at least. */
+ add_mem_node(0, (uint64_t)(heap + 0x100000000ULL));
+ add_mem_node((uint64_t)heap+0x100000000ULL , 0x100000000ULL);
+ end = (uint64_t)(heap+ 0x100000000ULL + 0x100000000ULL);
+
+ /* Now convert. */
+ mem_region_init();
+ mem_dump_allocs();
+ assert(mem_check(&skiboot_heap));
+
+ builtins = 0;
+ list_for_each(&regions, r, list) {
+ /* Regions must not overlap. */
+ struct mem_region *r2, *pre = NULL, *post = NULL;
+ list_for_each(&regions, r2, list) {
+ if (r == r2)
+ continue;
+ assert(!overlaps(r, r2));
+ }
+
+ /* But should have exact neighbours. */
+ list_for_each(&regions, r2, list) {
+ if (r == r2)
+ continue;
+ if (r2->start == r->start + r->len)
+ post = r2;
+ if (r2->start + r2->len == r->start)
+ pre = r2;
+ }
+ assert(r->start == 0 || pre);
+ assert(r->start + r->len == end || post);
+
+ if (r == &skiboot_code_and_text ||
+ r == &skiboot_heap ||
+ r == &skiboot_after_heap ||
+ r == &skiboot_cpu_stacks ||
+ r == &skiboot_os_reserve)
+ builtins++;
+ else
+ assert(r->type == REGION_MEMORY);
+ assert(mem_check(r));
+ }
+ assert(builtins == 5);
+
+ dt_free(dt_root);
+
+ while ((r = list_pop(&regions, struct mem_region, list)) != NULL) {
+ if (r != &skiboot_code_and_text &&
+ r != &skiboot_heap &&
+ r != &skiboot_after_heap &&
+ r != &skiboot_os_reserve &&
+ r != &skiboot_cpu_stacks) {
+ free(r);
+ }
+ assert(mem_check(&skiboot_heap));
+ }
+ assert(skiboot_heap.free_list_lock.lock_val == 0);
+ real_free(heap);
+ return 0;
+}
diff --git a/roms/skiboot/core/test/run-mem_region_next.c b/roms/skiboot/core/test/run-mem_region_next.c
new file mode 100644
index 000000000..4f2f73c55
--- /dev/null
+++ b/roms/skiboot/core/test/run-mem_region_next.c
@@ -0,0 +1,105 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Copyright 2015-2018 IBM Corp.
+ */
+
+#include <config.h>
+
+#define BITS_PER_LONG (sizeof(long) * 8)
+
+#include "dummy-cpu.h"
+
+#include <stdlib.h>
+#include <string.h>
+
+/* Use these before we override definitions below. */
+static void *real_malloc(size_t size)
+{
+ return malloc(size);
+}
+
+static void real_free(void *p)
+{
+ return free(p);
+}
+
+#undef malloc
+#undef free
+
+#include <skiboot.h>
+
+#define is_rodata(p) true
+
+#include "../mem_region.c"
+#include "../malloc.c"
+#include "../device.c"
+
+#include <assert.h>
+#include <stdio.h>
+
+enum proc_chip_quirks proc_chip_quirks;
+
+void lock_caller(struct lock *l, const char *caller)
+{
+ (void)caller;
+ assert(!l->lock_val);
+ l->lock_val++;
+}
+
+void unlock(struct lock *l)
+{
+ assert(l->lock_val);
+ l->lock_val--;
+}
+
+bool lock_held_by_me(struct lock *l)
+{
+ return l->lock_val;
+}
+
+
+#define TEST_HEAP_ORDER 16
+#define TEST_HEAP_SIZE (1ULL << TEST_HEAP_ORDER)
+
+int main(void)
+{
+ struct mem_region *r;
+ char *test_heap;
+
+ /* Use malloc for the heap, so valgrind can find issues. */
+ test_heap = real_malloc(TEST_HEAP_SIZE);
+ skiboot_heap.start = (unsigned long)test_heap;
+ skiboot_heap.len = TEST_HEAP_SIZE;
+
+ lock(&mem_region_lock);
+
+ /* empty regions */
+ r = mem_region_next(NULL);
+ assert(!r);
+
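+ /*
+ * Add two reserved regions; mem_region_next() should walk them in
+ * order and then return NULL.
+ */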
+ r = new_region("test.1", 0x1000, 0x1000, NULL, REGION_RESERVED);
+ assert(add_region(r));
+ r = new_region("test.2", 0x2000, 0x1000, NULL, REGION_RESERVED);
+ assert(add_region(r));
+ mem_regions_finalised = true;
+
+ r = mem_region_next(NULL);
+ assert(r);
+ assert(r->start == 0x1000);
+ assert(r->len == 0x1000);
+ assert(r->type == REGION_RESERVED);
+
+ r = mem_region_next(r);
+ assert(r);
+ assert(r->start == 0x2000);
+ assert(r->len == 0x1000);
+ assert(r->type == REGION_RESERVED);
+
+ r = mem_region_next(r);
+ assert(!r);
+
+ unlock(&mem_region_lock);
+ real_free(test_heap);
+
+ return 0;
+}
diff --git a/roms/skiboot/core/test/run-mem_region_release_unused.c b/roms/skiboot/core/test/run-mem_region_release_unused.c
new file mode 100644
index 000000000..463f54283
--- /dev/null
+++ b/roms/skiboot/core/test/run-mem_region_release_unused.c
@@ -0,0 +1,177 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Copyright 2013-2018 IBM Corp.
+ */
+
+#include <config.h>
+
+#define BITS_PER_LONG (sizeof(long) * 8)
+
+#include "dummy-cpu.h"
+
+#include <stdlib.h>
+
+static void *__malloc(size_t size, const char *location __attribute__((unused)))
+{
+ return malloc(size);
+}
+
+static void *__realloc(void *ptr, size_t size, const char *location __attribute__((unused)))
+{
+ return realloc(ptr, size);
+}
+
+static void *__zalloc(size_t size, const char *location __attribute__((unused)))
+{
+ return calloc(size, 1);
+}
+
+static inline void __free(void *p, const char *location __attribute__((unused)))
+{
+ return free(p);
+}
+
+#include <skiboot.h>
+
+/* We need mem_region to accept __location__ */
+#define is_rodata(p) true
+#include "../mem_region.c"
+
+/* But we need device tree to make copies of names. */
+#undef is_rodata
+#define is_rodata(p) false
+
+#include "../device.c"
+#include <assert.h>
+#include <stdio.h>
+
+enum proc_chip_quirks proc_chip_quirks;
+
+void lock_caller(struct lock *l, const char *caller)
+{
+ (void)caller;
+ l->lock_val++;
+}
+
+void unlock(struct lock *l)
+{
+ l->lock_val--;
+}
+
+bool lock_held_by_me(struct lock *l)
+{
+ return l->lock_val;
+}
+
+#define TEST_HEAP_ORDER 16
+#define TEST_HEAP_SIZE (1ULL << TEST_HEAP_ORDER)
+
+static void add_mem_node(uint64_t start, uint64_t len)
+{
+ struct dt_node *mem;
+ u64 reg[2];
+ char *name;
+
+ name = (char*)malloc(sizeof("memory@") + STR_MAX_CHARS(reg[0]));
+ assert(name);
+
+ /* reg contains start and length */
+ reg[0] = cpu_to_be64(start);
+ reg[1] = cpu_to_be64(len);
+
+ sprintf(name, "memory@%llx", (long long)start);
+
+ mem = dt_new(dt_root, name);
+ dt_add_property_string(mem, "device_type", "memory");
+ dt_add_property(mem, "reg", reg, sizeof(reg));
+ free(name);
+}
+
+void add_chip_dev_associativity(struct dt_node *dev __attribute__((unused)))
+{
+}
+
+int main(void)
+{
+ uint64_t i;
+ struct mem_region *r, *other = NULL;
+ void *other_mem;
+ const char *last;
+
+ /* Use malloc for the heap, so valgrind can find issues. */
+ skiboot_heap.start = (unsigned long)malloc(TEST_HEAP_SIZE);
+ skiboot_heap.len = TEST_HEAP_SIZE;
+ skiboot_os_reserve.len = 0;
+
+ dt_root = dt_new_root("");
+ dt_add_property_cells(dt_root, "#address-cells", 2);
+ dt_add_property_cells(dt_root, "#size-cells", 2);
+
+ other_mem = malloc(1024*1024);
+ add_mem_node((unsigned long)other_mem, 1024*1024);
+
+ /* Now convert. */
+ mem_region_init();
+
+ /* Find our node to allocate from */
+ list_for_each(&regions, r, list) {
+ if (region_start(r) == other_mem)
+ other = r;
+ }
+ /* This could happen if skiboot addresses clashed with our alloc. */
+ assert(other);
+ assert(mem_check(other));
+
+ /* Allocate 1k from other region. */
+ lock(&other->free_list_lock);
+ mem_alloc(other, 1024, 1, "1k");
+ unlock(&other->free_list_lock);
+
+ mem_region_release_unused();
+
+ assert(mem_check(&skiboot_heap));
+
+ /* Now we expect it to be split. */
+ i = 0;
+ list_for_each(&regions, r, list) {
+ assert(mem_check(r));
+ i++;
+ if (r == &skiboot_os_reserve)
+ continue;
+ if (r == &skiboot_code_and_text)
+ continue;
+ if (r == &skiboot_heap)
+ continue;
+ if (r == &skiboot_after_heap)
+ continue;
+ if (r == &skiboot_cpu_stacks)
+ continue;
+ if (r == other) {
+ assert(r->type == REGION_MEMORY);
+ assert(r->len < 1024 * 1024);
+ } else {
+ assert(r->type == REGION_OS);
+ assert(r->start == other->start + other->len);
+ assert(r->start + r->len == other->start + 1024*1024);
+ }
+ }
+ assert(i == 7);
+
+ last = NULL;
+ list_for_each(&regions, r, list) {
+ if (last != r->name &&
+ strncmp(r->name, NODE_REGION_PREFIX,
+ strlen(NODE_REGION_PREFIX)) == 0) {
+ /* It's safe to cast away const as this is
+ * only going to happen in test code */
+ free((void*)r->name);
+ break;
+ }
+ last = r->name;
+ }
+
+ dt_free(dt_root);
+ free((void *)(long)skiboot_heap.start);
+ free(other_mem);
+ return 0;
+}
diff --git a/roms/skiboot/core/test/run-mem_region_release_unused_noalloc.c b/roms/skiboot/core/test/run-mem_region_release_unused_noalloc.c
new file mode 100644
index 000000000..d7adc5a9a
--- /dev/null
+++ b/roms/skiboot/core/test/run-mem_region_release_unused_noalloc.c
@@ -0,0 +1,156 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Copyright 2013-2018 IBM Corp.
+ */
+
+#include <config.h>
+
+#define BITS_PER_LONG (sizeof(long) * 8)
+
+#include "dummy-cpu.h"
+
+#include <stdlib.h>
+
+static void *__malloc(size_t size, const char *location __attribute__((unused)))
+{
+ return malloc(size);
+}
+
+static void *__realloc(void *ptr, size_t size, const char *location __attribute__((unused)))
+{
+ return realloc(ptr, size);
+}
+
+static void *__zalloc(size_t size, const char *location __attribute__((unused)))
+{
+ return calloc(size, 1);
+}
+
+static inline void __free(void *p, const char *location __attribute__((unused)))
+{
+ return free(p);
+}
+
+#include <skiboot.h>
+
+/* We need mem_region to accept __location__ */
+#define is_rodata(p) true
+#include "../mem_region.c"
+
+/* But we need device tree to make copies of names. */
+#undef is_rodata
+#define is_rodata(p) false
+
+#include "../device.c"
+#include <assert.h>
+#include <stdio.h>
+
+enum proc_chip_quirks proc_chip_quirks;
+
+void lock_caller(struct lock *l, const char *caller)
+{
+ (void)caller;
+ l->lock_val++;
+}
+
+void unlock(struct lock *l)
+{
+ l->lock_val--;
+}
+
+bool lock_held_by_me(struct lock *l)
+{
+ return l->lock_val;
+}
+
+#define TEST_HEAP_ORDER 16
+#define TEST_HEAP_SIZE (1ULL << TEST_HEAP_ORDER)
+
+static void add_mem_node(uint64_t start, uint64_t len)
+{
+ struct dt_node *mem;
+ u64 reg[2];
+ char *name;
+
+ name = (char*)malloc(sizeof("memory@") + STR_MAX_CHARS(reg[0]));
+ assert(name);
+
+ /* reg contains start and length */
+ reg[0] = cpu_to_be64(start);
+ reg[1] = cpu_to_be64(len);
+
+ sprintf(name, "memory@%llx", (long long)start);
+
+ mem = dt_new(dt_root, name);
+ dt_add_property_string(mem, "device_type", "memory");
+ dt_add_property(mem, "reg", reg, sizeof(reg));
+ free(name);
+}
+
+void add_chip_dev_associativity(struct dt_node *dev __attribute__((unused)))
+{
+}
+
+int main(void)
+{
+ uint64_t i;
+ struct mem_region *r;
+ const char *last;
+
+ /* Use malloc for the heap, so valgrind can find issues. */
+ skiboot_heap.start = 0;
+ skiboot_heap.len = TEST_HEAP_SIZE;
+ skiboot_os_reserve.start = 0;
+ skiboot_os_reserve.len = 0;
+
+ dt_root = dt_new_root("");
+ dt_add_property_cells(dt_root, "#address-cells", 2);
+ dt_add_property_cells(dt_root, "#size-cells", 2);
+
+ add_mem_node(0, 0x100000000ULL);
+ add_mem_node(0x100000000ULL, 0x100000000ULL);
+
+ mem_region_init();
+
+ mem_region_release_unused();
+
+ assert(mem_check(&skiboot_heap));
+
+ /* Now we expect it to be split. */
+ i = 0;
+ list_for_each(&regions, r, list) {
+ assert(mem_check(r));
+ i++;
+ if (r == &skiboot_os_reserve)
+ continue;
+ if (r == &skiboot_code_and_text)
+ continue;
+ if (r == &skiboot_heap)
+ continue;
+ if (r == &skiboot_after_heap)
+ continue;
+ if (r == &skiboot_cpu_stacks)
+ continue;
+
+ /* the memory nodes should all be available to the OS now */
+ assert(r->type == REGION_OS);
+ }
+ assert(i == 9);
+
+ last = NULL;
+ list_for_each(&regions, r, list) {
+ if (last != r->name &&
+ strncmp(r->name, NODE_REGION_PREFIX,
+ strlen(NODE_REGION_PREFIX)) == 0) {
+ /* It's safe to cast away the const as
+ * this never happens at runtime,
+ * only in test and only for valgrind
+ */
+ free((void*)r->name);
+ last = r->name;
+ }
+ }
+
+ dt_free(dt_root);
+ return 0;
+}
diff --git a/roms/skiboot/core/test/run-mem_region_reservations.c b/roms/skiboot/core/test/run-mem_region_reservations.c
new file mode 100644
index 000000000..c24652f41
--- /dev/null
+++ b/roms/skiboot/core/test/run-mem_region_reservations.c
@@ -0,0 +1,228 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Copyright 2013-2018 IBM Corp.
+ */
+
+#include <config.h>
+
+#define BITS_PER_LONG (sizeof(long) * 8)
+
+#include "dummy-cpu.h"
+
+#include <stdlib.h>
+
+static void *real_malloc(size_t size)
+{
+ return malloc(size);
+}
+
+static void real_free(void *p)
+{
+ return free(p);
+}
+
+#undef malloc
+#undef free
+#undef realloc
+
+#include <skiboot.h>
+#include <mem_region-malloc.h>
+
+/* We need mem_region to accept __location__ */
+#define is_rodata(p) true
+#include "../mem_region.c"
+#include "../malloc.c"
+
+/* But we need device tree to make copies of names. */
+#undef is_rodata
+#define is_rodata(p) false
+#include "../../libc/string/strdup.c"
+
+#include "../device.c"
+#include <assert.h>
+#include <stdio.h>
+
+enum proc_chip_quirks proc_chip_quirks;
+
+void lock_caller(struct lock *l, const char *caller)
+{
+ (void)caller;
+ assert(!l->lock_val);
+ l->lock_val++;
+}
+
+void unlock(struct lock *l)
+{
+ assert(l->lock_val);
+ l->lock_val--;
+}
+
+bool lock_held_by_me(struct lock *l)
+{
+ return l->lock_val;
+}
+
+#define TEST_HEAP_ORDER 16
+#define TEST_HEAP_SIZE (1ULL << TEST_HEAP_ORDER)
+
+static void add_mem_node(uint64_t start, uint64_t len)
+{
+ struct dt_node *mem;
+ u64 reg[2];
+ char *name;
+
+ name = (char*)malloc(sizeof("memory@") + STR_MAX_CHARS(reg[0]));
+ assert(name);
+
+ /* reg contains start and length */
+ reg[0] = cpu_to_be64(start);
+ reg[1] = cpu_to_be64(len);
+
+ sprintf(name, "memory@%llx", (long long)start);
+
+ mem = dt_new(dt_root, name);
+ dt_add_property_string(mem, "device_type", "memory");
+ dt_add_property(mem, "reg", reg, sizeof(reg));
+ free(name);
+}
+
+void add_chip_dev_associativity(struct dt_node *dev __attribute__((unused)))
+{
+}
+
+static struct {
+ const char *name;
+ uint64_t addr;
+ bool found;
+} test_regions[] = {
+ { "test.1", 0x1000, false },
+ { "test.2", 0x2000, false },
+ { "test.3", 0x4000, false },
+};
+
+static void check_property_reservations(void)
+{
+ const struct dt_property *names, *ranges;
+ unsigned int i, l;
+ const char *name;
+ uint64_t *rangep;
+ const char *at;
+
+ /* check dt properties */
+ names = dt_find_property(dt_root, "reserved-names");
+ ranges = dt_find_property(dt_root, "reserved-ranges");
+
+ assert(names && ranges);
+
+ /* walk through names & ranges properties, ensuring that the test
+ * regions are all present */
+ for (name = names->prop, rangep = (uint64_t *)ranges->prop;
+ name < names->prop + names->len;
+ name += l, rangep += 2) {
+ uint64_t addr;
+
+ addr = dt_get_number(rangep, 2);
+ l = strlen(name) + 1;
+
+ for (i = 0; i < ARRAY_SIZE(test_regions); i++) {
+ at = strchr(name, '@');
+ if (strncmp(test_regions[i].name, name,
+ at ? at-name: strlen(name)))
+ continue;
+ assert(test_regions[i].addr == addr);
+ assert(!test_regions[i].found);
+ test_regions[i].found = true;
+ }
+ }
+
+ for (i = 0; i < ARRAY_SIZE(test_regions); i++) {
+ assert(test_regions[i].found);
+ test_regions[i].found = false;
+ }
+}
+
+static void check_node_reservations(void)
+{
+ struct dt_node *parent, *node;
+ unsigned int i;
+
+ parent = dt_find_by_name(dt_root, "reserved-memory");
+ assert(parent);
+
+ assert(dt_prop_get_cell(parent, "#address-cells", 0) == 2);
+ assert(dt_prop_get_cell(parent, "#size-cells", 0) == 2);
+ dt_require_property(parent, "ranges", 0);
+
+ dt_for_each_child(parent, node) {
+ uint64_t addr, size;
+
+ addr = dt_get_address(node, 0, &size);
+
+ for (i = 0; i < ARRAY_SIZE(test_regions); i++) {
+ if (strncmp(test_regions[i].name, node->name,
+ strlen(test_regions[i].name)))
+ continue;
+
+ assert(!test_regions[i].found);
+ assert(test_regions[i].addr == addr);
+ assert(size == 0x1000);
+ test_regions[i].found = true;
+ }
+ }
+
+ for (i = 0; i < ARRAY_SIZE(test_regions); i++) {
+ assert(test_regions[i].found);
+ test_regions[i].found = false;
+ }
+}
+
+int main(void)
+{
+ struct mem_region *r;
+ unsigned int i;
+ void *buf;
+
+ /* Use malloc for the heap, so valgrind can find issues. */
+ skiboot_heap.start = (long)real_malloc(TEST_HEAP_SIZE);
+ skiboot_heap.len = TEST_HEAP_SIZE;
+ skiboot_os_reserve.len = skiboot_heap.start;
+
+ dt_root = dt_new_root("");
+ dt_add_property_cells(dt_root, "#address-cells", 2);
+ dt_add_property_cells(dt_root, "#size-cells", 2);
+
+ buf = real_malloc(1024*1024);
+ add_mem_node((unsigned long)buf, 1024*1024);
+
+ /* add pre-init reservations */
+ for (i = 0; i < ARRAY_SIZE(test_regions); i++)
+ mem_reserve_fw(test_regions[i].name,
+ test_regions[i].addr, 0x1000);
+
+ /* Now convert. */
+ mem_region_init();
+
+ /* add a post-init reservation */
+ mem_reserve_fw("test.4", 0x5000, 0x1000);
+
+ /* release unused */
+ mem_region_release_unused();
+
+ /* and create reservations */
+ mem_region_add_dt_reserved();
+
+ /* ensure we can't create further reservations */
+ r = new_region("test.5", 0x5000, 0x1000, NULL, REGION_RESERVED);
+ assert(!add_region(r));
+
+ /* check old property-style reservations */
+ check_property_reservations();
+
+ /* and new node-style reservations */
+ check_node_reservations();
+
+ dt_free(dt_root);
+ real_free(buf);
+ real_free((void *)(long)skiboot_heap.start);
+ return 0;
+}
diff --git a/roms/skiboot/core/test/run-msg.c b/roms/skiboot/core/test/run-msg.c
new file mode 100644
index 000000000..3659a12d7
--- /dev/null
+++ b/roms/skiboot/core/test/run-msg.c
@@ -0,0 +1,281 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Copyright 2013-2019 IBM Corp.
+ */
+
+#include <inttypes.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <assert.h>
+#include <errno.h>
+#include <stdlib.h>
+
+static bool zalloc_should_fail = false;
+static int zalloc_should_fail_after = 0;
+
+/* Fake top_of_ram -- needed for APIs */
+unsigned long top_of_ram = 0xffffffffffffffffULL;
+
+static void *zalloc(size_t size)
+{
+ if (zalloc_should_fail && zalloc_should_fail_after == 0) {
+ errno = ENOMEM;
+ return NULL;
+ }
+ if (zalloc_should_fail_after > 0)
+ zalloc_should_fail_after--;
+
+ return calloc(size, 1);
+}
+
+#include "../opal-msg.c"
+#include <skiboot.h>
+
+void lock_caller(struct lock *l, const char *caller)
+{
+ (void)caller;
+ assert(!l->lock_val);
+ l->lock_val = 1;
+}
+
+void unlock(struct lock *l)
+{
+ assert(l->lock_val);
+ l->lock_val = 0;
+}
+
+void opal_update_pending_evt(uint64_t evt_mask, uint64_t evt_values)
+{
+ (void)evt_mask;
+ (void)evt_values;
+}
+
+static long magic = 8097883813087437089UL;
+static void callback(void *data, int status)
+{
+ assert((status == OPAL_SUCCESS || status == OPAL_PARTIAL));
+ assert(*(uint64_t *)data == magic);
+}
+
+static size_t list_count(struct list_head *list)
+{
+ size_t count = 0;
+ struct opal_msg_entry *dummy;
+
+ list_for_each(list, dummy, link)
+ count++;
+ return count;
+}
+
+int main(void)
+{
+ struct opal_msg_entry* entry;
+ int free_size = OPAL_MAX_MSGS;
+ int nfree = free_size;
+ int npending = 0;
+ int r;
+ static struct opal_msg m;
+ uint64_t *m_ptr = (uint64_t *)&m;
+
+ zalloc_should_fail = true;
+ zalloc_should_fail_after = 3;
+ opal_init_msg();
+
+ zalloc_should_fail = false;
+ opal_init_msg();
+
+ assert(list_count(&msg_pending_list) == npending);
+ assert(list_count(&msg_free_list) == nfree);
+
+ /* Callback. */
+ r = opal_queue_msg(0, &magic, callback, (u64)0, (u64)1, (u64)2);
+ assert(r == 0);
+
+ assert(list_count(&msg_pending_list) == ++npending);
+ assert(list_count(&msg_free_list) == --nfree);
+
+ r = opal_get_msg(m_ptr, sizeof(m));
+ assert(r == 0);
+
+ assert(m.params[0] == 0);
+ assert(m.params[1] == 1);
+ assert(m.params[2] == 2);
+
+ assert(list_count(&msg_pending_list) == --npending);
+ assert(list_count(&msg_free_list) == ++nfree);
+
+ /* No params. */
+ r = opal_queue_msg(0, NULL, NULL);
+ assert(r == 0);
+
+ assert(list_count(&msg_pending_list) == ++npending);
+ assert(list_count(&msg_free_list) == --nfree);
+
+ r = opal_get_msg(m_ptr, sizeof(m));
+ assert(r == 0);
+
+ assert(list_count(&msg_pending_list) == --npending);
+ assert(list_count(&msg_free_list) == ++nfree);
+
+ /* > 8 params (> ARRAY_SIZE(entry->msg.params)) */
+ r = opal_queue_msg(0, NULL, NULL, 0, 1, 2, 3, 4, 5, 6, 7, 0xBADDA7A);
+ assert(r == 0);
+
+ assert(list_count(&msg_pending_list) == ++npending);
+ assert(list_count(&msg_free_list) == nfree);
+
+ r = opal_get_msg(m_ptr, sizeof(m));
+ assert(r == OPAL_PARTIAL);
+
+ assert(list_count(&msg_pending_list) == --npending);
+ assert(list_count(&msg_free_list) == nfree);
+
+ /* Return OPAL_PARTIAL to callback */
+ r = opal_queue_msg(0, &magic, callback, 0, 1, 2, 3, 4, 5, 6, 7, 0xBADDA7A);
+ assert(r == 0);
+
+ assert(list_count(&msg_pending_list) == ++npending);
+ assert(list_count(&msg_free_list) == nfree);
+
+ r = opal_get_msg(m_ptr, sizeof(m));
+ assert(r == OPAL_PARTIAL);
+
+ assert(list_count(&msg_pending_list) == --npending);
+ assert(list_count(&msg_free_list) == nfree);
+
+ /* return OPAL_PARAMETER */
+ r = _opal_queue_msg(0, NULL, NULL, OPAL_MSG_SIZE, m_ptr);
+ assert(r == OPAL_PARAMETER);
+
+ assert(m.params[0] == 0);
+ assert(m.params[1] == 1);
+ assert(m.params[2] == 2);
+ assert(m.params[3] == 3);
+ assert(m.params[4] == 4);
+ assert(m.params[5] == 5);
+ assert(m.params[6] == 6);
+ assert(m.params[7] == 7);
+
+ /* 8 params (== ARRAY_SIZE(entry->msg.params)) */
+ r = opal_queue_msg(0, NULL, NULL, 0, 10, 20, 30, 40, 50, 60, 70);
+ assert(r == 0);
+
+ assert(list_count(&msg_pending_list) == ++npending);
+ assert(list_count(&msg_free_list) == --nfree);
+
+ r = opal_get_msg(m_ptr, sizeof(m));
+ assert(r == 0);
+
+ assert(list_count(&msg_pending_list) == --npending);
+ assert(list_count(&msg_free_list) == ++nfree);
+
+ assert(m.params[0] == 0);
+ assert(m.params[1] == 10);
+ assert(m.params[2] == 20);
+ assert(m.params[3] == 30);
+ assert(m.params[4] == 40);
+ assert(m.params[5] == 50);
+ assert(m.params[6] == 60);
+ assert(m.params[7] == 70);
+
+ /* Full list (no free nodes in pending). */
+ while (nfree > 0) {
+ r = opal_queue_msg(OPAL_MSG_ASYNC_COMP, NULL, NULL);
+ assert(r == 0);
+ assert(list_count(&msg_pending_list) == ++npending);
+ assert(list_count(&msg_free_list) == --nfree);
+ }
+ assert(list_count(&msg_free_list) == 0);
+ assert(nfree == 0);
+ assert(npending == OPAL_MAX_MSGS);
+
+ r = opal_queue_msg(OPAL_MSG_ASYNC_COMP, NULL, NULL);
+ assert(r == 0);
+
+ assert(list_count(&msg_pending_list) == OPAL_MAX_MSGS+1);
+ assert(list_count(&msg_pending_list) == ++npending);
+ assert(list_count(&msg_free_list) == nfree);
+
+ /* Make zalloc fail to test error handling. */
+ zalloc_should_fail = true;
+ r = opal_queue_msg(OPAL_MSG_ASYNC_COMP, NULL, NULL);
+ assert(r == OPAL_RESOURCE);
+
+ assert(list_count(&msg_pending_list) == OPAL_MAX_MSGS+1);
+ assert(list_count(&msg_pending_list) == npending);
+ assert(list_count(&msg_free_list) == nfree);
+
+ /* Empty list (no nodes). */
+ while(!list_empty(&msg_pending_list)) {
+ r = opal_get_msg(m_ptr, sizeof(m));
+ assert(r == 0);
+ npending--;
+ nfree++;
+ }
+ assert(list_count(&msg_pending_list) == npending);
+ assert(list_count(&msg_free_list) == nfree);
+ assert(npending == 0);
+ assert(nfree == OPAL_MAX_MSGS+1);
+
+ r = opal_queue_msg(OPAL_MSG_ASYNC_COMP, NULL, NULL);
+ assert(r == 0);
+
+ assert(list_count(&msg_pending_list) == ++npending);
+ assert(list_count(&msg_free_list) == --nfree);
+
+ /* Request invalid size. */
+ r = opal_get_msg(m_ptr, sizeof(m) - 1);
+ assert(r == OPAL_PARAMETER);
+
+ /* Pass null buffer. */
+ r = opal_get_msg(NULL, sizeof(m));
+ assert(r == OPAL_PARAMETER);
+
+ /* Get msg when none are pending. */
+ r = opal_get_msg(m_ptr, sizeof(m));
+ assert(r == 0);
+
+ r = opal_get_msg(m_ptr, sizeof(m));
+ assert(r == OPAL_RESOURCE);
+
+#define test_queue_num(type, val) \
+ r = opal_queue_msg(0, NULL, NULL, \
+ (type)val, (type)val, (type)val, (type)val, \
+ (type)val, (type)val, (type)val, (type)val); \
+ assert(r == 0); \
+ opal_get_msg(m_ptr, sizeof(m)); \
+ assert(r == OPAL_SUCCESS); \
+ assert(m.params[0] == (type)val); \
+ assert(m.params[1] == (type)val); \
+ assert(m.params[2] == (type)val); \
+ assert(m.params[3] == (type)val); \
+ assert(m.params[4] == (type)val); \
+ assert(m.params[5] == (type)val); \
+ assert(m.params[6] == (type)val); \
+ assert(m.params[7] == (type)val)
+
+ /* Test types of various widths */
+ test_queue_num(u64, -1);
+ test_queue_num(s64, -1);
+ test_queue_num(u32, -1);
+ test_queue_num(s32, -1);
+ test_queue_num(u16, -1);
+ test_queue_num(s16, -1);
+ test_queue_num(u8, -1);
+ test_queue_num(s8, -1);
+
+ /* Clean up the list to keep valgrind happy. */
+ while(!list_empty(&msg_free_list)) {
+ entry = list_pop(&msg_free_list, struct opal_msg_entry, link);
+ assert(entry);
+ free(entry);
+ }
+
+ while(!list_empty(&msg_pending_list)) {
+ entry = list_pop(&msg_pending_list, struct opal_msg_entry, link);
+ assert(entry);
+ free(entry);
+ }
+
+ return 0;
+}
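
The test above leans on opal_msg carrying at most eight 64-bit parameters: anything beyond that is dropped and both the reader and the completion callback see OPAL_PARTIAL. Below is a minimal, self-contained sketch of that truncation behaviour; the struct and function names are made up for illustration and are not skiboot's API.

	/* Illustrative sketch only: a hypothetical fixed-slot message,
	 * mimicking the "at most 8 params, else partial" behaviour the
	 * test above exercises. Names are invented, not skiboot's. */
	#include <assert.h>
	#include <stdarg.h>
	#include <stdint.h>
	#include <stdio.h>

	#define SKETCH_MAX_PARAMS 8

	struct sketch_msg {
		uint64_t params[SKETCH_MAX_PARAMS];
	};

	/* Returns 0 if everything fit, 1 ("partial") if values were dropped. */
	static int sketch_queue(struct sketch_msg *m, int nparams, ...)
	{
		va_list ap;
		int i, rc = 0;

		va_start(ap, nparams);
		for (i = 0; i < nparams; i++) {
			uint64_t v = va_arg(ap, uint64_t);
			if (i < SKETCH_MAX_PARAMS)
				m->params[i] = v;
			else
				rc = 1;	/* analogous to OPAL_PARTIAL */
		}
		va_end(ap);
		return rc;
	}

	int main(void)
	{
		struct sketch_msg m = { { 0 } };

		/* Nine values: the ninth is dropped, caller sees "partial". */
		assert(sketch_queue(&m, 9, (uint64_t)0, (uint64_t)1, (uint64_t)2,
				    (uint64_t)3, (uint64_t)4, (uint64_t)5,
				    (uint64_t)6, (uint64_t)7,
				    (uint64_t)0xBADDA7A) == 1);
		assert(m.params[7] == 7);
		printf("partial delivery as expected\n");
		return 0;
	}
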
diff --git a/roms/skiboot/core/test/run-nvram-format.c b/roms/skiboot/core/test/run-nvram-format.c
new file mode 100644
index 000000000..ba286bea3
--- /dev/null
+++ b/roms/skiboot/core/test/run-nvram-format.c
@@ -0,0 +1,167 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Copyright 2013-2019 IBM Corp.
+ */
+
+#include <stdlib.h>
+
+#include "../nvram-format.c"
+
+bool nvram_wait_for_load(void)
+{
+ return true;
+}
+
+bool nvram_validate(void)
+{
+ return true;
+}
+
+bool nvram_has_loaded(void)
+{
+ return true;
+}
+
+static char *nvram_reset(void *nvram_image, int size)
+{
+ struct chrp_nvram_hdr *h = nvram_image;
+
+ /* entire partition used by one key */
+ assert(nvram_format(nvram_image, size) == 0);
+ memset((char *) h + sizeof(*h), 0, NVRAM_SIZE_FW_PRIV - sizeof(*h));
+ assert(nvram_check(nvram_image, size) == 0);
+
+ return (char *) h + sizeof(*h);
+}
+
+int main(void)
+{
+ char *nvram_image;
+ size_t sz;
+ struct chrp_nvram_hdr *h;
+ char *data;
+ const char *result;
+
+ /* 1024 bytes is too small for our NVRAM */
+ nvram_image = malloc(1024);
+ assert(nvram_format(nvram_image, 1024)!=0);
+ free(nvram_image);
+
+ /* 4096 bytes is too small for our NVRAM */
+ nvram_image = malloc(4096);
+ assert(nvram_format(nvram_image, 4096)!=0);
+ free(nvram_image);
+
+ /* 64k is too small for our NVRAM */
+ nvram_image = malloc(0x10000);
+ assert(nvram_format(nvram_image, 0x10000)!=0);
+ free(nvram_image);
+
+ /* 68k is too small for our NVRAM */
+ nvram_image = malloc(68*1024);
+ assert(nvram_format(nvram_image, 68*1024)!=0);
+ free(nvram_image);
+
+ /* 68k+16 bytes (nvram header) should generate empty free space */
+ sz = NVRAM_SIZE_COMMON + NVRAM_SIZE_FW_PRIV
+ + sizeof(struct chrp_nvram_hdr);
+ nvram_image = malloc(sz);
+ assert(nvram_format(nvram_image, sz)==0);
+ assert(nvram_check(nvram_image, sz)==0);
+ assert(nvram_image[sz-14]==0);
+ assert(nvram_image[sz-13]==1);
+ h = (struct chrp_nvram_hdr*)(&nvram_image[NVRAM_SIZE_COMMON + NVRAM_SIZE_FW_PRIV]);
+ assert(memcmp(h->name, "wwwwwwwwwwww", 12)==0);
+ free(nvram_image);
+
+ /* 128k NVRAM check */
+ nvram_image = malloc(128*1024);
+ assert(nvram_format(nvram_image, 128*1024)==0);
+ assert(nvram_check(nvram_image,128*1024)==0);
+
+ /* Now, we corrupt it */
+ nvram_image[0] = 0;
+ assert(nvram_check(nvram_image,128*1024) != 0);
+
+ /* Does our NUL checking work? */
+ assert(nvram_format(nvram_image, 128 * 1024) == 0);
+ h = (struct chrp_nvram_hdr *) nvram_image;
+ memset((char *) h + sizeof(*h), 0xFF, be16_to_cpu(h->len) * 16 - sizeof(*h));
+ assert(nvram_check(nvram_image, 128 * 1024) != 0);
+
+ assert(nvram_format(nvram_image, 128*1024)==0);
+ /* corrupt the length of the partition */
+ nvram_image[2] = 0;
+ nvram_image[3] = 0;
+ assert(nvram_check(nvram_image,128*1024) != 0);
+
+ assert(nvram_format(nvram_image, 128*1024)==0);
+ /* corrupt the length of the partition */
+ nvram_image[2] = 0;
+ nvram_image[3] = 0;
+ /* but reset checksum! */
+ h = (struct chrp_nvram_hdr*)nvram_image;
+ h->cksum = chrp_nv_cksum(h);
+ assert(nvram_check(nvram_image,128*1024) != 0);
+
+ assert(nvram_format(nvram_image, 128*1024)==0);
+ /* make the length insanely beyond end of nvram */
+ nvram_image[2] = 42;
+ nvram_image[3] = 32;
+ /* but reset checksum! */
+ h = (struct chrp_nvram_hdr*)nvram_image;
+ h->cksum = chrp_nv_cksum(h);
+ assert(nvram_check(nvram_image,128*1024) != 0);
+
+ assert(nvram_format(nvram_image, 128*1024)==0);
+ /* remove skiboot partition */
+ nvram_image[12] = '\0';
+ /* but reset checksum! */
+ h = (struct chrp_nvram_hdr*)nvram_image;
+ h->cksum = chrp_nv_cksum(h);
+ assert(nvram_check(nvram_image,128*1024) != 0);
+
+ assert(nvram_format(nvram_image, 128*1024)==0);
+ /* remove common partition */
+ nvram_image[NVRAM_SIZE_FW_PRIV+5] = '\0';
+ /* but reset checksum! */
+ h = (struct chrp_nvram_hdr*)(&nvram_image[NVRAM_SIZE_FW_PRIV]);
+ h->cksum = chrp_nv_cksum(h);
+ assert(nvram_check(nvram_image,128*1024) != 0);
+
+ /* test nvram_query() */
+
+ /* does an empty partition break us? */
+ data = nvram_reset(nvram_image, 128*1024);
+ assert(nvram_query_safe("test") == NULL);
+
+ /* does a zero length key break us? */
+ data = nvram_reset(nvram_image, 128*1024);
+ data[0] = '=';
+ assert(nvram_query_safe("test") == NULL);
+
+ /* does a missing = break us? */
+ data = nvram_reset(nvram_image, 128*1024);
+ data[0] = 'a';
+ assert(nvram_query_safe("test") == NULL);
+
+ /* does an empty value break us? */
+ data = nvram_reset(nvram_image, 128*1024);
+ data[0] = 'a';
+ data[1] = '=';
+ result = nvram_query_safe("a");
+ assert(result);
+ assert(strlen(result) == 0);
+
+ /* do we trip over malformed keys? */
+ data = nvram_reset(nvram_image, 128*1024);
+#define TEST_1 "a\0a=\0test=test\0"
+ memcpy(data, TEST_1, sizeof(TEST_1));
+ result = nvram_query_safe("test");
+ assert(result);
+ assert(strcmp(result, "test") == 0);
+
+ free(nvram_image);
+
+ return 0;
+}
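
The corruption cases above poke fixed byte offsets of the CHRP partition header: offset 0 for the signature, bytes 2-3 for the big-endian length counted in 16-byte blocks, and the 12-byte name field starting at offset 4, with the checksum recomputed after each change. The sketch below records the layout those offsets imply; the field names are my assumptions drawn from the test, not copied from skiboot's header.

	/* Sketch of the 16-byte CHRP NVRAM partition header as implied by the
	 * test above; field names are assumptions, not skiboot's definitions. */
	#include <stdint.h>

	struct sketch_chrp_hdr {
		uint8_t  sig;       /* offset 0: nvram_image[0] = 0 corrupts this   */
		uint8_t  cksum;     /* offset 1: recomputed after each corruption   */
		uint16_t len_be;    /* offsets 2-3: big-endian length, 16B blocks   */
		char     name[12];  /* offsets 4-15: partition name                 */
	};

	/* Hence "be16_to_cpu(h->len) * 16" above yields the partition size in
	 * bytes, and zeroing nvram_image[12] truncates the name mid-string. */
	_Static_assert(sizeof(struct sketch_chrp_hdr) == 16, "header is 16 bytes");
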
diff --git a/roms/skiboot/core/test/run-pci-quirk.c b/roms/skiboot/core/test/run-pci-quirk.c
new file mode 100644
index 000000000..fd4d95c10
--- /dev/null
+++ b/roms/skiboot/core/test/run-pci-quirk.c
@@ -0,0 +1,98 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Copyright 2018 IBM Corp
+ */
+
+#include <assert.h>
+#include <stdint.h>
+#include <compiler.h>
+#include <stdbool.h>
+
+/* Stubs for quirk_astbmc_vga() */
+
+struct dt_property;
+struct dt_node;
+
+static struct bmc_platform fake_bmc;
+const struct bmc_platform *bmc_platform = &fake_bmc;
+
+static int ast_sio_is_enabled(void)
+{
+ return 0;
+}
+
+static uint32_t ast_ahb_readl(uint32_t reg)
+{
+ return reg;
+}
+
+static struct dt_property *__dt_add_property_cells(
+ struct dt_node *node __unused, const char *name __unused,
+ int count __unused, ...)
+{
+ return (void *)0;
+}
+
+struct pci_device;
+struct pci_cfg_reg_filter;
+typedef int64_t (*pci_cfg_reg_func)(void *dev,
+ struct pci_cfg_reg_filter *pcrf,
+ uint32_t offset, uint32_t len,
+ uint32_t *data, bool write);
+
+
+static struct pci_cfg_reg_filter *pci_add_cfg_reg_filter(
+ struct pci_device *pd __unused,
+ uint32_t start __unused,
+ uint32_t len __unused,
+ uint32_t flags __unused,
+ pci_cfg_reg_func func __unused)
+{
+ return NULL;
+}
+
+#include "../pci-quirk.c"
+
+struct pci_device test_pd;
+int test_fixup_ran;
+
+static void test_fixup(struct phb *phb __unused, struct pci_device *pd __unused)
+{
+ assert(PCI_VENDOR_ID(pd->vdid) == 0x1a03);
+ assert(PCI_DEVICE_ID(pd->vdid) == 0x2000);
+ test_fixup_ran = 1;
+}
+
+/* Quirks are: {vendor ID, device ID (or PCI_ANY_ID), fixup function} */
+static const struct pci_quirk test_quirk_table[] = {
+ /* ASPEED 2400 VGA device */
+ { 0x1a03, 0x2000, &test_fixup },
+ { 0, 0, NULL }
+};
+
+#define PCI_COMPOSE_VDID(vendor, device) (((device) << 16) | (vendor))
+
+int main(void)
+{
+ /* Unrecognised vendor and device ID */
+ test_pd.vdid = PCI_COMPOSE_VDID(0xabcd, 0xef01);
+ __pci_handle_quirk(NULL, &test_pd, test_quirk_table);
+ assert(test_fixup_ran == 0);
+
+ /* Unrecognised vendor ID, matching device ID */
+ test_pd.vdid = PCI_COMPOSE_VDID(0xabcd, 0x2000);
+ __pci_handle_quirk(NULL, &test_pd, test_quirk_table);
+ assert(test_fixup_ran == 0);
+
+ /* Matching vendor ID, unrecognised device ID */
+ test_pd.vdid = PCI_COMPOSE_VDID(0x1a03, 0xef01);
+ __pci_handle_quirk(NULL, &test_pd, test_quirk_table);
+ assert(test_fixup_ran == 0);
+
+ /* Matching vendor and device ID */
+ test_pd.vdid = PCI_COMPOSE_VDID(0x1a03, 0x2000);
+ __pci_handle_quirk(NULL, &test_pd, test_quirk_table);
+ assert(test_fixup_ran == 1);
+
+ return 0;
+}
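
__pci_handle_quirk() (pulled in above from ../pci-quirk.c) walks the quirk table and runs a fixup only when the vendor ID matches and the device ID either matches or is the PCI_ANY_ID wildcard, which is exactly what the four cases above probe. Here is a stand-alone re-implementation of that matching rule for illustration; the names and the ANY_ID value are hypothetical, not skiboot's.

	/* Illustrative re-implementation of the vendor/device matching the
	 * test exercises; struct and function names are hypothetical. */
	#include <assert.h>
	#include <stdint.h>

	#define ANY_ID 0xffff

	struct sketch_quirk {
		uint16_t vendor;
		uint16_t device;	/* or ANY_ID as a wildcard */
		void (*fixup)(void);
	};

	static int ran;
	static void fixup_ast_vga(void) { ran = 1; }

	static const struct sketch_quirk table[] = {
		{ 0x1a03, 0x2000, fixup_ast_vga },
		{ 0, 0, 0 },
	};

	/* vdid layout follows the test: (device << 16) | vendor */
	static void handle_quirk(uint32_t vdid, const struct sketch_quirk *q)
	{
		uint16_t vendor = vdid & 0xffff;
		uint16_t device = vdid >> 16;

		for (; q->fixup; q++)
			if (q->vendor == vendor &&
			    (q->device == device || q->device == ANY_ID))
				q->fixup();
	}

	int main(void)
	{
		handle_quirk(0xef011a03u, table);	/* device mismatch: no fixup */
		assert(ran == 0);
		handle_quirk(0x20001a03u, table);	/* full match: fixup runs */
		assert(ran == 1);
		return 0;
	}
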
diff --git a/roms/skiboot/core/test/run-pel.c b/roms/skiboot/core/test/run-pel.c
new file mode 100644
index 000000000..812c8996c
--- /dev/null
+++ b/roms/skiboot/core/test/run-pel.c
@@ -0,0 +1,120 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Test for our PEL record generation. Currently this doesn't actually
+ * test that the records we generate are correct, but it at least lets
+ * us run valgrind over the generation routines to check for buffer
+ * overflows, etc.
+ *
+ * Copyright 2013-2016 IBM Corp.
+ */
+
+#include <skiboot.h>
+#include <inttypes.h>
+#include <assert.h>
+#include <pel.h>
+#include <errorlog.h>
+#include <device.h>
+
+#define TEST_ERROR 0x1234
+#define TEST_SUBSYS 0x5678
+
+DEFINE_LOG_ENTRY(TEST_ERROR, OPAL_PLATFORM_ERR_EVT, TEST_SUBSYS,
+ OPAL_PLATFORM_FIRMWARE, OPAL_INFO,
+ OPAL_NA);
+
+/* Override this for testing. */
+#define is_rodata(p) fake_is_rodata(p)
+
+char __rodata_start[16];
+#define __rodata_end (__rodata_start + sizeof(__rodata_start))
+
+static inline bool fake_is_rodata(const void *p)
+{
+ return ((char *)p >= __rodata_start && (char *)p < __rodata_end);
+}
+
+#define zalloc(bytes) calloc((bytes), 1)
+
+#include "../device.c"
+#include "../pel.c"
+
+struct dt_node *dt_root = NULL;
+char dt_prop[] = "DUMMY DT PROP";
+
+int rtc_cache_get_datetime(uint32_t *year_month_day,
+ uint64_t *hour_minute_second_millisecond)
+{
+ *year_month_day = 0;
+ *hour_minute_second_millisecond = 0;
+
+ return 0;
+}
+
+int main(void)
+{
+ char *pel_buf;
+ size_t size;
+ struct errorlog *elog;
+ struct opal_err_info *opal_err_info = &err_TEST_ERROR;
+ char *buffer;
+ struct elog_user_data_section *tmp;
+
+ dt_root = dt_new_root("");
+ dt_add_property_string(dt_root, "model", "run-pel-unittest");
+
+ elog = malloc(sizeof(struct errorlog));
+ pel_buf = malloc(PEL_MIN_SIZE + 4);
+ assert(elog);
+ assert(pel_buf);
+
+ memset(elog, 0, sizeof(struct errorlog));
+
+ elog->error_event_type = opal_err_info->err_type;
+ elog->component_id = opal_err_info->cmp_id;
+ elog->subsystem_id = opal_err_info->subsystem;
+ elog->event_severity = opal_err_info->sev;
+ elog->event_subtype = opal_err_info->event_subtype;
+ elog->reason_code = opal_err_info->reason_code;
+ elog->elog_origin = ORG_SAPPHIRE;
+
+ size = pel_size(elog);
+
+ printf("Test buffer too small: ");
+ assert(0 == create_pel_log(elog, NULL, size - 1));
+
+ assert(size <= PEL_MIN_SIZE + 4);
+ assert(size == create_pel_log(elog, pel_buf, size));
+
+ memset(elog, 0, sizeof(struct errorlog));
+
+ elog->error_event_type = opal_err_info->err_type;
+ elog->component_id = opal_err_info->cmp_id;
+ elog->subsystem_id = opal_err_info->subsystem;
+ elog->event_severity = opal_err_info->sev;
+ elog->event_subtype = opal_err_info->event_subtype;
+ elog->reason_code = opal_err_info->reason_code;
+ elog->elog_origin = ORG_SAPPHIRE;
+
+ size = pel_size(elog);
+ pel_buf = realloc(pel_buf, size);
+ assert(pel_buf);
+
+ buffer = elog->user_data_dump + elog->user_section_size;
+ tmp = (struct elog_user_data_section *)buffer;
+ tmp->tag = OPAL_ELOG_SEC_DESC; /* ASCII of DESC */
+ tmp->size = size + sizeof(struct elog_user_data_section) - 1;
+ strcpy(tmp->data_dump, "Hello World!");
+ elog->user_section_size += tmp->size;
+ elog->user_section_count++;
+
+ size = pel_size(elog);
+ pel_buf = realloc(pel_buf, size);
+ assert(pel_buf);
+
+ assert(size == create_pel_log(elog, pel_buf, size));
+
+ free(pel_buf);
+ free(elog);
+
+ return 0;
+}
diff --git a/roms/skiboot/core/test/run-pool.c b/roms/skiboot/core/test/run-pool.c
new file mode 100644
index 000000000..e1c3843ff
--- /dev/null
+++ b/roms/skiboot/core/test/run-pool.c
@@ -0,0 +1,59 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Copyright 2014 IBM Corp
+ */
+
+#include <pool.h>
+
+#include "../pool.c"
+
+#define POOL_OBJ_COUNT 10
+#define POOL_RESERVED_COUNT 2
+#define POOL_NORMAL_COUNT (POOL_OBJ_COUNT - POOL_RESERVED_COUNT)
+
+struct test_object
+{
+ int a;
+ int b;
+ int c;
+};
+
+int main(void)
+{
+ int i, count = 0;
+ struct pool pool;
+ struct test_object *a[POOL_OBJ_COUNT];
+
+ assert(!pool_init(&pool, sizeof(struct test_object), POOL_OBJ_COUNT,
+ POOL_RESERVED_COUNT));
+
+ a[0] = pool_get(&pool, POOL_NORMAL);
+ assert(a[0]);
+ pool_free_object(&pool, a[0]);
+
+ for(i = 0; i < POOL_NORMAL_COUNT; i++)
+ {
+ a[i] = pool_get(&pool, POOL_NORMAL);
+ if (a[i])
+ count++;
+ }
+ assert(count == POOL_NORMAL_COUNT);
+
+ /* Normal pool should be exhausted */
+ assert(!pool_get(&pool, POOL_NORMAL));
+
+ /* Reserved pool should still be available */
+ a[POOL_NORMAL_COUNT] = pool_get(&pool, POOL_HIGH);
+ assert(a[POOL_NORMAL_COUNT]);
+ a[POOL_NORMAL_COUNT + 1] = pool_get(&pool, POOL_HIGH);
+ assert(a[POOL_NORMAL_COUNT + 1]);
+
+ pool_free_object(&pool, a[3]);
+
+ /* Should be a free object to get now */
+ a[3] = pool_get(&pool, POOL_HIGH);
+ assert(a[3]);
+
+ /* This exits depending on whether all tests passed */
+ return 0;
+}
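
pool_init() above carves POOL_OBJ_COUNT fixed-size objects into a normal tier and a reserved tier: POOL_NORMAL requests fail once the first POOL_OBJ_COUNT - POOL_RESERVED_COUNT objects are gone, while POOL_HIGH requests may still dip into the reserve. The toy re-implementation below shows only that policy; it is not skiboot's pool code and the names are invented.

	/* Toy two-tier pool illustrating the reserve policy the test checks;
	 * not skiboot's implementation. */
	#include <assert.h>
	#include <stdbool.h>

	#define TOTAL    10
	#define RESERVED 2

	struct toy_pool {
		int free;	/* objects currently unallocated */
	};

	/* "high" callers may consume the reserve, "normal" callers may not. */
	static bool toy_get(struct toy_pool *p, bool high)
	{
		if (p->free <= (high ? 0 : RESERVED))
			return false;
		p->free--;
		return true;
	}

	static void toy_put(struct toy_pool *p) { p->free++; }

	int main(void)
	{
		struct toy_pool p = { .free = TOTAL };
		int i;

		for (i = 0; i < TOTAL - RESERVED; i++)
			assert(toy_get(&p, false));
		assert(!toy_get(&p, false));	/* normal tier exhausted */
		assert(toy_get(&p, true));	/* reserve still available */
		toy_put(&p);
		assert(toy_get(&p, true));
		return 0;
	}
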
diff --git a/roms/skiboot/core/test/run-time-utils.c b/roms/skiboot/core/test/run-time-utils.c
new file mode 100644
index 000000000..04723dd61
--- /dev/null
+++ b/roms/skiboot/core/test/run-time-utils.c
@@ -0,0 +1,52 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Copyright 2015-2017 IBM Corp.
+ */
+
+#include <config.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <stdarg.h>
+#include <stdio.h>
+
+#define __TEST__
+
+#include "../time-utils.c"
+
+int main(void)
+{
+ struct tm *t = malloc(sizeof(struct tm));
+ uint32_t *ymd = malloc(sizeof(uint32_t));
+ uint64_t *hms = malloc(sizeof(uint64_t));
+
+ t->tm_year = 1982;
+ t->tm_mon = 0;
+ t->tm_mday = 29;
+ t->tm_hour = 7;
+ t->tm_min = 42;
+ t->tm_sec = 24;
+
+ tm_to_datetime(t, ymd, hms);
+
+ assert(*ymd == 0x19820129);
+ assert(*hms == 0x742240000000000ULL);
+
+ memset(t, 0, sizeof(struct tm));
+
+ *ymd = 0x19760412;
+
+ datetime_to_tm(*ymd, *hms, t);
+ assert(t->tm_year == 1976);
+ assert(t->tm_mon == 03);
+ assert(t->tm_mday == 12);
+ assert(t->tm_hour == 7);
+ assert(t->tm_min == 42);
+ assert(t->tm_sec == 24);
+
+ free(t);
+ free(ymd);
+ free(hms);
+ return 0;
+}
+
diff --git a/roms/skiboot/core/test/run-timebase.c b/roms/skiboot/core/test/run-timebase.c
new file mode 100644
index 000000000..a613609a0
--- /dev/null
+++ b/roms/skiboot/core/test/run-timebase.c
@@ -0,0 +1,47 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Copyright 2015-2016 IBM Corp.
+ */
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <assert.h>
+
+#define __TEST__
+#include <timebase.h>
+
+unsigned long tb_hz = 512000000;
+
+int main(void)
+{
+ /* This is a fairly solid assumption: the math we're doing
+ * is based on tb_hz being exactly 512MHz.
+ * If we do start doing the math with a different tb_hz, you
+ * probably want to go and audit every bit of code that touches
+ * the timebase to count/delay things.
+ */
+ assert(tb_hz == 512000000);
+ assert(secs_to_tb(1) == tb_hz);
+ assert(secs_to_tb(2) == 1024000000);
+ assert(secs_to_tb(10) == 5120000000);
+ assert(tb_to_secs(512000000) == 1);
+ assert(tb_to_secs(5120000000) == 10);
+ assert(tb_to_secs(1024000000) == 2);
+
+ assert(msecs_to_tb(1) == 512000);
+ assert(msecs_to_tb(100) == 51200000);
+ assert(msecs_to_tb(5) == 2560000);
+ assert(tb_to_msecs(512000) == 1);
+
+ assert(usecs_to_tb(5) == 2560);
+ assert(tb_to_usecs(2560) == 5);
+ assert(usecs_to_tb(5)*1000 == msecs_to_tb(5));
+ assert(tb_to_usecs(512000) == 1000);
+
+ assert(tb_compare(msecs_to_tb(5), usecs_to_tb(5)) == TB_AAFTERB);
+ assert(tb_compare(msecs_to_tb(5), usecs_to_tb(50000)) == TB_ABEFOREB);
+ assert(tb_compare(msecs_to_tb(5), usecs_to_tb(5)*1000) == TB_AEQUALB);
+
+ return 0;
+}
diff --git a/roms/skiboot/core/test/run-timer.c b/roms/skiboot/core/test/run-timer.c
new file mode 100644
index 000000000..8f8b20ed3
--- /dev/null
+++ b/roms/skiboot/core/test/run-timer.c
@@ -0,0 +1,84 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Copyright 2014-2018 IBM Corp
+ */
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+#define __TEST__
+#include <timer.h>
+#include <skiboot.h>
+
+#define mftb() (stamp)
+#define sync()
+#define smt_lowest()
+#define smt_medium()
+
+enum proc_gen proc_gen = proc_gen_unknown;
+
+static uint64_t stamp, last;
+struct lock;
+static inline void lock_caller(struct lock *l, const char *caller)
+{
+ (void)caller;
+ (void)l;
+}
+static inline void unlock(struct lock *l) { (void)l; }
+
+unsigned long tb_hz = 512000000;
+
+#include "../timer.c"
+
+#define NUM_TIMERS 100
+
+static struct timer timers[NUM_TIMERS];
+static unsigned int rand_shift, count;
+
+static void init_rand(void)
+{
+ unsigned long max = RAND_MAX;
+
+ /* Get something reasonably small */
+ while(max > 0x10000) {
+ rand_shift++;
+ max >>= 1;
+ }
+}
+
+static void expiry(struct timer *t, void *data, uint64_t now)
+{
+ (void)data;
+ (void)now;
+ assert(t->target >= last);
+ count--;
+}
+
+void p8_sbe_update_timer_expiry(uint64_t new_target)
+{
+ (void)new_target;
+ /* FIXME: do interesting SLW timer sim */
+}
+
+void p9_sbe_update_timer_expiry(uint64_t new_target)
+{
+ (void)new_target;
+}
+
+int main(void)
+{
+ unsigned int i;
+
+ init_rand();
+ for (i = 0; i < NUM_TIMERS; i++) {
+ init_timer(&timers[i], expiry, NULL);
+ schedule_timer(&timers[i], random() >> rand_shift);
+ }
+ count = NUM_TIMERS;
+ while(count) {
+ check_timers(false);
+ stamp++;
+ }
+ return 0;
+}
diff --git a/roms/skiboot/core/test/run-trace.c b/roms/skiboot/core/test/run-trace.c
new file mode 100644
index 000000000..88b090358
--- /dev/null
+++ b/roms/skiboot/core/test/run-trace.c
@@ -0,0 +1,397 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Copyright 2013-2019 IBM Corp.
+ */
+
+#include <config.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <sched.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <stdbool.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+
+#include <skiboot-valgrind.h>
+
+/* Don't include these: PPC-specific */
+#define __CPU_H
+#define __TIME_H
+#define __PROCESSOR_H
+
+#if defined(__i386__) || defined(__x86_64__)
+/* This is more than a lwsync, but it'll work */
+static void full_barrier(void)
+{
+ asm volatile("mfence" : : : "memory");
+}
+#define lwsync full_barrier
+#elif defined(__powerpc__) || defined(__powerpc64__)
+static inline void lwsync(void)
+{
+ asm volatile("lwsync" : : : "memory");
+}
+#else
+#error "Define lwsync for this arch"
+#endif
+
+#define zalloc(size) calloc((size), 1)
+
+struct cpu_thread {
+ uint32_t pir;
+ uint32_t chip_id;
+ struct trace_info *trace;
+ uint32_t server_no;
+ bool is_secondary;
+ struct cpu_thread *primary;
+};
+static struct cpu_thread *this_cpu(void);
+
+#define CPUS 4
+
+static struct cpu_thread fake_cpus[CPUS];
+
+static inline struct cpu_thread *next_cpu(struct cpu_thread *cpu)
+{
+ if (cpu == NULL)
+ return &fake_cpus[0];
+ cpu++;
+ if (cpu == &fake_cpus[CPUS])
+ return NULL;
+ return cpu;
+}
+
+#define first_cpu() next_cpu(NULL)
+
+#define for_each_cpu(cpu) \
+ for (cpu = first_cpu(); cpu; cpu = next_cpu(cpu))
+
+static unsigned long timestamp;
+static unsigned long mftb(void)
+{
+ return timestamp;
+}
+
+static void *local_alloc(unsigned int chip_id,
+ size_t size, size_t align)
+{
+ void *p;
+
+ (void)chip_id;
+ if (posix_memalign(&p, align, size))
+ p = NULL;
+ return p;
+}
+
+struct dt_node;
+extern struct dt_node *opal_node;
+
+#include "../trace.c"
+
+#include "../external/trace/trace.c"
+static struct trace_reader trace_readers[CPUS];
+struct trace_reader *my_trace_reader;
+#include "../device.c"
+
+char __rodata_start[1], __rodata_end[1];
+struct dt_node *opal_node;
+struct debug_descriptor debug_descriptor = {
+ .trace_mask = -1
+};
+
+const char *nvram_query_safe(const char *key __unused)
+{
+ return NULL;
+}
+
+void lock_caller(struct lock *l, const char *caller)
+{
+ (void)caller;
+ assert(!l->lock_val);
+ l->lock_val = 1;
+}
+
+void unlock(struct lock *l)
+{
+ assert(l->lock_val);
+ l->lock_val = 0;
+}
+
+struct cpu_thread *my_fake_cpu;
+static struct cpu_thread *this_cpu(void)
+{
+ return my_fake_cpu;
+}
+
+#include <sys/mman.h>
+#define PER_CHILD_TRACES ((RUNNING_ON_VALGRIND) ? (1024*16) : (1024*1024))
+
+static void write_trace_entries(int id)
+{
+ void exit(int);
+ unsigned int i;
+ union trace trace;
+
+ timestamp = id;
+ for (i = 0; i < PER_CHILD_TRACES; i++) {
+ timestamp = i * CPUS + id;
+ assert(sizeof(trace.hdr) % 8 == 0);
+ /* First child never repeats, second repeats once, etc. */
+ trace_add(&trace, 3 + ((i / (id + 1)) % 0x40),
+ sizeof(trace.hdr));
+ }
+
+ /* Final entry has special type, so parent knows it's over. */
+ trace_add(&trace, 0x70, sizeof(trace.hdr));
+ exit(0);
+}
+
+static bool all_done(const bool done[])
+{
+ unsigned int i;
+
+ for (i = 0; i < CPUS; i++)
+ if (!done[i])
+ return false;
+ return true;
+}
+
+static void test_parallel(void)
+{
+ void *p;
+ unsigned int cpu;
+ unsigned int i, counts[CPUS] = { 0 }, overflows[CPUS] = { 0 };
+ unsigned int repeats[CPUS] = { 0 }, num_overflows[CPUS] = { 0 };
+ bool done[CPUS] = { false };
+ size_t len = sizeof(struct trace_info) + TBUF_SZ + sizeof(union trace);
+ int last = 0;
+
+ /* Use a shared mmap to test actual parallel buffers. */
+ i = (CPUS*len + getpagesize()-1)&~(getpagesize()-1);
+ p = mmap(NULL, i, PROT_READ|PROT_WRITE,
+ MAP_ANONYMOUS|MAP_SHARED, -1, 0);
+
+ for (i = 0; i < CPUS; i++) {
+ fake_cpus[i].trace = p + i * len;
+ fake_cpus[i].trace->tb.buf_size = cpu_to_be64(TBUF_SZ);
+ fake_cpus[i].trace->tb.max_size = cpu_to_be32(sizeof(union trace));
+ fake_cpus[i].is_secondary = false;
+ memset(&trace_readers[i], 0, sizeof(struct trace_reader));
+ trace_readers[i].tb = &fake_cpus[i].trace->tb;
+ }
+
+ for (i = 0; i < CPUS; i++) {
+ if (!fork()) {
+ /* Child. */
+ my_fake_cpu = &fake_cpus[i];
+ write_trace_entries(i);
+ }
+ }
+
+ while (!all_done(done)) {
+ union trace t;
+
+ for (i = 0; i < CPUS; i++) {
+ if (trace_get(&t, &trace_readers[(i+last) % CPUS]))
+ break;
+ }
+
+ if (i == CPUS) {
+ sched_yield();
+ continue;
+ }
+ i = (i + last) % CPUS;
+ last = i;
+
+ if (t.hdr.type == TRACE_OVERFLOW) {
+ /* Conveniently, each record is 16 bytes here. */
+ assert(be64_to_cpu(t.overflow.bytes_missed) % 16 == 0);
+ overflows[i] += be64_to_cpu(t.overflow.bytes_missed) / 16;
+ num_overflows[i]++;
+ continue;
+ }
+
+ assert(be16_to_cpu(t.hdr.cpu) < CPUS);
+ assert(!done[be16_to_cpu(t.hdr.cpu)]);
+ assert(be64_to_cpu(t.hdr.timestamp) % CPUS == be16_to_cpu(t.hdr.cpu));
+ if (t.hdr.type == TRACE_REPEAT) {
+ assert(t.hdr.len_div_8 * 8 == sizeof(t.repeat));
+ assert(be16_to_cpu(t.repeat.num) != 0);
+ assert(be16_to_cpu(t.repeat.num) <= be16_to_cpu(t.hdr.cpu));
+ repeats[be16_to_cpu(t.hdr.cpu)] += be16_to_cpu(t.repeat.num);
+ } else if (t.hdr.type == 0x70) {
+ cpu = be16_to_cpu(t.hdr.cpu);
+ assert(cpu < CPUS);
+ done[cpu] = true;
+ } else {
+ cpu = be16_to_cpu(t.hdr.cpu);
+ assert(cpu < CPUS);
+ counts[cpu]++;
+ }
+ }
+
+ /* Gather children. */
+ for (i = 0; i < CPUS; i++) {
+ int status;
+ wait(&status);
+ }
+
+ for (i = 0; i < CPUS; i++) {
+ printf("Child %i: %u produced, %u overflows, %llu total\n", i,
+ counts[i], overflows[i],
+ (long long)be64_to_cpu(fake_cpus[i].trace->tb.end));
+ assert(counts[i] + repeats[i] <= PER_CHILD_TRACES);
+ }
+ /* Child 0 never repeats. */
+ assert(repeats[0] == 0);
+ assert(counts[0] + overflows[0] == PER_CHILD_TRACES);
+
+ /*
+ * FIXME: Other children have some fuzz, since overflows may
+ * include repeat records we have already read. And odd-numbered
+ * overflows may include more repeat records than normal
+ * records (they alternate).
+ */
+}
+
+int main(void)
+{
+ union trace minimal;
+ union trace large;
+ union trace trace;
+ unsigned int i, j;
+
+ opal_node = dt_new_root("opal");
+ dt_new(dt_new(opal_node, "firmware"), "exports");
+ for (i = 0; i < CPUS; i++) {
+ fake_cpus[i].server_no = i;
+ fake_cpus[i].pir = i;
+ fake_cpus[i].is_secondary = (i & 0x1);
+ fake_cpus[i].primary = &fake_cpus[i & ~0x1];
+ }
+ my_fake_cpu = &fake_cpus[0];
+ my_trace_reader = &trace_readers[0];
+ init_trace_buffers();
+
+ for (i = 0; i < CPUS; i++) {
+ trace_readers[i].tb = &fake_cpus[i].trace->tb;
+ assert(trace_empty(&trace_readers[i]));
+ assert(!trace_get(&trace, &trace_readers[i]));
+ }
+
+ assert(sizeof(trace.hdr) % 8 == 0);
+ timestamp = 1;
+ trace_add(&minimal, 100, sizeof(trace.hdr));
+ assert(trace_get(&trace, my_trace_reader));
+ assert(trace.hdr.len_div_8 == minimal.hdr.len_div_8);
+ assert(be64_to_cpu(trace.hdr.timestamp) == timestamp);
+
+ /* Make it wrap once. */
+ for (i = 0; i < TBUF_SZ / (minimal.hdr.len_div_8 * 8) + 1; i++) {
+ timestamp = i;
+ trace_add(&minimal, 99 + (i%2), sizeof(trace.hdr));
+ }
+
+ assert(trace_get(&trace, my_trace_reader));
+ /* First one must be overflow marker. */
+ assert(trace.hdr.type == TRACE_OVERFLOW);
+ assert(trace.hdr.len_div_8 * 8 == sizeof(trace.overflow));
+ assert(be64_to_cpu(trace.overflow.bytes_missed) == minimal.hdr.len_div_8 * 8);
+
+ for (i = 0; i < TBUF_SZ / (minimal.hdr.len_div_8 * 8); i++) {
+ assert(trace_get(&trace, my_trace_reader));
+ assert(trace.hdr.len_div_8 == minimal.hdr.len_div_8);
+ assert(be64_to_cpu(trace.hdr.timestamp) == i+1);
+ assert(trace.hdr.type == 99 + ((i+1)%2));
+ }
+ assert(!trace_get(&trace, my_trace_reader));
+
+ /* Now put in some weird-length ones, to test overlap.
+ * Last power of 2, minus 8. */
+ for (j = 0; (1 << j) < sizeof(large); j++);
+ for (i = 0; i < TBUF_SZ; i++) {
+ timestamp = i;
+ trace_add(&large, 100 + (i%2), (1 << (j-1)));
+ }
+ assert(trace_get(&trace, my_trace_reader));
+ assert(trace.hdr.type == TRACE_OVERFLOW);
+ assert(trace_get(&trace, my_trace_reader));
+ assert(trace.hdr.len_div_8 == large.hdr.len_div_8);
+ i = be64_to_cpu(trace.hdr.timestamp);
+ while (trace_get(&trace, my_trace_reader))
+ assert(be64_to_cpu(trace.hdr.timestamp) == ++i);
+
+ /* Test repeats. */
+ for (i = 0; i < 65538; i++) {
+ timestamp = i;
+ trace_add(&minimal, 100, sizeof(trace.hdr));
+ }
+ timestamp = i;
+ trace_add(&minimal, 101, sizeof(trace.hdr));
+ timestamp = i+1;
+ trace_add(&minimal, 101, sizeof(trace.hdr));
+
+ assert(trace_get(&trace, my_trace_reader));
+ assert(trace.hdr.timestamp == 0);
+ assert(trace.hdr.len_div_8 == minimal.hdr.len_div_8);
+ assert(trace.hdr.type == 100);
+ assert(trace_get(&trace, my_trace_reader));
+ assert(trace.hdr.type == TRACE_REPEAT);
+ assert(trace.hdr.len_div_8 * 8 == sizeof(trace.repeat));
+ assert(be16_to_cpu(trace.repeat.num) == 65535);
+ assert(be64_to_cpu(trace.repeat.timestamp) == 65535);
+ assert(trace_get(&trace, my_trace_reader));
+ assert(be64_to_cpu(trace.hdr.timestamp) == 65536);
+ assert(trace.hdr.len_div_8 == minimal.hdr.len_div_8);
+ assert(trace.hdr.type == 100);
+ assert(trace_get(&trace, my_trace_reader));
+ assert(trace.hdr.type == TRACE_REPEAT);
+ assert(trace.hdr.len_div_8 * 8 == sizeof(trace.repeat));
+ assert(be16_to_cpu(trace.repeat.num) == 1);
+ assert(be64_to_cpu(trace.repeat.timestamp) == 65537);
+
+ assert(trace_get(&trace, my_trace_reader));
+ assert(be64_to_cpu(trace.hdr.timestamp) == 65538);
+ assert(trace.hdr.len_div_8 == minimal.hdr.len_div_8);
+ assert(trace.hdr.type == 101);
+ assert(trace_get(&trace, my_trace_reader));
+ assert(trace.hdr.type == TRACE_REPEAT);
+ assert(trace.hdr.len_div_8 * 8 == sizeof(trace.repeat));
+ assert(be16_to_cpu(trace.repeat.num) == 1);
+ assert(be64_to_cpu(trace.repeat.timestamp) == 65539);
+
+ /* Now, test adding repeat while we're reading... */
+ timestamp = 0;
+ trace_add(&minimal, 100, sizeof(trace.hdr));
+ assert(trace_get(&trace, my_trace_reader));
+ assert(be64_to_cpu(trace.hdr.timestamp) == 0);
+ assert(trace.hdr.len_div_8 == minimal.hdr.len_div_8);
+ assert(trace.hdr.type == 100);
+
+ for (i = 1; i < TBUF_SZ; i++) {
+ timestamp = i;
+ trace_add(&minimal, 100, sizeof(trace.hdr));
+ assert(trace_get(&trace, my_trace_reader));
+ if (i % 65536 == 0) {
+ assert(trace.hdr.type == 100);
+ assert(trace.hdr.len_div_8 == minimal.hdr.len_div_8);
+ } else {
+ assert(trace.hdr.type == TRACE_REPEAT);
+ assert(trace.hdr.len_div_8 * 8 == sizeof(trace.repeat));
+ assert(be16_to_cpu(trace.repeat.num) == 1);
+ }
+ assert(be64_to_cpu(trace.repeat.timestamp) == i);
+ assert(!trace_get(&trace, my_trace_reader));
+ }
+
+ for (i = 0; i < CPUS; i++)
+ if (!fake_cpus[i].is_secondary)
+ free(fake_cpus[i].trace);
+
+ test_parallel();
+
+ return 0;
+}
diff --git a/roms/skiboot/core/test/stubs.c b/roms/skiboot/core/test/stubs.c
new file mode 100644
index 000000000..0e97af249
--- /dev/null
+++ b/roms/skiboot/core/test/stubs.c
@@ -0,0 +1,101 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Copyright 2013-2019 IBM Corp
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdarg.h>
+#include <string.h>
+#include <stdint.h>
+
+#include <compiler.h>
+#include "../../ccan/list/list.c"
+
+void _prlog(int log_level __attribute__((unused)), const char* fmt, ...) __attribute__((format (printf, 2, 3)));
+
+#ifndef pr_fmt
+#define pr_fmt(fmt) fmt
+#endif
+#define prlog(l, f, ...) do { _prlog(l, pr_fmt(f), ##__VA_ARGS__); } while(0)
+
+void _prlog(int log_level __attribute__((unused)), const char* fmt, ...)
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+ vprintf(fmt, ap);
+ va_end(ap);
+}
+
+/* Add any stub functions required for linking here. */
+static void stub_function(void)
+{
+ abort();
+}
+
+struct cpu_thread;
+
+struct cpu_job *__cpu_queue_job(struct cpu_thread *cpu,
+ const char *name,
+ void (*func)(void *data), void *data,
+ bool no_return);
+
+void cpu_wait_job(struct cpu_job *job, bool free_it);
+void cpu_process_local_jobs(void);
+struct cpu_job *cpu_queue_job_on_node(uint32_t chip_id,
+ const char *name,
+ void (*func)(void *data), void *data);
+
+struct cpu_job *cpu_queue_job_on_node(uint32_t chip_id,
+ const char *name,
+ void (*func)(void *data), void *data)
+{
+ (void)chip_id;
+ return __cpu_queue_job(NULL, name, func, data, false);
+}
+
+struct cpu_job *__cpu_queue_job(struct cpu_thread *cpu,
+ const char *name,
+ void (*func)(void *data), void *data,
+ bool no_return)
+{
+ (void)cpu;
+ (void)name;
+ (func)(data);
+ (void)no_return;
+ return NULL;
+}
+
+void cpu_wait_job(struct cpu_job *job, bool free_it)
+{
+ (void)job;
+ (void)free_it;
+ return;
+}
+
+void cpu_process_local_jobs(void)
+{
+}
+
+#define STUB(fnname) \
+ void fnname(void) __attribute__((weak, alias ("stub_function")))
+
+STUB(fdt_begin_node);
+STUB(fdt_property);
+STUB(fdt_end_node);
+STUB(fdt_create_with_flags);
+STUB(fdt_add_reservemap_entry);
+STUB(fdt_finish_reservemap);
+STUB(fdt_strerror);
+STUB(fdt_check_header);
+STUB(fdt_check_node_offset_);
+STUB(fdt_next_tag);
+STUB(fdt_string);
+STUB(fdt_get_name);
+STUB(dt_first);
+STUB(dt_next);
+STUB(dt_has_node_property);
+STUB(dt_get_address);
+STUB(add_chip_dev_associativity);
+STUB(pci_check_clear_freeze);
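
The STUB() macro above relies on GCC's weak alias attribute: each listed symbol becomes a weak alias of stub_function(), which is just enough to satisfy the linker for code paths the unit tests never exercise, and aborts loudly if one is ever called. A stripped-down example of the same trick follows; the stubbed names are made up, and it assumes GCC or Clang on an ELF target.

	/* Minimal demonstration of the weak-alias stubbing trick used above;
	 * the stubbed function names here are invented. */
	#include <stdio.h>
	#include <stdlib.h>

	static void stub_function(void)
	{
		fprintf(stderr, "unexpected call into a stubbed function\n");
		abort();
	}

	#define STUB(fnname) \
		void fnname(void) __attribute__((weak, alias("stub_function")))

	/* The binary links even though nothing real provides these symbols. */
	STUB(fake_fdt_begin_node);
	STUB(fake_fdt_end_node);

	int main(void)
	{
		printf("linked fine; calling a stub would abort\n");
		return 0;
	}
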
diff --git a/roms/skiboot/core/time-utils.c b/roms/skiboot/core/time-utils.c
new file mode 100644
index 000000000..e948654d3
--- /dev/null
+++ b/roms/skiboot/core/time-utils.c
@@ -0,0 +1,64 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Converts an OPAL formatted datetime into a struct tm. We ignore microseconds
+ * as Linux doesn't use them anyway.
+ *
+ * | year | month | mday |
+ * +------------------------------------+
+ * | hour | minute | secs | reserved |
+ * +------------------------------------+
+ * | microseconds |
+ *
+ * Copyright 2013-2014 IBM Corp.
+ */
+
+#include <time-utils.h>
+
+void datetime_to_tm(uint32_t y_m_d, uint64_t h_m_s_m, struct tm *tm)
+{
+ uint32_t x;
+
+ tm->tm_year = bcd_byte(y_m_d, 3) * 100 + bcd_byte(y_m_d, 2);
+ tm->tm_mon = bcd_byte(y_m_d, 1) - 1;
+ tm->tm_mday = bcd_byte(y_m_d, 0);
+
+ x = h_m_s_m >> 32;
+ tm->tm_hour = bcd_byte(x, 3);
+ tm->tm_min = bcd_byte(x, 2);
+ tm->tm_sec = bcd_byte(x, 1);
+}
+
+/*
+ * The OPAL API is defined as returning a u64 of a similar
+ * format to the FSP message; the 32-bit date field is
+ * in the format:
+ *
+ * | year | month | mday |
+ *
+ * ... and the 64-bit time field is in the format
+ *
+ * | hour | minutes | secs | millisec |
+ * | -------------------------------------
+ * | millisec | reserved |
+ *
+ * We simply ignore the microseconds/milliseconds for now
+ * as I don't quite understand why the OPAL API defines that
+ * it needs 6 digits for the milliseconds :-) I suspect the
+ * doc got that wrong and it's supposed to be micro but
+ * let's ignore it.
+ *
+ * Note that Linux doesn't use nor set the ms field anyway.
+ */
+void tm_to_datetime(struct tm *tm, uint32_t *y_m_d, uint64_t *h_m_s_m)
+{
+ uint64_t h_m_s;
+ *y_m_d = int_to_bcd4(tm->tm_year) << 16 |
+ int_to_bcd2(tm->tm_mon + 1) << 8 |
+ int_to_bcd2(tm->tm_mday);
+
+ h_m_s = int_to_bcd2(tm->tm_hour) << 24 |
+ int_to_bcd2(tm->tm_min) << 16 |
+ int_to_bcd2(tm->tm_sec) << 8;
+
+ *h_m_s_m = h_m_s << 32;
+}
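
tm_to_datetime() above packs each date/time field as BCD, which is why run-time-utils.c expects 1982-01-29 07:42:24 to come back as 0x19820129 / 0x0742240000000000. The stand-alone sketch below reproduces that packing; bcd2() and bcd4() are toy stand-ins for skiboot's int_to_bcd2()/int_to_bcd4(), not the real helpers.

	/* Stand-alone sketch of the BCD packing done by tm_to_datetime();
	 * bcd2()/bcd4() are toy stand-ins for skiboot's helpers. */
	#include <assert.h>
	#include <stdint.h>
	#include <stdio.h>

	static uint32_t bcd2(uint32_t v)	/* two decimal digits -> one BCD byte */
	{
		return ((v / 10) << 4) | (v % 10);
	}

	static uint32_t bcd4(uint32_t v)	/* four decimal digits -> two BCD bytes */
	{
		return (bcd2(v / 100) << 8) | bcd2(v % 100);
	}

	int main(void)
	{
		/* 1982-01-29 07:42:24, the values used in run-time-utils.c */
		uint32_t ymd = bcd4(1982) << 16 | bcd2(1) << 8 | bcd2(29);
		uint64_t hms = (uint64_t)(bcd2(7) << 24 | bcd2(42) << 16 |
					  bcd2(24) << 8) << 32;

		assert(ymd == 0x19820129);
		assert(hms == 0x0742240000000000ULL);
		printf("ymd=0x%08x hms=0x%016llx\n", ymd, (unsigned long long)hms);
		return 0;
	}
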
diff --git a/roms/skiboot/core/timebase.c b/roms/skiboot/core/timebase.c
new file mode 100644
index 000000000..451e3710e
--- /dev/null
+++ b/roms/skiboot/core/timebase.c
@@ -0,0 +1,141 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Wait for things, by waiting for timebase to tick over
+ *
+ * Copyright 2013-2019 IBM Corp.
+ */
+
+#include <timebase.h>
+#include <opal.h>
+#include <cpu.h>
+#include <chip.h>
+#include <debug_descriptor.h>
+
+unsigned long tb_hz = 512000000;
+
+static void time_wait_poll(unsigned long duration)
+{
+ unsigned long now = mftb();
+ unsigned long end = now + duration;
+ unsigned long period = msecs_to_tb(5);
+
+ if (this_cpu()->tb_invalid) {
+ /*
+ * Run pollers to allow some backends to process response.
+ *
+ * In TOD failure case where TOD is unrecoverable, running
+ * pollers allows ipmi backend to deal with ipmi response
+ * from bmc and helps ipmi_queue_msg_sync() to get un-stuck.
+ * Thus it avoids linux kernel to hang during panic due to
+ * TOD failure.
+ */
+ opal_run_pollers();
+ cpu_relax();
+ return;
+ }
+
+ while (tb_compare(now, end) != TB_AAFTERB) {
+
+ unsigned long remaining = end - now;
+
+ /* Call pollers periodically but not continually to avoid
+ * bouncing cachelines due to lock contention. */
+ if (remaining >= period) {
+ opal_run_pollers();
+ time_wait_nopoll(period);
+ } else
+ time_wait_nopoll(remaining);
+
+ now = mftb();
+ }
+}
+
+void time_wait(unsigned long duration)
+{
+ struct cpu_thread *c = this_cpu();
+
+ if (!list_empty(&this_cpu()->locks_held)) {
+ time_wait_nopoll(duration);
+ return;
+ }
+
+ if (c != boot_cpu && opal_booting())
+ time_wait_nopoll(duration);
+ else
+ time_wait_poll(duration);
+}
+
+void time_wait_nopoll(unsigned long duration)
+{
+ if (this_cpu()->tb_invalid) {
+ cpu_relax();
+ return;
+ }
+
+ cpu_idle_delay(duration);
+}
+
+void time_wait_ms(unsigned long ms)
+{
+ time_wait(msecs_to_tb(ms));
+}
+
+void time_wait_ms_nopoll(unsigned long ms)
+{
+ time_wait_nopoll(msecs_to_tb(ms));
+}
+
+void time_wait_us(unsigned long us)
+{
+ time_wait(usecs_to_tb(us));
+}
+
+void time_wait_us_nopoll(unsigned long us)
+{
+ time_wait_nopoll(usecs_to_tb(us));
+}
+
+unsigned long timespec_to_tb(const struct timespec *ts)
+{
+ unsigned long ns;
+
+ /* First convert to ns */
+ ns = ts->tv_sec * 1000000000ul;
+ ns += ts->tv_nsec;
+
+ /*
+ * This is a very rough approximation; it works provided
+ * we never try to pass overly long delays here and the TB
+ * frequency isn't significantly lower than 512MHz.
+ *
+ * We could improve the precision by shifting less bits
+ * at the expense of capacity or do 128 bit math which
+ * I'm not eager to do :-)
+ */
+ if (chip_quirk(QUIRK_SLOW_SIM))
+ return (ns * (tb_hz >> 16)) / (1000000000ul >> 16);
+ else
+ return (ns * (tb_hz >> 24)) / (1000000000ul >> 24);
+}
+
+int nanosleep(const struct timespec *req, struct timespec *rem)
+{
+ time_wait(timespec_to_tb(req));
+
+ if (rem) {
+ rem->tv_sec = 0;
+ rem->tv_nsec = 0;
+ }
+ return 0;
+}
+
+int nanosleep_nopoll(const struct timespec *req, struct timespec *rem)
+{
+ time_wait_nopoll(timespec_to_tb(req));
+
+ if (rem) {
+ rem->tv_sec = 0;
+ rem->tv_nsec = 0;
+ }
+ return 0;
+}
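
timespec_to_tb() above trades a little precision for 64-bit-only math by pre-shifting both tb_hz and 10^9 right by 24 bits (16 in the slow-sim case). For example, a 5 ms delay at tb_hz = 512 MHz is exactly 2,560,000 ticks, while the shifted form gives 5,000,000 * (512000000 >> 24) / (1000000000 >> 24) = 5,000,000 * 30 / 59, roughly 2,542,372 ticks, i.e. under 1% short. The small check below verifies that arithmetic; it is a quick numeric sanity check, not part of skiboot.

	/* Quick numeric check of the >>24 approximation used by timespec_to_tb(). */
	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		const uint64_t tb_hz = 512000000ull;
		const uint64_t ns = 5ull * 1000 * 1000;	/* a 5 ms delay */

		/* Direct form; would need >64-bit headroom for very long delays,
		 * which is why the firmware pre-shifts instead. */
		uint64_t exact  = ns * tb_hz / 1000000000ull;
		uint64_t approx = ns * (tb_hz >> 24) / (1000000000ull >> 24);

		printf("exact=%llu approx=%llu (%.2f%% low)\n",
		       (unsigned long long)exact, (unsigned long long)approx,
		       100.0 * (double)(exact - approx) / (double)exact);
		return 0;
	}
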
diff --git a/roms/skiboot/core/timer.c b/roms/skiboot/core/timer.c
new file mode 100644
index 000000000..652ffba30
--- /dev/null
+++ b/roms/skiboot/core/timer.c
@@ -0,0 +1,298 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * run something, but later.
+ *
+ * Timers are run when the SBE timer interrupt triggers (based on us setting
+ * it) or when the regular heartbeat call from the OS occurs and there's a
+ * timer that's expired.
+ *
+ * Copyright 2014-2019 IBM Corp.
+ */
+
+#include <timer.h>
+#include <timebase.h>
+#include <lock.h>
+#include <fsp.h>
+#include <device.h>
+#include <opal.h>
+#include <sbe-p8.h>
+#include <sbe-p9.h>
+
+#ifdef __TEST__
+#define this_cpu() ((void *)-1)
+#define cpu_relax()
+#else
+#include <cpu.h>
+#endif
+
+/* Heartbeat requested from Linux */
+#define HEARTBEAT_DEFAULT_MS 200
+
+static struct lock timer_lock = LOCK_UNLOCKED;
+static LIST_HEAD(timer_list);
+static LIST_HEAD(timer_poll_list);
+static bool timer_in_poll;
+static uint64_t timer_poll_gen;
+
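+/*
+ * Arm the SBE one-shot timer to fire at 'target' (a timebase value),
+ * dispatching to the P8 or P9 SBE code depending on the processor
+ * generation.
+ */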
+static inline void update_timer_expiry(uint64_t target)
+{
+ if (proc_gen < proc_gen_p9)
+ p8_sbe_update_timer_expiry(target);
+ else
+ p9_sbe_update_timer_expiry(target);
+}
+
+void init_timer(struct timer *t, timer_func_t expiry, void *data)
+{
+ t->link.next = t->link.prev = NULL;
+ t->target = 0;
+ t->expiry = expiry;
+ t->user_data = data;
+ t->running = NULL;
+}
+
+static void __remove_timer(struct timer *t)
+{
+ list_del(&t->link);
+ t->link.next = t->link.prev = NULL;
+}
+
+static void __sync_timer(struct timer *t)
+{
+ sync();
+
+ /* Guard against re-entrancy */
+ assert(t->running != this_cpu());
+
+ while (t->running) {
+ unlock(&timer_lock);
+ smt_lowest();
+ while (t->running)
+ barrier();
+ smt_medium();
+ /* Should we call the pollers here ? */
+ lock(&timer_lock);
+ }
+}
+
+void sync_timer(struct timer *t)
+{
+ lock(&timer_lock);
+ __sync_timer(t);
+ unlock(&timer_lock);
+}
+
+void cancel_timer(struct timer *t)
+{
+ lock(&timer_lock);
+ __sync_timer(t);
+ if (t->link.next)
+ __remove_timer(t);
+ unlock(&timer_lock);
+}
+
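+/*
+ * Like cancel_timer(), but does not wait for a concurrently running
+ * expiry callback to finish; the callback may still be executing when
+ * this returns.
+ */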
+void cancel_timer_async(struct timer *t)
+{
+ lock(&timer_lock);
+ if (t->link.next)
+ __remove_timer(t);
+ unlock(&timer_lock);
+}
+
+static void __schedule_timer_at(struct timer *t, uint64_t when)
+{
+ struct timer *lt;
+
+ /* If the timer is already scheduled, take it out */
+ if (t->link.next)
+ __remove_timer(t);
+
+ /* Update target */
+ t->target = when;
+
+ if (when == TIMER_POLL) {
+ /* It's a poller, add it to the poller list */
+ t->gen = timer_poll_gen;
+ list_add_tail(&timer_poll_list, &t->link);
+ } else {
+ /* It's a real timer, add it in the right spot in the
+ * ordered timer list
+ */
+ list_for_each(&timer_list, lt, link) {
+ if (when >= lt->target)
+ continue;
+ list_add_before(&timer_list, &t->link, &lt->link);
+ goto bail;
+ }
+ list_add_tail(&timer_list, &t->link);
+ }
+ bail:
+ /* Pick up the next timer and update the SBE HW timer */
+ lt = list_top(&timer_list, struct timer, link);
+ if (lt) {
+ update_timer_expiry(lt->target);
+ }
+}
+
+void schedule_timer_at(struct timer *t, uint64_t when)
+{
+ lock(&timer_lock);
+ __schedule_timer_at(t, when);
+ unlock(&timer_lock);
+}
+
+uint64_t schedule_timer(struct timer *t, uint64_t how_long)
+{
+ uint64_t now = mftb();
+
+ if (how_long == TIMER_POLL)
+ schedule_timer_at(t, TIMER_POLL);
+ else
+ schedule_timer_at(t, now + how_long);
+
+ return now;
+}
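+
+/*
+ * Illustrative usage (the names below are made up): after
+ * init_timer(&t, my_expiry, my_data), calling
+ * schedule_timer(&t, msecs_to_tb(10)) arranges for
+ * my_expiry(&t, my_data, now) to be run from check_timers() roughly
+ * 10ms later, while schedule_timer(&t, TIMER_POLL) queues it to run
+ * on the next poll pass (poll timers typically re-schedule themselves
+ * from their expiry callback).
+ */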
+
+static void __check_poll_timers(uint64_t now)
+{
+ struct timer *t;
+
+ /* Don't call this from multiple CPUs at once */
+ if (timer_in_poll)
+ return;
+ timer_in_poll = true;
+
+ /*
+ * Poll timers might re-enqueue themselves and don't have an
+ * expiry so we can't do like normal timers and just run until
+ * we hit a wall. Instead, each timer has a generation count,
+ * which we set to the current global gen count when we schedule
+ * it and update when we run it. It will only be considered if
+ * the generation count is different from the current one. We
+ * don't compare generations as larger or smaller because, at
+ * boot, this can be called quite quickly and I want to be safe
+ * vs. wraps.
+ */
+ timer_poll_gen++;
+ for (;;) {
+ t = list_top(&timer_poll_list, struct timer, link);
+
+ /* Top timer has a different generation than the current one?
+ * It must be older; we are done.
+ */
+ if (!t || t->gen == timer_poll_gen)
+ break;
+
+ /* Top of list still running; we have to delay handling it.
+ * Reprogram the timer hardware with a small delay, arbitrarily
+ * chosen as 1us.
+ */
+ if (t->running) {
+ update_timer_expiry(now + usecs_to_tb(1));
+ break;
+ }
+
+ /* All right, first remove it and mark it running */
+ __remove_timer(t);
+ t->running = this_cpu();
+
+ /* Now we can unlock and call its expiry */
+ unlock(&timer_lock);
+ t->expiry(t, t->user_data, now);
+
+ /* Re-lock and mark not running */
+ lock(&timer_lock);
+ t->running = NULL;
+ }
+ timer_in_poll = false;
+}
+
+static void __check_timers(uint64_t now)
+{
+ struct timer *t;
+
+ for (;;) {
+ t = list_top(&timer_list, struct timer, link);
+
+ /* Top of list not expired? Then we are done. */
+ if (!t || t->target > now)
+ break;
+
+ /* Top of list still running; we have to delay handling
+ * it. For now just skip until the next poll; once we have
+ * SLW interrupts we'll probably want to trip another one
+ * ASAP.
+ */
+ if (t->running)
+ break;
+
+ /* All right, first remove it and mark it running */
+ __remove_timer(t);
+ t->running = this_cpu();
+
+ /* Now we can unlock and call its expiry */
+ unlock(&timer_lock);
+ t->expiry(t, t->user_data, now);
+
+ /* Re-lock and mark not running */
+ lock(&timer_lock);
+ t->running = NULL;
+
+ /* Update time stamp */
+ now = mftb();
+ }
+}
+
+void check_timers(bool from_interrupt)
+{
+ uint64_t now = mftb();
+
+ /* This is the polling variant; the timer interrupt path, when
+ * present, calls this with from_interrupt set, which skips the
+ * poll timers and only runs the expired normal timers.
+ */
+
+ /* Lockless "peek", a bit racy but shouldn't be a problem as
+ * we are only looking at whether the list is empty
+ */
+ if (list_empty_nocheck(&timer_poll_list) &&
+ list_empty_nocheck(&timer_list))
+ return;
+
+ /* Take lock and try again */
+ lock(&timer_lock);
+ if (!from_interrupt)
+ __check_poll_timers(now);
+ __check_timers(now);
+ unlock(&timer_lock);
+}
+
+#ifndef __TEST__
+
+void late_init_timers(void)
+{
+ int heartbeat = HEARTBEAT_DEFAULT_MS;
+
+ /* Add a property requesting the OS to call opal_poll_event() at
+ * a specified interval in order for us to run our background
+ * low-priority pollers.
+ *
+ * If a platform quirk exists, use that, else use the default.
+ *
+ * If we have an SBE timer facility, we ask for a 10x longer
+ * interval; we could possibly get rid of the heartbeat entirely.
+ *
+ * The value is in milliseconds; we don't want this to ever be
+ * faster than that.
+ */
+ if (platform.heartbeat_time) {
+ heartbeat = platform.heartbeat_time();
+ } else if (p9_sbe_timer_ok()) {
+ heartbeat = HEARTBEAT_DEFAULT_MS * 10;
+ } else if (p8_sbe_timer_ok()) {
+ heartbeat = HEARTBEAT_DEFAULT_MS * 10;
+ }
+
+ dt_add_property_cells(opal_node, "ibm,heartbeat-ms", heartbeat);
+}
+#endif
diff --git a/roms/skiboot/core/trace.c b/roms/skiboot/core/trace.c
new file mode 100644
index 000000000..561bd79e0
--- /dev/null
+++ b/roms/skiboot/core/trace.c
@@ -0,0 +1,265 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Trace various things into in-memory buffers
+ *
+ * Copyright 2013-2019 IBM Corp.
+ */
+
+#include <trace.h>
+#include <timebase.h>
+#include <lock.h>
+#include <string.h>
+#include <stdlib.h>
+#include <inttypes.h>
+#include <cpu.h>
+#include <device.h>
+#include <libfdt.h>
+#include <processor.h>
+#include <skiboot.h>
+#include <opal-api.h>
+#include <debug_descriptor.h>
+#include <nvram.h>
+
+#define DEBUG_TRACES
+
+#define MAX_SIZE sizeof(union trace)
+
+/* Smaller trace buffer for early booting */
+#define BOOT_TBUF_SZ 65536
+static struct {
+ struct trace_info trace_info;
+ char buf[BOOT_TBUF_SZ + MAX_SIZE];
+} boot_tracebuf __section(".data.boot_trace");
+
+void init_boot_tracebuf(struct cpu_thread *boot_cpu)
+{
+ init_lock(&boot_tracebuf.trace_info.lock);
+ boot_tracebuf.trace_info.tb.buf_size = cpu_to_be64(BOOT_TBUF_SZ);
+ boot_tracebuf.trace_info.tb.max_size = cpu_to_be32(MAX_SIZE);
+
+ boot_cpu->trace = &boot_tracebuf.trace_info;
+}
+
+static size_t tracebuf_extra(void)
+{
+ /* We make room for the largest possible record */
+ return TBUF_SZ + MAX_SIZE;
+}
+
+/* To avoid bloating the buffer, consecutive identical entries are
+ * collapsed into dedicated TRACE_REPEAT entries. tb->last points to the
+ * last (non-repeat) entry. */
+static bool handle_repeat(struct tracebuf *tb, const union trace *trace)
+{
+ struct trace_hdr *prev;
+ struct trace_repeat *rpt;
+ u32 len;
+
+ prev = (void *)tb->buf + be64_to_cpu(tb->last) % be64_to_cpu(tb->buf_size);
+
+ if (prev->type != trace->hdr.type
+ || prev->len_div_8 != trace->hdr.len_div_8
+ || prev->cpu != trace->hdr.cpu)
+ return false;
+
+ len = prev->len_div_8 << 3;
+ if (memcmp(prev + 1, &trace->hdr + 1, len - sizeof(*prev)) != 0)
+ return false;
+
+ /* If they've consumed prev entry, don't repeat. */
+ if (be64_to_cpu(tb->last) < be64_to_cpu(tb->start))
+ return false;
+
+ /* OK, it's a duplicate. Do we already have a repeat entry? */
+ if (be64_to_cpu(tb->last) + len != be64_to_cpu(tb->end)) {
+ u64 pos = be64_to_cpu(tb->last) + len;
+ /* FIXME: Reader is not protected from seeing this! */
+ rpt = (void *)tb->buf + pos % be64_to_cpu(tb->buf_size);
+ assert(pos + rpt->len_div_8*8 == be64_to_cpu(tb->end));
+ assert(rpt->type == TRACE_REPEAT);
+
+ /* If this repeat entry is full, don't repeat. */
+ if (be16_to_cpu(rpt->num) == 0xFFFF)
+ return false;
+
+ rpt->num = cpu_to_be16(be16_to_cpu(rpt->num) + 1);
+ rpt->timestamp = trace->hdr.timestamp;
+ return true;
+ }
+
+ /*
+ * Generate a repeat entry: it's the smallest possible entry, so
+ * the room already made for the full-size entry is enough for it.
+ */
+ assert(trace->hdr.len_div_8 * 8 >= sizeof(*rpt));
+
+ rpt = (void *)tb->buf + be64_to_cpu(tb->end) % be64_to_cpu(tb->buf_size);
+ rpt->timestamp = trace->hdr.timestamp;
+ rpt->type = TRACE_REPEAT;
+ rpt->len_div_8 = sizeof(*rpt) >> 3;
+ rpt->cpu = trace->hdr.cpu;
+ rpt->prev_len = cpu_to_be16(trace->hdr.len_div_8 << 3);
+ rpt->num = cpu_to_be16(1);
+ lwsync(); /* write barrier: complete repeat record before exposing */
+ tb->end = cpu_to_be64(be64_to_cpu(tb->end) + sizeof(*rpt));
+ return true;
+}
+
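+/*
+ * The trace buffer is a byte ring indexed with free-running offsets:
+ * 'start' is the oldest entry still present, 'end' is where the next
+ * entry will be written, and 'last' is the offset of the most recent
+ * non-repeat entry (used by handle_repeat() above). Offsets are only
+ * reduced modulo buf_size when dereferenced.
+ */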
+void trace_add(union trace *trace, u8 type, u16 len)
+{
+ struct trace_info *ti = this_cpu()->trace;
+ unsigned int tsz;
+
+ trace->hdr.type = type;
+ trace->hdr.len_div_8 = (len + 7) >> 3;
+
+ tsz = trace->hdr.len_div_8 << 3;
+
+#ifdef DEBUG_TRACES
+ assert(tsz >= sizeof(trace->hdr));
+ assert(tsz <= sizeof(*trace));
+ assert(trace->hdr.type != TRACE_REPEAT);
+ assert(trace->hdr.type != TRACE_OVERFLOW);
+#endif
+ /* Skip traces not enabled in the debug descriptor */
+ if (trace->hdr.type < (8 * sizeof(debug_descriptor.trace_mask)) &&
+ !((1ul << trace->hdr.type) & be64_to_cpu(debug_descriptor.trace_mask)))
+ return;
+
+ trace->hdr.timestamp = cpu_to_be64(mftb());
+ trace->hdr.cpu = cpu_to_be16(this_cpu()->server_no);
+
+ lock(&ti->lock);
+
+ /* Throw away old entries before we overwrite them. */
+ while ((be64_to_cpu(ti->tb.start) + be64_to_cpu(ti->tb.buf_size))
+ < (be64_to_cpu(ti->tb.end) + tsz)) {
+ struct trace_hdr *hdr;
+
+ hdr = (void *)ti->tb.buf +
+ be64_to_cpu(ti->tb.start) % be64_to_cpu(ti->tb.buf_size);
+ ti->tb.start = cpu_to_be64(be64_to_cpu(ti->tb.start) +
+ (hdr->len_div_8 << 3));
+ }
+
+ /* Must update ->start before we overwrite the old entries with new data. */
+ lwsync(); /* write barrier */
+
+ /* Check for duplicates... */
+ if (!handle_repeat(&ti->tb, trace)) {
+ /* This may go off end, and that's why ti->tb.buf is oversize */
+ memcpy(ti->tb.buf + be64_to_cpu(ti->tb.end) % be64_to_cpu(ti->tb.buf_size),
+ trace, tsz);
+ ti->tb.last = ti->tb.end;
+ lwsync(); /* write barrier: write entry before exposing */
+ ti->tb.end = cpu_to_be64(be64_to_cpu(ti->tb.end) + tsz);
+ }
+ unlock(&ti->lock);
+}
+
+void trace_add_dt_props(void)
+{
+ uint64_t boot_buf_phys = (uint64_t) &boot_tracebuf.trace_info;
+ struct dt_node *exports, *traces;
+ unsigned int i;
+ fdt64_t *prop;
+ u64 tmask;
+ char tname[256];
+
+ exports = dt_find_by_path(opal_node, "firmware/exports");
+ if (!exports)
+ return;
+
+ /*
+ * nvram hack to put all the trace buffer exports in the exports
+ * node. This is useful if the kernel doesn't also export subnodes.
+ */
+ if (nvram_query_safe("flat-trace-buf"))
+ traces = exports;
+ else
+ traces = dt_new(exports, "traces");
+
+ prop = malloc(sizeof(u64) * 2 * be32_to_cpu(debug_descriptor.num_traces));
+
+ for (i = 0; i < be32_to_cpu(debug_descriptor.num_traces); i++) {
+ uint64_t addr = be64_to_cpu(debug_descriptor.trace_phys[i]);
+ uint64_t size = be32_to_cpu(debug_descriptor.trace_size[i]);
+ uint32_t pir = be16_to_cpu(debug_descriptor.trace_pir[i]);
+
+ prop[i * 2] = cpu_to_fdt64(addr);
+ prop[i * 2 + 1] = cpu_to_fdt64(size);
+
+ if (addr == boot_buf_phys)
+ snprintf(tname, sizeof(tname), "boot-%x", pir);
+ else
+ snprintf(tname, sizeof(tname), "trace-%x", pir);
+
+ dt_add_property_u64s(traces, tname, addr, size);
+ }
+
+ dt_add_property(opal_node, "ibm,opal-traces",
+ prop, sizeof(u64) * 2 * i);
+ free(prop);
+
+ tmask = (uint64_t)&debug_descriptor.trace_mask;
+ dt_add_property_u64(opal_node, "ibm,opal-trace-mask", tmask);
+}
+
+static void trace_add_desc(struct trace_info *t, uint64_t size, uint16_t pir)
+{
+ unsigned int i = be32_to_cpu(debug_descriptor.num_traces);
+
+ if (i >= DEBUG_DESC_MAX_TRACES) {
+ prerror("TRACE: Debug descriptor trace list full !\n");
+ return;
+ }
+
+ debug_descriptor.num_traces = cpu_to_be32(i + 1);
+ debug_descriptor.trace_phys[i] = cpu_to_be64((uint64_t)t);
+ debug_descriptor.trace_tce[i] = 0; /* populated later */
+ debug_descriptor.trace_size[i] = cpu_to_be32(size);
+ debug_descriptor.trace_pir[i] = cpu_to_be16(pir);
+}
+
+/* Allocate trace buffers once we know memory topology */
+void init_trace_buffers(void)
+{
+ struct cpu_thread *t;
+ struct trace_info *any = &boot_tracebuf.trace_info;
+ uint64_t size;
+
+ /* Register the boot trace buffer in the debug descriptor */
+ trace_add_desc(any, sizeof(boot_tracebuf), this_cpu()->pir);
+
+ /* Allocate a trace buffer for each primary cpu. */
+ for_each_cpu(t) {
+ if (t->is_secondary)
+ continue;
+
+ /* Use a 64K alignment for TCE mapping */
+ size = ALIGN_UP(sizeof(*t->trace) + tracebuf_extra(), 0x10000);
+ t->trace = local_alloc(t->chip_id, size, 0x10000);
+ if (t->trace) {
+ any = t->trace;
+ memset(t->trace, 0, size);
+ init_lock(&t->trace->lock);
+ t->trace->tb.max_size = cpu_to_be32(MAX_SIZE);
+ t->trace->tb.buf_size = cpu_to_be64(TBUF_SZ);
+ trace_add_desc(any, sizeof(t->trace->tb) +
+ tracebuf_extra(), t->pir);
+ } else
+ prerror("TRACE: cpu 0x%x allocation failed\n", t->pir);
+ }
+
+ /* In case any allocations failed, share trace buffers. */
+ for_each_cpu(t) {
+ if (!t->is_secondary && !t->trace)
+ t->trace = any;
+ }
+
+ /* And copy those to the secondaries. */
+ for_each_cpu(t) {
+ if (!t->is_secondary)
+ continue;
+ t->trace = t->primary->trace;
+ }
+}
diff --git a/roms/skiboot/core/utils.c b/roms/skiboot/core/utils.c
new file mode 100644
index 000000000..0d2f5e894
--- /dev/null
+++ b/roms/skiboot/core/utils.c
@@ -0,0 +1,101 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Misc utility functions
+ *
+ * Copyright 2013-2018 IBM Corp.
+ */
+
+#include <skiboot.h>
+#include <lock.h>
+#include <fsp.h>
+#include <platform.h>
+#include <processor.h>
+#include <cpu.h>
+#include <stack.h>
+
+void __noreturn assert_fail(const char *msg, const char *file,
+ unsigned int line, const char *function)
+{
+ static bool in_abort = false;
+
+ (void)function;
+ if (in_abort)
+ for (;;) ;
+ in_abort = true;
+
+ /**
+ * @fwts-label FailedAssert2
+ * @fwts-advice OPAL hit an assert(). During normal usage (even
+ * testing) we should never hit an assert. There are other code
+ * paths for controlled shutdown/panic in the event of catastrophic
+ * errors.
+ */
+ prlog(PR_EMERG, "assert failed at %s:%u: %s\n", file, line, msg);
+ backtrace();
+
+ if (platform.terminate)
+ platform.terminate(msg);
+
+ for (;;) ;
+}
+
+char __attrconst tohex(uint8_t nibble)
+{
+ static const char __tohex[] = {'0','1','2','3','4','5','6','7','8','9',
+ 'A','B','C','D','E','F'};
+ if (nibble > 0xf)
+ return '?';
+ return __tohex[nibble];
+}
+
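+/*
+ * Look up 'addr' in the symbol map, which the code expects to contain
+ * newline-terminated entries of the form "<hex address> <type> <name>"
+ * (the "+ 3" below skips the space, type character and space).
+ * Returns the start address of the covering symbol and points
+ * *sym/*sym_end at its name, or 0 if no symbol covers the address.
+ */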
+static unsigned long get_symbol(unsigned long addr, char **sym, char **sym_end)
+{
+ unsigned long prev = 0, next;
+ char *psym = NULL, *p = __sym_map_start;
+
+ *sym = *sym_end = NULL;
+ while(p < __sym_map_end) {
+ next = strtoul(p, &p, 16) | SKIBOOT_BASE;
+ if (next > addr && prev <= addr) {
+ p = psym + 3;
+ if (p >= __sym_map_end)
+ return 0;
+ *sym = p;
+ while(p < __sym_map_end && *p != 10)
+ p++;
+ *sym_end = p;
+ return prev;
+ }
+ prev = next;
+ psym = p;
+ while(p < __sym_map_end && *p != 10)
+ p++;
+ p++;
+ }
+ return 0;
+}
+
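+/*
+ * Format "<symbol>+0x<offset>" for 'addr' into 'buf'. Returns the
+ * number of characters written, or 0 if no symbol covers the address.
+ */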
+size_t snprintf_symbol(char *buf, size_t len, uint64_t addr)
+{
+ unsigned long saddr;
+ char *sym, *sym_end;
+ size_t l;
+
+ saddr = get_symbol(addr, &sym, &sym_end);
+ if (!saddr)
+ return 0;
+
+ if (len > sym_end - sym)
+ l = sym_end - sym;
+ else
+ l = len - 1;
+ memcpy(buf, sym, l);
+
+ /*
+ * This snprintf will insert the terminating NUL even if the
+ * symbol has used up the entire buffer less 1.
+ */
+ l += snprintf(buf + l, len - l, "+0x%llx", addr - saddr);
+
+ return l;
+}
diff --git a/roms/skiboot/core/vpd.c b/roms/skiboot/core/vpd.c
new file mode 100644
index 000000000..20fe09597
--- /dev/null
+++ b/roms/skiboot/core/vpd.c
@@ -0,0 +1,139 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Parse Vital Product Data (VPD)
+ *
+ * Copyright 2013-2019 IBM Corp.
+ */
+
+#include <skiboot.h>
+#include <vpd.h>
+#include <string.h>
+#include <device.h>
+
+#define CHECK_SPACE(_p, _n, _e) (((_e) - (_p)) >= (_n))
+
+/* Low level keyword search in a record. Can be used when we
+ * need to find the next keyword of a given type, for example
+ * when there are multiple MF/SM keyword pairs.
+ */
+const void *vpd_find_keyword(const void *rec, size_t rec_sz,
+ const char *kw, uint8_t *kw_size)
+{
+ const uint8_t *p = rec, *end = rec + rec_sz;
+
+ while (CHECK_SPACE(p, 3, end)) {
+ uint8_t k1 = *(p++);
+ uint8_t k2 = *(p++);
+ uint8_t sz = *(p++);
+
+ if (k1 == kw[0] && k2 == kw[1]) {
+ if (kw_size)
+ *kw_size = sz;
+ return p;
+ }
+ p += sz;
+ }
+ return NULL;
+}
+
+/* vpd_valid - does some basic sanity checks to ensure a VPD blob is
+ * actually a VPD blob
+ */
+bool vpd_valid(const void *vvpd, size_t vpd_size)
+{
+ const uint8_t *vpd = vvpd;
+ int size, i = 0;
+
+ /* find the record start byte */
+ while (i < vpd_size)
+ if (vpd[i++] == 0x84)
+ break;
+
+ if (i >= vpd_size)
+ return false;
+
+ /* next two bytes are the record length, little endian */
+ size = 2;
+ size += vpd[i];
+ size += vpd[i + 1] << 8;
+
+ i += size; /* skip to the end marker */
+
+ if (i >= vpd_size || vpd[i] != 0x78)
+ return false;
+
+ return true;
+}
+
+/* Locate a record in a VPD blob
+ *
+ * Note: This works with VPD LIDs. It will scan until it finds
+ * the first 0x84, so it will skip all those 0's that the VPD
+ * LIDs seem to contain
+ */
+const void *vpd_find_record(const void *vpd, size_t vpd_size,
+ const char *record, size_t *sz)
+{
+ const uint8_t *p = vpd, *end = vpd + vpd_size;
+ bool first_start = true;
+ size_t rec_sz;
+ uint8_t namesz = 0;
+ const char *rec_name;
+
+ if (!vpd)
+ return NULL;
+
+ while (CHECK_SPACE(p, 4, end)) {
+ /* Get header byte */
+ if (*(p++) != 0x84) {
+ /* Skip initial crap in VPD LIDs */
+ if (first_start)
+ continue;
+ break;
+ }
+ first_start = false;
+ rec_sz = *(p++);
+ rec_sz |= *(p++) << 8;
+ if (!CHECK_SPACE(p, rec_sz, end)) {
+ prerror("VPD: Malformed or truncated VPD,"
+ " record size doesn't fit\n");
+ return NULL;
+ }
+
+ /* Find record name */
+ rec_name = vpd_find_keyword(p, rec_sz, "RT", &namesz);
+ if (rec_name && strncmp(record, rec_name, namesz) == 0) {
+ if (sz)
+ *sz = rec_sz;
+ return p;
+ }
+
+ p += rec_sz;
+ if (*(p++) != 0x78) {
+ prerror("VPD: Malformed or truncated VPD,"
+ " missing final 0x78 in record %.4s\n",
+ rec_name ? rec_name : "????");
+ return NULL;
+ }
+ }
+ return NULL;
+}
+
+/* Locate a keyword in a record in a VPD blob
+ *
+ * Note: This works with VPD LIDs. It will scan until it finds
+ * the first 0x84, so it will skip all those 0's that the VPD
+ * LIDs seem to contain
+ */
+const void *vpd_find(const void *vpd, size_t vpd_size,
+ const char *record, const char *keyword,
+ uint8_t *sz)
+{
+ size_t rec_sz;
+ const uint8_t *p;
+
+ p = vpd_find_record(vpd, vpd_size, record, &rec_sz);
+ if (p)
+ p = vpd_find_keyword(p, rec_sz, keyword, sz);
+ return p;
+}
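+
+/*
+ * Illustrative (hypothetical) usage, looking up a part-number keyword
+ * in a "VINI" record of a VPD LID buffer:
+ *
+ *   uint8_t sz;
+ *   const char *pn = vpd_find(lid, lid_size, "VINI", "PN", &sz);
+ *
+ * 'lid'/'lid_size' and the record/keyword names above are example
+ * inputs, not something this file defines.
+ */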