diff options
Diffstat (limited to 'roms/skiboot/external/pci-scripts')
-rwxr-xr-x | roms/skiboot/external/pci-scripts/phberr.py | 658 | ||||
-rw-r--r-- | roms/skiboot/external/pci-scripts/ppc.py | 28 |
2 files changed, 686 insertions, 0 deletions
diff --git a/roms/skiboot/external/pci-scripts/phberr.py b/roms/skiboot/external/pci-scripts/phberr.py new file mode 100755 index 000000000..5f295fdc9 --- /dev/null +++ b/roms/skiboot/external/pci-scripts/phberr.py @@ -0,0 +1,658 @@ +#!/usr/bin/env python3 + +import sys +import ppc +import re + +# Mnemonic PHB_ESR - Address Offset 0x0C80 - phbErrorStatusRegister +phb_esr_bits = [ + (0, "ETU/RSB Request Address Error"), + (1, "Fundamental A Request Address Error"), + (2, "Fundamental A Request Size/Alignment Error"), + (3, "Fundamental A PCI CFG Addr/Size Error"), + (4, "Fundamental A IODA Table Access Error"), + (5, "Fundamental A Internal Registers Parity Error"), + (6, "PHB Error Registers Request Address Error"), + (7, "PHB Error Registers Request Size/Alignment Error"), + (8, "Fundamental B Request Address Error"), + (9, "Fundamental B Request Size/Alignment Error"), + (10, "Fundamental B Internal Registers Parity Error"), + (11, "Internal Bus Logic Bad PCIE Macro Request Address"), + (12, "Debug Request Address Error"), + (13, "Debug Request Size/Alignment Error"), + (14, "Debug Internal Registers Parity Error"), + (15, "Internal Bus Logic State Machine One-Hot Error"), + (16, "UV Page Request Address Error"), + (17, "UV Page Request Size/Alignment Error"), + (18, "UV Page Internal Registers Parity Error"), + (20, "RXE_ARB OR Error Status"), + (21, "RXE_MRG OR Error Status"), + (22, "RXE_TCE OR Error Status"), + (23, "TXE OR Error Status"), + (24, "pcie_etu_regb_err_inf"), + (25, "pcie_etu_regb_err_erc"), + (26, "pcie_etu_regb_err_fat"), + (27, "bus_regs_req_wr_data_p_e"), + (28, "SCOM HV Indirect Access Error"), + (29, "SCOM UV Indirect Access Error"), + (30, "SCOM Internal Registers Parity Error"), + (31, "SCOM Satellite Finite State Machine Error"), +] + +# Mnemonic TXE_ESR - Address Offset 0x0D00 - txeFirstErrorStatus +txe_esr_bits = [ + (0, "AIB Command Invalid"), + (2, "AIB Address Decode Error"), + (3, "AIB Size Invalid"), + (4, "AIB Cmd Ctrls Parity 
Error"), + (5, "AIB Data Ctrls Parity Error"), + (8, "AIB Alignment Error"), + (9, "AIB Cmd Bus Parity Error"), + (10, "AIB Data Bus UE ECC Error"), + (11, "AIB Data Ctrls Sequence Error"), + (12, "AIB Data Bus CE ECC Error"), + (13, "TCE Rd Response DAT_ERR Indication"), + (14, "AIB Command Credits Error"), + (15, "AIB Data Credits Error"), + (16, "BLIF Controls Parity Error"), + (17, "CFG Write Error CA or UR response"), + (18, "BLIF Forward Progress Timeout"), + (19, "MMIO RD Pending Error"), + (20, "MMIO WR Pending Error"), + (21, "MMIO CFG Pending Error"), + (22, "MMIO Write DAT_ERR Indication"), + (23, "CI Store Data Fifo Error"), + (24, "CFG Enable Error, RRB"), + (25, "CFG Size Error"), + (26, "CFG Bus Address Error"), + (27, "CFG Link Down Error"), + (28, "PAPR TXE Injection Error Triggered"), + (29, "CFG Write Request Timeout"), + (30, "PAPR TXE Injection Error Triggered"), + (36, "CI Trigger Buffer ECC Correctable Error"), + (37, "CI Trigger Buffer ECC Uncorrectable Error"), + (38, "CI Trigger Buffer Stage Data Parity Error"), + (40, "MMIO BAR Table (MBT) Parity Error"), + (42, "MMIO Domain Table (MDT) ECC Correctable Error"), + (43, "MMIO Domain Table (MDT) ECC Uncorrectable Error"), + (44, "MMIO Domain Table (MDT) Stage Parity Error"), + (45, "MMIO Domain Table (MDT) Stage Valid Error"), + (46, "AIB Data Special Uncorrectable Error (SUE)"), + (47, "MMIO Domain Table (MDT)"), + (48, "P2P Store Data Fifo Error"), + (49, "EPAT Table Parity Error"), + (50, "MMIO Cmd Parity Error"), + (51, "BLIF1 Reg Parity Error"), + (52, "P2P1 Reg Parity Error"), + (53, "P2P WR Pending Error"), + (54, "CRW Onehot Error"), + (55, "CRW Pending Error"), + (56, "RRB Parity Error"), + (57, "RRB Size/Alignment Error"), + (58, "s_bad_addr_e_q"), + (59, "s_req_size_align_e_q"), +] + +# Mnemonic RXE_ARB_ESR - Address Offset 0x0D80 - phbRxeArbErrorStatus +rxe_arb_bits = [ + (0, "BLIF Inbound CA Completion Error"), + (1, "BLIF Inbound UR Completion Error"), + (2, "MSI Size Error"), 
+ (3, "MSI Address Alignment Error"), + (5, "BLIF Inbound Header ECC Correctable (CE)"), + (6, "BLIF Inbound Header ECC Uncorrectable (UE)"), + (7, "ARB Stage Valid Error"), + (8, "TCE Tag Release Unused"), + (9, "TCE Tag Used, Not Free"), + (10, "ARB MMIO Buffer Overflow"), + (11, "ARB MMIO Buffer Underflow"), + (12, "ARB MMIO Internal Parity Error"), + (13, "ARB DMA Buffer Overflow"), + (14, "ARB DMA Buffer Underflow"), + (15, "ARB DMA Internal Parity Error"), + (16, "BLIF Header Control Bits Parity Error"), + (17, "BLIF Data Control Bits Parity Error"), + (18, "BLIF Unsupported Request (UR) Error"), + (19, "BLIF Completion Timeout Error"), + (20, "SEID Table ECC Correctable (CE)"), + (21, "SEID Table ECC Uncorrectable (UE)"), + (22, "NBW Size Error"), + (23, "DEC IODA Table Fatal Error"), + (24, "TLP Poisoned Error"), + (25, "MIST ECC Correctable Error"), + (26, "IODA TVT Entry Invalid"), + (27, "MSI PE# Mismatch"), + (28, "IODA TVT Address"), + (29, "TVT ECC Correctable Error"), + (30, "TVT ECC Uncorrectable Error"), + (31, "MIST ECC Uncorrectable Error"), + (32, "PELT-V BAR Disabled Error"), + (33, "IODA Table Parity Error"), + (34, "PCT Timeout"), + (35, "PCT Unexpected Completion"), + (36, "PCT Parity Error"), + (37, "DEC Stage Valid Error"), + (38, "DEC Stage Parity Error"), + (39, "PAPR Inbound Injection Error Triggered"), + (40, "DMA/MSI: RTE PE Number"), + (41, "RTT BAR Disabled Error"), + (42, "RTC Internal Parity Error"), + (43, "RTC Queue Overflow"), + (44, "RTC Queue Underflow"), + (45, "RTC Stage Valid Error"), + (46, "RTC RCAM Bad State Error"), + (47, "RTC RCAM Multiple Hit Error"), + (48, "RRB Parity Error"), + (49, "RRB request Size / Alignment Error"), + (50, "s_bad_addr_e_q"), + (51, "s_req_size_align_e_q"), + (54, "Discontiguous DMA Write Fragmentation"), + (55, "LIST Table Parity Error"), + (56, "LKP PEST Data Queue Error"), + (57, "PCIE Fatal Error Message Received"), + (58, "PCIE Nonfatal Error Message Received"), + (59, "PCIE Correctable 
Error Message Received"), +] + +#Mnemonic RXE_MRG_ESR - Address Offset 0x0E00, phbRxeMrgErrorStatus +rxe_mrg_bits = [ + (8, "MRG TMB Allocation Error"), + (9, "MRG TMB Response Invalid"), + (10, "MRG TMB Response Ready Error"), + (11, "MRG MMIO Queue Overflow Error"), + (12, "MRG MMIO Queue Underflow Error"), + (13, "MRG MMIO Internal Parity Error"), + (14, "MRG DMA Queue Overflow Error"), + (15, "MRG DMA Queue Underflow Error"), + (16, "MRG DMA Internal Parity Error"), + (17, "MRG Migration Register Table"), + (18, "MRG Migration Register Table"), + (20, "s_bad_addr_e_q"), + (21, "s_req_size_align_e_q"), + (22, "RRB Parity Error"), + (23, "RRB request Size / Alignment Error"), + (24, "DSP AIB TX Timeout Error"), + (25, "Reserved (vA4.1)"), + (26, "DSP AIB TX CMD Credit Parity Error"), + (28, "DSP AIB TX DAT Credit Parity Error"), + (30, "DSP Command Credit Overflow Error"), + (31, "DSP Command Credit Underflow Error"), + (32, "DSP Command Credit Parity Error"), + (33, "DSP Data Credit Overflow Error"), + (34, "DSP Data Credit Underflow Error"), + (35, "DSP Data Credit Parity Error"), + (36, "DSP Completion State Machine One-Hot Error"), + (37, "DSP Write Thread State Machine One-Hot Error"), + (38, "DSP DMA Secure Address Error (vA4.2)"), + (39, "DSP MSI Interrupt Notification Secure Address"), + (40, "DSP TREQ ECC Correctable Error"), + (41, "DSP TREQ ECC Uncorrectable Error"), + (42, "DSP MMIO Queue Overflow Error"), + (43, "DSP MMIO Queue Underflow Error"), + (44, "DSP MMIO Internal Parity Error"), + (45, "DSP DMA Queue Overflow Error"), + (46, "DSP DMA Queue Underflow Error"), + (47, "DSP DMA Internal Parity Error"), + (48, "DSP Read Thread State Machine One-Hot Error"), + (49, "DSP Table State Machine One-Hot Error"), + (50, "DSP NBW State Machine One-Hot Error"), + (51, "DSP TSM PEST BAR Disabled Error"), + (56, "IPD ECC Correctable Error"), + (57, "IPD ECC Uncorrectable Error"), + (58, "ICPLD ECC Correctable Error"), + (59, "ICPLD ECC Uncorrectable Error"), 
+ (60, "NBWD ECC Correctable Error"), + (61, "NBWD ECC Uncorrectable Error"), + (63, "pb_etu_ai_rx_raise_fence"), +] + + +# Mnemonic RXE_TCE_ESR - Address Offset 0x0E80 - phbRxeTceErrorStatus +rxe_tce_bits = [ + (0, "TCE CMP Internal Parity Error"), + (1, "TCE Request Page Access Error"), + (2, "TCE Response Page Access Error"), + (3, "TCE CMP Queue Overflow"), + (4, "TCE CMP Queue Underflow"), + (5, "TCE Secure Address Error"), + (6, "TCE Cache Bad State Error"), + (7, "TCE Cache Multi-Way Hit Error"), + (8, "TCE Request Timeout Error"), + (9, "TCE TCR ECC Correctable Error"), + (10, "TCE TCR ECC Uncorrectable Error"), + (11, "TCE TDR ECC Correctable Error"), + (12, "TCE TDR ECC Uncorrectable Error"), + (13, "TCE Unexpected Response Error"), + (14, "RRB Parity Error"), + (15, "RRB request Size / Alignment Error"), + (16, "TCE RES Internal Parity Error"), + (17, "s_bad_addr_e_q"), + (18, "s_req_size_align_e_q"), + (19, "TCE RES Queue Overflow"), + (20, "TCE RES Queue Underflow"), + (21, "TCE Response Data Parity Error"), + (22, "TCE TCLB CAM Bad State Error"), + (23, "TCE TCLB CAM Multi-Hit Error"), + (24, "TCE Kill Internal Parity Error"), + (25, "TCE THASH Array ECC Correctable Error"), + (26, "TCE THASH Array ECC Uncorrectable Error"), + (27, "TCE TCLB TDAT ECC Correctable Error"), + (28, "TCE TCLB TDAT ECC Uncorrectable Error"), + (29, "TCE Kill State Machine One-Hot Error"), + (30, "TCE Kill Queue Overflow"), + (31, "TCE Kill Queue Underflow"), + (32, "TCE Request Secure Address Register"), + (33, "TCE Response Secure Address Register"), +] + + +#Mnemonic PBL_ESR - Address Offset 0x1900 - phbPblErrorStatus +pbl_esr_bits = [ + (0, "pb_err_p_fe_tlif_rx_par_e Parity error detected on TLIF Receive interface."), + (1, "pb_err_p_fe_tlif_tx_par_e Parity error detected on TLIF Transmit interface."), + (2, "pb_err_p_fe_blif_out_par_e"), + (3, "pb_err_p_fe_blif_in_par_e"), + (4, "pb_err_p_fe_int_par_e"), + (5, "pb_err_p_fe_toc_cred_e"), + (6, "pb_err_p_fe_ocf_par_e"), + 
(7, "pb_err_p_fe_ocf_prot_e"), + (12, "pb_err_p_fe_pct_erq_overflow_e"), + (13, "pb_err_p_fe_pct_erq_underflow_e"), + (14, "pb_err_p_fe_pct_onp_tags_rls_unused_e"), + (15, "pb_err_p_fe_pct_onp_tags_used_notfree_e"), + (16, "pb_err_p_fe_pct_onp_tags_used_unexp_e"), + (17, "pb_err_p_fe_bct_onp_tags_rls_unused_e"), + (18, "pb_err_p_fe_bct_onp_tags_used_notfree_e"), + (19, "pb_err_p_fe_ib_bct_rd_inv"), + (20, "pb_err_p_fe_ob_buffer_overflow_e"), + (21, "pb_err_p_fe_ob_buffer_underflow_e"), + (22, "pb_err_p_fe_ib_buffer_overflow_e"), + (23, "pb_err_p_fe_ib_buffer_underflow_e"), + (24, "pb_err_p_fe_ib_d_ecc_ue"), + (25, "pb_err_p_fe_ib_h_ecc_ue"), + (26, "pb_err_p_fe_ob_d_ecc_ue"), + (27, "pb_err_p_fe_ob_h_ecc_ue"), + (28, "pb_err_p_fe_ocf_ecc_ue"), + (32, "pb_err_p_fe_tx_pst_discard_e"), + (33, "pb_err_p_inf_tx_npst_discard_e"), + (34, "pb_err_p_fe_nbw_tlp_e"), + (36, "pb_err_p_fe_pci_rcv_cpl_ca_e"), + (37, "pb_err_p_fe_pci_rcv_cpl_crs_e"), + (38, "pb_err_p_fe_pci_rcv_cpl_rsvd_e"), + (39, "pb_err_p_fe_pci_rcv_cpl_ur_e"), + (40, "pb_err_p_fe_pci_rcv_ecrc_e"), + (41, "pb_err_p_fe_pci_rcv_malf_tlp_e"), + (42, "pb_err_p_fe_pci_rcv_overflow_e"), + (43, "pb_err_p_fe_pci_rcv_poisoned_tlp_e"), + (44, "pb_err_p_fe_pci_rcv_unexp_cpl_e"), + (45, "pb_err_p_fe_pci_rcv_unsup_req_e"), + (46, "pb_err_p_fe_pci_sig_cpl_abort_e"), + (47, "pb_err_p_fe_pci_sig_cpl_timeout_e"), + (48, "pb_err_p_fe_pci_sig_poisoned_tlp_e"), + (52, "pb_err_p_inf_out_trans_to_pst_e"), + (53, "pb_err_p_inf_out_trans_to_npst_e"), + (54, "pb_err_p_inf_out_trans_to_cpl_e"), + (56, "pb_err_p_inf_ib_d_ecc_ce"), + (57, "pb_err_p_inf_ib_h_ecc_ce"), + (58, "pb_err_p_inf_ob_d_ecc_ce"), + (59, "pb_err_p_inf_ob_h_ecc_ce"), + (60, "pb_err_p_inf_ocf_ecc_ce"), + (62, "PBL Bad Register Address Error"), + (63, "PBL Register Parity Error"), +] + +# Mnemonic REGB_ESR - Address Offset 0x1C00 - phbRegbErrorStatus +regb_esr_bits = [ + (0, "REGB Internal Register Parity Error"), + (1, "PBL Internal Register Parity Error"), + (2, 
"Invalid Address Decode Error"), + (3, "Register Access Invalid Address+Size Error"), + (5, "Register State Machine or Other Internal Error"), + (6, "PCI CFG Core Registers Parity Error"), + (7, "Register access to CFG core while in reset error."), + (8, "PCIE Link Down"), + (9, "PCIE Link Up"), + (10, "PCIE Link Auto Bandwidth Event Status"), + (11, "PCIE Link BW Management Event Status"), + (25, "PBL Error Trap: INF Error"), + (26, "PBL Error Trap: ERC Error"), + (27, "PBL Error Trap: FAT Error"), + (28, "tldlpo_dl_mon_rxreceivererror(0)"), + (29, "tldlpo_dl_mon_rxreceivererror(1)"), + (30, "tldlpo_dl_mon_rxreceivererror(2)"), + (32, "DL_EC08_BADDLLP"), + (33, "DL_EC08_BADTLP"), + (34, "DL_EC08_DLLPE"), + (35, "DL_EC08_RECEIVERERROR"), + (36, "DL_EC08_ REPLAYROLLOVER"), + (37, "DL_EC08_REPLAYTIMEOUT"), + (39, "DL_INTERNALERROR"), + (40, "DL_LB_ERROR"), + (41, "DL_RX_MALFORMED"), + (42, "DL_RX_NULLIFY"), + (43, "DL_RX_OVERFLOW"), + (44, "DL_TX_CORRERROR"), + (45, "DL_TX_UNCORRERROR"), + (46, "TL_EC08_FCPE"), + (48, "Replay ECC Correctable Error (CE)"), + (49, "Replay ECC UnCorrectable Error (UE)"), + (50, "Bad DLLP Error Count Saturated"), + (51, "Bad TLP Error Count Saturated"), + (52, "Receiver Error Count Saturated"), + (53, "DLLPE Error Count Saturated"), + (58, "pbl_ptl_dl_al_rx_initcredit_p_e"), + (59, "pbl_ptl_dl_al_rx_updatecredit_p_e"), + (60, "PTL Core DLIF Protocol Error"), + (61, "PTL Core TLIF Protocol Error"), + (62, "PTL Core Internal Parity Error"), +] + +# FIXME: use the long desc +nfir_bits = [ + (0, "bar_pe"), # One of the BARs or BAR Mask Register parity error. + (1, "nonbar_pe"), # Any non-BAR parity error. + (2, "PB_to_PEC_ce"), # ECC correctable error off of outbound SMP interconnect. + (3, "PB_to_PEC_ue"), # ECC uncorrectable error off of outbound SMP interconnect. + (4, "PB_to_PEC_sue"), # ECC special uncorrectable error off of outbound SMP interconnect + (5, "ary_ecc_ce"), # ECC correctable error on an internal array. 
+ (6, "ary_ecc_ue"), # ECC uncorrectable error on an internal array. + (7, "ary_ecc_sue"), # ECC special uncorrectable error on an internal array. + (8, "register_array_pe"), # Parity error on an internal register file. + (9, "pb_interface_pe"), # Parity error on the PB interface (address/aTag/tTag/rTAG). + (10, "pb_data_hang_errors"), # Any SMP interconnect data hang poll error (only checked for CI stores). + (11, "pb_hang_errors"), # Any SMP interconnect command hang error (domestic address range). + (12, "rd_are_errors"), # SMP interconnect address error (ARE) detected by a DMA read. + (13, "nonrd_are_errors"), # SMP interconnect address error detected by a DMA write or an interrupt engine. + (14, "pci_hang_error"), # PBCQ detected that the PCI load, store, EOI, or DMA read response did not make forward progress. + (15, "pci_clock_error"), # PBCQ has detected that the PCI clock has stopped. + (16, "PFIR_freeze"), # This is the freeze signal from the PFIR freeze output. + (17, "hw_errors"), # Any miscellaneous hardware error. + (18, "UnsolicitiedPBData"), # The PEC received data with an rTAG matching a queue that was not expecting data or too much data was received. + (19, "UnExpectedCResp"), # PEC received an unexpected combined response. + (20, "InvalidCResp"), # PEC received an invalid combined response. + (21, "PBUnsupportedSize"), # PEC received a CI load/store that hits a BAR but is an unsupported size or address alignment. +] + +pfir_bits = [ + (0, "register_pe"), # PBAIB register parity error. + (1, "hardware_error"), # Hardware error. + (2, "AIB_intf_error"), # AIB interface error. + (3, "ETU_Reset_error"), # ETU reset error. + (4, "PEC_scom_error"), # Common PEC SCOM error. 
def _set_ppc_bits(value):
    """Return the PPC (MSB-0) bit numbers that are set in a 64-bit value."""
    return [bit for bit in range(64) if value & ppc.ppcbit(bit)]


class PHBError:
    """Registers and PEST entries collected from a single PHB register dump.

    Register names are stored with all spaces removed; values are ints.
    """

    # Registers that show_errs() knows how to decode. Keys are the register
    # names as they appear in the log, values are the (bit, description)
    # tables for that register.
    reg_bits = {
        "NEST FIR": nfir_bits,
        "PCI FIR": pfir_bits,
        "phbErrorStatus": phb_esr_bits,
        "phbTxeErrorStatus": txe_esr_bits,
        "phbRxeArbErrorStatus": rxe_arb_bits,
        "phbRxeMrgErrorStatus": rxe_mrg_bits,
        "phbRxeTceErrorStatus": rxe_tce_bits,
        "phbRegbErrorStatus": regb_esr_bits,
        "phbPblErrorStatus": pbl_esr_bits,
    }

    def __init__(self, timestamp = 0):
        """Create an empty dump.

        timestamp is whatever the caller wants header() to return; the parsers
        in this file pass the log line that started the dump.
        """
        self.timestamp = timestamp
        self.pest = []
        self.regs = {}

    def __str__(self):
        """Return one line per stored register: name, value and set PPC bits."""
        # The set bits are computed locally: the ppc helper module has no
        # setbits() function, so calling it raised AttributeError.
        return "".join(
            "{:30s} - {:#018x} - {}\n".format(name, value, _set_ppc_bits(value))
            for name, value in self.regs.items())

    def set_reg(self, reg, value):
        """Record a register value unless that register is already recorded.

        Returns True when the value was stored and False when the register was
        already present (the stored value is left untouched). Callers use a
        False return to detect the start of a new dump, so presence is tested
        on the key itself; testing the stored value would miss registers whose
        value is 0.
        """
        reg = reg.replace(" ", "")
        if reg in self.regs:
            return False
        self.regs[reg] = value
        return True

    def get_reg(self, reg):
        """Return the stored value of a register, or 0 if it was never set."""
        return self.regs.get(reg.replace(" ", ""), 0)

    # NB: pest entries should be inserted in sort order, but it might be a good
    # idea to explicitly sort them by PE number
    def set_pest(self, pe, pesta, pestb):
        """Append a PEST entry (PE number, PEST A word, PEST B word)."""
        self.pest.append((pe, pesta, pestb))

    def get_pest(self, pe_number):
        """Return (pesta, pestb) for the first entry with this PE number, or None."""
        for pe, pesta, pestb in self.pest:
            if pe == pe_number:
                return (pesta, pestb)
        return None

    def header(self):
        """Return the timestamp this dump was created with."""
        return self.timestamp

    # TODO: move the formatting out of here and into the main loop
    def show_errs(self):
        """Return a text report of every non-zero register listed in reg_bits.

        Each set bit is printed with its description. A bit is prefixed with
        "!" when it is also set in the matching "first error" register. Stored
        PEST entries, if any, are listed at the end.
        """
        lines = []
        for reg_name, bit_table in self.reg_bits.items():
            reg_value = self.get_reg(reg_name)
            if reg_value == 0:
                continue

            # "fooErrorStatus" has its first-error twin in "fooFirstErrorStatus".
            # Names without "Error" in them (the FIRs) have no twin.
            first_value = 0
            parts = reg_name.split("Error")
            if len(parts) > 1:
                first_value = self.get_reg(
                    "{:s}FirstError{:s}".format(parts[0], parts[1]))
                # skiboot spells it wrong, so check Frst too
                if first_value == 0:
                    first_value = self.get_reg(
                        "{:s}FrstError{:s}".format(parts[0], parts[1]))

            lines.append("{} = {:016x}:\n".format(reg_name, reg_value))
            for bit, desc in bit_table:
                mask = ppc.ppcbit(bit)
                if mask & reg_value:
                    bang = "!" if mask & first_value else ""
                    lines.append("{:s}\t{:2d} - {}\n".format(bang, bit, desc))
            lines.append("\n")

        if self.pest:
            lines.append("PEST entries:\n")
            for pe, pesta, pestb in self.pest:
                lines.append(
                    "\tPEST[{:03x}] = {:016x} {:016x}\n".format(pe, pesta, pestb))

        return "".join(lines)


def parse_opal_log(log_text):
    """Extract PHB register dumps from skiboot (OPAL) log text.

    Lines of interest look like:

        [ 938.249526636,3] PHB#0030[8:0]: NEST FIR WOF=0000800000000000
        [ 938.250657886,3] PHB#0030[8:0]: slotStatus = 00402000
        [ 938.254305278,3] PHB#0030[8:0]: PEST[511] = 3740002a01000000 0000000000000000

    Returns a list of PHBError objects, one per dump found. Lines that match
    the general shape but whose value cannot be parsed are skipped.
    """
    phblog_re = re.compile(
        r"^\[\s*[\d.,]+] "    # skiboot log header
        r"(PHB#....\[.:.]):"  # PHB name
        r"\s+"                # whitespace between the PHB and register name
        r"([^:=]+)"           # register name, NB: this might have some trailing WS
        r"=\s*"               # the '=' separating name and value, along with the whitespace
        r"([a-fA-F\d ]+)")    # register value(s)

    # This alone isn't really sufficient. There's a few cases that can cause a
    # register dump to be generated (e.g. when the link is retrained we do a
    # reg dump). The header accepts the same variable padding as phblog_re.
    new_log_marker = re.compile(
        r"^\[\s*[\d.,]+] "
        r"(PHB#....\[.:.]): "
        r"PHB Freeze/Fence detected !")

    # NB: unlike .match() .search() scans the whole string
    pest_re = re.compile(r"PEST\[([\da-fA-F]+)] = ([\da-fA-F]+) ([\da-fA-F]+)")

    # The dump currently being filled in for each PHB. Dumps from different
    # PHBs can be interleaved in the log, so they are tracked separately.
    current = {}

    # finished dumps
    error_logs = []

    for line in log_text.split("\n"):
        marker = new_log_marker.match(line)
        entry = None if marker else phblog_re.match(line)
        if not marker and not entry:
            continue

        phb = (marker or entry).group(1)

        if phb not in current:
            current[phb] = PHBError(line)
        elif marker:
            # new log marker: save the current dump and start a fresh one
            error_logs.append(current[phb])
            current[phb] = PHBError(line)

        if marker:
            continue

        name = entry.group(2)
        if "PEST" in name:
            pest = pest_re.search(line)
            if pest is None:
                continue  # mentions PEST but isn't a well-formed PEST entry
            pe, pesta, pestb = [int(i, 16) for i in pest.groups()]
            current[phb].set_pest(pe, pesta, pestb)
            continue

        # Normal register
        name = name.strip()
        try:
            value = int(entry.group(3).strip(), 16)
        except ValueError:
            continue  # several space-separated words; not a single register

        # If we have duplicate registers then we're in a new log context
        # so stash the current one and init a new one.
        if not current[phb].set_reg(name, value):
            error_logs.append(current[phb])
            current[phb] = PHBError(line)
            current[phb].set_reg(name, value)

    # save all the logs we're still processing
    error_logs.extend(current.values())

    return error_logs


def parse_kernel_log(log_text):
    """Extract PHB4 diag-data dumps from Linux kernel log text.

    Example of the input this understands (any amount of whitespace between
    the fields is accepted):

        kernel: PHB4 PHB#48 Diag-data (Version: 1)
        kernel: brdgCtl: 00000002
        kernel: RootSts: 00010020 00402000 a1030008 00100107 00002000
        kernel: RootErrSts: 00000000 00000000 00000001
        kernel: PhbSts: 0000001c00000000 0000001c00000000
        kernel: Lem: 0000000100280000 0000000000000000 0000000100000000
        kernel: PhbErr: 0000088000000000 0000008000000000 2148000098000240 a008400000000000
        kernel: RxeArbErr: 4000200000000000 0000200000000000 02409fde30000000 0000000000000000
        kernel: PblErr: 0000000001000000 0000000001000000 0000000000000000 0000000000000000
        kernel: PcieDlp: 0000000000000000 0000000000000000 ffff000000000000
        kernel: RegbErr: 0000004a10000800 0000000810000000 8800003c00000000 0000000007011000
        kernel: PE[1fd] A/B: a440002a05000000 8000000000000000

    Returns a list of PHBError objects, one per "Diag-data" header found (plus
    one for any register lines seen before the first header). An input with
    no kernel dump in it yields an empty list.
    """
    reg8 = "([0-9a-fA-F]{8})"
    reg16 = "([0-9a-fA-F]{16})"

    def reg_line(label, reg, *names):
        # "<label>:" followed by one hex word per name
        pattern = label + r":\s*" + r"\s+".join([reg] * len(names))
        return re.compile(pattern), names

    # TODO: pick up the AER stuff the kernel logs too?
    # NB: The register names used for set_reg are the names show_errs() looks
    # up (PHBError.reg_bits), not the kernel's labels.
    # TODO: check these for completeness / accuracy. I might have missed something
    register_patterns = [
        reg_line("brdgCtl", reg8, "brdgCtl"),
        reg_line("RootSts", reg8,
                 "deviceStatus", "slotStatus", "linkStatus", "devCmdStatus", "devSecStatus"),
        reg_line("RootErrSts", reg8,
                 "rootErrorStatus", "uncorrErrorStatus", "corrErrorStatus"),
        reg_line("PhbSts", reg16, "phbPlssr", "phbCsr"),
        # Stored as "NEST FIR" so that it is decoded with nfir_bits.
        # NOTE(review): the mask/WOF names are not decoded anywhere; confirm
        # what they should be called if that changes.
        reg_line("nFir", reg16, "NEST FIR", "nFirMask", "nFirWOF"),
        reg_line("Lem", reg16, "lemFir", "lemErrorMask", "lemWOF"),
        reg_line("PhbErr", reg16,
                 "phbErrorStatus", "phbFirstErrorStatus", "phbErrorLog0", "phbErrorLog1"),
        reg_line("PhbTxeErr", reg16,
                 "phbTxeErrorStatus", "phbTxeFirstErrorStatus", "phbTxeErrorLog0", "phbTxeErrorLog1"),
        reg_line("RxeArbErr", reg16,
                 "phbRxeArbErrorStatus", "phbRxeArbFirstErrorStatus", "phbRxeArbErrorLog0", "phbRxeArbErrorLog1"),
        reg_line("RxeMrgErr", reg16,
                 "phbRxeMrgErrorStatus", "phbRxeMrgFirstErrorStatus", "phbRxeMrgErrorLog0", "phbRxeMrgErrorLog1"),
        reg_line("RxeTceErr", reg16,
                 "phbRxeTceErrorStatus", "phbRxeTceFirstErrorStatus", "phbRxeTceErrorLog0", "phbRxeTceErrorLog1"),
        reg_line("PblErr", reg16,
                 "phbPblErrorStatus", "phbPblFirstErrorStatus", "phbPblErrorLog0", "phbPblErrorLog1"),
        reg_line("PcieDlp", reg16,
                 "phbPcieDlpErrorLog1", "phbPcieDlpErrorLog2", "phbPcieDlpErrorStatus"),
        reg_line("RegbErr", reg16,
                 "phbRegbErrorStatus", "phbRegbFirstErrorStatus", "phbRegbErrorLog0", "phbRegbErrorLog1"),
    ]

    header_pattern = re.compile(r"PHB4 PHB#[0-9]+ Diag-data")  # match header
    # the PE number is three hex digits (possibly space padded)
    pe_pattern = re.compile(
        r"PE\[([ 0-9a-fA-F]{3})\] A/B:\s*" + reg16 + r"\s+" + reg16)

    logs = []
    log = None  # created on the first header or the first register line

    # pretty nasty but since interpreting the kernel logs requires context I
    # don't have any better ideas
    for line in log_text.split("\n"):
        if header_pattern.search(line):  # start a new log
            if log is not None:
                logs.append(log)
            log = PHBError(line)
            continue

        for pattern, names in register_patterns:
            m = pattern.search(line)
            if not m:
                continue
            if log is None:
                log = PHBError("")
            for name, val in zip(names, m.groups()):
                log.set_reg(name, int(val, 16))
            break

        m = pe_pattern.search(line)
        if m:
            if log is None:
                log = PHBError("")
            pe, pesta, pestb = [int(i, 16) for i in m.groups()]
            log.set_pest(pe, pesta, pestb)

    if log is not None:
        logs.append(log)

    return logs


def main(argv):
    """Decode every PHB register dump found in the log file named by argv[1]."""
    if len(argv) < 2:
        print("Usage: {} <log file>".format(argv[0]))
        return

    try:
        with open(argv[1]) as f:
            log_text = f.read()
    except (OSError, ValueError) as err:  # ValueError covers decode errors
        print(err)
        sys.exit(1)

    logs = parse_opal_log(log_text)
    logs.extend(parse_kernel_log(log_text))

    for err in logs:
        print("==== PHB Register dump found ====")
        print("")
        print(err.header())
        print("")
        print(err.show_errs())


if __name__ == "__main__":
    main(sys.argv)
def ppcbit(i, width=64):
    """Return the integer with only PPC bit *i* set.

    PPC documentation numbers bits from the most significant end, so bit 0 is
    the top bit of a *width*-bit word and bit ``width - 1`` is the lowest.
    """
    return 1 << (width - 1 - i)


def ppcmask(a, b, width=64):
    """Return the mask covering PPC bits *a* through *b* inclusive.

    The result is 0 when *a* is greater than *b*.
    """
    return sum(ppcbit(i, width) for i in range(a, b + 1))


def ppcfield(a, b, v, width=64):
    """Return the field held in PPC bits *a*..*b* of *v*, shifted down to bit 0."""
    return (v & ppcmask(a, b, width)) >> (width - 1 - b)


def ppcbit32(i):
    """Return the integer with only PPC bit *i* of a 32-bit word set."""
    return ppcbit(i, 32)


def ppcmask32(a, b):
    """Return the mask covering PPC bits *a* through *b* of a 32-bit word."""
    return ppcmask(a, b, 32)


def ppcfield32(a, b, v):
    """Return the field held in PPC bits *a*..*b* of the 32-bit value *v*."""
    return ppcfield(a, b, v, 32)