From: Paul Mackerras From: Linas Vepstas This patch fixes the usage of the slot-error-detail log buffer for the Power5 architecture. The size of the error buffer is variable, and the correct size to use should have been obtained from firmware. Failure to use the correct buffer sizes will result in hard-to-debug system lockups deep in firmware. This patch is based on an earlier patch from Ben Herrenschmidt, which essentially did the same thing. This patch also tweaks some of the subroutine documentation. Signed-off-by: Linas Vepstas Signed-off-by: Paul Mackerras Signed-off-by: Andrew Morton --- 25-akpm/arch/ppc64/kernel/eeh.c | 65 ++++++++++++++++++++++++---------------- 1 files changed, 39 insertions(+), 26 deletions(-) diff -puN arch/ppc64/kernel/eeh.c~ppc64-eeh-fixes-for-power5-machines-2-2 arch/ppc64/kernel/eeh.c --- 25/arch/ppc64/kernel/eeh.c~ppc64-eeh-fixes-for-power5-machines-2-2 2004-07-03 01:27:36.654011544 -0700 +++ 25-akpm/arch/ppc64/kernel/eeh.c 2004-07-03 01:27:36.659010784 -0700 @@ -45,12 +45,18 @@ static int ibm_set_eeh_option; static int ibm_set_slot_reset; static int ibm_read_slot_reset_state; +static int ibm_slot_error_detail; static int eeh_subsystem_enabled; #define EEH_MAX_OPTS 4096 static char *eeh_opts; static int eeh_opts_last; +/* Buffer for reporting slot-error-detail rtas calls */ +static unsigned char slot_errbuf[RTAS_ERROR_LOG_MAX]; +static spinlock_t slot_errbuf_lock = SPIN_LOCK_UNLOCKED; +static int eeh_error_buf_size; + /* System monitoring statistics */ static DEFINE_PER_CPU(unsigned long, total_mmio_ffs); static DEFINE_PER_CPU(unsigned long, false_positives); @@ -368,9 +374,6 @@ unsigned long eeh_check_failure(void *to struct device_node *dn; int ret; int rets[2]; - static spinlock_t lock = SPIN_LOCK_UNLOCKED; - /* dont want this on the stack */ - static unsigned char slot_err_buf[RTAS_ERROR_LOG_MAX]; unsigned long flags; __get_cpu_var(total_mmio_ffs)++; @@ -414,23 +417,24 @@ unsigned long eeh_check_failure(void *to BUID_LO(dn->phb->buid)); if (ret == 0 && rets[1] == 1 && rets[0] >= 2) { - int slot_err_ret; + int log_event; + + spin_lock_irqsave(&slot_errbuf_lock, flags); + memset(slot_errbuf, 0, eeh_error_buf_size); - spin_lock_irqsave(&lock, flags); - memset(slot_err_buf, 0, RTAS_ERROR_LOG_MAX); - slot_err_ret = rtas_call(rtas_token("ibm,slot-error-detail"), - 8, 1, NULL, dn->eeh_config_addr, - BUID_HI(dn->phb->buid), - BUID_LO(dn->phb->buid), NULL, 0, - __pa(slot_err_buf), - RTAS_ERROR_LOG_MAX, - 2 /* Permanent Error */); + log_event = rtas_call(ibm_slot_error_detail, + 8, 1, NULL, dn->eeh_config_addr, + BUID_HI(dn->phb->buid), + BUID_LO(dn->phb->buid), NULL, 0, + virt_to_phys(slot_errbuf), + eeh_error_buf_size, + 2 /* Permanent Error */); - if (slot_err_ret == 0) - log_error(slot_err_buf, ERR_TYPE_RTAS_LOG, + if (log_event == 0) + log_error(slot_errbuf, ERR_TYPE_RTAS_LOG, 1 /* Fatal */); - spin_unlock_irqrestore(&lock, flags); + spin_unlock_irqrestore(&slot_errbuf_lock, flags); /* * XXX We should create a separate sysctl for this. @@ -517,8 +521,7 @@ static void *early_enable_eeh(struct dev } if (!enable || info->force_off) { - dn->eeh_mode = EEH_MODE_NOCHECK; - return NULL; + dn->eeh_mode |= EEH_MODE_NOCHECK; } /* This device may already have an EEH parent. */ @@ -562,14 +565,13 @@ static void *early_enable_eeh(struct dev * As a side effect we can determine here if eeh is supported at all. * Note that we leave EEH on so failed config cycles won't cause a machine * check. If a user turns off EEH for a particular adapter they are really - * telling Linux to ignore errors. - * - * We should probably distinguish between "ignore errors" and "turn EEH off" - * but for now disabling EEH for adapters is mostly to work around drivers that - * directly access mmio space (without using the macros). - * - * The eeh-force-off option does literally what it says, so if Linux must - * avoid enabling EEH this must be done. + * telling Linux to ignore errors. Some hardware (e.g. POWER5) won't + * grant access to a slot if EEH isn't enabled, and so we always enable + * EEH for all slots/all devices. + * + * The eeh-force-off option disables EEH checking globally, for all slots. + * Even if force-off is set, the EEH hardware is still enabled, so that + * newer systems can boot. */ void __init eeh_init(void) { @@ -588,10 +590,21 @@ void __init eeh_init(void) ibm_set_eeh_option = rtas_token("ibm,set-eeh-option"); ibm_set_slot_reset = rtas_token("ibm,set-slot-reset"); ibm_read_slot_reset_state = rtas_token("ibm,read-slot-reset-state"); + ibm_slot_error_detail = rtas_token("ibm,slot-error-detail"); if (ibm_set_eeh_option == RTAS_UNKNOWN_SERVICE) return; + eeh_error_buf_size = rtas_token("rtas-error-log-max"); + if (eeh_error_buf_size == RTAS_UNKNOWN_SERVICE) { + eeh_error_buf_size = 1024; + } + if (eeh_error_buf_size > RTAS_ERROR_LOG_MAX) { + printk(KERN_WARNING "EEH: rtas-error-log-max is bigger than allocated " + "buffer ! (%d vs %d)", eeh_error_buf_size, RTAS_ERROR_LOG_MAX); + eeh_error_buf_size = RTAS_ERROR_LOG_MAX; + } + info.force_off = 0; if (eeh_force_off) { printk(KERN_WARNING "EEH: WARNING: PCI Enhanced I/O Error " _