From b9183f9b99a9bd3349aefbd51d22f7e1bdc4a087 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 15 Sep 2009 15:56:32 +0200 Subject: amd64_edac: build driver only on AMD hardware -tip testing found the following build failure (config attached): drivers/built-in.o: In function `amd64_check': amd64_edac.c:(.text+0x3e9491): undefined reference to `amd_decode_nb_mce' drivers/built-in.o: In function `amd64_init_2nd_stage': amd64_edac.c:(.text+0x3e9b46): undefined reference to `amd_report_gart_errors' amd64_edac.c:(.text+0x3e9b55): undefined reference to `amd_register_ecc_decoder' drivers/built-in.o: In function `amd64_nbea_store': amd64_edac_dbg.c:(.text+0x3ea22e): undefined reference to `amd_decode_nb_mce' drivers/built-in.o: In function `amd64_remove_one_instance': amd64_edac.c:(.devexit.text+0x3eea): undefined reference to `amd_report_gart_errors' amd64_edac.c:(.devexit.text+0x3ef6): undefined reference to `amd_unregister_ecc_decoder' the AMD EDAC code has a dependency on CONFIG_CPU_SUP_AMD facilities. The patch below solves the problem here. Signed-off-by: Ingo Molnar Signed-off-by: Borislav Petkov diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig index 4339b1a..a3ca18e 100644 --- a/drivers/edac/Kconfig +++ b/drivers/edac/Kconfig @@ -59,7 +59,7 @@ config EDAC_MM_EDAC config EDAC_AMD64 tristate "AMD64 (Opteron, Athlon64) K8, F10h, F11h" - depends on EDAC_MM_EDAC && K8_NB && X86_64 && PCI + depends on EDAC_MM_EDAC && K8_NB && X86_64 && PCI && CPU_SUP_AMD help Support for error detection and correction on the AMD 64 Families of Memory Controllers (K8, F10h and F11h) -- cgit v0.10.2 From 6a8126911a5ab167783fce18ae9cc70ec9b84fe2 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Wed, 16 Sep 2009 11:33:40 +0200 Subject: x86, EDAC: Provide function to return NodeId of a CPU Signed-off-by: Andreas Herrmann Signed-off-by: Borislav Petkov Acked-by: H. 
Peter Anvin diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index e08ea04..42a3f93 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -1020,4 +1020,6 @@ extern void start_thread(struct pt_regs *regs, unsigned long new_ip, extern int get_tsc_mode(unsigned long adr); extern int set_tsc_mode(unsigned int val); +extern int amd_get_nb_id(int cpu); + #endif /* _ASM_X86_PROCESSOR_H */ diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 22a47c8..f32fa71 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -333,6 +333,16 @@ static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c) #endif } +int amd_get_nb_id(int cpu) +{ + int id = 0; +#ifdef CONFIG_SMP + id = per_cpu(cpu_llc_id, cpu); +#endif + return id; +} +EXPORT_SYMBOL_GPL(amd_get_nb_id); + static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c) { #if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) diff --git a/drivers/edac/edac_mce_amd.c b/drivers/edac/edac_mce_amd.c index c8ca713..0c21c37 100644 --- a/drivers/edac/edac_mce_amd.c +++ b/drivers/edac/edac_mce_amd.c @@ -405,7 +405,7 @@ void decode_mce(struct mce *m) regs.nbsh = (u32)(m->status >> 32); regs.nbeal = (u32) m->addr; regs.nbeah = (u32)(m->addr >> 32); - node = per_cpu(cpu_llc_id, m->extcpu); + node = amd_get_nb_id(m->extcpu); amd_decode_nb_mce(node, &regs, 1); break; -- cgit v0.10.2 From be3468e8ff768c986849870b24e85fa84806da73 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 5 Aug 2009 15:47:22 +0200 Subject: amd64_edac: cleanup amd64_check_ecc_enabled Simplify code flow and make sure return value is always valid since further driver init depends on it. Carve out long warning string and make code more readable. Shorten some names, while at it. There should be no functional change resulting from this patch. 
Signed-off-by: Borislav Petkov diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 173dc4a..bde3d02 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -2783,7 +2783,7 @@ static void check_mcg_ctl(void *ret) } /* check MCG_CTL on all the cpus on this node */ -static int amd64_mcg_ctl_enabled_on_cpus(const cpumask_t *mask) +static int mcg_ctl_enabled_on_node(const struct cpumask *mask) { int ret = 1; preempt_disable(); @@ -2799,71 +2799,45 @@ static int amd64_mcg_ctl_enabled_on_cpus(const cpumask_t *mask) * the memory system completely. A command line option allows to force-enable * hardware ECC later in amd64_enable_ecc_error_reporting(). */ +static const char *ecc_warning = + "WARNING: ECC is disabled by BIOS. Module will NOT be loaded.\n" + " Either Enable ECC in the BIOS, or set 'ecc_enable_override'.\n" + " Also, use of the override can cause unknown side effects.\n"; + static int amd64_check_ecc_enabled(struct amd64_pvt *pvt) { u32 value; - int err = 0, ret = 0; - u8 ecc_enabled = 0; + int err = 0; + u8 ecc_enabled = 0, mcg_ctl_en = 0; err = pci_read_config_dword(pvt->misc_f3_ctl, K8_NBCFG, &value); if (err) debugf0("Reading K8_NBCTL failed\n"); ecc_enabled = !!(value & K8_NBCFG_ECC_ENABLE); + if (!ecc_enabled) + amd64_printk(KERN_WARNING, "This node reports that Memory ECC " + "is currently disabled, set F3x%x[22] (%s).\n", + K8_NBCFG, pci_name(pvt->misc_f3_ctl)); + else + amd64_printk(KERN_INFO, "ECC is enabled by BIOS.\n"); - ret = amd64_mcg_ctl_enabled_on_cpus(cpumask_of_node(pvt->mc_node_id)); - - debugf0("K8_NBCFG=0x%x, DRAM ECC is %s\n", value, - (value & K8_NBCFG_ECC_ENABLE ? 
"enabled" : "disabled")); - - if (!ecc_enabled || !ret) { - if (!ecc_enabled) { - amd64_printk(KERN_WARNING, "This node reports that " - "Memory ECC is currently " - "disabled.\n"); + mcg_ctl_en = mcg_ctl_enabled_on_node(cpumask_of_node(pvt->mc_node_id)); + if (!mcg_ctl_en) + amd64_printk(KERN_WARNING, "NB MCE bank disabled, set MSR " + "0x%08x[4] on node %d to enable.\n", + MSR_IA32_MCG_CTL, pvt->mc_node_id); - amd64_printk(KERN_WARNING, "bit 0x%lx in register " - "F3x%x of the MISC_CONTROL device (%s) " - "should be enabled\n", K8_NBCFG_ECC_ENABLE, - K8_NBCFG, pci_name(pvt->misc_f3_ctl)); - } - if (!ret) { - amd64_printk(KERN_WARNING, "bit 0x%016lx in MSR 0x%08x " - "of node %d should be enabled\n", - K8_MSR_MCGCTL_NBE, MSR_IA32_MCG_CTL, - pvt->mc_node_id); - } + if (!ecc_enabled || !mcg_ctl_en) { if (!ecc_enable_override) { - amd64_printk(KERN_WARNING, "WARNING: ECC is NOT " - "currently enabled by the BIOS. Module " - "will NOT be loaded.\n" - " Either Enable ECC in the BIOS, " - "or use the 'ecc_enable_override' " - "parameter.\n" - " Might be a BIOS bug, if BIOS says " - "ECC is enabled\n" - " Use of the override can cause " - "unknown side effects.\n"); - ret = -ENODEV; - } else - /* - * enable further driver loading if ECC enable is - * overridden. 
- */ - ret = 0; - } else { - amd64_printk(KERN_INFO, - "ECC is enabled by BIOS, Proceeding " - "with EDAC module initialization\n"); - - /* Signal good ECC status */ - ret = 0; - + amd64_printk(KERN_WARNING, "%s", ecc_warning); + return -ENODEV; + } + } else /* CLEAR the override, since BIOS controlled it */ ecc_enable_override = 0; - } - return ret; + return 0; } struct mcidev_sysfs_attribute sysfs_attrs[ARRAY_SIZE(amd64_dbg_attrs) + -- cgit v0.10.2 From 57a30854c89f862eeada4cce822f3a87bc006c95 Mon Sep 17 00:00:00 2001 From: Wan Wei Date: Fri, 7 Aug 2009 17:04:49 +0200 Subject: amd64_edac: Rewrite unganged mode code of f10_early_channel_count Simplify the procedure by checking if there is any DIMM in each channel. This patch will fix the bugs such as when there is no DIMMs under certain node, two DIMMs in the same channel, and only one DIMM in each channel of the node. Borislav: minor fixups Signed-off-by: Wan Wei Signed-off-by: Borislav Petkov diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index bde3d02..f943ad8 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -1255,7 +1255,9 @@ static int k8_dbam_map_to_pages(struct amd64_pvt *pvt, int dram_map) */ static int f10_early_channel_count(struct amd64_pvt *pvt) { + int dbams[] = { DBAM0, DBAM1 }; int err = 0, channels = 0; + int i, j; u32 dbam; err = pci_read_config_dword(pvt->dram_f2_ctl, F10_DCLR_0, &pvt->dclr0); @@ -1288,46 +1290,19 @@ static int f10_early_channel_count(struct amd64_pvt *pvt) * is more than just one DIMM present in unganged mode. Need to check * both controllers since DIMMs can be placed in either one. 
*/ - channels = 0; - err = pci_read_config_dword(pvt->dram_f2_ctl, DBAM0, &dbam); - if (err) - goto err_reg; - - if (DBAM_DIMM(0, dbam) > 0) - channels++; - if (DBAM_DIMM(1, dbam) > 0) - channels++; - if (DBAM_DIMM(2, dbam) > 0) - channels++; - if (DBAM_DIMM(3, dbam) > 0) - channels++; - - /* If more than 2 DIMMs are present, then we have 2 channels */ - if (channels > 2) - channels = 2; - else if (channels == 0) { - /* No DIMMs on DCT0, so look at DCT1 */ - err = pci_read_config_dword(pvt->dram_f2_ctl, DBAM1, &dbam); + for (i = 0; i < ARRAY_SIZE(dbams); i++) { + err = pci_read_config_dword(pvt->dram_f2_ctl, dbams[i], &dbam); if (err) goto err_reg; - if (DBAM_DIMM(0, dbam) > 0) - channels++; - if (DBAM_DIMM(1, dbam) > 0) - channels++; - if (DBAM_DIMM(2, dbam) > 0) - channels++; - if (DBAM_DIMM(3, dbam) > 0) - channels++; - - if (channels > 2) - channels = 2; + for (j = 0; j < 4; j++) { + if (DBAM_DIMM(j, dbam) > 0) { + channels++; + break; + } + } } - /* If we found ALL 0 values, then assume just ONE DIMM-ONE Channel */ - if (channels == 0) - channels = 1; - debugf0("MCT channel count: %d\n", channels); return channels; -- cgit v0.10.2 From 06724535f8fa26e78238bf8adfc9c81650a665f7 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 16 Sep 2009 13:05:46 +0200 Subject: amd64_edac: check NB MCE bank enable on the current node properly The old code was using smp_call_function_many which skips the current cpu if it is in the supplied cpumask. Switch to the rdmsr_on_cpus() interface which takes care of that. In addition, add get_cpus_on_this_dct_cpumask helper which computes a cpumask of all the cores on a node and thus on a DCT. 
Signed-off-by: Borislav Petkov diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index f943ad8..4e551e6 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -2741,30 +2741,53 @@ static void amd64_restore_ecc_error_reporting(struct amd64_pvt *pvt) wrmsr_on_cpus(cpumask, K8_MSR_MCGCTL, msrs); } -static void check_mcg_ctl(void *ret) +/* get all cores on this DCT */ +static void get_cpus_on_this_dct_cpumask(cpumask_t *mask, int nid) { - u64 msr_val = 0; - u8 nbe; + int cpu; - rdmsrl(MSR_IA32_MCG_CTL, msr_val); - nbe = msr_val & K8_MSR_MCGCTL_NBE; - - debugf0("core: %u, MCG_CTL: 0x%llx, NB MSR is %s\n", - raw_smp_processor_id(), msr_val, - (nbe ? "enabled" : "disabled")); - - if (!nbe) - *(int *)ret = 0; + for_each_online_cpu(cpu) + if (amd_get_nb_id(cpu) == nid) + cpumask_set_cpu(cpu, mask); } /* check MCG_CTL on all the cpus on this node */ -static int mcg_ctl_enabled_on_node(const struct cpumask *mask) +static bool amd64_nb_mce_bank_enabled_on_node(int nid) { - int ret = 1; - preempt_disable(); - smp_call_function_many(mask, check_mcg_ctl, &ret, 1); - preempt_enable(); + cpumask_t mask; + struct msr *msrs; + int cpu, nbe, idx = 0; + bool ret = false; + + cpumask_clear(&mask); + + get_cpus_on_this_dct_cpumask(&mask, nid); + + msrs = kzalloc(sizeof(struct msr) * cpumask_weight(&mask), GFP_KERNEL); + if (!msrs) { + amd64_printk(KERN_WARNING, "%s: error allocating msrs\n", + __func__); + return false; + } + + rdmsr_on_cpus(&mask, MSR_IA32_MCG_CTL, msrs); + + for_each_cpu(cpu, &mask) { + nbe = msrs[idx].l & K8_MSR_MCGCTL_NBE; + + debugf0("core: %u, MCG_CTL: 0x%llx, NB MSR is %s\n", + cpu, msrs[idx].q, + (nbe ? 
"enabled" : "disabled")); + + if (!nbe) + goto out; + + idx++; + } + ret = true; +out: + kfree(msrs); return ret; } @@ -2783,7 +2806,8 @@ static int amd64_check_ecc_enabled(struct amd64_pvt *pvt) { u32 value; int err = 0; - u8 ecc_enabled = 0, mcg_ctl_en = 0; + u8 ecc_enabled = 0; + bool nb_mce_en = false; err = pci_read_config_dword(pvt->misc_f3_ctl, K8_NBCFG, &value); if (err) @@ -2797,13 +2821,13 @@ static int amd64_check_ecc_enabled(struct amd64_pvt *pvt) else amd64_printk(KERN_INFO, "ECC is enabled by BIOS.\n"); - mcg_ctl_en = mcg_ctl_enabled_on_node(cpumask_of_node(pvt->mc_node_id)); - if (!mcg_ctl_en) + nb_mce_en = amd64_nb_mce_bank_enabled_on_node(pvt->mc_node_id); + if (!nb_mce_en) amd64_printk(KERN_WARNING, "NB MCE bank disabled, set MSR " "0x%08x[4] on node %d to enable.\n", MSR_IA32_MCG_CTL, pvt->mc_node_id); - if (!ecc_enabled || !mcg_ctl_en) { + if (!ecc_enabled || !nb_mce_en) { if (!ecc_enable_override) { amd64_printk(KERN_WARNING, "%s", ecc_warning); return -ENODEV; -- cgit v0.10.2