From b2de43605410d1970dc9e0f349e399f1d561be13 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Fri, 27 May 2016 14:11:06 -0700 Subject: x86/mce: Do not use bank 1 for APEI generated error logs BIOS can report a memory error to Linux using ACPI/APEI mechanism. When it does this, we create a fictitious machine check error record and feed it into the standard mce_log() function. The error record needs a machine check bank number, and for some reason we chose "1" for this. But "1" is a valid bank number, and this causes confusion and heartburn among h/w folks who are concerned that a memory error signature was somehow logged in bank 1. Change to use "-1" (field is a "u8" so will typically print as 255). This should make it clearer that this error did not originate in a machine check bank. Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Aristeu Rozanski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Mauro Carvalho Chehab Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-edac Link: http://lkml.kernel.org/r/b7fffb2b326bc1dd150ffceb9919a803f9496e0e.1464805958.git.tony.luck@intel.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/cpu/mcheck/mce-apei.c b/arch/x86/kernel/cpu/mcheck/mce-apei.c index 34c89a3..83f1a98 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-apei.c +++ b/arch/x86/kernel/cpu/mcheck/mce-apei.c @@ -46,7 +46,7 @@ void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err) return; mce_setup(&m); - m.bank = 1; + m.bank = -1; /* Fake a memory read error with unknown channel */ m.status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | 0x9f; -- cgit v0.10.2 From 955d1427a91b18f53e082bd7c19c40ce13b0a0f4 Mon Sep 17 00:00:00 2001 From: Aravind Gopalakrishnan Date: Fri, 8 Jul 2016 11:09:38 +0200 Subject: x86/mce/AMD: Increase size of the bank_map type Change bank_map type from 'char' to 'int' since we now have more than eight banks in a system. Signed-off-by: Aravind Gopalakrishnan Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Aravind Gopalakrishnan Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Cc: linux-edac Link: http://lkml.kernel.org/r/1467968983-4874-2-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 10b0661..7b7f3be 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -93,7 +93,7 @@ const char * const amd_df_mcablock_names[] = { EXPORT_SYMBOL_GPL(amd_df_mcablock_names); static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks); -static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */ +static DEFINE_PER_CPU(unsigned int, bank_map); /* see which banks are on */ static void amd_threshold_interrupt(void); static void amd_deferred_error_interrupt(void); -- cgit v0.10.2 From 340e983ab8afd02b59d698dd1365d7773bf136b3 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Fri, 8 Jul 2016 11:09:39 +0200 Subject: x86/RAS/AMD: Reduce the number of IPIs when prepping error injection We currently use wrmsr_on_cpu() 4 times when prepping for an error injection. This will generate 4 IPIs for each MSR write. We can reduce the number of IPIs to 1 by grouping the MSR writes and executing them serially on the appropriate CPU. Suggested-by: Borislav Petkov Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Aravind Gopalakrishnan Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Cc: linux-edac Link: http://lkml.kernel.org/r/1467968983-4874-3-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar diff --git a/arch/x86/ras/mce_amd_inj.c b/arch/x86/ras/mce_amd_inj.c index e69f470..1104515 100644 --- a/arch/x86/ras/mce_amd_inj.c +++ b/arch/x86/ras/mce_amd_inj.c @@ -241,6 +241,31 @@ static void toggle_nb_mca_mst_cpu(u16 nid) __func__, PCI_FUNC(F3->devfn), NBCFG); } +static void prepare_msrs(void *info) +{ + struct mce i_mce = *(struct mce *)info; + u8 b = i_mce.bank; + + wrmsrl(MSR_IA32_MCG_STATUS, i_mce.mcgstatus); + + if (boot_cpu_has(X86_FEATURE_SMCA)) { + if (i_mce.inject_flags == DFR_INT_INJ) { + wrmsrl(MSR_AMD64_SMCA_MCx_DESTAT(b), i_mce.status); + wrmsrl(MSR_AMD64_SMCA_MCx_DEADDR(b), i_mce.addr); + } else { + wrmsrl(MSR_AMD64_SMCA_MCx_STATUS(b), i_mce.status); + wrmsrl(MSR_AMD64_SMCA_MCx_ADDR(b), i_mce.addr); + } + + wrmsrl(MSR_AMD64_SMCA_MCx_MISC(b), i_mce.misc); + } else { + wrmsrl(MSR_IA32_MCx_STATUS(b), i_mce.status); + wrmsrl(MSR_IA32_MCx_ADDR(b), i_mce.addr); + wrmsrl(MSR_IA32_MCx_MISC(b), i_mce.misc); + } + +} + static void do_inject(void) { u64 mcg_status = 0; @@ -287,36 +312,9 @@ static void do_inject(void) toggle_hw_mce_inject(cpu, true); - wrmsr_on_cpu(cpu, MSR_IA32_MCG_STATUS, - (u32)mcg_status, (u32)(mcg_status >> 32)); - - if (boot_cpu_has(X86_FEATURE_SMCA)) { - if (inj_type == DFR_INT_INJ) { - wrmsr_on_cpu(cpu, MSR_AMD64_SMCA_MCx_DESTAT(b), - (u32)i_mce.status, (u32)(i_mce.status >> 32)); - - wrmsr_on_cpu(cpu, MSR_AMD64_SMCA_MCx_DEADDR(b), - (u32)i_mce.addr, (u32)(i_mce.addr >> 32)); - } else { - wrmsr_on_cpu(cpu, MSR_AMD64_SMCA_MCx_STATUS(b), - (u32)i_mce.status, (u32)(i_mce.status >> 32)); - - wrmsr_on_cpu(cpu, MSR_AMD64_SMCA_MCx_ADDR(b), - (u32)i_mce.addr, (u32)(i_mce.addr >> 32)); - } - - wrmsr_on_cpu(cpu, MSR_AMD64_SMCA_MCx_MISC(b), - (u32)i_mce.misc, (u32)(i_mce.misc >> 32)); - } else { - wrmsr_on_cpu(cpu, MSR_IA32_MCx_STATUS(b), - (u32)i_mce.status, (u32)(i_mce.status >> 32)); - - wrmsr_on_cpu(cpu, MSR_IA32_MCx_ADDR(b), - (u32)i_mce.addr, (u32)(i_mce.addr >> 32)); - - wrmsr_on_cpu(cpu, MSR_IA32_MCx_MISC(b), - (u32)i_mce.misc, (u32)(i_mce.misc >> 32)); - } + i_mce.mcgstatus = mcg_status; + i_mce.inject_flags = inj_type; + smp_call_function_single(cpu, prepare_msrs, &i_mce, 0); toggle_hw_mce_inject(cpu, false); -- cgit v0.10.2 From 38c54ccb2ded3e93d8a353baeb7b9e12e1b77e23 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 8 Jul 2016 11:09:41 +0200 Subject: x86/mce: Fix mce_rdmsrl() warning message The MSR address we're dumping in there should be in hex, otherwise we get funsies like: [ 0.016000] WARNING: CPU: 1 PID: 0 at arch/x86/kernel/cpu/mcheck/mce.c:428 mce_rdmsrl+0xd9/0xe0 [ 0.016000] mce: Unable to read msr -1073733631! ^^^^^^^^^^^ Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Link: http://lkml.kernel.org/r/1467968983-4874-5-git-send-email-bp@alien8.de [ Fixed capitalization of 'MSR'. ] Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 92e5e37..58af630 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -425,7 +425,7 @@ static u64 mce_rdmsrl(u32 msr) } if (rdmsrl_safe(msr, &v)) { - WARN_ONCE(1, "mce: Unable to read msr %d!\n", msr); + WARN_ONCE(1, "mce: Unable to read MSR 0x%x!\n", msr); /* * Return zero in case the access faulted. This should * not happen normally but can happen if the CPU does -- cgit v0.10.2