From 853d9b18f1e861d37e9b271742329f8c1176eabe Mon Sep 17 00:00:00 2001 From: Levente Kurusa Date: Fri, 29 Nov 2013 21:28:48 +0100 Subject: x86, mce: Call put_device on device_register failure This patch adds a call to put_device() when the device_register() call has failed. This is required so that the last reference to the device is given up. Signed-off-by: Levente Kurusa Link: http://lkml.kernel.org/r/5298F900.9000208@linux.com Signed-off-by: Borislav Petkov diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index b3218cd..a389c1d 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -2272,8 +2272,10 @@ static int mce_device_create(unsigned int cpu) dev->release = &mce_device_release; err = device_register(dev); - if (err) + if (err) { + put_device(dev); return err; + } for (i = 0; mce_device_attrs[i]; i++) { err = device_create_file(dev, mce_device_attrs[i]); -- cgit v0.10.2 From 545104dd2ba0feb62544284ee1d6c98fe6ef8245 Mon Sep 17 00:00:00 2001 From: Rui Wang Date: Sun, 8 Dec 2013 12:17:53 +0800 Subject: PCI, AER: Fix severity usage in aer trace event There's inconsistency between dmesg and the trace event output. When dmesg says "severity=Corrected", the trace event says "severity=Fatal". What happens is that HW_EVENT_ERR_CORRECTED is defined in edac.h: enum hw_event_mc_err_type { HW_EVENT_ERR_CORRECTED, HW_EVENT_ERR_UNCORRECTED, HW_EVENT_ERR_FATAL, HW_EVENT_ERR_INFO, }; while aer_print_error() uses aer_error_severity_string[] defined as: static const char *aer_error_severity_string[] = { "Uncorrected (Non-Fatal)", "Uncorrected (Fatal)", "Corrected" }; In this case dmesg is correct because info->severity is assigned in aer_isr_one_error() using the definitions in include/linux/ras.h: Signed-off-by: Rui Wang Acked-by: Ethan Zhao Link: http://lkml.kernel.org/r/CANVTcTaP18CiGOSEcX5Ch_wPw9mEhkgokfp+d+ZOMFD+Ce4juA@mail.gmail.com Signed-off-by: Borislav Petkov diff --git a/include/trace/events/ras.h b/include/trace/events/ras.h index 88b8783..1c875ad 100644 --- a/include/trace/events/ras.h +++ b/include/trace/events/ras.h @@ -5,7 +5,7 @@ #define _TRACE_AER_H #include -#include +#include /* @@ -63,10 +63,10 @@ TRACE_EVENT(aer_event, TP_printk("%s PCIe Bus Error: severity=%s, %s\n", __get_str(dev_name), - __entry->severity == HW_EVENT_ERR_CORRECTED ? "Corrected" : - __entry->severity == HW_EVENT_ERR_FATAL ? - "Fatal" : "Uncorrected", - __entry->severity == HW_EVENT_ERR_CORRECTED ? + __entry->severity == AER_CORRECTABLE ? "Corrected" : + __entry->severity == AER_FATAL ? + "Fatal" : "Uncorrected, non-fatal", + __entry->severity == AER_CORRECTABLE ? __print_flags(__entry->status, "|", aer_correctable_errors) : __print_flags(__entry->status, "|", aer_uncorrectable_errors)) ); -- cgit v0.10.2 From c700f013adb0ec57518a7fe0163e3117659ce249 Mon Sep 17 00:00:00 2001 From: "Chen, Gong" Date: Fri, 6 Dec 2013 01:17:08 -0500 Subject: EDAC: Add an edac_report parameter to EDAC This new parameter is used to control how to report HW error reporting, especially for newer Intel platform, like Ivybridge-EX, which contains an enhanced error decoding functionality in the firmware, i.e. eMCA. Signed-off-by: Chen, Gong Acked-by: Tony Luck Link: http://lkml.kernel.org/r/1386310630-12529-2-git-send-email-gong.chen@linux.intel.com [ Boris: massage commit message. ] Signed-off-by: Borislav Petkov diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 50680a5..453092c 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -881,6 +881,14 @@ bytes respectively. Such letter suffixes can also be entirely omitted. The xen output can only be used by Xen PV guests. + edac_report= [HW,EDAC] Control how to report EDAC event + Format: {"on" | "off" | "force"} + on: enable EDAC to report H/W event. May be overridden + by other higher priority error reporting module. + off: disable H/W event reporting through EDAC. + force: enforce the use of EDAC to report H/W event. + default: on. + ekgdboc= [X86,KGDB] Allow early kernel console debugging ekgdboc=kbd diff --git a/drivers/edac/edac_stub.c b/drivers/edac/edac_stub.c index 351945f..9d9e18a 100644 --- a/drivers/edac/edac_stub.c +++ b/drivers/edac/edac_stub.c @@ -29,6 +29,25 @@ EXPORT_SYMBOL_GPL(edac_err_assert); static atomic_t edac_subsys_valid = ATOMIC_INIT(0); +int edac_report_status = EDAC_REPORTING_ENABLED; +EXPORT_SYMBOL_GPL(edac_report_status); + +static int __init edac_report_setup(char *str) +{ + if (!str) + return -EINVAL; + + if (!strncmp(str, "on", 2)) + set_edac_report_status(EDAC_REPORTING_ENABLED); + else if (!strncmp(str, "off", 3)) + set_edac_report_status(EDAC_REPORTING_DISABLED); + else if (!strncmp(str, "force", 5)) + set_edac_report_status(EDAC_REPORTING_FORCE); + + return 0; +} +__setup("edac_report=", edac_report_setup); + /* * called to determine if there is an EDAC driver interested in * knowing an event (such as NMI) occurred diff --git a/include/linux/edac.h b/include/linux/edac.h index dbdffe8..8e6c20a 100644 --- a/include/linux/edac.h +++ b/include/linux/edac.h @@ -35,6 +35,34 @@ extern void edac_atomic_assert_error(void); extern struct bus_type *edac_get_sysfs_subsys(void); extern void edac_put_sysfs_subsys(void); +enum { + EDAC_REPORTING_ENABLED, + EDAC_REPORTING_DISABLED, + EDAC_REPORTING_FORCE +}; + +extern int edac_report_status; +#ifdef CONFIG_EDAC +static inline int get_edac_report_status(void) +{ + return edac_report_status; +} + +static inline void set_edac_report_status(int new) +{ + edac_report_status = new; +} +#else +static inline int get_edac_report_status(void) +{ + return EDAC_REPORTING_DISABLED; +} + +static inline void set_edac_report_status(int new) +{ +} +#endif + static inline void opstate_init(void) { switch (edac_op_state) { -- cgit v0.10.2 From fd521039666529a4674b9822f1cc873672f57ee6 Mon Sep 17 00:00:00 2001 From: "Chen, Gong" Date: Fri, 6 Dec 2013 01:17:09 -0500 Subject: EDAC, sb_edac: Modify H/W event reporting policy Newer Intel platforms support more than one method to report H/W event. On this kind of platform, H/W event report can adopt new method and traditional EDAC method should be disabled. Moreover, if EDAC event report method is set to *force*, it means event must be reported via EDAC interface. IOW, it overrides the default event report policy. Signed-off-by: Chen, Gong Acked-by: Tony Luck Link: http://lkml.kernel.org/r/1386310630-12529-3-git-send-email-gong.chen@linux.intel.com [ Boris: massage commit and error messages ] Signed-off-by: Borislav Petkov diff --git a/drivers/edac/sb_edac.c b/drivers/edac/sb_edac.c index 8472405..e2e2cb3 100644 --- a/drivers/edac/sb_edac.c +++ b/drivers/edac/sb_edac.c @@ -1829,6 +1829,9 @@ static int sbridge_mce_check_error(struct notifier_block *nb, unsigned long val, struct mem_ctl_info *mci; struct sbridge_pvt *pvt; + if (get_edac_report_status() == EDAC_REPORTING_DISABLED) + return NOTIFY_DONE; + mci = get_mci_for_node_id(mce->socketid); if (!mci) return NOTIFY_BAD; @@ -2142,9 +2145,10 @@ static int __init sbridge_init(void) opstate_init(); pci_rc = pci_register_driver(&sbridge_driver); - if (pci_rc >= 0) { mce_register_decode_chain(&sbridge_mce_dec); + if (get_edac_report_status() == EDAC_REPORTING_DISABLED) + sbridge_printk(KERN_WARNING, "Loading driver, error reporting disabled.\n"); return 0; } -- cgit v0.10.2 From 42139eb356e3384759ca143ae04d82376346eb4c Mon Sep 17 00:00:00 2001 From: "Chen, Gong" Date: Fri, 6 Dec 2013 01:17:10 -0500 Subject: ACPI, eMCA: Combine eMCA/EDAC event reporting priority eMCA has higher H/W event reporting priority. Once it is loaded, EDAC event reporting should be disabled, unless EDAC overrides eMCA priority via command line parameter "edac_report=force". Signed-off-by: Chen, Gong Acked-by: Tony Luck Link: http://lkml.kernel.org/r/1386310630-12529-4-git-send-email-gong.chen@linux.intel.com [ Boris: massage printk message. ] Signed-off-by: Borislav Petkov diff --git a/drivers/acpi/acpi_extlog.c b/drivers/acpi/acpi_extlog.c index a6869e1..5d33c54 100644 --- a/drivers/acpi/acpi_extlog.c +++ b/drivers/acpi/acpi_extlog.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -43,6 +44,8 @@ struct extlog_l1_head { u8 rev1[12]; }; +static int old_edac_report_status; + static u8 extlog_dsm_uuid[] = "663E35AF-CC10-41A4-88EA-5470AF055295"; /* L1 table related physical address */ @@ -150,7 +153,7 @@ static int extlog_print(struct notifier_block *nb, unsigned long val, rc = print_extlog_rcd(NULL, (struct acpi_generic_status *)elog_buf, cpu); - return NOTIFY_DONE; + return NOTIFY_STOP; } static int extlog_get_dsm(acpi_handle handle, int rev, int func, u64 *ret) @@ -231,8 +234,12 @@ static int __init extlog_init(void) u64 cap; int rc; - rc = -ENODEV; + if (get_edac_report_status() == EDAC_REPORTING_FORCE) { + pr_warn("Not loading eMCA, error reporting force-enabled through EDAC.\n"); + return -EPERM; + } + rc = -ENODEV; rdmsrl(MSR_IA32_MCG_CAP, cap); if (!(cap & MCG_ELOG_P)) return rc; @@ -287,6 +294,12 @@ static int __init extlog_init(void) if (elog_buf == NULL) goto err_release_elog; + /* + * eMCA event report method has higher priority than EDAC method, + * unless EDAC event report method is mandatory. + */ + old_edac_report_status = get_edac_report_status(); + set_edac_report_status(EDAC_REPORTING_DISABLED); mce_register_decode_chain(&extlog_mce_dec); /* enable OS to be involved to take over management from BIOS */ ((struct extlog_l1_head *)extlog_l1_addr)->flags |= FLAG_OS_OPTIN; @@ -308,6 +321,7 @@ err: static void __exit extlog_exit(void) { + set_edac_report_status(old_edac_report_status); mce_unregister_decode_chain(&extlog_mce_dec); ((struct extlog_l1_head *)extlog_l1_addr)->flags &= ~FLAG_OS_OPTIN; if (extlog_l1_addr) -- cgit v0.10.2