diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2013-03-01 04:42:33 (GMT) |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2013-03-01 04:42:33 (GMT) |
commit | ad6c2c2eb34f234d6253292b9b3c047614fbfe7e (patch) | |
tree | 8ceb00db9874c09f3002b5ca579f1f9146b30a28 /include | |
parent | 19cc90f58d4f2538b4cf5371681a057d2e5209f2 (diff) | |
parent | b0769891ba7baa53f270dc70d71934748beb4c5b (diff) | |
download | linux-ad6c2c2eb34f234d6253292b9b3c047614fbfe7e.tar.xz |
Merge branch 'linux_next' of git://git.kernel.org/pub/scm/linux/kernel/git/mchehab/linux-edac
Pull EDAC fixes and ghes-edac from Mauro Carvalho Chehab:
"For:
- Some fixes at edac drivers (i7core_edac, sb_edac, i3200_edac);
- error injection support for i5100, when EDAC debug is enabled;
- fix edac when it is loaded builtin (early init for the subsystem);
- a "Firmware First" EDAC driver, allowing ghes to report errors via
EDAC (ghes-edac).
With regards to ghes-edac, this fixes a longstanding BZ at Red Hat
that happens with Nehalem and Sandy Bridge CPUs: when both GHES and
i7core_edac or sb_edac are running, the error reports are
unpredictable, as both BIOS and OS race to access the registers. With
ghes-edac, the EDAC core will refuse to register any other concurrent
memory error driver.
This patchset moves the ghes struct definitions to a separate header
file (include/acpi/ghes.h) and adds 3 hooks at apei/ghes.c to
register/unregister and to report errors via ghes-edac. Those changes
were acked by ghes driver maintainer (Huang)."
* 'linux_next' of git://git.kernel.org/pub/scm/linux/kernel/git/mchehab/linux-edac: (30 commits)
i5100_edac: convert to use simple_open()
ghes_edac: fix to use list_for_each_entry_safe() when delete list items
ghes_edac: Fix RAS tracing
ghes_edac: Make it compliant with UEFI spec 2.3.1
ghes_edac: Improve driver's printk messages
ghes_edac: Don't credit the same memory dimm twice
ghes_edac: do a better job of filling EDAC DIMM info
ghes_edac: add support for reporting errors via EDAC
ghes_edac: Register at EDAC core the BIOS report
ghes: add the needed hooks for EDAC error report
ghes: move structures/enum to a header file
edac: add support for error type "Info"
edac: add support for raw error reports
edac: reduce stack pressure by using a pre-allocated buffer
edac: lock module owner to avoid error report conflicts
edac: remove proc_name from mci structure
edac: add a new memory layer type
edac: initialize the core earlier
edac: better report error conditions in debug mode
i5100_edac: Remove two checkpatch warnings
...
Diffstat (limited to 'include')
-rw-r--r-- | include/acpi/ghes.h | 72 | ||||
-rw-r--r-- | include/linux/edac.h | 79 | ||||
-rw-r--r-- | include/linux/pci_ids.h | 1 | ||||
-rw-r--r-- | include/ras/ras_event.h | 4 |
4 files changed, 150 insertions, 6 deletions
diff --git a/include/acpi/ghes.h b/include/acpi/ghes.h new file mode 100644 index 0000000..720446c --- /dev/null +++ b/include/acpi/ghes.h @@ -0,0 +1,72 @@ +#include <acpi/apei.h> +#include <acpi/hed.h> + +/* + * One struct ghes is created for each generic hardware error source. + * It provides the context for APEI hardware error timer/IRQ/SCI/NMI + * handler. + * + * estatus: memory buffer for error status block, allocated during + * HEST parsing. + */ +#define GHES_TO_CLEAR 0x0001 +#define GHES_EXITING 0x0002 + +struct ghes { + struct acpi_hest_generic *generic; + struct acpi_hest_generic_status *estatus; + u64 buffer_paddr; + unsigned long flags; + union { + struct list_head list; + struct timer_list timer; + unsigned int irq; + }; +}; + +struct ghes_estatus_node { + struct llist_node llnode; + struct acpi_hest_generic *generic; + struct ghes *ghes; +}; + +struct ghes_estatus_cache { + u32 estatus_len; + atomic_t count; + struct acpi_hest_generic *generic; + unsigned long long time_in; + struct rcu_head rcu; +}; + +enum { + GHES_SEV_NO = 0x0, + GHES_SEV_CORRECTED = 0x1, + GHES_SEV_RECOVERABLE = 0x2, + GHES_SEV_PANIC = 0x3, +}; + +/* From drivers/edac/ghes_edac.c */ + +#ifdef CONFIG_EDAC_GHES +void ghes_edac_report_mem_error(struct ghes *ghes, int sev, + struct cper_sec_mem_err *mem_err); + +int ghes_edac_register(struct ghes *ghes, struct device *dev); + +void ghes_edac_unregister(struct ghes *ghes); + +#else +static inline void ghes_edac_report_mem_error(struct ghes *ghes, int sev, + struct cper_sec_mem_err *mem_err) +{ +} + +static inline int ghes_edac_register(struct ghes *ghes, struct device *dev) +{ + return 0; +} + +static inline void ghes_edac_unregister(struct ghes *ghes) +{ +} +#endif diff --git a/include/linux/edac.h b/include/linux/edac.h index 1b8c02b..4fd4999 100644 --- a/include/linux/edac.h +++ b/include/linux/edac.h @@ -14,7 +14,6 @@ #include <linux/atomic.h> #include <linux/device.h> -#include <linux/kobject.h> #include <linux/completion.h> #include <linux/workqueue.h> #include <linux/debugfs.h> @@ -48,8 +47,17 @@ static inline void opstate_init(void) return; } +/* Max length of a DIMM label*/ #define EDAC_MC_LABEL_LEN 31 -#define MC_PROC_NAME_MAX_LEN 7 + +/* Maximum size of the location string */ +#define LOCATION_SIZE 80 + +/* Defines the maximum number of labels that can be reported */ +#define EDAC_MAX_LABELS 8 + +/* String used to join two or more labels */ +#define OTHER_LABEL " or " /** * enum dev_type - describe the type of memory DRAM chips used at the stick @@ -101,8 +109,24 @@ enum hw_event_mc_err_type { HW_EVENT_ERR_CORRECTED, HW_EVENT_ERR_UNCORRECTED, HW_EVENT_ERR_FATAL, + HW_EVENT_ERR_INFO, }; +static inline char *mc_event_error_type(const unsigned int err_type) +{ + switch (err_type) { + case HW_EVENT_ERR_CORRECTED: + return "Corrected"; + case HW_EVENT_ERR_UNCORRECTED: + return "Uncorrected"; + case HW_EVENT_ERR_FATAL: + return "Fatal"; + default: + case HW_EVENT_ERR_INFO: + return "Info"; + } +} + /** * enum mem_type - memory types. For a more detailed reference, please see * http://en.wikipedia.org/wiki/DRAM @@ -376,6 +400,9 @@ enum scrub_type { * @EDAC_MC_LAYER_CHANNEL: memory layer is named "channel" * @EDAC_MC_LAYER_SLOT: memory layer is named "slot" * @EDAC_MC_LAYER_CHIP_SELECT: memory layer is named "chip select" + * @EDAC_MC_LAYER_ALL_MEM: memory layout is unknown. All memory is mapped + * as a single memory area. This is used when + * retrieving errors from a firmware driven driver. * * This enum is used by the drivers to tell edac_mc_sysfs what name should * be used when describing a memory stick location. @@ -385,6 +412,7 @@ enum edac_mc_layer_type { EDAC_MC_LAYER_CHANNEL, EDAC_MC_LAYER_SLOT, EDAC_MC_LAYER_CHIP_SELECT, + EDAC_MC_LAYER_ALL_MEM, }; /** @@ -551,6 +579,46 @@ struct errcount_attribute_data { int layer0, layer1, layer2; }; +/** + * edac_raw_error_desc - Raw error report structure + * @grain: minimum granularity for an error report, in bytes + * @error_count: number of errors of the same type + * @top_layer: top layer of the error (layer[0]) + * @mid_layer: middle layer of the error (layer[1]) + * @low_layer: low layer of the error (layer[2]) + * @page_frame_number: page where the error happened + * @offset_in_page: page offset + * @syndrome: syndrome of the error (or 0 if unknown or if + * the syndrome is not applicable) + * @msg: error message + * @location: location of the error + * @label: label of the affected DIMM(s) + * @other_detail: other driver-specific detail about the error + * @enable_per_layer_report: if false, the error affects all layers + * (typically, a memory controller error) + */ +struct edac_raw_error_desc { + /* + * NOTE: everything before grain won't be cleaned by + * edac_raw_error_desc_clean() + */ + char location[LOCATION_SIZE]; + char label[(EDAC_MC_LABEL_LEN + 1 + sizeof(OTHER_LABEL)) * EDAC_MAX_LABELS]; + long grain; + + /* the vars below and grain will be cleaned on every new error report */ + u16 error_count; + int top_layer; + int mid_layer; + int low_layer; + unsigned long page_frame_number; + unsigned long offset_in_page; + unsigned long syndrome; + const char *msg; + const char *other_detail; + bool enable_per_layer_report; +}; + /* MEMORY controller information structure */ struct mem_ctl_info { @@ -630,7 +698,6 @@ struct mem_ctl_info { const char *mod_ver; const char *ctl_name; const char *dev_name; - char proc_name[MC_PROC_NAME_MAX_LEN + 1]; void *pvt_info; unsigned long start_time; /* mci load start time (in jiffies) */ @@ -659,6 +726,12 @@ struct mem_ctl_info { /* work struct for this MC */ struct delayed_work work; + /* + * Used to report an error - by being at the global struct + * makes the memory allocated by the EDAC core + */ + struct edac_raw_error_desc error_desc; + /* the internal state of this controller instance */ int op_state; diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 31717bd..f11c1c2 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -2802,6 +2802,7 @@ #define PCI_DEVICE_ID_INTEL_JAKETOWN_UBOX 0x3ce0 #define PCI_DEVICE_ID_INTEL_IOAT_SNB 0x402f #define PCI_DEVICE_ID_INTEL_5100_16 0x65f0 +#define PCI_DEVICE_ID_INTEL_5100_19 0x65f3 #define PCI_DEVICE_ID_INTEL_5100_21 0x65f5 #define PCI_DEVICE_ID_INTEL_5100_22 0x65f6 #define PCI_DEVICE_ID_INTEL_5400_ERR 0x4030 diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h index 260470e..21cdb0b 100644 --- a/include/ras/ras_event.h +++ b/include/ras/ras_event.h @@ -78,9 +78,7 @@ TRACE_EVENT(mc_event, TP_printk("%d %s error%s:%s%s on %s (mc:%d location:%d:%d:%d address:0x%08lx grain:%d syndrome:0x%08lx%s%s)", __entry->error_count, - (__entry->error_type == HW_EVENT_ERR_CORRECTED) ? "Corrected" : - ((__entry->error_type == HW_EVENT_ERR_FATAL) ? - "Fatal" : "Uncorrected"), + mc_event_error_type(__entry->error_type), __entry->error_count > 1 ? "s" : "", ((char *)__get_str(msg))[0] ? " " : "", __get_str(msg), |