From addccbb264e5e0e5762f4893f6df24afad327c8c Mon Sep 17 00:00:00 2001
From: "Chen, Gong" <gong.chen@linux.intel.com>
Date: Mon, 25 Nov 2013 02:15:00 -0500
Subject: ACPI, APEI, GHES: Do not report only correctable errors with SCI

Currently SCI is employed to handle corrected errors - memory corrected
errors, more specifically but in fact SCI still can be used to handle
any errors, e.g. uncorrected or even fatal ones if enabled by the BIOS.
Enable logging for those kinds of errors too.

Signed-off-by: Chen, Gong <gong.chen@linux.intel.com>
Acked-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
Cc: Tony Luck <tony.luck@intel.com>
Link: http://lkml.kernel.org/r/1385363701-12387-1-git-send-email-gong.chen@linux.intel.com
[ Boris: massage commit message, rename function arg. ]
Signed-off-by: Borislav Petkov <bp@suse.de>

diff --git a/arch/x86/kernel/cpu/mcheck/mce-apei.c b/arch/x86/kernel/cpu/mcheck/mce-apei.c
index de8b60a..a1aef95 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-apei.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-apei.c
@@ -33,22 +33,28 @@
 #include <linux/acpi.h>
 #include <linux/cper.h>
 #include <acpi/apei.h>
+#include <acpi/ghes.h>
 #include <asm/mce.h>
 
 #include "mce-internal.h"
 
-void apei_mce_report_mem_error(int corrected, struct cper_sec_mem_err *mem_err)
+void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err)
 {
 	struct mce m;
 
-	/* Only corrected MC is reported */
-	if (!corrected || !(mem_err->validation_bits & CPER_MEM_VALID_PA))
+	if (!(mem_err->validation_bits & CPER_MEM_VALID_PA))
 		return;
 
 	mce_setup(&m);
 	m.bank = 1;
-	/* Fake a memory read corrected error with unknown channel */
+	/* Fake a memory read error with unknown channel */
 	m.status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | 0x9f;
+
+	if (severity >= GHES_SEV_RECOVERABLE)
+		m.status |= MCI_STATUS_UC;
+	if (severity >= GHES_SEV_PANIC)
+		m.status |= MCI_STATUS_PCC;
+
 	m.addr = mem_err->physical_addr;
 	mce_log(&m);
 	mce_notify_irq();
diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index a30bc31..ce3683d 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -453,8 +453,7 @@ static void ghes_do_proc(struct ghes *ghes,
 			ghes_edac_report_mem_error(ghes, sev, mem_err);
 
 #ifdef CONFIG_X86_MCE
-			apei_mce_report_mem_error(sev == GHES_SEV_CORRECTED,
-						  mem_err);
+			apei_mce_report_mem_error(sev, mem_err);
 #endif
 			ghes_handle_memory_failure(gdata, sev);
 		}
-- 
cgit v0.10.2


From d3ab3edc029bf79b09f91d6a22881c24ecaeb000 Mon Sep 17 00:00:00 2001
From: "Chen, Gong" <gong.chen@linux.intel.com>
Date: Wed, 18 Dec 2013 01:30:49 -0500
Subject: ACPI, APEI: Cleanup alignment-aware accesses

We do use memcpy to avoid access alignment issues between firmware and
OS. Now we can use a better and standard way to avoid this issue. While
at it, simplify some variable names to avoid the 80 cols limit and
use structure assignment instead of unnecessary memcpy. No functional
changes.

Because ERST record id cache is implemented in memory to increase the
access speed via caching ERST content we can refrain from using memcpy
there too and use regular assignment instead.

Signed-off-by: Chen, Gong <gong.chen@linux.intel.com>
Cc: Cc: Tony Luck <tony.luck@intel.com>
Link: http://lkml.kernel.org/r/1387348249-20014-1-git-send-email-gong.chen@linux.intel.com
[ Boris: massage commit message a bit. ]
Signed-off-by: Borislav Petkov <bp@suse.de>

diff --git a/drivers/acpi/apei/apei-base.c b/drivers/acpi/apei/apei-base.c
index 6d2c49b..e55584a 100644
--- a/drivers/acpi/apei/apei-base.c
+++ b/drivers/acpi/apei/apei-base.c
@@ -41,6 +41,7 @@
 #include <linux/rculist.h>
 #include <linux/interrupt.h>
 #include <linux/debugfs.h>
+#include <asm/unaligned.h>
 
 #include "apei-internal.h"
 
@@ -567,8 +568,7 @@ static int apei_check_gar(struct acpi_generic_address *reg, u64 *paddr,
 	bit_offset = reg->bit_offset;
 	access_size_code = reg->access_width;
 	space_id = reg->space_id;
-	/* Handle possible alignment issues */
-	memcpy(paddr, &reg->address, sizeof(*paddr));
+	*paddr = get_unaligned(&reg->address);
 	if (!*paddr) {
 		pr_warning(FW_BUG APEI_PFX
 			   "Invalid physical address in GAR [0x%llx/%u/%u/%u/%u]\n",
diff --git a/drivers/acpi/apei/einj.c b/drivers/acpi/apei/einj.c
index fb57d03..361177a 100644
--- a/drivers/acpi/apei/einj.c
+++ b/drivers/acpi/apei/einj.c
@@ -34,6 +34,7 @@
 #include <linux/delay.h>
 #include <linux/mm.h>
 #include <acpi/acpi.h>
+#include <asm/unaligned.h>
 
 #include "apei-internal.h"
 
@@ -216,7 +217,7 @@ static void check_vendor_extension(u64 paddr,
 static void *einj_get_parameter_address(void)
 {
 	int i;
-	u64 paddrv4 = 0, paddrv5 = 0;
+	u64 pa_v4 = 0, pa_v5 = 0;
 	struct acpi_whea_header *entry;
 
 	entry = EINJ_TAB_ENTRY(einj_tab);
@@ -225,30 +226,28 @@ static void *einj_get_parameter_address(void)
 		    entry->instruction == ACPI_EINJ_WRITE_REGISTER &&
 		    entry->register_region.space_id ==
 		    ACPI_ADR_SPACE_SYSTEM_MEMORY)
-			memcpy(&paddrv4, &entry->register_region.address,
-			       sizeof(paddrv4));
+			pa_v4 = get_unaligned(&entry->register_region.address);
 		if (entry->action == ACPI_EINJ_SET_ERROR_TYPE_WITH_ADDRESS &&
 		    entry->instruction == ACPI_EINJ_WRITE_REGISTER &&
 		    entry->register_region.space_id ==
 		    ACPI_ADR_SPACE_SYSTEM_MEMORY)
-			memcpy(&paddrv5, &entry->register_region.address,
-			       sizeof(paddrv5));
+			pa_v5 = get_unaligned(&entry->register_region.address);
 		entry++;
 	}
-	if (paddrv5) {
+	if (pa_v5) {
 		struct set_error_type_with_address *v5param;
 
-		v5param = acpi_os_map_memory(paddrv5, sizeof(*v5param));
+		v5param = acpi_os_map_memory(pa_v5, sizeof(*v5param));
 		if (v5param) {
 			acpi5 = 1;
-			check_vendor_extension(paddrv5, v5param);
+			check_vendor_extension(pa_v5, v5param);
 			return v5param;
 		}
 	}
-	if (param_extension && paddrv4) {
+	if (param_extension && pa_v4) {
 		struct einj_parameter *v4param;
 
-		v4param = acpi_os_map_memory(paddrv4, sizeof(*v4param));
+		v4param = acpi_os_map_memory(pa_v4, sizeof(*v4param));
 		if (!v4param)
 			return NULL;
 		if (v4param->reserved1 || v4param->reserved2) {
diff --git a/drivers/acpi/apei/erst.c b/drivers/acpi/apei/erst.c
index 26311f2..bf30a12 100644
--- a/drivers/acpi/apei/erst.c
+++ b/drivers/acpi/apei/erst.c
@@ -611,7 +611,7 @@ static void __erst_record_id_cache_compact(void)
 		if (entries[i] == APEI_ERST_INVALID_RECORD_ID)
 			continue;
 		if (wpos != i)
-			memcpy(&entries[wpos], &entries[i], sizeof(entries[i]));
+			entries[wpos] = entries[i];
 		wpos++;
 	}
 	erst_record_id_cache.len = wpos;
-- 
cgit v0.10.2


From ca104edc17841da87850b20ab77e57fe0a99ead6 Mon Sep 17 00:00:00 2001
From: "Chen, Gong" <gong.chen@linux.intel.com>
Date: Mon, 25 Nov 2013 02:15:01 -0500
Subject: ACPI, APEI, GHES: Cleanup ghes memory error handling

Cleanup the logic in ghes_handle_memory_failure(). While at it, add
proper PFN validity check for UC error and cleanup the code logic to
make it simpler and cleaner.

Signed-off-by: Chen, Gong <gong.chen@linux.intel.com>
Acked-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
Link: http://lkml.kernel.org/r/1385363701-12387-2-git-send-email-gong.chen@linux.intel.com
[ Boris: massage commit message. ]
Signed-off-by: Borislav Petkov <bp@suse.de>

diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index ce3683d..46766ef 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -413,27 +413,31 @@ static void ghes_handle_memory_failure(struct acpi_generic_data *gdata, int sev)
 {
 #ifdef CONFIG_ACPI_APEI_MEMORY_FAILURE
 	unsigned long pfn;
+	int flags = -1;
 	int sec_sev = ghes_severity(gdata->error_severity);
 	struct cper_sec_mem_err *mem_err;
 	mem_err = (struct cper_sec_mem_err *)(gdata + 1);
 
-	if (sec_sev == GHES_SEV_CORRECTED &&
-	    (gdata->flags & CPER_SEC_ERROR_THRESHOLD_EXCEEDED) &&
-	    (mem_err->validation_bits & CPER_MEM_VALID_PA)) {
-		pfn = mem_err->physical_addr >> PAGE_SHIFT;
-		if (pfn_valid(pfn))
-			memory_failure_queue(pfn, 0, MF_SOFT_OFFLINE);
-		else if (printk_ratelimit())
-			pr_warn(FW_WARN GHES_PFX
-			"Invalid address in generic error data: %#llx\n",
-			mem_err->physical_addr);
-	}
-	if (sev == GHES_SEV_RECOVERABLE &&
-	    sec_sev == GHES_SEV_RECOVERABLE &&
-	    mem_err->validation_bits & CPER_MEM_VALID_PA) {
-		pfn = mem_err->physical_addr >> PAGE_SHIFT;
-		memory_failure_queue(pfn, 0, 0);
+	if (!(mem_err->validation_bits & CPER_MEM_VALID_PA))
+		return;
+
+	pfn = mem_err->physical_addr >> PAGE_SHIFT;
+	if (!pfn_valid(pfn)) {
+		pr_warn_ratelimited(FW_WARN GHES_PFX
+		"Invalid address in generic error data: %#llx\n",
+		mem_err->physical_addr);
+		return;
 	}
+
+	/* iff following two events can be handled properly by now */
+	if (sec_sev == GHES_SEV_CORRECTED &&
+	    (gdata->flags & CPER_SEC_ERROR_THRESHOLD_EXCEEDED))
+		flags = MF_SOFT_OFFLINE;
+	if (sev == GHES_SEV_RECOVERABLE && sec_sev == GHES_SEV_RECOVERABLE)
+		flags = 0;
+
+	if (flags != -1)
+		memory_failure_queue(pfn, 0, flags);
 #endif
 }
 
-- 
cgit v0.10.2


From 4f75d8412792777a314ac5c1393a9ed43d695fd1 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@suse.de>
Date: Mon, 23 Dec 2013 18:05:02 +0100
Subject: x86, mce: Fix mce_start_timer semantics

So mce_start_timer() has a 'cpu' argument which is supposed to mean to
start a timer on that cpu. However, the code currently starts a timer on
the *current* cpu the function runs on and causes the sanity-check in
mce_timer_fn to fire:

	WARNING: CPU: 0 PID: 0 at arch/x86/kernel/cpu/mcheck/mce.c:1286 mce_timer_fn

because it is running on the wrong cpu.

This was triggered by Prarit Bhargava <prarit@redhat.com> by offlining
all the cpus in succession.

Then, we were fiddling with the CMCI storm settings when starting the
timer whereas there's no need for that - if there's storm happening
on this newly restarted cpu, we're going to be in normal CMCI mode
initially and then when the CMCI interrupt starts firing, we're going to
go to the polling mode with the timer real soon.

Signed-off-by: Borislav Petkov <bp@suse.de>
Tested-by: Prarit Bhargava <prarit@redhat.com>
Cc: Tony Luck <tony.luck@intel.com>
Reviewed-by: Chen, Gong <gong.chen@linux.intel.com>
Link: http://lkml.kernel.org/r/1387722156-5511-1-git-send-email-prarit@redhat.com

diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index a389c1d..4d5419b 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -1638,15 +1638,15 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
 
 static void mce_start_timer(unsigned int cpu, struct timer_list *t)
 {
-	unsigned long iv = mce_adjust_timer(check_interval * HZ);
-
-	__this_cpu_write(mce_next_interval, iv);
+	unsigned long iv = check_interval * HZ;
 
 	if (mca_cfg.ignore_ce || !iv)
 		return;
 
+	per_cpu(mce_next_interval, cpu) = iv;
+
 	t->expires = round_jiffies(jiffies + iv);
-	add_timer_on(t, smp_processor_id());
+	add_timer_on(t, cpu);
 }
 
 static void __mcheck_cpu_init_timer(void)
-- 
cgit v0.10.2