From 539da7877275edb21a76aa02fb2c147eff02c559 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 4 Nov 2015 22:57:00 +0000 Subject: x86/apic: Add a single-target IPI function to the apic We still fall back on the "send mask" versions if an apic definition doesn't have the single-target version, but at least this allows the (trivial) case for the common clustered x2apic case. Signed-off-by: Linus Torvalds Reviewed-by: Ingo Molnar Cc: Borislav Petkov Cc: Peter Zijlstra Cc: Mike Travis Cc: Daniel J Blueman Link: http://lkml.kernel.org/r/20151104220848.737120838@linutronix.de Signed-off-by: Thomas Gleixner diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index a30316b..7f62ad4 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -303,6 +303,7 @@ struct apic { unsigned int *apicid); /* ipi */ + void (*send_IPI)(int cpu, int vector); void (*send_IPI_mask)(const struct cpumask *mask, int vector); void (*send_IPI_mask_allbutself)(const struct cpumask *mask, int vector); diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 12c8286..1dbf590 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -115,6 +115,18 @@ static atomic_t stopping_cpu = ATOMIC_INIT(-1); static bool smp_no_nmi_ipi = false; /* + * Helper wrapper: not all apic definitions support sending to + * a single CPU, so we fall back to sending to a mask. + */ +static void send_IPI_cpu(int cpu, int vector) +{ + if (apic->send_IPI) + apic->send_IPI(cpu, vector); + else + apic->send_IPI_mask(cpumask_of(cpu), vector); +} + +/* * this function sends a 'reschedule' IPI to another CPU. * it goes straight through and wastes no time serializing * anything. Worst case is that we lose a reschedule ... @@ -125,12 +137,12 @@ static void native_smp_send_reschedule(int cpu) WARN_ON(1); return; } - apic->send_IPI_mask(cpumask_of(cpu), RESCHEDULE_VECTOR); + send_IPI_cpu(cpu, RESCHEDULE_VECTOR); } void native_send_call_func_single_ipi(int cpu) { - apic->send_IPI_mask(cpumask_of(cpu), CALL_FUNCTION_SINGLE_VECTOR); + send_IPI_cpu(cpu, CALL_FUNCTION_SINGLE_VECTOR); } void native_send_call_func_ipi(const struct cpumask *mask) -- cgit v0.10.2 From 7b6ce46cb3d096831dea3accacee4717c66abac8 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 4 Nov 2015 22:57:00 +0000 Subject: x86/apic: Implement single target IPI function for x2apic_cluster [ tglx: Split it out from the patch which provides the new callback ] Signed-off-by: Linus Torvalds Reviewed-by: Ingo Molnar Cc: Borislav Petkov Cc: Peter Zijlstra Cc: Mike Travis Cc: Daniel J Blueman Link: http://lkml.kernel.org/r/20151104220848.817975597@linutronix.de Signed-off-by: Thomas Gleixner diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index cc8311c..aca8b75 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -23,6 +23,14 @@ static inline u32 x2apic_cluster(int cpu) return per_cpu(x86_cpu_to_logical_apicid, cpu) >> 16; } +static void x2apic_send_IPI(int cpu, int vector) +{ + u32 dest = per_cpu(x86_cpu_to_logical_apicid, cpu); + + x2apic_wrmsr_fence(); + __x2apic_send_IPI_dest(dest, vector, APIC_DEST_LOGICAL); +} + static void __x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest) { @@ -266,6 +274,7 @@ static struct apic apic_x2apic_cluster = { .cpu_mask_to_apicid_and = x2apic_cpu_mask_to_apicid_and, + .send_IPI = x2apic_send_IPI, .send_IPI_mask = x2apic_send_IPI_mask, .send_IPI_mask_allbutself = x2apic_send_IPI_mask_allbutself, .send_IPI_allbutself = x2apic_send_IPI_allbutself, -- cgit v0.10.2 From 53be0fac8bdaeec87e0df7d0334345421d2be187 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 4 Nov 2015 22:57:01 +0000 Subject: x86/apic: Implement default single target IPI function apic_physflat and bigsmp_apic can share that implementation. Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Cc: Linus Torvalds Cc: Borislav Petkov Cc: Peter Zijlstra Cc: Mike Travis Cc: Daniel J Blueman Link: http://lkml.kernel.org/r/20151104220848.898543767@linutronix.de diff --git a/arch/x86/include/asm/ipi.h b/arch/x86/include/asm/ipi.h index 615fa90..22998a8 100644 --- a/arch/x86/include/asm/ipi.h +++ b/arch/x86/include/asm/ipi.h @@ -119,6 +119,7 @@ static inline void native_apic_mem_write(APIC_ICR, cfg); } +extern void default_send_IPI_single_phys(int cpu, int vector); extern void default_send_IPI_mask_sequence_phys(const struct cpumask *mask, int vector); extern void default_send_IPI_mask_allbutself_phys(const struct cpumask *mask, diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c index 6207156..4fcffbf 100644 --- a/arch/x86/kernel/apic/ipi.c +++ b/arch/x86/kernel/apic/ipi.c @@ -18,6 +18,16 @@ #include #include +void default_send_IPI_single_phys(int cpu, int vector) +{ + unsigned long flags; + + local_irq_save(flags); + __default_send_IPI_dest_field(per_cpu(x86_cpu_to_apicid, cpu), + vector, APIC_DEST_PHYSICAL); + local_irq_restore(flags); +} + void default_send_IPI_mask_sequence_phys(const struct cpumask *mask, int vector) { unsigned long query_cpu; -- cgit v0.10.2 From 449112f4f35074f1dc70d4f0e769cb14150c159c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 4 Nov 2015 22:57:02 +0000 Subject: x86/apic: Remove pointless indirections from apic_physflat No value in having 32 byte extra text. Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Cc: Linus Torvalds Cc: Borislav Petkov Cc: Peter Zijlstra Cc: Mike Travis Cc: Daniel J Blueman Link: http://lkml.kernel.org/r/20151104220848.975653382@linutronix.de diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index f92ab36..6d3e1a6 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -230,17 +230,6 @@ static int physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id) return 0; } -static void physflat_send_IPI_mask(const struct cpumask *cpumask, int vector) -{ - default_send_IPI_mask_sequence_phys(cpumask, vector); -} - -static void physflat_send_IPI_mask_allbutself(const struct cpumask *cpumask, - int vector) -{ - default_send_IPI_mask_allbutself_phys(cpumask, vector); -} - static void physflat_send_IPI_allbutself(int vector) { default_send_IPI_mask_allbutself_phys(cpu_online_mask, vector); @@ -248,7 +237,7 @@ static void physflat_send_IPI_allbutself(int vector) static void physflat_send_IPI_all(int vector) { - physflat_send_IPI_mask(cpu_online_mask, vector); + default_send_IPI_mask_sequence_phys(cpu_online_mask, vector); } static int physflat_probe(void) @@ -292,8 +281,8 @@ static struct apic apic_physflat = { .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, - .send_IPI_mask = physflat_send_IPI_mask, - .send_IPI_mask_allbutself = physflat_send_IPI_mask_allbutself, + .send_IPI_mask = default_send_IPI_mask_sequence_phys, + .send_IPI_mask_allbutself = default_send_IPI_mask_allbutself_phys, .send_IPI_allbutself = physflat_send_IPI_allbutself, .send_IPI_all = physflat_send_IPI_all, .send_IPI_self = apic_send_IPI_self, -- cgit v0.10.2 From 68cd88ff8df97846eb07080f17264a4de50cb012 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 4 Nov 2015 22:57:02 +0000 Subject: x86/apic: Wire up single IPI for apic_physflat Use the default implementation. Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Cc: Linus Torvalds Cc: Borislav Petkov Cc: Peter Zijlstra Cc: Mike Travis Cc: Daniel J Blueman Link: http://lkml.kernel.org/r/20151104220849.055046864@linutronix.de diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index 6d3e1a6..9de25d4 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -281,6 +281,7 @@ static struct apic apic_physflat = { .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, + .send_IPI = default_send_IPI_single_phys, .send_IPI_mask = default_send_IPI_mask_sequence_phys, .send_IPI_mask_allbutself = default_send_IPI_mask_allbutself_phys, .send_IPI_allbutself = physflat_send_IPI_allbutself, -- cgit v0.10.2 From 500bd02fb17e5d9296c77ccc07db61fd5d4922a4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 4 Nov 2015 22:57:03 +0000 Subject: x86/apic: Remove pointless indirections from bigsmp_apic Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Cc: Linus Torvalds Cc: Borislav Petkov Cc: Peter Zijlstra Cc: Mike Travis Cc: Daniel J Blueman Link: http://lkml.kernel.org/r/20151104220849.133086575@linutronix.de diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index 971cf88..d4d103b 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -96,11 +96,6 @@ static int bigsmp_phys_pkg_id(int cpuid_apic, int index_msb) return cpuid_apic >> index_msb; } -static inline void bigsmp_send_IPI_mask(const struct cpumask *mask, int vector) -{ - default_send_IPI_mask_sequence_phys(mask, vector); -} - static void bigsmp_send_IPI_allbutself(int vector) { default_send_IPI_mask_allbutself_phys(cpu_online_mask, vector); @@ -108,7 +103,7 @@ static void bigsmp_send_IPI_allbutself(int vector) static void bigsmp_send_IPI_all(int vector) { - bigsmp_send_IPI_mask(cpu_online_mask, vector); + default_send_IPI_mask_sequence_phys(cpu_online_mask, vector); } static int dmi_bigsmp; /* can be set by dmi scanners */ @@ -180,7 +175,7 @@ static struct apic apic_bigsmp = { .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, - .send_IPI_mask = bigsmp_send_IPI_mask, + .send_IPI_mask = default_send_IPI_mask_sequence_phys, .send_IPI_mask_allbutself = NULL, .send_IPI_allbutself = bigsmp_send_IPI_allbutself, .send_IPI_all = bigsmp_send_IPI_all, -- cgit v0.10.2 From 5789a12e28f7bf6a37564a5fc9ebc60dc86659b5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 4 Nov 2015 22:57:04 +0000 Subject: x86/apic: Wire up single IPI for bigsmp_apic Use the default implementation. Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Cc: Linus Torvalds Cc: Borislav Petkov Cc: Peter Zijlstra Cc: Mike Travis Cc: Daniel J Blueman Link: http://lkml.kernel.org/r/20151104220849.213292642@linutronix.de diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index d4d103b..cf9bd89 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -175,6 +175,7 @@ static struct apic apic_bigsmp = { .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, + .send_IPI = default_send_IPI_single_phys, .send_IPI_mask = default_send_IPI_mask_sequence_phys, .send_IPI_mask_allbutself = NULL, .send_IPI_allbutself = bigsmp_send_IPI_allbutself, -- cgit v0.10.2 From f2bffe8a3eef42a1cd3393d56acd9fe598d2119c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 4 Nov 2015 22:57:04 +0000 Subject: x86/apic: Implement single IPI for x2apic_phys Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Cc: Linus Torvalds Cc: Borislav Petkov Cc: Peter Zijlstra Cc: Mike Travis Cc: Daniel J Blueman Link: http://lkml.kernel.org/r/20151104220849.296438009@linutronix.de diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index 662e915..a1242e2 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -36,6 +36,14 @@ static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) return x2apic_enabled() && (x2apic_phys || x2apic_fadt_phys()); } +static void x2apic_send_IPI(int cpu, int vector) +{ + u32 dest = per_cpu(x86_cpu_to_apicid, cpu); + + x2apic_wrmsr_fence(); + __x2apic_send_IPI_dest(dest, vector, APIC_DEST_PHYSICAL); +} + static void __x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest) { @@ -122,6 +130,7 @@ static struct apic apic_x2apic_phys = { .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, + .send_IPI = x2apic_send_IPI, .send_IPI_mask = x2apic_send_IPI_mask, .send_IPI_mask_allbutself = x2apic_send_IPI_mask_allbutself, .send_IPI_allbutself = x2apic_send_IPI_allbutself, -- cgit v0.10.2 From 8642ea953d99fc037c1076e9a8b3a822025fb251 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 4 Nov 2015 22:57:05 +0000 Subject: x86/apic: Wire up single IPI for x2apic_uv The function already exists. Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Cc: Linus Torvalds Cc: Borislav Petkov Cc: Peter Zijlstra Cc: Mike Travis Cc: Daniel J Blueman Link: http://lkml.kernel.org/r/20151104220849.376775625@linutronix.de diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 4a13946..d760c6b 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -406,6 +406,7 @@ static struct apic __refdata apic_x2apic_uv_x = { .cpu_mask_to_apicid_and = uv_cpu_mask_to_apicid_and, + .send_IPI = uv_send_IPI_one, .send_IPI_mask = uv_send_IPI_mask, .send_IPI_mask_allbutself = uv_send_IPI_mask_allbutself, .send_IPI_allbutself = uv_send_IPI_allbutself, -- cgit v0.10.2 From c61a0d31ba0ce75cb1b88bb4eb2f41a1b80bc90f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 4 Nov 2015 22:57:06 +0000 Subject: x86/apic: Wire up single IPI for apic_numachip The function already exists. Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Cc: Linus Torvalds Cc: Borislav Petkov Cc: Peter Zijlstra Cc: Mike Travis Cc: Daniel J Blueman Link: http://lkml.kernel.org/r/20151104220849.551445489@linutronix.de diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index 38dd5ef..69329a6 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -276,6 +276,7 @@ static const struct apic apic_numachip1 __refconst = { .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, + .send_IPI = numachip_send_IPI_one, .send_IPI_mask = numachip_send_IPI_mask, .send_IPI_mask_allbutself = numachip_send_IPI_mask_allbutself, .send_IPI_allbutself = numachip_send_IPI_allbutself, @@ -327,6 +328,7 @@ static const struct apic apic_numachip2 __refconst = { .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, + .send_IPI = numachip_send_IPI_one, .send_IPI_mask = numachip_send_IPI_mask, .send_IPI_mask_allbutself = numachip_send_IPI_mask_allbutself, .send_IPI_allbutself = numachip_send_IPI_allbutself, -- cgit v0.10.2 From 4727da2eb1ec79fdc2acdd2f764b5b2aacab998c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 4 Nov 2015 22:57:06 +0000 Subject: x86/apic: Implement single IPI for apic_noop Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Cc: Linus Torvalds Cc: Borislav Petkov Cc: Peter Zijlstra Cc: Mike Travis Cc: Daniel J Blueman Link: http://lkml.kernel.org/r/20151104220849.455429817@linutronix.de diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index 0d96749..331a7a0 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -30,6 +30,7 @@ #include static void noop_init_apic_ldr(void) { } +static void noop_send_IPI(int cpu, int vector) { } static void noop_send_IPI_mask(const struct cpumask *cpumask, int vector) { } static void noop_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector) { } static void noop_send_IPI_allbutself(int vector) { } @@ -144,6 +145,7 @@ struct apic apic_noop = { .cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and, + .send_IPI = noop_send_IPI, .send_IPI_mask = noop_send_IPI_mask, .send_IPI_mask_allbutself = noop_send_IPI_mask_allbutself, .send_IPI_allbutself = noop_send_IPI_allbutself, -- cgit v0.10.2 From 7e29393b20a1a863a5f9bf48dc71e5cff4035ff5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 4 Nov 2015 22:57:07 +0000 Subject: x86/apic: Provide default send single IPI wrapper Instead of doing the wrapping in the smp code we can provide a default wrapper for those APICs which insist on cpumasks. Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Cc: Linus Torvalds Cc: Borislav Petkov Cc: Peter Zijlstra Cc: Mike Travis Cc: Daniel J Blueman Link: http://lkml.kernel.org/r/20151104220849.631111846@linutronix.de diff --git a/arch/x86/include/asm/ipi.h b/arch/x86/include/asm/ipi.h index 22998a8..cfc9a0d 100644 --- a/arch/x86/include/asm/ipi.h +++ b/arch/x86/include/asm/ipi.h @@ -119,6 +119,7 @@ static inline void native_apic_mem_write(APIC_ICR, cfg); } +extern void default_send_IPI_single(int cpu, int vector); extern void default_send_IPI_single_phys(int cpu, int vector); extern void default_send_IPI_mask_sequence_phys(const struct cpumask *mask, int vector); diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c index 4fcffbf..eb45fc9 100644 --- a/arch/x86/kernel/apic/ipi.c +++ b/arch/x86/kernel/apic/ipi.c @@ -65,6 +65,14 @@ void default_send_IPI_mask_allbutself_phys(const struct cpumask *mask, local_irq_restore(flags); } +/* + * Helper function for APICs which insist on cpumasks + */ +void default_send_IPI_single(int cpu, int vector) +{ + apic->send_IPI_mask(cpumask_of(cpu), vector); +} + #ifdef CONFIG_X86_32 void default_send_IPI_mask_sequence_logical(const struct cpumask *mask, -- cgit v0.10.2 From 6153058a03f4cc5200b0b29e201caa11779ebca0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 4 Nov 2015 22:57:08 +0000 Subject: x86/apic: Use default send single IPI wrapper Wire up the default_send_IPI_single() wrapper to the last holdouts. Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Cc: Linus Torvalds Cc: Borislav Petkov Cc: Peter Zijlstra Cc: Mike Travis Cc: Daniel J Blueman Link: http://lkml.kernel.org/r/20151104220849.711224890@linutronix.de diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index 9de25d4..9968f30 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -185,6 +185,7 @@ static struct apic apic_flat = { .cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and, + .send_IPI = default_send_IPI_single, .send_IPI_mask = flat_send_IPI_mask, .send_IPI_mask_allbutself = flat_send_IPI_mask_allbutself, .send_IPI_allbutself = flat_send_IPI_allbutself, diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index 7694ae6..f316e34 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -105,6 +105,7 @@ static struct apic apic_default = { .cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and, + .send_IPI = default_send_IPI_single, .send_IPI_mask = default_send_IPI_mask_logical, .send_IPI_mask_allbutself = default_send_IPI_mask_allbutself_logical, .send_IPI_allbutself = default_send_IPI_allbutself, -- cgit v0.10.2 From 72613184a1f076659e8a902d64351f50d3f9c990 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 4 Nov 2015 22:57:09 +0000 Subject: x86/smp: Remove single IPI wrapper All APIC implementation have send_IPI now. Remove the conditional in the calling code. Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Cc: Linus Torvalds Cc: Borislav Petkov Cc: Peter Zijlstra Cc: Mike Travis Cc: Daniel J Blueman Link: http://lkml.kernel.org/r/20151104220849.807817097@linutronix.de diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 1dbf590..658777c 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -115,18 +115,6 @@ static atomic_t stopping_cpu = ATOMIC_INIT(-1); static bool smp_no_nmi_ipi = false; /* - * Helper wrapper: not all apic definitions support sending to - * a single CPU, so we fall back to sending to a mask. - */ -static void send_IPI_cpu(int cpu, int vector) -{ - if (apic->send_IPI) - apic->send_IPI(cpu, vector); - else - apic->send_IPI_mask(cpumask_of(cpu), vector); -} - -/* * this function sends a 'reschedule' IPI to another CPU. * it goes straight through and wastes no time serializing * anything. Worst case is that we lose a reschedule ... @@ -137,12 +125,12 @@ static void native_smp_send_reschedule(int cpu) WARN_ON(1); return; } - send_IPI_cpu(cpu, RESCHEDULE_VECTOR); + apic->send_IPI(cpu, RESCHEDULE_VECTOR); } void native_send_call_func_single_ipi(int cpu) { - send_IPI_cpu(cpu, CALL_FUNCTION_SINGLE_VECTOR); + apic->send_IPI(cpu, CALL_FUNCTION_SINGLE_VECTOR); } void native_send_call_func_ipi(const struct cpumask *mask) -- cgit v0.10.2 From 2fde46b79e2fdbc90d0d97cf992782732b5a371c Mon Sep 17 00:00:00 2001 From: Len Brown Date: Sun, 22 Nov 2015 18:16:15 -0500 Subject: x86/smpboot: Re-enable init_udelay=0 by default on modern CPUs Fix a Linux-4.3 corner case performance regression, introduced by commit: f1ccd249319e ("x86/smpboot: Fix cpu_init_udelay=10000 corner case boot parameter misbehavior") which allowed the cmdline "cpu_init_udelay=" to work with all values, including the default of 10000. But in setting the default of 10000, it over-rode the code stat sets the delay 0 on modern processors. Also, tidy up use of INT/UINT. Reported-by: Shane Signed-off-by: Len Brown Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dparsons@brightdsl.net Link: http://lkml.kernel.org/r/9082eb809ef40dad02db714759c7aaf618c518d4.1448232494.git.len.brown@intel.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 892ee2e5..fbabe4f 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -509,7 +509,7 @@ void __inquire_remote_apic(int apicid) */ #define UDELAY_10MS_DEFAULT 10000 -static unsigned int init_udelay = INT_MAX; +static unsigned int init_udelay = UINT_MAX; static int __init cpu_init_udelay(char *str) { @@ -522,14 +522,15 @@ early_param("cpu_init_udelay", cpu_init_udelay); static void __init smp_quirk_init_udelay(void) { /* if cmdline changed it from default, leave it alone */ - if (init_udelay != INT_MAX) + if (init_udelay != UINT_MAX) return; /* if modern processor, use no delay */ if (((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && (boot_cpu_data.x86 == 6)) || - ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && (boot_cpu_data.x86 >= 0xF))) + ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && (boot_cpu_data.x86 >= 0xF))) { init_udelay = 0; - + return; + } /* else, use legacy delay */ init_udelay = UDELAY_10MS_DEFAULT; } -- cgit v0.10.2 From 42baa2581c92f8d07e7260506c8d41caf14b0fc3 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Mon, 23 Nov 2015 11:59:24 +0100 Subject: x86/apic: Fix the saving and restoring of lapic vectors during suspend/resume Saving and restoring lapic vectors in lapic_suspend() and lapic_resume() is not consistent: the thmr vector saving is guarded by a different config option than the restore part. The cmci vector isn't handled at all. Those inconsistencies are not very critical, as the missing cmci vector will be set via mce resume handling, the wrong config option used for restoring the thmr vector can't be configured differently than the one which should be used. Nevertheless correct the thmr vector restore and add cmci vector handling. Signed-off-by: Juergen Gross Acked-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1448276364-31334-1-git-send-email-jgross@suse.com [ Minor code edits. ] Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 2f69e3b..8d7df74 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -2270,6 +2270,7 @@ static struct { unsigned int apic_tmict; unsigned int apic_tdcr; unsigned int apic_thmr; + unsigned int apic_cmci; } apic_pm_state; static int lapic_suspend(void) @@ -2299,6 +2300,10 @@ static int lapic_suspend(void) if (maxlvt >= 5) apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); #endif +#ifdef CONFIG_X86_MCE_INTEL + if (maxlvt >= 6) + apic_pm_state.apic_cmci = apic_read(APIC_LVTCMCI); +#endif local_irq_save(flags); disable_local_APIC(); @@ -2355,10 +2360,14 @@ static void lapic_resume(void) apic_write(APIC_SPIV, apic_pm_state.apic_spiv); apic_write(APIC_LVT0, apic_pm_state.apic_lvt0); apic_write(APIC_LVT1, apic_pm_state.apic_lvt1); -#if defined(CONFIG_X86_MCE_INTEL) +#ifdef CONFIG_X86_THERMAL_VECTOR if (maxlvt >= 5) apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr); #endif +#ifdef CONFIG_X86_MCE_INTEL + if (maxlvt >= 6) + apic_write(APIC_LVTCMCI, apic_pm_state.apic_cmci); +#endif if (maxlvt >= 4) apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc); apic_write(APIC_LVTT, apic_pm_state.apic_lvtt); -- cgit v0.10.2 From 1717f2096b543cede7a380c858c765c41936bc35 Mon Sep 17 00:00:00 2001 From: Hidehiro Kawai Date: Mon, 14 Dec 2015 11:19:09 +0100 Subject: panic, x86: Fix re-entrance problem due to panic on NMI If panic on NMI happens just after panic() on the same CPU, panic() is recursively called. Kernel stalls, as a result, after failing to acquire panic_lock. To avoid this problem, don't call panic() in NMI context if we've already entered panic(). For that, introduce nmi_panic() macro to reduce code duplication. In the case of panic on NMI, don't return from NMI handlers if another CPU already panicked. Signed-off-by: Hidehiro Kawai Acked-by: Michal Hocko Cc: Aaron Tomlin Cc: Andrew Morton Cc: Andy Lutomirski Cc: Baoquan He Cc: Chris Metcalf Cc: David Hildenbrand Cc: Don Zickus Cc: "Eric W. Biederman" Cc: Frederic Weisbecker Cc: Gobinda Charan Maji Cc: HATAYAMA Daisuke Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Javi Merino Cc: Jonathan Corbet Cc: kexec@lists.infradead.org Cc: linux-doc@vger.kernel.org Cc: lkml Cc: Masami Hiramatsu Cc: Michal Nazarewicz Cc: Nicolas Iooss Cc: Peter Zijlstra Cc: Prarit Bhargava Cc: Rasmus Villemoes Cc: Rusty Russell Cc: Seth Jennings Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Ulrich Obergfell Cc: Vitaly Kuznetsov Cc: Vivek Goyal Link: http://lkml.kernel.org/r/20151210014626.25437.13302.stgit@softrs [ Cleanup comments, fixup formatting. ] Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 697f90d..fca8793 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -231,7 +231,7 @@ pci_serr_error(unsigned char reason, struct pt_regs *regs) #endif if (panic_on_unrecovered_nmi) - panic("NMI: Not continuing"); + nmi_panic("NMI: Not continuing"); pr_emerg("Dazed and confused, but trying to continue\n"); @@ -255,8 +255,16 @@ io_check_error(unsigned char reason, struct pt_regs *regs) reason, smp_processor_id()); show_regs(regs); - if (panic_on_io_nmi) - panic("NMI IOCK error: Not continuing"); + if (panic_on_io_nmi) { + nmi_panic("NMI IOCK error: Not continuing"); + + /* + * If we end up here, it means we have received an NMI while + * processing panic(). Simply return without delaying and + * re-enabling NMIs. + */ + return; + } /* Re-enable the IOCK line, wait for a few seconds */ reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_IOCHK; @@ -297,7 +305,7 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs) pr_emerg("Do you have a strange power saving mode enabled?\n"); if (unknown_nmi_panic || panic_on_unrecovered_nmi) - panic("NMI: Not continuing"); + nmi_panic("NMI: Not continuing"); pr_emerg("Dazed and confused, but trying to continue\n"); } diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 350dfb0..750cc5c 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -446,6 +446,26 @@ extern int sysctl_panic_on_stackoverflow; extern bool crash_kexec_post_notifiers; /* + * panic_cpu is used for synchronizing panic() and crash_kexec() execution. It + * holds a CPU number which is executing panic() currently. A value of + * PANIC_CPU_INVALID means no CPU has entered panic() or crash_kexec(). + */ +extern atomic_t panic_cpu; +#define PANIC_CPU_INVALID -1 + +/* + * A variant of panic() called from NMI context. We return if we've already + * panicked on this CPU. + */ +#define nmi_panic(fmt, ...) \ +do { \ + int cpu = raw_smp_processor_id(); \ + \ + if (atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, cpu) != cpu) \ + panic(fmt, ##__VA_ARGS__); \ +} while (0) + +/* * Only to be used by arch init code. If the user over-wrote the default * CONFIG_PANIC_TIMEOUT, honor it. */ diff --git a/kernel/panic.c b/kernel/panic.c index 4b150bc..3344524 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -61,6 +61,8 @@ void __weak panic_smp_self_stop(void) cpu_relax(); } +atomic_t panic_cpu = ATOMIC_INIT(PANIC_CPU_INVALID); + /** * panic - halt the system * @fmt: The text string to print @@ -71,17 +73,17 @@ void __weak panic_smp_self_stop(void) */ void panic(const char *fmt, ...) { - static DEFINE_SPINLOCK(panic_lock); static char buf[1024]; va_list args; long i, i_next = 0; int state = 0; + int old_cpu, this_cpu; /* * Disable local interrupts. This will prevent panic_smp_self_stop * from deadlocking the first cpu that invokes the panic, since * there is nothing to prevent an interrupt handler (that runs - * after the panic_lock is acquired) from invoking panic again. + * after setting panic_cpu) from invoking panic() again. */ local_irq_disable(); @@ -94,8 +96,16 @@ void panic(const char *fmt, ...) * multiple parallel invocations of panic, all other CPUs either * stop themself or will wait until they are stopped by the 1st CPU * with smp_send_stop(). + * + * `old_cpu == PANIC_CPU_INVALID' means this is the 1st CPU which + * comes here, so go ahead. + * `old_cpu == this_cpu' means we came from nmi_panic() which sets + * panic_cpu to this CPU. In this case, this is also the 1st CPU. */ - if (!spin_trylock(&panic_lock)) + this_cpu = raw_smp_processor_id(); + old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, this_cpu); + + if (old_cpu != PANIC_CPU_INVALID && old_cpu != this_cpu) panic_smp_self_stop(); console_verbose(); diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 18f34cf..b9be18f 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -351,7 +351,7 @@ static void watchdog_overflow_callback(struct perf_event *event, trigger_allbutself_cpu_backtrace(); if (hardlockup_panic) - panic("Hard LOCKUP"); + nmi_panic("Hard LOCKUP"); __this_cpu_write(hard_watchdog_warn, true); return; -- cgit v0.10.2 From 58c5661f2144c089bbc2e5d87c9ec1dc1d2964fe Mon Sep 17 00:00:00 2001 From: Hidehiro Kawai Date: Mon, 14 Dec 2015 11:19:10 +0100 Subject: panic, x86: Allow CPUs to save registers even if looping in NMI context Currently, kdump_nmi_shootdown_cpus(), a subroutine of crash_kexec(), sends an NMI IPI to CPUs which haven't called panic() to stop them, save their register information and do some cleanups for crash dumping. However, if such a CPU is infinitely looping in NMI context, we fail to save its register information into the crash dump. For example, this can happen when unknown NMIs are broadcast to all CPUs as follows: CPU 0 CPU 1 =========================== ========================== receive an unknown NMI unknown_nmi_error() panic() receive an unknown NMI spin_trylock(&panic_lock) unknown_nmi_error() crash_kexec() panic() spin_trylock(&panic_lock) panic_smp_self_stop() infinite loop kdump_nmi_shootdown_cpus() issue NMI IPI -----------> blocked until IRET infinite loop... Here, since CPU 1 is in NMI context, the second NMI from CPU 0 is blocked until CPU 1 executes IRET. However, CPU 1 never executes IRET, so the NMI is not handled and the callback function to save registers is never called. In practice, this can happen on some servers which broadcast NMIs to all CPUs when the NMI button is pushed. To save registers in this case, we need to: a) Return from NMI handler instead of looping infinitely or b) Call the callback function directly from the infinite loop Inherently, a) is risky because NMI is also used to prevent corrupted data from being propagated to devices. So, we chose b). This patch does the following: 1. Move the infinite looping of CPUs which haven't called panic() in NMI context (actually done by panic_smp_self_stop()) outside of panic() to enable us to refer pt_regs. Please note that panic_smp_self_stop() is still used for normal context. 2. Call a callback of kdump_nmi_shootdown_cpus() directly to save registers and do some cleanups after setting waiting_for_crash_ipi which is used for counting down the number of CPUs which handled the callback Signed-off-by: Hidehiro Kawai Acked-by: Michal Hocko Cc: Aaron Tomlin Cc: Andrew Morton Cc: Andy Lutomirski Cc: Baoquan He Cc: Chris Metcalf Cc: Dave Young Cc: David Hildenbrand Cc: Don Zickus Cc: Eric Biederman Cc: Frederic Weisbecker Cc: Gobinda Charan Maji Cc: HATAYAMA Daisuke Cc: Hidehiro Kawai Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Javi Merino Cc: Jiang Liu Cc: Jonathan Corbet Cc: kexec@lists.infradead.org Cc: linux-doc@vger.kernel.org Cc: lkml Cc: Masami Hiramatsu Cc: Michal Nazarewicz Cc: Nicolas Iooss Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Prarit Bhargava Cc: Rasmus Villemoes Cc: Seth Jennings Cc: Stefan Lippers-Hollmann Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Ulrich Obergfell Cc: Vitaly Kuznetsov Cc: Vivek Goyal Cc: Yasuaki Ishimatsu Link: http://lkml.kernel.org/r/20151210014628.25437.75256.stgit@softrs [ Cleanup comments, fixup formatting. ] Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index fca8793..424aec4 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -231,7 +231,7 @@ pci_serr_error(unsigned char reason, struct pt_regs *regs) #endif if (panic_on_unrecovered_nmi) - nmi_panic("NMI: Not continuing"); + nmi_panic(regs, "NMI: Not continuing"); pr_emerg("Dazed and confused, but trying to continue\n"); @@ -256,7 +256,7 @@ io_check_error(unsigned char reason, struct pt_regs *regs) show_regs(regs); if (panic_on_io_nmi) { - nmi_panic("NMI IOCK error: Not continuing"); + nmi_panic(regs, "NMI IOCK error: Not continuing"); /* * If we end up here, it means we have received an NMI while @@ -305,7 +305,7 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs) pr_emerg("Do you have a strange power saving mode enabled?\n"); if (unknown_nmi_panic || panic_on_unrecovered_nmi) - nmi_panic("NMI: Not continuing"); + nmi_panic(regs, "NMI: Not continuing"); pr_emerg("Dazed and confused, but trying to continue\n"); } diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 02693dd..1da1302 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -718,6 +718,7 @@ static int crashing_cpu; static nmi_shootdown_cb shootdown_callback; static atomic_t waiting_for_crash_ipi; +static int crash_ipi_issued; static int crash_nmi_callback(unsigned int val, struct pt_regs *regs) { @@ -780,6 +781,9 @@ void nmi_shootdown_cpus(nmi_shootdown_cb callback) smp_send_nmi_allbutself(); + /* Kick CPUs looping in NMI context. */ + WRITE_ONCE(crash_ipi_issued, 1); + msecs = 1000; /* Wait at most a second for the other cpus to stop */ while ((atomic_read(&waiting_for_crash_ipi) > 0) && msecs) { mdelay(1); @@ -788,6 +792,22 @@ void nmi_shootdown_cpus(nmi_shootdown_cb callback) /* Leave the nmi callback set */ } + +/* Override the weak function in kernel/panic.c */ +void nmi_panic_self_stop(struct pt_regs *regs) +{ + while (1) { + /* + * Wait for the crash dumping IPI to be issued, and then + * call its callback directly. + */ + if (READ_ONCE(crash_ipi_issued)) + crash_nmi_callback(0, regs); /* Don't return */ + + cpu_relax(); + } +} + #else /* !CONFIG_SMP */ void nmi_shootdown_cpus(nmi_shootdown_cb callback) { diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 750cc5c..7311c32 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -255,6 +255,7 @@ extern long (*panic_blink)(int state); __printf(1, 2) void panic(const char *fmt, ...) __noreturn __cold; +void nmi_panic_self_stop(struct pt_regs *); extern void oops_enter(void); extern void oops_exit(void); void print_oops_end_marker(void); @@ -455,14 +456,21 @@ extern atomic_t panic_cpu; /* * A variant of panic() called from NMI context. We return if we've already - * panicked on this CPU. + * panicked on this CPU. If another CPU already panicked, loop in + * nmi_panic_self_stop() which can provide architecture dependent code such + * as saving register state for crash dump. */ -#define nmi_panic(fmt, ...) \ +#define nmi_panic(regs, fmt, ...) \ do { \ - int cpu = raw_smp_processor_id(); \ + int old_cpu, cpu; \ \ - if (atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, cpu) != cpu) \ + cpu = raw_smp_processor_id(); \ + old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, cpu); \ + \ + if (old_cpu == PANIC_CPU_INVALID) \ panic(fmt, ##__VA_ARGS__); \ + else if (old_cpu != cpu) \ + nmi_panic_self_stop(regs); \ } while (0) /* diff --git a/kernel/panic.c b/kernel/panic.c index 3344524..06f31b49 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -61,6 +61,15 @@ void __weak panic_smp_self_stop(void) cpu_relax(); } +/* + * Stop ourselves in NMI context if another CPU has already panicked. Arch code + * may override this to prepare for crash dumping, e.g. save regs info. + */ +void __weak nmi_panic_self_stop(struct pt_regs *regs) +{ + panic_smp_self_stop(); +} + atomic_t panic_cpu = ATOMIC_INIT(PANIC_CPU_INVALID); /** diff --git a/kernel/watchdog.c b/kernel/watchdog.c index b9be18f..84b5035 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -351,7 +351,7 @@ static void watchdog_overflow_callback(struct perf_event *event, trigger_allbutself_cpu_backtrace(); if (hardlockup_panic) - nmi_panic("Hard LOCKUP"); + nmi_panic(regs, "Hard LOCKUP"); __this_cpu_write(hard_watchdog_warn, true); return; -- cgit v0.10.2 From 7bbee5ca3896f69f09c68be549cb8997abe6bca6 Mon Sep 17 00:00:00 2001 From: Hidehiro Kawai Date: Mon, 14 Dec 2015 11:19:11 +0100 Subject: kexec: Fix race between panic() and crash_kexec() Currently, panic() and crash_kexec() can be called at the same time. For example (x86 case): CPU 0: oops_end() crash_kexec() mutex_trylock() // acquired nmi_shootdown_cpus() // stop other CPUs CPU 1: panic() crash_kexec() mutex_trylock() // failed to acquire smp_send_stop() // stop other CPUs infinite loop If CPU 1 calls smp_send_stop() before nmi_shootdown_cpus(), kdump fails. In another case: CPU 0: oops_end() crash_kexec() mutex_trylock() // acquired io_check_error() panic() crash_kexec() mutex_trylock() // failed to acquire infinite loop Clearly, this is an undesirable result. To fix this problem, this patch changes crash_kexec() to exclude others by using the panic_cpu atomic. Signed-off-by: Hidehiro Kawai Acked-by: Michal Hocko Cc: Andrew Morton Cc: Baoquan He Cc: Dave Young Cc: "Eric W. Biederman" Cc: HATAYAMA Daisuke Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jonathan Corbet Cc: kexec@lists.infradead.org Cc: linux-doc@vger.kernel.org Cc: Martin Schwidefsky Cc: Masami Hiramatsu Cc: Minfei Huang Cc: Peter Zijlstra Cc: Seth Jennings Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vitaly Kuznetsov Cc: Vivek Goyal Cc: x86-ml Link: http://lkml.kernel.org/r/20151210014630.25437.94161.stgit@softrs Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner diff --git a/include/linux/kexec.h b/include/linux/kexec.h index d140b1e..7b68d27 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -237,6 +237,7 @@ extern int kexec_purgatory_get_set_symbol(struct kimage *image, unsigned int size, bool get_value); extern void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name); +extern void __crash_kexec(struct pt_regs *); extern void crash_kexec(struct pt_regs *); int kexec_should_crash(struct task_struct *); void crash_save_cpu(struct pt_regs *regs, int cpu); @@ -332,6 +333,7 @@ int __weak arch_kexec_apply_relocations(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, #else /* !CONFIG_KEXEC_CORE */ struct pt_regs; struct task_struct; +static inline void __crash_kexec(struct pt_regs *regs) { } static inline void crash_kexec(struct pt_regs *regs) { } static inline int kexec_should_crash(struct task_struct *p) { return 0; } #define kexec_in_progress false diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 11b64a6..c823f30 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -853,7 +853,12 @@ struct kimage *kexec_image; struct kimage *kexec_crash_image; int kexec_load_disabled; -void crash_kexec(struct pt_regs *regs) +/* + * No panic_cpu check version of crash_kexec(). This function is called + * only when panic_cpu holds the current CPU number; this is the only CPU + * which processes crash_kexec routines. + */ +void __crash_kexec(struct pt_regs *regs) { /* Take the kexec_mutex here to prevent sys_kexec_load * running on one cpu from replacing the crash kernel @@ -876,6 +881,29 @@ void crash_kexec(struct pt_regs *regs) } } +void crash_kexec(struct pt_regs *regs) +{ + int old_cpu, this_cpu; + + /* + * Only one CPU is allowed to execute the crash_kexec() code as with + * panic(). Otherwise parallel calls of panic() and crash_kexec() + * may stop each other. To exclude them, we use panic_cpu here too. + */ + this_cpu = raw_smp_processor_id(); + old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, this_cpu); + if (old_cpu == PANIC_CPU_INVALID) { + /* This is the 1st CPU which comes here, so go ahead. */ + __crash_kexec(regs); + + /* + * Reset panic_cpu to allow another panic()/crash_kexec() + * call. + */ + atomic_set(&panic_cpu, PANIC_CPU_INVALID); + } +} + size_t crash_get_memory_size(void) { size_t size = 0; diff --git a/kernel/panic.c b/kernel/panic.c index 06f31b49..b333380 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -136,9 +136,11 @@ void panic(const char *fmt, ...) * everything else. * If we want to run this after calling panic_notifiers, pass * the "crash_kexec_post_notifiers" option to the kernel. + * + * Bypass the panic_cpu check and call __crash_kexec directly. */ if (!crash_kexec_post_notifiers) - crash_kexec(NULL); + __crash_kexec(NULL); /* * Note smp_send_stop is the usual smp shutdown function, which @@ -161,9 +163,11 @@ void panic(const char *fmt, ...) * panic_notifiers and dumping kmsg before kdump. * Note: since some panic_notifiers can make crashed kernel * more unstable, it can increase risks of the kdump failure too. + * + * Bypass the panic_cpu check and call __crash_kexec directly. */ if (crash_kexec_post_notifiers) - crash_kexec(NULL); + __crash_kexec(NULL); bust_spinlocks(0); -- cgit v0.10.2 From b7c4948e9881fb38b048269f376fb4bf194ce24a Mon Sep 17 00:00:00 2001 From: Hidehiro Kawai Date: Mon, 14 Dec 2015 11:19:12 +0100 Subject: x86/apic: Introduce apic_extnmi command line parameter This patch introduces a command line parameter apic_extnmi: apic_extnmi=( bsp|all|none ) The default value is "bsp" and this is the current behavior: only the Boot-Strapping Processor receives an external NMI. "all" allows external NMIs to be broadcast to all CPUs. This would raise the success rate of panic on NMI when BSP hangs in NMI context or the external NMI is swallowed by other NMI handlers on the BSP. If you specify "none", no CPUs receive external NMIs. This is useful for the dump capture kernel so that it cannot be shot down by accidentally pressing the external NMI button (on platforms which have it) while saving a crash dump. Signed-off-by: Hidehiro Kawai Acked-by: Michal Hocko Cc: Andrew Morton Cc: Andy Lutomirski Cc: Bandan Das Cc: Baoquan He Cc: "Eric W. Biederman" Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jiang Liu Cc: Joerg Roedel Cc: Jonathan Corbet Cc: kexec@lists.infradead.org Cc: linux-doc@vger.kernel.org Cc: "Maciej W. Rozycki" Cc: Masami Hiramatsu Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Ricardo Ribalda Delgado Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Viresh Kumar Cc: Vivek Goyal Cc: x86-ml Link: http://lkml.kernel.org/r/20151210014632.25437.43778.stgit@softrs Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 742f69d..74acea5 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -472,6 +472,15 @@ bytes respectively. Such letter suffixes can also be entirely omitted. Change the amount of debugging information output when initialising the APIC and IO-APIC components. + apic_extnmi= [APIC,X86] External NMI delivery setting + Format: { bsp (default) | all | none } + bsp: External NMI is delivered only to CPU 0 + all: External NMIs are broadcast to all CPUs as a + backup of CPU 0 + none: External NMI is masked for all CPUs. This is + useful so that a dump capture kernel won't be + shot down by NMI + autoconf= [IPV6] See Documentation/networking/ipv6.txt. diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 7f62ad4..c80f6b6 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -23,6 +23,11 @@ #define APIC_VERBOSE 1 #define APIC_DEBUG 2 +/* Macros for apic_extnmi which controls external NMI masking */ +#define APIC_EXTNMI_BSP 0 /* Default */ +#define APIC_EXTNMI_ALL 1 +#define APIC_EXTNMI_NONE 2 + /* * Define the default level of output to be very little * This can be turned up by using apic=verbose for more diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 8d7df74..8a5cdda 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -82,6 +82,12 @@ physid_mask_t phys_cpu_present_map; static unsigned int disabled_cpu_apicid __read_mostly = BAD_APICID; /* + * This variable controls which CPUs receive external NMIs. By default, + * external NMIs are delivered only to the BSP. + */ +static int apic_extnmi = APIC_EXTNMI_BSP; + +/* * Map cpu index to physical APIC ID */ DEFINE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_cpu_to_apicid, BAD_APICID); @@ -1161,6 +1167,8 @@ void __init init_bsp_APIC(void) value = APIC_DM_NMI; if (!lapic_is_integrated()) /* 82489DX */ value |= APIC_LVT_LEVEL_TRIGGER; + if (apic_extnmi == APIC_EXTNMI_NONE) + value |= APIC_LVT_MASKED; apic_write(APIC_LVT1, value); } @@ -1378,9 +1386,11 @@ void setup_local_APIC(void) apic_write(APIC_LVT0, value); /* - * only the BP should see the LINT1 NMI signal, obviously. + * Only the BSP sees the LINT1 NMI signal by default. This can be + * modified by apic_extnmi= boot option. */ - if (!cpu) + if ((!cpu && apic_extnmi != APIC_EXTNMI_NONE) || + apic_extnmi == APIC_EXTNMI_ALL) value = APIC_DM_NMI; else value = APIC_DM_NMI | APIC_LVT_MASKED; @@ -2557,3 +2567,23 @@ static int __init apic_set_disabled_cpu_apicid(char *arg) return 0; } early_param("disable_cpu_apicid", apic_set_disabled_cpu_apicid); + +static int __init apic_set_extnmi(char *arg) +{ + if (!arg) + return -EINVAL; + + if (!strncmp("all", arg, 3)) + apic_extnmi = APIC_EXTNMI_ALL; + else if (!strncmp("none", arg, 4)) + apic_extnmi = APIC_EXTNMI_NONE; + else if (!strncmp("bsp", arg, 3)) + apic_extnmi = APIC_EXTNMI_BSP; + else { + pr_warn("Unknown external NMI delivery mode `%s' ignored\n", arg); + return -EINVAL; + } + + return 0; +} +early_param("apic_extnmi", apic_set_extnmi); -- cgit v0.10.2 From b279d67df88a49c6ca32b3eebd195660254be394 Mon Sep 17 00:00:00 2001 From: Hidehiro Kawai Date: Mon, 14 Dec 2015 11:19:13 +0100 Subject: x86/nmi: Save regs in crash dump on external NMI Now, multiple CPUs can receive an external NMI simultaneously by specifying the "apic_extnmi=all" command line parameter. When we take a crash dump by using external NMI with this option, we fail to save registers into the crash dump. This happens as follows: CPU 0 CPU 1 ================================ ============================= receive an external NMI default_do_nmi() receive an external NMI spin_lock(&nmi_reason_lock) default_do_nmi() io_check_error() spin_lock(&nmi_reason_lock) panic() busy loop ... kdump_nmi_shootdown_cpus() issue NMI IPI -----------> blocked until IRET busy loop... Here, since CPU 1 is in NMI context, an additional NMI from CPU 0 remains unhandled until CPU 1 IRETs. However, CPU 1 will never execute IRET so the NMI is not handled and the callback function to save registers is never called. To solve this issue, we check if the IPI for crash dumping was issued while waiting for nmi_reason_lock to be released, and if so, call its callback function directly. If the IPI is not issued (e.g. kdump is disabled), the actual behavior doesn't change. Signed-off-by: Hidehiro Kawai Acked-by: Michal Hocko Cc: Andrew Morton Cc: Andy Lutomirski Cc: Baoquan He Cc: Dave Young Cc: "Eric W. Biederman" Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jiang Liu Cc: Jonathan Corbet Cc: kexec@lists.infradead.org Cc: linux-doc@vger.kernel.org Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Stefan Lippers-Hollmann Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vivek Goyal Cc: x86-ml Link: http://lkml.kernel.org/r/20151210065245.4587.39316.stgit@softrs Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner diff --git a/arch/x86/include/asm/reboot.h b/arch/x86/include/asm/reboot.h index a82c4f1..2cb1cc2 100644 --- a/arch/x86/include/asm/reboot.h +++ b/arch/x86/include/asm/reboot.h @@ -25,5 +25,6 @@ void __noreturn machine_real_restart(unsigned int type); typedef void (*nmi_shootdown_cb)(int, struct pt_regs*); void nmi_shootdown_cpus(nmi_shootdown_cb callback); +void run_crash_ipi_callback(struct pt_regs *regs); #endif /* _ASM_X86_REBOOT_H */ diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 424aec4..8a2cdd7 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -29,6 +29,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include @@ -356,8 +357,19 @@ static void default_do_nmi(struct pt_regs *regs) return; } - /* Non-CPU-specific NMI: NMI sources can be processed on any CPU */ - raw_spin_lock(&nmi_reason_lock); + /* + * Non-CPU-specific NMI: NMI sources can be processed on any CPU. + * + * Another CPU may be processing panic routines while holding + * nmi_reason_lock. Check if the CPU issued the IPI for crash dumping, + * and if so, call its callback directly. If there is no CPU preparing + * crash dump, we simply loop here. + */ + while (!raw_spin_trylock(&nmi_reason_lock)) { + run_crash_ipi_callback(regs); + cpu_relax(); + } + reason = x86_platform.get_nmi_reason(); if (reason & NMI_REASON_MASK) { diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 1da1302..d64889a 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -793,17 +793,23 @@ void nmi_shootdown_cpus(nmi_shootdown_cb callback) /* Leave the nmi callback set */ } +/* + * Check if the crash dumping IPI got issued and if so, call its callback + * directly. This function is used when we have already been in NMI handler. + * It doesn't return. + */ +void run_crash_ipi_callback(struct pt_regs *regs) +{ + if (crash_ipi_issued) + crash_nmi_callback(0, regs); +} + /* Override the weak function in kernel/panic.c */ void nmi_panic_self_stop(struct pt_regs *regs) { while (1) { - /* - * Wait for the crash dumping IPI to be issued, and then - * call its callback directly. - */ - if (READ_ONCE(crash_ipi_issued)) - crash_nmi_callback(0, regs); /* Don't return */ - + /* If no CPU is preparing crash dump, we simply loop here. */ + run_crash_ipi_callback(regs); cpu_relax(); } } @@ -813,4 +819,8 @@ void nmi_shootdown_cpus(nmi_shootdown_cb callback) { /* No other CPUs to shoot down */ } + +void run_crash_ipi_callback(struct pt_regs *regs) +{ +} #endif -- cgit v0.10.2 From 9f318e3fcb1d4c48c26e8ca2ff2a459b82f36a23 Mon Sep 17 00:00:00 2001 From: Hidehiro Kawai Date: Mon, 14 Dec 2015 11:19:14 +0100 Subject: Documentation: Document kernel.panic_on_io_nmi sysctl kernel.panic_on_io_nmi sysctl was introduced by commit 5211a242d0cb ("x86: Add sysctl to allow panic on IOCK NMI error") but its documentation is missing. So, add it. Signed-off-by: Hidehiro Kawai Requested-by: Borislav Petkov Cc: Andrew Morton Cc: Baoquan He Cc: Chris Metcalf Cc: Don Zickus Cc: "Eric W. Biederman" Cc: Heinrich Schuchardt Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jiri Kosina Cc: Jonathan Corbet Cc: kexec@lists.infradead.org Cc: linux-doc@vger.kernel.org Cc: Manfred Spraul Cc: Masami Hiramatsu Cc: Michal Hocko Cc: Nicolas Iooss Cc: Peter Zijlstra Cc: Seth Jennings Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Ulrich Obergfell Cc: Vivek Goyal Cc: x86-ml Link: http://lkml.kernel.org/r/20151210014637.25437.71903.stgit@softrs Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index af70d15..73c6b1e 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -551,6 +551,21 @@ the recommended setting is 60. ============================================================== +panic_on_io_nmi: + +Controls the kernel's behavior when a CPU receives an NMI caused by +an IO error. + +0: try to continue operation (default) + +1: panic immediately. The IO error triggered an NMI. This indicates a + serious system condition which could result in IO data corruption. + Rather than continuing, panicking might be a better choice. Some + servers issue this sort of NMI when the dump button is pushed, + and you can use this option to take a crash dump. + +============================================================== + panic_on_oops: Controls the kernel's behaviour when an oops or BUG is encountered. -- cgit v0.10.2 From c8f3e518d3444ee9200a4987421fcee60f768f11 Mon Sep 17 00:00:00 2001 From: Jake Oshins Date: Thu, 10 Dec 2015 17:52:59 +0000 Subject: x86/irq: Export functions to allow MSI domains in modules The Linux kernel already has the concept of IRQ domain, wherein a component can expose a set of IRQs which are managed by a particular interrupt controller chip or other subsystem. The PCI driver exposes the notion of an IRQ domain for Message-Signaled Interrupts (MSI) from PCI Express devices. This patch exposes the functions which are necessary for creating a MSI IRQ domain within a module. [ tglx: Split it into x86 and core irq parts ] Signed-off-by: Jake Oshins Cc: gregkh@linuxfoundation.org Cc: kys@microsoft.com Cc: devel@linuxdriverproject.org Cc: olaf@aepfle.de Cc: apw@canonical.com Cc: vkuznets@redhat.com Cc: haiyangz@microsoft.com Cc: marc.zyngier@arm.com Cc: bhelgaas@google.com Link: http://lkml.kernel.org/r/1449769983-12948-4-git-send-email-jakeo@microsoft.com Signed-off-by: Thomas Gleixner diff --git a/arch/x86/include/asm/msi.h b/arch/x86/include/asm/msi.h index 93724cc..eb4b09b 100644 --- a/arch/x86/include/asm/msi.h +++ b/arch/x86/include/asm/msi.h @@ -1,7 +1,13 @@ #ifndef _ASM_X86_MSI_H #define _ASM_X86_MSI_H #include +#include typedef struct irq_alloc_info msi_alloc_info_t; +int pci_msi_prepare(struct irq_domain *domain, struct device *dev, int nvec, + msi_alloc_info_t *arg); + +void pci_msi_set_desc(msi_alloc_info_t *arg, struct msi_desc *desc); + #endif /* _ASM_X86_MSI_H */ diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index 5f1feb6..ade2532 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -96,8 +96,8 @@ static irq_hw_number_t pci_msi_get_hwirq(struct msi_domain_info *info, return arg->msi_hwirq; } -static int pci_msi_prepare(struct irq_domain *domain, struct device *dev, - int nvec, msi_alloc_info_t *arg) +int pci_msi_prepare(struct irq_domain *domain, struct device *dev, int nvec, + msi_alloc_info_t *arg) { struct pci_dev *pdev = to_pci_dev(dev); struct msi_desc *desc = first_pci_msi_entry(pdev); @@ -113,11 +113,13 @@ static int pci_msi_prepare(struct irq_domain *domain, struct device *dev, return 0; } +EXPORT_SYMBOL_GPL(pci_msi_prepare); -static void pci_msi_set_desc(msi_alloc_info_t *arg, struct msi_desc *desc) +void pci_msi_set_desc(msi_alloc_info_t *arg, struct msi_desc *desc) { arg->msi_hwirq = pci_msi_domain_calc_hwirq(arg->msi_dev, desc); } +EXPORT_SYMBOL_GPL(pci_msi_set_desc); static struct msi_domain_ops pci_msi_domain_ops = { .get_hwirq = pci_msi_get_hwirq, diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 861bc59..908cb37 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -29,6 +29,7 @@ struct apic_chip_data { }; struct irq_domain *x86_vector_domain; +EXPORT_SYMBOL_GPL(x86_vector_domain); static DEFINE_RAW_SPINLOCK(vector_lock); static cpumask_var_t vector_cpumask; static struct irq_chip lapic_controller; @@ -66,6 +67,7 @@ struct irq_cfg *irqd_cfg(struct irq_data *irq_data) return data ? &data->cfg : NULL; } +EXPORT_SYMBOL_GPL(irqd_cfg); struct irq_cfg *irq_cfg(unsigned int irq) { -- cgit v0.10.2