From e020d5bd8a730757b565b18d620240f71c3e21fe Mon Sep 17 00:00:00 2001 From: Steven Miao Date: Mon, 23 Jun 2014 13:22:02 -0700 Subject: mm: nommu: per-thread vma cache fix mm could be removed from current task struct, using previous vma->vm_mm It will crash on blackfin after updated to Linux 3.15. The commit "mm: per-thread vma caching" caused the crash. mm could be removed from current task struct before mmput()-> exit_mmap()-> delete_vma_from_mm() the detailed fault information: NULL pointer access Kernel OOPS in progress Deferred Exception context CURRENT PROCESS: COMM=modprobe PID=278 CPU=0 invalid mm return address: [0x000531de]; contents of: 0x000531b0: c727 acea 0c42 181d 0000 0000 0000 a0a8 0x000531c0: b090 acaa 0c42 1806 0000 0000 0000 a0e8 0x000531d0: b0d0 e801 0000 05b3 0010 e522 0046 [a090] 0x000531e0: 6408 b090 0c00 17cc 3042 e3ff f37b 2fc8 CPU: 0 PID: 278 Comm: modprobe Not tainted 3.15.0-ADI-2014R1-pre-00345-gea9f446 #25 task: 0572b720 ti: 0569e000 task.ti: 0569e000 Compiled for cpu family 0x27fe (Rev 0), but running on:0x0000 (Rev 0) ADSP-BF609-0.0 500(MHz CCLK) 125(MHz SCLK) (mpu off) Linux version 3.15.0-ADI-2014R1-pre-00345-gea9f446 (steven@steven-OptiPlex-390) (gcc version 4.3.5 (ADI-trunk/svn-5962) ) #25 Tue Jun 10 17:47:46 CST 2014 SEQUENCER STATUS: Not tainted SEQSTAT: 00000027 IPEND: 8008 IMASK: ffff SYSCFG: 2806 EXCAUSE : 0x27 physical IVG3 asserted : <0xffa00744> { _trap + 0x0 } physical IVG15 asserted : <0xffa00d68> { _evt_system_call + 0x0 } logical irq 6 mapped : <0xffa003bc> { _bfin_coretmr_interrupt + 0x0 } logical irq 7 mapped : <0x00008828> { _bfin_fault_routine + 0x0 } logical irq 11 mapped : <0x00007724> { _l2_ecc_err + 0x0 } logical irq 13 mapped : <0x00008828> { _bfin_fault_routine + 0x0 } logical irq 39 mapped : <0x00150788> { _bfin_twi_interrupt_entry + 0x0 } logical irq 40 mapped : <0x00150788> { _bfin_twi_interrupt_entry + 0x0 } RETE: <0x00000000> /* Maybe null pointer? */ RETN: <0x0569fe50> /* kernel dynamic memory (maybe user-space) */ RETX: <0x00000480> /* Maybe fixed code section */ RETS: <0x00053384> { _exit_mmap + 0x28 } PC : <0x000531de> { _delete_vma_from_mm + 0x92 } DCPLB_FAULT_ADDR: <0x00000008> /* Maybe null pointer? 
*/ ICPLB_FAULT_ADDR: <0x000531de> { _delete_vma_from_mm + 0x92 } PROCESSOR STATE: R0 : 00000004 R1 : 0569e000 R2 : 00bf3db4 R3 : 00000000 R4 : 057f9800 R5 : 00000001 R6 : 0569ddd0 R7 : 0572b720 P0 : 0572b854 P1 : 00000004 P2 : 00000000 P3 : 0569dda0 P4 : 0572b720 P5 : 0566c368 FP : 0569fe5c SP : 0569fd74 LB0: 057f523f LT0: 057f523e LC0: 00000000 LB1: 0005317c LT1: 00053172 LC1: 00000002 B0 : 00000000 L0 : 00000000 M0 : 0566f5bc I0 : 00000000 B1 : 00000000 L1 : 00000000 M1 : 00000000 I1 : ffffffff B2 : 00000001 L2 : 00000000 M2 : 00000000 I2 : 00000000 B3 : 00000000 L3 : 00000000 M3 : 00000000 I3 : 057f8000 A0.w: 00000000 A0.x: 00000000 A1.w: 00000000 A1.x: 00000000 USP : 056ffcf8 ASTAT: 02003024 Hardware Trace: 0 Target : <0x00003fb8> { _trap_c + 0x0 } Source : <0xffa006d8> { _exception_to_level5 + 0xa0 } JUMP.L 1 Target : <0xffa00638> { _exception_to_level5 + 0x0 } Source : <0xffa004f2> { _bfin_return_from_exception + 0x6 } RTX 2 Target : <0xffa004ec> { _bfin_return_from_exception + 0x0 } Source : <0xffa00590> { _ex_trap_c + 0x70 } JUMP.S 3 Target : <0xffa00520> { _ex_trap_c + 0x0 } Source : <0xffa0076e> { _trap + 0x2a } JUMP (P4) 4 Target : <0xffa00744> { _trap + 0x0 } FAULT : <0x000531de> { _delete_vma_from_mm + 0x92 } P0 = W[P2 + 2] Source : <0x000531da> { _delete_vma_from_mm + 0x8e } P2 = [P4 + 0x18] 5 Target : <0x000531da> { _delete_vma_from_mm + 0x8e } Source : <0x00053176> { _delete_vma_from_mm + 0x2a } IF CC JUMP pcrel 6 Target : <0x0005314c> { _delete_vma_from_mm + 0x0 } Source : <0x00053380> { _exit_mmap + 0x24 } JUMP.L 7 Target : <0x00053378> { _exit_mmap + 0x1c } Source : <0x00053394> { _exit_mmap + 0x38 } IF !CC JUMP pcrel (BP) 8 Target : <0x00053390> { _exit_mmap + 0x34 } Source : <0xffa020e0> { __cond_resched + 0x20 } RTS 9 Target : <0xffa020c0> { __cond_resched + 0x0 } Source : <0x0005338c> { _exit_mmap + 0x30 } JUMP.L 10 Target : <0x0005338c> { _exit_mmap + 0x30 } Source : <0x0005333a> { _delete_vma + 0xb2 } RTS 11 Target : <0x00053334> { _delete_vma + 0xac } Source : <0x0005507a> { _kmem_cache_free + 0xba } RTS 12 Target : <0x00055068> { _kmem_cache_free + 0xa8 } Source : <0x0005505e> { _kmem_cache_free + 0x9e } IF !CC JUMP pcrel (BP) 13 Target : <0x00055052> { _kmem_cache_free + 0x92 } Source : <0x0005501a> { _kmem_cache_free + 0x5a } IF CC JUMP pcrel 14 Target : <0x00054ff4> { _kmem_cache_free + 0x34 } Source : <0x00054fce> { _kmem_cache_free + 0xe } IF CC JUMP pcrel (BP) 15 Target : <0x00054fc0> { _kmem_cache_free + 0x0 } Source : <0x00053330> { _delete_vma + 0xa8 } JUMP.L Kernel Stack Stack info: SP: [0x0569ff24] <0x0569ff24> /* kernel dynamic memory (maybe user-space) */ Memory from 0x0569ff20 to 056a0000 0569ff20: 00000001 [04e8da5a] 00008000 00000000 00000000 056a0000 04e8da5a 04e8da5a 0569ff40: 04eb9eea ffa00dce 02003025 04ea09c5 057f523f 04ea09c4 057f523e 00000000 0569ff60: 00000000 00000000 00000000 00000000 00000000 00000000 00000001 00000000 0569ff80: 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 0569ffa0: 0566f5bc 057f8000 057f8000 00000001 04ec0170 056ffcf8 056ffd04 057f9800 0569ffc0: 04d1d498 057f9800 057f8fe4 057f8ef0 00000001 057f928c 00000001 00000001 0569ffe0: 057f9800 00000000 00000008 00000007 00000001 00000001 00000001 <00002806> Return addresses in stack: address : <0x00002806> { _show_cpuinfo + 0x2d2 } Modules linked in: Kernel panic - not syncing: Kernel exception [ end Kernel panic - not syncing: Kernel exception Signed-off-by: Steven Miao Acked-by: Davidlohr Bueso Reviewed-by: Rik van Riel Acked-by: David Rientjes Cc: 
[3.15.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/nommu.c b/mm/nommu.c index b78e3a8..4a852f6 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -786,7 +786,7 @@ static void delete_vma_from_mm(struct vm_area_struct *vma) for (i = 0; i < VMACACHE_SIZE; i++) { /* if the vma is cached, invalidate the entire cache */ if (curr->vmacache[i] == vma) { - vmacache_invalidate(curr->mm); + vmacache_invalidate(mm); break; } } -- cgit v0.10.2 From 8d056c48e486249e6487910b83e0f3be7c14acf7 Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Mon, 23 Jun 2014 13:22:02 -0700 Subject: CPU hotplug, smp: flush any pending IPI callbacks before CPU offline There is a race between the CPU offline code (within stop-machine) and the smp-call-function code, which can lead to getting IPIs on the outgoing CPU, *after* it has gone offline. Specifically, this can happen when using smp_call_function_single_async() to send the IPI, since this API allows sending asynchronous IPIs from IRQ disabled contexts. The exact race condition is described below. During CPU offline, in stop-machine, we don't enforce any rule in the _DISABLE_IRQ stage, regarding the order in which the outgoing CPU and the other CPUs disable their local interrupts. Due to this, we can encounter a situation in which an IPI is sent by one of the other CPUs to the outgoing CPU (while it is *still* online), but the outgoing CPU ends up noticing it only *after* it has gone offline. CPU 1 CPU 2 (Online CPU) (CPU going offline) Enter _PREPARE stage Enter _PREPARE stage Enter _DISABLE_IRQ stage = Got a device interrupt, and | Didn't notice the IPI the interrupt handler sent an | since interrupts were IPI to CPU 2 using | disabled on this CPU. smp_call_function_single_async() | = Enter _DISABLE_IRQ stage Enter _RUN stage Enter _RUN stage = Busy loop with interrupts | Invoke take_cpu_down() disabled. | and take CPU 2 offline = Enter _EXIT stage Enter _EXIT stage Re-enable interrupts Re-enable interrupts The pending IPI is noted immediately, but alas, the CPU is offline at this point. This of course, makes the smp-call-function IPI handler code running on CPU 2 unhappy and it complains about "receiving an IPI on an offline CPU". One real example of the scenario on CPU 1 is the block layer's complete-request call-path: __blk_complete_request() [interrupt-handler] raise_blk_irq() smp_call_function_single_async() However, if we look closely, the block layer does check that the target CPU is online before firing the IPI. So in this case, it is actually the unfortunate ordering/timing of events in the stop-machine phase that leads to receiving IPIs after the target CPU has gone offline. In reality, getting a late IPI on an offline CPU is not too bad by itself (this can happen even due to hardware latencies in IPI send-receive). It is a bug only if the target CPU really went offline without executing all the callbacks queued on its list. (Note that a CPU is free to execute its pending smp-call-function callbacks in a batch, without waiting for the corresponding IPIs to arrive for each one of those callbacks). So, fixing this issue can be broken up into two parts: 1. Ensure that a CPU goes offline only after executing all the callbacks queued on it. 2. Modify the warning condition in the smp-call-function IPI handler code such that it warns only if an offline CPU got an IPI *and* that CPU had gone offline with callbacks still pending in its queue. 
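For reference, the sender-side pattern that leaves such a callback pending looks roughly
like the block-layer path quoted above. The sketch below is purely illustrative -- the
driver scaffolding and helper names (mydev_*, mydev_request_cpu) are hypothetical, not the
actual block-layer code; only struct call_single_data and smp_call_function_single_async()
are the real 3.15-era API:

	#include <linux/smp.h>
	#include <linux/interrupt.h>

	static void remote_complete(void *info)
	{
		/* runs on the target CPU, out of the smp-call-function IPI handler */
	}

	/* must not be reused while a previous async call is still in flight */
	static struct call_single_data complete_csd;

	static irqreturn_t mydev_irq_handler(int irq, void *dev)
	{
		int cpu = mydev_request_cpu(dev);	/* hypothetical helper */

		complete_csd.func = remote_complete;
		complete_csd.info = dev;
		/*
		 * The target is checked here, yet stop-machine can still disable
		 * its interrupts before the IPI is noticed, as described above.
		 */
		if (cpu_online(cpu))
			smp_call_function_single_async(cpu, &complete_csd);
		return IRQ_HANDLED;
	}

Because the call is asynchronous, the sender returns as soon as the callback is queued;
whether the target CPU notices the IPI before it disables interrupts inside stop-machine
is exactly the timing question described above.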
Achieving part 1 is straight-forward - just flush (execute) all the queued callbacks on the outgoing CPU in the CPU_DYING stage[1], including those callbacks for which the source CPU's IPIs might not have been received on the outgoing CPU yet. Once we do this, an IPI that arrives late on the CPU going offline (either due to the race mentioned above, or due to hardware latencies) will be completely harmless, since the outgoing CPU would have executed all the queued callbacks before going offline. Overall, this fix (parts 1 and 2 put together) additionally guarantees that we will see a warning only when the *IPI-sender code* is buggy - that is, if it queues the callback _after_ the target CPU has gone offline. [1]. The CPU_DYING part needs a little more explanation: by the time we execute the CPU_DYING notifier callbacks, the CPU would have already been marked offline. But we want to flush out the pending callbacks at this stage, ignoring the fact that the CPU is offline. So restructure the IPI handler code so that we can by-pass the "is-cpu-offline?" check in this particular case. (Of course, the right solution here is to fix CPU hotplug to mark the CPU offline _after_ invoking the CPU_DYING notifiers, but this requires a lot of audit to ensure that this change doesn't break any existing code; hence lets go with the solution proposed above until that is done). [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Srivatsa S. Bhat Suggested-by: Frederic Weisbecker Cc: "Paul E. McKenney" Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Frederic Weisbecker Cc: Gautham R Shenoy Cc: Ingo Molnar Cc: Mel Gorman Cc: Mike Galbraith Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rafael J. Wysocki Cc: Rik van Riel Cc: Rusty Russell Cc: Steven Rostedt Cc: Tejun Heo Cc: Thomas Gleixner Tested-by: Sachin Kamat Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/kernel/smp.c b/kernel/smp.c index 306f818..80c33f8 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -29,6 +29,8 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_function_data, cfd_data); static DEFINE_PER_CPU_SHARED_ALIGNED(struct llist_head, call_single_queue); +static void flush_smp_call_function_queue(bool warn_cpu_offline); + static int hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) { @@ -51,12 +53,27 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) #ifdef CONFIG_HOTPLUG_CPU case CPU_UP_CANCELED: case CPU_UP_CANCELED_FROZEN: + /* Fall-through to the CPU_DEAD[_FROZEN] case. */ case CPU_DEAD: case CPU_DEAD_FROZEN: free_cpumask_var(cfd->cpumask); free_percpu(cfd->csd); break; + + case CPU_DYING: + case CPU_DYING_FROZEN: + /* + * The IPIs for the smp-call-function callbacks queued by other + * CPUs might arrive late, either due to hardware latencies or + * because this CPU disabled interrupts (inside stop-machine) + * before the IPIs were sent. So flush out any pending callbacks + * explicitly (without waiting for the IPIs to arrive), to + * ensure that the outgoing CPU doesn't go offline with work + * still pending. + */ + flush_smp_call_function_queue(false); + break; #endif }; @@ -177,23 +194,47 @@ static int generic_exec_single(int cpu, struct call_single_data *csd, return 0; } -/* - * Invoked by arch to handle an IPI for call function single. Must be - * called from the arch with interrupts disabled. +/** + * generic_smp_call_function_single_interrupt - Execute SMP IPI callbacks + * + * Invoked by arch to handle an IPI for call function single. 
+ * Must be called with interrupts disabled. */ void generic_smp_call_function_single_interrupt(void) { + flush_smp_call_function_queue(true); +} + +/** + * flush_smp_call_function_queue - Flush pending smp-call-function callbacks + * + * @warn_cpu_offline: If set to 'true', warn if callbacks were queued on an + * offline CPU. Skip this check if set to 'false'. + * + * Flush any pending smp-call-function callbacks queued on this CPU. This is + * invoked by the generic IPI handler, as well as by a CPU about to go offline, + * to ensure that all pending IPI callbacks are run before it goes completely + * offline. + * + * Loop through the call_single_queue and run all the queued callbacks. + * Must be called with interrupts disabled. + */ +static void flush_smp_call_function_queue(bool warn_cpu_offline) +{ + struct llist_head *head; struct llist_node *entry; struct call_single_data *csd, *csd_next; static bool warned; - entry = llist_del_all(&__get_cpu_var(call_single_queue)); + WARN_ON(!irqs_disabled()); + + head = &__get_cpu_var(call_single_queue); + entry = llist_del_all(head); entry = llist_reverse_order(entry); - /* - * Shouldn't receive this interrupt on a cpu that is not yet online. - */ - if (unlikely(!cpu_online(smp_processor_id()) && !warned)) { + /* There shouldn't be any pending callbacks on an offline CPU. */ + if (unlikely(warn_cpu_offline && !cpu_online(smp_processor_id()) && + !warned && !llist_empty(head))) { warned = true; WARN(1, "IPI on offline CPU %d\n", smp_processor_id()); -- cgit v0.10.2 From b3acc56bfe1287c6b666e80edc70b89eea2a1a80 Mon Sep 17 00:00:00 2001 From: Petr Tesarik Date: Mon, 23 Jun 2014 13:22:03 -0700 Subject: kexec: save PG_head_mask in VMCOREINFO To allow filtering of huge pages, makedumpfile must be able to identify them in the dump. This can be done by checking the appropriate page flag, so communicate its value to makedumpfile through the VMCOREINFO interface. There's only one small catch. Depending on how many page flags are available on a given architecture, this bit can be called PG_head or PG_compound. I sent a similar patch back in 2012, but Eric Biederman did not like using an #ifdef. So, this time I'm adding a common symbol (PG_head_mask) instead. See https://lkml.org/lkml/2012/11/28/91 for the previous version. 
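As a hypothetical consumer-side illustration (this is not actual makedumpfile code), the
dump tool only needs the exported mask value; which flag name the kernel configuration
uses underneath no longer matters to it:

	/*
	 * pg_head_mask is assumed to have already been parsed from the
	 * NUMBER(PG_head_mask) entry of the VMCOREINFO note.
	 */
	static int page_is_hugepage_head(unsigned long page_flags,
					 unsigned long pg_head_mask)
	{
		return (page_flags & pg_head_mask) != 0;
	}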
Signed-off-by: Petr Tesarik Acked-by: Vivek Goyal Cc: Eric Biederman Cc: Paul Mackerras Cc: Fengguang Wu Cc: Benjamin Herrenschmidt Cc: Shaohua Li Cc: Alexey Kardashevskiy Cc: Sasha Levin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 3c545b4..8304959 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -360,6 +360,9 @@ static inline void ClearPageCompound(struct page *page) ClearPageHead(page); } #endif + +#define PG_head_mask ((1L << PG_head)) + #else /* * Reduce page flag use as much as possible by overlapping diff --git a/kernel/kexec.c b/kernel/kexec.c index 6748688..369f41a 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1617,6 +1617,7 @@ static int __init crash_save_vmcoreinfo_init(void) #ifdef CONFIG_MEMORY_FAILURE VMCOREINFO_NUMBER(PG_hwpoison); #endif + VMCOREINFO_NUMBER(PG_head_mask); VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE); arch_crash_save_vmcoreinfo(); -- cgit v0.10.2 From 7cdb0d25bc93e936edbfde12fe277ee21b88c1ef Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 23 Jun 2014 13:22:03 -0700 Subject: mm, hotplug: probe interface is available on several platforms Documentation/memory-hotplug.txt incorrectly states that the memory driver "probe" interface is only supported on powerpc and is vague about its application on x86. Clarify the platforms that make this interface available if memory hotplug is enabled. Signed-off-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/Documentation/memory-hotplug.txt b/Documentation/memory-hotplug.txt index f304edb..45134dc 100644 --- a/Documentation/memory-hotplug.txt +++ b/Documentation/memory-hotplug.txt @@ -209,15 +209,12 @@ If memory device is found, memory hotplug code will be called. 4.2 Notify memory hot-add event by hand ------------ -On powerpc, the firmware does not notify a memory hotplug event to the kernel. -Therefore, "probe" interface is supported to notify the event to the kernel. -This interface depends on CONFIG_ARCH_MEMORY_PROBE. - -CONFIG_ARCH_MEMORY_PROBE is supported on powerpc only. On x86, this config -option is disabled by default since ACPI notifies a memory hotplug event to -the kernel, which performs its hotplug operation as the result. Please -enable this option if you need the "probe" interface for testing purposes -on x86. +On some architectures, the firmware may not notify the kernel of a memory +hotplug event. Therefore, the memory "probe" interface is supported to +explicitly notify the kernel. This interface depends on +CONFIG_ARCH_MEMORY_PROBE and can be configured on powerpc, sh, and x86 +if hotplug is supported, although for x86 this should be handled by ACPI +notification. Probe interface is located at /sys/devices/system/memory/probe -- cgit v0.10.2 From 13ace4d0d9db40e10ecd66dfda14e297571be813 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 23 Jun 2014 13:22:03 -0700 Subject: tmpfs: ZERO_RANGE and COLLAPSE_RANGE not currently supported I was well aware of FALLOC_FL_ZERO_RANGE and FALLOC_FL_COLLAPSE_RANGE support being added to fallocate(); but didn't realize until now that I had been too stupid to future-proof shmem_fallocate() against new additions. -EOPNOTSUPP instead of going on to ordinary fallocation. 
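A minimal userspace illustration of the intended behaviour after this change (assumes fd
is open on a tmpfs file; error handling trimmed):

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <linux/falloc.h>
	#include <errno.h>
	#include <stdio.h>

	static void try_zero_range(int fd)
	{
		/*
		 * Previously an unrecognized mode fell through to an ordinary
		 * allocation; now shmem refuses modes it does not implement.
		 */
		if (fallocate(fd, FALLOC_FL_ZERO_RANGE, 0, 4096) == -1 &&
		    errno == EOPNOTSUPP)
			fprintf(stderr, "ZERO_RANGE not supported on tmpfs\n");
	}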
Signed-off-by: Hugh Dickins Reviewed-by: Lukas Czerner Cc: [3.15] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/shmem.c b/mm/shmem.c index f484c27..91b912b 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1724,6 +1724,9 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, pgoff_t start, index, end; int error; + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) + return -EOPNOTSUPP; + mutex_lock(&inode->i_mutex); if (mode & FALLOC_FL_PUNCH_HOLE) { -- cgit v0.10.2 From 4a705fef986231a3e7a6b1a6d3c37025f021f49f Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Mon, 23 Jun 2014 13:22:03 -0700 Subject: hugetlb: fix copy_hugetlb_page_range() to handle migration/hwpoisoned entry There's a race between fork() and hugepage migration, as a result we try to "dereference" a swap entry as a normal pte, causing kernel panic. The cause of the problem is that copy_hugetlb_page_range() can't handle "swap entry" family (migration entry and hwpoisoned entry) so let's fix it. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Naoya Horiguchi Acked-by: Hugh Dickins Cc: Christoph Lameter Cc: [2.6.37+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 226910c..2024bbd 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2520,6 +2520,31 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma, update_mmu_cache(vma, address, ptep); } +static int is_hugetlb_entry_migration(pte_t pte) +{ + swp_entry_t swp; + + if (huge_pte_none(pte) || pte_present(pte)) + return 0; + swp = pte_to_swp_entry(pte); + if (non_swap_entry(swp) && is_migration_entry(swp)) + return 1; + else + return 0; +} + +static int is_hugetlb_entry_hwpoisoned(pte_t pte) +{ + swp_entry_t swp; + + if (huge_pte_none(pte) || pte_present(pte)) + return 0; + swp = pte_to_swp_entry(pte); + if (non_swap_entry(swp) && is_hwpoison_entry(swp)) + return 1; + else + return 0; +} int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *vma) @@ -2559,10 +2584,26 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, dst_ptl = huge_pte_lock(h, dst, dst_pte); src_ptl = huge_pte_lockptr(h, src, src_pte); spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); - if (!huge_pte_none(huge_ptep_get(src_pte))) { + entry = huge_ptep_get(src_pte); + if (huge_pte_none(entry)) { /* skip none entry */ + ; + } else if (unlikely(is_hugetlb_entry_migration(entry) || + is_hugetlb_entry_hwpoisoned(entry))) { + swp_entry_t swp_entry = pte_to_swp_entry(entry); + + if (is_write_migration_entry(swp_entry) && cow) { + /* + * COW mappings require pages in both + * parent and child to be set to read. 
+ */ + make_migration_entry_read(&swp_entry); + entry = swp_entry_to_pte(swp_entry); + set_huge_pte_at(src, addr, src_pte, entry); + } + set_huge_pte_at(dst, addr, dst_pte, entry); + } else { if (cow) huge_ptep_set_wrprotect(src, addr, src_pte); - entry = huge_ptep_get(src_pte); ptepage = pte_page(entry); get_page(ptepage); page_dup_rmap(ptepage); @@ -2578,32 +2619,6 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, return ret; } -static int is_hugetlb_entry_migration(pte_t pte) -{ - swp_entry_t swp; - - if (huge_pte_none(pte) || pte_present(pte)) - return 0; - swp = pte_to_swp_entry(pte); - if (non_swap_entry(swp) && is_migration_entry(swp)) - return 1; - else - return 0; -} - -static int is_hugetlb_entry_hwpoisoned(pte_t pte) -{ - swp_entry_t swp; - - if (huge_pte_none(pte) || pte_present(pte)) - return 0; - swp = pte_to_swp_entry(pte); - if (non_swap_entry(swp) && is_hwpoison_entry(swp)) - return 1; - else - return 0; -} - void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start, unsigned long end, struct page *ref_page) -- cgit v0.10.2 From 16e943bf8dbc852832ce7f971690c9f0787749d5 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Mon, 23 Jun 2014 13:22:03 -0700 Subject: MAINTAINERS: SLAB maintainer update As discussed in various threads on the side: Remove one inactive maintainer, add two new ones and update my email address. Plus add Andrew. And fix the glob to include files like mm/slab_common.c Signed-off-by: Christoph Lameter Acked-by: David Rientjes Acked-by: Joonsoo Kim Acked-by: Pekka Enberg Cc: Matt Mackall Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/CREDITS b/CREDITS index c322dcf..28ee151 100644 --- a/CREDITS +++ b/CREDITS @@ -9,6 +9,10 @@ Linus ---------- +M: Matt Mackal +E: mpm@selenic.com +D: SLOB slab allocator + N: Matti Aarnio E: mea@nic.funet.fi D: Alpha systems hacking, IPv6 and other network related stuff diff --git a/MAINTAINERS b/MAINTAINERS index 3f2e171..3cc94ff 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8196,13 +8196,15 @@ S: Maintained F: drivers/usb/misc/sisusbvga/ SLAB ALLOCATOR -M: Christoph Lameter +M: Christoph Lameter M: Pekka Enberg -M: Matt Mackall +M: David Rientjes +M: Joonsoo Kim +M: Andrew Morton L: linux-mm@kvack.org S: Maintained F: include/linux/sl?b*.h -F: mm/sl?b.c +F: mm/sl?b* SLEEPABLE READ-COPY UPDATE (SRCU) M: Lai Jiangshan -- cgit v0.10.2 From bde92cf455a03a91badb7046855592d8c008e929 Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Mon, 23 Jun 2014 13:22:03 -0700 Subject: kernel/watchdog.c: remove preemption restrictions when restarting lockup detector Peter Wu noticed the following splat on his machine when updating /proc/sys/kernel/watchdog_thresh: BUG: sleeping function called from invalid context at mm/slub.c:965 in_atomic(): 1, irqs_disabled(): 0, pid: 1, name: init 3 locks held by init/1: #0: (sb_writers#3){.+.+.+}, at: [] vfs_write+0x143/0x180 #1: (watchdog_proc_mutex){+.+.+.}, at: [] proc_dowatchdog+0x33/0x110 #2: (cpu_hotplug.lock){.+.+.+}, at: [] get_online_cpus+0x32/0x80 Preemption disabled at:[] proc_dowatchdog+0xe4/0x110 CPU: 0 PID: 1 Comm: init Not tainted 3.16.0-rc1-testing #34 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 Call Trace: dump_stack+0x4e/0x7a __might_sleep+0x11d/0x190 kmem_cache_alloc_trace+0x4e/0x1e0 perf_event_alloc+0x55/0x440 perf_event_create_kernel_counter+0x26/0xe0 watchdog_nmi_enable+0x75/0x140 update_timers_all_cpus+0x53/0xa0 proc_dowatchdog+0xe4/0x110 
proc_sys_call_handler+0xb3/0xc0 proc_sys_write+0x14/0x20 vfs_write+0xad/0x180 SyS_write+0x49/0xb0 system_call_fastpath+0x16/0x1b NMI watchdog: disabled (cpu0): hardware events not enabled What happened is after updating the watchdog_thresh, the lockup detector is restarted to utilize the new value. Part of this process involved disabling preemption. Once preemption was disabled, perf tried to allocate a new event (as part of the restart). This caused the above BUG_ON as you can't sleep with preemption disabled. The preemption restriction seemed agressive as we are not doing anything on that particular cpu, but with all the online cpus (which are protected by the get_online_cpus lock). Remove the restriction and the BUG_ON goes away. Signed-off-by: Don Zickus Acked-by: Michal Hocko Reported-by: Peter Wu Tested-by: Peter Wu Acked-by: David Rientjes Cc: [3.13+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 516203e..30e4822 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -527,10 +527,8 @@ static void update_timers_all_cpus(void) int cpu; get_online_cpus(); - preempt_disable(); for_each_online_cpu(cpu) update_timers(cpu); - preempt_enable(); put_online_cpus(); } -- cgit v0.10.2 From df2e1ef68c51ddccfdb6f34f92ee9f93541de802 Mon Sep 17 00:00:00 2001 From: Chen Gang Date: Mon, 23 Jun 2014 13:22:04 -0700 Subject: lib/Kconfig.debug: let FRAME_POINTER exclude SCORE, just like exclude most of other architectures The related warning: scripts/kconfig/conf --allmodconfig Kconfig warning: (FAULT_INJECTION_STACKTRACE_FILTER && LATENCYTOP && KMEMCHECK && LOCKDEP) selects FRAME_POINTER which has unmet direct dependencies (DEBUG_KERNEL && (CRIS || M68K || FRV || UML || AVR32 || SUPERH || BLACKFIN || MN10300 || METAG) || ARCH_WANT_FRAME_POINTERS) Signed-off-by: Chen Gang Cc: Chen Liqin Cc: Lennox Wu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 7cfcc1b..7a638aa 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -930,7 +930,7 @@ config LOCKDEP bool depends on DEBUG_KERNEL && TRACE_IRQFLAGS_SUPPORT && STACKTRACE_SUPPORT && LOCKDEP_SUPPORT select STACKTRACE - select FRAME_POINTER if !MIPS && !PPC && !ARM_UNWIND && !S390 && !MICROBLAZE && !ARC + select FRAME_POINTER if !MIPS && !PPC && !ARM_UNWIND && !S390 && !MICROBLAZE && !ARC && !SCORE select KALLSYMS select KALLSYMS_ALL @@ -1408,7 +1408,7 @@ config FAULT_INJECTION_STACKTRACE_FILTER depends on FAULT_INJECTION_DEBUG_FS && STACKTRACE_SUPPORT depends on !X86_64 select STACKTRACE - select FRAME_POINTER if !MIPS && !PPC && !S390 && !MICROBLAZE && !ARM_UNWIND && !ARC + select FRAME_POINTER if !MIPS && !PPC && !S390 && !MICROBLAZE && !ARM_UNWIND && !ARC && !SCORE help Provide stacktrace filter for fault-injection capabilities -- cgit v0.10.2 From 7cd2b0a34ab8e4db971920eef8982f985441adfb Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 23 Jun 2014 13:22:04 -0700 Subject: mm, pcp: allow restoring percpu_pagelist_fraction default Oleg reports a division by zero error on zero-length write() to the percpu_pagelist_fraction sysctl: divide error: 0000 [#1] SMP DEBUG_PAGEALLOC CPU: 1 PID: 9142 Comm: badarea_io Not tainted 3.15.0-rc2-vm-nfs+ #19 Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 task: ffff8800d5aeb6e0 ti: ffff8800d87a2000 task.ti: ffff8800d87a2000 RIP: 0010: percpu_pagelist_fraction_sysctl_handler+0x84/0x120 RSP: 0018:ffff8800d87a3e78 EFLAGS: 00010246 RAX: 0000000000000f89 RBX: ffff88011f7fd000 RCX: 
0000000000000000 RDX: 0000000000000000 RSI: 0000000000000001 RDI: 0000000000000010 RBP: ffff8800d87a3e98 R08: ffffffff81d002c8 R09: ffff8800d87a3f50 R10: 000000000000000b R11: 0000000000000246 R12: 0000000000000060 R13: ffffffff81c3c3e0 R14: ffffffff81cfddf8 R15: ffff8801193b0800 FS: 00007f614f1e9740(0000) GS:ffff88011f440000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 00007f614f1fa000 CR3: 00000000d9291000 CR4: 00000000000006e0 Call Trace: proc_sys_call_handler+0xb3/0xc0 proc_sys_write+0x14/0x20 vfs_write+0xba/0x1e0 SyS_write+0x46/0xb0 tracesys+0xe1/0xe6 However, if the percpu_pagelist_fraction sysctl is set by the user, it is also impossible to restore it to the kernel default since the user cannot write 0 to the sysctl. This patch allows the user to write 0 to restore the default behavior. It still requires a fraction equal to or larger than 8, however, as stated by the documentation for sanity. If a value in the range [1, 7] is written, the sysctl will return EINVAL. This successfully solves the divide by zero issue at the same time. Signed-off-by: David Rientjes Reported-by: Oleg Drokin Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index bd4b34c..4415aa9 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -702,7 +702,8 @@ The batch value of each per cpu pagelist is also updated as a result. It is set to pcp->high/4. The upper limit of batch is (PAGE_SHIFT * 8) The initial value is zero. Kernel does not use this value at boot time to set -the high water marks for each per cpu page list. +the high water marks for each per cpu page list. If the user writes '0' to this +sysctl, it will revert to this default behavior. 
============================================================== diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 7de6555..075d190 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -136,7 +136,6 @@ static unsigned long dirty_bytes_min = 2 * PAGE_SIZE; /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ static int maxolduid = 65535; static int minolduid; -static int min_percpu_pagelist_fract = 8; static int ngroups_max = NGROUPS_MAX; static const int cap_last_cap = CAP_LAST_CAP; @@ -1317,7 +1316,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(percpu_pagelist_fraction), .mode = 0644, .proc_handler = percpu_pagelist_fraction_sysctl_handler, - .extra1 = &min_percpu_pagelist_fract, + .extra1 = &zero, }, #ifdef CONFIG_MMU { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4f59fa2..20d17f8 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -69,6 +69,7 @@ /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */ static DEFINE_MUTEX(pcp_batch_high_lock); +#define MIN_PERCPU_PAGELIST_FRACTION (8) #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID DEFINE_PER_CPU(int, numa_node); @@ -4145,7 +4146,7 @@ static void __meminit zone_init_free_lists(struct zone *zone) memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY) #endif -static int __meminit zone_batchsize(struct zone *zone) +static int zone_batchsize(struct zone *zone) { #ifdef CONFIG_MMU int batch; @@ -4261,8 +4262,8 @@ static void pageset_set_high(struct per_cpu_pageset *p, pageset_update(&p->pcp, high, batch); } -static void __meminit pageset_set_high_and_batch(struct zone *zone, - struct per_cpu_pageset *pcp) +static void pageset_set_high_and_batch(struct zone *zone, + struct per_cpu_pageset *pcp) { if (percpu_pagelist_fraction) pageset_set_high(pcp, @@ -5881,23 +5882,38 @@ int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { struct zone *zone; - unsigned int cpu; + int old_percpu_pagelist_fraction; int ret; + mutex_lock(&pcp_batch_high_lock); + old_percpu_pagelist_fraction = percpu_pagelist_fraction; + ret = proc_dointvec_minmax(table, write, buffer, length, ppos); - if (!write || (ret < 0)) - return ret; + if (!write || ret < 0) + goto out; + + /* Sanity checking to avoid pcp imbalance */ + if (percpu_pagelist_fraction && + percpu_pagelist_fraction < MIN_PERCPU_PAGELIST_FRACTION) { + percpu_pagelist_fraction = old_percpu_pagelist_fraction; + ret = -EINVAL; + goto out; + } + + /* No change? */ + if (percpu_pagelist_fraction == old_percpu_pagelist_fraction) + goto out; - mutex_lock(&pcp_batch_high_lock); for_each_populated_zone(zone) { - unsigned long high; - high = zone->managed_pages / percpu_pagelist_fraction; + unsigned int cpu; + for_each_possible_cpu(cpu) - pageset_set_high(per_cpu_ptr(zone->pageset, cpu), - high); + pageset_set_high_and_batch(zone, + per_cpu_ptr(zone->pageset, cpu)); } +out: mutex_unlock(&pcp_batch_high_lock); - return 0; + return ret; } int hashdist = HASHDIST_DEFAULT; -- cgit v0.10.2 From b6226b45c66196e14ef628d3aead2139700db1ad Mon Sep 17 00:00:00 2001 From: Micky Ching Date: Mon, 23 Jun 2014 13:22:04 -0700 Subject: drivers/memstick/host/rtsx_pci_ms.c: add cancel_work when remove driver Add cancel_work_sync() in rtsx_pci_ms_drv_remove() to cancel pending request work when removing the driver. 
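The general shape of the pattern, with hypothetical driver names (a sketch of the idea,
not the rtsx_pci_ms code itself): mark the host as going away so new requests bail out,
then wait for any work already queued before tearing the rest down.

	#include <linux/platform_device.h>
	#include <linux/workqueue.h>

	struct mydev_host {			/* hypothetical driver state */
		bool eject;
		struct work_struct handle_req;
	};

	static int mydev_drv_remove(struct platform_device *pdev)
	{
		struct mydev_host *host = platform_get_drvdata(pdev);

		host->eject = true;		     /* new work bails out early */
		cancel_work_sync(&host->handle_req); /* wait for queued work */
		/* only now is it safe to free the host's resources */
		return 0;
	}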
Signed-off-by: Micky Ching Cc: Samuel Ortiz says: Cc: Maxim Levitsky Cc: Greg Kroah-Hartman Cc: Alex Dubov Cc: Roger Tseng Cc: Wei WANG Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/memstick/host/rtsx_pci_ms.c b/drivers/memstick/host/rtsx_pci_ms.c index 2a635b6..c880ba6 100644 --- a/drivers/memstick/host/rtsx_pci_ms.c +++ b/drivers/memstick/host/rtsx_pci_ms.c @@ -601,6 +601,7 @@ static int rtsx_pci_ms_drv_remove(struct platform_device *pdev) pcr->slots[RTSX_MS_CARD].card_event = NULL; msh = host->msh; host->eject = true; + cancel_work_sync(&host->handle_req); mutex_lock(&host->host_mutex); if (host->req) { -- cgit v0.10.2 From 88e15ce402c58f41037752da092683e90826742a Mon Sep 17 00:00:00 2001 From: Rickard Strandqvist Date: Mon, 23 Jun 2014 13:22:04 -0700 Subject: Documentation/accounting/getdelays.c: add missing null-terminate after strncpy call Added a guaranteed null-terminate after call to strncpy. This was partly found using a static code analysis program called cppcheck. Signed-off-by: Rickard Strandqvist Acked-by: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/Documentation/accounting/getdelays.c b/Documentation/accounting/getdelays.c index c6a06b7..f405780 100644 --- a/Documentation/accounting/getdelays.c +++ b/Documentation/accounting/getdelays.c @@ -314,6 +314,7 @@ int main(int argc, char *argv[]) break; case 'm': strncpy(cpumask, optarg, sizeof(cpumask)); + cpumask[sizeof(cpumask) - 1] = '\0'; maskset = 1; printf("cpumask %s maskset %d\n", cpumask, maskset); break; -- cgit v0.10.2 From f3aca3d09525f87731ba6b892c9b010570bc54b4 Mon Sep 17 00:00:00 2001 From: Aaron Tomlin Date: Mon, 23 Jun 2014 13:22:05 -0700 Subject: nmi: provide the option to issue an NMI back trace to every cpu but current Sometimes it is preferred not to use the trigger_all_cpu_backtrace() routine when one wants to avoid capturing a back trace for current. For instance if one was previously captured recently. This patch provides a new routine namely trigger_allbutself_cpu_backtrace() which offers the flexibility to issue an NMI to every cpu but current and capture a back trace accordingly. Patch x86 and sparc to support new routine. [dzickus@redhat.com: add stub in #else clause] [dzickus@redhat.com: don't print message in single processor case, wrap with get/put_cpu based on Oleg's suggestion] [sfr@canb.auug.org.au: undo C99ism] Signed-off-by: Aaron Tomlin Signed-off-by: Don Zickus Acked-by: David S. 
Miller Cc: Mateusz Guzik Cc: Oleg Nesterov Signed-off-by: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/sparc/include/asm/irq_64.h b/arch/sparc/include/asm/irq_64.h index 375cffc..91d2193 100644 --- a/arch/sparc/include/asm/irq_64.h +++ b/arch/sparc/include/asm/irq_64.h @@ -89,7 +89,7 @@ static inline unsigned long get_softint(void) return retval; } -void arch_trigger_all_cpu_backtrace(void); +void arch_trigger_all_cpu_backtrace(bool); #define arch_trigger_all_cpu_backtrace arch_trigger_all_cpu_backtrace extern void *hardirq_stack[NR_CPUS]; diff --git a/arch/sparc/kernel/process_64.c b/arch/sparc/kernel/process_64.c index b2988f2..027e099 100644 --- a/arch/sparc/kernel/process_64.c +++ b/arch/sparc/kernel/process_64.c @@ -239,7 +239,7 @@ static void __global_reg_poll(struct global_reg_snapshot *gp) } } -void arch_trigger_all_cpu_backtrace(void) +void arch_trigger_all_cpu_backtrace(bool include_self) { struct thread_info *tp = current_thread_info(); struct pt_regs *regs = get_irq_regs(); @@ -251,16 +251,22 @@ void arch_trigger_all_cpu_backtrace(void) spin_lock_irqsave(&global_cpu_snapshot_lock, flags); - memset(global_cpu_snapshot, 0, sizeof(global_cpu_snapshot)); - this_cpu = raw_smp_processor_id(); - __global_reg_self(tp, regs, this_cpu); + memset(global_cpu_snapshot, 0, sizeof(global_cpu_snapshot)); + + if (include_self) + __global_reg_self(tp, regs, this_cpu); smp_fetch_global_regs(); for_each_online_cpu(cpu) { - struct global_reg_snapshot *gp = &global_cpu_snapshot[cpu].reg; + struct global_reg_snapshot *gp; + + if (!include_self && cpu == this_cpu) + continue; + + gp = &global_cpu_snapshot[cpu].reg; __global_reg_poll(gp); @@ -292,7 +298,7 @@ void arch_trigger_all_cpu_backtrace(void) static void sysrq_handle_globreg(int key) { - arch_trigger_all_cpu_backtrace(); + arch_trigger_all_cpu_backtrace(true); } static struct sysrq_key_op sparc_globalreg_op = { diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h index cb6cfcd..a80cbb8 100644 --- a/arch/x86/include/asm/irq.h +++ b/arch/x86/include/asm/irq.h @@ -43,7 +43,7 @@ extern int vector_used_by_percpu_irq(unsigned int vector); extern void init_ISA_irqs(void); #ifdef CONFIG_X86_LOCAL_APIC -void arch_trigger_all_cpu_backtrace(void); +void arch_trigger_all_cpu_backtrace(bool); #define arch_trigger_all_cpu_backtrace arch_trigger_all_cpu_backtrace #endif diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index c3fcb5d..6a1e71b 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -33,31 +33,41 @@ static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly; /* "in progress" flag of arch_trigger_all_cpu_backtrace */ static unsigned long backtrace_flag; -void arch_trigger_all_cpu_backtrace(void) +void arch_trigger_all_cpu_backtrace(bool include_self) { int i; + int cpu = get_cpu(); - if (test_and_set_bit(0, &backtrace_flag)) + if (test_and_set_bit(0, &backtrace_flag)) { /* * If there is already a trigger_all_cpu_backtrace() in progress * (backtrace_flag == 1), don't output double cpu dump infos. */ + put_cpu(); return; + } cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask); + if (!include_self) + cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask)); - printk(KERN_INFO "sending NMI to all CPUs:\n"); - apic->send_IPI_all(NMI_VECTOR); + if (!cpumask_empty(to_cpumask(backtrace_mask))) { + pr_info("sending NMI to %s CPUs:\n", + (include_self ? 
"all" : "other")); + apic->send_IPI_mask(to_cpumask(backtrace_mask), NMI_VECTOR); + } /* Wait for up to 10 seconds for all CPUs to do the backtrace */ for (i = 0; i < 10 * 1000; i++) { if (cpumask_empty(to_cpumask(backtrace_mask))) break; mdelay(1); + touch_softlockup_watchdog(); } clear_bit(0, &backtrace_flag); smp_mb__after_atomic(); + put_cpu(); } static int diff --git a/include/linux/nmi.h b/include/linux/nmi.h index 6a45fb5..a17ab63 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -32,15 +32,24 @@ static inline void touch_nmi_watchdog(void) #ifdef arch_trigger_all_cpu_backtrace static inline bool trigger_all_cpu_backtrace(void) { - arch_trigger_all_cpu_backtrace(); + arch_trigger_all_cpu_backtrace(true); return true; } +static inline bool trigger_allbutself_cpu_backtrace(void) +{ + arch_trigger_all_cpu_backtrace(false); + return true; +} #else static inline bool trigger_all_cpu_backtrace(void) { return false; } +static inline bool trigger_allbutself_cpu_backtrace(void) +{ + return false; +} #endif #ifdef CONFIG_LOCKUP_DETECTOR -- cgit v0.10.2 From ed235875e2ca983197831337a986f0517074e1a0 Mon Sep 17 00:00:00 2001 From: Aaron Tomlin Date: Mon, 23 Jun 2014 13:22:05 -0700 Subject: kernel/watchdog.c: print traces for all cpus on lockup detection A 'softlockup' is defined as a bug that causes the kernel to loop in kernel mode for more than a predefined period to time, without giving other tasks a chance to run. Currently, upon detection of this condition by the per-cpu watchdog task, debug information (including a stack trace) is sent to the system log. On some occasions, we have observed that the "victim" rather than the actual "culprit" (i.e. the owner/holder of the contended resource) is reported to the user. Often this information has proven to be insufficient to assist debugging efforts. To avoid loss of useful debug information, for architectures which support NMI, this patch makes it possible to improve soft lockup reporting. This is accomplished by issuing an NMI to each cpu to obtain a stack trace. If NMI is not supported we just revert back to the old method. A sysctl and boot-time parameter is available to toggle this feature. [dzickus@redhat.com: add CONFIG_SMP in certain areas] [akpm@linux-foundation.org: additional CONFIG_SMP=n optimisations] [mq@suse.cz: fix warning] Signed-off-by: Aaron Tomlin Signed-off-by: Don Zickus Cc: David S. Miller Cc: Mateusz Guzik Cc: Oleg Nesterov Signed-off-by: Jan Moskyto Matejka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 8849049..c1b9aa8 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -3130,6 +3130,11 @@ bytes respectively. Such letter suffixes can also be entirely omitted. [KNL] Should the soft-lockup detector generate panics. Format: + softlockup_all_cpu_backtrace= + [KNL] Should the soft-lockup detector generate + backtraces on all cpus. 
+ Format: + sonypi.*= [HW] Sony Programmable I/O Control Device driver See Documentation/laptops/sonypi.txt diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index 708bb7f..c14374e 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -75,6 +75,7 @@ show up in /proc/sys/kernel: - shmall - shmmax [ sysv ipc ] - shmmni +- softlockup_all_cpu_backtrace - stop-a [ SPARC only ] - sysrq ==> Documentation/sysrq.txt - sysctl_writes_strict @@ -783,6 +784,22 @@ via the /proc/sys interface: ============================================================== +softlockup_all_cpu_backtrace: + +This value controls the soft lockup detector thread's behavior +when a soft lockup condition is detected as to whether or not +to gather further debug information. If enabled, each cpu will +be issued an NMI and instructed to capture stack trace. + +This feature is only applicable for architectures which support +NMI. + +0: do nothing. This is the default behavior. + +1: on detection capture more debug information. + +============================================================== + tainted: Non-zero if the kernel has been tainted. Numeric values, which diff --git a/include/linux/nmi.h b/include/linux/nmi.h index a17ab63..447775e 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -57,6 +57,7 @@ int hw_nmi_is_cpu_stuck(struct pt_regs *); u64 hw_nmi_get_sample_period(int watchdog_thresh); extern int watchdog_user_enabled; extern int watchdog_thresh; +extern int sysctl_softlockup_all_cpu_backtrace; struct ctl_table; extern int proc_dowatchdog(struct ctl_table *, int , void __user *, size_t *, loff_t *); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 075d190..75b22e2 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -860,6 +860,17 @@ static struct ctl_table kern_table[] = { .extra1 = &zero, .extra2 = &one, }, +#ifdef CONFIG_SMP + { + .procname = "softlockup_all_cpu_backtrace", + .data = &sysctl_softlockup_all_cpu_backtrace, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, +#endif /* CONFIG_SMP */ { .procname = "nmi_watchdog", .data = &watchdog_user_enabled, diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 30e4822..c3319bd 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -31,6 +31,12 @@ int watchdog_user_enabled = 1; int __read_mostly watchdog_thresh = 10; +#ifdef CONFIG_SMP +int __read_mostly sysctl_softlockup_all_cpu_backtrace; +#else +#define sysctl_softlockup_all_cpu_backtrace 0 +#endif + static int __read_mostly watchdog_running; static u64 __read_mostly sample_period; @@ -47,6 +53,7 @@ static DEFINE_PER_CPU(bool, watchdog_nmi_touch); static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); #endif +static unsigned long soft_lockup_nmi_warn; /* boot commands */ /* @@ -95,6 +102,15 @@ static int __init nosoftlockup_setup(char *str) } __setup("nosoftlockup", nosoftlockup_setup); /* */ +#ifdef CONFIG_SMP +static int __init softlockup_all_cpu_backtrace_setup(char *str) +{ + sysctl_softlockup_all_cpu_backtrace = + !!simple_strtol(str, NULL, 0); + return 1; +} +__setup("softlockup_all_cpu_backtrace=", softlockup_all_cpu_backtrace_setup); +#endif /* * Hard-lockup warnings should be triggered after just a few seconds. 
Soft- @@ -271,6 +287,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) unsigned long touch_ts = __this_cpu_read(watchdog_touch_ts); struct pt_regs *regs = get_irq_regs(); int duration; + int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace; /* kick the hardlockup detector */ watchdog_interrupt_count(); @@ -317,6 +334,17 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) if (__this_cpu_read(soft_watchdog_warn) == true) return HRTIMER_RESTART; + if (softlockup_all_cpu_backtrace) { + /* Prevent multiple soft-lockup reports if one cpu is already + * engaged in dumping cpu back traces + */ + if (test_and_set_bit(0, &soft_lockup_nmi_warn)) { + /* Someone else will report us. Let's give up */ + __this_cpu_write(soft_watchdog_warn, true); + return HRTIMER_RESTART; + } + } + printk(KERN_EMERG "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", smp_processor_id(), duration, current->comm, task_pid_nr(current)); @@ -327,6 +355,17 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) else dump_stack(); + if (softlockup_all_cpu_backtrace) { + /* Avoid generating two back traces for current + * given that one is already made above + */ + trigger_allbutself_cpu_backtrace(); + + clear_bit(0, &soft_lockup_nmi_warn); + /* Barrier to sync with other cpus */ + smp_mb__after_atomic(); + } + if (softlockup_panic) panic("softlockup: hung tasks"); __this_cpu_write(soft_watchdog_warn, true); -- cgit v0.10.2 From 5338a9372234f8b782c7d78f0355e1cb21d02468 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 23 Jun 2014 13:22:05 -0700 Subject: mm: thp: fix DEBUG_PAGEALLOC oops in copy_page_rep() Trinity has for over a year been reporting a CONFIG_DEBUG_PAGEALLOC oops in copy_page_rep() called from copy_user_huge_page() called from do_huge_pmd_wp_page(). I believe this is a DEBUG_PAGEALLOC false positive, due to the source page being split, and a tail page freed, while copy is in progress; and not a problem without DEBUG_PAGEALLOC, since the pmd_same() check will prevent a miscopy from being made visible. Fix by adding get_user_huge_page() and put_user_huge_page(): reducing to the usual get_page() and put_page() on head page in the usual config; but get and put references to all of the tail pages when DEBUG_PAGEALLOC. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Hugh Dickins Acked-by: Kirill A. Shutemov Reported-by: Sasha Levin Tested-by: Sasha Levin Cc: Dave Jones Cc: Andrea Arcangeli Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/huge_memory.c b/mm/huge_memory.c index e60837d..bade35e 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -941,6 +941,37 @@ unlock: spin_unlock(ptl); } +/* + * Save CONFIG_DEBUG_PAGEALLOC from faulting falsely on tail pages + * during copy_user_huge_page()'s copy_page_rep(): in the case when + * the source page gets split and a tail freed before copy completes. + * Called under pmd_lock of checked pmd, so safe from splitting itself. 
+ */ +static void get_user_huge_page(struct page *page) +{ + if (IS_ENABLED(CONFIG_DEBUG_PAGEALLOC)) { + struct page *endpage = page + HPAGE_PMD_NR; + + atomic_add(HPAGE_PMD_NR, &page->_count); + while (++page < endpage) + get_huge_page_tail(page); + } else { + get_page(page); + } +} + +static void put_user_huge_page(struct page *page) +{ + if (IS_ENABLED(CONFIG_DEBUG_PAGEALLOC)) { + struct page *endpage = page + HPAGE_PMD_NR; + + while (page < endpage) + put_page(page++); + } else { + put_page(page); + } +} + static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, @@ -1074,7 +1105,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, ret |= VM_FAULT_WRITE; goto out_unlock; } - get_page(page); + get_user_huge_page(page); spin_unlock(ptl); alloc: if (transparent_hugepage_enabled(vma) && @@ -1095,7 +1126,7 @@ alloc: split_huge_page(page); ret |= VM_FAULT_FALLBACK; } - put_page(page); + put_user_huge_page(page); } count_vm_event(THP_FAULT_FALLBACK); goto out; @@ -1105,7 +1136,7 @@ alloc: put_page(new_page); if (page) { split_huge_page(page); - put_page(page); + put_user_huge_page(page); } else split_huge_page_pmd(vma, address, pmd); ret |= VM_FAULT_FALLBACK; @@ -1127,7 +1158,7 @@ alloc: spin_lock(ptl); if (page) - put_page(page); + put_user_huge_page(page); if (unlikely(!pmd_same(*pmd, orig_pmd))) { spin_unlock(ptl); mem_cgroup_uncharge_page(new_page); -- cgit v0.10.2 From f72e7dcdd25229446b102e587ef2f826f76bff28 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 23 Jun 2014 13:22:05 -0700 Subject: mm: let mm_find_pmd fix buggy race with THP fault Trinity has reported: BUG: unable to handle kernel NULL pointer dereference at 0000000000000018 IP: __lock_acquire (kernel/locking/lockdep.c:3070 (discriminator 1)) CPU: 6 PID: 16173 Comm: trinity-c364 Tainted: G W 3.15.0-rc1-next-20140415-sasha-00020-gaa90d09 #398 lock_acquire (arch/x86/include/asm/current.h:14 kernel/locking/lockdep.c:3602) _raw_spin_lock (include/linux/spinlock_api_smp.h:143 kernel/locking/spinlock.c:151) remove_migration_pte (mm/migrate.c:137) rmap_walk (mm/rmap.c:1628 mm/rmap.c:1699) remove_migration_ptes (mm/migrate.c:224) migrate_pages (mm/migrate.c:922 mm/migrate.c:960 mm/migrate.c:1126) migrate_misplaced_page (mm/migrate.c:1733) __handle_mm_fault (mm/memory.c:3762 mm/memory.c:3812 mm/memory.c:3925) handle_mm_fault (mm/memory.c:3948) __get_user_pages (mm/memory.c:1851) __mlock_vma_pages_range (mm/mlock.c:255) __mm_populate (mm/mlock.c:711) SyS_mlockall (include/linux/mm.h:1799 mm/mlock.c:817 mm/mlock.c:791) I believe this comes about because, whereas collapsing and splitting THP functions take anon_vma lock in write mode (which excludes concurrent rmap walks), faulting THP functions (write protection and misplaced NUMA) do not - and mostly they do not need to. But they do use a pmdp_clear_flush(), set_pmd_at() sequence which, for an instant (indeed, for a long instant, given the inter-CPU TLB flush in there), leaves *pmd neither present not trans_huge. Which can confuse a concurrent rmap walk, as when removing migration ptes, seen in the dumped trace. Although that rmap walk has a 4k page to insert, anon_vmas containing THPs are in no way segregated from 4k-page anon_vmas, so the 4k-intent mm_find_pmd() does need to cope with that instant when a trans_huge pmd is temporarily absent. I don't think we need strengthen the locking at the THP end: it's easily handled with an ACCESS_ONCE() before testing both conditions. 
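In outline, the rmap-side check therefore takes one snapshot of the pmd and tests both
conditions on that snapshot, instead of dereferencing *pmd twice (the actual change to
mm_find_pmd() is in the mm/rmap.c hunk below):

	pmd_t pmde = ACCESS_ONCE(*pmd);	/* read once; *pmd may change under us */

	if (!pmd_present(pmde) || pmd_trans_huge(pmde))
		return NULL;		/* no genuine pte-level pmd here right now */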
And since mm_find_pmd() had only one caller who wanted a THP rather than a pmd, let's slightly repurpose it to fail when it hits a THP or non-present pmd, and open code split_huge_page_address() again. Signed-off-by: Hugh Dickins Reported-by: Sasha Levin Acked-by: Kirill A. Shutemov Cc: Konstantin Khlebnikov Cc: Mel Gorman Cc: Bob Liu Cc: Christoph Lameter Cc: Dave Jones Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/huge_memory.c b/mm/huge_memory.c index bade35e..33514d8 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2423,8 +2423,6 @@ static void collapse_huge_page(struct mm_struct *mm, pmd = mm_find_pmd(mm, address); if (!pmd) goto out; - if (pmd_trans_huge(*pmd)) - goto out; anon_vma_lock_write(vma->anon_vma); @@ -2523,8 +2521,6 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, pmd = mm_find_pmd(mm, address); if (!pmd) goto out; - if (pmd_trans_huge(*pmd)) - goto out; memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load)); pte = pte_offset_map_lock(mm, pmd, address, &ptl); @@ -2877,12 +2873,22 @@ void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address, static void split_huge_page_address(struct mm_struct *mm, unsigned long address) { + pgd_t *pgd; + pud_t *pud; pmd_t *pmd; VM_BUG_ON(!(address & ~HPAGE_PMD_MASK)); - pmd = mm_find_pmd(mm, address); - if (!pmd) + pgd = pgd_offset(mm, address); + if (!pgd_present(*pgd)) + return; + + pud = pud_offset(pgd, address); + if (!pud_present(*pud)) + return; + + pmd = pmd_offset(pud, address); + if (!pmd_present(*pmd)) return; /* * Caller holds the mmap_sem write mode, so a huge pmd cannot diff --git a/mm/ksm.c b/mm/ksm.c index 68710e8..346ddc9 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -945,7 +945,6 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, pmd = mm_find_pmd(mm, addr); if (!pmd) goto out; - BUG_ON(pmd_trans_huge(*pmd)); mmun_start = addr; mmun_end = addr + PAGE_SIZE; diff --git a/mm/migrate.c b/mm/migrate.c index 63f0cd5..9e0beaa 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -120,8 +120,6 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, pmd = mm_find_pmd(mm, addr); if (!pmd) goto out; - if (pmd_trans_huge(*pmd)) - goto out; ptep = pte_offset_map(pmd, addr); diff --git a/mm/rmap.c b/mm/rmap.c index bf05fc8..b7e94eb 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -569,6 +569,7 @@ pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address) pgd_t *pgd; pud_t *pud; pmd_t *pmd = NULL; + pmd_t pmde; pgd = pgd_offset(mm, address); if (!pgd_present(*pgd)) @@ -579,7 +580,13 @@ pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address) goto out; pmd = pmd_offset(pud, address); - if (!pmd_present(*pmd)) + /* + * Some THP functions use the sequence pmdp_clear_flush(), set_pmd_at() + * without holding anon_vma lock for write. So when looking for a + * genuine pmde (in which to find pte), test present and !THP together. 
+ */ + pmde = ACCESS_ONCE(*pmd); + if (!pmd_present(pmde) || pmd_trans_huge(pmde)) pmd = NULL; out: return pmd; @@ -615,9 +622,6 @@ pte_t *__page_check_address(struct page *page, struct mm_struct *mm, if (!pmd) return NULL; - if (pmd_trans_huge(*pmd)) - return NULL; - pte = pte_offset_map(pmd, address); /* Make a quick check before getting the lock */ if (!sync && !pte_present(*pte)) { -- cgit v0.10.2 From f00cdc6df7d7cfcabb5b740911e6788cb0802bdb Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 23 Jun 2014 13:22:06 -0700 Subject: shmem: fix faulting into a hole while it's punched Trinity finds that mmap access to a hole while it's punched from shmem can prevent the madvise(MADV_REMOVE) or fallocate(FALLOC_FL_PUNCH_HOLE) from completing, until the reader chooses to stop; with the puncher's hold on i_mutex locking out all other writers until it can complete. It appears that the tmpfs fault path is too light in comparison with its hole-punching path, lacking an i_data_sem to obstruct it; but we don't want to slow down the common case. Extend shmem_fallocate()'s existing range notification mechanism, so shmem_fault() can refrain from faulting pages into the hole while it's punched, waiting instead on i_mutex (when safe to sleep; or repeatedly faulting when not). [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Hugh Dickins Reported-by: Sasha Levin Tested-by: Sasha Levin Cc: Dave Jones Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/shmem.c b/mm/shmem.c index 91b912b..8f419cf 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -80,11 +80,12 @@ static struct vfsmount *shm_mnt; #define SHORT_SYMLINK_LEN 128 /* - * shmem_fallocate and shmem_writepage communicate via inode->i_private - * (with i_mutex making sure that it has only one user at a time): - * we would prefer not to enlarge the shmem inode just for that. + * shmem_fallocate communicates with shmem_fault or shmem_writepage via + * inode->i_private (with i_mutex making sure that it has only one user at + * a time): we would prefer not to enlarge the shmem inode just for that. */ struct shmem_falloc { + int mode; /* FALLOC_FL mode currently operating */ pgoff_t start; /* start of range currently being fallocated */ pgoff_t next; /* the next page offset to be fallocated */ pgoff_t nr_falloced; /* how many new pages have been fallocated */ @@ -759,6 +760,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) spin_lock(&inode->i_lock); shmem_falloc = inode->i_private; if (shmem_falloc && + !shmem_falloc->mode && index >= shmem_falloc->start && index < shmem_falloc->next) shmem_falloc->nr_unswapped++; @@ -1233,6 +1235,44 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) int error; int ret = VM_FAULT_LOCKED; + /* + * Trinity finds that probing a hole which tmpfs is punching can + * prevent the hole-punch from ever completing: which in turn + * locks writers out with its hold on i_mutex. So refrain from + * faulting pages into the hole while it's being punched, and + * wait on i_mutex to be released if vmf->flags permits. 
+ */ + if (unlikely(inode->i_private)) { + struct shmem_falloc *shmem_falloc; + + spin_lock(&inode->i_lock); + shmem_falloc = inode->i_private; + if (!shmem_falloc || + shmem_falloc->mode != FALLOC_FL_PUNCH_HOLE || + vmf->pgoff < shmem_falloc->start || + vmf->pgoff >= shmem_falloc->next) + shmem_falloc = NULL; + spin_unlock(&inode->i_lock); + /* + * i_lock has protected us from taking shmem_falloc seriously + * once return from shmem_fallocate() went back up that stack. + * i_lock does not serialize with i_mutex at all, but it does + * not matter if sometimes we wait unnecessarily, or sometimes + * miss out on waiting: we just need to make those cases rare. + */ + if (shmem_falloc) { + if ((vmf->flags & FAULT_FLAG_ALLOW_RETRY) && + !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT)) { + up_read(&vma->vm_mm->mmap_sem); + mutex_lock(&inode->i_mutex); + mutex_unlock(&inode->i_mutex); + return VM_FAULT_RETRY; + } + /* cond_resched? Leave that to GUP or return to user */ + return VM_FAULT_NOPAGE; + } + } + error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret); if (error) return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS); @@ -1729,18 +1769,26 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, mutex_lock(&inode->i_mutex); + shmem_falloc.mode = mode & ~FALLOC_FL_KEEP_SIZE; + if (mode & FALLOC_FL_PUNCH_HOLE) { struct address_space *mapping = file->f_mapping; loff_t unmap_start = round_up(offset, PAGE_SIZE); loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1; + shmem_falloc.start = unmap_start >> PAGE_SHIFT; + shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT; + spin_lock(&inode->i_lock); + inode->i_private = &shmem_falloc; + spin_unlock(&inode->i_lock); + if ((u64)unmap_end > (u64)unmap_start) unmap_mapping_range(mapping, unmap_start, 1 + unmap_end - unmap_start, 0); shmem_truncate_range(inode, offset, offset + len - 1); /* No need to unmap again: hole-punching leaves COWed pages */ error = 0; - goto out; + goto undone; } /* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */ -- cgit v0.10.2 From 03787301420376ae41fbaf4267f4a6253d152ac5 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Mon, 23 Jun 2014 13:22:06 -0700 Subject: slab: fix oops when reading /proc/slab_allocators Commit b1cb0982bdd6 ("change the management method of free objects of the slab") introduced a bug on slab leak detector ('/proc/slab_allocators'). This detector works like as following decription. 1. traverse all objects on all the slabs. 2. determine whether it is active or not. 3. if active, print who allocate this object. but that commit changed the way how to manage free objects, so the logic determining whether it is active or not is also changed. In before, we regard object in cpu caches as inactive one, but, with this commit, we mistakenly regard object in cpu caches as active one. This intoduces kernel oops if DEBUG_PAGEALLOC is enabled. If DEBUG_PAGEALLOC is enabled, kernel_map_pages() is used to detect who corrupt free memory in the slab. It unmaps page table mapping if object is free and map it if object is active. When slab leak detector check object in cpu caches, it mistakenly think this object active so try to access object memory to retrieve caller of allocation. At this point, page table mapping to this object doesn't exist, so oops occurs. Following is oops message reported from Dave. 
It blew up when something tried to read /proc/slab_allocators (Just cat it, and you should see the oops below) Oops: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC Modules linked in: [snip...] CPU: 1 PID: 9386 Comm: trinity-c33 Not tainted 3.14.0-rc5+ #131 task: ffff8801aa46e890 ti: ffff880076924000 task.ti: ffff880076924000 RIP: 0010:[] [] handle_slab+0x8a/0x180 RSP: 0018:ffff880076925de0 EFLAGS: 00010002 RAX: 0000000000001000 RBX: 0000000000000000 RCX: 000000005ce85ce7 RDX: ffffea00079be100 RSI: 0000000000001000 RDI: ffff880107458000 RBP: ffff880076925e18 R08: 0000000000000001 R09: 0000000000000000 R10: 0000000000000000 R11: 000000000000000f R12: ffff8801e6f84000 R13: ffffea00079be100 R14: ffff880107458000 R15: ffff88022bb8d2c0 FS: 00007fb769e45740(0000) GS:ffff88024d040000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffff8801e6f84ff8 CR3: 00000000a22db000 CR4: 00000000001407e0 DR0: 0000000002695000 DR1: 0000000002695000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000070602 Call Trace: leaks_show+0xce/0x240 seq_read+0x28e/0x490 proc_reg_read+0x3d/0x80 vfs_read+0x9b/0x160 SyS_read+0x58/0xb0 tracesys+0xd4/0xd9 Code: f5 00 00 00 0f 1f 44 00 00 48 63 c8 44 3b 0c 8a 0f 84 e3 00 00 00 83 c0 01 44 39 c0 72 eb 41 f6 47 1a 01 0f 84 e9 00 00 00 89 f0 <4d> 8b 4c 04 f8 4d 85 c9 0f 84 88 00 00 00 49 8b 7e 08 4d 8d 46 RIP handle_slab+0x8a/0x180 To fix the problem, I introduce an object status buffer on each slab. With this, we can track object status precisely, so slab leak detector would not access active object and no kernel oops would occur. Memory overhead caused by this fix is only imposed to CONFIG_DEBUG_SLAB_LEAK which is mainly used for debugging, so memory overhead isn't big problem. Signed-off-by: Joonsoo Kim Reported-by: Dave Jones Reported-by: Tetsuo Handa Reviewed-by: Vladimir Davydov Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/slab.c b/mm/slab.c index 9ca3b87..3070b92 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -386,6 +386,39 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp) #endif +#define OBJECT_FREE (0) +#define OBJECT_ACTIVE (1) + +#ifdef CONFIG_DEBUG_SLAB_LEAK + +static void set_obj_status(struct page *page, int idx, int val) +{ + int freelist_size; + char *status; + struct kmem_cache *cachep = page->slab_cache; + + freelist_size = cachep->num * sizeof(freelist_idx_t); + status = (char *)page->freelist + freelist_size; + status[idx] = val; +} + +static inline unsigned int get_obj_status(struct page *page, int idx) +{ + int freelist_size; + char *status; + struct kmem_cache *cachep = page->slab_cache; + + freelist_size = cachep->num * sizeof(freelist_idx_t); + status = (char *)page->freelist + freelist_size; + + return status[idx]; +} + +#else +static inline void set_obj_status(struct page *page, int idx, int val) {} + +#endif + /* * Do not go above this order unless 0 objects fit into the slab or * overridden on the command line. 
@@ -576,12 +609,30 @@ static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) return cachep->array[smp_processor_id()]; } +static size_t calculate_freelist_size(int nr_objs, size_t align) +{ + size_t freelist_size; + + freelist_size = nr_objs * sizeof(freelist_idx_t); + if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK)) + freelist_size += nr_objs * sizeof(char); + + if (align) + freelist_size = ALIGN(freelist_size, align); + + return freelist_size; +} + static int calculate_nr_objs(size_t slab_size, size_t buffer_size, size_t idx_size, size_t align) { int nr_objs; + size_t remained_size; size_t freelist_size; + int extra_space = 0; + if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK)) + extra_space = sizeof(char); /* * Ignore padding for the initial guess. The padding * is at most @align-1 bytes, and @buffer_size is at @@ -590,14 +641,15 @@ static int calculate_nr_objs(size_t slab_size, size_t buffer_size, * into the memory allocation when taking the padding * into account. */ - nr_objs = slab_size / (buffer_size + idx_size); + nr_objs = slab_size / (buffer_size + idx_size + extra_space); /* * This calculated number will be either the right * amount, or one greater than what we want. */ - freelist_size = slab_size - nr_objs * buffer_size; - if (freelist_size < ALIGN(nr_objs * idx_size, align)) + remained_size = slab_size - nr_objs * buffer_size; + freelist_size = calculate_freelist_size(nr_objs, align); + if (remained_size < freelist_size) nr_objs--; return nr_objs; @@ -635,7 +687,7 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size, } else { nr_objs = calculate_nr_objs(slab_size, buffer_size, sizeof(freelist_idx_t), align); - mgmt_size = ALIGN(nr_objs * sizeof(freelist_idx_t), align); + mgmt_size = calculate_freelist_size(nr_objs, align); } *num = nr_objs; *left_over = slab_size - nr_objs*buffer_size - mgmt_size; @@ -2041,13 +2093,16 @@ static size_t calculate_slab_order(struct kmem_cache *cachep, break; if (flags & CFLGS_OFF_SLAB) { + size_t freelist_size_per_obj = sizeof(freelist_idx_t); /* * Max number of objs-per-slab for caches which * use off-slab slabs. Needed to avoid a possible * looping condition in cache_grow(). */ + if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK)) + freelist_size_per_obj += sizeof(char); offslab_limit = size; - offslab_limit /= sizeof(freelist_idx_t); + offslab_limit /= freelist_size_per_obj; if (num > offslab_limit) break; @@ -2294,8 +2349,7 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) if (!cachep->num) return -E2BIG; - freelist_size = - ALIGN(cachep->num * sizeof(freelist_idx_t), cachep->align); + freelist_size = calculate_freelist_size(cachep->num, cachep->align); /* * If the slab has been placed off-slab, and we have enough space then @@ -2308,7 +2362,7 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) if (flags & CFLGS_OFF_SLAB) { /* really off slab. 
No need for manual alignment */ - freelist_size = cachep->num * sizeof(freelist_idx_t); + freelist_size = calculate_freelist_size(cachep->num, 0); #ifdef CONFIG_PAGE_POISONING /* If we're going to use the generic kernel_map_pages() @@ -2612,6 +2666,7 @@ static void cache_init_objs(struct kmem_cache *cachep, if (cachep->ctor) cachep->ctor(objp); #endif + set_obj_status(page, i, OBJECT_FREE); set_free_obj(page, i, i); } } @@ -2820,6 +2875,7 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, BUG_ON(objnr >= cachep->num); BUG_ON(objp != index_to_obj(cachep, page, objnr)); + set_obj_status(page, objnr, OBJECT_FREE); if (cachep->flags & SLAB_POISON) { #ifdef CONFIG_DEBUG_PAGEALLOC if ((cachep->size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) { @@ -2953,6 +3009,8 @@ static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep, static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, gfp_t flags, void *objp, unsigned long caller) { + struct page *page; + if (!objp) return objp; if (cachep->flags & SLAB_POISON) { @@ -2983,6 +3041,9 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, *dbg_redzone1(cachep, objp) = RED_ACTIVE; *dbg_redzone2(cachep, objp) = RED_ACTIVE; } + + page = virt_to_head_page(objp); + set_obj_status(page, obj_to_index(cachep, page, objp), OBJECT_ACTIVE); objp += obj_offset(cachep); if (cachep->ctor && cachep->flags & SLAB_POISON) cachep->ctor(objp); @@ -4219,21 +4280,12 @@ static void handle_slab(unsigned long *n, struct kmem_cache *c, struct page *page) { void *p; - int i, j; + int i; if (n[0] == n[1]) return; for (i = 0, p = page->s_mem; i < c->num; i++, p += c->size) { - bool active = true; - - for (j = page->active; j < c->num; j++) { - /* Skip freed item */ - if (get_free_obj(page, j) == i) { - active = false; - break; - } - } - if (!active) + if (get_obj_status(page, i) != OBJECT_ACTIVE) continue; if (!add_caller(n, (unsigned long)*dbg_userword(c, p))) -- cgit v0.10.2 From fe8eea4f4a3f299ef83ed090d5354698ebe4fda8 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Mon, 23 Jun 2014 13:22:07 -0700 Subject: DMA, CMA: fix possible memory leak We should free memory for bitmap when we find zone mismatch, otherwise this memory will leak. Additionally, I copy code comment from PPC KVM's CMA code to inform why we need to check zone mis-match. Signed-off-by: Joonsoo Kim Acked-by: Zhang Yanfei Reviewed-by: Michal Nazarewicz Reviewed-by: Aneesh Kumar K.V Acked-by: Minchan Kim Cc: "Aneesh Kumar K.V" Cc: Marek Szyprowski Cc: Michal Nazarewicz Cc: Paolo Bonzini Cc: Gleb Natapov Cc: Alexander Graf Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/base/dma-contiguous.c b/drivers/base/dma-contiguous.c index 83969f8..6467c91 100644 --- a/drivers/base/dma-contiguous.c +++ b/drivers/base/dma-contiguous.c @@ -176,14 +176,24 @@ static int __init cma_activate_area(struct cma *cma) base_pfn = pfn; for (j = pageblock_nr_pages; j; --j, pfn++) { WARN_ON_ONCE(!pfn_valid(pfn)); + /* + * alloc_contig_range requires the pfn range + * specified to be in the same zone. Make this + * simple by forcing the entire CMA resv range + * to be in the same zone. 
+ */ if (page_zone(pfn_to_page(pfn)) != zone) - return -EINVAL; + goto err; } init_cma_reserved_pageblock(pfn_to_page(base_pfn)); } while (--i); mutex_init(&cma->lock); return 0; + +err: + kfree(cma->bitmap); + return -EINVAL; } static struct cma cma_areas[MAX_CMA_AREAS]; -- cgit v0.10.2 From f9af420fc8208d3add2fe3198dc5d8215f5a81ba Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 23 Jun 2014 13:22:07 -0700 Subject: ia64: arch/ia64/include/uapi/asm/fcntl.h needs personality.h fs/notify/fanotify/fanotify_user.c: In function 'SYSC_fanotify_init': fs/notify/fanotify/fanotify_user.c:726: error: implicit declaration of function 'personality' fs/notify/fanotify/fanotify_user.c:726: error: 'PER_LINUX32' undeclared (first use in this function) fs/notify/fanotify/fanotify_user.c:726: error: (Each undeclared identifier is reported only once fs/notify/fanotify/fanotify_user.c:726: error: for each function it appears in.) Reported-by: Wu Fengguang Cc: Will Woods Cc: "Luck, Tony" Cc: [3.15.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/ia64/include/uapi/asm/fcntl.h b/arch/ia64/include/uapi/asm/fcntl.h index 1dd275d..7b48587 100644 --- a/arch/ia64/include/uapi/asm/fcntl.h +++ b/arch/ia64/include/uapi/asm/fcntl.h @@ -8,6 +8,7 @@ #define force_o_largefile() \ (personality(current->personality) != PER_LINUX32) +#include #include #endif /* _ASM_IA64_FCNTL_H */ -- cgit v0.10.2 From b43ae21bd1d8199df10548f3fc0d806052027f29 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Mon, 23 Jun 2014 13:22:07 -0700 Subject: checkpatch: reduce false positives when checking void function return statements The previous patch had a few too many false positives on styles that should be acceptable. Signed-off-by: Joe Perches Tested-by: Anish Bhatt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 010b18e..182be0f 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -3476,12 +3476,17 @@ sub process { } } -# unnecessary return in a void function? (a single leading tab, then return;) - if ($sline =~ /^\+\treturn\s*;\s*$/ && - $prevline =~ /^\+/) { +# unnecessary return in a void function +# at end-of-function, with the previous line a single leading tab, then return; +# and the line before that not a goto label target like "out:" + if ($sline =~ /^[ \+]}\s*$/ && + $prevline =~ /^\+\treturn\s*;\s*$/ && + $linenr >= 3 && + $lines[$linenr - 3] =~ /^[ +]/ && + $lines[$linenr - 3] !~ /^[ +]\s*$Ident\s*:/) { WARN("RETURN_VOID", - "void function return statements are not generally useful\n" . $herecurr); - } + "void function return statements are not generally useful\n" . $hereprev); + } # if statements using unnecessary parentheses - ie: if ((foo == bar)) if ($^V && $^V ge 5.10.0 && -- cgit v0.10.2 From d05f0cdcbe6388723f1900c549b4850360545201 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 23 Jun 2014 13:22:07 -0700 Subject: mm: fix crashes from mbind() merging vmas In v2.6.34 commit 9d8cebd4bcd7 ("mm: fix mbind vma merge problem") introduced vma merging to mbind(), but it should have also changed the convention of passing start vma from queue_pages_range() (formerly check_range()) to new_vma_page(): vma merging may have already freed that structure, resulting in BUG at mm/mempolicy.c:1738 and probably worse crashes. 
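For illustration only (not part of the patch): a minimal userspace sketch of the kind of sequence that exercises the vma-merging path in mbind(). It assumes libnuma's <numaif.h> is installed and NUMA node 0 is online, and is built with cc -o mbind-merge mbind-merge.c -lnuma; on a multi-node machine the MPOL_MF_MOVE pass also drives the migration callback that this patch reworks.

/*
 * Illustration only, not part of the patch: create two adjacent anonymous
 * vmas with different policies, then rebind the whole range with
 * MPOL_MF_MOVE so mbind() merges them again -- the situation in which
 * queue_pages_range()/new_page() can no longer rely on the pre-merge
 * start vma.  Assumes libnuma's <numaif.h> and an online node 0.
 */
#define _GNU_SOURCE
#include <numaif.h>
#include <sys/mman.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);
	size_t len = 16 * page;
	unsigned long nodemask = 1;		/* node 0 only (assumption) */
	char *p;

	p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	memset(p, 0xaa, len);			/* fault the pages in */

	/* Binding only the second half splits the region into two vmas
	 * (the first half keeps the default policy). */
	if (mbind(p + len / 2, len / 2, MPOL_BIND, &nodemask,
		  sizeof(nodemask) * 8, 0))
		perror("mbind (split)");

	/* A uniform policy over the whole range, with MPOL_MF_MOVE, merges
	 * the vmas back together and queues any misplaced pages. */
	if (mbind(p, len, MPOL_BIND, &nodemask, sizeof(nodemask) * 8,
		  MPOL_MF_MOVE))
		perror("mbind (merge + move)");

	munmap(p, len);
	return 0;
}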
Fixes: 9d8cebd4bcd7 ("mm: fix mbind vma merge problem") Reported-by: Naoya Horiguchi Tested-by: Naoya Horiguchi Signed-off-by: Hugh Dickins Acked-by: Christoph Lameter Cc: KOSAKI Motohiro Cc: Minchan Kim Cc: [2.6.34+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 2849742..eb58de1 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -656,19 +656,18 @@ static unsigned long change_prot_numa(struct vm_area_struct *vma, * @nodes and @flags,) it's isolated and queued to the pagelist which is * passed via @private.) */ -static struct vm_area_struct * +static int queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end, const nodemask_t *nodes, unsigned long flags, void *private) { - int err; - struct vm_area_struct *first, *vma, *prev; - + int err = 0; + struct vm_area_struct *vma, *prev; - first = find_vma(mm, start); - if (!first) - return ERR_PTR(-EFAULT); + vma = find_vma(mm, start); + if (!vma) + return -EFAULT; prev = NULL; - for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) { + for (; vma && vma->vm_start < end; vma = vma->vm_next) { unsigned long endvma = vma->vm_end; if (endvma > end) @@ -678,9 +677,9 @@ queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end, if (!(flags & MPOL_MF_DISCONTIG_OK)) { if (!vma->vm_next && vma->vm_end < end) - return ERR_PTR(-EFAULT); + return -EFAULT; if (prev && prev->vm_end < vma->vm_start) - return ERR_PTR(-EFAULT); + return -EFAULT; } if (flags & MPOL_MF_LAZY) { @@ -694,15 +693,13 @@ queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end, err = queue_pages_pgd_range(vma, start, endvma, nodes, flags, private); - if (err) { - first = ERR_PTR(err); + if (err) break; - } } next: prev = vma; } - return first; + return err; } /* @@ -1156,16 +1153,17 @@ out: /* * Allocate a new page for page migration based on vma policy. - * Start assuming that page is mapped by vma pointed to by @private. + * Start by assuming the page is mapped by the same vma as contains @start. * Search forward from there, if not. N.B., this assumes that the * list of pages handed to migrate_pages()--which is how we get here-- * is in virtual address order. */ -static struct page *new_vma_page(struct page *page, unsigned long private, int **x) +static struct page *new_page(struct page *page, unsigned long start, int **x) { - struct vm_area_struct *vma = (struct vm_area_struct *)private; + struct vm_area_struct *vma; unsigned long uninitialized_var(address); + vma = find_vma(current->mm, start); while (vma) { address = page_address_in_vma(page, vma); if (address != -EFAULT) @@ -1195,7 +1193,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, return -ENOSYS; } -static struct page *new_vma_page(struct page *page, unsigned long private, int **x) +static struct page *new_page(struct page *page, unsigned long start, int **x) { return NULL; } @@ -1205,7 +1203,6 @@ static long do_mbind(unsigned long start, unsigned long len, unsigned short mode, unsigned short mode_flags, nodemask_t *nmask, unsigned long flags) { - struct vm_area_struct *vma; struct mm_struct *mm = current->mm; struct mempolicy *new; unsigned long end; @@ -1271,11 +1268,9 @@ static long do_mbind(unsigned long start, unsigned long len, if (err) goto mpol_out; - vma = queue_pages_range(mm, start, end, nmask, + err = queue_pages_range(mm, start, end, nmask, flags | MPOL_MF_INVERT, &pagelist); - - err = PTR_ERR(vma); /* maybe ... 
*/ - if (!IS_ERR(vma)) + if (!err) err = mbind_range(mm, start, end, new); if (!err) { @@ -1283,9 +1278,8 @@ static long do_mbind(unsigned long start, unsigned long len, if (!list_empty(&pagelist)) { WARN_ON_ONCE(flags & MPOL_MF_LAZY); - nr_failed = migrate_pages(&pagelist, new_vma_page, - NULL, (unsigned long)vma, - MIGRATE_SYNC, MR_MEMPOLICY_MBIND); + nr_failed = migrate_pages(&pagelist, new_page, NULL, + start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND); if (nr_failed) putback_movable_pages(&pagelist); } -- cgit v0.10.2 From 5fb1beb069dcf616e88be9d330b86878a79aa4ff Mon Sep 17 00:00:00 2001 From: alex chen Date: Mon, 23 Jun 2014 13:22:07 -0700 Subject: ocfs2: should add inode into orphan dir after updating entry in ocfs2_rename() There are two files a and b in dir /mnt/ocfs2. node A node B mv a b In ocfs2_rename(), after calling ocfs2_orphan_add(), the inode of file b will be added into orphan dir. If ocfs2_update_entry() fails, ocfs2_rename return error and mv operation fails. But file b still exists in the parent dir. ocfs2_queue_orphan_scan -> ocfs2_queue_recovery_completion -> ocfs2_complete_recovery -> ocfs2_recover_orphans The inode of the file b will be put with iput(). ocfs2_evict_inode -> ocfs2_delete_inode -> ocfs2_wipe_inode -> ocfs2_remove_inode OCFS2_VALID_FL in the inode i_flags will be cleared. The file b still can be accessed on node B. ls /mnt/ocfs2 When first read the file b with ocfs2_read_inode_block(). It will validate the inode using ocfs2_validate_inode_block(). Because OCFS2_VALID_FL not set in the inode i_flags, so the file system will be readonly. So we should add inode into orphan dir after updating entry in ocfs2_rename(). Signed-off-by: alex.chen Reviewed-by: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 2060fc3..5819bb5 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -1098,6 +1098,7 @@ static int ocfs2_rename(struct inode *old_dir, struct ocfs2_dir_lookup_result old_entry_lookup = { NULL, }; struct ocfs2_dir_lookup_result orphan_insert = { NULL, }; struct ocfs2_dir_lookup_result target_insert = { NULL, }; + bool should_add_orphan = false; /* At some point it might be nice to break this function up a * bit. 
*/ @@ -1304,6 +1305,7 @@ static int ocfs2_rename(struct inode *old_dir, mlog_errno(status); goto bail; } + should_add_orphan = true; } } else { BUG_ON(new_dentry->d_parent->d_inode != new_dir); @@ -1348,17 +1350,6 @@ static int ocfs2_rename(struct inode *old_dir, goto bail; } - if (S_ISDIR(new_inode->i_mode) || - (ocfs2_read_links_count(newfe) == 1)) { - status = ocfs2_orphan_add(osb, handle, new_inode, - newfe_bh, orphan_name, - &orphan_insert, orphan_dir); - if (status < 0) { - mlog_errno(status); - goto bail; - } - } - /* change the dirent to point to the correct inode */ status = ocfs2_update_entry(new_dir, handle, &target_lookup_res, old_inode); @@ -1373,6 +1364,15 @@ static int ocfs2_rename(struct inode *old_dir, else ocfs2_add_links_count(newfe, -1); ocfs2_journal_dirty(handle, newfe_bh); + if (should_add_orphan) { + status = ocfs2_orphan_add(osb, handle, new_inode, + newfe_bh, orphan_name, + &orphan_insert, orphan_dir); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } } else { /* if the name was not found in new_dir, add it now */ status = ocfs2_add_entry(handle, new_dentry, old_inode, -- cgit v0.10.2 From 27bf6305cf85cb474295c300c99cc3d10d68f50a Mon Sep 17 00:00:00 2001 From: Tariq Saeed Date: Mon, 23 Jun 2014 13:22:08 -0700 Subject: ocfs2: fix deadlock when two nodes are converting same lock from PR to EX and idletimeout closes conn Orabug: 18639535 Two node cluster and both nodes hold a lock at PR level and both want to convert to EX at the same time. Master node 1 has sent BAST and then closes the connection due to idletime out. Node 0 receives BAST, sends unlock req with cancel flag but gets error -ENOTCONN. The problem is this error is ignored in dlm_send_remote_unlock_request() on the **incorrect** assumption that the master is dead. See NOTE in comment why it returns DLM_NORMAL. Upon getting DLM_NORMAL, node 0 proceeds to sends convert (without cancel flg) which fails with -ENOTCONN. waits 5 sec and resends. This time gets DLM_IVLOCKID from the master since lock not found in grant, it had been moved to converting queue in response to conv PR->EX req. No way out. Node 1 (master) Node 0 ============== ====== lock mode PR PR convert PR -> EX mv grant -> convert and que BAST ... <-------- convert PR -> EX convert que looks like this: ((node 1, PR -> EX) (node 0, PR -> EX)) ... BAST (want PR -> NL) ------------------> ... idle timout, conn closed ... In response to BAST, sends unlock with cancel convert flag gets -ENOTCONN. Ignores and sends remote convert request gets -ENOTCONN, waits 5 Sec, retries ... reconnects <----------------- convert req goes through on next try does not find lock on grant que status DLM_IVLOCKID ------------------> ... No way out. Fix is to keep retrying unlock with cancel flag until it succeeds or the master dies. Signed-off-by: Tariq Saeed Reviewed-by: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c index 5698b52..2e3c9db 100644 --- a/fs/ocfs2/dlm/dlmunlock.c +++ b/fs/ocfs2/dlm/dlmunlock.c @@ -191,7 +191,9 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm, DLM_UNLOCK_CLEAR_CONVERT_TYPE); } else if (status == DLM_RECOVERING || status == DLM_MIGRATING || - status == DLM_FORWARD) { + status == DLM_FORWARD || + status == DLM_NOLOCKMGR + ) { /* must clear the actions because this unlock * is about to be retried. cannot free or do * any list manipulation. 
*/ @@ -200,7 +202,8 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm, res->lockname.name, status==DLM_RECOVERING?"recovering": (status==DLM_MIGRATING?"migrating": - "forward")); + (status == DLM_FORWARD ? "forward" : + "nolockmanager"))); actions = 0; } if (flags & LKM_CANCEL) @@ -364,7 +367,10 @@ static enum dlm_status dlm_send_remote_unlock_request(struct dlm_ctxt *dlm, * updated state to the recovery master. this thread * just needs to finish out the operation and call * the unlockast. */ - ret = DLM_NORMAL; + if (dlm_is_node_dead(dlm, owner)) + ret = DLM_NORMAL; + else + ret = DLM_NOLOCKMGR; } else { /* something bad. this will BUG in ocfs2 */ ret = dlm_err_to_dlm_status(tmpret); @@ -638,7 +644,9 @@ retry: if (status == DLM_RECOVERING || status == DLM_MIGRATING || - status == DLM_FORWARD) { + status == DLM_FORWARD || + status == DLM_NOLOCKMGR) { + /* We want to go away for a tiny bit to allow recovery * / migration to complete on this resource. I don't * know of any wait queue we could sleep on as this @@ -650,7 +658,7 @@ retry: msleep(50); mlog(0, "retrying unlock due to pending recovery/" - "migration/in-progress\n"); + "migration/in-progress/reconnect\n"); goto retry; } -- cgit v0.10.2 From b253bfd87866a38e11baf9b88c9d54c534cd70cd Mon Sep 17 00:00:00 2001 From: Xue jiufei Date: Mon, 23 Jun 2014 13:22:08 -0700 Subject: ocfs2: revert "ocfs2: fix NULL pointer dereference when dismount and ocfs2rec simultaneously" 75f82eaa502c ("ocfs2: fix NULL pointer dereference when dismount and ocfs2rec simultaneously") may cause umount hang while shutting down truncate log. The situation is as followes: ocfs2_dismout_volume -> ocfs2_recovery_exit -> free osb->recovery_map -> ocfs2_truncate_shutdown -> lock global bitmap inode -> ocfs2_wait_for_recovery -> check whether osb->recovery_map->rm_used is zero Because osb->recovery_map is already freed, rm_used can be any other values, so it may yield umount hang. Signed-off-by: joyce.xue Reviewed-by: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index c7a89cea..ddb662b 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1925,15 +1925,11 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) ocfs2_shutdown_local_alloc(osb); + ocfs2_truncate_log_shutdown(osb); + /* This will disable recovery and flush any recovery work. */ ocfs2_recovery_exit(osb); - /* - * During dismount, when it recovers another node it will call - * ocfs2_recover_orphans and queue delayed work osb_truncate_log_wq. 
- */ - ocfs2_truncate_log_shutdown(osb); - ocfs2_journal_shutdown(osb); ocfs2_sync_blockdev(sb); -- cgit v0.10.2 From 8a8ad1c2f6ef6fbee7487ec293218f42e72ad6a3 Mon Sep 17 00:00:00 2001 From: Wengang Wang Date: Mon, 23 Jun 2014 13:22:08 -0700 Subject: ocfs2: refcount: take rw_lock in ocfs2_reflink This patch tries to fix this crash: #5 [ffff88003c1cd690] do_invalid_op at ffffffff810166d5 #6 [ffff88003c1cd730] invalid_op at ffffffff8159b2de [exception RIP: ocfs2_direct_IO_get_blocks+359] RIP: ffffffffa05dfa27 RSP: ffff88003c1cd7e8 RFLAGS: 00010202 RAX: 0000000000000000 RBX: ffff88003c1cdaa8 RCX: 0000000000000000 RDX: 000000000000000c RSI: ffff880027a95000 RDI: ffff88003c79b540 RBP: ffff88003c1cd858 R8: 0000000000000000 R9: ffffffff815f6ba0 R10: 00000000000001c9 R11: 00000000000001c9 R12: ffff88002d271500 R13: 0000000000000001 R14: 0000000000000000 R15: 0000000000001000 ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018 #7 [ffff88003c1cd860] do_direct_IO at ffffffff811cd31b #8 [ffff88003c1cd950] direct_IO_iovec at ffffffff811cde9c #9 [ffff88003c1cd9b0] do_blockdev_direct_IO at ffffffff811ce764 #10 [ffff88003c1cdb80] __blockdev_direct_IO at ffffffff811ce7cc #11 [ffff88003c1cdbb0] ocfs2_direct_IO at ffffffffa05df756 [ocfs2] #12 [ffff88003c1cdbe0] generic_file_direct_write_iter at ffffffff8112f935 #13 [ffff88003c1cdc40] ocfs2_file_write_iter at ffffffffa0600ccc [ocfs2] #14 [ffff88003c1cdd50] do_aio_write at ffffffff8119126c #15 [ffff88003c1cddc0] aio_rw_vect_retry at ffffffff811d9bb4 #16 [ffff88003c1cddf0] aio_run_iocb at ffffffff811db880 #17 [ffff88003c1cde30] io_submit_one at ffffffff811dc238 #18 [ffff88003c1cde80] do_io_submit at ffffffff811dc437 #19 [ffff88003c1cdf70] sys_io_submit at ffffffff811dc530 #20 [ffff88003c1cdf80] system_call_fastpath at ffffffff8159a159 It crashes at BUG_ON(create && (ext_flags & OCFS2_EXT_REFCOUNTED)); in ocfs2_direct_IO_get_blocks. ocfs2_direct_IO_get_blocks is expecting the OCFS2_EXT_REFCOUNTED be removed in ocfs2_prepare_inode_for_write() if it was there. But no cluster lock is taken during the time before (or inside) ocfs2_prepare_inode_for_write() and after ocfs2_direct_IO_get_blocks(). It can happen in this case: Node A(which crashes) Node B ------------------------ --------------------------- ocfs2_file_aio_write ocfs2_prepare_inode_for_write ocfs2_inode_lock ... ocfs2_inode_unlock #no refcount found .... ocfs2_reflink ocfs2_inode_lock ... ocfs2_inode_unlock #now, refcount flag set on extent ... flush change to disk ocfs2_direct_IO_get_blocks ocfs2_get_clusters #extent map miss #buffer_head miss read extents from disk found refcount flag on extent crash.. 
Fix: Take rw_lock in ocfs2_reflink path Signed-off-by: Wengang Wang Reviewed-by: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 714e53b..636aab6 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -4288,9 +4288,16 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, goto out; } + error = ocfs2_rw_lock(inode, 1); + if (error) { + mlog_errno(error); + goto out; + } + error = ocfs2_inode_lock(inode, &old_bh, 1); if (error) { mlog_errno(error); + ocfs2_rw_unlock(inode, 1); goto out; } @@ -4302,6 +4309,7 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, up_write(&OCFS2_I(inode)->ip_xattr_sem); ocfs2_inode_unlock(inode, 1); + ocfs2_rw_unlock(inode, 1); brelse(old_bh); if (error) { -- cgit v0.10.2 From a270c6d3c0d7ba914bd82da34152d1102920d805 Mon Sep 17 00:00:00 2001 From: Xue jiufei Date: Mon, 23 Jun 2014 13:22:08 -0700 Subject: ocfs2/dlm: fix misuse of list_move_tail() in dlm_run_purge_list() When a lockres in purge list but is still in use, it should be moved to the tail of purge list. dlm_thread will continue to check next lockres in purge list. However, code list_move_tail(&dlm->purge_list, &lockres->purge) will do *no* movements, so dlm_thread will purge the same lockres in this loop again and again. If it is in use for a long time, other lockres will not be processed. Signed-off-by: Yiwen Jiang Signed-off-by: joyce.xue Reviewed-by: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c index 9db869d..cf53a82 100644 --- a/fs/ocfs2/dlm/dlmthread.c +++ b/fs/ocfs2/dlm/dlmthread.c @@ -264,7 +264,7 @@ static void dlm_run_purge_list(struct dlm_ctxt *dlm, "used %d, state %d\n", dlm->name, lockres->lockname.len, lockres->lockname.name, !unused, lockres->state); - list_move_tail(&dlm->purge_list, &lockres->purge); + list_move_tail(&lockres->purge, &dlm->purge_list); spin_unlock(&lockres->spinlock); continue; } -- cgit v0.10.2 From f7a14f32e7e1e7e025d88e7b4c8e3cc99770f756 Mon Sep 17 00:00:00 2001 From: Yiwen Jiang Date: Mon, 23 Jun 2014 13:22:09 -0700 Subject: ocfs2: fix a tiny race when running dirop_fileop_racer When running dirop_fileop_racer we found a dead lock case. 2 nodes, say Node A and Node B, mount the same ocfs2 volume. Create /race/16/1 in the filesystem, and let the inode number of dir 16 is less than the inode number of dir race. Node A Node B mv /race/16/1 /race/ right after Node A has got the EX mode of /race/16/, and tries to get EX mode of /race ls /race/16/ In this case, Node A has got the EX mode of /race/16/, and wants to get EX mode of /race/. Node B has got the PR mode of /race/, and wants to get the PR mode of /race/16/. Since EX and PR are mutually exclusive, dead lock happens. This patch fixes this case by locking in ancestor order before trying inode number order. 
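For illustration only (not part of the patch): a minimal sketch of the ordering rule in isolation, using hypothetical node and lock types rather than ocfs2 structures -- take the ancestor's lock first, and fall back to lower-id-first only when neither object is an ancestor of the other. Build with cc -pthread.

/*
 * Illustration only, not ocfs2 code: the double-lock ordering rule the
 * patch adopts, with hypothetical types.  An ancestor is always locked
 * before its descendant; otherwise the lower id goes first, as before.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct node {
	unsigned long id;		/* stands in for ip_blkno */
	struct node *parent;		/* stands in for the ".." walk */
	pthread_mutex_t lock;
};

/* Bounded parent walk, like the patch's MAX_LOOKUP_TIMES loop. */
static bool is_ancestor(const struct node *maybe_ancestor, const struct node *n)
{
	int hops;

	for (hops = 0, n = n->parent; n && hops < 32; n = n->parent, hops++)
		if (n == maybe_ancestor)
			return true;
	return false;
}

static void double_lock(struct node *a, struct node *b)
{
	struct node *first = a, *second = b;

	if (a == b) {
		pthread_mutex_lock(&a->lock);
		return;
	}
	/* Ancestor first; otherwise the lower id first. */
	if (is_ancestor(b, a) || (!is_ancestor(a, b) && b->id < a->id)) {
		first = b;
		second = a;
	}
	pthread_mutex_lock(&first->lock);
	pthread_mutex_lock(&second->lock);
}

int main(void)
{
	struct node race = { .id = 20, .parent = NULL,
			     .lock = PTHREAD_MUTEX_INITIALIZER };
	struct node dir16 = { .id = 16, .parent = &race,
			      .lock = PTHREAD_MUTEX_INITIALIZER };

	/* Even though dir16 has the lower id, its ancestor is locked first. */
	double_lock(&dir16, &race);
	pthread_mutex_unlock(&dir16.lock);
	pthread_mutex_unlock(&race.lock);
	printf("ancestor locked first\n");
	return 0;
}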
Signed-off-by: Yiwen Jiang Signed-off-by: Joseph Qi Cc: Joel Becker Reviewed-by: Mark Fasheh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 5819bb5..879f9f3 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -991,6 +991,65 @@ leave: return status; } +static int ocfs2_check_if_ancestor(struct ocfs2_super *osb, + u64 src_inode_no, u64 dest_inode_no) +{ + int ret = 0, i = 0; + u64 parent_inode_no = 0; + u64 child_inode_no = src_inode_no; + struct inode *child_inode; + +#define MAX_LOOKUP_TIMES 32 + while (1) { + child_inode = ocfs2_iget(osb, child_inode_no, 0, 0); + if (IS_ERR(child_inode)) { + ret = PTR_ERR(child_inode); + break; + } + + ret = ocfs2_inode_lock(child_inode, NULL, 0); + if (ret < 0) { + iput(child_inode); + if (ret != -ENOENT) + mlog_errno(ret); + break; + } + + ret = ocfs2_lookup_ino_from_name(child_inode, "..", 2, + &parent_inode_no); + ocfs2_inode_unlock(child_inode, 0); + iput(child_inode); + if (ret < 0) { + ret = -ENOENT; + break; + } + + if (parent_inode_no == dest_inode_no) { + ret = 1; + break; + } + + if (parent_inode_no == osb->root_inode->i_ino) { + ret = 0; + break; + } + + child_inode_no = parent_inode_no; + + if (++i >= MAX_LOOKUP_TIMES) { + mlog(ML_NOTICE, "max lookup times reached, filesystem " + "may have nested directories, " + "src inode: %llu, dest inode: %llu.\n", + (unsigned long long)src_inode_no, + (unsigned long long)dest_inode_no); + ret = 0; + break; + } + } + + return ret; +} + /* * The only place this should be used is rename! * if they have the same id, then the 1st one is the only one locked. @@ -1002,6 +1061,7 @@ static int ocfs2_double_lock(struct ocfs2_super *osb, struct inode *inode2) { int status; + int inode1_is_ancestor, inode2_is_ancestor; struct ocfs2_inode_info *oi1 = OCFS2_I(inode1); struct ocfs2_inode_info *oi2 = OCFS2_I(inode2); struct buffer_head **tmpbh; @@ -1015,9 +1075,26 @@ static int ocfs2_double_lock(struct ocfs2_super *osb, if (*bh2) *bh2 = NULL; - /* we always want to lock the one with the lower lockid first. */ + /* we always want to lock the one with the lower lockid first. + * and if they are nested, we lock ancestor first */ if (oi1->ip_blkno != oi2->ip_blkno) { - if (oi1->ip_blkno < oi2->ip_blkno) { + inode1_is_ancestor = ocfs2_check_if_ancestor(osb, oi2->ip_blkno, + oi1->ip_blkno); + if (inode1_is_ancestor < 0) { + status = inode1_is_ancestor; + goto bail; + } + + inode2_is_ancestor = ocfs2_check_if_ancestor(osb, oi1->ip_blkno, + oi2->ip_blkno); + if (inode2_is_ancestor < 0) { + status = inode2_is_ancestor; + goto bail; + } + + if ((inode1_is_ancestor == 1) || + (oi1->ip_blkno < oi2->ip_blkno && + inode2_is_ancestor == 0)) { /* switch id1 and id2 around */ tmpbh = bh2; bh2 = bh1; @@ -1135,6 +1212,21 @@ static int ocfs2_rename(struct inode *old_dir, goto bail; } rename_lock = 1; + + /* here we cannot guarantee the inodes haven't just been + * changed, so check if they are nested again */ + status = ocfs2_check_if_ancestor(osb, new_dir->i_ino, + old_inode->i_ino); + if (status < 0) { + mlog_errno(status); + goto bail; + } else if (status == 1) { + status = -EPERM; + trace_ocfs2_rename_not_permitted( + (unsigned long long)old_inode->i_ino, + (unsigned long long)new_dir->i_ino); + goto bail; + } } /* if old and new are the same, this'll just do one lock. 
*/ diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h index 1b60c62..6cb019b 100644 --- a/fs/ocfs2/ocfs2_trace.h +++ b/fs/ocfs2/ocfs2_trace.h @@ -2292,6 +2292,8 @@ TRACE_EVENT(ocfs2_rename, __entry->new_len, __get_str(new_name)) ); +DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_rename_not_permitted); + TRACE_EVENT(ocfs2_rename_target_exists, TP_PROTO(int new_len, const char *new_name), TP_ARGS(new_len, new_name), -- cgit v0.10.2 From 595297a8f9bc27f8f4a537fd57ed6d0a2060fb46 Mon Sep 17 00:00:00 2001 From: jiangyiwen Date: Mon, 23 Jun 2014 13:22:09 -0700 Subject: ocfs2: manually do the iput once ocfs2_add_entry failed in ocfs2_symlink and ocfs2_mknod When the call to ocfs2_add_entry() failed in ocfs2_symlink() and ocfs2_mknod(), iput() will not be called during dput(dentry) because no d_instantiate(), and this will lead to umount hung. Signed-off-by: jiangyiwen Cc: Joel Becker Reviewed-by: Mark Fasheh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 879f9f3..8add6f1 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -205,6 +205,21 @@ static struct inode *ocfs2_get_init_inode(struct inode *dir, umode_t mode) return inode; } +static void ocfs2_cleanup_add_entry_failure(struct ocfs2_super *osb, + struct dentry *dentry, struct inode *inode) +{ + struct ocfs2_dentry_lock *dl = dentry->d_fsdata; + + ocfs2_simple_drop_lockres(osb, &dl->dl_lockres); + ocfs2_lock_res_free(&dl->dl_lockres); + BUG_ON(dl->dl_count != 1); + spin_lock(&dentry_attach_lock); + dentry->d_fsdata = NULL; + spin_unlock(&dentry_attach_lock); + kfree(dl); + iput(inode); +} + static int ocfs2_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, @@ -231,6 +246,7 @@ static int ocfs2_mknod(struct inode *dir, sigset_t oldset; int did_block_signals = 0; struct posix_acl *default_acl = NULL, *acl = NULL; + struct ocfs2_dentry_lock *dl = NULL; trace_ocfs2_mknod(dir, dentry, dentry->d_name.len, dentry->d_name.name, (unsigned long long)OCFS2_I(dir)->ip_blkno, @@ -423,6 +439,8 @@ static int ocfs2_mknod(struct inode *dir, goto leave; } + dl = dentry->d_fsdata; + status = ocfs2_add_entry(handle, dentry, inode, OCFS2_I(inode)->ip_blkno, parent_fe_bh, &lookup); @@ -469,6 +487,9 @@ leave: * ocfs2_delete_inode will mutex_lock again. */ if ((status < 0) && inode) { + if (dl) + ocfs2_cleanup_add_entry_failure(osb, dentry, inode); + OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SKIP_ORPHAN_DIR; clear_nlink(inode); iput(inode); @@ -1734,6 +1755,7 @@ static int ocfs2_symlink(struct inode *dir, struct ocfs2_dir_lookup_result lookup = { NULL, }; sigset_t oldset; int did_block_signals = 0; + struct ocfs2_dentry_lock *dl = NULL; trace_ocfs2_symlink_begin(dir, dentry, symname, dentry->d_name.len, dentry->d_name.name); @@ -1922,6 +1944,8 @@ static int ocfs2_symlink(struct inode *dir, goto bail; } + dl = dentry->d_fsdata; + status = ocfs2_add_entry(handle, dentry, inode, le64_to_cpu(fe->i_blkno), parent_fe_bh, &lookup); @@ -1956,6 +1980,9 @@ bail: if (xattr_ac) ocfs2_free_alloc_context(xattr_ac); if ((status < 0) && inode) { + if (dl) + ocfs2_cleanup_add_entry_failure(osb, dentry, inode); + OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SKIP_ORPHAN_DIR; clear_nlink(inode); iput(inode); -- cgit v0.10.2 From b9aaac5a6b7d228435fcb80963d41c274406011b Mon Sep 17 00:00:00 2001 From: jiangyiwen Date: Mon, 23 Jun 2014 13:22:09 -0700 Subject: ocfs2: do not return DLM_MIGRATE_RESPONSE_MASTERY_REF to avoid endless,loop during umount The following case may lead to endless loop during umount. 
node A node B node C node D umount volume, migrate lockres1 to B want to lock lockres1, send MASTER_REQUEST_MSG to C init block mle send MIGRATE_REQUEST_MSG to C find a block mle, and then return DLM_MIGRATE_RESPONSE_MASTERY_REF to B set C in refmap umount successfully try to umount, endless loop occurs when migrate lockres1 since C is in refmap So we can fix this endless loop case by only returning DLM_MIGRATE_RESPONSE_MASTERY_REF if it has a mastery mle when receiving MIGRATE_REQUEST_MSG. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: jiangyiwen Reviewed-by: Mark Fasheh Cc: Joel Becker Cc: Xue jiufei Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 3087a21..4f4b00e 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -3088,11 +3088,15 @@ static int dlm_add_migration_mle(struct dlm_ctxt *dlm, /* remove it so that only one mle will be found */ __dlm_unlink_mle(dlm, tmp); __dlm_mle_detach_hb_events(dlm, tmp); - ret = DLM_MIGRATE_RESPONSE_MASTERY_REF; - mlog(0, "%s:%.*s: master=%u, newmaster=%u, " - "telling master to get ref for cleared out mle " - "during migration\n", dlm->name, namelen, name, - master, new_master); + if (tmp->type == DLM_MLE_MASTER) { + ret = DLM_MIGRATE_RESPONSE_MASTERY_REF; + mlog(0, "%s:%.*s: master=%u, newmaster=%u, " + "telling master to get ref " + "for cleared out mle during " + "migration\n", dlm->name, + namelen, name, master, + new_master); + } } spin_unlock(&tmp->spinlock); } -- cgit v0.10.2 From ac4fef4d23ed879a7fd11ab24ccd2e1464277e9a Mon Sep 17 00:00:00 2001 From: Xue jiufei Date: Mon, 23 Jun 2014 13:22:09 -0700 Subject: ocfs2/dlm: do not purge lockres that is queued for assert master When workqueue is delayed, it may occur that a lockres is purged while it is still queued for master assert. it may trigger BUG() as follows. N1 N2 dlm_get_lockres() ->dlm_do_master_requery is the master of lockres, so queue assert_master work dlm_thread() start running and purge the lockres dlm_assert_master_worker() send assert master message to other nodes receiving the assert_master message, set master to N2 dlmlock_remote() send create_lock message to N2, but receive DLM_IVLOCKID, if it is RECOVERY lockres, it triggers the BUG(). Another BUG() is triggered when N3 become the new master and send assert_master to N1, N1 will trigger the BUG() because owner doesn't match. So we should not purge lockres when it is queued for assert master. 
Signed-off-by: joyce.xue Reviewed-by: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h index a106b3f..fae17c6 100644 --- a/fs/ocfs2/dlm/dlmcommon.h +++ b/fs/ocfs2/dlm/dlmcommon.h @@ -331,6 +331,7 @@ struct dlm_lock_resource u16 state; char lvb[DLM_LVB_LEN]; unsigned int inflight_locks; + unsigned int inflight_assert_workers; unsigned long refmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; }; @@ -910,6 +911,9 @@ void dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm, void dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm, struct dlm_lock_resource *res); +void __dlm_lockres_grab_inflight_worker(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res); + void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock); void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock); void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock); diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 4f4b00e..82abf0c 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -581,6 +581,7 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm, atomic_set(&res->asts_reserved, 0); res->migration_pending = 0; res->inflight_locks = 0; + res->inflight_assert_workers = 0; res->dlm = dlm; @@ -683,6 +684,43 @@ void dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm, wake_up(&res->wq); } +void __dlm_lockres_grab_inflight_worker(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res) +{ + assert_spin_locked(&res->spinlock); + res->inflight_assert_workers++; + mlog(0, "%s:%.*s: inflight assert worker++: now %u\n", + dlm->name, res->lockname.len, res->lockname.name, + res->inflight_assert_workers); +} + +static void dlm_lockres_grab_inflight_worker(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res) +{ + spin_lock(&res->spinlock); + __dlm_lockres_grab_inflight_worker(dlm, res); + spin_unlock(&res->spinlock); +} + +static void __dlm_lockres_drop_inflight_worker(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res) +{ + assert_spin_locked(&res->spinlock); + BUG_ON(res->inflight_assert_workers == 0); + res->inflight_assert_workers--; + mlog(0, "%s:%.*s: inflight assert worker--: now %u\n", + dlm->name, res->lockname.len, res->lockname.name, + res->inflight_assert_workers); +} + +static void dlm_lockres_drop_inflight_worker(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res) +{ + spin_lock(&res->spinlock); + __dlm_lockres_drop_inflight_worker(dlm, res); + spin_unlock(&res->spinlock); +} + /* * lookup a lock resource by name. * may already exist in the hashtable. @@ -1603,7 +1641,8 @@ send_response: mlog(ML_ERROR, "failed to dispatch assert master work\n"); response = DLM_MASTER_RESP_ERROR; dlm_lockres_put(res); - } + } else + dlm_lockres_grab_inflight_worker(dlm, res); } else { if (res) dlm_lockres_put(res); @@ -2118,6 +2157,8 @@ static void dlm_assert_master_worker(struct dlm_work_item *item, void *data) dlm_lockres_release_ast(dlm, res); put: + dlm_lockres_drop_inflight_worker(dlm, res); + dlm_lockres_put(res); mlog(0, "finished with dlm_assert_master_worker\n"); diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 5de0194..45067fa 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -1708,7 +1708,8 @@ int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data, mlog_errno(-ENOMEM); /* retry!? */ BUG(); - } + } else + __dlm_lockres_grab_inflight_worker(dlm, res); } else /* put.. 
incase we are not the master */ dlm_lockres_put(res); spin_unlock(&res->spinlock); diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c index cf53a82..69aac6f 100644 --- a/fs/ocfs2/dlm/dlmthread.c +++ b/fs/ocfs2/dlm/dlmthread.c @@ -259,11 +259,14 @@ static void dlm_run_purge_list(struct dlm_ctxt *dlm, * refs on it. */ unused = __dlm_lockres_unused(lockres); if (!unused || - (lockres->state & DLM_LOCK_RES_MIGRATING)) { + (lockres->state & DLM_LOCK_RES_MIGRATING) || + (lockres->inflight_assert_workers != 0)) { mlog(0, "%s: res %.*s is in use or being remastered, " - "used %d, state %d\n", dlm->name, - lockres->lockname.len, lockres->lockname.name, - !unused, lockres->state); + "used %d, state %d, assert master workers %u\n", + dlm->name, lockres->lockname.len, + lockres->lockname.name, + !unused, lockres->state, + lockres->inflight_assert_workers); list_move_tail(&lockres->purge, &dlm->purge_list); spin_unlock(&lockres->spinlock); continue; -- cgit v0.10.2
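For illustration only (not part of the patch): a sketch of the general pattern applied here -- count in-flight deferred workers on an object under its lock, and have the purge path skip any object whose count is still nonzero. Types and names are hypothetical, not ocfs2 code; build with cc -pthread.

/*
 * Illustration only, not ocfs2 code: the counting pattern the patch adds.
 * A worker reference is taken before the work is queued, dropped when the
 * worker finishes, and the purge path skips objects that still hold one.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct resource {
	pthread_mutex_t lock;
	unsigned int inflight_workers;	/* plays the inflight_assert_workers role */
	bool in_use;
};

static void grab_inflight_worker(struct resource *res)
{
	pthread_mutex_lock(&res->lock);
	res->inflight_workers++;
	pthread_mutex_unlock(&res->lock);
}

static void drop_inflight_worker(struct resource *res)
{
	pthread_mutex_lock(&res->lock);
	res->inflight_workers--;
	pthread_mutex_unlock(&res->lock);
}

/* The deferred work; the reference taken before queueing keeps purge away. */
static void *assert_worker(void *arg)
{
	struct resource *res = arg;

	/* ... the assert-master message would be sent here ... */
	drop_inflight_worker(res);
	return NULL;
}

/* Purge only when the object is unused and no worker is still outstanding. */
static bool try_purge(struct resource *res)
{
	bool purged;

	pthread_mutex_lock(&res->lock);
	purged = !res->in_use && res->inflight_workers == 0;
	pthread_mutex_unlock(&res->lock);
	return purged;
}

int main(void)
{
	struct resource res = { .lock = PTHREAD_MUTEX_INITIALIZER,
				.inflight_workers = 0, .in_use = false };
	pthread_t worker;

	grab_inflight_worker(&res);	/* taken before the work is queued */
	printf("purge while worker queued: %s\n", try_purge(&res) ? "yes" : "no");

	pthread_create(&worker, NULL, assert_worker, &res);
	pthread_join(worker, NULL);

	printf("purge after worker finished: %s\n", try_purge(&res) ? "yes" : "no");
	return 0;
}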