From 23d11a58a9a60dcb52c8fc6494efce908b24c295 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 29 Jan 2016 05:59:46 -0500
Subject: workqueue: skip flush dependency checks for legacy workqueues

fca839c00a12 ("workqueue: warn if memory reclaim tries to flush
!WQ_MEM_RECLAIM workqueue") implemented flush dependency warning which
triggers if a PF_MEMALLOC task or WQ_MEM_RECLAIM workqueue tries to
flush a !WQ_MEM_RECLAIM workquee.

This assumes that workqueues marked with WQ_MEM_RECLAIM sit in memory
reclaim path and making it depend on something which may need more
memory to make forward progress can lead to deadlocks.  Unfortunately,
workqueues created with the legacy create*_workqueue() interface
always have WQ_MEM_RECLAIM regardless of whether they are depended
upon memory reclaim or not.  These spurious WQ_MEM_RECLAIM markings
cause spurious triggering of the flush dependency checks.

  WARNING: CPU: 0 PID: 6 at kernel/workqueue.c:2361 check_flush_dependency+0x138/0x144()
  workqueue: WQ_MEM_RECLAIM deferwq:deferred_probe_work_func is flushing !WQ_MEM_RECLAIM events:lru_add_drain_per_cpu
  ...
  Workqueue: deferwq deferred_probe_work_func
  [<c0017acc>] (unwind_backtrace) from [<c0013134>] (show_stack+0x10/0x14)
  [<c0013134>] (show_stack) from [<c0245f18>] (dump_stack+0x94/0xd4)
  [<c0245f18>] (dump_stack) from [<c0026f9c>] (warn_slowpath_common+0x80/0xb0)
  [<c0026f9c>] (warn_slowpath_common) from [<c0026ffc>] (warn_slowpath_fmt+0x30/0x40)
  [<c0026ffc>] (warn_slowpath_fmt) from [<c00390b8>] (check_flush_dependency+0x138/0x144)
  [<c00390b8>] (check_flush_dependency) from [<c0039ca0>] (flush_work+0x50/0x15c)
  [<c0039ca0>] (flush_work) from [<c00c51b0>] (lru_add_drain_all+0x130/0x180)
  [<c00c51b0>] (lru_add_drain_all) from [<c00f728c>] (migrate_prep+0x8/0x10)
  [<c00f728c>] (migrate_prep) from [<c00bfbc4>] (alloc_contig_range+0xd8/0x338)
  [<c00bfbc4>] (alloc_contig_range) from [<c00f8f18>] (cma_alloc+0xe0/0x1ac)
  [<c00f8f18>] (cma_alloc) from [<c001cac4>] (__alloc_from_contiguous+0x38/0xd8)
  [<c001cac4>] (__alloc_from_contiguous) from [<c001ceb4>] (__dma_alloc+0x240/0x278)
  [<c001ceb4>] (__dma_alloc) from [<c001cf78>] (arm_dma_alloc+0x54/0x5c)
  [<c001cf78>] (arm_dma_alloc) from [<c0355ea4>] (dmam_alloc_coherent+0xc0/0xec)
  [<c0355ea4>] (dmam_alloc_coherent) from [<c039cc4c>] (ahci_port_start+0x150/0x1dc)
  [<c039cc4c>] (ahci_port_start) from [<c0384734>] (ata_host_start.part.3+0xc8/0x1c8)
  [<c0384734>] (ata_host_start.part.3) from [<c03898dc>] (ata_host_activate+0x50/0x148)
  [<c03898dc>] (ata_host_activate) from [<c039d558>] (ahci_host_activate+0x44/0x114)
  [<c039d558>] (ahci_host_activate) from [<c039f05c>] (ahci_platform_init_host+0x1d8/0x3c8)
  [<c039f05c>] (ahci_platform_init_host) from [<c039e6bc>] (tegra_ahci_probe+0x448/0x4e8)
  [<c039e6bc>] (tegra_ahci_probe) from [<c0347058>] (platform_drv_probe+0x50/0xac)
  [<c0347058>] (platform_drv_probe) from [<c03458cc>] (driver_probe_device+0x214/0x2c0)
  [<c03458cc>] (driver_probe_device) from [<c0343cc0>] (bus_for_each_drv+0x60/0x94)
  [<c0343cc0>] (bus_for_each_drv) from [<c03455d8>] (__device_attach+0xb0/0x114)
  [<c03455d8>] (__device_attach) from [<c0344ab8>] (bus_probe_device+0x84/0x8c)
  [<c0344ab8>] (bus_probe_device) from [<c0344f48>] (deferred_probe_work_func+0x68/0x98)
  [<c0344f48>] (deferred_probe_work_func) from [<c003b738>] (process_one_work+0x120/0x3f8)
  [<c003b738>] (process_one_work) from [<c003ba48>] (worker_thread+0x38/0x55c)
  [<c003ba48>] (worker_thread) from [<c0040f14>] (kthread+0xdc/0xf4)
  [<c0040f14>] (kthread) from [<c000f778>] (ret_from_fork+0x14/0x3c)

Fix it by marking workqueues created via create*_workqueue() with
__WQ_LEGACY and disabling flush dependency checks on them.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-and-tested-by: Thierry Reding <thierry.reding@gmail.com>
Link: http://lkml.kernel.org/g/20160126173843.GA11115@ulmo.nvidia.com
Fixes: fca839c00a12 ("workqueue: warn if memory reclaim tries to flush !WQ_MEM_RECLAIM workqueue")
diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 0e32bc7..ca73c50 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -311,6 +311,7 @@ enum {
 
 	__WQ_DRAINING		= 1 << 16, /* internal: workqueue is draining */
 	__WQ_ORDERED		= 1 << 17, /* internal: workqueue is ordered */
+	__WQ_LEGACY		= 1 << 18, /* internal: create*_workqueue() */
 
 	WQ_MAX_ACTIVE		= 512,	  /* I like 512, better ideas? */
 	WQ_MAX_UNBOUND_PER_CPU	= 4,	  /* 4 * #cpus for unbound wq */
@@ -411,12 +412,12 @@ __alloc_workqueue_key(const char *fmt, unsigned int flags, int max_active,
 	alloc_workqueue(fmt, WQ_UNBOUND | __WQ_ORDERED | (flags), 1, ##args)
 
 #define create_workqueue(name)						\
-	alloc_workqueue("%s", WQ_MEM_RECLAIM, 1, (name))
+	alloc_workqueue("%s", __WQ_LEGACY | WQ_MEM_RECLAIM, 1, (name))
 #define create_freezable_workqueue(name)				\
-	alloc_workqueue("%s", WQ_FREEZABLE | WQ_UNBOUND | WQ_MEM_RECLAIM, \
-			1, (name))
+	alloc_workqueue("%s", __WQ_LEGACY | WQ_FREEZABLE | WQ_UNBOUND |	\
+			WQ_MEM_RECLAIM, 1, (name))
 #define create_singlethread_workqueue(name)				\
-	alloc_ordered_workqueue("%s", WQ_MEM_RECLAIM, name)
+	alloc_ordered_workqueue("%s", __WQ_LEGACY | WQ_MEM_RECLAIM, name)
 
 extern void destroy_workqueue(struct workqueue_struct *wq);
 
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 61a0264..dc7faad 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -2355,7 +2355,8 @@ static void check_flush_dependency(struct workqueue_struct *target_wq,
 	WARN_ONCE(current->flags & PF_MEMALLOC,
 		  "workqueue: PF_MEMALLOC task %d(%s) is flushing !WQ_MEM_RECLAIM %s:%pf",
 		  current->pid, current->comm, target_wq->name, target_func);
-	WARN_ONCE(worker && (worker->current_pwq->wq->flags & WQ_MEM_RECLAIM),
+	WARN_ONCE(worker && ((worker->current_pwq->wq->flags &
+			      (WQ_MEM_RECLAIM | __WQ_LEGACY)) == WQ_MEM_RECLAIM),
 		  "workqueue: WQ_MEM_RECLAIM %s:%pf is flushing !WQ_MEM_RECLAIM %s:%pf",
 		  worker->current_pwq->wq->name, worker->current_func,
 		  target_wq->name, target_func);
-- 
cgit v0.10.2


From 041bd12e272c53a35c54c13875839bcb98c999ce Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 9 Feb 2016 16:11:26 -0500
Subject: Revert "workqueue: make sure delayed work run in local cpu"

This reverts commit 874bbfe600a660cba9c776b3957b1ce393151b76.

Workqueue used to implicity guarantee that work items queued without
explicit CPU specified are put on the local CPU.  Recent changes in
timer broke the guarantee and led to vmstat breakage which was fixed
by 176bed1de5bf ("vmstat: explicitly schedule per-cpu work on the CPU
we need it to run on").

vmstat is the most likely to expose the issue and it's quite possible
that there are other similar problems which are a lot more difficult
to trigger.  As a preventive measure, 874bbfe600a6 ("workqueue: make
sure delayed work run in local cpu") was applied to restore the local
CPU guarnatee.  Unfortunately, the change exposed a bug in timer code
which got fixed by 22b886dd1018 ("timers: Use proper base migration in
add_timer_on()").  Due to code restructuring, the commit couldn't be
backported beyond certain point and stable kernels which only had
874bbfe600a6 started crashing.

The local CPU guarantee was accidental more than anything else and we
want to get rid of it anyway.  As, with the vmstat case fixed,
874bbfe600a6 is causing more problems than it's fixing, it has been
decided to take the chance and officially break the guarantee by
reverting the commit.  A debug feature will be added to force foreign
CPU assignment to expose cases relying on the guarantee and fixes for
the individual cases will be backported to stable as necessary.

Signed-off-by: Tejun Heo <tj@kernel.org>
Fixes: 874bbfe600a6 ("workqueue: make sure delayed work run in local cpu")
Link: http://lkml.kernel.org/g/20160120211926.GJ10810@quack.suse.cz
Cc: stable@vger.kernel.org
Cc: Mike Galbraith <umgwanakikbuti@gmail.com>
Cc: Henrique de Moraes Holschuh <hmh@hmh.eng.br>
Cc: Daniel Bilik <daniel.bilik@neosystem.cz>
Cc: Jan Kara <jack@suse.cz>
Cc: Shaohua Li <shli@fb.com>
Cc: Sasha Levin <sasha.levin@oracle.com>
Cc: Ben Hutchings <ben@decadent.org.uk>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Daniel Bilik <daniel.bilik@neosystem.cz>
Cc: Jiri Slaby <jslaby@suse.cz>
Cc: Michal Hocko <mhocko@kernel.org>

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index dc7faad..5e63d3b 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1464,13 +1464,13 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,
 	timer_stats_timer_set_start_info(&dwork->timer);
 
 	dwork->wq = wq;
-	/* timer isn't guaranteed to run in this cpu, record earlier */
-	if (cpu == WORK_CPU_UNBOUND)
-		cpu = raw_smp_processor_id();
 	dwork->cpu = cpu;
 	timer->expires = jiffies + delay;
 
-	add_timer_on(timer, cpu);
+	if (unlikely(cpu != WORK_CPU_UNBOUND))
+		add_timer_on(timer, cpu);
+	else
+		add_timer(timer);
 }
 
 /**
-- 
cgit v0.10.2


From ef557180447fa9a7a0affd3abb21ecceb4b5e125 Mon Sep 17 00:00:00 2001
From: Mike Galbraith <umgwanakikbuti@gmail.com>
Date: Tue, 9 Feb 2016 17:59:38 -0500
Subject: workqueue: schedule WORK_CPU_UNBOUND work on wq_unbound_cpumask CPUs

WORK_CPU_UNBOUND work items queued to a bound workqueue always run
locally.  This is a good thing normally, but not when the user has
asked us to keep unbound work away from certain CPUs.  Round robin
these to wq_unbound_cpumask CPUs instead, as perturbation avoidance
trumps performance.

tj: Cosmetic and comment changes.  WARN_ON_ONCE() dropped from empty
    (wq_unbound_cpumask AND cpu_online_mask).  If we want that, it
    should be done when config changes.

Signed-off-by: Mike Galbraith <umgwanakikbuti@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 5e63d3b..0547746 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -301,7 +301,11 @@ static DEFINE_SPINLOCK(wq_mayday_lock);	/* protects wq->maydays list */
 static LIST_HEAD(workqueues);		/* PR: list of all workqueues */
 static bool workqueue_freezing;		/* PL: have wqs started freezing? */
 
-static cpumask_var_t wq_unbound_cpumask; /* PL: low level cpumask for all unbound wqs */
+/* PL: allowable cpus for unbound wqs and work items */
+static cpumask_var_t wq_unbound_cpumask;
+
+/* CPU where unbound work was last round robin scheduled from this CPU */
+static DEFINE_PER_CPU(int, wq_rr_cpu_last);
 
 /* the per-cpu worker pools */
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS],
@@ -1298,6 +1302,32 @@ static bool is_chained_work(struct workqueue_struct *wq)
 	return worker && worker->current_pwq->wq == wq;
 }
 
+/*
+ * When queueing an unbound work item to a wq, prefer local CPU if allowed
+ * by wq_unbound_cpumask.  Otherwise, round robin among the allowed ones to
+ * avoid perturbing sensitive tasks.
+ */
+static int wq_select_unbound_cpu(int cpu)
+{
+	int new_cpu;
+
+	if (cpumask_test_cpu(cpu, wq_unbound_cpumask))
+		return cpu;
+	if (cpumask_empty(wq_unbound_cpumask))
+		return cpu;
+
+	new_cpu = __this_cpu_read(wq_rr_cpu_last);
+	new_cpu = cpumask_next_and(new_cpu, wq_unbound_cpumask, cpu_online_mask);
+	if (unlikely(new_cpu >= nr_cpu_ids)) {
+		new_cpu = cpumask_first_and(wq_unbound_cpumask, cpu_online_mask);
+		if (unlikely(new_cpu >= nr_cpu_ids))
+			return cpu;
+	}
+	__this_cpu_write(wq_rr_cpu_last, new_cpu);
+
+	return new_cpu;
+}
+
 static void __queue_work(int cpu, struct workqueue_struct *wq,
 			 struct work_struct *work)
 {
@@ -1323,7 +1353,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
 		return;
 retry:
 	if (req_cpu == WORK_CPU_UNBOUND)
-		cpu = raw_smp_processor_id();
+		cpu = wq_select_unbound_cpu(raw_smp_processor_id());
 
 	/* pwq which will be used unless @work is executing elsewhere */
 	if (!(wq->flags & WQ_UNBOUND))
-- 
cgit v0.10.2


From f303fccb82928790ec58eea82722bd5c54d300b3 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 9 Feb 2016 17:59:38 -0500
Subject: workqueue: implement "workqueue.debug_force_rr_cpu" debug feature

Workqueue used to guarantee local execution for work items queued
without explicit target CPU.  The guarantee is gone now which can
break some usages in subtle ways.  To flush out those cases, this
patch implements a debug feature which forces round-robin CPU
selection for all such work items.

The debug feature defaults to off and can be enabled with a kernel
parameter.  The default can be flipped with a debug config option.

If you hit this commit during bisection, please refer to 041bd12e272c
("Revert "workqueue: make sure delayed work run in local cpu"") for
more information and ping me.

Signed-off-by: Tejun Heo <tj@kernel.org>

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 87d40a7..cda2ead 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -4230,6 +4230,17 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 			The default value of this parameter is determined by
 			the config option CONFIG_WQ_POWER_EFFICIENT_DEFAULT.
 
+	workqueue.debug_force_rr_cpu
+			Workqueue used to implicitly guarantee that work
+			items queued without explicit CPU specified are put
+			on the local CPU.  This guarantee is no longer true
+			and while local CPU is still preferred work items
+			may be put on foreign CPUs.  This debug option
+			forces round-robin CPU selection to flush out
+			usages which depend on the now broken guarantee.
+			When enabled, memory and cache locality will be
+			impacted.
+
 	x2apic_phys	[X86-64,APIC] Use x2apic physical mode instead of
 			default x2apic cluster mode on platforms
 			supporting x2apic.
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 0547746..51d77e7 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -307,6 +307,18 @@ static cpumask_var_t wq_unbound_cpumask;
 /* CPU where unbound work was last round robin scheduled from this CPU */
 static DEFINE_PER_CPU(int, wq_rr_cpu_last);
 
+/*
+ * Local execution of unbound work items is no longer guaranteed.  The
+ * following always forces round-robin CPU selection on unbound work items
+ * to uncover usages which depend on it.
+ */
+#ifdef CONFIG_DEBUG_WQ_FORCE_RR_CPU
+static bool wq_debug_force_rr_cpu = true;
+#else
+static bool wq_debug_force_rr_cpu = false;
+#endif
+module_param_named(debug_force_rr_cpu, wq_debug_force_rr_cpu, bool, 0644);
+
 /* the per-cpu worker pools */
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS],
 				     cpu_worker_pools);
@@ -1309,10 +1321,17 @@ static bool is_chained_work(struct workqueue_struct *wq)
  */
 static int wq_select_unbound_cpu(int cpu)
 {
+	static bool printed_dbg_warning;
 	int new_cpu;
 
-	if (cpumask_test_cpu(cpu, wq_unbound_cpumask))
-		return cpu;
+	if (likely(!wq_debug_force_rr_cpu)) {
+		if (cpumask_test_cpu(cpu, wq_unbound_cpumask))
+			return cpu;
+	} else if (!printed_dbg_warning) {
+		pr_warn("workqueue: round-robin CPU selection forced, expect performance impact\n");
+		printed_dbg_warning = true;
+	}
+
 	if (cpumask_empty(wq_unbound_cpumask))
 		return cpu;
 
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index ecb9e75..8bfd1ac 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1400,6 +1400,21 @@ config RCU_EQS_DEBUG
 
 endmenu # "RCU Debugging"
 
+config DEBUG_WQ_FORCE_RR_CPU
+	bool "Force round-robin CPU selection for unbound work items"
+	depends on DEBUG_KERNEL
+	default n
+	help
+	  Workqueue used to implicitly guarantee that work items queued
+	  without explicit CPU specified are put on the local CPU.  This
+	  guarantee is no longer true and while local CPU is still
+	  preferred work items may be put on foreign CPUs.  Kernel
+	  parameter "workqueue.debug_force_rr_cpu" is added to force
+	  round-robin CPU selection to flush out usages which depend on the
+	  now broken guarantee.  This config option enables the debug
+	  feature by default.  When enabled, memory and cache locality will
+	  be impacted.
+
 config DEBUG_BLOCK_EXT_DEVT
         bool "Force extended block device numbers and spread them"
 	depends on DEBUG_KERNEL
-- 
cgit v0.10.2


From d6e022f1d207a161cd88e08ef0371554680ffc46 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 3 Feb 2016 13:54:25 -0500
Subject: workqueue: handle NUMA_NO_NODE for unbound pool_workqueue lookup

When looking up the pool_workqueue to use for an unbound workqueue,
workqueue assumes that the target CPU is always bound to a valid NUMA
node.  However, currently, when a CPU goes offline, the mapping is
destroyed and cpu_to_node() returns NUMA_NO_NODE.

This has always been broken but hasn't triggered often enough before
874bbfe600a6 ("workqueue: make sure delayed work run in local cpu").
After the commit, workqueue forcifully assigns the local CPU for
delayed work items without explicit target CPU to fix a different
issue.  This widens the window where CPU can go offline while a
delayed work item is pending causing delayed work items dispatched
with target CPU set to an already offlined CPU.  The resulting
NUMA_NO_NODE mapping makes workqueue try to queue the work item on a
NULL pool_workqueue and thus crash.

While 874bbfe600a6 has been reverted for a different reason making the
bug less visible again, it can still happen.  Fix it by mapping
NUMA_NO_NODE to the default pool_workqueue from unbound_pwq_by_node().
This is a temporary workaround.  The long term solution is keeping CPU
-> NODE mapping stable across CPU off/online cycles which is being
worked on.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Mike Galbraith <umgwanakikbuti@gmail.com>
Cc: Tang Chen <tangchen@cn.fujitsu.com>
Cc: Rafael J. Wysocki <rafael@kernel.org>
Cc: Len Brown <len.brown@intel.com>
Cc: stable@vger.kernel.org
Link: http://lkml.kernel.org/g/1454424264.11183.46.camel@gmail.com
Link: http://lkml.kernel.org/g/1453702100-2597-1-git-send-email-tangchen@cn.fujitsu.com

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 51d77e7..7ff5dc7 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -586,6 +586,16 @@ static struct pool_workqueue *unbound_pwq_by_node(struct workqueue_struct *wq,
 						  int node)
 {
 	assert_rcu_or_wq_mutex_or_pool_mutex(wq);
+
+	/*
+	 * XXX: @node can be NUMA_NO_NODE if CPU goes offline while a
+	 * delayed item is pending.  The plan is to keep CPU -> NODE
+	 * mapping valid and stable across CPU on/offlines.  Once that
+	 * happens, this workaround can be removed.
+	 */
+	if (unlikely(node == NUMA_NO_NODE))
+		return wq->dfl_pwq;
+
 	return rcu_dereference_raw(wq->numa_pwq_tbl[node]);
 }
 
-- 
cgit v0.10.2