From 2423c9c3f0ffffa8f87cbdafe9781273c5d1b6a2 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 14 May 2013 13:52:30 -0700
Subject: blkcg: fix error return path in blkg_create()

In blkg_create(), after lookup of parent fails, the control jumps to
error path with the error code encoded into @blkg.  The error path
doesn't use @blkg for the return value.  It returns ERR_PTR(ret).
Make lookup fail path set @ret instead of @blkg.

Note that the parent lookup is guaranteed to succeed at that point and
the condition check is purely for sanity and triggers WARN when fails.
As such, I don't think it's necessary to mark it for -stable.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Vivek Goyal <vgoyal@redhat.com>

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index e8918ff..7fc35f6 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -238,7 +238,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
 	if (blkcg_parent(blkcg)) {
 		blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false);
 		if (WARN_ON_ONCE(!blkg->parent)) {
-			blkg = ERR_PTR(-EINVAL);
+			ret = -EINVAL;
 			goto err_put_css;
 		}
 		blkg_get(blkg->parent);
-- 
cgit v0.10.2


From dd4a4ffc0a4723cd0bef065af549803d256e1f7e Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 14 May 2013 13:52:30 -0700
Subject: blkcg: move blkg_for_each_descendant_pre() to block/blk-cgroup.h

blk-throttle hierarchy support will make use of it.  Move
blkg_for_each_descendant_pre() from block/blk-cgroup.c to
block/blk-cgroup.h.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Vivek Goyal <vgoyal@redhat.com>

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 7fc35f6..b950306 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -32,26 +32,6 @@ EXPORT_SYMBOL_GPL(blkcg_root);
 
 static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];
 
-static struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg,
-				      struct request_queue *q, bool update_hint);
-
-/**
- * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants
- * @d_blkg: loop cursor pointing to the current descendant
- * @pos_cgrp: used for iteration
- * @p_blkg: target blkg to walk descendants of
- *
- * Walk @c_blkg through the descendants of @p_blkg.  Must be used with RCU
- * read locked.  If called under either blkcg or queue lock, the iteration
- * is guaranteed to include all and only online blkgs.  The caller may
- * update @pos_cgrp by calling cgroup_rightmost_descendant() to skip
- * subtree.
- */
-#define blkg_for_each_descendant_pre(d_blkg, pos_cgrp, p_blkg)		\
-	cgroup_for_each_descendant_pre((pos_cgrp), (p_blkg)->blkcg->css.cgroup) \
-		if (((d_blkg) = __blkg_lookup(cgroup_to_blkcg(pos_cgrp), \
-					      (p_blkg)->q, false)))
-
 static bool blkcg_policy_enabled(struct request_queue *q,
 				 const struct blkcg_policy *pol)
 {
@@ -158,8 +138,8 @@ err_free:
  * @q's bypass state.  If @update_hint is %true, the caller should be
  * holding @q->queue_lock and lookup hint is updated on success.
  */
-static struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg,
-				      struct request_queue *q, bool update_hint)
+struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q,
+			       bool update_hint)
 {
 	struct blkcg_gq *blkg;
 
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 4e595ee..11f5b92 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -282,6 +282,26 @@ static inline void blkg_put(struct blkcg_gq *blkg)
 		__blkg_release(blkg);
 }
 
+struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q,
+			       bool update_hint);
+
+/**
+ * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants
+ * @d_blkg: loop cursor pointing to the current descendant
+ * @pos_cgrp: used for iteration
+ * @p_blkg: target blkg to walk descendants of
+ *
+ * Walk @c_blkg through the descendants of @p_blkg.  Must be used with RCU
+ * read locked.  If called under either blkcg or queue lock, the iteration
+ * is guaranteed to include all and only online blkgs.  The caller may
+ * update @pos_cgrp by calling cgroup_rightmost_descendant() to skip
+ * subtree.
+ */
+#define blkg_for_each_descendant_pre(d_blkg, pos_cgrp, p_blkg)		\
+	cgroup_for_each_descendant_pre((pos_cgrp), (p_blkg)->blkcg->css.cgroup) \
+		if (((d_blkg) = __blkg_lookup(cgroup_to_blkcg(pos_cgrp), \
+					      (p_blkg)->q, false)))
+
 /**
  * blk_get_rl - get request_list to use
  * @q: request_queue of interest
-- 
cgit v0.10.2


From aa539cb38f4f1365ad52fed6a857b25f8b4dc06c Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 14 May 2013 13:52:31 -0700
Subject: blkcg: implement blkg_for_each_descendant_post()

This will be used by blk-throttle hierarchy support.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Vivek Goyal <vgoyal@redhat.com>

diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 11f5b92..e15f731 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -303,6 +303,20 @@ struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q,
 					      (p_blkg)->q, false)))
 
 /**
+ * blkg_for_each_descendant_post - post-order walk of a blkg's descendants
+ * @d_blkg: loop cursor pointing to the current descendant
+ * @pos_cgrp: used for iteration
+ * @p_blkg: target blkg to walk descendants of
+ *
+ * Similar to blkg_for_each_descendant_pre() but performs post-order
+ * traversal instead.  Synchronization rules are the same.
+ */
+#define blkg_for_each_descendant_post(d_blkg, pos_cgrp, p_blkg)		\
+	cgroup_for_each_descendant_post((pos_cgrp), (p_blkg)->blkcg->css.cgroup) \
+		if (((d_blkg) = __blkg_lookup(cgroup_to_blkcg(pos_cgrp), \
+					      (p_blkg)->q, false)))
+
+/**
  * blk_get_rl - get request_list to use
  * @q: request_queue of interest
  * @bio: bio which will be attached to the allocated request (may be %NULL)
-- 
cgit v0.10.2


From db61367038dcd222476881cb09fd54661b3cd508 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 14 May 2013 13:52:31 -0700
Subject: blkcg: invoke blkcg_policy->pd_init() after parent is linked

Currently, when creating a new blkcg_gq, each policy's pd_init_fn() is
invoked in blkg_alloc() before the parent is linked.  This makes it
difficult for policies to perform initializations which are dependent
on the parent.

This patch moves pd_init_fn() invocations to blkg_create() after the
parent blkg is linked where the new blkg is fully initialized.  As
this means that blkg_free() can't assume that pd's are initialized,
pd_exit_fn() invocations are moved to __blkg_release().  This
guarantees that pd_exit_fn() is also invoked with fully initialized
blkgs with valid parent pointers.

This will help implementing hierarchy support in blk-throttle.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Vivek Goyal <vgoyal@redhat.com>

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index b950306..6bbe0a9 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -51,18 +51,8 @@ static void blkg_free(struct blkcg_gq *blkg)
 	if (!blkg)
 		return;
 
-	for (i = 0; i < BLKCG_MAX_POLS; i++) {
-		struct blkcg_policy *pol = blkcg_policy[i];
-		struct blkg_policy_data *pd = blkg->pd[i];
-
-		if (!pd)
-			continue;
-
-		if (pol && pol->pd_exit_fn)
-			pol->pd_exit_fn(blkg);
-
-		kfree(pd);
-	}
+	for (i = 0; i < BLKCG_MAX_POLS; i++)
+		kfree(blkg->pd[i]);
 
 	blk_exit_rl(&blkg->rl);
 	kfree(blkg);
@@ -114,10 +104,6 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
 		blkg->pd[i] = pd;
 		pd->blkg = blkg;
 		pd->plid = i;
-
-		/* invoke per-policy init */
-		if (pol->pd_init_fn)
-			pol->pd_init_fn(blkg);
 	}
 
 	return blkg;
@@ -214,7 +200,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
 	}
 	blkg = new_blkg;
 
-	/* link parent and insert */
+	/* link parent */
 	if (blkcg_parent(blkcg)) {
 		blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false);
 		if (WARN_ON_ONCE(!blkg->parent)) {
@@ -224,6 +210,15 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
 		blkg_get(blkg->parent);
 	}
 
+	/* invoke per-policy init */
+	for (i = 0; i < BLKCG_MAX_POLS; i++) {
+		struct blkcg_policy *pol = blkcg_policy[i];
+
+		if (blkg->pd[i] && pol->pd_init_fn)
+			pol->pd_init_fn(blkg);
+	}
+
+	/* insert */
 	spin_lock(&blkcg->lock);
 	ret = radix_tree_insert(&blkcg->blkg_tree, q->id, blkg);
 	if (likely(!ret)) {
@@ -381,6 +376,16 @@ static void blkg_rcu_free(struct rcu_head *rcu_head)
 
 void __blkg_release(struct blkcg_gq *blkg)
 {
+	int i;
+
+	/* tell policies that this one is being freed */
+	for (i = 0; i < BLKCG_MAX_POLS; i++) {
+		struct blkcg_policy *pol = blkcg_policy[i];
+
+		if (blkg->pd[i] && pol->pd_exit_fn)
+			pol->pd_exit_fn(blkg);
+	}
+
 	/* release the blkcg and parent blkg refs this blkg has been holding */
 	css_put(&blkg->blkcg->css);
 	if (blkg->parent)
-- 
cgit v0.10.2


From 2a4fd070ee8561d918a3776388331bb7e92ea59e Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 14 May 2013 13:52:31 -0700
Subject: blkcg: move bulk of blkcg_gq release operations to the RCU callback

Currently, when the last reference of a blkcg_gq is put, all then
release operations sans the actual freeing happen directly in
blkg_put().  As blkg_put() may be called under queue_lock, all
pd_exit_fn()s may be too.  This makes it impossible for pd_exit_fn()s
to use del_timer_sync() on timers which grab the queue_lock which is
an irq-safe lock due to the deadlock possibility described in the
comment on top of del_timer_sync().

This can be easily avoided by perfoming the release operations in the
RCU callback instead of directly from blkg_put().  This patch moves
the blkcg_gq release operations to the RCU callback.

As this leaves __blkg_release() with only call_rcu() invocation,
blkg_rcu_free() is renamed to __blkg_release_rcu(), exported and
call_rcu() invocation is now done directly from blkg_put() instead of
going through __blkg_release() which is removed.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Vivek Goyal <vgoyal@redhat.com>

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 6bbe0a9..d074760 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -369,13 +369,17 @@ static void blkg_destroy_all(struct request_queue *q)
 	q->root_rl.blkg = NULL;
 }
 
-static void blkg_rcu_free(struct rcu_head *rcu_head)
-{
-	blkg_free(container_of(rcu_head, struct blkcg_gq, rcu_head));
-}
-
-void __blkg_release(struct blkcg_gq *blkg)
+/*
+ * A group is RCU protected, but having an rcu lock does not mean that one
+ * can access all the fields of blkg and assume these are valid.  For
+ * example, don't try to follow throtl_data and request queue links.
+ *
+ * Having a reference to blkg under an rcu allows accesses to only values
+ * local to groups like group stats and group rate limits.
+ */
+void __blkg_release_rcu(struct rcu_head *rcu_head)
 {
+	struct blkcg_gq *blkg = container_of(rcu_head, struct blkcg_gq, rcu_head);
 	int i;
 
 	/* tell policies that this one is being freed */
@@ -388,21 +392,15 @@ void __blkg_release(struct blkcg_gq *blkg)
 
 	/* release the blkcg and parent blkg refs this blkg has been holding */
 	css_put(&blkg->blkcg->css);
-	if (blkg->parent)
+	if (blkg->parent) {
+		spin_lock_irq(blkg->q->queue_lock);
 		blkg_put(blkg->parent);
+		spin_unlock_irq(blkg->q->queue_lock);
+	}
 
-	/*
-	 * A group is freed in rcu manner. But having an rcu lock does not
-	 * mean that one can access all the fields of blkg and assume these
-	 * are valid. For example, don't try to follow throtl_data and
-	 * request queue links.
-	 *
-	 * Having a reference to blkg under an rcu allows acess to only
-	 * values local to groups like group stats and group rate limits
-	 */
-	call_rcu(&blkg->rcu_head, blkg_rcu_free);
+	blkg_free(blkg);
 }
-EXPORT_SYMBOL_GPL(__blkg_release);
+EXPORT_SYMBOL_GPL(__blkg_release_rcu);
 
 /*
  * The next function used by blk_queue_for_each_rl().  It's a bit tricky
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index e15f731..8056c03 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -266,7 +266,7 @@ static inline void blkg_get(struct blkcg_gq *blkg)
 	blkg->refcnt++;
 }
 
-void __blkg_release(struct blkcg_gq *blkg);
+void __blkg_release_rcu(struct rcu_head *rcu);
 
 /**
  * blkg_put - put a blkg reference
@@ -279,7 +279,7 @@ static inline void blkg_put(struct blkcg_gq *blkg)
 	lockdep_assert_held(blkg->q->queue_lock);
 	WARN_ON_ONCE(blkg->refcnt <= 0);
 	if (!--blkg->refcnt)
-		__blkg_release(blkg);
+		call_rcu(&blkg->rcu_head, __blkg_release_rcu);
 }
 
 struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q,
-- 
cgit v0.10.2


From 2db6314c213bb21102dd1dad06cfda6a8682d624 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 14 May 2013 13:52:31 -0700
Subject: blk-throttle: remove spurious throtl_enqueue_tg() call from
 throtl_select_dispatch()

throtl_select_dispatch() calls throtl_enqueue_tg() right after
tg_update_disptime(), which always calls the function anyway.  The
call is, while harmless, unnecessary.  Remove it.

This patch doesn't introduce any behavior difference.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Vivek Goyal <vgoyal@redhat.com>

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 3114622..3960787 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -816,10 +816,8 @@ static int throtl_select_dispatch(struct throtl_data *td, struct bio_list *bl)
 
 		nr_disp += throtl_dispatch_tg(td, tg, bl);
 
-		if (tg->nr_queued[0] || tg->nr_queued[1]) {
+		if (tg->nr_queued[0] || tg->nr_queued[1])
 			tg_update_disptime(td, tg);
-			throtl_enqueue_tg(td, tg);
-		}
 
 		if (nr_disp >= throtl_quantum)
 			break;
-- 
cgit v0.10.2


From 632b44935f4c99a61c56f8a6f805a1080ab5a432 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 14 May 2013 13:52:31 -0700
Subject: blk-throttle: remove deferred config application mechanism

When bps or iops configuration changes, blk-throttle records the new
configuration and sets a flag indicating that the config has changed.
The flag is checked in the bio dispatch path and applied.  This
deferred config application was necessary due to limitations in blkcg
framework, which haven't existed for quite a while now.

This patch removes the deferred config application mechanism and
applies new configurations directly from tg_set_conf(), which is
simpler.

v2: Dropped unnecessary throtl_schedule_delayed_work() call from
    tg_set_conf() as suggested by Vivek Goyal.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Vivek Goyal <vgoyal@redhat.com>

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 3960787..7dbd0e69 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -85,9 +85,6 @@ struct throtl_grp {
 	unsigned long slice_start[2];
 	unsigned long slice_end[2];
 
-	/* Some throttle limits got updated for the group */
-	int limits_changed;
-
 	/* Per cpu stats pointer */
 	struct tg_stats_cpu __percpu *stats_cpu;
 
@@ -112,8 +109,6 @@ struct throtl_data
 
 	/* Work for dispatching throttled bios */
 	struct delayed_work throtl_work;
-
-	int limits_changed;
 };
 
 /* list and work item to allocate percpu group stats */
@@ -223,7 +218,6 @@ static void throtl_pd_init(struct blkcg_gq *blkg)
 	RB_CLEAR_NODE(&tg->rb_node);
 	bio_list_init(&tg->bio_lists[0]);
 	bio_list_init(&tg->bio_lists[1]);
-	tg->limits_changed = false;
 
 	tg->bps[READ] = -1;
 	tg->bps[WRITE] = -1;
@@ -826,45 +820,6 @@ static int throtl_select_dispatch(struct throtl_data *td, struct bio_list *bl)
 	return nr_disp;
 }
 
-static void throtl_process_limit_change(struct throtl_data *td)
-{
-	struct request_queue *q = td->queue;
-	struct blkcg_gq *blkg, *n;
-
-	if (!td->limits_changed)
-		return;
-
-	xchg(&td->limits_changed, false);
-
-	throtl_log(td, "limits changed");
-
-	list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) {
-		struct throtl_grp *tg = blkg_to_tg(blkg);
-
-		if (!tg->limits_changed)
-			continue;
-
-		if (!xchg(&tg->limits_changed, false))
-			continue;
-
-		throtl_log_tg(td, tg, "limit change rbps=%llu wbps=%llu"
-			" riops=%u wiops=%u", tg->bps[READ], tg->bps[WRITE],
-			tg->iops[READ], tg->iops[WRITE]);
-
-		/*
-		 * Restart the slices for both READ and WRITES. It
-		 * might happen that a group's limit are dropped
-		 * suddenly and we don't want to account recently
-		 * dispatched IO with new low rate
-		 */
-		throtl_start_new_slice(td, tg, 0);
-		throtl_start_new_slice(td, tg, 1);
-
-		if (throtl_tg_on_rr(tg))
-			tg_update_disptime(td, tg);
-	}
-}
-
 /* Dispatch throttled bios. Should be called without queue lock held. */
 static int throtl_dispatch(struct request_queue *q)
 {
@@ -876,8 +831,6 @@ static int throtl_dispatch(struct request_queue *q)
 
 	spin_lock_irq(q->queue_lock);
 
-	throtl_process_limit_change(td);
-
 	if (!total_nr_queued(td))
 		goto out;
 
@@ -925,8 +878,7 @@ throtl_schedule_delayed_work(struct throtl_data *td, unsigned long delay)
 
 	struct delayed_work *dwork = &td->throtl_work;
 
-	/* schedule work if limits changed even if no bio is queued */
-	if (total_nr_queued(td) || td->limits_changed) {
+	if (total_nr_queued(td)) {
 		mod_delayed_work(kthrotld_workqueue, dwork, delay);
 		throtl_log(td, "schedule work. delay=%lu jiffies=%lu",
 				delay, jiffies);
@@ -1023,10 +975,25 @@ static int tg_set_conf(struct cgroup *cgrp, struct cftype *cft, const char *buf,
 	else
 		*(unsigned int *)((void *)tg + cft->private) = ctx.v;
 
-	/* XXX: we don't need the following deferred processing */
-	xchg(&tg->limits_changed, true);
-	xchg(&td->limits_changed, true);
-	throtl_schedule_delayed_work(td, 0);
+	throtl_log_tg(td, tg, "limit change rbps=%llu wbps=%llu riops=%u wiops=%u",
+		      tg->bps[READ], tg->bps[WRITE],
+		      tg->iops[READ], tg->iops[WRITE]);
+
+	/*
+	 * We're already holding queue_lock and know @tg is valid.  Let's
+	 * apply the new config directly.
+	 *
+	 * Restart the slices for both READ and WRITES. It might happen
+	 * that a group's limit are dropped suddenly and we don't want to
+	 * account recently dispatched IO with new low rate.
+	 */
+	throtl_start_new_slice(td, tg, 0);
+	throtl_start_new_slice(td, tg, 1);
+
+	if (throtl_tg_on_rr(tg)) {
+		tg_update_disptime(td, tg);
+		throtl_schedule_next_dispatch(td);
+	}
 
 	blkg_conf_finish(&ctx);
 	return 0;
@@ -1239,7 +1206,6 @@ int blk_throtl_init(struct request_queue *q)
 		return -ENOMEM;
 
 	td->tg_service_tree = THROTL_RB_ROOT;
-	td->limits_changed = false;
 	INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work);
 
 	q->td = td;
-- 
cgit v0.10.2


From cb76199c36a7ccf0947ef4875b32e0940f50d1a8 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 14 May 2013 13:52:31 -0700
Subject: blk-throttle: collapse throtl_dispatch() into the work function

blk-throttle is about to go through major restructuring to support
hierarchy.  Do cosmetic updates in preparation.

* s/throtl_data->throtl_work/throtl_data->dispatch_work/

* s/blk_throtl_work()/blk_throtl_dispatch_work_fn()/

* Collapse throtl_dispatch() into blk_throtl_dispatch_work_fn()

This patch is purely cosmetic.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Vivek Goyal <vgoyal@redhat.com>

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 7dbd0e69..0a0bc00 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -108,7 +108,7 @@ struct throtl_data
 	unsigned int nr_undestroyed_grps;
 
 	/* Work for dispatching throttled bios */
-	struct delayed_work throtl_work;
+	struct delayed_work dispatch_work;
 };
 
 /* list and work item to allocate percpu group stats */
@@ -820,10 +820,12 @@ static int throtl_select_dispatch(struct throtl_data *td, struct bio_list *bl)
 	return nr_disp;
 }
 
-/* Dispatch throttled bios. Should be called without queue lock held. */
-static int throtl_dispatch(struct request_queue *q)
+/* work function to dispatch throttled bios */
+void blk_throtl_dispatch_work_fn(struct work_struct *work)
 {
-	struct throtl_data *td = q->td;
+	struct throtl_data *td = container_of(to_delayed_work(work),
+					      struct throtl_data, dispatch_work);
+	struct request_queue *q = td->queue;
 	unsigned int nr_disp = 0;
 	struct bio_list bio_list_on_stack;
 	struct bio *bio;
@@ -859,16 +861,6 @@ out:
 			generic_make_request(bio);
 		blk_finish_plug(&plug);
 	}
-	return nr_disp;
-}
-
-void blk_throtl_work(struct work_struct *work)
-{
-	struct throtl_data *td = container_of(work, struct throtl_data,
-					throtl_work.work);
-	struct request_queue *q = td->queue;
-
-	throtl_dispatch(q);
 }
 
 /* Call with queue lock held */
@@ -876,7 +868,7 @@ static void
 throtl_schedule_delayed_work(struct throtl_data *td, unsigned long delay)
 {
 
-	struct delayed_work *dwork = &td->throtl_work;
+	struct delayed_work *dwork = &td->dispatch_work;
 
 	if (total_nr_queued(td)) {
 		mod_delayed_work(kthrotld_workqueue, dwork, delay);
@@ -1057,7 +1049,7 @@ static void throtl_shutdown_wq(struct request_queue *q)
 {
 	struct throtl_data *td = q->td;
 
-	cancel_delayed_work_sync(&td->throtl_work);
+	cancel_delayed_work_sync(&td->dispatch_work);
 }
 
 static struct blkcg_policy blkcg_policy_throtl = {
@@ -1206,7 +1198,7 @@ int blk_throtl_init(struct request_queue *q)
 		return -ENOMEM;
 
 	td->tg_service_tree = THROTL_RB_ROOT;
-	INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work);
+	INIT_DELAYED_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn);
 
 	q->td = td;
 	td->queue = q;
-- 
cgit v0.10.2


From a9131a27e2a3272df2207277a2be90377ce75fc6 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 14 May 2013 13:52:31 -0700
Subject: blk-throttle: relocate throtl_schedule_delayed_work()

Move throtl_schedule_delayed_work() above its first user so that the
forward declaration can be removed.

This patch is pure relocaiton.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Vivek Goyal <vgoyal@redhat.com>

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 0a0bc00..507b1c6 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -25,8 +25,6 @@ static struct blkcg_policy blkcg_policy_throtl;
 
 /* A workqueue to queue throttle related work */
 static struct workqueue_struct *kthrotld_workqueue;
-static void throtl_schedule_delayed_work(struct throtl_data *td,
-				unsigned long delay);
 
 struct throtl_rb_root {
 	struct rb_root rb;
@@ -398,6 +396,19 @@ static void throtl_dequeue_tg(struct throtl_data *td, struct throtl_grp *tg)
 		__throtl_dequeue_tg(td, tg);
 }
 
+/* Call with queue lock held */
+static void throtl_schedule_delayed_work(struct throtl_data *td,
+					 unsigned long delay)
+{
+	struct delayed_work *dwork = &td->dispatch_work;
+
+	if (total_nr_queued(td)) {
+		mod_delayed_work(kthrotld_workqueue, dwork, delay);
+		throtl_log(td, "schedule work. delay=%lu jiffies=%lu",
+			   delay, jiffies);
+	}
+}
+
 static void throtl_schedule_next_dispatch(struct throtl_data *td)
 {
 	struct throtl_rb_root *st = &td->tg_service_tree;
@@ -863,20 +874,6 @@ out:
 	}
 }
 
-/* Call with queue lock held */
-static void
-throtl_schedule_delayed_work(struct throtl_data *td, unsigned long delay)
-{
-
-	struct delayed_work *dwork = &td->dispatch_work;
-
-	if (total_nr_queued(td)) {
-		mod_delayed_work(kthrotld_workqueue, dwork, delay);
-		throtl_log(td, "schedule work. delay=%lu jiffies=%lu",
-				delay, jiffies);
-	}
-}
-
 static u64 tg_prfill_cpu_rwstat(struct seq_file *sf,
 				struct blkg_policy_data *pd, int off)
 {
-- 
cgit v0.10.2


From 6a525600ffeb9e0d6cbbebda49eb89d6d3408c2b Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 14 May 2013 13:52:32 -0700
Subject: blk-throttle: remove pointless throtl_nr_queued() optimizations

throtl_nr_queued() is used in several places to avoid performing
certain operations when the throtl_data is empty.  This usually is
useless as those paths usually aren't traveled if there's no bio
queued.

* throtl_schedule_delayed_work() skips scheduling dispatch work item
  if @td doesn't have any bios queued; however, the only case it can
  be called when @td is empty is from tg_set_conf() which isn't
  something we should be optimizing for.

* throtl_schedule_next_dispatch() takes a quick exit if @td is empty;
  however, right after that it triggers BUG if the service tree is
  empty.  The two conditions are equivalent and it can just test
  @st->count for the quick exit.

* blk_throtl_dispatch_work_fn() skips dispatch if @td is empty.  This
  work function isn't usually invoked when @td is empty.  The only
  possibility is from tg_set_conf() and when it happens the normal
  dispatching path can handle empty @td fine.  No need to add special
  skip path.

This patch removes the above three unnecessary optimizations, which
leave throtl_log() call in blk_throtl_dispatch_work_fn() the only user
of throtl_nr_queued().  Remove throtl_nr_queued() and open code it in
throtl_log().  I don't think we need td->nr_queued[] at all.  Maybe we
can remove it later.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Vivek Goyal <vgoyal@redhat.com>

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 507b1c6..dbeef30 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -166,11 +166,6 @@ THROTL_TG_FNS(on_rr);
 #define throtl_log(td, fmt, args...)	\
 	blk_add_trace_msg((td)->queue, "throtl " fmt, ##args)
 
-static inline unsigned int total_nr_queued(struct throtl_data *td)
-{
-	return td->nr_queued[0] + td->nr_queued[1];
-}
-
 /*
  * Worker for allocating per cpu stat for tgs. This is scheduled on the
  * system_wq once there are some groups on the alloc_list waiting for
@@ -402,25 +397,18 @@ static void throtl_schedule_delayed_work(struct throtl_data *td,
 {
 	struct delayed_work *dwork = &td->dispatch_work;
 
-	if (total_nr_queued(td)) {
-		mod_delayed_work(kthrotld_workqueue, dwork, delay);
-		throtl_log(td, "schedule work. delay=%lu jiffies=%lu",
-			   delay, jiffies);
-	}
+	mod_delayed_work(kthrotld_workqueue, dwork, delay);
+	throtl_log(td, "schedule work. delay=%lu jiffies=%lu", delay, jiffies);
 }
 
 static void throtl_schedule_next_dispatch(struct throtl_data *td)
 {
 	struct throtl_rb_root *st = &td->tg_service_tree;
 
-	/*
-	 * If there are more bios pending, schedule more work.
-	 */
-	if (!total_nr_queued(td))
+	/* any pending children left? */
+	if (!st->count)
 		return;
 
-	BUG_ON(!st->count);
-
 	update_min_dispatch_time(st);
 
 	if (time_before_eq(st->min_disptime, jiffies))
@@ -844,14 +832,11 @@ void blk_throtl_dispatch_work_fn(struct work_struct *work)
 
 	spin_lock_irq(q->queue_lock);
 
-	if (!total_nr_queued(td))
-		goto out;
-
 	bio_list_init(&bio_list_on_stack);
 
 	throtl_log(td, "dispatch nr_queued=%u read=%u write=%u",
-			total_nr_queued(td), td->nr_queued[READ],
-			td->nr_queued[WRITE]);
+		   td->nr_queued[READ] + td->nr_queued[WRITE],
+		   td->nr_queued[READ], td->nr_queued[WRITE]);
 
 	nr_disp = throtl_select_dispatch(td, &bio_list_on_stack);
 
@@ -859,7 +844,7 @@ void blk_throtl_dispatch_work_fn(struct work_struct *work)
 		throtl_log(td, "bios disp=%u", nr_disp);
 
 	throtl_schedule_next_dispatch(td);
-out:
+
 	spin_unlock_irq(q->queue_lock);
 
 	/*
-- 
cgit v0.10.2


From c9e0332e877c1a1ccfe4ba315a437c7a8cf6e575 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 14 May 2013 13:52:32 -0700
Subject: blk-throttle: rename throtl_rb_root to throtl_service_queue

throtl_rb_root will be expanded to cover more roles for hierarchy
support.  Rename it to throtl_service_queue and make its fields more
descriptive.

* rb		-> pending_tree
* left		-> first_pending
* count		-> nr_pending
* min_disptime	-> first_pending_disptime

This patch is purely cosmetic.

Signed-off-by: Tejun Heo <tj@kernel.org
Acked-by: Vivek Goyal <vgoyal@redhat.com>

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index dbeef30..b279110 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -26,15 +26,15 @@ static struct blkcg_policy blkcg_policy_throtl;
 /* A workqueue to queue throttle related work */
 static struct workqueue_struct *kthrotld_workqueue;
 
-struct throtl_rb_root {
-	struct rb_root rb;
-	struct rb_node *left;
-	unsigned int count;
-	unsigned long min_disptime;
+struct throtl_service_queue {
+	struct rb_root		pending_tree;	/* RB tree of active tgs */
+	struct rb_node		*first_pending;	/* first node in the tree */
+	unsigned int		nr_pending;	/* # queued in the tree */
+	unsigned long		first_pending_disptime;	/* disptime of the first tg */
 };
 
-#define THROTL_RB_ROOT	(struct throtl_rb_root) { .rb = RB_ROOT, .left = NULL, \
-			.count = 0, .min_disptime = 0}
+#define THROTL_SERVICE_QUEUE_INITIALIZER				\
+	(struct throtl_service_queue){ .pending_tree = RB_ROOT }
 
 #define rb_entry_tg(node)	rb_entry((node), struct throtl_grp, rb_node)
 
@@ -50,7 +50,7 @@ struct throtl_grp {
 	/* must be the first member */
 	struct blkg_policy_data pd;
 
-	/* active throtl group service_tree member */
+	/* active throtl group service_queue member */
 	struct rb_node rb_node;
 
 	/*
@@ -93,7 +93,7 @@ struct throtl_grp {
 struct throtl_data
 {
 	/* service tree for active throtl groups */
-	struct throtl_rb_root tg_service_tree;
+	struct throtl_service_queue service_queue;
 
 	struct request_queue *queue;
 
@@ -296,17 +296,17 @@ static struct throtl_grp *throtl_lookup_create_tg(struct throtl_data *td,
 	return tg;
 }
 
-static struct throtl_grp *throtl_rb_first(struct throtl_rb_root *root)
+static struct throtl_grp *throtl_rb_first(struct throtl_service_queue *sq)
 {
 	/* Service tree is empty */
-	if (!root->count)
+	if (!sq->nr_pending)
 		return NULL;
 
-	if (!root->left)
-		root->left = rb_first(&root->rb);
+	if (!sq->first_pending)
+		sq->first_pending = rb_first(&sq->pending_tree);
 
-	if (root->left)
-		return rb_entry_tg(root->left);
+	if (sq->first_pending)
+		return rb_entry_tg(sq->first_pending);
 
 	return NULL;
 }
@@ -317,29 +317,29 @@ static void rb_erase_init(struct rb_node *n, struct rb_root *root)
 	RB_CLEAR_NODE(n);
 }
 
-static void throtl_rb_erase(struct rb_node *n, struct throtl_rb_root *root)
+static void throtl_rb_erase(struct rb_node *n, struct throtl_service_queue *sq)
 {
-	if (root->left == n)
-		root->left = NULL;
-	rb_erase_init(n, &root->rb);
-	--root->count;
+	if (sq->first_pending == n)
+		sq->first_pending = NULL;
+	rb_erase_init(n, &sq->pending_tree);
+	--sq->nr_pending;
 }
 
-static void update_min_dispatch_time(struct throtl_rb_root *st)
+static void update_min_dispatch_time(struct throtl_service_queue *sq)
 {
 	struct throtl_grp *tg;
 
-	tg = throtl_rb_first(st);
+	tg = throtl_rb_first(sq);
 	if (!tg)
 		return;
 
-	st->min_disptime = tg->disptime;
+	sq->first_pending_disptime = tg->disptime;
 }
 
-static void
-tg_service_tree_add(struct throtl_rb_root *st, struct throtl_grp *tg)
+static void tg_service_queue_add(struct throtl_service_queue *sq,
+				 struct throtl_grp *tg)
 {
-	struct rb_node **node = &st->rb.rb_node;
+	struct rb_node **node = &sq->pending_tree.rb_node;
 	struct rb_node *parent = NULL;
 	struct throtl_grp *__tg;
 	unsigned long key = tg->disptime;
@@ -358,19 +358,19 @@ tg_service_tree_add(struct throtl_rb_root *st, struct throtl_grp *tg)
 	}
 
 	if (left)
-		st->left = &tg->rb_node;
+		sq->first_pending = &tg->rb_node;
 
 	rb_link_node(&tg->rb_node, parent, node);
-	rb_insert_color(&tg->rb_node, &st->rb);
+	rb_insert_color(&tg->rb_node, &sq->pending_tree);
 }
 
 static void __throtl_enqueue_tg(struct throtl_data *td, struct throtl_grp *tg)
 {
-	struct throtl_rb_root *st = &td->tg_service_tree;
+	struct throtl_service_queue *sq = &td->service_queue;
 
-	tg_service_tree_add(st, tg);
+	tg_service_queue_add(sq, tg);
 	throtl_mark_tg_on_rr(tg);
-	st->count++;
+	sq->nr_pending++;
 }
 
 static void throtl_enqueue_tg(struct throtl_data *td, struct throtl_grp *tg)
@@ -381,7 +381,7 @@ static void throtl_enqueue_tg(struct throtl_data *td, struct throtl_grp *tg)
 
 static void __throtl_dequeue_tg(struct throtl_data *td, struct throtl_grp *tg)
 {
-	throtl_rb_erase(&tg->rb_node, &td->tg_service_tree);
+	throtl_rb_erase(&tg->rb_node, &td->service_queue);
 	throtl_clear_tg_on_rr(tg);
 }
 
@@ -403,18 +403,18 @@ static void throtl_schedule_delayed_work(struct throtl_data *td,
 
 static void throtl_schedule_next_dispatch(struct throtl_data *td)
 {
-	struct throtl_rb_root *st = &td->tg_service_tree;
+	struct throtl_service_queue *sq = &td->service_queue;
 
 	/* any pending children left? */
-	if (!st->count)
+	if (!sq->nr_pending)
 		return;
 
-	update_min_dispatch_time(st);
+	update_min_dispatch_time(sq);
 
-	if (time_before_eq(st->min_disptime, jiffies))
+	if (time_before_eq(sq->first_pending_disptime, jiffies))
 		throtl_schedule_delayed_work(td, 0);
 	else
-		throtl_schedule_delayed_work(td, (st->min_disptime - jiffies));
+		throtl_schedule_delayed_work(td, sq->first_pending_disptime - jiffies);
 }
 
 static inline void
@@ -794,10 +794,10 @@ static int throtl_select_dispatch(struct throtl_data *td, struct bio_list *bl)
 {
 	unsigned int nr_disp = 0;
 	struct throtl_grp *tg;
-	struct throtl_rb_root *st = &td->tg_service_tree;
+	struct throtl_service_queue *sq = &td->service_queue;
 
 	while (1) {
-		tg = throtl_rb_first(st);
+		tg = throtl_rb_first(sq);
 
 		if (!tg)
 			break;
@@ -1145,7 +1145,7 @@ void blk_throtl_drain(struct request_queue *q)
 	__releases(q->queue_lock) __acquires(q->queue_lock)
 {
 	struct throtl_data *td = q->td;
-	struct throtl_rb_root *st = &td->tg_service_tree;
+	struct throtl_service_queue *sq = &td->service_queue;
 	struct throtl_grp *tg;
 	struct bio_list bl;
 	struct bio *bio;
@@ -1154,7 +1154,7 @@ void blk_throtl_drain(struct request_queue *q)
 
 	bio_list_init(&bl);
 
-	while ((tg = throtl_rb_first(st))) {
+	while ((tg = throtl_rb_first(sq))) {
 		throtl_dequeue_tg(td, tg);
 
 		while ((bio = bio_list_peek(&tg->bio_lists[READ])))
@@ -1179,7 +1179,7 @@ int blk_throtl_init(struct request_queue *q)
 	if (!td)
 		return -ENOMEM;
 
-	td->tg_service_tree = THROTL_RB_ROOT;
+	td->service_queue = THROTL_SERVICE_QUEUE_INITIALIZER;
 	INIT_DELAYED_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn);
 
 	q->td = td;
-- 
cgit v0.10.2


From 5b2c16aae0c074c3bb546c4c066ca7064684553c Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 14 May 2013 13:52:32 -0700
Subject: blk-throttle: simplify throtl_grp flag handling

blk-throttle is still using function-defining macros to define flag
handling functions, which went out style at least a decade ago.

Just define the flag as bitmask and use direct bit operations.

This patch doesn't make any functional changes.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Vivek Goyal <vgoyal@redhat.com>

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index b279110..e8ef43d 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -36,6 +36,10 @@ struct throtl_service_queue {
 #define THROTL_SERVICE_QUEUE_INITIALIZER				\
 	(struct throtl_service_queue){ .pending_tree = RB_ROOT }
 
+enum tg_state_flags {
+	THROTL_TG_PENDING	= 1 << 0,	/* on parent's pending tree */
+};
+
 #define rb_entry_tg(node)	rb_entry((node), struct throtl_grp, rb_node)
 
 /* Per-cpu group stats */
@@ -136,26 +140,6 @@ static inline struct throtl_grp *td_root_tg(struct throtl_data *td)
 	return blkg_to_tg(td->queue->root_blkg);
 }
 
-enum tg_state_flags {
-	THROTL_TG_FLAG_on_rr = 0,	/* on round-robin busy list */
-};
-
-#define THROTL_TG_FNS(name)						\
-static inline void throtl_mark_tg_##name(struct throtl_grp *tg)		\
-{									\
-	(tg)->flags |= (1 << THROTL_TG_FLAG_##name);			\
-}									\
-static inline void throtl_clear_tg_##name(struct throtl_grp *tg)	\
-{									\
-	(tg)->flags &= ~(1 << THROTL_TG_FLAG_##name);			\
-}									\
-static inline int throtl_tg_##name(const struct throtl_grp *tg)		\
-{									\
-	return ((tg)->flags & (1 << THROTL_TG_FLAG_##name)) != 0;	\
-}
-
-THROTL_TG_FNS(on_rr);
-
 #define throtl_log_tg(td, tg, fmt, args...)	do {			\
 	char __pbuf[128];						\
 									\
@@ -369,25 +353,25 @@ static void __throtl_enqueue_tg(struct throtl_data *td, struct throtl_grp *tg)
 	struct throtl_service_queue *sq = &td->service_queue;
 
 	tg_service_queue_add(sq, tg);
-	throtl_mark_tg_on_rr(tg);
+	tg->flags |= THROTL_TG_PENDING;
 	sq->nr_pending++;
 }
 
 static void throtl_enqueue_tg(struct throtl_data *td, struct throtl_grp *tg)
 {
-	if (!throtl_tg_on_rr(tg))
+	if (!(tg->flags & THROTL_TG_PENDING))
 		__throtl_enqueue_tg(td, tg);
 }
 
 static void __throtl_dequeue_tg(struct throtl_data *td, struct throtl_grp *tg)
 {
 	throtl_rb_erase(&tg->rb_node, &td->service_queue);
-	throtl_clear_tg_on_rr(tg);
+	tg->flags &= ~THROTL_TG_PENDING;
 }
 
 static void throtl_dequeue_tg(struct throtl_data *td, struct throtl_grp *tg)
 {
-	if (throtl_tg_on_rr(tg))
+	if (tg->flags & THROTL_TG_PENDING)
 		__throtl_dequeue_tg(td, tg);
 }
 
@@ -964,7 +948,7 @@ static int tg_set_conf(struct cgroup *cgrp, struct cftype *cft, const char *buf,
 	throtl_start_new_slice(td, tg, 0);
 	throtl_start_new_slice(td, tg, 1);
 
-	if (throtl_tg_on_rr(tg)) {
+	if (tg->flags & THROTL_TG_PENDING) {
 		tg_update_disptime(td, tg);
 		throtl_schedule_next_dispatch(td);
 	}
-- 
cgit v0.10.2


From 0f3457f60edc57332bf6564fa00d561a4372dcb9 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 14 May 2013 13:52:32 -0700
Subject: blk-throttle: add backlink pointer from throtl_grp to throtl_data

Add throtl_grp->td so that the td (throtl_data) a given tg
(throtl_grp) belongs to can be determined, and remove @td argument
from functions which take both @td and @tg as the former now can be
determined from the latter.

This generally simplifies the code and removes a number of cases where
@td is passed as an argument without being actually used.  This will
also help hierarchy support implementation.

While at it, in multi-line conditions, move the logical operators
leading broken lines to the end of the previous line.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Vivek Goyal <vgoyal@redhat.com>

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index e8ef43d..a489391 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -57,6 +57,9 @@ struct throtl_grp {
 	/* active throtl group service_queue member */
 	struct rb_node rb_node;
 
+	/* throtl_data this group belongs to */
+	struct throtl_data *td;
+
 	/*
 	 * Dispatch time in jiffies. This is the estimated time when group
 	 * will unthrottle and is ready to dispatch more bio. It is used as
@@ -140,11 +143,11 @@ static inline struct throtl_grp *td_root_tg(struct throtl_data *td)
 	return blkg_to_tg(td->queue->root_blkg);
 }
 
-#define throtl_log_tg(td, tg, fmt, args...)	do {			\
+#define throtl_log_tg(tg, fmt, args...)	do {				\
 	char __pbuf[128];						\
 									\
 	blkg_path(tg_to_blkg(tg), __pbuf, sizeof(__pbuf));		\
-	blk_add_trace_msg((td)->queue, "throtl %s " fmt, __pbuf, ##args); \
+	blk_add_trace_msg((tg)->td->queue, "throtl %s " fmt, __pbuf, ##args); \
 } while (0)
 
 #define throtl_log(td, fmt, args...)	\
@@ -193,6 +196,7 @@ static void throtl_pd_init(struct blkcg_gq *blkg)
 	unsigned long flags;
 
 	RB_CLEAR_NODE(&tg->rb_node);
+	tg->td = blkg->q->td;
 	bio_list_init(&tg->bio_lists[0]);
 	bio_list_init(&tg->bio_lists[1]);
 
@@ -401,36 +405,34 @@ static void throtl_schedule_next_dispatch(struct throtl_data *td)
 		throtl_schedule_delayed_work(td, sq->first_pending_disptime - jiffies);
 }
 
-static inline void
-throtl_start_new_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
+static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw)
 {
 	tg->bytes_disp[rw] = 0;
 	tg->io_disp[rw] = 0;
 	tg->slice_start[rw] = jiffies;
 	tg->slice_end[rw] = jiffies + throtl_slice;
-	throtl_log_tg(td, tg, "[%c] new slice start=%lu end=%lu jiffies=%lu",
+	throtl_log_tg(tg, "[%c] new slice start=%lu end=%lu jiffies=%lu",
 			rw == READ ? 'R' : 'W', tg->slice_start[rw],
 			tg->slice_end[rw], jiffies);
 }
 
-static inline void throtl_set_slice_end(struct throtl_data *td,
-		struct throtl_grp *tg, bool rw, unsigned long jiffy_end)
+static inline void throtl_set_slice_end(struct throtl_grp *tg, bool rw,
+					unsigned long jiffy_end)
 {
 	tg->slice_end[rw] = roundup(jiffy_end, throtl_slice);
 }
 
-static inline void throtl_extend_slice(struct throtl_data *td,
-		struct throtl_grp *tg, bool rw, unsigned long jiffy_end)
+static inline void throtl_extend_slice(struct throtl_grp *tg, bool rw,
+				       unsigned long jiffy_end)
 {
 	tg->slice_end[rw] = roundup(jiffy_end, throtl_slice);
-	throtl_log_tg(td, tg, "[%c] extend slice start=%lu end=%lu jiffies=%lu",
+	throtl_log_tg(tg, "[%c] extend slice start=%lu end=%lu jiffies=%lu",
 			rw == READ ? 'R' : 'W', tg->slice_start[rw],
 			tg->slice_end[rw], jiffies);
 }
 
 /* Determine if previously allocated or extended slice is complete or not */
-static bool
-throtl_slice_used(struct throtl_data *td, struct throtl_grp *tg, bool rw)
+static bool throtl_slice_used(struct throtl_grp *tg, bool rw)
 {
 	if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw]))
 		return 0;
@@ -439,8 +441,7 @@ throtl_slice_used(struct throtl_data *td, struct throtl_grp *tg, bool rw)
 }
 
 /* Trim the used slices and adjust slice start accordingly */
-static inline void
-throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
+static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw)
 {
 	unsigned long nr_slices, time_elapsed, io_trim;
 	u64 bytes_trim, tmp;
@@ -452,7 +453,7 @@ throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
 	 * renewed. Don't try to trim the slice if slice is used. A new
 	 * slice will start when appropriate.
 	 */
-	if (throtl_slice_used(td, tg, rw))
+	if (throtl_slice_used(tg, rw))
 		return;
 
 	/*
@@ -463,7 +464,7 @@ throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
 	 * is bad because it does not allow new slice to start.
 	 */
 
-	throtl_set_slice_end(td, tg, rw, jiffies + throtl_slice);
+	throtl_set_slice_end(tg, rw, jiffies + throtl_slice);
 
 	time_elapsed = jiffies - tg->slice_start[rw];
 
@@ -492,14 +493,14 @@ throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
 
 	tg->slice_start[rw] += nr_slices * throtl_slice;
 
-	throtl_log_tg(td, tg, "[%c] trim slice nr=%lu bytes=%llu io=%lu"
+	throtl_log_tg(tg, "[%c] trim slice nr=%lu bytes=%llu io=%lu"
 			" start=%lu end=%lu jiffies=%lu",
 			rw == READ ? 'R' : 'W', nr_slices, bytes_trim, io_trim,
 			tg->slice_start[rw], tg->slice_end[rw], jiffies);
 }
 
-static bool tg_with_in_iops_limit(struct throtl_data *td, struct throtl_grp *tg,
-		struct bio *bio, unsigned long *wait)
+static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio,
+				  unsigned long *wait)
 {
 	bool rw = bio_data_dir(bio);
 	unsigned int io_allowed;
@@ -548,8 +549,8 @@ static bool tg_with_in_iops_limit(struct throtl_data *td, struct throtl_grp *tg,
 	return 0;
 }
 
-static bool tg_with_in_bps_limit(struct throtl_data *td, struct throtl_grp *tg,
-		struct bio *bio, unsigned long *wait)
+static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio,
+				 unsigned long *wait)
 {
 	bool rw = bio_data_dir(bio);
 	u64 bytes_allowed, extra_bytes, tmp;
@@ -600,8 +601,8 @@ static bool tg_no_rule_group(struct throtl_grp *tg, bool rw) {
  * Returns whether one can dispatch a bio or not. Also returns approx number
  * of jiffies to wait before this bio is with-in IO rate and can be dispatched
  */
-static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg,
-				struct bio *bio, unsigned long *wait)
+static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
+			    unsigned long *wait)
 {
 	bool rw = bio_data_dir(bio);
 	unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0;
@@ -626,15 +627,15 @@ static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg,
 	 * existing slice to make sure it is at least throtl_slice interval
 	 * long since now.
 	 */
-	if (throtl_slice_used(td, tg, rw))
-		throtl_start_new_slice(td, tg, rw);
+	if (throtl_slice_used(tg, rw))
+		throtl_start_new_slice(tg, rw);
 	else {
 		if (time_before(tg->slice_end[rw], jiffies + throtl_slice))
-			throtl_extend_slice(td, tg, rw, jiffies + throtl_slice);
+			throtl_extend_slice(tg, rw, jiffies + throtl_slice);
 	}
 
-	if (tg_with_in_bps_limit(td, tg, bio, &bps_wait)
-	    && tg_with_in_iops_limit(td, tg, bio, &iops_wait)) {
+	if (tg_with_in_bps_limit(tg, bio, &bps_wait) &&
+	    tg_with_in_iops_limit(tg, bio, &iops_wait)) {
 		if (wait)
 			*wait = 0;
 		return 1;
@@ -646,7 +647,7 @@ static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg,
 		*wait = max_wait;
 
 	if (time_before(tg->slice_end[rw], jiffies + max_wait))
-		throtl_extend_slice(td, tg, rw, jiffies + max_wait);
+		throtl_extend_slice(tg, rw, jiffies + max_wait);
 
 	return 0;
 }
@@ -707,10 +708,10 @@ static void tg_update_disptime(struct throtl_data *td, struct throtl_grp *tg)
 	struct bio *bio;
 
 	if ((bio = bio_list_peek(&tg->bio_lists[READ])))
-		tg_may_dispatch(td, tg, bio, &read_wait);
+		tg_may_dispatch(tg, bio, &read_wait);
 
 	if ((bio = bio_list_peek(&tg->bio_lists[WRITE])))
-		tg_may_dispatch(td, tg, bio, &write_wait);
+		tg_may_dispatch(tg, bio, &write_wait);
 
 	min_wait = min(read_wait, write_wait);
 	disptime = jiffies + min_wait;
@@ -721,8 +722,8 @@ static void tg_update_disptime(struct throtl_data *td, struct throtl_grp *tg)
 	throtl_enqueue_tg(td, tg);
 }
 
-static void tg_dispatch_one_bio(struct throtl_data *td, struct throtl_grp *tg,
-				bool rw, struct bio_list *bl)
+static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw,
+				struct bio_list *bl)
 {
 	struct bio *bio;
 
@@ -731,18 +732,17 @@ static void tg_dispatch_one_bio(struct throtl_data *td, struct throtl_grp *tg,
 	/* Drop bio reference on blkg */
 	blkg_put(tg_to_blkg(tg));
 
-	BUG_ON(td->nr_queued[rw] <= 0);
-	td->nr_queued[rw]--;
+	BUG_ON(tg->td->nr_queued[rw] <= 0);
+	tg->td->nr_queued[rw]--;
 
 	throtl_charge_bio(tg, bio);
 	bio_list_add(bl, bio);
 	bio->bi_rw |= REQ_THROTTLED;
 
-	throtl_trim_slice(td, tg, rw);
+	throtl_trim_slice(tg, rw);
 }
 
-static int throtl_dispatch_tg(struct throtl_data *td, struct throtl_grp *tg,
-				struct bio_list *bl)
+static int throtl_dispatch_tg(struct throtl_grp *tg, struct bio_list *bl)
 {
 	unsigned int nr_reads = 0, nr_writes = 0;
 	unsigned int max_nr_reads = throtl_grp_quantum*3/4;
@@ -751,20 +751,20 @@ static int throtl_dispatch_tg(struct throtl_data *td, struct throtl_grp *tg,
 
 	/* Try to dispatch 75% READS and 25% WRITES */
 
-	while ((bio = bio_list_peek(&tg->bio_lists[READ]))
-		&& tg_may_dispatch(td, tg, bio, NULL)) {
+	while ((bio = bio_list_peek(&tg->bio_lists[READ])) &&
+	       tg_may_dispatch(tg, bio, NULL)) {
 
-		tg_dispatch_one_bio(td, tg, bio_data_dir(bio), bl);
+		tg_dispatch_one_bio(tg, bio_data_dir(bio), bl);
 		nr_reads++;
 
 		if (nr_reads >= max_nr_reads)
 			break;
 	}
 
-	while ((bio = bio_list_peek(&tg->bio_lists[WRITE]))
-		&& tg_may_dispatch(td, tg, bio, NULL)) {
+	while ((bio = bio_list_peek(&tg->bio_lists[WRITE])) &&
+	       tg_may_dispatch(tg, bio, NULL)) {
 
-		tg_dispatch_one_bio(td, tg, bio_data_dir(bio), bl);
+		tg_dispatch_one_bio(tg, bio_data_dir(bio), bl);
 		nr_writes++;
 
 		if (nr_writes >= max_nr_writes)
@@ -791,7 +791,7 @@ static int throtl_select_dispatch(struct throtl_data *td, struct bio_list *bl)
 
 		throtl_dequeue_tg(td, tg);
 
-		nr_disp += throtl_dispatch_tg(td, tg, bl);
+		nr_disp += throtl_dispatch_tg(tg, bl);
 
 		if (tg->nr_queued[0] || tg->nr_queued[1])
 			tg_update_disptime(td, tg);
@@ -933,7 +933,7 @@ static int tg_set_conf(struct cgroup *cgrp, struct cftype *cft, const char *buf,
 	else
 		*(unsigned int *)((void *)tg + cft->private) = ctx.v;
 
-	throtl_log_tg(td, tg, "limit change rbps=%llu wbps=%llu riops=%u wiops=%u",
+	throtl_log_tg(tg, "limit change rbps=%llu wbps=%llu riops=%u wiops=%u",
 		      tg->bps[READ], tg->bps[WRITE],
 		      tg->iops[READ], tg->iops[WRITE]);
 
@@ -945,8 +945,8 @@ static int tg_set_conf(struct cgroup *cgrp, struct cftype *cft, const char *buf,
 	 * that a group's limit are dropped suddenly and we don't want to
 	 * account recently dispatched IO with new low rate.
 	 */
-	throtl_start_new_slice(td, tg, 0);
-	throtl_start_new_slice(td, tg, 1);
+	throtl_start_new_slice(tg, 0);
+	throtl_start_new_slice(tg, 1);
 
 	if (tg->flags & THROTL_TG_PENDING) {
 		tg_update_disptime(td, tg);
@@ -1076,7 +1076,7 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
 	}
 
 	/* Bio is with-in rate limit of group */
-	if (tg_may_dispatch(td, tg, bio, NULL)) {
+	if (tg_may_dispatch(tg, bio, NULL)) {
 		throtl_charge_bio(tg, bio);
 
 		/*
@@ -1090,12 +1090,12 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
 		 *
 		 * So keep on trimming slice even if bio is not queued.
 		 */
-		throtl_trim_slice(td, tg, rw);
+		throtl_trim_slice(tg, rw);
 		goto out_unlock;
 	}
 
 queue_bio:
-	throtl_log_tg(td, tg, "[%c] bio. bdisp=%llu sz=%u bps=%llu"
+	throtl_log_tg(tg, "[%c] bio. bdisp=%llu sz=%u bps=%llu"
 			" iodisp=%u iops=%u queued=%d/%d",
 			rw == READ ? 'R' : 'W',
 			tg->bytes_disp[rw], bio->bi_size, tg->bps[rw],
@@ -1142,9 +1142,9 @@ void blk_throtl_drain(struct request_queue *q)
 		throtl_dequeue_tg(td, tg);
 
 		while ((bio = bio_list_peek(&tg->bio_lists[READ])))
-			tg_dispatch_one_bio(td, tg, bio_data_dir(bio), &bl);
+			tg_dispatch_one_bio(tg, bio_data_dir(bio), &bl);
 		while ((bio = bio_list_peek(&tg->bio_lists[WRITE])))
-			tg_dispatch_one_bio(td, tg, bio_data_dir(bio), &bl);
+			tg_dispatch_one_bio(tg, bio_data_dir(bio), &bl);
 	}
 	spin_unlock_irq(q->queue_lock);
 
-- 
cgit v0.10.2


From e2d57e60195a65e2f161ac1229ec9c91935e0240 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 14 May 2013 13:52:33 -0700
Subject: blk-throttle: pass around throtl_service_queue instead of throtl_data

throtl_service_queue will be used as the basic block to implement
hierarchy support.  Pass around throtl_service_queue *sq instead of
throtl_data *td in the following functions which will be used across
multiple levels of hierarchy.

* [__]throtl_enqueue/dequeue_tg()

* throtl_add_bio_tg()

* tg_update_disptime()

* throtl_select_dispatch()

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Vivek Goyal <vgoyal@redhat.com>

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index a489391..9660ec8 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -352,31 +352,33 @@ static void tg_service_queue_add(struct throtl_service_queue *sq,
 	rb_insert_color(&tg->rb_node, &sq->pending_tree);
 }
 
-static void __throtl_enqueue_tg(struct throtl_data *td, struct throtl_grp *tg)
+static void __throtl_enqueue_tg(struct throtl_service_queue *sq,
+				struct throtl_grp *tg)
 {
-	struct throtl_service_queue *sq = &td->service_queue;
-
 	tg_service_queue_add(sq, tg);
 	tg->flags |= THROTL_TG_PENDING;
 	sq->nr_pending++;
 }
 
-static void throtl_enqueue_tg(struct throtl_data *td, struct throtl_grp *tg)
+static void throtl_enqueue_tg(struct throtl_service_queue *sq,
+			      struct throtl_grp *tg)
 {
 	if (!(tg->flags & THROTL_TG_PENDING))
-		__throtl_enqueue_tg(td, tg);
+		__throtl_enqueue_tg(sq, tg);
 }
 
-static void __throtl_dequeue_tg(struct throtl_data *td, struct throtl_grp *tg)
+static void __throtl_dequeue_tg(struct throtl_service_queue *sq,
+				struct throtl_grp *tg)
 {
-	throtl_rb_erase(&tg->rb_node, &td->service_queue);
+	throtl_rb_erase(&tg->rb_node, sq);
 	tg->flags &= ~THROTL_TG_PENDING;
 }
 
-static void throtl_dequeue_tg(struct throtl_data *td, struct throtl_grp *tg)
+static void throtl_dequeue_tg(struct throtl_service_queue *sq,
+			      struct throtl_grp *tg)
 {
 	if (tg->flags & THROTL_TG_PENDING)
-		__throtl_dequeue_tg(td, tg);
+		__throtl_dequeue_tg(sq, tg);
 }
 
 /* Call with queue lock held */
@@ -689,8 +691,8 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
 	throtl_update_dispatch_stats(tg_to_blkg(tg), bio->bi_size, bio->bi_rw);
 }
 
-static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg,
-			struct bio *bio)
+static void throtl_add_bio_tg(struct throtl_service_queue *sq,
+			      struct throtl_grp *tg, struct bio *bio)
 {
 	bool rw = bio_data_dir(bio);
 
@@ -698,11 +700,12 @@ static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg,
 	/* Take a bio reference on tg */
 	blkg_get(tg_to_blkg(tg));
 	tg->nr_queued[rw]++;
-	td->nr_queued[rw]++;
-	throtl_enqueue_tg(td, tg);
+	tg->td->nr_queued[rw]++;
+	throtl_enqueue_tg(sq, tg);
 }
 
-static void tg_update_disptime(struct throtl_data *td, struct throtl_grp *tg)
+static void tg_update_disptime(struct throtl_service_queue *sq,
+			       struct throtl_grp *tg)
 {
 	unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime;
 	struct bio *bio;
@@ -717,9 +720,9 @@ static void tg_update_disptime(struct throtl_data *td, struct throtl_grp *tg)
 	disptime = jiffies + min_wait;
 
 	/* Update dispatch time */
-	throtl_dequeue_tg(td, tg);
+	throtl_dequeue_tg(sq, tg);
 	tg->disptime = disptime;
-	throtl_enqueue_tg(td, tg);
+	throtl_enqueue_tg(sq, tg);
 }
 
 static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw,
@@ -774,11 +777,11 @@ static int throtl_dispatch_tg(struct throtl_grp *tg, struct bio_list *bl)
 	return nr_reads + nr_writes;
 }
 
-static int throtl_select_dispatch(struct throtl_data *td, struct bio_list *bl)
+static int throtl_select_dispatch(struct throtl_service_queue *sq,
+				  struct bio_list *bl)
 {
 	unsigned int nr_disp = 0;
 	struct throtl_grp *tg;
-	struct throtl_service_queue *sq = &td->service_queue;
 
 	while (1) {
 		tg = throtl_rb_first(sq);
@@ -789,12 +792,12 @@ static int throtl_select_dispatch(struct throtl_data *td, struct bio_list *bl)
 		if (time_before(jiffies, tg->disptime))
 			break;
 
-		throtl_dequeue_tg(td, tg);
+		throtl_dequeue_tg(sq, tg);
 
 		nr_disp += throtl_dispatch_tg(tg, bl);
 
 		if (tg->nr_queued[0] || tg->nr_queued[1])
-			tg_update_disptime(td, tg);
+			tg_update_disptime(sq, tg);
 
 		if (nr_disp >= throtl_quantum)
 			break;
@@ -822,7 +825,7 @@ void blk_throtl_dispatch_work_fn(struct work_struct *work)
 		   td->nr_queued[READ] + td->nr_queued[WRITE],
 		   td->nr_queued[READ], td->nr_queued[WRITE]);
 
-	nr_disp = throtl_select_dispatch(td, &bio_list_on_stack);
+	nr_disp = throtl_select_dispatch(&td->service_queue, &bio_list_on_stack);
 
 	if (nr_disp)
 		throtl_log(td, "bios disp=%u", nr_disp);
@@ -949,7 +952,7 @@ static int tg_set_conf(struct cgroup *cgrp, struct cftype *cft, const char *buf,
 	throtl_start_new_slice(tg, 1);
 
 	if (tg->flags & THROTL_TG_PENDING) {
-		tg_update_disptime(td, tg);
+		tg_update_disptime(&td->service_queue, tg);
 		throtl_schedule_next_dispatch(td);
 	}
 
@@ -1103,11 +1106,11 @@ queue_bio:
 			tg->nr_queued[READ], tg->nr_queued[WRITE]);
 
 	bio_associate_current(bio);
-	throtl_add_bio_tg(q->td, tg, bio);
+	throtl_add_bio_tg(&q->td->service_queue, tg, bio);
 	throttled = true;
 
 	if (update_disptime) {
-		tg_update_disptime(td, tg);
+		tg_update_disptime(&td->service_queue, tg);
 		throtl_schedule_next_dispatch(td);
 	}
 
@@ -1139,7 +1142,7 @@ void blk_throtl_drain(struct request_queue *q)
 	bio_list_init(&bl);
 
 	while ((tg = throtl_rb_first(sq))) {
-		throtl_dequeue_tg(td, tg);
+		throtl_dequeue_tg(sq, tg);
 
 		while ((bio = bio_list_peek(&tg->bio_lists[READ])))
 			tg_dispatch_one_bio(tg, bio_data_dir(bio), &bl);
-- 
cgit v0.10.2


From 0049af73bb4b74d1407db59caefc5fe057ee434a Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 14 May 2013 13:52:33 -0700
Subject: blk-throttle: reorganize throtl_service_queue passed around as
 argument

throtl_service_queue will be the building block of hierarchy support
and will form a tree.  This patch updates its usages as arguments to
reduce confusion.

* When a service queue is used as the parent role - the host of the
  rbtree - use @parent_sq instead of @sq.

* For functions taking both @tg and @parent_sq, reorder them so that
  the order is (@tg, @parent_sq) not the other way around.  This makes
  the code follow the usual convention of specifying the primary
  target of the operation as the first argument.

This patch doesn't make any functional differences.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Vivek Goyal <vgoyal@redhat.com>

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 9660ec8..ebaaaa9 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -284,17 +284,18 @@ static struct throtl_grp *throtl_lookup_create_tg(struct throtl_data *td,
 	return tg;
 }
 
-static struct throtl_grp *throtl_rb_first(struct throtl_service_queue *sq)
+static struct throtl_grp *
+throtl_rb_first(struct throtl_service_queue *parent_sq)
 {
 	/* Service tree is empty */
-	if (!sq->nr_pending)
+	if (!parent_sq->nr_pending)
 		return NULL;
 
-	if (!sq->first_pending)
-		sq->first_pending = rb_first(&sq->pending_tree);
+	if (!parent_sq->first_pending)
+		parent_sq->first_pending = rb_first(&parent_sq->pending_tree);
 
-	if (sq->first_pending)
-		return rb_entry_tg(sq->first_pending);
+	if (parent_sq->first_pending)
+		return rb_entry_tg(parent_sq->first_pending);
 
 	return NULL;
 }
@@ -305,29 +306,30 @@ static void rb_erase_init(struct rb_node *n, struct rb_root *root)
 	RB_CLEAR_NODE(n);
 }
 
-static void throtl_rb_erase(struct rb_node *n, struct throtl_service_queue *sq)
+static void throtl_rb_erase(struct rb_node *n,
+			    struct throtl_service_queue *parent_sq)
 {
-	if (sq->first_pending == n)
-		sq->first_pending = NULL;
-	rb_erase_init(n, &sq->pending_tree);
-	--sq->nr_pending;
+	if (parent_sq->first_pending == n)
+		parent_sq->first_pending = NULL;
+	rb_erase_init(n, &parent_sq->pending_tree);
+	--parent_sq->nr_pending;
 }
 
-static void update_min_dispatch_time(struct throtl_service_queue *sq)
+static void update_min_dispatch_time(struct throtl_service_queue *parent_sq)
 {
 	struct throtl_grp *tg;
 
-	tg = throtl_rb_first(sq);
+	tg = throtl_rb_first(parent_sq);
 	if (!tg)
 		return;
 
-	sq->first_pending_disptime = tg->disptime;
+	parent_sq->first_pending_disptime = tg->disptime;
 }
 
-static void tg_service_queue_add(struct throtl_service_queue *sq,
-				 struct throtl_grp *tg)
+static void tg_service_queue_add(struct throtl_grp *tg,
+				 struct throtl_service_queue *parent_sq)
 {
-	struct rb_node **node = &sq->pending_tree.rb_node;
+	struct rb_node **node = &parent_sq->pending_tree.rb_node;
 	struct rb_node *parent = NULL;
 	struct throtl_grp *__tg;
 	unsigned long key = tg->disptime;
@@ -346,39 +348,39 @@ static void tg_service_queue_add(struct throtl_service_queue *sq,
 	}
 
 	if (left)
-		sq->first_pending = &tg->rb_node;
+		parent_sq->first_pending = &tg->rb_node;
 
 	rb_link_node(&tg->rb_node, parent, node);
-	rb_insert_color(&tg->rb_node, &sq->pending_tree);
+	rb_insert_color(&tg->rb_node, &parent_sq->pending_tree);
 }
 
-static void __throtl_enqueue_tg(struct throtl_service_queue *sq,
-				struct throtl_grp *tg)
+static void __throtl_enqueue_tg(struct throtl_grp *tg,
+				struct throtl_service_queue *parent_sq)
 {
-	tg_service_queue_add(sq, tg);
+	tg_service_queue_add(tg, parent_sq);
 	tg->flags |= THROTL_TG_PENDING;
-	sq->nr_pending++;
+	parent_sq->nr_pending++;
 }
 
-static void throtl_enqueue_tg(struct throtl_service_queue *sq,
-			      struct throtl_grp *tg)
+static void throtl_enqueue_tg(struct throtl_grp *tg,
+			      struct throtl_service_queue *parent_sq)
 {
 	if (!(tg->flags & THROTL_TG_PENDING))
-		__throtl_enqueue_tg(sq, tg);
+		__throtl_enqueue_tg(tg, parent_sq);
 }
 
-static void __throtl_dequeue_tg(struct throtl_service_queue *sq,
-				struct throtl_grp *tg)
+static void __throtl_dequeue_tg(struct throtl_grp *tg,
+				struct throtl_service_queue *parent_sq)
 {
-	throtl_rb_erase(&tg->rb_node, sq);
+	throtl_rb_erase(&tg->rb_node, parent_sq);
 	tg->flags &= ~THROTL_TG_PENDING;
 }
 
-static void throtl_dequeue_tg(struct throtl_service_queue *sq,
-			      struct throtl_grp *tg)
+static void throtl_dequeue_tg(struct throtl_grp *tg,
+			      struct throtl_service_queue *parent_sq)
 {
 	if (tg->flags & THROTL_TG_PENDING)
-		__throtl_dequeue_tg(sq, tg);
+		__throtl_dequeue_tg(tg, parent_sq);
 }
 
 /* Call with queue lock held */
@@ -691,8 +693,8 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
 	throtl_update_dispatch_stats(tg_to_blkg(tg), bio->bi_size, bio->bi_rw);
 }
 
-static void throtl_add_bio_tg(struct throtl_service_queue *sq,
-			      struct throtl_grp *tg, struct bio *bio)
+static void throtl_add_bio_tg(struct bio *bio, struct throtl_grp *tg,
+			      struct throtl_service_queue *parent_sq)
 {
 	bool rw = bio_data_dir(bio);
 
@@ -701,11 +703,11 @@ static void throtl_add_bio_tg(struct throtl_service_queue *sq,
 	blkg_get(tg_to_blkg(tg));
 	tg->nr_queued[rw]++;
 	tg->td->nr_queued[rw]++;
-	throtl_enqueue_tg(sq, tg);
+	throtl_enqueue_tg(tg, parent_sq);
 }
 
-static void tg_update_disptime(struct throtl_service_queue *sq,
-			       struct throtl_grp *tg)
+static void tg_update_disptime(struct throtl_grp *tg,
+			       struct throtl_service_queue *parent_sq)
 {
 	unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime;
 	struct bio *bio;
@@ -720,9 +722,9 @@ static void tg_update_disptime(struct throtl_service_queue *sq,
 	disptime = jiffies + min_wait;
 
 	/* Update dispatch time */
-	throtl_dequeue_tg(sq, tg);
+	throtl_dequeue_tg(tg, parent_sq);
 	tg->disptime = disptime;
-	throtl_enqueue_tg(sq, tg);
+	throtl_enqueue_tg(tg, parent_sq);
 }
 
 static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw,
@@ -777,14 +779,14 @@ static int throtl_dispatch_tg(struct throtl_grp *tg, struct bio_list *bl)
 	return nr_reads + nr_writes;
 }
 
-static int throtl_select_dispatch(struct throtl_service_queue *sq,
+static int throtl_select_dispatch(struct throtl_service_queue *parent_sq,
 				  struct bio_list *bl)
 {
 	unsigned int nr_disp = 0;
 	struct throtl_grp *tg;
 
 	while (1) {
-		tg = throtl_rb_first(sq);
+		tg = throtl_rb_first(parent_sq);
 
 		if (!tg)
 			break;
@@ -792,12 +794,12 @@ static int throtl_select_dispatch(struct throtl_service_queue *sq,
 		if (time_before(jiffies, tg->disptime))
 			break;
 
-		throtl_dequeue_tg(sq, tg);
+		throtl_dequeue_tg(tg, parent_sq);
 
 		nr_disp += throtl_dispatch_tg(tg, bl);
 
 		if (tg->nr_queued[0] || tg->nr_queued[1])
-			tg_update_disptime(sq, tg);
+			tg_update_disptime(tg, parent_sq);
 
 		if (nr_disp >= throtl_quantum)
 			break;
@@ -952,7 +954,7 @@ static int tg_set_conf(struct cgroup *cgrp, struct cftype *cft, const char *buf,
 	throtl_start_new_slice(tg, 1);
 
 	if (tg->flags & THROTL_TG_PENDING) {
-		tg_update_disptime(&td->service_queue, tg);
+		tg_update_disptime(tg, &td->service_queue);
 		throtl_schedule_next_dispatch(td);
 	}
 
@@ -1106,11 +1108,11 @@ queue_bio:
 			tg->nr_queued[READ], tg->nr_queued[WRITE]);
 
 	bio_associate_current(bio);
-	throtl_add_bio_tg(&q->td->service_queue, tg, bio);
+	throtl_add_bio_tg(bio, tg, &q->td->service_queue);
 	throttled = true;
 
 	if (update_disptime) {
-		tg_update_disptime(&td->service_queue, tg);
+		tg_update_disptime(tg, &td->service_queue);
 		throtl_schedule_next_dispatch(td);
 	}
 
@@ -1132,7 +1134,7 @@ void blk_throtl_drain(struct request_queue *q)
 	__releases(q->queue_lock) __acquires(q->queue_lock)
 {
 	struct throtl_data *td = q->td;
-	struct throtl_service_queue *sq = &td->service_queue;
+	struct throtl_service_queue *parent_sq = &td->service_queue;
 	struct throtl_grp *tg;
 	struct bio_list bl;
 	struct bio *bio;
@@ -1141,8 +1143,8 @@ void blk_throtl_drain(struct request_queue *q)
 
 	bio_list_init(&bl);
 
-	while ((tg = throtl_rb_first(sq))) {
-		throtl_dequeue_tg(sq, tg);
+	while ((tg = throtl_rb_first(parent_sq))) {
+		throtl_dequeue_tg(tg, parent_sq);
 
 		while ((bio = bio_list_peek(&tg->bio_lists[READ])))
 			tg_dispatch_one_bio(tg, bio_data_dir(bio), &bl);
-- 
cgit v0.10.2


From 49a2f1e3f231f6b2ccfc8192f4c395de7fa910a1 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 14 May 2013 13:52:34 -0700
Subject: blk-throttle: add throtl_grp->service_queue

Currently, there's single service_queue per queue -
throtl_data->service_queue.  All active throtl_grp's are queued on the
queue and dispatched according to their limits.  To support hierarchy,
this will be expanded such that active throtl_grp's form a tree
anchored at throtl_data->service_queue and chained through each
intermediate throtl_grp's service_queue.

This patch adds throtl_grp->service_queue to prepare for hierarchy
support.  The initialization function - throtl_service_queue_init() -
is added and replaces the macro initializer.  The newly added
tg->service_queue isn't used yet.  Following patches will do.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Vivek Goyal <vgoyal@redhat.com>

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index ebaaaa9..7340440 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -33,9 +33,6 @@ struct throtl_service_queue {
 	unsigned long		first_pending_disptime;	/* disptime of the first tg */
 };
 
-#define THROTL_SERVICE_QUEUE_INITIALIZER				\
-	(struct throtl_service_queue){ .pending_tree = RB_ROOT }
-
 enum tg_state_flags {
 	THROTL_TG_PENDING	= 1 << 0,	/* on parent's pending tree */
 };
@@ -60,6 +57,9 @@ struct throtl_grp {
 	/* throtl_data this group belongs to */
 	struct throtl_data *td;
 
+	/* this group's service queue */
+	struct throtl_service_queue service_queue;
+
 	/*
 	 * Dispatch time in jiffies. This is the estimated time when group
 	 * will unthrottle and is ready to dispatch more bio. It is used as
@@ -190,11 +190,18 @@ alloc_stats:
 		goto alloc_stats;
 }
 
+/* init a service_queue, assumes the caller zeroed it */
+static void throtl_service_queue_init(struct throtl_service_queue *sq)
+{
+	sq->pending_tree = RB_ROOT;
+}
+
 static void throtl_pd_init(struct blkcg_gq *blkg)
 {
 	struct throtl_grp *tg = blkg_to_tg(blkg);
 	unsigned long flags;
 
+	throtl_service_queue_init(&tg->service_queue);
 	RB_CLEAR_NODE(&tg->rb_node);
 	tg->td = blkg->q->td;
 	bio_list_init(&tg->bio_lists[0]);
@@ -1168,8 +1175,8 @@ int blk_throtl_init(struct request_queue *q)
 	if (!td)
 		return -ENOMEM;
 
-	td->service_queue = THROTL_SERVICE_QUEUE_INITIALIZER;
 	INIT_DELAYED_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn);
+	throtl_service_queue_init(&td->service_queue);
 
 	q->td = td;
 	td->queue = q;
-- 
cgit v0.10.2


From 73f0d49a9637a7ec3448a62a0042e35b14ba18a3 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 14 May 2013 13:52:35 -0700
Subject: blk-throttle: move bio_lists[] and friends to throtl_service_queue

throtl_service_queues will eventually form a tree which is anchored at
throtl_data->service_queue and queue bios will climb the tree to the
top service_queue to be executed.

This patch moves bio_lists[] and nr_queued[] from throtl_grp to its
service_queue to prepare for that.  As currently only the
throtl_data->service_queue is in use, this patch just ends up moving
throtl_grp->bio_lists[] and ->nr_queued[] to
throtl_grp->service_queue.bio_lists[] and ->nr_queued[] without making
any functional differences.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Vivek Goyal <vgoyal@redhat.com>

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 7340440..6f57f94 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -27,6 +27,17 @@ static struct blkcg_policy blkcg_policy_throtl;
 static struct workqueue_struct *kthrotld_workqueue;
 
 struct throtl_service_queue {
+	/*
+	 * Bios queued directly to this service_queue or dispatched from
+	 * children throtl_grp's.
+	 */
+	struct bio_list		bio_lists[2];	/* queued bios [READ/WRITE] */
+	unsigned int		nr_queued[2];	/* number of queued bios */
+
+	/*
+	 * RB tree of active children throtl_grp's, which are sorted by
+	 * their ->disptime.
+	 */
 	struct rb_root		pending_tree;	/* RB tree of active tgs */
 	struct rb_node		*first_pending;	/* first node in the tree */
 	unsigned int		nr_pending;	/* # queued in the tree */
@@ -69,12 +80,6 @@ struct throtl_grp {
 
 	unsigned int flags;
 
-	/* Two lists for READ and WRITE */
-	struct bio_list bio_lists[2];
-
-	/* Number of queued bios on READ and WRITE lists */
-	unsigned int nr_queued[2];
-
 	/* bytes per second rate limits */
 	uint64_t bps[2];
 
@@ -193,6 +198,8 @@ alloc_stats:
 /* init a service_queue, assumes the caller zeroed it */
 static void throtl_service_queue_init(struct throtl_service_queue *sq)
 {
+	bio_list_init(&sq->bio_lists[0]);
+	bio_list_init(&sq->bio_lists[1]);
 	sq->pending_tree = RB_ROOT;
 }
 
@@ -204,8 +211,6 @@ static void throtl_pd_init(struct blkcg_gq *blkg)
 	throtl_service_queue_init(&tg->service_queue);
 	RB_CLEAR_NODE(&tg->rb_node);
 	tg->td = blkg->q->td;
-	bio_list_init(&tg->bio_lists[0]);
-	bio_list_init(&tg->bio_lists[1]);
 
 	tg->bps[READ] = -1;
 	tg->bps[WRITE] = -1;
@@ -624,7 +629,8 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
 	 * this function with a different bio if there are other bios
 	 * queued.
 	 */
-	BUG_ON(tg->nr_queued[rw] && bio != bio_list_peek(&tg->bio_lists[rw]));
+	BUG_ON(tg->service_queue.nr_queued[rw] &&
+	       bio != bio_list_peek(&tg->service_queue.bio_lists[rw]));
 
 	/* If tg->bps = -1, then BW is unlimited */
 	if (tg->bps[rw] == -1 && tg->iops[rw] == -1) {
@@ -703,12 +709,13 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
 static void throtl_add_bio_tg(struct bio *bio, struct throtl_grp *tg,
 			      struct throtl_service_queue *parent_sq)
 {
+	struct throtl_service_queue *sq = &tg->service_queue;
 	bool rw = bio_data_dir(bio);
 
-	bio_list_add(&tg->bio_lists[rw], bio);
+	bio_list_add(&sq->bio_lists[rw], bio);
 	/* Take a bio reference on tg */
 	blkg_get(tg_to_blkg(tg));
-	tg->nr_queued[rw]++;
+	sq->nr_queued[rw]++;
 	tg->td->nr_queued[rw]++;
 	throtl_enqueue_tg(tg, parent_sq);
 }
@@ -716,13 +723,14 @@ static void throtl_add_bio_tg(struct bio *bio, struct throtl_grp *tg,
 static void tg_update_disptime(struct throtl_grp *tg,
 			       struct throtl_service_queue *parent_sq)
 {
+	struct throtl_service_queue *sq = &tg->service_queue;
 	unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime;
 	struct bio *bio;
 
-	if ((bio = bio_list_peek(&tg->bio_lists[READ])))
+	if ((bio = bio_list_peek(&sq->bio_lists[READ])))
 		tg_may_dispatch(tg, bio, &read_wait);
 
-	if ((bio = bio_list_peek(&tg->bio_lists[WRITE])))
+	if ((bio = bio_list_peek(&sq->bio_lists[WRITE])))
 		tg_may_dispatch(tg, bio, &write_wait);
 
 	min_wait = min(read_wait, write_wait);
@@ -737,10 +745,11 @@ static void tg_update_disptime(struct throtl_grp *tg,
 static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw,
 				struct bio_list *bl)
 {
+	struct throtl_service_queue *sq = &tg->service_queue;
 	struct bio *bio;
 
-	bio = bio_list_pop(&tg->bio_lists[rw]);
-	tg->nr_queued[rw]--;
+	bio = bio_list_pop(&sq->bio_lists[rw]);
+	sq->nr_queued[rw]--;
 	/* Drop bio reference on blkg */
 	blkg_put(tg_to_blkg(tg));
 
@@ -756,6 +765,7 @@ static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw,
 
 static int throtl_dispatch_tg(struct throtl_grp *tg, struct bio_list *bl)
 {
+	struct throtl_service_queue *sq = &tg->service_queue;
 	unsigned int nr_reads = 0, nr_writes = 0;
 	unsigned int max_nr_reads = throtl_grp_quantum*3/4;
 	unsigned int max_nr_writes = throtl_grp_quantum - max_nr_reads;
@@ -763,7 +773,7 @@ static int throtl_dispatch_tg(struct throtl_grp *tg, struct bio_list *bl)
 
 	/* Try to dispatch 75% READS and 25% WRITES */
 
-	while ((bio = bio_list_peek(&tg->bio_lists[READ])) &&
+	while ((bio = bio_list_peek(&sq->bio_lists[READ])) &&
 	       tg_may_dispatch(tg, bio, NULL)) {
 
 		tg_dispatch_one_bio(tg, bio_data_dir(bio), bl);
@@ -773,7 +783,7 @@ static int throtl_dispatch_tg(struct throtl_grp *tg, struct bio_list *bl)
 			break;
 	}
 
-	while ((bio = bio_list_peek(&tg->bio_lists[WRITE])) &&
+	while ((bio = bio_list_peek(&sq->bio_lists[WRITE])) &&
 	       tg_may_dispatch(tg, bio, NULL)) {
 
 		tg_dispatch_one_bio(tg, bio_data_dir(bio), bl);
@@ -790,10 +800,10 @@ static int throtl_select_dispatch(struct throtl_service_queue *parent_sq,
 				  struct bio_list *bl)
 {
 	unsigned int nr_disp = 0;
-	struct throtl_grp *tg;
 
 	while (1) {
-		tg = throtl_rb_first(parent_sq);
+		struct throtl_grp *tg = throtl_rb_first(parent_sq);
+		struct throtl_service_queue *sq = &tg->service_queue;
 
 		if (!tg)
 			break;
@@ -805,7 +815,7 @@ static int throtl_select_dispatch(struct throtl_service_queue *parent_sq,
 
 		nr_disp += throtl_dispatch_tg(tg, bl);
 
-		if (tg->nr_queued[0] || tg->nr_queued[1])
+		if (sq->nr_queued[0] || sq->nr_queued[1])
 			tg_update_disptime(tg, parent_sq);
 
 		if (nr_disp >= throtl_quantum)
@@ -1043,6 +1053,7 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
 {
 	struct throtl_data *td = q->td;
 	struct throtl_grp *tg;
+	struct throtl_service_queue *sq;
 	bool rw = bio_data_dir(bio), update_disptime = true;
 	struct blkcg *blkcg;
 	bool throttled = false;
@@ -1077,7 +1088,9 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
 	if (unlikely(!tg))
 		goto out_unlock;
 
-	if (tg->nr_queued[rw]) {
+	sq = &tg->service_queue;
+
+	if (sq->nr_queued[rw]) {
 		/*
 		 * There is already another bio queued in same dir. No
 		 * need to update dispatch time.
@@ -1112,7 +1125,7 @@ queue_bio:
 			rw == READ ? 'R' : 'W',
 			tg->bytes_disp[rw], bio->bi_size, tg->bps[rw],
 			tg->io_disp[rw], tg->iops[rw],
-			tg->nr_queued[READ], tg->nr_queued[WRITE]);
+			sq->nr_queued[READ], sq->nr_queued[WRITE]);
 
 	bio_associate_current(bio);
 	throtl_add_bio_tg(bio, tg, &q->td->service_queue);
@@ -1151,11 +1164,13 @@ void blk_throtl_drain(struct request_queue *q)
 	bio_list_init(&bl);
 
 	while ((tg = throtl_rb_first(parent_sq))) {
+		struct throtl_service_queue *sq = &tg->service_queue;
+
 		throtl_dequeue_tg(tg, parent_sq);
 
-		while ((bio = bio_list_peek(&tg->bio_lists[READ])))
+		while ((bio = bio_list_peek(&sq->bio_lists[READ])))
 			tg_dispatch_one_bio(tg, bio_data_dir(bio), &bl);
-		while ((bio = bio_list_peek(&tg->bio_lists[WRITE])))
+		while ((bio = bio_list_peek(&sq->bio_lists[WRITE])))
 			tg_dispatch_one_bio(tg, bio_data_dir(bio), &bl);
 	}
 	spin_unlock_irq(q->queue_lock);
-- 
cgit v0.10.2


From 651930bc1c2a2550fde93a8cfa1a201c363a0ca1 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 14 May 2013 13:52:35 -0700
Subject: blk-throttle: dispatch to throtl_data->service_queue.bio_lists[]

throtl_service_queues will eventually form a tree which is anchored at
throtl_data->service_queue and queue bios will climb the tree to the
top service_queue to be executed.

This patch makes the dispatch paths in blk_throtl_dispatch_work_fn()
and blk_throtl_drain() to dispatch bios to
throtl_data->service_queue.bio_lists[] instead of the on-stack
bio_lists.  This will keep the final dispatch to the top level
service_queue share the same mechanism as dispatches through the rest
of the hierarchy.

As bio's should be issued in a sleepable context,
blk_throtl_dispatch_work_fn() transfers all dispatched bio's from the
service_queue bio_lists[] into an onstack one before dropping
queue_lock and issuing the bio's.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Vivek Goyal <vgoyal@redhat.com>

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 6f57f94..154bd63 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -743,7 +743,7 @@ static void tg_update_disptime(struct throtl_grp *tg,
 }
 
 static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw,
-				struct bio_list *bl)
+				struct throtl_service_queue *parent_sq)
 {
 	struct throtl_service_queue *sq = &tg->service_queue;
 	struct bio *bio;
@@ -757,13 +757,14 @@ static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw,
 	tg->td->nr_queued[rw]--;
 
 	throtl_charge_bio(tg, bio);
-	bio_list_add(bl, bio);
+	bio_list_add(&parent_sq->bio_lists[rw], bio);
 	bio->bi_rw |= REQ_THROTTLED;
 
 	throtl_trim_slice(tg, rw);
 }
 
-static int throtl_dispatch_tg(struct throtl_grp *tg, struct bio_list *bl)
+static int throtl_dispatch_tg(struct throtl_grp *tg,
+			      struct throtl_service_queue *parent_sq)
 {
 	struct throtl_service_queue *sq = &tg->service_queue;
 	unsigned int nr_reads = 0, nr_writes = 0;
@@ -776,7 +777,7 @@ static int throtl_dispatch_tg(struct throtl_grp *tg, struct bio_list *bl)
 	while ((bio = bio_list_peek(&sq->bio_lists[READ])) &&
 	       tg_may_dispatch(tg, bio, NULL)) {
 
-		tg_dispatch_one_bio(tg, bio_data_dir(bio), bl);
+		tg_dispatch_one_bio(tg, bio_data_dir(bio), parent_sq);
 		nr_reads++;
 
 		if (nr_reads >= max_nr_reads)
@@ -786,7 +787,7 @@ static int throtl_dispatch_tg(struct throtl_grp *tg, struct bio_list *bl)
 	while ((bio = bio_list_peek(&sq->bio_lists[WRITE])) &&
 	       tg_may_dispatch(tg, bio, NULL)) {
 
-		tg_dispatch_one_bio(tg, bio_data_dir(bio), bl);
+		tg_dispatch_one_bio(tg, bio_data_dir(bio), parent_sq);
 		nr_writes++;
 
 		if (nr_writes >= max_nr_writes)
@@ -796,8 +797,7 @@ static int throtl_dispatch_tg(struct throtl_grp *tg, struct bio_list *bl)
 	return nr_reads + nr_writes;
 }
 
-static int throtl_select_dispatch(struct throtl_service_queue *parent_sq,
-				  struct bio_list *bl)
+static int throtl_select_dispatch(struct throtl_service_queue *parent_sq)
 {
 	unsigned int nr_disp = 0;
 
@@ -813,7 +813,7 @@ static int throtl_select_dispatch(struct throtl_service_queue *parent_sq,
 
 		throtl_dequeue_tg(tg, parent_sq);
 
-		nr_disp += throtl_dispatch_tg(tg, bl);
+		nr_disp += throtl_dispatch_tg(tg, parent_sq);
 
 		if (sq->nr_queued[0] || sq->nr_queued[1])
 			tg_update_disptime(tg, parent_sq);
@@ -830,11 +830,13 @@ void blk_throtl_dispatch_work_fn(struct work_struct *work)
 {
 	struct throtl_data *td = container_of(to_delayed_work(work),
 					      struct throtl_data, dispatch_work);
+	struct throtl_service_queue *sq = &td->service_queue;
 	struct request_queue *q = td->queue;
 	unsigned int nr_disp = 0;
 	struct bio_list bio_list_on_stack;
 	struct bio *bio;
 	struct blk_plug plug;
+	int rw;
 
 	spin_lock_irq(q->queue_lock);
 
@@ -844,10 +846,15 @@ void blk_throtl_dispatch_work_fn(struct work_struct *work)
 		   td->nr_queued[READ] + td->nr_queued[WRITE],
 		   td->nr_queued[READ], td->nr_queued[WRITE]);
 
-	nr_disp = throtl_select_dispatch(&td->service_queue, &bio_list_on_stack);
+	nr_disp = throtl_select_dispatch(sq);
 
-	if (nr_disp)
+	if (nr_disp) {
+		for (rw = READ; rw <= WRITE; rw++) {
+			bio_list_merge(&bio_list_on_stack, &sq->bio_lists[rw]);
+			bio_list_init(&sq->bio_lists[rw]);
+		}
 		throtl_log(td, "bios disp=%u", nr_disp);
+	}
 
 	throtl_schedule_next_dispatch(td);
 
@@ -1156,27 +1163,26 @@ void blk_throtl_drain(struct request_queue *q)
 	struct throtl_data *td = q->td;
 	struct throtl_service_queue *parent_sq = &td->service_queue;
 	struct throtl_grp *tg;
-	struct bio_list bl;
 	struct bio *bio;
+	int rw;
 
 	queue_lockdep_assert_held(q);
 
-	bio_list_init(&bl);
-
 	while ((tg = throtl_rb_first(parent_sq))) {
 		struct throtl_service_queue *sq = &tg->service_queue;
 
 		throtl_dequeue_tg(tg, parent_sq);
 
 		while ((bio = bio_list_peek(&sq->bio_lists[READ])))
-			tg_dispatch_one_bio(tg, bio_data_dir(bio), &bl);
+			tg_dispatch_one_bio(tg, bio_data_dir(bio), parent_sq);
 		while ((bio = bio_list_peek(&sq->bio_lists[WRITE])))
-			tg_dispatch_one_bio(tg, bio_data_dir(bio), &bl);
+			tg_dispatch_one_bio(tg, bio_data_dir(bio), parent_sq);
 	}
 	spin_unlock_irq(q->queue_lock);
 
-	while ((bio = bio_list_pop(&bl)))
-		generic_make_request(bio);
+	for (rw = READ; rw <= WRITE; rw++)
+		while ((bio = bio_list_pop(&parent_sq->bio_lists[rw])))
+			generic_make_request(bio);
 
 	spin_lock_irq(q->queue_lock);
 }
-- 
cgit v0.10.2


From 0e9f4164ba915052918a77ecb2a59822dbfd661c Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 14 May 2013 13:52:35 -0700
Subject: blk-throttle: generalize update_disptime optimization in
 blk_throtl_bio()

When blk_throtl_bio() wants to queue a bio to a tg (throtl_grp), it
avoids invoking tg_update_disptime() and
throtl_schedule_next_dispatch() if the tg already has bios queued in
that direction.  As a new bio is appeneded after the existing ones, it
can't change the tg's next dispatch time or the parent's dispatch
schedule.

This optimization is currently open coded in blk_throtl_bio().
Whether the target biolist was occupied was recorded in a local
variable and later used to skip disptime update.  This patch moves
generalizes it so that throtl_add_bio_tg() sets a new flag
THROTL_TG_WAS_EMPTY if the biolist was empty before the new bio was
added.  tg_update_disptime() clears the flag automatically.
blk_throtl_bio() is updated to simply test the flag before updating
disptime.

This patch doesn't make any functional differences now but will enable
using the same optimization for recursive dispatch.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Vivek Goyal <vgoyal@redhat.com>

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 154bd63..ec9397f 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -46,6 +46,7 @@ struct throtl_service_queue {
 
 enum tg_state_flags {
 	THROTL_TG_PENDING	= 1 << 0,	/* on parent's pending tree */
+	THROTL_TG_WAS_EMPTY	= 1 << 1,	/* bio_lists[] became non-empty */
 };
 
 #define rb_entry_tg(node)	rb_entry((node), struct throtl_grp, rb_node)
@@ -712,6 +713,15 @@ static void throtl_add_bio_tg(struct bio *bio, struct throtl_grp *tg,
 	struct throtl_service_queue *sq = &tg->service_queue;
 	bool rw = bio_data_dir(bio);
 
+	/*
+	 * If @tg doesn't currently have any bios queued in the same
+	 * direction, queueing @bio can change when @tg should be
+	 * dispatched.  Mark that @tg was empty.  This is automatically
+	 * cleaered on the next tg_update_disptime().
+	 */
+	if (!sq->nr_queued[rw])
+		tg->flags |= THROTL_TG_WAS_EMPTY;
+
 	bio_list_add(&sq->bio_lists[rw], bio);
 	/* Take a bio reference on tg */
 	blkg_get(tg_to_blkg(tg));
@@ -740,6 +750,9 @@ static void tg_update_disptime(struct throtl_grp *tg,
 	throtl_dequeue_tg(tg, parent_sq);
 	tg->disptime = disptime;
 	throtl_enqueue_tg(tg, parent_sq);
+
+	/* see throtl_add_bio_tg() */
+	tg->flags &= ~THROTL_TG_WAS_EMPTY;
 }
 
 static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw,
@@ -1061,7 +1074,7 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
 	struct throtl_data *td = q->td;
 	struct throtl_grp *tg;
 	struct throtl_service_queue *sq;
-	bool rw = bio_data_dir(bio), update_disptime = true;
+	bool rw = bio_data_dir(bio);
 	struct blkcg *blkcg;
 	bool throttled = false;
 
@@ -1097,16 +1110,10 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
 
 	sq = &tg->service_queue;
 
-	if (sq->nr_queued[rw]) {
-		/*
-		 * There is already another bio queued in same dir. No
-		 * need to update dispatch time.
-		 */
-		update_disptime = false;
+	/* throtl is FIFO - if other bios are already queued, should queue */
+	if (sq->nr_queued[rw])
 		goto queue_bio;
 
-	}
-
 	/* Bio is with-in rate limit of group */
 	if (tg_may_dispatch(tg, bio, NULL)) {
 		throtl_charge_bio(tg, bio);
@@ -1138,7 +1145,8 @@ queue_bio:
 	throtl_add_bio_tg(bio, tg, &q->td->service_queue);
 	throttled = true;
 
-	if (update_disptime) {
+	/* update @tg's dispatch time if @tg was empty before @bio */
+	if (tg->flags & THROTL_TG_WAS_EMPTY) {
 		tg_update_disptime(tg, &td->service_queue);
 		throtl_schedule_next_dispatch(td);
 	}
-- 
cgit v0.10.2


From 77216b0484817817a18aaa6b089dffe49070f7c1 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 14 May 2013 13:52:36 -0700
Subject: blk-throttle: add throtl_service_queue->parent_sq

To prepare for hierarchy support, this patch adds
throtl_service_queue->service_sq which points to the arent
service_queue.  Currently, for all service_queues embedded in
throtl_grps, it points to throtl_data->service_queue.  As
throtl_data->service_queue doesn't have a parent its parent_sq is set
to NULL.

There are a number of functions which take both throtl_grp *tg and
throtl_service_queue *parent_sq.  With this patch, the parent
service_queue can be determined from @tg and the @parent_sq arguments
are removed.

This patch doesn't make any behavior differences.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Vivek Goyal <vgoyal@redhat.com>

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index ec9397f..00cfdd0 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -27,6 +27,8 @@ static struct blkcg_policy blkcg_policy_throtl;
 static struct workqueue_struct *kthrotld_workqueue;
 
 struct throtl_service_queue {
+	struct throtl_service_queue *parent_sq;	/* the parent service_queue */
+
 	/*
 	 * Bios queued directly to this service_queue or dispatched from
 	 * children throtl_grp's.
@@ -197,21 +199,24 @@ alloc_stats:
 }
 
 /* init a service_queue, assumes the caller zeroed it */
-static void throtl_service_queue_init(struct throtl_service_queue *sq)
+static void throtl_service_queue_init(struct throtl_service_queue *sq,
+				      struct throtl_service_queue *parent_sq)
 {
 	bio_list_init(&sq->bio_lists[0]);
 	bio_list_init(&sq->bio_lists[1]);
 	sq->pending_tree = RB_ROOT;
+	sq->parent_sq = parent_sq;
 }
 
 static void throtl_pd_init(struct blkcg_gq *blkg)
 {
 	struct throtl_grp *tg = blkg_to_tg(blkg);
+	struct throtl_data *td = blkg->q->td;
 	unsigned long flags;
 
-	throtl_service_queue_init(&tg->service_queue);
+	throtl_service_queue_init(&tg->service_queue, &td->service_queue);
 	RB_CLEAR_NODE(&tg->rb_node);
-	tg->td = blkg->q->td;
+	tg->td = td;
 
 	tg->bps[READ] = -1;
 	tg->bps[WRITE] = -1;
@@ -339,9 +344,9 @@ static void update_min_dispatch_time(struct throtl_service_queue *parent_sq)
 	parent_sq->first_pending_disptime = tg->disptime;
 }
 
-static void tg_service_queue_add(struct throtl_grp *tg,
-				 struct throtl_service_queue *parent_sq)
+static void tg_service_queue_add(struct throtl_grp *tg)
 {
+	struct throtl_service_queue *parent_sq = tg->service_queue.parent_sq;
 	struct rb_node **node = &parent_sq->pending_tree.rb_node;
 	struct rb_node *parent = NULL;
 	struct throtl_grp *__tg;
@@ -367,33 +372,29 @@ static void tg_service_queue_add(struct throtl_grp *tg,
 	rb_insert_color(&tg->rb_node, &parent_sq->pending_tree);
 }
 
-static void __throtl_enqueue_tg(struct throtl_grp *tg,
-				struct throtl_service_queue *parent_sq)
+static void __throtl_enqueue_tg(struct throtl_grp *tg)
 {
-	tg_service_queue_add(tg, parent_sq);
+	tg_service_queue_add(tg);
 	tg->flags |= THROTL_TG_PENDING;
-	parent_sq->nr_pending++;
+	tg->service_queue.parent_sq->nr_pending++;
 }
 
-static void throtl_enqueue_tg(struct throtl_grp *tg,
-			      struct throtl_service_queue *parent_sq)
+static void throtl_enqueue_tg(struct throtl_grp *tg)
 {
 	if (!(tg->flags & THROTL_TG_PENDING))
-		__throtl_enqueue_tg(tg, parent_sq);
+		__throtl_enqueue_tg(tg);
 }
 
-static void __throtl_dequeue_tg(struct throtl_grp *tg,
-				struct throtl_service_queue *parent_sq)
+static void __throtl_dequeue_tg(struct throtl_grp *tg)
 {
-	throtl_rb_erase(&tg->rb_node, parent_sq);
+	throtl_rb_erase(&tg->rb_node, tg->service_queue.parent_sq);
 	tg->flags &= ~THROTL_TG_PENDING;
 }
 
-static void throtl_dequeue_tg(struct throtl_grp *tg,
-			      struct throtl_service_queue *parent_sq)
+static void throtl_dequeue_tg(struct throtl_grp *tg)
 {
 	if (tg->flags & THROTL_TG_PENDING)
-		__throtl_dequeue_tg(tg, parent_sq);
+		__throtl_dequeue_tg(tg);
 }
 
 /* Call with queue lock held */
@@ -707,8 +708,7 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
 	throtl_update_dispatch_stats(tg_to_blkg(tg), bio->bi_size, bio->bi_rw);
 }
 
-static void throtl_add_bio_tg(struct bio *bio, struct throtl_grp *tg,
-			      struct throtl_service_queue *parent_sq)
+static void throtl_add_bio_tg(struct bio *bio, struct throtl_grp *tg)
 {
 	struct throtl_service_queue *sq = &tg->service_queue;
 	bool rw = bio_data_dir(bio);
@@ -727,11 +727,10 @@ static void throtl_add_bio_tg(struct bio *bio, struct throtl_grp *tg,
 	blkg_get(tg_to_blkg(tg));
 	sq->nr_queued[rw]++;
 	tg->td->nr_queued[rw]++;
-	throtl_enqueue_tg(tg, parent_sq);
+	throtl_enqueue_tg(tg);
 }
 
-static void tg_update_disptime(struct throtl_grp *tg,
-			       struct throtl_service_queue *parent_sq)
+static void tg_update_disptime(struct throtl_grp *tg)
 {
 	struct throtl_service_queue *sq = &tg->service_queue;
 	unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime;
@@ -747,16 +746,15 @@ static void tg_update_disptime(struct throtl_grp *tg,
 	disptime = jiffies + min_wait;
 
 	/* Update dispatch time */
-	throtl_dequeue_tg(tg, parent_sq);
+	throtl_dequeue_tg(tg);
 	tg->disptime = disptime;
-	throtl_enqueue_tg(tg, parent_sq);
+	throtl_enqueue_tg(tg);
 
 	/* see throtl_add_bio_tg() */
 	tg->flags &= ~THROTL_TG_WAS_EMPTY;
 }
 
-static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw,
-				struct throtl_service_queue *parent_sq)
+static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw)
 {
 	struct throtl_service_queue *sq = &tg->service_queue;
 	struct bio *bio;
@@ -770,14 +768,13 @@ static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw,
 	tg->td->nr_queued[rw]--;
 
 	throtl_charge_bio(tg, bio);
-	bio_list_add(&parent_sq->bio_lists[rw], bio);
+	bio_list_add(&sq->parent_sq->bio_lists[rw], bio);
 	bio->bi_rw |= REQ_THROTTLED;
 
 	throtl_trim_slice(tg, rw);
 }
 
-static int throtl_dispatch_tg(struct throtl_grp *tg,
-			      struct throtl_service_queue *parent_sq)
+static int throtl_dispatch_tg(struct throtl_grp *tg)
 {
 	struct throtl_service_queue *sq = &tg->service_queue;
 	unsigned int nr_reads = 0, nr_writes = 0;
@@ -790,7 +787,7 @@ static int throtl_dispatch_tg(struct throtl_grp *tg,
 	while ((bio = bio_list_peek(&sq->bio_lists[READ])) &&
 	       tg_may_dispatch(tg, bio, NULL)) {
 
-		tg_dispatch_one_bio(tg, bio_data_dir(bio), parent_sq);
+		tg_dispatch_one_bio(tg, bio_data_dir(bio));
 		nr_reads++;
 
 		if (nr_reads >= max_nr_reads)
@@ -800,7 +797,7 @@ static int throtl_dispatch_tg(struct throtl_grp *tg,
 	while ((bio = bio_list_peek(&sq->bio_lists[WRITE])) &&
 	       tg_may_dispatch(tg, bio, NULL)) {
 
-		tg_dispatch_one_bio(tg, bio_data_dir(bio), parent_sq);
+		tg_dispatch_one_bio(tg, bio_data_dir(bio));
 		nr_writes++;
 
 		if (nr_writes >= max_nr_writes)
@@ -824,12 +821,12 @@ static int throtl_select_dispatch(struct throtl_service_queue *parent_sq)
 		if (time_before(jiffies, tg->disptime))
 			break;
 
-		throtl_dequeue_tg(tg, parent_sq);
+		throtl_dequeue_tg(tg);
 
-		nr_disp += throtl_dispatch_tg(tg, parent_sq);
+		nr_disp += throtl_dispatch_tg(tg);
 
 		if (sq->nr_queued[0] || sq->nr_queued[1])
-			tg_update_disptime(tg, parent_sq);
+			tg_update_disptime(tg);
 
 		if (nr_disp >= throtl_quantum)
 			break;
@@ -991,7 +988,7 @@ static int tg_set_conf(struct cgroup *cgrp, struct cftype *cft, const char *buf,
 	throtl_start_new_slice(tg, 1);
 
 	if (tg->flags & THROTL_TG_PENDING) {
-		tg_update_disptime(tg, &td->service_queue);
+		tg_update_disptime(tg);
 		throtl_schedule_next_dispatch(td);
 	}
 
@@ -1142,12 +1139,12 @@ queue_bio:
 			sq->nr_queued[READ], sq->nr_queued[WRITE]);
 
 	bio_associate_current(bio);
-	throtl_add_bio_tg(bio, tg, &q->td->service_queue);
+	throtl_add_bio_tg(bio, tg);
 	throttled = true;
 
 	/* update @tg's dispatch time if @tg was empty before @bio */
 	if (tg->flags & THROTL_TG_WAS_EMPTY) {
-		tg_update_disptime(tg, &td->service_queue);
+		tg_update_disptime(tg);
 		throtl_schedule_next_dispatch(td);
 	}
 
@@ -1179,12 +1176,12 @@ void blk_throtl_drain(struct request_queue *q)
 	while ((tg = throtl_rb_first(parent_sq))) {
 		struct throtl_service_queue *sq = &tg->service_queue;
 
-		throtl_dequeue_tg(tg, parent_sq);
+		throtl_dequeue_tg(tg);
 
 		while ((bio = bio_list_peek(&sq->bio_lists[READ])))
-			tg_dispatch_one_bio(tg, bio_data_dir(bio), parent_sq);
+			tg_dispatch_one_bio(tg, bio_data_dir(bio));
 		while ((bio = bio_list_peek(&sq->bio_lists[WRITE])))
-			tg_dispatch_one_bio(tg, bio_data_dir(bio), parent_sq);
+			tg_dispatch_one_bio(tg, bio_data_dir(bio));
 	}
 	spin_unlock_irq(q->queue_lock);
 
@@ -1205,7 +1202,7 @@ int blk_throtl_init(struct request_queue *q)
 		return -ENOMEM;
 
 	INIT_DELAYED_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn);
-	throtl_service_queue_init(&td->service_queue);
+	throtl_service_queue_init(&td->service_queue, NULL);
 
 	q->td = td;
 	td->queue = q;
-- 
cgit v0.10.2


From fda6f272c77a7acd798bb247fadc4791574e698b Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 14 May 2013 13:52:36 -0700
Subject: blk-throttle: implement sq_to_tg(), sq_to_td() and throtl_log()

Now that both throtl_data and throtl_grp embed throtl_service_queue,
we can unify throtl_log() and throtl_log_tg().

* sq_to_tg() is added.  This returns the throtl_grp a service_queue is
  embedded in.  If the service_queue is the top-level one embedded in
  throtl_data, NULL is returned.

* sq_to_td() is added.  A service_queue is always associated with a
  throtl_data.  This function finds the associated td and returns it.

* throtl_log() is updated to take throtl_service_queue instead of
  throtl_data.  If the service_queue is one embedded in throtl_grp, it
  prints the same header as throtl_log_tg() did.  If it's one embedded
  in throtl_data, it behaves the same as before.  This renders
  throtl_log_tg() unnecessary.  Removed.

This change is necessary for hierarchy support as we're gonna be using
the same code paths to dispatch bios to intermediate service_queues
embedded in throtl_grps and the top-level service_queue embedded in
throtl_data.

This patch doesn't make any behavior changes.

v2: throtl_log() didn't print a space after blkg path.  Updated so
    that it prints a space after throtl_grp path.  Spotted by Vivek.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Vivek Goyal <vgoyal@redhat.com>

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 00cfdd0..2875ff6 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -151,16 +151,65 @@ static inline struct throtl_grp *td_root_tg(struct throtl_data *td)
 	return blkg_to_tg(td->queue->root_blkg);
 }
 
-#define throtl_log_tg(tg, fmt, args...)	do {				\
-	char __pbuf[128];						\
+/**
+ * sq_to_tg - return the throl_grp the specified service queue belongs to
+ * @sq: the throtl_service_queue of interest
+ *
+ * Return the throtl_grp @sq belongs to.  If @sq is the top-level one
+ * embedded in throtl_data, %NULL is returned.
+ */
+static struct throtl_grp *sq_to_tg(struct throtl_service_queue *sq)
+{
+	if (sq && sq->parent_sq)
+		return container_of(sq, struct throtl_grp, service_queue);
+	else
+		return NULL;
+}
+
+/**
+ * sq_to_td - return throtl_data the specified service queue belongs to
+ * @sq: the throtl_service_queue of interest
+ *
+ * A service_queue can be embeded in either a throtl_grp or throtl_data.
+ * Determine the associated throtl_data accordingly and return it.
+ */
+static struct throtl_data *sq_to_td(struct throtl_service_queue *sq)
+{
+	struct throtl_grp *tg = sq_to_tg(sq);
+
+	if (tg)
+		return tg->td;
+	else
+		return container_of(sq, struct throtl_data, service_queue);
+}
+
+/**
+ * throtl_log - log debug message via blktrace
+ * @sq: the service_queue being reported
+ * @fmt: printf format string
+ * @args: printf args
+ *
+ * The messages are prefixed with "throtl BLKG_NAME" if @sq belongs to a
+ * throtl_grp; otherwise, just "throtl".
+ *
+ * TODO: this should be made a function and name formatting should happen
+ * after testing whether blktrace is enabled.
+ */
+#define throtl_log(sq, fmt, args...)	do {				\
+	struct throtl_grp *__tg = sq_to_tg((sq));			\
+	struct throtl_data *__td = sq_to_td((sq));			\
+									\
+	(void)__td;							\
+	if ((__tg)) {							\
+		char __pbuf[128];					\
 									\
-	blkg_path(tg_to_blkg(tg), __pbuf, sizeof(__pbuf));		\
-	blk_add_trace_msg((tg)->td->queue, "throtl %s " fmt, __pbuf, ##args); \
+		blkg_path(tg_to_blkg(__tg), __pbuf, sizeof(__pbuf));	\
+		blk_add_trace_msg(__td->queue, "throtl %s " fmt, __pbuf, ##args); \
+	} else {							\
+		blk_add_trace_msg(__td->queue, "throtl " fmt, ##args);	\
+	}								\
 } while (0)
 
-#define throtl_log(td, fmt, args...)	\
-	blk_add_trace_msg((td)->queue, "throtl " fmt, ##args)
-
 /*
  * Worker for allocating per cpu stat for tgs. This is scheduled on the
  * system_wq once there are some groups on the alloc_list waiting for
@@ -402,9 +451,10 @@ static void throtl_schedule_delayed_work(struct throtl_data *td,
 					 unsigned long delay)
 {
 	struct delayed_work *dwork = &td->dispatch_work;
+	struct throtl_service_queue *sq = &td->service_queue;
 
 	mod_delayed_work(kthrotld_workqueue, dwork, delay);
-	throtl_log(td, "schedule work. delay=%lu jiffies=%lu", delay, jiffies);
+	throtl_log(sq, "schedule work. delay=%lu jiffies=%lu", delay, jiffies);
 }
 
 static void throtl_schedule_next_dispatch(struct throtl_data *td)
@@ -429,9 +479,10 @@ static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw)
 	tg->io_disp[rw] = 0;
 	tg->slice_start[rw] = jiffies;
 	tg->slice_end[rw] = jiffies + throtl_slice;
-	throtl_log_tg(tg, "[%c] new slice start=%lu end=%lu jiffies=%lu",
-			rw == READ ? 'R' : 'W', tg->slice_start[rw],
-			tg->slice_end[rw], jiffies);
+	throtl_log(&tg->service_queue,
+		   "[%c] new slice start=%lu end=%lu jiffies=%lu",
+		   rw == READ ? 'R' : 'W', tg->slice_start[rw],
+		   tg->slice_end[rw], jiffies);
 }
 
 static inline void throtl_set_slice_end(struct throtl_grp *tg, bool rw,
@@ -444,9 +495,10 @@ static inline void throtl_extend_slice(struct throtl_grp *tg, bool rw,
 				       unsigned long jiffy_end)
 {
 	tg->slice_end[rw] = roundup(jiffy_end, throtl_slice);
-	throtl_log_tg(tg, "[%c] extend slice start=%lu end=%lu jiffies=%lu",
-			rw == READ ? 'R' : 'W', tg->slice_start[rw],
-			tg->slice_end[rw], jiffies);
+	throtl_log(&tg->service_queue,
+		   "[%c] extend slice start=%lu end=%lu jiffies=%lu",
+		   rw == READ ? 'R' : 'W', tg->slice_start[rw],
+		   tg->slice_end[rw], jiffies);
 }
 
 /* Determine if previously allocated or extended slice is complete or not */
@@ -511,10 +563,10 @@ static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw)
 
 	tg->slice_start[rw] += nr_slices * throtl_slice;
 
-	throtl_log_tg(tg, "[%c] trim slice nr=%lu bytes=%llu io=%lu"
-			" start=%lu end=%lu jiffies=%lu",
-			rw == READ ? 'R' : 'W', nr_slices, bytes_trim, io_trim,
-			tg->slice_start[rw], tg->slice_end[rw], jiffies);
+	throtl_log(&tg->service_queue,
+		   "[%c] trim slice nr=%lu bytes=%llu io=%lu start=%lu end=%lu jiffies=%lu",
+		   rw == READ ? 'R' : 'W', nr_slices, bytes_trim, io_trim,
+		   tg->slice_start[rw], tg->slice_end[rw], jiffies);
 }
 
 static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio,
@@ -852,7 +904,7 @@ void blk_throtl_dispatch_work_fn(struct work_struct *work)
 
 	bio_list_init(&bio_list_on_stack);
 
-	throtl_log(td, "dispatch nr_queued=%u read=%u write=%u",
+	throtl_log(sq, "dispatch nr_queued=%u read=%u write=%u",
 		   td->nr_queued[READ] + td->nr_queued[WRITE],
 		   td->nr_queued[READ], td->nr_queued[WRITE]);
 
@@ -863,7 +915,7 @@ void blk_throtl_dispatch_work_fn(struct work_struct *work)
 			bio_list_merge(&bio_list_on_stack, &sq->bio_lists[rw]);
 			bio_list_init(&sq->bio_lists[rw]);
 		}
-		throtl_log(td, "bios disp=%u", nr_disp);
+		throtl_log(sq, "bios disp=%u", nr_disp);
 	}
 
 	throtl_schedule_next_dispatch(td);
@@ -972,9 +1024,10 @@ static int tg_set_conf(struct cgroup *cgrp, struct cftype *cft, const char *buf,
 	else
 		*(unsigned int *)((void *)tg + cft->private) = ctx.v;
 
-	throtl_log_tg(tg, "limit change rbps=%llu wbps=%llu riops=%u wiops=%u",
-		      tg->bps[READ], tg->bps[WRITE],
-		      tg->iops[READ], tg->iops[WRITE]);
+	throtl_log(&tg->service_queue,
+		   "limit change rbps=%llu wbps=%llu riops=%u wiops=%u",
+		   tg->bps[READ], tg->bps[WRITE],
+		   tg->iops[READ], tg->iops[WRITE]);
 
 	/*
 	 * We're already holding queue_lock and know @tg is valid.  Let's
@@ -1131,12 +1184,11 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
 	}
 
 queue_bio:
-	throtl_log_tg(tg, "[%c] bio. bdisp=%llu sz=%u bps=%llu"
-			" iodisp=%u iops=%u queued=%d/%d",
-			rw == READ ? 'R' : 'W',
-			tg->bytes_disp[rw], bio->bi_size, tg->bps[rw],
-			tg->io_disp[rw], tg->iops[rw],
-			sq->nr_queued[READ], sq->nr_queued[WRITE]);
+	throtl_log(sq, "[%c] bio. bdisp=%llu sz=%u bps=%llu iodisp=%u iops=%u queued=%d/%d",
+		   rw == READ ? 'R' : 'W',
+		   tg->bytes_disp[rw], bio->bi_size, tg->bps[rw],
+		   tg->io_disp[rw], tg->iops[rw],
+		   sq->nr_queued[READ], sq->nr_queued[WRITE]);
 
 	bio_associate_current(bio);
 	throtl_add_bio_tg(bio, tg);
-- 
cgit v0.10.2


From 2a0f61e6ecd08d260054bde4b096ff207ce5350f Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 14 May 2013 13:52:36 -0700
Subject: blk-throttle: set REQ_THROTTLED from throtl_charge_bio() and gate
 stats update with it

With proper hierarchy support, a bio can be dispatched multiple times
until it reaches the top-level service_queue and we don't want to
update dispatch stats at each step.  They are local stats and will be
kept local.  If recursive stats are necessary, they should be
implemented separately and definitely not by updating counters
recursively on each dispatch.

This patch moves REQ_THROTTLED setting to throtl_charge_bio() and gate
stats update with it so that dispatch stats are updated only on the
first time the bio is charged to a throtl_grp, which will always be
the throtl_grp the bio was originally queued to.

This means that REQ_THROTTLED would be set even for bios which don't
get throttled.  As we don't want bios to leave blk-throtl with the
flag set, move REQ_THROTLLED clearing to the end of blk_throtl_bio()
and clear if the bio is being issued directly.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Vivek Goyal <vgoyal@redhat.com>

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 2875ff6..420eaa1 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -757,7 +757,22 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
 	tg->bytes_disp[rw] += bio->bi_size;
 	tg->io_disp[rw]++;
 
-	throtl_update_dispatch_stats(tg_to_blkg(tg), bio->bi_size, bio->bi_rw);
+	/*
+	 * REQ_THROTTLED is used to prevent the same bio to be throttled
+	 * more than once as a throttled bio will go through blk-throtl the
+	 * second time when it eventually gets issued.  Set it when a bio
+	 * is being charged to a tg.
+	 *
+	 * Dispatch stats aren't recursive and each @bio should only be
+	 * accounted by the @tg it was originally associated with.  Let's
+	 * update the stats when setting REQ_THROTTLED for the first time
+	 * which is guaranteed to be for the @bio's original tg.
+	 */
+	if (!(bio->bi_rw & REQ_THROTTLED)) {
+		bio->bi_rw |= REQ_THROTTLED;
+		throtl_update_dispatch_stats(tg_to_blkg(tg), bio->bi_size,
+					     bio->bi_rw);
+	}
 }
 
 static void throtl_add_bio_tg(struct bio *bio, struct throtl_grp *tg)
@@ -821,7 +836,6 @@ static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw)
 
 	throtl_charge_bio(tg, bio);
 	bio_list_add(&sq->parent_sq->bio_lists[rw], bio);
-	bio->bi_rw |= REQ_THROTTLED;
 
 	throtl_trim_slice(tg, rw);
 }
@@ -1128,10 +1142,9 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
 	struct blkcg *blkcg;
 	bool throttled = false;
 
-	if (bio->bi_rw & REQ_THROTTLED) {
-		bio->bi_rw &= ~REQ_THROTTLED;
+	/* see throtl_charge_bio() */
+	if (bio->bi_rw & REQ_THROTTLED)
 		goto out;
-	}
 
 	/*
 	 * A throtl_grp pointer retrieved under rcu can be used to access
@@ -1205,6 +1218,13 @@ out_unlock:
 out_unlock_rcu:
 	rcu_read_unlock();
 out:
+	/*
+	 * As multiple blk-throtls may stack in the same issue path, we
+	 * don't want bios to leave with the flag set.  Clear the flag if
+	 * being issued.
+	 */
+	if (!throttled)
+		bio->bi_rw &= ~REQ_THROTTLED;
 	return throttled;
 }
 
-- 
cgit v0.10.2


From 69df0ab030c94e851b77991c2f5e00bcf5294edc Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 14 May 2013 13:52:36 -0700
Subject: blk-throttle: separate out throtl_service_queue->pending_timer from
 throtl_data->dispatch_work

Currently, throtl_data->dispatch_work is a delayed_work item which
handles both delayed dispatch and issuing bios.  The two tasks will be
separated to support proper hierarchy.  To prepare for that, this
patch separates out the timer into throtl_service_queue->pending_timer
from throtl_data->dispatch_work and make the latter a work_struct.

* As the timer is now per-service_queue, it's initialized and
  del_sync'd as its corresponding service_queue is created and
  destroyed.  The timer, when triggered, simply schedules
  throtl_data->dispathc_work for execution.

* throtl_schedule_delayed_work() is renamed to
  throtl_schedule_pending_timer() and takes @sq and @expires now.

* Simiarly, throtl_schedule_next_dispatch() now takes @sq, which
  should be the parent_sq of the service_queue which just got a new
  bio or updated.  As the parent_sq is always the top-level
  service_queue now, this doesn't change anything at this point.

This patch doesn't introduce any behavior differences.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Vivek Goyal <vgoyal@redhat.com>

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 420eaa1..a8d23f0 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -44,6 +44,7 @@ struct throtl_service_queue {
 	struct rb_node		*first_pending;	/* first node in the tree */
 	unsigned int		nr_pending;	/* # queued in the tree */
 	unsigned long		first_pending_disptime;	/* disptime of the first tg */
+	struct timer_list	pending_timer;	/* fires on first_pending_disptime */
 };
 
 enum tg_state_flags {
@@ -121,7 +122,7 @@ struct throtl_data
 	unsigned int nr_undestroyed_grps;
 
 	/* Work for dispatching throttled bios */
-	struct delayed_work dispatch_work;
+	struct work_struct dispatch_work;
 };
 
 /* list and work item to allocate percpu group stats */
@@ -131,6 +132,8 @@ static LIST_HEAD(tg_stats_alloc_list);
 static void tg_stats_alloc_fn(struct work_struct *);
 static DECLARE_DELAYED_WORK(tg_stats_alloc_work, tg_stats_alloc_fn);
 
+static void throtl_pending_timer_fn(unsigned long arg);
+
 static inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd)
 {
 	return pd ? container_of(pd, struct throtl_grp, pd) : NULL;
@@ -255,6 +258,13 @@ static void throtl_service_queue_init(struct throtl_service_queue *sq,
 	bio_list_init(&sq->bio_lists[1]);
 	sq->pending_tree = RB_ROOT;
 	sq->parent_sq = parent_sq;
+	setup_timer(&sq->pending_timer, throtl_pending_timer_fn,
+		    (unsigned long)sq);
+}
+
+static void throtl_service_queue_exit(struct throtl_service_queue *sq)
+{
+	del_timer_sync(&sq->pending_timer);
 }
 
 static void throtl_pd_init(struct blkcg_gq *blkg)
@@ -293,6 +303,8 @@ static void throtl_pd_exit(struct blkcg_gq *blkg)
 	spin_unlock_irqrestore(&tg_stats_alloc_lock, flags);
 
 	free_percpu(tg->stats_cpu);
+
+	throtl_service_queue_exit(&tg->service_queue);
 }
 
 static void throtl_pd_reset_stats(struct blkcg_gq *blkg)
@@ -447,19 +459,17 @@ static void throtl_dequeue_tg(struct throtl_grp *tg)
 }
 
 /* Call with queue lock held */
-static void throtl_schedule_delayed_work(struct throtl_data *td,
-					 unsigned long delay)
+static void throtl_schedule_pending_timer(struct throtl_service_queue *sq,
+					  unsigned long expires)
 {
-	struct delayed_work *dwork = &td->dispatch_work;
-	struct throtl_service_queue *sq = &td->service_queue;
-
-	mod_delayed_work(kthrotld_workqueue, dwork, delay);
-	throtl_log(sq, "schedule work. delay=%lu jiffies=%lu", delay, jiffies);
+	mod_timer(&sq->pending_timer, expires);
+	throtl_log(sq, "schedule timer. delay=%lu jiffies=%lu",
+		   expires - jiffies, jiffies);
 }
 
-static void throtl_schedule_next_dispatch(struct throtl_data *td)
+static void throtl_schedule_next_dispatch(struct throtl_service_queue *sq)
 {
-	struct throtl_service_queue *sq = &td->service_queue;
+	struct throtl_data *td = sq_to_td(sq);
 
 	/* any pending children left? */
 	if (!sq->nr_pending)
@@ -467,10 +477,14 @@ static void throtl_schedule_next_dispatch(struct throtl_data *td)
 
 	update_min_dispatch_time(sq);
 
-	if (time_before_eq(sq->first_pending_disptime, jiffies))
-		throtl_schedule_delayed_work(td, 0);
-	else
-		throtl_schedule_delayed_work(td, sq->first_pending_disptime - jiffies);
+	/* is the next dispatch time in the future? */
+	if (time_after(sq->first_pending_disptime, jiffies)) {
+		throtl_schedule_pending_timer(sq, sq->first_pending_disptime);
+		return;
+	}
+
+	/* kick immediate execution */
+	queue_work(kthrotld_workqueue, &td->dispatch_work);
 }
 
 static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw)
@@ -901,11 +915,19 @@ static int throtl_select_dispatch(struct throtl_service_queue *parent_sq)
 	return nr_disp;
 }
 
+static void throtl_pending_timer_fn(unsigned long arg)
+{
+	struct throtl_service_queue *sq = (void *)arg;
+	struct throtl_data *td = sq_to_td(sq);
+
+	queue_work(kthrotld_workqueue, &td->dispatch_work);
+}
+
 /* work function to dispatch throttled bios */
 void blk_throtl_dispatch_work_fn(struct work_struct *work)
 {
-	struct throtl_data *td = container_of(to_delayed_work(work),
-					      struct throtl_data, dispatch_work);
+	struct throtl_data *td = container_of(work, struct throtl_data,
+					      dispatch_work);
 	struct throtl_service_queue *sq = &td->service_queue;
 	struct request_queue *q = td->queue;
 	unsigned int nr_disp = 0;
@@ -932,7 +954,7 @@ void blk_throtl_dispatch_work_fn(struct work_struct *work)
 		throtl_log(sq, "bios disp=%u", nr_disp);
 	}
 
-	throtl_schedule_next_dispatch(td);
+	throtl_schedule_next_dispatch(sq);
 
 	spin_unlock_irq(q->queue_lock);
 
@@ -1020,7 +1042,7 @@ static int tg_set_conf(struct cgroup *cgrp, struct cftype *cft, const char *buf,
 	struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
 	struct blkg_conf_ctx ctx;
 	struct throtl_grp *tg;
-	struct throtl_data *td;
+	struct throtl_service_queue *sq;
 	int ret;
 
 	ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx);
@@ -1028,7 +1050,7 @@ static int tg_set_conf(struct cgroup *cgrp, struct cftype *cft, const char *buf,
 		return ret;
 
 	tg = blkg_to_tg(ctx.blkg);
-	td = ctx.blkg->q->td;
+	sq = &tg->service_queue;
 
 	if (!ctx.v)
 		ctx.v = -1;
@@ -1056,7 +1078,7 @@ static int tg_set_conf(struct cgroup *cgrp, struct cftype *cft, const char *buf,
 
 	if (tg->flags & THROTL_TG_PENDING) {
 		tg_update_disptime(tg);
-		throtl_schedule_next_dispatch(td);
+		throtl_schedule_next_dispatch(sq->parent_sq);
 	}
 
 	blkg_conf_finish(&ctx);
@@ -1121,7 +1143,7 @@ static void throtl_shutdown_wq(struct request_queue *q)
 {
 	struct throtl_data *td = q->td;
 
-	cancel_delayed_work_sync(&td->dispatch_work);
+	cancel_work_sync(&td->dispatch_work);
 }
 
 static struct blkcg_policy blkcg_policy_throtl = {
@@ -1210,7 +1232,7 @@ queue_bio:
 	/* update @tg's dispatch time if @tg was empty before @bio */
 	if (tg->flags & THROTL_TG_WAS_EMPTY) {
 		tg_update_disptime(tg);
-		throtl_schedule_next_dispatch(td);
+		throtl_schedule_next_dispatch(tg->service_queue.parent_sq);
 	}
 
 out_unlock:
@@ -1273,7 +1295,7 @@ int blk_throtl_init(struct request_queue *q)
 	if (!td)
 		return -ENOMEM;
 
-	INIT_DELAYED_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn);
+	INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn);
 	throtl_service_queue_init(&td->service_queue, NULL);
 
 	q->td = td;
-- 
cgit v0.10.2


From 7f52f98c2a83339b89a27d01296354e5dbb90ad0 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 14 May 2013 13:52:37 -0700
Subject: blk-throttle: implement dispatch looping

throtl_select_dispatch() only dispatches throtl_quantum bios on each
invocation.  blk_throtl_dispatch_work_fn() in turn depends on
throtl_schedule_next_dispatch() scheduling the next dispatch window
immediately so that undue delays aren't incurred.  This effectively
chains multiple dispatch work item executions back-to-back when there
are more than throtl_quantum bios to dispatch on a given tick.

There is no reason to finish the current work item just to repeat it
immediately.  This patch makes throtl_schedule_next_dispatch() return
%false without doing anything if the current dispatch window is still
open and updates blk_throtl_dispatch_work_fn() repeat dispatching
after cpu_relax() on %false return.

This change will help implementing hierarchy support as dispatching
will be done from pending_timer and immediate reschedule of timer
function isn't supported and doesn't make much sense.

While this patch changes how dispatch behaves when there are more than
throtl_quantum bios to dispatch on a single tick, the behavior change
is immaterial.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Vivek Goyal <vgoyal@redhat.com>

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index a8d23f0..8ee8e4e 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -467,24 +467,41 @@ static void throtl_schedule_pending_timer(struct throtl_service_queue *sq,
 		   expires - jiffies, jiffies);
 }
 
-static void throtl_schedule_next_dispatch(struct throtl_service_queue *sq)
+/**
+ * throtl_schedule_next_dispatch - schedule the next dispatch cycle
+ * @sq: the service_queue to schedule dispatch for
+ * @force: force scheduling
+ *
+ * Arm @sq->pending_timer so that the next dispatch cycle starts on the
+ * dispatch time of the first pending child.  Returns %true if either timer
+ * is armed or there's no pending child left.  %false if the current
+ * dispatch window is still open and the caller should continue
+ * dispatching.
+ *
+ * If @force is %true, the dispatch timer is always scheduled and this
+ * function is guaranteed to return %true.  This is to be used when the
+ * caller can't dispatch itself and needs to invoke pending_timer
+ * unconditionally.  Note that forced scheduling is likely to induce short
+ * delay before dispatch starts even if @sq->first_pending_disptime is not
+ * in the future and thus shouldn't be used in hot paths.
+ */
+static bool throtl_schedule_next_dispatch(struct throtl_service_queue *sq,
+					  bool force)
 {
-	struct throtl_data *td = sq_to_td(sq);
-
 	/* any pending children left? */
 	if (!sq->nr_pending)
-		return;
+		return true;
 
 	update_min_dispatch_time(sq);
 
 	/* is the next dispatch time in the future? */
-	if (time_after(sq->first_pending_disptime, jiffies)) {
+	if (force || time_after(sq->first_pending_disptime, jiffies)) {
 		throtl_schedule_pending_timer(sq, sq->first_pending_disptime);
-		return;
+		return true;
 	}
 
-	/* kick immediate execution */
-	queue_work(kthrotld_workqueue, &td->dispatch_work);
+	/* tell the caller to continue dispatching */
+	return false;
 }
 
 static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw)
@@ -930,39 +947,47 @@ void blk_throtl_dispatch_work_fn(struct work_struct *work)
 					      dispatch_work);
 	struct throtl_service_queue *sq = &td->service_queue;
 	struct request_queue *q = td->queue;
-	unsigned int nr_disp = 0;
 	struct bio_list bio_list_on_stack;
 	struct bio *bio;
 	struct blk_plug plug;
-	int rw;
+	bool dispatched = false;
+	int rw, ret;
 
 	spin_lock_irq(q->queue_lock);
 
 	bio_list_init(&bio_list_on_stack);
 
-	throtl_log(sq, "dispatch nr_queued=%u read=%u write=%u",
-		   td->nr_queued[READ] + td->nr_queued[WRITE],
-		   td->nr_queued[READ], td->nr_queued[WRITE]);
+	while (true) {
+		throtl_log(sq, "dispatch nr_queued=%u read=%u write=%u",
+			   td->nr_queued[READ] + td->nr_queued[WRITE],
+			   td->nr_queued[READ], td->nr_queued[WRITE]);
+
+		ret = throtl_select_dispatch(sq);
+		if (ret) {
+			for (rw = READ; rw <= WRITE; rw++) {
+				bio_list_merge(&bio_list_on_stack, &sq->bio_lists[rw]);
+				bio_list_init(&sq->bio_lists[rw]);
+			}
+			throtl_log(sq, "bios disp=%u", ret);
+			dispatched = true;
+		}
 
-	nr_disp = throtl_select_dispatch(sq);
+		if (throtl_schedule_next_dispatch(sq, false))
+			break;
 
-	if (nr_disp) {
-		for (rw = READ; rw <= WRITE; rw++) {
-			bio_list_merge(&bio_list_on_stack, &sq->bio_lists[rw]);
-			bio_list_init(&sq->bio_lists[rw]);
-		}
-		throtl_log(sq, "bios disp=%u", nr_disp);
+		/* this dispatch windows is still open, relax and repeat */
+		spin_unlock_irq(q->queue_lock);
+		cpu_relax();
+		spin_lock_irq(q->queue_lock);
 	}
 
-	throtl_schedule_next_dispatch(sq);
-
 	spin_unlock_irq(q->queue_lock);
 
 	/*
 	 * If we dispatched some requests, unplug the queue to make sure
 	 * immediate dispatch
 	 */
-	if (nr_disp) {
+	if (dispatched) {
 		blk_start_plug(&plug);
 		while((bio = bio_list_pop(&bio_list_on_stack)))
 			generic_make_request(bio);
@@ -1078,7 +1103,7 @@ static int tg_set_conf(struct cgroup *cgrp, struct cftype *cft, const char *buf,
 
 	if (tg->flags & THROTL_TG_PENDING) {
 		tg_update_disptime(tg);
-		throtl_schedule_next_dispatch(sq->parent_sq);
+		throtl_schedule_next_dispatch(sq->parent_sq, true);
 	}
 
 	blkg_conf_finish(&ctx);
@@ -1229,10 +1254,15 @@ queue_bio:
 	throtl_add_bio_tg(bio, tg);
 	throttled = true;
 
-	/* update @tg's dispatch time if @tg was empty before @bio */
+	/*
+	 * Update @tg's dispatch time and force schedule dispatch if @tg
+	 * was empty before @bio.  The forced scheduling isn't likely to
+	 * cause undue delay as @bio is likely to be dispatched directly if
+	 * its @tg's disptime is not in the future.
+	 */
 	if (tg->flags & THROTL_TG_WAS_EMPTY) {
 		tg_update_disptime(tg);
-		throtl_schedule_next_dispatch(tg->service_queue.parent_sq);
+		throtl_schedule_next_dispatch(tg->service_queue.parent_sq, true);
 	}
 
 out_unlock:
-- 
cgit v0.10.2


From 6e1a5704cbbd244a8db2d7d59215cf9a4c9a0d31 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 14 May 2013 13:52:37 -0700
Subject: blk-throttle: dispatch from throtl_pending_timer_fn()

Currently, blk_throtl_dispatch_work_fn() is responsible for both
dispatching bio's from throtl_grp's according to their limits and then
issuing the dispatched bios.

This patch moves the dispatch part to throtl_pending_timer_fn() so
that the work item is kicked iff there are bio's to issue.  This is to
avoid work item execution at each step when hierarchy support is
enabled.  bio's will be dispatched towards the top-level service_queue
from the timers at each layer and the work item will only be used to
issue the bio's which reached the top-level service_queue.

While fetching bio's to issue from bio_lists[],
blk_throtl_dispatch_work_fn() fetches all READs before WRITEs.  While
the original code also dispatched READs first, if multiple throtl_grps
are dispatched on the same run, WRITEs from throtl_grp which is
dispatched first would precede READs from throtl_grps which are
dispatched later.  While this is a behavior change, given that the
previous code already prioritized READs and block layer generally
prioritizes and segregates READs from WRITEs, this isn't likely to
make any noticeable differences.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Vivek Goyal <vgoyal@redhat.com>

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 8ee8e4e..918d222 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -932,31 +932,26 @@ static int throtl_select_dispatch(struct throtl_service_queue *parent_sq)
 	return nr_disp;
 }
 
+/**
+ * throtl_pending_timer_fn - timer function for service_queue->pending_timer
+ * @arg: the throtl_service_queue being serviced
+ *
+ * This timer is armed when a child throtl_grp with active bio's become
+ * pending and queued on the service_queue's pending_tree and expires when
+ * the first child throtl_grp should be dispatched.  This function
+ * dispatches bio's from the children throtl_grps and kicks
+ * throtl_data->dispatch_work if there are bio's ready to be issued.
+ */
 static void throtl_pending_timer_fn(unsigned long arg)
 {
 	struct throtl_service_queue *sq = (void *)arg;
 	struct throtl_data *td = sq_to_td(sq);
-
-	queue_work(kthrotld_workqueue, &td->dispatch_work);
-}
-
-/* work function to dispatch throttled bios */
-void blk_throtl_dispatch_work_fn(struct work_struct *work)
-{
-	struct throtl_data *td = container_of(work, struct throtl_data,
-					      dispatch_work);
-	struct throtl_service_queue *sq = &td->service_queue;
 	struct request_queue *q = td->queue;
-	struct bio_list bio_list_on_stack;
-	struct bio *bio;
-	struct blk_plug plug;
 	bool dispatched = false;
-	int rw, ret;
+	int ret;
 
 	spin_lock_irq(q->queue_lock);
 
-	bio_list_init(&bio_list_on_stack);
-
 	while (true) {
 		throtl_log(sq, "dispatch nr_queued=%u read=%u write=%u",
 			   td->nr_queued[READ] + td->nr_queued[WRITE],
@@ -964,10 +959,6 @@ void blk_throtl_dispatch_work_fn(struct work_struct *work)
 
 		ret = throtl_select_dispatch(sq);
 		if (ret) {
-			for (rw = READ; rw <= WRITE; rw++) {
-				bio_list_merge(&bio_list_on_stack, &sq->bio_lists[rw]);
-				bio_list_init(&sq->bio_lists[rw]);
-			}
 			throtl_log(sq, "bios disp=%u", ret);
 			dispatched = true;
 		}
@@ -981,13 +972,41 @@ void blk_throtl_dispatch_work_fn(struct work_struct *work)
 		spin_lock_irq(q->queue_lock);
 	}
 
+	if (dispatched)
+		queue_work(kthrotld_workqueue, &td->dispatch_work);
+
 	spin_unlock_irq(q->queue_lock);
+}
 
-	/*
-	 * If we dispatched some requests, unplug the queue to make sure
-	 * immediate dispatch
-	 */
-	if (dispatched) {
+/**
+ * blk_throtl_dispatch_work_fn - work function for throtl_data->dispatch_work
+ * @work: work item being executed
+ *
+ * This function is queued for execution when bio's reach the bio_lists[]
+ * of throtl_data->service_queue.  Those bio's are ready and issued by this
+ * function.
+ */
+void blk_throtl_dispatch_work_fn(struct work_struct *work)
+{
+	struct throtl_data *td = container_of(work, struct throtl_data,
+					      dispatch_work);
+	struct throtl_service_queue *td_sq = &td->service_queue;
+	struct request_queue *q = td->queue;
+	struct bio_list bio_list_on_stack;
+	struct bio *bio;
+	struct blk_plug plug;
+	int rw;
+
+	bio_list_init(&bio_list_on_stack);
+
+	spin_lock_irq(q->queue_lock);
+	for (rw = READ; rw <= WRITE; rw++) {
+		bio_list_merge(&bio_list_on_stack, &td_sq->bio_lists[rw]);
+		bio_list_init(&td_sq->bio_lists[rw]);
+	}
+	spin_unlock_irq(q->queue_lock);
+
+	if (!bio_list_empty(&bio_list_on_stack)) {
 		blk_start_plug(&plug);
 		while((bio = bio_list_pop(&bio_list_on_stack)))
 			generic_make_request(bio);
-- 
cgit v0.10.2


From 2a12f0dcdad1ba7c0e53bbff8e5f6d0ee7a29882 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 14 May 2013 13:52:37 -0700
Subject: blk-throttle: make blk_throtl_drain() ready for hierarchy

The current blk_throtl_drain() assumes that all active throtl_grps are
queued on throtl_data->service_queue, which won't be true once
hierarchy support is implemented.

This patch makes blk_throtl_drain() perform post-order walk of the
blkg hierarchy draining each associated throtl_grp, which guarantees
that all bios will eventually be pushed to the top-level service_queue
in throtl_data.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Vivek Goyal <vgoyal@redhat.com>

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 918d222..8c6e133 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -1299,6 +1299,28 @@ out:
 	return throttled;
 }
 
+/*
+ * Dispatch all bios from all children tg's queued on @parent_sq.  On
+ * return, @parent_sq is guaranteed to not have any active children tg's
+ * and all bios from previously active tg's are on @parent_sq->bio_lists[].
+ */
+static void tg_drain_bios(struct throtl_service_queue *parent_sq)
+{
+	struct throtl_grp *tg;
+
+	while ((tg = throtl_rb_first(parent_sq))) {
+		struct throtl_service_queue *sq = &tg->service_queue;
+		struct bio *bio;
+
+		throtl_dequeue_tg(tg);
+
+		while ((bio = bio_list_peek(&sq->bio_lists[READ])))
+			tg_dispatch_one_bio(tg, bio_data_dir(bio));
+		while ((bio = bio_list_peek(&sq->bio_lists[WRITE])))
+			tg_dispatch_one_bio(tg, bio_data_dir(bio));
+	}
+}
+
 /**
  * blk_throtl_drain - drain throttled bios
  * @q: request_queue to drain throttled bios for
@@ -1309,27 +1331,34 @@ void blk_throtl_drain(struct request_queue *q)
 	__releases(q->queue_lock) __acquires(q->queue_lock)
 {
 	struct throtl_data *td = q->td;
-	struct throtl_service_queue *parent_sq = &td->service_queue;
-	struct throtl_grp *tg;
+	struct blkcg_gq *blkg;
+	struct cgroup *pos_cgrp;
 	struct bio *bio;
 	int rw;
 
 	queue_lockdep_assert_held(q);
+	rcu_read_lock();
 
-	while ((tg = throtl_rb_first(parent_sq))) {
-		struct throtl_service_queue *sq = &tg->service_queue;
+	/*
+	 * Drain each tg while doing post-order walk on the blkg tree, so
+	 * that all bios are propagated to td->service_queue.  It'd be
+	 * better to walk service_queue tree directly but blkg walk is
+	 * easier.
+	 */
+	blkg_for_each_descendant_post(blkg, pos_cgrp, td->queue->root_blkg)
+		tg_drain_bios(&blkg_to_tg(blkg)->service_queue);
 
-		throtl_dequeue_tg(tg);
+	tg_drain_bios(&td_root_tg(td)->service_queue);
 
-		while ((bio = bio_list_peek(&sq->bio_lists[READ])))
-			tg_dispatch_one_bio(tg, bio_data_dir(bio));
-		while ((bio = bio_list_peek(&sq->bio_lists[WRITE])))
-			tg_dispatch_one_bio(tg, bio_data_dir(bio));
-	}
+	/* finally, transfer bios from top-level tg's into the td */
+	tg_drain_bios(&td->service_queue);
+
+	rcu_read_unlock();
 	spin_unlock_irq(q->queue_lock);
 
+	/* all bios now should be in td->service_queue, issue them */
 	for (rw = READ; rw <= WRITE; rw++)
-		while ((bio = bio_list_pop(&parent_sq->bio_lists[rw])))
+		while ((bio = bio_list_pop(&td->service_queue.bio_lists[rw])))
 			generic_make_request(bio);
 
 	spin_lock_irq(q->queue_lock);
-- 
cgit v0.10.2


From 9e660acffcd1b5adc4ec1ffba0cbb584f86b8907 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 14 May 2013 13:52:38 -0700
Subject: blk-throttle: make blk_throtl_bio() ready for hierarchy

Currently, blk_throtl_bio() issues the passed in bio directly if it's
within limits of its associated tg (throtl_grp).  This behavior
becomes incorrect with hierarchy support as the bio should be
accounted to and throttled by the ancestor throtl_grps too.

This patch makes the direct issue path of blk_throtl_bio() to loop
until it reaches the top-level service_queue or gets throttled.  If
the former, the bio can be issued directly; otherwise, it gets queued
at the first layer it was above limits.

As tg->parent_sq is always the top-level service queue currently, this
patch in itself doesn't make any behavior differences.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Vivek Goyal <vgoyal@redhat.com>

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 8c6e133..52321a4 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -1239,12 +1239,16 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
 
 	sq = &tg->service_queue;
 
-	/* throtl is FIFO - if other bios are already queued, should queue */
-	if (sq->nr_queued[rw])
-		goto queue_bio;
+	while (true) {
+		/* throtl is FIFO - if bios are already queued, should queue */
+		if (sq->nr_queued[rw])
+			break;
 
-	/* Bio is with-in rate limit of group */
-	if (tg_may_dispatch(tg, bio, NULL)) {
+		/* if above limits, break to queue */
+		if (!tg_may_dispatch(tg, bio, NULL))
+			break;
+
+		/* within limits, let's charge and dispatch directly */
 		throtl_charge_bio(tg, bio);
 
 		/*
@@ -1259,10 +1263,19 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
 		 * So keep on trimming slice even if bio is not queued.
 		 */
 		throtl_trim_slice(tg, rw);
-		goto out_unlock;
+
+		/*
+		 * @bio passed through this layer without being throttled.
+		 * Climb up the ladder.  If we''re already at the top, it
+		 * can be executed directly.
+		 */
+		sq = sq->parent_sq;
+		tg = sq_to_tg(sq);
+		if (!tg)
+			goto out_unlock;
 	}
 
-queue_bio:
+	/* out-of-limit, queue to @tg */
 	throtl_log(sq, "[%c] bio. bdisp=%llu sz=%u bps=%llu iodisp=%u iops=%u queued=%d/%d",
 		   rw == READ ? 'R' : 'W',
 		   tg->bytes_disp[rw], bio->bi_size, tg->bps[rw],
-- 
cgit v0.10.2


From 6bc9c2b464fb89eab705da87aa4284171d942369 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 14 May 2013 13:52:38 -0700
Subject: blk-throttle: make tg_dispatch_one_bio() ready for hierarchy

tg_dispatch_one_bio() currently assumes that the parent_sq is the top
level one and the bio being dispatched is ready to be issued; however,
this assumption will be wrong with proper hierarchy support.  This
patch makes the following changes to make tg_dispatch_on_bio() ready
for hiearchy.

* throtl_data->nr_queued[] is incremented in blk_throtl_bio() instead
  of throtl_add_bio_tg() so that throtl_add_bio_tg() can be used to
  transfer a bio from a child tg to its parent.

* tg_dispatch_one_bio() is updated to distinguish whether its parent
  is another throtl_grp or the throtl_data.  If former, the bio is
  transferred to the parent throtl_grp using throtl_add_bio_tg().  If
  latter, the bio is ready to be issued and put on the top-level
  service_queue's bio_lists[] and throtl_data->nr_queued is
  decremented.

As all throtl_grps currently have the top level service_queue as their
->parent_sq, this patch in itself doesn't make any behavior
difference.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Vivek Goyal <vgoyal@redhat.com>

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 52321a4..0420261 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -824,7 +824,6 @@ static void throtl_add_bio_tg(struct bio *bio, struct throtl_grp *tg)
 	/* Take a bio reference on tg */
 	blkg_get(tg_to_blkg(tg));
 	sq->nr_queued[rw]++;
-	tg->td->nr_queued[rw]++;
 	throtl_enqueue_tg(tg);
 }
 
@@ -855,20 +854,34 @@ static void tg_update_disptime(struct throtl_grp *tg)
 static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw)
 {
 	struct throtl_service_queue *sq = &tg->service_queue;
+	struct throtl_service_queue *parent_sq = sq->parent_sq;
+	struct throtl_grp *parent_tg = sq_to_tg(parent_sq);
 	struct bio *bio;
 
 	bio = bio_list_pop(&sq->bio_lists[rw]);
 	sq->nr_queued[rw]--;
-	/* Drop bio reference on blkg */
-	blkg_put(tg_to_blkg(tg));
-
-	BUG_ON(tg->td->nr_queued[rw] <= 0);
-	tg->td->nr_queued[rw]--;
 
 	throtl_charge_bio(tg, bio);
-	bio_list_add(&sq->parent_sq->bio_lists[rw], bio);
+
+	/*
+	 * If our parent is another tg, we just need to transfer @bio to
+	 * the parent using throtl_add_bio_tg().  If our parent is
+	 * @td->service_queue, @bio is ready to be issued.  Put it on its
+	 * bio_lists[] and decrease total number queued.  The caller is
+	 * responsible for issuing these bios.
+	 */
+	if (parent_tg) {
+		throtl_add_bio_tg(bio, parent_tg);
+	} else {
+		bio_list_add(&parent_sq->bio_lists[rw], bio);
+		BUG_ON(tg->td->nr_queued[rw] <= 0);
+		tg->td->nr_queued[rw]--;
+	}
 
 	throtl_trim_slice(tg, rw);
+
+	/* @bio is transferred to parent, drop its blkg reference */
+	blkg_put(tg_to_blkg(tg));
 }
 
 static int throtl_dispatch_tg(struct throtl_grp *tg)
@@ -1283,6 +1296,7 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
 		   sq->nr_queued[READ], sq->nr_queued[WRITE]);
 
 	bio_associate_current(bio);
+	tg->td->nr_queued[rw]++;
 	throtl_add_bio_tg(bio, tg);
 	throttled = true;
 
-- 
cgit v0.10.2


From 2e48a530a3a7daebd0cc17866304a36d39b611de Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 14 May 2013 13:52:38 -0700
Subject: blk-throttle: make throtl_pending_timer_fn() ready for hierarchy

throtl_pending_timer_fn() currently assumes that the parent_sq is the
top level one and the bio's dispatched are ready to be issued;
however, this assumption will be wrong with proper hierarchy support.
This patch makes the following changes to make
throtl_pending_timer_fn() ready for hiearchy.

* If the parent_sq isn't the top-level one, update the parent
  throtl_grp's dispatch time and schedule the next dispatch as
  necessary.  If the parent's dispatch time is now, repeat the
  function for the parent throtl_grp.

* If the parent_sq is the top-level one, kick issue work_item as
  before.

* The debug message printed by throtl_log() now prints out the
  service_queue's nr_queued[] instead of the total nr_queued as the
  latter becomes uninteresting and misleading with hierarchical
  dispatch.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Vivek Goyal <vgoyal@redhat.com>

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 0420261..bc65077 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -952,23 +952,33 @@ static int throtl_select_dispatch(struct throtl_service_queue *parent_sq)
  * This timer is armed when a child throtl_grp with active bio's become
  * pending and queued on the service_queue's pending_tree and expires when
  * the first child throtl_grp should be dispatched.  This function
- * dispatches bio's from the children throtl_grps and kicks
- * throtl_data->dispatch_work if there are bio's ready to be issued.
+ * dispatches bio's from the children throtl_grps to the parent
+ * service_queue.
+ *
+ * If the parent's parent is another throtl_grp, dispatching is propagated
+ * by either arming its pending_timer or repeating dispatch directly.  If
+ * the top-level service_tree is reached, throtl_data->dispatch_work is
+ * kicked so that the ready bio's are issued.
  */
 static void throtl_pending_timer_fn(unsigned long arg)
 {
 	struct throtl_service_queue *sq = (void *)arg;
+	struct throtl_grp *tg = sq_to_tg(sq);
 	struct throtl_data *td = sq_to_td(sq);
 	struct request_queue *q = td->queue;
-	bool dispatched = false;
+	struct throtl_service_queue *parent_sq;
+	bool dispatched;
 	int ret;
 
 	spin_lock_irq(q->queue_lock);
+again:
+	parent_sq = sq->parent_sq;
+	dispatched = false;
 
 	while (true) {
 		throtl_log(sq, "dispatch nr_queued=%u read=%u write=%u",
-			   td->nr_queued[READ] + td->nr_queued[WRITE],
-			   td->nr_queued[READ], td->nr_queued[WRITE]);
+			   sq->nr_queued[READ] + sq->nr_queued[WRITE],
+			   sq->nr_queued[READ], sq->nr_queued[WRITE]);
 
 		ret = throtl_select_dispatch(sq);
 		if (ret) {
@@ -985,9 +995,25 @@ static void throtl_pending_timer_fn(unsigned long arg)
 		spin_lock_irq(q->queue_lock);
 	}
 
-	if (dispatched)
-		queue_work(kthrotld_workqueue, &td->dispatch_work);
+	if (!dispatched)
+		goto out_unlock;
 
+	if (parent_sq) {
+		/* @parent_sq is another throl_grp, propagate dispatch */
+		if (tg->flags & THROTL_TG_WAS_EMPTY) {
+			tg_update_disptime(tg);
+			if (!throtl_schedule_next_dispatch(parent_sq, false)) {
+				/* window is already open, repeat dispatching */
+				sq = parent_sq;
+				tg = sq_to_tg(sq);
+				goto again;
+			}
+		}
+	} else {
+		/* reached the top-level, queue issueing */
+		queue_work(kthrotld_workqueue, &td->dispatch_work);
+	}
+out_unlock:
 	spin_unlock_irq(q->queue_lock);
 }
 
-- 
cgit v0.10.2


From c5cc2070b45333f40a3f99319b83c8caeb62ec05 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 14 May 2013 13:52:38 -0700
Subject: blk-throttle: add throtl_qnode for dispatch fairness

With flat hierarchy, there's only single level of dispatching
happening and fairness beyond that point is the responsibility of the
rest of the block layer and driver, which usually works out okay;
however, with the planned hierarchy support,
service_queue->bio_lists[] can be filled up by bios from a single
source.  While the limits would still be honored, it'd be very easy to
starve IOs from siblings or children.

To avoid such starvation, this patch implements throtl_qnode and
converts service_queue->bio_lists[] to lists of per-source qnodes
which in turn contains the bio's.  For example, when a bio is
dispatched from a child group, the bio doesn't get queued on
->bio_lists[] directly but it first gets queued on the group's qnode
which in turn gets queued on service_queue->queued[].  When
dispatching for the upper level, the ->queued[] list is consumed in
round-robing order so that the dispatch windows is consumed fairly by
all IO sources.

There are two ways a bio can come to a throtl_grp - directly queued to
the group or dispatched from a child.  For the former
throtl_grp->qnode_on_self[rw] is used.  For the latter, the child's
->qnode_on_parent[rw].

Note that this means that the child which is contributing a bio to its
parent should stay pinned until all its bios are dispatched to its
grand-parent.  This patch moves blkg refcnting from bio add/remove
spots to qnode activation/deactivation so that the blkg containing an
active qnode is always pinned.  As child pins the parent, this is
sufficient for keeping the relevant sub-tree pinned while bios are in
flight.

The starvation issue was spotted by Vivek Goyal.

v2: The original patch used the same throtl_grp->qnode_on_self/parent
    for reads and writes causing RWs to be queued incorrectly if there
    already are outstanding IOs in the other direction.  They should
    be throtl_grp->qnode_on_self/parent[2] so that READs and WRITEs
    can use different qnodes.  Spotted by Vivek Goyal.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Vivek Goyal <vgoyal@redhat.com>

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index bc65077..541bd0d 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -26,6 +26,35 @@ static struct blkcg_policy blkcg_policy_throtl;
 /* A workqueue to queue throttle related work */
 static struct workqueue_struct *kthrotld_workqueue;
 
+/*
+ * To implement hierarchical throttling, throtl_grps form a tree and bios
+ * are dispatched upwards level by level until they reach the top and get
+ * issued.  When dispatching bios from the children and local group at each
+ * level, if the bios are dispatched into a single bio_list, there's a risk
+ * of a local or child group which can queue many bios at once filling up
+ * the list starving others.
+ *
+ * To avoid such starvation, dispatched bios are queued separately
+ * according to where they came from.  When they are again dispatched to
+ * the parent, they're popped in round-robin order so that no single source
+ * hogs the dispatch window.
+ *
+ * throtl_qnode is used to keep the queued bios separated by their sources.
+ * Bios are queued to throtl_qnode which in turn is queued to
+ * throtl_service_queue and then dispatched in round-robin order.
+ *
+ * It's also used to track the reference counts on blkg's.  A qnode always
+ * belongs to a throtl_grp and gets queued on itself or the parent, so
+ * incrementing the reference of the associated throtl_grp when a qnode is
+ * queued and decrementing when dequeued is enough to keep the whole blkg
+ * tree pinned while bios are in flight.
+ */
+struct throtl_qnode {
+	struct list_head	node;		/* service_queue->queued[] */
+	struct bio_list		bios;		/* queued bios */
+	struct throtl_grp	*tg;		/* tg this qnode belongs to */
+};
+
 struct throtl_service_queue {
 	struct throtl_service_queue *parent_sq;	/* the parent service_queue */
 
@@ -33,7 +62,7 @@ struct throtl_service_queue {
 	 * Bios queued directly to this service_queue or dispatched from
 	 * children throtl_grp's.
 	 */
-	struct bio_list		bio_lists[2];	/* queued bios [READ/WRITE] */
+	struct list_head	queued[2];	/* throtl_qnode [READ/WRITE] */
 	unsigned int		nr_queued[2];	/* number of queued bios */
 
 	/*
@@ -76,6 +105,17 @@ struct throtl_grp {
 	struct throtl_service_queue service_queue;
 
 	/*
+	 * qnode_on_self is used when bios are directly queued to this
+	 * throtl_grp so that local bios compete fairly with bios
+	 * dispatched from children.  qnode_on_parent is used when bios are
+	 * dispatched from this throtl_grp into its parent and will compete
+	 * with the sibling qnode_on_parents and the parent's
+	 * qnode_on_self.
+	 */
+	struct throtl_qnode qnode_on_self[2];
+	struct throtl_qnode qnode_on_parent[2];
+
+	/*
 	 * Dispatch time in jiffies. This is the estimated time when group
 	 * will unthrottle and is ready to dispatch more bio. It is used as
 	 * key to sort active groups in service tree.
@@ -250,12 +290,95 @@ alloc_stats:
 		goto alloc_stats;
 }
 
+static void throtl_qnode_init(struct throtl_qnode *qn, struct throtl_grp *tg)
+{
+	INIT_LIST_HEAD(&qn->node);
+	bio_list_init(&qn->bios);
+	qn->tg = tg;
+}
+
+/**
+ * throtl_qnode_add_bio - add a bio to a throtl_qnode and activate it
+ * @bio: bio being added
+ * @qn: qnode to add bio to
+ * @queued: the service_queue->queued[] list @qn belongs to
+ *
+ * Add @bio to @qn and put @qn on @queued if it's not already on.
+ * @qn->tg's reference count is bumped when @qn is activated.  See the
+ * comment on top of throtl_qnode definition for details.
+ */
+static void throtl_qnode_add_bio(struct bio *bio, struct throtl_qnode *qn,
+				 struct list_head *queued)
+{
+	bio_list_add(&qn->bios, bio);
+	if (list_empty(&qn->node)) {
+		list_add_tail(&qn->node, queued);
+		blkg_get(tg_to_blkg(qn->tg));
+	}
+}
+
+/**
+ * throtl_peek_queued - peek the first bio on a qnode list
+ * @queued: the qnode list to peek
+ */
+static struct bio *throtl_peek_queued(struct list_head *queued)
+{
+	struct throtl_qnode *qn = list_first_entry(queued, struct throtl_qnode, node);
+	struct bio *bio;
+
+	if (list_empty(queued))
+		return NULL;
+
+	bio = bio_list_peek(&qn->bios);
+	WARN_ON_ONCE(!bio);
+	return bio;
+}
+
+/**
+ * throtl_pop_queued - pop the first bio form a qnode list
+ * @queued: the qnode list to pop a bio from
+ * @tg_to_put: optional out argument for throtl_grp to put
+ *
+ * Pop the first bio from the qnode list @queued.  After popping, the first
+ * qnode is removed from @queued if empty or moved to the end of @queued so
+ * that the popping order is round-robin.
+ *
+ * When the first qnode is removed, its associated throtl_grp should be put
+ * too.  If @tg_to_put is NULL, this function automatically puts it;
+ * otherwise, *@tg_to_put is set to the throtl_grp to put and the caller is
+ * responsible for putting it.
+ */
+static struct bio *throtl_pop_queued(struct list_head *queued,
+				     struct throtl_grp **tg_to_put)
+{
+	struct throtl_qnode *qn = list_first_entry(queued, struct throtl_qnode, node);
+	struct bio *bio;
+
+	if (list_empty(queued))
+		return NULL;
+
+	bio = bio_list_pop(&qn->bios);
+	WARN_ON_ONCE(!bio);
+
+	if (bio_list_empty(&qn->bios)) {
+		list_del_init(&qn->node);
+		if (tg_to_put)
+			*tg_to_put = qn->tg;
+		else
+			blkg_put(tg_to_blkg(qn->tg));
+	} else {
+		list_move_tail(&qn->node, queued);
+	}
+
+	return bio;
+}
+
 /* init a service_queue, assumes the caller zeroed it */
 static void throtl_service_queue_init(struct throtl_service_queue *sq,
 				      struct throtl_service_queue *parent_sq)
 {
-	bio_list_init(&sq->bio_lists[0]);
-	bio_list_init(&sq->bio_lists[1]);
+	INIT_LIST_HEAD(&sq->queued[0]);
+	INIT_LIST_HEAD(&sq->queued[1]);
 	sq->pending_tree = RB_ROOT;
 	sq->parent_sq = parent_sq;
 	setup_timer(&sq->pending_timer, throtl_pending_timer_fn,
@@ -272,8 +395,14 @@ static void throtl_pd_init(struct blkcg_gq *blkg)
 	struct throtl_grp *tg = blkg_to_tg(blkg);
 	struct throtl_data *td = blkg->q->td;
 	unsigned long flags;
+	int rw;
 
 	throtl_service_queue_init(&tg->service_queue, &td->service_queue);
+	for (rw = READ; rw <= WRITE; rw++) {
+		throtl_qnode_init(&tg->qnode_on_self[rw], tg);
+		throtl_qnode_init(&tg->qnode_on_parent[rw], tg);
+	}
+
 	RB_CLEAR_NODE(&tg->rb_node);
 	tg->td = td;
 
@@ -715,7 +844,7 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
 	 * queued.
 	 */
 	BUG_ON(tg->service_queue.nr_queued[rw] &&
-	       bio != bio_list_peek(&tg->service_queue.bio_lists[rw]));
+	       bio != throtl_peek_queued(&tg->service_queue.queued[rw]));
 
 	/* If tg->bps = -1, then BW is unlimited */
 	if (tg->bps[rw] == -1 && tg->iops[rw] == -1) {
@@ -806,11 +935,24 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
 	}
 }
 
-static void throtl_add_bio_tg(struct bio *bio, struct throtl_grp *tg)
+/**
+ * throtl_add_bio_tg - add a bio to the specified throtl_grp
+ * @bio: bio to add
+ * @qn: qnode to use
+ * @tg: the target throtl_grp
+ *
+ * Add @bio to @tg's service_queue using @qn.  If @qn is not specified,
+ * tg->qnode_on_self[] is used.
+ */
+static void throtl_add_bio_tg(struct bio *bio, struct throtl_qnode *qn,
+			      struct throtl_grp *tg)
 {
 	struct throtl_service_queue *sq = &tg->service_queue;
 	bool rw = bio_data_dir(bio);
 
+	if (!qn)
+		qn = &tg->qnode_on_self[rw];
+
 	/*
 	 * If @tg doesn't currently have any bios queued in the same
 	 * direction, queueing @bio can change when @tg should be
@@ -820,9 +962,8 @@ static void throtl_add_bio_tg(struct bio *bio, struct throtl_grp *tg)
 	if (!sq->nr_queued[rw])
 		tg->flags |= THROTL_TG_WAS_EMPTY;
 
-	bio_list_add(&sq->bio_lists[rw], bio);
-	/* Take a bio reference on tg */
-	blkg_get(tg_to_blkg(tg));
+	throtl_qnode_add_bio(bio, qn, &sq->queued[rw]);
+
 	sq->nr_queued[rw]++;
 	throtl_enqueue_tg(tg);
 }
@@ -833,10 +974,10 @@ static void tg_update_disptime(struct throtl_grp *tg)
 	unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime;
 	struct bio *bio;
 
-	if ((bio = bio_list_peek(&sq->bio_lists[READ])))
+	if ((bio = throtl_peek_queued(&sq->queued[READ])))
 		tg_may_dispatch(tg, bio, &read_wait);
 
-	if ((bio = bio_list_peek(&sq->bio_lists[WRITE])))
+	if ((bio = throtl_peek_queued(&sq->queued[WRITE])))
 		tg_may_dispatch(tg, bio, &write_wait);
 
 	min_wait = min(read_wait, write_wait);
@@ -856,9 +997,16 @@ static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw)
 	struct throtl_service_queue *sq = &tg->service_queue;
 	struct throtl_service_queue *parent_sq = sq->parent_sq;
 	struct throtl_grp *parent_tg = sq_to_tg(parent_sq);
+	struct throtl_grp *tg_to_put = NULL;
 	struct bio *bio;
 
-	bio = bio_list_pop(&sq->bio_lists[rw]);
+	/*
+	 * @bio is being transferred from @tg to @parent_sq.  Popping a bio
+	 * from @tg may put its reference and @parent_sq might end up
+	 * getting released prematurely.  Remember the tg to put and put it
+	 * after @bio is transferred to @parent_sq.
+	 */
+	bio = throtl_pop_queued(&sq->queued[rw], &tg_to_put);
 	sq->nr_queued[rw]--;
 
 	throtl_charge_bio(tg, bio);
@@ -871,17 +1019,18 @@ static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw)
 	 * responsible for issuing these bios.
 	 */
 	if (parent_tg) {
-		throtl_add_bio_tg(bio, parent_tg);
+		throtl_add_bio_tg(bio, &tg->qnode_on_parent[rw], parent_tg);
 	} else {
-		bio_list_add(&parent_sq->bio_lists[rw], bio);
+		throtl_qnode_add_bio(bio, &tg->qnode_on_parent[rw],
+				     &parent_sq->queued[rw]);
 		BUG_ON(tg->td->nr_queued[rw] <= 0);
 		tg->td->nr_queued[rw]--;
 	}
 
 	throtl_trim_slice(tg, rw);
 
-	/* @bio is transferred to parent, drop its blkg reference */
-	blkg_put(tg_to_blkg(tg));
+	if (tg_to_put)
+		blkg_put(tg_to_blkg(tg_to_put));
 }
 
 static int throtl_dispatch_tg(struct throtl_grp *tg)
@@ -894,7 +1043,7 @@ static int throtl_dispatch_tg(struct throtl_grp *tg)
 
 	/* Try to dispatch 75% READS and 25% WRITES */
 
-	while ((bio = bio_list_peek(&sq->bio_lists[READ])) &&
+	while ((bio = throtl_peek_queued(&sq->queued[READ])) &&
 	       tg_may_dispatch(tg, bio, NULL)) {
 
 		tg_dispatch_one_bio(tg, bio_data_dir(bio));
@@ -904,7 +1053,7 @@ static int throtl_dispatch_tg(struct throtl_grp *tg)
 			break;
 	}
 
-	while ((bio = bio_list_peek(&sq->bio_lists[WRITE])) &&
+	while ((bio = throtl_peek_queued(&sq->queued[WRITE])) &&
 	       tg_may_dispatch(tg, bio, NULL)) {
 
 		tg_dispatch_one_bio(tg, bio_data_dir(bio));
@@ -1039,10 +1188,9 @@ void blk_throtl_dispatch_work_fn(struct work_struct *work)
 	bio_list_init(&bio_list_on_stack);
 
 	spin_lock_irq(q->queue_lock);
-	for (rw = READ; rw <= WRITE; rw++) {
-		bio_list_merge(&bio_list_on_stack, &td_sq->bio_lists[rw]);
-		bio_list_init(&td_sq->bio_lists[rw]);
-	}
+	for (rw = READ; rw <= WRITE; rw++)
+		while ((bio = throtl_pop_queued(&td_sq->queued[rw], NULL)))
+			bio_list_add(&bio_list_on_stack, bio);
 	spin_unlock_irq(q->queue_lock);
 
 	if (!bio_list_empty(&bio_list_on_stack)) {
@@ -1241,6 +1389,7 @@ static struct blkcg_policy blkcg_policy_throtl = {
 bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
 {
 	struct throtl_data *td = q->td;
+	struct throtl_qnode *qn = NULL;
 	struct throtl_grp *tg;
 	struct throtl_service_queue *sq;
 	bool rw = bio_data_dir(bio);
@@ -1308,6 +1457,7 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
 		 * Climb up the ladder.  If we''re already at the top, it
 		 * can be executed directly.
 		 */
+		qn = &tg->qnode_on_parent[rw];
 		sq = sq->parent_sq;
 		tg = sq_to_tg(sq);
 		if (!tg)
@@ -1323,7 +1473,7 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
 
 	bio_associate_current(bio);
 	tg->td->nr_queued[rw]++;
-	throtl_add_bio_tg(bio, tg);
+	throtl_add_bio_tg(bio, qn, tg);
 	throttled = true;
 
 	/*
@@ -1367,9 +1517,9 @@ static void tg_drain_bios(struct throtl_service_queue *parent_sq)
 
 		throtl_dequeue_tg(tg);
 
-		while ((bio = bio_list_peek(&sq->bio_lists[READ])))
+		while ((bio = throtl_peek_queued(&sq->queued[READ])))
 			tg_dispatch_one_bio(tg, bio_data_dir(bio));
-		while ((bio = bio_list_peek(&sq->bio_lists[WRITE])))
+		while ((bio = throtl_peek_queued(&sq->queued[WRITE])))
 			tg_dispatch_one_bio(tg, bio_data_dir(bio));
 	}
 }
@@ -1411,7 +1561,8 @@ void blk_throtl_drain(struct request_queue *q)
 
 	/* all bios now should be in td->service_queue, issue them */
 	for (rw = READ; rw <= WRITE; rw++)
-		while ((bio = bio_list_pop(&td->service_queue.bio_lists[rw])))
+		while ((bio = throtl_pop_queued(&td->service_queue.queued[rw],
+						NULL)))
 			generic_make_request(bio);
 
 	spin_lock_irq(q->queue_lock);
-- 
cgit v0.10.2


From 32ee5bc4787dfbdb280b4d81a338dcdd55918c1e Mon Sep 17 00:00:00 2001
From: Vivek Goyal <vgoyal@redhat.com>
Date: Tue, 14 May 2013 13:52:38 -0700
Subject: blk-throttle: Account for child group's start time in parent while
 bio climbs up

With the planned proper hierarchy support, a bio will climb up the
tree before actually being dispatched. This makes sure bio is also
subjected to parent's throttling limits, if any.

It might happen that parent is idle and when bio is transferred to
parent, a new slice starts fresh. But that is incorrect as parents
wait time should have started when bio was queued in child group and
causes IOs to be throttled more than configured as they climb the
hierarchy.

Given the fact that we have not written hierarchical algorithm in a
way where child's and parents time slices are synchronized, we
transfer the child's start time to parent if parent was idling.  If
parent was busy doing dispatch of other bios all this while, this is
not an issue.

Child's slice start time is passed to parent. Parent looks at its
last expired slice start time. If child's start time is after parents
old start time, that means parent had been idle and after parent
went idle, child had an IO queued. So use child's start time as
parent start time.

If parent's start time is after child's start time, that means,
when IO got queued in child group, parent was not idle. But later
it dispatched some IO, its slice got trimmed and then it went idle.
After a while child's request got shifted in parent group. In this
case use parent's old start time as new start time as that's the
duration of slice we did not use.

This logic is far from perfect as if there are multiple childs
then first child transferring the bio decides the start time while
a bio might have queued up even earlier in other child, which is
yet to be transferred up to parent. In that case we will lose
time and bandwidth in parent. This patch is just an approximation
to make situation somewhat better.

Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Tejun Heo <tj@kernel.org>

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 541bd0d..7477f33 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -633,6 +633,28 @@ static bool throtl_schedule_next_dispatch(struct throtl_service_queue *sq,
 	return false;
 }
 
+static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg,
+		bool rw, unsigned long start)
+{
+	tg->bytes_disp[rw] = 0;
+	tg->io_disp[rw] = 0;
+
+	/*
+	 * Previous slice has expired. We must have trimmed it after last
+	 * bio dispatch. That means since start of last slice, we never used
+	 * that bandwidth. Do try to make use of that bandwidth while giving
+	 * credit.
+	 */
+	if (time_after_eq(start, tg->slice_start[rw]))
+		tg->slice_start[rw] = start;
+
+	tg->slice_end[rw] = jiffies + throtl_slice;
+	throtl_log(&tg->service_queue,
+		   "[%c] new slice with credit start=%lu end=%lu jiffies=%lu",
+		   rw == READ ? 'R' : 'W', tg->slice_start[rw],
+		   tg->slice_end[rw], jiffies);
+}
+
 static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw)
 {
 	tg->bytes_disp[rw] = 0;
@@ -992,6 +1014,16 @@ static void tg_update_disptime(struct throtl_grp *tg)
 	tg->flags &= ~THROTL_TG_WAS_EMPTY;
 }
 
+static void start_parent_slice_with_credit(struct throtl_grp *child_tg,
+					struct throtl_grp *parent_tg, bool rw)
+{
+	if (throtl_slice_used(parent_tg, rw)) {
+		throtl_start_new_slice_with_credit(parent_tg, rw,
+				child_tg->slice_start[rw]);
+	}
+
+}
+
 static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw)
 {
 	struct throtl_service_queue *sq = &tg->service_queue;
@@ -1020,6 +1052,7 @@ static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw)
 	 */
 	if (parent_tg) {
 		throtl_add_bio_tg(bio, &tg->qnode_on_parent[rw], parent_tg);
+		start_parent_slice_with_credit(tg, parent_tg, rw);
 	} else {
 		throtl_qnode_add_bio(bio, &tg->qnode_on_parent[rw],
 				     &parent_sq->queued[rw]);
-- 
cgit v0.10.2


From 693e751e70843c29884cde326016e746fa16073a Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 14 May 2013 13:52:38 -0700
Subject: blk-throttle: implement throtl_grp->has_rules[]

blk_throtl_bio() has a quick exit path for throtl_grps without limits
configured.  It looks at the bps and iops limits and if both are not
configured, the bio is issued immediately.  While this is correct in
the current flat hierarchy as each throtl_grp behaves completely
independently, it would become wrong in proper hierarchy mode.  A
group without any limits could still be limited by one of its
ancestors and bio's queued for such group should not bypass
blk-throtl.

As having a quick bypass mechanism is beneficial, this patch
reimplements the mechanism such that it's correct even with proper
hierarchy.  throtl_grp->has_rules[] is added.  These booleans are
updated for the whole subtree whenever a config is updated so that
has_rules[] of the whole subtree stays synchronized.  They're also
updated when a new throtl_grp comes online so that it can't escape the
limits of its ancestors.

As no throtl_grp has another throtl_grp as parent now, this patch
doesn't yet make any behavior differences.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Vivek Goyal <vgoyal@redhat.com>

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 7477f33..27f006b 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -124,6 +124,9 @@ struct throtl_grp {
 
 	unsigned int flags;
 
+	/* are there any throtl rules between this group and td? */
+	bool has_rules[2];
+
 	/* bytes per second rate limits */
 	uint64_t bps[2];
 
@@ -422,6 +425,30 @@ static void throtl_pd_init(struct blkcg_gq *blkg)
 	spin_unlock_irqrestore(&tg_stats_alloc_lock, flags);
 }
 
+/*
+ * Set has_rules[] if @tg or any of its parents have limits configured.
+ * This doesn't require walking up to the top of the hierarchy as the
+ * parent's has_rules[] is guaranteed to be correct.
+ */
+static void tg_update_has_rules(struct throtl_grp *tg)
+{
+	struct throtl_grp *parent_tg = sq_to_tg(tg->service_queue.parent_sq);
+	int rw;
+
+	for (rw = READ; rw <= WRITE; rw++)
+		tg->has_rules[rw] = (parent_tg && parent_tg->has_rules[rw]) ||
+				    (tg->bps[rw] != -1 || tg->iops[rw] != -1);
+}
+
+static void throtl_pd_online(struct blkcg_gq *blkg)
+{
+	/*
+	 * We don't want new groups to escape the limits of its ancestors.
+	 * Update has_rules[] after a new group is brought online.
+	 */
+	tg_update_has_rules(blkg_to_tg(blkg));
+}
+
 static void throtl_pd_exit(struct blkcg_gq *blkg)
 {
 	struct throtl_grp *tg = blkg_to_tg(blkg);
@@ -843,12 +870,6 @@ static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio,
 	return 0;
 }
 
-static bool tg_no_rule_group(struct throtl_grp *tg, bool rw) {
-	if (tg->bps[rw] == -1 && tg->iops[rw] == -1)
-		return 1;
-	return 0;
-}
-
 /*
  * Returns whether one can dispatch a bio or not. Also returns approx number
  * of jiffies to wait before this bio is with-in IO rate and can be dispatched
@@ -1307,6 +1328,8 @@ static int tg_set_conf(struct cgroup *cgrp, struct cftype *cft, const char *buf,
 	struct blkg_conf_ctx ctx;
 	struct throtl_grp *tg;
 	struct throtl_service_queue *sq;
+	struct blkcg_gq *blkg;
+	struct cgroup *pos_cgrp;
 	int ret;
 
 	ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx);
@@ -1330,6 +1353,17 @@ static int tg_set_conf(struct cgroup *cgrp, struct cftype *cft, const char *buf,
 		   tg->iops[READ], tg->iops[WRITE]);
 
 	/*
+	 * Update has_rules[] flags for the updated tg's subtree.  A tg is
+	 * considered to have rules if either the tg itself or any of its
+	 * ancestors has rules.  This identifies groups without any
+	 * restrictions in the whole hierarchy and allows them to bypass
+	 * blk-throttle.
+	 */
+	tg_update_has_rules(tg);
+	blkg_for_each_descendant_pre(blkg, pos_cgrp, ctx.blkg)
+		tg_update_has_rules(blkg_to_tg(blkg));
+
+	/*
 	 * We're already holding queue_lock and know @tg is valid.  Let's
 	 * apply the new config directly.
 	 *
@@ -1415,6 +1449,7 @@ static struct blkcg_policy blkcg_policy_throtl = {
 	.cftypes		= throtl_files,
 
 	.pd_init_fn		= throtl_pd_init,
+	.pd_online_fn		= throtl_pd_online,
 	.pd_exit_fn		= throtl_pd_exit,
 	.pd_reset_stats_fn	= throtl_pd_reset_stats,
 };
@@ -1442,7 +1477,7 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
 	blkcg = bio_blkcg(bio);
 	tg = throtl_lookup_tg(td, blkcg);
 	if (tg) {
-		if (tg_no_rule_group(tg, rw)) {
+		if (!tg->has_rules[rw]) {
 			throtl_update_dispatch_stats(tg_to_blkg(tg),
 						     bio->bi_size, bio->bi_rw);
 			goto out_unlock_rcu;
-- 
cgit v0.10.2


From 9138125beabbb76b4a373d4a619870f6f5d86fc5 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 14 May 2013 13:52:38 -0700
Subject: blk-throttle: implement proper hierarchy support

With the recent updates, blk-throttle is finally ready for proper
hierarchy support.  Dispatching now honors service_queue->parent_sq
and propagates correctly.  The only thing missing is setting
->parent_sq correctly so that throtl_grp hierarchy matches the cgroup
hierarchy.

This patch updates throtl_pd_init() such that service_queues form the
same hierarchy as the cgroup hierarchy if sane_behavior is enabled.
As this concludes proper hierarchy support for blkcg, the shameful
.broken_hierarchy tag is removed from blkio_subsys.

v2: Updated blkio-controller.txt as suggested by Vivek.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Vivek Goyal <vgoyal@redhat.com>
Cc: Li Zefan <lizefan@huawei.com>

diff --git a/Documentation/cgroups/blkio-controller.txt b/Documentation/cgroups/blkio-controller.txt
index da272c8..cd556b9 100644
--- a/Documentation/cgroups/blkio-controller.txt
+++ b/Documentation/cgroups/blkio-controller.txt
@@ -94,11 +94,13 @@ Throttling/Upper Limit policy
 
 Hierarchical Cgroups
 ====================
-- Currently only CFQ supports hierarchical groups. For throttling,
-  cgroup interface does allow creation of hierarchical cgroups and
-  internally it treats them as flat hierarchy.
 
-  If somebody created a hierarchy like as follows.
+Both CFQ and throttling implement hierarchy support; however,
+throttling's hierarchy support is enabled iff "sane_behavior" is
+enabled from cgroup side, which currently is a development option and
+not publicly available.
+
+If somebody created a hierarchy like as follows.
 
 			root
 			/  \
@@ -106,21 +108,20 @@ Hierarchical Cgroups
 			|
 		     test3
 
-  CFQ will handle the hierarchy correctly but and throttling will
-  practically treat all groups at same level. For details on CFQ
-  hierarchy support, refer to Documentation/block/cfq-iosched.txt.
-  Throttling will treat the hierarchy as if it looks like the
-  following.
+CFQ by default and throttling with "sane_behavior" will handle the
+hierarchy correctly.  For details on CFQ hierarchy support, refer to
+Documentation/block/cfq-iosched.txt.  For throttling, all limits apply
+to the whole subtree while all statistics are local to the IOs
+directly generated by tasks in that cgroup.
+
+Throttling without "sane_behavior" enabled from cgroup side will
+practically treat all groups at same level as if it looks like the
+following.
 
 				pivot
 			     /  /   \  \
 			root  test1 test2  test3
 
-  Nesting cgroups, while allowed, isn't officially supported and blkio
-  genereates warning when cgroups nest. Once throttling implements
-  hierarchy support, hierarchy will be supported and the warning will
-  be removed.
-
 Various user visible config options
 ===================================
 CONFIG_BLK_CGROUP
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index d074760..290792a 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -911,14 +911,6 @@ struct cgroup_subsys blkio_subsys = {
 	.subsys_id = blkio_subsys_id,
 	.base_cftypes = blkcg_files,
 	.module = THIS_MODULE,
-
-	/*
-	 * blkio subsystem is utterly broken in terms of hierarchy support.
-	 * It treats all cgroups equally regardless of where they're
-	 * located in the hierarchy - all cgroups are treated as if they're
-	 * right below the root.  Fix it and remove the following.
-	 */
-	.broken_hierarchy = true,
 };
 EXPORT_SYMBOL_GPL(blkio_subsys);
 
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 27f006b..08a32df 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -397,10 +397,30 @@ static void throtl_pd_init(struct blkcg_gq *blkg)
 {
 	struct throtl_grp *tg = blkg_to_tg(blkg);
 	struct throtl_data *td = blkg->q->td;
+	struct throtl_service_queue *parent_sq;
 	unsigned long flags;
 	int rw;
 
-	throtl_service_queue_init(&tg->service_queue, &td->service_queue);
+	/*
+	 * If sane_hierarchy is enabled, we switch to properly hierarchical
+	 * behavior where limits on a given throtl_grp are applied to the
+	 * whole subtree rather than just the group itself.  e.g. If 16M
+	 * read_bps limit is set on the root group, the whole system can't
+	 * exceed 16M for the device.
+	 *
+	 * If sane_hierarchy is not enabled, the broken flat hierarchy
+	 * behavior is retained where all throtl_grps are treated as if
+	 * they're all separate root groups right below throtl_data.
+	 * Limits of a group don't interact with limits of other groups
+	 * regardless of the position of the group in the hierarchy.
+	 */
+	parent_sq = &td->service_queue;
+
+	if (cgroup_sane_behavior(blkg->blkcg->css.cgroup) && blkg->parent)
+		parent_sq = &blkg_to_tg(blkg->parent)->service_queue;
+
+	throtl_service_queue_init(&tg->service_queue, parent_sq);
+
 	for (rw = READ; rw <= WRITE; rw++) {
 		throtl_qnode_init(&tg->qnode_on_self[rw], tg);
 		throtl_qnode_init(&tg->qnode_on_parent[rw], tg);
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 5047355..09f1a14 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -272,6 +272,8 @@ enum {
 	 * - memcg: use_hierarchy is on by default and the cgroup file for
 	 *   the flag is not created.
 	 *
+	 * - blkcg: blk-throttle becomes properly hierarchical.
+	 *
 	 * The followings are planned changes.
 	 *
 	 * - release_agent will be disallowed once replacement notification
-- 
cgit v0.10.2


From a5faeaf9109578e65e1a32e2a3e76c8b47e7dcb6 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Fri, 28 Jun 2013 16:04:02 +0200
Subject: writeback: Fix periodic writeback after fs mount

Code in blkdev.c moves a device inode to default_backing_dev_info when
the last reference to the device is put and moves the device inode back
to its bdi when the first reference is acquired. This includes moving to
wb.b_dirty list if the device inode is dirty. The code however doesn't
setup timer to wake corresponding flusher thread and while wb.b_dirty
list is non-empty __mark_inode_dirty() will not set it up either. Thus
periodic writeback is effectively disabled until a sync(2) call which can
lead to unexpected data loss in case of crash or power failure.

Fix the problem by setting up a timer for periodic writeback in case we
add the first dirty inode to wb.b_dirty list in bdev_inode_switch_bdi().

Reported-by: Bert De Jonghe <Bert.DeJonghe@amplidata.com>
CC: stable@vger.kernel.org # >= 3.0
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <axboe@kernel.dk>

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 2091db8..85f5c85 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -58,17 +58,24 @@ static void bdev_inode_switch_bdi(struct inode *inode,
 			struct backing_dev_info *dst)
 {
 	struct backing_dev_info *old = inode->i_data.backing_dev_info;
+	bool wakeup_bdi = false;
 
 	if (unlikely(dst == old))		/* deadlock avoidance */
 		return;
 	bdi_lock_two(&old->wb, &dst->wb);
 	spin_lock(&inode->i_lock);
 	inode->i_data.backing_dev_info = dst;
-	if (inode->i_state & I_DIRTY)
+	if (inode->i_state & I_DIRTY) {
+		if (bdi_cap_writeback_dirty(dst) && !wb_has_dirty_io(&dst->wb))
+			wakeup_bdi = true;
 		list_move(&inode->i_wb_list, &dst->wb.b_dirty);
+	}
 	spin_unlock(&inode->i_lock);
 	spin_unlock(&old->wb.list_lock);
 	spin_unlock(&dst->wb.list_lock);
+
+	if (wakeup_bdi)
+		bdi_wakeup_thread_delayed(dst);
 }
 
 /* Kill _all_ buffers and pagecache , dirty or not.. */
-- 
cgit v0.10.2


From a6b3f7614ca690e49e934c291f707b0c19312194 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Fri, 28 Jun 2013 21:32:27 +0200
Subject: block: Reserve only one queue tag for sync IO if only 3 tags are
 available

In case a device has three tags available we still reserve two of them
for sync IO. That leaves only a single tag for async IO such as
writeback from flusher thread which results in poor performance.

Allow async IO to consume two tags in case queue has three tag availabe
to get a decent async write performance.

This patch improves streaming write performance on a machine with such disk
from ~21 MB/s to ~52 MB/s. Also postmark throughput in presence of
streaming writer improves from 8 to 12 transactions per second so sync
IO doesn't seem to be harmed in presence of heavy async writer.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <axboe@kernel.dk>

diff --git a/block/blk-tag.c b/block/blk-tag.c
index cc345e1..3f33d86 100644
--- a/block/blk-tag.c
+++ b/block/blk-tag.c
@@ -348,9 +348,16 @@ int blk_queue_start_tag(struct request_queue *q, struct request *rq)
 	 */
 	max_depth = bqt->max_depth;
 	if (!rq_is_sync(rq) && max_depth > 1) {
-		max_depth -= 2;
-		if (!max_depth)
+		switch (max_depth) {
+		case 2:
 			max_depth = 1;
+			break;
+		case 3:
+			max_depth = 2;
+			break;
+		default:
+			max_depth -= 2;
+		}
 		if (q->in_flight[BLK_RW_ASYNC] > max_depth)
 			return 1;
 	}
-- 
cgit v0.10.2


From d50235b7bc3ee0a0427984d763ea7534149531b4 Mon Sep 17 00:00:00 2001
From: Jianpeng Ma <majianpeng@gmail.com>
Date: Wed, 3 Jul 2013 13:25:24 +0200
Subject: elevator: Fix a race in elevator switching

There's a race between elevator switching and normal io operation.
    Because the allocation of struct elevator_queue and struct elevator_data
    don't in a atomic operation.So there are have chance to use NULL
    ->elevator_data.
    For example:
        Thread A:                               Thread B
        blk_queu_bio                            elevator_switch
        spin_lock_irq(q->queue_block)           elevator_alloc
        elv_merge                               elevator_init_fn

    Because call elevator_alloc, it can't hold queue_lock and the
    ->elevator_data is NULL.So at the same time, threadA call elv_merge and
    nedd some info of elevator_data.So the crash happened.

    Move the elevator_alloc into func elevator_init_fn, it make the
    operations in a atomic operation.

    Using the follow method can easy reproduce this bug
    1:dd if=/dev/sdb of=/dev/null
    2:while true;do echo noop > scheduler;echo deadline > scheduler;done

    The test method also use this method.

Signed-off-by: Jianpeng Ma <majianpeng@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index d5cd3131..d5bbdcf 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -4347,18 +4347,28 @@ static void cfq_exit_queue(struct elevator_queue *e)
 	kfree(cfqd);
 }
 
-static int cfq_init_queue(struct request_queue *q)
+static int cfq_init_queue(struct request_queue *q, struct elevator_type *e)
 {
 	struct cfq_data *cfqd;
 	struct blkcg_gq *blkg __maybe_unused;
 	int i, ret;
+	struct elevator_queue *eq;
+
+	eq = elevator_alloc(q, e);
+	if (!eq)
+		return -ENOMEM;
 
 	cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node);
-	if (!cfqd)
+	if (!cfqd) {
+		kobject_put(&eq->kobj);
 		return -ENOMEM;
+	}
+	eq->elevator_data = cfqd;
 
 	cfqd->queue = q;
-	q->elevator->elevator_data = cfqd;
+	spin_lock_irq(q->queue_lock);
+	q->elevator = eq;
+	spin_unlock_irq(q->queue_lock);
 
 	/* Init root service tree */
 	cfqd->grp_service_tree = CFQ_RB_ROOT;
@@ -4433,6 +4443,7 @@ static int cfq_init_queue(struct request_queue *q)
 
 out_free:
 	kfree(cfqd);
+	kobject_put(&eq->kobj);
 	return ret;
 }
 
diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c
index ba19a3a..20614a3 100644
--- a/block/deadline-iosched.c
+++ b/block/deadline-iosched.c
@@ -337,13 +337,21 @@ static void deadline_exit_queue(struct elevator_queue *e)
 /*
  * initialize elevator private data (deadline_data).
  */
-static int deadline_init_queue(struct request_queue *q)
+static int deadline_init_queue(struct request_queue *q, struct elevator_type *e)
 {
 	struct deadline_data *dd;
+	struct elevator_queue *eq;
+
+	eq = elevator_alloc(q, e);
+	if (!eq)
+		return -ENOMEM;
 
 	dd = kmalloc_node(sizeof(*dd), GFP_KERNEL | __GFP_ZERO, q->node);
-	if (!dd)
+	if (!dd) {
+		kobject_put(&eq->kobj);
 		return -ENOMEM;
+	}
+	eq->elevator_data = dd;
 
 	INIT_LIST_HEAD(&dd->fifo_list[READ]);
 	INIT_LIST_HEAD(&dd->fifo_list[WRITE]);
@@ -355,7 +363,9 @@ static int deadline_init_queue(struct request_queue *q)
 	dd->front_merges = 1;
 	dd->fifo_batch = fifo_batch;
 
-	q->elevator->elevator_data = dd;
+	spin_lock_irq(q->queue_lock);
+	q->elevator = eq;
+	spin_unlock_irq(q->queue_lock);
 	return 0;
 }
 
diff --git a/block/elevator.c b/block/elevator.c
index eba5b04..668394d 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -150,7 +150,7 @@ void __init load_default_elevator_module(void)
 
 static struct kobj_type elv_ktype;
 
-static struct elevator_queue *elevator_alloc(struct request_queue *q,
+struct elevator_queue *elevator_alloc(struct request_queue *q,
 				  struct elevator_type *e)
 {
 	struct elevator_queue *eq;
@@ -170,6 +170,7 @@ err:
 	elevator_put(e);
 	return NULL;
 }
+EXPORT_SYMBOL(elevator_alloc);
 
 static void elevator_release(struct kobject *kobj)
 {
@@ -221,16 +222,7 @@ int elevator_init(struct request_queue *q, char *name)
 		}
 	}
 
-	q->elevator = elevator_alloc(q, e);
-	if (!q->elevator)
-		return -ENOMEM;
-
-	err = e->ops.elevator_init_fn(q);
-	if (err) {
-		kobject_put(&q->elevator->kobj);
-		return err;
-	}
-
+	err = e->ops.elevator_init_fn(q, e);
 	return 0;
 }
 EXPORT_SYMBOL(elevator_init);
@@ -935,16 +927,9 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
 	spin_unlock_irq(q->queue_lock);
 
 	/* allocate, init and register new elevator */
-	err = -ENOMEM;
-	q->elevator = elevator_alloc(q, new_e);
-	if (!q->elevator)
-		goto fail_init;
-
-	err = new_e->ops.elevator_init_fn(q);
-	if (err) {
-		kobject_put(&q->elevator->kobj);
+	err = new_e->ops.elevator_init_fn(q, new_e);
+	if (err)
 		goto fail_init;
-	}
 
 	if (registered) {
 		err = elv_register_queue(q);
diff --git a/block/noop-iosched.c b/block/noop-iosched.c
index 5d1bf70..3de89d4 100644
--- a/block/noop-iosched.c
+++ b/block/noop-iosched.c
@@ -59,16 +59,27 @@ noop_latter_request(struct request_queue *q, struct request *rq)
 	return list_entry(rq->queuelist.next, struct request, queuelist);
 }
 
-static int noop_init_queue(struct request_queue *q)
+static int noop_init_queue(struct request_queue *q, struct elevator_type *e)
 {
 	struct noop_data *nd;
+	struct elevator_queue *eq;
+
+	eq = elevator_alloc(q, e);
+	if (!eq)
+		return -ENOMEM;
 
 	nd = kmalloc_node(sizeof(*nd), GFP_KERNEL, q->node);
-	if (!nd)
+	if (!nd) {
+		kobject_put(&eq->kobj);
 		return -ENOMEM;
+	}
+	eq->elevator_data = nd;
 
 	INIT_LIST_HEAD(&nd->queue);
-	q->elevator->elevator_data = nd;
+
+	spin_lock_irq(q->queue_lock);
+	q->elevator = eq;
+	spin_unlock_irq(q->queue_lock);
 	return 0;
 }
 
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index acd0312..306dd8c 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -7,6 +7,7 @@
 #ifdef CONFIG_BLOCK
 
 struct io_cq;
+struct elevator_type;
 
 typedef int (elevator_merge_fn) (struct request_queue *, struct request **,
 				 struct bio *);
@@ -35,7 +36,8 @@ typedef void (elevator_put_req_fn) (struct request *);
 typedef void (elevator_activate_req_fn) (struct request_queue *, struct request *);
 typedef void (elevator_deactivate_req_fn) (struct request_queue *, struct request *);
 
-typedef int (elevator_init_fn) (struct request_queue *);
+typedef int (elevator_init_fn) (struct request_queue *,
+				struct elevator_type *e);
 typedef void (elevator_exit_fn) (struct elevator_queue *);
 
 struct elevator_ops
@@ -155,6 +157,8 @@ extern int elevator_init(struct request_queue *, char *);
 extern void elevator_exit(struct elevator_queue *);
 extern int elevator_change(struct request_queue *, const char *);
 extern bool elv_rq_merge_ok(struct request *, struct bio *);
+extern struct elevator_queue *elevator_alloc(struct request_queue *,
+					struct elevator_type *);
 
 /*
  * Helper functions.
-- 
cgit v0.10.2