author | Scott Wood <scottwood@freescale.com> | 2013-12-14 01:15:24 (GMT)
committer | Scott Wood <scottwood@freescale.com> | 2013-12-14 01:15:24 (GMT)
commit | b7c81aa3ab2ac2c140e278b6d0e9a0b95112cf0b (patch)
tree | 87828aaf5f82c7042bfc0307bc4ac499e10f93fb /block
parent | 22c782a4b14773fab7eab3c1db54ad7ad077e9b8 (diff)
parent | 78fd82238d0e5716578c326404184a27ba67fd6e (diff)
download | linux-fsl-qoriq-b7c81aa3ab2ac2c140e278b6d0e9a0b95112cf0b.tar.xz
Merge remote-tracking branch 'linus/master' into merge
Conflicts:
Documentation/hwmon/ina2xx
arch/powerpc/Kconfig
arch/powerpc/boot/dts/b4860emu.dts
arch/powerpc/boot/dts/b4qds.dtsi
arch/powerpc/boot/dts/fsl/b4si-post.dtsi
arch/powerpc/boot/dts/fsl/qoriq-sec6.0-0.dtsi
arch/powerpc/boot/dts/p1023rdb.dts
arch/powerpc/boot/dts/t4240emu.dts
arch/powerpc/boot/dts/t4240qds.dts
arch/powerpc/configs/85xx/p1023_defconfig
arch/powerpc/configs/corenet32_smp_defconfig
arch/powerpc/configs/corenet64_smp_defconfig
arch/powerpc/configs/mpc85xx_smp_defconfig
arch/powerpc/include/asm/cputable.h
arch/powerpc/include/asm/device.h
arch/powerpc/include/asm/epapr_hcalls.h
arch/powerpc/include/asm/kvm_host.h
arch/powerpc/include/asm/mpic.h
arch/powerpc/include/asm/pci.h
arch/powerpc/include/asm/ppc-opcode.h
arch/powerpc/include/asm/ppc_asm.h
arch/powerpc/include/asm/reg_booke.h
arch/powerpc/kernel/epapr_paravirt.c
arch/powerpc/kernel/process.c
arch/powerpc/kernel/prom.c
arch/powerpc/kernel/setup-common.c
arch/powerpc/kernel/setup_32.c
arch/powerpc/kernel/setup_64.c
arch/powerpc/kernel/smp.c
arch/powerpc/kernel/swsusp_asm64.S
arch/powerpc/kernel/swsusp_booke.S
arch/powerpc/kvm/book3s_pr.c
arch/powerpc/kvm/booke.c
arch/powerpc/kvm/booke.h
arch/powerpc/kvm/e500.c
arch/powerpc/kvm/e500.h
arch/powerpc/kvm/e500_emulate.c
arch/powerpc/kvm/e500mc.c
arch/powerpc/kvm/powerpc.c
arch/powerpc/perf/e6500-pmu.c
arch/powerpc/platforms/85xx/Kconfig
arch/powerpc/platforms/85xx/Makefile
arch/powerpc/platforms/85xx/b4_qds.c
arch/powerpc/platforms/85xx/c293pcie.c
arch/powerpc/platforms/85xx/corenet_ds.c
arch/powerpc/platforms/85xx/corenet_ds.h
arch/powerpc/platforms/85xx/p1023_rds.c
arch/powerpc/platforms/85xx/p2041_rdb.c
arch/powerpc/platforms/85xx/p3041_ds.c
arch/powerpc/platforms/85xx/p4080_ds.c
arch/powerpc/platforms/85xx/p5020_ds.c
arch/powerpc/platforms/85xx/p5040_ds.c
arch/powerpc/platforms/85xx/smp.c
arch/powerpc/platforms/85xx/t4240_qds.c
arch/powerpc/platforms/Kconfig
arch/powerpc/sysdev/Makefile
arch/powerpc/sysdev/fsl_mpic_timer_wakeup.c
arch/powerpc/sysdev/fsl_msi.c
arch/powerpc/sysdev/fsl_pci.c
arch/powerpc/sysdev/fsl_pci.h
arch/powerpc/sysdev/fsl_soc.h
arch/powerpc/sysdev/mpic.c
arch/powerpc/sysdev/mpic_timer.c
drivers/Kconfig
drivers/clk/Kconfig
drivers/clk/clk-ppc-corenet.c
drivers/cpufreq/Kconfig.powerpc
drivers/cpufreq/Makefile
drivers/cpufreq/ppc-corenet-cpufreq.c
drivers/crypto/caam/Kconfig
drivers/crypto/caam/Makefile
drivers/crypto/caam/ctrl.c
drivers/crypto/caam/desc_constr.h
drivers/crypto/caam/intern.h
drivers/crypto/caam/jr.c
drivers/crypto/caam/regs.h
drivers/dma/fsldma.c
drivers/hwmon/ina2xx.c
drivers/iommu/Kconfig
drivers/iommu/fsl_pamu.c
drivers/iommu/fsl_pamu.h
drivers/iommu/fsl_pamu_domain.c
drivers/iommu/fsl_pamu_domain.h
drivers/misc/Makefile
drivers/mmc/card/block.c
drivers/mmc/core/core.c
drivers/mmc/host/sdhci-esdhc.h
drivers/mmc/host/sdhci-pltfm.c
drivers/mtd/nand/fsl_ifc_nand.c
drivers/net/ethernet/freescale/gianfar.c
drivers/net/ethernet/freescale/gianfar.h
drivers/net/ethernet/freescale/gianfar_ethtool.c
drivers/net/phy/at803x.c
drivers/net/phy/phy_device.c
drivers/net/phy/vitesse.c
drivers/pci/msi.c
drivers/staging/Kconfig
drivers/staging/Makefile
drivers/uio/Kconfig
drivers/uio/Makefile
drivers/uio/uio.c
drivers/usb/host/ehci-fsl.c
drivers/vfio/Kconfig
drivers/vfio/Makefile
include/crypto/algapi.h
include/linux/iommu.h
include/linux/mmc/sdhci.h
include/linux/msi.h
include/linux/netdev_features.h
include/linux/phy.h
include/linux/skbuff.h
include/net/ip.h
include/uapi/linux/vfio.h
net/core/ethtool.c
net/ipv4/route.c
net/ipv6/route.c
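For reference, a conflict list like the one above is what git reports while such a merge is in progress. A minimal sketch of reproducing it locally, assuming a remote named `linus` pointing at mainline has already been configured (the remote name is taken from the branch reference in the commit message):

```
# Fetch mainline and start the merge; git stops when it hits conflicts
git fetch linus
git merge linus/master

# List the paths currently in an unmerged (conflicted) state
git diff --name-only --diff-filter=U
```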
Diffstat (limited to 'block')
44 files changed, 4677 insertions, 839 deletions
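The diffstat above is limited to the block/ subtree. A quick way to regenerate it from the repository, assuming the statistics shown are taken against the first parent (22c782a4):

```
# Diffstat of the merge against its first parent, restricted to block/
git diff --stat 22c782a4b14773fab7eab3c1db54ad7ad077e9b8 \
    b7c81aa3ab2ac2c140e278b6d0e9a0b95112cf0b -- block/
```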
diff --git a/block/Kconfig b/block/Kconfig index a7e40a7..2429515 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -99,6 +99,17 @@ config BLK_DEV_THROTTLING See Documentation/cgroups/blkio-controller.txt for more information. +config BLK_CMDLINE_PARSER + bool "Block device command line partition parser" + default n + ---help--- + Enabling this option allows you to specify the partition layout from + the kernel boot args. This is typically of use for embedded devices + which don't otherwise have any standardized method for listing the + partitions on a block device. + + See Documentation/block/cmdline-partition.txt for more information. + menu "Partition Types" source "block/partitions/Kconfig" diff --git a/block/Makefile b/block/Makefile index 39b76ba..20645e8 100644 --- a/block/Makefile +++ b/block/Makefile @@ -5,8 +5,9 @@ obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \ blk-flush.o blk-settings.o blk-ioc.o blk-map.o \ blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \ - blk-iopoll.o blk-lib.o ioctl.o genhd.o scsi_ioctl.o \ - partition-generic.o partitions/ + blk-iopoll.o blk-lib.o blk-mq.o blk-mq-tag.o \ + blk-mq-sysfs.o blk-mq-cpu.o blk-mq-cpumap.o ioctl.o \ + genhd.o scsi_ioctl.o partition-generic.o partitions/ obj-$(CONFIG_BLK_DEV_BSG) += bsg.o obj-$(CONFIG_BLK_DEV_BSGLIB) += bsg-lib.o @@ -18,3 +19,4 @@ obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o obj-$(CONFIG_BLK_DEV_INTEGRITY) += blk-integrity.o +obj-$(CONFIG_BLK_CMDLINE_PARSER) += cmdline-parser.o diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index e8918ff..4e491d9 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -32,26 +32,6 @@ EXPORT_SYMBOL_GPL(blkcg_root); static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS]; -static struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, - struct request_queue *q, bool update_hint); - -/** - * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants - * @d_blkg: loop cursor pointing to the current descendant - * @pos_cgrp: used for iteration - * @p_blkg: target blkg to walk descendants of - * - * Walk @c_blkg through the descendants of @p_blkg. Must be used with RCU - * read locked. If called under either blkcg or queue lock, the iteration - * is guaranteed to include all and only online blkgs. The caller may - * update @pos_cgrp by calling cgroup_rightmost_descendant() to skip - * subtree. - */ -#define blkg_for_each_descendant_pre(d_blkg, pos_cgrp, p_blkg) \ - cgroup_for_each_descendant_pre((pos_cgrp), (p_blkg)->blkcg->css.cgroup) \ - if (((d_blkg) = __blkg_lookup(cgroup_to_blkcg(pos_cgrp), \ - (p_blkg)->q, false))) - static bool blkcg_policy_enabled(struct request_queue *q, const struct blkcg_policy *pol) { @@ -71,18 +51,8 @@ static void blkg_free(struct blkcg_gq *blkg) if (!blkg) return; - for (i = 0; i < BLKCG_MAX_POLS; i++) { - struct blkcg_policy *pol = blkcg_policy[i]; - struct blkg_policy_data *pd = blkg->pd[i]; - - if (!pd) - continue; - - if (pol && pol->pd_exit_fn) - pol->pd_exit_fn(blkg); - - kfree(pd); - } + for (i = 0; i < BLKCG_MAX_POLS; i++) + kfree(blkg->pd[i]); blk_exit_rl(&blkg->rl); kfree(blkg); @@ -134,10 +104,6 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q, blkg->pd[i] = pd; pd->blkg = blkg; pd->plid = i; - - /* invoke per-policy init */ - if (pol->pd_init_fn) - pol->pd_init_fn(blkg); } return blkg; @@ -158,8 +124,8 @@ err_free: * @q's bypass state. 
If @update_hint is %true, the caller should be * holding @q->queue_lock and lookup hint is updated on success. */ -static struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, - struct request_queue *q, bool update_hint) +struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q, + bool update_hint) { struct blkcg_gq *blkg; @@ -234,16 +200,25 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, } blkg = new_blkg; - /* link parent and insert */ + /* link parent */ if (blkcg_parent(blkcg)) { blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false); if (WARN_ON_ONCE(!blkg->parent)) { - blkg = ERR_PTR(-EINVAL); + ret = -EINVAL; goto err_put_css; } blkg_get(blkg->parent); } + /* invoke per-policy init */ + for (i = 0; i < BLKCG_MAX_POLS; i++) { + struct blkcg_policy *pol = blkcg_policy[i]; + + if (blkg->pd[i] && pol->pd_init_fn) + pol->pd_init_fn(blkg); + } + + /* insert */ spin_lock(&blkcg->lock); ret = radix_tree_insert(&blkcg->blkg_tree, q->id, blkg); if (likely(!ret)) { @@ -260,8 +235,13 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, blkg->online = true; spin_unlock(&blkcg->lock); - if (!ret) + if (!ret) { + if (blkcg == &blkcg_root) { + q->root_blkg = blkg; + q->root_rl.blkg = blkg; + } return blkg; + } /* @blkg failed fully initialized, use the usual release path */ blkg_put(blkg); @@ -360,6 +340,15 @@ static void blkg_destroy(struct blkcg_gq *blkg) rcu_assign_pointer(blkcg->blkg_hint, NULL); /* + * If root blkg is destroyed. Just clear the pointer since root_rl + * does not take reference on root blkg. + */ + if (blkcg == &blkcg_root) { + blkg->q->root_blkg = NULL; + blkg->q->root_rl.blkg = NULL; + } + + /* * Put the reference taken at the time of creation so that when all * queues are gone, group can be destroyed. */ @@ -385,39 +374,40 @@ static void blkg_destroy_all(struct request_queue *q) blkg_destroy(blkg); spin_unlock(&blkcg->lock); } - - /* - * root blkg is destroyed. Just clear the pointer since - * root_rl does not take reference on root blkg. - */ - q->root_blkg = NULL; - q->root_rl.blkg = NULL; } -static void blkg_rcu_free(struct rcu_head *rcu_head) +/* + * A group is RCU protected, but having an rcu lock does not mean that one + * can access all the fields of blkg and assume these are valid. For + * example, don't try to follow throtl_data and request queue links. + * + * Having a reference to blkg under an rcu allows accesses to only values + * local to groups like group stats and group rate limits. + */ +void __blkg_release_rcu(struct rcu_head *rcu_head) { - blkg_free(container_of(rcu_head, struct blkcg_gq, rcu_head)); -} + struct blkcg_gq *blkg = container_of(rcu_head, struct blkcg_gq, rcu_head); + int i; + + /* tell policies that this one is being freed */ + for (i = 0; i < BLKCG_MAX_POLS; i++) { + struct blkcg_policy *pol = blkcg_policy[i]; + + if (blkg->pd[i] && pol->pd_exit_fn) + pol->pd_exit_fn(blkg); + } -void __blkg_release(struct blkcg_gq *blkg) -{ /* release the blkcg and parent blkg refs this blkg has been holding */ css_put(&blkg->blkcg->css); - if (blkg->parent) + if (blkg->parent) { + spin_lock_irq(blkg->q->queue_lock); blkg_put(blkg->parent); + spin_unlock_irq(blkg->q->queue_lock); + } - /* - * A group is freed in rcu manner. But having an rcu lock does not - * mean that one can access all the fields of blkg and assume these - * are valid. For example, don't try to follow throtl_data and - * request queue links. 
- * - * Having a reference to blkg under an rcu allows acess to only - * values local to groups like group stats and group rate limits - */ - call_rcu(&blkg->rcu_head, blkg_rcu_free); + blkg_free(blkg); } -EXPORT_SYMBOL_GPL(__blkg_release); +EXPORT_SYMBOL_GPL(__blkg_release_rcu); /* * The next function used by blk_queue_for_each_rl(). It's a bit tricky @@ -454,10 +444,10 @@ struct request_list *__blk_queue_next_rl(struct request_list *rl, return &blkg->rl; } -static int blkcg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, - u64 val) +static int blkcg_reset_stats(struct cgroup_subsys_state *css, + struct cftype *cftype, u64 val) { - struct blkcg *blkcg = cgroup_to_blkcg(cgroup); + struct blkcg *blkcg = css_to_blkcg(css); struct blkcg_gq *blkg; int i; @@ -631,15 +621,13 @@ u64 blkg_stat_recursive_sum(struct blkg_policy_data *pd, int off) { struct blkcg_policy *pol = blkcg_policy[pd->plid]; struct blkcg_gq *pos_blkg; - struct cgroup *pos_cgrp; - u64 sum; + struct cgroup_subsys_state *pos_css; + u64 sum = 0; lockdep_assert_held(pd->blkg->q->queue_lock); - sum = blkg_stat_read((void *)pd + off); - rcu_read_lock(); - blkg_for_each_descendant_pre(pos_blkg, pos_cgrp, pd_to_blkg(pd)) { + blkg_for_each_descendant_pre(pos_blkg, pos_css, pd_to_blkg(pd)) { struct blkg_policy_data *pos_pd = blkg_to_pd(pos_blkg, pol); struct blkg_stat *stat = (void *)pos_pd + off; @@ -666,16 +654,14 @@ struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd, { struct blkcg_policy *pol = blkcg_policy[pd->plid]; struct blkcg_gq *pos_blkg; - struct cgroup *pos_cgrp; - struct blkg_rwstat sum; + struct cgroup_subsys_state *pos_css; + struct blkg_rwstat sum = { }; int i; lockdep_assert_held(pd->blkg->q->queue_lock); - sum = blkg_rwstat_read((void *)pd + off); - rcu_read_lock(); - blkg_for_each_descendant_pre(pos_blkg, pos_cgrp, pd_to_blkg(pd)) { + blkg_for_each_descendant_pre(pos_blkg, pos_css, pd_to_blkg(pd)) { struct blkg_policy_data *pos_pd = blkg_to_pd(pos_blkg, pol); struct blkg_rwstat *rwstat = (void *)pos_pd + off; struct blkg_rwstat tmp; @@ -782,18 +768,18 @@ struct cftype blkcg_files[] = { /** * blkcg_css_offline - cgroup css_offline callback - * @cgroup: cgroup of interest + * @css: css of interest * - * This function is called when @cgroup is about to go away and responsible - * for shooting down all blkgs associated with @cgroup. blkgs should be + * This function is called when @css is about to go away and responsible + * for shooting down all blkgs associated with @css. blkgs should be * removed while holding both q and blkcg locks. As blkcg lock is nested * inside q lock, this function performs reverse double lock dancing. * * This is the blkcg counterpart of ioc_release_fn(). 
*/ -static void blkcg_css_offline(struct cgroup *cgroup) +static void blkcg_css_offline(struct cgroup_subsys_state *css) { - struct blkcg *blkcg = cgroup_to_blkcg(cgroup); + struct blkcg *blkcg = css_to_blkcg(css); spin_lock_irq(&blkcg->lock); @@ -815,21 +801,21 @@ static void blkcg_css_offline(struct cgroup *cgroup) spin_unlock_irq(&blkcg->lock); } -static void blkcg_css_free(struct cgroup *cgroup) +static void blkcg_css_free(struct cgroup_subsys_state *css) { - struct blkcg *blkcg = cgroup_to_blkcg(cgroup); + struct blkcg *blkcg = css_to_blkcg(css); if (blkcg != &blkcg_root) kfree(blkcg); } -static struct cgroup_subsys_state *blkcg_css_alloc(struct cgroup *cgroup) +static struct cgroup_subsys_state * +blkcg_css_alloc(struct cgroup_subsys_state *parent_css) { static atomic64_t id_seq = ATOMIC64_INIT(0); struct blkcg *blkcg; - struct cgroup *parent = cgroup->parent; - if (!parent) { + if (!parent_css) { blkcg = &blkcg_root; goto done; } @@ -900,14 +886,15 @@ void blkcg_exit_queue(struct request_queue *q) * of the main cic data structures. For now we allow a task to change * its cgroup only if it's the only owner of its ioc. */ -static int blkcg_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) +static int blkcg_can_attach(struct cgroup_subsys_state *css, + struct cgroup_taskset *tset) { struct task_struct *task; struct io_context *ioc; int ret = 0; /* task_lock() is needed to avoid races with exit_io_context() */ - cgroup_taskset_for_each(task, cgrp, tset) { + cgroup_taskset_for_each(task, css, tset) { task_lock(task); ioc = task->io_context; if (ioc && atomic_read(&ioc->nr_tasks) > 1) @@ -928,14 +915,6 @@ struct cgroup_subsys blkio_subsys = { .subsys_id = blkio_subsys_id, .base_cftypes = blkcg_files, .module = THIS_MODULE, - - /* - * blkio subsystem is utterly broken in terms of hierarchy support. - * It treats all cgroups equally regardless of where they're - * located in the hierarchy - all cgroups are treated as if they're - * right below the root. Fix it and remove the following. - */ - .broken_hierarchy = true, }; EXPORT_SYMBOL_GPL(blkio_subsys); @@ -998,8 +977,6 @@ int blkcg_activate_policy(struct request_queue *q, ret = PTR_ERR(blkg); goto out_unlock; } - q->root_blkg = blkg; - q->root_rl.blkg = blkg; list_for_each_entry(blkg, &q->blkg_list, q_node) cnt++; @@ -1152,7 +1129,7 @@ void blkcg_policy_unregister(struct blkcg_policy *pol) /* kill the intf files first */ if (pol->cftypes) - cgroup_rm_cftypes(&blkio_subsys, pol->cftypes); + cgroup_rm_cftypes(pol->cftypes); /* unregister and update blkgs */ blkcg_policy[pol->plid] = NULL; diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 4e595ee..86154ea 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -179,22 +179,20 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, void blkg_conf_finish(struct blkg_conf_ctx *ctx); -static inline struct blkcg *cgroup_to_blkcg(struct cgroup *cgroup) +static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css) { - return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id), - struct blkcg, css); + return css ? 
container_of(css, struct blkcg, css) : NULL; } static inline struct blkcg *task_blkcg(struct task_struct *tsk) { - return container_of(task_subsys_state(tsk, blkio_subsys_id), - struct blkcg, css); + return css_to_blkcg(task_css(tsk, blkio_subsys_id)); } static inline struct blkcg *bio_blkcg(struct bio *bio) { if (bio && bio->bi_css) - return container_of(bio->bi_css, struct blkcg, css); + return css_to_blkcg(bio->bi_css); return task_blkcg(current); } @@ -206,9 +204,7 @@ static inline struct blkcg *bio_blkcg(struct bio *bio) */ static inline struct blkcg *blkcg_parent(struct blkcg *blkcg) { - struct cgroup *pcg = blkcg->css.cgroup->parent; - - return pcg ? cgroup_to_blkcg(pcg) : NULL; + return css_to_blkcg(css_parent(&blkcg->css)); } /** @@ -266,7 +262,7 @@ static inline void blkg_get(struct blkcg_gq *blkg) blkg->refcnt++; } -void __blkg_release(struct blkcg_gq *blkg); +void __blkg_release_rcu(struct rcu_head *rcu); /** * blkg_put - put a blkg reference @@ -279,9 +275,44 @@ static inline void blkg_put(struct blkcg_gq *blkg) lockdep_assert_held(blkg->q->queue_lock); WARN_ON_ONCE(blkg->refcnt <= 0); if (!--blkg->refcnt) - __blkg_release(blkg); + call_rcu(&blkg->rcu_head, __blkg_release_rcu); } +struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q, + bool update_hint); + +/** + * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants + * @d_blkg: loop cursor pointing to the current descendant + * @pos_css: used for iteration + * @p_blkg: target blkg to walk descendants of + * + * Walk @c_blkg through the descendants of @p_blkg. Must be used with RCU + * read locked. If called under either blkcg or queue lock, the iteration + * is guaranteed to include all and only online blkgs. The caller may + * update @pos_css by calling css_rightmost_descendant() to skip subtree. + * @p_blkg is included in the iteration and the first node to be visited. + */ +#define blkg_for_each_descendant_pre(d_blkg, pos_css, p_blkg) \ + css_for_each_descendant_pre((pos_css), &(p_blkg)->blkcg->css) \ + if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css), \ + (p_blkg)->q, false))) + +/** + * blkg_for_each_descendant_post - post-order walk of a blkg's descendants + * @d_blkg: loop cursor pointing to the current descendant + * @pos_css: used for iteration + * @p_blkg: target blkg to walk descendants of + * + * Similar to blkg_for_each_descendant_pre() but performs post-order + * traversal instead. Synchronization rules are the same. @p_blkg is + * included in the iteration and the last node to be visited. 
+ */ +#define blkg_for_each_descendant_post(d_blkg, pos_css, p_blkg) \ + css_for_each_descendant_post((pos_css), &(p_blkg)->blkcg->css) \ + if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css), \ + (p_blkg)->q, false))) + /** * blk_get_rl - get request_list to use * @q: request_queue of interest @@ -371,6 +402,11 @@ struct request_list *__blk_queue_next_rl(struct request_list *rl, #define blk_queue_for_each_rl(rl, q) \ for ((rl) = &(q)->root_rl; (rl); (rl) = __blk_queue_next_rl((rl), (q))) +static inline void blkg_stat_init(struct blkg_stat *stat) +{ + u64_stats_init(&stat->syncp); +} + /** * blkg_stat_add - add a value to a blkg_stat * @stat: target blkg_stat @@ -399,9 +435,9 @@ static inline uint64_t blkg_stat_read(struct blkg_stat *stat) uint64_t v; do { - start = u64_stats_fetch_begin(&stat->syncp); + start = u64_stats_fetch_begin_bh(&stat->syncp); v = stat->cnt; - } while (u64_stats_fetch_retry(&stat->syncp, start)); + } while (u64_stats_fetch_retry_bh(&stat->syncp, start)); return v; } @@ -427,6 +463,11 @@ static inline void blkg_stat_merge(struct blkg_stat *to, struct blkg_stat *from) blkg_stat_add(to, blkg_stat_read(from)); } +static inline void blkg_rwstat_init(struct blkg_rwstat *rwstat) +{ + u64_stats_init(&rwstat->syncp); +} + /** * blkg_rwstat_add - add a value to a blkg_rwstat * @rwstat: target blkg_rwstat @@ -467,9 +508,9 @@ static inline struct blkg_rwstat blkg_rwstat_read(struct blkg_rwstat *rwstat) struct blkg_rwstat tmp; do { - start = u64_stats_fetch_begin(&rwstat->syncp); + start = u64_stats_fetch_begin_bh(&rwstat->syncp); tmp = *rwstat; - } while (u64_stats_fetch_retry(&rwstat->syncp, start)); + } while (u64_stats_fetch_retry_bh(&rwstat->syncp, start)); return tmp; } @@ -542,7 +583,6 @@ static inline int blkcg_activate_policy(struct request_queue *q, static inline void blkcg_deactivate_policy(struct request_queue *q, const struct blkcg_policy *pol) { } -static inline struct blkcg *cgroup_to_blkcg(struct cgroup *cgroup) { return NULL; } static inline struct blkcg *bio_blkcg(struct bio *bio) { return NULL; } static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, diff --git a/block/blk-core.c b/block/blk-core.c index d5745b5..8bdd012 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -16,6 +16,7 @@ #include <linux/backing-dev.h> #include <linux/bio.h> #include <linux/blkdev.h> +#include <linux/blk-mq.h> #include <linux/highmem.h> #include <linux/mm.h> #include <linux/kernel_stat.h> @@ -48,7 +49,7 @@ DEFINE_IDA(blk_queue_ida); /* * For the allocated request tables */ -static struct kmem_cache *request_cachep; +struct kmem_cache *request_cachep = NULL; /* * For queue allocation @@ -60,42 +61,6 @@ struct kmem_cache *blk_requestq_cachep; */ static struct workqueue_struct *kblockd_workqueue; -static void drive_stat_acct(struct request *rq, int new_io) -{ - struct hd_struct *part; - int rw = rq_data_dir(rq); - int cpu; - - if (!blk_do_io_stat(rq)) - return; - - cpu = part_stat_lock(); - - if (!new_io) { - part = rq->part; - part_stat_inc(cpu, part, merges[rw]); - } else { - part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq)); - if (!hd_struct_try_get(part)) { - /* - * The partition is already being removed, - * the request will be accounted on the disk only - * - * We take a reference on disk->part0 although that - * partition will never be deleted, so we can treat - * it as any other partition. 
- */ - part = &rq->rq_disk->part0; - hd_struct_get(part); - } - part_round_stats(cpu, part); - part_inc_in_flight(part, rw); - rq->part = part; - } - - part_stat_unlock(); -} - void blk_queue_congestion_threshold(struct request_queue *q) { int nr; @@ -145,7 +110,6 @@ void blk_rq_init(struct request_queue *q, struct request *rq) rq->cmd = rq->__cmd; rq->cmd_len = BLK_MAX_CDB; rq->tag = -1; - rq->ref_count = 1; rq->start_time = jiffies; set_start_time_ns(rq); rq->part = NULL; @@ -174,9 +138,9 @@ void blk_dump_rq_flags(struct request *rq, char *msg) { int bit; - printk(KERN_INFO "%s: dev %s: type=%x, flags=%x\n", msg, + printk(KERN_INFO "%s: dev %s: type=%x, flags=%llx\n", msg, rq->rq_disk ? rq->rq_disk->disk_name : "?", rq->cmd_type, - rq->cmd_flags); + (unsigned long long) rq->cmd_flags); printk(KERN_INFO " sector %llu, nr/cnr %u/%u\n", (unsigned long long)blk_rq_pos(rq), @@ -595,9 +559,12 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) if (!q) return NULL; + if (percpu_counter_init(&q->mq_usage_counter, 0)) + goto fail_q; + q->id = ida_simple_get(&blk_queue_ida, 0, 0, gfp_mask); if (q->id < 0) - goto fail_q; + goto fail_c; q->backing_dev_info.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; @@ -644,13 +611,19 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) q->bypass_depth = 1; __set_bit(QUEUE_FLAG_BYPASS, &q->queue_flags); + init_waitqueue_head(&q->mq_freeze_wq); + if (blkcg_init_queue(q)) - goto fail_id; + goto fail_bdi; return q; +fail_bdi: + bdi_destroy(&q->backing_dev_info); fail_id: ida_simple_remove(&blk_queue_ida, q->id); +fail_c: + percpu_counter_destroy(&q->mq_usage_counter); fail_q: kmem_cache_free(blk_requestq_cachep, q); return NULL; @@ -739,9 +712,17 @@ blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn, q->sg_reserved_size = INT_MAX; + /* Protect q->elevator from elevator_change */ + mutex_lock(&q->sysfs_lock); + /* init elevator */ - if (elevator_init(q, NULL)) + if (elevator_init(q, NULL)) { + mutex_unlock(&q->sysfs_lock); return NULL; + } + + mutex_unlock(&q->sysfs_lock); + return q; } EXPORT_SYMBOL(blk_init_allocated_queue); @@ -1109,7 +1090,8 @@ retry: goto retry; } -struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask) +static struct request *blk_old_get_request(struct request_queue *q, int rw, + gfp_t gfp_mask) { struct request *rq; @@ -1126,6 +1108,14 @@ struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask) return rq; } + +struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask) +{ + if (q->mq_ops) + return blk_mq_alloc_request(q, rw, gfp_mask, false); + else + return blk_old_get_request(q, rw, gfp_mask); +} EXPORT_SYMBOL(blk_get_request); /** @@ -1211,7 +1201,7 @@ EXPORT_SYMBOL(blk_requeue_request); static void add_acct_request(struct request_queue *q, struct request *rq, int where) { - drive_stat_acct(rq, 1); + blk_account_io_start(rq, true); __elv_add_request(q, rq, where); } @@ -1272,8 +1262,6 @@ void __blk_put_request(struct request_queue *q, struct request *req) { if (unlikely(!q)) return; - if (unlikely(--req->ref_count)) - return; blk_pm_put_request(req); @@ -1302,12 +1290,17 @@ EXPORT_SYMBOL_GPL(__blk_put_request); void blk_put_request(struct request *req) { - unsigned long flags; struct request_queue *q = req->q; - spin_lock_irqsave(q->queue_lock, flags); - __blk_put_request(q, req); - spin_unlock_irqrestore(q->queue_lock, flags); + if (q->mq_ops) + blk_mq_free_request(req); + else { + unsigned long 
flags; + + spin_lock_irqsave(q->queue_lock, flags); + __blk_put_request(q, req); + spin_unlock_irqrestore(q->queue_lock, flags); + } } EXPORT_SYMBOL(blk_put_request); @@ -1343,8 +1336,8 @@ void blk_add_request_payload(struct request *rq, struct page *page, } EXPORT_SYMBOL_GPL(blk_add_request_payload); -static bool bio_attempt_back_merge(struct request_queue *q, struct request *req, - struct bio *bio) +bool bio_attempt_back_merge(struct request_queue *q, struct request *req, + struct bio *bio) { const int ff = bio->bi_rw & REQ_FAILFAST_MASK; @@ -1361,12 +1354,12 @@ static bool bio_attempt_back_merge(struct request_queue *q, struct request *req, req->__data_len += bio->bi_size; req->ioprio = ioprio_best(req->ioprio, bio_prio(bio)); - drive_stat_acct(req, 0); + blk_account_io_start(req, false); return true; } -static bool bio_attempt_front_merge(struct request_queue *q, - struct request *req, struct bio *bio) +bool bio_attempt_front_merge(struct request_queue *q, struct request *req, + struct bio *bio) { const int ff = bio->bi_rw & REQ_FAILFAST_MASK; @@ -1391,12 +1384,12 @@ static bool bio_attempt_front_merge(struct request_queue *q, req->__data_len += bio->bi_size; req->ioprio = ioprio_best(req->ioprio, bio_prio(bio)); - drive_stat_acct(req, 0); + blk_account_io_start(req, false); return true; } /** - * attempt_plug_merge - try to merge with %current's plugged list + * blk_attempt_plug_merge - try to merge with %current's plugged list * @q: request_queue new bio is being queued at * @bio: new bio being queued * @request_count: out parameter for number of traversed plugged requests @@ -1412,19 +1405,28 @@ static bool bio_attempt_front_merge(struct request_queue *q, * reliable access to the elevator outside queue lock. Only check basic * merging parameters without querying the elevator. */ -static bool attempt_plug_merge(struct request_queue *q, struct bio *bio, - unsigned int *request_count) +bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, + unsigned int *request_count) { struct blk_plug *plug; struct request *rq; bool ret = false; + struct list_head *plug_list; + + if (blk_queue_nomerges(q)) + goto out; plug = current->plug; if (!plug) goto out; *request_count = 0; - list_for_each_entry_reverse(rq, &plug->list, queuelist) { + if (q->mq_ops) + plug_list = &plug->mq_list; + else + plug_list = &plug->list; + + list_for_each_entry_reverse(rq, plug_list, queuelist) { int el_ret; if (rq->q == q) @@ -1492,7 +1494,7 @@ void blk_queue_bio(struct request_queue *q, struct bio *bio) * Check if we can merge with the plugged list before grabbing * any locks. */ - if (attempt_plug_merge(q, bio, &request_count)) + if (blk_attempt_plug_merge(q, bio, &request_count)) return; spin_lock_irq(q->queue_lock); @@ -1549,11 +1551,9 @@ get_rq: if (plug) { /* * If this is the first request added after a plug, fire - * of a plug trace. If others have been added before, check - * if we have multiple devices in this plug. If so, make a - * note to sort the list before dispatch. + * of a plug trace. 
*/ - if (list_empty(&plug->list)) + if (!request_count) trace_block_plug(q); else { if (request_count >= BLK_MAX_REQUEST_COUNT) { @@ -1562,7 +1562,7 @@ get_rq: } } list_add_tail(&req->queuelist, &plug->list); - drive_stat_acct(req, 1); + blk_account_io_start(req, true); } else { spin_lock_irq(q->queue_lock); add_acct_request(q, req, where); @@ -2016,7 +2016,7 @@ unsigned int blk_rq_err_bytes(const struct request *rq) } EXPORT_SYMBOL_GPL(blk_rq_err_bytes); -static void blk_account_io_completion(struct request *req, unsigned int bytes) +void blk_account_io_completion(struct request *req, unsigned int bytes) { if (blk_do_io_stat(req)) { const int rw = rq_data_dir(req); @@ -2030,7 +2030,7 @@ static void blk_account_io_completion(struct request *req, unsigned int bytes) } } -static void blk_account_io_done(struct request *req) +void blk_account_io_done(struct request *req) { /* * Account IO completion. flush_rq isn't accounted as a @@ -2078,6 +2078,42 @@ static inline struct request *blk_pm_peek_request(struct request_queue *q, } #endif +void blk_account_io_start(struct request *rq, bool new_io) +{ + struct hd_struct *part; + int rw = rq_data_dir(rq); + int cpu; + + if (!blk_do_io_stat(rq)) + return; + + cpu = part_stat_lock(); + + if (!new_io) { + part = rq->part; + part_stat_inc(cpu, part, merges[rw]); + } else { + part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq)); + if (!hd_struct_try_get(part)) { + /* + * The partition is already being removed, + * the request will be accounted on the disk only + * + * We take a reference on disk->part0 although that + * partition will never be deleted, so we can treat + * it as any other partition. + */ + part = &rq->rq_disk->part0; + hd_struct_get(part); + } + part_round_stats(cpu, part); + part_inc_in_flight(part, rw); + rq->part = part; + } + + part_stat_unlock(); +} + /** * blk_peek_request - peek at the top of a request queue * @q: request queue to peek at @@ -2229,6 +2265,7 @@ void blk_start_request(struct request *req) if (unlikely(blk_bidi_rq(req))) req->next_rq->resid_len = blk_rq_bytes(req->next_rq); + BUG_ON(test_bit(REQ_ATOM_COMPLETE, &req->atomic_flags)); blk_add_timer(req); } EXPORT_SYMBOL(blk_start_request); @@ -2315,6 +2352,15 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes) case -EBADE: error_type = "critical nexus"; break; + case -ETIMEDOUT: + error_type = "timeout"; + break; + case -ENOSPC: + error_type = "critical space allocation"; + break; + case -ENODATA: + error_type = "critical medium"; + break; case -EIO: default: error_type = "I/O"; @@ -2444,7 +2490,6 @@ static void blk_finish_request(struct request *req, int error) if (req->cmd_flags & REQ_DONTPREP) blk_unprep_request(req); - blk_account_io_done(req); if (req->end_io) @@ -2866,6 +2911,7 @@ void blk_start_plug(struct blk_plug *plug) plug->magic = PLUG_MAGIC; INIT_LIST_HEAD(&plug->list); + INIT_LIST_HEAD(&plug->mq_list); INIT_LIST_HEAD(&plug->cb_list); /* @@ -2963,6 +3009,10 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule) BUG_ON(plug->magic != PLUG_MAGIC); flush_plug_callbacks(plug, from_schedule); + + if (!list_empty(&plug->mq_list)) + blk_mq_flush_plug_list(plug, from_schedule); + if (list_empty(&plug->list)) return; @@ -3180,7 +3230,8 @@ int __init blk_dev_init(void) /* used for unplugging and affects IO latency/throughput - HIGHPRI */ kblockd_workqueue = alloc_workqueue("kblockd", - WQ_MEM_RECLAIM | WQ_HIGHPRI, 0); + WQ_MEM_RECLAIM | WQ_HIGHPRI | + WQ_POWER_EFFICIENT, 0); if (!kblockd_workqueue) panic("Failed to 
create kblockd\n"); diff --git a/block/blk-exec.c b/block/blk-exec.c index e706213..c3edf9d 100644 --- a/block/blk-exec.c +++ b/block/blk-exec.c @@ -5,6 +5,7 @@ #include <linux/module.h> #include <linux/bio.h> #include <linux/blkdev.h> +#include <linux/blk-mq.h> #include <linux/sched/sysctl.h> #include "blk.h" @@ -24,7 +25,6 @@ static void blk_end_sync_rq(struct request *rq, int error) struct completion *waiting = rq->end_io_data; rq->end_io_data = NULL; - __blk_put_request(rq->q, rq); /* * complete last, if this is a stack request the process (and thus @@ -59,6 +59,12 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, rq->rq_disk = bd_disk; rq->end_io = done; + + if (q->mq_ops) { + blk_mq_insert_request(q, rq, true); + return; + } + /* * need to check this before __blk_run_queue(), because rq can * be freed before that returns. @@ -68,9 +74,9 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, spin_lock_irq(q->queue_lock); if (unlikely(blk_queue_dying(q))) { + rq->cmd_flags |= REQ_QUIET; rq->errors = -ENXIO; - if (rq->end_io) - rq->end_io(rq, rq->errors); + __blk_end_request_all(rq, rq->errors); spin_unlock_irq(q->queue_lock); return; } @@ -103,12 +109,6 @@ int blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk, int err = 0; unsigned long hang_check; - /* - * we need an extra reference to the request, so we can look at - * it after io completion - */ - rq->ref_count++; - if (!rq->sense) { memset(sense, 0, sizeof(sense)); rq->sense = sense; diff --git a/block/blk-flush.c b/block/blk-flush.c index cc2b827..fb6f3c0 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -69,8 +69,10 @@ #include <linux/bio.h> #include <linux/blkdev.h> #include <linux/gfp.h> +#include <linux/blk-mq.h> #include "blk.h" +#include "blk-mq.h" /* FLUSH/FUA sequences */ enum { @@ -124,6 +126,24 @@ static void blk_flush_restore_request(struct request *rq) /* make @rq a normal request */ rq->cmd_flags &= ~REQ_FLUSH_SEQ; rq->end_io = rq->flush.saved_end_io; + + blk_clear_rq_complete(rq); +} + +static void mq_flush_data_run(struct work_struct *work) +{ + struct request *rq; + + rq = container_of(work, struct request, mq_flush_data); + + memset(&rq->csd, 0, sizeof(rq->csd)); + blk_mq_run_request(rq, true, false); +} + +static void blk_mq_flush_data_insert(struct request *rq) +{ + INIT_WORK(&rq->mq_flush_data, mq_flush_data_run); + kblockd_schedule_work(rq->q, &rq->mq_flush_data); } /** @@ -136,7 +156,7 @@ static void blk_flush_restore_request(struct request *rq) * completion and trigger the next step. * * CONTEXT: - * spin_lock_irq(q->queue_lock) + * spin_lock_irq(q->queue_lock or q->mq_flush_lock) * * RETURNS: * %true if requests were added to the dispatch queue, %false otherwise. 
@@ -146,7 +166,7 @@ static bool blk_flush_complete_seq(struct request *rq, unsigned int seq, { struct request_queue *q = rq->q; struct list_head *pending = &q->flush_queue[q->flush_pending_idx]; - bool queued = false; + bool queued = false, kicked; BUG_ON(rq->flush.seq & seq); rq->flush.seq |= seq; @@ -167,8 +187,12 @@ static bool blk_flush_complete_seq(struct request *rq, unsigned int seq, case REQ_FSEQ_DATA: list_move_tail(&rq->flush.list, &q->flush_data_in_flight); - list_add(&rq->queuelist, &q->queue_head); - queued = true; + if (q->mq_ops) + blk_mq_flush_data_insert(rq); + else { + list_add(&rq->queuelist, &q->queue_head); + queued = true; + } break; case REQ_FSEQ_DONE: @@ -181,28 +205,43 @@ static bool blk_flush_complete_seq(struct request *rq, unsigned int seq, BUG_ON(!list_empty(&rq->queuelist)); list_del_init(&rq->flush.list); blk_flush_restore_request(rq); - __blk_end_request_all(rq, error); + if (q->mq_ops) + blk_mq_end_io(rq, error); + else + __blk_end_request_all(rq, error); break; default: BUG(); } - return blk_kick_flush(q) | queued; + kicked = blk_kick_flush(q); + /* blk_mq_run_flush will run queue */ + if (q->mq_ops) + return queued; + return kicked | queued; } static void flush_end_io(struct request *flush_rq, int error) { struct request_queue *q = flush_rq->q; - struct list_head *running = &q->flush_queue[q->flush_running_idx]; + struct list_head *running; bool queued = false; struct request *rq, *n; + unsigned long flags = 0; + if (q->mq_ops) { + blk_mq_free_request(flush_rq); + spin_lock_irqsave(&q->mq_flush_lock, flags); + } + running = &q->flush_queue[q->flush_running_idx]; BUG_ON(q->flush_pending_idx == q->flush_running_idx); /* account completion of the flush request */ q->flush_running_idx ^= 1; - elv_completed_request(q, flush_rq); + + if (!q->mq_ops) + elv_completed_request(q, flush_rq); /* and push the waiting requests to the next stage */ list_for_each_entry_safe(rq, n, running, flush.list) { @@ -223,9 +262,48 @@ static void flush_end_io(struct request *flush_rq, int error) * directly into request_fn may confuse the driver. Always use * kblockd. */ - if (queued || q->flush_queue_delayed) - blk_run_queue_async(q); + if (queued || q->flush_queue_delayed) { + if (!q->mq_ops) + blk_run_queue_async(q); + else + /* + * This can be optimized to only run queues with requests + * queued if necessary. + */ + blk_mq_run_queues(q, true); + } q->flush_queue_delayed = 0; + if (q->mq_ops) + spin_unlock_irqrestore(&q->mq_flush_lock, flags); +} + +static void mq_flush_work(struct work_struct *work) +{ + struct request_queue *q; + struct request *rq; + + q = container_of(work, struct request_queue, mq_flush_work); + + /* We don't need set REQ_FLUSH_SEQ, it's for consistency */ + rq = blk_mq_alloc_request(q, WRITE_FLUSH|REQ_FLUSH_SEQ, + __GFP_WAIT|GFP_ATOMIC, true); + rq->cmd_type = REQ_TYPE_FS; + rq->end_io = flush_end_io; + + blk_mq_run_request(rq, true, false); +} + +/* + * We can't directly use q->flush_rq, because it doesn't have tag and is not in + * hctx->rqs[]. so we must allocate a new request, since we can't sleep here, + * so offload the work to workqueue. + * + * Note: we assume a flush request finished in any hardware queue will flush + * the whole disk cache. + */ +static void mq_run_flush(struct request_queue *q) +{ + kblockd_schedule_work(q, &q->mq_flush_work); } /** @@ -236,7 +314,7 @@ static void flush_end_io(struct request *flush_rq, int error) * Please read the comment at the top of this file for more info. 
* * CONTEXT: - * spin_lock_irq(q->queue_lock) + * spin_lock_irq(q->queue_lock or q->mq_flush_lock) * * RETURNS: * %true if flush was issued, %false otherwise. @@ -261,13 +339,18 @@ static bool blk_kick_flush(struct request_queue *q) * Issue flush and toggle pending_idx. This makes pending_idx * different from running_idx, which means flush is in flight. */ + q->flush_pending_idx ^= 1; + if (q->mq_ops) { + mq_run_flush(q); + return true; + } + blk_rq_init(q, &q->flush_rq); q->flush_rq.cmd_type = REQ_TYPE_FS; q->flush_rq.cmd_flags = WRITE_FLUSH | REQ_FLUSH_SEQ; q->flush_rq.rq_disk = first_rq->rq_disk; q->flush_rq.end_io = flush_end_io; - q->flush_pending_idx ^= 1; list_add_tail(&q->flush_rq.queuelist, &q->queue_head); return true; } @@ -284,16 +367,37 @@ static void flush_data_end_io(struct request *rq, int error) blk_run_queue_async(q); } +static void mq_flush_data_end_io(struct request *rq, int error) +{ + struct request_queue *q = rq->q; + struct blk_mq_hw_ctx *hctx; + struct blk_mq_ctx *ctx; + unsigned long flags; + + ctx = rq->mq_ctx; + hctx = q->mq_ops->map_queue(q, ctx->cpu); + + /* + * After populating an empty queue, kick it to avoid stall. Read + * the comment in flush_end_io(). + */ + spin_lock_irqsave(&q->mq_flush_lock, flags); + if (blk_flush_complete_seq(rq, REQ_FSEQ_DATA, error)) + blk_mq_run_hw_queue(hctx, true); + spin_unlock_irqrestore(&q->mq_flush_lock, flags); +} + /** * blk_insert_flush - insert a new FLUSH/FUA request * @rq: request to insert * * To be called from __elv_add_request() for %ELEVATOR_INSERT_FLUSH insertions. + * or __blk_mq_run_hw_queue() to dispatch request. * @rq is being submitted. Analyze what needs to be done and put it on the * right queue. * * CONTEXT: - * spin_lock_irq(q->queue_lock) + * spin_lock_irq(q->queue_lock) in !mq case */ void blk_insert_flush(struct request *rq) { @@ -316,7 +420,10 @@ void blk_insert_flush(struct request *rq) * complete the request. 
*/ if (!policy) { - __blk_end_bidi_request(rq, 0, 0, 0); + if (q->mq_ops) + blk_mq_end_io(rq, 0); + else + __blk_end_bidi_request(rq, 0, 0, 0); return; } @@ -329,7 +436,10 @@ void blk_insert_flush(struct request *rq) */ if ((policy & REQ_FSEQ_DATA) && !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) { - list_add_tail(&rq->queuelist, &q->queue_head); + if (q->mq_ops) { + blk_mq_run_request(rq, false, true); + } else + list_add_tail(&rq->queuelist, &q->queue_head); return; } @@ -341,6 +451,14 @@ void blk_insert_flush(struct request *rq) INIT_LIST_HEAD(&rq->flush.list); rq->cmd_flags |= REQ_FLUSH_SEQ; rq->flush.saved_end_io = rq->end_io; /* Usually NULL */ + if (q->mq_ops) { + rq->end_io = mq_flush_data_end_io; + + spin_lock_irq(&q->mq_flush_lock); + blk_flush_complete_seq(rq, REQ_FSEQ_ACTIONS & ~policy, 0); + spin_unlock_irq(&q->mq_flush_lock); + return; + } rq->end_io = flush_data_end_io; blk_flush_complete_seq(rq, REQ_FSEQ_ACTIONS & ~policy, 0); @@ -384,15 +502,6 @@ void blk_abort_flushes(struct request_queue *q) } } -static void bio_end_flush(struct bio *bio, int err) -{ - if (err) - clear_bit(BIO_UPTODATE, &bio->bi_flags); - if (bio->bi_private) - complete(bio->bi_private); - bio_put(bio); -} - /** * blkdev_issue_flush - queue a flush * @bdev: blockdev to issue flush for @@ -408,7 +517,6 @@ static void bio_end_flush(struct bio *bio, int err) int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, sector_t *error_sector) { - DECLARE_COMPLETION_ONSTACK(wait); struct request_queue *q; struct bio *bio; int ret = 0; @@ -430,13 +538,9 @@ int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, return -ENXIO; bio = bio_alloc(gfp_mask, 0); - bio->bi_end_io = bio_end_flush; bio->bi_bdev = bdev; - bio->bi_private = &wait; - bio_get(bio); - submit_bio(WRITE_FLUSH, bio); - wait_for_completion_io(&wait); + ret = submit_bio_wait(WRITE_FLUSH, bio); /* * The driver must store the error location in ->bi_sector, if @@ -446,10 +550,13 @@ int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, if (error_sector) *error_sector = bio->bi_sector; - if (!bio_flagged(bio, BIO_UPTODATE)) - ret = -EIO; - bio_put(bio); return ret; } EXPORT_SYMBOL(blkdev_issue_flush); + +void blk_mq_init_flush(struct request_queue *q) +{ + spin_lock_init(&q->mq_flush_lock); + INIT_WORK(&q->mq_flush_work, mq_flush_work); +} diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 9c4bb82..242df01 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -6,7 +6,6 @@ #include <linux/init.h> #include <linux/bio.h> #include <linux/blkdev.h> -#include <linux/bootmem.h> /* for max_pfn/max_low_pfn */ #include <linux/slab.h> #include "blk.h" @@ -144,7 +143,8 @@ void put_io_context(struct io_context *ioc) if (atomic_long_dec_and_test(&ioc->refcount)) { spin_lock_irqsave(&ioc->lock, flags); if (!hlist_empty(&ioc->icq_list)) - schedule_work(&ioc->release_work); + queue_work(system_power_efficient_wq, + &ioc->release_work); else free_ioc = true; spin_unlock_irqrestore(&ioc->lock, flags); @@ -366,7 +366,7 @@ struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q, if (!icq) return NULL; - if (radix_tree_preload(gfp_mask) < 0) { + if (radix_tree_maybe_preload(gfp_mask) < 0) { kmem_cache_free(et->icq_cache, icq); return NULL; } diff --git a/block/blk-iopoll.c b/block/blk-iopoll.c index 58916af..1855bf5 100644 --- a/block/blk-iopoll.c +++ b/block/blk-iopoll.c @@ -35,7 +35,7 @@ void blk_iopoll_sched(struct blk_iopoll *iop) unsigned long flags; local_irq_save(flags); - list_add_tail(&iop->list, 
&__get_cpu_var(blk_cpu_iopoll)); + list_add_tail(&iop->list, this_cpu_ptr(&blk_cpu_iopoll)); __raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ); local_irq_restore(flags); } @@ -79,7 +79,7 @@ EXPORT_SYMBOL(blk_iopoll_complete); static void blk_iopoll_softirq(struct softirq_action *h) { - struct list_head *list = &__get_cpu_var(blk_cpu_iopoll); + struct list_head *list = this_cpu_ptr(&blk_cpu_iopoll); int rearm = 0, budget = blk_iopoll_budget; unsigned long start_time = jiffies; @@ -189,8 +189,8 @@ void blk_iopoll_init(struct blk_iopoll *iop, int weight, blk_iopoll_fn *poll_fn) } EXPORT_SYMBOL(blk_iopoll_init); -static int __cpuinit blk_iopoll_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) +static int blk_iopoll_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) { /* * If a CPU goes away, splice its entries to the current CPU @@ -201,7 +201,7 @@ static int __cpuinit blk_iopoll_cpu_notify(struct notifier_block *self, local_irq_disable(); list_splice_init(&per_cpu(blk_cpu_iopoll, cpu), - &__get_cpu_var(blk_cpu_iopoll)); + this_cpu_ptr(&blk_cpu_iopoll)); __raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ); local_irq_enable(); } @@ -209,7 +209,7 @@ static int __cpuinit blk_iopoll_cpu_notify(struct notifier_block *self, return NOTIFY_OK; } -static struct notifier_block __cpuinitdata blk_iopoll_cpu_notifier = { +static struct notifier_block blk_iopoll_cpu_notifier = { .notifier_call = blk_iopoll_cpu_notify, }; diff --git a/block/blk-lib.c b/block/blk-lib.c index d6f50d5..9b5b561 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -43,8 +43,8 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, DECLARE_COMPLETION_ONSTACK(wait); struct request_queue *q = bdev_get_queue(bdev); int type = REQ_WRITE | REQ_DISCARD; - sector_t max_discard_sectors; - sector_t granularity, alignment; + unsigned int max_discard_sectors, granularity; + int alignment; struct bio_batch bb; struct bio *bio; int ret = 0; @@ -58,16 +58,14 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, /* Zero-sector (unknown) and one-sector granularities are the same. */ granularity = max(q->limits.discard_granularity >> 9, 1U); - alignment = bdev_discard_alignment(bdev) >> 9; - alignment = sector_div(alignment, granularity); + alignment = (bdev_discard_alignment(bdev) >> 9) % granularity; /* * Ensure that max_discard_sectors is of the proper * granularity, so that requests stay aligned after a split. */ max_discard_sectors = min(q->limits.max_discard_sectors, UINT_MAX >> 9); - sector_div(max_discard_sectors, granularity); - max_discard_sectors *= granularity; + max_discard_sectors -= max_discard_sectors % granularity; if (unlikely(!max_discard_sectors)) { /* Avoid infinite loop below. Being cautious never hurts. */ return -EOPNOTSUPP; diff --git a/block/blk-merge.c b/block/blk-merge.c index 5f24482..1ffc589 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -308,6 +308,17 @@ int ll_front_merge_fn(struct request_queue *q, struct request *req, return ll_new_hw_segment(q, req, bio); } +/* + * blk-mq uses req->special to carry normal driver per-request payload, it + * does not indicate a prepared command that we cannot merge with. 
+ */ +static bool req_no_special_merge(struct request *req) +{ + struct request_queue *q = req->q; + + return !q->mq_ops && req->special; +} + static int ll_merge_requests_fn(struct request_queue *q, struct request *req, struct request *next) { @@ -319,7 +330,7 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req, * First check if the either of the requests are re-queued * requests. Can't merge them if they are. */ - if (req->special || next->special) + if (req_no_special_merge(req) || req_no_special_merge(next)) return 0; /* @@ -416,7 +427,7 @@ static int attempt_merge(struct request_queue *q, struct request *req, if (rq_data_dir(req) != rq_data_dir(next) || req->rq_disk != next->rq_disk - || next->special) + || req_no_special_merge(next)) return 0; if (req->cmd_flags & REQ_WRITE_SAME && @@ -515,7 +526,7 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio) return false; /* must be same device and not a special request */ - if (rq->rq_disk != bio->bi_bdev->bd_disk || rq->special) + if (rq->rq_disk != bio->bi_bdev->bd_disk || req_no_special_merge(rq)) return false; /* only merge integrity protected bio into ditto rq */ diff --git a/block/blk-mq-cpu.c b/block/blk-mq-cpu.c new file mode 100644 index 0000000..0045ace --- /dev/null +++ b/block/blk-mq-cpu.c @@ -0,0 +1,93 @@ +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/blkdev.h> +#include <linux/list.h> +#include <linux/llist.h> +#include <linux/smp.h> +#include <linux/cpu.h> + +#include <linux/blk-mq.h> +#include "blk-mq.h" + +static LIST_HEAD(blk_mq_cpu_notify_list); +static DEFINE_SPINLOCK(blk_mq_cpu_notify_lock); + +static int blk_mq_main_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + unsigned int cpu = (unsigned long) hcpu; + struct blk_mq_cpu_notifier *notify; + + spin_lock(&blk_mq_cpu_notify_lock); + + list_for_each_entry(notify, &blk_mq_cpu_notify_list, list) + notify->notify(notify->data, action, cpu); + + spin_unlock(&blk_mq_cpu_notify_lock); + return NOTIFY_OK; +} + +static void blk_mq_cpu_notify(void *data, unsigned long action, + unsigned int cpu) +{ + if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { + /* + * If the CPU goes away, ensure that we run any pending + * completions. 
+ */ + struct llist_node *node; + struct request *rq; + + local_irq_disable(); + + node = llist_del_all(&per_cpu(ipi_lists, cpu)); + while (node) { + struct llist_node *next = node->next; + + rq = llist_entry(node, struct request, ll_list); + __blk_mq_end_io(rq, rq->errors); + node = next; + } + + local_irq_enable(); + } +} + +static struct notifier_block __cpuinitdata blk_mq_main_cpu_notifier = { + .notifier_call = blk_mq_main_cpu_notify, +}; + +void blk_mq_register_cpu_notifier(struct blk_mq_cpu_notifier *notifier) +{ + BUG_ON(!notifier->notify); + + spin_lock(&blk_mq_cpu_notify_lock); + list_add_tail(¬ifier->list, &blk_mq_cpu_notify_list); + spin_unlock(&blk_mq_cpu_notify_lock); +} + +void blk_mq_unregister_cpu_notifier(struct blk_mq_cpu_notifier *notifier) +{ + spin_lock(&blk_mq_cpu_notify_lock); + list_del(¬ifier->list); + spin_unlock(&blk_mq_cpu_notify_lock); +} + +void blk_mq_init_cpu_notifier(struct blk_mq_cpu_notifier *notifier, + void (*fn)(void *, unsigned long, unsigned int), + void *data) +{ + notifier->notify = fn; + notifier->data = data; +} + +static struct blk_mq_cpu_notifier __cpuinitdata cpu_notifier = { + .notify = blk_mq_cpu_notify, +}; + +void __init blk_mq_cpu_init(void) +{ + register_hotcpu_notifier(&blk_mq_main_cpu_notifier); + blk_mq_register_cpu_notifier(&cpu_notifier); +} diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c new file mode 100644 index 0000000..f872127 --- /dev/null +++ b/block/blk-mq-cpumap.c @@ -0,0 +1,108 @@ +#include <linux/kernel.h> +#include <linux/threads.h> +#include <linux/module.h> +#include <linux/mm.h> +#include <linux/smp.h> +#include <linux/cpu.h> + +#include <linux/blk-mq.h> +#include "blk.h" +#include "blk-mq.h" + +static void show_map(unsigned int *map, unsigned int nr) +{ + int i; + + pr_info("blk-mq: CPU -> queue map\n"); + for_each_online_cpu(i) + pr_info(" CPU%2u -> Queue %u\n", i, map[i]); +} + +static int cpu_to_queue_index(unsigned int nr_cpus, unsigned int nr_queues, + const int cpu) +{ + return cpu / ((nr_cpus + nr_queues - 1) / nr_queues); +} + +static int get_first_sibling(unsigned int cpu) +{ + unsigned int ret; + + ret = cpumask_first(topology_thread_cpumask(cpu)); + if (ret < nr_cpu_ids) + return ret; + + return cpu; +} + +int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues) +{ + unsigned int i, nr_cpus, nr_uniq_cpus, queue, first_sibling; + cpumask_var_t cpus; + + if (!alloc_cpumask_var(&cpus, GFP_ATOMIC)) + return 1; + + cpumask_clear(cpus); + nr_cpus = nr_uniq_cpus = 0; + for_each_online_cpu(i) { + nr_cpus++; + first_sibling = get_first_sibling(i); + if (!cpumask_test_cpu(first_sibling, cpus)) + nr_uniq_cpus++; + cpumask_set_cpu(i, cpus); + } + + queue = 0; + for_each_possible_cpu(i) { + if (!cpu_online(i)) { + map[i] = 0; + continue; + } + + /* + * Easy case - we have equal or more hardware queues. Or + * there are no thread siblings to take into account. Do + * 1:1 if enough, or sequential mapping if less. + */ + if (nr_queues >= nr_cpus || nr_cpus == nr_uniq_cpus) { + map[i] = cpu_to_queue_index(nr_cpus, nr_queues, queue); + queue++; + continue; + } + + /* + * Less then nr_cpus queues, and we have some number of + * threads per cores. Map sibling threads to the same + * queue. 
+ */ + first_sibling = get_first_sibling(i); + if (first_sibling == i) { + map[i] = cpu_to_queue_index(nr_uniq_cpus, nr_queues, + queue); + queue++; + } else + map[i] = map[first_sibling]; + } + + show_map(map, nr_cpus); + free_cpumask_var(cpus); + return 0; +} + +unsigned int *blk_mq_make_queue_map(struct blk_mq_reg *reg) +{ + unsigned int *map; + + /* If cpus are offline, map them to first hctx */ + map = kzalloc_node(sizeof(*map) * num_possible_cpus(), GFP_KERNEL, + reg->numa_node); + if (!map) + return NULL; + + if (!blk_mq_update_queue_map(map, reg->nr_hw_queues)) + return map; + + kfree(map); + return NULL; +} diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c new file mode 100644 index 0000000..ba6cf8e --- /dev/null +++ b/block/blk-mq-sysfs.c @@ -0,0 +1,384 @@ +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/backing-dev.h> +#include <linux/bio.h> +#include <linux/blkdev.h> +#include <linux/mm.h> +#include <linux/init.h> +#include <linux/slab.h> +#include <linux/workqueue.h> +#include <linux/smp.h> + +#include <linux/blk-mq.h> +#include "blk-mq.h" +#include "blk-mq-tag.h" + +static void blk_mq_sysfs_release(struct kobject *kobj) +{ +} + +struct blk_mq_ctx_sysfs_entry { + struct attribute attr; + ssize_t (*show)(struct blk_mq_ctx *, char *); + ssize_t (*store)(struct blk_mq_ctx *, const char *, size_t); +}; + +struct blk_mq_hw_ctx_sysfs_entry { + struct attribute attr; + ssize_t (*show)(struct blk_mq_hw_ctx *, char *); + ssize_t (*store)(struct blk_mq_hw_ctx *, const char *, size_t); +}; + +static ssize_t blk_mq_sysfs_show(struct kobject *kobj, struct attribute *attr, + char *page) +{ + struct blk_mq_ctx_sysfs_entry *entry; + struct blk_mq_ctx *ctx; + struct request_queue *q; + ssize_t res; + + entry = container_of(attr, struct blk_mq_ctx_sysfs_entry, attr); + ctx = container_of(kobj, struct blk_mq_ctx, kobj); + q = ctx->queue; + + if (!entry->show) + return -EIO; + + res = -ENOENT; + mutex_lock(&q->sysfs_lock); + if (!blk_queue_dying(q)) + res = entry->show(ctx, page); + mutex_unlock(&q->sysfs_lock); + return res; +} + +static ssize_t blk_mq_sysfs_store(struct kobject *kobj, struct attribute *attr, + const char *page, size_t length) +{ + struct blk_mq_ctx_sysfs_entry *entry; + struct blk_mq_ctx *ctx; + struct request_queue *q; + ssize_t res; + + entry = container_of(attr, struct blk_mq_ctx_sysfs_entry, attr); + ctx = container_of(kobj, struct blk_mq_ctx, kobj); + q = ctx->queue; + + if (!entry->store) + return -EIO; + + res = -ENOENT; + mutex_lock(&q->sysfs_lock); + if (!blk_queue_dying(q)) + res = entry->store(ctx, page, length); + mutex_unlock(&q->sysfs_lock); + return res; +} + +static ssize_t blk_mq_hw_sysfs_show(struct kobject *kobj, + struct attribute *attr, char *page) +{ + struct blk_mq_hw_ctx_sysfs_entry *entry; + struct blk_mq_hw_ctx *hctx; + struct request_queue *q; + ssize_t res; + + entry = container_of(attr, struct blk_mq_hw_ctx_sysfs_entry, attr); + hctx = container_of(kobj, struct blk_mq_hw_ctx, kobj); + q = hctx->queue; + + if (!entry->show) + return -EIO; + + res = -ENOENT; + mutex_lock(&q->sysfs_lock); + if (!blk_queue_dying(q)) + res = entry->show(hctx, page); + mutex_unlock(&q->sysfs_lock); + return res; +} + +static ssize_t blk_mq_hw_sysfs_store(struct kobject *kobj, + struct attribute *attr, const char *page, + size_t length) +{ + struct blk_mq_hw_ctx_sysfs_entry *entry; + struct blk_mq_hw_ctx *hctx; + struct request_queue *q; + ssize_t res; + + entry = container_of(attr, struct blk_mq_hw_ctx_sysfs_entry, attr); + hctx = 
container_of(kobj, struct blk_mq_hw_ctx, kobj); + q = hctx->queue; + + if (!entry->store) + return -EIO; + + res = -ENOENT; + mutex_lock(&q->sysfs_lock); + if (!blk_queue_dying(q)) + res = entry->store(hctx, page, length); + mutex_unlock(&q->sysfs_lock); + return res; +} + +static ssize_t blk_mq_sysfs_dispatched_show(struct blk_mq_ctx *ctx, char *page) +{ + return sprintf(page, "%lu %lu\n", ctx->rq_dispatched[1], + ctx->rq_dispatched[0]); +} + +static ssize_t blk_mq_sysfs_merged_show(struct blk_mq_ctx *ctx, char *page) +{ + return sprintf(page, "%lu\n", ctx->rq_merged); +} + +static ssize_t blk_mq_sysfs_completed_show(struct blk_mq_ctx *ctx, char *page) +{ + return sprintf(page, "%lu %lu\n", ctx->rq_completed[1], + ctx->rq_completed[0]); +} + +static ssize_t sysfs_list_show(char *page, struct list_head *list, char *msg) +{ + char *start_page = page; + struct request *rq; + + page += sprintf(page, "%s:\n", msg); + + list_for_each_entry(rq, list, queuelist) + page += sprintf(page, "\t%p\n", rq); + + return page - start_page; +} + +static ssize_t blk_mq_sysfs_rq_list_show(struct blk_mq_ctx *ctx, char *page) +{ + ssize_t ret; + + spin_lock(&ctx->lock); + ret = sysfs_list_show(page, &ctx->rq_list, "CTX pending"); + spin_unlock(&ctx->lock); + + return ret; +} + +static ssize_t blk_mq_hw_sysfs_queued_show(struct blk_mq_hw_ctx *hctx, + char *page) +{ + return sprintf(page, "%lu\n", hctx->queued); +} + +static ssize_t blk_mq_hw_sysfs_run_show(struct blk_mq_hw_ctx *hctx, char *page) +{ + return sprintf(page, "%lu\n", hctx->run); +} + +static ssize_t blk_mq_hw_sysfs_dispatched_show(struct blk_mq_hw_ctx *hctx, + char *page) +{ + char *start_page = page; + int i; + + page += sprintf(page, "%8u\t%lu\n", 0U, hctx->dispatched[0]); + + for (i = 1; i < BLK_MQ_MAX_DISPATCH_ORDER; i++) { + unsigned long d = 1U << (i - 1); + + page += sprintf(page, "%8lu\t%lu\n", d, hctx->dispatched[i]); + } + + return page - start_page; +} + +static ssize_t blk_mq_hw_sysfs_rq_list_show(struct blk_mq_hw_ctx *hctx, + char *page) +{ + ssize_t ret; + + spin_lock(&hctx->lock); + ret = sysfs_list_show(page, &hctx->dispatch, "HCTX pending"); + spin_unlock(&hctx->lock); + + return ret; +} + +static ssize_t blk_mq_hw_sysfs_ipi_show(struct blk_mq_hw_ctx *hctx, char *page) +{ + ssize_t ret; + + spin_lock(&hctx->lock); + ret = sprintf(page, "%u\n", !!(hctx->flags & BLK_MQ_F_SHOULD_IPI)); + spin_unlock(&hctx->lock); + + return ret; +} + +static ssize_t blk_mq_hw_sysfs_ipi_store(struct blk_mq_hw_ctx *hctx, + const char *page, size_t len) +{ + struct blk_mq_ctx *ctx; + unsigned long ret; + unsigned int i; + + if (kstrtoul(page, 10, &ret)) { + pr_err("blk-mq-sysfs: invalid input '%s'\n", page); + return -EINVAL; + } + + spin_lock(&hctx->lock); + if (ret) + hctx->flags |= BLK_MQ_F_SHOULD_IPI; + else + hctx->flags &= ~BLK_MQ_F_SHOULD_IPI; + spin_unlock(&hctx->lock); + + hctx_for_each_ctx(hctx, ctx, i) + ctx->ipi_redirect = !!ret; + + return len; +} + +static ssize_t blk_mq_hw_sysfs_tags_show(struct blk_mq_hw_ctx *hctx, char *page) +{ + return blk_mq_tag_sysfs_show(hctx->tags, page); +} + +static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_dispatched = { + .attr = {.name = "dispatched", .mode = S_IRUGO }, + .show = blk_mq_sysfs_dispatched_show, +}; +static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_merged = { + .attr = {.name = "merged", .mode = S_IRUGO }, + .show = blk_mq_sysfs_merged_show, +}; +static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_completed = { + .attr = {.name = "completed", .mode = S_IRUGO }, + .show = 
blk_mq_sysfs_completed_show, +}; +static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_rq_list = { + .attr = {.name = "rq_list", .mode = S_IRUGO }, + .show = blk_mq_sysfs_rq_list_show, +}; + +static struct attribute *default_ctx_attrs[] = { + &blk_mq_sysfs_dispatched.attr, + &blk_mq_sysfs_merged.attr, + &blk_mq_sysfs_completed.attr, + &blk_mq_sysfs_rq_list.attr, + NULL, +}; + +static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_queued = { + .attr = {.name = "queued", .mode = S_IRUGO }, + .show = blk_mq_hw_sysfs_queued_show, +}; +static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_run = { + .attr = {.name = "run", .mode = S_IRUGO }, + .show = blk_mq_hw_sysfs_run_show, +}; +static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_dispatched = { + .attr = {.name = "dispatched", .mode = S_IRUGO }, + .show = blk_mq_hw_sysfs_dispatched_show, +}; +static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_pending = { + .attr = {.name = "pending", .mode = S_IRUGO }, + .show = blk_mq_hw_sysfs_rq_list_show, +}; +static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_ipi = { + .attr = {.name = "ipi_redirect", .mode = S_IRUGO | S_IWUSR}, + .show = blk_mq_hw_sysfs_ipi_show, + .store = blk_mq_hw_sysfs_ipi_store, +}; +static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_tags = { + .attr = {.name = "tags", .mode = S_IRUGO }, + .show = blk_mq_hw_sysfs_tags_show, +}; + +static struct attribute *default_hw_ctx_attrs[] = { + &blk_mq_hw_sysfs_queued.attr, + &blk_mq_hw_sysfs_run.attr, + &blk_mq_hw_sysfs_dispatched.attr, + &blk_mq_hw_sysfs_pending.attr, + &blk_mq_hw_sysfs_ipi.attr, + &blk_mq_hw_sysfs_tags.attr, + NULL, +}; + +static const struct sysfs_ops blk_mq_sysfs_ops = { + .show = blk_mq_sysfs_show, + .store = blk_mq_sysfs_store, +}; + +static const struct sysfs_ops blk_mq_hw_sysfs_ops = { + .show = blk_mq_hw_sysfs_show, + .store = blk_mq_hw_sysfs_store, +}; + +static struct kobj_type blk_mq_ktype = { + .sysfs_ops = &blk_mq_sysfs_ops, + .release = blk_mq_sysfs_release, +}; + +static struct kobj_type blk_mq_ctx_ktype = { + .sysfs_ops = &blk_mq_sysfs_ops, + .default_attrs = default_ctx_attrs, + .release = blk_mq_sysfs_release, +}; + +static struct kobj_type blk_mq_hw_ktype = { + .sysfs_ops = &blk_mq_hw_sysfs_ops, + .default_attrs = default_hw_ctx_attrs, + .release = blk_mq_sysfs_release, +}; + +void blk_mq_unregister_disk(struct gendisk *disk) +{ + struct request_queue *q = disk->queue; + + kobject_uevent(&q->mq_kobj, KOBJ_REMOVE); + kobject_del(&q->mq_kobj); + + kobject_put(&disk_to_dev(disk)->kobj); +} + +int blk_mq_register_disk(struct gendisk *disk) +{ + struct device *dev = disk_to_dev(disk); + struct request_queue *q = disk->queue; + struct blk_mq_hw_ctx *hctx; + struct blk_mq_ctx *ctx; + int ret, i, j; + + kobject_init(&q->mq_kobj, &blk_mq_ktype); + + ret = kobject_add(&q->mq_kobj, kobject_get(&dev->kobj), "%s", "mq"); + if (ret < 0) + return ret; + + kobject_uevent(&q->mq_kobj, KOBJ_ADD); + + queue_for_each_hw_ctx(q, hctx, i) { + kobject_init(&hctx->kobj, &blk_mq_hw_ktype); + ret = kobject_add(&hctx->kobj, &q->mq_kobj, "%u", i); + if (ret) + break; + + if (!hctx->nr_ctx) + continue; + + hctx_for_each_ctx(hctx, ctx, j) { + kobject_init(&ctx->kobj, &blk_mq_ctx_ktype); + ret = kobject_add(&ctx->kobj, &hctx->kobj, "cpu%u", ctx->cpu); + if (ret) + break; + } + } + + if (ret) { + blk_mq_unregister_disk(disk); + return ret; + } + + return 0; +} diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c new file mode 100644 index 0000000..d64a02f --- /dev/null +++ b/block/blk-mq-tag.c @@ -0,0 +1,204 @@ 
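The per-ctx and per-hctx sysfs entries above all follow the same dispatch pattern: a bare struct attribute is embedded in a type-specific entry together with show/store callbacks, and the shared sysfs_ops handler uses container_of() to recover the entry from the attribute pointer it is handed. A self-contained userspace sketch of that pattern; the container_of() here is a simplified offsetof-based form, and the entry and handler names are made up.

#include <stdio.h>
#include <stddef.h>

/* Simplified container_of(): recover the enclosing object from a
 * pointer to one of its members.
 */
#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct attribute {
        const char *name;
};

struct ctx_sysfs_entry {
        struct attribute attr;
        int (*show)(void);
};

static int dispatched_show(void)
{
        return 42;              /* stand-in for the real counter */
}

static struct ctx_sysfs_entry dispatched_entry = {
        .attr = { .name = "dispatched" },
        .show = dispatched_show,
};

/* Generic handler: only sees the attribute, uses container_of() to get
 * back to the entry and its type-specific show() callback.
 */
static int sysfs_show(struct attribute *attr)
{
        struct ctx_sysfs_entry *entry =
                container_of(attr, struct ctx_sysfs_entry, attr);

        return entry->show ? entry->show() : -1;
}

int main(void)
{
        printf("%s = %d\n", dispatched_entry.attr.name,
               sysfs_show(&dispatched_entry.attr));
        return 0;
}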
+#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/percpu_ida.h> + +#include <linux/blk-mq.h> +#include "blk.h" +#include "blk-mq.h" +#include "blk-mq-tag.h" + +/* + * Per tagged queue (tag address space) map + */ +struct blk_mq_tags { + unsigned int nr_tags; + unsigned int nr_reserved_tags; + unsigned int nr_batch_move; + unsigned int nr_max_cache; + + struct percpu_ida free_tags; + struct percpu_ida reserved_tags; +}; + +void blk_mq_wait_for_tags(struct blk_mq_tags *tags) +{ + int tag = blk_mq_get_tag(tags, __GFP_WAIT, false); + blk_mq_put_tag(tags, tag); +} + +bool blk_mq_has_free_tags(struct blk_mq_tags *tags) +{ + return !tags || + percpu_ida_free_tags(&tags->free_tags, nr_cpu_ids) != 0; +} + +static unsigned int __blk_mq_get_tag(struct blk_mq_tags *tags, gfp_t gfp) +{ + int tag; + + tag = percpu_ida_alloc(&tags->free_tags, gfp); + if (tag < 0) + return BLK_MQ_TAG_FAIL; + return tag + tags->nr_reserved_tags; +} + +static unsigned int __blk_mq_get_reserved_tag(struct blk_mq_tags *tags, + gfp_t gfp) +{ + int tag; + + if (unlikely(!tags->nr_reserved_tags)) { + WARN_ON_ONCE(1); + return BLK_MQ_TAG_FAIL; + } + + tag = percpu_ida_alloc(&tags->reserved_tags, gfp); + if (tag < 0) + return BLK_MQ_TAG_FAIL; + return tag; +} + +unsigned int blk_mq_get_tag(struct blk_mq_tags *tags, gfp_t gfp, bool reserved) +{ + if (!reserved) + return __blk_mq_get_tag(tags, gfp); + + return __blk_mq_get_reserved_tag(tags, gfp); +} + +static void __blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag) +{ + BUG_ON(tag >= tags->nr_tags); + + percpu_ida_free(&tags->free_tags, tag - tags->nr_reserved_tags); +} + +static void __blk_mq_put_reserved_tag(struct blk_mq_tags *tags, + unsigned int tag) +{ + BUG_ON(tag >= tags->nr_reserved_tags); + + percpu_ida_free(&tags->reserved_tags, tag); +} + +void blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag) +{ + if (tag >= tags->nr_reserved_tags) + __blk_mq_put_tag(tags, tag); + else + __blk_mq_put_reserved_tag(tags, tag); +} + +static int __blk_mq_tag_iter(unsigned id, void *data) +{ + unsigned long *tag_map = data; + __set_bit(id, tag_map); + return 0; +} + +void blk_mq_tag_busy_iter(struct blk_mq_tags *tags, + void (*fn)(void *, unsigned long *), void *data) +{ + unsigned long *tag_map; + size_t map_size; + + map_size = ALIGN(tags->nr_tags, BITS_PER_LONG) / BITS_PER_LONG; + tag_map = kzalloc(map_size * sizeof(unsigned long), GFP_ATOMIC); + if (!tag_map) + return; + + percpu_ida_for_each_free(&tags->free_tags, __blk_mq_tag_iter, tag_map); + if (tags->nr_reserved_tags) + percpu_ida_for_each_free(&tags->reserved_tags, __blk_mq_tag_iter, + tag_map); + + fn(data, tag_map); + kfree(tag_map); +} + +struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags, + unsigned int reserved_tags, int node) +{ + unsigned int nr_tags, nr_cache; + struct blk_mq_tags *tags; + int ret; + + if (total_tags > BLK_MQ_TAG_MAX) { + pr_err("blk-mq: tag depth too large\n"); + return NULL; + } + + tags = kzalloc_node(sizeof(*tags), GFP_KERNEL, node); + if (!tags) + return NULL; + + nr_tags = total_tags - reserved_tags; + nr_cache = nr_tags / num_possible_cpus(); + + if (nr_cache < BLK_MQ_TAG_CACHE_MIN) + nr_cache = BLK_MQ_TAG_CACHE_MIN; + else if (nr_cache > BLK_MQ_TAG_CACHE_MAX) + nr_cache = BLK_MQ_TAG_CACHE_MAX; + + tags->nr_tags = total_tags; + tags->nr_reserved_tags = reserved_tags; + tags->nr_max_cache = nr_cache; + tags->nr_batch_move = max(1u, nr_cache / 2); + + ret = __percpu_ida_init(&tags->free_tags, tags->nr_tags - + tags->nr_reserved_tags, + 
tags->nr_max_cache, + tags->nr_batch_move); + if (ret) + goto err_free_tags; + + if (reserved_tags) { + /* + * With max_cahe and batch set to 1, the allocator fallbacks to + * no cached. It's fine reserved tags allocation is slow. + */ + ret = __percpu_ida_init(&tags->reserved_tags, reserved_tags, + 1, 1); + if (ret) + goto err_reserved_tags; + } + + return tags; + +err_reserved_tags: + percpu_ida_destroy(&tags->free_tags); +err_free_tags: + kfree(tags); + return NULL; +} + +void blk_mq_free_tags(struct blk_mq_tags *tags) +{ + percpu_ida_destroy(&tags->free_tags); + percpu_ida_destroy(&tags->reserved_tags); + kfree(tags); +} + +ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page) +{ + char *orig_page = page; + int cpu; + + if (!tags) + return 0; + + page += sprintf(page, "nr_tags=%u, reserved_tags=%u, batch_move=%u," + " max_cache=%u\n", tags->nr_tags, tags->nr_reserved_tags, + tags->nr_batch_move, tags->nr_max_cache); + + page += sprintf(page, "nr_free=%u, nr_reserved=%u\n", + percpu_ida_free_tags(&tags->free_tags, nr_cpu_ids), + percpu_ida_free_tags(&tags->reserved_tags, nr_cpu_ids)); + + for_each_possible_cpu(cpu) { + page += sprintf(page, " cpu%02u: nr_free=%u\n", cpu, + percpu_ida_free_tags(&tags->free_tags, cpu)); + } + + return page - orig_page; +} diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h new file mode 100644 index 0000000..947ba2c --- /dev/null +++ b/block/blk-mq-tag.h @@ -0,0 +1,27 @@ +#ifndef INT_BLK_MQ_TAG_H +#define INT_BLK_MQ_TAG_H + +struct blk_mq_tags; + +extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int reserved_tags, int node); +extern void blk_mq_free_tags(struct blk_mq_tags *tags); + +extern unsigned int blk_mq_get_tag(struct blk_mq_tags *tags, gfp_t gfp, bool reserved); +extern void blk_mq_wait_for_tags(struct blk_mq_tags *tags); +extern void blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag); +extern void blk_mq_tag_busy_iter(struct blk_mq_tags *tags, void (*fn)(void *data, unsigned long *), void *data); +extern bool blk_mq_has_free_tags(struct blk_mq_tags *tags); +extern ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page); + +enum { + BLK_MQ_TAG_CACHE_MIN = 1, + BLK_MQ_TAG_CACHE_MAX = 64, +}; + +enum { + BLK_MQ_TAG_FAIL = -1U, + BLK_MQ_TAG_MIN = BLK_MQ_TAG_CACHE_MIN, + BLK_MQ_TAG_MAX = BLK_MQ_TAG_FAIL - 1, +}; + +#endif diff --git a/block/blk-mq.c b/block/blk-mq.c new file mode 100644 index 0000000..c79126e --- /dev/null +++ b/block/blk-mq.c @@ -0,0 +1,1510 @@ +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/backing-dev.h> +#include <linux/bio.h> +#include <linux/blkdev.h> +#include <linux/mm.h> +#include <linux/init.h> +#include <linux/slab.h> +#include <linux/workqueue.h> +#include <linux/smp.h> +#include <linux/llist.h> +#include <linux/list_sort.h> +#include <linux/cpu.h> +#include <linux/cache.h> +#include <linux/sched/sysctl.h> +#include <linux/delay.h> + +#include <trace/events/block.h> + +#include <linux/blk-mq.h> +#include "blk.h" +#include "blk-mq.h" +#include "blk-mq-tag.h" + +static DEFINE_MUTEX(all_q_mutex); +static LIST_HEAD(all_q_list); + +static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx); + +DEFINE_PER_CPU(struct llist_head, ipi_lists); + +static struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q, + unsigned int cpu) +{ + return per_cpu_ptr(q->queue_ctx, cpu); +} + +/* + * This assumes per-cpu software queueing queues. They could be per-node + * as well, for instance. For now this is hardcoded as-is. 
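For the tag allocator introduced above, blk_mq_init_tags() sizes the per-cpu cache of the percpu_ida as the non-reserved tag space divided across all possible CPUs, clamped to [BLK_MQ_TAG_CACHE_MIN, BLK_MQ_TAG_CACHE_MAX], with the batch-move size set to half the cache (but at least 1). A rough userspace model of that sizing; the tag, reservation and CPU counts are made up.

#include <stdio.h>

#define BLK_MQ_TAG_CACHE_MIN    1
#define BLK_MQ_TAG_CACHE_MAX    64

int main(void)
{
        unsigned int total_tags = 256, reserved_tags = 1, nr_cpus = 16;
        unsigned int nr_tags = total_tags - reserved_tags;
        unsigned int nr_cache = nr_tags / nr_cpus;
        unsigned int nr_batch;

        /* Clamp the per-cpu cache, then derive the batch-move size. */
        if (nr_cache < BLK_MQ_TAG_CACHE_MIN)
                nr_cache = BLK_MQ_TAG_CACHE_MIN;
        else if (nr_cache > BLK_MQ_TAG_CACHE_MAX)
                nr_cache = BLK_MQ_TAG_CACHE_MAX;
        nr_batch = nr_cache / 2 ? nr_cache / 2 : 1;

        printf("nr_tags=%u nr_max_cache=%u nr_batch_move=%u\n",
               nr_tags, nr_cache, nr_batch);
        return 0;
}

The reserved-tag pool, by contrast, is deliberately initialised with a cache and batch of 1, since reserved allocations are rare and do not need to be fast.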
Note that we don't + * care about preemption, since we know the ctx's are persistent. This does + * mean that we can't rely on ctx always matching the currently running CPU. + */ +static struct blk_mq_ctx *blk_mq_get_ctx(struct request_queue *q) +{ + return __blk_mq_get_ctx(q, get_cpu()); +} + +static void blk_mq_put_ctx(struct blk_mq_ctx *ctx) +{ + put_cpu(); +} + +/* + * Check if any of the ctx's have pending work in this hardware queue + */ +static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx) +{ + unsigned int i; + + for (i = 0; i < hctx->nr_ctx_map; i++) + if (hctx->ctx_map[i]) + return true; + + return false; +} + +/* + * Mark this ctx as having pending work in this hardware queue + */ +static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx, + struct blk_mq_ctx *ctx) +{ + if (!test_bit(ctx->index_hw, hctx->ctx_map)) + set_bit(ctx->index_hw, hctx->ctx_map); +} + +static struct request *blk_mq_alloc_rq(struct blk_mq_hw_ctx *hctx, gfp_t gfp, + bool reserved) +{ + struct request *rq; + unsigned int tag; + + tag = blk_mq_get_tag(hctx->tags, gfp, reserved); + if (tag != BLK_MQ_TAG_FAIL) { + rq = hctx->rqs[tag]; + rq->tag = tag; + + return rq; + } + + return NULL; +} + +static int blk_mq_queue_enter(struct request_queue *q) +{ + int ret; + + __percpu_counter_add(&q->mq_usage_counter, 1, 1000000); + smp_wmb(); + /* we have problems to freeze the queue if it's initializing */ + if (!blk_queue_bypass(q) || !blk_queue_init_done(q)) + return 0; + + __percpu_counter_add(&q->mq_usage_counter, -1, 1000000); + + spin_lock_irq(q->queue_lock); + ret = wait_event_interruptible_lock_irq(q->mq_freeze_wq, + !blk_queue_bypass(q), *q->queue_lock); + /* inc usage with lock hold to avoid freeze_queue runs here */ + if (!ret) + __percpu_counter_add(&q->mq_usage_counter, 1, 1000000); + spin_unlock_irq(q->queue_lock); + + return ret; +} + +static void blk_mq_queue_exit(struct request_queue *q) +{ + __percpu_counter_add(&q->mq_usage_counter, -1, 1000000); +} + +/* + * Guarantee no request is in use, so we can change any data structure of + * the queue afterward. 
+ */ +static void blk_mq_freeze_queue(struct request_queue *q) +{ + bool drain; + + spin_lock_irq(q->queue_lock); + drain = !q->bypass_depth++; + queue_flag_set(QUEUE_FLAG_BYPASS, q); + spin_unlock_irq(q->queue_lock); + + if (!drain) + return; + + while (true) { + s64 count; + + spin_lock_irq(q->queue_lock); + count = percpu_counter_sum(&q->mq_usage_counter); + spin_unlock_irq(q->queue_lock); + + if (count == 0) + break; + blk_mq_run_queues(q, false); + msleep(10); + } +} + +static void blk_mq_unfreeze_queue(struct request_queue *q) +{ + bool wake = false; + + spin_lock_irq(q->queue_lock); + if (!--q->bypass_depth) { + queue_flag_clear(QUEUE_FLAG_BYPASS, q); + wake = true; + } + WARN_ON_ONCE(q->bypass_depth < 0); + spin_unlock_irq(q->queue_lock); + if (wake) + wake_up_all(&q->mq_freeze_wq); +} + +bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx) +{ + return blk_mq_has_free_tags(hctx->tags); +} +EXPORT_SYMBOL(blk_mq_can_queue); + +static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx, + struct request *rq, unsigned int rw_flags) +{ + if (blk_queue_io_stat(q)) + rw_flags |= REQ_IO_STAT; + + rq->mq_ctx = ctx; + rq->cmd_flags = rw_flags; + ctx->rq_dispatched[rw_is_sync(rw_flags)]++; +} + +static struct request *__blk_mq_alloc_request(struct blk_mq_hw_ctx *hctx, + gfp_t gfp, bool reserved) +{ + return blk_mq_alloc_rq(hctx, gfp, reserved); +} + +static struct request *blk_mq_alloc_request_pinned(struct request_queue *q, + int rw, gfp_t gfp, + bool reserved) +{ + struct request *rq; + + do { + struct blk_mq_ctx *ctx = blk_mq_get_ctx(q); + struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q, ctx->cpu); + + rq = __blk_mq_alloc_request(hctx, gfp & ~__GFP_WAIT, reserved); + if (rq) { + blk_mq_rq_ctx_init(q, ctx, rq, rw); + break; + } + + blk_mq_put_ctx(ctx); + if (!(gfp & __GFP_WAIT)) + break; + + __blk_mq_run_hw_queue(hctx); + blk_mq_wait_for_tags(hctx->tags); + } while (1); + + return rq; +} + +struct request *blk_mq_alloc_request(struct request_queue *q, int rw, + gfp_t gfp, bool reserved) +{ + struct request *rq; + + if (blk_mq_queue_enter(q)) + return NULL; + + rq = blk_mq_alloc_request_pinned(q, rw, gfp, reserved); + if (rq) + blk_mq_put_ctx(rq->mq_ctx); + return rq; +} + +struct request *blk_mq_alloc_reserved_request(struct request_queue *q, int rw, + gfp_t gfp) +{ + struct request *rq; + + if (blk_mq_queue_enter(q)) + return NULL; + + rq = blk_mq_alloc_request_pinned(q, rw, gfp, true); + if (rq) + blk_mq_put_ctx(rq->mq_ctx); + return rq; +} +EXPORT_SYMBOL(blk_mq_alloc_reserved_request); + +/* + * Re-init and set pdu, if we have it + */ +static void blk_mq_rq_init(struct blk_mq_hw_ctx *hctx, struct request *rq) +{ + blk_rq_init(hctx->queue, rq); + + if (hctx->cmd_size) + rq->special = blk_mq_rq_to_pdu(rq); +} + +static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, + struct blk_mq_ctx *ctx, struct request *rq) +{ + const int tag = rq->tag; + struct request_queue *q = rq->q; + + blk_mq_rq_init(hctx, rq); + blk_mq_put_tag(hctx->tags, tag); + + blk_mq_queue_exit(q); +} + +void blk_mq_free_request(struct request *rq) +{ + struct blk_mq_ctx *ctx = rq->mq_ctx; + struct blk_mq_hw_ctx *hctx; + struct request_queue *q = rq->q; + + ctx->rq_completed[rq_is_sync(rq)]++; + + hctx = q->mq_ops->map_queue(q, ctx->cpu); + __blk_mq_free_request(hctx, ctx, rq); +} + +static void blk_mq_bio_endio(struct request *rq, struct bio *bio, int error) +{ + if (error) + clear_bit(BIO_UPTODATE, &bio->bi_flags); + else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) + error = -EIO; + + if 
(unlikely(rq->cmd_flags & REQ_QUIET)) + set_bit(BIO_QUIET, &bio->bi_flags); + + /* don't actually finish bio if it's part of flush sequence */ + if (!(rq->cmd_flags & REQ_FLUSH_SEQ)) + bio_endio(bio, error); +} + +void blk_mq_complete_request(struct request *rq, int error) +{ + struct bio *bio = rq->bio; + unsigned int bytes = 0; + + trace_block_rq_complete(rq->q, rq); + + while (bio) { + struct bio *next = bio->bi_next; + + bio->bi_next = NULL; + bytes += bio->bi_size; + blk_mq_bio_endio(rq, bio, error); + bio = next; + } + + blk_account_io_completion(rq, bytes); + + blk_account_io_done(rq); + + if (rq->end_io) + rq->end_io(rq, error); + else + blk_mq_free_request(rq); +} + +void __blk_mq_end_io(struct request *rq, int error) +{ + if (!blk_mark_rq_complete(rq)) + blk_mq_complete_request(rq, error); +} + +#if defined(CONFIG_SMP) + +/* + * Called with interrupts disabled. + */ +static void ipi_end_io(void *data) +{ + struct llist_head *list = &per_cpu(ipi_lists, smp_processor_id()); + struct llist_node *entry, *next; + struct request *rq; + + entry = llist_del_all(list); + + while (entry) { + next = entry->next; + rq = llist_entry(entry, struct request, ll_list); + __blk_mq_end_io(rq, rq->errors); + entry = next; + } +} + +static int ipi_remote_cpu(struct blk_mq_ctx *ctx, const int cpu, + struct request *rq, const int error) +{ + struct call_single_data *data = &rq->csd; + + rq->errors = error; + rq->ll_list.next = NULL; + + /* + * If the list is non-empty, an existing IPI must already + * be "in flight". If that is the case, we need not schedule + * a new one. + */ + if (llist_add(&rq->ll_list, &per_cpu(ipi_lists, ctx->cpu))) { + data->func = ipi_end_io; + data->flags = 0; + __smp_call_function_single(ctx->cpu, data, 0); + } + + return true; +} +#else /* CONFIG_SMP */ +static int ipi_remote_cpu(struct blk_mq_ctx *ctx, const int cpu, + struct request *rq, const int error) +{ + return false; +} +#endif + +/* + * End IO on this request on a multiqueue enabled driver. We'll either do + * it directly inline, or punt to a local IPI handler on the matching + * remote CPU. + */ +void blk_mq_end_io(struct request *rq, int error) +{ + struct blk_mq_ctx *ctx = rq->mq_ctx; + int cpu; + + if (!ctx->ipi_redirect) + return __blk_mq_end_io(rq, error); + + cpu = get_cpu(); + + if (cpu == ctx->cpu || !cpu_online(ctx->cpu) || + !ipi_remote_cpu(ctx, cpu, rq, error)) + __blk_mq_end_io(rq, error); + + put_cpu(); +} +EXPORT_SYMBOL(blk_mq_end_io); + +static void blk_mq_start_request(struct request *rq) +{ + struct request_queue *q = rq->q; + + trace_block_rq_issue(q, rq); + + /* + * Just mark start time and set the started bit. Due to memory + * ordering, we know we'll see the correct deadline as long as + * REQ_ATOMIC_STARTED is seen. + */ + rq->deadline = jiffies + q->rq_timeout; + set_bit(REQ_ATOM_STARTED, &rq->atomic_flags); +} + +static void blk_mq_requeue_request(struct request *rq) +{ + struct request_queue *q = rq->q; + + trace_block_rq_requeue(q, rq); + clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags); +} + +struct blk_mq_timeout_data { + struct blk_mq_hw_ctx *hctx; + unsigned long *next; + unsigned int *next_set; +}; + +static void blk_mq_timeout_check(void *__data, unsigned long *free_tags) +{ + struct blk_mq_timeout_data *data = __data; + struct blk_mq_hw_ctx *hctx = data->hctx; + unsigned int tag; + + /* It may not be in flight yet (this is where + * the REQ_ATOMIC_STARTED flag comes in). 
The requests are + * statically allocated, so we know it's always safe to access the + * memory associated with a bit offset into ->rqs[]. + */ + tag = 0; + do { + struct request *rq; + + tag = find_next_zero_bit(free_tags, hctx->queue_depth, tag); + if (tag >= hctx->queue_depth) + break; + + rq = hctx->rqs[tag++]; + + if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) + continue; + + blk_rq_check_expired(rq, data->next, data->next_set); + } while (1); +} + +static void blk_mq_hw_ctx_check_timeout(struct blk_mq_hw_ctx *hctx, + unsigned long *next, + unsigned int *next_set) +{ + struct blk_mq_timeout_data data = { + .hctx = hctx, + .next = next, + .next_set = next_set, + }; + + /* + * Ask the tagging code to iterate busy requests, so we can + * check them for timeout. + */ + blk_mq_tag_busy_iter(hctx->tags, blk_mq_timeout_check, &data); +} + +static void blk_mq_rq_timer(unsigned long data) +{ + struct request_queue *q = (struct request_queue *) data; + struct blk_mq_hw_ctx *hctx; + unsigned long next = 0; + int i, next_set = 0; + + queue_for_each_hw_ctx(q, hctx, i) + blk_mq_hw_ctx_check_timeout(hctx, &next, &next_set); + + if (next_set) + mod_timer(&q->timeout, round_jiffies_up(next)); +} + +/* + * Reverse check our software queue for entries that we could potentially + * merge with. Currently includes a hand-wavy stop count of 8, to not spend + * too much time checking for merges. + */ +static bool blk_mq_attempt_merge(struct request_queue *q, + struct blk_mq_ctx *ctx, struct bio *bio) +{ + struct request *rq; + int checked = 8; + + list_for_each_entry_reverse(rq, &ctx->rq_list, queuelist) { + int el_ret; + + if (!checked--) + break; + + if (!blk_rq_merge_ok(rq, bio)) + continue; + + el_ret = blk_try_merge(rq, bio); + if (el_ret == ELEVATOR_BACK_MERGE) { + if (bio_attempt_back_merge(q, rq, bio)) { + ctx->rq_merged++; + return true; + } + break; + } else if (el_ret == ELEVATOR_FRONT_MERGE) { + if (bio_attempt_front_merge(q, rq, bio)) { + ctx->rq_merged++; + return true; + } + break; + } + } + + return false; +} + +void blk_mq_add_timer(struct request *rq) +{ + __blk_add_timer(rq, NULL); +} + +/* + * Run this hardware queue, pulling any software queues mapped to it in. + * Note that this function currently has various problems around ordering + * of IO. In particular, we'd like FIFO behaviour on handling existing + * items on the hctx->dispatch list. Ignore that for now. + */ +static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) +{ + struct request_queue *q = hctx->queue; + struct blk_mq_ctx *ctx; + struct request *rq; + LIST_HEAD(rq_list); + int bit, queued; + + if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->flags))) + return; + + hctx->run++; + + /* + * Touch any software queue that has pending entries. + */ + for_each_set_bit(bit, hctx->ctx_map, hctx->nr_ctx) { + clear_bit(bit, hctx->ctx_map); + ctx = hctx->ctxs[bit]; + BUG_ON(bit != ctx->index_hw); + + spin_lock(&ctx->lock); + list_splice_tail_init(&ctx->rq_list, &rq_list); + spin_unlock(&ctx->lock); + } + + /* + * If we have previous entries on our dispatch list, grab them + * and stuff them at the front for more fair dispatch. + */ + if (!list_empty_careful(&hctx->dispatch)) { + spin_lock(&hctx->lock); + if (!list_empty(&hctx->dispatch)) + list_splice_init(&hctx->dispatch, &rq_list); + spin_unlock(&hctx->lock); + } + + /* + * Delete and return all entries from our dispatch list + */ + queued = 0; + + /* + * Now process all the entries, sending them to the driver. 
+ */ + while (!list_empty(&rq_list)) { + int ret; + + rq = list_first_entry(&rq_list, struct request, queuelist); + list_del_init(&rq->queuelist); + blk_mq_start_request(rq); + + /* + * Last request in the series. Flag it as such, this + * enables drivers to know when IO should be kicked off, + * if they don't do it on a per-request basis. + * + * Note: the flag isn't the only condition drivers + * should do kick off. If drive is busy, the last + * request might not have the bit set. + */ + if (list_empty(&rq_list)) + rq->cmd_flags |= REQ_END; + + ret = q->mq_ops->queue_rq(hctx, rq); + switch (ret) { + case BLK_MQ_RQ_QUEUE_OK: + queued++; + continue; + case BLK_MQ_RQ_QUEUE_BUSY: + /* + * FIXME: we should have a mechanism to stop the queue + * like blk_stop_queue, otherwise we will waste cpu + * time + */ + list_add(&rq->queuelist, &rq_list); + blk_mq_requeue_request(rq); + break; + default: + pr_err("blk-mq: bad return on queue: %d\n", ret); + rq->errors = -EIO; + case BLK_MQ_RQ_QUEUE_ERROR: + blk_mq_end_io(rq, rq->errors); + break; + } + + if (ret == BLK_MQ_RQ_QUEUE_BUSY) + break; + } + + if (!queued) + hctx->dispatched[0]++; + else if (queued < (1 << (BLK_MQ_MAX_DISPATCH_ORDER - 1))) + hctx->dispatched[ilog2(queued) + 1]++; + + /* + * Any items that need requeuing? Stuff them into hctx->dispatch, + * that is where we will continue on next queue run. + */ + if (!list_empty(&rq_list)) { + spin_lock(&hctx->lock); + list_splice(&rq_list, &hctx->dispatch); + spin_unlock(&hctx->lock); + } +} + +void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) +{ + if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->flags))) + return; + + if (!async) + __blk_mq_run_hw_queue(hctx); + else { + struct request_queue *q = hctx->queue; + + kblockd_schedule_delayed_work(q, &hctx->delayed_work, 0); + } +} + +void blk_mq_run_queues(struct request_queue *q, bool async) +{ + struct blk_mq_hw_ctx *hctx; + int i; + + queue_for_each_hw_ctx(q, hctx, i) { + if ((!blk_mq_hctx_has_pending(hctx) && + list_empty_careful(&hctx->dispatch)) || + test_bit(BLK_MQ_S_STOPPED, &hctx->flags)) + continue; + + blk_mq_run_hw_queue(hctx, async); + } +} +EXPORT_SYMBOL(blk_mq_run_queues); + +void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx) +{ + cancel_delayed_work(&hctx->delayed_work); + set_bit(BLK_MQ_S_STOPPED, &hctx->state); +} +EXPORT_SYMBOL(blk_mq_stop_hw_queue); + +void blk_mq_stop_hw_queues(struct request_queue *q) +{ + struct blk_mq_hw_ctx *hctx; + int i; + + queue_for_each_hw_ctx(q, hctx, i) + blk_mq_stop_hw_queue(hctx); +} +EXPORT_SYMBOL(blk_mq_stop_hw_queues); + +void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx) +{ + clear_bit(BLK_MQ_S_STOPPED, &hctx->state); + __blk_mq_run_hw_queue(hctx); +} +EXPORT_SYMBOL(blk_mq_start_hw_queue); + +void blk_mq_start_stopped_hw_queues(struct request_queue *q) +{ + struct blk_mq_hw_ctx *hctx; + int i; + + queue_for_each_hw_ctx(q, hctx, i) { + if (!test_bit(BLK_MQ_S_STOPPED, &hctx->state)) + continue; + + clear_bit(BLK_MQ_S_STOPPED, &hctx->state); + blk_mq_run_hw_queue(hctx, true); + } +} +EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues); + +static void blk_mq_work_fn(struct work_struct *work) +{ + struct blk_mq_hw_ctx *hctx; + + hctx = container_of(work, struct blk_mq_hw_ctx, delayed_work.work); + __blk_mq_run_hw_queue(hctx); +} + +static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, + struct request *rq) +{ + struct blk_mq_ctx *ctx = rq->mq_ctx; + + trace_block_rq_insert(hctx->queue, rq); + + list_add_tail(&rq->queuelist, &ctx->rq_list); + blk_mq_hctx_mark_pending(hctx, 
ctx); + + /* + * We do this early, to ensure we are on the right CPU. + */ + blk_mq_add_timer(rq); +} + +void blk_mq_insert_request(struct request_queue *q, struct request *rq, + bool run_queue) +{ + struct blk_mq_hw_ctx *hctx; + struct blk_mq_ctx *ctx, *current_ctx; + + ctx = rq->mq_ctx; + hctx = q->mq_ops->map_queue(q, ctx->cpu); + + if (rq->cmd_flags & (REQ_FLUSH | REQ_FUA)) { + blk_insert_flush(rq); + } else { + current_ctx = blk_mq_get_ctx(q); + + if (!cpu_online(ctx->cpu)) { + ctx = current_ctx; + hctx = q->mq_ops->map_queue(q, ctx->cpu); + rq->mq_ctx = ctx; + } + spin_lock(&ctx->lock); + __blk_mq_insert_request(hctx, rq); + spin_unlock(&ctx->lock); + + blk_mq_put_ctx(current_ctx); + } + + if (run_queue) + __blk_mq_run_hw_queue(hctx); +} +EXPORT_SYMBOL(blk_mq_insert_request); + +/* + * This is a special version of blk_mq_insert_request to bypass FLUSH request + * check. Should only be used internally. + */ +void blk_mq_run_request(struct request *rq, bool run_queue, bool async) +{ + struct request_queue *q = rq->q; + struct blk_mq_hw_ctx *hctx; + struct blk_mq_ctx *ctx, *current_ctx; + + current_ctx = blk_mq_get_ctx(q); + + ctx = rq->mq_ctx; + if (!cpu_online(ctx->cpu)) { + ctx = current_ctx; + rq->mq_ctx = ctx; + } + hctx = q->mq_ops->map_queue(q, ctx->cpu); + + /* ctx->cpu might be offline */ + spin_lock(&ctx->lock); + __blk_mq_insert_request(hctx, rq); + spin_unlock(&ctx->lock); + + blk_mq_put_ctx(current_ctx); + + if (run_queue) + blk_mq_run_hw_queue(hctx, async); +} + +static void blk_mq_insert_requests(struct request_queue *q, + struct blk_mq_ctx *ctx, + struct list_head *list, + int depth, + bool from_schedule) + +{ + struct blk_mq_hw_ctx *hctx; + struct blk_mq_ctx *current_ctx; + + trace_block_unplug(q, depth, !from_schedule); + + current_ctx = blk_mq_get_ctx(q); + + if (!cpu_online(ctx->cpu)) + ctx = current_ctx; + hctx = q->mq_ops->map_queue(q, ctx->cpu); + + /* + * preemption doesn't flush plug list, so it's possible ctx->cpu is + * offline now + */ + spin_lock(&ctx->lock); + while (!list_empty(list)) { + struct request *rq; + + rq = list_first_entry(list, struct request, queuelist); + list_del_init(&rq->queuelist); + rq->mq_ctx = ctx; + __blk_mq_insert_request(hctx, rq); + } + spin_unlock(&ctx->lock); + + blk_mq_put_ctx(current_ctx); + + blk_mq_run_hw_queue(hctx, from_schedule); +} + +static int plug_ctx_cmp(void *priv, struct list_head *a, struct list_head *b) +{ + struct request *rqa = container_of(a, struct request, queuelist); + struct request *rqb = container_of(b, struct request, queuelist); + + return !(rqa->mq_ctx < rqb->mq_ctx || + (rqa->mq_ctx == rqb->mq_ctx && + blk_rq_pos(rqa) < blk_rq_pos(rqb))); +} + +void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) +{ + struct blk_mq_ctx *this_ctx; + struct request_queue *this_q; + struct request *rq; + LIST_HEAD(list); + LIST_HEAD(ctx_list); + unsigned int depth; + + list_splice_init(&plug->mq_list, &list); + + list_sort(NULL, &list, plug_ctx_cmp); + + this_q = NULL; + this_ctx = NULL; + depth = 0; + + while (!list_empty(&list)) { + rq = list_entry_rq(list.next); + list_del_init(&rq->queuelist); + BUG_ON(!rq->q); + if (rq->mq_ctx != this_ctx) { + if (this_ctx) { + blk_mq_insert_requests(this_q, this_ctx, + &ctx_list, depth, + from_schedule); + } + + this_ctx = rq->mq_ctx; + this_q = rq->q; + depth = 0; + } + + depth++; + list_add_tail(&rq->queuelist, &ctx_list); + } + + /* + * If 'this_ctx' is set, we know we have entries to complete + * on 'ctx_list'. Do those. 
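blk_mq_flush_plug_list() above depends on sorting the plugged requests so that requests belonging to the same software queue become adjacent and can be handed to blk_mq_insert_requests() as one batch per ctx. A userspace sketch of that grouping, using qsort() in place of list_sort() and made-up request data.

#include <stdio.h>
#include <stdlib.h>

/* Toy stand-in for a plugged request: which software queue (ctx) it
 * belongs to and its position. Values are illustrative only.
 */
struct toy_rq {
        int ctx;
        long pos;
};

/* Same ordering idea as plug_ctx_cmp(): group by ctx first, then by
 * position within a ctx.
 */
static int toy_cmp(const void *a, const void *b)
{
        const struct toy_rq *ra = a, *rb = b;

        if (ra->ctx != rb->ctx)
                return ra->ctx - rb->ctx;
        return (ra->pos > rb->pos) - (ra->pos < rb->pos);
}

int main(void)
{
        struct toy_rq rqs[] = {
                { 2, 100 }, { 0, 8 }, { 2, 40 }, { 0, 16 }, { 1, 7 },
        };
        int n = sizeof(rqs) / sizeof(rqs[0]), i, start = 0;

        qsort(rqs, n, sizeof(rqs[0]), toy_cmp);

        /* After sorting, each run of equal ctx values is one batch,
         * mirroring the per-ctx blk_mq_insert_requests() calls.
         */
        for (i = 1; i <= n; i++) {
                if (i == n || rqs[i].ctx != rqs[start].ctx) {
                        printf("insert %d request(s) for ctx %d\n",
                               i - start, rqs[start].ctx);
                        start = i;
                }
        }
        return 0;
}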
+ */ + if (this_ctx) { + blk_mq_insert_requests(this_q, this_ctx, &ctx_list, depth, + from_schedule); + } +} + +static void blk_mq_bio_to_request(struct request *rq, struct bio *bio) +{ + init_request_from_bio(rq, bio); + blk_account_io_start(rq, 1); +} + +static void blk_mq_make_request(struct request_queue *q, struct bio *bio) +{ + struct blk_mq_hw_ctx *hctx; + struct blk_mq_ctx *ctx; + const int is_sync = rw_is_sync(bio->bi_rw); + const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA); + int rw = bio_data_dir(bio); + struct request *rq; + unsigned int use_plug, request_count = 0; + + /* + * If we have multiple hardware queues, just go directly to + * one of those for sync IO. + */ + use_plug = !is_flush_fua && ((q->nr_hw_queues == 1) || !is_sync); + + blk_queue_bounce(q, &bio); + + if (use_plug && blk_attempt_plug_merge(q, bio, &request_count)) + return; + + if (blk_mq_queue_enter(q)) { + bio_endio(bio, -EIO); + return; + } + + ctx = blk_mq_get_ctx(q); + hctx = q->mq_ops->map_queue(q, ctx->cpu); + + trace_block_getrq(q, bio, rw); + rq = __blk_mq_alloc_request(hctx, GFP_ATOMIC, false); + if (likely(rq)) + blk_mq_rq_ctx_init(q, ctx, rq, rw); + else { + blk_mq_put_ctx(ctx); + trace_block_sleeprq(q, bio, rw); + rq = blk_mq_alloc_request_pinned(q, rw, __GFP_WAIT|GFP_ATOMIC, + false); + ctx = rq->mq_ctx; + hctx = q->mq_ops->map_queue(q, ctx->cpu); + } + + hctx->queued++; + + if (unlikely(is_flush_fua)) { + blk_mq_bio_to_request(rq, bio); + blk_mq_put_ctx(ctx); + blk_insert_flush(rq); + goto run_queue; + } + + /* + * A task plug currently exists. Since this is completely lockless, + * utilize that to temporarily store requests until the task is + * either done or scheduled away. + */ + if (use_plug) { + struct blk_plug *plug = current->plug; + + if (plug) { + blk_mq_bio_to_request(rq, bio); + if (list_empty(&plug->mq_list)) + trace_block_plug(q); + else if (request_count >= BLK_MAX_REQUEST_COUNT) { + blk_flush_plug_list(plug, false); + trace_block_plug(q); + } + list_add_tail(&rq->queuelist, &plug->mq_list); + blk_mq_put_ctx(ctx); + return; + } + } + + spin_lock(&ctx->lock); + + if ((hctx->flags & BLK_MQ_F_SHOULD_MERGE) && + blk_mq_attempt_merge(q, ctx, bio)) + __blk_mq_free_request(hctx, ctx, rq); + else { + blk_mq_bio_to_request(rq, bio); + __blk_mq_insert_request(hctx, rq); + } + + spin_unlock(&ctx->lock); + blk_mq_put_ctx(ctx); + + /* + * For a SYNC request, send it to the hardware immediately. For an + * ASYNC request, just ensure that we run it later on. The latter + * allows for merging opportunities and more efficient dispatching. + */ +run_queue: + blk_mq_run_hw_queue(hctx, !is_sync || is_flush_fua); +} + +/* + * Default mapping to a software queue, since we use one per CPU. 
+ */ +struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q, const int cpu) +{ + return q->queue_hw_ctx[q->mq_map[cpu]]; +} +EXPORT_SYMBOL(blk_mq_map_queue); + +struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_reg *reg, + unsigned int hctx_index) +{ + return kmalloc_node(sizeof(struct blk_mq_hw_ctx), + GFP_KERNEL | __GFP_ZERO, reg->numa_node); +} +EXPORT_SYMBOL(blk_mq_alloc_single_hw_queue); + +void blk_mq_free_single_hw_queue(struct blk_mq_hw_ctx *hctx, + unsigned int hctx_index) +{ + kfree(hctx); +} +EXPORT_SYMBOL(blk_mq_free_single_hw_queue); + +static void blk_mq_hctx_notify(void *data, unsigned long action, + unsigned int cpu) +{ + struct blk_mq_hw_ctx *hctx = data; + struct blk_mq_ctx *ctx; + LIST_HEAD(tmp); + + if (action != CPU_DEAD && action != CPU_DEAD_FROZEN) + return; + + /* + * Move ctx entries to new CPU, if this one is going away. + */ + ctx = __blk_mq_get_ctx(hctx->queue, cpu); + + spin_lock(&ctx->lock); + if (!list_empty(&ctx->rq_list)) { + list_splice_init(&ctx->rq_list, &tmp); + clear_bit(ctx->index_hw, hctx->ctx_map); + } + spin_unlock(&ctx->lock); + + if (list_empty(&tmp)) + return; + + ctx = blk_mq_get_ctx(hctx->queue); + spin_lock(&ctx->lock); + + while (!list_empty(&tmp)) { + struct request *rq; + + rq = list_first_entry(&tmp, struct request, queuelist); + rq->mq_ctx = ctx; + list_move_tail(&rq->queuelist, &ctx->rq_list); + } + + blk_mq_hctx_mark_pending(hctx, ctx); + + spin_unlock(&ctx->lock); + blk_mq_put_ctx(ctx); +} + +static void blk_mq_init_hw_commands(struct blk_mq_hw_ctx *hctx, + void (*init)(void *, struct blk_mq_hw_ctx *, + struct request *, unsigned int), + void *data) +{ + unsigned int i; + + for (i = 0; i < hctx->queue_depth; i++) { + struct request *rq = hctx->rqs[i]; + + init(data, hctx, rq, i); + } +} + +void blk_mq_init_commands(struct request_queue *q, + void (*init)(void *, struct blk_mq_hw_ctx *, + struct request *, unsigned int), + void *data) +{ + struct blk_mq_hw_ctx *hctx; + unsigned int i; + + queue_for_each_hw_ctx(q, hctx, i) + blk_mq_init_hw_commands(hctx, init, data); +} +EXPORT_SYMBOL(blk_mq_init_commands); + +static void blk_mq_free_rq_map(struct blk_mq_hw_ctx *hctx) +{ + struct page *page; + + while (!list_empty(&hctx->page_list)) { + page = list_first_entry(&hctx->page_list, struct page, list); + list_del_init(&page->list); + __free_pages(page, page->private); + } + + kfree(hctx->rqs); + + if (hctx->tags) + blk_mq_free_tags(hctx->tags); +} + +static size_t order_to_size(unsigned int order) +{ + size_t ret = PAGE_SIZE; + + while (order--) + ret *= 2; + + return ret; +} + +static int blk_mq_init_rq_map(struct blk_mq_hw_ctx *hctx, + unsigned int reserved_tags, int node) +{ + unsigned int i, j, entries_per_page, max_order = 4; + size_t rq_size, left; + + INIT_LIST_HEAD(&hctx->page_list); + + hctx->rqs = kmalloc_node(hctx->queue_depth * sizeof(struct request *), + GFP_KERNEL, node); + if (!hctx->rqs) + return -ENOMEM; + + /* + * rq_size is the size of the request plus driver payload, rounded + * to the cacheline size + */ + rq_size = round_up(sizeof(struct request) + hctx->cmd_size, + cache_line_size()); + left = rq_size * hctx->queue_depth; + + for (i = 0; i < hctx->queue_depth;) { + int this_order = max_order; + struct page *page; + int to_do; + void *p; + + while (left < order_to_size(this_order - 1) && this_order) + this_order--; + + do { + page = alloc_pages_node(node, GFP_KERNEL, this_order); + if (page) + break; + if (!this_order--) + break; + if (order_to_size(this_order) < rq_size) + break; + } while (1); 
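blk_mq_init_rq_map() above pre-allocates the request structures by carving cache-line-rounded slots out of progressively smaller higher-order page allocations. A userspace model of that sizing arithmetic; the page size, cache-line size, base request size and cmd_size used here are hypothetical stand-ins.

#include <stdio.h>

#define TOY_PAGE_SIZE   4096u           /* assumed page size */
#define TOY_CACHE_LINE  64u             /* assumed cache-line size */

static unsigned int round_up_to(unsigned int v, unsigned int step)
{
        return ((v + step - 1) / step) * step;
}

/* Mirrors order_to_size(): an order-n allocation is PAGE_SIZE << n. */
static unsigned int order_to_size(unsigned int order)
{
        return TOY_PAGE_SIZE << order;
}

int main(void)
{
        unsigned int base_rq = 320, cmd_size = 64;      /* invented sizes */
        unsigned int rq_size = round_up_to(base_rq + cmd_size,
                                           TOY_CACHE_LINE);
        unsigned int order;

        printf("rq_size rounded to %u bytes\n", rq_size);
        for (order = 0; order <= 4; order++)
                printf("order %u: %6u bytes -> room for %3u requests\n",
                       order, order_to_size(order),
                       order_to_size(order) / rq_size);
        return 0;
}

As in the kernel code, the number of requests actually placed in a chunk is further capped by how many of the queue_depth slots are still unfilled.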
+ + if (!page) + break; + + page->private = this_order; + list_add_tail(&page->list, &hctx->page_list); + + p = page_address(page); + entries_per_page = order_to_size(this_order) / rq_size; + to_do = min(entries_per_page, hctx->queue_depth - i); + left -= to_do * rq_size; + for (j = 0; j < to_do; j++) { + hctx->rqs[i] = p; + blk_mq_rq_init(hctx, hctx->rqs[i]); + p += rq_size; + i++; + } + } + + if (i < (reserved_tags + BLK_MQ_TAG_MIN)) + goto err_rq_map; + else if (i != hctx->queue_depth) { + hctx->queue_depth = i; + pr_warn("%s: queue depth set to %u because of low memory\n", + __func__, i); + } + + hctx->tags = blk_mq_init_tags(hctx->queue_depth, reserved_tags, node); + if (!hctx->tags) { +err_rq_map: + blk_mq_free_rq_map(hctx); + return -ENOMEM; + } + + return 0; +} + +static int blk_mq_init_hw_queues(struct request_queue *q, + struct blk_mq_reg *reg, void *driver_data) +{ + struct blk_mq_hw_ctx *hctx; + unsigned int i, j; + + /* + * Initialize hardware queues + */ + queue_for_each_hw_ctx(q, hctx, i) { + unsigned int num_maps; + int node; + + node = hctx->numa_node; + if (node == NUMA_NO_NODE) + node = hctx->numa_node = reg->numa_node; + + INIT_DELAYED_WORK(&hctx->delayed_work, blk_mq_work_fn); + spin_lock_init(&hctx->lock); + INIT_LIST_HEAD(&hctx->dispatch); + hctx->queue = q; + hctx->queue_num = i; + hctx->flags = reg->flags; + hctx->queue_depth = reg->queue_depth; + hctx->cmd_size = reg->cmd_size; + + blk_mq_init_cpu_notifier(&hctx->cpu_notifier, + blk_mq_hctx_notify, hctx); + blk_mq_register_cpu_notifier(&hctx->cpu_notifier); + + if (blk_mq_init_rq_map(hctx, reg->reserved_tags, node)) + break; + + /* + * Allocate space for all possible cpus to avoid allocation in + * runtime + */ + hctx->ctxs = kmalloc_node(nr_cpu_ids * sizeof(void *), + GFP_KERNEL, node); + if (!hctx->ctxs) + break; + + num_maps = ALIGN(nr_cpu_ids, BITS_PER_LONG) / BITS_PER_LONG; + hctx->ctx_map = kzalloc_node(num_maps * sizeof(unsigned long), + GFP_KERNEL, node); + if (!hctx->ctx_map) + break; + + hctx->nr_ctx_map = num_maps; + hctx->nr_ctx = 0; + + if (reg->ops->init_hctx && + reg->ops->init_hctx(hctx, driver_data, i)) + break; + } + + if (i == q->nr_hw_queues) + return 0; + + /* + * Init failed + */ + queue_for_each_hw_ctx(q, hctx, j) { + if (i == j) + break; + + if (reg->ops->exit_hctx) + reg->ops->exit_hctx(hctx, j); + + blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier); + blk_mq_free_rq_map(hctx); + kfree(hctx->ctxs); + } + + return 1; +} + +static void blk_mq_init_cpu_queues(struct request_queue *q, + unsigned int nr_hw_queues) +{ + unsigned int i; + + for_each_possible_cpu(i) { + struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i); + struct blk_mq_hw_ctx *hctx; + + memset(__ctx, 0, sizeof(*__ctx)); + __ctx->cpu = i; + spin_lock_init(&__ctx->lock); + INIT_LIST_HEAD(&__ctx->rq_list); + __ctx->queue = q; + + /* If the cpu isn't online, the cpu is mapped to first hctx */ + hctx = q->mq_ops->map_queue(q, i); + hctx->nr_ctx++; + + if (!cpu_online(i)) + continue; + + /* + * Set local node, IFF we have more than one hw queue. 
If + * not, we remain on the home node of the device + */ + if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE) + hctx->numa_node = cpu_to_node(i); + } +} + +static void blk_mq_map_swqueue(struct request_queue *q) +{ + unsigned int i; + struct blk_mq_hw_ctx *hctx; + struct blk_mq_ctx *ctx; + + queue_for_each_hw_ctx(q, hctx, i) { + hctx->nr_ctx = 0; + } + + /* + * Map software to hardware queues + */ + queue_for_each_ctx(q, ctx, i) { + /* If the cpu isn't online, the cpu is mapped to first hctx */ + hctx = q->mq_ops->map_queue(q, i); + ctx->index_hw = hctx->nr_ctx; + hctx->ctxs[hctx->nr_ctx++] = ctx; + } +} + +struct request_queue *blk_mq_init_queue(struct blk_mq_reg *reg, + void *driver_data) +{ + struct blk_mq_hw_ctx **hctxs; + struct blk_mq_ctx *ctx; + struct request_queue *q; + int i; + + if (!reg->nr_hw_queues || + !reg->ops->queue_rq || !reg->ops->map_queue || + !reg->ops->alloc_hctx || !reg->ops->free_hctx) + return ERR_PTR(-EINVAL); + + if (!reg->queue_depth) + reg->queue_depth = BLK_MQ_MAX_DEPTH; + else if (reg->queue_depth > BLK_MQ_MAX_DEPTH) { + pr_err("blk-mq: queuedepth too large (%u)\n", reg->queue_depth); + reg->queue_depth = BLK_MQ_MAX_DEPTH; + } + + /* + * Set aside a tag for flush requests. It will only be used while + * another flush request is in progress but outside the driver. + * + * TODO: only allocate if flushes are supported + */ + reg->queue_depth++; + reg->reserved_tags++; + + if (reg->queue_depth < (reg->reserved_tags + BLK_MQ_TAG_MIN)) + return ERR_PTR(-EINVAL); + + ctx = alloc_percpu(struct blk_mq_ctx); + if (!ctx) + return ERR_PTR(-ENOMEM); + + hctxs = kmalloc_node(reg->nr_hw_queues * sizeof(*hctxs), GFP_KERNEL, + reg->numa_node); + + if (!hctxs) + goto err_percpu; + + for (i = 0; i < reg->nr_hw_queues; i++) { + hctxs[i] = reg->ops->alloc_hctx(reg, i); + if (!hctxs[i]) + goto err_hctxs; + + hctxs[i]->numa_node = NUMA_NO_NODE; + hctxs[i]->queue_num = i; + } + + q = blk_alloc_queue_node(GFP_KERNEL, reg->numa_node); + if (!q) + goto err_hctxs; + + q->mq_map = blk_mq_make_queue_map(reg); + if (!q->mq_map) + goto err_map; + + setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q); + blk_queue_rq_timeout(q, 30000); + + q->nr_queues = nr_cpu_ids; + q->nr_hw_queues = reg->nr_hw_queues; + + q->queue_ctx = ctx; + q->queue_hw_ctx = hctxs; + + q->mq_ops = reg->ops; + q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT; + + blk_queue_make_request(q, blk_mq_make_request); + blk_queue_rq_timed_out(q, reg->ops->timeout); + if (reg->timeout) + blk_queue_rq_timeout(q, reg->timeout); + + blk_mq_init_flush(q); + blk_mq_init_cpu_queues(q, reg->nr_hw_queues); + + if (blk_mq_init_hw_queues(q, reg, driver_data)) + goto err_hw; + + blk_mq_map_swqueue(q); + + mutex_lock(&all_q_mutex); + list_add_tail(&q->all_q_node, &all_q_list); + mutex_unlock(&all_q_mutex); + + return q; +err_hw: + kfree(q->mq_map); +err_map: + blk_cleanup_queue(q); +err_hctxs: + for (i = 0; i < reg->nr_hw_queues; i++) { + if (!hctxs[i]) + break; + reg->ops->free_hctx(hctxs[i], i); + } + kfree(hctxs); +err_percpu: + free_percpu(ctx); + return ERR_PTR(-ENOMEM); +} +EXPORT_SYMBOL(blk_mq_init_queue); + +void blk_mq_free_queue(struct request_queue *q) +{ + struct blk_mq_hw_ctx *hctx; + int i; + + queue_for_each_hw_ctx(q, hctx, i) { + cancel_delayed_work_sync(&hctx->delayed_work); + kfree(hctx->ctx_map); + kfree(hctx->ctxs); + blk_mq_free_rq_map(hctx); + blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier); + if (q->mq_ops->exit_hctx) + q->mq_ops->exit_hctx(hctx, i); + q->mq_ops->free_hctx(hctx, i); + } + + 
free_percpu(q->queue_ctx); + kfree(q->queue_hw_ctx); + kfree(q->mq_map); + + q->queue_ctx = NULL; + q->queue_hw_ctx = NULL; + q->mq_map = NULL; + + mutex_lock(&all_q_mutex); + list_del_init(&q->all_q_node); + mutex_unlock(&all_q_mutex); +} +EXPORT_SYMBOL(blk_mq_free_queue); + +/* Basically redo blk_mq_init_queue with queue frozen */ +static void blk_mq_queue_reinit(struct request_queue *q) +{ + blk_mq_freeze_queue(q); + + blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues); + + /* + * redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. FIXME: maybe + * we should change hctx numa_node according to new topology (this + * involves free and re-allocate memory, worthy doing?) + */ + + blk_mq_map_swqueue(q); + + blk_mq_unfreeze_queue(q); +} + +static int blk_mq_queue_reinit_notify(struct notifier_block *nb, + unsigned long action, void *hcpu) +{ + struct request_queue *q; + + /* + * Before new mapping is established, hotadded cpu might already start + * handling requests. This doesn't break anything as we map offline + * CPUs to first hardware queue. We will re-init queue below to get + * optimal settings. + */ + if (action != CPU_DEAD && action != CPU_DEAD_FROZEN && + action != CPU_ONLINE && action != CPU_ONLINE_FROZEN) + return NOTIFY_OK; + + mutex_lock(&all_q_mutex); + list_for_each_entry(q, &all_q_list, all_q_node) + blk_mq_queue_reinit(q); + mutex_unlock(&all_q_mutex); + return NOTIFY_OK; +} + +static int __init blk_mq_init(void) +{ + unsigned int i; + + for_each_possible_cpu(i) + init_llist_head(&per_cpu(ipi_lists, i)); + + blk_mq_cpu_init(); + + /* Must be called after percpu_counter_hotcpu_callback() */ + hotcpu_notifier(blk_mq_queue_reinit_notify, -10); + + return 0; +} +subsys_initcall(blk_mq_init); diff --git a/block/blk-mq.h b/block/blk-mq.h new file mode 100644 index 0000000..52bf1f9 --- /dev/null +++ b/block/blk-mq.h @@ -0,0 +1,52 @@ +#ifndef INT_BLK_MQ_H +#define INT_BLK_MQ_H + +struct blk_mq_ctx { + struct { + spinlock_t lock; + struct list_head rq_list; + } ____cacheline_aligned_in_smp; + + unsigned int cpu; + unsigned int index_hw; + unsigned int ipi_redirect; + + /* incremented at dispatch time */ + unsigned long rq_dispatched[2]; + unsigned long rq_merged; + + /* incremented at completion time */ + unsigned long ____cacheline_aligned_in_smp rq_completed[2]; + + struct request_queue *queue; + struct kobject kobj; +}; + +void __blk_mq_end_io(struct request *rq, int error); +void blk_mq_complete_request(struct request *rq, int error); +void blk_mq_run_request(struct request *rq, bool run_queue, bool async); +void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); +void blk_mq_init_flush(struct request_queue *q); + +/* + * CPU hotplug helpers + */ +struct blk_mq_cpu_notifier; +void blk_mq_init_cpu_notifier(struct blk_mq_cpu_notifier *notifier, + void (*fn)(void *, unsigned long, unsigned int), + void *data); +void blk_mq_register_cpu_notifier(struct blk_mq_cpu_notifier *notifier); +void blk_mq_unregister_cpu_notifier(struct blk_mq_cpu_notifier *notifier); +void blk_mq_cpu_init(void); +DECLARE_PER_CPU(struct llist_head, ipi_lists); + +/* + * CPU -> queue mappings + */ +struct blk_mq_reg; +extern unsigned int *blk_mq_make_queue_map(struct blk_mq_reg *reg); +extern int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues); + +void blk_mq_add_timer(struct request *rq); + +#endif diff --git a/block/blk-settings.c b/block/blk-settings.c index c50ecf0..05e8267 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -144,6 +144,7 @@ void 
blk_set_stacking_limits(struct queue_limits *lim) lim->discard_zeroes_data = 1; lim->max_segments = USHRT_MAX; lim->max_hw_sectors = UINT_MAX; + lim->max_segment_size = UINT_MAX; lim->max_sectors = UINT_MAX; lim->max_write_same_sectors = UINT_MAX; } @@ -195,17 +196,17 @@ EXPORT_SYMBOL(blk_queue_make_request); /** * blk_queue_bounce_limit - set bounce buffer limit for queue * @q: the request queue for the device - * @dma_mask: the maximum address the device can handle + * @max_addr: the maximum address the device can handle * * Description: * Different hardware can have different requirements as to what pages * it can do I/O directly to. A low level driver can call * blk_queue_bounce_limit to have lower memory pages allocated as bounce - * buffers for doing I/O to pages residing above @dma_mask. + * buffers for doing I/O to pages residing above @max_addr. **/ -void blk_queue_bounce_limit(struct request_queue *q, u64 dma_mask) +void blk_queue_bounce_limit(struct request_queue *q, u64 max_addr) { - unsigned long b_pfn = dma_mask >> PAGE_SHIFT; + unsigned long b_pfn = max_addr >> PAGE_SHIFT; int dma = 0; q->bounce_gfp = GFP_NOIO; diff --git a/block/blk-softirq.c b/block/blk-softirq.c index 467c8de..57790c1 100644 --- a/block/blk-softirq.c +++ b/block/blk-softirq.c @@ -23,7 +23,7 @@ static void blk_done_softirq(struct softirq_action *h) struct list_head *cpu_list, local_list; local_irq_disable(); - cpu_list = &__get_cpu_var(blk_cpu_done); + cpu_list = this_cpu_ptr(&blk_cpu_done); list_replace_init(cpu_list, &local_list); local_irq_enable(); @@ -36,7 +36,7 @@ static void blk_done_softirq(struct softirq_action *h) } } -#if defined(CONFIG_SMP) && defined(CONFIG_USE_GENERIC_SMP_HELPERS) +#ifdef CONFIG_SMP static void trigger_softirq(void *data) { struct request *rq = data; @@ -44,7 +44,7 @@ static void trigger_softirq(void *data) struct list_head *list; local_irq_save(flags); - list = &__get_cpu_var(blk_cpu_done); + list = this_cpu_ptr(&blk_cpu_done); list_add_tail(&rq->csd.list, list); if (list->next == &rq->csd.list) @@ -71,15 +71,15 @@ static int raise_blk_irq(int cpu, struct request *rq) return 1; } -#else /* CONFIG_SMP && CONFIG_USE_GENERIC_SMP_HELPERS */ +#else /* CONFIG_SMP */ static int raise_blk_irq(int cpu, struct request *rq) { return 1; } #endif -static int __cpuinit blk_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) +static int blk_cpu_notify(struct notifier_block *self, unsigned long action, + void *hcpu) { /* * If a CPU goes away, splice its entries to the current CPU @@ -90,7 +90,7 @@ static int __cpuinit blk_cpu_notify(struct notifier_block *self, local_irq_disable(); list_splice_init(&per_cpu(blk_cpu_done, cpu), - &__get_cpu_var(blk_cpu_done)); + this_cpu_ptr(&blk_cpu_done)); raise_softirq_irqoff(BLOCK_SOFTIRQ); local_irq_enable(); } @@ -98,7 +98,7 @@ static int __cpuinit blk_cpu_notify(struct notifier_block *self, return NOTIFY_OK; } -static struct notifier_block __cpuinitdata blk_cpu_notifier = { +static struct notifier_block blk_cpu_notifier = { .notifier_call = blk_cpu_notify, }; @@ -135,7 +135,7 @@ void __blk_complete_request(struct request *req) if (ccpu == cpu || shared) { struct list_head *list; do_local: - list = &__get_cpu_var(blk_cpu_done); + list = this_cpu_ptr(&blk_cpu_done); list_add_tail(&req->csd.list, list); /* diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 5efc5a6..9777952 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -7,6 +7,7 @@ #include <linux/bio.h> #include <linux/blkdev.h> #include <linux/blktrace_api.h> 
+#include <linux/blk-mq.h> #include "blk.h" #include "blk-cgroup.h" @@ -29,7 +30,7 @@ queue_var_store(unsigned long *var, const char *page, size_t count) int err; unsigned long v; - err = strict_strtoul(page, 10, &v); + err = kstrtoul(page, 10, &v); if (err || v > UINT_MAX) return -EINVAL; @@ -287,7 +288,7 @@ static ssize_t queue_rq_affinity_store(struct request_queue *q, const char *page, size_t count) { ssize_t ret = -EINVAL; -#if defined(CONFIG_USE_GENERIC_SMP_HELPERS) +#ifdef CONFIG_SMP unsigned long val; ret = queue_var_store(&val, page, count); @@ -542,6 +543,11 @@ static void blk_release_queue(struct kobject *kobj) if (q->queue_tags) __blk_queue_free_tags(q); + percpu_counter_destroy(&q->mq_usage_counter); + + if (q->mq_ops) + blk_mq_free_queue(q); + blk_trace_shutdown(q); bdi_destroy(&q->backing_dev_info); @@ -575,6 +581,7 @@ int blk_register_queue(struct gendisk *disk) * bypass from queue allocation. */ blk_queue_bypass_end(q); + queue_flag_set_unlocked(QUEUE_FLAG_INIT_DONE, q); ret = blk_trace_init_sysfs(dev); if (ret) @@ -588,6 +595,9 @@ int blk_register_queue(struct gendisk *disk) kobject_uevent(&q->kobj, KOBJ_ADD); + if (q->mq_ops) + blk_mq_register_disk(disk); + if (!q->request_fn) return 0; @@ -610,6 +620,9 @@ void blk_unregister_queue(struct gendisk *disk) if (WARN_ON(!q)) return; + if (q->mq_ops) + blk_mq_unregister_disk(disk); + if (q->request_fn) elv_unregister_queue(q); diff --git a/block/blk-tag.c b/block/blk-tag.c index cc345e1..3f33d86 100644 --- a/block/blk-tag.c +++ b/block/blk-tag.c @@ -348,9 +348,16 @@ int blk_queue_start_tag(struct request_queue *q, struct request *rq) */ max_depth = bqt->max_depth; if (!rq_is_sync(rq) && max_depth > 1) { - max_depth -= 2; - if (!max_depth) + switch (max_depth) { + case 2: max_depth = 1; + break; + case 3: + max_depth = 2; + break; + default: + max_depth -= 2; + } if (q->in_flight[BLK_RW_ASYNC] > max_depth) return 1; } diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 3114622..0653404 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -25,18 +25,61 @@ static struct blkcg_policy blkcg_policy_throtl; /* A workqueue to queue throttle related work */ static struct workqueue_struct *kthrotld_workqueue; -static void throtl_schedule_delayed_work(struct throtl_data *td, - unsigned long delay); - -struct throtl_rb_root { - struct rb_root rb; - struct rb_node *left; - unsigned int count; - unsigned long min_disptime; + +/* + * To implement hierarchical throttling, throtl_grps form a tree and bios + * are dispatched upwards level by level until they reach the top and get + * issued. When dispatching bios from the children and local group at each + * level, if the bios are dispatched into a single bio_list, there's a risk + * of a local or child group which can queue many bios at once filling up + * the list starving others. + * + * To avoid such starvation, dispatched bios are queued separately + * according to where they came from. When they are again dispatched to + * the parent, they're popped in round-robin order so that no single source + * hogs the dispatch window. + * + * throtl_qnode is used to keep the queued bios separated by their sources. + * Bios are queued to throtl_qnode which in turn is queued to + * throtl_service_queue and then dispatched in round-robin order. + * + * It's also used to track the reference counts on blkg's. 
A qnode always + * belongs to a throtl_grp and gets queued on itself or the parent, so + * incrementing the reference of the associated throtl_grp when a qnode is + * queued and decrementing when dequeued is enough to keep the whole blkg + * tree pinned while bios are in flight. + */ +struct throtl_qnode { + struct list_head node; /* service_queue->queued[] */ + struct bio_list bios; /* queued bios */ + struct throtl_grp *tg; /* tg this qnode belongs to */ }; -#define THROTL_RB_ROOT (struct throtl_rb_root) { .rb = RB_ROOT, .left = NULL, \ - .count = 0, .min_disptime = 0} +struct throtl_service_queue { + struct throtl_service_queue *parent_sq; /* the parent service_queue */ + + /* + * Bios queued directly to this service_queue or dispatched from + * children throtl_grp's. + */ + struct list_head queued[2]; /* throtl_qnode [READ/WRITE] */ + unsigned int nr_queued[2]; /* number of queued bios */ + + /* + * RB tree of active children throtl_grp's, which are sorted by + * their ->disptime. + */ + struct rb_root pending_tree; /* RB tree of active tgs */ + struct rb_node *first_pending; /* first node in the tree */ + unsigned int nr_pending; /* # queued in the tree */ + unsigned long first_pending_disptime; /* disptime of the first tg */ + struct timer_list pending_timer; /* fires on first_pending_disptime */ +}; + +enum tg_state_flags { + THROTL_TG_PENDING = 1 << 0, /* on parent's pending tree */ + THROTL_TG_WAS_EMPTY = 1 << 1, /* bio_lists[] became non-empty */ +}; #define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node) @@ -52,9 +95,26 @@ struct throtl_grp { /* must be the first member */ struct blkg_policy_data pd; - /* active throtl group service_tree member */ + /* active throtl group service_queue member */ struct rb_node rb_node; + /* throtl_data this group belongs to */ + struct throtl_data *td; + + /* this group's service queue */ + struct throtl_service_queue service_queue; + + /* + * qnode_on_self is used when bios are directly queued to this + * throtl_grp so that local bios compete fairly with bios + * dispatched from children. qnode_on_parent is used when bios are + * dispatched from this throtl_grp into its parent and will compete + * with the sibling qnode_on_parents and the parent's + * qnode_on_self. + */ + struct throtl_qnode qnode_on_self[2]; + struct throtl_qnode qnode_on_parent[2]; + /* * Dispatch time in jiffies. This is the estimated time when group * will unthrottle and is ready to dispatch more bio. It is used as @@ -64,11 +124,8 @@ struct throtl_grp { unsigned int flags; - /* Two lists for READ and WRITE */ - struct bio_list bio_lists[2]; - - /* Number of queued bios on READ and WRITE lists */ - unsigned int nr_queued[2]; + /* are there any throtl rules between this group and td? 
*/ + bool has_rules[2]; /* bytes per second rate limits */ uint64_t bps[2]; @@ -85,9 +142,6 @@ struct throtl_grp { unsigned long slice_start[2]; unsigned long slice_end[2]; - /* Some throttle limits got updated for the group */ - int limits_changed; - /* Per cpu stats pointer */ struct tg_stats_cpu __percpu *stats_cpu; @@ -98,7 +152,7 @@ struct throtl_grp { struct throtl_data { /* service tree for active throtl groups */ - struct throtl_rb_root tg_service_tree; + struct throtl_service_queue service_queue; struct request_queue *queue; @@ -111,9 +165,7 @@ struct throtl_data unsigned int nr_undestroyed_grps; /* Work for dispatching throttled bios */ - struct delayed_work throtl_work; - - int limits_changed; + struct work_struct dispatch_work; }; /* list and work item to allocate percpu group stats */ @@ -123,6 +175,8 @@ static LIST_HEAD(tg_stats_alloc_list); static void tg_stats_alloc_fn(struct work_struct *); static DECLARE_DELAYED_WORK(tg_stats_alloc_work, tg_stats_alloc_fn); +static void throtl_pending_timer_fn(unsigned long arg); + static inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd) { return pd ? container_of(pd, struct throtl_grp, pd) : NULL; @@ -143,39 +197,69 @@ static inline struct throtl_grp *td_root_tg(struct throtl_data *td) return blkg_to_tg(td->queue->root_blkg); } -enum tg_state_flags { - THROTL_TG_FLAG_on_rr = 0, /* on round-robin busy list */ -}; - -#define THROTL_TG_FNS(name) \ -static inline void throtl_mark_tg_##name(struct throtl_grp *tg) \ -{ \ - (tg)->flags |= (1 << THROTL_TG_FLAG_##name); \ -} \ -static inline void throtl_clear_tg_##name(struct throtl_grp *tg) \ -{ \ - (tg)->flags &= ~(1 << THROTL_TG_FLAG_##name); \ -} \ -static inline int throtl_tg_##name(const struct throtl_grp *tg) \ -{ \ - return ((tg)->flags & (1 << THROTL_TG_FLAG_##name)) != 0; \ +/** + * sq_to_tg - return the throl_grp the specified service queue belongs to + * @sq: the throtl_service_queue of interest + * + * Return the throtl_grp @sq belongs to. If @sq is the top-level one + * embedded in throtl_data, %NULL is returned. + */ +static struct throtl_grp *sq_to_tg(struct throtl_service_queue *sq) +{ + if (sq && sq->parent_sq) + return container_of(sq, struct throtl_grp, service_queue); + else + return NULL; } -THROTL_TG_FNS(on_rr); +/** + * sq_to_td - return throtl_data the specified service queue belongs to + * @sq: the throtl_service_queue of interest + * + * A service_queue can be embeded in either a throtl_grp or throtl_data. + * Determine the associated throtl_data accordingly and return it. + */ +static struct throtl_data *sq_to_td(struct throtl_service_queue *sq) +{ + struct throtl_grp *tg = sq_to_tg(sq); -#define throtl_log_tg(td, tg, fmt, args...) do { \ - char __pbuf[128]; \ + if (tg) + return tg->td; + else + return container_of(sq, struct throtl_data, service_queue); +} + +/** + * throtl_log - log debug message via blktrace + * @sq: the service_queue being reported + * @fmt: printf format string + * @args: printf args + * + * The messages are prefixed with "throtl BLKG_NAME" if @sq belongs to a + * throtl_grp; otherwise, just "throtl". + * + * TODO: this should be made a function and name formatting should happen + * after testing whether blktrace is enabled. + */ +#define throtl_log(sq, fmt, args...) 
do { \ + struct throtl_grp *__tg = sq_to_tg((sq)); \ + struct throtl_data *__td = sq_to_td((sq)); \ + \ + (void)__td; \ + if ((__tg)) { \ + char __pbuf[128]; \ \ - blkg_path(tg_to_blkg(tg), __pbuf, sizeof(__pbuf)); \ - blk_add_trace_msg((td)->queue, "throtl %s " fmt, __pbuf, ##args); \ + blkg_path(tg_to_blkg(__tg), __pbuf, sizeof(__pbuf)); \ + blk_add_trace_msg(__td->queue, "throtl %s " fmt, __pbuf, ##args); \ + } else { \ + blk_add_trace_msg(__td->queue, "throtl " fmt, ##args); \ + } \ } while (0) -#define throtl_log(td, fmt, args...) \ - blk_add_trace_msg((td)->queue, "throtl " fmt, ##args) - -static inline unsigned int total_nr_queued(struct throtl_data *td) +static void tg_stats_init(struct tg_stats_cpu *tg_stats) { - return td->nr_queued[0] + td->nr_queued[1]; + blkg_rwstat_init(&tg_stats->service_bytes); + blkg_rwstat_init(&tg_stats->serviced); } /* @@ -191,12 +275,16 @@ static void tg_stats_alloc_fn(struct work_struct *work) alloc_stats: if (!stats_cpu) { + int cpu; + stats_cpu = alloc_percpu(struct tg_stats_cpu); if (!stats_cpu) { /* allocation failed, try again after some time */ schedule_delayed_work(dwork, msecs_to_jiffies(10)); return; } + for_each_possible_cpu(cpu) + tg_stats_init(per_cpu_ptr(stats_cpu, cpu)); } spin_lock_irq(&tg_stats_alloc_lock); @@ -215,15 +303,141 @@ alloc_stats: goto alloc_stats; } +static void throtl_qnode_init(struct throtl_qnode *qn, struct throtl_grp *tg) +{ + INIT_LIST_HEAD(&qn->node); + bio_list_init(&qn->bios); + qn->tg = tg; +} + +/** + * throtl_qnode_add_bio - add a bio to a throtl_qnode and activate it + * @bio: bio being added + * @qn: qnode to add bio to + * @queued: the service_queue->queued[] list @qn belongs to + * + * Add @bio to @qn and put @qn on @queued if it's not already on. + * @qn->tg's reference count is bumped when @qn is activated. See the + * comment on top of throtl_qnode definition for details. + */ +static void throtl_qnode_add_bio(struct bio *bio, struct throtl_qnode *qn, + struct list_head *queued) +{ + bio_list_add(&qn->bios, bio); + if (list_empty(&qn->node)) { + list_add_tail(&qn->node, queued); + blkg_get(tg_to_blkg(qn->tg)); + } +} + +/** + * throtl_peek_queued - peek the first bio on a qnode list + * @queued: the qnode list to peek + */ +static struct bio *throtl_peek_queued(struct list_head *queued) +{ + struct throtl_qnode *qn = list_first_entry(queued, struct throtl_qnode, node); + struct bio *bio; + + if (list_empty(queued)) + return NULL; + + bio = bio_list_peek(&qn->bios); + WARN_ON_ONCE(!bio); + return bio; +} + +/** + * throtl_pop_queued - pop the first bio form a qnode list + * @queued: the qnode list to pop a bio from + * @tg_to_put: optional out argument for throtl_grp to put + * + * Pop the first bio from the qnode list @queued. After popping, the first + * qnode is removed from @queued if empty or moved to the end of @queued so + * that the popping order is round-robin. + * + * When the first qnode is removed, its associated throtl_grp should be put + * too. If @tg_to_put is NULL, this function automatically puts it; + * otherwise, *@tg_to_put is set to the throtl_grp to put and the caller is + * responsible for putting it. 
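The throtl_qnode comments above boil down to a simple rotation: each source keeps its own FIFO, the parent always pops from the first qnode on its list, and that qnode is then moved to the tail (or dropped once empty) so no single child can hog the dispatch window. Below is a minimal userspace sketch of that rotation, using plain arrays instead of the kernel's list_head/bio_list machinery; the names source and rr_pop are illustrative only, not part of the patch.

#include <stdio.h>

#define NSRC   3
#define NITEMS 4

struct source {
    const char *name;
    int items[NITEMS];
    int head, tail;                /* simple FIFO within one source */
};

static int src_empty(struct source *s) { return s->head == s->tail; }
static int src_pop(struct source *s)   { return s->items[s->head++]; }

/*
 * Pop one item from the first source on the active list, then rotate that
 * source to the tail if it still has items, or drop it if it is exhausted,
 * mirroring how throtl_pop_queued() keeps the popping order round-robin.
 */
static int rr_pop(struct source **active, int *nr_active)
{
    struct source *s;
    int v, i;

    if (*nr_active == 0)
        return -1;

    s = active[0];
    v = src_pop(s);

    for (i = 1; i < *nr_active; i++)   /* shift the rest forward */
        active[i - 1] = active[i];

    if (!src_empty(s))
        active[*nr_active - 1] = s;    /* rotate to the tail */
    else
        (*nr_active)--;                /* drop the exhausted source */

    return v;
}

int main(void)
{
    struct source a = { "A", { 1, 2, 3, 4 }, 0, 4 };
    struct source b = { "B", { 5, 6 },       0, 2 };
    struct source *active[NSRC] = { &a, &b };
    int nr_active = 2, v;

    /* Expected order: 1 5 2 6 3 4 - neither source starves the other. */
    while ((v = rr_pop(active, &nr_active)) >= 0)
        printf("%d ", v);
    printf("\n");
    return 0;
}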
+ */ +static struct bio *throtl_pop_queued(struct list_head *queued, + struct throtl_grp **tg_to_put) +{ + struct throtl_qnode *qn = list_first_entry(queued, struct throtl_qnode, node); + struct bio *bio; + + if (list_empty(queued)) + return NULL; + + bio = bio_list_pop(&qn->bios); + WARN_ON_ONCE(!bio); + + if (bio_list_empty(&qn->bios)) { + list_del_init(&qn->node); + if (tg_to_put) + *tg_to_put = qn->tg; + else + blkg_put(tg_to_blkg(qn->tg)); + } else { + list_move_tail(&qn->node, queued); + } + + return bio; +} + +/* init a service_queue, assumes the caller zeroed it */ +static void throtl_service_queue_init(struct throtl_service_queue *sq, + struct throtl_service_queue *parent_sq) +{ + INIT_LIST_HEAD(&sq->queued[0]); + INIT_LIST_HEAD(&sq->queued[1]); + sq->pending_tree = RB_ROOT; + sq->parent_sq = parent_sq; + setup_timer(&sq->pending_timer, throtl_pending_timer_fn, + (unsigned long)sq); +} + +static void throtl_service_queue_exit(struct throtl_service_queue *sq) +{ + del_timer_sync(&sq->pending_timer); +} + static void throtl_pd_init(struct blkcg_gq *blkg) { struct throtl_grp *tg = blkg_to_tg(blkg); + struct throtl_data *td = blkg->q->td; + struct throtl_service_queue *parent_sq; unsigned long flags; + int rw; + + /* + * If sane_hierarchy is enabled, we switch to properly hierarchical + * behavior where limits on a given throtl_grp are applied to the + * whole subtree rather than just the group itself. e.g. If 16M + * read_bps limit is set on the root group, the whole system can't + * exceed 16M for the device. + * + * If sane_hierarchy is not enabled, the broken flat hierarchy + * behavior is retained where all throtl_grps are treated as if + * they're all separate root groups right below throtl_data. + * Limits of a group don't interact with limits of other groups + * regardless of the position of the group in the hierarchy. + */ + parent_sq = &td->service_queue; + + if (cgroup_sane_behavior(blkg->blkcg->css.cgroup) && blkg->parent) + parent_sq = &blkg_to_tg(blkg->parent)->service_queue; + + throtl_service_queue_init(&tg->service_queue, parent_sq); + + for (rw = READ; rw <= WRITE; rw++) { + throtl_qnode_init(&tg->qnode_on_self[rw], tg); + throtl_qnode_init(&tg->qnode_on_parent[rw], tg); + } RB_CLEAR_NODE(&tg->rb_node); - bio_list_init(&tg->bio_lists[0]); - bio_list_init(&tg->bio_lists[1]); - tg->limits_changed = false; + tg->td = td; tg->bps[READ] = -1; tg->bps[WRITE] = -1; @@ -241,6 +455,30 @@ static void throtl_pd_init(struct blkcg_gq *blkg) spin_unlock_irqrestore(&tg_stats_alloc_lock, flags); } +/* + * Set has_rules[] if @tg or any of its parents have limits configured. + * This doesn't require walking up to the top of the hierarchy as the + * parent's has_rules[] is guaranteed to be correct. + */ +static void tg_update_has_rules(struct throtl_grp *tg) +{ + struct throtl_grp *parent_tg = sq_to_tg(tg->service_queue.parent_sq); + int rw; + + for (rw = READ; rw <= WRITE; rw++) + tg->has_rules[rw] = (parent_tg && parent_tg->has_rules[rw]) || + (tg->bps[rw] != -1 || tg->iops[rw] != -1); +} + +static void throtl_pd_online(struct blkcg_gq *blkg) +{ + /* + * We don't want new groups to escape the limits of its ancestors. + * Update has_rules[] after a new group is brought online. 
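tg_update_has_rules() can get away with looking only at the immediate parent because groups come online top-down, so a parent's has_rules[] already folds in every ancestor by the time a child is initialized. Here is a small standalone illustration of that invariant, assuming a hypothetical root/mid/leaf chain and using a bps value of -1 to mean "no limit configured", as the patch does.

#include <stdbool.h>
#include <stdio.h>

struct node {
    struct node *parent;
    long bps;                  /* -1 means "no limit configured" */
    bool has_rules;
};

/* Same shape as tg_update_has_rules(): parent's flag OR an own limit. */
static void update_has_rules(struct node *n)
{
    n->has_rules = (n->parent && n->parent->has_rules) || n->bps != -1;
}

int main(void)
{
    struct node root = { NULL,  -1,       false };
    struct node mid  = { &root, 16 << 20, false };   /* 16M limit here */
    struct node leaf = { &mid,  -1,       false };

    /* Groups are brought online top-down, so one pass per node is enough. */
    update_has_rules(&root);
    update_has_rules(&mid);
    update_has_rules(&leaf);

    /* leaf inherits "has rules" from mid even though it sets no limit itself. */
    printf("root=%d mid=%d leaf=%d\n",
           root.has_rules, mid.has_rules, leaf.has_rules);
    return 0;
}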
+ */ + tg_update_has_rules(blkg_to_tg(blkg)); +} + static void throtl_pd_exit(struct blkcg_gq *blkg) { struct throtl_grp *tg = blkg_to_tg(blkg); @@ -251,6 +489,8 @@ static void throtl_pd_exit(struct blkcg_gq *blkg) spin_unlock_irqrestore(&tg_stats_alloc_lock, flags); free_percpu(tg->stats_cpu); + + throtl_service_queue_exit(&tg->service_queue); } static void throtl_pd_reset_stats(struct blkcg_gq *blkg) @@ -309,17 +549,18 @@ static struct throtl_grp *throtl_lookup_create_tg(struct throtl_data *td, return tg; } -static struct throtl_grp *throtl_rb_first(struct throtl_rb_root *root) +static struct throtl_grp * +throtl_rb_first(struct throtl_service_queue *parent_sq) { /* Service tree is empty */ - if (!root->count) + if (!parent_sq->nr_pending) return NULL; - if (!root->left) - root->left = rb_first(&root->rb); + if (!parent_sq->first_pending) + parent_sq->first_pending = rb_first(&parent_sq->pending_tree); - if (root->left) - return rb_entry_tg(root->left); + if (parent_sq->first_pending) + return rb_entry_tg(parent_sq->first_pending); return NULL; } @@ -330,29 +571,30 @@ static void rb_erase_init(struct rb_node *n, struct rb_root *root) RB_CLEAR_NODE(n); } -static void throtl_rb_erase(struct rb_node *n, struct throtl_rb_root *root) +static void throtl_rb_erase(struct rb_node *n, + struct throtl_service_queue *parent_sq) { - if (root->left == n) - root->left = NULL; - rb_erase_init(n, &root->rb); - --root->count; + if (parent_sq->first_pending == n) + parent_sq->first_pending = NULL; + rb_erase_init(n, &parent_sq->pending_tree); + --parent_sq->nr_pending; } -static void update_min_dispatch_time(struct throtl_rb_root *st) +static void update_min_dispatch_time(struct throtl_service_queue *parent_sq) { struct throtl_grp *tg; - tg = throtl_rb_first(st); + tg = throtl_rb_first(parent_sq); if (!tg) return; - st->min_disptime = tg->disptime; + parent_sq->first_pending_disptime = tg->disptime; } -static void -tg_service_tree_add(struct throtl_rb_root *st, struct throtl_grp *tg) +static void tg_service_queue_add(struct throtl_grp *tg) { - struct rb_node **node = &st->rb.rb_node; + struct throtl_service_queue *parent_sq = tg->service_queue.parent_sq; + struct rb_node **node = &parent_sq->pending_tree.rb_node; struct rb_node *parent = NULL; struct throtl_grp *__tg; unsigned long key = tg->disptime; @@ -371,89 +613,135 @@ tg_service_tree_add(struct throtl_rb_root *st, struct throtl_grp *tg) } if (left) - st->left = &tg->rb_node; + parent_sq->first_pending = &tg->rb_node; rb_link_node(&tg->rb_node, parent, node); - rb_insert_color(&tg->rb_node, &st->rb); + rb_insert_color(&tg->rb_node, &parent_sq->pending_tree); +} + +static void __throtl_enqueue_tg(struct throtl_grp *tg) +{ + tg_service_queue_add(tg); + tg->flags |= THROTL_TG_PENDING; + tg->service_queue.parent_sq->nr_pending++; } -static void __throtl_enqueue_tg(struct throtl_data *td, struct throtl_grp *tg) +static void throtl_enqueue_tg(struct throtl_grp *tg) { - struct throtl_rb_root *st = &td->tg_service_tree; + if (!(tg->flags & THROTL_TG_PENDING)) + __throtl_enqueue_tg(tg); +} - tg_service_tree_add(st, tg); - throtl_mark_tg_on_rr(tg); - st->count++; +static void __throtl_dequeue_tg(struct throtl_grp *tg) +{ + throtl_rb_erase(&tg->rb_node, tg->service_queue.parent_sq); + tg->flags &= ~THROTL_TG_PENDING; } -static void throtl_enqueue_tg(struct throtl_data *td, struct throtl_grp *tg) +static void throtl_dequeue_tg(struct throtl_grp *tg) { - if (!throtl_tg_on_rr(tg)) - __throtl_enqueue_tg(td, tg); + if (tg->flags & THROTL_TG_PENDING) + 
__throtl_dequeue_tg(tg); } -static void __throtl_dequeue_tg(struct throtl_data *td, struct throtl_grp *tg) +/* Call with queue lock held */ +static void throtl_schedule_pending_timer(struct throtl_service_queue *sq, + unsigned long expires) { - throtl_rb_erase(&tg->rb_node, &td->tg_service_tree); - throtl_clear_tg_on_rr(tg); + mod_timer(&sq->pending_timer, expires); + throtl_log(sq, "schedule timer. delay=%lu jiffies=%lu", + expires - jiffies, jiffies); } -static void throtl_dequeue_tg(struct throtl_data *td, struct throtl_grp *tg) +/** + * throtl_schedule_next_dispatch - schedule the next dispatch cycle + * @sq: the service_queue to schedule dispatch for + * @force: force scheduling + * + * Arm @sq->pending_timer so that the next dispatch cycle starts on the + * dispatch time of the first pending child. Returns %true if either timer + * is armed or there's no pending child left. %false if the current + * dispatch window is still open and the caller should continue + * dispatching. + * + * If @force is %true, the dispatch timer is always scheduled and this + * function is guaranteed to return %true. This is to be used when the + * caller can't dispatch itself and needs to invoke pending_timer + * unconditionally. Note that forced scheduling is likely to induce short + * delay before dispatch starts even if @sq->first_pending_disptime is not + * in the future and thus shouldn't be used in hot paths. + */ +static bool throtl_schedule_next_dispatch(struct throtl_service_queue *sq, + bool force) { - if (throtl_tg_on_rr(tg)) - __throtl_dequeue_tg(td, tg); + /* any pending children left? */ + if (!sq->nr_pending) + return true; + + update_min_dispatch_time(sq); + + /* is the next dispatch time in the future? */ + if (force || time_after(sq->first_pending_disptime, jiffies)) { + throtl_schedule_pending_timer(sq, sq->first_pending_disptime); + return true; + } + + /* tell the caller to continue dispatching */ + return false; } -static void throtl_schedule_next_dispatch(struct throtl_data *td) +static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg, + bool rw, unsigned long start) { - struct throtl_rb_root *st = &td->tg_service_tree; + tg->bytes_disp[rw] = 0; + tg->io_disp[rw] = 0; /* - * If there are more bios pending, schedule more work. + * Previous slice has expired. We must have trimmed it after last + * bio dispatch. That means since start of last slice, we never used + * that bandwidth. Do try to make use of that bandwidth while giving + * credit. */ - if (!total_nr_queued(td)) - return; - - BUG_ON(!st->count); + if (time_after_eq(start, tg->slice_start[rw])) + tg->slice_start[rw] = start; - update_min_dispatch_time(st); - - if (time_before_eq(st->min_disptime, jiffies)) - throtl_schedule_delayed_work(td, 0); - else - throtl_schedule_delayed_work(td, (st->min_disptime - jiffies)); + tg->slice_end[rw] = jiffies + throtl_slice; + throtl_log(&tg->service_queue, + "[%c] new slice with credit start=%lu end=%lu jiffies=%lu", + rw == READ ? 'R' : 'W', tg->slice_start[rw], + tg->slice_end[rw], jiffies); } -static inline void -throtl_start_new_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw) +static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw) { tg->bytes_disp[rw] = 0; tg->io_disp[rw] = 0; tg->slice_start[rw] = jiffies; tg->slice_end[rw] = jiffies + throtl_slice; - throtl_log_tg(td, tg, "[%c] new slice start=%lu end=%lu jiffies=%lu", - rw == READ ? 
'R' : 'W', tg->slice_start[rw], - tg->slice_end[rw], jiffies); + throtl_log(&tg->service_queue, + "[%c] new slice start=%lu end=%lu jiffies=%lu", + rw == READ ? 'R' : 'W', tg->slice_start[rw], + tg->slice_end[rw], jiffies); } -static inline void throtl_set_slice_end(struct throtl_data *td, - struct throtl_grp *tg, bool rw, unsigned long jiffy_end) +static inline void throtl_set_slice_end(struct throtl_grp *tg, bool rw, + unsigned long jiffy_end) { tg->slice_end[rw] = roundup(jiffy_end, throtl_slice); } -static inline void throtl_extend_slice(struct throtl_data *td, - struct throtl_grp *tg, bool rw, unsigned long jiffy_end) +static inline void throtl_extend_slice(struct throtl_grp *tg, bool rw, + unsigned long jiffy_end) { tg->slice_end[rw] = roundup(jiffy_end, throtl_slice); - throtl_log_tg(td, tg, "[%c] extend slice start=%lu end=%lu jiffies=%lu", - rw == READ ? 'R' : 'W', tg->slice_start[rw], - tg->slice_end[rw], jiffies); + throtl_log(&tg->service_queue, + "[%c] extend slice start=%lu end=%lu jiffies=%lu", + rw == READ ? 'R' : 'W', tg->slice_start[rw], + tg->slice_end[rw], jiffies); } /* Determine if previously allocated or extended slice is complete or not */ -static bool -throtl_slice_used(struct throtl_data *td, struct throtl_grp *tg, bool rw) +static bool throtl_slice_used(struct throtl_grp *tg, bool rw) { if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw])) return 0; @@ -462,8 +750,7 @@ throtl_slice_used(struct throtl_data *td, struct throtl_grp *tg, bool rw) } /* Trim the used slices and adjust slice start accordingly */ -static inline void -throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw) +static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw) { unsigned long nr_slices, time_elapsed, io_trim; u64 bytes_trim, tmp; @@ -475,7 +762,7 @@ throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw) * renewed. Don't try to trim the slice if slice is used. A new * slice will start when appropriate. */ - if (throtl_slice_used(td, tg, rw)) + if (throtl_slice_used(tg, rw)) return; /* @@ -486,7 +773,7 @@ throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw) * is bad because it does not allow new slice to start. */ - throtl_set_slice_end(td, tg, rw, jiffies + throtl_slice); + throtl_set_slice_end(tg, rw, jiffies + throtl_slice); time_elapsed = jiffies - tg->slice_start[rw]; @@ -515,14 +802,14 @@ throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw) tg->slice_start[rw] += nr_slices * throtl_slice; - throtl_log_tg(td, tg, "[%c] trim slice nr=%lu bytes=%llu io=%lu" - " start=%lu end=%lu jiffies=%lu", - rw == READ ? 'R' : 'W', nr_slices, bytes_trim, io_trim, - tg->slice_start[rw], tg->slice_end[rw], jiffies); + throtl_log(&tg->service_queue, + "[%c] trim slice nr=%lu bytes=%llu io=%lu start=%lu end=%lu jiffies=%lu", + rw == READ ? 
'R' : 'W', nr_slices, bytes_trim, io_trim, + tg->slice_start[rw], tg->slice_end[rw], jiffies); } -static bool tg_with_in_iops_limit(struct throtl_data *td, struct throtl_grp *tg, - struct bio *bio, unsigned long *wait) +static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio, + unsigned long *wait) { bool rw = bio_data_dir(bio); unsigned int io_allowed; @@ -571,8 +858,8 @@ static bool tg_with_in_iops_limit(struct throtl_data *td, struct throtl_grp *tg, return 0; } -static bool tg_with_in_bps_limit(struct throtl_data *td, struct throtl_grp *tg, - struct bio *bio, unsigned long *wait) +static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio, + unsigned long *wait) { bool rw = bio_data_dir(bio); u64 bytes_allowed, extra_bytes, tmp; @@ -613,18 +900,12 @@ static bool tg_with_in_bps_limit(struct throtl_data *td, struct throtl_grp *tg, return 0; } -static bool tg_no_rule_group(struct throtl_grp *tg, bool rw) { - if (tg->bps[rw] == -1 && tg->iops[rw] == -1) - return 1; - return 0; -} - /* * Returns whether one can dispatch a bio or not. Also returns approx number * of jiffies to wait before this bio is with-in IO rate and can be dispatched */ -static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg, - struct bio *bio, unsigned long *wait) +static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio, + unsigned long *wait) { bool rw = bio_data_dir(bio); unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0; @@ -635,7 +916,8 @@ static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg, * this function with a different bio if there are other bios * queued. */ - BUG_ON(tg->nr_queued[rw] && bio != bio_list_peek(&tg->bio_lists[rw])); + BUG_ON(tg->service_queue.nr_queued[rw] && + bio != throtl_peek_queued(&tg->service_queue.queued[rw])); /* If tg->bps = -1, then BW is unlimited */ if (tg->bps[rw] == -1 && tg->iops[rw] == -1) { @@ -649,15 +931,15 @@ static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg, * existing slice to make sure it is at least throtl_slice interval * long since now. */ - if (throtl_slice_used(td, tg, rw)) - throtl_start_new_slice(td, tg, rw); + if (throtl_slice_used(tg, rw)) + throtl_start_new_slice(tg, rw); else { if (time_before(tg->slice_end[rw], jiffies + throtl_slice)) - throtl_extend_slice(td, tg, rw, jiffies + throtl_slice); + throtl_extend_slice(tg, rw, jiffies + throtl_slice); } - if (tg_with_in_bps_limit(td, tg, bio, &bps_wait) - && tg_with_in_iops_limit(td, tg, bio, &iops_wait)) { + if (tg_with_in_bps_limit(tg, bio, &bps_wait) && + tg_with_in_iops_limit(tg, bio, &iops_wait)) { if (wait) *wait = 0; return 1; @@ -669,7 +951,7 @@ static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg, *wait = max_wait; if (time_before(tg->slice_end[rw], jiffies + max_wait)) - throtl_extend_slice(td, tg, rw, jiffies + max_wait); + throtl_extend_slice(tg, rw, jiffies + max_wait); return 0; } @@ -708,65 +990,136 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) tg->bytes_disp[rw] += bio->bi_size; tg->io_disp[rw]++; - throtl_update_dispatch_stats(tg_to_blkg(tg), bio->bi_size, bio->bi_rw); + /* + * REQ_THROTTLED is used to prevent the same bio to be throttled + * more than once as a throttled bio will go through blk-throtl the + * second time when it eventually gets issued. Set it when a bio + * is being charged to a tg. + * + * Dispatch stats aren't recursive and each @bio should only be + * accounted by the @tg it was originally associated with. 
Let's + * update the stats when setting REQ_THROTTLED for the first time + * which is guaranteed to be for the @bio's original tg. + */ + if (!(bio->bi_rw & REQ_THROTTLED)) { + bio->bi_rw |= REQ_THROTTLED; + throtl_update_dispatch_stats(tg_to_blkg(tg), bio->bi_size, + bio->bi_rw); + } } -static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg, - struct bio *bio) +/** + * throtl_add_bio_tg - add a bio to the specified throtl_grp + * @bio: bio to add + * @qn: qnode to use + * @tg: the target throtl_grp + * + * Add @bio to @tg's service_queue using @qn. If @qn is not specified, + * tg->qnode_on_self[] is used. + */ +static void throtl_add_bio_tg(struct bio *bio, struct throtl_qnode *qn, + struct throtl_grp *tg) { + struct throtl_service_queue *sq = &tg->service_queue; bool rw = bio_data_dir(bio); - bio_list_add(&tg->bio_lists[rw], bio); - /* Take a bio reference on tg */ - blkg_get(tg_to_blkg(tg)); - tg->nr_queued[rw]++; - td->nr_queued[rw]++; - throtl_enqueue_tg(td, tg); + if (!qn) + qn = &tg->qnode_on_self[rw]; + + /* + * If @tg doesn't currently have any bios queued in the same + * direction, queueing @bio can change when @tg should be + * dispatched. Mark that @tg was empty. This is automatically + * cleaered on the next tg_update_disptime(). + */ + if (!sq->nr_queued[rw]) + tg->flags |= THROTL_TG_WAS_EMPTY; + + throtl_qnode_add_bio(bio, qn, &sq->queued[rw]); + + sq->nr_queued[rw]++; + throtl_enqueue_tg(tg); } -static void tg_update_disptime(struct throtl_data *td, struct throtl_grp *tg) +static void tg_update_disptime(struct throtl_grp *tg) { + struct throtl_service_queue *sq = &tg->service_queue; unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime; struct bio *bio; - if ((bio = bio_list_peek(&tg->bio_lists[READ]))) - tg_may_dispatch(td, tg, bio, &read_wait); + if ((bio = throtl_peek_queued(&sq->queued[READ]))) + tg_may_dispatch(tg, bio, &read_wait); - if ((bio = bio_list_peek(&tg->bio_lists[WRITE]))) - tg_may_dispatch(td, tg, bio, &write_wait); + if ((bio = throtl_peek_queued(&sq->queued[WRITE]))) + tg_may_dispatch(tg, bio, &write_wait); min_wait = min(read_wait, write_wait); disptime = jiffies + min_wait; /* Update dispatch time */ - throtl_dequeue_tg(td, tg); + throtl_dequeue_tg(tg); tg->disptime = disptime; - throtl_enqueue_tg(td, tg); + throtl_enqueue_tg(tg); + + /* see throtl_add_bio_tg() */ + tg->flags &= ~THROTL_TG_WAS_EMPTY; } -static void tg_dispatch_one_bio(struct throtl_data *td, struct throtl_grp *tg, - bool rw, struct bio_list *bl) +static void start_parent_slice_with_credit(struct throtl_grp *child_tg, + struct throtl_grp *parent_tg, bool rw) { - struct bio *bio; + if (throtl_slice_used(parent_tg, rw)) { + throtl_start_new_slice_with_credit(parent_tg, rw, + child_tg->slice_start[rw]); + } - bio = bio_list_pop(&tg->bio_lists[rw]); - tg->nr_queued[rw]--; - /* Drop bio reference on blkg */ - blkg_put(tg_to_blkg(tg)); +} + +static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw) +{ + struct throtl_service_queue *sq = &tg->service_queue; + struct throtl_service_queue *parent_sq = sq->parent_sq; + struct throtl_grp *parent_tg = sq_to_tg(parent_sq); + struct throtl_grp *tg_to_put = NULL; + struct bio *bio; - BUG_ON(td->nr_queued[rw] <= 0); - td->nr_queued[rw]--; + /* + * @bio is being transferred from @tg to @parent_sq. Popping a bio + * from @tg may put its reference and @parent_sq might end up + * getting released prematurely. Remember the tg to put and put it + * after @bio is transferred to @parent_sq. 
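The tg_to_put handshake exists because dropping the child's last reference inside throtl_pop_queued() could release part of the blkg tree while the parent service_queue is still being used a few lines later, so the put is handed back to the caller and performed only after the bio has been re-queued. A bare-bones refcount sketch of the same pattern follows; obj, obj_get() and obj_put() are made-up stand-ins for blkg_get()/blkg_put(), not kernel interfaces.

#include <stdio.h>
#include <stdlib.h>

struct obj {
    int refs;
    const char *name;
};

static struct obj *obj_get(struct obj *o) { o->refs++; return o; }

static void obj_put(struct obj *o)
{
    if (--o->refs == 0) {
        printf("%s: last reference dropped, freeing\n", o->name);
        free(o);
    }
}

/*
 * Pop-like helper: instead of dropping the reference itself, it hands the
 * object back through @to_put so the caller decides when it is safe.
 */
static void pop_and_defer_put(struct obj *o, struct obj **to_put)
{
    *to_put = o;                 /* caller will obj_put() later */
}

int main(void)
{
    struct obj *child = malloc(sizeof(*child));
    struct obj *to_put = NULL;

    child->refs = 0;
    child->name = "child";
    obj_get(child);              /* reference taken when it was queued */

    pop_and_defer_put(child, &to_put);

    /* ... still safe to use data reachable through @child here ... */
    printf("still using %s after pop\n", child->name);

    if (to_put)
        obj_put(to_put);         /* only now may the object go away */
    return 0;
}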
+ */ + bio = throtl_pop_queued(&sq->queued[rw], &tg_to_put); + sq->nr_queued[rw]--; throtl_charge_bio(tg, bio); - bio_list_add(bl, bio); - bio->bi_rw |= REQ_THROTTLED; - throtl_trim_slice(td, tg, rw); + /* + * If our parent is another tg, we just need to transfer @bio to + * the parent using throtl_add_bio_tg(). If our parent is + * @td->service_queue, @bio is ready to be issued. Put it on its + * bio_lists[] and decrease total number queued. The caller is + * responsible for issuing these bios. + */ + if (parent_tg) { + throtl_add_bio_tg(bio, &tg->qnode_on_parent[rw], parent_tg); + start_parent_slice_with_credit(tg, parent_tg, rw); + } else { + throtl_qnode_add_bio(bio, &tg->qnode_on_parent[rw], + &parent_sq->queued[rw]); + BUG_ON(tg->td->nr_queued[rw] <= 0); + tg->td->nr_queued[rw]--; + } + + throtl_trim_slice(tg, rw); + + if (tg_to_put) + blkg_put(tg_to_blkg(tg_to_put)); } -static int throtl_dispatch_tg(struct throtl_data *td, struct throtl_grp *tg, - struct bio_list *bl) +static int throtl_dispatch_tg(struct throtl_grp *tg) { + struct throtl_service_queue *sq = &tg->service_queue; unsigned int nr_reads = 0, nr_writes = 0; unsigned int max_nr_reads = throtl_grp_quantum*3/4; unsigned int max_nr_writes = throtl_grp_quantum - max_nr_reads; @@ -774,20 +1127,20 @@ static int throtl_dispatch_tg(struct throtl_data *td, struct throtl_grp *tg, /* Try to dispatch 75% READS and 25% WRITES */ - while ((bio = bio_list_peek(&tg->bio_lists[READ])) - && tg_may_dispatch(td, tg, bio, NULL)) { + while ((bio = throtl_peek_queued(&sq->queued[READ])) && + tg_may_dispatch(tg, bio, NULL)) { - tg_dispatch_one_bio(td, tg, bio_data_dir(bio), bl); + tg_dispatch_one_bio(tg, bio_data_dir(bio)); nr_reads++; if (nr_reads >= max_nr_reads) break; } - while ((bio = bio_list_peek(&tg->bio_lists[WRITE])) - && tg_may_dispatch(td, tg, bio, NULL)) { + while ((bio = throtl_peek_queued(&sq->queued[WRITE])) && + tg_may_dispatch(tg, bio, NULL)) { - tg_dispatch_one_bio(td, tg, bio_data_dir(bio), bl); + tg_dispatch_one_bio(tg, bio_data_dir(bio)); nr_writes++; if (nr_writes >= max_nr_writes) @@ -797,14 +1150,13 @@ static int throtl_dispatch_tg(struct throtl_data *td, struct throtl_grp *tg, return nr_reads + nr_writes; } -static int throtl_select_dispatch(struct throtl_data *td, struct bio_list *bl) +static int throtl_select_dispatch(struct throtl_service_queue *parent_sq) { unsigned int nr_disp = 0; - struct throtl_grp *tg; - struct throtl_rb_root *st = &td->tg_service_tree; while (1) { - tg = throtl_rb_first(st); + struct throtl_grp *tg = throtl_rb_first(parent_sq); + struct throtl_service_queue *sq = &tg->service_queue; if (!tg) break; @@ -812,14 +1164,12 @@ static int throtl_select_dispatch(struct throtl_data *td, struct bio_list *bl) if (time_before(jiffies, tg->disptime)) break; - throtl_dequeue_tg(td, tg); + throtl_dequeue_tg(tg); - nr_disp += throtl_dispatch_tg(td, tg, bl); + nr_disp += throtl_dispatch_tg(tg); - if (tg->nr_queued[0] || tg->nr_queued[1]) { - tg_update_disptime(td, tg); - throtl_enqueue_tg(td, tg); - } + if (sq->nr_queued[0] || sq->nr_queued[1]) + tg_update_disptime(tg); if (nr_disp >= throtl_quantum) break; @@ -828,111 +1178,111 @@ static int throtl_select_dispatch(struct throtl_data *td, struct bio_list *bl) return nr_disp; } -static void throtl_process_limit_change(struct throtl_data *td) +/** + * throtl_pending_timer_fn - timer function for service_queue->pending_timer + * @arg: the throtl_service_queue being serviced + * + * This timer is armed when a child throtl_grp with active bio's become + * pending 
and queued on the service_queue's pending_tree and expires when + * the first child throtl_grp should be dispatched. This function + * dispatches bio's from the children throtl_grps to the parent + * service_queue. + * + * If the parent's parent is another throtl_grp, dispatching is propagated + * by either arming its pending_timer or repeating dispatch directly. If + * the top-level service_tree is reached, throtl_data->dispatch_work is + * kicked so that the ready bio's are issued. + */ +static void throtl_pending_timer_fn(unsigned long arg) { + struct throtl_service_queue *sq = (void *)arg; + struct throtl_grp *tg = sq_to_tg(sq); + struct throtl_data *td = sq_to_td(sq); struct request_queue *q = td->queue; - struct blkcg_gq *blkg, *n; - - if (!td->limits_changed) - return; - - xchg(&td->limits_changed, false); - - throtl_log(td, "limits changed"); - - list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) { - struct throtl_grp *tg = blkg_to_tg(blkg); + struct throtl_service_queue *parent_sq; + bool dispatched; + int ret; - if (!tg->limits_changed) - continue; + spin_lock_irq(q->queue_lock); +again: + parent_sq = sq->parent_sq; + dispatched = false; + + while (true) { + throtl_log(sq, "dispatch nr_queued=%u read=%u write=%u", + sq->nr_queued[READ] + sq->nr_queued[WRITE], + sq->nr_queued[READ], sq->nr_queued[WRITE]); + + ret = throtl_select_dispatch(sq); + if (ret) { + throtl_log(sq, "bios disp=%u", ret); + dispatched = true; + } - if (!xchg(&tg->limits_changed, false)) - continue; + if (throtl_schedule_next_dispatch(sq, false)) + break; - throtl_log_tg(td, tg, "limit change rbps=%llu wbps=%llu" - " riops=%u wiops=%u", tg->bps[READ], tg->bps[WRITE], - tg->iops[READ], tg->iops[WRITE]); + /* this dispatch windows is still open, relax and repeat */ + spin_unlock_irq(q->queue_lock); + cpu_relax(); + spin_lock_irq(q->queue_lock); + } - /* - * Restart the slices for both READ and WRITES. It - * might happen that a group's limit are dropped - * suddenly and we don't want to account recently - * dispatched IO with new low rate - */ - throtl_start_new_slice(td, tg, 0); - throtl_start_new_slice(td, tg, 1); + if (!dispatched) + goto out_unlock; - if (throtl_tg_on_rr(tg)) - tg_update_disptime(td, tg); + if (parent_sq) { + /* @parent_sq is another throl_grp, propagate dispatch */ + if (tg->flags & THROTL_TG_WAS_EMPTY) { + tg_update_disptime(tg); + if (!throtl_schedule_next_dispatch(parent_sq, false)) { + /* window is already open, repeat dispatching */ + sq = parent_sq; + tg = sq_to_tg(sq); + goto again; + } + } + } else { + /* reached the top-level, queue issueing */ + queue_work(kthrotld_workqueue, &td->dispatch_work); } +out_unlock: + spin_unlock_irq(q->queue_lock); } -/* Dispatch throttled bios. Should be called without queue lock held. */ -static int throtl_dispatch(struct request_queue *q) +/** + * blk_throtl_dispatch_work_fn - work function for throtl_data->dispatch_work + * @work: work item being executed + * + * This function is queued for execution when bio's reach the bio_lists[] + * of throtl_data->service_queue. Those bio's are ready and issued by this + * function. 
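Read together, the timer function and the work item describe a level-by-level climb: bios dispatched from a child land on its parent's queue, and either the parent's pending_timer is armed or, if the parent's dispatch window is already open, dispatching repeats immediately at that level; only when the top-level service_queue is reached does dispatch_work actually issue anything. The following is a compressed userspace sketch of that climb; level, climb_and_issue and the window_open flag are illustrative stand-ins, with window_open playing the role of throtl_schedule_next_dispatch() telling the caller to keep dispatching.

#include <stdbool.h>
#include <stdio.h>

struct level {
    const char *name;
    struct level *parent;
    int queued;             /* bios waiting at this level */
    bool window_open;       /* stand-in for "next disptime is not in the future" */
};

/* Move everything queued at @l up one level, or "issue" it at the top. */
static void dispatch_level(struct level *l)
{
    if (l->parent) {
        l->parent->queued += l->queued;
        printf("%s -> %s: moved %d bios\n", l->name, l->parent->name, l->queued);
    } else {
        printf("%s: issuing %d bios (top level, kick dispatch_work)\n",
               l->name, l->queued);
    }
    l->queued = 0;
}

static void climb_and_issue(struct level *l)
{
    while (l) {
        dispatch_level(l);
        if (l->parent && !l->parent->window_open) {
            printf("%s: parent window closed, arm its pending_timer instead\n",
                   l->name);
            break;          /* the parent's timer continues later */
        }
        l = l->parent;      /* window already open: keep dispatching upwards */
    }
}

int main(void)
{
    struct level top   = { "td",       NULL, 0, true };
    struct level mid   = { "tg:mid",   &top, 0, true };
    struct level child = { "tg:child", &mid, 3, true };

    climb_and_issue(&child);
    return 0;
}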
+ */ +void blk_throtl_dispatch_work_fn(struct work_struct *work) { - struct throtl_data *td = q->td; - unsigned int nr_disp = 0; + struct throtl_data *td = container_of(work, struct throtl_data, + dispatch_work); + struct throtl_service_queue *td_sq = &td->service_queue; + struct request_queue *q = td->queue; struct bio_list bio_list_on_stack; struct bio *bio; struct blk_plug plug; - - spin_lock_irq(q->queue_lock); - - throtl_process_limit_change(td); - - if (!total_nr_queued(td)) - goto out; + int rw; bio_list_init(&bio_list_on_stack); - throtl_log(td, "dispatch nr_queued=%u read=%u write=%u", - total_nr_queued(td), td->nr_queued[READ], - td->nr_queued[WRITE]); - - nr_disp = throtl_select_dispatch(td, &bio_list_on_stack); - - if (nr_disp) - throtl_log(td, "bios disp=%u", nr_disp); - - throtl_schedule_next_dispatch(td); -out: + spin_lock_irq(q->queue_lock); + for (rw = READ; rw <= WRITE; rw++) + while ((bio = throtl_pop_queued(&td_sq->queued[rw], NULL))) + bio_list_add(&bio_list_on_stack, bio); spin_unlock_irq(q->queue_lock); - /* - * If we dispatched some requests, unplug the queue to make sure - * immediate dispatch - */ - if (nr_disp) { + if (!bio_list_empty(&bio_list_on_stack)) { blk_start_plug(&plug); while((bio = bio_list_pop(&bio_list_on_stack))) generic_make_request(bio); blk_finish_plug(&plug); } - return nr_disp; -} - -void blk_throtl_work(struct work_struct *work) -{ - struct throtl_data *td = container_of(work, struct throtl_data, - throtl_work.work); - struct request_queue *q = td->queue; - - throtl_dispatch(q); -} - -/* Call with queue lock held */ -static void -throtl_schedule_delayed_work(struct throtl_data *td, unsigned long delay) -{ - - struct delayed_work *dwork = &td->throtl_work; - - /* schedule work if limits changed even if no bio is queued */ - if (total_nr_queued(td) || td->limits_changed) { - mod_delayed_work(kthrotld_workqueue, dwork, delay); - throtl_log(td, "schedule work. 
delay=%lu jiffies=%lu", - delay, jiffies); - } } static u64 tg_prfill_cpu_rwstat(struct seq_file *sf, @@ -953,10 +1303,10 @@ static u64 tg_prfill_cpu_rwstat(struct seq_file *sf, return __blkg_prfill_rwstat(sf, pd, &rwstat); } -static int tg_print_cpu_rwstat(struct cgroup *cgrp, struct cftype *cft, - struct seq_file *sf) +static int tg_print_cpu_rwstat(struct cgroup_subsys_state *css, + struct cftype *cft, struct seq_file *sf) { - struct blkcg *blkcg = cgroup_to_blkcg(cgrp); + struct blkcg *blkcg = css_to_blkcg(css); blkcg_print_blkgs(sf, blkcg, tg_prfill_cpu_rwstat, &blkcg_policy_throtl, cft->private, true); @@ -985,29 +1335,31 @@ static u64 tg_prfill_conf_uint(struct seq_file *sf, struct blkg_policy_data *pd, return __blkg_prfill_u64(sf, pd, v); } -static int tg_print_conf_u64(struct cgroup *cgrp, struct cftype *cft, - struct seq_file *sf) +static int tg_print_conf_u64(struct cgroup_subsys_state *css, + struct cftype *cft, struct seq_file *sf) { - blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp), tg_prfill_conf_u64, + blkcg_print_blkgs(sf, css_to_blkcg(css), tg_prfill_conf_u64, &blkcg_policy_throtl, cft->private, false); return 0; } -static int tg_print_conf_uint(struct cgroup *cgrp, struct cftype *cft, - struct seq_file *sf) +static int tg_print_conf_uint(struct cgroup_subsys_state *css, + struct cftype *cft, struct seq_file *sf) { - blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp), tg_prfill_conf_uint, + blkcg_print_blkgs(sf, css_to_blkcg(css), tg_prfill_conf_uint, &blkcg_policy_throtl, cft->private, false); return 0; } -static int tg_set_conf(struct cgroup *cgrp, struct cftype *cft, const char *buf, - bool is_u64) +static int tg_set_conf(struct cgroup_subsys_state *css, struct cftype *cft, + const char *buf, bool is_u64) { - struct blkcg *blkcg = cgroup_to_blkcg(cgrp); + struct blkcg *blkcg = css_to_blkcg(css); struct blkg_conf_ctx ctx; struct throtl_grp *tg; - struct throtl_data *td; + struct throtl_service_queue *sq; + struct blkcg_gq *blkg; + struct cgroup_subsys_state *pos_css; int ret; ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx); @@ -1015,7 +1367,7 @@ static int tg_set_conf(struct cgroup *cgrp, struct cftype *cft, const char *buf, return ret; tg = blkg_to_tg(ctx.blkg); - td = ctx.blkg->q->td; + sq = &tg->service_queue; if (!ctx.v) ctx.v = -1; @@ -1025,25 +1377,51 @@ static int tg_set_conf(struct cgroup *cgrp, struct cftype *cft, const char *buf, else *(unsigned int *)((void *)tg + cft->private) = ctx.v; - /* XXX: we don't need the following deferred processing */ - xchg(&tg->limits_changed, true); - xchg(&td->limits_changed, true); - throtl_schedule_delayed_work(td, 0); + throtl_log(&tg->service_queue, + "limit change rbps=%llu wbps=%llu riops=%u wiops=%u", + tg->bps[READ], tg->bps[WRITE], + tg->iops[READ], tg->iops[WRITE]); + + /* + * Update has_rules[] flags for the updated tg's subtree. A tg is + * considered to have rules if either the tg itself or any of its + * ancestors has rules. This identifies groups without any + * restrictions in the whole hierarchy and allows them to bypass + * blk-throttle. + */ + blkg_for_each_descendant_pre(blkg, pos_css, ctx.blkg) + tg_update_has_rules(blkg_to_tg(blkg)); + + /* + * We're already holding queue_lock and know @tg is valid. Let's + * apply the new config directly. + * + * Restart the slices for both READ and WRITES. It might happen + * that a group's limit are dropped suddenly and we don't want to + * account recently dispatched IO with new low rate. 
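Restarting the slices on a configuration write matters because the budget a group may dispatch is roughly rate multiplied by the time since slice_start: I/O dispatched under an old, generous limit would otherwise be charged against the new, lower rate and could stall the group for a long time. A rough userspace version of the arithmetic behind tg_with_in_bps_limit() is sketched below, with HZ, jiffies and the rounding simplified.

#include <stdio.h>
#include <stdint.h>

#define HZ 1000  /* pretend jiffies tick at 1 kHz */

/*
 * Return 0 if @bio_bytes fits in the budget earned since slice_start,
 * otherwise return roughly how many jiffies to wait, which is what
 * tg_with_in_bps_limit() computes for the caller.
 */
static unsigned long bps_wait(uint64_t bps, uint64_t bytes_disp,
                              unsigned long slice_start, unsigned long now,
                              unsigned int bio_bytes)
{
    uint64_t allowed = bps * (now - slice_start) / HZ;
    uint64_t extra;

    if (bytes_disp + bio_bytes <= allowed)
        return 0;                                   /* within rate, dispatch now */

    extra = bytes_disp + bio_bytes - allowed;
    return (unsigned long)((extra * HZ + bps - 1) / bps);   /* round up */
}

int main(void)
{
    /* 1 MB/s limit, 512 KB already dispatched in the first 100 ms. */
    uint64_t bps = 1 << 20;
    unsigned long w = bps_wait(bps, 512 * 1024, 0, 100, 64 * 1024);

    /* Budget so far is ~104 KB, so the 64 KB bio must wait ~463 more jiffies. */
    printf("wait %lu jiffies\n", w);
    return 0;
}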
+ */ + throtl_start_new_slice(tg, 0); + throtl_start_new_slice(tg, 1); + + if (tg->flags & THROTL_TG_PENDING) { + tg_update_disptime(tg); + throtl_schedule_next_dispatch(sq->parent_sq, true); + } blkg_conf_finish(&ctx); return 0; } -static int tg_set_conf_u64(struct cgroup *cgrp, struct cftype *cft, +static int tg_set_conf_u64(struct cgroup_subsys_state *css, struct cftype *cft, const char *buf) { - return tg_set_conf(cgrp, cft, buf, true); + return tg_set_conf(css, cft, buf, true); } -static int tg_set_conf_uint(struct cgroup *cgrp, struct cftype *cft, +static int tg_set_conf_uint(struct cgroup_subsys_state *css, struct cftype *cft, const char *buf) { - return tg_set_conf(cgrp, cft, buf, false); + return tg_set_conf(css, cft, buf, false); } static struct cftype throtl_files[] = { @@ -1092,7 +1470,7 @@ static void throtl_shutdown_wq(struct request_queue *q) { struct throtl_data *td = q->td; - cancel_delayed_work_sync(&td->throtl_work); + cancel_work_sync(&td->dispatch_work); } static struct blkcg_policy blkcg_policy_throtl = { @@ -1100,6 +1478,7 @@ static struct blkcg_policy blkcg_policy_throtl = { .cftypes = throtl_files, .pd_init_fn = throtl_pd_init, + .pd_online_fn = throtl_pd_online, .pd_exit_fn = throtl_pd_exit, .pd_reset_stats_fn = throtl_pd_reset_stats, }; @@ -1107,15 +1486,16 @@ static struct blkcg_policy blkcg_policy_throtl = { bool blk_throtl_bio(struct request_queue *q, struct bio *bio) { struct throtl_data *td = q->td; + struct throtl_qnode *qn = NULL; struct throtl_grp *tg; - bool rw = bio_data_dir(bio), update_disptime = true; + struct throtl_service_queue *sq; + bool rw = bio_data_dir(bio); struct blkcg *blkcg; bool throttled = false; - if (bio->bi_rw & REQ_THROTTLED) { - bio->bi_rw &= ~REQ_THROTTLED; + /* see throtl_charge_bio() */ + if (bio->bi_rw & REQ_THROTTLED) goto out; - } /* * A throtl_grp pointer retrieved under rcu can be used to access @@ -1126,7 +1506,7 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio) blkcg = bio_blkcg(bio); tg = throtl_lookup_tg(td, blkcg); if (tg) { - if (tg_no_rule_group(tg, rw)) { + if (!tg->has_rules[rw]) { throtl_update_dispatch_stats(tg_to_blkg(tg), bio->bi_size, bio->bi_rw); goto out_unlock_rcu; @@ -1142,18 +1522,18 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio) if (unlikely(!tg)) goto out_unlock; - if (tg->nr_queued[rw]) { - /* - * There is already another bio queued in same dir. No - * need to update dispatch time. - */ - update_disptime = false; - goto queue_bio; + sq = &tg->service_queue; - } + while (true) { + /* throtl is FIFO - if bios are already queued, should queue */ + if (sq->nr_queued[rw]) + break; + + /* if above limits, break to queue */ + if (!tg_may_dispatch(tg, bio, NULL)) + break; - /* Bio is with-in rate limit of group */ - if (tg_may_dispatch(td, tg, bio, NULL)) { + /* within limits, let's charge and dispatch directly */ throtl_charge_bio(tg, bio); /* @@ -1167,25 +1547,41 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio) * * So keep on trimming slice even if bio is not queued. */ - throtl_trim_slice(td, tg, rw); - goto out_unlock; + throtl_trim_slice(tg, rw); + + /* + * @bio passed through this layer without being throttled. + * Climb up the ladder. If we''re already at the top, it + * can be executed directly. + */ + qn = &tg->qnode_on_parent[rw]; + sq = sq->parent_sq; + tg = sq_to_tg(sq); + if (!tg) + goto out_unlock; } -queue_bio: - throtl_log_tg(td, tg, "[%c] bio. bdisp=%llu sz=%u bps=%llu" - " iodisp=%u iops=%u queued=%d/%d", - rw == READ ? 
'R' : 'W', - tg->bytes_disp[rw], bio->bi_size, tg->bps[rw], - tg->io_disp[rw], tg->iops[rw], - tg->nr_queued[READ], tg->nr_queued[WRITE]); + /* out-of-limit, queue to @tg */ + throtl_log(sq, "[%c] bio. bdisp=%llu sz=%u bps=%llu iodisp=%u iops=%u queued=%d/%d", + rw == READ ? 'R' : 'W', + tg->bytes_disp[rw], bio->bi_size, tg->bps[rw], + tg->io_disp[rw], tg->iops[rw], + sq->nr_queued[READ], sq->nr_queued[WRITE]); bio_associate_current(bio); - throtl_add_bio_tg(q->td, tg, bio); + tg->td->nr_queued[rw]++; + throtl_add_bio_tg(bio, qn, tg); throttled = true; - if (update_disptime) { - tg_update_disptime(td, tg); - throtl_schedule_next_dispatch(td); + /* + * Update @tg's dispatch time and force schedule dispatch if @tg + * was empty before @bio. The forced scheduling isn't likely to + * cause undue delay as @bio is likely to be dispatched directly if + * its @tg's disptime is not in the future. + */ + if (tg->flags & THROTL_TG_WAS_EMPTY) { + tg_update_disptime(tg); + throtl_schedule_next_dispatch(tg->service_queue.parent_sq, true); } out_unlock: @@ -1193,9 +1589,38 @@ out_unlock: out_unlock_rcu: rcu_read_unlock(); out: + /* + * As multiple blk-throtls may stack in the same issue path, we + * don't want bios to leave with the flag set. Clear the flag if + * being issued. + */ + if (!throttled) + bio->bi_rw &= ~REQ_THROTTLED; return throttled; } +/* + * Dispatch all bios from all children tg's queued on @parent_sq. On + * return, @parent_sq is guaranteed to not have any active children tg's + * and all bios from previously active tg's are on @parent_sq->bio_lists[]. + */ +static void tg_drain_bios(struct throtl_service_queue *parent_sq) +{ + struct throtl_grp *tg; + + while ((tg = throtl_rb_first(parent_sq))) { + struct throtl_service_queue *sq = &tg->service_queue; + struct bio *bio; + + throtl_dequeue_tg(tg); + + while ((bio = throtl_peek_queued(&sq->queued[READ]))) + tg_dispatch_one_bio(tg, bio_data_dir(bio)); + while ((bio = throtl_peek_queued(&sq->queued[WRITE]))) + tg_dispatch_one_bio(tg, bio_data_dir(bio)); + } +} + /** * blk_throtl_drain - drain throttled bios * @q: request_queue to drain throttled bios for @@ -1206,27 +1631,34 @@ void blk_throtl_drain(struct request_queue *q) __releases(q->queue_lock) __acquires(q->queue_lock) { struct throtl_data *td = q->td; - struct throtl_rb_root *st = &td->tg_service_tree; - struct throtl_grp *tg; - struct bio_list bl; + struct blkcg_gq *blkg; + struct cgroup_subsys_state *pos_css; struct bio *bio; + int rw; queue_lockdep_assert_held(q); + rcu_read_lock(); - bio_list_init(&bl); + /* + * Drain each tg while doing post-order walk on the blkg tree, so + * that all bios are propagated to td->service_queue. It'd be + * better to walk service_queue tree directly but blkg walk is + * easier. 
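The post-order walk is what makes the drain a single pass: every child is visited before its parent, so by the time a parent tg is drained, the bios its children pushed up are already sitting on the parent's queue and move along in the same sweep. Below is a toy post-order drain over a three-node tree; tnode, drain_one and drain_post_order are invented names for illustration only.

#include <stdio.h>
#include <stddef.h>

struct tnode {
    const char *name;
    struct tnode *child[2];
    int queued;
};

/* Move this node's bios to its parent, or count them as issued at the root. */
static void drain_one(struct tnode *n, struct tnode *parent, int *issued)
{
    if (parent)
        parent->queued += n->queued;
    else
        *issued += n->queued;
    n->queued = 0;
}

/* Post-order: children first, so their bios reach @n before @n is drained. */
static void drain_post_order(struct tnode *n, struct tnode *parent, int *issued)
{
    for (int i = 0; i < 2; i++)
        if (n->child[i])
            drain_post_order(n->child[i], n, issued);
    drain_one(n, parent, issued);
}

int main(void)
{
    struct tnode c1   = { "c1",   { NULL, NULL }, 2 };
    struct tnode c2   = { "c2",   { NULL, NULL }, 1 };
    struct tnode root = { "root", { &c1,  &c2  }, 4 };
    int issued = 0;

    drain_post_order(&root, NULL, &issued);
    printf("issued %d bios\n", issued);   /* 2 + 1 + 4 = 7 */
    return 0;
}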
+ */ + blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) + tg_drain_bios(&blkg_to_tg(blkg)->service_queue); - while ((tg = throtl_rb_first(st))) { - throtl_dequeue_tg(td, tg); + /* finally, transfer bios from top-level tg's into the td */ + tg_drain_bios(&td->service_queue); - while ((bio = bio_list_peek(&tg->bio_lists[READ]))) - tg_dispatch_one_bio(td, tg, bio_data_dir(bio), &bl); - while ((bio = bio_list_peek(&tg->bio_lists[WRITE]))) - tg_dispatch_one_bio(td, tg, bio_data_dir(bio), &bl); - } + rcu_read_unlock(); spin_unlock_irq(q->queue_lock); - while ((bio = bio_list_pop(&bl))) - generic_make_request(bio); + /* all bios now should be in td->service_queue, issue them */ + for (rw = READ; rw <= WRITE; rw++) + while ((bio = throtl_pop_queued(&td->service_queue.queued[rw], + NULL))) + generic_make_request(bio); spin_lock_irq(q->queue_lock); } @@ -1240,9 +1672,8 @@ int blk_throtl_init(struct request_queue *q) if (!td) return -ENOMEM; - td->tg_service_tree = THROTL_RB_ROOT; - td->limits_changed = false; - INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work); + INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn); + throtl_service_queue_init(&td->service_queue, NULL); q->td = td; td->queue = q; diff --git a/block/blk-timeout.c b/block/blk-timeout.c index 6e4744c..bba81c9 100644 --- a/block/blk-timeout.c +++ b/block/blk-timeout.c @@ -7,6 +7,7 @@ #include <linux/fault-inject.h> #include "blk.h" +#include "blk-mq.h" #ifdef CONFIG_FAIL_IO_TIMEOUT @@ -31,7 +32,7 @@ static int __init fail_io_timeout_debugfs(void) struct dentry *dir = fault_create_debugfs_attr("fail_io_timeout", NULL, &fail_io_timeout); - return IS_ERR(dir) ? PTR_ERR(dir) : 0; + return PTR_ERR_OR_ZERO(dir); } late_initcall(fail_io_timeout_debugfs); @@ -82,16 +83,25 @@ void blk_delete_timer(struct request *req) static void blk_rq_timed_out(struct request *req) { struct request_queue *q = req->q; - enum blk_eh_timer_return ret; + enum blk_eh_timer_return ret = BLK_EH_RESET_TIMER; - ret = q->rq_timed_out_fn(req); + if (q->rq_timed_out_fn) + ret = q->rq_timed_out_fn(req); switch (ret) { case BLK_EH_HANDLED: - __blk_complete_request(req); + /* Can we use req->errors here? 
*/ + if (q->mq_ops) + blk_mq_complete_request(req, req->errors); + else + __blk_complete_request(req); break; case BLK_EH_RESET_TIMER: + if (q->mq_ops) + blk_mq_add_timer(req); + else + blk_add_timer(req); + blk_clear_rq_complete(req); - blk_add_timer(req); break; case BLK_EH_NOT_HANDLED: /* @@ -107,6 +117,23 @@ static void blk_rq_timed_out(struct request *req) } } +void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout, + unsigned int *next_set) +{ + if (time_after_eq(jiffies, rq->deadline)) { + list_del_init(&rq->timeout_list); + + /* + * Check if we raced with end io completion + */ + if (!blk_mark_rq_complete(rq)) + blk_rq_timed_out(rq); + } else if (!*next_set || time_after(*next_timeout, rq->deadline)) { + *next_timeout = rq->deadline; + *next_set = 1; + } +} + void blk_rq_timed_out_timer(unsigned long data) { struct request_queue *q = (struct request_queue *) data; @@ -116,21 +143,8 @@ void blk_rq_timed_out_timer(unsigned long data) spin_lock_irqsave(q->queue_lock, flags); - list_for_each_entry_safe(rq, tmp, &q->timeout_list, timeout_list) { - if (time_after_eq(jiffies, rq->deadline)) { - list_del_init(&rq->timeout_list); - - /* - * Check if we raced with end io completion - */ - if (blk_mark_rq_complete(rq)) - continue; - blk_rq_timed_out(rq); - } else if (!next_set || time_after(next, rq->deadline)) { - next = rq->deadline; - next_set = 1; - } - } + list_for_each_entry_safe(rq, tmp, &q->timeout_list, timeout_list) + blk_rq_check_expired(rq, &next, &next_set); if (next_set) mod_timer(&q->timeout, round_jiffies_up(next)); @@ -156,15 +170,7 @@ void blk_abort_request(struct request *req) } EXPORT_SYMBOL_GPL(blk_abort_request); -/** - * blk_add_timer - Start timeout timer for a single request - * @req: request that is about to start running. - * - * Notes: - * Each request has its own timer, and as it is added to the queue, we - * set up the timer. When the request completes, we cancel the timer. - */ -void blk_add_timer(struct request *req) +void __blk_add_timer(struct request *req, struct list_head *timeout_list) { struct request_queue *q = req->q; unsigned long expiry; @@ -173,7 +179,6 @@ void blk_add_timer(struct request *req) return; BUG_ON(!list_empty(&req->timeout_list)); - BUG_ON(test_bit(REQ_ATOM_COMPLETE, &req->atomic_flags)); /* * Some LLDs, like scsi, peek at the timeout to prevent a @@ -183,7 +188,8 @@ void blk_add_timer(struct request *req) req->timeout = q->rq_timeout; req->deadline = jiffies + req->timeout; - list_add_tail(&req->timeout_list, &q->timeout_list); + if (timeout_list) + list_add_tail(&req->timeout_list, timeout_list); /* * If the timer isn't already pending or this timeout is earlier @@ -195,5 +201,19 @@ void blk_add_timer(struct request *req) if (!timer_pending(&q->timeout) || time_before(expiry, q->timeout.expires)) mod_timer(&q->timeout, expiry); + +} + +/** + * blk_add_timer - Start timeout timer for a single request + * @req: request that is about to start running. + * + * Notes: + * Each request has its own timer, and as it is added to the queue, we + * set up the timer. When the request completes, we cancel the timer. 
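Both blk_rq_check_expired() and __blk_add_timer() follow the same pattern for the per-queue timer: complete whatever is already past its deadline and keep the timer armed for the earliest deadline that remains. The snippet below is a small standalone version of that bookkeeping, with deadlines as plain integers rather than jiffies and the time_after()-style comparisons simplified.

#include <stdio.h>

/* Mirror of the next/next_set bookkeeping in blk_rq_check_expired(). */
static void check_expired(long deadline, long now,
                          long *next, int *next_set)
{
    if (now >= deadline) {
        printf("deadline %ld: expired, complete it now\n", deadline);
    } else if (!*next_set || *next > deadline) {
        *next = deadline;      /* earliest pending deadline seen so far */
        *next_set = 1;
    }
}

int main(void)
{
    long deadlines[] = { 90, 250, 130, 400 };
    long now = 100, next = 0;
    int next_set = 0;

    for (unsigned i = 0; i < sizeof(deadlines) / sizeof(deadlines[0]); i++)
        check_expired(deadlines[i], now, &next, &next_set);

    if (next_set)
        printf("re-arm timer for %ld\n", next);   /* 130, the earliest one left */
    return 0;
}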
+ */ +void blk_add_timer(struct request *req) +{ + __blk_add_timer(req, &req->q->timeout_list); } diff --git a/block/blk.h b/block/blk.h index e837b8f..c90e1d8 100644 --- a/block/blk.h +++ b/block/blk.h @@ -10,6 +10,7 @@ #define BLK_BATCH_REQ 32 extern struct kmem_cache *blk_requestq_cachep; +extern struct kmem_cache *request_cachep; extern struct kobj_type blk_queue_ktype; extern struct ida blk_queue_ida; @@ -34,14 +35,30 @@ bool __blk_end_bidi_request(struct request *rq, int error, unsigned int nr_bytes, unsigned int bidi_bytes); void blk_rq_timed_out_timer(unsigned long data); +void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout, + unsigned int *next_set); +void __blk_add_timer(struct request *req, struct list_head *timeout_list); void blk_delete_timer(struct request *); void blk_add_timer(struct request *); + +bool bio_attempt_front_merge(struct request_queue *q, struct request *req, + struct bio *bio); +bool bio_attempt_back_merge(struct request_queue *q, struct request *req, + struct bio *bio); +bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, + unsigned int *request_count); + +void blk_account_io_start(struct request *req, bool new_io); +void blk_account_io_completion(struct request *req, unsigned int bytes); +void blk_account_io_done(struct request *req); + /* * Internal atomic flags for request handling */ enum rq_atomic_flags { REQ_ATOM_COMPLETE = 0, + REQ_ATOM_STARTED, }; /* diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index d5cd313..4d5cec1 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1508,6 +1508,29 @@ static void cfq_init_cfqg_base(struct cfq_group *cfqg) } #ifdef CONFIG_CFQ_GROUP_IOSCHED +static void cfqg_stats_init(struct cfqg_stats *stats) +{ + blkg_rwstat_init(&stats->service_bytes); + blkg_rwstat_init(&stats->serviced); + blkg_rwstat_init(&stats->merged); + blkg_rwstat_init(&stats->service_time); + blkg_rwstat_init(&stats->wait_time); + blkg_rwstat_init(&stats->queued); + + blkg_stat_init(&stats->sectors); + blkg_stat_init(&stats->time); + +#ifdef CONFIG_DEBUG_BLK_CGROUP + blkg_stat_init(&stats->unaccounted_time); + blkg_stat_init(&stats->avg_queue_size_sum); + blkg_stat_init(&stats->avg_queue_size_samples); + blkg_stat_init(&stats->dequeue); + blkg_stat_init(&stats->group_wait_time); + blkg_stat_init(&stats->idle_time); + blkg_stat_init(&stats->empty_time); +#endif +} + static void cfq_pd_init(struct blkcg_gq *blkg) { struct cfq_group *cfqg = blkg_to_cfqg(blkg); @@ -1515,6 +1538,8 @@ static void cfq_pd_init(struct blkcg_gq *blkg) cfq_init_cfqg_base(cfqg); cfqg->weight = blkg->blkcg->cfq_weight; cfqg->leaf_weight = blkg->blkcg->cfq_leaf_weight; + cfqg_stats_init(&cfqg->stats); + cfqg_stats_init(&cfqg->dead_stats); } static void cfq_pd_offline(struct blkcg_gq *blkg) @@ -1607,12 +1632,11 @@ static u64 cfqg_prfill_weight_device(struct seq_file *sf, return __blkg_prfill_u64(sf, pd, cfqg->dev_weight); } -static int cfqg_print_weight_device(struct cgroup *cgrp, struct cftype *cft, - struct seq_file *sf) +static int cfqg_print_weight_device(struct cgroup_subsys_state *css, + struct cftype *cft, struct seq_file *sf) { - blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp), - cfqg_prfill_weight_device, &blkcg_policy_cfq, 0, - false); + blkcg_print_blkgs(sf, css_to_blkcg(css), cfqg_prfill_weight_device, + &blkcg_policy_cfq, 0, false); return 0; } @@ -1626,35 +1650,34 @@ static u64 cfqg_prfill_leaf_weight_device(struct seq_file *sf, return __blkg_prfill_u64(sf, pd, cfqg->dev_leaf_weight); } -static int 
cfqg_print_leaf_weight_device(struct cgroup *cgrp, +static int cfqg_print_leaf_weight_device(struct cgroup_subsys_state *css, struct cftype *cft, struct seq_file *sf) { - blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp), - cfqg_prfill_leaf_weight_device, &blkcg_policy_cfq, 0, - false); + blkcg_print_blkgs(sf, css_to_blkcg(css), cfqg_prfill_leaf_weight_device, + &blkcg_policy_cfq, 0, false); return 0; } -static int cfq_print_weight(struct cgroup *cgrp, struct cftype *cft, +static int cfq_print_weight(struct cgroup_subsys_state *css, struct cftype *cft, struct seq_file *sf) { - seq_printf(sf, "%u\n", cgroup_to_blkcg(cgrp)->cfq_weight); + seq_printf(sf, "%u\n", css_to_blkcg(css)->cfq_weight); return 0; } -static int cfq_print_leaf_weight(struct cgroup *cgrp, struct cftype *cft, - struct seq_file *sf) +static int cfq_print_leaf_weight(struct cgroup_subsys_state *css, + struct cftype *cft, struct seq_file *sf) { - seq_printf(sf, "%u\n", - cgroup_to_blkcg(cgrp)->cfq_leaf_weight); + seq_printf(sf, "%u\n", css_to_blkcg(css)->cfq_leaf_weight); return 0; } -static int __cfqg_set_weight_device(struct cgroup *cgrp, struct cftype *cft, - const char *buf, bool is_leaf_weight) +static int __cfqg_set_weight_device(struct cgroup_subsys_state *css, + struct cftype *cft, const char *buf, + bool is_leaf_weight) { - struct blkcg *blkcg = cgroup_to_blkcg(cgrp); + struct blkcg *blkcg = css_to_blkcg(css); struct blkg_conf_ctx ctx; struct cfq_group *cfqg; int ret; @@ -1680,22 +1703,22 @@ static int __cfqg_set_weight_device(struct cgroup *cgrp, struct cftype *cft, return ret; } -static int cfqg_set_weight_device(struct cgroup *cgrp, struct cftype *cft, - const char *buf) +static int cfqg_set_weight_device(struct cgroup_subsys_state *css, + struct cftype *cft, const char *buf) { - return __cfqg_set_weight_device(cgrp, cft, buf, false); + return __cfqg_set_weight_device(css, cft, buf, false); } -static int cfqg_set_leaf_weight_device(struct cgroup *cgrp, struct cftype *cft, - const char *buf) +static int cfqg_set_leaf_weight_device(struct cgroup_subsys_state *css, + struct cftype *cft, const char *buf) { - return __cfqg_set_weight_device(cgrp, cft, buf, true); + return __cfqg_set_weight_device(css, cft, buf, true); } -static int __cfq_set_weight(struct cgroup *cgrp, struct cftype *cft, u64 val, - bool is_leaf_weight) +static int __cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft, + u64 val, bool is_leaf_weight) { - struct blkcg *blkcg = cgroup_to_blkcg(cgrp); + struct blkcg *blkcg = css_to_blkcg(css); struct blkcg_gq *blkg; if (val < CFQ_WEIGHT_MIN || val > CFQ_WEIGHT_MAX) @@ -1727,30 +1750,32 @@ static int __cfq_set_weight(struct cgroup *cgrp, struct cftype *cft, u64 val, return 0; } -static int cfq_set_weight(struct cgroup *cgrp, struct cftype *cft, u64 val) +static int cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft, + u64 val) { - return __cfq_set_weight(cgrp, cft, val, false); + return __cfq_set_weight(css, cft, val, false); } -static int cfq_set_leaf_weight(struct cgroup *cgrp, struct cftype *cft, u64 val) +static int cfq_set_leaf_weight(struct cgroup_subsys_state *css, + struct cftype *cft, u64 val) { - return __cfq_set_weight(cgrp, cft, val, true); + return __cfq_set_weight(css, cft, val, true); } -static int cfqg_print_stat(struct cgroup *cgrp, struct cftype *cft, +static int cfqg_print_stat(struct cgroup_subsys_state *css, struct cftype *cft, struct seq_file *sf) { - struct blkcg *blkcg = cgroup_to_blkcg(cgrp); + struct blkcg *blkcg = css_to_blkcg(css); blkcg_print_blkgs(sf, 
blkcg, blkg_prfill_stat, &blkcg_policy_cfq, cft->private, false); return 0; } -static int cfqg_print_rwstat(struct cgroup *cgrp, struct cftype *cft, - struct seq_file *sf) +static int cfqg_print_rwstat(struct cgroup_subsys_state *css, + struct cftype *cft, struct seq_file *sf) { - struct blkcg *blkcg = cgroup_to_blkcg(cgrp); + struct blkcg *blkcg = css_to_blkcg(css); blkcg_print_blkgs(sf, blkcg, blkg_prfill_rwstat, &blkcg_policy_cfq, cft->private, true); @@ -1773,20 +1798,20 @@ static u64 cfqg_prfill_rwstat_recursive(struct seq_file *sf, return __blkg_prfill_rwstat(sf, pd, &sum); } -static int cfqg_print_stat_recursive(struct cgroup *cgrp, struct cftype *cft, - struct seq_file *sf) +static int cfqg_print_stat_recursive(struct cgroup_subsys_state *css, + struct cftype *cft, struct seq_file *sf) { - struct blkcg *blkcg = cgroup_to_blkcg(cgrp); + struct blkcg *blkcg = css_to_blkcg(css); blkcg_print_blkgs(sf, blkcg, cfqg_prfill_stat_recursive, &blkcg_policy_cfq, cft->private, false); return 0; } -static int cfqg_print_rwstat_recursive(struct cgroup *cgrp, struct cftype *cft, - struct seq_file *sf) +static int cfqg_print_rwstat_recursive(struct cgroup_subsys_state *css, + struct cftype *cft, struct seq_file *sf) { - struct blkcg *blkcg = cgroup_to_blkcg(cgrp); + struct blkcg *blkcg = css_to_blkcg(css); blkcg_print_blkgs(sf, blkcg, cfqg_prfill_rwstat_recursive, &blkcg_policy_cfq, cft->private, true); @@ -1803,17 +1828,17 @@ static u64 cfqg_prfill_avg_queue_size(struct seq_file *sf, if (samples) { v = blkg_stat_read(&cfqg->stats.avg_queue_size_sum); - do_div(v, samples); + v = div64_u64(v, samples); } __blkg_prfill_u64(sf, pd, v); return 0; } /* print avg_queue_size */ -static int cfqg_print_avg_queue_size(struct cgroup *cgrp, struct cftype *cft, - struct seq_file *sf) +static int cfqg_print_avg_queue_size(struct cgroup_subsys_state *css, + struct cftype *cft, struct seq_file *sf) { - struct blkcg *blkcg = cgroup_to_blkcg(cgrp); + struct blkcg *blkcg = css_to_blkcg(css); blkcg_print_blkgs(sf, blkcg, cfqg_prfill_avg_queue_size, &blkcg_policy_cfq, 0, false); @@ -4347,18 +4372,28 @@ static void cfq_exit_queue(struct elevator_queue *e) kfree(cfqd); } -static int cfq_init_queue(struct request_queue *q) +static int cfq_init_queue(struct request_queue *q, struct elevator_type *e) { struct cfq_data *cfqd; struct blkcg_gq *blkg __maybe_unused; int i, ret; + struct elevator_queue *eq; - cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node); - if (!cfqd) + eq = elevator_alloc(q, e); + if (!eq) return -ENOMEM; + cfqd = kzalloc_node(sizeof(*cfqd), GFP_KERNEL, q->node); + if (!cfqd) { + kobject_put(&eq->kobj); + return -ENOMEM; + } + eq->elevator_data = cfqd; + cfqd->queue = q; - q->elevator->elevator_data = cfqd; + spin_lock_irq(q->queue_lock); + q->elevator = eq; + spin_unlock_irq(q->queue_lock); /* Init root service tree */ cfqd->grp_service_tree = CFQ_RB_ROOT; @@ -4433,6 +4468,7 @@ static int cfq_init_queue(struct request_queue *q) out_free: kfree(cfqd); + kobject_put(&eq->kobj); return ret; } diff --git a/block/cmdline-parser.c b/block/cmdline-parser.c new file mode 100644 index 0000000..cc2637f --- /dev/null +++ b/block/cmdline-parser.c @@ -0,0 +1,250 @@ +/* + * Parse command line, get partition information + * + * Written by Cai Zhiyong <caizhiyong@huawei.com> + * + */ +#include <linux/buffer_head.h> +#include <linux/module.h> +#include <linux/cmdline-parser.h> + +static int parse_subpart(struct cmdline_subpart **subpart, char *partdef) +{ + int ret = 0; + struct cmdline_subpart 
*new_subpart; + + *subpart = NULL; + + new_subpart = kzalloc(sizeof(struct cmdline_subpart), GFP_KERNEL); + if (!new_subpart) + return -ENOMEM; + + if (*partdef == '-') { + new_subpart->size = (sector_t)(~0ULL); + partdef++; + } else { + new_subpart->size = (sector_t)memparse(partdef, &partdef); + if (new_subpart->size < (sector_t)PAGE_SIZE) { + pr_warn("cmdline partition size is invalid."); + ret = -EINVAL; + goto fail; + } + } + + if (*partdef == '@') { + partdef++; + new_subpart->from = (sector_t)memparse(partdef, &partdef); + } else { + new_subpart->from = (sector_t)(~0ULL); + } + + if (*partdef == '(') { + int length; + char *next = strchr(++partdef, ')'); + + if (!next) { + pr_warn("cmdline partition format is invalid."); + ret = -EINVAL; + goto fail; + } + + length = min_t(int, next - partdef, + sizeof(new_subpart->name) - 1); + strncpy(new_subpart->name, partdef, length); + new_subpart->name[length] = '\0'; + + partdef = ++next; + } else + new_subpart->name[0] = '\0'; + + new_subpart->flags = 0; + + if (!strncmp(partdef, "ro", 2)) { + new_subpart->flags |= PF_RDONLY; + partdef += 2; + } + + if (!strncmp(partdef, "lk", 2)) { + new_subpart->flags |= PF_POWERUP_LOCK; + partdef += 2; + } + + *subpart = new_subpart; + return 0; +fail: + kfree(new_subpart); + return ret; +} + +static void free_subpart(struct cmdline_parts *parts) +{ + struct cmdline_subpart *subpart; + + while (parts->subpart) { + subpart = parts->subpart; + parts->subpart = subpart->next_subpart; + kfree(subpart); + } +} + +static int parse_parts(struct cmdline_parts **parts, const char *bdevdef) +{ + int ret = -EINVAL; + char *next; + int length; + struct cmdline_subpart **next_subpart; + struct cmdline_parts *newparts; + char buf[BDEVNAME_SIZE + 32 + 4]; + + *parts = NULL; + + newparts = kzalloc(sizeof(struct cmdline_parts), GFP_KERNEL); + if (!newparts) + return -ENOMEM; + + next = strchr(bdevdef, ':'); + if (!next) { + pr_warn("cmdline partition has no block device."); + goto fail; + } + + length = min_t(int, next - bdevdef, sizeof(newparts->name) - 1); + strncpy(newparts->name, bdevdef, length); + newparts->name[length] = '\0'; + newparts->nr_subparts = 0; + + next_subpart = &newparts->subpart; + + while (next && *(++next)) { + bdevdef = next; + next = strchr(bdevdef, ','); + + length = (!next) ? 
(sizeof(buf) - 1) : + min_t(int, next - bdevdef, sizeof(buf) - 1); + + strncpy(buf, bdevdef, length); + buf[length] = '\0'; + + ret = parse_subpart(next_subpart, buf); + if (ret) + goto fail; + + newparts->nr_subparts++; + next_subpart = &(*next_subpart)->next_subpart; + } + + if (!newparts->subpart) { + pr_warn("cmdline partition has no valid partition."); + ret = -EINVAL; + goto fail; + } + + *parts = newparts; + + return 0; +fail: + free_subpart(newparts); + kfree(newparts); + return ret; +} + +void cmdline_parts_free(struct cmdline_parts **parts) +{ + struct cmdline_parts *next_parts; + + while (*parts) { + next_parts = (*parts)->next_parts; + free_subpart(*parts); + kfree(*parts); + *parts = next_parts; + } +} + +int cmdline_parts_parse(struct cmdline_parts **parts, const char *cmdline) +{ + int ret; + char *buf; + char *pbuf; + char *next; + struct cmdline_parts **next_parts; + + *parts = NULL; + + next = pbuf = buf = kstrdup(cmdline, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + next_parts = parts; + + while (next && *pbuf) { + next = strchr(pbuf, ';'); + if (next) + *next = '\0'; + + ret = parse_parts(next_parts, pbuf); + if (ret) + goto fail; + + if (next) + pbuf = ++next; + + next_parts = &(*next_parts)->next_parts; + } + + if (!*parts) { + pr_warn("cmdline partition has no valid partition."); + ret = -EINVAL; + goto fail; + } + + ret = 0; +done: + kfree(buf); + return ret; + +fail: + cmdline_parts_free(parts); + goto done; +} + +struct cmdline_parts *cmdline_parts_find(struct cmdline_parts *parts, + const char *bdev) +{ + while (parts && strncmp(bdev, parts->name, sizeof(parts->name))) + parts = parts->next_parts; + return parts; +} + +/* + * add_part() + * 0 success. + * 1 can not add so many partitions. + */ +void cmdline_parts_set(struct cmdline_parts *parts, sector_t disk_size, + int slot, + int (*add_part)(int, struct cmdline_subpart *, void *), + void *param) + +{ + sector_t from = 0; + struct cmdline_subpart *subpart; + + for (subpart = parts->subpart; subpart; + subpart = subpart->next_subpart, slot++) { + if (subpart->from == (sector_t)(~0ULL)) + subpart->from = from; + else + from = subpart->from; + + if (from >= disk_size) + break; + + if (subpart->size > (disk_size - from)) + subpart->size = disk_size - from; + + from += subpart->size; + + if (add_part(slot, subpart, param)) + break; + } +} diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c index 7c668c8..fbd5a67 100644 --- a/block/compat_ioctl.c +++ b/block/compat_ioctl.c @@ -59,6 +59,7 @@ static int compat_hdio_getgeo(struct gendisk *disk, struct block_device *bdev, if (!disk->fops->getgeo) return -ENOTTY; + memset(&geo, 0, sizeof(geo)); /* * We need to set the startsect first, the driver may * want to override it. @@ -69,7 +70,7 @@ static int compat_hdio_getgeo(struct gendisk *disk, struct block_device *bdev, return ret; ret = copy_to_user(ugeo, &geo, 4); - ret |= __put_user(geo.start, &ugeo->start); + ret |= put_user(geo.start, &ugeo->start); if (ret) ret = -EFAULT; diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c index ba19a3a..9ef6640 100644 --- a/block/deadline-iosched.c +++ b/block/deadline-iosched.c @@ -337,14 +337,22 @@ static void deadline_exit_queue(struct elevator_queue *e) /* * initialize elevator private data (deadline_data). 
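For orientation, the layout pass in cmdline_parts_set() above works in 512-byte sectors: a subpart whose "from" was left at ~0ULL is placed right after the previous one, a start beyond the disk ends the walk, and each size is clamped to the remaining capacity (so a '-' size, parsed as ~0ULL, simply takes the rest of the device). The following is a minimal userspace sketch of that walk under simplified, made-up types, not the kernel structures:

	#include <stdio.h>
	#include <stdint.h>

	struct subpart { uint64_t from, size; };   /* sectors; from == ~0ULL means "auto" */

	static void lay_out(struct subpart *sp, int n, uint64_t disk_size)
	{
		uint64_t from = 0;

		for (int i = 0; i < n; i++) {
			if (sp[i].from == ~0ULL)            /* no explicit offset: follow previous */
				sp[i].from = from;
			else
				from = sp[i].from;

			if (from >= disk_size)              /* ran off the end of the disk */
				break;

			if (sp[i].size > disk_size - from)  /* clamp to remaining space */
				sp[i].size = disk_size - from;

			from += sp[i].size;
			printf("part%d: from=%llu size=%llu\n", i,
			       (unsigned long long)sp[i].from,
			       (unsigned long long)sp[i].size);
		}
	}

With a 2048-sector disk and subparts {auto, 512} and {auto, ~0ULL}, this yields 0..511 for the first and 512..2047 for the second, which is the behaviour the parser's '-' and omitted-offset cases rely on.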
*/ -static int deadline_init_queue(struct request_queue *q) +static int deadline_init_queue(struct request_queue *q, struct elevator_type *e) { struct deadline_data *dd; + struct elevator_queue *eq; - dd = kmalloc_node(sizeof(*dd), GFP_KERNEL | __GFP_ZERO, q->node); - if (!dd) + eq = elevator_alloc(q, e); + if (!eq) return -ENOMEM; + dd = kzalloc_node(sizeof(*dd), GFP_KERNEL, q->node); + if (!dd) { + kobject_put(&eq->kobj); + return -ENOMEM; + } + eq->elevator_data = dd; + INIT_LIST_HEAD(&dd->fifo_list[READ]); INIT_LIST_HEAD(&dd->fifo_list[WRITE]); dd->sort_list[READ] = RB_ROOT; @@ -355,7 +363,9 @@ static int deadline_init_queue(struct request_queue *q) dd->front_merges = 1; dd->fifo_batch = fifo_batch; - q->elevator->elevator_data = dd; + spin_lock_irq(q->queue_lock); + q->elevator = eq; + spin_unlock_irq(q->queue_lock); return 0; } diff --git a/block/elevator.c b/block/elevator.c index eba5b04..b7ff286 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -150,12 +150,12 @@ void __init load_default_elevator_module(void) static struct kobj_type elv_ktype; -static struct elevator_queue *elevator_alloc(struct request_queue *q, +struct elevator_queue *elevator_alloc(struct request_queue *q, struct elevator_type *e) { struct elevator_queue *eq; - eq = kmalloc_node(sizeof(*eq), GFP_KERNEL | __GFP_ZERO, q->node); + eq = kzalloc_node(sizeof(*eq), GFP_KERNEL, q->node); if (unlikely(!eq)) goto err; @@ -170,6 +170,7 @@ err: elevator_put(e); return NULL; } +EXPORT_SYMBOL(elevator_alloc); static void elevator_release(struct kobject *kobj) { @@ -185,6 +186,12 @@ int elevator_init(struct request_queue *q, char *name) struct elevator_type *e = NULL; int err; + /* + * q->sysfs_lock must be held to provide mutual exclusion between + * elevator_switch() and here. + */ + lockdep_assert_held(&q->sysfs_lock); + if (unlikely(q->elevator)) return 0; @@ -221,16 +228,7 @@ int elevator_init(struct request_queue *q, char *name) } } - q->elevator = elevator_alloc(q, e); - if (!q->elevator) - return -ENOMEM; - - err = e->ops.elevator_init_fn(q); - if (err) { - kobject_put(&q->elevator->kobj); - return err; - } - + err = e->ops.elevator_init_fn(q, e); return 0; } EXPORT_SYMBOL(elevator_init); @@ -935,16 +933,9 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) spin_unlock_irq(q->queue_lock); /* allocate, init and register new elevator */ - err = -ENOMEM; - q->elevator = elevator_alloc(q, new_e); - if (!q->elevator) - goto fail_init; - - err = new_e->ops.elevator_init_fn(q); - if (err) { - kobject_put(&q->elevator->kobj); + err = new_e->ops.elevator_init_fn(q, new_e); + if (err) goto fail_init; - } if (registered) { err = elv_register_queue(q); @@ -974,7 +965,7 @@ fail_init: /* * Switch this queue to the given IO scheduler. 
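The pattern every scheduler adopts in this series is the same: the init function now allocates the generic elevator_queue itself via elevator_alloc(), then its private data, attaches the private data to the queue object, and publishes q->elevator under the queue lock; if the private allocation fails, the queue object is dropped through its kobject reference rather than freed directly. A rough userspace analogue of that unwind order, with hypothetical names and a pthread mutex standing in for the queue spinlock:

	#include <stdlib.h>
	#include <pthread.h>

	struct eq    { void *private_data; };  /* refcounted via a kobject in the kernel */
	struct ddata { int dummy; };

	struct queue {
		pthread_mutex_t lock;          /* assumed initialized by the caller */
		struct eq *elevator;
	};

	static int init_queue(struct queue *q)
	{
		struct eq *eq = calloc(1, sizeof(*eq));
		if (!eq)
			return -1;

		struct ddata *d = calloc(1, sizeof(*d));
		if (!d) {
			free(eq);              /* kernel: kobject_put(&eq->kobj) */
			return -1;
		}
		eq->private_data = d;

		pthread_mutex_lock(&q->lock);  /* kernel: spin_lock_irq(q->queue_lock) */
		q->elevator = eq;
		pthread_mutex_unlock(&q->lock);
		return 0;
	}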
*/ -int elevator_change(struct request_queue *q, const char *name) +static int __elevator_change(struct request_queue *q, const char *name) { char elevator_name[ELV_NAME_MAX]; struct elevator_type *e; @@ -996,6 +987,18 @@ int elevator_change(struct request_queue *q, const char *name) return elevator_switch(q, e); } + +int elevator_change(struct request_queue *q, const char *name) +{ + int ret; + + /* Protect q->elevator from elevator_init() */ + mutex_lock(&q->sysfs_lock); + ret = __elevator_change(q, name); + mutex_unlock(&q->sysfs_lock); + + return ret; +} EXPORT_SYMBOL(elevator_change); ssize_t elv_iosched_store(struct request_queue *q, const char *name, @@ -1006,7 +1009,7 @@ ssize_t elv_iosched_store(struct request_queue *q, const char *name, if (!q->elevator) return count; - ret = elevator_change(q, name); + ret = __elevator_change(q, name); if (!ret) return count; diff --git a/block/genhd.c b/block/genhd.c index 20625ee..791f419 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -512,7 +512,7 @@ static void register_disk(struct gendisk *disk) ddev->parent = disk->driverfs_dev; - dev_set_name(ddev, disk->disk_name); + dev_set_name(ddev, "%s", disk->disk_name); /* delay uevents, until we scanned partition table */ dev_set_uevent_suppress(ddev, 1); @@ -1252,8 +1252,7 @@ struct gendisk *alloc_disk_node(int minors, int node_id) { struct gendisk *disk; - disk = kmalloc_node(sizeof(struct gendisk), - GFP_KERNEL | __GFP_ZERO, node_id); + disk = kzalloc_node(sizeof(struct gendisk), GFP_KERNEL, node_id); if (disk) { if (!init_part_stats(&disk->part0)) { kfree(disk); @@ -1489,9 +1488,11 @@ static void __disk_unblock_events(struct gendisk *disk, bool check_now) intv = disk_events_poll_jiffies(disk); set_timer_slack(&ev->dwork.timer, intv / 4); if (check_now) - queue_delayed_work(system_freezable_wq, &ev->dwork, 0); + queue_delayed_work(system_freezable_power_efficient_wq, + &ev->dwork, 0); else if (intv) - queue_delayed_work(system_freezable_wq, &ev->dwork, intv); + queue_delayed_work(system_freezable_power_efficient_wq, + &ev->dwork, intv); out_unlock: spin_unlock_irqrestore(&ev->lock, flags); } @@ -1534,7 +1535,8 @@ void disk_flush_events(struct gendisk *disk, unsigned int mask) spin_lock_irq(&ev->lock); ev->clearing |= mask; if (!ev->block) - mod_delayed_work(system_freezable_wq, &ev->dwork, 0); + mod_delayed_work(system_freezable_power_efficient_wq, + &ev->dwork, 0); spin_unlock_irq(&ev->lock); } @@ -1627,7 +1629,8 @@ static void disk_check_events(struct disk_events *ev, intv = disk_events_poll_jiffies(disk); if (!ev->block && intv) - queue_delayed_work(system_freezable_wq, &ev->dwork, intv); + queue_delayed_work(system_freezable_power_efficient_wq, + &ev->dwork, intv); spin_unlock_irq(&ev->lock); diff --git a/block/ioctl.c b/block/ioctl.c index a31d91d..7d5c3b2 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -64,7 +64,7 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user part = add_partition(disk, partno, start, length, ADDPART_FLAG_NONE, NULL); mutex_unlock(&bdev->bd_mutex); - return IS_ERR(part) ? 
PTR_ERR(part) : 0; + return PTR_ERR_OR_ZERO(part); case BLKPG_DEL_PARTITION: part = disk_get_part(disk, partno); if (!part) diff --git a/block/noop-iosched.c b/block/noop-iosched.c index 5d1bf70..3de89d4 100644 --- a/block/noop-iosched.c +++ b/block/noop-iosched.c @@ -59,16 +59,27 @@ noop_latter_request(struct request_queue *q, struct request *rq) return list_entry(rq->queuelist.next, struct request, queuelist); } -static int noop_init_queue(struct request_queue *q) +static int noop_init_queue(struct request_queue *q, struct elevator_type *e) { struct noop_data *nd; + struct elevator_queue *eq; + + eq = elevator_alloc(q, e); + if (!eq) + return -ENOMEM; nd = kmalloc_node(sizeof(*nd), GFP_KERNEL, q->node); - if (!nd) + if (!nd) { + kobject_put(&eq->kobj); return -ENOMEM; + } + eq->elevator_data = nd; INIT_LIST_HEAD(&nd->queue); - q->elevator->elevator_data = nd; + + spin_lock_irq(q->queue_lock); + q->elevator = eq; + spin_unlock_irq(q->queue_lock); return 0; } diff --git a/block/partitions/Kconfig b/block/partitions/Kconfig index 75a54e1..9b29a99 100644 --- a/block/partitions/Kconfig +++ b/block/partitions/Kconfig @@ -68,6 +68,17 @@ config ACORN_PARTITION_RISCIX of machines called RISCiX. If you say 'Y' here, Linux will be able to read disks partitioned under RISCiX. +config AIX_PARTITION + bool "AIX basic partition table support" if PARTITION_ADVANCED + help + Say Y here if you would like to be able to read the hard disk + partition table format used by IBM or Motorola PowerPC machines + running AIX. AIX actually uses a Logical Volume Manager, where + "logical volumes" can be spread across one or multiple disks, + but this driver works only for the simple case of partitions which + are contiguous. + Otherwise, say N. + config OSF_PARTITION bool "Alpha OSF partition support" if PARTITION_ADVANCED default y if ALPHA @@ -249,3 +260,10 @@ config SYSV68_PARTITION partition table format used by Motorola Delta machines (using sysv68). Otherwise, say N. + +config CMDLINE_PARTITION + bool "Command line partition support" if PARTITION_ADVANCED + select BLK_CMDLINE_PARSER + help + Say Y here if you want to read the partition table from bootargs. + The format for the command line is just like mtdparts. 
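As a concrete illustration of the help text above, a boot line handled by this parser could look like the following (the authoritative grammar is in Documentation/block/cmdline-partition.txt; this example is only inferred from the parse_subpart()/parse_parts() code earlier in the patch):

	blkdevparts=mmcblk0:1M(boot)ro,512K@2M(env),-(rootfs);mmcblk1:-(data)

Devices are separated by ';', the device name is followed by ':', partitions by ','; each partition is <size>[@<offset>][(<name>)] with optional "ro" and "lk" suffixes, a size of '-' meaning "the rest of the device", and sizes/offsets accepted in memparse() notation (K, M, G suffixes). An omitted offset places the partition directly after the previous one.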
diff --git a/block/partitions/Makefile b/block/partitions/Makefile index 03af8ea..37a9527 100644 --- a/block/partitions/Makefile +++ b/block/partitions/Makefile @@ -7,6 +7,8 @@ obj-$(CONFIG_BLOCK) := check.o obj-$(CONFIG_ACORN_PARTITION) += acorn.o obj-$(CONFIG_AMIGA_PARTITION) += amiga.o obj-$(CONFIG_ATARI_PARTITION) += atari.o +obj-$(CONFIG_AIX_PARTITION) += aix.o +obj-$(CONFIG_CMDLINE_PARTITION) += cmdline.o obj-$(CONFIG_MAC_PARTITION) += mac.o obj-$(CONFIG_LDM_PARTITION) += ldm.o obj-$(CONFIG_MSDOS_PARTITION) += msdos.o diff --git a/block/partitions/aix.c b/block/partitions/aix.c new file mode 100644 index 0000000..43be471 --- /dev/null +++ b/block/partitions/aix.c @@ -0,0 +1,293 @@ +/* + * fs/partitions/aix.c + * + * Copyright (C) 2012-2013 Philippe De Muyter <phdm@macqel.be> + */ + +#include "check.h" +#include "aix.h" + +struct lvm_rec { + char lvm_id[4]; /* "_LVM" */ + char reserved4[16]; + __be32 lvmarea_len; + __be32 vgda_len; + __be32 vgda_psn[2]; + char reserved36[10]; + __be16 pp_size; /* log2(pp_size) */ + char reserved46[12]; + __be16 version; + }; + +struct vgda { + __be32 secs; + __be32 usec; + char reserved8[16]; + __be16 numlvs; + __be16 maxlvs; + __be16 pp_size; + __be16 numpvs; + __be16 total_vgdas; + __be16 vgda_size; + }; + +struct lvd { + __be16 lv_ix; + __be16 res2; + __be16 res4; + __be16 maxsize; + __be16 lv_state; + __be16 mirror; + __be16 mirror_policy; + __be16 num_lps; + __be16 res10[8]; + }; + +struct lvname { + char name[64]; + }; + +struct ppe { + __be16 lv_ix; + unsigned short res2; + unsigned short res4; + __be16 lp_ix; + unsigned short res8[12]; + }; + +struct pvd { + char reserved0[16]; + __be16 pp_count; + char reserved18[2]; + __be32 psn_part1; + char reserved24[8]; + struct ppe ppe[1016]; + }; + +#define LVM_MAXLVS 256 + +/** + * last_lba(): return number of last logical block of device + * @bdev: block device + * + * Description: Returns last LBA value on success, 0 on error. + * This is stored (by sd and ide-geometry) in + * the part[0] entry for this disk, and is the number of + * physical sectors available on the disk. + */ +static u64 last_lba(struct block_device *bdev) +{ + if (!bdev || !bdev->bd_inode) + return 0; + return (bdev->bd_inode->i_size >> 9) - 1ULL; +} + +/** + * read_lba(): Read bytes from disk, starting at given LBA + * @state + * @lba + * @buffer + * @count + * + * Description: Reads @count bytes from @state->bdev into @buffer. + * Returns number of bytes read on success, 0 on error. + */ +static size_t read_lba(struct parsed_partitions *state, u64 lba, u8 *buffer, + size_t count) +{ + size_t totalreadcount = 0; + + if (!buffer || lba + count / 512 > last_lba(state->bdev)) + return 0; + + while (count) { + int copied = 512; + Sector sect; + unsigned char *data = read_part_sector(state, lba++, §); + if (!data) + break; + if (copied > count) + copied = count; + memcpy(buffer, data, copied); + put_dev_sector(sect); + buffer += copied; + totalreadcount += copied; + count -= copied; + } + return totalreadcount; +} + +/** + * alloc_pvd(): reads physical volume descriptor + * @state + * @lba + * + * Description: Returns pvd on success, NULL on error. + * Allocates space for pvd and fill it with disk blocks at @lba + * Notes: remember to free pvd when you're done! 
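The helpers above do their bookkeeping in 512-byte sectors: last_lba() turns the device size in bytes into the index of the last addressable LBA, and read_lba() refuses a request whose end would pass it (the real helper then walks the data a sector at a time via read_part_sector()). A minimal sketch of just the arithmetic, with hypothetical userspace names:

	#include <stdint.h>
	#include <stddef.h>
	#include <stdbool.h>

	#define SECTOR_SIZE 512

	/* index of the last addressable 512-byte sector on a device of dev_bytes bytes */
	static uint64_t last_lba(uint64_t dev_bytes)
	{
		return (dev_bytes >> 9) - 1ULL;
	}

	/* would a read of 'count' bytes starting at 'lba' stay on the device? */
	static bool read_in_bounds(uint64_t lba, size_t count, uint64_t dev_bytes)
	{
		return lba + count / SECTOR_SIZE <= last_lba(dev_bytes);
	}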
+ */ +static struct pvd *alloc_pvd(struct parsed_partitions *state, u32 lba) +{ + size_t count = sizeof(struct pvd); + struct pvd *p; + + p = kmalloc(count, GFP_KERNEL); + if (!p) + return NULL; + + if (read_lba(state, lba, (u8 *) p, count) < count) { + kfree(p); + return NULL; + } + return p; +} + +/** + * alloc_lvn(): reads logical volume names + * @state + * @lba + * + * Description: Returns lvn on success, NULL on error. + * Allocates space for lvn and fill it with disk blocks at @lba + * Notes: remember to free lvn when you're done! + */ +static struct lvname *alloc_lvn(struct parsed_partitions *state, u32 lba) +{ + size_t count = sizeof(struct lvname) * LVM_MAXLVS; + struct lvname *p; + + p = kmalloc(count, GFP_KERNEL); + if (!p) + return NULL; + + if (read_lba(state, lba, (u8 *) p, count) < count) { + kfree(p); + return NULL; + } + return p; +} + +int aix_partition(struct parsed_partitions *state) +{ + int ret = 0; + Sector sect; + unsigned char *d; + u32 pp_bytes_size; + u32 pp_blocks_size = 0; + u32 vgda_sector = 0; + u32 vgda_len = 0; + int numlvs = 0; + struct pvd *pvd; + struct lv_info { + unsigned short pps_per_lv; + unsigned short pps_found; + unsigned char lv_is_contiguous; + } *lvip; + struct lvname *n = NULL; + + d = read_part_sector(state, 7, §); + if (d) { + struct lvm_rec *p = (struct lvm_rec *)d; + u16 lvm_version = be16_to_cpu(p->version); + char tmp[64]; + + if (lvm_version == 1) { + int pp_size_log2 = be16_to_cpu(p->pp_size); + + pp_bytes_size = 1 << pp_size_log2; + pp_blocks_size = pp_bytes_size / 512; + snprintf(tmp, sizeof(tmp), + " AIX LVM header version %u found\n", + lvm_version); + vgda_len = be32_to_cpu(p->vgda_len); + vgda_sector = be32_to_cpu(p->vgda_psn[0]); + } else { + snprintf(tmp, sizeof(tmp), + " unsupported AIX LVM version %d found\n", + lvm_version); + } + strlcat(state->pp_buf, tmp, PAGE_SIZE); + put_dev_sector(sect); + } + if (vgda_sector && (d = read_part_sector(state, vgda_sector, §))) { + struct vgda *p = (struct vgda *)d; + + numlvs = be16_to_cpu(p->numlvs); + put_dev_sector(sect); + } + lvip = kzalloc(sizeof(struct lv_info) * state->limit, GFP_KERNEL); + if (!lvip) + return 0; + if (numlvs && (d = read_part_sector(state, vgda_sector + 1, §))) { + struct lvd *p = (struct lvd *)d; + int i; + + n = alloc_lvn(state, vgda_sector + vgda_len - 33); + if (n) { + int foundlvs = 0; + + for (i = 0; foundlvs < numlvs && i < state->limit; i += 1) { + lvip[i].pps_per_lv = be16_to_cpu(p[i].num_lps); + if (lvip[i].pps_per_lv) + foundlvs += 1; + } + } + put_dev_sector(sect); + } + pvd = alloc_pvd(state, vgda_sector + 17); + if (pvd) { + int numpps = be16_to_cpu(pvd->pp_count); + int psn_part1 = be32_to_cpu(pvd->psn_part1); + int i; + int cur_lv_ix = -1; + int next_lp_ix = 1; + int lp_ix; + + for (i = 0; i < numpps; i += 1) { + struct ppe *p = pvd->ppe + i; + unsigned int lv_ix; + + lp_ix = be16_to_cpu(p->lp_ix); + if (!lp_ix) { + next_lp_ix = 1; + continue; + } + lv_ix = be16_to_cpu(p->lv_ix) - 1; + if (lv_ix > state->limit) { + cur_lv_ix = -1; + continue; + } + lvip[lv_ix].pps_found += 1; + if (lp_ix == 1) { + cur_lv_ix = lv_ix; + next_lp_ix = 1; + } else if (lv_ix != cur_lv_ix || lp_ix != next_lp_ix) { + next_lp_ix = 1; + continue; + } + if (lp_ix == lvip[lv_ix].pps_per_lv) { + char tmp[70]; + + put_partition(state, lv_ix + 1, + (i + 1 - lp_ix) * pp_blocks_size + psn_part1, + lvip[lv_ix].pps_per_lv * pp_blocks_size); + snprintf(tmp, sizeof(tmp), " <%s>\n", + n[lv_ix].name); + strlcat(state->pp_buf, tmp, PAGE_SIZE); + lvip[lv_ix].lv_is_contiguous = 1; + ret 
= 1; + next_lp_ix = 1; + } else + next_lp_ix += 1; + } + for (i = 0; i < state->limit; i += 1) + if (lvip[i].pps_found && !lvip[i].lv_is_contiguous) + pr_warn("partition %s (%u pp's found) is " + "not contiguous\n", + n[i].name, lvip[i].pps_found); + kfree(pvd); + } + kfree(n); + kfree(lvip); + return ret; +} diff --git a/block/partitions/aix.h b/block/partitions/aix.h new file mode 100644 index 0000000..e0c66a9 --- /dev/null +++ b/block/partitions/aix.h @@ -0,0 +1 @@ +extern int aix_partition(struct parsed_partitions *state); diff --git a/block/partitions/check.c b/block/partitions/check.c index 19ba207..9ac1df7 100644 --- a/block/partitions/check.c +++ b/block/partitions/check.c @@ -34,6 +34,7 @@ #include "efi.h" #include "karma.h" #include "sysv68.h" +#include "cmdline.h" int warn_no_part = 1; /*This is ugly: should make genhd removable media aware*/ @@ -65,6 +66,9 @@ static int (*check_part[])(struct parsed_partitions *) = { adfspart_check_ADFS, #endif +#ifdef CONFIG_CMDLINE_PARTITION + cmdline_partition, +#endif #ifdef CONFIG_EFI_PARTITION efi_partition, /* this must come before msdos */ #endif diff --git a/block/partitions/cmdline.c b/block/partitions/cmdline.c new file mode 100644 index 0000000..5141b56 --- /dev/null +++ b/block/partitions/cmdline.c @@ -0,0 +1,99 @@ +/* + * Copyright (C) 2013 HUAWEI + * Author: Cai Zhiyong <caizhiyong@huawei.com> + * + * Read block device partition table from the command line. + * Typically used for fixed block (eMMC) embedded devices. + * It has no MBR, so saves storage space. Bootloader can be easily accessed + * by absolute address of data on the block device. + * Users can easily change the partition. + * + * The format for the command line is just like mtdparts. + * + * For further information, see "Documentation/block/cmdline-partition.txt" + * + */ + +#include <linux/cmdline-parser.h> + +#include "check.h" +#include "cmdline.h" + +static char *cmdline; +static struct cmdline_parts *bdev_parts; + +static int add_part(int slot, struct cmdline_subpart *subpart, void *param) +{ + int label_min; + struct partition_meta_info *info; + char tmp[sizeof(info->volname) + 4]; + struct parsed_partitions *state = (struct parsed_partitions *)param; + + if (slot >= state->limit) + return 1; + + put_partition(state, slot, subpart->from >> 9, + subpart->size >> 9); + + info = &state->parts[slot].info; + + label_min = min_t(int, sizeof(info->volname) - 1, + sizeof(subpart->name)); + strncpy(info->volname, subpart->name, label_min); + info->volname[label_min] = '\0'; + + snprintf(tmp, sizeof(tmp), "(%s)", info->volname); + strlcat(state->pp_buf, tmp, PAGE_SIZE); + + state->parts[slot].has_info = true; + + return 0; +} + +static int __init cmdline_parts_setup(char *s) +{ + cmdline = s; + return 1; +} +__setup("blkdevparts=", cmdline_parts_setup); + +/* + * Purpose: allocate cmdline partitions. 
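For reference, the AIX reader above derives every size from the log2 pp_size field of the LVM record: one physical partition is 1 << pp_size bytes, or that divided by 512 in sectors, and a logical volume whose first logical partition (lp_ix) is found at physical-partition index i starts at (i + 1 - lp_ix) * pp_blocks_size + psn_part1 sectors. A worked example with hypothetical numbers, not taken from any real disk:

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		/* pp_size is stored as log2: 24 -> one physical partition is 16 MiB */
		uint32_t pp_bytes_size  = 1u << 24;
		uint32_t pp_blocks_size = pp_bytes_size / 512;   /* 32768 sectors */

		/* third logical partition (lp_ix == 3) of an LV found at
		 * physical-partition index i == 10, data area at psn_part1 == 4352 */
		uint32_t i = 10, lp_ix = 3, psn_part1 = 4352;
		uint64_t start = (uint64_t)(i + 1 - lp_ix) * pp_blocks_size + psn_part1;

		printf("LV starts at sector %llu\n", (unsigned long long)start); /* 266496 */
		return 0;
	}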
+ * Returns: + * -1 if unable to read the partition table + * 0 if this isn't our partition table + * 1 if successful + */ +int cmdline_partition(struct parsed_partitions *state) +{ + sector_t disk_size; + char bdev[BDEVNAME_SIZE]; + struct cmdline_parts *parts; + + if (cmdline) { + if (bdev_parts) + cmdline_parts_free(&bdev_parts); + + if (cmdline_parts_parse(&bdev_parts, cmdline)) { + cmdline = NULL; + return -1; + } + cmdline = NULL; + } + + if (!bdev_parts) + return 0; + + bdevname(state->bdev, bdev); + parts = cmdline_parts_find(bdev_parts, bdev); + if (!parts) + return 0; + + disk_size = get_capacity(state->bdev->bd_disk) << 9; + + cmdline_parts_set(parts, disk_size, 1, add_part, (void *)state); + + strlcat(state->pp_buf, "\n", PAGE_SIZE); + + return 1; +} diff --git a/block/partitions/cmdline.h b/block/partitions/cmdline.h new file mode 100644 index 0000000..26e0f8d --- /dev/null +++ b/block/partitions/cmdline.h @@ -0,0 +1,2 @@ + +int cmdline_partition(struct parsed_partitions *state); diff --git a/block/partitions/efi.c b/block/partitions/efi.c index c85fc89..dc51f46 100644 --- a/block/partitions/efi.c +++ b/block/partitions/efi.c @@ -25,6 +25,9 @@ * TODO: * * Changelog: + * Mon August 5th, 2013 Davidlohr Bueso <davidlohr@hp.com> + * - detect hybrid MBRs, tighter pMBR checking & cleanups. + * * Mon Nov 09 2004 Matt Domsch <Matt_Domsch@dell.com> * - test for valid PMBR and valid PGPT before ever reading * AGPT, allow override with 'gpt' kernel command line option. @@ -93,6 +96,7 @@ * - Code works, detects all the partitions. * ************************************************************/ +#include <linux/kernel.h> #include <linux/crc32.h> #include <linux/ctype.h> #include <linux/math64.h> @@ -149,34 +153,89 @@ static u64 last_lba(struct block_device *bdev) bdev_logical_block_size(bdev)) - 1ULL; } -static inline int -pmbr_part_valid(struct partition *part) +static inline int pmbr_part_valid(gpt_mbr_record *part) { - if (part->sys_ind == EFI_PMBR_OSTYPE_EFI_GPT && - le32_to_cpu(part->start_sect) == 1UL) - return 1; - return 0; + if (part->os_type != EFI_PMBR_OSTYPE_EFI_GPT) + goto invalid; + + /* set to 0x00000001 (i.e., the LBA of the GPT Partition Header) */ + if (le32_to_cpu(part->starting_lba) != GPT_PRIMARY_PARTITION_TABLE_LBA) + goto invalid; + + return GPT_MBR_PROTECTIVE; +invalid: + return 0; } /** * is_pmbr_valid(): test Protective MBR for validity * @mbr: pointer to a legacy mbr structure + * @total_sectors: amount of sectors in the device * - * Description: Returns 1 if PMBR is valid, 0 otherwise. - * Validity depends on two things: + * Description: Checks for a valid protective or hybrid + * master boot record (MBR). The validity of a pMBR depends + * on all of the following properties: * 1) MSDOS signature is in the last two bytes of the MBR * 2) One partition of type 0xEE is found + * + * In addition, a hybrid MBR will have up to three additional + * primary partitions, which point to the same space that's + * marked out by up to three GPT partitions. + * + * Returns 0 upon invalid MBR, or GPT_MBR_PROTECTIVE or + * GPT_MBR_HYBRID depending on the device layout. 
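The protective/hybrid distinction drawn by the comment above can be illustrated with a small stand-alone check (simplified types, not the kernel's gpt_mbr_record, and omitting the 0xAA55 signature and disk-size checks the real code also performs): a protective MBR carries one 0xEE entry starting at LBA 1 and nothing else, while a hybrid MBR additionally carries ordinary partition types alongside it.

	#include <stdint.h>

	#define OSTYPE_EFI_GPT   0xEE
	#define MBR_PROTECTIVE   1
	#define MBR_HYBRID       2

	struct mbr_rec { uint8_t os_type; uint32_t starting_lba; uint32_t size_in_lba; };

	static int classify_pmbr(const struct mbr_rec rec[4])
	{
		int i, ret = 0;

		for (i = 0; i < 4; i++) {
			if (rec[i].os_type == OSTYPE_EFI_GPT && rec[i].starting_lba == 1) {
				ret = MBR_PROTECTIVE;
				break;
			}
		}
		if (!ret)
			return 0;                   /* no protective entry at all */

		for (i = 0; i < 4; i++)
			if (rec[i].os_type != OSTYPE_EFI_GPT && rec[i].os_type != 0x00)
				ret = MBR_HYBRID;   /* extra "real" partitions present */

		return ret;
	}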
*/ -static int -is_pmbr_valid(legacy_mbr *mbr) +static int is_pmbr_valid(legacy_mbr *mbr, sector_t total_sectors) { - int i; + uint32_t sz = 0; + int i, part = 0, ret = 0; /* invalid by default */ + if (!mbr || le16_to_cpu(mbr->signature) != MSDOS_MBR_SIGNATURE) - return 0; + goto done; + + for (i = 0; i < 4; i++) { + ret = pmbr_part_valid(&mbr->partition_record[i]); + if (ret == GPT_MBR_PROTECTIVE) { + part = i; + /* + * Ok, we at least know that there's a protective MBR, + * now check if there are other partition types for + * hybrid MBR. + */ + goto check_hybrid; + } + } + + if (ret != GPT_MBR_PROTECTIVE) + goto done; +check_hybrid: for (i = 0; i < 4; i++) - if (pmbr_part_valid(&mbr->partition_record[i])) - return 1; - return 0; + if ((mbr->partition_record[i].os_type != + EFI_PMBR_OSTYPE_EFI_GPT) && + (mbr->partition_record[i].os_type != 0x00)) + ret = GPT_MBR_HYBRID; + + /* + * Protective MBRs take up the lesser of the whole disk + * or 2 TiB (32bit LBA), ignoring the rest of the disk. + * Some partitioning programs, nonetheless, choose to set + * the size to the maximum 32-bit limitation, disregarding + * the disk size. + * + * Hybrid MBRs do not necessarily comply with this. + * + * Consider a bad value here to be a warning to support dd'ing + * an image from a smaller disk to a larger disk. + */ + if (ret == GPT_MBR_PROTECTIVE) { + sz = le32_to_cpu(mbr->partition_record[part].size_in_lba); + if (sz != (uint32_t) total_sectors - 1 && sz != 0xFFFFFFFF) + pr_debug("GPT: mbr size in lba (%u) different than whole disk (%u).\n", + sz, min_t(uint32_t, + total_sectors - 1, 0xFFFFFFFF)); + } +done: + return ret; } /** @@ -243,8 +302,7 @@ static gpt_entry *alloc_read_gpt_entries(struct parsed_partitions *state, return NULL; if (read_lba(state, le64_to_cpu(gpt->partition_entry_lba), - (u8 *) pte, - count) < count) { + (u8 *) pte, count) < count) { kfree(pte); pte=NULL; return NULL; @@ -364,7 +422,12 @@ static int is_gpt_valid(struct parsed_partitions *state, u64 lba, (unsigned long long)lastlba); goto fail; } - + if (le64_to_cpu((*gpt)->last_usable_lba) < le64_to_cpu((*gpt)->first_usable_lba)) { + pr_debug("GPT: last_usable_lba incorrect: %lld > %lld\n", + (unsigned long long)le64_to_cpu((*gpt)->last_usable_lba), + (unsigned long long)le64_to_cpu((*gpt)->first_usable_lba)); + goto fail; + } /* Check that sizeof_partition_entry has the correct value */ if (le32_to_cpu((*gpt)->sizeof_partition_entry) != sizeof(gpt_entry)) { pr_debug("GUID Partitition Entry Size check failed.\n"); @@ -429,44 +492,42 @@ compare_gpts(gpt_header *pgpt, gpt_header *agpt, u64 lastlba) if (!pgpt || !agpt) return; if (le64_to_cpu(pgpt->my_lba) != le64_to_cpu(agpt->alternate_lba)) { - printk(KERN_WARNING - "GPT:Primary header LBA != Alt. header alternate_lba\n"); - printk(KERN_WARNING "GPT:%lld != %lld\n", + pr_warn("GPT:Primary header LBA != Alt. header alternate_lba\n"); + pr_warn("GPT:%lld != %lld\n", (unsigned long long)le64_to_cpu(pgpt->my_lba), (unsigned long long)le64_to_cpu(agpt->alternate_lba)); error_found++; } if (le64_to_cpu(pgpt->alternate_lba) != le64_to_cpu(agpt->my_lba)) { - printk(KERN_WARNING - "GPT:Primary header alternate_lba != Alt. header my_lba\n"); - printk(KERN_WARNING "GPT:%lld != %lld\n", + pr_warn("GPT:Primary header alternate_lba != Alt. 
header my_lba\n"); + pr_warn("GPT:%lld != %lld\n", (unsigned long long)le64_to_cpu(pgpt->alternate_lba), (unsigned long long)le64_to_cpu(agpt->my_lba)); error_found++; } if (le64_to_cpu(pgpt->first_usable_lba) != le64_to_cpu(agpt->first_usable_lba)) { - printk(KERN_WARNING "GPT:first_usable_lbas don't match.\n"); - printk(KERN_WARNING "GPT:%lld != %lld\n", + pr_warn("GPT:first_usable_lbas don't match.\n"); + pr_warn("GPT:%lld != %lld\n", (unsigned long long)le64_to_cpu(pgpt->first_usable_lba), (unsigned long long)le64_to_cpu(agpt->first_usable_lba)); error_found++; } if (le64_to_cpu(pgpt->last_usable_lba) != le64_to_cpu(agpt->last_usable_lba)) { - printk(KERN_WARNING "GPT:last_usable_lbas don't match.\n"); - printk(KERN_WARNING "GPT:%lld != %lld\n", + pr_warn("GPT:last_usable_lbas don't match.\n"); + pr_warn("GPT:%lld != %lld\n", (unsigned long long)le64_to_cpu(pgpt->last_usable_lba), (unsigned long long)le64_to_cpu(agpt->last_usable_lba)); error_found++; } if (efi_guidcmp(pgpt->disk_guid, agpt->disk_guid)) { - printk(KERN_WARNING "GPT:disk_guids don't match.\n"); + pr_warn("GPT:disk_guids don't match.\n"); error_found++; } if (le32_to_cpu(pgpt->num_partition_entries) != le32_to_cpu(agpt->num_partition_entries)) { - printk(KERN_WARNING "GPT:num_partition_entries don't match: " + pr_warn("GPT:num_partition_entries don't match: " "0x%x != 0x%x\n", le32_to_cpu(pgpt->num_partition_entries), le32_to_cpu(agpt->num_partition_entries)); @@ -474,8 +535,7 @@ compare_gpts(gpt_header *pgpt, gpt_header *agpt, u64 lastlba) } if (le32_to_cpu(pgpt->sizeof_partition_entry) != le32_to_cpu(agpt->sizeof_partition_entry)) { - printk(KERN_WARNING - "GPT:sizeof_partition_entry values don't match: " + pr_warn("GPT:sizeof_partition_entry values don't match: " "0x%x != 0x%x\n", le32_to_cpu(pgpt->sizeof_partition_entry), le32_to_cpu(agpt->sizeof_partition_entry)); @@ -483,34 +543,30 @@ compare_gpts(gpt_header *pgpt, gpt_header *agpt, u64 lastlba) } if (le32_to_cpu(pgpt->partition_entry_array_crc32) != le32_to_cpu(agpt->partition_entry_array_crc32)) { - printk(KERN_WARNING - "GPT:partition_entry_array_crc32 values don't match: " + pr_warn("GPT:partition_entry_array_crc32 values don't match: " "0x%x != 0x%x\n", le32_to_cpu(pgpt->partition_entry_array_crc32), le32_to_cpu(agpt->partition_entry_array_crc32)); error_found++; } if (le64_to_cpu(pgpt->alternate_lba) != lastlba) { - printk(KERN_WARNING - "GPT:Primary header thinks Alt. header is not at the end of the disk.\n"); - printk(KERN_WARNING "GPT:%lld != %lld\n", + pr_warn("GPT:Primary header thinks Alt. 
header is not at the end of the disk.\n"); + pr_warn("GPT:%lld != %lld\n", (unsigned long long)le64_to_cpu(pgpt->alternate_lba), (unsigned long long)lastlba); error_found++; } if (le64_to_cpu(agpt->my_lba) != lastlba) { - printk(KERN_WARNING - "GPT:Alternate GPT header not at the end of the disk.\n"); - printk(KERN_WARNING "GPT:%lld != %lld\n", + pr_warn("GPT:Alternate GPT header not at the end of the disk.\n"); + pr_warn("GPT:%lld != %lld\n", (unsigned long long)le64_to_cpu(agpt->my_lba), (unsigned long long)lastlba); error_found++; } if (error_found) - printk(KERN_WARNING - "GPT: Use GNU Parted to correct GPT errors.\n"); + pr_warn("GPT: Use GNU Parted to correct GPT errors.\n"); return; } @@ -536,6 +592,7 @@ static int find_valid_gpt(struct parsed_partitions *state, gpt_header **gpt, gpt_header *pgpt = NULL, *agpt = NULL; gpt_entry *pptes = NULL, *aptes = NULL; legacy_mbr *legacymbr; + sector_t total_sectors = i_size_read(state->bdev->bd_inode) >> 9; u64 lastlba; if (!ptes) @@ -543,17 +600,22 @@ static int find_valid_gpt(struct parsed_partitions *state, gpt_header **gpt, lastlba = last_lba(state->bdev); if (!force_gpt) { - /* This will be added to the EFI Spec. per Intel after v1.02. */ - legacymbr = kzalloc(sizeof (*legacymbr), GFP_KERNEL); - if (legacymbr) { - read_lba(state, 0, (u8 *) legacymbr, - sizeof (*legacymbr)); - good_pmbr = is_pmbr_valid(legacymbr); - kfree(legacymbr); - } - if (!good_pmbr) - goto fail; - } + /* This will be added to the EFI Spec. per Intel after v1.02. */ + legacymbr = kzalloc(sizeof(*legacymbr), GFP_KERNEL); + if (!legacymbr) + goto fail; + + read_lba(state, 0, (u8 *)legacymbr, sizeof(*legacymbr)); + good_pmbr = is_pmbr_valid(legacymbr, total_sectors); + kfree(legacymbr); + + if (!good_pmbr) + goto fail; + + pr_debug("Device has a %s MBR\n", + good_pmbr == GPT_MBR_PROTECTIVE ? + "protective" : "hybrid"); + } good_pgpt = is_gpt_valid(state, GPT_PRIMARY_PARTITION_TABLE_LBA, &pgpt, &pptes); @@ -576,11 +638,8 @@ static int find_valid_gpt(struct parsed_partitions *state, gpt_header **gpt, *ptes = pptes; kfree(agpt); kfree(aptes); - if (!good_agpt) { - printk(KERN_WARNING - "Alternate GPT is invalid, " - "using primary GPT.\n"); - } + if (!good_agpt) + pr_warn("Alternate GPT is invalid, using primary GPT.\n"); return 1; } else if (good_agpt) { @@ -588,8 +647,7 @@ static int find_valid_gpt(struct parsed_partitions *state, gpt_header **gpt, *ptes = aptes; kfree(pgpt); kfree(pptes); - printk(KERN_WARNING - "Primary GPT is invalid, using alternate GPT.\n"); + pr_warn("Primary GPT is invalid, using alternate GPT.\n"); return 1; } @@ -651,16 +709,15 @@ int efi_partition(struct parsed_partitions *state) put_partition(state, i+1, start * ssz, size * ssz); /* If this is a RAID volume, tell md */ - if (!efi_guidcmp(ptes[i].partition_type_guid, - PARTITION_LINUX_RAID_GUID)) + if (!efi_guidcmp(ptes[i].partition_type_guid, PARTITION_LINUX_RAID_GUID)) state->parts[i + 1].flags = ADDPART_FLAG_RAID; info = &state->parts[i + 1].info; efi_guid_unparse(&ptes[i].unique_partition_guid, info->uuid); /* Naively convert UTF16-LE to 7 bits. 
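The "naive" conversion this comment introduces keeps only the low byte of each UTF-16LE code unit of the GPT partition name (the kernel loop additionally sanitizes non-printable bytes). In isolation, the idea looks roughly like this userspace sketch with a hypothetical helper name:

	#include <stdint.h>
	#include <stddef.h>

	/* Keep only the low byte of each UTF-16LE code unit; anything outside
	 * plain ASCII comes out mangled, which is all this conversion aims for. */
	static void utf16le_to_ascii(const uint16_t *name, char *out, size_t max)
	{
		size_t i;

		for (i = 0; i < max && name[i]; i++)
			out[i] = (char)(name[i] & 0xff);
		out[i] = '\0';   /* caller provides max + 1 bytes */
	}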
*/ - label_max = min(sizeof(info->volname) - 1, - sizeof(ptes[i].partition_name)); + label_max = min(ARRAY_SIZE(info->volname) - 1, + ARRAY_SIZE(ptes[i].partition_name)); info->volname[label_max] = 0; while (label_count < label_max) { u8 c = ptes[i].partition_name[label_count] & 0xff; diff --git a/block/partitions/efi.h b/block/partitions/efi.h index b69ab72..4efcafb 100644 --- a/block/partitions/efi.h +++ b/block/partitions/efi.h @@ -37,6 +37,9 @@ #define EFI_PMBR_OSTYPE_EFI 0xEF #define EFI_PMBR_OSTYPE_EFI_GPT 0xEE +#define GPT_MBR_PROTECTIVE 1 +#define GPT_MBR_HYBRID 2 + #define GPT_HEADER_SIGNATURE 0x5452415020494645ULL #define GPT_HEADER_REVISION_V1 0x00010000 #define GPT_PRIMARY_PARTITION_TABLE_LBA 1 @@ -101,11 +104,25 @@ typedef struct _gpt_entry { efi_char16_t partition_name[72 / sizeof (efi_char16_t)]; } __attribute__ ((packed)) gpt_entry; +typedef struct _gpt_mbr_record { + u8 boot_indicator; /* unused by EFI, set to 0x80 for bootable */ + u8 start_head; /* unused by EFI, pt start in CHS */ + u8 start_sector; /* unused by EFI, pt start in CHS */ + u8 start_track; + u8 os_type; /* EFI and legacy non-EFI OS types */ + u8 end_head; /* unused by EFI, pt end in CHS */ + u8 end_sector; /* unused by EFI, pt end in CHS */ + u8 end_track; /* unused by EFI, pt end in CHS */ + __le32 starting_lba; /* used by EFI - start addr of the on disk pt */ + __le32 size_in_lba; /* used by EFI - size of pt in LBA */ +} __packed gpt_mbr_record; + + typedef struct _legacy_mbr { u8 boot_code[440]; __le32 unique_mbr_signature; __le16 unknown; - struct partition partition_record[4]; + gpt_mbr_record partition_record[4]; __le16 signature; } __attribute__ ((packed)) legacy_mbr; @@ -113,22 +130,3 @@ typedef struct _legacy_mbr { extern int efi_partition(struct parsed_partitions *state); #endif - -/* - * Overrides for Emacs so that we follow Linus's tabbing style. - * Emacs will notice this stuff at the end of the file and automatically - * adjust the settings for this buffer only. This must remain at the end - * of the file. - * -------------------------------------------------------------------------- - * Local variables: - * c-indent-level: 4 - * c-brace-imaginary-offset: 0 - * c-brace-offset: -4 - * c-argdecl-indent: 4 - * c-label-offset: -4 - * c-continued-statement-offset: 4 - * c-continued-brace-offset: 0 - * indent-tabs-mode: nil - * tab-width: 8 - * End: - */ diff --git a/block/partitions/msdos.c b/block/partitions/msdos.c index 7681cd2..9123f25 100644 --- a/block/partitions/msdos.c +++ b/block/partitions/msdos.c @@ -23,6 +23,7 @@ #include "check.h" #include "msdos.h" #include "efi.h" +#include "aix.h" /* * Many architectures don't like unaligned accesses, while @@ -90,7 +91,7 @@ static int aix_magic_present(struct parsed_partitions *state, unsigned char *p) if (d[0] == '_' && d[1] == 'L' && d[2] == 'V' && d[3] == 'M') ret = 1; put_dev_sector(sect); - }; + } return ret; } @@ -142,7 +143,7 @@ static void parse_extended(struct parsed_partitions *state, return; if (!msdos_magic_present(data + 510)) - goto done; + goto done; p = (struct partition *) (data + 0x1be); @@ -155,7 +156,7 @@ static void parse_extended(struct parsed_partitions *state, * and OS/2 seems to use all four entries. */ - /* + /* * First process the data partition(s) */ for (i=0; i<4; i++, p++) { @@ -263,7 +264,7 @@ static void parse_solaris_x86(struct parsed_partitions *state, } #if defined(CONFIG_BSD_DISKLABEL) -/* +/* * Create devices for BSD partitions listed in a disklabel, under a * dos-like partition. 
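The gpt_mbr_record type introduced in efi.h above is the classic 16-byte MBR partition entry; only os_type, starting_lba and size_in_lba matter to EFI, and the two LBA fields are little-endian on disk. A userspace mirror of that layout with a compile-time size check (illustrative only; the kernel type uses __le32 and __packed):

	#include <stdint.h>
	#include <assert.h>

	struct mbr_part_rec {
		uint8_t  boot_indicator;   /* 0x80 if bootable; unused by EFI */
		uint8_t  start_head;       /* CHS start, unused by EFI */
		uint8_t  start_sector;
		uint8_t  start_track;
		uint8_t  os_type;          /* 0xEE marks the GPT protective entry */
		uint8_t  end_head;         /* CHS end, unused by EFI */
		uint8_t  end_sector;
		uint8_t  end_track;
		uint32_t starting_lba;     /* little-endian on disk */
		uint32_t size_in_lba;      /* little-endian on disk */
	} __attribute__((packed));

	static_assert(sizeof(struct mbr_part_rec) == 16, "MBR partition record is 16 bytes");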
See parse_extended() for more information. */ @@ -294,7 +295,7 @@ static void parse_bsd(struct parsed_partitions *state, if (state->next == state->limit) break; - if (p->p_fstype == BSD_FS_UNUSED) + if (p->p_fstype == BSD_FS_UNUSED) continue; bsd_start = le32_to_cpu(p->p_offset); bsd_size = le32_to_cpu(p->p_size); @@ -441,7 +442,7 @@ static struct { {NEW_SOLARIS_X86_PARTITION, parse_solaris_x86}, {0, NULL}, }; - + int msdos_partition(struct parsed_partitions *state) { sector_t sector_size = bdev_logical_block_size(state->bdev) / 512; @@ -462,8 +463,12 @@ int msdos_partition(struct parsed_partitions *state) */ if (aix_magic_present(state, data)) { put_dev_sector(sect); +#ifdef CONFIG_AIX_PARTITION + return aix_partition(state); +#else strlcat(state->pp_buf, " [AIX]", PAGE_SIZE); return 0; +#endif } if (!msdos_magic_present(data + 510)) { diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index a5ffcc9..625e3e4 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c @@ -286,7 +286,8 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk, struct sg_io_hdr *hdr, fmode_t mode) { unsigned long start_time; - int writing = 0, ret = 0; + ssize_t ret = 0; + int writing = 0; struct request *rq; char sense[SCSI_SENSE_BUFFERSIZE]; struct bio *bio; @@ -321,37 +322,16 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk, } if (hdr->iovec_count) { - const int size = sizeof(struct sg_iovec) * hdr->iovec_count; size_t iov_data_len; - struct sg_iovec *sg_iov; struct iovec *iov; - int i; - sg_iov = kmalloc(size, GFP_KERNEL); - if (!sg_iov) { - ret = -ENOMEM; + ret = rw_copy_check_uvector(-1, hdr->dxferp, hdr->iovec_count, + 0, NULL, &iov); + if (ret < 0) goto out; - } - - if (copy_from_user(sg_iov, hdr->dxferp, size)) { - kfree(sg_iov); - ret = -EFAULT; - goto out; - } - /* - * Sum up the vecs, making sure they don't overflow - */ - iov = (struct iovec *) sg_iov; - iov_data_len = 0; - for (i = 0; i < hdr->iovec_count; i++) { - if (iov_data_len + iov[i].iov_len < iov_data_len) { - kfree(sg_iov); - ret = -EINVAL; - goto out; - } - iov_data_len += iov[i].iov_len; - } + iov_data_len = ret; + ret = 0; /* SG_IO howto says that the shorter of the two wins */ if (hdr->dxfer_len < iov_data_len) { @@ -361,9 +341,10 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk, iov_data_len = hdr->dxfer_len; } - ret = blk_rq_map_user_iov(q, rq, NULL, sg_iov, hdr->iovec_count, + ret = blk_rq_map_user_iov(q, rq, NULL, (struct sg_iovec *) iov, + hdr->iovec_count, iov_data_len, GFP_KERNEL); - kfree(sg_iov); + kfree(iov); } else if (hdr->dxfer_len) ret = blk_rq_map_user(q, rq, NULL, hdr->dxferp, hdr->dxfer_len, GFP_KERNEL); |
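The sg_io() change at the end drops the hand-rolled loop that copied the user iovec array and summed its lengths while watching for wraparound, delegating both jobs to rw_copy_check_uvector(). For reference, the overflow idiom the removed code relied on looks like this in isolation (simplified userspace types, hypothetical helper name):

	#include <stddef.h>
	#include <sys/uio.h>

	/* Sum iov_len over 'count' entries; return (size_t)-1 if the total would wrap. */
	static size_t sum_iovec_len(const struct iovec *iov, int count)
	{
		size_t total = 0;
		int i;

		for (i = 0; i < count; i++) {
			if (total + iov[i].iov_len < total)   /* unsigned wraparound check */
				return (size_t)-1;
			total += iov[i].iov_len;
		}
		return total;
	}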