summary | refs | log | tree | commit | diff
path: root/block
diff options
context:
space:
mode:
author Scott Wood <scottwood@freescale.com> 2013-12-14 01:15:24 (GMT)
committer Scott Wood <scottwood@freescale.com> 2013-12-14 01:15:24 (GMT)
commit b7c81aa3ab2ac2c140e278b6d0e9a0b95112cf0b (patch)
tree 87828aaf5f82c7042bfc0307bc4ac499e10f93fb /block
parent 22c782a4b14773fab7eab3c1db54ad7ad077e9b8 (diff)
parent 78fd82238d0e5716578c326404184a27ba67fd6e (diff)
download linux-fsl-qoriq-b7c81aa3ab2ac2c140e278b6d0e9a0b95112cf0b.tar.xz
Merge remote-tracking branch 'linus/master' into merge
Conflicts: Documentation/hwmon/ina2xx arch/powerpc/Kconfig arch/powerpc/boot/dts/b4860emu.dts arch/powerpc/boot/dts/b4qds.dtsi arch/powerpc/boot/dts/fsl/b4si-post.dtsi arch/powerpc/boot/dts/fsl/qoriq-sec6.0-0.dtsi arch/powerpc/boot/dts/p1023rdb.dts arch/powerpc/boot/dts/t4240emu.dts arch/powerpc/boot/dts/t4240qds.dts arch/powerpc/configs/85xx/p1023_defconfig arch/powerpc/configs/corenet32_smp_defconfig arch/powerpc/configs/corenet64_smp_defconfig arch/powerpc/configs/mpc85xx_smp_defconfig arch/powerpc/include/asm/cputable.h arch/powerpc/include/asm/device.h arch/powerpc/include/asm/epapr_hcalls.h arch/powerpc/include/asm/kvm_host.h arch/powerpc/include/asm/mpic.h arch/powerpc/include/asm/pci.h arch/powerpc/include/asm/ppc-opcode.h arch/powerpc/include/asm/ppc_asm.h arch/powerpc/include/asm/reg_booke.h arch/powerpc/kernel/epapr_paravirt.c arch/powerpc/kernel/process.c arch/powerpc/kernel/prom.c arch/powerpc/kernel/setup-common.c arch/powerpc/kernel/setup_32.c arch/powerpc/kernel/setup_64.c arch/powerpc/kernel/smp.c arch/powerpc/kernel/swsusp_asm64.S arch/powerpc/kernel/swsusp_booke.S arch/powerpc/kvm/book3s_pr.c arch/powerpc/kvm/booke.c arch/powerpc/kvm/booke.h arch/powerpc/kvm/e500.c arch/powerpc/kvm/e500.h arch/powerpc/kvm/e500_emulate.c arch/powerpc/kvm/e500mc.c arch/powerpc/kvm/powerpc.c arch/powerpc/perf/e6500-pmu.c arch/powerpc/platforms/85xx/Kconfig arch/powerpc/platforms/85xx/Makefile arch/powerpc/platforms/85xx/b4_qds.c arch/powerpc/platforms/85xx/c293pcie.c arch/powerpc/platforms/85xx/corenet_ds.c arch/powerpc/platforms/85xx/corenet_ds.h arch/powerpc/platforms/85xx/p1023_rds.c arch/powerpc/platforms/85xx/p2041_rdb.c arch/powerpc/platforms/85xx/p3041_ds.c arch/powerpc/platforms/85xx/p4080_ds.c arch/powerpc/platforms/85xx/p5020_ds.c arch/powerpc/platforms/85xx/p5040_ds.c arch/powerpc/platforms/85xx/smp.c arch/powerpc/platforms/85xx/t4240_qds.c arch/powerpc/platforms/Kconfig arch/powerpc/sysdev/Makefile arch/powerpc/sysdev/fsl_mpic_timer_wakeup.c 
arch/powerpc/sysdev/fsl_msi.c arch/powerpc/sysdev/fsl_pci.c arch/powerpc/sysdev/fsl_pci.h arch/powerpc/sysdev/fsl_soc.h arch/powerpc/sysdev/mpic.c arch/powerpc/sysdev/mpic_timer.c drivers/Kconfig drivers/clk/Kconfig drivers/clk/clk-ppc-corenet.c drivers/cpufreq/Kconfig.powerpc drivers/cpufreq/Makefile drivers/cpufreq/ppc-corenet-cpufreq.c drivers/crypto/caam/Kconfig drivers/crypto/caam/Makefile drivers/crypto/caam/ctrl.c drivers/crypto/caam/desc_constr.h drivers/crypto/caam/intern.h drivers/crypto/caam/jr.c drivers/crypto/caam/regs.h drivers/dma/fsldma.c drivers/hwmon/ina2xx.c drivers/iommu/Kconfig drivers/iommu/fsl_pamu.c drivers/iommu/fsl_pamu.h drivers/iommu/fsl_pamu_domain.c drivers/iommu/fsl_pamu_domain.h drivers/misc/Makefile drivers/mmc/card/block.c drivers/mmc/core/core.c drivers/mmc/host/sdhci-esdhc.h drivers/mmc/host/sdhci-pltfm.c drivers/mtd/nand/fsl_ifc_nand.c drivers/net/ethernet/freescale/gianfar.c drivers/net/ethernet/freescale/gianfar.h drivers/net/ethernet/freescale/gianfar_ethtool.c drivers/net/phy/at803x.c drivers/net/phy/phy_device.c drivers/net/phy/vitesse.c drivers/pci/msi.c drivers/staging/Kconfig drivers/staging/Makefile drivers/uio/Kconfig drivers/uio/Makefile drivers/uio/uio.c drivers/usb/host/ehci-fsl.c drivers/vfio/Kconfig drivers/vfio/Makefile include/crypto/algapi.h include/linux/iommu.h include/linux/mmc/sdhci.h include/linux/msi.h include/linux/netdev_features.h include/linux/phy.h include/linux/skbuff.h include/net/ip.h include/uapi/linux/vfio.h net/core/ethtool.c net/ipv4/route.c net/ipv6/route.c
Diffstat (limited to 'block')
-rw-r--r-- block/Kconfig 11
-rw-r--r-- block/Makefile 6
-rw-r--r-- block/blk-cgroup.c 179
-rw-r--r-- block/blk-cgroup.h 72
-rw-r--r-- block/blk-core.c 193
-rw-r--r-- block/blk-exec.c 18
-rw-r--r-- block/blk-flush.c 173
-rw-r--r-- block/blk-ioc.c 6
-rw-r--r-- block/blk-iopoll.c 12
-rw-r--r-- block/blk-lib.c 10
-rw-r--r-- block/blk-merge.c 17
-rw-r--r-- block/blk-mq-cpu.c 93
-rw-r--r-- block/blk-mq-cpumap.c 108
-rw-r--r-- block/blk-mq-sysfs.c 384
-rw-r--r-- block/blk-mq-tag.c 204
-rw-r--r-- block/blk-mq-tag.h 27
-rw-r--r-- block/blk-mq.c 1510
-rw-r--r-- block/blk-mq.h 52
-rw-r--r-- block/blk-settings.c 9
-rw-r--r-- block/blk-softirq.c 18
-rw-r--r-- block/blk-sysfs.c 17
-rw-r--r-- block/blk-tag.c 11
-rw-r--r-- block/blk-throttle.c 1097
-rw-r--r-- block/blk-timeout.c 82
-rw-r--r-- block/blk.h 17
-rw-r--r-- block/cfq-iosched.c 136
-rw-r--r-- block/cmdline-parser.c 250
-rw-r--r-- block/compat_ioctl.c 3
-rw-r--r-- block/deadline-iosched.c 18
-rw-r--r-- block/elevator.c 49
-rw-r--r-- block/genhd.c 17
-rw-r--r-- block/ioctl.c 2
-rw-r--r-- block/noop-iosched.c 17
-rw-r--r-- block/partitions/Kconfig 18
-rw-r--r-- block/partitions/Makefile 2
-rw-r--r-- block/partitions/aix.c 293
-rw-r--r-- block/partitions/aix.h 1
-rw-r--r-- block/partitions/check.c 4
-rw-r--r-- block/partitions/cmdline.c 99
-rw-r--r-- block/partitions/cmdline.h 2
-rw-r--r-- block/partitions/efi.c 185
-rw-r--r-- block/partitions/efi.h 38
-rw-r--r-- block/partitions/msdos.c 17
-rw-r--r-- block/scsi_ioctl.c 39
44 files changed, 4677 insertions, 839 deletions
diff --git a/block/Kconfig b/block/Kconfig
index a7e40a7..2429515 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -99,6 +99,17 @@ config BLK_DEV_THROTTLING
See Documentation/cgroups/blkio-controller.txt for more information.
+config BLK_CMDLINE_PARSER
+ bool "Block device command line partition parser"
+ default n
+ ---help---
+ Enabling this option allows you to specify the partition layout from
+ the kernel boot args. This is typically of use for embedded devices
+ which don't otherwise have any standardized method for listing the
+ partitions on a block device.
+
+ See Documentation/block/cmdline-partition.txt for more information.
+
menu "Partition Types"
source "block/partitions/Kconfig"
diff --git a/block/Makefile b/block/Makefile
index 39b76ba..20645e8 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -5,8 +5,9 @@
obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \
blk-flush.o blk-settings.o blk-ioc.o blk-map.o \
blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \
- blk-iopoll.o blk-lib.o ioctl.o genhd.o scsi_ioctl.o \
- partition-generic.o partitions/
+ blk-iopoll.o blk-lib.o blk-mq.o blk-mq-tag.o \
+ blk-mq-sysfs.o blk-mq-cpu.o blk-mq-cpumap.o ioctl.o \
+ genhd.o scsi_ioctl.o partition-generic.o partitions/
obj-$(CONFIG_BLK_DEV_BSG) += bsg.o
obj-$(CONFIG_BLK_DEV_BSGLIB) += bsg-lib.o
@@ -18,3 +19,4 @@ obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o
obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o
obj-$(CONFIG_BLK_DEV_INTEGRITY) += blk-integrity.o
+obj-$(CONFIG_BLK_CMDLINE_PARSER) += cmdline-parser.o
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index e8918ff..4e491d9 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -32,26 +32,6 @@ EXPORT_SYMBOL_GPL(blkcg_root);
static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];
-static struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg,
- struct request_queue *q, bool update_hint);
-
-/**
- * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants
- * @d_blkg: loop cursor pointing to the current descendant
- * @pos_cgrp: used for iteration
- * @p_blkg: target blkg to walk descendants of
- *
- * Walk @c_blkg through the descendants of @p_blkg. Must be used with RCU
- * read locked. If called under either blkcg or queue lock, the iteration
- * is guaranteed to include all and only online blkgs. The caller may
- * update @pos_cgrp by calling cgroup_rightmost_descendant() to skip
- * subtree.
- */
-#define blkg_for_each_descendant_pre(d_blkg, pos_cgrp, p_blkg) \
- cgroup_for_each_descendant_pre((pos_cgrp), (p_blkg)->blkcg->css.cgroup) \
- if (((d_blkg) = __blkg_lookup(cgroup_to_blkcg(pos_cgrp), \
- (p_blkg)->q, false)))
-
static bool blkcg_policy_enabled(struct request_queue *q,
const struct blkcg_policy *pol)
{
@@ -71,18 +51,8 @@ static void blkg_free(struct blkcg_gq *blkg)
if (!blkg)
return;
- for (i = 0; i < BLKCG_MAX_POLS; i++) {
- struct blkcg_policy *pol = blkcg_policy[i];
- struct blkg_policy_data *pd = blkg->pd[i];
-
- if (!pd)
- continue;
-
- if (pol && pol->pd_exit_fn)
- pol->pd_exit_fn(blkg);
-
- kfree(pd);
- }
+ for (i = 0; i < BLKCG_MAX_POLS; i++)
+ kfree(blkg->pd[i]);
blk_exit_rl(&blkg->rl);
kfree(blkg);
@@ -134,10 +104,6 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
blkg->pd[i] = pd;
pd->blkg = blkg;
pd->plid = i;
-
- /* invoke per-policy init */
- if (pol->pd_init_fn)
- pol->pd_init_fn(blkg);
}
return blkg;
@@ -158,8 +124,8 @@ err_free:
* @q's bypass state. If @update_hint is %true, the caller should be
* holding @q->queue_lock and lookup hint is updated on success.
*/
-static struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg,
- struct request_queue *q, bool update_hint)
+struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q,
+ bool update_hint)
{
struct blkcg_gq *blkg;
@@ -234,16 +200,25 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
}
blkg = new_blkg;
- /* link parent and insert */
+ /* link parent */
if (blkcg_parent(blkcg)) {
blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false);
if (WARN_ON_ONCE(!blkg->parent)) {
- blkg = ERR_PTR(-EINVAL);
+ ret = -EINVAL;
goto err_put_css;
}
blkg_get(blkg->parent);
}
+ /* invoke per-policy init */
+ for (i = 0; i < BLKCG_MAX_POLS; i++) {
+ struct blkcg_policy *pol = blkcg_policy[i];
+
+ if (blkg->pd[i] && pol->pd_init_fn)
+ pol->pd_init_fn(blkg);
+ }
+
+ /* insert */
spin_lock(&blkcg->lock);
ret = radix_tree_insert(&blkcg->blkg_tree, q->id, blkg);
if (likely(!ret)) {
@@ -260,8 +235,13 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
blkg->online = true;
spin_unlock(&blkcg->lock);
- if (!ret)
+ if (!ret) {
+ if (blkcg == &blkcg_root) {
+ q->root_blkg = blkg;
+ q->root_rl.blkg = blkg;
+ }
return blkg;
+ }
/* @blkg failed fully initialized, use the usual release path */
blkg_put(blkg);
@@ -360,6 +340,15 @@ static void blkg_destroy(struct blkcg_gq *blkg)
rcu_assign_pointer(blkcg->blkg_hint, NULL);
/*
+ * If root blkg is destroyed, just clear the pointer since root_rl
+ * does not take reference on root blkg.
+ */
+ if (blkcg == &blkcg_root) {
+ blkg->q->root_blkg = NULL;
+ blkg->q->root_rl.blkg = NULL;
+ }
+
+ /*
* Put the reference taken at the time of creation so that when all
* queues are gone, group can be destroyed.
*/
@@ -385,39 +374,40 @@ static void blkg_destroy_all(struct request_queue *q)
blkg_destroy(blkg);
spin_unlock(&blkcg->lock);
}
-
- /*
- * root blkg is destroyed. Just clear the pointer since
- * root_rl does not take reference on root blkg.
- */
- q->root_blkg = NULL;
- q->root_rl.blkg = NULL;
}
-static void blkg_rcu_free(struct rcu_head *rcu_head)
+/*
+ * A group is RCU protected, but having an rcu lock does not mean that one
+ * can access all the fields of blkg and assume these are valid. For
+ * example, don't try to follow throtl_data and request queue links.
+ *
+ * Having a reference to blkg under an rcu allows accesses to only values
+ * local to groups like group stats and group rate limits.
+ */
+void __blkg_release_rcu(struct rcu_head *rcu_head)
{
- blkg_free(container_of(rcu_head, struct blkcg_gq, rcu_head));
-}
+ struct blkcg_gq *blkg = container_of(rcu_head, struct blkcg_gq, rcu_head);
+ int i;
+
+ /* tell policies that this one is being freed */
+ for (i = 0; i < BLKCG_MAX_POLS; i++) {
+ struct blkcg_policy *pol = blkcg_policy[i];
+
+ if (blkg->pd[i] && pol->pd_exit_fn)
+ pol->pd_exit_fn(blkg);
+ }
-void __blkg_release(struct blkcg_gq *blkg)
-{
/* release the blkcg and parent blkg refs this blkg has been holding */
css_put(&blkg->blkcg->css);
- if (blkg->parent)
+ if (blkg->parent) {
+ spin_lock_irq(blkg->q->queue_lock);
blkg_put(blkg->parent);
+ spin_unlock_irq(blkg->q->queue_lock);
+ }
- /*
- * A group is freed in rcu manner. But having an rcu lock does not
- * mean that one can access all the fields of blkg and assume these
- * are valid. For example, don't try to follow throtl_data and
- * request queue links.
- *
- * Having a reference to blkg under an rcu allows acess to only
- * values local to groups like group stats and group rate limits
- */
- call_rcu(&blkg->rcu_head, blkg_rcu_free);
+ blkg_free(blkg);
}
-EXPORT_SYMBOL_GPL(__blkg_release);
+EXPORT_SYMBOL_GPL(__blkg_release_rcu);
/*
* The next function used by blk_queue_for_each_rl(). It's a bit tricky
@@ -454,10 +444,10 @@ struct request_list *__blk_queue_next_rl(struct request_list *rl,
return &blkg->rl;
}
-static int blkcg_reset_stats(struct cgroup *cgroup, struct cftype *cftype,
- u64 val)
+static int blkcg_reset_stats(struct cgroup_subsys_state *css,
+ struct cftype *cftype, u64 val)
{
- struct blkcg *blkcg = cgroup_to_blkcg(cgroup);
+ struct blkcg *blkcg = css_to_blkcg(css);
struct blkcg_gq *blkg;
int i;
@@ -631,15 +621,13 @@ u64 blkg_stat_recursive_sum(struct blkg_policy_data *pd, int off)
{
struct blkcg_policy *pol = blkcg_policy[pd->plid];
struct blkcg_gq *pos_blkg;
- struct cgroup *pos_cgrp;
- u64 sum;
+ struct cgroup_subsys_state *pos_css;
+ u64 sum = 0;
lockdep_assert_held(pd->blkg->q->queue_lock);
- sum = blkg_stat_read((void *)pd + off);
-
rcu_read_lock();
- blkg_for_each_descendant_pre(pos_blkg, pos_cgrp, pd_to_blkg(pd)) {
+ blkg_for_each_descendant_pre(pos_blkg, pos_css, pd_to_blkg(pd)) {
struct blkg_policy_data *pos_pd = blkg_to_pd(pos_blkg, pol);
struct blkg_stat *stat = (void *)pos_pd + off;
@@ -666,16 +654,14 @@ struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd,
{
struct blkcg_policy *pol = blkcg_policy[pd->plid];
struct blkcg_gq *pos_blkg;
- struct cgroup *pos_cgrp;
- struct blkg_rwstat sum;
+ struct cgroup_subsys_state *pos_css;
+ struct blkg_rwstat sum = { };
int i;
lockdep_assert_held(pd->blkg->q->queue_lock);
- sum = blkg_rwstat_read((void *)pd + off);
-
rcu_read_lock();
- blkg_for_each_descendant_pre(pos_blkg, pos_cgrp, pd_to_blkg(pd)) {
+ blkg_for_each_descendant_pre(pos_blkg, pos_css, pd_to_blkg(pd)) {
struct blkg_policy_data *pos_pd = blkg_to_pd(pos_blkg, pol);
struct blkg_rwstat *rwstat = (void *)pos_pd + off;
struct blkg_rwstat tmp;
@@ -782,18 +768,18 @@ struct cftype blkcg_files[] = {
/**
* blkcg_css_offline - cgroup css_offline callback
- * @cgroup: cgroup of interest
+ * @css: css of interest
*
- * This function is called when @cgroup is about to go away and responsible
- * for shooting down all blkgs associated with @cgroup. blkgs should be
+ * This function is called when @css is about to go away and responsible
+ * for shooting down all blkgs associated with @css. blkgs should be
* removed while holding both q and blkcg locks. As blkcg lock is nested
* inside q lock, this function performs reverse double lock dancing.
*
* This is the blkcg counterpart of ioc_release_fn().
*/
-static void blkcg_css_offline(struct cgroup *cgroup)
+static void blkcg_css_offline(struct cgroup_subsys_state *css)
{
- struct blkcg *blkcg = cgroup_to_blkcg(cgroup);
+ struct blkcg *blkcg = css_to_blkcg(css);
spin_lock_irq(&blkcg->lock);
@@ -815,21 +801,21 @@ static void blkcg_css_offline(struct cgroup *cgroup)
spin_unlock_irq(&blkcg->lock);
}
-static void blkcg_css_free(struct cgroup *cgroup)
+static void blkcg_css_free(struct cgroup_subsys_state *css)
{
- struct blkcg *blkcg = cgroup_to_blkcg(cgroup);
+ struct blkcg *blkcg = css_to_blkcg(css);
if (blkcg != &blkcg_root)
kfree(blkcg);
}
-static struct cgroup_subsys_state *blkcg_css_alloc(struct cgroup *cgroup)
+static struct cgroup_subsys_state *
+blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
{
static atomic64_t id_seq = ATOMIC64_INIT(0);
struct blkcg *blkcg;
- struct cgroup *parent = cgroup->parent;
- if (!parent) {
+ if (!parent_css) {
blkcg = &blkcg_root;
goto done;
}
@@ -900,14 +886,15 @@ void blkcg_exit_queue(struct request_queue *q)
* of the main cic data structures. For now we allow a task to change
* its cgroup only if it's the only owner of its ioc.
*/
-static int blkcg_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
+static int blkcg_can_attach(struct cgroup_subsys_state *css,
+ struct cgroup_taskset *tset)
{
struct task_struct *task;
struct io_context *ioc;
int ret = 0;
/* task_lock() is needed to avoid races with exit_io_context() */
- cgroup_taskset_for_each(task, cgrp, tset) {
+ cgroup_taskset_for_each(task, css, tset) {
task_lock(task);
ioc = task->io_context;
if (ioc && atomic_read(&ioc->nr_tasks) > 1)
@@ -928,14 +915,6 @@ struct cgroup_subsys blkio_subsys = {
.subsys_id = blkio_subsys_id,
.base_cftypes = blkcg_files,
.module = THIS_MODULE,
-
- /*
- * blkio subsystem is utterly broken in terms of hierarchy support.
- * It treats all cgroups equally regardless of where they're
- * located in the hierarchy - all cgroups are treated as if they're
- * right below the root. Fix it and remove the following.
- */
- .broken_hierarchy = true,
};
EXPORT_SYMBOL_GPL(blkio_subsys);
@@ -998,8 +977,6 @@ int blkcg_activate_policy(struct request_queue *q,
ret = PTR_ERR(blkg);
goto out_unlock;
}
- q->root_blkg = blkg;
- q->root_rl.blkg = blkg;
list_for_each_entry(blkg, &q->blkg_list, q_node)
cnt++;
@@ -1152,7 +1129,7 @@ void blkcg_policy_unregister(struct blkcg_policy *pol)
/* kill the intf files first */
if (pol->cftypes)
- cgroup_rm_cftypes(&blkio_subsys, pol->cftypes);
+ cgroup_rm_cftypes(pol->cftypes);
/* unregister and update blkgs */
blkcg_policy[pol->plid] = NULL;
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 4e595ee..86154ea 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -179,22 +179,20 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
void blkg_conf_finish(struct blkg_conf_ctx *ctx);
-static inline struct blkcg *cgroup_to_blkcg(struct cgroup *cgroup)
+static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css)
{
- return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id),
- struct blkcg, css);
+ return css ? container_of(css, struct blkcg, css) : NULL;
}
static inline struct blkcg *task_blkcg(struct task_struct *tsk)
{
- return container_of(task_subsys_state(tsk, blkio_subsys_id),
- struct blkcg, css);
+ return css_to_blkcg(task_css(tsk, blkio_subsys_id));
}
static inline struct blkcg *bio_blkcg(struct bio *bio)
{
if (bio && bio->bi_css)
- return container_of(bio->bi_css, struct blkcg, css);
+ return css_to_blkcg(bio->bi_css);
return task_blkcg(current);
}
@@ -206,9 +204,7 @@ static inline struct blkcg *bio_blkcg(struct bio *bio)
*/
static inline struct blkcg *blkcg_parent(struct blkcg *blkcg)
{
- struct cgroup *pcg = blkcg->css.cgroup->parent;
-
- return pcg ? cgroup_to_blkcg(pcg) : NULL;
+ return css_to_blkcg(css_parent(&blkcg->css));
}
/**
@@ -266,7 +262,7 @@ static inline void blkg_get(struct blkcg_gq *blkg)
blkg->refcnt++;
}
-void __blkg_release(struct blkcg_gq *blkg);
+void __blkg_release_rcu(struct rcu_head *rcu);
/**
* blkg_put - put a blkg reference
@@ -279,9 +275,44 @@ static inline void blkg_put(struct blkcg_gq *blkg)
lockdep_assert_held(blkg->q->queue_lock);
WARN_ON_ONCE(blkg->refcnt <= 0);
if (!--blkg->refcnt)
- __blkg_release(blkg);
+ call_rcu(&blkg->rcu_head, __blkg_release_rcu);
}
+struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q,
+ bool update_hint);
+
+/**
+ * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants
+ * @d_blkg: loop cursor pointing to the current descendant
+ * @pos_css: used for iteration
+ * @p_blkg: target blkg to walk descendants of
+ *
+ * Walk @d_blkg through the descendants of @p_blkg. Must be used with RCU
+ * read locked. If called under either blkcg or queue lock, the iteration
+ * is guaranteed to include all and only online blkgs. The caller may
+ * update @pos_css by calling css_rightmost_descendant() to skip subtree.
+ * @p_blkg is included in the iteration and the first node to be visited.
+ */
+#define blkg_for_each_descendant_pre(d_blkg, pos_css, p_blkg) \
+ css_for_each_descendant_pre((pos_css), &(p_blkg)->blkcg->css) \
+ if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css), \
+ (p_blkg)->q, false)))
+
+/**
+ * blkg_for_each_descendant_post - post-order walk of a blkg's descendants
+ * @d_blkg: loop cursor pointing to the current descendant
+ * @pos_css: used for iteration
+ * @p_blkg: target blkg to walk descendants of
+ *
+ * Similar to blkg_for_each_descendant_pre() but performs post-order
+ * traversal instead. Synchronization rules are the same. @p_blkg is
+ * included in the iteration and the last node to be visited.
+ */
+#define blkg_for_each_descendant_post(d_blkg, pos_css, p_blkg) \
+ css_for_each_descendant_post((pos_css), &(p_blkg)->blkcg->css) \
+ if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css), \
+ (p_blkg)->q, false)))
+
/**
* blk_get_rl - get request_list to use
* @q: request_queue of interest
@@ -371,6 +402,11 @@ struct request_list *__blk_queue_next_rl(struct request_list *rl,
#define blk_queue_for_each_rl(rl, q) \
for ((rl) = &(q)->root_rl; (rl); (rl) = __blk_queue_next_rl((rl), (q)))
+static inline void blkg_stat_init(struct blkg_stat *stat)
+{
+ u64_stats_init(&stat->syncp);
+}
+
/**
* blkg_stat_add - add a value to a blkg_stat
* @stat: target blkg_stat
@@ -399,9 +435,9 @@ static inline uint64_t blkg_stat_read(struct blkg_stat *stat)
uint64_t v;
do {
- start = u64_stats_fetch_begin(&stat->syncp);
+ start = u64_stats_fetch_begin_bh(&stat->syncp);
v = stat->cnt;
- } while (u64_stats_fetch_retry(&stat->syncp, start));
+ } while (u64_stats_fetch_retry_bh(&stat->syncp, start));
return v;
}
@@ -427,6 +463,11 @@ static inline void blkg_stat_merge(struct blkg_stat *to, struct blkg_stat *from)
blkg_stat_add(to, blkg_stat_read(from));
}
+static inline void blkg_rwstat_init(struct blkg_rwstat *rwstat)
+{
+ u64_stats_init(&rwstat->syncp);
+}
+
/**
* blkg_rwstat_add - add a value to a blkg_rwstat
* @rwstat: target blkg_rwstat
@@ -467,9 +508,9 @@ static inline struct blkg_rwstat blkg_rwstat_read(struct blkg_rwstat *rwstat)
struct blkg_rwstat tmp;
do {
- start = u64_stats_fetch_begin(&rwstat->syncp);
+ start = u64_stats_fetch_begin_bh(&rwstat->syncp);
tmp = *rwstat;
- } while (u64_stats_fetch_retry(&rwstat->syncp, start));
+ } while (u64_stats_fetch_retry_bh(&rwstat->syncp, start));
return tmp;
}
@@ -542,7 +583,6 @@ static inline int blkcg_activate_policy(struct request_queue *q,
static inline void blkcg_deactivate_policy(struct request_queue *q,
const struct blkcg_policy *pol) { }
-static inline struct blkcg *cgroup_to_blkcg(struct cgroup *cgroup) { return NULL; }
static inline struct blkcg *bio_blkcg(struct bio *bio) { return NULL; }
static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg,
diff --git a/block/blk-core.c b/block/blk-core.c
index d5745b5..8bdd012 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -16,6 +16,7 @@
#include <linux/backing-dev.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
+#include <linux/blk-mq.h>
#include <linux/highmem.h>
#include <linux/mm.h>
#include <linux/kernel_stat.h>
@@ -48,7 +49,7 @@ DEFINE_IDA(blk_queue_ida);
/*
* For the allocated request tables
*/
-static struct kmem_cache *request_cachep;
+struct kmem_cache *request_cachep = NULL;
/*
* For queue allocation
@@ -60,42 +61,6 @@ struct kmem_cache *blk_requestq_cachep;
*/
static struct workqueue_struct *kblockd_workqueue;
-static void drive_stat_acct(struct request *rq, int new_io)
-{
- struct hd_struct *part;
- int rw = rq_data_dir(rq);
- int cpu;
-
- if (!blk_do_io_stat(rq))
- return;
-
- cpu = part_stat_lock();
-
- if (!new_io) {
- part = rq->part;
- part_stat_inc(cpu, part, merges[rw]);
- } else {
- part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
- if (!hd_struct_try_get(part)) {
- /*
- * The partition is already being removed,
- * the request will be accounted on the disk only
- *
- * We take a reference on disk->part0 although that
- * partition will never be deleted, so we can treat
- * it as any other partition.
- */
- part = &rq->rq_disk->part0;
- hd_struct_get(part);
- }
- part_round_stats(cpu, part);
- part_inc_in_flight(part, rw);
- rq->part = part;
- }
-
- part_stat_unlock();
-}
-
void blk_queue_congestion_threshold(struct request_queue *q)
{
int nr;
@@ -145,7 +110,6 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
rq->cmd = rq->__cmd;
rq->cmd_len = BLK_MAX_CDB;
rq->tag = -1;
- rq->ref_count = 1;
rq->start_time = jiffies;
set_start_time_ns(rq);
rq->part = NULL;
@@ -174,9 +138,9 @@ void blk_dump_rq_flags(struct request *rq, char *msg)
{
int bit;
- printk(KERN_INFO "%s: dev %s: type=%x, flags=%x\n", msg,
+ printk(KERN_INFO "%s: dev %s: type=%x, flags=%llx\n", msg,
rq->rq_disk ? rq->rq_disk->disk_name : "?", rq->cmd_type,
- rq->cmd_flags);
+ (unsigned long long) rq->cmd_flags);
printk(KERN_INFO " sector %llu, nr/cnr %u/%u\n",
(unsigned long long)blk_rq_pos(rq),
@@ -595,9 +559,12 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
if (!q)
return NULL;
+ if (percpu_counter_init(&q->mq_usage_counter, 0))
+ goto fail_q;
+
q->id = ida_simple_get(&blk_queue_ida, 0, 0, gfp_mask);
if (q->id < 0)
- goto fail_q;
+ goto fail_c;
q->backing_dev_info.ra_pages =
(VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
@@ -644,13 +611,19 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
q->bypass_depth = 1;
__set_bit(QUEUE_FLAG_BYPASS, &q->queue_flags);
+ init_waitqueue_head(&q->mq_freeze_wq);
+
if (blkcg_init_queue(q))
- goto fail_id;
+ goto fail_bdi;
return q;
+fail_bdi:
+ bdi_destroy(&q->backing_dev_info);
fail_id:
ida_simple_remove(&blk_queue_ida, q->id);
+fail_c:
+ percpu_counter_destroy(&q->mq_usage_counter);
fail_q:
kmem_cache_free(blk_requestq_cachep, q);
return NULL;
@@ -739,9 +712,17 @@ blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn,
q->sg_reserved_size = INT_MAX;
+ /* Protect q->elevator from elevator_change */
+ mutex_lock(&q->sysfs_lock);
+
/* init elevator */
- if (elevator_init(q, NULL))
+ if (elevator_init(q, NULL)) {
+ mutex_unlock(&q->sysfs_lock);
return NULL;
+ }
+
+ mutex_unlock(&q->sysfs_lock);
+
return q;
}
EXPORT_SYMBOL(blk_init_allocated_queue);
@@ -1109,7 +1090,8 @@ retry:
goto retry;
}
-struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask)
+static struct request *blk_old_get_request(struct request_queue *q, int rw,
+ gfp_t gfp_mask)
{
struct request *rq;
@@ -1126,6 +1108,14 @@ struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask)
return rq;
}
+
+struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask)
+{
+ if (q->mq_ops)
+ return blk_mq_alloc_request(q, rw, gfp_mask, false);
+ else
+ return blk_old_get_request(q, rw, gfp_mask);
+}
EXPORT_SYMBOL(blk_get_request);
/**
@@ -1211,7 +1201,7 @@ EXPORT_SYMBOL(blk_requeue_request);
static void add_acct_request(struct request_queue *q, struct request *rq,
int where)
{
- drive_stat_acct(rq, 1);
+ blk_account_io_start(rq, true);
__elv_add_request(q, rq, where);
}
@@ -1272,8 +1262,6 @@ void __blk_put_request(struct request_queue *q, struct request *req)
{
if (unlikely(!q))
return;
- if (unlikely(--req->ref_count))
- return;
blk_pm_put_request(req);
@@ -1302,12 +1290,17 @@ EXPORT_SYMBOL_GPL(__blk_put_request);
void blk_put_request(struct request *req)
{
- unsigned long flags;
struct request_queue *q = req->q;
- spin_lock_irqsave(q->queue_lock, flags);
- __blk_put_request(q, req);
- spin_unlock_irqrestore(q->queue_lock, flags);
+ if (q->mq_ops)
+ blk_mq_free_request(req);
+ else {
+ unsigned long flags;
+
+ spin_lock_irqsave(q->queue_lock, flags);
+ __blk_put_request(q, req);
+ spin_unlock_irqrestore(q->queue_lock, flags);
+ }
}
EXPORT_SYMBOL(blk_put_request);
@@ -1343,8 +1336,8 @@ void blk_add_request_payload(struct request *rq, struct page *page,
}
EXPORT_SYMBOL_GPL(blk_add_request_payload);
-static bool bio_attempt_back_merge(struct request_queue *q, struct request *req,
- struct bio *bio)
+bool bio_attempt_back_merge(struct request_queue *q, struct request *req,
+ struct bio *bio)
{
const int ff = bio->bi_rw & REQ_FAILFAST_MASK;
@@ -1361,12 +1354,12 @@ static bool bio_attempt_back_merge(struct request_queue *q, struct request *req,
req->__data_len += bio->bi_size;
req->ioprio = ioprio_best(req->ioprio, bio_prio(bio));
- drive_stat_acct(req, 0);
+ blk_account_io_start(req, false);
return true;
}
-static bool bio_attempt_front_merge(struct request_queue *q,
- struct request *req, struct bio *bio)
+bool bio_attempt_front_merge(struct request_queue *q, struct request *req,
+ struct bio *bio)
{
const int ff = bio->bi_rw & REQ_FAILFAST_MASK;
@@ -1391,12 +1384,12 @@ static bool bio_attempt_front_merge(struct request_queue *q,
req->__data_len += bio->bi_size;
req->ioprio = ioprio_best(req->ioprio, bio_prio(bio));
- drive_stat_acct(req, 0);
+ blk_account_io_start(req, false);
return true;
}
/**
- * attempt_plug_merge - try to merge with %current's plugged list
+ * blk_attempt_plug_merge - try to merge with %current's plugged list
* @q: request_queue new bio is being queued at
* @bio: new bio being queued
* @request_count: out parameter for number of traversed plugged requests
@@ -1412,19 +1405,28 @@ static bool bio_attempt_front_merge(struct request_queue *q,
* reliable access to the elevator outside queue lock. Only check basic
* merging parameters without querying the elevator.
*/
-static bool attempt_plug_merge(struct request_queue *q, struct bio *bio,
- unsigned int *request_count)
+bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
+ unsigned int *request_count)
{
struct blk_plug *plug;
struct request *rq;
bool ret = false;
+ struct list_head *plug_list;
+
+ if (blk_queue_nomerges(q))
+ goto out;
plug = current->plug;
if (!plug)
goto out;
*request_count = 0;
- list_for_each_entry_reverse(rq, &plug->list, queuelist) {
+ if (q->mq_ops)
+ plug_list = &plug->mq_list;
+ else
+ plug_list = &plug->list;
+
+ list_for_each_entry_reverse(rq, plug_list, queuelist) {
int el_ret;
if (rq->q == q)
@@ -1492,7 +1494,7 @@ void blk_queue_bio(struct request_queue *q, struct bio *bio)
* Check if we can merge with the plugged list before grabbing
* any locks.
*/
- if (attempt_plug_merge(q, bio, &request_count))
+ if (blk_attempt_plug_merge(q, bio, &request_count))
return;
spin_lock_irq(q->queue_lock);
@@ -1549,11 +1551,9 @@ get_rq:
if (plug) {
/*
* If this is the first request added after a plug, fire
- * of a plug trace. If others have been added before, check
- * if we have multiple devices in this plug. If so, make a
- * note to sort the list before dispatch.
+ * off a plug trace.
*/
- if (list_empty(&plug->list))
+ if (!request_count)
trace_block_plug(q);
else {
if (request_count >= BLK_MAX_REQUEST_COUNT) {
@@ -1562,7 +1562,7 @@ get_rq:
}
}
list_add_tail(&req->queuelist, &plug->list);
- drive_stat_acct(req, 1);
+ blk_account_io_start(req, true);
} else {
spin_lock_irq(q->queue_lock);
add_acct_request(q, req, where);
@@ -2016,7 +2016,7 @@ unsigned int blk_rq_err_bytes(const struct request *rq)
}
EXPORT_SYMBOL_GPL(blk_rq_err_bytes);
-static void blk_account_io_completion(struct request *req, unsigned int bytes)
+void blk_account_io_completion(struct request *req, unsigned int bytes)
{
if (blk_do_io_stat(req)) {
const int rw = rq_data_dir(req);
@@ -2030,7 +2030,7 @@ static void blk_account_io_completion(struct request *req, unsigned int bytes)
}
}
-static void blk_account_io_done(struct request *req)
+void blk_account_io_done(struct request *req)
{
/*
* Account IO completion. flush_rq isn't accounted as a
@@ -2078,6 +2078,42 @@ static inline struct request *blk_pm_peek_request(struct request_queue *q,
}
#endif
+/*
+ * Account the start of I/O for @rq in the partition statistics.
+ *
+ * Does nothing when blk_do_io_stat() says @rq is not to be accounted.
+ * With @new_io false only the merges[] counter of the partition already
+ * recorded in rq->part is bumped. With @new_io true the partition is
+ * looked up from the request's start sector, a reference is taken on it,
+ * its in-flight count is incremented and it is stored in rq->part.
+ */
+void blk_account_io_start(struct request *rq, bool new_io)
+{
+ struct hd_struct *part;
+ int rw = rq_data_dir(rq);
+ int cpu;
+
+ if (!blk_do_io_stat(rq))
+ return;
+
+ cpu = part_stat_lock();
+
+ if (!new_io) {
+ part = rq->part;
+ part_stat_inc(cpu, part, merges[rw]);
+ } else {
+ part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
+ if (!hd_struct_try_get(part)) {
+ /*
+ * The partition is already being removed,
+ * the request will be accounted on the disk only
+ *
+ * We take a reference on disk->part0 although that
+ * partition will never be deleted, so we can treat
+ * it as any other partition.
+ */
+ part = &rq->rq_disk->part0;
+ hd_struct_get(part);
+ }
+ part_round_stats(cpu, part);
+ part_inc_in_flight(part, rw);
+ rq->part = part;
+ }
+
+ part_stat_unlock();
+}
+
/**
* blk_peek_request - peek at the top of a request queue
* @q: request queue to peek at
@@ -2229,6 +2265,7 @@ void blk_start_request(struct request *req)
if (unlikely(blk_bidi_rq(req)))
req->next_rq->resid_len = blk_rq_bytes(req->next_rq);
+ BUG_ON(test_bit(REQ_ATOM_COMPLETE, &req->atomic_flags));
blk_add_timer(req);
}
EXPORT_SYMBOL(blk_start_request);
@@ -2315,6 +2352,15 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes)
case -EBADE:
error_type = "critical nexus";
break;
+ case -ETIMEDOUT:
+ error_type = "timeout";
+ break;
+ case -ENOSPC:
+ error_type = "critical space allocation";
+ break;
+ case -ENODATA:
+ error_type = "critical medium";
+ break;
case -EIO:
default:
error_type = "I/O";
@@ -2444,7 +2490,6 @@ static void blk_finish_request(struct request *req, int error)
if (req->cmd_flags & REQ_DONTPREP)
blk_unprep_request(req);
-
blk_account_io_done(req);
if (req->end_io)
@@ -2866,6 +2911,7 @@ void blk_start_plug(struct blk_plug *plug)
plug->magic = PLUG_MAGIC;
INIT_LIST_HEAD(&plug->list);
+ INIT_LIST_HEAD(&plug->mq_list);
INIT_LIST_HEAD(&plug->cb_list);
/*
@@ -2963,6 +3009,10 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
BUG_ON(plug->magic != PLUG_MAGIC);
flush_plug_callbacks(plug, from_schedule);
+
+ if (!list_empty(&plug->mq_list))
+ blk_mq_flush_plug_list(plug, from_schedule);
+
if (list_empty(&plug->list))
return;
@@ -3180,7 +3230,8 @@ int __init blk_dev_init(void)
/* used for unplugging and affects IO latency/throughput - HIGHPRI */
kblockd_workqueue = alloc_workqueue("kblockd",
- WQ_MEM_RECLAIM | WQ_HIGHPRI, 0);
+ WQ_MEM_RECLAIM | WQ_HIGHPRI |
+ WQ_POWER_EFFICIENT, 0);
if (!kblockd_workqueue)
panic("Failed to create kblockd\n");
diff --git a/block/blk-exec.c b/block/blk-exec.c
index e706213..c3edf9d 100644
--- a/block/blk-exec.c
+++ b/block/blk-exec.c
@@ -5,6 +5,7 @@
#include <linux/module.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
+#include <linux/blk-mq.h>
#include <linux/sched/sysctl.h>
#include "blk.h"
@@ -24,7 +25,6 @@ static void blk_end_sync_rq(struct request *rq, int error)
struct completion *waiting = rq->end_io_data;
rq->end_io_data = NULL;
- __blk_put_request(rq->q, rq);
/*
* complete last, if this is a stack request the process (and thus
@@ -59,6 +59,12 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
rq->rq_disk = bd_disk;
rq->end_io = done;
+
+ if (q->mq_ops) {
+ blk_mq_insert_request(q, rq, true);
+ return;
+ }
+
/*
* need to check this before __blk_run_queue(), because rq can
* be freed before that returns.
@@ -68,9 +74,9 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
spin_lock_irq(q->queue_lock);
if (unlikely(blk_queue_dying(q))) {
+ rq->cmd_flags |= REQ_QUIET;
rq->errors = -ENXIO;
- if (rq->end_io)
- rq->end_io(rq, rq->errors);
+ __blk_end_request_all(rq, rq->errors);
spin_unlock_irq(q->queue_lock);
return;
}
@@ -103,12 +109,6 @@ int blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk,
int err = 0;
unsigned long hang_check;
- /*
- * we need an extra reference to the request, so we can look at
- * it after io completion
- */
- rq->ref_count++;
-
if (!rq->sense) {
memset(sense, 0, sizeof(sense));
rq->sense = sense;
diff --git a/block/blk-flush.c b/block/blk-flush.c
index cc2b827..fb6f3c0 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -69,8 +69,10 @@
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/gfp.h>
+#include <linux/blk-mq.h>
#include "blk.h"
+#include "blk-mq.h"
/* FLUSH/FUA sequences */
enum {
@@ -124,6 +126,24 @@ static void blk_flush_restore_request(struct request *rq)
/* make @rq a normal request */
rq->cmd_flags &= ~REQ_FLUSH_SEQ;
rq->end_io = rq->flush.saved_end_io;
+
+ blk_clear_rq_complete(rq);
+}
+
+/*
+ * Work handler queued by blk_mq_flush_data_insert(): recover the request
+ * embedding @work and hand it to blk_mq_run_request().
+ */
+static void mq_flush_data_run(struct work_struct *work)
+{
+ struct request *rq;
+
+ rq = container_of(work, struct request, mq_flush_data);
+
+ /* NOTE(review): presumably csd shares storage with mq_flush_data - confirm */
+ memset(&rq->csd, 0, sizeof(rq->csd));
+ blk_mq_run_request(rq, true, false);
+}
+
+/*
+ * Defer issuing the data part of a flush sequence on a blk-mq queue:
+ * the request's embedded work item is scheduled on kblockd and
+ * mq_flush_data_run() submits it from there.
+ */
+static void blk_mq_flush_data_insert(struct request *rq)
+{
+ INIT_WORK(&rq->mq_flush_data, mq_flush_data_run);
+ kblockd_schedule_work(rq->q, &rq->mq_flush_data);
}
/**
@@ -136,7 +156,7 @@ static void blk_flush_restore_request(struct request *rq)
* completion and trigger the next step.
*
* CONTEXT:
- * spin_lock_irq(q->queue_lock)
+ * spin_lock_irq(q->queue_lock or q->mq_flush_lock)
*
* RETURNS:
* %true if requests were added to the dispatch queue, %false otherwise.
@@ -146,7 +166,7 @@ static bool blk_flush_complete_seq(struct request *rq, unsigned int seq,
{
struct request_queue *q = rq->q;
struct list_head *pending = &q->flush_queue[q->flush_pending_idx];
- bool queued = false;
+ bool queued = false, kicked;
BUG_ON(rq->flush.seq & seq);
rq->flush.seq |= seq;
@@ -167,8 +187,12 @@ static bool blk_flush_complete_seq(struct request *rq, unsigned int seq,
case REQ_FSEQ_DATA:
list_move_tail(&rq->flush.list, &q->flush_data_in_flight);
- list_add(&rq->queuelist, &q->queue_head);
- queued = true;
+ if (q->mq_ops)
+ blk_mq_flush_data_insert(rq);
+ else {
+ list_add(&rq->queuelist, &q->queue_head);
+ queued = true;
+ }
break;
case REQ_FSEQ_DONE:
@@ -181,28 +205,43 @@ static bool blk_flush_complete_seq(struct request *rq, unsigned int seq,
BUG_ON(!list_empty(&rq->queuelist));
list_del_init(&rq->flush.list);
blk_flush_restore_request(rq);
- __blk_end_request_all(rq, error);
+ if (q->mq_ops)
+ blk_mq_end_io(rq, error);
+ else
+ __blk_end_request_all(rq, error);
break;
default:
BUG();
}
- return blk_kick_flush(q) | queued;
+ kicked = blk_kick_flush(q);
+ /* blk_mq_run_flush will run queue */
+ if (q->mq_ops)
+ return queued;
+ return kicked | queued;
}
static void flush_end_io(struct request *flush_rq, int error)
{
struct request_queue *q = flush_rq->q;
- struct list_head *running = &q->flush_queue[q->flush_running_idx];
+ struct list_head *running;
bool queued = false;
struct request *rq, *n;
+ unsigned long flags = 0;
+ if (q->mq_ops) {
+ blk_mq_free_request(flush_rq);
+ spin_lock_irqsave(&q->mq_flush_lock, flags);
+ }
+ running = &q->flush_queue[q->flush_running_idx];
BUG_ON(q->flush_pending_idx == q->flush_running_idx);
/* account completion of the flush request */
q->flush_running_idx ^= 1;
- elv_completed_request(q, flush_rq);
+
+ if (!q->mq_ops)
+ elv_completed_request(q, flush_rq);
/* and push the waiting requests to the next stage */
list_for_each_entry_safe(rq, n, running, flush.list) {
@@ -223,9 +262,48 @@ static void flush_end_io(struct request *flush_rq, int error)
* directly into request_fn may confuse the driver. Always use
* kblockd.
*/
- if (queued || q->flush_queue_delayed)
- blk_run_queue_async(q);
+ if (queued || q->flush_queue_delayed) {
+ if (!q->mq_ops)
+ blk_run_queue_async(q);
+ else
+ /*
+ * This can be optimized to only run queues with requests
+ * queued if necessary.
+ */
+ blk_mq_run_queues(q, true);
+ }
q->flush_queue_delayed = 0;
+ if (q->mq_ops)
+ spin_unlock_irqrestore(&q->mq_flush_lock, flags);
+}
+
+/*
+ * Work handler scheduled by mq_run_flush(): allocate a flush request on
+ * the queue embedding @work, route its completion to flush_end_io() and
+ * submit it.
+ */
+static void mq_flush_work(struct work_struct *work)
+{
+ struct request_queue *q;
+ struct request *rq;
+
+ q = container_of(work, struct request_queue, mq_flush_work);
+
+ /* Setting REQ_FLUSH_SEQ is not strictly needed; it is done for consistency */
+ /* NOTE(review): rq is dereferenced unchecked - confirm the allocation cannot return NULL */
+ rq = blk_mq_alloc_request(q, WRITE_FLUSH|REQ_FLUSH_SEQ,
+ __GFP_WAIT|GFP_ATOMIC, true);
+ rq->cmd_type = REQ_TYPE_FS;
+ rq->end_io = flush_end_io;
+
+ blk_mq_run_request(rq, true, false);
+}
+
+/*
+ * We can't use q->flush_rq directly, because it has no tag and is not in
+ * hctx->rqs[], so a new request must be allocated. Since we can't sleep
+ * here, the work is offloaded to a workqueue (see mq_flush_work()).
+ *
+ * Note: we assume that a flush request finished in any hardware queue
+ * flushes the whole disk cache.
+ */
+static void mq_run_flush(struct request_queue *q)
+{
+ kblockd_schedule_work(q, &q->mq_flush_work);
}
/**
@@ -236,7 +314,7 @@ static void flush_end_io(struct request *flush_rq, int error)
* Please read the comment at the top of this file for more info.
*
* CONTEXT:
- * spin_lock_irq(q->queue_lock)
+ * spin_lock_irq(q->queue_lock or q->mq_flush_lock)
*
* RETURNS:
* %true if flush was issued, %false otherwise.
@@ -261,13 +339,18 @@ static bool blk_kick_flush(struct request_queue *q)
* Issue flush and toggle pending_idx. This makes pending_idx
* different from running_idx, which means flush is in flight.
*/
+ q->flush_pending_idx ^= 1;
+ if (q->mq_ops) {
+ mq_run_flush(q);
+ return true;
+ }
+
blk_rq_init(q, &q->flush_rq);
q->flush_rq.cmd_type = REQ_TYPE_FS;
q->flush_rq.cmd_flags = WRITE_FLUSH | REQ_FLUSH_SEQ;
q->flush_rq.rq_disk = first_rq->rq_disk;
q->flush_rq.end_io = flush_end_io;
- q->flush_pending_idx ^= 1;
list_add_tail(&q->flush_rq.queuelist, &q->queue_head);
return true;
}
@@ -284,16 +367,37 @@ static void flush_data_end_io(struct request *rq, int error)
blk_run_queue_async(q);
}
+/*
+ * blk-mq completion handler for the data step of a flush sequence
+ * (installed as rq->end_io by blk_insert_flush()). Advances the sequence
+ * under q->mq_flush_lock and runs the hardware queue mapped to the
+ * request's software context when blk_flush_complete_seq() reports that
+ * requests were queued.
+ */
+static void mq_flush_data_end_io(struct request *rq, int error)
+{
+ struct request_queue *q = rq->q;
+ struct blk_mq_hw_ctx *hctx;
+ struct blk_mq_ctx *ctx;
+ unsigned long flags;
+
+ ctx = rq->mq_ctx;
+ hctx = q->mq_ops->map_queue(q, ctx->cpu);
+
+ /*
+ * After populating an empty queue, kick it to avoid stall. Read
+ * the comment in flush_end_io().
+ */
+ spin_lock_irqsave(&q->mq_flush_lock, flags);
+ if (blk_flush_complete_seq(rq, REQ_FSEQ_DATA, error))
+ blk_mq_run_hw_queue(hctx, true);
+ spin_unlock_irqrestore(&q->mq_flush_lock, flags);
+}
+
/**
* blk_insert_flush - insert a new FLUSH/FUA request
* @rq: request to insert
*
* To be called from __elv_add_request() for %ELEVATOR_INSERT_FLUSH insertions.
+ * or __blk_mq_run_hw_queue() to dispatch request.
* @rq is being submitted. Analyze what needs to be done and put it on the
* right queue.
*
* CONTEXT:
- * spin_lock_irq(q->queue_lock)
+ * spin_lock_irq(q->queue_lock) in !mq case
*/
void blk_insert_flush(struct request *rq)
{
@@ -316,7 +420,10 @@ void blk_insert_flush(struct request *rq)
* complete the request.
*/
if (!policy) {
- __blk_end_bidi_request(rq, 0, 0, 0);
+ if (q->mq_ops)
+ blk_mq_end_io(rq, 0);
+ else
+ __blk_end_bidi_request(rq, 0, 0, 0);
return;
}
@@ -329,7 +436,10 @@ void blk_insert_flush(struct request *rq)
*/
if ((policy & REQ_FSEQ_DATA) &&
!(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) {
- list_add_tail(&rq->queuelist, &q->queue_head);
+ if (q->mq_ops) {
+ blk_mq_run_request(rq, false, true);
+ } else
+ list_add_tail(&rq->queuelist, &q->queue_head);
return;
}
@@ -341,6 +451,14 @@ void blk_insert_flush(struct request *rq)
INIT_LIST_HEAD(&rq->flush.list);
rq->cmd_flags |= REQ_FLUSH_SEQ;
rq->flush.saved_end_io = rq->end_io; /* Usually NULL */
+ if (q->mq_ops) {
+ rq->end_io = mq_flush_data_end_io;
+
+ spin_lock_irq(&q->mq_flush_lock);
+ blk_flush_complete_seq(rq, REQ_FSEQ_ACTIONS & ~policy, 0);
+ spin_unlock_irq(&q->mq_flush_lock);
+ return;
+ }
rq->end_io = flush_data_end_io;
blk_flush_complete_seq(rq, REQ_FSEQ_ACTIONS & ~policy, 0);
@@ -384,15 +502,6 @@ void blk_abort_flushes(struct request_queue *q)
}
}
-static void bio_end_flush(struct bio *bio, int err)
-{
- if (err)
- clear_bit(BIO_UPTODATE, &bio->bi_flags);
- if (bio->bi_private)
- complete(bio->bi_private);
- bio_put(bio);
-}
-
/**
* blkdev_issue_flush - queue a flush
* @bdev: blockdev to issue flush for
@@ -408,7 +517,6 @@ static void bio_end_flush(struct bio *bio, int err)
int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask,
sector_t *error_sector)
{
- DECLARE_COMPLETION_ONSTACK(wait);
struct request_queue *q;
struct bio *bio;
int ret = 0;
@@ -430,13 +538,9 @@ int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask,
return -ENXIO;
bio = bio_alloc(gfp_mask, 0);
- bio->bi_end_io = bio_end_flush;
bio->bi_bdev = bdev;
- bio->bi_private = &wait;
- bio_get(bio);
- submit_bio(WRITE_FLUSH, bio);
- wait_for_completion_io(&wait);
+ ret = submit_bio_wait(WRITE_FLUSH, bio);
/*
* The driver must store the error location in ->bi_sector, if
@@ -446,10 +550,13 @@ int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask,
if (error_sector)
*error_sector = bio->bi_sector;
- if (!bio_flagged(bio, BIO_UPTODATE))
- ret = -EIO;
-
bio_put(bio);
return ret;
}
EXPORT_SYMBOL(blkdev_issue_flush);
+
+/*
+ * Set up the blk-mq flush state of @q: the lock serialising the flush
+ * state machine and the work item that issues flush requests.
+ */
+void blk_mq_init_flush(struct request_queue *q)
+{
+ spin_lock_init(&q->mq_flush_lock);
+ INIT_WORK(&q->mq_flush_work, mq_flush_work);
+}
diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index 9c4bb82..242df01 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -6,7 +6,6 @@
#include <linux/init.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
-#include <linux/bootmem.h> /* for max_pfn/max_low_pfn */
#include <linux/slab.h>
#include "blk.h"
@@ -144,7 +143,8 @@ void put_io_context(struct io_context *ioc)
if (atomic_long_dec_and_test(&ioc->refcount)) {
spin_lock_irqsave(&ioc->lock, flags);
if (!hlist_empty(&ioc->icq_list))
- schedule_work(&ioc->release_work);
+ queue_work(system_power_efficient_wq,
+ &ioc->release_work);
else
free_ioc = true;
spin_unlock_irqrestore(&ioc->lock, flags);
@@ -366,7 +366,7 @@ struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q,
if (!icq)
return NULL;
- if (radix_tree_preload(gfp_mask) < 0) {
+ if (radix_tree_maybe_preload(gfp_mask) < 0) {
kmem_cache_free(et->icq_cache, icq);
return NULL;
}
diff --git a/block/blk-iopoll.c b/block/blk-iopoll.c
index 58916af..1855bf5 100644
--- a/block/blk-iopoll.c
+++ b/block/blk-iopoll.c
@@ -35,7 +35,7 @@ void blk_iopoll_sched(struct blk_iopoll *iop)
unsigned long flags;
local_irq_save(flags);
- list_add_tail(&iop->list, &__get_cpu_var(blk_cpu_iopoll));
+ list_add_tail(&iop->list, this_cpu_ptr(&blk_cpu_iopoll));
__raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ);
local_irq_restore(flags);
}
@@ -79,7 +79,7 @@ EXPORT_SYMBOL(blk_iopoll_complete);
static void blk_iopoll_softirq(struct softirq_action *h)
{
- struct list_head *list = &__get_cpu_var(blk_cpu_iopoll);
+ struct list_head *list = this_cpu_ptr(&blk_cpu_iopoll);
int rearm = 0, budget = blk_iopoll_budget;
unsigned long start_time = jiffies;
@@ -189,8 +189,8 @@ void blk_iopoll_init(struct blk_iopoll *iop, int weight, blk_iopoll_fn *poll_fn)
}
EXPORT_SYMBOL(blk_iopoll_init);
-static int __cpuinit blk_iopoll_cpu_notify(struct notifier_block *self,
- unsigned long action, void *hcpu)
+static int blk_iopoll_cpu_notify(struct notifier_block *self,
+ unsigned long action, void *hcpu)
{
/*
* If a CPU goes away, splice its entries to the current CPU
@@ -201,7 +201,7 @@ static int __cpuinit blk_iopoll_cpu_notify(struct notifier_block *self,
local_irq_disable();
list_splice_init(&per_cpu(blk_cpu_iopoll, cpu),
- &__get_cpu_var(blk_cpu_iopoll));
+ this_cpu_ptr(&blk_cpu_iopoll));
__raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ);
local_irq_enable();
}
@@ -209,7 +209,7 @@ static int __cpuinit blk_iopoll_cpu_notify(struct notifier_block *self,
return NOTIFY_OK;
}
-static struct notifier_block __cpuinitdata blk_iopoll_cpu_notifier = {
+static struct notifier_block blk_iopoll_cpu_notifier = {
.notifier_call = blk_iopoll_cpu_notify,
};
diff --git a/block/blk-lib.c b/block/blk-lib.c
index d6f50d5..9b5b561 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -43,8 +43,8 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
DECLARE_COMPLETION_ONSTACK(wait);
struct request_queue *q = bdev_get_queue(bdev);
int type = REQ_WRITE | REQ_DISCARD;
- sector_t max_discard_sectors;
- sector_t granularity, alignment;
+ unsigned int max_discard_sectors, granularity;
+ int alignment;
struct bio_batch bb;
struct bio *bio;
int ret = 0;
@@ -58,16 +58,14 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
/* Zero-sector (unknown) and one-sector granularities are the same. */
granularity = max(q->limits.discard_granularity >> 9, 1U);
- alignment = bdev_discard_alignment(bdev) >> 9;
- alignment = sector_div(alignment, granularity);
+ alignment = (bdev_discard_alignment(bdev) >> 9) % granularity;
/*
* Ensure that max_discard_sectors is of the proper
* granularity, so that requests stay aligned after a split.
*/
max_discard_sectors = min(q->limits.max_discard_sectors, UINT_MAX >> 9);
- sector_div(max_discard_sectors, granularity);
- max_discard_sectors *= granularity;
+ max_discard_sectors -= max_discard_sectors % granularity;
if (unlikely(!max_discard_sectors)) {
/* Avoid infinite loop below. Being cautious never hurts. */
return -EOPNOTSUPP;
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 5f24482..1ffc589 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -308,6 +308,17 @@ int ll_front_merge_fn(struct request_queue *q, struct request *req,
return ll_new_hw_segment(q, req, bio);
}
+/*
+ * Report whether @req carries a prepared command that rules out merging.
+ * On blk-mq queues req->special holds the ordinary per-request driver
+ * payload, so only non-mq queues treat a set ->special as "do not merge".
+ */
+static bool req_no_special_merge(struct request *req)
+{
+	if (req->q->mq_ops)
+		return false;
+
+	return req->special != NULL;
+}
+
static int ll_merge_requests_fn(struct request_queue *q, struct request *req,
struct request *next)
{
@@ -319,7 +330,7 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req,
* First check if the either of the requests are re-queued
* requests. Can't merge them if they are.
*/
- if (req->special || next->special)
+ if (req_no_special_merge(req) || req_no_special_merge(next))
return 0;
/*
@@ -416,7 +427,7 @@ static int attempt_merge(struct request_queue *q, struct request *req,
if (rq_data_dir(req) != rq_data_dir(next)
|| req->rq_disk != next->rq_disk
- || next->special)
+ || req_no_special_merge(next))
return 0;
if (req->cmd_flags & REQ_WRITE_SAME &&
@@ -515,7 +526,7 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
return false;
/* must be same device and not a special request */
- if (rq->rq_disk != bio->bi_bdev->bd_disk || rq->special)
+ if (rq->rq_disk != bio->bi_bdev->bd_disk || req_no_special_merge(rq))
return false;
/* only merge integrity protected bio into ditto rq */
diff --git a/block/blk-mq-cpu.c b/block/blk-mq-cpu.c
new file mode 100644
index 0000000..0045ace
--- /dev/null
+++ b/block/blk-mq-cpu.c
@@ -0,0 +1,93 @@
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/blkdev.h>
+#include <linux/list.h>
+#include <linux/llist.h>
+#include <linux/smp.h>
+#include <linux/cpu.h>
+
+#include <linux/blk-mq.h>
+#include "blk-mq.h"
+
+static LIST_HEAD(blk_mq_cpu_notify_list);
+static DEFINE_SPINLOCK(blk_mq_cpu_notify_lock);
+
+/*
+ * CPU hotplug notifier callback. Forwards @action, together with the CPU
+ * number carried in @hcpu, to every notifier on blk_mq_cpu_notify_list
+ * while holding blk_mq_cpu_notify_lock. Always returns NOTIFY_OK.
+ */
+static int blk_mq_main_cpu_notify(struct notifier_block *self,
+ unsigned long action, void *hcpu)
+{
+ unsigned int cpu = (unsigned long) hcpu;
+ struct blk_mq_cpu_notifier *notify;
+
+ spin_lock(&blk_mq_cpu_notify_lock);
+
+ list_for_each_entry(notify, &blk_mq_cpu_notify_list, list)
+ notify->notify(notify->data, action, cpu);
+
+ spin_unlock(&blk_mq_cpu_notify_lock);
+ return NOTIFY_OK;
+}
+
+/*
+ * Built-in hotplug handler. When @cpu is reported dead (CPU_DEAD or
+ * CPU_DEAD_FROZEN), take over its ipi_lists entries and end each request
+ * found there with its recorded rq->errors. Other actions are ignored;
+ * @data is unused.
+ */
+static void blk_mq_cpu_notify(void *data, unsigned long action,
+ unsigned int cpu)
+{
+ if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
+ /*
+ * If the CPU goes away, ensure that we run any pending
+ * completions.
+ */
+ struct llist_node *node;
+ struct request *rq;
+
+ local_irq_disable();
+
+ node = llist_del_all(&per_cpu(ipi_lists, cpu));
+ while (node) {
+ /* fetch the successor first; rq is not touched after completion */
+ struct llist_node *next = node->next;
+
+ rq = llist_entry(node, struct request, ll_list);
+ __blk_mq_end_io(rq, rq->errors);
+ node = next;
+ }
+
+ local_irq_enable();
+ }
+}
+
+/*
+ * Hotplug notifier block registered by blk_mq_cpu_init(). It is consulted
+ * for the lifetime of the system, so it carries no init-section annotation.
+ */
+static struct notifier_block blk_mq_main_cpu_notifier = {
+	.notifier_call	= blk_mq_main_cpu_notify,
+};
+
+/*
+ * Append @notifier to the list of blk-mq CPU notifiers. The notifier must
+ * already have its ->notify callback set (BUG otherwise).
+ */
+void blk_mq_register_cpu_notifier(struct blk_mq_cpu_notifier *notifier)
+{
+ BUG_ON(!notifier->notify);
+
+ spin_lock(&blk_mq_cpu_notify_lock);
+ list_add_tail(&notifier->list, &blk_mq_cpu_notify_list);
+ spin_unlock(&blk_mq_cpu_notify_lock);
+}
+
+/* Remove @notifier from the list of blk-mq CPU notifiers. */
+void blk_mq_unregister_cpu_notifier(struct blk_mq_cpu_notifier *notifier)
+{
+ spin_lock(&blk_mq_cpu_notify_lock);
+ list_del(&notifier->list);
+ spin_unlock(&blk_mq_cpu_notify_lock);
+}
+
+/*
+ * Fill in @notifier with the callback @fn and the opaque @data cookie that
+ * is passed back to @fn. This does not register the notifier.
+ */
+void blk_mq_init_cpu_notifier(struct blk_mq_cpu_notifier *notifier,
+			      void (*fn)(void *, unsigned long, unsigned int),
+			      void *data)
+{
+	notifier->data = data;
+	notifier->notify = fn;
+}
+
+/*
+ * Built-in notifier that completes requests stranded on a dead CPU. It is
+ * linked on blk_mq_cpu_notify_list by blk_mq_cpu_init() and walked on every
+ * later hotplug event, so it must not live in an init section.
+ */
+static struct blk_mq_cpu_notifier cpu_notifier = {
+	.notify	= blk_mq_cpu_notify,
+};
+
+/*
+ * Boot-time setup: hook blk-mq into CPU hotplug notifications and register
+ * the built-in notifier.
+ */
+void __init blk_mq_cpu_init(void)
+{
+ register_hotcpu_notifier(&blk_mq_main_cpu_notifier);
+ blk_mq_register_cpu_notifier(&cpu_notifier);
+}
diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c
new file mode 100644
index 0000000..f872127
--- /dev/null
+++ b/block/blk-mq-cpumap.c
@@ -0,0 +1,108 @@
+#include <linux/kernel.h>
+#include <linux/threads.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/cpu.h>
+
+#include <linux/blk-mq.h>
+#include "blk.h"
+#include "blk-mq.h"
+
+/*
+ * Log the CPU -> hardware queue assignment in @map for every online CPU.
+ * @nr is accepted for the caller's convenience but not used.
+ */
+static void show_map(unsigned int *map, unsigned int nr)
+{
+	unsigned int cpu;	/* unsigned to match the %u conversions below */
+
+	pr_info("blk-mq: CPU -> queue map\n");
+	for_each_online_cpu(cpu)
+		pr_info(" CPU%2u -> Queue %u\n", cpu, map[cpu]);
+}
+
+/*
+ * Map the sequential index @cpu onto a queue index by splitting @nr_cpus
+ * indices into @nr_queues groups of equal (rounded-up) size.
+ */
+static int cpu_to_queue_index(unsigned int nr_cpus, unsigned int nr_queues,
+			      const int cpu)
+{
+	/* ceil(nr_cpus / nr_queues): number of indices sharing one queue */
+	unsigned int per_queue = (nr_cpus + nr_queues - 1) / nr_queues;
+
+	return cpu / per_queue;
+}
+
+/*
+ * Return the first CPU in @cpu's thread-sibling mask, or @cpu itself when
+ * the lookup does not yield a valid CPU id.
+ */
+static int get_first_sibling(unsigned int cpu)
+{
+	unsigned int first = cpumask_first(topology_thread_cpumask(cpu));
+
+	return first < nr_cpu_ids ? first : cpu;
+}
+
+/*
+ * Fill @map (indexed by CPU id) with the hardware queue index each CPU
+ * should use, given @nr_queues hardware queues. Offline CPUs are mapped to
+ * queue 0.
+ *
+ * Returns 0 on success, 1 if the temporary cpumask cannot be allocated.
+ */
+int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues)
+{
+ unsigned int i, nr_cpus, nr_uniq_cpus, queue, first_sibling;
+ cpumask_var_t cpus;
+
+ if (!alloc_cpumask_var(&cpus, GFP_ATOMIC))
+ return 1;
+
+ /*
+ * Count online CPUs, and those whose first sibling has not been
+ * seen yet at the time they are visited.
+ */
+ cpumask_clear(cpus);
+ nr_cpus = nr_uniq_cpus = 0;
+ for_each_online_cpu(i) {
+ nr_cpus++;
+ first_sibling = get_first_sibling(i);
+ if (!cpumask_test_cpu(first_sibling, cpus))
+ nr_uniq_cpus++;
+ cpumask_set_cpu(i, cpus);
+ }
+
+ /* 'queue' is a running index over the CPUs that get their own slot */
+ queue = 0;
+ for_each_possible_cpu(i) {
+ if (!cpu_online(i)) {
+ map[i] = 0;
+ continue;
+ }
+
+ /*
+ * Easy case - we have equal or more hardware queues. Or
+ * there are no thread siblings to take into account. Do
+ * 1:1 if enough, or sequential mapping if less.
+ */
+ if (nr_queues >= nr_cpus || nr_cpus == nr_uniq_cpus) {
+ map[i] = cpu_to_queue_index(nr_cpus, nr_queues, queue);
+ queue++;
+ continue;
+ }
+
+ /*
+ * Fewer than nr_cpus queues, and we have some number of
+ * threads per core. Map sibling threads to the same
+ * queue.
+ */
+ first_sibling = get_first_sibling(i);
+ if (first_sibling == i) {
+ map[i] = cpu_to_queue_index(nr_uniq_cpus, nr_queues,
+ queue);
+ queue++;
+ } else
+ map[i] = map[first_sibling];
+ }
+
+ show_map(map, nr_cpus);
+ free_cpumask_var(cpus);
+ return 0;
+}
+
+/*
+ * Allocate and populate a CPU -> hardware queue map for @reg.
+ *
+ * Returns the zero-initialised, filled-in map (owned by the caller, to be
+ * released with kfree()), or NULL if allocation or population fails.
+ */
+unsigned int *blk_mq_make_queue_map(struct blk_mq_reg *reg)
+{
+	unsigned int *map;
+
+	/*
+	 * The map is indexed by CPU id, so it needs one slot per possible
+	 * id (nr_cpu_ids), not one per possible CPU: with a sparse possible
+	 * mask num_possible_cpus() is smaller than the highest id + 1.
+	 * Zeroing maps offline CPUs to the first hctx.
+	 */
+	map = kzalloc_node(sizeof(*map) * nr_cpu_ids, GFP_KERNEL,
+			   reg->numa_node);
+	if (!map)
+		return NULL;
+
+	if (!blk_mq_update_queue_map(map, reg->nr_hw_queues))
+		return map;
+
+	kfree(map);
+	return NULL;
+}
diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
new file mode 100644
index 0000000..ba6cf8e
--- /dev/null
+++ b/block/blk-mq-sysfs.c
@@ -0,0 +1,384 @@
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/backing-dev.h>
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/mm.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/workqueue.h>
+#include <linux/smp.h>
+
+#include <linux/blk-mq.h>
+#include "blk-mq.h"
+#include "blk-mq-tag.h"
+
+/* kobject release callback for the ktypes in this file; intentionally does nothing. */
+static void blk_mq_sysfs_release(struct kobject *kobj)
+{
+}
+
+/* A sysfs attribute of a software queue (blk_mq_ctx) with its handlers. */
+struct blk_mq_ctx_sysfs_entry {
+ struct attribute attr;
+ /* either handler may be NULL; the dispatchers then return -EIO */
+ ssize_t (*show)(struct blk_mq_ctx *, char *);
+ ssize_t (*store)(struct blk_mq_ctx *, const char *, size_t);
+};
+
+/* A sysfs attribute of a hardware queue (blk_mq_hw_ctx) with its handlers. */
+struct blk_mq_hw_ctx_sysfs_entry {
+ struct attribute attr;
+ /* either handler may be NULL; the dispatchers then return -EIO */
+ ssize_t (*show)(struct blk_mq_hw_ctx *, char *);
+ ssize_t (*store)(struct blk_mq_hw_ctx *, const char *, size_t);
+};
+
+/*
+ * Generic sysfs read dispatcher for software-queue attributes.
+ *
+ * Returns -EIO when the attribute has no ->show handler and -ENOENT when
+ * the queue is dying; otherwise the handler's result. The handler runs
+ * with q->sysfs_lock held.
+ */
+static ssize_t blk_mq_sysfs_show(struct kobject *kobj, struct attribute *attr,
+				 char *page)
+{
+	struct blk_mq_ctx_sysfs_entry *entry =
+		container_of(attr, struct blk_mq_ctx_sysfs_entry, attr);
+	struct blk_mq_ctx *ctx = container_of(kobj, struct blk_mq_ctx, kobj);
+	struct request_queue *q = ctx->queue;
+	ssize_t res = -ENOENT;
+
+	if (!entry->show)
+		return -EIO;
+
+	mutex_lock(&q->sysfs_lock);
+	if (!blk_queue_dying(q))
+		res = entry->show(ctx, page);
+	mutex_unlock(&q->sysfs_lock);
+
+	return res;
+}
+
+/*
+ * Generic sysfs write dispatcher for software-queue attributes.
+ *
+ * Returns -EIO when the attribute has no ->store handler and -ENOENT when
+ * the queue is dying; otherwise the handler's result. The handler runs
+ * with q->sysfs_lock held.
+ */
+static ssize_t blk_mq_sysfs_store(struct kobject *kobj, struct attribute *attr,
+				  const char *page, size_t length)
+{
+	struct blk_mq_ctx_sysfs_entry *entry =
+		container_of(attr, struct blk_mq_ctx_sysfs_entry, attr);
+	struct blk_mq_ctx *ctx = container_of(kobj, struct blk_mq_ctx, kobj);
+	struct request_queue *q = ctx->queue;
+	ssize_t res = -ENOENT;
+
+	if (!entry->store)
+		return -EIO;
+
+	mutex_lock(&q->sysfs_lock);
+	if (!blk_queue_dying(q))
+		res = entry->store(ctx, page, length);
+	mutex_unlock(&q->sysfs_lock);
+
+	return res;
+}
+
+/*
+ * Generic sysfs read dispatcher for hardware-queue attributes.
+ *
+ * Returns -EIO when the attribute has no ->show handler and -ENOENT when
+ * the queue is dying; otherwise the handler's result. The handler runs
+ * with q->sysfs_lock held.
+ */
+static ssize_t blk_mq_hw_sysfs_show(struct kobject *kobj,
+				    struct attribute *attr, char *page)
+{
+	struct blk_mq_hw_ctx_sysfs_entry *entry =
+		container_of(attr, struct blk_mq_hw_ctx_sysfs_entry, attr);
+	struct blk_mq_hw_ctx *hctx =
+		container_of(kobj, struct blk_mq_hw_ctx, kobj);
+	struct request_queue *q = hctx->queue;
+	ssize_t res = -ENOENT;
+
+	if (!entry->show)
+		return -EIO;
+
+	mutex_lock(&q->sysfs_lock);
+	if (!blk_queue_dying(q))
+		res = entry->show(hctx, page);
+	mutex_unlock(&q->sysfs_lock);
+
+	return res;
+}
+
+/*
+ * Generic sysfs write dispatcher for hardware-queue attributes.
+ *
+ * Returns -EIO when the attribute has no ->store handler and -ENOENT when
+ * the queue is dying; otherwise the handler's result. The handler runs
+ * with q->sysfs_lock held.
+ */
+static ssize_t blk_mq_hw_sysfs_store(struct kobject *kobj,
+				     struct attribute *attr, const char *page,
+				     size_t length)
+{
+	struct blk_mq_hw_ctx_sysfs_entry *entry =
+		container_of(attr, struct blk_mq_hw_ctx_sysfs_entry, attr);
+	struct blk_mq_hw_ctx *hctx =
+		container_of(kobj, struct blk_mq_hw_ctx, kobj);
+	struct request_queue *q = hctx->queue;
+	ssize_t res = -ENOENT;
+
+	if (!entry->store)
+		return -EIO;
+
+	mutex_lock(&q->sysfs_lock);
+	if (!blk_queue_dying(q))
+		res = entry->store(hctx, page, length);
+	mutex_unlock(&q->sysfs_lock);
+
+	return res;
+}
+
+/* Print the ctx dispatch counters, element [1] first and then element [0]. */
+static ssize_t blk_mq_sysfs_dispatched_show(struct blk_mq_ctx *ctx, char *page)
+{
+ return sprintf(page, "%lu %lu\n", ctx->rq_dispatched[1],
+ ctx->rq_dispatched[0]);
+}
+
+/* Print the ctx merge counter. */
+static ssize_t blk_mq_sysfs_merged_show(struct blk_mq_ctx *ctx, char *page)
+{
+ return sprintf(page, "%lu\n", ctx->rq_merged);
+}
+
+/* Print the ctx completion counters, element [1] first and then element [0]. */
+static ssize_t blk_mq_sysfs_completed_show(struct blk_mq_ctx *ctx, char *page)
+{
+ return sprintf(page, "%lu %lu\n", ctx->rq_completed[1],
+ ctx->rq_completed[0]);
+}
+
+/*
+ * Format the request list @list into the sysfs buffer @page: a "@msg:"
+ * header line followed by one line per request holding its address.
+ *
+ * sysfs hands out a single page, so output is bounded by PAGE_SIZE and a
+ * list too long to fit is truncated instead of overrunning the buffer.
+ * The caller must hold whatever lock protects @list.
+ *
+ * Returns the number of bytes written, not counting the trailing NUL.
+ */
+static ssize_t sysfs_list_show(char *page, struct list_head *list, char *msg)
+{
+	struct request *rq;
+	ssize_t len;
+
+	len = scnprintf(page, PAGE_SIZE, "%s:\n", msg);
+
+	list_for_each_entry(rq, list, queuelist) {
+		/* buffer full: further entries could not be emitted anyway */
+		if (len >= PAGE_SIZE - 1)
+			break;
+		len += scnprintf(page + len, PAGE_SIZE - len, "\t%p\n", rq);
+	}
+
+	return len;
+}
+
+/* List the requests on the software queue's rq_list, under ctx->lock. */
+static ssize_t blk_mq_sysfs_rq_list_show(struct blk_mq_ctx *ctx, char *page)
+{
+ ssize_t ret;
+
+ spin_lock(&ctx->lock);
+ ret = sysfs_list_show(page, &ctx->rq_list, "CTX pending");
+ spin_unlock(&ctx->lock);
+
+ return ret;
+}
+
+/* Print the hardware queue's 'queued' counter. */
+static ssize_t blk_mq_hw_sysfs_queued_show(struct blk_mq_hw_ctx *hctx,
+ char *page)
+{
+ return sprintf(page, "%lu\n", hctx->queued);
+}
+
+/* Print the hardware queue's 'run' counter. */
+static ssize_t blk_mq_hw_sysfs_run_show(struct blk_mq_hw_ctx *hctx, char *page)
+{
+ return sprintf(page, "%lu\n", hctx->run);
+}
+
+/*
+ * Print the hctx->dispatched[] table, one "label<TAB>count" row per slot.
+ * Slot 0 is labelled 0; slot i (i >= 1) is labelled 2^(i-1).
+ */
+static ssize_t blk_mq_hw_sysfs_dispatched_show(struct blk_mq_hw_ctx *hctx,
+ char *page)
+{
+ char *start_page = page;
+ int i;
+
+ page += sprintf(page, "%8u\t%lu\n", 0U, hctx->dispatched[0]);
+
+ for (i = 1; i < BLK_MQ_MAX_DISPATCH_ORDER; i++) {
+ unsigned long d = 1U << (i - 1);
+
+ page += sprintf(page, "%8lu\t%lu\n", d, hctx->dispatched[i]);
+ }
+
+ return page - start_page;
+}
+
+/* List the requests on the hardware queue's dispatch list, under hctx->lock. */
+static ssize_t blk_mq_hw_sysfs_rq_list_show(struct blk_mq_hw_ctx *hctx,
+ char *page)
+{
+ ssize_t ret;
+
+ spin_lock(&hctx->lock);
+ ret = sysfs_list_show(page, &hctx->dispatch, "HCTX pending");
+ spin_unlock(&hctx->lock);
+
+ return ret;
+}
+
+/* Print 1 if BLK_MQ_F_SHOULD_IPI is set in hctx->flags, 0 otherwise. */
+static ssize_t blk_mq_hw_sysfs_ipi_show(struct blk_mq_hw_ctx *hctx, char *page)
+{
+ ssize_t ret;
+
+ spin_lock(&hctx->lock);
+ ret = sprintf(page, "%u\n", !!(hctx->flags & BLK_MQ_F_SHOULD_IPI));
+ spin_unlock(&hctx->lock);
+
+ return ret;
+}
+
+/*
+ * Parse a decimal number from @page; non-zero sets and zero clears
+ * BLK_MQ_F_SHOULD_IPI on @hctx, and the same boolean is copied into
+ * ->ipi_redirect of every software queue attached to @hctx.
+ *
+ * Returns @len on success, -EINVAL if the input is not a valid number.
+ */
+static ssize_t blk_mq_hw_sysfs_ipi_store(struct blk_mq_hw_ctx *hctx,
+ const char *page, size_t len)
+{
+ struct blk_mq_ctx *ctx;
+ unsigned long ret;
+ unsigned int i;
+
+ if (kstrtoul(page, 10, &ret)) {
+ pr_err("blk-mq-sysfs: invalid input '%s'\n", page);
+ return -EINVAL;
+ }
+
+ spin_lock(&hctx->lock);
+ if (ret)
+ hctx->flags |= BLK_MQ_F_SHOULD_IPI;
+ else
+ hctx->flags &= ~BLK_MQ_F_SHOULD_IPI;
+ spin_unlock(&hctx->lock);
+
+ /* note: the per-ctx flags are updated outside hctx->lock */
+ hctx_for_each_ctx(hctx, ctx, i)
+ ctx->ipi_redirect = !!ret;
+
+ return len;
+}
+
+/* Delegate to the tag map to describe the hardware queue's tags. */
+static ssize_t blk_mq_hw_sysfs_tags_show(struct blk_mq_hw_ctx *hctx, char *page)
+{
+ return blk_mq_tag_sysfs_show(hctx->tags, page);
+}
+
+/* Read-only attributes exposed for each software queue (ctx). */
+static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_dispatched = {
+ .attr = {.name = "dispatched", .mode = S_IRUGO },
+ .show = blk_mq_sysfs_dispatched_show,
+};
+static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_merged = {
+ .attr = {.name = "merged", .mode = S_IRUGO },
+ .show = blk_mq_sysfs_merged_show,
+};
+static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_completed = {
+ .attr = {.name = "completed", .mode = S_IRUGO },
+ .show = blk_mq_sysfs_completed_show,
+};
+static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_rq_list = {
+ .attr = {.name = "rq_list", .mode = S_IRUGO },
+ .show = blk_mq_sysfs_rq_list_show,
+};
+
+/* NULL-terminated default attribute list used by blk_mq_ctx_ktype. */
+static struct attribute *default_ctx_attrs[] = {
+ &blk_mq_sysfs_dispatched.attr,
+ &blk_mq_sysfs_merged.attr,
+ &blk_mq_sysfs_completed.attr,
+ &blk_mq_sysfs_rq_list.attr,
+ NULL,
+};
+
+/*
+ * Attributes exposed for each hardware queue (hctx). All are read-only
+ * except "ipi_redirect", which the owner may also write.
+ */
+static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_queued = {
+ .attr = {.name = "queued", .mode = S_IRUGO },
+ .show = blk_mq_hw_sysfs_queued_show,
+};
+static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_run = {
+ .attr = {.name = "run", .mode = S_IRUGO },
+ .show = blk_mq_hw_sysfs_run_show,
+};
+static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_dispatched = {
+ .attr = {.name = "dispatched", .mode = S_IRUGO },
+ .show = blk_mq_hw_sysfs_dispatched_show,
+};
+static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_pending = {
+ .attr = {.name = "pending", .mode = S_IRUGO },
+ .show = blk_mq_hw_sysfs_rq_list_show,
+};
+static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_ipi = {
+ .attr = {.name = "ipi_redirect", .mode = S_IRUGO | S_IWUSR},
+ .show = blk_mq_hw_sysfs_ipi_show,
+ .store = blk_mq_hw_sysfs_ipi_store,
+};
+static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_tags = {
+ .attr = {.name = "tags", .mode = S_IRUGO },
+ .show = blk_mq_hw_sysfs_tags_show,
+};
+
+/* NULL-terminated default attribute list used by blk_mq_hw_ktype. */
+static struct attribute *default_hw_ctx_attrs[] = {
+ &blk_mq_hw_sysfs_queued.attr,
+ &blk_mq_hw_sysfs_run.attr,
+ &blk_mq_hw_sysfs_dispatched.attr,
+ &blk_mq_hw_sysfs_pending.attr,
+ &blk_mq_hw_sysfs_ipi.attr,
+ &blk_mq_hw_sysfs_tags.attr,
+ NULL,
+};
+
+/* sysfs dispatch tables: one for software-queue kobjects, one for hardware-queue kobjects. */
+static const struct sysfs_ops blk_mq_sysfs_ops = {
+ .show = blk_mq_sysfs_show,
+ .store = blk_mq_sysfs_store,
+};
+
+static const struct sysfs_ops blk_mq_hw_sysfs_ops = {
+ .show = blk_mq_hw_sysfs_show,
+ .store = blk_mq_hw_sysfs_store,
+};
+
+/*
+ * kobject types: blk_mq_ktype for the "mq" directory itself (no default
+ * attributes), blk_mq_ctx_ktype for per-CPU software queues and
+ * blk_mq_hw_ktype for hardware queues. All share the no-op release.
+ */
+static struct kobj_type blk_mq_ktype = {
+ .sysfs_ops = &blk_mq_sysfs_ops,
+ .release = blk_mq_sysfs_release,
+};
+
+static struct kobj_type blk_mq_ctx_ktype = {
+ .sysfs_ops = &blk_mq_sysfs_ops,
+ .default_attrs = default_ctx_attrs,
+ .release = blk_mq_sysfs_release,
+};
+
+static struct kobj_type blk_mq_hw_ktype = {
+ .sysfs_ops = &blk_mq_hw_sysfs_ops,
+ .default_attrs = default_hw_ctx_attrs,
+ .release = blk_mq_sysfs_release,
+};
+
+/*
+ * Remove the "mq" sysfs directory of @disk's queue and drop the reference
+ * on the disk's device kobject taken in blk_mq_register_disk().
+ */
+void blk_mq_unregister_disk(struct gendisk *disk)
+{
+ struct request_queue *q = disk->queue;
+
+ kobject_uevent(&q->mq_kobj, KOBJ_REMOVE);
+ kobject_del(&q->mq_kobj);
+
+ /* NOTE(review): the per-hctx and per-ctx kobjects are not deleted here - confirm intended */
+ kobject_put(&disk_to_dev(disk)->kobj);
+}
+
+/*
+ * Create the blk-mq sysfs hierarchy for @disk:
+ *
+ *   <disk>/mq/<hctx index>/cpu<N>
+ *
+ * with one directory per hardware queue and, below it, one per software
+ * queue mapped to that hardware queue.
+ *
+ * Returns 0 on success or the negative errno of the first kobject_add()
+ * that failed; in that case everything is torn down again through
+ * blk_mq_unregister_disk().
+ */
+int blk_mq_register_disk(struct gendisk *disk)
+{
+	struct device *dev = disk_to_dev(disk);
+	struct request_queue *q = disk->queue;
+	struct blk_mq_hw_ctx *hctx;
+	struct blk_mq_ctx *ctx;
+	int ret, i, j;
+
+	kobject_init(&q->mq_kobj, &blk_mq_ktype);
+
+	ret = kobject_add(&q->mq_kobj, kobject_get(&dev->kobj), "%s", "mq");
+	if (ret < 0) {
+		/* drop the extra parent reference taken just above */
+		kobject_put(&dev->kobj);
+		return ret;
+	}
+
+	kobject_uevent(&q->mq_kobj, KOBJ_ADD);
+
+	queue_for_each_hw_ctx(q, hctx, i) {
+		kobject_init(&hctx->kobj, &blk_mq_hw_ktype);
+		ret = kobject_add(&hctx->kobj, &q->mq_kobj, "%u", i);
+		if (ret)
+			break;
+
+		if (!hctx->nr_ctx)
+			continue;
+
+		hctx_for_each_ctx(hctx, ctx, j) {
+			kobject_init(&ctx->kobj, &blk_mq_ctx_ktype);
+			ret = kobject_add(&ctx->kobj, &hctx->kobj, "cpu%u", ctx->cpu);
+			if (ret)
+				break;
+		}
+
+		/*
+		 * The break above only leaves the inner loop; stop here too,
+		 * or the next hctx iteration would overwrite the error.
+		 */
+		if (ret)
+			break;
+	}
+
+	if (ret) {
+		blk_mq_unregister_disk(disk);
+		return ret;
+	}
+
+	return 0;
+}
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
new file mode 100644
index 0000000..d64a02f
--- /dev/null
+++ b/block/blk-mq-tag.c
@@ -0,0 +1,204 @@
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/percpu_ida.h>
+
+#include <linux/blk-mq.h>
+#include "blk.h"
+#include "blk-mq.h"
+#include "blk-mq-tag.h"
+
+/*
+ * Per tagged queue (tag address space) map
+ *
+ * Tag values below nr_reserved_tags come from the reserved pool; values
+ * from nr_reserved_tags up to nr_tags - 1 come from the regular pool.
+ */
+struct blk_mq_tags {
+ /* total number of tags, reserved ones included */
+ unsigned int nr_tags;
+ /* number of tags set aside for reserved allocations */
+ unsigned int nr_reserved_tags;
+ /* batch size handed to __percpu_ida_init() for the regular pool */
+ unsigned int nr_batch_move;
+ /* max cache size handed to __percpu_ida_init() for the regular pool */
+ unsigned int nr_max_cache;
+
+ /* allocator for the regular (non-reserved) tags */
+ struct percpu_ida free_tags;
+ /* allocator for the reserved tags; only initialized if there are any */
+ struct percpu_ida reserved_tags;
+};
+
+/*
+ * Allocate a regular tag with __GFP_WAIT and hand it straight back; used by
+ * callers that only want to wait until a tag can be obtained.
+ */
+void blk_mq_wait_for_tags(struct blk_mq_tags *tags)
+{
+	int tag;
+
+	tag = blk_mq_get_tag(tags, __GFP_WAIT, false);
+	blk_mq_put_tag(tags, tag);
+}
+
+/*
+ * Report whether the regular tag pool currently has at least one free tag.
+ * A NULL tag map is treated as "tags available".
+ */
+bool blk_mq_has_free_tags(struct blk_mq_tags *tags)
+{
+	if (!tags)
+		return true;
+
+	return percpu_ida_free_tags(&tags->free_tags, nr_cpu_ids) != 0;
+}
+
+/*
+ * Allocate a tag from the regular pool. Returns the tag value, or
+ * BLK_MQ_TAG_FAIL if the underlying allocator returned a negative value.
+ */
+static unsigned int __blk_mq_get_tag(struct blk_mq_tags *tags, gfp_t gfp)
+{
+ int tag;
+
+ tag = percpu_ida_alloc(&tags->free_tags, gfp);
+ if (tag < 0)
+ return BLK_MQ_TAG_FAIL;
+ /* regular tags are numbered after the reserved range */
+ return tag + tags->nr_reserved_tags;
+}
+
+/*
+ * Allocate a tag from the reserved pool. Warns once and fails if the map
+ * has no reserved tags at all. Returns the tag value or BLK_MQ_TAG_FAIL.
+ */
+static unsigned int __blk_mq_get_reserved_tag(struct blk_mq_tags *tags,
+ gfp_t gfp)
+{
+ int tag;
+
+ if (unlikely(!tags->nr_reserved_tags)) {
+ WARN_ON_ONCE(1);
+ return BLK_MQ_TAG_FAIL;
+ }
+
+ tag = percpu_ida_alloc(&tags->reserved_tags, gfp);
+ if (tag < 0)
+ return BLK_MQ_TAG_FAIL;
+ return tag;
+}
+
+/*
+ * Allocate a tag from the reserved pool when @reserved is set, otherwise
+ * from the regular pool. Returns the tag or BLK_MQ_TAG_FAIL.
+ */
+unsigned int blk_mq_get_tag(struct blk_mq_tags *tags, gfp_t gfp, bool reserved)
+{
+	if (reserved)
+		return __blk_mq_get_reserved_tag(tags, gfp);
+
+	return __blk_mq_get_tag(tags, gfp);
+}
+
+/* Return a regular tag to its pool; BUGs if the tag is out of range. */
+static void __blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag)
+{
+ BUG_ON(tag >= tags->nr_tags);
+
+ /* undo the offset added when the tag was handed out */
+ percpu_ida_free(&tags->free_tags, tag - tags->nr_reserved_tags);
+}
+
+/* Return a reserved tag to its pool; BUGs if the tag is not a reserved one. */
+static void __blk_mq_put_reserved_tag(struct blk_mq_tags *tags,
+ unsigned int tag)
+{
+ BUG_ON(tag >= tags->nr_reserved_tags);
+
+ percpu_ida_free(&tags->reserved_tags, tag);
+}
+
+/*
+ * Release a tag. The tag's value decides which pool it goes back to:
+ * values below nr_reserved_tags belong to the reserved pool.
+ */
+void blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag)
+{
+	if (tag < tags->nr_reserved_tags) {
+		__blk_mq_put_reserved_tag(tags, tag);
+		return;
+	}
+
+	__blk_mq_put_tag(tags, tag);
+}
+
+/*
+ * percpu_ida_for_each_free() callback: set bit @id in the bitmap passed
+ * through @data. Always returns 0.
+ */
+static int __blk_mq_tag_iter(unsigned id, void *data)
+{
+ unsigned long *tag_map = data;
+ __set_bit(id, tag_map);
+ return 0;
+}
+
+/*
+ * Build a temporary bitmap with one bit set per free id reported by the
+ * regular and (if present) reserved pools, and pass it to @fn together with
+ * @data. The bitmap is freed after @fn returns. If the bitmap cannot be
+ * allocated, @fn is not called at all.
+ */
+void blk_mq_tag_busy_iter(struct blk_mq_tags *tags,
+ void (*fn)(void *, unsigned long *), void *data)
+{
+ unsigned long *tag_map;
+ size_t map_size;
+
+ /* number of longs needed to hold nr_tags bits */
+ map_size = ALIGN(tags->nr_tags, BITS_PER_LONG) / BITS_PER_LONG;
+ tag_map = kzalloc(map_size * sizeof(unsigned long), GFP_ATOMIC);
+ if (!tag_map)
+ return;
+
+ /*
+ * NOTE(review): both pools report their own ids into the same bitmap
+ * without applying the nr_reserved_tags offset used by
+ * __blk_mq_get_tag() - confirm this is the intended numbering.
+ */
+ percpu_ida_for_each_free(&tags->free_tags, __blk_mq_tag_iter, tag_map);
+ if (tags->nr_reserved_tags)
+ percpu_ida_for_each_free(&tags->reserved_tags, __blk_mq_tag_iter,
+ tag_map);
+
+ fn(data, tag_map);
+ kfree(tag_map);
+}
+
+/*
+ * Allocate and initialize a tag map with @total_tags tags, of which
+ * @reserved_tags form the reserved pool. The structure is allocated on
+ * NUMA node @node.
+ *
+ * Returns the new map, or NULL if @total_tags exceeds BLK_MQ_TAG_MAX or an
+ * allocation/initialization step fails.
+ */
+struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
+ unsigned int reserved_tags, int node)
+{
+ unsigned int nr_tags, nr_cache;
+ struct blk_mq_tags *tags;
+ int ret;
+
+ if (total_tags > BLK_MQ_TAG_MAX) {
+ pr_err("blk-mq: tag depth too large\n");
+ return NULL;
+ }
+
+ tags = kzalloc_node(sizeof(*tags), GFP_KERNEL, node);
+ if (!tags)
+ return NULL;
+
+ /*
+ * NOTE(review): this subtraction is unsigned and is not guarded
+ * against reserved_tags > total_tags here - callers must ensure it.
+ */
+ nr_tags = total_tags - reserved_tags;
+ nr_cache = nr_tags / num_possible_cpus();
+
+ /* clamp the per-cpu cache size to [CACHE_MIN, CACHE_MAX] */
+ if (nr_cache < BLK_MQ_TAG_CACHE_MIN)
+ nr_cache = BLK_MQ_TAG_CACHE_MIN;
+ else if (nr_cache > BLK_MQ_TAG_CACHE_MAX)
+ nr_cache = BLK_MQ_TAG_CACHE_MAX;
+
+ tags->nr_tags = total_tags;
+ tags->nr_reserved_tags = reserved_tags;
+ tags->nr_max_cache = nr_cache;
+ tags->nr_batch_move = max(1u, nr_cache / 2);
+
+ ret = __percpu_ida_init(&tags->free_tags, tags->nr_tags -
+ tags->nr_reserved_tags,
+ tags->nr_max_cache,
+ tags->nr_batch_move);
+ if (ret)
+ goto err_free_tags;
+
+ if (reserved_tags) {
+ /*
+ * With max_cache and batch set to 1, the allocator falls back to
+ * being uncached. It's fine if reserved tag allocation is slow.
+ */
+ ret = __percpu_ida_init(&tags->reserved_tags, reserved_tags,
+ 1, 1);
+ if (ret)
+ goto err_reserved_tags;
+ }
+
+ return tags;
+
+err_reserved_tags:
+ percpu_ida_destroy(&tags->free_tags);
+err_free_tags:
+ kfree(tags);
+ return NULL;
+}
+
+/* Destroy both tag pools and free the tag map itself. */
+void blk_mq_free_tags(struct blk_mq_tags *tags)
+{
+ percpu_ida_destroy(&tags->free_tags);
+ /*
+ * Called even when no reserved pool was set up; the map was
+ * zero-allocated by blk_mq_init_tags().
+ */
+ percpu_ida_destroy(&tags->reserved_tags);
+ kfree(tags);
+}
+
+/*
+ * Format tag map statistics into the sysfs buffer @page: the static
+ * configuration, the global free counts, and one line per possible CPU
+ * with that CPU's free count.
+ *
+ * Returns the number of bytes written, or 0 if @tags is NULL. Output is
+ * bounded to PAGE_SIZE: the per-CPU section grows with the number of
+ * possible CPUs, so unbounded formatting could run past the end of the
+ * page on large machines. Excess output is truncated.
+ */
+ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page)
+{
+	size_t len = 0;
+	int cpu;
+
+	if (!tags)
+		return 0;
+
+	/* scnprintf() returns what was actually stored, so len < PAGE_SIZE */
+	len += scnprintf(page + len, PAGE_SIZE - len,
+			"nr_tags=%u, reserved_tags=%u, batch_move=%u,"
+			" max_cache=%u\n", tags->nr_tags, tags->nr_reserved_tags,
+			tags->nr_batch_move, tags->nr_max_cache);
+
+	len += scnprintf(page + len, PAGE_SIZE - len,
+			"nr_free=%u, nr_reserved=%u\n",
+			percpu_ida_free_tags(&tags->free_tags, nr_cpu_ids),
+			percpu_ida_free_tags(&tags->reserved_tags, nr_cpu_ids));
+
+	for_each_possible_cpu(cpu) {
+		len += scnprintf(page + len, PAGE_SIZE - len,
+				" cpu%02u: nr_free=%u\n", cpu,
+				percpu_ida_free_tags(&tags->free_tags, cpu));
+	}
+
+	return len;
+}
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
new file mode 100644
index 0000000..947ba2c
--- /dev/null
+++ b/block/blk-mq-tag.h
@@ -0,0 +1,27 @@
+#ifndef INT_BLK_MQ_TAG_H
+#define INT_BLK_MQ_TAG_H
+
+/* Opaque tag map; the layout is private to blk-mq-tag.c. */
+struct blk_mq_tags;
+
+/* Creation and destruction of a tag map. */
+extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int reserved_tags, int node);
+extern void blk_mq_free_tags(struct blk_mq_tags *tags);
+
+/* Tag allocation, release, and inspection. */
+extern unsigned int blk_mq_get_tag(struct blk_mq_tags *tags, gfp_t gfp, bool reserved);
+extern void blk_mq_wait_for_tags(struct blk_mq_tags *tags);
+extern void blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag);
+extern void blk_mq_tag_busy_iter(struct blk_mq_tags *tags, void (*fn)(void *data, unsigned long *), void *data);
+extern bool blk_mq_has_free_tags(struct blk_mq_tags *tags);
+extern ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page);
+
+/* Bounds applied to the per-cpu tag cache size in blk_mq_init_tags(). */
+enum {
+ BLK_MQ_TAG_CACHE_MIN = 1,
+ BLK_MQ_TAG_CACHE_MAX = 64,
+};
+
+/*
+ * BLK_MQ_TAG_FAIL is the value blk_mq_get_tag() returns when no tag could
+ * be allocated; BLK_MQ_TAG_MAX is the largest total tag count accepted by
+ * blk_mq_init_tags().
+ */
+enum {
+ BLK_MQ_TAG_FAIL = -1U,
+ BLK_MQ_TAG_MIN = BLK_MQ_TAG_CACHE_MIN,
+ BLK_MQ_TAG_MAX = BLK_MQ_TAG_FAIL - 1,
+};
+
+#endif
diff --git a/block/blk-mq.c b/block/blk-mq.c
new file mode 100644
index 0000000..c79126e
--- /dev/null
+++ b/block/blk-mq.c
@@ -0,0 +1,1510 @@
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/backing-dev.h>
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/mm.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/workqueue.h>
+#include <linux/smp.h>
+#include <linux/llist.h>
+#include <linux/list_sort.h>
+#include <linux/cpu.h>
+#include <linux/cache.h>
+#include <linux/sched/sysctl.h>
+#include <linux/delay.h>
+
+#include <trace/events/block.h>
+
+#include <linux/blk-mq.h>
+#include "blk.h"
+#include "blk-mq.h"
+#include "blk-mq-tag.h"
+
+/* NOTE(review): presumably all_q_mutex protects all_q_list - confirm at the users. */
+static DEFINE_MUTEX(all_q_mutex);
+static LIST_HEAD(all_q_list);
+
+static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx);
+
+/* Per-cpu lists of requests whose completion was redirected to that CPU via IPI. */
+DEFINE_PER_CPU(struct llist_head, ipi_lists);
+
+/* Return the software queue context of @q that belongs to CPU @cpu. */
+static struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q,
+ unsigned int cpu)
+{
+ return per_cpu_ptr(q->queue_ctx, cpu);
+}
+
+/*
+ * This assumes per-cpu software queueing queues. They could be per-node
+ * as well, for instance. For now this is hardcoded as-is. Note that we don't
+ * care about preemption, since we know the ctx's are persistent. This does
+ * mean that we can't rely on ctx always matching the currently running CPU.
+ *
+ * Uses get_cpu() to pick the ctx; every call must be paired with
+ * blk_mq_put_ctx(), which does the matching put_cpu().
+ */
+static struct blk_mq_ctx *blk_mq_get_ctx(struct request_queue *q)
+{
+ return __blk_mq_get_ctx(q, get_cpu());
+}
+
+/* Counterpart of blk_mq_get_ctx(): undoes its get_cpu(). @ctx itself is unused. */
+static void blk_mq_put_ctx(struct blk_mq_ctx *ctx)
+{
+ put_cpu();
+}
+
+/*
+ * Return true if any word of the hardware queue's ctx_map is non-zero,
+ * i.e. at least one software queue has been marked as having pending work.
+ */
+static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
+{
+	unsigned int word = 0;
+
+	while (word < hctx->nr_ctx_map) {
+		if (hctx->ctx_map[word])
+			return true;
+		word++;
+	}
+
+	return false;
+}
+
+/*
+ * Mark this ctx as having pending work in this hardware queue
+ */
+static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx,
+ struct blk_mq_ctx *ctx)
+{
+ /* test first so the atomic set_bit() is skipped when the bit is already set */
+ if (!test_bit(ctx->index_hw, hctx->ctx_map))
+ set_bit(ctx->index_hw, hctx->ctx_map);
+}
+
+/*
+ * Allocate a tag from the hardware queue's tag map and return the
+ * pre-allocated request stored at that index, with rq->tag filled in.
+ * Returns NULL if no tag could be obtained.
+ */
+static struct request *blk_mq_alloc_rq(struct blk_mq_hw_ctx *hctx, gfp_t gfp,
+				       bool reserved)
+{
+	unsigned int tag = blk_mq_get_tag(hctx->tags, gfp, reserved);
+	struct request *rq;
+
+	if (tag == BLK_MQ_TAG_FAIL)
+		return NULL;
+
+	rq = hctx->rqs[tag];
+	rq->tag = tag;
+	return rq;
+}
+
+/*
+ * Take a usage reference on the queue (mq_usage_counter).
+ *
+ * If the queue is in bypass mode and fully initialized, the reference is
+ * dropped again and the caller sleeps until bypass is cleared, after which
+ * the reference is retaken under the queue lock.
+ *
+ * Returns 0 with the reference held, or the non-zero result of the
+ * interruptible wait, in which case no reference is held.
+ */
+static int blk_mq_queue_enter(struct request_queue *q)
+{
+ int ret;
+
+ __percpu_counter_add(&q->mq_usage_counter, 1, 1000000);
+ smp_wmb();
+ /* we have problems to freeze the queue if it's initializing */
+ if (!blk_queue_bypass(q) || !blk_queue_init_done(q))
+ return 0;
+
+ /* queue is being frozen: back out and wait for it to be unfrozen */
+ __percpu_counter_add(&q->mq_usage_counter, -1, 1000000);
+
+ spin_lock_irq(q->queue_lock);
+ ret = wait_event_interruptible_lock_irq(q->mq_freeze_wq,
+ !blk_queue_bypass(q), *q->queue_lock);
+ /* inc usage with lock hold to avoid freeze_queue runs here */
+ if (!ret)
+ __percpu_counter_add(&q->mq_usage_counter, 1, 1000000);
+ spin_unlock_irq(q->queue_lock);
+
+ return ret;
+}
+
+/* Drop the usage reference taken by blk_mq_queue_enter(). */
+static void blk_mq_queue_exit(struct request_queue *q)
+{
+ __percpu_counter_add(&q->mq_usage_counter, -1, 1000000);
+}
+
+/*
+ * Guarantee no request is in use, so we can change any data structure of
+ * the queue afterward.
+ *
+ * Freezes nest through q->bypass_depth; only the outermost caller waits
+ * for the usage counter to drain to zero.
+ */
+static void blk_mq_freeze_queue(struct request_queue *q)
+{
+ bool drain;
+
+ spin_lock_irq(q->queue_lock);
+ /* only the first (outermost) freeze has to drain */
+ drain = !q->bypass_depth++;
+ queue_flag_set(QUEUE_FLAG_BYPASS, q);
+ spin_unlock_irq(q->queue_lock);
+
+ if (!drain)
+ return;
+
+ /* poll until every usage reference is gone, kicking the queues meanwhile */
+ while (true) {
+ s64 count;
+
+ spin_lock_irq(q->queue_lock);
+ count = percpu_counter_sum(&q->mq_usage_counter);
+ spin_unlock_irq(q->queue_lock);
+
+ if (count == 0)
+ break;
+ blk_mq_run_queues(q, false);
+ msleep(10);
+ }
+}
+
+/*
+ * Undo one blk_mq_freeze_queue(). When the nesting depth drops to zero the
+ * bypass flag is cleared and everyone sleeping on mq_freeze_wq is woken.
+ */
+static void blk_mq_unfreeze_queue(struct request_queue *q)
+{
+ bool wake = false;
+
+ spin_lock_irq(q->queue_lock);
+ if (!--q->bypass_depth) {
+ queue_flag_clear(QUEUE_FLAG_BYPASS, q);
+ wake = true;
+ }
+ /* catches an unfreeze without a matching freeze */
+ WARN_ON_ONCE(q->bypass_depth < 0);
+ spin_unlock_irq(q->queue_lock);
+ if (wake)
+ wake_up_all(&q->mq_freeze_wq);
+}
+
+/* Return true if the hardware queue's tag map currently has a free tag. */
+bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
+{
+ return blk_mq_has_free_tags(hctx->tags);
+}
+EXPORT_SYMBOL(blk_mq_can_queue);
+
+/*
+ * Bind a freshly allocated request to software queue @ctx, set its command
+ * flags (adding REQ_IO_STAT when the queue has I/O stats enabled), and bump
+ * the ctx's dispatched counter for the sync/async class.
+ */
+static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
+ struct request *rq, unsigned int rw_flags)
+{
+ if (blk_queue_io_stat(q))
+ rw_flags |= REQ_IO_STAT;
+
+ rq->mq_ctx = ctx;
+ rq->cmd_flags = rw_flags;
+ ctx->rq_dispatched[rw_is_sync(rw_flags)]++;
+}
+
+/* Thin wrapper around blk_mq_alloc_rq(); returns a request or NULL. */
+static struct request *__blk_mq_alloc_request(struct blk_mq_hw_ctx *hctx,
+ gfp_t gfp, bool reserved)
+{
+ return blk_mq_alloc_rq(hctx, gfp, reserved);
+}
+
+/*
+ * Allocate a request for the current CPU's software queue.
+ *
+ * On success the request is returned with its ctx still held (the caller
+ * must call blk_mq_put_ctx(rq->mq_ctx)). On failure NULL is returned and no
+ * ctx is held. If @gfp contains __GFP_WAIT the function keeps retrying:
+ * it runs the hardware queue and waits for a tag between attempts.
+ */
+static struct request *blk_mq_alloc_request_pinned(struct request_queue *q,
+ int rw, gfp_t gfp,
+ bool reserved)
+{
+ struct request *rq;
+
+ do {
+ struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
+ struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q, ctx->cpu);
+
+ /* never sleep here: the ctx is held via get_cpu() */
+ rq = __blk_mq_alloc_request(hctx, gfp & ~__GFP_WAIT, reserved);
+ if (rq) {
+ blk_mq_rq_ctx_init(q, ctx, rq, rw);
+ break;
+ }
+
+ blk_mq_put_ctx(ctx);
+ if (!(gfp & __GFP_WAIT))
+ break;
+
+ /* push pending work to the driver, then wait for a tag */
+ __blk_mq_run_hw_queue(hctx);
+ blk_mq_wait_for_tags(hctx->tags);
+ } while (1);
+
+ return rq;
+}
+
+/*
+ * Allocate a request on @q for a caller outside the bio submission path.
+ *
+ * Takes a queue usage reference first. That reference stays with the
+ * returned request and is dropped when the request is freed. Returns NULL
+ * if the queue could not be entered or no request could be allocated; in
+ * both cases no usage reference is held on return.
+ */
+struct request *blk_mq_alloc_request(struct request_queue *q, int rw,
+		gfp_t gfp, bool reserved)
+{
+	struct request *rq;
+
+	if (blk_mq_queue_enter(q))
+		return NULL;
+
+	rq = blk_mq_alloc_request_pinned(q, rw, gfp, reserved);
+	if (!rq) {
+		/*
+		 * No request means nothing will ever be freed to drop the
+		 * usage reference; release it here so a later queue freeze
+		 * can still drain to zero.
+		 */
+		blk_mq_queue_exit(q);
+		return NULL;
+	}
+
+	blk_mq_put_ctx(rq->mq_ctx);
+	return rq;
+}
+
+/*
+ * Same as blk_mq_alloc_request(), but always allocates from the reserved
+ * tag pool.
+ *
+ * Returns NULL if the queue could not be entered or no request could be
+ * allocated; in both cases no usage reference is held on return.
+ */
+struct request *blk_mq_alloc_reserved_request(struct request_queue *q, int rw,
+					      gfp_t gfp)
+{
+	struct request *rq;
+
+	if (blk_mq_queue_enter(q))
+		return NULL;
+
+	rq = blk_mq_alloc_request_pinned(q, rw, gfp, true);
+	if (!rq) {
+		/*
+		 * Drop the usage reference taken above: without a request
+		 * there is no later free that would release it.
+		 */
+		blk_mq_queue_exit(q);
+		return NULL;
+	}
+
+	blk_mq_put_ctx(rq->mq_ctx);
+	return rq;
+}
+EXPORT_SYMBOL(blk_mq_alloc_reserved_request);
+
+/*
+ * Re-init and set pdu, if we have it
+ *
+ * When the hardware queue has a non-zero cmd_size, rq->special is pointed
+ * at the driver payload returned by blk_mq_rq_to_pdu().
+ */
+static void blk_mq_rq_init(struct blk_mq_hw_ctx *hctx, struct request *rq)
+{
+ blk_rq_init(hctx->queue, rq);
+
+ if (hctx->cmd_size)
+ rq->special = blk_mq_rq_to_pdu(rq);
+}
+
+/*
+ * Reinitialize @rq, give its tag back to the hardware queue's tag map and
+ * drop the queue usage reference. @ctx is not used here.
+ */
+static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx,
+ struct blk_mq_ctx *ctx, struct request *rq)
+{
+ /* read both before blk_mq_rq_init() re-initializes the request */
+ const int tag = rq->tag;
+ struct request_queue *q = rq->q;
+
+ blk_mq_rq_init(hctx, rq);
+ blk_mq_put_tag(hctx->tags, tag);
+
+ blk_mq_queue_exit(q);
+}
+
+/*
+ * Free a request: account the completion on its software queue, look up the
+ * hardware queue mapped to that ctx's CPU, and release the request there.
+ */
+void blk_mq_free_request(struct request *rq)
+{
+ struct blk_mq_ctx *ctx = rq->mq_ctx;
+ struct blk_mq_hw_ctx *hctx;
+ struct request_queue *q = rq->q;
+
+ ctx->rq_completed[rq_is_sync(rq)]++;
+
+ hctx = q->mq_ops->map_queue(q, ctx->cpu);
+ __blk_mq_free_request(hctx, ctx, rq);
+}
+
+/*
+ * Complete one bio of @rq with status @error, keeping the bio's UPTODATE
+ * flag and the error code consistent and propagating REQ_QUIET to the bio.
+ */
+static void blk_mq_bio_endio(struct request *rq, struct bio *bio, int error)
+{
+ if (error)
+ clear_bit(BIO_UPTODATE, &bio->bi_flags);
+ else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
+ error = -EIO;
+
+ if (unlikely(rq->cmd_flags & REQ_QUIET))
+ set_bit(BIO_QUIET, &bio->bi_flags);
+
+ /* don't actually finish bio if it's part of flush sequence */
+ if (!(rq->cmd_flags & REQ_FLUSH_SEQ))
+ bio_endio(bio, error);
+}
+
+/*
+ * Complete @rq: end every bio attached to it with @error, do the I/O
+ * accounting, then either call the request's end_io handler or, if there is
+ * none, free the request.
+ */
+void blk_mq_complete_request(struct request *rq, int error)
+{
+ struct bio *bio = rq->bio;
+ unsigned int bytes = 0;
+
+ trace_block_rq_complete(rq->q, rq);
+
+ while (bio) {
+ /* grab the successor first; the bio is unlinked before it is ended */
+ struct bio *next = bio->bi_next;
+
+ bio->bi_next = NULL;
+ bytes += bio->bi_size;
+ blk_mq_bio_endio(rq, bio, error);
+ bio = next;
+ }
+
+ blk_account_io_completion(rq, bytes);
+
+ blk_account_io_done(rq);
+
+ if (rq->end_io)
+ rq->end_io(rq, error);
+ else
+ blk_mq_free_request(rq);
+}
+
+/*
+ * Complete @rq unless blk_mark_rq_complete() reports that it was already
+ * marked complete.
+ */
+void __blk_mq_end_io(struct request *rq, int error)
+{
+ if (!blk_mark_rq_complete(rq))
+ blk_mq_complete_request(rq, error);
+}
+
+#if defined(CONFIG_SMP)
+
+/*
+ * IPI handler: take every request queued on this CPU's ipi_lists entry and
+ * complete it with the error stored in rq->errors.
+ *
+ * Called with interrupts disabled.
+ */
+static void ipi_end_io(void *data)
+{
+	struct llist_node *node, *following;
+
+	node = llist_del_all(&per_cpu(ipi_lists, smp_processor_id()));
+
+	for (; node; node = following) {
+		struct request *rq = llist_entry(node, struct request, ll_list);
+
+		/* read the link before completing the request */
+		following = node->next;
+		__blk_mq_end_io(rq, rq->errors);
+	}
+}
+
+/*
+ * Queue @rq for completion on the CPU owning @ctx and, if needed, send that
+ * CPU an IPI to process its list. The error is stashed in rq->errors for
+ * the handler. Always returns true in this (SMP) variant. @cpu is unused.
+ */
+static int ipi_remote_cpu(struct blk_mq_ctx *ctx, const int cpu,
+ struct request *rq, const int error)
+{
+ struct call_single_data *data = &rq->csd;
+
+ rq->errors = error;
+ rq->ll_list.next = NULL;
+
+ /*
+ * If the list is non-empty, an existing IPI must already
+ * be "in flight". If that is the case, we need not schedule
+ * a new one.
+ */
+ if (llist_add(&rq->ll_list, &per_cpu(ipi_lists, ctx->cpu))) {
+ data->func = ipi_end_io;
+ data->flags = 0;
+ __smp_call_function_single(ctx->cpu, data, 0);
+ }
+
+ return true;
+}
+#else /* CONFIG_SMP */
+/* !CONFIG_SMP stub: never redirects, so the caller completes the request inline. */
+static int ipi_remote_cpu(struct blk_mq_ctx *ctx, const int cpu,
+ struct request *rq, const int error)
+{
+ return false;
+}
+#endif
+
+/*
+ * End IO on a request for a multiqueue enabled driver. The completion runs
+ * inline unless the request's software queue asks for IPI redirection, its
+ * CPU differs from the current one and is online, and the request could be
+ * handed to that CPU.
+ */
+void blk_mq_end_io(struct request *rq, int error)
+{
+	struct blk_mq_ctx *ctx = rq->mq_ctx;
+	bool redirected = false;
+	int this_cpu;
+
+	if (!ctx->ipi_redirect) {
+		__blk_mq_end_io(rq, error);
+		return;
+	}
+
+	this_cpu = get_cpu();
+
+	if (this_cpu != ctx->cpu && cpu_online(ctx->cpu))
+		redirected = ipi_remote_cpu(ctx, this_cpu, rq, error);
+
+	if (!redirected)
+		__blk_mq_end_io(rq, error);
+
+	put_cpu();
+}
+EXPORT_SYMBOL(blk_mq_end_io);
+
+/*
+ * Mark @rq as started: record its timeout deadline and set the
+ * REQ_ATOM_STARTED bit that the timeout scan checks.
+ */
+static void blk_mq_start_request(struct request *rq)
+{
+ struct request_queue *q = rq->q;
+
+ trace_block_rq_issue(q, rq);
+
+ /*
+ * Just mark start time and set the started bit. Due to memory
+ * ordering, we know we'll see the correct deadline as long as
+ * REQ_ATOM_STARTED is seen.
+ */
+ rq->deadline = jiffies + q->rq_timeout;
+ set_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
+}
+
+/* Undo blk_mq_start_request(): clear the started bit so the timeout scan skips @rq. */
+static void blk_mq_requeue_request(struct request *rq)
+{
+ struct request_queue *q = rq->q;
+
+ trace_block_rq_requeue(q, rq);
+ clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
+}
+
+/* Argument bundle passed through blk_mq_tag_busy_iter() to blk_mq_timeout_check(). */
+struct blk_mq_timeout_data {
+ /* hardware queue whose requests are being scanned */
+ struct blk_mq_hw_ctx *hctx;
+ /* forwarded to blk_rq_check_expired() */
+ unsigned long *next;
+ /* forwarded to blk_rq_check_expired() */
+ unsigned int *next_set;
+};
+
+/*
+ * blk_mq_tag_busy_iter() callback. @free_tags is a bitmap in which set bits
+ * denote free tags; every clear bit below queue_depth is a tag in use. Each
+ * such request that has been started is checked for expiry.
+ */
+static void blk_mq_timeout_check(void *__data, unsigned long *free_tags)
+{
+ struct blk_mq_timeout_data *data = __data;
+ struct blk_mq_hw_ctx *hctx = data->hctx;
+ unsigned int tag;
+
+ /* It may not be in flight yet (this is where
+ * the REQ_ATOM_STARTED flag comes in). The requests are
+ * statically allocated, so we know it's always safe to access the
+ * memory associated with a bit offset into ->rqs[].
+ */
+ tag = 0;
+ do {
+ struct request *rq;
+
+ tag = find_next_zero_bit(free_tags, hctx->queue_depth, tag);
+ if (tag >= hctx->queue_depth)
+ break;
+
+ /* post-increment so the next search starts after this tag */
+ rq = hctx->rqs[tag++];
+
+ if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
+ continue;
+
+ blk_rq_check_expired(rq, data->next, data->next_set);
+ } while (1);
+}
+
+/*
+ * Scan one hardware queue for timed-out requests. @next and @next_set are
+ * passed through to blk_rq_check_expired() for each started request.
+ */
+static void blk_mq_hw_ctx_check_timeout(struct blk_mq_hw_ctx *hctx,
+ unsigned long *next,
+ unsigned int *next_set)
+{
+ struct blk_mq_timeout_data data = {
+ .hctx = hctx,
+ .next = next,
+ .next_set = next_set,
+ };
+
+ /*
+ * Ask the tagging code to iterate busy requests, so we can
+ * check them for timeout.
+ */
+ blk_mq_tag_busy_iter(hctx->tags, blk_mq_timeout_check, &data);
+}
+
+/*
+ * Queue timeout timer callback. @data is the request_queue cast to
+ * unsigned long. Every hardware queue is scanned for expired requests; if
+ * any scan reported a next deadline, the timer is re-armed for it.
+ */
+static void blk_mq_rq_timer(unsigned long data)
+{
+	struct request_queue *q = (struct request_queue *) data;
+	struct blk_mq_hw_ctx *hctx;
+	unsigned long next = 0;
+	/* unsigned to match the 'unsigned int *' the timeout helpers take */
+	unsigned int next_set = 0;
+	int i;
+
+	queue_for_each_hw_ctx(q, hctx, i)
+		blk_mq_hw_ctx_check_timeout(hctx, &next, &next_set);
+
+	if (next_set)
+		mod_timer(&q->timeout, round_jiffies_up(next));
+}
+
+/*
+ * Reverse check our software queue for entries that we could potentially
+ * merge with. Currently includes a hand-wavy stop count of 8, to not spend
+ * too much time checking for merges.
+ *
+ * Returns true if @bio was merged into an existing request (and bumps
+ * ctx->rq_merged), false otherwise.
+ */
+static bool blk_mq_attempt_merge(struct request_queue *q,
+ struct blk_mq_ctx *ctx, struct bio *bio)
+{
+ struct request *rq;
+ int checked = 8;
+
+ list_for_each_entry_reverse(rq, &ctx->rq_list, queuelist) {
+ int el_ret;
+
+ if (!checked--)
+ break;
+
+ if (!blk_rq_merge_ok(rq, bio))
+ continue;
+
+ el_ret = blk_try_merge(rq, bio);
+ if (el_ret == ELEVATOR_BACK_MERGE) {
+ if (bio_attempt_back_merge(q, rq, bio)) {
+ ctx->rq_merged++;
+ return true;
+ }
+ /* a merge candidate was found but the merge failed: give up */
+ break;
+ } else if (el_ret == ELEVATOR_FRONT_MERGE) {
+ if (bio_attempt_front_merge(q, rq, bio)) {
+ ctx->rq_merged++;
+ return true;
+ }
+ break;
+ }
+ }
+
+ return false;
+}
+
+/* Arm the timeout for @rq via __blk_add_timer(), passing NULL as the second argument. */
+void blk_mq_add_timer(struct request *rq)
+{
+ __blk_add_timer(rq, NULL);
+}
+
+/*
+ * Run this hardware queue, pulling any software queues mapped to it in.
+ * Note that this function currently has various problems around ordering
+ * of IO. In particular, we'd like FIFO behaviour on handling existing
+ * items on the hctx->dispatch list. Ignore that for now.
+ *
+ * Does nothing when the hardware queue is stopped. Requests the driver
+ * reports as busy, and everything still queued behind them, are parked on
+ * hctx->dispatch for the next run.
+ */
+static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
+{
+	struct request_queue *q = hctx->queue;
+	struct blk_mq_ctx *ctx;
+	struct request *rq;
+	LIST_HEAD(rq_list);
+	int bit, queued;
+
+	/*
+	 * BLK_MQ_S_STOPPED is a state bit: the stop/start helpers set and
+	 * clear it in hctx->state, so that is the word to test here.
+	 */
+	if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state)))
+		return;
+
+	hctx->run++;
+
+	/*
+	 * Touch any software queue that has pending entries.
+	 */
+	for_each_set_bit(bit, hctx->ctx_map, hctx->nr_ctx) {
+		clear_bit(bit, hctx->ctx_map);
+		ctx = hctx->ctxs[bit];
+		BUG_ON(bit != ctx->index_hw);
+
+		spin_lock(&ctx->lock);
+		list_splice_tail_init(&ctx->rq_list, &rq_list);
+		spin_unlock(&ctx->lock);
+	}
+
+	/*
+	 * If we have previous entries on our dispatch list, grab them
+	 * and stuff them at the front for more fair dispatch.
+	 */
+	if (!list_empty_careful(&hctx->dispatch)) {
+		spin_lock(&hctx->lock);
+		if (!list_empty(&hctx->dispatch))
+			list_splice_init(&hctx->dispatch, &rq_list);
+		spin_unlock(&hctx->lock);
+	}
+
+	/* Number of requests the driver accepted during this run. */
+	queued = 0;
+
+	/*
+	 * Now process all the entries, sending them to the driver.
+	 */
+	while (!list_empty(&rq_list)) {
+		int ret;
+
+		rq = list_first_entry(&rq_list, struct request, queuelist);
+		list_del_init(&rq->queuelist);
+		blk_mq_start_request(rq);
+
+		/*
+		 * Last request in the series. Flag it as such, this
+		 * enables drivers to know when IO should be kicked off,
+		 * if they don't do it on a per-request basis.
+		 *
+		 * Note: the flag isn't the only condition drivers
+		 * should do kick off. If drive is busy, the last
+		 * request might not have the bit set.
+		 */
+		if (list_empty(&rq_list))
+			rq->cmd_flags |= REQ_END;
+
+		ret = q->mq_ops->queue_rq(hctx, rq);
+		switch (ret) {
+		case BLK_MQ_RQ_QUEUE_OK:
+			queued++;
+			continue;
+		case BLK_MQ_RQ_QUEUE_BUSY:
+			/*
+			 * FIXME: we should have a mechanism to stop the queue
+			 * like blk_stop_queue, otherwise we will waste cpu
+			 * time
+			 */
+			list_add(&rq->queuelist, &rq_list);
+			blk_mq_requeue_request(rq);
+			break;
+		default:
+			pr_err("blk-mq: bad return on queue: %d\n", ret);
+			rq->errors = -EIO;
+			/* fall through */
+		case BLK_MQ_RQ_QUEUE_ERROR:
+			blk_mq_end_io(rq, rq->errors);
+			break;
+		}
+
+		if (ret == BLK_MQ_RQ_QUEUE_BUSY)
+			break;
+	}
+
+	/* Histogram of requests dispatched per run, bucketed by log2. */
+	if (!queued)
+		hctx->dispatched[0]++;
+	else if (queued < (1 << (BLK_MQ_MAX_DISPATCH_ORDER - 1)))
+		hctx->dispatched[ilog2(queued) + 1]++;
+
+	/*
+	 * Any items that need requeuing? Stuff them into hctx->dispatch,
+	 * that is where we will continue on next queue run.
+	 */
+	if (!list_empty(&rq_list)) {
+		spin_lock(&hctx->lock);
+		list_splice(&rq_list, &hctx->dispatch);
+		spin_unlock(&hctx->lock);
+	}
+}
+
+/*
+ * Run a hardware queue unless it is stopped. With @async false the queue
+ * is run directly in the caller's context; with @async true the run is
+ * deferred to the hardware queue's delayed work on kblockd.
+ */
+void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
+{
+	/* the stopped bit lives in hctx->state, where stop/start toggle it */
+	if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state)))
+		return;
+
+	if (!async)
+		__blk_mq_run_hw_queue(hctx);
+	else {
+		struct request_queue *q = hctx->queue;
+
+		kblockd_schedule_delayed_work(q, &hctx->delayed_work, 0);
+	}
+}
+
+/*
+ * Run every hardware queue of @q that has work (a pending software queue
+ * or a non-empty dispatch list) and is not stopped. @async is passed on to
+ * blk_mq_run_hw_queue().
+ */
+void blk_mq_run_queues(struct request_queue *q, bool async)
+{
+	struct blk_mq_hw_ctx *hctx;
+	int i;
+
+	queue_for_each_hw_ctx(q, hctx, i) {
+		/* the stopped bit lives in hctx->state, not hctx->flags */
+		if ((!blk_mq_hctx_has_pending(hctx) &&
+		    list_empty_careful(&hctx->dispatch)) ||
+		    test_bit(BLK_MQ_S_STOPPED, &hctx->state))
+			continue;
+
+		blk_mq_run_hw_queue(hctx, async);
+	}
+}
+EXPORT_SYMBOL(blk_mq_run_queues);
+
+/* Stop a hardware queue: cancel its pending delayed run and set the stopped state bit. */
+void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx)
+{
+ cancel_delayed_work(&hctx->delayed_work);
+ set_bit(BLK_MQ_S_STOPPED, &hctx->state);
+}
+EXPORT_SYMBOL(blk_mq_stop_hw_queue);
+
+/* Stop every hardware queue of @q. */
+void blk_mq_stop_hw_queues(struct request_queue *q)
+{
+ struct blk_mq_hw_ctx *hctx;
+ int i;
+
+ queue_for_each_hw_ctx(q, hctx, i)
+ blk_mq_stop_hw_queue(hctx);
+}
+EXPORT_SYMBOL(blk_mq_stop_hw_queues);
+
+/* Clear the stopped state bit and run the hardware queue in the caller's context. */
+void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx)
+{
+ clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
+ __blk_mq_run_hw_queue(hctx);
+}
+EXPORT_SYMBOL(blk_mq_start_hw_queue);
+
+/*
+ * Restart every hardware queue of @q that is currently stopped, scheduling
+ * an asynchronous run for each. Queues that are not stopped are left alone.
+ */
+void blk_mq_start_stopped_hw_queues(struct request_queue *q)
+{
+ struct blk_mq_hw_ctx *hctx;
+ int i;
+
+ queue_for_each_hw_ctx(q, hctx, i) {
+ if (!test_bit(BLK_MQ_S_STOPPED, &hctx->state))
+ continue;
+
+ clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
+ blk_mq_run_hw_queue(hctx, true);
+ }
+}
+EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues);
+
+/* Delayed-work handler: run the hardware queue that embeds @work. */
+static void blk_mq_work_fn(struct work_struct *work)
+{
+ struct blk_mq_hw_ctx *hctx;
+
+ hctx = container_of(work, struct blk_mq_hw_ctx, delayed_work.work);
+ __blk_mq_run_hw_queue(hctx);
+}
+
+/*
+ * Append @rq to its software queue, mark that queue pending in @hctx and
+ * arm the request's timeout. Callers in this file hold ctx->lock around
+ * this call.
+ */
+static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx,
+ struct request *rq)
+{
+ struct blk_mq_ctx *ctx = rq->mq_ctx;
+
+ trace_block_rq_insert(hctx->queue, rq);
+
+ list_add_tail(&rq->queuelist, &ctx->rq_list);
+ blk_mq_hctx_mark_pending(hctx, ctx);
+
+ /*
+ * We do this early, to ensure we are on the right CPU.
+ */
+ blk_mq_add_timer(rq);
+}
+
+/*
+ * Insert an already allocated request into the queue. FLUSH/FUA requests
+ * are handed to blk_insert_flush(); everything else is added to the
+ * request's software queue, or to the current CPU's software queue if the
+ * original one's CPU is offline. If @run_queue is set, the hardware queue
+ * is run in the caller's context afterwards.
+ */
+void blk_mq_insert_request(struct request_queue *q, struct request *rq,
+ bool run_queue)
+{
+ struct blk_mq_hw_ctx *hctx;
+ struct blk_mq_ctx *ctx, *current_ctx;
+
+ ctx = rq->mq_ctx;
+ hctx = q->mq_ops->map_queue(q, ctx->cpu);
+
+ if (rq->cmd_flags & (REQ_FLUSH | REQ_FUA)) {
+ blk_insert_flush(rq);
+ } else {
+ current_ctx = blk_mq_get_ctx(q);
+
+ /* retarget to the current CPU's ctx if the original CPU went away */
+ if (!cpu_online(ctx->cpu)) {
+ ctx = current_ctx;
+ hctx = q->mq_ops->map_queue(q, ctx->cpu);
+ rq->mq_ctx = ctx;
+ }
+ spin_lock(&ctx->lock);
+ __blk_mq_insert_request(hctx, rq);
+ spin_unlock(&ctx->lock);
+
+ blk_mq_put_ctx(current_ctx);
+ }
+
+ if (run_queue)
+ __blk_mq_run_hw_queue(hctx);
+}
+EXPORT_SYMBOL(blk_mq_insert_request);
+
+/*
+ * This is a special version of blk_mq_insert_request to bypass FLUSH request
+ * check. Should only be used internally.
+ *
+ * If @run_queue is set the hardware queue is run afterwards, asynchronously
+ * when @async is true.
+ */
+void blk_mq_run_request(struct request *rq, bool run_queue, bool async)
+{
+ struct request_queue *q = rq->q;
+ struct blk_mq_hw_ctx *hctx;
+ struct blk_mq_ctx *ctx, *current_ctx;
+
+ current_ctx = blk_mq_get_ctx(q);
+
+ ctx = rq->mq_ctx;
+ /* retarget to the current CPU's ctx if the original CPU is offline */
+ if (!cpu_online(ctx->cpu)) {
+ ctx = current_ctx;
+ rq->mq_ctx = ctx;
+ }
+ hctx = q->mq_ops->map_queue(q, ctx->cpu);
+
+ /* ctx->cpu might be offline */
+ spin_lock(&ctx->lock);
+ __blk_mq_insert_request(hctx, rq);
+ spin_unlock(&ctx->lock);
+
+ blk_mq_put_ctx(current_ctx);
+
+ if (run_queue)
+ blk_mq_run_hw_queue(hctx, async);
+}
+
+/*
+ * Move every request on @list into software queue @ctx (or the current
+ * CPU's ctx if @ctx's CPU is offline) and then run the mapped hardware
+ * queue, asynchronously when @from_schedule is true. @depth is only used
+ * for the unplug trace event.
+ */
+static void blk_mq_insert_requests(struct request_queue *q,
+ struct blk_mq_ctx *ctx,
+ struct list_head *list,
+ int depth,
+ bool from_schedule)
+
+{
+ struct blk_mq_hw_ctx *hctx;
+ struct blk_mq_ctx *current_ctx;
+
+ trace_block_unplug(q, depth, !from_schedule);
+
+ current_ctx = blk_mq_get_ctx(q);
+
+ if (!cpu_online(ctx->cpu))
+ ctx = current_ctx;
+ hctx = q->mq_ops->map_queue(q, ctx->cpu);
+
+ /*
+ * preemption doesn't flush plug list, so it's possible ctx->cpu is
+ * offline now
+ */
+ spin_lock(&ctx->lock);
+ while (!list_empty(list)) {
+ struct request *rq;
+
+ rq = list_first_entry(list, struct request, queuelist);
+ list_del_init(&rq->queuelist);
+ /* the ctx may have been retargeted above, so rebind the request */
+ rq->mq_ctx = ctx;
+ __blk_mq_insert_request(hctx, rq);
+ }
+ spin_unlock(&ctx->lock);
+
+ blk_mq_put_ctx(current_ctx);
+
+ blk_mq_run_hw_queue(hctx, from_schedule);
+}
+
+/*
+ * list_sort() comparator for plugged requests: order by software queue
+ * pointer first, then by start sector. Returns 0 when @a sorts strictly
+ * before @b and 1 otherwise.
+ */
+static int plug_ctx_cmp(void *priv, struct list_head *a, struct list_head *b)
+{
+	struct request *rqa = container_of(a, struct request, queuelist);
+	struct request *rqb = container_of(b, struct request, queuelist);
+
+	if (rqa->mq_ctx < rqb->mq_ctx)
+		return 0;
+
+	if (rqa->mq_ctx == rqb->mq_ctx &&
+	    blk_rq_pos(rqa) < blk_rq_pos(rqb))
+		return 0;
+
+	return 1;
+}
+
+/*
+ * Flush the multiqueue part of a task plug: sort the plugged requests by
+ * software queue and position, then insert each run of requests sharing a
+ * software queue in one batch through blk_mq_insert_requests().
+ */
+void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
+{
+ struct blk_mq_ctx *this_ctx;
+ struct request_queue *this_q;
+ struct request *rq;
+ LIST_HEAD(list);
+ LIST_HEAD(ctx_list);
+ unsigned int depth;
+
+ list_splice_init(&plug->mq_list, &list);
+
+ list_sort(NULL, &list, plug_ctx_cmp);
+
+ this_q = NULL;
+ this_ctx = NULL;
+ depth = 0;
+
+ while (!list_empty(&list)) {
+ rq = list_entry_rq(list.next);
+ list_del_init(&rq->queuelist);
+ BUG_ON(!rq->q);
+ /* ctx changed: flush the batch collected for the previous ctx */
+ if (rq->mq_ctx != this_ctx) {
+ if (this_ctx) {
+ blk_mq_insert_requests(this_q, this_ctx,
+ &ctx_list, depth,
+ from_schedule);
+ }
+
+ this_ctx = rq->mq_ctx;
+ this_q = rq->q;
+ depth = 0;
+ }
+
+ depth++;
+ list_add_tail(&rq->queuelist, &ctx_list);
+ }
+
+ /*
+ * If 'this_ctx' is set, we know we have entries to complete
+ * on 'ctx_list'. Do those.
+ */
+ if (this_ctx) {
+ blk_mq_insert_requests(this_q, this_ctx, &ctx_list, depth,
+ from_schedule);
+ }
+}
+
+/* Initialize @rq from @bio and start I/O accounting for it. */
+static void blk_mq_bio_to_request(struct request *rq, struct bio *bio)
+{
+ init_request_from_bio(rq, bio);
+ blk_account_io_start(rq, 1);
+}
+
+/*
+ * Bio submission entry point for multiqueue devices.
+ *
+ * The bio is either merged into a plugged request, turned into a new
+ * request that is parked on the task's plug list, merged into a request
+ * already on the software queue, or inserted into the software queue as a
+ * new request. FLUSH/FUA bios go through blk_insert_flush() instead.
+ * If the queue cannot be entered, the bio is completed with -EIO.
+ */
+static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
+{
+ struct blk_mq_hw_ctx *hctx;
+ struct blk_mq_ctx *ctx;
+ const int is_sync = rw_is_sync(bio->bi_rw);
+ const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA);
+ int rw = bio_data_dir(bio);
+ struct request *rq;
+ unsigned int use_plug, request_count = 0;
+
+ /*
+ * If we have multiple hardware queues, just go directly to
+ * one of those for sync IO.
+ */
+ use_plug = !is_flush_fua && ((q->nr_hw_queues == 1) || !is_sync);
+
+ blk_queue_bounce(q, &bio);
+
+ if (use_plug && blk_attempt_plug_merge(q, bio, &request_count))
+ return;
+
+ if (blk_mq_queue_enter(q)) {
+ bio_endio(bio, -EIO);
+ return;
+ }
+
+ ctx = blk_mq_get_ctx(q);
+ hctx = q->mq_ops->map_queue(q, ctx->cpu);
+
+ trace_block_getrq(q, bio, rw);
+ /* fast path: non-blocking allocation on the current CPU's queue */
+ rq = __blk_mq_alloc_request(hctx, GFP_ATOMIC, false);
+ if (likely(rq))
+ blk_mq_rq_ctx_init(q, ctx, rq, rw);
+ else {
+ /* slow path: drop the ctx and retry with __GFP_WAIT set */
+ blk_mq_put_ctx(ctx);
+ trace_block_sleeprq(q, bio, rw);
+ rq = blk_mq_alloc_request_pinned(q, rw, __GFP_WAIT|GFP_ATOMIC,
+ false);
+ /* the CPU may have changed while waiting: reload ctx and hctx */
+ ctx = rq->mq_ctx;
+ hctx = q->mq_ops->map_queue(q, ctx->cpu);
+ }
+
+ hctx->queued++;
+
+ if (unlikely(is_flush_fua)) {
+ blk_mq_bio_to_request(rq, bio);
+ blk_mq_put_ctx(ctx);
+ blk_insert_flush(rq);
+ goto run_queue;
+ }
+
+ /*
+ * A task plug currently exists. Since this is completely lockless,
+ * utilize that to temporarily store requests until the task is
+ * either done or scheduled away.
+ */
+ if (use_plug) {
+ struct blk_plug *plug = current->plug;
+
+ if (plug) {
+ blk_mq_bio_to_request(rq, bio);
+ if (list_empty(&plug->mq_list))
+ trace_block_plug(q);
+ else if (request_count >= BLK_MAX_REQUEST_COUNT) {
+ blk_flush_plug_list(plug, false);
+ trace_block_plug(q);
+ }
+ list_add_tail(&rq->queuelist, &plug->mq_list);
+ blk_mq_put_ctx(ctx);
+ return;
+ }
+ }
+
+ spin_lock(&ctx->lock);
+
+ /* if the bio merged into a queued request, the new request is not needed */
+ if ((hctx->flags & BLK_MQ_F_SHOULD_MERGE) &&
+ blk_mq_attempt_merge(q, ctx, bio))
+ __blk_mq_free_request(hctx, ctx, rq);
+ else {
+ blk_mq_bio_to_request(rq, bio);
+ __blk_mq_insert_request(hctx, rq);
+ }
+
+ spin_unlock(&ctx->lock);
+ blk_mq_put_ctx(ctx);
+
+ /*
+ * For a SYNC request, send it to the hardware immediately. For an
+ * ASYNC request, just ensure that we run it later on. The latter
+ * allows for merging opportunities and more efficient dispatching.
+ */
+run_queue:
+ blk_mq_run_hw_queue(hctx, !is_sync || is_flush_fua);
+}
+
+/*
+ * Default mapping to a software queue, since we use one per CPU.
+ *
+ * Looks up the hardware queue index for @cpu in q->mq_map and returns the
+ * corresponding hardware queue context.
+ */
+struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q, const int cpu)
+{
+ return q->queue_hw_ctx[q->mq_map[cpu]];
+}
+EXPORT_SYMBOL(blk_mq_map_queue);
+
+/*
+ * Default hardware queue allocator: returns a zeroed blk_mq_hw_ctx
+ * allocated on the NUMA node named in @reg, or NULL on failure.
+ * @hctx_index is unused.
+ */
+struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_reg *reg,
+					unsigned int hctx_index)
+{
+	return kzalloc_node(sizeof(struct blk_mq_hw_ctx), GFP_KERNEL,
+			    reg->numa_node);
+}
+EXPORT_SYMBOL(blk_mq_alloc_single_hw_queue);
+
+/* Default hardware queue destructor: frees @hctx. @hctx_index is unused. */
+void blk_mq_free_single_hw_queue(struct blk_mq_hw_ctx *hctx,
+ unsigned int hctx_index)
+{
+ kfree(hctx);
+}
+EXPORT_SYMBOL(blk_mq_free_single_hw_queue);
+
+/*
+ * CPU hotplug callback for a hardware queue (@data). On CPU_DEAD or
+ * CPU_DEAD_FROZEN, requests still sitting on the dead CPU's software queue
+ * are moved to the current CPU's software queue. Other actions are ignored.
+ */
+static void blk_mq_hctx_notify(void *data, unsigned long action,
+ unsigned int cpu)
+{
+ struct blk_mq_hw_ctx *hctx = data;
+ struct blk_mq_ctx *ctx;
+ LIST_HEAD(tmp);
+
+ if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
+ return;
+
+ /*
+ * Move ctx entries to new CPU, if this one is going away.
+ */
+ ctx = __blk_mq_get_ctx(hctx->queue, cpu);
+
+ spin_lock(&ctx->lock);
+ if (!list_empty(&ctx->rq_list)) {
+ list_splice_init(&ctx->rq_list, &tmp);
+ clear_bit(ctx->index_hw, hctx->ctx_map);
+ }
+ spin_unlock(&ctx->lock);
+
+ if (list_empty(&tmp))
+ return;
+
+ /* from here on, ctx is the current CPU's software queue */
+ ctx = blk_mq_get_ctx(hctx->queue);
+ spin_lock(&ctx->lock);
+
+ while (!list_empty(&tmp)) {
+ struct request *rq;
+
+ rq = list_first_entry(&tmp, struct request, queuelist);
+ rq->mq_ctx = ctx;
+ list_move_tail(&rq->queuelist, &ctx->rq_list);
+ }
+
+ /*
+ * NOTE(review): the new ctx is marked pending in the notified hctx,
+ * and no queue run is triggered here. If the current CPU maps to a
+ * different hardware queue, this may mark the wrong map - confirm.
+ */
+ blk_mq_hctx_mark_pending(hctx, ctx);
+
+ spin_unlock(&ctx->lock);
+ blk_mq_put_ctx(ctx);
+}
+
+/*
+ * Call @init once for every pre-allocated request of @hctx, passing @data,
+ * the hardware queue, the request and its index.
+ */
+static void blk_mq_init_hw_commands(struct blk_mq_hw_ctx *hctx,
+				    void (*init)(void *, struct blk_mq_hw_ctx *,
+					struct request *, unsigned int),
+				    void *data)
+{
+	unsigned int idx;
+
+	for (idx = 0; idx < hctx->queue_depth; idx++)
+		init(data, hctx, hctx->rqs[idx], idx);
+}
+
+/*
+ * Call the driver's @init callback on every pre-allocated request of every
+ * hardware queue of @q. @data is passed through to the callback.
+ */
+void blk_mq_init_commands(struct request_queue *q,
+ void (*init)(void *, struct blk_mq_hw_ctx *,
+ struct request *, unsigned int),
+ void *data)
+{
+ struct blk_mq_hw_ctx *hctx;
+ unsigned int i;
+
+ queue_for_each_hw_ctx(q, hctx, i)
+ blk_mq_init_hw_commands(hctx, init, data);
+}
+EXPORT_SYMBOL(blk_mq_init_commands);
+
+/*
+ * Release everything blk_mq_init_rq_map() set up: the pages backing the
+ * requests, the request pointer array, and the tag map if one exists.
+ */
+static void blk_mq_free_rq_map(struct blk_mq_hw_ctx *hctx)
+{
+ struct page *page;
+
+ while (!list_empty(&hctx->page_list)) {
+ page = list_first_entry(&hctx->page_list, struct page, list);
+ list_del_init(&page->list);
+ /* page->private holds the allocation order stored at allocation time */
+ __free_pages(page, page->private);
+ }
+
+ kfree(hctx->rqs);
+
+ if (hctx->tags)
+ blk_mq_free_tags(hctx->tags);
+}
+
+/* Return the size in bytes of an allocation of 2^@order pages. */
+static size_t order_to_size(unsigned int order)
+{
+	size_t bytes = PAGE_SIZE;
+	unsigned int doubled;
+
+	for (doubled = 0; doubled < order; doubled++)
+		bytes *= 2;
+
+	return bytes;
+}
+
+/*
+ * Allocate the request map of @hctx: the hctx->rqs[] pointer array, the
+ * pages that back the requests themselves (tracked on hctx->page_list)
+ * and the tag map.
+ *
+ * Requests are carved out of page allocations of at most order 4. If page
+ * allocation fails part way through, the queue depth is reduced to the
+ * number of requests that could be set up, provided at least
+ * @reserved_tags + BLK_MQ_TAG_MIN of them exist.
+ *
+ * Returns 0 on success. Returns -ENOMEM on failure, in which case whatever
+ * was allocated here has already been released via blk_mq_free_rq_map().
+ */
+static int blk_mq_init_rq_map(struct blk_mq_hw_ctx *hctx,
+		unsigned int reserved_tags, int node)
+{
+	unsigned int i, j, entries_per_page, max_order = 4;
+	size_t rq_size, left;
+
+	INIT_LIST_HEAD(&hctx->page_list);
+
+	hctx->rqs = kmalloc_node(hctx->queue_depth * sizeof(struct request *),
+				 GFP_KERNEL, node);
+	if (!hctx->rqs)
+		return -ENOMEM;
+
+	/*
+	 * rq_size is the size of the request plus driver payload, rounded
+	 * to the cacheline size
+	 */
+	rq_size = round_up(sizeof(struct request) + hctx->cmd_size,
+			   cache_line_size());
+	left = rq_size * hctx->queue_depth;
+
+	for (i = 0; i < hctx->queue_depth;) {
+		int this_order = max_order;
+		struct page *page;
+		int to_do;
+		void *p;
+
+		/*
+		 * Don't allocate a larger block than the remaining requests
+		 * need. this_order has to be tested first: calling
+		 * order_to_size(this_order - 1) with this_order == 0 would
+		 * convert -1 to UINT_MAX and make the helper loop roughly
+		 * four billion times.
+		 */
+		while (this_order && left < order_to_size(this_order - 1))
+			this_order--;
+
+		/*
+		 * Retry with smaller orders on failure, but give up once a
+		 * block could no longer hold even a single request.
+		 */
+		do {
+			page = alloc_pages_node(node, GFP_KERNEL, this_order);
+			if (page)
+				break;
+			if (!this_order--)
+				break;
+			if (order_to_size(this_order) < rq_size)
+				break;
+		} while (1);
+
+		if (!page)
+			break;
+
+		/* remember the order so blk_mq_free_rq_map() can free it */
+		page->private = this_order;
+		list_add_tail(&page->list, &hctx->page_list);
+
+		p = page_address(page);
+		entries_per_page = order_to_size(this_order) / rq_size;
+		to_do = min(entries_per_page, hctx->queue_depth - i);
+		left -= to_do * rq_size;
+		for (j = 0; j < to_do; j++) {
+			hctx->rqs[i] = p;
+			blk_mq_rq_init(hctx, hctx->rqs[i]);
+			p += rq_size;
+			i++;
+		}
+	}
+
+	/* too few requests to be usable at all */
+	if (i < (reserved_tags + BLK_MQ_TAG_MIN))
+		goto err_rq_map;
+
+	if (i != hctx->queue_depth) {
+		hctx->queue_depth = i;
+		pr_warn("%s: queue depth set to %u because of low memory\n",
+			__func__, i);
+	}
+
+	hctx->tags = blk_mq_init_tags(hctx->queue_depth, reserved_tags, node);
+	if (!hctx->tags)
+		goto err_rq_map;
+
+	return 0;
+
+err_rq_map:
+	blk_mq_free_rq_map(hctx);
+	return -ENOMEM;
+}
+
+/*
+ * Initialise every hardware queue of @q from the registration data in
+ * @reg: basic fields, CPU notifier, request map, the ctxs[] array and the
+ * ctx_map bitmap, and finally the driver's optional ->init_hctx() hook
+ * (which receives @driver_data).
+ *
+ * Returns 0 on success and 1 on failure. On failure everything set up
+ * here is undone again: the queue that failed part way through is unwound
+ * step by step, and every queue that had been fully initialised gets
+ * ->exit_hctx() called and its notifier, request map, ctxs[] and ctx_map
+ * released.
+ */
+static int blk_mq_init_hw_queues(struct request_queue *q,
+		struct blk_mq_reg *reg, void *driver_data)
+{
+	struct blk_mq_hw_ctx *hctx;
+	unsigned int i, j;
+
+	/*
+	 * Initialize hardware queues
+	 */
+	queue_for_each_hw_ctx(q, hctx, i) {
+		unsigned int num_maps;
+		int node;
+
+		node = hctx->numa_node;
+		if (node == NUMA_NO_NODE)
+			node = hctx->numa_node = reg->numa_node;
+
+		INIT_DELAYED_WORK(&hctx->delayed_work, blk_mq_work_fn);
+		spin_lock_init(&hctx->lock);
+		INIT_LIST_HEAD(&hctx->dispatch);
+		hctx->queue = q;
+		hctx->queue_num = i;
+		hctx->flags = reg->flags;
+		hctx->queue_depth = reg->queue_depth;
+		hctx->cmd_size = reg->cmd_size;
+
+		blk_mq_init_cpu_notifier(&hctx->cpu_notifier,
+					 blk_mq_hctx_notify, hctx);
+		blk_mq_register_cpu_notifier(&hctx->cpu_notifier);
+
+		/* on failure the request map has already been released */
+		if (blk_mq_init_rq_map(hctx, reg->reserved_tags, node))
+			goto err_notifier;
+
+		/*
+		 * Allocate space for all possible cpus to avoid allocation in
+		 * runtime
+		 */
+		hctx->ctxs = kmalloc_node(nr_cpu_ids * sizeof(void *),
+					  GFP_KERNEL, node);
+		if (!hctx->ctxs)
+			goto err_rq_map;
+
+		num_maps = ALIGN(nr_cpu_ids, BITS_PER_LONG) / BITS_PER_LONG;
+		hctx->ctx_map = kzalloc_node(num_maps * sizeof(unsigned long),
+					     GFP_KERNEL, node);
+		if (!hctx->ctx_map)
+			goto err_ctxs;
+
+		hctx->nr_ctx_map = num_maps;
+		hctx->nr_ctx = 0;
+
+		if (reg->ops->init_hctx &&
+		    reg->ops->init_hctx(hctx, driver_data, i))
+			goto err_ctx_map;
+	}
+
+	return 0;
+
+	/*
+	 * Init failed. First unwind the queue that failed part way through
+	 * (hctx and i still refer to it). Leaving its notifier registered
+	 * would leave a dangling pointer once the caller frees the hctx.
+	 */
+err_ctx_map:
+	kfree(hctx->ctx_map);
+err_ctxs:
+	kfree(hctx->ctxs);
+err_rq_map:
+	blk_mq_free_rq_map(hctx);
+err_notifier:
+	blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
+
+	/* Then tear down the queues that had been fully initialised. */
+	queue_for_each_hw_ctx(q, hctx, j) {
+		if (i == j)
+			break;
+
+		if (reg->ops->exit_hctx)
+			reg->ops->exit_hctx(hctx, j);
+
+		blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
+		blk_mq_free_rq_map(hctx);
+		kfree(hctx->ctxs);
+		kfree(hctx->ctx_map);
+	}
+
+	return 1;
+}
+
+/*
+ * Reset and initialise the per-CPU software queue of every possible CPU of
+ * @q and count it against the hardware queue it maps to. For online CPUs,
+ * and only when there is more than one hardware queue, a hardware queue
+ * that has no NUMA node yet inherits the node of the CPU.
+ */
+static void blk_mq_init_cpu_queues(struct request_queue *q,
+		unsigned int nr_hw_queues)
+{
+	unsigned int cpu;
+
+	for_each_possible_cpu(cpu) {
+		struct blk_mq_ctx *sw_ctx = per_cpu_ptr(q->queue_ctx, cpu);
+		struct blk_mq_hw_ctx *hctx;
+
+		memset(sw_ctx, 0, sizeof(*sw_ctx));
+		sw_ctx->queue = q;
+		sw_ctx->cpu = cpu;
+		INIT_LIST_HEAD(&sw_ctx->rq_list);
+		spin_lock_init(&sw_ctx->lock);
+
+		/* If the cpu isn't online, the cpu is mapped to first hctx */
+		hctx = q->mq_ops->map_queue(q, cpu);
+		hctx->nr_ctx++;
+
+		/*
+		 * Set local node, IFF we have more than one hw queue. If
+		 * not, we remain on the home node of the device
+		 */
+		if (cpu_online(cpu) && nr_hw_queues > 1 &&
+		    hctx->numa_node == NUMA_NO_NODE)
+			hctx->numa_node = cpu_to_node(cpu);
+	}
+}
+
+/*
+ * (Re)build the software -> hardware queue mapping of @q: every hardware
+ * queue starts out with no software queues, then each software queue is
+ * appended to the ctxs[] array of the hardware queue ->map_queue() picks
+ * for it and records its slot in ctx->index_hw.
+ */
+static void blk_mq_map_swqueue(struct request_queue *q)
+{
+	struct blk_mq_hw_ctx *hctx;
+	struct blk_mq_ctx *ctx;
+	unsigned int n;
+
+	queue_for_each_hw_ctx(q, hctx, n)
+		hctx->nr_ctx = 0;
+
+	/*
+	 * Map software to hardware queues
+	 */
+	queue_for_each_ctx(q, ctx, n) {
+		/* If the cpu isn't online, the cpu is mapped to first hctx */
+		hctx = q->mq_ops->map_queue(q, n);
+
+		ctx->index_hw = hctx->nr_ctx;
+		hctx->ctxs[ctx->index_hw] = ctx;
+		hctx->nr_ctx++;
+	}
+}
+/*
+ * Allocate and set up a multiqueue request_queue from the registration
+ * data in @reg; @driver_data is handed on to blk_mq_init_hw_queues().
+ *
+ * Note that @reg itself is modified: queue_depth is defaulted or clamped
+ * to BLK_MQ_MAX_DEPTH, and both queue_depth and reserved_tags are
+ * incremented to make room for the flush tag.
+ *
+ * Returns the new queue, ERR_PTR(-EINVAL) if @reg is unusable, or
+ * ERR_PTR(-ENOMEM) for every later failure (including a failure reported
+ * by blk_mq_init_hw_queues()).
+ */
+struct request_queue *blk_mq_init_queue(struct blk_mq_reg *reg,
+ void *driver_data)
+{
+ struct blk_mq_hw_ctx **hctxs;
+ struct blk_mq_ctx *ctx;
+ struct request_queue *q;
+ int i;
+
+ if (!reg->nr_hw_queues ||
+ !reg->ops->queue_rq || !reg->ops->map_queue ||
+ !reg->ops->alloc_hctx || !reg->ops->free_hctx)
+ return ERR_PTR(-EINVAL); /* these ops are mandatory */
+
+ if (!reg->queue_depth)
+ reg->queue_depth = BLK_MQ_MAX_DEPTH;
+ else if (reg->queue_depth > BLK_MQ_MAX_DEPTH) {
+ pr_err("blk-mq: queuedepth too large (%u)\n", reg->queue_depth);
+ reg->queue_depth = BLK_MQ_MAX_DEPTH;
+ }
+
+ /*
+ * Set aside a tag for flush requests. It will only be used while
+ * another flush request is in progress but outside the driver.
+ *
+ * TODO: only allocate if flushes are supported
+ */
+ reg->queue_depth++;
+ reg->reserved_tags++;
+
+ if (reg->queue_depth < (reg->reserved_tags + BLK_MQ_TAG_MIN))
+ return ERR_PTR(-EINVAL);
+
+ ctx = alloc_percpu(struct blk_mq_ctx); /* one software queue per CPU */
+ if (!ctx)
+ return ERR_PTR(-ENOMEM);
+
+ hctxs = kmalloc_node(reg->nr_hw_queues * sizeof(*hctxs), GFP_KERNEL,
+ reg->numa_node);
+
+ if (!hctxs)
+ goto err_percpu;
+
+ for (i = 0; i < reg->nr_hw_queues; i++) {
+ hctxs[i] = reg->ops->alloc_hctx(reg, i);
+ if (!hctxs[i])
+ goto err_hctxs;
+
+ hctxs[i]->numa_node = NUMA_NO_NODE; /* resolved later in the init helpers */
+ hctxs[i]->queue_num = i;
+ }
+
+ q = blk_alloc_queue_node(GFP_KERNEL, reg->numa_node);
+ if (!q)
+ goto err_hctxs;
+
+ q->mq_map = blk_mq_make_queue_map(reg);
+ if (!q->mq_map)
+ goto err_map;
+
+ setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q);
+ blk_queue_rq_timeout(q, 30000); /* default; replaced below if reg->timeout is set */
+
+ q->nr_queues = nr_cpu_ids;
+ q->nr_hw_queues = reg->nr_hw_queues;
+
+ q->queue_ctx = ctx;
+ q->queue_hw_ctx = hctxs;
+
+ q->mq_ops = reg->ops;
+ q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;
+
+ blk_queue_make_request(q, blk_mq_make_request);
+ blk_queue_rq_timed_out(q, reg->ops->timeout);
+ if (reg->timeout)
+ blk_queue_rq_timeout(q, reg->timeout);
+
+ blk_mq_init_flush(q);
+ blk_mq_init_cpu_queues(q, reg->nr_hw_queues);
+
+ if (blk_mq_init_hw_queues(q, reg, driver_data))
+ goto err_hw;
+
+ blk_mq_map_swqueue(q);
+
+ mutex_lock(&all_q_mutex); /* make the queue visible to blk_mq_queue_reinit_notify() */
+ list_add_tail(&q->all_q_node, &all_q_list);
+ mutex_unlock(&all_q_mutex);
+
+ return q;
+err_hw: /*
+ * NOTE(review): q->mq_ops, q->queue_hw_ctx and q->queue_ctx are already
+ * set when this path runs, and blk_release_queue() calls
+ * blk_mq_free_queue() whenever q->mq_ops is set. Confirm that
+ * blk_cleanup_queue() below cannot end up there and release mq_map, the
+ * hctxs and the percpu ctx a second time.
+ */
+ kfree(q->mq_map);
+err_map:
+ blk_cleanup_queue(q);
+err_hctxs:
+ for (i = 0; i < reg->nr_hw_queues; i++) {
+ if (!hctxs[i])
+ break; /* slot whose alloc_hctx() failed; later slots were never written */
+ reg->ops->free_hctx(hctxs[i], i);
+ }
+ kfree(hctxs);
+err_percpu:
+ free_percpu(ctx);
+ return ERR_PTR(-ENOMEM);
+}
+EXPORT_SYMBOL(blk_mq_init_queue);
+/*
+ * Release the multiqueue specific parts of @q: per hardware queue the
+ * pending delayed work is cancelled, ctx_map, ctxs[] and the request map
+ * are freed, the CPU notifier is unregistered and the driver's optional
+ * ->exit_hctx() and its ->free_hctx() are called. Afterwards the percpu
+ * software queues, the hardware queue array and the CPU map are freed and
+ * @q is removed from all_q_list.
+ */
+void blk_mq_free_queue(struct request_queue *q)
+{
+ struct blk_mq_hw_ctx *hctx;
+ int i;
+
+ queue_for_each_hw_ctx(q, hctx, i) {
+ cancel_delayed_work_sync(&hctx->delayed_work);
+ kfree(hctx->ctx_map);
+ kfree(hctx->ctxs);
+ blk_mq_free_rq_map(hctx);
+ blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
+ if (q->mq_ops->exit_hctx)
+ q->mq_ops->exit_hctx(hctx, i);
+ q->mq_ops->free_hctx(hctx, i); /* hctx must not be touched after this */
+ }
+
+ free_percpu(q->queue_ctx);
+ kfree(q->queue_hw_ctx);
+ kfree(q->mq_map);
+
+ q->queue_ctx = NULL; /* clear the pointers that were just freed */
+ q->queue_hw_ctx = NULL;
+ q->mq_map = NULL;
+
+ mutex_lock(&all_q_mutex);
+ list_del_init(&q->all_q_node);
+ mutex_unlock(&all_q_mutex);
+}
+EXPORT_SYMBOL(blk_mq_free_queue);
+
+/*
+ * Basically redo blk_mq_init_queue with queue frozen: refresh the CPU map
+ * in q->mq_map for the current number of hardware queues and rebuild the
+ * software -> hardware queue mapping from it.
+ */
+static void blk_mq_queue_reinit(struct request_queue *q)
+{
+ blk_mq_freeze_queue(q);
+
+ blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues); /* return value is not checked */
+
+ /*
+ * redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. FIXME: maybe
+ * we should change hctx numa_node according to new topology (this
+ * involves free and re-allocate memory, worthy doing?)
+ */
+
+ blk_mq_map_swqueue(q);
+
+ blk_mq_unfreeze_queue(q);
+}
+
+/*
+ * CPU hotplug notifier: once a CPU has come online or has died (normal or
+ * _FROZEN variants), re-initialise the queue mapping of every queue on
+ * all_q_list. Every other action is ignored. Always returns NOTIFY_OK.
+ */
+static int blk_mq_queue_reinit_notify(struct notifier_block *nb,
+		unsigned long action, void *hcpu)
+{
+	struct request_queue *q;
+	bool cpu_set_changed;
+
+	/*
+	 * Before new mapping is established, hotadded cpu might already start
+	 * handling requests. This doesn't break anything as we map offline
+	 * CPUs to first hardware queue. We will re-init queue below to get
+	 * optimal settings.
+	 */
+	cpu_set_changed = action == CPU_DEAD || action == CPU_DEAD_FROZEN ||
+			  action == CPU_ONLINE || action == CPU_ONLINE_FROZEN;
+	if (!cpu_set_changed)
+		return NOTIFY_OK;
+
+	mutex_lock(&all_q_mutex);
+	list_for_each_entry(q, &all_q_list, all_q_node) {
+		blk_mq_queue_reinit(q);
+	}
+	mutex_unlock(&all_q_mutex);
+
+	return NOTIFY_OK;
+}
+/*
+ * Subsystem initcall: initialise the per-CPU ipi_lists of every possible
+ * CPU, run blk_mq_cpu_init() and register the hotplug notifier that
+ * remaps all queues when the set of CPUs changes. Always returns 0.
+ */
+static int __init blk_mq_init(void)
+{
+ unsigned int i;
+
+ for_each_possible_cpu(i)
+ init_llist_head(&per_cpu(ipi_lists, i));
+
+ blk_mq_cpu_init();
+
+ /* Must be called after percpu_counter_hotcpu_callback() */
+ hotcpu_notifier(blk_mq_queue_reinit_notify, -10); /* -10 is the notifier priority */
+
+ return 0;
+}
+subsys_initcall(blk_mq_init);
diff --git a/block/blk-mq.h b/block/blk-mq.h
new file mode 100644
index 0000000..52bf1f9
--- /dev/null
+++ b/block/blk-mq.h
@@ -0,0 +1,52 @@
+#ifndef INT_BLK_MQ_H
+#define INT_BLK_MQ_H
+/*
+ * Per-CPU software queue of a multiqueue request_queue (allocated with
+ * alloc_percpu() in blk_mq_init_queue()).
+ */
+struct blk_mq_ctx {
+ struct {
+ spinlock_t lock; /* protects rq_list */
+ struct list_head rq_list; /* requests queued on this software queue */
+ } ____cacheline_aligned_in_smp;
+
+ unsigned int cpu; /* CPU number this ctx belongs to */
+ unsigned int index_hw; /* slot in the mapped hctx's ctxs[]; also used as bit number in hctx->ctx_map */
+ unsigned int ipi_redirect; /* NOTE(review): not used in the visible code -- confirm meaning */
+
+ /* incremented at dispatch time */
+ unsigned long rq_dispatched[2]; /* presumably indexed by sync/async -- TODO confirm */
+ unsigned long rq_merged;
+
+ /* incremented at completion time */
+ unsigned long ____cacheline_aligned_in_smp rq_completed[2];
+
+ struct request_queue *queue; /* owning request queue */
+ struct kobject kobj; /* presumably the sysfs object of this ctx -- TODO confirm */
+};
+
+void __blk_mq_end_io(struct request *rq, int error);
+void blk_mq_complete_request(struct request *rq, int error);
+void blk_mq_run_request(struct request *rq, bool run_queue, bool async);
+void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
+void blk_mq_init_flush(struct request_queue *q);
+
+/*
+ * CPU hotplug helpers
+ */
+struct blk_mq_cpu_notifier;
+void blk_mq_init_cpu_notifier(struct blk_mq_cpu_notifier *notifier,
+ void (*fn)(void *, unsigned long, unsigned int),
+ void *data);
+void blk_mq_register_cpu_notifier(struct blk_mq_cpu_notifier *notifier);
+void blk_mq_unregister_cpu_notifier(struct blk_mq_cpu_notifier *notifier);
+void blk_mq_cpu_init(void);
+DECLARE_PER_CPU(struct llist_head, ipi_lists);
+
+/*
+ * CPU -> queue mappings
+ */
+struct blk_mq_reg;
+extern unsigned int *blk_mq_make_queue_map(struct blk_mq_reg *reg);
+extern int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues);
+
+void blk_mq_add_timer(struct request *rq);
+
+#endif
diff --git a/block/blk-settings.c b/block/blk-settings.c
index c50ecf0..05e8267 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -144,6 +144,7 @@ void blk_set_stacking_limits(struct queue_limits *lim)
lim->discard_zeroes_data = 1;
lim->max_segments = USHRT_MAX;
lim->max_hw_sectors = UINT_MAX;
+ lim->max_segment_size = UINT_MAX;
lim->max_sectors = UINT_MAX;
lim->max_write_same_sectors = UINT_MAX;
}
@@ -195,17 +196,17 @@ EXPORT_SYMBOL(blk_queue_make_request);
/**
* blk_queue_bounce_limit - set bounce buffer limit for queue
* @q: the request queue for the device
- * @dma_mask: the maximum address the device can handle
+ * @max_addr: the maximum address the device can handle
*
* Description:
* Different hardware can have different requirements as to what pages
* it can do I/O directly to. A low level driver can call
* blk_queue_bounce_limit to have lower memory pages allocated as bounce
- * buffers for doing I/O to pages residing above @dma_mask.
+ * buffers for doing I/O to pages residing above @max_addr.
**/
-void blk_queue_bounce_limit(struct request_queue *q, u64 dma_mask)
+void blk_queue_bounce_limit(struct request_queue *q, u64 max_addr)
{
- unsigned long b_pfn = dma_mask >> PAGE_SHIFT;
+ unsigned long b_pfn = max_addr >> PAGE_SHIFT;
int dma = 0;
q->bounce_gfp = GFP_NOIO;
diff --git a/block/blk-softirq.c b/block/blk-softirq.c
index 467c8de..57790c1 100644
--- a/block/blk-softirq.c
+++ b/block/blk-softirq.c
@@ -23,7 +23,7 @@ static void blk_done_softirq(struct softirq_action *h)
struct list_head *cpu_list, local_list;
local_irq_disable();
- cpu_list = &__get_cpu_var(blk_cpu_done);
+ cpu_list = this_cpu_ptr(&blk_cpu_done);
list_replace_init(cpu_list, &local_list);
local_irq_enable();
@@ -36,7 +36,7 @@ static void blk_done_softirq(struct softirq_action *h)
}
}
-#if defined(CONFIG_SMP) && defined(CONFIG_USE_GENERIC_SMP_HELPERS)
+#ifdef CONFIG_SMP
static void trigger_softirq(void *data)
{
struct request *rq = data;
@@ -44,7 +44,7 @@ static void trigger_softirq(void *data)
struct list_head *list;
local_irq_save(flags);
- list = &__get_cpu_var(blk_cpu_done);
+ list = this_cpu_ptr(&blk_cpu_done);
list_add_tail(&rq->csd.list, list);
if (list->next == &rq->csd.list)
@@ -71,15 +71,15 @@ static int raise_blk_irq(int cpu, struct request *rq)
return 1;
}
-#else /* CONFIG_SMP && CONFIG_USE_GENERIC_SMP_HELPERS */
+#else /* CONFIG_SMP */
static int raise_blk_irq(int cpu, struct request *rq)
{
return 1;
}
#endif
-static int __cpuinit blk_cpu_notify(struct notifier_block *self,
- unsigned long action, void *hcpu)
+static int blk_cpu_notify(struct notifier_block *self, unsigned long action,
+ void *hcpu)
{
/*
* If a CPU goes away, splice its entries to the current CPU
@@ -90,7 +90,7 @@ static int __cpuinit blk_cpu_notify(struct notifier_block *self,
local_irq_disable();
list_splice_init(&per_cpu(blk_cpu_done, cpu),
- &__get_cpu_var(blk_cpu_done));
+ this_cpu_ptr(&blk_cpu_done));
raise_softirq_irqoff(BLOCK_SOFTIRQ);
local_irq_enable();
}
@@ -98,7 +98,7 @@ static int __cpuinit blk_cpu_notify(struct notifier_block *self,
return NOTIFY_OK;
}
-static struct notifier_block __cpuinitdata blk_cpu_notifier = {
+static struct notifier_block blk_cpu_notifier = {
.notifier_call = blk_cpu_notify,
};
@@ -135,7 +135,7 @@ void __blk_complete_request(struct request *req)
if (ccpu == cpu || shared) {
struct list_head *list;
do_local:
- list = &__get_cpu_var(blk_cpu_done);
+ list = this_cpu_ptr(&blk_cpu_done);
list_add_tail(&req->csd.list, list);
/*
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 5efc5a6..9777952 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -7,6 +7,7 @@
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/blktrace_api.h>
+#include <linux/blk-mq.h>
#include "blk.h"
#include "blk-cgroup.h"
@@ -29,7 +30,7 @@ queue_var_store(unsigned long *var, const char *page, size_t count)
int err;
unsigned long v;
- err = strict_strtoul(page, 10, &v);
+ err = kstrtoul(page, 10, &v);
if (err || v > UINT_MAX)
return -EINVAL;
@@ -287,7 +288,7 @@ static ssize_t
queue_rq_affinity_store(struct request_queue *q, const char *page, size_t count)
{
ssize_t ret = -EINVAL;
-#if defined(CONFIG_USE_GENERIC_SMP_HELPERS)
+#ifdef CONFIG_SMP
unsigned long val;
ret = queue_var_store(&val, page, count);
@@ -542,6 +543,11 @@ static void blk_release_queue(struct kobject *kobj)
if (q->queue_tags)
__blk_queue_free_tags(q);
+ percpu_counter_destroy(&q->mq_usage_counter);
+
+ if (q->mq_ops)
+ blk_mq_free_queue(q);
+
blk_trace_shutdown(q);
bdi_destroy(&q->backing_dev_info);
@@ -575,6 +581,7 @@ int blk_register_queue(struct gendisk *disk)
* bypass from queue allocation.
*/
blk_queue_bypass_end(q);
+ queue_flag_set_unlocked(QUEUE_FLAG_INIT_DONE, q);
ret = blk_trace_init_sysfs(dev);
if (ret)
@@ -588,6 +595,9 @@ int blk_register_queue(struct gendisk *disk)
kobject_uevent(&q->kobj, KOBJ_ADD);
+ if (q->mq_ops)
+ blk_mq_register_disk(disk);
+
if (!q->request_fn)
return 0;
@@ -610,6 +620,9 @@ void blk_unregister_queue(struct gendisk *disk)
if (WARN_ON(!q))
return;
+ if (q->mq_ops)
+ blk_mq_unregister_disk(disk);
+
if (q->request_fn)
elv_unregister_queue(q);
diff --git a/block/blk-tag.c b/block/blk-tag.c
index cc345e1..3f33d86 100644
--- a/block/blk-tag.c
+++ b/block/blk-tag.c
@@ -348,9 +348,16 @@ int blk_queue_start_tag(struct request_queue *q, struct request *rq)
*/
max_depth = bqt->max_depth;
if (!rq_is_sync(rq) && max_depth > 1) {
- max_depth -= 2;
- if (!max_depth)
+ switch (max_depth) {
+ case 2:
max_depth = 1;
+ break;
+ case 3:
+ max_depth = 2;
+ break;
+ default:
+ max_depth -= 2;
+ }
if (q->in_flight[BLK_RW_ASYNC] > max_depth)
return 1;
}
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 3114622..0653404 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -25,18 +25,61 @@ static struct blkcg_policy blkcg_policy_throtl;
/* A workqueue to queue throttle related work */
static struct workqueue_struct *kthrotld_workqueue;
-static void throtl_schedule_delayed_work(struct throtl_data *td,
- unsigned long delay);
-
-struct throtl_rb_root {
- struct rb_root rb;
- struct rb_node *left;
- unsigned int count;
- unsigned long min_disptime;
+
+/*
+ * To implement hierarchical throttling, throtl_grps form a tree and bios
+ * are dispatched upwards level by level until they reach the top and get
+ * issued. When dispatching bios from the children and local group at each
+ * level, if the bios are dispatched into a single bio_list, there's a risk
+ * of a local or child group which can queue many bios at once filling up
+ * the list starving others.
+ *
+ * To avoid such starvation, dispatched bios are queued separately
+ * according to where they came from. When they are again dispatched to
+ * the parent, they're popped in round-robin order so that no single source
+ * hogs the dispatch window.
+ *
+ * throtl_qnode is used to keep the queued bios separated by their sources.
+ * Bios are queued to throtl_qnode which in turn is queued to
+ * throtl_service_queue and then dispatched in round-robin order.
+ *
+ * It's also used to track the reference counts on blkg's. A qnode always
+ * belongs to a throtl_grp and gets queued on itself or the parent, so
+ * incrementing the reference of the associated throtl_grp when a qnode is
+ * queued and decrementing when dequeued is enough to keep the whole blkg
+ * tree pinned while bios are in flight.
+ */
+struct throtl_qnode {
+ struct list_head node; /* service_queue->queued[] */
+ struct bio_list bios; /* queued bios */
+ struct throtl_grp *tg; /* tg this qnode belongs to */
};
-#define THROTL_RB_ROOT (struct throtl_rb_root) { .rb = RB_ROOT, .left = NULL, \
- .count = 0, .min_disptime = 0}
+struct throtl_service_queue {
+ struct throtl_service_queue *parent_sq; /* the parent service_queue; NULL for the top-level one embedded in throtl_data */
+
+ /*
+ * Bios queued directly to this service_queue or dispatched from
+ * children throtl_grp's.
+ */
+ struct list_head queued[2]; /* throtl_qnode [READ/WRITE] */
+ unsigned int nr_queued[2]; /* number of queued bios */
+
+ /*
+ * RB tree of active children throtl_grp's, which are sorted by
+ * their ->disptime.
+ */
+ struct rb_root pending_tree; /* RB tree of active tgs */
+ struct rb_node *first_pending; /* first node in the tree; NULL means "look it up again with rb_first()" */
+ unsigned int nr_pending; /* # queued in the tree */
+ unsigned long first_pending_disptime; /* disptime of the first tg */
+ struct timer_list pending_timer; /* fires on first_pending_disptime */
+};
+/* Bits kept in throtl_grp->flags. */
+enum tg_state_flags {
+ THROTL_TG_PENDING = 1 << 0, /* on parent's pending tree */
+ THROTL_TG_WAS_EMPTY = 1 << 1, /* bio_lists[] became non-empty */
+};
#define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node)
@@ -52,9 +95,26 @@ struct throtl_grp {
/* must be the first member */
struct blkg_policy_data pd;
- /* active throtl group service_tree member */
+ /* active throtl group service_queue member */
struct rb_node rb_node;
+ /* throtl_data this group belongs to */
+ struct throtl_data *td;
+
+ /* this group's service queue */
+ struct throtl_service_queue service_queue;
+
+ /*
+ * qnode_on_self is used when bios are directly queued to this
+ * throtl_grp so that local bios compete fairly with bios
+ * dispatched from children. qnode_on_parent is used when bios are
+ * dispatched from this throtl_grp into its parent and will compete
+ * with the sibling qnode_on_parents and the parent's
+ * qnode_on_self.
+ */
+ struct throtl_qnode qnode_on_self[2];
+ struct throtl_qnode qnode_on_parent[2];
+
/*
* Dispatch time in jiffies. This is the estimated time when group
* will unthrottle and is ready to dispatch more bio. It is used as
@@ -64,11 +124,8 @@ struct throtl_grp {
unsigned int flags;
- /* Two lists for READ and WRITE */
- struct bio_list bio_lists[2];
-
- /* Number of queued bios on READ and WRITE lists */
- unsigned int nr_queued[2];
+ /* are there any throtl rules between this group and td? */
+ bool has_rules[2];
/* bytes per second rate limits */
uint64_t bps[2];
@@ -85,9 +142,6 @@ struct throtl_grp {
unsigned long slice_start[2];
unsigned long slice_end[2];
- /* Some throttle limits got updated for the group */
- int limits_changed;
-
/* Per cpu stats pointer */
struct tg_stats_cpu __percpu *stats_cpu;
@@ -98,7 +152,7 @@ struct throtl_grp {
struct throtl_data
{
/* service tree for active throtl groups */
- struct throtl_rb_root tg_service_tree;
+ struct throtl_service_queue service_queue;
struct request_queue *queue;
@@ -111,9 +165,7 @@ struct throtl_data
unsigned int nr_undestroyed_grps;
/* Work for dispatching throttled bios */
- struct delayed_work throtl_work;
-
- int limits_changed;
+ struct work_struct dispatch_work;
};
/* list and work item to allocate percpu group stats */
@@ -123,6 +175,8 @@ static LIST_HEAD(tg_stats_alloc_list);
static void tg_stats_alloc_fn(struct work_struct *);
static DECLARE_DELAYED_WORK(tg_stats_alloc_work, tg_stats_alloc_fn);
+static void throtl_pending_timer_fn(unsigned long arg);
+
static inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd)
{
return pd ? container_of(pd, struct throtl_grp, pd) : NULL;
@@ -143,39 +197,69 @@ static inline struct throtl_grp *td_root_tg(struct throtl_data *td)
return blkg_to_tg(td->queue->root_blkg);
}
-enum tg_state_flags {
- THROTL_TG_FLAG_on_rr = 0, /* on round-robin busy list */
-};
-
-#define THROTL_TG_FNS(name) \
-static inline void throtl_mark_tg_##name(struct throtl_grp *tg) \
-{ \
- (tg)->flags |= (1 << THROTL_TG_FLAG_##name); \
-} \
-static inline void throtl_clear_tg_##name(struct throtl_grp *tg) \
-{ \
- (tg)->flags &= ~(1 << THROTL_TG_FLAG_##name); \
-} \
-static inline int throtl_tg_##name(const struct throtl_grp *tg) \
-{ \
- return ((tg)->flags & (1 << THROTL_TG_FLAG_##name)) != 0; \
+/**
+ * sq_to_tg - return the throtl_grp the specified service queue belongs to
+ * @sq: the throtl_service_queue of interest
+ *
+ * Return the throtl_grp @sq belongs to. If @sq is the top-level one
+ * embedded in throtl_data, %NULL is returned.
+ */
+static struct throtl_grp *sq_to_tg(struct throtl_service_queue *sq)
+{
+ if (sq && sq->parent_sq)
+ return container_of(sq, struct throtl_grp, service_queue);
+ else
+ return NULL;
}
-THROTL_TG_FNS(on_rr);
+/**
+ * sq_to_td - return throtl_data the specified service queue belongs to
+ * @sq: the throtl_service_queue of interest
+ *
+ * A service_queue can be embedded in either a throtl_grp or throtl_data.
+ * Determine the associated throtl_data accordingly and return it.
+ */
+static struct throtl_data *sq_to_td(struct throtl_service_queue *sq)
+{
+ struct throtl_grp *tg = sq_to_tg(sq);
-#define throtl_log_tg(td, tg, fmt, args...) do { \
- char __pbuf[128]; \
+ if (tg)
+ return tg->td;
+ else
+ return container_of(sq, struct throtl_data, service_queue);
+}
+
+/**
+ * throtl_log - log debug message via blktrace
+ * @sq: the service_queue being reported
+ * @fmt: printf format string
+ * @args: printf args
+ *
+ * The messages are prefixed with "throtl BLKG_NAME" if @sq belongs to a
+ * throtl_grp; otherwise, just "throtl".
+ *
+ * TODO: this should be made a function and name formatting should happen
+ * after testing whether blktrace is enabled.
+ */
+#define throtl_log(sq, fmt, args...) do { \
+ struct throtl_grp *__tg = sq_to_tg((sq)); \
+ struct throtl_data *__td = sq_to_td((sq)); \
+ \
+ (void)__td; \
+ if ((__tg)) { \
+ char __pbuf[128]; \
\
- blkg_path(tg_to_blkg(tg), __pbuf, sizeof(__pbuf)); \
- blk_add_trace_msg((td)->queue, "throtl %s " fmt, __pbuf, ##args); \
+ blkg_path(tg_to_blkg(__tg), __pbuf, sizeof(__pbuf)); \
+ blk_add_trace_msg(__td->queue, "throtl %s " fmt, __pbuf, ##args); \
+ } else { \
+ blk_add_trace_msg(__td->queue, "throtl " fmt, ##args); \
+ } \
} while (0)
-#define throtl_log(td, fmt, args...) \
- blk_add_trace_msg((td)->queue, "throtl " fmt, ##args)
-
-static inline unsigned int total_nr_queued(struct throtl_data *td)
+static void tg_stats_init(struct tg_stats_cpu *tg_stats)
{
- return td->nr_queued[0] + td->nr_queued[1];
+ blkg_rwstat_init(&tg_stats->service_bytes);
+ blkg_rwstat_init(&tg_stats->serviced);
}
/*
@@ -191,12 +275,16 @@ static void tg_stats_alloc_fn(struct work_struct *work)
alloc_stats:
if (!stats_cpu) {
+ int cpu;
+
stats_cpu = alloc_percpu(struct tg_stats_cpu);
if (!stats_cpu) {
/* allocation failed, try again after some time */
schedule_delayed_work(dwork, msecs_to_jiffies(10));
return;
}
+ for_each_possible_cpu(cpu)
+ tg_stats_init(per_cpu_ptr(stats_cpu, cpu));
}
spin_lock_irq(&tg_stats_alloc_lock);
@@ -215,15 +303,141 @@ alloc_stats:
goto alloc_stats;
}
+/* Set up @qn as an empty qnode that belongs to @tg and sits on no list. */
+static void throtl_qnode_init(struct throtl_qnode *qn, struct throtl_grp *tg)
+{
+	qn->tg = tg;
+	bio_list_init(&qn->bios);
+	INIT_LIST_HEAD(&qn->node);
+}
+
+/**
+ * throtl_qnode_add_bio - add a bio to a throtl_qnode and activate it
+ * @bio: bio being added
+ * @qn: qnode to add bio to
+ * @queued: the service_queue->queued[] list @qn belongs to
+ *
+ * Append @bio to @qn's bio list. If @qn was not yet linked on a list it
+ * is added to the tail of @queued and a reference is taken on the blkg of
+ * @qn->tg; a qnode that is already linked is left where it is. See the
+ * comment on top of throtl_qnode definition for details.
+ */
+static void throtl_qnode_add_bio(struct bio *bio, struct throtl_qnode *qn,
+				 struct list_head *queued)
+{
+	bio_list_add(&qn->bios, bio);
+
+	/* already active, the reference is held since activation */
+	if (!list_empty(&qn->node))
+		return;
+
+	list_add_tail(&qn->node, queued);
+	blkg_get(tg_to_blkg(qn->tg));
+}
+
+/**
+ * throtl_peek_queued - peek the first bio on a qnode list
+ * @queued: the qnode list to peek
+ *
+ * Returns the first bio of the first qnode on @queued without removing
+ * it, or %NULL if @queued is empty. A queued qnode without any bio
+ * triggers a one-time warning.
+ */
+static struct bio *throtl_peek_queued(struct list_head *queued)
+{
+	struct throtl_qnode *first_qn;
+	struct bio *head;
+
+	if (list_empty(queued))
+		return NULL;
+
+	first_qn = list_first_entry(queued, struct throtl_qnode, node);
+	head = bio_list_peek(&first_qn->bios);
+	WARN_ON_ONCE(!head);
+
+	return head;
+}
+
+/**
+ * throtl_pop_queued - pop the first bio from a qnode list
+ * @queued: the qnode list to pop a bio from
+ * @tg_to_put: optional out argument for throtl_grp to put
+ *
+ * Remove and return the first bio of the first qnode on @queued, or
+ * return %NULL if @queued is empty. A qnode that still holds bios
+ * afterwards is rotated to the tail of @queued so that popping is
+ * round-robin across qnodes; a qnode that became empty is unlinked.
+ *
+ * Unlinking a qnode means the reference on its throtl_grp has to be
+ * dropped. With @tg_to_put == %NULL this function drops it itself;
+ * otherwise the throtl_grp is stored in *@tg_to_put and dropping the
+ * reference becomes the caller's job.
+ */
+static struct bio *throtl_pop_queued(struct list_head *queued,
+				     struct throtl_grp **tg_to_put)
+{
+	struct throtl_qnode *first_qn;
+	struct bio *popped;
+
+	if (list_empty(queued))
+		return NULL;
+
+	first_qn = list_first_entry(queued, struct throtl_qnode, node);
+	popped = bio_list_pop(&first_qn->bios);
+	WARN_ON_ONCE(!popped);
+
+	/* more bios left: rotate for round-robin and keep the reference */
+	if (!bio_list_empty(&first_qn->bios)) {
+		list_move_tail(&first_qn->node, queued);
+		return popped;
+	}
+
+	list_del_init(&first_qn->node);
+	if (tg_to_put)
+		*tg_to_put = first_qn->tg;
+	else
+		blkg_put(tg_to_blkg(first_qn->tg));
+
+	return popped;
+}
+
+/*
+ * Initialise @sq as a child of @parent_sq: empty queued[] lists, empty
+ * pending tree and a pending timer that calls throtl_pending_timer_fn()
+ * with @sq as its argument. The caller is expected to have zeroed @sq.
+ */
+static void throtl_service_queue_init(struct throtl_service_queue *sq,
+				      struct throtl_service_queue *parent_sq)
+{
+	sq->parent_sq = parent_sq;
+	sq->pending_tree = RB_ROOT;
+
+	INIT_LIST_HEAD(&sq->queued[0]);
+	INIT_LIST_HEAD(&sq->queued[1]);
+
+	setup_timer(&sq->pending_timer, throtl_pending_timer_fn,
+		    (unsigned long)sq);
+}
+/* Tear down @sq: stop its pending timer, waiting for a running handler via del_timer_sync(). */
+static void throtl_service_queue_exit(struct throtl_service_queue *sq)
+{
+ del_timer_sync(&sq->pending_timer);
+}
+
static void throtl_pd_init(struct blkcg_gq *blkg)
{
struct throtl_grp *tg = blkg_to_tg(blkg);
+ struct throtl_data *td = blkg->q->td;
+ struct throtl_service_queue *parent_sq;
unsigned long flags;
+ int rw;
+
+ /*
+ * If sane_hierarchy is enabled, we switch to properly hierarchical
+ * behavior where limits on a given throtl_grp are applied to the
+ * whole subtree rather than just the group itself. e.g. If 16M
+ * read_bps limit is set on the root group, the whole system can't
+ * exceed 16M for the device.
+ *
+ * If sane_hierarchy is not enabled, the broken flat hierarchy
+ * behavior is retained where all throtl_grps are treated as if
+ * they're all separate root groups right below throtl_data.
+ * Limits of a group don't interact with limits of other groups
+ * regardless of the position of the group in the hierarchy.
+ */
+ parent_sq = &td->service_queue;
+
+ if (cgroup_sane_behavior(blkg->blkcg->css.cgroup) && blkg->parent)
+ parent_sq = &blkg_to_tg(blkg->parent)->service_queue;
+
+ throtl_service_queue_init(&tg->service_queue, parent_sq);
+
+ for (rw = READ; rw <= WRITE; rw++) {
+ throtl_qnode_init(&tg->qnode_on_self[rw], tg);
+ throtl_qnode_init(&tg->qnode_on_parent[rw], tg);
+ }
RB_CLEAR_NODE(&tg->rb_node);
- bio_list_init(&tg->bio_lists[0]);
- bio_list_init(&tg->bio_lists[1]);
- tg->limits_changed = false;
+ tg->td = td;
tg->bps[READ] = -1;
tg->bps[WRITE] = -1;
@@ -241,6 +455,30 @@ static void throtl_pd_init(struct blkcg_gq *blkg)
spin_unlock_irqrestore(&tg_stats_alloc_lock, flags);
}
+/*
+ * Recompute @tg->has_rules[] for both directions: a direction has rules
+ * if the parent group already has them or if @tg itself has a bps or iops
+ * limit configured (i.e. a value other than -1). Only the direct parent
+ * is consulted, as the parent's has_rules[] is guaranteed to be correct.
+ */
+static void tg_update_has_rules(struct throtl_grp *tg)
+{
+	struct throtl_grp *parent_tg = sq_to_tg(tg->service_queue.parent_sq);
+	int rw;
+
+	for (rw = READ; rw <= WRITE; rw++) {
+		bool inherited = parent_tg && parent_tg->has_rules[rw];
+		bool own = tg->bps[rw] != -1 || tg->iops[rw] != -1;
+
+		tg->has_rules[rw] = inherited || own;
+	}
+}
+/* Online hook for the throtl_grp behind @blkg; only refreshes its has_rules[]. */
+static void throtl_pd_online(struct blkcg_gq *blkg)
+{
+ /*
+ * We don't want new groups to escape the limits of its ancestors.
+ * Update has_rules[] after a new group is brought online.
+ */
+ tg_update_has_rules(blkg_to_tg(blkg));
+}
+
static void throtl_pd_exit(struct blkcg_gq *blkg)
{
struct throtl_grp *tg = blkg_to_tg(blkg);
@@ -251,6 +489,8 @@ static void throtl_pd_exit(struct blkcg_gq *blkg)
spin_unlock_irqrestore(&tg_stats_alloc_lock, flags);
free_percpu(tg->stats_cpu);
+
+ throtl_service_queue_exit(&tg->service_queue);
}
static void throtl_pd_reset_stats(struct blkcg_gq *blkg)
@@ -309,17 +549,18 @@ static struct throtl_grp *throtl_lookup_create_tg(struct throtl_data *td,
return tg;
}
-static struct throtl_grp *throtl_rb_first(struct throtl_rb_root *root)
+static struct throtl_grp *
+throtl_rb_first(struct throtl_service_queue *parent_sq)
{
/* Service tree is empty */
- if (!root->count)
+ if (!parent_sq->nr_pending)
return NULL;
- if (!root->left)
- root->left = rb_first(&root->rb);
+ if (!parent_sq->first_pending)
+ parent_sq->first_pending = rb_first(&parent_sq->pending_tree);
- if (root->left)
- return rb_entry_tg(root->left);
+ if (parent_sq->first_pending)
+ return rb_entry_tg(parent_sq->first_pending);
return NULL;
}
@@ -330,29 +571,30 @@ static void rb_erase_init(struct rb_node *n, struct rb_root *root)
RB_CLEAR_NODE(n);
}
-static void throtl_rb_erase(struct rb_node *n, struct throtl_rb_root *root)
+static void throtl_rb_erase(struct rb_node *n,
+ struct throtl_service_queue *parent_sq)
{
- if (root->left == n)
- root->left = NULL;
- rb_erase_init(n, &root->rb);
- --root->count;
+ if (parent_sq->first_pending == n)
+ parent_sq->first_pending = NULL;
+ rb_erase_init(n, &parent_sq->pending_tree);
+ --parent_sq->nr_pending;
}
-static void update_min_dispatch_time(struct throtl_rb_root *st)
+static void update_min_dispatch_time(struct throtl_service_queue *parent_sq)
{
struct throtl_grp *tg;
- tg = throtl_rb_first(st);
+ tg = throtl_rb_first(parent_sq);
if (!tg)
return;
- st->min_disptime = tg->disptime;
+ parent_sq->first_pending_disptime = tg->disptime;
}
-static void
-tg_service_tree_add(struct throtl_rb_root *st, struct throtl_grp *tg)
+static void tg_service_queue_add(struct throtl_grp *tg)
{
- struct rb_node **node = &st->rb.rb_node;
+ struct throtl_service_queue *parent_sq = tg->service_queue.parent_sq;
+ struct rb_node **node = &parent_sq->pending_tree.rb_node;
struct rb_node *parent = NULL;
struct throtl_grp *__tg;
unsigned long key = tg->disptime;
@@ -371,89 +613,135 @@ tg_service_tree_add(struct throtl_rb_root *st, struct throtl_grp *tg)
}
if (left)
- st->left = &tg->rb_node;
+ parent_sq->first_pending = &tg->rb_node;
rb_link_node(&tg->rb_node, parent, node);
- rb_insert_color(&tg->rb_node, &st->rb);
+ rb_insert_color(&tg->rb_node, &parent_sq->pending_tree);
+}
+
+static void __throtl_enqueue_tg(struct throtl_grp *tg)
+{
+ tg_service_queue_add(tg);
+ tg->flags |= THROTL_TG_PENDING;
+ tg->service_queue.parent_sq->nr_pending++;
}
-static void __throtl_enqueue_tg(struct throtl_data *td, struct throtl_grp *tg)
+static void throtl_enqueue_tg(struct throtl_grp *tg)
{
- struct throtl_rb_root *st = &td->tg_service_tree;
+ if (!(tg->flags & THROTL_TG_PENDING))
+ __throtl_enqueue_tg(tg);
+}
- tg_service_tree_add(st, tg);
- throtl_mark_tg_on_rr(tg);
- st->count++;
+static void __throtl_dequeue_tg(struct throtl_grp *tg)
+{
+ throtl_rb_erase(&tg->rb_node, tg->service_queue.parent_sq);
+ tg->flags &= ~THROTL_TG_PENDING;
}
-static void throtl_enqueue_tg(struct throtl_data *td, struct throtl_grp *tg)
+static void throtl_dequeue_tg(struct throtl_grp *tg)
{
- if (!throtl_tg_on_rr(tg))
- __throtl_enqueue_tg(td, tg);
+ if (tg->flags & THROTL_TG_PENDING)
+ __throtl_dequeue_tg(tg);
}
-static void __throtl_dequeue_tg(struct throtl_data *td, struct throtl_grp *tg)
+/* Call with queue lock held */
+static void throtl_schedule_pending_timer(struct throtl_service_queue *sq,
+ unsigned long expires)
{
- throtl_rb_erase(&tg->rb_node, &td->tg_service_tree);
- throtl_clear_tg_on_rr(tg);
+ mod_timer(&sq->pending_timer, expires);
+ throtl_log(sq, "schedule timer. delay=%lu jiffies=%lu",
+ expires - jiffies, jiffies);
}
-static void throtl_dequeue_tg(struct throtl_data *td, struct throtl_grp *tg)
+/**
+ * throtl_schedule_next_dispatch - schedule the next dispatch cycle
+ * @sq: the service_queue to schedule dispatch for
+ * @force: force scheduling
+ *
+ * Arm @sq->pending_timer so that the next dispatch cycle starts on the
+ * dispatch time of the first pending child. Returns %true if either timer
+ * is armed or there's no pending child left. %false if the current
+ * dispatch window is still open and the caller should continue
+ * dispatching.
+ *
+ * If @force is %true, the dispatch timer is always scheduled and this
+ * function is guaranteed to return %true. This is to be used when the
+ * caller can't dispatch itself and needs to invoke pending_timer
+ * unconditionally. Note that forced scheduling is likely to induce short
+ * delay before dispatch starts even if @sq->first_pending_disptime is not
+ * in the future and thus shouldn't be used in hot paths.
+ */
+static bool throtl_schedule_next_dispatch(struct throtl_service_queue *sq,
+ bool force)
{
- if (throtl_tg_on_rr(tg))
- __throtl_dequeue_tg(td, tg);
+ /* any pending children left? */
+ if (!sq->nr_pending)
+ return true;
+
+ update_min_dispatch_time(sq);
+
+ /* is the next dispatch time in the future? */
+ if (force || time_after(sq->first_pending_disptime, jiffies)) {
+ throtl_schedule_pending_timer(sq, sq->first_pending_disptime);
+ return true;
+ }
+
+ /* tell the caller to continue dispatching */
+ return false;
}
-static void throtl_schedule_next_dispatch(struct throtl_data *td)
+static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg,
+ bool rw, unsigned long start)
{
- struct throtl_rb_root *st = &td->tg_service_tree;
+ tg->bytes_disp[rw] = 0;
+ tg->io_disp[rw] = 0;
/*
- * If there are more bios pending, schedule more work.
+ * Previous slice has expired. We must have trimmed it after last
+ * bio dispatch. That means since start of last slice, we never used
+ * that bandwidth. Do try to make use of that bandwidth while giving
+ * credit.
*/
- if (!total_nr_queued(td))
- return;
-
- BUG_ON(!st->count);
+ if (time_after_eq(start, tg->slice_start[rw]))
+ tg->slice_start[rw] = start;
- update_min_dispatch_time(st);
-
- if (time_before_eq(st->min_disptime, jiffies))
- throtl_schedule_delayed_work(td, 0);
- else
- throtl_schedule_delayed_work(td, (st->min_disptime - jiffies));
+ tg->slice_end[rw] = jiffies + throtl_slice;
+ throtl_log(&tg->service_queue,
+ "[%c] new slice with credit start=%lu end=%lu jiffies=%lu",
+ rw == READ ? 'R' : 'W', tg->slice_start[rw],
+ tg->slice_end[rw], jiffies);
}
-static inline void
-throtl_start_new_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
+static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw)
{
tg->bytes_disp[rw] = 0;
tg->io_disp[rw] = 0;
tg->slice_start[rw] = jiffies;
tg->slice_end[rw] = jiffies + throtl_slice;
- throtl_log_tg(td, tg, "[%c] new slice start=%lu end=%lu jiffies=%lu",
- rw == READ ? 'R' : 'W', tg->slice_start[rw],
- tg->slice_end[rw], jiffies);
+ throtl_log(&tg->service_queue,
+ "[%c] new slice start=%lu end=%lu jiffies=%lu",
+ rw == READ ? 'R' : 'W', tg->slice_start[rw],
+ tg->slice_end[rw], jiffies);
}
-static inline void throtl_set_slice_end(struct throtl_data *td,
- struct throtl_grp *tg, bool rw, unsigned long jiffy_end)
+static inline void throtl_set_slice_end(struct throtl_grp *tg, bool rw,
+ unsigned long jiffy_end)
{
tg->slice_end[rw] = roundup(jiffy_end, throtl_slice);
}
-static inline void throtl_extend_slice(struct throtl_data *td,
- struct throtl_grp *tg, bool rw, unsigned long jiffy_end)
+static inline void throtl_extend_slice(struct throtl_grp *tg, bool rw,
+ unsigned long jiffy_end)
{
tg->slice_end[rw] = roundup(jiffy_end, throtl_slice);
- throtl_log_tg(td, tg, "[%c] extend slice start=%lu end=%lu jiffies=%lu",
- rw == READ ? 'R' : 'W', tg->slice_start[rw],
- tg->slice_end[rw], jiffies);
+ throtl_log(&tg->service_queue,
+ "[%c] extend slice start=%lu end=%lu jiffies=%lu",
+ rw == READ ? 'R' : 'W', tg->slice_start[rw],
+ tg->slice_end[rw], jiffies);
}
/* Determine if previously allocated or extended slice is complete or not */
-static bool
-throtl_slice_used(struct throtl_data *td, struct throtl_grp *tg, bool rw)
+static bool throtl_slice_used(struct throtl_grp *tg, bool rw)
{
if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw]))
return 0;
@@ -462,8 +750,7 @@ throtl_slice_used(struct throtl_data *td, struct throtl_grp *tg, bool rw)
}
/* Trim the used slices and adjust slice start accordingly */
-static inline void
-throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
+static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw)
{
unsigned long nr_slices, time_elapsed, io_trim;
u64 bytes_trim, tmp;
@@ -475,7 +762,7 @@ throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
* renewed. Don't try to trim the slice if slice is used. A new
* slice will start when appropriate.
*/
- if (throtl_slice_used(td, tg, rw))
+ if (throtl_slice_used(tg, rw))
return;
/*
@@ -486,7 +773,7 @@ throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
* is bad because it does not allow new slice to start.
*/
- throtl_set_slice_end(td, tg, rw, jiffies + throtl_slice);
+ throtl_set_slice_end(tg, rw, jiffies + throtl_slice);
time_elapsed = jiffies - tg->slice_start[rw];
@@ -515,14 +802,14 @@ throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
tg->slice_start[rw] += nr_slices * throtl_slice;
- throtl_log_tg(td, tg, "[%c] trim slice nr=%lu bytes=%llu io=%lu"
- " start=%lu end=%lu jiffies=%lu",
- rw == READ ? 'R' : 'W', nr_slices, bytes_trim, io_trim,
- tg->slice_start[rw], tg->slice_end[rw], jiffies);
+ throtl_log(&tg->service_queue,
+ "[%c] trim slice nr=%lu bytes=%llu io=%lu start=%lu end=%lu jiffies=%lu",
+ rw == READ ? 'R' : 'W', nr_slices, bytes_trim, io_trim,
+ tg->slice_start[rw], tg->slice_end[rw], jiffies);
}
-static bool tg_with_in_iops_limit(struct throtl_data *td, struct throtl_grp *tg,
- struct bio *bio, unsigned long *wait)
+static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio,
+ unsigned long *wait)
{
bool rw = bio_data_dir(bio);
unsigned int io_allowed;
@@ -571,8 +858,8 @@ static bool tg_with_in_iops_limit(struct throtl_data *td, struct throtl_grp *tg,
return 0;
}
-static bool tg_with_in_bps_limit(struct throtl_data *td, struct throtl_grp *tg,
- struct bio *bio, unsigned long *wait)
+static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio,
+ unsigned long *wait)
{
bool rw = bio_data_dir(bio);
u64 bytes_allowed, extra_bytes, tmp;
@@ -613,18 +900,12 @@ static bool tg_with_in_bps_limit(struct throtl_data *td, struct throtl_grp *tg,
return 0;
}
-static bool tg_no_rule_group(struct throtl_grp *tg, bool rw) {
- if (tg->bps[rw] == -1 && tg->iops[rw] == -1)
- return 1;
- return 0;
-}
-
/*
* Returns whether one can dispatch a bio or not. Also returns approx number
* of jiffies to wait before this bio is with-in IO rate and can be dispatched
*/
-static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg,
- struct bio *bio, unsigned long *wait)
+static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
+ unsigned long *wait)
{
bool rw = bio_data_dir(bio);
unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0;
@@ -635,7 +916,8 @@ static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg,
* this function with a different bio if there are other bios
* queued.
*/
- BUG_ON(tg->nr_queued[rw] && bio != bio_list_peek(&tg->bio_lists[rw]));
+ BUG_ON(tg->service_queue.nr_queued[rw] &&
+ bio != throtl_peek_queued(&tg->service_queue.queued[rw]));
/* If tg->bps = -1, then BW is unlimited */
if (tg->bps[rw] == -1 && tg->iops[rw] == -1) {
@@ -649,15 +931,15 @@ static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg,
* existing slice to make sure it is at least throtl_slice interval
* long since now.
*/
- if (throtl_slice_used(td, tg, rw))
- throtl_start_new_slice(td, tg, rw);
+ if (throtl_slice_used(tg, rw))
+ throtl_start_new_slice(tg, rw);
else {
if (time_before(tg->slice_end[rw], jiffies + throtl_slice))
- throtl_extend_slice(td, tg, rw, jiffies + throtl_slice);
+ throtl_extend_slice(tg, rw, jiffies + throtl_slice);
}
- if (tg_with_in_bps_limit(td, tg, bio, &bps_wait)
- && tg_with_in_iops_limit(td, tg, bio, &iops_wait)) {
+ if (tg_with_in_bps_limit(tg, bio, &bps_wait) &&
+ tg_with_in_iops_limit(tg, bio, &iops_wait)) {
if (wait)
*wait = 0;
return 1;
@@ -669,7 +951,7 @@ static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg,
*wait = max_wait;
if (time_before(tg->slice_end[rw], jiffies + max_wait))
- throtl_extend_slice(td, tg, rw, jiffies + max_wait);
+ throtl_extend_slice(tg, rw, jiffies + max_wait);
return 0;
}
@@ -708,65 +990,136 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
tg->bytes_disp[rw] += bio->bi_size;
tg->io_disp[rw]++;
- throtl_update_dispatch_stats(tg_to_blkg(tg), bio->bi_size, bio->bi_rw);
+ /*
+ * REQ_THROTTLED is used to prevent the same bio to be throttled
+ * more than once as a throttled bio will go through blk-throtl the
+ * second time when it eventually gets issued. Set it when a bio
+ * is being charged to a tg.
+ *
+ * Dispatch stats aren't recursive and each @bio should only be
+ * accounted by the @tg it was originally associated with. Let's
+ * update the stats when setting REQ_THROTTLED for the first time
+ * which is guaranteed to be for the @bio's original tg.
+ */
+ if (!(bio->bi_rw & REQ_THROTTLED)) {
+ bio->bi_rw |= REQ_THROTTLED;
+ throtl_update_dispatch_stats(tg_to_blkg(tg), bio->bi_size,
+ bio->bi_rw);
+ }
}
-static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg,
- struct bio *bio)
+/**
+ * throtl_add_bio_tg - add a bio to the specified throtl_grp
+ * @bio: bio to add
+ * @qn: qnode to use
+ * @tg: the target throtl_grp
+ *
+ * Add @bio to @tg's service_queue using @qn. If @qn is not specified,
+ * tg->qnode_on_self[] is used.
+ */
+static void throtl_add_bio_tg(struct bio *bio, struct throtl_qnode *qn,
+ struct throtl_grp *tg)
{
+ struct throtl_service_queue *sq = &tg->service_queue;
bool rw = bio_data_dir(bio);
- bio_list_add(&tg->bio_lists[rw], bio);
- /* Take a bio reference on tg */
- blkg_get(tg_to_blkg(tg));
- tg->nr_queued[rw]++;
- td->nr_queued[rw]++;
- throtl_enqueue_tg(td, tg);
+ if (!qn)
+ qn = &tg->qnode_on_self[rw];
+
+ /*
+ * If @tg doesn't currently have any bios queued in the same
+ * direction, queueing @bio can change when @tg should be
+ * dispatched. Mark that @tg was empty. This is automatically
+ * cleared on the next tg_update_disptime().
+ */
+ if (!sq->nr_queued[rw])
+ tg->flags |= THROTL_TG_WAS_EMPTY;
+
+ throtl_qnode_add_bio(bio, qn, &sq->queued[rw]);
+
+ sq->nr_queued[rw]++;
+ throtl_enqueue_tg(tg);
}
-static void tg_update_disptime(struct throtl_data *td, struct throtl_grp *tg)
+static void tg_update_disptime(struct throtl_grp *tg)
{
+ struct throtl_service_queue *sq = &tg->service_queue;
unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime;
struct bio *bio;
- if ((bio = bio_list_peek(&tg->bio_lists[READ])))
- tg_may_dispatch(td, tg, bio, &read_wait);
+ if ((bio = throtl_peek_queued(&sq->queued[READ])))
+ tg_may_dispatch(tg, bio, &read_wait);
- if ((bio = bio_list_peek(&tg->bio_lists[WRITE])))
- tg_may_dispatch(td, tg, bio, &write_wait);
+ if ((bio = throtl_peek_queued(&sq->queued[WRITE])))
+ tg_may_dispatch(tg, bio, &write_wait);
min_wait = min(read_wait, write_wait);
disptime = jiffies + min_wait;
/* Update dispatch time */
- throtl_dequeue_tg(td, tg);
+ throtl_dequeue_tg(tg);
tg->disptime = disptime;
- throtl_enqueue_tg(td, tg);
+ throtl_enqueue_tg(tg);
+
+ /* see throtl_add_bio_tg() */
+ tg->flags &= ~THROTL_TG_WAS_EMPTY;
}
-static void tg_dispatch_one_bio(struct throtl_data *td, struct throtl_grp *tg,
- bool rw, struct bio_list *bl)
+static void start_parent_slice_with_credit(struct throtl_grp *child_tg,
+ struct throtl_grp *parent_tg, bool rw)
{
- struct bio *bio;
+ if (throtl_slice_used(parent_tg, rw)) {
+ throtl_start_new_slice_with_credit(parent_tg, rw,
+ child_tg->slice_start[rw]);
+ }
- bio = bio_list_pop(&tg->bio_lists[rw]);
- tg->nr_queued[rw]--;
- /* Drop bio reference on blkg */
- blkg_put(tg_to_blkg(tg));
+}
+
+static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw)
+{
+ struct throtl_service_queue *sq = &tg->service_queue;
+ struct throtl_service_queue *parent_sq = sq->parent_sq;
+ struct throtl_grp *parent_tg = sq_to_tg(parent_sq);
+ struct throtl_grp *tg_to_put = NULL;
+ struct bio *bio;
- BUG_ON(td->nr_queued[rw] <= 0);
- td->nr_queued[rw]--;
+ /*
+ * @bio is being transferred from @tg to @parent_sq. Popping a bio
+ * from @tg may put its reference and @parent_sq might end up
+ * getting released prematurely. Remember the tg to put and put it
+ * after @bio is transferred to @parent_sq.
+ */
+ bio = throtl_pop_queued(&sq->queued[rw], &tg_to_put);
+ sq->nr_queued[rw]--;
throtl_charge_bio(tg, bio);
- bio_list_add(bl, bio);
- bio->bi_rw |= REQ_THROTTLED;
- throtl_trim_slice(td, tg, rw);
+ /*
+ * If our parent is another tg, we just need to transfer @bio to
+ * the parent using throtl_add_bio_tg(). If our parent is
+ * @td->service_queue, @bio is ready to be issued. Put it on its
+ * queued[] and decrease total number queued. The caller is
+ * responsible for issuing these bios.
+ */
+ if (parent_tg) {
+ throtl_add_bio_tg(bio, &tg->qnode_on_parent[rw], parent_tg);
+ start_parent_slice_with_credit(tg, parent_tg, rw);
+ } else {
+ throtl_qnode_add_bio(bio, &tg->qnode_on_parent[rw],
+ &parent_sq->queued[rw]);
+ BUG_ON(tg->td->nr_queued[rw] <= 0);
+ tg->td->nr_queued[rw]--;
+ }
+
+ throtl_trim_slice(tg, rw);
+
+ if (tg_to_put)
+ blkg_put(tg_to_blkg(tg_to_put));
}
-static int throtl_dispatch_tg(struct throtl_data *td, struct throtl_grp *tg,
- struct bio_list *bl)
+static int throtl_dispatch_tg(struct throtl_grp *tg)
{
+ struct throtl_service_queue *sq = &tg->service_queue;
unsigned int nr_reads = 0, nr_writes = 0;
unsigned int max_nr_reads = throtl_grp_quantum*3/4;
unsigned int max_nr_writes = throtl_grp_quantum - max_nr_reads;
@@ -774,20 +1127,20 @@ static int throtl_dispatch_tg(struct throtl_data *td, struct throtl_grp *tg,
/* Try to dispatch 75% READS and 25% WRITES */
- while ((bio = bio_list_peek(&tg->bio_lists[READ]))
- && tg_may_dispatch(td, tg, bio, NULL)) {
+ while ((bio = throtl_peek_queued(&sq->queued[READ])) &&
+ tg_may_dispatch(tg, bio, NULL)) {
- tg_dispatch_one_bio(td, tg, bio_data_dir(bio), bl);
+ tg_dispatch_one_bio(tg, bio_data_dir(bio));
nr_reads++;
if (nr_reads >= max_nr_reads)
break;
}
- while ((bio = bio_list_peek(&tg->bio_lists[WRITE]))
- && tg_may_dispatch(td, tg, bio, NULL)) {
+ while ((bio = throtl_peek_queued(&sq->queued[WRITE])) &&
+ tg_may_dispatch(tg, bio, NULL)) {
- tg_dispatch_one_bio(td, tg, bio_data_dir(bio), bl);
+ tg_dispatch_one_bio(tg, bio_data_dir(bio));
nr_writes++;
if (nr_writes >= max_nr_writes)
@@ -797,14 +1150,13 @@ static int throtl_dispatch_tg(struct throtl_data *td, struct throtl_grp *tg,
return nr_reads + nr_writes;
}
-static int throtl_select_dispatch(struct throtl_data *td, struct bio_list *bl)
+static int throtl_select_dispatch(struct throtl_service_queue *parent_sq)
{
unsigned int nr_disp = 0;
- struct throtl_grp *tg;
- struct throtl_rb_root *st = &td->tg_service_tree;
while (1) {
- tg = throtl_rb_first(st);
+ struct throtl_grp *tg = throtl_rb_first(parent_sq);
+ struct throtl_service_queue *sq = &tg->service_queue;
if (!tg)
break;
@@ -812,14 +1164,12 @@ static int throtl_select_dispatch(struct throtl_data *td, struct bio_list *bl)
if (time_before(jiffies, tg->disptime))
break;
- throtl_dequeue_tg(td, tg);
+ throtl_dequeue_tg(tg);
- nr_disp += throtl_dispatch_tg(td, tg, bl);
+ nr_disp += throtl_dispatch_tg(tg);
- if (tg->nr_queued[0] || tg->nr_queued[1]) {
- tg_update_disptime(td, tg);
- throtl_enqueue_tg(td, tg);
- }
+ if (sq->nr_queued[0] || sq->nr_queued[1])
+ tg_update_disptime(tg);
if (nr_disp >= throtl_quantum)
break;
@@ -828,111 +1178,111 @@ static int throtl_select_dispatch(struct throtl_data *td, struct bio_list *bl)
return nr_disp;
}
-static void throtl_process_limit_change(struct throtl_data *td)
+/**
+ * throtl_pending_timer_fn - timer function for service_queue->pending_timer
+ * @arg: the throtl_service_queue being serviced
+ *
+ * This timer is armed when a child throtl_grp with active bio's becomes
+ * pending and queued on the service_queue's pending_tree and expires when
+ * the first child throtl_grp should be dispatched. This function
+ * dispatches bio's from the children throtl_grps to the parent
+ * service_queue.
+ *
+ * If the parent's parent is another throtl_grp, dispatching is propagated
+ * by either arming its pending_timer or repeating dispatch directly. If
+ * the top-level service_tree is reached, throtl_data->dispatch_work is
+ * kicked so that the ready bio's are issued.
+ */
+static void throtl_pending_timer_fn(unsigned long arg)
{
+ struct throtl_service_queue *sq = (void *)arg;
+ struct throtl_grp *tg = sq_to_tg(sq);
+ struct throtl_data *td = sq_to_td(sq);
struct request_queue *q = td->queue;
- struct blkcg_gq *blkg, *n;
-
- if (!td->limits_changed)
- return;
-
- xchg(&td->limits_changed, false);
-
- throtl_log(td, "limits changed");
-
- list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) {
- struct throtl_grp *tg = blkg_to_tg(blkg);
+ struct throtl_service_queue *parent_sq;
+ bool dispatched;
+ int ret;
- if (!tg->limits_changed)
- continue;
+ spin_lock_irq(q->queue_lock);
+again:
+ parent_sq = sq->parent_sq;
+ dispatched = false;
+
+ while (true) {
+ throtl_log(sq, "dispatch nr_queued=%u read=%u write=%u",
+ sq->nr_queued[READ] + sq->nr_queued[WRITE],
+ sq->nr_queued[READ], sq->nr_queued[WRITE]);
+
+ ret = throtl_select_dispatch(sq);
+ if (ret) {
+ throtl_log(sq, "bios disp=%u", ret);
+ dispatched = true;
+ }
- if (!xchg(&tg->limits_changed, false))
- continue;
+ if (throtl_schedule_next_dispatch(sq, false))
+ break;
- throtl_log_tg(td, tg, "limit change rbps=%llu wbps=%llu"
- " riops=%u wiops=%u", tg->bps[READ], tg->bps[WRITE],
- tg->iops[READ], tg->iops[WRITE]);
+ /* this dispatch window is still open, relax and repeat */
+ spin_unlock_irq(q->queue_lock);
+ cpu_relax();
+ spin_lock_irq(q->queue_lock);
+ }
- /*
- * Restart the slices for both READ and WRITES. It
- * might happen that a group's limit are dropped
- * suddenly and we don't want to account recently
- * dispatched IO with new low rate
- */
- throtl_start_new_slice(td, tg, 0);
- throtl_start_new_slice(td, tg, 1);
+ if (!dispatched)
+ goto out_unlock;
- if (throtl_tg_on_rr(tg))
- tg_update_disptime(td, tg);
+ if (parent_sq) {
+ /* @parent_sq is another throtl_grp, propagate dispatch */
+ if (tg->flags & THROTL_TG_WAS_EMPTY) {
+ tg_update_disptime(tg);
+ if (!throtl_schedule_next_dispatch(parent_sq, false)) {
+ /* window is already open, repeat dispatching */
+ sq = parent_sq;
+ tg = sq_to_tg(sq);
+ goto again;
+ }
+ }
+ } else {
+ /* reached the top-level, queue issuing */
+ queue_work(kthrotld_workqueue, &td->dispatch_work);
}
+out_unlock:
+ spin_unlock_irq(q->queue_lock);
}
-/* Dispatch throttled bios. Should be called without queue lock held. */
-static int throtl_dispatch(struct request_queue *q)
+/**
+ * blk_throtl_dispatch_work_fn - work function for throtl_data->dispatch_work
+ * @work: work item being executed
+ *
+ * This function is queued for execution when bio's reach the queued[] lists
+ * of throtl_data->service_queue. Those bio's are ready and issued by this
+ * function.
+ */
+void blk_throtl_dispatch_work_fn(struct work_struct *work)
{
- struct throtl_data *td = q->td;
- unsigned int nr_disp = 0;
+ struct throtl_data *td = container_of(work, struct throtl_data,
+ dispatch_work);
+ struct throtl_service_queue *td_sq = &td->service_queue;
+ struct request_queue *q = td->queue;
struct bio_list bio_list_on_stack;
struct bio *bio;
struct blk_plug plug;
-
- spin_lock_irq(q->queue_lock);
-
- throtl_process_limit_change(td);
-
- if (!total_nr_queued(td))
- goto out;
+ int rw;
bio_list_init(&bio_list_on_stack);
- throtl_log(td, "dispatch nr_queued=%u read=%u write=%u",
- total_nr_queued(td), td->nr_queued[READ],
- td->nr_queued[WRITE]);
-
- nr_disp = throtl_select_dispatch(td, &bio_list_on_stack);
-
- if (nr_disp)
- throtl_log(td, "bios disp=%u", nr_disp);
-
- throtl_schedule_next_dispatch(td);
-out:
+ spin_lock_irq(q->queue_lock);
+ for (rw = READ; rw <= WRITE; rw++)
+ while ((bio = throtl_pop_queued(&td_sq->queued[rw], NULL)))
+ bio_list_add(&bio_list_on_stack, bio);
spin_unlock_irq(q->queue_lock);
- /*
- * If we dispatched some requests, unplug the queue to make sure
- * immediate dispatch
- */
- if (nr_disp) {
+ if (!bio_list_empty(&bio_list_on_stack)) {
blk_start_plug(&plug);
while((bio = bio_list_pop(&bio_list_on_stack)))
generic_make_request(bio);
blk_finish_plug(&plug);
}
- return nr_disp;
-}
-
-void blk_throtl_work(struct work_struct *work)
-{
- struct throtl_data *td = container_of(work, struct throtl_data,
- throtl_work.work);
- struct request_queue *q = td->queue;
-
- throtl_dispatch(q);
-}
-
-/* Call with queue lock held */
-static void
-throtl_schedule_delayed_work(struct throtl_data *td, unsigned long delay)
-{
-
- struct delayed_work *dwork = &td->throtl_work;
-
- /* schedule work if limits changed even if no bio is queued */
- if (total_nr_queued(td) || td->limits_changed) {
- mod_delayed_work(kthrotld_workqueue, dwork, delay);
- throtl_log(td, "schedule work. delay=%lu jiffies=%lu",
- delay, jiffies);
- }
}
static u64 tg_prfill_cpu_rwstat(struct seq_file *sf,
@@ -953,10 +1303,10 @@ static u64 tg_prfill_cpu_rwstat(struct seq_file *sf,
return __blkg_prfill_rwstat(sf, pd, &rwstat);
}
-static int tg_print_cpu_rwstat(struct cgroup *cgrp, struct cftype *cft,
- struct seq_file *sf)
+static int tg_print_cpu_rwstat(struct cgroup_subsys_state *css,
+ struct cftype *cft, struct seq_file *sf)
{
- struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
+ struct blkcg *blkcg = css_to_blkcg(css);
blkcg_print_blkgs(sf, blkcg, tg_prfill_cpu_rwstat, &blkcg_policy_throtl,
cft->private, true);
@@ -985,29 +1335,31 @@ static u64 tg_prfill_conf_uint(struct seq_file *sf, struct blkg_policy_data *pd,
return __blkg_prfill_u64(sf, pd, v);
}
-static int tg_print_conf_u64(struct cgroup *cgrp, struct cftype *cft,
- struct seq_file *sf)
+static int tg_print_conf_u64(struct cgroup_subsys_state *css,
+ struct cftype *cft, struct seq_file *sf)
{
- blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp), tg_prfill_conf_u64,
+ blkcg_print_blkgs(sf, css_to_blkcg(css), tg_prfill_conf_u64,
&blkcg_policy_throtl, cft->private, false);
return 0;
}
-static int tg_print_conf_uint(struct cgroup *cgrp, struct cftype *cft,
- struct seq_file *sf)
+static int tg_print_conf_uint(struct cgroup_subsys_state *css,
+ struct cftype *cft, struct seq_file *sf)
{
- blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp), tg_prfill_conf_uint,
+ blkcg_print_blkgs(sf, css_to_blkcg(css), tg_prfill_conf_uint,
&blkcg_policy_throtl, cft->private, false);
return 0;
}
-static int tg_set_conf(struct cgroup *cgrp, struct cftype *cft, const char *buf,
- bool is_u64)
+static int tg_set_conf(struct cgroup_subsys_state *css, struct cftype *cft,
+ const char *buf, bool is_u64)
{
- struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
+ struct blkcg *blkcg = css_to_blkcg(css);
struct blkg_conf_ctx ctx;
struct throtl_grp *tg;
- struct throtl_data *td;
+ struct throtl_service_queue *sq;
+ struct blkcg_gq *blkg;
+ struct cgroup_subsys_state *pos_css;
int ret;
ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx);
@@ -1015,7 +1367,7 @@ static int tg_set_conf(struct cgroup *cgrp, struct cftype *cft, const char *buf,
return ret;
tg = blkg_to_tg(ctx.blkg);
- td = ctx.blkg->q->td;
+ sq = &tg->service_queue;
if (!ctx.v)
ctx.v = -1;
@@ -1025,25 +1377,51 @@ static int tg_set_conf(struct cgroup *cgrp, struct cftype *cft, const char *buf,
else
*(unsigned int *)((void *)tg + cft->private) = ctx.v;
- /* XXX: we don't need the following deferred processing */
- xchg(&tg->limits_changed, true);
- xchg(&td->limits_changed, true);
- throtl_schedule_delayed_work(td, 0);
+ throtl_log(&tg->service_queue,
+ "limit change rbps=%llu wbps=%llu riops=%u wiops=%u",
+ tg->bps[READ], tg->bps[WRITE],
+ tg->iops[READ], tg->iops[WRITE]);
+
+ /*
+ * Update has_rules[] flags for the updated tg's subtree. A tg is
+ * considered to have rules if either the tg itself or any of its
+ * ancestors has rules. This identifies groups without any
+ * restrictions in the whole hierarchy and allows them to bypass
+ * blk-throttle.
+ */
+ blkg_for_each_descendant_pre(blkg, pos_css, ctx.blkg)
+ tg_update_has_rules(blkg_to_tg(blkg));
+
+ /*
+ * We're already holding queue_lock and know @tg is valid. Let's
+ * apply the new config directly.
+ *
+ * Restart the slices for both READ and WRITES. It might happen
+ * that a group's limit are dropped suddenly and we don't want to
+ * account recently dispatched IO with new low rate.
+ */
+ throtl_start_new_slice(tg, 0);
+ throtl_start_new_slice(tg, 1);
+
+ if (tg->flags & THROTL_TG_PENDING) {
+ tg_update_disptime(tg);
+ throtl_schedule_next_dispatch(sq->parent_sq, true);
+ }
blkg_conf_finish(&ctx);
return 0;
}
-static int tg_set_conf_u64(struct cgroup *cgrp, struct cftype *cft,
+static int tg_set_conf_u64(struct cgroup_subsys_state *css, struct cftype *cft,
const char *buf)
{
- return tg_set_conf(cgrp, cft, buf, true);
+ return tg_set_conf(css, cft, buf, true);
}
-static int tg_set_conf_uint(struct cgroup *cgrp, struct cftype *cft,
+static int tg_set_conf_uint(struct cgroup_subsys_state *css, struct cftype *cft,
const char *buf)
{
- return tg_set_conf(cgrp, cft, buf, false);
+ return tg_set_conf(css, cft, buf, false);
}
static struct cftype throtl_files[] = {
@@ -1092,7 +1470,7 @@ static void throtl_shutdown_wq(struct request_queue *q)
{
struct throtl_data *td = q->td;
- cancel_delayed_work_sync(&td->throtl_work);
+ cancel_work_sync(&td->dispatch_work);
}
static struct blkcg_policy blkcg_policy_throtl = {
@@ -1100,6 +1478,7 @@ static struct blkcg_policy blkcg_policy_throtl = {
.cftypes = throtl_files,
.pd_init_fn = throtl_pd_init,
+ .pd_online_fn = throtl_pd_online,
.pd_exit_fn = throtl_pd_exit,
.pd_reset_stats_fn = throtl_pd_reset_stats,
};
@@ -1107,15 +1486,16 @@ static struct blkcg_policy blkcg_policy_throtl = {
bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
{
struct throtl_data *td = q->td;
+ struct throtl_qnode *qn = NULL;
struct throtl_grp *tg;
- bool rw = bio_data_dir(bio), update_disptime = true;
+ struct throtl_service_queue *sq;
+ bool rw = bio_data_dir(bio);
struct blkcg *blkcg;
bool throttled = false;
- if (bio->bi_rw & REQ_THROTTLED) {
- bio->bi_rw &= ~REQ_THROTTLED;
+ /* see throtl_charge_bio() */
+ if (bio->bi_rw & REQ_THROTTLED)
goto out;
- }
/*
* A throtl_grp pointer retrieved under rcu can be used to access
@@ -1126,7 +1506,7 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
blkcg = bio_blkcg(bio);
tg = throtl_lookup_tg(td, blkcg);
if (tg) {
- if (tg_no_rule_group(tg, rw)) {
+ if (!tg->has_rules[rw]) {
throtl_update_dispatch_stats(tg_to_blkg(tg),
bio->bi_size, bio->bi_rw);
goto out_unlock_rcu;
@@ -1142,18 +1522,18 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
if (unlikely(!tg))
goto out_unlock;
- if (tg->nr_queued[rw]) {
- /*
- * There is already another bio queued in same dir. No
- * need to update dispatch time.
- */
- update_disptime = false;
- goto queue_bio;
+ sq = &tg->service_queue;
- }
+ while (true) {
+ /* throtl is FIFO - if bios are already queued, should queue */
+ if (sq->nr_queued[rw])
+ break;
+
+ /* if above limits, break to queue */
+ if (!tg_may_dispatch(tg, bio, NULL))
+ break;
- /* Bio is with-in rate limit of group */
- if (tg_may_dispatch(td, tg, bio, NULL)) {
+ /* within limits, let's charge and dispatch directly */
throtl_charge_bio(tg, bio);
/*
@@ -1167,25 +1547,41 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
*
* So keep on trimming slice even if bio is not queued.
*/
- throtl_trim_slice(td, tg, rw);
- goto out_unlock;
+ throtl_trim_slice(tg, rw);
+
+ /*
+ * @bio passed through this layer without being throttled.
+ * Climb up the ladder. If we're already at the top, it
+ * can be executed directly.
+ */
+ qn = &tg->qnode_on_parent[rw];
+ sq = sq->parent_sq;
+ tg = sq_to_tg(sq);
+ if (!tg)
+ goto out_unlock;
}
-queue_bio:
- throtl_log_tg(td, tg, "[%c] bio. bdisp=%llu sz=%u bps=%llu"
- " iodisp=%u iops=%u queued=%d/%d",
- rw == READ ? 'R' : 'W',
- tg->bytes_disp[rw], bio->bi_size, tg->bps[rw],
- tg->io_disp[rw], tg->iops[rw],
- tg->nr_queued[READ], tg->nr_queued[WRITE]);
+ /* out-of-limit, queue to @tg */
+ throtl_log(sq, "[%c] bio. bdisp=%llu sz=%u bps=%llu iodisp=%u iops=%u queued=%d/%d",
+ rw == READ ? 'R' : 'W',
+ tg->bytes_disp[rw], bio->bi_size, tg->bps[rw],
+ tg->io_disp[rw], tg->iops[rw],
+ sq->nr_queued[READ], sq->nr_queued[WRITE]);
bio_associate_current(bio);
- throtl_add_bio_tg(q->td, tg, bio);
+ tg->td->nr_queued[rw]++;
+ throtl_add_bio_tg(bio, qn, tg);
throttled = true;
- if (update_disptime) {
- tg_update_disptime(td, tg);
- throtl_schedule_next_dispatch(td);
+ /*
+ * Update @tg's dispatch time and force schedule dispatch if @tg
+ * was empty before @bio. The forced scheduling isn't likely to
+ * cause undue delay as @bio is likely to be dispatched directly if
+ * its @tg's disptime is not in the future.
+ */
+ if (tg->flags & THROTL_TG_WAS_EMPTY) {
+ tg_update_disptime(tg);
+ throtl_schedule_next_dispatch(tg->service_queue.parent_sq, true);
}
out_unlock:
@@ -1193,9 +1589,38 @@ out_unlock:
out_unlock_rcu:
rcu_read_unlock();
out:
+ /*
+ * As multiple blk-throtls may stack in the same issue path, we
+ * don't want bios to leave with the flag set. Clear the flag if
+ * being issued.
+ */
+ if (!throttled)
+ bio->bi_rw &= ~REQ_THROTTLED;
return throttled;
}
+/*
+ * Dispatch all bios from all children tg's queued on @parent_sq. On
+ * return, @parent_sq is guaranteed to not have any active children tg's
+ * and all bios from previously active tg's are on @parent_sq->bio_lists[].
+ */
+static void tg_drain_bios(struct throtl_service_queue *parent_sq)
+{
+ struct throtl_grp *tg;
+
+ while ((tg = throtl_rb_first(parent_sq))) {
+ struct throtl_service_queue *sq = &tg->service_queue;
+ struct bio *bio;
+
+ throtl_dequeue_tg(tg);
+
+ while ((bio = throtl_peek_queued(&sq->queued[READ])))
+ tg_dispatch_one_bio(tg, bio_data_dir(bio));
+ while ((bio = throtl_peek_queued(&sq->queued[WRITE])))
+ tg_dispatch_one_bio(tg, bio_data_dir(bio));
+ }
+}
+
/**
* blk_throtl_drain - drain throttled bios
* @q: request_queue to drain throttled bios for
@@ -1206,27 +1631,34 @@ void blk_throtl_drain(struct request_queue *q)
__releases(q->queue_lock) __acquires(q->queue_lock)
{
struct throtl_data *td = q->td;
- struct throtl_rb_root *st = &td->tg_service_tree;
- struct throtl_grp *tg;
- struct bio_list bl;
+ struct blkcg_gq *blkg;
+ struct cgroup_subsys_state *pos_css;
struct bio *bio;
+ int rw;
queue_lockdep_assert_held(q);
+ rcu_read_lock();
- bio_list_init(&bl);
+ /*
+ * Drain each tg while doing post-order walk on the blkg tree, so
+ * that all bios are propagated to td->service_queue. It'd be
+ * better to walk service_queue tree directly but blkg walk is
+ * easier.
+ */
+ blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg)
+ tg_drain_bios(&blkg_to_tg(blkg)->service_queue);
- while ((tg = throtl_rb_first(st))) {
- throtl_dequeue_tg(td, tg);
+ /* finally, transfer bios from top-level tg's into the td */
+ tg_drain_bios(&td->service_queue);
- while ((bio = bio_list_peek(&tg->bio_lists[READ])))
- tg_dispatch_one_bio(td, tg, bio_data_dir(bio), &bl);
- while ((bio = bio_list_peek(&tg->bio_lists[WRITE])))
- tg_dispatch_one_bio(td, tg, bio_data_dir(bio), &bl);
- }
+ rcu_read_unlock();
spin_unlock_irq(q->queue_lock);
- while ((bio = bio_list_pop(&bl)))
- generic_make_request(bio);
+ /* all bios now should be in td->service_queue, issue them */
+ for (rw = READ; rw <= WRITE; rw++)
+ while ((bio = throtl_pop_queued(&td->service_queue.queued[rw],
+ NULL)))
+ generic_make_request(bio);
spin_lock_irq(q->queue_lock);
}
@@ -1240,9 +1672,8 @@ int blk_throtl_init(struct request_queue *q)
if (!td)
return -ENOMEM;
- td->tg_service_tree = THROTL_RB_ROOT;
- td->limits_changed = false;
- INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work);
+ INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn);
+ throtl_service_queue_init(&td->service_queue, NULL);
q->td = td;
td->queue = q;
diff --git a/block/blk-timeout.c b/block/blk-timeout.c
index 6e4744c..bba81c9 100644
--- a/block/blk-timeout.c
+++ b/block/blk-timeout.c
@@ -7,6 +7,7 @@
#include <linux/fault-inject.h>
#include "blk.h"
+#include "blk-mq.h"
#ifdef CONFIG_FAIL_IO_TIMEOUT
@@ -31,7 +32,7 @@ static int __init fail_io_timeout_debugfs(void)
struct dentry *dir = fault_create_debugfs_attr("fail_io_timeout",
NULL, &fail_io_timeout);
- return IS_ERR(dir) ? PTR_ERR(dir) : 0;
+ return PTR_ERR_OR_ZERO(dir);
}
late_initcall(fail_io_timeout_debugfs);
@@ -82,16 +83,25 @@ void blk_delete_timer(struct request *req)
static void blk_rq_timed_out(struct request *req)
{
struct request_queue *q = req->q;
- enum blk_eh_timer_return ret;
+ enum blk_eh_timer_return ret = BLK_EH_RESET_TIMER;
- ret = q->rq_timed_out_fn(req);
+ if (q->rq_timed_out_fn)
+ ret = q->rq_timed_out_fn(req);
switch (ret) {
case BLK_EH_HANDLED:
- __blk_complete_request(req);
+ /* Can we use req->errors here? */
+ if (q->mq_ops)
+ blk_mq_complete_request(req, req->errors);
+ else
+ __blk_complete_request(req);
break;
case BLK_EH_RESET_TIMER:
+ if (q->mq_ops)
+ blk_mq_add_timer(req);
+ else
+ blk_add_timer(req);
+
blk_clear_rq_complete(req);
- blk_add_timer(req);
break;
case BLK_EH_NOT_HANDLED:
/*
@@ -107,6 +117,23 @@ static void blk_rq_timed_out(struct request *req)
}
}
+void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout,
+ unsigned int *next_set)
+{
+ if (time_after_eq(jiffies, rq->deadline)) {
+ list_del_init(&rq->timeout_list);
+
+ /*
+ * Check if we raced with end io completion
+ */
+ if (!blk_mark_rq_complete(rq))
+ blk_rq_timed_out(rq);
+ } else if (!*next_set || time_after(*next_timeout, rq->deadline)) {
+ *next_timeout = rq->deadline;
+ *next_set = 1;
+ }
+}
+
void blk_rq_timed_out_timer(unsigned long data)
{
struct request_queue *q = (struct request_queue *) data;
@@ -116,21 +143,8 @@ void blk_rq_timed_out_timer(unsigned long data)
spin_lock_irqsave(q->queue_lock, flags);
- list_for_each_entry_safe(rq, tmp, &q->timeout_list, timeout_list) {
- if (time_after_eq(jiffies, rq->deadline)) {
- list_del_init(&rq->timeout_list);
-
- /*
- * Check if we raced with end io completion
- */
- if (blk_mark_rq_complete(rq))
- continue;
- blk_rq_timed_out(rq);
- } else if (!next_set || time_after(next, rq->deadline)) {
- next = rq->deadline;
- next_set = 1;
- }
- }
+ list_for_each_entry_safe(rq, tmp, &q->timeout_list, timeout_list)
+ blk_rq_check_expired(rq, &next, &next_set);
if (next_set)
mod_timer(&q->timeout, round_jiffies_up(next));
@@ -156,15 +170,7 @@ void blk_abort_request(struct request *req)
}
EXPORT_SYMBOL_GPL(blk_abort_request);
-/**
- * blk_add_timer - Start timeout timer for a single request
- * @req: request that is about to start running.
- *
- * Notes:
- * Each request has its own timer, and as it is added to the queue, we
- * set up the timer. When the request completes, we cancel the timer.
- */
-void blk_add_timer(struct request *req)
+void __blk_add_timer(struct request *req, struct list_head *timeout_list)
{
struct request_queue *q = req->q;
unsigned long expiry;
@@ -173,7 +179,6 @@ void blk_add_timer(struct request *req)
return;
BUG_ON(!list_empty(&req->timeout_list));
- BUG_ON(test_bit(REQ_ATOM_COMPLETE, &req->atomic_flags));
/*
* Some LLDs, like scsi, peek at the timeout to prevent a
@@ -183,7 +188,8 @@ void blk_add_timer(struct request *req)
req->timeout = q->rq_timeout;
req->deadline = jiffies + req->timeout;
- list_add_tail(&req->timeout_list, &q->timeout_list);
+ if (timeout_list)
+ list_add_tail(&req->timeout_list, timeout_list);
/*
* If the timer isn't already pending or this timeout is earlier
@@ -195,5 +201,19 @@ void blk_add_timer(struct request *req)
if (!timer_pending(&q->timeout) ||
time_before(expiry, q->timeout.expires))
mod_timer(&q->timeout, expiry);
+
+}
+
+/**
+ * blk_add_timer - Start timeout timer for a single request
+ * @req: request that is about to start running.
+ *
+ * Notes:
+ * Each request has its own timer, and as it is added to the queue, we
+ * set up the timer. When the request completes, we cancel the timer.
+ */
+void blk_add_timer(struct request *req)
+{
+ __blk_add_timer(req, &req->q->timeout_list);
}
diff --git a/block/blk.h b/block/blk.h
index e837b8f..c90e1d8 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -10,6 +10,7 @@
#define BLK_BATCH_REQ 32
extern struct kmem_cache *blk_requestq_cachep;
+extern struct kmem_cache *request_cachep;
extern struct kobj_type blk_queue_ktype;
extern struct ida blk_queue_ida;
@@ -34,14 +35,30 @@ bool __blk_end_bidi_request(struct request *rq, int error,
unsigned int nr_bytes, unsigned int bidi_bytes);
void blk_rq_timed_out_timer(unsigned long data);
+void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout,
+ unsigned int *next_set);
+void __blk_add_timer(struct request *req, struct list_head *timeout_list);
void blk_delete_timer(struct request *);
void blk_add_timer(struct request *);
+
+bool bio_attempt_front_merge(struct request_queue *q, struct request *req,
+ struct bio *bio);
+bool bio_attempt_back_merge(struct request_queue *q, struct request *req,
+ struct bio *bio);
+bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
+ unsigned int *request_count);
+
+void blk_account_io_start(struct request *req, bool new_io);
+void blk_account_io_completion(struct request *req, unsigned int bytes);
+void blk_account_io_done(struct request *req);
+
/*
* Internal atomic flags for request handling
*/
enum rq_atomic_flags {
REQ_ATOM_COMPLETE = 0,
+ REQ_ATOM_STARTED,
};
/*
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index d5cd313..4d5cec1 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -1508,6 +1508,29 @@ static void cfq_init_cfqg_base(struct cfq_group *cfqg)
}
#ifdef CONFIG_CFQ_GROUP_IOSCHED
+static void cfqg_stats_init(struct cfqg_stats *stats)
+{
+ blkg_rwstat_init(&stats->service_bytes);
+ blkg_rwstat_init(&stats->serviced);
+ blkg_rwstat_init(&stats->merged);
+ blkg_rwstat_init(&stats->service_time);
+ blkg_rwstat_init(&stats->wait_time);
+ blkg_rwstat_init(&stats->queued);
+
+ blkg_stat_init(&stats->sectors);
+ blkg_stat_init(&stats->time);
+
+#ifdef CONFIG_DEBUG_BLK_CGROUP
+ blkg_stat_init(&stats->unaccounted_time);
+ blkg_stat_init(&stats->avg_queue_size_sum);
+ blkg_stat_init(&stats->avg_queue_size_samples);
+ blkg_stat_init(&stats->dequeue);
+ blkg_stat_init(&stats->group_wait_time);
+ blkg_stat_init(&stats->idle_time);
+ blkg_stat_init(&stats->empty_time);
+#endif
+}
+
static void cfq_pd_init(struct blkcg_gq *blkg)
{
struct cfq_group *cfqg = blkg_to_cfqg(blkg);
@@ -1515,6 +1538,8 @@ static void cfq_pd_init(struct blkcg_gq *blkg)
cfq_init_cfqg_base(cfqg);
cfqg->weight = blkg->blkcg->cfq_weight;
cfqg->leaf_weight = blkg->blkcg->cfq_leaf_weight;
+ cfqg_stats_init(&cfqg->stats);
+ cfqg_stats_init(&cfqg->dead_stats);
}
static void cfq_pd_offline(struct blkcg_gq *blkg)
@@ -1607,12 +1632,11 @@ static u64 cfqg_prfill_weight_device(struct seq_file *sf,
return __blkg_prfill_u64(sf, pd, cfqg->dev_weight);
}
-static int cfqg_print_weight_device(struct cgroup *cgrp, struct cftype *cft,
- struct seq_file *sf)
+static int cfqg_print_weight_device(struct cgroup_subsys_state *css,
+ struct cftype *cft, struct seq_file *sf)
{
- blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp),
- cfqg_prfill_weight_device, &blkcg_policy_cfq, 0,
- false);
+ blkcg_print_blkgs(sf, css_to_blkcg(css), cfqg_prfill_weight_device,
+ &blkcg_policy_cfq, 0, false);
return 0;
}
@@ -1626,35 +1650,34 @@ static u64 cfqg_prfill_leaf_weight_device(struct seq_file *sf,
return __blkg_prfill_u64(sf, pd, cfqg->dev_leaf_weight);
}
-static int cfqg_print_leaf_weight_device(struct cgroup *cgrp,
+static int cfqg_print_leaf_weight_device(struct cgroup_subsys_state *css,
struct cftype *cft,
struct seq_file *sf)
{
- blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp),
- cfqg_prfill_leaf_weight_device, &blkcg_policy_cfq, 0,
- false);
+ blkcg_print_blkgs(sf, css_to_blkcg(css), cfqg_prfill_leaf_weight_device,
+ &blkcg_policy_cfq, 0, false);
return 0;
}
-static int cfq_print_weight(struct cgroup *cgrp, struct cftype *cft,
+static int cfq_print_weight(struct cgroup_subsys_state *css, struct cftype *cft,
struct seq_file *sf)
{
- seq_printf(sf, "%u\n", cgroup_to_blkcg(cgrp)->cfq_weight);
+ seq_printf(sf, "%u\n", css_to_blkcg(css)->cfq_weight);
return 0;
}
-static int cfq_print_leaf_weight(struct cgroup *cgrp, struct cftype *cft,
- struct seq_file *sf)
+static int cfq_print_leaf_weight(struct cgroup_subsys_state *css,
+ struct cftype *cft, struct seq_file *sf)
{
- seq_printf(sf, "%u\n",
- cgroup_to_blkcg(cgrp)->cfq_leaf_weight);
+ seq_printf(sf, "%u\n", css_to_blkcg(css)->cfq_leaf_weight);
return 0;
}
-static int __cfqg_set_weight_device(struct cgroup *cgrp, struct cftype *cft,
- const char *buf, bool is_leaf_weight)
+static int __cfqg_set_weight_device(struct cgroup_subsys_state *css,
+ struct cftype *cft, const char *buf,
+ bool is_leaf_weight)
{
- struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
+ struct blkcg *blkcg = css_to_blkcg(css);
struct blkg_conf_ctx ctx;
struct cfq_group *cfqg;
int ret;
@@ -1680,22 +1703,22 @@ static int __cfqg_set_weight_device(struct cgroup *cgrp, struct cftype *cft,
return ret;
}
-static int cfqg_set_weight_device(struct cgroup *cgrp, struct cftype *cft,
- const char *buf)
+static int cfqg_set_weight_device(struct cgroup_subsys_state *css,
+ struct cftype *cft, const char *buf)
{
- return __cfqg_set_weight_device(cgrp, cft, buf, false);
+ return __cfqg_set_weight_device(css, cft, buf, false);
}
-static int cfqg_set_leaf_weight_device(struct cgroup *cgrp, struct cftype *cft,
- const char *buf)
+static int cfqg_set_leaf_weight_device(struct cgroup_subsys_state *css,
+ struct cftype *cft, const char *buf)
{
- return __cfqg_set_weight_device(cgrp, cft, buf, true);
+ return __cfqg_set_weight_device(css, cft, buf, true);
}
-static int __cfq_set_weight(struct cgroup *cgrp, struct cftype *cft, u64 val,
- bool is_leaf_weight)
+static int __cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft,
+ u64 val, bool is_leaf_weight)
{
- struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
+ struct blkcg *blkcg = css_to_blkcg(css);
struct blkcg_gq *blkg;
if (val < CFQ_WEIGHT_MIN || val > CFQ_WEIGHT_MAX)
@@ -1727,30 +1750,32 @@ static int __cfq_set_weight(struct cgroup *cgrp, struct cftype *cft, u64 val,
return 0;
}
-static int cfq_set_weight(struct cgroup *cgrp, struct cftype *cft, u64 val)
+static int cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft,
+ u64 val)
{
- return __cfq_set_weight(cgrp, cft, val, false);
+ return __cfq_set_weight(css, cft, val, false);
}
-static int cfq_set_leaf_weight(struct cgroup *cgrp, struct cftype *cft, u64 val)
+static int cfq_set_leaf_weight(struct cgroup_subsys_state *css,
+ struct cftype *cft, u64 val)
{
- return __cfq_set_weight(cgrp, cft, val, true);
+ return __cfq_set_weight(css, cft, val, true);
}
-static int cfqg_print_stat(struct cgroup *cgrp, struct cftype *cft,
+static int cfqg_print_stat(struct cgroup_subsys_state *css, struct cftype *cft,
struct seq_file *sf)
{
- struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
+ struct blkcg *blkcg = css_to_blkcg(css);
blkcg_print_blkgs(sf, blkcg, blkg_prfill_stat, &blkcg_policy_cfq,
cft->private, false);
return 0;
}
-static int cfqg_print_rwstat(struct cgroup *cgrp, struct cftype *cft,
- struct seq_file *sf)
+static int cfqg_print_rwstat(struct cgroup_subsys_state *css,
+ struct cftype *cft, struct seq_file *sf)
{
- struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
+ struct blkcg *blkcg = css_to_blkcg(css);
blkcg_print_blkgs(sf, blkcg, blkg_prfill_rwstat, &blkcg_policy_cfq,
cft->private, true);
@@ -1773,20 +1798,20 @@ static u64 cfqg_prfill_rwstat_recursive(struct seq_file *sf,
return __blkg_prfill_rwstat(sf, pd, &sum);
}
-static int cfqg_print_stat_recursive(struct cgroup *cgrp, struct cftype *cft,
- struct seq_file *sf)
+static int cfqg_print_stat_recursive(struct cgroup_subsys_state *css,
+ struct cftype *cft, struct seq_file *sf)
{
- struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
+ struct blkcg *blkcg = css_to_blkcg(css);
blkcg_print_blkgs(sf, blkcg, cfqg_prfill_stat_recursive,
&blkcg_policy_cfq, cft->private, false);
return 0;
}
-static int cfqg_print_rwstat_recursive(struct cgroup *cgrp, struct cftype *cft,
- struct seq_file *sf)
+static int cfqg_print_rwstat_recursive(struct cgroup_subsys_state *css,
+ struct cftype *cft, struct seq_file *sf)
{
- struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
+ struct blkcg *blkcg = css_to_blkcg(css);
blkcg_print_blkgs(sf, blkcg, cfqg_prfill_rwstat_recursive,
&blkcg_policy_cfq, cft->private, true);
@@ -1803,17 +1828,17 @@ static u64 cfqg_prfill_avg_queue_size(struct seq_file *sf,
if (samples) {
v = blkg_stat_read(&cfqg->stats.avg_queue_size_sum);
- do_div(v, samples);
+ v = div64_u64(v, samples);
}
__blkg_prfill_u64(sf, pd, v);
return 0;
}
/* print avg_queue_size */
-static int cfqg_print_avg_queue_size(struct cgroup *cgrp, struct cftype *cft,
- struct seq_file *sf)
+static int cfqg_print_avg_queue_size(struct cgroup_subsys_state *css,
+ struct cftype *cft, struct seq_file *sf)
{
- struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
+ struct blkcg *blkcg = css_to_blkcg(css);
blkcg_print_blkgs(sf, blkcg, cfqg_prfill_avg_queue_size,
&blkcg_policy_cfq, 0, false);
@@ -4347,18 +4372,28 @@ static void cfq_exit_queue(struct elevator_queue *e)
kfree(cfqd);
}
-static int cfq_init_queue(struct request_queue *q)
+static int cfq_init_queue(struct request_queue *q, struct elevator_type *e)
{
struct cfq_data *cfqd;
struct blkcg_gq *blkg __maybe_unused;
int i, ret;
+ struct elevator_queue *eq;
- cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node);
- if (!cfqd)
+ eq = elevator_alloc(q, e);
+ if (!eq)
return -ENOMEM;
+ cfqd = kzalloc_node(sizeof(*cfqd), GFP_KERNEL, q->node);
+ if (!cfqd) {
+ kobject_put(&eq->kobj);
+ return -ENOMEM;
+ }
+ eq->elevator_data = cfqd;
+
cfqd->queue = q;
- q->elevator->elevator_data = cfqd;
+ spin_lock_irq(q->queue_lock);
+ q->elevator = eq;
+ spin_unlock_irq(q->queue_lock);
/* Init root service tree */
cfqd->grp_service_tree = CFQ_RB_ROOT;
@@ -4433,6 +4468,7 @@ static int cfq_init_queue(struct request_queue *q)
out_free:
kfree(cfqd);
+ kobject_put(&eq->kobj);
return ret;
}
diff --git a/block/cmdline-parser.c b/block/cmdline-parser.c
new file mode 100644
index 0000000..cc2637f
--- /dev/null
+++ b/block/cmdline-parser.c
@@ -0,0 +1,250 @@
+/*
+ * Parse command line, get partition information
+ *
+ * Written by Cai Zhiyong <caizhiyong@huawei.com>
+ *
+ */
+#include <linux/buffer_head.h>
+#include <linux/module.h>
+#include <linux/cmdline-parser.h>
+
+static int parse_subpart(struct cmdline_subpart **subpart, char *partdef)
+{
+ int ret = 0;
+ struct cmdline_subpart *new_subpart;
+
+ *subpart = NULL;
+
+ new_subpart = kzalloc(sizeof(struct cmdline_subpart), GFP_KERNEL);
+ if (!new_subpart)
+ return -ENOMEM;
+
+ if (*partdef == '-') {
+ new_subpart->size = (sector_t)(~0ULL);
+ partdef++;
+ } else {
+ new_subpart->size = (sector_t)memparse(partdef, &partdef);
+ if (new_subpart->size < (sector_t)PAGE_SIZE) {
+ pr_warn("cmdline partition size is invalid.");
+ ret = -EINVAL;
+ goto fail;
+ }
+ }
+
+ if (*partdef == '@') {
+ partdef++;
+ new_subpart->from = (sector_t)memparse(partdef, &partdef);
+ } else {
+ new_subpart->from = (sector_t)(~0ULL);
+ }
+
+ if (*partdef == '(') {
+ int length;
+ char *next = strchr(++partdef, ')');
+
+ if (!next) {
+ pr_warn("cmdline partition format is invalid.");
+ ret = -EINVAL;
+ goto fail;
+ }
+
+ length = min_t(int, next - partdef,
+ sizeof(new_subpart->name) - 1);
+ strncpy(new_subpart->name, partdef, length);
+ new_subpart->name[length] = '\0';
+
+ partdef = ++next;
+ } else
+ new_subpart->name[0] = '\0';
+
+ new_subpart->flags = 0;
+
+ if (!strncmp(partdef, "ro", 2)) {
+ new_subpart->flags |= PF_RDONLY;
+ partdef += 2;
+ }
+
+ if (!strncmp(partdef, "lk", 2)) {
+ new_subpart->flags |= PF_POWERUP_LOCK;
+ partdef += 2;
+ }
+
+ *subpart = new_subpart;
+ return 0;
+fail:
+ kfree(new_subpart);
+ return ret;
+}
+
+static void free_subpart(struct cmdline_parts *parts)
+{
+ struct cmdline_subpart *subpart;
+
+ while (parts->subpart) {
+ subpart = parts->subpart;
+ parts->subpart = subpart->next_subpart;
+ kfree(subpart);
+ }
+}
+
+static int parse_parts(struct cmdline_parts **parts, const char *bdevdef)
+{
+ int ret = -EINVAL;
+ char *next;
+ int length;
+ struct cmdline_subpart **next_subpart;
+ struct cmdline_parts *newparts;
+ char buf[BDEVNAME_SIZE + 32 + 4];
+
+ *parts = NULL;
+
+ newparts = kzalloc(sizeof(struct cmdline_parts), GFP_KERNEL);
+ if (!newparts)
+ return -ENOMEM;
+
+ next = strchr(bdevdef, ':');
+ if (!next) {
+ pr_warn("cmdline partition has no block device.");
+ goto fail;
+ }
+
+ length = min_t(int, next - bdevdef, sizeof(newparts->name) - 1);
+ strncpy(newparts->name, bdevdef, length);
+ newparts->name[length] = '\0';
+ newparts->nr_subparts = 0;
+
+ next_subpart = &newparts->subpart;
+
+ while (next && *(++next)) {
+ bdevdef = next;
+ next = strchr(bdevdef, ',');
+
+ length = (!next) ? (sizeof(buf) - 1) :
+ min_t(int, next - bdevdef, sizeof(buf) - 1);
+
+ strncpy(buf, bdevdef, length);
+ buf[length] = '\0';
+
+ ret = parse_subpart(next_subpart, buf);
+ if (ret)
+ goto fail;
+
+ newparts->nr_subparts++;
+ next_subpart = &(*next_subpart)->next_subpart;
+ }
+
+ if (!newparts->subpart) {
+ pr_warn("cmdline partition has no valid partition.");
+ ret = -EINVAL;
+ goto fail;
+ }
+
+ *parts = newparts;
+
+ return 0;
+fail:
+ free_subpart(newparts);
+ kfree(newparts);
+ return ret;
+}
+
+void cmdline_parts_free(struct cmdline_parts **parts)
+{
+ struct cmdline_parts *next_parts;
+
+ while (*parts) {
+ next_parts = (*parts)->next_parts;
+ free_subpart(*parts);
+ kfree(*parts);
+ *parts = next_parts;
+ }
+}
+
+int cmdline_parts_parse(struct cmdline_parts **parts, const char *cmdline)
+{
+ int ret;
+ char *buf;
+ char *pbuf;
+ char *next;
+ struct cmdline_parts **next_parts;
+
+ *parts = NULL;
+
+ next = pbuf = buf = kstrdup(cmdline, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ next_parts = parts;
+
+ while (next && *pbuf) {
+ next = strchr(pbuf, ';');
+ if (next)
+ *next = '\0';
+
+ ret = parse_parts(next_parts, pbuf);
+ if (ret)
+ goto fail;
+
+ if (next)
+ pbuf = ++next;
+
+ next_parts = &(*next_parts)->next_parts;
+ }
+
+ if (!*parts) {
+ pr_warn("cmdline partition has no valid partition.");
+ ret = -EINVAL;
+ goto fail;
+ }
+
+ ret = 0;
+done:
+ kfree(buf);
+ return ret;
+
+fail:
+ cmdline_parts_free(parts);
+ goto done;
+}
+
+struct cmdline_parts *cmdline_parts_find(struct cmdline_parts *parts,
+ const char *bdev)
+{
+ while (parts && strncmp(bdev, parts->name, sizeof(parts->name)))
+ parts = parts->next_parts;
+ return parts;
+}
+
+/*
+ * add_part() return values:
+ * 0 - success.
+ * 1 - cannot add any more partitions.
+ */
+void cmdline_parts_set(struct cmdline_parts *parts, sector_t disk_size,
+ int slot,
+ int (*add_part)(int, struct cmdline_subpart *, void *),
+ void *param)
+
+{
+ sector_t from = 0;
+ struct cmdline_subpart *subpart;
+
+ for (subpart = parts->subpart; subpart;
+ subpart = subpart->next_subpart, slot++) {
+ if (subpart->from == (sector_t)(~0ULL))
+ subpart->from = from;
+ else
+ from = subpart->from;
+
+ if (from >= disk_size)
+ break;
+
+ if (subpart->size > (disk_size - from))
+ subpart->size = disk_size - from;
+
+ from += subpart->size;
+
+ if (add_part(slot, subpart, param))
+ break;
+ }
+}
diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c
index 7c668c8..fbd5a67 100644
--- a/block/compat_ioctl.c
+++ b/block/compat_ioctl.c
@@ -59,6 +59,7 @@ static int compat_hdio_getgeo(struct gendisk *disk, struct block_device *bdev,
if (!disk->fops->getgeo)
return -ENOTTY;
+ memset(&geo, 0, sizeof(geo));
/*
* We need to set the startsect first, the driver may
* want to override it.
@@ -69,7 +70,7 @@ static int compat_hdio_getgeo(struct gendisk *disk, struct block_device *bdev,
return ret;
ret = copy_to_user(ugeo, &geo, 4);
- ret |= __put_user(geo.start, &ugeo->start);
+ ret |= put_user(geo.start, &ugeo->start);
if (ret)
ret = -EFAULT;
diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c
index ba19a3a..9ef6640 100644
--- a/block/deadline-iosched.c
+++ b/block/deadline-iosched.c
@@ -337,14 +337,22 @@ static void deadline_exit_queue(struct elevator_queue *e)
/*
* initialize elevator private data (deadline_data).
*/
-static int deadline_init_queue(struct request_queue *q)
+static int deadline_init_queue(struct request_queue *q, struct elevator_type *e)
{
struct deadline_data *dd;
+ struct elevator_queue *eq;
- dd = kmalloc_node(sizeof(*dd), GFP_KERNEL | __GFP_ZERO, q->node);
- if (!dd)
+ eq = elevator_alloc(q, e);
+ if (!eq)
return -ENOMEM;
+ dd = kzalloc_node(sizeof(*dd), GFP_KERNEL, q->node);
+ if (!dd) {
+ kobject_put(&eq->kobj);
+ return -ENOMEM;
+ }
+ eq->elevator_data = dd;
+
INIT_LIST_HEAD(&dd->fifo_list[READ]);
INIT_LIST_HEAD(&dd->fifo_list[WRITE]);
dd->sort_list[READ] = RB_ROOT;
@@ -355,7 +363,9 @@ static int deadline_init_queue(struct request_queue *q)
dd->front_merges = 1;
dd->fifo_batch = fifo_batch;
- q->elevator->elevator_data = dd;
+ spin_lock_irq(q->queue_lock);
+ q->elevator = eq;
+ spin_unlock_irq(q->queue_lock);
return 0;
}
diff --git a/block/elevator.c b/block/elevator.c
index eba5b04..b7ff286 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -150,12 +150,12 @@ void __init load_default_elevator_module(void)
static struct kobj_type elv_ktype;
-static struct elevator_queue *elevator_alloc(struct request_queue *q,
+struct elevator_queue *elevator_alloc(struct request_queue *q,
struct elevator_type *e)
{
struct elevator_queue *eq;
- eq = kmalloc_node(sizeof(*eq), GFP_KERNEL | __GFP_ZERO, q->node);
+ eq = kzalloc_node(sizeof(*eq), GFP_KERNEL, q->node);
if (unlikely(!eq))
goto err;
@@ -170,6 +170,7 @@ err:
elevator_put(e);
return NULL;
}
+EXPORT_SYMBOL(elevator_alloc);
static void elevator_release(struct kobject *kobj)
{
@@ -185,6 +186,12 @@ int elevator_init(struct request_queue *q, char *name)
struct elevator_type *e = NULL;
int err;
+ /*
+ * q->sysfs_lock must be held to provide mutual exclusion between
+ * elevator_switch() and here.
+ */
+ lockdep_assert_held(&q->sysfs_lock);
+
if (unlikely(q->elevator))
return 0;
@@ -221,16 +228,7 @@ int elevator_init(struct request_queue *q, char *name)
}
}
- q->elevator = elevator_alloc(q, e);
- if (!q->elevator)
- return -ENOMEM;
-
- err = e->ops.elevator_init_fn(q);
- if (err) {
- kobject_put(&q->elevator->kobj);
- return err;
- }
-
+ err = e->ops.elevator_init_fn(q, e);
return 0;
}
EXPORT_SYMBOL(elevator_init);
@@ -935,16 +933,9 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
spin_unlock_irq(q->queue_lock);
/* allocate, init and register new elevator */
- err = -ENOMEM;
- q->elevator = elevator_alloc(q, new_e);
- if (!q->elevator)
- goto fail_init;
-
- err = new_e->ops.elevator_init_fn(q);
- if (err) {
- kobject_put(&q->elevator->kobj);
+ err = new_e->ops.elevator_init_fn(q, new_e);
+ if (err)
goto fail_init;
- }
if (registered) {
err = elv_register_queue(q);
@@ -974,7 +965,7 @@ fail_init:
/*
* Switch this queue to the given IO scheduler.
*/
-int elevator_change(struct request_queue *q, const char *name)
+static int __elevator_change(struct request_queue *q, const char *name)
{
char elevator_name[ELV_NAME_MAX];
struct elevator_type *e;
@@ -996,6 +987,18 @@ int elevator_change(struct request_queue *q, const char *name)
return elevator_switch(q, e);
}
+
+int elevator_change(struct request_queue *q, const char *name)
+{
+ int ret;
+
+ /* Protect q->elevator from elevator_init() */
+ mutex_lock(&q->sysfs_lock);
+ ret = __elevator_change(q, name);
+ mutex_unlock(&q->sysfs_lock);
+
+ return ret;
+}
EXPORT_SYMBOL(elevator_change);
ssize_t elv_iosched_store(struct request_queue *q, const char *name,
@@ -1006,7 +1009,7 @@ ssize_t elv_iosched_store(struct request_queue *q, const char *name,
if (!q->elevator)
return count;
- ret = elevator_change(q, name);
+ ret = __elevator_change(q, name);
if (!ret)
return count;
diff --git a/block/genhd.c b/block/genhd.c
index 20625ee..791f419 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -512,7 +512,7 @@ static void register_disk(struct gendisk *disk)
ddev->parent = disk->driverfs_dev;
- dev_set_name(ddev, disk->disk_name);
+ dev_set_name(ddev, "%s", disk->disk_name);
/* delay uevents, until we scanned partition table */
dev_set_uevent_suppress(ddev, 1);
@@ -1252,8 +1252,7 @@ struct gendisk *alloc_disk_node(int minors, int node_id)
{
struct gendisk *disk;
- disk = kmalloc_node(sizeof(struct gendisk),
- GFP_KERNEL | __GFP_ZERO, node_id);
+ disk = kzalloc_node(sizeof(struct gendisk), GFP_KERNEL, node_id);
if (disk) {
if (!init_part_stats(&disk->part0)) {
kfree(disk);
@@ -1489,9 +1488,11 @@ static void __disk_unblock_events(struct gendisk *disk, bool check_now)
intv = disk_events_poll_jiffies(disk);
set_timer_slack(&ev->dwork.timer, intv / 4);
if (check_now)
- queue_delayed_work(system_freezable_wq, &ev->dwork, 0);
+ queue_delayed_work(system_freezable_power_efficient_wq,
+ &ev->dwork, 0);
else if (intv)
- queue_delayed_work(system_freezable_wq, &ev->dwork, intv);
+ queue_delayed_work(system_freezable_power_efficient_wq,
+ &ev->dwork, intv);
out_unlock:
spin_unlock_irqrestore(&ev->lock, flags);
}
@@ -1534,7 +1535,8 @@ void disk_flush_events(struct gendisk *disk, unsigned int mask)
spin_lock_irq(&ev->lock);
ev->clearing |= mask;
if (!ev->block)
- mod_delayed_work(system_freezable_wq, &ev->dwork, 0);
+ mod_delayed_work(system_freezable_power_efficient_wq,
+ &ev->dwork, 0);
spin_unlock_irq(&ev->lock);
}
@@ -1627,7 +1629,8 @@ static void disk_check_events(struct disk_events *ev,
intv = disk_events_poll_jiffies(disk);
if (!ev->block && intv)
- queue_delayed_work(system_freezable_wq, &ev->dwork, intv);
+ queue_delayed_work(system_freezable_power_efficient_wq,
+ &ev->dwork, intv);
spin_unlock_irq(&ev->lock);
diff --git a/block/ioctl.c b/block/ioctl.c
index a31d91d..7d5c3b2 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -64,7 +64,7 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user
part = add_partition(disk, partno, start, length,
ADDPART_FLAG_NONE, NULL);
mutex_unlock(&bdev->bd_mutex);
- return IS_ERR(part) ? PTR_ERR(part) : 0;
+ return PTR_ERR_OR_ZERO(part);
case BLKPG_DEL_PARTITION:
part = disk_get_part(disk, partno);
if (!part)
diff --git a/block/noop-iosched.c b/block/noop-iosched.c
index 5d1bf70..3de89d4 100644
--- a/block/noop-iosched.c
+++ b/block/noop-iosched.c
@@ -59,16 +59,27 @@ noop_latter_request(struct request_queue *q, struct request *rq)
return list_entry(rq->queuelist.next, struct request, queuelist);
}
-static int noop_init_queue(struct request_queue *q)
+static int noop_init_queue(struct request_queue *q, struct elevator_type *e)
{
struct noop_data *nd;
+ struct elevator_queue *eq;
+
+ eq = elevator_alloc(q, e);
+ if (!eq)
+ return -ENOMEM;
nd = kmalloc_node(sizeof(*nd), GFP_KERNEL, q->node);
- if (!nd)
+ if (!nd) {
+ kobject_put(&eq->kobj);
return -ENOMEM;
+ }
+ eq->elevator_data = nd;
INIT_LIST_HEAD(&nd->queue);
- q->elevator->elevator_data = nd;
+
+ spin_lock_irq(q->queue_lock);
+ q->elevator = eq;
+ spin_unlock_irq(q->queue_lock);
return 0;
}
diff --git a/block/partitions/Kconfig b/block/partitions/Kconfig
index 75a54e1..9b29a99 100644
--- a/block/partitions/Kconfig
+++ b/block/partitions/Kconfig
@@ -68,6 +68,17 @@ config ACORN_PARTITION_RISCIX
of machines called RISCiX. If you say 'Y' here, Linux will be able
to read disks partitioned under RISCiX.
+config AIX_PARTITION
+ bool "AIX basic partition table support" if PARTITION_ADVANCED
+ help
+ Say Y here if you would like to be able to read the hard disk
+ partition table format used by IBM or Motorola PowerPC machines
+ running AIX. AIX actually uses a Logical Volume Manager, where
+ "logical volumes" can be spread across one or multiple disks,
+ but this driver works only for the simple case of partitions which
+ are contiguous.
+ Otherwise, say N.
+
config OSF_PARTITION
bool "Alpha OSF partition support" if PARTITION_ADVANCED
default y if ALPHA
@@ -249,3 +260,10 @@ config SYSV68_PARTITION
partition table format used by Motorola Delta machines (using
sysv68).
Otherwise, say N.
+
+config CMDLINE_PARTITION
+ bool "Command line partition support" if PARTITION_ADVANCED
+ select BLK_CMDLINE_PARSER
+ help
+ Say Y here if you want to read the partition table from bootargs.
+ The format for the command line is just like mtdparts.
diff --git a/block/partitions/Makefile b/block/partitions/Makefile
index 03af8ea..37a9527 100644
--- a/block/partitions/Makefile
+++ b/block/partitions/Makefile
@@ -7,6 +7,8 @@ obj-$(CONFIG_BLOCK) := check.o
obj-$(CONFIG_ACORN_PARTITION) += acorn.o
obj-$(CONFIG_AMIGA_PARTITION) += amiga.o
obj-$(CONFIG_ATARI_PARTITION) += atari.o
+obj-$(CONFIG_AIX_PARTITION) += aix.o
+obj-$(CONFIG_CMDLINE_PARTITION) += cmdline.o
obj-$(CONFIG_MAC_PARTITION) += mac.o
obj-$(CONFIG_LDM_PARTITION) += ldm.o
obj-$(CONFIG_MSDOS_PARTITION) += msdos.o
diff --git a/block/partitions/aix.c b/block/partitions/aix.c
new file mode 100644
index 0000000..43be471
--- /dev/null
+++ b/block/partitions/aix.c
@@ -0,0 +1,293 @@
+/*
+ * block/partitions/aix.c
+ *
+ * Copyright (C) 2012-2013 Philippe De Muyter <phdm@macqel.be>
+ */
+
+#include "check.h"
+#include "aix.h"
+
+struct lvm_rec { /* layout overlaid on disk sector 7 by aix_partition() */
+ char lvm_id[4]; /* "_LVM" */
+ char reserved4[16];
+ __be32 lvmarea_len;
+ __be32 vgda_len; /* added to vgda_psn[0] to locate the LV name table */
+ __be32 vgda_psn[2]; /* only element 0 is used, as the VGDA start sector */
+ char reserved36[10];
+ __be16 pp_size; /* log2(pp_size) */
+ char reserved46[12];
+ __be16 version; /* aix_partition() only parses version 1 */
+ };
+
+struct vgda { /* layout overlaid on the sector at lvm_rec.vgda_psn[0] */
+ __be32 secs;
+ __be32 usec;
+ char reserved8[16];
+ __be16 numlvs; /* how many LVs to look for in the lvd table; the only field read here */
+ __be16 maxlvs;
+ __be16 pp_size;
+ __be16 numpvs;
+ __be16 total_vgdas;
+ __be16 vgda_size;
+ };
+
+struct lvd { /* one entry of the table read from sector vgda_psn[0] + 1 */
+ __be16 lv_ix;
+ __be16 res2;
+ __be16 res4;
+ __be16 maxsize;
+ __be16 lv_state;
+ __be16 mirror;
+ __be16 mirror_policy;
+ __be16 num_lps; /* copied to lv_info.pps_per_lv; 0 is not counted as a found LV */
+ __be16 res10[8];
+ };
+
+struct lvname { /* one of the LVM_MAXLVS name slots loaded by alloc_lvn() */
+ char name[64]; /* raw bytes copied from disk */
+ };
+
+struct ppe { /* one entry of the pvd.ppe[] array walked by aix_partition() */
+ __be16 lv_ix; /* 1-based index of the owning LV (the parser subtracts 1) */
+ unsigned short res2;
+ unsigned short res4;
+ __be16 lp_ix; /* position inside that LV, starting at 1; 0 entries are skipped */
+ unsigned short res8[12];
+ };
+
+struct pvd { /* filled by alloc_pvd() from the sectors starting at vgda_psn[0] + 17 */
+ char reserved0[16];
+ __be16 pp_count; /* number of ppe[] entries aix_partition() iterates over */
+ char reserved18[2];
+ __be32 psn_part1; /* sector offset added to every computed partition start */
+ char reserved24[8];
+ struct ppe ppe[1016];
+ };
+
+#define LVM_MAXLVS 256
+
+/**
+ * last_lba(): index of the final 512-byte sector of a block device
+ * @bdev: block device to query
+ *
+ * Description: Converts the size of the device inode to 512-byte sectors
+ * and returns that count minus one.  Returns 0 when @bdev or its inode
+ * is missing.
+ */
+static u64 last_lba(struct block_device *bdev)
+{
+	u64 nr_sectors;
+
+	if (bdev && bdev->bd_inode) {
+		nr_sectors = bdev->bd_inode->i_size >> 9;
+		return nr_sectors - 1ULL;
+	}
+	return 0;
+}
+
+/**
+ * read_lba(): copy bytes from disk into a caller buffer
+ * @state: partition parsing state; sectors are read through it
+ * @lba: 512-byte sector number at which reading starts
+ * @buffer: destination buffer, at least @count bytes long
+ * @count: number of bytes wanted
+ *
+ * Description: Reads sector after sector, copying at most 512 bytes from
+ * each, until @count bytes were copied or a sector cannot be read.
+ * Returns the number of bytes actually copied, which is less than @count
+ * when a sector read failed part way.  Returns 0 when @buffer is NULL or
+ * the requested range lies beyond last_lba().
+ */
+static size_t read_lba(struct parsed_partitions *state, u64 lba, u8 *buffer,
+			size_t count)
+{
+	size_t done = 0;
+
+	if (!buffer)
+		return 0;
+	if (lba + count / 512 > last_lba(state->bdev))
+		return 0;
+
+	for (; count; lba++) {
+		Sector sect;
+		size_t chunk = count < 512 ? count : 512;
+		unsigned char *data = read_part_sector(state, lba, &sect);
+
+		if (!data)
+			break;
+		memcpy(buffer + done, data, chunk);
+		put_dev_sector(sect);
+		done += chunk;
+		count -= chunk;
+	}
+	return done;
+}
+
+/**
+ * alloc_pvd(): allocate and load a physical volume descriptor
+ * @state: partition parsing state used for the disk reads
+ * @lba: first 512-byte sector of the descriptor
+ *
+ * Description: Allocates a struct pvd and fills it completely from disk,
+ * starting at @lba.  Returns the descriptor, or NULL when the allocation
+ * fails or fewer than sizeof(struct pvd) bytes could be read.
+ * Notes: the caller owns the result and must kfree() it.
+ */
+static struct pvd *alloc_pvd(struct parsed_partitions *state, u32 lba)
+{
+	struct pvd *desc = kmalloc(sizeof(*desc), GFP_KERNEL);
+
+	if (!desc)
+		return NULL;
+
+	if (read_lba(state, lba, (u8 *)desc, sizeof(*desc)) >= sizeof(*desc))
+		return desc;
+
+	/* short read: a partly filled descriptor is of no use */
+	kfree(desc);
+	return NULL;
+}
+
+/**
+ * alloc_lvn(): allocate and load the logical volume name table
+ * @state: partition parsing state used for the disk reads
+ * @lba: first 512-byte sector of the name table
+ *
+ * Description: Allocates room for LVM_MAXLVS struct lvname entries and
+ * fills all of them from disk, starting at @lba.  Returns the table, or
+ * NULL when the allocation fails or the table could not be read in full.
+ * Notes: the caller owns the result and must kfree() it.
+ */
+static struct lvname *alloc_lvn(struct parsed_partitions *state, u32 lba)
+{
+	const size_t table_bytes = sizeof(struct lvname) * LVM_MAXLVS;
+	struct lvname *names = kmalloc(table_bytes, GFP_KERNEL);
+
+	if (!names)
+		return NULL;
+
+	if (read_lba(state, lba, (u8 *)names, table_bytes) >= table_bytes)
+		return names;
+
+	/* short read: drop the incomplete table */
+	kfree(names);
+	return NULL;
+}
+
+/**
+ * aix_partition(): expose contiguous AIX logical volumes as partitions
+ * @state: partition parsing state of the disk being scanned
+ *
+ * Description: Reads the LVM record in sector 7, then the volume group
+ * data it points to: the LV count, the per-LV sizes (struct lvd), the LV
+ * names and the physical partition map (struct pvd).  Every logical
+ * volume whose physical partitions appear in the map as one unbroken,
+ * ordered run is registered with put_partition(); volumes that were seen
+ * but are not contiguous only produce a warning.
+ *
+ * All of this data comes from disk, so every index and count taken from
+ * it is range-checked before being used.
+ *
+ * Returns 1 when at least one partition was added, 0 otherwise.
+ */
+int aix_partition(struct parsed_partitions *state)
+{
+	int ret = 0;
+	Sector sect;
+	unsigned char *d;
+	u32 pp_blocks_size = 0;
+	u32 vgda_sector = 0;
+	u32 vgda_len = 0;
+	int numlvs = 0;
+	struct pvd *pvd = NULL;
+	struct lv_info {
+		unsigned short pps_per_lv;
+		unsigned short pps_found;
+		unsigned char lv_is_contiguous;
+	} *lvip;
+	struct lvname *n = NULL;
+
+	d = read_part_sector(state, 7, &sect);
+	if (d) {
+		struct lvm_rec *p = (struct lvm_rec *)d;
+		u16 lvm_version = be16_to_cpu(p->version);
+		char tmp[64];
+
+		if (lvm_version == 1) {
+			unsigned int pp_size_log2 = be16_to_cpu(p->pp_size);
+
+			snprintf(tmp, sizeof(tmp),
+				 " AIX LVM header version %u found\n",
+				 lvm_version);
+			/*
+			 * pp_size is an on-disk value: shifting a 32-bit
+			 * quantity by 32 or more is undefined, so such a
+			 * header is reported but not parsed any further.
+			 */
+			if (pp_size_log2 < 32) {
+				pp_blocks_size = (1U << pp_size_log2) / 512;
+				vgda_len = be32_to_cpu(p->vgda_len);
+				vgda_sector = be32_to_cpu(p->vgda_psn[0]);
+			}
+		} else {
+			snprintf(tmp, sizeof(tmp),
+				 " unsupported AIX LVM version %d found\n",
+				 lvm_version);
+		}
+		strlcat(state->pp_buf, tmp, PAGE_SIZE);
+		put_dev_sector(sect);
+	}
+	if (vgda_sector && (d = read_part_sector(state, vgda_sector, &sect))) {
+		struct vgda *p = (struct vgda *)d;
+
+		numlvs = be16_to_cpu(p->numlvs);
+		put_dev_sector(sect);
+	}
+	/* kcalloc() zeroes the array and checks the size multiplication */
+	lvip = kcalloc(state->limit, sizeof(*lvip), GFP_KERNEL);
+	if (!lvip)
+		return 0;
+	if (numlvs && (d = read_part_sector(state, vgda_sector + 1, &sect))) {
+		struct lvd *p = (struct lvd *)d;
+		int i;
+
+		n = alloc_lvn(state, vgda_sector + vgda_len - 33);
+		if (n) {
+			int foundlvs = 0;
+
+			/*
+			 * NOTE(review): p[] is indexed up to state->limit
+			 * entries although only one sector was requested;
+			 * confirm the buffer behind read_part_sector() is
+			 * large enough for that.
+			 */
+			for (i = 0; foundlvs < numlvs && i < state->limit; i += 1) {
+				lvip[i].pps_per_lv = be16_to_cpu(p[i].num_lps);
+				if (lvip[i].pps_per_lv)
+					foundlvs += 1;
+			}
+			/*
+			 * The walk below prints n[].name and compares against
+			 * lvip[].pps_per_lv, so the map is only loaded once
+			 * both are available.
+			 */
+			pvd = alloc_pvd(state, vgda_sector + 17);
+		}
+		put_dev_sector(sect);
+	}
+	if (pvd) {
+		const int maxpps = sizeof(pvd->ppe) / sizeof(pvd->ppe[0]);
+		int numpps = be16_to_cpu(pvd->pp_count);
+		int psn_part1 = be32_to_cpu(pvd->psn_part1);
+		int i;
+		int cur_lv_ix = -1;
+		int next_lp_ix = 1;
+		int lp_ix;
+
+		/* pp_count comes from disk; never walk past the ppe[] array */
+		if (numpps > maxpps)
+			numpps = maxpps;
+
+		for (i = 0; i < numpps; i += 1) {
+			struct ppe *p = pvd->ppe + i;
+			unsigned int lv_ix;
+
+			lp_ix = be16_to_cpu(p->lp_ix);
+			if (!lp_ix) {
+				next_lp_ix = 1;
+				continue;
+			}
+			/*
+			 * An on-disk lv_ix of 0 wraps to UINT_MAX here and is
+			 * rejected by the same test; lvip[] has exactly
+			 * state->limit entries, hence ">=".
+			 */
+			lv_ix = be16_to_cpu(p->lv_ix) - 1;
+			if (lv_ix >= state->limit) {
+				cur_lv_ix = -1;
+				continue;
+			}
+			lvip[lv_ix].pps_found += 1;
+			if (lp_ix == 1) {
+				cur_lv_ix = lv_ix;
+				next_lp_ix = 1;
+			} else if (lv_ix != cur_lv_ix || lp_ix != next_lp_ix) {
+				next_lp_ix = 1;
+				continue;
+			}
+			if (lp_ix == lvip[lv_ix].pps_per_lv) {
+				char tmp[70];
+
+				/* run complete: it started lp_ix - 1 entries ago */
+				put_partition(state, lv_ix + 1,
+					      (i + 1 - lp_ix) * pp_blocks_size + psn_part1,
+					      lvip[lv_ix].pps_per_lv * pp_blocks_size);
+				/* names are raw disk bytes: bound the print */
+				snprintf(tmp, sizeof(tmp), " <%.*s>\n",
+					 (int)sizeof(n[lv_ix].name),
+					 n[lv_ix].name);
+				strlcat(state->pp_buf, tmp, PAGE_SIZE);
+				lvip[lv_ix].lv_is_contiguous = 1;
+				ret = 1;
+				next_lp_ix = 1;
+			} else
+				next_lp_ix += 1;
+		}
+		for (i = 0; i < state->limit; i += 1)
+			if (lvip[i].pps_found && !lvip[i].lv_is_contiguous)
+				pr_warn("partition %.*s (%u pp's found) is "
+					"not contiguous\n",
+					(int)sizeof(n[i].name), n[i].name,
+					lvip[i].pps_found);
+		kfree(pvd);
+	}
+	kfree(n);
+	kfree(lvip);
+	return ret;
+}
diff --git a/block/partitions/aix.h b/block/partitions/aix.h
new file mode 100644
index 0000000..e0c66a9
--- /dev/null
+++ b/block/partitions/aix.h
@@ -0,0 +1 @@
+extern int aix_partition(struct parsed_partitions *state);
diff --git a/block/partitions/check.c b/block/partitions/check.c
index 19ba207..9ac1df7 100644
--- a/block/partitions/check.c
+++ b/block/partitions/check.c
@@ -34,6 +34,7 @@
#include "efi.h"
#include "karma.h"
#include "sysv68.h"
+#include "cmdline.h"
int warn_no_part = 1; /*This is ugly: should make genhd removable media aware*/
@@ -65,6 +66,9 @@ static int (*check_part[])(struct parsed_partitions *) = {
adfspart_check_ADFS,
#endif
+#ifdef CONFIG_CMDLINE_PARTITION
+ cmdline_partition,
+#endif
#ifdef CONFIG_EFI_PARTITION
efi_partition, /* this must come before msdos */
#endif
diff --git a/block/partitions/cmdline.c b/block/partitions/cmdline.c
new file mode 100644
index 0000000..5141b56
--- /dev/null
+++ b/block/partitions/cmdline.c
@@ -0,0 +1,99 @@
+/*
+ * Copyright (C) 2013 HUAWEI
+ * Author: Cai Zhiyong <caizhiyong@huawei.com>
+ *
+ * Read block device partition table from the command line.
+ * Typically used for fixed block (eMMC) embedded devices.
+ * It has no MBR, so saves storage space. Bootloader can be easily accessed
+ * by absolute address of data on the block device.
+ * Users can easily change the partition.
+ *
+ * The format for the command line is just like mtdparts.
+ *
+ * For further information, see "Documentation/block/cmdline-partition.txt"
+ *
+ */
+
+#include <linux/cmdline-parser.h>
+
+#include "check.h"
+#include "cmdline.h"
+
+static char *cmdline;
+static struct cmdline_parts *bdev_parts;
+
+/*
+ * Callback handed to cmdline_parts_set(): registers one command-line
+ * sub-partition in @slot of the parsed_partitions passed through @param,
+ * copies its name into the slot's volume name and appends "(name)" to the
+ * partition report buffer.
+ * Returns 1 when @slot is outside the table, 0 otherwise.
+ */
+static int add_part(int slot, struct cmdline_subpart *subpart, void *param)
+{
+	struct parsed_partitions *state = param;
+	struct partition_meta_info *info;
+	char tmp[sizeof(info->volname) + 4];
+	int label_min;
+
+	if (slot >= state->limit)
+		return 1;
+
+	/* byte offsets and sizes become 512-byte sector counts */
+	put_partition(state, slot, subpart->from >> 9,
+		      subpart->size >> 9);
+
+	info = &state->parts[slot].info;
+	state->parts[slot].has_info = true;
+
+	/* copy as much of the name as fits, then terminate explicitly */
+	label_min = min_t(int, sizeof(info->volname) - 1,
+			  sizeof(subpart->name));
+	strncpy(info->volname, subpart->name, label_min);
+	info->volname[label_min] = '\0';
+
+	snprintf(tmp, sizeof(tmp), "(%s)", info->volname);
+	strlcat(state->pp_buf, tmp, PAGE_SIZE);
+
+	return 0;
+}
+
+static int __init cmdline_parts_setup(char *s) /* handler for the "blkdevparts=" boot option */
+{
+ cmdline = s; /* only remembered here; parsed later by cmdline_partition() */
+ return 1;
+}
+__setup("blkdevparts=", cmdline_parts_setup); /* registers the handler above */
+
+/*
+ * Purpose: create partitions from the "blkdevparts=" command line option.
+ *
+ * The saved option string is parsed on the first call that finds it and is
+ * then dropped; later calls reuse the parsed table in bdev_parts.
+ *
+ * Returns:
+ * -1 if the command line could not be parsed
+ * 0 if nothing was given on the command line for this device
+ * 1 if successful
+ */
+int cmdline_partition(struct parsed_partitions *state)
+{
+	char bdev[BDEVNAME_SIZE];
+	struct cmdline_parts *parts;
+	sector_t disk_size;
+
+	if (cmdline) {
+		char *args = cmdline;
+
+		/* the string is consumed exactly once, whatever the outcome */
+		cmdline = NULL;
+
+		if (bdev_parts)
+			cmdline_parts_free(&bdev_parts);
+
+		if (cmdline_parts_parse(&bdev_parts, args))
+			return -1;
+	}
+
+	if (!bdev_parts)
+		return 0;
+
+	bdevname(state->bdev, bdev);
+	parts = cmdline_parts_find(bdev_parts, bdev);
+	if (!parts)
+		return 0;
+
+	/* capacity is counted in 512-byte sectors; the parser wants bytes */
+	disk_size = get_capacity(state->bdev->bd_disk) << 9;
+	cmdline_parts_set(parts, disk_size, 1, add_part, (void *)state);
+
+	strlcat(state->pp_buf, "\n", PAGE_SIZE);
+
+	return 1;
+}
diff --git a/block/partitions/cmdline.h b/block/partitions/cmdline.h
new file mode 100644
index 0000000..26e0f8d
--- /dev/null
+++ b/block/partitions/cmdline.h
@@ -0,0 +1,2 @@
+
+int cmdline_partition(struct parsed_partitions *state);
diff --git a/block/partitions/efi.c b/block/partitions/efi.c
index c85fc89..dc51f46 100644
--- a/block/partitions/efi.c
+++ b/block/partitions/efi.c
@@ -25,6 +25,9 @@
* TODO:
*
* Changelog:
+ * Mon August 5th, 2013 Davidlohr Bueso <davidlohr@hp.com>
+ * - detect hybrid MBRs, tighter pMBR checking & cleanups.
+ *
* Mon Nov 09 2004 Matt Domsch <Matt_Domsch@dell.com>
* - test for valid PMBR and valid PGPT before ever reading
* AGPT, allow override with 'gpt' kernel command line option.
@@ -93,6 +96,7 @@
* - Code works, detects all the partitions.
*
************************************************************/
+#include <linux/kernel.h>
#include <linux/crc32.h>
#include <linux/ctype.h>
#include <linux/math64.h>
@@ -149,34 +153,89 @@ static u64 last_lba(struct block_device *bdev)
bdev_logical_block_size(bdev)) - 1ULL;
}
-static inline int
-pmbr_part_valid(struct partition *part)
+static inline int pmbr_part_valid(gpt_mbr_record *part)
{
- if (part->sys_ind == EFI_PMBR_OSTYPE_EFI_GPT &&
- le32_to_cpu(part->start_sect) == 1UL)
- return 1;
- return 0;
+ if (part->os_type != EFI_PMBR_OSTYPE_EFI_GPT)
+ goto invalid;
+
+ /* set to 0x00000001 (i.e., the LBA of the GPT Partition Header) */
+ if (le32_to_cpu(part->starting_lba) != GPT_PRIMARY_PARTITION_TABLE_LBA)
+ goto invalid;
+
+ return GPT_MBR_PROTECTIVE;
+invalid:
+ return 0;
}
/**
* is_pmbr_valid(): test Protective MBR for validity
* @mbr: pointer to a legacy mbr structure
+ * @total_sectors: amount of sectors in the device
*
- * Description: Returns 1 if PMBR is valid, 0 otherwise.
- * Validity depends on two things:
+ * Description: Checks for a valid protective or hybrid
+ * master boot record (MBR). The validity of a pMBR depends
+ * on all of the following properties:
* 1) MSDOS signature is in the last two bytes of the MBR
* 2) One partition of type 0xEE is found
+ *
+ * In addition, a hybrid MBR will have up to three additional
+ * primary partitions, which point to the same space that's
+ * marked out by up to three GPT partitions.
+ *
+ * Returns 0 upon invalid MBR, or GPT_MBR_PROTECTIVE or
+ * GPT_MBR_HYBRID depending on the device layout.
*/
-static int
-is_pmbr_valid(legacy_mbr *mbr)
+static int is_pmbr_valid(legacy_mbr *mbr, sector_t total_sectors)
{
- int i;
+ uint32_t sz = 0;
+ int i, part = 0, ret = 0; /* invalid by default */
+
if (!mbr || le16_to_cpu(mbr->signature) != MSDOS_MBR_SIGNATURE)
- return 0;
+ goto done;
+
+ for (i = 0; i < 4; i++) {
+ ret = pmbr_part_valid(&mbr->partition_record[i]);
+ if (ret == GPT_MBR_PROTECTIVE) {
+ part = i;
+ /*
+ * Ok, we at least know that there's a protective MBR,
+ * now check if there are other partition types for
+ * hybrid MBR.
+ */
+ goto check_hybrid;
+ }
+ }
+
+ if (ret != GPT_MBR_PROTECTIVE)
+ goto done;
+check_hybrid:
for (i = 0; i < 4; i++)
- if (pmbr_part_valid(&mbr->partition_record[i]))
- return 1;
- return 0;
+ if ((mbr->partition_record[i].os_type !=
+ EFI_PMBR_OSTYPE_EFI_GPT) &&
+ (mbr->partition_record[i].os_type != 0x00))
+ ret = GPT_MBR_HYBRID;
+
+ /*
+ * Protective MBRs take up the lesser of the whole disk
+ * or 2 TiB (32bit LBA), ignoring the rest of the disk.
+ * Some partitioning programs, nonetheless, choose to set
+ * the size to the maximum 32-bit limitation, disregarding
+ * the disk size.
+ *
+ * Hybrid MBRs do not necessarily comply with this.
+ *
+ * Consider a bad value here to be a warning to support dd'ing
+ * an image from a smaller disk to a larger disk.
+ */
+ if (ret == GPT_MBR_PROTECTIVE) {
+ sz = le32_to_cpu(mbr->partition_record[part].size_in_lba);
+ if (sz != (uint32_t) total_sectors - 1 && sz != 0xFFFFFFFF)
+ pr_debug("GPT: mbr size in lba (%u) different than whole disk (%u).\n",
+ sz, min_t(uint32_t,
+ total_sectors - 1, 0xFFFFFFFF));
+ }
+done:
+ return ret;
}
/**
@@ -243,8 +302,7 @@ static gpt_entry *alloc_read_gpt_entries(struct parsed_partitions *state,
return NULL;
if (read_lba(state, le64_to_cpu(gpt->partition_entry_lba),
- (u8 *) pte,
- count) < count) {
+ (u8 *) pte, count) < count) {
kfree(pte);
pte=NULL;
return NULL;
@@ -364,7 +422,12 @@ static int is_gpt_valid(struct parsed_partitions *state, u64 lba,
(unsigned long long)lastlba);
goto fail;
}
-
+ if (le64_to_cpu((*gpt)->last_usable_lba) < le64_to_cpu((*gpt)->first_usable_lba)) {
+ pr_debug("GPT: last_usable_lba incorrect: %lld > %lld\n",
+ (unsigned long long)le64_to_cpu((*gpt)->last_usable_lba),
+ (unsigned long long)le64_to_cpu((*gpt)->first_usable_lba));
+ goto fail;
+ }
/* Check that sizeof_partition_entry has the correct value */
if (le32_to_cpu((*gpt)->sizeof_partition_entry) != sizeof(gpt_entry)) {
pr_debug("GUID Partitition Entry Size check failed.\n");
@@ -429,44 +492,42 @@ compare_gpts(gpt_header *pgpt, gpt_header *agpt, u64 lastlba)
if (!pgpt || !agpt)
return;
if (le64_to_cpu(pgpt->my_lba) != le64_to_cpu(agpt->alternate_lba)) {
- printk(KERN_WARNING
- "GPT:Primary header LBA != Alt. header alternate_lba\n");
- printk(KERN_WARNING "GPT:%lld != %lld\n",
+ pr_warn("GPT:Primary header LBA != Alt. header alternate_lba\n");
+ pr_warn("GPT:%lld != %lld\n",
(unsigned long long)le64_to_cpu(pgpt->my_lba),
(unsigned long long)le64_to_cpu(agpt->alternate_lba));
error_found++;
}
if (le64_to_cpu(pgpt->alternate_lba) != le64_to_cpu(agpt->my_lba)) {
- printk(KERN_WARNING
- "GPT:Primary header alternate_lba != Alt. header my_lba\n");
- printk(KERN_WARNING "GPT:%lld != %lld\n",
+ pr_warn("GPT:Primary header alternate_lba != Alt. header my_lba\n");
+ pr_warn("GPT:%lld != %lld\n",
(unsigned long long)le64_to_cpu(pgpt->alternate_lba),
(unsigned long long)le64_to_cpu(agpt->my_lba));
error_found++;
}
if (le64_to_cpu(pgpt->first_usable_lba) !=
le64_to_cpu(agpt->first_usable_lba)) {
- printk(KERN_WARNING "GPT:first_usable_lbas don't match.\n");
- printk(KERN_WARNING "GPT:%lld != %lld\n",
+ pr_warn("GPT:first_usable_lbas don't match.\n");
+ pr_warn("GPT:%lld != %lld\n",
(unsigned long long)le64_to_cpu(pgpt->first_usable_lba),
(unsigned long long)le64_to_cpu(agpt->first_usable_lba));
error_found++;
}
if (le64_to_cpu(pgpt->last_usable_lba) !=
le64_to_cpu(agpt->last_usable_lba)) {
- printk(KERN_WARNING "GPT:last_usable_lbas don't match.\n");
- printk(KERN_WARNING "GPT:%lld != %lld\n",
+ pr_warn("GPT:last_usable_lbas don't match.\n");
+ pr_warn("GPT:%lld != %lld\n",
(unsigned long long)le64_to_cpu(pgpt->last_usable_lba),
(unsigned long long)le64_to_cpu(agpt->last_usable_lba));
error_found++;
}
if (efi_guidcmp(pgpt->disk_guid, agpt->disk_guid)) {
- printk(KERN_WARNING "GPT:disk_guids don't match.\n");
+ pr_warn("GPT:disk_guids don't match.\n");
error_found++;
}
if (le32_to_cpu(pgpt->num_partition_entries) !=
le32_to_cpu(agpt->num_partition_entries)) {
- printk(KERN_WARNING "GPT:num_partition_entries don't match: "
+ pr_warn("GPT:num_partition_entries don't match: "
"0x%x != 0x%x\n",
le32_to_cpu(pgpt->num_partition_entries),
le32_to_cpu(agpt->num_partition_entries));
@@ -474,8 +535,7 @@ compare_gpts(gpt_header *pgpt, gpt_header *agpt, u64 lastlba)
}
if (le32_to_cpu(pgpt->sizeof_partition_entry) !=
le32_to_cpu(agpt->sizeof_partition_entry)) {
- printk(KERN_WARNING
- "GPT:sizeof_partition_entry values don't match: "
+ pr_warn("GPT:sizeof_partition_entry values don't match: "
"0x%x != 0x%x\n",
le32_to_cpu(pgpt->sizeof_partition_entry),
le32_to_cpu(agpt->sizeof_partition_entry));
@@ -483,34 +543,30 @@ compare_gpts(gpt_header *pgpt, gpt_header *agpt, u64 lastlba)
}
if (le32_to_cpu(pgpt->partition_entry_array_crc32) !=
le32_to_cpu(agpt->partition_entry_array_crc32)) {
- printk(KERN_WARNING
- "GPT:partition_entry_array_crc32 values don't match: "
+ pr_warn("GPT:partition_entry_array_crc32 values don't match: "
"0x%x != 0x%x\n",
le32_to_cpu(pgpt->partition_entry_array_crc32),
le32_to_cpu(agpt->partition_entry_array_crc32));
error_found++;
}
if (le64_to_cpu(pgpt->alternate_lba) != lastlba) {
- printk(KERN_WARNING
- "GPT:Primary header thinks Alt. header is not at the end of the disk.\n");
- printk(KERN_WARNING "GPT:%lld != %lld\n",
+ pr_warn("GPT:Primary header thinks Alt. header is not at the end of the disk.\n");
+ pr_warn("GPT:%lld != %lld\n",
(unsigned long long)le64_to_cpu(pgpt->alternate_lba),
(unsigned long long)lastlba);
error_found++;
}
if (le64_to_cpu(agpt->my_lba) != lastlba) {
- printk(KERN_WARNING
- "GPT:Alternate GPT header not at the end of the disk.\n");
- printk(KERN_WARNING "GPT:%lld != %lld\n",
+ pr_warn("GPT:Alternate GPT header not at the end of the disk.\n");
+ pr_warn("GPT:%lld != %lld\n",
(unsigned long long)le64_to_cpu(agpt->my_lba),
(unsigned long long)lastlba);
error_found++;
}
if (error_found)
- printk(KERN_WARNING
- "GPT: Use GNU Parted to correct GPT errors.\n");
+ pr_warn("GPT: Use GNU Parted to correct GPT errors.\n");
return;
}
@@ -536,6 +592,7 @@ static int find_valid_gpt(struct parsed_partitions *state, gpt_header **gpt,
gpt_header *pgpt = NULL, *agpt = NULL;
gpt_entry *pptes = NULL, *aptes = NULL;
legacy_mbr *legacymbr;
+ sector_t total_sectors = i_size_read(state->bdev->bd_inode) >> 9;
u64 lastlba;
if (!ptes)
@@ -543,17 +600,22 @@ static int find_valid_gpt(struct parsed_partitions *state, gpt_header **gpt,
lastlba = last_lba(state->bdev);
if (!force_gpt) {
- /* This will be added to the EFI Spec. per Intel after v1.02. */
- legacymbr = kzalloc(sizeof (*legacymbr), GFP_KERNEL);
- if (legacymbr) {
- read_lba(state, 0, (u8 *) legacymbr,
- sizeof (*legacymbr));
- good_pmbr = is_pmbr_valid(legacymbr);
- kfree(legacymbr);
- }
- if (!good_pmbr)
- goto fail;
- }
+ /* This will be added to the EFI Spec. per Intel after v1.02. */
+ legacymbr = kzalloc(sizeof(*legacymbr), GFP_KERNEL);
+ if (!legacymbr)
+ goto fail;
+
+ read_lba(state, 0, (u8 *)legacymbr, sizeof(*legacymbr));
+ good_pmbr = is_pmbr_valid(legacymbr, total_sectors);
+ kfree(legacymbr);
+
+ if (!good_pmbr)
+ goto fail;
+
+ pr_debug("Device has a %s MBR\n",
+ good_pmbr == GPT_MBR_PROTECTIVE ?
+ "protective" : "hybrid");
+ }
good_pgpt = is_gpt_valid(state, GPT_PRIMARY_PARTITION_TABLE_LBA,
&pgpt, &pptes);
@@ -576,11 +638,8 @@ static int find_valid_gpt(struct parsed_partitions *state, gpt_header **gpt,
*ptes = pptes;
kfree(agpt);
kfree(aptes);
- if (!good_agpt) {
- printk(KERN_WARNING
- "Alternate GPT is invalid, "
- "using primary GPT.\n");
- }
+ if (!good_agpt)
+ pr_warn("Alternate GPT is invalid, using primary GPT.\n");
return 1;
}
else if (good_agpt) {
@@ -588,8 +647,7 @@ static int find_valid_gpt(struct parsed_partitions *state, gpt_header **gpt,
*ptes = aptes;
kfree(pgpt);
kfree(pptes);
- printk(KERN_WARNING
- "Primary GPT is invalid, using alternate GPT.\n");
+ pr_warn("Primary GPT is invalid, using alternate GPT.\n");
return 1;
}
@@ -651,16 +709,15 @@ int efi_partition(struct parsed_partitions *state)
put_partition(state, i+1, start * ssz, size * ssz);
/* If this is a RAID volume, tell md */
- if (!efi_guidcmp(ptes[i].partition_type_guid,
- PARTITION_LINUX_RAID_GUID))
+ if (!efi_guidcmp(ptes[i].partition_type_guid, PARTITION_LINUX_RAID_GUID))
state->parts[i + 1].flags = ADDPART_FLAG_RAID;
info = &state->parts[i + 1].info;
efi_guid_unparse(&ptes[i].unique_partition_guid, info->uuid);
/* Naively convert UTF16-LE to 7 bits. */
- label_max = min(sizeof(info->volname) - 1,
- sizeof(ptes[i].partition_name));
+ label_max = min(ARRAY_SIZE(info->volname) - 1,
+ ARRAY_SIZE(ptes[i].partition_name));
info->volname[label_max] = 0;
while (label_count < label_max) {
u8 c = ptes[i].partition_name[label_count] & 0xff;
diff --git a/block/partitions/efi.h b/block/partitions/efi.h
index b69ab72..4efcafb 100644
--- a/block/partitions/efi.h
+++ b/block/partitions/efi.h
@@ -37,6 +37,9 @@
#define EFI_PMBR_OSTYPE_EFI 0xEF
#define EFI_PMBR_OSTYPE_EFI_GPT 0xEE
+#define GPT_MBR_PROTECTIVE 1
+#define GPT_MBR_HYBRID 2
+
#define GPT_HEADER_SIGNATURE 0x5452415020494645ULL
#define GPT_HEADER_REVISION_V1 0x00010000
#define GPT_PRIMARY_PARTITION_TABLE_LBA 1
@@ -101,11 +104,25 @@ typedef struct _gpt_entry {
efi_char16_t partition_name[72 / sizeof (efi_char16_t)];
} __attribute__ ((packed)) gpt_entry;
+typedef struct _gpt_mbr_record {
+ u8 boot_indicator; /* unused by EFI, set to 0x80 for bootable */
+ u8 start_head; /* unused by EFI, pt start in CHS */
+ u8 start_sector; /* unused by EFI, pt start in CHS */
+ u8 start_track; /* NOTE(review): presumably pt start in CHS like its neighbours */
+ u8 os_type; /* EFI and legacy non-EFI OS types */
+ u8 end_head; /* unused by EFI, pt end in CHS */
+ u8 end_sector; /* unused by EFI, pt end in CHS */
+ u8 end_track; /* unused by EFI, pt end in CHS */
+ __le32 starting_lba; /* used by EFI - start addr of the on disk pt */
+ __le32 size_in_lba; /* used by EFI - size of pt in LBA */
+} __packed gpt_mbr_record;
+
+
typedef struct _legacy_mbr {
u8 boot_code[440];
__le32 unique_mbr_signature;
__le16 unknown;
- struct partition partition_record[4];
+ gpt_mbr_record partition_record[4];
__le16 signature;
} __attribute__ ((packed)) legacy_mbr;
@@ -113,22 +130,3 @@ typedef struct _legacy_mbr {
extern int efi_partition(struct parsed_partitions *state);
#endif
-
-/*
- * Overrides for Emacs so that we follow Linus's tabbing style.
- * Emacs will notice this stuff at the end of the file and automatically
- * adjust the settings for this buffer only. This must remain at the end
- * of the file.
- * --------------------------------------------------------------------------
- * Local variables:
- * c-indent-level: 4
- * c-brace-imaginary-offset: 0
- * c-brace-offset: -4
- * c-argdecl-indent: 4
- * c-label-offset: -4
- * c-continued-statement-offset: 4
- * c-continued-brace-offset: 0
- * indent-tabs-mode: nil
- * tab-width: 8
- * End:
- */
diff --git a/block/partitions/msdos.c b/block/partitions/msdos.c
index 7681cd2..9123f25 100644
--- a/block/partitions/msdos.c
+++ b/block/partitions/msdos.c
@@ -23,6 +23,7 @@
#include "check.h"
#include "msdos.h"
#include "efi.h"
+#include "aix.h"
/*
* Many architectures don't like unaligned accesses, while
@@ -90,7 +91,7 @@ static int aix_magic_present(struct parsed_partitions *state, unsigned char *p)
if (d[0] == '_' && d[1] == 'L' && d[2] == 'V' && d[3] == 'M')
ret = 1;
put_dev_sector(sect);
- };
+ }
return ret;
}
@@ -142,7 +143,7 @@ static void parse_extended(struct parsed_partitions *state,
return;
if (!msdos_magic_present(data + 510))
- goto done;
+ goto done;
p = (struct partition *) (data + 0x1be);
@@ -155,7 +156,7 @@ static void parse_extended(struct parsed_partitions *state,
* and OS/2 seems to use all four entries.
*/
- /*
+ /*
* First process the data partition(s)
*/
for (i=0; i<4; i++, p++) {
@@ -263,7 +264,7 @@ static void parse_solaris_x86(struct parsed_partitions *state,
}
#if defined(CONFIG_BSD_DISKLABEL)
-/*
+/*
* Create devices for BSD partitions listed in a disklabel, under a
* dos-like partition. See parse_extended() for more information.
*/
@@ -294,7 +295,7 @@ static void parse_bsd(struct parsed_partitions *state,
if (state->next == state->limit)
break;
- if (p->p_fstype == BSD_FS_UNUSED)
+ if (p->p_fstype == BSD_FS_UNUSED)
continue;
bsd_start = le32_to_cpu(p->p_offset);
bsd_size = le32_to_cpu(p->p_size);
@@ -441,7 +442,7 @@ static struct {
{NEW_SOLARIS_X86_PARTITION, parse_solaris_x86},
{0, NULL},
};
-
+
int msdos_partition(struct parsed_partitions *state)
{
sector_t sector_size = bdev_logical_block_size(state->bdev) / 512;
@@ -462,8 +463,12 @@ int msdos_partition(struct parsed_partitions *state)
*/
if (aix_magic_present(state, data)) {
put_dev_sector(sect);
+#ifdef CONFIG_AIX_PARTITION
+ return aix_partition(state);
+#else
strlcat(state->pp_buf, " [AIX]", PAGE_SIZE);
return 0;
+#endif
}
if (!msdos_magic_present(data + 510)) {
diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
index a5ffcc9..625e3e4 100644
--- a/block/scsi_ioctl.c
+++ b/block/scsi_ioctl.c
@@ -286,7 +286,8 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk,
struct sg_io_hdr *hdr, fmode_t mode)
{
unsigned long start_time;
- int writing = 0, ret = 0;
+ ssize_t ret = 0;
+ int writing = 0;
struct request *rq;
char sense[SCSI_SENSE_BUFFERSIZE];
struct bio *bio;
@@ -321,37 +322,16 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk,
}
if (hdr->iovec_count) {
- const int size = sizeof(struct sg_iovec) * hdr->iovec_count;
size_t iov_data_len;
- struct sg_iovec *sg_iov;
struct iovec *iov;
- int i;
- sg_iov = kmalloc(size, GFP_KERNEL);
- if (!sg_iov) {
- ret = -ENOMEM;
+ ret = rw_copy_check_uvector(-1, hdr->dxferp, hdr->iovec_count,
+ 0, NULL, &iov);
+ if (ret < 0)
goto out;
- }
-
- if (copy_from_user(sg_iov, hdr->dxferp, size)) {
- kfree(sg_iov);
- ret = -EFAULT;
- goto out;
- }
- /*
- * Sum up the vecs, making sure they don't overflow
- */
- iov = (struct iovec *) sg_iov;
- iov_data_len = 0;
- for (i = 0; i < hdr->iovec_count; i++) {
- if (iov_data_len + iov[i].iov_len < iov_data_len) {
- kfree(sg_iov);
- ret = -EINVAL;
- goto out;
- }
- iov_data_len += iov[i].iov_len;
- }
+ iov_data_len = ret;
+ ret = 0;
/* SG_IO howto says that the shorter of the two wins */
if (hdr->dxfer_len < iov_data_len) {
@@ -361,9 +341,10 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk,
iov_data_len = hdr->dxfer_len;
}
- ret = blk_rq_map_user_iov(q, rq, NULL, sg_iov, hdr->iovec_count,
+ ret = blk_rq_map_user_iov(q, rq, NULL, (struct sg_iovec *) iov,
+ hdr->iovec_count,
iov_data_len, GFP_KERNEL);
- kfree(sg_iov);
+ kfree(iov);
} else if (hdr->dxfer_len)
ret = blk_rq_map_user(q, rq, NULL, hdr->dxferp, hdr->dxfer_len,
GFP_KERNEL);