From eb571eeade2598635f813b3284d02c13a380301e Mon Sep 17 00:00:00 2001 From: Joe Lawrence Date: Wed, 2 Jul 2014 15:35:16 -0400 Subject: block,scsi: verify return pointer from blk_get_request The blk-core dead queue checks introduce an error scenario to blk_get_request that returns NULL if the request queue has been shutdown. This affects the behavior for __GFP_WAIT callers, who should verify the return value before dereferencing. Signed-off-by: Joe Lawrence Acked-by: Jiri Kosina [for pktdvd] Reviewed-by: Jeff Moyer Signed-off-by: Jens Axboe diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index 51bf515..29d0567 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c @@ -448,6 +448,10 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode, } rq = blk_get_request(q, in_len ? WRITE : READ, __GFP_WAIT); + if (!rq) { + err = -ENODEV; + goto error_free_buffer; + } cmdlen = COMMAND_SIZE(opcode); @@ -520,8 +524,9 @@ out: } error: - kfree(buffer); blk_put_request(rq); +error_free_buffer: + kfree(buffer); return err; } EXPORT_SYMBOL_GPL(sg_scsi_ioctl); @@ -534,6 +539,8 @@ static int __blk_send_generic(struct request_queue *q, struct gendisk *bd_disk, int err; rq = blk_get_request(q, WRITE, __GFP_WAIT); + if (!rq) + return -ENODEV; blk_rq_set_block_pc(rq); rq->timeout = BLK_DEFAULT_SG_TIMEOUT; rq->cmd[0] = cmd; diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c index fea7e76..ca831f7 100644 --- a/drivers/block/paride/pd.c +++ b/drivers/block/paride/pd.c @@ -722,6 +722,8 @@ static int pd_special_command(struct pd_unit *disk, int err = 0; rq = blk_get_request(disk->gd->queue, READ, __GFP_WAIT); + if (!rq) + return -ENODEV; rq->cmd_type = REQ_TYPE_SPECIAL; rq->special = func; diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 758ac44..7fa8c80 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -704,6 +704,8 @@ static int pkt_generic_packet(struct pktcdvd_device *pd, struct packet_command * rq = blk_get_request(q, (cgc->data_direction == CGC_DATA_WRITE) ? WRITE : READ, __GFP_WAIT); + if (!rq) + return -ENODEV; blk_rq_set_block_pc(rq); if (cgc->buflen) { diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c index 5db8454..4c433bf 100644 --- a/drivers/scsi/scsi_error.c +++ b/drivers/scsi/scsi_error.c @@ -1960,6 +1960,8 @@ static void scsi_eh_lock_door(struct scsi_device *sdev) * request becomes available */ req = blk_get_request(sdev->request_queue, READ, GFP_KERNEL); + if (!req) + return; blk_rq_set_block_pc(req); -- cgit v0.10.2 From a492f075450f3ba87de36e5ffe92a9d0c7af9723 Mon Sep 17 00:00:00 2001 From: Joe Lawrence Date: Thu, 28 Aug 2014 08:15:21 -0600 Subject: block,scsi: fixup blk_get_request dead queue scenarios The blk_get_request function may fail in low-memory conditions or during device removal (even if __GFP_WAIT is set). To distinguish between these errors, modify the blk_get_request call stack to return the appropriate ERR_PTR. Verify that all callers check the return status and consider IS_ERR instead of a simple NULL pointer check. For consistency, make a similar change to the blk_mq_alloc_request leg of blk_get_request. It may fail if the queue is dead, or the caller was unwilling to wait. Signed-off-by: Joe Lawrence Acked-by: Jiri Kosina [for pktdvd] Acked-by: Boaz Harrosh [for osd] Reviewed-by: Jeff Moyer Signed-off-by: Jens Axboe diff --git a/block/blk-core.c b/block/blk-core.c index c359d72..93603e6 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -933,9 +933,9 @@ static struct io_context *rq_ioc(struct bio *bio) * Get a free request from @q. This function may fail under memory * pressure or if @q is dead. * - * Must be callled with @q->queue_lock held and, - * Returns %NULL on failure, with @q->queue_lock held. - * Returns !%NULL on success, with @q->queue_lock *not held*. + * Must be called with @q->queue_lock held and, + * Returns ERR_PTR on failure, with @q->queue_lock held. + * Returns request pointer on success, with @q->queue_lock *not held*. */ static struct request *__get_request(struct request_list *rl, int rw_flags, struct bio *bio, gfp_t gfp_mask) @@ -949,7 +949,7 @@ static struct request *__get_request(struct request_list *rl, int rw_flags, int may_queue; if (unlikely(blk_queue_dying(q))) - return NULL; + return ERR_PTR(-ENODEV); may_queue = elv_may_queue(q, rw_flags); if (may_queue == ELV_MQUEUE_NO) @@ -974,7 +974,7 @@ static struct request *__get_request(struct request_list *rl, int rw_flags, * process is not a "batcher", and not * exempted by the IO scheduler */ - return NULL; + return ERR_PTR(-ENOMEM); } } } @@ -992,7 +992,7 @@ static struct request *__get_request(struct request_list *rl, int rw_flags, * allocated with any setting of ->nr_requests */ if (rl->count[is_sync] >= (3 * q->nr_requests / 2)) - return NULL; + return ERR_PTR(-ENOMEM); q->nr_rqs[is_sync]++; rl->count[is_sync]++; @@ -1097,7 +1097,7 @@ fail_alloc: rq_starved: if (unlikely(rl->count[is_sync] == 0)) rl->starved[is_sync] = 1; - return NULL; + return ERR_PTR(-ENOMEM); } /** @@ -1110,9 +1110,9 @@ rq_starved: * Get a free request from @q. If %__GFP_WAIT is set in @gfp_mask, this * function keeps retrying under memory pressure and fails iff @q is dead. * - * Must be callled with @q->queue_lock held and, - * Returns %NULL on failure, with @q->queue_lock held. - * Returns !%NULL on success, with @q->queue_lock *not held*. + * Must be called with @q->queue_lock held and, + * Returns ERR_PTR on failure, with @q->queue_lock held. + * Returns request pointer on success, with @q->queue_lock *not held*. */ static struct request *get_request(struct request_queue *q, int rw_flags, struct bio *bio, gfp_t gfp_mask) @@ -1125,12 +1125,12 @@ static struct request *get_request(struct request_queue *q, int rw_flags, rl = blk_get_rl(q, bio); /* transferred to @rq on success */ retry: rq = __get_request(rl, rw_flags, bio, gfp_mask); - if (rq) + if (!IS_ERR(rq)) return rq; if (!(gfp_mask & __GFP_WAIT) || unlikely(blk_queue_dying(q))) { blk_put_rl(rl); - return NULL; + return rq; } /* wait on @rl and retry */ @@ -1167,7 +1167,7 @@ static struct request *blk_old_get_request(struct request_queue *q, int rw, spin_lock_irq(q->queue_lock); rq = get_request(q, rw, NULL, gfp_mask); - if (!rq) + if (IS_ERR(rq)) spin_unlock_irq(q->queue_lock); /* q->queue_lock is unlocked at this point */ @@ -1219,8 +1219,8 @@ struct request *blk_make_request(struct request_queue *q, struct bio *bio, { struct request *rq = blk_get_request(q, bio_data_dir(bio), gfp_mask); - if (unlikely(!rq)) - return ERR_PTR(-ENOMEM); + if (IS_ERR(rq)) + return rq; blk_rq_set_block_pc(rq); @@ -1615,8 +1615,8 @@ get_rq: * Returns with the queue unlocked. */ req = get_request(q, rw_flags, bio, GFP_NOIO); - if (unlikely(!req)) { - bio_endio(bio, -ENODEV); /* @q is dead */ + if (IS_ERR(req)) { + bio_endio(bio, PTR_ERR(req)); /* @q is dead */ goto out_unlock; } diff --git a/block/blk-mq.c b/block/blk-mq.c index 5189cb1..940aa8a 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -218,9 +218,11 @@ struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp, struct blk_mq_hw_ctx *hctx; struct request *rq; struct blk_mq_alloc_data alloc_data; + int ret; - if (blk_mq_queue_enter(q)) - return NULL; + ret = blk_mq_queue_enter(q); + if (ret) + return ERR_PTR(ret); ctx = blk_mq_get_ctx(q); hctx = q->mq_ops->map_queue(q, ctx->cpu); @@ -240,6 +242,8 @@ struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp, ctx = alloc_data.ctx; } blk_mq_put_ctx(ctx); + if (!rq) + return ERR_PTR(-EWOULDBLOCK); return rq; } EXPORT_SYMBOL(blk_mq_alloc_request); diff --git a/block/bsg.c b/block/bsg.c index ff46add..73c78fd 100644 --- a/block/bsg.c +++ b/block/bsg.c @@ -270,8 +270,8 @@ bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t has_write_perm, * map scatter-gather elements separately and string them to request */ rq = blk_get_request(q, rw, GFP_KERNEL); - if (!rq) - return ERR_PTR(-ENOMEM); + if (IS_ERR(rq)) + return rq; blk_rq_set_block_pc(rq); ret = blk_fill_sgv4_hdr_rq(q, rq, hdr, bd, has_write_perm); @@ -285,8 +285,8 @@ bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t has_write_perm, } next_rq = blk_get_request(q, READ, GFP_KERNEL); - if (!next_rq) { - ret = -ENOMEM; + if (IS_ERR(next_rq)) { + ret = PTR_ERR(next_rq); goto out; } rq->next_rq = next_rq; diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index 29d0567..a8b0d02 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c @@ -318,8 +318,8 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk, at_head = 1; rq = blk_get_request(q, writing ? WRITE : READ, GFP_KERNEL); - if (!rq) - return -ENOMEM; + if (IS_ERR(rq)) + return PTR_ERR(rq); blk_rq_set_block_pc(rq); if (blk_fill_sghdr_rq(q, rq, hdr, mode)) { @@ -448,8 +448,8 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode, } rq = blk_get_request(q, in_len ? WRITE : READ, __GFP_WAIT); - if (!rq) { - err = -ENODEV; + if (IS_ERR(rq)) { + err = PTR_ERR(rq); goto error_free_buffer; } @@ -539,8 +539,8 @@ static int __blk_send_generic(struct request_queue *q, struct gendisk *bd_disk, int err; rq = blk_get_request(q, WRITE, __GFP_WAIT); - if (!rq) - return -ENODEV; + if (IS_ERR(rq)) + return PTR_ERR(rq); blk_rq_set_block_pc(rq); rq->timeout = BLK_DEFAULT_SG_TIMEOUT; rq->cmd[0] = cmd; diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c index ca831f7..d48715b 100644 --- a/drivers/block/paride/pd.c +++ b/drivers/block/paride/pd.c @@ -722,8 +722,8 @@ static int pd_special_command(struct pd_unit *disk, int err = 0; rq = blk_get_request(disk->gd->queue, READ, __GFP_WAIT); - if (!rq) - return -ENODEV; + if (IS_ERR(rq)) + return PTR_ERR(rq); rq->cmd_type = REQ_TYPE_SPECIAL; rq->special = func; diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 7fa8c80..09e628da 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -704,8 +704,8 @@ static int pkt_generic_packet(struct pktcdvd_device *pd, struct packet_command * rq = blk_get_request(q, (cgc->data_direction == CGC_DATA_WRITE) ? WRITE : READ, __GFP_WAIT); - if (!rq) - return -ENODEV; + if (IS_ERR(rq)) + return PTR_ERR(rq); blk_rq_set_block_pc(rq); if (cgc->buflen) { diff --git a/drivers/block/sx8.c b/drivers/block/sx8.c index d5e2d12..5d55285 100644 --- a/drivers/block/sx8.c +++ b/drivers/block/sx8.c @@ -568,7 +568,7 @@ static struct carm_request *carm_get_special(struct carm_host *host) return NULL; rq = blk_get_request(host->oob_q, WRITE /* bogus */, GFP_KERNEL); - if (!rq) { + if (IS_ERR(rq)) { spin_lock_irqsave(&host->lock, flags); carm_put_request(host, crq); spin_unlock_irqrestore(&host->lock, flags); diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c index 898b84b..5d28a45 100644 --- a/drivers/cdrom/cdrom.c +++ b/drivers/cdrom/cdrom.c @@ -2180,8 +2180,8 @@ static int cdrom_read_cdda_bpc(struct cdrom_device_info *cdi, __u8 __user *ubuf, len = nr * CD_FRAMESIZE_RAW; rq = blk_get_request(q, READ, GFP_KERNEL); - if (!rq) { - ret = -ENOMEM; + if (IS_ERR(rq)) { + ret = PTR_ERR(rq); break; } blk_rq_set_block_pc(rq); diff --git a/drivers/ide/ide-park.c b/drivers/ide/ide-park.c index f41558a..ca95860 100644 --- a/drivers/ide/ide-park.c +++ b/drivers/ide/ide-park.c @@ -46,7 +46,7 @@ static void issue_park_cmd(ide_drive_t *drive, unsigned long timeout) * timeout has expired, so power management will be reenabled. */ rq = blk_get_request(q, READ, GFP_NOWAIT); - if (unlikely(!rq)) + if (IS_ERR(rq)) goto out; rq->cmd[0] = REQ_UNPARK_HEADS; diff --git a/drivers/scsi/device_handler/scsi_dh_alua.c b/drivers/scsi/device_handler/scsi_dh_alua.c index 7bcf67e..e99507e 100644 --- a/drivers/scsi/device_handler/scsi_dh_alua.c +++ b/drivers/scsi/device_handler/scsi_dh_alua.c @@ -115,7 +115,7 @@ static struct request *get_alua_req(struct scsi_device *sdev, rq = blk_get_request(q, rw, GFP_NOIO); - if (!rq) { + if (IS_ERR(rq)) { sdev_printk(KERN_INFO, sdev, "%s: blk_get_request failed\n", __func__); return NULL; diff --git a/drivers/scsi/device_handler/scsi_dh_emc.c b/drivers/scsi/device_handler/scsi_dh_emc.c index 6f07f7f..8476538 100644 --- a/drivers/scsi/device_handler/scsi_dh_emc.c +++ b/drivers/scsi/device_handler/scsi_dh_emc.c @@ -275,7 +275,7 @@ static struct request *get_req(struct scsi_device *sdev, int cmd, rq = blk_get_request(sdev->request_queue, (cmd != INQUIRY) ? WRITE : READ, GFP_NOIO); - if (!rq) { + if (IS_ERR(rq)) { sdev_printk(KERN_INFO, sdev, "get_req: blk_get_request failed"); return NULL; } diff --git a/drivers/scsi/device_handler/scsi_dh_hp_sw.c b/drivers/scsi/device_handler/scsi_dh_hp_sw.c index e9d9fea..4ee2759 100644 --- a/drivers/scsi/device_handler/scsi_dh_hp_sw.c +++ b/drivers/scsi/device_handler/scsi_dh_hp_sw.c @@ -117,7 +117,7 @@ static int hp_sw_tur(struct scsi_device *sdev, struct hp_sw_dh_data *h) retry: req = blk_get_request(sdev->request_queue, WRITE, GFP_NOIO); - if (!req) + if (IS_ERR(req)) return SCSI_DH_RES_TEMP_UNAVAIL; blk_rq_set_block_pc(req); @@ -247,7 +247,7 @@ static int hp_sw_start_stop(struct hp_sw_dh_data *h) struct request *req; req = blk_get_request(h->sdev->request_queue, WRITE, GFP_ATOMIC); - if (!req) + if (IS_ERR(req)) return SCSI_DH_RES_TEMP_UNAVAIL; blk_rq_set_block_pc(req); diff --git a/drivers/scsi/device_handler/scsi_dh_rdac.c b/drivers/scsi/device_handler/scsi_dh_rdac.c index 826069d..1b5bc92 100644 --- a/drivers/scsi/device_handler/scsi_dh_rdac.c +++ b/drivers/scsi/device_handler/scsi_dh_rdac.c @@ -274,7 +274,7 @@ static struct request *get_rdac_req(struct scsi_device *sdev, rq = blk_get_request(q, rw, GFP_NOIO); - if (!rq) { + if (IS_ERR(rq)) { sdev_printk(KERN_INFO, sdev, "get_rdac_req: blk_get_request failed.\n"); return NULL; diff --git a/drivers/scsi/osd/osd_initiator.c b/drivers/scsi/osd/osd_initiator.c index 5f4cbf0..fd19fd8 100644 --- a/drivers/scsi/osd/osd_initiator.c +++ b/drivers/scsi/osd/osd_initiator.c @@ -1567,8 +1567,8 @@ static struct request *_make_request(struct request_queue *q, bool has_write, struct request *req; req = blk_get_request(q, has_write ? WRITE : READ, flags); - if (unlikely(!req)) - return ERR_PTR(-ENOMEM); + if (IS_ERR(req)) + return req; blk_rq_set_block_pc(req); return req; diff --git a/drivers/scsi/osst.c b/drivers/scsi/osst.c index 0727ea7..dff37a25 100644 --- a/drivers/scsi/osst.c +++ b/drivers/scsi/osst.c @@ -362,7 +362,7 @@ static int osst_execute(struct osst_request *SRpnt, const unsigned char *cmd, int write = (data_direction == DMA_TO_DEVICE); req = blk_get_request(SRpnt->stp->device->request_queue, write, GFP_KERNEL); - if (!req) + if (IS_ERR(req)) return DRIVER_ERROR << 24; blk_rq_set_block_pc(req); diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c index 4c433bf..a2c3d3d 100644 --- a/drivers/scsi/scsi_error.c +++ b/drivers/scsi/scsi_error.c @@ -1960,7 +1960,7 @@ static void scsi_eh_lock_door(struct scsi_device *sdev) * request becomes available */ req = blk_get_request(sdev->request_queue, READ, GFP_KERNEL); - if (!req) + if (IS_ERR(req)) return; blk_rq_set_block_pc(req); diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index ce62e87..972d0a8 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -221,7 +221,7 @@ int scsi_execute(struct scsi_device *sdev, const unsigned char *cmd, int ret = DRIVER_ERROR << 24; req = blk_get_request(sdev->request_queue, write, __GFP_WAIT); - if (!req) + if (IS_ERR(req)) return ret; blk_rq_set_block_pc(req); diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index 01cf888..6035444 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -1711,9 +1711,9 @@ sg_start_req(Sg_request *srp, unsigned char *cmd) } rq = blk_get_request(q, rw, GFP_ATOMIC); - if (!rq) { + if (IS_ERR(rq)) { kfree(long_cmdp); - return -ENOMEM; + return PTR_ERR(rq); } blk_rq_set_block_pc(rq); diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c index aff9689..59db5bf 100644 --- a/drivers/scsi/st.c +++ b/drivers/scsi/st.c @@ -490,7 +490,7 @@ static int st_scsi_execute(struct st_request *SRpnt, const unsigned char *cmd, req = blk_get_request(SRpnt->stp->device->request_queue, write, GFP_KERNEL); - if (!req) + if (IS_ERR(req)) return DRIVER_ERROR << 24; blk_rq_set_block_pc(req); diff --git a/drivers/target/target_core_pscsi.c b/drivers/target/target_core_pscsi.c index 943b1db..70d9f6d 100644 --- a/drivers/target/target_core_pscsi.c +++ b/drivers/target/target_core_pscsi.c @@ -1050,7 +1050,7 @@ pscsi_execute_cmd(struct se_cmd *cmd) req = blk_get_request(pdv->pdv_sd->request_queue, (data_direction == DMA_TO_DEVICE), GFP_KERNEL); - if (!req) { + if (IS_ERR(req)) { pr_err("PSCSI: blk_get_request() failed\n"); ret = TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; goto fail; -- cgit v0.10.2 From 55872c5a3c01f0fe7b5298d19e24e237f5b5ff06 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 28 Aug 2014 15:05:40 -0600 Subject: bsg: fix potential error pointer dereference Dan writes: block/bsg.c:327 bsg_map_hdr() error: 'next_rq' dereferencing possible ERR_PTR(). Fix this by setting next_rq to NULL, for the case where it can be != NULL but an error pointer. Reported-by: Dan Carpenter Signed-off-by: Jens Axboe diff --git a/block/bsg.c b/block/bsg.c index 73c78fd..276e869 100644 --- a/block/bsg.c +++ b/block/bsg.c @@ -287,6 +287,7 @@ bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t has_write_perm, next_rq = blk_get_request(q, READ, GFP_KERNEL); if (IS_ERR(next_rq)) { ret = PTR_ERR(next_rq); + next_rq = NULL; goto out; } rq->next_rq = next_rq; -- cgit v0.10.2 From f4da80727cfbc3590d95ff17ef8db96e6f1483a4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 8 Sep 2014 08:15:20 +0900 Subject: blkcg: remove blkcg->id blkcg->id is a unique id given to each blkcg; however, the cgroup_subsys_state which each blkcg embeds already has ->serial_nr which can be used for the same purpose. Drop blkcg->id and replace its uses with blkcg->css.serial_nr. Rename cfq_cgroup->blkcg_id to ->blkcg_serial_nr and @id in check_blkcg_changed() to @serial_nr for consistency. Signed-off-by: Tejun Heo Acked-by: Vivek Goyal Signed-off-by: Jens Axboe diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index e17da94..0ac817b 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -822,7 +822,6 @@ static void blkcg_css_free(struct cgroup_subsys_state *css) static struct cgroup_subsys_state * blkcg_css_alloc(struct cgroup_subsys_state *parent_css) { - static atomic64_t id_seq = ATOMIC64_INIT(0); struct blkcg *blkcg; if (!parent_css) { @@ -836,7 +835,6 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css) blkcg->cfq_weight = CFQ_WEIGHT_DEFAULT; blkcg->cfq_leaf_weight = CFQ_WEIGHT_DEFAULT; - blkcg->id = atomic64_inc_return(&id_seq); /* root is 0, start from 1 */ done: spin_lock_init(&blkcg->lock); INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_ATOMIC); diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index d3fd7aa..c567865 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -50,9 +50,6 @@ struct blkcg { struct blkcg_gq *blkg_hint; struct hlist_head blkg_list; - /* for policies to test whether associated blkcg has changed */ - uint64_t id; - /* TODO: per-policy storage in blkcg */ unsigned int cfq_weight; /* belongs to cfq */ unsigned int cfq_leaf_weight; diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index cadc378..900f569 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -299,7 +299,7 @@ struct cfq_io_cq { struct cfq_ttime ttime; int ioprio; /* the current ioprio */ #ifdef CONFIG_CFQ_GROUP_IOSCHED - uint64_t blkcg_id; /* the current blkcg ID */ + uint64_t blkcg_serial_nr; /* the current blkcg serial */ #endif }; @@ -3534,17 +3534,17 @@ static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) { struct cfq_data *cfqd = cic_to_cfqd(cic); struct cfq_queue *sync_cfqq; - uint64_t id; + uint64_t serial_nr; rcu_read_lock(); - id = bio_blkcg(bio)->id; + serial_nr = bio_blkcg(bio)->css.serial_nr; rcu_read_unlock(); /* * Check whether blkcg has changed. The condition may trigger * spuriously on a newly created cic but there's no harm. */ - if (unlikely(!cfqd) || likely(cic->blkcg_id == id)) + if (unlikely(!cfqd) || likely(cic->blkcg_serial_nr == serial_nr)) return; sync_cfqq = cic_to_cfqq(cic, 1); @@ -3558,7 +3558,7 @@ static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) cfq_put_queue(sync_cfqq); } - cic->blkcg_id = id; + cic->blkcg_serial_nr = serial_nr; } #else static inline void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) { } -- cgit v0.10.2 From ff9ea323816dc1c8ac7144afd4eab3ac97704430 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 8 Sep 2014 08:03:56 +0900 Subject: block, bdi: an active gendisk always has a request_queue associated with it bdev_get_queue() returns the request_queue associated with the specified block_device. blk_get_backing_dev_info() makes use of bdev_get_queue() to determine the associated bdi given a block_device. All the callers of bdev_get_queue() including blk_get_backing_dev_info() assume that bdev_get_queue() may return NULL and implement NULL handling; however, bdev_get_queue() requires the passed in block_device is opened and attached to its gendisk. Because an active gendisk always has a valid request_queue associated with it, bdev_get_queue() can never return NULL and neither can blk_get_backing_dev_info(). Make it clear that neither of the two functions can return NULL and remove NULL handling from all the callers. Signed-off-by: Tejun Heo Cc: Chris Mason Cc: Dave Chinner Signed-off-by: Jens Axboe diff --git a/block/blk-core.c b/block/blk-core.c index 93603e6..8174461 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -83,18 +83,14 @@ void blk_queue_congestion_threshold(struct request_queue *q) * @bdev: device * * Locates the passed device's request queue and returns the address of its - * backing_dev_info - * - * Will return NULL if the request queue cannot be located. + * backing_dev_info. This function can only be called if @bdev is opened + * and the return value is never NULL. */ struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev) { - struct backing_dev_info *ret = NULL; struct request_queue *q = bdev_get_queue(bdev); - if (q) - ret = &q->backing_dev_info; - return ret; + return &q->backing_dev_info; } EXPORT_SYMBOL(blk_get_backing_dev_info); diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c index 18b282c..f678c73 100644 --- a/block/compat_ioctl.c +++ b/block/compat_ioctl.c @@ -709,8 +709,6 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) if (!arg) return -EINVAL; bdi = blk_get_backing_dev_info(bdev); - if (bdi == NULL) - return -ENOTTY; return compat_put_long(arg, (bdi->ra_pages * PAGE_CACHE_SIZE) / 512); case BLKROGET: /* compatible */ @@ -731,8 +729,6 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) if (!capable(CAP_SYS_ADMIN)) return -EACCES; bdi = blk_get_backing_dev_info(bdev); - if (bdi == NULL) - return -ENOTTY; bdi->ra_pages = (arg * 512) / PAGE_CACHE_SIZE; return 0; case BLKGETSIZE: diff --git a/block/ioctl.c b/block/ioctl.c index d6cda81..6c7bf90 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -356,8 +356,6 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, if (!arg) return -EINVAL; bdi = blk_get_backing_dev_info(bdev); - if (bdi == NULL) - return -ENOTTY; return put_long(arg, (bdi->ra_pages * PAGE_CACHE_SIZE) / 512); case BLKROGET: return put_int(arg, bdev_read_only(bdev) != 0); @@ -386,8 +384,6 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, if(!capable(CAP_SYS_ADMIN)) return -EACCES; bdi = blk_get_backing_dev_info(bdev); - if (bdi == NULL) - return -ENOTTY; bdi->ra_pages = (arg * 512) / PAGE_CACHE_SIZE; return 0; case BLKBSZSET: diff --git a/fs/block_dev.c b/fs/block_dev.c index 6d72746..d3251ec 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1173,8 +1173,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) if (!ret) { bd_set_size(bdev,(loff_t)get_capacity(disk)<<9); bdi = blk_get_backing_dev_info(bdev); - if (bdi == NULL) - bdi = &default_backing_dev_info; bdev_inode_switch_bdi(bdev->bd_inode, bdi); } diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index d0ed9e6..39ff591 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1694,7 +1694,7 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits) if (!device->bdev) continue; bdi = blk_get_backing_dev_info(device->bdev); - if (bdi && bdi_congested(bdi, bdi_bits)) { + if (bdi_congested(bdi, bdi_bits)) { ret = 1; break; } diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index cd7b8ca..497fcde 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1678,8 +1678,6 @@ xfs_alloc_buftarg( btp->bt_dev = bdev->bd_dev; btp->bt_bdev = bdev; btp->bt_bdi = blk_get_backing_dev_info(bdev); - if (!btp->bt_bdi) - goto error; if (xfs_setsize_buftarg_early(btp, bdev)) goto error; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 518b465..e267bf0 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -865,7 +865,7 @@ extern void blk_execute_rq_nowait(struct request_queue *, struct gendisk *, static inline struct request_queue *bdev_get_queue(struct block_device *bdev) { - return bdev->bd_disk->queue; + return bdev->bd_disk->queue; /* this is never NULL */ } /* -- cgit v0.10.2 From e36f1dfce0b45d347927568efe1088821758cc3c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 8 Sep 2014 08:03:57 +0900 Subject: bdi: remove unused stuff Two flags and one bdi_writeback field are no longer used. Remove them. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index e488e94..2103a7f 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -28,12 +28,10 @@ struct dentry; * Bits in backing_dev_info.state */ enum bdi_state { - BDI_wb_alloc, /* Default embedded wb allocated */ BDI_async_congested, /* The async (write) queue is getting full */ BDI_sync_congested, /* The sync queue is getting full */ BDI_registered, /* bdi_register() was done */ BDI_writeback_running, /* Writeback is in progress */ - BDI_unused, /* Available bits start here */ }; typedef int (congested_fn)(void *, int); @@ -50,7 +48,6 @@ enum bdi_stat_item { struct bdi_writeback { struct backing_dev_info *bdi; /* our parent bdi */ - unsigned int nr; unsigned long last_old_flush; /* last old data flush */ -- cgit v0.10.2 From b68757341d8015d28e261990deea58dd836e04da Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 8 Sep 2014 08:03:58 +0900 Subject: bdi: remove bdi->wb_lock locking around bdi->dev clearing in bdi_unregister() The only places where NULL test on bdi->dev is used are bdi_[un]register(). The functions can't be called in parallel anyway and there's no point in protecting bdi->dev clearing with a lock. Remove bdi->wb_lock grabbing around bdi->dev clearing and move it after device_unregister() call so that bdi->dev doesn't have to be cached in a local variable. This patch shouldn't introduce any behavior difference. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 1706cbb..4afeefe 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -402,21 +402,15 @@ static void bdi_prune_sb(struct backing_dev_info *bdi) void bdi_unregister(struct backing_dev_info *bdi) { - struct device *dev = bdi->dev; - - if (dev) { + if (bdi->dev) { bdi_set_min_ratio(bdi, 0); trace_writeback_bdi_unregister(bdi); bdi_prune_sb(bdi); bdi_wb_shutdown(bdi); bdi_debug_unregister(bdi); - - spin_lock_bh(&bdi->wb_lock); + device_unregister(bdi->dev); bdi->dev = NULL; - spin_unlock_bh(&bdi->wb_lock); - - device_unregister(dev); } } EXPORT_SYMBOL(bdi_unregister); -- cgit v0.10.2 From c0ea1c22bce63a27b47da90ad1ac49ce48e1a8aa Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 8 Sep 2014 08:03:59 +0900 Subject: bdi: make backing_dev_info->wb.dwork canceling stricter Canceling of bdi->wb.dwork is currently a bit mushy. bdi_wb_shutdown() performs cancel_delayed_work_sync() at the end after shutting down and flushing the delayed_work and bdi_destroy() tries yet again after bdi_unregister(). bdi->wb.dwork is queued only after checking BDI_registered while holding bdi->wb_lock and bdi_wb_shutdown() clears the flag while holding the same lock and then flushes the delayed_work. There's no way the delayed_work can be queued again after that. Replace the two unnecessary cancel_delayed_work_sync() invocations with WARNs on pending. This simplifies and clarifies the code a bit and will help future changes in further isolating bdi_writeback handling. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 4afeefe..cb7c5e3 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -376,13 +376,7 @@ static void bdi_wb_shutdown(struct backing_dev_info *bdi) mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0); flush_delayed_work(&bdi->wb.dwork); WARN_ON(!list_empty(&bdi->work_list)); - - /* - * This shouldn't be necessary unless @bdi for some reason has - * unflushed dirty IO after work_list is drained. Do it anyway - * just in case. - */ - cancel_delayed_work_sync(&bdi->wb.dwork); + WARN_ON(delayed_work_pending(&bdi->wb.dwork)); } /* @@ -497,12 +491,7 @@ void bdi_destroy(struct backing_dev_info *bdi) bdi_unregister(bdi); - /* - * If bdi_unregister() had already been called earlier, the dwork - * could still be pending because bdi_prune_sb() can race with the - * bdi_wakeup_thread_delayed() calls from __mark_inode_dirty(). - */ - cancel_delayed_work_sync(&bdi->wb.dwork); + WARN_ON(delayed_work_pending(&bdi->wb.dwork)); for (i = 0; i < NR_BDI_STAT_ITEMS; i++) percpu_counter_destroy(&bdi->bdi_stat[i]); -- cgit v0.10.2 From 1a1e4530eacca37e85a4d66a164273c7dba9110c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 8 Sep 2014 08:04:00 +0900 Subject: bdi: explain the dirty list transferring in bdi_destroy() bdi_destroy() has code to transfer the remaining dirty inodes to the default_backing_dev_info; however, given the shutdown sequence, it isn't clear how such condition would happen. Also, it isn't a full solution as the transferred inodes stlil point to the bdi which is being destroyed. Operations on those inodes can end up accessing already released fields such as the percpu stat fields. Digging through the history, it seems that the code was added as a quick workaround for a bug report without fully root-causing the issue. We probably want to remove the code in time but for now let's add a comment noting that it is a quick workaround. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe diff --git a/mm/backing-dev.c b/mm/backing-dev.c index cb7c5e3..b65fe93 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -475,8 +475,17 @@ void bdi_destroy(struct backing_dev_info *bdi) int i; /* - * Splice our entries to the default_backing_dev_info, if this - * bdi disappears + * Splice our entries to the default_backing_dev_info. This + * condition shouldn't happen. @wb must be empty at this point and + * dirty inodes on it might cause other issues. This workaround is + * added by ce5f8e779519 ("writeback: splice dirty inode entries to + * default bdi on bdi_destroy()") without root-causing the issue. + * + * http://lkml.kernel.org/g/1253038617-30204-11-git-send-email-jens.axboe@oracle.com + * http://thread.gmane.org/gmane.linux.file-systems/35341/focus=35350 + * + * We should probably add WARN_ON() to find out whether it still + * happens and track it down if so. */ if (bdi_has_dirty_io(bdi)) { struct bdi_writeback *dst = &default_backing_dev_info.wb; -- cgit v0.10.2 From 018a17bdc8658ad448497c84d4ba21b6985820ec Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 8 Sep 2014 08:04:01 +0900 Subject: bdi: reimplement bdev_inode_switch_bdi() A block_device may be attached to different gendisks and thus different bdis over time. bdev_inode_switch_bdi() is used to switch the associated bdi. The function assumes that the inode could be dirty and transfers it between bdis if so. This is a bit nasty in that it reaches into bdi internals. This patch reimplements the function so that it writes out the inode if dirty. This is a lot simpler and can be implemented without exposing bdi internals. Signed-off-by: Tejun Heo Cc: Alexander Viro Signed-off-by: Jens Axboe diff --git a/fs/block_dev.c b/fs/block_dev.c index d3251ec..cc8d68a 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -50,32 +50,22 @@ inline struct block_device *I_BDEV(struct inode *inode) EXPORT_SYMBOL(I_BDEV); /* - * Move the inode from its current bdi to a new bdi. If the inode is dirty we - * need to move it onto the dirty list of @dst so that the inode is always on - * the right list. + * Move the inode from its current bdi to a new bdi. Make sure the inode + * is clean before moving so that it doesn't linger on the old bdi. */ static void bdev_inode_switch_bdi(struct inode *inode, struct backing_dev_info *dst) { - struct backing_dev_info *old = inode->i_data.backing_dev_info; - bool wakeup_bdi = false; - - if (unlikely(dst == old)) /* deadlock avoidance */ - return; - bdi_lock_two(&old->wb, &dst->wb); - spin_lock(&inode->i_lock); - inode->i_data.backing_dev_info = dst; - if (inode->i_state & I_DIRTY) { - if (bdi_cap_writeback_dirty(dst) && !wb_has_dirty_io(&dst->wb)) - wakeup_bdi = true; - list_move(&inode->i_wb_list, &dst->wb.b_dirty); + while (true) { + spin_lock(&inode->i_lock); + if (!(inode->i_state & I_DIRTY)) { + inode->i_data.backing_dev_info = dst; + spin_unlock(&inode->i_lock); + return; + } + spin_unlock(&inode->i_lock); + WARN_ON_ONCE(write_inode_now(inode, true)); } - spin_unlock(&inode->i_lock); - spin_unlock(&old->wb.list_lock); - spin_unlock(&dst->wb.list_lock); - - if (wakeup_bdi) - bdi_wakeup_thread_delayed(dst); } /* Kill _all_ buffers and pagecache , dirty or not.. */ diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 2103a7f..5da6012 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -121,7 +121,6 @@ void bdi_start_background_writeback(struct backing_dev_info *bdi); void bdi_writeback_workfn(struct work_struct *work); int bdi_has_dirty_io(struct backing_dev_info *bdi); void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi); -void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2); extern spinlock_t bdi_lock; extern struct list_head bdi_list; diff --git a/mm/backing-dev.c b/mm/backing-dev.c index b65fe93..7d63d5e 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -40,7 +40,7 @@ LIST_HEAD(bdi_list); /* bdi_wq serves all asynchronous writeback tasks */ struct workqueue_struct *bdi_wq; -void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2) +static void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2) { if (wb1 < wb2) { spin_lock(&wb1->list_lock); -- cgit v0.10.2 From bf57229745f849e500ba69ff91e35bc8160a7373 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 13 Sep 2014 16:40:08 -0700 Subject: blk-mq: remove REQ_END Pass an explicit parameter for the last request in a batch to ->queue_rq instead of using a request flag. Besides being a cleaner and non-stateful interface this is also required for the next patch, which fixes the blk-mq I/O submission code to not start a time too early. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe diff --git a/block/blk-mq.c b/block/blk-mq.c index e743d28..32b4797 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -384,7 +384,7 @@ void blk_mq_complete_request(struct request *rq) } EXPORT_SYMBOL(blk_mq_complete_request); -static void blk_mq_start_request(struct request *rq, bool last) +static void blk_mq_start_request(struct request *rq) { struct request_queue *q = rq->q; @@ -421,16 +421,6 @@ static void blk_mq_start_request(struct request *rq, bool last) */ rq->nr_phys_segments++; } - - /* - * Flag the last request in the series so that drivers know when IO - * should be kicked off, if they don't do it on a per-request basis. - * - * Note: the flag isn't the only condition drivers should do kick off. - * If drive is busy, the last request might not have the bit set. - */ - if (last) - rq->cmd_flags |= REQ_END; } static void __blk_mq_requeue_request(struct request *rq) @@ -440,8 +430,6 @@ static void __blk_mq_requeue_request(struct request *rq) trace_block_rq_requeue(q, rq); clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags); - rq->cmd_flags &= ~REQ_END; - if (q->dma_drain_size && blk_rq_bytes(rq)) rq->nr_phys_segments--; } @@ -755,9 +743,9 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) rq = list_first_entry(&rq_list, struct request, queuelist); list_del_init(&rq->queuelist); - blk_mq_start_request(rq, list_empty(&rq_list)); + blk_mq_start_request(rq); - ret = q->mq_ops->queue_rq(hctx, rq); + ret = q->mq_ops->queue_rq(hctx, rq, list_empty(&rq_list)); switch (ret) { case BLK_MQ_RQ_QUEUE_OK: queued++; @@ -1198,14 +1186,14 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio) int ret; blk_mq_bio_to_request(rq, bio); - blk_mq_start_request(rq, true); + blk_mq_start_request(rq); /* * For OK queue, we are done. For error, kill it. Any other * error (busy), just add it to our list as we previously * would have done */ - ret = q->mq_ops->queue_rq(data.hctx, rq); + ret = q->mq_ops->queue_rq(data.hctx, rq, true); if (ret == BLK_MQ_RQ_QUEUE_OK) goto done; else { diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 5c8e7fe..0e2084f 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -3775,7 +3775,8 @@ static bool mtip_check_unal_depth(struct blk_mq_hw_ctx *hctx, return false; } -static int mtip_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq) +static int mtip_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq, + bool last) { int ret; diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index 00d469c..c5b7315 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -313,7 +313,8 @@ static void null_request_fn(struct request_queue *q) } } -static int null_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq) +static int null_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq, + bool last) { struct nullb_cmd *cmd = blk_mq_rq_to_pdu(rq); diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 0a58140..13756e0 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -164,14 +164,14 @@ static void virtblk_done(struct virtqueue *vq) spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags); } -static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req) +static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req, + bool last) { struct virtio_blk *vblk = hctx->queue->queuedata; struct virtblk_req *vbr = blk_mq_rq_to_pdu(req); unsigned long flags; unsigned int num; int qid = hctx->queue_num; - const bool last = (req->cmd_flags & REQ_END) != 0; int err; bool notify = false; diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 1f2bae4..f1df411 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1855,7 +1855,8 @@ static void scsi_mq_done(struct scsi_cmnd *cmd) blk_mq_complete_request(cmd->request); } -static int scsi_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req) +static int scsi_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req, + bool last) { struct request_queue *q = req->q; struct scsi_device *sdev = q->queuedata; diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index a1e31f2..9c4e306 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -77,7 +77,7 @@ struct blk_mq_tag_set { struct list_head tag_list; }; -typedef int (queue_rq_fn)(struct blk_mq_hw_ctx *, struct request *); +typedef int (queue_rq_fn)(struct blk_mq_hw_ctx *, struct request *, bool); typedef struct blk_mq_hw_ctx *(map_queue_fn)(struct request_queue *, const int); typedef int (init_hctx_fn)(struct blk_mq_hw_ctx *, void *, unsigned int); typedef void (exit_hctx_fn)(struct blk_mq_hw_ctx *, unsigned int); diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 66c2167..bb7d664 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -188,7 +188,6 @@ enum rq_flag_bits { __REQ_MIXED_MERGE, /* merge of different types, fail separately */ __REQ_KERNEL, /* direct IO to kernel pages */ __REQ_PM, /* runtime pm request */ - __REQ_END, /* last of chain of requests */ __REQ_HASHED, /* on IO scheduler merge hash */ __REQ_MQ_INFLIGHT, /* track inflight for MQ */ __REQ_NR_BITS, /* stops here */ @@ -242,7 +241,6 @@ enum rq_flag_bits { #define REQ_SECURE (1ULL << __REQ_SECURE) #define REQ_KERNEL (1ULL << __REQ_KERNEL) #define REQ_PM (1ULL << __REQ_PM) -#define REQ_END (1ULL << __REQ_END) #define REQ_HASHED (1ULL << __REQ_HASHED) #define REQ_MQ_INFLIGHT (1ULL << __REQ_MQ_INFLIGHT) -- cgit v0.10.2 From e2490073cd7c3d6f6ef6e029a208edd4d38efac4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 13 Sep 2014 16:40:09 -0700 Subject: blk-mq: call blk_mq_start_request from ->queue_rq When we call blk_mq_start_request from the core blk-mq code before calling into ->queue_rq there is a racy window where the timeout handler can hit before we've fully set up the driver specific part of the command. Move the call to blk_mq_start_request into the driver so the driver can start the request only once it is fully set up. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe diff --git a/block/blk-mq.c b/block/blk-mq.c index 32b4797..141f2e0 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -384,7 +384,7 @@ void blk_mq_complete_request(struct request *rq) } EXPORT_SYMBOL(blk_mq_complete_request); -static void blk_mq_start_request(struct request *rq) +void blk_mq_start_request(struct request *rq) { struct request_queue *q = rq->q; @@ -422,16 +422,18 @@ static void blk_mq_start_request(struct request *rq) rq->nr_phys_segments++; } } +EXPORT_SYMBOL(blk_mq_start_request); static void __blk_mq_requeue_request(struct request *rq) { struct request_queue *q = rq->q; trace_block_rq_requeue(q, rq); - clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags); - if (q->dma_drain_size && blk_rq_bytes(rq)) - rq->nr_phys_segments--; + if (test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) { + if (q->dma_drain_size && blk_rq_bytes(rq)) + rq->nr_phys_segments--; + } } void blk_mq_requeue_request(struct request *rq) @@ -743,8 +745,6 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) rq = list_first_entry(&rq_list, struct request, queuelist); list_del_init(&rq->queuelist); - blk_mq_start_request(rq); - ret = q->mq_ops->queue_rq(hctx, rq, list_empty(&rq_list)); switch (ret) { case BLK_MQ_RQ_QUEUE_OK: @@ -1186,7 +1186,6 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio) int ret; blk_mq_bio_to_request(rq, bio); - blk_mq_start_request(rq); /* * For OK queue, we are done. For error, kill it. Any other diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 0e2084f..4042440 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -3783,6 +3783,8 @@ static int mtip_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq, if (unlikely(mtip_check_unal_depth(hctx, rq))) return BLK_MQ_RQ_QUEUE_BUSY; + blk_mq_start_request(rq); + ret = mtip_submit_request(hctx, rq); if (likely(!ret)) return BLK_MQ_RQ_QUEUE_OK; diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index c5b7315..332ce20 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -321,6 +321,8 @@ static int null_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq, cmd->rq = rq; cmd->nq = hctx->driver_data; + blk_mq_start_request(rq); + null_handle_cmd(cmd); return BLK_MQ_RQ_QUEUE_OK; } diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 13756e0..83816bf 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -205,6 +205,8 @@ static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req, } } + blk_mq_start_request(req); + num = blk_rq_map_sg(hctx->queue, vbr->req, vbr->sg); if (num) { if (rq_data_dir(vbr->req) == WRITE) diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index f1df411..2dcd907 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1890,6 +1890,7 @@ static int scsi_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req, scsi_init_cmd_errh(cmd); cmd->scsi_done = scsi_mq_done; + blk_mq_start_request(req); reason = scsi_dispatch_cmd(cmd); if (reason) { scsi_set_blocked(cmd, reason); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 9c4e306..878b6f7 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -159,6 +159,7 @@ struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag); struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *, const int ctx_index); struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_tag_set *, unsigned int, int); +void blk_mq_start_request(struct request *rq); void blk_mq_end_io(struct request *rq, int error); void __blk_mq_end_io(struct request *rq, int error); -- cgit v0.10.2 From c8a446ad695ada43a885ec12b38411dbd190a11b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 13 Sep 2014 16:40:10 -0700 Subject: blk-mq: rename blk_mq_end_io to blk_mq_end_request Now that we've changed the driver API on the submission side use the opportunity to fix up the name on the completion side to fit into the general scheme. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe diff --git a/block/blk-flush.c b/block/blk-flush.c index 3cb5e9e..698e692 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -202,7 +202,7 @@ static bool blk_flush_complete_seq(struct request *rq, unsigned int seq, list_del_init(&rq->flush.list); blk_flush_restore_request(rq); if (q->mq_ops) - blk_mq_end_io(rq, error); + blk_mq_end_request(rq, error); else __blk_end_request_all(rq, error); break; @@ -378,7 +378,7 @@ void blk_insert_flush(struct request *rq) */ if (!policy) { if (q->mq_ops) - blk_mq_end_io(rq, 0); + blk_mq_end_request(rq, 0); else __blk_end_bidi_request(rq, 0, 0, 0); return; diff --git a/block/blk-mq.c b/block/blk-mq.c index 141f2e0..1713686 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -300,7 +300,7 @@ void blk_mq_clone_flush_request(struct request *flush_rq, hctx->cmd_size); } -inline void __blk_mq_end_io(struct request *rq, int error) +inline void __blk_mq_end_request(struct request *rq, int error) { blk_account_io_done(rq); @@ -312,15 +312,15 @@ inline void __blk_mq_end_io(struct request *rq, int error) blk_mq_free_request(rq); } } -EXPORT_SYMBOL(__blk_mq_end_io); +EXPORT_SYMBOL(__blk_mq_end_request); -void blk_mq_end_io(struct request *rq, int error) +void blk_mq_end_request(struct request *rq, int error) { if (blk_update_request(rq, error, blk_rq_bytes(rq))) BUG(); - __blk_mq_end_io(rq, error); + __blk_mq_end_request(rq, error); } -EXPORT_SYMBOL(blk_mq_end_io); +EXPORT_SYMBOL(blk_mq_end_request); static void __blk_mq_complete_request_remote(void *data) { @@ -360,7 +360,7 @@ void __blk_mq_complete_request(struct request *rq) struct request_queue *q = rq->q; if (!q->softirq_done_fn) - blk_mq_end_io(rq, rq->errors); + blk_mq_end_request(rq, rq->errors); else blk_mq_ipi_complete_request(rq); } @@ -758,7 +758,7 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) pr_err("blk-mq: bad return on queue: %d\n", ret); case BLK_MQ_RQ_QUEUE_ERROR: rq->errors = -EIO; - blk_mq_end_io(rq, rq->errors); + blk_mq_end_request(rq, rq->errors); break; } @@ -1200,7 +1200,7 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio) if (ret == BLK_MQ_RQ_QUEUE_ERROR) { rq->errors = -EIO; - blk_mq_end_io(rq, rq->errors); + blk_mq_end_request(rq, rq->errors); goto done; } } diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 4042440..6b7e8d0 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -247,7 +247,7 @@ static void mtip_async_complete(struct mtip_port *port, if (unlikely(cmd->unaligned)) up(&port->cmd_slot_unal); - blk_mq_end_io(rq, status ? -EIO : 0); + blk_mq_end_request(rq, status ? -EIO : 0); } /* @@ -3739,7 +3739,7 @@ static int mtip_submit_request(struct blk_mq_hw_ctx *hctx, struct request *rq) int err; err = mtip_send_trim(dd, blk_rq_pos(rq), blk_rq_sectors(rq)); - blk_mq_end_io(rq, err); + blk_mq_end_request(rq, err); return 0; } diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index 332ce20..ac50a29 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -177,7 +177,7 @@ static void end_cmd(struct nullb_cmd *cmd) { switch (queue_mode) { case NULL_Q_MQ: - blk_mq_end_io(cmd->rq, 0); + blk_mq_end_request(cmd->rq, 0); return; case NULL_Q_RQ: INIT_LIST_HEAD(&cmd->rq->queuelist); diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 83816bf..f751fc3 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -135,7 +135,7 @@ static inline void virtblk_request_done(struct request *req) req->errors = (error != 0); } - blk_mq_end_io(req, error); + blk_mq_end_request(req, error); } static void virtblk_done(struct virtqueue *vq) diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 2dcd907..73ce7d2 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -713,7 +713,7 @@ static bool scsi_end_request(struct request *req, int error, if (req->mq_ctx) { /* - * In the MQ case the command gets freed by __blk_mq_end_io, + * In the MQ case the command gets freed by __blk_mq_end_request, * so we have to do all cleanup that depends on it earlier. * * We also can't kick the queues from irq context, so we @@ -721,7 +721,7 @@ static bool scsi_end_request(struct request *req, int error, */ scsi_mq_uninit_cmd(cmd); - __blk_mq_end_io(req, error); + __blk_mq_end_request(req, error); if (scsi_target(sdev)->single_lun || !list_empty(&sdev->host->starved_list)) diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 878b6f7..cb217c1 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -160,8 +160,8 @@ struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *, const int ctx_ind struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_tag_set *, unsigned int, int); void blk_mq_start_request(struct request *rq); -void blk_mq_end_io(struct request *rq, int error); -void __blk_mq_end_io(struct request *rq, int error); +void blk_mq_end_request(struct request *rq, int error); +void __blk_mq_end_request(struct request *rq, int error); void blk_mq_requeue_request(struct request *rq); void blk_mq_add_to_requeue_list(struct request *rq, bool at_head); -- cgit v0.10.2 From 81481eb423c295c5480a3fab9bb961cf286c91e7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 13 Sep 2014 16:40:11 -0700 Subject: blk-mq: fix and simplify tag iteration for the timeout handler Don't do a kmalloc from timer to handle timeouts, chances are we could be under heavy load or similar and thus just miss out on the timeouts. Fortunately it is very easy to just iterate over all in use tags, and doing this properly actually cleans up the blk_mq_busy_iter API as well, and prepares us for the next patch by passing a reserved argument to the iterator. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index c1b9242..b087880 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -392,45 +392,37 @@ void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag, __blk_mq_put_reserved_tag(tags, tag); } -static void bt_for_each_free(struct blk_mq_bitmap_tags *bt, - unsigned long *free_map, unsigned int off) +static void bt_for_each(struct blk_mq_hw_ctx *hctx, + struct blk_mq_bitmap_tags *bt, unsigned int off, + busy_iter_fn *fn, void *data, bool reserved) { - int i; + struct request *rq; + int bit, i; for (i = 0; i < bt->map_nr; i++) { struct blk_align_bitmap *bm = &bt->map[i]; - int bit = 0; - - do { - bit = find_next_zero_bit(&bm->word, bm->depth, bit); - if (bit >= bm->depth) - break; - __set_bit(bit + off, free_map); - bit++; - } while (1); + for (bit = find_first_bit(&bm->word, bm->depth); + bit < bm->depth; + bit = find_next_bit(&bm->word, bm->depth, bit + 1)) { + rq = blk_mq_tag_to_rq(hctx->tags, off + bit); + if (rq->q == hctx->queue) + fn(hctx, rq, data, reserved); + } off += (1 << bt->bits_per_word); } } -void blk_mq_tag_busy_iter(struct blk_mq_tags *tags, - void (*fn)(void *, unsigned long *), void *data) +void blk_mq_tag_busy_iter(struct blk_mq_hw_ctx *hctx, busy_iter_fn *fn, + void *priv) { - unsigned long *tag_map; - size_t map_size; - - map_size = ALIGN(tags->nr_tags, BITS_PER_LONG) / BITS_PER_LONG; - tag_map = kzalloc(map_size * sizeof(unsigned long), GFP_ATOMIC); - if (!tag_map) - return; + struct blk_mq_tags *tags = hctx->tags; - bt_for_each_free(&tags->bitmap_tags, tag_map, tags->nr_reserved_tags); if (tags->nr_reserved_tags) - bt_for_each_free(&tags->breserved_tags, tag_map, 0); - - fn(data, tag_map); - kfree(tag_map); + bt_for_each(hctx, &tags->breserved_tags, 0, fn, priv, true); + bt_for_each(hctx, &tags->bitmap_tags, tags->nr_reserved_tags, fn, priv, + false); } EXPORT_SYMBOL(blk_mq_tag_busy_iter); diff --git a/block/blk-mq.c b/block/blk-mq.c index 1713686..3baebca 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -525,58 +525,6 @@ struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag) } EXPORT_SYMBOL(blk_mq_tag_to_rq); -struct blk_mq_timeout_data { - struct blk_mq_hw_ctx *hctx; - unsigned long *next; - unsigned int *next_set; -}; - -static void blk_mq_timeout_check(void *__data, unsigned long *free_tags) -{ - struct blk_mq_timeout_data *data = __data; - struct blk_mq_hw_ctx *hctx = data->hctx; - unsigned int tag; - - /* It may not be in flight yet (this is where - * the REQ_ATOMIC_STARTED flag comes in). The requests are - * statically allocated, so we know it's always safe to access the - * memory associated with a bit offset into ->rqs[]. - */ - tag = 0; - do { - struct request *rq; - - tag = find_next_zero_bit(free_tags, hctx->tags->nr_tags, tag); - if (tag >= hctx->tags->nr_tags) - break; - - rq = blk_mq_tag_to_rq(hctx->tags, tag++); - if (rq->q != hctx->queue) - continue; - if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) - continue; - - blk_rq_check_expired(rq, data->next, data->next_set); - } while (1); -} - -static void blk_mq_hw_ctx_check_timeout(struct blk_mq_hw_ctx *hctx, - unsigned long *next, - unsigned int *next_set) -{ - struct blk_mq_timeout_data data = { - .hctx = hctx, - .next = next, - .next_set = next_set, - }; - - /* - * Ask the tagging code to iterate busy requests, so we can - * check them for timeout. - */ - blk_mq_tag_busy_iter(hctx->tags, blk_mq_timeout_check, &data); -} - static enum blk_eh_timer_return blk_mq_rq_timed_out(struct request *rq) { struct request_queue *q = rq->q; @@ -598,13 +546,30 @@ static enum blk_eh_timer_return blk_mq_rq_timed_out(struct request *rq) return q->mq_ops->timeout(rq); } + +struct blk_mq_timeout_data { + unsigned long next; + unsigned int next_set; +}; + +static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, + struct request *rq, void *priv, bool reserved) +{ + struct blk_mq_timeout_data *data = priv; + + if (test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) + blk_rq_check_expired(rq, &data->next, &data->next_set); +} -static void blk_mq_rq_timer(unsigned long data) +static void blk_mq_rq_timer(unsigned long priv) { - struct request_queue *q = (struct request_queue *) data; + struct request_queue *q = (struct request_queue *)priv; + struct blk_mq_timeout_data data = { + .next = 0, + .next_set = 0, + }; struct blk_mq_hw_ctx *hctx; - unsigned long next = 0; - int i, next_set = 0; + int i; queue_for_each_hw_ctx(q, hctx, i) { /* @@ -614,12 +579,12 @@ static void blk_mq_rq_timer(unsigned long data) if (!hctx->nr_ctx || !hctx->tags) continue; - blk_mq_hw_ctx_check_timeout(hctx, &next, &next_set); + blk_mq_tag_busy_iter(hctx, blk_mq_check_expired, &data); } - if (next_set) { - next = blk_rq_timeout(round_jiffies_up(next)); - mod_timer(&q->timeout, next); + if (data.next_set) { + data.next = blk_rq_timeout(round_jiffies_up(data.next)); + mod_timer(&q->timeout, data.next); } else { queue_for_each_hw_ctx(q, hctx, i) blk_mq_tag_idle(hctx); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index cb217c1..0eb0f64 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -86,6 +86,9 @@ typedef int (init_request_fn)(void *, struct request *, unsigned int, typedef void (exit_request_fn)(void *, struct request *, unsigned int, unsigned int); +typedef void (busy_iter_fn)(struct blk_mq_hw_ctx *, struct request *, void *, + bool); + struct blk_mq_ops { /* * Queue request @@ -174,7 +177,8 @@ void blk_mq_stop_hw_queues(struct request_queue *q); void blk_mq_start_hw_queues(struct request_queue *q); void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async); void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs); -void blk_mq_tag_busy_iter(struct blk_mq_tags *tags, void (*fn)(void *data, unsigned long *), void *data); +void blk_mq_tag_busy_iter(struct blk_mq_hw_ctx *hctx, busy_iter_fn *fn, + void *priv); /* * Driver command data is immediately after the request. So subtract request diff --git a/include/scsi/scsi_tcq.h b/include/scsi/scsi_tcq.h index cdcc90b..e645835 100644 --- a/include/scsi/scsi_tcq.h +++ b/include/scsi/scsi_tcq.h @@ -68,7 +68,7 @@ static inline void scsi_activate_tcq(struct scsi_device *sdev, int depth) return; if (!shost_use_blk_mq(sdev->host) && - blk_queue_tagged(sdev->request_queue)) + !blk_queue_tagged(sdev->request_queue)) blk_queue_init_tags(sdev->request_queue, depth, sdev->host->bqt); -- cgit v0.10.2 From 46f92d42ee37e10970e33891b7b61a342bd97aeb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 13 Sep 2014 16:40:12 -0700 Subject: blk-mq: unshared timeout handler Duplicate the (small) timeout handler in blk-mq so that we can pass arguments more easily to the driver timeout handler. This enables the next patch. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe diff --git a/block/blk-mq.c b/block/blk-mq.c index 3baebca..298d6e3 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -525,9 +525,15 @@ struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag) } EXPORT_SYMBOL(blk_mq_tag_to_rq); -static enum blk_eh_timer_return blk_mq_rq_timed_out(struct request *rq) +struct blk_mq_timeout_data { + unsigned long next; + unsigned int next_set; +}; + +static void blk_mq_rq_timed_out(struct request *req) { - struct request_queue *q = rq->q; + struct blk_mq_ops *ops = req->q->mq_ops; + enum blk_eh_timer_return ret = BLK_EH_RESET_TIMER; /* * We know that complete is set at this point. If STARTED isn't set @@ -538,27 +544,43 @@ static enum blk_eh_timer_return blk_mq_rq_timed_out(struct request *rq) * we both flags will get cleared. So check here again, and ignore * a timeout event with a request that isn't active. */ - if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) - return BLK_EH_NOT_HANDLED; - - if (!q->mq_ops->timeout) - return BLK_EH_RESET_TIMER; + if (!test_bit(REQ_ATOM_STARTED, &req->atomic_flags)) + return; - return q->mq_ops->timeout(rq); + if (ops->timeout) + ret = ops->timeout(req); + + switch (ret) { + case BLK_EH_HANDLED: + __blk_mq_complete_request(req); + break; + case BLK_EH_RESET_TIMER: + blk_add_timer(req); + blk_clear_rq_complete(req); + break; + case BLK_EH_NOT_HANDLED: + break; + default: + printk(KERN_ERR "block: bad eh return: %d\n", ret); + break; + } } -struct blk_mq_timeout_data { - unsigned long next; - unsigned int next_set; -}; - static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, struct request *rq, void *priv, bool reserved) { struct blk_mq_timeout_data *data = priv; - if (test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) - blk_rq_check_expired(rq, &data->next, &data->next_set); + if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) + return; + + if (time_after_eq(jiffies, rq->deadline)) { + if (!blk_mark_rq_complete(rq)) + blk_mq_rq_timed_out(rq); + } else if (!data->next_set || time_after(data->next, rq->deadline)) { + data->next = rq->deadline; + data->next_set = 1; + } } static void blk_mq_rq_timer(unsigned long priv) @@ -1781,7 +1803,6 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) else blk_queue_make_request(q, blk_sq_make_request); - blk_queue_rq_timed_out(q, blk_mq_rq_timed_out); if (set->timeout) blk_queue_rq_timeout(q, set->timeout); diff --git a/block/blk-timeout.c b/block/blk-timeout.c index 95a0959..4d44825 100644 --- a/block/blk-timeout.c +++ b/block/blk-timeout.c @@ -7,7 +7,6 @@ #include #include "blk.h" -#include "blk-mq.h" #ifdef CONFIG_FAIL_IO_TIMEOUT @@ -90,10 +89,7 @@ static void blk_rq_timed_out(struct request *req) switch (ret) { case BLK_EH_HANDLED: /* Can we use req->errors here? */ - if (q->mq_ops) - __blk_mq_complete_request(req); - else - __blk_complete_request(req); + __blk_complete_request(req); break; case BLK_EH_RESET_TIMER: blk_add_timer(req); @@ -113,7 +109,7 @@ static void blk_rq_timed_out(struct request *req) } } -void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout, +static void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout, unsigned int *next_set) { if (time_after_eq(jiffies, rq->deadline)) { diff --git a/block/blk.h b/block/blk.h index 6748c4f..e515a28 100644 --- a/block/blk.h +++ b/block/blk.h @@ -38,8 +38,6 @@ bool __blk_end_bidi_request(struct request *rq, int error, unsigned int nr_bytes, unsigned int bidi_bytes); void blk_rq_timed_out_timer(unsigned long data); -void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout, - unsigned int *next_set); unsigned long blk_rq_timeout(unsigned long timeout); void blk_add_timer(struct request *req); void blk_delete_timer(struct request *); -- cgit v0.10.2 From 0152fb6b57c4fae769ee75ea2ae670f4ff39fba9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 13 Sep 2014 16:40:13 -0700 Subject: blk-mq: pass a reserved argument to the timeout handler Allow blk-mq to pass an argument to the timeout handler to indicate if we're timing out a reserved or regular command. For many drivers those need to be handled different. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe diff --git a/block/blk-mq.c b/block/blk-mq.c index 298d6e3..d12f198 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -530,7 +530,7 @@ struct blk_mq_timeout_data { unsigned int next_set; }; -static void blk_mq_rq_timed_out(struct request *req) +static void blk_mq_rq_timed_out(struct request *req, bool reserved) { struct blk_mq_ops *ops = req->q->mq_ops; enum blk_eh_timer_return ret = BLK_EH_RESET_TIMER; @@ -548,7 +548,7 @@ static void blk_mq_rq_timed_out(struct request *req) return; if (ops->timeout) - ret = ops->timeout(req); + ret = ops->timeout(req, reserved); switch (ret) { case BLK_EH_HANDLED: @@ -576,7 +576,7 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, if (time_after_eq(jiffies, rq->deadline)) { if (!blk_mark_rq_complete(rq)) - blk_mq_rq_timed_out(rq); + blk_mq_rq_timed_out(rq, reserved); } else if (!data->next_set || time_after(data->next, rq->deadline)) { data->next = rq->deadline; data->next_set = 1; diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 73ce7d2..86b1156 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1932,6 +1932,14 @@ out: return ret; } +static enum blk_eh_timer_return scsi_timeout(struct request *req, + bool reserved) +{ + if (reserved) + return BLK_EH_RESET_TIMER; + return scsi_times_out(req); +} + static int scsi_init_request(void *data, struct request *rq, unsigned int hctx_idx, unsigned int request_idx, unsigned int numa_node) @@ -2043,7 +2051,7 @@ static struct blk_mq_ops scsi_mq_ops = { .map_queue = blk_mq_map_queue, .queue_rq = scsi_queue_rq, .complete = scsi_softirq_done, - .timeout = scsi_times_out, + .timeout = scsi_timeout, .init_request = scsi_init_request, .exit_request = scsi_exit_request, }; diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 0eb0f64..3253495 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -79,6 +79,7 @@ struct blk_mq_tag_set { typedef int (queue_rq_fn)(struct blk_mq_hw_ctx *, struct request *, bool); typedef struct blk_mq_hw_ctx *(map_queue_fn)(struct request_queue *, const int); +typedef enum blk_eh_timer_return (timeout_fn)(struct request *, bool); typedef int (init_hctx_fn)(struct blk_mq_hw_ctx *, void *, unsigned int); typedef void (exit_hctx_fn)(struct blk_mq_hw_ctx *, unsigned int); typedef int (init_request_fn)(void *, struct request *, unsigned int, @@ -103,7 +104,7 @@ struct blk_mq_ops { /* * Called on request timeout */ - rq_timed_out_fn *timeout; + timeout_fn *timeout; softirq_done_fn *complete; -- cgit v0.10.2 From 2edd2c740b2918eb0a9a1fe1b69678b903769ec2 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 17 Sep 2014 17:47:58 +0800 Subject: blk-mq: remove unnecessary blk_clear_rq_complete() This patch removes two unnecessary blk_clear_rq_complete(), the REQ_ATOM_COMPLETE flag is cleared inside blk_mq_start_request(), so: - The blk_clear_rq_complete() in blk_flush_restore_request() needn't because the request will be freed later, and clearing it here may open a small race window with timeout. - The blk_clear_rq_complete() in blk_mq_requeue_request() isn't necessary too, even though REQ_ATOM_STARTED is cleared in __blk_mq_requeue_request(), in theory it still may cause a small race window with timeout since the two clear_bit() may be reordered. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe diff --git a/block/blk-flush.c b/block/blk-flush.c index 698e692..c8e2576 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -126,8 +126,6 @@ static void blk_flush_restore_request(struct request *rq) /* make @rq a normal request */ rq->cmd_flags &= ~REQ_FLUSH_SEQ; rq->end_io = rq->flush.saved_end_io; - - blk_clear_rq_complete(rq); } static bool blk_flush_queue_rq(struct request *rq, bool add_front) diff --git a/block/blk-mq.c b/block/blk-mq.c index d12f198..3b277b4 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -439,7 +439,6 @@ static void __blk_mq_requeue_request(struct request *rq) void blk_mq_requeue_request(struct request *rq) { __blk_mq_requeue_request(rq); - blk_clear_rq_complete(rq); BUG_ON(blk_queued_rq(rq)); blk_mq_add_to_requeue_list(rq, true); -- cgit v0.10.2 From aedcd72f6c283dffefbb8b808ae67bdd2c6eb11a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 17 Sep 2014 08:27:03 -0600 Subject: blk-mq: limit memory consumption if a crash dump is active It's not uncommon for crash dump kernels to be limited to 128MB or something low in that area. This is normally not a problem for devices as we don't use that much memory, but for some shared SCSI setups with huge queue depths, it can potentially fill most of memory with tons of request allocations. blk-mq does scale back when it fails to allocate memory, but it scales back just enough so that blk-mq succeeds. This could still leave the system with not enough memory to make any real progress. Check if we are in a kdump environment and limit the hardware queues and tag depth. Signed-off-by: Jens Axboe diff --git a/block/blk-mq.c b/block/blk-mq.c index 3b277b4..c5345a9 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -20,6 +20,7 @@ #include #include #include +#include #include @@ -1742,6 +1743,16 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) if (!ctx) return ERR_PTR(-ENOMEM); + /* + * If a crashdump is active, then we are potentially in a very + * memory constrained environment. Limit us to 1 queue and + * 64 tags to prevent using too much memory. + */ + if (is_kdump_kernel()) { + set->nr_hw_queues = 1; + set->queue_depth = min(64U, set->queue_depth); + } + hctxs = kmalloc_node(set->nr_hw_queues * sizeof(*hctxs), GFP_KERNEL, set->numa_node); -- cgit v0.10.2 From 5e940aaa597c15e916618240ae5838864f36c91e Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 19 Sep 2014 21:53:46 +0800 Subject: blk-timeout: fix blk_add_timer Commit 8cb34819cdd5d(blk-mq: unshared timeout handler) introduces blk-mq's own timeout handler, and removes following line: blk_queue_rq_timed_out(q, blk_mq_rq_timed_out); which then causes blk_add_timer() to bypass adding the timer, since blk-mq no longer has q->rq_timed_out_fn defined. This patch fixes the problem by bypassing the check for blk-mq, so that both request deadlines are still set and the rolling timer updated. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe diff --git a/block/blk-timeout.c b/block/blk-timeout.c index 4d44825..8bae410 100644 --- a/block/blk-timeout.c +++ b/block/blk-timeout.c @@ -186,7 +186,8 @@ void blk_add_timer(struct request *req) struct request_queue *q = req->q; unsigned long expiry; - if (!q->rq_timed_out_fn) + /* blk-mq has its own handler, so we don't need ->rq_timed_out_fn */ + if (!q->mq_ops && !q->rq_timed_out_fn) return; BUG_ON(!list_empty(&req->timeout_list)); -- cgit v0.10.2 From 90415837659fec54f33584b423dab250eb1e8432 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 22 Sep 2014 10:21:48 -0600 Subject: block: fix blk_abort_request on blk-mq Signed-off-by: Christoph Hellwig Moved blk_mq_rq_timed_out() definition to the private blk-mq.h header. Signed-off-by: Jens Axboe diff --git a/block/blk-mq.c b/block/blk-mq.c index c5345a9..a3a8088 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -530,7 +530,7 @@ struct blk_mq_timeout_data { unsigned int next_set; }; -static void blk_mq_rq_timed_out(struct request *req, bool reserved) +void blk_mq_rq_timed_out(struct request *req, bool reserved) { struct blk_mq_ops *ops = req->q->mq_ops; enum blk_eh_timer_return ret = BLK_EH_RESET_TIMER; diff --git a/block/blk-mq.h b/block/blk-mq.h index ca4964a..a3c613a 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -60,6 +60,8 @@ extern int blk_mq_hw_queue_to_node(unsigned int *map, unsigned int); extern int blk_mq_sysfs_register(struct request_queue *q); extern void blk_mq_sysfs_unregister(struct request_queue *q); +extern void blk_mq_rq_timed_out(struct request *req, bool reserved); + /* * Basic implementation of sparser bitmap, allowing the user to spread * the bits over more cachelines. diff --git a/block/blk-timeout.c b/block/blk-timeout.c index 8bae410..56c0258 100644 --- a/block/blk-timeout.c +++ b/block/blk-timeout.c @@ -7,6 +7,7 @@ #include #include "blk.h" +#include "blk-mq.h" #ifdef CONFIG_FAIL_IO_TIMEOUT @@ -158,7 +159,10 @@ void blk_abort_request(struct request *req) if (blk_mark_rq_complete(req)) return; blk_delete_timer(req); - blk_rq_timed_out(req); + if (req->q->mq_ops) + blk_mq_rq_timed_out(req, false); + else + blk_rq_timed_out(req); } EXPORT_SYMBOL_GPL(blk_abort_request); -- cgit v0.10.2 From fe052529e465daff25225aac769828baa88b7252 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 22 Sep 2014 15:59:31 +0200 Subject: scsi: move blk_mq_start_request call earlier Some ATA drivers need the dma drain size workaround, and thus need to call blk_mq_start_request before the S/G mapping. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 86b1156..5c5617e 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1846,6 +1846,8 @@ static int scsi_mq_prep_fn(struct request *req) next_rq->special = bidi_sdb; } + blk_mq_start_request(req); + return scsi_setup_cmnd(sdev, req); } @@ -1880,17 +1882,19 @@ static int scsi_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req, if (!scsi_host_queue_ready(q, shost, sdev)) goto out_dec_target_busy; + if (!(req->cmd_flags & REQ_DONTPREP)) { ret = prep_to_mq(scsi_mq_prep_fn(req)); if (ret) goto out_dec_host_busy; req->cmd_flags |= REQ_DONTPREP; + } else { + blk_mq_start_request(req); } scsi_init_cmd_errh(cmd); cmd->scsi_done = scsi_mq_done; - blk_mq_start_request(req); reason = scsi_dispatch_cmd(cmd); if (reason) { scsi_set_blocked(cmd, reason); -- cgit v0.10.2 From 08e98fc6016c890c2f4ffba6decc0ca9d2d5d7f8 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 25 Sep 2014 23:23:38 +0800 Subject: blk-mq: handle failure path for initializing hctx Failure of initializing one hctx isn't handled, so this patch introduces blk_mq_init_hctx() and its pair to handle it explicitly. Also this patch makes code cleaner. Reviewed-by: Christoph Hellwig Signed-off-by: Ming Lei Signed-off-by: Jens Axboe diff --git a/block/blk-mq.c b/block/blk-mq.c index a3a8088..66ef1fb 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1509,6 +1509,20 @@ static int blk_mq_hctx_notify(void *data, unsigned long action, return NOTIFY_OK; } +static void blk_mq_exit_hctx(struct request_queue *q, + struct blk_mq_tag_set *set, + struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) +{ + blk_mq_tag_idle(hctx); + + if (set->ops->exit_hctx) + set->ops->exit_hctx(hctx, hctx_idx); + + blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier); + kfree(hctx->ctxs); + blk_mq_free_bitmap(&hctx->ctx_map); +} + static void blk_mq_exit_hw_queues(struct request_queue *q, struct blk_mq_tag_set *set, int nr_queue) { @@ -1518,17 +1532,8 @@ static void blk_mq_exit_hw_queues(struct request_queue *q, queue_for_each_hw_ctx(q, hctx, i) { if (i == nr_queue) break; - - blk_mq_tag_idle(hctx); - - if (set->ops->exit_hctx) - set->ops->exit_hctx(hctx, i); - - blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier); - kfree(hctx->ctxs); - blk_mq_free_bitmap(&hctx->ctx_map); + blk_mq_exit_hctx(q, set, hctx, i); } - } static void blk_mq_free_hw_queues(struct request_queue *q, @@ -1543,53 +1548,72 @@ static void blk_mq_free_hw_queues(struct request_queue *q, } } -static int blk_mq_init_hw_queues(struct request_queue *q, - struct blk_mq_tag_set *set) +static int blk_mq_init_hctx(struct request_queue *q, + struct blk_mq_tag_set *set, + struct blk_mq_hw_ctx *hctx, unsigned hctx_idx) { - struct blk_mq_hw_ctx *hctx; - unsigned int i; + int node; + + node = hctx->numa_node; + if (node == NUMA_NO_NODE) + node = hctx->numa_node = set->numa_node; + + INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn); + INIT_DELAYED_WORK(&hctx->delay_work, blk_mq_delay_work_fn); + spin_lock_init(&hctx->lock); + INIT_LIST_HEAD(&hctx->dispatch); + hctx->queue = q; + hctx->queue_num = hctx_idx; + hctx->flags = set->flags; + hctx->cmd_size = set->cmd_size; + + blk_mq_init_cpu_notifier(&hctx->cpu_notifier, + blk_mq_hctx_notify, hctx); + blk_mq_register_cpu_notifier(&hctx->cpu_notifier); + + hctx->tags = set->tags[hctx_idx]; /* - * Initialize hardware queues + * Allocate space for all possible cpus to avoid allocation at + * runtime */ - queue_for_each_hw_ctx(q, hctx, i) { - int node; + hctx->ctxs = kmalloc_node(nr_cpu_ids * sizeof(void *), + GFP_KERNEL, node); + if (!hctx->ctxs) + goto unregister_cpu_notifier; - node = hctx->numa_node; - if (node == NUMA_NO_NODE) - node = hctx->numa_node = set->numa_node; + if (blk_mq_alloc_bitmap(&hctx->ctx_map, node)) + goto free_ctxs; - INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn); - INIT_DELAYED_WORK(&hctx->delay_work, blk_mq_delay_work_fn); - spin_lock_init(&hctx->lock); - INIT_LIST_HEAD(&hctx->dispatch); - hctx->queue = q; - hctx->queue_num = i; - hctx->flags = set->flags; - hctx->cmd_size = set->cmd_size; + hctx->nr_ctx = 0; - blk_mq_init_cpu_notifier(&hctx->cpu_notifier, - blk_mq_hctx_notify, hctx); - blk_mq_register_cpu_notifier(&hctx->cpu_notifier); + if (set->ops->init_hctx && + set->ops->init_hctx(hctx, set->driver_data, hctx_idx)) + goto free_bitmap; - hctx->tags = set->tags[i]; + return 0; - /* - * Allocate space for all possible cpus to avoid allocation at - * runtime - */ - hctx->ctxs = kmalloc_node(nr_cpu_ids * sizeof(void *), - GFP_KERNEL, node); - if (!hctx->ctxs) - break; + free_bitmap: + blk_mq_free_bitmap(&hctx->ctx_map); + free_ctxs: + kfree(hctx->ctxs); + unregister_cpu_notifier: + blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier); - if (blk_mq_alloc_bitmap(&hctx->ctx_map, node)) - break; + return -1; +} - hctx->nr_ctx = 0; +static int blk_mq_init_hw_queues(struct request_queue *q, + struct blk_mq_tag_set *set) +{ + struct blk_mq_hw_ctx *hctx; + unsigned int i; - if (set->ops->init_hctx && - set->ops->init_hctx(hctx, set->driver_data, i)) + /* + * Initialize hardware queues + */ + queue_for_each_hw_ctx(q, hctx, i) { + if (blk_mq_init_hctx(q, set, hctx, i)) break; } -- cgit v0.10.2 From 1bcb1eada4f11a713cbe586d1b5a5d93a48277cb Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 25 Sep 2014 23:23:39 +0800 Subject: blk-mq: allocate flush_rq in blk_mq_init_flush() It is reasonable to allocate flush req in blk_mq_init_flush(). Reviewed-by: Christoph Hellwig Signed-off-by: Ming Lei Signed-off-by: Jens Axboe diff --git a/block/blk-flush.c b/block/blk-flush.c index c8e2576..55028a7 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -472,7 +472,16 @@ int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, } EXPORT_SYMBOL(blkdev_issue_flush); -void blk_mq_init_flush(struct request_queue *q) +int blk_mq_init_flush(struct request_queue *q) { + struct blk_mq_tag_set *set = q->tag_set; + spin_lock_init(&q->mq_flush_lock); + + q->flush_rq = kzalloc(round_up(sizeof(struct request) + + set->cmd_size, cache_line_size()), + GFP_KERNEL); + if (!q->flush_rq) + return -ENOMEM; + return 0; } diff --git a/block/blk-mq.c b/block/blk-mq.c index 66ef1fb..78bcf8b 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1848,17 +1848,10 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) if (set->ops->complete) blk_queue_softirq_done(q, set->ops->complete); - blk_mq_init_flush(q); blk_mq_init_cpu_queues(q, set->nr_hw_queues); - q->flush_rq = kzalloc(round_up(sizeof(struct request) + - set->cmd_size, cache_line_size()), - GFP_KERNEL); - if (!q->flush_rq) - goto err_hw; - if (blk_mq_init_hw_queues(q, set)) - goto err_flush_rq; + goto err_hw; mutex_lock(&all_q_mutex); list_add_tail(&q->all_q_node, &all_q_list); @@ -1866,12 +1859,15 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) blk_mq_add_queue_tag_set(set, q); + if (blk_mq_init_flush(q)) + goto err_hw_queues; + blk_mq_map_swqueue(q); return q; -err_flush_rq: - kfree(q->flush_rq); +err_hw_queues: + blk_mq_exit_hw_queues(q, set, set->nr_hw_queues); err_hw: blk_cleanup_queue(q); err_hctxs: diff --git a/block/blk-mq.h b/block/blk-mq.h index a3c613a..ecac69c 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -27,7 +27,7 @@ struct blk_mq_ctx { void __blk_mq_complete_request(struct request *rq); void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); -void blk_mq_init_flush(struct request_queue *q); +int blk_mq_init_flush(struct request_queue *q); void blk_mq_freeze_queue(struct request_queue *q); void blk_mq_free_queue(struct request_queue *q); void blk_mq_clone_flush_request(struct request *flush_rq, -- cgit v0.10.2 From f355265571440a7db16e784b6edf4e7d26971a03 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 25 Sep 2014 23:23:40 +0800 Subject: block: introduce blk_init_flush and its pair These two temporary functions are introduced for holding flush initialization and de-initialization, so that we can introduce 'flush queue' easier in the following patch. And once 'flush queue' and its allocation/free functions are ready, they will be removed for sake of code readability. Reviewed-by: Christoph Hellwig Signed-off-by: Ming Lei Signed-off-by: Jens Axboe diff --git a/block/blk-core.c b/block/blk-core.c index 6946a42..0a9d172 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -705,8 +705,7 @@ blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn, if (!q) return NULL; - q->flush_rq = kzalloc(sizeof(struct request), GFP_KERNEL); - if (!q->flush_rq) + if (blk_init_flush(q)) return NULL; if (blk_init_rl(&q->root_rl, q, GFP_KERNEL)) @@ -742,7 +741,7 @@ blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn, return q; fail: - kfree(q->flush_rq); + blk_exit_flush(q); return NULL; } EXPORT_SYMBOL(blk_init_allocated_queue); diff --git a/block/blk-flush.c b/block/blk-flush.c index 55028a7..c72ab32 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -472,7 +472,7 @@ int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, } EXPORT_SYMBOL(blkdev_issue_flush); -int blk_mq_init_flush(struct request_queue *q) +static int blk_mq_init_flush(struct request_queue *q) { struct blk_mq_tag_set *set = q->tag_set; @@ -485,3 +485,20 @@ int blk_mq_init_flush(struct request_queue *q) return -ENOMEM; return 0; } + +int blk_init_flush(struct request_queue *q) +{ + if (q->mq_ops) + return blk_mq_init_flush(q); + + q->flush_rq = kzalloc(sizeof(struct request), GFP_KERNEL); + if (!q->flush_rq) + return -ENOMEM; + + return 0; +} + +void blk_exit_flush(struct request_queue *q) +{ + kfree(q->flush_rq); +} diff --git a/block/blk-mq.c b/block/blk-mq.c index 78bcf8b..2758cdf 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1859,7 +1859,7 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) blk_mq_add_queue_tag_set(set, q); - if (blk_mq_init_flush(q)) + if (blk_init_flush(q)) goto err_hw_queues; blk_mq_map_swqueue(q); diff --git a/block/blk-mq.h b/block/blk-mq.h index ecac69c..d567d52 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -27,7 +27,6 @@ struct blk_mq_ctx { void __blk_mq_complete_request(struct request *rq); void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); -int blk_mq_init_flush(struct request_queue *q); void blk_mq_freeze_queue(struct request_queue *q); void blk_mq_free_queue(struct request_queue *q); void blk_mq_clone_flush_request(struct request *flush_rq, diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 17f5c84..9490759 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -517,11 +517,11 @@ static void blk_release_queue(struct kobject *kobj) if (q->queue_tags) __blk_queue_free_tags(q); + blk_exit_flush(q); + if (q->mq_ops) blk_mq_free_queue(q); - kfree(q->flush_rq); - blk_trace_shutdown(q); bdi_destroy(&q->backing_dev_info); diff --git a/block/blk.h b/block/blk.h index e515a28..c6fa3d4 100644 --- a/block/blk.h +++ b/block/blk.h @@ -22,6 +22,9 @@ static inline void __blk_get_queue(struct request_queue *q) kobject_get(&q->kobj); } +int blk_init_flush(struct request_queue *q); +void blk_exit_flush(struct request_queue *q); + int blk_init_rl(struct request_list *rl, struct request_queue *q, gfp_t gfp_mask); void blk_exit_rl(struct request_list *rl); -- cgit v0.10.2 From 3c09676c12b1dabf84acbb5849bfc54acadaf092 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 25 Sep 2014 23:23:41 +0800 Subject: block: move flush initialization to blk_flush_init These fields are always used with the flush request, so initialize them together. Reviewed-by: Christoph Hellwig Signed-off-by: Ming Lei Signed-off-by: Jens Axboe diff --git a/block/blk-core.c b/block/blk-core.c index 0a9d172..222fe84 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -600,9 +600,6 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) #ifdef CONFIG_BLK_CGROUP INIT_LIST_HEAD(&q->blkg_list); #endif - INIT_LIST_HEAD(&q->flush_queue[0]); - INIT_LIST_HEAD(&q->flush_queue[1]); - INIT_LIST_HEAD(&q->flush_data_in_flight); INIT_DELAYED_WORK(&q->delay_work, blk_delay_work); kobject_init(&q->kobj, &blk_queue_ktype); diff --git a/block/blk-flush.c b/block/blk-flush.c index c72ab32..a49ffbd 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -488,6 +488,10 @@ static int blk_mq_init_flush(struct request_queue *q) int blk_init_flush(struct request_queue *q) { + INIT_LIST_HEAD(&q->flush_queue[0]); + INIT_LIST_HEAD(&q->flush_queue[1]); + INIT_LIST_HEAD(&q->flush_data_in_flight); + if (q->mq_ops) return blk_mq_init_flush(q); -- cgit v0.10.2 From 7ddab5de5b80d3111f9e6765714e728b2c4f1c07 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 25 Sep 2014 23:23:42 +0800 Subject: block: avoid to use q->flush_rq directly This patch trys to use local variable to access flush request, so that we can convert to per-queue flush machinery a bit easier. Reviewed-by: Christoph Hellwig Signed-off-by: Ming Lei Signed-off-by: Jens Axboe diff --git a/block/blk-flush.c b/block/blk-flush.c index a49ffbd..caf44756 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -223,7 +223,7 @@ static void flush_end_io(struct request *flush_rq, int error) if (q->mq_ops) { spin_lock_irqsave(&q->mq_flush_lock, flags); - q->flush_rq->tag = -1; + flush_rq->tag = -1; } running = &q->flush_queue[q->flush_running_idx]; @@ -281,6 +281,7 @@ static bool blk_kick_flush(struct request_queue *q) struct list_head *pending = &q->flush_queue[q->flush_pending_idx]; struct request *first_rq = list_first_entry(pending, struct request, flush.list); + struct request *flush_rq = q->flush_rq; /* C1 described at the top of this file */ if (q->flush_pending_idx != q->flush_running_idx || list_empty(pending)) @@ -298,16 +299,16 @@ static bool blk_kick_flush(struct request_queue *q) */ q->flush_pending_idx ^= 1; - blk_rq_init(q, q->flush_rq); + blk_rq_init(q, flush_rq); if (q->mq_ops) - blk_mq_clone_flush_request(q->flush_rq, first_rq); + blk_mq_clone_flush_request(flush_rq, first_rq); - q->flush_rq->cmd_type = REQ_TYPE_FS; - q->flush_rq->cmd_flags = WRITE_FLUSH | REQ_FLUSH_SEQ; - q->flush_rq->rq_disk = first_rq->rq_disk; - q->flush_rq->end_io = flush_end_io; + flush_rq->cmd_type = REQ_TYPE_FS; + flush_rq->cmd_flags = WRITE_FLUSH | REQ_FLUSH_SEQ; + flush_rq->rq_disk = first_rq->rq_disk; + flush_rq->end_io = flush_end_io; - return blk_flush_queue_rq(q->flush_rq, false); + return blk_flush_queue_rq(flush_rq, false); } static void flush_data_end_io(struct request *rq, int error) -- cgit v0.10.2 From 7c94e1c157a227837b04f02f5edeff8301410ba2 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 25 Sep 2014 23:23:43 +0800 Subject: block: introduce blk_flush_queue to drive flush machinery This patch introduces 'struct blk_flush_queue' and puts all flush machinery related fields into this structure, so that - flush implementation details aren't exposed to driver - it is easy to convert to per dispatch-queue flush machinery This patch is basically a mechanical replacement. Reviewed-by: Christoph Hellwig Signed-off-by: Ming Lei Signed-off-by: Jens Axboe diff --git a/block/blk-core.c b/block/blk-core.c index 222fe84..cfaca8c 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -390,11 +390,13 @@ static void __blk_drain_queue(struct request_queue *q, bool drain_all) * be drained. Check all the queues and counters. */ if (drain_all) { + struct blk_flush_queue *fq = blk_get_flush_queue(q); drain |= !list_empty(&q->queue_head); for (i = 0; i < 2; i++) { drain |= q->nr_rqs[i]; drain |= q->in_flight[i]; - drain |= !list_empty(&q->flush_queue[i]); + if (fq) + drain |= !list_empty(&fq->flush_queue[i]); } } diff --git a/block/blk-flush.c b/block/blk-flush.c index caf44756..b01a86d 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -28,7 +28,7 @@ * * The actual execution of flush is double buffered. Whenever a request * needs to execute PRE or POSTFLUSH, it queues at - * q->flush_queue[q->flush_pending_idx]. Once certain criteria are met, a + * fq->flush_queue[fq->flush_pending_idx]. Once certain criteria are met, a * flush is issued and the pending_idx is toggled. When the flush * completes, all the requests which were pending are proceeded to the next * step. This allows arbitrary merging of different types of FLUSH/FUA @@ -155,7 +155,7 @@ static bool blk_flush_queue_rq(struct request *rq, bool add_front) * completion and trigger the next step. * * CONTEXT: - * spin_lock_irq(q->queue_lock or q->mq_flush_lock) + * spin_lock_irq(q->queue_lock or fq->mq_flush_lock) * * RETURNS: * %true if requests were added to the dispatch queue, %false otherwise. @@ -164,7 +164,8 @@ static bool blk_flush_complete_seq(struct request *rq, unsigned int seq, int error) { struct request_queue *q = rq->q; - struct list_head *pending = &q->flush_queue[q->flush_pending_idx]; + struct blk_flush_queue *fq = blk_get_flush_queue(q); + struct list_head *pending = &fq->flush_queue[fq->flush_pending_idx]; bool queued = false, kicked; BUG_ON(rq->flush.seq & seq); @@ -180,12 +181,12 @@ static bool blk_flush_complete_seq(struct request *rq, unsigned int seq, case REQ_FSEQ_POSTFLUSH: /* queue for flush */ if (list_empty(pending)) - q->flush_pending_since = jiffies; + fq->flush_pending_since = jiffies; list_move_tail(&rq->flush.list, pending); break; case REQ_FSEQ_DATA: - list_move_tail(&rq->flush.list, &q->flush_data_in_flight); + list_move_tail(&rq->flush.list, &fq->flush_data_in_flight); queued = blk_flush_queue_rq(rq, true); break; @@ -220,17 +221,18 @@ static void flush_end_io(struct request *flush_rq, int error) bool queued = false; struct request *rq, *n; unsigned long flags = 0; + struct blk_flush_queue *fq = blk_get_flush_queue(q); if (q->mq_ops) { - spin_lock_irqsave(&q->mq_flush_lock, flags); + spin_lock_irqsave(&fq->mq_flush_lock, flags); flush_rq->tag = -1; } - running = &q->flush_queue[q->flush_running_idx]; - BUG_ON(q->flush_pending_idx == q->flush_running_idx); + running = &fq->flush_queue[fq->flush_running_idx]; + BUG_ON(fq->flush_pending_idx == fq->flush_running_idx); /* account completion of the flush request */ - q->flush_running_idx ^= 1; + fq->flush_running_idx ^= 1; if (!q->mq_ops) elv_completed_request(q, flush_rq); @@ -254,13 +256,13 @@ static void flush_end_io(struct request *flush_rq, int error) * directly into request_fn may confuse the driver. Always use * kblockd. */ - if (queued || q->flush_queue_delayed) { + if (queued || fq->flush_queue_delayed) { WARN_ON(q->mq_ops); blk_run_queue_async(q); } - q->flush_queue_delayed = 0; + fq->flush_queue_delayed = 0; if (q->mq_ops) - spin_unlock_irqrestore(&q->mq_flush_lock, flags); + spin_unlock_irqrestore(&fq->mq_flush_lock, flags); } /** @@ -271,33 +273,34 @@ static void flush_end_io(struct request *flush_rq, int error) * Please read the comment at the top of this file for more info. * * CONTEXT: - * spin_lock_irq(q->queue_lock or q->mq_flush_lock) + * spin_lock_irq(q->queue_lock or fq->mq_flush_lock) * * RETURNS: * %true if flush was issued, %false otherwise. */ static bool blk_kick_flush(struct request_queue *q) { - struct list_head *pending = &q->flush_queue[q->flush_pending_idx]; + struct blk_flush_queue *fq = blk_get_flush_queue(q); + struct list_head *pending = &fq->flush_queue[fq->flush_pending_idx]; struct request *first_rq = list_first_entry(pending, struct request, flush.list); - struct request *flush_rq = q->flush_rq; + struct request *flush_rq = fq->flush_rq; /* C1 described at the top of this file */ - if (q->flush_pending_idx != q->flush_running_idx || list_empty(pending)) + if (fq->flush_pending_idx != fq->flush_running_idx || list_empty(pending)) return false; /* C2 and C3 */ - if (!list_empty(&q->flush_data_in_flight) && + if (!list_empty(&fq->flush_data_in_flight) && time_before(jiffies, - q->flush_pending_since + FLUSH_PENDING_TIMEOUT)) + fq->flush_pending_since + FLUSH_PENDING_TIMEOUT)) return false; /* * Issue flush and toggle pending_idx. This makes pending_idx * different from running_idx, which means flush is in flight. */ - q->flush_pending_idx ^= 1; + fq->flush_pending_idx ^= 1; blk_rq_init(q, flush_rq); if (q->mq_ops) @@ -329,6 +332,7 @@ static void mq_flush_data_end_io(struct request *rq, int error) struct blk_mq_hw_ctx *hctx; struct blk_mq_ctx *ctx; unsigned long flags; + struct blk_flush_queue *fq = blk_get_flush_queue(q); ctx = rq->mq_ctx; hctx = q->mq_ops->map_queue(q, ctx->cpu); @@ -337,10 +341,10 @@ static void mq_flush_data_end_io(struct request *rq, int error) * After populating an empty queue, kick it to avoid stall. Read * the comment in flush_end_io(). */ - spin_lock_irqsave(&q->mq_flush_lock, flags); + spin_lock_irqsave(&fq->mq_flush_lock, flags); if (blk_flush_complete_seq(rq, REQ_FSEQ_DATA, error)) blk_mq_run_hw_queue(hctx, true); - spin_unlock_irqrestore(&q->mq_flush_lock, flags); + spin_unlock_irqrestore(&fq->mq_flush_lock, flags); } /** @@ -408,11 +412,13 @@ void blk_insert_flush(struct request *rq) rq->cmd_flags |= REQ_FLUSH_SEQ; rq->flush.saved_end_io = rq->end_io; /* Usually NULL */ if (q->mq_ops) { + struct blk_flush_queue *fq = blk_get_flush_queue(q); + rq->end_io = mq_flush_data_end_io; - spin_lock_irq(&q->mq_flush_lock); + spin_lock_irq(&fq->mq_flush_lock); blk_flush_complete_seq(rq, REQ_FSEQ_ACTIONS & ~policy, 0); - spin_unlock_irq(&q->mq_flush_lock); + spin_unlock_irq(&fq->mq_flush_lock); return; } rq->end_io = flush_data_end_io; @@ -473,31 +479,52 @@ int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, } EXPORT_SYMBOL(blkdev_issue_flush); -static int blk_mq_init_flush(struct request_queue *q) +static struct blk_flush_queue *blk_alloc_flush_queue( + struct request_queue *q) { - struct blk_mq_tag_set *set = q->tag_set; + struct blk_flush_queue *fq; + int rq_sz = sizeof(struct request); - spin_lock_init(&q->mq_flush_lock); + fq = kzalloc(sizeof(*fq), GFP_KERNEL); + if (!fq) + goto fail; - q->flush_rq = kzalloc(round_up(sizeof(struct request) + - set->cmd_size, cache_line_size()), - GFP_KERNEL); - if (!q->flush_rq) - return -ENOMEM; - return 0; + if (q->mq_ops) { + spin_lock_init(&fq->mq_flush_lock); + rq_sz = round_up(rq_sz + q->tag_set->cmd_size, + cache_line_size()); + } + + fq->flush_rq = kzalloc(rq_sz, GFP_KERNEL); + if (!fq->flush_rq) + goto fail_rq; + + INIT_LIST_HEAD(&fq->flush_queue[0]); + INIT_LIST_HEAD(&fq->flush_queue[1]); + INIT_LIST_HEAD(&fq->flush_data_in_flight); + + return fq; + + fail_rq: + kfree(fq); + fail: + return NULL; } -int blk_init_flush(struct request_queue *q) +static void blk_free_flush_queue(struct blk_flush_queue *fq) { - INIT_LIST_HEAD(&q->flush_queue[0]); - INIT_LIST_HEAD(&q->flush_queue[1]); - INIT_LIST_HEAD(&q->flush_data_in_flight); + /* bio based request queue hasn't flush queue */ + if (!fq) + return; - if (q->mq_ops) - return blk_mq_init_flush(q); + kfree(fq->flush_rq); + kfree(fq); +} - q->flush_rq = kzalloc(sizeof(struct request), GFP_KERNEL); - if (!q->flush_rq) +int blk_init_flush(struct request_queue *q) +{ + q->fq = blk_alloc_flush_queue(q); + if (!q->fq) return -ENOMEM; return 0; @@ -505,5 +532,5 @@ int blk_init_flush(struct request_queue *q) void blk_exit_flush(struct request_queue *q) { - kfree(q->flush_rq); + blk_free_flush_queue(q->fq); } diff --git a/block/blk-mq.c b/block/blk-mq.c index 2758cdf..d39e8a5 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -508,20 +508,22 @@ void blk_mq_kick_requeue_list(struct request_queue *q) } EXPORT_SYMBOL(blk_mq_kick_requeue_list); -static inline bool is_flush_request(struct request *rq, unsigned int tag) +static inline bool is_flush_request(struct request *rq, + struct blk_flush_queue *fq, unsigned int tag) { return ((rq->cmd_flags & REQ_FLUSH_SEQ) && - rq->q->flush_rq->tag == tag); + fq->flush_rq->tag == tag); } struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag) { struct request *rq = tags->rqs[tag]; + struct blk_flush_queue *fq = blk_get_flush_queue(rq->q); - if (!is_flush_request(rq, tag)) + if (!is_flush_request(rq, fq, tag)) return rq; - return rq->q->flush_rq; + return fq->flush_rq; } EXPORT_SYMBOL(blk_mq_tag_to_rq); diff --git a/block/blk.h b/block/blk.h index c6fa3d4..833c4ac 100644 --- a/block/blk.h +++ b/block/blk.h @@ -12,11 +12,28 @@ /* Max future timer expiry for timeouts */ #define BLK_MAX_TIMEOUT (5 * HZ) +struct blk_flush_queue { + unsigned int flush_queue_delayed:1; + unsigned int flush_pending_idx:1; + unsigned int flush_running_idx:1; + unsigned long flush_pending_since; + struct list_head flush_queue[2]; + struct list_head flush_data_in_flight; + struct request *flush_rq; + spinlock_t mq_flush_lock; +}; + extern struct kmem_cache *blk_requestq_cachep; extern struct kmem_cache *request_cachep; extern struct kobj_type blk_queue_ktype; extern struct ida blk_queue_ida; +static inline struct blk_flush_queue *blk_get_flush_queue( + struct request_queue *q) +{ + return q->fq; +} + static inline void __blk_get_queue(struct request_queue *q) { kobject_get(&q->kobj); @@ -89,6 +106,7 @@ void blk_insert_flush(struct request *rq); static inline struct request *__elv_next_request(struct request_queue *q) { struct request *rq; + struct blk_flush_queue *fq = blk_get_flush_queue(q); while (1) { if (!list_empty(&q->queue_head)) { @@ -111,9 +129,9 @@ static inline struct request *__elv_next_request(struct request_queue *q) * should be restarted later. Please see flush_end_io() for * details. */ - if (q->flush_pending_idx != q->flush_running_idx && + if (fq->flush_pending_idx != fq->flush_running_idx && !queue_flush_queueable(q)) { - q->flush_queue_delayed = 1; + fq->flush_queue_delayed = 1; return NULL; } if (unlikely(blk_queue_bypass(q)) || diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index e267bf0..49f3461 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -36,6 +36,7 @@ struct request; struct sg_io_hdr; struct bsg_job; struct blkcg_gq; +struct blk_flush_queue; #define BLKDEV_MIN_RQ 4 #define BLKDEV_MAX_RQ 128 /* Default maximum */ @@ -455,14 +456,7 @@ struct request_queue { */ unsigned int flush_flags; unsigned int flush_not_queueable:1; - unsigned int flush_queue_delayed:1; - unsigned int flush_pending_idx:1; - unsigned int flush_running_idx:1; - unsigned long flush_pending_since; - struct list_head flush_queue[2]; - struct list_head flush_data_in_flight; - struct request *flush_rq; - spinlock_t mq_flush_lock; + struct blk_flush_queue *fq; struct list_head requeue_list; spinlock_t requeue_lock; -- cgit v0.10.2 From ba483388e3058b3e412632a84e6bf1f134beaf3d Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 25 Sep 2014 23:23:44 +0800 Subject: block: remove blk_init_flush() and its pair Now mission of the two helpers is over, and just call blk_alloc_flush_queue() and blk_free_flush_queue() directly. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe diff --git a/block/blk-core.c b/block/blk-core.c index cfaca8c..dba0a83 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -704,7 +704,8 @@ blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn, if (!q) return NULL; - if (blk_init_flush(q)) + q->fq = blk_alloc_flush_queue(q); + if (!q->fq) return NULL; if (blk_init_rl(&q->root_rl, q, GFP_KERNEL)) @@ -740,7 +741,7 @@ blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn, return q; fail: - blk_exit_flush(q); + blk_free_flush_queue(q->fq); return NULL; } EXPORT_SYMBOL(blk_init_allocated_queue); diff --git a/block/blk-flush.c b/block/blk-flush.c index b01a86d..d66cbf2 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -479,8 +479,7 @@ int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, } EXPORT_SYMBOL(blkdev_issue_flush); -static struct blk_flush_queue *blk_alloc_flush_queue( - struct request_queue *q) +struct blk_flush_queue *blk_alloc_flush_queue(struct request_queue *q) { struct blk_flush_queue *fq; int rq_sz = sizeof(struct request); @@ -511,7 +510,7 @@ static struct blk_flush_queue *blk_alloc_flush_queue( return NULL; } -static void blk_free_flush_queue(struct blk_flush_queue *fq) +void blk_free_flush_queue(struct blk_flush_queue *fq) { /* bio based request queue hasn't flush queue */ if (!fq) @@ -520,17 +519,3 @@ static void blk_free_flush_queue(struct blk_flush_queue *fq) kfree(fq->flush_rq); kfree(fq); } - -int blk_init_flush(struct request_queue *q) -{ - q->fq = blk_alloc_flush_queue(q); - if (!q->fq) - return -ENOMEM; - - return 0; -} - -void blk_exit_flush(struct request_queue *q) -{ - blk_free_flush_queue(q->fq); -} diff --git a/block/blk-mq.c b/block/blk-mq.c index d39e8a5..59ca796 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1861,7 +1861,8 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) blk_mq_add_queue_tag_set(set, q); - if (blk_init_flush(q)) + q->fq = blk_alloc_flush_queue(q); + if (!q->fq) goto err_hw_queues; blk_mq_map_swqueue(q); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 9490759..718cffc4c 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -517,7 +517,7 @@ static void blk_release_queue(struct kobject *kobj) if (q->queue_tags) __blk_queue_free_tags(q); - blk_exit_flush(q); + blk_free_flush_queue(q->fq); if (q->mq_ops) blk_mq_free_queue(q); diff --git a/block/blk.h b/block/blk.h index 833c4ac..9eaa6e9 100644 --- a/block/blk.h +++ b/block/blk.h @@ -39,8 +39,8 @@ static inline void __blk_get_queue(struct request_queue *q) kobject_get(&q->kobj); } -int blk_init_flush(struct request_queue *q); -void blk_exit_flush(struct request_queue *q); +struct blk_flush_queue *blk_alloc_flush_queue(struct request_queue *q); +void blk_free_flush_queue(struct blk_flush_queue *fq); int blk_init_rl(struct request_list *rl, struct request_queue *q, gfp_t gfp_mask); -- cgit v0.10.2 From 0bae352da54a95435f721705d3670a6eaefdcf87 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 25 Sep 2014 23:23:45 +0800 Subject: block: flush: avoid to figure out flush queue unnecessarily Just figuring out flush queue at the entry of kicking off flush machinery and request's completion handler, then pass it through. Reviewed-by: Christoph Hellwig Signed-off-by: Ming Lei Signed-off-by: Jens Axboe diff --git a/block/blk-flush.c b/block/blk-flush.c index d66cbf2..9bc5b4f 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -91,7 +91,8 @@ enum { FLUSH_PENDING_TIMEOUT = 5 * HZ, }; -static bool blk_kick_flush(struct request_queue *q); +static bool blk_kick_flush(struct request_queue *q, + struct blk_flush_queue *fq); static unsigned int blk_flush_policy(unsigned int fflags, struct request *rq) { @@ -148,6 +149,7 @@ static bool blk_flush_queue_rq(struct request *rq, bool add_front) /** * blk_flush_complete_seq - complete flush sequence * @rq: FLUSH/FUA request being sequenced + * @fq: flush queue * @seq: sequences to complete (mask of %REQ_FSEQ_*, can be zero) * @error: whether an error occurred * @@ -160,11 +162,11 @@ static bool blk_flush_queue_rq(struct request *rq, bool add_front) * RETURNS: * %true if requests were added to the dispatch queue, %false otherwise. */ -static bool blk_flush_complete_seq(struct request *rq, unsigned int seq, - int error) +static bool blk_flush_complete_seq(struct request *rq, + struct blk_flush_queue *fq, + unsigned int seq, int error) { struct request_queue *q = rq->q; - struct blk_flush_queue *fq = blk_get_flush_queue(q); struct list_head *pending = &fq->flush_queue[fq->flush_pending_idx]; bool queued = false, kicked; @@ -210,7 +212,7 @@ static bool blk_flush_complete_seq(struct request *rq, unsigned int seq, BUG(); } - kicked = blk_kick_flush(q); + kicked = blk_kick_flush(q, fq); return kicked | queued; } @@ -242,7 +244,7 @@ static void flush_end_io(struct request *flush_rq, int error) unsigned int seq = blk_flush_cur_seq(rq); BUG_ON(seq != REQ_FSEQ_PREFLUSH && seq != REQ_FSEQ_POSTFLUSH); - queued |= blk_flush_complete_seq(rq, seq, error); + queued |= blk_flush_complete_seq(rq, fq, seq, error); } /* @@ -268,6 +270,7 @@ static void flush_end_io(struct request *flush_rq, int error) /** * blk_kick_flush - consider issuing flush request * @q: request_queue being kicked + * @fq: flush queue * * Flush related states of @q have changed, consider issuing flush request. * Please read the comment at the top of this file for more info. @@ -278,9 +281,8 @@ static void flush_end_io(struct request *flush_rq, int error) * RETURNS: * %true if flush was issued, %false otherwise. */ -static bool blk_kick_flush(struct request_queue *q) +static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq) { - struct blk_flush_queue *fq = blk_get_flush_queue(q); struct list_head *pending = &fq->flush_queue[fq->flush_pending_idx]; struct request *first_rq = list_first_entry(pending, struct request, flush.list); @@ -317,12 +319,13 @@ static bool blk_kick_flush(struct request_queue *q) static void flush_data_end_io(struct request *rq, int error) { struct request_queue *q = rq->q; + struct blk_flush_queue *fq = blk_get_flush_queue(q); /* * After populating an empty queue, kick it to avoid stall. Read * the comment in flush_end_io(). */ - if (blk_flush_complete_seq(rq, REQ_FSEQ_DATA, error)) + if (blk_flush_complete_seq(rq, fq, REQ_FSEQ_DATA, error)) blk_run_queue_async(q); } @@ -342,7 +345,7 @@ static void mq_flush_data_end_io(struct request *rq, int error) * the comment in flush_end_io(). */ spin_lock_irqsave(&fq->mq_flush_lock, flags); - if (blk_flush_complete_seq(rq, REQ_FSEQ_DATA, error)) + if (blk_flush_complete_seq(rq, fq, REQ_FSEQ_DATA, error)) blk_mq_run_hw_queue(hctx, true); spin_unlock_irqrestore(&fq->mq_flush_lock, flags); } @@ -364,6 +367,7 @@ void blk_insert_flush(struct request *rq) struct request_queue *q = rq->q; unsigned int fflags = q->flush_flags; /* may change, cache */ unsigned int policy = blk_flush_policy(fflags, rq); + struct blk_flush_queue *fq = blk_get_flush_queue(q); /* * @policy now records what operations need to be done. Adjust @@ -412,18 +416,16 @@ void blk_insert_flush(struct request *rq) rq->cmd_flags |= REQ_FLUSH_SEQ; rq->flush.saved_end_io = rq->end_io; /* Usually NULL */ if (q->mq_ops) { - struct blk_flush_queue *fq = blk_get_flush_queue(q); - rq->end_io = mq_flush_data_end_io; spin_lock_irq(&fq->mq_flush_lock); - blk_flush_complete_seq(rq, REQ_FSEQ_ACTIONS & ~policy, 0); + blk_flush_complete_seq(rq, fq, REQ_FSEQ_ACTIONS & ~policy, 0); spin_unlock_irq(&fq->mq_flush_lock); return; } rq->end_io = flush_data_end_io; - blk_flush_complete_seq(rq, REQ_FSEQ_ACTIONS & ~policy, 0); + blk_flush_complete_seq(rq, fq, REQ_FSEQ_ACTIONS & ~policy, 0); } /** -- cgit v0.10.2 From e97c293cdf77263abdc021de280516e0017afc84 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 25 Sep 2014 23:23:46 +0800 Subject: block: introduce 'blk_mq_ctx' parameter to blk_get_flush_queue This patch adds 'blk_mq_ctx' parameter to blk_get_flush_queue(), so that this function can find the corresponding blk_flush_queue bound with current mq context since the flush queue will become per hw-queue. For legacy queue, the parameter can be simply 'NULL'. For multiqueue case, the parameter should be set as the context from which the related request is originated. With this context info, the hw queue and related flush queue can be found easily. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe diff --git a/block/blk-core.c b/block/blk-core.c index dba0a83..b1dd4e0 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -390,7 +390,7 @@ static void __blk_drain_queue(struct request_queue *q, bool drain_all) * be drained. Check all the queues and counters. */ if (drain_all) { - struct blk_flush_queue *fq = blk_get_flush_queue(q); + struct blk_flush_queue *fq = blk_get_flush_queue(q, NULL); drain |= !list_empty(&q->queue_head); for (i = 0; i < 2; i++) { drain |= q->nr_rqs[i]; diff --git a/block/blk-flush.c b/block/blk-flush.c index 9bc5b4f..004d95e 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -223,7 +223,7 @@ static void flush_end_io(struct request *flush_rq, int error) bool queued = false; struct request *rq, *n; unsigned long flags = 0; - struct blk_flush_queue *fq = blk_get_flush_queue(q); + struct blk_flush_queue *fq = blk_get_flush_queue(q, flush_rq->mq_ctx); if (q->mq_ops) { spin_lock_irqsave(&fq->mq_flush_lock, flags); @@ -319,7 +319,7 @@ static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq) static void flush_data_end_io(struct request *rq, int error) { struct request_queue *q = rq->q; - struct blk_flush_queue *fq = blk_get_flush_queue(q); + struct blk_flush_queue *fq = blk_get_flush_queue(q, NULL); /* * After populating an empty queue, kick it to avoid stall. Read @@ -333,11 +333,10 @@ static void mq_flush_data_end_io(struct request *rq, int error) { struct request_queue *q = rq->q; struct blk_mq_hw_ctx *hctx; - struct blk_mq_ctx *ctx; + struct blk_mq_ctx *ctx = rq->mq_ctx; unsigned long flags; - struct blk_flush_queue *fq = blk_get_flush_queue(q); + struct blk_flush_queue *fq = blk_get_flush_queue(q, ctx); - ctx = rq->mq_ctx; hctx = q->mq_ops->map_queue(q, ctx->cpu); /* @@ -367,7 +366,7 @@ void blk_insert_flush(struct request *rq) struct request_queue *q = rq->q; unsigned int fflags = q->flush_flags; /* may change, cache */ unsigned int policy = blk_flush_policy(fflags, rq); - struct blk_flush_queue *fq = blk_get_flush_queue(q); + struct blk_flush_queue *fq = blk_get_flush_queue(q, rq->mq_ctx); /* * @policy now records what operations need to be done. Adjust diff --git a/block/blk-mq.c b/block/blk-mq.c index 59ca796..53b6def1 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -518,7 +518,8 @@ static inline bool is_flush_request(struct request *rq, struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag) { struct request *rq = tags->rqs[tag]; - struct blk_flush_queue *fq = blk_get_flush_queue(rq->q); + /* mq_ctx of flush rq is always cloned from the corresponding req */ + struct blk_flush_queue *fq = blk_get_flush_queue(rq->q, rq->mq_ctx); if (!is_flush_request(rq, fq, tag)) return rq; diff --git a/block/blk.h b/block/blk.h index 9eaa6e9..7ecdd85 100644 --- a/block/blk.h +++ b/block/blk.h @@ -29,7 +29,7 @@ extern struct kobj_type blk_queue_ktype; extern struct ida blk_queue_ida; static inline struct blk_flush_queue *blk_get_flush_queue( - struct request_queue *q) + struct request_queue *q, struct blk_mq_ctx *ctx) { return q->fq; } @@ -106,7 +106,7 @@ void blk_insert_flush(struct request *rq); static inline struct request *__elv_next_request(struct request_queue *q) { struct request *rq; - struct blk_flush_queue *fq = blk_get_flush_queue(q); + struct blk_flush_queue *fq = blk_get_flush_queue(q, NULL); while (1) { if (!list_empty(&q->queue_head)) { -- cgit v0.10.2 From f70ced09170761acb69840cafaace4abc72cba4b Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 25 Sep 2014 23:23:47 +0800 Subject: blk-mq: support per-distpatch_queue flush machinery This patch supports to run one single flush machinery for each blk-mq dispatch queue, so that: - current init_request and exit_request callbacks can cover flush request too, then the buggy copying way of initializing flush request's pdu can be fixed - flushing performance gets improved in case of multi hw-queue In fio sync write test over virtio-blk(4 hw queues, ioengine=sync, iodepth=64, numjobs=4, bs=4K), it is observed that througput gets increased a lot over my test environment: - throughput: +70% in case of virtio-blk over null_blk - throughput: +30% in case of virtio-blk over SSD image The multi virtqueue feature isn't merged to QEMU yet, and patches for the feature can be found in below tree: git://kernel.ubuntu.com/ming/qemu.git v2.1.0-mq.4 And simply passing 'num_queues=4 vectors=5' should be enough to enable multi queue(quad queue) feature for QEMU virtio-blk. Suggested-by: Christoph Hellwig Signed-off-by: Ming Lei Signed-off-by: Jens Axboe diff --git a/block/blk-core.c b/block/blk-core.c index b1dd4e0..e1c2775 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -704,7 +704,7 @@ blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn, if (!q) return NULL; - q->fq = blk_alloc_flush_queue(q); + q->fq = blk_alloc_flush_queue(q, NUMA_NO_NODE, 0); if (!q->fq) return NULL; diff --git a/block/blk-flush.c b/block/blk-flush.c index 004d95e..20badd7 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -305,8 +305,15 @@ static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq) fq->flush_pending_idx ^= 1; blk_rq_init(q, flush_rq); - if (q->mq_ops) - blk_mq_clone_flush_request(flush_rq, first_rq); + + /* + * Borrow tag from the first request since they can't + * be in flight at the same time. + */ + if (q->mq_ops) { + flush_rq->mq_ctx = first_rq->mq_ctx; + flush_rq->tag = first_rq->tag; + } flush_rq->cmd_type = REQ_TYPE_FS; flush_rq->cmd_flags = WRITE_FLUSH | REQ_FLUSH_SEQ; @@ -480,22 +487,22 @@ int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, } EXPORT_SYMBOL(blkdev_issue_flush); -struct blk_flush_queue *blk_alloc_flush_queue(struct request_queue *q) +struct blk_flush_queue *blk_alloc_flush_queue(struct request_queue *q, + int node, int cmd_size) { struct blk_flush_queue *fq; int rq_sz = sizeof(struct request); - fq = kzalloc(sizeof(*fq), GFP_KERNEL); + fq = kzalloc_node(sizeof(*fq), GFP_KERNEL, node); if (!fq) goto fail; if (q->mq_ops) { spin_lock_init(&fq->mq_flush_lock); - rq_sz = round_up(rq_sz + q->tag_set->cmd_size, - cache_line_size()); + rq_sz = round_up(rq_sz + cmd_size, cache_line_size()); } - fq->flush_rq = kzalloc(rq_sz, GFP_KERNEL); + fq->flush_rq = kzalloc_node(rq_sz, GFP_KERNEL, node); if (!fq->flush_rq) goto fail_rq; diff --git a/block/blk-mq.c b/block/blk-mq.c index 53b6def1..4e7a314 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -281,26 +281,6 @@ void blk_mq_free_request(struct request *rq) __blk_mq_free_request(hctx, ctx, rq); } -/* - * Clone all relevant state from a request that has been put on hold in - * the flush state machine into the preallocated flush request that hangs - * off the request queue. - * - * For a driver the flush request should be invisible, that's why we are - * impersonating the original request here. - */ -void blk_mq_clone_flush_request(struct request *flush_rq, - struct request *orig_rq) -{ - struct blk_mq_hw_ctx *hctx = - orig_rq->q->mq_ops->map_queue(orig_rq->q, orig_rq->mq_ctx->cpu); - - flush_rq->mq_ctx = orig_rq->mq_ctx; - flush_rq->tag = orig_rq->tag; - memcpy(blk_mq_rq_to_pdu(flush_rq), blk_mq_rq_to_pdu(orig_rq), - hctx->cmd_size); -} - inline void __blk_mq_end_request(struct request *rq, int error) { blk_account_io_done(rq); @@ -1516,12 +1496,20 @@ static void blk_mq_exit_hctx(struct request_queue *q, struct blk_mq_tag_set *set, struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) { + unsigned flush_start_tag = set->queue_depth; + blk_mq_tag_idle(hctx); + if (set->ops->exit_request) + set->ops->exit_request(set->driver_data, + hctx->fq->flush_rq, hctx_idx, + flush_start_tag + hctx_idx); + if (set->ops->exit_hctx) set->ops->exit_hctx(hctx, hctx_idx); blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier); + blk_free_flush_queue(hctx->fq); kfree(hctx->ctxs); blk_mq_free_bitmap(&hctx->ctx_map); } @@ -1556,6 +1544,7 @@ static int blk_mq_init_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx, unsigned hctx_idx) { int node; + unsigned flush_start_tag = set->queue_depth; node = hctx->numa_node; if (node == NUMA_NO_NODE) @@ -1594,8 +1583,23 @@ static int blk_mq_init_hctx(struct request_queue *q, set->ops->init_hctx(hctx, set->driver_data, hctx_idx)) goto free_bitmap; + hctx->fq = blk_alloc_flush_queue(q, hctx->numa_node, set->cmd_size); + if (!hctx->fq) + goto exit_hctx; + + if (set->ops->init_request && + set->ops->init_request(set->driver_data, + hctx->fq->flush_rq, hctx_idx, + flush_start_tag + hctx_idx, node)) + goto free_fq; + return 0; + free_fq: + kfree(hctx->fq); + exit_hctx: + if (set->ops->exit_hctx) + set->ops->exit_hctx(hctx, hctx_idx); free_bitmap: blk_mq_free_bitmap(&hctx->ctx_map); free_ctxs: @@ -1862,16 +1866,10 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) blk_mq_add_queue_tag_set(set, q); - q->fq = blk_alloc_flush_queue(q); - if (!q->fq) - goto err_hw_queues; - blk_mq_map_swqueue(q); return q; -err_hw_queues: - blk_mq_exit_hw_queues(q, set, set->nr_hw_queues); err_hw: blk_cleanup_queue(q); err_hctxs: diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 718cffc4c..e8f38a3 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -517,10 +517,10 @@ static void blk_release_queue(struct kobject *kobj) if (q->queue_tags) __blk_queue_free_tags(q); - blk_free_flush_queue(q->fq); - if (q->mq_ops) blk_mq_free_queue(q); + else + blk_free_flush_queue(q->fq); blk_trace_shutdown(q); diff --git a/block/blk.h b/block/blk.h index 7ecdd85..43b0361 100644 --- a/block/blk.h +++ b/block/blk.h @@ -2,6 +2,8 @@ #define BLK_INTERNAL_H #include +#include +#include "blk-mq.h" /* Amount of time in which a process may batch requests */ #define BLK_BATCH_TIME (HZ/50UL) @@ -31,7 +33,14 @@ extern struct ida blk_queue_ida; static inline struct blk_flush_queue *blk_get_flush_queue( struct request_queue *q, struct blk_mq_ctx *ctx) { - return q->fq; + struct blk_mq_hw_ctx *hctx; + + if (!q->mq_ops) + return q->fq; + + hctx = q->mq_ops->map_queue(q, ctx->cpu); + + return hctx->fq; } static inline void __blk_get_queue(struct request_queue *q) @@ -39,8 +48,9 @@ static inline void __blk_get_queue(struct request_queue *q) kobject_get(&q->kobj); } -struct blk_flush_queue *blk_alloc_flush_queue(struct request_queue *q); -void blk_free_flush_queue(struct blk_flush_queue *fq); +struct blk_flush_queue *blk_alloc_flush_queue(struct request_queue *q, + int node, int cmd_size); +void blk_free_flush_queue(struct blk_flush_queue *q); int blk_init_rl(struct request_list *rl, struct request_queue *q, gfp_t gfp_mask); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 3253495..02c5d95 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -4,6 +4,7 @@ #include struct blk_mq_tags; +struct blk_flush_queue; struct blk_mq_cpu_notifier { struct list_head list; @@ -34,6 +35,7 @@ struct blk_mq_hw_ctx { struct request_queue *queue; unsigned int queue_num; + struct blk_flush_queue *fq; void *driver_data; @@ -119,6 +121,10 @@ struct blk_mq_ops { /* * Called for every command allocated by the block layer to allow * the driver to set up driver specific data. + * + * Tag greater than or equal to queue_depth is for setting up + * flush request. + * * Ditto for exit/teardown. */ init_request_fn *init_request; -- cgit v0.10.2 From e7258c1a269e0967856c81d182c286a78f5ecf15 Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Fri, 26 Sep 2014 19:19:55 -0400 Subject: block: Get rid of bdev_integrity_enabled() bdev_integrity_enabled() is only used by bio_integrity_enabled(). Combine these two functions. Signed-off-by: Martin K. Petersen Reviewed-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Jens Axboe diff --git a/Documentation/block/data-integrity.txt b/Documentation/block/data-integrity.txt index 2d735b0a..b4eacf4 100644 --- a/Documentation/block/data-integrity.txt +++ b/Documentation/block/data-integrity.txt @@ -192,16 +192,6 @@ will require extra work due to the application tag. supported by the block device. - int bdev_integrity_enabled(block_device, int rw); - - bdev_integrity_enabled() will return 1 if the block device - supports integrity metadata transfer for the data direction - specified in 'rw'. - - bdev_integrity_enabled() honors the write_generate and - read_verify flags in sysfs and will respond accordingly. - - int bio_integrity_prep(bio); To generate IMD for WRITE and to set up buffers for READ, the diff --git a/block/bio-integrity.c b/block/bio-integrity.c index f14b4ab..36b7885 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -147,24 +147,6 @@ int bio_integrity_add_page(struct bio *bio, struct page *page, } EXPORT_SYMBOL(bio_integrity_add_page); -static int bdev_integrity_enabled(struct block_device *bdev, int rw) -{ - struct blk_integrity *bi = bdev_get_integrity(bdev); - - if (bi == NULL) - return 0; - - if (rw == READ && bi->verify_fn != NULL && - (bi->flags & INTEGRITY_FLAG_READ)) - return 1; - - if (rw == WRITE && bi->generate_fn != NULL && - (bi->flags & INTEGRITY_FLAG_WRITE)) - return 1; - - return 0; -} - /** * bio_integrity_enabled - Check whether integrity can be passed * @bio: bio to check @@ -174,16 +156,29 @@ static int bdev_integrity_enabled(struct block_device *bdev, int rw) * set prior to calling. The functions honors the write_generate and * read_verify flags in sysfs. */ -int bio_integrity_enabled(struct bio *bio) +bool bio_integrity_enabled(struct bio *bio) { + struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); + if (!bio_is_rw(bio)) - return 0; + return false; /* Already protected? */ if (bio_integrity(bio)) - return 0; + return false; + + if (bi == NULL) + return false; + + if (bio_data_dir(bio) == READ && bi->verify_fn != NULL && + (bi->flags & INTEGRITY_FLAG_READ)) + return true; + + if (bio_data_dir(bio) == WRITE && bi->generate_fn != NULL && + (bi->flags & INTEGRITY_FLAG_WRITE)) + return true; - return bdev_integrity_enabled(bio->bi_bdev, bio_data_dir(bio)); + return false; } EXPORT_SYMBOL(bio_integrity_enabled); diff --git a/include/linux/bio.h b/include/linux/bio.h index b39e500..63e399b 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -666,7 +666,7 @@ struct biovec_slab { extern struct bio_integrity_payload *bio_integrity_alloc(struct bio *, gfp_t, unsigned int); extern void bio_integrity_free(struct bio *); extern int bio_integrity_add_page(struct bio *, struct page *, unsigned int, unsigned int); -extern int bio_integrity_enabled(struct bio *bio); +extern bool bio_integrity_enabled(struct bio *bio); extern int bio_integrity_set_tag(struct bio *, void *, unsigned int); extern int bio_integrity_get_tag(struct bio *, void *, unsigned int); extern int bio_integrity_prep(struct bio *); @@ -685,9 +685,9 @@ static inline int bio_integrity(struct bio *bio) return 0; } -static inline int bio_integrity_enabled(struct bio *bio) +static inline bool bio_integrity_enabled(struct bio *bio) { - return 0; + return false; } static inline int bioset_integrity_create(struct bio_set *bs, int pool_size) -- cgit v0.10.2 From 180b2f95dd331010a9930a65c8a18d6d81b94dc1 Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Fri, 26 Sep 2014 19:19:56 -0400 Subject: block: Replace bi_integrity with bi_special For commands like REQ_COPY we need a way to pass extra information along with each bio. Like integrity metadata this information must be available at the bottom of the stack so bi_private does not suffice. Rename the existing bi_integrity field to bi_special and make it a union so we can have different bio extensions for each class of command. We previously used bi_integrity != NULL as a way to identify whether a bio had integrity metadata or not. Introduce a REQ_INTEGRITY to be the indicator now that bi_special can contain different things. In addition, bio_integrity(bio) will now return a pointer to the integrity payload (when applicable). Signed-off-by: Martin K. Petersen Reviewed-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Jens Axboe diff --git a/Documentation/block/data-integrity.txt b/Documentation/block/data-integrity.txt index b4eacf4..4d4de8b 100644 --- a/Documentation/block/data-integrity.txt +++ b/Documentation/block/data-integrity.txt @@ -129,11 +129,11 @@ interface for this is being worked on. 4.1 BIO The data integrity patches add a new field to struct bio when -CONFIG_BLK_DEV_INTEGRITY is enabled. bio->bi_integrity is a pointer -to a struct bip which contains the bio integrity payload. Essentially -a bip is a trimmed down struct bio which holds a bio_vec containing -the integrity metadata and the required housekeeping information (bvec -pool, vector count, etc.) +CONFIG_BLK_DEV_INTEGRITY is enabled. bio_integrity(bio) returns a +pointer to a struct bip which contains the bio integrity payload. +Essentially a bip is a trimmed down struct bio which holds a bio_vec +containing the integrity metadata and the required housekeeping +information (bvec pool, vector count, etc.) A kernel subsystem can enable data integrity protection on a bio by calling bio_integrity_alloc(bio). This will allocate and attach the diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 36b7885..bd3125c 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -79,6 +79,7 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, bip->bip_slab = idx; bip->bip_bio = bio; bio->bi_integrity = bip; + bio->bi_rw |= REQ_INTEGRITY; return bip; err: @@ -96,7 +97,7 @@ EXPORT_SYMBOL(bio_integrity_alloc); */ void bio_integrity_free(struct bio *bio) { - struct bio_integrity_payload *bip = bio->bi_integrity; + struct bio_integrity_payload *bip = bio_integrity(bio); struct bio_set *bs = bio->bi_pool; if (bip->bip_owns_buf) @@ -128,7 +129,7 @@ EXPORT_SYMBOL(bio_integrity_free); int bio_integrity_add_page(struct bio *bio, struct page *page, unsigned int len, unsigned int offset) { - struct bio_integrity_payload *bip = bio->bi_integrity; + struct bio_integrity_payload *bip = bio_integrity(bio); struct bio_vec *iv; if (bip->bip_vcnt >= bip->bip_max_vcnt) { @@ -229,7 +230,7 @@ EXPORT_SYMBOL(bio_integrity_tag_size); static int bio_integrity_tag(struct bio *bio, void *tag_buf, unsigned int len, int set) { - struct bio_integrity_payload *bip = bio->bi_integrity; + struct bio_integrity_payload *bip = bio_integrity(bio); struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); unsigned int nr_sectors; @@ -304,12 +305,12 @@ static int bio_integrity_generate_verify(struct bio *bio, int operate) struct bio_vec *bv; sector_t sector; unsigned int sectors, ret = 0, i; - void *prot_buf = bio->bi_integrity->bip_buf; + void *prot_buf = bio_integrity(bio)->bip_buf; if (operate) sector = bio->bi_iter.bi_sector; else - sector = bio->bi_integrity->bip_iter.bi_sector; + sector = bio_integrity(bio)->bip_iter.bi_sector; bix.disk_name = bio->bi_bdev->bd_disk->disk_name; bix.sector_size = bi->sector_size; @@ -505,7 +506,7 @@ static void bio_integrity_verify_fn(struct work_struct *work) */ void bio_integrity_endio(struct bio *bio, int error) { - struct bio_integrity_payload *bip = bio->bi_integrity; + struct bio_integrity_payload *bip = bio_integrity(bio); BUG_ON(bip->bip_bio != bio); @@ -536,7 +537,7 @@ EXPORT_SYMBOL(bio_integrity_endio); */ void bio_integrity_advance(struct bio *bio, unsigned int bytes_done) { - struct bio_integrity_payload *bip = bio->bi_integrity; + struct bio_integrity_payload *bip = bio_integrity(bio); struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); unsigned bytes = bio_integrity_bytes(bi, bytes_done >> 9); @@ -558,7 +559,7 @@ EXPORT_SYMBOL(bio_integrity_advance); void bio_integrity_trim(struct bio *bio, unsigned int offset, unsigned int sectors) { - struct bio_integrity_payload *bip = bio->bi_integrity; + struct bio_integrity_payload *bip = bio_integrity(bio); struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); bio_integrity_advance(bio, offset << 9); @@ -577,7 +578,7 @@ EXPORT_SYMBOL(bio_integrity_trim); int bio_integrity_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp_mask) { - struct bio_integrity_payload *bip_src = bio_src->bi_integrity; + struct bio_integrity_payload *bip_src = bio_integrity(bio_src); struct bio_integrity_payload *bip; BUG_ON(bip_src == NULL); diff --git a/drivers/scsi/sd_dif.c b/drivers/scsi/sd_dif.c index a7a691d..29f0477 100644 --- a/drivers/scsi/sd_dif.c +++ b/drivers/scsi/sd_dif.c @@ -383,9 +383,9 @@ void sd_dif_prepare(struct request *rq, sector_t hw_sector, if (bio_flagged(bio, BIO_MAPPED_INTEGRITY)) break; - virt = bio->bi_integrity->bip_iter.bi_sector & 0xffffffff; + virt = bio_integrity(bio)->bip_iter.bi_sector & 0xffffffff; - bip_for_each_vec(iv, bio->bi_integrity, iter) { + bip_for_each_vec(iv, bio_integrity(bio), iter) { sdt = kmap_atomic(iv.bv_page) + iv.bv_offset; @@ -434,9 +434,9 @@ void sd_dif_complete(struct scsi_cmnd *scmd, unsigned int good_bytes) struct bio_vec iv; struct bvec_iter iter; - virt = bio->bi_integrity->bip_iter.bi_sector & 0xffffffff; + virt = bio_integrity(bio)->bip_iter.bi_sector & 0xffffffff; - bip_for_each_vec(iv, bio->bi_integrity, iter) { + bip_for_each_vec(iv, bio_integrity(bio), iter) { sdt = kmap_atomic(iv.bv_page) + iv.bv_offset; diff --git a/include/linux/bio.h b/include/linux/bio.h index 63e399b..a810a74 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -293,6 +293,15 @@ static inline unsigned bio_segments(struct bio *bio) #define bio_get(bio) atomic_inc(&(bio)->bi_cnt) #if defined(CONFIG_BLK_DEV_INTEGRITY) + +static inline struct bio_integrity_payload *bio_integrity(struct bio *bio) +{ + if (bio->bi_rw & REQ_INTEGRITY) + return bio->bi_integrity; + + return NULL; +} + /* * bio integrity payload */ @@ -661,8 +670,6 @@ struct biovec_slab { for_each_bio(_bio) \ bip_for_each_vec(_bvl, _bio->bi_integrity, _iter) -#define bio_integrity(bio) (bio->bi_integrity != NULL) - extern struct bio_integrity_payload *bio_integrity_alloc(struct bio *, gfp_t, unsigned int); extern void bio_integrity_free(struct bio *); extern int bio_integrity_add_page(struct bio *, struct page *, unsigned int, unsigned int); diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index bb7d664..6a5d2f2 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -78,9 +78,11 @@ struct bio { struct io_context *bi_ioc; struct cgroup_subsys_state *bi_css; #endif + union { #if defined(CONFIG_BLK_DEV_INTEGRITY) - struct bio_integrity_payload *bi_integrity; /* data integrity */ + struct bio_integrity_payload *bi_integrity; /* data integrity */ #endif + }; unsigned short bi_vcnt; /* how many bio_vec's */ @@ -162,6 +164,7 @@ enum rq_flag_bits { __REQ_WRITE_SAME, /* write same block many times */ __REQ_NOIDLE, /* don't anticipate more IO after this one */ + __REQ_INTEGRITY, /* I/O includes block integrity payload */ __REQ_FUA, /* forced unit access */ __REQ_FLUSH, /* request for cache flush */ @@ -203,13 +206,14 @@ enum rq_flag_bits { #define REQ_DISCARD (1ULL << __REQ_DISCARD) #define REQ_WRITE_SAME (1ULL << __REQ_WRITE_SAME) #define REQ_NOIDLE (1ULL << __REQ_NOIDLE) +#define REQ_INTEGRITY (1ULL << __REQ_INTEGRITY) #define REQ_FAILFAST_MASK \ (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER) #define REQ_COMMON_MASK \ (REQ_WRITE | REQ_FAILFAST_MASK | REQ_SYNC | REQ_META | REQ_PRIO | \ REQ_DISCARD | REQ_WRITE_SAME | REQ_NOIDLE | REQ_FLUSH | REQ_FUA | \ - REQ_SECURE) + REQ_SECURE | REQ_INTEGRITY) #define REQ_CLONE_MASK REQ_COMMON_MASK #define BIO_NO_ADVANCE_ITER_MASK (REQ_DISCARD|REQ_WRITE_SAME) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 49f3461..7fcb2ca 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1514,12 +1514,9 @@ static inline struct blk_integrity *blk_get_integrity(struct gendisk *disk) return disk->integrity; } -static inline int blk_integrity_rq(struct request *rq) +static inline bool blk_integrity_rq(struct request *rq) { - if (rq->bio == NULL) - return 0; - - return bio_integrity(rq->bio); + return rq->cmd_flags & REQ_INTEGRITY; } static inline void blk_queue_max_integrity_segments(struct request_queue *q, -- cgit v0.10.2 From 8492b68bc4025e7bce1d57761bd7c047efda2f81 Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Fri, 26 Sep 2014 19:19:57 -0400 Subject: block: Remove integrity tagging functions None of the filesystems appear interested in using the integrity tagging feature. Potentially because very few storage devices actually permit using the application tag space. Remove the tagging functions. Signed-off-by: Martin K. Petersen Reviewed-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Jens Axboe diff --git a/Documentation/block/data-integrity.txt b/Documentation/block/data-integrity.txt index 4d4de8b..f56ec97 100644 --- a/Documentation/block/data-integrity.txt +++ b/Documentation/block/data-integrity.txt @@ -206,36 +206,6 @@ will require extra work due to the application tag. bio_integrity_enabled() returned 1. - int bio_integrity_tag_size(bio); - - If the filesystem wants to use the application tag space it will - first have to find out how much storage space is available. - Because tag space is generally limited (usually 2 bytes per - sector regardless of sector size), the integrity framework - supports interleaving the information between the sectors in an - I/O. - - Filesystems can call bio_integrity_tag_size(bio) to find out how - many bytes of storage are available for that particular bio. - - Another option is bdev_get_tag_size(block_device) which will - return the number of available bytes per hardware sector. - - - int bio_integrity_set_tag(bio, void *tag_buf, len); - - After a successful return from bio_integrity_prep(), - bio_integrity_set_tag() can be used to attach an opaque tag - buffer to a bio. Obviously this only makes sense if the I/O is - a WRITE. - - - int bio_integrity_get_tag(bio, void *tag_buf, len); - - Similarly, at READ I/O completion time the filesystem can - retrieve the tag buffer using bio_integrity_get_tag(). - - 5.3 PASSING EXISTING INTEGRITY METADATA Filesystems that either generate their own integrity metadata or @@ -288,8 +258,6 @@ will require extra work due to the application tag. .name = "STANDARDSBODY-TYPE-VARIANT-CSUM", .generate_fn = my_generate_fn, .verify_fn = my_verify_fn, - .get_tag_fn = my_get_tag_fn, - .set_tag_fn = my_set_tag_fn, .tuple_size = sizeof(struct my_tuple_size), .tag_size = , }; @@ -311,7 +279,5 @@ will require extra work due to the application tag. are available per hardware sector. For DIF this is either 2 or 0 depending on the value of the Control Mode Page ATO bit. - See 6.2 for a description of get_tag_fn and set_tag_fn. - ---------------------------------------------------------------------- 2007-12-24 Martin K. Petersen diff --git a/block/bio-integrity.c b/block/bio-integrity.c index bd3125c..367bb24 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -210,90 +210,6 @@ static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi, } /** - * bio_integrity_tag_size - Retrieve integrity tag space - * @bio: bio to inspect - * - * Description: Returns the maximum number of tag bytes that can be - * attached to this bio. Filesystems can use this to determine how - * much metadata to attach to an I/O. - */ -unsigned int bio_integrity_tag_size(struct bio *bio) -{ - struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); - - BUG_ON(bio->bi_iter.bi_size == 0); - - return bi->tag_size * (bio->bi_iter.bi_size / bi->sector_size); -} -EXPORT_SYMBOL(bio_integrity_tag_size); - -static int bio_integrity_tag(struct bio *bio, void *tag_buf, unsigned int len, - int set) -{ - struct bio_integrity_payload *bip = bio_integrity(bio); - struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); - unsigned int nr_sectors; - - BUG_ON(bip->bip_buf == NULL); - - if (bi->tag_size == 0) - return -1; - - nr_sectors = bio_integrity_hw_sectors(bi, - DIV_ROUND_UP(len, bi->tag_size)); - - if (nr_sectors * bi->tuple_size > bip->bip_iter.bi_size) { - printk(KERN_ERR "%s: tag too big for bio: %u > %u\n", __func__, - nr_sectors * bi->tuple_size, bip->bip_iter.bi_size); - return -1; - } - - if (set) - bi->set_tag_fn(bip->bip_buf, tag_buf, nr_sectors); - else - bi->get_tag_fn(bip->bip_buf, tag_buf, nr_sectors); - - return 0; -} - -/** - * bio_integrity_set_tag - Attach a tag buffer to a bio - * @bio: bio to attach buffer to - * @tag_buf: Pointer to a buffer containing tag data - * @len: Length of the included buffer - * - * Description: Use this function to tag a bio by leveraging the extra - * space provided by devices formatted with integrity protection. The - * size of the integrity buffer must be <= to the size reported by - * bio_integrity_tag_size(). - */ -int bio_integrity_set_tag(struct bio *bio, void *tag_buf, unsigned int len) -{ - BUG_ON(bio_data_dir(bio) != WRITE); - - return bio_integrity_tag(bio, tag_buf, len, 1); -} -EXPORT_SYMBOL(bio_integrity_set_tag); - -/** - * bio_integrity_get_tag - Retrieve a tag buffer from a bio - * @bio: bio to retrieve buffer from - * @tag_buf: Pointer to a buffer for the tag data - * @len: Length of the target buffer - * - * Description: Use this function to retrieve the tag buffer from a - * completed I/O. The size of the integrity buffer must be <= to the - * size reported by bio_integrity_tag_size(). - */ -int bio_integrity_get_tag(struct bio *bio, void *tag_buf, unsigned int len) -{ - BUG_ON(bio_data_dir(bio) != READ); - - return bio_integrity_tag(bio, tag_buf, len, 0); -} -EXPORT_SYMBOL(bio_integrity_get_tag); - -/** * bio_integrity_generate_verify - Generate/verify integrity metadata for a bio * @bio: bio to generate/verify integrity metadata for * @operate: operate number, 1 for generate, 0 for verify @@ -355,14 +271,6 @@ static void bio_integrity_generate(struct bio *bio) bio_integrity_generate_verify(bio, 1); } -static inline unsigned short blk_integrity_tuple_size(struct blk_integrity *bi) -{ - if (bi) - return bi->tuple_size; - - return 0; -} - /** * bio_integrity_prep - Prepare bio for integrity I/O * @bio: bio to prepare @@ -393,7 +301,7 @@ int bio_integrity_prep(struct bio *bio) sectors = bio_integrity_hw_sectors(bi, bio_sectors(bio)); /* Allocate kernel buffer for protection data */ - len = sectors * blk_integrity_tuple_size(bi); + len = sectors * bi->tuple_size; buf = kmalloc(len, GFP_NOIO | q->bounce_gfp); if (unlikely(buf == NULL)) { printk(KERN_ERR "could not allocate integrity buffer\n"); diff --git a/block/blk-integrity.c b/block/blk-integrity.c index 7fbab84..7ac1716 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -418,8 +418,6 @@ int blk_integrity_register(struct gendisk *disk, struct blk_integrity *template) bi->generate_fn = template->generate_fn; bi->verify_fn = template->verify_fn; bi->tuple_size = template->tuple_size; - bi->set_tag_fn = template->set_tag_fn; - bi->get_tag_fn = template->get_tag_fn; bi->tag_size = template->tag_size; } else bi->name = bi_unsupported_name; diff --git a/drivers/scsi/sd_dif.c b/drivers/scsi/sd_dif.c index 29f0477..38a7778 100644 --- a/drivers/scsi/sd_dif.c +++ b/drivers/scsi/sd_dif.c @@ -128,39 +128,10 @@ static int sd_dif_type1_verify_ip(struct blk_integrity_exchg *bix) return sd_dif_type1_verify(bix, sd_dif_ip_fn); } -/* - * Functions for interleaving and deinterleaving application tags - */ -static void sd_dif_type1_set_tag(void *prot, void *tag_buf, unsigned int sectors) -{ - struct sd_dif_tuple *sdt = prot; - u8 *tag = tag_buf; - unsigned int i, j; - - for (i = 0, j = 0 ; i < sectors ; i++, j += 2, sdt++) { - sdt->app_tag = tag[j] << 8 | tag[j+1]; - BUG_ON(sdt->app_tag == 0xffff); - } -} - -static void sd_dif_type1_get_tag(void *prot, void *tag_buf, unsigned int sectors) -{ - struct sd_dif_tuple *sdt = prot; - u8 *tag = tag_buf; - unsigned int i, j; - - for (i = 0, j = 0 ; i < sectors ; i++, j += 2, sdt++) { - tag[j] = (sdt->app_tag & 0xff00) >> 8; - tag[j+1] = sdt->app_tag & 0xff; - } -} - static struct blk_integrity dif_type1_integrity_crc = { .name = "T10-DIF-TYPE1-CRC", .generate_fn = sd_dif_type1_generate_crc, .verify_fn = sd_dif_type1_verify_crc, - .get_tag_fn = sd_dif_type1_get_tag, - .set_tag_fn = sd_dif_type1_set_tag, .tuple_size = sizeof(struct sd_dif_tuple), .tag_size = 0, }; @@ -169,8 +140,6 @@ static struct blk_integrity dif_type1_integrity_ip = { .name = "T10-DIF-TYPE1-IP", .generate_fn = sd_dif_type1_generate_ip, .verify_fn = sd_dif_type1_verify_ip, - .get_tag_fn = sd_dif_type1_get_tag, - .set_tag_fn = sd_dif_type1_set_tag, .tuple_size = sizeof(struct sd_dif_tuple), .tag_size = 0, }; @@ -245,42 +214,10 @@ static int sd_dif_type3_verify_ip(struct blk_integrity_exchg *bix) return sd_dif_type3_verify(bix, sd_dif_ip_fn); } -static void sd_dif_type3_set_tag(void *prot, void *tag_buf, unsigned int sectors) -{ - struct sd_dif_tuple *sdt = prot; - u8 *tag = tag_buf; - unsigned int i, j; - - for (i = 0, j = 0 ; i < sectors ; i++, j += 6, sdt++) { - sdt->app_tag = tag[j] << 8 | tag[j+1]; - sdt->ref_tag = tag[j+2] << 24 | tag[j+3] << 16 | - tag[j+4] << 8 | tag[j+5]; - } -} - -static void sd_dif_type3_get_tag(void *prot, void *tag_buf, unsigned int sectors) -{ - struct sd_dif_tuple *sdt = prot; - u8 *tag = tag_buf; - unsigned int i, j; - - for (i = 0, j = 0 ; i < sectors ; i++, j += 2, sdt++) { - tag[j] = (sdt->app_tag & 0xff00) >> 8; - tag[j+1] = sdt->app_tag & 0xff; - tag[j+2] = (sdt->ref_tag & 0xff000000) >> 24; - tag[j+3] = (sdt->ref_tag & 0xff0000) >> 16; - tag[j+4] = (sdt->ref_tag & 0xff00) >> 8; - tag[j+5] = sdt->ref_tag & 0xff; - BUG_ON(sdt->app_tag == 0xffff || sdt->ref_tag == 0xffffffff); - } -} - static struct blk_integrity dif_type3_integrity_crc = { .name = "T10-DIF-TYPE3-CRC", .generate_fn = sd_dif_type3_generate_crc, .verify_fn = sd_dif_type3_verify_crc, - .get_tag_fn = sd_dif_type3_get_tag, - .set_tag_fn = sd_dif_type3_set_tag, .tuple_size = sizeof(struct sd_dif_tuple), .tag_size = 0, }; @@ -289,8 +226,6 @@ static struct blk_integrity dif_type3_integrity_ip = { .name = "T10-DIF-TYPE3-IP", .generate_fn = sd_dif_type3_generate_ip, .verify_fn = sd_dif_type3_verify_ip, - .get_tag_fn = sd_dif_type3_get_tag, - .set_tag_fn = sd_dif_type3_set_tag, .tuple_size = sizeof(struct sd_dif_tuple), .tag_size = 0, }; diff --git a/include/linux/bio.h b/include/linux/bio.h index a810a74..63a0e53 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -362,7 +362,6 @@ extern struct bio *bio_clone_fast(struct bio *, gfp_t, struct bio_set *); extern struct bio *bio_clone_bioset(struct bio *, gfp_t, struct bio_set *bs); extern struct bio_set *fs_bio_set; -unsigned int bio_integrity_tag_size(struct bio *bio); static inline struct bio *bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs) { @@ -674,8 +673,6 @@ extern struct bio_integrity_payload *bio_integrity_alloc(struct bio *, gfp_t, un extern void bio_integrity_free(struct bio *); extern int bio_integrity_add_page(struct bio *, struct page *, unsigned int, unsigned int); extern bool bio_integrity_enabled(struct bio *bio); -extern int bio_integrity_set_tag(struct bio *, void *, unsigned int); -extern int bio_integrity_get_tag(struct bio *, void *, unsigned int); extern int bio_integrity_prep(struct bio *); extern void bio_integrity_endio(struct bio *, int); extern void bio_integrity_advance(struct bio *, unsigned int); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 7fcb2ca..0bf5d79 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1472,14 +1472,10 @@ struct blk_integrity_exchg { typedef void (integrity_gen_fn) (struct blk_integrity_exchg *); typedef int (integrity_vrfy_fn) (struct blk_integrity_exchg *); -typedef void (integrity_set_tag_fn) (void *, void *, unsigned int); -typedef void (integrity_get_tag_fn) (void *, void *, unsigned int); struct blk_integrity { integrity_gen_fn *generate_fn; integrity_vrfy_fn *verify_fn; - integrity_set_tag_fn *set_tag_fn; - integrity_get_tag_fn *get_tag_fn; unsigned short flags; unsigned short tuple_size; -- cgit v0.10.2 From 5f9378fa9ca214977b5bfc12197c67eea450fc40 Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Fri, 26 Sep 2014 19:19:58 -0400 Subject: block: Remove bip_buf bip_buf is not really needed so we can remove it. Signed-off-by: Martin K. Petersen Reviewed-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Jens Axboe diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 367bb24..e84f7fb 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -101,7 +101,8 @@ void bio_integrity_free(struct bio *bio) struct bio_set *bs = bio->bi_pool; if (bip->bip_owns_buf) - kfree(bip->bip_buf); + kfree(page_address(bip->bip_vec->bv_page) + + bip->bip_vec->bv_offset); if (bs) { if (bip->bip_slab != BIO_POOL_NONE) @@ -219,14 +220,16 @@ static int bio_integrity_generate_verify(struct bio *bio, int operate) struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); struct blk_integrity_exchg bix; struct bio_vec *bv; + struct bio_integrity_payload *bip = bio_integrity(bio); sector_t sector; unsigned int sectors, ret = 0, i; - void *prot_buf = bio_integrity(bio)->bip_buf; + void *prot_buf = page_address(bip->bip_vec->bv_page) + + bip->bip_vec->bv_offset; if (operate) sector = bio->bi_iter.bi_sector; else - sector = bio_integrity(bio)->bip_iter.bi_sector; + sector = bip->bip_iter.bi_sector; bix.disk_name = bio->bi_bdev->bd_disk->disk_name; bix.sector_size = bi->sector_size; @@ -321,7 +324,6 @@ int bio_integrity_prep(struct bio *bio) } bip->bip_owns_buf = 1; - bip->bip_buf = buf; bip->bip_iter.bi_size = len; bip->bip_iter.bi_sector = bio->bi_iter.bi_sector; diff --git a/include/linux/bio.h b/include/linux/bio.h index 63a0e53..448d8c0 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -310,9 +310,6 @@ struct bio_integrity_payload { struct bvec_iter bip_iter; - /* kill - should just use bip_vec */ - void *bip_buf; /* generated integrity data */ - bio_end_io_t *bip_end_io; /* saved I/O completion fn */ unsigned short bip_slab; /* slab the bip came from */ -- cgit v0.10.2 From 3be91c4a3d090bd700bd6ee5bf457c1bbf189a4f Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Fri, 26 Sep 2014 19:19:59 -0400 Subject: block: Deprecate the use of the term sector in the context of block integrity The protection interval is not necessarily tied to the logical block size of a block device. Stop using the terms "sector" and "sectors". Going forward we will use the term "seed" to describe the initial reference tag value for a given I/O. "Interval" will be used to describe the portion of the data buffer that a given piece of protection information is associated with. Signed-off-by: Martin K. Petersen Reviewed-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Jens Axboe diff --git a/block/bio-integrity.c b/block/bio-integrity.c index e84f7fb..6a3aacf 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -185,20 +185,20 @@ bool bio_integrity_enabled(struct bio *bio) EXPORT_SYMBOL(bio_integrity_enabled); /** - * bio_integrity_hw_sectors - Convert 512b sectors to hardware ditto + * bio_integrity_intervals - Return number of integrity intervals for a bio * @bi: blk_integrity profile for device - * @sectors: Number of 512 sectors to convert + * @sectors: Size of the bio in 512-byte sectors * * Description: The block layer calculates everything in 512 byte - * sectors but integrity metadata is done in terms of the hardware - * sector size of the storage device. Convert the block layer sectors - * to physical sectors. + * sectors but integrity metadata is done in terms of the data integrity + * interval size of the storage device. Convert the block layer sectors + * to the appropriate number of integrity intervals. */ -static inline unsigned int bio_integrity_hw_sectors(struct blk_integrity *bi, - unsigned int sectors) +static inline unsigned int bio_integrity_intervals(struct blk_integrity *bi, + unsigned int sectors) { /* At this point there are only 512b or 4096b DIF/EPP devices */ - if (bi->sector_size == 4096) + if (bi->interval == 4096) return sectors >>= 3; return sectors; @@ -207,7 +207,7 @@ static inline unsigned int bio_integrity_hw_sectors(struct blk_integrity *bi, static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi, unsigned int sectors) { - return bio_integrity_hw_sectors(bi, sectors) * bi->tuple_size; + return bio_integrity_intervals(bi, sectors) * bi->tuple_size; } /** @@ -221,25 +221,25 @@ static int bio_integrity_generate_verify(struct bio *bio, int operate) struct blk_integrity_exchg bix; struct bio_vec *bv; struct bio_integrity_payload *bip = bio_integrity(bio); - sector_t sector; - unsigned int sectors, ret = 0, i; + sector_t seed; + unsigned int intervals, ret = 0, i; void *prot_buf = page_address(bip->bip_vec->bv_page) + bip->bip_vec->bv_offset; if (operate) - sector = bio->bi_iter.bi_sector; + seed = bio->bi_iter.bi_sector; else - sector = bip->bip_iter.bi_sector; + seed = bip->bip_iter.bi_sector; bix.disk_name = bio->bi_bdev->bd_disk->disk_name; - bix.sector_size = bi->sector_size; + bix.interval = bi->interval; bio_for_each_segment_all(bv, bio, i) { void *kaddr = kmap_atomic(bv->bv_page); bix.data_buf = kaddr + bv->bv_offset; bix.data_size = bv->bv_len; bix.prot_buf = prot_buf; - bix.sector = sector; + bix.seed = seed; if (operate) bi->generate_fn(&bix); @@ -251,9 +251,9 @@ static int bio_integrity_generate_verify(struct bio *bio, int operate) } } - sectors = bv->bv_len / bi->sector_size; - sector += sectors; - prot_buf += sectors * bi->tuple_size; + intervals = bv->bv_len / bi->interval; + seed += intervals; + prot_buf += intervals * bi->tuple_size; kunmap_atomic(kaddr); } @@ -294,17 +294,17 @@ int bio_integrity_prep(struct bio *bio) unsigned long start, end; unsigned int len, nr_pages; unsigned int bytes, offset, i; - unsigned int sectors; + unsigned int intervals; bi = bdev_get_integrity(bio->bi_bdev); q = bdev_get_queue(bio->bi_bdev); BUG_ON(bi == NULL); BUG_ON(bio_integrity(bio)); - sectors = bio_integrity_hw_sectors(bi, bio_sectors(bio)); + intervals = bio_integrity_intervals(bi, bio_sectors(bio)); /* Allocate kernel buffer for protection data */ - len = sectors * bi->tuple_size; + len = intervals * bi->tuple_size; buf = kmalloc(len, GFP_NOIO | q->bounce_gfp); if (unlikely(buf == NULL)) { printk(KERN_ERR "could not allocate integrity buffer\n"); diff --git a/block/blk-integrity.c b/block/blk-integrity.c index 7ac1716..3a83a7d 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -154,10 +154,10 @@ int blk_integrity_compare(struct gendisk *gd1, struct gendisk *gd2) if (!b1 || !b2) return -1; - if (b1->sector_size != b2->sector_size) { - printk(KERN_ERR "%s: %s/%s sector sz %u != %u\n", __func__, - gd1->disk_name, gd2->disk_name, - b1->sector_size, b2->sector_size); + if (b1->interval != b2->interval) { + pr_err("%s: %s/%s protection interval %u != %u\n", + __func__, gd1->disk_name, gd2->disk_name, + b1->interval, b2->interval); return -1; } @@ -407,7 +407,7 @@ int blk_integrity_register(struct gendisk *disk, struct blk_integrity *template) kobject_uevent(&bi->kobj, KOBJ_ADD); bi->flags |= INTEGRITY_FLAG_READ | INTEGRITY_FLAG_WRITE; - bi->sector_size = queue_logical_block_size(disk->queue); + bi->interval = queue_logical_block_size(disk->queue); disk->integrity = bi; } else bi = disk->integrity; diff --git a/drivers/scsi/sd_dif.c b/drivers/scsi/sd_dif.c index 38a7778..1600270 100644 --- a/drivers/scsi/sd_dif.c +++ b/drivers/scsi/sd_dif.c @@ -57,16 +57,16 @@ static void sd_dif_type1_generate(struct blk_integrity_exchg *bix, csum_fn *fn) { void *buf = bix->data_buf; struct sd_dif_tuple *sdt = bix->prot_buf; - sector_t sector = bix->sector; + sector_t seed = bix->seed; unsigned int i; - for (i = 0 ; i < bix->data_size ; i += bix->sector_size, sdt++) { - sdt->guard_tag = fn(buf, bix->sector_size); - sdt->ref_tag = cpu_to_be32(sector & 0xffffffff); + for (i = 0 ; i < bix->data_size ; i += bix->interval, sdt++) { + sdt->guard_tag = fn(buf, bix->interval); + sdt->ref_tag = cpu_to_be32(seed & 0xffffffff); sdt->app_tag = 0; - buf += bix->sector_size; - sector++; + buf += bix->interval; + seed++; } } @@ -84,35 +84,35 @@ static int sd_dif_type1_verify(struct blk_integrity_exchg *bix, csum_fn *fn) { void *buf = bix->data_buf; struct sd_dif_tuple *sdt = bix->prot_buf; - sector_t sector = bix->sector; + sector_t seed = bix->seed; unsigned int i; __u16 csum; - for (i = 0 ; i < bix->data_size ; i += bix->sector_size, sdt++) { + for (i = 0 ; i < bix->data_size ; i += bix->interval, sdt++) { /* Unwritten sectors */ if (sdt->app_tag == 0xffff) return 0; - if (be32_to_cpu(sdt->ref_tag) != (sector & 0xffffffff)) { + if (be32_to_cpu(sdt->ref_tag) != (seed & 0xffffffff)) { printk(KERN_ERR "%s: ref tag error on sector %lu (rcvd %u)\n", - bix->disk_name, (unsigned long)sector, + bix->disk_name, (unsigned long)seed, be32_to_cpu(sdt->ref_tag)); return -EIO; } - csum = fn(buf, bix->sector_size); + csum = fn(buf, bix->interval); if (sdt->guard_tag != csum) { printk(KERN_ERR "%s: guard tag error on sector %lu " \ "(rcvd %04x, data %04x)\n", bix->disk_name, - (unsigned long)sector, + (unsigned long)seed, be16_to_cpu(sdt->guard_tag), be16_to_cpu(csum)); return -EIO; } - buf += bix->sector_size; - sector++; + buf += bix->interval; + seed++; } return 0; @@ -155,12 +155,12 @@ static void sd_dif_type3_generate(struct blk_integrity_exchg *bix, csum_fn *fn) struct sd_dif_tuple *sdt = bix->prot_buf; unsigned int i; - for (i = 0 ; i < bix->data_size ; i += bix->sector_size, sdt++) { - sdt->guard_tag = fn(buf, bix->sector_size); + for (i = 0 ; i < bix->data_size ; i += bix->interval, sdt++) { + sdt->guard_tag = fn(buf, bix->interval); sdt->ref_tag = 0; sdt->app_tag = 0; - buf += bix->sector_size; + buf += bix->interval; } } @@ -178,27 +178,27 @@ static int sd_dif_type3_verify(struct blk_integrity_exchg *bix, csum_fn *fn) { void *buf = bix->data_buf; struct sd_dif_tuple *sdt = bix->prot_buf; - sector_t sector = bix->sector; + sector_t seed = bix->seed; unsigned int i; __u16 csum; - for (i = 0 ; i < bix->data_size ; i += bix->sector_size, sdt++) { + for (i = 0 ; i < bix->data_size ; i += bix->interval, sdt++) { /* Unwritten sectors */ if (sdt->app_tag == 0xffff && sdt->ref_tag == 0xffffffff) return 0; - csum = fn(buf, bix->sector_size); + csum = fn(buf, bix->interval); if (sdt->guard_tag != csum) { printk(KERN_ERR "%s: guard tag error on sector %lu " \ "(rcvd %04x, data %04x)\n", bix->disk_name, - (unsigned long)sector, + (unsigned long)seed, be16_to_cpu(sdt->guard_tag), be16_to_cpu(csum)); return -EIO; } - buf += bix->sector_size; - sector++; + buf += bix->interval; + seed++; } return 0; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 0bf5d79..d364c42 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1464,9 +1464,9 @@ static inline uint64_t rq_io_start_time_ns(struct request *req) struct blk_integrity_exchg { void *prot_buf; void *data_buf; - sector_t sector; + sector_t seed; unsigned int data_size; - unsigned short sector_size; + unsigned short interval; const char *disk_name; }; @@ -1479,7 +1479,7 @@ struct blk_integrity { unsigned short flags; unsigned short tuple_size; - unsigned short sector_size; + unsigned short interval; unsigned short tag_size; const char *name; -- cgit v0.10.2 From 5a2aa873059fec8b6385071d9c0802893a9b2f41 Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Fri, 26 Sep 2014 19:20:00 -0400 Subject: block: Make protection interval calculation generic Now that the protection interval has been detached from the sector size we need to be able to handle sizes that are different from 4K and 512. Make the interval calculation generic. Signed-off-by: Martin K. Petersen Reviewed-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Jens Axboe diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 6a3aacf..cf40837 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -197,11 +197,7 @@ EXPORT_SYMBOL(bio_integrity_enabled); static inline unsigned int bio_integrity_intervals(struct blk_integrity *bi, unsigned int sectors) { - /* At this point there are only 512b or 4096b DIF/EPP devices */ - if (bi->interval == 4096) - return sectors >>= 3; - - return sectors; + return sectors >> (ilog2(bi->interval) - 9); } static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi, -- cgit v0.10.2 From 1859308853b19c4daf4afaab910d3d52ac1ec2ff Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Fri, 26 Sep 2014 19:20:01 -0400 Subject: block: Clean up the code used to generate and verify integrity metadata Instead of the "operate" parameter we pass in a seed value and a pointer to a function that can be used to process the integrity metadata. The generation function is changed to have a return value to fit into this scheme. Signed-off-by: Martin K. Petersen Reviewed-by: Sagi Grimberg Signed-off-by: Jens Axboe diff --git a/block/bio-integrity.c b/block/bio-integrity.c index cf40837..fe4de03 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -207,49 +207,37 @@ static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi, } /** - * bio_integrity_generate_verify - Generate/verify integrity metadata for a bio + * bio_integrity_process - Process integrity metadata for a bio * @bio: bio to generate/verify integrity metadata for - * @operate: operate number, 1 for generate, 0 for verify + * @proc_fn: Pointer to the relevant processing function */ -static int bio_integrity_generate_verify(struct bio *bio, int operate) +static int bio_integrity_process(struct bio *bio, + integrity_processing_fn *proc_fn) { struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); - struct blk_integrity_exchg bix; + struct blk_integrity_iter iter; struct bio_vec *bv; struct bio_integrity_payload *bip = bio_integrity(bio); - sector_t seed; - unsigned int intervals, ret = 0, i; + unsigned int i, ret = 0; void *prot_buf = page_address(bip->bip_vec->bv_page) + bip->bip_vec->bv_offset; - if (operate) - seed = bio->bi_iter.bi_sector; - else - seed = bip->bip_iter.bi_sector; - - bix.disk_name = bio->bi_bdev->bd_disk->disk_name; - bix.interval = bi->interval; + iter.disk_name = bio->bi_bdev->bd_disk->disk_name; + iter.interval = bi->interval; + iter.seed = bip_get_seed(bip); + iter.prot_buf = prot_buf; bio_for_each_segment_all(bv, bio, i) { void *kaddr = kmap_atomic(bv->bv_page); - bix.data_buf = kaddr + bv->bv_offset; - bix.data_size = bv->bv_len; - bix.prot_buf = prot_buf; - bix.seed = seed; - - if (operate) - bi->generate_fn(&bix); - else { - ret = bi->verify_fn(&bix); - if (ret) { - kunmap_atomic(kaddr); - return ret; - } - } - intervals = bv->bv_len / bi->interval; - seed += intervals; - prot_buf += intervals * bi->tuple_size; + iter.data_buf = kaddr + bv->bv_offset; + iter.data_size = bv->bv_len; + + ret = proc_fn(&iter); + if (ret) { + kunmap_atomic(kaddr); + return ret; + } kunmap_atomic(kaddr); } @@ -257,20 +245,6 @@ static int bio_integrity_generate_verify(struct bio *bio, int operate) } /** - * bio_integrity_generate - Generate integrity metadata for a bio - * @bio: bio to generate integrity metadata for - * - * Description: Generates integrity metadata for a bio by calling the - * block device's generation callback function. The bio must have a - * bip attached with enough room to accommodate the generated - * integrity metadata. - */ -static void bio_integrity_generate(struct bio *bio) -{ - bio_integrity_generate_verify(bio, 1); -} - -/** * bio_integrity_prep - Prepare bio for integrity I/O * @bio: bio to prepare * @@ -321,7 +295,7 @@ int bio_integrity_prep(struct bio *bio) bip->bip_owns_buf = 1; bip->bip_iter.bi_size = len; - bip->bip_iter.bi_sector = bio->bi_iter.bi_sector; + bip_set_seed(bip, bio->bi_iter.bi_sector); /* Map it */ offset = offset_in_page(buf); @@ -357,26 +331,13 @@ int bio_integrity_prep(struct bio *bio) /* Auto-generate integrity metadata if this is a write */ if (bio_data_dir(bio) == WRITE) - bio_integrity_generate(bio); + bio_integrity_process(bio, bi->generate_fn); return 0; } EXPORT_SYMBOL(bio_integrity_prep); /** - * bio_integrity_verify - Verify integrity metadata for a bio - * @bio: bio to verify - * - * Description: This function is called to verify the integrity of a - * bio. The data in the bio io_vec is compared to the integrity - * metadata returned by the HBA. - */ -static int bio_integrity_verify(struct bio *bio) -{ - return bio_integrity_generate_verify(bio, 0); -} - -/** * bio_integrity_verify_fn - Integrity I/O completion worker * @work: Work struct stored in bio to be verified * @@ -389,9 +350,10 @@ static void bio_integrity_verify_fn(struct work_struct *work) struct bio_integrity_payload *bip = container_of(work, struct bio_integrity_payload, bip_work); struct bio *bio = bip->bip_bio; + struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); int error; - error = bio_integrity_verify(bio); + error = bio_integrity_process(bio, bi->verify_fn); /* Restore original bio completion handler */ bio->bi_end_io = bip->bip_end_io; diff --git a/drivers/scsi/sd_dif.c b/drivers/scsi/sd_dif.c index 1600270..801c418 100644 --- a/drivers/scsi/sd_dif.c +++ b/drivers/scsi/sd_dif.c @@ -53,42 +53,44 @@ static __u16 sd_dif_ip_fn(void *data, unsigned int len) * Type 1 and Type 2 protection use the same format: 16 bit guard tag, * 16 bit app tag, 32 bit reference tag. */ -static void sd_dif_type1_generate(struct blk_integrity_exchg *bix, csum_fn *fn) +static void sd_dif_type1_generate(struct blk_integrity_iter *iter, csum_fn *fn) { - void *buf = bix->data_buf; - struct sd_dif_tuple *sdt = bix->prot_buf; - sector_t seed = bix->seed; + void *buf = iter->data_buf; + struct sd_dif_tuple *sdt = iter->prot_buf; + sector_t seed = iter->seed; unsigned int i; - for (i = 0 ; i < bix->data_size ; i += bix->interval, sdt++) { - sdt->guard_tag = fn(buf, bix->interval); + for (i = 0 ; i < iter->data_size ; i += iter->interval, sdt++) { + sdt->guard_tag = fn(buf, iter->interval); sdt->ref_tag = cpu_to_be32(seed & 0xffffffff); sdt->app_tag = 0; - buf += bix->interval; + buf += iter->interval; seed++; } } -static void sd_dif_type1_generate_crc(struct blk_integrity_exchg *bix) +static int sd_dif_type1_generate_crc(struct blk_integrity_iter *iter) { - sd_dif_type1_generate(bix, sd_dif_crc_fn); + sd_dif_type1_generate(iter, sd_dif_crc_fn); + return 0; } -static void sd_dif_type1_generate_ip(struct blk_integrity_exchg *bix) +static int sd_dif_type1_generate_ip(struct blk_integrity_iter *iter) { - sd_dif_type1_generate(bix, sd_dif_ip_fn); + sd_dif_type1_generate(iter, sd_dif_ip_fn); + return 0; } -static int sd_dif_type1_verify(struct blk_integrity_exchg *bix, csum_fn *fn) +static int sd_dif_type1_verify(struct blk_integrity_iter *iter, csum_fn *fn) { - void *buf = bix->data_buf; - struct sd_dif_tuple *sdt = bix->prot_buf; - sector_t seed = bix->seed; + void *buf = iter->data_buf; + struct sd_dif_tuple *sdt = iter->prot_buf; + sector_t seed = iter->seed; unsigned int i; __u16 csum; - for (i = 0 ; i < bix->data_size ; i += bix->interval, sdt++) { + for (i = 0 ; i < iter->data_size ; i += iter->interval, sdt++) { /* Unwritten sectors */ if (sdt->app_tag == 0xffff) return 0; @@ -96,36 +98,36 @@ static int sd_dif_type1_verify(struct blk_integrity_exchg *bix, csum_fn *fn) if (be32_to_cpu(sdt->ref_tag) != (seed & 0xffffffff)) { printk(KERN_ERR "%s: ref tag error on sector %lu (rcvd %u)\n", - bix->disk_name, (unsigned long)seed, + iter->disk_name, (unsigned long)seed, be32_to_cpu(sdt->ref_tag)); return -EIO; } - csum = fn(buf, bix->interval); + csum = fn(buf, iter->interval); if (sdt->guard_tag != csum) { printk(KERN_ERR "%s: guard tag error on sector %lu " \ - "(rcvd %04x, data %04x)\n", bix->disk_name, + "(rcvd %04x, data %04x)\n", iter->disk_name, (unsigned long)seed, be16_to_cpu(sdt->guard_tag), be16_to_cpu(csum)); return -EIO; } - buf += bix->interval; + buf += iter->interval; seed++; } return 0; } -static int sd_dif_type1_verify_crc(struct blk_integrity_exchg *bix) +static int sd_dif_type1_verify_crc(struct blk_integrity_iter *iter) { - return sd_dif_type1_verify(bix, sd_dif_crc_fn); + return sd_dif_type1_verify(iter, sd_dif_crc_fn); } -static int sd_dif_type1_verify_ip(struct blk_integrity_exchg *bix) +static int sd_dif_type1_verify_ip(struct blk_integrity_iter *iter) { - return sd_dif_type1_verify(bix, sd_dif_ip_fn); + return sd_dif_type1_verify(iter, sd_dif_ip_fn); } static struct blk_integrity dif_type1_integrity_crc = { @@ -149,69 +151,71 @@ static struct blk_integrity dif_type1_integrity_ip = { * Type 3 protection has a 16-bit guard tag and 16 + 32 bits of opaque * tag space. */ -static void sd_dif_type3_generate(struct blk_integrity_exchg *bix, csum_fn *fn) +static void sd_dif_type3_generate(struct blk_integrity_iter *iter, csum_fn *fn) { - void *buf = bix->data_buf; - struct sd_dif_tuple *sdt = bix->prot_buf; + void *buf = iter->data_buf; + struct sd_dif_tuple *sdt = iter->prot_buf; unsigned int i; - for (i = 0 ; i < bix->data_size ; i += bix->interval, sdt++) { - sdt->guard_tag = fn(buf, bix->interval); + for (i = 0 ; i < iter->data_size ; i += iter->interval, sdt++) { + sdt->guard_tag = fn(buf, iter->interval); sdt->ref_tag = 0; sdt->app_tag = 0; - buf += bix->interval; + buf += iter->interval; } } -static void sd_dif_type3_generate_crc(struct blk_integrity_exchg *bix) +static int sd_dif_type3_generate_crc(struct blk_integrity_iter *iter) { - sd_dif_type3_generate(bix, sd_dif_crc_fn); + sd_dif_type3_generate(iter, sd_dif_crc_fn); + return 0; } -static void sd_dif_type3_generate_ip(struct blk_integrity_exchg *bix) +static int sd_dif_type3_generate_ip(struct blk_integrity_iter *iter) { - sd_dif_type3_generate(bix, sd_dif_ip_fn); + sd_dif_type3_generate(iter, sd_dif_ip_fn); + return 0; } -static int sd_dif_type3_verify(struct blk_integrity_exchg *bix, csum_fn *fn) +static int sd_dif_type3_verify(struct blk_integrity_iter *iter, csum_fn *fn) { - void *buf = bix->data_buf; - struct sd_dif_tuple *sdt = bix->prot_buf; - sector_t seed = bix->seed; + void *buf = iter->data_buf; + struct sd_dif_tuple *sdt = iter->prot_buf; + sector_t seed = iter->seed; unsigned int i; __u16 csum; - for (i = 0 ; i < bix->data_size ; i += bix->interval, sdt++) { + for (i = 0 ; i < iter->data_size ; i += iter->interval, sdt++) { /* Unwritten sectors */ if (sdt->app_tag == 0xffff && sdt->ref_tag == 0xffffffff) return 0; - csum = fn(buf, bix->interval); + csum = fn(buf, iter->interval); if (sdt->guard_tag != csum) { printk(KERN_ERR "%s: guard tag error on sector %lu " \ - "(rcvd %04x, data %04x)\n", bix->disk_name, + "(rcvd %04x, data %04x)\n", iter->disk_name, (unsigned long)seed, be16_to_cpu(sdt->guard_tag), be16_to_cpu(csum)); return -EIO; } - buf += bix->interval; + buf += iter->interval; seed++; } return 0; } -static int sd_dif_type3_verify_crc(struct blk_integrity_exchg *bix) +static int sd_dif_type3_verify_crc(struct blk_integrity_iter *iter) { - return sd_dif_type3_verify(bix, sd_dif_crc_fn); + return sd_dif_type3_verify(iter, sd_dif_crc_fn); } -static int sd_dif_type3_verify_ip(struct blk_integrity_exchg *bix) +static int sd_dif_type3_verify_ip(struct blk_integrity_iter *iter) { - return sd_dif_type3_verify(bix, sd_dif_ip_fn); + return sd_dif_type3_verify(iter, sd_dif_ip_fn); } static struct blk_integrity dif_type3_integrity_crc = { @@ -310,6 +314,7 @@ void sd_dif_prepare(struct request *rq, sector_t hw_sector, phys = hw_sector & 0xffffffff; __rq_for_each_bio(bio, rq) { + struct bio_integrity_payload *bip = bio_integrity(bio); struct bio_vec iv; struct bvec_iter iter; unsigned int j; @@ -318,9 +323,9 @@ void sd_dif_prepare(struct request *rq, sector_t hw_sector, if (bio_flagged(bio, BIO_MAPPED_INTEGRITY)) break; - virt = bio_integrity(bio)->bip_iter.bi_sector & 0xffffffff; + virt = bip_get_seed(bip) & 0xffffffff; - bip_for_each_vec(iv, bio_integrity(bio), iter) { + bip_for_each_vec(iv, bip, iter) { sdt = kmap_atomic(iv.bv_page) + iv.bv_offset; @@ -366,12 +371,13 @@ void sd_dif_complete(struct scsi_cmnd *scmd, unsigned int good_bytes) phys >>= 3; __rq_for_each_bio(bio, scmd->request) { + struct bio_integrity_payload *bip = bio_integrity(bio); struct bio_vec iv; struct bvec_iter iter; - virt = bio_integrity(bio)->bip_iter.bi_sector & 0xffffffff; + virt = bip_get_seed(bip) & 0xffffffff; - bip_for_each_vec(iv, bio_integrity(bio), iter) { + bip_for_each_vec(iv, bip, iter) { sdt = kmap_atomic(iv.bv_page) + iv.bv_offset; diff --git a/include/linux/bio.h b/include/linux/bio.h index 448d8c0..3fd3666 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -322,6 +322,18 @@ struct bio_integrity_payload { struct bio_vec *bip_vec; struct bio_vec bip_inline_vecs[0];/* embedded bvec array */ }; + +static inline sector_t bip_get_seed(struct bio_integrity_payload *bip) +{ + return bip->bip_iter.bi_sector; +} + +static inline void bip_set_seed(struct bio_integrity_payload *bip, + sector_t seed) +{ + bip->bip_iter.bi_sector = seed; +} + #endif /* CONFIG_BLK_DEV_INTEGRITY */ extern void bio_trim(struct bio *bio, int offset, int size); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d364c42..24c1e05 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1461,7 +1461,7 @@ static inline uint64_t rq_io_start_time_ns(struct request *req) #define INTEGRITY_FLAG_READ 2 /* verify data integrity on read */ #define INTEGRITY_FLAG_WRITE 4 /* generate data integrity on write */ -struct blk_integrity_exchg { +struct blk_integrity_iter { void *prot_buf; void *data_buf; sector_t seed; @@ -1470,12 +1470,11 @@ struct blk_integrity_exchg { const char *disk_name; }; -typedef void (integrity_gen_fn) (struct blk_integrity_exchg *); -typedef int (integrity_vrfy_fn) (struct blk_integrity_exchg *); +typedef int (integrity_processing_fn) (struct blk_integrity_iter *); struct blk_integrity { - integrity_gen_fn *generate_fn; - integrity_vrfy_fn *verify_fn; + integrity_processing_fn *generate_fn; + integrity_processing_fn *verify_fn; unsigned short flags; unsigned short tuple_size; -- cgit v0.10.2 From 8288f496eb1b1905c425e92eaf1abbb29119217b Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Fri, 26 Sep 2014 19:20:02 -0400 Subject: block: Add prefix to block integrity profile flags Add a BLK_ prefix to the integrity profile flags. Also rename the flags to be more consistent with the generate/verify terminology in the rest of the integrity code. Signed-off-by: Martin K. Petersen Reviewed-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Jens Axboe diff --git a/block/bio-integrity.c b/block/bio-integrity.c index fe4de03..e64733b 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -173,11 +173,11 @@ bool bio_integrity_enabled(struct bio *bio) return false; if (bio_data_dir(bio) == READ && bi->verify_fn != NULL && - (bi->flags & INTEGRITY_FLAG_READ)) + (bi->flags & BLK_INTEGRITY_VERIFY)) return true; if (bio_data_dir(bio) == WRITE && bi->generate_fn != NULL && - (bi->flags & INTEGRITY_FLAG_WRITE)) + (bi->flags & BLK_INTEGRITY_GENERATE)) return true; return false; diff --git a/block/blk-integrity.c b/block/blk-integrity.c index 3a83a7d..a7436cc 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -269,42 +269,42 @@ static ssize_t integrity_tag_size_show(struct blk_integrity *bi, char *page) return sprintf(page, "0\n"); } -static ssize_t integrity_read_store(struct blk_integrity *bi, - const char *page, size_t count) +static ssize_t integrity_verify_store(struct blk_integrity *bi, + const char *page, size_t count) { char *p = (char *) page; unsigned long val = simple_strtoul(p, &p, 10); if (val) - bi->flags |= INTEGRITY_FLAG_READ; + bi->flags |= BLK_INTEGRITY_VERIFY; else - bi->flags &= ~INTEGRITY_FLAG_READ; + bi->flags &= ~BLK_INTEGRITY_VERIFY; return count; } -static ssize_t integrity_read_show(struct blk_integrity *bi, char *page) +static ssize_t integrity_verify_show(struct blk_integrity *bi, char *page) { - return sprintf(page, "%d\n", (bi->flags & INTEGRITY_FLAG_READ) != 0); + return sprintf(page, "%d\n", (bi->flags & BLK_INTEGRITY_VERIFY) != 0); } -static ssize_t integrity_write_store(struct blk_integrity *bi, - const char *page, size_t count) +static ssize_t integrity_generate_store(struct blk_integrity *bi, + const char *page, size_t count) { char *p = (char *) page; unsigned long val = simple_strtoul(p, &p, 10); if (val) - bi->flags |= INTEGRITY_FLAG_WRITE; + bi->flags |= BLK_INTEGRITY_GENERATE; else - bi->flags &= ~INTEGRITY_FLAG_WRITE; + bi->flags &= ~BLK_INTEGRITY_GENERATE; return count; } -static ssize_t integrity_write_show(struct blk_integrity *bi, char *page) +static ssize_t integrity_generate_show(struct blk_integrity *bi, char *page) { - return sprintf(page, "%d\n", (bi->flags & INTEGRITY_FLAG_WRITE) != 0); + return sprintf(page, "%d\n", (bi->flags & BLK_INTEGRITY_GENERATE) != 0); } static struct integrity_sysfs_entry integrity_format_entry = { @@ -317,23 +317,23 @@ static struct integrity_sysfs_entry integrity_tag_size_entry = { .show = integrity_tag_size_show, }; -static struct integrity_sysfs_entry integrity_read_entry = { +static struct integrity_sysfs_entry integrity_verify_entry = { .attr = { .name = "read_verify", .mode = S_IRUGO | S_IWUSR }, - .show = integrity_read_show, - .store = integrity_read_store, + .show = integrity_verify_show, + .store = integrity_verify_store, }; -static struct integrity_sysfs_entry integrity_write_entry = { +static struct integrity_sysfs_entry integrity_generate_entry = { .attr = { .name = "write_generate", .mode = S_IRUGO | S_IWUSR }, - .show = integrity_write_show, - .store = integrity_write_store, + .show = integrity_generate_show, + .store = integrity_generate_store, }; static struct attribute *integrity_attrs[] = { &integrity_format_entry.attr, &integrity_tag_size_entry.attr, - &integrity_read_entry.attr, - &integrity_write_entry.attr, + &integrity_verify_entry.attr, + &integrity_generate_entry.attr, NULL, }; @@ -406,7 +406,7 @@ int blk_integrity_register(struct gendisk *disk, struct blk_integrity *template) kobject_uevent(&bi->kobj, KOBJ_ADD); - bi->flags |= INTEGRITY_FLAG_READ | INTEGRITY_FLAG_WRITE; + bi->flags |= BLK_INTEGRITY_VERIFY | BLK_INTEGRITY_GENERATE; bi->interval = queue_logical_block_size(disk->queue); disk->integrity = bi; } else @@ -419,6 +419,7 @@ int blk_integrity_register(struct gendisk *disk, struct blk_integrity *template) bi->verify_fn = template->verify_fn; bi->tuple_size = template->tuple_size; bi->tag_size = template->tag_size; + bi->flags |= template->flags; } else bi->name = bi_unsupported_name; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 24c1e05..cf92eb0 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1458,8 +1458,10 @@ static inline uint64_t rq_io_start_time_ns(struct request *req) #if defined(CONFIG_BLK_DEV_INTEGRITY) -#define INTEGRITY_FLAG_READ 2 /* verify data integrity on read */ -#define INTEGRITY_FLAG_WRITE 4 /* generate data integrity on write */ +enum blk_integrity_flags { + BLK_INTEGRITY_VERIFY = 1 << 0, + BLK_INTEGRITY_GENERATE = 1 << 1, +}; struct blk_integrity_iter { void *prot_buf; -- cgit v0.10.2 From 3aec2f41a8baeb70aa77556a4e4dcec7d9d70b4d Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Fri, 26 Sep 2014 19:20:03 -0400 Subject: block: Add a disk flag to block integrity profile So far we have relied on the app tag size to determine whether a disk has been formatted with T10 protection information or not. However, not all target devices provide application tag storage. Add a flag to the block integrity profile that indicates whether the disk has been formatted with protection information. Signed-off-by: Martin K. Petersen Reviewed-by: Sagi Grimberg Signed-off-by: Jens Axboe diff --git a/Documentation/ABI/testing/sysfs-block b/Documentation/ABI/testing/sysfs-block index 279da08..8df0039 100644 --- a/Documentation/ABI/testing/sysfs-block +++ b/Documentation/ABI/testing/sysfs-block @@ -53,6 +53,14 @@ Description: 512 bytes of data. +What: /sys/block//integrity/device_is_integrity_capable +Date: July 2014 +Contact: Martin K. Petersen +Description: + Indicates whether a storage device is capable of storing + integrity metadata. Set if the device is T10 PI-capable. + + What: /sys/block//integrity/write_generate Date: June 2008 Contact: Martin K. Petersen diff --git a/block/blk-integrity.c b/block/blk-integrity.c index a7436cc..1c6ba44 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -307,6 +307,12 @@ static ssize_t integrity_generate_show(struct blk_integrity *bi, char *page) return sprintf(page, "%d\n", (bi->flags & BLK_INTEGRITY_GENERATE) != 0); } +static ssize_t integrity_device_show(struct blk_integrity *bi, char *page) +{ + return sprintf(page, "%u\n", + (bi->flags & BLK_INTEGRITY_DEVICE_CAPABLE) != 0); +} + static struct integrity_sysfs_entry integrity_format_entry = { .attr = { .name = "format", .mode = S_IRUGO }, .show = integrity_format_show, @@ -329,11 +335,17 @@ static struct integrity_sysfs_entry integrity_generate_entry = { .store = integrity_generate_store, }; +static struct integrity_sysfs_entry integrity_device_entry = { + .attr = { .name = "device_is_integrity_capable", .mode = S_IRUGO }, + .show = integrity_device_show, +}; + static struct attribute *integrity_attrs[] = { &integrity_format_entry.attr, &integrity_tag_size_entry.attr, &integrity_verify_entry.attr, &integrity_generate_entry.attr, + &integrity_device_entry.attr, NULL, }; diff --git a/drivers/scsi/sd_dif.c b/drivers/scsi/sd_dif.c index 801c418..1e971c6 100644 --- a/drivers/scsi/sd_dif.c +++ b/drivers/scsi/sd_dif.c @@ -270,7 +270,13 @@ void sd_dif_config_host(struct scsi_disk *sdkp) "Enabling DIX %s protection\n", disk->integrity->name); /* Signal to block layer that we support sector tagging */ - if (dif && type && sdkp->ATO) { + if (dif && type) { + + disk->integrity->flags |= BLK_INTEGRITY_DEVICE_CAPABLE; + + if (!sdkp) + return; + if (type == SD_DIF_TYPE3_PROTECTION) disk->integrity->tag_size = sizeof(u16) + sizeof(u32); else diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index cf92eb0..4600fc6 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1461,6 +1461,7 @@ static inline uint64_t rq_io_start_time_ns(struct request *req) enum blk_integrity_flags { BLK_INTEGRITY_VERIFY = 1 << 0, BLK_INTEGRITY_GENERATE = 1 << 1, + BLK_INTEGRITY_DEVICE_CAPABLE = 1 << 2, }; struct blk_integrity_iter { -- cgit v0.10.2 From b1f01388574c9329922f760fc2a7335c2d14b08b Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Fri, 26 Sep 2014 19:20:04 -0400 Subject: block: Relocate bio integrity flags Move flags affecting the integrity code out of the bio bi_flags and into the block integrity payload. Signed-off-by: Martin K. Petersen Reviewed-by: Sagi Grimberg Signed-off-by: Jens Axboe diff --git a/block/bio-integrity.c b/block/bio-integrity.c index e64733b..26aa901 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -100,7 +100,7 @@ void bio_integrity_free(struct bio *bio) struct bio_integrity_payload *bip = bio_integrity(bio); struct bio_set *bs = bio->bi_pool; - if (bip->bip_owns_buf) + if (bip->bip_flags & BIP_BLOCK_INTEGRITY) kfree(page_address(bip->bip_vec->bv_page) + bip->bip_vec->bv_offset); @@ -293,7 +293,7 @@ int bio_integrity_prep(struct bio *bio) return -EIO; } - bip->bip_owns_buf = 1; + bip->bip_flags |= BIP_BLOCK_INTEGRITY; bip->bip_iter.bi_size = len; bip_set_seed(bip, bio->bi_iter.bi_sector); diff --git a/drivers/scsi/sd_dif.c b/drivers/scsi/sd_dif.c index 1e971c6..4ce636f 100644 --- a/drivers/scsi/sd_dif.c +++ b/drivers/scsi/sd_dif.c @@ -326,7 +326,7 @@ void sd_dif_prepare(struct request *rq, sector_t hw_sector, unsigned int j; /* Already remapped? */ - if (bio_flagged(bio, BIO_MAPPED_INTEGRITY)) + if (bip->bip_flags & BIP_MAPPED_INTEGRITY) break; virt = bip_get_seed(bip) & 0xffffffff; @@ -347,7 +347,7 @@ void sd_dif_prepare(struct request *rq, sector_t hw_sector, kunmap_atomic(sdt); } - bio->bi_flags |= (1 << BIO_MAPPED_INTEGRITY); + bip->bip_flags |= BIP_MAPPED_INTEGRITY; } } diff --git a/include/linux/bio.h b/include/linux/bio.h index 3fd3666..b508cf6 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -315,7 +315,7 @@ struct bio_integrity_payload { unsigned short bip_slab; /* slab the bip came from */ unsigned short bip_vcnt; /* # of integrity bio_vecs */ unsigned short bip_max_vcnt; /* integrity bio_vec slots */ - unsigned bip_owns_buf:1; /* should free bip_buf */ + unsigned short bip_flags; /* control flags */ struct work_struct bip_work; /* I/O completion */ @@ -323,6 +323,13 @@ struct bio_integrity_payload { struct bio_vec bip_inline_vecs[0];/* embedded bvec array */ }; +enum bip_flags { + BIP_BLOCK_INTEGRITY = 1 << 0, /* block layer owns integrity data */ + BIP_MAPPED_INTEGRITY = 1 << 1, /* ref tag has been remapped */ + BIP_CTRL_NOCHECK = 1 << 2, /* disable HBA integrity checking */ + BIP_DISK_NOCHECK = 1 << 3, /* disable disk integrity checking */ +}; + static inline sector_t bip_get_seed(struct bio_integrity_payload *bip) { return bip->bip_iter.bi_sector; diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 6a5d2f2..38bc008 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -120,10 +120,8 @@ struct bio { #define BIO_USER_MAPPED 6 /* contains user pages */ #define BIO_EOPNOTSUPP 7 /* not supported */ #define BIO_NULL_MAPPED 8 /* contains invalid user pages */ -#define BIO_FS_INTEGRITY 9 /* fs owns integrity data, not block layer */ -#define BIO_QUIET 10 /* Make BIO Quiet */ -#define BIO_MAPPED_INTEGRITY 11/* integrity metadata has been remapped */ -#define BIO_SNAP_STABLE 12 /* bio data must be snapshotted during write */ +#define BIO_QUIET 9 /* Make BIO Quiet */ +#define BIO_SNAP_STABLE 10 /* bio data must be snapshotted during write */ /* * Flags starting here get preserved by bio_reset() - this includes -- cgit v0.10.2 From aae7df50190a640e51bfe11c93f94741ac82ff0b Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Fri, 26 Sep 2014 19:20:05 -0400 Subject: block: Integrity checksum flag Make the choice of checksum a per-I/O property by introducing a flag that can be inspected by the SCSI layer. There are several reasons for this: 1. It allows us to switch choice of checksum without unloading and reloading the HBA driver. 2. During error recovery we need to be able to tell the HBA that checksums read from disk should not be verified and converted to IP checksums. 3. For error injection purposes we need to be able to write a bad guard tag to storage. Since the storage device only supports T10 CRC we need to be able to disable IP checksum conversion on the HBA. Signed-off-by: Martin K. Petersen Reviewed-by: Sagi Grimberg Signed-off-by: Jens Axboe diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 26aa901..8e05484 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -297,6 +297,9 @@ int bio_integrity_prep(struct bio *bio) bip->bip_iter.bi_size = len; bip_set_seed(bip, bio->bi_iter.bi_sector); + if (bi->flags & BLK_INTEGRITY_IP_CHECKSUM) + bip->bip_flags |= BIP_IP_CHECKSUM; + /* Map it */ offset = offset_in_page(buf); for (i = 0 ; i < nr_pages ; i++) { diff --git a/drivers/scsi/sd_dif.c b/drivers/scsi/sd_dif.c index 4ce636f..2198abe 100644 --- a/drivers/scsi/sd_dif.c +++ b/drivers/scsi/sd_dif.c @@ -255,12 +255,14 @@ void sd_dif_config_host(struct scsi_disk *sdkp) return; /* Enable DMA of protection information */ - if (scsi_host_get_guard(sdkp->device->host) & SHOST_DIX_GUARD_IP) + if (scsi_host_get_guard(sdkp->device->host) & SHOST_DIX_GUARD_IP) { if (type == SD_DIF_TYPE3_PROTECTION) blk_integrity_register(disk, &dif_type3_integrity_ip); else blk_integrity_register(disk, &dif_type1_integrity_ip); - else + + disk->integrity->flags |= BLK_INTEGRITY_IP_CHECKSUM; + } else if (type == SD_DIF_TYPE3_PROTECTION) blk_integrity_register(disk, &dif_type3_integrity_crc); else diff --git a/include/linux/bio.h b/include/linux/bio.h index b508cf6..14bff3f 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -328,6 +328,7 @@ enum bip_flags { BIP_MAPPED_INTEGRITY = 1 << 1, /* ref tag has been remapped */ BIP_CTRL_NOCHECK = 1 << 2, /* disable HBA integrity checking */ BIP_DISK_NOCHECK = 1 << 3, /* disable disk integrity checking */ + BIP_IP_CHECKSUM = 1 << 4, /* IP checksum */ }; static inline sector_t bip_get_seed(struct bio_integrity_payload *bip) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 4600fc6..773df19 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1462,6 +1462,7 @@ enum blk_integrity_flags { BLK_INTEGRITY_VERIFY = 1 << 0, BLK_INTEGRITY_GENERATE = 1 << 1, BLK_INTEGRITY_DEVICE_CAPABLE = 1 << 2, + BLK_INTEGRITY_IP_CHECKSUM = 1 << 3, }; struct blk_integrity_iter { -- cgit v0.10.2 From 4eaf99beadcefbf126fa05e66fb40fca999e09fd Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Fri, 26 Sep 2014 19:20:06 -0400 Subject: block: Don't merge requests if integrity flags differ We'd occasionally merge requests with conflicting integrity flags. Introduce a merge helper which checks that the requests have compatible integrity payloads. Signed-off-by: Martin K. Petersen Reviewed-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Jens Axboe diff --git a/block/blk-integrity.c b/block/blk-integrity.c index 1c6ba44..79ffb48 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -186,37 +186,53 @@ int blk_integrity_compare(struct gendisk *gd1, struct gendisk *gd2) } EXPORT_SYMBOL(blk_integrity_compare); -int blk_integrity_merge_rq(struct request_queue *q, struct request *req, - struct request *next) +bool blk_integrity_merge_rq(struct request_queue *q, struct request *req, + struct request *next) { - if (blk_integrity_rq(req) != blk_integrity_rq(next)) - return -1; + if (blk_integrity_rq(req) == 0 && blk_integrity_rq(next) == 0) + return true; + + if (blk_integrity_rq(req) == 0 || blk_integrity_rq(next) == 0) + return false; + + if (bio_integrity(req->bio)->bip_flags != + bio_integrity(next->bio)->bip_flags) + return false; if (req->nr_integrity_segments + next->nr_integrity_segments > q->limits.max_integrity_segments) - return -1; + return false; - return 0; + return true; } EXPORT_SYMBOL(blk_integrity_merge_rq); -int blk_integrity_merge_bio(struct request_queue *q, struct request *req, - struct bio *bio) +bool blk_integrity_merge_bio(struct request_queue *q, struct request *req, + struct bio *bio) { int nr_integrity_segs; struct bio *next = bio->bi_next; + if (blk_integrity_rq(req) == 0 && bio_integrity(bio) == NULL) + return true; + + if (blk_integrity_rq(req) == 0 || bio_integrity(bio) == NULL) + return false; + + if (bio_integrity(req->bio)->bip_flags != bio_integrity(bio)->bip_flags) + return false; + bio->bi_next = NULL; nr_integrity_segs = blk_rq_count_integrity_sg(q, bio); bio->bi_next = next; if (req->nr_integrity_segments + nr_integrity_segs > q->limits.max_integrity_segments) - return -1; + return false; req->nr_integrity_segments += nr_integrity_segs; - return 0; + return true; } EXPORT_SYMBOL(blk_integrity_merge_bio); diff --git a/block/blk-merge.c b/block/blk-merge.c index 7788179..f71bad3 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -313,7 +313,7 @@ static inline int ll_new_hw_segment(struct request_queue *q, if (req->nr_phys_segments + nr_phys_segs > queue_max_segments(q)) goto no_merge; - if (bio_integrity(bio) && blk_integrity_merge_bio(q, req, bio)) + if (blk_integrity_merge_bio(q, req, bio) == false) goto no_merge; /* @@ -410,7 +410,7 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req, if (total_phys_segments > queue_max_segments(q)) return 0; - if (blk_integrity_rq(req) && blk_integrity_merge_rq(q, req, next)) + if (blk_integrity_merge_rq(q, req, next) == false) return 0; /* Merge is OK... */ @@ -590,7 +590,7 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio) return false; /* only merge integrity protected bio into ditto rq */ - if (bio_integrity(bio) != blk_integrity_rq(rq)) + if (blk_integrity_merge_bio(rq->q, rq, bio) == false) return false; /* must be using the same buffer */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 773df19..038b40f 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1497,10 +1497,10 @@ extern int blk_integrity_compare(struct gendisk *, struct gendisk *); extern int blk_rq_map_integrity_sg(struct request_queue *, struct bio *, struct scatterlist *); extern int blk_rq_count_integrity_sg(struct request_queue *, struct bio *); -extern int blk_integrity_merge_rq(struct request_queue *, struct request *, - struct request *); -extern int blk_integrity_merge_bio(struct request_queue *, struct request *, - struct bio *); +extern bool blk_integrity_merge_rq(struct request_queue *, struct request *, + struct request *); +extern bool blk_integrity_merge_bio(struct request_queue *, struct request *, + struct bio *); static inline struct blk_integrity *bdev_get_integrity(struct block_device *bdev) @@ -1580,15 +1580,15 @@ static inline unsigned short queue_max_integrity_segments(struct request_queue * { return 0; } -static inline int blk_integrity_merge_rq(struct request_queue *rq, - struct request *r1, - struct request *r2) +static inline bool blk_integrity_merge_rq(struct request_queue *rq, + struct request *r1, + struct request *r2) { return 0; } -static inline int blk_integrity_merge_bio(struct request_queue *rq, - struct request *r, - struct bio *b) +static inline bool blk_integrity_merge_bio(struct request_queue *rq, + struct request *r, + struct bio *b) { return 0; } -- cgit v0.10.2 From 2341c2f8c33196d02cf5a721746eea4e3c06674a Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Fri, 26 Sep 2014 19:20:07 -0400 Subject: block: Add T10 Protection Information functions The T10 Protection Information format is also used by some devices that do not go through the SCSI layer (virtual block devices, NVMe). Relocate the relevant functions to a block layer library that can be used without involving SCSI. Signed-off-by: Martin K. Petersen Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe diff --git a/block/Kconfig b/block/Kconfig index 2429515..161491d 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -77,6 +77,7 @@ config BLK_DEV_BSGLIB config BLK_DEV_INTEGRITY bool "Block layer data integrity support" + select CRC_T10DIF if BLK_DEV_INTEGRITY ---help--- Some storage devices allow extra information to be stored/retrieved to help protect the data. The block layer diff --git a/block/Makefile b/block/Makefile index a2ce6ac..00ecc97 100644 --- a/block/Makefile +++ b/block/Makefile @@ -20,6 +20,6 @@ obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o -obj-$(CONFIG_BLK_DEV_INTEGRITY) += blk-integrity.o obj-$(CONFIG_BLK_CMDLINE_PARSER) += cmdline-parser.o -obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o +obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o blk-integrity.o t10-pi.o + diff --git a/block/t10-pi.c b/block/t10-pi.c new file mode 100644 index 0000000..24d6e97 --- /dev/null +++ b/block/t10-pi.c @@ -0,0 +1,197 @@ +/* + * t10_pi.c - Functions for generating and verifying T10 Protection + * Information. + * + * Copyright (C) 2007, 2008, 2014 Oracle Corporation + * Written by: Martin K. Petersen + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, + * USA. + * + */ + +#include +#include +#include +#include + +typedef __be16 (csum_fn) (void *, unsigned int); + +static const __be16 APP_ESCAPE = (__force __be16) 0xffff; +static const __be32 REF_ESCAPE = (__force __be32) 0xffffffff; + +static __be16 t10_pi_crc_fn(void *data, unsigned int len) +{ + return cpu_to_be16(crc_t10dif(data, len)); +} + +static __be16 t10_pi_ip_fn(void *data, unsigned int len) +{ + return (__force __be16)ip_compute_csum(data, len); +} + +/* + * Type 1 and Type 2 protection use the same format: 16 bit guard tag, + * 16 bit app tag, 32 bit reference tag. Type 3 does not define the ref + * tag. + */ +static int t10_pi_generate(struct blk_integrity_iter *iter, csum_fn *fn, + unsigned int type) +{ + unsigned int i; + + for (i = 0 ; i < iter->data_size ; i += iter->interval) { + struct t10_pi_tuple *pi = iter->prot_buf; + + pi->guard_tag = fn(iter->data_buf, iter->interval); + pi->app_tag = 0; + + if (type == 1) + pi->ref_tag = cpu_to_be32(lower_32_bits(iter->seed)); + else + pi->ref_tag = 0; + + iter->data_buf += iter->interval; + iter->prot_buf += sizeof(struct t10_pi_tuple); + iter->seed++; + } + + return 0; +} + +static int t10_pi_verify(struct blk_integrity_iter *iter, csum_fn *fn, + unsigned int type) +{ + unsigned int i; + + for (i = 0 ; i < iter->data_size ; i += iter->interval) { + struct t10_pi_tuple *pi = iter->prot_buf; + __be16 csum; + + switch (type) { + case 1: + case 2: + if (pi->app_tag == APP_ESCAPE) + goto next; + + if (be32_to_cpu(pi->ref_tag) != + lower_32_bits(iter->seed)) { + pr_err("%s: ref tag error at location %llu " \ + "(rcvd %u)\n", iter->disk_name, + (unsigned long long) + iter->seed, be32_to_cpu(pi->ref_tag)); + return -EILSEQ; + } + break; + case 3: + if (pi->app_tag == APP_ESCAPE && + pi->ref_tag == REF_ESCAPE) + goto next; + break; + } + + csum = fn(iter->data_buf, iter->interval); + + if (pi->guard_tag != csum) { + pr_err("%s: guard tag error at sector %llu " \ + "(rcvd %04x, want %04x)\n", iter->disk_name, + (unsigned long long)iter->seed, + be16_to_cpu(pi->guard_tag), be16_to_cpu(csum)); + return -EILSEQ; + } + +next: + iter->data_buf += iter->interval; + iter->prot_buf += sizeof(struct t10_pi_tuple); + iter->seed++; + } + + return 0; +} + +static int t10_pi_type1_generate_crc(struct blk_integrity_iter *iter) +{ + return t10_pi_generate(iter, t10_pi_crc_fn, 1); +} + +static int t10_pi_type1_generate_ip(struct blk_integrity_iter *iter) +{ + return t10_pi_generate(iter, t10_pi_ip_fn, 1); +} + +static int t10_pi_type1_verify_crc(struct blk_integrity_iter *iter) +{ + return t10_pi_verify(iter, t10_pi_crc_fn, 1); +} + +static int t10_pi_type1_verify_ip(struct blk_integrity_iter *iter) +{ + return t10_pi_verify(iter, t10_pi_ip_fn, 1); +} + +static int t10_pi_type3_generate_crc(struct blk_integrity_iter *iter) +{ + return t10_pi_generate(iter, t10_pi_crc_fn, 3); +} + +static int t10_pi_type3_generate_ip(struct blk_integrity_iter *iter) +{ + return t10_pi_generate(iter, t10_pi_ip_fn, 3); +} + +static int t10_pi_type3_verify_crc(struct blk_integrity_iter *iter) +{ + return t10_pi_verify(iter, t10_pi_crc_fn, 3); +} + +static int t10_pi_type3_verify_ip(struct blk_integrity_iter *iter) +{ + return t10_pi_verify(iter, t10_pi_ip_fn, 3); +} + +struct blk_integrity t10_pi_type1_crc = { + .name = "T10-DIF-TYPE1-CRC", + .generate_fn = t10_pi_type1_generate_crc, + .verify_fn = t10_pi_type1_verify_crc, + .tuple_size = sizeof(struct t10_pi_tuple), + .tag_size = 0, +}; +EXPORT_SYMBOL(t10_pi_type1_crc); + +struct blk_integrity t10_pi_type1_ip = { + .name = "T10-DIF-TYPE1-IP", + .generate_fn = t10_pi_type1_generate_ip, + .verify_fn = t10_pi_type1_verify_ip, + .tuple_size = sizeof(struct t10_pi_tuple), + .tag_size = 0, +}; +EXPORT_SYMBOL(t10_pi_type1_ip); + +struct blk_integrity t10_pi_type3_crc = { + .name = "T10-DIF-TYPE3-CRC", + .generate_fn = t10_pi_type3_generate_crc, + .verify_fn = t10_pi_type3_verify_crc, + .tuple_size = sizeof(struct t10_pi_tuple), + .tag_size = 0, +}; +EXPORT_SYMBOL(t10_pi_type3_crc); + +struct blk_integrity t10_pi_type3_ip = { + .name = "T10-DIF-TYPE3-IP", + .generate_fn = t10_pi_type3_generate_ip, + .verify_fn = t10_pi_type3_verify_ip, + .tuple_size = sizeof(struct t10_pi_tuple), + .tag_size = 0, +}; +EXPORT_SYMBOL(t10_pi_type3_ip); diff --git a/drivers/scsi/Kconfig b/drivers/scsi/Kconfig index 18a3358..9ece13f 100644 --- a/drivers/scsi/Kconfig +++ b/drivers/scsi/Kconfig @@ -62,7 +62,6 @@ comment "SCSI support type (disk, tape, CD-ROM)" config BLK_DEV_SD tristate "SCSI disk support" depends on SCSI - select CRC_T10DIF if BLK_DEV_INTEGRITY ---help--- If you want to use SCSI hard disks, Fibre Channel disks, Serial ATA (SATA) or Parallel ATA (PATA) hard disks, diff --git a/drivers/scsi/sd_dif.c b/drivers/scsi/sd_dif.c index 2198abe..b7eaead 100644 --- a/drivers/scsi/sd_dif.c +++ b/drivers/scsi/sd_dif.c @@ -21,7 +21,7 @@ */ #include -#include +#include #include #include @@ -33,207 +33,8 @@ #include #include -#include - #include "sd.h" -typedef __u16 (csum_fn) (void *, unsigned int); - -static __u16 sd_dif_crc_fn(void *data, unsigned int len) -{ - return cpu_to_be16(crc_t10dif(data, len)); -} - -static __u16 sd_dif_ip_fn(void *data, unsigned int len) -{ - return ip_compute_csum(data, len); -} - -/* - * Type 1 and Type 2 protection use the same format: 16 bit guard tag, - * 16 bit app tag, 32 bit reference tag. - */ -static void sd_dif_type1_generate(struct blk_integrity_iter *iter, csum_fn *fn) -{ - void *buf = iter->data_buf; - struct sd_dif_tuple *sdt = iter->prot_buf; - sector_t seed = iter->seed; - unsigned int i; - - for (i = 0 ; i < iter->data_size ; i += iter->interval, sdt++) { - sdt->guard_tag = fn(buf, iter->interval); - sdt->ref_tag = cpu_to_be32(seed & 0xffffffff); - sdt->app_tag = 0; - - buf += iter->interval; - seed++; - } -} - -static int sd_dif_type1_generate_crc(struct blk_integrity_iter *iter) -{ - sd_dif_type1_generate(iter, sd_dif_crc_fn); - return 0; -} - -static int sd_dif_type1_generate_ip(struct blk_integrity_iter *iter) -{ - sd_dif_type1_generate(iter, sd_dif_ip_fn); - return 0; -} - -static int sd_dif_type1_verify(struct blk_integrity_iter *iter, csum_fn *fn) -{ - void *buf = iter->data_buf; - struct sd_dif_tuple *sdt = iter->prot_buf; - sector_t seed = iter->seed; - unsigned int i; - __u16 csum; - - for (i = 0 ; i < iter->data_size ; i += iter->interval, sdt++) { - /* Unwritten sectors */ - if (sdt->app_tag == 0xffff) - return 0; - - if (be32_to_cpu(sdt->ref_tag) != (seed & 0xffffffff)) { - printk(KERN_ERR - "%s: ref tag error on sector %lu (rcvd %u)\n", - iter->disk_name, (unsigned long)seed, - be32_to_cpu(sdt->ref_tag)); - return -EIO; - } - - csum = fn(buf, iter->interval); - - if (sdt->guard_tag != csum) { - printk(KERN_ERR "%s: guard tag error on sector %lu " \ - "(rcvd %04x, data %04x)\n", iter->disk_name, - (unsigned long)seed, - be16_to_cpu(sdt->guard_tag), be16_to_cpu(csum)); - return -EIO; - } - - buf += iter->interval; - seed++; - } - - return 0; -} - -static int sd_dif_type1_verify_crc(struct blk_integrity_iter *iter) -{ - return sd_dif_type1_verify(iter, sd_dif_crc_fn); -} - -static int sd_dif_type1_verify_ip(struct blk_integrity_iter *iter) -{ - return sd_dif_type1_verify(iter, sd_dif_ip_fn); -} - -static struct blk_integrity dif_type1_integrity_crc = { - .name = "T10-DIF-TYPE1-CRC", - .generate_fn = sd_dif_type1_generate_crc, - .verify_fn = sd_dif_type1_verify_crc, - .tuple_size = sizeof(struct sd_dif_tuple), - .tag_size = 0, -}; - -static struct blk_integrity dif_type1_integrity_ip = { - .name = "T10-DIF-TYPE1-IP", - .generate_fn = sd_dif_type1_generate_ip, - .verify_fn = sd_dif_type1_verify_ip, - .tuple_size = sizeof(struct sd_dif_tuple), - .tag_size = 0, -}; - - -/* - * Type 3 protection has a 16-bit guard tag and 16 + 32 bits of opaque - * tag space. - */ -static void sd_dif_type3_generate(struct blk_integrity_iter *iter, csum_fn *fn) -{ - void *buf = iter->data_buf; - struct sd_dif_tuple *sdt = iter->prot_buf; - unsigned int i; - - for (i = 0 ; i < iter->data_size ; i += iter->interval, sdt++) { - sdt->guard_tag = fn(buf, iter->interval); - sdt->ref_tag = 0; - sdt->app_tag = 0; - - buf += iter->interval; - } -} - -static int sd_dif_type3_generate_crc(struct blk_integrity_iter *iter) -{ - sd_dif_type3_generate(iter, sd_dif_crc_fn); - return 0; -} - -static int sd_dif_type3_generate_ip(struct blk_integrity_iter *iter) -{ - sd_dif_type3_generate(iter, sd_dif_ip_fn); - return 0; -} - -static int sd_dif_type3_verify(struct blk_integrity_iter *iter, csum_fn *fn) -{ - void *buf = iter->data_buf; - struct sd_dif_tuple *sdt = iter->prot_buf; - sector_t seed = iter->seed; - unsigned int i; - __u16 csum; - - for (i = 0 ; i < iter->data_size ; i += iter->interval, sdt++) { - /* Unwritten sectors */ - if (sdt->app_tag == 0xffff && sdt->ref_tag == 0xffffffff) - return 0; - - csum = fn(buf, iter->interval); - - if (sdt->guard_tag != csum) { - printk(KERN_ERR "%s: guard tag error on sector %lu " \ - "(rcvd %04x, data %04x)\n", iter->disk_name, - (unsigned long)seed, - be16_to_cpu(sdt->guard_tag), be16_to_cpu(csum)); - return -EIO; - } - - buf += iter->interval; - seed++; - } - - return 0; -} - -static int sd_dif_type3_verify_crc(struct blk_integrity_iter *iter) -{ - return sd_dif_type3_verify(iter, sd_dif_crc_fn); -} - -static int sd_dif_type3_verify_ip(struct blk_integrity_iter *iter) -{ - return sd_dif_type3_verify(iter, sd_dif_ip_fn); -} - -static struct blk_integrity dif_type3_integrity_crc = { - .name = "T10-DIF-TYPE3-CRC", - .generate_fn = sd_dif_type3_generate_crc, - .verify_fn = sd_dif_type3_verify_crc, - .tuple_size = sizeof(struct sd_dif_tuple), - .tag_size = 0, -}; - -static struct blk_integrity dif_type3_integrity_ip = { - .name = "T10-DIF-TYPE3-IP", - .generate_fn = sd_dif_type3_generate_ip, - .verify_fn = sd_dif_type3_verify_ip, - .tuple_size = sizeof(struct sd_dif_tuple), - .tag_size = 0, -}; - /* * Configure exchange of protection information between OS and HBA. */ @@ -257,16 +58,16 @@ void sd_dif_config_host(struct scsi_disk *sdkp) /* Enable DMA of protection information */ if (scsi_host_get_guard(sdkp->device->host) & SHOST_DIX_GUARD_IP) { if (type == SD_DIF_TYPE3_PROTECTION) - blk_integrity_register(disk, &dif_type3_integrity_ip); + blk_integrity_register(disk, &t10_pi_type3_ip); else - blk_integrity_register(disk, &dif_type1_integrity_ip); + blk_integrity_register(disk, &t10_pi_type1_ip); disk->integrity->flags |= BLK_INTEGRITY_IP_CHECKSUM; } else if (type == SD_DIF_TYPE3_PROTECTION) - blk_integrity_register(disk, &dif_type3_integrity_crc); + blk_integrity_register(disk, &t10_pi_type3_crc); else - blk_integrity_register(disk, &dif_type1_integrity_crc); + blk_integrity_register(disk, &t10_pi_type1_crc); sd_printk(KERN_NOTICE, sdkp, "Enabling DIX %s protection\n", disk->integrity->name); @@ -308,10 +109,10 @@ void sd_dif_config_host(struct scsi_disk *sdkp) void sd_dif_prepare(struct request *rq, sector_t hw_sector, unsigned int sector_sz) { - const int tuple_sz = sizeof(struct sd_dif_tuple); + const int tuple_sz = sizeof(struct t10_pi_tuple); struct bio *bio; struct scsi_disk *sdkp; - struct sd_dif_tuple *sdt; + struct t10_pi_tuple *pi; u32 phys, virt; sdkp = rq->bio->bi_bdev->bd_disk->private_data; @@ -334,19 +135,18 @@ void sd_dif_prepare(struct request *rq, sector_t hw_sector, virt = bip_get_seed(bip) & 0xffffffff; bip_for_each_vec(iv, bip, iter) { - sdt = kmap_atomic(iv.bv_page) - + iv.bv_offset; + pi = kmap_atomic(iv.bv_page) + iv.bv_offset; - for (j = 0; j < iv.bv_len; j += tuple_sz, sdt++) { + for (j = 0; j < iv.bv_len; j += tuple_sz, pi++) { - if (be32_to_cpu(sdt->ref_tag) == virt) - sdt->ref_tag = cpu_to_be32(phys); + if (be32_to_cpu(pi->ref_tag) == virt) + pi->ref_tag = cpu_to_be32(phys); virt++; phys++; } - kunmap_atomic(sdt); + kunmap_atomic(pi); } bip->bip_flags |= BIP_MAPPED_INTEGRITY; @@ -359,10 +159,10 @@ void sd_dif_prepare(struct request *rq, sector_t hw_sector, */ void sd_dif_complete(struct scsi_cmnd *scmd, unsigned int good_bytes) { - const int tuple_sz = sizeof(struct sd_dif_tuple); + const int tuple_sz = sizeof(struct t10_pi_tuple); struct scsi_disk *sdkp; struct bio *bio; - struct sd_dif_tuple *sdt; + struct t10_pi_tuple *pi; unsigned int j, sectors, sector_sz; u32 phys, virt; @@ -386,25 +186,24 @@ void sd_dif_complete(struct scsi_cmnd *scmd, unsigned int good_bytes) virt = bip_get_seed(bip) & 0xffffffff; bip_for_each_vec(iv, bip, iter) { - sdt = kmap_atomic(iv.bv_page) - + iv.bv_offset; + pi = kmap_atomic(iv.bv_page) + iv.bv_offset; - for (j = 0; j < iv.bv_len; j += tuple_sz, sdt++) { + for (j = 0; j < iv.bv_len; j += tuple_sz, pi++) { if (sectors == 0) { - kunmap_atomic(sdt); + kunmap_atomic(pi); return; } - if (be32_to_cpu(sdt->ref_tag) == phys) - sdt->ref_tag = cpu_to_be32(virt); + if (be32_to_cpu(pi->ref_tag) == phys) + pi->ref_tag = cpu_to_be32(virt); virt++; phys++; sectors--; } - kunmap_atomic(sdt); + kunmap_atomic(pi); } } } diff --git a/include/linux/crc-t10dif.h b/include/linux/crc-t10dif.h index b3cb71f..cf53d07 100644 --- a/include/linux/crc-t10dif.h +++ b/include/linux/crc-t10dif.h @@ -6,7 +6,8 @@ #define CRC_T10DIF_DIGEST_SIZE 2 #define CRC_T10DIF_BLOCK_SIZE 1 -__u16 crc_t10dif_generic(__u16 crc, const unsigned char *buffer, size_t len); -__u16 crc_t10dif(unsigned char const *, size_t); +extern __u16 crc_t10dif_generic(__u16 crc, const unsigned char *buffer, + size_t len); +extern __u16 crc_t10dif(unsigned char const *, size_t); #endif diff --git a/include/linux/t10-pi.h b/include/linux/t10-pi.h new file mode 100644 index 0000000..6a8b994 --- /dev/null +++ b/include/linux/t10-pi.h @@ -0,0 +1,22 @@ +#ifndef _LINUX_T10_PI_H +#define _LINUX_T10_PI_H + +#include +#include + +/* + * T10 Protection Information tuple. + */ +struct t10_pi_tuple { + __be16 guard_tag; /* Checksum */ + __be16 app_tag; /* Opaque storage */ + __be32 ref_tag; /* Target LBA or indirect LBA */ +}; + + +extern struct blk_integrity t10_pi_type1_crc; +extern struct blk_integrity t10_pi_type1_ip; +extern struct blk_integrity t10_pi_type3_crc; +extern struct blk_integrity t10_pi_type3_ip; + +#endif -- cgit v0.10.2 From 582940508b5d589229d0232e0eeee8fef0d54809 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Tue, 16 Sep 2014 22:51:16 +0200 Subject: block: Replace strnicmp with strncasecmp The kernel used to contain two functions for length-delimited, case-insensitive string comparison, strnicmp with correct semantics and a slightly buggy strncasecmp. The latter is the POSIX name, so strnicmp was renamed to strncasecmp, and strnicmp made into a wrapper for the new strncasecmp to avoid breaking existing users. To allow the compat wrapper strnicmp to be removed at some point in the future, and to avoid the extra indirection cost, do s/strnicmp/strncasecmp/g. Cc: Jens Axboe Signed-off-by: Rasmus Villemoes Signed-off-by: Jens Axboe diff --git a/block/partitions/mac.c b/block/partitions/mac.c index 76d8ba6..c2c48ec 100644 --- a/block/partitions/mac.c +++ b/block/partitions/mac.c @@ -81,7 +81,7 @@ int mac_partition(struct parsed_partitions *state) be32_to_cpu(part->start_block) * (secsize/512), be32_to_cpu(part->block_count) * (secsize/512)); - if (!strnicmp(part->type, "Linux_RAID", 10)) + if (!strncasecmp(part->type, "Linux_RAID", 10)) state->parts[slot].flags = ADDPART_FLAG_RAID; #ifdef CONFIG_PPC_PMAC /* @@ -100,7 +100,7 @@ int mac_partition(struct parsed_partitions *state) goodness++; if (strcasecmp(part->type, "Apple_UNIX_SVR2") == 0 - || (strnicmp(part->type, "Linux", 5) == 0 + || (strncasecmp(part->type, "Linux", 5) == 0 && strcasecmp(part->type, "Linux_swap") != 0)) { int i, l; @@ -109,13 +109,13 @@ int mac_partition(struct parsed_partitions *state) if (strcmp(part->name, "/") == 0) goodness++; for (i = 0; i <= l - 4; ++i) { - if (strnicmp(part->name + i, "root", + if (strncasecmp(part->name + i, "root", 4) == 0) { goodness += 2; break; } } - if (strnicmp(part->name, "swap", 4) == 0) + if (strncasecmp(part->name, "swap", 4) == 0) goodness--; } -- cgit v0.10.2 From c611529e7cd3465ec0eada0f44200e8420c38908 Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Fri, 26 Sep 2014 19:20:08 -0400 Subject: sd: Honor block layer integrity handling flags A set of flags introduced in the block layer enable better control over how protection information is handled. These flags are useful for both error injection and data recovery purposes. Checking can be enabled and disabled for controller and disk, and the guard tag format is now a per-I/O property. Update sd_protect_op to communicate the relevant information to the low-level device driver via a set of flags in scsi_cmnd. Signed-off-by: Martin K. Petersen Reviewed-by: Sagi Grimberg Acked-by: Christoph Hellwig Signed-off-by: Jens Axboe diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 2c2041c..9f7099f 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -610,29 +610,44 @@ static void scsi_disk_put(struct scsi_disk *sdkp) mutex_unlock(&sd_ref_mutex); } -static void sd_prot_op(struct scsi_cmnd *scmd, unsigned int dif) -{ - unsigned int prot_op = SCSI_PROT_NORMAL; - unsigned int dix = scsi_prot_sg_count(scmd); - - if (scmd->sc_data_direction == DMA_FROM_DEVICE) { - if (dif && dix) - prot_op = SCSI_PROT_READ_PASS; - else if (dif && !dix) - prot_op = SCSI_PROT_READ_STRIP; - else if (!dif && dix) - prot_op = SCSI_PROT_READ_INSERT; - } else { - if (dif && dix) - prot_op = SCSI_PROT_WRITE_PASS; - else if (dif && !dix) - prot_op = SCSI_PROT_WRITE_INSERT; - else if (!dif && dix) - prot_op = SCSI_PROT_WRITE_STRIP; + + +static unsigned char sd_setup_protect_cmnd(struct scsi_cmnd *scmd, + unsigned int dix, unsigned int dif) +{ + struct bio *bio = scmd->request->bio; + unsigned int prot_op = sd_prot_op(rq_data_dir(scmd->request), dix, dif); + unsigned int protect = 0; + + if (dix) { /* DIX Type 0, 1, 2, 3 */ + if (bio_integrity_flagged(bio, BIP_IP_CHECKSUM)) + scmd->prot_flags |= SCSI_PROT_IP_CHECKSUM; + + if (bio_integrity_flagged(bio, BIP_CTRL_NOCHECK) == false) + scmd->prot_flags |= SCSI_PROT_GUARD_CHECK; + } + + if (dif != SD_DIF_TYPE3_PROTECTION) { /* DIX/DIF Type 0, 1, 2 */ + scmd->prot_flags |= SCSI_PROT_REF_INCREMENT; + + if (bio_integrity_flagged(bio, BIP_CTRL_NOCHECK) == false) + scmd->prot_flags |= SCSI_PROT_REF_CHECK; + } + + if (dif) { /* DIX/DIF Type 1, 2, 3 */ + scmd->prot_flags |= SCSI_PROT_TRANSFER_PI; + + if (bio_integrity_flagged(bio, BIP_DISK_NOCHECK)) + protect = 3 << 5; /* Disable target PI checking */ + else + protect = 1 << 5; /* Enable target PI checking */ } scsi_set_prot_op(scmd, prot_op); scsi_set_prot_type(scmd, dif); + scmd->prot_flags &= sd_prot_flag_mask(prot_op); + + return protect; } static void sd_config_discard(struct scsi_disk *sdkp, unsigned int mode) @@ -893,7 +908,8 @@ static int sd_setup_read_write_cmnd(struct scsi_cmnd *SCpnt) sector_t block = blk_rq_pos(rq); sector_t threshold; unsigned int this_count = blk_rq_sectors(rq); - int ret, host_dif; + unsigned int dif, dix; + int ret; unsigned char protect; ret = scsi_init_io(SCpnt, GFP_ATOMIC); @@ -995,7 +1011,7 @@ static int sd_setup_read_write_cmnd(struct scsi_cmnd *SCpnt) SCpnt->cmnd[0] = WRITE_6; if (blk_integrity_rq(rq)) - sd_dif_prepare(rq, block, sdp->sector_size); + sd_dif_prepare(SCpnt); } else if (rq_data_dir(rq) == READ) { SCpnt->cmnd[0] = READ_6; @@ -1010,14 +1026,15 @@ static int sd_setup_read_write_cmnd(struct scsi_cmnd *SCpnt) "writing" : "reading", this_count, blk_rq_sectors(rq))); - /* Set RDPROTECT/WRPROTECT if disk is formatted with DIF */ - host_dif = scsi_host_dif_capable(sdp->host, sdkp->protection_type); - if (host_dif) - protect = 1 << 5; + dix = scsi_prot_sg_count(SCpnt); + dif = scsi_host_dif_capable(SCpnt->device->host, sdkp->protection_type); + + if (dif || dix) + protect = sd_setup_protect_cmnd(SCpnt, dix, dif); else protect = 0; - if (host_dif == SD_DIF_TYPE2_PROTECTION) { + if (protect && sdkp->protection_type == SD_DIF_TYPE2_PROTECTION) { SCpnt->cmnd = mempool_alloc(sd_cdb_pool, GFP_ATOMIC); if (unlikely(SCpnt->cmnd == NULL)) { @@ -1102,10 +1119,6 @@ static int sd_setup_read_write_cmnd(struct scsi_cmnd *SCpnt) } SCpnt->sdb.length = this_count * sdp->sector_size; - /* If DIF or DIX is enabled, tell HBA how to handle request */ - if (host_dif || scsi_prot_sg_count(SCpnt)) - sd_prot_op(SCpnt, host_dif); - /* * We shouldn't disconnect in the middle of a sector, so with a dumb * host adapter, it's safe to assume that we can at least transfer diff --git a/drivers/scsi/sd.h b/drivers/scsi/sd.h index 4c3ab83..4673778 100644 --- a/drivers/scsi/sd.h +++ b/drivers/scsi/sd.h @@ -167,6 +167,68 @@ enum sd_dif_target_protection_types { }; /* + * Look up the DIX operation based on whether the command is read or + * write and whether dix and dif are enabled. + */ +static inline unsigned int sd_prot_op(bool write, bool dix, bool dif) +{ + /* Lookup table: bit 2 (write), bit 1 (dix), bit 0 (dif) */ + const unsigned int ops[] = { /* wrt dix dif */ + SCSI_PROT_NORMAL, /* 0 0 0 */ + SCSI_PROT_READ_STRIP, /* 0 0 1 */ + SCSI_PROT_READ_INSERT, /* 0 1 0 */ + SCSI_PROT_READ_PASS, /* 0 1 1 */ + SCSI_PROT_NORMAL, /* 1 0 0 */ + SCSI_PROT_WRITE_INSERT, /* 1 0 1 */ + SCSI_PROT_WRITE_STRIP, /* 1 1 0 */ + SCSI_PROT_WRITE_PASS, /* 1 1 1 */ + }; + + return ops[write << 2 | dix << 1 | dif]; +} + +/* + * Returns a mask of the protection flags that are valid for a given DIX + * operation. + */ +static inline unsigned int sd_prot_flag_mask(unsigned int prot_op) +{ + const unsigned int flag_mask[] = { + [SCSI_PROT_NORMAL] = 0, + + [SCSI_PROT_READ_STRIP] = SCSI_PROT_TRANSFER_PI | + SCSI_PROT_GUARD_CHECK | + SCSI_PROT_REF_CHECK | + SCSI_PROT_REF_INCREMENT, + + [SCSI_PROT_READ_INSERT] = SCSI_PROT_REF_INCREMENT | + SCSI_PROT_IP_CHECKSUM, + + [SCSI_PROT_READ_PASS] = SCSI_PROT_TRANSFER_PI | + SCSI_PROT_GUARD_CHECK | + SCSI_PROT_REF_CHECK | + SCSI_PROT_REF_INCREMENT | + SCSI_PROT_IP_CHECKSUM, + + [SCSI_PROT_WRITE_INSERT] = SCSI_PROT_TRANSFER_PI | + SCSI_PROT_REF_INCREMENT, + + [SCSI_PROT_WRITE_STRIP] = SCSI_PROT_GUARD_CHECK | + SCSI_PROT_REF_CHECK | + SCSI_PROT_REF_INCREMENT | + SCSI_PROT_IP_CHECKSUM, + + [SCSI_PROT_WRITE_PASS] = SCSI_PROT_TRANSFER_PI | + SCSI_PROT_GUARD_CHECK | + SCSI_PROT_REF_CHECK | + SCSI_PROT_REF_INCREMENT | + SCSI_PROT_IP_CHECKSUM, + }; + + return flag_mask[prot_op]; +} + +/* * Data Integrity Field tuple. */ struct sd_dif_tuple { @@ -178,7 +240,7 @@ struct sd_dif_tuple { #ifdef CONFIG_BLK_DEV_INTEGRITY extern void sd_dif_config_host(struct scsi_disk *); -extern void sd_dif_prepare(struct request *rq, sector_t, unsigned int); +extern void sd_dif_prepare(struct scsi_cmnd *scmd); extern void sd_dif_complete(struct scsi_cmnd *, unsigned int); #else /* CONFIG_BLK_DEV_INTEGRITY */ @@ -186,7 +248,7 @@ extern void sd_dif_complete(struct scsi_cmnd *, unsigned int); static inline void sd_dif_config_host(struct scsi_disk *disk) { } -static inline int sd_dif_prepare(struct request *rq, sector_t s, unsigned int a) +static inline int sd_dif_prepare(struct scsi_cmnd *scmd) { return 0; } diff --git a/drivers/scsi/sd_dif.c b/drivers/scsi/sd_dif.c index b7eaead..14c7d42 100644 --- a/drivers/scsi/sd_dif.c +++ b/drivers/scsi/sd_dif.c @@ -106,8 +106,7 @@ void sd_dif_config_host(struct scsi_disk *sdkp) * * Type 3 does not have a reference tag so no remapping is required. */ -void sd_dif_prepare(struct request *rq, sector_t hw_sector, - unsigned int sector_sz) +void sd_dif_prepare(struct scsi_cmnd *scmd) { const int tuple_sz = sizeof(struct t10_pi_tuple); struct bio *bio; @@ -115,14 +114,14 @@ void sd_dif_prepare(struct request *rq, sector_t hw_sector, struct t10_pi_tuple *pi; u32 phys, virt; - sdkp = rq->bio->bi_bdev->bd_disk->private_data; + sdkp = scsi_disk(scmd->request->rq_disk); if (sdkp->protection_type == SD_DIF_TYPE3_PROTECTION) return; - phys = hw_sector & 0xffffffff; + phys = scsi_prot_ref_tag(scmd); - __rq_for_each_bio(bio, rq) { + __rq_for_each_bio(bio, scmd->request) { struct bio_integrity_payload *bip = bio_integrity(bio); struct bio_vec iv; struct bvec_iter iter; @@ -163,7 +162,7 @@ void sd_dif_complete(struct scsi_cmnd *scmd, unsigned int good_bytes) struct scsi_disk *sdkp; struct bio *bio; struct t10_pi_tuple *pi; - unsigned int j, sectors, sector_sz; + unsigned int j, intervals; u32 phys, virt; sdkp = scsi_disk(scmd->request->rq_disk); @@ -171,12 +170,8 @@ void sd_dif_complete(struct scsi_cmnd *scmd, unsigned int good_bytes) if (sdkp->protection_type == SD_DIF_TYPE3_PROTECTION || good_bytes == 0) return; - sector_sz = scmd->device->sector_size; - sectors = good_bytes / sector_sz; - - phys = blk_rq_pos(scmd->request) & 0xffffffff; - if (sector_sz == 4096) - phys >>= 3; + intervals = good_bytes / scsi_prot_interval(scmd); + phys = scsi_prot_ref_tag(scmd); __rq_for_each_bio(bio, scmd->request) { struct bio_integrity_payload *bip = bio_integrity(bio); @@ -190,7 +185,7 @@ void sd_dif_complete(struct scsi_cmnd *scmd, unsigned int good_bytes) for (j = 0; j < iv.bv_len; j += tuple_sz, pi++) { - if (sectors == 0) { + if (intervals == 0) { kunmap_atomic(pi); return; } @@ -200,7 +195,7 @@ void sd_dif_complete(struct scsi_cmnd *scmd, unsigned int good_bytes) virt++; phys++; - sectors--; + intervals--; } kunmap_atomic(pi); diff --git a/include/linux/bio.h b/include/linux/bio.h index 14bff3f..ce6b759 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -292,6 +292,14 @@ static inline unsigned bio_segments(struct bio *bio) */ #define bio_get(bio) atomic_inc(&(bio)->bi_cnt) +enum bip_flags { + BIP_BLOCK_INTEGRITY = 1 << 0, /* block layer owns integrity data */ + BIP_MAPPED_INTEGRITY = 1 << 1, /* ref tag has been remapped */ + BIP_CTRL_NOCHECK = 1 << 2, /* disable HBA integrity checking */ + BIP_DISK_NOCHECK = 1 << 3, /* disable disk integrity checking */ + BIP_IP_CHECKSUM = 1 << 4, /* IP checksum */ +}; + #if defined(CONFIG_BLK_DEV_INTEGRITY) static inline struct bio_integrity_payload *bio_integrity(struct bio *bio) @@ -323,13 +331,15 @@ struct bio_integrity_payload { struct bio_vec bip_inline_vecs[0];/* embedded bvec array */ }; -enum bip_flags { - BIP_BLOCK_INTEGRITY = 1 << 0, /* block layer owns integrity data */ - BIP_MAPPED_INTEGRITY = 1 << 1, /* ref tag has been remapped */ - BIP_CTRL_NOCHECK = 1 << 2, /* disable HBA integrity checking */ - BIP_DISK_NOCHECK = 1 << 3, /* disable disk integrity checking */ - BIP_IP_CHECKSUM = 1 << 4, /* IP checksum */ -}; +static inline bool bio_integrity_flagged(struct bio *bio, enum bip_flags flag) +{ + struct bio_integrity_payload *bip = bio_integrity(bio); + + if (bip) + return bip->bip_flags & flag; + + return false; +} static inline sector_t bip_get_seed(struct bio_integrity_payload *bip) { @@ -701,9 +711,9 @@ extern void bio_integrity_init(void); #else /* CONFIG_BLK_DEV_INTEGRITY */ -static inline int bio_integrity(struct bio *bio) +static inline void *bio_integrity(struct bio *bio) { - return 0; + return NULL; } static inline bool bio_integrity_enabled(struct bio *bio) @@ -754,6 +764,11 @@ static inline void bio_integrity_init(void) return; } +static inline bool bio_integrity_flagged(struct bio *bio, enum bip_flags flag) +{ + return false; +} + #endif /* CONFIG_BLK_DEV_INTEGRITY */ #endif /* CONFIG_BLOCK */ diff --git a/include/scsi/scsi_cmnd.h b/include/scsi/scsi_cmnd.h index 73f3490..522a5f27 100644 --- a/include/scsi/scsi_cmnd.h +++ b/include/scsi/scsi_cmnd.h @@ -10,9 +10,10 @@ #include struct Scsi_Host; -struct scsi_device; struct scsi_driver; +#include + /* * MAX_COMMAND_SIZE is: * The longest fixed-length SCSI CDB as per the SCSI standard. @@ -81,6 +82,7 @@ struct scsi_cmnd { unsigned char prot_op; unsigned char prot_type; + unsigned char prot_flags; unsigned short cmd_len; enum dma_data_direction sc_data_direction; @@ -252,6 +254,14 @@ static inline unsigned char scsi_get_prot_op(struct scsi_cmnd *scmd) return scmd->prot_op; } +enum scsi_prot_flags { + SCSI_PROT_TRANSFER_PI = 1 << 0, + SCSI_PROT_GUARD_CHECK = 1 << 1, + SCSI_PROT_REF_CHECK = 1 << 2, + SCSI_PROT_REF_INCREMENT = 1 << 3, + SCSI_PROT_IP_CHECKSUM = 1 << 4, +}; + /* * The controller usually does not know anything about the target it * is communicating with. However, when DIX is enabled the controller @@ -280,6 +290,17 @@ static inline sector_t scsi_get_lba(struct scsi_cmnd *scmd) return blk_rq_pos(scmd->request); } +static inline unsigned int scsi_prot_interval(struct scsi_cmnd *scmd) +{ + return scmd->device->sector_size; +} + +static inline u32 scsi_prot_ref_tag(struct scsi_cmnd *scmd) +{ + return blk_rq_pos(scmd->request) >> + (ilog2(scsi_prot_interval(scmd)) - 9) & 0xffffffff; +} + static inline unsigned scsi_prot_sg_count(struct scsi_cmnd *cmd) { return cmd->prot_sdb ? cmd->prot_sdb->table.nents : 0; @@ -316,17 +337,12 @@ static inline void set_driver_byte(struct scsi_cmnd *cmd, char status) static inline unsigned scsi_transfer_length(struct scsi_cmnd *scmd) { unsigned int xfer_len = scsi_out(scmd)->length; - unsigned int prot_op = scsi_get_prot_op(scmd); - unsigned int sector_size = scmd->device->sector_size; + unsigned int prot_interval = scsi_prot_interval(scmd); - switch (prot_op) { - case SCSI_PROT_NORMAL: - case SCSI_PROT_WRITE_STRIP: - case SCSI_PROT_READ_INSERT: - return xfer_len; - } + if (scmd->prot_flags & SCSI_PROT_TRANSFER_PI) + xfer_len += (xfer_len >> ilog2(prot_interval)) * 8; - return xfer_len + (xfer_len >> ilog2(sector_size)) * 8; + return xfer_len; } #endif /* _SCSI_SCSI_CMND_H */ -- cgit v0.10.2 From 4a0efdc933680d908de11712a774a2c9492c3d5a Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Wed, 1 Oct 2014 14:32:31 +0200 Subject: block: misplaced rq_complete tracepoint The rq_complete tracepoint was never issued for empty requests, causing the resulting blktrace information to never show any completion for those request. Signed-off-by: Hannes Reinecke Acked-by: Tejun Heo Signed-off-by: Jens Axboe diff --git a/block/blk-core.c b/block/blk-core.c index e1c2775..4aa9ccd 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2400,11 +2400,11 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes) { int total_bytes; + trace_block_rq_complete(req->q, req, nr_bytes); + if (!req->bio) return false; - trace_block_rq_complete(req->q, req, nr_bytes); - /* * For fs requests, rq is just carrier of independent bio's * and each partial completion should be handled separately. -- cgit v0.10.2 From 11dfce509eaa35e8fc81cb50d0910c0e235fd7e2 Mon Sep 17 00:00:00 2001 From: Junichi Nomura Date: Fri, 3 Oct 2014 17:27:11 -0400 Subject: block: use bio_clone_fast() in blk_rq_prep_clone() Request cloning clones bios in the request to track the completion of each bio. For that purpose, we can use bio_clone_fast() instead of bio_clone() to avoid unnecessary allocation and copy of bvecs. This patch reduces memory footprint of request-based device-mapper (about 1-4KB for each request) and is a preparation for further reduction of memory usage by removing unused bvec mempool. Signed-off-by: Jun'ichi Nomura Signed-off-by: Mike Snitzer Signed-off-by: Jens Axboe diff --git a/block/blk-core.c b/block/blk-core.c index 4aa9ccd..ffcb47a 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2926,7 +2926,7 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src, blk_rq_init(NULL, rq); __rq_for_each_bio(bio_src, rq_src) { - bio = bio_clone_bioset(bio_src, gfp_mask, bs); + bio = bio_clone_fast(bio_src, gfp_mask, bs); if (!bio) goto free_and_out; -- cgit v0.10.2 From d8f429e1669b9709f5b669aac9d734dbe0640891 Mon Sep 17 00:00:00 2001 From: Junichi Nomura Date: Fri, 3 Oct 2014 17:27:12 -0400 Subject: block: add bioset_create_nobvec() Users of bio_clone_fast() do not want bios with their own bvecs. Allocating a bvec mempool as part of the bioset intended for such users is a waste of memory. bioset_create_nobvec() creates a bioset that doesn't have the bvec mempool. Signed-off-by: Jun'ichi Nomura Signed-off-by: Mike Snitzer Signed-off-by: Jens Axboe diff --git a/block/bio.c b/block/bio.c index 3e6331d..3e6e198 100644 --- a/block/bio.c +++ b/block/bio.c @@ -428,6 +428,9 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) front_pad = 0; inline_vecs = nr_iovecs; } else { + /* should not use nobvec bioset for nr_iovecs > 0 */ + if (WARN_ON_ONCE(!bs->bvec_pool && nr_iovecs > 0)) + return NULL; /* * generic_make_request() converts recursion to iteration; this * means if we're running beneath it, any bios we allocate and @@ -1900,20 +1903,9 @@ void bioset_free(struct bio_set *bs) } EXPORT_SYMBOL(bioset_free); -/** - * bioset_create - Create a bio_set - * @pool_size: Number of bio and bio_vecs to cache in the mempool - * @front_pad: Number of bytes to allocate in front of the returned bio - * - * Description: - * Set up a bio_set to be used with @bio_alloc_bioset. Allows the caller - * to ask for a number of bytes to be allocated in front of the bio. - * Front pad allocation is useful for embedding the bio inside - * another structure, to avoid allocating extra data to go with the bio. - * Note that the bio must be embedded at the END of that structure always, - * or things will break badly. - */ -struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad) +static struct bio_set *__bioset_create(unsigned int pool_size, + unsigned int front_pad, + bool create_bvec_pool) { unsigned int back_pad = BIO_INLINE_VECS * sizeof(struct bio_vec); struct bio_set *bs; @@ -1938,9 +1930,11 @@ struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad) if (!bs->bio_pool) goto bad; - bs->bvec_pool = biovec_create_pool(pool_size); - if (!bs->bvec_pool) - goto bad; + if (create_bvec_pool) { + bs->bvec_pool = biovec_create_pool(pool_size); + if (!bs->bvec_pool) + goto bad; + } bs->rescue_workqueue = alloc_workqueue("bioset", WQ_MEM_RECLAIM, 0); if (!bs->rescue_workqueue) @@ -1951,8 +1945,41 @@ bad: bioset_free(bs); return NULL; } + +/** + * bioset_create - Create a bio_set + * @pool_size: Number of bio and bio_vecs to cache in the mempool + * @front_pad: Number of bytes to allocate in front of the returned bio + * + * Description: + * Set up a bio_set to be used with @bio_alloc_bioset. Allows the caller + * to ask for a number of bytes to be allocated in front of the bio. + * Front pad allocation is useful for embedding the bio inside + * another structure, to avoid allocating extra data to go with the bio. + * Note that the bio must be embedded at the END of that structure always, + * or things will break badly. + */ +struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad) +{ + return __bioset_create(pool_size, front_pad, true); +} EXPORT_SYMBOL(bioset_create); +/** + * bioset_create_nobvec - Create a bio_set without bio_vec mempool + * @pool_size: Number of bio to cache in the mempool + * @front_pad: Number of bytes to allocate in front of the returned bio + * + * Description: + * Same functionality as bioset_create() except that mempool is not + * created for bio_vecs. Saving some memory for bio_clone_fast() users. + */ +struct bio_set *bioset_create_nobvec(unsigned int pool_size, unsigned int front_pad) +{ + return __bioset_create(pool_size, front_pad, false); +} +EXPORT_SYMBOL(bioset_create_nobvec); + #ifdef CONFIG_BLK_CGROUP /** * bio_associate_current - associate a bio with %current diff --git a/include/linux/bio.h b/include/linux/bio.h index ce6b759..7347f48 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -378,6 +378,7 @@ static inline struct bio *bio_next_split(struct bio *bio, int sectors, } extern struct bio_set *bioset_create(unsigned int, unsigned int); +extern struct bio_set *bioset_create_nobvec(unsigned int, unsigned int); extern void bioset_free(struct bio_set *); extern mempool_t *biovec_create_pool(int pool_entries); -- cgit v0.10.2 From abab13b5c4fd1fec4f9a61622548012d93dc2831 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 7 Oct 2014 08:39:20 -0600 Subject: blk-mq: fix potential hang if rolling wakeup depth is too high We currently divide the queue depth by 4 as our batch wakeup count, but we split the wakeups over BT_WAIT_QUEUES number of wait queues. This defaults to 8. If the product of the resulting batch wake count and BT_WAIT_QUEUES is higher than the device queue depth, we can get into a situation where a task goes to sleep waiting for a request, but never gets woken up. Reported-by: Bart Van Assche Fixes: 4bb659b156996 Cc: stable@kernel.org Signed-off-by: Jens Axboe diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index b087880..146fd02 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -455,8 +455,8 @@ static void bt_update_count(struct blk_mq_bitmap_tags *bt, } bt->wake_cnt = BT_WAIT_BATCH; - if (bt->wake_cnt > depth / 4) - bt->wake_cnt = max(1U, depth / 4); + if (bt->wake_cnt > depth / BT_WAIT_QUEUES) + bt->wake_cnt = max(1U, depth / BT_WAIT_QUEUES); bt->depth = depth; } -- cgit v0.10.2 From 9d8f0bcca6ffa024a822ce4ab1008ab663f06672 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 7 Oct 2014 08:45:21 -0600 Subject: blk-mq: Make bt_clear_tag() easier to read Eliminate a backwards goto statement from bt_clear_tag(). Signed-off-by: Bart Van Assche Signed-off-by: Jens Axboe diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 146fd02..8317175 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -351,15 +351,12 @@ static void bt_clear_tag(struct blk_mq_bitmap_tags *bt, unsigned int tag) return; wait_cnt = atomic_dec_return(&bs->wait_cnt); + if (unlikely(wait_cnt < 0)) + wait_cnt = atomic_inc_return(&bs->wait_cnt); if (wait_cnt == 0) { -wake: atomic_add(bt->wake_cnt, &bs->wait_cnt); bt_index_atomic_inc(&bt->wake_index); wake_up(&bs->wait); - } else if (wait_cnt < 0) { - wait_cnt = atomic_inc_return(&bs->wait_cnt); - if (!wait_cnt) - goto wake; } } -- cgit v0.10.2 From b8839b8c55f3fdd60dc36abcda7e0266aff7985c Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Wed, 8 Oct 2014 18:26:13 -0400 Subject: block: fix alignment_offset math that assumes io_min is a power-of-2 The math in both blk_stack_limits() and queue_limit_alignment_offset() assume that a block device's io_min (aka minimum_io_size) is always a power-of-2. Fix the math such that it works for non-power-of-2 io_min. This issue (of alignment_offset != 0) became apparent when testing dm-thinp with a thinp blocksize that matches a RAID6 stripesize of 1280K. Commit fdfb4c8c1 ("dm thin: set minimum_io_size to pool's data block size") unlocked the potential for alignment_offset != 0 due to the dm-thin-pool's io_min possibly being a non-power-of-2. Signed-off-by: Mike Snitzer Cc: stable@vger.kernel.org Acked-by: Martin K. Petersen Signed-off-by: Jens Axboe diff --git a/block/blk-settings.c b/block/blk-settings.c index f1a1795..aa02247 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -574,7 +574,7 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, bottom = max(b->physical_block_size, b->io_min) + alignment; /* Verify that top and bottom intervals line up */ - if (max(top, bottom) & (min(top, bottom) - 1)) { + if (max(top, bottom) % min(top, bottom)) { t->misaligned = 1; ret = -1; } @@ -619,7 +619,7 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, /* Find lowest common alignment_offset */ t->alignment_offset = lcm(t->alignment_offset, alignment) - & (max(t->physical_block_size, t->io_min) - 1); + % max(t->physical_block_size, t->io_min); /* Verify that new alignment_offset is on a logical block boundary */ if (t->alignment_offset & (t->logical_block_size - 1)) { diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 038b40f..5546392 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1279,10 +1279,9 @@ static inline int queue_alignment_offset(struct request_queue *q) static inline int queue_limit_alignment_offset(struct queue_limits *lim, sector_t sector) { unsigned int granularity = max(lim->physical_block_size, lim->io_min); - unsigned int alignment = (sector << 9) & (granularity - 1); + unsigned int alignment = sector_div(sector, granularity >> 9) << 9; - return (granularity + lim->alignment_offset - alignment) - & (granularity - 1); + return (granularity + lim->alignment_offset - alignment) % granularity; } static inline int bdev_alignment_offset(struct block_device *bdev) -- cgit v0.10.2 From 764f612c6c3c231b9c12cfae7c328ccc9c453258 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 9 Oct 2014 23:17:35 +0800 Subject: blk-merge: don't compute bi_phys_segments from bi_vcnt for cloned bio It isn't correct to figure out req->bi_phys_segments from bio->bi_vcnt if the bio is cloned. Signed-off-by: Ming Lei Tested-by: Jeff Mahoney Signed-off-by: Jens Axboe diff --git a/block/blk-merge.c b/block/blk-merge.c index f71bad3..ba99351 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -97,14 +97,18 @@ void blk_recalc_rq_segments(struct request *rq) void blk_recount_segments(struct request_queue *q, struct bio *bio) { - if (test_bit(QUEUE_FLAG_NO_SG_MERGE, &q->queue_flags) && + bool no_sg_merge = !!test_bit(QUEUE_FLAG_NO_SG_MERGE, + &q->queue_flags); + + if (no_sg_merge && !bio_flagged(bio, BIO_CLONED) && bio->bi_vcnt < queue_max_segments(q)) bio->bi_phys_segments = bio->bi_vcnt; else { struct bio *nxt = bio->bi_next; bio->bi_next = NULL; - bio->bi_phys_segments = __blk_recalc_rq_segments(q, bio, false); + bio->bi_phys_segments = __blk_recalc_rq_segments(q, bio, + no_sg_merge); bio->bi_next = nxt; } -- cgit v0.10.2 From ef3ecb66bcd6b2076dc8782e1315cf2807b73c0c Mon Sep 17 00:00:00 2001 From: Robert Elliott Date: Wed, 27 Aug 2014 10:50:31 -0500 Subject: block: make blk_update_request print prefix match ratelimited prefix In blk_update_request, change the printk_ratelimited prefix from end_request to blk_update_request so it matches the name printed if rate limiting occurs. Old: [10234.933106] blk_update_request: 174 callbacks suppressed [10234.934940] end_request: critical target error, dev sdr, sector 16 [10234.949788] end_request: critical target error, dev sdr, sector 16 New: [16863.445173] blk_update_request: 398 callbacks suppressed [16863.447029] blk_update_request: critical target error, dev sdr, sector 1442066176 [16863.449383] blk_update_request: critical target error, dev sdr, sector 802802888 [16863.451680] blk_update_request: critical target error, dev sdr, sector 1609535456 Signed-off-by: Robert Elliott Reviewed-by: Webb Scales Signed-off-by: Jens Axboe diff --git a/block/blk-core.c b/block/blk-core.c index ffcb47a..ecc124e 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2444,8 +2444,8 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes) error_type = "I/O"; break; } - printk_ratelimited(KERN_ERR "end_request: %s error, dev %s, sector %llu\n", - error_type, req->rq_disk ? + printk_ratelimited(KERN_ERR "%s: %s error, dev %s, sector %llu\n", + __func__, error_type, req->rq_disk ? req->rq_disk->disk_name : "?", (unsigned long long)blk_rq_pos(req)); -- cgit v0.10.2 From 7b2b10e0e2c65ebc11314e1af9924d0824ec1562 Mon Sep 17 00:00:00 2001 From: Robert Elliott Date: Wed, 27 Aug 2014 10:50:36 -0500 Subject: block: include func name in __get_request prints In __get_request calls to printk_ratelimited, include the function name so the callbacks suppressed message matches the messages that are printed, and add "dev" before the device name so it matches other block layer messages. Signed-off-by: Robert Elliott Reviewed-by: Webb Scales Signed-off-by: Jens Axboe diff --git a/block/blk-core.c b/block/blk-core.c index ecc124e..d6ec7db 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1060,8 +1060,8 @@ fail_elvpriv: * shouldn't stall IO. Treat this request as !elvpriv. This will * disturb iosched and blkcg but weird is bettern than dead. */ - printk_ratelimited(KERN_WARNING "%s: request aux data allocation failed, iosched may be disturbed\n", - dev_name(q->backing_dev_info.dev)); + printk_ratelimited(KERN_WARNING "%s: dev %s: request aux data allocation failed, iosched may be disturbed\n", + __func__, dev_name(q->backing_dev_info.dev)); rq->cmd_flags &= ~REQ_ELVPRIV; rq->elv.icq = NULL; -- cgit v0.10.2 From b65c7491cb865577e83e6b7fae2aa2f4ea457c38 Mon Sep 17 00:00:00 2001 From: Gu Zheng Date: Mon, 13 Oct 2014 14:07:27 -0700 Subject: bio-integrity: remove the needless fail handle of bip_slab creating bip_slab is created with SLAB_PANIC, so the fail handler is unneeded. Signed-off-by: Gu Zheng Signed-off-by: Andrew Morton Signed-off-by: Jens Axboe diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 8e05484..0984232 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -513,6 +513,4 @@ void __init bio_integrity_init(void) sizeof(struct bio_integrity_payload) + sizeof(struct bio_vec) * BIP_INLINE_VECS, 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); - if (!bip_slab) - panic("Failed to create slab\n"); } -- cgit v0.10.2 From a86073e48ae85c9b50127facb0cc45bbd35972a1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 13 Oct 2014 15:41:54 -0600 Subject: blk-mq: allocate cpumask on the home node All other allocs are done on the specific node, somehow the cpumask for hw queue runs was missed. Fix that by using zalloc_cpumask_var_node() in blk_mq_init_queue(). Signed-off-by: Jens Axboe diff --git a/block/blk-mq.c b/block/blk-mq.c index 4e7a314..79aa11b 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1802,7 +1802,8 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) if (!hctxs[i]) goto err_hctxs; - if (!zalloc_cpumask_var(&hctxs[i]->cpumask, GFP_KERNEL)) + if (!zalloc_cpumask_var_node(&hctxs[i]->cpumask, GFP_KERNEL, + node)) goto err_hctxs; atomic_set(&hctxs[i]->nr_active, 0); -- cgit v0.10.2 From e19a8a0ad2d255316830ead05b59c5a704434cbb Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Tue, 14 Oct 2014 09:00:44 -0600 Subject: block: Remove REQ_KERNEL REQ_KERNEL is no longer used. Remove it and drop the redundant uio argument to nfs_file_direct_{read,write}. Signed-off-by: Martin K. Petersen Cc: Christoph Hellwig Reported-by: Dan Carpenter Signed-off-by: Jens Axboe diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 65ef6e0..891f7dd 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -222,11 +222,9 @@ ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, loff_t #else VM_BUG_ON(iocb->ki_nbytes != PAGE_SIZE); - if (rw == READ || rw == KERNEL_READ) - return nfs_file_direct_read(iocb, iter, pos, - rw == READ ? true : false); - return nfs_file_direct_write(iocb, iter, pos, - rw == WRITE ? true : false); + if (rw == READ) + return nfs_file_direct_read(iocb, iter, pos); + return nfs_file_direct_write(iocb, iter, pos); #endif /* CONFIG_NFS_SWAP */ } @@ -512,7 +510,7 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, * cache. */ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter, - loff_t pos, bool uio) + loff_t pos) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; @@ -893,7 +891,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, * is no atomic O_APPEND write facility in the NFS protocol. */ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter, - loff_t pos, bool uio) + loff_t pos) { ssize_t result = -EINVAL; struct file *file = iocb->ki_filp; diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 524dd80..3b42cb8 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -171,7 +171,7 @@ nfs_file_read(struct kiocb *iocb, struct iov_iter *to) ssize_t result; if (iocb->ki_filp->f_flags & O_DIRECT) - return nfs_file_direct_read(iocb, to, iocb->ki_pos, true); + return nfs_file_direct_read(iocb, to, iocb->ki_pos); dprintk("NFS: read(%pD2, %zu@%lu)\n", iocb->ki_filp, @@ -648,7 +648,7 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) return result; if (file->f_flags & O_DIRECT) - return nfs_file_direct_write(iocb, from, pos, true); + return nfs_file_direct_write(iocb, from, pos); dprintk("NFS: write(%pD2, %zu@%Ld)\n", file, count, (long long) pos); diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 38bc008..445d592 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -187,7 +187,6 @@ enum rq_flag_bits { __REQ_FLUSH_SEQ, /* request for flush sequence */ __REQ_IO_STAT, /* account I/O stat */ __REQ_MIXED_MERGE, /* merge of different types, fail separately */ - __REQ_KERNEL, /* direct IO to kernel pages */ __REQ_PM, /* runtime pm request */ __REQ_HASHED, /* on IO scheduler merge hash */ __REQ_MQ_INFLIGHT, /* track inflight for MQ */ @@ -241,7 +240,6 @@ enum rq_flag_bits { #define REQ_IO_STAT (1ULL << __REQ_IO_STAT) #define REQ_MIXED_MERGE (1ULL << __REQ_MIXED_MERGE) #define REQ_SECURE (1ULL << __REQ_SECURE) -#define REQ_KERNEL (1ULL << __REQ_KERNEL) #define REQ_PM (1ULL << __REQ_PM) #define REQ_HASHED (1ULL << __REQ_HASHED) #define REQ_MQ_INFLIGHT (1ULL << __REQ_MQ_INFLIGHT) diff --git a/include/linux/fs.h b/include/linux/fs.h index 94187721..9b5bc1c 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -192,8 +192,6 @@ typedef void (dio_iodone_t)(struct kiocb *iocb, loff_t offset, #define READ 0 #define WRITE RW_MASK #define READA RWA_MASK -#define KERNEL_READ (READ|REQ_KERNEL) -#define KERNEL_WRITE (WRITE|REQ_KERNEL) #define READ_SYNC (READ | REQ_SYNC) #define WRITE_SYNC (WRITE | REQ_SYNC | REQ_NOIDLE) diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 5180a7e..e6e1c4e 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -464,10 +464,10 @@ extern int nfs3_removexattr (struct dentry *, const char *name); extern ssize_t nfs_direct_IO(int, struct kiocb *, struct iov_iter *, loff_t); extern ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter, - loff_t pos, bool uio); + loff_t pos); extern ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter, - loff_t pos, bool uio); + loff_t pos); /* * linux/fs/nfs/dir.c -- cgit v0.10.2