From c761d96b079e99d106fa4064e730ef7d0f312f9d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 2 Jan 2015 15:05:12 -0700 Subject: blk-mq: export blk_mq_freeze_queue() Commit b4c6a028774b exported the start and unfreeze, but we need the regular blk_mq_freeze_queue() for the loop conversion. Signed-off-by: Jens Axboe diff --git a/block/blk-mq.c b/block/blk-mq.c index 1a41d7a..a7d4a98 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -136,6 +136,7 @@ void blk_mq_freeze_queue(struct request_queue *q) blk_mq_freeze_queue_start(q); blk_mq_freeze_queue_wait(q); } +EXPORT_SYMBOL_GPL(blk_mq_freeze_queue); void blk_mq_unfreeze_queue(struct request_queue *q) { diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 3b43f50..5b6500c 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -212,6 +212,7 @@ void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async); void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs); void blk_mq_tag_busy_iter(struct blk_mq_hw_ctx *hctx, busy_iter_fn *fn, void *priv); +void blk_mq_freeze_queue(struct request_queue *q); void blk_mq_unfreeze_queue(struct request_queue *q); void blk_mq_freeze_queue_start(struct request_queue *q); -- cgit v0.10.2 From dd22f551ac0ad366f92f601835f6623b83adc331 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 7 Jan 2015 18:05:34 +0200 Subject: block: Change direct_access calling convention In order to support accesses to larger chunks of memory, pass in a 'size' parameter (counted in bytes), and return the amount available at that address. Add a new helper function, bdev_direct_access(), to handle common functionality including partition handling, checking the length requested is positive, checking for the sector being page-aligned, and checking the length of the request does not pass the end of the partition. Signed-off-by: Matthew Wilcox Reviewed-by: Jan Kara Reviewed-by: Boaz Harrosh Signed-off-by: Jens Axboe diff --git a/Documentation/filesystems/xip.txt b/Documentation/filesystems/xip.txt index 0466ee5..b774729 100644 --- a/Documentation/filesystems/xip.txt +++ b/Documentation/filesystems/xip.txt @@ -28,12 +28,15 @@ Implementation Execute-in-place is implemented in three steps: block device operation, address space operation, and file operations. -A block device operation named direct_access is used to retrieve a -reference (pointer) to a block on-disk. The reference is supposed to be -cpu-addressable, physical address and remain valid until the release operation -is performed. A struct block_device reference is used to address the device, -and a sector_t argument is used to identify the individual block. As an -alternative, memory technology devices can be used for this. +A block device operation named direct_access is used to translate the +block device sector number to a page frame number (pfn) that identifies +the physical page for the memory. It also returns a kernel virtual +address that can be used to access the memory. + +The direct_access method takes a 'size' parameter that indicates the +number of bytes being requested. The function should return the number +of bytes that can be contiguously accessed at that offset. It may also +return a negative errno if an error occurs. 
The block device operation is optional, these block devices support it as of today: diff --git a/arch/powerpc/sysdev/axonram.c b/arch/powerpc/sysdev/axonram.c index f532c92..20f8afe 100644 --- a/arch/powerpc/sysdev/axonram.c +++ b/arch/powerpc/sysdev/axonram.c @@ -139,26 +139,17 @@ axon_ram_make_request(struct request_queue *queue, struct bio *bio) * axon_ram_direct_access - direct_access() method for block device * @device, @sector, @data: see block_device_operations method */ -static int +static long axon_ram_direct_access(struct block_device *device, sector_t sector, - void **kaddr, unsigned long *pfn) + void **kaddr, unsigned long *pfn, long size) { struct axon_ram_bank *bank = device->bd_disk->private_data; - loff_t offset; - - offset = sector; - if (device->bd_part != NULL) - offset += device->bd_part->start_sect; - offset <<= AXON_RAM_SECTOR_SHIFT; - if (offset >= bank->size) { - dev_err(&bank->device->dev, "Access outside of address space\n"); - return -ERANGE; - } + loff_t offset = (loff_t)sector << AXON_RAM_SECTOR_SHIFT; *kaddr = (void *)(bank->ph_addr + offset); *pfn = virt_to_phys(kaddr) >> PAGE_SHIFT; - return 0; + return bank->size - offset; } static const struct block_device_operations axon_ram_devops = { diff --git a/drivers/block/brd.c b/drivers/block/brd.c index 3598110..89e90ec 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -370,25 +370,25 @@ static int brd_rw_page(struct block_device *bdev, sector_t sector, } #ifdef CONFIG_BLK_DEV_XIP -static int brd_direct_access(struct block_device *bdev, sector_t sector, - void **kaddr, unsigned long *pfn) +static long brd_direct_access(struct block_device *bdev, sector_t sector, + void **kaddr, unsigned long *pfn, long size) { struct brd_device *brd = bdev->bd_disk->private_data; struct page *page; if (!brd) return -ENODEV; - if (sector & (PAGE_SECTORS-1)) - return -EINVAL; - if (sector + PAGE_SECTORS > get_capacity(bdev->bd_disk)) - return -ERANGE; page = brd_insert_page(brd, sector); if (!page) return -ENOSPC; *kaddr = page_address(page); *pfn = page_to_pfn(page); - return 0; + /* + * TODO: If size > PAGE_SIZE, we could look to see if the next page in + * the file happens to be mapped to the next page of physical RAM. 
+ */ + return PAGE_SIZE; } #endif diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c index b550c8c..31d6884 100644 --- a/drivers/s390/block/dcssblk.c +++ b/drivers/s390/block/dcssblk.c @@ -28,8 +28,8 @@ static int dcssblk_open(struct block_device *bdev, fmode_t mode); static void dcssblk_release(struct gendisk *disk, fmode_t mode); static void dcssblk_make_request(struct request_queue *q, struct bio *bio); -static int dcssblk_direct_access(struct block_device *bdev, sector_t secnum, - void **kaddr, unsigned long *pfn); +static long dcssblk_direct_access(struct block_device *bdev, sector_t secnum, + void **kaddr, unsigned long *pfn, long size); static char dcssblk_segments[DCSSBLK_PARM_LEN] = "\0"; @@ -866,25 +866,22 @@ fail: bio_io_error(bio); } -static int +static long dcssblk_direct_access (struct block_device *bdev, sector_t secnum, - void **kaddr, unsigned long *pfn) + void **kaddr, unsigned long *pfn, long size) { struct dcssblk_dev_info *dev_info; - unsigned long pgoff; + unsigned long offset, dev_sz; dev_info = bdev->bd_disk->private_data; if (!dev_info) return -ENODEV; - if (secnum % (PAGE_SIZE/512)) - return -EINVAL; - pgoff = secnum / (PAGE_SIZE / 512); - if ((pgoff+1)*PAGE_SIZE-1 > dev_info->end - dev_info->start) - return -ERANGE; - *kaddr = (void *) (dev_info->start+pgoff*PAGE_SIZE); + dev_sz = dev_info->end - dev_info->start; + offset = secnum * 512; + *kaddr = (void *) (dev_info->start + offset); *pfn = virt_to_phys(*kaddr) >> PAGE_SHIFT; - return 0; + return dev_sz - offset; } static void diff --git a/fs/block_dev.c b/fs/block_dev.c index b48c41b..f314c2c 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -429,6 +429,46 @@ int bdev_write_page(struct block_device *bdev, sector_t sector, } EXPORT_SYMBOL_GPL(bdev_write_page); +/** + * bdev_direct_access() - Get the address for directly-accessibly memory + * @bdev: The device containing the memory + * @sector: The offset within the device + * @addr: Where to put the address of the memory + * @pfn: The Page Frame Number for the memory + * @size: The number of bytes requested + * + * If a block device is made up of directly addressable memory, this function + * will tell the caller the PFN and the address of the memory. The address + * may be directly dereferenced within the kernel without the need to call + * ioremap(), kmap() or similar. The PFN is suitable for inserting into + * page tables. + * + * Return: negative errno if an error occurs, otherwise the number of bytes + * accessible at this address. 
+ */ +long bdev_direct_access(struct block_device *bdev, sector_t sector, + void **addr, unsigned long *pfn, long size) +{ + long avail; + const struct block_device_operations *ops = bdev->bd_disk->fops; + + if (size < 0) + return size; + if (!ops->direct_access) + return -EOPNOTSUPP; + if ((sector + DIV_ROUND_UP(size, 512)) > + part_nr_sects_read(bdev->bd_part)) + return -ERANGE; + sector += get_start_sect(bdev); + if (sector % (PAGE_SIZE / 512)) + return -EINVAL; + avail = ops->direct_access(bdev, sector, addr, pfn, size); + if (!avail) + return -ERANGE; + return min(avail, size); +} +EXPORT_SYMBOL_GPL(bdev_direct_access); + /* * pseudo-fs */ diff --git a/fs/ext2/xip.c b/fs/ext2/xip.c index e98171a..bbc5fec 100644 --- a/fs/ext2/xip.c +++ b/fs/ext2/xip.c @@ -13,18 +13,12 @@ #include "ext2.h" #include "xip.h" -static inline int -__inode_direct_access(struct inode *inode, sector_t block, - void **kaddr, unsigned long *pfn) +static inline long __inode_direct_access(struct inode *inode, sector_t block, + void **kaddr, unsigned long *pfn, long size) { struct block_device *bdev = inode->i_sb->s_bdev; - const struct block_device_operations *ops = bdev->bd_disk->fops; - sector_t sector; - - sector = block * (PAGE_SIZE / 512); /* ext2 block to bdev sector */ - - BUG_ON(!ops->direct_access); - return ops->direct_access(bdev, sector, kaddr, pfn); + sector_t sector = block * (PAGE_SIZE / 512); + return bdev_direct_access(bdev, sector, kaddr, pfn, size); } static inline int @@ -53,12 +47,13 @@ ext2_clear_xip_target(struct inode *inode, sector_t block) { void *kaddr; unsigned long pfn; - int rc; + long size; - rc = __inode_direct_access(inode, block, &kaddr, &pfn); - if (!rc) - clear_page(kaddr); - return rc; + size = __inode_direct_access(inode, block, &kaddr, &pfn, PAGE_SIZE); + if (size < 0) + return size; + clear_page(kaddr); + return 0; } void ext2_xip_verify_sb(struct super_block *sb) @@ -77,7 +72,7 @@ void ext2_xip_verify_sb(struct super_block *sb) int ext2_get_xip_mem(struct address_space *mapping, pgoff_t pgoff, int create, void **kmem, unsigned long *pfn) { - int rc; + long rc; sector_t block; /* first, retrieve the sector number */ @@ -86,6 +81,6 @@ int ext2_get_xip_mem(struct address_space *mapping, pgoff_t pgoff, int create, return rc; /* retrieve address of the target data */ - rc = __inode_direct_access(mapping->host, block, kmem, pfn); - return rc; + rc = __inode_direct_access(mapping->host, block, kmem, pfn, PAGE_SIZE); + return (rc < 0) ? 
rc : 0; } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 92f4b4b..e9086be 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1601,8 +1601,8 @@ struct block_device_operations { int (*rw_page)(struct block_device *, sector_t, struct page *, int rw); int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); - int (*direct_access) (struct block_device *, sector_t, - void **, unsigned long *); + long (*direct_access)(struct block_device *, sector_t, + void **, unsigned long *pfn, long size); unsigned int (*check_events) (struct gendisk *disk, unsigned int clearing); /* ->media_changed() is DEPRECATED, use ->check_events() instead */ @@ -1620,6 +1620,8 @@ extern int __blkdev_driver_ioctl(struct block_device *, fmode_t, unsigned int, extern int bdev_read_page(struct block_device *, sector_t, struct page *); extern int bdev_write_page(struct block_device *, sector_t, struct page *, struct writeback_control *); +extern long bdev_direct_access(struct block_device *, sector_t, void **addr, + unsigned long *pfn, long size); #else /* CONFIG_BLOCK */ struct block_device; -- cgit v0.10.2 From 0bf364984c4a799f75414de009ecd579d6d35a21 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 14 Jan 2015 08:49:55 -0700 Subject: blk-mq: fix false negative out-of-tags condition The blk-mq tagging tries to maintain some locality between CPUs and the tags issued. The tags are split into groups of words, and the words may not be fully populated. When searching for a new free tag, blk-mq may look at partial words, hence it passes in an offset/size to find_next_zero_bit(). However, it does that wrong, the size must always be the full length of the number of tags in that word, otherwise we'll potentially miss some near the end. Another issue is when __bt_get() goes from one word set to the next. It bumps the index, but not the last_tag associated with the previous index. Bump that to be in the range of the new word. Finally, clean up __bt_get() and __bt_get_word() a bit and get rid of the goto in there, and the unnecessary 'wrap' variable. Signed-off-by: Jens Axboe diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 60c9d4a..d4daee3 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -142,29 +142,30 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx, static int __bt_get_word(struct blk_align_bitmap *bm, unsigned int last_tag) { - int tag, org_last_tag, end; - bool wrap = last_tag != 0; + int tag, org_last_tag = last_tag; - org_last_tag = last_tag; - end = bm->depth; - do { -restart: - tag = find_next_zero_bit(&bm->word, end, last_tag); - if (unlikely(tag >= end)) { + while (1) { + tag = find_next_zero_bit(&bm->word, bm->depth, last_tag); + if (unlikely(tag >= bm->depth)) { /* - * We started with an offset, start from 0 to + * We started with an offset, and we didn't reset the + * offset to 0 in a failure case, so start from 0 to * exhaust the map. 
*/ - if (wrap) { - wrap = false; - end = org_last_tag; - last_tag = 0; - goto restart; + if (org_last_tag && last_tag) { + last_tag = org_last_tag = 0; + continue; } return -1; } + + if (!test_and_set_bit(tag, &bm->word)) + break; + last_tag = tag + 1; - } while (test_and_set_bit(tag, &bm->word)); + if (last_tag >= bm->depth - 1) + last_tag = 0; + } return tag; } @@ -199,9 +200,17 @@ static int __bt_get(struct blk_mq_hw_ctx *hctx, struct blk_mq_bitmap_tags *bt, goto done; } - last_tag = 0; - if (++index >= bt->map_nr) + /* + * Jump to next index, and reset the last tag to be the + * first tag of that index + */ + index++; + last_tag = (index << bt->bits_per_word); + + if (index >= bt->map_nr) { index = 0; + last_tag = 0; + } } *tag_cache = 0; -- cgit v0.10.2 From c6ce194325cef342313e3d27620411ce90a89c50 Mon Sep 17 00:00:00 2001 From: Jeff Moyer Date: Mon, 12 Jan 2015 15:21:01 -0500 Subject: cfq-iosched: fix incorrect filing of rt async cfqq Hi, If you can manage to submit an async write as the first async I/O from the context of a process with realtime scheduling priority, then a cfq_queue is allocated, but filed into the wrong async_cfqq bucket. It ends up in the best effort array, but actually has realtime I/O scheduling priority set in cfqq->ioprio. The reason is that cfq_get_queue assumes the default scheduling class and priority when there is no information present (i.e. when the async cfqq is created): static struct cfq_queue * cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic, struct bio *bio, gfp_t gfp_mask) { const int ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio); const int ioprio = IOPRIO_PRIO_DATA(cic->ioprio); cic->ioprio starts out as 0, which is "invalid". So, class of 0 (IOPRIO_CLASS_NONE) is passed to cfq_async_queue_prio like so: async_cfqq = cfq_async_queue_prio(cfqd, ioprio_class, ioprio); static struct cfq_queue ** cfq_async_queue_prio(struct cfq_data *cfqd, int ioprio_class, int ioprio) { switch (ioprio_class) { case IOPRIO_CLASS_RT: return &cfqd->async_cfqq[0][ioprio]; case IOPRIO_CLASS_NONE: ioprio = IOPRIO_NORM; /* fall through */ case IOPRIO_CLASS_BE: return &cfqd->async_cfqq[1][ioprio]; case IOPRIO_CLASS_IDLE: return &cfqd->async_idle_cfqq; default: BUG(); } } Here, instead of returning a class mapped from the process' scheduling priority, we get back the bucket associated with IOPRIO_CLASS_BE. Now, there is no queue allocated there yet, so we create it: cfqq = cfq_find_alloc_queue(cfqd, is_sync, cic, bio, gfp_mask); That function ends up doing this: cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync); cfq_init_prio_data(cfqq, cic); cfq_init_cfqq marks the priority as having changed. Then, cfq_init_prio data does this: ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio); switch (ioprio_class) { default: printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class); case IOPRIO_CLASS_NONE: /* * no prio set, inherit CPU scheduling settings */ cfqq->ioprio = task_nice_ioprio(tsk); cfqq->ioprio_class = task_nice_ioclass(tsk); break; So we basically have two code paths that treat IOPRIO_CLASS_NONE differently, which results in an RT async cfqq filed into a best effort bucket. Attached is a patch which fixes the problem. I'm not sure how to make it cleaner. Suggestions would be welcome. 
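For context, an I/O priority packs the scheduling class into the top bits and the per-class data into the low bits, so a cic->ioprio of 0 decodes to class IOPRIO_CLASS_NONE. A minimal, self-contained sketch of that decoding (constants mirror include/linux/ioprio.h and are shown here purely for illustration, not as part of the patch):

	#include <stdio.h>

	#define IOPRIO_CLASS_SHIFT	13
	#define IOPRIO_CLASS_NONE	0
	#define IOPRIO_CLASS_RT		1
	#define IOPRIO_PRIO_VALUE(class, data)	(((class) << IOPRIO_CLASS_SHIFT) | (data))
	#define IOPRIO_PRIO_CLASS(mask)		((mask) >> IOPRIO_CLASS_SHIFT)
	#define IOPRIO_PRIO_DATA(mask)		((mask) & ((1 << IOPRIO_CLASS_SHIFT) - 1))

	int main(void)
	{
		int unset = 0;					/* freshly created cfq_io_cq */
		int rt = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_RT, 4);	/* explicitly set RT, prio 4 */

		/* unset decodes to class 0 (IOPRIO_CLASS_NONE); rt decodes to class 1 */
		printf("unset: class %d, data %d\n",
		       IOPRIO_PRIO_CLASS(unset), IOPRIO_PRIO_DATA(unset));
		printf("rt:    class %d, data %d\n",
		       IOPRIO_PRIO_CLASS(rt), IOPRIO_PRIO_DATA(rt));
		return 0;
	}

Both cfq_async_queue_prio() and cfq_init_prio_data() therefore see class 0 for an unset priority; the fix below makes the async bucket selection fall back to the task's CPU scheduling class in that case, matching what cfq_init_prio_data() already does.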
Signed-off-by: Jeff Moyer Tested-by: Hidehiro Kawai Cc: stable@kernel.org Signed-off-by: Jens Axboe diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 6f2751d..b9abdca 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -3656,12 +3656,17 @@ static struct cfq_queue * cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic, struct bio *bio, gfp_t gfp_mask) { - const int ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio); - const int ioprio = IOPRIO_PRIO_DATA(cic->ioprio); + int ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio); + int ioprio = IOPRIO_PRIO_DATA(cic->ioprio); struct cfq_queue **async_cfqq = NULL; struct cfq_queue *cfqq = NULL; if (!is_sync) { + if (!ioprio_valid(cic->ioprio)) { + struct task_struct *tsk = current; + ioprio = task_nice_ioprio(tsk); + ioprio_class = task_nice_ioclass(tsk); + } async_cfqq = cfq_async_queue_prio(cfqd, ioprio_class, ioprio); cfqq = *async_cfqq; } -- cgit v0.10.2 From d93ba7a5a97c9f315bacdcdb8de4e5f368e7b396 Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Tue, 20 Jan 2015 20:06:30 -0500 Subject: block: Add discard flag to blkdev_issue_zeroout() function blkdev_issue_discard() will zero a given block range. This is done by way of explicit writing, thus provisioning or allocating the blocks on disk. There are use cases where the desired behavior is to zero the blocks but unprovision them if possible. The blocks must deterministically contain zeroes when they are subsequently read back. This patch adds a flag to blkdev_issue_zeroout() that provides this variant. If the discard flag is set and a block device guarantees discard_zeroes_data we will use REQ_DISCARD to clear the block range. If the device does not support discard_zeroes_data or if the discard request fails we will fall back to first REQ_WRITE_SAME and then a regular REQ_WRITE. Also update the callers of blkdev_issue_zero() to reflect the new flag and make sb_issue_zeroout() prefer the discard approach. Signed-off-by: Martin K. Petersen Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe diff --git a/block/blk-lib.c b/block/blk-lib.c index 8411be3..715e948 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -283,23 +283,45 @@ static int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, * @sector: start sector * @nr_sects: number of sectors to write * @gfp_mask: memory allocation flags (for bio_alloc) + * @discard: whether to discard the block range * * Description: - * Generate and issue number of bios with zerofiled pages. + + * Zero-fill a block range. If the discard flag is set and the block + * device guarantees that subsequent READ operations to the block range + * in question will return zeroes, the blocks will be discarded. Should + * the discard request fail, if the discard flag is not set, or if + * discard_zeroes_data is not supported, this function will resort to + * zeroing the blocks manually, thus provisioning (allocating, + * anchoring) them. If the block device supports the WRITE SAME command + * blkdev_issue_zeroout() will use it to optimize the process of + * clearing the block range. Otherwise the zeroing will be performed + * using regular WRITE calls. 
*/ int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, - sector_t nr_sects, gfp_t gfp_mask) + sector_t nr_sects, gfp_t gfp_mask, bool discard) { + struct request_queue *q = bdev_get_queue(bdev); + unsigned char bdn[BDEVNAME_SIZE]; + + if (discard && blk_queue_discard(q) && q->limits.discard_zeroes_data) { + + if (!blkdev_issue_discard(bdev, sector, nr_sects, gfp_mask, 0)) + return 0; + + bdevname(bdev, bdn); + pr_warn("%s: DISCARD failed. Manually zeroing.\n", bdn); + } + if (bdev_write_same(bdev)) { - unsigned char bdn[BDEVNAME_SIZE]; if (!blkdev_issue_write_same(bdev, sector, nr_sects, gfp_mask, ZERO_PAGE(0))) return 0; bdevname(bdev, bdn); - pr_err("%s: WRITE SAME failed. Manually zeroing.\n", bdn); + pr_warn("%s: WRITE SAME failed. Manually zeroing.\n", bdn); } return __blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask); diff --git a/block/ioctl.c b/block/ioctl.c index 6c7bf90..7d8befd 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -198,7 +198,7 @@ static int blk_ioctl_zeroout(struct block_device *bdev, uint64_t start, if (start + len > (i_size_read(bdev->bd_inode) >> 9)) return -EINVAL; - return blkdev_issue_zeroout(bdev, start, len, GFP_KERNEL); + return blkdev_issue_zeroout(bdev, start, len, GFP_KERNEL, false); } static int put_ushort(unsigned long arg, unsigned short val) diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index d169b4a..cee2035 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -1388,7 +1388,7 @@ int drbd_submit_peer_request(struct drbd_device *device, list_add_tail(&peer_req->w.list, &device->active_ee); spin_unlock_irq(&device->resource->req_lock); if (blkdev_issue_zeroout(device->ldev->backing_bdev, - sector, data_size >> 9, GFP_NOIO)) + sector, data_size >> 9, GFP_NOIO, false)) peer_req->flags |= EE_WAS_ERROR; drbd_endio_write_sec_final(peer_req); return 0; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index e9086be..4c4b732 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1162,7 +1162,7 @@ extern int blkdev_issue_discard(struct block_device *bdev, sector_t sector, extern int blkdev_issue_write_same(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, struct page *page); extern int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, - sector_t nr_sects, gfp_t gfp_mask); + sector_t nr_sects, gfp_t gfp_mask, bool discard); static inline int sb_issue_discard(struct super_block *sb, sector_t block, sector_t nr_blocks, gfp_t gfp_mask, unsigned long flags) { @@ -1176,7 +1176,7 @@ static inline int sb_issue_zeroout(struct super_block *sb, sector_t block, return blkdev_issue_zeroout(sb->s_bdev, block << (sb->s_blocksize_bits - 9), nr_blocks << (sb->s_blocksize_bits - 9), - gfp_mask); + gfp_mask, true); } extern int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm); -- cgit v0.10.2 From bb5c3cdda37aad22996d6da2addd58cadc0436c0 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Thu, 22 Jan 2015 14:39:20 +0200 Subject: block: Remove annoying "unknown partition table" message As Christoph put it: Can we just get rid of the warnings? It's fairly annoying as devices without partitions are perfectly fine and very useful. Me too I see this message every VM boot for ages on all my devices. Would love to just remove it. For me a partition-table is only needed for a booting BIOS, grub, and stuff. 
CC: Christoph Hellwig Signed-off-by: Boaz Harrosh Signed-off-by: Jens Axboe diff --git a/block/partitions/check.c b/block/partitions/check.c index 9ac1df7..16118d1 100644 --- a/block/partitions/check.c +++ b/block/partitions/check.c @@ -184,12 +184,12 @@ check_partition(struct gendisk *hd, struct block_device *bdev) if (err) /* The partition is unrecognized. So report I/O errors if there were any */ res = err; - if (!res) - strlcat(state->pp_buf, " unknown partition table\n", PAGE_SIZE); - else if (warn_no_part) - strlcat(state->pp_buf, " unable to read partition table\n", PAGE_SIZE); - - printk(KERN_INFO "%s", state->pp_buf); + if (res) { + if (warn_no_part) + strlcat(state->pp_buf, + " unable to read partition table\n", PAGE_SIZE); + printk(KERN_INFO "%s", state->pp_buf); + } free_page((unsigned long)state->pp_buf); free_partitions(state); -- cgit v0.10.2 From ee1b6f7aff94019c09e73837054979063f722046 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 15 Jan 2015 17:32:25 -0800 Subject: block: support different tag allocation policy The libata tag allocation is using a round-robin policy. Next patch will make libata use block generic tag allocation, so let's add a policy to tag allocation. Currently two policies: FIFO (default) and round-robin. Cc: Jens Axboe Cc: Tejun Heo Cc: Christoph Hellwig Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe diff --git a/block/blk-tag.c b/block/blk-tag.c index a185b86..f0344e6 100644 --- a/block/blk-tag.c +++ b/block/blk-tag.c @@ -119,7 +119,7 @@ fail: } static struct blk_queue_tag *__blk_queue_init_tags(struct request_queue *q, - int depth) + int depth, int alloc_policy) { struct blk_queue_tag *tags; @@ -131,6 +131,8 @@ static struct blk_queue_tag *__blk_queue_init_tags(struct request_queue *q, goto fail; atomic_set(&tags->refcnt, 1); + tags->alloc_policy = alloc_policy; + tags->next_tag = 0; return tags; fail: kfree(tags); @@ -140,10 +142,11 @@ fail: /** * blk_init_tags - initialize the tag info for an external tag map * @depth: the maximum queue depth supported + * @alloc_policy: tag allocation policy **/ -struct blk_queue_tag *blk_init_tags(int depth) +struct blk_queue_tag *blk_init_tags(int depth, int alloc_policy) { - return __blk_queue_init_tags(NULL, depth); + return __blk_queue_init_tags(NULL, depth, alloc_policy); } EXPORT_SYMBOL(blk_init_tags); @@ -152,19 +155,20 @@ EXPORT_SYMBOL(blk_init_tags); * @q: the request queue for the device * @depth: the maximum queue depth supported * @tags: the tag to use + * @alloc_policy: tag allocation policy * * Queue lock must be held here if the function is called to resize an * existing map. 
**/ int blk_queue_init_tags(struct request_queue *q, int depth, - struct blk_queue_tag *tags) + struct blk_queue_tag *tags, int alloc_policy) { int rc; BUG_ON(tags && q->queue_tags && tags != q->queue_tags); if (!tags && !q->queue_tags) { - tags = __blk_queue_init_tags(q, depth); + tags = __blk_queue_init_tags(q, depth, alloc_policy); if (!tags) return -ENOMEM; @@ -344,9 +348,21 @@ int blk_queue_start_tag(struct request_queue *q, struct request *rq) } do { - tag = find_first_zero_bit(bqt->tag_map, max_depth); - if (tag >= max_depth) - return 1; + if (bqt->alloc_policy == BLK_TAG_ALLOC_FIFO) { + tag = find_first_zero_bit(bqt->tag_map, max_depth); + if (tag >= max_depth) + return 1; + } else { + int start = bqt->next_tag; + int size = min_t(int, bqt->max_depth, max_depth + start); + tag = find_next_zero_bit(bqt->tag_map, size, start); + if (tag >= size && start + size > bqt->max_depth) { + size = start + size - bqt->max_depth; + tag = find_first_zero_bit(bqt->tag_map, size); + } + if (tag >= size) + return 1; + } } while (test_and_set_bit_lock(tag, bqt->tag_map)); /* @@ -354,6 +370,7 @@ int blk_queue_start_tag(struct request_queue *q, struct request *rq) * See blk_queue_end_tag for details. */ + bqt->next_tag = (tag + 1) % bqt->max_depth; rq->cmd_flags |= REQ_QUEUED; rq->tag = tag; bqt->tag_index[tag] = rq; diff --git a/drivers/block/osdblk.c b/drivers/block/osdblk.c index 79aa179..e229425 100644 --- a/drivers/block/osdblk.c +++ b/drivers/block/osdblk.c @@ -423,7 +423,7 @@ static int osdblk_init_disk(struct osdblk_device *osdev) } /* switch queue to TCQ mode; allocate tag map */ - rc = blk_queue_init_tags(q, OSDBLK_MAX_REQ, NULL); + rc = blk_queue_init_tags(q, OSDBLK_MAX_REQ, NULL, BLK_TAG_ALLOC_FIFO); if (rc) { blk_cleanup_queue(q); put_disk(disk); diff --git a/drivers/scsi/scsi_scan.c b/drivers/scsi/scsi_scan.c index 983aed1..921a8c8 100644 --- a/drivers/scsi/scsi_scan.c +++ b/drivers/scsi/scsi_scan.c @@ -290,7 +290,8 @@ static struct scsi_device *scsi_alloc_sdev(struct scsi_target *starget, if (!shost_use_blk_mq(sdev->host) && (shost->bqt || shost->hostt->use_blk_tags)) { blk_queue_init_tags(sdev->request_queue, - sdev->host->cmd_per_lun, shost->bqt); + sdev->host->cmd_per_lun, shost->bqt, + shost->hostt->tag_alloc_policy); } scsi_change_queue_depth(sdev, sdev->host->cmd_per_lun); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 4c4b732..6f388fd 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -272,7 +272,11 @@ struct blk_queue_tag { int max_depth; /* what we will send to device */ int real_max_depth; /* what the array can hold */ atomic_t refcnt; /* map can be shared */ + int alloc_policy; /* tag allocation policy */ + int next_tag; /* next tag */ }; +#define BLK_TAG_ALLOC_FIFO 0 /* allocate starting from 0 */ +#define BLK_TAG_ALLOC_RR 1 /* allocate starting from last allocated tag */ #define BLK_SCSI_MAX_CMDS (256) #define BLK_SCSI_CMD_PER_LONG (BLK_SCSI_MAX_CMDS / (sizeof(long) * 8)) @@ -1139,11 +1143,11 @@ static inline bool blk_needs_flush_plug(struct task_struct *tsk) extern int blk_queue_start_tag(struct request_queue *, struct request *); extern struct request *blk_queue_find_tag(struct request_queue *, int); extern void blk_queue_end_tag(struct request_queue *, struct request *); -extern int blk_queue_init_tags(struct request_queue *, int, struct blk_queue_tag *); +extern int blk_queue_init_tags(struct request_queue *, int, struct blk_queue_tag *, int); extern void blk_queue_free_tags(struct request_queue *); extern int 
blk_queue_resize_tags(struct request_queue *, int); extern void blk_queue_invalidate_tags(struct request_queue *); -extern struct blk_queue_tag *blk_init_tags(int); +extern struct blk_queue_tag *blk_init_tags(int, int); extern void blk_free_tags(struct blk_queue_tag *); static inline struct request *blk_map_queue_find_tag(struct blk_queue_tag *bqt, diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h index 019e668..e113c75 100644 --- a/include/scsi/scsi_host.h +++ b/include/scsi/scsi_host.h @@ -402,6 +402,9 @@ struct scsi_host_template { */ unsigned char present; + /* If use block layer to manage tags, this is tag allocation policy */ + int tag_alloc_policy; + /* * Let the block layer assigns tags to all commands. */ diff --git a/include/scsi/scsi_tcq.h b/include/scsi/scsi_tcq.h index 9708b28..b27977e 100644 --- a/include/scsi/scsi_tcq.h +++ b/include/scsi/scsi_tcq.h @@ -66,7 +66,8 @@ static inline int scsi_init_shared_tag_map(struct Scsi_Host *shost, int depth) * devices on the shared host (for libata) */ if (!shost->bqt) { - shost->bqt = blk_init_tags(depth); + shost->bqt = blk_init_tags(depth, + shost->hostt->tag_alloc_policy); if (!shost->bqt) return -ENOMEM; } -- cgit v0.10.2 From 24391c0dc57c3756a219defaa781e68637d6ab7d Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 23 Jan 2015 14:18:00 -0700 Subject: blk-mq: add tag allocation policy This is the blk-mq part to support tag allocation policy. The default allocation policy isn't changed (though it's not a strict FIFO). The new policy is round-robin for libata. But it's a try-best implementation. If multiple tasks are competing, the tags returned will be mixed (which is unavoidable even with !mq, as requests from different tasks can be mixed in queue) Cc: Jens Axboe Cc: Tejun Heo Cc: Christoph Hellwig Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index d4daee3..e3387a7 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -140,7 +140,8 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx, return atomic_read(&hctx->nr_active) < depth; } -static int __bt_get_word(struct blk_align_bitmap *bm, unsigned int last_tag) +static int __bt_get_word(struct blk_align_bitmap *bm, unsigned int last_tag, + bool nowrap) { int tag, org_last_tag = last_tag; @@ -152,7 +153,7 @@ static int __bt_get_word(struct blk_align_bitmap *bm, unsigned int last_tag) * offset to 0 in a failure case, so start from 0 to * exhaust the map. */ - if (org_last_tag && last_tag) { + if (org_last_tag && last_tag && !nowrap) { last_tag = org_last_tag = 0; continue; } @@ -170,6 +171,8 @@ static int __bt_get_word(struct blk_align_bitmap *bm, unsigned int last_tag) return tag; } +#define BT_ALLOC_RR(tags) (tags->alloc_policy == BLK_TAG_ALLOC_RR) + /* * Straight forward bitmap tag implementation, where each bit is a tag * (cleared == free, and set == busy). The small twist is using per-cpu @@ -182,7 +185,7 @@ static int __bt_get_word(struct blk_align_bitmap *bm, unsigned int last_tag) * until the map is exhausted. 
*/ static int __bt_get(struct blk_mq_hw_ctx *hctx, struct blk_mq_bitmap_tags *bt, - unsigned int *tag_cache) + unsigned int *tag_cache, struct blk_mq_tags *tags) { unsigned int last_tag, org_last_tag; int index, i, tag; @@ -194,7 +197,8 @@ static int __bt_get(struct blk_mq_hw_ctx *hctx, struct blk_mq_bitmap_tags *bt, index = TAG_TO_INDEX(bt, last_tag); for (i = 0; i < bt->map_nr; i++) { - tag = __bt_get_word(&bt->map[index], TAG_TO_BIT(bt, last_tag)); + tag = __bt_get_word(&bt->map[index], TAG_TO_BIT(bt, last_tag), + BT_ALLOC_RR(tags)); if (tag != -1) { tag += (index << bt->bits_per_word); goto done; @@ -221,7 +225,7 @@ static int __bt_get(struct blk_mq_hw_ctx *hctx, struct blk_mq_bitmap_tags *bt, * up using the specific cached tag. */ done: - if (tag == org_last_tag) { + if (tag == org_last_tag || unlikely(BT_ALLOC_RR(tags))) { last_tag = tag + 1; if (last_tag >= bt->depth - 1) last_tag = 0; @@ -250,13 +254,13 @@ static struct bt_wait_state *bt_wait_ptr(struct blk_mq_bitmap_tags *bt, static int bt_get(struct blk_mq_alloc_data *data, struct blk_mq_bitmap_tags *bt, struct blk_mq_hw_ctx *hctx, - unsigned int *last_tag) + unsigned int *last_tag, struct blk_mq_tags *tags) { struct bt_wait_state *bs; DEFINE_WAIT(wait); int tag; - tag = __bt_get(hctx, bt, last_tag); + tag = __bt_get(hctx, bt, last_tag, tags); if (tag != -1) return tag; @@ -267,7 +271,7 @@ static int bt_get(struct blk_mq_alloc_data *data, do { prepare_to_wait(&bs->wait, &wait, TASK_UNINTERRUPTIBLE); - tag = __bt_get(hctx, bt, last_tag); + tag = __bt_get(hctx, bt, last_tag, tags); if (tag != -1) break; @@ -282,7 +286,7 @@ static int bt_get(struct blk_mq_alloc_data *data, * Retry tag allocation after running the hardware queue, * as running the queue may also have found completions. */ - tag = __bt_get(hctx, bt, last_tag); + tag = __bt_get(hctx, bt, last_tag, tags); if (tag != -1) break; @@ -313,7 +317,7 @@ static unsigned int __blk_mq_get_tag(struct blk_mq_alloc_data *data) int tag; tag = bt_get(data, &data->hctx->tags->bitmap_tags, data->hctx, - &data->ctx->last_tag); + &data->ctx->last_tag, data->hctx->tags); if (tag >= 0) return tag + data->hctx->tags->nr_reserved_tags; @@ -329,7 +333,8 @@ static unsigned int __blk_mq_get_reserved_tag(struct blk_mq_alloc_data *data) return BLK_MQ_TAG_FAIL; } - tag = bt_get(data, &data->hctx->tags->breserved_tags, NULL, &zero); + tag = bt_get(data, &data->hctx->tags->breserved_tags, NULL, &zero, + data->hctx->tags); if (tag < 0) return BLK_MQ_TAG_FAIL; @@ -401,7 +406,8 @@ void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag, BUG_ON(real_tag >= tags->nr_tags); bt_clear_tag(&tags->bitmap_tags, real_tag); - *last_tag = real_tag; + if (likely(tags->alloc_policy == BLK_TAG_ALLOC_FIFO)) + *last_tag = real_tag; } else { BUG_ON(tag >= tags->nr_reserved_tags); bt_clear_tag(&tags->breserved_tags, tag); @@ -538,10 +544,12 @@ static void bt_free(struct blk_mq_bitmap_tags *bt) } static struct blk_mq_tags *blk_mq_init_bitmap_tags(struct blk_mq_tags *tags, - int node) + int node, int alloc_policy) { unsigned int depth = tags->nr_tags - tags->nr_reserved_tags; + tags->alloc_policy = alloc_policy; + if (bt_alloc(&tags->bitmap_tags, depth, node, false)) goto enomem; if (bt_alloc(&tags->breserved_tags, tags->nr_reserved_tags, node, true)) @@ -555,7 +563,8 @@ enomem: } struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags, - unsigned int reserved_tags, int node) + unsigned int reserved_tags, + int node, int alloc_policy) { struct blk_mq_tags *tags; @@ -571,7 +580,7 @@ struct blk_mq_tags 
*blk_mq_init_tags(unsigned int total_tags, tags->nr_tags = total_tags; tags->nr_reserved_tags = reserved_tags; - return blk_mq_init_bitmap_tags(tags, node); + return blk_mq_init_bitmap_tags(tags, node, alloc_policy); } void blk_mq_free_tags(struct blk_mq_tags *tags) diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h index a6fa0fc..90767b3 100644 --- a/block/blk-mq-tag.h +++ b/block/blk-mq-tag.h @@ -42,10 +42,12 @@ struct blk_mq_tags { struct request **rqs; struct list_head page_list; + + int alloc_policy; }; -extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int reserved_tags, int node); +extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int reserved_tags, int node, int alloc_policy); extern void blk_mq_free_tags(struct blk_mq_tags *tags); extern unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data); diff --git a/block/blk-mq.c b/block/blk-mq.c index a7d4a98..eb8e694 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1374,7 +1374,8 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set, size_t rq_size, left; tags = blk_mq_init_tags(set->queue_depth, set->reserved_tags, - set->numa_node); + set->numa_node, + BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags)); if (!tags) return NULL; diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 9ea95dd..49ab115 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -2188,6 +2188,8 @@ int scsi_mq_setup_tags(struct Scsi_Host *shost) shost->tag_set.cmd_size = cmd_size; shost->tag_set.numa_node = NUMA_NO_NODE; shost->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE; + shost->tag_set.flags |= + BLK_ALLOC_POLICY_TO_MQ_FLAG(shost->hostt->tag_alloc_policy); shost->tag_set.driver_data = shost; return blk_mq_alloc_tag_set(&shost->tag_set); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 5b6500c..86b08b1 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -147,6 +147,8 @@ enum { BLK_MQ_F_SG_MERGE = 1 << 2, BLK_MQ_F_SYSFS_UP = 1 << 3, BLK_MQ_F_DEFER_ISSUE = 1 << 4, + BLK_MQ_F_ALLOC_POLICY_START_BIT = 8, + BLK_MQ_F_ALLOC_POLICY_BITS = 1, BLK_MQ_S_STOPPED = 0, BLK_MQ_S_TAG_ACTIVE = 1, @@ -155,6 +157,12 @@ enum { BLK_MQ_CPU_WORK_BATCH = 8, }; +#define BLK_MQ_FLAG_TO_ALLOC_POLICY(flags) \ + ((flags >> BLK_MQ_F_ALLOC_POLICY_START_BIT) & \ + ((1 << BLK_MQ_F_ALLOC_POLICY_BITS) - 1)) +#define BLK_ALLOC_POLICY_TO_MQ_FLAG(policy) \ + ((policy & ((1 << BLK_MQ_F_ALLOC_POLICY_BITS) - 1)) \ + << BLK_MQ_F_ALLOC_POLICY_START_BIT) struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *); void blk_mq_finish_init(struct request_queue *q); -- cgit v0.10.2 From febf71588c2a750e04dc2a8b0824ce120c48bd9e Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 17 Oct 2014 17:46:35 -0600 Subject: block: require blk_rq_prep_clone() be given an initialized clone request Prepare to allow blk_rq_prep_clone() to accept clone requests that were allocated from blk-mq request queues. As such the blk_rq_prep_clone() caller must first initialize the clone request. 
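In practice this turns the call site into a two-step pattern. A hedged sketch of the resulting caller contract (the helper name is illustrative only; the dm.c hunk below is the real consumer):

	#include <linux/blkdev.h>

	/* illustrative wrapper, not a real kernel helper */
	static int prepare_clone(struct request *clone, struct request *rq_src,
				 struct bio_set *bs, gfp_t gfp_mask,
				 int (*bio_ctr)(struct bio *, struct bio *, void *),
				 void *data)
	{
		/* the caller, not blk_rq_prep_clone(), initializes the clone now */
		blk_rq_init(NULL, clone);

		return blk_rq_prep_clone(clone, rq_src, bs, gfp_mask,
					 bio_ctr, data);
	}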
Signed-off-by: Keith Busch Signed-off-by: Mike Snitzer Signed-off-by: Jens Axboe diff --git a/block/blk-core.c b/block/blk-core.c index 3ad4055..1b5fa21 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2945,8 +2945,6 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src, if (!bs) bs = fs_bio_set; - blk_rq_init(NULL, rq); - __rq_for_each_bio(bio_src, rq_src) { bio = bio_clone_fast(bio_src, gfp_mask, bs); if (!bio) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index b98cd9d..f251633 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1719,6 +1719,7 @@ static int setup_clone(struct request *clone, struct request *rq, { int r; + blk_rq_init(NULL, rq); r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC, dm_rq_bio_constructor, tio); if (r) -- cgit v0.10.2 From 7fb4898e0cd6e7eba7419790921391e53848a35d Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 17 Oct 2014 17:46:38 -0600 Subject: block: add blk-mq support to blk_insert_cloned_request() If the request passed to blk_insert_cloned_request() was allocated by a blk-mq device it must be submitted using blk_mq_insert_request(). Signed-off-by: Keith Busch Signed-off-by: Mike Snitzer Signed-off-by: Jens Axboe diff --git a/block/blk-core.c b/block/blk-core.c index 1b5fa21..c81f02c 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2048,6 +2048,13 @@ int blk_insert_cloned_request(struct request_queue *q, struct request *rq) should_fail_request(&rq->rq_disk->part0, blk_rq_bytes(rq))) return -EIO; + if (q->mq_ops) { + if (blk_queue_io_stat(q)) + blk_account_io_start(rq, true); + blk_mq_insert_request(rq, false, true, true); + return 0; + } + spin_lock_irqsave(q->queue_lock, flags); if (unlikely(blk_queue_dying(q))) { spin_unlock_irqrestore(q->queue_lock, flags); -- cgit v0.10.2 From 77a086890173c0958515f7322880e4680cf12f8e Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 12 Jan 2015 15:04:06 -0500 Subject: block: keep established cmd_flags when cloning into a blk-mq request blk_mq_alloc_request() may establish REQ_MQ_INFLIGHT in addition to incrementing the hctx->nr_active count. Any cmd_flags that are established in the newly allocated clone request must be preserved in addition to the cmd_flags that are later copied over from the original request as part of blk_rq_prep_clone(). Otherwise, if REQ_MQ_INFLIGHT isn't set in the clone request the hctx->nr_active count won't get decremented via blk_mq_free_request(). The only consumer of blk_rq_prep_clone() is request-based DM, which uses blk_rq_init() prior to calling blk_rq_prep_clone() for the non-blk-mq case. Given the cloned request's cmd_flags will be 0 it is safe to OR them with the original request's cmd_flags for both the non-blk-mq and blk-mq cases. 
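The change itself is a single operator, but the semantics matter; a brief annotated before/after (with REQ_MQ_INFLIGHT standing in for any flag the allocator already established on the clone):

	/* before: clobbers flags already set on the freshly allocated clone */
	dst->cmd_flags = (src->cmd_flags & REQ_CLONE_MASK) | REQ_NOMERGE;

	/*
	 * after: preserves the clone's existing flags (e.g. REQ_MQ_INFLIGHT);
	 * on the non-blk-mq path blk_rq_init() left cmd_flags at 0, so the
	 * OR is equivalent to plain assignment there.
	 */
	dst->cmd_flags |= (src->cmd_flags & REQ_CLONE_MASK) | REQ_NOMERGE;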
Reported-by: Bart Van Assche Signed-off-by: Keith Busch Signed-off-by: Mike Snitzer Signed-off-by: Jens Axboe diff --git a/block/blk-core.c b/block/blk-core.c index c81f02c..b5e0cc3 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2914,7 +2914,7 @@ EXPORT_SYMBOL_GPL(blk_rq_unprep_clone); static void __blk_rq_prep_clone(struct request *dst, struct request *src) { dst->cpu = src->cpu; - dst->cmd_flags = (src->cmd_flags & REQ_CLONE_MASK) | REQ_NOMERGE; + dst->cmd_flags |= (src->cmd_flags & REQ_CLONE_MASK) | REQ_NOMERGE; dst->cmd_type = src->cmd_type; dst->__sector = blk_rq_pos(src); dst->__data_len = blk_rq_bytes(src); -- cgit v0.10.2 From ad9cf3bbd18a94806314741ac8092c3422f5aebe Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Tue, 16 Dec 2014 12:54:25 -0500 Subject: block: mark blk-mq devices as stackable Commit 4ee5eaf4 ("block: add a queue flag for request stacking support") introduced the concept of "STACKABLE" and blk-mq devices fit the definition in that they establish q->request_fn. So establish QUEUE_FLAG_STACKABLE in QUEUE_FLAG_MQ_DEFAULT. While not strictly needed (DM _could_ just check for q->mq_ops to assume the device is request-based), request-based DM support for blk-mq devices benefits from the ability to consistently check for QUEUE_FLAG_STACKABLE before allowing a device to be stacked into a request-based DM table. Signed-off-by: Mike Snitzer Signed-off-by: Jens Axboe diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 6f388fd..13e1640 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -520,6 +520,7 @@ struct request_queue { (1 << QUEUE_FLAG_ADD_RANDOM)) #define QUEUE_FLAG_MQ_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ + (1 << QUEUE_FLAG_STACKABLE) | \ (1 << QUEUE_FLAG_SAME_COMP)) static inline void queue_lockdep_assert_held(struct request_queue *q) -- cgit v0.10.2 From 42d2683a2704ef4bbbb07fd0b9486ab312dd8c56 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 18 Jan 2015 16:16:28 +0100 Subject: block: simplify bio_map_kern Just open code the trivial mapping from a kernel virtual address to a bio instead of going through the complex user address mapping machinery. 
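The core of the open coding, as in the bio_copy_kern() hunk below, is simple arithmetic on the kernel virtual address: round the end of the buffer up to a page boundary, shift both ends down by PAGE_SHIFT, and the difference is the number of pages the bio needs. A hedged, standalone illustration (userspace, assuming 4 KiB pages for the example):

	#include <stdio.h>

	#define PAGE_SHIFT	12
	#define PAGE_SIZE	(1UL << PAGE_SHIFT)

	static unsigned long nr_pages_for(unsigned long kaddr, unsigned long len)
	{
		unsigned long end   = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
		unsigned long start = kaddr >> PAGE_SHIFT;

		return end - start;	/* pages spanned, partial pages included */
	}

	int main(void)
	{
		/* 8 KiB starting 100 bytes into a page spans 3 pages */
		printf("%lu\n", nr_pages_for(0x1000 + 100, 8192));
		return 0;
	}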
Signed-off-by: Christoph Hellwig Reviewed-by: Ming Lei Signed-off-by: Jens Axboe diff --git a/block/bio.c b/block/bio.c index 471d738..54da51e 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1563,9 +1563,8 @@ static void bio_copy_kern_endio(struct bio *bio, int err) { struct bio_vec *bvec; const int read = bio_data_dir(bio) == READ; - struct bio_map_data *bmd = bio->bi_private; + char *p = bio->bi_private; int i; - char *p = bmd->sgvecs[0].iov_base; bio_for_each_segment_all(bvec, bio, i) { char *addr = page_address(bvec->bv_page); @@ -1577,7 +1576,6 @@ static void bio_copy_kern_endio(struct bio *bio, int err) p += bvec->bv_len; } - kfree(bmd); bio_put(bio); } @@ -1595,28 +1593,58 @@ static void bio_copy_kern_endio(struct bio *bio, int err) struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len, gfp_t gfp_mask, int reading) { - struct bio *bio; + unsigned long kaddr = (unsigned long)data; + unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; + unsigned long start = kaddr >> PAGE_SHIFT; struct bio_vec *bvec; - int i; + struct bio *bio; + void *p = data; + int nr_pages = 0, i; - bio = bio_copy_user(q, NULL, (unsigned long)data, len, 1, gfp_mask); - if (IS_ERR(bio)) - return bio; + /* + * Overflow, abort + */ + if (end < start) + return ERR_PTR(-EINVAL); - if (!reading) { - void *p = data; + nr_pages = end - start; + bio = bio_kmalloc(gfp_mask, nr_pages); + if (!bio) + return ERR_PTR(-ENOMEM); - bio_for_each_segment_all(bvec, bio, i) { - char *addr = page_address(bvec->bv_page); + while (len) { + struct page *page; + unsigned int bytes = PAGE_SIZE; - memcpy(addr, p, bvec->bv_len); - p += bvec->bv_len; - } + if (bytes > len) + bytes = len; + + page = alloc_page(q->bounce_gfp | gfp_mask); + if (!page) + goto cleanup; + + if (!reading) + memcpy(page_address(page), p, bytes); + + if (bio_add_pc_page(q, bio, page, bytes, 0) < bytes) + break; + + len -= bytes; + p += bytes; } - bio->bi_end_io = bio_copy_kern_endio; + if (!reading) + bio->bi_rw |= REQ_WRITE; + bio->bi_private = data; + bio->bi_end_io = bio_copy_kern_endio; return bio; + +cleanup: + bio_for_each_segment_all(bvec, bio, i) + __free_page(bvec->bv_page); + bio_put(bio); + return ERR_PTR(-ENOMEM); } EXPORT_SYMBOL(bio_copy_kern); -- cgit v0.10.2 From ddad8dd0a162fde61646a627a3017c258601dc8a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 18 Jan 2015 16:16:29 +0100 Subject: block: use blk_rq_map_user_iov to implement blk_rq_map_user Signed-off-by: Christoph Hellwig Reviewed-by: Ming Lei Signed-off-by: Jens Axboe diff --git a/block/bio.c b/block/bio.c index 54da51e..879921e 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1102,7 +1102,7 @@ static int __bio_copy_iov(struct bio *bio, const struct sg_iovec *iov, int iov_c * bio_uncopy_user - finish previously mapped bio * @bio: bio being terminated * - * Free pages allocated from bio_copy_user() and write back data + * Free pages allocated from bio_copy_user_iov() and write back data * to user space in case of a read. */ int bio_uncopy_user(struct bio *bio) @@ -1256,32 +1256,6 @@ out_bmd: return ERR_PTR(ret); } -/** - * bio_copy_user - copy user data to bio - * @q: destination block queue - * @map_data: pointer to the rq_map_data holding pages (if necessary) - * @uaddr: start of user address - * @len: length in bytes - * @write_to_vm: bool indicating writing to pages or not - * @gfp_mask: memory allocation flags - * - * Prepares and returns a bio for indirect user io, bouncing data - * to/from kernel pages as necessary. 
Must be paired with - * call bio_uncopy_user() on io completion. - */ -struct bio *bio_copy_user(struct request_queue *q, struct rq_map_data *map_data, - unsigned long uaddr, unsigned int len, - int write_to_vm, gfp_t gfp_mask) -{ - struct sg_iovec iov; - - iov.iov_base = (void __user *)uaddr; - iov.iov_len = len; - - return bio_copy_user_iov(q, map_data, &iov, 1, write_to_vm, gfp_mask); -} -EXPORT_SYMBOL(bio_copy_user); - static struct bio *__bio_map_user_iov(struct request_queue *q, struct block_device *bdev, const struct sg_iovec *iov, int iov_count, @@ -1395,31 +1369,6 @@ static struct bio *__bio_map_user_iov(struct request_queue *q, } /** - * bio_map_user - map user address into bio - * @q: the struct request_queue for the bio - * @bdev: destination block device - * @uaddr: start of user address - * @len: length in bytes - * @write_to_vm: bool indicating writing to pages or not - * @gfp_mask: memory allocation flags - * - * Map the user space address into a bio suitable for io to a block - * device. Returns an error pointer in case of error. - */ -struct bio *bio_map_user(struct request_queue *q, struct block_device *bdev, - unsigned long uaddr, unsigned int len, int write_to_vm, - gfp_t gfp_mask) -{ - struct sg_iovec iov; - - iov.iov_base = (void __user *)uaddr; - iov.iov_len = len; - - return bio_map_user_iov(q, bdev, &iov, 1, write_to_vm, gfp_mask); -} -EXPORT_SYMBOL(bio_map_user); - -/** * bio_map_user_iov - map user sg_iovec table into bio * @q: the struct request_queue for the bio * @bdev: destination block device diff --git a/block/blk-map.c b/block/blk-map.c index f890d43..152a5fe 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -39,130 +39,6 @@ static int __blk_rq_unmap_user(struct bio *bio) return ret; } -static int __blk_rq_map_user(struct request_queue *q, struct request *rq, - struct rq_map_data *map_data, void __user *ubuf, - unsigned int len, gfp_t gfp_mask) -{ - unsigned long uaddr; - struct bio *bio, *orig_bio; - int reading, ret; - - reading = rq_data_dir(rq) == READ; - - /* - * if alignment requirement is satisfied, map in user pages for - * direct dma. else, set up kernel bounce buffers - */ - uaddr = (unsigned long) ubuf; - if (blk_rq_aligned(q, uaddr, len) && !map_data) - bio = bio_map_user(q, NULL, uaddr, len, reading, gfp_mask); - else - bio = bio_copy_user(q, map_data, uaddr, len, reading, gfp_mask); - - if (IS_ERR(bio)) - return PTR_ERR(bio); - - if (map_data && map_data->null_mapped) - bio->bi_flags |= (1 << BIO_NULL_MAPPED); - - orig_bio = bio; - blk_queue_bounce(q, &bio); - - /* - * We link the bounce buffer in and could have to traverse it - * later so we have to get a ref to prevent it from being freed - */ - bio_get(bio); - - ret = blk_rq_append_bio(q, rq, bio); - if (!ret) - return bio->bi_iter.bi_size; - - /* if it was boucned we must call the end io function */ - bio_endio(bio, 0); - __blk_rq_unmap_user(orig_bio); - bio_put(bio); - return ret; -} - -/** - * blk_rq_map_user - map user data to a request, for REQ_TYPE_BLOCK_PC usage - * @q: request queue where request should be inserted - * @rq: request structure to fill - * @map_data: pointer to the rq_map_data holding pages (if necessary) - * @ubuf: the user buffer - * @len: length of user data - * @gfp_mask: memory allocation flags - * - * Description: - * Data will be mapped directly for zero copy I/O, if possible. Otherwise - * a kernel bounce buffer is used. - * - * A matching blk_rq_unmap_user() must be issued at the end of I/O, while - * still in process context. 
- * - * Note: The mapped bio may need to be bounced through blk_queue_bounce() - * before being submitted to the device, as pages mapped may be out of - * reach. It's the callers responsibility to make sure this happens. The - * original bio must be passed back in to blk_rq_unmap_user() for proper - * unmapping. - */ -int blk_rq_map_user(struct request_queue *q, struct request *rq, - struct rq_map_data *map_data, void __user *ubuf, - unsigned long len, gfp_t gfp_mask) -{ - unsigned long bytes_read = 0; - struct bio *bio = NULL; - int ret; - - if (len > (queue_max_hw_sectors(q) << 9)) - return -EINVAL; - if (!len) - return -EINVAL; - - if (!ubuf && (!map_data || !map_data->null_mapped)) - return -EINVAL; - - while (bytes_read != len) { - unsigned long map_len, end, start; - - map_len = min_t(unsigned long, len - bytes_read, BIO_MAX_SIZE); - end = ((unsigned long)ubuf + map_len + PAGE_SIZE - 1) - >> PAGE_SHIFT; - start = (unsigned long)ubuf >> PAGE_SHIFT; - - /* - * A bad offset could cause us to require BIO_MAX_PAGES + 1 - * pages. If this happens we just lower the requested - * mapping len by a page so that we can fit - */ - if (end - start > BIO_MAX_PAGES) - map_len -= PAGE_SIZE; - - ret = __blk_rq_map_user(q, rq, map_data, ubuf, map_len, - gfp_mask); - if (ret < 0) - goto unmap_rq; - if (!bio) - bio = rq->bio; - bytes_read += ret; - ubuf += ret; - - if (map_data) - map_data->offset += ret; - } - - if (!bio_flagged(bio, BIO_USER_MAPPED)) - rq->cmd_flags |= REQ_COPY_USER; - - return 0; -unmap_rq: - blk_rq_unmap_user(bio); - rq->bio = NULL; - return ret; -} -EXPORT_SYMBOL(blk_rq_map_user); - /** * blk_rq_map_user_iov - map user data to a request, for REQ_TYPE_BLOCK_PC usage * @q: request queue where request should be inserted @@ -241,6 +117,19 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, } EXPORT_SYMBOL(blk_rq_map_user_iov); +int blk_rq_map_user(struct request_queue *q, struct request *rq, + struct rq_map_data *map_data, void __user *ubuf, + unsigned long len, gfp_t gfp_mask) +{ + struct sg_iovec iov; + + iov.iov_base = (void __user *)ubuf; + iov.iov_len = len; + + return blk_rq_map_user_iov(q, rq, map_data, &iov, 1, len, gfp_mask); +} +EXPORT_SYMBOL(blk_rq_map_user); + /** * blk_rq_unmap_user - unmap a request with user data * @bio: start of bio list diff --git a/include/linux/bio.h b/include/linux/bio.h index efead0b..d0d6735 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -428,8 +428,6 @@ extern int bio_add_page(struct bio *, struct page *, unsigned int,unsigned int); extern int bio_add_pc_page(struct request_queue *, struct bio *, struct page *, unsigned int, unsigned int); extern int bio_get_nr_vecs(struct block_device *); -extern struct bio *bio_map_user(struct request_queue *, struct block_device *, - unsigned long, unsigned int, int, gfp_t); struct sg_iovec; struct rq_map_data; extern struct bio *bio_map_user_iov(struct request_queue *, @@ -462,8 +460,6 @@ static inline void bio_flush_dcache_pages(struct bio *bi) extern void bio_copy_data(struct bio *dst, struct bio *src); extern int bio_alloc_pages(struct bio *bio, gfp_t gfp); -extern struct bio *bio_copy_user(struct request_queue *, struct rq_map_data *, - unsigned long, unsigned int, int, gfp_t); extern struct bio *bio_copy_user_iov(struct request_queue *, struct rq_map_data *, const struct sg_iovec *, -- cgit v0.10.2 From 1dfa0f68c040080c5fefa7211b4ec34d202f8570 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 18 Jan 2015 16:16:30 +0100 Subject: block: add a helper to free bio 
bounce buffer pages The code sniplet to walk all bio_vecs and free their pages is opencoded in way to many places, so factor it into a helper. Also convert the slightly more complex cases in bio_kern_endio and __bio_copy_iov where we break the freeing from an existing loop into a separate one. Signed-off-by: Christoph Hellwig Reviewed-by: Ming Lei Signed-off-by: Jens Axboe diff --git a/block/bio.c b/block/bio.c index 879921e..0895f69 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1048,7 +1048,7 @@ static struct bio_map_data *bio_alloc_map_data(unsigned int iov_count, } static int __bio_copy_iov(struct bio *bio, const struct sg_iovec *iov, int iov_count, - int to_user, int from_user, int do_free_page) + int to_user, int from_user) { int ret = 0, i; struct bio_vec *bvec; @@ -1090,14 +1090,20 @@ static int __bio_copy_iov(struct bio *bio, const struct sg_iovec *iov, int iov_c iov_off = 0; } } - - if (do_free_page) - __free_page(bvec->bv_page); } return ret; } +static void bio_free_pages(struct bio *bio) +{ + struct bio_vec *bvec; + int i; + + bio_for_each_segment_all(bvec, bio, i) + __free_page(bvec->bv_page); +} + /** * bio_uncopy_user - finish previously mapped bio * @bio: bio being terminated @@ -1108,8 +1114,7 @@ static int __bio_copy_iov(struct bio *bio, const struct sg_iovec *iov, int iov_c int bio_uncopy_user(struct bio *bio) { struct bio_map_data *bmd = bio->bi_private; - struct bio_vec *bvec; - int ret = 0, i; + int ret = 0; if (!bio_flagged(bio, BIO_NULL_MAPPED)) { /* @@ -1118,11 +1123,9 @@ int bio_uncopy_user(struct bio *bio) */ if (current->mm) ret = __bio_copy_iov(bio, bmd->sgvecs, bmd->nr_sgvecs, - bio_data_dir(bio) == READ, - 0, bmd->is_our_pages); - else if (bmd->is_our_pages) - bio_for_each_segment_all(bvec, bio, i) - __free_page(bvec->bv_page); + bio_data_dir(bio) == READ, 0); + if (bmd->is_our_pages) + bio_free_pages(bio); } kfree(bmd); bio_put(bio); @@ -1149,7 +1152,6 @@ struct bio *bio_copy_user_iov(struct request_queue *q, int write_to_vm, gfp_t gfp_mask) { struct bio_map_data *bmd; - struct bio_vec *bvec; struct page *page; struct bio *bio; int i, ret; @@ -1238,7 +1240,7 @@ struct bio *bio_copy_user_iov(struct request_queue *q, */ if ((!write_to_vm && (!map_data || !map_data->null_mapped)) || (map_data && map_data->from_user)) { - ret = __bio_copy_iov(bio, iov, iov_count, 0, 1, 0); + ret = __bio_copy_iov(bio, iov, iov_count, 0, 1); if (ret) goto cleanup; } @@ -1247,9 +1249,7 @@ struct bio *bio_copy_user_iov(struct request_queue *q, return bio; cleanup: if (!map_data) - bio_for_each_segment_all(bvec, bio, i) - __free_page(bvec->bv_page); - + bio_free_pages(bio); bio_put(bio); out_bmd: kfree(bmd); @@ -1510,22 +1510,22 @@ EXPORT_SYMBOL(bio_map_kern); static void bio_copy_kern_endio(struct bio *bio, int err) { - struct bio_vec *bvec; - const int read = bio_data_dir(bio) == READ; + bio_free_pages(bio); + bio_put(bio); +} + +static void bio_copy_kern_endio_read(struct bio *bio, int err) +{ char *p = bio->bi_private; + struct bio_vec *bvec; int i; bio_for_each_segment_all(bvec, bio, i) { - char *addr = page_address(bvec->bv_page); - - if (read) - memcpy(p, addr, bvec->bv_len); - - __free_page(bvec->bv_page); + memcpy(p, page_address(bvec->bv_page), bvec->bv_len); p += bvec->bv_len; } - bio_put(bio); + bio_copy_kern_endio(bio, err); } /** @@ -1545,10 +1545,9 @@ struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len, unsigned long kaddr = (unsigned long)data; unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; unsigned long start = kaddr >> 
PAGE_SHIFT; - struct bio_vec *bvec; struct bio *bio; void *p = data; - int nr_pages = 0, i; + int nr_pages = 0; /* * Overflow, abort @@ -1582,16 +1581,18 @@ struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len, p += bytes; } - if (!reading) + if (reading) { + bio->bi_end_io = bio_copy_kern_endio_read; + bio->bi_private = data; + } else { + bio->bi_end_io = bio_copy_kern_endio; bio->bi_rw |= REQ_WRITE; + } - bio->bi_private = data; - bio->bi_end_io = bio_copy_kern_endio; return bio; cleanup: - bio_for_each_segment_all(bvec, bio, i) - __free_page(bvec->bv_page); + bio_free_pages(bio); bio_put(bio); return ERR_PTR(-ENOMEM); } -- cgit v0.10.2 From 26e49cfc7e988a76bf1e55cef0d9e438e5489180 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 18 Jan 2015 16:16:31 +0100 Subject: block: pass iov_iter to the BLOCK_PC mapping functions Make use of a new interface provided by iov_iter, backed by scatter-gather list of iovec, instead of the old interface based on sg_iovec. Also use iov_iter_advance() instead of manual iteration. This commit should contain only literal replacements, without functional changes. Cc: Christoph Hellwig Cc: Jens Axboe Cc: Doug Gilbert Cc: "James E.J. Bottomley" Signed-off-by: Kent Overstreet [dpark: add more description in commit message] Signed-off-by: Dongsu Park [hch: fixed to do a deep clone of the iov_iter, and to properly use the iov_iter direction] Signed-off-by: Christoph Hellwig Reviewed-by: Ming Lei Signed-off-by: Jens Axboe diff --git a/block/bio.c b/block/bio.c index 0895f69..7d8c655 100644 --- a/block/bio.c +++ b/block/bio.c @@ -28,7 +28,6 @@ #include #include #include -#include /* for struct sg_iovec */ #include @@ -1022,21 +1021,11 @@ void bio_copy_data(struct bio *dst, struct bio *src) EXPORT_SYMBOL(bio_copy_data); struct bio_map_data { - int nr_sgvecs; int is_our_pages; - struct sg_iovec sgvecs[]; + struct iov_iter iter; + struct iovec iov[]; }; -static void bio_set_map_data(struct bio_map_data *bmd, struct bio *bio, - const struct sg_iovec *iov, int iov_count, - int is_our_pages) -{ - memcpy(bmd->sgvecs, iov, sizeof(struct sg_iovec) * iov_count); - bmd->nr_sgvecs = iov_count; - bmd->is_our_pages = is_our_pages; - bio->bi_private = bmd; -} - static struct bio_map_data *bio_alloc_map_data(unsigned int iov_count, gfp_t gfp_mask) { @@ -1044,36 +1033,33 @@ static struct bio_map_data *bio_alloc_map_data(unsigned int iov_count, return NULL; return kmalloc(sizeof(struct bio_map_data) + - sizeof(struct sg_iovec) * iov_count, gfp_mask); + sizeof(struct iovec) * iov_count, gfp_mask); } -static int __bio_copy_iov(struct bio *bio, const struct sg_iovec *iov, int iov_count, - int to_user, int from_user) +static int __bio_copy_iov(struct bio *bio, const struct iov_iter *iter, + int to_user, int from_user) { int ret = 0, i; struct bio_vec *bvec; - int iov_idx = 0; - unsigned int iov_off = 0; + struct iov_iter iov_iter = *iter; bio_for_each_segment_all(bvec, bio, i) { char *bv_addr = page_address(bvec->bv_page); unsigned int bv_len = bvec->bv_len; - while (bv_len && iov_idx < iov_count) { - unsigned int bytes; - char __user *iov_addr; - - bytes = min_t(unsigned int, - iov[iov_idx].iov_len - iov_off, bv_len); - iov_addr = iov[iov_idx].iov_base + iov_off; + while (bv_len && iov_iter.count) { + struct iovec iov = iov_iter_iovec(&iov_iter); + unsigned int bytes = min_t(unsigned int, bv_len, + iov.iov_len); if (!ret) { if (to_user) - ret = copy_to_user(iov_addr, bv_addr, - bytes); + ret = copy_to_user(iov.iov_base, + bv_addr, bytes); if 
(from_user) - ret = copy_from_user(bv_addr, iov_addr, + ret = copy_from_user(bv_addr, + iov.iov_base, bytes); if (ret) @@ -1082,13 +1068,7 @@ static int __bio_copy_iov(struct bio *bio, const struct sg_iovec *iov, int iov_c bv_len -= bytes; bv_addr += bytes; - iov_addr += bytes; - iov_off += bytes; - - if (iov[iov_idx].iov_len == iov_off) { - iov_idx++; - iov_off = 0; - } + iov_iter_advance(&iov_iter, bytes); } } @@ -1122,7 +1102,7 @@ int bio_uncopy_user(struct bio *bio) * don't copy into a random user address space, just free. */ if (current->mm) - ret = __bio_copy_iov(bio, bmd->sgvecs, bmd->nr_sgvecs, + ret = __bio_copy_iov(bio, &bmd->iter, bio_data_dir(bio) == READ, 0); if (bmd->is_our_pages) bio_free_pages(bio); @@ -1135,12 +1115,10 @@ EXPORT_SYMBOL(bio_uncopy_user); /** * bio_copy_user_iov - copy user data to bio - * @q: destination block queue - * @map_data: pointer to the rq_map_data holding pages (if necessary) - * @iov: the iovec. - * @iov_count: number of elements in the iovec - * @write_to_vm: bool indicating writing to pages or not - * @gfp_mask: memory allocation flags + * @q: destination block queue + * @map_data: pointer to the rq_map_data holding pages (if necessary) + * @iter: iovec iterator + * @gfp_mask: memory allocation flags * * Prepares and returns a bio for indirect user io, bouncing data * to/from kernel pages as necessary. Must be paired with @@ -1148,24 +1126,25 @@ EXPORT_SYMBOL(bio_uncopy_user); */ struct bio *bio_copy_user_iov(struct request_queue *q, struct rq_map_data *map_data, - const struct sg_iovec *iov, int iov_count, - int write_to_vm, gfp_t gfp_mask) + const struct iov_iter *iter, + gfp_t gfp_mask) { struct bio_map_data *bmd; struct page *page; struct bio *bio; int i, ret; int nr_pages = 0; - unsigned int len = 0; + unsigned int len = iter->count; unsigned int offset = map_data ? map_data->offset & ~PAGE_MASK : 0; - for (i = 0; i < iov_count; i++) { + for (i = 0; i < iter->nr_segs; i++) { unsigned long uaddr; unsigned long end; unsigned long start; - uaddr = (unsigned long)iov[i].iov_base; - end = (uaddr + iov[i].iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT; + uaddr = (unsigned long) iter->iov[i].iov_base; + end = (uaddr + iter->iov[i].iov_len + PAGE_SIZE - 1) + >> PAGE_SHIFT; start = uaddr >> PAGE_SHIFT; /* @@ -1175,22 +1154,31 @@ struct bio *bio_copy_user_iov(struct request_queue *q, return ERR_PTR(-EINVAL); nr_pages += end - start; - len += iov[i].iov_len; } if (offset) nr_pages++; - bmd = bio_alloc_map_data(iov_count, gfp_mask); + bmd = bio_alloc_map_data(iter->nr_segs, gfp_mask); if (!bmd) return ERR_PTR(-ENOMEM); + /* + * We need to do a deep copy of the iov_iter including the iovecs. + * The caller provided iov might point to an on-stack or otherwise + * shortlived one. + */ + bmd->is_our_pages = map_data ? 0 : 1; + memcpy(bmd->iov, iter->iov, sizeof(struct iovec) * iter->nr_segs); + iov_iter_init(&bmd->iter, iter->type, bmd->iov, + iter->nr_segs, iter->count); + ret = -ENOMEM; bio = bio_kmalloc(gfp_mask, nr_pages); if (!bio) goto out_bmd; - if (!write_to_vm) + if (iter->type & WRITE) bio->bi_rw |= REQ_WRITE; ret = 0; @@ -1238,14 +1226,14 @@ struct bio *bio_copy_user_iov(struct request_queue *q, /* * success */ - if ((!write_to_vm && (!map_data || !map_data->null_mapped)) || + if (((iter->type & WRITE) && (!map_data || !map_data->null_mapped)) || (map_data && map_data->from_user)) { - ret = __bio_copy_iov(bio, iov, iov_count, 0, 1); + ret = __bio_copy_iov(bio, iter, 0, 1); if (ret) goto cleanup; } - bio_set_map_data(bmd, bio, iov, iov_count, map_data ? 
0 : 1); + bio->bi_private = bmd; return bio; cleanup: if (!map_data) @@ -1258,19 +1246,21 @@ out_bmd: static struct bio *__bio_map_user_iov(struct request_queue *q, struct block_device *bdev, - const struct sg_iovec *iov, int iov_count, - int write_to_vm, gfp_t gfp_mask) + const struct iov_iter *iter, + gfp_t gfp_mask) { - int i, j; + int j; int nr_pages = 0; struct page **pages; struct bio *bio; int cur_page = 0; int ret, offset; + struct iov_iter i; + struct iovec iov; - for (i = 0; i < iov_count; i++) { - unsigned long uaddr = (unsigned long)iov[i].iov_base; - unsigned long len = iov[i].iov_len; + iov_for_each(iov, i, *iter) { + unsigned long uaddr = (unsigned long) iov.iov_base; + unsigned long len = iov.iov_len; unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; unsigned long start = uaddr >> PAGE_SHIFT; @@ -1300,16 +1290,17 @@ static struct bio *__bio_map_user_iov(struct request_queue *q, if (!pages) goto out; - for (i = 0; i < iov_count; i++) { - unsigned long uaddr = (unsigned long)iov[i].iov_base; - unsigned long len = iov[i].iov_len; + iov_for_each(iov, i, *iter) { + unsigned long uaddr = (unsigned long) iov.iov_base; + unsigned long len = iov.iov_len; unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; unsigned long start = uaddr >> PAGE_SHIFT; const int local_nr_pages = end - start; const int page_limit = cur_page + local_nr_pages; ret = get_user_pages_fast(uaddr, local_nr_pages, - write_to_vm, &pages[cur_page]); + (iter->type & WRITE) != WRITE, + &pages[cur_page]); if (ret < local_nr_pages) { ret = -EFAULT; goto out_unmap; @@ -1349,7 +1340,7 @@ static struct bio *__bio_map_user_iov(struct request_queue *q, /* * set data direction, and check if mapped pages need bouncing */ - if (!write_to_vm) + if (iter->type & WRITE) bio->bi_rw |= REQ_WRITE; bio->bi_bdev = bdev; @@ -1357,10 +1348,10 @@ static struct bio *__bio_map_user_iov(struct request_queue *q, return bio; out_unmap: - for (i = 0; i < nr_pages; i++) { - if(!pages[i]) + for (j = 0; j < nr_pages; j++) { + if (!pages[j]) break; - page_cache_release(pages[i]); + page_cache_release(pages[j]); } out: kfree(pages); @@ -1369,25 +1360,22 @@ static struct bio *__bio_map_user_iov(struct request_queue *q, } /** - * bio_map_user_iov - map user sg_iovec table into bio - * @q: the struct request_queue for the bio - * @bdev: destination block device - * @iov: the iovec. - * @iov_count: number of elements in the iovec - * @write_to_vm: bool indicating writing to pages or not - * @gfp_mask: memory allocation flags + * bio_map_user_iov - map user iovec into bio + * @q: the struct request_queue for the bio + * @bdev: destination block device + * @iter: iovec iterator + * @gfp_mask: memory allocation flags * * Map the user space address into a bio suitable for io to a block * device. Returns an error pointer in case of error. 
*/ struct bio *bio_map_user_iov(struct request_queue *q, struct block_device *bdev, - const struct sg_iovec *iov, int iov_count, - int write_to_vm, gfp_t gfp_mask) + const struct iov_iter *iter, + gfp_t gfp_mask) { struct bio *bio; - bio = __bio_map_user_iov(q, bdev, iov, iov_count, write_to_vm, - gfp_mask); + bio = __bio_map_user_iov(q, bdev, iter, gfp_mask); if (IS_ERR(bio)) return bio; diff --git a/block/blk-map.c b/block/blk-map.c index 152a5fe..30e6bb8 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -5,7 +5,7 @@ #include #include #include -#include /* for struct sg_iovec */ +#include #include "blk.h" @@ -44,9 +44,7 @@ static int __blk_rq_unmap_user(struct bio *bio) * @q: request queue where request should be inserted * @rq: request to map data to * @map_data: pointer to the rq_map_data holding pages (if necessary) - * @iov: pointer to the iovec - * @iov_count: number of elements in the iovec - * @len: I/O byte count + * @iter: iovec iterator * @gfp_mask: memory allocation flags * * Description: @@ -63,20 +61,21 @@ static int __blk_rq_unmap_user(struct bio *bio) * unmapping. */ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, - struct rq_map_data *map_data, const struct sg_iovec *iov, - int iov_count, unsigned int len, gfp_t gfp_mask) + struct rq_map_data *map_data, + const struct iov_iter *iter, gfp_t gfp_mask) { struct bio *bio; - int i, read = rq_data_dir(rq) == READ; int unaligned = 0; + struct iov_iter i; + struct iovec iov; - if (!iov || iov_count <= 0) + if (!iter || !iter->count) return -EINVAL; - for (i = 0; i < iov_count; i++) { - unsigned long uaddr = (unsigned long)iov[i].iov_base; + iov_for_each(iov, i, *iter) { + unsigned long uaddr = (unsigned long) iov.iov_base; - if (!iov[i].iov_len) + if (!iov.iov_len) return -EINVAL; /* @@ -86,16 +85,15 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, unaligned = 1; } - if (unaligned || (q->dma_pad_mask & len) || map_data) - bio = bio_copy_user_iov(q, map_data, iov, iov_count, read, - gfp_mask); + if (unaligned || (q->dma_pad_mask & iter->count) || map_data) + bio = bio_copy_user_iov(q, map_data, iter, gfp_mask); else - bio = bio_map_user_iov(q, NULL, iov, iov_count, read, gfp_mask); + bio = bio_map_user_iov(q, NULL, iter, gfp_mask); if (IS_ERR(bio)) return PTR_ERR(bio); - if (bio->bi_iter.bi_size != len) { + if (bio->bi_iter.bi_size != iter->count) { /* * Grab an extra reference to this bio, as bio_unmap_user() * expects to be able to drop it twice as it happens on the @@ -121,12 +119,14 @@ int blk_rq_map_user(struct request_queue *q, struct request *rq, struct rq_map_data *map_data, void __user *ubuf, unsigned long len, gfp_t gfp_mask) { - struct sg_iovec iov; + struct iovec iov; + struct iov_iter i; - iov.iov_base = (void __user *)ubuf; + iov.iov_base = ubuf; iov.iov_len = len; + iov_iter_init(&i, rq_data_dir(rq), &iov, 1, len); - return blk_rq_map_user_iov(q, rq, map_data, &iov, 1, len, gfp_mask); + return blk_rq_map_user_iov(q, rq, map_data, &i, gfp_mask); } EXPORT_SYMBOL(blk_rq_map_user); diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index 28163fa..e1f71c3 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c @@ -332,7 +332,7 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk, ret = 0; if (hdr->iovec_count) { - size_t iov_data_len; + struct iov_iter i; struct iovec *iov = NULL; ret = rw_copy_check_uvector(-1, hdr->dxferp, hdr->iovec_count, @@ -342,20 +342,11 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk, goto out_free_cdb; } - 
iov_data_len = ret; - ret = 0; - /* SG_IO howto says that the shorter of the two wins */ - if (hdr->dxfer_len < iov_data_len) { - hdr->iovec_count = iov_shorten(iov, - hdr->iovec_count, - hdr->dxfer_len); - iov_data_len = hdr->dxfer_len; - } + iov_iter_init(&i, rq_data_dir(rq), iov, hdr->iovec_count, + min_t(unsigned, ret, hdr->dxfer_len)); - ret = blk_rq_map_user_iov(q, rq, NULL, (struct sg_iovec *) iov, - hdr->iovec_count, - iov_data_len, GFP_KERNEL); + ret = blk_rq_map_user_iov(q, rq, NULL, &i, GFP_KERNEL); kfree(iov); } else if (hdr->dxfer_len) ret = blk_rq_map_user(q, rq, NULL, hdr->dxferp, hdr->dxfer_len, diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index b14f64c..4052e59 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -1734,22 +1734,19 @@ sg_start_req(Sg_request *srp, unsigned char *cmd) } if (iov_count) { - int len, size = sizeof(struct sg_iovec) * iov_count; + int size = sizeof(struct iovec) * iov_count; struct iovec *iov; + struct iov_iter i; iov = memdup_user(hp->dxferp, size); if (IS_ERR(iov)) return PTR_ERR(iov); - len = iov_length(iov, iov_count); - if (hp->dxfer_len < len) { - iov_count = iov_shorten(iov, iov_count, hp->dxfer_len); - len = hp->dxfer_len; - } + iov_iter_init(&i, rw, iov, iov_count, + min_t(size_t, hp->dxfer_len, + iov_length(iov, iov_count))); - res = blk_rq_map_user_iov(q, rq, md, (struct sg_iovec *)iov, - iov_count, - len, GFP_ATOMIC); + res = blk_rq_map_user_iov(q, rq, md, &i, GFP_ATOMIC); kfree(iov); } else res = blk_rq_map_user(q, rq, md, hp->dxferp, diff --git a/include/linux/bio.h b/include/linux/bio.h index d0d6735..0d6105b 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -428,11 +428,10 @@ extern int bio_add_page(struct bio *, struct page *, unsigned int,unsigned int); extern int bio_add_pc_page(struct request_queue *, struct bio *, struct page *, unsigned int, unsigned int); extern int bio_get_nr_vecs(struct block_device *); -struct sg_iovec; struct rq_map_data; extern struct bio *bio_map_user_iov(struct request_queue *, struct block_device *, - const struct sg_iovec *, int, int, gfp_t); + const struct iov_iter *, gfp_t); extern void bio_unmap_user(struct bio *); extern struct bio *bio_map_kern(struct request_queue *, void *, unsigned int, gfp_t); @@ -462,8 +461,8 @@ extern int bio_alloc_pages(struct bio *bio, gfp_t gfp); extern struct bio *bio_copy_user_iov(struct request_queue *, struct rq_map_data *, - const struct sg_iovec *, - int, int, gfp_t); + const struct iov_iter *, + gfp_t); extern int bio_uncopy_user(struct bio *); void zero_fill_bio(struct bio *bio); extern struct bio_vec *bvec_alloc(gfp_t, int, unsigned long *, mempool_t *); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 13e1640..bf4ef66 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -855,8 +855,8 @@ extern int blk_rq_map_user(struct request_queue *, struct request *, extern int blk_rq_unmap_user(struct bio *); extern int blk_rq_map_kern(struct request_queue *, struct request *, void *, unsigned int, gfp_t); extern int blk_rq_map_user_iov(struct request_queue *, struct request *, - struct rq_map_data *, const struct sg_iovec *, - int, unsigned int, gfp_t); + struct rq_map_data *, const struct iov_iter *, + gfp_t); extern int blk_execute_rq(struct request_queue *, struct gendisk *, struct request *, int); extern void blk_execute_rq_nowait(struct request_queue *, struct gendisk *, -- cgit v0.10.2 From 75c72b8366f35f2e5cf1b841b52095948878b794 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 18 Jan 2015 
16:16:32 +0100 Subject: block: merge __bio_map_kern into bio_map_kern This saves a little code, and allow to simplify the error handling. Signed-off-by: Christoph Hellwig Reviewed-by: Ming Lei Signed-off-by: Jens Axboe diff --git a/block/bio.c b/block/bio.c index 7d8c655..a69a9c9 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1429,8 +1429,18 @@ static void bio_map_kern_endio(struct bio *bio, int err) bio_put(bio); } -static struct bio *__bio_map_kern(struct request_queue *q, void *data, - unsigned int len, gfp_t gfp_mask) +/** + * bio_map_kern - map kernel address into bio + * @q: the struct request_queue for the bio + * @data: pointer to buffer to map + * @len: length in bytes + * @gfp_mask: allocation flags for bio allocation + * + * Map the kernel address into a bio suitable for io to a block + * device. Returns an error pointer in case of error. + */ +struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len, + gfp_t gfp_mask) { unsigned long kaddr = (unsigned long)data; unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; @@ -1454,8 +1464,11 @@ static struct bio *__bio_map_kern(struct request_queue *q, void *data, bytes = len; if (bio_add_pc_page(q, bio, virt_to_page(data), bytes, - offset) < bytes) - break; + offset) < bytes) { + /* we don't support partial mappings */ + bio_put(bio); + return ERR_PTR(-EINVAL); + } data += bytes; len -= bytes; @@ -1465,35 +1478,6 @@ static struct bio *__bio_map_kern(struct request_queue *q, void *data, bio->bi_end_io = bio_map_kern_endio; return bio; } - -/** - * bio_map_kern - map kernel address into bio - * @q: the struct request_queue for the bio - * @data: pointer to buffer to map - * @len: length in bytes - * @gfp_mask: allocation flags for bio allocation - * - * Map the kernel address into a bio suitable for io to a block - * device. Returns an error pointer in case of error. - */ -struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len, - gfp_t gfp_mask) -{ - struct bio *bio; - - bio = __bio_map_kern(q, data, len, gfp_mask); - if (IS_ERR(bio)) - return bio; - - if (bio->bi_iter.bi_size == len) - return bio; - - /* - * Don't support partial mappings. - */ - bio_put(bio); - return ERR_PTR(-EINVAL); -} EXPORT_SYMBOL(bio_map_kern); static void bio_copy_kern_endio(struct bio *bio, int err) -- cgit v0.10.2 From 37f19e57a0de3c4a3417aa13ff4d04f1e0dee4b3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 18 Jan 2015 16:16:33 +0100 Subject: block: merge __bio_map_user_iov into bio_map_user_iov And also remove the unused bdev argument. Signed-off-by: Christoph Hellwig Reviewed-by: Ming Lei Signed-off-by: Jens Axboe diff --git a/block/bio.c b/block/bio.c index a69a9c9..0723d4c 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1244,10 +1244,18 @@ out_bmd: return ERR_PTR(ret); } -static struct bio *__bio_map_user_iov(struct request_queue *q, - struct block_device *bdev, - const struct iov_iter *iter, - gfp_t gfp_mask) +/** + * bio_map_user_iov - map user iovec into bio + * @q: the struct request_queue for the bio + * @iter: iovec iterator + * @gfp_mask: memory allocation flags + * + * Map the user space address into a bio suitable for io to a block + * device. Returns an error pointer in case of error. 
+ */ +struct bio *bio_map_user_iov(struct request_queue *q, + const struct iov_iter *iter, + gfp_t gfp_mask) { int j; int nr_pages = 0; @@ -1343,8 +1351,15 @@ static struct bio *__bio_map_user_iov(struct request_queue *q, if (iter->type & WRITE) bio->bi_rw |= REQ_WRITE; - bio->bi_bdev = bdev; bio->bi_flags |= (1 << BIO_USER_MAPPED); + + /* + * subtle -- if __bio_map_user() ended up bouncing a bio, + * it would normally disappear when its bi_end_io is run. + * however, we need it for the unmap, so grab an extra + * reference to it + */ + bio_get(bio); return bio; out_unmap: @@ -1359,37 +1374,6 @@ static struct bio *__bio_map_user_iov(struct request_queue *q, return ERR_PTR(ret); } -/** - * bio_map_user_iov - map user iovec into bio - * @q: the struct request_queue for the bio - * @bdev: destination block device - * @iter: iovec iterator - * @gfp_mask: memory allocation flags - * - * Map the user space address into a bio suitable for io to a block - * device. Returns an error pointer in case of error. - */ -struct bio *bio_map_user_iov(struct request_queue *q, struct block_device *bdev, - const struct iov_iter *iter, - gfp_t gfp_mask) -{ - struct bio *bio; - - bio = __bio_map_user_iov(q, bdev, iter, gfp_mask); - if (IS_ERR(bio)) - return bio; - - /* - * subtle -- if __bio_map_user() ended up bouncing a bio, - * it would normally disappear when its bi_end_io is run. - * however, we need it for the unmap, so grab an extra - * reference to it - */ - bio_get(bio); - - return bio; -} - static void __bio_unmap_user(struct bio *bio) { struct bio_vec *bvec; diff --git a/block/blk-map.c b/block/blk-map.c index 30e6bb8..0f22911 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -88,7 +88,7 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, if (unaligned || (q->dma_pad_mask & iter->count) || map_data) bio = bio_copy_user_iov(q, map_data, iter, gfp_mask); else - bio = bio_map_user_iov(q, NULL, iter, gfp_mask); + bio = bio_map_user_iov(q, iter, gfp_mask); if (IS_ERR(bio)) return PTR_ERR(bio); diff --git a/include/linux/bio.h b/include/linux/bio.h index 0d6105b..da3a127 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -430,7 +430,6 @@ extern int bio_add_pc_page(struct request_queue *, struct bio *, struct page *, extern int bio_get_nr_vecs(struct block_device *); struct rq_map_data; extern struct bio *bio_map_user_iov(struct request_queue *, - struct block_device *, const struct iov_iter *, gfp_t); extern void bio_unmap_user(struct bio *); extern struct bio *bio_map_kern(struct request_queue *, void *, unsigned int, -- cgit v0.10.2 From 9124d3fe21b0947b03f4b87bcfb7acd675d6e85b Mon Sep 17 00:00:00 2001 From: Dongsu Park Date: Sun, 18 Jan 2015 16:16:34 +0100 Subject: block: rewrite and split __bio_copy_iov() Rewrite __bio_copy_iov using the copy_page_{from,to}_iter helpers, and split it into two simpler functions. This commit should contain only literal replacements, without functional changes. 
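As background for this rewrite: copy_page_to_iter() and copy_page_from_iter() (declared in <linux/uio.h>) copy at most the requested number of bytes between a page and an iov_iter, advance the iterator by the amount actually copied, and return that amount. The sketch below is an editorial illustration rather than code from the patch (the function name is made up); it shows the shape shared by the two new helpers, and why passing the iov_iter by value leaves the caller's iterator position untouched.

	#include <linux/bio.h>
	#include <linux/uio.h>

	/*
	 * Illustrative sketch: copy a bio's data out to an iov_iter, one
	 * bio_vec at a time.  A short copy means the iterator ran out of
	 * space or a user page faulted, reported here as -EFAULT.
	 */
	static int sketch_copy_bio_to_iter(struct bio *bio, struct iov_iter iter)
	{
		struct bio_vec *bvec;
		int i;

		bio_for_each_segment_all(bvec, bio, i) {
			size_t copied;

			/* copies at most bv_len bytes and advances 'iter' */
			copied = copy_page_to_iter(bvec->bv_page, bvec->bv_offset,
						   bvec->bv_len, &iter);
			if (copied < bvec->bv_len)
				return -EFAULT;
		}
		return 0;
	}

The real bio_copy_to_iter()/bio_copy_from_iter() in the diff below differ only in also breaking out early once the iterator is exhausted.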
Cc: Kent Overstreet Cc: Jens Axboe Cc: Al Viro Signed-off-by: Dongsu Park [hch: removed the __bio_copy_iov wrapper] Signed-off-by: Christoph Hellwig Reviewed-by: Ming Lei Signed-off-by: Jens Axboe diff --git a/block/bio.c b/block/bio.c index 0723d4c..f66a4ea 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1036,43 +1036,66 @@ static struct bio_map_data *bio_alloc_map_data(unsigned int iov_count, sizeof(struct iovec) * iov_count, gfp_mask); } -static int __bio_copy_iov(struct bio *bio, const struct iov_iter *iter, - int to_user, int from_user) +/** + * bio_copy_from_iter - copy all pages from iov_iter to bio + * @bio: The &struct bio which describes the I/O as destination + * @iter: iov_iter as source + * + * Copy all pages from iov_iter to bio. + * Returns 0 on success, or error on failure. + */ +static int bio_copy_from_iter(struct bio *bio, struct iov_iter iter) { - int ret = 0, i; + int i; struct bio_vec *bvec; - struct iov_iter iov_iter = *iter; bio_for_each_segment_all(bvec, bio, i) { - char *bv_addr = page_address(bvec->bv_page); - unsigned int bv_len = bvec->bv_len; - - while (bv_len && iov_iter.count) { - struct iovec iov = iov_iter_iovec(&iov_iter); - unsigned int bytes = min_t(unsigned int, bv_len, - iov.iov_len); - - if (!ret) { - if (to_user) - ret = copy_to_user(iov.iov_base, - bv_addr, bytes); - - if (from_user) - ret = copy_from_user(bv_addr, - iov.iov_base, - bytes); - - if (ret) - ret = -EFAULT; - } + ssize_t ret; - bv_len -= bytes; - bv_addr += bytes; - iov_iter_advance(&iov_iter, bytes); - } + ret = copy_page_from_iter(bvec->bv_page, + bvec->bv_offset, + bvec->bv_len, + &iter); + + if (!iov_iter_count(&iter)) + break; + + if (ret < bvec->bv_len) + return -EFAULT; } - return ret; + return 0; +} + +/** + * bio_copy_to_iter - copy all pages from bio to iov_iter + * @bio: The &struct bio which describes the I/O as source + * @iter: iov_iter as destination + * + * Copy all pages from bio to iov_iter. + * Returns 0 on success, or error on failure. + */ +static int bio_copy_to_iter(struct bio *bio, struct iov_iter iter) +{ + int i; + struct bio_vec *bvec; + + bio_for_each_segment_all(bvec, bio, i) { + ssize_t ret; + + ret = copy_page_to_iter(bvec->bv_page, + bvec->bv_offset, + bvec->bv_len, + &iter); + + if (!iov_iter_count(&iter)) + break; + + if (ret < bvec->bv_len) + return -EFAULT; + } + + return 0; } static void bio_free_pages(struct bio *bio) @@ -1101,9 +1124,8 @@ int bio_uncopy_user(struct bio *bio) * if we're in a workqueue, the request is orphaned, so * don't copy into a random user address space, just free. */ - if (current->mm) - ret = __bio_copy_iov(bio, &bmd->iter, - bio_data_dir(bio) == READ, 0); + if (current->mm && bio_data_dir(bio) == READ) + ret = bio_copy_to_iter(bio, bmd->iter); if (bmd->is_our_pages) bio_free_pages(bio); } @@ -1228,7 +1250,7 @@ struct bio *bio_copy_user_iov(struct request_queue *q, */ if (((iter->type & WRITE) && (!map_data || !map_data->null_mapped)) || (map_data && map_data->from_user)) { - ret = __bio_copy_iov(bio, iter, 0, 1); + ret = bio_copy_from_iter(bio, *iter); if (ret) goto cleanup; } -- cgit v0.10.2 From 9f9ee1f2b2f94f19437ae2def7c0d6636d7fe02e Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Thu, 5 Feb 2015 10:14:54 -0700 Subject: block: Quiesce zeroout wrapper blkdev_issue_zeroout() printed a warning if a device failed a discard or write same request despite advertising support for these. That's fine for SCSI since we'll disable these commands if we get an error back from the disk saying that they are not supported. 
And consequently the warning only gets printed once. There are other types of block devices that support discard, however, and these may return -EOPNOTSUPP for each command but leave discard enabled in the queue limits. This will cause a warning message for every blkdev_issue_zeroout() invocation. Remove the offending warning messages. Reported-by: Sedat Dilek Signed-off-by: Martin K. Petersen Tested-by: Sedat Dilek Signed-off-by: Jens Axboe diff --git a/block/blk-lib.c b/block/blk-lib.c index 715e948..7688ee3 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -286,7 +286,6 @@ static int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, * @discard: whether to discard the block range * * Description: - * Zero-fill a block range. If the discard flag is set and the block * device guarantees that subsequent READ operations to the block range * in question will return zeroes, the blocks will be discarded. Should @@ -303,26 +302,15 @@ int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, bool discard) { struct request_queue *q = bdev_get_queue(bdev); - unsigned char bdn[BDEVNAME_SIZE]; - - if (discard && blk_queue_discard(q) && q->limits.discard_zeroes_data) { - if (!blkdev_issue_discard(bdev, sector, nr_sects, gfp_mask, 0)) - return 0; - - bdevname(bdev, bdn); - pr_warn("%s: DISCARD failed. Manually zeroing.\n", bdn); - } + if (discard && blk_queue_discard(q) && q->limits.discard_zeroes_data && + blkdev_issue_discard(bdev, sector, nr_sects, gfp_mask, 0) == 0) + return 0; - if (bdev_write_same(bdev)) { - - if (!blkdev_issue_write_same(bdev, sector, nr_sects, gfp_mask, - ZERO_PAGE(0))) - return 0; - - bdevname(bdev, bdn); - pr_warn("%s: WRITE SAME failed. Manually zeroing.\n", bdn); - } + if (bdev_write_same(bdev) && + blkdev_issue_write_same(bdev, sector, nr_sects, gfp_mask, + ZERO_PAGE(0)) == 0) + return 0; return __blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask); } -- cgit v0.10.2 From 69abaffec7d47a083739b79e3066cb3730eba72e Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Mon, 9 Feb 2015 16:42:49 +0300 Subject: cfq-iosched: handle failure of cfq group allocation Cfq_lookup_create_cfqg() allocates struct blkcg_gq using GFP_ATOMIC. In cfq_find_alloc_queue() possible allocation failure is not handled. As a result kernel oopses on NULL pointer dereference when cfq_link_cfqq_cfqg() calls cfqg_get() for NULL pointer. Bug was introduced in v3.5 in commit cd1604fab4f9 ("blkcg: factor out blkio_group creation"). Prior to that commit cfq group lookup had returned pointer to root group as fallback. This patch handles this error using existing fallback oom_cfqq. 
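Reduced to its essentials, the fix follows the standard pattern for GFP_ATOMIC allocations: the result must be checked, and on failure the code should fall back to a preallocated emergency object instead of dereferencing NULL. Below is a minimal sketch with hypothetical example_* names; it shows the general idiom, not code taken from cfq.

	#include <linux/slab.h>

	struct example_group {
		int weight;
	};

	struct example_ctx {
		struct example_group oom_group;	/* embedded fallback, always valid */
	};

	/*
	 * Sketch: try an atomic allocation and, if it fails, hand back the
	 * embedded fallback object rather than NULL.
	 */
	static struct example_group *example_get_group(struct example_ctx *ctx)
	{
		struct example_group *grp;

		grp = kzalloc(sizeof(*grp), GFP_ATOMIC);
		if (!grp)
			return &ctx->oom_group;	/* degrade gracefully, don't oops */
		return grp;
	}

In the patch below the role of the embedded fallback is played by cfqd->oom_cfqq.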
Signed-off-by: Konstantin Khlebnikov Acked-by: Tejun Heo Acked-by: Vivek Goyal Fixes: cd1604fab4f9 ("blkcg: factor out blkio_group creation") Cc: stable@kernel.org Signed-off-by: Jens Axboe diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index b9abdca..5da8e6e 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -3590,6 +3590,11 @@ retry: blkcg = bio_blkcg(bio); cfqg = cfq_lookup_create_cfqg(cfqd, blkcg); + if (!cfqg) { + cfqq = &cfqd->oom_cfqq; + goto out; + } + cfqq = cic_to_cfqq(cic, is_sync); /* @@ -3626,7 +3631,7 @@ retry: } else cfqq = &cfqd->oom_cfqq; } - +out: if (new_cfqq) kmem_cache_free(cfq_pool, new_cfqq); -- cgit v0.10.2 From db507b3ffd9b7a1c87e732ac6e2c3a5d0babb15a Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Mon, 9 Feb 2015 12:21:54 -0500 Subject: dm: fix multipath regression due to initializing wrong request Commit febf715 ("block: require blk_rq_prep_clone() be given an initialized clone request") introduced a regression by calling blk_rq_init() on the original request rather than the clone request that is passed to setup_clone(). Signed-off-by: Mike Snitzer Fixes: febf71588c2a ("block: require blk_rq_prep_clone() be given an initialized clone request") Signed-off-by: Jens Axboe diff --git a/drivers/md/dm.c b/drivers/md/dm.c index f251633..71e6b73 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1719,7 +1719,7 @@ static int setup_clone(struct request *clone, struct request *rq, { int r; - blk_rq_init(NULL, rq); + blk_rq_init(NULL, clone); r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC, dm_rq_bio_constructor, tio); if (r) -- cgit v0.10.2 From 201f201c33220f53856fd300e1990b779538d67f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 10 Feb 2015 13:31:34 -0700 Subject: blk-mq: make blk_mq_run_queues() static We no longer use it outside of blk-mq.c, so we can make it static and stop exporting it. Additionally, kill the 'async' argument, as there's only one used of it. 
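One detail the diff below makes visible: blk_mq_run_queues() is still called from blk_mq_freeze_queue_start(), which sits above its definition in blk-mq.c, so making it static requires the forward declaration added near the top of the file. A generic sketch of that idiom, with made-up names:

	/*
	 * Forward declaration: lets code earlier in the file call a static
	 * function that is only defined further down.
	 */
	static void run_all_queues(void);

	static void freeze_start(void)
	{
		run_all_queues();	/* used before its definition */
	}

	static void run_all_queues(void)
	{
		/* ... iterate the hardware queues and run each one ... */
	}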
Signed-off-by: Jens Axboe diff --git a/block/blk-mq.c b/block/blk-mq.c index eb8e694..1e4d459 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -33,6 +33,7 @@ static DEFINE_MUTEX(all_q_mutex); static LIST_HEAD(all_q_list); static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx); +static void blk_mq_run_queues(struct request_queue *q); /* * Check if any of the ctx's have pending work in this hardware queue @@ -117,7 +118,7 @@ void blk_mq_freeze_queue_start(struct request_queue *q) if (freeze) { percpu_ref_kill(&q->mq_usage_counter); - blk_mq_run_queues(q, false); + blk_mq_run_queues(q); } } EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_start); @@ -853,7 +854,7 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) &hctx->run_work, 0); } -void blk_mq_run_queues(struct request_queue *q, bool async) +static void blk_mq_run_queues(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; int i; @@ -864,10 +865,9 @@ void blk_mq_run_queues(struct request_queue *q, bool async) test_bit(BLK_MQ_S_STOPPED, &hctx->state)) continue; - blk_mq_run_hw_queue(hctx, async); + blk_mq_run_hw_queue(hctx, false); } } -EXPORT_SYMBOL(blk_mq_run_queues); void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx) { @@ -905,7 +905,6 @@ void blk_mq_start_hw_queues(struct request_queue *q) } EXPORT_SYMBOL(blk_mq_start_hw_queues); - void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async) { struct blk_mq_hw_ctx *hctx; diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 86b08b1..ac6c7f5 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -175,7 +175,6 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set); void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule); void blk_mq_insert_request(struct request *, bool, bool, bool); -void blk_mq_run_queues(struct request_queue *q, bool async); void blk_mq_free_request(struct request *rq); void blk_mq_free_hctx_request(struct blk_mq_hw_ctx *, struct request *rq); bool blk_mq_can_queue(struct blk_mq_hw_ctx *); -- cgit v0.10.2 From 854fbb9c699e34fe4889e6907c4fc73889192223 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 11 Feb 2015 08:20:13 -0700 Subject: block: prevent request-to-request merging with gaps if not allowed If the queue has SG_GAPS set, we must not merge across an sg gap. This is caught for the bio case, but currently not for the more rare case of merging two requests directly. Signed-off-by: Keith Busch Cut the dm bits, those will go through the dm tree, and fixed the test_bit() test. Signed-off-by: Jens Axboe diff --git a/block/blk-merge.c b/block/blk-merge.c index 89b97b5..9476b15 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -385,6 +385,14 @@ static bool req_no_special_merge(struct request *req) return !q->mq_ops && req->special; } +static int req_gap_to_prev(struct request *req, struct request *next) +{ + struct bio *prev = req->biotail; + + return bvec_gap_to_prev(&prev->bi_io_vec[prev->bi_vcnt - 1], + next->bio->bi_io_vec[0].bv_offset); +} + static int ll_merge_requests_fn(struct request_queue *q, struct request *req, struct request *next) { @@ -399,6 +407,10 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req, if (req_no_special_merge(req) || req_no_special_merge(next)) return 0; + if (test_bit(QUEUE_FLAG_SG_GAPS, &q->queue_flags) && + req_gap_to_prev(req, next)) + return 0; + /* * Will it become too large? 
*/ -- cgit v0.10.2 From 564e559f2baf6a868768d0cac286980b3cfd6e30 Mon Sep 17 00:00:00 2001 From: Tony Battersby Date: Wed, 11 Feb 2015 11:32:30 -0500 Subject: blk-mq: fix double-free in error path If the allocation of bt->bs fails, then bt->map can be freed twice, once in blk_mq_init_bitmap_tags() -> bt_alloc(), and once in blk_mq_init_bitmap_tags() -> bt_free(). Fix by setting the pointer to NULL after the first free. Cc: Signed-off-by: Tony Battersby Signed-off-by: Jens Axboe diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index e3387a7..d53a764 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -524,6 +524,7 @@ static int bt_alloc(struct blk_mq_bitmap_tags *bt, unsigned int depth, bt->bs = kzalloc(BT_WAIT_QUEUES * sizeof(*bt->bs), GFP_KERNEL); if (!bt->bs) { kfree(bt->map); + bt->map = NULL; return -ENOMEM; } -- cgit v0.10.2 From a0763b27bf24f028d8b049ad856b2ee031fb475d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 11 Feb 2015 14:07:49 +0100 Subject: block: handle the null_mapped flag correctly in blk_rq_map_user_iov The tape drivers (and the sg driver in a special case that doesn't matter here) use the null_mapped flag to tell blk_rq_map_user to not copy around any data into or out of the bounce buffers. blk_rq_map_user_iov never got that treatment, which didn't matter until I refactored blk_rq_map_user to be implemented in terms of blk_rq_map_user_iov. Signed-off-by: Christoph Hellwig Fixes: ddad8dd0a162 ("block: use blk_rq_map_user_iov to implement blk_rq_map_user") Signed-off-by: Jens Axboe diff --git a/block/blk-map.c b/block/blk-map.c index 0f22911..b8d2725 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -93,6 +93,9 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, if (IS_ERR(bio)) return PTR_ERR(bio); + if (map_data && map_data->null_mapped) + bio->bi_flags |= (1 << BIO_NULL_MAPPED); + if (bio->bi_iter.bi_size != iter->count) { /* * Grab an extra reference to this bio, as bio_unmap_user() -- cgit v0.10.2 From d427e3c82ef4fc5fbb22c0cef0b040e6767b1028 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 11 Feb 2015 14:07:50 +0100 Subject: block: remove unused function blk_bio_map_sg Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe diff --git a/block/blk-merge.c b/block/blk-merge.c index 9476b15..fc1ff3b 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -283,35 +283,6 @@ int blk_rq_map_sg(struct request_queue *q, struct request *rq, } EXPORT_SYMBOL(blk_rq_map_sg); -/** - * blk_bio_map_sg - map a bio to a scatterlist - * @q: request_queue in question - * @bio: bio being mapped - * @sglist: scatterlist being mapped - * - * Note: - * Caller must make sure sg can hold bio->bi_phys_segments entries - * - * Will return the number of sg entries setup - */ -int blk_bio_map_sg(struct request_queue *q, struct bio *bio, - struct scatterlist *sglist) -{ - struct scatterlist *sg = NULL; - int nsegs; - struct bio *next = bio->bi_next; - bio->bi_next = NULL; - - nsegs = __blk_bios_map_sg(q, bio, sglist, &sg); - bio->bi_next = next; - if (sg) - sg_mark_end(sg); - - BUG_ON(bio->bi_phys_segments && nsegs > bio->bi_phys_segments); - return nsegs; -} -EXPORT_SYMBOL(blk_bio_map_sg); - static inline int ll_new_hw_segment(struct request_queue *q, struct request *req, struct bio *bio) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index bf4ef66..7f9a516 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1049,8 +1049,6 @@ extern void blk_queue_flush_queueable(struct request_queue *q, bool 
queueable); extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev); extern int blk_rq_map_sg(struct request_queue *, struct request *, struct scatterlist *); -extern int blk_bio_map_sg(struct request_queue *q, struct bio *bio, - struct scatterlist *sglist); extern void blk_dump_rq_flags(struct request *, char *); extern long nr_blockdev_pages(void); -- cgit v0.10.2
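With blk_bio_map_sg() gone, drivers build their scatterlists per request with blk_rq_map_sg(), which walks every bio in the request. The sketch below shows the usual calling pattern; the wrapper function is hypothetical, only sg_init_table(), queue_max_segments() and blk_rq_map_sg() are real kernel APIs, and the sg table is assumed to be sized by the caller from the queue limits.

	#include <linux/blkdev.h>
	#include <linux/scatterlist.h>

	/*
	 * Illustrative sketch: map all data in 'rq' into a driver-owned
	 * scatterlist.  'sgl' must have room for at least
	 * queue_max_segments(q) entries.
	 */
	static int sketch_map_request(struct request_queue *q, struct request *rq,
				      struct scatterlist *sgl)
	{
		int nents;

		sg_init_table(sgl, queue_max_segments(q));
		nents = blk_rq_map_sg(q, rq, sgl);	/* returns sg entries used */
		if (nents <= 0)
			return -EIO;

		/* hand 'sgl' and 'nents' to dma_map_sg() / the device here */
		return nents;
	}

This mirrors how in-tree users such as the SCSI midlayer already consume blk_rq_map_sg(), which is why the bio-level variant had no callers left.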