From ff06db1efb2ad6db06eb5b99b88a0c15a9cc9b0e Mon Sep 17 00:00:00 2001
From: Jiri Kosina
Date: Thu, 16 Jun 2016 09:53:58 +0200
Subject: floppy: fix open(O_ACCMODE) for ioctl-only open

Commit 09954bad4 ("floppy: refactor open() flags handling"), as a
side-effect, causes open(/dev/fdX, O_ACCMODE) to fail. It turns out that
this is being used by setfdprm userspace for ioctl-only open().

Reintroduce the original behavior with respect to
!(FMODE_READ|FMODE_WRITE) modes, while still keeping the original
O_NDELAY bug fixed.

Cc: stable@vger.kernel.org # v4.5+
Reported-by: Wim Osterholt
Tested-by: Wim Osterholt
Signed-off-by: Jiri Kosina
Signed-off-by: Jens Axboe

diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index c557057..b71a9c7 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -3663,11 +3663,6 @@ static int floppy_open(struct block_device *bdev, fmode_t mode)
 	opened_bdev[drive] = bdev;
-	if (!(mode & (FMODE_READ|FMODE_WRITE))) {
-		res = -EINVAL;
-		goto out;
-	}
-
 	res = -ENXIO;
 	if (!floppy_track_buffer) {
@@ -3711,13 +3706,15 @@ static int floppy_open(struct block_device *bdev, fmode_t mode)
 	if (UFDCS->rawcmd == 1)
 		UFDCS->rawcmd = 2;
-	UDRS->last_checked = 0;
-	clear_bit(FD_OPEN_SHOULD_FAIL_BIT, &UDRS->flags);
-	check_disk_change(bdev);
-	if (test_bit(FD_DISK_CHANGED_BIT, &UDRS->flags))
-		goto out;
-	if (test_bit(FD_OPEN_SHOULD_FAIL_BIT, &UDRS->flags))
-		goto out;
+	if (mode & (FMODE_READ|FMODE_WRITE)) {
+		UDRS->last_checked = 0;
+		clear_bit(FD_OPEN_SHOULD_FAIL_BIT, &UDRS->flags);
+		check_disk_change(bdev);
+		if (test_bit(FD_DISK_CHANGED_BIT, &UDRS->flags))
+			goto out;
+		if (test_bit(FD_OPEN_SHOULD_FAIL_BIT, &UDRS->flags))
+			goto out;
+	}
 	res = -EROFS;
-- cgit v0.10.2

From dc5ff2b1d66f21c27a4c37236636dff6946437e4 Mon Sep 17 00:00:00 2001
From: Jan Kara
Date: Tue, 26 Jul 2016 11:38:20 +0200
Subject: writeback: Write dirty times for WB_SYNC_ALL writeback

Currently we take care to handle I_DIRTY_TIME in vfs_fsync() and
queue_io() so that inodes which have only dirty timestamps are properly
written on fsync(2) and sync(2). However there are other call sites -
most notably going through write_inode_now() - which expect the inode to
be clean after WB_SYNC_ALL writeback. This is not currently true, as we
do not clear I_DIRTY_TIME in __writeback_single_inode() even for
WB_SYNC_ALL writeback in all cases. This then resulted in the following
oops because bdev_write_inode() did not clean the inode and the writeback
code later stumbled over a dirty inode with a detached wb.
general protection fault: 0000 [#1] SMP DEBUG_PAGEALLOC KASAN Modules linked in: CPU: 3 PID: 32 Comm: kworker/u10:1 Not tainted 4.6.0-rc3+ #349 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 Workqueue: writeback wb_workfn (flush-11:0) task: ffff88006ccf1840 ti: ffff88006cda8000 task.ti: ffff88006cda8000 RIP: 0010:[] [] locked_inode_to_wb_and_lock_list+0xa2/0x750 RSP: 0018:ffff88006cdaf7d0 EFLAGS: 00010246 RAX: 0000000000000000 RBX: 0000000000000000 RCX: ffff88006ccf2050 RDX: 0000000000000000 RSI: 000000114c8a8484 RDI: 0000000000000286 RBP: ffff88006cdaf820 R08: ffff88006ccf1840 R09: 0000000000000000 R10: 000229915090805f R11: 0000000000000001 R12: ffff88006a72f5e0 R13: dffffc0000000000 R14: ffffed000d4e5eed R15: ffffffff8830cf40 FS: 0000000000000000(0000) GS:ffff88006d500000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000003301bf8 CR3: 000000006368f000 CR4: 00000000000006e0 DR0: 0000000000001ec9 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000600 Stack: ffff88006a72f680 ffff88006a72f768 ffff8800671230d8 03ff88006cdaf948 ffff88006a72f668 ffff88006a72f5e0 ffff8800671230d8 ffff88006cdaf948 ffff880065b90cc8 ffff880067123100 ffff88006cdaf970 ffffffff8188e12e Call Trace: [< inline >] inode_to_wb_and_lock_list fs/fs-writeback.c:309 [] writeback_sb_inodes+0x4de/0x1250 fs/fs-writeback.c:1554 [] __writeback_inodes_wb+0x104/0x1e0 fs/fs-writeback.c:1600 [] wb_writeback+0x7ce/0xc90 fs/fs-writeback.c:1709 [< inline >] wb_do_writeback fs/fs-writeback.c:1844 [] wb_workfn+0x2f9/0x1000 fs/fs-writeback.c:1884 [] process_one_work+0x78e/0x15c0 kernel/workqueue.c:2094 [] worker_thread+0xdb/0xfc0 kernel/workqueue.c:2228 [] kthread+0x23f/0x2d0 drivers/block/aoe/aoecmd.c:1303 [] ret_from_fork+0x22/0x50 arch/x86/entry/entry_64.S:392 Code: 05 94 4a a8 06 85 c0 0f 85 03 03 00 00 e8 07 15 d0 ff 41 80 3e 00 0f 85 64 06 00 00 49 8b 9c 24 88 01 00 00 48 89 d8 48 c1 e8 03 <42> 80 3c 28 00 0f 85 17 06 00 00 48 8b 03 48 83 c0 50 48 39 c3 RIP [< inline >] wb_get include/linux/backing-dev-defs.h:212 RIP [] locked_inode_to_wb_and_lock_list+0xa2/0x750 fs/fs-writeback.c:281 RSP ---[ end trace 986a4d314dcb2694 ]--- Fix the problem by making sure __writeback_single_inode() writes inode only with dirty times in WB_SYNC_ALL mode. Reported-by: Dmitry Vyukov Tested-by: Laurent Dufour Signed-off-by: Jan Kara Signed-off-by: Jens Axboe diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 56c8fda..4d09d44 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1327,6 +1327,7 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) dirty = inode->i_state & I_DIRTY; if (inode->i_state & I_DIRTY_TIME) { if ((dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) || + wbc->sync_mode == WB_SYNC_ALL || unlikely(inode->i_state & I_DIRTY_TIME_EXPIRED) || unlikely(time_after(jiffies, (inode->dirtied_time_when + -- cgit v0.10.2 From bfd279a868987e4bca7dc72a029e0fc50316d6df Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Tue, 26 Jul 2016 17:13:42 +0800 Subject: blkcg: kill unused field nr_undestroyed_grps 'nr_undestroyed_grps' in struct throtl_data was used to count the number of throtl_grp related with throtl_data, but now throtl_grp is tracked by blkcg_gq, so it is useless anymore. 
Signed-off-by: Hou Tao Signed-off-by: Jens Axboe diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 47a3e54..c5494e4 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -145,11 +145,6 @@ struct throtl_data /* Total Number of queued bios on READ and WRITE lists */ unsigned int nr_queued[2]; - /* - * number of total undestroyed groups - */ - unsigned int nr_undestroyed_grps; - /* Work for dispatching throttled bios */ struct work_struct dispatch_work; }; -- cgit v0.10.2 From 20bd723ec6a3261df5e02250cd3a1fbb09a343f2 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Wed, 27 Jul 2016 07:22:05 +0200 Subject: block: add missing group association in bio-cloning functions When a bio is cloned, the newly created bio must be associated with the same blkcg as the original bio (if BLK_CGROUP is enabled). If this operation is not performed, then the new bio is not associated with any group, and the group of the current task is returned when the group of the bio is requested. Depending on the cloning frequency, this may cause a large percentage of the bios belonging to a given group to be treated as if belonging to other groups (in most cases as if belonging to the root group). The expected group isolation may thereby be broken. This commit adds the missing association in bio-cloning functions. Fixes: da2f0f74cf7d ("Btrfs: add support for blkio controllers") Cc: stable@vger.kernel.org # v4.3+ Signed-off-by: Paolo Valente Reviewed-by: Nikolay Borisov Reviewed-by: Jeff Moyer Acked-by: Tejun Heo Signed-off-by: Jens Axboe diff --git a/block/bio.c b/block/bio.c index 54ee384..3f76a38 100644 --- a/block/bio.c +++ b/block/bio.c @@ -583,6 +583,8 @@ void __bio_clone_fast(struct bio *bio, struct bio *bio_src) bio->bi_rw = bio_src->bi_rw; bio->bi_iter = bio_src->bi_iter; bio->bi_io_vec = bio_src->bi_io_vec; + + bio_clone_blkcg_association(bio, bio_src); } EXPORT_SYMBOL(__bio_clone_fast); @@ -687,6 +689,8 @@ integrity_clone: } } + bio_clone_blkcg_association(bio, bio_src); + return bio; } EXPORT_SYMBOL(bio_clone_bioset); @@ -2004,6 +2008,17 @@ void bio_disassociate_task(struct bio *bio) } } +/** + * bio_clone_blkcg_association - clone blkcg association from src to dst bio + * @dst: destination bio + * @src: source bio + */ +void bio_clone_blkcg_association(struct bio *dst, struct bio *src) +{ + if (src->bi_css) + WARN_ON(bio_associate_blkcg(dst, src->bi_css)); +} + #endif /* CONFIG_BLK_CGROUP */ static void __init biovec_init_slabs(void) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index cee4cb9..5850d79 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2697,12 +2697,6 @@ struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask) btrfs_bio->csum = NULL; btrfs_bio->csum_allocated = NULL; btrfs_bio->end_io = NULL; - -#ifdef CONFIG_BLK_CGROUP - /* FIXME, put this into bio_clone_bioset */ - if (bio->bi_css) - bio_associate_blkcg(new, bio->bi_css); -#endif } return new; } diff --git a/include/linux/bio.h b/include/linux/bio.h index 583c108..e09a889 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -470,11 +470,14 @@ extern unsigned int bvec_nr_vecs(unsigned short idx); int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css); int bio_associate_current(struct bio *bio); void bio_disassociate_task(struct bio *bio); +void bio_clone_blkcg_association(struct bio *dst, struct bio *src); #else /* CONFIG_BLK_CGROUP */ static inline int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css) { return 0; } static inline int 
bio_associate_current(struct bio *bio) { return -ENOENT; } static inline void bio_disassociate_task(struct bio *bio) { } +static inline void bio_clone_blkcg_association(struct bio *dst, + struct bio *src) { } #endif /* CONFIG_BLK_CGROUP */ #ifdef CONFIG_HIGHMEM -- cgit v0.10.2 From 1aee6b9a7d947d42ed84baa4cf461e9d943b80f0 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 27 Jul 2016 12:52:21 -0600 Subject: f2fs: drop bio->bi_rw manual assignment Merge 4fc29c1aa375 included this extra line, but it's not needed (or useful) since we'll bio_set_op_attrs() right after to properly set the op and flags for the bio. Signed-off-by: Jens Axboe diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index e262427..d64d2a5 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -245,7 +245,6 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) bio_put(bio); return -EFAULT; } - bio->bi_rw = fio->op_flags; bio_set_op_attrs(bio, fio->op, fio->op_flags); __submit_bio(fio->sbi, bio, fio->type); -- cgit v0.10.2 From 77da160530dd1dc94f6ae15a981f24e5f0021e84 Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Fri, 29 Jul 2016 10:40:31 +0200 Subject: block: fix use-after-free in seq file I got a KASAN report of use-after-free: ================================================================== BUG: KASAN: use-after-free in klist_iter_exit+0x61/0x70 at addr ffff8800b6581508 Read of size 8 by task trinity-c1/315 ============================================================================= BUG kmalloc-32 (Not tainted): kasan: bad access detected ----------------------------------------------------------------------------- Disabling lock debugging due to kernel taint INFO: Allocated in disk_seqf_start+0x66/0x110 age=144 cpu=1 pid=315 ___slab_alloc+0x4f1/0x520 __slab_alloc.isra.58+0x56/0x80 kmem_cache_alloc_trace+0x260/0x2a0 disk_seqf_start+0x66/0x110 traverse+0x176/0x860 seq_read+0x7e3/0x11a0 proc_reg_read+0xbc/0x180 do_loop_readv_writev+0x134/0x210 do_readv_writev+0x565/0x660 vfs_readv+0x67/0xa0 do_preadv+0x126/0x170 SyS_preadv+0xc/0x10 do_syscall_64+0x1a1/0x460 return_from_SYSCALL_64+0x0/0x6a INFO: Freed in disk_seqf_stop+0x42/0x50 age=160 cpu=1 pid=315 __slab_free+0x17a/0x2c0 kfree+0x20a/0x220 disk_seqf_stop+0x42/0x50 traverse+0x3b5/0x860 seq_read+0x7e3/0x11a0 proc_reg_read+0xbc/0x180 do_loop_readv_writev+0x134/0x210 do_readv_writev+0x565/0x660 vfs_readv+0x67/0xa0 do_preadv+0x126/0x170 SyS_preadv+0xc/0x10 do_syscall_64+0x1a1/0x460 return_from_SYSCALL_64+0x0/0x6a CPU: 1 PID: 315 Comm: trinity-c1 Tainted: G B 4.7.0+ #62 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 ffffea0002d96000 ffff880119b9f918 ffffffff81d6ce81 ffff88011a804480 ffff8800b6581500 ffff880119b9f948 ffffffff8146c7bd ffff88011a804480 ffffea0002d96000 ffff8800b6581500 fffffffffffffff4 ffff880119b9f970 Call Trace: [] dump_stack+0x65/0x84 [] print_trailer+0x10d/0x1a0 [] object_err+0x2f/0x40 [] kasan_report_error+0x221/0x520 [] __asan_report_load8_noabort+0x3e/0x40 [] klist_iter_exit+0x61/0x70 [] class_dev_iter_exit+0x9/0x10 [] disk_seqf_stop+0x3a/0x50 [] seq_read+0x4b2/0x11a0 [] proc_reg_read+0xbc/0x180 [] do_loop_readv_writev+0x134/0x210 [] do_readv_writev+0x565/0x660 [] vfs_readv+0x67/0xa0 [] do_preadv+0x126/0x170 [] SyS_preadv+0xc/0x10 This problem can occur in the following situation: open() - pread() - .seq_start() - iter = kmalloc() // succeeds - seqf->private = iter - .seq_stop() - kfree(seqf->private) - pread() - .seq_start() - iter = kmalloc() // fails - .seq_stop() - class_dev_iter_exit(seqf->private) // 
boom! old pointer As the comment in disk_seqf_stop() says, stop is called even if start failed, so we need to reinitialise the private pointer to NULL when seq iteration stops. An alternative would be to set the private pointer to NULL when the kmalloc() in disk_seqf_start() fails. Cc: stable@vger.kernel.org Signed-off-by: Vegard Nossum Acked-by: Tejun Heo Signed-off-by: Jens Axboe diff --git a/block/genhd.c b/block/genhd.c index 3c9dede..0ad8796 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -856,6 +856,7 @@ static void disk_seqf_stop(struct seq_file *seqf, void *v) if (iter) { class_dev_iter_exit(iter); kfree(iter); + seqf->private = NULL; } } -- cgit v0.10.2 From 97240963eb308d8d21a89c0459822f7ea98463b4 Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Fri, 27 May 2016 12:59:35 +0200 Subject: nbd: fix race in ioctl Quentin ran into this bug: WARNING: CPU: 64 PID: 10085 at fs/sysfs/dir.c:31 sysfs_warn_dup+0x65/0x80 sysfs: cannot create duplicate filename '/devices/virtual/block/nbd3/pid' Modules linked in: nbd CPU: 64 PID: 10085 Comm: qemu-nbd Tainted: G D 4.6.0+ #7 0000000000000000 ffff8820330bba68 ffffffff814b8791 ffff8820330bbac8 0000000000000000 ffff8820330bbab8 ffffffff810d04ab ffff8820330bbaa8 0000001f00000296 0000000000017681 ffff8810380bf000 ffffffffa0001790 Call Trace: [] dump_stack+0x4d/0x6c [] __warn+0xdb/0x100 [] warn_slowpath_fmt+0x44/0x50 [] sysfs_warn_dup+0x65/0x80 [] sysfs_add_file_mode_ns+0x172/0x180 [] sysfs_create_file_ns+0x25/0x30 [] device_create_file+0x36/0x90 [] __nbd_ioctl+0x32d/0x9b0 [nbd] [] ? find_next_bit+0x18/0x20 [] ? select_idle_sibling+0xe9/0x120 [] ? __enqueue_entity+0x67/0x70 [] ? enqueue_task_fair+0x630/0xe20 [] ? resched_curr+0x36/0x70 [] ? check_preempt_curr+0x78/0x90 [] ? ttwu_do_wakeup+0x12/0x80 [] ? ttwu_do_activate.constprop.86+0x61/0x70 [] ? try_to_wake_up+0x185/0x2d0 [] ? default_wake_function+0xd/0x10 [] ? autoremove_wake_function+0x11/0x40 [] nbd_ioctl+0x67/0x94 [nbd] [] blkdev_ioctl+0x14d/0x940 [] ? put_pipe_info+0x22/0x60 [] block_ioctl+0x3c/0x40 [] do_vfs_ioctl+0x8d/0x5e0 [] ? ____fput+0x9/0x10 [] ? task_work_run+0x72/0x90 [] SyS_ioctl+0x47/0x80 [] entry_SYSCALL_64_fastpath+0x17/0x93 ---[ end trace 7899b295e4f850c8 ]--- It seems fairly obvious that device_create_file() is not being protected from being run concurrently on the same nbd. Quentin found the following relevant commits: 1a2ad21 nbd: add locking to nbd_ioctl 90b8f28 [PATCH] end of methods switch: remove the old ones d4430d6 [PATCH] beginning of methods conversion 08f8585 [PATCH] move block_device_operations to blkdev.h It would seem that the race was introduced in the process of moving nbd from BKL to unlocked ioctls. By setting nbd->task_recv while the mutex is held, we can prevent other processes from running concurrently (since nbd->task_recv is also checked while the mutex is held). 
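The fix follows a common claim-under-lock pattern: the claim is both set and tested while the same mutex is held, so the slow, unlocked setup work can never be entered twice concurrently. A minimal userspace sketch of that pattern (hypothetical names, not the actual nbd code; in the real patch the claim is nbd->task_recv = current under nbd->tx_lock):

/*
 * Claim-under-lock sketch (hypothetical names, userspace analogue of the
 * nbd fix): the claim is set and checked under the same mutex, so two
 * callers can never both reach the slow, unlocked setup step.
 */
#include <errno.h>
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int claimed;	/* stands in for nbd->task_recv */

static int start_receiver(void)
{
	pthread_mutex_lock(&lock);
	if (claimed) {			/* someone already claimed the device */
		pthread_mutex_unlock(&lock);
		return -EBUSY;
	}
	claimed = 1;			/* claim before dropping the lock */
	pthread_mutex_unlock(&lock);

	/* slow setup (sysfs file creation in the nbd case) runs unlocked */
	printf("receiver started\n");
	return 0;
}

static void stop_receiver(void)
{
	pthread_mutex_lock(&lock);
	claimed = 0;			/* release the claim under the lock */
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	start_receiver();		/* first caller wins */
	if (start_receiver() == -EBUSY)	/* second caller sees the claim and bails */
		fprintf(stderr, "already claimed\n");
	stop_receiver();
	return 0;
}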
Reported-and-tested-by: Quentin Casasnovas Cc: Markus Pargmann Cc: Paul Clements Cc: Pavel Machek Cc: Jens Axboe Cc: Al Viro Signed-off-by: Vegard Nossum Signed-off-by: Jens Axboe diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 6f55b26..a9e3980 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -451,14 +451,9 @@ static int nbd_thread_recv(struct nbd_device *nbd, struct block_device *bdev) sk_set_memalloc(nbd->sock->sk); - nbd->task_recv = current; - ret = device_create_file(disk_to_dev(nbd->disk), &pid_attr); if (ret) { dev_err(disk_to_dev(nbd->disk), "device_create_file failed!\n"); - - nbd->task_recv = NULL; - return ret; } @@ -477,9 +472,6 @@ static int nbd_thread_recv(struct nbd_device *nbd, struct block_device *bdev) nbd_size_clear(nbd, bdev); device_remove_file(disk_to_dev(nbd->disk), &pid_attr); - - nbd->task_recv = NULL; - return ret; } @@ -788,6 +780,8 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, if (!nbd->sock) return -EINVAL; + /* We have to claim the device under the lock */ + nbd->task_recv = current; mutex_unlock(&nbd->tx_lock); nbd_parse_flags(nbd, bdev); @@ -796,6 +790,7 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, nbd_name(nbd)); if (IS_ERR(thread)) { mutex_lock(&nbd->tx_lock); + nbd->task_recv = NULL; return PTR_ERR(thread); } @@ -805,6 +800,7 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, kthread_stop(thread); mutex_lock(&nbd->tx_lock); + nbd->task_recv = NULL; sock_shutdown(nbd); nbd_clear_que(nbd); -- cgit v0.10.2 From 71f79fb3179e69b0c1448a2101a866d871c66e7f Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Mon, 1 Aug 2016 08:23:39 -0600 Subject: blk-mq: Allow timeouts to run while queue is freezing In case a submitted request gets stuck for some reason, the block layer can prevent the request starvation by starting the scheduled timeout work. If this stuck request occurs at the same time another thread has started a queue freeze, the blk_mq_timeout_work will not be able to acquire the queue reference and will return silently, thus not issuing the timeout. But since the request is already holding a q_usage_counter reference and is unable to complete, it will never release its reference, preventing the queue from completing the freeze started by first thread. This puts the request_queue in a hung state, forever waiting for the freeze completion. This was observed while running IO to a NVMe device at the same time we toggled the CPU hotplug code. Eventually, once a request got stuck requiring a timeout during a queue freeze, we saw the CPU Hotplug notification code get stuck inside blk_mq_freeze_queue_wait, as shown in the trace below. 
[c000000deaf13690] [c000000deaf13738] 0xc000000deaf13738 (unreliable)
[c000000deaf13860] [c000000000015ce8] __switch_to+0x1f8/0x350
[c000000deaf138b0] [c000000000ade0e4] __schedule+0x314/0x990
[c000000deaf13940] [c000000000ade7a8] schedule+0x48/0xc0
[c000000deaf13970] [c0000000005492a4] blk_mq_freeze_queue_wait+0x74/0x110
[c000000deaf139e0] [c00000000054b6a8] blk_mq_queue_reinit_notify+0x1a8/0x2e0
[c000000deaf13a40] [c0000000000e7878] notifier_call_chain+0x98/0x100
[c000000deaf13a90] [c0000000000b8e08] cpu_notify_nofail+0x48/0xa0
[c000000deaf13ac0] [c0000000000b92f0] _cpu_down+0x2a0/0x400
[c000000deaf13b90] [c0000000000b94a8] cpu_down+0x58/0xa0
[c000000deaf13bc0] [c0000000006d5dcc] cpu_subsys_offline+0x2c/0x50
[c000000deaf13bf0] [c0000000006cd244] device_offline+0x104/0x140
[c000000deaf13c30] [c0000000006cd40c] online_store+0x6c/0xc0
[c000000deaf13c80] [c0000000006c8c78] dev_attr_store+0x68/0xa0
[c000000deaf13cc0] [c0000000003974d0] sysfs_kf_write+0x80/0xb0
[c000000deaf13d00] [c0000000003963e8] kernfs_fop_write+0x188/0x200
[c000000deaf13d50] [c0000000002e0f6c] __vfs_write+0x6c/0xe0
[c000000deaf13d90] [c0000000002e1ca0] vfs_write+0xc0/0x230
[c000000deaf13de0] [c0000000002e2cdc] SyS_write+0x6c/0x110
[c000000deaf13e30] [c000000000009204] system_call+0x38/0xb4

The fix is to allow the timeout work to execute in the window between
dropping the initial refcount reference and the release of the last
reference, which actually marks the freeze completion. This can be
achieved with percpu_ref_tryget, which does not require the counter to
be alive. This way the timeout work can do its job and terminate a stuck
request even during a freeze, returning its reference and avoiding the
deadlock.

Allowing the timeout to run is just a part of the fix, since for some
devices we might get stuck again inside the device driver's timeout
handler, should it attempt to allocate a new request in that path -
which is quite a common action for Abort commands, which need to be sent
after a timeout. In NVMe, for instance, we call blk_mq_alloc_request
from inside the timeout handler, which will fail during a freeze, since
it also tries to acquire a queue reference.

I considered a similar change to blk_mq_alloc_request as a generic
solution for further device driver hangs, but we can't do that, since it
would allow new requests to disturb the freeze process. I thought about
creating a new function in the block layer to support unfreezable
requests for these occasions, but after working on it for a while, I
feel like this should be handled on a per-driver basis. I'm now
experimenting with changes to the NVMe timeout path, but I'm open to
suggestions of ways to make this generic.

Signed-off-by: Gabriel Krisman Bertazi
Cc: Brian King
Cc: Keith Busch
Cc: linux-nvme@lists.infradead.org
Cc: linux-block@vger.kernel.org
Reviewed-by: Christoph Hellwig
Signed-off-by: Jens Axboe

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 576e711..6a63da1 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -672,7 +672,20 @@ static void blk_mq_timeout_work(struct work_struct *work)
 	};
 	int i;
-	if (blk_queue_enter(q, true))
+	/* A deadlock might occur if a request is stuck requiring a
+	 * timeout at the same time a queue freeze is waiting
+	 * completion, since the timeout code would not be able to
+	 * acquire the queue reference here.
+ * + * That's why we don't use blk_queue_enter here; instead, we use + * percpu_ref_tryget directly, because we need to be able to + * obtain a reference even in the short window between the queue + * starting to freeze, by dropping the first reference in + * blk_mq_freeze_queue_start, and the moment the last request is + * consumed, marked by the instant q_usage_counter reaches + * zero. + */ + if (!percpu_ref_tryget(&q->q_usage_counter)) return; blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &data); -- cgit v0.10.2 From df08c32ce3be5be138c1dbfcba203314a3a7cd6f Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sun, 31 Jul 2016 11:15:13 -0700 Subject: block: fix bdi vs gendisk lifetime mismatch The name for a bdi of a gendisk is derived from the gendisk's devt. However, since the gendisk is destroyed before the bdi it leaves a window where a new gendisk could dynamically reuse the same devt while a bdi with the same name is still live. Arrange for the bdi to hold a reference against its "owner" disk device while it is registered. Otherwise we can hit sysfs duplicate name collisions like the following: WARNING: CPU: 10 PID: 2078 at fs/sysfs/dir.c:31 sysfs_warn_dup+0x64/0x80 sysfs: cannot create duplicate filename '/devices/virtual/bdi/259:1' Hardware name: HP ProLiant DL580 Gen8, BIOS P79 05/06/2015 0000000000000286 0000000002c04ad5 ffff88006f24f970 ffffffff8134caec ffff88006f24f9c0 0000000000000000 ffff88006f24f9b0 ffffffff8108c351 0000001f0000000c ffff88105d236000 ffff88105d1031e0 ffff8800357427f8 Call Trace: [] dump_stack+0x63/0x87 [] __warn+0xd1/0xf0 [] warn_slowpath_fmt+0x5f/0x80 [] sysfs_warn_dup+0x64/0x80 [] sysfs_create_dir_ns+0x7e/0x90 [] kobject_add_internal+0xaa/0x320 [] ? vsnprintf+0x34e/0x4d0 [] kobject_add+0x75/0xd0 [] ? mutex_lock+0x12/0x2f [] device_add+0x125/0x610 [] device_create_groups_vargs+0xd8/0x100 [] device_create_vargs+0x1c/0x20 [] bdi_register+0x8c/0x180 [] bdi_register_dev+0x27/0x30 [] add_disk+0x175/0x4a0 Cc: Reported-by: Yi Zhang Tested-by: Yi Zhang Signed-off-by: Dan Williams Fixed up missing 0 return in bdi_register_owner(). 
Signed-off-by: Jens Axboe diff --git a/block/genhd.c b/block/genhd.c index 0ad8796..fcd6d4f 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -614,7 +614,7 @@ void device_add_disk(struct device *parent, struct gendisk *disk) /* Register BDI before referencing it from bdev */ bdi = &disk->queue->backing_dev_info; - bdi_register_dev(bdi, disk_devt(disk)); + bdi_register_owner(bdi, disk_to_dev(disk)); blk_register_region(disk_devt(disk), disk->minors, NULL, exact_match, exact_lock, disk); diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index 3f10307..c357f27 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -163,6 +163,7 @@ struct backing_dev_info { wait_queue_head_t wb_waitq; struct device *dev; + struct device *owner; struct timer_list laptop_mode_wb_timer; diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 491a917..43b93a9 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -24,6 +24,7 @@ __printf(3, 4) int bdi_register(struct backing_dev_info *bdi, struct device *parent, const char *fmt, ...); int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev); +int bdi_register_owner(struct backing_dev_info *bdi, struct device *owner); void bdi_unregister(struct backing_dev_info *bdi); int __must_check bdi_setup_and_register(struct backing_dev_info *, char *); diff --git a/mm/backing-dev.c b/mm/backing-dev.c index efe2377..8fde443 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -825,6 +825,20 @@ int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev) } EXPORT_SYMBOL(bdi_register_dev); +int bdi_register_owner(struct backing_dev_info *bdi, struct device *owner) +{ + int rc; + + rc = bdi_register(bdi, NULL, "%u:%u", MAJOR(owner->devt), + MINOR(owner->devt)); + if (rc) + return rc; + bdi->owner = owner; + get_device(owner); + return 0; +} +EXPORT_SYMBOL(bdi_register_owner); + /* * Remove bdi from bdi_list, and ensure that it is no longer visible */ @@ -849,6 +863,11 @@ void bdi_unregister(struct backing_dev_info *bdi) device_unregister(bdi->dev); bdi->dev = NULL; } + + if (bdi->owner) { + put_device(bdi->owner); + bdi->owner = NULL; + } } void bdi_exit(struct backing_dev_info *bdi) -- cgit v0.10.2 From b571bc606e714e448b00920987d77b384a6a9570 Mon Sep 17 00:00:00 2001 From: Shaun Tancheff Date: Sat, 30 Jul 2016 16:45:48 -0500 Subject: Fixup direct bi_rw modifiers bi_rw should be using bio_set_op_attrs to set bi_rw. Signed-off-by: Shaun Tancheff Cc: Chris Mason Cc: Josef Bacik Cc: David Sterba Cc: Mike Christie Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 5850d79..881eb46 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2049,7 +2049,7 @@ int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical, return -EIO; } bio->bi_bdev = dev->bdev; - bio->bi_rw = WRITE_SYNC; + bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_SYNC); bio_add_page(bio, page, length, pg_offset); if (btrfsic_submit_bio_wait(bio)) { -- cgit v0.10.2 From 6d25ec147e3a71858bed5439c92accd7f739a0a3 Mon Sep 17 00:00:00 2001 From: John Pittman Date: Mon, 1 Aug 2016 16:35:53 -0400 Subject: Include: blkdev: Removed duplicate 'struct request;' declaration. In include/linux/blkdev.h duplicate declarations of the request struct exist. Cleaned up by removing the second, unneeded declaration. 
Signed-off-by: John Pittman Signed-off-by: Jens Axboe diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index adf3307..de79359 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -47,7 +47,6 @@ struct pr_ops; */ #define BLKCG_MAX_POLS 2 -struct request; typedef void (rq_end_io_fn)(struct request *, int); #define BLK_RL_SYNCFULL (1U << 0) -- cgit v0.10.2 From c0f3fd2b387448d67ae83c4ce1cc69da375b9186 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 2 Aug 2016 08:45:44 -0600 Subject: blk-mq: fix deadlock in blk_mq_register_disk() error path If we fail registering any of the hardware queues, we call into blk_mq_unregister_disk() with the hotplug mutex already held. Since blk_mq_unregister_disk() attempts to acquire the same mutex, we end up in a less than happy place. Reported-by: Jinpu Wang Signed-off-by: Jens Axboe diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index 4ea4dd8..fe822aa 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -380,15 +380,13 @@ static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx) return ret; } -void blk_mq_unregister_disk(struct gendisk *disk) +static void __blk_mq_unregister_disk(struct gendisk *disk) { struct request_queue *q = disk->queue; struct blk_mq_hw_ctx *hctx; struct blk_mq_ctx *ctx; int i, j; - blk_mq_disable_hotplug(); - queue_for_each_hw_ctx(q, hctx, i) { blk_mq_unregister_hctx(hctx); @@ -405,6 +403,12 @@ void blk_mq_unregister_disk(struct gendisk *disk) kobject_put(&disk_to_dev(disk)->kobj); q->mq_sysfs_init_done = false; +} + +void blk_mq_unregister_disk(struct gendisk *disk) +{ + blk_mq_disable_hotplug(); + __blk_mq_unregister_disk(disk); blk_mq_enable_hotplug(); } @@ -450,7 +454,7 @@ int blk_mq_register_disk(struct gendisk *disk) } if (ret) - blk_mq_unregister_disk(disk); + __blk_mq_unregister_disk(disk); else q->mq_sysfs_init_done = true; out: -- cgit v0.10.2 From f0225cacfe7e69ff3234a125aeb0f3d65077835c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 4 Aug 2016 16:10:00 +0200 Subject: loop: don't try to use AIO for discards Fix a fat-fingered conversion to the req_op accessors, and also use a switch statement to make it more obvious what is being checked. Signed-off-by: Christoph Hellwig Reported-by: Dave Chinner Fixes: c2df40 ("drivers: use req op accessor"); Reviewed-by: Ming Lei Signed-off-by: Jens Axboe diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 075377e..91c2c88 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1659,11 +1659,15 @@ static int loop_queue_rq(struct blk_mq_hw_ctx *hctx, if (lo->lo_state != Lo_bound) return -EIO; - if (lo->use_dio && (req_op(cmd->rq) != REQ_OP_FLUSH || - req_op(cmd->rq) == REQ_OP_DISCARD)) - cmd->use_aio = true; - else + switch (req_op(cmd->rq)) { + case REQ_OP_FLUSH: + case REQ_OP_DISCARD: cmd->use_aio = false; + break; + default: + cmd->use_aio = lo->use_dio; + break; + } queue_kthread_work(&lo->worker, &cmd->work); -- cgit v0.10.2 From c1c87c2ba9ec06d8ba9e8a26c18c67a2ba9cd9c1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 4 Aug 2016 16:10:01 +0200 Subject: loop: make do_req_filebacked more robust Use a switch statement to iterate over the possible operations and error out if it's an incorrect one. 
Signed-off-by: Jens Axboe diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 91c2c88..c9f2107 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -510,14 +510,10 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd, return 0; } - -static inline int lo_rw_simple(struct loop_device *lo, - struct request *rq, loff_t pos, bool rw) +static int do_req_filebacked(struct loop_device *lo, struct request *rq) { struct loop_cmd *cmd = blk_mq_rq_to_pdu(rq); - - if (cmd->use_aio) - return lo_rw_aio(lo, cmd, pos, rw); + loff_t pos = ((loff_t) blk_rq_pos(rq) << 9) + lo->lo_offset; /* * lo_write_simple and lo_read_simple should have been covered @@ -528,37 +524,30 @@ static inline int lo_rw_simple(struct loop_device *lo, * of the req at one time. And direct read IO doesn't need to * run flush_dcache_page(). */ - if (rw == WRITE) - return lo_write_simple(lo, rq, pos); - else - return lo_read_simple(lo, rq, pos); -} - -static int do_req_filebacked(struct loop_device *lo, struct request *rq) -{ - loff_t pos; - int ret; - - pos = ((loff_t) blk_rq_pos(rq) << 9) + lo->lo_offset; - - if (op_is_write(req_op(rq))) { - if (req_op(rq) == REQ_OP_FLUSH) - ret = lo_req_flush(lo, rq); - else if (req_op(rq) == REQ_OP_DISCARD) - ret = lo_discard(lo, rq, pos); - else if (lo->transfer) - ret = lo_write_transfer(lo, rq, pos); + switch (req_op(rq)) { + case REQ_OP_FLUSH: + return lo_req_flush(lo, rq); + case REQ_OP_DISCARD: + return lo_discard(lo, rq, pos); + case REQ_OP_WRITE: + if (lo->transfer) + return lo_write_transfer(lo, rq, pos); + else if (cmd->use_aio) + return lo_rw_aio(lo, cmd, pos, WRITE); else - ret = lo_rw_simple(lo, rq, pos, WRITE); - - } else { + return lo_write_simple(lo, rq, pos); + case REQ_OP_READ: if (lo->transfer) - ret = lo_read_transfer(lo, rq, pos); + return lo_read_transfer(lo, rq, pos); + else if (cmd->use_aio) + return lo_rw_aio(lo, cmd, pos, READ); else - ret = lo_rw_simple(lo, rq, pos, READ); + return lo_read_simple(lo, rq, pos); + default: + WARN_ON_ONCE(1); + return -EIO; + break; } - - return ret; } struct switch_request { -- cgit v0.10.2 From abf545484d31b68777a85c5c8f5b4bcde08283eb Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Thu, 4 Aug 2016 14:23:34 -0600 Subject: mm/block: convert rw_page users to bio op use The rw_page users were not converted to use bio/req ops. As a result bdev_write_page is not passing down REQ_OP_WRITE and the IOs will be sent down as reads. Signed-off-by: Mike Christie Fixes: 4e1b2d52a80d ("block, fs, drivers: remove REQ_OP compat defs and related code") Modified by me to: 1) Drop op_flags passing into ->rw_page(), as we don't use it. 2) Make op_is_write() and friends safe to use for !CONFIG_BLOCK Signed-off-by: Jens Axboe diff --git a/drivers/block/brd.c b/drivers/block/brd.c index 3022dad..3439b28 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -300,20 +300,20 @@ static void copy_from_brd(void *dst, struct brd_device *brd, * Process a single bvec of a bio. 
*/ static int brd_do_bvec(struct brd_device *brd, struct page *page, - unsigned int len, unsigned int off, int rw, + unsigned int len, unsigned int off, int op, sector_t sector) { void *mem; int err = 0; - if (rw != READ) { + if (op_is_write(op)) { err = copy_to_brd_setup(brd, sector, len); if (err) goto out; } mem = kmap_atomic(page); - if (rw == READ) { + if (!op_is_write(op)) { copy_from_brd(mem + off, brd, sector, len); flush_dcache_page(page); } else { @@ -330,7 +330,6 @@ static blk_qc_t brd_make_request(struct request_queue *q, struct bio *bio) { struct block_device *bdev = bio->bi_bdev; struct brd_device *brd = bdev->bd_disk->private_data; - int rw; struct bio_vec bvec; sector_t sector; struct bvec_iter iter; @@ -347,14 +346,12 @@ static blk_qc_t brd_make_request(struct request_queue *q, struct bio *bio) goto out; } - rw = bio_data_dir(bio); - bio_for_each_segment(bvec, bio, iter) { unsigned int len = bvec.bv_len; int err; err = brd_do_bvec(brd, bvec.bv_page, len, - bvec.bv_offset, rw, sector); + bvec.bv_offset, bio_op(bio), sector); if (err) goto io_error; sector += len >> SECTOR_SHIFT; @@ -369,11 +366,11 @@ io_error: } static int brd_rw_page(struct block_device *bdev, sector_t sector, - struct page *page, int rw) + struct page *page, int op) { struct brd_device *brd = bdev->bd_disk->private_data; - int err = brd_do_bvec(brd, page, PAGE_SIZE, 0, rw, sector); - page_endio(page, rw & WRITE, err); + int err = brd_do_bvec(brd, page, PAGE_SIZE, 0, op, sector); + page_endio(page, op, err); return err; } diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 7454cf1..ca29649 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -843,15 +843,15 @@ static void zram_bio_discard(struct zram *zram, u32 index, } static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index, - int offset, int rw) + int offset, int op) { unsigned long start_time = jiffies; int ret; - generic_start_io_acct(rw, bvec->bv_len >> SECTOR_SHIFT, + generic_start_io_acct(op, bvec->bv_len >> SECTOR_SHIFT, &zram->disk->part0); - if (rw == READ) { + if (!op_is_write(op)) { atomic64_inc(&zram->stats.num_reads); ret = zram_bvec_read(zram, bvec, index, offset); } else { @@ -859,10 +859,10 @@ static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index, ret = zram_bvec_write(zram, bvec, index, offset); } - generic_end_io_acct(rw, &zram->disk->part0, start_time); + generic_end_io_acct(op, &zram->disk->part0, start_time); if (unlikely(ret)) { - if (rw == READ) + if (!op_is_write(op)) atomic64_inc(&zram->stats.failed_reads); else atomic64_inc(&zram->stats.failed_writes); @@ -873,7 +873,7 @@ static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index, static void __zram_make_request(struct zram *zram, struct bio *bio) { - int offset, rw; + int offset; u32 index; struct bio_vec bvec; struct bvec_iter iter; @@ -888,7 +888,6 @@ static void __zram_make_request(struct zram *zram, struct bio *bio) return; } - rw = bio_data_dir(bio); bio_for_each_segment(bvec, bio, iter) { int max_transfer_size = PAGE_SIZE - offset; @@ -903,15 +902,18 @@ static void __zram_make_request(struct zram *zram, struct bio *bio) bv.bv_len = max_transfer_size; bv.bv_offset = bvec.bv_offset; - if (zram_bvec_rw(zram, &bv, index, offset, rw) < 0) + if (zram_bvec_rw(zram, &bv, index, offset, + bio_op(bio)) < 0) goto out; bv.bv_len = bvec.bv_len - max_transfer_size; bv.bv_offset += max_transfer_size; - if (zram_bvec_rw(zram, &bv, index + 1, 0, rw) < 0) + if 
(zram_bvec_rw(zram, &bv, index + 1, 0, + bio_op(bio)) < 0) goto out; } else - if (zram_bvec_rw(zram, &bvec, index, offset, rw) < 0) + if (zram_bvec_rw(zram, &bvec, index, offset, + bio_op(bio)) < 0) goto out; update_position(&index, &offset, &bvec); @@ -968,7 +970,7 @@ static void zram_slot_free_notify(struct block_device *bdev, } static int zram_rw_page(struct block_device *bdev, sector_t sector, - struct page *page, int rw) + struct page *page, int op) { int offset, err = -EIO; u32 index; @@ -992,7 +994,7 @@ static int zram_rw_page(struct block_device *bdev, sector_t sector, bv.bv_len = PAGE_SIZE; bv.bv_offset = 0; - err = zram_bvec_rw(zram, &bv, index, offset, rw); + err = zram_bvec_rw(zram, &bv, index, offset, op); put_zram: zram_meta_put(zram); out: @@ -1005,7 +1007,7 @@ out: * (e.g., SetPageError, set_page_dirty and extra works). */ if (err == 0) - page_endio(page, rw, 0); + page_endio(page, op, 0); return err; } diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c index 9dce03f..7cf3bdf 100644 --- a/drivers/nvdimm/btt.c +++ b/drivers/nvdimm/btt.c @@ -1133,11 +1133,11 @@ static int btt_write_pg(struct btt *btt, struct bio_integrity_payload *bip, static int btt_do_bvec(struct btt *btt, struct bio_integrity_payload *bip, struct page *page, unsigned int len, unsigned int off, - int rw, sector_t sector) + int op, sector_t sector) { int ret; - if (rw == READ) { + if (!op_is_write(op)) { ret = btt_read_pg(btt, bip, page, off, sector, len); flush_dcache_page(page); } else { @@ -1155,7 +1155,7 @@ static blk_qc_t btt_make_request(struct request_queue *q, struct bio *bio) struct bvec_iter iter; unsigned long start; struct bio_vec bvec; - int err = 0, rw; + int err = 0; bool do_acct; /* @@ -1170,7 +1170,6 @@ static blk_qc_t btt_make_request(struct request_queue *q, struct bio *bio) } do_acct = nd_iostat_start(bio, &start); - rw = bio_data_dir(bio); bio_for_each_segment(bvec, bio, iter) { unsigned int len = bvec.bv_len; @@ -1181,11 +1180,12 @@ static blk_qc_t btt_make_request(struct request_queue *q, struct bio *bio) BUG_ON(len % btt->sector_size); err = btt_do_bvec(btt, bip, bvec.bv_page, len, bvec.bv_offset, - rw, iter.bi_sector); + bio_op(bio), iter.bi_sector); if (err) { dev_info(&btt->nd_btt->dev, "io error in %s sector %lld, len %d,\n", - (rw == READ) ? "READ" : "WRITE", + (op_is_write(bio_op(bio))) ? 
"WRITE" : + "READ", (unsigned long long) iter.bi_sector, len); bio->bi_error = err; break; @@ -1200,12 +1200,12 @@ out: } static int btt_rw_page(struct block_device *bdev, sector_t sector, - struct page *page, int rw) + struct page *page, int op) { struct btt *btt = bdev->bd_disk->private_data; - btt_do_bvec(btt, NULL, page, PAGE_SIZE, 0, rw, sector); - page_endio(page, rw & WRITE, 0); + btt_do_bvec(btt, NULL, page, PAGE_SIZE, 0, op, sector); + page_endio(page, op, 0); return 0; } diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index b511099..d64d924 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -67,7 +67,7 @@ static void pmem_clear_poison(struct pmem_device *pmem, phys_addr_t offset, } static int pmem_do_bvec(struct pmem_device *pmem, struct page *page, - unsigned int len, unsigned int off, int rw, + unsigned int len, unsigned int off, int op, sector_t sector) { int rc = 0; @@ -79,7 +79,7 @@ static int pmem_do_bvec(struct pmem_device *pmem, struct page *page, if (unlikely(is_bad_pmem(&pmem->bb, sector, len))) bad_pmem = true; - if (rw == READ) { + if (!op_is_write(op)) { if (unlikely(bad_pmem)) rc = -EIO; else { @@ -134,7 +134,7 @@ static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio) do_acct = nd_iostat_start(bio, &start); bio_for_each_segment(bvec, bio, iter) { rc = pmem_do_bvec(pmem, bvec.bv_page, bvec.bv_len, - bvec.bv_offset, bio_data_dir(bio), + bvec.bv_offset, bio_op(bio), iter.bi_sector); if (rc) { bio->bi_error = rc; @@ -152,12 +152,12 @@ static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio) } static int pmem_rw_page(struct block_device *bdev, sector_t sector, - struct page *page, int rw) + struct page *page, int op) { struct pmem_device *pmem = bdev->bd_queue->queuedata; int rc; - rc = pmem_do_bvec(pmem, page, PAGE_SIZE, 0, rw, sector); + rc = pmem_do_bvec(pmem, page, PAGE_SIZE, 0, op, sector); /* * The ->rw_page interface is subtle and tricky. The core @@ -166,7 +166,7 @@ static int pmem_rw_page(struct block_device *bdev, sector_t sector, * caused by double completion. */ if (rc == 0) - page_endio(page, rw & WRITE, 0); + page_endio(page, op, 0); return rc; } diff --git a/fs/block_dev.c b/fs/block_dev.c index 2033a3f..d402899 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -416,7 +416,8 @@ int bdev_read_page(struct block_device *bdev, sector_t sector, result = blk_queue_enter(bdev->bd_queue, false); if (result) return result; - result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, READ); + result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, + REQ_OP_READ); blk_queue_exit(bdev->bd_queue); return result; } @@ -445,7 +446,6 @@ int bdev_write_page(struct block_device *bdev, sector_t sector, struct page *page, struct writeback_control *wbc) { int result; - int rw = (wbc->sync_mode == WB_SYNC_ALL) ? 
WRITE_SYNC : WRITE; const struct block_device_operations *ops = bdev->bd_disk->fops; if (!ops->rw_page || bdev_get_integrity(bdev)) @@ -455,7 +455,8 @@ int bdev_write_page(struct block_device *bdev, sector_t sector, return result; set_page_writeback(page); - result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, rw); + result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, + REQ_OP_WRITE); if (result) end_page_writeback(page); else diff --git a/fs/mpage.c b/fs/mpage.c index 2ca1f39..7a09c55 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -50,7 +50,7 @@ static void mpage_end_io(struct bio *bio) bio_for_each_segment_all(bv, bio, i) { struct page *page = bv->bv_page; - page_endio(page, bio_data_dir(bio), bio->bi_error); + page_endio(page, bio_op(bio), bio->bi_error); } bio_put(bio); diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index f254eb2..14b28ff 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -18,6 +18,17 @@ struct cgroup_subsys_state; typedef void (bio_end_io_t) (struct bio *); typedef void (bio_destructor_t) (struct bio *); +enum req_op { + REQ_OP_READ, + REQ_OP_WRITE, + REQ_OP_DISCARD, /* request to discard sectors */ + REQ_OP_SECURE_ERASE, /* request to securely erase sectors */ + REQ_OP_WRITE_SAME, /* write same block many times */ + REQ_OP_FLUSH, /* request for cache flush */ +}; + +#define REQ_OP_BITS 3 + #ifdef CONFIG_BLOCK /* * main unit of I/O for the block layer and lower layers (ie drivers and @@ -228,17 +239,6 @@ enum rq_flag_bits { #define REQ_HASHED (1ULL << __REQ_HASHED) #define REQ_MQ_INFLIGHT (1ULL << __REQ_MQ_INFLIGHT) -enum req_op { - REQ_OP_READ, - REQ_OP_WRITE, - REQ_OP_DISCARD, /* request to discard sectors */ - REQ_OP_SECURE_ERASE, /* request to securely erase sectors */ - REQ_OP_WRITE_SAME, /* write same block many times */ - REQ_OP_FLUSH, /* request for cache flush */ -}; - -#define REQ_OP_BITS 3 - typedef unsigned int blk_qc_t; #define BLK_QC_T_NONE -1U #define BLK_QC_T_SHIFT 16 diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index de79359..ccd68c0 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1672,7 +1672,7 @@ struct blk_dax_ctl { struct block_device_operations { int (*open) (struct block_device *, fmode_t); void (*release) (struct gendisk *, fmode_t); - int (*rw_page)(struct block_device *, sector_t, struct page *, int rw); + int (*rw_page)(struct block_device *, sector_t, struct page *, int op); int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); long (*direct_access)(struct block_device *, sector_t, void **, pfn_t *, diff --git a/include/linux/fs.h b/include/linux/fs.h index f3f0b4c8..498255e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2480,12 +2480,13 @@ extern void init_special_inode(struct inode *, umode_t, dev_t); extern void make_bad_inode(struct inode *); extern bool is_bad_inode(struct inode *); -#ifdef CONFIG_BLOCK static inline bool op_is_write(unsigned int op) { return op == REQ_OP_READ ? 
false : true; } +#ifdef CONFIG_BLOCK + /* * return data direction, READ or WRITE */ diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 81363b8..4578637 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -510,7 +510,7 @@ static inline void wait_on_page_writeback(struct page *page) extern void end_page_writeback(struct page *page); void wait_for_stable_page(struct page *page); -void page_endio(struct page *page, int rw, int err); +void page_endio(struct page *page, int op, int err); /* * Add an arbitrary waiter to a page's wait queue diff --git a/mm/filemap.c b/mm/filemap.c index 3083ded..daef091 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -887,9 +887,9 @@ EXPORT_SYMBOL(end_page_writeback); * After completing I/O on a page, call this routine to update the page * flags appropriately */ -void page_endio(struct page *page, int rw, int err) +void page_endio(struct page *page, int op, int err) { - if (rw == READ) { + if (!op_is_write(op)) { if (!err) { SetPageUptodate(page); } else { @@ -897,7 +897,7 @@ void page_endio(struct page *page, int rw, int err) SetPageError(page); } unlock_page(page); - } else { /* rw == WRITE */ + } else { if (err) { SetPageError(page); if (page->mapping) -- cgit v0.10.2
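As a closing illustration of the rw_page conversion above, here is a small, self-contained sketch of the direction check these drivers now rely on. The enum values and the body of op_is_write() mirror the patch; the main() driver loop is illustrative only, not kernel code:

/* Standalone sketch of op_is_write() as introduced above: any op other
 * than REQ_OP_READ counts as a write for accounting and page_endio()
 * purposes.  Enum values mirror the patch; main() is illustrative only. */
#include <stdbool.h>
#include <stdio.h>

enum req_op {
	REQ_OP_READ,
	REQ_OP_WRITE,
	REQ_OP_DISCARD,		/* request to discard sectors */
	REQ_OP_SECURE_ERASE,	/* request to securely erase sectors */
	REQ_OP_WRITE_SAME,	/* write same block many times */
	REQ_OP_FLUSH,		/* request for cache flush */
};

static bool op_is_write(unsigned int op)
{
	return op == REQ_OP_READ ? false : true;
}

int main(void)
{
	const unsigned int ops[] = { REQ_OP_READ, REQ_OP_WRITE, REQ_OP_DISCARD, REQ_OP_FLUSH };
	unsigned int i;

	for (i = 0; i < sizeof(ops) / sizeof(ops[0]); i++)
		printf("op %u is a %s\n", ops[i],
		       op_is_write(ops[i]) ? "write" : "read");
	return 0;
}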