From 45bce8f3e3436bbe2e03dd2b076abdce79ffabb7 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 29 Nov 2012 10:21:43 -0800 Subject: fs/buffer.c: make block-size be per-page and protected by the page lock This makes the buffer size handling be a per-page thing, which allows us to not have to worry about locking too much when changing the buffer size. If a page doesn't have buffers, we still need to read the block size from the inode, but we can do that with ACCESS_ONCE(), so that even if the size is changing, we get a consistent value. This doesn't convert all functions - many of the buffer functions are used purely by filesystems, which in turn results in the buffer size being fixed at mount-time. So they don't have the same consistency issues that the raw device access can have. Signed-off-by: Linus Torvalds diff --git a/fs/buffer.c b/fs/buffer.c index b5f0442..28a74ff 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1553,6 +1553,28 @@ void unmap_underlying_metadata(struct block_device *bdev, sector_t block) EXPORT_SYMBOL(unmap_underlying_metadata); /* + * Size is a power-of-two in the range 512..PAGE_SIZE, + * and the case we care about most is PAGE_SIZE. + * + * So this *could* possibly be written with those + * constraints in mind (relevant mostly if some + * architecture has a slow bit-scan instruction) + */ +static inline int block_size_bits(unsigned int blocksize) +{ + return ilog2(blocksize); +} + +static struct buffer_head *create_page_buffers(struct page *page, struct inode *inode, unsigned int b_state) +{ + BUG_ON(!PageLocked(page)); + + if (!page_has_buffers(page)) + create_empty_buffers(page, 1 << ACCESS_ONCE(inode->i_blkbits), b_state); + return page_buffers(page); +} + +/* * NOTE! All mapped/uptodate combinations are valid: * * Mapped Uptodate Meaning @@ -1589,19 +1611,13 @@ static int __block_write_full_page(struct inode *inode, struct page *page, sector_t block; sector_t last_block; struct buffer_head *bh, *head; - const unsigned blocksize = 1 << inode->i_blkbits; + unsigned int blocksize, bbits; int nr_underway = 0; int write_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE); - BUG_ON(!PageLocked(page)); - - last_block = (i_size_read(inode) - 1) >> inode->i_blkbits; - - if (!page_has_buffers(page)) { - create_empty_buffers(page, blocksize, + head = create_page_buffers(page, inode, (1 << BH_Dirty)|(1 << BH_Uptodate)); - } /* * Be very careful. We have no exclusion from __set_page_dirty_buffers @@ -1613,9 +1629,12 @@ static int __block_write_full_page(struct inode *inode, struct page *page, * handle that here by just cleaning them. */ - block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); - head = page_buffers(page); bh = head; + blocksize = bh->b_size; + bbits = block_size_bits(blocksize); + + block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits); + last_block = (i_size_read(inode) - 1) >> bbits; /* * Get all the dirty buffers mapped to disk addresses and @@ -1806,12 +1825,10 @@ int __block_write_begin(struct page *page, loff_t pos, unsigned len, BUG_ON(to > PAGE_CACHE_SIZE); BUG_ON(from > to); - blocksize = 1 << inode->i_blkbits; - if (!page_has_buffers(page)) - create_empty_buffers(page, blocksize, 0); - head = page_buffers(page); + head = create_page_buffers(page, inode, 0); + blocksize = head->b_size; + bbits = block_size_bits(blocksize); - bbits = inode->i_blkbits; block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits); for(bh = head, block_start = 0; bh != head || !block_start; @@ -1881,11 +1898,11 @@ static int __block_commit_write(struct inode *inode, struct page *page, unsigned blocksize; struct buffer_head *bh, *head; - blocksize = 1 << inode->i_blkbits; + bh = head = page_buffers(page); + blocksize = bh->b_size; - for(bh = head = page_buffers(page), block_start = 0; - bh != head || !block_start; - block_start=block_end, bh = bh->b_this_page) { + block_start = 0; + do { block_end = block_start + blocksize; if (block_end <= from || block_start >= to) { if (!buffer_uptodate(bh)) @@ -1895,7 +1912,10 @@ static int __block_commit_write(struct inode *inode, struct page *page, mark_buffer_dirty(bh); } clear_buffer_new(bh); - } + + block_start = block_end; + bh = bh->b_this_page; + } while (bh != head); /* * If this is a partial write which happened to make all buffers @@ -2020,7 +2040,6 @@ EXPORT_SYMBOL(generic_write_end); int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc, unsigned long from) { - struct inode *inode = page->mapping->host; unsigned block_start, block_end, blocksize; unsigned to; struct buffer_head *bh, *head; @@ -2029,13 +2048,13 @@ int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc, if (!page_has_buffers(page)) return 0; - blocksize = 1 << inode->i_blkbits; + head = page_buffers(page); + blocksize = head->b_size; to = min_t(unsigned, PAGE_CACHE_SIZE - from, desc->count); to = from + to; if (from < blocksize && to > PAGE_CACHE_SIZE - blocksize) return 0; - head = page_buffers(page); bh = head; block_start = 0; do { @@ -2068,18 +2087,16 @@ int block_read_full_page(struct page *page, get_block_t *get_block) struct inode *inode = page->mapping->host; sector_t iblock, lblock; struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE]; - unsigned int blocksize; + unsigned int blocksize, bbits; int nr, i; int fully_mapped = 1; - BUG_ON(!PageLocked(page)); - blocksize = 1 << inode->i_blkbits; - if (!page_has_buffers(page)) - create_empty_buffers(page, blocksize, 0); - head = page_buffers(page); + head = create_page_buffers(page, inode, 0); + blocksize = head->b_size; + bbits = block_size_bits(blocksize); - iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); - lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits; + iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits); + lblock = (i_size_read(inode)+blocksize-1) >> bbits; bh = head; nr = 0; i = 0; -- cgit v0.10.2 From 1e8b33328a5407b447ff80953655a47014a6dcb9 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 29 Nov 2012 10:49:50 -0800 Subject: blockdev: remove bd_block_size_semaphore again This reverts the block-device direct access code to the previous unlocked code, now that fs/buffer.c no longer needs external locking. With this, fs/block_dev.c is back to the original version, apart from a whitespace cleanup that I didn't want to revert. Signed-off-by: Linus Torvalds diff --git a/drivers/char/raw.c b/drivers/char/raw.c index 0bb207e..54a3a6d 100644 --- a/drivers/char/raw.c +++ b/drivers/char/raw.c @@ -285,7 +285,7 @@ static long raw_ctl_compat_ioctl(struct file *file, unsigned int cmd, static const struct file_operations raw_fops = { .read = do_sync_read, - .aio_read = blkdev_aio_read, + .aio_read = generic_file_aio_read, .write = do_sync_write, .aio_write = blkdev_aio_write, .fsync = blkdev_fsync, diff --git a/fs/block_dev.c b/fs/block_dev.c index 1a1e5e3..47a949d 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -116,8 +116,6 @@ EXPORT_SYMBOL(invalidate_bdev); int set_blocksize(struct block_device *bdev, int size) { - struct address_space *mapping; - /* Size must be a power of two, and between 512 and PAGE_SIZE */ if (size > PAGE_SIZE || size < 512 || !is_power_of_2(size)) return -EINVAL; @@ -126,19 +124,6 @@ int set_blocksize(struct block_device *bdev, int size) if (size < bdev_logical_block_size(bdev)) return -EINVAL; - /* Prevent starting I/O or mapping the device */ - percpu_down_write(&bdev->bd_block_size_semaphore); - - /* Check that the block device is not memory mapped */ - mapping = bdev->bd_inode->i_mapping; - mutex_lock(&mapping->i_mmap_mutex); - if (mapping_mapped(mapping)) { - mutex_unlock(&mapping->i_mmap_mutex); - percpu_up_write(&bdev->bd_block_size_semaphore); - return -EBUSY; - } - mutex_unlock(&mapping->i_mmap_mutex); - /* Don't change the size if it is same as current */ if (bdev->bd_block_size != size) { sync_blockdev(bdev); @@ -146,9 +131,6 @@ int set_blocksize(struct block_device *bdev, int size) bdev->bd_inode->i_blkbits = blksize_bits(size); kill_bdev(bdev); } - - percpu_up_write(&bdev->bd_block_size_semaphore); - return 0; } @@ -459,12 +441,6 @@ static struct inode *bdev_alloc_inode(struct super_block *sb) struct bdev_inode *ei = kmem_cache_alloc(bdev_cachep, GFP_KERNEL); if (!ei) return NULL; - - if (unlikely(percpu_init_rwsem(&ei->bdev.bd_block_size_semaphore))) { - kmem_cache_free(bdev_cachep, ei); - return NULL; - } - return &ei->vfs_inode; } @@ -473,8 +449,6 @@ static void bdev_i_callback(struct rcu_head *head) struct inode *inode = container_of(head, struct inode, i_rcu); struct bdev_inode *bdi = BDEV_I(inode); - percpu_free_rwsem(&bdi->bdev.bd_block_size_semaphore); - kmem_cache_free(bdev_cachep, bdi); } @@ -1593,22 +1567,6 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg) return blkdev_ioctl(bdev, mode, cmd, arg); } -ssize_t blkdev_aio_read(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) -{ - ssize_t ret; - struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host); - - percpu_down_read(&bdev->bd_block_size_semaphore); - - ret = generic_file_aio_read(iocb, iov, nr_segs, pos); - - percpu_up_read(&bdev->bd_block_size_semaphore); - - return ret; -} -EXPORT_SYMBOL_GPL(blkdev_aio_read); - /* * Write data to the block device. Only intended for the block device itself * and the raw driver which basically is a fake block device. @@ -1620,16 +1578,12 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos) { struct file *file = iocb->ki_filp; - struct block_device *bdev = I_BDEV(file->f_mapping->host); struct blk_plug plug; ssize_t ret; BUG_ON(iocb->ki_pos != pos); blk_start_plug(&plug); - - percpu_down_read(&bdev->bd_block_size_semaphore); - ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); if (ret > 0 || ret == -EIOCBQUEUED) { ssize_t err; @@ -1638,62 +1592,11 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov, if (err < 0 && ret > 0) ret = err; } - - percpu_up_read(&bdev->bd_block_size_semaphore); - blk_finish_plug(&plug); - return ret; } EXPORT_SYMBOL_GPL(blkdev_aio_write); -static int blkdev_mmap(struct file *file, struct vm_area_struct *vma) -{ - int ret; - struct block_device *bdev = I_BDEV(file->f_mapping->host); - - percpu_down_read(&bdev->bd_block_size_semaphore); - - ret = generic_file_mmap(file, vma); - - percpu_up_read(&bdev->bd_block_size_semaphore); - - return ret; -} - -static ssize_t blkdev_splice_read(struct file *file, loff_t *ppos, - struct pipe_inode_info *pipe, size_t len, - unsigned int flags) -{ - ssize_t ret; - struct block_device *bdev = I_BDEV(file->f_mapping->host); - - percpu_down_read(&bdev->bd_block_size_semaphore); - - ret = generic_file_splice_read(file, ppos, pipe, len, flags); - - percpu_up_read(&bdev->bd_block_size_semaphore); - - return ret; -} - -static ssize_t blkdev_splice_write(struct pipe_inode_info *pipe, - struct file *file, loff_t *ppos, size_t len, - unsigned int flags) -{ - ssize_t ret; - struct block_device *bdev = I_BDEV(file->f_mapping->host); - - percpu_down_read(&bdev->bd_block_size_semaphore); - - ret = generic_file_splice_write(pipe, file, ppos, len, flags); - - percpu_up_read(&bdev->bd_block_size_semaphore); - - return ret; -} - - /* * Try to release a page associated with block device when the system * is under memory pressure. @@ -1724,16 +1627,16 @@ const struct file_operations def_blk_fops = { .llseek = block_llseek, .read = do_sync_read, .write = do_sync_write, - .aio_read = blkdev_aio_read, + .aio_read = generic_file_aio_read, .aio_write = blkdev_aio_write, - .mmap = blkdev_mmap, + .mmap = generic_file_mmap, .fsync = blkdev_fsync, .unlocked_ioctl = block_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = compat_blkdev_ioctl, #endif - .splice_read = blkdev_splice_read, - .splice_write = blkdev_splice_write, + .splice_read = generic_file_splice_read, + .splice_write = generic_file_splice_write, }; int ioctl_by_bdev(struct block_device *bdev, unsigned cmd, unsigned long arg) diff --git a/include/linux/fs.h b/include/linux/fs.h index b33cfc9..44f288e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -462,8 +462,6 @@ struct block_device { int bd_fsfreeze_count; /* Mutex for freeze */ struct mutex bd_fsfreeze_mutex; - /* A semaphore that prevents I/O while block size is being changed */ - struct percpu_rw_semaphore bd_block_size_semaphore; }; /* @@ -2379,8 +2377,6 @@ extern int generic_segment_checks(const struct iovec *iov, unsigned long *nr_segs, size_t *count, int access_flags); /* fs/block_dev.c */ -extern ssize_t blkdev_aio_read(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos); extern ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos); extern int blkdev_fsync(struct file *filp, loff_t start, loff_t end, -- cgit v0.10.2 From ab73857e354ab9e317613cba7db714e2c12c6547 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 29 Nov 2012 12:27:00 -0800 Subject: direct-io: don't read inode->i_blkbits multiple times Since directio can work on a raw block device, and the block size of the device can change under it, we need to do the same thing that fs/buffer.c now does: read the block size a single time, using ACCESS_ONCE(). Reading it multiple times can get different results, which will then confuse the code because it actually encodes the i_blksize in relationship to the underlying logical blocksize. Signed-off-by: Linus Torvalds diff --git a/fs/direct-io.c b/fs/direct-io.c index f86c720..cf5b44b 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -540,6 +540,7 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio, sector_t fs_endblk; /* Into file, in filesystem-sized blocks */ unsigned long fs_count; /* Number of filesystem-sized blocks */ int create; + unsigned int i_blkbits = sdio->blkbits + sdio->blkfactor; /* * If there was a memory error and we've overwritten all the @@ -554,7 +555,7 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio, fs_count = fs_endblk - fs_startblk + 1; map_bh->b_state = 0; - map_bh->b_size = fs_count << dio->inode->i_blkbits; + map_bh->b_size = fs_count << i_blkbits; /* * For writes inside i_size on a DIO_SKIP_HOLES filesystem we @@ -1053,7 +1054,8 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, int seg; size_t size; unsigned long addr; - unsigned blkbits = inode->i_blkbits; + unsigned i_blkbits = ACCESS_ONCE(inode->i_blkbits); + unsigned blkbits = i_blkbits; unsigned blocksize_mask = (1 << blkbits) - 1; ssize_t retval = -EINVAL; loff_t end = offset; @@ -1149,7 +1151,7 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, dio->inode = inode; dio->rw = rw; sdio.blkbits = blkbits; - sdio.blkfactor = inode->i_blkbits - blkbits; + sdio.blkfactor = i_blkbits - blkbits; sdio.block_in_file = offset >> blkbits; sdio.get_block = get_block; -- cgit v0.10.2 From bbec0270bdd887f96377065ee38b8848b5afa395 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 29 Nov 2012 12:31:52 -0800 Subject: blkdev_max_block: make private to fs/buffer.c We really don't want to look at the block size for the raw block device accesses in fs/block-dev.c, because it may be changing from under us. So get rid of the max_block logic entirely, since the caller should already have done it anyway. That leaves the only user of this function in fs/buffer.c, so move the whole function there and make it static. Signed-off-by: Linus Torvalds diff --git a/fs/block_dev.c b/fs/block_dev.c index 47a949d..a1e09b4 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -70,19 +70,6 @@ static void bdev_inode_switch_bdi(struct inode *inode, spin_unlock(&dst->wb.list_lock); } -sector_t blkdev_max_block(struct block_device *bdev) -{ - sector_t retval = ~((sector_t)0); - loff_t sz = i_size_read(bdev->bd_inode); - - if (sz) { - unsigned int size = block_size(bdev); - unsigned int sizebits = blksize_bits(size); - retval = (sz >> sizebits); - } - return retval; -} - /* Kill _all_ buffers and pagecache , dirty or not.. */ void kill_bdev(struct block_device *bdev) { @@ -163,52 +150,12 @@ static int blkdev_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh, int create) { - if (iblock >= blkdev_max_block(I_BDEV(inode))) { - if (create) - return -EIO; - - /* - * for reads, we're just trying to fill a partial page. - * return a hole, they will have to call get_block again - * before they can fill it, and they will get -EIO at that - * time - */ - return 0; - } bh->b_bdev = I_BDEV(inode); bh->b_blocknr = iblock; set_buffer_mapped(bh); return 0; } -static int -blkdev_get_blocks(struct inode *inode, sector_t iblock, - struct buffer_head *bh, int create) -{ - sector_t end_block = blkdev_max_block(I_BDEV(inode)); - unsigned long max_blocks = bh->b_size >> inode->i_blkbits; - - if ((iblock + max_blocks) > end_block) { - max_blocks = end_block - iblock; - if ((long)max_blocks <= 0) { - if (create) - return -EIO; /* write fully beyond EOF */ - /* - * It is a read which is fully beyond EOF. We return - * a !buffer_mapped buffer - */ - max_blocks = 0; - } - } - - bh->b_bdev = I_BDEV(inode); - bh->b_blocknr = iblock; - bh->b_size = max_blocks << inode->i_blkbits; - if (max_blocks) - set_buffer_mapped(bh); - return 0; -} - static ssize_t blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t offset, unsigned long nr_segs) @@ -217,7 +164,7 @@ blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, struct inode *inode = file->f_mapping->host; return __blockdev_direct_IO(rw, iocb, inode, I_BDEV(inode), iov, offset, - nr_segs, blkdev_get_blocks, NULL, NULL, 0); + nr_segs, blkdev_get_block, NULL, NULL, 0); } int __sync_blockdev(struct block_device *bdev, int wait) diff --git a/fs/buffer.c b/fs/buffer.c index 28a74ff..3586fb0 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -911,6 +911,18 @@ link_dev_buffers(struct page *page, struct buffer_head *head) attach_page_buffers(page, head); } +static sector_t blkdev_max_block(struct block_device *bdev, unsigned int size) +{ + sector_t retval = ~((sector_t)0); + loff_t sz = i_size_read(bdev->bd_inode); + + if (sz) { + unsigned int sizebits = blksize_bits(size); + retval = (sz >> sizebits); + } + return retval; +} + /* * Initialise the state of a blockdev page's buffers. */ @@ -921,7 +933,7 @@ init_page_buffers(struct page *page, struct block_device *bdev, struct buffer_head *head = page_buffers(page); struct buffer_head *bh = head; int uptodate = PageUptodate(page); - sector_t end_block = blkdev_max_block(I_BDEV(bdev->bd_inode)); + sector_t end_block = blkdev_max_block(I_BDEV(bdev->bd_inode), size); do { if (!buffer_mapped(bh)) { diff --git a/include/linux/fs.h b/include/linux/fs.h index 44f288e..75fe9a1 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2047,7 +2047,6 @@ extern void unregister_blkdev(unsigned int, const char *); extern struct block_device *bdget(dev_t); extern struct block_device *bdgrab(struct block_device *bdev); extern void bd_set_size(struct block_device *, loff_t size); -extern sector_t blkdev_max_block(struct block_device *bdev); extern void bd_forget(struct inode *inode); extern void bdput(struct block_device *); extern void invalidate_bdev(struct block_device *); -- cgit v0.10.2