diff options
Diffstat (limited to 'fs')
78 files changed, 1149 insertions, 519 deletions
diff --git a/fs/affs/file.c b/fs/affs/file.c index 0548c53..22fc7c8 100644 --- a/fs/affs/file.c +++ b/fs/affs/file.c @@ -511,8 +511,6 @@ affs_do_readpage_ofs(struct page *page, unsigned to) pr_debug("%s(%lu, %ld, 0, %d)\n", __func__, inode->i_ino, page->index, to); BUG_ON(to > PAGE_CACHE_SIZE); - kmap(page); - data = page_address(page); bsize = AFFS_SB(sb)->s_data_blksize; tmp = page->index << PAGE_CACHE_SHIFT; bidx = tmp / bsize; @@ -524,14 +522,15 @@ affs_do_readpage_ofs(struct page *page, unsigned to) return PTR_ERR(bh); tmp = min(bsize - boff, to - pos); BUG_ON(pos + tmp > to || tmp > bsize); + data = kmap_atomic(page); memcpy(data + pos, AFFS_DATA(bh) + boff, tmp); + kunmap_atomic(data); affs_brelse(bh); bidx++; pos += tmp; boff = 0; } flush_dcache_page(page); - kunmap(page); return 0; } diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 051ea48..7d914c6 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -653,7 +653,7 @@ static unsigned long randomize_stack_top(unsigned long stack_top) if ((current->flags & PF_RANDOMIZE) && !(current->personality & ADDR_NO_RANDOMIZE)) { - random_variable = (unsigned long) get_random_int(); + random_variable = get_random_long(); random_variable &= STACK_RND_MASK; random_variable <<= PAGE_SHIFT; } diff --git a/fs/block_dev.c b/fs/block_dev.c index 39b3a17..826b164 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1201,7 +1201,11 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) bdev->bd_disk = disk; bdev->bd_queue = disk->queue; bdev->bd_contains = bdev; - bdev->bd_inode->i_flags = disk->fops->direct_access ? S_DAX : 0; + if (IS_ENABLED(CONFIG_BLK_DEV_DAX) && disk->fops->direct_access) + bdev->bd_inode->i_flags = S_DAX; + else + bdev->bd_inode->i_flags = 0; + if (!partno) { ret = -ENXIO; bdev->bd_part = disk_get_part(disk, partno); @@ -1693,13 +1697,24 @@ static int blkdev_releasepage(struct page *page, gfp_t wait) return try_to_free_buffers(page); } +static int blkdev_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + if (dax_mapping(mapping)) { + struct block_device *bdev = I_BDEV(mapping->host); + + return dax_writeback_mapping_range(mapping, bdev, wbc); + } + return generic_writepages(mapping, wbc); +} + static const struct address_space_operations def_blk_aops = { .readpage = blkdev_readpage, .readpages = blkdev_readpages, .writepage = blkdev_writepage, .write_begin = blkdev_write_begin, .write_end = blkdev_write_end, - .writepages = generic_writepages, + .writepages = blkdev_writepages, .releasepage = blkdev_releasepage, .direct_IO = blkdev_direct_IO, .is_dirty_writeback = buffer_check_dirty_writeback, diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index b90cd37..f6dac40 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1406,7 +1406,8 @@ char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, read_extent_buffer(eb, dest + bytes_left, name_off, name_len); if (eb != eb_in) { - btrfs_tree_read_unlock_blocking(eb); + if (!path->skip_locking) + btrfs_tree_read_unlock_blocking(eb); free_extent_buffer(eb); } ret = btrfs_find_item(fs_root, path, parent, 0, @@ -1426,9 +1427,10 @@ char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, eb = path->nodes[0]; /* make sure we can use eb after releasing the path */ if (eb != eb_in) { - atomic_inc(&eb->refs); - btrfs_tree_read_lock(eb); - btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); + if (!path->skip_locking) + btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); + path->nodes[0] = NULL; + path->locks[0] = 0; } btrfs_release_path(path); iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref); diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index c473c42..3346cd8 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -637,11 +637,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, faili = nr_pages - 1; cb->nr_pages = nr_pages; - /* In the parent-locked case, we only locked the range we are - * interested in. In all other cases, we can opportunistically - * cache decompressed data that goes beyond the requested range. */ - if (!(bio_flags & EXTENT_BIO_PARENT_LOCKED)) - add_ra_bio_pages(inode, em_start + em_len, cb); + add_ra_bio_pages(inode, em_start + em_len, cb); /* include any pages we added in add_ra-bio_pages */ uncompressed_len = bio->bi_vcnt * PAGE_CACHE_SIZE; diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 0be47e4..b57daa8 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1689,7 +1689,7 @@ int btrfs_should_delete_dir_index(struct list_head *del_list, * */ int btrfs_readdir_delayed_dir_index(struct dir_context *ctx, - struct list_head *ins_list) + struct list_head *ins_list, bool *emitted) { struct btrfs_dir_item *di; struct btrfs_delayed_item *curr, *next; @@ -1733,6 +1733,7 @@ int btrfs_readdir_delayed_dir_index(struct dir_context *ctx, if (over) return 1; + *emitted = true; } return 0; } diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index f70119f..0167853 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -144,7 +144,7 @@ void btrfs_put_delayed_items(struct list_head *ins_list, int btrfs_should_delete_dir_index(struct list_head *del_list, u64 index); int btrfs_readdir_delayed_dir_index(struct dir_context *ctx, - struct list_head *ins_list); + struct list_head *ins_list, bool *emitted); /* for init */ int __init btrfs_delayed_inode_init(void); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 2e7c97a..392592d 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2897,12 +2897,11 @@ static int __do_readpage(struct extent_io_tree *tree, struct block_device *bdev; int ret; int nr = 0; - int parent_locked = *bio_flags & EXTENT_BIO_PARENT_LOCKED; size_t pg_offset = 0; size_t iosize; size_t disk_io_size; size_t blocksize = inode->i_sb->s_blocksize; - unsigned long this_bio_flag = *bio_flags & EXTENT_BIO_PARENT_LOCKED; + unsigned long this_bio_flag = 0; set_page_extent_mapped(page); @@ -2942,18 +2941,16 @@ static int __do_readpage(struct extent_io_tree *tree, kunmap_atomic(userpage); set_extent_uptodate(tree, cur, cur + iosize - 1, &cached, GFP_NOFS); - if (!parent_locked) - unlock_extent_cached(tree, cur, - cur + iosize - 1, - &cached, GFP_NOFS); + unlock_extent_cached(tree, cur, + cur + iosize - 1, + &cached, GFP_NOFS); break; } em = __get_extent_map(inode, page, pg_offset, cur, end - cur + 1, get_extent, em_cached); if (IS_ERR_OR_NULL(em)) { SetPageError(page); - if (!parent_locked) - unlock_extent(tree, cur, end); + unlock_extent(tree, cur, end); break; } extent_offset = cur - em->start; @@ -3038,12 +3035,9 @@ static int __do_readpage(struct extent_io_tree *tree, set_extent_uptodate(tree, cur, cur + iosize - 1, &cached, GFP_NOFS); - if (parent_locked) - free_extent_state(cached); - else - unlock_extent_cached(tree, cur, - cur + iosize - 1, - &cached, GFP_NOFS); + unlock_extent_cached(tree, cur, + cur + iosize - 1, + &cached, GFP_NOFS); cur = cur + iosize; pg_offset += iosize; continue; @@ -3052,8 +3046,7 @@ static int __do_readpage(struct extent_io_tree *tree, if (test_range_bit(tree, cur, cur_end, EXTENT_UPTODATE, 1, NULL)) { check_page_uptodate(tree, page); - if (!parent_locked) - unlock_extent(tree, cur, cur + iosize - 1); + unlock_extent(tree, cur, cur + iosize - 1); cur = cur + iosize; pg_offset += iosize; continue; @@ -3063,8 +3056,7 @@ static int __do_readpage(struct extent_io_tree *tree, */ if (block_start == EXTENT_MAP_INLINE) { SetPageError(page); - if (!parent_locked) - unlock_extent(tree, cur, cur + iosize - 1); + unlock_extent(tree, cur, cur + iosize - 1); cur = cur + iosize; pg_offset += iosize; continue; @@ -3083,8 +3075,7 @@ static int __do_readpage(struct extent_io_tree *tree, *bio_flags = this_bio_flag; } else { SetPageError(page); - if (!parent_locked) - unlock_extent(tree, cur, cur + iosize - 1); + unlock_extent(tree, cur, cur + iosize - 1); } cur = cur + iosize; pg_offset += iosize; @@ -3213,20 +3204,6 @@ int extent_read_full_page(struct extent_io_tree *tree, struct page *page, return ret; } -int extent_read_full_page_nolock(struct extent_io_tree *tree, struct page *page, - get_extent_t *get_extent, int mirror_num) -{ - struct bio *bio = NULL; - unsigned long bio_flags = EXTENT_BIO_PARENT_LOCKED; - int ret; - - ret = __do_readpage(tree, page, get_extent, NULL, &bio, mirror_num, - &bio_flags, READ, NULL); - if (bio) - ret = submit_one_bio(READ, bio, mirror_num, bio_flags); - return ret; -} - static noinline void update_nr_written(struct page *page, struct writeback_control *wbc, unsigned long nr_written) diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 0377413..880d529 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -29,7 +29,6 @@ */ #define EXTENT_BIO_COMPRESSED 1 #define EXTENT_BIO_TREE_LOG 2 -#define EXTENT_BIO_PARENT_LOCKED 4 #define EXTENT_BIO_FLAG_SHIFT 16 /* these are bit numbers for test/set bit */ @@ -210,8 +209,6 @@ static inline int lock_extent(struct extent_io_tree *tree, u64 start, u64 end) int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end); int extent_read_full_page(struct extent_io_tree *tree, struct page *page, get_extent_t *get_extent, int mirror_num); -int extent_read_full_page_nolock(struct extent_io_tree *tree, struct page *page, - get_extent_t *get_extent, int mirror_num); int __init extent_io_init(void); void extent_io_exit(void); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 5f06eb1..d96f5cf 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5717,6 +5717,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) char *name_ptr; int name_len; int is_curr = 0; /* ctx->pos points to the current index? */ + bool emitted; /* FIXME, use a real flag for deciding about the key type */ if (root->fs_info->tree_root == root) @@ -5745,6 +5746,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) if (ret < 0) goto err; + emitted = false; while (1) { leaf = path->nodes[0]; slot = path->slots[0]; @@ -5824,6 +5826,7 @@ skip: if (over) goto nopos; + emitted = true; di_len = btrfs_dir_name_len(leaf, di) + btrfs_dir_data_len(leaf, di) + sizeof(*di); di_cur += di_len; @@ -5836,11 +5839,20 @@ next: if (key_type == BTRFS_DIR_INDEX_KEY) { if (is_curr) ctx->pos++; - ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list); + ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list, &emitted); if (ret) goto nopos; } + /* + * If we haven't emitted any dir entry, we must not touch ctx->pos as + * it was was set to the termination value in previous call. We assume + * that "." and ".." were emitted if we reach this point and set the + * termination value as well for an empty directory. + */ + if (ctx->pos > 2 && !emitted) + goto nopos; + /* Reached end of directory/root. Bump pos past the last item. */ ctx->pos++; @@ -7974,6 +7986,7 @@ static void btrfs_endio_direct_read(struct bio *bio) kfree(dip); + dio_bio->bi_error = bio->bi_error; dio_end_io(dio_bio, bio->bi_error); if (io_bio->end_io) @@ -8028,6 +8041,7 @@ static void btrfs_endio_direct_write(struct bio *bio) kfree(dip); + dio_bio->bi_error = bio->bi_error; dio_end_io(dio_bio, bio->bi_error); bio_put(bio); } diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 952172c..48aee98 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2794,24 +2794,29 @@ out: static struct page *extent_same_get_page(struct inode *inode, pgoff_t index) { struct page *page; - struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; page = grab_cache_page(inode->i_mapping, index); if (!page) - return NULL; + return ERR_PTR(-ENOMEM); if (!PageUptodate(page)) { - if (extent_read_full_page_nolock(tree, page, btrfs_get_extent, - 0)) - return NULL; + int ret; + + ret = btrfs_readpage(NULL, page); + if (ret) + return ERR_PTR(ret); lock_page(page); if (!PageUptodate(page)) { unlock_page(page); page_cache_release(page); - return NULL; + return ERR_PTR(-EIO); + } + if (page->mapping != inode->i_mapping) { + unlock_page(page); + page_cache_release(page); + return ERR_PTR(-EAGAIN); } } - unlock_page(page); return page; } @@ -2823,17 +2828,31 @@ static int gather_extent_pages(struct inode *inode, struct page **pages, pgoff_t index = off >> PAGE_CACHE_SHIFT; for (i = 0; i < num_pages; i++) { +again: pages[i] = extent_same_get_page(inode, index + i); - if (!pages[i]) - return -ENOMEM; + if (IS_ERR(pages[i])) { + int err = PTR_ERR(pages[i]); + + if (err == -EAGAIN) + goto again; + pages[i] = NULL; + return err; + } } return 0; } -static inline void lock_extent_range(struct inode *inode, u64 off, u64 len) +static int lock_extent_range(struct inode *inode, u64 off, u64 len, + bool retry_range_locking) { - /* do any pending delalloc/csum calc on src, one way or - another, and lock file content */ + /* + * Do any pending delalloc/csum calculations on inode, one way or + * another, and lock file content. + * The locking order is: + * + * 1) pages + * 2) range in the inode's io tree + */ while (1) { struct btrfs_ordered_extent *ordered; lock_extent(&BTRFS_I(inode)->io_tree, off, off + len - 1); @@ -2851,8 +2870,11 @@ static inline void lock_extent_range(struct inode *inode, u64 off, u64 len) unlock_extent(&BTRFS_I(inode)->io_tree, off, off + len - 1); if (ordered) btrfs_put_ordered_extent(ordered); + if (!retry_range_locking) + return -EAGAIN; btrfs_wait_ordered_range(inode, off, len); } + return 0; } static void btrfs_double_inode_unlock(struct inode *inode1, struct inode *inode2) @@ -2877,15 +2899,24 @@ static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1, unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1); } -static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1, - struct inode *inode2, u64 loff2, u64 len) +static int btrfs_double_extent_lock(struct inode *inode1, u64 loff1, + struct inode *inode2, u64 loff2, u64 len, + bool retry_range_locking) { + int ret; + if (inode1 < inode2) { swap(inode1, inode2); swap(loff1, loff2); } - lock_extent_range(inode1, loff1, len); - lock_extent_range(inode2, loff2, len); + ret = lock_extent_range(inode1, loff1, len, retry_range_locking); + if (ret) + return ret; + ret = lock_extent_range(inode2, loff2, len, retry_range_locking); + if (ret) + unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, + loff1 + len - 1); + return ret; } struct cmp_pages { @@ -2901,11 +2932,15 @@ static void btrfs_cmp_data_free(struct cmp_pages *cmp) for (i = 0; i < cmp->num_pages; i++) { pg = cmp->src_pages[i]; - if (pg) + if (pg) { + unlock_page(pg); page_cache_release(pg); + } pg = cmp->dst_pages[i]; - if (pg) + if (pg) { + unlock_page(pg); page_cache_release(pg); + } } kfree(cmp->src_pages); kfree(cmp->dst_pages); @@ -2966,6 +3001,8 @@ static int btrfs_cmp_data(struct inode *src, u64 loff, struct inode *dst, src_page = cmp->src_pages[i]; dst_page = cmp->dst_pages[i]; + ASSERT(PageLocked(src_page)); + ASSERT(PageLocked(dst_page)); addr = kmap_atomic(src_page); dst_addr = kmap_atomic(dst_page); @@ -3078,14 +3115,46 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen, goto out_unlock; } +again: ret = btrfs_cmp_data_prepare(src, loff, dst, dst_loff, olen, &cmp); if (ret) goto out_unlock; if (same_inode) - lock_extent_range(src, same_lock_start, same_lock_len); + ret = lock_extent_range(src, same_lock_start, same_lock_len, + false); else - btrfs_double_extent_lock(src, loff, dst, dst_loff, len); + ret = btrfs_double_extent_lock(src, loff, dst, dst_loff, len, + false); + /* + * If one of the inodes has dirty pages in the respective range or + * ordered extents, we need to flush dellaloc and wait for all ordered + * extents in the range. We must unlock the pages and the ranges in the + * io trees to avoid deadlocks when flushing delalloc (requires locking + * pages) and when waiting for ordered extents to complete (they require + * range locking). + */ + if (ret == -EAGAIN) { + /* + * Ranges in the io trees already unlocked. Now unlock all + * pages before waiting for all IO to complete. + */ + btrfs_cmp_data_free(&cmp); + if (same_inode) { + btrfs_wait_ordered_range(src, same_lock_start, + same_lock_len); + } else { + btrfs_wait_ordered_range(src, loff, len); + btrfs_wait_ordered_range(dst, dst_loff, len); + } + goto again; + } + ASSERT(ret == 0); + if (WARN_ON(ret)) { + /* ranges in the io trees already unlocked */ + btrfs_cmp_data_free(&cmp); + return ret; + } /* pass original length for comparison so we stay within i_size */ ret = btrfs_cmp_data(src, loff, dst, dst_loff, olen, &cmp); @@ -3795,9 +3864,15 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src, u64 lock_start = min_t(u64, off, destoff); u64 lock_len = max_t(u64, off, destoff) + len - lock_start; - lock_extent_range(src, lock_start, lock_len); + ret = lock_extent_range(src, lock_start, lock_len, true); } else { - btrfs_double_extent_lock(src, off, inode, destoff, len); + ret = btrfs_double_extent_lock(src, off, inode, destoff, len, + true); + } + ASSERT(ret == 0); + if (WARN_ON(ret)) { + /* ranges in the io trees already unlocked */ + goto out_unlock; } ret = btrfs_clone(src, inode, off, olen, len, destoff, 0); diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 7cf8509..2c849b0 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -310,8 +310,16 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root) set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state); err = btrfs_insert_fs_root(root->fs_info, root); + /* + * The root might have been inserted already, as before we look + * for orphan roots, log replay might have happened, which + * triggers a transaction commit and qgroup accounting, which + * in turn reads and inserts fs roots while doing backref + * walking. + */ + if (err == -EEXIST) + err = 0; if (err) { - BUG_ON(err == -EEXIST); btrfs_free_fs_root(root); break; } diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index c222137..19adeb0 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1756,6 +1756,10 @@ int ceph_pool_perm_check(struct ceph_inode_info *ci, int need) u32 pool; int ret, flags; + /* does not support pool namespace yet */ + if (ci->i_pool_ns_len) + return -EIO; + if (ceph_test_mount_opt(ceph_inode_to_client(&ci->vfs_inode), NOPOOLPERM)) return 0; diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index cdbf8cf..6fe0ad2 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2753,7 +2753,8 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc, void *inline_data, int inline_len, struct ceph_buffer *xattr_buf, struct ceph_mds_session *session, - struct ceph_cap *cap, int issued) + struct ceph_cap *cap, int issued, + u32 pool_ns_len) __releases(ci->i_ceph_lock) __releases(mdsc->snap_rwsem) { @@ -2873,6 +2874,8 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc, if (newcaps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)) { /* file layout may have changed */ ci->i_layout = grant->layout; + ci->i_pool_ns_len = pool_ns_len; + /* size/truncate_seq? */ queue_trunc = ceph_fill_file_size(inode, issued, le32_to_cpu(grant->truncate_seq), @@ -3411,6 +3414,7 @@ void ceph_handle_caps(struct ceph_mds_session *session, u32 inline_len = 0; void *snaptrace; size_t snaptrace_len; + u32 pool_ns_len = 0; void *p, *end; dout("handle_caps from mds%d\n", mds); @@ -3463,6 +3467,21 @@ void ceph_handle_caps(struct ceph_mds_session *session, p += inline_len; } + if (le16_to_cpu(msg->hdr.version) >= 8) { + u64 flush_tid; + u32 caller_uid, caller_gid; + u32 osd_epoch_barrier; + /* version >= 5 */ + ceph_decode_32_safe(&p, end, osd_epoch_barrier, bad); + /* version >= 6 */ + ceph_decode_64_safe(&p, end, flush_tid, bad); + /* version >= 7 */ + ceph_decode_32_safe(&p, end, caller_uid, bad); + ceph_decode_32_safe(&p, end, caller_gid, bad); + /* version >= 8 */ + ceph_decode_32_safe(&p, end, pool_ns_len, bad); + } + /* lookup ino */ inode = ceph_find_inode(sb, vino); ci = ceph_inode(inode); @@ -3518,7 +3537,8 @@ void ceph_handle_caps(struct ceph_mds_session *session, &cap, &issued); handle_cap_grant(mdsc, inode, h, inline_version, inline_data, inline_len, - msg->middle, session, cap, issued); + msg->middle, session, cap, issued, + pool_ns_len); if (realm) ceph_put_snap_realm(mdsc, realm); goto done_unlocked; @@ -3542,7 +3562,8 @@ void ceph_handle_caps(struct ceph_mds_session *session, issued |= __ceph_caps_dirty(ci); handle_cap_grant(mdsc, inode, h, inline_version, inline_data, inline_len, - msg->middle, session, cap, issued); + msg->middle, session, cap, issued, + pool_ns_len); goto done_unlocked; case CEPH_CAP_OP_FLUSH_ACK: diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index fb4ba2e..5849b88 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -396,6 +396,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb) ci->i_symlink = NULL; memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout)); + ci->i_pool_ns_len = 0; ci->i_fragtree = RB_ROOT; mutex_init(&ci->i_fragtree_mutex); @@ -756,6 +757,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page, if (ci->i_layout.fl_pg_pool != info->layout.fl_pg_pool) ci->i_ceph_flags &= ~CEPH_I_POOL_PERM; ci->i_layout = info->layout; + ci->i_pool_ns_len = iinfo->pool_ns_len; queue_trunc = ceph_fill_file_size(inode, issued, le32_to_cpu(info->truncate_seq), diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index e7b130a..911d64d 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -100,6 +100,14 @@ static int parse_reply_info_in(void **p, void *end, } else info->inline_version = CEPH_INLINE_NONE; + if (features & CEPH_FEATURE_FS_FILE_LAYOUT_V2) { + ceph_decode_32_safe(p, end, info->pool_ns_len, bad); + ceph_decode_need(p, end, info->pool_ns_len, bad); + *p += info->pool_ns_len; + } else { + info->pool_ns_len = 0; + } + return 0; bad: return err; @@ -2298,6 +2306,14 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc, ceph_get_cap_refs(ceph_inode(req->r_old_dentry_dir), CEPH_CAP_PIN); + /* deny access to directories with pool_ns layouts */ + if (req->r_inode && S_ISDIR(req->r_inode->i_mode) && + ceph_inode(req->r_inode)->i_pool_ns_len) + return -EIO; + if (req->r_locked_dir && + ceph_inode(req->r_locked_dir)->i_pool_ns_len) + return -EIO; + /* issue */ mutex_lock(&mdsc->mutex); __register_request(mdsc, req, dir); diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index ccf11ef..37712cc 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -44,6 +44,7 @@ struct ceph_mds_reply_info_in { u64 inline_version; u32 inline_len; char *inline_data; + u32 pool_ns_len; }; /* diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 75b7d12..9c458eb 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -287,6 +287,7 @@ struct ceph_inode_info { struct ceph_dir_layout i_dir_layout; struct ceph_file_layout i_layout; + size_t i_pool_ns_len; char *i_symlink; /* for dirs */ diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c index 7dc886c..e956cba 100644 --- a/fs/cifs/cifs_dfs_ref.c +++ b/fs/cifs/cifs_dfs_ref.c @@ -175,7 +175,7 @@ char *cifs_compose_mount_options(const char *sb_mountdata, * string to the length of the original string to allow for worst case. */ md_len = strlen(sb_mountdata) + INET6_ADDRSTRLEN; - mountdata = kzalloc(md_len + 1, GFP_KERNEL); + mountdata = kzalloc(md_len + sizeof("ip=") + 1, GFP_KERNEL); if (mountdata == NULL) { rc = -ENOMEM; goto compose_mount_options_err; diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index afa09fc..e682b36 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -714,7 +714,7 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp) ses->auth_key.response = kmalloc(baselen + tilen, GFP_KERNEL); if (!ses->auth_key.response) { - rc = ENOMEM; + rc = -ENOMEM; ses->auth_key.len = 0; goto setup_ntlmv2_rsp_ret; } diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index c48ca13..2eea403 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -1013,7 +1013,6 @@ const struct file_operations cifs_file_strict_ops = { .llseek = cifs_llseek, .unlocked_ioctl = cifs_ioctl, .clone_file_range = cifs_clone_file_range, - .clone_file_range = cifs_clone_file_range, .setlease = cifs_setlease, .fallocate = cifs_fallocate, }; diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 68c4547..83aac8b 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -31,19 +31,15 @@ * so that it will fit. We use hash_64 to convert the value to 31 bits, and * then add 1, to ensure that we don't end up with a 0 as the value. */ -#if BITS_PER_LONG == 64 static inline ino_t cifs_uniqueid_to_ino_t(u64 fileid) { + if ((sizeof(ino_t)) < (sizeof(u64))) + return (ino_t)hash_64(fileid, (sizeof(ino_t) * 8) - 1) + 1; + return (ino_t)fileid; + } -#else -static inline ino_t -cifs_uniqueid_to_ino_t(u64 fileid) -{ - return (ino_t)hash_64(fileid, (sizeof(ino_t) * 8) - 1) + 1; -} -#endif extern struct file_system_type cifs_fs_type; extern const struct address_space_operations cifs_addr_ops; diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 90b4f9f..76fcb50 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -1396,11 +1396,10 @@ openRetry: * current bigbuf. */ static int -cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid) +discard_remaining_data(struct TCP_Server_Info *server) { unsigned int rfclen = get_rfc1002_length(server->smallbuf); int remaining = rfclen + 4 - server->total_read; - struct cifs_readdata *rdata = mid->callback_data; while (remaining > 0) { int length; @@ -1414,10 +1413,20 @@ cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid) remaining -= length; } - dequeue_mid(mid, rdata->result); return 0; } +static int +cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid) +{ + int length; + struct cifs_readdata *rdata = mid->callback_data; + + length = discard_remaining_data(server); + dequeue_mid(mid, rdata->result); + return length; +} + int cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid) { @@ -1446,6 +1455,12 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid) return length; server->total_read += length; + if (server->ops->is_status_pending && + server->ops->is_status_pending(buf, server, 0)) { + discard_remaining_data(server); + return -1; + } + /* Was the SMB read successful? */ rdata->result = server->ops->map_error(buf, false); if (rdata->result != 0) { diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 4fbd92d..a763cd3 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -2999,8 +2999,7 @@ ip_rfc1001_connect(struct TCP_Server_Info *server) if (ses_init_buf) { ses_init_buf->trailer.session_req.called_len = 32; - if (server->server_RFC1001_name && - server->server_RFC1001_name[0] != 0) + if (server->server_RFC1001_name[0] != 0) rfc1002mangle(ses_init_buf->trailer. session_req.called_name, server->server_RFC1001_name, diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 10f8d5c..42e1f44 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -1106,21 +1106,25 @@ parse_lease_state(struct TCP_Server_Info *server, struct smb2_create_rsp *rsp, { char *data_offset; struct create_context *cc; - unsigned int next = 0; + unsigned int next; + unsigned int remaining; char *name; data_offset = (char *)rsp + 4 + le32_to_cpu(rsp->CreateContextsOffset); + remaining = le32_to_cpu(rsp->CreateContextsLength); cc = (struct create_context *)data_offset; - do { - cc = (struct create_context *)((char *)cc + next); + while (remaining >= sizeof(struct create_context)) { name = le16_to_cpu(cc->NameOffset) + (char *)cc; - if (le16_to_cpu(cc->NameLength) != 4 || - strncmp(name, "RqLs", 4)) { - next = le32_to_cpu(cc->Next); - continue; - } - return server->ops->parse_lease_buf(cc, epoch); - } while (next != 0); + if (le16_to_cpu(cc->NameLength) == 4 && + strncmp(name, "RqLs", 4) == 0) + return server->ops->parse_lease_buf(cc, epoch); + + next = le32_to_cpu(cc->Next); + if (!next) + break; + remaining -= next; + cc = (struct create_context *)((char *)cc + next); + } return 0; } @@ -79,15 +79,14 @@ struct page *read_dax_sector(struct block_device *bdev, sector_t n) } /* - * dax_clear_blocks() is called from within transaction context from XFS, + * dax_clear_sectors() is called from within transaction context from XFS, * and hence this means the stack from this point must follow GFP_NOFS * semantics for all operations. */ -int dax_clear_blocks(struct inode *inode, sector_t block, long _size) +int dax_clear_sectors(struct block_device *bdev, sector_t _sector, long _size) { - struct block_device *bdev = inode->i_sb->s_bdev; struct blk_dax_ctl dax = { - .sector = block << (inode->i_blkbits - 9), + .sector = _sector, .size = _size, }; @@ -109,7 +108,7 @@ int dax_clear_blocks(struct inode *inode, sector_t block, long _size) wmb_pmem(); return 0; } -EXPORT_SYMBOL_GPL(dax_clear_blocks); +EXPORT_SYMBOL_GPL(dax_clear_sectors); /* the clear_pmem() calls are ordered by a wmb_pmem() in the caller */ static void dax_new_buf(void __pmem *addr, unsigned size, unsigned first, @@ -485,11 +484,10 @@ static int dax_writeback_one(struct block_device *bdev, * end]. This is required by data integrity operations to ensure file data is * on persistent storage prior to completion of the operation. */ -int dax_writeback_mapping_range(struct address_space *mapping, loff_t start, - loff_t end) +int dax_writeback_mapping_range(struct address_space *mapping, + struct block_device *bdev, struct writeback_control *wbc) { struct inode *inode = mapping->host; - struct block_device *bdev = inode->i_sb->s_bdev; pgoff_t start_index, end_index, pmd_index; pgoff_t indices[PAGEVEC_SIZE]; struct pagevec pvec; @@ -500,8 +498,11 @@ int dax_writeback_mapping_range(struct address_space *mapping, loff_t start, if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT)) return -EIO; - start_index = start >> PAGE_CACHE_SHIFT; - end_index = end >> PAGE_CACHE_SHIFT; + if (!mapping->nrexceptional || wbc->sync_mode != WB_SYNC_ALL) + return 0; + + start_index = wbc->range_start >> PAGE_CACHE_SHIFT; + end_index = wbc->range_end >> PAGE_CACHE_SHIFT; pmd_index = DAX_PMD_INDEX(start_index); rcu_read_lock(); @@ -1055,6 +1056,7 @@ EXPORT_SYMBOL_GPL(dax_pmd_fault); int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { struct file *file = vma->vm_file; + int error; /* * We pass NO_SECTOR to dax_radix_entry() because we expect that a @@ -1064,7 +1066,13 @@ int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) * saves us from having to make a call to get_block() here to look * up the sector. */ - dax_radix_entry(file->f_mapping, vmf->pgoff, NO_SECTOR, false, true); + error = dax_radix_entry(file->f_mapping, vmf->pgoff, NO_SECTOR, false, + true); + + if (error == -ENOMEM) + return VM_FAULT_OOM; + if (error) + return VM_FAULT_SIGBUS; return VM_FAULT_NOPAGE; } EXPORT_SYMBOL_GPL(dax_pfn_mkwrite); diff --git a/fs/dcache.c b/fs/dcache.c index 92d5140..2398f9f9 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -269,9 +269,6 @@ static inline int dname_external(const struct dentry *dentry) return dentry->d_name.name != dentry->d_iname; } -/* - * Make sure other CPUs see the inode attached before the type is set. - */ static inline void __d_set_inode_and_type(struct dentry *dentry, struct inode *inode, unsigned type_flags) @@ -279,28 +276,18 @@ static inline void __d_set_inode_and_type(struct dentry *dentry, unsigned flags; dentry->d_inode = inode; - smp_wmb(); flags = READ_ONCE(dentry->d_flags); flags &= ~(DCACHE_ENTRY_TYPE | DCACHE_FALLTHRU); flags |= type_flags; WRITE_ONCE(dentry->d_flags, flags); } -/* - * Ideally, we want to make sure that other CPUs see the flags cleared before - * the inode is detached, but this is really a violation of RCU principles - * since the ordering suggests we should always set inode before flags. - * - * We should instead replace or discard the entire dentry - but that sucks - * performancewise on mass deletion/rename. - */ static inline void __d_clear_type_and_inode(struct dentry *dentry) { unsigned flags = READ_ONCE(dentry->d_flags); flags &= ~(DCACHE_ENTRY_TYPE | DCACHE_FALLTHRU); WRITE_ONCE(dentry->d_flags, flags); - smp_wmb(); dentry->d_inode = NULL; } @@ -370,9 +357,11 @@ static void dentry_unlink_inode(struct dentry * dentry) __releases(dentry->d_inode->i_lock) { struct inode *inode = dentry->d_inode; + + raw_write_seqcount_begin(&dentry->d_seq); __d_clear_type_and_inode(dentry); hlist_del_init(&dentry->d_u.d_alias); - dentry_rcuwalk_invalidate(dentry); + raw_write_seqcount_end(&dentry->d_seq); spin_unlock(&dentry->d_lock); spin_unlock(&inode->i_lock); if (!inode->i_nlink) @@ -1758,8 +1747,9 @@ static void __d_instantiate(struct dentry *dentry, struct inode *inode) spin_lock(&dentry->d_lock); if (inode) hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry); + raw_write_seqcount_begin(&dentry->d_seq); __d_set_inode_and_type(dentry, inode, add_flags); - dentry_rcuwalk_invalidate(dentry); + raw_write_seqcount_end(&dentry->d_seq); spin_unlock(&dentry->d_lock); fsnotify_d_instantiate(dentry, inode); } diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index 1f107fd..655f21f 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -575,6 +575,26 @@ void devpts_kill_index(struct inode *ptmx_inode, int idx) mutex_unlock(&allocated_ptys_lock); } +/* + * pty code needs to hold extra references in case of last /dev/tty close + */ + +void devpts_add_ref(struct inode *ptmx_inode) +{ + struct super_block *sb = pts_sb_from_inode(ptmx_inode); + + atomic_inc(&sb->s_active); + ihold(ptmx_inode); +} + +void devpts_del_ref(struct inode *ptmx_inode) +{ + struct super_block *sb = pts_sb_from_inode(ptmx_inode); + + iput(ptmx_inode); + deactivate_super(sb); +} + /** * devpts_pty_new -- create a new inode in /dev/pts/ * @ptmx_inode: inode of the master diff --git a/fs/direct-io.c b/fs/direct-io.c index 1b2f7ff..d6a9012 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -472,8 +472,8 @@ static int dio_bio_complete(struct dio *dio, struct bio *bio) dio->io_error = -EIO; if (dio->is_async && dio->rw == READ && dio->should_dirty) { - bio_check_pages_dirty(bio); /* transfers ownership */ err = bio->bi_error; + bio_check_pages_dirty(bio); /* transfers ownership */ } else { bio_for_each_segment_all(bvec, bio, i) { struct page *page = bvec->bv_page; diff --git a/fs/efivarfs/file.c b/fs/efivarfs/file.c index c424e48..d48e0d2 100644 --- a/fs/efivarfs/file.c +++ b/fs/efivarfs/file.c @@ -10,6 +10,7 @@ #include <linux/efi.h> #include <linux/fs.h> #include <linux/slab.h> +#include <linux/mount.h> #include "internal.h" @@ -103,9 +104,78 @@ out_free: return size; } +static int +efivarfs_ioc_getxflags(struct file *file, void __user *arg) +{ + struct inode *inode = file->f_mapping->host; + unsigned int i_flags; + unsigned int flags = 0; + + i_flags = inode->i_flags; + if (i_flags & S_IMMUTABLE) + flags |= FS_IMMUTABLE_FL; + + if (copy_to_user(arg, &flags, sizeof(flags))) + return -EFAULT; + return 0; +} + +static int +efivarfs_ioc_setxflags(struct file *file, void __user *arg) +{ + struct inode *inode = file->f_mapping->host; + unsigned int flags; + unsigned int i_flags = 0; + int error; + + if (!inode_owner_or_capable(inode)) + return -EACCES; + + if (copy_from_user(&flags, arg, sizeof(flags))) + return -EFAULT; + + if (flags & ~FS_IMMUTABLE_FL) + return -EOPNOTSUPP; + + if (!capable(CAP_LINUX_IMMUTABLE)) + return -EPERM; + + if (flags & FS_IMMUTABLE_FL) + i_flags |= S_IMMUTABLE; + + + error = mnt_want_write_file(file); + if (error) + return error; + + inode_lock(inode); + inode_set_flags(inode, i_flags, S_IMMUTABLE); + inode_unlock(inode); + + mnt_drop_write_file(file); + + return 0; +} + +long +efivarfs_file_ioctl(struct file *file, unsigned int cmd, unsigned long p) +{ + void __user *arg = (void __user *)p; + + switch (cmd) { + case FS_IOC_GETFLAGS: + return efivarfs_ioc_getxflags(file, arg); + case FS_IOC_SETFLAGS: + return efivarfs_ioc_setxflags(file, arg); + } + + return -ENOTTY; +} + const struct file_operations efivarfs_file_operations = { .open = simple_open, .read = efivarfs_file_read, .write = efivarfs_file_write, .llseek = no_llseek, + .unlocked_ioctl = efivarfs_file_ioctl, }; diff --git a/fs/efivarfs/inode.c b/fs/efivarfs/inode.c index 3381b9d..e2ab6d0 100644 --- a/fs/efivarfs/inode.c +++ b/fs/efivarfs/inode.c @@ -15,7 +15,8 @@ #include "internal.h" struct inode *efivarfs_get_inode(struct super_block *sb, - const struct inode *dir, int mode, dev_t dev) + const struct inode *dir, int mode, + dev_t dev, bool is_removable) { struct inode *inode = new_inode(sb); @@ -23,6 +24,7 @@ struct inode *efivarfs_get_inode(struct super_block *sb, inode->i_ino = get_next_ino(); inode->i_mode = mode; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + inode->i_flags = is_removable ? 0 : S_IMMUTABLE; switch (mode & S_IFMT) { case S_IFREG: inode->i_fop = &efivarfs_file_operations; @@ -102,22 +104,17 @@ static void efivarfs_hex_to_guid(const char *str, efi_guid_t *guid) static int efivarfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { - struct inode *inode; + struct inode *inode = NULL; struct efivar_entry *var; int namelen, i = 0, err = 0; + bool is_removable = false; if (!efivarfs_valid_name(dentry->d_name.name, dentry->d_name.len)) return -EINVAL; - inode = efivarfs_get_inode(dir->i_sb, dir, mode, 0); - if (!inode) - return -ENOMEM; - var = kzalloc(sizeof(struct efivar_entry), GFP_KERNEL); - if (!var) { - err = -ENOMEM; - goto out; - } + if (!var) + return -ENOMEM; /* length of the variable name itself: remove GUID and separator */ namelen = dentry->d_name.len - EFI_VARIABLE_GUID_LEN - 1; @@ -125,6 +122,16 @@ static int efivarfs_create(struct inode *dir, struct dentry *dentry, efivarfs_hex_to_guid(dentry->d_name.name + namelen + 1, &var->var.VendorGuid); + if (efivar_variable_is_removable(var->var.VendorGuid, + dentry->d_name.name, namelen)) + is_removable = true; + + inode = efivarfs_get_inode(dir->i_sb, dir, mode, 0, is_removable); + if (!inode) { + err = -ENOMEM; + goto out; + } + for (i = 0; i < namelen; i++) var->var.VariableName[i] = dentry->d_name.name[i]; @@ -138,7 +145,8 @@ static int efivarfs_create(struct inode *dir, struct dentry *dentry, out: if (err) { kfree(var); - iput(inode); + if (inode) + iput(inode); } return err; } diff --git a/fs/efivarfs/internal.h b/fs/efivarfs/internal.h index b5ff16a..b450518 100644 --- a/fs/efivarfs/internal.h +++ b/fs/efivarfs/internal.h @@ -15,7 +15,8 @@ extern const struct file_operations efivarfs_file_operations; extern const struct inode_operations efivarfs_dir_inode_operations; extern bool efivarfs_valid_name(const char *str, int len); extern struct inode *efivarfs_get_inode(struct super_block *sb, - const struct inode *dir, int mode, dev_t dev); + const struct inode *dir, int mode, dev_t dev, + bool is_removable); extern struct list_head efivarfs_list; diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c index b8a564f..dd029d1 100644 --- a/fs/efivarfs/super.c +++ b/fs/efivarfs/super.c @@ -118,8 +118,9 @@ static int efivarfs_callback(efi_char16_t *name16, efi_guid_t vendor, struct dentry *dentry, *root = sb->s_root; unsigned long size = 0; char *name; - int len, i; + int len; int err = -ENOMEM; + bool is_removable = false; entry = kzalloc(sizeof(*entry), GFP_KERNEL); if (!entry) @@ -128,15 +129,17 @@ static int efivarfs_callback(efi_char16_t *name16, efi_guid_t vendor, memcpy(entry->var.VariableName, name16, name_size); memcpy(&(entry->var.VendorGuid), &vendor, sizeof(efi_guid_t)); - len = ucs2_strlen(entry->var.VariableName); + len = ucs2_utf8size(entry->var.VariableName); /* name, plus '-', plus GUID, plus NUL*/ name = kmalloc(len + 1 + EFI_VARIABLE_GUID_LEN + 1, GFP_KERNEL); if (!name) goto fail; - for (i = 0; i < len; i++) - name[i] = entry->var.VariableName[i] & 0xFF; + ucs2_as_utf8(name, entry->var.VariableName, len); + + if (efivar_variable_is_removable(entry->var.VendorGuid, name, len)) + is_removable = true; name[len] = '-'; @@ -144,7 +147,8 @@ static int efivarfs_callback(efi_char16_t *name16, efi_guid_t vendor, name[len + EFI_VARIABLE_GUID_LEN+1] = '\0'; - inode = efivarfs_get_inode(sb, d_inode(root), S_IFREG | 0644, 0); + inode = efivarfs_get_inode(sb, d_inode(root), S_IFREG | 0644, 0, + is_removable); if (!inode) goto fail_name; @@ -200,7 +204,7 @@ static int efivarfs_fill_super(struct super_block *sb, void *data, int silent) sb->s_d_op = &efivarfs_d_ops; sb->s_time_gran = 1; - inode = efivarfs_get_inode(sb, NULL, S_IFDIR | 0755, 0); + inode = efivarfs_get_inode(sb, NULL, S_IFDIR | 0755, 0, true); if (!inode) return -ENOMEM; inode->i_op = &efivarfs_dir_inode_operations; diff --git a/fs/ext2/file.c b/fs/ext2/file.c index 2c88d68..c1400b1 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -80,23 +80,6 @@ static int ext2_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr, return ret; } -static int ext2_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) -{ - struct inode *inode = file_inode(vma->vm_file); - struct ext2_inode_info *ei = EXT2_I(inode); - int ret; - - sb_start_pagefault(inode->i_sb); - file_update_time(vma->vm_file); - down_read(&ei->dax_sem); - - ret = __dax_mkwrite(vma, vmf, ext2_get_block, NULL); - - up_read(&ei->dax_sem); - sb_end_pagefault(inode->i_sb); - return ret; -} - static int ext2_dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { @@ -124,7 +107,7 @@ static int ext2_dax_pfn_mkwrite(struct vm_area_struct *vma, static const struct vm_operations_struct ext2_dax_vm_ops = { .fault = ext2_dax_fault, .pmd_fault = ext2_dax_pmd_fault, - .page_mkwrite = ext2_dax_mkwrite, + .page_mkwrite = ext2_dax_fault, .pfn_mkwrite = ext2_dax_pfn_mkwrite, }; diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 338eefd..6bd58e6 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -737,8 +737,10 @@ static int ext2_get_blocks(struct inode *inode, * so that it's not found by another thread before it's * initialised */ - err = dax_clear_blocks(inode, le32_to_cpu(chain[depth-1].key), - 1 << inode->i_blkbits); + err = dax_clear_sectors(inode->i_sb->s_bdev, + le32_to_cpu(chain[depth-1].key) << + (inode->i_blkbits - 9), + 1 << inode->i_blkbits); if (err) { mutex_unlock(&ei->truncate_mutex); goto cleanup; @@ -874,6 +876,14 @@ ext2_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset) static int ext2_writepages(struct address_space *mapping, struct writeback_control *wbc) { +#ifdef CONFIG_FS_DAX + if (dax_mapping(mapping)) { + return dax_writeback_mapping_range(mapping, + mapping->host->i_sb->s_bdev, + wbc); + } +#endif + return mpage_writepages(mapping, wbc, ext2_get_block); } @@ -1296,7 +1306,7 @@ void ext2_set_inode_flags(struct inode *inode) inode->i_flags |= S_NOATIME; if (flags & EXT2_DIRSYNC_FL) inode->i_flags |= S_DIRSYNC; - if (test_opt(inode->i_sb, DAX)) + if (test_opt(inode->i_sb, DAX) && S_ISREG(inode->i_mode)) inode->i_flags |= S_DAX; } diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index ec0668a..fe1f50f 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -191,7 +191,6 @@ static int ext4_init_block_bitmap(struct super_block *sb, /* If checksum is bad mark all blocks used to prevent allocation * essentially implementing a per-group read-only flag. */ if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) { - ext4_error(sb, "Checksum bad for group %u", block_group); grp = ext4_get_group_info(sb, block_group); if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) percpu_counter_sub(&sbi->s_freeclusters_counter, @@ -442,14 +441,16 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group) } ext4_lock_group(sb, block_group); if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { - err = ext4_init_block_bitmap(sb, bh, block_group, desc); set_bitmap_uptodate(bh); set_buffer_uptodate(bh); ext4_unlock_group(sb, block_group); unlock_buffer(bh); - if (err) + if (err) { + ext4_error(sb, "Failed to init block bitmap for group " + "%u: %d", block_group, err); goto out; + } goto verify; } ext4_unlock_group(sb, block_group); diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c index c802120..38f7562 100644 --- a/fs/ext4/crypto.c +++ b/fs/ext4/crypto.c @@ -467,3 +467,59 @@ uint32_t ext4_validate_encryption_key_size(uint32_t mode, uint32_t size) return size; return 0; } + +/* + * Validate dentries for encrypted directories to make sure we aren't + * potentially caching stale data after a key has been added or + * removed. + */ +static int ext4_d_revalidate(struct dentry *dentry, unsigned int flags) +{ + struct inode *dir = d_inode(dentry->d_parent); + struct ext4_crypt_info *ci = EXT4_I(dir)->i_crypt_info; + int dir_has_key, cached_with_key; + + if (!ext4_encrypted_inode(dir)) + return 0; + + if (ci && ci->ci_keyring_key && + (ci->ci_keyring_key->flags & ((1 << KEY_FLAG_INVALIDATED) | + (1 << KEY_FLAG_REVOKED) | + (1 << KEY_FLAG_DEAD)))) + ci = NULL; + + /* this should eventually be an flag in d_flags */ + cached_with_key = dentry->d_fsdata != NULL; + dir_has_key = (ci != NULL); + + /* + * If the dentry was cached without the key, and it is a + * negative dentry, it might be a valid name. We can't check + * if the key has since been made available due to locking + * reasons, so we fail the validation so ext4_lookup() can do + * this check. + * + * We also fail the validation if the dentry was created with + * the key present, but we no longer have the key, or vice versa. + */ + if ((!cached_with_key && d_is_negative(dentry)) || + (!cached_with_key && dir_has_key) || + (cached_with_key && !dir_has_key)) { +#if 0 /* Revalidation debug */ + char buf[80]; + char *cp = simple_dname(dentry, buf, sizeof(buf)); + + if (IS_ERR(cp)) + cp = (char *) "???"; + pr_err("revalidate: %s %p %d %d %d\n", cp, dentry->d_fsdata, + cached_with_key, d_is_negative(dentry), + dir_has_key); +#endif + return 0; + } + return 1; +} + +const struct dentry_operations ext4_encrypted_d_ops = { + .d_revalidate = ext4_d_revalidate, +}; diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 1d1bca7..33f5e2a 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -111,6 +111,12 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) int dir_has_error = 0; struct ext4_str fname_crypto_str = {.name = NULL, .len = 0}; + if (ext4_encrypted_inode(inode)) { + err = ext4_get_encryption_info(inode); + if (err && err != -ENOKEY) + return err; + } + if (is_dx_dir(inode)) { err = ext4_dx_readdir(file, ctx); if (err != ERR_BAD_DX_DIR) { @@ -157,8 +163,11 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) index, 1); file->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT; bh = ext4_bread(NULL, inode, map.m_lblk, 0); - if (IS_ERR(bh)) - return PTR_ERR(bh); + if (IS_ERR(bh)) { + err = PTR_ERR(bh); + bh = NULL; + goto errout; + } } if (!bh) { diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 0662b28..157b458 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2302,6 +2302,7 @@ struct page *ext4_encrypt(struct inode *inode, int ext4_decrypt(struct page *page); int ext4_encrypted_zeroout(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk, ext4_lblk_t len); +extern const struct dentry_operations ext4_encrypted_d_ops; #ifdef CONFIG_EXT4_FS_ENCRYPTION int ext4_init_crypto(void); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 0ffabaf..3753ceb 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3928,7 +3928,7 @@ static int convert_initialized_extent(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, struct ext4_ext_path **ppath, int flags, - unsigned int allocated, ext4_fsblk_t newblock) + unsigned int allocated) { struct ext4_ext_path *path = *ppath; struct ext4_extent *ex; @@ -4347,7 +4347,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) { allocated = convert_initialized_extent( handle, inode, map, &path, - flags, allocated, newblock); + flags, allocated); goto out2; } else if (!ext4_ext_is_unwritten(ex)) goto out; diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 1126436..4cd318f 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -262,23 +262,8 @@ static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr, return result; } -static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) -{ - int err; - struct inode *inode = file_inode(vma->vm_file); - - sb_start_pagefault(inode->i_sb); - file_update_time(vma->vm_file); - down_read(&EXT4_I(inode)->i_mmap_sem); - err = __dax_mkwrite(vma, vmf, ext4_dax_mmap_get_block, NULL); - up_read(&EXT4_I(inode)->i_mmap_sem); - sb_end_pagefault(inode->i_sb); - - return err; -} - /* - * Handle write fault for VM_MIXEDMAP mappings. Similarly to ext4_dax_mkwrite() + * Handle write fault for VM_MIXEDMAP mappings. Similarly to ext4_dax_fault() * handler we check for races agaist truncate. Note that since we cycle through * i_mmap_sem, we are sure that also any hole punching that began before we * were called is finished by now and so if it included part of the file we @@ -311,7 +296,7 @@ static int ext4_dax_pfn_mkwrite(struct vm_area_struct *vma, static const struct vm_operations_struct ext4_dax_vm_ops = { .fault = ext4_dax_fault, .pmd_fault = ext4_dax_pmd_fault, - .page_mkwrite = ext4_dax_mkwrite, + .page_mkwrite = ext4_dax_fault, .pfn_mkwrite = ext4_dax_pfn_mkwrite, }; #else @@ -350,6 +335,7 @@ static int ext4_file_open(struct inode * inode, struct file * filp) struct super_block *sb = inode->i_sb; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct vfsmount *mnt = filp->f_path.mnt; + struct inode *dir = filp->f_path.dentry->d_parent->d_inode; struct path path; char buf[64], *cp; int ret; @@ -393,6 +379,14 @@ static int ext4_file_open(struct inode * inode, struct file * filp) if (ext4_encryption_info(inode) == NULL) return -ENOKEY; } + if (ext4_encrypted_inode(dir) && + !ext4_is_child_context_consistent_with_parent(dir, inode)) { + ext4_warning(inode->i_sb, + "Inconsistent encryption contexts: %lu/%lu\n", + (unsigned long) dir->i_ino, + (unsigned long) inode->i_ino); + return -EPERM; + } /* * Set up the jbd2_inode if we are opening the inode for * writing and the journal is present diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 3fcfd50..acc0ad5 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -76,7 +76,6 @@ static int ext4_init_inode_bitmap(struct super_block *sb, /* If checksum is bad mark all blocks and inodes use to prevent * allocation, essentially implementing a per-group read-only flag. */ if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) { - ext4_error(sb, "Checksum bad for group %u", block_group); grp = ext4_get_group_info(sb, block_group); if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) percpu_counter_sub(&sbi->s_freeclusters_counter, @@ -191,8 +190,11 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) set_buffer_verified(bh); ext4_unlock_group(sb, block_group); unlock_buffer(bh); - if (err) + if (err) { + ext4_error(sb, "Failed to init inode bitmap for group " + "%u: %d", block_group, err); goto out; + } return bh; } ext4_unlock_group(sb, block_group); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 83bc8bf..aee960b 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -686,6 +686,34 @@ out_sem: return retval; } +/* + * Update EXT4_MAP_FLAGS in bh->b_state. For buffer heads attached to pages + * we have to be careful as someone else may be manipulating b_state as well. + */ +static void ext4_update_bh_state(struct buffer_head *bh, unsigned long flags) +{ + unsigned long old_state; + unsigned long new_state; + + flags &= EXT4_MAP_FLAGS; + + /* Dummy buffer_head? Set non-atomically. */ + if (!bh->b_page) { + bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | flags; + return; + } + /* + * Someone else may be modifying b_state. Be careful! This is ugly but + * once we get rid of using bh as a container for mapping information + * to pass to / from get_block functions, this can go away. + */ + do { + old_state = READ_ONCE(bh->b_state); + new_state = (old_state & ~EXT4_MAP_FLAGS) | flags; + } while (unlikely( + cmpxchg(&bh->b_state, old_state, new_state) != old_state)); +} + /* Maximum number of blocks we map for direct IO at once. */ #define DIO_MAX_BLOCKS 4096 @@ -722,7 +750,7 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock, ext4_io_end_t *io_end = ext4_inode_aio(inode); map_bh(bh, inode->i_sb, map.m_pblk); - bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags; + ext4_update_bh_state(bh, map.m_flags); if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN) set_buffer_defer_completion(bh); bh->b_size = inode->i_sb->s_blocksize * map.m_len; @@ -1685,7 +1713,7 @@ int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, return ret; map_bh(bh, inode->i_sb, map.m_pblk); - bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags; + ext4_update_bh_state(bh, map.m_flags); if (buffer_unwritten(bh)) { /* A delayed write to unwritten bh should be marked @@ -2450,6 +2478,10 @@ static int ext4_writepages(struct address_space *mapping, trace_ext4_writepages(inode, wbc); + if (dax_mapping(mapping)) + return dax_writeback_mapping_range(mapping, inode->i_sb->s_bdev, + wbc); + /* * No pages to write? This is mainly a kludge to avoid starting * a transaction for special inodes like journal inode on last iput() @@ -3253,29 +3285,29 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter, * case, we allocate an io_end structure to hook to the iocb. */ iocb->private = NULL; - ext4_inode_aio_set(inode, NULL); - if (!is_sync_kiocb(iocb)) { - io_end = ext4_init_io_end(inode, GFP_NOFS); - if (!io_end) { - ret = -ENOMEM; - goto retake_lock; - } - /* - * Grab reference for DIO. Will be dropped in ext4_end_io_dio() - */ - iocb->private = ext4_get_io_end(io_end); - /* - * we save the io structure for current async direct - * IO, so that later ext4_map_blocks() could flag the - * io structure whether there is a unwritten extents - * needs to be converted when IO is completed. - */ - ext4_inode_aio_set(inode, io_end); - } - if (overwrite) { get_block_func = ext4_get_block_overwrite; } else { + ext4_inode_aio_set(inode, NULL); + if (!is_sync_kiocb(iocb)) { + io_end = ext4_init_io_end(inode, GFP_NOFS); + if (!io_end) { + ret = -ENOMEM; + goto retake_lock; + } + /* + * Grab reference for DIO. Will be dropped in + * ext4_end_io_dio() + */ + iocb->private = ext4_get_io_end(io_end); + /* + * we save the io structure for current async direct + * IO, so that later ext4_map_blocks() could flag the + * io structure whether there is a unwritten extents + * needs to be converted when IO is completed. + */ + ext4_inode_aio_set(inode, io_end); + } get_block_func = ext4_get_block_write; dio_flags = DIO_LOCKING; } @@ -4127,7 +4159,7 @@ void ext4_set_inode_flags(struct inode *inode) new_fl |= S_NOATIME; if (flags & EXT4_DIRSYNC_FL) new_fl |= S_DIRSYNC; - if (test_opt(inode->i_sb, DAX)) + if (test_opt(inode->i_sb, DAX) && S_ISREG(inode->i_mode)) new_fl |= S_DAX; inode_set_flags(inode, new_fl, S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_DAX); diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 0f6c369..eae5917 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -208,7 +208,7 @@ static int ext4_ioctl_setflags(struct inode *inode, { struct ext4_inode_info *ei = EXT4_I(inode); handle_t *handle = NULL; - int err = EPERM, migrate = 0; + int err = -EPERM, migrate = 0; struct ext4_iloc iloc; unsigned int oldflags, mask, i; unsigned int jflag; @@ -583,6 +583,11 @@ group_extend_out: "Online defrag not supported with bigalloc"); err = -EOPNOTSUPP; goto mext_out; + } else if (IS_DAX(inode)) { + ext4_msg(sb, KERN_ERR, + "Online defrag not supported with DAX"); + err = -EOPNOTSUPP; + goto mext_out; } err = mnt_want_write_file(filp); diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 61eaf74..4424b7b 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2285,7 +2285,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) if (group == 0) seq_puts(seq, "#group: free frags first [" " 2^0 2^1 2^2 2^3 2^4 2^5 2^6 " - " 2^7 2^8 2^9 2^10 2^11 2^12 2^13 ]"); + " 2^7 2^8 2^9 2^10 2^11 2^12 2^13 ]\n"); i = (sb->s_blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) + sizeof(struct ext4_group_info); diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index fb6f117..4098acc 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -265,11 +265,12 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, ext4_lblk_t orig_blk_offset, donor_blk_offset; unsigned long blocksize = orig_inode->i_sb->s_blocksize; unsigned int tmp_data_size, data_size, replaced_size; - int err2, jblocks, retries = 0; + int i, err2, jblocks, retries = 0; int replaced_count = 0; int from = data_offset_in_page << orig_inode->i_blkbits; int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits; struct super_block *sb = orig_inode->i_sb; + struct buffer_head *bh = NULL; /* * It needs twice the amount of ordinary journal buffers because @@ -380,8 +381,17 @@ data_copy: } /* Perform all necessary steps similar write_begin()/write_end() * but keeping in mind that i_size will not change */ - *err = __block_write_begin(pagep[0], from, replaced_size, - ext4_get_block); + if (!page_has_buffers(pagep[0])) + create_empty_buffers(pagep[0], 1 << orig_inode->i_blkbits, 0); + bh = page_buffers(pagep[0]); + for (i = 0; i < data_offset_in_page; i++) + bh = bh->b_this_page; + for (i = 0; i < block_len_in_page; i++) { + *err = ext4_get_block(orig_inode, orig_blk_offset + i, bh, 0); + if (*err < 0) + break; + bh = bh->b_this_page; + } if (!*err) *err = block_commit_write(pagep[0], from, from + replaced_size); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 06574dd..48e4b89 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1558,6 +1558,24 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi struct ext4_dir_entry_2 *de; struct buffer_head *bh; + if (ext4_encrypted_inode(dir)) { + int res = ext4_get_encryption_info(dir); + + /* + * This should be a properly defined flag for + * dentry->d_flags when we uplift this to the VFS. + * d_fsdata is set to (void *) 1 if if the dentry is + * created while the directory was encrypted and we + * don't have access to the key. + */ + dentry->d_fsdata = NULL; + if (ext4_encryption_info(dir)) + dentry->d_fsdata = (void *) 1; + d_set_d_op(dentry, &ext4_encrypted_d_ops); + if (res && res != -ENOKEY) + return ERR_PTR(res); + } + if (dentry->d_name.len > EXT4_NAME_LEN) return ERR_PTR(-ENAMETOOLONG); @@ -1585,11 +1603,15 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi return ERR_PTR(-EFSCORRUPTED); } if (!IS_ERR(inode) && ext4_encrypted_inode(dir) && - (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || - S_ISLNK(inode->i_mode)) && + (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) && !ext4_is_child_context_consistent_with_parent(dir, inode)) { + int nokey = ext4_encrypted_inode(inode) && + !ext4_encryption_info(inode); + iput(inode); + if (nokey) + return ERR_PTR(-ENOKEY); ext4_warning(inode->i_sb, "Inconsistent encryption contexts: %lu/%lu\n", (unsigned long) dir->i_ino, diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index ad62d7a..34038e3 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -198,7 +198,7 @@ static struct ext4_new_flex_group_data *alloc_flex_gd(unsigned long flexbg_size) if (flex_gd == NULL) goto out3; - if (flexbg_size >= UINT_MAX / sizeof(struct ext4_new_flex_group_data)) + if (flexbg_size >= UINT_MAX / sizeof(struct ext4_new_group_data)) goto out2; flex_gd->count = flexbg_size; diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 6915c95..5c46ed9 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -223,6 +223,9 @@ static void wb_wait_for_completion(struct backing_dev_info *bdi, #define WB_FRN_HIST_MAX_SLOTS (WB_FRN_HIST_THR_SLOTS / 2 + 1) /* one round can affect upto 5 slots */ +static atomic_t isw_nr_in_flight = ATOMIC_INIT(0); +static struct workqueue_struct *isw_wq; + void __inode_attach_wb(struct inode *inode, struct page *page) { struct backing_dev_info *bdi = inode_to_bdi(inode); @@ -424,6 +427,8 @@ skip_switch: iput(inode); kfree(isw); + + atomic_dec(&isw_nr_in_flight); } static void inode_switch_wbs_rcu_fn(struct rcu_head *rcu_head) @@ -433,7 +438,7 @@ static void inode_switch_wbs_rcu_fn(struct rcu_head *rcu_head) /* needs to grab bh-unsafe locks, bounce to work item */ INIT_WORK(&isw->work, inode_switch_wbs_work_fn); - schedule_work(&isw->work); + queue_work(isw_wq, &isw->work); } /** @@ -469,7 +474,8 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id) /* while holding I_WB_SWITCH, no one else can update the association */ spin_lock(&inode->i_lock); - if (inode->i_state & (I_WB_SWITCH | I_FREEING) || + if (!(inode->i_sb->s_flags & MS_ACTIVE) || + inode->i_state & (I_WB_SWITCH | I_FREEING) || inode_to_wb(inode) == isw->new_wb) { spin_unlock(&inode->i_lock); goto out_free; @@ -480,6 +486,8 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id) ihold(inode); isw->inode = inode; + atomic_inc(&isw_nr_in_flight); + /* * In addition to synchronizing among switchers, I_WB_SWITCH tells * the RCU protected stat update paths to grab the mapping's @@ -840,6 +848,33 @@ restart: wb_put(last_wb); } +/** + * cgroup_writeback_umount - flush inode wb switches for umount + * + * This function is called when a super_block is about to be destroyed and + * flushes in-flight inode wb switches. An inode wb switch goes through + * RCU and then workqueue, so the two need to be flushed in order to ensure + * that all previously scheduled switches are finished. As wb switches are + * rare occurrences and synchronize_rcu() can take a while, perform + * flushing iff wb switches are in flight. + */ +void cgroup_writeback_umount(void) +{ + if (atomic_read(&isw_nr_in_flight)) { + synchronize_rcu(); + flush_workqueue(isw_wq); + } +} + +static int __init cgroup_writeback_init(void) +{ + isw_wq = alloc_workqueue("inode_switch_wbs", 0, 0); + if (!isw_wq) + return -ENOMEM; + return 0; +} +fs_initcall(cgroup_writeback_init); + #else /* CONFIG_CGROUP_WRITEBACK */ static struct bdi_writeback * diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c index 506765a..bb8d67e 100644 --- a/fs/hpfs/namei.c +++ b/fs/hpfs/namei.c @@ -376,12 +376,11 @@ static int hpfs_unlink(struct inode *dir, struct dentry *dentry) struct inode *inode = d_inode(dentry); dnode_secno dno; int r; - int rep = 0; int err; hpfs_lock(dir->i_sb); hpfs_adjust_length(name, &len); -again: + err = -ENOENT; de = map_dirent(dir, hpfs_i(dir)->i_dno, name, len, &dno, &qbh); if (!de) @@ -401,33 +400,9 @@ again: hpfs_error(dir->i_sb, "there was error when removing dirent"); err = -EFSERROR; break; - case 2: /* no space for deleting, try to truncate file */ - + case 2: /* no space for deleting */ err = -ENOSPC; - if (rep++) - break; - - dentry_unhash(dentry); - if (!d_unhashed(dentry)) { - hpfs_unlock(dir->i_sb); - return -ENOSPC; - } - if (generic_permission(inode, MAY_WRITE) || - !S_ISREG(inode->i_mode) || - get_write_access(inode)) { - d_rehash(dentry); - } else { - struct iattr newattrs; - /*pr_info("truncating file before delete.\n");*/ - newattrs.ia_size = 0; - newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME; - err = notify_change(dentry, &newattrs, NULL); - put_write_access(inode); - if (!err) - goto again; - } - hpfs_unlock(dir->i_sb); - return -ENOSPC; + break; default: drop_nlink(inode); err = 0; @@ -154,6 +154,12 @@ int inode_init_always(struct super_block *sb, struct inode *inode) inode->i_rdev = 0; inode->dirtied_when = 0; +#ifdef CONFIG_CGROUP_WRITEBACK + inode->i_wb_frn_winner = 0; + inode->i_wb_frn_avg_time = 0; + inode->i_wb_frn_history = 0; +#endif + if (security_inode_alloc(inode)) goto out; spin_lock_init(&inode->i_lock); diff --git a/fs/jffs2/README.Locking b/fs/jffs2/README.Locking index 3ea3655..8918ac9 100644 --- a/fs/jffs2/README.Locking +++ b/fs/jffs2/README.Locking @@ -2,10 +2,6 @@ JFFS2 LOCKING DOCUMENTATION --------------------------- -At least theoretically, JFFS2 does not require the Big Kernel Lock -(BKL), which was always helpfully obtained for it by Linux 2.4 VFS -code. It has its own locking, as described below. - This document attempts to describe the existing locking rules for JFFS2. It is not expected to remain perfectly up to date, but ought to be fairly close. @@ -69,6 +65,7 @@ Ordering constraints: any f->sem held. 2. Never attempt to lock two file mutexes in one thread. No ordering rules have been made for doing so. + 3. Never lock a page cache page with f->sem held. erase_completion_lock spinlock diff --git a/fs/jffs2/build.c b/fs/jffs2/build.c index 0ae91ad..b288c8a 100644 --- a/fs/jffs2/build.c +++ b/fs/jffs2/build.c @@ -50,7 +50,8 @@ next_inode(int *i, struct jffs2_inode_cache *ic, struct jffs2_sb_info *c) static void jffs2_build_inode_pass1(struct jffs2_sb_info *c, - struct jffs2_inode_cache *ic) + struct jffs2_inode_cache *ic, + int *dir_hardlinks) { struct jffs2_full_dirent *fd; @@ -69,19 +70,21 @@ static void jffs2_build_inode_pass1(struct jffs2_sb_info *c, dbg_fsbuild("child \"%s\" (ino #%u) of dir ino #%u doesn't exist!\n", fd->name, fd->ino, ic->ino); jffs2_mark_node_obsolete(c, fd->raw); + /* Clear the ic/raw union so it doesn't cause problems later. */ + fd->ic = NULL; continue; } + /* From this point, fd->raw is no longer used so we can set fd->ic */ + fd->ic = child_ic; + child_ic->pino_nlink++; + /* If we appear (at this stage) to have hard-linked directories, + * set a flag to trigger a scan later */ if (fd->type == DT_DIR) { - if (child_ic->pino_nlink) { - JFFS2_ERROR("child dir \"%s\" (ino #%u) of dir ino #%u appears to be a hard link\n", - fd->name, fd->ino, ic->ino); - /* TODO: What do we do about it? */ - } else { - child_ic->pino_nlink = ic->ino; - } - } else - child_ic->pino_nlink++; + child_ic->flags |= INO_FLAGS_IS_DIR; + if (child_ic->pino_nlink > 1) + *dir_hardlinks = 1; + } dbg_fsbuild("increased nlink for child \"%s\" (ino #%u)\n", fd->name, fd->ino); /* Can't free scan_dents so far. We might need them in pass 2 */ @@ -95,8 +98,7 @@ static void jffs2_build_inode_pass1(struct jffs2_sb_info *c, */ static int jffs2_build_filesystem(struct jffs2_sb_info *c) { - int ret; - int i; + int ret, i, dir_hardlinks = 0; struct jffs2_inode_cache *ic; struct jffs2_full_dirent *fd; struct jffs2_full_dirent *dead_fds = NULL; @@ -120,7 +122,7 @@ static int jffs2_build_filesystem(struct jffs2_sb_info *c) /* Now scan the directory tree, increasing nlink according to every dirent found. */ for_each_inode(i, c, ic) { if (ic->scan_dents) { - jffs2_build_inode_pass1(c, ic); + jffs2_build_inode_pass1(c, ic, &dir_hardlinks); cond_resched(); } } @@ -156,6 +158,20 @@ static int jffs2_build_filesystem(struct jffs2_sb_info *c) } dbg_fsbuild("pass 2a complete\n"); + + if (dir_hardlinks) { + /* If we detected directory hardlinks earlier, *hopefully* + * they are gone now because some of the links were from + * dead directories which still had some old dirents lying + * around and not yet garbage-collected, but which have + * been discarded above. So clear the pino_nlink field + * in each directory, so that the final scan below can + * print appropriate warnings. */ + for_each_inode(i, c, ic) { + if (ic->flags & INO_FLAGS_IS_DIR) + ic->pino_nlink = 0; + } + } dbg_fsbuild("freeing temporary data structures\n"); /* Finally, we can scan again and free the dirent structs */ @@ -163,6 +179,33 @@ static int jffs2_build_filesystem(struct jffs2_sb_info *c) while(ic->scan_dents) { fd = ic->scan_dents; ic->scan_dents = fd->next; + /* We do use the pino_nlink field to count nlink of + * directories during fs build, so set it to the + * parent ino# now. Now that there's hopefully only + * one. */ + if (fd->type == DT_DIR) { + if (!fd->ic) { + /* We'll have complained about it and marked the coresponding + raw node obsolete already. Just skip it. */ + continue; + } + + /* We *have* to have set this in jffs2_build_inode_pass1() */ + BUG_ON(!(fd->ic->flags & INO_FLAGS_IS_DIR)); + + /* We clear ic->pino_nlink ∀ directories' ic *only* if dir_hardlinks + * is set. Otherwise, we know this should never trigger anyway, so + * we don't do the check. And ic->pino_nlink still contains the nlink + * value (which is 1). */ + if (dir_hardlinks && fd->ic->pino_nlink) { + JFFS2_ERROR("child dir \"%s\" (ino #%u) of dir ino #%u is also hard linked from dir ino #%u\n", + fd->name, fd->ino, ic->ino, fd->ic->pino_nlink); + /* Should we unlink it from its previous parent? */ + } + + /* For directories, ic->pino_nlink holds that parent inode # */ + fd->ic->pino_nlink = ic->ino; + } jffs2_free_full_dirent(fd); } ic->scan_dents = NULL; @@ -241,11 +284,7 @@ static void jffs2_build_remove_unlinked_inode(struct jffs2_sb_info *c, /* Reduce nlink of the child. If it's now zero, stick it on the dead_fds list to be cleaned up later. Else just free the fd */ - - if (fd->type == DT_DIR) - child_ic->pino_nlink = 0; - else - child_ic->pino_nlink--; + child_ic->pino_nlink--; if (!child_ic->pino_nlink) { dbg_fsbuild("inode #%u (\"%s\") now has no links; adding to dead_fds list.\n", diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c index d211b8e..30c4c9e 100644 --- a/fs/jffs2/dir.c +++ b/fs/jffs2/dir.c @@ -843,9 +843,14 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry, pr_notice("%s(): Link succeeded, unlink failed (err %d). You now have a hard link\n", __func__, ret); - /* Might as well let the VFS know */ - d_instantiate(new_dentry, d_inode(old_dentry)); - ihold(d_inode(old_dentry)); + /* + * We can't keep the target in dcache after that. + * For one thing, we can't afford dentry aliases for directories. + * For another, if there was a victim, we _can't_ set new inode + * for that sucker and we have to trigger mount eviction - the + * caller won't do it on its own since we are returning an error. + */ + d_invalidate(new_dentry); new_dir_i->i_mtime = new_dir_i->i_ctime = ITIME(now); return ret; } diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c index c5ac594..cad86ba 100644 --- a/fs/jffs2/file.c +++ b/fs/jffs2/file.c @@ -137,39 +137,33 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping, struct page *pg; struct inode *inode = mapping->host; struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode); - struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb); - struct jffs2_raw_inode ri; - uint32_t alloc_len = 0; pgoff_t index = pos >> PAGE_CACHE_SHIFT; uint32_t pageofs = index << PAGE_CACHE_SHIFT; int ret = 0; - jffs2_dbg(1, "%s()\n", __func__); - - if (pageofs > inode->i_size) { - ret = jffs2_reserve_space(c, sizeof(ri), &alloc_len, - ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE); - if (ret) - return ret; - } - - mutex_lock(&f->sem); pg = grab_cache_page_write_begin(mapping, index, flags); - if (!pg) { - if (alloc_len) - jffs2_complete_reservation(c); - mutex_unlock(&f->sem); + if (!pg) return -ENOMEM; - } *pagep = pg; - if (alloc_len) { + jffs2_dbg(1, "%s()\n", __func__); + + if (pageofs > inode->i_size) { /* Make new hole frag from old EOF to new page */ + struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb); + struct jffs2_raw_inode ri; struct jffs2_full_dnode *fn; + uint32_t alloc_len; jffs2_dbg(1, "Writing new hole frag 0x%x-0x%x between current EOF and new page\n", (unsigned int)inode->i_size, pageofs); + ret = jffs2_reserve_space(c, sizeof(ri), &alloc_len, + ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE); + if (ret) + goto out_page; + + mutex_lock(&f->sem); memset(&ri, 0, sizeof(ri)); ri.magic = cpu_to_je16(JFFS2_MAGIC_BITMASK); @@ -196,6 +190,7 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping, if (IS_ERR(fn)) { ret = PTR_ERR(fn); jffs2_complete_reservation(c); + mutex_unlock(&f->sem); goto out_page; } ret = jffs2_add_full_dnode_to_inode(c, f, fn); @@ -210,10 +205,12 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping, jffs2_mark_node_obsolete(c, fn->raw); jffs2_free_full_dnode(fn); jffs2_complete_reservation(c); + mutex_unlock(&f->sem); goto out_page; } jffs2_complete_reservation(c); inode->i_size = pageofs; + mutex_unlock(&f->sem); } /* @@ -222,18 +219,18 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping, * case of a short-copy. */ if (!PageUptodate(pg)) { + mutex_lock(&f->sem); ret = jffs2_do_readpage_nolock(inode, pg); + mutex_unlock(&f->sem); if (ret) goto out_page; } - mutex_unlock(&f->sem); jffs2_dbg(1, "end write_begin(). pg->flags %lx\n", pg->flags); return ret; out_page: unlock_page(pg); page_cache_release(pg); - mutex_unlock(&f->sem); return ret; } diff --git a/fs/jffs2/gc.c b/fs/jffs2/gc.c index 5a2dec2..95d5880 100644 --- a/fs/jffs2/gc.c +++ b/fs/jffs2/gc.c @@ -1296,14 +1296,17 @@ static int jffs2_garbage_collect_dnode(struct jffs2_sb_info *c, struct jffs2_era BUG_ON(start > orig_start); } - /* First, use readpage() to read the appropriate page into the page cache */ - /* Q: What happens if we actually try to GC the _same_ page for which commit_write() - * triggered garbage collection in the first place? - * A: I _think_ it's OK. read_cache_page shouldn't deadlock, we'll write out the - * page OK. We'll actually write it out again in commit_write, which is a little - * suboptimal, but at least we're correct. - */ + /* The rules state that we must obtain the page lock *before* f->sem, so + * drop f->sem temporarily. Since we also hold c->alloc_sem, nothing's + * actually going to *change* so we're safe; we only allow reading. + * + * It is important to note that jffs2_write_begin() will ensure that its + * page is marked Uptodate before allocating space. That means that if we + * end up here trying to GC the *same* page that jffs2_write_begin() is + * trying to write out, read_cache_page() will not deadlock. */ + mutex_unlock(&f->sem); pg_ptr = jffs2_gc_fetch_page(c, f, start, &pg); + mutex_lock(&f->sem); if (IS_ERR(pg_ptr)) { pr_warn("read_cache_page() returned error: %ld\n", diff --git a/fs/jffs2/nodelist.h b/fs/jffs2/nodelist.h index fa35ff7..0637271 100644 --- a/fs/jffs2/nodelist.h +++ b/fs/jffs2/nodelist.h @@ -194,6 +194,7 @@ struct jffs2_inode_cache { #define INO_STATE_CLEARING 6 /* In clear_inode() */ #define INO_FLAGS_XATTR_CHECKED 0x01 /* has no duplicate xattr_ref */ +#define INO_FLAGS_IS_DIR 0x02 /* is a directory */ #define RAWNODE_CLASS_INODE_CACHE 0 #define RAWNODE_CLASS_XATTR_DATUM 1 @@ -249,7 +250,10 @@ struct jffs2_readinode_info struct jffs2_full_dirent { - struct jffs2_raw_node_ref *raw; + union { + struct jffs2_raw_node_ref *raw; + struct jffs2_inode_cache *ic; /* Just during part of build */ + }; struct jffs2_full_dirent *next; uint32_t version; uint32_t ino; /* == zero for unlink */ @@ -1712,6 +1712,11 @@ static inline int should_follow_link(struct nameidata *nd, struct path *link, return 0; if (!follow) return 0; + /* make sure that d_is_symlink above matches inode */ + if (nd->flags & LOOKUP_RCU) { + if (read_seqcount_retry(&link->dentry->d_seq, seq)) + return -ECHILD; + } return pick_link(nd, link, inode, seq); } @@ -1743,11 +1748,11 @@ static int walk_component(struct nameidata *nd, int flags) if (err < 0) return err; - inode = d_backing_inode(path.dentry); seq = 0; /* we are already out of RCU mode */ err = -ENOENT; if (d_is_negative(path.dentry)) goto out_path_put; + inode = d_backing_inode(path.dentry); } if (flags & WALK_PUT) @@ -3192,12 +3197,12 @@ retry_lookup: return error; BUG_ON(nd->flags & LOOKUP_RCU); - inode = d_backing_inode(path.dentry); seq = 0; /* out of RCU mode, so the value doesn't matter */ if (unlikely(d_is_negative(path.dentry))) { path_to_nameidata(&path, nd); return -ENOENT; } + inode = d_backing_inode(path.dentry); finish_lookup: if (nd->depth) put_link(nd); @@ -3206,11 +3211,6 @@ finish_lookup: if (unlikely(error)) return error; - if (unlikely(d_is_symlink(path.dentry)) && !(open_flag & O_PATH)) { - path_to_nameidata(&path, nd); - return -ELOOP; - } - if ((nd->flags & LOOKUP_RCU) || nd->path.mnt != path.mnt) { path_to_nameidata(&path, nd); } else { @@ -3229,6 +3229,10 @@ finish_open: return error; } audit_inode(nd->name, nd->path.dentry, 0); + if (unlikely(d_is_symlink(nd->path.dentry)) && !(open_flag & O_PATH)) { + error = -ELOOP; + goto out; + } error = -EISDIR; if ((open_flag & O_CREAT) && d_is_dir(nd->path.dentry)) goto out; @@ -3273,6 +3277,10 @@ opened: goto exit_fput; } out: + if (unlikely(error > 0)) { + WARN_ON(1); + error = -EINVAL; + } if (got_write) mnt_drop_write(nd->path.mnt); path_put(&save_parent); diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c index 26c2de2..b7f8eae 100644 --- a/fs/ncpfs/dir.c +++ b/fs/ncpfs/dir.c @@ -633,7 +633,7 @@ ncp_fill_cache(struct file *file, struct dir_context *ctx, d_rehash(newdent); } else { spin_lock(&dentry->d_lock); - NCP_FINFO(inode)->flags &= ~NCPI_DIR_CACHE; + NCP_FINFO(dir)->flags &= ~NCPI_DIR_CACHE; spin_unlock(&dentry->d_lock); } } else { diff --git a/fs/nfs/blocklayout/extent_tree.c b/fs/nfs/blocklayout/extent_tree.c index c59a59c..35ab51c 100644 --- a/fs/nfs/blocklayout/extent_tree.c +++ b/fs/nfs/blocklayout/extent_tree.c @@ -476,6 +476,7 @@ static void ext_tree_free_commitdata(struct nfs4_layoutcommit_args *arg, for (i = 0; i < nr_pages; i++) put_page(arg->layoutupdate_pages[i]); + vfree(arg->start_p); kfree(arg->layoutupdate_pages); } else { put_page(arg->layoutupdate_page); @@ -559,10 +560,15 @@ retry: if (unlikely(arg->layoutupdate_pages != &arg->layoutupdate_page)) { void *p = start_p, *end = p + arg->layoutupdate_len; + struct page *page = NULL; int i = 0; - for ( ; p < end; p += PAGE_SIZE) - arg->layoutupdate_pages[i++] = vmalloc_to_page(p); + arg->start_p = start_p; + for ( ; p < end; p += PAGE_SIZE) { + page = vmalloc_to_page(p); + arg->layoutupdate_pages[i++] = page; + get_page(page); + } } dprintk("%s found %zu ranges\n", __func__, count); diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index bd25dc7..dff8346 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -16,29 +16,8 @@ #define NFSDBG_FACILITY NFSDBG_PROC -static int nfs42_set_rw_stateid(nfs4_stateid *dst, struct file *file, - fmode_t fmode) -{ - struct nfs_open_context *open; - struct nfs_lock_context *lock; - int ret; - - open = get_nfs_open_context(nfs_file_open_context(file)); - lock = nfs_get_lock_context(open); - if (IS_ERR(lock)) { - put_nfs_open_context(open); - return PTR_ERR(lock); - } - - ret = nfs4_set_rw_stateid(dst, open, lock, fmode); - - nfs_put_lock_context(lock); - put_nfs_open_context(open); - return ret; -} - static int _nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep, - loff_t offset, loff_t len) + struct nfs_lock_context *lock, loff_t offset, loff_t len) { struct inode *inode = file_inode(filep); struct nfs_server *server = NFS_SERVER(inode); @@ -56,7 +35,8 @@ static int _nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep, msg->rpc_argp = &args; msg->rpc_resp = &res; - status = nfs42_set_rw_stateid(&args.falloc_stateid, filep, FMODE_WRITE); + status = nfs4_set_rw_stateid(&args.falloc_stateid, lock->open_context, + lock, FMODE_WRITE); if (status) return status; @@ -78,15 +58,26 @@ static int nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep, { struct nfs_server *server = NFS_SERVER(file_inode(filep)); struct nfs4_exception exception = { }; + struct nfs_lock_context *lock; int err; + lock = nfs_get_lock_context(nfs_file_open_context(filep)); + if (IS_ERR(lock)) + return PTR_ERR(lock); + + exception.inode = file_inode(filep); + exception.state = lock->open_context->state; + do { - err = _nfs42_proc_fallocate(msg, filep, offset, len); - if (err == -ENOTSUPP) - return -EOPNOTSUPP; + err = _nfs42_proc_fallocate(msg, filep, lock, offset, len); + if (err == -ENOTSUPP) { + err = -EOPNOTSUPP; + break; + } err = nfs4_handle_exception(server, err, &exception); } while (exception.retry); + nfs_put_lock_context(lock); return err; } @@ -135,7 +126,8 @@ int nfs42_proc_deallocate(struct file *filep, loff_t offset, loff_t len) return err; } -static loff_t _nfs42_proc_llseek(struct file *filep, loff_t offset, int whence) +static loff_t _nfs42_proc_llseek(struct file *filep, + struct nfs_lock_context *lock, loff_t offset, int whence) { struct inode *inode = file_inode(filep); struct nfs42_seek_args args = { @@ -156,7 +148,8 @@ static loff_t _nfs42_proc_llseek(struct file *filep, loff_t offset, int whence) if (!nfs_server_capable(inode, NFS_CAP_SEEK)) return -ENOTSUPP; - status = nfs42_set_rw_stateid(&args.sa_stateid, filep, FMODE_READ); + status = nfs4_set_rw_stateid(&args.sa_stateid, lock->open_context, + lock, FMODE_READ); if (status) return status; @@ -175,17 +168,28 @@ loff_t nfs42_proc_llseek(struct file *filep, loff_t offset, int whence) { struct nfs_server *server = NFS_SERVER(file_inode(filep)); struct nfs4_exception exception = { }; + struct nfs_lock_context *lock; loff_t err; + lock = nfs_get_lock_context(nfs_file_open_context(filep)); + if (IS_ERR(lock)) + return PTR_ERR(lock); + + exception.inode = file_inode(filep); + exception.state = lock->open_context->state; + do { - err = _nfs42_proc_llseek(filep, offset, whence); + err = _nfs42_proc_llseek(filep, lock, offset, whence); if (err >= 0) break; - if (err == -ENOTSUPP) - return -EOPNOTSUPP; + if (err == -ENOTSUPP) { + err = -EOPNOTSUPP; + break; + } err = nfs4_handle_exception(server, err, &exception); } while (exception.retry); + nfs_put_lock_context(lock); return err; } @@ -298,8 +302,9 @@ int nfs42_proc_layoutstats_generic(struct nfs_server *server, } static int _nfs42_proc_clone(struct rpc_message *msg, struct file *src_f, - struct file *dst_f, loff_t src_offset, - loff_t dst_offset, loff_t count) + struct file *dst_f, struct nfs_lock_context *src_lock, + struct nfs_lock_context *dst_lock, loff_t src_offset, + loff_t dst_offset, loff_t count) { struct inode *src_inode = file_inode(src_f); struct inode *dst_inode = file_inode(dst_f); @@ -320,11 +325,13 @@ static int _nfs42_proc_clone(struct rpc_message *msg, struct file *src_f, msg->rpc_argp = &args; msg->rpc_resp = &res; - status = nfs42_set_rw_stateid(&args.src_stateid, src_f, FMODE_READ); + status = nfs4_set_rw_stateid(&args.src_stateid, src_lock->open_context, + src_lock, FMODE_READ); if (status) return status; - status = nfs42_set_rw_stateid(&args.dst_stateid, dst_f, FMODE_WRITE); + status = nfs4_set_rw_stateid(&args.dst_stateid, dst_lock->open_context, + dst_lock, FMODE_WRITE); if (status) return status; @@ -349,22 +356,48 @@ int nfs42_proc_clone(struct file *src_f, struct file *dst_f, }; struct inode *inode = file_inode(src_f); struct nfs_server *server = NFS_SERVER(file_inode(src_f)); - struct nfs4_exception exception = { }; - int err; + struct nfs_lock_context *src_lock; + struct nfs_lock_context *dst_lock; + struct nfs4_exception src_exception = { }; + struct nfs4_exception dst_exception = { }; + int err, err2; if (!nfs_server_capable(inode, NFS_CAP_CLONE)) return -EOPNOTSUPP; + src_lock = nfs_get_lock_context(nfs_file_open_context(src_f)); + if (IS_ERR(src_lock)) + return PTR_ERR(src_lock); + + src_exception.inode = file_inode(src_f); + src_exception.state = src_lock->open_context->state; + + dst_lock = nfs_get_lock_context(nfs_file_open_context(dst_f)); + if (IS_ERR(dst_lock)) { + err = PTR_ERR(dst_lock); + goto out_put_src_lock; + } + + dst_exception.inode = file_inode(dst_f); + dst_exception.state = dst_lock->open_context->state; + do { - err = _nfs42_proc_clone(&msg, src_f, dst_f, src_offset, - dst_offset, count); + err = _nfs42_proc_clone(&msg, src_f, dst_f, src_lock, dst_lock, + src_offset, dst_offset, count); if (err == -ENOTSUPP || err == -EOPNOTSUPP) { NFS_SERVER(inode)->caps &= ~NFS_CAP_CLONE; - return -EOPNOTSUPP; + err = -EOPNOTSUPP; + break; } - err = nfs4_handle_exception(server, err, &exception); - } while (exception.retry); - return err; + err2 = nfs4_handle_exception(server, err, &src_exception); + err = nfs4_handle_exception(server, err, &dst_exception); + if (!err) + err = err2; + } while (src_exception.retry || dst_exception.retry); + nfs_put_lock_context(dst_lock); +out_put_src_lock: + nfs_put_lock_context(src_lock); + return err; } diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 4bfc33a..1488159 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2466,9 +2466,9 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata, dentry = d_add_unique(dentry, igrab(state->inode)); if (dentry == NULL) { dentry = opendata->dentry; - } else if (dentry != ctx->dentry) { + } else { dput(ctx->dentry); - ctx->dentry = dget(dentry); + ctx->dentry = dentry; } nfs_set_verifier(dentry, nfs_save_change_attribute(d_inode(opendata->dir))); diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 482b6e9..2fa483e 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -252,6 +252,27 @@ pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo) } } +/* + * Mark a pnfs_layout_hdr and all associated layout segments as invalid + * + * In order to continue using the pnfs_layout_hdr, a full recovery + * is required. + * Note that caller must hold inode->i_lock. + */ +static int +pnfs_mark_layout_stateid_invalid(struct pnfs_layout_hdr *lo, + struct list_head *lseg_list) +{ + struct pnfs_layout_range range = { + .iomode = IOMODE_ANY, + .offset = 0, + .length = NFS4_MAX_UINT64, + }; + + set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); + return pnfs_mark_matching_lsegs_invalid(lo, lseg_list, &range); +} + static int pnfs_iomode_to_fail_bit(u32 iomode) { @@ -554,9 +575,8 @@ pnfs_destroy_layout(struct nfs_inode *nfsi) spin_lock(&nfsi->vfs_inode.i_lock); lo = nfsi->layout; if (lo) { - lo->plh_block_lgets++; /* permanently block new LAYOUTGETs */ - pnfs_mark_matching_lsegs_invalid(lo, &tmp_list, NULL); pnfs_get_layout_hdr(lo); + pnfs_mark_layout_stateid_invalid(lo, &tmp_list); pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RO_FAILED); pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RW_FAILED); spin_unlock(&nfsi->vfs_inode.i_lock); @@ -617,11 +637,6 @@ pnfs_layout_free_bulk_destroy_list(struct list_head *layout_list, { struct pnfs_layout_hdr *lo; struct inode *inode; - struct pnfs_layout_range range = { - .iomode = IOMODE_ANY, - .offset = 0, - .length = NFS4_MAX_UINT64, - }; LIST_HEAD(lseg_list); int ret = 0; @@ -636,11 +651,11 @@ pnfs_layout_free_bulk_destroy_list(struct list_head *layout_list, spin_lock(&inode->i_lock); list_del_init(&lo->plh_bulk_destroy); - lo->plh_block_lgets++; /* permanently block new LAYOUTGETs */ - if (is_bulk_recall) - set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags); - if (pnfs_mark_matching_lsegs_invalid(lo, &lseg_list, &range)) + if (pnfs_mark_layout_stateid_invalid(lo, &lseg_list)) { + if (is_bulk_recall) + set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags); ret = -EAGAIN; + } spin_unlock(&inode->i_lock); pnfs_free_lseg_list(&lseg_list); /* Free all lsegs that are attached to commit buckets */ @@ -1738,8 +1753,19 @@ pnfs_set_plh_return_iomode(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode) if (lo->plh_return_iomode != 0) iomode = IOMODE_ANY; lo->plh_return_iomode = iomode; + set_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags); } +/** + * pnfs_mark_matching_lsegs_return - Free or return matching layout segments + * @lo: pointer to layout header + * @tmp_list: list header to be used with pnfs_free_lseg_list() + * @return_range: describe layout segment ranges to be returned + * + * This function is mainly intended for use by layoutrecall. It attempts + * to free the layout segment immediately, or else to mark it for return + * as soon as its reference count drops to zero. + */ int pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo, struct list_head *tmp_list, @@ -1762,12 +1788,11 @@ pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo, lseg, lseg->pls_range.iomode, lseg->pls_range.offset, lseg->pls_range.length); + if (mark_lseg_invalid(lseg, tmp_list)) + continue; + remaining++; set_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags); pnfs_set_plh_return_iomode(lo, return_range->iomode); - if (!mark_lseg_invalid(lseg, tmp_list)) - remaining++; - set_bit(NFS_LAYOUT_RETURN_REQUESTED, - &lo->plh_flags); } return remaining; } diff --git a/fs/notify/mark.c b/fs/notify/mark.c index cfcbf11..7115c5d 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -91,7 +91,14 @@ #include <linux/fsnotify_backend.h> #include "fsnotify.h" +#define FSNOTIFY_REAPER_DELAY (1) /* 1 jiffy */ + struct srcu_struct fsnotify_mark_srcu; +static DEFINE_SPINLOCK(destroy_lock); +static LIST_HEAD(destroy_list); + +static void fsnotify_mark_destroy(struct work_struct *work); +static DECLARE_DELAYED_WORK(reaper_work, fsnotify_mark_destroy); void fsnotify_get_mark(struct fsnotify_mark *mark) { @@ -165,19 +172,10 @@ void fsnotify_detach_mark(struct fsnotify_mark *mark) atomic_dec(&group->num_marks); } -static void -fsnotify_mark_free_rcu(struct rcu_head *rcu) -{ - struct fsnotify_mark *mark; - - mark = container_of(rcu, struct fsnotify_mark, g_rcu); - fsnotify_put_mark(mark); -} - /* - * Free fsnotify mark. The freeing is actually happening from a call_srcu - * callback. Caller must have a reference to the mark or be protected by - * fsnotify_mark_srcu. + * Free fsnotify mark. The freeing is actually happening from a kthread which + * first waits for srcu period end. Caller must have a reference to the mark + * or be protected by fsnotify_mark_srcu. */ void fsnotify_free_mark(struct fsnotify_mark *mark) { @@ -192,7 +190,11 @@ void fsnotify_free_mark(struct fsnotify_mark *mark) mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE; spin_unlock(&mark->lock); - call_srcu(&fsnotify_mark_srcu, &mark->g_rcu, fsnotify_mark_free_rcu); + spin_lock(&destroy_lock); + list_add(&mark->g_list, &destroy_list); + spin_unlock(&destroy_lock); + queue_delayed_work(system_unbound_wq, &reaper_work, + FSNOTIFY_REAPER_DELAY); /* * Some groups like to know that marks are being freed. This is a @@ -388,7 +390,12 @@ err: spin_unlock(&mark->lock); - call_srcu(&fsnotify_mark_srcu, &mark->g_rcu, fsnotify_mark_free_rcu); + spin_lock(&destroy_lock); + list_add(&mark->g_list, &destroy_list); + spin_unlock(&destroy_lock); + queue_delayed_work(system_unbound_wq, &reaper_work, + FSNOTIFY_REAPER_DELAY); + return ret; } @@ -491,3 +498,21 @@ void fsnotify_init_mark(struct fsnotify_mark *mark, atomic_set(&mark->refcnt, 1); mark->free_mark = free_mark; } + +static void fsnotify_mark_destroy(struct work_struct *work) +{ + struct fsnotify_mark *mark, *next; + struct list_head private_destroy_list; + + spin_lock(&destroy_lock); + /* exchange the list head */ + list_replace_init(&destroy_list, &private_destroy_list); + spin_unlock(&destroy_lock); + + synchronize_srcu(&fsnotify_mark_srcu); + + list_for_each_entry_safe(mark, next, &private_destroy_list, g_list) { + list_del_init(&mark->g_list); + fsnotify_put_mark(mark); + } +} diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 794fd15..cda0361 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -956,6 +956,7 @@ clean_orphan: tmp_ret = ocfs2_del_inode_from_orphan(osb, inode, di_bh, update_isize, end); if (tmp_ret < 0) { + ocfs2_inode_unlock(inode, 1); ret = tmp_ret; mlog_errno(ret); brelse(di_bh); diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c index 9581d190..77ebc2b 100644 --- a/fs/ocfs2/mmap.c +++ b/fs/ocfs2/mmap.c @@ -147,6 +147,10 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) ret = ocfs2_inode_lock(inode, &di_bh, 1); if (ret < 0) { mlog_errno(ret); + if (ret == -ENOMEM) + ret = VM_FAULT_OOM; + else + ret = VM_FAULT_SIGBUS; goto out; } diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index ed95272..52f6de5 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -618,7 +618,8 @@ static int ovl_remove_upper(struct dentry *dentry, bool is_dir) * sole user of this dentry. Too tricky... Just unhash for * now. */ - d_drop(dentry); + if (!err) + d_drop(dentry); inode_unlock(dir); return err; @@ -903,6 +904,13 @@ static int ovl_rename2(struct inode *olddir, struct dentry *old, if (!overwrite && new_is_dir && !old_opaque && new_opaque) ovl_remove_opaque(newdentry); + /* + * Old dentry now lives in different location. Dentries in + * lowerstack are stale. We cannot drop them here because + * access to them is lockless. This could be only pure upper + * or opaque directory - numlower is zero. Or upper non-dir + * entry - its pureness is tracked by flag opaque. + */ if (old_opaque != new_opaque) { ovl_dentry_set_opaque(old, new_opaque); if (!overwrite) diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index 49e2045..a4ff5d0 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -65,6 +65,8 @@ int ovl_setattr(struct dentry *dentry, struct iattr *attr) inode_lock(upperdentry->d_inode); err = notify_change(upperdentry, attr, NULL); + if (!err) + ovl_copyattr(upperdentry->d_inode, dentry->d_inode); inode_unlock(upperdentry->d_inode); } ovl_drop_write(dentry); diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 8d826bd..619ad4b 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -76,12 +76,14 @@ enum ovl_path_type ovl_path_type(struct dentry *dentry) if (oe->__upperdentry) { type = __OVL_PATH_UPPER; - if (oe->numlower) { - if (S_ISDIR(dentry->d_inode->i_mode)) - type |= __OVL_PATH_MERGE; - } else if (!oe->opaque) { + /* + * Non-dir dentry can hold lower dentry from previous + * location. Its purity depends only on opaque flag. + */ + if (oe->numlower && S_ISDIR(dentry->d_inode->i_mode)) + type |= __OVL_PATH_MERGE; + else if (!oe->opaque) type |= __OVL_PATH_PURE; - } } else { if (oe->numlower > 1) type |= __OVL_PATH_MERGE; @@ -341,6 +343,7 @@ static const struct dentry_operations ovl_dentry_operations = { static const struct dentry_operations ovl_reval_dentry_operations = { .d_release = ovl_dentry_release, + .d_select_inode = ovl_d_select_inode, .d_revalidate = ovl_dentry_revalidate, .d_weak_revalidate = ovl_dentry_weak_revalidate, }; @@ -202,6 +202,11 @@ static struct mount *last_dest, *last_source, *dest_master; static struct mountpoint *mp; static struct hlist_head *list; +static inline bool peers(struct mount *m1, struct mount *m2) +{ + return m1->mnt_group_id == m2->mnt_group_id && m1->mnt_group_id; +} + static int propagate_one(struct mount *m) { struct mount *child; @@ -212,7 +217,7 @@ static int propagate_one(struct mount *m) /* skip if mountpoint isn't covered by it */ if (!is_subdir(mp->m_dentry, m->mnt.mnt_root)) return 0; - if (m->mnt_group_id == last_dest->mnt_group_id) { + if (peers(m, last_dest)) { type = CL_MAKE_SHARED; } else { struct mount *n, *p; @@ -223,7 +228,7 @@ static int propagate_one(struct mount *m) last_source = last_source->mnt_master; last_dest = last_source->mnt_parent; } - if (n->mnt_group_id != last_dest->mnt_group_id) { + if (!peers(n, last_dest)) { last_source = last_source->mnt_master; last_dest = last_source->mnt_parent; } diff --git a/fs/read_write.c b/fs/read_write.c index 324ec27..dadf24e 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -17,6 +17,7 @@ #include <linux/splice.h> #include <linux/compat.h> #include <linux/mount.h> +#include <linux/fs.h> #include "internal.h" #include <asm/uaccess.h> @@ -183,7 +184,7 @@ loff_t no_seek_end_llseek(struct file *file, loff_t offset, int whence) switch (whence) { case SEEK_SET: case SEEK_CUR: return generic_file_llseek_size(file, offset, whence, - ~0ULL, 0); + OFFSET_MAX, 0); default: return -EINVAL; } @@ -1532,10 +1533,12 @@ int vfs_clone_file_range(struct file *file_in, loff_t pos_in, if (!(file_in->f_mode & FMODE_READ) || !(file_out->f_mode & FMODE_WRITE) || - (file_out->f_flags & O_APPEND) || - !file_in->f_op->clone_file_range) + (file_out->f_flags & O_APPEND)) return -EBADF; + if (!file_in->f_op->clone_file_range) + return -EOPNOTSUPP; + ret = clone_verify_area(file_in, pos_in, len, false); if (ret) return ret; @@ -415,6 +415,7 @@ void generic_shutdown_super(struct super_block *sb) sb->s_flags &= ~MS_ACTIVE; fsnotify_unmount_inodes(sb); + cgroup_writeback_umount(); evict_inodes(sb); diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 5031170..66cdb44 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -287,6 +287,12 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address, goto out; /* + * We don't do userfault handling for the final child pid update. + */ + if (current->flags & PF_EXITING) + goto out; + + /* * Check that we can return VM_FAULT_RETRY. * * NOTE: it should become possible to return VM_FAULT_RETRY @@ -940,7 +940,7 @@ ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs, bool trusted = capable(CAP_SYS_ADMIN); struct simple_xattr *xattr; ssize_t remaining_size = size; - int err; + int err = 0; #ifdef CONFIG_FS_POSIX_ACL if (inode->i_acl) { @@ -965,11 +965,11 @@ ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs, err = xattr_list_one(&buffer, &remaining_size, xattr->name); if (err) - return err; + break; } spin_unlock(&xattrs->lock); - return size - remaining_size; + return err ? err : size - remaining_size; } /* diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 379c089..a9ebabfe 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -55,7 +55,7 @@ xfs_count_page_state( } while ((bh = bh->b_this_page) != head); } -STATIC struct block_device * +struct block_device * xfs_find_bdev_for_inode( struct inode *inode) { @@ -1208,6 +1208,10 @@ xfs_vm_writepages( struct writeback_control *wbc) { xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED); + if (dax_mapping(mapping)) + return dax_writeback_mapping_range(mapping, + xfs_find_bdev_for_inode(mapping->host), wbc); + return generic_writepages(mapping, wbc); } diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h index f6ffc9a..a4343c6 100644 --- a/fs/xfs/xfs_aops.h +++ b/fs/xfs/xfs_aops.h @@ -62,5 +62,6 @@ int xfs_get_blocks_dax_fault(struct inode *inode, sector_t offset, struct buffer_head *map_bh, int create); extern void xfs_count_page_state(struct page *, int *, int *); +extern struct block_device *xfs_find_bdev_for_inode(struct inode *); #endif /* __XFS_AOPS_H__ */ diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 45ec9e4..6c87601 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -75,7 +75,8 @@ xfs_zero_extent( ssize_t size = XFS_FSB_TO_B(mp, count_fsb); if (IS_DAX(VFS_I(ip))) - return dax_clear_blocks(VFS_I(ip), block, size); + return dax_clear_sectors(xfs_find_bdev_for_inode(VFS_I(ip)), + sector, size); /* * let the block layer decide on the fastest method of diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index da37beb..be55688 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -1109,27 +1109,10 @@ xlog_verify_head( bool tmp_wrapped; /* - * Search backwards through the log looking for the log record header - * block. This wraps all the way back around to the head so something is - * seriously wrong if we can't find it. - */ - found = xlog_rseek_logrec_hdr(log, *head_blk, *head_blk, 1, bp, rhead_blk, - rhead, wrapped); - if (found < 0) - return found; - if (!found) { - xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__); - return -EIO; - } - - *tail_blk = BLOCK_LSN(be64_to_cpu((*rhead)->h_tail_lsn)); - - /* - * Now that we have a tail block, check the head of the log for torn - * writes. Search again until we hit the tail or the maximum number of - * log record I/Os that could have been in flight at one time. Use a - * temporary buffer so we don't trash the rhead/bp pointer from the - * call above. + * Check the head of the log for torn writes. Search backwards from the + * head until we hit the tail or the maximum number of log record I/Os + * that could have been in flight at one time. Use a temporary buffer so + * we don't trash the rhead/bp pointers from the caller. */ tmp_bp = xlog_get_bp(log, 1); if (!tmp_bp) @@ -1216,6 +1199,115 @@ xlog_verify_head( } /* + * Check whether the head of the log points to an unmount record. In other + * words, determine whether the log is clean. If so, update the in-core state + * appropriately. + */ +static int +xlog_check_unmount_rec( + struct xlog *log, + xfs_daddr_t *head_blk, + xfs_daddr_t *tail_blk, + struct xlog_rec_header *rhead, + xfs_daddr_t rhead_blk, + struct xfs_buf *bp, + bool *clean) +{ + struct xlog_op_header *op_head; + xfs_daddr_t umount_data_blk; + xfs_daddr_t after_umount_blk; + int hblks; + int error; + char *offset; + + *clean = false; + + /* + * Look for unmount record. If we find it, then we know there was a + * clean unmount. Since 'i' could be the last block in the physical + * log, we convert to a log block before comparing to the head_blk. + * + * Save the current tail lsn to use to pass to xlog_clear_stale_blocks() + * below. We won't want to clear the unmount record if there is one, so + * we pass the lsn of the unmount record rather than the block after it. + */ + if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { + int h_size = be32_to_cpu(rhead->h_size); + int h_version = be32_to_cpu(rhead->h_version); + + if ((h_version & XLOG_VERSION_2) && + (h_size > XLOG_HEADER_CYCLE_SIZE)) { + hblks = h_size / XLOG_HEADER_CYCLE_SIZE; + if (h_size % XLOG_HEADER_CYCLE_SIZE) + hblks++; + } else { + hblks = 1; + } + } else { + hblks = 1; + } + after_umount_blk = rhead_blk + hblks + BTOBB(be32_to_cpu(rhead->h_len)); + after_umount_blk = do_mod(after_umount_blk, log->l_logBBsize); + if (*head_blk == after_umount_blk && + be32_to_cpu(rhead->h_num_logops) == 1) { + umount_data_blk = rhead_blk + hblks; + umount_data_blk = do_mod(umount_data_blk, log->l_logBBsize); + error = xlog_bread(log, umount_data_blk, 1, bp, &offset); + if (error) + return error; + + op_head = (struct xlog_op_header *)offset; + if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) { + /* + * Set tail and last sync so that newly written log + * records will point recovery to after the current + * unmount record. + */ + xlog_assign_atomic_lsn(&log->l_tail_lsn, + log->l_curr_cycle, after_umount_blk); + xlog_assign_atomic_lsn(&log->l_last_sync_lsn, + log->l_curr_cycle, after_umount_blk); + *tail_blk = after_umount_blk; + + *clean = true; + } + } + + return 0; +} + +static void +xlog_set_state( + struct xlog *log, + xfs_daddr_t head_blk, + struct xlog_rec_header *rhead, + xfs_daddr_t rhead_blk, + bool bump_cycle) +{ + /* + * Reset log values according to the state of the log when we + * crashed. In the case where head_blk == 0, we bump curr_cycle + * one because the next write starts a new cycle rather than + * continuing the cycle of the last good log record. At this + * point we have guaranteed that all partial log records have been + * accounted for. Therefore, we know that the last good log record + * written was complete and ended exactly on the end boundary + * of the physical log. + */ + log->l_prev_block = rhead_blk; + log->l_curr_block = (int)head_blk; + log->l_curr_cycle = be32_to_cpu(rhead->h_cycle); + if (bump_cycle) + log->l_curr_cycle++; + atomic64_set(&log->l_tail_lsn, be64_to_cpu(rhead->h_tail_lsn)); + atomic64_set(&log->l_last_sync_lsn, be64_to_cpu(rhead->h_lsn)); + xlog_assign_grant_head(&log->l_reserve_head.grant, log->l_curr_cycle, + BBTOB(log->l_curr_block)); + xlog_assign_grant_head(&log->l_write_head.grant, log->l_curr_cycle, + BBTOB(log->l_curr_block)); +} + +/* * Find the sync block number or the tail of the log. * * This will be the block number of the last record to have its @@ -1238,22 +1330,20 @@ xlog_find_tail( xfs_daddr_t *tail_blk) { xlog_rec_header_t *rhead; - xlog_op_header_t *op_head; char *offset = NULL; xfs_buf_t *bp; int error; - xfs_daddr_t umount_data_blk; - xfs_daddr_t after_umount_blk; xfs_daddr_t rhead_blk; xfs_lsn_t tail_lsn; - int hblks; bool wrapped = false; + bool clean = false; /* * Find previous log record */ if ((error = xlog_find_head(log, head_blk))) return error; + ASSERT(*head_blk < INT_MAX); bp = xlog_get_bp(log, 1); if (!bp) @@ -1271,100 +1361,75 @@ xlog_find_tail( } /* - * Trim the head block back to skip over torn records. We can have - * multiple log I/Os in flight at any time, so we assume CRC failures - * back through the previous several records are torn writes and skip - * them. + * Search backwards through the log looking for the log record header + * block. This wraps all the way back around to the head so something is + * seriously wrong if we can't find it. */ - ASSERT(*head_blk < INT_MAX); - error = xlog_verify_head(log, head_blk, tail_blk, bp, &rhead_blk, - &rhead, &wrapped); - if (error) - goto done; + error = xlog_rseek_logrec_hdr(log, *head_blk, *head_blk, 1, bp, + &rhead_blk, &rhead, &wrapped); + if (error < 0) + return error; + if (!error) { + xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__); + return -EIO; + } + *tail_blk = BLOCK_LSN(be64_to_cpu(rhead->h_tail_lsn)); /* - * Reset log values according to the state of the log when we - * crashed. In the case where head_blk == 0, we bump curr_cycle - * one because the next write starts a new cycle rather than - * continuing the cycle of the last good log record. At this - * point we have guaranteed that all partial log records have been - * accounted for. Therefore, we know that the last good log record - * written was complete and ended exactly on the end boundary - * of the physical log. + * Set the log state based on the current head record. */ - log->l_prev_block = rhead_blk; - log->l_curr_block = (int)*head_blk; - log->l_curr_cycle = be32_to_cpu(rhead->h_cycle); - if (wrapped) - log->l_curr_cycle++; - atomic64_set(&log->l_tail_lsn, be64_to_cpu(rhead->h_tail_lsn)); - atomic64_set(&log->l_last_sync_lsn, be64_to_cpu(rhead->h_lsn)); - xlog_assign_grant_head(&log->l_reserve_head.grant, log->l_curr_cycle, - BBTOB(log->l_curr_block)); - xlog_assign_grant_head(&log->l_write_head.grant, log->l_curr_cycle, - BBTOB(log->l_curr_block)); + xlog_set_state(log, *head_blk, rhead, rhead_blk, wrapped); + tail_lsn = atomic64_read(&log->l_tail_lsn); /* - * Look for unmount record. If we find it, then we know there - * was a clean unmount. Since 'i' could be the last block in - * the physical log, we convert to a log block before comparing - * to the head_blk. + * Look for an unmount record at the head of the log. This sets the log + * state to determine whether recovery is necessary. + */ + error = xlog_check_unmount_rec(log, head_blk, tail_blk, rhead, + rhead_blk, bp, &clean); + if (error) + goto done; + + /* + * Verify the log head if the log is not clean (e.g., we have anything + * but an unmount record at the head). This uses CRC verification to + * detect and trim torn writes. If discovered, CRC failures are + * considered torn writes and the log head is trimmed accordingly. * - * Save the current tail lsn to use to pass to - * xlog_clear_stale_blocks() below. We won't want to clear the - * unmount record if there is one, so we pass the lsn of the - * unmount record rather than the block after it. + * Note that we can only run CRC verification when the log is dirty + * because there's no guarantee that the log data behind an unmount + * record is compatible with the current architecture. */ - if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { - int h_size = be32_to_cpu(rhead->h_size); - int h_version = be32_to_cpu(rhead->h_version); + if (!clean) { + xfs_daddr_t orig_head = *head_blk; - if ((h_version & XLOG_VERSION_2) && - (h_size > XLOG_HEADER_CYCLE_SIZE)) { - hblks = h_size / XLOG_HEADER_CYCLE_SIZE; - if (h_size % XLOG_HEADER_CYCLE_SIZE) - hblks++; - } else { - hblks = 1; - } - } else { - hblks = 1; - } - after_umount_blk = rhead_blk + hblks + BTOBB(be32_to_cpu(rhead->h_len)); - after_umount_blk = do_mod(after_umount_blk, log->l_logBBsize); - tail_lsn = atomic64_read(&log->l_tail_lsn); - if (*head_blk == after_umount_blk && - be32_to_cpu(rhead->h_num_logops) == 1) { - umount_data_blk = rhead_blk + hblks; - umount_data_blk = do_mod(umount_data_blk, log->l_logBBsize); - error = xlog_bread(log, umount_data_blk, 1, bp, &offset); + error = xlog_verify_head(log, head_blk, tail_blk, bp, + &rhead_blk, &rhead, &wrapped); if (error) goto done; - op_head = (xlog_op_header_t *)offset; - if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) { - /* - * Set tail and last sync so that newly written - * log records will point recovery to after the - * current unmount record. - */ - xlog_assign_atomic_lsn(&log->l_tail_lsn, - log->l_curr_cycle, after_umount_blk); - xlog_assign_atomic_lsn(&log->l_last_sync_lsn, - log->l_curr_cycle, after_umount_blk); - *tail_blk = after_umount_blk; - - /* - * Note that the unmount was clean. If the unmount - * was not clean, we need to know this to rebuild the - * superblock counters from the perag headers if we - * have a filesystem using non-persistent counters. - */ - log->l_mp->m_flags |= XFS_MOUNT_WAS_CLEAN; + /* update in-core state again if the head changed */ + if (*head_blk != orig_head) { + xlog_set_state(log, *head_blk, rhead, rhead_blk, + wrapped); + tail_lsn = atomic64_read(&log->l_tail_lsn); + error = xlog_check_unmount_rec(log, head_blk, tail_blk, + rhead, rhead_blk, bp, + &clean); + if (error) + goto done; } } /* + * Note that the unmount was clean. If the unmount was not clean, we + * need to know this to rebuild the superblock counters from the perag + * headers if we have a filesystem using non-persistent counters. + */ + if (clean) + log->l_mp->m_flags |= XFS_MOUNT_WAS_CLEAN; + + /* * Make sure that there are no blocks in front of the head * with the same cycle number as the head. This can happen * because we allow multiple outstanding log writes concurrently, @@ -4491,7 +4556,7 @@ xlog_recover_process( * know precisely what failed. */ if (pass == XLOG_RECOVER_CRCPASS) { - if (rhead->h_crc && crc != le32_to_cpu(rhead->h_crc)) + if (rhead->h_crc && crc != rhead->h_crc) return -EFSBADCRC; return 0; } @@ -4502,7 +4567,7 @@ xlog_recover_process( * zero CRC check prevents warnings from being emitted when upgrading * the kernel from one that does not add CRCs by default. */ - if (crc != le32_to_cpu(rhead->h_crc)) { + if (crc != rhead->h_crc) { if (rhead->h_crc || xfs_sb_version_hascrc(&log->l_mp->m_sb)) { xfs_alert(log->l_mp, "log record CRC mismatch: found 0x%x, expected 0x%x.", |