From d3e46fea1b1e8ba97a8c9dd8f54b97d086cd25aa Mon Sep 17 00:00:00 2001 From: David Sterba Date: Sun, 15 Jun 2014 02:04:19 +0200 Subject: btrfs: sink blocksize parameter to readahead_tree_block All callers pass nodesize. Signed-off-by: David Sterba diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 14a72ed..50eca33 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -2282,7 +2282,7 @@ static void reada_for_search(struct btrfs_root *root, if ((search <= target && target - search <= 65536) || (search > target && search - target <= 65536)) { gen = btrfs_node_ptr_generation(node, nr); - readahead_tree_block(root, search, blocksize); + readahead_tree_block(root, search); nread += blocksize; } nscan++; @@ -2301,7 +2301,6 @@ static noinline void reada_for_balance(struct btrfs_root *root, u64 gen; u64 block1 = 0; u64 block2 = 0; - int blocksize; parent = path->nodes[level + 1]; if (!parent) @@ -2309,7 +2308,6 @@ static noinline void reada_for_balance(struct btrfs_root *root, nritems = btrfs_header_nritems(parent); slot = path->slots[level + 1]; - blocksize = root->nodesize; if (slot > 0) { block1 = btrfs_node_blockptr(parent, slot - 1); @@ -2334,9 +2332,9 @@ static noinline void reada_for_balance(struct btrfs_root *root, } if (block1) - readahead_tree_block(root, block1, blocksize); + readahead_tree_block(root, block1); if (block2) - readahead_tree_block(root, block2, blocksize); + readahead_tree_block(root, block2); } diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 3096512..be9d7c6 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1073,12 +1073,12 @@ static const struct address_space_operations btree_aops = { .set_page_dirty = btree_set_page_dirty, }; -void readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize) +void readahead_tree_block(struct btrfs_root *root, u64 bytenr) { struct extent_buffer *buf = NULL; struct inode *btree_inode = root->fs_info->btree_inode; - buf = btrfs_find_create_tree_block(root, bytenr, blocksize); + buf = btrfs_find_create_tree_block(root, bytenr, root->nodesize); if (!buf) return; read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree, diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 4146518..9cf4359 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -46,7 +46,7 @@ struct btrfs_fs_devices; struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, u64 parent_transid); -void readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize); +void readahead_tree_block(struct btrfs_root *root, u64 bytenr); int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize, int mirror_num, struct extent_buffer **eb); struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 222d6ae..c025751 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -7485,7 +7485,7 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans, continue; } reada: - readahead_tree_block(root, bytenr, blocksize); + readahead_tree_block(root, bytenr); nread++; } wc->reada_slot = slot; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 74257d6..cb5d446 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2965,8 +2965,7 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans, while (rb_node) { block = rb_entry(rb_node, struct tree_block, rb_node); if (!block->key_ready) - readahead_tree_block(rc->extent_root, block->bytenr, - block->key.objectid); + 
readahead_tree_block(rc->extent_root, block->bytenr); rb_node = rb_next(rb_node); } -- cgit v0.10.2 From b6ae40ec76dc7589d9f77bd281515be3ad58c533 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Sun, 15 Jun 2014 02:18:25 +0200 Subject: btrfs: remove blocksize from reada_extent Replace with global nodesize instead. Signed-off-by: David Sterba diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index b63ae20..5c3fde6 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -66,7 +66,6 @@ struct reada_extctl { struct reada_extent { u64 logical; struct btrfs_key top; - u32 blocksize; int err; struct list_head extctl; int refcnt; @@ -349,7 +348,6 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, blocksize = root->nodesize; re->logical = logical; - re->blocksize = blocksize; re->top = *top; INIT_LIST_HEAD(&re->extctl); spin_lock_init(&re->lock); @@ -660,7 +658,6 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info, int mirror_num = 0; struct extent_buffer *eb = NULL; u64 logical; - u32 blocksize; int ret; int i; int need_kick = 0; @@ -694,7 +691,7 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info, spin_unlock(&fs_info->reada_lock); return 0; } - dev->reada_next = re->logical + re->blocksize; + dev->reada_next = re->logical + fs_info->tree_root->nodesize; re->refcnt++; spin_unlock(&fs_info->reada_lock); @@ -709,7 +706,6 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info, } } logical = re->logical; - blocksize = re->blocksize; spin_lock(&re->lock); if (re->scheduled_for == NULL) { @@ -724,8 +720,8 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info, return 0; atomic_inc(&dev->reada_in_flight); - ret = reada_tree_block_flagged(fs_info->extent_root, logical, blocksize, - mirror_num, &eb); + ret = reada_tree_block_flagged(fs_info->extent_root, logical, + fs_info->tree_root->nodesize, mirror_num, &eb); if (ret) __readahead_hook(fs_info->extent_root, NULL, logical, ret); else if (eb) @@ -851,7 +847,7 @@ static void dump_devs(struct btrfs_fs_info *fs_info, int all) break; printk(KERN_DEBUG " re: logical %llu size %u empty %d for %lld", - re->logical, re->blocksize, + re->logical, fs_info->tree_root->nodesize, list_empty(&re->extctl), re->scheduled_for ? re->scheduled_for->devid : -1); @@ -886,7 +882,8 @@ static void dump_devs(struct btrfs_fs_info *fs_info, int all) } printk(KERN_DEBUG "re: logical %llu size %u list empty %d for %lld", - re->logical, re->blocksize, list_empty(&re->extctl), + re->logical, fs_info->tree_root->nodesize, + list_empty(&re->extctl), re->scheduled_for ? 
re->scheduled_for->devid : -1); for (i = 0; i < re->nzones; ++i) { printk(KERN_CONT " zone %llu-%llu devs", -- cgit v0.10.2 From c0dcaa4d7b66b788b315c418bda3cca75c5938dc Mon Sep 17 00:00:00 2001 From: David Sterba Date: Sun, 15 Jun 2014 02:24:21 +0200 Subject: btrfs: sink blocksize parameter to reada_tree_block_flagged Signed-off-by: David Sterba diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index be9d7c6..8123b03 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1086,7 +1086,7 @@ void readahead_tree_block(struct btrfs_root *root, u64 bytenr) free_extent_buffer(buf); } -int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize, +int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, int mirror_num, struct extent_buffer **eb) { struct extent_buffer *buf = NULL; @@ -1094,7 +1094,7 @@ int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize, struct extent_io_tree *io_tree = &BTRFS_I(btree_inode)->io_tree; int ret; - buf = btrfs_find_create_tree_block(root, bytenr, blocksize); + buf = btrfs_find_create_tree_block(root, bytenr, root->nodesize); if (!buf) return 0; diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 9cf4359..4d4ecdd 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -47,7 +47,7 @@ struct btrfs_fs_devices; struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, u64 parent_transid); void readahead_tree_block(struct btrfs_root *root, u64 bytenr); -int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize, +int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, int mirror_num, struct extent_buffer **eb); struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize); diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index 5c3fde6..4d3d4e5 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -721,7 +721,7 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info, atomic_inc(&dev->reada_in_flight); ret = reada_tree_block_flagged(fs_info->extent_root, logical, - fs_info->tree_root->nodesize, mirror_num, &eb); + mirror_num, &eb); if (ret) __readahead_hook(fs_info->extent_root, NULL, logical, ret); else if (eb) -- cgit v0.10.2 From fe864576de7fb940b5bd1f8ab8908a08a3416ca0 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Sun, 15 Jun 2014 02:28:42 +0200 Subject: btrfs: sink blocksize parameter to btrfs_init_new_buffer Signed-off-by: David Sterba diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index c025751..50ebc74 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -7215,11 +7215,11 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, static struct extent_buffer * btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, - u64 bytenr, u32 blocksize, int level) + u64 bytenr, int level) { struct extent_buffer *buf; - buf = btrfs_find_create_tree_block(root, bytenr, blocksize); + buf = btrfs_find_create_tree_block(root, bytenr, root->nodesize); if (!buf) return ERR_PTR(-ENOMEM); btrfs_set_header_generation(buf, trans->transid); @@ -7338,7 +7338,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, if (btrfs_test_is_dummy_root(root)) { buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr, - blocksize, level); + level); if (!IS_ERR(buf)) root->alloc_bytenr += blocksize; return buf; @@ -7355,8 +7355,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, return 
ERR_PTR(ret); } - buf = btrfs_init_new_buffer(trans, root, ins.objectid, - blocksize, level); + buf = btrfs_init_new_buffer(trans, root, ins.objectid, level); BUG_ON(IS_ERR(buf)); /* -ENOMEM */ if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) { -- cgit v0.10.2 From a83fffb75d09cd3d44167b7fb9c1ab9e2269445f Mon Sep 17 00:00:00 2001 From: David Sterba Date: Sun, 15 Jun 2014 02:39:54 +0200 Subject: btrfs: sink blocksize parameter to btrfs_find_create_tree_block Finally it's clear that the requested blocksize is always equal to nodesize, with one exception, the superblock. Superblock has fixed size regardless of the metadata block size, but uses the same helpers to initialize sys array/chunk tree and to work with the chunk items. So it pretends to be an extent_buffer for a moment, btrfs_read_sys_array is full of special cases, we're adding one more. Signed-off-by: David Sterba diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 8123b03..548cb54 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1078,7 +1078,7 @@ void readahead_tree_block(struct btrfs_root *root, u64 bytenr) struct extent_buffer *buf = NULL; struct inode *btree_inode = root->fs_info->btree_inode; - buf = btrfs_find_create_tree_block(root, bytenr, root->nodesize); + buf = btrfs_find_create_tree_block(root, bytenr); if (!buf) return; read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree, @@ -1094,7 +1094,7 @@ int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, struct extent_io_tree *io_tree = &BTRFS_I(btree_inode)->io_tree; int ret; - buf = btrfs_find_create_tree_block(root, bytenr, root->nodesize); + buf = btrfs_find_create_tree_block(root, bytenr); if (!buf) return 0; @@ -1125,12 +1125,12 @@ struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, } struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, - u64 bytenr, u32 blocksize) + u64 bytenr) { if (btrfs_test_is_dummy_root(root)) return alloc_test_extent_buffer(root->fs_info, bytenr, - blocksize); - return alloc_extent_buffer(root->fs_info, bytenr, blocksize); + root->nodesize); + return alloc_extent_buffer(root->fs_info, bytenr, root->nodesize); } @@ -1152,7 +1152,7 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, struct extent_buffer *buf = NULL; int ret; - buf = btrfs_find_create_tree_block(root, bytenr, root->nodesize); + buf = btrfs_find_create_tree_block(root, bytenr); if (!buf) return NULL; diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 4d4ecdd..27d44c0 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -50,7 +50,7 @@ void readahead_tree_block(struct btrfs_root *root, u64 bytenr); int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, int mirror_num, struct extent_buffer **eb); struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, - u64 bytenr, u32 blocksize); + u64 bytenr); void clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf); int open_ctree(struct super_block *sb, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 50ebc74..8ff31f8 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -7219,7 +7219,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, { struct extent_buffer *buf; - buf = btrfs_find_create_tree_block(root, bytenr, root->nodesize); + buf = btrfs_find_create_tree_block(root, bytenr); if (!buf) return ERR_PTR(-ENOMEM); btrfs_set_header_generation(buf, trans->transid); @@ -7825,7 +7825,7 @@ 
static noinline int do_walk_down(struct btrfs_trans_handle *trans, next = btrfs_find_tree_block(root, bytenr); if (!next) { - next = btrfs_find_create_tree_block(root, bytenr, blocksize); + next = btrfs_find_create_tree_block(root, bytenr); if (!next) return -ENOMEM; btrfs_set_buffer_lockdep_class(root->root_key.objectid, next, diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 9a02da1..4a42edc 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2164,7 +2164,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, parent = path->nodes[*level]; root_owner = btrfs_header_owner(parent); - next = btrfs_find_create_tree_block(root, bytenr, blocksize); + next = btrfs_find_create_tree_block(root, bytenr); if (!next) return -ENOMEM; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 0144790..f0af9cd 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6247,8 +6247,13 @@ int btrfs_read_sys_array(struct btrfs_root *root) u32 cur; struct btrfs_key key; - sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET, - BTRFS_SUPER_INFO_SIZE); + ASSERT(BTRFS_SUPER_INFO_SIZE <= root->nodesize); + /* + * This will create extent buffer of nodesize, superblock size is + * fixed to BTRFS_SUPER_INFO_SIZE. If nodesize > sb size, this will + * overallocate but we can keep it as-is, only the first page is used. + */ + sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET); if (!sb) return -ENOMEM; btrfs_set_buffer_uptodate(sb); -- cgit v0.10.2 From 7476dfdaade5b373db4679555706546bd5b4bd6c Mon Sep 17 00:00:00 2001 From: David Sterba Date: Sun, 15 Jun 2014 03:34:59 +0200 Subject: btrfs: sink blocksize parameter to tree_block_processed Signed-off-by: David Sterba diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index cb5d446..d830853 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2855,9 +2855,10 @@ static void update_processed_blocks(struct reloc_control *rc, } } -static int tree_block_processed(u64 bytenr, u32 blocksize, - struct reloc_control *rc) +static int tree_block_processed(u64 bytenr, struct reloc_control *rc) { + u32 blocksize = rc->extent_root->nodesize; + if (test_range_bit(&rc->processed_blocks, bytenr, bytenr + blocksize - 1, EXTENT_DIRTY, 1, NULL)) return 1; @@ -3352,7 +3353,7 @@ static int __add_tree_block(struct reloc_control *rc, bool skinny = btrfs_fs_incompat(rc->extent_root->fs_info, SKINNY_METADATA); - if (tree_block_processed(bytenr, blocksize, rc)) + if (tree_block_processed(bytenr, rc)) return 0; if (tree_search(blocks, bytenr)) @@ -3610,7 +3611,7 @@ static int find_data_references(struct reloc_control *rc, if (added) goto next; - if (!tree_block_processed(leaf->start, leaf->len, rc)) { + if (!tree_block_processed(leaf->start, rc)) { block = kmalloc(sizeof(*block), GFP_NOFS); if (!block) { err = -ENOMEM; -- cgit v0.10.2 From 23d79d81b13431756fee428acde49c9b770da132 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Sun, 15 Jun 2014 02:55:29 +0200 Subject: btrfs: use GFP_NOFS in __alloc_extent_buffer directly Same mask from all callers. 
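For illustration only, a minimal userspace sketch of the transformation (stand-in names, not the btrfs code): once every caller passes the same constant, the parameter can be sunk into the callee, so the allocation policy lives in exactly one place and no caller can pass the wrong mask.

    #include <stdlib.h>

    struct eb {
            unsigned long start;
            unsigned long len;
    };

    /*
     * before: alloc_eb(start, len, mask), with every caller passing the
     * same NOFS-style mask; after: the policy is hard-coded here
     */
    static struct eb *alloc_eb(unsigned long start, unsigned long len)
    {
            struct eb *eb = calloc(1, sizeof(*eb)); /* fixed allocation policy */

            if (!eb)
                    return NULL;
            eb->start = start;
            eb->len = len;
            return eb;
    }
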
Signed-off-by: David Sterba diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 4ebabd2..619592d 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4598,11 +4598,11 @@ static inline void btrfs_release_extent_buffer(struct extent_buffer *eb) static struct extent_buffer * __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, - unsigned long len, gfp_t mask) + unsigned long len) { struct extent_buffer *eb = NULL; - eb = kmem_cache_zalloc(extent_buffer_cache, mask); + eb = kmem_cache_zalloc(extent_buffer_cache, GFP_NOFS); if (eb == NULL) return NULL; eb->start = start; @@ -4643,7 +4643,7 @@ struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src) struct extent_buffer *new; unsigned long num_pages = num_extent_pages(src->start, src->len); - new = __alloc_extent_buffer(NULL, src->start, src->len, GFP_NOFS); + new = __alloc_extent_buffer(NULL, src->start, src->len); if (new == NULL) return NULL; @@ -4672,7 +4672,7 @@ struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len) unsigned long num_pages = num_extent_pages(0, len); unsigned long i; - eb = __alloc_extent_buffer(NULL, start, len, GFP_NOFS); + eb = __alloc_extent_buffer(NULL, start, len); if (!eb) return NULL; @@ -4824,7 +4824,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, if (eb) return eb; - eb = __alloc_extent_buffer(fs_info, start, len, GFP_NOFS); + eb = __alloc_extent_buffer(fs_info, start, len); if (!eb) return NULL; -- cgit v0.10.2 From 3f556f7853ec4845a7c219d026cbcdf4cfa8cea7 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Sun, 15 Jun 2014 03:20:26 +0200 Subject: btrfs: unify extent buffer allocation api Make the extent buffer allocation interface consistent. Cloned eb will set a valid fs_info. For dummy eb, we can drop the length parameter and set it from fs_info. The built-in sanity checks may pass a NULL fs_info that's queried for nodesize, but we know it's 4096. 
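The length selection can be sketched as follows (simplified stand-in type; the patch itself reads fs_info->tree_root->nodesize):

    struct fs_info {
            unsigned long nodesize; /* stand-in for tree_root->nodesize */
    };

    static unsigned long dummy_eb_len(const struct fs_info *fs_info)
    {
            /*
             * Sanity tests may pass a NULL fs_info, but they are known to
             * run with 4k nodes, so fall back to that constant.
             */
            return fs_info ? fs_info->nodesize : 4096;
    }
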
Signed-off-by: David Sterba diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 50eca33..276d418 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1363,8 +1363,7 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path, if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) { BUG_ON(tm->slot != 0); - eb_rewin = alloc_dummy_extent_buffer(eb->start, - fs_info->tree_root->nodesize); + eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start); if (!eb_rewin) { btrfs_tree_read_unlock_blocking(eb); free_extent_buffer(eb); @@ -1444,7 +1443,7 @@ get_old_root(struct btrfs_root *root, u64 time_seq) } else if (old_root) { btrfs_tree_read_unlock(eb_root); free_extent_buffer(eb_root); - eb = alloc_dummy_extent_buffer(logical, root->nodesize); + eb = alloc_dummy_extent_buffer(root->fs_info, logical); } else { btrfs_set_lock_blocking_rw(eb_root, BTRFS_READ_LOCK); eb = btrfs_clone_extent_buffer(eb_root); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 619592d..dc424e3 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4643,7 +4643,7 @@ struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src) struct extent_buffer *new; unsigned long num_pages = num_extent_pages(src->start, src->len); - new = __alloc_extent_buffer(NULL, src->start, src->len); + new = __alloc_extent_buffer(src->fs_info, src->start, src->len); if (new == NULL) return NULL; @@ -4666,13 +4666,26 @@ struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src) return new; } -struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len) +struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, + u64 start) { struct extent_buffer *eb; - unsigned long num_pages = num_extent_pages(0, len); + unsigned long len; + unsigned long num_pages; unsigned long i; - eb = __alloc_extent_buffer(NULL, start, len); + if (!fs_info) { + /* + * Called only from tests that don't always have a fs_info + * available, but we know that nodesize is 4096 + */ + len = 4096; + } else { + len = fs_info->tree_root->nodesize; + } + num_pages = num_extent_pages(0, len); + + eb = __alloc_extent_buffer(fs_info, start, len); if (!eb) return NULL; @@ -4770,7 +4783,7 @@ struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, eb = find_extent_buffer(fs_info, start); if (eb) return eb; - eb = alloc_dummy_extent_buffer(start, len); + eb = alloc_dummy_extent_buffer(fs_info, start); if (!eb) return NULL; eb->fs_info = fs_info; diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index ece9ce8..e6553e3 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -263,7 +263,8 @@ void set_page_extent_mapped(struct page *page); struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, unsigned long len); -struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len); +struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, + u64 start); struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src); struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info, u64 start); diff --git a/fs/btrfs/tests/extent-buffer-tests.c b/fs/btrfs/tests/extent-buffer-tests.c index cc286ce..f51963a 100644 --- a/fs/btrfs/tests/extent-buffer-tests.c +++ b/fs/btrfs/tests/extent-buffer-tests.c @@ -53,7 +53,7 @@ static int test_btrfs_split_item(void) return -ENOMEM; } - path->nodes[0] = eb = alloc_dummy_extent_buffer(0, 4096); + path->nodes[0] = eb = alloc_dummy_extent_buffer(NULL, 
4096); if (!eb) { test_msg("Could not allocate dummy buffer\n"); ret = -ENOMEM; diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index 3ae0f5b..a116b55 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -255,7 +255,7 @@ static noinline int test_btrfs_get_extent(void) goto out; } - root->node = alloc_dummy_extent_buffer(0, 4096); + root->node = alloc_dummy_extent_buffer(NULL, 4096); if (!root->node) { test_msg("Couldn't allocate dummy buffer\n"); goto out; @@ -843,7 +843,7 @@ static int test_hole_first(void) goto out; } - root->node = alloc_dummy_extent_buffer(0, 4096); + root->node = alloc_dummy_extent_buffer(NULL, 4096); if (!root->node) { test_msg("Couldn't allocate dummy buffer\n"); goto out; diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c index ec3dcb2..7336b1c 100644 --- a/fs/btrfs/tests/qgroup-tests.c +++ b/fs/btrfs/tests/qgroup-tests.c @@ -404,6 +404,16 @@ int btrfs_test_qgroups(void) ret = -ENOMEM; goto out; } + /* We are using this root as our extent root */ + root->fs_info->extent_root = root; + + /* + * Some of the paths we test assume we have a filled out fs_info, so we + * just need to add the root in there so we don't panic. + */ + root->fs_info->tree_root = root; + root->fs_info->quota_root = root; + root->fs_info->quota_enabled = 1; /* * Can't use bytenr 0, some things freak out @@ -448,17 +458,6 @@ int btrfs_test_qgroups(void) goto out; } - /* We are using this root as our extent root */ - root->fs_info->extent_root = root; - - /* - * Some of the paths we test assume we have a filled out fs_info, so we - * just need to addt he root in there so we don't panic. - */ - root->fs_info->tree_root = root; - root->fs_info->quota_root = root; - root->fs_info->quota_enabled = 1; - test_msg("Running qgroup tests\n"); ret = test_no_shared_qgroup(root); if (ret) -- cgit v0.10.2 From ce3e69847e3ec79a38421bfd3d6f554d5e481231 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Sun, 15 Jun 2014 03:00:04 +0200 Subject: btrfs: sink parameter len to alloc_extent_buffer Because we're using globally known nodesize. Do the same for the sanity test function variant. 
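With the length gone from the signature, the callee derives everything from the globally known nodesize, including how many pages the buffer spans. A sketch of that page math (hypothetical helper name and a 4k-page assumption; the kernel has its own num_extent_pages()):

    #include <stdio.h>

    #define PAGE_SHIFT 12 /* assume 4k pages for the sketch */

    /* pages touched by the byte range [start, start + len) */
    static unsigned long num_pages(unsigned long start, unsigned long len)
    {
            return ((start + len - 1) >> PAGE_SHIFT) -
                   (start >> PAGE_SHIFT) + 1;
    }

    int main(void)
    {
            printf("%lu\n", num_pages(4096, 16384)); /* aligned 16k: 4 pages */
            printf("%lu\n", num_pages(6144, 16384)); /* unaligned:   5 pages */
            return 0;
    }
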
Signed-off-by: David Sterba diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 548cb54..9c20453 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1128,9 +1128,8 @@ struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, u64 bytenr) { if (btrfs_test_is_dummy_root(root)) - return alloc_test_extent_buffer(root->fs_info, bytenr, - root->nodesize); - return alloc_extent_buffer(root->fs_info, bytenr, root->nodesize); + return alloc_test_extent_buffer(root->fs_info, bytenr); + return alloc_extent_buffer(root->fs_info, bytenr); } diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index dc424e3..c4ca90a 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4775,7 +4775,7 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info, #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, - u64 start, unsigned long len) + u64 start) { struct extent_buffer *eb, *exists = NULL; int ret; @@ -4821,8 +4821,9 @@ free_eb: #endif struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, - u64 start, unsigned long len) + u64 start) { + unsigned long len = fs_info->tree_root->nodesize; unsigned long num_pages = num_extent_pages(start, len); unsigned long i; unsigned long index = start >> PAGE_CACHE_SHIFT; diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index e6553e3..71268e5 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -262,7 +262,7 @@ int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private); void set_page_extent_mapped(struct page *page); struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, - u64 start, unsigned long len); + u64 start); struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, u64 start); struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src); @@ -378,5 +378,5 @@ noinline u64 find_lock_delalloc_range(struct inode *inode, u64 *end, u64 max_bytes); #endif struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, - u64 start, unsigned long len); + u64 start); #endif diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c index 7336b1c..73f299e 100644 --- a/fs/btrfs/tests/qgroup-tests.c +++ b/fs/btrfs/tests/qgroup-tests.c @@ -419,7 +419,7 @@ int btrfs_test_qgroups(void) * Can't use bytenr 0, some things freak out * *cough*backref walking code*cough* */ - root->node = alloc_test_extent_buffer(root->fs_info, 4096, 4096); + root->node = alloc_test_extent_buffer(root->fs_info, 4096); if (!root->node) { test_msg("Couldn't allocate dummy buffer\n"); ret = -ENOMEM; -- cgit v0.10.2 From 381cf6587f8a8a8e981bc0c1aaaa8859b51dc756 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 2 Jan 2015 18:45:16 +0100 Subject: btrfs: fix leak of path in btrfs_find_item If btrfs_find_item is called with NULL path it allocates one locally but does not free it. Affected paths are inserting an orphan item for a file and for a subvol root. Move the path allocation to the callers. 
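The shape of the bug and of the fix, as a userspace sketch (stand-in types and error values): a helper that sometimes allocates on the caller's behalf gives the resource two possible owners, and an early return leaks it; making the caller always own the allocation leaves one allocation and one obvious free.

    #include <stdlib.h>

    struct path { int slots[8]; };

    static int search_slot(struct path *path) { (void)path; return 0; }

    /* after: the caller must supply the path and stays its only owner */
    static int find_item(struct path *path)
    {
            if (!path)              /* ASSERT(path) in the kernel patch */
                    return -22;     /* -EINVAL */
            return search_slot(path);
    }

    int main(void)
    {
            struct path *path = calloc(1, sizeof(*path));

            if (!path)
                    return 1;
            find_item(path);
            free(path);             /* one allocation, one obvious free */
            return 0;
    }
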
CC: # 3.14+ Fixes: 3f870c289900 ("btrfs: expand btrfs_find_item() to include find_orphan_item functionality") Signed-off-by: David Sterba diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 14a72ed..f54511d 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -2609,32 +2609,23 @@ static int key_search(struct extent_buffer *b, struct btrfs_key *key, return 0; } -int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *found_path, +int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path, u64 iobjectid, u64 ioff, u8 key_type, struct btrfs_key *found_key) { int ret; struct btrfs_key key; struct extent_buffer *eb; - struct btrfs_path *path; + + ASSERT(path); key.type = key_type; key.objectid = iobjectid; key.offset = ioff; - if (found_path == NULL) { - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - } else - path = found_path; - ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0); - if ((ret < 0) || (found_key == NULL)) { - if (path != found_path) - btrfs_free_path(path); + if ((ret < 0) || (found_key == NULL)) return ret; - } eb = path->nodes[0]; if (ret && path->slots[0] >= btrfs_header_nritems(eb)) { diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 8c63419..6182e54 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1630,6 +1630,7 @@ struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info, bool check_ref) { struct btrfs_root *root; + struct btrfs_path *path; int ret; if (location->objectid == BTRFS_ROOT_TREE_OBJECTID) @@ -1669,8 +1670,14 @@ again: if (ret) goto fail; - ret = btrfs_find_item(fs_info->tree_root, NULL, BTRFS_ORPHAN_OBJECTID, + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto fail; + } + ret = btrfs_find_item(fs_info->tree_root, path, BTRFS_ORPHAN_OBJECTID, location->objectid, BTRFS_ORPHAN_ITEM_KEY, NULL); + btrfs_free_path(path); if (ret < 0) goto fail; if (ret == 0) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 9a02da1..5be45c1 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -1257,10 +1257,19 @@ static int insert_orphan_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 offset) { int ret; - ret = btrfs_find_item(root, NULL, BTRFS_ORPHAN_OBJECTID, + struct btrfs_path *path; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = btrfs_find_item(root, path, BTRFS_ORPHAN_OBJECTID, offset, BTRFS_ORPHAN_ITEM_KEY, NULL); if (ret > 0) ret = btrfs_insert_orphan_item(trans, root, offset); + + btrfs_free_path(path); + return ret; } -- cgit v0.10.2 From 14692cc150d3ce10ea8766ccb2a8f483b77b49f0 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 2 Jan 2015 18:55:46 +0100 Subject: btrfs: cleanup, remove inode_item_info helper It's only a simple wrapper around btrfs_find_item, the locally defined key is not used. 
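The change is mechanical, roughly this shape (simplified stand-ins, not the btrfs signatures): the wrapper's only job was pinning the key type, so the direct call replaces it at the call site and the wrapper, with its unused local key, can be deleted.

    #include <stdint.h>

    enum { INODE_ITEM_KEY = 1 };

    static int find_item(uint64_t objectid, uint64_t offset, int key_type)
    {
            (void)objectid; (void)offset; (void)key_type;
            return 0;
    }

    /* before: a one-line wrapper around the generic search */
    static int inode_item_info(uint64_t inum, uint64_t ioff)
    {
            return find_item(inum, ioff, INODE_ITEM_KEY);
    }

    /* after: callers write find_item(inum, ioff, INODE_ITEM_KEY) directly */
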
Signed-off-by: David Sterba diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 8729cf6..6fdf82b 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1246,17 +1246,6 @@ int btrfs_check_shared(struct btrfs_trans_handle *trans, return ret; } -/* - * this makes the path point to (inum INODE_ITEM ioff) - */ -int inode_item_info(u64 inum, u64 ioff, struct btrfs_root *fs_root, - struct btrfs_path *path) -{ - struct btrfs_key key; - return btrfs_find_item(fs_root, path, inum, ioff, - BTRFS_INODE_ITEM_KEY, &key); -} - static int inode_ref_info(u64 inum, u64 ioff, struct btrfs_root *fs_root, struct btrfs_path *path, struct btrfs_key *found_key) diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index 2a1ac6b..9c41fba 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -32,9 +32,6 @@ struct inode_fs_paths { typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 root, void *ctx); -int inode_item_info(u64 inum, u64 ioff, struct btrfs_root *fs_root, - struct btrfs_path *path); - int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, struct btrfs_path *path, struct btrfs_key *found_key, u64 *flags); diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 9e1569f..4846f66 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -530,7 +530,11 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, goto err; } - ret = inode_item_info(inum, 0, local_root, swarn->path); + /* + * this makes the path point to (inum INODE_ITEM ioff) + */ + ret = btrfs_find_item(local_root, swarn->path, inum, 0, + BTRFS_INODE_ITEM_KEY, NULL); if (ret) { btrfs_release_path(swarn->path); goto err; -- cgit v0.10.2 From c234a24de91837db6f00aeebe28e3daddc53c1b7 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 2 Jan 2015 19:03:17 +0100 Subject: btrfs: cleanup, remove inode_ref_info helper A simple wrapper around btrfs_find_item. Signed-off-by: David Sterba diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 6fdf82b..f55721f 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1246,14 +1246,6 @@ int btrfs_check_shared(struct btrfs_trans_handle *trans, return ret; } -static int inode_ref_info(u64 inum, u64 ioff, struct btrfs_root *fs_root, - struct btrfs_path *path, - struct btrfs_key *found_key) -{ - return btrfs_find_item(fs_root, path, inum, ioff, - BTRFS_INODE_REF_KEY, found_key); -} - int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid, u64 start_off, struct btrfs_path *path, struct btrfs_inode_extref **ret_extref, @@ -1363,7 +1355,8 @@ char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, btrfs_tree_read_unlock_blocking(eb); free_extent_buffer(eb); } - ret = inode_ref_info(parent, 0, fs_root, path, &found_key); + ret = btrfs_find_item(fs_root, path, parent, 0, + BTRFS_INODE_REF_KEY, &found_key); if (ret > 0) ret = -ENOENT; if (ret) @@ -1716,8 +1709,10 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root, struct btrfs_key found_key; while (!ret) { - ret = inode_ref_info(inum, parent ? parent+1 : 0, fs_root, path, - &found_key); + ret = btrfs_find_item(fs_root, path, inum, + parent ? parent + 1 : 0, BTRFS_INODE_REF_KEY, + &found_key); + if (ret < 0) break; if (ret) { -- cgit v0.10.2 From 9c4f61f01d269815bb7c37be3ede59c5587747c6 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 2 Jan 2015 19:12:57 +0100 Subject: btrfs: simplify insert_orphan_item We can search and add the orphan item in one go, btrfs_insert_orphan_item will find out if the item already exists. 
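The pattern here is the usual search-then-insert reduction: rather than probing for the item first, attempt the insert and treat "already exists" as success. A sketch with a stand-in insert function:

    #include <errno.h>

    /* stand-in for btrfs_insert_orphan_item(): -EEXIST on duplicates */
    static int insert_item(unsigned long ino)
    {
            static unsigned long last;
            int ret = (ino && ino == last) ? -EEXIST : 0;

            last = ino;
            return ret;
    }

    static int insert_orphan_item(unsigned long ino)
    {
            int ret = insert_item(ino);

            if (ret == -EEXIST)     /* already present: nothing to do */
                    ret = 0;
            return ret;
    }
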
Signed-off-by: David Sterba diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 5be45c1..25a1c36 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -1254,21 +1254,13 @@ out: } static int insert_orphan_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 offset) + struct btrfs_root *root, u64 ino) { int ret; - struct btrfs_path *path; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - ret = btrfs_find_item(root, path, BTRFS_ORPHAN_OBJECTID, - offset, BTRFS_ORPHAN_ITEM_KEY, NULL); - if (ret > 0) - ret = btrfs_insert_orphan_item(trans, root, offset); - - btrfs_free_path(path); + ret = btrfs_insert_orphan_item(trans, root, ino); + if (ret == -EEXIST) + ret = 0; return ret; } -- cgit v0.10.2 From 1d4c08e0a60be356134d0c466744d0d4e16ebab0 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 2 Jan 2015 19:36:14 +0100 Subject: btrfs: expand btrfs_find_item if found_key is NULL If the found_key is NULL, then btrfs_find_item becomes a verbose wrapper for simple btrfs_search_slot. After we've removed all such callers, passing a NULL key is not valid anymore. Signed-off-by: David Sterba diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index f54511d..20d1f2b 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -2618,13 +2618,14 @@ int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path, struct extent_buffer *eb; ASSERT(path); + ASSERT(found_key); key.type = key_type; key.objectid = iobjectid; key.offset = ioff; ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0); - if ((ret < 0) || (found_key == NULL)) + if (ret < 0) return ret; eb = path->nodes[0]; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 6182e54..8d48660 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1631,6 +1631,7 @@ struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info, { struct btrfs_root *root; struct btrfs_path *path; + struct btrfs_key key; int ret; if (location->objectid == BTRFS_ROOT_TREE_OBJECTID) @@ -1675,8 +1676,11 @@ again: ret = -ENOMEM; goto fail; } - ret = btrfs_find_item(fs_info->tree_root, path, BTRFS_ORPHAN_OBJECTID, - location->objectid, BTRFS_ORPHAN_ITEM_KEY, NULL); + key.objectid = BTRFS_ORPHAN_OBJECTID; + key.type = BTRFS_ORPHAN_ITEM_KEY; + key.offset = location->objectid; + + ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); btrfs_free_path(path); if (ret < 0) goto fail; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 8bf326a..370f234 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5009,6 +5009,7 @@ static int fixup_tree_root_location(struct btrfs_root *root, struct btrfs_root *new_root; struct btrfs_root_ref *ref; struct extent_buffer *leaf; + struct btrfs_key key; int ret; int err = 0; @@ -5019,9 +5020,12 @@ static int fixup_tree_root_location(struct btrfs_root *root, } err = -ENOENT; - ret = btrfs_find_item(root->fs_info->tree_root, path, - BTRFS_I(dir)->root->root_key.objectid, - location->objectid, BTRFS_ROOT_REF_KEY, NULL); + key.objectid = BTRFS_I(dir)->root->root_key.objectid; + key.type = BTRFS_ROOT_REF_KEY; + key.offset = location->objectid; + + ret = btrfs_search_slot(NULL, root->fs_info->tree_root, &key, path, + 0, 0); if (ret) { if (ret < 0) err = ret; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 4846f66..53575a4 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -520,6 +520,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, struct inode_fs_paths *ipath = NULL; struct btrfs_root *local_root; struct btrfs_key root_key; + 
struct btrfs_key key; root_key.objectid = root; root_key.type = BTRFS_ROOT_ITEM_KEY; @@ -533,8 +534,11 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, /* * this makes the path point to (inum INODE_ITEM ioff) */ - ret = btrfs_find_item(local_root, swarn->path, inum, 0, - BTRFS_INODE_ITEM_KEY, NULL); + key.objectid = inum; + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + + ret = btrfs_search_slot(NULL, local_root, &key, swarn->path, 0, 0); if (ret) { btrfs_release_path(swarn->path); goto err; -- cgit v0.10.2 From e7070be198b34c26f39bd9010a29ce6462dc4f3e Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 16 Dec 2014 08:54:43 -0800 Subject: Btrfs: change how we track dirty roots I've been overloading root->dirty_list to keep track of dirty roots and which roots need to have their commit roots switched at transaction commit time. This could cause us to lose an update to the root which could corrupt the file system. To fix this use a state bit to know if the root is dirty, and if it isn't set we go ahead and move the root to the dirty list. This way if we re-dirty the root after adding it to the switch_commit list we make sure to update it. This also makes it so that the extent root is always the last root on the dirty list to try and keep the amount of churn down at this point in the commit. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 14a72ed..97a98fc 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -213,11 +213,19 @@ static struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root) */ static void add_root_to_dirty_list(struct btrfs_root *root) { + if (test_bit(BTRFS_ROOT_DIRTY, &root->state) || + !test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state)) + return; + spin_lock(&root->fs_info->trans_lock); - if (test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state) && - list_empty(&root->dirty_list)) { - list_add(&root->dirty_list, - &root->fs_info->dirty_cowonly_roots); + if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) { + /* Want the extent tree to be the last on the list */ + if (root->objectid == BTRFS_EXTENT_TREE_OBJECTID) + list_move_tail(&root->dirty_list, + &root->fs_info->dirty_cowonly_roots); + else + list_move(&root->dirty_list, + &root->fs_info->dirty_cowonly_roots); } spin_unlock(&root->fs_info->trans_lock); } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 7e60741..45ed4dc 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1775,6 +1775,7 @@ struct btrfs_subvolume_writers { #define BTRFS_ROOT_DEFRAG_RUNNING 6 #define BTRFS_ROOT_FORCE_COW 7 #define BTRFS_ROOT_MULTI_LOG_TASKS 8 +#define BTRFS_ROOT_DIRTY 9 /* * in ram representation of the tree. 
extent_root is used for all allocations diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index a605d4e..aa2219e 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1020,6 +1020,7 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans, u64 old_root_bytenr; u64 old_root_used; struct btrfs_root *tree_root = root->fs_info->tree_root; + bool extent_root = (root->objectid == BTRFS_EXTENT_TREE_OBJECTID); old_root_used = btrfs_root_used(&root->root_item); btrfs_write_dirty_block_groups(trans, root); @@ -1038,7 +1039,12 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans, return ret; old_root_used = btrfs_root_used(&root->root_item); - ret = btrfs_write_dirty_block_groups(trans, root); + if (extent_root) { + ret = btrfs_write_dirty_block_groups(trans, root); + if (ret) + return ret; + } + ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); if (ret) return ret; } @@ -1097,6 +1103,7 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans, next = fs_info->dirty_cowonly_roots.next; list_del_init(next); root = list_entry(next, struct btrfs_root, dirty_list); + clear_bit(BTRFS_ROOT_DIRTY, &root->state); if (root != fs_info->extent_root) list_add_tail(&root->dirty_list, -- cgit v0.10.2 From ce93ec548cfa02f9cd6b70d546d5f36f4d160f57 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 17 Nov 2014 15:45:48 -0500 Subject: Btrfs: track dirty block groups on their own list Currently any time we try to update the block groups on disk we will walk _all_ block groups and check for the ->dirty flag to see if it is set. This function can get called several times during a commit. So if you have several terabytes of data you will be a very sad panda as we will loop through _all_ of the block groups several times, which makes the commit take a while which slows down the rest of the file system operations. This patch introduces a dirty list for the block groups that we get added to when we dirty the block group for the first time. Then we simply update any block groups that have been dirtied since the last time we called btrfs_write_dirty_block_groups. This allows us to clean up how we write the free space cache out so it is much cleaner. 
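The core of the pattern, sketched in userspace C (hand-rolled list and no locking; the kernel version takes dirty_bgs_lock and reference counts the groups): dirtying an object puts it on a dedicated list at most once, and the writer drains only that list instead of scanning every object for a dirty flag.

    #include <stddef.h>

    struct block_group {
            struct block_group *dirty_next;
            int on_dirty_list;
    };

    static struct block_group *dirty_head;
    static struct block_group **dirty_tail = &dirty_head;

    static void mark_dirty(struct block_group *bg)
    {
            if (bg->on_dirty_list)          /* add at most once */
                    return;
            bg->on_dirty_list = 1;
            bg->dirty_next = NULL;
            *dirty_tail = bg;
            dirty_tail = &bg->dirty_next;
    }

    static void write_dirty_block_groups(void (*write_one)(struct block_group *))
    {
            /* visit only what was dirtied, not every block group */
            while (dirty_head) {
                    struct block_group *bg = dirty_head;

                    dirty_head = bg->dirty_next;
                    bg->on_dirty_list = 0;
                    write_one(bg);
            }
            dirty_tail = &dirty_head;
    }
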
Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 45ed4dc..0b4683f 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1238,7 +1238,6 @@ enum btrfs_disk_cache_state { BTRFS_DC_ERROR = 1, BTRFS_DC_CLEAR = 2, BTRFS_DC_SETUP = 3, - BTRFS_DC_NEED_WRITE = 4, }; struct btrfs_caching_control { @@ -1276,7 +1275,6 @@ struct btrfs_block_group_cache { unsigned long full_stripe_len; unsigned int ro:1; - unsigned int dirty:1; unsigned int iref:1; unsigned int has_caching_ctl:1; unsigned int removed:1; @@ -1314,6 +1312,9 @@ struct btrfs_block_group_cache { struct list_head ro_list; atomic_t trimming; + + /* For dirty block groups */ + struct list_head dirty_list; }; /* delayed seq elem */ diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 1511658..21c373f 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -74,8 +74,9 @@ enum { RESERVE_ALLOC_NO_ACCOUNT = 2, }; -static int update_block_group(struct btrfs_root *root, - u64 bytenr, u64 num_bytes, int alloc); +static int update_block_group(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, + u64 num_bytes, int alloc); static int __btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, @@ -3315,120 +3316,42 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, struct btrfs_root *root) { struct btrfs_block_group_cache *cache; - int err = 0; + struct btrfs_transaction *cur_trans = trans->transaction; + int ret = 0; struct btrfs_path *path; - u64 last = 0; + + if (list_empty(&cur_trans->dirty_bgs)) + return 0; path = btrfs_alloc_path(); if (!path) return -ENOMEM; -again: - while (1) { - cache = btrfs_lookup_first_block_group(root->fs_info, last); - while (cache) { - if (cache->disk_cache_state == BTRFS_DC_CLEAR) - break; - cache = next_block_group(root, cache); - } - if (!cache) { - if (last == 0) - break; - last = 0; - continue; - } - err = cache_save_setup(cache, trans, path); - last = cache->key.objectid + cache->key.offset; - btrfs_put_block_group(cache); - } - - while (1) { - if (last == 0) { - err = btrfs_run_delayed_refs(trans, root, - (unsigned long)-1); - if (err) /* File system offline */ - goto out; - } - - cache = btrfs_lookup_first_block_group(root->fs_info, last); - while (cache) { - if (cache->disk_cache_state == BTRFS_DC_CLEAR) { - btrfs_put_block_group(cache); - goto again; - } - - if (cache->dirty) - break; - cache = next_block_group(root, cache); - } - if (!cache) { - if (last == 0) - break; - last = 0; - continue; - } - - if (cache->disk_cache_state == BTRFS_DC_SETUP) - cache->disk_cache_state = BTRFS_DC_NEED_WRITE; - cache->dirty = 0; - last = cache->key.objectid + cache->key.offset; - - err = write_one_cache_group(trans, root, path, cache); - btrfs_put_block_group(cache); - if (err) /* File system offline */ - goto out; - } - - while (1) { - /* - * I don't think this is needed since we're just marking our - * preallocated extent as written, but just in case it can't - * hurt. - */ - if (last == 0) { - err = btrfs_run_delayed_refs(trans, root, - (unsigned long)-1); - if (err) /* File system offline */ - goto out; - } - - cache = btrfs_lookup_first_block_group(root->fs_info, last); - while (cache) { - /* - * Really this shouldn't happen, but it could if we - * couldn't write the entire preallocated extent and - * splitting the extent resulted in a new block. 
- */ - if (cache->dirty) { - btrfs_put_block_group(cache); - goto again; - } - if (cache->disk_cache_state == BTRFS_DC_NEED_WRITE) - break; - cache = next_block_group(root, cache); - } - if (!cache) { - if (last == 0) - break; - last = 0; - continue; - } - - err = btrfs_write_out_cache(root, trans, cache, path); - - /* - * If we didn't have an error then the cache state is still - * NEED_WRITE, so we can set it to WRITTEN. - */ - if (!err && cache->disk_cache_state == BTRFS_DC_NEED_WRITE) - cache->disk_cache_state = BTRFS_DC_WRITTEN; - last = cache->key.objectid + cache->key.offset; + /* + * We don't need the lock here since we are protected by the transaction + * commit. We want to do the cache_save_setup first and then run the + * delayed refs to make sure we have the best chance at doing this all + * in one shot. + */ + while (!list_empty(&cur_trans->dirty_bgs)) { + cache = list_first_entry(&cur_trans->dirty_bgs, + struct btrfs_block_group_cache, + dirty_list); + list_del_init(&cache->dirty_list); + if (cache->disk_cache_state == BTRFS_DC_CLEAR) + cache_save_setup(cache, trans, path); + if (!ret) + ret = btrfs_run_delayed_refs(trans, root, + (unsigned long) -1); + if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) + btrfs_write_out_cache(root, trans, cache, path); + if (!ret) + ret = write_one_cache_group(trans, root, path, cache); btrfs_put_block_group(cache); } -out: btrfs_free_path(path); - return err; + return ret; } int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr) @@ -5375,8 +5298,9 @@ void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes) btrfs_free_reserved_data_space(inode, num_bytes); } -static int update_block_group(struct btrfs_root *root, - u64 bytenr, u64 num_bytes, int alloc) +static int update_block_group(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, + u64 num_bytes, int alloc) { struct btrfs_block_group_cache *cache = NULL; struct btrfs_fs_info *info = root->fs_info; @@ -5414,6 +5338,14 @@ static int update_block_group(struct btrfs_root *root, if (!alloc && cache->cached == BTRFS_CACHE_NO) cache_block_group(cache, 1); + spin_lock(&trans->transaction->dirty_bgs_lock); + if (list_empty(&cache->dirty_list)) { + list_add_tail(&cache->dirty_list, + &trans->transaction->dirty_bgs); + btrfs_get_block_group(cache); + } + spin_unlock(&trans->transaction->dirty_bgs_lock); + byte_in_group = bytenr - cache->key.objectid; WARN_ON(byte_in_group > cache->key.offset); @@ -5424,7 +5356,6 @@ static int update_block_group(struct btrfs_root *root, cache->disk_cache_state < BTRFS_DC_CLEAR) cache->disk_cache_state = BTRFS_DC_CLEAR; - cache->dirty = 1; old_val = btrfs_block_group_used(&cache->item); num_bytes = min(total, cache->key.offset - byte_in_group); if (alloc) { @@ -6103,7 +6034,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } } - ret = update_block_group(root, bytenr, num_bytes, 0); + ret = update_block_group(trans, root, bytenr, num_bytes, 0); if (ret) { btrfs_abort_transaction(trans, extent_root, ret); goto out; @@ -7063,7 +6994,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, if (ret) return ret; - ret = update_block_group(root, ins->objectid, ins->offset, 1); + ret = update_block_group(trans, root, ins->objectid, ins->offset, 1); if (ret) { /* -ENOENT, logic error */ btrfs_err(fs_info, "update block group failed for %llu %llu", ins->objectid, ins->offset); @@ -7152,7 +7083,8 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, return ret; } - ret = 
update_block_group(root, ins->objectid, root->nodesize, 1); + ret = update_block_group(trans, root, ins->objectid, root->nodesize, + 1); if (ret) { /* -ENOENT, logic error */ btrfs_err(fs_info, "update block group failed for %llu %llu", ins->objectid, ins->offset); @@ -9005,6 +8937,7 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size) INIT_LIST_HEAD(&cache->cluster_list); INIT_LIST_HEAD(&cache->bg_list); INIT_LIST_HEAD(&cache->ro_list); + INIT_LIST_HEAD(&cache->dirty_list); btrfs_init_free_space_ctl(cache); atomic_set(&cache->trimming, 0); @@ -9068,9 +9001,8 @@ int btrfs_read_block_groups(struct btrfs_root *root) * b) Setting 'dirty flag' makes sure that we flush * the new space cache info onto disk. */ - cache->disk_cache_state = BTRFS_DC_CLEAR; if (btrfs_test_opt(root, SPACE_CACHE)) - cache->dirty = 1; + cache->disk_cache_state = BTRFS_DC_CLEAR; } read_extent_buffer(leaf, &cache->item, @@ -9461,6 +9393,13 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, } } + spin_lock(&trans->transaction->dirty_bgs_lock); + if (!list_empty(&block_group->dirty_list)) { + list_del_init(&block_group->dirty_list); + btrfs_put_block_group(block_group); + } + spin_unlock(&trans->transaction->dirty_bgs_lock); + btrfs_remove_free_space_cache(block_group); spin_lock(&block_group->space_info->lock); diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index d6c03f7..80a3141 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1243,6 +1243,7 @@ int btrfs_write_out_cache(struct btrfs_root *root, struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct inode *inode; int ret = 0; + enum btrfs_disk_cache_state dcs = BTRFS_DC_WRITTEN; root = root->fs_info->tree_root; @@ -1266,9 +1267,7 @@ int btrfs_write_out_cache(struct btrfs_root *root, ret = __btrfs_write_out_cache(root, inode, ctl, block_group, trans, path, block_group->key.objectid); if (ret) { - spin_lock(&block_group->lock); - block_group->disk_cache_state = BTRFS_DC_ERROR; - spin_unlock(&block_group->lock); + dcs = BTRFS_DC_ERROR; ret = 0; #ifdef DEBUG btrfs_err(root->fs_info, @@ -1277,6 +1276,9 @@ int btrfs_write_out_cache(struct btrfs_root *root, #endif } + spin_lock(&block_group->lock); + block_group->disk_cache_state = dcs; + spin_unlock(&block_group->lock); iput(inode); return ret; } diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index aa2219e..e0faf80 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -248,6 +248,8 @@ loop: INIT_LIST_HEAD(&cur_trans->pending_chunks); INIT_LIST_HEAD(&cur_trans->switch_commits); INIT_LIST_HEAD(&cur_trans->pending_ordered); + INIT_LIST_HEAD(&cur_trans->dirty_bgs); + spin_lock_init(&cur_trans->dirty_bgs_lock); list_add_tail(&cur_trans->list, &fs_info->trans_list); extent_io_tree_init(&cur_trans->dirty_pages, fs_info->btree_inode->i_mapping); @@ -1028,7 +1030,9 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans, while (1) { old_root_bytenr = btrfs_root_bytenr(&root->root_item); if (old_root_bytenr == root->node->start && - old_root_used == btrfs_root_used(&root->root_item)) + old_root_used == btrfs_root_used(&root->root_item) && + (!extent_root || + list_empty(&trans->transaction->dirty_bgs))) break; btrfs_set_root_node(&root->root_item, root->node); @@ -1047,6 +1051,9 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans, ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); if (ret) return ret; + ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); 
+ if (ret) + return ret; } return 0; @@ -1067,10 +1074,6 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans, struct extent_buffer *eb; int ret; - ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); - if (ret) - return ret; - eb = btrfs_lock_root_node(fs_info->tree_root); ret = btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, 0, &eb); @@ -1990,6 +1993,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, switch_commit_roots(cur_trans, root->fs_info); assert_qgroups_uptodate(trans); + ASSERT(list_empty(&cur_trans->dirty_bgs)); update_super_roots(root); btrfs_set_super_log_root(root->fs_info->super_copy, 0); diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 00ed29c..3305451 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -58,6 +58,8 @@ struct btrfs_transaction { struct list_head pending_chunks; struct list_head pending_ordered; struct list_head switch_commits; + struct list_head dirty_bgs; + spinlock_t dirty_bgs_lock; struct btrfs_delayed_ref_root delayed_refs; int aborted; }; -- cgit v0.10.2 From a8df6fe666f9a3a7c08df70f94351f967462dd95 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 20 Jan 2015 12:40:53 +0000 Subject: Btrfs: fix setup_leaf_for_split() to avoid leaf corruption We were incorrectly detecting when the target key didn't exist anymore after releasing the path and re-searching the tree. This could make us split or duplicate (btrfs_split_item() and btrfs_duplicate_item() are its only callers at the moment) an item when we should not. For the case of duplicating an item, we currently only duplicate checksum items (csum tree) and file extent items (fs/subvol trees). For the checksum items we end up overriding the item completely, but for file extent items we update only some of their fields in the copy (done in __btrfs_drop_extents), which means we can end up having a logical corruption for some values. Also for the case where we duplicate a file extent item it will make us produce a leaf with a wrong key order, as btrfs_duplicate_item() advances us to the next slot and then its caller sets a smaller key on the new item at that slot (like in __btrfs_drop_extents() e.g.). Alternatively if the tree search in setup_leaf_for_split() leaves with path->slots[0] == btrfs_header_nritems(path->nodes[0]), we end up accessing beyond the leaf's end (when we check if the item's size has changed) and make our caller insert an item at the invalid slot btrfs_header_nritems(path->nodes[0]) + 1, causing an invalid memory access if the leaf is full or nearly full. This issue has been present since the introduction of this function in 2009: Btrfs: Add btrfs_duplicate_item commit ad48fd754676bfae4139be1a897b1ea58f9aaf21 Signed-off-by: Filipe Manana Signed-off-by: Chris Mason diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 6b2ec90..fdd8fb4 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -4353,13 +4353,15 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans, path->search_for_split = 1; ret = btrfs_search_slot(trans, root, &key, path, 0, 1); path->search_for_split = 0; + if (ret > 0) + ret = -EAGAIN; if (ret < 0) goto err; ret = -EAGAIN; leaf = path->nodes[0]; - /* if our item isn't there or got smaller, return now */ - if (ret > 0 || item_size != btrfs_item_size_nr(leaf, path->slots[0])) + /* if our item isn't there, return now */ + if (item_size != btrfs_item_size_nr(leaf, path->slots[0])) goto err; /* the leaf has changed, it now has room. 
return now */ -- cgit v0.10.2 From 68b663d13ceba777ee6e250154f0e3ab28bd1181 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 19 Dec 2014 18:38:37 +0100 Subject: btrfs: update message levels for errors Several messages that point to some internal problem, level INFO is wrong here. Signed-off-by: David Sterba Signed-off-by: Chris Mason diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 65384de..2a3e225 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -367,7 +367,8 @@ static int verify_parent_transid(struct extent_io_tree *io_tree, ret = 0; goto out; } - printk_ratelimited(KERN_INFO "BTRFS (device %s): parent transid verify failed on %llu wanted %llu found %llu\n", + printk_ratelimited(KERN_ERR + "BTRFS (device %s): parent transid verify failed on %llu wanted %llu found %llu\n", eb->fs_info->sb->s_id, eb->start, parent_transid, btrfs_header_generation(eb)); ret = 1; @@ -633,21 +634,21 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, found_start = btrfs_header_bytenr(eb); if (found_start != eb->start) { - printk_ratelimited(KERN_INFO "BTRFS (device %s): bad tree block start " + printk_ratelimited(KERN_ERR "BTRFS (device %s): bad tree block start " "%llu %llu\n", eb->fs_info->sb->s_id, found_start, eb->start); ret = -EIO; goto err; } if (check_tree_block_fsid(root, eb)) { - printk_ratelimited(KERN_INFO "BTRFS (device %s): bad fsid on block %llu\n", + printk_ratelimited(KERN_ERR "BTRFS (device %s): bad fsid on block %llu\n", eb->fs_info->sb->s_id, eb->start); ret = -EIO; goto err; } found_level = btrfs_header_level(eb); if (found_level >= BTRFS_MAX_LEVEL) { - btrfs_info(root->fs_info, "bad tree block level %d", + btrfs_err(root->fs_info, "bad tree block level %d", (int)btrfs_header_level(eb)); ret = -EIO; goto err; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 370f234..29892eb 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3407,7 +3407,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) out: if (ret) - btrfs_crit(root->fs_info, + btrfs_err(root->fs_info, "could not do orphan cleanup %d", ret); btrfs_free_path(path); return ret; -- cgit v0.10.2 From aa8ee31209d644a3f4716cb0c43aba7889fbef31 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 19 Dec 2014 18:38:41 +0100 Subject: btrfs: update message levels during failed mount All error conditions from open_ctree shall be ERR. Warning would suggest that something's wrong and we can continue. 
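The rule of thumb the patch applies, in sketch form (a userspace analogue, not the open_ctree code): a message on a path that ends in a goto fail_* unwind describes a condition the mount does not survive, so it is an error; a warning is reserved for conditions the code continues past.

    #include <stdio.h>

    #define pr_err(msg)  fprintf(stderr, "ERROR: %s\n", msg)
    #define pr_warn(msg) fprintf(stderr, "warning: %s\n", msg)

    static int read_sys_array(void) { return -1; } /* pretend it fails */

    static int open_tree(void)
    {
            int ret;

            ret = read_sys_array();
            if (ret) {
                    pr_err("failed to read the system array");
                    goto fail;      /* mount aborts: ERR, not WARNING */
            }
            pr_warn("optional feature missing, continuing"); /* survivable */
            ret = 0;
    fail:
            return ret;
    }

    int main(void)
    {
            return open_tree() ? 1 : 0;
    }
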
Signed-off-by: David Sterba Signed-off-by: Chris Mason diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 2a3e225..7ff24cb 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2533,7 +2533,7 @@ int open_ctree(struct super_block *sb, */ if ((features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) && (sectorsize != nodesize)) { - printk(KERN_WARNING "BTRFS: unequal leaf/node/sector sizes " + printk(KERN_ERR "BTRFS: unequal leaf/node/sector sizes " "are not allowed for mixed block groups on %s\n", sb->s_id); goto fail_alloc; @@ -2641,12 +2641,12 @@ int open_ctree(struct super_block *sb, sb->s_blocksize_bits = blksize_bits(sectorsize); if (btrfs_super_magic(disk_super) != BTRFS_MAGIC) { - printk(KERN_INFO "BTRFS: valid FS not found on %s\n", sb->s_id); + printk(KERN_ERR "BTRFS: valid FS not found on %s\n", sb->s_id); goto fail_sb_buffer; } if (sectorsize != PAGE_SIZE) { - printk(KERN_WARNING "BTRFS: Incompatible sector size(%lu) " + printk(KERN_ERR "BTRFS: incompatible sector size (%lu) " "found on %s\n", (unsigned long)sectorsize, sb->s_id); goto fail_sb_buffer; } @@ -2655,7 +2655,7 @@ int open_ctree(struct super_block *sb, ret = btrfs_read_sys_array(tree_root); mutex_unlock(&fs_info->chunk_mutex); if (ret) { - printk(KERN_WARNING "BTRFS: failed to read the system " + printk(KERN_ERR "BTRFS: failed to read the system " "array on %s\n", sb->s_id); goto fail_sb_buffer; } @@ -2670,7 +2670,7 @@ int open_ctree(struct super_block *sb, generation); if (!chunk_root->node || !test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) { - printk(KERN_WARNING "BTRFS: failed to read chunk root on %s\n", + printk(KERN_ERR "BTRFS: failed to read chunk root on %s\n", sb->s_id); goto fail_tree_roots; } @@ -2682,7 +2682,7 @@ int open_ctree(struct super_block *sb, ret = btrfs_read_chunk_tree(chunk_root); if (ret) { - printk(KERN_WARNING "BTRFS: failed to read chunk tree on %s\n", + printk(KERN_ERR "BTRFS: failed to read chunk tree on %s\n", sb->s_id); goto fail_tree_roots; } @@ -2694,7 +2694,7 @@ int open_ctree(struct super_block *sb, btrfs_close_extra_devices(fs_info, fs_devices, 0); if (!fs_devices->latest_bdev) { - printk(KERN_CRIT "BTRFS: failed to read devices on %s\n", + printk(KERN_ERR "BTRFS: failed to read devices on %s\n", sb->s_id); goto fail_tree_roots; } @@ -2778,7 +2778,7 @@ retry_root_backup: ret = btrfs_recover_balance(fs_info); if (ret) { - printk(KERN_WARNING "BTRFS: failed to recover balance\n"); + printk(KERN_ERR "BTRFS: failed to recover balance\n"); goto fail_block_groups; } -- cgit v0.10.2 From f0954c66377b1610a512fd05f4d33afa54441c3b Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 19 Dec 2014 18:38:44 +0100 Subject: btrfs: update message levels after checksum errors The errors are worth noting and might get missed with INFO level. 
Signed-off-by: David Sterba
Signed-off-by: Chris Mason

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 7ff24cb..20da6820 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -318,7 +318,7 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
 			memcpy(&found, result, csum_size);
 
 			read_extent_buffer(buf, &val, 0, csum_size);
-			printk_ratelimited(KERN_INFO
+			printk_ratelimited(KERN_WARNING
 				"BTRFS: %s checksum verify failed on %llu wanted %X found %X "
 				"level %d\n",
 				root->fs_info->sb->s_id, buf->start,

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 29892eb..5e52920 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2945,7 +2945,7 @@ static int __readpage_endio_check(struct inode *inode,
 	return 0;
 zeroit:
 	if (__ratelimit(&_rs))
-		btrfs_info(BTRFS_I(inode)->root->fs_info,
+		btrfs_warn(BTRFS_I(inode)->root->fs_info,
 			   "csum failed ino %llu off %llu csum %u expected csum %u",
 			   btrfs_ino(inode), start, csum, csum_expected);
 	memset(kaddr + pgoff, 1, len);

-- cgit v0.10.2

From 5efa0490cc94aee06cd8d282683e22a8ce0a0026 Mon Sep 17 00:00:00 2001
From: David Sterba
Date: Fri, 19 Dec 2014 18:38:47 +0100
Subject: btrfs: set proper message level for skinny metadata

This has been confusing people for too long; the message is really just
informative.

CC: # 3.10+
Signed-off-by: David Sterba
Signed-off-by: Chris Mason

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 20da6820..0696a10 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2509,7 +2509,7 @@ int open_ctree(struct super_block *sb,
 		features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
 
 	if (features & BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA)
-		printk(KERN_ERR "BTRFS: has skinny extents\n");
+		printk(KERN_INFO "BTRFS: has skinny extents\n");
 
 	/*
 	 * flag our filesystem as having big metadata blocks if

-- cgit v0.10.2

From 9ee49a047dc53fd21808cbb7f3b6a3345463e834 Mon Sep 17 00:00:00 2001
From: David Sterba
Date: Wed, 14 Jan 2015 19:52:13 +0100
Subject: btrfs: switch extent_state state to unsigned

Currently there's a 4B hole in the structure between refs and state,
and only 16 bits of state are used, so we can make it unsigned. This
gives better packing and may save some stack space for local variables.
The size of extent_state gets reduced by 8B, and there are usually a
lot of slab objects.
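As a rough, self-contained illustration of that hole (a sketch, not part
of the patch; the struct names below are made-up stand-ins for the tail
of extent_state shown in the pahole output that follows): on x86_64 an
atomic_t occupies 4 bytes while an unsigned long occupies 8 and must be
8-byte aligned, so a 4-byte hole opens between refs and state; a 4-byte
unsigned packs into that hole and the total shrinks by 8 bytes:

	#include <stdio.h>

	/* stand-ins for the tail of struct extent_state (refs, state, private) */
	struct tail_before { int refs; unsigned long state; unsigned long long private; };
	struct tail_after  { int refs; unsigned state;      unsigned long long private; };

	int main(void)
	{
		/* on x86_64 this prints 24 and 16: the 4-byte hole plus the
		 * narrower field account for the 8-byte reduction pahole shows */
		printf("before: %zu, after: %zu\n",
		       sizeof(struct tail_before), sizeof(struct tail_after));
		return 0;
	}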
struct extent_state {
	u64                        start;            /*     0     8 */
	u64                        end;              /*     8     8 */
	struct rb_node             rb_node;          /*    16    24 */
	wait_queue_head_t          wq;               /*    40    24 */
	/* --- cacheline 1 boundary (64 bytes) --- */
	atomic_t                   refs;             /*    64     4 */

	/* XXX 4 bytes hole, try to pack */

	long unsigned int          state;            /*    72     8 */
	u64                        private;          /*    80     8 */

	/* size: 88, cachelines: 2, members: 7 */
	/* sum members: 84, holes: 1, sum holes: 4 */
	/* last cacheline: 24 bytes */
};

Signed-off-by: David Sterba
Signed-off-by: Chris Mason

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index c4ca90a..65bd285 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -64,7 +64,7 @@ void btrfs_leak_debug_check(void)
 	while (!list_empty(&states)) {
 		state = list_entry(states.next, struct extent_state, leak_list);
-		pr_err("BTRFS: state leak: start %llu end %llu state %lu in tree %d refs %d\n",
+		pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n",
 		       state->start, state->end, state->state,
 		       extent_state_in_tree(state),
 		       atomic_read(&state->refs));
@@ -396,21 +396,21 @@ static void merge_state(struct extent_io_tree *tree,
 }
 
 static void set_state_cb(struct extent_io_tree *tree,
-			 struct extent_state *state, unsigned long *bits)
+			 struct extent_state *state, unsigned *bits)
 {
 	if (tree->ops && tree->ops->set_bit_hook)
 		tree->ops->set_bit_hook(tree->mapping->host, state, bits);
 }
 
 static void clear_state_cb(struct extent_io_tree *tree,
-			   struct extent_state *state, unsigned long *bits)
+			   struct extent_state *state, unsigned *bits)
 {
 	if (tree->ops && tree->ops->clear_bit_hook)
 		tree->ops->clear_bit_hook(tree->mapping->host, state, bits);
 }
 
 static void set_state_bits(struct extent_io_tree *tree,
-			   struct extent_state *state, unsigned long *bits);
+			   struct extent_state *state, unsigned *bits);
 
 /*
  * insert an extent_state struct into the tree.  'bits' are set on the
@@ -426,7 +426,7 @@ static int insert_state(struct extent_io_tree *tree,
 			struct extent_state *state, u64 start, u64 end,
 			struct rb_node ***p,
 			struct rb_node **parent,
-			unsigned long *bits)
+			unsigned *bits)
 {
 	struct rb_node *node;
 
@@ -511,10 +511,10 @@ static struct extent_state *next_state(struct extent_state *state)
  */
 static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
 					    struct extent_state *state,
-					    unsigned long *bits, int wake)
+					    unsigned *bits, int wake)
 {
 	struct extent_state *next;
-	unsigned long bits_to_clear = *bits & ~EXTENT_CTLBITS;
+	unsigned bits_to_clear = *bits & ~EXTENT_CTLBITS;
 
 	if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
 		u64 range = state->end - state->start + 1;
@@ -570,7 +570,7 @@ static void extent_io_tree_panic(struct extent_io_tree *tree, int err)
 * This takes the tree lock, and returns 0 on success and < 0 on error.
*/ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - unsigned long bits, int wake, int delete, + unsigned bits, int wake, int delete, struct extent_state **cached_state, gfp_t mask) { @@ -789,9 +789,9 @@ out: static void set_state_bits(struct extent_io_tree *tree, struct extent_state *state, - unsigned long *bits) + unsigned *bits) { - unsigned long bits_to_set = *bits & ~EXTENT_CTLBITS; + unsigned bits_to_set = *bits & ~EXTENT_CTLBITS; set_state_cb(tree, state, bits); if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) { @@ -803,7 +803,7 @@ static void set_state_bits(struct extent_io_tree *tree, static void cache_state_if_flags(struct extent_state *state, struct extent_state **cached_ptr, - const u64 flags) + unsigned flags) { if (cached_ptr && !(*cached_ptr)) { if (!flags || (state->state & flags)) { @@ -833,7 +833,7 @@ static void cache_state(struct extent_state *state, static int __must_check __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - unsigned long bits, unsigned long exclusive_bits, + unsigned bits, unsigned exclusive_bits, u64 *failed_start, struct extent_state **cached_state, gfp_t mask) { @@ -1034,7 +1034,7 @@ search_again: } int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - unsigned long bits, u64 * failed_start, + unsigned bits, u64 * failed_start, struct extent_state **cached_state, gfp_t mask) { return __set_extent_bit(tree, start, end, bits, 0, failed_start, @@ -1060,7 +1060,7 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, * boundary bits like LOCK. */ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - unsigned long bits, unsigned long clear_bits, + unsigned bits, unsigned clear_bits, struct extent_state **cached_state, gfp_t mask) { struct extent_state *state; @@ -1268,14 +1268,14 @@ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, } int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - unsigned long bits, gfp_t mask) + unsigned bits, gfp_t mask) { return set_extent_bit(tree, start, end, bits, NULL, NULL, mask); } int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - unsigned long bits, gfp_t mask) + unsigned bits, gfp_t mask) { return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask); } @@ -1330,10 +1330,11 @@ int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, * us if waiting is desired. */ int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - unsigned long bits, struct extent_state **cached_state) + unsigned bits, struct extent_state **cached_state) { int err; u64 failed_start; + while (1) { err = __set_extent_bit(tree, start, end, EXTENT_LOCKED | bits, EXTENT_LOCKED, &failed_start, @@ -1440,7 +1441,7 @@ static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end) */ static struct extent_state * find_first_extent_bit_state(struct extent_io_tree *tree, - u64 start, unsigned long bits) + u64 start, unsigned bits) { struct rb_node *node; struct extent_state *state; @@ -1474,7 +1475,7 @@ out: * If nothing was found, 1 is returned. If found something, return 0. 
*/ int find_first_extent_bit(struct extent_io_tree *tree, u64 start, - u64 *start_ret, u64 *end_ret, unsigned long bits, + u64 *start_ret, u64 *end_ret, unsigned bits, struct extent_state **cached_state) { struct extent_state *state; @@ -1753,7 +1754,7 @@ out_failed: int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end, struct page *locked_page, - unsigned long clear_bits, + unsigned clear_bits, unsigned long page_ops) { struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; @@ -1810,7 +1811,7 @@ int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end, */ u64 count_range_bits(struct extent_io_tree *tree, u64 *start, u64 search_end, u64 max_bytes, - unsigned long bits, int contig) + unsigned bits, int contig) { struct rb_node *node; struct extent_state *state; @@ -1928,7 +1929,7 @@ out: * range is found set. */ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, - unsigned long bits, int filled, struct extent_state *cached) + unsigned bits, int filled, struct extent_state *cached) { struct extent_state *state = NULL; struct rb_node *node; diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 71268e5..695b0cc 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -4,22 +4,22 @@ #include /* bits for the extent state */ -#define EXTENT_DIRTY 1 -#define EXTENT_WRITEBACK (1 << 1) -#define EXTENT_UPTODATE (1 << 2) -#define EXTENT_LOCKED (1 << 3) -#define EXTENT_NEW (1 << 4) -#define EXTENT_DELALLOC (1 << 5) -#define EXTENT_DEFRAG (1 << 6) -#define EXTENT_BOUNDARY (1 << 9) -#define EXTENT_NODATASUM (1 << 10) -#define EXTENT_DO_ACCOUNTING (1 << 11) -#define EXTENT_FIRST_DELALLOC (1 << 12) -#define EXTENT_NEED_WAIT (1 << 13) -#define EXTENT_DAMAGED (1 << 14) -#define EXTENT_NORESERVE (1 << 15) -#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) -#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC) +#define EXTENT_DIRTY (1U << 0) +#define EXTENT_WRITEBACK (1U << 1) +#define EXTENT_UPTODATE (1U << 2) +#define EXTENT_LOCKED (1U << 3) +#define EXTENT_NEW (1U << 4) +#define EXTENT_DELALLOC (1U << 5) +#define EXTENT_DEFRAG (1U << 6) +#define EXTENT_BOUNDARY (1U << 9) +#define EXTENT_NODATASUM (1U << 10) +#define EXTENT_DO_ACCOUNTING (1U << 11) +#define EXTENT_FIRST_DELALLOC (1U << 12) +#define EXTENT_NEED_WAIT (1U << 13) +#define EXTENT_DAMAGED (1U << 14) +#define EXTENT_NORESERVE (1U << 15) +#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) +#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC) /* * flags for bio submission. 
The high bits indicate the compression @@ -81,9 +81,9 @@ struct extent_io_ops { int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end, struct extent_state *state, int uptodate); void (*set_bit_hook)(struct inode *inode, struct extent_state *state, - unsigned long *bits); + unsigned *bits); void (*clear_bit_hook)(struct inode *inode, struct extent_state *state, - unsigned long *bits); + unsigned *bits); void (*merge_extent_hook)(struct inode *inode, struct extent_state *new, struct extent_state *other); @@ -108,7 +108,7 @@ struct extent_state { /* ADD NEW ELEMENTS AFTER THIS */ wait_queue_head_t wq; atomic_t refs; - unsigned long state; + unsigned state; /* for use by the FS */ u64 private; @@ -188,7 +188,7 @@ int try_release_extent_mapping(struct extent_map_tree *map, int try_release_extent_buffer(struct page *page); int lock_extent(struct extent_io_tree *tree, u64 start, u64 end); int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - unsigned long bits, struct extent_state **cached); + unsigned bits, struct extent_state **cached); int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end); int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached, gfp_t mask); @@ -202,21 +202,21 @@ void extent_io_exit(void); u64 count_range_bits(struct extent_io_tree *tree, u64 *start, u64 search_end, - u64 max_bytes, unsigned long bits, int contig); + u64 max_bytes, unsigned bits, int contig); void free_extent_state(struct extent_state *state); int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, - unsigned long bits, int filled, + unsigned bits, int filled, struct extent_state *cached_state); int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - unsigned long bits, gfp_t mask); + unsigned bits, gfp_t mask); int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - unsigned long bits, int wake, int delete, + unsigned bits, int wake, int delete, struct extent_state **cached, gfp_t mask); int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - unsigned long bits, gfp_t mask); + unsigned bits, gfp_t mask); int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - unsigned long bits, u64 *failed_start, + unsigned bits, u64 *failed_start, struct extent_state **cached_state, gfp_t mask); int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached_state, gfp_t mask); @@ -229,14 +229,14 @@ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - unsigned long bits, unsigned long clear_bits, + unsigned bits, unsigned clear_bits, struct extent_state **cached_state, gfp_t mask); int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached_state, gfp_t mask); int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached_state, gfp_t mask); int find_first_extent_bit(struct extent_io_tree *tree, u64 start, - u64 *start_ret, u64 *end_ret, unsigned long bits, + u64 *start_ret, u64 *end_ret, unsigned bits, struct extent_state **cached_state); int extent_invalidatepage(struct extent_io_tree *tree, struct page *page, unsigned long offset); @@ -323,7 +323,7 @@ int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end); int extent_range_redirty_for_io(struct inode 
*inode, u64 start, u64 end);
 int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
 				 struct page *locked_page,
-				 unsigned long bits_to_clear,
+				 unsigned bits_to_clear,
 				 unsigned long page_ops);
 struct bio *
 btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 5e52920..220f0c3 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1604,7 +1604,7 @@ static void btrfs_del_delalloc_inode(struct btrfs_root *root,
  * have pending delalloc work to be done.
  */
 static void btrfs_set_bit_hook(struct inode *inode,
-			       struct extent_state *state, unsigned long *bits)
+			       struct extent_state *state, unsigned *bits)
 {
 
 	if ((*bits & EXTENT_DEFRAG) && !(*bits & EXTENT_DELALLOC))
@@ -1645,7 +1645,7 @@ static void btrfs_set_bit_hook(struct inode *inode,
  */
 static void btrfs_clear_bit_hook(struct inode *inode,
 				 struct extent_state *state,
-				 unsigned long *bits)
+				 unsigned *bits)
 {
 	u64 len = state->end + 1 - state->start;
 
diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c
index 7e99c2f..9e9f236 100644
--- a/fs/btrfs/tests/extent-io-tests.c
+++ b/fs/btrfs/tests/extent-io-tests.c
@@ -258,8 +258,7 @@ static int test_find_delalloc(void)
 	}
 	ret = 0;
 out_bits:
-	clear_extent_bits(&tmp, 0, total_dirty - 1,
-			  (unsigned long)-1, GFP_NOFS);
+	clear_extent_bits(&tmp, 0, total_dirty - 1, (unsigned)-1, GFP_NOFS);
 out:
 	if (locked_page)
 		page_cache_release(locked_page);

-- cgit v0.10.2

From 730a78c741df4eaa24589015191f03c2d1240091 Mon Sep 17 00:00:00 2001
From: David Sterba
Date: Tue, 20 Jan 2015 19:05:37 +0100
Subject: btrfs: remove a no-op unfreeze superblock callback

Signed-off-by: David Sterba
Signed-off-by: Chris Mason

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 60f7cbe..100a044 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1948,11 +1948,6 @@ static int btrfs_freeze(struct super_block *sb)
 	return btrfs_commit_transaction(trans, root);
 }
 
-static int btrfs_unfreeze(struct super_block *sb)
-{
-	return 0;
-}
-
 static int btrfs_show_devname(struct seq_file *m, struct dentry *root)
 {
 	struct btrfs_fs_info *fs_info = btrfs_sb(root->d_sb);
@@ -2001,7 +1996,6 @@ static const struct super_operations btrfs_super_ops = {
 	.statfs		= btrfs_statfs,
 	.remount_fs	= btrfs_remount,
 	.freeze_fs	= btrfs_freeze,
-	.unfreeze_fs	= btrfs_unfreeze,
 };
 
 static const struct file_operations btrfs_ctl_fops = {

-- cgit v0.10.2

From 6219872dc6e56529159f04e73587ed0fcd63eb20 Mon Sep 17 00:00:00 2001
From: Filipe Manana
Date: Tue, 6 Jan 2015 20:18:45 +0000
Subject: Btrfs: lookup for block group only if needed when freeing a tree block

Very often our extent buffer's header generation doesn't match the
current transaction's id, or the buffer is also referenced by other
trees (snapshots), so we don't need the corresponding block group cache
object. Therefore only search for it if we are going to use it, and so
avoid an unnecessary search in the block groups rbtree (as well as
acquiring and releasing its spinlock).

Freeing a tree block is performed when COWing or deleting a node/leaf,
which implies we are holding the node/leaf's parent node lock, therefore
reducing the amount of time spent when freeing a tree block helps reduce
the amount of time we are holding the parent node's lock.

For example, for a run of xfstests/generic/083, the block group cache
object was needed only 682 times for a total of 226691 calls to free a
tree block.
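As a rough illustration of the payoff (a self-contained toy model, not
the kernel code; free_tree_block() and expensive_lookup() here are
made-up stand-ins for btrfs_free_tree_block() and
btrfs_lookup_block_group()), deferring the lookup until the cheap
generation check passes means the rbtree search only happens on the rare
path, using the generic/083 numbers quoted above:

	#include <stdio.h>

	static int lookups;

	/* stand-in for btrfs_lookup_block_group(): rbtree search under a spinlock */
	static int expensive_lookup(void) { return ++lookups; }

	/* toy free_tree_block(): pay for the lookup only when the result is used */
	static void free_tree_block(int gen, int transid)
	{
		if (gen != transid)
			return;			/* common case: block group never needed */
		(void)expensive_lookup();	/* rare case: search only now */
	}

	int main(void)
	{
		/* mimic the generic/083 numbers from the commit message */
		for (int i = 0; i < 226691; i++)
			free_tree_block(i < 682, 1);
		printf("lookups: %d of 226691 calls\n", lookups);
		return 0;
	}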
Signed-off-by: Filipe Manana Signed-off-by: Chris Mason diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 1c591d6..3af53f8 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -6136,7 +6136,6 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, struct extent_buffer *buf, u64 parent, int last_ref) { - struct btrfs_block_group_cache *cache = NULL; int pin = 1; int ret; @@ -6152,17 +6151,20 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, if (!last_ref) return; - cache = btrfs_lookup_block_group(root->fs_info, buf->start); - if (btrfs_header_generation(buf) == trans->transid) { + struct btrfs_block_group_cache *cache; + if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { ret = check_ref_cleanup(trans, root, buf->start); if (!ret) goto out; } + cache = btrfs_lookup_block_group(root->fs_info, buf->start); + if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { pin_down_extent(root, cache, buf->start, buf->len, 1); + btrfs_put_block_group(cache); goto out; } @@ -6170,6 +6172,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, btrfs_add_free_space(cache, buf->start, buf->len); btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE, 0); + btrfs_put_block_group(cache); trace_btrfs_reserved_extent_free(root, buf->start, buf->len); pin = 0; } @@ -6184,7 +6187,6 @@ out: * anymore. */ clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags); - btrfs_put_block_group(cache); } /* Can return -ENOMEM */ -- cgit v0.10.2 From d36808e0d4fc4802892dbd1dba964912cddf1323 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Sat, 10 Jan 2015 10:56:48 +0000 Subject: Btrfs: fix directory inconsistency after fsync log replay If we have an inode (file) with a link count greater than 1, remove one of its hard links, fsync the inode, power fail/crash and then replay the fsync log on the next mount, we end up getting the parent directory's metadata inconsistent - its i_size still reflects the deleted hard link and has dangling index entries (with no matching inode reference entries). This prevents the directory from ever being deletable, as its i_size can never decrease to BTRFS_EMPTY_DIR_SIZE even if all of its children inodes are deleted, and the dangling index entries can never be removed (as they point to an inode that does not exist anymore). This is easy to reproduce with the following excerpt from the test case for xfstests that I just made: _scratch_mkfs >> $seqres.full 2>&1 _init_flakey _mount_flakey # Create a test file with 2 hard links in the same directory. mkdir -p $SCRATCH_MNT/a/b echo "hello world" > $SCRATCH_MNT/a/b/foo ln $SCRATCH_MNT/a/b/foo $SCRATCH_MNT/a/b/bar # Make sure all metadata and data are durably persisted. sync # Now remove one of the hard links and fsync the inode. rm -f $SCRATCH_MNT/a/b/bar $XFS_IO_PROG -c "fsync" $SCRATCH_MNT/a/b/foo # Simulate a crash/power loss. This makes sure the next mount # will see an fsync log and will replay that log. _load_flakey_table $FLAKEY_DROP_WRITES _unmount_flakey _load_flakey_table $FLAKEY_ALLOW_WRITES _mount_flakey # Remove the last hard link of the file and attempt to remove its parent # directory - this failed in btrfs because the fsync log and replay code # didn't decrement the parent directory's i_size and left dangling directory # index entries - this made the btrfs rmdir implementation always fail with # the error -ENOTEMPTY. 
#
# The dangling directory index entries were visible to user space, but it was
# impossible to do anything on them (unlink, open, read, write, stat, etc)
# because the inode they pointed to did not exist anymore.
#
# The parent directory's metadata inconsistency (stale index entries) was
# also detected by btrfs' fsck tool, which is run automatically by the fstests
# framework when the test finishes. The error message reported by fsck was:
#
# root 5 inode 259 errors 2001, no inode item, link count wrong
#   unresolved ref dir 258 index 3 namelen 3 name bar filetype 1 errors 4, no inode ref
#
rm -f $SCRATCH_MNT/a/b/*
rmdir $SCRATCH_MNT/a/b
rmdir $SCRATCH_MNT/a

To fix this just make sure that after an unlink, if the inode is fsync'ed,
the parent inode is fully logged in the fsync log.

A test case for xfstests follows soon.

Signed-off-by: Filipe Manana
Signed-off-by: Chris Mason

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 67e5bf7..60e1d00 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -4273,6 +4273,9 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
 	struct dentry *old_parent = NULL;
 	int ret = 0;
 	u64 last_committed = root->fs_info->last_trans_committed;
+	const struct dentry * const first_parent = parent;
+	const bool did_unlink = (BTRFS_I(inode)->last_unlink_trans >
+				 last_committed);
 
 	sb = inode->i_sb;
 
@@ -4328,7 +4331,6 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
 		goto end_trans;
 	}
 
-	inode_only = LOG_INODE_EXISTS;
 	while (1) {
 		if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
 			break;
@@ -4337,8 +4339,22 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
 		if (root != BTRFS_I(inode)->root)
 			break;
 
+		/*
+		 * On unlink we must make sure our immediate parent directory
+		 * inode is fully logged. This is to prevent leaving dangling
+		 * directory index entries and a wrong directory inode's i_size.
+		 * Not doing so can result in a directory being impossible to
+		 * delete after log replay (rmdir will always fail with error
+		 * -ENOTEMPTY).
+		 */
+		if (did_unlink && parent == first_parent)
+			inode_only = LOG_INODE_ALL;
+		else
+			inode_only = LOG_INODE_EXISTS;
+
 		if (BTRFS_I(inode)->generation >
-		    root->fs_info->last_trans_committed) {
+		    root->fs_info->last_trans_committed ||
+		    inode_only == LOG_INODE_ALL) {
 			ret = btrfs_log_inode(trans, root, inode, inode_only,
 					      0, LLONG_MAX, ctx);
 			if (ret)

-- cgit v0.10.2

From 2c2c452b0cafdc27442796c526ac929443654b0b Mon Sep 17 00:00:00 2001
From: Filipe Manana
Date: Tue, 13 Jan 2015 16:40:04 +0000
Subject: Btrfs: fix fsync when extended references are added to an inode

If we added an extended reference to an inode and fsync'ed it, the log
replay code would make our inode have an incorrect link count, which
was lower than the expected/correct count. This resulted in stale
directory index entries after deleting some of the hard links, and any
access to the dangling directory entries resulted in -ESTALE errors
because the entries pointed to inode items that don't exist anymore.

This is easy to reproduce with the test case I made for xfstests, and
the bulk of that test is:

_scratch_mkfs "-O extref" >> $seqres.full 2>&1
_init_flakey
_mount_flakey

# Create a test file with 3001 hard links. This number is large enough to
# make btrfs start using extrefs at some point even if the fs has the maximum
# possible leaf/node size (64Kb).
echo "hello world" > $SCRATCH_MNT/foo for i in `seq 1 3000`; do ln $SCRATCH_MNT/foo $SCRATCH_MNT/foo_link_`printf "%04d" $i` done # Make sure all metadata and data are durably persisted. sync # Add one more link to the inode that ends up being a btrfs extref and fsync # the inode. ln $SCRATCH_MNT/foo $SCRATCH_MNT/foo_link_3001 $XFS_IO_PROG -c "fsync" $SCRATCH_MNT/foo # Simulate a crash/power loss. This makes sure the next mount # will see an fsync log and will replay that log. _load_flakey_table $FLAKEY_DROP_WRITES _unmount_flakey _load_flakey_table $FLAKEY_ALLOW_WRITES _mount_flakey # Now after the fsync log replay btrfs left our inode with a wrong link count N, # which was smaller than the correct link count M (N < M). # So after removing N hard links, the remaining M - N directory entries were # still visible to user space but it was impossible to do anything with them # because they pointed to an inode that didn't exist anymore. This resulted in # stale file handle errors (-ESTALE) when accessing those dentries for example. # # So remove all hard links except the first one and then attempt to read the # file, to verify we don't get an -ESTALE error when accessing the inodel # # The btrfs fsck tool also detected the incorrect inode link count and it # reported an error message like the following: # # root 5 inode 257 errors 2001, no inode item, link count wrong # unresolved ref dir 256 index 2978 namelen 13 name foo_link_2976 filetype 1 errors 4, no inode ref # # The fstests framework automatically calls fsck after a test is run, so we # don't need to call fsck explicitly here. rm -f $SCRATCH_MNT/foo_link_* cat $SCRATCH_MNT/foo status=0 exit So make sure an fsync always flushes the delayed inode item, so that the fsync log contains it (needed in order to trigger the link count fixup code) and fix the extref counting function, which always return -ENOENT to its caller (and made it assume there were always 0 extrefs). This issue has been present since the introduction of the extrefs feature (2012). A test case for xfstests follows soon. Signed-off-by: Filipe Manana Signed-off-by: Chris Mason diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 60e1d00..533cdb0 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -1288,6 +1288,7 @@ static int count_inode_extrefs(struct btrfs_root *root, leaf = path->nodes[0]; item_size = btrfs_item_size_nr(leaf, path->slots[0]); ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); + cur_offset = 0; while (cur_offset < item_size) { extref = (struct btrfs_inode_extref *) (ptr + cur_offset); @@ -1303,7 +1304,7 @@ static int count_inode_extrefs(struct btrfs_root *root, } btrfs_release_path(path); - if (ret < 0) + if (ret < 0 && ret != -ENOENT) return ret; return nlink; } @@ -1395,9 +1396,6 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, nlink = ret; ret = count_inode_extrefs(root, inode, path); - if (ret == -ENOENT) - ret = 0; - if (ret < 0) goto out; @@ -3966,15 +3964,22 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, max_key.type = (u8)-1; max_key.offset = (u64)-1; - /* Only run delayed items if we are a dir or a new file */ + /* + * Only run delayed items if we are a dir or a new file. + * Otherwise commit the delayed inode only, which is needed in + * order for the log replay code to mark inodes for link count + * fixup (create temporary BTRFS_TREE_LOG_FIXUP_OBJECTID items). 
+	 */
 	if (S_ISDIR(inode->i_mode) ||
-	    BTRFS_I(inode)->generation > root->fs_info->last_trans_committed) {
+	    BTRFS_I(inode)->generation > root->fs_info->last_trans_committed)
 		ret = btrfs_commit_inode_delayed_items(trans, inode);
-		if (ret) {
-			btrfs_free_path(path);
-			btrfs_free_path(dst_path);
-			return ret;
-		}
+	else
+		ret = btrfs_commit_inode_delayed_inode(inode);
+
+	if (ret) {
+		btrfs_free_path(path);
+		btrfs_free_path(dst_path);
+		return ret;
 	}
 
 	mutex_lock(&BTRFS_I(inode)->log_mutex);

-- cgit v0.10.2

From df8d116ffa379f3ef09d7ff28da0f0c921cc9fa1 Mon Sep 17 00:00:00 2001
From: Filipe Manana
Date: Wed, 14 Jan 2015 01:52:25 +0000
Subject: Btrfs: fix fsync log replay for inodes with a mix of regular refs and extrefs

If we have an inode with a large number of hard links, some of which may
be extrefs, turn a regular ref into an extref, fsync the inode and then
replay the fsync log (after a crash/reboot), we can end up with an fsync
log that makes the replay code always fail with -EOVERFLOW when
processing the inode's references.

This is easy to reproduce with the test case I made for xfstests. Its
steps are the following:

_scratch_mkfs "-O extref" >> $seqres.full 2>&1
_init_flakey
_mount_flakey

# Create a test file with 3001 hard links. This number is large enough to
# make btrfs start using extrefs at some point even if the fs has the maximum
# possible leaf/node size (64Kb).
echo "hello world" > $SCRATCH_MNT/foo
for i in `seq 1 3000`; do
	ln $SCRATCH_MNT/foo $SCRATCH_MNT/foo_link_`printf "%04d" $i`
done

# Make sure all metadata and data are durably persisted.
sync

# Now remove one link, add a new one with a new name, add another new one with
# the same name as the one we just removed and fsync the inode.
rm -f $SCRATCH_MNT/foo_link_0001
ln $SCRATCH_MNT/foo $SCRATCH_MNT/foo_link_3001
ln $SCRATCH_MNT/foo $SCRATCH_MNT/foo_link_0001
rm -f $SCRATCH_MNT/foo_link_0002
ln $SCRATCH_MNT/foo $SCRATCH_MNT/foo_link_3002
ln $SCRATCH_MNT/foo $SCRATCH_MNT/foo_link_3003
$XFS_IO_PROG -c "fsync" $SCRATCH_MNT/foo

# Simulate a crash/power loss. This makes sure the next mount
# will see an fsync log and will replay that log.
_load_flakey_table $FLAKEY_DROP_WRITES
_unmount_flakey
_load_flakey_table $FLAKEY_ALLOW_WRITES
_mount_flakey

# Check that the number of hard links is correct, we are able to remove all
# the hard links and read the file's data. This is just to verify we don't
# get stale file handle errors (due to dangling directory index entries that
# point to inodes that no longer exist).
echo "Link count: $(stat --format=%h $SCRATCH_MNT/foo)"
[ -f $SCRATCH_MNT/foo ] || echo "Link foo is missing"
for ((i = 1; i <= 3003; i++)); do
	name=foo_link_`printf "%04d" $i`
	if [ $i -eq 2 ]; then
		[ -f $SCRATCH_MNT/$name ] && echo "Link $name found"
	else
		[ -f $SCRATCH_MNT/$name ] || echo "Link $name is missing"
	fi
done
rm -f $SCRATCH_MNT/foo_link_*
cat $SCRATCH_MNT/foo
rm -f $SCRATCH_MNT/foo

status=0
exit

The fix is simply to correct the overflow condition used when
overwriting a reference item, since it was wrong: it attempted to
increase the item in the fs/subvol tree by an impossible amount. Also
ensure that we don't insert one normal ref and one ext ref for the same
dentry - this happened because processing a dir index entry from the
parent in the log happened when the normal ref item was full, which made
the logic insert an extref, and later, when the normal ref had enough
room, it would be inserted again when processing the ref item from the
child inode in the log.
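For background on why replay has to look in two places (a self-contained
sketch, not from the patch; extref_hash() below is a made-up stand-in
for btrfs_extref_hash(), which is crc32c-based): a plain inode ref and
an extended ref for the same (directory, name) pair live under different
key types, so before inserting a name the log replay code must check
both locations, which is what the name_in_log_ref() helper added below
does:

	#include <stdio.h>

	/* key type values as defined by the btrfs on-disk format */
	#define BTRFS_INODE_REF_KEY	12
	#define BTRFS_INODE_EXTREF_KEY	13

	struct key { unsigned long long objectid; unsigned type; unsigned long long offset; };

	/* stand-in for btrfs_extref_hash() (really crc32c over the name) */
	static unsigned long long extref_hash(unsigned long long dir, const char *name)
	{
		unsigned long long h = dir;
		while (*name)
			h = h * 31 + (unsigned char)*name++;
		return h;
	}

	int main(void)
	{
		unsigned long long ino = 257, dir = 256;
		/* same logical link, two possible on-disk homes */
		struct key ref    = { ino, BTRFS_INODE_REF_KEY, dir };
		struct key extref = { ino, BTRFS_INODE_EXTREF_KEY,
				      extref_hash(dir, "foo_link_3001") };
		printf("ref    (%llu %u %llu)\n", ref.objectid, ref.type, ref.offset);
		printf("extref (%llu %u %llu)\n", extref.objectid, extref.type, extref.offset);
		return 0;
	}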
This issue has been present since the introduction of the extrefs
feature (2012).

A test case for xfstests follows soon. This test only passes if the
previous patch titled "Btrfs: fix fsync when extended references are
added to an inode" is applied too.

Signed-off-by: Filipe Manana
Signed-off-by: Chris Mason

diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 8ffa478..265e03c 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -344,6 +344,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
 		return -ENOMEM;
 
 	path->leave_spinning = 1;
+	path->skip_release_on_error = 1;
 	ret = btrfs_insert_empty_item(trans, root, path, &key,
 				      ins_len);
 	if (ret == -EEXIST) {
@@ -362,8 +363,12 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
 		ptr = (unsigned long)(ref + 1);
 		ret = 0;
 	} else if (ret < 0) {
-		if (ret == -EOVERFLOW)
-			ret = -EMLINK;
+		if (ret == -EOVERFLOW) {
+			if (find_name_in_backref(path, name, name_len, &ref))
+				ret = -EEXIST;
+			else
+				ret = -EMLINK;
+		}
 		goto out;
 	} else {
 		ref = btrfs_item_ptr(path->nodes[0], path->slots[0],

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 533cdb0..a266587 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -453,11 +453,13 @@ static noinline int overwrite_item(struct btrfs_trans_handle *trans,
 insert:
 	btrfs_release_path(path);
 	/* try to insert the key into the destination tree */
+	path->skip_release_on_error = 1;
 	ret = btrfs_insert_empty_item(trans, root, path,
 				      key, item_size);
+	path->skip_release_on_error = 0;
 
 	/* make sure any existing item is the correct size */
-	if (ret == -EEXIST) {
+	if (ret == -EEXIST || ret == -EOVERFLOW) {
 		u32 found_size;
 		found_size = btrfs_item_size_nr(path->nodes[0],
 						path->slots[0]);
@@ -844,7 +846,7 @@ out:
 static noinline int backref_in_log(struct btrfs_root *log,
 				   struct btrfs_key *key,
 				   u64 ref_objectid,
-				   char *name, int namelen)
+				   const char *name, int namelen)
 {
 	struct btrfs_path *path;
 	struct btrfs_inode_ref *ref;
@@ -1556,6 +1558,30 @@ static noinline int insert_one_name(struct btrfs_trans_handle *trans,
 }
 
 /*
+ * Return true if an inode reference exists in the log for the given name,
+ * inode and parent inode.
+ */
+static bool name_in_log_ref(struct btrfs_root *log_root,
+			    const char *name, const int name_len,
+			    const u64 dirid, const u64 ino)
+{
+	struct btrfs_key search_key;
+
+	search_key.objectid = ino;
+	search_key.type = BTRFS_INODE_REF_KEY;
+	search_key.offset = dirid;
+	if (backref_in_log(log_root, &search_key, dirid, name, name_len))
+		return true;
+
+	search_key.type = BTRFS_INODE_EXTREF_KEY;
+	search_key.offset = btrfs_extref_hash(dirid, name, name_len);
+	if (backref_in_log(log_root, &search_key, dirid, name, name_len))
+		return true;
+
+	return false;
+}
+
+/*
  * take a single entry in a log directory item and replay it into
  * the subvolume.
  *
@@ -1665,10 +1691,17 @@ out:
 	return ret;
 
 insert:
+	if (name_in_log_ref(root->log_root, name, name_len,
+			    key->objectid, log_key.objectid)) {
+		/* The dentry will be added later.
+		 */
+		ret = 0;
+		update_size = false;
+		goto out;
+	}
 	btrfs_release_path(path);
 	ret = insert_one_name(trans, root, path, key->objectid, key->offset,
 			      name, name_len, log_type, &log_key);
-	if (ret && ret != -ENOENT)
+	if (ret && ret != -ENOENT && ret != -EEXIST)
 		goto out;
 	update_size = false;
 	ret = 0;

-- cgit v0.10.2

From e34c330d639177bbb345bf2bde16613b00cc6e6b Mon Sep 17 00:00:00 2001
From: Zhao Lei
Date: Tue, 20 Jan 2015 15:11:31 +0800
Subject: Btrfs: fix an out-of-bounds access of raid_map

We add the number of stripes on target devices into bbio->num_stripes
if we are under device replacement, and we only sort the raid_map of
those stripes that are not on the target devices, so when we need the
real raid_map we have to skip the stripes on the target devices.

Signed-off-by: Zhao Lei
Signed-off-by: Miao Xie
Signed-off-by: Chris Mason

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 53575a4..673e32b 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -1299,7 +1299,9 @@ out:
 static inline int scrub_nr_raid_mirrors(struct btrfs_bio *bbio, u64 *raid_map)
 {
 	if (raid_map) {
-		if (raid_map[bbio->num_stripes - 1] == RAID6_Q_STRIPE)
+		int real_stripes = bbio->num_stripes - bbio->num_tgtdevs;
+
+		if (raid_map[real_stripes - 1] == RAID6_Q_STRIPE)
 			return 3;
 		else
 			return 2;
@@ -1420,7 +1422,8 @@ leave_nomem:
 			scrub_stripe_index_and_offset(logical, raid_map,
 						      mapped_length,
-						      bbio->num_stripes,
+						      bbio->num_stripes -
+						      bbio->num_tgtdevs,
 						      mirror_index,
 						      &stripe_index,
 						      &stripe_offset);

-- cgit v0.10.2

From cc7539edea6dd02536d56f0a3405b8bb7ae24168 Mon Sep 17 00:00:00 2001
From: Zhao Lei
Date: Tue, 20 Jan 2015 15:11:32 +0800
Subject: Btrfs: sort raid_map before adding tgtdev stripes

This avoids the complex calculation of the real stripe count in the
sort; moreover, we can remove the code that sorts tgtdev_map, because
it will be in order from the start.
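As a rough illustration of the ordering argument (a self-contained toy,
not the kernel code, though the parity sentinels mirror btrfs, where
RAID5_P_STRIPE and RAID6_Q_STRIPE are (u64)-2 and (u64)-1 and therefore
sort last): if the bubble sort runs while the stripe array holds only
the source stripes, target-device entries appended afterwards never
move, so no tgtdev_map fixup is needed:

	#include <stdio.h>

	#define RAID5_P_STRIPE ((unsigned long long)-2)
	#define RAID6_Q_STRIPE ((unsigned long long)-1)

	/* toy bubble sort over the first n entries only, mirroring
	 * sort_parity_stripes(): parity sentinels compare largest, so
	 * they end up last among the source stripes */
	static void sort_parity(unsigned long long *raid_map, int *stripes, int n)
	{
		for (int again = 1; again; ) {
			again = 0;
			for (int i = 0; i < n - 1; i++) {
				if (raid_map[i] > raid_map[i + 1]) {
					unsigned long long l = raid_map[i];
					int s = stripes[i];
					raid_map[i] = raid_map[i + 1];
					stripes[i] = stripes[i + 1];
					raid_map[i + 1] = l;
					stripes[i + 1] = s;
					again = 1;
				}
			}
		}
	}

	int main(void)
	{
		/* 3 source stripes (rotated data + parity), 1 tgtdev stripe */
		unsigned long long raid_map[4] = { RAID5_P_STRIPE, 0, 65536 };
		int stripes[4] = { 10, 11, 12 };

		sort_parity(raid_map, stripes, 3);	/* sort sources first ... */
		raid_map[3] = 0;			/* ... then append the    */
		stripes[3] = 99;			/* tgtdev entry: it never moves */

		for (int i = 0; i < 4; i++)
			printf("stripe %d raid_map %llu\n", stripes[i], raid_map[i]);
		return 0;
	}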
Signed-off-by: Zhao Lei
Signed-off-by: Miao Xie
Signed-off-by: Chris Mason

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 0d9bfeb..711ce38 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -4876,18 +4876,17 @@ static inline int parity_smaller(u64 a, u64 b)
 }
 
 /* Bubble-sort the stripe set to put the parity/syndrome stripes last */
-static void sort_parity_stripes(struct btrfs_bio *bbio, u64 *raid_map)
+static void sort_parity_stripes(struct btrfs_bio *bbio, u64 *raid_map,
+				int num_stripes)
 {
 	struct btrfs_bio_stripe s;
-	int real_stripes = bbio->num_stripes - bbio->num_tgtdevs;
 	int i;
 	u64 l;
 	int again = 1;
-	int m;
 
 	while (again) {
 		again = 0;
-		for (i = 0; i < real_stripes - 1; i++) {
+		for (i = 0; i < num_stripes - 1; i++) {
 			if (parity_smaller(raid_map[i], raid_map[i+1])) {
 				s = bbio->stripes[i];
 				l = raid_map[i];
@@ -4896,13 +4895,6 @@ static void sort_parity_stripes(struct btrfs_bio *bbio, u64 *raid_map)
 				bbio->stripes[i+1] = s;
 				raid_map[i+1] = l;
 
-				if (bbio->tgtdev_map) {
-					m = bbio->tgtdev_map[i];
-					bbio->tgtdev_map[i] =
-							bbio->tgtdev_map[i + 1];
-					bbio->tgtdev_map[i + 1] = m;
-				}
-
 				again = 1;
 			}
 		}
@@ -5340,6 +5332,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 	if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS))
 		max_errors = btrfs_chunk_max_errors(map);
 
+	if (raid_map)
+		sort_parity_stripes(bbio, raid_map, num_stripes);
+
 	tgtdev_indexes = 0;
 	if (dev_replace_is_ongoing && (rw & (REQ_WRITE | REQ_DISCARD)) &&
 	    dev_replace->tgtdev != NULL) {
@@ -5443,10 +5438,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 		bbio->stripes[0].physical = physical_to_patch_in_first_stripe;
 		bbio->mirror_num = map->num_stripes + 1;
 	}
-	if (raid_map) {
-		sort_parity_stripes(bbio, raid_map);
+
+	if (raid_map_ret)
 		*raid_map_ret = raid_map;
-	}
 out:
 	if (dev_replace_is_ongoing)
 		btrfs_dev_replace_unlock(dev_replace);

-- cgit v0.10.2

From 8e5cfb55d3f7dc764cd7f4c966d4c2687eaf7569 Mon Sep 17 00:00:00 2001
From: Zhao Lei
Date: Tue, 20 Jan 2015 15:11:33 +0800
Subject: Btrfs: Make raid_map array be inlined in btrfs_bio structure

This makes the code simpler and clearer: we no longer need to care
about freeing bbio and raid_map together.

Signed-off-by: Miao Xie
Signed-off-by: Zhao Lei
Signed-off-by: Chris Mason

diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 8ab2a17..e301d33 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -79,13 +79,6 @@ struct btrfs_raid_bio {
 	struct btrfs_fs_info *fs_info;
 	struct btrfs_bio *bbio;
 
-	/*
-	 * logical block numbers for the start of each stripe
-	 * The last one or two are p/q.  These are sorted,
-	 * so raid_map[0] is the start of our full stripe
-	 */
-	u64 *raid_map;
-
 	/* while we're doing rmw on a stripe
 	 * we put it into a hash table so we can
 	 * lock the stripe and merge more rbios
@@ -303,7 +296,7 @@ static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
  */
 static int rbio_bucket(struct btrfs_raid_bio *rbio)
 {
-	u64 num = rbio->raid_map[0];
+	u64 num = rbio->bbio->raid_map[0];
 
 	/*
 	 * we shift down quite a bit.
We're using byte @@ -606,8 +599,8 @@ static int rbio_can_merge(struct btrfs_raid_bio *last, test_bit(RBIO_CACHE_BIT, &cur->flags)) return 0; - if (last->raid_map[0] != - cur->raid_map[0]) + if (last->bbio->raid_map[0] != + cur->bbio->raid_map[0]) return 0; /* we can't merge with different operations */ @@ -689,7 +682,7 @@ static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio) spin_lock_irqsave(&h->lock, flags); list_for_each_entry(cur, &h->hash_list, hash_list) { walk++; - if (cur->raid_map[0] == rbio->raid_map[0]) { + if (cur->bbio->raid_map[0] == rbio->bbio->raid_map[0]) { spin_lock(&cur->bio_list_lock); /* can we steal this cached rbio's pages? */ @@ -842,18 +835,16 @@ done_nolock: } static inline void -__free_bbio_and_raid_map(struct btrfs_bio *bbio, u64 *raid_map, int need) +__free_bbio(struct btrfs_bio *bbio, int need) { - if (need) { - kfree(raid_map); + if (need) kfree(bbio); - } } -static inline void free_bbio_and_raid_map(struct btrfs_raid_bio *rbio) +static inline void free_bbio(struct btrfs_raid_bio *rbio) { - __free_bbio_and_raid_map(rbio->bbio, rbio->raid_map, - !test_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags)); + __free_bbio(rbio->bbio, + !test_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags)); } static void __free_raid_bio(struct btrfs_raid_bio *rbio) @@ -875,7 +866,7 @@ static void __free_raid_bio(struct btrfs_raid_bio *rbio) } } - free_bbio_and_raid_map(rbio); + free_bbio(rbio); kfree(rbio); } @@ -985,8 +976,7 @@ static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes) * this does not allocate any pages for rbio->pages. */ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root, - struct btrfs_bio *bbio, u64 *raid_map, - u64 stripe_len) + struct btrfs_bio *bbio, u64 stripe_len) { struct btrfs_raid_bio *rbio; int nr_data = 0; @@ -1007,7 +997,6 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root, INIT_LIST_HEAD(&rbio->stripe_cache); INIT_LIST_HEAD(&rbio->hash_list); rbio->bbio = bbio; - rbio->raid_map = raid_map; rbio->fs_info = root->fs_info; rbio->stripe_len = stripe_len; rbio->nr_pages = num_pages; @@ -1028,7 +1017,7 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root, rbio->bio_pages = p + sizeof(struct page *) * num_pages; rbio->dbitmap = p + sizeof(struct page *) * num_pages * 2; - if (raid_map[real_stripes - 1] == RAID6_Q_STRIPE) + if (bbio->raid_map[real_stripes - 1] == RAID6_Q_STRIPE) nr_data = real_stripes - 2; else nr_data = real_stripes - 1; @@ -1182,7 +1171,7 @@ static void index_rbio_pages(struct btrfs_raid_bio *rbio) spin_lock_irq(&rbio->bio_list_lock); bio_list_for_each(bio, &rbio->bio_list) { start = (u64)bio->bi_iter.bi_sector << 9; - stripe_offset = start - rbio->raid_map[0]; + stripe_offset = start - rbio->bbio->raid_map[0]; page_index = stripe_offset >> PAGE_CACHE_SHIFT; for (i = 0; i < bio->bi_vcnt; i++) { @@ -1402,7 +1391,7 @@ static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio, logical <<= 9; for (i = 0; i < rbio->nr_data; i++) { - stripe_start = rbio->raid_map[i]; + stripe_start = rbio->bbio->raid_map[i]; if (logical >= stripe_start && logical < stripe_start + rbio->stripe_len) { return i; @@ -1776,17 +1765,16 @@ static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule) * our main entry point for writes from the rest of the FS. 
*/ int raid56_parity_write(struct btrfs_root *root, struct bio *bio, - struct btrfs_bio *bbio, u64 *raid_map, - u64 stripe_len) + struct btrfs_bio *bbio, u64 stripe_len) { struct btrfs_raid_bio *rbio; struct btrfs_plug_cb *plug = NULL; struct blk_plug_cb *cb; int ret; - rbio = alloc_rbio(root, bbio, raid_map, stripe_len); + rbio = alloc_rbio(root, bbio, stripe_len); if (IS_ERR(rbio)) { - __free_bbio_and_raid_map(bbio, raid_map, 1); + __free_bbio(bbio, 1); return PTR_ERR(rbio); } bio_list_add(&rbio->bio_list, bio); @@ -1885,7 +1873,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) } /* all raid6 handling here */ - if (rbio->raid_map[rbio->real_stripes - 1] == + if (rbio->bbio->raid_map[rbio->real_stripes - 1] == RAID6_Q_STRIPE) { /* @@ -1922,8 +1910,9 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) * here due to a crc mismatch and we can't give them the * data they want */ - if (rbio->raid_map[failb] == RAID6_Q_STRIPE) { - if (rbio->raid_map[faila] == RAID5_P_STRIPE) { + if (rbio->bbio->raid_map[failb] == RAID6_Q_STRIPE) { + if (rbio->bbio->raid_map[faila] == + RAID5_P_STRIPE) { err = -EIO; goto cleanup; } @@ -1934,7 +1923,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) goto pstripe; } - if (rbio->raid_map[failb] == RAID5_P_STRIPE) { + if (rbio->bbio->raid_map[failb] == RAID5_P_STRIPE) { raid6_datap_recov(rbio->real_stripes, PAGE_SIZE, faila, pointers); } else { @@ -2156,15 +2145,15 @@ cleanup: * of the drive. */ int raid56_parity_recover(struct btrfs_root *root, struct bio *bio, - struct btrfs_bio *bbio, u64 *raid_map, - u64 stripe_len, int mirror_num, int generic_io) + struct btrfs_bio *bbio, u64 stripe_len, + int mirror_num, int generic_io) { struct btrfs_raid_bio *rbio; int ret; - rbio = alloc_rbio(root, bbio, raid_map, stripe_len); + rbio = alloc_rbio(root, bbio, stripe_len); if (IS_ERR(rbio)) { - __free_bbio_and_raid_map(bbio, raid_map, generic_io); + __free_bbio(bbio, generic_io); return PTR_ERR(rbio); } @@ -2175,7 +2164,7 @@ int raid56_parity_recover(struct btrfs_root *root, struct bio *bio, rbio->faila = find_logical_bio_stripe(rbio, bio); if (rbio->faila == -1) { BUG(); - __free_bbio_and_raid_map(bbio, raid_map, generic_io); + __free_bbio(bbio, generic_io); kfree(rbio); return -EIO; } @@ -2240,14 +2229,14 @@ static void read_rebuild_work(struct btrfs_work *work) struct btrfs_raid_bio * raid56_parity_alloc_scrub_rbio(struct btrfs_root *root, struct bio *bio, - struct btrfs_bio *bbio, u64 *raid_map, - u64 stripe_len, struct btrfs_device *scrub_dev, + struct btrfs_bio *bbio, u64 stripe_len, + struct btrfs_device *scrub_dev, unsigned long *dbitmap, int stripe_nsectors) { struct btrfs_raid_bio *rbio; int i; - rbio = alloc_rbio(root, bbio, raid_map, stripe_len); + rbio = alloc_rbio(root, bbio, stripe_len); if (IS_ERR(rbio)) return NULL; bio_list_add(&rbio->bio_list, bio); @@ -2279,10 +2268,10 @@ void raid56_parity_add_scrub_pages(struct btrfs_raid_bio *rbio, int stripe_offset; int index; - ASSERT(logical >= rbio->raid_map[0]); - ASSERT(logical + PAGE_SIZE <= rbio->raid_map[0] + + ASSERT(logical >= rbio->bbio->raid_map[0]); + ASSERT(logical + PAGE_SIZE <= rbio->bbio->raid_map[0] + rbio->stripe_len * rbio->nr_data); - stripe_offset = (int)(logical - rbio->raid_map[0]); + stripe_offset = (int)(logical - rbio->bbio->raid_map[0]); index = stripe_offset >> PAGE_CACHE_SHIFT; rbio->bio_pages[index] = page; } diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h index 31d4a15..2b5d797 100644 --- a/fs/btrfs/raid56.h +++ b/fs/btrfs/raid56.h @@ 
-43,16 +43,15 @@ struct btrfs_raid_bio; struct btrfs_device; int raid56_parity_recover(struct btrfs_root *root, struct bio *bio, - struct btrfs_bio *bbio, u64 *raid_map, - u64 stripe_len, int mirror_num, int generic_io); + struct btrfs_bio *bbio, u64 stripe_len, + int mirror_num, int generic_io); int raid56_parity_write(struct btrfs_root *root, struct bio *bio, - struct btrfs_bio *bbio, u64 *raid_map, - u64 stripe_len); + struct btrfs_bio *bbio, u64 stripe_len); struct btrfs_raid_bio * raid56_parity_alloc_scrub_rbio(struct btrfs_root *root, struct bio *bio, - struct btrfs_bio *bbio, u64 *raid_map, - u64 stripe_len, struct btrfs_device *scrub_dev, + struct btrfs_bio *bbio, u64 stripe_len, + struct btrfs_device *scrub_dev, unsigned long *dbitmap, int stripe_nsectors); void raid56_parity_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page, u64 logical); diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 673e32b..9d07c98 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -66,7 +66,6 @@ struct scrub_ctx; struct scrub_recover { atomic_t refs; struct btrfs_bio *bbio; - u64 *raid_map; u64 map_length; }; @@ -857,7 +856,6 @@ static inline void scrub_put_recover(struct scrub_recover *recover) { if (atomic_dec_and_test(&recover->refs)) { kfree(recover->bbio); - kfree(recover->raid_map); kfree(recover); } } @@ -1296,12 +1294,12 @@ out: return 0; } -static inline int scrub_nr_raid_mirrors(struct btrfs_bio *bbio, u64 *raid_map) +static inline int scrub_nr_raid_mirrors(struct btrfs_bio *bbio) { - if (raid_map) { + if (bbio->raid_map) { int real_stripes = bbio->num_stripes - bbio->num_tgtdevs; - if (raid_map[real_stripes - 1] == RAID6_Q_STRIPE) + if (bbio->raid_map[real_stripes - 1] == RAID6_Q_STRIPE) return 3; else return 2; @@ -1347,7 +1345,6 @@ static int scrub_setup_recheck_block(struct scrub_ctx *sctx, { struct scrub_recover *recover; struct btrfs_bio *bbio; - u64 *raid_map; u64 sublen; u64 mapped_length; u64 stripe_offset; @@ -1368,35 +1365,31 @@ static int scrub_setup_recheck_block(struct scrub_ctx *sctx, sublen = min_t(u64, length, PAGE_SIZE); mapped_length = sublen; bbio = NULL; - raid_map = NULL; /* * with a length of PAGE_SIZE, each returned stripe * represents one mirror */ ret = btrfs_map_sblock(fs_info, REQ_GET_READ_MIRRORS, logical, - &mapped_length, &bbio, 0, &raid_map); + &mapped_length, &bbio, 0, 1); if (ret || !bbio || mapped_length < sublen) { kfree(bbio); - kfree(raid_map); return -EIO; } recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS); if (!recover) { kfree(bbio); - kfree(raid_map); return -ENOMEM; } atomic_set(&recover->refs, 1); recover->bbio = bbio; - recover->raid_map = raid_map; recover->map_length = mapped_length; BUG_ON(page_index >= SCRUB_PAGES_PER_RD_BIO); - nmirrors = scrub_nr_raid_mirrors(bbio, raid_map); + nmirrors = scrub_nr_raid_mirrors(bbio); for (mirror_index = 0; mirror_index < nmirrors; mirror_index++) { struct scrub_block *sblock; @@ -1420,7 +1413,7 @@ leave_nomem: sblock->pagev[page_index] = page; page->logical = logical; - scrub_stripe_index_and_offset(logical, raid_map, + scrub_stripe_index_and_offset(logical, bbio->raid_map, mapped_length, bbio->num_stripes - bbio->num_tgtdevs, @@ -1469,7 +1462,7 @@ static void scrub_bio_wait_endio(struct bio *bio, int error) static inline int scrub_is_page_on_raid56(struct scrub_page *page) { - return page->recover && page->recover->raid_map; + return page->recover && page->recover->bbio->raid_map; } static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info, @@ -1486,7 +1479,6 @@ static 
int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info, bio->bi_end_io = scrub_bio_wait_endio; ret = raid56_parity_recover(fs_info->fs_root, bio, page->recover->bbio, - page->recover->raid_map, page->recover->map_length, page->mirror_num, 0); if (ret) @@ -2716,7 +2708,6 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity) struct btrfs_raid_bio *rbio; struct scrub_page *spage; struct btrfs_bio *bbio = NULL; - u64 *raid_map = NULL; u64 length; int ret; @@ -2727,8 +2718,8 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity) length = sparity->logic_end - sparity->logic_start + 1; ret = btrfs_map_sblock(sctx->dev_root->fs_info, WRITE, sparity->logic_start, - &length, &bbio, 0, &raid_map); - if (ret || !bbio || !raid_map) + &length, &bbio, 0, 1); + if (ret || !bbio || !bbio->raid_map) goto bbio_out; bio = btrfs_io_bio_alloc(GFP_NOFS, 0); @@ -2740,8 +2731,7 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity) bio->bi_end_io = scrub_parity_bio_endio; rbio = raid56_parity_alloc_scrub_rbio(sctx->dev_root, bio, bbio, - raid_map, length, - sparity->scrub_dev, + length, sparity->scrub_dev, sparity->dbitmap, sparity->nsectors); if (!rbio) @@ -2759,7 +2749,6 @@ rbio_out: bio_put(bio); bbio_out: kfree(bbio); - kfree(raid_map); bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap, sparity->nsectors); spin_lock(&sctx->stat_lock); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 711ce38..c0f1d52 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4876,8 +4876,7 @@ static inline int parity_smaller(u64 a, u64 b) } /* Bubble-sort the stripe set to put the parity/syndrome stripes last */ -static void sort_parity_stripes(struct btrfs_bio *bbio, u64 *raid_map, - int num_stripes) +static void sort_parity_stripes(struct btrfs_bio *bbio, int num_stripes) { struct btrfs_bio_stripe s; int i; @@ -4887,13 +4886,14 @@ static void sort_parity_stripes(struct btrfs_bio *bbio, u64 *raid_map, while (again) { again = 0; for (i = 0; i < num_stripes - 1; i++) { - if (parity_smaller(raid_map[i], raid_map[i+1])) { + if (parity_smaller(bbio->raid_map[i], + bbio->raid_map[i+1])) { s = bbio->stripes[i]; - l = raid_map[i]; + l = bbio->raid_map[i]; bbio->stripes[i] = bbio->stripes[i+1]; - raid_map[i] = raid_map[i+1]; + bbio->raid_map[i] = bbio->raid_map[i+1]; bbio->stripes[i+1] = s; - raid_map[i+1] = l; + bbio->raid_map[i+1] = l; again = 1; } @@ -4904,7 +4904,7 @@ static void sort_parity_stripes(struct btrfs_bio *bbio, u64 *raid_map, static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, u64 logical, u64 *length, struct btrfs_bio **bbio_ret, - int mirror_num, u64 **raid_map_ret) + int mirror_num, int need_raid_map) { struct extent_map *em; struct map_lookup *map; @@ -4917,7 +4917,6 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, u64 stripe_nr_orig; u64 stripe_nr_end; u64 stripe_len; - u64 *raid_map = NULL; int stripe_index; int i; int ret = 0; @@ -5039,7 +5038,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, u64 physical_of_found = 0; ret = __btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, - logical, &tmp_length, &tmp_bbio, 0, NULL); + logical, &tmp_length, &tmp_bbio, 0, 0); if (ret) { WARN_ON(tmp_bbio != NULL); goto out; @@ -5160,13 +5159,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, } else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) { - u64 tmp; - - if (raid_map_ret && + if (need_raid_map && ((rw & (REQ_WRITE | 
REQ_GET_READ_MIRRORS)) || mirror_num > 1)) { - int i, rot; - /* push stripe_nr back to the start of the full stripe */ stripe_nr = raid56_full_stripe_start; do_div(stripe_nr, stripe_len * nr_data_stripes(map)); @@ -5175,32 +5170,12 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, num_stripes = map->num_stripes; max_errors = nr_parity_stripes(map); - raid_map = kmalloc_array(num_stripes, sizeof(u64), - GFP_NOFS); - if (!raid_map) { - ret = -ENOMEM; - goto out; - } - - /* Work out the disk rotation on this stripe-set */ - tmp = stripe_nr; - rot = do_div(tmp, num_stripes); - - /* Fill in the logical address of each stripe */ - tmp = stripe_nr * nr_data_stripes(map); - for (i = 0; i < nr_data_stripes(map); i++) - raid_map[(i+rot) % num_stripes] = - em->start + (tmp + i) * map->stripe_len; - - raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE; - if (map->type & BTRFS_BLOCK_GROUP_RAID6) - raid_map[(i+rot+1) % num_stripes] = - RAID6_Q_STRIPE; - *length = map->stripe_len; stripe_index = 0; stripe_offset = 0; } else { + u64 tmp; + /* * Mirror #0 or #1 means the original data block. * Mirror #2 is RAID5 parity block. @@ -5241,7 +5216,6 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, bbio = kzalloc(btrfs_bio_size(num_alloc_stripes, tgtdev_indexes), GFP_NOFS); if (!bbio) { - kfree(raid_map); ret = -ENOMEM; goto out; } @@ -5249,6 +5223,34 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, if (dev_replace_is_ongoing) bbio->tgtdev_map = (int *)(bbio->stripes + num_alloc_stripes); + /* build raid_map */ + if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6) && + need_raid_map && ((rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) || + mirror_num > 1)) { + u64 tmp; + int i, rot; + + bbio->raid_map = (u64 *)((void *)bbio->stripes + + sizeof(struct btrfs_bio_stripe) * + num_alloc_stripes + + sizeof(int) * tgtdev_indexes); + + /* Work out the disk rotation on this stripe-set */ + tmp = stripe_nr; + rot = do_div(tmp, num_stripes); + + /* Fill in the logical address of each stripe */ + tmp = stripe_nr * nr_data_stripes(map); + for (i = 0; i < nr_data_stripes(map); i++) + bbio->raid_map[(i+rot) % num_stripes] = + em->start + (tmp + i) * map->stripe_len; + + bbio->raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE; + if (map->type & BTRFS_BLOCK_GROUP_RAID6) + bbio->raid_map[(i+rot+1) % num_stripes] = + RAID6_Q_STRIPE; + } + if (rw & REQ_DISCARD) { int factor = 0; int sub_stripes = 0; @@ -5332,8 +5334,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) max_errors = btrfs_chunk_max_errors(map); - if (raid_map) - sort_parity_stripes(bbio, raid_map, num_stripes); + if (bbio->raid_map) + sort_parity_stripes(bbio, num_stripes); tgtdev_indexes = 0; if (dev_replace_is_ongoing && (rw & (REQ_WRITE | REQ_DISCARD)) && @@ -5438,9 +5440,6 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, bbio->stripes[0].physical = physical_to_patch_in_first_stripe; bbio->mirror_num = map->num_stripes + 1; } - - if (raid_map_ret) - *raid_map_ret = raid_map; out: if (dev_replace_is_ongoing) btrfs_dev_replace_unlock(dev_replace); @@ -5453,17 +5452,17 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, struct btrfs_bio **bbio_ret, int mirror_num) { return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret, - mirror_num, NULL); + mirror_num, 0); } /* For Scrub/replace */ int btrfs_map_sblock(struct btrfs_fs_info *fs_info, int rw, u64 logical, u64 *length, struct 
btrfs_bio **bbio_ret, int mirror_num, - u64 **raid_map_ret) + int need_raid_map) { return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret, - mirror_num, raid_map_ret); + mirror_num, need_raid_map); } int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, @@ -5802,7 +5801,6 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, u64 logical = (u64)bio->bi_iter.bi_sector << 9; u64 length = 0; u64 map_length; - u64 *raid_map = NULL; int ret; int dev_nr = 0; int total_devs = 1; @@ -5813,7 +5811,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, btrfs_bio_counter_inc_blocked(root->fs_info); ret = __btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio, - mirror_num, &raid_map); + mirror_num, 1); if (ret) { btrfs_bio_counter_dec(root->fs_info); return ret; } @@ -5826,15 +5824,13 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, bbio->fs_info = root->fs_info; atomic_set(&bbio->stripes_pending, bbio->num_stripes); - if (raid_map) { + if (bbio->raid_map) { /* In this case, map_length has been set to the length of a single stripe; not the whole write */ if (rw & WRITE) { - ret = raid56_parity_write(root, bio, bbio, - raid_map, map_length); + ret = raid56_parity_write(root, bio, bbio, map_length); } else { - ret = raid56_parity_recover(root, bio, bbio, - raid_map, map_length, + ret = raid56_parity_recover(root, bio, bbio, map_length, mirror_num, 1); } diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index d6fe73c..fb0e8c3 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -307,6 +307,12 @@ struct btrfs_bio { int mirror_num; int num_tgtdevs; int *tgtdev_map; + /* + * logical block numbers for the start of each stripe + * The last one or two are p/q. These are sorted, + * so raid_map[0] is the start of our full stripe + */ + u64 *raid_map; struct btrfs_bio_stripe stripes[]; }; @@ -392,7 +398,8 @@ int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, #define btrfs_bio_size(total_stripes, real_stripes) \ (sizeof(struct btrfs_bio) + \ (sizeof(struct btrfs_bio_stripe) * (total_stripes)) + \ - (sizeof(int) * (real_stripes))) + (sizeof(int) * (real_stripes)) + \ + (sizeof(u64) * (real_stripes))) int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, u64 logical, u64 *length, @@ -400,7 +407,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, int btrfs_map_sblock(struct btrfs_fs_info *fs_info, int rw, u64 logical, u64 *length, struct btrfs_bio **bbio_ret, int mirror_num, - u64 **raid_map_ret); + int need_raid_map); int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, u64 chunk_start, u64 physical, u64 devid, u64 **logical, int *naddrs, int *stripe_len); -- cgit v0.10.2 From 6e9606d2a2dce098c1739fb3cd82a1c34fd73d3a Mon Sep 17 00:00:00 2001 From: Zhao Lei Date: Tue, 20 Jan 2015 15:11:34 +0800 Subject: Btrfs: add ref_count and free function for btrfs_bio 1: A ref count is simpler than the current RBIO_HOLD_BBIO_MAP_BIT flag for keeping the btrfs_bio's memory alive in the raid56 recovery code. 2: A free function for the bbio makes the code cleaner and more flexible, and adds forced data type checking at compile time.
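The lifetime rule this introduces can be modeled in plain C. Below is a minimal userspace sketch using C11 atomics; the struct and function names are simplified stand-ins for the kernel types, not the actual btrfs API:

#include <stdatomic.h>
#include <stdlib.h>

/* Simplified stand-in for struct btrfs_bio: only the refcount matters here. */
struct bbio {
	atomic_int refs;
	/* stripes, raid_map, ... elided */
};

static struct bbio *bbio_alloc(void)
{
	struct bbio *b = calloc(1, sizeof(*b));
	if (b)
		atomic_init(&b->refs, 1);	/* the allocator owns one reference */
	return b;
}

static void bbio_get(struct bbio *b)
{
	atomic_fetch_add(&b->refs, 1);		/* take an extra reference */
}

static void bbio_put(struct bbio *b)
{
	if (!b)
		return;				/* tolerate NULL like kfree() */
	if (atomic_fetch_sub(&b->refs, 1) == 1)
		free(b);			/* last reference dropped */
}

int main(void)
{
	struct bbio *b = bbio_alloc();
	if (!b)
		return 1;
	bbio_get(b);	/* second holder appears */
	bbio_put(b);	/* first put: object survives */
	bbio_put(b);	/* second put: freed */
	return 0;
}

Every exit path can then call the put function unconditionally instead of deciding between kfree() and "someone else still holds it", which is the flexibility the description above refers to.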
Changelog v1->v2: Rename following by David Sterba's suggestion: put_btrfs_bio() -> btrfs_put_bio() get_btrfs_bio() -> btrfs_get_bio() bbio->ref_count -> bbio->refs Signed-off-by: Zhao Lei Signed-off-by: Miao Xie Signed-off-by: Chris Mason diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 3af53f8..1d361ac 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1926,7 +1926,7 @@ int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, */ ret = 0; } - kfree(bbio); + btrfs_put_bbio(bbio); } if (actual_bytes) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 65bd285..dab8af4 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2058,7 +2058,7 @@ int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical, sector = bbio->stripes[mirror_num-1].physical >> 9; bio->bi_iter.bi_sector = sector; dev = bbio->stripes[mirror_num-1].dev; - kfree(bbio); + btrfs_put_bbio(bbio); if (!dev || !dev->bdev || !dev->writeable) { bio_put(bio); return -EIO; diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index e301d33..cbc4162 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -58,15 +58,6 @@ */ #define RBIO_CACHE_READY_BIT 3 -/* - * bbio and raid_map is managed by the caller, so we shouldn't free - * them here. And besides that, all rbios with this flag should not - * be cached, because we need raid_map to check the rbios' stripe - * is the same or not, but it is very likely that the caller has - * free raid_map, so don't cache those rbios. - */ -#define RBIO_HOLD_BBIO_MAP_BIT 4 - #define RBIO_CACHE_SIZE 1024 enum btrfs_rbio_ops { @@ -834,19 +825,6 @@ done_nolock: remove_rbio_from_cache(rbio); } -static inline void -__free_bbio(struct btrfs_bio *bbio, int need) -{ - if (need) - kfree(bbio); -} - -static inline void free_bbio(struct btrfs_raid_bio *rbio) -{ - __free_bbio(rbio->bbio, - !test_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags)); -} - static void __free_raid_bio(struct btrfs_raid_bio *rbio) { int i; @@ -866,8 +844,7 @@ static void __free_raid_bio(struct btrfs_raid_bio *rbio) } } - free_bbio(rbio); - + btrfs_put_bbio(rbio->bbio); kfree(rbio); } @@ -1774,7 +1751,7 @@ int raid56_parity_write(struct btrfs_root *root, struct bio *bio, rbio = alloc_rbio(root, bbio, stripe_len); if (IS_ERR(rbio)) { - __free_bbio(bbio, 1); + btrfs_put_bbio(bbio); return PTR_ERR(rbio); } bio_list_add(&rbio->bio_list, bio); @@ -1990,8 +1967,7 @@ cleanup: cleanup_io: if (rbio->operation == BTRFS_RBIO_READ_REBUILD) { - if (err == 0 && - !test_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags)) + if (err == 0) cache_rbio_pages(rbio); else clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); @@ -2153,7 +2129,8 @@ int raid56_parity_recover(struct btrfs_root *root, struct bio *bio, rbio = alloc_rbio(root, bbio, stripe_len); if (IS_ERR(rbio)) { - __free_bbio(bbio, generic_io); + if (generic_io) + btrfs_put_bbio(bbio); return PTR_ERR(rbio); } @@ -2164,7 +2141,8 @@ int raid56_parity_recover(struct btrfs_root *root, struct bio *bio, rbio->faila = find_logical_bio_stripe(rbio, bio); if (rbio->faila == -1) { BUG(); - __free_bbio(bbio, generic_io); + if (generic_io) + btrfs_put_bbio(bbio); kfree(rbio); return -EIO; } @@ -2173,7 +2151,7 @@ int raid56_parity_recover(struct btrfs_root *root, struct bio *bio, btrfs_bio_counter_inc_noblocked(root->fs_info); rbio->generic_bio_cnt = 1; } else { - set_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags); + btrfs_get_bbio(bbio); } /* diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index 4d3d4e5..0e7beea 100644 --- a/fs/btrfs/reada.c +++ 
b/fs/btrfs/reada.c @@ -461,7 +461,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, spin_unlock(&fs_info->reada_lock); btrfs_dev_replace_unlock(&fs_info->dev_replace); - kfree(bbio); + btrfs_put_bbio(bbio); return re; error: @@ -486,7 +486,7 @@ error: kref_put(&zone->refcnt, reada_zone_release); spin_unlock(&fs_info->reada_lock); } - kfree(bbio); + btrfs_put_bbio(bbio); kfree(re); return re_exist; } diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 9d07c98..1388e71 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -855,7 +855,7 @@ static inline void scrub_get_recover(struct scrub_recover *recover) static inline void scrub_put_recover(struct scrub_recover *recover) { if (atomic_dec_and_test(&recover->refs)) { - kfree(recover->bbio); + btrfs_put_bbio(recover->bbio); kfree(recover); } } @@ -1373,13 +1373,13 @@ static int scrub_setup_recheck_block(struct scrub_ctx *sctx, ret = btrfs_map_sblock(fs_info, REQ_GET_READ_MIRRORS, logical, &mapped_length, &bbio, 0, 1); if (ret || !bbio || mapped_length < sublen) { - kfree(bbio); + btrfs_put_bbio(bbio); return -EIO; } recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS); if (!recover) { - kfree(bbio); + btrfs_put_bbio(bbio); return -ENOMEM; } @@ -2748,7 +2748,7 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity) rbio_out: bio_put(bio); bbio_out: - kfree(bbio); + btrfs_put_bbio(bbio); bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap, sparity->nsectors); spin_lock(&sctx->stat_lock); @@ -3879,14 +3879,14 @@ static void scrub_remap_extent(struct btrfs_fs_info *fs_info, &mapped_length, &bbio, 0); if (ret || !bbio || mapped_length < extent_len || !bbio->stripes[0].dev->bdev) { - kfree(bbio); + btrfs_put_bbio(bbio); return; } *extent_physical = bbio->stripes[0].physical; *extent_mirror_num = bbio->mirror_num; *extent_dev = bbio->stripes[0].dev; - kfree(bbio); + btrfs_put_bbio(bbio); } static int scrub_setup_wr_ctx(struct scrub_ctx *sctx, diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index c0f1d52..0843bcd 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4901,6 +4901,37 @@ static void sort_parity_stripes(struct btrfs_bio *bbio, int num_stripes) } } +static struct btrfs_bio *alloc_btrfs_bio(int total_stripes, int real_stripes) +{ + struct btrfs_bio *bbio = kzalloc( + sizeof(struct btrfs_bio) + + sizeof(struct btrfs_bio_stripe) * (total_stripes) + + sizeof(int) * (real_stripes) + + sizeof(u64) * (real_stripes), + GFP_NOFS); + if (!bbio) + return NULL; + + atomic_set(&bbio->error, 0); + atomic_set(&bbio->refs, 1); + + return bbio; +} + +void btrfs_get_bbio(struct btrfs_bio *bbio) +{ + WARN_ON(!atomic_read(&bbio->refs)); + atomic_inc(&bbio->refs); +} + +void btrfs_put_bbio(struct btrfs_bio *bbio) +{ + if (!bbio) + return; + if (atomic_dec_and_test(&bbio->refs)) + kfree(bbio); +} + static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, u64 logical, u64 *length, struct btrfs_bio **bbio_ret, @@ -5052,7 +5083,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, * is not left of the left cursor */ ret = -EIO; - kfree(tmp_bbio); + btrfs_put_bbio(tmp_bbio); goto out; } @@ -5087,11 +5118,11 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, } else { WARN_ON(1); ret = -EIO; - kfree(tmp_bbio); + btrfs_put_bbio(tmp_bbio); goto out; } - kfree(tmp_bbio); + btrfs_put_bbio(tmp_bbio); } else if (mirror_num > map->num_stripes) { mirror_num = 0; } @@ -5213,13 +5244,11 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, 
int rw, tgtdev_indexes = num_stripes; } - bbio = kzalloc(btrfs_bio_size(num_alloc_stripes, tgtdev_indexes), - GFP_NOFS); + bbio = alloc_btrfs_bio(num_alloc_stripes, tgtdev_indexes); if (!bbio) { ret = -ENOMEM; goto out; } - atomic_set(&bbio->error, 0); if (dev_replace_is_ongoing) bbio->tgtdev_map = (int *)(bbio->stripes + num_alloc_stripes); @@ -5558,7 +5587,7 @@ static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio, int e bio_endio_nodec(bio, err); else bio_endio(bio, err); - kfree(bbio); + btrfs_put_bbio(bbio); } static void btrfs_end_bio(struct bio *bio, int err) diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index fb0e8c3..4313e8f 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -295,6 +295,7 @@ typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err); #define BTRFS_BIO_ORIG_BIO_SUBMITTED (1 << 0) struct btrfs_bio { + atomic_t refs; atomic_t stripes_pending; struct btrfs_fs_info *fs_info; bio_end_io_t *end_io; @@ -394,13 +395,8 @@ struct btrfs_balance_control { int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, u64 end, u64 *length); - -#define btrfs_bio_size(total_stripes, real_stripes) \ - (sizeof(struct btrfs_bio) + \ - (sizeof(struct btrfs_bio_stripe) * (total_stripes)) + \ - (sizeof(int) * (real_stripes)) + \ - (sizeof(u64) * (real_stripes))) - +void btrfs_get_bbio(struct btrfs_bio *bbio); +void btrfs_put_bbio(struct btrfs_bio *bbio); int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, u64 logical, u64 *length, struct btrfs_bio **bbio_ret, int mirror_num); -- cgit v0.10.2 From b25c94c580c27bb6cff1e2f478746e77119215ad Mon Sep 17 00:00:00 2001 From: Zhao Lei Date: Tue, 20 Jan 2015 15:11:35 +0800 Subject: Btrfs: Fix a misplaced nodatasum_case label to avoid a wrong WARN_ON() if (sctx->is_dev_replace && !is_metadata && !have_csum) { ... goto nodatasum_case; } ... nodatasum_case: WARN_ON(sctx->is_dev_replace); In the code above, the nodatasum_case label should be moved below the WARN_ON(), so that the dev-replace shortcut no longer trips it. Signed-off-by: Zhao Lei Signed-off-by: Miao Xie Signed-off-by: Chris Mason diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 1388e71..016512c 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1036,9 +1036,10 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) if (!is_metadata && !have_csum) { struct scrub_fixup_nodatasum *fixup_nodatasum; -nodatasum_case: WARN_ON(sctx->is_dev_replace); +nodatasum_case: + /* * !is_metadata and !have_csum, this means that the data * might not be COW'ed, that it might be modified -- cgit v0.10.2 From 114ab50d823c2cc7cf60658fdbdaaba413009921 Mon Sep 17 00:00:00 2001 From: Zhao Lei Date: Tue, 20 Jan 2015 15:11:36 +0800 Subject: Btrfs: Remove unneeded force_write in scrub_write_block_to_dev_replace It is always 1 at this point, because the !1 case has already branched away earlier in the function.
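The simplification is ordinary dead-flag elimination; a stand-alone sketch follows, with names that are illustrative rather than the kernel's:

#include <stdio.h>

struct page { int io_error; };

/* Before (sketch): 'force_write' was threaded through, but every remaining
 * call on this path passed 1, so the first test was dead. */
static int repair_page_old(struct page *p, int force_write)
{
	if (!force_write && !p->io_error)
		return 0;		/* nothing to do */
	return 1;			/* would copy the page */
}

/* After: the constant is folded in and the parameter removed. */
static int repair_page_new(struct page *p)
{
	(void)p;
	return 1;			/* always copies, matching force_write == 1 */
}

int main(void)
{
	struct page p = { 0 };
	/* Prints "1 1": behavior is unchanged on the surviving path. */
	printf("%d %d\n", repair_page_old(&p, 1), repair_page_new(&p));
	return 0;
}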
Signed-off-by: Zhao Lei Signed-off-by: Miao Xie Signed-off-by: Chris Mason diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 016512c..f9e108b 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -250,8 +250,7 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, const u8 *csum, u64 generation, u16 csum_size); static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad, - struct scrub_block *sblock_good, - int force_write); + struct scrub_block *sblock_good); static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, struct scrub_block *sblock_good, int page_num, int force_write); @@ -1098,15 +1097,13 @@ nodatasum_case: sblock_other->no_io_error_seen) { if (sctx->is_dev_replace) { scrub_write_block_to_dev_replace(sblock_other); + goto corrected_error; } else { - int force_write = is_metadata || have_csum; - ret = scrub_repair_block_from_good_copy( - sblock_bad, sblock_other, - force_write); + sblock_bad, sblock_other); + if (!ret) + goto corrected_error; } - if (0 == ret) - goto corrected_error; } } @@ -1619,8 +1616,7 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, } static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad, - struct scrub_block *sblock_good, - int force_write) + struct scrub_block *sblock_good) { int page_num; int ret = 0; @@ -1630,8 +1626,7 @@ static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad, ret_sub = scrub_repair_page_from_good_copy(sblock_bad, sblock_good, - page_num, - force_write); + page_num, 1); if (ret_sub) ret = ret_sub; } -- cgit v0.10.2 From 09dd7a01c3436344fb4c3974b275e016264e0a27 Mon Sep 17 00:00:00 2001 From: Zhao Lei Date: Tue, 20 Jan 2015 15:11:37 +0800 Subject: Btrfs: Cleanup btrfs_bio_counter_inc_blocked() 1: Remove no-need DEFINE_WAIT(wait) 2: Add likely() for BTRFS_FS_STATE_DEV_REPLACING condition 3: Use while loop instead of goto Signed-off-by: Zhao Lei Signed-off-by: Miao Xie Signed-off-by: Chris Mason diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index ca6a3a3..92109b7 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -932,15 +932,15 @@ void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount) void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info) { - DEFINE_WAIT(wait); -again: - percpu_counter_inc(&fs_info->bio_counter); - if (test_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state)) { + while (1) { + percpu_counter_inc(&fs_info->bio_counter); + if (likely(!test_bit(BTRFS_FS_STATE_DEV_REPLACING, + &fs_info->fs_state))) + break; + btrfs_bio_counter_dec(fs_info); wait_event(fs_info->replace_wait, !test_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state)); - goto again; } - } -- cgit v0.10.2 From 7653947fe6d59db72fbc26c4fc9ef842febc79e3 Mon Sep 17 00:00:00 2001 From: Zhao Lei Date: Tue, 20 Jan 2015 15:11:38 +0800 Subject: Btrfs: btrfs_rm_dev_replace_blocked(): Use wait_event() Signed-off-by: Zhao Lei Signed-off-by: Miao Xie Signed-off-by: Chris Mason diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 92109b7..5ec03d9 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -440,18 +440,9 @@ leave: */ static void btrfs_rm_dev_replace_blocked(struct btrfs_fs_info *fs_info) { - s64 writers; - DEFINE_WAIT(wait); - set_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state); - do { - prepare_to_wait(&fs_info->replace_wait, &wait, - TASK_UNINTERRUPTIBLE); - writers = percpu_counter_sum(&fs_info->bio_counter); - if (writers) - schedule(); - 
finish_wait(&fs_info->replace_wait, &wait); - } while (writers); + wait_event(fs_info->replace_wait, !percpu_counter_sum( + &fs_info->bio_counter)); } /* -- cgit v0.10.2 From dc5f7a3bd820883793bb0d9789e938a34aa4da5f Mon Sep 17 00:00:00 2001 From: Zhao Lei Date: Tue, 20 Jan 2015 15:11:39 +0800 Subject: Btrfs: Break loop when reaching BTRFS_MAX_MIRRORS in scrub_setup_recheck_block() Using break instead of uselessly continuing the loop is more suitable in this case. Signed-off-by: Zhao Lei Signed-off-by: Miao Xie Signed-off-by: Chris Mason diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index f9e108b..68867b2 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1394,7 +1394,7 @@ static int scrub_setup_recheck_block(struct scrub_ctx *sctx, struct scrub_page *page; if (mirror_index >= BTRFS_MAX_MIRRORS) - continue; + break; sblock = sblocks_for_recheck + mirror_index; sblock->sctx = sctx; -- cgit v0.10.2 From 8d6738c1bd74a27ff6a5043c5211c7bff7745420 Mon Sep 17 00:00:00 2001 From: Zhao Lei Date: Tue, 20 Jan 2015 15:11:40 +0800 Subject: Btrfs: Separate the finding-right-mirror and writing-to-target steps in scrub_handle_errored_block() In the current code, the find-the-right-mirror logic and the write-to-target logic are mixed together: if we find a right mirror but then fail to write it to the target, this is treated as "no right block was found", and the target is filled from sblock_bad. Actually, "failed to write to the target" does not mean "the source block is wrong". This patch separates the two conditions in the logic and does some cleanup to make the code cleaner. Signed-off-by: Zhao Lei Signed-off-by: Miao Xie Signed-off-by: Chris Mason diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 68867b2..c40ffbc 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1114,35 +1114,21 @@ nodatasum_case: success = 1; for (page_num = 0; page_num < sblock_bad->page_count; page_num++) { - int sub_success; + struct scrub_block *sblock_other = NULL; - sub_success = 0; for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS && sblocks_for_recheck[mirror_index].page_count > 0; mirror_index++) { - struct scrub_block *sblock_other = - sblocks_for_recheck + mirror_index; - struct scrub_page *page_other = - sblock_other->pagev[page_num]; - - if (!page_other->io_error) { - ret = scrub_write_page_to_dev_replace( - sblock_other, page_num); - if (ret == 0) { - /* succeeded for this page */ - sub_success = 1; - break; - } else { - btrfs_dev_replace_stats_inc( - &sctx->dev_root-> - fs_info->dev_replace. - num_write_errors); - } + if (!sblocks_for_recheck[mirror_index]. + pagev[page_num]->io_error) { + sblock_other = sblocks_for_recheck + + mirror_index; + break; } } - if (!sub_success) { + if (!sblock_other) { /* * did not find a mirror to fetch the page * from. scrub_write_page_to_dev_replace() @@ -1150,13 +1136,17 @@ nodatasum_case: * filling the block with zeros before * submitting the write request */ + sblock_other = sblock_bad; + success = 0; + } + + if (scrub_write_page_to_dev_replace(sblock_other, + page_num) != 0) { + btrfs_dev_replace_stats_inc( + &sctx->dev_root-> + fs_info->dev_replace.
+ num_write_errors); success = 0; - ret = scrub_write_page_to_dev_replace( - sblock_bad, page_num); - if (ret) - btrfs_dev_replace_stats_inc( - &sctx->dev_root->fs_info-> - dev_replace.num_write_errors); } } -- cgit v0.10.2 From b968fed1c3810a0a0d575cab8a72431fbc807615 Mon Sep 17 00:00:00 2001 From: Zhao Lei Date: Tue, 20 Jan 2015 15:11:41 +0800 Subject: Btrfs: Combine per-page recover in dev-replace and scrub The code are similar, combine them to make code clean and easy to maintenance. Some lost condition are also completed with benefit of this combination. Signed-off-by: Zhao Lei Signed-off-by: Miao Xie Signed-off-by: Chris Mason diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index c40ffbc..aac40ae 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1107,54 +1107,10 @@ nodatasum_case: } } - /* - * for dev_replace, pick good pages and write to the target device. - */ - if (sctx->is_dev_replace) { - success = 1; - for (page_num = 0; page_num < sblock_bad->page_count; - page_num++) { - struct scrub_block *sblock_other = NULL; - - for (mirror_index = 0; - mirror_index < BTRFS_MAX_MIRRORS && - sblocks_for_recheck[mirror_index].page_count > 0; - mirror_index++) { - if (!sblocks_for_recheck[mirror_index]. - pagev[page_num]->io_error) { - sblock_other = sblocks_for_recheck + - mirror_index; - break; - } - } - - if (!sblock_other) { - /* - * did not find a mirror to fetch the page - * from. scrub_write_page_to_dev_replace() - * handles this case (page->io_error), by - * filling the block with zeros before - * submitting the write request - */ - sblock_other = sblock_bad; - success = 0; - } - - if (scrub_write_page_to_dev_replace(sblock_other, - page_num) != 0) { - btrfs_dev_replace_stats_inc( - &sctx->dev_root-> - fs_info->dev_replace. - num_write_errors); - success = 0; - } - } - - goto out; - } + if (sblock_bad->no_io_error_seen && !sctx->is_dev_replace) + goto did_not_correct_error; /* - * for regular scrub, repair those pages that are errored. * In case of I/O errors in the area that is supposed to be * repaired, continue by picking good copies of those pages. * Select the good pages from mirrors to rewrite bad pages from @@ -1178,44 +1134,64 @@ nodatasum_case: * mirror, even if other 512 byte sectors in the same PAGE_SIZE * area are unreadable. */ - - /* can only fix I/O errors from here on */ - if (sblock_bad->no_io_error_seen) - goto did_not_correct_error; - success = 1; - for (page_num = 0; page_num < sblock_bad->page_count; page_num++) { + for (page_num = 0; page_num < sblock_bad->page_count; + page_num++) { struct scrub_page *page_bad = sblock_bad->pagev[page_num]; + struct scrub_block *sblock_other = NULL; - if (!page_bad->io_error) + /* skip no-io-error page in scrub */ + if (!page_bad->io_error && !sctx->is_dev_replace) continue; - for (mirror_index = 0; - mirror_index < BTRFS_MAX_MIRRORS && - sblocks_for_recheck[mirror_index].page_count > 0; - mirror_index++) { - struct scrub_block *sblock_other = sblocks_for_recheck + - mirror_index; - struct scrub_page *page_other = sblock_other->pagev[ - page_num]; - - if (!page_other->io_error) { - ret = scrub_repair_page_from_good_copy( - sblock_bad, sblock_other, page_num, 0); - if (0 == ret) { - page_bad->io_error = 0; - break; /* succeeded for this page */ + /* try to find no-io-error page in mirrors */ + if (page_bad->io_error) { + for (mirror_index = 0; + mirror_index < BTRFS_MAX_MIRRORS && + sblocks_for_recheck[mirror_index].page_count > 0; + mirror_index++) { + if (!sblocks_for_recheck[mirror_index]. 
+ pagev[page_num]->io_error) { + sblock_other = sblocks_for_recheck + + mirror_index; + break; } } + if (!sblock_other) + success = 0; } - if (page_bad->io_error) { - /* did not find a mirror to copy the page from */ - success = 0; + if (sctx->is_dev_replace) { + /* + * did not find a mirror to fetch the page + * from. scrub_write_page_to_dev_replace() + * handles this case (page->io_error), by + * filling the block with zeros before + * submitting the write request + */ + if (!sblock_other) + sblock_other = sblock_bad; + + if (scrub_write_page_to_dev_replace(sblock_other, + page_num) != 0) { + btrfs_dev_replace_stats_inc( + &sctx->dev_root-> + fs_info->dev_replace. + num_write_errors); + success = 0; + } + } else if (sblock_other) { + ret = scrub_repair_page_from_good_copy(sblock_bad, + sblock_other, + page_num, 0); + if (0 == ret) + page_bad->io_error = 0; + else + success = 0; } } - if (success) { + if (success && !sctx->is_dev_replace) { if (is_metadata || have_csum) { /* * need to verify the checksum now that all -- cgit v0.10.2 From be50a8ddaae1d07135fd7e1c7017c1611075a560 Mon Sep 17 00:00:00 2001 From: Zhao Lei Date: Tue, 20 Jan 2015 15:11:42 +0800 Subject: Btrfs: Simplify scrub_setup_recheck_block()'s arguments scrub_setup_recheck_block() has many arguments, but most of them can be derived from one of them, so we can remove the rest to make the code cleaner. Some other cleanups for that function are also included in this patch. Signed-off-by: Zhao Lei Signed-off-by: Miao Xie Signed-off-by: Chris Mason diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index aac40ae..c3a9893 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -235,10 +235,7 @@ static void scrub_pending_bio_dec(struct scrub_ctx *sctx); static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx); static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx); static int scrub_handle_errored_block(struct scrub_block *sblock_to_check); -static int scrub_setup_recheck_block(struct scrub_ctx *sctx, - struct btrfs_fs_info *fs_info, - struct scrub_block *original_sblock, - u64 length, u64 logical, +static int scrub_setup_recheck_block(struct scrub_block *original_sblock, struct scrub_block *sblocks_for_recheck); static void scrub_recheck_block(struct btrfs_fs_info *fs_info, struct scrub_block *sblock, int is_metadata, @@ -960,8 +957,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) } /* setup the context, map the logical blocks and alloc the pages */ - ret = scrub_setup_recheck_block(sctx, fs_info, sblock_to_check, length, - logical, sblocks_for_recheck); + ret = scrub_setup_recheck_block(sblock_to_check, sblocks_for_recheck); if (ret) { spin_lock(&sctx->stat_lock); sctx->stat.read_errors++; @@ -1301,19 +1297,20 @@ static inline void scrub_stripe_index_and_offset(u64 logical, u64 *raid_map, } } -static int scrub_setup_recheck_block(struct scrub_ctx *sctx, - struct btrfs_fs_info *fs_info, - struct scrub_block *original_sblock, - u64 length, u64 logical, +static int scrub_setup_recheck_block(struct scrub_block *original_sblock, struct scrub_block *sblocks_for_recheck) { + struct scrub_ctx *sctx = original_sblock->sctx; + struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info; + u64 length = original_sblock->page_count * PAGE_SIZE; + u64 logical = original_sblock->pagev[0]->logical; struct scrub_recover *recover; struct btrfs_bio *bbio; u64 sublen; u64 mapped_length; u64 stripe_offset; int stripe_index; - int page_index; + int page_index = 0; int mirror_index; int nmirrors; int ret; @@ -1324,7 +1321,6 @@
static int scrub_setup_recheck_block(struct scrub_ctx *sctx, * the recheck procedure */ - page_index = 0; while (length > 0) { sublen = min_t(u64, length, PAGE_SIZE); mapped_length = sublen; @@ -1353,15 +1349,12 @@ static int scrub_setup_recheck_block(struct scrub_ctx *sctx, BUG_ON(page_index >= SCRUB_PAGES_PER_RD_BIO); - nmirrors = scrub_nr_raid_mirrors(bbio); + nmirrors = min(scrub_nr_raid_mirrors(bbio), BTRFS_MAX_MIRRORS); for (mirror_index = 0; mirror_index < nmirrors; mirror_index++) { struct scrub_block *sblock; struct scrub_page *page; - if (mirror_index >= BTRFS_MAX_MIRRORS) - break; - sblock = sblocks_for_recheck + mirror_index; sblock->sctx = sctx; page = kzalloc(sizeof(*page), GFP_NOFS); -- cgit v0.10.2 From 10f11900167a83e0c229c4c27e73e720ebd55b5c Mon Sep 17 00:00:00 2001 From: Zhao Lei Date: Tue, 20 Jan 2015 15:11:43 +0800 Subject: Btrfs: Include map_type in raid_bio Current code uses many kinds of "clever" ways to determine the operation target's raid type, such as: raid_map != NULL or raid_map[MAX_NR] == RAID[56]_Q_STRIPE To make the code easier to maintain, this patch puts the raid type into the bbio, so we can always get the raid type from the bbio in a "stupid" way. Signed-off-by: Zhao Lei Signed-off-by: Miao Xie Signed-off-by: Chris Mason diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index cbc4162..5264858 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -994,10 +994,12 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root, rbio->bio_pages = p + sizeof(struct page *) * num_pages; rbio->dbitmap = p + sizeof(struct page *) * num_pages * 2; - if (bbio->raid_map[real_stripes - 1] == RAID6_Q_STRIPE) + if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5) + nr_data = real_stripes - 1; + else if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID6) nr_data = real_stripes - 2; else - nr_data = real_stripes - 1; + BUG(); rbio->nr_data = nr_data; return rbio; @@ -1850,9 +1852,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) } /* all raid6 handling here */ - if (rbio->bbio->raid_map[rbio->real_stripes - 1] == - RAID6_Q_STRIPE) { - + if (rbio->bbio->map_type & BTRFS_BLOCK_GROUP_RAID6) { /* * single failure, rebuild from parity raid5 * style diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index c3a9893..19781e9 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1256,19 +1256,16 @@ out: static inline int scrub_nr_raid_mirrors(struct btrfs_bio *bbio) { - if (bbio->raid_map) { - int real_stripes = bbio->num_stripes - bbio->num_tgtdevs; - - if (bbio->raid_map[real_stripes - 1] == RAID6_Q_STRIPE) - return 3; - else - return 2; - } else { + if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5) + return 2; + else if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID6) + return 3; + else return (int)bbio->num_stripes; - } } -static inline void scrub_stripe_index_and_offset(u64 logical, u64 *raid_map, +static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type, + u64 *raid_map, u64 mapped_length, int nstripes, int mirror, int *stripe_index, @@ -1276,7 +1273,7 @@ static inline void scrub_stripe_index_and_offset(u64 logical, u64 *raid_map, { int i; - if (raid_map) { + if (map_type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) { /* RAID5/6 */ for (i = 0; i < nstripes; i++) { if (raid_map[i] == RAID6_Q_STRIPE || @@ -1350,6 +1347,7 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock, BUG_ON(page_index >= SCRUB_PAGES_PER_RD_BIO); nmirrors = min(scrub_nr_raid_mirrors(bbio), BTRFS_MAX_MIRRORS); + for (mirror_index = 0; mirror_index < nmirrors; mirror_index++) {
struct scrub_block *sblock; @@ -1370,7 +1368,9 @@ leave_nomem: sblock->pagev[page_index] = page; page->logical = logical; - scrub_stripe_index_and_offset(logical, bbio->raid_map, + scrub_stripe_index_and_offset(logical, + bbio->map_type, + bbio->raid_map, mapped_length, bbio->num_stripes - bbio->num_tgtdevs, @@ -1419,7 +1419,9 @@ static void scrub_bio_wait_endio(struct bio *bio, int error) static inline int scrub_is_page_on_raid56(struct scrub_page *page) { - return page->recover && page->recover->bbio->raid_map; + return page->recover && + (page->recover->bbio->map_type & (BTRFS_BLOCK_GROUP_RAID5 | + BTRFS_BLOCK_GROUP_RAID6)); } static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 0843bcd..8933d70 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -5453,6 +5453,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, } *bbio_ret = bbio; + bbio->map_type = map->type; bbio->num_stripes = num_stripes; bbio->max_errors = max_errors; bbio->mirror_num = mirror_num; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 4313e8f..83069de 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -298,6 +298,7 @@ struct btrfs_bio { atomic_t refs; atomic_t stripes_pending; struct btrfs_fs_info *fs_info; + u64 map_type; /* get from map_lookup->type */ bio_end_io_t *end_io; struct bio *orig_bio; unsigned long flags; -- cgit v0.10.2 From ffe2d2034bbb34f49f76c808550fdfbea2ea1659 Mon Sep 17 00:00:00 2001 From: Zhao Lei Date: Tue, 20 Jan 2015 15:11:44 +0800 Subject: Btrfs: Introduce BTRFS_BLOCK_GROUP_RAID56_MASK to check raid56 simply So we can check raid56 with: (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) instead of long: (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) Signed-off-by: Zhao Lei Signed-off-by: Miao Xie Signed-off-by: Chris Mason diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 0b4683f..2ecdac0 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1020,6 +1020,9 @@ enum btrfs_raid_types { BTRFS_BLOCK_GROUP_RAID6 | \ BTRFS_BLOCK_GROUP_DUP | \ BTRFS_BLOCK_GROUP_RAID10) +#define BTRFS_BLOCK_GROUP_RAID56_MASK (BTRFS_BLOCK_GROUP_RAID5 | \ + BTRFS_BLOCK_GROUP_RAID6) + /* * We need a bit for restriper to be able to tell when chunks of type * SINGLE are available. This "extended" profile format is used in diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 220f0c3..2d14acb 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7812,8 +7812,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, } /* async crcs make it difficult to collect full stripe writes. 
*/ - if (btrfs_get_alloc_profile(root, 1) & - (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) + if (btrfs_get_alloc_profile(root, 1) & BTRFS_BLOCK_GROUP_RAID56_MASK) async_submit = 0; else async_submit = 1; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 19781e9..1ae527c 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1273,7 +1273,7 @@ static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type, { int i; - if (map_type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) { + if (map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) { /* RAID5/6 */ for (i = 0; i < nstripes; i++) { if (raid_map[i] == RAID6_Q_STRIPE || @@ -1420,8 +1420,7 @@ static void scrub_bio_wait_endio(struct bio *bio, int error) static inline int scrub_is_page_on_raid56(struct scrub_page *page) { return page->recover && - (page->recover->bbio->map_type & (BTRFS_BLOCK_GROUP_RAID5 | - BTRFS_BLOCK_GROUP_RAID6)); + (page->recover->bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK); } static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info, @@ -2994,8 +2993,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { increment = map->stripe_len; mirror_num = num % map->num_stripes + 1; - } else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | - BTRFS_BLOCK_GROUP_RAID6)) { + } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { get_raid56_logic_offset(physical, num, map, &offset, NULL); increment = map->stripe_len * nr_data_stripes(map); mirror_num = 1; @@ -3029,8 +3027,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, */ logical = base + offset; physical_end = physical + nstripes * map->stripe_len; - if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | - BTRFS_BLOCK_GROUP_RAID6)) { + if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { get_raid56_logic_offset(physical_end, num, map, &logic_end, NULL); logic_end += base; @@ -3076,8 +3073,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, ret = 0; while (physical < physical_end) { /* for raid56, we skip parity stripe */ - if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | - BTRFS_BLOCK_GROUP_RAID6)) { + if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { ret = get_raid56_logic_offset(physical, num, map, &logical, &stripe_logical); logical += base; @@ -3235,8 +3231,7 @@ again: scrub_free_csums(sctx); if (extent_logical + extent_len < key.objectid + bytes) { - if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | - BTRFS_BLOCK_GROUP_RAID6)) { + if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { /* * loop until we find next data stripe * or we have finished all stripes. 
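The new test is plain flag arithmetic; a stand-alone sketch follows, with illustrative bit values rather than the real BTRFS_BLOCK_GROUP_* constants:

#include <stdint.h>
#include <stdio.h>

#define BG_RAID5	(1ULL << 7)	/* illustrative bits, not btrfs's */
#define BG_RAID6	(1ULL << 8)
#define BG_RAID56_MASK	(BG_RAID5 | BG_RAID6)

/* One masked test replaces the repeated two-flag disjunction. */
static int is_raid56(uint64_t map_type)
{
	return (map_type & BG_RAID56_MASK) != 0;
}

int main(void)
{
	/* Prints "1 1 0": both raid56 profiles match, others do not. */
	printf("%d %d %d\n", is_raid56(BG_RAID5), is_raid56(BG_RAID6),
	       is_raid56(1ULL << 3));
	return 0;
}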
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 8933d70..da7e0e1 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4196,7 +4196,7 @@ static u32 find_raid56_stripe_len(u32 data_devices, u32 dev_stripe_target) static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type) { - if (!(type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6))) + if (!(type & BTRFS_BLOCK_GROUP_RAID56_MASK)) return; btrfs_set_fs_incompat(info, RAID56); @@ -4803,10 +4803,8 @@ unsigned long btrfs_full_stripe_len(struct btrfs_root *root, BUG_ON(em->start > logical || em->start + em->len < logical); map = (struct map_lookup *)em->bdev; - if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | - BTRFS_BLOCK_GROUP_RAID6)) { + if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) len = map->stripe_len * nr_data_stripes(map); - } free_extent_map(em); return len; } @@ -4826,8 +4824,7 @@ int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree, BUG_ON(em->start > logical || em->start + em->len < logical); map = (struct map_lookup *)em->bdev; - if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | - BTRFS_BLOCK_GROUP_RAID6)) + if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) ret = 1; free_extent_map(em); return ret; @@ -4998,7 +4995,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, stripe_offset = offset - stripe_offset; /* if we're here for raid56, we need to know the stripe aligned start */ - if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) { + if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { unsigned long full_stripe_len = stripe_len * nr_data_stripes(map); raid56_full_stripe_start = offset; @@ -5011,8 +5008,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, if (rw & REQ_DISCARD) { /* we don't discard raid56 yet */ - if (map->type & - (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) { + if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { ret = -EOPNOTSUPP; goto out; } @@ -5022,7 +5018,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, /* For writes to RAID[56], allow a full stripeset across all disks. For other RAID types and for RAID[56] reads, just allow a single stripe (on a single disk). 
*/ - if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6) && + if ((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && (rw & REQ_WRITE)) { max_len = stripe_len * nr_data_stripes(map) - (offset - raid56_full_stripe_start); @@ -5188,8 +5184,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, mirror_num = stripe_index - old_stripe_index + 1; } - } else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | - BTRFS_BLOCK_GROUP_RAID6)) { + } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { if (need_raid_map && ((rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) || mirror_num > 1)) { @@ -5253,7 +5248,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, bbio->tgtdev_map = (int *)(bbio->stripes + num_alloc_stripes); /* build raid_map */ - if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6) && + if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map && ((rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) || mirror_num > 1)) { u64 tmp; @@ -5534,8 +5529,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, do_div(length, map->num_stripes / map->sub_stripes); else if (map->type & BTRFS_BLOCK_GROUP_RAID0) do_div(length, map->num_stripes); - else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | - BTRFS_BLOCK_GROUP_RAID6)) { + else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { do_div(length, nr_data_stripes(map)); rmap_len = map->stripe_len * nr_data_stripes(map); } -- cgit v0.10.2 From 570193454a5841a9fa477ce1bcc6c30ca6fcf552 Mon Sep 17 00:00:00 2001 From: Zhao Lei Date: Tue, 20 Jan 2015 15:11:45 +0800 Subject: Rename all ref_count to refs in struct refs is better than ref_count to record a struct's ref count. Signed-off-by: Zhao Lei Suggested-by: David Sterba Signed-off-by: Chris Mason diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 1ae527c..8af7372 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -79,7 +79,7 @@ struct scrub_page { u64 logical; u64 physical; u64 physical_for_dev_replace; - atomic_t ref_count; + atomic_t refs; struct { unsigned int mirror_num:8; unsigned int have_csum:1; @@ -112,7 +112,7 @@ struct scrub_block { struct scrub_page *pagev[SCRUB_MAX_PAGES_PER_BLOCK]; int page_count; atomic_t outstanding_pages; - atomic_t ref_count; /* free mem on transition to zero */ + atomic_t refs; /* free mem on transition to zero */ struct scrub_ctx *sctx; struct scrub_parity *sparity; struct { @@ -141,7 +141,7 @@ struct scrub_parity { int stripe_len; - atomic_t ref_count; + atomic_t refs; struct list_head spages; @@ -1313,7 +1313,7 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock, int ret; /* - * note: the two members ref_count and outstanding_pages + * note: the two members refs and outstanding_pages * are not used (and not set) in the blocks that are used for * the recheck procedure */ @@ -2026,12 +2026,12 @@ static int scrub_checksum_super(struct scrub_block *sblock) static void scrub_block_get(struct scrub_block *sblock) { - atomic_inc(&sblock->ref_count); + atomic_inc(&sblock->refs); } static void scrub_block_put(struct scrub_block *sblock) { - if (atomic_dec_and_test(&sblock->ref_count)) { + if (atomic_dec_and_test(&sblock->refs)) { int i; if (sblock->sparity) @@ -2045,12 +2045,12 @@ static void scrub_block_put(struct scrub_block *sblock) static void scrub_page_get(struct scrub_page *spage) { - atomic_inc(&spage->ref_count); + atomic_inc(&spage->refs); } static void scrub_page_put(struct scrub_page *spage) { - if (atomic_dec_and_test(&spage->ref_count)) { + if (atomic_dec_and_test(&spage->refs)) { if 
(spage->page) __free_page(spage->page); kfree(spage); @@ -2176,7 +2176,7 @@ static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len, /* one ref inside this function, plus one for each page added to * a bio later on */ - atomic_set(&sblock->ref_count, 1); + atomic_set(&sblock->refs, 1); sblock->sctx = sctx; sblock->no_io_error_seen = 1; @@ -2469,7 +2469,7 @@ static int scrub_pages_for_parity(struct scrub_parity *sparity, /* one ref inside this function, plus one for each page added to * a bio later on */ - atomic_set(&sblock->ref_count, 1); + atomic_set(&sblock->refs, 1); sblock->sctx = sctx; sblock->no_io_error_seen = 1; sblock->sparity = sparity; @@ -2721,12 +2721,12 @@ static inline int scrub_calc_parity_bitmap_len(int nsectors) static void scrub_parity_get(struct scrub_parity *sparity) { - atomic_inc(&sparity->ref_count); + atomic_inc(&sparity->refs); } static void scrub_parity_put(struct scrub_parity *sparity) { - if (!atomic_dec_and_test(&sparity->ref_count)) + if (!atomic_dec_and_test(&sparity->refs)) return; scrub_parity_check_and_repair(sparity); @@ -2776,7 +2776,7 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx, sparity->scrub_dev = sdev; sparity->logic_start = logic_start; sparity->logic_end = logic_end; - atomic_set(&sparity->ref_count, 1); + atomic_set(&sparity->refs, 1); INIT_LIST_HEAD(&sparity->spages); sparity->dbitmap = sparity->bitmap; sparity->ebitmap = (void *)sparity->bitmap + bitmap_len; -- cgit v0.10.2 From 26455d3318a1e2a38f783db07981e3ed67de40ed Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Wed, 17 Dec 2014 16:14:09 +0800 Subject: Btrfs: cleanup unused run_most "run_most" is not used anymore. Signed-off-by: Liu Bo Reviewed-by: Satoru Takeuchi Signed-off-by: Chris Mason diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 1d361ac..53294da 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2769,7 +2769,6 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *head; int ret; int run_all = count == (unsigned long)-1; - int run_most = 0; /* We'll clean this up in btrfs_cleanup_transaction */ if (trans->aborted) @@ -2779,10 +2778,8 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, root = root->fs_info->tree_root; delayed_refs = &trans->transaction->delayed_refs; - if (count == 0) { + if (count == 0) count = atomic_read(&delayed_refs->num_entries) * 2; - run_most = 1; - } again: #ifdef SCRAMBLE_DELAYED_REFS -- cgit v0.10.2 From 0ee13fe28ce387864c0d2b1e8c52b64abe2fcd02 Mon Sep 17 00:00:00 2001 From: Yang Dongsheng Date: Tue, 6 Jan 2015 20:54:42 +0800 Subject: btrfs: qgroup: move WARN_ON() to the correct location. In the function qgroup_excl_accounting(), we need to WARN when qg->excl is less than what we want to free, for the child as well as for the parents. But currently, for the parent qgroup, the WARN_ON() is located after qg->excl has already been decremented, so it warns even when we free normally. This patch moves the WARN_ON() before the update of qg->excl.
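The ordering matters because the invariant is about the pre-update value; a userspace model follows, with types and names that are stand-ins rather than the kernel's:

#include <stdint.h>
#include <stdio.h>

/* Stand-in for the qgroup counters; only 'excl' matters here. */
struct qgroup { uint64_t excl; };

/* Apply a signed delta, warning only on genuine underflow. The check must
 * run before the counter is modified; afterwards the comparison would see
 * the already-updated value and fire on perfectly normal frees. */
static void account_excl(struct qgroup *qg, int sign, uint64_t bytes)
{
	if (sign < 0 && qg->excl < bytes)
		fprintf(stderr, "WARN: freeing %llu from %llu\n",
			(unsigned long long)bytes,
			(unsigned long long)qg->excl);
	if (sign < 0)
		qg->excl -= bytes;
	else
		qg->excl += bytes;
}

int main(void)
{
	struct qgroup qg = { .excl = 4096 };
	account_excl(&qg, -1, 4096);	/* normal free: no warning */
	account_excl(&qg, -1, 4096);	/* real underflow: warning fires */
	return 0;
}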
Signed-off-by: Dongsheng Yang Reviewed-by: Satoru Takeuchi Signed-off-by: Chris Mason diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 48b60db..97159a8 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1431,9 +1431,8 @@ static int qgroup_excl_accounting(struct btrfs_fs_info *fs_info, qgroup = u64_to_ptr(unode->aux); qgroup->rfer += sign * oper->num_bytes; qgroup->rfer_cmpr += sign * oper->num_bytes; + WARN_ON(sign < 0 && qgroup->excl < oper->num_bytes); qgroup->excl += sign * oper->num_bytes; - if (sign < 0) - WARN_ON(qgroup->excl < oper->num_bytes); qgroup->excl_cmpr += sign * oper->num_bytes; qgroup_dirty(fs_info, qgroup); -- cgit v0.10.2 From 78f55e5e1fa9f684705325667d455a957f43ad66 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Tue, 13 Jan 2015 00:55:10 +0800 Subject: Btrfs: fix unused members in struct btrfs_root There isn't any real use of following members of struct btrfs_root so delete them. struct kobject root_kobj; struct completion kobj_unregister; Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: Chris Mason diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 2ecdac0..85c697d 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1798,8 +1798,6 @@ struct btrfs_root { struct btrfs_fs_info *fs_info; struct extent_io_tree dirty_log_pages; - struct kobject root_kobj; - struct completion kobj_unregister; struct mutex objectid_mutex; spinlock_t accounting_lock; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 0696a10..612d46e 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1275,12 +1275,10 @@ static void __setup_root(u32 nodesize, u32 sectorsize, u32 stripesize, memset(&root->root_key, 0, sizeof(root->root_key)); memset(&root->root_item, 0, sizeof(root->root_item)); memset(&root->defrag_progress, 0, sizeof(root->defrag_progress)); - memset(&root->root_kobj, 0, sizeof(root->root_kobj)); if (fs_info) root->defrag_trans_start = fs_info->generation; else root->defrag_trans_start = 0; - init_completion(&root->kobj_unregister); root->root_key.objectid = objectid; root->anon_dev = 0; -- cgit v0.10.2 From 95449a1626fb6b643b576b9fbafed02793627577 Mon Sep 17 00:00:00 2001 From: chandan Date: Thu, 15 Jan 2015 12:22:03 +0530 Subject: Btrfs: insert_new_root: Fix lock type of the extent buffer. btrfs_alloc_tree_block() returns an extent buffer on which a blocked lock has been taken. Hence assign the appropriate value to path->locks[level]. Signed-off-by: Chandan Rajendra Signed-off-by: Chris Mason diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index fdd8fb4..9936421 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -3380,7 +3380,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, add_root_to_dirty_list(root); extent_buffer_get(c); path->nodes[level] = c; - path->locks[level] = BTRFS_WRITE_LOCK; + path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; path->slots[level] = 0; return 0; } -- cgit v0.10.2 From a937b9791ec2ee71d5e303d2c02c8c1ad8abff35 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 12 Dec 2014 17:39:12 +0100 Subject: btrfs: kill btrfs_inode_*time helpers They just opencode taking address of the timespec member. 
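The equivalence is easy to check in isolation; below is a stand-alone model with an illustrative struct layout, not the on-disk btrfs_inode_item:

#include <stddef.h>
#include <stdio.h>

struct ts { long long sec; int nsec; };
struct inode_item { long long size; struct ts atime, mtime, ctime; };

/* What the removed helpers did: rebuild the member's address from the
 * struct base via offsetof. */
static struct ts *inode_atime(struct inode_item *it)
{
	return (struct ts *)((char *)it + offsetof(struct inode_item, atime));
}

int main(void)
{
	struct inode_item it = { 0 };

	/* Prints "1": both expressions name the same object, which is why
	 * the helper can be replaced by plain &it.atime everywhere. */
	printf("%d\n", inode_atime(&it) == &it.atime);
	return 0;
}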
Signed-off-by: David Sterba Signed-off-by: Chris Mason diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 85c697d..54a66fa 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2467,31 +2467,6 @@ BTRFS_SETGET_STACK_FUNCS(stack_inode_gid, struct btrfs_inode_item, gid, 32); BTRFS_SETGET_STACK_FUNCS(stack_inode_mode, struct btrfs_inode_item, mode, 32); BTRFS_SETGET_STACK_FUNCS(stack_inode_rdev, struct btrfs_inode_item, rdev, 64); BTRFS_SETGET_STACK_FUNCS(stack_inode_flags, struct btrfs_inode_item, flags, 64); - -static inline struct btrfs_timespec * -btrfs_inode_atime(struct btrfs_inode_item *inode_item) -{ - unsigned long ptr = (unsigned long)inode_item; - ptr += offsetof(struct btrfs_inode_item, atime); - return (struct btrfs_timespec *)ptr; -} - -static inline struct btrfs_timespec * -btrfs_inode_mtime(struct btrfs_inode_item *inode_item) -{ - unsigned long ptr = (unsigned long)inode_item; - ptr += offsetof(struct btrfs_inode_item, mtime); - return (struct btrfs_timespec *)ptr; -} - -static inline struct btrfs_timespec * -btrfs_inode_ctime(struct btrfs_inode_item *inode_item) -{ - unsigned long ptr = (unsigned long)inode_item; - ptr += offsetof(struct btrfs_inode_item, ctime); - return (struct btrfs_timespec *)ptr; -} - BTRFS_SETGET_FUNCS(timespec_sec, struct btrfs_timespec, sec, 64); BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32); BTRFS_SETGET_STACK_FUNCS(stack_timespec_sec, struct btrfs_timespec, sec, 64); diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index de4e70f..116eb4b 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1755,19 +1755,19 @@ static void fill_stack_inode_item(struct btrfs_trans_handle *trans, btrfs_set_stack_inode_flags(inode_item, BTRFS_I(inode)->flags); btrfs_set_stack_inode_block_group(inode_item, 0); - btrfs_set_stack_timespec_sec(btrfs_inode_atime(inode_item), + btrfs_set_stack_timespec_sec(&inode_item->atime, inode->i_atime.tv_sec); - btrfs_set_stack_timespec_nsec(btrfs_inode_atime(inode_item), + btrfs_set_stack_timespec_nsec(&inode_item->atime, inode->i_atime.tv_nsec); - btrfs_set_stack_timespec_sec(btrfs_inode_mtime(inode_item), + btrfs_set_stack_timespec_sec(&inode_item->mtime, inode->i_mtime.tv_sec); - btrfs_set_stack_timespec_nsec(btrfs_inode_mtime(inode_item), + btrfs_set_stack_timespec_nsec(&inode_item->mtime, inode->i_mtime.tv_nsec); - btrfs_set_stack_timespec_sec(btrfs_inode_ctime(inode_item), + btrfs_set_stack_timespec_sec(&inode_item->ctime, inode->i_ctime.tv_sec); - btrfs_set_stack_timespec_nsec(btrfs_inode_ctime(inode_item), + btrfs_set_stack_timespec_nsec(&inode_item->ctime, inode->i_ctime.tv_nsec); } @@ -1775,7 +1775,6 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev) { struct btrfs_delayed_node *delayed_node; struct btrfs_inode_item *inode_item; - struct btrfs_timespec *tspec; delayed_node = btrfs_get_delayed_node(inode); if (!delayed_node) @@ -1802,17 +1801,14 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev) *rdev = btrfs_stack_inode_rdev(inode_item); BTRFS_I(inode)->flags = btrfs_stack_inode_flags(inode_item); - tspec = btrfs_inode_atime(inode_item); - inode->i_atime.tv_sec = btrfs_stack_timespec_sec(tspec); - inode->i_atime.tv_nsec = btrfs_stack_timespec_nsec(tspec); + inode->i_atime.tv_sec = btrfs_stack_timespec_sec(&inode_item->atime); + inode->i_atime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->atime); - tspec = btrfs_inode_mtime(inode_item); - inode->i_mtime.tv_sec = btrfs_stack_timespec_sec(tspec); - inode->i_mtime.tv_nsec = 
btrfs_stack_timespec_nsec(tspec); + inode->i_mtime.tv_sec = btrfs_stack_timespec_sec(&inode_item->mtime); + inode->i_mtime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->mtime); - tspec = btrfs_inode_ctime(inode_item); - inode->i_ctime.tv_sec = btrfs_stack_timespec_sec(tspec); - inode->i_ctime.tv_nsec = btrfs_stack_timespec_nsec(tspec); + inode->i_ctime.tv_sec = btrfs_stack_timespec_sec(&inode_item->ctime); + inode->i_ctime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->ctime); inode->i_generation = BTRFS_I(inode)->generation; BTRFS_I(inode)->index_cnt = (u64)-1; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 2d14acb..575164d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3490,7 +3490,6 @@ static void btrfs_read_locked_inode(struct inode *inode) struct btrfs_path *path; struct extent_buffer *leaf; struct btrfs_inode_item *inode_item; - struct btrfs_timespec *tspec; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_key location; unsigned long ptr; @@ -3527,17 +3526,14 @@ static void btrfs_read_locked_inode(struct inode *inode) i_gid_write(inode, btrfs_inode_gid(leaf, inode_item)); btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item)); - tspec = btrfs_inode_atime(inode_item); - inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, tspec); - inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, tspec); + inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->atime); + inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->atime); - tspec = btrfs_inode_mtime(inode_item); - inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, tspec); - inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, tspec); + inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->mtime); + inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->mtime); - tspec = btrfs_inode_ctime(inode_item); - inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, tspec); - inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, tspec); + inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->ctime); + inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->ctime); inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item)); BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item); @@ -3658,19 +3654,19 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token); btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token); - btrfs_set_token_timespec_sec(leaf, btrfs_inode_atime(item), + btrfs_set_token_timespec_sec(leaf, &item->atime, inode->i_atime.tv_sec, &token); - btrfs_set_token_timespec_nsec(leaf, btrfs_inode_atime(item), + btrfs_set_token_timespec_nsec(leaf, &item->atime, inode->i_atime.tv_nsec, &token); - btrfs_set_token_timespec_sec(leaf, btrfs_inode_mtime(item), + btrfs_set_token_timespec_sec(leaf, &item->mtime, inode->i_mtime.tv_sec, &token); - btrfs_set_token_timespec_nsec(leaf, btrfs_inode_mtime(item), + btrfs_set_token_timespec_nsec(leaf, &item->mtime, inode->i_mtime.tv_nsec, &token); - btrfs_set_token_timespec_sec(leaf, btrfs_inode_ctime(item), + btrfs_set_token_timespec_sec(leaf, &item->ctime, inode->i_ctime.tv_sec, &token); - btrfs_set_token_timespec_nsec(leaf, btrfs_inode_ctime(item), + btrfs_set_token_timespec_nsec(leaf, &item->ctime, inode->i_ctime.tv_nsec, &token); btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode), diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 804432d..fe58572 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -2471,12 +2471,9 @@ 
verbose_printk("btrfs: send_utimes %llu\n", ino); if (ret < 0) goto out; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); - TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_ATIME, eb, - btrfs_inode_atime(ii)); - TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_MTIME, eb, - btrfs_inode_mtime(ii)); - TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_CTIME, eb, - btrfs_inode_ctime(ii)); + TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_ATIME, eb, &ii->atime); + TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_MTIME, eb, &ii->mtime); + TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_CTIME, eb, &ii->ctime); /* TODO Add otime support when the otime patches get into upstream */ ret = send_cmd(sctx); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index a266587..79f05bc 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3276,19 +3276,19 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token); btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token); - btrfs_set_token_timespec_sec(leaf, btrfs_inode_atime(item), + btrfs_set_token_timespec_sec(leaf, &item->atime, inode->i_atime.tv_sec, &token); - btrfs_set_token_timespec_nsec(leaf, btrfs_inode_atime(item), + btrfs_set_token_timespec_nsec(leaf, &item->atime, inode->i_atime.tv_nsec, &token); - btrfs_set_token_timespec_sec(leaf, btrfs_inode_mtime(item), + btrfs_set_token_timespec_sec(leaf, &item->mtime, inode->i_mtime.tv_sec, &token); - btrfs_set_token_timespec_nsec(leaf, btrfs_inode_mtime(item), + btrfs_set_token_timespec_nsec(leaf, &item->mtime, inode->i_mtime.tv_nsec, &token); - btrfs_set_token_timespec_sec(leaf, btrfs_inode_ctime(item), + btrfs_set_token_timespec_sec(leaf, &item->ctime, inode->i_ctime.tv_sec, &token); - btrfs_set_token_timespec_nsec(leaf, btrfs_inode_ctime(item), + btrfs_set_token_timespec_nsec(leaf, &item->ctime, inode->i_ctime.tv_nsec, &token); btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode), -- cgit v0.10.2 From 9cc97d646216b6f2473fa4ab9f103514b86c6814 Mon Sep 17 00:00:00 2001 From: chandan r Date: Wed, 4 Jul 2012 12:48:07 +0530 Subject: Btrfs: Add code to support file creation time This patch adds a new member to the 'struct btrfs_inode' structure to hold the file creation time. Signed-off-by: chandan [refreshed, removed btrfs_inode_otime] Signed-off-by: David Sterba Signed-off-by: Chris Mason diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 4aadadc..de5e4f2 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -185,6 +185,9 @@ struct btrfs_inode { struct btrfs_delayed_node *delayed_node; + /* File creation time. 
*/ + struct timespec i_otime; + struct inode vfs_inode; }; diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 116eb4b..82f0c7c 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1769,6 +1769,11 @@ static void fill_stack_inode_item(struct btrfs_trans_handle *trans, inode->i_ctime.tv_sec); btrfs_set_stack_timespec_nsec(&inode_item->ctime, inode->i_ctime.tv_nsec); + + btrfs_set_stack_timespec_sec(&inode_item->otime, + BTRFS_I(inode)->i_otime.tv_sec); + btrfs_set_stack_timespec_nsec(&inode_item->otime, + BTRFS_I(inode)->i_otime.tv_nsec); } int btrfs_fill_inode(struct inode *inode, u32 *rdev) @@ -1810,6 +1815,11 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev) inode->i_ctime.tv_sec = btrfs_stack_timespec_sec(&inode_item->ctime); inode->i_ctime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->ctime); + BTRFS_I(inode)->i_otime.tv_sec = + btrfs_stack_timespec_sec(&inode_item->otime); + BTRFS_I(inode)->i_otime.tv_nsec = + btrfs_stack_timespec_nsec(&inode_item->otime); + inode->i_generation = BTRFS_I(inode)->generation; BTRFS_I(inode)->index_cnt = (u64)-1; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 575164d..b029233 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3535,6 +3535,11 @@ static void btrfs_read_locked_inode(struct inode *inode) inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->ctime); inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->ctime); + BTRFS_I(inode)->i_otime.tv_sec = + btrfs_timespec_sec(leaf, &inode_item->otime); + BTRFS_I(inode)->i_otime.tv_nsec = + btrfs_timespec_nsec(leaf, &inode_item->otime); + inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item)); BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item); BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item); @@ -3669,6 +3674,11 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, btrfs_set_token_timespec_nsec(leaf, &item->ctime, inode->i_ctime.tv_nsec, &token); + btrfs_set_token_timespec_sec(leaf, &item->otime, + BTRFS_I(inode)->i_otime.tv_sec, &token); + btrfs_set_token_timespec_nsec(leaf, &item->otime, + BTRFS_I(inode)->i_otime.tv_nsec, &token); + btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode), &token); btrfs_set_token_inode_generation(leaf, item, BTRFS_I(inode)->generation, @@ -5260,7 +5270,10 @@ static struct inode *new_simple_dir(struct super_block *s, inode->i_op = &btrfs_dir_ro_inode_operations; inode->i_fop = &simple_dir_operations; inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO; - inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + inode->i_mtime = CURRENT_TIME; + inode->i_atime = inode->i_mtime; + inode->i_ctime = inode->i_mtime; + BTRFS_I(inode)->i_otime = inode->i_mtime; return inode; } @@ -5828,7 +5841,12 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, inode_init_owner(inode, dir, mode); inode_set_bytes(inode, 0); - inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + + inode->i_mtime = CURRENT_TIME; + inode->i_atime = inode->i_mtime; + inode->i_ctime = inode->i_mtime; + BTRFS_I(inode)->i_otime = inode->i_mtime; + inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_inode_item); memset_extent_buffer(path->nodes[0], 0, (unsigned long)inode_item, @@ -8577,6 +8595,9 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->delayed_node = NULL; + ei->i_otime.tv_sec = 0; + ei->i_otime.tv_nsec = 0; + inode = &ei->vfs_inode; extent_map_tree_init(&ei->extent_tree); 
extent_io_tree_init(&ei->io_tree, &inode->i_data); -- cgit v0.10.2 From 75d6ad382bb91f363452119d34238e156589ca2d Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 31 Oct 2014 17:18:08 +0100 Subject: btrfs: more superblock checks, lower bounds on devices and sectorsize/nodesize I received a few crafted images from Jiri, all got through the recently added superblock checks. The lower bounds checks for num_devices and sector/node -sizes were missing and caused a crash during mount. Tools for symbolic code execution were used to prepare the images contents. Reported-by: Jiri Slaby Signed-off-by: David Sterba Signed-off-by: Chris Mason diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 612d46e..1117136 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3871,6 +3871,21 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, printk(KERN_WARNING "BTRFS: log_root block unaligned: %llu\n", btrfs_super_log_root(sb)); + /* + * Check the lower bound, the alignment and other constraints are + * checked later. + */ + if (btrfs_super_nodesize(sb) < 4096) { + printk(KERN_ERR "BTRFS: nodesize too small: %u < 4096\n", + btrfs_super_nodesize(sb)); + ret = -EINVAL; + } + if (btrfs_super_sectorsize(sb) < 4096) { + printk(KERN_ERR "BTRFS: sectorsize too small: %u < 4096\n", + btrfs_super_sectorsize(sb)); + ret = -EINVAL; + } + if (memcmp(fs_info->fsid, sb->dev_item.fsid, BTRFS_UUID_SIZE) != 0) { printk(KERN_ERR "BTRFS: dev_item UUID does not match fsid: %pU != %pU\n", fs_info->fsid, sb->dev_item.fsid); @@ -3884,6 +3899,10 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, if (btrfs_super_num_devices(sb) > (1UL << 31)) printk(KERN_WARNING "BTRFS: suspicious number of devices: %llu\n", btrfs_super_num_devices(sb)); + if (btrfs_super_num_devices(sb) == 0) { + printk(KERN_ERR "BTRFS: number of devices is 0\n"); + ret = -EINVAL; + } if (btrfs_super_bytenr(sb) != BTRFS_SUPER_INFO_OFFSET) { printk(KERN_ERR "BTRFS: super offset mismatch %llu != %u\n", -- cgit v0.10.2 From ce7fca5f57ed0fcd7e7b3d7b1a3e1791f8e56fa3 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 31 Oct 2014 18:42:05 +0100 Subject: btrfs: add checks for sys_chunk_array sizes Verify that possible minimum and maximum size is set, validity of contents is checked in btrfs_read_sys_array. Signed-off-by: David Sterba Signed-off-by: Chris Mason diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 1117136..263d147 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3911,6 +3911,25 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, } /* + * Obvious sys_chunk_array corruptions, it must hold at least one key + * and one chunk + */ + if (btrfs_super_sys_array_size(sb) > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) { + printk(KERN_ERR "BTRFS: system chunk array too big %u > %u\n", + btrfs_super_sys_array_size(sb), + BTRFS_SYSTEM_CHUNK_ARRAY_SIZE); + ret = -EINVAL; + } + if (btrfs_super_sys_array_size(sb) < sizeof(struct btrfs_disk_key) + + sizeof(struct btrfs_chunk)) { + printk(KERN_ERR "BTRFS: system chunk array too small %u < %lu\n", + btrfs_super_sys_array_size(sb), + sizeof(struct btrfs_disk_key) + + sizeof(struct btrfs_chunk)); + ret = -EINVAL; + } + + /* * The generation is a global counter, we'll trust it more than the others * but it's still possible that it's the one that's wrong. 
*/ -- cgit v0.10.2 From 1ffb22cf8c322bbfea6b35fe23d025841b49fede Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 31 Oct 2014 19:02:42 +0100 Subject: btrfs: cleanup, rename a few variables in btrfs_read_sys_array There's a pointer to buffer, integer offset and offset passed as pointer, try to find matching names for them. Signed-off-by: David Sterba Signed-off-by: Chris Mason diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index da7e0e1..b33c83b 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6252,13 +6252,13 @@ int btrfs_read_sys_array(struct btrfs_root *root) struct extent_buffer *sb; struct btrfs_disk_key *disk_key; struct btrfs_chunk *chunk; - u8 *ptr; - unsigned long sb_ptr; + u8 *array_ptr; + unsigned long sb_array_offset; int ret = 0; u32 num_stripes; u32 array_size; u32 len = 0; - u32 cur; + u32 cur_offset; struct btrfs_key key; ASSERT(BTRFS_SUPER_INFO_SIZE <= root->nodesize); @@ -6290,20 +6290,21 @@ int btrfs_read_sys_array(struct btrfs_root *root) write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE); array_size = btrfs_super_sys_array_size(super_copy); - ptr = super_copy->sys_chunk_array; - sb_ptr = offsetof(struct btrfs_super_block, sys_chunk_array); - cur = 0; + array_ptr = super_copy->sys_chunk_array; + sb_array_offset = offsetof(struct btrfs_super_block, sys_chunk_array); + cur_offset = 0; - while (cur < array_size) { - disk_key = (struct btrfs_disk_key *)ptr; + while (cur_offset < array_size) { + disk_key = (struct btrfs_disk_key *)array_ptr; btrfs_disk_key_to_cpu(&key, disk_key); - len = sizeof(*disk_key); ptr += len; - sb_ptr += len; - cur += len; + len = sizeof(*disk_key); + array_ptr += len; + sb_array_offset += len; + cur_offset += len; if (key.type == BTRFS_CHUNK_ITEM_KEY) { - chunk = (struct btrfs_chunk *)sb_ptr; + chunk = (struct btrfs_chunk *)sb_array_offset; ret = read_one_chunk(root, &key, sb, chunk); if (ret) break; @@ -6313,9 +6314,9 @@ int btrfs_read_sys_array(struct btrfs_root *root) ret = -EIO; break; } - ptr += len; - sb_ptr += len; - cur += len; + array_ptr += len; + sb_array_offset += len; + cur_offset += len; } free_extent_buffer(sb); return ret; -- cgit v0.10.2 From e3540eab29e1b2260bc4b9b3979a49a00e3e3af8 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 5 Nov 2014 15:24:51 +0100 Subject: btrfs: add more checks to btrfs_read_sys_array Verify that the sys_array has enough bytes to read the next item. 
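A minimal, self-contained sketch of this bounds-check pattern follows; the stand-in types, STRIPE_LEN and walk_sys_array() are illustrative assumptions, not the kernel code:

#include <stdint.h>

#define STRIPE_LEN 32	/* toy per-stripe record size */

struct disk_key { uint64_t objectid; uint8_t type; uint64_t offset; };
struct chunk_stub { uint64_t length; uint16_t num_stripes; };

/* Return 0 if every item fits inside the array, -1 on a short read. */
static int walk_sys_array(const uint8_t *array, uint32_t array_size)
{
	uint32_t cur = 0;

	while (cur < array_size) {
		const struct chunk_stub *chunk;
		uint32_t len = sizeof(struct disk_key);

		/* never read a key that extends past the end of the array */
		if (cur + len > array_size)
			return -1;
		cur += len;

		/* at least a chunk with one stripe must fit ... */
		len = sizeof(*chunk) + 1 * STRIPE_LEN;
		if (cur + len > array_size)
			return -1;

		/* ... and only then is the stripe count safe to read */
		chunk = (const struct chunk_stub *)(array + cur);
		len = sizeof(*chunk) + chunk->num_stripes * STRIPE_LEN;
		if (cur + len > array_size)
			return -1;
		cur += len;
	}
	return 0;
}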
Signed-off-by: David Sterba Signed-off-by: Chris Mason diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index b33c83b..2c4cab2 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6296,20 +6296,34 @@ int btrfs_read_sys_array(struct btrfs_root *root) while (cur_offset < array_size) { disk_key = (struct btrfs_disk_key *)array_ptr; + len = sizeof(*disk_key); + if (cur_offset + len > array_size) + goto out_short_read; + btrfs_disk_key_to_cpu(&key, disk_key); - len = sizeof(*disk_key); array_ptr += len; sb_array_offset += len; cur_offset += len; if (key.type == BTRFS_CHUNK_ITEM_KEY) { chunk = (struct btrfs_chunk *)sb_array_offset; + /* + * At least one btrfs_chunk with one stripe must be + * present, exact stripe count check comes afterwards + */ + len = btrfs_chunk_item_size(1); + if (cur_offset + len > array_size) + goto out_short_read; + + num_stripes = btrfs_chunk_num_stripes(sb, chunk); + len = btrfs_chunk_item_size(num_stripes); + if (cur_offset + len > array_size) + goto out_short_read; + ret = read_one_chunk(root, &key, sb, chunk); if (ret) break; - num_stripes = btrfs_chunk_num_stripes(sb, chunk); - len = btrfs_chunk_item_size(num_stripes); } else { ret = -EIO; break; @@ -6320,6 +6334,12 @@ int btrfs_read_sys_array(struct btrfs_root *root) } free_extent_buffer(sb); return ret; + +out_short_read: + printk(KERN_ERR "BTRFS: sys_array too short to read %u bytes at offset %u\n", + len, cur_offset); + free_extent_buffer(sb); + return -EIO; } int btrfs_read_chunk_tree(struct btrfs_root *root) -- cgit v0.10.2 From d4b450cd4b33ce7c572e7fdccf33b59c4cdf361c Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 29 Jan 2015 19:18:25 +0000 Subject: Btrfs: fix race between transaction commit and empty block group removal Committing a transaction can race with automatic removal of empty block groups (cleaner kthread), leading to a BUG_ON() in the transaction commit code while running btrfs_finish_extent_commit(). The following sequence diagram shows how it can happen: CPU 1 CPU 2 btrfs_commit_transaction() fs_info->running_transaction = NULL btrfs_finish_extent_commit() find_first_extent_bit() -> found range for block group X in fs_info->freed_extents[] btrfs_delete_unused_bgs() -> found block group X Removed block group X's range from fs_info->freed_extents[] btrfs_remove_chunk() btrfs_remove_block_group(bg X) unpin_extent_range(bg X range) btrfs_lookup_block_group(bg X) -> returns NULL -> BUG_ON() The trace that results from the BUG_ON() is: [48665.187808] ------------[ cut here ]------------ [48665.188032] kernel BUG at fs/btrfs/extent-tree.c:5675! 
[48665.188032] invalid opcode: 0000 [#1] SMP DEBUG_PAGEALLOC [48665.188032] Modules linked in: dm_flakey dm_mod crc32c_generic btrfs xor raid6_pq nfsd auth_rpcgss oid_registry nfs_acl nfs lockd grace fscache sunrpc loop parport_pc evdev microcode [48665.197388] CPU: 2 PID: 31211 Comm: kworker/u32:16 Tainted: G W 3.19.0-rc5-btrfs-next-4+ #1 [48665.197388] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.7.5-0-ge51488c-20140602_164612-nilsson.home.kraxel.org 04/01/2014 [48665.197388] Workqueue: events_unbound btrfs_async_reclaim_metadata_space [btrfs] [48665.197388] task: ffff880222011810 ti: ffff8801b56a4000 task.ti: ffff8801b56a4000 [48665.197388] RIP: 0010:[] [] unpin_extent_range+0x6a/0x1ba [btrfs] [48665.197388] RSP: 0018:ffff8801b56a7b88 EFLAGS: 00010246 [48665.197388] RAX: 0000000000000000 RBX: ffff8802143a6000 RCX: ffff8802220120c8 [48665.197388] RDX: 0000000000000001 RSI: 0000000000000001 RDI: ffff8800a3c140b0 [48665.197388] RBP: ffff8801b56a7bd8 R08: 0000000000000003 R09: 0000000000000000 [48665.197388] R10: 0000000000000000 R11: 000000000000bbac R12: 0000000012e8e000 [48665.197388] R13: ffff8800a3c14000 R14: 0000000000000000 R15: 0000000000000000 [48665.197388] FS: 0000000000000000(0000) GS:ffff88023ec40000(0000) knlGS:0000000000000000 [48665.197388] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b [48665.197388] CR2: 00007f065e42f270 CR3: 0000000206f70000 CR4: 00000000000006e0 [48665.197388] Stack: [48665.197388] ffff8801b56a7bd8 0000000012ea0000 01ff8800a3c14138 0000000012e9ffff [48665.197388] ffff880141df3dd8 ffff8802143a6000 ffff8800a3c14138 ffff880141df3df0 [48665.197388] ffff880141df3dd8 0000000000000000 ffff8801b56a7c08 ffffffffa0354227 [48665.197388] Call Trace: [48665.197388] [] btrfs_finish_extent_commit+0xb0/0xd9 [btrfs] [48665.197388] [] btrfs_commit_transaction+0x791/0x92c [btrfs] [48665.197388] [] flush_space+0x43d/0x452 [btrfs] [48665.197388] [] ? _raw_spin_unlock+0x28/0x33 [48665.197388] [] btrfs_async_reclaim_metadata_space+0x118/0x164 [btrfs] [48665.197388] [] ? process_one_work+0x14b/0x3ab [48665.197388] [] process_one_work+0x1e0/0x3ab [48665.197388] [] ? trace_hardirqs_off+0xd/0xf [48665.197388] [] worker_thread+0x210/0x2d0 [48665.197388] [] ? rescuer_thread+0x2c3/0x2c3 [48665.197388] [] kthread+0xef/0xf7 [48665.197388] [] ? _raw_spin_unlock_irq+0x2d/0x39 [48665.197388] [] ? __kthread_parkme+0xad/0xad [48665.197388] [] ret_from_fork+0x7c/0xb0 [48665.197388] [] ? 
__kthread_parkme+0xad/0xad [48665.197388] Code: 85 f6 74 14 49 8b 06 49 03 46 09 49 39 c4 72 1d 4c 89 f7 e8 83 ec ff ff 4c 89 e6 4c 89 ef e8 1e f1 ff ff 48 85 c0 49 89 c6 75 02 <0f> 0b 49 8b 1e 49 03 5e 09 48 8b [48665.197388] RIP [] unpin_extent_range+0x6a/0x1ba [btrfs] [48665.197388] RSP [48665.272246] ---[ end trace b9c6ab9957521376 ]--- Fix this by ensuring that unpinning the block group's range in btrfs_finish_extent_commit() is done in a synchronized fashion with removing the block group's range from freed_extents[] in btrfs_delete_unused_bgs(). This race was introduced with the change: Btrfs: remove empty block groups automatically (commit 47ab2a6c689913db23ccae38349714edf8365e0a) Signed-off-by: Filipe Manana Signed-off-by: Chris Mason diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 54a66fa..d3562dd 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1744,6 +1744,7 @@ struct btrfs_fs_info { spinlock_t unused_bgs_lock; struct list_head unused_bgs; + struct mutex unused_bg_unpin_mutex; /* For btrfs to record security options */ struct security_mnt_opts security_opts; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 263d147..41b320e 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2242,6 +2242,7 @@ int open_ctree(struct super_block *sb, spin_lock_init(&fs_info->qgroup_op_lock); spin_lock_init(&fs_info->buffer_lock); spin_lock_init(&fs_info->unused_bgs_lock); + mutex_init(&fs_info->unused_bg_unpin_mutex); rwlock_init(&fs_info->tree_mod_log_lock); mutex_init(&fs_info->reloc_mutex); mutex_init(&fs_info->delalloc_root_mutex); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 53294da..857a859 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5735,10 +5735,13 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, unpin = &fs_info->freed_extents[0]; while (1) { + mutex_lock(&fs_info->unused_bg_unpin_mutex); ret = find_first_extent_bit(unpin, 0, &start, &end, EXTENT_DIRTY, NULL); - if (ret) + if (ret) { + mutex_unlock(&fs_info->unused_bg_unpin_mutex); break; + } if (btrfs_test_opt(root, DISCARD)) ret = btrfs_discard_extent(root, start, @@ -5746,6 +5749,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, clear_extent_dirty(unpin, start, end, GFP_NOFS); unpin_extent_range(root, start, end, true); + mutex_unlock(&fs_info->unused_bg_unpin_mutex); cond_resched(); } @@ -9561,18 +9565,33 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) */ start = block_group->key.objectid; end = start + block_group->key.offset - 1; + /* + * Hold the unused_bg_unpin_mutex lock to avoid racing with + * btrfs_finish_extent_commit(). If we are at transaction N, + * another task might be running finish_extent_commit() for the + * previous transaction N - 1, and have seen a range belonging + * to the block group in freed_extents[] before we were able to + * clear the whole block group range from freed_extents[]. This + * means that task can lookup for the block group after we + * unpinned it from freed_extents[] and removed it, leading to + * a BUG_ON() at btrfs_unpin_extent_range().
+ */ + mutex_lock(&fs_info->unused_bg_unpin_mutex); ret = clear_extent_bits(&fs_info->freed_extents[0], start, end, EXTENT_DIRTY, GFP_NOFS); if (ret) { + mutex_unlock(&fs_info->unused_bg_unpin_mutex); btrfs_set_block_group_rw(root, block_group); goto end_trans; } ret = clear_extent_bits(&fs_info->freed_extents[1], start, end, EXTENT_DIRTY, GFP_NOFS); if (ret) { + mutex_unlock(&fs_info->unused_bg_unpin_mutex); btrfs_set_block_group_rw(root, block_group); goto end_trans; } + mutex_unlock(&fs_info->unused_bg_unpin_mutex); /* Reset pinned so btrfs_put_block_group doesn't complain */ block_group->pinned = 0; -- cgit v0.10.2 From 001a648df40ce6e7906773587f5fff48f61d0d73 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 23 Jan 2015 18:27:00 +0000 Subject: Btrfs: add missing cleanup on sysfs init failure If we failed during initialization of sysfs, we weren't unregistering the top level btrfs sysfs entry nor the debugfs stuff. Not unregistering the top level sysfs entry makes future attempts to reload the btrfs module impossible and the following is reported in dmesg: [ 2246.451296] WARNING: CPU: 3 PID: 10999 at fs/sysfs/dir.c:486 sysfs_warn_dup+0x91/0xb0() [ 2246.451298] sysfs: cannot create duplicate filename '/fs/btrfs' [ 2246.451298] Modules linked in: btrfs(+) raid6_pq xor bnep rfcomm bluetooth binfmt_misc nfsd auth_rpcgss oid_registry nfs_acl nfs lockd fscache sunrpc parport_pc parport psmouse serio_raw pcspkr evbug i2c_piix4 e1000 floppy [last unloaded: btrfs] [ 2246.451310] CPU: 3 PID: 10999 Comm: modprobe Tainted: G W 3.13.0-fdm-btrfs-next-24+ #7 [ 2246.451311] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 [ 2246.451312] 0000000000000009 ffff8800d353fa08 ffffffff816f1da6 0000000000000410 [ 2246.451314] ffff8800d353fa58 ffff8800d353fa48 ffffffff8104a32c ffff88020821a290 [ 2246.451316] ffff88020821a290 ffff88020821a290 ffff8802148f0000 ffff8800d353fb80 [ 2246.451318] Call Trace: [ 2246.451322] [] dump_stack+0x4e/0x68 [ 2246.451324] [] warn_slowpath_common+0x8c/0xc0 [ 2246.451325] [] warn_slowpath_fmt+0x46/0x50 [ 2246.451328] [] ? strlcat+0x65/0x90 (....) 
This fixes the following change: btrfs: add simple debugfs interface commit 1bae30982bc86ab66d61ccb6e22792593b45d44d Signed-off-by: Filipe Manana Signed-off-by: Chris Mason diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 92db3f6..94edb0a 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -733,10 +733,18 @@ int btrfs_init_sysfs(void) ret = btrfs_init_debugfs(); if (ret) - return ret; + goto out1; init_feature_attrs(); ret = sysfs_create_group(&btrfs_kset->kobj, &btrfs_feature_attr_group); + if (ret) + goto out2; + + return 0; +out2: + debugfs_remove_recursive(btrfs_debugfs_root_dentry); +out1: + kset_unregister(btrfs_kset); return ret; } -- cgit v0.10.2 From de554a4fa61d77df2704be5b6b47472b2dbd1875 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 27 Jan 2015 23:06:42 +0000 Subject: Btrfs: fix scrub race leading to use-after-free While running a scrub on a kernel with CONFIG_DEBUG_PAGEALLOC=y, I got the following trace: [68127.807663] BUG: unable to handle kernel paging request at ffff8803f8947a50 [68127.807663] IP: [] do_raw_spin_lock+0x94/0x122 [68127.807663] PGD 3003067 PUD 43e1f5067 PMD 43e030067 PTE 80000003f8947060 [68127.807663] Oops: 0000 [#1] SMP DEBUG_PAGEALLOC [68127.807663] Modules linked in: dm_flakey dm_mod crc32c_generic btrfs xor raid6_pq nfsd auth_rpcgss oid_registry nfs_acl nfs lockd grace fscache sunrpc loop parport_pc processor parpo [68127.807663] CPU: 2 PID: 3081 Comm: kworker/u8:5 Not tainted 3.18.0-rc6-btrfs-next-3+ #4 [68127.807663] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.7.5-0-ge51488c-20140602_164612-nilsson.home.kraxel.org 04/01/2014 [68127.807663] Workqueue: btrfs-btrfs-scrub btrfs_scrub_helper [btrfs] [68127.807663] task: ffff880101fc5250 ti: ffff8803f097c000 task.ti: ffff8803f097c000 [68127.807663] RIP: 0010:[] [] do_raw_spin_lock+0x94/0x122 [68127.807663] RSP: 0018:ffff8803f097fbb8 EFLAGS: 00010093 [68127.807663] RAX: 0000000028dd386c RBX: ffff8803f8947a50 RCX: 0000000028dd3854 [68127.807663] RDX: 0000000000000018 RSI: 0000000000000002 RDI: 0000000000000001 [68127.807663] RBP: ffff8803f097fbd8 R08: 0000000000000004 R09: 0000000000000001 [68127.807663] R10: ffff880102620980 R11: ffff8801f3e8c900 R12: 000000000001d390 [68127.807663] R13: 00000000cabd13c8 R14: ffff8803f8947800 R15: ffff88037c574f00 [68127.807663] FS: 0000000000000000(0000) GS:ffff88043dd00000(0000) knlGS:0000000000000000 [68127.807663] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b [68127.807663] CR2: ffff8803f8947a50 CR3: 00000000b6481000 CR4: 00000000000006e0 [68127.807663] Stack: [68127.807663] ffffffff823942a8 ffff8803f8947a50 ffff8802a3416f80 0000000000000000 [68127.807663] ffff8803f097fc18 ffffffff8141e7c0 ffffffff81072948 000000000034f314 [68127.807663] ffff8803f097fc08 0000000000000292 ffff8803f097fc48 ffff8803f8947a50 [68127.807663] Call Trace: [68127.807663] [] _raw_spin_lock_irqsave+0x4b/0x55 [68127.807663] [] ? __wake_up+0x22/0x4b [68127.807663] [] __wake_up+0x22/0x4b [68127.807663] [] scrub_pending_bio_dec+0x32/0x36 [btrfs] [68127.807663] [] scrub_bio_end_io_worker+0x5a3/0x5c9 [btrfs] [68127.807663] [] ? time_hardirqs_off+0x15/0x28 [68127.807663] [] ? trace_hardirqs_off_caller+0x4c/0xb9 [68127.807663] [] normal_work_helper+0xf1/0x238 [btrfs] [68127.807663] [] btrfs_scrub_helper+0x12/0x14 [btrfs] [68127.807663] [] process_one_work+0x1e4/0x3b6 [68127.807663] [] ? trace_hardirqs_off+0xd/0xf [68127.807663] [] worker_thread+0x1fb/0x2a8 [68127.807663] [] ? rescuer_thread+0x219/0x219 [68127.807663] [] kthread+0xdb/0xe3 [68127.807663] [] ? 
__kthread_parkme+0x67/0x67 [68127.807663] [] ret_from_fork+0x7c/0xb0 [68127.807663] [] ? __kthread_parkme+0x67/0x67 [68127.807663] Code: 39 c2 75 14 8d 8a 00 00 01 00 89 d0 f0 0f b1 0b 39 d0 0f 84 81 00 00 00 4c 69 2d 27 86 99 00 fa 00 00 00 45 31 e4 4d 39 ec 74 2b <8b> 13 89 d0 c1 e8 10 66 39 c2 75 [68127.807663] RIP [] do_raw_spin_lock+0x94/0x122 [68127.807663] RSP [68127.807663] CR2: ffff8803f8947a50 [68127.807663] ---[ end trace d7045aac00a66cd8 ]--- This is due to a race that can happen in a very tiny time window and is illustrated by the following sequence diagram: CPU 1 CPU 2 btrfs_scrub_dev() scrub_bio_end_io_worker() scrub_pending_bio_dec() atomic_dec(&sctx->bios_in_flight) wait sctx->bios_in_flight == 0 wait sctx->workers_pending == 0 mutex_lock(&fs_info->scrub_lock) (...) mutex_lock(&fs_info->scrub_lock) scrub_free_ctx(sctx) kfree(sctx) wake_up(&sctx->list_wait) __wake_up() spin_lock_irqsave(&sctx->list_wait->lock, flags) Another variation of this scenario that results in the same use-after-free issue is: CPU 1 CPU 2 btrfs_scrub_dev() wait sctx->bios_in_flight == 0 scrub_bio_end_io_worker() scrub_pending_bio_dec() __wake_up(&sctx->list_wait) spin_lock_irqsave(&sctx->list_wait->lock, flags) default_wake_function() wake up task at CPU 2 wait sctx->workers_pending == 0 mutex_lock(&fs_info->scrub_lock) (...) mutex_lock(&fs_info->scrub_lock) scrub_free_ctx(sctx) kfree(sctx) spin_unlock_irqrestore(&sctx->list_wait->lock, flags) Fix this by holding the scrub lock while doing the wakeup. This isn't a recent regression, the issue as been around since the scrub feature was added (2011, commit a2de733c78fa7af51ba9670482fa7d392aa67c57). Signed-off-by: Filipe Manana Signed-off-by: Chris Mason diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 8af7372..6d6e155 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -306,8 +306,17 @@ static void scrub_pending_bio_inc(struct scrub_ctx *sctx) static void scrub_pending_bio_dec(struct scrub_ctx *sctx) { + struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info; + + /* + * Hold the scrub_lock while doing the wakeup to ensure the + * sctx (and its wait queue list_wait) isn't destroyed/freed + * during the wakeup. + */ + mutex_lock(&fs_info->scrub_lock); atomic_dec(&sctx->bios_in_flight); wake_up(&sctx->list_wait); + mutex_unlock(&fs_info->scrub_lock); } static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info) @@ -379,10 +388,15 @@ static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx) mutex_lock(&fs_info->scrub_lock); atomic_dec(&fs_info->scrubs_running); atomic_dec(&fs_info->scrubs_paused); - mutex_unlock(&fs_info->scrub_lock); atomic_dec(&sctx->workers_pending); wake_up(&fs_info->scrub_pause_wait); + /* + * Hold the scrub_lock while doing the wakeup to ensure the + * sctx (and its wait queue list_wait) isn't destroyed/freed + * during the wakeup. + */ wake_up(&sctx->list_wait); + mutex_unlock(&fs_info->scrub_lock); } static void scrub_free_csums(struct scrub_ctx *sctx) -- cgit v0.10.2 From 289454ad26a2d752e04b07234a175feda9ec0f4e Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Tue, 6 Jan 2015 01:01:03 +0900 Subject: btrfs: clear bio reference after submit_one_bio() After submit_one_bio(), `bio' can go away. However submit_extent_page() leave `bio' referable if submit_one_bio() failed (e.g. -ENOMEM on OOM). It will cause invalid paging request when submit_extent_page() is called next time. I reproduced ENOMEM case with the following script (need CONFIG_FAIL_PAGE_ALLOC, and CONFIG_FAULT_INJECTION_DEBUG_FS). 
#!/bin/bash
dmesgout=dmesg.txt
start=100000
end=300000
step=1000
# btrfs options
device=/dev/vdb1
directory=/mnt/btrfs
# fault-injection options
percent=100
times=3

mkdir -p $directory || exit 1
mount -o compress $device $directory || exit 1
rm -f $directory/file || exit 1
dd if=/dev/zero of=$directory/file bs=1M count=512 || exit 1

for interval in `seq $start $step $end`; do
	dmesg -C
	echo 1 > /proc/sys/vm/drop_caches
	sync
	export FAILCMD_TYPE=fail_page_alloc
	./failcmd.sh -p $percent -t $times -i $interval \
		--ignore-gfp-highmem=N --ignore-gfp-wait=N --min-order=0 \
		-- \
		cat $directory/file > /dev/null
	dmesg > ${dmesgout}
	if grep -q BUG: ${dmesgout}; then
		cat ${dmesgout}
		exit 1
	fi
done

umount $directory
exit 0

Signed-off-by: Naohiro Aota Tested-by: Satoru Takeuchi Signed-off-by: Chris Mason diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index dab8af4..a7f6600 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2817,8 +2817,10 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree, bio_add_page(bio, page, page_size, offset) < page_size) { ret = submit_one_bio(rw, bio, mirror_num, prev_bio_flags); - if (ret < 0) + if (ret < 0) { + *bio_ret = NULL; return ret; + } bio = NULL; } else { return 0; -- cgit v0.10.2 From 2f0810880f082fa8ba66ab2c33b02e4ff9770a5e Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 9 Jan 2015 10:40:15 -0800 Subject: btrfs: delete chunk allocation attempt when setting block group ro The test below currently fails:

mkfs.ext4 -F /dev/sda
btrfs-convert /dev/sda
mount /dev/sda /mnt
btrfs device add -f /dev/sdb /mnt
btrfs balance start -v -dconvert=raid1 -mconvert=raid1 /mnt

The reason is that there are some block groups with usage 0, but the whole disk has no free space to allocate a new chunk, so we can't even set such a block group read-only. This patch deletes the chunk allocation attempt when setting a block group ro. For META we already have a reserve, but for SYSTEM we don't, so the check_system_chunk() call is still required.
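The resulting control flow can be sketched as follows; this is a compressed userspace illustration with hypothetical helper names, where try_set_block_group_ro(), alloc_new_chunk() and ensure_system_chunk_space() stand in for set_block_group_ro(), do_chunk_alloc() and check_system_chunk():

#include <stdbool.h>

/* Hypothetical stand-ins for the kernel helpers this patch touches. */
static int try_set_block_group_ro(void) { return -1; /* first attempt may fail */ }
static int alloc_new_chunk(void) { return 0; /* forced chunk allocation */ }
static void ensure_system_chunk_space(void) { /* check_system_chunk() */ }

static int set_block_group_ro_flow(bool is_system_bg)
{
	/* no forced chunk allocation up front anymore */
	int ret = try_set_block_group_ro();

	/* only if the plain attempt fails, allocate a new chunk and retry */
	if (ret && alloc_new_chunk() == 0)
		ret = try_set_block_group_ro();

	/*
	 * METADATA is backed by the global reserve, SYSTEM is not, so make
	 * sure a system chunk can still be allocated afterwards.
	 */
	if (is_system_bg)
		ensure_system_chunk_space();

	return ret;
}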
Signed-off-by: Shaohua Li Signed-off-by: Chris Mason diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 857a859..50de1fa 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -8482,14 +8482,6 @@ int btrfs_set_block_group_ro(struct btrfs_root *root, if (IS_ERR(trans)) return PTR_ERR(trans); - alloc_flags = update_block_group_flags(root, cache->flags); - if (alloc_flags != cache->flags) { - ret = do_chunk_alloc(trans, root, alloc_flags, - CHUNK_ALLOC_FORCE); - if (ret < 0) - goto out; - } - ret = set_block_group_ro(cache, 0); if (!ret) goto out; @@ -8500,6 +8492,11 @@ int btrfs_set_block_group_ro(struct btrfs_root *root, goto out; ret = set_block_group_ro(cache, 0); out: + if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) { + alloc_flags = update_block_group_flags(root, cache->flags); + check_system_chunk(trans, root, alloc_flags); + } + btrfs_end_transaction(trans, root); return ret; } -- cgit v0.10.2 From b76808fc26a6ffab6ff0f85689d3051df9293af5 Mon Sep 17 00:00:00 2001 From: Gui Hecheng Date: Wed, 31 Dec 2014 09:51:35 +0800 Subject: btrfs: cleanup init for list in free-space-cache o removed an unnecessary INIT_LIST_HEAD after LIST_HEAD o merged a declaration & INIT_LIST_HEAD pair into one LIST_HEAD Signed-off-by: Gui Hecheng Reviewed-by: David Sterba Signed-off-by: Chris Mason diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 80a3141..a719785 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -651,15 +651,13 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, struct io_ctl io_ctl; struct btrfs_key key; struct btrfs_free_space *e, *n; - struct list_head bitmaps; + LIST_HEAD(bitmaps); u64 num_entries; u64 num_bitmaps; u64 generation; u8 type; int ret = 0; - INIT_LIST_HEAD(&bitmaps); - /* Nothing in the space cache, goodbye */ if (!i_size_read(inode)) return 0; @@ -2905,7 +2903,6 @@ int btrfs_find_space_cluster(struct btrfs_root *root, trace_btrfs_find_cluster(block_group, offset, bytes, empty_size, min_bytes); - INIT_LIST_HEAD(&bitmaps); ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset, bytes + empty_size, cont1_bytes, min_bytes); -- cgit v0.10.2 From eb710b152003ea74561512b3f55c1e3c71dcef39 Mon Sep 17 00:00:00 2001 From: Satoru Takeuchi Date: Wed, 24 Dec 2014 14:52:04 +0900 Subject: Btrfs: Remove unnecessary placeholder in btrfs_err_code "notused" is not necessary. Setting the first entry to 1 is enough.
Signed-off-by: Takeuchi Satoru Reviewed-by: David Sterba Signed-off-by: Chris Mason diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index 611e1c5..b6dec05 100644 --- a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -495,8 +495,7 @@ struct btrfs_ioctl_send_args { /* Error codes as returned by the kernel */ enum btrfs_err_code { - notused, - BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET, + BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET = 1, BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET, BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET, BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET, -- cgit v0.10.2 From 575849ecf5d103ca9bbf0a6b9e89eba335d4e750 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 11 Feb 2015 11:12:39 +0000 Subject: Btrfs: fix scheduler warning when syncing log We try to lock a mutex while the current task state is not TASK_RUNNING, which results in the following warning when CONFIG_DEBUG_LOCK_ALLOC=y: [30736.772501] ------------[ cut here ]------------ [30736.774545] WARNING: CPU: 9 PID: 19972 at kernel/sched/core.c:7300 __might_sleep+0x8b/0xa8() [30736.783453] do not call blocking ops when !TASK_RUNNING; state=2 set at [] prepare_to_wait+0x43/0x89 [30736.786261] Modules linked in: dm_flakey dm_mod crc32c_generic btrfs xor raid6_pq nfsd auth_rpcgss oid_registry nfs_acl nfs lockd grace fscache sunrpc loop parport_pc psmouse parport pcspkr microcode serio_raw evdev processor thermal_sys i2c_piix4 i2c_core button ext4 crc16 jbd2 mbcache sg sr_mod cdrom sd_mod ata_generic virtio_scsi floppy ata_piix libata virtio_pci virtio_ring e1000 virtio scsi_mod [30736.794323] CPU: 9 PID: 19972 Comm: fsstress Not tainted 3.19.0-rc7-btrfs-next-5+ #1 [30736.795821] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.7.5-0-ge51488c-20140602_164612-nilsson.home.kraxel.org 04/01/2014 [30736.798788] 0000000000000009 ffff88042743fbd8 ffffffff814248ed ffff88043d32f2d8 [30736.800504] ffff88042743fc28 ffff88042743fc18 ffffffff81045338 0000000000000001 [30736.802131] ffffffff81064514 ffffffff817c52d1 000000000000026d 0000000000000000 [30736.803676] Call Trace: [30736.804256] [] dump_stack+0x4c/0x65 [30736.805245] [] warn_slowpath_common+0xa1/0xbb [30736.806360] [] ? __might_sleep+0x8b/0xa8 [30736.807391] [] warn_slowpath_fmt+0x46/0x48 [30736.808511] [] ? prepare_to_wait+0x43/0x89 [30736.809620] [] ? prepare_to_wait+0x43/0x89 [30736.810691] [] __might_sleep+0x8b/0xa8 [30736.811703] [] mutex_lock_nested+0x2f/0x3a0 [30736.812889] [] ? trace_hardirqs_on_caller+0x18f/0x1ab [30736.814138] [] ? trace_hardirqs_on+0xd/0xf [30736.819878] [] wait_for_writer.isra.12+0x91/0xaa [btrfs] [30736.821260] [] ? signal_pending_state+0x31/0x31 [30736.822410] [] btrfs_sync_log+0x160/0x947 [btrfs] [30736.823574] [] ? trace_hardirqs_on_caller+0x18f/0x1ab [30736.824847] [] ? trace_hardirqs_on+0xd/0xf [30736.825972] [] btrfs_sync_file+0x2b0/0x319 [btrfs] [30736.827684] [] vfs_fsync_range+0x21/0x23 [30736.828932] [] vfs_fsync+0x1c/0x1e [30736.829917] [] do_fsync+0x34/0x4e [30736.830862] [] SyS_fsync+0x10/0x14 [30736.831819] [] system_call_fastpath+0x12/0x17 [30736.832982] ---[ end trace c0b57df60d32ae5c ]--- Fix this by acquiring the mutex after calling finish_wait(), which sets the task's state to TASK_RUNNING.
Signed-off-by: Filipe Manana Reviewed-by: Liu Bo Signed-off-by: Chris Mason diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 79f05bc..7870bdb 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2448,8 +2448,8 @@ static void wait_for_writer(struct btrfs_trans_handle *trans, mutex_unlock(&root->log_mutex); if (atomic_read(&root->log_writers)) schedule(); - mutex_lock(&root->log_mutex); finish_wait(&root->log_writer_wait, &wait); + mutex_lock(&root->log_mutex); } } -- cgit v0.10.2 From f55985f4dda5cfb6967c17e96237f3c859076eb3 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 9 Feb 2015 21:14:24 +0000 Subject: Btrfs: scrub, fix sleep in atomic context My previous patch "Btrfs: fix scrub race leading to use-after-free" introduced the possibility to sleep in an atomic context, which happens when the scrub_lock mutex is held at the time scrub_pending_bio_dec() is called - this function can be called under an atomic context. Chris ran into this in a debug kernel which gave the following trace: [ 1928.950319] BUG: sleeping function called from invalid context at kernel/locking/mutex.c:621 [ 1928.967334] in_atomic(): 1, irqs_disabled(): 0, pid: 149670, name: fsstress [ 1928.981324] INFO: lockdep is turned off. [ 1928.989244] CPU: 24 PID: 149670 Comm: fsstress Tainted: G W 3.19.0-rc7-mason+ #41 [ 1929.006418] Hardware name: ZTSYSTEMS Echo Ridge T4 /A9DRPF-10D, BIOS 1.07 05/10/2012 [ 1929.022207] ffffffff81a22cf8 ffff881076e03b78 ffffffff816b8dd9 ffff881076e03b78 [ 1929.037267] ffff880d8e828710 ffff881076e03ba8 ffffffff810856c4 ffff881076e03bc8 [ 1929.052315] 0000000000000000 000000000000026d ffffffff81a22cf8 ffff881076e03bd8 [ 1929.067381] Call Trace: [ 1929.072344] [] dump_stack+0x4f/0x6e [ 1929.083968] [] ___might_sleep+0x174/0x230 [ 1929.095352] [] __might_sleep+0x52/0x90 [ 1929.106223] [] mutex_lock_nested+0x2f/0x3b0 [ 1929.117951] [] ? trace_hardirqs_on+0xd/0x10 [ 1929.129708] [] scrub_pending_bio_dec+0x38/0x70 [btrfs] [ 1929.143370] [] scrub_parity_bio_endio+0x50/0x70 [btrfs] [ 1929.157191] [] bio_endio+0x53/0xa0 [ 1929.167382] [] rbio_orig_end_io+0x7c/0xa0 [btrfs] [ 1929.180161] [] raid_write_parity_end_io+0x5a/0x80 [btrfs] [ 1929.194318] [] bio_endio+0x53/0xa0 [ 1929.204496] [] blk_update_request+0x1eb/0x450 [ 1929.216569] [] ? trigger_load_balance+0x78/0x500 [ 1929.229176] [] scsi_end_request+0x3d/0x1f0 [ 1929.240740] [] scsi_io_completion+0xac/0x5b0 [ 1929.252654] [] scsi_finish_command+0xf0/0x150 [ 1929.264725] [] scsi_softirq_done+0x147/0x170 [ 1929.276635] [] blk_done_softirq+0x86/0xa0 [ 1929.288014] [] __do_softirq+0xde/0x600 [ 1929.298885] [] irq_exit+0xbd/0xd0 (...) Fix this by using a reference count on the scrub context structure instead of locking the scrub_lock mutex. Signed-off-by: Filipe Manana Signed-off-by: Chris Mason diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 6d6e155..db21f17 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -193,6 +193,15 @@ struct scrub_ctx { */ struct btrfs_scrub_progress stat; spinlock_t stat_lock; + + /* + * Use a ref counter to avoid use-after-free issues. Scrub workers + * decrement bios_in_flight and workers_pending and then do a wakeup + * on the list_wait wait queue. We must ensure the main scrub task + * doesn't free the scrub context before or while the workers are + * doing the wakeup() call. 
+ */ + atomic_t refs; }; struct scrub_fixup_nodatasum { @@ -297,26 +306,20 @@ static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len, static void copy_nocow_pages_worker(struct btrfs_work *work); static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info); static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info); +static void scrub_put_ctx(struct scrub_ctx *sctx); static void scrub_pending_bio_inc(struct scrub_ctx *sctx) { + atomic_inc(&sctx->refs); atomic_inc(&sctx->bios_in_flight); } static void scrub_pending_bio_dec(struct scrub_ctx *sctx) { - struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info; - - /* - * Hold the scrub_lock while doing the wakeup to ensure the - * sctx (and its wait queue list_wait) isn't destroyed/freed - * during the wakeup. - */ - mutex_lock(&fs_info->scrub_lock); atomic_dec(&sctx->bios_in_flight); wake_up(&sctx->list_wait); - mutex_unlock(&fs_info->scrub_lock); + scrub_put_ctx(sctx); } static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info) @@ -350,6 +353,7 @@ static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx) { struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info; + atomic_inc(&sctx->refs); /* * increment scrubs_running to prevent cancel requests from * completing as long as a worker is running. we must also @@ -388,15 +392,11 @@ static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx) mutex_lock(&fs_info->scrub_lock); atomic_dec(&fs_info->scrubs_running); atomic_dec(&fs_info->scrubs_paused); + mutex_unlock(&fs_info->scrub_lock); atomic_dec(&sctx->workers_pending); wake_up(&fs_info->scrub_pause_wait); - /* - * Hold the scrub_lock while doing the wakeup to ensure the - * sctx (and its wait queue list_wait) isn't destroyed/freed - * during the wakeup. - */ wake_up(&sctx->list_wait); - mutex_unlock(&fs_info->scrub_lock); + scrub_put_ctx(sctx); } static void scrub_free_csums(struct scrub_ctx *sctx) @@ -442,6 +442,12 @@ static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx) kfree(sctx); } +static void scrub_put_ctx(struct scrub_ctx *sctx) +{ + if (atomic_dec_and_test(&sctx->refs)) + scrub_free_ctx(sctx); +} + static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace) { @@ -466,6 +472,7 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace) sctx = kzalloc(sizeof(*sctx), GFP_NOFS); if (!sctx) goto nomem; + atomic_set(&sctx->refs, 1); sctx->is_dev_replace = is_dev_replace; sctx->pages_per_rd_bio = pages_per_rd_bio; sctx->curr = -1; @@ -3739,7 +3746,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, scrub_workers_put(fs_info); mutex_unlock(&fs_info->scrub_lock); - scrub_free_ctx(sctx); + scrub_put_ctx(sctx); return ret; } -- cgit v0.10.2 From 13212b54d18d5235fb97fbdcba8ae453fd2a3a51 Mon Sep 17 00:00:00 2001 From: Zhao Lei Date: Thu, 12 Feb 2015 14:18:17 +0800 Subject: btrfs: Fix out-of-space bug Btrfs will report NO_SPACE when we create and remove files for several times, and we can't write to filesystem until mount it again. 
Steps to reproduce:
1: Create a single-device btrfs fs with default options
2: Write a file into it to take up most of the fs space
3: Delete the above file
4: Wait about 100s to let the chunk be removed
5: goto 2

The script is like the following:

#!/bin/bash
# Recommend 1.2G space; too large a disk will make the test slow
DEV="/dev/sda16"
MNT="/mnt/tmp"

dev_size="$(lsblk -bn -o SIZE "$DEV")" || exit 2
file_size_m=$((dev_size * 75 / 100 / 1024 / 1024))

echo "Loop write ${file_size_m}M file on $((dev_size / 1024 / 1024))M dev"

for ((i = 0; i < 10; i++)); do umount "$MNT" 2>/dev/null; done

echo "mkfs $DEV"
mkfs.btrfs -f "$DEV" >/dev/null || exit 2

echo "mount $DEV $MNT"
mount "$DEV" "$MNT" || exit 2

for ((loop_i = 0; loop_i < 20; loop_i++)); do
	echo
	echo "loop $loop_i"

	echo "dd file..."
	cmd=(dd if=/dev/zero of="$MNT"/file0 bs=1M count="$file_size_m")
	"${cmd[@]}" 2>/dev/null || {
		# NO_SPACE error triggered
		echo "dd failed: ${cmd[*]}"
		exit 1
	}

	echo "rm file..."
	rm -f "$MNT"/file0 || exit 2

	for ((i = 0; i < 10; i++)); do
		df "$MNT" | tail -1
		sleep 10
	done
done

Reason:
It is triggered by commit 47ab2a6c689913db23ccae38349714edf8365e0a, which removes empty block groups automatically, but the root cause is not in that patch. The old code worked only because btrfs didn't need to create and delete chunks this many times. The bug has several causes, and any of them can trigger it.

Reason 1: When we remove some contiguous chunks but leave other chunks behind, that disk space should be reusable for chunk re-creation, but in the current code only the first creation succeeds. Fixed by Forrest Liu in: Btrfs: fix find_free_dev_extent() malfunction in case device tree has hole

Reason 2: contains_pending_extent() returns a wrong value in its calculation. Fixed by Forrest Liu in: Btrfs: fix find_free_dev_extent() malfunction in case device tree has hole

Reason 3: btrfs_check_data_free_space() tries to commit the transaction and retry the chunk allocation when the first allocation fails, but space_info->full is set by the first allocation and prevents the second allocation on retry. Fixed in this patch by clearing space_info->full at transaction commit.

Tested several times with the above script.

Changelog v3->v4:
Use a lightweight int instead of atomic_t to record have_remove_bgs in the transaction, suggested by: Josef Bacik

Changelog v2->v3:
v2 fixed the bug by adding more transaction commits, but we only need to reclaim space when we really have no space for a new chunk, noticed by: Filipe David Manana
Actually, our code already has this type of commit-and-retry; we only need to make it work with removed block groups. v3 fixes the bug that way.

Changelog v1->v2:
v1 would introduce a new bug when deleting and creating a chunk in the same disk space in the same transaction, noticed by: Filipe David Manana
v2 fixes this bug by committing the transaction after removing block groups.

Reported-by: Tsutomu Itoh Suggested-by: Filipe David Manana Suggested-by: Josef Bacik Signed-off-by: Zhao Lei Signed-off-by: Chris Mason diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index e0faf80..038fcf6 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -220,6 +220,7 @@ loop: * commit the transaction.
*/ atomic_set(&cur_trans->use_count, 2); + cur_trans->have_free_bgs = 0; cur_trans->start_time = get_seconds(); cur_trans->delayed_refs.href_root = RB_ROOT; @@ -2037,6 +2038,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, btrfs_finish_extent_commit(trans, root); + if (cur_trans->have_free_bgs) + btrfs_clear_space_info_full(root->fs_info); + root->fs_info->last_trans_committed = cur_trans->transid; /* * We needn't acquire the lock here because there is no other task diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 3305451..937050a 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -47,6 +47,11 @@ struct btrfs_transaction { atomic_t num_writers; atomic_t use_count; + /* + * true if there is free bgs operations in this transaction + */ + int have_free_bgs; + /* Be protected by fs_info->trans_lock when we want to change it. */ enum btrfs_trans_state state; struct list_head list; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 2c4cab2..cd4d131 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1310,6 +1310,8 @@ again: if (ret) { btrfs_error(root->fs_info, ret, "Failed to remove dev extent item"); + } else { + trans->transaction->have_free_bgs = 1; } out: btrfs_free_path(path); -- cgit v0.10.2 From 3e05bde8c3c2dd761da4d52944a087907955a53c Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 11 Feb 2015 15:08:57 -0500 Subject: Btrfs: only adjust outstanding_extents when we do a short write We have this weird dance where we always inc outstanding_extents when we do a O_DIRECT write, even if we allocate the entire range. To get around this we also drop the metadata space if we successfully write. This is an unnecessary dance, we only need to jack up outstanding_extents if we don't satisfy the entire range request in get_blocks_direct, otherwise we are good using our original reservation. So drop the unconditional inc and the drop of the metadata space that we have for the unconditional inc. 
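In other words, the accounting rule becomes: the reservation taken at write time already covers one extent, and only a short mapping (the range was split) needs to bump the counter. A minimal sketch of that rule, with userspace types and hypothetical names:

#include <stdint.h>

struct inode_acct { uint64_t outstanding_extents; };

static void account_direct_extent(struct inode_acct *acct,
				  uint64_t requested_len, uint64_t mapped_len)
{
	/*
	 * The original reservation covers the whole requested range as one
	 * extent; only a partially satisfied request means the range was
	 * split and an extra outstanding extent is needed.
	 */
	if (mapped_len < requested_len)
		acct->outstanding_extents++;
}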
Thanks, Signed-off-by: Josef Bacik Reviewed-by: Liu Bo Signed-off-by: Chris Mason diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b029233..5a5b902 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7155,6 +7155,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, u64 start = iblock << inode->i_blkbits; u64 lockstart, lockend; u64 len = bh_result->b_size; + u64 orig_len = len; int unlock_bits = EXTENT_LOCKED; int ret = 0; @@ -7290,9 +7291,11 @@ unlock: if (start + len > i_size_read(inode)) i_size_write(inode, start + len); - spin_lock(&BTRFS_I(inode)->lock); - BTRFS_I(inode)->outstanding_extents++; - spin_unlock(&BTRFS_I(inode)->lock); + if (len < orig_len) { + spin_lock(&BTRFS_I(inode)->lock); + BTRFS_I(inode)->outstanding_extents++; + spin_unlock(&BTRFS_I(inode)->lock); + } ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockstart + len - 1, EXTENT_DELALLOC, NULL, @@ -8073,8 +8076,6 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, else if (ret >= 0 && (size_t)ret < count) btrfs_delalloc_release_space(inode, count - (size_t)ret); - else - btrfs_delalloc_release_metadata(inode, 0); } out: if (wakeup) -- cgit v0.10.2 From 3266789f9d08b27275bae5ab1dcd27d1bbf15e79 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 11 Feb 2015 15:08:58 -0500 Subject: Btrfs: don't set and clear delalloc for O_DIRECT writes We do this to get the space accounting, but this is just needless churn on the io_tree, so drop the setting/clearing of delalloc and instead drop the reserved data space when we have a successful allocation. Thanks, Signed-off-by: Josef Bacik Reviewed-by: Liu Bo Signed-off-by: Chris Mason diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 5a5b902..3b95792 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7160,7 +7160,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, int ret = 0; if (create) - unlock_bits |= EXTENT_DELALLOC | EXTENT_DIRTY; + unlock_bits |= EXTENT_DIRTY; else len = min_t(u64, len, root->sectorsize); @@ -7296,11 +7296,7 @@ unlock: BTRFS_I(inode)->outstanding_extents++; spin_unlock(&BTRFS_I(inode)->lock); } - - ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, - lockstart + len - 1, EXTENT_DELALLOC, NULL, - &cached_state, GFP_NOFS); - BUG_ON(ret); + btrfs_free_reserved_data_space(inode, len); } /* -- cgit v0.10.2 From dcab6a3b2ae657a2017637083c28ee303b6b1b8e Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 11 Feb 2015 15:08:59 -0500 Subject: Btrfs: account for large extents with enospc On our gluster boxes we stream large tar balls of backups onto our fses. With 160gb of ram this means we get really large contiguous ranges of dirty data, but the way our ENOSPC stuff works is that as long as it's contiguous we only hold metadata reservation for one extent. The problem is we limit our extents to 128mb, so we'll end up with at least 800 extents, and our enospc accounting is quite a bit lower than what we need. To keep track of this, make sure we increase outstanding_extents for every multiple of the max extent size so we can be sure to have enough reserved metadata space.
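The arithmetic behind the 800-extents figure can be checked with a tiny sketch; BTRFS_MAX_EXTENT_SIZE is the only real constant here, the nr_extents() helper is illustrative:

#include <stdint.h>
#include <stdio.h>

#define MAX_EXTENT_SIZE (128ULL * 1024 * 1024)	/* BTRFS_MAX_EXTENT_SIZE */

static uint64_t nr_extents(uint64_t bytes)
{
	/* round up: every started 128mb piece needs its own reservation */
	return (bytes + MAX_EXTENT_SIZE - 1) / MAX_EXTENT_SIZE;
}

int main(void)
{
	/* a 100gb contiguous dirty range needs 800 reservations, not 1 */
	printf("%llu\n", (unsigned long long)nr_extents(100ULL << 30));
	return 0;
}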
Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index d3562dd..b3dd55f 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -198,6 +198,8 @@ static int btrfs_csum_sizes[] = { 4, 0 }; #define BTRFS_DIRTY_METADATA_THRESH (32 * 1024 * 1024) +#define BTRFS_MAX_EXTENT_SIZE (128 * 1024 * 1024) + /* * The key defines the order in the tree, and so it also defines (optimal) * block layout. diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 50de1fa..0f67370 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4963,19 +4963,25 @@ void btrfs_subvolume_release_metadata(struct btrfs_root *root, /** * drop_outstanding_extent - drop an outstanding extent * @inode: the inode we're dropping the extent for + * @num_bytes: the number of bytes we're relaseing. * * This is called when we are freeing up an outstanding extent, either called * after an error or after an extent is written. This will return the number of * reserved extents that need to be freed. This must be called with * BTRFS_I(inode)->lock held. */ -static unsigned drop_outstanding_extent(struct inode *inode) +static unsigned drop_outstanding_extent(struct inode *inode, u64 num_bytes) { unsigned drop_inode_space = 0; unsigned dropped_extents = 0; + unsigned num_extents = 0; - BUG_ON(!BTRFS_I(inode)->outstanding_extents); - BTRFS_I(inode)->outstanding_extents--; + num_extents = (unsigned)div64_u64(num_bytes + + BTRFS_MAX_EXTENT_SIZE - 1, + BTRFS_MAX_EXTENT_SIZE); + ASSERT(num_extents); + ASSERT(BTRFS_I(inode)->outstanding_extents >= num_extents); + BTRFS_I(inode)->outstanding_extents -= num_extents; if (BTRFS_I(inode)->outstanding_extents == 0 && test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED, @@ -5146,7 +5152,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) out_fail: spin_lock(&BTRFS_I(inode)->lock); - dropped = drop_outstanding_extent(inode); + dropped = drop_outstanding_extent(inode, num_bytes); /* * If the inodes csum_bytes is the same as the original * csum_bytes then we know we haven't raced with any free()ers @@ -5225,7 +5231,7 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) num_bytes = ALIGN(num_bytes, root->sectorsize); spin_lock(&BTRFS_I(inode)->lock); - dropped = drop_outstanding_extent(inode); + dropped = drop_outstanding_extent(inode, num_bytes); if (num_bytes) to_free = calc_csum_metadata_size(inode, num_bytes, 0); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index a7f6600..29850d4 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3242,7 +3242,7 @@ static noinline_for_stack int writepage_delalloc(struct inode *inode, page, &delalloc_start, &delalloc_end, - 128 * 1024 * 1024); + BTRFS_MAX_EXTENT_SIZE); if (nr_delalloc == 0) { delalloc_start = delalloc_end + 1; continue; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 3b95792..8564d8c 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1530,10 +1530,45 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page, static void btrfs_split_extent_hook(struct inode *inode, struct extent_state *orig, u64 split) { + u64 size; + /* not delalloc, ignore it */ if (!(orig->state & EXTENT_DELALLOC)) return; + size = orig->end - orig->start + 1; + if (size > BTRFS_MAX_EXTENT_SIZE) { + u64 num_extents; + u64 new_size; + + /* + * We need the largest size of the remaining extent to see if we + * need to add a new outstanding extent. 
Think of the following + * case + * + * [MEAX_EXTENT_SIZEx2 - 4k][4k] + * + * The new_size would just be 4k and we'd think we had enough + * outstanding extents for this if we only took one side of the + * split, same goes for the other direction. We need to see if + * the larger size still is the same amount of extents as the + * original size, because if it is we need to add a new + * outstanding extent. But if we split up and the larger size + * is less than the original then we are good to go since we've + * already accounted for the extra extent in our original + * accounting. + */ + new_size = orig->end - split + 1; + if ((split - orig->start) > new_size) + new_size = split - orig->start; + + num_extents = div64_u64(size + BTRFS_MAX_EXTENT_SIZE - 1, + BTRFS_MAX_EXTENT_SIZE); + if (div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1, + BTRFS_MAX_EXTENT_SIZE) < num_extents) + return; + } + spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->outstanding_extents++; spin_unlock(&BTRFS_I(inode)->lock); @@ -1549,10 +1584,34 @@ static void btrfs_merge_extent_hook(struct inode *inode, struct extent_state *new, struct extent_state *other) { + u64 new_size, old_size; + u64 num_extents; + /* not delalloc, ignore it */ if (!(other->state & EXTENT_DELALLOC)) return; + old_size = other->end - other->start + 1; + new_size = old_size + (new->end - new->start + 1); + + /* we're not bigger than the max, unreserve the space and go */ + if (new_size <= BTRFS_MAX_EXTENT_SIZE) { + spin_lock(&BTRFS_I(inode)->lock); + BTRFS_I(inode)->outstanding_extents--; + spin_unlock(&BTRFS_I(inode)->lock); + return; + } + + /* + * If we grew by another max_extent, just return, we want to keep that + * reserved amount. + */ + num_extents = div64_u64(old_size + BTRFS_MAX_EXTENT_SIZE - 1, + BTRFS_MAX_EXTENT_SIZE); + if (div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1, + BTRFS_MAX_EXTENT_SIZE) > num_extents) + return; + spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->outstanding_extents--; spin_unlock(&BTRFS_I(inode)->lock); @@ -1648,6 +1707,8 @@ static void btrfs_clear_bit_hook(struct inode *inode, unsigned *bits) { u64 len = state->end + 1 - state->start; + u64 num_extents = div64_u64(len + BTRFS_MAX_EXTENT_SIZE -1, + BTRFS_MAX_EXTENT_SIZE); spin_lock(&BTRFS_I(inode)->lock); if ((state->state & EXTENT_DEFRAG) && (*bits & EXTENT_DEFRAG)) @@ -1667,7 +1728,7 @@ static void btrfs_clear_bit_hook(struct inode *inode, *bits &= ~EXTENT_FIRST_DELALLOC; } else if (!(*bits & EXTENT_DO_ACCOUNTING)) { spin_lock(&BTRFS_I(inode)->lock); - BTRFS_I(inode)->outstanding_extents--; + BTRFS_I(inode)->outstanding_extents -= num_extents; spin_unlock(&BTRFS_I(inode)->lock); } -- cgit v0.10.2 From 3d84be799194147e04c0e3129ed44a948773b80a Mon Sep 17 00:00:00 2001 From: Forrest Liu Date: Wed, 11 Feb 2015 14:24:12 +0800 Subject: Btrfs: fix BUG_ON in btrfs_orphan_add() when delete unused block group Removing large amount of block group in a transaction may encounters BUG_ON() in btrfs_orphan_add(). That is because btrfs_orphan_reserve_metadata() will grab metadata reservation from transaction handle, and btrfs_delete_unused_bgs() didn't reserve metadata for trnasaction handle when delete unused block group. 
The problem can be reproduce by following script mntpath=/btrfs loopdev=/dev/loop0 filepath=/home/forrest/image umount $mntpath losetup -d $loopdev truncate --size 1000g $filepath losetup $loopdev $filepath mkfs.btrfs -f $loopdev mount $loopdev $mntpath for j in `seq 1 1 1000`; do fallocate -l 1g $mntpath/$j done # wait cleaner thread remove unused block group sleep 300 The call trace that results from the BUG_ON() is: [ 613.093084] ------------[ cut here ]------------ [ 613.097928] kernel BUG at fs/btrfs/inode.c:3142! [ 613.105855] invalid opcode: 0000 [#1] SMP [ 613.112702] Modules linked in: coretemp(E) crc32_pclmul(E) ghash_clmulni_intel(E) aesni_intel(E) snd_ens1371(E) snd_ac97_codec(E) aes_x86_64(E) lrw(E) gf128mul(E) glue_helper(E) ppdev(E) ac97_bus(E) ablk_helper(E) gameport(E) cryptd(E) snd_rawmidi(E) snd_seq_device(E) snd_pcm(E) vmw_balloon(E) snd_timer(E) snd(E) soundcore(E) serio_raw(E) vmwgfx(E) ttm(E) drm_kms_helper(E) drm(E) vmw_vmci(E) parport_pc(E) shpchp(E) i2c_piix4(E) mac_hid(E) lp(E) parport(E) btrfs(E) xor(E) raid6_pq(E) hid_generic(E) usbhid(E) hid(E) psmouse(E) ahci(E) libahci(E) e1000(E) mptspi(E) mptscsih(E) mptbase(E) floppy(E) vmw_pvscsi(E) vmxnet3(E) [ 613.144196] CPU: 0 PID: 1480 Comm: btrfs-cleaner Tainted: G E 3.19.0-rc7-custom #2 [ 613.148501] Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 07/31/2013 [ 613.152694] task: ffff880035cdb1a0 ti: ffff880039cf4000 task.ti: ffff880039cf4000 [ 613.154969] RIP: 0010:[] [] btrfs_orphan_add+0x1d2/0x1e0 [btrfs] [ 613.157780] RSP: 0018:ffff880039cf7c48 EFLAGS: 00010286 [ 613.159560] RAX: 00000000ffffffe4 RBX: ffff88003bd981a0 RCX: ffff88003c9e4000 [ 613.161904] RDX: 0000000000002244 RSI: 0000000000040000 RDI: ffff88003c9e4138 [ 613.164264] RBP: ffff880039cf7c88 R08: 000060ffc0000850 R09: 0000000000000000 [ 613.166507] R10: ffff88003bc4b7a0 R11: ffffea0000eb6740 R12: ffff88003c9c0000 [ 613.168681] R13: ffff88003c102160 R14: ffff88003c9c0458 R15: 0000000000000001 [ 613.170932] FS: 0000000000000000(0000) GS:ffff88003f600000(0000) knlGS:0000000000000000 [ 613.173316] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 613.175227] CR2: 00007f6343537000 CR3: 0000000036329000 CR4: 00000000000407f0 [ 613.177554] Stack: [ 613.178712] ffff880039cf7c88 ffffffffa0182a54 ffff88003c9e4b04 ffff88003c9c7800 [ 613.181297] ffff88003bc4b7a0 ffff88003bd981a0 ffff88003c8db200 ffff88003c2fcc60 [ 613.183782] ffff880039cf7d18 ffffffffa012da97 ffff88003bc4b7a4 ffff88003bc4b7a0 [ 613.186171] Call Trace: [ 613.187493] [] ? lookup_free_space_inode+0x44/0x100 [btrfs] [ 613.189801] [] btrfs_remove_block_group+0x137/0x740 [btrfs] [ 613.192126] [] btrfs_remove_chunk+0x672/0x780 [btrfs] [ 613.194267] [] btrfs_delete_unused_bgs+0x25f/0x280 [btrfs] [ 613.196567] [] cleaner_kthread+0x12c/0x190 [btrfs] [ 613.198687] [] ? check_leaf+0x350/0x350 [btrfs] [ 613.200758] [] kthread+0xd2/0xf0 [ 613.202616] [] ? kthread_create_on_node+0x180/0x180 [ 613.204738] [] ret_from_fork+0x7c/0xb0 [ 613.206652] [] ? 
This patch replaces btrfs_join_transaction() with btrfs_start_transaction()
in btrfs_delete_unused_bgs() to prevent the BUG_ON() in btrfs_orphan_add().

Signed-off-by: Forrest Liu
Signed-off-by: Chris Mason

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 0f67370..28ce5c8 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -9555,7 +9555,8 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
 		 * Want to do this before we do anything else so we can recover
 		 * properly if we fail to join the transaction.
 		 */
-		trans = btrfs_join_transaction(root);
+		/* 1 for btrfs_orphan_reserve_metadata() */
+		trans = btrfs_start_transaction(root, 1);
 		if (IS_ERR(trans)) {
 			btrfs_set_block_group_rw(root, block_group);
 			ret = PTR_ERR(trans);

-- cgit v0.10.2

From 1a4bcf470c886b955adf36486f4c86f2441d85cb Mon Sep 17 00:00:00 2001
From: Filipe Manana
Date: Fri, 13 Feb 2015 12:30:56 +0000
Subject: Btrfs: fix fsync data loss after adding hard link to inode

We have a scenario where, after fsync log replay, we can lose file data
that had previously been fsync'ed, if we added a hard link for our inode
and after that we synced the fsync log (for example by fsync'ing some
other file or directory).

This is because when adding a hard link we updated the inode item in the
log tree with an i_size value of 0. At that point the new inode item was
in memory only, and a subsequent fsync log replay would not make us lose
the file data. However, if after adding the hard link we synced the log
tree to disk, by fsync'ing some other file or directory for example, we
ended up losing the file data after log replay, because the inode item
in the persisted log tree had an i_size of zero.

This is easy to reproduce, and the following excerpt from my test for
xfstests shows this:

_scratch_mkfs >> $seqres.full 2>&1
_init_flakey
_mount_flakey

# Create one file with data and fsync it.
# This made the btrfs fsync log persist the data and the inode metadata with
# a correct inode->i_size (4096 bytes).
$XFS_IO_PROG -f -c "pwrite -S 0xaa -b 4K 0 4K" -c "fsync" \
	$SCRATCH_MNT/foo | _filter_xfs_io

# Now add one hard link to our file. This made the btrfs code update the fsync
# log, in memory only, with an inode metadata having a size of 0.
ln $SCRATCH_MNT/foo $SCRATCH_MNT/foo_link

# Now force persistence of the fsync log to disk, for example, by fsyncing some
# other file.
touch $SCRATCH_MNT/bar
$XFS_IO_PROG -c "fsync" $SCRATCH_MNT/bar

# Before a power loss or crash, we could read the 4Kb of data from our file as
# expected.
echo "File content before:"
od -t x1 $SCRATCH_MNT/foo

# Simulate a crash/power loss.
_load_flakey_table $FLAKEY_DROP_WRITES
_unmount_flakey
_load_flakey_table $FLAKEY_ALLOW_WRITES
_mount_flakey

# After the fsync log replay, because the fsync log had a value of 0 for our
# inode's i_size, we couldn't read anymore the 4Kb of data that we previously
# wrote and fsync'ed. The size of the file became 0 after the fsync log replay.
echo "File content after:" od -t x1 $SCRATCH_MNT/foo Another alternative test, that doesn't need to fsync an inode in the same transaction it was created, is: _scratch_mkfs >> $seqres.full 2>&1 _init_flakey _mount_flakey # Create our test file with some data. $XFS_IO_PROG -f -c "pwrite -S 0xaa -b 8K 0 8K" \ $SCRATCH_MNT/foo | _filter_xfs_io # Make sure the file is durably persisted. sync # Append some data to our file, to increase its size. $XFS_IO_PROG -f -c "pwrite -S 0xcc -b 4K 8K 4K" \ $SCRATCH_MNT/foo | _filter_xfs_io # Fsync the file, so from this point on if a crash/power failure happens, our # new data is guaranteed to be there next time the fs is mounted. $XFS_IO_PROG -c "fsync" $SCRATCH_MNT/foo # Add one hard link to our file. This made btrfs write into the in memory fsync # log a special inode with generation 0 and an i_size of 0 too. Note that this # didn't update the inode in the fsync log on disk. ln $SCRATCH_MNT/foo $SCRATCH_MNT/foo_link # Now make sure the in memory fsync log is durably persisted. # Creating and fsync'ing another file will do it. touch $SCRATCH_MNT/bar $XFS_IO_PROG -c "fsync" $SCRATCH_MNT/bar # As expected, before the crash/power failure, we should be able to read the # 12Kb of file data. echo "File content before:" od -t x1 $SCRATCH_MNT/foo # Simulate a crash/power loss. _load_flakey_table $FLAKEY_DROP_WRITES _unmount_flakey _load_flakey_table $FLAKEY_ALLOW_WRITES _mount_flakey # After mounting the fs again, the fsync log was replayed. # The btrfs fsync log replay code didn't update the i_size of the persisted # inode because the inode item in the log had a special generation with a # value of 0 (and it couldn't know the correct i_size, since that inode item # had a 0 i_size too). This made the last 4Kb of file data inaccessible and # effectively lost. echo "File content after:" od -t x1 $SCRATCH_MNT/foo This isn't a new issue/regression. This problem has been around since the log tree code was added in 2008: Btrfs: Add a write ahead tree log to optimize synchronous operations (commit e02119d5a7b4396c5a872582fddc8bd6d305a70a) Test cases for xfstests follow soon. 
CC:
Signed-off-by: Filipe Manana
Signed-off-by: Chris Mason

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 7870bdb..5f649bb 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -490,8 +490,20 @@ insert:
 			src_item = (struct btrfs_inode_item *)src_ptr;
 			dst_item = (struct btrfs_inode_item *)dst_ptr;
-			if (btrfs_inode_generation(eb, src_item) == 0)
+			if (btrfs_inode_generation(eb, src_item) == 0) {
+				struct extent_buffer *dst_eb = path->nodes[0];
+
+				if (S_ISREG(btrfs_inode_mode(eb, src_item)) &&
+				    S_ISREG(btrfs_inode_mode(dst_eb, dst_item))) {
+					struct btrfs_map_token token;
+					u64 ino_size = btrfs_inode_size(eb, src_item);
+
+					btrfs_init_map_token(&token);
+					btrfs_set_token_inode_size(dst_eb, dst_item,
+								   ino_size, &token);
+				}
 				goto no_copy;
+			}
 
 			if (overwrite_root &&
 			    S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
@@ -3250,7 +3262,8 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
 static void fill_inode_item(struct btrfs_trans_handle *trans,
 			    struct extent_buffer *leaf,
 			    struct btrfs_inode_item *item,
-			    struct inode *inode, int log_inode_only)
+			    struct inode *inode, int log_inode_only,
+			    u64 logged_isize)
 {
 	struct btrfs_map_token token;
@@ -3263,7 +3276,7 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
 		 * to say 'update this inode with these values'
 		 */
 		btrfs_set_token_inode_generation(leaf, item, 0, &token);
-		btrfs_set_token_inode_size(leaf, item, 0, &token);
+		btrfs_set_token_inode_size(leaf, item, logged_isize, &token);
 	} else {
 		btrfs_set_token_inode_generation(leaf, item,
 						 BTRFS_I(inode)->generation,
@@ -3315,7 +3328,7 @@ static int log_inode_item(struct btrfs_trans_handle *trans,
 		return ret;
 	inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
 				    struct btrfs_inode_item);
-	fill_inode_item(trans, path->nodes[0], inode_item, inode, 0);
+	fill_inode_item(trans, path->nodes[0], inode_item, inode, 0, 0);
 	btrfs_release_path(path);
 	return 0;
 }
@@ -3324,7 +3337,8 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
 			       struct inode *inode,
 			       struct btrfs_path *dst_path,
 			       struct btrfs_path *src_path, u64 *last_extent,
-			       int start_slot, int nr, int inode_only)
+			       int start_slot, int nr, int inode_only,
+			       u64 logged_isize)
 {
 	unsigned long src_offset;
 	unsigned long dst_offset;
@@ -3381,7 +3395,8 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
 					       dst_path->slots[0],
 					       struct btrfs_inode_item);
 			fill_inode_item(trans, dst_path->nodes[0], inode_item,
-					inode, inode_only == LOG_INODE_EXISTS);
+					inode, inode_only == LOG_INODE_EXISTS,
+					logged_isize);
 		} else {
 			copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
 					   src_offset, ins_sizes[i]);
@@ -3933,6 +3948,33 @@ process:
 	return ret;
 }
 
+static int logged_inode_size(struct btrfs_root *log, struct inode *inode,
+			     struct btrfs_path *path, u64 *size_ret)
+{
+	struct btrfs_key key;
+	int ret;
+
+	key.objectid = btrfs_ino(inode);
+	key.type = BTRFS_INODE_ITEM_KEY;
+	key.offset = 0;
+
+	ret = btrfs_search_slot(NULL, log, &key, path, 0, 0);
+	if (ret < 0) {
+		return ret;
+	} else if (ret > 0) {
+		*size_ret = i_size_read(inode);
+	} else {
+		struct btrfs_inode_item *item;
+
+		item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+				      struct btrfs_inode_item);
+		*size_ret = btrfs_inode_size(path->nodes[0], item);
+	}
+
+	btrfs_release_path(path);
+	return 0;
+}
+
 /* log a single inode in the tree log.
  * At least one parent directory for this inode must exist in the tree
  * or be logged already.
@@ -3970,6 +4012,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 	bool fast_search = false;
 	u64 ino = btrfs_ino(inode);
 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+	u64 logged_isize = 0;
 
 	path = btrfs_alloc_path();
 	if (!path)
@@ -4030,6 +4073,25 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 		max_key_type = BTRFS_XATTR_ITEM_KEY;
 		ret = drop_objectid_items(trans, log, path, ino, max_key_type);
 	} else {
+		if (inode_only == LOG_INODE_EXISTS) {
+			/*
+			 * Make sure the new inode item we write to the log has
+			 * the same isize as the current one (if it exists).
+			 * This is necessary to prevent data loss after log
+			 * replay, and also to prevent doing a wrong expanding
+			 * truncate - for e.g. create file, write 4K into offset
+			 * 0, fsync, write 4K into offset 4096, add hard link,
+			 * fsync some other file (to sync log), power fail - if
+			 * we use the inode's current i_size, after log replay
+			 * we get a 8Kb file, with the last 4Kb extent as a hole
+			 * (zeroes), as if an expanding truncate happened,
+			 * instead of getting a file of 4Kb only.
+			 */
+			err = logged_inode_size(log, inode, path,
+						&logged_isize);
+			if (err)
+				goto out_unlock;
+		}
 		if (test_and_clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
 				       &BTRFS_I(inode)->runtime_flags)) {
 			clear_bit(BTRFS_INODE_COPY_EVERYTHING,
@@ -4085,7 +4147,8 @@ again:
 		}
 
 		ret = copy_items(trans, inode, dst_path, path, &last_extent,
-				 ins_start_slot, ins_nr, inode_only);
+				 ins_start_slot, ins_nr, inode_only,
+				 logged_isize);
 		if (ret < 0) {
 			err = ret;
 			goto out_unlock;
@@ -4109,7 +4172,7 @@ next_slot:
 		if (ins_nr) {
 			ret = copy_items(trans, inode, dst_path, path,
 					 &last_extent, ins_start_slot,
-					 ins_nr, inode_only);
+					 ins_nr, inode_only, logged_isize);
 			if (ret < 0) {
 				err = ret;
 				goto out_unlock;
@@ -4130,7 +4193,8 @@ next_slot:
 	}
 	if (ins_nr) {
 		ret = copy_items(trans, inode, dst_path, path, &last_extent,
-				 ins_start_slot, ins_nr, inode_only);
+				 ins_start_slot, ins_nr, inode_only,
+				 logged_isize);
 		if (ret < 0) {
 			err = ret;
 			goto out_unlock;

-- cgit v0.10.2

From a742994aa2e271eb8cd8e043d276515ec858ed73 Mon Sep 17 00:00:00 2001
From: Filipe Manana
Date: Fri, 13 Feb 2015 16:56:14 +0000
Subject: Btrfs: don't remove extents and xattrs when logging new names

If we are recording in the tree log that an inode has new names (new hard
links were added), we would drop items, belonging to the inode, that we
shouldn't:

1) When the flag BTRFS_INODE_NEEDS_FULL_SYNC is set in the inode's runtime
   flags, we ended up dropping all the extent and xattr items that were
   previously logged. This was done only in memory, since logging a new
   name doesn't imply syncing the log;

2) When the flag BTRFS_INODE_COPY_EVERYTHING is set in the inode's runtime
   flags, we ended up dropping all the xattr items that were previously
   logged. Like the case before, this was done only in memory because
   logging a new name doesn't imply syncing the log.

This led to some surprises in scenarios such as the following:

1) write some extents to an inode;
2) fsync the inode;
3) truncate the inode or delete/modify some of its xattrs;
4) add a new hard link for that inode;
5) fsync some other file, to force the log tree to be durably persisted;
6) power failure happens.

The next time the fs is mounted, the fsync log replay code is executed,
and the resulting file doesn't have the content it had when the last fsync
against it was performed; instead it has content matching what it had when
the last transaction commit happened.
So change the behaviour such that when a new name is logged, only the inode
item and reference items are processed.

This is easy to reproduce with the test I just made for xfstests, whose main
body is:

_scratch_mkfs >> $seqres.full 2>&1
_init_flakey
_mount_flakey

# Create our test file with some data.
$XFS_IO_PROG -f -c "pwrite -S 0xaa -b 8K 0 8K" \
	$SCRATCH_MNT/foo | _filter_xfs_io

# Make sure the file is durably persisted.
sync

# Append some data to our file, to increase its size.
$XFS_IO_PROG -f -c "pwrite -S 0xcc -b 4K 8K 4K" \
	$SCRATCH_MNT/foo | _filter_xfs_io

# Fsync the file, so from this point on if a crash/power failure happens, our
# new data is guaranteed to be there next time the fs is mounted.
$XFS_IO_PROG -c "fsync" $SCRATCH_MNT/foo

# Now shrink our file to 5000 bytes.
$XFS_IO_PROG -c "truncate 5000" $SCRATCH_MNT/foo

# Now do an expanding truncate to a size larger than what we had when we last
# fsync'ed our file. This is just to verify that after power failure and
# replaying the fsync log, our file matches what it was when we last fsync'ed
# it - 12Kb size, first 8Kb of data had a value of 0xaa and the last 4Kb of
# data had a value of 0xcc.
$XFS_IO_PROG -c "truncate 32K" $SCRATCH_MNT/foo

# Add one hard link to our file. This made btrfs drop all of our file's
# metadata from the fsync log, including the metadata relative to the
# extent we just wrote and fsync'ed. This change was made only to the fsync
# log in memory, so adding the hard link alone doesn't change the persisted
# fsync log. This happened because the previous truncates set the runtime
# flag BTRFS_INODE_NEEDS_FULL_SYNC in the btrfs inode structure.
ln $SCRATCH_MNT/foo $SCRATCH_MNT/foo_link

# Now make sure the in memory fsync log is durably persisted.
# Creating and fsync'ing another file will do it.
# After this our persisted fsync log will no longer have metadata for our file
# foo that points to the extent we wrote and fsync'ed before.
touch $SCRATCH_MNT/bar
$XFS_IO_PROG -c "fsync" $SCRATCH_MNT/bar

# As expected, before the crash/power failure, we should be able to see a file
# with a size of 32Kb, with its first 5000 bytes having the value 0xaa and all
# the remaining bytes with value 0x00.
echo "File content before:"
od -t x1 $SCRATCH_MNT/foo

# Simulate a crash/power loss.
_load_flakey_table $FLAKEY_DROP_WRITES
_unmount_flakey
_load_flakey_table $FLAKEY_ALLOW_WRITES
_mount_flakey

# After mounting the fs again, the fsync log was replayed.
# The expected result is to see a file with a size of 12Kb, with its first 8Kb
# of data having the value 0xaa and its last 4Kb of data having a value of 0xcc.
# The btrfs bug used to leave the file as it used to be as of the last
# transaction commit - that is, with a size of 8Kb with all bytes having a
# value of 0xaa.
echo "File content after:"
od -t x1 $SCRATCH_MNT/foo

The test case for xfstests follows soon.
Signed-off-by: Filipe Manana
Signed-off-by: Chris Mason

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 5f649bb..f96996a 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -4069,8 +4069,10 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 	if (S_ISDIR(inode->i_mode)) {
 		int max_key_type = BTRFS_DIR_LOG_INDEX_KEY;
 
-		if (inode_only == LOG_INODE_EXISTS)
-			max_key_type = BTRFS_XATTR_ITEM_KEY;
+		if (inode_only == LOG_INODE_EXISTS) {
+			max_key_type = BTRFS_INODE_EXTREF_KEY;
+			max_key.type = max_key_type;
+		}
 		ret = drop_objectid_items(trans, log, path, ino, max_key_type);
 	} else {
 		if (inode_only == LOG_INODE_EXISTS) {
@@ -4092,18 +4094,31 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 			if (err)
 				goto out_unlock;
 		}
-		if (test_and_clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
-				       &BTRFS_I(inode)->runtime_flags)) {
-			clear_bit(BTRFS_INODE_COPY_EVERYTHING,
-				  &BTRFS_I(inode)->runtime_flags);
-			ret = btrfs_truncate_inode_items(trans, log,
-							 inode, 0, 0);
-		} else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING,
-					      &BTRFS_I(inode)->runtime_flags) ||
+		if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+			     &BTRFS_I(inode)->runtime_flags)) {
+			if (inode_only == LOG_INODE_EXISTS) {
+				max_key.type = BTRFS_INODE_EXTREF_KEY;
+				ret = drop_objectid_items(trans, log, path, ino,
+							  max_key.type);
+			} else {
+				clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+					  &BTRFS_I(inode)->runtime_flags);
+				clear_bit(BTRFS_INODE_COPY_EVERYTHING,
+					  &BTRFS_I(inode)->runtime_flags);
+				ret = btrfs_truncate_inode_items(trans, log,
+								 inode, 0, 0);
+			}
+		} else if (test_bit(BTRFS_INODE_COPY_EVERYTHING,
+				    &BTRFS_I(inode)->runtime_flags) ||
 			   inode_only == LOG_INODE_EXISTS) {
-			if (inode_only == LOG_INODE_ALL)
+			if (inode_only == LOG_INODE_ALL) {
+				clear_bit(BTRFS_INODE_COPY_EVERYTHING,
+					  &BTRFS_I(inode)->runtime_flags);
 				fast_search = true;
-			max_key.type = BTRFS_XATTR_ITEM_KEY;
+				max_key.type = BTRFS_XATTR_ITEM_KEY;
+			} else {
+				max_key.type = BTRFS_INODE_EXTREF_KEY;
+			}
 			ret = drop_objectid_items(trans, log, path, ino,
 						  max_key.type);
 		} else {

-- cgit v0.10.2
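Why BTRFS_INODE_EXTREF_KEY is the right cut-off in the hunks above: items belonging to an inode are keyed by (objectid, type, offset), and drop_objectid_items() deletes the inode's logged items whose key type is at or below the given maximum. The key type values in the sketch below match the kernel's ctree.h definitions; the rest is illustrative only, not kernel code.

#include <stdio.h>

#define BTRFS_INODE_ITEM_KEY	1
#define BTRFS_INODE_REF_KEY	12
#define BTRFS_INODE_EXTREF_KEY	13
#define BTRFS_XATTR_ITEM_KEY	24
#define BTRFS_EXTENT_DATA_KEY	108

static void show_drop(const char *label, int max_key_type)
{
	const struct { int type; const char *name; } items[] = {
		{ BTRFS_INODE_ITEM_KEY,   "inode item"   },
		{ BTRFS_INODE_REF_KEY,    "inode ref"    },
		{ BTRFS_INODE_EXTREF_KEY, "inode extref" },
		{ BTRFS_XATTR_ITEM_KEY,   "xattr"        },
		{ BTRFS_EXTENT_DATA_KEY,  "extent data"  },
	};

	printf("%s (max_key_type = %d):\n", label, max_key_type);
	for (size_t i = 0; i < sizeof(items) / sizeof(items[0]); i++)
		printf("  %-12s -> %s\n", items[i].name,
		       items[i].type <= max_key_type ? "dropped" : "kept");
}

int main(void)
{
	show_drop("old behaviour", BTRFS_XATTR_ITEM_KEY);   /* xattrs dropped too */
	show_drop("new behaviour", BTRFS_INODE_EXTREF_KEY); /* names only */
	return 0;
}

With BTRFS_XATTR_ITEM_KEY as the maximum, the xattr items fall inside the drop range, while BTRFS_INODE_EXTREF_KEY restricts the drop to the inode item and its name references, which is the behaviour this patch wants when only a new name is being logged.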