From 60b62978bc5e903cd487de34972fb30f76c74a2e Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 30 Apr 2013 17:29:29 +0000 Subject: btrfs: annotate quota tree for lockdep Quota tree has been missing from lockdep annotations, though no warning has been seen in the wild. There's currently one entry that does not belong there, BTRFS_ORPHAN_OBJECTID. No such tree exists, it's probably a copy & paste mistake, the id is defined among tree ids. Signed-off-by: David Sterba Signed-off-by: Josef Bacik diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 63c328a..2720d55 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -88,12 +88,12 @@ struct btrfs_ordered_sum; /* holds checksums of all the data extents */ #define BTRFS_CSUM_TREE_OBJECTID 7ULL -/* for storing balance parameters in the root tree */ -#define BTRFS_BALANCE_OBJECTID -4ULL - /* holds quota configuration and tracking */ #define BTRFS_QUOTA_TREE_OBJECTID 8ULL +/* for storing balance parameters in the root tree */ +#define BTRFS_BALANCE_OBJECTID -4ULL + /* orhpan objectid for tracking unlinked/truncated files */ #define BTRFS_ORPHAN_OBJECTID -5ULL diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 4e9ebe1..72b1727 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -152,7 +152,7 @@ static struct btrfs_lockdep_keyset { { .id = BTRFS_DEV_TREE_OBJECTID, .name_stem = "dev" }, { .id = BTRFS_FS_TREE_OBJECTID, .name_stem = "fs" }, { .id = BTRFS_CSUM_TREE_OBJECTID, .name_stem = "csum" }, - { .id = BTRFS_ORPHAN_OBJECTID, .name_stem = "orphan" }, + { .id = BTRFS_QUOTA_TREE_OBJECTID, .name_stem = "quota" }, { .id = BTRFS_TREE_LOG_OBJECTID, .name_stem = "log" }, { .id = BTRFS_TREE_RELOC_OBJECTID, .name_stem = "treloc" }, { .id = BTRFS_DATA_RELOC_TREE_OBJECTID, .name_stem = "dreloc" }, -- cgit v0.10.2 From a52f4cd2b1a863a42c1cb268b1cddad451cdfede Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Wed, 1 May 2013 16:23:41 +0000 Subject: Btrfs: fix off-by-one in fiemap lock_extent/unlock_extent expect an 
inclusive end. Tested-by: David Sterba Signed-off-by: Liu Bo Signed-off-by: Josef Bacik diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index d2ac518..3e6e410 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3989,7 +3989,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, last_for_get_extent = isize; } - lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0, + lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len - 1, 0, &cached_state); em = get_extent_skip_holes(inode, start, last_for_get_extent, @@ -4076,7 +4076,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, out_free: free_extent_map(em); out: - unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len, + unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len - 1, &cached_state, GFP_NOFS); return ret; } -- cgit v0.10.2 From 03b71c6ca6286625d8f1ed44aabab9b5bf5dac10 Mon Sep 17 00:00:00 2001 From: Gabriel de Perthuis Date: Mon, 6 May 2013 17:40:18 +0000 Subject: btrfs: don't stop searching after encountering the wrong item The search ioctl skips items that are too large for a result buffer, but inline items of a certain size occurring before any search result is found would trigger an overflow and stop the search entirely. 
Bug: https://bugzilla.kernel.org/show_bug.cgi?id=57641 Cc: stable@vger.kernel.org Signed-off-by: Gabriel de Perthuis Signed-off-by: Josef Bacik diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 0de4a2f..0f81d67 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1801,7 +1801,11 @@ static noinline int copy_to_sk(struct btrfs_root *root, item_off = btrfs_item_ptr_offset(leaf, i); item_len = btrfs_item_size_nr(leaf, i); - if (item_len > BTRFS_SEARCH_ARGS_BUFSIZE) + btrfs_item_key_to_cpu(leaf, key, i); + if (!key_in_sk(key, sk)) + continue; + + if (sizeof(sh) + item_len > BTRFS_SEARCH_ARGS_BUFSIZE) item_len = 0; if (sizeof(sh) + item_len + *sk_offset > @@ -1810,10 +1814,6 @@ static noinline int copy_to_sk(struct btrfs_root *root, goto overflow; } - btrfs_item_key_to_cpu(leaf, key, i); - if (!key_in_sk(key, sk)) - continue; - sh.objectid = key->objectid; sh.offset = key->offset; sh.type = key->type; -- cgit v0.10.2 From 69a85bd87cc81bcbd36730d4a1214c12fdb8a548 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 8 May 2013 13:30:11 -0400 Subject: Btrfs: don't null pointer deref on abort I'm sorry, there's no excuse for this sort of work. We need to use root->leafsize since eb may be NULL. Thanks, Signed-off-by: Josef Bacik diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 72b1727..e8b29da 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3808,7 +3808,7 @@ static int btrfs_destroy_marked_extents(struct btrfs_root *root, while (start <= end) { eb = btrfs_find_tree_block(root, start, root->leafsize); - start += eb->len; + start += root->leafsize; if (!eb) continue; wait_on_extent_buffer_writeback(eb); -- cgit v0.10.2 From 73e1e61fb85ab206854b6d87ff31733628bb8d72 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 8 May 2013 16:44:57 -0400 Subject: Btrfs: remove warn on in free space cache writeout This catches block groups that are too large to properly cache. We deal with this case fine, so the warning just confuses users. 
Remove the warning. Thanks, Signed-off-by: Josef Bacik diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index ecca6c7..6a8bb9c 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -920,10 +920,8 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, /* Make sure we can fit our crcs into the first page */ if (io_ctl.check_crcs && - (io_ctl.num_pages * sizeof(u32)) >= PAGE_CACHE_SIZE) { - WARN_ON(1); + (io_ctl.num_pages * sizeof(u32)) >= PAGE_CACHE_SIZE) goto out_nospc; - } io_ctl_set_generation(&io_ctl, trans->transid); -- cgit v0.10.2 From b1c79e0947e0c190f865e2eb7b84a0fea0021cec Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 9 May 2013 13:49:30 -0400 Subject: Btrfs: handle running extent ops with skinny metadata Chris hit a bug where we weren't finding extent records when running extent ops. This is because we use the delayed_ref_head when running the extent op, which means we can't use the ->type checks to see if we are metadata. We also lose the level of the metadata we are working on. So to fix this we can just check the ->is_data section of the extent_op, and we can store the level of the buffer we were modifying in the extent_op. 
Thanks, Signed-off-by: Josef Bacik diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index de6de8e..02fae7f 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -951,10 +951,12 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, BUG_ON(ret); /* -ENOMEM */ } if (new_flags != 0) { + int level = btrfs_header_level(buf); + ret = btrfs_set_disk_extent_flags(trans, root, buf->start, buf->len, - new_flags, 0); + new_flags, level, 0); if (ret) return ret; } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 2720d55..d6dd49b 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3075,7 +3075,7 @@ int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 flags, - int is_data); + int level, int is_data); int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index f75fcaf..70b962c 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -60,6 +60,7 @@ struct btrfs_delayed_ref_node { struct btrfs_delayed_extent_op { struct btrfs_disk_key key; u64 flags_to_set; + int level; unsigned int update_key:1; unsigned int update_flags:1; unsigned int is_data:1; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 2305b5c..c4c94b3 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2070,8 +2070,7 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans, u32 item_size; int ret; int err = 0; - int metadata = (node->type == BTRFS_TREE_BLOCK_REF_KEY || - node->type == BTRFS_SHARED_BLOCK_REF_KEY); + int metadata = !extent_op->is_data; if (trans->aborted) return 0; @@ -2086,11 +2085,8 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans, key.objectid = node->bytenr; if (metadata) { - struct btrfs_delayed_tree_ref 
*tree_ref; - - tree_ref = btrfs_delayed_node_to_tree_ref(node); key.type = BTRFS_METADATA_ITEM_KEY; - key.offset = tree_ref->level; + key.offset = extent_op->level; } else { key.type = BTRFS_EXTENT_ITEM_KEY; key.offset = node->num_bytes; @@ -2719,7 +2715,7 @@ out: int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 flags, - int is_data) + int level, int is_data) { struct btrfs_delayed_extent_op *extent_op; int ret; @@ -2732,6 +2728,7 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, extent_op->update_flags = 1; extent_op->update_key = 0; extent_op->is_data = is_data ? 1 : 0; + extent_op->level = level; ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr, num_bytes, extent_op); @@ -6763,6 +6760,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, extent_op->update_key = 1; extent_op->update_flags = 1; extent_op->is_data = 0; + extent_op->level = level; ret = btrfs_add_delayed_tree_ref(root->fs_info, trans, ins.objectid, @@ -6934,7 +6932,8 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, ret = btrfs_dec_ref(trans, root, eb, 0, wc->for_reloc); BUG_ON(ret); /* -ENOMEM */ ret = btrfs_set_disk_extent_flags(trans, root, eb->start, - eb->len, flag, 0); + eb->len, flag, + btrfs_header_level(eb), 0); BUG_ON(ret); /* -ENOMEM */ wc->flags[level] |= flag; } -- cgit v0.10.2 From 49688107527a24b0ed3780576257a1225902180b Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Tue, 7 May 2013 17:28:03 +0000 Subject: Btrfs: don't allow device replace on RAID5/RAID6 This is not yet supported and causes crashes. One sad user reported that it destroyed his filesystem. One failure is in __btrfs_map_block+0xc1f calling kmalloc(0). 0x5f21f is in __btrfs_map_block (fs/btrfs/volumes.c:4923). 
4918 num_stripes = map->num_stripes; 4919 max_errors = nr_parity_stripes(map); 4920 4921 raid_map = kmalloc(sizeof(u64) * num_stripes, 4922 GFP_NOFS); 4923 if (!raid_map) { 4924 ret = -ENOMEM; 4925 goto out; 4926 } 4927 There might be more issues. Until this is really tested, don't allow users to start the procedure on RAID5/RAID6 filesystems. Signed-off-by: Stefan Behrens Signed-off-by: Josef Bacik diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 7ba7b39..65241f3 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -313,6 +313,11 @@ int btrfs_dev_replace_start(struct btrfs_root *root, struct btrfs_device *tgt_device = NULL; struct btrfs_device *src_device = NULL; + if (btrfs_fs_incompat(fs_info, RAID56)) { + pr_warn("btrfs: dev_replace cannot yet handle RAID5/RAID6\n"); + return -EINVAL; + } + switch (args->start.cont_reading_from_srcdev_mode) { case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS: case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID: -- cgit v0.10.2 From c16c2e2e51c2f0951fffa73c343b8fcb641108ba Mon Sep 17 00:00:00 2001 From: Wang Shilong Date: Wed, 8 May 2013 08:10:25 +0000 Subject: Btrfs: fix possible memory leak in the find_parent_nodes() In the find_parent_nodes(), if read_tree_block() fails, we can not return directly, we should free some allocated memory otherwise memory leak happens. 
Signed-off-by: Wang Shilong Signed-off-by: Josef Bacik diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index b4fb415..290e347 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -918,7 +918,8 @@ again: ref->parent, bsz, 0); if (!eb || !extent_buffer_uptodate(eb)) { free_extent_buffer(eb); - return -EIO; + ret = -EIO; + goto out; } ret = find_extent_in_eb(eb, bytenr, *extent_item_pos, &eie); -- cgit v0.10.2 From 379cde741b220091d2124fb500b178b90ad7f460 Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Wed, 8 May 2013 08:56:09 +0000 Subject: Btrfs: fix possible memory leak in replace_path() In replace_path(), if read_tree_block() fails, we cannot return directly, we should free some allocated memory otherwise memory leak happens. Similar to Wang's "Btrfs: fix possible memory leak in the find_parent_nodes()" patch, the current commit fixes an issue that is related to the "Btrfs: fix all callers of read_tree_block" commit. Signed-off-by: Stefan Behrens Signed-off-by: Josef Bacik diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 704a1b8..5c5b8bb 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1773,7 +1773,7 @@ again: if (!eb || !extent_buffer_uptodate(eb)) { ret = (!eb) ? -ENOMEM : -EIO; free_extent_buffer(eb); - return ret; + break; } btrfs_tree_lock(eb); if (cow) { -- cgit v0.10.2 From 8250dabedb633e162bce89f2aacf5e65fa9e6464 Mon Sep 17 00:00:00 2001 From: Andreas Philipp Date: Sat, 11 May 2013 11:13:03 +0000 Subject: Correct allowed raid levels on balance. Raid5 with 3 devices is well defined while the old logic allowed raid5 only with a minimum of 4 devices when converting the block group profile via btrfs balance. Creating a raid5 with just three devices using mkfs.btrfs worked always as expected. This is now fixed and the whole logic is rewritten. 
Signed-off-by: Andreas Philipp Signed-off-by: Josef Bacik diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index a191bac..062e930 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3120,14 +3120,13 @@ int btrfs_balance(struct btrfs_balance_control *bctl, allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE; if (num_devices == 1) allowed |= BTRFS_BLOCK_GROUP_DUP; - else if (num_devices < 4) + else if (num_devices > 1) allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1); - else - allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_RAID10 | - BTRFS_BLOCK_GROUP_RAID5 | - BTRFS_BLOCK_GROUP_RAID6); - + if (num_devices > 2) + allowed |= BTRFS_BLOCK_GROUP_RAID5; + if (num_devices > 3) + allowed |= (BTRFS_BLOCK_GROUP_RAID10 | + BTRFS_BLOCK_GROUP_RAID6); if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) && (!alloc_profile_is_valid(bctl->data.target, 1) || (bctl->data.target & ~allowed))) { -- cgit v0.10.2 From 7cfa9e51d2948ae90e7599cc114dcce2c7c2b1fc Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Mon, 13 May 2013 13:55:08 +0000 Subject: Btrfs: don't abort the current transaction if there is no enough space for inode cache The filesystem with inode cache was forced to be read-only when we umounted it. Steps to reproduce: # mkfs.btrfs -f ${DEV} # mount -o inode_cache ${DEV} ${MNT} # dd if=/dev/zero of=${MNT}/file1 bs=1M count=8192 # btrfs fi syn ${MNT} # dd if=${MNT}/file1 of=/dev/null bs=1M # rm -f ${MNT}/file1 # btrfs fi syn ${MNT} # umount ${MNT} It is because there was no enough space to do inode cache truncation, and then we aborted the current transaction. But no space error is not a serious problem when we write out the inode cache, and it is safe that we just skip this step if we meet this problem. So we need not abort the current transaction. 
Reported-by: Tsutomu Itoh Signed-off-by: Miao Xie Tested-by: Tsutomu Itoh Signed-off-by: Josef Bacik diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index d26f67a..9818d4a 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -468,7 +468,8 @@ again: if (i_size_read(inode) > 0) { ret = btrfs_truncate_free_space_cache(root, trans, path, inode); if (ret) { - btrfs_abort_transaction(trans, root, ret); + if (ret != -ENOSPC) + btrfs_abort_transaction(trans, root, ret); goto out_put; } } -- cgit v0.10.2 From 7b61cd92242542944fc27024900c495a6a7b3396 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Mon, 13 May 2013 13:55:09 +0000 Subject: Btrfs: don't use global block reservation for inode cache truncation It is very likely that there are lots of subvolumes/snapshots in the filesystem, so if we use global block reservation to do inode cache truncation, we may hog all the free space that is reserved in global rsv. So it is better that we do the free space reservation for inode cache truncation by ourselves. 
Cc: Tsutomu Itoh Signed-off-by: Miao Xie Signed-off-by: Josef Bacik diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index c4c94b3..162a66b 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3106,6 +3106,11 @@ again: WARN_ON(ret); if (i_size_read(inode) > 0) { + ret = btrfs_check_trunc_cache_free_space(root, + &root->fs_info->global_block_rsv); + if (ret) + goto out_put; + ret = btrfs_truncate_free_space_cache(root, trans, path, inode); if (ret) diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 6a8bb9c..e530096 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -197,30 +197,32 @@ int create_free_space_inode(struct btrfs_root *root, block_group->key.objectid); } -int btrfs_truncate_free_space_cache(struct btrfs_root *root, - struct btrfs_trans_handle *trans, - struct btrfs_path *path, - struct inode *inode) +int btrfs_check_trunc_cache_free_space(struct btrfs_root *root, + struct btrfs_block_rsv *rsv) { - struct btrfs_block_rsv *rsv; u64 needed_bytes; - loff_t oldsize; - int ret = 0; - - rsv = trans->block_rsv; - trans->block_rsv = &root->fs_info->global_block_rsv; + int ret; /* 1 for slack space, 1 for updating the inode */ needed_bytes = btrfs_calc_trunc_metadata_size(root, 1) + btrfs_calc_trans_metadata_size(root, 1); - spin_lock(&trans->block_rsv->lock); - if (trans->block_rsv->reserved < needed_bytes) { - spin_unlock(&trans->block_rsv->lock); - trans->block_rsv = rsv; - return -ENOSPC; - } - spin_unlock(&trans->block_rsv->lock); + spin_lock(&rsv->lock); + if (rsv->reserved < needed_bytes) + ret = -ENOSPC; + else + ret = 0; + spin_unlock(&rsv->lock); + return ret; +} + +int btrfs_truncate_free_space_cache(struct btrfs_root *root, + struct btrfs_trans_handle *trans, + struct btrfs_path *path, + struct inode *inode) +{ + loff_t oldsize; + int ret = 0; oldsize = i_size_read(inode); btrfs_i_size_write(inode, 0); @@ -232,9 +234,7 @@ int btrfs_truncate_free_space_cache(struct btrfs_root 
*root, */ ret = btrfs_truncate_inode_items(trans, root, inode, 0, BTRFS_EXTENT_DATA_KEY); - if (ret) { - trans->block_rsv = rsv; btrfs_abort_transaction(trans, root, ret); return ret; } @@ -242,7 +242,6 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root, ret = btrfs_update_inode(trans, root, inode); if (ret) btrfs_abort_transaction(trans, root, ret); - trans->block_rsv = rsv; return ret; } diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index 4dc17d8..8b7f19f 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h @@ -54,6 +54,8 @@ int create_free_space_inode(struct btrfs_root *root, struct btrfs_block_group_cache *block_group, struct btrfs_path *path); +int btrfs_check_trunc_cache_free_space(struct btrfs_root *root, + struct btrfs_block_rsv *rsv); int btrfs_truncate_free_space_cache(struct btrfs_root *root, struct btrfs_trans_handle *trans, struct btrfs_path *path, diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index 9818d4a..2c66ddb 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -429,11 +429,12 @@ int btrfs_save_ino_cache(struct btrfs_root *root, num_bytes = trans->bytes_reserved; /* * 1 item for inode item insertion if need - * 3 items for inode item update (in the worst case) + * 4 items for inode item update (in the worst case) + * 1 items for slack space if we need do truncation * 1 item for free space object * 3 items for pre-allocation */ - trans->bytes_reserved = btrfs_calc_trans_metadata_size(root, 8); + trans->bytes_reserved = btrfs_calc_trans_metadata_size(root, 10); ret = btrfs_block_rsv_add(root, trans->block_rsv, trans->bytes_reserved, BTRFS_RESERVE_NO_FLUSH); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 5c5b8bb..395b820 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -3350,6 +3350,11 @@ static int delete_block_group_cache(struct btrfs_fs_info *fs_info, } truncate: + ret = btrfs_check_trunc_cache_free_space(root, + 
&fs_info->global_block_rsv); + if (ret) + goto out; + path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; -- cgit v0.10.2 From b586b32374909863311b7c916c7c0c709141e35a Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Mon, 13 May 2013 13:55:10 +0000 Subject: Btrfs: optimize the error handle of use_block_rsv() cc: Tsutomu Itoh Signed-off-by: Miao Xie Signed-off-by: Josef Bacik diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 162a66b..5e2c0bf 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -6656,48 +6656,39 @@ use_block_rsv(struct btrfs_trans_handle *trans, block_rsv = get_block_rsv(trans, root); - if (block_rsv->size == 0) { - ret = reserve_metadata_bytes(root, block_rsv, blocksize, - BTRFS_RESERVE_NO_FLUSH); - /* - * If we couldn't reserve metadata bytes try and use some from - * the global reserve. - */ - if (ret && block_rsv != global_rsv) { - ret = block_rsv_use_bytes(global_rsv, blocksize); - if (!ret) - return global_rsv; - return ERR_PTR(ret); - } else if (ret) { - return ERR_PTR(ret); - } - return block_rsv; - } + if (unlikely(block_rsv->size == 0)) + goto try_reserve; ret = block_rsv_use_bytes(block_rsv, blocksize); if (!ret) return block_rsv; - if (ret && !block_rsv->failfast) { - if (btrfs_test_opt(root, ENOSPC_DEBUG)) { - static DEFINE_RATELIMIT_STATE(_rs, - DEFAULT_RATELIMIT_INTERVAL * 10, - /*DEFAULT_RATELIMIT_BURST*/ 1); - if (__ratelimit(&_rs)) - WARN(1, KERN_DEBUG - "btrfs: block rsv returned %d\n", ret); - } - ret = reserve_metadata_bytes(root, block_rsv, blocksize, - BTRFS_RESERVE_NO_FLUSH); - if (!ret) { - return block_rsv; - } else if (ret && block_rsv != global_rsv) { - ret = block_rsv_use_bytes(global_rsv, blocksize); - if (!ret) - return global_rsv; - } - } - return ERR_PTR(-ENOSPC); + if (block_rsv->failfast) + return ERR_PTR(ret); + + if (btrfs_test_opt(root, ENOSPC_DEBUG)) { + static DEFINE_RATELIMIT_STATE(_rs, + DEFAULT_RATELIMIT_INTERVAL * 10, + /*DEFAULT_RATELIMIT_BURST*/ 1); + if (__ratelimit(&_rs)) 
+ WARN(1, KERN_DEBUG + "btrfs: block rsv returned %d\n", ret); + } +try_reserve: + ret = reserve_metadata_bytes(root, block_rsv, blocksize, + BTRFS_RESERVE_NO_FLUSH); + if (!ret) + return block_rsv; + /* + * If we couldn't reserve metadata bytes try and use some from + * the global reserve. + */ + if (block_rsv->type != BTRFS_BLOCK_RSV_GLOBAL) { + ret = block_rsv_use_bytes(global_rsv, blocksize); + if (!ret) + return global_rsv; + } + return ERR_PTR(ret); } static void unuse_block_rsv(struct btrfs_fs_info *fs_info, -- cgit v0.10.2 From 5881cfc924c8143dbc3e1f343516fc6527eb8311 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Mon, 13 May 2013 13:55:11 +0000 Subject: Btrfs: don't steal the reserved space from the global reserve if their space type is different If the type of the space we need is different with the global reserve, we can not steal the space from the global reserve, because we can not allocate the space from the free space cache that the global reserve points to. Cc: Tsutomu Itoh Signed-off-by: Miao Xie Signed-off-by: Josef Bacik diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 5e2c0bf..54e63b2 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -6681,9 +6681,11 @@ try_reserve: return block_rsv; /* * If we couldn't reserve metadata bytes try and use some from - * the global reserve. + * the global reserve if its space type is the same as the global + * reservation. 
*/ - if (block_rsv->type != BTRFS_BLOCK_RSV_GLOBAL) { + if (block_rsv->type != BTRFS_BLOCK_RSV_GLOBAL && + block_rsv->space_info == global_rsv->space_info) { ret = block_rsv_use_bytes(global_rsv, blocksize); if (!ret) return global_rsv; -- cgit v0.10.2 From d88033dbf4c23279b012725876f1e164e09644ff Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Mon, 13 May 2013 13:55:12 +0000 Subject: Btrfs: update the global reserve if it is empty Before applying this patch, we reserved the space for the global reserve by the minimum unit if we found it is empty, it was unreasonable and inefficient, because if the global reserve space was depleted, it implied that the size of the global reserve was too small. In this case, we should update the global reserve and fill it. Cc: Tsutomu Itoh Signed-off-by: Miao Xie Signed-off-by: Josef Bacik diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 54e63b2..42f5e61 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -6653,12 +6653,13 @@ use_block_rsv(struct btrfs_trans_handle *trans, struct btrfs_block_rsv *block_rsv; struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; int ret; + bool global_updated = false; block_rsv = get_block_rsv(trans, root); if (unlikely(block_rsv->size == 0)) goto try_reserve; - +again: ret = block_rsv_use_bytes(block_rsv, blocksize); if (!ret) return block_rsv; @@ -6666,6 +6667,12 @@ use_block_rsv(struct btrfs_trans_handle *trans, if (block_rsv->failfast) return ERR_PTR(ret); + if (block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL && !global_updated) { + global_updated = true; + update_global_block_rsv(root->fs_info); + goto again; + } + if (btrfs_test_opt(root, ENOSPC_DEBUG)) { static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL * 10, -- cgit v0.10.2 From b9aa55bed1c1a3a329da31884b643c62d57ebb21 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Tue, 14 May 2013 02:12:15 +0000 Subject: Btrfs: return errno if possible when we fail to allocate memory We need to set return 
value explicitly, otherwise we'll lose the error value. Signed-off-by: Liu Bo Signed-off-by: Josef Bacik diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 1669c3b..99a9c25 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -714,8 +714,10 @@ retry: async_extent->ram_size - 1, 0); em = alloc_extent_map(); - if (!em) + if (!em) { + ret = -ENOMEM; goto out_free_reserve; + } em->start = async_extent->start; em->len = async_extent->ram_size; em->orig_start = em->start; @@ -922,8 +924,10 @@ static noinline int __cow_file_range(struct btrfs_trans_handle *trans, } em = alloc_extent_map(); - if (!em) + if (!em) { + ret = -ENOMEM; goto out_reserve; + } em->start = start; em->orig_start = em->start; ram_size = ins.offset; -- cgit v0.10.2 From 89042e5ad23d50449691141334f30d53d6271266 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Wed, 15 May 2013 07:48:15 +0000 Subject: Btrfs: fix accessing a freed tree root inode_tree_del() will move the tree root into the dead root list, and then the tree will be destroyed by the cleaner. So if we remove the delayed node which is cached in the inode after inode_tree_del(), we may access a freed tree root. Fix it. 
Signed-off-by: Miao Xie Signed-off-by: Josef Bacik diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 99a9c25..790eceb 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4727,6 +4727,7 @@ void btrfs_evict_inode(struct inode *inode) btrfs_end_transaction(trans, root); btrfs_btree_balance_dirty(root); no_delete: + btrfs_remove_delayed_node(inode); clear_inode(inode); return; } @@ -7982,7 +7983,6 @@ void btrfs_destroy_inode(struct inode *inode) inode_tree_del(inode); btrfs_drop_extent_cache(inode, 0, (u64)-1, 0); free: - btrfs_remove_delayed_node(inode); call_rcu(&inode->i_rcu, btrfs_i_callback); } -- cgit v0.10.2 From e1409cef85894f96f4bddc6633d64d1c5275e2a3 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Wed, 15 May 2013 07:48:16 +0000 Subject: Btrfs: fix unprotected root node of the subvolume's inode rb-tree The root node of the rb-tree may be changed, so we should get it under the lock. Fix it. Signed-off-by: Miao Xie Signed-off-by: Josef Bacik diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 790eceb..19eef3e 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4843,14 +4843,13 @@ static void inode_tree_add(struct inode *inode) struct rb_node **p; struct rb_node *parent; u64 ino = btrfs_ino(inode); -again: - p = &root->inode_tree.rb_node; - parent = NULL; if (inode_unhashed(inode)) return; - +again: + parent = NULL; spin_lock(&root->inode_lock); + p = &root->inode_tree.rb_node; while (*p) { parent = *p; entry = rb_entry(parent, struct btrfs_inode, rb_node); -- cgit v0.10.2 From 061594ef171a5ba52b5786688ae766907b0bda2b Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Wed, 15 May 2013 07:48:17 +0000 Subject: Btrfs: pause the space balance when remounting to R/O Signed-off-by: Miao Xie Signed-off-by: Josef Bacik diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index a4807ce..f0857e0 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1263,6 +1263,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) 
btrfs_dev_replace_suspend_for_unmount(fs_info); btrfs_scrub_cancel(fs_info); + btrfs_pause_balance(fs_info); ret = btrfs_commit_super(root); if (ret) -- cgit v0.10.2 From 314297c2a3fbcbda992507f70cd04cc82084e434 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Wed, 15 May 2013 07:48:18 +0000 Subject: Btrfs: remove BUG_ON() in btrfs_read_fs_root_no_radix() We have checked if ->node is NULL or not, so it is unnecessary to use BUG_ON() to check again. Remove it. Signed-off-by: Miao Xie Signed-off-by: Josef Bacik diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index e8b29da..9b9f286 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1513,7 +1513,6 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, } root->commit_root = btrfs_root_node(root); - BUG_ON(!root->node); /* -ENOMEM */ out: if (location->objectid != BTRFS_TREE_LOG_OBJECTID) { root->ref_cows = 1; -- cgit v0.10.2 From b216cbfb52c08300c203abf06ea9519d15d10045 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Wed, 15 May 2013 07:48:21 +0000 Subject: Btrfs: don't invoke btrfs_invalidate_inodes() in the spin lock context btrfs_invalidate_inodes() may sleep, so we should not invoke it in the spin lock context. Fix it. 
Signed-off-by: Miao Xie Signed-off-by: Josef Bacik diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 9b9f286..1b03f83 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3658,8 +3658,11 @@ static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t, ordered_operations); list_del_init(&btrfs_inode->ordered_operations); + spin_unlock(&root->fs_info->ordered_extent_lock); btrfs_invalidate_inodes(btrfs_inode->root); + + spin_lock(&root->fs_info->ordered_extent_lock); } spin_unlock(&root->fs_info->ordered_extent_lock); @@ -3781,8 +3784,11 @@ static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root) list_del_init(&btrfs_inode->delalloc_inodes); clear_bit(BTRFS_INODE_IN_DELALLOC_LIST, &btrfs_inode->runtime_flags); + spin_unlock(&root->fs_info->delalloc_lock); btrfs_invalidate_inodes(btrfs_inode->root); + + spin_lock(&root->fs_info->delalloc_lock); } spin_unlock(&root->fs_info->delalloc_lock); -- cgit v0.10.2 From 17a5adccf3fd01added91f3bf9aa7ee9aa28843b Mon Sep 17 00:00:00 2001 From: Alexandre Oliva Date: Wed, 15 May 2013 11:38:55 -0400 Subject: btrfs: do away with non-whole_page extent I/O end_bio_extent_readpage computes whole_page based on bv_offset and bv_len, without taking into account that blk_update_request may modify them when some of the blocks to be read into a page produce a read error. This would cause the read to unlock only part of the file range associated with the page, which would in turn leave the entire page locked, which would not only keep the process blocked instead of returning -EIO to it, but also prevent any further access to the file. It turns out that btrfs always issues whole-page reads and writes. The special handling of non-whole_page appears to be a mistake or a left-over from a time when this wasn't the case. Indeed, end_bio_extent_writepage distinguished between whole_page and non-whole_page writes but behaved identically in both cases! 
I've replaced the whole_page computations with warnings, just to be sure that we're not issuing partial page reads or writes. The warnings should probably just go away some time. Signed-off-by: Alexandre Oliva Signed-off-by: Josef Bacik diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 3e6e410..ca4355d 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1948,28 +1948,6 @@ static void check_page_uptodate(struct extent_io_tree *tree, struct page *page) } /* - * helper function to unlock a page if all the extents in the tree - * for that page are unlocked - */ -static void check_page_locked(struct extent_io_tree *tree, struct page *page) -{ - u64 start = page_offset(page); - u64 end = start + PAGE_CACHE_SIZE - 1; - if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL)) - unlock_page(page); -} - -/* - * helper function to end page writeback if all the extents - * in the tree for that page are done with writeback - */ -static void check_page_writeback(struct extent_io_tree *tree, - struct page *page) -{ - end_page_writeback(page); -} - -/* * When IO fails, either with EIO or csum verification fails, we * try other mirrors that might have a good copy of the data. This * io_failure_record is used to record state as we go through all the @@ -2398,19 +2376,24 @@ static void end_bio_extent_writepage(struct bio *bio, int err) struct extent_io_tree *tree; u64 start; u64 end; - int whole_page; do { struct page *page = bvec->bv_page; tree = &BTRFS_I(page->mapping->host)->io_tree; - start = page_offset(page) + bvec->bv_offset; - end = start + bvec->bv_len - 1; + /* We always issue full-page reads, but if some block + * in a page fails to read, blk_update_request() will + * advance bv_offset and adjust bv_len to compensate. + * Print a warning for nonzero offsets, and an error + * if they don't add up to a full page. 
*/ + if (bvec->bv_offset || bvec->bv_len != PAGE_CACHE_SIZE) + printk("%s page write in btrfs with offset %u and length %u\n", + bvec->bv_offset + bvec->bv_len != PAGE_CACHE_SIZE + ? KERN_ERR "partial" : KERN_INFO "incomplete", + bvec->bv_offset, bvec->bv_len); - if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE) - whole_page = 1; - else - whole_page = 0; + start = page_offset(page); + end = start + bvec->bv_offset + bvec->bv_len - 1; if (--bvec >= bio->bi_io_vec) prefetchw(&bvec->bv_page->flags); @@ -2418,10 +2401,7 @@ static void end_bio_extent_writepage(struct bio *bio, int err) if (end_extent_writepage(page, err, start, end)) continue; - if (whole_page) - end_page_writeback(page); - else - check_page_writeback(tree, page); + end_page_writeback(page); } while (bvec >= bio->bi_io_vec); bio_put(bio); @@ -2446,7 +2426,6 @@ static void end_bio_extent_readpage(struct bio *bio, int err) struct extent_io_tree *tree; u64 start; u64 end; - int whole_page; int mirror; int ret; @@ -2463,13 +2442,19 @@ static void end_bio_extent_readpage(struct bio *bio, int err) (long int)bio->bi_bdev); tree = &BTRFS_I(page->mapping->host)->io_tree; - start = page_offset(page) + bvec->bv_offset; - end = start + bvec->bv_len - 1; + /* We always issue full-page reads, but if some block + * in a page fails to read, blk_update_request() will + * advance bv_offset and adjust bv_len to compensate. + * Print a warning for nonzero offsets, and an error + * if they don't add up to a full page. */ + if (bvec->bv_offset || bvec->bv_len != PAGE_CACHE_SIZE) + printk("%s page read in btrfs with offset %u and length %u\n", + bvec->bv_offset + bvec->bv_len != PAGE_CACHE_SIZE + ? 
KERN_ERR "partial" : KERN_INFO "incomplete", + bvec->bv_offset, bvec->bv_len); - if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE) - whole_page = 1; - else - whole_page = 0; + start = page_offset(page); + end = start + bvec->bv_offset + bvec->bv_len - 1; if (++bvec <= bvec_end) prefetchw(&bvec->bv_page->flags); @@ -2528,23 +2513,13 @@ static void end_bio_extent_readpage(struct bio *bio, int err) } unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC); - if (whole_page) { - if (uptodate) { - SetPageUptodate(page); - } else { - ClearPageUptodate(page); - SetPageError(page); - } - unlock_page(page); + if (uptodate) { + SetPageUptodate(page); } else { - if (uptodate) { - check_page_uptodate(tree, page); - } else { - ClearPageUptodate(page); - SetPageError(page); - } - check_page_locked(tree, page); + ClearPageUptodate(page); + SetPageError(page); } + unlock_page(page); } while (bvec <= bvec_end); bio_put(bio); -- cgit v0.10.2 From 3a6cad9009c85e29e83aafc8ac00b1dd5067fc5f Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Thu, 16 May 2013 14:48:19 +0000 Subject: Btrfs: explicitly use global_block_rsv for quota_tree The quota_tree was previously set up to use the empty_block_rsv, which would be problematic when the filesystem is filled up and ENOSPC happens during internal operations while the quota tree is updated and COWed (when the btrfs_qgroup_info_item items are written). In fact, use_block_rsv(), which is used in btrfs_cow_block(), falls back to the global_block_rsv in this case. But just in order to make it more clear what is happening, change it to explicitly use the global_block_rsv.
Signed-off-by: Stefan Behrens Signed-off-by: Josef Bacik diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 42f5e61..df472ab 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4564,6 +4564,8 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info) fs_info->csum_root->block_rsv = &fs_info->global_block_rsv; fs_info->dev_root->block_rsv = &fs_info->global_block_rsv; fs_info->tree_root->block_rsv = &fs_info->global_block_rsv; + if (fs_info->quota_root) + fs_info->quota_root->block_rsv = &fs_info->global_block_rsv; fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv; update_global_block_rsv(fs_info); -- cgit v0.10.2 From 655b09fe540b73edeaabfb4c2d700be51a1f8bce Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 17 May 2013 14:06:51 -0400 Subject: Btrfs: make sure roots are assigned before freeing their nodes If we fail to load the chunk tree we'll call free_root_pointers, except we may not have assigned the roots for the dev_root/extent_root/csum_root yet, so we could NULL pointer deref at this point. Just add checks to make sure these roots are set to keep us from panicking.
Thanks, Signed-off-by: Josef Bacik diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 1b03f83..4bdb052 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1987,30 +1987,33 @@ static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root) { free_extent_buffer(info->tree_root->node); free_extent_buffer(info->tree_root->commit_root); - free_extent_buffer(info->dev_root->node); - free_extent_buffer(info->dev_root->commit_root); - free_extent_buffer(info->extent_root->node); - free_extent_buffer(info->extent_root->commit_root); - free_extent_buffer(info->csum_root->node); - free_extent_buffer(info->csum_root->commit_root); - if (info->quota_root) { - free_extent_buffer(info->quota_root->node); - free_extent_buffer(info->quota_root->commit_root); - } - info->tree_root->node = NULL; info->tree_root->commit_root = NULL; - info->dev_root->node = NULL; - info->dev_root->commit_root = NULL; - info->extent_root->node = NULL; - info->extent_root->commit_root = NULL; - info->csum_root->node = NULL; - info->csum_root->commit_root = NULL; + + if (info->dev_root) { + free_extent_buffer(info->dev_root->node); + free_extent_buffer(info->dev_root->commit_root); + info->dev_root->node = NULL; + info->dev_root->commit_root = NULL; + } + if (info->extent_root) { + free_extent_buffer(info->extent_root->node); + free_extent_buffer(info->extent_root->commit_root); + info->extent_root->node = NULL; + info->extent_root->commit_root = NULL; + } + if (info->csum_root) { + free_extent_buffer(info->csum_root->node); + free_extent_buffer(info->csum_root->commit_root); + info->csum_root->node = NULL; + info->csum_root->commit_root = NULL; + } if (info->quota_root) { + free_extent_buffer(info->quota_root->node); + free_extent_buffer(info->quota_root->commit_root); info->quota_root->node = NULL; info->quota_root->commit_root = NULL; } - if (chunk_root) { free_extent_buffer(info->chunk_root->node); free_extent_buffer(info->chunk_root->commit_root); -- cgit v0.10.2 From 
9be3395bcd4ad4af76476ac38152b4cafa6b6159 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 17 May 2013 18:30:14 -0400 Subject: Btrfs: use a btrfs bioset instead of abusing bio internals Btrfs has been pointer tagging bi_private and using bi_bdev to store the stripe index and mirror number of failed IOs. As bios bubble back up through the call chain, we use these to decide if and how to retry our IOs. They are also used to count IO failures on a per-device basis. Recently a bio tracepoint was added, which led to crashes because we were abusing bi_bdev. This commit adds a btrfs bioset, and creates explicit fields for the mirror number and stripe index. The plan is to extend this structure for all of the fields currently in struct btrfs_bio, which will mean one less kmalloc in our IO path. Signed-off-by: Chris Mason Reported-by: Tejun Heo diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 18af6f4..1431a69 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -1700,7 +1700,7 @@ static int btrfsic_read_block(struct btrfsic_state *state, unsigned int j; DECLARE_COMPLETION_ONSTACK(complete); - bio = bio_alloc(GFP_NOFS, num_pages - i); + bio = btrfs_io_bio_alloc(GFP_NOFS, num_pages - i); if (!bio) { printk(KERN_INFO "btrfsic: bio_alloc() for %u pages failed!\n", diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 4e9ebe1..ca0ea99 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3128,7 +3128,7 @@ static int write_dev_flush(struct btrfs_device *device, int wait) * caller */ device->flush_bio = NULL; - bio = bio_alloc(GFP_NOFS, 0); + bio = btrfs_io_bio_alloc(GFP_NOFS, 0); if (!bio) return -ENOMEM; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index d2ac518..fe1d6c3 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -23,6 +23,7 @@ static struct kmem_cache *extent_state_cache; static struct kmem_cache *extent_buffer_cache; +static struct bio_set *btrfs_bioset; #ifdef CONFIG_BTRFS_DEBUG static
LIST_HEAD(buffers); @@ -125,10 +126,20 @@ int __init extent_io_init(void) SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); if (!extent_buffer_cache) goto free_state_cache; + + btrfs_bioset = bioset_create(BIO_POOL_SIZE, + offsetof(struct btrfs_io_bio, bio)); + if (!btrfs_bioset) + goto free_buffer_cache; return 0; +free_buffer_cache: + kmem_cache_destroy(extent_buffer_cache); + extent_buffer_cache = NULL; + free_state_cache: kmem_cache_destroy(extent_state_cache); + extent_state_cache = NULL; return -ENOMEM; } @@ -145,6 +156,8 @@ void extent_io_exit(void) kmem_cache_destroy(extent_state_cache); if (extent_buffer_cache) kmem_cache_destroy(extent_buffer_cache); + if (btrfs_bioset) + bioset_free(btrfs_bioset); } void extent_io_tree_init(struct extent_io_tree *tree, @@ -2046,7 +2059,7 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start, if (btrfs_is_parity_mirror(map_tree, logical, length, mirror_num)) return 0; - bio = bio_alloc(GFP_NOFS, 1); + bio = btrfs_io_bio_alloc(GFP_NOFS, 1); if (!bio) return -EIO; bio->bi_private = &compl; @@ -2336,7 +2349,7 @@ static int bio_readpage_error(struct bio *failed_bio, struct page *page, return -EIO; } - bio = bio_alloc(GFP_NOFS, 1); + bio = btrfs_io_bio_alloc(GFP_NOFS, 1); if (!bio) { free_io_failure(inode, failrec, 0); return -EIO; @@ -2457,10 +2470,11 @@ static void end_bio_extent_readpage(struct bio *bio, int err) struct page *page = bvec->bv_page; struct extent_state *cached = NULL; struct extent_state *state; + struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); pr_debug("end_bio_extent_readpage: bi_sector=%llu, err=%d, " - "mirror=%ld\n", (u64)bio->bi_sector, err, - (long int)bio->bi_bdev); + "mirror=%lu\n", (u64)bio->bi_sector, err, + io_bio->mirror_num); tree = &BTRFS_I(page->mapping->host)->io_tree; start = page_offset(page) + bvec->bv_offset; @@ -2485,7 +2499,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err) } spin_unlock(&tree->lock); - mirror = (int)(unsigned long)bio->bi_bdev; + mirror = 
io_bio->mirror_num; if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) { ret = tree->ops->readpage_end_io_hook(page, start, end, state, mirror); @@ -2550,17 +2564,23 @@ static void end_bio_extent_readpage(struct bio *bio, int err) bio_put(bio); } +/* + * this allocates from the btrfs_bioset. We're returning a bio right now + * but you can call btrfs_io_bio for the appropriate container_of magic + */ struct bio * btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, gfp_t gfp_flags) { struct bio *bio; - bio = bio_alloc(gfp_flags, nr_vecs); + bio = bio_alloc_bioset(gfp_flags, nr_vecs, btrfs_bioset); if (bio == NULL && (current->flags & PF_MEMALLOC)) { - while (!bio && (nr_vecs /= 2)) - bio = bio_alloc(gfp_flags, nr_vecs); + while (!bio && (nr_vecs /= 2)) { + bio = bio_alloc_bioset(gfp_flags, + nr_vecs, btrfs_bioset); + } } if (bio) { @@ -2571,6 +2591,19 @@ btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, return bio; } +struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask) +{ + return bio_clone_bioset(bio, gfp_mask, btrfs_bioset); +} + + +/* this also allocates from the btrfs_bioset */ +struct bio *btrfs_io_bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs) +{ + return bio_alloc_bioset(gfp_mask, nr_iovecs, btrfs_bioset); +} + + static int __must_check submit_one_bio(int rw, struct bio *bio, int mirror_num, unsigned long bio_flags) { diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index a2c03a1..41fb81e 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -336,6 +336,8 @@ int extent_clear_unlock_delalloc(struct inode *inode, struct bio * btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, gfp_t gfp_flags); +struct bio *btrfs_io_bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs); +struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask); struct btrfs_fs_info; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 1669c3b..d59dddf 100644 --- a/fs/btrfs/inode.c +++ 
b/fs/btrfs/inode.c @@ -6927,7 +6927,11 @@ struct btrfs_dio_private { /* IO errors */ int errors; + /* orig_bio is our btrfs_io_bio */ struct bio *orig_bio; + + /* dio_bio came from fs/direct-io.c */ + struct bio *dio_bio; }; static void btrfs_endio_direct_read(struct bio *bio, int err) @@ -6937,6 +6941,7 @@ static void btrfs_endio_direct_read(struct bio *bio, int err) struct bio_vec *bvec = bio->bi_io_vec; struct inode *inode = dip->inode; struct btrfs_root *root = BTRFS_I(inode)->root; + struct bio *dio_bio; u64 start; start = dip->logical_offset; @@ -6976,14 +6981,15 @@ failed: unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset, dip->logical_offset + dip->bytes - 1); - bio->bi_private = dip->private; + dio_bio = dip->dio_bio; kfree(dip); /* If we had a csum failure make sure to clear the uptodate flag */ if (err) - clear_bit(BIO_UPTODATE, &bio->bi_flags); - dio_end_io(bio, err); + clear_bit(BIO_UPTODATE, &dio_bio->bi_flags); + dio_end_io(dio_bio, err); + bio_put(bio); } static void btrfs_endio_direct_write(struct bio *bio, int err) @@ -6994,6 +7000,7 @@ static void btrfs_endio_direct_write(struct bio *bio, int err) struct btrfs_ordered_extent *ordered = NULL; u64 ordered_offset = dip->logical_offset; u64 ordered_bytes = dip->bytes; + struct bio *dio_bio; int ret; if (err) @@ -7021,14 +7028,15 @@ out_test: goto again; } out_done: - bio->bi_private = dip->private; + dio_bio = dip->dio_bio; kfree(dip); /* If we had an error make sure to clear the uptodate flag */ if (err) - clear_bit(BIO_UPTODATE, &bio->bi_flags); - dio_end_io(bio, err); + clear_bit(BIO_UPTODATE, &dio_bio->bi_flags); + dio_end_io(dio_bio, err); + bio_put(bio); } static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw, @@ -7064,10 +7072,10 @@ static void btrfs_end_dio_bio(struct bio *bio, int err) if (!atomic_dec_and_test(&dip->pending_bios)) goto out; - if (dip->errors) + if (dip->errors) { bio_io_error(dip->orig_bio); - else { - set_bit(BIO_UPTODATE, 
&dip->orig_bio->bi_flags); + } else { + set_bit(BIO_UPTODATE, &dip->dio_bio->bi_flags); bio_endio(dip->orig_bio, 0); } out: @@ -7242,25 +7250,34 @@ out_err: return 0; } -static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode, - loff_t file_offset) +static void btrfs_submit_direct(int rw, struct bio *dio_bio, + struct inode *inode, loff_t file_offset) { struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_dio_private *dip; - struct bio_vec *bvec = bio->bi_io_vec; + struct bio_vec *bvec = dio_bio->bi_io_vec; + struct bio *io_bio; int skip_sum; int write = rw & REQ_WRITE; int ret = 0; skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; + io_bio = btrfs_bio_clone(dio_bio, GFP_NOFS); + + if (!io_bio) { + ret = -ENOMEM; + goto free_ordered; + } + dip = kmalloc(sizeof(*dip), GFP_NOFS); if (!dip) { ret = -ENOMEM; - goto free_ordered; + goto free_io_bio; } - dip->private = bio->bi_private; + dip->private = dio_bio->bi_private; + io_bio->bi_private = dio_bio->bi_private; dip->inode = inode; dip->logical_offset = file_offset; @@ -7268,22 +7285,27 @@ static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode, do { dip->bytes += bvec->bv_len; bvec++; - } while (bvec <= (bio->bi_io_vec + bio->bi_vcnt - 1)); + } while (bvec <= (dio_bio->bi_io_vec + dio_bio->bi_vcnt - 1)); - dip->disk_bytenr = (u64)bio->bi_sector << 9; - bio->bi_private = dip; + dip->disk_bytenr = (u64)dio_bio->bi_sector << 9; + io_bio->bi_private = dip; dip->errors = 0; - dip->orig_bio = bio; + dip->orig_bio = io_bio; + dip->dio_bio = dio_bio; atomic_set(&dip->pending_bios, 0); if (write) - bio->bi_end_io = btrfs_endio_direct_write; + io_bio->bi_end_io = btrfs_endio_direct_write; else - bio->bi_end_io = btrfs_endio_direct_read; + io_bio->bi_end_io = btrfs_endio_direct_read; ret = btrfs_submit_direct_hook(rw, dip, skip_sum); if (!ret) return; + +free_io_bio: + bio_put(io_bio); + free_ordered: /* * If this is a write, we need to clean up the reserved space and 
kill @@ -7299,7 +7321,7 @@ free_ordered: btrfs_put_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); } - bio_endio(bio, ret); + bio_endio(dio_bio, ret); } static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *iocb, diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 0740621..0525e13 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1050,7 +1050,7 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio, } /* put a new bio on the list */ - bio = bio_alloc(GFP_NOFS, bio_max_len >> PAGE_SHIFT?:1); + bio = btrfs_io_bio_alloc(GFP_NOFS, bio_max_len >> PAGE_SHIFT?:1); if (!bio) return -ENOMEM; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index f489e24..79bd479 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1296,7 +1296,7 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info, } WARN_ON(!page->page); - bio = bio_alloc(GFP_NOFS, 1); + bio = btrfs_io_bio_alloc(GFP_NOFS, 1); if (!bio) { page->io_error = 1; sblock->no_io_error_seen = 0; @@ -1431,7 +1431,7 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, return -EIO; } - bio = bio_alloc(GFP_NOFS, 1); + bio = btrfs_io_bio_alloc(GFP_NOFS, 1); if (!bio) return -EIO; bio->bi_bdev = page_bad->dev->bdev; @@ -1522,7 +1522,7 @@ again: sbio->dev = wr_ctx->tgtdev; bio = sbio->bio; if (!bio) { - bio = bio_alloc(GFP_NOFS, wr_ctx->pages_per_wr_bio); + bio = btrfs_io_bio_alloc(GFP_NOFS, wr_ctx->pages_per_wr_bio); if (!bio) { mutex_unlock(&wr_ctx->wr_lock); return -ENOMEM; @@ -1930,7 +1930,7 @@ again: sbio->dev = spage->dev; bio = sbio->bio; if (!bio) { - bio = bio_alloc(GFP_NOFS, sctx->pages_per_rd_bio); + bio = btrfs_io_bio_alloc(GFP_NOFS, sctx->pages_per_rd_bio); if (!bio) return -ENOMEM; sbio->bio = bio; @@ -3307,7 +3307,7 @@ static int write_page_nocow(struct scrub_ctx *sctx, "btrfs: scrub write_page_nocow(bdev == NULL) is unexpected!\n"); return -EIO; } - bio = bio_alloc(GFP_NOFS, 1); + bio = btrfs_io_bio_alloc(GFP_NOFS, 1); if (!bio) 
{ spin_lock(&sctx->stat_lock); sctx->stat.malloc_errors++; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index a191bac..317afc7 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -5019,42 +5019,16 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, return 0; } -static void *merge_stripe_index_into_bio_private(void *bi_private, - unsigned int stripe_index) -{ - /* - * with single, dup, RAID0, RAID1 and RAID10, stripe_index is - * at most 1. - * The alternative solution (instead of stealing bits from the - * pointer) would be to allocate an intermediate structure - * that contains the old private pointer plus the stripe_index. - */ - BUG_ON((((uintptr_t)bi_private) & 3) != 0); - BUG_ON(stripe_index > 3); - return (void *)(((uintptr_t)bi_private) | stripe_index); -} - -static struct btrfs_bio *extract_bbio_from_bio_private(void *bi_private) -{ - return (struct btrfs_bio *)(((uintptr_t)bi_private) & ~((uintptr_t)3)); -} - -static unsigned int extract_stripe_index_from_bio_private(void *bi_private) -{ - return (unsigned int)((uintptr_t)bi_private) & 3; -} - static void btrfs_end_bio(struct bio *bio, int err) { - struct btrfs_bio *bbio = extract_bbio_from_bio_private(bio->bi_private); + struct btrfs_bio *bbio = bio->bi_private; int is_orig_bio = 0; if (err) { atomic_inc(&bbio->error); if (err == -EIO || err == -EREMOTEIO) { unsigned int stripe_index = - extract_stripe_index_from_bio_private( - bio->bi_private); + btrfs_io_bio(bio)->stripe_index; struct btrfs_device *dev; BUG_ON(stripe_index >= bbio->num_stripes); @@ -5084,8 +5058,7 @@ static void btrfs_end_bio(struct bio *bio, int err) } bio->bi_private = bbio->private; bio->bi_end_io = bbio->end_io; - bio->bi_bdev = (struct block_device *) - (unsigned long)bbio->mirror_num; + btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; /* only send an error to the higher layers if it is * beyond the tolerance of the btrfs bio */ @@ -5211,8 +5184,7 @@ static void submit_stripe_bio(struct btrfs_root *root, 
struct btrfs_bio *bbio, struct btrfs_device *dev = bbio->stripes[dev_nr].dev; bio->bi_private = bbio; - bio->bi_private = merge_stripe_index_into_bio_private( - bio->bi_private, (unsigned int)dev_nr); + btrfs_io_bio(bio)->stripe_index = dev_nr; bio->bi_end_io = btrfs_end_bio; bio->bi_sector = physical >> 9; #ifdef DEBUG @@ -5273,8 +5245,7 @@ static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical) if (atomic_dec_and_test(&bbio->stripes_pending)) { bio->bi_private = bbio->private; bio->bi_end_io = bbio->end_io; - bio->bi_bdev = (struct block_device *) - (unsigned long)bbio->mirror_num; + btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; bio->bi_sector = logical >> 9; kfree(bbio); bio_endio(bio, -EIO); @@ -5352,7 +5323,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, } if (dev_nr < total_devs - 1) { - bio = bio_clone(first_bio, GFP_NOFS); + bio = btrfs_bio_clone(first_bio, GFP_NOFS); BUG_ON(!bio); /* -ENOMEM */ } else { bio = first_bio; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 845ccbb..f6247e2 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -152,6 +152,26 @@ struct btrfs_fs_devices { int rotating; }; +/* + * we need the mirror number and stripe index to be passed around + * the call chain while we are processing end_io (especially errors). + * Really, what we need is a btrfs_bio structure that has this info + * and is properly sized with its stripe array, but we're not there + * quite yet. We have our own btrfs bioset, and all of the bios + * we allocate are actually btrfs_io_bios. We'll cram as much of + * struct btrfs_bio as we can into this over time. + */ +struct btrfs_io_bio { + unsigned long mirror_num; + unsigned long stripe_index; + struct bio bio; +}; + +static inline struct btrfs_io_bio *btrfs_io_bio(struct bio *bio) +{ + return container_of(bio, struct btrfs_io_bio, bio); +} + struct btrfs_bio_stripe { struct btrfs_device *dev; u64 physical; -- cgit v0.10.2