From 3fed40cc97f32bebfd34a55364de9b44dcbede59 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 13 Sep 2012 04:51:36 -0600 Subject: Btrfs: cleanup duplicated division functions div_factor{_fine} has been implemented for two times, cleanup it. And I move them into a independent file named math.h because they are common math functions. Signed-off-by: Miao Xie Signed-off-by: Chris Mason diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 3d3e2c1..7563db7 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -33,6 +33,7 @@ #include "volumes.h" #include "locking.h" #include "free-space-cache.h" +#include "math.h" #undef SCRAMBLE_DELAYED_REFS @@ -649,24 +650,6 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info) rcu_read_unlock(); } -static u64 div_factor(u64 num, int factor) -{ - if (factor == 10) - return num; - num *= factor; - do_div(num, 10); - return num; -} - -static u64 div_factor_fine(u64 num, int factor) -{ - if (factor == 100) - return num; - num *= factor; - do_div(num, 100); - return num; -} - u64 btrfs_find_block_group(struct btrfs_root *root, u64 search_start, u64 search_hint, int owner) { diff --git a/fs/btrfs/math.h b/fs/btrfs/math.h new file mode 100644 index 0000000..b7816ce --- /dev/null +++ b/fs/btrfs/math.h @@ -0,0 +1,44 @@ + +/* + * Copyright (C) 2012 Fujitsu. All rights reserved. + * Written by Miao Xie + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_MATH_H +#define __BTRFS_MATH_H + +#include + +static inline u64 div_factor(u64 num, int factor) +{ + if (factor == 10) + return num; + num *= factor; + do_div(num, 10); + return num; +} + +static inline u64 div_factor_fine(u64 num, int factor) +{ + if (factor == 100) + return num; + num *= factor; + do_div(num, 100); + return num; +} + +#endif diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 0f5ebb7..a8adf26 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -25,7 +25,6 @@ #include #include #include -#include #include "compat.h" #include "ctree.h" #include "extent_map.h" @@ -36,6 +35,7 @@ #include "async-thread.h" #include "check-integrity.h" #include "rcu-string.h" +#include "math.h" static int init_first_rw_device(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -2338,18 +2338,6 @@ static int chunk_profiles_filter(u64 chunk_type, return 1; } -static u64 div_factor_fine(u64 num, int factor) -{ - if (factor <= 0) - return 0; - if (factor >= 100) - return num; - - num *= factor; - do_div(num, 100); - return num; -} - static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset, struct btrfs_balance_args *bargs) { @@ -2514,15 +2502,6 @@ static int should_balance_chunk(struct btrfs_root *root, return 1; } -static u64 div_factor(u64 num, int factor) -{ - if (factor == 10) - return num; - num *= factor; - do_div(num, 10); - return num; -} - static int __btrfs_balance(struct btrfs_fs_info *fs_info) { struct btrfs_balance_control *bctl = fs_info->balance_ctl; -- cgit v0.10.2 From 561c294d4cfb30c4acfa0a243448fc55af730d87 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Tue, 16 Oct 2012 11:32:18 +0000 Subject: Btrfs: fix wrong comment in can_overcommit() The comment is not coincident with the code. Fix it. Signed-off-by: Miao Xie Signed-off-by: Chris Mason diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 7563db7..2cfcce2 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3668,9 +3668,9 @@ static int can_overcommit(struct btrfs_root *root, avail >>= 1; /* - * If we aren't flushing don't let us overcommit too much, say - * 1/8th of the space. If we can flush, let it overcommit up to - * 1/2 of the space. + * If we aren't flushing all things, let us overcommit up to + * 1/2th of the space. If we can flush, don't let us overcommit + * too much, let it overcommit up to 1/8 of the space. */ if (flush) avail >>= 3; -- cgit v0.10.2 From 08e007d2e57744472a9424735a368ffe6d625597 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Tue, 16 Oct 2012 11:33:38 +0000 Subject: Btrfs: improve the noflush reservation In some places(such as: evicting inode), we just can not flush the reserved space of delalloc, flushing the delayed directory index and delayed inode is OK, but we don't try to flush those things and just go back when there is no enough space to be reserved. This patch fixes this problem. We defined 3 types of the flush operations: NO_FLUSH, FLUSH_LIMIT and FLUSH_ALL. If we can in the transaction, we should not flush anything, or the deadlock would happen, so use NO_FLUSH. If we flushing the reserved space of delalloc would cause deadlock, use FLUSH_LIMIT. In the other cases, FLUSH_ALL is used, and we will flush all things. Signed-off-by: Miao Xie Signed-off-by: Chris Mason diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index c72ead8..8fd9fe4 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2900,6 +2900,18 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans, u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags); u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data); void btrfs_clear_space_info_full(struct btrfs_fs_info *info); + +enum btrfs_reserve_flush_enum { + /* If we are in the transaction, we can't flush anything.*/ + BTRFS_RESERVE_NO_FLUSH, + /* + * Flushing delalloc may cause deadlock somewhere, in this + * case, use FLUSH LIMIT + */ + BTRFS_RESERVE_FLUSH_LIMIT, + BTRFS_RESERVE_FLUSH_ALL, +}; + int btrfs_check_data_free_space(struct inode *inode, u64 bytes); void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes); void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, @@ -2919,19 +2931,13 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root, void btrfs_free_block_rsv(struct btrfs_root *root, struct btrfs_block_rsv *rsv); int btrfs_block_rsv_add(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, - u64 num_bytes); -int btrfs_block_rsv_add_noflush(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, - u64 num_bytes); + struct btrfs_block_rsv *block_rsv, u64 num_bytes, + enum btrfs_reserve_flush_enum flush); int btrfs_block_rsv_check(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, int min_factor); int btrfs_block_rsv_refill(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, - u64 min_reserved); -int btrfs_block_rsv_refill_noflush(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, - u64 min_reserved); + struct btrfs_block_rsv *block_rsv, u64 min_reserved, + enum btrfs_reserve_flush_enum flush); int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, struct btrfs_block_rsv *dst_rsv, u64 num_bytes); diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 478f66b..0c6dca5 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -651,7 +651,8 @@ static int btrfs_delayed_inode_reserve_metadata( */ if (!src_rsv || (!trans->bytes_reserved && src_rsv->type != BTRFS_BLOCK_RSV_DELALLOC)) { - ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes); + ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes, + BTRFS_RESERVE_NO_FLUSH); /* * Since we're under a transaction reserve_metadata_bytes could * try to commit the transaction which will make it return @@ -686,7 +687,8 @@ static int btrfs_delayed_inode_reserve_metadata( * reserve something strictly for us. If not be a pain and try * to steal from the delalloc block rsv. */ - ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes); + ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes, + BTRFS_RESERVE_NO_FLUSH); if (!ret) goto out; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 2cfcce2..2136add 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3644,7 +3644,7 @@ out: static int can_overcommit(struct btrfs_root *root, struct btrfs_space_info *space_info, u64 bytes, - int flush) + enum btrfs_reserve_flush_enum flush) { u64 profile = btrfs_get_alloc_profile(root, 0); u64 avail; @@ -3672,7 +3672,7 @@ static int can_overcommit(struct btrfs_root *root, * 1/2th of the space. If we can flush, don't let us overcommit * too much, let it overcommit up to 1/8 of the space. */ - if (flush) + if (flush == BTRFS_RESERVE_FLUSH_ALL) avail >>= 3; else avail >>= 1; @@ -3696,6 +3696,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig, long time_left; unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT; int loops = 0; + enum btrfs_reserve_flush_enum flush; trans = (struct btrfs_trans_handle *)current->journal_info; block_rsv = &root->fs_info->delalloc_block_rsv; @@ -3723,8 +3724,12 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig, wait_event(root->fs_info->async_submit_wait, !atomic_read(&root->fs_info->async_delalloc_pages)); + if (!trans) + flush = BTRFS_RESERVE_FLUSH_ALL; + else + flush = BTRFS_RESERVE_NO_FLUSH; spin_lock(&space_info->lock); - if (can_overcommit(root, space_info, orig, !trans)) { + if (can_overcommit(root, space_info, orig, flush)) { spin_unlock(&space_info->lock); break; } @@ -3882,7 +3887,8 @@ static int flush_space(struct btrfs_root *root, */ static int reserve_metadata_bytes(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, - u64 orig_bytes, int flush) + u64 orig_bytes, + enum btrfs_reserve_flush_enum flush) { struct btrfs_space_info *space_info = block_rsv->space_info; u64 used; @@ -3895,10 +3901,11 @@ again: ret = 0; spin_lock(&space_info->lock); /* - * We only want to wait if somebody other than us is flushing and we are - * actually alloed to flush. + * We only want to wait if somebody other than us is flushing and we + * are actually allowed to flush all things. */ - while (flush && !flushing && space_info->flush) { + while (flush == BTRFS_RESERVE_FLUSH_ALL && !flushing && + space_info->flush) { spin_unlock(&space_info->lock); /* * If we have a trans handle we can't wait because the flusher @@ -3964,23 +3971,40 @@ again: * Couldn't make our reservation, save our place so while we're trying * to reclaim space we can actually use it instead of somebody else * stealing it from us. + * + * We make the other tasks wait for the flush only when we can flush + * all things. */ - if (ret && flush) { + if (ret && flush == BTRFS_RESERVE_FLUSH_ALL) { flushing = true; space_info->flush = 1; } spin_unlock(&space_info->lock); - if (!ret || !flush) + if (!ret || flush == BTRFS_RESERVE_NO_FLUSH) goto out; ret = flush_space(root, space_info, num_bytes, orig_bytes, flush_state); flush_state++; + + /* + * If we are FLUSH_LIMIT, we can not flush delalloc, or the deadlock + * would happen. So skip delalloc flush. + */ + if (flush == BTRFS_RESERVE_FLUSH_LIMIT && + (flush_state == FLUSH_DELALLOC || + flush_state == FLUSH_DELALLOC_WAIT)) + flush_state = ALLOC_CHUNK; + if (!ret) goto again; - else if (flush_state <= COMMIT_TRANS) + else if (flush == BTRFS_RESERVE_FLUSH_LIMIT && + flush_state < COMMIT_TRANS) + goto again; + else if (flush == BTRFS_RESERVE_FLUSH_ALL && + flush_state <= COMMIT_TRANS) goto again; out: @@ -4131,9 +4155,9 @@ void btrfs_free_block_rsv(struct btrfs_root *root, kfree(rsv); } -static inline int __block_rsv_add(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, - u64 num_bytes, int flush) +int btrfs_block_rsv_add(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, u64 num_bytes, + enum btrfs_reserve_flush_enum flush) { int ret; @@ -4149,20 +4173,6 @@ static inline int __block_rsv_add(struct btrfs_root *root, return ret; } -int btrfs_block_rsv_add(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, - u64 num_bytes) -{ - return __block_rsv_add(root, block_rsv, num_bytes, 1); -} - -int btrfs_block_rsv_add_noflush(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, - u64 num_bytes) -{ - return __block_rsv_add(root, block_rsv, num_bytes, 0); -} - int btrfs_block_rsv_check(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, int min_factor) { @@ -4181,9 +4191,9 @@ int btrfs_block_rsv_check(struct btrfs_root *root, return ret; } -static inline int __btrfs_block_rsv_refill(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, - u64 min_reserved, int flush) +int btrfs_block_rsv_refill(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, u64 min_reserved, + enum btrfs_reserve_flush_enum flush) { u64 num_bytes = 0; int ret = -ENOSPC; @@ -4211,20 +4221,6 @@ static inline int __btrfs_block_rsv_refill(struct btrfs_root *root, return ret; } -int btrfs_block_rsv_refill(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, - u64 min_reserved) -{ - return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 1); -} - -int btrfs_block_rsv_refill_noflush(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, - u64 min_reserved) -{ - return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 0); -} - int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, struct btrfs_block_rsv *dst_rsv, u64 num_bytes) @@ -4515,14 +4511,15 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) u64 csum_bytes; unsigned nr_extents = 0; int extra_reserve = 0; - int flush = 1; + enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL; int ret; /* Need to be holding the i_mutex here if we aren't free space cache */ if (btrfs_is_free_space_inode(inode)) - flush = 0; + flush = BTRFS_RESERVE_NO_FLUSH; - if (flush && btrfs_transaction_in_commit(root->fs_info)) + if (flush != BTRFS_RESERVE_NO_FLUSH && + btrfs_transaction_in_commit(root->fs_info)) schedule_timeout(1); mutex_lock(&BTRFS_I(inode)->delalloc_mutex); @@ -6252,7 +6249,8 @@ use_block_rsv(struct btrfs_trans_handle *trans, block_rsv = get_block_rsv(trans, root); if (block_rsv->size == 0) { - ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0); + ret = reserve_metadata_bytes(root, block_rsv, blocksize, + BTRFS_RESERVE_NO_FLUSH); /* * If we couldn't reserve metadata bytes try and use some from * the global reserve. @@ -6279,7 +6277,8 @@ use_block_rsv(struct btrfs_trans_handle *trans, printk(KERN_DEBUG "btrfs: block rsv returned %d\n", ret); WARN_ON(1); } - ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0); + ret = reserve_metadata_bytes(root, block_rsv, blocksize, + BTRFS_RESERVE_NO_FLUSH); if (!ret) { return block_rsv; } else if (ret && block_rsv != global_rsv) { diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index b1a1c92..d26f67a 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -434,8 +434,9 @@ int btrfs_save_ino_cache(struct btrfs_root *root, * 3 items for pre-allocation */ trans->bytes_reserved = btrfs_calc_trans_metadata_size(root, 8); - ret = btrfs_block_rsv_add_noflush(root, trans->block_rsv, - trans->bytes_reserved); + ret = btrfs_block_rsv_add(root, trans->block_rsv, + trans->bytes_reserved, + BTRFS_RESERVE_NO_FLUSH); if (ret) goto out; trace_btrfs_space_reservation(root->fs_info, "ino_cache", diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 95542a1..db3dd4e 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3829,7 +3829,8 @@ void btrfs_evict_inode(struct inode *inode) * inode item when doing the truncate. */ while (1) { - ret = btrfs_block_rsv_refill_noflush(root, rsv, min_size); + ret = btrfs_block_rsv_refill(root, rsv, min_size, + BTRFS_RESERVE_FLUSH_LIMIT); /* * Try and steal from the global reserve since we will @@ -3847,7 +3848,7 @@ void btrfs_evict_inode(struct inode *inode) goto no_delete; } - trans = btrfs_start_transaction_noflush(root, 1); + trans = btrfs_start_transaction_lflush(root, 1); if (IS_ERR(trans)) { btrfs_orphan_del(NULL, inode); btrfs_free_block_rsv(root, rsv); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 776f0aa..242d6de 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2074,7 +2074,8 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, BUG_ON(IS_ERR(trans)); trans->block_rsv = rc->block_rsv; - ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved); + ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved, + BTRFS_RESERVE_FLUSH_ALL); if (ret) { BUG_ON(ret != -EAGAIN); ret = btrfs_commit_transaction(trans, root); @@ -2184,7 +2185,8 @@ int prepare_to_merge(struct reloc_control *rc, int err) again: if (!err) { num_bytes = rc->merging_rsv_size; - ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes); + ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes, + BTRFS_RESERVE_FLUSH_ALL); if (ret) err = ret; } @@ -2459,7 +2461,8 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans, num_bytes = calcu_metadata_size(rc, node, 1) * 2; trans->block_rsv = rc->block_rsv; - ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes); + ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes, + BTRFS_RESERVE_FLUSH_ALL); if (ret) { if (ret == -EAGAIN) rc->commit_transaction = 1; @@ -3685,7 +3688,8 @@ int prepare_to_relocate(struct reloc_control *rc) * is no reservation in transaction handle. */ ret = btrfs_block_rsv_add(rc->extent_root, rc->block_rsv, - rc->extent_root->nodesize * 256); + rc->extent_root->nodesize * 256, + BTRFS_RESERVE_FLUSH_ALL); if (ret) return ret; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 04bbfb1..4e1def4 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -295,9 +295,9 @@ static int may_wait_transaction(struct btrfs_root *root, int type) return 0; } -static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, - u64 num_items, int type, - int noflush) +static struct btrfs_trans_handle * +start_transaction(struct btrfs_root *root, u64 num_items, int type, + enum btrfs_reserve_flush_enum flush) { struct btrfs_trans_handle *h; struct btrfs_transaction *cur_trans; @@ -331,14 +331,9 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, } num_bytes = btrfs_calc_trans_metadata_size(root, num_items); - if (noflush) - ret = btrfs_block_rsv_add_noflush(root, - &root->fs_info->trans_block_rsv, - num_bytes); - else - ret = btrfs_block_rsv_add(root, - &root->fs_info->trans_block_rsv, - num_bytes); + ret = btrfs_block_rsv_add(root, + &root->fs_info->trans_block_rsv, + num_bytes, flush); if (ret) return ERR_PTR(ret); } @@ -422,13 +417,15 @@ got_it: struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, int num_items) { - return start_transaction(root, num_items, TRANS_START, 0); + return start_transaction(root, num_items, TRANS_START, + BTRFS_RESERVE_FLUSH_ALL); } -struct btrfs_trans_handle *btrfs_start_transaction_noflush( +struct btrfs_trans_handle *btrfs_start_transaction_lflush( struct btrfs_root *root, int num_items) { - return start_transaction(root, num_items, TRANS_START, 1); + return start_transaction(root, num_items, TRANS_START, + BTRFS_RESERVE_FLUSH_LIMIT); } struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root) @@ -1032,8 +1029,9 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, btrfs_reloc_pre_snapshot(trans, pending, &to_reserve); if (to_reserve > 0) { - ret = btrfs_block_rsv_add_noflush(root, &pending->block_rsv, - to_reserve); + ret = btrfs_block_rsv_add(root, &pending->block_rsv, + to_reserve, + BTRFS_RESERVE_NO_FLUSH); if (ret) { pending->error = ret; goto no_free_objectid; diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 8096194..0e8aa1e 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -105,7 +105,7 @@ int btrfs_end_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root); struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, int num_items); -struct btrfs_trans_handle *btrfs_start_transaction_noflush( +struct btrfs_trans_handle *btrfs_start_transaction_lflush( struct btrfs_root *root, int num_items); struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root); struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root); -- cgit v0.10.2 From de1ee92ac3bce4c9d760016c4d6198158e6e2f15 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 19 Oct 2012 16:50:56 -0400 Subject: Btrfs: recheck bio against block device when we map the bio Alex reported a problem where we were writing between chunks on a rbd device. The thing is we do bio_add_page using logical offsets, but the physical offset may be different. So when we map the bio now check to see if the bio is still ok with the physical offset, and if it is not split the bio up and redo the bio_add_page with the physical sector. This fixes the problem for Alex and doesn't affect performance in the normal case. Thanks, Reported-and-tested-by: Alex Elder Signed-off-by: Josef Bacik Signed-off-by: Chris Mason diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index a8adf26..eaaf0bf 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4217,6 +4217,113 @@ static noinline void schedule_bio(struct btrfs_root *root, &device->work); } +static int bio_size_ok(struct block_device *bdev, struct bio *bio, + sector_t sector) +{ + struct bio_vec *prev; + struct request_queue *q = bdev_get_queue(bdev); + unsigned short max_sectors = queue_max_sectors(q); + struct bvec_merge_data bvm = { + .bi_bdev = bdev, + .bi_sector = sector, + .bi_rw = bio->bi_rw, + }; + + if (bio->bi_vcnt == 0) { + WARN_ON(1); + return 1; + } + + prev = &bio->bi_io_vec[bio->bi_vcnt - 1]; + if ((bio->bi_size >> 9) > max_sectors) + return 0; + + if (!q->merge_bvec_fn) + return 1; + + bvm.bi_size = bio->bi_size - prev->bv_len; + if (q->merge_bvec_fn(q, &bvm, prev) < prev->bv_len) + return 0; + return 1; +} + +static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio, + struct bio *bio, u64 physical, int dev_nr, + int rw, int async) +{ + struct btrfs_device *dev = bbio->stripes[dev_nr].dev; + + bio->bi_private = bbio; + bio->bi_private = merge_stripe_index_into_bio_private( + bio->bi_private, (unsigned int)dev_nr); + bio->bi_end_io = btrfs_end_bio; + bio->bi_sector = physical >> 9; +#ifdef DEBUG + { + struct rcu_string *name; + + rcu_read_lock(); + name = rcu_dereference(dev->name); + pr_debug("btrfs_map_bio: rw %d, secor=%llu, dev=%lu " + "(%s id %llu), size=%u\n", rw, + (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev, + name->str, dev->devid, bio->bi_size); + rcu_read_unlock(); + } +#endif + bio->bi_bdev = dev->bdev; + if (async) + schedule_bio(root, dev, rw, bio); + else + btrfsic_submit_bio(rw, bio); +} + +static int breakup_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio, + struct bio *first_bio, struct btrfs_device *dev, + int dev_nr, int rw, int async) +{ + struct bio_vec *bvec = first_bio->bi_io_vec; + struct bio *bio; + int nr_vecs = bio_get_nr_vecs(dev->bdev); + u64 physical = bbio->stripes[dev_nr].physical; + +again: + bio = btrfs_bio_alloc(dev->bdev, physical >> 9, nr_vecs, GFP_NOFS); + if (!bio) + return -ENOMEM; + + while (bvec <= (first_bio->bi_io_vec + first_bio->bi_vcnt - 1)) { + if (bio_add_page(bio, bvec->bv_page, bvec->bv_len, + bvec->bv_offset) < bvec->bv_len) { + u64 len = bio->bi_size; + + atomic_inc(&bbio->stripes_pending); + submit_stripe_bio(root, bbio, bio, physical, dev_nr, + rw, async); + physical += len; + goto again; + } + bvec++; + } + + submit_stripe_bio(root, bbio, bio, physical, dev_nr, rw, async); + return 0; +} + +static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical) +{ + atomic_inc(&bbio->error); + if (atomic_dec_and_test(&bbio->stripes_pending)) { + bio->bi_private = bbio->private; + bio->bi_end_io = bbio->end_io; + bio->bi_bdev = (struct block_device *) + (unsigned long)bbio->mirror_num; + bio->bi_sector = logical >> 9; + kfree(bbio); + bio_endio(bio, -EIO); + } +} + int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, int mirror_num, int async_submit) { @@ -4255,40 +4362,36 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, atomic_set(&bbio->stripes_pending, bbio->num_stripes); while (dev_nr < total_devs) { + dev = bbio->stripes[dev_nr].dev; + if (!dev || !dev->bdev || (rw & WRITE && !dev->writeable)) { + bbio_error(bbio, first_bio, logical); + dev_nr++; + continue; + } + + /* + * Check and see if we're ok with this bio based on it's size + * and offset with the given device. + */ + if (!bio_size_ok(dev->bdev, first_bio, + bbio->stripes[dev_nr].physical >> 9)) { + ret = breakup_stripe_bio(root, bbio, first_bio, dev, + dev_nr, rw, async_submit); + BUG_ON(ret); + dev_nr++; + continue; + } + if (dev_nr < total_devs - 1) { bio = bio_clone(first_bio, GFP_NOFS); BUG_ON(!bio); /* -ENOMEM */ } else { bio = first_bio; } - bio->bi_private = bbio; - bio->bi_private = merge_stripe_index_into_bio_private( - bio->bi_private, (unsigned int)dev_nr); - bio->bi_end_io = btrfs_end_bio; - bio->bi_sector = bbio->stripes[dev_nr].physical >> 9; - dev = bbio->stripes[dev_nr].dev; - if (dev && dev->bdev && (rw != WRITE || dev->writeable)) { -#ifdef DEBUG - struct rcu_string *name; - - rcu_read_lock(); - name = rcu_dereference(dev->name); - pr_debug("btrfs_map_bio: rw %d, secor=%llu, dev=%lu " - "(%s id %llu), size=%u\n", rw, - (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev, - name->str, dev->devid, bio->bi_size); - rcu_read_unlock(); -#endif - bio->bi_bdev = dev->bdev; - if (async_submit) - schedule_bio(root, dev, rw, bio); - else - btrfsic_submit_bio(rw, bio); - } else { - bio->bi_bdev = root->fs_info->fs_devices->latest_bdev; - bio->bi_sector = logical >> 9; - bio_endio(bio, -EIO); - } + + submit_stripe_bio(root, bbio, bio, + bbio->stripes[dev_nr].physical, dev_nr, rw, + async_submit); dev_nr++; } return 0; -- cgit v0.10.2 From de6c4115a297d4bbf178aca9948c3539f89c9caa Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 18 Oct 2012 08:18:01 +0000 Subject: Btrfs: fix unnecessary while loop when search the free space, cache When we find a bitmap free space entry, we may check the previous extent entry covers the offset or not. But if we find this entry is also a bitmap entry, we will continue to check the previous entry of the current one by a while loop. It is unnecessary because it is impossible that the extent entry which is in front of a bitmap entry can cover the offset of the entry after that bitmap entry. Signed-off-by: Miao Xie Reviewed-by: Liu Bo Signed-off-by: Chris Mason diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 1027b85..557502c 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1250,18 +1250,13 @@ tree_search_offset(struct btrfs_free_space_ctl *ctl, * if previous extent entry covers the offset, * we should return it instead of the bitmap entry */ - n = &entry->offset_index; - while (1) { - n = rb_prev(n); - if (!n) - break; + n = rb_prev(&entry->offset_index); + if (n) { prev = rb_entry(n, struct btrfs_free_space, offset_index); - if (!prev->bitmap) { - if (prev->offset + prev->bytes > offset) - entry = prev; - break; - } + if (!prev->bitmap && + prev->offset + prev->bytes > offset) + entry = prev; } } return entry; @@ -1287,18 +1282,13 @@ tree_search_offset(struct btrfs_free_space_ctl *ctl, } if (entry->bitmap) { - n = &entry->offset_index; - while (1) { - n = rb_prev(n); - if (!n) - break; + n = rb_prev(&entry->offset_index); + if (n) { prev = rb_entry(n, struct btrfs_free_space, offset_index); - if (!prev->bitmap) { - if (prev->offset + prev->bytes > offset) - return prev; - break; - } + if (!prev->bitmap && + prev->offset + prev->bytes > offset) + return prev; } if (entry->offset + BITS_PER_BITMAP * ctl->unit > offset) return entry; -- cgit v0.10.2 From 95c80bb1f6b24b57058d971ed252b2c1c5121b51 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Fri, 19 Oct 2012 09:50:52 +0000 Subject: Btrfs: MOD_LOG_KEY_REMOVE_WHILE_MOVING never change node's nritems Key MOD_LOG_KEY_REMOVE_WHILE_MOVING means that we're doing memmove inside an extent buffer node, and the node's number of items remains unchanged (unless we are inserting a single pointer, but we have MOD_LOG_KEY_ADD for that). So we don't need to increase node's number of items during rewinding, otherwise we may get an node larger than leafsize and cause general protection errors later. Here is the details, - If we do memory move for inserting a single pointer, we need to add node's nritems by one, and we honor MOD_LOG_KEY_ADD for adding. - If we do memory move for deleting a single pointer, we need to decrease node's nritems by one, and we honor MOD_LOG_KEY_REMOVE for deleting. - If we do memory move for balance left/right, we need to decrease node's nritems, and we honor MOD_LOG_KEY_REMOVE for balaning. Signed-off-by: Liu Bo Signed-off-by: Chris Mason diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index cdfb4c4..b12c039 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1140,13 +1140,13 @@ __tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq, switch (tm->op) { case MOD_LOG_KEY_REMOVE_WHILE_FREEING: BUG_ON(tm->slot < n); - case MOD_LOG_KEY_REMOVE_WHILE_MOVING: case MOD_LOG_KEY_REMOVE: + n++; + case MOD_LOG_KEY_REMOVE_WHILE_MOVING: btrfs_set_node_key(eb, &tm->key, tm->slot); btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr); btrfs_set_node_ptr_generation(eb, tm->slot, tm->generation); - n++; break; case MOD_LOG_KEY_REPLACE: BUG_ON(tm->slot >= n); -- cgit v0.10.2 From 6a7a665d78c5dd8bc76a010648c4e7d84517ab5a Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Fri, 19 Oct 2012 09:50:53 +0000 Subject: Btrfs: reorder tree mod log operations in deleting a pointer Since we don't use MOD_LOG_KEY_REMOVE_WHILE_MOVING to add nritems during rewinding, we should insert a MOD_LOG_KEY_REMOVE operation first. Signed-off-by: Liu Bo Signed-off-by: Chris Mason diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index b12c039..4d518bd 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -4609,6 +4609,12 @@ static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, u32 nritems; int ret; + if (tree_mod_log && level) { + ret = tree_mod_log_insert_key(root->fs_info, parent, slot, + MOD_LOG_KEY_REMOVE); + BUG_ON(ret < 0); + } + nritems = btrfs_header_nritems(parent); if (slot != nritems - 1) { if (tree_mod_log && level) @@ -4619,10 +4625,6 @@ static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, btrfs_node_key_ptr_offset(slot + 1), sizeof(struct btrfs_key_ptr) * (nritems - slot - 1)); - } else if (tree_mod_log && level) { - ret = tree_mod_log_insert_key(root->fs_info, parent, slot, - MOD_LOG_KEY_REMOVE); - BUG_ON(ret < 0); } nritems--; -- cgit v0.10.2 From 0e411ecec60138f22442728f036d38cfea007817 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Fri, 19 Oct 2012 09:50:54 +0000 Subject: Btrfs: kill unnecessary arguments in del_ptr The argument 'tree_mod_log' is not necessary since all of callers enable it. Signed-off-by: Liu Bo Signed-off-by: Chris Mason diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 4d518bd..615b749 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -38,8 +38,7 @@ static int balance_node_right(struct btrfs_trans_handle *trans, struct extent_buffer *dst_buf, struct extent_buffer *src_buf); static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct btrfs_path *path, int level, int slot, - int tree_mod_log); + struct btrfs_path *path, int level, int slot); static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb); struct extent_buffer *read_old_tree_block(struct btrfs_root *root, u64 bytenr, @@ -1827,7 +1826,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (btrfs_header_nritems(right) == 0) { clean_tree_block(trans, root, right); btrfs_tree_unlock(right); - del_ptr(trans, root, path, level + 1, pslot + 1, 1); + del_ptr(trans, root, path, level + 1, pslot + 1); root_sub_used(root, right->len); btrfs_free_tree_block(trans, root, right, 0, 1); free_extent_buffer_stale(right); @@ -1871,7 +1870,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (btrfs_header_nritems(mid) == 0) { clean_tree_block(trans, root, mid); btrfs_tree_unlock(mid); - del_ptr(trans, root, path, level + 1, pslot, 1); + del_ptr(trans, root, path, level + 1, pslot); root_sub_used(root, mid->len); btrfs_free_tree_block(trans, root, mid, 0, 1); free_extent_buffer_stale(mid); @@ -4602,14 +4601,13 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root * empty a node. */ static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct btrfs_path *path, int level, int slot, - int tree_mod_log) + struct btrfs_path *path, int level, int slot) { struct extent_buffer *parent = path->nodes[level]; u32 nritems; int ret; - if (tree_mod_log && level) { + if (level) { ret = tree_mod_log_insert_key(root->fs_info, parent, slot, MOD_LOG_KEY_REMOVE); BUG_ON(ret < 0); @@ -4617,7 +4615,7 @@ static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, nritems = btrfs_header_nritems(parent); if (slot != nritems - 1) { - if (tree_mod_log && level) + if (level) tree_mod_log_eb_move(root->fs_info, parent, slot, slot + 1, nritems - slot - 1); memmove_extent_buffer(parent, @@ -4658,7 +4656,7 @@ static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans, struct extent_buffer *leaf) { WARN_ON(btrfs_header_generation(leaf) != trans->transid); - del_ptr(trans, root, path, 1, path->slots[1], 1); + del_ptr(trans, root, path, 1, path->slots[1]); /* * btrfs_free_extent is expensive, we want to make sure we -- cgit v0.10.2 From 32adf0901371c8b9d258dba7811f3067d1d2ea5c Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Fri, 19 Oct 2012 12:52:15 +0000 Subject: Btrfs: cleanup unused arguments 'disk_key' is not used at all. Signed-off-by: Liu Bo Signed-off-by: Chris Mason diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 615b749..100c274 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -775,8 +775,7 @@ tree_mod_log_eb_move(struct btrfs_fs_info *fs_info, struct extent_buffer *dst, static noinline void tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info, - struct extent_buffer *eb, - struct btrfs_disk_key *disk_key, int slot, int atomic) + struct extent_buffer *eb, int slot, int atomic) { int ret; @@ -1835,7 +1834,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, struct btrfs_disk_key right_key; btrfs_node_key(right, &right_key, 0); tree_mod_log_set_node_key(root->fs_info, parent, - &right_key, pslot + 1, 0); + pslot + 1, 0); btrfs_set_node_key(parent, &right_key, pslot + 1); btrfs_mark_buffer_dirty(parent); } @@ -1879,7 +1878,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, /* update the parent key to reflect our changes */ struct btrfs_disk_key mid_key; btrfs_node_key(mid, &mid_key, 0); - tree_mod_log_set_node_key(root->fs_info, parent, &mid_key, + tree_mod_log_set_node_key(root->fs_info, parent, pslot, 0); btrfs_set_node_key(parent, &mid_key, pslot); btrfs_mark_buffer_dirty(parent); @@ -1979,7 +1978,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, orig_slot += left_nr; btrfs_node_key(mid, &disk_key, 0); tree_mod_log_set_node_key(root->fs_info, parent, - &disk_key, pslot, 0); + pslot, 0); btrfs_set_node_key(parent, &disk_key, pslot); btrfs_mark_buffer_dirty(parent); if (btrfs_header_nritems(left) > orig_slot) { @@ -2032,7 +2031,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, btrfs_node_key(right, &disk_key, 0); tree_mod_log_set_node_key(root->fs_info, parent, - &disk_key, pslot + 1, 0); + pslot + 1, 0); btrfs_set_node_key(parent, &disk_key, pslot + 1); btrfs_mark_buffer_dirty(parent); @@ -2916,7 +2915,7 @@ static void fixup_low_keys(struct btrfs_trans_handle *trans, if (!path->nodes[i]) break; t = path->nodes[i]; - tree_mod_log_set_node_key(root->fs_info, t, key, tslot, 1); + tree_mod_log_set_node_key(root->fs_info, t, tslot, 1); btrfs_set_node_key(t, key, tslot); btrfs_mark_buffer_dirty(path->nodes[i]); if (tslot != 0) -- cgit v0.10.2 From 7b398f8e58c415738e397645c926253c428cf002 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 22 Oct 2012 15:52:28 -0400 Subject: Btrfs: fill the global reserve when unpinning space Dave gave me an image of a very full file system that would abort the transaction because it ran out of space while committing the transaction. This is because we would think there was plenty of room to create a snapshot even though the global reserve was not full. This happens because we calculate the global reserve size before we unpin any space, so after we unpin the space we allow reservations to occur even though we haven't reserved all of the space for our global reserve. Fix this by adding to the global reserve while unpinning in order to make sure we always have enough space to do our work. With this patch we no longer end up with an aborted transaction, we return ENOSPC properly to the person trying to create the snapshot. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 2136add..b495cb4 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4949,9 +4949,13 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_block_group_cache *cache = NULL; + struct btrfs_space_info *space_info; + struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; u64 len; + bool readonly; while (start <= end) { + readonly = false; if (!cache || start >= cache->key.objectid + cache->key.offset) { if (cache) @@ -4969,15 +4973,30 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) } start += len; + space_info = cache->space_info; - spin_lock(&cache->space_info->lock); + spin_lock(&space_info->lock); spin_lock(&cache->lock); cache->pinned -= len; - cache->space_info->bytes_pinned -= len; - if (cache->ro) - cache->space_info->bytes_readonly += len; + space_info->bytes_pinned -= len; + if (cache->ro) { + space_info->bytes_readonly += len; + readonly = true; + } spin_unlock(&cache->lock); - spin_unlock(&cache->space_info->lock); + if (!readonly && global_rsv->space_info == space_info) { + spin_lock(&global_rsv->lock); + if (!global_rsv->full) { + len = min(len, global_rsv->size - + global_rsv->reserved); + global_rsv->reserved += len; + space_info->bytes_may_use += len; + if (global_rsv->reserved >= global_rsv->size) + global_rsv->full = 1; + } + spin_unlock(&global_rsv->lock); + } + spin_unlock(&space_info->lock); } if (cache) -- cgit v0.10.2 From 8ccf6f19b67f7e0921063cc309f4672a6afcb528 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 25 Oct 2012 09:28:04 +0000 Subject: Btrfs: make delalloc inodes be flushed by multi-task This patch introduce a new worker pool named "flush_workers", and if we want to force all the inode with pending delalloc to the disks, we can queue those inodes into the work queue of the worker pool, in this way, those inodes will be flushed by multi-task. Signed-off-by: Miao Xie Signed-off-by: Chris Mason diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 8fd9fe4..cad1656 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1333,6 +1333,7 @@ struct btrfs_fs_info { struct btrfs_workers generic_worker; struct btrfs_workers workers; struct btrfs_workers delalloc_workers; + struct btrfs_workers flush_workers; struct btrfs_workers endio_workers; struct btrfs_workers endio_meta_workers; struct btrfs_workers endio_meta_write_workers; @@ -3277,6 +3278,19 @@ int btrfs_csum_truncate(struct btrfs_trans_handle *trans, int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, struct list_head *list, int search_commit); /* inode.c */ +struct btrfs_delalloc_work { + struct inode *inode; + int wait; + int delay_iput; + struct completion completion; + struct list_head list; + struct btrfs_work work; +}; + +struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode, + int wait, int delay_iput); +void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work); + struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page, size_t pg_offset, u64 start, u64 len, int create); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 7cda519..bd70c28 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2279,6 +2279,10 @@ int open_ctree(struct super_block *sb, fs_info->thread_pool_size, &fs_info->generic_worker); + btrfs_init_workers(&fs_info->flush_workers, "flush_delalloc", + fs_info->thread_pool_size, + &fs_info->generic_worker); + btrfs_init_workers(&fs_info->submit_workers, "submit", min_t(u64, fs_devices->num_devices, fs_info->thread_pool_size), @@ -2350,6 +2354,7 @@ int open_ctree(struct super_block *sb, ret |= btrfs_start_workers(&fs_info->delayed_workers); ret |= btrfs_start_workers(&fs_info->caching_workers); ret |= btrfs_start_workers(&fs_info->readahead_workers); + ret |= btrfs_start_workers(&fs_info->flush_workers); if (ret) { err = -ENOMEM; goto fail_sb_buffer; @@ -2667,6 +2672,7 @@ fail_sb_buffer: btrfs_stop_workers(&fs_info->submit_workers); btrfs_stop_workers(&fs_info->delayed_workers); btrfs_stop_workers(&fs_info->caching_workers); + btrfs_stop_workers(&fs_info->flush_workers); fail_alloc: fail_iput: btrfs_mapping_tree_free(&fs_info->mapping_tree); @@ -3339,6 +3345,7 @@ int close_ctree(struct btrfs_root *root) btrfs_stop_workers(&fs_info->delayed_workers); btrfs_stop_workers(&fs_info->caching_workers); btrfs_stop_workers(&fs_info->readahead_workers); + btrfs_stop_workers(&fs_info->flush_workers); #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY if (btrfs_test_opt(root, CHECK_INTEGRITY)) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index db3dd4e..dce9e21 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -71,6 +71,7 @@ static const struct file_operations btrfs_dir_file_operations; static struct extent_io_ops btrfs_extent_io_ops; static struct kmem_cache *btrfs_inode_cachep; +static struct kmem_cache *btrfs_delalloc_work_cachep; struct kmem_cache *btrfs_trans_handle_cachep; struct kmem_cache *btrfs_transaction_cachep; struct kmem_cache *btrfs_path_cachep; @@ -7204,6 +7205,8 @@ void btrfs_destroy_cachep(void) kmem_cache_destroy(btrfs_path_cachep); if (btrfs_free_space_cachep) kmem_cache_destroy(btrfs_free_space_cachep); + if (btrfs_delalloc_work_cachep) + kmem_cache_destroy(btrfs_delalloc_work_cachep); } int btrfs_init_cachep(void) @@ -7238,6 +7241,13 @@ int btrfs_init_cachep(void) if (!btrfs_free_space_cachep) goto fail; + btrfs_delalloc_work_cachep = kmem_cache_create("btrfs_delalloc_work", + sizeof(struct btrfs_delalloc_work), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, + NULL); + if (!btrfs_delalloc_work_cachep) + goto fail; + return 0; fail: btrfs_destroy_cachep(); @@ -7448,6 +7458,49 @@ out_notrans: return ret; } +static void btrfs_run_delalloc_work(struct btrfs_work *work) +{ + struct btrfs_delalloc_work *delalloc_work; + + delalloc_work = container_of(work, struct btrfs_delalloc_work, + work); + if (delalloc_work->wait) + btrfs_wait_ordered_range(delalloc_work->inode, 0, (u64)-1); + else + filemap_flush(delalloc_work->inode->i_mapping); + + if (delalloc_work->delay_iput) + btrfs_add_delayed_iput(delalloc_work->inode); + else + iput(delalloc_work->inode); + complete(&delalloc_work->completion); +} + +struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode, + int wait, int delay_iput) +{ + struct btrfs_delalloc_work *work; + + work = kmem_cache_zalloc(btrfs_delalloc_work_cachep, GFP_NOFS); + if (!work) + return NULL; + + init_completion(&work->completion); + INIT_LIST_HEAD(&work->list); + work->inode = inode; + work->wait = wait; + work->delay_iput = delay_iput; + work->work.func = btrfs_run_delalloc_work; + + return work; +} + +void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work) +{ + wait_for_completion(&work->completion); + kmem_cache_free(btrfs_delalloc_work_cachep, work); +} + /* * some fairly slow code that needs optimization. This walks the list * of all the inodes with pending delalloc and forces them to disk. @@ -7457,10 +7510,15 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) struct list_head *head = &root->fs_info->delalloc_inodes; struct btrfs_inode *binode; struct inode *inode; + struct btrfs_delalloc_work *work, *next; + struct list_head works; + int ret = 0; if (root->fs_info->sb->s_flags & MS_RDONLY) return -EROFS; + INIT_LIST_HEAD(&works); + spin_lock(&root->fs_info->delalloc_lock); while (!list_empty(head)) { binode = list_entry(head->next, struct btrfs_inode, @@ -7470,11 +7528,14 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) list_del_init(&binode->delalloc_inodes); spin_unlock(&root->fs_info->delalloc_lock); if (inode) { - filemap_flush(inode->i_mapping); - if (delay_iput) - btrfs_add_delayed_iput(inode); - else - iput(inode); + work = btrfs_alloc_delalloc_work(inode, 0, delay_iput); + if (!work) { + ret = -ENOMEM; + goto out; + } + list_add_tail(&work->list, &works); + btrfs_queue_worker(&root->fs_info->flush_workers, + &work->work); } cond_resched(); spin_lock(&root->fs_info->delalloc_lock); @@ -7493,7 +7554,12 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) atomic_read(&root->fs_info->async_delalloc_pages) == 0)); } atomic_dec(&root->fs_info->async_submit_draining); - return 0; +out: + list_for_each_entry_safe(work, next, &works, list) { + list_del_init(&work->list); + btrfs_wait_and_free_delalloc_work(work); + } + return ret; } static int btrfs_symlink(struct inode *dir, struct dentry *dentry, diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 242d6de..270f24f 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -4061,7 +4061,11 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) (unsigned long long)rc->block_group->key.objectid, (unsigned long long)rc->block_group->flags); - btrfs_start_delalloc_inodes(fs_info->tree_root, 0); + ret = btrfs_start_delalloc_inodes(fs_info->tree_root, 0); + if (ret < 0) { + err = ret; + goto out; + } btrfs_wait_ordered_extents(fs_info->tree_root, 0); while (1) { diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 4e1def4..9c466f9 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1497,7 +1497,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, WARN_ON(cur_trans != trans->transaction); if (flush_on_commit || snap_pending) { - btrfs_start_delalloc_inodes(root, 1); + ret = btrfs_start_delalloc_inodes(root, 1); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto cleanup_transaction; + } btrfs_wait_ordered_extents(root, 1); } -- cgit v0.10.2 From 25287e0a16c0ad068aa89ab01aea6c699b31ec12 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 25 Oct 2012 09:31:03 +0000 Subject: Btrfs: make ordered operations be handled by multi-task The process of the ordered operations is similar to the delalloc inode flush, so we handle them by flush workers. Signed-off-by: Miao Xie Signed-off-by: Chris Mason diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 7772f02..ab2a3c0 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -519,13 +519,17 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput) * extra check to make sure the ordered operation list really is empty * before we return */ -void btrfs_run_ordered_operations(struct btrfs_root *root, int wait) +int btrfs_run_ordered_operations(struct btrfs_root *root, int wait) { struct btrfs_inode *btrfs_inode; struct inode *inode; struct list_head splice; + struct list_head works; + struct btrfs_delalloc_work *work, *next; + int ret = 0; INIT_LIST_HEAD(&splice); + INIT_LIST_HEAD(&works); mutex_lock(&root->fs_info->ordered_operations_mutex); spin_lock(&root->fs_info->ordered_extent_lock); @@ -533,6 +537,7 @@ again: list_splice_init(&root->fs_info->ordered_operations, &splice); while (!list_empty(&splice)) { + btrfs_inode = list_entry(splice.next, struct btrfs_inode, ordered_operations); @@ -549,15 +554,26 @@ again: list_add_tail(&BTRFS_I(inode)->ordered_operations, &root->fs_info->ordered_operations); } + + if (!inode) + continue; spin_unlock(&root->fs_info->ordered_extent_lock); - if (inode) { - if (wait) - btrfs_wait_ordered_range(inode, 0, (u64)-1); - else - filemap_flush(inode->i_mapping); - btrfs_add_delayed_iput(inode); + work = btrfs_alloc_delalloc_work(inode, wait, 1); + if (!work) { + if (list_empty(&BTRFS_I(inode)->ordered_operations)) + list_add_tail(&btrfs_inode->ordered_operations, + &splice); + spin_lock(&root->fs_info->ordered_extent_lock); + list_splice_tail(&splice, + &root->fs_info->ordered_operations); + spin_unlock(&root->fs_info->ordered_extent_lock); + ret = -ENOMEM; + goto out; } + list_add_tail(&work->list, &works); + btrfs_queue_worker(&root->fs_info->flush_workers, + &work->work); cond_resched(); spin_lock(&root->fs_info->ordered_extent_lock); @@ -566,7 +582,13 @@ again: goto again; spin_unlock(&root->fs_info->ordered_extent_lock); +out: + list_for_each_entry_safe(work, next, &works, list) { + list_del_init(&work->list); + btrfs_wait_and_free_delalloc_work(work); + } mutex_unlock(&root->fs_info->ordered_operations_mutex); + return ret; } /* @@ -934,15 +956,6 @@ void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, if (last_mod < root->fs_info->last_trans_committed) return; - /* - * the transaction is already committing. Just start the IO and - * don't bother with all of this list nonsense - */ - if (trans && root->fs_info->running_transaction->blocked) { - btrfs_wait_ordered_range(inode, 0, (u64)-1); - return; - } - spin_lock(&root->fs_info->ordered_extent_lock); if (list_empty(&BTRFS_I(inode)->ordered_operations)) { list_add_tail(&BTRFS_I(inode)->ordered_operations, @@ -959,6 +972,7 @@ int __init ordered_data_init(void) NULL); if (!btrfs_ordered_extent_cache) return -ENOMEM; + return 0; } diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index dd27a0b..e8dcec6 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -186,7 +186,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode, int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, struct btrfs_ordered_extent *ordered); int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum); -void btrfs_run_ordered_operations(struct btrfs_root *root, int wait); +int btrfs_run_ordered_operations(struct btrfs_root *root, int wait); void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 9c466f9..259f74e 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1412,15 +1412,21 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, struct btrfs_transaction *cur_trans = trans->transaction; struct btrfs_transaction *prev_trans = NULL; DEFINE_WAIT(wait); - int ret = -EIO; + int ret; int should_grow = 0; unsigned long now = get_seconds(); int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT); - btrfs_run_ordered_operations(root, 0); + ret = btrfs_run_ordered_operations(root, 0); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto cleanup_transaction; + } - if (cur_trans->aborted) + if (cur_trans->aborted) { + ret = cur_trans->aborted; goto cleanup_transaction; + } /* make a pass through all the delayed refs we have so far * any runnings procs may add more while we are here @@ -1523,7 +1529,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, * it here and no for sure that nothing new will be added * to the list */ - btrfs_run_ordered_operations(root, 1); + ret = btrfs_run_ordered_operations(root, 1); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto cleanup_transaction; + } prepare_to_wait(&cur_trans->writer_wait, &wait, TASK_UNINTERRUPTIBLE); -- cgit v0.10.2 From 9afab8820bb8b55af669b199597d6716e04d1ba8 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 25 Oct 2012 09:41:36 +0000 Subject: Btrfs: make ordered extent be flushed by multi-task Though the process of the ordered extents is a bit different with the delalloc inode flush, but we can see it as a subset of the delalloc inode flush, so we also handle them by flush workers. Signed-off-by: Miao Xie Signed-off-by: Chris Mason diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index ab2a3c0..eecc20f 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -211,6 +211,8 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, init_waitqueue_head(&entry->wait); INIT_LIST_HEAD(&entry->list); INIT_LIST_HEAD(&entry->root_extent_list); + INIT_LIST_HEAD(&entry->work_list); + init_completion(&entry->completion); trace_btrfs_ordered_extent_add(inode, entry); @@ -464,18 +466,28 @@ void btrfs_remove_ordered_extent(struct inode *inode, wake_up(&entry->wait); } +static void btrfs_run_ordered_extent_work(struct btrfs_work *work) +{ + struct btrfs_ordered_extent *ordered; + + ordered = container_of(work, struct btrfs_ordered_extent, flush_work); + btrfs_start_ordered_extent(ordered->inode, ordered, 1); + complete(&ordered->completion); +} + /* * wait for all the ordered extents in a root. This is done when balancing * space between drives. */ void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput) { - struct list_head splice; + struct list_head splice, works; struct list_head *cur; - struct btrfs_ordered_extent *ordered; + struct btrfs_ordered_extent *ordered, *next; struct inode *inode; INIT_LIST_HEAD(&splice); + INIT_LIST_HEAD(&works); spin_lock(&root->fs_info->ordered_extent_lock); list_splice_init(&root->fs_info->ordered_extents, &splice); @@ -494,19 +506,32 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput) spin_unlock(&root->fs_info->ordered_extent_lock); if (inode) { - btrfs_start_ordered_extent(inode, ordered, 1); - btrfs_put_ordered_extent(ordered); - if (delay_iput) - btrfs_add_delayed_iput(inode); - else - iput(inode); + ordered->flush_work.func = btrfs_run_ordered_extent_work; + list_add_tail(&ordered->work_list, &works); + btrfs_queue_worker(&root->fs_info->flush_workers, + &ordered->flush_work); } else { btrfs_put_ordered_extent(ordered); } + cond_resched(); spin_lock(&root->fs_info->ordered_extent_lock); } spin_unlock(&root->fs_info->ordered_extent_lock); + + list_for_each_entry_safe(ordered, next, &works, work_list) { + list_del_init(&ordered->work_list); + wait_for_completion(&ordered->completion); + + inode = ordered->inode; + btrfs_put_ordered_extent(ordered); + if (delay_iput) + btrfs_add_delayed_iput(inode); + else + iput(inode); + + cond_resched(); + } } /* diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index e8dcec6..efc7c29 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -128,8 +128,11 @@ struct btrfs_ordered_extent { struct list_head root_extent_list; struct btrfs_work work; -}; + struct completion completion; + struct btrfs_work flush_work; + struct list_head work_list; +}; /* * calculates the total size you need to allocate for an ordered sum -- cgit v0.10.2 From 0253f40ef9a709a1af39ce38b1d998af090f8127 Mon Sep 17 00:00:00 2001 From: "jeff.liu" Date: Sat, 27 Oct 2012 12:06:39 +0000 Subject: Btrfs: Remove the invalid shrink size check up from btrfs_shrink_dev() Remove an invalid size check up from btrfs_shrink_dev(). The new size should not larger than the device->total_bytes as it was already verified before coming to here(i.e. new_size < old_size). Remove invalid check up for btrfs_shrink_dev(). Signed-off-by: Jie Liu Signed-off-by: Chris Mason diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 8fcf9a5..14c0d2e 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1409,7 +1409,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, btrfs_commit_transaction(trans, root); } else if (new_size < old_size) { ret = btrfs_shrink_device(device, new_size); - } + } /* equal, nothing need to do */ out_free: kfree(vol_args); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index eaaf0bf..32a8842 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3059,9 +3059,6 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) u64 old_size = device->total_bytes; u64 diff = device->total_bytes - new_size; - if (new_size >= device->total_bytes) - return -EINVAL; - path = btrfs_alloc_path(); if (!path) return -ENOMEM; -- cgit v0.10.2 From d1423248734df6d9aff769abffd675dc034e0601 Mon Sep 17 00:00:00 2001 From: Masanari Iida Date: Wed, 31 Oct 2012 15:16:32 +0000 Subject: Btrfs: Fix typo in fs/btrfs Correct spelling typo in btrfs. Signed-off-by: Masanari Iida Signed-off-by: Chris Mason diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index cad1656..2d41cb2 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -413,7 +413,7 @@ struct btrfs_root_backup { __le64 bytes_used; __le64 num_devices; /* future */ - __le64 unsed_64[4]; + __le64 unused_64[4]; u8 tree_root_level; u8 chunk_root_level; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 32a8842..eeed97d 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4261,7 +4261,7 @@ static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio, rcu_read_lock(); name = rcu_dereference(dev->name); - pr_debug("btrfs_map_bio: rw %d, secor=%llu, dev=%lu " + pr_debug("btrfs_map_bio: rw %d, sector=%llu, dev=%lu " "(%s id %llu), size=%u\n", rw, (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev, name->str, dev->devid, bio->bi_size); -- cgit v0.10.2 From 292fd7fc39aa06668f3a8db546714e727120cb3e Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Tue, 30 Oct 2012 17:16:16 +0000 Subject: Btrfs: don't allow degraded mount if too many devices are missing The current behavior is to allow mounting or remounting a filesystem writeable in degraded mode if at least one writeable device is present. The next failed write access to a missing device which is above the tolerance of the configured level of redundancy results in an read-only enforcement. Even without this, the next time barrier_all_devices() is called and more devices are missing than tolerable, the switch to read-only mode takes place. In order to behave predictably and to provide proper feedback to the user at mount time, this patch compares the number of missing devices with the number of devices that are tolerated to be missing according to the configured RAID level. If more devices are missing than tolerated, e.g. if two devices are missing in case of RAID1, only a read-only mount and remount is allowed. Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index bd70c28..0643159 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2508,6 +2508,13 @@ retry_root_backup: } fs_info->num_tolerated_disk_barrier_failures = btrfs_calc_num_tolerated_disk_barrier_failures(fs_info); + if (fs_info->fs_devices->missing_devices > + fs_info->num_tolerated_disk_barrier_failures && + !(sb->s_flags & MS_RDONLY)) { + printk(KERN_WARNING + "Btrfs: too many missing devices, writeable mount is not allowed\n"); + goto fail_block_groups; + } fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root, "btrfs-cleaner"); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 915ac14..acd2df8 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1226,6 +1226,15 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) goto restore; } + if (fs_info->fs_devices->missing_devices > + fs_info->num_tolerated_disk_barrier_failures && + !(*flags & MS_RDONLY)) { + printk(KERN_WARNING + "Btrfs: too many missing devices, writeable remount is not allowed\n"); + ret = -EACCES; + goto restore; + } + if (btrfs_super_log_root(fs_info->super_copy) != 0) { ret = -EINVAL; goto restore; -- cgit v0.10.2 From 183f37fa3503332740c76f1b493f4304ec889358 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 1 Nov 2012 06:38:47 +0000 Subject: Btrfs: do not log extents when we only log new names When we log new names, we need to log just enough to recreate the inode during log replay, and there is no need to log extents along with it. This actually fixes a bug revealed by xfstests 241, where it shows that we're logging some extents that have not updated metadata, so we don't get proper EXTENT_DATA items to be copied to log tree. Signed-off-by: Liu Bo Signed-off-by: Chris Mason diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 81e407d..4ec41ec 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3435,7 +3435,8 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0); } else { - fast_search = true; + if (inode_only == LOG_INODE_ALL) + fast_search = true; max_key.type = BTRFS_XATTR_ITEM_KEY; ret = drop_objectid_items(trans, log, path, ino, BTRFS_XATTR_ITEM_KEY); -- cgit v0.10.2 From 9f3959c53d57d010ae6f4205fbd0159cb7976a83 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 1 Nov 2012 06:38:48 +0000 Subject: Btrfs: get right arguments for btrfs_wait_ordered_range btrfs_wait_ordered_range expects for 'len' instead of 'end'. Signed-off-by: Liu Bo Signed-off-by: Chris Mason diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 9ab1bed..d2df981 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1562,7 +1562,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * range being left. */ atomic_inc(&root->log_batch); - btrfs_wait_ordered_range(inode, start, end); + btrfs_wait_ordered_range(inode, start, end - start + 1); atomic_inc(&root->log_batch); /* -- cgit v0.10.2 From 4fde183d8c755f8a8bdffcb03a8d947e62ccea6a Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 1 Nov 2012 06:38:49 +0000 Subject: Btrfs: cleanup for btrfs_wait_order_range Variable 'found' is no more used. Signed-off-by: Liu Bo Signed-off-by: Chris Mason diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index eecc20f..f107312 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -653,7 +653,6 @@ void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) u64 end; u64 orig_end; struct btrfs_ordered_extent *ordered; - int found; if (start + len < start) { orig_end = INT_LIMIT(loff_t); @@ -689,7 +688,6 @@ void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) filemap_fdatawait_range(inode->i_mapping, start, orig_end); end = orig_end; - found = 0; while (1) { ordered = btrfs_lookup_first_ordered_extent(inode, end); if (!ordered) @@ -702,7 +700,6 @@ void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) btrfs_put_ordered_extent(ordered); break; } - found++; btrfs_start_ordered_extent(inode, ordered, 1); end = ordered->file_offset; btrfs_put_ordered_extent(ordered); -- cgit v0.10.2 From b7d5b0a819498a9c04e1d18201a42468f7edd92a Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 1 Nov 2012 07:32:18 +0000 Subject: Btrfs: fix joining the same transaction handler more than 2 times If we flush inodes with pending delalloc in a transaction, we may join the same transaction handler more than 2 times. The reason is: Task use_count of trans handle commit_transaction 1 |-> btrfs_start_delalloc_inodes 1 |-> run_delalloc_nocow 1 |-> join_transaction 2 |-> cow_file_range 2 |-> join_transaction 3 In fact, cow_file_range needn't join the transaction again because the caller have joined the transaction, so we fix this problem by this way. Reported-by: Liu Bo Signed-off-by: Miao Xie Signed-off-by: Chris Mason diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index dce9e21..96d2090 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -804,14 +804,14 @@ static u64 get_extent_allocation_hint(struct inode *inode, u64 start, * required to start IO on it. It may be clean and already done with * IO when we return. */ -static noinline int cow_file_range(struct inode *inode, - struct page *locked_page, - u64 start, u64 end, int *page_started, - unsigned long *nr_written, - int unlock) +static noinline int __cow_file_range(struct btrfs_trans_handle *trans, + struct inode *inode, + struct btrfs_root *root, + struct page *locked_page, + u64 start, u64 end, int *page_started, + unsigned long *nr_written, + int unlock) { - struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_trans_handle *trans; u64 alloc_hint = 0; u64 num_bytes; unsigned long ram_size; @@ -824,25 +824,10 @@ static noinline int cow_file_range(struct inode *inode, int ret = 0; BUG_ON(btrfs_is_free_space_inode(inode)); - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) { - extent_clear_unlock_delalloc(inode, - &BTRFS_I(inode)->io_tree, - start, end, locked_page, - EXTENT_CLEAR_UNLOCK_PAGE | - EXTENT_CLEAR_UNLOCK | - EXTENT_CLEAR_DELALLOC | - EXTENT_CLEAR_DIRTY | - EXTENT_SET_WRITEBACK | - EXTENT_END_WRITEBACK); - return PTR_ERR(trans); - } - trans->block_rsv = &root->fs_info->delalloc_block_rsv; num_bytes = (end - start + blocksize) & ~(blocksize - 1); num_bytes = max(blocksize, num_bytes); disk_num_bytes = num_bytes; - ret = 0; /* if this is a small write inside eof, kick off defrag */ if (num_bytes < 64 * 1024 && @@ -953,11 +938,9 @@ static noinline int cow_file_range(struct inode *inode, alloc_hint = ins.objectid + ins.offset; start += cur_alloc_size; } - ret = 0; out: - btrfs_end_transaction(trans, root); - return ret; + out_unlock: extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, @@ -972,6 +955,39 @@ out_unlock: goto out; } +static noinline int cow_file_range(struct inode *inode, + struct page *locked_page, + u64 start, u64 end, int *page_started, + unsigned long *nr_written, + int unlock) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret; + + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + extent_clear_unlock_delalloc(inode, + &BTRFS_I(inode)->io_tree, + start, end, locked_page, + EXTENT_CLEAR_UNLOCK_PAGE | + EXTENT_CLEAR_UNLOCK | + EXTENT_CLEAR_DELALLOC | + EXTENT_CLEAR_DIRTY | + EXTENT_SET_WRITEBACK | + EXTENT_END_WRITEBACK); + return PTR_ERR(trans); + } + trans->block_rsv = &root->fs_info->delalloc_block_rsv; + + ret = __cow_file_range(trans, inode, root, locked_page, start, end, + page_started, nr_written, unlock); + + btrfs_end_transaction(trans, root); + + return ret; +} + /* * work queue call back to started compression on a file and pages */ @@ -1282,9 +1298,9 @@ out_check: btrfs_release_path(path); if (cow_start != (u64)-1) { - ret = cow_file_range(inode, locked_page, cow_start, - found_key.offset - 1, page_started, - nr_written, 1); + ret = __cow_file_range(trans, inode, root, locked_page, + cow_start, found_key.offset - 1, + page_started, nr_written, 1); if (ret) { btrfs_abort_transaction(trans, root, ret); goto error; @@ -1353,8 +1369,9 @@ out_check: } if (cow_start != (u64)-1) { - ret = cow_file_range(inode, locked_page, cow_start, end, - page_started, nr_written, 1); + ret = __cow_file_range(trans, inode, root, locked_page, + cow_start, end, + page_started, nr_written, 1); if (ret) { btrfs_abort_transaction(trans, root, ret); goto error; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 259f74e..44a5d73 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -312,6 +312,7 @@ start_transaction(struct btrfs_root *root, u64 num_items, int type, WARN_ON(type != TRANS_JOIN && type != TRANS_JOIN_NOLOCK); h = current->journal_info; h->use_count++; + WARN_ON(h->use_count > 2); h->orig_rsv = h->block_rsv; h->block_rsv = NULL; goto got_it; -- cgit v0.10.2 From ca46963718ef7368c84267c9f5e7394c3890442a Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 1 Nov 2012 07:33:14 +0000 Subject: Btrfs: fix missing flush when committing a transaction Consider the following case: Task1 Task2 start_transaction commit_transaction check pending snapshots list and the list is empty. add pending snapshot into list skip the delalloc flush end_transaction ... And then the problem that the snapshot is different with the source subvolume happen. This patch fixes the above problem by flush all pending stuffs when all the other tasks end the transaction. Signed-off-by: Miao Xie Signed-off-by: Chris Mason diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 44a5d73..bc1f523 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1399,6 +1399,48 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, kmem_cache_free(btrfs_trans_handle_cachep, trans); } +static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT); + int snap_pending = 0; + int ret; + + if (!flush_on_commit) { + spin_lock(&root->fs_info->trans_lock); + if (!list_empty(&trans->transaction->pending_snapshots)) + snap_pending = 1; + spin_unlock(&root->fs_info->trans_lock); + } + + if (flush_on_commit || snap_pending) { + btrfs_start_delalloc_inodes(root, 1); + btrfs_wait_ordered_extents(root, 1); + } + + ret = btrfs_run_delayed_items(trans, root); + if (ret) + return ret; + + /* + * running the delayed items may have added new refs. account + * them now so that they hinder processing of more delayed refs + * as little as possible. + */ + btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info); + + /* + * rename don't use btrfs_join_transaction, so, once we + * set the transaction to blocked above, we aren't going + * to get any new ordered operations. We can safely run + * it here and no for sure that nothing new will be added + * to the list + */ + btrfs_run_ordered_operations(root, 1); + + return 0; +} + /* * btrfs_transaction state sequence: * in_commit = 0, blocked = 0 (initial) @@ -1416,7 +1458,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, int ret; int should_grow = 0; unsigned long now = get_seconds(); - int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT); ret = btrfs_run_ordered_operations(root, 0); if (ret) { @@ -1495,47 +1536,14 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, should_grow = 1; do { - int snap_pending = 0; - joined = cur_trans->num_joined; - if (!list_empty(&trans->transaction->pending_snapshots)) - snap_pending = 1; WARN_ON(cur_trans != trans->transaction); - if (flush_on_commit || snap_pending) { - ret = btrfs_start_delalloc_inodes(root, 1); - if (ret) { - btrfs_abort_transaction(trans, root, ret); - goto cleanup_transaction; - } - btrfs_wait_ordered_extents(root, 1); - } - - ret = btrfs_run_delayed_items(trans, root); + ret = btrfs_flush_all_pending_stuffs(trans, root); if (ret) goto cleanup_transaction; - /* - * running the delayed items may have added new refs. account - * them now so that they hinder processing of more delayed refs - * as little as possible. - */ - btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info); - - /* - * rename don't use btrfs_join_transaction, so, once we - * set the transaction to blocked above, we aren't going - * to get any new ordered operations. We can safely run - * it here and no for sure that nothing new will be added - * to the list - */ - ret = btrfs_run_ordered_operations(root, 1); - if (ret) { - btrfs_abort_transaction(trans, root, ret); - goto cleanup_transaction; - } - prepare_to_wait(&cur_trans->writer_wait, &wait, TASK_UNINTERRUPTIBLE); @@ -1548,6 +1556,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, } while (atomic_read(&cur_trans->num_writers) > 1 || (should_grow && cur_trans->num_joined != joined)); + ret = btrfs_flush_all_pending_stuffs(trans, root); + if (ret) + goto cleanup_transaction; + /* * Ok now we need to make sure to block out any other joins while we * commit the transaction. We could have started a join before setting -- cgit v0.10.2 From 315a9850da2b89c83971b26fe54a60f22bdd91ad Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 1 Nov 2012 07:33:59 +0000 Subject: Btrfs: fix wrong file extent length There are two types of the file extent - inline extent and regular extent, When we log file extents, we didn't take inline extent into account, fix it. Signed-off-by: Miao Xie Reviewed-by: Liu Bo Signed-off-by: Chris Mason diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 2d41cb2..f9a0786 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3263,6 +3263,7 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 objectid, u64 bytenr, int mod); +u64 btrfs_file_extent_length(struct btrfs_path *path); int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_ordered_sum *sums); diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 1ad08e4e4..bd38cef 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -133,7 +133,6 @@ fail: return ERR_PTR(ret); } - int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 objectid, @@ -151,6 +150,26 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, return ret; } +u64 btrfs_file_extent_length(struct btrfs_path *path) +{ + int extent_type; + struct btrfs_file_extent_item *fi; + u64 len; + + fi = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_file_extent_item); + extent_type = btrfs_file_extent_type(path->nodes[0], fi); + + if (extent_type == BTRFS_FILE_EXTENT_REG || + extent_type == BTRFS_FILE_EXTENT_PREALLOC) + len = btrfs_file_extent_num_bytes(path->nodes[0], fi); + else if (extent_type == BTRFS_FILE_EXTENT_INLINE) + len = btrfs_file_extent_inline_len(path->nodes[0], fi); + else + BUG(); + + return len; +} static int __btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, struct bio *bio, diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 4ec41ec..bcf0e48 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3143,7 +3143,6 @@ static int log_one_extent(struct btrfs_trans_handle *trans, struct btrfs_path *dst_path, struct log_args *args) { struct btrfs_root *log = root->log_root; - struct btrfs_file_extent_item *fi; struct btrfs_key key; u64 start = em->mod_start; u64 search_start = start; @@ -3199,10 +3198,7 @@ again: } } while (key.offset > start); - fi = btrfs_item_ptr(path->nodes[0], path->slots[0], - struct btrfs_file_extent_item); - num_bytes = btrfs_file_extent_num_bytes(path->nodes[0], - fi); + num_bytes = btrfs_file_extent_length(path); if (key.offset + num_bytes <= start) { btrfs_release_path(path); return -ENOENT; @@ -3211,8 +3207,7 @@ again: args->src = path->nodes[0]; next_slot: btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); - fi = btrfs_item_ptr(args->src, path->slots[0], - struct btrfs_file_extent_item); + num_bytes = btrfs_file_extent_length(path); if (args->nr && args->start_slot + args->nr == path->slots[0]) { args->nr++; @@ -3230,7 +3225,6 @@ next_slot: } nritems = btrfs_header_nritems(path->nodes[0]); path->slots[0]++; - num_bytes = btrfs_file_extent_num_bytes(args->src, fi); if (len < num_bytes) { /* I _think_ this is ok, envision we write to a * preallocated space that is adjacent to a previously -- cgit v0.10.2 From bbe1426764e5dfaa57e7b12cc954acdb3fb7f94b Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 1 Nov 2012 07:34:54 +0000 Subject: Btrfs: fix unprotected extent map operation when logging file extents We forget to protect the modified_extents list, fix it. Signed-off-by: Miao Xie Reviewed-by: Liu Bo Signed-off-by: Chris Mason diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index bcf0e48..d1947af 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3526,8 +3526,10 @@ next_slot: struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree; struct extent_map *em, *n; + write_lock(&tree->lock); list_for_each_entry_safe(em, n, &tree->modified_extents, list) list_del_init(&em->list); + write_unlock(&tree->lock); } if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) { -- cgit v0.10.2 From 5269b67e3d809dcaa4c6763a343423bb1b7b3fe6 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 1 Nov 2012 07:35:23 +0000 Subject: Btrfs: fix missing log when BTRFS_INODE_NEEDS_FULL_SYNC is set If we set BTRFS_INODE_NEEDS_FULL_SYNC, we should log all the extent, but now we forget to take it into account, and set a wrong max key, if so, we will skip the file extent metadata when doing logging. Fix it. Signed-off-by: Miao Xie Signed-off-by: Chris Mason diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index d1947af..40b9efd 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3394,7 +3394,10 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, /* today the code can only do partial logging of directories */ - if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode)) + if (S_ISDIR(inode->i_mode) || + (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, + &BTRFS_I(inode)->runtime_flags) && + inode_only == LOG_INODE_EXISTS)) max_key.type = BTRFS_XATTR_ITEM_KEY; else max_key.type = (u8)-1; -- cgit v0.10.2 From 31b1a2bd758f439fc945b3ac5899d890cb7e2dc6 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Sat, 3 Nov 2012 10:58:34 +0000 Subject: fs/btrfs: use WARN Use WARN rather than printk followed by WARN_ON(1), for conciseness. A simplified version of the semantic patch that makes this transformation is as follows: (http://coccinelle.lip6.fr/) // @@ expression list es; @@ -printk( +WARN(1, es); -WARN_ON(1); // Signed-off-by: Julia Lawall Reviewed-by: David Sterba Signed-off-by: Chris Mason diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 100c274..0e4adb0 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1359,19 +1359,16 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, u64 search_start; int ret; - if (trans->transaction != root->fs_info->running_transaction) { - printk(KERN_CRIT "trans %llu running %llu\n", + if (trans->transaction != root->fs_info->running_transaction) + WARN(1, KERN_CRIT "trans %llu running %llu\n", (unsigned long long)trans->transid, (unsigned long long) root->fs_info->running_transaction->transid); - WARN_ON(1); - } - if (trans->transid != root->fs_info->generation) { - printk(KERN_CRIT "trans %llu running %llu\n", + + if (trans->transid != root->fs_info->generation) + WARN(1, KERN_CRIT "trans %llu running %llu\n", (unsigned long long)trans->transid, (unsigned long long)root->fs_info->generation); - WARN_ON(1); - } if (!should_cow_block(trans, root, buf)) { *cow_ret = buf; @@ -3640,11 +3637,9 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, btrfs_set_header_nritems(left, old_left_nritems + push_items); /* fixup right node */ - if (push_items > right_nritems) { - printk(KERN_CRIT "push items %d nr %u\n", push_items, + if (push_items > right_nritems) + WARN(1, KERN_CRIT "push items %d nr %u\n", push_items, right_nritems); - WARN_ON(1); - } if (push_items < right_nritems) { push_space = btrfs_item_offset_nr(right, push_items - 1) - diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 0643159..07a2162 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3397,14 +3397,12 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf) int was_dirty; btrfs_assert_tree_locked(buf); - if (transid != root->fs_info->generation) { - printk(KERN_CRIT "btrfs transid mismatch buffer %llu, " + if (transid != root->fs_info->generation) + WARN(1, KERN_CRIT "btrfs transid mismatch buffer %llu, " "found %llu running %llu\n", (unsigned long long)buf->start, (unsigned long long)transid, (unsigned long long)root->fs_info->generation); - WARN_ON(1); - } was_dirty = set_extent_buffer_dirty(buf); if (!was_dirty) { spin_lock(&root->fs_info->delalloc_lock); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index b495cb4..0bcb954 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -6292,10 +6292,9 @@ use_block_rsv(struct btrfs_trans_handle *trans, static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL, /*DEFAULT_RATELIMIT_BURST*/ 2); - if (__ratelimit(&_rs)) { - printk(KERN_DEBUG "btrfs: block rsv returned %d\n", ret); - WARN_ON(1); - } + if (__ratelimit(&_rs)) + WARN(1, KERN_DEBUG "btrfs: block rsv returned %d\n", + ret); ret = reserve_metadata_bytes(root, block_rsv, blocksize, BTRFS_RESERVE_NO_FLUSH); if (!ret) { diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 472873a..3c062c8 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -341,12 +341,10 @@ static int insert_state(struct extent_io_tree *tree, { struct rb_node *node; - if (end < start) { - printk(KERN_ERR "btrfs end < start %llu %llu\n", + if (end < start) + WARN(1, KERN_ERR "btrfs end < start %llu %llu\n", (unsigned long long)end, (unsigned long long)start); - WARN_ON(1); - } state->start = start; state->end = end; @@ -4721,10 +4719,9 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start, } if (start + min_len > eb->len) { - printk(KERN_ERR "btrfs bad mapping eb start %llu len %lu, " + WARN(1, KERN_ERR "btrfs bad mapping eb start %llu len %lu, " "wanted %lu %lu\n", (unsigned long long)eb->start, eb->len, start, min_len); - WARN_ON(1); return -EINVAL; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 96d2090..6dca345 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5458,8 +5458,7 @@ again: extent_map_end(em) - 1, NULL, GFP_NOFS); goto insert; } else { - printk(KERN_ERR "btrfs unknown found_type %d\n", found_type); - WARN_ON(1); + WARN(1, KERN_ERR "btrfs unknown found_type %d\n", found_type); } not_found: em->start = start; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index bc1f523..f21f39f 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -145,16 +145,12 @@ loop: * the log must never go across transaction boundaries. */ smp_mb(); - if (!list_empty(&fs_info->tree_mod_seq_list)) { - printk(KERN_ERR "btrfs: tree_mod_seq_list not empty when " + if (!list_empty(&fs_info->tree_mod_seq_list)) + WARN(1, KERN_ERR "btrfs: tree_mod_seq_list not empty when " "creating a fresh transaction\n"); - WARN_ON(1); - } - if (!RB_EMPTY_ROOT(&fs_info->tree_mod_log)) { - printk(KERN_ERR "btrfs: tree_mod_log rb tree not empty when " + if (!RB_EMPTY_ROOT(&fs_info->tree_mod_log)) + WARN(1, KERN_ERR "btrfs: tree_mod_log rb tree not empty when " "creating a fresh transaction\n"); - WARN_ON(1); - } atomic_set(&fs_info->tree_mod_seq, 0); spin_lock_init(&cur_trans->commit_lock); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index eeed97d..3f4bfee 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3323,9 +3323,8 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, cur = cur->next; if (!device->writeable) { - printk(KERN_ERR + WARN(1, KERN_ERR "btrfs: read-only device in alloc_list\n"); - WARN_ON(1); continue; } -- cgit v0.10.2 From 6c1500f22a7be3a24ad3dffcdbf04be3f676521b Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Sat, 3 Nov 2012 20:30:18 +0000 Subject: fs/btrfs: drop if around WARN_ON Just use WARN_ON rather than an if containing only WARN_ON(1). A simplified version of the semantic patch that makes this transformation is as follows: (http://coccinelle.lip6.fr/) // @@ expression e; @@ - if (e) WARN_ON(1); + WARN_ON(e); // Signed-off-by: Julia Lawall Reviewed-by: David Sterba Signed-off-by: Chris Mason diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 208d8aa..a321952 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -890,8 +890,7 @@ again: while (!list_empty(&prefs)) { ref = list_first_entry(&prefs, struct __prelim_ref, list); list_del(&ref->list); - if (ref->count < 0) - WARN_ON(1); + WARN_ON(ref->count < 0); if (ref->count && ref->root_id && ref->parent == 0) { /* no parent == root of tree */ ret = ulist_add(roots, ref->root_id, 0, GFP_NOFS); diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 0e4adb0..5c2cf99 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1464,10 +1464,8 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, if (cache_only && parent_level != 1) return 0; - if (trans->transaction != root->fs_info->running_transaction) - WARN_ON(1); - if (trans->transid != root->fs_info->generation) - WARN_ON(1); + WARN_ON(trans->transaction != root->fs_info->running_transaction); + WARN_ON(trans->transid != root->fs_info->generation); parent_nritems = btrfs_header_nritems(parent); blocksize = btrfs_level_size(root, parent_level - 1); @@ -3398,8 +3396,7 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, if (push_items == 0) goto out_unlock; - if (!empty && push_items == left_nritems) - WARN_ON(1); + WARN_ON(!empty && push_items == left_nritems); /* push left to right */ right_nritems = btrfs_header_nritems(right); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 6dca345..e8733fa 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1675,8 +1675,7 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans, int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, struct extent_state **cached_state) { - if ((end & (PAGE_CACHE_SIZE - 1)) == 0) - WARN_ON(1); + WARN_ON((end & (PAGE_CACHE_SIZE - 1)) == 0); return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end, cached_state, GFP_NOFS); } -- cgit v0.10.2 From 37c4146d2208ba7e4463e8dd95a1bf9e3d865280 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Mon, 5 Nov 2012 12:42:08 +0000 Subject: Btrfs: fix a deadlock in aborting transaction due to ENOSPC When committing a transaction, we may bail out of running delayed refs due to ENOSPC, and then abort the current transaction to flip into readonly. But we'll hit a deadlock on ref head's lock since we forget to release its lock and other cleanup stuff. Signed-off-by: Liu Bo Signed-off-by: Chris Mason diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 0bcb954..f8a358a 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2297,6 +2297,9 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, kfree(extent_op); if (ret) { + list_del_init(&locked_ref->cluster); + mutex_unlock(&locked_ref->mutex); + printk(KERN_DEBUG "btrfs: run_delayed_extent_op returned %d\n", ret); spin_lock(&delayed_refs->lock); return ret; @@ -2339,6 +2342,10 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, count++; if (ret) { + if (locked_ref) { + list_del_init(&locked_ref->cluster); + mutex_unlock(&locked_ref->mutex); + } printk(KERN_DEBUG "btrfs: run_one_delayed_ref returned %d\n", ret); spin_lock(&delayed_refs->lock); return ret; -- cgit v0.10.2 From 109f2365f1928af241b2ccbd0f6ba0b93d911288 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Mon, 5 Nov 2012 12:42:09 +0000 Subject: Btrfs: fix a double free on pending snapshots in error handling When creating a snapshot, failing to commit a transaction can end up with aborting the transaction, following by doing a cleanup for it, where we'll free all snapshots pending to disk. So we check it and avoid double free on pending snapshots. Signed-off-by: Liu Bo Signed-off-by: Chris Mason diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 14c0d2e..e262cd8 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -571,8 +571,12 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, ret = btrfs_commit_transaction(trans, root->fs_info->extent_root); } - if (ret) + if (ret) { + /* cleanup_transaction has freed this for us */ + if (trans->aborted) + pending_snapshot = NULL; goto fail; + } ret = pending_snapshot->error; if (ret) -- cgit v0.10.2 From d03f918ab9036cc71740c0aa796c8e02e6f6f6d3 Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Mon, 5 Nov 2012 13:10:49 +0000 Subject: Btrfs: Don't trust the superblock label and simply printk("%s") it Someone who is root or capable(CAP_SYS_ADMIN) could corrupt the superblock and make Btrfs printk("%s") crash while holding the uuid_mutex since nobody forces a limit on the string. Since the uuid_mutex is significant, the system would be unusable afterwards. Signed-off-by: Stefan Behrens Reviewed-by: David Sterba Signed-off-by: Chris Mason diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 3f4bfee..db79fb7 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -764,10 +764,13 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, devid = btrfs_stack_device_id(&disk_super->dev_item); transid = btrfs_super_generation(disk_super); total_devices = btrfs_super_num_devices(disk_super); - if (disk_super->label[0]) + if (disk_super->label[0]) { + if (disk_super->label[BTRFS_LABEL_SIZE - 1]) + disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0'; printk(KERN_INFO "device label %s ", disk_super->label); - else + } else { printk(KERN_INFO "device fsid %pU ", disk_super->fsid); + } printk(KERN_CONT "devid %llu transid %llu %s\n", (unsigned long long)devid, (unsigned long long)transid, path); ret = device_list_add(path, disk_super, devid, fs_devices_ret); -- cgit v0.10.2 From e1f5790e0588bc5b11eb57f95bfde8702049dd0d Mon Sep 17 00:00:00 2001 From: Tsutomu Itoh Date: Thu, 8 Nov 2012 04:47:33 +0000 Subject: Btrfs: set hole punching time properly Even if the hole punching is executed, the modification time of the file is not updated. So, current time is set to inode. Signed-off-by: Tsutomu Itoh Signed-off-by: Chris Mason diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index d2df981..883cf82 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1964,6 +1964,9 @@ out_trans: if (!trans) goto out_free; + inode_inc_iversion(inode); + inode->i_mtime = inode->i_ctime = CURRENT_TIME; + trans->block_rsv = &root->fs_info->trans_block_rsv; ret = btrfs_update_inode(trans, root, inode); nr = trans->blocks_used; -- cgit v0.10.2 From 3ef5969cd8a42a78ccdbc53f7abb2e6136b2ec65 Mon Sep 17 00:00:00 2001 From: Alexander Block Date: Thu, 8 Nov 2012 21:27:24 +0000 Subject: Btrfs: merge inode_list in __merge_refs When __merge_refs merges two refs, it is also needed to merge the inode_list of both refs. Otherwise we have missed backrefs and memory leaks. This happens for example if two inodes share an extent and both lie in the same leaf and thus also have the same parent. Signed-off-by: Alexander Block Reviewed-by: Jan Schmidt Signed-off-by: Chris Mason diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index a321952..04edf69 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -461,6 +461,7 @@ static int __merge_refs(struct list_head *head, int mode) pos2 = n2, n2 = pos2->next) { struct __prelim_ref *ref2; struct __prelim_ref *xchg; + struct extent_inode_elem *eie; ref2 = list_entry(pos2, struct __prelim_ref, list); @@ -472,12 +473,20 @@ static int __merge_refs(struct list_head *head, int mode) ref1 = ref2; ref2 = xchg; } - ref1->count += ref2->count; } else { if (ref1->parent != ref2->parent) continue; - ref1->count += ref2->count; } + + eie = ref1->inode_list; + while (eie && eie->next) + eie = eie->next; + if (eie) + eie->next = ref2->inode_list; + else + ref1->inode_list = ref2->inode_list; + ref1->count += ref2->count; + list_del(&ref2->list); kfree(ref2); } -- cgit v0.10.2 From b53d3f5db2b79637acadc06a330db6c2c60863f5 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Wed, 14 Nov 2012 14:34:34 +0000 Subject: Btrfs: cleanup for btrfs_btree_balance_dirty - 'nr' is no more used. - btrfs_btree_balance_dirty() and __btrfs_btree_balance_dirty() can share a bunch of code. Signed-off-by: Liu Bo Signed-off-by: Chris Mason diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 0c6dca5..3483603 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1257,7 +1257,6 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work) struct btrfs_delayed_node *delayed_node = NULL; struct btrfs_root *root; struct btrfs_block_rsv *block_rsv; - unsigned long nr = 0; int need_requeue = 0; int ret; @@ -1318,11 +1317,9 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work) delayed_node); mutex_unlock(&delayed_node->mutex); - nr = trans->blocks_used; - trans->block_rsv = block_rsv; btrfs_end_transaction_dmeta(trans, root); - __btrfs_btree_balance_dirty(root, nr); + btrfs_btree_balance_dirty_nodelay(root); free_path: btrfs_free_path(path); out: diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 07a2162..ff5d259 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3411,7 +3411,8 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf) } } -void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr) +static void __btrfs_btree_balance_dirty(struct btrfs_root *root, + int flush_delayed) { /* * looks as though older kernels can get into trouble with @@ -3423,7 +3424,8 @@ void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr) if (current->flags & PF_MEMALLOC) return; - btrfs_balance_delayed_items(root); + if (flush_delayed) + btrfs_balance_delayed_items(root); num_dirty = root->fs_info->dirty_metadata_bytes; @@ -3434,25 +3436,14 @@ void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr) return; } -void __btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr) +void btrfs_btree_balance_dirty(struct btrfs_root *root) { - /* - * looks as though older kernels can get into trouble with - * this code, they end up stuck in balance_dirty_pages forever - */ - u64 num_dirty; - unsigned long thresh = 32 * 1024 * 1024; - - if (current->flags & PF_MEMALLOC) - return; - - num_dirty = root->fs_info->dirty_metadata_bytes; + __btrfs_btree_balance_dirty(root, 1); +} - if (num_dirty > thresh) { - balance_dirty_pages_ratelimited_nr( - root->fs_info->btree_inode->i_mapping, 1); - } - return; +void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root) +{ + __btrfs_btree_balance_dirty(root, 0); } int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid) diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 2025a91..305c33e 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -62,8 +62,8 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info, struct btrfs_key *location); int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info); -void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr); -void __btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr); +void btrfs_btree_balance_dirty(struct btrfs_root *root); +void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root); void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); void btrfs_mark_buffer_dirty(struct extent_buffer *buf); int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 883cf82..bd7f1b0 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1349,7 +1349,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, balance_dirty_pages_ratelimited_nr(inode->i_mapping, dirty_pages); if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1) - btrfs_btree_balance_dirty(root, 1); + btrfs_btree_balance_dirty(root); pos += copied; num_written += copied; @@ -1803,7 +1803,6 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) u64 cur_offset = lockstart; u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); u64 drop_end; - unsigned long nr; int ret = 0; int err = 0; bool same_page = (offset >> PAGE_CACHE_SHIFT) == @@ -1931,9 +1930,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) break; } - nr = trans->blocks_used; btrfs_end_transaction(trans, root); - btrfs_btree_balance_dirty(root, nr); + btrfs_btree_balance_dirty(root); trans = btrfs_start_transaction(root, 3); if (IS_ERR(trans)) { @@ -1969,9 +1967,8 @@ out_trans: trans->block_rsv = &root->fs_info->trans_block_rsv; ret = btrfs_update_inode(trans, root, inode); - nr = trans->blocks_used; btrfs_end_transaction(trans, root); - btrfs_btree_balance_dirty(root, nr); + btrfs_btree_balance_dirty(root); out_free: btrfs_free_path(path); btrfs_free_block_rsv(root, rsv); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index e8733fa..aabf747 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3091,7 +3091,6 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry) struct btrfs_trans_handle *trans; struct inode *inode = dentry->d_inode; int ret; - unsigned long nr = 0; trans = __unlink_start_trans(dir, dentry); if (IS_ERR(trans)) @@ -3111,9 +3110,8 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry) } out: - nr = trans->blocks_used; __unlink_end_trans(trans, root); - btrfs_btree_balance_dirty(root, nr); + btrfs_btree_balance_dirty(root); return ret; } @@ -3203,7 +3201,6 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) int err = 0; struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_trans_handle *trans; - unsigned long nr = 0; if (inode->i_size > BTRFS_EMPTY_DIR_SIZE) return -ENOTEMPTY; @@ -3232,9 +3229,8 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) if (!err) btrfs_i_size_write(inode, 0); out: - nr = trans->blocks_used; __unlink_end_trans(trans, root); - btrfs_btree_balance_dirty(root, nr); + btrfs_btree_balance_dirty(root); return err; } @@ -3800,7 +3796,6 @@ void btrfs_evict_inode(struct inode *inode) struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_block_rsv *rsv, *global_rsv; u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); - unsigned long nr; int ret; trace_btrfs_inode_evict(inode); @@ -3882,10 +3877,9 @@ void btrfs_evict_inode(struct inode *inode) ret = btrfs_update_inode(trans, root, inode); BUG_ON(ret); - nr = trans->blocks_used; btrfs_end_transaction(trans, root); trans = NULL; - btrfs_btree_balance_dirty(root, nr); + btrfs_btree_balance_dirty(root); } btrfs_free_block_rsv(root, rsv); @@ -3901,9 +3895,8 @@ void btrfs_evict_inode(struct inode *inode) root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)) btrfs_return_ino(root, btrfs_ino(inode)); - nr = trans->blocks_used; btrfs_end_transaction(trans, root); - btrfs_btree_balance_dirty(root, nr); + btrfs_btree_balance_dirty(root); no_delete: clear_inode(inode); return; @@ -4915,7 +4908,6 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, int err; int drop_inode = 0; u64 objectid; - unsigned long nr = 0; u64 index = 0; if (!new_valid_dev(rdev)) @@ -4965,9 +4957,8 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, d_instantiate(dentry, inode); } out_unlock: - nr = trans->blocks_used; btrfs_end_transaction(trans, root); - btrfs_btree_balance_dirty(root, nr); + btrfs_btree_balance_dirty(root); if (drop_inode) { inode_dec_link_count(inode); iput(inode); @@ -4983,7 +4974,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, struct inode *inode = NULL; int drop_inode = 0; int err; - unsigned long nr = 0; u64 objectid; u64 index = 0; @@ -5033,13 +5023,12 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, d_instantiate(dentry, inode); } out_unlock: - nr = trans->blocks_used; btrfs_end_transaction(trans, root); if (drop_inode) { inode_dec_link_count(inode); iput(inode); } - btrfs_btree_balance_dirty(root, nr); + btrfs_btree_balance_dirty(root); return err; } @@ -5050,7 +5039,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, struct btrfs_root *root = BTRFS_I(dir)->root; struct inode *inode = old_dentry->d_inode; u64 index; - unsigned long nr = 0; int err; int drop_inode = 0; @@ -5094,14 +5082,13 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, btrfs_log_new_name(trans, inode, NULL, parent); } - nr = trans->blocks_used; btrfs_end_transaction(trans, root); fail: if (drop_inode) { inode_dec_link_count(inode); iput(inode); } - btrfs_btree_balance_dirty(root, nr); + btrfs_btree_balance_dirty(root); return err; } @@ -5114,7 +5101,6 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) int drop_on_err = 0; u64 objectid = 0; u64 index = 0; - unsigned long nr = 1; /* * 2 items for inode and ref @@ -5160,11 +5146,10 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) drop_on_err = 0; out_fail: - nr = trans->blocks_used; btrfs_end_transaction(trans, root); if (drop_on_err) iput(inode); - btrfs_btree_balance_dirty(root, nr); + btrfs_btree_balance_dirty(root); return err; } @@ -6872,7 +6857,6 @@ static int btrfs_truncate(struct inode *inode) int ret; int err = 0; struct btrfs_trans_handle *trans; - unsigned long nr; u64 mask = root->sectorsize - 1; u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); @@ -6995,9 +6979,8 @@ static int btrfs_truncate(struct inode *inode) break; } - nr = trans->blocks_used; btrfs_end_transaction(trans, root); - btrfs_btree_balance_dirty(root, nr); + btrfs_btree_balance_dirty(root); trans = btrfs_start_transaction(root, 2); if (IS_ERR(trans)) { @@ -7031,9 +7014,8 @@ static int btrfs_truncate(struct inode *inode) if (ret && !err) err = ret; - nr = trans->blocks_used; ret = btrfs_end_transaction(trans, root); - btrfs_btree_balance_dirty(root, nr); + btrfs_btree_balance_dirty(root); } out: @@ -7594,7 +7576,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, unsigned long ptr; struct btrfs_file_extent_item *ei; struct extent_buffer *leaf; - unsigned long nr = 0; name_len = strlen(symname) + 1; if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root)) @@ -7692,13 +7673,12 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, out_unlock: if (!err) d_instantiate(dentry, inode); - nr = trans->blocks_used; btrfs_end_transaction(trans, root); if (drop_inode) { inode_dec_link_count(inode); iput(inode); } - btrfs_btree_balance_dirty(root, nr); + btrfs_btree_balance_dirty(root); return err; } diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 270f24f..300e09a 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2025,7 +2025,6 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, struct btrfs_root_item *root_item; struct btrfs_path *path; struct extent_buffer *leaf; - unsigned long nr; int level; int max_level; int replaced = 0; @@ -2126,10 +2125,9 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, path->slots[level]); root_item->drop_level = level; - nr = trans->blocks_used; btrfs_end_transaction_throttle(trans, root); - btrfs_btree_balance_dirty(root, nr); + btrfs_btree_balance_dirty(root); if (replaced && rc->stage == UPDATE_DATA_PTRS) invalidate_extent_cache(root, &key, &next_key); @@ -2156,10 +2154,9 @@ out: btrfs_update_reloc_root(trans, root); } - nr = trans->blocks_used; btrfs_end_transaction_throttle(trans, root); - btrfs_btree_balance_dirty(root, nr); + btrfs_btree_balance_dirty(root); if (replaced && rc->stage == UPDATE_DATA_PTRS) invalidate_extent_cache(root, &key, &next_key); @@ -3262,7 +3259,6 @@ static int delete_block_group_cache(struct btrfs_fs_info *fs_info, struct btrfs_path *path; struct btrfs_root *root = fs_info->tree_root; struct btrfs_trans_handle *trans; - unsigned long nr; int ret = 0; if (inode) @@ -3296,9 +3292,8 @@ truncate: ret = btrfs_truncate_free_space_cache(root, trans, path, inode); btrfs_free_path(path); - nr = trans->blocks_used; btrfs_end_transaction(trans, root); - btrfs_btree_balance_dirty(root, nr); + btrfs_btree_balance_dirty(root); out: iput(inode); return ret; @@ -3715,7 +3710,6 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) struct btrfs_trans_handle *trans = NULL; struct btrfs_path *path; struct btrfs_extent_item *ei; - unsigned long nr; u64 flags; u32 item_size; int ret; @@ -3832,9 +3826,8 @@ restart: ret = btrfs_commit_transaction(trans, rc->extent_root); BUG_ON(ret); } else { - nr = trans->blocks_used; btrfs_end_transaction_throttle(trans, rc->extent_root); - btrfs_btree_balance_dirty(rc->extent_root, nr); + btrfs_btree_balance_dirty(rc->extent_root); } trans = NULL; @@ -3864,9 +3857,8 @@ restart: GFP_NOFS); if (trans) { - nr = trans->blocks_used; btrfs_end_transaction_throttle(trans, rc->extent_root); - btrfs_btree_balance_dirty(rc->extent_root, nr); + btrfs_btree_balance_dirty(rc->extent_root); } if (!err) { @@ -3945,7 +3937,6 @@ struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans; struct btrfs_root *root; struct btrfs_key key; - unsigned long nr; u64 objectid = BTRFS_FIRST_FREE_OBJECTID; int err = 0; @@ -3973,9 +3964,8 @@ struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, err = btrfs_orphan_add(trans, inode); out: - nr = trans->blocks_used; btrfs_end_transaction(trans, root); - btrfs_btree_balance_dirty(root, nr); + btrfs_btree_balance_dirty(root); if (err) { if (inode) iput(inode); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index f21f39f..7b29735 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -952,7 +952,6 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly) struct btrfs_fs_info *info = root->fs_info; struct btrfs_trans_handle *trans; int ret; - unsigned long nr; if (xchg(&root->defrag_running, 1)) return 0; @@ -964,9 +963,8 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly) ret = btrfs_defrag_leaves(trans, root, cacheonly); - nr = trans->blocks_used; btrfs_end_transaction(trans, root); - btrfs_btree_balance_dirty(info->tree_root, nr); + btrfs_btree_balance_dirty(info->tree_root); cond_resched(); if (btrfs_fs_closing(root->fs_info) || ret != -EAGAIN) -- cgit v0.10.2 From d25628bdd66aedd6e07729d8dc6c8ee846d66d72 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Wed, 14 Nov 2012 14:35:30 +0000 Subject: Btrfs: protect devices list with its mutex Since we've kill the bigger one volume_mutex, we need to add devices list mutex back. Signed-off-by: Liu Bo Signed-off-by: Chris Mason diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index db79fb7..92e586b 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1681,16 +1681,17 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) filemap_write_and_wait(bdev->bd_inode->i_mapping); devices = &root->fs_info->fs_devices->devices; - /* - * we have the volume lock, so we don't need the extra - * device list mutex while reading the list here. - */ + + mutex_lock(&root->fs_info->fs_devices->device_list_mutex); list_for_each_entry(device, devices, dev_list) { if (device->bdev == bdev) { ret = -EEXIST; + mutex_unlock( + &root->fs_info->fs_devices->device_list_mutex); goto error; } } + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); device = kzalloc(sizeof(*device), GFP_NOFS); if (!device) { -- cgit v0.10.2 From d9d181c1ba7aa09a6d2698e8c7e75b515524d504 Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Fri, 2 Nov 2012 09:58:09 +0100 Subject: Btrfs: rename the scrub context structure The device replace procedure makes use of the scrub code. The scrub code is the most efficient code to read the allocated data of a disk, i.e. it reads sequentially in order to avoid disk head movements, it skips unallocated blocks, it uses read ahead mechanisms, and it contains all the code to detect and repair defects. This commit is a first preparation step to adapt the scrub code to be shareable for the device replace procedure. The block device will be removed from the scrub context state structure in a later step. It used to be the source block device. The scrub code as it is used for the device replace procedure reads the source data from whereever it is optimal. The source device might even be gone (disconnected, for instance due to a hardware failure). Or the drive can be so faulty so that the device replace procedure tries to avoid access to the faulty source drive as much as possible, and only if all other mirrors are damaged, as a last resort, the source disk is accessed. The modified scrub code operates as if it would handle the source drive and thereby generates an exact copy of the source disk on the target disk, even if the source disk is not present at all. Therefore the block device pointer to the source disk is removed in a later patch, and therefore the context structure is renamed (this is the goal of the current patch) to reflect that no source block device scope is there anymore. Summary: This first preparation step consists of a textual substitution of the term "dev" to the term "ctx" whereever the scrub context is used. Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 27892f6..29c8aac 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -42,10 +42,10 @@ */ struct scrub_block; -struct scrub_dev; +struct scrub_ctx; #define SCRUB_PAGES_PER_BIO 16 /* 64k per bio */ -#define SCRUB_BIOS_PER_DEV 16 /* 1 MB per device in flight */ +#define SCRUB_BIOS_PER_CTX 16 /* 1 MB per device in flight */ #define SCRUB_MAX_PAGES_PER_BLOCK 16 /* 64k per node/leaf/sector */ struct scrub_page { @@ -66,7 +66,7 @@ struct scrub_page { struct scrub_bio { int index; - struct scrub_dev *sdev; + struct scrub_ctx *sctx; struct bio *bio; int err; u64 logical; @@ -82,7 +82,7 @@ struct scrub_block { int page_count; atomic_t outstanding_pages; atomic_t ref_count; /* free mem on transition to zero */ - struct scrub_dev *sdev; + struct scrub_ctx *sctx; struct { unsigned int header_error:1; unsigned int checksum_error:1; @@ -91,8 +91,8 @@ struct scrub_block { }; }; -struct scrub_dev { - struct scrub_bio *bios[SCRUB_BIOS_PER_DEV]; +struct scrub_ctx { + struct scrub_bio *bios[SCRUB_BIOS_PER_CTX]; struct btrfs_device *dev; int first_free; int curr; @@ -116,7 +116,7 @@ struct scrub_dev { }; struct scrub_fixup_nodatasum { - struct scrub_dev *sdev; + struct scrub_ctx *sctx; u64 logical; struct btrfs_root *root; struct btrfs_work work; @@ -138,7 +138,7 @@ struct scrub_warning { static int scrub_handle_errored_block(struct scrub_block *sblock_to_check); -static int scrub_setup_recheck_block(struct scrub_dev *sdev, +static int scrub_setup_recheck_block(struct scrub_ctx *sctx, struct btrfs_mapping_tree *map_tree, u64 length, u64 logical, struct scrub_block *sblock); @@ -163,9 +163,9 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock); static int scrub_checksum_super(struct scrub_block *sblock); static void scrub_block_get(struct scrub_block *sblock); static void scrub_block_put(struct scrub_block *sblock); -static int scrub_add_page_to_bio(struct scrub_dev *sdev, +static int scrub_add_page_to_bio(struct scrub_ctx *sctx, struct scrub_page *spage); -static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len, +static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len, u64 physical, u64 flags, u64 gen, int mirror_num, u8 *csum, int force); static void scrub_bio_end_io(struct bio *bio, int err); @@ -173,27 +173,27 @@ static void scrub_bio_end_io_worker(struct btrfs_work *work); static void scrub_block_complete(struct scrub_block *sblock); -static void scrub_free_csums(struct scrub_dev *sdev) +static void scrub_free_csums(struct scrub_ctx *sctx) { - while (!list_empty(&sdev->csum_list)) { + while (!list_empty(&sctx->csum_list)) { struct btrfs_ordered_sum *sum; - sum = list_first_entry(&sdev->csum_list, + sum = list_first_entry(&sctx->csum_list, struct btrfs_ordered_sum, list); list_del(&sum->list); kfree(sum); } } -static noinline_for_stack void scrub_free_dev(struct scrub_dev *sdev) +static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx) { int i; - if (!sdev) + if (!sctx) return; /* this can happen when scrub is cancelled */ - if (sdev->curr != -1) { - struct scrub_bio *sbio = sdev->bios[sdev->curr]; + if (sctx->curr != -1) { + struct scrub_bio *sbio = sctx->bios[sctx->curr]; for (i = 0; i < sbio->page_count; i++) { BUG_ON(!sbio->pagev[i]); @@ -203,69 +203,69 @@ static noinline_for_stack void scrub_free_dev(struct scrub_dev *sdev) bio_put(sbio->bio); } - for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) { - struct scrub_bio *sbio = sdev->bios[i]; + for (i = 0; i < SCRUB_BIOS_PER_CTX; ++i) { + struct scrub_bio *sbio = sctx->bios[i]; if (!sbio) break; kfree(sbio); } - scrub_free_csums(sdev); - kfree(sdev); + scrub_free_csums(sctx); + kfree(sctx); } static noinline_for_stack -struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev) +struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev) { - struct scrub_dev *sdev; + struct scrub_ctx *sctx; int i; struct btrfs_fs_info *fs_info = dev->dev_root->fs_info; int pages_per_bio; pages_per_bio = min_t(int, SCRUB_PAGES_PER_BIO, bio_get_nr_vecs(dev->bdev)); - sdev = kzalloc(sizeof(*sdev), GFP_NOFS); - if (!sdev) + sctx = kzalloc(sizeof(*sctx), GFP_NOFS); + if (!sctx) goto nomem; - sdev->dev = dev; - sdev->pages_per_bio = pages_per_bio; - sdev->curr = -1; - for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) { + sctx->dev = dev; + sctx->pages_per_bio = pages_per_bio; + sctx->curr = -1; + for (i = 0; i < SCRUB_BIOS_PER_CTX; ++i) { struct scrub_bio *sbio; sbio = kzalloc(sizeof(*sbio), GFP_NOFS); if (!sbio) goto nomem; - sdev->bios[i] = sbio; + sctx->bios[i] = sbio; sbio->index = i; - sbio->sdev = sdev; + sbio->sctx = sctx; sbio->page_count = 0; sbio->work.func = scrub_bio_end_io_worker; - if (i != SCRUB_BIOS_PER_DEV-1) - sdev->bios[i]->next_free = i + 1; + if (i != SCRUB_BIOS_PER_CTX - 1) + sctx->bios[i]->next_free = i + 1; else - sdev->bios[i]->next_free = -1; - } - sdev->first_free = 0; - sdev->nodesize = dev->dev_root->nodesize; - sdev->leafsize = dev->dev_root->leafsize; - sdev->sectorsize = dev->dev_root->sectorsize; - atomic_set(&sdev->in_flight, 0); - atomic_set(&sdev->fixup_cnt, 0); - atomic_set(&sdev->cancel_req, 0); - sdev->csum_size = btrfs_super_csum_size(fs_info->super_copy); - INIT_LIST_HEAD(&sdev->csum_list); - - spin_lock_init(&sdev->list_lock); - spin_lock_init(&sdev->stat_lock); - init_waitqueue_head(&sdev->list_wait); - return sdev; + sctx->bios[i]->next_free = -1; + } + sctx->first_free = 0; + sctx->nodesize = dev->dev_root->nodesize; + sctx->leafsize = dev->dev_root->leafsize; + sctx->sectorsize = dev->dev_root->sectorsize; + atomic_set(&sctx->in_flight, 0); + atomic_set(&sctx->fixup_cnt, 0); + atomic_set(&sctx->cancel_req, 0); + sctx->csum_size = btrfs_super_csum_size(fs_info->super_copy); + INIT_LIST_HEAD(&sctx->csum_list); + + spin_lock_init(&sctx->list_lock); + spin_lock_init(&sctx->stat_lock); + init_waitqueue_head(&sctx->list_wait); + return sctx; nomem: - scrub_free_dev(sdev); + scrub_free_ctx(sctx); return ERR_PTR(-ENOMEM); } @@ -345,7 +345,7 @@ err: static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) { - struct btrfs_device *dev = sblock->sdev->dev; + struct btrfs_device *dev = sblock->sctx->dev; struct btrfs_fs_info *fs_info = dev->dev_root->fs_info; struct btrfs_path *path; struct btrfs_key found_key; @@ -530,21 +530,21 @@ static void scrub_fixup_nodatasum(struct btrfs_work *work) { int ret; struct scrub_fixup_nodatasum *fixup; - struct scrub_dev *sdev; + struct scrub_ctx *sctx; struct btrfs_trans_handle *trans = NULL; struct btrfs_fs_info *fs_info; struct btrfs_path *path; int uncorrectable = 0; fixup = container_of(work, struct scrub_fixup_nodatasum, work); - sdev = fixup->sdev; + sctx = fixup->sctx; fs_info = fixup->root->fs_info; path = btrfs_alloc_path(); if (!path) { - spin_lock(&sdev->stat_lock); - ++sdev->stat.malloc_errors; - spin_unlock(&sdev->stat_lock); + spin_lock(&sctx->stat_lock); + ++sctx->stat.malloc_errors; + spin_unlock(&sctx->stat_lock); uncorrectable = 1; goto out; } @@ -573,22 +573,22 @@ static void scrub_fixup_nodatasum(struct btrfs_work *work) } WARN_ON(ret != 1); - spin_lock(&sdev->stat_lock); - ++sdev->stat.corrected_errors; - spin_unlock(&sdev->stat_lock); + spin_lock(&sctx->stat_lock); + ++sctx->stat.corrected_errors; + spin_unlock(&sctx->stat_lock); out: if (trans && !IS_ERR(trans)) btrfs_end_transaction(trans, fixup->root); if (uncorrectable) { - spin_lock(&sdev->stat_lock); - ++sdev->stat.uncorrectable_errors; - spin_unlock(&sdev->stat_lock); + spin_lock(&sctx->stat_lock); + ++sctx->stat.uncorrectable_errors; + spin_unlock(&sctx->stat_lock); printk_ratelimited_in_rcu(KERN_ERR "btrfs: unable to fixup (nodatasum) error at logical %llu on dev %s\n", (unsigned long long)fixup->logical, - rcu_str_deref(sdev->dev->name)); + rcu_str_deref(sctx->dev->name)); } btrfs_free_path(path); @@ -599,9 +599,9 @@ out: atomic_dec(&fs_info->scrubs_running); atomic_dec(&fs_info->scrubs_paused); mutex_unlock(&fs_info->scrub_lock); - atomic_dec(&sdev->fixup_cnt); + atomic_dec(&sctx->fixup_cnt); wake_up(&fs_info->scrub_pause_wait); - wake_up(&sdev->list_wait); + wake_up(&sctx->list_wait); } /* @@ -614,7 +614,7 @@ out: */ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) { - struct scrub_dev *sdev = sblock_to_check->sdev; + struct scrub_ctx *sctx = sblock_to_check->sctx; struct btrfs_fs_info *fs_info; u64 length; u64 logical; @@ -633,7 +633,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) DEFAULT_RATELIMIT_BURST); BUG_ON(sblock_to_check->page_count < 1); - fs_info = sdev->dev->dev_root->fs_info; + fs_info = sctx->dev->dev_root->fs_info; length = sblock_to_check->page_count * PAGE_SIZE; logical = sblock_to_check->pagev[0].logical; generation = sblock_to_check->pagev[0].generation; @@ -677,25 +677,25 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) sizeof(*sblocks_for_recheck), GFP_NOFS); if (!sblocks_for_recheck) { - spin_lock(&sdev->stat_lock); - sdev->stat.malloc_errors++; - sdev->stat.read_errors++; - sdev->stat.uncorrectable_errors++; - spin_unlock(&sdev->stat_lock); - btrfs_dev_stat_inc_and_print(sdev->dev, + spin_lock(&sctx->stat_lock); + sctx->stat.malloc_errors++; + sctx->stat.read_errors++; + sctx->stat.uncorrectable_errors++; + spin_unlock(&sctx->stat_lock); + btrfs_dev_stat_inc_and_print(sctx->dev, BTRFS_DEV_STAT_READ_ERRS); goto out; } /* setup the context, map the logical blocks and alloc the pages */ - ret = scrub_setup_recheck_block(sdev, &fs_info->mapping_tree, length, + ret = scrub_setup_recheck_block(sctx, &fs_info->mapping_tree, length, logical, sblocks_for_recheck); if (ret) { - spin_lock(&sdev->stat_lock); - sdev->stat.read_errors++; - sdev->stat.uncorrectable_errors++; - spin_unlock(&sdev->stat_lock); - btrfs_dev_stat_inc_and_print(sdev->dev, + spin_lock(&sctx->stat_lock); + sctx->stat.read_errors++; + sctx->stat.uncorrectable_errors++; + spin_unlock(&sctx->stat_lock); + btrfs_dev_stat_inc_and_print(sctx->dev, BTRFS_DEV_STAT_READ_ERRS); goto out; } @@ -704,13 +704,13 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) /* build and submit the bios for the failed mirror, check checksums */ ret = scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum, - csum, generation, sdev->csum_size); + csum, generation, sctx->csum_size); if (ret) { - spin_lock(&sdev->stat_lock); - sdev->stat.read_errors++; - sdev->stat.uncorrectable_errors++; - spin_unlock(&sdev->stat_lock); - btrfs_dev_stat_inc_and_print(sdev->dev, + spin_lock(&sctx->stat_lock); + sctx->stat.read_errors++; + sctx->stat.uncorrectable_errors++; + spin_unlock(&sctx->stat_lock); + btrfs_dev_stat_inc_and_print(sctx->dev, BTRFS_DEV_STAT_READ_ERRS); goto out; } @@ -725,45 +725,45 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) * different bio (usually one of the two latter cases is * the cause) */ - spin_lock(&sdev->stat_lock); - sdev->stat.unverified_errors++; - spin_unlock(&sdev->stat_lock); + spin_lock(&sctx->stat_lock); + sctx->stat.unverified_errors++; + spin_unlock(&sctx->stat_lock); goto out; } if (!sblock_bad->no_io_error_seen) { - spin_lock(&sdev->stat_lock); - sdev->stat.read_errors++; - spin_unlock(&sdev->stat_lock); + spin_lock(&sctx->stat_lock); + sctx->stat.read_errors++; + spin_unlock(&sctx->stat_lock); if (__ratelimit(&_rs)) scrub_print_warning("i/o error", sblock_to_check); - btrfs_dev_stat_inc_and_print(sdev->dev, + btrfs_dev_stat_inc_and_print(sctx->dev, BTRFS_DEV_STAT_READ_ERRS); } else if (sblock_bad->checksum_error) { - spin_lock(&sdev->stat_lock); - sdev->stat.csum_errors++; - spin_unlock(&sdev->stat_lock); + spin_lock(&sctx->stat_lock); + sctx->stat.csum_errors++; + spin_unlock(&sctx->stat_lock); if (__ratelimit(&_rs)) scrub_print_warning("checksum error", sblock_to_check); - btrfs_dev_stat_inc_and_print(sdev->dev, + btrfs_dev_stat_inc_and_print(sctx->dev, BTRFS_DEV_STAT_CORRUPTION_ERRS); } else if (sblock_bad->header_error) { - spin_lock(&sdev->stat_lock); - sdev->stat.verify_errors++; - spin_unlock(&sdev->stat_lock); + spin_lock(&sctx->stat_lock); + sctx->stat.verify_errors++; + spin_unlock(&sctx->stat_lock); if (__ratelimit(&_rs)) scrub_print_warning("checksum/header error", sblock_to_check); if (sblock_bad->generation_error) - btrfs_dev_stat_inc_and_print(sdev->dev, + btrfs_dev_stat_inc_and_print(sctx->dev, BTRFS_DEV_STAT_GENERATION_ERRS); else - btrfs_dev_stat_inc_and_print(sdev->dev, + btrfs_dev_stat_inc_and_print(sctx->dev, BTRFS_DEV_STAT_CORRUPTION_ERRS); } - if (sdev->readonly) + if (sctx->readonly) goto did_not_correct_error; if (!is_metadata && !have_csum) { @@ -779,7 +779,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) fixup_nodatasum = kzalloc(sizeof(*fixup_nodatasum), GFP_NOFS); if (!fixup_nodatasum) goto did_not_correct_error; - fixup_nodatasum->sdev = sdev; + fixup_nodatasum->sctx = sctx; fixup_nodatasum->logical = logical; fixup_nodatasum->root = fs_info->extent_root; fixup_nodatasum->mirror_num = failed_mirror_index + 1; @@ -796,7 +796,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) atomic_inc(&fs_info->scrubs_running); atomic_inc(&fs_info->scrubs_paused); mutex_unlock(&fs_info->scrub_lock); - atomic_inc(&sdev->fixup_cnt); + atomic_inc(&sctx->fixup_cnt); fixup_nodatasum->work.func = scrub_fixup_nodatasum; btrfs_queue_worker(&fs_info->scrub_workers, &fixup_nodatasum->work); @@ -818,7 +818,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) ret = scrub_recheck_block(fs_info, sblocks_for_recheck + mirror_index, is_metadata, have_csum, csum, - generation, sdev->csum_size); + generation, sctx->csum_size); if (ret) goto did_not_correct_error; } @@ -930,7 +930,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) */ ret = scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum, csum, - generation, sdev->csum_size); + generation, sctx->csum_size); if (!ret && !sblock_bad->header_error && !sblock_bad->checksum_error && sblock_bad->no_io_error_seen) @@ -939,23 +939,23 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) goto did_not_correct_error; } else { corrected_error: - spin_lock(&sdev->stat_lock); - sdev->stat.corrected_errors++; - spin_unlock(&sdev->stat_lock); + spin_lock(&sctx->stat_lock); + sctx->stat.corrected_errors++; + spin_unlock(&sctx->stat_lock); printk_ratelimited_in_rcu(KERN_ERR "btrfs: fixed up error at logical %llu on dev %s\n", (unsigned long long)logical, - rcu_str_deref(sdev->dev->name)); + rcu_str_deref(sctx->dev->name)); } } else { did_not_correct_error: - spin_lock(&sdev->stat_lock); - sdev->stat.uncorrectable_errors++; - spin_unlock(&sdev->stat_lock); + spin_lock(&sctx->stat_lock); + sctx->stat.uncorrectable_errors++; + spin_unlock(&sctx->stat_lock); printk_ratelimited_in_rcu(KERN_ERR "btrfs: unable to fixup (regular) error at logical %llu on dev %s\n", (unsigned long long)logical, - rcu_str_deref(sdev->dev->name)); + rcu_str_deref(sctx->dev->name)); } out: @@ -978,7 +978,7 @@ out: return 0; } -static int scrub_setup_recheck_block(struct scrub_dev *sdev, +static int scrub_setup_recheck_block(struct scrub_ctx *sctx, struct btrfs_mapping_tree *map_tree, u64 length, u64 logical, struct scrub_block *sblocks_for_recheck) @@ -988,7 +988,7 @@ static int scrub_setup_recheck_block(struct scrub_dev *sdev, int ret; /* - * note: the three members sdev, ref_count and outstanding_pages + * note: the three members sctx, ref_count and outstanding_pages * are not used (and not set) in the blocks that are used for * the recheck procedure */ @@ -1028,9 +1028,9 @@ static int scrub_setup_recheck_block(struct scrub_dev *sdev, page->mirror_num = mirror_index + 1; page->page = alloc_page(GFP_NOFS); if (!page->page) { - spin_lock(&sdev->stat_lock); - sdev->stat.malloc_errors++; - spin_unlock(&sdev->stat_lock); + spin_lock(&sctx->stat_lock); + sctx->stat.malloc_errors++; + spin_unlock(&sctx->stat_lock); kfree(bbio); return -ENOMEM; } @@ -1259,14 +1259,14 @@ static void scrub_checksum(struct scrub_block *sblock) static int scrub_checksum_data(struct scrub_block *sblock) { - struct scrub_dev *sdev = sblock->sdev; + struct scrub_ctx *sctx = sblock->sctx; u8 csum[BTRFS_CSUM_SIZE]; u8 *on_disk_csum; struct page *page; void *buffer; u32 crc = ~(u32)0; int fail = 0; - struct btrfs_root *root = sdev->dev->dev_root; + struct btrfs_root *root = sctx->dev->dev_root; u64 len; int index; @@ -1278,7 +1278,7 @@ static int scrub_checksum_data(struct scrub_block *sblock) page = sblock->pagev[0].page; buffer = kmap_atomic(page); - len = sdev->sectorsize; + len = sctx->sectorsize; index = 0; for (;;) { u64 l = min_t(u64, len, PAGE_SIZE); @@ -1296,7 +1296,7 @@ static int scrub_checksum_data(struct scrub_block *sblock) } btrfs_csum_final(crc, csum); - if (memcmp(csum, on_disk_csum, sdev->csum_size)) + if (memcmp(csum, on_disk_csum, sctx->csum_size)) fail = 1; return fail; @@ -1304,9 +1304,9 @@ static int scrub_checksum_data(struct scrub_block *sblock) static int scrub_checksum_tree_block(struct scrub_block *sblock) { - struct scrub_dev *sdev = sblock->sdev; + struct scrub_ctx *sctx = sblock->sctx; struct btrfs_header *h; - struct btrfs_root *root = sdev->dev->dev_root; + struct btrfs_root *root = sctx->dev->dev_root; struct btrfs_fs_info *fs_info = root->fs_info; u8 calculated_csum[BTRFS_CSUM_SIZE]; u8 on_disk_csum[BTRFS_CSUM_SIZE]; @@ -1324,7 +1324,7 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) page = sblock->pagev[0].page; mapped_buffer = kmap_atomic(page); h = (struct btrfs_header *)mapped_buffer; - memcpy(on_disk_csum, h->csum, sdev->csum_size); + memcpy(on_disk_csum, h->csum, sctx->csum_size); /* * we don't use the getter functions here, as we @@ -1345,8 +1345,8 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) BTRFS_UUID_SIZE)) ++fail; - BUG_ON(sdev->nodesize != sdev->leafsize); - len = sdev->nodesize - BTRFS_CSUM_SIZE; + BUG_ON(sctx->nodesize != sctx->leafsize); + len = sctx->nodesize - BTRFS_CSUM_SIZE; mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE; p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE; index = 0; @@ -1368,7 +1368,7 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) } btrfs_csum_final(crc, calculated_csum); - if (memcmp(calculated_csum, on_disk_csum, sdev->csum_size)) + if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size)) ++crc_fail; return fail || crc_fail; @@ -1377,8 +1377,8 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) static int scrub_checksum_super(struct scrub_block *sblock) { struct btrfs_super_block *s; - struct scrub_dev *sdev = sblock->sdev; - struct btrfs_root *root = sdev->dev->dev_root; + struct scrub_ctx *sctx = sblock->sctx; + struct btrfs_root *root = sctx->dev->dev_root; struct btrfs_fs_info *fs_info = root->fs_info; u8 calculated_csum[BTRFS_CSUM_SIZE]; u8 on_disk_csum[BTRFS_CSUM_SIZE]; @@ -1396,7 +1396,7 @@ static int scrub_checksum_super(struct scrub_block *sblock) page = sblock->pagev[0].page; mapped_buffer = kmap_atomic(page); s = (struct btrfs_super_block *)mapped_buffer; - memcpy(on_disk_csum, s->csum, sdev->csum_size); + memcpy(on_disk_csum, s->csum, sctx->csum_size); if (sblock->pagev[0].logical != le64_to_cpu(s->bytenr)) ++fail_cor; @@ -1429,7 +1429,7 @@ static int scrub_checksum_super(struct scrub_block *sblock) } btrfs_csum_final(crc, calculated_csum); - if (memcmp(calculated_csum, on_disk_csum, sdev->csum_size)) + if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size)) ++fail_cor; if (fail_cor + fail_gen) { @@ -1438,14 +1438,14 @@ static int scrub_checksum_super(struct scrub_block *sblock) * They will get written with the next transaction commit * anyway */ - spin_lock(&sdev->stat_lock); - ++sdev->stat.super_errors; - spin_unlock(&sdev->stat_lock); + spin_lock(&sctx->stat_lock); + ++sctx->stat.super_errors; + spin_unlock(&sctx->stat_lock); if (fail_cor) - btrfs_dev_stat_inc_and_print(sdev->dev, + btrfs_dev_stat_inc_and_print(sctx->dev, BTRFS_DEV_STAT_CORRUPTION_ERRS); else - btrfs_dev_stat_inc_and_print(sdev->dev, + btrfs_dev_stat_inc_and_print(sctx->dev, BTRFS_DEV_STAT_GENERATION_ERRS); } @@ -1469,21 +1469,21 @@ static void scrub_block_put(struct scrub_block *sblock) } } -static void scrub_submit(struct scrub_dev *sdev) +static void scrub_submit(struct scrub_ctx *sctx) { struct scrub_bio *sbio; - if (sdev->curr == -1) + if (sctx->curr == -1) return; - sbio = sdev->bios[sdev->curr]; - sdev->curr = -1; - atomic_inc(&sdev->in_flight); + sbio = sctx->bios[sctx->curr]; + sctx->curr = -1; + atomic_inc(&sctx->in_flight); btrfsic_submit_bio(READ, sbio->bio); } -static int scrub_add_page_to_bio(struct scrub_dev *sdev, +static int scrub_add_page_to_bio(struct scrub_ctx *sctx, struct scrub_page *spage) { struct scrub_block *sblock = spage->sblock; @@ -1494,20 +1494,20 @@ again: /* * grab a fresh bio or wait for one to become available */ - while (sdev->curr == -1) { - spin_lock(&sdev->list_lock); - sdev->curr = sdev->first_free; - if (sdev->curr != -1) { - sdev->first_free = sdev->bios[sdev->curr]->next_free; - sdev->bios[sdev->curr]->next_free = -1; - sdev->bios[sdev->curr]->page_count = 0; - spin_unlock(&sdev->list_lock); + while (sctx->curr == -1) { + spin_lock(&sctx->list_lock); + sctx->curr = sctx->first_free; + if (sctx->curr != -1) { + sctx->first_free = sctx->bios[sctx->curr]->next_free; + sctx->bios[sctx->curr]->next_free = -1; + sctx->bios[sctx->curr]->page_count = 0; + spin_unlock(&sctx->list_lock); } else { - spin_unlock(&sdev->list_lock); - wait_event(sdev->list_wait, sdev->first_free != -1); + spin_unlock(&sctx->list_lock); + wait_event(sctx->list_wait, sctx->first_free != -1); } } - sbio = sdev->bios[sdev->curr]; + sbio = sctx->bios[sctx->curr]; if (sbio->page_count == 0) { struct bio *bio; @@ -1515,7 +1515,7 @@ again: sbio->logical = spage->logical; bio = sbio->bio; if (!bio) { - bio = bio_alloc(GFP_NOFS, sdev->pages_per_bio); + bio = bio_alloc(GFP_NOFS, sctx->pages_per_bio); if (!bio) return -ENOMEM; sbio->bio = bio; @@ -1523,14 +1523,14 @@ again: bio->bi_private = sbio; bio->bi_end_io = scrub_bio_end_io; - bio->bi_bdev = sdev->dev->bdev; + bio->bi_bdev = sctx->dev->bdev; bio->bi_sector = spage->physical >> 9; sbio->err = 0; } else if (sbio->physical + sbio->page_count * PAGE_SIZE != spage->physical || sbio->logical + sbio->page_count * PAGE_SIZE != spage->logical) { - scrub_submit(sdev); + scrub_submit(sctx); goto again; } @@ -1542,20 +1542,20 @@ again: sbio->bio = NULL; return -EIO; } - scrub_submit(sdev); + scrub_submit(sctx); goto again; } scrub_block_get(sblock); /* one for the added page */ atomic_inc(&sblock->outstanding_pages); sbio->page_count++; - if (sbio->page_count == sdev->pages_per_bio) - scrub_submit(sdev); + if (sbio->page_count == sctx->pages_per_bio) + scrub_submit(sctx); return 0; } -static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len, +static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len, u64 physical, u64 flags, u64 gen, int mirror_num, u8 *csum, int force) { @@ -1564,15 +1564,15 @@ static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len, sblock = kzalloc(sizeof(*sblock), GFP_NOFS); if (!sblock) { - spin_lock(&sdev->stat_lock); - sdev->stat.malloc_errors++; - spin_unlock(&sdev->stat_lock); + spin_lock(&sctx->stat_lock); + sctx->stat.malloc_errors++; + spin_unlock(&sctx->stat_lock); return -ENOMEM; } /* one ref inside this function, plus one for each page later on */ atomic_set(&sblock->ref_count, 1); - sblock->sdev = sdev; + sblock->sctx = sctx; sblock->no_io_error_seen = 1; for (index = 0; len > 0; index++) { @@ -1582,9 +1582,9 @@ static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len, BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK); spage->page = alloc_page(GFP_NOFS); if (!spage->page) { - spin_lock(&sdev->stat_lock); - sdev->stat.malloc_errors++; - spin_unlock(&sdev->stat_lock); + spin_lock(&sctx->stat_lock); + sctx->stat.malloc_errors++; + spin_unlock(&sctx->stat_lock); while (index > 0) { index--; __free_page(sblock->pagev[index].page); @@ -1593,7 +1593,7 @@ static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len, return -ENOMEM; } spage->sblock = sblock; - spage->dev = sdev->dev; + spage->dev = sctx->dev; spage->flags = flags; spage->generation = gen; spage->logical = logical; @@ -1601,7 +1601,7 @@ static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len, spage->mirror_num = mirror_num; if (csum) { spage->have_csum = 1; - memcpy(spage->csum, csum, sdev->csum_size); + memcpy(spage->csum, csum, sctx->csum_size); } else { spage->have_csum = 0; } @@ -1616,7 +1616,7 @@ static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len, struct scrub_page *spage = sblock->pagev + index; int ret; - ret = scrub_add_page_to_bio(sdev, spage); + ret = scrub_add_page_to_bio(sctx, spage); if (ret) { scrub_block_put(sblock); return ret; @@ -1624,7 +1624,7 @@ static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len, } if (force) - scrub_submit(sdev); + scrub_submit(sctx); /* last one frees, either here or in bio completion for last page */ scrub_block_put(sblock); @@ -1634,8 +1634,8 @@ static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len, static void scrub_bio_end_io(struct bio *bio, int err) { struct scrub_bio *sbio = bio->bi_private; - struct scrub_dev *sdev = sbio->sdev; - struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info; + struct scrub_ctx *sctx = sbio->sctx; + struct btrfs_fs_info *fs_info = sctx->dev->dev_root->fs_info; sbio->err = err; sbio->bio = bio; @@ -1646,7 +1646,7 @@ static void scrub_bio_end_io(struct bio *bio, int err) static void scrub_bio_end_io_worker(struct btrfs_work *work) { struct scrub_bio *sbio = container_of(work, struct scrub_bio, work); - struct scrub_dev *sdev = sbio->sdev; + struct scrub_ctx *sctx = sbio->sctx; int i; BUG_ON(sbio->page_count > SCRUB_PAGES_PER_BIO); @@ -1671,12 +1671,12 @@ static void scrub_bio_end_io_worker(struct btrfs_work *work) bio_put(sbio->bio); sbio->bio = NULL; - spin_lock(&sdev->list_lock); - sbio->next_free = sdev->first_free; - sdev->first_free = sbio->index; - spin_unlock(&sdev->list_lock); - atomic_dec(&sdev->in_flight); - wake_up(&sdev->list_wait); + spin_lock(&sctx->list_lock); + sbio->next_free = sctx->first_free; + sctx->first_free = sbio->index; + spin_unlock(&sctx->list_lock); + atomic_dec(&sctx->in_flight); + wake_up(&sctx->list_wait); } static void scrub_block_complete(struct scrub_block *sblock) @@ -1687,7 +1687,7 @@ static void scrub_block_complete(struct scrub_block *sblock) scrub_checksum(sblock); } -static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len, +static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len, u8 *csum) { struct btrfs_ordered_sum *sum = NULL; @@ -1695,15 +1695,15 @@ static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len, unsigned long i; unsigned long num_sectors; - while (!list_empty(&sdev->csum_list)) { - sum = list_first_entry(&sdev->csum_list, + while (!list_empty(&sctx->csum_list)) { + sum = list_first_entry(&sctx->csum_list, struct btrfs_ordered_sum, list); if (sum->bytenr > logical) return 0; if (sum->bytenr + sum->len > logical) break; - ++sdev->stat.csum_discards; + ++sctx->stat.csum_discards; list_del(&sum->list); kfree(sum); sum = NULL; @@ -1711,10 +1711,10 @@ static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len, if (!sum) return 0; - num_sectors = sum->len / sdev->sectorsize; + num_sectors = sum->len / sctx->sectorsize; for (i = 0; i < num_sectors; ++i) { if (sum->sums[i].bytenr == logical) { - memcpy(csum, &sum->sums[i].sum, sdev->csum_size); + memcpy(csum, &sum->sums[i].sum, sctx->csum_size); ret = 1; break; } @@ -1727,7 +1727,7 @@ static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len, } /* scrub extent tries to collect up to 64 kB for each bio */ -static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len, +static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len, u64 physical, u64 flags, u64 gen, int mirror_num) { int ret; @@ -1735,20 +1735,20 @@ static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len, u32 blocksize; if (flags & BTRFS_EXTENT_FLAG_DATA) { - blocksize = sdev->sectorsize; - spin_lock(&sdev->stat_lock); - sdev->stat.data_extents_scrubbed++; - sdev->stat.data_bytes_scrubbed += len; - spin_unlock(&sdev->stat_lock); + blocksize = sctx->sectorsize; + spin_lock(&sctx->stat_lock); + sctx->stat.data_extents_scrubbed++; + sctx->stat.data_bytes_scrubbed += len; + spin_unlock(&sctx->stat_lock); } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { - BUG_ON(sdev->nodesize != sdev->leafsize); - blocksize = sdev->nodesize; - spin_lock(&sdev->stat_lock); - sdev->stat.tree_extents_scrubbed++; - sdev->stat.tree_bytes_scrubbed += len; - spin_unlock(&sdev->stat_lock); + BUG_ON(sctx->nodesize != sctx->leafsize); + blocksize = sctx->nodesize; + spin_lock(&sctx->stat_lock); + sctx->stat.tree_extents_scrubbed++; + sctx->stat.tree_bytes_scrubbed += len; + spin_unlock(&sctx->stat_lock); } else { - blocksize = sdev->sectorsize; + blocksize = sctx->sectorsize; BUG_ON(1); } @@ -1758,11 +1758,11 @@ static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len, if (flags & BTRFS_EXTENT_FLAG_DATA) { /* push csums to sbio */ - have_csum = scrub_find_csum(sdev, logical, l, csum); + have_csum = scrub_find_csum(sctx, logical, l, csum); if (have_csum == 0) - ++sdev->stat.no_csum; + ++sctx->stat.no_csum; } - ret = scrub_pages(sdev, logical, l, physical, flags, gen, + ret = scrub_pages(sctx, logical, l, physical, flags, gen, mirror_num, have_csum ? csum : NULL, 0); if (ret) return ret; @@ -1773,11 +1773,11 @@ static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len, return 0; } -static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev, +static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, struct map_lookup *map, int num, u64 base, u64 length) { struct btrfs_path *path; - struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info; + struct btrfs_fs_info *fs_info = sctx->dev->dev_root->fs_info; struct btrfs_root *root = fs_info->extent_root; struct btrfs_root *csum_root = fs_info->csum_root; struct btrfs_extent_item *extent; @@ -1843,8 +1843,8 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev, */ logical = base + offset; - wait_event(sdev->list_wait, - atomic_read(&sdev->in_flight) == 0); + wait_event(sctx->list_wait, + atomic_read(&sctx->in_flight) == 0); atomic_inc(&fs_info->scrubs_paused); wake_up(&fs_info->scrub_pause_wait); @@ -1898,7 +1898,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev, * canceled? */ if (atomic_read(&fs_info->scrub_cancel_req) || - atomic_read(&sdev->cancel_req)) { + atomic_read(&sctx->cancel_req)) { ret = -ECANCELED; goto out; } @@ -1907,9 +1907,9 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev, */ if (atomic_read(&fs_info->scrub_pause_req)) { /* push queued extents */ - scrub_submit(sdev); - wait_event(sdev->list_wait, - atomic_read(&sdev->in_flight) == 0); + scrub_submit(sctx); + wait_event(sctx->list_wait, + atomic_read(&sctx->in_flight) == 0); atomic_inc(&fs_info->scrubs_paused); wake_up(&fs_info->scrub_pause_wait); mutex_lock(&fs_info->scrub_lock); @@ -1926,7 +1926,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev, ret = btrfs_lookup_csums_range(csum_root, logical, logical + map->stripe_len - 1, - &sdev->csum_list, 1); + &sctx->csum_list, 1); if (ret) goto out; @@ -2004,7 +2004,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev, key.objectid; } - ret = scrub_extent(sdev, key.objectid, key.offset, + ret = scrub_extent(sctx, key.objectid, key.offset, key.objectid - logical + physical, flags, generation, mirror_num); if (ret) @@ -2016,12 +2016,12 @@ next: btrfs_release_path(path); logical += increment; physical += map->stripe_len; - spin_lock(&sdev->stat_lock); - sdev->stat.last_physical = physical; - spin_unlock(&sdev->stat_lock); + spin_lock(&sctx->stat_lock); + sctx->stat.last_physical = physical; + spin_unlock(&sctx->stat_lock); } /* push queued extents */ - scrub_submit(sdev); + scrub_submit(sctx); out: blk_finish_plug(&plug); @@ -2029,12 +2029,12 @@ out: return ret < 0 ? ret : 0; } -static noinline_for_stack int scrub_chunk(struct scrub_dev *sdev, +static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx, u64 chunk_tree, u64 chunk_objectid, u64 chunk_offset, u64 length, u64 dev_offset) { struct btrfs_mapping_tree *map_tree = - &sdev->dev->dev_root->fs_info->mapping_tree; + &sctx->dev->dev_root->fs_info->mapping_tree; struct map_lookup *map; struct extent_map *em; int i; @@ -2055,9 +2055,9 @@ static noinline_for_stack int scrub_chunk(struct scrub_dev *sdev, goto out; for (i = 0; i < map->num_stripes; ++i) { - if (map->stripes[i].dev == sdev->dev && + if (map->stripes[i].dev == sctx->dev && map->stripes[i].physical == dev_offset) { - ret = scrub_stripe(sdev, map, i, chunk_offset, length); + ret = scrub_stripe(sctx, map, i, chunk_offset, length); if (ret) goto out; } @@ -2069,11 +2069,11 @@ out: } static noinline_for_stack -int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end) +int scrub_enumerate_chunks(struct scrub_ctx *sctx, u64 start, u64 end) { struct btrfs_dev_extent *dev_extent = NULL; struct btrfs_path *path; - struct btrfs_root *root = sdev->dev->dev_root; + struct btrfs_root *root = sctx->dev->dev_root; struct btrfs_fs_info *fs_info = root->fs_info; u64 length; u64 chunk_tree; @@ -2094,7 +2094,7 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end) path->search_commit_root = 1; path->skip_locking = 1; - key.objectid = sdev->dev->devid; + key.objectid = sctx->dev->devid; key.offset = 0ull; key.type = BTRFS_DEV_EXTENT_KEY; @@ -2117,7 +2117,7 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end) btrfs_item_key_to_cpu(l, &found_key, slot); - if (found_key.objectid != sdev->dev->devid) + if (found_key.objectid != sctx->dev->devid) break; if (btrfs_key_type(&found_key) != BTRFS_DEV_EXTENT_KEY) @@ -2151,7 +2151,7 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end) ret = -ENOENT; break; } - ret = scrub_chunk(sdev, chunk_tree, chunk_objectid, + ret = scrub_chunk(sctx, chunk_tree, chunk_objectid, chunk_offset, length, found_key.offset); btrfs_put_block_group(cache); if (ret) @@ -2170,13 +2170,13 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end) return ret < 0 ? ret : 0; } -static noinline_for_stack int scrub_supers(struct scrub_dev *sdev) +static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx) { int i; u64 bytenr; u64 gen; int ret; - struct btrfs_device *device = sdev->dev; + struct btrfs_device *device = sctx->dev; struct btrfs_root *root = device->dev_root; if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) @@ -2189,12 +2189,12 @@ static noinline_for_stack int scrub_supers(struct scrub_dev *sdev) if (bytenr + BTRFS_SUPER_INFO_SIZE > device->total_bytes) break; - ret = scrub_pages(sdev, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr, + ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr, BTRFS_EXTENT_FLAG_SUPER, gen, i, NULL, 1); if (ret) return ret; } - wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0); + wait_event(sctx->list_wait, atomic_read(&sctx->in_flight) == 0); return 0; } @@ -2238,7 +2238,7 @@ static noinline_for_stack void scrub_workers_put(struct btrfs_root *root) int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end, struct btrfs_scrub_progress *progress, int readonly) { - struct scrub_dev *sdev; + struct scrub_ctx *sctx; struct btrfs_fs_info *fs_info = root->fs_info; int ret; struct btrfs_device *dev; @@ -2302,41 +2302,41 @@ int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end, scrub_workers_put(root); return -EINPROGRESS; } - sdev = scrub_setup_dev(dev); - if (IS_ERR(sdev)) { + sctx = scrub_setup_ctx(dev); + if (IS_ERR(sctx)) { mutex_unlock(&fs_info->scrub_lock); mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); scrub_workers_put(root); - return PTR_ERR(sdev); + return PTR_ERR(sctx); } - sdev->readonly = readonly; - dev->scrub_device = sdev; + sctx->readonly = readonly; + dev->scrub_device = sctx; atomic_inc(&fs_info->scrubs_running); mutex_unlock(&fs_info->scrub_lock); mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); down_read(&fs_info->scrub_super_lock); - ret = scrub_supers(sdev); + ret = scrub_supers(sctx); up_read(&fs_info->scrub_super_lock); if (!ret) - ret = scrub_enumerate_chunks(sdev, start, end); + ret = scrub_enumerate_chunks(sctx, start, end); - wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0); + wait_event(sctx->list_wait, atomic_read(&sctx->in_flight) == 0); atomic_dec(&fs_info->scrubs_running); wake_up(&fs_info->scrub_pause_wait); - wait_event(sdev->list_wait, atomic_read(&sdev->fixup_cnt) == 0); + wait_event(sctx->list_wait, atomic_read(&sctx->fixup_cnt) == 0); if (progress) - memcpy(progress, &sdev->stat, sizeof(*progress)); + memcpy(progress, &sctx->stat, sizeof(*progress)); mutex_lock(&fs_info->scrub_lock); dev->scrub_device = NULL; mutex_unlock(&fs_info->scrub_lock); - scrub_free_dev(sdev); + scrub_free_ctx(sctx); scrub_workers_put(root); return ret; @@ -2407,15 +2407,15 @@ int btrfs_scrub_cancel(struct btrfs_root *root) int btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev) { struct btrfs_fs_info *fs_info = root->fs_info; - struct scrub_dev *sdev; + struct scrub_ctx *sctx; mutex_lock(&fs_info->scrub_lock); - sdev = dev->scrub_device; - if (!sdev) { + sctx = dev->scrub_device; + if (!sctx) { mutex_unlock(&fs_info->scrub_lock); return -ENOTCONN; } - atomic_inc(&sdev->cancel_req); + atomic_inc(&sctx->cancel_req); while (dev->scrub_device) { mutex_unlock(&fs_info->scrub_lock); wait_event(fs_info->scrub_pause_wait, @@ -2453,15 +2453,15 @@ int btrfs_scrub_progress(struct btrfs_root *root, u64 devid, struct btrfs_scrub_progress *progress) { struct btrfs_device *dev; - struct scrub_dev *sdev = NULL; + struct scrub_ctx *sctx = NULL; mutex_lock(&root->fs_info->fs_devices->device_list_mutex); dev = btrfs_find_device(root, devid, NULL, NULL); if (dev) - sdev = dev->scrub_device; - if (sdev) - memcpy(progress, &sdev->stat, sizeof(*progress)); + sctx = dev->scrub_device; + if (sctx) + memcpy(progress, &sctx->stat, sizeof(*progress)); mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); - return dev ? (sdev ? 0 : -ENOTCONN) : -ENODEV; + return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV; } diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 53c06af..1789cda 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -88,7 +88,7 @@ struct btrfs_device { u8 uuid[BTRFS_UUID_SIZE]; /* per-device scrub information */ - struct scrub_dev *scrub_device; + struct scrub_ctx *scrub_device; struct btrfs_work work; struct rcu_head rcu; -- cgit v0.10.2 From a36cf8b8933e4a7a7f2f2cbc3c70b097e97f7fd1 Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Fri, 2 Nov 2012 13:26:57 +0100 Subject: Btrfs: remove the block device pointer from the scrub context struct The block device is removed from the scrub context state structure. The scrub code as it is used for the device replace procedure reads the source data from whereever it is optimal. The source device might even be gone (disconnected, for instance due to a hardware failure). Or the drive can be so faulty so that the device replace procedure tries to avoid access to the faulty source drive as much as possible, and only if all other mirrors are damaged, as a last resort, the source disk is accessed. The modified scrub code operates as if it would handle the source drive and thereby generates an exact copy of the source disk on the target disk, even if the source disk is not present at all. Therefore the block device pointer to the source disk is removed in the scrub context struct and moved into the lower level scope of scrub_bio, fixup and page structures where the block device context is known. Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 29c8aac..822c08a 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -67,6 +67,7 @@ struct scrub_page { struct scrub_bio { int index; struct scrub_ctx *sctx; + struct btrfs_device *dev; struct bio *bio; int err; u64 logical; @@ -93,7 +94,7 @@ struct scrub_block { struct scrub_ctx { struct scrub_bio *bios[SCRUB_BIOS_PER_CTX]; - struct btrfs_device *dev; + struct btrfs_root *dev_root; int first_free; int curr; atomic_t in_flight; @@ -117,6 +118,7 @@ struct scrub_ctx { struct scrub_fixup_nodatasum { struct scrub_ctx *sctx; + struct btrfs_device *dev; u64 logical; struct btrfs_root *root; struct btrfs_work work; @@ -166,8 +168,8 @@ static void scrub_block_put(struct scrub_block *sblock); static int scrub_add_page_to_bio(struct scrub_ctx *sctx, struct scrub_page *spage); static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len, - u64 physical, u64 flags, u64 gen, int mirror_num, - u8 *csum, int force); + u64 physical, struct btrfs_device *dev, u64 flags, + u64 gen, int mirror_num, u8 *csum, int force); static void scrub_bio_end_io(struct bio *bio, int err); static void scrub_bio_end_io_worker(struct btrfs_work *work); static void scrub_block_complete(struct scrub_block *sblock); @@ -228,9 +230,9 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev) sctx = kzalloc(sizeof(*sctx), GFP_NOFS); if (!sctx) goto nomem; - sctx->dev = dev; sctx->pages_per_bio = pages_per_bio; sctx->curr = -1; + sctx->dev_root = dev->dev_root; for (i = 0; i < SCRUB_BIOS_PER_CTX; ++i) { struct scrub_bio *sbio; @@ -345,8 +347,8 @@ err: static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) { - struct btrfs_device *dev = sblock->sctx->dev; - struct btrfs_fs_info *fs_info = dev->dev_root->fs_info; + struct btrfs_device *dev; + struct btrfs_fs_info *fs_info; struct btrfs_path *path; struct btrfs_key found_key; struct extent_buffer *eb; @@ -361,15 +363,18 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) const int bufsize = 4096; int ret; + WARN_ON(sblock->page_count < 1); + dev = sblock->pagev[0].dev; + fs_info = sblock->sctx->dev_root->fs_info; + path = btrfs_alloc_path(); swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS); swarn.msg_buf = kmalloc(bufsize, GFP_NOFS); - BUG_ON(sblock->page_count < 1); swarn.sector = (sblock->pagev[0].physical) >> 9; swarn.logical = sblock->pagev[0].logical; swarn.errstr = errstr; - swarn.dev = dev; + swarn.dev = NULL; swarn.msg_bufsize = bufsize; swarn.scratch_bufsize = bufsize; @@ -405,6 +410,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) } while (ret != 1); } else { swarn.path = path; + swarn.dev = dev; iterate_extent_inodes(fs_info, found_key.objectid, extent_item_pos, 1, scrub_print_warning_inode, &swarn); @@ -588,7 +594,7 @@ out: printk_ratelimited_in_rcu(KERN_ERR "btrfs: unable to fixup (nodatasum) error at logical %llu on dev %s\n", (unsigned long long)fixup->logical, - rcu_str_deref(sctx->dev->name)); + rcu_str_deref(fixup->dev->name)); } btrfs_free_path(path); @@ -615,6 +621,7 @@ out: static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) { struct scrub_ctx *sctx = sblock_to_check->sctx; + struct btrfs_device *dev; struct btrfs_fs_info *fs_info; u64 length; u64 logical; @@ -633,7 +640,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) DEFAULT_RATELIMIT_BURST); BUG_ON(sblock_to_check->page_count < 1); - fs_info = sctx->dev->dev_root->fs_info; + fs_info = sctx->dev_root->fs_info; length = sblock_to_check->page_count * PAGE_SIZE; logical = sblock_to_check->pagev[0].logical; generation = sblock_to_check->pagev[0].generation; @@ -643,6 +650,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) BTRFS_EXTENT_FLAG_DATA); have_csum = sblock_to_check->pagev[0].have_csum; csum = sblock_to_check->pagev[0].csum; + dev = sblock_to_check->pagev[0].dev; /* * read all mirrors one after the other. This includes to @@ -682,8 +690,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) sctx->stat.read_errors++; sctx->stat.uncorrectable_errors++; spin_unlock(&sctx->stat_lock); - btrfs_dev_stat_inc_and_print(sctx->dev, - BTRFS_DEV_STAT_READ_ERRS); + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS); goto out; } @@ -695,8 +702,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) sctx->stat.read_errors++; sctx->stat.uncorrectable_errors++; spin_unlock(&sctx->stat_lock); - btrfs_dev_stat_inc_and_print(sctx->dev, - BTRFS_DEV_STAT_READ_ERRS); + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS); goto out; } BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS); @@ -710,8 +716,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) sctx->stat.read_errors++; sctx->stat.uncorrectable_errors++; spin_unlock(&sctx->stat_lock); - btrfs_dev_stat_inc_and_print(sctx->dev, - BTRFS_DEV_STAT_READ_ERRS); + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS); goto out; } @@ -738,15 +743,14 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) spin_unlock(&sctx->stat_lock); if (__ratelimit(&_rs)) scrub_print_warning("i/o error", sblock_to_check); - btrfs_dev_stat_inc_and_print(sctx->dev, - BTRFS_DEV_STAT_READ_ERRS); + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS); } else if (sblock_bad->checksum_error) { spin_lock(&sctx->stat_lock); sctx->stat.csum_errors++; spin_unlock(&sctx->stat_lock); if (__ratelimit(&_rs)) scrub_print_warning("checksum error", sblock_to_check); - btrfs_dev_stat_inc_and_print(sctx->dev, + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS); } else if (sblock_bad->header_error) { spin_lock(&sctx->stat_lock); @@ -756,10 +760,10 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) scrub_print_warning("checksum/header error", sblock_to_check); if (sblock_bad->generation_error) - btrfs_dev_stat_inc_and_print(sctx->dev, + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_GENERATION_ERRS); else - btrfs_dev_stat_inc_and_print(sctx->dev, + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS); } @@ -780,6 +784,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) if (!fixup_nodatasum) goto did_not_correct_error; fixup_nodatasum->sctx = sctx; + fixup_nodatasum->dev = dev; fixup_nodatasum->logical = logical; fixup_nodatasum->root = fs_info->extent_root; fixup_nodatasum->mirror_num = failed_mirror_index + 1; @@ -945,7 +950,7 @@ corrected_error: printk_ratelimited_in_rcu(KERN_ERR "btrfs: fixed up error at logical %llu on dev %s\n", (unsigned long long)logical, - rcu_str_deref(sctx->dev->name)); + rcu_str_deref(dev->name)); } } else { did_not_correct_error: @@ -955,7 +960,7 @@ did_not_correct_error: printk_ratelimited_in_rcu(KERN_ERR "btrfs: unable to fixup (regular) error at logical %llu on dev %s\n", (unsigned long long)logical, - rcu_str_deref(sctx->dev->name)); + rcu_str_deref(dev->name)); } out: @@ -1266,7 +1271,7 @@ static int scrub_checksum_data(struct scrub_block *sblock) void *buffer; u32 crc = ~(u32)0; int fail = 0; - struct btrfs_root *root = sctx->dev->dev_root; + struct btrfs_root *root = sctx->dev_root; u64 len; int index; @@ -1306,7 +1311,7 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) { struct scrub_ctx *sctx = sblock->sctx; struct btrfs_header *h; - struct btrfs_root *root = sctx->dev->dev_root; + struct btrfs_root *root = sctx->dev_root; struct btrfs_fs_info *fs_info = root->fs_info; u8 calculated_csum[BTRFS_CSUM_SIZE]; u8 on_disk_csum[BTRFS_CSUM_SIZE]; @@ -1378,7 +1383,7 @@ static int scrub_checksum_super(struct scrub_block *sblock) { struct btrfs_super_block *s; struct scrub_ctx *sctx = sblock->sctx; - struct btrfs_root *root = sctx->dev->dev_root; + struct btrfs_root *root = sctx->dev_root; struct btrfs_fs_info *fs_info = root->fs_info; u8 calculated_csum[BTRFS_CSUM_SIZE]; u8 on_disk_csum[BTRFS_CSUM_SIZE]; @@ -1442,10 +1447,10 @@ static int scrub_checksum_super(struct scrub_block *sblock) ++sctx->stat.super_errors; spin_unlock(&sctx->stat_lock); if (fail_cor) - btrfs_dev_stat_inc_and_print(sctx->dev, + btrfs_dev_stat_inc_and_print(sblock->pagev[0].dev, BTRFS_DEV_STAT_CORRUPTION_ERRS); else - btrfs_dev_stat_inc_and_print(sctx->dev, + btrfs_dev_stat_inc_and_print(sblock->pagev[0].dev, BTRFS_DEV_STAT_GENERATION_ERRS); } @@ -1513,6 +1518,7 @@ again: sbio->physical = spage->physical; sbio->logical = spage->logical; + sbio->dev = spage->dev; bio = sbio->bio; if (!bio) { bio = bio_alloc(GFP_NOFS, sctx->pages_per_bio); @@ -1523,13 +1529,14 @@ again: bio->bi_private = sbio; bio->bi_end_io = scrub_bio_end_io; - bio->bi_bdev = sctx->dev->bdev; - bio->bi_sector = spage->physical >> 9; + bio->bi_bdev = sbio->dev->bdev; + bio->bi_sector = sbio->physical >> 9; sbio->err = 0; } else if (sbio->physical + sbio->page_count * PAGE_SIZE != spage->physical || sbio->logical + sbio->page_count * PAGE_SIZE != - spage->logical) { + spage->logical || + sbio->dev != spage->dev) { scrub_submit(sctx); goto again; } @@ -1556,8 +1563,8 @@ again: } static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len, - u64 physical, u64 flags, u64 gen, int mirror_num, - u8 *csum, int force) + u64 physical, struct btrfs_device *dev, u64 flags, + u64 gen, int mirror_num, u8 *csum, int force) { struct scrub_block *sblock; int index; @@ -1593,7 +1600,7 @@ static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len, return -ENOMEM; } spage->sblock = sblock; - spage->dev = sctx->dev; + spage->dev = dev; spage->flags = flags; spage->generation = gen; spage->logical = logical; @@ -1634,8 +1641,7 @@ static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len, static void scrub_bio_end_io(struct bio *bio, int err) { struct scrub_bio *sbio = bio->bi_private; - struct scrub_ctx *sctx = sbio->sctx; - struct btrfs_fs_info *fs_info = sctx->dev->dev_root->fs_info; + struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info; sbio->err = err; sbio->bio = bio; @@ -1728,7 +1734,8 @@ static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len, /* scrub extent tries to collect up to 64 kB for each bio */ static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len, - u64 physical, u64 flags, u64 gen, int mirror_num) + u64 physical, struct btrfs_device *dev, u64 flags, + u64 gen, int mirror_num) { int ret; u8 csum[BTRFS_CSUM_SIZE]; @@ -1762,7 +1769,7 @@ static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len, if (have_csum == 0) ++sctx->stat.no_csum; } - ret = scrub_pages(sctx, logical, l, physical, flags, gen, + ret = scrub_pages(sctx, logical, l, physical, dev, flags, gen, mirror_num, have_csum ? csum : NULL, 0); if (ret) return ret; @@ -1774,10 +1781,12 @@ static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len, } static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, - struct map_lookup *map, int num, u64 base, u64 length) + struct map_lookup *map, + struct btrfs_device *scrub_dev, + int num, u64 base, u64 length) { struct btrfs_path *path; - struct btrfs_fs_info *fs_info = sctx->dev->dev_root->fs_info; + struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info; struct btrfs_root *root = fs_info->extent_root; struct btrfs_root *csum_root = fs_info->csum_root; struct btrfs_extent_item *extent; @@ -1797,7 +1806,6 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, struct reada_control *reada2; struct btrfs_key key_start; struct btrfs_key key_end; - u64 increment = map->stripe_len; u64 offset; @@ -2006,7 +2014,8 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, ret = scrub_extent(sctx, key.objectid, key.offset, key.objectid - logical + physical, - flags, generation, mirror_num); + scrub_dev, flags, generation, + mirror_num); if (ret) goto out; @@ -2030,11 +2039,13 @@ out: } static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx, - u64 chunk_tree, u64 chunk_objectid, u64 chunk_offset, u64 length, - u64 dev_offset) + struct btrfs_device *scrub_dev, + u64 chunk_tree, u64 chunk_objectid, + u64 chunk_offset, u64 length, + u64 dev_offset) { struct btrfs_mapping_tree *map_tree = - &sctx->dev->dev_root->fs_info->mapping_tree; + &sctx->dev_root->fs_info->mapping_tree; struct map_lookup *map; struct extent_map *em; int i; @@ -2055,9 +2066,10 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx, goto out; for (i = 0; i < map->num_stripes; ++i) { - if (map->stripes[i].dev == sctx->dev && + if (map->stripes[i].dev->bdev == scrub_dev->bdev && map->stripes[i].physical == dev_offset) { - ret = scrub_stripe(sctx, map, i, chunk_offset, length); + ret = scrub_stripe(sctx, map, scrub_dev, i, + chunk_offset, length); if (ret) goto out; } @@ -2069,11 +2081,12 @@ out: } static noinline_for_stack -int scrub_enumerate_chunks(struct scrub_ctx *sctx, u64 start, u64 end) +int scrub_enumerate_chunks(struct scrub_ctx *sctx, + struct btrfs_device *scrub_dev, u64 start, u64 end) { struct btrfs_dev_extent *dev_extent = NULL; struct btrfs_path *path; - struct btrfs_root *root = sctx->dev->dev_root; + struct btrfs_root *root = sctx->dev_root; struct btrfs_fs_info *fs_info = root->fs_info; u64 length; u64 chunk_tree; @@ -2094,11 +2107,10 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, u64 start, u64 end) path->search_commit_root = 1; path->skip_locking = 1; - key.objectid = sctx->dev->devid; + key.objectid = scrub_dev->devid; key.offset = 0ull; key.type = BTRFS_DEV_EXTENT_KEY; - while (1) { ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) @@ -2117,7 +2129,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, u64 start, u64 end) btrfs_item_key_to_cpu(l, &found_key, slot); - if (found_key.objectid != sctx->dev->devid) + if (found_key.objectid != scrub_dev->devid) break; if (btrfs_key_type(&found_key) != BTRFS_DEV_EXTENT_KEY) @@ -2151,7 +2163,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, u64 start, u64 end) ret = -ENOENT; break; } - ret = scrub_chunk(sctx, chunk_tree, chunk_objectid, + ret = scrub_chunk(sctx, scrub_dev, chunk_tree, chunk_objectid, chunk_offset, length, found_key.offset); btrfs_put_block_group(cache); if (ret) @@ -2170,14 +2182,14 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, u64 start, u64 end) return ret < 0 ? ret : 0; } -static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx) +static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx, + struct btrfs_device *scrub_dev) { int i; u64 bytenr; u64 gen; int ret; - struct btrfs_device *device = sctx->dev; - struct btrfs_root *root = device->dev_root; + struct btrfs_root *root = sctx->dev_root; if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) return -EIO; @@ -2186,11 +2198,12 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx) for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { bytenr = btrfs_sb_offset(i); - if (bytenr + BTRFS_SUPER_INFO_SIZE > device->total_bytes) + if (bytenr + BTRFS_SUPER_INFO_SIZE > scrub_dev->total_bytes) break; ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr, - BTRFS_EXTENT_FLAG_SUPER, gen, i, NULL, 1); + scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i, + NULL, 1); if (ret) return ret; } @@ -2317,11 +2330,11 @@ int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end, mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); down_read(&fs_info->scrub_super_lock); - ret = scrub_supers(sctx); + ret = scrub_supers(sctx, dev); up_read(&fs_info->scrub_super_lock); if (!ret) - ret = scrub_enumerate_chunks(sctx, start, end); + ret = scrub_enumerate_chunks(sctx, dev, start, end); wait_event(sctx->list_wait, atomic_read(&sctx->in_flight) == 0); atomic_dec(&fs_info->scrubs_running); -- cgit v0.10.2 From 7a9e9987681198c56ac7f165725ca322d7a196e1 Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Fri, 2 Nov 2012 14:58:04 +0100 Subject: Btrfs: make the scrub page array dynamically allocated With the modified design (in order to support the devive replace procedure) it is necessary to alloc the page array dynamically. The reason is that pages are reused. At first a page is used for the bio to read the data from the filesystem, then the same page is reused for the bio that writes the data to the target disk. Since the read process and the write process are completely decoupled, this requires a new concept of refcounts and get/put functions for pages, and it requires to use newly created pages for each read bio which are freed after the write operation is finished. Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 822c08a..15ac82a 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -46,6 +46,12 @@ struct scrub_ctx; #define SCRUB_PAGES_PER_BIO 16 /* 64k per bio */ #define SCRUB_BIOS_PER_CTX 16 /* 1 MB per device in flight */ + +/* + * the following value times PAGE_SIZE needs to be large enough to match the + * largest node/leaf/sector size that shall be supported. + * Values larger than BTRFS_STRIPE_LEN are not supported. + */ #define SCRUB_MAX_PAGES_PER_BLOCK 16 /* 64k per node/leaf/sector */ struct scrub_page { @@ -56,6 +62,7 @@ struct scrub_page { u64 generation; u64 logical; u64 physical; + atomic_t ref_count; struct { unsigned int mirror_num:8; unsigned int have_csum:1; @@ -79,7 +86,7 @@ struct scrub_bio { }; struct scrub_block { - struct scrub_page pagev[SCRUB_MAX_PAGES_PER_BLOCK]; + struct scrub_page *pagev[SCRUB_MAX_PAGES_PER_BLOCK]; int page_count; atomic_t outstanding_pages; atomic_t ref_count; /* free mem on transition to zero */ @@ -165,6 +172,8 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock); static int scrub_checksum_super(struct scrub_block *sblock); static void scrub_block_get(struct scrub_block *sblock); static void scrub_block_put(struct scrub_block *sblock); +static void scrub_page_get(struct scrub_page *spage); +static void scrub_page_put(struct scrub_page *spage); static int scrub_add_page_to_bio(struct scrub_ctx *sctx, struct scrub_page *spage); static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len, @@ -364,15 +373,15 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) int ret; WARN_ON(sblock->page_count < 1); - dev = sblock->pagev[0].dev; + dev = sblock->pagev[0]->dev; fs_info = sblock->sctx->dev_root->fs_info; path = btrfs_alloc_path(); swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS); swarn.msg_buf = kmalloc(bufsize, GFP_NOFS); - swarn.sector = (sblock->pagev[0].physical) >> 9; - swarn.logical = sblock->pagev[0].logical; + swarn.sector = (sblock->pagev[0]->physical) >> 9; + swarn.logical = sblock->pagev[0]->logical; swarn.errstr = errstr; swarn.dev = NULL; swarn.msg_bufsize = bufsize; @@ -642,15 +651,15 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) BUG_ON(sblock_to_check->page_count < 1); fs_info = sctx->dev_root->fs_info; length = sblock_to_check->page_count * PAGE_SIZE; - logical = sblock_to_check->pagev[0].logical; - generation = sblock_to_check->pagev[0].generation; - BUG_ON(sblock_to_check->pagev[0].mirror_num < 1); - failed_mirror_index = sblock_to_check->pagev[0].mirror_num - 1; - is_metadata = !(sblock_to_check->pagev[0].flags & + logical = sblock_to_check->pagev[0]->logical; + generation = sblock_to_check->pagev[0]->generation; + BUG_ON(sblock_to_check->pagev[0]->mirror_num < 1); + failed_mirror_index = sblock_to_check->pagev[0]->mirror_num - 1; + is_metadata = !(sblock_to_check->pagev[0]->flags & BTRFS_EXTENT_FLAG_DATA); - have_csum = sblock_to_check->pagev[0].have_csum; - csum = sblock_to_check->pagev[0].csum; - dev = sblock_to_check->pagev[0].dev; + have_csum = sblock_to_check->pagev[0]->have_csum; + csum = sblock_to_check->pagev[0]->csum; + dev = sblock_to_check->pagev[0]->dev; /* * read all mirrors one after the other. This includes to @@ -892,7 +901,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) success = 1; for (page_num = 0; page_num < sblock_bad->page_count; page_num++) { - struct scrub_page *page_bad = sblock_bad->pagev + page_num; + struct scrub_page *page_bad = sblock_bad->pagev[page_num]; if (!page_bad->io_error) continue; @@ -903,8 +912,8 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) mirror_index++) { struct scrub_block *sblock_other = sblocks_for_recheck + mirror_index; - struct scrub_page *page_other = sblock_other->pagev + - page_num; + struct scrub_page *page_other = sblock_other->pagev[ + page_num]; if (!page_other->io_error) { ret = scrub_repair_page_from_good_copy( @@ -971,11 +980,11 @@ out: mirror_index; int page_index; - for (page_index = 0; page_index < SCRUB_PAGES_PER_BIO; - page_index++) - if (sblock->pagev[page_index].page) - __free_page( - sblock->pagev[page_index].page); + for (page_index = 0; page_index < sblock->page_count; + page_index++) { + sblock->pagev[page_index]->sblock = NULL; + scrub_page_put(sblock->pagev[page_index]); + } } kfree(sblocks_for_recheck); } @@ -993,7 +1002,7 @@ static int scrub_setup_recheck_block(struct scrub_ctx *sctx, int ret; /* - * note: the three members sctx, ref_count and outstanding_pages + * note: the two members ref_count and outstanding_pages * are not used (and not set) in the blocks that are used for * the recheck procedure */ @@ -1025,21 +1034,27 @@ static int scrub_setup_recheck_block(struct scrub_ctx *sctx, continue; sblock = sblocks_for_recheck + mirror_index; - page = sblock->pagev + page_index; - page->logical = logical; - page->physical = bbio->stripes[mirror_index].physical; - /* for missing devices, dev->bdev is NULL */ - page->dev = bbio->stripes[mirror_index].dev; - page->mirror_num = mirror_index + 1; - page->page = alloc_page(GFP_NOFS); - if (!page->page) { + sblock->sctx = sctx; + page = kzalloc(sizeof(*page), GFP_NOFS); + if (!page) { +leave_nomem: spin_lock(&sctx->stat_lock); sctx->stat.malloc_errors++; spin_unlock(&sctx->stat_lock); kfree(bbio); return -ENOMEM; } + scrub_page_get(page); + sblock->pagev[page_index] = page; + page->logical = logical; + page->physical = bbio->stripes[mirror_index].physical; + /* for missing devices, dev->bdev is NULL */ + page->dev = bbio->stripes[mirror_index].dev; + page->mirror_num = mirror_index + 1; sblock->page_count++; + page->page = alloc_page(GFP_NOFS); + if (!page->page) + goto leave_nomem; } kfree(bbio); length -= sublen; @@ -1071,7 +1086,7 @@ static int scrub_recheck_block(struct btrfs_fs_info *fs_info, for (page_num = 0; page_num < sblock->page_count; page_num++) { struct bio *bio; int ret; - struct scrub_page *page = sblock->pagev + page_num; + struct scrub_page *page = sblock->pagev[page_num]; DECLARE_COMPLETION_ONSTACK(complete); if (page->dev->bdev == NULL) { @@ -1080,7 +1095,7 @@ static int scrub_recheck_block(struct btrfs_fs_info *fs_info, continue; } - BUG_ON(!page->page); + WARN_ON(!page->page); bio = bio_alloc(GFP_NOFS, 1); if (!bio) return -EIO; @@ -1125,14 +1140,14 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, struct btrfs_root *root = fs_info->extent_root; void *mapped_buffer; - BUG_ON(!sblock->pagev[0].page); + WARN_ON(!sblock->pagev[0]->page); if (is_metadata) { struct btrfs_header *h; - mapped_buffer = kmap_atomic(sblock->pagev[0].page); + mapped_buffer = kmap_atomic(sblock->pagev[0]->page); h = (struct btrfs_header *)mapped_buffer; - if (sblock->pagev[0].logical != le64_to_cpu(h->bytenr) || + if (sblock->pagev[0]->logical != le64_to_cpu(h->bytenr) || memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE) || memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid, BTRFS_UUID_SIZE)) { @@ -1146,7 +1161,7 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, if (!have_csum) return; - mapped_buffer = kmap_atomic(sblock->pagev[0].page); + mapped_buffer = kmap_atomic(sblock->pagev[0]->page); } for (page_num = 0;;) { @@ -1162,9 +1177,9 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, page_num++; if (page_num >= sblock->page_count) break; - BUG_ON(!sblock->pagev[page_num].page); + WARN_ON(!sblock->pagev[page_num]->page); - mapped_buffer = kmap_atomic(sblock->pagev[page_num].page); + mapped_buffer = kmap_atomic(sblock->pagev[page_num]->page); } btrfs_csum_final(crc, calculated_csum); @@ -1202,11 +1217,11 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, struct scrub_block *sblock_good, int page_num, int force_write) { - struct scrub_page *page_bad = sblock_bad->pagev + page_num; - struct scrub_page *page_good = sblock_good->pagev + page_num; + struct scrub_page *page_bad = sblock_bad->pagev[page_num]; + struct scrub_page *page_good = sblock_good->pagev[page_num]; - BUG_ON(sblock_bad->pagev[page_num].page == NULL); - BUG_ON(sblock_good->pagev[page_num].page == NULL); + BUG_ON(page_bad->page == NULL); + BUG_ON(page_good->page == NULL); if (force_write || sblock_bad->header_error || sblock_bad->checksum_error || page_bad->io_error) { struct bio *bio; @@ -1247,8 +1262,8 @@ static void scrub_checksum(struct scrub_block *sblock) u64 flags; int ret; - BUG_ON(sblock->page_count < 1); - flags = sblock->pagev[0].flags; + WARN_ON(sblock->page_count < 1); + flags = sblock->pagev[0]->flags; ret = 0; if (flags & BTRFS_EXTENT_FLAG_DATA) ret = scrub_checksum_data(sblock); @@ -1276,11 +1291,11 @@ static int scrub_checksum_data(struct scrub_block *sblock) int index; BUG_ON(sblock->page_count < 1); - if (!sblock->pagev[0].have_csum) + if (!sblock->pagev[0]->have_csum) return 0; - on_disk_csum = sblock->pagev[0].csum; - page = sblock->pagev[0].page; + on_disk_csum = sblock->pagev[0]->csum; + page = sblock->pagev[0]->page; buffer = kmap_atomic(page); len = sctx->sectorsize; @@ -1295,8 +1310,8 @@ static int scrub_checksum_data(struct scrub_block *sblock) break; index++; BUG_ON(index >= sblock->page_count); - BUG_ON(!sblock->pagev[index].page); - page = sblock->pagev[index].page; + BUG_ON(!sblock->pagev[index]->page); + page = sblock->pagev[index]->page; buffer = kmap_atomic(page); } @@ -1326,7 +1341,7 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) int index; BUG_ON(sblock->page_count < 1); - page = sblock->pagev[0].page; + page = sblock->pagev[0]->page; mapped_buffer = kmap_atomic(page); h = (struct btrfs_header *)mapped_buffer; memcpy(on_disk_csum, h->csum, sctx->csum_size); @@ -1337,10 +1352,10 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) * b) the page is already kmapped */ - if (sblock->pagev[0].logical != le64_to_cpu(h->bytenr)) + if (sblock->pagev[0]->logical != le64_to_cpu(h->bytenr)) ++fail; - if (sblock->pagev[0].generation != le64_to_cpu(h->generation)) + if (sblock->pagev[0]->generation != le64_to_cpu(h->generation)) ++fail; if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE)) @@ -1365,8 +1380,8 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) break; index++; BUG_ON(index >= sblock->page_count); - BUG_ON(!sblock->pagev[index].page); - page = sblock->pagev[index].page; + BUG_ON(!sblock->pagev[index]->page); + page = sblock->pagev[index]->page; mapped_buffer = kmap_atomic(page); mapped_size = PAGE_SIZE; p = mapped_buffer; @@ -1398,15 +1413,15 @@ static int scrub_checksum_super(struct scrub_block *sblock) int index; BUG_ON(sblock->page_count < 1); - page = sblock->pagev[0].page; + page = sblock->pagev[0]->page; mapped_buffer = kmap_atomic(page); s = (struct btrfs_super_block *)mapped_buffer; memcpy(on_disk_csum, s->csum, sctx->csum_size); - if (sblock->pagev[0].logical != le64_to_cpu(s->bytenr)) + if (sblock->pagev[0]->logical != le64_to_cpu(s->bytenr)) ++fail_cor; - if (sblock->pagev[0].generation != le64_to_cpu(s->generation)) + if (sblock->pagev[0]->generation != le64_to_cpu(s->generation)) ++fail_gen; if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE)) @@ -1426,8 +1441,8 @@ static int scrub_checksum_super(struct scrub_block *sblock) break; index++; BUG_ON(index >= sblock->page_count); - BUG_ON(!sblock->pagev[index].page); - page = sblock->pagev[index].page; + BUG_ON(!sblock->pagev[index]->page); + page = sblock->pagev[index]->page; mapped_buffer = kmap_atomic(page); mapped_size = PAGE_SIZE; p = mapped_buffer; @@ -1447,10 +1462,10 @@ static int scrub_checksum_super(struct scrub_block *sblock) ++sctx->stat.super_errors; spin_unlock(&sctx->stat_lock); if (fail_cor) - btrfs_dev_stat_inc_and_print(sblock->pagev[0].dev, + btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev, BTRFS_DEV_STAT_CORRUPTION_ERRS); else - btrfs_dev_stat_inc_and_print(sblock->pagev[0].dev, + btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev, BTRFS_DEV_STAT_GENERATION_ERRS); } @@ -1468,12 +1483,25 @@ static void scrub_block_put(struct scrub_block *sblock) int i; for (i = 0; i < sblock->page_count; i++) - if (sblock->pagev[i].page) - __free_page(sblock->pagev[i].page); + scrub_page_put(sblock->pagev[i]); kfree(sblock); } } +static void scrub_page_get(struct scrub_page *spage) +{ + atomic_inc(&spage->ref_count); +} + +static void scrub_page_put(struct scrub_page *spage) +{ + if (atomic_dec_and_test(&spage->ref_count)) { + if (spage->page) + __free_page(spage->page); + kfree(spage); + } +} + static void scrub_submit(struct scrub_ctx *sctx) { struct scrub_bio *sbio; @@ -1577,28 +1605,28 @@ static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len, return -ENOMEM; } - /* one ref inside this function, plus one for each page later on */ + /* one ref inside this function, plus one for each page added to + * a bio later on */ atomic_set(&sblock->ref_count, 1); sblock->sctx = sctx; sblock->no_io_error_seen = 1; for (index = 0; len > 0; index++) { - struct scrub_page *spage = sblock->pagev + index; + struct scrub_page *spage; u64 l = min_t(u64, len, PAGE_SIZE); - BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK); - spage->page = alloc_page(GFP_NOFS); - if (!spage->page) { + spage = kzalloc(sizeof(*spage), GFP_NOFS); + if (!spage) { +leave_nomem: spin_lock(&sctx->stat_lock); sctx->stat.malloc_errors++; spin_unlock(&sctx->stat_lock); - while (index > 0) { - index--; - __free_page(sblock->pagev[index].page); - } - kfree(sblock); + scrub_block_put(sblock); return -ENOMEM; } + BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK); + scrub_page_get(spage); + sblock->pagev[index] = spage; spage->sblock = sblock; spage->dev = dev; spage->flags = flags; @@ -1613,14 +1641,17 @@ static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len, spage->have_csum = 0; } sblock->page_count++; + spage->page = alloc_page(GFP_NOFS); + if (!spage->page) + goto leave_nomem; len -= l; logical += l; physical += l; } - BUG_ON(sblock->page_count == 0); + WARN_ON(sblock->page_count == 0); for (index = 0; index < sblock->page_count; index++) { - struct scrub_page *spage = sblock->pagev + index; + struct scrub_page *spage = sblock->pagev[index]; int ret; ret = scrub_add_page_to_bio(sctx, spage); @@ -2289,6 +2320,22 @@ int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end, return -EINVAL; } + if (fs_info->chunk_root->nodesize > + PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK || + fs_info->chunk_root->sectorsize > + PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK) { + /* + * would exhaust the array bounds of pagev member in + * struct scrub_block + */ + pr_err("btrfs_scrub: size assumption nodesize and sectorsize <= SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails\n", + fs_info->chunk_root->nodesize, + SCRUB_MAX_PAGES_PER_BLOCK, + fs_info->chunk_root->sectorsize, + SCRUB_MAX_PAGES_PER_BLOCK); + return -EINVAL; + } + ret = scrub_workers_get(root); if (ret) return ret; -- cgit v0.10.2 From cb2ced73d8c7a38b5f699e267deadf2a2cfe911c Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Fri, 2 Nov 2012 16:14:21 +0100 Subject: Btrfs: in scrub repair code, optimize the reading of mirrors In case that disk blocks need to be repaired (rewritten), the current code at first (for simplicity reasons) reads all alternate mirrors in the first step, afterwards selects the best one in a second step. This is now changed to read one alternate mirror after the other and to leave the loop early when a perfect mirror is found. Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 15ac82a..7d38f40 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -819,26 +819,8 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) /* * now build and submit the bios for the other mirrors, check - * checksums - */ - for (mirror_index = 0; - mirror_index < BTRFS_MAX_MIRRORS && - sblocks_for_recheck[mirror_index].page_count > 0; - mirror_index++) { - if (mirror_index == failed_mirror_index) - continue; - - /* build and submit the bios, check checksums */ - ret = scrub_recheck_block(fs_info, - sblocks_for_recheck + mirror_index, - is_metadata, have_csum, csum, - generation, sctx->csum_size); - if (ret) - goto did_not_correct_error; - } - - /* - * first try to pick the mirror which is completely without I/O + * checksums. + * First try to pick the mirror which is completely without I/O * errors and also does not have a checksum error. * If one is found, and if a checksum is present, the full block * that is known to contain an error is rewritten. Afterwards @@ -854,10 +836,17 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) mirror_index < BTRFS_MAX_MIRRORS && sblocks_for_recheck[mirror_index].page_count > 0; mirror_index++) { - struct scrub_block *sblock_other = sblocks_for_recheck + - mirror_index; + struct scrub_block *sblock_other; - if (!sblock_other->header_error && + if (mirror_index == failed_mirror_index) + continue; + sblock_other = sblocks_for_recheck + mirror_index; + + /* build and submit the bios, check checksums */ + ret = scrub_recheck_block(fs_info, sblock_other, is_metadata, + have_csum, csum, generation, + sctx->csum_size); + if (!ret && !sblock_other->header_error && !sblock_other->checksum_error && sblock_other->no_io_error_seen) { int force_write = is_metadata || have_csum; -- cgit v0.10.2 From 34f5c8e90b3f002672cd6b4e6e7c5b959fd981ae Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Fri, 2 Nov 2012 16:16:26 +0100 Subject: Btrfs: in scrub repair code, simplify alloc error handling In the scrub repair code, the code is changed to handle memory allocation errors a little bit smarter. The change is to handle it just like a read error. This simplifies the code and removes a couple of lines of code, since the code to handle read errors is there anyway. Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 7d38f40..fcd5bcc 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -151,10 +151,10 @@ static int scrub_setup_recheck_block(struct scrub_ctx *sctx, struct btrfs_mapping_tree *map_tree, u64 length, u64 logical, struct scrub_block *sblock); -static int scrub_recheck_block(struct btrfs_fs_info *fs_info, - struct scrub_block *sblock, int is_metadata, - int have_csum, u8 *csum, u64 generation, - u16 csum_size); +static void scrub_recheck_block(struct btrfs_fs_info *fs_info, + struct scrub_block *sblock, int is_metadata, + int have_csum, u8 *csum, u64 generation, + u16 csum_size); static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, struct scrub_block *sblock, int is_metadata, int have_csum, @@ -718,16 +718,8 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) sblock_bad = sblocks_for_recheck + failed_mirror_index; /* build and submit the bios for the failed mirror, check checksums */ - ret = scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum, - csum, generation, sctx->csum_size); - if (ret) { - spin_lock(&sctx->stat_lock); - sctx->stat.read_errors++; - sctx->stat.uncorrectable_errors++; - spin_unlock(&sctx->stat_lock); - btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS); - goto out; - } + scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum, + csum, generation, sctx->csum_size); if (!sblock_bad->header_error && !sblock_bad->checksum_error && sblock_bad->no_io_error_seen) { @@ -843,10 +835,11 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) sblock_other = sblocks_for_recheck + mirror_index; /* build and submit the bios, check checksums */ - ret = scrub_recheck_block(fs_info, sblock_other, is_metadata, - have_csum, csum, generation, - sctx->csum_size); - if (!ret && !sblock_other->header_error && + scrub_recheck_block(fs_info, sblock_other, is_metadata, + have_csum, csum, generation, + sctx->csum_size); + + if (!sblock_other->header_error && !sblock_other->checksum_error && sblock_other->no_io_error_seen) { int force_write = is_metadata || have_csum; @@ -931,10 +924,10 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) * is verified, but most likely the data comes out * of the page cache. */ - ret = scrub_recheck_block(fs_info, sblock_bad, - is_metadata, have_csum, csum, - generation, sctx->csum_size); - if (!ret && !sblock_bad->header_error && + scrub_recheck_block(fs_info, sblock_bad, + is_metadata, have_csum, csum, + generation, sctx->csum_size); + if (!sblock_bad->header_error && !sblock_bad->checksum_error && sblock_bad->no_io_error_seen) goto corrected_error; @@ -1061,10 +1054,10 @@ leave_nomem: * to take those pages that are not errored from all the mirrors so that * the pages that are errored in the just handled mirror can be repaired. */ -static int scrub_recheck_block(struct btrfs_fs_info *fs_info, - struct scrub_block *sblock, int is_metadata, - int have_csum, u8 *csum, u64 generation, - u16 csum_size) +static void scrub_recheck_block(struct btrfs_fs_info *fs_info, + struct scrub_block *sblock, int is_metadata, + int have_csum, u8 *csum, u64 generation, + u16 csum_size) { int page_num; @@ -1074,7 +1067,6 @@ static int scrub_recheck_block(struct btrfs_fs_info *fs_info, for (page_num = 0; page_num < sblock->page_count; page_num++) { struct bio *bio; - int ret; struct scrub_page *page = sblock->pagev[page_num]; DECLARE_COMPLETION_ONSTACK(complete); @@ -1086,18 +1078,17 @@ static int scrub_recheck_block(struct btrfs_fs_info *fs_info, WARN_ON(!page->page); bio = bio_alloc(GFP_NOFS, 1); - if (!bio) - return -EIO; + if (!bio) { + page->io_error = 1; + sblock->no_io_error_seen = 0; + continue; + } bio->bi_bdev = page->dev->bdev; bio->bi_sector = page->physical >> 9; bio->bi_end_io = scrub_complete_bio_end_io; bio->bi_private = &complete; - ret = bio_add_page(bio, page->page, PAGE_SIZE, 0); - if (PAGE_SIZE != ret) { - bio_put(bio); - return -EIO; - } + bio_add_page(bio, page->page, PAGE_SIZE, 0); btrfsic_submit_bio(READ, bio); /* this will also unplug the queue */ @@ -1114,7 +1105,7 @@ static int scrub_recheck_block(struct btrfs_fs_info *fs_info, have_csum, csum, generation, csum_size); - return 0; + return; } static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, -- cgit v0.10.2 From b6bfebc13218f1fc1502041a810919d3a81b8b4e Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Fri, 2 Nov 2012 16:44:58 +0100 Subject: Btrfs: cleanup scrub bio and worker wait code Just move some code into functions to make everything more readable. Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index fcd5bcc..a67b1a1 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2011 STRATO. All rights reserved. + * Copyright (C) 2011, 2012 STRATO. All rights reserved. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public @@ -104,8 +104,8 @@ struct scrub_ctx { struct btrfs_root *dev_root; int first_free; int curr; - atomic_t in_flight; - atomic_t fixup_cnt; + atomic_t bios_in_flight; + atomic_t workers_pending; spinlock_t list_lock; wait_queue_head_t list_wait; u16 csum_size; @@ -146,6 +146,10 @@ struct scrub_warning { }; +static void scrub_pending_bio_inc(struct scrub_ctx *sctx); +static void scrub_pending_bio_dec(struct scrub_ctx *sctx); +static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx); +static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx); static int scrub_handle_errored_block(struct scrub_block *sblock_to_check); static int scrub_setup_recheck_block(struct scrub_ctx *sctx, struct btrfs_mapping_tree *map_tree, @@ -184,6 +188,59 @@ static void scrub_bio_end_io_worker(struct btrfs_work *work); static void scrub_block_complete(struct scrub_block *sblock); +static void scrub_pending_bio_inc(struct scrub_ctx *sctx) +{ + atomic_inc(&sctx->bios_in_flight); +} + +static void scrub_pending_bio_dec(struct scrub_ctx *sctx) +{ + atomic_dec(&sctx->bios_in_flight); + wake_up(&sctx->list_wait); +} + +/* + * used for workers that require transaction commits (i.e., for the + * NOCOW case) + */ +static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx) +{ + struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info; + + /* + * increment scrubs_running to prevent cancel requests from + * completing as long as a worker is running. we must also + * increment scrubs_paused to prevent deadlocking on pause + * requests used for transactions commits (as the worker uses a + * transaction context). it is safe to regard the worker + * as paused for all matters practical. effectively, we only + * avoid cancellation requests from completing. + */ + mutex_lock(&fs_info->scrub_lock); + atomic_inc(&fs_info->scrubs_running); + atomic_inc(&fs_info->scrubs_paused); + mutex_unlock(&fs_info->scrub_lock); + atomic_inc(&sctx->workers_pending); +} + +/* used for workers that require transaction commits */ +static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx) +{ + struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info; + + /* + * see scrub_pending_trans_workers_inc() why we're pretending + * to be paused in the scrub counters + */ + mutex_lock(&fs_info->scrub_lock); + atomic_dec(&fs_info->scrubs_running); + atomic_dec(&fs_info->scrubs_paused); + mutex_unlock(&fs_info->scrub_lock); + atomic_dec(&sctx->workers_pending); + wake_up(&fs_info->scrub_pause_wait); + wake_up(&sctx->list_wait); +} + static void scrub_free_csums(struct scrub_ctx *sctx) { while (!list_empty(&sctx->csum_list)) { @@ -264,8 +321,8 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev) sctx->nodesize = dev->dev_root->nodesize; sctx->leafsize = dev->dev_root->leafsize; sctx->sectorsize = dev->dev_root->sectorsize; - atomic_set(&sctx->in_flight, 0); - atomic_set(&sctx->fixup_cnt, 0); + atomic_set(&sctx->bios_in_flight, 0); + atomic_set(&sctx->workers_pending, 0); atomic_set(&sctx->cancel_req, 0); sctx->csum_size = btrfs_super_csum_size(fs_info->super_copy); INIT_LIST_HEAD(&sctx->csum_list); @@ -609,14 +666,7 @@ out: btrfs_free_path(path); kfree(fixup); - /* see caller why we're pretending to be paused in the scrub counters */ - mutex_lock(&fs_info->scrub_lock); - atomic_dec(&fs_info->scrubs_running); - atomic_dec(&fs_info->scrubs_paused); - mutex_unlock(&fs_info->scrub_lock); - atomic_dec(&sctx->fixup_cnt); - wake_up(&fs_info->scrub_pause_wait); - wake_up(&sctx->list_wait); + scrub_pending_trans_workers_dec(sctx); } /* @@ -789,20 +839,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) fixup_nodatasum->logical = logical; fixup_nodatasum->root = fs_info->extent_root; fixup_nodatasum->mirror_num = failed_mirror_index + 1; - /* - * increment scrubs_running to prevent cancel requests from - * completing as long as a fixup worker is running. we must also - * increment scrubs_paused to prevent deadlocking on pause - * requests used for transactions commits (as the worker uses a - * transaction context). it is safe to regard the fixup worker - * as paused for all matters practical. effectively, we only - * avoid cancellation requests from completing. - */ - mutex_lock(&fs_info->scrub_lock); - atomic_inc(&fs_info->scrubs_running); - atomic_inc(&fs_info->scrubs_paused); - mutex_unlock(&fs_info->scrub_lock); - atomic_inc(&sctx->fixup_cnt); + scrub_pending_trans_workers_inc(sctx); fixup_nodatasum->work.func = scrub_fixup_nodatasum; btrfs_queue_worker(&fs_info->scrub_workers, &fixup_nodatasum->work); @@ -1491,7 +1528,7 @@ static void scrub_submit(struct scrub_ctx *sctx) sbio = sctx->bios[sctx->curr]; sctx->curr = -1; - atomic_inc(&sctx->in_flight); + scrub_pending_bio_inc(sctx); btrfsic_submit_bio(READ, sbio->bio); } @@ -1692,8 +1729,7 @@ static void scrub_bio_end_io_worker(struct btrfs_work *work) sbio->next_free = sctx->first_free; sctx->first_free = sbio->index; spin_unlock(&sctx->list_lock); - atomic_dec(&sctx->in_flight); - wake_up(&sctx->list_wait); + scrub_pending_bio_dec(sctx); } static void scrub_block_complete(struct scrub_block *sblock) @@ -1863,7 +1899,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, logical = base + offset; wait_event(sctx->list_wait, - atomic_read(&sctx->in_flight) == 0); + atomic_read(&sctx->bios_in_flight) == 0); atomic_inc(&fs_info->scrubs_paused); wake_up(&fs_info->scrub_pause_wait); @@ -1928,7 +1964,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, /* push queued extents */ scrub_submit(sctx); wait_event(sctx->list_wait, - atomic_read(&sctx->in_flight) == 0); + atomic_read(&sctx->bios_in_flight) == 0); atomic_inc(&fs_info->scrubs_paused); wake_up(&fs_info->scrub_pause_wait); mutex_lock(&fs_info->scrub_lock); @@ -2218,7 +2254,7 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx, if (ret) return ret; } - wait_event(sctx->list_wait, atomic_read(&sctx->in_flight) == 0); + wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0); return 0; } @@ -2363,11 +2399,11 @@ int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end, if (!ret) ret = scrub_enumerate_chunks(sctx, dev, start, end); - wait_event(sctx->list_wait, atomic_read(&sctx->in_flight) == 0); + wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0); atomic_dec(&fs_info->scrubs_running); wake_up(&fs_info->scrub_pause_wait); - wait_event(sctx->list_wait, atomic_read(&sctx->fixup_cnt) == 0); + wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0); if (progress) memcpy(progress, &sctx->stat, sizeof(*progress)); -- cgit v0.10.2 From beaf8ab3afef27ed81255d9808b67f6d390ca06f Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Mon, 12 Nov 2012 14:03:45 +0100 Subject: Btrfs: move some common code into a subfunction Some code to open block devices, to read the superblock and to handle errors was repeated multiple times in 3 places, and the following patch makes use of it as well. This code is now moved into a subfunction. Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 92e586b..4def1fd 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -108,6 +108,44 @@ static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid) return NULL; } +static int +btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder, + int flush, struct block_device **bdev, + struct buffer_head **bh) +{ + int ret; + + *bdev = blkdev_get_by_path(device_path, flags, holder); + + if (IS_ERR(*bdev)) { + ret = PTR_ERR(*bdev); + printk(KERN_INFO "btrfs: open %s failed\n", device_path); + goto error; + } + + if (flush) + filemap_write_and_wait((*bdev)->bd_inode->i_mapping); + ret = set_blocksize(*bdev, 4096); + if (ret) { + blkdev_put(*bdev, flags); + goto error; + } + invalidate_bdev(*bdev); + *bh = btrfs_read_dev_super(*bdev); + if (!*bh) { + ret = -EINVAL; + blkdev_put(*bdev, flags); + goto error; + } + + return 0; + +error: + *bdev = NULL; + *bh = NULL; + return ret; +} + static void requeue_list(struct btrfs_pending_bios *pending_bios, struct bio *head, struct bio *tail) { @@ -637,18 +675,10 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, if (!device->name) continue; - bdev = blkdev_get_by_path(device->name->str, flags, holder); - if (IS_ERR(bdev)) { - printk(KERN_INFO "btrfs: open %s failed\n", device->name->str); - goto error; - } - filemap_write_and_wait(bdev->bd_inode->i_mapping); - invalidate_bdev(bdev); - set_blocksize(bdev, 4096); - - bh = btrfs_read_dev_super(bdev); - if (!bh) - goto error_close; + ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1, + &bdev, &bh); + if (ret) + continue; disk_super = (struct btrfs_super_block *)bh->b_data; devid = btrfs_stack_device_id(&disk_super->dev_item); @@ -697,9 +727,7 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, error_brelse: brelse(bh); -error_close: blkdev_put(bdev, flags); -error: continue; } if (fs_devices->open_devices == 0) { @@ -744,22 +772,10 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, u64 total_devices; flags |= FMODE_EXCL; - bdev = blkdev_get_by_path(path, flags, holder); - - if (IS_ERR(bdev)) { - ret = PTR_ERR(bdev); - goto error; - } - mutex_lock(&uuid_mutex); - ret = set_blocksize(bdev, 4096); + ret = btrfs_get_bdev_and_sb(path, flags, holder, 0, &bdev, &bh); if (ret) - goto error_close; - bh = btrfs_read_dev_super(bdev); - if (!bh) { - ret = -EINVAL; - goto error_close; - } + goto error; disk_super = (struct btrfs_super_block *)bh->b_data; devid = btrfs_stack_device_id(&disk_super->dev_item); transid = btrfs_super_generation(disk_super); @@ -777,10 +793,9 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, if (!ret && fs_devices_ret) (*fs_devices_ret)->total_devices = total_devices; brelse(bh); -error_close: - mutex_unlock(&uuid_mutex); blkdev_put(bdev, flags); error: + mutex_unlock(&uuid_mutex); return ret; } @@ -1374,20 +1389,12 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) goto out; } } else { - bdev = blkdev_get_by_path(device_path, FMODE_READ | FMODE_EXCL, - root->fs_info->bdev_holder); - if (IS_ERR(bdev)) { - ret = PTR_ERR(bdev); + ret = btrfs_get_bdev_and_sb(device_path, + FMODE_READ | FMODE_EXCL, + root->fs_info->bdev_holder, 0, + &bdev, &bh); + if (ret) goto out; - } - - set_blocksize(bdev, 4096); - invalidate_bdev(bdev); - bh = btrfs_read_dev_super(bdev); - if (!bh) { - ret = -EINVAL; - goto error_close; - } disk_super = (struct btrfs_super_block *)bh->b_data; devid = btrfs_stack_device_id(&disk_super->dev_item); dev_uuid = disk_super->dev_item.uuid; -- cgit v0.10.2 From 7ba15b7d211846c187a7c5dc75a5964476f8bc89 Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Mon, 5 Nov 2012 14:42:30 +0100 Subject: Btrfs: add two more find_device() methods The new function btrfs_find_device_missing_or_by_path() will be used for the device replace procedure. This function itself calls the second new function btrfs_find_device_by_path(). Unfortunately, it is not possible to currently make the rest of the code use these functions as well, since all functions that look similar at first view are all a little bit different in what they are doing. But in the future, new code could benefit from these two new functions, and currently, device replace uses them. Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 4def1fd..1483041 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1522,6 +1522,65 @@ error_undo: goto error_brelse; } +int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path, + struct btrfs_device **device) +{ + int ret = 0; + struct btrfs_super_block *disk_super; + u64 devid; + u8 *dev_uuid; + struct block_device *bdev; + struct buffer_head *bh; + + *device = NULL; + ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ, + root->fs_info->bdev_holder, 0, &bdev, &bh); + if (ret) + return ret; + disk_super = (struct btrfs_super_block *)bh->b_data; + devid = btrfs_stack_device_id(&disk_super->dev_item); + dev_uuid = disk_super->dev_item.uuid; + *device = btrfs_find_device(root, devid, dev_uuid, + disk_super->fsid); + brelse(bh); + if (!*device) + ret = -ENOENT; + blkdev_put(bdev, FMODE_READ); + return ret; +} + +int btrfs_find_device_missing_or_by_path(struct btrfs_root *root, + char *device_path, + struct btrfs_device **device) +{ + *device = NULL; + if (strcmp(device_path, "missing") == 0) { + struct list_head *devices; + struct btrfs_device *tmp; + + devices = &root->fs_info->fs_devices->devices; + /* + * It is safe to read the devices since the volume_mutex + * is held by the caller. + */ + list_for_each_entry(tmp, devices, dev_list) { + if (tmp->in_fs_metadata && !tmp->bdev) { + *device = tmp; + break; + } + } + + if (!*device) { + pr_err("btrfs: no missing device found\n"); + return -ENOENT; + } + + return 0; + } else { + return btrfs_find_device_by_path(root, device_path, device); + } +} + /* * does all the dirty work required for changing file system's UUID. */ diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 1789cda..657bb12 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -268,6 +268,11 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, struct btrfs_fs_devices **fs_devices_ret); int btrfs_close_devices(struct btrfs_fs_devices *fs_devices); void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices); +int btrfs_find_device_missing_or_by_path(struct btrfs_root *root, + char *device_path, + struct btrfs_device **device); +int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path, + struct btrfs_device **device); int btrfs_add_device(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_device *device); -- cgit v0.10.2 From 5d9640517d92d05843711ea982cbeff42d7ed32d Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Mon, 5 Nov 2012 14:59:07 +0100 Subject: Btrfs: Pass fs_info to btrfs_num_copies() instead of mapping_tree This is required for the device replace procedure in a later step. Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 5a3e45d..58dfac1 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -723,7 +723,7 @@ static int btrfsic_process_superblock(struct btrfsic_state *state, } num_copies = - btrfs_num_copies(&state->root->fs_info->mapping_tree, + btrfs_num_copies(state->root->fs_info, next_bytenr, state->metablock_size); if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", @@ -903,7 +903,7 @@ static int btrfsic_process_superblock_dev_mirror( } num_copies = - btrfs_num_copies(&state->root->fs_info->mapping_tree, + btrfs_num_copies(state->root->fs_info, next_bytenr, state->metablock_size); if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", @@ -1287,7 +1287,7 @@ static int btrfsic_create_link_to_next_block( *next_blockp = NULL; if (0 == *num_copiesp) { *num_copiesp = - btrfs_num_copies(&state->root->fs_info->mapping_tree, + btrfs_num_copies(state->root->fs_info, next_bytenr, state->metablock_size); if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", @@ -1489,7 +1489,7 @@ static int btrfsic_handle_extent_data( chunk_len = num_bytes; num_copies = - btrfs_num_copies(&state->root->fs_info->mapping_tree, + btrfs_num_copies(state->root->fs_info, next_bytenr, state->datablock_size); if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", @@ -2463,7 +2463,7 @@ static int btrfsic_process_written_superblock( } num_copies = - btrfs_num_copies(&state->root->fs_info->mapping_tree, + btrfs_num_copies(state->root->fs_info, next_bytenr, BTRFS_SUPER_INFO_SIZE); if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", @@ -2960,7 +2960,7 @@ static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state, struct btrfsic_block_data_ctx block_ctx; int match = 0; - num_copies = btrfs_num_copies(&state->root->fs_info->mapping_tree, + num_copies = btrfs_num_copies(state->root->fs_info, bytenr, state->metablock_size); for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index ff5d259..ba2b931 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -387,7 +387,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root, if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags)) break; - num_copies = btrfs_num_copies(&root->fs_info->mapping_tree, + num_copies = btrfs_num_copies(root->fs_info, eb->start, eb->len); if (num_copies == 1) break; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 3c062c8..e0b7138 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2044,10 +2044,10 @@ static int clean_io_failure(u64 start, struct page *page) spin_unlock(&BTRFS_I(inode)->io_tree.lock); if (state && state->start == failrec->start) { - map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree; - num_copies = btrfs_num_copies(map_tree, failrec->logical, - failrec->len); + num_copies = btrfs_num_copies(BTRFS_I(inode)->root->fs_info, + failrec->logical, failrec->len); if (num_copies > 1) { + map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree; ret = repair_io_failure(map_tree, start, failrec->len, failrec->logical, page, failrec->failed_mirror); @@ -2157,9 +2157,8 @@ static int bio_readpage_error(struct bio *failed_bio, struct page *page, * clean_io_failure() clean all those errors at once. */ } - num_copies = btrfs_num_copies( - &BTRFS_I(inode)->root->fs_info->mapping_tree, - failrec->logical, failrec->len); + num_copies = btrfs_num_copies(BTRFS_I(inode)->root->fs_info, + failrec->logical, failrec->len); if (num_copies == 1) { /* * we only have a single copy of the data, so don't bother with diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 1483041..5612767 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3785,8 +3785,9 @@ void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree) } } -int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len) +int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) { + struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; struct extent_map *em; struct map_lookup *map; struct extent_map_tree *em_tree = &map_tree->map_tree; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 657bb12..35ea442 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -278,7 +278,7 @@ int btrfs_add_device(struct btrfs_trans_handle *trans, struct btrfs_device *device); int btrfs_rm_device(struct btrfs_root *root, char *device_path); void btrfs_cleanup_fs_uuids(void); -int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len); +int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len); int btrfs_grow_device(struct btrfs_trans_handle *trans, struct btrfs_device *device, u64 new_size); struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid, -- cgit v0.10.2 From 3ec706c831d4c96905c287013c8228b21619a1d9 Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Mon, 5 Nov 2012 15:46:42 +0100 Subject: Btrfs: pass fs_info to btrfs_map_block() instead of mapping_tree This is required for the device replace procedure in a later step. Two calling functions also had to be changed to have the fs_info pointer: repair_io_failure() and scrub_setup_recheck_block(). Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 58dfac1..8f9abed 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -1582,7 +1582,7 @@ static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len, struct btrfs_device *device; length = len; - ret = btrfs_map_block(&state->root->fs_info->mapping_tree, READ, + ret = btrfs_map_block(state->root->fs_info, READ, bytenr, &length, &multi, mirror_num); device = multi->stripes[0].dev; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index f8a358a..b4d438f 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1818,7 +1818,7 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, /* Tell the block device(s) that the sectors can be discarded */ - ret = btrfs_map_block(&root->fs_info->mapping_tree, REQ_DISCARD, + ret = btrfs_map_block(root->fs_info, REQ_DISCARD, bytenr, &num_bytes, &bbio, 0); /* Error condition is -ENOMEM */ if (!ret) { diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index e0b7138..62ec6e4 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1917,12 +1917,12 @@ static void repair_io_failure_callback(struct bio *bio, int err) * the standard behavior is to write all copies in a raid setup. here we only * want to write the one bad copy. so we do the mapping for ourselves and issue * submit_bio directly. - * to avoid any synchonization issues, wait for the data after writing, which + * to avoid any synchronization issues, wait for the data after writing, which * actually prevents the read that triggered the error from finishing. * currently, there can be no more than two copies of every data bit. thus, * exactly one rewrite is required. */ -int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start, +int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start, u64 length, u64 logical, struct page *page, int mirror_num) { @@ -1944,7 +1944,7 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start, bio->bi_size = 0; map_length = length; - ret = btrfs_map_block(map_tree, WRITE, logical, + ret = btrfs_map_block(fs_info, WRITE, logical, &map_length, &bbio, mirror_num); if (ret) { bio_put(bio); @@ -1982,14 +1982,13 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start, int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb, int mirror_num) { - struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; u64 start = eb->start; unsigned long i, num_pages = num_extent_pages(eb->start, eb->len); int ret = 0; for (i = 0; i < num_pages; i++) { struct page *p = extent_buffer_page(eb, i); - ret = repair_io_failure(map_tree, start, PAGE_CACHE_SIZE, + ret = repair_io_failure(root->fs_info, start, PAGE_CACHE_SIZE, start, p, mirror_num); if (ret) break; @@ -2008,7 +2007,7 @@ static int clean_io_failure(u64 start, struct page *page) u64 private; u64 private_failure; struct io_failure_record *failrec; - struct btrfs_mapping_tree *map_tree; + struct btrfs_fs_info *fs_info; struct extent_state *state; int num_copies; int did_repair = 0; @@ -2044,11 +2043,11 @@ static int clean_io_failure(u64 start, struct page *page) spin_unlock(&BTRFS_I(inode)->io_tree.lock); if (state && state->start == failrec->start) { - num_copies = btrfs_num_copies(BTRFS_I(inode)->root->fs_info, - failrec->logical, failrec->len); + fs_info = BTRFS_I(inode)->root->fs_info; + num_copies = btrfs_num_copies(fs_info, failrec->logical, + failrec->len); if (num_copies > 1) { - map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree; - ret = repair_io_failure(map_tree, start, failrec->len, + ret = repair_io_failure(fs_info, start, failrec->len, failrec->logical, page, failrec->failed_mirror); did_repair = !ret; diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 711d12b..2eacfab 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -337,9 +337,9 @@ struct bio * btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, gfp_t gfp_flags); -struct btrfs_mapping_tree; +struct btrfs_fs_info; -int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start, +int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start, u64 length, u64 logical, struct page *page, int mirror_num); int end_extent_writepage(struct page *page, int err, u64 start, u64 end); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index aabf747..5d1675a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1549,7 +1549,6 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset, unsigned long bio_flags) { struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; - struct btrfs_mapping_tree *map_tree; u64 logical = (u64)bio->bi_sector << 9; u64 length = 0; u64 map_length; @@ -1559,11 +1558,10 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset, return 0; length = bio->bi_size; - map_tree = &root->fs_info->mapping_tree; map_length = length; - ret = btrfs_map_block(map_tree, READ, logical, + ret = btrfs_map_block(root->fs_info, READ, logical, &map_length, NULL, 0); - /* Will always return 0 or 1 with map_multi == NULL */ + /* Will always return 0 with map_multi == NULL */ BUG_ON(ret < 0); if (map_length < length + size) return 1; @@ -6364,7 +6362,6 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, { struct inode *inode = dip->inode; struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; struct bio *bio; struct bio *orig_bio = dip->orig_bio; struct bio_vec *bvec = orig_bio->bi_io_vec; @@ -6377,7 +6374,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, int async_submit = 0; map_length = orig_bio->bi_size; - ret = btrfs_map_block(map_tree, READ, start_sector << 9, + ret = btrfs_map_block(root->fs_info, READ, start_sector << 9, &map_length, NULL, 0); if (ret) { bio_put(orig_bio); @@ -6431,7 +6428,8 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, bio->bi_end_io = btrfs_end_dio_bio; map_length = orig_bio->bi_size; - ret = btrfs_map_block(map_tree, READ, start_sector << 9, + ret = btrfs_map_block(root->fs_info, READ, + start_sector << 9, &map_length, NULL, 0); if (ret) { bio_put(bio); diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index a955669..0ddc565 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -323,7 +323,6 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, struct reada_extent *re = NULL; struct reada_extent *re_exist = NULL; struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; struct btrfs_bio *bbio = NULL; struct btrfs_device *dev; struct btrfs_device *prev_dev; @@ -358,7 +357,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, * map block */ length = blocksize; - ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length, &bbio, 0); + ret = btrfs_map_block(fs_info, REQ_WRITE, logical, &length, &bbio, 0); if (ret || !bbio || length < blocksize) goto error; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index a67b1a1..894bb27 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -152,7 +152,7 @@ static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx); static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx); static int scrub_handle_errored_block(struct scrub_block *sblock_to_check); static int scrub_setup_recheck_block(struct scrub_ctx *sctx, - struct btrfs_mapping_tree *map_tree, + struct btrfs_fs_info *fs_info, u64 length, u64 logical, struct scrub_block *sblock); static void scrub_recheck_block(struct btrfs_fs_info *fs_info, @@ -523,7 +523,7 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx) } if (PageUptodate(page)) { - struct btrfs_mapping_tree *map_tree; + struct btrfs_fs_info *fs_info; if (PageDirty(page)) { /* * we need to write the data to the defect sector. the @@ -544,8 +544,8 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx) ret = -EIO; goto out; } - map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree; - ret = repair_io_failure(map_tree, offset, PAGE_SIZE, + fs_info = BTRFS_I(inode)->root->fs_info; + ret = repair_io_failure(fs_info, offset, PAGE_SIZE, fixup->logical, page, fixup->mirror_num); unlock_page(page); @@ -754,7 +754,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) } /* setup the context, map the logical blocks and alloc the pages */ - ret = scrub_setup_recheck_block(sctx, &fs_info->mapping_tree, length, + ret = scrub_setup_recheck_block(sctx, fs_info, length, logical, sblocks_for_recheck); if (ret) { spin_lock(&sctx->stat_lock); @@ -1012,7 +1012,7 @@ out: } static int scrub_setup_recheck_block(struct scrub_ctx *sctx, - struct btrfs_mapping_tree *map_tree, + struct btrfs_fs_info *fs_info, u64 length, u64 logical, struct scrub_block *sblocks_for_recheck) { @@ -1036,7 +1036,7 @@ static int scrub_setup_recheck_block(struct scrub_ctx *sctx, * with a length of PAGE_SIZE, each returned stripe * represents one mirror */ - ret = btrfs_map_block(map_tree, WRITE, logical, &mapped_length, + ret = btrfs_map_block(fs_info, WRITE, logical, &mapped_length, &bbio, 0); if (ret || !bbio || mapped_length < sublen) { kfree(bbio); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 5612767..96bb2e4 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3826,13 +3826,14 @@ static int find_live_mirror(struct map_lookup *map, int first, int num, return optimal; } -static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, +static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, u64 logical, u64 *length, struct btrfs_bio **bbio_ret, int mirror_num) { struct extent_map *em; struct map_lookup *map; + struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; struct extent_map_tree *em_tree = &map_tree->map_tree; u64 offset; u64 stripe_offset; @@ -4061,11 +4062,11 @@ out: return ret; } -int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, +int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, u64 logical, u64 *length, struct btrfs_bio **bbio_ret, int mirror_num) { - return __btrfs_map_block(map_tree, rw, logical, length, bbio_ret, + return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret, mirror_num); } @@ -4394,7 +4395,6 @@ static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical) int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, int mirror_num, int async_submit) { - struct btrfs_mapping_tree *map_tree; struct btrfs_device *dev; struct bio *first_bio = bio; u64 logical = (u64)bio->bi_sector << 9; @@ -4406,10 +4406,9 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, struct btrfs_bio *bbio = NULL; length = bio->bi_size; - map_tree = &root->fs_info->mapping_tree; map_length = length; - ret = btrfs_map_block(map_tree, rw, logical, &map_length, &bbio, + ret = btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio, mirror_num); if (ret) /* -ENOMEM */ return ret; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 35ea442..ad5566d 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -248,7 +248,7 @@ int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, struct btrfs_device *device, u64 chunk_tree, u64 chunk_objectid, u64 chunk_offset, u64 start, u64 num_bytes); -int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, +int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, u64 logical, u64 *length, struct btrfs_bio **bbio_ret, int mirror_num); int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, -- cgit v0.10.2 From a8a6dab77997a371f1925a4001021eea3ee5cb88 Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Mon, 5 Nov 2012 15:50:14 +0100 Subject: Btrfs: add btrfs_scratch_superblock() function This new function is used by the device replace procedure in a later patch. Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 96bb2e4..6cd8a32 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -5106,3 +5106,21 @@ int btrfs_get_dev_stats(struct btrfs_root *root, stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX; return 0; } + +int btrfs_scratch_superblock(struct btrfs_device *device) +{ + struct buffer_head *bh; + struct btrfs_super_block *disk_super; + + bh = btrfs_read_dev_super(device->bdev); + if (!bh) + return -EINVAL; + disk_super = (struct btrfs_super_block *)bh->b_data; + + memset(&disk_super->magic, 0, sizeof(disk_super->magic)); + set_buffer_dirty(bh); + sync_dirty_buffer(bh); + brelse(bh); + + return 0; +} diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index ad5566d..7eaaf4e 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -301,6 +301,7 @@ int btrfs_get_dev_stats(struct btrfs_root *root, int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info); int btrfs_run_dev_stats(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); +int btrfs_scratch_superblock(struct btrfs_device *device); static inline void btrfs_dev_stat_inc(struct btrfs_device *dev, int index) -- cgit v0.10.2 From aa1b8cd409f05e1489ec77ff219eff6ed4b801b8 Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Mon, 5 Nov 2012 17:03:39 +0100 Subject: Btrfs: pass fs_info instead of root A small number of functions that are used in a device replace procedure when the operation is resumed at mount time are unable to pass the same root pointer that would be used in the regular (ioctl) context. And since the root pointer is not required, only the fs_info is, the root pointer argument is replaced with the fs_info pointer argument. Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index f9a0786..f8bb62c 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3540,15 +3540,16 @@ int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, struct btrfs_pending_snapshot *pending); /* scrub.c */ -int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end, - struct btrfs_scrub_progress *progress, int readonly); +int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, + u64 end, struct btrfs_scrub_progress *progress, + int readonly); void btrfs_scrub_pause(struct btrfs_root *root); void btrfs_scrub_pause_super(struct btrfs_root *root); void btrfs_scrub_continue(struct btrfs_root *root); void btrfs_scrub_continue_super(struct btrfs_root *root); -int __btrfs_scrub_cancel(struct btrfs_fs_info *info); -int btrfs_scrub_cancel(struct btrfs_root *root); -int btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev); +int btrfs_scrub_cancel(struct btrfs_fs_info *info); +int btrfs_scrub_cancel_dev(struct btrfs_fs_info *info, + struct btrfs_device *dev); int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid); int btrfs_scrub_progress(struct btrfs_root *root, u64 devid, struct btrfs_scrub_progress *progress); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index ba2b931..42a8024 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3283,9 +3283,9 @@ int close_ctree(struct btrfs_root *root) smp_mb(); /* pause restriper - we want to resume on mount */ - btrfs_pause_balance(root->fs_info); + btrfs_pause_balance(fs_info); - btrfs_scrub_cancel(root); + btrfs_scrub_cancel(fs_info); /* wait for any defraggers to finish */ wait_event(fs_info->transaction_wait, diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index e262cd8..b40b827 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1343,7 +1343,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, printk(KERN_INFO "btrfs: resizing devid %llu\n", (unsigned long long)devid); } - device = btrfs_find_device(root, devid, NULL, NULL); + device = btrfs_find_device(root->fs_info, devid, NULL, NULL); if (!device) { printk(KERN_INFO "btrfs: resizer unable to find device %llu\n", (unsigned long long)devid); @@ -2332,7 +2332,7 @@ static long btrfs_ioctl_dev_info(struct btrfs_root *root, void __user *arg) s_uuid = di_args->uuid; mutex_lock(&fs_devices->device_list_mutex); - dev = btrfs_find_device(root, di_args->devid, s_uuid, NULL); + dev = btrfs_find_device(root->fs_info, di_args->devid, s_uuid, NULL); mutex_unlock(&fs_devices->device_list_mutex); if (!dev) { @@ -3089,7 +3089,7 @@ static long btrfs_ioctl_scrub(struct btrfs_root *root, void __user *arg) if (IS_ERR(sa)) return PTR_ERR(sa); - ret = btrfs_scrub_dev(root, sa->devid, sa->start, sa->end, + ret = btrfs_scrub_dev(root->fs_info, sa->devid, sa->start, sa->end, &sa->progress, sa->flags & BTRFS_SCRUB_READONLY); if (copy_to_user(arg, sa, sizeof(*sa))) @@ -3104,7 +3104,7 @@ static long btrfs_ioctl_scrub_cancel(struct btrfs_root *root, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - return btrfs_scrub_cancel(root); + return btrfs_scrub_cancel(root->fs_info); } static long btrfs_ioctl_scrub_progress(struct btrfs_root *root, diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 894bb27..6cf23f4 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -2262,9 +2262,8 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx, /* * get a reference count on fs_info->scrub_workers. start worker if necessary */ -static noinline_for_stack int scrub_workers_get(struct btrfs_root *root) +static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info) { - struct btrfs_fs_info *fs_info = root->fs_info; int ret = 0; mutex_lock(&fs_info->scrub_lock); @@ -2283,10 +2282,8 @@ out: return ret; } -static noinline_for_stack void scrub_workers_put(struct btrfs_root *root) +static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info) { - struct btrfs_fs_info *fs_info = root->fs_info; - mutex_lock(&fs_info->scrub_lock); if (--fs_info->scrub_workers_refcnt == 0) btrfs_stop_workers(&fs_info->scrub_workers); @@ -2294,29 +2291,29 @@ static noinline_for_stack void scrub_workers_put(struct btrfs_root *root) mutex_unlock(&fs_info->scrub_lock); } - -int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end, - struct btrfs_scrub_progress *progress, int readonly) +int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, + u64 end, struct btrfs_scrub_progress *progress, + int readonly) { struct scrub_ctx *sctx; - struct btrfs_fs_info *fs_info = root->fs_info; int ret; struct btrfs_device *dev; - if (btrfs_fs_closing(root->fs_info)) + if (btrfs_fs_closing(fs_info)) return -EINVAL; /* * check some assumptions */ - if (root->nodesize != root->leafsize) { + if (fs_info->chunk_root->nodesize != fs_info->chunk_root->leafsize) { printk(KERN_ERR "btrfs_scrub: size assumption nodesize == leafsize (%d == %d) fails\n", - root->nodesize, root->leafsize); + fs_info->chunk_root->nodesize, + fs_info->chunk_root->leafsize); return -EINVAL; } - if (root->nodesize > BTRFS_STRIPE_LEN) { + if (fs_info->chunk_root->nodesize > BTRFS_STRIPE_LEN) { /* * in this case scrub is unable to calculate the checksum * the way scrub is implemented. Do not handle this @@ -2324,15 +2321,16 @@ int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end, */ printk(KERN_ERR "btrfs_scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails\n", - root->nodesize, BTRFS_STRIPE_LEN); + fs_info->chunk_root->nodesize, BTRFS_STRIPE_LEN); return -EINVAL; } - if (root->sectorsize != PAGE_SIZE) { + if (fs_info->chunk_root->sectorsize != PAGE_SIZE) { /* not supported for data w/o checksums */ printk(KERN_ERR "btrfs_scrub: size assumption sectorsize != PAGE_SIZE (%d != %lld) fails\n", - root->sectorsize, (unsigned long long)PAGE_SIZE); + fs_info->chunk_root->sectorsize, + (unsigned long long)PAGE_SIZE); return -EINVAL; } @@ -2352,37 +2350,37 @@ int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end, return -EINVAL; } - ret = scrub_workers_get(root); + ret = scrub_workers_get(fs_info); if (ret) return ret; - mutex_lock(&root->fs_info->fs_devices->device_list_mutex); - dev = btrfs_find_device(root, devid, NULL, NULL); + mutex_lock(&fs_info->fs_devices->device_list_mutex); + dev = btrfs_find_device(fs_info, devid, NULL, NULL); if (!dev || dev->missing) { - mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); - scrub_workers_put(root); + mutex_unlock(&fs_info->fs_devices->device_list_mutex); + scrub_workers_put(fs_info); return -ENODEV; } mutex_lock(&fs_info->scrub_lock); if (!dev->in_fs_metadata) { mutex_unlock(&fs_info->scrub_lock); - mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); - scrub_workers_put(root); - return -ENODEV; + mutex_unlock(&fs_info->fs_devices->device_list_mutex); + scrub_workers_put(fs_info); + return -EIO; } if (dev->scrub_device) { mutex_unlock(&fs_info->scrub_lock); - mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); - scrub_workers_put(root); + mutex_unlock(&fs_info->fs_devices->device_list_mutex); + scrub_workers_put(fs_info); return -EINPROGRESS; } sctx = scrub_setup_ctx(dev); if (IS_ERR(sctx)) { mutex_unlock(&fs_info->scrub_lock); - mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); - scrub_workers_put(root); + mutex_unlock(&fs_info->fs_devices->device_list_mutex); + scrub_workers_put(fs_info); return PTR_ERR(sctx); } sctx->readonly = readonly; @@ -2390,7 +2388,7 @@ int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end, atomic_inc(&fs_info->scrubs_running); mutex_unlock(&fs_info->scrub_lock); - mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); + mutex_unlock(&fs_info->fs_devices->device_list_mutex); down_read(&fs_info->scrub_super_lock); ret = scrub_supers(sctx, dev); @@ -2413,7 +2411,7 @@ int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end, mutex_unlock(&fs_info->scrub_lock); scrub_free_ctx(sctx); - scrub_workers_put(root); + scrub_workers_put(fs_info); return ret; } @@ -2453,9 +2451,8 @@ void btrfs_scrub_continue_super(struct btrfs_root *root) up_write(&root->fs_info->scrub_super_lock); } -int __btrfs_scrub_cancel(struct btrfs_fs_info *fs_info) +int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info) { - mutex_lock(&fs_info->scrub_lock); if (!atomic_read(&fs_info->scrubs_running)) { mutex_unlock(&fs_info->scrub_lock); @@ -2475,14 +2472,9 @@ int __btrfs_scrub_cancel(struct btrfs_fs_info *fs_info) return 0; } -int btrfs_scrub_cancel(struct btrfs_root *root) +int btrfs_scrub_cancel_dev(struct btrfs_fs_info *fs_info, + struct btrfs_device *dev) { - return __btrfs_scrub_cancel(root->fs_info); -} - -int btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev) -{ - struct btrfs_fs_info *fs_info = root->fs_info; struct scrub_ctx *sctx; mutex_lock(&fs_info->scrub_lock); @@ -2514,12 +2506,12 @@ int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid) * does not go away in cancel_dev. FIXME: find a better solution */ mutex_lock(&fs_info->fs_devices->device_list_mutex); - dev = btrfs_find_device(root, devid, NULL, NULL); + dev = btrfs_find_device(fs_info, devid, NULL, NULL); if (!dev) { mutex_unlock(&fs_info->fs_devices->device_list_mutex); return -ENODEV; } - ret = btrfs_scrub_cancel_dev(root, dev); + ret = btrfs_scrub_cancel_dev(fs_info, dev); mutex_unlock(&fs_info->fs_devices->device_list_mutex); return ret; @@ -2532,7 +2524,7 @@ int btrfs_scrub_progress(struct btrfs_root *root, u64 devid, struct scrub_ctx *sctx = NULL; mutex_lock(&root->fs_info->fs_devices->device_list_mutex); - dev = btrfs_find_device(root, devid, NULL, NULL); + dev = btrfs_find_device(root->fs_info, devid, NULL, NULL); if (dev) sctx = dev->scrub_device; if (sctx) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index acd2df8..a1a6c296 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -116,7 +116,7 @@ static void btrfs_handle_error(struct btrfs_fs_info *fs_info) if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { sb->s_flags |= MS_RDONLY; printk(KERN_INFO "btrfs is forced readonly\n"); - __btrfs_scrub_cancel(fs_info); + btrfs_scrub_cancel(fs_info); // WARN_ON(1); } } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 6cd8a32..d2c0bcc 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1398,7 +1398,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) disk_super = (struct btrfs_super_block *)bh->b_data; devid = btrfs_stack_device_id(&disk_super->dev_item); dev_uuid = disk_super->dev_item.uuid; - device = btrfs_find_device(root, devid, dev_uuid, + device = btrfs_find_device(root->fs_info, devid, dev_uuid, disk_super->fsid); if (!device) { ret = -ENOENT; @@ -1435,7 +1435,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) spin_unlock(&root->fs_info->free_chunk_lock); device->in_fs_metadata = 0; - btrfs_scrub_cancel_dev(root, device); + btrfs_scrub_cancel_dev(root->fs_info, device); /* * the device list mutex makes sure that we don't change @@ -1492,7 +1492,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) * at this point, the device is zero sized. We want to * remove it from the devices list and zero out the old super */ - if (clear_super) { + if (clear_super && disk_super) { /* make sure this device isn't detected as part of * the FS anymore */ @@ -1540,7 +1540,7 @@ int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path, disk_super = (struct btrfs_super_block *)bh->b_data; devid = btrfs_stack_device_id(&disk_super->dev_item); dev_uuid = disk_super->dev_item.uuid; - *device = btrfs_find_device(root, devid, dev_uuid, + *device = btrfs_find_device(root->fs_info, devid, dev_uuid, disk_super->fsid); brelse(bh); if (!*device) @@ -1699,7 +1699,8 @@ next_slot: read_extent_buffer(leaf, fs_uuid, (unsigned long)btrfs_device_fsid(dev_item), BTRFS_UUID_SIZE); - device = btrfs_find_device(root, devid, dev_uuid, fs_uuid); + device = btrfs_find_device(root->fs_info, devid, dev_uuid, + fs_uuid); BUG_ON(!device); /* Logic error */ if (device->fs_devices->seeding) { @@ -4463,13 +4464,13 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, return 0; } -struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid, +struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid, u8 *uuid, u8 *fsid) { struct btrfs_device *device; struct btrfs_fs_devices *cur_devices; - cur_devices = root->fs_info->fs_devices; + cur_devices = fs_info->fs_devices; while (cur_devices) { if (!fsid || !memcmp(cur_devices->fsid, fsid, BTRFS_UUID_SIZE)) { @@ -4567,8 +4568,8 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, read_extent_buffer(leaf, uuid, (unsigned long) btrfs_stripe_dev_uuid_nr(chunk, i), BTRFS_UUID_SIZE); - map->stripes[i].dev = btrfs_find_device(root, devid, uuid, - NULL); + map->stripes[i].dev = btrfs_find_device(root->fs_info, devid, + uuid, NULL); if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) { kfree(map); free_extent_map(em); @@ -4686,7 +4687,7 @@ static int read_one_dev(struct btrfs_root *root, return ret; } - device = btrfs_find_device(root, devid, dev_uuid, fs_uuid); + device = btrfs_find_device(root->fs_info, devid, dev_uuid, fs_uuid); if (!device || !device->bdev) { if (!btrfs_test_opt(root, DEGRADED)) return -EIO; @@ -5078,7 +5079,7 @@ int btrfs_get_dev_stats(struct btrfs_root *root, int i; mutex_lock(&fs_devices->device_list_mutex); - dev = btrfs_find_device(root, stats->devid, NULL, NULL); + dev = btrfs_find_device(root->fs_info, stats->devid, NULL, NULL); mutex_unlock(&fs_devices->device_list_mutex); if (!dev) { diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 7eaaf4e..802e2ba 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -281,7 +281,7 @@ void btrfs_cleanup_fs_uuids(void); int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len); int btrfs_grow_device(struct btrfs_trans_handle *trans, struct btrfs_device *device, u64 new_size); -struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid, +struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid, u8 *uuid, u8 *fsid); int btrfs_shrink_device(struct btrfs_device *device, u64 new_size); int btrfs_init_new_device(struct btrfs_root *root, char *path); -- cgit v0.10.2 From 1acd6831d98779c88cd57f0a5826d6df0b09f3fa Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Mon, 5 Nov 2012 17:11:06 +0100 Subject: Btrfs: avoid risk of a deadlock in btrfs_handle_error Remove the attempt to cancel a running scrub or device replace operation in btrfs_handle_error() because it adds the risk of a deadlock. The only penalty of not canceling the operation is that some I/O remains active until the procedure completes. This is basically the same thing that happens to other tasks that are running in user mode context, they are not affected or stopped in btrfs_handle_error(), these tasks just need to handle write errors correctly. Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index a1a6c296..ef24158 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -116,7 +116,16 @@ static void btrfs_handle_error(struct btrfs_fs_info *fs_info) if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { sb->s_flags |= MS_RDONLY; printk(KERN_INFO "btrfs is forced readonly\n"); - btrfs_scrub_cancel(fs_info); + /* + * Note that a running device replace operation is not + * canceled here although there is no way to update + * the progress. It would add the risk of a deadlock, + * therefore the canceling is ommited. The only penalty + * is that some I/O remains active until the procedure + * completes. The next time when the filesystem is + * mounted writeable again, the device replace + * operation continues. + */ // WARN_ON(1); } } -- cgit v0.10.2 From e922e087a35c437acef3bc88ce31e59c699c38bd Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Mon, 5 Nov 2012 17:26:40 +0100 Subject: Btrfs: enhance btrfs structures for device replace support Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index f8bb62c..0781fd4 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -885,6 +885,42 @@ struct btrfs_dev_stats_item { __le64 values[BTRFS_DEV_STAT_VALUES_MAX]; } __attribute__ ((__packed__)); +#define BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS 0 +#define BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID 1 +#define BTRFS_DEV_REPLACE_ITEM_STATE_NEVER_STARTED 0 +#define BTRFS_DEV_REPLACE_ITEM_STATE_STARTED 1 +#define BTRFS_DEV_REPLACE_ITEM_STATE_SUSPENDED 2 +#define BTRFS_DEV_REPLACE_ITEM_STATE_FINISHED 3 +#define BTRFS_DEV_REPLACE_ITEM_STATE_CANCELED 4 + +struct btrfs_dev_replace { + u64 replace_state; /* see #define above */ + u64 time_started; /* seconds since 1-Jan-1970 */ + u64 time_stopped; /* seconds since 1-Jan-1970 */ + atomic64_t num_write_errors; + atomic64_t num_uncorrectable_read_errors; + + u64 cursor_left; + u64 committed_cursor_left; + u64 cursor_left_last_write_of_item; + u64 cursor_right; + + u64 cont_reading_from_srcdev_mode; /* see #define above */ + + int is_valid; + int item_needs_writeback; + struct btrfs_device *srcdev; + struct btrfs_device *tgtdev; + + pid_t lock_owner; + atomic_t nesting_level; + struct mutex lock_finishing_cancel_unmount; + struct mutex lock_management_lock; + struct mutex lock; + + struct btrfs_scrub_progress scrub_progress; +}; + /* different types of block groups (and chunks) */ #define BTRFS_BLOCK_GROUP_DATA (1ULL << 0) #define BTRFS_BLOCK_GROUP_SYSTEM (1ULL << 1) @@ -1471,6 +1507,9 @@ struct btrfs_fs_info { int backup_root_index; int num_tolerated_disk_barrier_failures; + + /* device replace state */ + struct btrfs_dev_replace dev_replace; }; /* diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 42a8024..9d1b710 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2131,6 +2131,11 @@ int open_ctree(struct super_block *sb, init_rwsem(&fs_info->extent_commit_sem); init_rwsem(&fs_info->cleanup_work_sem); init_rwsem(&fs_info->subvol_sem); + fs_info->dev_replace.lock_owner = 0; + atomic_set(&fs_info->dev_replace.nesting_level, 0); + mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount); + mutex_init(&fs_info->dev_replace.lock_management_lock); + mutex_init(&fs_info->dev_replace.lock); spin_lock_init(&fs_info->qgroup_lock); fs_info->qgroup_tree = RB_ROOT; -- cgit v0.10.2 From a2bff64025d7a707ac49155bb6678a636e55096e Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Mon, 5 Nov 2012 17:32:20 +0100 Subject: Btrfs: introduce a btrfs_dev_replace_item type Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 0781fd4..147406d 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -921,6 +921,23 @@ struct btrfs_dev_replace { struct btrfs_scrub_progress scrub_progress; }; +struct btrfs_dev_replace_item { + /* + * grow this item struct at the end for future enhancements and keep + * the existing values unchanged + */ + __le64 src_devid; + __le64 cursor_left; + __le64 cursor_right; + __le64 cont_reading_from_srcdev_mode; + + __le64 replace_state; + __le64 time_started; + __le64 time_stopped; + __le64 num_write_errors; + __le64 num_uncorrectable_read_errors; +} __attribute__ ((__packed__)); + /* different types of block groups (and chunks) */ #define BTRFS_BLOCK_GROUP_DATA (1ULL << 0) #define BTRFS_BLOCK_GROUP_SYSTEM (1ULL << 1) @@ -1763,6 +1780,12 @@ struct btrfs_ioctl_defrag_range_args { #define BTRFS_DEV_STATS_KEY 249 /* + * Persistantly stores the device replace state in the device tree. + * The key is built like this: (0, BTRFS_DEV_REPLACE_KEY, 0). + */ +#define BTRFS_DEV_REPLACE_KEY 250 + +/* * string items are for debugging. They just store a short string of * data in the FS */ @@ -2795,6 +2818,49 @@ BTRFS_SETGET_FUNCS(qgroup_limit_rsv_rfer, struct btrfs_qgroup_limit_item, BTRFS_SETGET_FUNCS(qgroup_limit_rsv_excl, struct btrfs_qgroup_limit_item, rsv_excl, 64); +/* btrfs_dev_replace_item */ +BTRFS_SETGET_FUNCS(dev_replace_src_devid, + struct btrfs_dev_replace_item, src_devid, 64); +BTRFS_SETGET_FUNCS(dev_replace_cont_reading_from_srcdev_mode, + struct btrfs_dev_replace_item, cont_reading_from_srcdev_mode, + 64); +BTRFS_SETGET_FUNCS(dev_replace_replace_state, struct btrfs_dev_replace_item, + replace_state, 64); +BTRFS_SETGET_FUNCS(dev_replace_time_started, struct btrfs_dev_replace_item, + time_started, 64); +BTRFS_SETGET_FUNCS(dev_replace_time_stopped, struct btrfs_dev_replace_item, + time_stopped, 64); +BTRFS_SETGET_FUNCS(dev_replace_num_write_errors, struct btrfs_dev_replace_item, + num_write_errors, 64); +BTRFS_SETGET_FUNCS(dev_replace_num_uncorrectable_read_errors, + struct btrfs_dev_replace_item, num_uncorrectable_read_errors, + 64); +BTRFS_SETGET_FUNCS(dev_replace_cursor_left, struct btrfs_dev_replace_item, + cursor_left, 64); +BTRFS_SETGET_FUNCS(dev_replace_cursor_right, struct btrfs_dev_replace_item, + cursor_right, 64); + +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_src_devid, + struct btrfs_dev_replace_item, src_devid, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cont_reading_from_srcdev_mode, + struct btrfs_dev_replace_item, + cont_reading_from_srcdev_mode, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_replace_state, + struct btrfs_dev_replace_item, replace_state, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_time_started, + struct btrfs_dev_replace_item, time_started, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_time_stopped, + struct btrfs_dev_replace_item, time_stopped, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_num_write_errors, + struct btrfs_dev_replace_item, num_write_errors, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_num_uncorrectable_read_errors, + struct btrfs_dev_replace_item, + num_uncorrectable_read_errors, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_left, + struct btrfs_dev_replace_item, cursor_left, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_right, + struct btrfs_dev_replace_item, cursor_right, 64); + static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb) { return sb->s_fs_info; diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 5e23684..50d95fd 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -297,6 +297,9 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) case BTRFS_DEV_STATS_KEY: printk(KERN_INFO "\t\tdevice stats\n"); break; + case BTRFS_DEV_REPLACE_KEY: + printk(KERN_INFO "\t\tdev replace\n"); + break; }; } } -- cgit v0.10.2 From 5ac00addc7ac09110995fe967071d191b5981cc1 Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Mon, 5 Nov 2012 17:54:08 +0100 Subject: Btrfs: disallow mutually exclusive admin operations from user mode Btrfs admin operations that are manually started from user mode and that cannot be executed at the same time return -EINPROGRESS. A common way to enter and leave this locked section is introduced since it used to be specific to the balance operation. Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 147406d..e9dc780 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1527,6 +1527,8 @@ struct btrfs_fs_info { /* device replace state */ struct btrfs_dev_replace dev_replace; + + atomic_t mutually_exclusive_operation_running; }; /* diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index b40b827..26f46da 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1317,13 +1317,13 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, if (!capable(CAP_SYS_ADMIN)) return -EPERM; - mutex_lock(&root->fs_info->volume_mutex); - if (root->fs_info->balance_ctl) { - printk(KERN_INFO "btrfs: balance in progress\n"); - ret = -EINVAL; - goto out; + if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, + 1)) { + pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); + return -EINPROGRESS; } + mutex_lock(&root->fs_info->volume_mutex); vol_args = memdup_user(arg, sizeof(*vol_args)); if (IS_ERR(vol_args)) { ret = PTR_ERR(vol_args); @@ -1419,6 +1419,7 @@ out_free: kfree(vol_args); out: mutex_unlock(&root->fs_info->volume_mutex); + atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); return ret; } @@ -2160,9 +2161,17 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) if (btrfs_root_readonly(root)) return -EROFS; + if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, + 1)) { + pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); + return -EINPROGRESS; + } ret = mnt_want_write_file(file); - if (ret) + if (ret) { + atomic_set(&root->fs_info->mutually_exclusive_operation_running, + 0); return ret; + } switch (inode->i_mode & S_IFMT) { case S_IFDIR: @@ -2214,6 +2223,7 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) } out: mnt_drop_write_file(file); + atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); return ret; } @@ -2225,13 +2235,13 @@ static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - mutex_lock(&root->fs_info->volume_mutex); - if (root->fs_info->balance_ctl) { - printk(KERN_INFO "btrfs: balance in progress\n"); - ret = -EINVAL; - goto out; + if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, + 1)) { + pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); + return -EINPROGRESS; } + mutex_lock(&root->fs_info->volume_mutex); vol_args = memdup_user(arg, sizeof(*vol_args)); if (IS_ERR(vol_args)) { ret = PTR_ERR(vol_args); @@ -2244,6 +2254,7 @@ static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg) kfree(vol_args); out: mutex_unlock(&root->fs_info->volume_mutex); + atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); return ret; } @@ -2258,13 +2269,13 @@ static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg) if (root->fs_info->sb->s_flags & MS_RDONLY) return -EROFS; - mutex_lock(&root->fs_info->volume_mutex); - if (root->fs_info->balance_ctl) { - printk(KERN_INFO "btrfs: balance in progress\n"); - ret = -EINVAL; - goto out; + if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, + 1)) { + pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); + return -EINPROGRESS; } + mutex_lock(&root->fs_info->volume_mutex); vol_args = memdup_user(arg, sizeof(*vol_args)); if (IS_ERR(vol_args)) { ret = PTR_ERR(vol_args); @@ -2277,6 +2288,7 @@ static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg) kfree(vol_args); out: mutex_unlock(&root->fs_info->volume_mutex); + atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); return ret; } @@ -3319,6 +3331,7 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg) struct btrfs_ioctl_balance_args *bargs; struct btrfs_balance_control *bctl; int ret; + int need_to_clear_lock = 0; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -3354,10 +3367,13 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg) bargs = NULL; } - if (fs_info->balance_ctl) { + if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, + 1)) { + pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); ret = -EINPROGRESS; goto out_bargs; } + need_to_clear_lock = 1; bctl = kzalloc(sizeof(*bctl), GFP_NOFS); if (!bctl) { @@ -3391,6 +3407,9 @@ do_balance: out_bargs: kfree(bargs); out: + if (need_to_clear_lock) + atomic_set(&root->fs_info->mutually_exclusive_operation_running, + 0); mutex_unlock(&fs_info->balance_mutex); mutex_unlock(&fs_info->volume_mutex); mnt_drop_write_file(file); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index d2c0bcc..33ca36b 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2952,6 +2952,7 @@ static int balance_kthread(void *data) ret = btrfs_balance(fs_info->balance_ctl, NULL); } + atomic_set(&fs_info->mutually_exclusive_operation_running, 0); mutex_unlock(&fs_info->balance_mutex); mutex_unlock(&fs_info->volume_mutex); @@ -2974,6 +2975,7 @@ int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info) return 0; } + WARN_ON(atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)); tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance"); if (IS_ERR(tsk)) return PTR_ERR(tsk); -- cgit v0.10.2 From 63a212abc2315972b245f93cb11ae3acf3c0b513 Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Mon, 5 Nov 2012 18:29:28 +0100 Subject: Btrfs: disallow some operations on the device replace target device This patch adds some code to disallow operations on the device that is used as the target for the device replace operation. Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index e9dc780..746cb6a 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3649,7 +3649,7 @@ int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, /* scrub.c */ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, u64 end, struct btrfs_scrub_progress *progress, - int readonly); + int readonly, int is_dev_replace); void btrfs_scrub_pause(struct btrfs_root *root); void btrfs_scrub_pause_super(struct btrfs_root *root); void btrfs_scrub_continue(struct btrfs_root *root); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index b4d438f..98af837 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -7468,7 +7468,8 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) * check to make sure we can actually find a chunk with enough * space to fit our block group in. */ - if (device->total_bytes > device->bytes_used + min_free) { + if (device->total_bytes > device->bytes_used + min_free && + !device->is_tgtdev_for_dev_replace) { ret = find_free_dev_extent(device, min_free, &dev_offset, NULL); if (!ret) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 26f46da..e54b5e5 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1375,6 +1375,11 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, } } + if (device->is_tgtdev_for_dev_replace) { + ret = -EINVAL; + goto out_free; + } + old_size = device->total_bytes; if (mod < 0) { @@ -3102,7 +3107,8 @@ static long btrfs_ioctl_scrub(struct btrfs_root *root, void __user *arg) return PTR_ERR(sa); ret = btrfs_scrub_dev(root->fs_info, sa->devid, sa->start, sa->end, - &sa->progress, sa->flags & BTRFS_SCRUB_READONLY); + &sa->progress, sa->flags & BTRFS_SCRUB_READONLY, + 0); if (copy_to_user(arg, sa, sizeof(*sa))) ret = -EFAULT; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 6cf23f4..460e30b 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -116,6 +116,9 @@ struct scrub_ctx { u32 sectorsize; u32 nodesize; u32 leafsize; + + int is_dev_replace; + /* * statistics */ @@ -284,7 +287,7 @@ static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx) } static noinline_for_stack -struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev) +struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace) { struct scrub_ctx *sctx; int i; @@ -296,6 +299,7 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev) sctx = kzalloc(sizeof(*sctx), GFP_NOFS); if (!sctx) goto nomem; + sctx->is_dev_replace = is_dev_replace; sctx->pages_per_bio = pages_per_bio; sctx->curr = -1; sctx->dev_root = dev->dev_root; @@ -2293,7 +2297,7 @@ static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info) int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, u64 end, struct btrfs_scrub_progress *progress, - int readonly) + int readonly, int is_dev_replace) { struct scrub_ctx *sctx; int ret; @@ -2356,14 +2360,14 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, mutex_lock(&fs_info->fs_devices->device_list_mutex); dev = btrfs_find_device(fs_info, devid, NULL, NULL); - if (!dev || dev->missing) { + if (!dev || (dev->missing && !is_dev_replace)) { mutex_unlock(&fs_info->fs_devices->device_list_mutex); scrub_workers_put(fs_info); return -ENODEV; } mutex_lock(&fs_info->scrub_lock); - if (!dev->in_fs_metadata) { + if (!dev->in_fs_metadata || dev->is_tgtdev_for_dev_replace) { mutex_unlock(&fs_info->scrub_lock); mutex_unlock(&fs_info->fs_devices->device_list_mutex); scrub_workers_put(fs_info); @@ -2376,7 +2380,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, scrub_workers_put(fs_info); return -EINPROGRESS; } - sctx = scrub_setup_ctx(dev); + sctx = scrub_setup_ctx(dev, is_dev_replace); if (IS_ERR(sctx)) { mutex_unlock(&fs_info->scrub_lock); mutex_unlock(&fs_info->fs_devices->device_list_mutex); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index ef24158..837ad2d 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1354,7 +1354,8 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) min_stripe_size = BTRFS_STRIPE_LEN; list_for_each_entry(device, &fs_devices->devices, dev_list) { - if (!device->in_fs_metadata || !device->bdev) + if (!device->in_fs_metadata || !device->bdev || + device->is_tgtdev_for_dev_replace) continue; avail_space = device->total_bytes - device->bytes_used; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 33ca36b..31f7af8 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -518,8 +518,9 @@ again: /* This is the initialized path, it is safe to release the devices. */ list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { if (device->in_fs_metadata) { - if (!latest_transid || - device->generation > latest_transid) { + if (!device->is_tgtdev_for_dev_replace && + (!latest_transid || + device->generation > latest_transid)) { latest_devid = device->devid; latest_transid = device->generation; latest_bdev = device->bdev; @@ -814,7 +815,7 @@ int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, *length = 0; - if (start >= device->total_bytes) + if (start >= device->total_bytes || device->is_tgtdev_for_dev_replace) return 0; path = btrfs_alloc_path(); @@ -931,7 +932,7 @@ int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, max_hole_size = 0; hole_size = 0; - if (search_start >= search_end) { + if (search_start >= search_end || device->is_tgtdev_for_dev_replace) { ret = -ENOSPC; goto error; } @@ -1114,6 +1115,7 @@ int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, struct btrfs_key key; WARN_ON(!device->in_fs_metadata); + WARN_ON(device->is_tgtdev_for_dev_replace); path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -1375,7 +1377,9 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) * is held. */ list_for_each_entry(tmp, devices, dev_list) { - if (tmp->in_fs_metadata && !tmp->bdev) { + if (tmp->in_fs_metadata && + !tmp->is_tgtdev_for_dev_replace && + !tmp->bdev) { device = tmp; break; } @@ -1406,6 +1410,12 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) } } + if (device->is_tgtdev_for_dev_replace) { + pr_err("btrfs: unable to remove the dev_replace target dev\n"); + ret = -EINVAL; + goto error_brelse; + } + if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) { printk(KERN_ERR "btrfs: unable to remove the only writeable " "device\n"); @@ -1425,6 +1435,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) if (ret) goto error_undo; + /* + * TODO: the superblock still includes this device in its num_devices + * counter although write_all_supers() is not locked out. This + * could give a filesystem state which requires a degraded mount. + */ ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device); if (ret) goto error_undo; @@ -1808,6 +1823,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) device->dev_root = root->fs_info->dev_root; device->bdev = bdev; device->in_fs_metadata = 1; + device->is_tgtdev_for_dev_replace = 0; device->mode = FMODE_EXCL; set_blocksize(device->bdev, 4096); @@ -1971,7 +1987,8 @@ static int __btrfs_grow_device(struct btrfs_trans_handle *trans, if (!device->writeable) return -EACCES; - if (new_size <= device->total_bytes) + if (new_size <= device->total_bytes || + device->is_tgtdev_for_dev_replace) return -EINVAL; btrfs_set_super_total_bytes(super_copy, old_total + diff); @@ -2600,7 +2617,8 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info) size_to_free = div_factor(old_size, 1); size_to_free = min(size_to_free, (u64)1 * 1024 * 1024); if (!device->writeable || - device->total_bytes - device->bytes_used > size_to_free) + device->total_bytes - device->bytes_used > size_to_free || + device->is_tgtdev_for_dev_replace) continue; ret = btrfs_shrink_device(device, old_size - size_to_free); @@ -3132,6 +3150,9 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) u64 old_size = device->total_bytes; u64 diff = device->total_bytes - new_size; + if (device->is_tgtdev_for_dev_replace) + return -EINVAL; + path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -3401,7 +3422,8 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, continue; } - if (!device->in_fs_metadata) + if (!device->in_fs_metadata || + device->is_tgtdev_for_dev_replace) continue; if (device->total_bytes > device->bytes_used) @@ -4612,6 +4634,7 @@ static void fill_device_from_item(struct extent_buffer *leaf, device->io_align = btrfs_device_io_align(leaf, dev_item); device->io_width = btrfs_device_io_width(leaf, dev_item); device->sector_size = btrfs_device_sector_size(leaf, dev_item); + device->is_tgtdev_for_dev_replace = 0; ptr = (unsigned long)btrfs_device_uuid(dev_item); read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); @@ -4722,7 +4745,7 @@ static int read_one_dev(struct btrfs_root *root, fill_device_from_item(leaf, dev_item, device); device->dev_root = root->fs_info->dev_root; device->in_fs_metadata = 1; - if (device->writeable) { + if (device->writeable && !device->is_tgtdev_for_dev_replace) { device->fs_devices->total_rw_bytes += device->total_bytes; spin_lock(&root->fs_info->free_chunk_lock); root->fs_info->free_chunk_space += device->total_bytes - diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 802e2ba..8fd5a4d 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -50,6 +50,7 @@ struct btrfs_device { int in_fs_metadata; int missing; int can_discard; + int is_tgtdev_for_dev_replace; spinlock_t io_lock; -- cgit v0.10.2 From 618919236ba54361e93106f4951d233a7ade63cd Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Mon, 5 Nov 2012 18:51:52 +0100 Subject: Btrfs: handle errors from btrfs_map_bio() everywhere With the addition of the device replace procedure, it is possible for btrfs_map_bio(READ) to report an error. This happens when the specific mirror is requested which is located on the target disk, and the copy operation has not yet copied this block. Hence the block cannot be read and this error state is indicated by returning EIO. Some background information follows now. A new mirror is added while the device replace procedure is running. btrfs_get_num_copies() returns one more, and btrfs_map_bio(GET_READ_MIRROR) adds one more mirror if a disk location is involved that was already handled by the device replace copy operation. The assigned mirror num is the highest mirror number, e.g. the value 3 in case of RAID1. If btrfs_map_bio() is invoked with mirror_num == 0 (i.e., select any mirror), the copy on the target drive is never selected because that disk shall be able to perform the write requests as quickly as possible. The parallel execution of read requests would only slow down the disk copy procedure. Second case is that btrfs_map_bio() is called with mirror_num > 0. This is done from the repair code only. In this case, the highest mirror num is assigned to the target disk, since it is used last. And when this mirror is not available because the copy procedure has not yet handled this area, an error is returned. Everywhere in the code the handling of such errors is added now. Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 8f9abed..badc6f1 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -1585,6 +1585,18 @@ static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len, ret = btrfs_map_block(state->root->fs_info, READ, bytenr, &length, &multi, mirror_num); + if (ret) { + block_ctx_out->start = 0; + block_ctx_out->dev_bytenr = 0; + block_ctx_out->len = 0; + block_ctx_out->dev = NULL; + block_ctx_out->datav = NULL; + block_ctx_out->pagev = NULL; + block_ctx_out->mem_to_free = NULL; + + return ret; + } + device = multi->stripes[0].dev; block_ctx_out->dev = btrfsic_dev_state_lookup(device->bdev); block_ctx_out->dev_bytenr = multi->stripes[0].physical; @@ -1594,8 +1606,7 @@ static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len, block_ctx_out->pagev = NULL; block_ctx_out->mem_to_free = NULL; - if (0 == ret) - kfree(multi); + kfree(multi); if (NULL == block_ctx_out->dev) { ret = -ENXIO; printk(KERN_INFO "btrfsic: error, cannot lookup dev (#1)!\n"); diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index c6467aa..94ab2f8 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -687,7 +687,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0); - BUG_ON(ret); /* -ENOMEM */ + if (ret) + bio_endio(comp_bio, ret); bio_put(comp_bio); @@ -712,7 +713,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, } ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0); - BUG_ON(ret); /* -ENOMEM */ + if (ret) + bio_endio(comp_bio, ret); bio_put(comp_bio); return 0; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 9d1b710..0e41047 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -852,11 +852,16 @@ static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio, int mirror_num, unsigned long bio_flags, u64 bio_offset) { + int ret; + /* * when we're called for a write, we're already in the async * submission context. Just jump into btrfs_map_bio */ - return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1); + ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1); + if (ret) + bio_endio(bio, ret); + return ret; } static int check_async_write(struct inode *inode, unsigned long bio_flags) @@ -878,7 +883,6 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, int ret; if (!(rw & REQ_WRITE)) { - /* * called for a read, do the setup so that checksum validation * can happen in the async kernel threads @@ -886,26 +890,32 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info, bio, 1); if (ret) - return ret; - return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, - mirror_num, 0); + goto out_w_error; + ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, + mirror_num, 0); } else if (!async) { ret = btree_csum_one_bio(bio); if (ret) - return ret; - return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, - mirror_num, 0); + goto out_w_error; + ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, + mirror_num, 0); + } else { + /* + * kthread helpers are used to submit writes so that + * checksumming can happen in parallel across all CPUs + */ + ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, + inode, rw, bio, mirror_num, 0, + bio_offset, + __btree_submit_bio_start, + __btree_submit_bio_done); } - /* - * kthread helpers are used to submit writes so that checksumming - * can happen in parallel across all CPUs - */ - return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, - inode, rw, bio, mirror_num, 0, - bio_offset, - __btree_submit_bio_start, - __btree_submit_bio_done); + if (ret) { +out_w_error: + bio_endio(bio, ret); + } + return ret; } #ifdef CONFIG_MIGRATION diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 62ec6e4..1b319df 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2462,10 +2462,6 @@ btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, return bio; } -/* - * Since writes are async, they will only return -ENOMEM. - * Reads can return the full range of I/O error conditions. - */ static int __must_check submit_one_bio(int rw, struct bio *bio, int mirror_num, unsigned long bio_flags) { diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 5d1675a..d7bf2e7 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1602,7 +1602,12 @@ static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio, u64 bio_offset) { struct btrfs_root *root = BTRFS_I(inode)->root; - return btrfs_map_bio(root, rw, bio, mirror_num, 1); + int ret; + + ret = btrfs_map_bio(root, rw, bio, mirror_num, 1); + if (ret) + bio_endio(bio, ret); + return ret; } /* @@ -1626,15 +1631,17 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, if (!(rw & REQ_WRITE)) { ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata); if (ret) - return ret; + goto out; if (bio_flags & EXTENT_BIO_COMPRESSED) { - return btrfs_submit_compressed_read(inode, bio, - mirror_num, bio_flags); + ret = btrfs_submit_compressed_read(inode, bio, + mirror_num, + bio_flags); + goto out; } else if (!skip_sum) { ret = btrfs_lookup_bio_sums(root, inode, bio, NULL); if (ret) - return ret; + goto out; } goto mapit; } else if (!skip_sum) { @@ -1642,15 +1649,21 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) goto mapit; /* we're doing a write, do the async checksumming */ - return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, + ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, inode, rw, bio, mirror_num, bio_flags, bio_offset, __btrfs_submit_bio_start, __btrfs_submit_bio_done); + goto out; } mapit: - return btrfs_map_bio(root, rw, bio, mirror_num, 0); + ret = btrfs_map_bio(root, rw, bio, mirror_num, 0); + +out: + if (ret < 0) + bio_endio(bio, ret); + return ret; } /* diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 31f7af8..4158628 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4435,7 +4435,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, ret = btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio, mirror_num); - if (ret) /* -ENOMEM */ + if (ret) return ret; total_devs = bbio->num_stripes; -- cgit v0.10.2 From ff023aac31198e88507d626825379b28ea481d4d Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Tue, 6 Nov 2012 11:43:11 +0100 Subject: Btrfs: add code to scrub to copy read data to another disk The device replace procedure makes use of the scrub code. The scrub code is the most efficient code to read the allocated data of a disk, i.e. it reads sequentially in order to avoid disk head movements, it skips unallocated blocks, it uses read ahead mechanisms, and it contains all the code to detect and repair defects. This commit adds code to scrub to allow the scrub code to copy read data to another disk. One goal is to be able to perform as fast as possible. Therefore the write requests are collected until huge bios are built, and the write process is decoupled from the read process with some kind of flow control, of course, in order to limit the allocated memory. The best performance on spinning disks could by reached when the head movements are avoided as much as possible. Therefore a single worker is used to interface the read process with the write process. The regular scrub operation works as fast as before, it is not negatively influenced and actually it is more or less unchanged. Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 746cb6a..ded7caa 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1483,6 +1483,8 @@ struct btrfs_fs_info { struct rw_semaphore scrub_super_lock; int scrub_workers_refcnt; struct btrfs_workers scrub_workers; + struct btrfs_workers scrub_wr_completion_workers; + struct btrfs_workers scrub_nocow_workers; #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY u32 check_integrity_print_mask; diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h new file mode 100644 index 0000000..1fb5c89 --- /dev/null +++ b/fs/btrfs/dev-replace.h @@ -0,0 +1,26 @@ +/* + * Copyright (C) STRATO AG 2012. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#if !defined(__BTRFS_DEV_REPLACE__) +#define __BTRFS_DEV_REPLACE__ + +static inline void btrfs_dev_replace_stats_inc(atomic64_t *stat_value) +{ + atomic64_inc(stat_value); +} +#endif diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index 0ddc565..9f363e1 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -418,12 +418,17 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, */ continue; } + if (!dev->bdev) { + /* cannot read ahead on missing device */ + continue; + } prev_dev = dev; ret = radix_tree_insert(&dev->reada_extents, index, re); if (ret) { while (--i >= 0) { dev = bbio->stripes[i].dev; BUG_ON(dev == NULL); + /* ignore whether the entry was inserted */ radix_tree_delete(&dev->reada_extents, index); } BUG_ON(fs_info == NULL); @@ -914,7 +919,10 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root, generation = btrfs_header_generation(node); free_extent_buffer(node); - reada_add_block(rc, start, &max_key, level, generation); + if (reada_add_block(rc, start, &max_key, level, generation)) { + kfree(rc); + return ERR_PTR(-ENOMEM); + } reada_start_machine(root->fs_info); diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 460e30b..61157a2 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -25,6 +25,7 @@ #include "transaction.h" #include "backref.h" #include "extent_io.h" +#include "dev-replace.h" #include "check-integrity.h" #include "rcu-string.h" @@ -44,8 +45,15 @@ struct scrub_block; struct scrub_ctx; -#define SCRUB_PAGES_PER_BIO 16 /* 64k per bio */ -#define SCRUB_BIOS_PER_CTX 16 /* 1 MB per device in flight */ +/* + * the following three values only influence the performance. + * The last one configures the number of parallel and outstanding I/O + * operations. The first two values configure an upper limit for the number + * of (dynamically allocated) pages that are added to a bio. + */ +#define SCRUB_PAGES_PER_RD_BIO 32 /* 128k per bio */ +#define SCRUB_PAGES_PER_WR_BIO 32 /* 128k per bio */ +#define SCRUB_BIOS_PER_SCTX 64 /* 8MB per device in flight */ /* * the following value times PAGE_SIZE needs to be large enough to match the @@ -62,6 +70,7 @@ struct scrub_page { u64 generation; u64 logical; u64 physical; + u64 physical_for_dev_replace; atomic_t ref_count; struct { unsigned int mirror_num:8; @@ -79,7 +88,11 @@ struct scrub_bio { int err; u64 logical; u64 physical; - struct scrub_page *pagev[SCRUB_PAGES_PER_BIO]; +#if SCRUB_PAGES_PER_WR_BIO >= SCRUB_PAGES_PER_RD_BIO + struct scrub_page *pagev[SCRUB_PAGES_PER_WR_BIO]; +#else + struct scrub_page *pagev[SCRUB_PAGES_PER_RD_BIO]; +#endif int page_count; int next_free; struct btrfs_work work; @@ -99,8 +112,16 @@ struct scrub_block { }; }; +struct scrub_wr_ctx { + struct scrub_bio *wr_curr_bio; + struct btrfs_device *tgtdev; + int pages_per_wr_bio; /* <= SCRUB_PAGES_PER_WR_BIO */ + atomic_t flush_all_writes; + struct mutex wr_lock; +}; + struct scrub_ctx { - struct scrub_bio *bios[SCRUB_BIOS_PER_CTX]; + struct scrub_bio *bios[SCRUB_BIOS_PER_SCTX]; struct btrfs_root *dev_root; int first_free; int curr; @@ -112,12 +133,13 @@ struct scrub_ctx { struct list_head csum_list; atomic_t cancel_req; int readonly; - int pages_per_bio; /* <= SCRUB_PAGES_PER_BIO */ + int pages_per_rd_bio; u32 sectorsize; u32 nodesize; u32 leafsize; int is_dev_replace; + struct scrub_wr_ctx wr_ctx; /* * statistics @@ -135,6 +157,15 @@ struct scrub_fixup_nodatasum { int mirror_num; }; +struct scrub_copy_nocow_ctx { + struct scrub_ctx *sctx; + u64 logical; + u64 len; + int mirror_num; + u64 physical_for_dev_replace; + struct btrfs_work work; +}; + struct scrub_warning { struct btrfs_path *path; u64 extent_item_size; @@ -156,8 +187,9 @@ static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx); static int scrub_handle_errored_block(struct scrub_block *sblock_to_check); static int scrub_setup_recheck_block(struct scrub_ctx *sctx, struct btrfs_fs_info *fs_info, + struct scrub_block *original_sblock, u64 length, u64 logical, - struct scrub_block *sblock); + struct scrub_block *sblocks_for_recheck); static void scrub_recheck_block(struct btrfs_fs_info *fs_info, struct scrub_block *sblock, int is_metadata, int have_csum, u8 *csum, u64 generation, @@ -174,6 +206,9 @@ static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad, static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, struct scrub_block *sblock_good, int page_num, int force_write); +static void scrub_write_block_to_dev_replace(struct scrub_block *sblock); +static int scrub_write_page_to_dev_replace(struct scrub_block *sblock, + int page_num); static int scrub_checksum_data(struct scrub_block *sblock); static int scrub_checksum_tree_block(struct scrub_block *sblock); static int scrub_checksum_super(struct scrub_block *sblock); @@ -181,14 +216,38 @@ static void scrub_block_get(struct scrub_block *sblock); static void scrub_block_put(struct scrub_block *sblock); static void scrub_page_get(struct scrub_page *spage); static void scrub_page_put(struct scrub_page *spage); -static int scrub_add_page_to_bio(struct scrub_ctx *sctx, - struct scrub_page *spage); +static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx, + struct scrub_page *spage); static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len, u64 physical, struct btrfs_device *dev, u64 flags, - u64 gen, int mirror_num, u8 *csum, int force); + u64 gen, int mirror_num, u8 *csum, int force, + u64 physical_for_dev_replace); static void scrub_bio_end_io(struct bio *bio, int err); static void scrub_bio_end_io_worker(struct btrfs_work *work); static void scrub_block_complete(struct scrub_block *sblock); +static void scrub_remap_extent(struct btrfs_fs_info *fs_info, + u64 extent_logical, u64 extent_len, + u64 *extent_physical, + struct btrfs_device **extent_dev, + int *extent_mirror_num); +static int scrub_setup_wr_ctx(struct scrub_ctx *sctx, + struct scrub_wr_ctx *wr_ctx, + struct btrfs_fs_info *fs_info, + struct btrfs_device *dev, + int is_dev_replace); +static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx); +static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx, + struct scrub_page *spage); +static void scrub_wr_submit(struct scrub_ctx *sctx); +static void scrub_wr_bio_end_io(struct bio *bio, int err); +static void scrub_wr_bio_end_io_worker(struct btrfs_work *work); +static int write_page_nocow(struct scrub_ctx *sctx, + u64 physical_for_dev_replace, struct page *page); +static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, + void *ctx); +static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len, + int mirror_num, u64 physical_for_dev_replace); +static void copy_nocow_pages_worker(struct btrfs_work *work); static void scrub_pending_bio_inc(struct scrub_ctx *sctx) @@ -262,19 +321,20 @@ static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx) if (!sctx) return; + scrub_free_wr_ctx(&sctx->wr_ctx); + /* this can happen when scrub is cancelled */ if (sctx->curr != -1) { struct scrub_bio *sbio = sctx->bios[sctx->curr]; for (i = 0; i < sbio->page_count; i++) { - BUG_ON(!sbio->pagev[i]); - BUG_ON(!sbio->pagev[i]->page); + WARN_ON(!sbio->pagev[i]->page); scrub_block_put(sbio->pagev[i]->sblock); } bio_put(sbio->bio); } - for (i = 0; i < SCRUB_BIOS_PER_CTX; ++i) { + for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) { struct scrub_bio *sbio = sctx->bios[i]; if (!sbio) @@ -292,18 +352,29 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace) struct scrub_ctx *sctx; int i; struct btrfs_fs_info *fs_info = dev->dev_root->fs_info; - int pages_per_bio; + int pages_per_rd_bio; + int ret; - pages_per_bio = min_t(int, SCRUB_PAGES_PER_BIO, - bio_get_nr_vecs(dev->bdev)); + /* + * the setting of pages_per_rd_bio is correct for scrub but might + * be wrong for the dev_replace code where we might read from + * different devices in the initial huge bios. However, that + * code is able to correctly handle the case when adding a page + * to a bio fails. + */ + if (dev->bdev) + pages_per_rd_bio = min_t(int, SCRUB_PAGES_PER_RD_BIO, + bio_get_nr_vecs(dev->bdev)); + else + pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO; sctx = kzalloc(sizeof(*sctx), GFP_NOFS); if (!sctx) goto nomem; sctx->is_dev_replace = is_dev_replace; - sctx->pages_per_bio = pages_per_bio; + sctx->pages_per_rd_bio = pages_per_rd_bio; sctx->curr = -1; sctx->dev_root = dev->dev_root; - for (i = 0; i < SCRUB_BIOS_PER_CTX; ++i) { + for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) { struct scrub_bio *sbio; sbio = kzalloc(sizeof(*sbio), GFP_NOFS); @@ -316,7 +387,7 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace) sbio->page_count = 0; sbio->work.func = scrub_bio_end_io_worker; - if (i != SCRUB_BIOS_PER_CTX - 1) + if (i != SCRUB_BIOS_PER_SCTX - 1) sctx->bios[i]->next_free = i + 1; else sctx->bios[i]->next_free = -1; @@ -334,6 +405,13 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace) spin_lock_init(&sctx->list_lock); spin_lock_init(&sctx->stat_lock); init_waitqueue_head(&sctx->list_wait); + + ret = scrub_setup_wr_ctx(sctx, &sctx->wr_ctx, fs_info, + fs_info->dev_replace.tgtdev, is_dev_replace); + if (ret) { + scrub_free_ctx(sctx); + return ERR_PTR(ret); + } return sctx; nomem: @@ -341,7 +419,8 @@ nomem: return ERR_PTR(-ENOMEM); } -static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx) +static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, + void *warn_ctx) { u64 isize; u32 nlink; @@ -349,7 +428,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx) int i; struct extent_buffer *eb; struct btrfs_inode_item *inode_item; - struct scrub_warning *swarn = ctx; + struct scrub_warning *swarn = warn_ctx; struct btrfs_fs_info *fs_info = swarn->dev->dev_root->fs_info; struct inode_fs_paths *ipath = NULL; struct btrfs_root *local_root; @@ -492,11 +571,11 @@ out: kfree(swarn.msg_buf); } -static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx) +static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx) { struct page *page = NULL; unsigned long index; - struct scrub_fixup_nodatasum *fixup = ctx; + struct scrub_fixup_nodatasum *fixup = fixup_ctx; int ret; int corrected = 0; struct btrfs_key key; @@ -660,7 +739,9 @@ out: spin_lock(&sctx->stat_lock); ++sctx->stat.uncorrectable_errors; spin_unlock(&sctx->stat_lock); - + btrfs_dev_replace_stats_inc( + &sctx->dev_root->fs_info->dev_replace. + num_uncorrectable_read_errors); printk_ratelimited_in_rcu(KERN_ERR "btrfs: unable to fixup (nodatasum) error at logical %llu on dev %s\n", (unsigned long long)fixup->logical, @@ -715,6 +796,11 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) csum = sblock_to_check->pagev[0]->csum; dev = sblock_to_check->pagev[0]->dev; + if (sctx->is_dev_replace && !is_metadata && !have_csum) { + sblocks_for_recheck = NULL; + goto nodatasum_case; + } + /* * read all mirrors one after the other. This includes to * re-read the extent or metadata block that failed (that was @@ -758,7 +844,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) } /* setup the context, map the logical blocks and alloc the pages */ - ret = scrub_setup_recheck_block(sctx, fs_info, length, + ret = scrub_setup_recheck_block(sctx, fs_info, sblock_to_check, length, logical, sblocks_for_recheck); if (ret) { spin_lock(&sctx->stat_lock); @@ -789,6 +875,8 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) sctx->stat.unverified_errors++; spin_unlock(&sctx->stat_lock); + if (sctx->is_dev_replace) + scrub_write_block_to_dev_replace(sblock_bad); goto out; } @@ -822,12 +910,15 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) BTRFS_DEV_STAT_CORRUPTION_ERRS); } - if (sctx->readonly) + if (sctx->readonly && !sctx->is_dev_replace) goto did_not_correct_error; if (!is_metadata && !have_csum) { struct scrub_fixup_nodatasum *fixup_nodatasum; +nodatasum_case: + WARN_ON(sctx->is_dev_replace); + /* * !is_metadata and !have_csum, this means that the data * might not be COW'ed, that it might be modified @@ -883,18 +974,79 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) if (!sblock_other->header_error && !sblock_other->checksum_error && sblock_other->no_io_error_seen) { - int force_write = is_metadata || have_csum; - - ret = scrub_repair_block_from_good_copy(sblock_bad, - sblock_other, - force_write); + if (sctx->is_dev_replace) { + scrub_write_block_to_dev_replace(sblock_other); + } else { + int force_write = is_metadata || have_csum; + + ret = scrub_repair_block_from_good_copy( + sblock_bad, sblock_other, + force_write); + } if (0 == ret) goto corrected_error; } } /* - * in case of I/O errors in the area that is supposed to be + * for dev_replace, pick good pages and write to the target device. + */ + if (sctx->is_dev_replace) { + success = 1; + for (page_num = 0; page_num < sblock_bad->page_count; + page_num++) { + int sub_success; + + sub_success = 0; + for (mirror_index = 0; + mirror_index < BTRFS_MAX_MIRRORS && + sblocks_for_recheck[mirror_index].page_count > 0; + mirror_index++) { + struct scrub_block *sblock_other = + sblocks_for_recheck + mirror_index; + struct scrub_page *page_other = + sblock_other->pagev[page_num]; + + if (!page_other->io_error) { + ret = scrub_write_page_to_dev_replace( + sblock_other, page_num); + if (ret == 0) { + /* succeeded for this page */ + sub_success = 1; + break; + } else { + btrfs_dev_replace_stats_inc( + &sctx->dev_root-> + fs_info->dev_replace. + num_write_errors); + } + } + } + + if (!sub_success) { + /* + * did not find a mirror to fetch the page + * from. scrub_write_page_to_dev_replace() + * handles this case (page->io_error), by + * filling the block with zeros before + * submitting the write request + */ + success = 0; + ret = scrub_write_page_to_dev_replace( + sblock_bad, page_num); + if (ret) + btrfs_dev_replace_stats_inc( + &sctx->dev_root->fs_info-> + dev_replace.num_write_errors); + } + } + + goto out; + } + + /* + * for regular scrub, repair those pages that are errored. + * In case of I/O errors in the area that is supposed to be * repaired, continue by picking good copies of those pages. * Select the good pages from mirrors to rewrite bad pages from * the area to fix. Afterwards verify the checksum of the block @@ -1017,6 +1169,7 @@ out: static int scrub_setup_recheck_block(struct scrub_ctx *sctx, struct btrfs_fs_info *fs_info, + struct scrub_block *original_sblock, u64 length, u64 logical, struct scrub_block *sblocks_for_recheck) { @@ -1047,7 +1200,7 @@ static int scrub_setup_recheck_block(struct scrub_ctx *sctx, return -EIO; } - BUG_ON(page_index >= SCRUB_PAGES_PER_BIO); + BUG_ON(page_index >= SCRUB_PAGES_PER_RD_BIO); for (mirror_index = 0; mirror_index < (int)bbio->num_stripes; mirror_index++) { struct scrub_block *sblock; @@ -1071,6 +1224,10 @@ leave_nomem: sblock->pagev[page_index] = page; page->logical = logical; page->physical = bbio->stripes[mirror_index].physical; + BUG_ON(page_index >= original_sblock->page_count); + page->physical_for_dev_replace = + original_sblock->pagev[page_index]-> + physical_for_dev_replace; /* for missing devices, dev->bdev is NULL */ page->dev = bbio->stripes[mirror_index].dev; page->mirror_num = mirror_index + 1; @@ -1249,6 +1406,12 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, int ret; DECLARE_COMPLETION_ONSTACK(complete); + if (!page_bad->dev->bdev) { + printk_ratelimited(KERN_WARNING + "btrfs: scrub_repair_page_from_good_copy(bdev == NULL) is unexpected!\n"); + return -EIO; + } + bio = bio_alloc(GFP_NOFS, 1); if (!bio) return -EIO; @@ -1269,6 +1432,9 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, if (!bio_flagged(bio, BIO_UPTODATE)) { btrfs_dev_stat_inc_and_print(page_bad->dev, BTRFS_DEV_STAT_WRITE_ERRS); + btrfs_dev_replace_stats_inc( + &sblock_bad->sctx->dev_root->fs_info-> + dev_replace.num_write_errors); bio_put(bio); return -EIO; } @@ -1278,7 +1444,168 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, return 0; } -static void scrub_checksum(struct scrub_block *sblock) +static void scrub_write_block_to_dev_replace(struct scrub_block *sblock) +{ + int page_num; + + for (page_num = 0; page_num < sblock->page_count; page_num++) { + int ret; + + ret = scrub_write_page_to_dev_replace(sblock, page_num); + if (ret) + btrfs_dev_replace_stats_inc( + &sblock->sctx->dev_root->fs_info->dev_replace. + num_write_errors); + } +} + +static int scrub_write_page_to_dev_replace(struct scrub_block *sblock, + int page_num) +{ + struct scrub_page *spage = sblock->pagev[page_num]; + + BUG_ON(spage->page == NULL); + if (spage->io_error) { + void *mapped_buffer = kmap_atomic(spage->page); + + memset(mapped_buffer, 0, PAGE_CACHE_SIZE); + flush_dcache_page(spage->page); + kunmap_atomic(mapped_buffer); + } + return scrub_add_page_to_wr_bio(sblock->sctx, spage); +} + +static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx, + struct scrub_page *spage) +{ + struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx; + struct scrub_bio *sbio; + int ret; + + mutex_lock(&wr_ctx->wr_lock); +again: + if (!wr_ctx->wr_curr_bio) { + wr_ctx->wr_curr_bio = kzalloc(sizeof(*wr_ctx->wr_curr_bio), + GFP_NOFS); + if (!wr_ctx->wr_curr_bio) { + mutex_unlock(&wr_ctx->wr_lock); + return -ENOMEM; + } + wr_ctx->wr_curr_bio->sctx = sctx; + wr_ctx->wr_curr_bio->page_count = 0; + } + sbio = wr_ctx->wr_curr_bio; + if (sbio->page_count == 0) { + struct bio *bio; + + sbio->physical = spage->physical_for_dev_replace; + sbio->logical = spage->logical; + sbio->dev = wr_ctx->tgtdev; + bio = sbio->bio; + if (!bio) { + bio = bio_alloc(GFP_NOFS, wr_ctx->pages_per_wr_bio); + if (!bio) { + mutex_unlock(&wr_ctx->wr_lock); + return -ENOMEM; + } + sbio->bio = bio; + } + + bio->bi_private = sbio; + bio->bi_end_io = scrub_wr_bio_end_io; + bio->bi_bdev = sbio->dev->bdev; + bio->bi_sector = sbio->physical >> 9; + sbio->err = 0; + } else if (sbio->physical + sbio->page_count * PAGE_SIZE != + spage->physical_for_dev_replace || + sbio->logical + sbio->page_count * PAGE_SIZE != + spage->logical) { + scrub_wr_submit(sctx); + goto again; + } + + ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0); + if (ret != PAGE_SIZE) { + if (sbio->page_count < 1) { + bio_put(sbio->bio); + sbio->bio = NULL; + mutex_unlock(&wr_ctx->wr_lock); + return -EIO; + } + scrub_wr_submit(sctx); + goto again; + } + + sbio->pagev[sbio->page_count] = spage; + scrub_page_get(spage); + sbio->page_count++; + if (sbio->page_count == wr_ctx->pages_per_wr_bio) + scrub_wr_submit(sctx); + mutex_unlock(&wr_ctx->wr_lock); + + return 0; +} + +static void scrub_wr_submit(struct scrub_ctx *sctx) +{ + struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx; + struct scrub_bio *sbio; + + if (!wr_ctx->wr_curr_bio) + return; + + sbio = wr_ctx->wr_curr_bio; + wr_ctx->wr_curr_bio = NULL; + WARN_ON(!sbio->bio->bi_bdev); + scrub_pending_bio_inc(sctx); + /* process all writes in a single worker thread. Then the block layer + * orders the requests before sending them to the driver which + * doubled the write performance on spinning disks when measured + * with Linux 3.5 */ + btrfsic_submit_bio(WRITE, sbio->bio); +} + +static void scrub_wr_bio_end_io(struct bio *bio, int err) +{ + struct scrub_bio *sbio = bio->bi_private; + struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info; + + sbio->err = err; + sbio->bio = bio; + + sbio->work.func = scrub_wr_bio_end_io_worker; + btrfs_queue_worker(&fs_info->scrub_wr_completion_workers, &sbio->work); +} + +static void scrub_wr_bio_end_io_worker(struct btrfs_work *work) +{ + struct scrub_bio *sbio = container_of(work, struct scrub_bio, work); + struct scrub_ctx *sctx = sbio->sctx; + int i; + + WARN_ON(sbio->page_count > SCRUB_PAGES_PER_WR_BIO); + if (sbio->err) { + struct btrfs_dev_replace *dev_replace = + &sbio->sctx->dev_root->fs_info->dev_replace; + + for (i = 0; i < sbio->page_count; i++) { + struct scrub_page *spage = sbio->pagev[i]; + + spage->io_error = 1; + btrfs_dev_replace_stats_inc(&dev_replace-> + num_write_errors); + } + } + + for (i = 0; i < sbio->page_count; i++) + scrub_page_put(sbio->pagev[i]); + + bio_put(sbio->bio); + kfree(sbio); + scrub_pending_bio_dec(sctx); +} + +static int scrub_checksum(struct scrub_block *sblock) { u64 flags; int ret; @@ -1296,6 +1623,8 @@ static void scrub_checksum(struct scrub_block *sblock) WARN_ON(1); if (ret) scrub_handle_errored_block(sblock); + + return ret; } static int scrub_checksum_data(struct scrub_block *sblock) @@ -1386,7 +1715,7 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) BTRFS_UUID_SIZE)) ++fail; - BUG_ON(sctx->nodesize != sctx->leafsize); + WARN_ON(sctx->nodesize != sctx->leafsize); len = sctx->nodesize - BTRFS_CSUM_SIZE; mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE; p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE; @@ -1534,11 +1863,24 @@ static void scrub_submit(struct scrub_ctx *sctx) sctx->curr = -1; scrub_pending_bio_inc(sctx); - btrfsic_submit_bio(READ, sbio->bio); + if (!sbio->bio->bi_bdev) { + /* + * this case should not happen. If btrfs_map_block() is + * wrong, it could happen for dev-replace operations on + * missing devices when no mirrors are available, but in + * this case it should already fail the mount. + * This case is handled correctly (but _very_ slowly). + */ + printk_ratelimited(KERN_WARNING + "btrfs: scrub_submit(bio bdev == NULL) is unexpected!\n"); + bio_endio(sbio->bio, -EIO); + } else { + btrfsic_submit_bio(READ, sbio->bio); + } } -static int scrub_add_page_to_bio(struct scrub_ctx *sctx, - struct scrub_page *spage) +static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx, + struct scrub_page *spage) { struct scrub_block *sblock = spage->sblock; struct scrub_bio *sbio; @@ -1570,7 +1912,7 @@ again: sbio->dev = spage->dev; bio = sbio->bio; if (!bio) { - bio = bio_alloc(GFP_NOFS, sctx->pages_per_bio); + bio = bio_alloc(GFP_NOFS, sctx->pages_per_rd_bio); if (!bio) return -ENOMEM; sbio->bio = bio; @@ -1602,10 +1944,10 @@ again: goto again; } - scrub_block_get(sblock); /* one for the added page */ + scrub_block_get(sblock); /* one for the page added to the bio */ atomic_inc(&sblock->outstanding_pages); sbio->page_count++; - if (sbio->page_count == sctx->pages_per_bio) + if (sbio->page_count == sctx->pages_per_rd_bio) scrub_submit(sctx); return 0; @@ -1613,7 +1955,8 @@ again: static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len, u64 physical, struct btrfs_device *dev, u64 flags, - u64 gen, int mirror_num, u8 *csum, int force) + u64 gen, int mirror_num, u8 *csum, int force, + u64 physical_for_dev_replace) { struct scrub_block *sblock; int index; @@ -1654,6 +1997,7 @@ leave_nomem: spage->generation = gen; spage->logical = logical; spage->physical = physical; + spage->physical_for_dev_replace = physical_for_dev_replace; spage->mirror_num = mirror_num; if (csum) { spage->have_csum = 1; @@ -1668,6 +2012,7 @@ leave_nomem: len -= l; logical += l; physical += l; + physical_for_dev_replace += l; } WARN_ON(sblock->page_count == 0); @@ -1675,7 +2020,7 @@ leave_nomem: struct scrub_page *spage = sblock->pagev[index]; int ret; - ret = scrub_add_page_to_bio(sctx, spage); + ret = scrub_add_page_to_rd_bio(sctx, spage); if (ret) { scrub_block_put(sblock); return ret; @@ -1707,7 +2052,7 @@ static void scrub_bio_end_io_worker(struct btrfs_work *work) struct scrub_ctx *sctx = sbio->sctx; int i; - BUG_ON(sbio->page_count > SCRUB_PAGES_PER_BIO); + BUG_ON(sbio->page_count > SCRUB_PAGES_PER_RD_BIO); if (sbio->err) { for (i = 0; i < sbio->page_count; i++) { struct scrub_page *spage = sbio->pagev[i]; @@ -1733,15 +2078,30 @@ static void scrub_bio_end_io_worker(struct btrfs_work *work) sbio->next_free = sctx->first_free; sctx->first_free = sbio->index; spin_unlock(&sctx->list_lock); + + if (sctx->is_dev_replace && + atomic_read(&sctx->wr_ctx.flush_all_writes)) { + mutex_lock(&sctx->wr_ctx.wr_lock); + scrub_wr_submit(sctx); + mutex_unlock(&sctx->wr_ctx.wr_lock); + } + scrub_pending_bio_dec(sctx); } static void scrub_block_complete(struct scrub_block *sblock) { - if (!sblock->no_io_error_seen) + if (!sblock->no_io_error_seen) { scrub_handle_errored_block(sblock); - else - scrub_checksum(sblock); + } else { + /* + * if has checksum error, write via repair mechanism in + * dev replace case, otherwise write here in dev replace + * case. + */ + if (!scrub_checksum(sblock) && sblock->sctx->is_dev_replace) + scrub_write_block_to_dev_replace(sblock); + } } static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len, @@ -1786,7 +2146,7 @@ static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len, /* scrub extent tries to collect up to 64 kB for each bio */ static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len, u64 physical, struct btrfs_device *dev, u64 flags, - u64 gen, int mirror_num) + u64 gen, int mirror_num, u64 physical_for_dev_replace) { int ret; u8 csum[BTRFS_CSUM_SIZE]; @@ -1799,7 +2159,7 @@ static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len, sctx->stat.data_bytes_scrubbed += len; spin_unlock(&sctx->stat_lock); } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { - BUG_ON(sctx->nodesize != sctx->leafsize); + WARN_ON(sctx->nodesize != sctx->leafsize); blocksize = sctx->nodesize; spin_lock(&sctx->stat_lock); sctx->stat.tree_extents_scrubbed++; @@ -1807,7 +2167,7 @@ static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len, spin_unlock(&sctx->stat_lock); } else { blocksize = sctx->sectorsize; - BUG_ON(1); + WARN_ON(1); } while (len) { @@ -1819,14 +2179,23 @@ static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len, have_csum = scrub_find_csum(sctx, logical, l, csum); if (have_csum == 0) ++sctx->stat.no_csum; + if (sctx->is_dev_replace && !have_csum) { + ret = copy_nocow_pages(sctx, logical, l, + mirror_num, + physical_for_dev_replace); + goto behind_scrub_pages; + } } ret = scrub_pages(sctx, logical, l, physical, dev, flags, gen, - mirror_num, have_csum ? csum : NULL, 0); + mirror_num, have_csum ? csum : NULL, 0, + physical_for_dev_replace); +behind_scrub_pages: if (ret) return ret; len -= l; logical += l; physical += l; + physical_for_dev_replace += l; } return 0; } @@ -1834,7 +2203,8 @@ static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len, static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, struct map_lookup *map, struct btrfs_device *scrub_dev, - int num, u64 base, u64 length) + int num, u64 base, u64 length, + int is_dev_replace) { struct btrfs_path *path; struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info; @@ -1859,6 +2229,11 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, struct btrfs_key key_end; u64 increment = map->stripe_len; u64 offset; + u64 extent_logical; + u64 extent_physical; + u64 extent_len; + struct btrfs_device *extent_dev; + int extent_mirror_num; nstripes = length; offset = 0; @@ -1966,9 +2341,14 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, */ if (atomic_read(&fs_info->scrub_pause_req)) { /* push queued extents */ + atomic_set(&sctx->wr_ctx.flush_all_writes, 1); scrub_submit(sctx); + mutex_lock(&sctx->wr_ctx.wr_lock); + scrub_wr_submit(sctx); + mutex_unlock(&sctx->wr_ctx.wr_lock); wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0); + atomic_set(&sctx->wr_ctx.flush_all_writes, 0); atomic_inc(&fs_info->scrubs_paused); wake_up(&fs_info->scrub_pause_wait); mutex_lock(&fs_info->scrub_lock); @@ -2063,10 +2443,20 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, key.objectid; } - ret = scrub_extent(sctx, key.objectid, key.offset, - key.objectid - logical + physical, - scrub_dev, flags, generation, - mirror_num); + extent_logical = key.objectid; + extent_physical = key.objectid - logical + physical; + extent_len = key.offset; + extent_dev = scrub_dev; + extent_mirror_num = mirror_num; + if (is_dev_replace) + scrub_remap_extent(fs_info, extent_logical, + extent_len, &extent_physical, + &extent_dev, + &extent_mirror_num); + ret = scrub_extent(sctx, extent_logical, extent_len, + extent_physical, extent_dev, flags, + generation, extent_mirror_num, + key.objectid - logical + physical); if (ret) goto out; @@ -2080,10 +2470,13 @@ next: sctx->stat.last_physical = physical; spin_unlock(&sctx->stat_lock); } +out: /* push queued extents */ scrub_submit(sctx); + mutex_lock(&sctx->wr_ctx.wr_lock); + scrub_wr_submit(sctx); + mutex_unlock(&sctx->wr_ctx.wr_lock); -out: blk_finish_plug(&plug); btrfs_free_path(path); return ret < 0 ? ret : 0; @@ -2093,14 +2486,14 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx, struct btrfs_device *scrub_dev, u64 chunk_tree, u64 chunk_objectid, u64 chunk_offset, u64 length, - u64 dev_offset) + u64 dev_offset, int is_dev_replace) { struct btrfs_mapping_tree *map_tree = &sctx->dev_root->fs_info->mapping_tree; struct map_lookup *map; struct extent_map *em; int i; - int ret = -EINVAL; + int ret = 0; read_lock(&map_tree->map_tree.lock); em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1); @@ -2120,7 +2513,8 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx, if (map->stripes[i].dev->bdev == scrub_dev->bdev && map->stripes[i].physical == dev_offset) { ret = scrub_stripe(sctx, map, scrub_dev, i, - chunk_offset, length); + chunk_offset, length, + is_dev_replace); if (ret) goto out; } @@ -2133,7 +2527,8 @@ out: static noinline_for_stack int scrub_enumerate_chunks(struct scrub_ctx *sctx, - struct btrfs_device *scrub_dev, u64 start, u64 end) + struct btrfs_device *scrub_dev, u64 start, u64 end, + int is_dev_replace) { struct btrfs_dev_extent *dev_extent = NULL; struct btrfs_path *path; @@ -2149,6 +2544,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, struct btrfs_key key; struct btrfs_key found_key; struct btrfs_block_group_cache *cache; + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; path = btrfs_alloc_path(); if (!path) @@ -2214,11 +2610,61 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, ret = -ENOENT; break; } + dev_replace->cursor_right = found_key.offset + length; + dev_replace->cursor_left = found_key.offset; + dev_replace->item_needs_writeback = 1; ret = scrub_chunk(sctx, scrub_dev, chunk_tree, chunk_objectid, - chunk_offset, length, found_key.offset); + chunk_offset, length, found_key.offset, + is_dev_replace); + + /* + * flush, submit all pending read and write bios, afterwards + * wait for them. + * Note that in the dev replace case, a read request causes + * write requests that are submitted in the read completion + * worker. Therefore in the current situation, it is required + * that all write requests are flushed, so that all read and + * write requests are really completed when bios_in_flight + * changes to 0. + */ + atomic_set(&sctx->wr_ctx.flush_all_writes, 1); + scrub_submit(sctx); + mutex_lock(&sctx->wr_ctx.wr_lock); + scrub_wr_submit(sctx); + mutex_unlock(&sctx->wr_ctx.wr_lock); + + wait_event(sctx->list_wait, + atomic_read(&sctx->bios_in_flight) == 0); + atomic_set(&sctx->wr_ctx.flush_all_writes, 0); + atomic_inc(&fs_info->scrubs_paused); + wake_up(&fs_info->scrub_pause_wait); + wait_event(sctx->list_wait, + atomic_read(&sctx->workers_pending) == 0); + + mutex_lock(&fs_info->scrub_lock); + while (atomic_read(&fs_info->scrub_pause_req)) { + mutex_unlock(&fs_info->scrub_lock); + wait_event(fs_info->scrub_pause_wait, + atomic_read(&fs_info->scrub_pause_req) == 0); + mutex_lock(&fs_info->scrub_lock); + } + atomic_dec(&fs_info->scrubs_paused); + mutex_unlock(&fs_info->scrub_lock); + wake_up(&fs_info->scrub_pause_wait); + + dev_replace->cursor_left = dev_replace->cursor_right; + dev_replace->item_needs_writeback = 1; btrfs_put_block_group(cache); if (ret) break; + if (atomic64_read(&dev_replace->num_write_errors) > 0) { + ret = -EIO; + break; + } + if (sctx->stat.malloc_errors > 0) { + ret = -ENOMEM; + break; + } key.offset = found_key.offset + length; btrfs_release_path(path); @@ -2254,7 +2700,7 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx, ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr, scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i, - NULL, 1); + NULL, 1, bytenr); if (ret) return ret; } @@ -2266,18 +2712,38 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx, /* * get a reference count on fs_info->scrub_workers. start worker if necessary */ -static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info) +static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, + int is_dev_replace) { int ret = 0; mutex_lock(&fs_info->scrub_lock); if (fs_info->scrub_workers_refcnt == 0) { - btrfs_init_workers(&fs_info->scrub_workers, "scrub", - fs_info->thread_pool_size, &fs_info->generic_worker); + if (is_dev_replace) + btrfs_init_workers(&fs_info->scrub_workers, "scrub", 1, + &fs_info->generic_worker); + else + btrfs_init_workers(&fs_info->scrub_workers, "scrub", + fs_info->thread_pool_size, + &fs_info->generic_worker); fs_info->scrub_workers.idle_thresh = 4; ret = btrfs_start_workers(&fs_info->scrub_workers); if (ret) goto out; + btrfs_init_workers(&fs_info->scrub_wr_completion_workers, + "scrubwrc", + fs_info->thread_pool_size, + &fs_info->generic_worker); + fs_info->scrub_wr_completion_workers.idle_thresh = 2; + ret = btrfs_start_workers( + &fs_info->scrub_wr_completion_workers); + if (ret) + goto out; + btrfs_init_workers(&fs_info->scrub_nocow_workers, "scrubnc", 1, + &fs_info->generic_worker); + ret = btrfs_start_workers(&fs_info->scrub_nocow_workers); + if (ret) + goto out; } ++fs_info->scrub_workers_refcnt; out: @@ -2289,8 +2755,11 @@ out: static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info) { mutex_lock(&fs_info->scrub_lock); - if (--fs_info->scrub_workers_refcnt == 0) + if (--fs_info->scrub_workers_refcnt == 0) { btrfs_stop_workers(&fs_info->scrub_workers); + btrfs_stop_workers(&fs_info->scrub_wr_completion_workers); + btrfs_stop_workers(&fs_info->scrub_nocow_workers); + } WARN_ON(fs_info->scrub_workers_refcnt < 0); mutex_unlock(&fs_info->scrub_lock); } @@ -2354,7 +2823,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, return -EINVAL; } - ret = scrub_workers_get(fs_info); + ret = scrub_workers_get(fs_info, is_dev_replace); if (ret) return ret; @@ -2394,12 +2863,15 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, mutex_unlock(&fs_info->scrub_lock); mutex_unlock(&fs_info->fs_devices->device_list_mutex); - down_read(&fs_info->scrub_super_lock); - ret = scrub_supers(sctx, dev); - up_read(&fs_info->scrub_super_lock); + if (!is_dev_replace) { + down_read(&fs_info->scrub_super_lock); + ret = scrub_supers(sctx, dev); + up_read(&fs_info->scrub_super_lock); + } if (!ret) - ret = scrub_enumerate_chunks(sctx, dev, start, end); + ret = scrub_enumerate_chunks(sctx, dev, start, end, + is_dev_replace); wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0); atomic_dec(&fs_info->scrubs_running); @@ -2537,3 +3009,272 @@ int btrfs_scrub_progress(struct btrfs_root *root, u64 devid, return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV; } + +static void scrub_remap_extent(struct btrfs_fs_info *fs_info, + u64 extent_logical, u64 extent_len, + u64 *extent_physical, + struct btrfs_device **extent_dev, + int *extent_mirror_num) +{ + u64 mapped_length; + struct btrfs_bio *bbio = NULL; + int ret; + + mapped_length = extent_len; + ret = btrfs_map_block(fs_info, READ, extent_logical, + &mapped_length, &bbio, 0); + if (ret || !bbio || mapped_length < extent_len || + !bbio->stripes[0].dev->bdev) { + kfree(bbio); + return; + } + + *extent_physical = bbio->stripes[0].physical; + *extent_mirror_num = bbio->mirror_num; + *extent_dev = bbio->stripes[0].dev; + kfree(bbio); +} + +static int scrub_setup_wr_ctx(struct scrub_ctx *sctx, + struct scrub_wr_ctx *wr_ctx, + struct btrfs_fs_info *fs_info, + struct btrfs_device *dev, + int is_dev_replace) +{ + WARN_ON(wr_ctx->wr_curr_bio != NULL); + + mutex_init(&wr_ctx->wr_lock); + wr_ctx->wr_curr_bio = NULL; + if (!is_dev_replace) + return 0; + + WARN_ON(!dev->bdev); + wr_ctx->pages_per_wr_bio = min_t(int, SCRUB_PAGES_PER_WR_BIO, + bio_get_nr_vecs(dev->bdev)); + wr_ctx->tgtdev = dev; + atomic_set(&wr_ctx->flush_all_writes, 0); + return 0; +} + +static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx) +{ + mutex_lock(&wr_ctx->wr_lock); + kfree(wr_ctx->wr_curr_bio); + wr_ctx->wr_curr_bio = NULL; + mutex_unlock(&wr_ctx->wr_lock); +} + +static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len, + int mirror_num, u64 physical_for_dev_replace) +{ + struct scrub_copy_nocow_ctx *nocow_ctx; + struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info; + + nocow_ctx = kzalloc(sizeof(*nocow_ctx), GFP_NOFS); + if (!nocow_ctx) { + spin_lock(&sctx->stat_lock); + sctx->stat.malloc_errors++; + spin_unlock(&sctx->stat_lock); + return -ENOMEM; + } + + scrub_pending_trans_workers_inc(sctx); + + nocow_ctx->sctx = sctx; + nocow_ctx->logical = logical; + nocow_ctx->len = len; + nocow_ctx->mirror_num = mirror_num; + nocow_ctx->physical_for_dev_replace = physical_for_dev_replace; + nocow_ctx->work.func = copy_nocow_pages_worker; + btrfs_queue_worker(&fs_info->scrub_nocow_workers, + &nocow_ctx->work); + + return 0; +} + +static void copy_nocow_pages_worker(struct btrfs_work *work) +{ + struct scrub_copy_nocow_ctx *nocow_ctx = + container_of(work, struct scrub_copy_nocow_ctx, work); + struct scrub_ctx *sctx = nocow_ctx->sctx; + u64 logical = nocow_ctx->logical; + u64 len = nocow_ctx->len; + int mirror_num = nocow_ctx->mirror_num; + u64 physical_for_dev_replace = nocow_ctx->physical_for_dev_replace; + int ret; + struct btrfs_trans_handle *trans = NULL; + struct btrfs_fs_info *fs_info; + struct btrfs_path *path; + struct btrfs_root *root; + int not_written = 0; + + fs_info = sctx->dev_root->fs_info; + root = fs_info->extent_root; + + path = btrfs_alloc_path(); + if (!path) { + spin_lock(&sctx->stat_lock); + sctx->stat.malloc_errors++; + spin_unlock(&sctx->stat_lock); + not_written = 1; + goto out; + } + + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + not_written = 1; + goto out; + } + + ret = iterate_inodes_from_logical(logical, fs_info, path, + copy_nocow_pages_for_inode, + nocow_ctx); + if (ret != 0 && ret != -ENOENT) { + pr_warn("iterate_inodes_from_logical() failed: log %llu, phys %llu, len %llu, mir %llu, ret %d\n", + (unsigned long long)logical, + (unsigned long long)physical_for_dev_replace, + (unsigned long long)len, + (unsigned long long)mirror_num, ret); + not_written = 1; + goto out; + } + +out: + if (trans && !IS_ERR(trans)) + btrfs_end_transaction(trans, root); + if (not_written) + btrfs_dev_replace_stats_inc(&fs_info->dev_replace. + num_uncorrectable_read_errors); + + btrfs_free_path(path); + kfree(nocow_ctx); + + scrub_pending_trans_workers_dec(sctx); +} + +static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx) +{ + unsigned long index; + struct scrub_copy_nocow_ctx *nocow_ctx = ctx; + int ret = 0; + struct btrfs_key key; + struct inode *inode = NULL; + struct btrfs_root *local_root; + u64 physical_for_dev_replace; + u64 len; + struct btrfs_fs_info *fs_info = nocow_ctx->sctx->dev_root->fs_info; + + key.objectid = root; + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = (u64)-1; + local_root = btrfs_read_fs_root_no_name(fs_info, &key); + if (IS_ERR(local_root)) + return PTR_ERR(local_root); + + key.type = BTRFS_INODE_ITEM_KEY; + key.objectid = inum; + key.offset = 0; + inode = btrfs_iget(fs_info->sb, &key, local_root, NULL); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + physical_for_dev_replace = nocow_ctx->physical_for_dev_replace; + len = nocow_ctx->len; + while (len >= PAGE_CACHE_SIZE) { + struct page *page = NULL; + int ret_sub; + + index = offset >> PAGE_CACHE_SHIFT; + + page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); + if (!page) { + pr_err("find_or_create_page() failed\n"); + ret = -ENOMEM; + goto next_page; + } + + if (PageUptodate(page)) { + if (PageDirty(page)) + goto next_page; + } else { + ClearPageError(page); + ret_sub = extent_read_full_page(&BTRFS_I(inode)-> + io_tree, + page, btrfs_get_extent, + nocow_ctx->mirror_num); + if (ret_sub) { + ret = ret_sub; + goto next_page; + } + wait_on_page_locked(page); + if (!PageUptodate(page)) { + ret = -EIO; + goto next_page; + } + } + ret_sub = write_page_nocow(nocow_ctx->sctx, + physical_for_dev_replace, page); + if (ret_sub) { + ret = ret_sub; + goto next_page; + } + +next_page: + if (page) { + unlock_page(page); + put_page(page); + } + offset += PAGE_CACHE_SIZE; + physical_for_dev_replace += PAGE_CACHE_SIZE; + len -= PAGE_CACHE_SIZE; + } + + if (inode) + iput(inode); + return ret; +} + +static int write_page_nocow(struct scrub_ctx *sctx, + u64 physical_for_dev_replace, struct page *page) +{ + struct bio *bio; + struct btrfs_device *dev; + int ret; + DECLARE_COMPLETION_ONSTACK(compl); + + dev = sctx->wr_ctx.tgtdev; + if (!dev) + return -EIO; + if (!dev->bdev) { + printk_ratelimited(KERN_WARNING + "btrfs: scrub write_page_nocow(bdev == NULL) is unexpected!\n"); + return -EIO; + } + bio = bio_alloc(GFP_NOFS, 1); + if (!bio) { + spin_lock(&sctx->stat_lock); + sctx->stat.malloc_errors++; + spin_unlock(&sctx->stat_lock); + return -ENOMEM; + } + bio->bi_private = &compl; + bio->bi_end_io = scrub_complete_bio_end_io; + bio->bi_size = 0; + bio->bi_sector = physical_for_dev_replace >> 9; + bio->bi_bdev = dev->bdev; + ret = bio_add_page(bio, page, PAGE_CACHE_SIZE, 0); + if (ret != PAGE_CACHE_SIZE) { +leave_with_eio: + bio_put(bio); + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); + return -EIO; + } + btrfsic_submit_bio(WRITE_SYNC, bio); + wait_for_completion(&compl); + + if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) + goto leave_with_eio; + + bio_put(bio); + return 0; +} diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 837ad2d..ad43806 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1195,7 +1195,8 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info, btrfs_set_max_workers(&fs_info->endio_freespace_worker, new_pool_size); btrfs_set_max_workers(&fs_info->delayed_workers, new_pool_size); btrfs_set_max_workers(&fs_info->readahead_workers, new_pool_size); - btrfs_set_max_workers(&fs_info->scrub_workers, new_pool_size); + btrfs_set_max_workers(&fs_info->scrub_wr_completion_workers, + new_pool_size); } static int btrfs_remount(struct super_block *sb, int *flags, char *data) -- cgit v0.10.2 From e93c89c1aaaaaec3487c4c18dd02360371790722 Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Mon, 5 Nov 2012 17:33:06 +0100 Subject: Btrfs: add new sources for device replace code This adds a new file to the sources together with the header file and the changes to ioctl.h and ctree.h that are required by the new C source file. Additionally, 4 new functions are added to volume.c that deal with device creation and destruction. Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index d7fcdba..7df3e0f 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -8,7 +8,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ export.o tree-log.o free-space-cache.o zlib.o lzo.o \ compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ - reada.o backref.o ulist.o qgroup.o send.o + reada.o backref.o ulist.o qgroup.o send.o dev-replace.o btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index ded7caa..45e7f75 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -142,6 +142,8 @@ struct btrfs_ordered_sum; #define BTRFS_EMPTY_SUBVOL_DIR_OBJECTID 2 +#define BTRFS_DEV_REPLACE_DEVID 0 + /* * the max metadata block size. This limit is somewhat artificial, * but the memmove costs go through the roof for larger blocks. diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c new file mode 100644 index 0000000..66dbc8d --- /dev/null +++ b/fs/btrfs/dev-replace.c @@ -0,0 +1,856 @@ +/* + * Copyright (C) STRATO AG 2012. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "compat.h" +#include "ctree.h" +#include "extent_map.h" +#include "disk-io.h" +#include "transaction.h" +#include "print-tree.h" +#include "volumes.h" +#include "async-thread.h" +#include "check-integrity.h" +#include "rcu-string.h" +#include "dev-replace.h" + +static u64 btrfs_get_seconds_since_1970(void); +static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, + int scrub_ret); +static void btrfs_dev_replace_update_device_in_mapping_tree( + struct btrfs_fs_info *fs_info, + struct btrfs_device *srcdev, + struct btrfs_device *tgtdev); +static int btrfs_dev_replace_find_srcdev(struct btrfs_root *root, u64 srcdevid, + char *srcdev_name, + struct btrfs_device **device); +static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info); +static int btrfs_dev_replace_kthread(void *data); +static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info); + + +int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info) +{ + struct btrfs_key key; + struct btrfs_root *dev_root = fs_info->dev_root; + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + struct extent_buffer *eb; + int slot; + int ret = 0; + struct btrfs_path *path = NULL; + int item_size; + struct btrfs_dev_replace_item *ptr; + u64 src_devid; + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + key.objectid = 0; + key.type = BTRFS_DEV_REPLACE_KEY; + key.offset = 0; + ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0); + if (ret) { +no_valid_dev_replace_entry_found: + ret = 0; + dev_replace->replace_state = + BTRFS_DEV_REPLACE_ITEM_STATE_NEVER_STARTED; + dev_replace->cont_reading_from_srcdev_mode = + BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS; + dev_replace->replace_state = 0; + dev_replace->time_started = 0; + dev_replace->time_stopped = 0; + atomic64_set(&dev_replace->num_write_errors, 0); + atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0); + dev_replace->cursor_left = 0; + dev_replace->committed_cursor_left = 0; + dev_replace->cursor_left_last_write_of_item = 0; + dev_replace->cursor_right = 0; + dev_replace->srcdev = NULL; + dev_replace->tgtdev = NULL; + dev_replace->is_valid = 0; + dev_replace->item_needs_writeback = 0; + goto out; + } + slot = path->slots[0]; + eb = path->nodes[0]; + item_size = btrfs_item_size_nr(eb, slot); + ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_replace_item); + + if (item_size != sizeof(struct btrfs_dev_replace_item)) { + pr_warn("btrfs: dev_replace entry found has unexpected size, ignore entry\n"); + goto no_valid_dev_replace_entry_found; + } + + src_devid = btrfs_dev_replace_src_devid(eb, ptr); + dev_replace->cont_reading_from_srcdev_mode = + btrfs_dev_replace_cont_reading_from_srcdev_mode(eb, ptr); + dev_replace->replace_state = btrfs_dev_replace_replace_state(eb, ptr); + dev_replace->time_started = btrfs_dev_replace_time_started(eb, ptr); + dev_replace->time_stopped = + btrfs_dev_replace_time_stopped(eb, ptr); + atomic64_set(&dev_replace->num_write_errors, + btrfs_dev_replace_num_write_errors(eb, ptr)); + atomic64_set(&dev_replace->num_uncorrectable_read_errors, + btrfs_dev_replace_num_uncorrectable_read_errors(eb, ptr)); + dev_replace->cursor_left = btrfs_dev_replace_cursor_left(eb, ptr); + dev_replace->committed_cursor_left = dev_replace->cursor_left; + dev_replace->cursor_left_last_write_of_item = dev_replace->cursor_left; + dev_replace->cursor_right = btrfs_dev_replace_cursor_right(eb, ptr); + dev_replace->is_valid = 1; + + dev_replace->item_needs_writeback = 0; + switch (dev_replace->replace_state) { + case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: + dev_replace->srcdev = NULL; + dev_replace->tgtdev = NULL; + break; + case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: + dev_replace->srcdev = btrfs_find_device(fs_info, src_devid, + NULL, NULL); + dev_replace->tgtdev = btrfs_find_device(fs_info, + BTRFS_DEV_REPLACE_DEVID, + NULL, NULL); + /* + * allow 'btrfs dev replace_cancel' if src/tgt device is + * missing + */ + if (!dev_replace->srcdev && + !btrfs_test_opt(dev_root, DEGRADED)) { + ret = -EIO; + pr_warn("btrfs: cannot mount because device replace operation is ongoing and\n" "srcdev (devid %llu) is missing, need to run 'btrfs dev scan'?\n", + (unsigned long long)src_devid); + } + if (!dev_replace->tgtdev && + !btrfs_test_opt(dev_root, DEGRADED)) { + ret = -EIO; + pr_warn("btrfs: cannot mount because device replace operation is ongoing and\n" "tgtdev (devid %llu) is missing, need to run btrfs dev scan?\n", + (unsigned long long)BTRFS_DEV_REPLACE_DEVID); + } + if (dev_replace->tgtdev) { + if (dev_replace->srcdev) { + dev_replace->tgtdev->total_bytes = + dev_replace->srcdev->total_bytes; + dev_replace->tgtdev->disk_total_bytes = + dev_replace->srcdev->disk_total_bytes; + dev_replace->tgtdev->bytes_used = + dev_replace->srcdev->bytes_used; + } + dev_replace->tgtdev->is_tgtdev_for_dev_replace = 1; + btrfs_init_dev_replace_tgtdev_for_resume(fs_info, + dev_replace->tgtdev); + } + break; + } + +out: + if (path) + btrfs_free_path(path); + return ret; +} + +/* + * called from commit_transaction. Writes changed device replace state to + * disk. + */ +int btrfs_run_dev_replace(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info) +{ + int ret; + struct btrfs_root *dev_root = fs_info->dev_root; + struct btrfs_path *path; + struct btrfs_key key; + struct extent_buffer *eb; + struct btrfs_dev_replace_item *ptr; + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + + btrfs_dev_replace_lock(dev_replace); + if (!dev_replace->is_valid || + !dev_replace->item_needs_writeback) { + btrfs_dev_replace_unlock(dev_replace); + return 0; + } + btrfs_dev_replace_unlock(dev_replace); + + key.objectid = 0; + key.type = BTRFS_DEV_REPLACE_KEY; + key.offset = 0; + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1); + if (ret < 0) { + pr_warn("btrfs: error %d while searching for dev_replace item!\n", + ret); + goto out; + } + + if (ret == 0 && + btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) { + /* + * need to delete old one and insert a new one. + * Since no attempt is made to recover any old state, if the + * dev_replace state is 'running', the data on the target + * drive is lost. + * It would be possible to recover the state: just make sure + * that the beginning of the item is never changed and always + * contains all the essential information. Then read this + * minimal set of information and use it as a base for the + * new state. + */ + ret = btrfs_del_item(trans, dev_root, path); + if (ret != 0) { + pr_warn("btrfs: delete too small dev_replace item failed %d!\n", + ret); + goto out; + } + ret = 1; + } + + if (ret == 1) { + /* need to insert a new item */ + btrfs_release_path(path); + ret = btrfs_insert_empty_item(trans, dev_root, path, + &key, sizeof(*ptr)); + if (ret < 0) { + pr_warn("btrfs: insert dev_replace item failed %d!\n", + ret); + goto out; + } + } + + eb = path->nodes[0]; + ptr = btrfs_item_ptr(eb, path->slots[0], + struct btrfs_dev_replace_item); + + btrfs_dev_replace_lock(dev_replace); + if (dev_replace->srcdev) + btrfs_set_dev_replace_src_devid(eb, ptr, + dev_replace->srcdev->devid); + else + btrfs_set_dev_replace_src_devid(eb, ptr, (u64)-1); + btrfs_set_dev_replace_cont_reading_from_srcdev_mode(eb, ptr, + dev_replace->cont_reading_from_srcdev_mode); + btrfs_set_dev_replace_replace_state(eb, ptr, + dev_replace->replace_state); + btrfs_set_dev_replace_time_started(eb, ptr, dev_replace->time_started); + btrfs_set_dev_replace_time_stopped(eb, ptr, dev_replace->time_stopped); + btrfs_set_dev_replace_num_write_errors(eb, ptr, + atomic64_read(&dev_replace->num_write_errors)); + btrfs_set_dev_replace_num_uncorrectable_read_errors(eb, ptr, + atomic64_read(&dev_replace->num_uncorrectable_read_errors)); + dev_replace->cursor_left_last_write_of_item = + dev_replace->cursor_left; + btrfs_set_dev_replace_cursor_left(eb, ptr, + dev_replace->cursor_left_last_write_of_item); + btrfs_set_dev_replace_cursor_right(eb, ptr, + dev_replace->cursor_right); + dev_replace->item_needs_writeback = 0; + btrfs_dev_replace_unlock(dev_replace); + + btrfs_mark_buffer_dirty(eb); + +out: + btrfs_free_path(path); + + return ret; +} + +void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info) +{ + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + + dev_replace->committed_cursor_left = + dev_replace->cursor_left_last_write_of_item; +} + +static u64 btrfs_get_seconds_since_1970(void) +{ + struct timespec t = CURRENT_TIME_SEC; + + return t.tv_sec; +} + +int btrfs_dev_replace_start(struct btrfs_root *root, + struct btrfs_ioctl_dev_replace_args *args) +{ + struct btrfs_trans_handle *trans; + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + int ret; + struct btrfs_device *tgt_device = NULL; + struct btrfs_device *src_device = NULL; + + switch (args->start.cont_reading_from_srcdev_mode) { + case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS: + case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID: + break; + default: + return -EINVAL; + } + + if ((args->start.srcdevid == 0 && args->start.srcdev_name[0] == '\0') || + args->start.tgtdev_name[0] == '\0') + return -EINVAL; + + mutex_lock(&fs_info->volume_mutex); + ret = btrfs_init_dev_replace_tgtdev(root, args->start.tgtdev_name, + &tgt_device); + if (ret) { + pr_err("btrfs: target device %s is invalid!\n", + args->start.tgtdev_name); + mutex_unlock(&fs_info->volume_mutex); + return -EINVAL; + } + + ret = btrfs_dev_replace_find_srcdev(root, args->start.srcdevid, + args->start.srcdev_name, + &src_device); + mutex_unlock(&fs_info->volume_mutex); + if (ret) { + ret = -EINVAL; + goto leave_no_lock; + } + + if (tgt_device->total_bytes < src_device->total_bytes) { + pr_err("btrfs: target device is smaller than source device!\n"); + ret = -EINVAL; + goto leave_no_lock; + } + + btrfs_dev_replace_lock(dev_replace); + switch (dev_replace->replace_state) { + case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: + break; + case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: + args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED; + goto leave; + } + + dev_replace->cont_reading_from_srcdev_mode = + args->start.cont_reading_from_srcdev_mode; + WARN_ON(!src_device); + dev_replace->srcdev = src_device; + WARN_ON(!tgt_device); + dev_replace->tgtdev = tgt_device; + + printk_in_rcu(KERN_INFO + "btrfs: dev_replace from %s (devid %llu) to %s) started\n", + src_device->missing ? "" : + rcu_str_deref(src_device->name), + src_device->devid, + rcu_str_deref(tgt_device->name)); + + tgt_device->total_bytes = src_device->total_bytes; + tgt_device->disk_total_bytes = src_device->disk_total_bytes; + tgt_device->bytes_used = src_device->bytes_used; + + /* + * from now on, the writes to the srcdev are all duplicated to + * go to the tgtdev as well (refer to btrfs_map_block()). + */ + dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED; + dev_replace->time_started = btrfs_get_seconds_since_1970(); + dev_replace->cursor_left = 0; + dev_replace->committed_cursor_left = 0; + dev_replace->cursor_left_last_write_of_item = 0; + dev_replace->cursor_right = 0; + dev_replace->is_valid = 1; + dev_replace->item_needs_writeback = 1; + args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; + btrfs_dev_replace_unlock(dev_replace); + + btrfs_wait_ordered_extents(root, 0); + + /* force writing the updated state information to disk */ + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + btrfs_dev_replace_lock(dev_replace); + goto leave; + } + + ret = btrfs_commit_transaction(trans, root); + WARN_ON(ret); + + /* the disk copy procedure reuses the scrub code */ + ret = btrfs_scrub_dev(fs_info, src_device->devid, 0, + src_device->total_bytes, + &dev_replace->scrub_progress, 0, 1); + + ret = btrfs_dev_replace_finishing(root->fs_info, ret); + WARN_ON(ret); + + return 0; + +leave: + dev_replace->srcdev = NULL; + dev_replace->tgtdev = NULL; + btrfs_dev_replace_unlock(dev_replace); +leave_no_lock: + if (tgt_device) + btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device); + return ret; +} + +static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, + int scrub_ret) +{ + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + struct btrfs_device *tgt_device; + struct btrfs_device *src_device; + struct btrfs_root *root = fs_info->tree_root; + u8 uuid_tmp[BTRFS_UUID_SIZE]; + struct btrfs_trans_handle *trans; + int ret = 0; + + /* don't allow cancel or unmount to disturb the finishing procedure */ + mutex_lock(&dev_replace->lock_finishing_cancel_unmount); + + btrfs_dev_replace_lock(dev_replace); + /* was the operation canceled, or is it finished? */ + if (dev_replace->replace_state != + BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED) { + btrfs_dev_replace_unlock(dev_replace); + mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); + return 0; + } + + tgt_device = dev_replace->tgtdev; + src_device = dev_replace->srcdev; + btrfs_dev_replace_unlock(dev_replace); + + /* replace old device with new one in mapping tree */ + if (!scrub_ret) + btrfs_dev_replace_update_device_in_mapping_tree(fs_info, + src_device, + tgt_device); + + /* + * flush all outstanding I/O and inode extent mappings before the + * copy operation is declared as being finished + */ + btrfs_start_delalloc_inodes(root, 0); + btrfs_wait_ordered_extents(root, 0); + + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); + return PTR_ERR(trans); + } + ret = btrfs_commit_transaction(trans, root); + WARN_ON(ret); + + /* keep away write_all_supers() during the finishing procedure */ + mutex_lock(&root->fs_info->fs_devices->device_list_mutex); + btrfs_dev_replace_lock(dev_replace); + dev_replace->replace_state = + scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED + : BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED; + dev_replace->tgtdev = NULL; + dev_replace->srcdev = NULL; + dev_replace->time_stopped = btrfs_get_seconds_since_1970(); + dev_replace->item_needs_writeback = 1; + + if (scrub_ret) { + printk_in_rcu(KERN_ERR + "btrfs: btrfs_scrub_dev(%s, %llu, %s) failed %d\n", + src_device->missing ? "" : + rcu_str_deref(src_device->name), + src_device->devid, + rcu_str_deref(tgt_device->name), scrub_ret); + btrfs_dev_replace_unlock(dev_replace); + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); + if (tgt_device) + btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device); + mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); + + return 0; + } + + printk_in_rcu(KERN_INFO + "btrfs: dev_replace from %s (devid %llu) to %s) finished\n", + src_device->missing ? "" : + rcu_str_deref(src_device->name), + src_device->devid, + rcu_str_deref(tgt_device->name)); + tgt_device->is_tgtdev_for_dev_replace = 0; + tgt_device->devid = src_device->devid; + src_device->devid = BTRFS_DEV_REPLACE_DEVID; + tgt_device->bytes_used = src_device->bytes_used; + memcpy(uuid_tmp, tgt_device->uuid, sizeof(uuid_tmp)); + memcpy(tgt_device->uuid, src_device->uuid, sizeof(tgt_device->uuid)); + memcpy(src_device->uuid, uuid_tmp, sizeof(src_device->uuid)); + tgt_device->total_bytes = src_device->total_bytes; + tgt_device->disk_total_bytes = src_device->disk_total_bytes; + tgt_device->bytes_used = src_device->bytes_used; + if (fs_info->sb->s_bdev == src_device->bdev) + fs_info->sb->s_bdev = tgt_device->bdev; + if (fs_info->fs_devices->latest_bdev == src_device->bdev) + fs_info->fs_devices->latest_bdev = tgt_device->bdev; + list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list); + + btrfs_rm_dev_replace_srcdev(fs_info, src_device); + if (src_device->bdev) { + /* zero out the old super */ + btrfs_scratch_superblock(src_device); + } + /* + * this is again a consistent state where no dev_replace procedure + * is running, the target device is part of the filesystem, the + * source device is not part of the filesystem anymore and its 1st + * superblock is scratched out so that it is no longer marked to + * belong to this filesystem. + */ + btrfs_dev_replace_unlock(dev_replace); + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); + + /* write back the superblocks */ + trans = btrfs_start_transaction(root, 0); + if (!IS_ERR(trans)) + btrfs_commit_transaction(trans, root); + + mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); + + return 0; +} + +static void btrfs_dev_replace_update_device_in_mapping_tree( + struct btrfs_fs_info *fs_info, + struct btrfs_device *srcdev, + struct btrfs_device *tgtdev) +{ + struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree; + struct extent_map *em; + struct map_lookup *map; + u64 start = 0; + int i; + + write_lock(&em_tree->lock); + do { + em = lookup_extent_mapping(em_tree, start, (u64)-1); + if (!em) + break; + map = (struct map_lookup *)em->bdev; + for (i = 0; i < map->num_stripes; i++) + if (srcdev == map->stripes[i].dev) + map->stripes[i].dev = tgtdev; + start = em->start + em->len; + free_extent_map(em); + } while (start); + write_unlock(&em_tree->lock); +} + +static int btrfs_dev_replace_find_srcdev(struct btrfs_root *root, u64 srcdevid, + char *srcdev_name, + struct btrfs_device **device) +{ + int ret; + + if (srcdevid) { + ret = 0; + *device = btrfs_find_device(root->fs_info, srcdevid, NULL, + NULL); + if (!*device) + ret = -ENOENT; + } else { + ret = btrfs_find_device_missing_or_by_path(root, srcdev_name, + device); + } + return ret; +} + +void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info, + struct btrfs_ioctl_dev_replace_args *args) +{ + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + + btrfs_dev_replace_lock(dev_replace); + /* even if !dev_replace_is_valid, the values are good enough for + * the replace_status ioctl */ + args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; + args->status.replace_state = dev_replace->replace_state; + args->status.time_started = dev_replace->time_started; + args->status.time_stopped = dev_replace->time_stopped; + args->status.num_write_errors = + atomic64_read(&dev_replace->num_write_errors); + args->status.num_uncorrectable_read_errors = + atomic64_read(&dev_replace->num_uncorrectable_read_errors); + switch (dev_replace->replace_state) { + case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: + args->status.progress_1000 = 0; + break; + case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: + args->status.progress_1000 = 1000; + break; + case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: + args->status.progress_1000 = div64_u64(dev_replace->cursor_left, + div64_u64(dev_replace->srcdev->total_bytes, 1000)); + break; + } + btrfs_dev_replace_unlock(dev_replace); +} + +int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info, + struct btrfs_ioctl_dev_replace_args *args) +{ + args->result = __btrfs_dev_replace_cancel(fs_info); + return 0; +} + +static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info) +{ + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + struct btrfs_device *tgt_device = NULL; + struct btrfs_trans_handle *trans; + struct btrfs_root *root = fs_info->tree_root; + u64 result; + int ret; + + mutex_lock(&dev_replace->lock_finishing_cancel_unmount); + btrfs_dev_replace_lock(dev_replace); + switch (dev_replace->replace_state) { + case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: + result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED; + btrfs_dev_replace_unlock(dev_replace); + goto leave; + case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: + result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; + tgt_device = dev_replace->tgtdev; + dev_replace->tgtdev = NULL; + dev_replace->srcdev = NULL; + break; + } + dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED; + dev_replace->time_stopped = btrfs_get_seconds_since_1970(); + dev_replace->item_needs_writeback = 1; + btrfs_dev_replace_unlock(dev_replace); + btrfs_scrub_cancel(fs_info); + + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); + return PTR_ERR(trans); + } + ret = btrfs_commit_transaction(trans, root); + WARN_ON(ret); + if (tgt_device) + btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device); + +leave: + mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); + return result; +} + +void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info) +{ + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + + mutex_lock(&dev_replace->lock_finishing_cancel_unmount); + btrfs_dev_replace_lock(dev_replace); + switch (dev_replace->replace_state) { + case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: + break; + case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: + dev_replace->replace_state = + BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED; + dev_replace->time_stopped = btrfs_get_seconds_since_1970(); + dev_replace->item_needs_writeback = 1; + pr_info("btrfs: suspending dev_replace for unmount\n"); + break; + } + + btrfs_dev_replace_unlock(dev_replace); + mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); +} + +/* resume dev_replace procedure that was interrupted by unmount */ +int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info) +{ + struct task_struct *task; + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + + btrfs_dev_replace_lock(dev_replace); + switch (dev_replace->replace_state) { + case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: + btrfs_dev_replace_unlock(dev_replace); + return 0; + case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: + break; + case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: + dev_replace->replace_state = + BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED; + break; + } + if (!dev_replace->tgtdev || !dev_replace->tgtdev->bdev) { + pr_info("btrfs: cannot continue dev_replace, tgtdev is missing\n" + "btrfs: you may cancel the operation after 'mount -o degraded'\n"); + btrfs_dev_replace_unlock(dev_replace); + return 0; + } + btrfs_dev_replace_unlock(dev_replace); + + WARN_ON(atomic_xchg( + &fs_info->mutually_exclusive_operation_running, 1)); + task = kthread_run(btrfs_dev_replace_kthread, fs_info, "btrfs-devrepl"); + return PTR_RET(task); +} + +static int btrfs_dev_replace_kthread(void *data) +{ + struct btrfs_fs_info *fs_info = data; + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + struct btrfs_ioctl_dev_replace_args *status_args; + u64 progress; + + status_args = kzalloc(sizeof(*status_args), GFP_NOFS); + if (status_args) { + btrfs_dev_replace_status(fs_info, status_args); + progress = status_args->status.progress_1000; + kfree(status_args); + do_div(progress, 10); + printk_in_rcu(KERN_INFO + "btrfs: continuing dev_replace from %s (devid %llu) to %s @%u%%\n", + dev_replace->srcdev->missing ? "" : + rcu_str_deref(dev_replace->srcdev->name), + dev_replace->srcdev->devid, + dev_replace->tgtdev ? + rcu_str_deref(dev_replace->tgtdev->name) : + "", + (unsigned int)progress); + } + btrfs_dev_replace_continue_on_mount(fs_info); + atomic_set(&fs_info->mutually_exclusive_operation_running, 0); + + return 0; +} + +static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info) +{ + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + int ret; + + ret = btrfs_scrub_dev(fs_info, dev_replace->srcdev->devid, + dev_replace->committed_cursor_left, + dev_replace->srcdev->total_bytes, + &dev_replace->scrub_progress, 0, 1); + ret = btrfs_dev_replace_finishing(fs_info, ret); + WARN_ON(ret); + return 0; +} + +int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace) +{ + if (!dev_replace->is_valid) + return 0; + + switch (dev_replace->replace_state) { + case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: + return 0; + case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: + /* + * return true even if tgtdev is missing (this is + * something that can happen if the dev_replace + * procedure is suspended by an umount and then + * the tgtdev is missing (or "btrfs dev scan") was + * not called and the the filesystem is remounted + * in degraded state. This does not stop the + * dev_replace procedure. It needs to be canceled + * manually if the cancelation is wanted. + */ + break; + } + return 1; +} + +void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace) +{ + /* the beginning is just an optimization for the typical case */ + if (atomic_read(&dev_replace->nesting_level) == 0) { +acquire_lock: + /* this is not a nested case where the same thread + * is trying to acqurire the same lock twice */ + mutex_lock(&dev_replace->lock); + mutex_lock(&dev_replace->lock_management_lock); + dev_replace->lock_owner = current->pid; + atomic_inc(&dev_replace->nesting_level); + mutex_unlock(&dev_replace->lock_management_lock); + return; + } + + mutex_lock(&dev_replace->lock_management_lock); + if (atomic_read(&dev_replace->nesting_level) > 0 && + dev_replace->lock_owner == current->pid) { + WARN_ON(!mutex_is_locked(&dev_replace->lock)); + atomic_inc(&dev_replace->nesting_level); + mutex_unlock(&dev_replace->lock_management_lock); + return; + } + + mutex_unlock(&dev_replace->lock_management_lock); + goto acquire_lock; +} + +void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace) +{ + WARN_ON(!mutex_is_locked(&dev_replace->lock)); + mutex_lock(&dev_replace->lock_management_lock); + WARN_ON(atomic_read(&dev_replace->nesting_level) < 1); + WARN_ON(dev_replace->lock_owner != current->pid); + atomic_dec(&dev_replace->nesting_level); + if (atomic_read(&dev_replace->nesting_level) == 0) { + dev_replace->lock_owner = 0; + mutex_unlock(&dev_replace->lock_management_lock); + mutex_unlock(&dev_replace->lock); + } else { + mutex_unlock(&dev_replace->lock_management_lock); + } +} diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h index 1fb5c89..20035cb 100644 --- a/fs/btrfs/dev-replace.h +++ b/fs/btrfs/dev-replace.h @@ -19,6 +19,24 @@ #if !defined(__BTRFS_DEV_REPLACE__) #define __BTRFS_DEV_REPLACE__ +struct btrfs_ioctl_dev_replace_args; + +int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info); +int btrfs_run_dev_replace(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info); +void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info); +int btrfs_dev_replace_start(struct btrfs_root *root, + struct btrfs_ioctl_dev_replace_args *args); +void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info, + struct btrfs_ioctl_dev_replace_args *args); +int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info, + struct btrfs_ioctl_dev_replace_args *args); +void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info); +int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info); +int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace); +void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace); +void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace); + static inline void btrfs_dev_replace_stats_inc(atomic64_t *stat_value) { atomic64_inc(stat_value); diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h index 731e287..62006ba 100644 --- a/fs/btrfs/ioctl.h +++ b/fs/btrfs/ioctl.h @@ -123,6 +123,48 @@ struct btrfs_ioctl_scrub_args { __u64 unused[(1024-32-sizeof(struct btrfs_scrub_progress))/8]; }; +#define BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS 0 +#define BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID 1 +struct btrfs_ioctl_dev_replace_start_params { + __u64 srcdevid; /* in, if 0, use srcdev_name instead */ + __u8 srcdev_name[BTRFS_PATH_NAME_MAX + 1]; /* in */ + __u8 tgtdev_name[BTRFS_PATH_NAME_MAX + 1]; /* in */ + __u64 cont_reading_from_srcdev_mode; /* in, see #define + * above */ +}; + +#define BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED 0 +#define BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED 1 +#define BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED 2 +#define BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED 3 +#define BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED 4 +struct btrfs_ioctl_dev_replace_status_params { + __u64 replace_state; /* out, see #define above */ + __u64 progress_1000; /* out, 0 <= x <= 1000 */ + __u64 time_started; /* out, seconds since 1-Jan-1970 */ + __u64 time_stopped; /* out, seconds since 1-Jan-1970 */ + __u64 num_write_errors; /* out */ + __u64 num_uncorrectable_read_errors; /* out */ +}; + +#define BTRFS_IOCTL_DEV_REPLACE_CMD_START 0 +#define BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS 1 +#define BTRFS_IOCTL_DEV_REPLACE_CMD_CANCEL 2 +#define BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR 0 +#define BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED 1 +#define BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED 2 +struct btrfs_ioctl_dev_replace_args { + __u64 cmd; /* in */ + __u64 result; /* out */ + + union { + struct btrfs_ioctl_dev_replace_start_params start; + struct btrfs_ioctl_dev_replace_status_params status; + }; /* in/out */ + + __u64 spare[64]; +}; + #define BTRFS_DEVICE_PATH_NAME_MAX 1024 struct btrfs_ioctl_dev_info_args { __u64 devid; /* in/out */ @@ -453,4 +495,7 @@ struct btrfs_ioctl_send_args { struct btrfs_ioctl_qgroup_limit_args) #define BTRFS_IOC_GET_DEV_STATS _IOWR(BTRFS_IOCTL_MAGIC, 52, \ struct btrfs_ioctl_get_dev_stats) +#define BTRFS_IOC_DEV_REPLACE _IOWR(BTRFS_IOCTL_MAGIC, 53, \ + struct btrfs_ioctl_dev_replace_args) + #endif diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 4158628..5777e6a 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1537,6 +1537,53 @@ error_undo: goto error_brelse; } +void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info, + struct btrfs_device *srcdev) +{ + WARN_ON(!mutex_is_locked(&fs_info->fs_devices->device_list_mutex)); + list_del_rcu(&srcdev->dev_list); + list_del_rcu(&srcdev->dev_alloc_list); + fs_info->fs_devices->num_devices--; + if (srcdev->missing) { + fs_info->fs_devices->missing_devices--; + fs_info->fs_devices->rw_devices++; + } + if (srcdev->can_discard) + fs_info->fs_devices->num_can_discard--; + if (srcdev->bdev) + fs_info->fs_devices->open_devices--; + + call_rcu(&srcdev->rcu, free_device); +} + +void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, + struct btrfs_device *tgtdev) +{ + struct btrfs_device *next_device; + + WARN_ON(!tgtdev); + mutex_lock(&fs_info->fs_devices->device_list_mutex); + if (tgtdev->bdev) { + btrfs_scratch_superblock(tgtdev); + fs_info->fs_devices->open_devices--; + } + fs_info->fs_devices->num_devices--; + if (tgtdev->can_discard) + fs_info->fs_devices->num_can_discard++; + + next_device = list_entry(fs_info->fs_devices->devices.next, + struct btrfs_device, dev_list); + if (tgtdev->bdev == fs_info->sb->s_bdev) + fs_info->sb->s_bdev = next_device->bdev; + if (tgtdev->bdev == fs_info->fs_devices->latest_bdev) + fs_info->fs_devices->latest_bdev = next_device->bdev; + list_del_rcu(&tgtdev->dev_list); + + call_rcu(&tgtdev->rcu, free_device); + + mutex_unlock(&fs_info->fs_devices->device_list_mutex); +} + int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path, struct btrfs_device **device) { @@ -1931,6 +1978,98 @@ error: return ret; } +int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path, + struct btrfs_device **device_out) +{ + struct request_queue *q; + struct btrfs_device *device; + struct block_device *bdev; + struct btrfs_fs_info *fs_info = root->fs_info; + struct list_head *devices; + struct rcu_string *name; + int ret = 0; + + *device_out = NULL; + if (fs_info->fs_devices->seeding) + return -EINVAL; + + bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL, + fs_info->bdev_holder); + if (IS_ERR(bdev)) + return PTR_ERR(bdev); + + filemap_write_and_wait(bdev->bd_inode->i_mapping); + + devices = &fs_info->fs_devices->devices; + list_for_each_entry(device, devices, dev_list) { + if (device->bdev == bdev) { + ret = -EEXIST; + goto error; + } + } + + device = kzalloc(sizeof(*device), GFP_NOFS); + if (!device) { + ret = -ENOMEM; + goto error; + } + + name = rcu_string_strdup(device_path, GFP_NOFS); + if (!name) { + kfree(device); + ret = -ENOMEM; + goto error; + } + rcu_assign_pointer(device->name, name); + + q = bdev_get_queue(bdev); + if (blk_queue_discard(q)) + device->can_discard = 1; + mutex_lock(&root->fs_info->fs_devices->device_list_mutex); + device->writeable = 1; + device->work.func = pending_bios_fn; + generate_random_uuid(device->uuid); + device->devid = BTRFS_DEV_REPLACE_DEVID; + spin_lock_init(&device->io_lock); + device->generation = 0; + device->io_width = root->sectorsize; + device->io_align = root->sectorsize; + device->sector_size = root->sectorsize; + device->total_bytes = i_size_read(bdev->bd_inode); + device->disk_total_bytes = device->total_bytes; + device->dev_root = fs_info->dev_root; + device->bdev = bdev; + device->in_fs_metadata = 1; + device->is_tgtdev_for_dev_replace = 1; + device->mode = FMODE_EXCL; + set_blocksize(device->bdev, 4096); + device->fs_devices = fs_info->fs_devices; + list_add(&device->dev_list, &fs_info->fs_devices->devices); + fs_info->fs_devices->num_devices++; + fs_info->fs_devices->open_devices++; + if (device->can_discard) + fs_info->fs_devices->num_can_discard++; + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); + + *device_out = device; + return ret; + +error: + blkdev_put(bdev, FMODE_EXCL); + return ret; +} + +void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info, + struct btrfs_device *tgtdev) +{ + WARN_ON(fs_info->fs_devices->rw_devices == 0); + tgtdev->io_width = fs_info->dev_root->sectorsize; + tgtdev->io_align = fs_info->dev_root->sectorsize; + tgtdev->sector_size = fs_info->dev_root->sectorsize; + tgtdev->dev_root = fs_info->dev_root; + tgtdev->in_fs_metadata = 1; +} + static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, struct btrfs_device *device) { diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 8fd5a4d..58d7937 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -286,6 +286,8 @@ struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid, u8 *uuid, u8 *fsid); int btrfs_shrink_device(struct btrfs_device *device, u64 new_size); int btrfs_init_new_device(struct btrfs_root *root, char *path); +int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path, + struct btrfs_device **device_out); int btrfs_balance(struct btrfs_balance_control *bctl, struct btrfs_ioctl_balance_args *bargs); int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info); @@ -302,6 +304,12 @@ int btrfs_get_dev_stats(struct btrfs_root *root, int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info); int btrfs_run_dev_stats(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); +void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info, + struct btrfs_device *srcdev); +void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, + struct btrfs_device *tgtdev); +void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info, + struct btrfs_device *tgtdev); int btrfs_scratch_superblock(struct btrfs_device *device); static inline void btrfs_dev_stat_inc(struct btrfs_device *dev, -- cgit v0.10.2 From 8dabb7420f014ab0f9f04afae8ae046c0f48b270 Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Tue, 6 Nov 2012 13:15:27 +0100 Subject: Btrfs: change core code of btrfs to support the device replace operations This commit contains all the essential changes to the core code of Btrfs for support of the device replace procedure. Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 0e41047..76b8250 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -45,6 +45,7 @@ #include "inode-map.h" #include "check-integrity.h" #include "rcu-string.h" +#include "dev-replace.h" #ifdef CONFIG_X86 #include @@ -2438,7 +2439,11 @@ int open_ctree(struct super_block *sb, goto fail_tree_roots; } - btrfs_close_extra_devices(fs_devices); + /* + * keep the device that is marked to be the target device for the + * dev_replace procedure + */ + btrfs_close_extra_devices(fs_info, fs_devices, 0); if (!fs_devices->latest_bdev) { printk(KERN_CRIT "btrfs: failed to read devices on %s\n", @@ -2510,6 +2515,14 @@ retry_root_backup: goto fail_block_groups; } + ret = btrfs_init_dev_replace(fs_info); + if (ret) { + pr_err("btrfs: failed to init dev_replace: %d\n", ret); + goto fail_block_groups; + } + + btrfs_close_extra_devices(fs_info, fs_devices, 1); + ret = btrfs_init_space_info(fs_info); if (ret) { printk(KERN_ERR "Failed to initial space info: %d\n", ret); @@ -2658,6 +2671,13 @@ retry_root_backup: return ret; } + ret = btrfs_resume_dev_replace_async(fs_info); + if (ret) { + pr_warn("btrfs: failed to resume dev_replace\n"); + close_ctree(tree_root); + return ret; + } + return 0; fail_qgroup: @@ -3300,6 +3320,8 @@ int close_ctree(struct btrfs_root *root) /* pause restriper - we want to resume on mount */ btrfs_pause_balance(fs_info); + btrfs_dev_replace_suspend_for_unmount(fs_info); + btrfs_scrub_cancel(fs_info); /* wait for any defraggers to finish */ diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index 9f363e1..c705a48 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -27,6 +27,7 @@ #include "volumes.h" #include "disk-io.h" #include "transaction.h" +#include "dev-replace.h" #undef DEBUG @@ -331,6 +332,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, int nzones = 0; int i; unsigned long index = logical >> PAGE_CACHE_SHIFT; + int dev_replace_is_ongoing; spin_lock(&fs_info->reada_lock); re = radix_tree_lookup(&fs_info->reada_tree, index); @@ -392,6 +394,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, } /* insert extent in reada_tree + all per-device trees, all or nothing */ + btrfs_dev_replace_lock(&fs_info->dev_replace); spin_lock(&fs_info->reada_lock); ret = radix_tree_insert(&fs_info->reada_tree, index, re); if (ret == -EEXIST) { @@ -399,13 +402,17 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, BUG_ON(!re_exist); re_exist->refcnt++; spin_unlock(&fs_info->reada_lock); + btrfs_dev_replace_unlock(&fs_info->dev_replace); goto error; } if (ret) { spin_unlock(&fs_info->reada_lock); + btrfs_dev_replace_unlock(&fs_info->dev_replace); goto error; } prev_dev = NULL; + dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing( + &fs_info->dev_replace); for (i = 0; i < nzones; ++i) { dev = bbio->stripes[i].dev; if (dev == prev_dev) { @@ -422,6 +429,14 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, /* cannot read ahead on missing device */ continue; } + if (dev_replace_is_ongoing && + dev == fs_info->dev_replace.tgtdev) { + /* + * as this device is selected for reading only as + * a last resort, skip it for read ahead. + */ + continue; + } prev_dev = dev; ret = radix_tree_insert(&dev->reada_extents, index, re); if (ret) { @@ -434,10 +449,12 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, BUG_ON(fs_info == NULL); radix_tree_delete(&fs_info->reada_tree, index); spin_unlock(&fs_info->reada_lock); + btrfs_dev_replace_unlock(&fs_info->dev_replace); goto error; } } spin_unlock(&fs_info->reada_lock); + btrfs_dev_replace_unlock(&fs_info->dev_replace); kfree(bbio); return re; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 61157a2..30cbf69 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -2843,12 +2843,17 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, return -EIO; } - if (dev->scrub_device) { + btrfs_dev_replace_lock(&fs_info->dev_replace); + if (dev->scrub_device || + (!is_dev_replace && + btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) { + btrfs_dev_replace_unlock(&fs_info->dev_replace); mutex_unlock(&fs_info->scrub_lock); mutex_unlock(&fs_info->fs_devices->device_list_mutex); scrub_workers_put(fs_info); return -EINPROGRESS; } + btrfs_dev_replace_unlock(&fs_info->dev_replace); sctx = scrub_setup_ctx(dev, is_dev_replace); if (IS_ERR(sctx)) { mutex_unlock(&fs_info->scrub_lock); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index ad43806..def4f24 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -55,6 +55,7 @@ #include "export.h" #include "compression.h" #include "rcu-string.h" +#include "dev-replace.h" #define CREATE_TRACE_POINTS #include @@ -1225,8 +1226,15 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) return 0; if (*flags & MS_RDONLY) { + /* + * this also happens on 'umount -rf' or on shutdown, when + * the filesystem is busy. + */ sb->s_flags |= MS_RDONLY; + btrfs_dev_replace_suspend_for_unmount(fs_info); + btrfs_scrub_cancel(fs_info); + ret = btrfs_commit_super(root); if (ret) goto restore; @@ -1263,6 +1271,11 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) if (ret) goto restore; + ret = btrfs_resume_dev_replace_async(fs_info); + if (ret) { + pr_warn("btrfs: failed to resume dev_replace\n"); + goto restore; + } sb->s_flags &= ~MS_RDONLY; } diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 7b29735..bcc6b65 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -30,6 +30,7 @@ #include "tree-log.h" #include "inode-map.h" #include "volumes.h" +#include "dev-replace.h" #define BTRFS_ROOT_TRANS_TAG 0 @@ -845,7 +846,9 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans, return ret; ret = btrfs_run_dev_stats(trans, root->fs_info); - BUG_ON(ret); + WARN_ON(ret); + ret = btrfs_run_dev_replace(trans, root->fs_info); + WARN_ON(ret); ret = btrfs_run_qgroups(trans, root->fs_info); BUG_ON(ret); @@ -868,6 +871,8 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans, switch_commit_root(fs_info->extent_root); up_write(&fs_info->extent_commit_sem); + btrfs_after_dev_replace_commit(fs_info); + return 0; } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 5777e6a..a4e0963 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -36,6 +36,7 @@ #include "check-integrity.h" #include "rcu-string.h" #include "math.h" +#include "dev-replace.h" static int init_first_rw_device(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -505,7 +506,8 @@ error: return ERR_PTR(-ENOMEM); } -void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices) +void btrfs_close_extra_devices(struct btrfs_fs_info *fs_info, + struct btrfs_fs_devices *fs_devices, int step) { struct btrfs_device *device, *next; @@ -528,6 +530,21 @@ again: continue; } + if (device->devid == BTRFS_DEV_REPLACE_DEVID) { + /* + * In the first step, keep the device which has + * the correct fsid and the devid that is used + * for the dev_replace procedure. + * In the second step, the dev_replace state is + * read from the device tree and it is known + * whether the procedure is really active or + * not, which means whether this device is + * used or whether it should be removed. + */ + if (step == 0 || device->is_tgtdev_for_dev_replace) { + continue; + } + } if (device->bdev) { blkdev_put(device->bdev, device->mode); device->bdev = NULL; @@ -536,7 +553,8 @@ again: if (device->writeable) { list_del_init(&device->dev_alloc_list); device->writeable = 0; - fs_devices->rw_devices--; + if (!device->is_tgtdev_for_dev_replace) + fs_devices->rw_devices--; } list_del_init(&device->dev_list); fs_devices->num_devices--; @@ -594,7 +612,7 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) if (device->bdev) fs_devices->open_devices--; - if (device->writeable) { + if (device->writeable && !device->is_tgtdev_for_dev_replace) { list_del_init(&device->dev_alloc_list); fs_devices->rw_devices--; } @@ -718,7 +736,7 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, fs_devices->rotating = 1; fs_devices->open_devices++; - if (device->writeable) { + if (device->writeable && !device->is_tgtdev_for_dev_replace) { fs_devices->rw_devices++; list_add(&device->dev_alloc_list, &fs_devices->alloc_list); @@ -1350,16 +1368,22 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) root->fs_info->avail_system_alloc_bits | root->fs_info->avail_metadata_alloc_bits; - if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && - root->fs_info->fs_devices->num_devices <= 4) { + num_devices = root->fs_info->fs_devices->num_devices; + btrfs_dev_replace_lock(&root->fs_info->dev_replace); + if (btrfs_dev_replace_is_ongoing(&root->fs_info->dev_replace)) { + WARN_ON(num_devices < 1); + num_devices--; + } + btrfs_dev_replace_unlock(&root->fs_info->dev_replace); + + if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && num_devices <= 4) { printk(KERN_ERR "btrfs: unable to go below four devices " "on raid10\n"); ret = -EINVAL; goto out; } - if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && - root->fs_info->fs_devices->num_devices <= 2) { + if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && num_devices <= 2) { printk(KERN_ERR "btrfs: unable to go below two " "devices on raid1\n"); ret = -EINVAL; @@ -2935,6 +2959,7 @@ int btrfs_balance(struct btrfs_balance_control *bctl, u64 allowed; int mixed = 0; int ret; + u64 num_devices; if (btrfs_fs_closing(fs_info) || atomic_read(&fs_info->balance_pause_req) || @@ -2963,10 +2988,17 @@ int btrfs_balance(struct btrfs_balance_control *bctl, } } + num_devices = fs_info->fs_devices->num_devices; + btrfs_dev_replace_lock(&fs_info->dev_replace); + if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) { + BUG_ON(num_devices < 1); + num_devices--; + } + btrfs_dev_replace_unlock(&fs_info->dev_replace); allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE; - if (fs_info->fs_devices->num_devices == 1) + if (num_devices == 1) allowed |= BTRFS_BLOCK_GROUP_DUP; - else if (fs_info->fs_devices->num_devices < 4) + else if (num_devices < 4) allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1); else allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | @@ -3591,6 +3623,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, devices_info[ndevs].total_avail = total_avail; devices_info[ndevs].dev = device; ++ndevs; + WARN_ON(ndevs > fs_devices->rw_devices); } /* @@ -4773,6 +4806,7 @@ static void fill_device_from_item(struct extent_buffer *leaf, device->io_align = btrfs_device_io_align(leaf, dev_item); device->io_width = btrfs_device_io_width(leaf, dev_item); device->sector_size = btrfs_device_sector_size(leaf, dev_item); + WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID); device->is_tgtdev_for_dev_replace = 0; ptr = (unsigned long)btrfs_device_uuid(dev_item); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 58d7937..37d0157 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -268,7 +268,8 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, struct btrfs_fs_devices **fs_devices_ret); int btrfs_close_devices(struct btrfs_fs_devices *fs_devices); -void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices); +void btrfs_close_extra_devices(struct btrfs_fs_info *fs_info, + struct btrfs_fs_devices *fs_devices, int step); int btrfs_find_device_missing_or_by_path(struct btrfs_root *root, char *device_path, struct btrfs_device **device); -- cgit v0.10.2 From 29a8d9a0bce6a5abac1f313400c2e189e8d10e67 Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Tue, 6 Nov 2012 14:16:24 +0100 Subject: Btrfs: introduce GET_READ_MIRRORS functionality for btrfs_map_block() Before this commit, btrfs_map_block() was called with REQ_WRITE in order to retrieve the list of mirrors for a disk block. This needs to be changed for the device replace procedure since it makes a difference whether you are asking for read mirrors or for locations to write to. GET_READ_MIRRORS is introduced as a new interface to call btrfs_map_block(). In the current commit, the functionality is not yet changed, only the interface for GET_READ_MIRRORS is introduced and all the places that should use this new interface are adapted. The reason that REQ_WRITE cannot be abused anymore to retrieve a list of read mirrors is that during a running dev replace operation all write requests to the live filesystem are duplicated to also write to the target drive. Keep in mind that the target disk is only partially a valid copy of the source disk while the operation is ongoing. All writes go to the target disk, but not all reads would return valid data on the target disk. Therefore it is not possible anymore to abuse a REQ_WRITE interface to find valid mirrors for a REQ_READ. Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 45e7f75..46bd7d5 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -174,6 +174,9 @@ static int btrfs_csum_sizes[] = { 4, 0 }; /* four bytes for CRC32 */ #define BTRFS_EMPTY_DIR_SIZE 0 +/* spefic to btrfs_map_block(), therefore not in include/linux/blk_types.h */ +#define REQ_GET_READ_MIRRORS (1 << 30) + #define BTRFS_FT_UNKNOWN 0 #define BTRFS_FT_REG_FILE 1 #define BTRFS_FT_DIR 2 diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index c705a48..96b93da 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -359,7 +359,8 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, * map block */ length = blocksize; - ret = btrfs_map_block(fs_info, REQ_WRITE, logical, &length, &bbio, 0); + ret = btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, logical, &length, + &bbio, 0); if (ret || !bbio || length < blocksize) goto error; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 30cbf69..30ba997 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1193,8 +1193,8 @@ static int scrub_setup_recheck_block(struct scrub_ctx *sctx, * with a length of PAGE_SIZE, each returned stripe * represents one mirror */ - ret = btrfs_map_block(fs_info, WRITE, logical, &mapped_length, - &bbio, 0); + ret = btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, logical, + &mapped_length, &bbio, 0); if (ret || !bbio || mapped_length < sublen) { kfree(bbio); return -EIO; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index a4e0963..de0c05c 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4103,7 +4103,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, stripe_nr_end - stripe_nr_orig); stripe_index = do_div(stripe_nr, map->num_stripes); } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { - if (rw & (REQ_WRITE | REQ_DISCARD)) + if (rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) num_stripes = map->num_stripes; else if (mirror_num) stripe_index = mirror_num - 1; @@ -4115,7 +4115,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, } } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { - if (rw & (REQ_WRITE | REQ_DISCARD)) { + if (rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) { num_stripes = map->num_stripes; } else if (mirror_num) { stripe_index = mirror_num - 1; @@ -4129,7 +4129,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, stripe_index = do_div(stripe_nr, factor); stripe_index *= map->sub_stripes; - if (rw & REQ_WRITE) + if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) num_stripes = map->sub_stripes; else if (rw & REQ_DISCARD) num_stripes = min_t(u64, map->sub_stripes * @@ -4242,7 +4242,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, } } - if (rw & REQ_WRITE) { + if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) { if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10 | BTRFS_BLOCK_GROUP_DUP)) { -- cgit v0.10.2 From 472262f35a6b3407e761b700d74c53530e5f144d Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Tue, 6 Nov 2012 14:43:46 +0100 Subject: Btrfs: changes to live filesystem are also written to replacement disk During a running dev replace operation, all write requests to the live filesystem are duplicated to also write to the target drive. Therefore btrfs_map_block() is changed to duplicate stripes that are written to the source disk of a device replace procedure to be written to the target disk as well. Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index de0c05c..4d3bf18 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4044,6 +4044,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, int num_stripes; int max_errors = 0; struct btrfs_bio *bbio = NULL; + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + int dev_replace_is_ongoing = 0; + int num_alloc_stripes; read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, logical, *length); @@ -4089,6 +4092,11 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, if (!bbio_ret) goto out; + btrfs_dev_replace_lock(dev_replace); + dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); + if (!dev_replace_is_ongoing) + btrfs_dev_replace_unlock(dev_replace); + num_stripes = 1; stripe_index = 0; stripe_nr_orig = stripe_nr; @@ -4155,7 +4163,10 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, } BUG_ON(stripe_index >= map->num_stripes); - bbio = kzalloc(btrfs_bio_size(num_stripes), GFP_NOFS); + num_alloc_stripes = num_stripes; + if (dev_replace_is_ongoing && (rw & (REQ_WRITE | REQ_DISCARD))) + num_alloc_stripes <<= 1; + bbio = kzalloc(btrfs_bio_size(num_alloc_stripes), GFP_NOFS); if (!bbio) { ret = -ENOMEM; goto out; @@ -4250,11 +4261,48 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, } } + if (dev_replace_is_ongoing && (rw & (REQ_WRITE | REQ_DISCARD)) && + dev_replace->tgtdev != NULL) { + int index_where_to_add; + u64 srcdev_devid = dev_replace->srcdev->devid; + + /* + * duplicate the write operations while the dev replace + * procedure is running. Since the copying of the old disk + * to the new disk takes place at run time while the + * filesystem is mounted writable, the regular write + * operations to the old disk have to be duplicated to go + * to the new disk as well. + * Note that device->missing is handled by the caller, and + * that the write to the old disk is already set up in the + * stripes array. + */ + index_where_to_add = num_stripes; + for (i = 0; i < num_stripes; i++) { + if (bbio->stripes[i].dev->devid == srcdev_devid) { + /* write to new disk, too */ + struct btrfs_bio_stripe *new = + bbio->stripes + index_where_to_add; + struct btrfs_bio_stripe *old = + bbio->stripes + i; + + new->physical = old->physical; + new->length = old->length; + new->dev = dev_replace->tgtdev; + index_where_to_add++; + max_errors++; + } + } + num_stripes = index_where_to_add; + } + *bbio_ret = bbio; bbio->num_stripes = num_stripes; bbio->max_errors = max_errors; bbio->mirror_num = mirror_num; out: + if (dev_replace_is_ongoing) + btrfs_dev_replace_unlock(dev_replace); free_extent_map(em); return ret; } -- cgit v0.10.2 From 30d9861ff9520e2a112eae71029bc9f7e915a441 Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Tue, 6 Nov 2012 14:52:18 +0100 Subject: Btrfs: optionally avoid reads from device replace source drive It is desirable to be able to configure the device replace procedure to avoid reading the source drive (the one to be copied) whenever possible. This is useful when the number of read errors on this disk is high, because it would delay the copy procedure alot. Therefore there is an option to avoid reading from the source disk unless the repair procedure really needs to access it. The regular read req asks for mapping the block with mirror_num == 0, in this case the source disk is avoided whenever possible. The repair code selects the mirror_num explicitly (mirror_num != 0), this case is not changed by this commit. Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 4d3bf18..e2e01a3 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4007,16 +4007,37 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) return ret; } -static int find_live_mirror(struct map_lookup *map, int first, int num, - int optimal) +static int find_live_mirror(struct btrfs_fs_info *fs_info, + struct map_lookup *map, int first, int num, + int optimal, int dev_replace_is_ongoing) { int i; - if (map->stripes[optimal].dev->bdev) - return optimal; - for (i = first; i < first + num; i++) { - if (map->stripes[i].dev->bdev) - return i; + int tolerance; + struct btrfs_device *srcdev; + + if (dev_replace_is_ongoing && + fs_info->dev_replace.cont_reading_from_srcdev_mode == + BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID) + srcdev = fs_info->dev_replace.srcdev; + else + srcdev = NULL; + + /* + * try to avoid the drive that is the source drive for a + * dev-replace procedure, only choose it if no other non-missing + * mirror is available + */ + for (tolerance = 0; tolerance < 2; tolerance++) { + if (map->stripes[optimal].dev->bdev && + (tolerance || map->stripes[optimal].dev != srcdev)) + return optimal; + for (i = first; i < first + num; i++) { + if (map->stripes[i].dev->bdev && + (tolerance || map->stripes[i].dev != srcdev)) + return i; + } } + /* we couldn't find one that doesn't fail. Just return something * and the io error handling code will clean up eventually */ @@ -4116,9 +4137,10 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, else if (mirror_num) stripe_index = mirror_num - 1; else { - stripe_index = find_live_mirror(map, 0, + stripe_index = find_live_mirror(fs_info, map, 0, map->num_stripes, - current->pid % map->num_stripes); + current->pid % map->num_stripes, + dev_replace_is_ongoing); mirror_num = stripe_index + 1; } @@ -4147,9 +4169,11 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, stripe_index += mirror_num - 1; else { int old_stripe_index = stripe_index; - stripe_index = find_live_mirror(map, stripe_index, + stripe_index = find_live_mirror(fs_info, map, + stripe_index, map->sub_stripes, stripe_index + - current->pid % map->sub_stripes); + current->pid % map->sub_stripes, + dev_replace_is_ongoing); mirror_num = stripe_index - old_stripe_index + 1; } } else { -- cgit v0.10.2 From 72d7aefccd512b66cd5543e652eae04be12085fc Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Tue, 6 Nov 2012 14:57:46 +0100 Subject: Btrfs: increase BTRFS_MAX_MIRRORS by one for dev replace This change of the define is effective in all modes, it is required and used only in the case when a device replace procedure is running. The reason is that during an active device replace procedure, the target device of the copy operation is a mirror for the filesystem data as well that can be used to read data in order to repair read errors on other disks. Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 46bd7d5..91ff078 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -48,7 +48,7 @@ struct btrfs_ordered_sum; #define BTRFS_MAGIC "_BHRfS_M" -#define BTRFS_MAX_MIRRORS 2 +#define BTRFS_MAX_MIRRORS 3 #define BTRFS_MAX_LEVEL 8 -- cgit v0.10.2 From ad6d620e2a5704f6bf3a39c92a75aad962c51cb3 Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Tue, 6 Nov 2012 15:06:47 +0100 Subject: Btrfs: allow repair code to include target disk when searching mirrors Make the target disk of a running device replace operation available for reading. This is only used as a last ressort for the defect repair procedure. And it is dependent on the location of the data block to read, because during an ongoing device replace operation, the target drive is only partially filled with the filesystem data. Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index e2e01a3..32a4948 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4004,6 +4004,12 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) else ret = 1; free_extent_map(em); + + btrfs_dev_replace_lock(&fs_info->dev_replace); + if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) + ret++; + btrfs_dev_replace_unlock(&fs_info->dev_replace); + return ret; } @@ -4068,6 +4074,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; int dev_replace_is_ongoing = 0; int num_alloc_stripes; + int patch_the_first_stripe_for_dev_replace = 0; + u64 physical_to_patch_in_first_stripe = 0; read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, logical, *length); @@ -4084,9 +4092,6 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, map = (struct map_lookup *)em->bdev; offset = logical - em->start; - if (mirror_num > map->num_stripes) - mirror_num = 0; - stripe_nr = offset; /* * stripe_nr counts the total number of stripes we have to stride @@ -4118,6 +4123,88 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, if (!dev_replace_is_ongoing) btrfs_dev_replace_unlock(dev_replace); + if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 && + !(rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) && + dev_replace->tgtdev != NULL) { + /* + * in dev-replace case, for repair case (that's the only + * case where the mirror is selected explicitly when + * calling btrfs_map_block), blocks left of the left cursor + * can also be read from the target drive. + * For REQ_GET_READ_MIRRORS, the target drive is added as + * the last one to the array of stripes. For READ, it also + * needs to be supported using the same mirror number. + * If the requested block is not left of the left cursor, + * EIO is returned. This can happen because btrfs_num_copies() + * returns one more in the dev-replace case. + */ + u64 tmp_length = *length; + struct btrfs_bio *tmp_bbio = NULL; + int tmp_num_stripes; + u64 srcdev_devid = dev_replace->srcdev->devid; + int index_srcdev = 0; + int found = 0; + u64 physical_of_found = 0; + + ret = __btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, + logical, &tmp_length, &tmp_bbio, 0); + if (ret) { + WARN_ON(tmp_bbio != NULL); + goto out; + } + + tmp_num_stripes = tmp_bbio->num_stripes; + if (mirror_num > tmp_num_stripes) { + /* + * REQ_GET_READ_MIRRORS does not contain this + * mirror, that means that the requested area + * is not left of the left cursor + */ + ret = -EIO; + kfree(tmp_bbio); + goto out; + } + + /* + * process the rest of the function using the mirror_num + * of the source drive. Therefore look it up first. + * At the end, patch the device pointer to the one of the + * target drive. + */ + for (i = 0; i < tmp_num_stripes; i++) { + if (tmp_bbio->stripes[i].dev->devid == srcdev_devid) { + /* + * In case of DUP, in order to keep it + * simple, only add the mirror with the + * lowest physical address + */ + if (found && + physical_of_found <= + tmp_bbio->stripes[i].physical) + continue; + index_srcdev = i; + found = 1; + physical_of_found = + tmp_bbio->stripes[i].physical; + } + } + + if (found) { + mirror_num = index_srcdev + 1; + patch_the_first_stripe_for_dev_replace = 1; + physical_to_patch_in_first_stripe = physical_of_found; + } else { + WARN_ON(1); + ret = -EIO; + kfree(tmp_bbio); + goto out; + } + + kfree(tmp_bbio); + } else if (mirror_num > map->num_stripes) { + mirror_num = 0; + } + num_stripes = 1; stripe_index = 0; stripe_nr_orig = stripe_nr; @@ -4188,8 +4275,12 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, BUG_ON(stripe_index >= map->num_stripes); num_alloc_stripes = num_stripes; - if (dev_replace_is_ongoing && (rw & (REQ_WRITE | REQ_DISCARD))) - num_alloc_stripes <<= 1; + if (dev_replace_is_ongoing) { + if (rw & (REQ_WRITE | REQ_DISCARD)) + num_alloc_stripes <<= 1; + if (rw & REQ_GET_READ_MIRRORS) + num_alloc_stripes++; + } bbio = kzalloc(btrfs_bio_size(num_alloc_stripes), GFP_NOFS); if (!bbio) { ret = -ENOMEM; @@ -4318,12 +4409,70 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, } } num_stripes = index_where_to_add; + } else if (dev_replace_is_ongoing && (rw & REQ_GET_READ_MIRRORS) && + dev_replace->tgtdev != NULL) { + u64 srcdev_devid = dev_replace->srcdev->devid; + int index_srcdev = 0; + int found = 0; + u64 physical_of_found = 0; + + /* + * During the dev-replace procedure, the target drive can + * also be used to read data in case it is needed to repair + * a corrupt block elsewhere. This is possible if the + * requested area is left of the left cursor. In this area, + * the target drive is a full copy of the source drive. + */ + for (i = 0; i < num_stripes; i++) { + if (bbio->stripes[i].dev->devid == srcdev_devid) { + /* + * In case of DUP, in order to keep it + * simple, only add the mirror with the + * lowest physical address + */ + if (found && + physical_of_found <= + bbio->stripes[i].physical) + continue; + index_srcdev = i; + found = 1; + physical_of_found = bbio->stripes[i].physical; + } + } + if (found) { + u64 length = map->stripe_len; + + if (physical_of_found + length <= + dev_replace->cursor_left) { + struct btrfs_bio_stripe *tgtdev_stripe = + bbio->stripes + num_stripes; + + tgtdev_stripe->physical = physical_of_found; + tgtdev_stripe->length = + bbio->stripes[index_srcdev].length; + tgtdev_stripe->dev = dev_replace->tgtdev; + + num_stripes++; + } + } } *bbio_ret = bbio; bbio->num_stripes = num_stripes; bbio->max_errors = max_errors; bbio->mirror_num = mirror_num; + + /* + * this is the case that REQ_READ && dev_replace_is_ongoing && + * mirror_num == num_stripes + 1 && dev_replace target drive is + * available as a mirror + */ + if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) { + WARN_ON(num_stripes > 1); + bbio->stripes[0].dev = dev_replace->tgtdev; + bbio->stripes[0].physical = physical_to_patch_in_first_stripe; + bbio->mirror_num = map->num_stripes + 1; + } out: if (dev_replace_is_ongoing) btrfs_dev_replace_unlock(dev_replace); -- cgit v0.10.2 From 3f6bcfbd4149875662773eb40a62294cddf215d4 Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Tue, 6 Nov 2012 15:08:53 +0100 Subject: Btrfs: add support for device replace ioctls This is the commit that allows to start the device replace procedure. An ioctl() interface is added that supports starting and canceling the device replace procedure, and to retrieve the status and progress. Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index e54b5e5..9a71fec 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -55,6 +55,7 @@ #include "backref.h" #include "rcu-string.h" #include "send.h" +#include "dev-replace.h" /* Mask out flags that are inappropriate for the given type of inode. */ static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags) @@ -3171,6 +3172,51 @@ static long btrfs_ioctl_get_dev_stats(struct btrfs_root *root, return ret; } +static long btrfs_ioctl_dev_replace(struct btrfs_root *root, void __user *arg) +{ + struct btrfs_ioctl_dev_replace_args *p; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + p = memdup_user(arg, sizeof(*p)); + if (IS_ERR(p)) + return PTR_ERR(p); + + switch (p->cmd) { + case BTRFS_IOCTL_DEV_REPLACE_CMD_START: + if (atomic_xchg( + &root->fs_info->mutually_exclusive_operation_running, + 1)) { + pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); + ret = -EINPROGRESS; + } else { + ret = btrfs_dev_replace_start(root, p); + atomic_set( + &root->fs_info->mutually_exclusive_operation_running, + 0); + } + break; + case BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS: + btrfs_dev_replace_status(root->fs_info, p); + ret = 0; + break; + case BTRFS_IOCTL_DEV_REPLACE_CMD_CANCEL: + ret = btrfs_dev_replace_cancel(root->fs_info, p); + break; + default: + ret = -EINVAL; + break; + } + + if (copy_to_user(arg, p, sizeof(*p))) + ret = -EFAULT; + + kfree(p); + return ret; +} + static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg) { int ret = 0; @@ -3826,6 +3872,8 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_qgroup_create(root, argp); case BTRFS_IOC_QGROUP_LIMIT: return btrfs_ioctl_qgroup_limit(root, argp); + case BTRFS_IOC_DEV_REPLACE: + return btrfs_ioctl_dev_replace(root, argp); } return -ENOTTY; diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h index 62006ba..dabca9c 100644 --- a/fs/btrfs/ioctl.h +++ b/fs/btrfs/ioctl.h @@ -30,6 +30,8 @@ struct btrfs_ioctl_vol_args { char name[BTRFS_PATH_NAME_MAX + 1]; }; +#define BTRFS_DEVICE_PATH_NAME_MAX 1024 + #define BTRFS_SUBVOL_CREATE_ASYNC (1ULL << 0) #define BTRFS_SUBVOL_RDONLY (1ULL << 1) #define BTRFS_SUBVOL_QGROUP_INHERIT (1ULL << 2) @@ -127,10 +129,10 @@ struct btrfs_ioctl_scrub_args { #define BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID 1 struct btrfs_ioctl_dev_replace_start_params { __u64 srcdevid; /* in, if 0, use srcdev_name instead */ - __u8 srcdev_name[BTRFS_PATH_NAME_MAX + 1]; /* in */ - __u8 tgtdev_name[BTRFS_PATH_NAME_MAX + 1]; /* in */ __u64 cont_reading_from_srcdev_mode; /* in, see #define * above */ + __u8 srcdev_name[BTRFS_DEVICE_PATH_NAME_MAX + 1]; /* in */ + __u8 tgtdev_name[BTRFS_DEVICE_PATH_NAME_MAX + 1]; /* in */ }; #define BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED 0 @@ -165,7 +167,6 @@ struct btrfs_ioctl_dev_replace_args { __u64 spare[64]; }; -#define BTRFS_DEVICE_PATH_NAME_MAX 1024 struct btrfs_ioctl_dev_info_args { __u64 devid; /* in/out */ __u8 uuid[BTRFS_UUID_SIZE]; /* in/out */ -- cgit v0.10.2 From 071401258a580dec2a3e0c2700b7e76f3ed43320 Mon Sep 17 00:00:00 2001 From: Wang Sheng-Hui Date: Fri, 23 Nov 2012 03:03:14 +0000 Subject: Btrfs: do not warn_on io_ctl->cur in io_ctl_map_page io_ctl_map_page is called by many functions in free-space-cache. In most scenarios, the ->cur is not null, e.g. io_ctl_add_entry. I think we'd better remove the warn_on here. Signed-off-by: Wang Sheng-Hui Signed-off-by: Chris Mason diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 557502c..efdd1d3 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -307,7 +307,6 @@ static void io_ctl_unmap_page(struct io_ctl *io_ctl) static void io_ctl_map_page(struct io_ctl *io_ctl, int clear) { - WARN_ON(io_ctl->cur); BUG_ON(io_ctl->index >= io_ctl->num_pages); io_ctl->page = io_ctl->pages[io_ctl->index++]; io_ctl->cur = kmap(io_ctl->page); -- cgit v0.10.2 From db2254bce4f19f458aaa05f9d00b39f413f7488c Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Mon, 26 Nov 2012 02:58:36 +0000 Subject: Btrfs: fix an while-loop of listxattr If we found an invalid xattr dir item, we'd better try the next one instead. Signed-off-by: Liu Bo Signed-off-by: Chris Mason diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 3f4e2d6..e9d3840 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -265,7 +265,7 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size) di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); if (verify_dir_item(root, leaf, di)) - continue; + goto next; name_len = btrfs_dir_name_len(leaf, di); total_size += name_len + 1; -- cgit v0.10.2 From 9a8c28bec1b40e934ed28149b7eaa7d2fafed92d Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Mon, 26 Nov 2012 08:40:43 +0000 Subject: Btrfs: pass root object into btrfs_ioctl_{start, wait}_sync() Since we have gotten the root in the caller, just pass it into btrfs_ioctl_{start, wait}_sync() directly. Signed-off-by: Miao Xie Signed-off-by: Chris Mason diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 9a71fec..5022e62 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3058,9 +3058,9 @@ long btrfs_ioctl_trans_end(struct file *file) return 0; } -static noinline long btrfs_ioctl_start_sync(struct file *file, void __user *argp) +static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root, + void __user *argp) { - struct btrfs_root *root = BTRFS_I(file->f_dentry->d_inode)->root; struct btrfs_trans_handle *trans; u64 transid; int ret; @@ -3081,9 +3081,9 @@ static noinline long btrfs_ioctl_start_sync(struct file *file, void __user *argp return 0; } -static noinline long btrfs_ioctl_wait_sync(struct file *file, void __user *argp) +static noinline long btrfs_ioctl_wait_sync(struct btrfs_root *root, + void __user *argp) { - struct btrfs_root *root = BTRFS_I(file->f_dentry->d_inode)->root; u64 transid; if (argp) { @@ -3843,9 +3843,9 @@ long btrfs_ioctl(struct file *file, unsigned int btrfs_sync_fs(file->f_dentry->d_sb, 1); return 0; case BTRFS_IOC_START_SYNC: - return btrfs_ioctl_start_sync(file, argp); + return btrfs_ioctl_start_sync(root, argp); case BTRFS_IOC_WAIT_SYNC: - return btrfs_ioctl_wait_sync(file, argp); + return btrfs_ioctl_wait_sync(root, argp); case BTRFS_IOC_SCRUB: return btrfs_ioctl_scrub(root, argp); case BTRFS_IOC_SCRUB_CANCEL: -- cgit v0.10.2 From ff7c1d33551862c86f7737fe88edc3e499d291e6 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Mon, 26 Nov 2012 08:41:29 +0000 Subject: Btrfs: don't start a new transaction when starting sync If there is no running transaction in the fs, we needn't start a new one when we want to start sync. Signed-off-by: Miao Xie Signed-off-by: Chris Mason diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 5022e62..7b1f614 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3065,16 +3065,22 @@ static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root, u64 transid; int ret; - trans = btrfs_start_transaction(root, 0); - if (IS_ERR(trans)) - return PTR_ERR(trans); + trans = btrfs_attach_transaction(root); + if (IS_ERR(trans)) { + if (PTR_ERR(trans) != -ENOENT) + return PTR_ERR(trans); + + /* No running transaction, don't bother */ + transid = root->fs_info->last_trans_committed; + goto out; + } transid = trans->transid; ret = btrfs_commit_transaction_async(trans, root, 0); if (ret) { btrfs_end_transaction(trans, root); return ret; } - +out: if (argp) if (copy_to_user(argp, &transid, sizeof(transid))) return -EFAULT; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index bcc6b65..8db401f 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1307,9 +1307,10 @@ static void do_async_commit(struct work_struct *work) * We've got freeze protection passed with the transaction. * Tell lockdep about it. */ - rwsem_acquire_read( - &ac->root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1], - 0, 1, _THIS_IP_); + if (ac->newtrans->type < TRANS_JOIN_NOLOCK) + rwsem_acquire_read( + &ac->root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1], + 0, 1, _THIS_IP_); current->journal_info = ac->newtrans; @@ -1347,8 +1348,10 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, * Tell lockdep we've released the freeze rwsem, since the * async commit thread will be the one to unlock it. */ - rwsem_release(&root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1], - 1, _THIS_IP_); + if (trans->type < TRANS_JOIN_NOLOCK) + rwsem_release( + &root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1], + 1, _THIS_IP_); schedule_delayed_work(&ac->work, 0); -- cgit v0.10.2 From 8cd2807f79b73ef2d8c1cb6b3732dc5758ac7212 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Mon, 26 Nov 2012 08:42:07 +0000 Subject: Btrfs: fix wrong return value of btrfs_wait_for_commit() If the id of the existed transaction is more than the one we specified, it means the specified transaction was commited, so we should return 0, not EINVAL. Signed-off-by: Miao Xie Signed-off-by: Chris Mason diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 8db401f..e6509b9 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -456,28 +456,31 @@ static noinline void wait_for_commit(struct btrfs_root *root, int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid) { struct btrfs_transaction *cur_trans = NULL, *t; - int ret; + int ret = 0; - ret = 0; if (transid) { if (transid <= root->fs_info->last_trans_committed) goto out; + ret = -EINVAL; /* find specified transaction */ spin_lock(&root->fs_info->trans_lock); list_for_each_entry(t, &root->fs_info->trans_list, list) { if (t->transid == transid) { cur_trans = t; atomic_inc(&cur_trans->use_count); + ret = 0; break; } - if (t->transid > transid) + if (t->transid > transid) { + ret = 0; break; + } } spin_unlock(&root->fs_info->trans_lock); - ret = -EINVAL; + /* The specified transaction doesn't exist */ if (!cur_trans) - goto out; /* bad transid */ + goto out; } else { /* find newest transaction that is committing | committed */ spin_lock(&root->fs_info->trans_lock); @@ -497,9 +500,7 @@ int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid) } wait_for_commit(root, cur_trans); - put_transaction(cur_trans); - ret = 0; out: return ret; } -- cgit v0.10.2 From 3c04ce01053413007b9df88313b8b8e17272b57b Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Mon, 26 Nov 2012 08:43:07 +0000 Subject: Btrfs: get write access when setting the default subvolume When wen want to set the default subvolume, we must get write access, or we will change the R/O file system. Signed-off-by: Miao Xie Signed-off-by: Chris Mason diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 7b1f614..10bc65e 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2843,12 +2843,19 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) struct btrfs_disk_key disk_key; u64 objectid = 0; u64 dir_id; + int ret; if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (copy_from_user(&objectid, argp, sizeof(objectid))) - return -EFAULT; + ret = mnt_want_write_file(file); + if (ret) + return ret; + + if (copy_from_user(&objectid, argp, sizeof(objectid))) { + ret = -EFAULT; + goto out; + } if (!objectid) objectid = root->root_key.objectid; @@ -2858,21 +2865,28 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) location.offset = (u64)-1; new_root = btrfs_read_fs_root_no_name(root->fs_info, &location); - if (IS_ERR(new_root)) - return PTR_ERR(new_root); + if (IS_ERR(new_root)) { + ret = PTR_ERR(new_root); + goto out; + } - if (btrfs_root_refs(&new_root->root_item) == 0) - return -ENOENT; + if (btrfs_root_refs(&new_root->root_item) == 0) { + ret = -ENOENT; + goto out; + } path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; + if (!path) { + ret = -ENOMEM; + goto out; + } path->leave_spinning = 1; trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { btrfs_free_path(path); - return PTR_ERR(trans); + ret = PTR_ERR(trans); + goto out; } dir_id = btrfs_super_root_dir(root->fs_info->super_copy); @@ -2883,7 +2897,8 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) btrfs_end_transaction(trans, root); printk(KERN_ERR "Umm, you don't have the default dir item, " "this isn't going to work\n"); - return -ENOENT; + ret = -ENOENT; + goto out; } btrfs_cpu_key_to_disk(&disk_key, &new_root->root_key); @@ -2893,8 +2908,9 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) btrfs_set_fs_incompat(root->fs_info, DEFAULT_SUBVOL); btrfs_end_transaction(trans, root); - - return 0; +out: + mnt_drop_write_file(file); + return ret; } void btrfs_get_block_group_info(struct list_head *groups_list, -- cgit v0.10.2 From 198605a8e2077f174c9834c97b836f535e4e56dd Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Mon, 26 Nov 2012 08:43:45 +0000 Subject: Btrfs: get write access when doing resize fs Steps to reproduce: # mkfs.btrfs # mount -o ro # mount -o ro # mount -o remount,rw # umount # btrfs fi resize 10g We re-sized a R/O filesystem. The reason is that we just check the R/O flag of the super block object. It is not enough, because the kernel may set the R/O flag only for the mount point. We need invoke mnt_want_write_file() to do a full check. Signed-off-by: Miao Xie Signed-off-by: Chris Mason diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 10bc65e..2be49b4 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1298,12 +1298,13 @@ out_ra: return ret; } -static noinline int btrfs_ioctl_resize(struct btrfs_root *root, +static noinline int btrfs_ioctl_resize(struct file *file, void __user *arg) { u64 new_size; u64 old_size; u64 devid = 1; + struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; struct btrfs_ioctl_vol_args *vol_args; struct btrfs_trans_handle *trans; struct btrfs_device *device = NULL; @@ -1318,6 +1319,10 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, if (!capable(CAP_SYS_ADMIN)) return -EPERM; + ret = mnt_want_write_file(file); + if (ret) + return ret; + if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, 1)) { pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); @@ -1425,6 +1430,7 @@ out_free: kfree(vol_args); out: mutex_unlock(&root->fs_info->volume_mutex); + mnt_drop_write_file(file); atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); return ret; } @@ -3832,7 +3838,7 @@ long btrfs_ioctl(struct file *file, unsigned int case BTRFS_IOC_DEFRAG_RANGE: return btrfs_ioctl_defrag(file, argp); case BTRFS_IOC_RESIZE: - return btrfs_ioctl_resize(root, argp); + return btrfs_ioctl_resize(file, argp); case BTRFS_IOC_ADD_DEV: return btrfs_ioctl_add_dev(root, argp); case BTRFS_IOC_RM_DEV: -- cgit v0.10.2 From da24927b1e1925da5c1885cb483231dabe027e15 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Mon, 26 Nov 2012 08:44:50 +0000 Subject: Btrfs: get write access when removing a device Steps to reproduce: # mkfs.btrfs -d single -m single # mount -o ro # mount -o ro # mount -o remount,rw # umount # btrfs device delete We can remove a device from a R/O filesystem. The reason is that we just check the R/O flag of the super block object. It is not enough, because the kernel may set the R/O flag only for the mount point. We need invoke mnt_want_write_file() to do a full check. Signed-off-by: Miao Xie Signed-off-by: Chris Mason diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 2be49b4..ee36009 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2270,20 +2270,23 @@ out: return ret; } -static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg) +static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) { + struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; struct btrfs_ioctl_vol_args *vol_args; int ret; if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (root->fs_info->sb->s_flags & MS_RDONLY) - return -EROFS; + ret = mnt_want_write_file(file); + if (ret) + return ret; if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, 1)) { pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); + mnt_drop_write_file(file); return -EINPROGRESS; } @@ -2300,6 +2303,7 @@ static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg) kfree(vol_args); out: mutex_unlock(&root->fs_info->volume_mutex); + mnt_drop_write_file(file); atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); return ret; } @@ -3842,7 +3846,7 @@ long btrfs_ioctl(struct file *file, unsigned int case BTRFS_IOC_ADD_DEV: return btrfs_ioctl_add_dev(root, argp); case BTRFS_IOC_RM_DEV: - return btrfs_ioctl_rm_dev(root, argp); + return btrfs_ioctl_rm_dev(file, argp); case BTRFS_IOC_FS_INFO: return btrfs_ioctl_fs_info(root, argp); case BTRFS_IOC_DEV_INFO: -- cgit v0.10.2 From b8e95489bf0ddf767e4bd38f537e0adad16ee830 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Mon, 26 Nov 2012 08:48:01 +0000 Subject: Btrfs: get write access for scrub We need get write access for scrub, or we will modify the R/O fs. Signed-off-by: Miao Xie Signed-off-by: Chris Mason diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index ee36009..12b18c0 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3127,10 +3127,11 @@ static noinline long btrfs_ioctl_wait_sync(struct btrfs_root *root, return btrfs_wait_for_commit(root, transid); } -static long btrfs_ioctl_scrub(struct btrfs_root *root, void __user *arg) +static long btrfs_ioctl_scrub(struct file *file, void __user *arg) { - int ret; + struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; struct btrfs_ioctl_scrub_args *sa; + int ret; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -3139,6 +3140,12 @@ static long btrfs_ioctl_scrub(struct btrfs_root *root, void __user *arg) if (IS_ERR(sa)) return PTR_ERR(sa); + if (!(sa->flags & BTRFS_SCRUB_READONLY)) { + ret = mnt_want_write_file(file); + if (ret) + goto out; + } + ret = btrfs_scrub_dev(root->fs_info, sa->devid, sa->start, sa->end, &sa->progress, sa->flags & BTRFS_SCRUB_READONLY, 0); @@ -3146,6 +3153,9 @@ static long btrfs_ioctl_scrub(struct btrfs_root *root, void __user *arg) if (copy_to_user(arg, sa, sizeof(*sa))) ret = -EFAULT; + if (!(sa->flags & BTRFS_SCRUB_READONLY)) + mnt_drop_write_file(file); +out: kfree(sa); return ret; } @@ -3879,7 +3889,7 @@ long btrfs_ioctl(struct file *file, unsigned int case BTRFS_IOC_WAIT_SYNC: return btrfs_ioctl_wait_sync(root, argp); case BTRFS_IOC_SCRUB: - return btrfs_ioctl_scrub(root, argp); + return btrfs_ioctl_scrub(file, argp); case BTRFS_IOC_SCRUB_CANCEL: return btrfs_ioctl_scrub_cancel(root, argp); case BTRFS_IOC_SCRUB_PROGRESS: -- cgit v0.10.2 From 905b0dda06a064db08b8a814e968786ff3c4cc19 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Mon, 26 Nov 2012 08:50:11 +0000 Subject: Btrfs: get write access for qgroup operations We need get write access for qgroup operations, or we will modify the R/O fs. Signed-off-by: Miao Xie Signed-off-by: Chris Mason diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 12b18c0..657d83c 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3558,8 +3558,9 @@ out: return ret; } -static long btrfs_ioctl_quota_ctl(struct btrfs_root *root, void __user *arg) +static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg) { + struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; struct btrfs_ioctl_quota_ctl_args *sa; struct btrfs_trans_handle *trans = NULL; int ret; @@ -3568,12 +3569,15 @@ static long btrfs_ioctl_quota_ctl(struct btrfs_root *root, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (root->fs_info->sb->s_flags & MS_RDONLY) - return -EROFS; + ret = mnt_want_write_file(file); + if (ret) + return ret; sa = memdup_user(arg, sizeof(*sa)); - if (IS_ERR(sa)) - return PTR_ERR(sa); + if (IS_ERR(sa)) { + ret = PTR_ERR(sa); + goto drop_write; + } if (sa->cmd != BTRFS_QUOTA_CTL_RESCAN) { trans = btrfs_start_transaction(root, 2); @@ -3606,14 +3610,16 @@ static long btrfs_ioctl_quota_ctl(struct btrfs_root *root, void __user *arg) if (err && !ret) ret = err; } - out: kfree(sa); +drop_write: + mnt_drop_write_file(file); return ret; } -static long btrfs_ioctl_qgroup_assign(struct btrfs_root *root, void __user *arg) +static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg) { + struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; struct btrfs_ioctl_qgroup_assign_args *sa; struct btrfs_trans_handle *trans; int ret; @@ -3622,12 +3628,15 @@ static long btrfs_ioctl_qgroup_assign(struct btrfs_root *root, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (root->fs_info->sb->s_flags & MS_RDONLY) - return -EROFS; + ret = mnt_want_write_file(file); + if (ret) + return ret; sa = memdup_user(arg, sizeof(*sa)); - if (IS_ERR(sa)) - return PTR_ERR(sa); + if (IS_ERR(sa)) { + ret = PTR_ERR(sa); + goto drop_write; + } trans = btrfs_join_transaction(root); if (IS_ERR(trans)) { @@ -3650,11 +3659,14 @@ static long btrfs_ioctl_qgroup_assign(struct btrfs_root *root, void __user *arg) out: kfree(sa); +drop_write: + mnt_drop_write_file(file); return ret; } -static long btrfs_ioctl_qgroup_create(struct btrfs_root *root, void __user *arg) +static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg) { + struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; struct btrfs_ioctl_qgroup_create_args *sa; struct btrfs_trans_handle *trans; int ret; @@ -3663,12 +3675,15 @@ static long btrfs_ioctl_qgroup_create(struct btrfs_root *root, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (root->fs_info->sb->s_flags & MS_RDONLY) - return -EROFS; + ret = mnt_want_write_file(file); + if (ret) + return ret; sa = memdup_user(arg, sizeof(*sa)); - if (IS_ERR(sa)) - return PTR_ERR(sa); + if (IS_ERR(sa)) { + ret = PTR_ERR(sa); + goto drop_write; + } trans = btrfs_join_transaction(root); if (IS_ERR(trans)) { @@ -3690,11 +3705,14 @@ static long btrfs_ioctl_qgroup_create(struct btrfs_root *root, void __user *arg) out: kfree(sa); +drop_write: + mnt_drop_write_file(file); return ret; } -static long btrfs_ioctl_qgroup_limit(struct btrfs_root *root, void __user *arg) +static long btrfs_ioctl_qgroup_limit(struct file *file, void __user *arg) { + struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; struct btrfs_ioctl_qgroup_limit_args *sa; struct btrfs_trans_handle *trans; int ret; @@ -3704,12 +3722,15 @@ static long btrfs_ioctl_qgroup_limit(struct btrfs_root *root, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (root->fs_info->sb->s_flags & MS_RDONLY) - return -EROFS; + ret = mnt_want_write_file(file); + if (ret) + return ret; sa = memdup_user(arg, sizeof(*sa)); - if (IS_ERR(sa)) - return PTR_ERR(sa); + if (IS_ERR(sa)) { + ret = PTR_ERR(sa); + goto drop_write; + } trans = btrfs_join_transaction(root); if (IS_ERR(trans)) { @@ -3732,6 +3753,8 @@ static long btrfs_ioctl_qgroup_limit(struct btrfs_root *root, void __user *arg) out: kfree(sa); +drop_write: + mnt_drop_write_file(file); return ret; } @@ -3907,13 +3930,13 @@ long btrfs_ioctl(struct file *file, unsigned int case BTRFS_IOC_GET_DEV_STATS: return btrfs_ioctl_get_dev_stats(root, argp); case BTRFS_IOC_QUOTA_CTL: - return btrfs_ioctl_quota_ctl(root, argp); + return btrfs_ioctl_quota_ctl(file, argp); case BTRFS_IOC_QGROUP_ASSIGN: - return btrfs_ioctl_qgroup_assign(root, argp); + return btrfs_ioctl_qgroup_assign(file, argp); case BTRFS_IOC_QGROUP_CREATE: - return btrfs_ioctl_qgroup_create(root, argp); + return btrfs_ioctl_qgroup_create(file, argp); case BTRFS_IOC_QGROUP_LIMIT: - return btrfs_ioctl_qgroup_limit(root, argp); + return btrfs_ioctl_qgroup_limit(file, argp); case BTRFS_IOC_DEV_REPLACE: return btrfs_ioctl_dev_replace(root, argp); } -- cgit v0.10.2 From 9247f3170b2c3d648707c93bbebcd763fac17c06 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Mon, 26 Nov 2012 09:24:43 +0000 Subject: Btrfs: use slabs for auto defrag allocation The auto defrag allocation is in the fast path of the IO, so use slabs to improve the speed of the allocation. And besides that, it can do check for leaked objects when the module is removed. Signed-off-by: Miao Xie Signed-off-by: Chris Mason diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 91ff078..389c057 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3505,6 +3505,8 @@ void btrfs_get_block_group_info(struct list_head *groups_list, struct btrfs_ioctl_space_info *space); /* file.c */ +int btrfs_auto_defrag_init(void); +void btrfs_auto_defrag_exit(void); int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, struct inode *inode); int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index bd7f1b0..15117ea 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -41,6 +41,7 @@ #include "compat.h" #include "volumes.h" +static struct kmem_cache *btrfs_inode_defrag_cachep; /* * when auto defrag is enabled we * queue up these defrag structs to remember which @@ -127,7 +128,7 @@ static void __btrfs_add_inode_defrag(struct inode *inode, return; exists: - kfree(defrag); + kmem_cache_free(btrfs_inode_defrag_cachep, defrag); return; } @@ -157,7 +158,7 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, else transid = BTRFS_I(inode)->root->last_trans; - defrag = kzalloc(sizeof(*defrag), GFP_NOFS); + defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS); if (!defrag) return -ENOMEM; @@ -169,7 +170,7 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, if (!test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags)) __btrfs_add_inode_defrag(inode, defrag); else - kfree(defrag); + kmem_cache_free(btrfs_inode_defrag_cachep, defrag); spin_unlock(&root->fs_info->defrag_inodes_lock); return 0; } @@ -315,7 +316,8 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) next: spin_lock(&fs_info->defrag_inodes_lock); next_free: - kfree(defrag); + if (defrag) + kmem_cache_free(btrfs_inode_defrag_cachep, defrag); } spin_unlock(&fs_info->defrag_inodes_lock); @@ -2293,3 +2295,21 @@ const struct file_operations btrfs_file_operations = { .compat_ioctl = btrfs_ioctl, #endif }; + +void btrfs_auto_defrag_exit(void) +{ + if (btrfs_inode_defrag_cachep) + kmem_cache_destroy(btrfs_inode_defrag_cachep); +} + +int btrfs_auto_defrag_init(void) +{ + btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag", + sizeof(struct inode_defrag), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, + NULL); + if (!btrfs_inode_defrag_cachep) + return -ENOMEM; + + return 0; +} diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index def4f24..99545df 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1680,10 +1680,14 @@ static int __init init_btrfs_fs(void) if (err) goto free_ordered_data; - err = btrfs_interface_init(); + err = btrfs_auto_defrag_init(); if (err) goto free_delayed_inode; + err = btrfs_interface_init(); + if (err) + goto free_auto_defrag; + err = register_filesystem(&btrfs_fs_type); if (err) goto unregister_ioctl; @@ -1695,6 +1699,8 @@ static int __init init_btrfs_fs(void) unregister_ioctl: btrfs_interface_exit(); +free_auto_defrag: + btrfs_auto_defrag_exit(); free_delayed_inode: btrfs_delayed_inode_exit(); free_ordered_data: @@ -1714,6 +1720,7 @@ free_compress: static void __exit exit_btrfs_fs(void) { btrfs_destroy_cachep(); + btrfs_auto_defrag_exit(); btrfs_delayed_inode_exit(); ordered_data_exit(); extent_map_exit(); -- cgit v0.10.2 From 8ddc473433b5e8ce8693db9f6e251f5a28267528 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Mon, 26 Nov 2012 09:25:38 +0000 Subject: Btrfs: fix unprotected defragable inode insertion We forget to get the defrag lock when we re-add the defragable inode, Fix it. Signed-off-by: Miao Xie Signed-off-by: Chris Mason diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 15117ea..0091832 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -91,7 +91,7 @@ static int __compare_inode_defrag(struct inode_defrag *defrag1, * If an existing record is found the defrag item you * pass in is freed */ -static void __btrfs_add_inode_defrag(struct inode *inode, +static int __btrfs_add_inode_defrag(struct inode *inode, struct inode_defrag *defrag) { struct btrfs_root *root = BTRFS_I(inode)->root; @@ -119,18 +119,24 @@ static void __btrfs_add_inode_defrag(struct inode *inode, entry->transid = defrag->transid; if (defrag->last_offset > entry->last_offset) entry->last_offset = defrag->last_offset; - goto exists; + return -EEXIST; } } set_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags); rb_link_node(&defrag->rb_node, parent, p); rb_insert_color(&defrag->rb_node, &root->fs_info->defrag_inodes); - return; + return 0; +} -exists: - kmem_cache_free(btrfs_inode_defrag_cachep, defrag); - return; +static inline int __need_auto_defrag(struct btrfs_root *root) +{ + if (!btrfs_test_opt(root, AUTO_DEFRAG)) + return 0; + + if (btrfs_fs_closing(root->fs_info)) + return 0; + return 1; } /* @@ -143,11 +149,9 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, struct btrfs_root *root = BTRFS_I(inode)->root; struct inode_defrag *defrag; u64 transid; + int ret; - if (!btrfs_test_opt(root, AUTO_DEFRAG)) - return 0; - - if (btrfs_fs_closing(root->fs_info)) + if (!__need_auto_defrag(root)) return 0; if (test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags)) @@ -167,15 +171,51 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, defrag->root = root->root_key.objectid; spin_lock(&root->fs_info->defrag_inodes_lock); - if (!test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags)) - __btrfs_add_inode_defrag(inode, defrag); - else + if (!test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags)) { + /* + * If we set IN_DEFRAG flag and evict the inode from memory, + * and then re-read this inode, this new inode doesn't have + * IN_DEFRAG flag. At the case, we may find the existed defrag. + */ + ret = __btrfs_add_inode_defrag(inode, defrag); + if (ret) + kmem_cache_free(btrfs_inode_defrag_cachep, defrag); + } else { kmem_cache_free(btrfs_inode_defrag_cachep, defrag); + } spin_unlock(&root->fs_info->defrag_inodes_lock); return 0; } /* + * Requeue the defrag object. If there is a defrag object that points to + * the same inode in the tree, we will merge them together (by + * __btrfs_add_inode_defrag()) and free the one that we want to requeue. + */ +void btrfs_requeue_inode_defrag(struct inode *inode, + struct inode_defrag *defrag) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret; + + if (!__need_auto_defrag(root)) + goto out; + + /* + * Here we don't check the IN_DEFRAG flag, because we need merge + * them together. + */ + spin_lock(&root->fs_info->defrag_inodes_lock); + ret = __btrfs_add_inode_defrag(inode, defrag); + spin_unlock(&root->fs_info->defrag_inodes_lock); + if (ret) + goto out; + return; +out: + kmem_cache_free(btrfs_inode_defrag_cachep, defrag); +} + +/* * must be called with the defrag_inodes lock held */ struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info, @@ -294,7 +334,7 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) */ if (num_defrag == defrag_batch) { defrag->last_offset = range.start; - __btrfs_add_inode_defrag(inode, defrag); + btrfs_requeue_inode_defrag(inode, defrag); /* * we don't want to kfree defrag, we added it back to * the rbtree @@ -308,7 +348,7 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) */ defrag->last_offset = 0; defrag->cycled = 1; - __btrfs_add_inode_defrag(inode, defrag); + btrfs_requeue_inode_defrag(inode, defrag); defrag = NULL; } -- cgit v0.10.2 From 26176e7c2aa923327becdc25b5aca2cb907ac932 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Mon, 26 Nov 2012 09:26:20 +0000 Subject: Btrfs: restructure btrfs_run_defrag_inodes() This patch restructure btrfs_run_defrag_inodes() and make the code of the auto defragment more readable. Signed-off-by: Miao Xie Signed-off-by: Chris Mason diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 389c057..6ba56ae 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3510,6 +3510,7 @@ void btrfs_auto_defrag_exit(void); int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, struct inode *inode); int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info); +void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info); int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync); void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, int skip_pinned); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 76b8250..3229531 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3329,7 +3329,7 @@ int close_ctree(struct btrfs_root *root) (atomic_read(&fs_info->defrag_running) == 0)); /* clear out the rbtree of defraggable inodes */ - btrfs_run_defrag_inodes(fs_info); + btrfs_cleanup_defrag_inodes(fs_info); if (!(fs_info->sb->s_flags & MS_RDONLY)) { ret = btrfs_commit_super(root); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 0091832..3c6f747 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -216,11 +216,11 @@ out: } /* - * must be called with the defrag_inodes lock held + * pick the defragable inode that we want, if it doesn't exist, we will get + * the next one. */ -struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info, - u64 root, u64 ino, - struct rb_node **next) +static struct inode_defrag * +btrfs_pick_defrag_inode(struct btrfs_fs_info *fs_info, u64 root, u64 ino) { struct inode_defrag *entry = NULL; struct inode_defrag tmp; @@ -231,7 +231,8 @@ struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info, tmp.ino = ino; tmp.root = root; - p = info->defrag_inodes.rb_node; + spin_lock(&fs_info->defrag_inodes_lock); + p = fs_info->defrag_inodes.rb_node; while (p) { parent = p; entry = rb_entry(parent, struct inode_defrag, rb_node); @@ -242,52 +243,128 @@ struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info, else if (ret > 0) p = parent->rb_right; else - return entry; + goto out; } - if (next) { - while (parent && __compare_inode_defrag(&tmp, entry) > 0) { - parent = rb_next(parent); + if (parent && __compare_inode_defrag(&tmp, entry) > 0) { + parent = rb_next(parent); + if (parent) entry = rb_entry(parent, struct inode_defrag, rb_node); - } - *next = parent; + else + entry = NULL; } - return NULL; +out: + if (entry) + rb_erase(parent, &fs_info->defrag_inodes); + spin_unlock(&fs_info->defrag_inodes_lock); + return entry; } -/* - * run through the list of inodes in the FS that need - * defragging - */ -int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) +void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info) { struct inode_defrag *defrag; + struct rb_node *node; + + spin_lock(&fs_info->defrag_inodes_lock); + node = rb_first(&fs_info->defrag_inodes); + while (node) { + rb_erase(node, &fs_info->defrag_inodes); + defrag = rb_entry(node, struct inode_defrag, rb_node); + kmem_cache_free(btrfs_inode_defrag_cachep, defrag); + + if (need_resched()) { + spin_unlock(&fs_info->defrag_inodes_lock); + cond_resched(); + spin_lock(&fs_info->defrag_inodes_lock); + } + + node = rb_first(&fs_info->defrag_inodes); + } + spin_unlock(&fs_info->defrag_inodes_lock); +} + +#define BTRFS_DEFRAG_BATCH 1024 + +static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, + struct inode_defrag *defrag) +{ struct btrfs_root *inode_root; struct inode *inode; - struct rb_node *n; struct btrfs_key key; struct btrfs_ioctl_defrag_range_args range; - u64 first_ino = 0; - u64 root_objectid = 0; int num_defrag; - int defrag_batch = 1024; + /* get the inode */ + key.objectid = defrag->root; + btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); + key.offset = (u64)-1; + inode_root = btrfs_read_fs_root_no_name(fs_info, &key); + if (IS_ERR(inode_root)) { + kmem_cache_free(btrfs_inode_defrag_cachep, defrag); + return PTR_ERR(inode_root); + } + + key.objectid = defrag->ino; + btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); + key.offset = 0; + inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL); + if (IS_ERR(inode)) { + kmem_cache_free(btrfs_inode_defrag_cachep, defrag); + return PTR_ERR(inode); + } + + /* do a chunk of defrag */ + clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags); memset(&range, 0, sizeof(range)); range.len = (u64)-1; + range.start = defrag->last_offset; + num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid, + BTRFS_DEFRAG_BATCH); + /* + * if we filled the whole defrag batch, there + * must be more work to do. Queue this defrag + * again + */ + if (num_defrag == BTRFS_DEFRAG_BATCH) { + defrag->last_offset = range.start; + btrfs_requeue_inode_defrag(inode, defrag); + } else if (defrag->last_offset && !defrag->cycled) { + /* + * we didn't fill our defrag batch, but + * we didn't start at zero. Make sure we loop + * around to the start of the file. + */ + defrag->last_offset = 0; + defrag->cycled = 1; + btrfs_requeue_inode_defrag(inode, defrag); + } else { + kmem_cache_free(btrfs_inode_defrag_cachep, defrag); + } + + iput(inode); + return 0; +} + +/* + * run through the list of inodes in the FS that need + * defragging + */ +int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) +{ + struct inode_defrag *defrag; + u64 first_ino = 0; + u64 root_objectid = 0; atomic_inc(&fs_info->defrag_running); - spin_lock(&fs_info->defrag_inodes_lock); while(1) { - n = NULL; + if (!__need_auto_defrag(fs_info->tree_root)) + break; /* find an inode to defrag */ - defrag = btrfs_find_defrag_inode(fs_info, root_objectid, - first_ino, &n); + defrag = btrfs_pick_defrag_inode(fs_info, root_objectid, + first_ino); if (!defrag) { - if (n) { - defrag = rb_entry(n, struct inode_defrag, - rb_node); - } else if (root_objectid || first_ino) { + if (root_objectid || first_ino) { root_objectid = 0; first_ino = 0; continue; @@ -296,71 +373,11 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) } } - /* remove it from the rbtree */ first_ino = defrag->ino + 1; root_objectid = defrag->root; - rb_erase(&defrag->rb_node, &fs_info->defrag_inodes); - - if (btrfs_fs_closing(fs_info)) - goto next_free; - spin_unlock(&fs_info->defrag_inodes_lock); - - /* get the inode */ - key.objectid = defrag->root; - btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); - key.offset = (u64)-1; - inode_root = btrfs_read_fs_root_no_name(fs_info, &key); - if (IS_ERR(inode_root)) - goto next; - - key.objectid = defrag->ino; - btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); - key.offset = 0; - - inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL); - if (IS_ERR(inode)) - goto next; - - /* do a chunk of defrag */ - clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags); - range.start = defrag->last_offset; - num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid, - defrag_batch); - /* - * if we filled the whole defrag batch, there - * must be more work to do. Queue this defrag - * again - */ - if (num_defrag == defrag_batch) { - defrag->last_offset = range.start; - btrfs_requeue_inode_defrag(inode, defrag); - /* - * we don't want to kfree defrag, we added it back to - * the rbtree - */ - defrag = NULL; - } else if (defrag->last_offset && !defrag->cycled) { - /* - * we didn't fill our defrag batch, but - * we didn't start at zero. Make sure we loop - * around to the start of the file. - */ - defrag->last_offset = 0; - defrag->cycled = 1; - btrfs_requeue_inode_defrag(inode, defrag); - defrag = NULL; - } - - iput(inode); -next: - spin_lock(&fs_info->defrag_inodes_lock); -next_free: - if (defrag) - kmem_cache_free(btrfs_inode_defrag_cachep, defrag); + __btrfs_run_defrag_inode(fs_info, defrag); } - spin_unlock(&fs_info->defrag_inodes_lock); - atomic_dec(&fs_info->defrag_running); /* -- cgit v0.10.2 From b66f00da0cfceb856c17706b77906b63437f6fda Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Mon, 26 Nov 2012 09:27:29 +0000 Subject: Btrfs: fix freeze vs auto defrag If we freeze the fs, the auto defragment should not run. Fix it. Signed-off-by: Miao Xie Signed-off-by: Chris Mason diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 3c6f747..d415a05 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -318,8 +318,11 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, memset(&range, 0, sizeof(range)); range.len = (u64)-1; range.start = defrag->last_offset; + + sb_start_write(fs_info->sb); num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid, BTRFS_DEFRAG_BATCH); + sb_end_write(fs_info->sb); /* * if we filled the whole defrag batch, there * must be more work to do. Queue this defrag -- cgit v0.10.2 From cb3806ec88a7e1e9d1fbde34cbc0bf153b7e7c3f Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Tue, 27 Nov 2012 16:10:21 +0000 Subject: Btrfs: fix race in check-integrity caused by usage of bitfield The structure member mirror_num is modified concurrently to the structure member is_iodone. This doesn't require any locking by design, unless everything is stored in the same 32 bits of a bit field. This was the case and xfstest 284 was able to trigger false warnings from the checker code. This patch seperates the bits and fixes the race. Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index badc6f1..11d47bf 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -137,7 +137,7 @@ struct btrfsic_block { unsigned int never_written:1; /* block was added because it was * referenced, not because it was * written */ - unsigned int mirror_num:2; /* large enough to hold + unsigned int mirror_num; /* large enough to hold * BTRFS_SUPER_MIRROR_MAX */ struct btrfsic_dev_state *dev_state; u64 dev_bytenr; /* key, physical byte num on disk */ -- cgit v0.10.2 From f9c83748deb94aba89b5c1085f887ddcdab223ef Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Tue, 27 Nov 2012 17:39:50 +0000 Subject: Btrfs: fix a build warning for an unused label This issue was detected by the "0-DAY kernel build testing". fs/btrfs/volumes.c: In function 'btrfs_rm_device': fs/btrfs/volumes.c:1505:1: warning: label 'error_close' defined but not used [-Wunused-label] Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 32a4948..0b1b7e2 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1544,7 +1544,6 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) error_brelse: brelse(bh); -error_close: if (bdev) blkdev_put(bdev, FMODE_READ | FMODE_EXCL); out: -- cgit v0.10.2 From af1be4f851db4f4975f0139211a6561776ef37c0 Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Tue, 27 Nov 2012 17:39:51 +0000 Subject: Btrfs: fix a scrub regression in case of write errors This regression was introduced by the device-replace patches. Scrub immediately stops checking those disks that have write errors. This is nothing that happens in the real world, but it is wrong since scrub is the tool to detect and repair defects. Fix it. Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 30ba997..8db6a64 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -2657,7 +2657,8 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, btrfs_put_block_group(cache); if (ret) break; - if (atomic64_read(&dev_replace->num_write_errors) > 0) { + if (is_dev_replace && + atomic64_read(&dev_replace->num_write_errors) > 0) { ret = -EIO; break; } -- cgit v0.10.2 From 797f4277113bff142b6c64a55abaef64d7d67d5c Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Wed, 28 Nov 2012 10:28:07 +0000 Subject: Btrfs: use existing align macros in btrfs_allocate() The kernel developers have implemented some often-used align macros, we should use them instead of the complex code. Signed-off-by: Miao Xie Signed-off-by: Chris Mason diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index d415a05..a43d0ae 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2054,12 +2054,12 @@ static long btrfs_fallocate(struct file *file, int mode, u64 alloc_end; u64 alloc_hint = 0; u64 locked_end; - u64 mask = BTRFS_I(inode)->root->sectorsize - 1; struct extent_map *em; + int blocksize = BTRFS_I(inode)->root->sectorsize; int ret; - alloc_start = offset & ~mask; - alloc_end = (offset + len + mask) & ~mask; + alloc_start = round_down(offset, blocksize); + alloc_end = round_up(offset + len, blocksize); /* Make sure we aren't being give some crap mode */ if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) @@ -2140,7 +2140,7 @@ static long btrfs_fallocate(struct file *file, int mode, } last_byte = min(extent_map_end(em), alloc_end); actual_end = min_t(u64, extent_map_end(em), offset + len); - last_byte = (last_byte + mask) & ~mask; + last_byte = ALIGN(last_byte, blocksize); if (em->block_start == EXTENT_MAP_HOLE || (cur_offset >= inode->i_size && -- cgit v0.10.2 From 0ff6fabdb0a862b22df4dd75873578392478e64d Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Wed, 28 Nov 2012 10:28:54 +0000 Subject: Btrfs: fix off-by-one error of the reserved size of btrfs_allocate() alloc_end is not the real end of the current extent, it is the start of the next adjoining extent. So we needn't +1 when calculating the size the space that is about to be reserved. Signed-off-by: Miao Xie Reviewed-by: Liu Bo Signed-off-by: Chris Mason diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index a43d0ae..8e3d678 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2072,7 +2072,7 @@ static long btrfs_fallocate(struct file *file, int mode, * Make sure we have enough space before we do the * allocation. */ - ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start + 1); + ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start); if (ret) return ret; @@ -2179,7 +2179,7 @@ static long btrfs_fallocate(struct file *file, int mode, out: mutex_unlock(&inode->i_mutex); /* Let go of our reservation. */ - btrfs_free_reserved_data_space(inode, alloc_end - alloc_start + 1); + btrfs_free_reserved_data_space(inode, alloc_end - alloc_start); return ret; } -- cgit v0.10.2 From 755ac67f83e515af55adbfe55134eb7d90839cdb Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Wed, 28 Nov 2012 10:43:11 +0000 Subject: Btrfs: skip adding an acl attribute if we don't have to If the acl can be exactly represented in the traditional file mode permission bits, we don't set another acl attribute. Signed-off-by: Liu Bo Signed-off-by: Chris Mason diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 0c16e3d..e15d2b0 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -121,6 +121,8 @@ static int btrfs_set_acl(struct btrfs_trans_handle *trans, ret = posix_acl_equiv_mode(acl, &inode->i_mode); if (ret < 0) return ret; + if (ret == 0) + acl = NULL; } ret = 0; break; -- cgit v0.10.2 From 01e6deb25ae11e7b85484bf5e550eb540c50c63e Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Wed, 28 Nov 2012 10:43:12 +0000 Subject: Btrfs: don't add a NULL extended attribute Passing a null extended attribute value means to remove the attribute, but we don't have to add a new NULL extended attribute. Signed-off-by: Liu Bo Signed-off-by: Chris Mason diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index e9d3840..aef6bb3 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -122,6 +122,16 @@ static int do_setxattr(struct btrfs_trans_handle *trans, */ if (!value) goto out; + } else { + di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(inode), + name, name_len, 0); + if (IS_ERR(di)) { + ret = PTR_ERR(di); + goto out; + } + if (!di && !value) + goto out; + btrfs_release_path(path); } again: -- cgit v0.10.2 From 05dadc09f52ad5a631da1aa8767c5b80e121f0c4 Mon Sep 17 00:00:00 2001 From: Tsutomu Itoh Date: Thu, 29 Nov 2012 05:08:26 +0000 Subject: Btrfs: add fiemap's flag check When the flag not supported is specified, it is necessary to return the error to the caller. So, we add the validity check of the fiemap's flag. Signed-off-by: Tsutomu Itoh Signed-off-by: Chris Mason diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d7bf2e7..a1761f0 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6595,9 +6595,17 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, btrfs_submit_direct, 0); } +#define BTRFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC) + static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len) { + int ret; + + ret = fiemap_check_flags(fieinfo, BTRFS_FIEMAP_FLAGS); + if (ret) + return ret; + return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent_fiemap); } -- cgit v0.10.2 From 2794ed013b3551cbae887ea1b93c52aaacb7370d Mon Sep 17 00:00:00 2001 From: Filipe Brandenburger Date: Fri, 30 Nov 2012 03:40:08 +0000 Subject: Btrfs: fix permissions of empty files not affected by umask When a new file is created with btrfs_create(), the inode will initially be created with permissions 0666 and later on in btrfs_init_acl() it will be adapted to mask out the umask bits. The problem is that this change won't make it into the btrfs_inode unless there's another change to the inode (e.g. writing content changing the size or touching the file changing the mtime.) This fix adds a call to btrfs_update_inode() to btrfs_create() to make sure that the change will not get lost if the in-memory inode is flushed before other changes are made to the file. Signed-off-by: Filipe Brandenburger Reviewed-by: Liu Bo Signed-off-by: Chris Mason diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a1761f0..adab791 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4951,6 +4951,12 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, goto out_unlock; } + err = btrfs_update_inode(trans, root, inode); + if (err) { + drop_inode = 1; + goto out_unlock; + } + /* * If the active LSM wants to access the inode during * d_instantiate it needs these. Smack checks to see -- cgit v0.10.2 From 43baa579b3b1f059f68c51ef754ec59c87a35745 Mon Sep 17 00:00:00 2001 From: Filipe Brandenburger Date: Fri, 30 Nov 2012 03:40:09 +0000 Subject: Btrfs: refactor error handling to drop inode in btrfs_create() Refactor it by checking whether the inode has been created and needs to be dropped (drop_inode_on_err) and also if the err variable is set. That way the variable doesn't need to be set on each and every error handling block. Signed-off-by: Filipe Brandenburger Signed-off-by: Chris Mason diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index adab791..657f16d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4989,7 +4989,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(dir)->root; struct inode *inode = NULL; - int drop_inode = 0; + int drop_inode_on_err = 0; int err; u64 objectid; u64 index = 0; @@ -5014,12 +5014,11 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, err = PTR_ERR(inode); goto out_unlock; } + drop_inode_on_err = 1; err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); - if (err) { - drop_inode = 1; + if (err) goto out_unlock; - } /* * If the active LSM wants to access the inode during @@ -5032,16 +5031,16 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); if (err) - drop_inode = 1; - else { - inode->i_mapping->a_ops = &btrfs_aops; - inode->i_mapping->backing_dev_info = &root->fs_info->bdi; - BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; - d_instantiate(dentry, inode); - } + goto out_unlock; + + inode->i_mapping->a_ops = &btrfs_aops; + inode->i_mapping->backing_dev_info = &root->fs_info->bdi; + BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; + d_instantiate(dentry, inode); + out_unlock: btrfs_end_transaction(trans, root); - if (drop_inode) { + if (err && drop_inode_on_err) { inode_dec_link_count(inode); iput(inode); } -- cgit v0.10.2 From 960097622d48bf0ee8f6c0cf751a904066c4b45b Mon Sep 17 00:00:00 2001 From: Wang Sheng-Hui Date: Fri, 30 Nov 2012 06:30:14 +0000 Subject: Btrfs: use ctl->unit for free space calculation instead of block_group->sectorsize We should use ctl->unit for free space calculation instead of block_group->sectorsize even though for free space use_bitmap or free space cluster we only have sectorsize assigned to ctl->unit currently. Also, we can keep it consisten in code style. Signed-off-by: Wang Sheng-Hui Signed-off-by: Chris Mason diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index efdd1d3..59ea2e4 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1353,7 +1353,7 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl) u64 bitmap_bytes; u64 extent_bytes; u64 size = block_group->key.offset; - u64 bytes_per_bg = BITS_PER_BITMAP * block_group->sectorsize; + u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit; int max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg); BUG_ON(ctl->total_bitmaps > max_bitmaps); @@ -1639,8 +1639,7 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl, * some block groups are so tiny they can't be enveloped by a bitmap, so * don't even bother to create a bitmap for this */ - if (BITS_PER_BITMAP * block_group->sectorsize > - block_group->key.offset) + if (BITS_PER_BITMAP * ctl->unit > block_group->key.offset) return false; return true; @@ -2287,10 +2286,10 @@ static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group, unsigned long total_found = 0; int ret; - i = offset_to_bit(entry->offset, block_group->sectorsize, + i = offset_to_bit(entry->offset, ctl->unit, max_t(u64, offset, entry->offset)); - want_bits = bytes_to_bits(bytes, block_group->sectorsize); - min_bits = bytes_to_bits(min_bytes, block_group->sectorsize); + want_bits = bytes_to_bits(bytes, ctl->unit); + min_bits = bytes_to_bits(min_bytes, ctl->unit); again: found_bits = 0; @@ -2314,23 +2313,22 @@ again: total_found += found_bits; - if (cluster->max_size < found_bits * block_group->sectorsize) - cluster->max_size = found_bits * block_group->sectorsize; + if (cluster->max_size < found_bits * ctl->unit) + cluster->max_size = found_bits * ctl->unit; if (total_found < want_bits || cluster->max_size < cont1_bytes) { i = next_zero + 1; goto again; } - cluster->window_start = start * block_group->sectorsize + - entry->offset; + cluster->window_start = start * ctl->unit + entry->offset; rb_erase(&entry->offset_index, &ctl->free_space_offset); ret = tree_insert_offset(&cluster->root, entry->offset, &entry->offset_index, 1); BUG_ON(ret); /* -EEXIST; Logic error */ trace_btrfs_setup_cluster(block_group, cluster, - total_found * block_group->sectorsize, 1); + total_found * ctl->unit, 1); return 0; } -- cgit v0.10.2 From fb57dc817c24d46b035320d871b7a3fcc778558d Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Fri, 30 Nov 2012 11:24:22 +0000 Subject: Btrfs: parse parent 0 into correct value in tracepoint Value 0 is not a tree id, so besides an upper limit, a lower limit is necessary as well while parsing root types of tracepoint. Signed-off-by: Liu Bo Signed-off-by: Chris Mason diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index 54fab04..ea546a4 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -45,7 +45,8 @@ struct extent_buffer; #define show_root_type(obj) \ obj, ((obj >= BTRFS_DATA_RELOC_TREE_OBJECTID) || \ - (obj <= BTRFS_CSUM_TREE_OBJECTID )) ? __show_root_type(obj) : "-" + (obj >= BTRFS_ROOT_TREE_OBJECTID && \ + obj <= BTRFS_CSUM_TREE_OBJECTID)) ? __show_root_type(obj) : "-" #define BTRFS_GROUP_FLAGS \ { BTRFS_BLOCK_GROUP_DATA, "DATA"}, \ -- cgit v0.10.2 From 543eabd5e1929bc73e22b279aa911eb01447535f Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Wed, 5 Dec 2012 10:52:48 +0000 Subject: Btrfs: don't auto defrag a file when doing directIO If we runt the direct IO, we should not run auto defrag, because it may introduce buffered IO vs direcIO problem, and make direct IO slow down. Signed-off-by: Miao Xie Signed-off-by: Chris Mason diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 657f16d..bf60958 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5692,9 +5692,6 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode, if (IS_ERR(trans)) return ERR_CAST(trans); - if (start <= BTRFS_I(inode)->disk_i_size && len < 64 * 1024) - btrfs_add_inode_defrag(trans, inode); - trans->block_rsv = &root->fs_info->delalloc_block_rsv; alloc_hint = get_extent_allocation_hint(inode, start, len); -- cgit v0.10.2 From 4b5829a8e3104c8bc115d926a0285d3ff9bcfc77 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Wed, 5 Dec 2012 10:53:25 +0000 Subject: Btrfs: fix missing reserved space release in error path of delalloc reservation We forget to release the reserved space in the error path of delalloc reservatiom, fix it. Signed-off-by: Miao Xie Signed-off-by: Chris Mason diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 98af837..e152809 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4559,6 +4559,9 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) ret = btrfs_qgroup_reserve(root, num_bytes + nr_extents * root->leafsize); if (ret) { + spin_lock(&BTRFS_I(inode)->lock); + calc_csum_metadata_size(inode, num_bytes, 0); + spin_unlock(&BTRFS_I(inode)->lock); mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); return ret; } @@ -4594,6 +4597,10 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) btrfs_ino(inode), to_free, 0); } + if (root->fs_info->quota_enabled) { + btrfs_qgroup_free(root, num_bytes + + nr_extents * root->leafsize); + } mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); return ret; } -- cgit v0.10.2 From 6347b3c433a4cff00eb2299c7f2c7d1d8b24c1fc Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Wed, 5 Dec 2012 10:53:45 +0000 Subject: Btrfs: fix off-by-one error of the same page check in btrfs_punch_hole() (start + len) is the start of the adjacent extent, not the end of the current extent, so we should not use it to check the hole is on the same page or not. Signed-off-by: Miao Xie Signed-off-by: Chris Mason diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 8e3d678..d75412b 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1867,8 +1867,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) u64 drop_end; int ret = 0; int err = 0; - bool same_page = (offset >> PAGE_CACHE_SHIFT) == - ((offset + len) >> PAGE_CACHE_SHIFT); + bool same_page = ((offset >> PAGE_CACHE_SHIFT) == + ((offset + len - 1) >> PAGE_CACHE_SHIFT)); btrfs_wait_ordered_range(inode, offset, len); -- cgit v0.10.2 From 0061280d2c7240805cfd7b1f493da967c97c2f34 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Wed, 5 Dec 2012 10:54:12 +0000 Subject: Btrfs: fix the page that is beyond EOF Steps to reproduce: # mkfs.btrfs # mount # dd if=/dev/zero of=/ bs=512 seek=5 count=8 # fallocate -p -o 2048 -l 16384 / # dd if=/dev/zero of=/ bs=4096 seek=3 count=8 conv=notrunc,nocreat # umount # dmesg WARNING: at fs/btrfs/inode.c:7140 btrfs_destroy_inode+0x2eb/0x330 The reason is that we inputed a range which is beyond the end of the file. And because the end of this range was not page-aligned, we had to truncate the last page in this range, this operation is similar to a buffered file write. In other words, we reserved enough space and clear the data which was in the hole range on that page. But when we expanded that test file, write the data into the same page, we forgot that we have reserved enough space for the buffered write of that page because in most cases there is no page that is beyond the end of the file. As a result, we reserved the space twice. In fact, we needn't truncate the page if it is beyond the end of the file, just release the allocated space in that range. Fix the above problem by this way. Signed-off-by: Miao Xie Signed-off-by: Chris Mason diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index d75412b..700ffd2 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1859,9 +1859,9 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) struct btrfs_path *path; struct btrfs_block_rsv *rsv; struct btrfs_trans_handle *trans; - u64 mask = BTRFS_I(inode)->root->sectorsize - 1; - u64 lockstart = (offset + mask) & ~mask; - u64 lockend = ((offset + len) & ~mask) - 1; + u64 lockstart = round_up(offset, BTRFS_I(inode)->root->sectorsize); + u64 lockend = round_down(offset + len, + BTRFS_I(inode)->root->sectorsize) - 1; u64 cur_offset = lockstart; u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); u64 drop_end; @@ -1896,10 +1896,12 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) } /* zero the front end of the last page */ - ret = btrfs_truncate_page(inode, offset + len, 0, 1); - if (ret) { - mutex_unlock(&inode->i_mutex); - return ret; + if (offset + len < round_up(inode->i_size, PAGE_CACHE_SIZE)) { + ret = btrfs_truncate_page(inode, offset + len, 0, 1); + if (ret) { + mutex_unlock(&inode->i_mutex); + return ret; + } } if (lockend < lockstart) { -- cgit v0.10.2 From 7426cc04d407621773af3a0403e57642e40c36bf Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Wed, 5 Dec 2012 10:54:52 +0000 Subject: Btrfs: punch hole past the end of the file Since we can pre-allocate the space past EOF, we should be able to reclaim that space if we need. This patch implements it by removing the EOF check. Though the manual of fallocate command says we can use truncate command to reclaim the pre-allocated space which past EOF, but because truncate command changes the file size, we must run several commands to reclaim the space if we don't want to change the file size, so it is not a good choice. Signed-off-by: Miao Xie Signed-off-by: Chris Mason diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 700ffd2..71c2dc1 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1873,26 +1873,28 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) btrfs_wait_ordered_range(inode, offset, len); mutex_lock(&inode->i_mutex); - if (offset >= inode->i_size) { - mutex_unlock(&inode->i_mutex); - return 0; - } - + /* + * We needn't truncate any page which is beyond the end of the file + * because we are sure there is no data there. + */ /* * Only do this if we are in the same page and we aren't doing the * entire page. */ if (same_page && len < PAGE_CACHE_SIZE) { - ret = btrfs_truncate_page(inode, offset, len, 0); + if (offset < round_up(inode->i_size, PAGE_CACHE_SIZE)) + ret = btrfs_truncate_page(inode, offset, len, 0); mutex_unlock(&inode->i_mutex); return ret; } /* zero back part of the first page */ - ret = btrfs_truncate_page(inode, offset, 0, 0); - if (ret) { - mutex_unlock(&inode->i_mutex); - return ret; + if (offset < round_up(inode->i_size, PAGE_CACHE_SIZE)) { + ret = btrfs_truncate_page(inode, offset, 0, 0); + if (ret) { + mutex_unlock(&inode->i_mutex); + return ret; + } } /* zero the front end of the last page */ -- cgit v0.10.2 From ac6a2b36f9fcfbe4865550afb6d333dec6b57578 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Wed, 5 Dec 2012 10:56:13 +0000 Subject: Btrfs: fix wrong return value of btrfs_truncate_page() ret variant may be set to 0 if we read page successfully, but it might be released before we lock it again. On this case, if we fail to allocate a new page, we will return 0, it is wrong, fix it. Signed-off-by: Miao Xie Signed-off-by: Chris Mason diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index bf60958..0446cbe 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3521,11 +3521,11 @@ int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len, if (ret) goto out; - ret = -ENOMEM; again: page = find_or_create_page(mapping, index, mask); if (!page) { btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); + ret = -ENOMEM; goto out; } @@ -3574,7 +3574,6 @@ again: goto out_unlock; } - ret = 0; if (offset != PAGE_CACHE_SIZE) { if (!len) len = PAGE_CACHE_SIZE - offset; -- cgit v0.10.2 From b8b8ff590f99678616f9ea85f5088542d1cfc0be Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Thu, 6 Dec 2012 19:25:48 +0000 Subject: btrfs: Notify udev when removing device Currently udev does not know about the device being removed from the file system. This may result in the situation where we're unable to mount the file system by UUID or by LABEL because the by-uuid and by-label links may still point to the device which is no longer part of the btrfs file system and hence does not have any btrfs super block. It can be easily reproduced by the following: mkfs.btrfs -L bugfs /dev/loop[0-6] mount /dev/loop0 /mnt/test btrfs device delete /dev/loop0 /mnt/test umount /mnt/test mount LABEL=bugfs /mnt/test <---- this fails then see: ls -l /dev/disk/by-label/bugfs which will still point to the /dev/loop0 We did not noticed this before because libblkid would send the udev event for us when it notice that the link does not fit the reality, however it does not do that anymore and completely relies on udev information. Fix this by sending the KOBJ_CHANGE event to the bdev kobject after successful device removal. Note that this does not affect device addition, because we will open the device prior the addition from userspace and udev will notice that and reread the device afterwards. Signed-off-by: Lukas Czerner Signed-off-by: Chris Mason diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 0b1b7e2..886f4ba 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -72,6 +72,19 @@ static void free_fs_devices(struct btrfs_fs_devices *fs_devices) kfree(fs_devices); } +static void btrfs_kobject_uevent(struct block_device *bdev, + enum kobject_action action) +{ + int ret; + + ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action); + if (ret) + pr_warn("Sending event '%d' to kobject: '%s' (%p): failed\n", + action, + kobject_name(&disk_to_dev(bdev->bd_disk)->kobj), + &disk_to_dev(bdev->bd_disk)->kobj); +} + void btrfs_cleanup_fs_uuids(void) { struct btrfs_fs_devices *fs_devices; @@ -1542,6 +1555,9 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) ret = 0; + /* Notify udev that device has changed */ + btrfs_kobject_uevent(bdev, KOBJ_CHANGE); + error_brelse: brelse(bh); if (bdev) -- cgit v0.10.2 From 5f3ab90a72f98adbf00c50ac2d4d2b47cf4a9685 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Fri, 7 Dec 2012 09:28:54 +0000 Subject: Btrfs: rename root_times_lock to root_item_lock Originally root_times_lock was introduced as part of send/receive code however newly developed patch to label the subvol reused the same lock, so renaming it for a meaningful name. Signed-off-by: Anand Jain Signed-off-by: Chris Mason diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 5c2cf99..01efcbc 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -5114,13 +5114,13 @@ int btrfs_compare_trees(struct btrfs_root *left_root, right_path->search_commit_root = 1; right_path->skip_locking = 1; - spin_lock(&left_root->root_times_lock); + spin_lock(&left_root->root_item_lock); left_start_ctransid = btrfs_root_ctransid(&left_root->root_item); - spin_unlock(&left_root->root_times_lock); + spin_unlock(&left_root->root_item_lock); - spin_lock(&right_root->root_times_lock); + spin_lock(&right_root->root_item_lock); right_start_ctransid = btrfs_root_ctransid(&right_root->root_item); - spin_unlock(&right_root->root_times_lock); + spin_unlock(&right_root->root_item_lock); trans = btrfs_join_transaction(left_root); if (IS_ERR(trans)) { @@ -5215,15 +5215,15 @@ int btrfs_compare_trees(struct btrfs_root *left_root, goto out; } - spin_lock(&left_root->root_times_lock); + spin_lock(&left_root->root_item_lock); ctransid = btrfs_root_ctransid(&left_root->root_item); - spin_unlock(&left_root->root_times_lock); + spin_unlock(&left_root->root_item_lock); if (ctransid != left_start_ctransid) left_start_ctransid = 0; - spin_lock(&right_root->root_times_lock); + spin_lock(&right_root->root_item_lock); ctransid = btrfs_root_ctransid(&right_root->root_item); - spin_unlock(&right_root->root_times_lock); + spin_unlock(&right_root->root_item_lock); if (ctransid != right_start_ctransid) right_start_ctransid = 0; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 6ba56ae..313a6ad 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1645,7 +1645,7 @@ struct btrfs_root { int force_cow; - spinlock_t root_times_lock; + spinlock_t root_item_lock; }; struct btrfs_ioctl_defrag_range_args { diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 3229531..faf1826 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1204,7 +1204,7 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, root->root_key.objectid = objectid; root->anon_dev = 0; - spin_lock_init(&root->root_times_lock); + spin_lock_init(&root->root_item_lock); } static int __must_check find_and_setup_root(struct btrfs_root *tree_root, diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index eb923d0..668af53 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -548,9 +548,9 @@ void btrfs_update_root_times(struct btrfs_trans_handle *trans, struct btrfs_root_item *item = &root->root_item; struct timespec ct = CURRENT_TIME; - spin_lock(&root->root_times_lock); + spin_lock(&root->root_item_lock); item->ctransid = cpu_to_le64(trans->transid); item->ctime.sec = cpu_to_le64(ct.tv_sec); item->ctime.nsec = cpu_to_le32(ct.tv_nsec); - spin_unlock(&root->root_times_lock); + spin_unlock(&root->root_item_lock); } diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index e78b297..5445454 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -4397,9 +4397,9 @@ static int full_send_tree(struct send_ctx *sctx) if (!path) return -ENOMEM; - spin_lock(&send_root->root_times_lock); + spin_lock(&send_root->root_item_lock); start_ctransid = btrfs_root_ctransid(&send_root->root_item); - spin_unlock(&send_root->root_times_lock); + spin_unlock(&send_root->root_item_lock); key.objectid = BTRFS_FIRST_FREE_OBJECTID; key.type = BTRFS_INODE_ITEM_KEY; @@ -4422,9 +4422,9 @@ join_trans: * Make sure the tree has not changed after re-joining. We detect this * by comparing start_ctransid and ctransid. They should always match. */ - spin_lock(&send_root->root_times_lock); + spin_lock(&send_root->root_item_lock); ctransid = btrfs_root_ctransid(&send_root->root_item); - spin_unlock(&send_root->root_times_lock); + spin_unlock(&send_root->root_item_lock); if (ctransid != start_ctransid) { WARN(1, KERN_WARNING "btrfs: the root that you're trying to " -- cgit v0.10.2 From e99761514999f64aff1985460967f93d9e8417f4 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 11 Oct 2012 15:53:56 -0400 Subject: Btrfs: only log the inode item if we can get away with it Currently we copy all the file information into the log, inode item, the refs, xattrs etc. Except most of this doesn't change from fsync to fsync, just the inode item changes. So set a flag if an xattr changes or a link is added, and otherwise only log the inode item. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index ed8ca7c..2411baf 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -39,6 +39,7 @@ #define BTRFS_INODE_HAS_ORPHAN_ITEM 5 #define BTRFS_INODE_HAS_ASYNC_EXTENT 6 #define BTRFS_INODE_NEEDS_FULL_SYNC 7 +#define BTRFS_INODE_COPY_EVERYTHING 8 /* in memory btrfs inode */ struct btrfs_inode { diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 0446cbe..123815f 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5083,6 +5083,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, inode_inc_iversion(inode); inode->i_ctime = CURRENT_TIME; ihold(inode); + set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags); err = btrfs_add_nondir(trans, dir, dentry, inode, 1, index); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 40b9efd..f05fca7 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3429,14 +3429,20 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, } else { if (test_and_clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags)) { + clear_bit(BTRFS_INODE_COPY_EVERYTHING, + &BTRFS_I(inode)->runtime_flags); ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0); } else { if (inode_only == LOG_INODE_ALL) fast_search = true; - max_key.type = BTRFS_XATTR_ITEM_KEY; + if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING, + &BTRFS_I(inode)->runtime_flags)) + max_key.type = BTRFS_XATTR_ITEM_KEY; + else + max_key.type = BTRFS_INODE_ITEM_KEY; ret = drop_objectid_items(trans, log, path, ino, - BTRFS_XATTR_ITEM_KEY); + max_key.type); } } if (ret) { diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index aef6bb3..446a684 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -208,6 +208,7 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans, inode_inc_iversion(inode); inode->i_ctime = CURRENT_TIME; + set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags); ret = btrfs_update_inode(trans, root, inode); BUG_ON(ret); out: -- cgit v0.10.2 From a95249b392c3ab843d7b25ab6817ecc9ea0b82ee Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 11 Oct 2012 16:17:34 -0400 Subject: Btrfs: don't bother copying if we're only logging the inode We don't copy inode items anwyay, we just copy them straight into the log from the in memory inode. So if we know we're only logging the inode, don't bother dropping anything, just try to insert it and either if it succeeds or we get EEXIST we can update the inode item in the log and carry on. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index f05fca7..ab7168e 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2996,6 +2996,26 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, } +static int log_inode_item(struct btrfs_trans_handle *trans, + struct btrfs_root *log, struct btrfs_path *path, + struct inode *inode) +{ + struct btrfs_inode_item *inode_item; + struct btrfs_key key; + int ret; + + memcpy(&key, &BTRFS_I(inode)->location, sizeof(key)); + ret = btrfs_insert_empty_item(trans, log, path, &key, + sizeof(*inode_item)); + if (ret && ret != -EEXIST) + return ret; + inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_inode_item); + fill_inode_item(trans, path->nodes[0], inode_item, inode, 0); + btrfs_release_path(path); + return 0; +} + static noinline int copy_items(struct btrfs_trans_handle *trans, struct inode *inode, struct btrfs_path *dst_path, @@ -3433,17 +3453,24 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, &BTRFS_I(inode)->runtime_flags); ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0); - } else { + } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING, + &BTRFS_I(inode)->runtime_flags)) { if (inode_only == LOG_INODE_ALL) fast_search = true; - if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING, - &BTRFS_I(inode)->runtime_flags)) - max_key.type = BTRFS_XATTR_ITEM_KEY; - else - max_key.type = BTRFS_INODE_ITEM_KEY; + max_key.type = BTRFS_XATTR_ITEM_KEY; ret = drop_objectid_items(trans, log, path, ino, max_key.type); + } else { + if (inode_only == LOG_INODE_ALL) + fast_search = true; + ret = log_inode_item(trans, log, dst_path, inode); + if (ret) { + err = ret; + goto out_unlock; + } + goto log_extents; } + } if (ret) { err = ret; @@ -3522,6 +3549,7 @@ next_slot: ins_nr = 0; } +log_extents: if (fast_search) { btrfs_release_path(path); btrfs_release_path(dst_path); -- cgit v0.10.2 From b812ce28796f746f14ba6cc451250c422db447b2 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 16 Nov 2012 13:56:32 -0500 Subject: Btrfs: inline csums if we're fsyncing The tree logging stuff needs the csums to be on the ordered extents in order to log them properly, so mark that we're sync and inline the csum creation so we don't have to wait on the csumming to be done when logging extents that are still in flight. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 2411baf..2a8c242 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -91,6 +91,9 @@ struct btrfs_inode { unsigned long runtime_flags; + /* Keep track of who's O_SYNC/fsycing currently */ + atomic_t sync_writers; + /* full 64 bit generation number, struct vfs_inode doesn't have a big * enough field for this. */ diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 71c2dc1..7f4654a 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1472,6 +1472,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, ssize_t num_written = 0; ssize_t err = 0; size_t count, ocount; + bool sync = (file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host); sb_start_write(inode->i_sb); @@ -1529,6 +1530,9 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, } } + if (sync) + atomic_inc(&BTRFS_I(inode)->sync_writers); + if (unlikely(file->f_flags & O_DIRECT)) { num_written = __btrfs_direct_write(iocb, iov, nr_segs, pos, ppos, count, ocount); @@ -1563,6 +1567,8 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, num_written = err; } out: + if (sync) + atomic_dec(&BTRFS_I(inode)->sync_writers); sb_end_write(inode->i_sb); current->backing_dev_info = NULL; return num_written ? num_written : err; @@ -1613,7 +1619,9 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * out of the ->i_mutex. If so, we can flush the dirty pages by * multi-task, and make the performance up. */ + atomic_inc(&BTRFS_I(inode)->sync_writers); ret = filemap_write_and_wait_range(inode->i_mapping, start, end); + atomic_dec(&BTRFS_I(inode)->sync_writers); if (ret) return ret; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 123815f..7855aac 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1622,6 +1622,7 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, int ret = 0; int skip_sum; int metadata = 0; + int async = !atomic_read(&BTRFS_I(inode)->sync_writers); skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; @@ -1644,7 +1645,7 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, goto out; } goto mapit; - } else if (!skip_sum) { + } else if (async && !skip_sum) { /* csum items have already been cloned */ if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) goto mapit; @@ -1655,6 +1656,10 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, __btrfs_submit_bio_start, __btrfs_submit_bio_done); goto out; + } else if (!skip_sum) { + ret = btrfs_csum_one_bio(root, inode, bio, 0, 0); + if (ret) + goto out; } mapit: @@ -6333,6 +6338,9 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, struct btrfs_root *root = BTRFS_I(inode)->root; int ret; + if (async_submit) + async_submit = !atomic_read(&BTRFS_I(inode)->sync_writers); + bio_get(bio); if (!write) { @@ -7113,6 +7121,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) extent_io_tree_init(&ei->io_failure_tree, &inode->i_data); ei->io_tree.track_uptodate = 1; ei->io_failure_tree.track_uptodate = 1; + atomic_set(&ei->sync_writers, 0); mutex_init(&ei->log_mutex); mutex_init(&ei->delalloc_mutex); btrfs_ordered_inode_tree_init(&ei->ordered_tree); -- cgit v0.10.2 From b493968096944a11422c4d80fb87af537ca1cac7 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 3 Dec 2012 10:31:19 -0500 Subject: Btrfs: keep track of the extents original block length If we've written to a prealloc extent we need to know the original block len for the extent. We can't figure this out currently since ->block_len is just set to the extent length. So introduce ->orig_block_len so that we know how many bytes were in the original extent for proper extent logging that future patches will need. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index 6792255..99a0dcb 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -24,6 +24,7 @@ struct extent_map { u64 mod_start; u64 mod_len; u64 orig_start; + u64 orig_block_len; u64 block_start; u64 block_len; u64 generation; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 7f4654a..6810145 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -588,6 +588,8 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, split->block_len = em->block_len; else split->block_len = split->len; + split->orig_block_len = max(split->block_len, + em->orig_block_len); split->generation = gen; split->bdev = em->bdev; split->flags = flags; @@ -609,6 +611,8 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, split->flags = flags; split->compress_type = em->compress_type; split->generation = gen; + split->orig_block_len = max(em->block_len, + em->orig_block_len); if (compressed) { split->block_len = em->block_len; @@ -1838,6 +1842,7 @@ out: hole_em->block_start = EXTENT_MAP_HOLE; hole_em->block_len = 0; + hole_em->orig_block_len = 0; hole_em->bdev = root->fs_info->fs_devices->latest_bdev; hole_em->compress_type = BTRFS_COMPRESS_NONE; hole_em->generation = trans->transid; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 7855aac..bfd59bc 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -699,6 +699,7 @@ retry: em->block_start = ins.objectid; em->block_len = ins.offset; + em->orig_block_len = ins.offset; em->bdev = root->fs_info->fs_devices->latest_bdev; em->compress_type = async_extent->compress_type; set_bit(EXTENT_FLAG_PINNED, &em->flags); @@ -886,6 +887,7 @@ static noinline int __cow_file_range(struct btrfs_trans_handle *trans, em->block_start = ins.objectid; em->block_len = ins.offset; + em->orig_block_len = ins.offset; em->bdev = root->fs_info->fs_devices->latest_bdev; set_bit(EXTENT_FLAG_PINNED, &em->flags); @@ -1143,6 +1145,7 @@ static noinline int run_delalloc_nocow(struct inode *inode, u64 extent_offset; u64 disk_bytenr; u64 num_bytes; + u64 disk_num_bytes; int extent_type; int ret, err; int type; @@ -1245,6 +1248,8 @@ next_slot: extent_offset = btrfs_file_extent_offset(leaf, fi); extent_end = found_key.offset + btrfs_file_extent_num_bytes(leaf, fi); + disk_num_bytes = + btrfs_file_extent_disk_num_bytes(leaf, fi); if (extent_end <= start) { path->slots[0]++; goto next_slot; @@ -1319,6 +1324,7 @@ out_check: em->len = num_bytes; em->block_len = num_bytes; em->block_start = disk_bytenr; + em->orig_block_len = disk_num_bytes; em->bdev = root->fs_info->fs_devices->latest_bdev; set_bit(EXTENT_FLAG_PINNED, &em->flags); set_bit(EXTENT_FLAG_PREALLOC, &em->flags); @@ -3696,6 +3702,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) hole_em->block_start = EXTENT_MAP_HOLE; hole_em->block_len = 0; + hole_em->orig_block_len = 0; hole_em->bdev = root->fs_info->fs_devices->latest_bdev; hole_em->compress_type = BTRFS_COMPRESS_NONE; hole_em->generation = trans->transid; @@ -5374,6 +5381,8 @@ again: em->len = extent_end - extent_start; em->orig_start = extent_start - btrfs_file_extent_offset(leaf, item); + em->orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, + item); bytenr = btrfs_file_extent_disk_bytenr(leaf, item); if (bytenr == 0) { em->block_start = EXTENT_MAP_HOLE; @@ -5383,8 +5392,7 @@ again: set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); em->compress_type = compress_type; em->block_start = bytenr; - em->block_len = btrfs_file_extent_disk_num_bytes(leaf, - item); + em->block_len = em->orig_block_len; } else { bytenr += btrfs_file_extent_offset(leaf, item); em->block_start = bytenr; @@ -5414,6 +5422,7 @@ again: em->start = extent_start + extent_offset; em->len = (copy_size + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); + em->orig_block_len = em->len; em->orig_start = EXTENT_MAP_INLINE; if (compress_type) { set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); @@ -5721,6 +5730,7 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode, em->block_start = ins.objectid; em->block_len = ins.offset; + em->orig_block_len = ins.offset; em->bdev = root->fs_info->fs_devices->latest_bdev; /* @@ -5914,7 +5924,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, static struct extent_map *create_pinned_em(struct inode *inode, u64 start, u64 len, u64 orig_start, u64 block_start, u64 block_len, - int type) + u64 orig_block_len, int type) { struct extent_map_tree *em_tree; struct extent_map *em; @@ -5932,6 +5942,7 @@ static struct extent_map *create_pinned_em(struct inode *inode, u64 start, em->block_len = block_len; em->block_start = block_start; em->bdev = root->fs_info->fs_devices->latest_bdev; + em->orig_block_len = orig_block_len; set_bit(EXTENT_FLAG_PINNED, &em->flags); if (type == BTRFS_ORDERED_PREALLOC) set_bit(EXTENT_FLAG_PREALLOC, &em->flags); @@ -6068,12 +6079,14 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, if (can_nocow_odirect(trans, inode, start, len) == 1) { u64 orig_start = em->start; + u64 orig_block_len = em->orig_block_len; if (type == BTRFS_ORDERED_PREALLOC) { free_extent_map(em); em = create_pinned_em(inode, start, len, orig_start, - block_start, len, type); + block_start, len, + orig_block_len, type); if (IS_ERR(em)) { btrfs_end_transaction(trans, root); goto unlock_err; @@ -7771,6 +7784,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, em->len = ins.offset; em->block_start = ins.objectid; em->block_len = ins.offset; + em->orig_block_len = ins.offset; em->bdev = root->fs_info->fs_devices->latest_bdev; set_bit(EXTENT_FLAG_PREALLOC, &em->flags); em->generation = trans->transid; -- cgit v0.10.2 From b11e234d21e73df94099e473a080bca502b9a496 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 3 Dec 2012 10:58:15 -0500 Subject: Btrfs: do not mark ems as prealloc if we are writing to them We are going to use EM's to log extents in the future, so we need to not mark them as prealloc if they aren't actually prealloc extents. Instead mark them with FILLING so we know to ammend mod_start/mod_len and that way we don't confuse the extent logging code. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index b8cbc8d..85ae2b6 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -266,9 +266,9 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, em->mod_start = em->start; em->mod_len = em->len; - if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { + if (test_bit(EXTENT_FLAG_FILLING, &em->flags)) { prealloc = true; - clear_bit(EXTENT_FLAG_PREALLOC, &em->flags); + clear_bit(EXTENT_FLAG_FILLING, &em->flags); } try_merge_map(tree, em); diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index 99a0dcb..922943c 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -14,6 +14,7 @@ #define EXTENT_FLAG_VACANCY 2 /* no file extent item found */ #define EXTENT_FLAG_PREALLOC 3 /* pre-allocated extent */ #define EXTENT_FLAG_LOGGING 4 /* Logging this extent */ +#define EXTENT_FLAG_FILLING 5 /* Filling in a preallocated extent */ struct extent_map { struct rb_node rb_node; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index bfd59bc..73e6833 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1327,7 +1327,7 @@ out_check: em->orig_block_len = disk_num_bytes; em->bdev = root->fs_info->fs_devices->latest_bdev; set_bit(EXTENT_FLAG_PINNED, &em->flags); - set_bit(EXTENT_FLAG_PREALLOC, &em->flags); + set_bit(EXTENT_FLAG_FILLING, &em->flags); while (1) { write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em); @@ -5945,7 +5945,7 @@ static struct extent_map *create_pinned_em(struct inode *inode, u64 start, em->orig_block_len = orig_block_len; set_bit(EXTENT_FLAG_PINNED, &em->flags); if (type == BTRFS_ORDERED_PREALLOC) - set_bit(EXTENT_FLAG_PREALLOC, &em->flags); + set_bit(EXTENT_FLAG_FILLING, &em->flags); do { btrfs_drop_extent_cache(inode, em->start, -- cgit v0.10.2 From d6393786cd40f67709324bc4f08d7e4b911153fe Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 12 Dec 2012 17:00:01 -0500 Subject: Btrfs: add path->really_keep_locks You'd think path->keep_locks would keep all the locks wouldn't you? You'd be wrong. It only keeps them if the slot is pointing to the last item in the node. This is for use with btrfs_next_leaf, which needs this sort of thing. But the horrible horrible things I'm going to do to the tree log means I really need everything held from root to leaf so I can add and delete items in the same search. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 01efcbc..0c5c28f 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -2212,6 +2212,9 @@ static noinline void unlock_up(struct btrfs_path *path, int level, int no_skips = 0; struct extent_buffer *t; + if (path->really_keep_locks) + return; + for (i = level; i < BTRFS_MAX_LEVEL; i++) { if (!path->nodes[i]) break; @@ -2259,7 +2262,7 @@ noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level) { int i; - if (path->keep_locks) + if (path->keep_locks || path->really_keep_locks) return; for (i = level; i < BTRFS_MAX_LEVEL; i++) { @@ -2492,7 +2495,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root if (!cow) write_lock_level = -1; - if (cow && (p->keep_locks || p->lowest_level)) + if (cow && (p->really_keep_locks || p->keep_locks || p->lowest_level)) write_lock_level = BTRFS_MAX_LEVEL; min_write_lock_level = write_lock_level; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 313a6ad..9ed452f 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -576,6 +576,7 @@ struct btrfs_path { unsigned int skip_locking:1; unsigned int leave_spinning:1; unsigned int search_commit_root:1; + unsigned int really_keep_locks:1; }; /* -- cgit v0.10.2 From 70c8a91ce21b83ccd2d9e7c968775430ead4353d Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 11 Oct 2012 16:54:30 -0400 Subject: Btrfs: log changed inodes based on the extent map tree We don't really need to copy extents from the source tree since we have all of the information already available to us in the extent_map tree. So instead just write the extents straight to the log tree and don't bother to copy the extent items from the source tree. Signed-off-by: Josef Bacik Signed-off-by: Chris Mason diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 0c5c28f..e8b3264 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -5490,6 +5490,139 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) return btrfs_next_old_leaf(root, path, 0); } +/* Release the path up to but not including the given level */ +static void btrfs_release_level(struct btrfs_path *path, int level) +{ + int i; + + for (i = 0; i < level; i++) { + path->slots[i] = 0; + if (!path->nodes[i]) + continue; + if (path->locks[i]) { + btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]); + path->locks[i] = 0; + } + free_extent_buffer(path->nodes[i]); + path->nodes[i] = NULL; + } +} + +/* + * This function assumes 2 things + * + * 1) You are using path->keep_locks + * 2) You are not inserting items. + * + * If either of these are not true do not use this function. If you need a next + * leaf with either of these not being true then this function can be easily + * adapted to do that, but at the moment these are the limitations. + */ +int btrfs_next_leaf_write(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, + int del) +{ + struct extent_buffer *b; + struct btrfs_key key; + u32 nritems; + int level = 1; + int slot; + int ret = 1; + int write_lock_level = BTRFS_MAX_LEVEL; + int ins_len = del ? -1 : 0; + + WARN_ON(!(path->keep_locks || path->really_keep_locks)); + + nritems = btrfs_header_nritems(path->nodes[0]); + btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1); + + while (path->nodes[level]) { + nritems = btrfs_header_nritems(path->nodes[level]); + if (!(path->locks[level] & BTRFS_WRITE_LOCK)) { +search: + btrfs_release_path(path); + ret = btrfs_search_slot(trans, root, &key, path, + ins_len, 1); + if (ret < 0) + goto out; + level = 1; + continue; + } + + if (path->slots[level] >= nritems - 1) { + level++; + continue; + } + + btrfs_release_level(path, level); + break; + } + + if (!path->nodes[level]) { + ret = 1; + goto out; + } + + path->slots[level]++; + b = path->nodes[level]; + + while (b) { + level = btrfs_header_level(b); + + if (!should_cow_block(trans, root, b)) + goto cow_done; + + btrfs_set_path_blocking(path); + ret = btrfs_cow_block(trans, root, b, + path->nodes[level + 1], + path->slots[level + 1], &b); + if (ret) + goto out; +cow_done: + path->nodes[level] = b; + btrfs_clear_path_blocking(path, NULL, 0); + if (level != 0) { + ret = setup_nodes_for_search(trans, root, path, b, + level, ins_len, + &write_lock_level); + if (ret == -EAGAIN) + goto search; + if (ret) + goto out; + + b = path->nodes[level]; + slot = path->slots[level]; + + ret = read_block_for_search(trans, root, path, + &b, level, slot, &key, 0); + if (ret == -EAGAIN) + goto search; + if (ret) + goto out; + level = btrfs_header_level(b); + if (!btrfs_try_tree_write_lock(b)) { + btrfs_set_path_blocking(path); + btrfs_tree_lock(b); + btrfs_clear_path_blocking(path, b, + BTRFS_WRITE_LOCK); + } + path->locks[level] = BTRFS_WRITE_LOCK; + path->nodes[level] = b; + path->slots[level] = 0; + } else { + path->slots[level] = 0; + ret = 0; + break; + } + } + +out: + if (ret) + btrfs_release_path(path); + + return ret; +} + int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, u64 time_seq) { diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 9ed452f..55aff67 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3187,6 +3187,9 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans, } int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path); +int btrfs_next_leaf_write(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, + int del); int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, u64 time_seq); static inline int btrfs_next_old_item(struct btrfs_root *root, diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 85ae2b6..fff2c28 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -49,7 +49,7 @@ void extent_map_tree_init(struct extent_map_tree *tree) struct extent_map *alloc_extent_map(void) { struct extent_map *em; - em = kmem_cache_alloc(extent_map_cache, GFP_NOFS); + em = kmem_cache_zalloc(extent_map_cache, GFP_NOFS); if (!em) return NULL; em->in_tree = 0; @@ -198,16 +198,15 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em) merge = rb_entry(rb, struct extent_map, rb_node); if (rb && mergable_maps(merge, em)) { em->start = merge->start; + em->orig_start = merge->orig_start; em->len += merge->len; em->block_len += merge->block_len; em->block_start = merge->block_start; merge->in_tree = 0; - if (merge->generation > em->generation) { - em->mod_start = em->start; - em->mod_len = em->len; - em->generation = merge->generation; - list_move(&em->list, &tree->modified_extents); - } + em->mod_len = (em->mod_len + em->mod_start) - merge->mod_start; + em->mod_start = merge->mod_start; + em->generation = max(em->generation, merge->generation); + list_move(&em->list, &tree->modified_extents); list_del_init(&merge->list); rb_erase(&merge->rb_node, &tree->map); @@ -223,11 +222,8 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em) em->block_len += merge->len; rb_erase(&merge->rb_node, &tree->map); merge->in_tree = 0; - if (merge->generation > em->generation) { - em->mod_len = em->len; - em->generation = merge->generation; - list_move(&em->list, &tree->modified_extents); - } + em->mod_len = (merge->mod_start + merge->mod_len) - em->mod_start; + em->generation = max(em->generation, merge->generation); list_del_init(&merge->list); free_extent_map(merge); } diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 6810145..c56088e 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -621,7 +621,7 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, } else { split->block_len = split->len; split->block_start = em->block_start + diff; - split->orig_start = split->start; + split->orig_start = em->orig_start; } ret = add_extent_mapping(em_tree, split); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 73e6833..355a297 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -95,6 +95,10 @@ static noinline int cow_file_range(struct inode *inode, struct page *locked_page, u64 start, u64 end, int *page_started, unsigned long *nr_written, int unlock); +static struct extent_map *create_pinned_em(struct inode *inode, u64 start, + u64 len, u64 orig_start, + u64 block_start, u64 block_len, + u64 orig_block_len, int type); static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, struct inode *inode, struct inode *dir, @@ -704,10 +708,14 @@ retry: em->compress_type = async_extent->compress_type; set_bit(EXTENT_FLAG_PINNED, &em->flags); set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); + em->generation = -1; while (1) { write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em); + if (!ret) + list_move(&em->list, + &em_tree->modified_extents); write_unlock(&em_tree->lock); if (ret != -EEXIST) { free_extent_map(em); @@ -890,10 +898,14 @@ static noinline int __cow_file_range(struct btrfs_trans_handle *trans, em->orig_block_len = ins.offset; em->bdev = root->fs_info->fs_devices->latest_bdev; set_bit(EXTENT_FLAG_PINNED, &em->flags); + em->generation = -1; while (1) { write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em); + if (!ret) + list_move(&em->list, + &em_tree->modified_extents); write_unlock(&em_tree->lock); if (ret != -EEXIST) { free_extent_map(em); @@ -1320,7 +1332,7 @@ out_check: em = alloc_extent_map(); BUG_ON(!em); /* -ENOMEM */ em->start = cur_offset; - em->orig_start = em->start; + em->orig_start = found_key.offset - extent_offset; em->len = num_bytes; em->block_len = num_bytes; em->block_start = disk_bytenr; @@ -1328,9 +1340,13 @@ out_check: em->bdev = root->fs_info->fs_devices->latest_bdev; set_bit(EXTENT_FLAG_PINNED, &em->flags); set_bit(EXTENT_FLAG_FILLING, &em->flags); + em->generation = -1; while (1) { write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em); + if (!ret) + list_move(&em->list, + &em_tree->modified_extents); write_unlock(&em_tree->lock); if (ret != -EEXIST) { free_extent_map(em); @@ -5371,6 +5387,7 @@ again: if (start + len <= found_key.offset) goto not_found; em->start = start; + em->orig_start = start; em->len = found_key.offset - start; goto not_found_em; } @@ -5423,7 +5440,7 @@ again: em->len = (copy_size + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); em->orig_block_len = em->len; - em->orig_start = EXTENT_MAP_INLINE; + em->orig_start = em->start; if (compress_type) { set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); em->compress_type = compress_type; @@ -5476,6 +5493,7 @@ again: } not_found: em->start = start; + em->orig_start = start; em->len = len; not_found_em: em->block_start = EXTENT_MAP_HOLE; @@ -5677,30 +5695,14 @@ out: } static struct extent_map *btrfs_new_extent_direct(struct inode *inode, - struct extent_map *em, u64 start, u64 len) { struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; - struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct extent_map *em; struct btrfs_key ins; u64 alloc_hint; int ret; - bool insert = false; - - /* - * Ok if the extent map we looked up is a hole and is for the exact - * range we want, there is no reason to allocate a new one, however if - * it is not right then we need to free this one and drop the cache for - * our range. - */ - if (em->block_start != EXTENT_MAP_HOLE || em->start != start || - em->len != len) { - free_extent_map(em); - em = NULL; - insert = true; - btrfs_drop_extent_cache(inode, start, start + len - 1, 0); - } trans = btrfs_join_transaction(root); if (IS_ERR(trans)) @@ -5716,38 +5718,10 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode, goto out; } - if (!em) { - em = alloc_extent_map(); - if (!em) { - em = ERR_PTR(-ENOMEM); - goto out; - } - } - - em->start = start; - em->orig_start = em->start; - em->len = ins.offset; - - em->block_start = ins.objectid; - em->block_len = ins.offset; - em->orig_block_len = ins.offset; - em->bdev = root->fs_info->fs_devices->latest_bdev; - - /* - * We need to do this because if we're using the original em we searched - * for, we could have EXTENT_FLAG_VACANCY set, and we don't want that. - */ - em->flags = 0; - set_bit(EXTENT_FLAG_PINNED, &em->flags); - - while (insert) { - write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em); - write_unlock(&em_tree->lock); - if (ret != -EEXIST) - break; - btrfs_drop_extent_cache(inode, start, start + em->len - 1, 0); - } + em = create_pinned_em(inode, start, ins.offset, start, ins.objectid, + ins.offset, ins.offset, 0); + if (IS_ERR(em)) + goto out; ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid, ins.offset, ins.offset, 0); @@ -5943,6 +5917,7 @@ static struct extent_map *create_pinned_em(struct inode *inode, u64 start, em->block_start = block_start; em->bdev = root->fs_info->fs_devices->latest_bdev; em->orig_block_len = orig_block_len; + em->generation = -1; set_bit(EXTENT_FLAG_PINNED, &em->flags); if (type == BTRFS_ORDERED_PREALLOC) set_bit(EXTENT_FLAG_FILLING, &em->flags); @@ -5952,6 +5927,9 @@ static struct extent_map *create_pinned_em(struct inode *inode, u64 start, em->start + em->len - 1, 0); write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em); + if (!ret) + list_move(&em->list, + &em_tree->modified_extents); write_unlock(&em_tree->lock); } while (ret == -EEXIST); @@ -6078,7 +6056,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, goto must_cow; if (can_nocow_odirect(trans, inode, start, len) == 1) { - u64 orig_start = em->start; + u64 orig_start = em->orig_start; u64 orig_block_len = em->orig_block_len; if (type == BTRFS_ORDERED_PREALLOC) { @@ -6110,7 +6088,8 @@ must_cow: * it above */ len = bh_result->b_size; - em = btrfs_new_extent_direct(inode, em, start, len); + free_extent_map(em); + em = btrfs_new_extent_direct(inode, start, len); if (IS_ERR(em)) { ret = PTR_ERR(em); goto unlock_err; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index ab7168e..7244481 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3150,145 +3150,220 @@ static int extent_cmp(void *priv, struct list_head *a, struct list_head *b) return 0; } -struct log_args { - struct extent_buffer *src; - u64 next_offset; - int start_slot; - int nr; -}; +static int drop_adjacent_extents(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, + struct extent_map *em, + struct btrfs_path *path) +{ + struct btrfs_file_extent_item *fi; + struct extent_buffer *leaf; + struct btrfs_key key, new_key; + struct btrfs_map_token token; + u64 extent_end; + u64 extent_offset = 0; + int extent_type; + int del_slot = 0; + int del_nr = 0; + int ret = 0; + + while (1) { + btrfs_init_map_token(&token); + leaf = path->nodes[0]; + path->slots[0]++; + if (path->slots[0] >= btrfs_header_nritems(leaf)) { + if (del_nr) { + ret = btrfs_del_items(trans, root, path, + del_slot, del_nr); + if (ret) + return ret; + del_nr = 0; + } + + ret = btrfs_next_leaf_write(trans, root, path, 1); + if (ret < 0) + return ret; + if (ret > 0) + return 0; + leaf = path->nodes[0]; + } + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.objectid != btrfs_ino(inode) || + key.type != BTRFS_EXTENT_DATA_KEY || + key.offset >= em->start + em->len) + break; + + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + extent_type = btrfs_token_file_extent_type(leaf, fi, &token); + if (extent_type == BTRFS_FILE_EXTENT_REG || + extent_type == BTRFS_FILE_EXTENT_PREALLOC) { + extent_offset = btrfs_token_file_extent_offset(leaf, + fi, &token); + extent_end = key.offset + + btrfs_token_file_extent_num_bytes(leaf, fi, + &token); + } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { + extent_end = key.offset + + btrfs_file_extent_inline_len(leaf, fi); + } else { + BUG(); + } + + if (extent_end <= em->len + em->start) { + if (!del_nr) { + del_slot = path->slots[0]; + } + del_nr++; + continue; + } + + /* + * Ok so we'll ignore previous items if we log a new extent, + * which can lead to overlapping extents, so if we have an + * existing extent we want to adjust we _have_ to check the next + * guy to make sure we even need this extent anymore, this keeps + * us from panicing in set_item_key_safe. + */ + if (path->slots[0] < btrfs_header_nritems(leaf) - 1) { + struct btrfs_key tmp_key; + + btrfs_item_key_to_cpu(leaf, &tmp_key, + path->slots[0] + 1); + if (tmp_key.objectid == btrfs_ino(inode) && + tmp_key.type == BTRFS_EXTENT_DATA_KEY && + tmp_key.offset <= em->start + em->len) { + if (!del_nr) + del_slot = path->slots[0]; + del_nr++; + continue; + } + } + + BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE); + memcpy(&new_key, &key, sizeof(new_key)); + new_key.offset = em->start + em->len; + btrfs_set_item_key_safe(trans, root, path, &new_key); + extent_offset += em->start + em->len - key.offset; + btrfs_set_token_file_extent_offset(leaf, fi, extent_offset, + &token); + btrfs_set_token_file_extent_num_bytes(leaf, fi, extent_end - + (em->start + em->len), + &token); + btrfs_mark_buffer_dirty(leaf); + } + + if (del_nr) + ret = btrfs_del_items(trans, root, path, del_slot, del_nr); + + return ret; +} static int log_one_extent(struct btrfs_trans_handle *trans, struct inode *inode, struct btrfs_root *root, - struct extent_map *em, struct btrfs_path *path, - struct btrfs_path *dst_path, struct log_args *args) + struct extent_map *em, struct btrfs_path *path) { struct btrfs_root *log = root->log_root; + struct btrfs_file_extent_item *fi; + struct extent_buffer *leaf; + struct list_head ordered_sums; struct btrfs_key key; - u64 start = em->mod_start; - u64 search_start = start; - u64 len = em->mod_len; - u64 num_bytes; - int nritems; + u64 csum_offset = em->mod_start - em->start; + u64 csum_len = em->mod_len; + u64 extent_offset = em->start - em->orig_start; + u64 block_len; int ret; + bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; - if (BTRFS_I(inode)->logged_trans == trans->transid) { - ret = __btrfs_drop_extents(trans, log, inode, dst_path, start, - start + len, NULL, 0); - if (ret) - return ret; + INIT_LIST_HEAD(&ordered_sums); + key.objectid = btrfs_ino(inode); + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = em->start; + path->really_keep_locks = 1; + + ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*fi)); + if (ret && ret != -EEXIST) { + path->really_keep_locks = 0; + return ret; + } + leaf = path->nodes[0]; + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + btrfs_set_file_extent_generation(leaf, fi, em->generation); + if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { + skip_csum = true; + btrfs_set_file_extent_type(leaf, fi, + BTRFS_FILE_EXTENT_PREALLOC); + } else { + btrfs_set_file_extent_type(leaf, fi, BTRFS_FILE_EXTENT_REG); + if (em->block_start == 0) + skip_csum = true; + } + + block_len = max(em->block_len, em->orig_block_len); + if (em->compress_type != BTRFS_COMPRESS_NONE) { + btrfs_set_file_extent_disk_bytenr(leaf, fi, em->block_start); + btrfs_set_file_extent_disk_num_bytes(leaf, fi, block_len); + } else if (em->block_start < EXTENT_MAP_LAST_BYTE) { + btrfs_set_file_extent_disk_bytenr(leaf, fi, + em->block_start - + extent_offset); + btrfs_set_file_extent_disk_num_bytes(leaf, fi, block_len); + } else { + btrfs_set_file_extent_disk_bytenr(leaf, fi, 0); + btrfs_set_file_extent_disk_num_bytes(leaf, fi, 0); } - while (len) { - if (args->nr) - goto next_slot; -again: - key.objectid = btrfs_ino(inode); - key.type = BTRFS_EXTENT_DATA_KEY; - key.offset = search_start; - - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - return ret; - - if (ret) { - /* - * A rare case were we can have an em for a section of a - * larger extent so we need to make sure that this em - * falls within the extent we've found. If not we just - * bail and go back to ye-olde way of doing things but - * it happens often enough in testing that we need to do - * this dance to make sure. - */ - do { - if (path->slots[0] == 0) { - btrfs_release_path(path); - if (search_start == 0) - return -ENOENT; - search_start--; - goto again; - } + btrfs_set_file_extent_offset(leaf, fi, em->start - em->orig_start); + btrfs_set_file_extent_num_bytes(leaf, fi, em->len); + btrfs_set_file_extent_ram_bytes(leaf, fi, em->len); + btrfs_set_file_extent_compression(leaf, fi, em->compress_type); + btrfs_set_file_extent_encryption(leaf, fi, 0); + btrfs_set_file_extent_other_encoding(leaf, fi, 0); + btrfs_mark_buffer_dirty(leaf); - path->slots[0]--; - btrfs_item_key_to_cpu(path->nodes[0], &key, - path->slots[0]); - if (key.objectid != btrfs_ino(inode) || - key.type != BTRFS_EXTENT_DATA_KEY) { - btrfs_release_path(path); - return -ENOENT; - } - } while (key.offset > start); + /* + * Have to check the extent to the right of us to make sure it doesn't + * fall in our current range. We're ok if the previous extent is in our + * range since the recovery stuff will run us in key order and thus just + * drop the part we overwrote. + */ + ret = drop_adjacent_extents(trans, log, inode, em, path); + btrfs_release_path(path); + path->really_keep_locks = 0; + if (ret) { + return ret; + } - num_bytes = btrfs_file_extent_length(path); - if (key.offset + num_bytes <= start) { - btrfs_release_path(path); - return -ENOENT; - } - } - args->src = path->nodes[0]; -next_slot: - btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); - num_bytes = btrfs_file_extent_length(path); - if (args->nr && - args->start_slot + args->nr == path->slots[0]) { - args->nr++; - } else if (args->nr) { - ret = copy_items(trans, inode, dst_path, args->src, - args->start_slot, args->nr, - LOG_INODE_ALL); - if (ret) - return ret; - args->nr = 1; - args->start_slot = path->slots[0]; - } else if (!args->nr) { - args->nr = 1; - args->start_slot = path->slots[0]; - } - nritems = btrfs_header_nritems(path->nodes[0]); - path->slots[0]++; - if (len < num_bytes) { - /* I _think_ this is ok, envision we write to a - * preallocated space that is adjacent to a previously - * written preallocated space that gets merged when we - * mark this preallocated space written. If we do not - * have the adjacent extent in cache then when we copy - * this extent it could end up being larger than our EM - * thinks it is, which is a-ok, so just set len to 0. - */ - len = 0; - } else { - len -= num_bytes; - } - start = key.offset + num_bytes; - args->next_offset = start; - search_start = start; + if (skip_csum) + return 0; - if (path->slots[0] < nritems) { - if (len) - goto next_slot; - break; - } + /* block start is already adjusted for the file extent offset. */ + ret = btrfs_lookup_csums_range(log->fs_info->csum_root, + em->block_start + csum_offset, + em->block_start + csum_offset + + csum_len - 1, &ordered_sums, 0); + if (ret) + return ret; - if (args->nr) { - ret = copy_items(trans, inode, dst_path, args->src, - args->start_slot, args->nr, - LOG_INODE_ALL); - if (ret) - return ret; - args->nr = 0; - btrfs_release_path(path); - } + while (!list_empty(&ordered_sums)) { + struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, + struct btrfs_ordered_sum, + list); + if (!ret) + ret = btrfs_csum_file_blocks(trans, log, sums); + list_del(&sums->list); + kfree(sums); } - return 0; + return ret; } static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, - struct btrfs_path *path, - struct btrfs_path *dst_path) + struct btrfs_path *path) { - struct log_args args; struct extent_map *em, *n; struct list_head extents; struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree; @@ -3297,8 +3372,6 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, INIT_LIST_HEAD(&extents); - memset(&args, 0, sizeof(args)); - write_lock(&tree->lock); test_gen = root->fs_info->last_trans_committed; @@ -3331,34 +3404,13 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, write_unlock(&tree->lock); - /* - * If the previous EM and the last extent we left off on aren't - * sequential then we need to copy the items we have and redo - * our search - */ - if (args.nr && em->mod_start != args.next_offset) { - ret = copy_items(trans, inode, dst_path, args.src, - args.start_slot, args.nr, - LOG_INODE_ALL); - if (ret) { - free_extent_map(em); - write_lock(&tree->lock); - continue; - } - btrfs_release_path(path); - args.nr = 0; - } - - ret = log_one_extent(trans, inode, root, em, path, dst_path, &args); + ret = log_one_extent(trans, inode, root, em, path); free_extent_map(em); write_lock(&tree->lock); } WARN_ON(!list_empty(&extents)); write_unlock(&tree->lock); - if (!ret && args.nr) - ret = copy_items(trans, inode, dst_path, args.src, - args.start_slot, args.nr, LOG_INODE_ALL); btrfs_release_path(path); return ret; } @@ -3551,10 +3603,8 @@ next_slot: log_extents: if (fast_search) { - btrfs_release_path(path); btrfs_release_path(dst_path); - ret = btrfs_log_changed_extents(trans, root, inode, path, - dst_path); + ret = btrfs_log_changed_extents(trans, root, inode, dst_path); if (ret) { err = ret; goto out_unlock; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 886f4ba..d79b5b6 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4983,6 +4983,7 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, em->bdev = (struct block_device *)map; em->start = logical; em->len = length; + em->orig_start = 0; em->block_start = 0; em->block_len = em->len; -- cgit v0.10.2 From bb146eb265091f472ada52a3419d41e9b0ff1f7d Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 15 Oct 2012 13:30:43 -0400 Subject: Btrfs: move checks in set_page_dirty under DEBUG This is a high traffic function, let's try and do as little as possible during normal operations shall we? Signed-off-by: Josef Bacik Signed-off-by: Chris Mason diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index faf1826..b8f7f04 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1001,6 +1001,7 @@ static void btree_invalidatepage(struct page *page, unsigned long offset) static int btree_set_page_dirty(struct page *page) { +#ifdef DEBUG struct extent_buffer *eb; BUG_ON(!PagePrivate(page)); @@ -1009,6 +1010,7 @@ static int btree_set_page_dirty(struct page *page) BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); BUG_ON(!atomic_read(&eb->refs)); btrfs_assert_tree_locked(eb); +#endif return __set_page_dirty_nobuffers(page); } -- cgit v0.10.2 From ed7b63eb8afd0bb8d978a23184d70c105b54aa26 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 15 Oct 2012 13:33:54 -0400 Subject: Btrfs: only clear dirty on the buffer if it is marked as dirty No reason to set the path blocking or loop through all of the pages if the extent buffer isn't actually marked dirty. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index b8f7f04..65f0367 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1142,11 +1142,11 @@ void clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, root->fs_info->dirty_metadata_bytes); } spin_unlock(&root->fs_info->delalloc_lock); - } - /* ugh, clear_extent_buffer_dirty needs to lock the page */ - btrfs_set_lock_blocking(buf); - clear_extent_buffer_dirty(buf); + /* ugh, clear_extent_buffer_dirty needs to lock the page */ + btrfs_set_lock_blocking(buf); + clear_extent_buffer_dirty(buf); + } } } -- cgit v0.10.2 From ad9145596986b672d8c8235c92ed5307f82d045d Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 15 Oct 2012 13:39:33 -0400 Subject: Btrfs: don't memset new tokens Our token logic depends on token->kaddr being set, and if it is not it sets everything properly as needed. So instead of memsetting just set token->kaddr to NULL. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 55aff67..cd02205 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1860,7 +1860,7 @@ struct btrfs_map_token { static inline void btrfs_init_map_token (struct btrfs_map_token *token) { - memset(token, 0, sizeof(*token)); + token->kaddr = NULL; } /* some macros to generate set/get funcs for the struct fields. This -- cgit v0.10.2 From 41be1f3b40b87de33cd2e7463dce88596dbdccc4 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 15 Oct 2012 13:43:18 -0400 Subject: Btrfs: optimize leaf_space_used This gets called at least 4 times for every level while adding an object, and it involves 3 kmapping calls, which on my box take about 5us a piece. So instead use a token, which brings us down to 1 kmap call and makes this function take 1/3 of the time per call. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index e8b3264..e7bea1d 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -3298,14 +3298,21 @@ static noinline int split_node(struct btrfs_trans_handle *trans, */ static int leaf_space_used(struct extent_buffer *l, int start, int nr) { + struct btrfs_item *start_item; + struct btrfs_item *end_item; + struct btrfs_map_token token; int data_len; int nritems = btrfs_header_nritems(l); int end = min(nritems, start + nr) - 1; if (!nr) return 0; - data_len = btrfs_item_end_nr(l, start); - data_len = data_len - btrfs_item_offset_nr(l, end); + btrfs_init_map_token(&token); + start_item = btrfs_item_nr(l, start); + end_item = btrfs_item_nr(l, end); + data_len = btrfs_token_item_offset(l, start_item, &token) + + btrfs_token_item_size(l, start_item, &token); + data_len = data_len - btrfs_token_item_offset(l, end_item, &token); data_len += sizeof(struct btrfs_item) * nr; WARN_ON(data_len < 0); return data_len; -- cgit v0.10.2 From 0b1c6ccadee4ea4adb98799f3430fc72e57a187f Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 23 Oct 2012 16:03:44 -0400 Subject: Btrfs: use tokens where we can in the tree log If we are syncing over and over the overhead of doing all those maps in fill_inode_item and log_changed_extents really starts to hurt, so use map tokens so we can avoid all the extra mapping. Since the token maps from our offset to the end of the page make sure to set the first thing in the item first so we really only do one map. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 7244481..83186c7 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2952,33 +2952,9 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, struct btrfs_inode_item *item, struct inode *inode, int log_inode_only) { - btrfs_set_inode_uid(leaf, item, i_uid_read(inode)); - btrfs_set_inode_gid(leaf, item, i_gid_read(inode)); - btrfs_set_inode_mode(leaf, item, inode->i_mode); - btrfs_set_inode_nlink(leaf, item, inode->i_nlink); - - btrfs_set_timespec_sec(leaf, btrfs_inode_atime(item), - inode->i_atime.tv_sec); - btrfs_set_timespec_nsec(leaf, btrfs_inode_atime(item), - inode->i_atime.tv_nsec); - - btrfs_set_timespec_sec(leaf, btrfs_inode_mtime(item), - inode->i_mtime.tv_sec); - btrfs_set_timespec_nsec(leaf, btrfs_inode_mtime(item), - inode->i_mtime.tv_nsec); - - btrfs_set_timespec_sec(leaf, btrfs_inode_ctime(item), - inode->i_ctime.tv_sec); - btrfs_set_timespec_nsec(leaf, btrfs_inode_ctime(item), - inode->i_ctime.tv_nsec); - - btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode)); - - btrfs_set_inode_sequence(leaf, item, inode->i_version); - btrfs_set_inode_transid(leaf, item, trans->transid); - btrfs_set_inode_rdev(leaf, item, inode->i_rdev); - btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags); - btrfs_set_inode_block_group(leaf, item, 0); + struct btrfs_map_token token; + + btrfs_init_map_token(&token); if (log_inode_only) { /* set the generation to zero so the recover code @@ -2986,14 +2962,43 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, * just to say 'this inode exists' and a logging * to say 'update this inode with these values' */ - btrfs_set_inode_generation(leaf, item, 0); - btrfs_set_inode_size(leaf, item, 0); + btrfs_set_token_inode_generation(leaf, item, 0, &token); + btrfs_set_token_inode_size(leaf, item, 0, &token); } else { - btrfs_set_inode_generation(leaf, item, - BTRFS_I(inode)->generation); - btrfs_set_inode_size(leaf, item, inode->i_size); - } - + btrfs_set_token_inode_generation(leaf, item, + BTRFS_I(inode)->generation, + &token); + btrfs_set_token_inode_size(leaf, item, inode->i_size, &token); + } + + btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token); + btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token); + btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token); + btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token); + + btrfs_set_token_timespec_sec(leaf, btrfs_inode_atime(item), + inode->i_atime.tv_sec, &token); + btrfs_set_token_timespec_nsec(leaf, btrfs_inode_atime(item), + inode->i_atime.tv_nsec, &token); + + btrfs_set_token_timespec_sec(leaf, btrfs_inode_mtime(item), + inode->i_mtime.tv_sec, &token); + btrfs_set_token_timespec_nsec(leaf, btrfs_inode_mtime(item), + inode->i_mtime.tv_nsec, &token); + + btrfs_set_token_timespec_sec(leaf, btrfs_inode_ctime(item), + inode->i_ctime.tv_sec, &token); + btrfs_set_token_timespec_nsec(leaf, btrfs_inode_ctime(item), + inode->i_ctime.tv_nsec, &token); + + btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode), + &token); + + btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token); + btrfs_set_token_inode_transid(leaf, item, trans->transid, &token); + btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token); + btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token); + btrfs_set_token_inode_block_group(leaf, item, 0, &token); } static int log_inode_item(struct btrfs_trans_handle *trans, @@ -3267,6 +3272,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans, struct btrfs_file_extent_item *fi; struct extent_buffer *leaf; struct list_head ordered_sums; + struct btrfs_map_token token; struct btrfs_key key; u64 csum_offset = em->mod_start - em->start; u64 csum_len = em->mod_len; @@ -3276,6 +3282,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans, bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; INIT_LIST_HEAD(&ordered_sums); + btrfs_init_map_token(&token); key.objectid = btrfs_ino(inode); key.type = BTRFS_EXTENT_DATA_KEY; key.offset = em->start; @@ -3289,37 +3296,49 @@ static int log_one_extent(struct btrfs_trans_handle *trans, leaf = path->nodes[0]; fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); - btrfs_set_file_extent_generation(leaf, fi, em->generation); + btrfs_set_token_file_extent_generation(leaf, fi, em->generation, + &token); if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { skip_csum = true; - btrfs_set_file_extent_type(leaf, fi, - BTRFS_FILE_EXTENT_PREALLOC); + btrfs_set_token_file_extent_type(leaf, fi, + BTRFS_FILE_EXTENT_PREALLOC, + &token); } else { - btrfs_set_file_extent_type(leaf, fi, BTRFS_FILE_EXTENT_REG); + btrfs_set_token_file_extent_type(leaf, fi, + BTRFS_FILE_EXTENT_REG, + &token); if (em->block_start == 0) skip_csum = true; } block_len = max(em->block_len, em->orig_block_len); if (em->compress_type != BTRFS_COMPRESS_NONE) { - btrfs_set_file_extent_disk_bytenr(leaf, fi, em->block_start); - btrfs_set_file_extent_disk_num_bytes(leaf, fi, block_len); + btrfs_set_token_file_extent_disk_bytenr(leaf, fi, + em->block_start, + &token); + btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len, + &token); } else if (em->block_start < EXTENT_MAP_LAST_BYTE) { - btrfs_set_file_extent_disk_bytenr(leaf, fi, - em->block_start - - extent_offset); - btrfs_set_file_extent_disk_num_bytes(leaf, fi, block_len); + btrfs_set_token_file_extent_disk_bytenr(leaf, fi, + em->block_start - + extent_offset, &token); + btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len, + &token); } else { - btrfs_set_file_extent_disk_bytenr(leaf, fi, 0); - btrfs_set_file_extent_disk_num_bytes(leaf, fi, 0); - } - - btrfs_set_file_extent_offset(leaf, fi, em->start - em->orig_start); - btrfs_set_file_extent_num_bytes(leaf, fi, em->len); - btrfs_set_file_extent_ram_bytes(leaf, fi, em->len); - btrfs_set_file_extent_compression(leaf, fi, em->compress_type); - btrfs_set_file_extent_encryption(leaf, fi, 0); - btrfs_set_file_extent_other_encoding(leaf, fi, 0); + btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 0, &token); + btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, 0, + &token); + } + + btrfs_set_token_file_extent_offset(leaf, fi, + em->start - em->orig_start, + &token); + btrfs_set_token_file_extent_num_bytes(leaf, fi, em->len, &token); + btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->len, &token); + btrfs_set_token_file_extent_compression(leaf, fi, em->compress_type, + &token); + btrfs_set_token_file_extent_encryption(leaf, fi, 0, &token); + btrfs_set_token_file_extent_other_encoding(leaf, fi, 0, &token); btrfs_mark_buffer_dirty(leaf); /* -- cgit v0.10.2 From 5124e00ec5b0be56155a11aec416fcc5125339f1 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 7 Nov 2012 13:44:13 -0500 Subject: Btrfs: only unlock and relock if we have to I noticed while doing fsync tests that we were always dropping the path and re-searching when we first cow the log root even though we've already gotten the write lock on the root. That's because we don't take into account that there might not be a parent node, so fix the check to make sure there is actually a parent node before we undo all of this work for nothing. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index e7bea1d..c7b67cf 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -2564,7 +2564,10 @@ again: * must have write locks on this node and the * parent */ - if (level + 1 > write_lock_level) { + if (level > write_lock_level || + (level + 1 > write_lock_level && + level + 1 < BTRFS_MAX_LEVEL && + p->nodes[level + 1])) { write_lock_level = level + 1; btrfs_release_path(p); goto again; -- cgit v0.10.2 From 6c760c072403f446ff829ec9e89568943a3c2ef2 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 9 Nov 2012 10:53:21 -0500 Subject: Btrfs: do not call file_update_time in aio_write This starts a transaction and dirties the inode everytime we call it, which is super expensive if you have a write heavy workload. We will be updating the inode when the IO completes and we reserve the space for the inode update when we reserve space for the write, so there is no chance of loss of information or enospc issues. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index c56088e..20452c1 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1464,6 +1464,24 @@ out: return written ? written : err; } +static void update_time_for_write(struct inode *inode) +{ + struct timespec now; + + if (IS_NOCMTIME(inode)) + return; + + now = current_fs_time(inode->i_sb); + if (!timespec_equal(&inode->i_mtime, &now)) + inode->i_mtime = now; + + if (!timespec_equal(&inode->i_ctime, &now)) + inode->i_ctime = now; + + if (IS_I_VERSION(inode)) + inode_inc_iversion(inode); +} + static ssize_t btrfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos) @@ -1519,11 +1537,13 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, goto out; } - err = file_update_time(file); - if (err) { - mutex_unlock(&inode->i_mutex); - goto out; - } + /* + * We reserve space for updating the inode when we reserve space for the + * extent we are going to write, so we will enospc out there. We don't + * need to start yet another transaction to update the inode as we will + * update the inode when we finish writing whatever data we write. + */ + update_time_for_write(inode); start_pos = round_down(pos, root->sectorsize); if (start_pos > i_size_read(inode)) { @@ -1563,8 +1583,13 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, * this will either be one more than the running transaction * or the generation used for the next transaction if there isn't * one running right now. + * + * We also have to set last_sub_trans to the current log transid, + * otherwise subsequent syncs to a file that's been synced in this + * transaction will appear to have already occured. */ BTRFS_I(inode)->last_trans = root->fs_info->generation + 1; + BTRFS_I(inode)->last_sub_trans = root->log_transid; if (num_written > 0 || num_written == -EIOCBQUEUED) { err = generic_write_sync(file, pos, num_written); if (err < 0 && num_written > 0) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 355a297..1673dbd 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1922,22 +1922,20 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */ - ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); - if (!ret) { - if (nolock) - trans = btrfs_join_transaction_nolock(root); - else - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - trans = NULL; - goto out; - } - trans->block_rsv = &root->fs_info->delalloc_block_rsv; - ret = btrfs_update_inode_fallback(trans, root, inode); - if (ret) /* -ENOMEM or corruption */ - btrfs_abort_transaction(trans, root, ret); + btrfs_ordered_update_i_size(inode, 0, ordered_extent); + if (nolock) + trans = btrfs_join_transaction_nolock(root); + else + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + trans = NULL; + goto out; } + trans->block_rsv = &root->fs_info->delalloc_block_rsv; + ret = btrfs_update_inode_fallback(trans, root, inode); + if (ret) /* -ENOMEM or corruption */ + btrfs_abort_transaction(trans, root, ret); goto out; } @@ -1986,15 +1984,11 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) add_pending_csums(trans, inode, ordered_extent->file_offset, &ordered_extent->list); - ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); - if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { - ret = btrfs_update_inode_fallback(trans, root, inode); - if (ret) { /* -ENOMEM or corruption */ - btrfs_abort_transaction(trans, root, ret); - goto out_unlock; - } - } else { - btrfs_set_inode_last_trans(trans, inode); + btrfs_ordered_update_i_size(inode, 0, ordered_extent); + ret = btrfs_update_inode_fallback(trans, root, inode); + if (ret) { /* -ENOMEM or corruption */ + btrfs_abort_transaction(trans, root, ret); + goto out_unlock; } ret = 0; out_unlock: -- cgit v0.10.2 From 4ded4f639533ed5f02a0f0ab20d43bb9659c91f8 Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Wed, 14 Nov 2012 18:57:29 +0000 Subject: Btrfs: fix BUG() in scrub when first superblock reading gives EIO This fixes a very special case that can be reproduced by just disconnecting a disk at runtime, and without unmounting the filesystem first, start scrub on the filesystem with the disconnected disk. All read and write EIOs are handled correctly, only the first superblock is an exception and gives a BUG() in a subfunction. The BUG() is correct, it would crash later otherwise. The subfunction must not be called for superblocks and this is what the fix changes. Reported-by: Joeri Vanthienen Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 8db6a64..bdbb94f 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -785,6 +785,17 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) BUG_ON(sblock_to_check->page_count < 1); fs_info = sctx->dev_root->fs_info; + if (sblock_to_check->pagev[0]->flags & BTRFS_EXTENT_FLAG_SUPER) { + /* + * if we find an error in a super block, we just report it. + * They will get written with the next transaction commit + * anyway + */ + spin_lock(&sctx->stat_lock); + ++sctx->stat.super_errors; + spin_unlock(&sctx->stat_lock); + return 0; + } length = sblock_to_check->page_count * PAGE_SIZE; logical = sblock_to_check->pagev[0]->logical; generation = sblock_to_check->pagev[0]->generation; -- cgit v0.10.2 From 31e502298d80e2af9001d17dc419a3fd4b0bebef Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Wed, 21 Nov 2012 14:18:10 +0000 Subject: Btrfs: put raid properties into global table Raid properties can be shared among raid calculation code, we can put them into a global table to keep it simple. Signed-off-by: Liu Bo Signed-off-by: Chris Mason diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index cd02205..44d9bc8 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3077,6 +3077,7 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range); int btrfs_init_space_info(struct btrfs_fs_info *fs_info); int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); +int __get_raid_index(u64 flags); /* ctree.c */ int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, int level, int *slot); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index e152809..b9526f7 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5479,7 +5479,7 @@ wait_block_group_cache_done(struct btrfs_block_group_cache *cache) return 0; } -static int __get_block_group_index(u64 flags) +int __get_raid_index(u64 flags) { int index; @@ -5499,7 +5499,7 @@ static int __get_block_group_index(u64 flags) static int get_block_group_index(struct btrfs_block_group_cache *cache) { - return __get_block_group_index(cache->flags); + return __get_raid_index(cache->flags); } enum btrfs_loop_type { @@ -7441,7 +7441,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) */ target = get_restripe_target(root->fs_info, block_group->flags); if (target) { - index = __get_block_group_index(extended_to_chunk(target)); + index = __get_raid_index(extended_to_chunk(target)); } else { /* * this is just a balance, so if we were marked as full diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index d79b5b6..5cce6aa 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3491,6 +3491,14 @@ static int btrfs_cmp_device_info(const void *a, const void *b) return 0; } +struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { + { 2, 1, 0, 4, 2, 2 /* raid10 */ }, + { 1, 1, 2, 2, 2, 2 /* raid1 */ }, + { 1, 2, 1, 1, 1, 2 /* dup */ }, + { 1, 1, 0, 2, 1, 1 /* raid0 */ }, + { 1, 1, 0, 1, 1, 1 /* single */ }, +}; + static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root, struct map_lookup **map_ret, @@ -3520,43 +3528,21 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, int ndevs; int i; int j; + int index; BUG_ON(!alloc_profile_is_valid(type, 0)); if (list_empty(&fs_devices->alloc_list)) return -ENOSPC; - sub_stripes = 1; - dev_stripes = 1; - devs_increment = 1; - ncopies = 1; - devs_max = 0; /* 0 == as many as possible */ - devs_min = 1; + index = __get_raid_index(type); - /* - * define the properties of each RAID type. - * FIXME: move this to a global table and use it in all RAID - * calculation code - */ - if (type & (BTRFS_BLOCK_GROUP_DUP)) { - dev_stripes = 2; - ncopies = 2; - devs_max = 1; - } else if (type & (BTRFS_BLOCK_GROUP_RAID0)) { - devs_min = 2; - } else if (type & (BTRFS_BLOCK_GROUP_RAID1)) { - devs_increment = 2; - ncopies = 2; - devs_max = 2; - devs_min = 2; - } else if (type & (BTRFS_BLOCK_GROUP_RAID10)) { - sub_stripes = 2; - devs_increment = 2; - ncopies = 2; - devs_min = 4; - } else { - devs_max = 1; - } + sub_stripes = btrfs_raid_array[index].sub_stripes; + dev_stripes = btrfs_raid_array[index].dev_stripes; + devs_max = btrfs_raid_array[index].devs_max; + devs_min = btrfs_raid_array[index].devs_min; + devs_increment = btrfs_raid_array[index].devs_increment; + ncopies = btrfs_raid_array[index].ncopies; if (type & BTRFS_BLOCK_GROUP_DATA) { max_stripe_size = 1024 * 1024 * 1024; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 37d0157..d3c3939 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -180,6 +180,15 @@ struct btrfs_device_info { u64 total_avail; }; +struct btrfs_raid_attr { + int sub_stripes; /* sub_stripes info for map */ + int dev_stripes; /* stripes per dev */ + int devs_max; /* max devs to use */ + int devs_min; /* min devs needed */ + int devs_increment; /* ndevs has to be a multiple of this */ + int ncopies; /* how many copies to data has */ +}; + struct map_lookup { u64 type; int io_align; -- cgit v0.10.2 From 9185aa587b7425f8f4520da2e66792f5f3c2b815 Mon Sep 17 00:00:00 2001 From: Filipe Brandenburger Date: Fri, 30 Nov 2012 03:40:08 +0000 Subject: Btrfs: fix permissions of empty files not affected by umask When a new file is created with btrfs_create(), the inode will initially be created with permissions 0666 and later on in btrfs_init_acl() it will be adapted to mask out the umask bits. The problem is that this change won't make it into the btrfs_inode unless there's another change to the inode (e.g. writing content changing the size or touching the file changing the mtime.) This fix adds a call to btrfs_update_inode() to btrfs_create() to make sure that the change will not get lost if the in-memory inode is flushed before other changes are made to the file. Signed-off-by: Filipe Brandenburger Reviewed-by: Liu Bo Signed-off-by: Chris Mason diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 1673dbd..2e6918c 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5041,6 +5041,10 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, if (err) goto out_unlock; + err = btrfs_update_inode(trans, root, inode); + if (err) + goto out_unlock; + /* * If the active LSM wants to access the inode during * d_instantiate it needs these. Smack checks to see -- cgit v0.10.2 From 1135d6df222046a0ec14a1c9335de99907879922 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 14 Dec 2012 13:46:43 -0500 Subject: Btrfs: fix autodefrag and umount lockup This happens because writeback_inodes_sb_nr_if_idle does down_read. This doesn't work for us and it has not been fixed upstream yet, so do it ourselves and use that instead so we can stop having this stupid long standing lockup. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index b9526f7..afc3ac5 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3689,6 +3689,20 @@ static int can_overcommit(struct btrfs_root *root, return 0; } +static int writeback_inodes_sb_nr_if_idle_safe(struct super_block *sb, + unsigned long nr_pages, + enum wb_reason reason) +{ + if (!writeback_in_progress(sb->s_bdi) && + down_read_trylock(&sb->s_umount)) { + writeback_inodes_sb_nr(sb, nr_pages, reason); + up_read(&sb->s_umount); + return 1; + } + + return 0; +} + /* * shrink metadata reservation for delalloc */ @@ -3721,8 +3735,9 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig, while (delalloc_bytes && loops < 3) { max_reclaim = min(delalloc_bytes, to_reclaim); nr_pages = max_reclaim >> PAGE_CACHE_SHIFT; - writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages, - WB_REASON_FS_FREE_SPACE); + writeback_inodes_sb_nr_if_idle_safe(root->fs_info->sb, + nr_pages, + WB_REASON_FS_FREE_SPACE); /* * We need to wait for the async pages to actually start before -- cgit v0.10.2 From c64c2bd890df3b9a66c52c33df110777058c011e Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 14 Dec 2012 13:48:14 -0500 Subject: Btrfs: don't take inode delalloc mutex if we're a free space inode This confuses and angers lockdep even though it's ok. We don't really need the lock for free space inodes since only the transaction committer will be reserving space. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index afc3ac5..d133edf 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4535,16 +4535,25 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) int extra_reserve = 0; enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL; int ret; + bool delalloc_lock = true; - /* Need to be holding the i_mutex here if we aren't free space cache */ - if (btrfs_is_free_space_inode(inode)) + /* If we are a free space inode we need to not flush since we will be in + * the middle of a transaction commit. We also don't need the delalloc + * mutex since we won't race with anybody. We need this mostly to make + * lockdep shut its filthy mouth. + */ + if (btrfs_is_free_space_inode(inode)) { flush = BTRFS_RESERVE_NO_FLUSH; + delalloc_lock = false; + } if (flush != BTRFS_RESERVE_NO_FLUSH && btrfs_transaction_in_commit(root->fs_info)) schedule_timeout(1); - mutex_lock(&BTRFS_I(inode)->delalloc_mutex); + if (delalloc_lock) + mutex_lock(&BTRFS_I(inode)->delalloc_mutex); + num_bytes = ALIGN(num_bytes, root->sectorsize); spin_lock(&BTRFS_I(inode)->lock); @@ -4577,7 +4586,8 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) spin_lock(&BTRFS_I(inode)->lock); calc_csum_metadata_size(inode, num_bytes, 0); spin_unlock(&BTRFS_I(inode)->lock); - mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); + if (delalloc_lock) + mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); return ret; } } @@ -4616,7 +4626,8 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) btrfs_qgroup_free(root, num_bytes + nr_extents * root->leafsize); } - mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); + if (delalloc_lock) + mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); return ret; } @@ -4628,7 +4639,9 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) } BTRFS_I(inode)->reserved_extents += nr_extents; spin_unlock(&BTRFS_I(inode)->lock); - mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); + + if (delalloc_lock) + mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); if (to_reserve) trace_btrfs_space_reservation(root->fs_info,"delalloc", -- cgit v0.10.2 From 9c52057c698fb96f8f07e7a4bcf4801a092bda89 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 17 Dec 2012 14:26:57 -0500 Subject: Btrfs: fix hash overflow handling The handling for directory crc hash overflows was fairly obscure, split_leaf returns EOVERFLOW when we try to extend the item and that is supposed to bubble up to userland. For a while it did so, but along the way we added better handling of errors and forced the FS readonly if we hit IO errors during the directory insertion. Along the way, we started testing only for EEXIST and the EOVERFLOW case was dropped. The end result is that we may force the FS readonly if we catch a directory hash bucket overflow. This fixes a few problem spots. First I add tests for EOVERFLOW in the places where we can safely just return the error up the chain. btrfs_rename is harder though, because it tries to insert the new directory item only after it has already unlinked anything the rename was going to overwrite. Rather than adding very complex logic, I added a helper to test for the hash overflow case early while it is still safe to bail out. Snapshot and subvolume creation had a similar problem, so they are using the new helper now too. Signed-off-by: Chris Mason Reported-by: Pascal Junod diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 44d9bc8..547b7b0 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3283,6 +3283,8 @@ void btrfs_update_root_times(struct btrfs_trans_handle *trans, struct btrfs_root *root); /* dir-item.c */ +int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, + const char *name, int name_len); int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, const char *name, int name_len, struct inode *dir, diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index c1a074d..502c215 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -213,6 +213,65 @@ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, return btrfs_match_dir_item_name(root, path, name, name_len); } +int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, + const char *name, int name_len) +{ + int ret; + struct btrfs_key key; + struct btrfs_dir_item *di; + int data_size; + struct extent_buffer *leaf; + int slot; + struct btrfs_path *path; + + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = dir; + btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY); + key.offset = btrfs_name_hash(name, name_len); + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + + /* return back any errors */ + if (ret < 0) + goto out; + + /* nothing found, we're safe */ + if (ret > 0) { + ret = 0; + goto out; + } + + /* we found an item, look for our name in the item */ + di = btrfs_match_dir_item_name(root, path, name, name_len); + if (di) { + /* our exact name was found */ + ret = -EEXIST; + goto out; + } + + /* + * see if there is room in the item to insert this + * name + */ + data_size = sizeof(*di) + name_len + sizeof(struct btrfs_item); + leaf = path->nodes[0]; + slot = path->slots[0]; + if (data_size + btrfs_item_size_nr(leaf, slot) + + sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root)) { + ret = -EOVERFLOW; + } else { + /* plenty of insertion room */ + ret = 0; + } +out: + btrfs_free_path(path); + return ret; +} + /* * lookup a directory item based on index. 'dir' is the objectid * we're searching in, and 'mod' tells us if you plan on deleting the diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 2e6918c..e95b1f9 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4885,7 +4885,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, ret = btrfs_insert_dir_item(trans, root, name, name_len, parent_inode, &key, btrfs_inode_type(inode), index); - if (ret == -EEXIST) + if (ret == -EEXIST || ret == -EOVERFLOW) goto fail_dir_item; else if (ret) { btrfs_abort_transaction(trans, root, ret); @@ -7336,6 +7336,28 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (S_ISDIR(old_inode->i_mode) && new_inode && new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) return -ENOTEMPTY; + + + /* check for collisions, even if the name isn't there */ + ret = btrfs_check_dir_item_collision(root, new_dir->i_ino, + new_dentry->d_name.name, + new_dentry->d_name.len); + + if (ret) { + if (ret == -EEXIST) { + /* we shouldn't get + * eexist without a new_inode */ + if (!new_inode) { + WARN_ON(1); + return ret; + } + } else { + /* maybe -EOVERFLOW */ + return ret; + } + } + ret = 0; + /* * we're using rename to replace one file with another. * and the replacement file is large. Start IO on it now so diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 657d83c..d4608ab 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -710,6 +710,16 @@ static noinline int btrfs_mksubvol(struct path *parent, if (error) goto out_dput; + /* + * even if this name doesn't exist, we may get hash collisions. + * check for them now when we can safely fail + */ + error = btrfs_check_dir_item_collision(BTRFS_I(dir)->root, + dir->i_ino, name, + namelen); + if (error) + goto out_dput; + down_read(&BTRFS_I(dir)->root->fs_info->subvol_sem); if (btrfs_root_refs(&BTRFS_I(dir)->root->root_item) == 0) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index e6509b9..87fac9a 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1190,7 +1190,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, parent_inode, &key, BTRFS_FT_DIR, index); /* We have check then name at the beginning, so it is impossible. */ - BUG_ON(ret == -EEXIST); + BUG_ON(ret == -EEXIST || ret == -EOVERFLOW); if (ret) { btrfs_abort_transaction(trans, root, ret); goto fail; -- cgit v0.10.2 From 213490b301773ea9c6fb89a86424a6901fcdd069 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Tue, 11 Sep 2012 08:33:50 -0600 Subject: Btrfs: fix a bug of per-file nocow Users report a bug, the reproducer is: $ mkfs.btrfs /dev/loop0 $ mount /dev/loop0 /mnt/btrfs/ $ mkdir /mnt/btrfs/dir $ chattr +C /mnt/btrfs/dir/ $ dd if=/dev/zero of=/mnt/btrfs/dir/foo bs=4K count=10; $ lsattr /mnt/btrfs/dir/foo ---------------C- /mnt/btrfs/dir/foo $ filefrag /mnt/btrfs/dir/foo /mnt/btrfs/dir/foo: 1 extent found ---> an extent $ dd if=/dev/zero of=/mnt/btrfs/dir/foo bs=4K count=1 seek=5 conv=notrunc,nocreat; sync $ filefrag /mnt/btrfs/dir/foo /mnt/btrfs/dir/foo: 3 extents found ---> with nocow, btrfs breaks the extent into three parts The new created file should not only inherit the NODATACOW flag, but also honor NODATASUM flag, because we must do COW on a file extent with checksum. Signed-off-by: Liu Bo Signed-off-by: Chris Mason diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index e95b1f9..67ed24a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4818,8 +4818,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, if (S_ISREG(mode)) { if (btrfs_test_opt(root, NODATASUM)) BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; - if (btrfs_test_opt(root, NODATACOW) || - (BTRFS_I(dir)->flags & BTRFS_INODE_NODATACOW)) + if (btrfs_test_opt(root, NODATACOW)) BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW; } diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index d4608ab..7624212 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -141,8 +141,11 @@ void btrfs_inherit_iflags(struct inode *inode, struct inode *dir) BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS; } - if (flags & BTRFS_INODE_NODATACOW) + if (flags & BTRFS_INODE_NODATACOW) { BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW; + if (S_ISREG(inode->i_mode)) + BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; + } btrfs_update_iflags(inode); } -- cgit v0.10.2