From f1ebcc74d5b2159f44c96b479b6eb8afc7829095 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Mon, 14 Nov 2011 20:48:06 -0500 Subject: Btrfs: fix tree corruption after multi-thread snapshots and inode_cache flush The btrfs snapshotting code requires that once a root has been snapshotted, we don't change it during a commit. But there are two cases to lead to tree corruptions: 1) multi-thread snapshots can commit serveral snapshots in a transaction, and this may change the src root when processing the following pending snapshots, which lead to the former snapshots corruptions; 2) the free inode cache was changing the roots when it root the cache, which lead to corruptions. This fixes things by making sure we force COW the block after we create a snapshot during commiting a transaction, then any changes to the roots will result in COW, and we get all the fs roots and snapshot roots to be consistent. Signed-off-by: Liu Bo Signed-off-by: Miao Xie Signed-off-by: Chris Mason diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 0fe615e..dede441 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -514,10 +514,25 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf) { + /* ensure we can see the force_cow */ + smp_rmb(); + + /* + * We do not need to cow a block if + * 1) this block is not created or changed in this transaction; + * 2) this block does not belong to TREE_RELOC tree; + * 3) the root is not forced COW. + * + * What is forced COW: + * when we create snapshot during commiting the transaction, + * after we've finished coping src root, we must COW the shared + * block to ensure the metadata consistency. + */ if (btrfs_header_generation(buf) == trans->transid && !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) && !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID && - btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC))) + btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) && + !root->force_cow) return 0; return 1; } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index b9ba59f..b1cb3c0 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1271,6 +1271,8 @@ struct btrfs_root { * for stat. It may be used for more later */ dev_t anon_dev; + + int force_cow; }; struct btrfs_ioctl_defrag_range_args { diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 6a0574e..81376d9 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -785,6 +785,10 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans, btrfs_save_ino_cache(root, trans); + /* see comments in should_cow_block() */ + root->force_cow = 0; + smp_wmb(); + if (root->commit_root != root->node) { mutex_lock(&root->fs_commit_mutex); switch_commit_root(root); @@ -947,6 +951,10 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, btrfs_tree_unlock(old); free_extent_buffer(old); + /* see comments in should_cow_block() */ + root->force_cow = 1; + smp_wmb(); + btrfs_set_root_node(new_root_item, tmp); /* record when the snapshot was created in key.offset */ key.offset = trans->transid; -- cgit v0.10.2 From 387125fc722a8ed432066b85a552917343bdafca Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 18 Nov 2011 15:07:51 -0500 Subject: Btrfs: fix barrier flushes When btrfs is writing the super blocks, it send barrier flushes to make sure writeback caching drives get all the metadata on disk in the right order. But, we have two bugs in the way these are sent down. When doing full commits (not via the tree log), we are sending the barrier down before the last super when it should be going down before the first. In multi-device setups, we should be waiting for the barriers to complete on all devices before writing any of the supers. Both of these bugs can cause corruptions on power failures. We fix it with some new code to send down empty barriers to all devices before writing the first super. Alexandre Oliva found the multi-device bug. Arne Jansen did the async barrier loop. Signed-off-by: Chris Mason Reported-by: Alexandre Oliva diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index b6a5c0d..48d3013 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2573,22 +2573,10 @@ static int write_dev_supers(struct btrfs_device *device, int errors = 0; u32 crc; u64 bytenr; - int last_barrier = 0; if (max_mirrors == 0) max_mirrors = BTRFS_SUPER_MIRROR_MAX; - /* make sure only the last submit_bh does a barrier */ - if (do_barriers) { - for (i = 0; i < max_mirrors; i++) { - bytenr = btrfs_sb_offset(i); - if (bytenr + BTRFS_SUPER_INFO_SIZE >= - device->total_bytes) - break; - last_barrier = i; - } - } - for (i = 0; i < max_mirrors; i++) { bytenr = btrfs_sb_offset(i); if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes) @@ -2634,17 +2622,136 @@ static int write_dev_supers(struct btrfs_device *device, bh->b_end_io = btrfs_end_buffer_write_sync; } - if (i == last_barrier && do_barriers) - ret = submit_bh(WRITE_FLUSH_FUA, bh); - else - ret = submit_bh(WRITE_SYNC, bh); - + /* + * we fua the first super. The others we allow + * to go down lazy. + */ + ret = submit_bh(WRITE_FUA, bh); if (ret) errors++; } return errors < i ? 0 : -1; } +/* + * endio for the write_dev_flush, this will wake anyone waiting + * for the barrier when it is done + */ +static void btrfs_end_empty_barrier(struct bio *bio, int err) +{ + if (err) { + if (err == -EOPNOTSUPP) + set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); + clear_bit(BIO_UPTODATE, &bio->bi_flags); + } + if (bio->bi_private) + complete(bio->bi_private); + bio_put(bio); +} + +/* + * trigger flushes for one the devices. If you pass wait == 0, the flushes are + * sent down. With wait == 1, it waits for the previous flush. + * + * any device where the flush fails with eopnotsupp are flagged as not-barrier + * capable + */ +static int write_dev_flush(struct btrfs_device *device, int wait) +{ + struct bio *bio; + int ret = 0; + + if (device->nobarriers) + return 0; + + if (wait) { + bio = device->flush_bio; + if (!bio) + return 0; + + wait_for_completion(&device->flush_wait); + + if (bio_flagged(bio, BIO_EOPNOTSUPP)) { + printk("btrfs: disabling barriers on dev %s\n", + device->name); + device->nobarriers = 1; + } + if (!bio_flagged(bio, BIO_UPTODATE)) { + ret = -EIO; + } + + /* drop the reference from the wait == 0 run */ + bio_put(bio); + device->flush_bio = NULL; + + return ret; + } + + /* + * one reference for us, and we leave it for the + * caller + */ + device->flush_bio = NULL;; + bio = bio_alloc(GFP_NOFS, 0); + if (!bio) + return -ENOMEM; + + bio->bi_end_io = btrfs_end_empty_barrier; + bio->bi_bdev = device->bdev; + init_completion(&device->flush_wait); + bio->bi_private = &device->flush_wait; + device->flush_bio = bio; + + bio_get(bio); + submit_bio(WRITE_FLUSH, bio); + + return 0; +} + +/* + * send an empty flush down to each device in parallel, + * then wait for them + */ +static int barrier_all_devices(struct btrfs_fs_info *info) +{ + struct list_head *head; + struct btrfs_device *dev; + int errors = 0; + int ret; + + /* send down all the barriers */ + head = &info->fs_devices->devices; + list_for_each_entry_rcu(dev, head, dev_list) { + if (!dev->bdev) { + errors++; + continue; + } + if (!dev->in_fs_metadata || !dev->writeable) + continue; + + ret = write_dev_flush(dev, 0); + if (ret) + errors++; + } + + /* wait for all the barriers */ + list_for_each_entry_rcu(dev, head, dev_list) { + if (!dev->bdev) { + errors++; + continue; + } + if (!dev->in_fs_metadata || !dev->writeable) + continue; + + ret = write_dev_flush(dev, 1); + if (ret) + errors++; + } + if (errors) + return -EIO; + return 0; +} + int write_all_supers(struct btrfs_root *root, int max_mirrors) { struct list_head *head; @@ -2666,6 +2773,10 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors) mutex_lock(&root->fs_info->fs_devices->device_list_mutex); head = &root->fs_info->fs_devices->devices; + + if (do_barriers) + barrier_all_devices(root->fs_info); + list_for_each_entry_rcu(dev, head, dev_list) { if (!dev->bdev) { total_errors++; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index ab5b1c4..78f2d4d 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -100,6 +100,12 @@ struct btrfs_device { struct reada_zone *reada_curr_zone; struct radix_tree_root reada_zones; struct radix_tree_root reada_extents; + + /* for sending down flush barriers */ + struct bio *flush_bio; + struct completion flush_wait; + int nobarriers; + }; struct btrfs_fs_devices { -- cgit v0.10.2 From 745c4d8e160afaf6c75e887c27ea4b75c8142b26 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Sun, 20 Nov 2011 07:31:57 -0500 Subject: btrfs: Fix up 32/64-bit compatibility for new ioctls This patch casts to unsigned long before casting to a pointer and fixes the following warnings: fs/btrfs/extent_io.c:2289:20: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast] fs/btrfs/ioctl.c:2933:37: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast] fs/btrfs/ioctl.c:2937:21: warning: cast to pointer from integer of different size [-Wint-to-pointer-cast] fs/btrfs/ioctl.c:3020:21: warning: cast to pointer from integer of different size [-Wint-to-pointer-cast] fs/btrfs/scrub.c:275:4: warning: cast to pointer from integer of different size [-Wint-to-pointer-cast] fs/btrfs/backref.c:686:27: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast] Signed-off-by: Jeff Mahoney Signed-off-by: Chris Mason diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 8855aad..22c64ff 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -683,7 +683,7 @@ static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref, return PTR_ERR(fspath); if (fspath > fspath_min) { - ipath->fspath->val[i] = (u64)fspath; + ipath->fspath->val[i] = (u64)(unsigned long)fspath; ++ipath->fspath->elem_cnt; ipath->fspath->bytes_left = fspath - fspath_min; } else { diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 1f87c4d..47fdba7 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2286,7 +2286,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err) } if (!uptodate) { u64 failed_mirror; - failed_mirror = (u64)bio->bi_bdev; + failed_mirror = (unsigned long)bio->bi_bdev; if (tree->ops && tree->ops->readpage_io_failed_hook) ret = tree->ops->readpage_io_failed_hook( bio, page, start, end, diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 4a34c47..bda53fc 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2930,11 +2930,13 @@ static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg) goto out; for (i = 0; i < ipath->fspath->elem_cnt; ++i) { - rel_ptr = ipath->fspath->val[i] - (u64)ipath->fspath->val; + rel_ptr = ipath->fspath->val[i] - + (u64)(unsigned long)ipath->fspath->val; ipath->fspath->val[i] = rel_ptr; } - ret = copy_to_user((void *)ipa->fspath, (void *)ipath->fspath, size); + ret = copy_to_user((void *)(unsigned long)ipa->fspath, + (void *)(unsigned long)ipath->fspath, size); if (ret) { ret = -EFAULT; goto out; @@ -3017,7 +3019,8 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root, if (ret < 0) goto out; - ret = copy_to_user((void *)loi->inodes, (void *)inodes, size); + ret = copy_to_user((void *)(unsigned long)loi->inodes, + (void *)(unsigned long)inodes, size); if (ret) ret = -EFAULT; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index f4190f2..fab420d 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -272,7 +272,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx) swarn->logical, swarn->dev->name, (unsigned long long)swarn->sector, root, inum, offset, min(isize - offset, (u64)PAGE_SIZE), nlink, - (char *)ipath->fspath->val[i]); + (char *)(unsigned long)ipath->fspath->val[i]); free_ipath(ipath); return 0; -- cgit v0.10.2 From 32240a913d9f3a5aad42175d7696590ea1bfdb08 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Sun, 20 Nov 2011 07:33:38 -0500 Subject: btrfs: mirror_num should be int, not u64 My previous patch introduced some u64 for failed_mirror variables, this one makes it consistent again. Signed-off-by: Jan Schmidt Signed-off-by: Chris Mason diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 48d3013..94abc25 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -620,7 +620,7 @@ out: static int btree_io_failed_hook(struct bio *failed_bio, struct page *page, u64 start, u64 end, - u64 mirror_num, struct extent_state *state) + int mirror_num, struct extent_state *state) { struct extent_io_tree *tree; unsigned long len; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 47fdba7..a877b8b 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2285,8 +2285,8 @@ static void end_bio_extent_readpage(struct bio *bio, int err) clean_io_failure(start, page); } if (!uptodate) { - u64 failed_mirror; - failed_mirror = (unsigned long)bio->bi_bdev; + int failed_mirror; + failed_mirror = (int)(unsigned long)bio->bi_bdev; if (tree->ops && tree->ops->readpage_io_failed_hook) ret = tree->ops->readpage_io_failed_hook( bio, page, start, end, diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index feb9be0..7604c30 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -70,7 +70,7 @@ struct extent_io_ops { unsigned long bio_flags); int (*readpage_io_hook)(struct page *page, u64 start, u64 end); int (*readpage_io_failed_hook)(struct bio *bio, struct page *page, - u64 start, u64 end, u64 failed_mirror, + u64 start, u64 end, int failed_mirror, struct extent_state *state); int (*writepage_io_failed_hook)(struct bio *bio, struct page *page, u64 start, u64 end, -- cgit v0.10.2 From 0f0fbf1d0e188d129756e9508090af4bdbfde00b Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Sun, 20 Nov 2011 07:33:38 -0500 Subject: Btrfs: fix to search one more bitmap for cluster setup Suppose there are two bitmaps [0, 256], [256, 512] and one extent [100, 120] in the free space cache, and we want to setup a cluster with offset=100, bytes=50. In this case, there will be only one bitmap [256, 512] in the temporary bitmaps list, and then setup_cluster_bitmap() won't search bitmap [0, 256]. The cause is, the list is constructed in setup_cluster_no_bitmap(), and only bitmaps with bitmap_entry->offset >= offset will be added into the list, and the very bitmap that convers offset has bitmap_entry->offset <= offset. Signed-off-by: Li Zefan Signed-off-by: Chris Mason diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 181760f..8f792f4 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -2453,11 +2453,23 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group, struct btrfs_free_space *entry; struct rb_node *node; int ret = -ENOSPC; + u64 bitmap_offset = offset_to_bitmap(ctl, offset); if (ctl->total_bitmaps == 0) return -ENOSPC; /* + * The bitmap that covers offset won't be in the list unless offset + * is just its start offset. + */ + entry = list_first_entry(bitmaps, struct btrfs_free_space, list); + if (entry->offset != bitmap_offset) { + entry = tree_search_offset(ctl, bitmap_offset, 1, 0); + if (entry && list_empty(&entry->list)) + list_add(&entry->list, bitmaps); + } + + /* * First check our cached list of bitmaps and see if there is an entry * here that will work. */ -- cgit v0.10.2 From 52621cb6ed0e0e14358bb317bda7cd5fbd5c2a27 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Sun, 20 Nov 2011 07:33:38 -0500 Subject: Btrfs: avoid unnecessary bitmap search for cluster setup setup_cluster_no_bitmap() searches all the extents and bitmaps starting from offset. Therefore if it returns -ENOSPC, all the bitmaps starting from offset are in the bitmaps list, so it's sufficient to search from this list in setup_cluser_bitmap(). Signed-off-by: Li Zefan Signed-off-by: Chris Mason diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 8f792f4..8c32434 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -2451,7 +2451,6 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group, { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct btrfs_free_space *entry; - struct rb_node *node; int ret = -ENOSPC; u64 bitmap_offset = offset_to_bitmap(ctl, offset); @@ -2469,10 +2468,6 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group, list_add(&entry->list, bitmaps); } - /* - * First check our cached list of bitmaps and see if there is an entry - * here that will work. - */ list_for_each_entry(entry, bitmaps, list) { if (entry->bytes < min_bytes) continue; @@ -2483,38 +2478,10 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group, } /* - * If we do have entries on our list and we are here then we didn't find - * anything, so go ahead and get the next entry after the last entry in - * this list and start the search from there. + * The bitmaps list has all the bitmaps that record free space + * starting after offset, so no more search is required. */ - if (!list_empty(bitmaps)) { - entry = list_entry(bitmaps->prev, struct btrfs_free_space, - list); - node = rb_next(&entry->offset_index); - if (!node) - return -ENOSPC; - entry = rb_entry(node, struct btrfs_free_space, offset_index); - goto search; - } - - entry = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 0, 1); - if (!entry) - return -ENOSPC; - -search: - node = &entry->offset_index; - do { - entry = rb_entry(node, struct btrfs_free_space, offset_index); - node = rb_next(&entry->offset_index); - if (!entry->bitmap) - continue; - if (entry->bytes < min_bytes) - continue; - ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset, - bytes, min_bytes); - } while (ret && node); - - return ret; + return -ENOSPC; } /* @@ -2532,8 +2499,8 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, u64 offset, u64 bytes, u64 empty_size) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; - struct list_head bitmaps; struct btrfs_free_space *entry, *tmp; + LIST_HEAD(bitmaps); u64 min_bytes; int ret; @@ -2572,7 +2539,6 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, goto out; } - INIT_LIST_HEAD(&bitmaps); ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset, bytes, min_bytes); if (ret) -- cgit v0.10.2 From fadc0d8be4dfca80f6c568bc5874931893c6709b Mon Sep 17 00:00:00 2001 From: David Sterba Date: Sun, 20 Nov 2011 07:33:38 -0500 Subject: btrfs: fix stat blocks accounting Round inode bytes and delalloc bytes up to real blocksize before converting to sector size. Otherwise eg. files smaller than 512 are reported with zero blocks due to incorrect rounding. Signed-off-by: David Sterba Signed-off-by: Chris Mason diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index e16215f..8ad26b1 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6794,11 +6794,13 @@ static int btrfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { struct inode *inode = dentry->d_inode; + u32 blocksize = inode->i_sb->s_blocksize; + generic_fillattr(inode, stat); stat->dev = BTRFS_I(inode)->root->anon_dev; stat->blksize = PAGE_CACHE_SIZE; - stat->blocks = (inode_get_bytes(inode) + - BTRFS_I(inode)->delalloc_bytes) >> 9; + stat->blocks = (ALIGN(inode_get_bytes(inode), blocksize) + + ALIGN(BTRFS_I(inode)->delalloc_bytes, blocksize)) >> 9; return 0; } -- cgit v0.10.2 From 5bb1468238e20b15921909e9f9601e945f03bac7 Mon Sep 17 00:00:00 2001 From: Arnd Hannemann Date: Sun, 20 Nov 2011 07:33:38 -0500 Subject: Btrfs: prefix resize related printks with btrfs: For the user it is confusing to find something like: [10197.627710] new size for /dev/mapper/vg0-usr_share is 3221225472 in kernel log, because it doesn't point directly to btrfs. This patch prefixes those messages with "btrfs:" like other btrfs related printks. Signed-off-by: Arnd Hannemann Signed-off-by: Chris Mason diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index bda53fc..a90e749 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1216,12 +1216,12 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, *devstr = '\0'; devstr = vol_args->name; devid = simple_strtoull(devstr, &end, 10); - printk(KERN_INFO "resizing devid %llu\n", + printk(KERN_INFO "btrfs: resizing devid %llu\n", (unsigned long long)devid); } device = btrfs_find_device(root, devid, NULL, NULL); if (!device) { - printk(KERN_INFO "resizer unable to find device %llu\n", + printk(KERN_INFO "btrfs: resizer unable to find device %llu\n", (unsigned long long)devid); ret = -EINVAL; goto out_unlock; @@ -1267,7 +1267,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, do_div(new_size, root->sectorsize); new_size *= root->sectorsize; - printk(KERN_INFO "new size for %s is %llu\n", + printk(KERN_INFO "btrfs: new size for %s is %llu\n", device->name, (unsigned long long)new_size); if (new_size > old_size) { -- cgit v0.10.2 From 291c7d2f577428f896daa5002e784959328a80aa Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 14 Nov 2011 13:52:14 -0500 Subject: Btrfs: wait on caching if we're loading the free space cache We've been hitting panics when running xfstest 13 in a loop for long periods of time. And actually this problem has always existed so we've been hitting these things randomly for a while. Basically what happens is we get a thread coming into the allocator and reading the space cache off of disk and adding the entries to the free space cache as we go. Then we get another thread that comes in and tries to allocate from that block group. Since block_group->cached != BTRFS_CACHE_NO it goes ahead and tries to do the allocation. We do this because if we're doing the old slow way of caching we don't want to hold people up and wait for everything to finish. The problem with this is we could end up discarding the space cache at some arbitrary point in the future, which means we could very well end up allocating space that is either bad, or when the real caching happens it could end up thinking the space isn't in use when it really is and cause all sorts of other problems. The solution is to add a new flag to indicate we are loading the free space cache from disk, and always try to cache the block group if cache->cached != BTRFS_CACHE_FINISHED. That way if we are loading the space cache anybody else who tries to allocate from the block group will have to wait until it's finished to make sure it completes successfully. Thanks, Signed-off-by: Josef Bacik diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index b1cb3c0..04a5dfc 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -848,7 +848,8 @@ struct btrfs_free_cluster { enum btrfs_caching_type { BTRFS_CACHE_NO = 0, BTRFS_CACHE_STARTED = 1, - BTRFS_CACHE_FINISHED = 2, + BTRFS_CACHE_FAST = 2, + BTRFS_CACHE_FINISHED = 3, }; enum btrfs_disk_cache_state { diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 0f47b3e..5d86877 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -467,13 +467,59 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, struct btrfs_root *root, int load_cache_only) { + DEFINE_WAIT(wait); struct btrfs_fs_info *fs_info = cache->fs_info; struct btrfs_caching_control *caching_ctl; int ret = 0; - smp_mb(); - if (cache->cached != BTRFS_CACHE_NO) + caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS); + BUG_ON(!caching_ctl); + + INIT_LIST_HEAD(&caching_ctl->list); + mutex_init(&caching_ctl->mutex); + init_waitqueue_head(&caching_ctl->wait); + caching_ctl->block_group = cache; + caching_ctl->progress = cache->key.objectid; + atomic_set(&caching_ctl->count, 1); + caching_ctl->work.func = caching_thread; + + spin_lock(&cache->lock); + /* + * This should be a rare occasion, but this could happen I think in the + * case where one thread starts to load the space cache info, and then + * some other thread starts a transaction commit which tries to do an + * allocation while the other thread is still loading the space cache + * info. The previous loop should have kept us from choosing this block + * group, but if we've moved to the state where we will wait on caching + * block groups we need to first check if we're doing a fast load here, + * so we can wait for it to finish, otherwise we could end up allocating + * from a block group who's cache gets evicted for one reason or + * another. + */ + while (cache->cached == BTRFS_CACHE_FAST) { + struct btrfs_caching_control *ctl; + + ctl = cache->caching_ctl; + atomic_inc(&ctl->count); + prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE); + spin_unlock(&cache->lock); + + schedule(); + + finish_wait(&ctl->wait, &wait); + put_caching_control(ctl); + spin_lock(&cache->lock); + } + + if (cache->cached != BTRFS_CACHE_NO) { + spin_unlock(&cache->lock); + kfree(caching_ctl); return 0; + } + WARN_ON(cache->caching_ctl); + cache->caching_ctl = caching_ctl; + cache->cached = BTRFS_CACHE_FAST; + spin_unlock(&cache->lock); /* * We can't do the read from on-disk cache during a commit since we need @@ -484,56 +530,51 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, if (trans && (!trans->transaction->in_commit) && (root && root != root->fs_info->tree_root) && btrfs_test_opt(root, SPACE_CACHE)) { - spin_lock(&cache->lock); - if (cache->cached != BTRFS_CACHE_NO) { - spin_unlock(&cache->lock); - return 0; - } - cache->cached = BTRFS_CACHE_STARTED; - spin_unlock(&cache->lock); - ret = load_free_space_cache(fs_info, cache); spin_lock(&cache->lock); if (ret == 1) { + cache->caching_ctl = NULL; cache->cached = BTRFS_CACHE_FINISHED; cache->last_byte_to_unpin = (u64)-1; } else { - cache->cached = BTRFS_CACHE_NO; + if (load_cache_only) { + cache->caching_ctl = NULL; + cache->cached = BTRFS_CACHE_NO; + } else { + cache->cached = BTRFS_CACHE_STARTED; + } } spin_unlock(&cache->lock); + wake_up(&caching_ctl->wait); if (ret == 1) { + put_caching_control(caching_ctl); free_excluded_extents(fs_info->extent_root, cache); return 0; } + } else { + /* + * We are not going to do the fast caching, set cached to the + * appropriate value and wakeup any waiters. + */ + spin_lock(&cache->lock); + if (load_cache_only) { + cache->caching_ctl = NULL; + cache->cached = BTRFS_CACHE_NO; + } else { + cache->cached = BTRFS_CACHE_STARTED; + } + spin_unlock(&cache->lock); + wake_up(&caching_ctl->wait); } - if (load_cache_only) - return 0; - - caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS); - BUG_ON(!caching_ctl); - - INIT_LIST_HEAD(&caching_ctl->list); - mutex_init(&caching_ctl->mutex); - init_waitqueue_head(&caching_ctl->wait); - caching_ctl->block_group = cache; - caching_ctl->progress = cache->key.objectid; - /* one for caching kthread, one for caching block group list */ - atomic_set(&caching_ctl->count, 2); - caching_ctl->work.func = caching_thread; - - spin_lock(&cache->lock); - if (cache->cached != BTRFS_CACHE_NO) { - spin_unlock(&cache->lock); - kfree(caching_ctl); + if (load_cache_only) { + put_caching_control(caching_ctl); return 0; } - cache->caching_ctl = caching_ctl; - cache->cached = BTRFS_CACHE_STARTED; - spin_unlock(&cache->lock); down_write(&fs_info->extent_commit_sem); + atomic_inc(&caching_ctl->count); list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups); up_write(&fs_info->extent_commit_sem); @@ -5177,13 +5218,15 @@ search: } have_block_group: - if (unlikely(block_group->cached == BTRFS_CACHE_NO)) { + cached = block_group_cache_done(block_group); + if (unlikely(!cached)) { u64 free_percent; + found_uncached_bg = true; ret = cache_block_group(block_group, trans, orig_root, 1); if (block_group->cached == BTRFS_CACHE_FINISHED) - goto have_block_group; + goto alloc; free_percent = btrfs_block_group_used(&block_group->item); free_percent *= 100; @@ -5205,7 +5248,6 @@ have_block_group: orig_root, 0); BUG_ON(ret); } - found_uncached_bg = true; /* * If loop is set for cached only, try the next block @@ -5215,10 +5257,7 @@ have_block_group: goto loop; } - cached = block_group_cache_done(block_group); - if (unlikely(!cached)) - found_uncached_bg = true; - +alloc: if (unlikely(block_group->ro)) goto loop; -- cgit v0.10.2 From f7d61dcd6873c49bcc42be2caa2af1c2511aa915 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 15 Nov 2011 09:31:24 -0500 Subject: Btrfs: clear pages dirty for io and set them extent mapped When doing the io_ctl helpers to clean up the free space cache stuff I stopped using our normal prepare_pages stuff, which means I of course forgot to do things like set the pages extent mapped, which will cause us all sorts of wonderful propblems. Thanks, Signed-off-by: Josef Bacik diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 8c32434..aedacdb 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -351,6 +351,11 @@ static int io_ctl_prepare_pages(struct io_ctl *io_ctl, struct inode *inode, } } + for (i = 0; i < io_ctl->num_pages; i++) { + clear_page_dirty_for_io(io_ctl->pages[i]); + set_page_extent_mapped(io_ctl->pages[i]); + } + return 0; } -- cgit v0.10.2 From 4d479cf010d56ec9c54f3099992d039918f1296b Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 17 Nov 2011 11:34:31 -0500 Subject: Btrfs: sectorsize align offsets in fiemap We've been hitting BUG()'s in btrfs_cont_expand and btrfs_fallocate and anywhere else that calls btrfs_get_extent while running xfstests 13 in a loop. This is because fiemap is calling btrfs_get_extent with non-sectorsize aligned offsets, which will end up adding mappings that are not sectorsize aligned, which will cause problems in some cases for subsequent calls to btrfs_get_extent for similar areas that are sectorsize aligned. With this patch I ran xfstests 13 in a loop for a couple of hours and didn't hit the problem that I could previously hit in at most 20 minutes. Thanks, Signed-off-by: Josef Bacik diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index a877b8b..9472d3d 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3366,6 +3366,9 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, return -ENOMEM; path->leave_spinning = 1; + start = ALIGN(start, BTRFS_I(inode)->root->sectorsize); + len = ALIGN(len, BTRFS_I(inode)->root->sectorsize); + /* * lookup the last file extent. We're not using i_size here * because there might be preallocation past i_size @@ -3413,7 +3416,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0, &cached_state, GFP_NOFS); - em = get_extent_skip_holes(inode, off, last_for_get_extent, + em = get_extent_skip_holes(inode, start, last_for_get_extent, get_extent); if (!em) goto out; -- cgit v0.10.2 From 24a70313969fc3fc440216b40babdb42564acff3 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 21 Nov 2011 09:39:11 -0500 Subject: Btrfs: remove free-space-cache.c WARN during log replay The log replay code only partially loads block groups, since the block group caching code is able to detect and deal with extents the logging code has pinned down. While the logging code is pinning down block groups, there is a bogus WARN_ON we're hitting if the code wasn't able to find an extent in the cache. This commit removes the warning because it can happen any time there isn't a valid free space cache for that block group. Signed-off-by: Chris Mason diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index aedacdb..6e5b7e4 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1849,7 +1849,13 @@ again: info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 1, 0); if (!info) { - WARN_ON(1); + /* the tree logging code might be calling us before we + * have fully loaded the free space rbtree for this + * block group. So it is possible the entry won't + * be in the rbtree yet at all. The caching code + * will make sure not to put it in the rbtree if + * the logging code has pinned it. + */ goto out_lock; } } -- cgit v0.10.2