summaryrefslogtreecommitdiff
path: root/fs/btrfs
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs')
-rw-r--r--fs/btrfs/async-thread.c11
-rw-r--r--fs/btrfs/async-thread.h1
-rw-r--r--fs/btrfs/backref.c123
-rw-r--r--fs/btrfs/backref.h3
-rw-r--r--fs/btrfs/btrfs_inode.h40
-rw-r--r--fs/btrfs/check-integrity.c18
-rw-r--r--fs/btrfs/compression.c21
-rw-r--r--fs/btrfs/ctree.c106
-rw-r--r--fs/btrfs/ctree.h93
-rw-r--r--fs/btrfs/delayed-inode.c8
-rw-r--r--fs/btrfs/dev-replace.c82
-rw-r--r--fs/btrfs/dir-item.c12
-rw-r--r--fs/btrfs/disk-io.c292
-rw-r--r--fs/btrfs/disk-io.h16
-rw-r--r--fs/btrfs/export.c4
-rw-r--r--fs/btrfs/extent-tree.c267
-rw-r--r--fs/btrfs/extent_io.c483
-rw-r--r--fs/btrfs/extent_io.h60
-rw-r--r--fs/btrfs/file-item.c30
-rw-r--r--fs/btrfs/file.c151
-rw-r--r--fs/btrfs/free-space-cache.c157
-rw-r--r--fs/btrfs/hash.c20
-rw-r--r--fs/btrfs/inode-item.c12
-rw-r--r--fs/btrfs/inode-map.c68
-rw-r--r--fs/btrfs/inode.c648
-rw-r--r--fs/btrfs/ioctl.c65
-rw-r--r--fs/btrfs/lzo.c3
-rw-r--r--fs/btrfs/orphan.c4
-rw-r--r--fs/btrfs/print-tree.c3
-rw-r--r--fs/btrfs/qgroup.c30
-rw-r--r--fs/btrfs/raid56.c8
-rw-r--r--fs/btrfs/reada.c2
-rw-r--r--fs/btrfs/relocation.c142
-rw-r--r--fs/btrfs/scrub.c67
-rw-r--r--fs/btrfs/send.c47
-rw-r--r--fs/btrfs/super.c137
-rw-r--r--fs/btrfs/sysfs.c41
-rw-r--r--fs/btrfs/sysfs.h16
-rw-r--r--fs/btrfs/tests/free-space-tests.c516
-rw-r--r--fs/btrfs/transaction.c52
-rw-r--r--fs/btrfs/transaction.h2
-rw-r--r--fs/btrfs/tree-log.c276
-rw-r--r--fs/btrfs/tree-log.h2
-rw-r--r--fs/btrfs/uuid-tree.c1
-rw-r--r--fs/btrfs/volumes.c689
-rw-r--r--fs/btrfs/volumes.h166
-rw-r--r--fs/btrfs/xattr.c4
-rw-r--r--fs/btrfs/zlib.c141
48 files changed, 3582 insertions, 1558 deletions
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index fbd76de..4dabeb8 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -74,6 +74,7 @@ BTRFS_WORK_HELPER(endio_helper);
BTRFS_WORK_HELPER(endio_meta_helper);
BTRFS_WORK_HELPER(endio_meta_write_helper);
BTRFS_WORK_HELPER(endio_raid56_helper);
+BTRFS_WORK_HELPER(endio_repair_helper);
BTRFS_WORK_HELPER(rmw_helper);
BTRFS_WORK_HELPER(endio_write_helper);
BTRFS_WORK_HELPER(freespace_write_helper);
@@ -91,7 +92,7 @@ __btrfs_alloc_workqueue(const char *name, int flags, int max_active,
{
struct __btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS);
- if (unlikely(!ret))
+ if (!ret)
return NULL;
ret->max_active = max_active;
@@ -115,7 +116,7 @@ __btrfs_alloc_workqueue(const char *name, int flags, int max_active,
ret->normal_wq = alloc_workqueue("%s-%s", flags,
ret->max_active, "btrfs",
name);
- if (unlikely(!ret->normal_wq)) {
+ if (!ret->normal_wq) {
kfree(ret);
return NULL;
}
@@ -137,12 +138,12 @@ struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name,
{
struct btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS);
- if (unlikely(!ret))
+ if (!ret)
return NULL;
ret->normal = __btrfs_alloc_workqueue(name, flags & ~WQ_HIGHPRI,
max_active, thresh);
- if (unlikely(!ret->normal)) {
+ if (!ret->normal) {
kfree(ret);
return NULL;
}
@@ -150,7 +151,7 @@ struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name,
if (flags & WQ_HIGHPRI) {
ret->high = __btrfs_alloc_workqueue(name, flags, max_active,
thresh);
- if (unlikely(!ret->high)) {
+ if (!ret->high) {
__btrfs_destroy_workqueue(ret->normal);
kfree(ret);
return NULL;
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index e9e31c9..e386c29 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -53,6 +53,7 @@ BTRFS_WORK_HELPER_PROTO(endio_helper);
BTRFS_WORK_HELPER_PROTO(endio_meta_helper);
BTRFS_WORK_HELPER_PROTO(endio_meta_write_helper);
BTRFS_WORK_HELPER_PROTO(endio_raid56_helper);
+BTRFS_WORK_HELPER_PROTO(endio_repair_helper);
BTRFS_WORK_HELPER_PROTO(rmw_helper);
BTRFS_WORK_HELPER_PROTO(endio_write_helper);
BTRFS_WORK_HELPER_PROTO(freespace_write_helper);
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 54a201d..2d3e32e 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -25,6 +25,9 @@
#include "delayed-ref.h"
#include "locking.h"
+/* Just an arbitrary number so we can be sure this happened */
+#define BACKREF_FOUND_SHARED 6
+
struct extent_inode_elem {
u64 inum;
u64 offset;
@@ -377,7 +380,8 @@ out:
static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
struct btrfs_path *path, u64 time_seq,
struct list_head *head,
- const u64 *extent_item_pos, u64 total_refs)
+ const u64 *extent_item_pos, u64 total_refs,
+ u64 root_objectid)
{
int err;
int ret = 0;
@@ -402,6 +406,10 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
continue;
if (ref->count == 0)
continue;
+ if (root_objectid && ref->root_id != root_objectid) {
+ ret = BACKREF_FOUND_SHARED;
+ goto out;
+ }
err = __resolve_indirect_ref(fs_info, path, time_seq, ref,
parents, extent_item_pos,
total_refs);
@@ -482,7 +490,7 @@ static int __add_missing_keys(struct btrfs_fs_info *fs_info,
continue;
BUG_ON(!ref->wanted_disk_byte);
eb = read_tree_block(fs_info->tree_root, ref->wanted_disk_byte,
- fs_info->tree_root->leafsize, 0);
+ 0);
if (!eb || !extent_buffer_uptodate(eb)) {
free_extent_buffer(eb);
return -EIO;
@@ -561,7 +569,8 @@ static void __merge_refs(struct list_head *head, int mode)
* smaller or equal that seq to the list
*/
static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
- struct list_head *prefs, u64 *total_refs)
+ struct list_head *prefs, u64 *total_refs,
+ u64 inum)
{
struct btrfs_delayed_extent_op *extent_op = head->extent_op;
struct rb_node *n = &head->node.rb_node;
@@ -625,6 +634,16 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
key.objectid = ref->objectid;
key.type = BTRFS_EXTENT_DATA_KEY;
key.offset = ref->offset;
+
+ /*
+ * Found a inum that doesn't match our known inum, we
+ * know it's shared.
+ */
+ if (inum && ref->objectid != inum) {
+ ret = BACKREF_FOUND_SHARED;
+ break;
+ }
+
ret = __add_prelim_ref(prefs, ref->root, &key, 0, 0,
node->bytenr,
node->ref_mod * sgn, GFP_ATOMIC);
@@ -659,7 +678,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
static int __add_inline_refs(struct btrfs_fs_info *fs_info,
struct btrfs_path *path, u64 bytenr,
int *info_level, struct list_head *prefs,
- u64 *total_refs)
+ u64 *total_refs, u64 inum)
{
int ret = 0;
int slot;
@@ -744,6 +763,12 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
dref);
key.type = BTRFS_EXTENT_DATA_KEY;
key.offset = btrfs_extent_data_ref_offset(leaf, dref);
+
+ if (inum && key.objectid != inum) {
+ ret = BACKREF_FOUND_SHARED;
+ break;
+ }
+
root = btrfs_extent_data_ref_root(leaf, dref);
ret = __add_prelim_ref(prefs, root, &key, 0, 0,
bytenr, count, GFP_NOFS);
@@ -765,7 +790,7 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
*/
static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
struct btrfs_path *path, u64 bytenr,
- int info_level, struct list_head *prefs)
+ int info_level, struct list_head *prefs, u64 inum)
{
struct btrfs_root *extent_root = fs_info->extent_root;
int ret;
@@ -827,6 +852,12 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
dref);
key.type = BTRFS_EXTENT_DATA_KEY;
key.offset = btrfs_extent_data_ref_offset(leaf, dref);
+
+ if (inum && key.objectid != inum) {
+ ret = BACKREF_FOUND_SHARED;
+ break;
+ }
+
root = btrfs_extent_data_ref_root(leaf, dref);
ret = __add_prelim_ref(prefs, root, &key, 0, 0,
bytenr, count, GFP_NOFS);
@@ -854,7 +885,8 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
static int find_parent_nodes(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 bytenr,
u64 time_seq, struct ulist *refs,
- struct ulist *roots, const u64 *extent_item_pos)
+ struct ulist *roots, const u64 *extent_item_pos,
+ u64 root_objectid, u64 inum)
{
struct btrfs_key key;
struct btrfs_path *path;
@@ -929,7 +961,8 @@ again:
}
spin_unlock(&delayed_refs->lock);
ret = __add_delayed_refs(head, time_seq,
- &prefs_delayed, &total_refs);
+ &prefs_delayed, &total_refs,
+ inum);
mutex_unlock(&head->mutex);
if (ret)
goto out;
@@ -951,11 +984,11 @@ again:
key.type == BTRFS_METADATA_ITEM_KEY)) {
ret = __add_inline_refs(fs_info, path, bytenr,
&info_level, &prefs,
- &total_refs);
+ &total_refs, inum);
if (ret)
goto out;
ret = __add_keyed_refs(fs_info, path, bytenr,
- info_level, &prefs);
+ info_level, &prefs, inum);
if (ret)
goto out;
}
@@ -971,7 +1004,8 @@ again:
__merge_refs(&prefs, 1);
ret = __resolve_indirect_refs(fs_info, path, time_seq, &prefs,
- extent_item_pos, total_refs);
+ extent_item_pos, total_refs,
+ root_objectid);
if (ret)
goto out;
@@ -981,6 +1015,11 @@ again:
ref = list_first_entry(&prefs, struct __prelim_ref, list);
WARN_ON(ref->count < 0);
if (roots && ref->count && ref->root_id && ref->parent == 0) {
+ if (root_objectid && ref->root_id != root_objectid) {
+ ret = BACKREF_FOUND_SHARED;
+ goto out;
+ }
+
/* no parent == root of tree */
ret = ulist_add(roots, ref->root_id, 0, GFP_NOFS);
if (ret < 0)
@@ -989,12 +1028,10 @@ again:
if (ref->count && ref->parent) {
if (extent_item_pos && !ref->inode_list &&
ref->level == 0) {
- u32 bsz;
struct extent_buffer *eb;
- bsz = btrfs_level_size(fs_info->extent_root,
- ref->level);
+
eb = read_tree_block(fs_info->extent_root,
- ref->parent, bsz, 0);
+ ref->parent, 0);
if (!eb || !extent_buffer_uptodate(eb)) {
free_extent_buffer(eb);
ret = -EIO;
@@ -1087,7 +1124,7 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
return -ENOMEM;
ret = find_parent_nodes(trans, fs_info, bytenr,
- time_seq, *leafs, NULL, extent_item_pos);
+ time_seq, *leafs, NULL, extent_item_pos, 0, 0);
if (ret < 0 && ret != -ENOENT) {
free_leaf_list(*leafs);
return ret;
@@ -1130,7 +1167,7 @@ static int __btrfs_find_all_roots(struct btrfs_trans_handle *trans,
ULIST_ITER_INIT(&uiter);
while (1) {
ret = find_parent_nodes(trans, fs_info, bytenr,
- time_seq, tmp, *roots, NULL);
+ time_seq, tmp, *roots, NULL, 0, 0);
if (ret < 0 && ret != -ENOENT) {
ulist_free(tmp);
ulist_free(*roots);
@@ -1161,6 +1198,54 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
return ret;
}
+int btrfs_check_shared(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 root_objectid,
+ u64 inum, u64 bytenr)
+{
+ struct ulist *tmp = NULL;
+ struct ulist *roots = NULL;
+ struct ulist_iterator uiter;
+ struct ulist_node *node;
+ struct seq_list elem = {};
+ int ret = 0;
+
+ tmp = ulist_alloc(GFP_NOFS);
+ roots = ulist_alloc(GFP_NOFS);
+ if (!tmp || !roots) {
+ ulist_free(tmp);
+ ulist_free(roots);
+ return -ENOMEM;
+ }
+
+ if (trans)
+ btrfs_get_tree_mod_seq(fs_info, &elem);
+ else
+ down_read(&fs_info->commit_root_sem);
+ ULIST_ITER_INIT(&uiter);
+ while (1) {
+ ret = find_parent_nodes(trans, fs_info, bytenr, elem.seq, tmp,
+ roots, NULL, root_objectid, inum);
+ if (ret == BACKREF_FOUND_SHARED) {
+ ret = 1;
+ break;
+ }
+ if (ret < 0 && ret != -ENOENT)
+ break;
+ node = ulist_next(tmp, &uiter);
+ if (!node)
+ break;
+ bytenr = node->val;
+ cond_resched();
+ }
+ if (trans)
+ btrfs_put_tree_mod_seq(fs_info, &elem);
+ else
+ up_read(&fs_info->commit_root_sem);
+ ulist_free(tmp);
+ ulist_free(roots);
+ return ret;
+}
+
/*
* this makes the path point to (inum INODE_ITEM ioff)
*/
@@ -1193,7 +1278,7 @@ int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid,
unsigned long ptr;
key.objectid = inode_objectid;
- btrfs_set_key_type(&key, BTRFS_INODE_EXTREF_KEY);
+ key.type = BTRFS_INODE_EXTREF_KEY;
key.offset = start_off;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
@@ -1233,7 +1318,7 @@ int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid,
ret = -ENOENT;
if (found_key.objectid != inode_objectid)
break;
- if (btrfs_key_type(&found_key) != BTRFS_INODE_EXTREF_KEY)
+ if (found_key.type != BTRFS_INODE_EXTREF_KEY)
break;
ret = 0;
@@ -1366,7 +1451,7 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
}
btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]);
if (found_key->type == BTRFS_METADATA_ITEM_KEY)
- size = fs_info->extent_root->leafsize;
+ size = fs_info->extent_root->nodesize;
else if (found_key->type == BTRFS_EXTENT_ITEM_KEY)
size = found_key->offset;
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index 86fc20f..2a1ac6b 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -71,6 +71,9 @@ int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid,
u64 start_off, struct btrfs_path *path,
struct btrfs_inode_extref **ret_extref,
u64 *found_off);
+int btrfs_check_shared(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 root_objectid,
+ u64 inum, u64 bytenr);
int __init btrfs_prelim_ref_init(void);
void btrfs_prelim_ref_exit(void);
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 43527fd..4aadadc 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -44,6 +44,17 @@
#define BTRFS_INODE_IN_DELALLOC_LIST 9
#define BTRFS_INODE_READDIO_NEED_LOCK 10
#define BTRFS_INODE_HAS_PROPS 11
+/*
+ * The following 3 bits are meant only for the btree inode.
+ * When any of them is set, it means an error happened while writing an
+ * extent buffer belonging to:
+ * 1) a non-log btree
+ * 2) a log btree and first log sub-transaction
+ * 3) a log btree and second log sub-transaction
+ */
+#define BTRFS_INODE_BTREE_ERR 12
+#define BTRFS_INODE_BTREE_LOG1_ERR 13
+#define BTRFS_INODE_BTREE_LOG2_ERR 14
/* in memory btrfs inode */
struct btrfs_inode {
@@ -121,6 +132,12 @@ struct btrfs_inode {
u64 delalloc_bytes;
/*
+ * total number of bytes pending defrag, used by stat to check whether
+ * it needs COW.
+ */
+ u64 defrag_bytes;
+
+ /*
* the size of the file stored in the metadata on disk. data=ordered
* means the in-memory i_size might be larger than the size on disk
* because not all the blocks are written yet.
@@ -234,13 +251,25 @@ static inline int btrfs_inode_in_log(struct inode *inode, u64 generation)
BTRFS_I(inode)->last_sub_trans <=
BTRFS_I(inode)->last_log_commit &&
BTRFS_I(inode)->last_sub_trans <=
- BTRFS_I(inode)->root->last_log_commit)
- return 1;
+ BTRFS_I(inode)->root->last_log_commit) {
+ /*
+ * After a ranged fsync we might have left some extent maps
+ * (that fall outside the fsync's range). So return false
+ * here if the list isn't empty, to make sure btrfs_log_inode()
+ * will be called and process those extent maps.
+ */
+ smp_mb();
+ if (list_empty(&BTRFS_I(inode)->extent_tree.modified_extents))
+ return 1;
+ }
return 0;
}
+#define BTRFS_DIO_ORIG_BIO_SUBMITTED 0x1
+
struct btrfs_dio_private {
struct inode *inode;
+ unsigned long flags;
u64 logical_offset;
u64 disk_bytenr;
u64 bytes;
@@ -257,7 +286,12 @@ struct btrfs_dio_private {
/* dio_bio came from fs/direct-io.c */
struct bio *dio_bio;
- u8 csum[0];
+
+ /*
+ * The original bio may be splited to several sub-bios, this is
+ * done during endio of sub-bios
+ */
+ int (*subio_endio)(struct inode *, struct btrfs_io_bio *, int);
};
/*
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index ce92ae3..cb7f3fe 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -807,7 +807,7 @@ static int btrfsic_process_superblock_dev_mirror(
/* super block bytenr is always the unmapped device bytenr */
dev_bytenr = btrfs_sb_offset(superblock_mirror_num);
- if (dev_bytenr + BTRFS_SUPER_INFO_SIZE > device->total_bytes)
+ if (dev_bytenr + BTRFS_SUPER_INFO_SIZE > device->commit_total_bytes)
return -1;
bh = __bread(superblock_bdev, dev_bytenr / 4096,
BTRFS_SUPER_INFO_SIZE);
@@ -820,7 +820,6 @@ static int btrfsic_process_superblock_dev_mirror(
btrfs_super_magic(super_tmp) != BTRFS_MAGIC ||
memcmp(device->uuid, super_tmp->dev_item.uuid, BTRFS_UUID_SIZE) ||
btrfs_super_nodesize(super_tmp) != state->metablock_size ||
- btrfs_super_leafsize(super_tmp) != state->metablock_size ||
btrfs_super_sectorsize(super_tmp) != state->datablock_size) {
brelse(bh);
return 0;
@@ -1252,8 +1251,7 @@ static void btrfsic_read_from_block_data(
while (len > 0) {
cur = min(len, ((size_t)PAGE_CACHE_SIZE - offset_in_page));
- BUG_ON(i >= (block_ctx->len + PAGE_CACHE_SIZE - 1) >>
- PAGE_CACHE_SHIFT);
+ BUG_ON(i >= DIV_ROUND_UP(block_ctx->len, PAGE_CACHE_SIZE));
kaddr = block_ctx->datav[i];
memcpy(dst, kaddr + offset_in_page, cur);
@@ -3120,24 +3118,12 @@ int btrfsic_mount(struct btrfs_root *root,
struct list_head *dev_head = &fs_devices->devices;
struct btrfs_device *device;
- if (root->nodesize != root->leafsize) {
- printk(KERN_INFO
- "btrfsic: cannot handle nodesize %d != leafsize %d!\n",
- root->nodesize, root->leafsize);
- return -1;
- }
if (root->nodesize & ((u64)PAGE_CACHE_SIZE - 1)) {
printk(KERN_INFO
"btrfsic: cannot handle nodesize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n",
root->nodesize, PAGE_CACHE_SIZE);
return -1;
}
- if (root->leafsize & ((u64)PAGE_CACHE_SIZE - 1)) {
- printk(KERN_INFO
- "btrfsic: cannot handle leafsize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n",
- root->leafsize, PAGE_CACHE_SIZE);
- return -1;
- }
if (root->sectorsize & ((u64)PAGE_CACHE_SIZE - 1)) {
printk(KERN_INFO
"btrfsic: cannot handle sectorsize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n",
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 1daea0b..d3220d3 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -91,8 +91,7 @@ static inline int compressed_bio_size(struct btrfs_root *root,
u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
return sizeof(struct compressed_bio) +
- ((disk_size + root->sectorsize - 1) / root->sectorsize) *
- csum_size;
+ (DIV_ROUND_UP(disk_size, root->sectorsize)) * csum_size;
}
static struct bio *compressed_bio_alloc(struct block_device *bdev,
@@ -389,7 +388,8 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
* freed before we're done setting it up
*/
atomic_inc(&cb->pending_bios);
- ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
+ ret = btrfs_bio_wq_end_io(root->fs_info, bio,
+ BTRFS_WQ_ENDIO_DATA);
BUG_ON(ret); /* -ENOMEM */
if (!skip_sum) {
@@ -420,7 +420,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
}
bio_get(bio);
- ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
+ ret = btrfs_bio_wq_end_io(root->fs_info, bio, BTRFS_WQ_ENDIO_DATA);
BUG_ON(ret); /* -ENOMEM */
if (!skip_sum) {
@@ -615,8 +615,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
cb->compress_type = extent_compress_type(bio_flags);
cb->orig_bio = bio;
- nr_pages = (compressed_len + PAGE_CACHE_SIZE - 1) /
- PAGE_CACHE_SIZE;
+ nr_pages = DIV_ROUND_UP(compressed_len, PAGE_CACHE_SIZE);
cb->compressed_pages = kzalloc(sizeof(struct page *) * nr_pages,
GFP_NOFS);
if (!cb->compressed_pages)
@@ -670,7 +669,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
PAGE_CACHE_SIZE) {
bio_get(comp_bio);
- ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0);
+ ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio,
+ BTRFS_WQ_ENDIO_DATA);
BUG_ON(ret); /* -ENOMEM */
/*
@@ -686,8 +686,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
comp_bio, sums);
BUG_ON(ret); /* -ENOMEM */
}
- sums += (comp_bio->bi_iter.bi_size +
- root->sectorsize - 1) / root->sectorsize;
+ sums += DIV_ROUND_UP(comp_bio->bi_iter.bi_size,
+ root->sectorsize);
ret = btrfs_map_bio(root, READ, comp_bio,
mirror_num, 0);
@@ -708,7 +708,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
}
bio_get(comp_bio);
- ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0);
+ ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio,
+ BTRFS_WQ_ENDIO_DATA);
BUG_ON(ret); /* -ENOMEM */
if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 44ee5d2..19bc616 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -258,9 +258,8 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
else
btrfs_node_key(buf, &disk_key, 0);
- cow = btrfs_alloc_free_block(trans, root, buf->len, 0,
- new_root_objectid, &disk_key, level,
- buf->start, 0);
+ cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
+ &disk_key, level, buf->start, 0);
if (IS_ERR(cow))
return PTR_ERR(cow);
@@ -1133,9 +1132,9 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
} else
parent_start = 0;
- cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start,
- root->root_key.objectid, &disk_key,
- level, search_start, empty_size);
+ cow = btrfs_alloc_tree_block(trans, root, parent_start,
+ root->root_key.objectid, &disk_key, level,
+ search_start, empty_size);
if (IS_ERR(cow))
return PTR_ERR(cow);
@@ -1425,7 +1424,6 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
struct tree_mod_root *old_root = NULL;
u64 old_generation = 0;
u64 logical;
- u32 blocksize;
eb_root = btrfs_read_lock_root_node(root);
tm = __tree_mod_log_oldest_root(root->fs_info, eb_root, time_seq);
@@ -1444,8 +1442,7 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
btrfs_tree_read_unlock(eb_root);
free_extent_buffer(eb_root);
- blocksize = btrfs_level_size(root, old_root->level);
- old = read_tree_block(root, logical, blocksize, 0);
+ old = read_tree_block(root, logical, 0);
if (WARN_ON(!old || !extent_buffer_uptodate(old))) {
free_extent_buffer(old);
btrfs_warn(root->fs_info,
@@ -1506,10 +1503,9 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *buf)
{
-#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
- if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state)))
+ if (btrfs_test_is_dummy_root(root))
return 0;
-#endif
+
/* ensure we can see the force_cow */
smp_rmb();
@@ -1651,7 +1647,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
WARN_ON(trans->transid != root->fs_info->generation);
parent_nritems = btrfs_header_nritems(parent);
- blocksize = btrfs_level_size(root, parent_level - 1);
+ blocksize = root->nodesize;
end_slot = parent_nritems;
if (parent_nritems == 1)
@@ -1685,15 +1681,14 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
continue;
}
- cur = btrfs_find_tree_block(root, blocknr, blocksize);
+ cur = btrfs_find_tree_block(root, blocknr);
if (cur)
uptodate = btrfs_buffer_uptodate(cur, gen, 0);
else
uptodate = 0;
if (!cur || !uptodate) {
if (!cur) {
- cur = read_tree_block(root, blocknr,
- blocksize, gen);
+ cur = read_tree_block(root, blocknr, gen);
if (!cur || !extent_buffer_uptodate(cur)) {
free_extent_buffer(cur);
return -EIO;
@@ -1872,7 +1867,6 @@ static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root,
BUG_ON(level == 0);
eb = read_tree_block(root, btrfs_node_blockptr(parent, slot),
- btrfs_level_size(root, level - 1),
btrfs_node_ptr_generation(parent, slot));
if (eb && !extent_buffer_uptodate(eb)) {
free_extent_buffer(eb);
@@ -2267,8 +2261,8 @@ static void reada_for_search(struct btrfs_root *root,
node = path->nodes[level];
search = btrfs_node_blockptr(node, slot);
- blocksize = btrfs_level_size(root, level - 1);
- eb = btrfs_find_tree_block(root, search, blocksize);
+ blocksize = root->nodesize;
+ eb = btrfs_find_tree_block(root, search);
if (eb) {
free_extent_buffer(eb);
return;
@@ -2298,7 +2292,7 @@ static void reada_for_search(struct btrfs_root *root,
if ((search <= target && target - search <= 65536) ||
(search > target && search - target <= 65536)) {
gen = btrfs_node_ptr_generation(node, nr);
- readahead_tree_block(root, search, blocksize, gen);
+ readahead_tree_block(root, search, blocksize);
nread += blocksize;
}
nscan++;
@@ -2325,12 +2319,12 @@ static noinline void reada_for_balance(struct btrfs_root *root,
nritems = btrfs_header_nritems(parent);
slot = path->slots[level + 1];
- blocksize = btrfs_level_size(root, level);
+ blocksize = root->nodesize;
if (slot > 0) {
block1 = btrfs_node_blockptr(parent, slot - 1);
gen = btrfs_node_ptr_generation(parent, slot - 1);
- eb = btrfs_find_tree_block(root, block1, blocksize);
+ eb = btrfs_find_tree_block(root, block1);
/*
* if we get -eagain from btrfs_buffer_uptodate, we
* don't want to return eagain here. That will loop
@@ -2343,16 +2337,16 @@ static noinline void reada_for_balance(struct btrfs_root *root,
if (slot + 1 < nritems) {
block2 = btrfs_node_blockptr(parent, slot + 1);
gen = btrfs_node_ptr_generation(parent, slot + 1);
- eb = btrfs_find_tree_block(root, block2, blocksize);
+ eb = btrfs_find_tree_block(root, block2);
if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
block2 = 0;
free_extent_buffer(eb);
}
if (block1)
- readahead_tree_block(root, block1, blocksize, 0);
+ readahead_tree_block(root, block1, blocksize);
if (block2)
- readahead_tree_block(root, block2, blocksize, 0);
+ readahead_tree_block(root, block2, blocksize);
}
@@ -2454,16 +2448,14 @@ read_block_for_search(struct btrfs_trans_handle *trans,
{
u64 blocknr;
u64 gen;
- u32 blocksize;
struct extent_buffer *b = *eb_ret;
struct extent_buffer *tmp;
int ret;
blocknr = btrfs_node_blockptr(b, slot);
gen = btrfs_node_ptr_generation(b, slot);
- blocksize = btrfs_level_size(root, level - 1);
- tmp = btrfs_find_tree_block(root, blocknr, blocksize);
+ tmp = btrfs_find_tree_block(root, blocknr);
if (tmp) {
/* first we do an atomic uptodate check */
if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
@@ -2507,7 +2499,7 @@ read_block_for_search(struct btrfs_trans_handle *trans,
btrfs_release_path(p);
ret = -EAGAIN;
- tmp = read_tree_block(root, blocknr, blocksize, 0);
+ tmp = read_tree_block(root, blocknr, 0);
if (tmp) {
/*
* If the read above didn't mark this buffer up to date,
@@ -2792,8 +2784,6 @@ again:
if (!should_cow_block(trans, root, b))
goto cow_done;
- btrfs_set_path_blocking(p);
-
/*
* must have write locks on this node and the
* parent
@@ -2807,6 +2797,7 @@ again:
goto again;
}
+ btrfs_set_path_blocking(p);
err = btrfs_cow_block(trans, root, b,
p->nodes[level + 1],
p->slots[level + 1], &b);
@@ -3362,9 +3353,8 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
else
btrfs_node_key(lower, &lower_key, 0);
- c = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
- root->root_key.objectid, &lower_key,
- level, root->node->start, 0);
+ c = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
+ &lower_key, level, root->node->start, 0);
if (IS_ERR(c))
return PTR_ERR(c);
@@ -3502,9 +3492,8 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
mid = (c_nritems + 1) / 2;
btrfs_node_key(c, &disk_key, mid);
- split = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
- root->root_key.objectid,
- &disk_key, level, c->start, 0);
+ split = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
+ &disk_key, level, c->start, 0);
if (IS_ERR(split))
return PTR_ERR(split);
@@ -4282,13 +4271,12 @@ again:
else
btrfs_item_key(l, &disk_key, mid);
- right = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
- root->root_key.objectid,
- &disk_key, 0, l->start, 0);
+ right = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
+ &disk_key, 0, l->start, 0);
if (IS_ERR(right))
return PTR_ERR(right);
- root_add_used(root, root->leafsize);
+ root_add_used(root, root->nodesize);
memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header));
btrfs_set_header_bytenr(right, right->start);
@@ -4626,8 +4614,7 @@ void btrfs_truncate_item(struct btrfs_root *root, struct btrfs_path *path,
ptr = btrfs_item_ptr_offset(leaf, slot);
memmove_extent_buffer(leaf, ptr,
(unsigned long)fi,
- offsetof(struct btrfs_file_extent_item,
- disk_bytenr));
+ BTRFS_FILE_EXTENT_INLINE_DATA_START);
}
}
@@ -4738,6 +4725,12 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
int slot;
struct btrfs_map_token token;
+ if (path->slots[0] == 0) {
+ btrfs_cpu_key_to_disk(&disk_key, cpu_key);
+ fixup_low_keys(root, path, &disk_key, 1);
+ }
+ btrfs_unlock_up_safe(path, 1);
+
btrfs_init_map_token(&token);
leaf = path->nodes[0];
@@ -4798,12 +4791,6 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
}
btrfs_set_header_nritems(leaf, nritems + nr);
-
- if (slot == 0) {
- btrfs_cpu_key_to_disk(&disk_key, cpu_key);
- fixup_low_keys(root, path, &disk_key, 1);
- }
- btrfs_unlock_up_safe(path, 1);
btrfs_mark_buffer_dirty(leaf);
if (btrfs_leaf_free_space(root, leaf) < 0) {
@@ -5145,8 +5132,9 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
u32 nritems;
int level;
int ret = 1;
+ int keep_locks = path->keep_locks;
- WARN_ON(!path->keep_locks);
+ path->keep_locks = 1;
again:
cur = btrfs_read_lock_root_node(root);
level = btrfs_header_level(cur);
@@ -5210,7 +5198,6 @@ find_next_key:
path->slots[level] = slot;
if (level == path->lowest_level) {
ret = 0;
- unlock_up(path, level, 1, 0, NULL);
goto out;
}
btrfs_set_path_blocking(path);
@@ -5225,9 +5212,12 @@ find_next_key:
btrfs_clear_path_blocking(path, NULL, 0);
}
out:
- if (ret == 0)
+ path->keep_locks = keep_locks;
+ if (ret == 0) {
+ btrfs_unlock_up_safe(path, path->lowest_level + 1);
+ btrfs_set_path_blocking(path);
memcpy(min_key, &found_key, sizeof(found_key));
- btrfs_set_path_blocking(path);
+ }
return ret;
}
@@ -5375,7 +5365,7 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
goto out;
}
- tmp_buf = kmalloc(left_root->leafsize, GFP_NOFS);
+ tmp_buf = kmalloc(left_root->nodesize, GFP_NOFS);
if (!tmp_buf) {
ret = -ENOMEM;
goto out;
@@ -5520,18 +5510,18 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
goto out;
advance_right = ADVANCE;
} else {
- enum btrfs_compare_tree_result cmp;
+ enum btrfs_compare_tree_result result;
WARN_ON(!extent_buffer_uptodate(left_path->nodes[0]));
ret = tree_compare_item(left_root, left_path,
right_path, tmp_buf);
if (ret)
- cmp = BTRFS_COMPARE_TREE_CHANGED;
+ result = BTRFS_COMPARE_TREE_CHANGED;
else
- cmp = BTRFS_COMPARE_TREE_SAME;
+ result = BTRFS_COMPARE_TREE_SAME;
ret = changed_cb(left_root, right_root,
left_path, right_path,
- &left_key, cmp, ctx);
+ &left_key, result, ctx);
if (ret < 0)
goto out;
advance_left = ADVANCE;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 8e29b61..d557264e 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -34,6 +34,7 @@
#include <linux/pagemap.h>
#include <linux/btrfs.h>
#include <linux/workqueue.h>
+#include <linux/security.h>
#include "extent_io.h"
#include "extent_map.h"
#include "async-thread.h"
@@ -62,13 +63,6 @@ struct btrfs_ordered_sum;
#define BTRFS_COMPAT_EXTENT_TREE_V0
-/*
- * files bigger than this get some pre-flushing when they are added
- * to the ordered operations list. That way we limit the total
- * work done by the commit
- */
-#define BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT (8 * 1024 * 1024)
-
/* holds pointers to all of the tree roots */
#define BTRFS_ROOT_TREE_OBJECTID 1ULL
@@ -391,10 +385,12 @@ struct btrfs_header {
sizeof(struct btrfs_header)) / \
sizeof(struct btrfs_key_ptr))
#define __BTRFS_LEAF_DATA_SIZE(bs) ((bs) - sizeof(struct btrfs_header))
-#define BTRFS_LEAF_DATA_SIZE(r) (__BTRFS_LEAF_DATA_SIZE(r->leafsize))
+#define BTRFS_LEAF_DATA_SIZE(r) (__BTRFS_LEAF_DATA_SIZE(r->nodesize))
+#define BTRFS_FILE_EXTENT_INLINE_DATA_START \
+ (offsetof(struct btrfs_file_extent_item, disk_bytenr))
#define BTRFS_MAX_INLINE_DATA_SIZE(r) (BTRFS_LEAF_DATA_SIZE(r) - \
sizeof(struct btrfs_item) - \
- sizeof(struct btrfs_file_extent_item))
+ BTRFS_FILE_EXTENT_INLINE_DATA_START)
#define BTRFS_MAX_XATTR_SIZE(r) (BTRFS_LEAF_DATA_SIZE(r) - \
sizeof(struct btrfs_item) -\
sizeof(struct btrfs_dir_item))
@@ -474,7 +470,7 @@ struct btrfs_super_block {
__le64 num_devices;
__le32 sectorsize;
__le32 nodesize;
- __le32 leafsize;
+ __le32 __unused_leafsize;
__le32 stripesize;
__le32 sys_chunk_array_size;
__le64 chunk_root_generation;
@@ -903,6 +899,8 @@ struct btrfs_file_extent_item {
/*
* disk space consumed by the extent, checksum blocks are included
* in these numbers
+ *
+ * At this offset in the structure, the inline extent data start.
*/
__le64 disk_bytenr;
__le64 disk_num_bytes;
@@ -1305,8 +1303,8 @@ struct btrfs_block_group_cache {
*/
struct list_head cluster_list;
- /* For delayed block group creation */
- struct list_head new_bg_list;
+ /* For delayed block group creation or deletion of empty block groups */
+ struct list_head bg_list;
};
/* delayed seq elem */
@@ -1545,6 +1543,7 @@ struct btrfs_fs_info {
struct btrfs_workqueue *endio_workers;
struct btrfs_workqueue *endio_meta_workers;
struct btrfs_workqueue *endio_raid56_workers;
+ struct btrfs_workqueue *endio_repair_workers;
struct btrfs_workqueue *rmw_workers;
struct btrfs_workqueue *endio_meta_write_workers;
struct btrfs_workqueue *endio_write_workers;
@@ -1574,6 +1573,7 @@ struct btrfs_fs_info {
int do_barriers;
int closing;
int log_root_recovering;
+ int open;
u64 total_pinned;
@@ -1723,6 +1723,12 @@ struct btrfs_fs_info {
/* Used to reclaim the metadata space in the background. */
struct work_struct async_reclaim_work;
+
+ spinlock_t unused_bgs_lock;
+ struct list_head unused_bgs;
+
+ /* For btrfs to record security options */
+ struct security_mnt_opts security_opts;
};
struct btrfs_subvolume_writers {
@@ -1776,12 +1782,12 @@ struct btrfs_root {
/* free ino cache stuff */
struct btrfs_free_space_ctl *free_ino_ctl;
- enum btrfs_caching_type cached;
- spinlock_t cache_lock;
- wait_queue_head_t cache_wait;
+ enum btrfs_caching_type ino_cache_state;
+ spinlock_t ino_cache_lock;
+ wait_queue_head_t ino_cache_wait;
struct btrfs_free_space_ctl *free_ino_pinned;
- u64 cache_progress;
- struct inode *cache_inode;
+ u64 ino_cache_progress;
+ struct inode *ino_cache_inode;
struct mutex log_mutex;
wait_queue_head_t log_writer_wait;
@@ -1806,18 +1812,14 @@ struct btrfs_root {
/* node allocations are done in nodesize units */
u32 nodesize;
- /* leaf allocations are done in leafsize units */
- u32 leafsize;
-
u32 stripesize;
u32 type;
u64 highest_objectid;
-#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+ /* only used with CONFIG_BTRFS_FS_RUN_SANITY_TESTS is enabled */
u64 alloc_bytenr;
-#endif
u64 defrag_trans_start;
struct btrfs_key defrag_progress;
@@ -2094,6 +2096,7 @@ struct btrfs_ioctl_defrag_range_args {
#define BTRFS_MOUNT_CHANGE_INODE_CACHE (1 << 24)
#define BTRFS_DEFAULT_COMMIT_INTERVAL (30)
+#define BTRFS_DEFAULT_MAX_INLINE (8192)
#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt)
#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt)
@@ -2995,8 +2998,6 @@ BTRFS_SETGET_STACK_FUNCS(super_sectorsize, struct btrfs_super_block,
sectorsize, 32);
BTRFS_SETGET_STACK_FUNCS(super_nodesize, struct btrfs_super_block,
nodesize, 32);
-BTRFS_SETGET_STACK_FUNCS(super_leafsize, struct btrfs_super_block,
- leafsize, 32);
BTRFS_SETGET_STACK_FUNCS(super_stripesize, struct btrfs_super_block,
stripesize, 32);
BTRFS_SETGET_STACK_FUNCS(super_root_dir, struct btrfs_super_block,
@@ -3049,14 +3050,12 @@ BTRFS_SETGET_STACK_FUNCS(stack_file_extent_compression,
static inline unsigned long
btrfs_file_extent_inline_start(struct btrfs_file_extent_item *e)
{
- unsigned long offset = (unsigned long)e;
- offset += offsetof(struct btrfs_file_extent_item, disk_bytenr);
- return offset;
+ return (unsigned long)e + BTRFS_FILE_EXTENT_INLINE_DATA_START;
}
static inline u32 btrfs_file_extent_calc_inline_size(u32 datasize)
{
- return offsetof(struct btrfs_file_extent_item, disk_bytenr) + datasize;
+ return BTRFS_FILE_EXTENT_INLINE_DATA_START + datasize;
}
BTRFS_SETGET_FUNCS(file_extent_disk_bytenr, struct btrfs_file_extent_item,
@@ -3086,9 +3085,7 @@ BTRFS_SETGET_FUNCS(file_extent_other_encoding, struct btrfs_file_extent_item,
static inline u32 btrfs_file_extent_inline_item_len(struct extent_buffer *eb,
struct btrfs_item *e)
{
- unsigned long offset;
- offset = offsetof(struct btrfs_file_extent_item, disk_bytenr);
- return btrfs_item_size(eb, e) - offset;
+ return btrfs_item_size(eb, e) - BTRFS_FILE_EXTENT_INLINE_DATA_START;
}
/* this returns the number of file bytes represented by the inline item.
@@ -3232,13 +3229,6 @@ static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
return sb->s_fs_info;
}
-static inline u32 btrfs_level_size(struct btrfs_root *root, int level)
-{
- if (level == 0)
- return root->leafsize;
- return root->nodesize;
-}
-
/* helper function to cast into the data area of the leaf. */
#define btrfs_item_ptr(leaf, slot, type) \
((type *)(btrfs_leaf_data(leaf) + \
@@ -3263,7 +3253,7 @@ static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping)
static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root,
unsigned num_items)
{
- return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) *
+ return (root->nodesize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) *
2 * num_items;
}
@@ -3274,8 +3264,7 @@ static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root,
static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_root *root,
unsigned num_items)
{
- return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) *
- num_items;
+ return root->nodesize * BTRFS_MAX_LEVEL * num_items;
}
int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
@@ -3305,9 +3294,9 @@ struct btrfs_block_group_cache *btrfs_lookup_block_group(
u64 bytenr);
void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
int get_block_group_index(struct btrfs_block_group_cache *cache);
-struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, u32 blocksize,
- u64 parent, u64 root_objectid,
+struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 parent,
+ u64 root_objectid,
struct btrfs_disk_key *key, int level,
u64 hint, u64 empty_size);
void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
@@ -3363,6 +3352,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
u64 size);
int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 group_start);
+void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info);
void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data);
@@ -3604,6 +3594,7 @@ static inline void free_fs_info(struct btrfs_fs_info *fs_info)
kfree(fs_info->uuid_root);
kfree(fs_info->super_copy);
kfree(fs_info->super_for_commit);
+ security_free_mnt_opts(&fs_info->security_opts);
kfree(fs_info);
}
@@ -3739,8 +3730,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
struct bio *bio, u32 *dst);
int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode,
- struct btrfs_dio_private *dip, struct bio *bio,
- u64 logical_offset);
+ struct bio *bio, u64 logical_offset);
int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 objectid, u64 pos,
@@ -4141,8 +4131,15 @@ static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info)
/* Sanity test specific functions */
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
void btrfs_test_destroy_inode(struct inode *inode);
-int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
- u64 rfer, u64 excl);
#endif
+static inline int btrfs_test_is_dummy_root(struct btrfs_root *root)
+{
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+ if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state)))
+ return 1;
+#endif
+ return 0;
+}
+
#endif
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index a2e90f8..054577b 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -1042,7 +1042,7 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
int ret;
key.objectid = node->inode_id;
- btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
+ key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
if (test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &node->flags))
@@ -1099,7 +1099,7 @@ err_out:
search:
btrfs_release_path(path);
- btrfs_set_key_type(&key, BTRFS_INODE_EXTREF_KEY);
+ key.type = BTRFS_INODE_EXTREF_KEY;
key.offset = -1;
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret < 0)
@@ -1473,7 +1473,7 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
}
delayed_item->key.objectid = btrfs_ino(dir);
- btrfs_set_key_type(&delayed_item->key, BTRFS_DIR_INDEX_KEY);
+ delayed_item->key.type = BTRFS_DIR_INDEX_KEY;
delayed_item->key.offset = index;
dir_item = (struct btrfs_dir_item *)delayed_item->data;
@@ -1542,7 +1542,7 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans,
return PTR_ERR(node);
item_key.objectid = btrfs_ino(dir);
- btrfs_set_key_type(&item_key, BTRFS_DIR_INDEX_KEY);
+ item_key.type = BTRFS_DIR_INDEX_KEY;
item_key.offset = index;
ret = btrfs_delete_delayed_insertion_item(root, node, &item_key);
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index eea26e1..6f662b3 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -168,8 +168,12 @@ no_valid_dev_replace_entry_found:
dev_replace->srcdev->total_bytes;
dev_replace->tgtdev->disk_total_bytes =
dev_replace->srcdev->disk_total_bytes;
+ dev_replace->tgtdev->commit_total_bytes =
+ dev_replace->srcdev->commit_total_bytes;
dev_replace->tgtdev->bytes_used =
dev_replace->srcdev->bytes_used;
+ dev_replace->tgtdev->commit_bytes_used =
+ dev_replace->srcdev->commit_bytes_used;
}
dev_replace->tgtdev->is_tgtdev_for_dev_replace = 1;
btrfs_init_dev_replace_tgtdev_for_resume(fs_info,
@@ -329,30 +333,34 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
args->start.tgtdev_name[0] == '\0')
return -EINVAL;
- mutex_lock(&fs_info->volume_mutex);
- ret = btrfs_init_dev_replace_tgtdev(root, args->start.tgtdev_name,
- &tgt_device);
- if (ret) {
- btrfs_err(fs_info, "target device %s is invalid!",
- args->start.tgtdev_name);
- mutex_unlock(&fs_info->volume_mutex);
- return -EINVAL;
+ /*
+ * Here we commit the transaction to make sure commit_total_bytes
+ * of all the devices are updated.
+ */
+ trans = btrfs_attach_transaction(root);
+ if (!IS_ERR(trans)) {
+ ret = btrfs_commit_transaction(trans, root);
+ if (ret)
+ return ret;
+ } else if (PTR_ERR(trans) != -ENOENT) {
+ return PTR_ERR(trans);
}
+ /* the disk copy procedure reuses the scrub code */
+ mutex_lock(&fs_info->volume_mutex);
ret = btrfs_dev_replace_find_srcdev(root, args->start.srcdevid,
args->start.srcdev_name,
&src_device);
- mutex_unlock(&fs_info->volume_mutex);
if (ret) {
- ret = -EINVAL;
- goto leave_no_lock;
+ mutex_unlock(&fs_info->volume_mutex);
+ return ret;
}
- if (tgt_device->total_bytes < src_device->total_bytes) {
- btrfs_err(fs_info, "target device is smaller than source device!");
- ret = -EINVAL;
- goto leave_no_lock;
- }
+ ret = btrfs_init_dev_replace_tgtdev(root, args->start.tgtdev_name,
+ src_device, &tgt_device);
+ mutex_unlock(&fs_info->volume_mutex);
+ if (ret)
+ return ret;
btrfs_dev_replace_lock(dev_replace);
switch (dev_replace->replace_state) {
@@ -380,10 +388,6 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
src_device->devid,
rcu_str_deref(tgt_device->name));
- tgt_device->total_bytes = src_device->total_bytes;
- tgt_device->disk_total_bytes = src_device->disk_total_bytes;
- tgt_device->bytes_used = src_device->bytes_used;
-
/*
* from now on, the writes to the srcdev are all duplicated to
* go to the tgtdev as well (refer to btrfs_map_block()).
@@ -414,7 +418,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
/* the disk copy procedure reuses the scrub code */
ret = btrfs_scrub_dev(fs_info, src_device->devid, 0,
- src_device->total_bytes,
+ btrfs_device_get_total_bytes(src_device),
&dev_replace->scrub_progress, 0, 1);
ret = btrfs_dev_replace_finishing(root->fs_info, ret);
@@ -426,9 +430,7 @@ leave:
dev_replace->srcdev = NULL;
dev_replace->tgtdev = NULL;
btrfs_dev_replace_unlock(dev_replace);
-leave_no_lock:
- if (tgt_device)
- btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
+ btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
return ret;
}
@@ -507,9 +509,10 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
ret = btrfs_commit_transaction(trans, root);
WARN_ON(ret);
+ mutex_lock(&uuid_mutex);
/* keep away write_all_supers() during the finishing procedure */
- mutex_lock(&root->fs_info->chunk_mutex);
mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
+ mutex_lock(&root->fs_info->chunk_mutex);
btrfs_dev_replace_lock(dev_replace);
dev_replace->replace_state =
scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED
@@ -532,8 +535,9 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
src_device->devid,
rcu_str_deref(tgt_device->name), scrub_ret);
btrfs_dev_replace_unlock(dev_replace);
- mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
mutex_unlock(&root->fs_info->chunk_mutex);
+ mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+ mutex_unlock(&uuid_mutex);
if (tgt_device)
btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
@@ -542,7 +546,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
}
printk_in_rcu(KERN_INFO
- "BTRFS: dev_replace from %s (devid %llu) to %s) finished\n",
+ "BTRFS: dev_replace from %s (devid %llu) to %s finished\n",
src_device->missing ? "<missing disk>" :
rcu_str_deref(src_device->name),
src_device->devid,
@@ -550,23 +554,29 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
tgt_device->is_tgtdev_for_dev_replace = 0;
tgt_device->devid = src_device->devid;
src_device->devid = BTRFS_DEV_REPLACE_DEVID;
- tgt_device->bytes_used = src_device->bytes_used;
memcpy(uuid_tmp, tgt_device->uuid, sizeof(uuid_tmp));
memcpy(tgt_device->uuid, src_device->uuid, sizeof(tgt_device->uuid));
memcpy(src_device->uuid, uuid_tmp, sizeof(src_device->uuid));
- tgt_device->total_bytes = src_device->total_bytes;
- tgt_device->disk_total_bytes = src_device->disk_total_bytes;
- tgt_device->bytes_used = src_device->bytes_used;
+ btrfs_device_set_total_bytes(tgt_device, src_device->total_bytes);
+ btrfs_device_set_disk_total_bytes(tgt_device,
+ src_device->disk_total_bytes);
+ btrfs_device_set_bytes_used(tgt_device, src_device->bytes_used);
+ ASSERT(list_empty(&src_device->resized_list));
+ tgt_device->commit_total_bytes = src_device->commit_total_bytes;
+ tgt_device->commit_bytes_used = src_device->bytes_used;
if (fs_info->sb->s_bdev == src_device->bdev)
fs_info->sb->s_bdev = tgt_device->bdev;
if (fs_info->fs_devices->latest_bdev == src_device->bdev)
fs_info->fs_devices->latest_bdev = tgt_device->bdev;
list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list);
+ fs_info->fs_devices->rw_devices++;
/* replace the sysfs entry */
btrfs_kobj_rm_device(fs_info, src_device);
btrfs_kobj_add_device(fs_info, tgt_device);
+ btrfs_dev_replace_unlock(dev_replace);
+
btrfs_rm_dev_replace_blocked(fs_info);
btrfs_rm_dev_replace_srcdev(fs_info, src_device);
@@ -580,9 +590,9 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
* superblock is scratched out so that it is no longer marked to
* belong to this filesystem.
*/
- btrfs_dev_replace_unlock(dev_replace);
- mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
mutex_unlock(&root->fs_info->chunk_mutex);
+ mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+ mutex_unlock(&uuid_mutex);
/* write back the superblocks */
trans = btrfs_start_transaction(root, 0);
@@ -643,6 +653,7 @@ void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
struct btrfs_ioctl_dev_replace_args *args)
{
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+ struct btrfs_device *srcdev;
btrfs_dev_replace_lock(dev_replace);
/* even if !dev_replace_is_valid, the values are good enough for
@@ -665,8 +676,9 @@ void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
break;
case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
+ srcdev = dev_replace->srcdev;
args->status.progress_1000 = div64_u64(dev_replace->cursor_left,
- div64_u64(dev_replace->srcdev->total_bytes, 1000));
+ div64_u64(btrfs_device_get_total_bytes(srcdev), 1000));
break;
}
btrfs_dev_replace_unlock(dev_replace);
@@ -825,7 +837,7 @@ static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info)
ret = btrfs_scrub_dev(fs_info, dev_replace->srcdev->devid,
dev_replace->committed_cursor_left,
- dev_replace->srcdev->total_bytes,
+ btrfs_device_get_total_bytes(dev_replace->srcdev),
&dev_replace->scrub_progress, 0, 1);
ret = btrfs_dev_replace_finishing(fs_info, ret);
WARN_ON(ret);
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index a0691df..fc8df86 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -86,7 +86,7 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
BUG_ON(name_len + data_len > BTRFS_MAX_XATTR_SIZE(root));
key.objectid = objectid;
- btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY);
+ key.type = BTRFS_XATTR_ITEM_KEY;
key.offset = btrfs_name_hash(name, name_len);
data_size = sizeof(*dir_item) + name_len + data_len;
@@ -137,7 +137,7 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
u32 data_size;
key.objectid = btrfs_ino(dir);
- btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY);
+ key.type = BTRFS_DIR_ITEM_KEY;
key.offset = btrfs_name_hash(name, name_len);
path = btrfs_alloc_path();
@@ -204,7 +204,7 @@ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
int cow = mod != 0;
key.objectid = dir;
- btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY);
+ key.type = BTRFS_DIR_ITEM_KEY;
key.offset = btrfs_name_hash(name, name_len);
@@ -234,7 +234,7 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
return -ENOMEM;
key.objectid = dir;
- btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY);
+ key.type = BTRFS_DIR_ITEM_KEY;
key.offset = btrfs_name_hash(name, name_len);
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
@@ -297,7 +297,7 @@ btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
int cow = mod != 0;
key.objectid = dir;
- btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY);
+ key.type = BTRFS_DIR_INDEX_KEY;
key.offset = objectid;
ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
@@ -367,7 +367,7 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
int cow = mod != 0;
key.objectid = dir;
- btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY);
+ key.type = BTRFS_XATTR_ITEM_KEY;
key.offset = btrfs_name_hash(name, name_len);
ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
if (ret < 0)
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 7e221b0..1ad0f47 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -72,21 +72,41 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root);
static void btrfs_error_commit_super(struct btrfs_root *root);
/*
- * end_io_wq structs are used to do processing in task context when an IO is
- * complete. This is used during reads to verify checksums, and it is used
+ * btrfs_end_io_wq structs are used to do processing in task context when an IO
+ * is complete. This is used during reads to verify checksums, and it is used
* by writes to insert metadata for new file extents after IO is complete.
*/
-struct end_io_wq {
+struct btrfs_end_io_wq {
struct bio *bio;
bio_end_io_t *end_io;
void *private;
struct btrfs_fs_info *info;
int error;
- int metadata;
+ enum btrfs_wq_endio_type metadata;
struct list_head list;
struct btrfs_work work;
};
+static struct kmem_cache *btrfs_end_io_wq_cache;
+
+int __init btrfs_end_io_wq_init(void)
+{
+ btrfs_end_io_wq_cache = kmem_cache_create("btrfs_end_io_wq",
+ sizeof(struct btrfs_end_io_wq),
+ 0,
+ SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
+ NULL);
+ if (!btrfs_end_io_wq_cache)
+ return -ENOMEM;
+ return 0;
+}
+
+void btrfs_end_io_wq_exit(void)
+{
+ if (btrfs_end_io_wq_cache)
+ kmem_cache_destroy(btrfs_end_io_wq_cache);
+}
+
/*
* async submit bios are used to offload expensive checksumming
* onto the worker threads. They checksum file and metadata bios
@@ -327,8 +347,7 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
{
struct extent_state *cached_state = NULL;
int ret;
- bool need_lock = (current->journal_info ==
- (void *)BTRFS_SEND_TRANS_STUB);
+ bool need_lock = (current->journal_info == BTRFS_SEND_TRANS_STUB);
if (!parent_transid || btrfs_header_generation(eb) == parent_transid)
return 0;
@@ -348,9 +367,9 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
ret = 0;
goto out;
}
- printk_ratelimited("parent transid verify failed on %llu wanted %llu "
- "found %llu\n",
- eb->start, parent_transid, btrfs_header_generation(eb));
+ printk_ratelimited(KERN_INFO "BTRFS (device %s): parent transid verify failed on %llu wanted %llu found %llu\n",
+ eb->fs_info->sb->s_id, eb->start,
+ parent_transid, btrfs_header_generation(eb));
ret = 1;
/*
@@ -607,22 +626,22 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
goto err;
eb->read_mirror = mirror;
- if (test_bit(EXTENT_BUFFER_IOERR, &eb->bflags)) {
+ if (test_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags)) {
ret = -EIO;
goto err;
}
found_start = btrfs_header_bytenr(eb);
if (found_start != eb->start) {
- printk_ratelimited(KERN_INFO "BTRFS: bad tree block start "
+ printk_ratelimited(KERN_INFO "BTRFS (device %s): bad tree block start "
"%llu %llu\n",
- found_start, eb->start);
+ eb->fs_info->sb->s_id, found_start, eb->start);
ret = -EIO;
goto err;
}
if (check_tree_block_fsid(root, eb)) {
- printk_ratelimited(KERN_INFO "BTRFS: bad fsid on block %llu\n",
- eb->start);
+ printk_ratelimited(KERN_INFO "BTRFS (device %s): bad fsid on block %llu\n",
+ eb->fs_info->sb->s_id, eb->start);
ret = -EIO;
goto err;
}
@@ -680,7 +699,7 @@ static int btree_io_failed_hook(struct page *page, int failed_mirror)
struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
eb = (struct extent_buffer *)page->private;
- set_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
+ set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
eb->read_mirror = failed_mirror;
atomic_dec(&eb->io_pages);
if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
@@ -690,7 +709,7 @@ static int btree_io_failed_hook(struct page *page, int failed_mirror)
static void end_workqueue_bio(struct bio *bio, int err)
{
- struct end_io_wq *end_io_wq = bio->bi_private;
+ struct btrfs_end_io_wq *end_io_wq = bio->bi_private;
struct btrfs_fs_info *fs_info;
struct btrfs_workqueue *wq;
btrfs_work_func_t func;
@@ -713,7 +732,11 @@ static void end_workqueue_bio(struct bio *bio, int err)
func = btrfs_endio_write_helper;
}
} else {
- if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) {
+ if (unlikely(end_io_wq->metadata ==
+ BTRFS_WQ_ENDIO_DIO_REPAIR)) {
+ wq = fs_info->endio_repair_workers;
+ func = btrfs_endio_repair_helper;
+ } else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) {
wq = fs_info->endio_raid56_workers;
func = btrfs_endio_raid56_helper;
} else if (end_io_wq->metadata) {
@@ -729,19 +752,12 @@ static void end_workqueue_bio(struct bio *bio, int err)
btrfs_queue_work(wq, &end_io_wq->work);
}
-/*
- * For the metadata arg you want
- *
- * 0 - if data
- * 1 - if normal metadta
- * 2 - if writing to the free space cache area
- * 3 - raid parity work
- */
int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
- int metadata)
+ enum btrfs_wq_endio_type metadata)
{
- struct end_io_wq *end_io_wq;
- end_io_wq = kmalloc(sizeof(*end_io_wq), GFP_NOFS);
+ struct btrfs_end_io_wq *end_io_wq;
+
+ end_io_wq = kmem_cache_alloc(btrfs_end_io_wq_cache, GFP_NOFS);
if (!end_io_wq)
return -ENOMEM;
@@ -925,7 +941,7 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
* can happen in the async kernel threads
*/
ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info,
- bio, 1);
+ bio, BTRFS_WQ_ENDIO_METADATA);
if (ret)
goto out_w_error;
ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
@@ -1057,20 +1073,17 @@ static const struct address_space_operations btree_aops = {
.set_page_dirty = btree_set_page_dirty,
};
-int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
- u64 parent_transid)
+void readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize)
{
struct extent_buffer *buf = NULL;
struct inode *btree_inode = root->fs_info->btree_inode;
- int ret = 0;
buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
if (!buf)
- return 0;
+ return;
read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree,
buf, 0, WAIT_NONE, btree_get_extent, 0);
free_extent_buffer(buf);
- return ret;
}
int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize,
@@ -1106,7 +1119,7 @@ int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize,
}
struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
- u64 bytenr, u32 blocksize)
+ u64 bytenr)
{
return find_extent_buffer(root->fs_info, bytenr);
}
@@ -1114,11 +1127,9 @@ struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
u64 bytenr, u32 blocksize)
{
-#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
- if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state)))
+ if (btrfs_test_is_dummy_root(root))
return alloc_test_extent_buffer(root->fs_info, bytenr,
blocksize);
-#endif
return alloc_extent_buffer(root->fs_info, bytenr, blocksize);
}
@@ -1136,12 +1147,12 @@ int btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
}
struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
- u32 blocksize, u64 parent_transid)
+ u64 parent_transid)
{
struct extent_buffer *buf = NULL;
int ret;
- buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
+ buf = btrfs_find_create_tree_block(root, bytenr, root->nodesize);
if (!buf)
return NULL;
@@ -1183,7 +1194,7 @@ static struct btrfs_subvolume_writers *btrfs_alloc_subvolume_writers(void)
if (!writers)
return ERR_PTR(-ENOMEM);
- ret = percpu_counter_init(&writers->counter, 0);
+ ret = percpu_counter_init(&writers->counter, 0, GFP_KERNEL);
if (ret < 0) {
kfree(writers);
return ERR_PTR(ret);
@@ -1200,16 +1211,14 @@ btrfs_free_subvolume_writers(struct btrfs_subvolume_writers *writers)
kfree(writers);
}
-static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
- u32 stripesize, struct btrfs_root *root,
- struct btrfs_fs_info *fs_info,
+static void __setup_root(u32 nodesize, u32 sectorsize, u32 stripesize,
+ struct btrfs_root *root, struct btrfs_fs_info *fs_info,
u64 objectid)
{
root->node = NULL;
root->commit_root = NULL;
root->sectorsize = sectorsize;
root->nodesize = nodesize;
- root->leafsize = leafsize;
root->stripesize = stripesize;
root->state = 0;
root->orphan_cleanup_state = 0;
@@ -1295,7 +1304,7 @@ struct btrfs_root *btrfs_alloc_dummy_root(void)
root = btrfs_alloc_root(NULL);
if (!root)
return ERR_PTR(-ENOMEM);
- __setup_root(4096, 4096, 4096, 4096, root, NULL, 1);
+ __setup_root(4096, 4096, 4096, root, NULL, 1);
set_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state);
root->alloc_bytenr = 0;
@@ -1318,15 +1327,13 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
if (!root)
return ERR_PTR(-ENOMEM);
- __setup_root(tree_root->nodesize, tree_root->leafsize,
- tree_root->sectorsize, tree_root->stripesize,
- root, fs_info, objectid);
+ __setup_root(tree_root->nodesize, tree_root->sectorsize,
+ tree_root->stripesize, root, fs_info, objectid);
root->root_key.objectid = objectid;
root->root_key.type = BTRFS_ROOT_ITEM_KEY;
root->root_key.offset = 0;
- leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
- 0, objectid, NULL, 0, 0, 0);
+ leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0);
if (IS_ERR(leaf)) {
ret = PTR_ERR(leaf);
leaf = NULL;
@@ -1396,9 +1403,9 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
if (!root)
return ERR_PTR(-ENOMEM);
- __setup_root(tree_root->nodesize, tree_root->leafsize,
- tree_root->sectorsize, tree_root->stripesize,
- root, fs_info, BTRFS_TREE_LOG_OBJECTID);
+ __setup_root(tree_root->nodesize, tree_root->sectorsize,
+ tree_root->stripesize, root, fs_info,
+ BTRFS_TREE_LOG_OBJECTID);
root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID;
root->root_key.type = BTRFS_ROOT_ITEM_KEY;
@@ -1413,9 +1420,8 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
* updated (along with back refs to the log tree).
*/
- leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
- BTRFS_TREE_LOG_OBJECTID, NULL,
- 0, 0, 0);
+ leaf = btrfs_alloc_tree_block(trans, root, 0, BTRFS_TREE_LOG_OBJECTID,
+ NULL, 0, 0, 0);
if (IS_ERR(leaf)) {
kfree(root);
return ERR_CAST(leaf);
@@ -1465,7 +1471,7 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
btrfs_set_stack_inode_generation(inode_item, 1);
btrfs_set_stack_inode_size(inode_item, 3);
btrfs_set_stack_inode_nlink(inode_item, 1);
- btrfs_set_stack_inode_nbytes(inode_item, root->leafsize);
+ btrfs_set_stack_inode_nbytes(inode_item, root->nodesize);
btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755);
btrfs_set_root_node(&log_root->root_item, log_root->node);
@@ -1485,7 +1491,6 @@ static struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
struct btrfs_fs_info *fs_info = tree_root->fs_info;
struct btrfs_path *path;
u64 generation;
- u32 blocksize;
int ret;
path = btrfs_alloc_path();
@@ -1498,9 +1503,8 @@ static struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
goto alloc_fail;
}
- __setup_root(tree_root->nodesize, tree_root->leafsize,
- tree_root->sectorsize, tree_root->stripesize,
- root, fs_info, key->objectid);
+ __setup_root(tree_root->nodesize, tree_root->sectorsize,
+ tree_root->stripesize, root, fs_info, key->objectid);
ret = btrfs_find_root(tree_root, key, path,
&root->root_item, &root->root_key);
@@ -1511,9 +1515,8 @@ static struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
}
generation = btrfs_root_generation(&root->root_item);
- blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
- blocksize, generation);
+ generation);
if (!root->node) {
ret = -ENOMEM;
goto find_fail;
@@ -1573,8 +1576,8 @@ int btrfs_init_fs_root(struct btrfs_root *root)
root->subv_writers = writers;
btrfs_init_free_ino_ctl(root);
- spin_lock_init(&root->cache_lock);
- init_waitqueue_head(&root->cache_wait);
+ spin_lock_init(&root->ino_cache_lock);
+ init_waitqueue_head(&root->ino_cache_wait);
ret = get_anon_bdev(&root->anon_dev);
if (ret)
@@ -1708,10 +1711,6 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits)
return ret;
}
-/*
- * If this fails, caller must call bdi_destroy() to get rid of the
- * bdi again.
- */
static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
{
int err;
@@ -1734,16 +1733,16 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
static void end_workqueue_fn(struct btrfs_work *work)
{
struct bio *bio;
- struct end_io_wq *end_io_wq;
+ struct btrfs_end_io_wq *end_io_wq;
int error;
- end_io_wq = container_of(work, struct end_io_wq, work);
+ end_io_wq = container_of(work, struct btrfs_end_io_wq, work);
bio = end_io_wq->bio;
error = end_io_wq->error;
bio->bi_private = end_io_wq->private;
bio->bi_end_io = end_io_wq->end_io;
- kfree(end_io_wq);
+ kmem_cache_free(btrfs_end_io_wq_cache, end_io_wq);
bio_endio_nodec(bio, error);
}
@@ -1772,6 +1771,7 @@ static int cleaner_kthread(void *arg)
}
btrfs_run_delayed_iputs(root);
+ btrfs_delete_unused_bgs(root->fs_info);
again = btrfs_clean_one_deleted_snapshot(root);
mutex_unlock(&root->fs_info->cleaner_mutex);
@@ -2063,6 +2063,7 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
btrfs_destroy_workqueue(fs_info->endio_workers);
btrfs_destroy_workqueue(fs_info->endio_meta_workers);
btrfs_destroy_workqueue(fs_info->endio_raid56_workers);
+ btrfs_destroy_workqueue(fs_info->endio_repair_workers);
btrfs_destroy_workqueue(fs_info->rmw_workers);
btrfs_destroy_workqueue(fs_info->endio_meta_write_workers);
btrfs_destroy_workqueue(fs_info->endio_write_workers);
@@ -2143,8 +2144,6 @@ int open_ctree(struct super_block *sb,
{
u32 sectorsize;
u32 nodesize;
- u32 leafsize;
- u32 blocksize;
u32 stripesize;
u64 generation;
u64 features;
@@ -2188,7 +2187,7 @@ int open_ctree(struct super_block *sb,
goto fail_srcu;
}
- ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0);
+ ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL);
if (ret) {
err = ret;
goto fail_bdi;
@@ -2196,13 +2195,13 @@ int open_ctree(struct super_block *sb,
fs_info->dirty_metadata_batch = PAGE_CACHE_SIZE *
(1 + ilog2(nr_cpu_ids));
- ret = percpu_counter_init(&fs_info->delalloc_bytes, 0);
+ ret = percpu_counter_init(&fs_info->delalloc_bytes, 0, GFP_KERNEL);
if (ret) {
err = ret;
goto fail_dirty_metadata_bytes;
}
- ret = percpu_counter_init(&fs_info->bio_counter, 0);
+ ret = percpu_counter_init(&fs_info->bio_counter, 0, GFP_KERNEL);
if (ret) {
err = ret;
goto fail_delalloc_bytes;
@@ -2233,6 +2232,7 @@ int open_ctree(struct super_block *sb,
spin_lock_init(&fs_info->super_lock);
spin_lock_init(&fs_info->qgroup_op_lock);
spin_lock_init(&fs_info->buffer_lock);
+ spin_lock_init(&fs_info->unused_bgs_lock);
rwlock_init(&fs_info->tree_mod_log_lock);
mutex_init(&fs_info->reloc_mutex);
mutex_init(&fs_info->delalloc_root_mutex);
@@ -2242,6 +2242,7 @@ int open_ctree(struct super_block *sb,
INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
INIT_LIST_HEAD(&fs_info->space_info);
INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
+ INIT_LIST_HEAD(&fs_info->unused_bgs);
btrfs_mapping_init(&fs_info->mapping_tree);
btrfs_init_block_rsv(&fs_info->global_block_rsv,
BTRFS_BLOCK_RSV_GLOBAL);
@@ -2260,7 +2261,7 @@ int open_ctree(struct super_block *sb,
atomic_set(&fs_info->qgroup_op_seq, 0);
atomic64_set(&fs_info->tree_mod_seq, 0);
fs_info->sb = sb;
- fs_info->max_inline = 8192 * 1024;
+ fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE;
fs_info->metadata_ratio = 0;
fs_info->defrag_inodes = RB_ROOT;
fs_info->free_chunk_space = 0;
@@ -2389,7 +2390,7 @@ int open_ctree(struct super_block *sb,
goto fail_alloc;
}
- __setup_root(4096, 4096, 4096, 4096, tree_root,
+ __setup_root(4096, 4096, 4096, tree_root,
fs_info, BTRFS_ROOT_TREE_OBJECTID);
invalidate_bdev(fs_devices->latest_bdev);
@@ -2469,19 +2470,22 @@ int open_ctree(struct super_block *sb,
goto fail_alloc;
}
- if (btrfs_super_leafsize(disk_super) !=
+ /*
+ * Leafsize and nodesize were always equal, this is only a sanity check.
+ */
+ if (le32_to_cpu(disk_super->__unused_leafsize) !=
btrfs_super_nodesize(disk_super)) {
printk(KERN_ERR "BTRFS: couldn't mount because metadata "
"blocksizes don't match. node %d leaf %d\n",
btrfs_super_nodesize(disk_super),
- btrfs_super_leafsize(disk_super));
+ le32_to_cpu(disk_super->__unused_leafsize));
err = -EINVAL;
goto fail_alloc;
}
- if (btrfs_super_leafsize(disk_super) > BTRFS_MAX_METADATA_BLOCKSIZE) {
+ if (btrfs_super_nodesize(disk_super) > BTRFS_MAX_METADATA_BLOCKSIZE) {
printk(KERN_ERR "BTRFS: couldn't mount because metadata "
"blocksize (%d) was too large\n",
- btrfs_super_leafsize(disk_super));
+ btrfs_super_nodesize(disk_super));
err = -EINVAL;
goto fail_alloc;
}
@@ -2498,17 +2502,16 @@ int open_ctree(struct super_block *sb,
* flag our filesystem as having big metadata blocks if
* they are bigger than the page size
*/
- if (btrfs_super_leafsize(disk_super) > PAGE_CACHE_SIZE) {
+ if (btrfs_super_nodesize(disk_super) > PAGE_CACHE_SIZE) {
if (!(features & BTRFS_FEATURE_INCOMPAT_BIG_METADATA))
printk(KERN_INFO "BTRFS: flagging fs with big metadata feature\n");
features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA;
}
nodesize = btrfs_super_nodesize(disk_super);
- leafsize = btrfs_super_leafsize(disk_super);
sectorsize = btrfs_super_sectorsize(disk_super);
stripesize = btrfs_super_stripesize(disk_super);
- fs_info->dirty_metadata_batch = leafsize * (1 + ilog2(nr_cpu_ids));
+ fs_info->dirty_metadata_batch = nodesize * (1 + ilog2(nr_cpu_ids));
fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids));
/*
@@ -2516,7 +2519,7 @@ int open_ctree(struct super_block *sb,
* extent buffers for the same range. It leads to corruptions
*/
if ((features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) &&
- (sectorsize != leafsize)) {
+ (sectorsize != nodesize)) {
printk(KERN_WARNING "BTRFS: unequal leaf/node/sector sizes "
"are not allowed for mixed block groups on %s\n",
sb->s_id);
@@ -2579,6 +2582,8 @@ int open_ctree(struct super_block *sb,
btrfs_alloc_workqueue("endio-meta-write", flags, max_active, 2);
fs_info->endio_raid56_workers =
btrfs_alloc_workqueue("endio-raid56", flags, max_active, 4);
+ fs_info->endio_repair_workers =
+ btrfs_alloc_workqueue("endio-repair", flags, 1, 0);
fs_info->rmw_workers =
btrfs_alloc_workqueue("rmw", flags, max_active, 2);
fs_info->endio_write_workers =
@@ -2600,11 +2605,12 @@ int open_ctree(struct super_block *sb,
fs_info->submit_workers && fs_info->flush_workers &&
fs_info->endio_workers && fs_info->endio_meta_workers &&
fs_info->endio_meta_write_workers &&
+ fs_info->endio_repair_workers &&
fs_info->endio_write_workers && fs_info->endio_raid56_workers &&
fs_info->endio_freespace_worker && fs_info->rmw_workers &&
fs_info->caching_workers && fs_info->readahead_workers &&
fs_info->fixup_workers && fs_info->delayed_workers &&
- fs_info->fixup_workers && fs_info->extent_workers &&
+ fs_info->extent_workers &&
fs_info->qgroup_rescan_workers)) {
err = -ENOMEM;
goto fail_sb_buffer;
@@ -2615,7 +2621,6 @@ int open_ctree(struct super_block *sb,
4 * 1024 * 1024 / PAGE_CACHE_SIZE);
tree_root->nodesize = nodesize;
- tree_root->leafsize = leafsize;
tree_root->sectorsize = sectorsize;
tree_root->stripesize = stripesize;
@@ -2642,16 +2647,14 @@ int open_ctree(struct super_block *sb,
goto fail_sb_buffer;
}
- blocksize = btrfs_level_size(tree_root,
- btrfs_super_chunk_root_level(disk_super));
generation = btrfs_super_chunk_root_generation(disk_super);
- __setup_root(nodesize, leafsize, sectorsize, stripesize,
- chunk_root, fs_info, BTRFS_CHUNK_TREE_OBJECTID);
+ __setup_root(nodesize, sectorsize, stripesize, chunk_root,
+ fs_info, BTRFS_CHUNK_TREE_OBJECTID);
chunk_root->node = read_tree_block(chunk_root,
btrfs_super_chunk_root(disk_super),
- blocksize, generation);
+ generation);
if (!chunk_root->node ||
!test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) {
printk(KERN_WARNING "BTRFS: failed to read chunk root on %s\n",
@@ -2684,13 +2687,11 @@ int open_ctree(struct super_block *sb,
}
retry_root_backup:
- blocksize = btrfs_level_size(tree_root,
- btrfs_super_root_level(disk_super));
generation = btrfs_super_generation(disk_super);
tree_root->node = read_tree_block(tree_root,
btrfs_super_root(disk_super),
- blocksize, generation);
+ generation);
if (!tree_root->node ||
!test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) {
printk(KERN_WARNING "BTRFS: failed to read tree root on %s\n",
@@ -2859,9 +2860,6 @@ retry_root_backup:
err = -EIO;
goto fail_qgroup;
}
- blocksize =
- btrfs_level_size(tree_root,
- btrfs_super_log_root_level(disk_super));
log_tree_root = btrfs_alloc_root(fs_info);
if (!log_tree_root) {
@@ -2869,11 +2867,10 @@ retry_root_backup:
goto fail_qgroup;
}
- __setup_root(nodesize, leafsize, sectorsize, stripesize,
+ __setup_root(nodesize, sectorsize, stripesize,
log_tree_root, fs_info, BTRFS_TREE_LOG_OBJECTID);
log_tree_root->node = read_tree_block(tree_root, bytenr,
- blocksize,
generation + 1);
if (!log_tree_root->node ||
!extent_buffer_uptodate(log_tree_root->node)) {
@@ -2980,6 +2977,8 @@ retry_root_backup:
fs_info->update_uuid_tree_gen = 1;
}
+ fs_info->open = 1;
+
return 0;
fail_qgroup:
@@ -3139,7 +3138,8 @@ static int write_dev_supers(struct btrfs_device *device,
for (i = 0; i < max_mirrors; i++) {
bytenr = btrfs_sb_offset(i);
- if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes)
+ if (bytenr + BTRFS_SUPER_INFO_SIZE >=
+ device->commit_total_bytes)
break;
if (wait) {
@@ -3456,8 +3456,9 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors)
btrfs_set_stack_device_type(dev_item, dev->type);
btrfs_set_stack_device_id(dev_item, dev->devid);
btrfs_set_stack_device_total_bytes(dev_item,
- dev->disk_total_bytes);
- btrfs_set_stack_device_bytes_used(dev_item, dev->bytes_used);
+ dev->commit_total_bytes);
+ btrfs_set_stack_device_bytes_used(dev_item,
+ dev->commit_bytes_used);
btrfs_set_stack_device_io_align(dev_item, dev->io_align);
btrfs_set_stack_device_io_width(dev_item, dev->io_width);
btrfs_set_stack_device_sector_size(dev_item, dev->sector_size);
@@ -3532,7 +3533,7 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
static void free_fs_root(struct btrfs_root *root)
{
- iput(root->cache_inode);
+ iput(root->ino_cache_inode);
WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree));
btrfs_free_block_rsv(root, root->orphan_block_rsv);
root->orphan_block_rsv = NULL;
@@ -3623,7 +3624,7 @@ int btrfs_commit_super(struct btrfs_root *root)
return btrfs_commit_transaction(trans, root);
}
-int close_ctree(struct btrfs_root *root)
+void close_ctree(struct btrfs_root *root)
{
struct btrfs_fs_info *fs_info = root->fs_info;
int ret;
@@ -3689,6 +3690,7 @@ int close_ctree(struct btrfs_root *root)
invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
btrfs_stop_all_workers(fs_info);
+ fs_info->open = 0;
free_root_pointers(fs_info, 1);
iput(fs_info->btree_inode);
@@ -3711,8 +3713,6 @@ int close_ctree(struct btrfs_root *root)
btrfs_free_block_rsv(root, root->orphan_block_rsv);
root->orphan_block_rsv = NULL;
-
- return 0;
}
int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
@@ -3814,10 +3814,73 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
int read_only)
{
+ struct btrfs_super_block *sb = fs_info->super_copy;
+ int ret = 0;
+
+ if (sb->root_level > BTRFS_MAX_LEVEL) {
+ printk(KERN_ERR "BTRFS: tree_root level too big: %d > %d\n",
+ sb->root_level, BTRFS_MAX_LEVEL);
+ ret = -EINVAL;
+ }
+ if (sb->chunk_root_level > BTRFS_MAX_LEVEL) {
+ printk(KERN_ERR "BTRFS: chunk_root level too big: %d > %d\n",
+ sb->chunk_root_level, BTRFS_MAX_LEVEL);
+ ret = -EINVAL;
+ }
+ if (sb->log_root_level > BTRFS_MAX_LEVEL) {
+ printk(KERN_ERR "BTRFS: log_root level too big: %d > %d\n",
+ sb->log_root_level, BTRFS_MAX_LEVEL);
+ ret = -EINVAL;
+ }
+
/*
- * Placeholder for checks
+ * The common minimum, we don't know if we can trust the nodesize/sectorsize
+ * items yet, they'll be verified later. Issue just a warning.
*/
- return 0;
+ if (!IS_ALIGNED(sb->root, 4096))
+ printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n",
+ sb->root);
+ if (!IS_ALIGNED(sb->chunk_root, 4096))
+ printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n",
+ sb->chunk_root);
+ if (!IS_ALIGNED(sb->log_root, 4096))
+ printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n",
+ sb->log_root);
+
+ if (memcmp(fs_info->fsid, sb->dev_item.fsid, BTRFS_UUID_SIZE) != 0) {
+ printk(KERN_ERR "BTRFS: dev_item UUID does not match fsid: %pU != %pU\n",
+ fs_info->fsid, sb->dev_item.fsid);
+ ret = -EINVAL;
+ }
+
+ /*
+ * Hint to catch really bogus numbers, bitflips or so, more exact checks are
+ * done later
+ */
+ if (sb->num_devices > (1UL << 31))
+ printk(KERN_WARNING "BTRFS: suspicious number of devices: %llu\n",
+ sb->num_devices);
+
+ if (sb->bytenr != BTRFS_SUPER_INFO_OFFSET) {
+ printk(KERN_ERR "BTRFS: super offset mismatch %llu != %u\n",
+ sb->bytenr, BTRFS_SUPER_INFO_OFFSET);
+ ret = -EINVAL;
+ }
+
+ /*
+ * The generation is a global counter, we'll trust it more than the others
+ * but it's still possible that it's the one that's wrong.
+ */
+ if (sb->generation < sb->chunk_root_generation)
+ printk(KERN_WARNING
+ "BTRFS: suspicious: generation < chunk_root_generation: %llu < %llu\n",
+ sb->generation, sb->chunk_root_generation);
+ if (sb->generation < sb->cache_generation && sb->cache_generation != (u64)-1)
+ printk(KERN_WARNING
+ "BTRFS: suspicious: generation < cache_generation: %llu < %llu\n",
+ sb->generation, sb->cache_generation);
+
+ return ret;
}
static void btrfs_error_commit_super(struct btrfs_root *root)
@@ -4009,9 +4072,8 @@ static int btrfs_destroy_marked_extents(struct btrfs_root *root,
clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS);
while (start <= end) {
- eb = btrfs_find_tree_block(root, start,
- root->leafsize);
- start += root->leafsize;
+ eb = btrfs_find_tree_block(root, start);
+ start += root->nodesize;
if (!eb)
continue;
wait_on_extent_buffer_writeback(eb);
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 23ce3ce..4146518 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -25,11 +25,12 @@
#define BTRFS_SUPER_MIRROR_MAX 3
#define BTRFS_SUPER_MIRROR_SHIFT 12
-enum {
+enum btrfs_wq_endio_type {
BTRFS_WQ_ENDIO_DATA = 0,
BTRFS_WQ_ENDIO_METADATA = 1,
BTRFS_WQ_ENDIO_FREE_SPACE = 2,
BTRFS_WQ_ENDIO_RAID56 = 3,
+ BTRFS_WQ_ENDIO_DIO_REPAIR = 4,
};
static inline u64 btrfs_sb_offset(int mirror)
@@ -44,9 +45,8 @@ struct btrfs_device;
struct btrfs_fs_devices;
struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
- u32 blocksize, u64 parent_transid);
-int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
- u64 parent_transid);
+ u64 parent_transid);
+void readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize);
int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize,
int mirror_num, struct extent_buffer **eb);
struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
@@ -56,13 +56,13 @@ void clean_tree_block(struct btrfs_trans_handle *trans,
int open_ctree(struct super_block *sb,
struct btrfs_fs_devices *fs_devices,
char *options);
-int close_ctree(struct btrfs_root *root);
+void close_ctree(struct btrfs_root *root);
int write_ctree_super(struct btrfs_trans_handle *trans,
struct btrfs_root *root, int max_mirrors);
struct buffer_head *btrfs_read_dev_super(struct block_device *bdev);
int btrfs_commit_super(struct btrfs_root *root);
struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
- u64 bytenr, u32 blocksize);
+ u64 bytenr);
struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root,
struct btrfs_key *location);
int btrfs_init_fs_root(struct btrfs_root *root);
@@ -119,7 +119,7 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid);
u32 btrfs_csum_data(char *data, u32 seed, size_t len);
void btrfs_csum_final(u32 crc, char *result);
int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
- int metadata);
+ enum btrfs_wq_endio_type metadata);
int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
int rw, struct bio *bio, int mirror_num,
unsigned long bio_flags, u64 bio_offset,
@@ -141,6 +141,8 @@ int btree_lock_page_hook(struct page *page, void *data,
void (*flush_fn)(void *));
int btrfs_calc_num_tolerated_disk_barrier_failures(
struct btrfs_fs_info *fs_info);
+int __init btrfs_end_io_wq_init(void);
+void btrfs_end_io_wq_exit(void);
#ifdef CONFIG_DEBUG_LOCK_ALLOC
void btrfs_init_lockdep(void);
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 41422a3..37d1645 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -70,7 +70,7 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
return ERR_PTR(-ESTALE);
key.objectid = root_objectid;
- btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
+ key.type = BTRFS_ROOT_ITEM_KEY;
key.offset = (u64)-1;
index = srcu_read_lock(&fs_info->subvol_srcu);
@@ -82,7 +82,7 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
}
key.objectid = objectid;
- btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
+ key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
inode = btrfs_iget(sb, &key, root, NULL);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 3efe1c3..d565895 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -491,7 +491,7 @@ next:
key.objectid);
if (key.type == BTRFS_METADATA_ITEM_KEY)
last = key.objectid +
- fs_info->tree_root->leafsize;
+ fs_info->tree_root->nodesize;
else
last = key.objectid + key.offset;
@@ -765,7 +765,7 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
* different
*/
if (metadata && !btrfs_fs_incompat(root->fs_info, SKINNY_METADATA)) {
- offset = root->leafsize;
+ offset = root->nodesize;
metadata = 0;
}
@@ -799,13 +799,13 @@ again:
path->slots[0]);
if (key.objectid == bytenr &&
key.type == BTRFS_EXTENT_ITEM_KEY &&
- key.offset == root->leafsize)
+ key.offset == root->nodesize)
ret = 0;
}
if (ret) {
key.objectid = bytenr;
key.type = BTRFS_EXTENT_ITEM_KEY;
- key.offset = root->leafsize;
+ key.offset = root->nodesize;
btrfs_release_path(path);
goto again;
}
@@ -2651,7 +2651,7 @@ int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
num_bytes = btrfs_calc_trans_metadata_size(root, 1);
num_heads = heads_to_leaves(root, num_heads);
if (num_heads > 1)
- num_bytes += (num_heads - 1) * root->leafsize;
+ num_bytes += (num_heads - 1) * root->nodesize;
num_bytes <<= 1;
global_rsv = &root->fs_info->global_block_rsv;
@@ -3073,10 +3073,10 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
u64, u64, u64, u64, u64, u64, int);
-#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
- if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state)))
+
+ if (btrfs_test_is_dummy_root(root))
return 0;
-#endif
+
ref_root = btrfs_header_owner(buf);
nritems = btrfs_header_nritems(buf);
level = btrfs_header_level(buf);
@@ -3097,7 +3097,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
for (i = 0; i < nritems; i++) {
if (level == 0) {
btrfs_item_key_to_cpu(buf, &key, i);
- if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
+ if (key.type != BTRFS_EXTENT_DATA_KEY)
continue;
fi = btrfs_item_ptr(buf, i,
struct btrfs_file_extent_item);
@@ -3117,7 +3117,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
goto fail;
} else {
bytenr = btrfs_node_blockptr(buf, i);
- num_bytes = btrfs_level_size(root, level - 1);
+ num_bytes = root->nodesize;
ret = process_func(trans, root, bytenr, num_bytes,
parent, ref_root, level - 1, 0,
1);
@@ -3494,7 +3494,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
if (!found)
return -ENOMEM;
- ret = percpu_counter_init(&found->total_bytes_pinned, 0);
+ ret = percpu_counter_init(&found->total_bytes_pinned, 0, GFP_KERNEL);
if (ret) {
kfree(found);
return ret;
@@ -4343,11 +4343,21 @@ static inline int need_do_async_reclaim(struct btrfs_space_info *space_info,
}
static int btrfs_need_do_async_reclaim(struct btrfs_space_info *space_info,
- struct btrfs_fs_info *fs_info)
+ struct btrfs_fs_info *fs_info,
+ int flush_state)
{
u64 used;
spin_lock(&space_info->lock);
+ /*
+ * We run out of space and have not got any free space via flush_space,
+ * so don't bother doing async reclaim.
+ */
+ if (flush_state > COMMIT_TRANS && space_info->full) {
+ spin_unlock(&space_info->lock);
+ return 0;
+ }
+
used = space_info->bytes_used + space_info->bytes_reserved +
space_info->bytes_pinned + space_info->bytes_readonly +
space_info->bytes_may_use;
@@ -4380,11 +4390,12 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
flush_space(fs_info->fs_root, space_info, to_reclaim,
to_reclaim, flush_state);
flush_state++;
- if (!btrfs_need_do_async_reclaim(space_info, fs_info))
+ if (!btrfs_need_do_async_reclaim(space_info, fs_info,
+ flush_state))
return;
} while (flush_state <= COMMIT_TRANS);
- if (btrfs_need_do_async_reclaim(space_info, fs_info))
+ if (btrfs_need_do_async_reclaim(space_info, fs_info, flush_state))
queue_work(system_unbound_wq, work);
}
@@ -4502,7 +4513,13 @@ again:
space_info->flush = 1;
} else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
used += orig_bytes;
- if (need_do_async_reclaim(space_info, root->fs_info, used) &&
+ /*
+ * We will do the space reservation dance during log replay,
+ * which means we won't have fs_info->fs_root set, so don't do
+ * the async reclaim as we will panic.
+ */
+ if (!root->fs_info->log_root_recovering &&
+ need_do_async_reclaim(space_info, root->fs_info, used) &&
!work_busy(&root->fs_info->async_reclaim_work))
queue_work(system_unbound_wq,
&root->fs_info->async_reclaim_work);
@@ -4839,7 +4856,7 @@ static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)
if (num_bytes * 3 > meta_used)
num_bytes = div64_u64(meta_used, 3);
- return ALIGN(num_bytes, fs_info->extent_root->leafsize << 10);
+ return ALIGN(num_bytes, fs_info->extent_root->nodesize << 10);
}
static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
@@ -4988,7 +5005,7 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
if (root->fs_info->quota_enabled) {
/* One for parent inode, two for dir entries */
- num_bytes = 3 * root->leafsize;
+ num_bytes = 3 * root->nodesize;
ret = btrfs_qgroup_reserve(root, num_bytes);
if (ret)
return ret;
@@ -5176,7 +5193,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
if (root->fs_info->quota_enabled) {
ret = btrfs_qgroup_reserve(root, num_bytes +
- nr_extents * root->leafsize);
+ nr_extents * root->nodesize);
if (ret)
goto out_fail;
}
@@ -5185,7 +5202,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
if (unlikely(ret)) {
if (root->fs_info->quota_enabled)
btrfs_qgroup_free(root, num_bytes +
- nr_extents * root->leafsize);
+ nr_extents * root->nodesize);
goto out_fail;
}
@@ -5301,7 +5318,7 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
btrfs_ino(inode), to_free, 0);
if (root->fs_info->quota_enabled) {
btrfs_qgroup_free(root, num_bytes +
- dropped * root->leafsize);
+ dropped * root->nodesize);
}
btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
@@ -5422,6 +5439,20 @@ static int update_block_group(struct btrfs_root *root,
spin_unlock(&cache->space_info->lock);
} else {
old_val -= num_bytes;
+
+ /*
+ * No longer have used bytes in this block group, queue
+ * it for deletion.
+ */
+ if (old_val == 0) {
+ spin_lock(&info->unused_bgs_lock);
+ if (list_empty(&cache->bg_list)) {
+ btrfs_get_block_group(cache);
+ list_add_tail(&cache->bg_list,
+ &info->unused_bgs);
+ }
+ spin_unlock(&info->unused_bgs_lock);
+ }
btrfs_set_block_group_used(&cache->item, old_val);
cache->pinned += num_bytes;
cache->space_info->bytes_pinned += num_bytes;
@@ -6233,10 +6264,9 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
int ret;
struct btrfs_fs_info *fs_info = root->fs_info;
-#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
- if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state)))
+ if (btrfs_test_is_dummy_root(root))
return 0;
-#endif
+
add_pinned_bytes(root->fs_info, num_bytes, owner, root_objectid);
/*
@@ -6263,14 +6293,6 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
return ret;
}
-static u64 stripe_align(struct btrfs_root *root,
- struct btrfs_block_group_cache *cache,
- u64 val, u64 num_bytes)
-{
- u64 ret = ALIGN(val, root->stripesize);
- return ret;
-}
-
/*
* when we wait for progress in the block group caching, its because
* our allocation attempt failed at least once. So, we must sleep
@@ -6464,7 +6486,7 @@ static noinline int find_free_extent(struct btrfs_root *orig_root,
bool have_caching_bg = false;
WARN_ON(num_bytes < root->sectorsize);
- btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
+ ins->type = BTRFS_EXTENT_ITEM_KEY;
ins->objectid = 0;
ins->offset = 0;
@@ -6751,8 +6773,7 @@ unclustered_alloc:
goto loop;
}
checks:
- search_start = stripe_align(root, block_group,
- offset, num_bytes);
+ search_start = ALIGN(offset, root->stripesize);
/* move on to the next group */
if (search_start + num_bytes >
@@ -7077,7 +7098,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
path = btrfs_alloc_path();
if (!path) {
btrfs_free_and_pin_reserved_extent(root, ins->objectid,
- root->leafsize);
+ root->nodesize);
return -ENOMEM;
}
@@ -7086,7 +7107,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
ins, size);
if (ret) {
btrfs_free_and_pin_reserved_extent(root, ins->objectid,
- root->leafsize);
+ root->nodesize);
btrfs_free_path(path);
return ret;
}
@@ -7101,7 +7122,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
if (skinny_metadata) {
iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
- num_bytes = root->leafsize;
+ num_bytes = root->nodesize;
} else {
block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
btrfs_set_tree_block_key(leaf, block_info, key);
@@ -7131,14 +7152,14 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
return ret;
}
- ret = update_block_group(root, ins->objectid, root->leafsize, 1);
+ ret = update_block_group(root, ins->objectid, root->nodesize, 1);
if (ret) { /* -ENOENT, logic error */
btrfs_err(fs_info, "update block group failed for %llu %llu",
ins->objectid, ins->offset);
BUG();
}
- trace_btrfs_reserved_extent_alloc(root, ins->objectid, root->leafsize);
+ trace_btrfs_reserved_extent_alloc(root, ins->objectid, root->nodesize);
return ret;
}
@@ -7213,17 +7234,19 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
btrfs_set_buffer_uptodate(buf);
if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
+ buf->log_index = root->log_transid % 2;
/*
* we allow two log transactions at a time, use different
* EXENT bit to differentiate dirty pages.
*/
- if (root->log_transid % 2 == 0)
+ if (buf->log_index == 0)
set_extent_dirty(&root->dirty_log_pages, buf->start,
buf->start + buf->len - 1, GFP_NOFS);
else
set_extent_new(&root->dirty_log_pages, buf->start,
buf->start + buf->len - 1, GFP_NOFS);
} else {
+ buf->log_index = -1;
set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
buf->start + buf->len - 1, GFP_NOFS);
}
@@ -7300,8 +7323,8 @@ static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
*
* returns the tree buffer or NULL.
*/
-struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, u32 blocksize,
+struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
u64 parent, u64 root_objectid,
struct btrfs_disk_key *key, int level,
u64 hint, u64 empty_size)
@@ -7311,18 +7334,18 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
struct extent_buffer *buf;
u64 flags = 0;
int ret;
+ u32 blocksize = root->nodesize;
bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
SKINNY_METADATA);
-#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
- if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state))) {
+ if (btrfs_test_is_dummy_root(root)) {
buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr,
blocksize, level);
if (!IS_ERR(buf))
root->alloc_bytenr += blocksize;
return buf;
}
-#endif
+
block_rsv = use_block_rsv(trans, root, blocksize);
if (IS_ERR(block_rsv))
return ERR_CAST(block_rsv);
@@ -7417,7 +7440,7 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
eb = path->nodes[wc->level];
nritems = btrfs_header_nritems(eb);
- blocksize = btrfs_level_size(root, wc->level - 1);
+ blocksize = root->nodesize;
for (slot = path->slots[wc->level]; slot < nritems; slot++) {
if (nread >= wc->reada_count)
@@ -7464,10 +7487,7 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
continue;
}
reada:
- ret = readahead_tree_block(root, bytenr, blocksize,
- generation);
- if (ret)
- break;
+ readahead_tree_block(root, bytenr, blocksize);
nread++;
}
wc->reada_slot = slot;
@@ -7626,7 +7646,6 @@ walk_down:
level = root_level;
while (level >= 0) {
if (path->nodes[level] == NULL) {
- int child_bsize = root->nodesize;
int parent_slot;
u64 child_gen;
u64 child_bytenr;
@@ -7638,8 +7657,7 @@ walk_down:
child_bytenr = btrfs_node_blockptr(eb, parent_slot);
child_gen = btrfs_node_ptr_generation(eb, parent_slot);
- eb = read_tree_block(root, child_bytenr, child_bsize,
- child_gen);
+ eb = read_tree_block(root, child_bytenr, child_gen);
if (!eb || !extent_buffer_uptodate(eb)) {
ret = -EIO;
goto out;
@@ -7655,7 +7673,7 @@ walk_down:
ret = btrfs_qgroup_record_ref(trans, root->fs_info,
root->objectid,
child_bytenr,
- child_bsize,
+ root->nodesize,
BTRFS_QGROUP_OPER_SUB_SUBTREE,
0);
if (ret)
@@ -7806,9 +7824,9 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
}
bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
- blocksize = btrfs_level_size(root, level - 1);
+ blocksize = root->nodesize;
- next = btrfs_find_tree_block(root, bytenr, blocksize);
+ next = btrfs_find_tree_block(root, bytenr);
if (!next) {
next = btrfs_find_create_tree_block(root, bytenr, blocksize);
if (!next)
@@ -7870,7 +7888,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
if (!next) {
if (reada && level == 1)
reada_walk_down(trans, root, wc, path);
- next = read_tree_block(root, bytenr, blocksize, generation);
+ next = read_tree_block(root, bytenr, generation);
if (!next || !extent_buffer_uptodate(next)) {
free_extent_buffer(next);
return -EIO;
@@ -8853,6 +8871,16 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
}
up_write(&info->commit_root_sem);
+ spin_lock(&info->unused_bgs_lock);
+ while (!list_empty(&info->unused_bgs)) {
+ block_group = list_first_entry(&info->unused_bgs,
+ struct btrfs_block_group_cache,
+ bg_list);
+ list_del_init(&block_group->bg_list);
+ btrfs_put_block_group(block_group);
+ }
+ spin_unlock(&info->unused_bgs_lock);
+
spin_lock(&info->block_group_cache_lock);
while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
block_group = rb_entry(n, struct btrfs_block_group_cache,
@@ -8987,7 +9015,7 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
init_rwsem(&cache->data_rwsem);
INIT_LIST_HEAD(&cache->list);
INIT_LIST_HEAD(&cache->cluster_list);
- INIT_LIST_HEAD(&cache->new_bg_list);
+ INIT_LIST_HEAD(&cache->bg_list);
btrfs_init_free_space_ctl(cache);
return cache;
@@ -9009,7 +9037,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
root = info->extent_root;
key.objectid = 0;
key.offset = 0;
- btrfs_set_key_type(&key, BTRFS_BLOCK_GROUP_ITEM_KEY);
+ key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
@@ -9128,8 +9156,18 @@ int btrfs_read_block_groups(struct btrfs_root *root)
__link_block_group(space_info, cache);
set_avail_alloc_bits(root->fs_info, cache->flags);
- if (btrfs_chunk_readonly(root, cache->key.objectid))
+ if (btrfs_chunk_readonly(root, cache->key.objectid)) {
set_block_group_ro(cache, 1);
+ } else if (btrfs_block_group_used(&cache->item) == 0) {
+ spin_lock(&info->unused_bgs_lock);
+ /* Should always be true but just in case. */
+ if (list_empty(&cache->bg_list)) {
+ btrfs_get_block_group(cache);
+ list_add_tail(&cache->bg_list,
+ &info->unused_bgs);
+ }
+ spin_unlock(&info->unused_bgs_lock);
+ }
}
list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) {
@@ -9170,10 +9208,8 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
struct btrfs_key key;
int ret = 0;
- list_for_each_entry_safe(block_group, tmp, &trans->new_bgs,
- new_bg_list) {
- list_del_init(&block_group->new_bg_list);
-
+ list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) {
+ list_del_init(&block_group->bg_list);
if (ret)
continue;
@@ -9259,7 +9295,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
__link_block_group(cache->space_info, cache);
- list_add_tail(&cache->new_bg_list, &trans->new_bgs);
+ list_add_tail(&cache->bg_list, &trans->new_bgs);
set_avail_alloc_bits(extent_root->fs_info, type);
@@ -9413,8 +9449,6 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
memcpy(&key, &block_group->key, sizeof(key));
- btrfs_clear_space_info_full(root->fs_info);
-
btrfs_put_block_group(block_group);
btrfs_put_block_group(block_group);
@@ -9430,6 +9464,101 @@ out:
return ret;
}
+/*
+ * Process the unused_bgs list and remove any that don't have any allocated
+ * space inside of them.
+ */
+void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_block_group_cache *block_group;
+ struct btrfs_space_info *space_info;
+ struct btrfs_root *root = fs_info->extent_root;
+ struct btrfs_trans_handle *trans;
+ int ret = 0;
+
+ if (!fs_info->open)
+ return;
+
+ spin_lock(&fs_info->unused_bgs_lock);
+ while (!list_empty(&fs_info->unused_bgs)) {
+ u64 start, end;
+
+ block_group = list_first_entry(&fs_info->unused_bgs,
+ struct btrfs_block_group_cache,
+ bg_list);
+ space_info = block_group->space_info;
+ list_del_init(&block_group->bg_list);
+ if (ret || btrfs_mixed_space_info(space_info)) {
+ btrfs_put_block_group(block_group);
+ continue;
+ }
+ spin_unlock(&fs_info->unused_bgs_lock);
+
+ /* Don't want to race with allocators so take the groups_sem */
+ down_write(&space_info->groups_sem);
+ spin_lock(&block_group->lock);
+ if (block_group->reserved ||
+ btrfs_block_group_used(&block_group->item) ||
+ block_group->ro) {
+ /*
+ * We want to bail if we made new allocations or have
+ * outstanding allocations in this block group. We do
+ * the ro check in case balance is currently acting on
+ * this block group.
+ */
+ spin_unlock(&block_group->lock);
+ up_write(&space_info->groups_sem);
+ goto next;
+ }
+ spin_unlock(&block_group->lock);
+
+ /* We don't want to force the issue, only flip if it's ok. */
+ ret = set_block_group_ro(block_group, 0);
+ up_write(&space_info->groups_sem);
+ if (ret < 0) {
+ ret = 0;
+ goto next;
+ }
+
+ /*
+ * Want to do this before we do anything else so we can recover
+ * properly if we fail to join the transaction.
+ */
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans)) {
+ btrfs_set_block_group_rw(root, block_group);
+ ret = PTR_ERR(trans);
+ goto next;
+ }
+
+ /*
+ * We could have pending pinned extents for this block group,
+ * just delete them, we don't care about them anymore.
+ */
+ start = block_group->key.objectid;
+ end = start + block_group->key.offset - 1;
+ clear_extent_bits(&fs_info->freed_extents[0], start, end,
+ EXTENT_DIRTY, GFP_NOFS);
+ clear_extent_bits(&fs_info->freed_extents[1], start, end,
+ EXTENT_DIRTY, GFP_NOFS);
+
+ /* Reset pinned so btrfs_put_block_group doesn't complain */
+ block_group->pinned = 0;
+
+ /*
+ * Btrfs_remove_chunk will abort the transaction if things go
+ * horribly wrong.
+ */
+ ret = btrfs_remove_chunk(trans, root,
+ block_group->key.objectid);
+ btrfs_end_transaction(trans, root);
+next:
+ btrfs_put_block_group(block_group);
+ spin_lock(&fs_info->unused_bgs_lock);
+ }
+ spin_unlock(&fs_info->unused_bgs_lock);
+}
+
int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
{
struct btrfs_space_info *space_info;
@@ -9561,7 +9690,7 @@ void btrfs_end_nocow_write(struct btrfs_root *root)
int btrfs_start_nocow_write(struct btrfs_root *root)
{
- if (unlikely(atomic_read(&root->will_be_snapshoted)))
+ if (atomic_read(&root->will_be_snapshoted))
return 0;
percpu_counter_inc(&root->subv_writers->counter);
@@ -9569,7 +9698,7 @@ int btrfs_start_nocow_write(struct btrfs_root *root)
* Make sure counter is updated before we check for snapshot creation.
*/
smp_mb();
- if (unlikely(atomic_read(&root->will_be_snapshoted))) {
+ if (atomic_read(&root->will_be_snapshoted)) {
btrfs_end_nocow_write(root);
return 0;
}
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index af0359d..bf3f424 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -25,6 +25,11 @@ static struct kmem_cache *extent_state_cache;
static struct kmem_cache *extent_buffer_cache;
static struct bio_set *btrfs_bioset;
+static inline bool extent_state_in_tree(const struct extent_state *state)
+{
+ return !RB_EMPTY_NODE(&state->rb_node);
+}
+
#ifdef CONFIG_BTRFS_DEBUG
static LIST_HEAD(buffers);
static LIST_HEAD(states);
@@ -59,9 +64,9 @@ void btrfs_leak_debug_check(void)
while (!list_empty(&states)) {
state = list_entry(states.next, struct extent_state, leak_list);
- printk(KERN_ERR "BTRFS: state leak: start %llu end %llu "
- "state %lu in tree %p refs %d\n",
- state->start, state->end, state->state, state->tree,
+ pr_err("BTRFS: state leak: start %llu end %llu state %lu in tree %d refs %d\n",
+ state->start, state->end, state->state,
+ extent_state_in_tree(state),
atomic_read(&state->refs));
list_del(&state->leak_list);
kmem_cache_free(extent_state_cache, state);
@@ -209,7 +214,7 @@ static struct extent_state *alloc_extent_state(gfp_t mask)
return state;
state->state = 0;
state->private = 0;
- state->tree = NULL;
+ RB_CLEAR_NODE(&state->rb_node);
btrfs_leak_debug_add(&state->leak_list, &states);
atomic_set(&state->refs, 1);
init_waitqueue_head(&state->wq);
@@ -222,7 +227,7 @@ void free_extent_state(struct extent_state *state)
if (!state)
return;
if (atomic_dec_and_test(&state->refs)) {
- WARN_ON(state->tree);
+ WARN_ON(extent_state_in_tree(state));
btrfs_leak_debug_del(&state->leak_list);
trace_free_extent_state(state, _RET_IP_);
kmem_cache_free(extent_state_cache, state);
@@ -371,8 +376,8 @@ static void merge_state(struct extent_io_tree *tree,
other->state == state->state) {
merge_cb(tree, state, other);
state->start = other->start;
- other->tree = NULL;
rb_erase(&other->rb_node, &tree->state);
+ RB_CLEAR_NODE(&other->rb_node);
free_extent_state(other);
}
}
@@ -383,8 +388,8 @@ static void merge_state(struct extent_io_tree *tree,
other->state == state->state) {
merge_cb(tree, state, other);
state->end = other->end;
- other->tree = NULL;
rb_erase(&other->rb_node, &tree->state);
+ RB_CLEAR_NODE(&other->rb_node);
free_extent_state(other);
}
}
@@ -442,7 +447,6 @@ static int insert_state(struct extent_io_tree *tree,
found->start, found->end, start, end);
return -EEXIST;
}
- state->tree = tree;
merge_state(tree, state);
return 0;
}
@@ -486,7 +490,6 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
free_extent_state(prealloc);
return -EEXIST;
}
- prealloc->tree = tree;
return 0;
}
@@ -524,9 +527,9 @@ static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
wake_up(&state->wq);
if (state->state == 0) {
next = next_state(state);
- if (state->tree) {
+ if (extent_state_in_tree(state)) {
rb_erase(&state->rb_node, &tree->state);
- state->tree = NULL;
+ RB_CLEAR_NODE(&state->rb_node);
free_extent_state(state);
} else {
WARN_ON(1);
@@ -606,8 +609,8 @@ again:
cached_state = NULL;
}
- if (cached && cached->tree && cached->start <= start &&
- cached->end > start) {
+ if (cached && extent_state_in_tree(cached) &&
+ cached->start <= start && cached->end > start) {
if (clear)
atomic_dec(&cached->refs);
state = cached;
@@ -843,7 +846,7 @@ again:
if (cached_state && *cached_state) {
state = *cached_state;
if (state->start <= start && state->end > start &&
- state->tree) {
+ extent_state_in_tree(state)) {
node = &state->rb_node;
goto hit_next;
}
@@ -1069,7 +1072,7 @@ again:
if (cached_state && *cached_state) {
state = *cached_state;
if (state->start <= start && state->end > start &&
- state->tree) {
+ extent_state_in_tree(state)) {
node = &state->rb_node;
goto hit_next;
}
@@ -1459,7 +1462,7 @@ int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
spin_lock(&tree->lock);
if (cached_state && *cached_state) {
state = *cached_state;
- if (state->end == start - 1 && state->tree) {
+ if (state->end == start - 1 && extent_state_in_tree(state)) {
n = rb_next(&state->rb_node);
while (n) {
state = rb_entry(n, struct extent_state,
@@ -1905,7 +1908,7 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
int bitset = 0;
spin_lock(&tree->lock);
- if (cached && cached->tree && cached->start <= start &&
+ if (cached && extent_state_in_tree(cached) && cached->start <= start &&
cached->end > start)
node = &cached->rb_node;
else
@@ -1959,27 +1962,7 @@ static void check_page_uptodate(struct extent_io_tree *tree, struct page *page)
SetPageUptodate(page);
}
-/*
- * When IO fails, either with EIO or csum verification fails, we
- * try other mirrors that might have a good copy of the data. This
- * io_failure_record is used to record state as we go through all the
- * mirrors. If another mirror has good data, the page is set up to date
- * and things continue. If a good mirror can't be found, the original
- * bio end_io callback is called to indicate things have failed.
- */
-struct io_failure_record {
- struct page *page;
- u64 start;
- u64 len;
- u64 logical;
- unsigned long bio_flags;
- int this_mirror;
- int failed_mirror;
- int in_validation;
-};
-
-static int free_io_failure(struct inode *inode, struct io_failure_record *rec,
- int did_repair)
+int free_io_failure(struct inode *inode, struct io_failure_record *rec)
{
int ret;
int err = 0;
@@ -2012,10 +1995,10 @@ static int free_io_failure(struct inode *inode, struct io_failure_record *rec,
* currently, there can be no more than two copies of every data bit. thus,
* exactly one rewrite is required.
*/
-int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,
- u64 length, u64 logical, struct page *page,
- int mirror_num)
+int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical,
+ struct page *page, unsigned int pg_offset, int mirror_num)
{
+ struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
struct bio *bio;
struct btrfs_device *dev;
u64 map_length = 0;
@@ -2053,7 +2036,7 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,
return -EIO;
}
bio->bi_bdev = dev->bdev;
- bio_add_page(bio, page, length, start - page_offset(page));
+ bio_add_page(bio, page, length, pg_offset);
if (btrfsic_submit_bio_wait(WRITE_SYNC, bio)) {
/* try to remap that extent elsewhere? */
@@ -2063,10 +2046,9 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,
}
printk_ratelimited_in_rcu(KERN_INFO
- "BTRFS: read error corrected: ino %lu off %llu "
- "(dev %s sector %llu)\n", page->mapping->host->i_ino,
- start, rcu_str_deref(dev->name), sector);
-
+ "BTRFS: read error corrected: ino %llu off %llu (dev %s sector %llu)\n",
+ btrfs_ino(inode), start,
+ rcu_str_deref(dev->name), sector);
bio_put(bio);
return 0;
}
@@ -2082,9 +2064,11 @@ int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb,
return -EROFS;
for (i = 0; i < num_pages; i++) {
- struct page *p = extent_buffer_page(eb, i);
- ret = repair_io_failure(root->fs_info, start, PAGE_CACHE_SIZE,
- start, p, mirror_num);
+ struct page *p = eb->pages[i];
+
+ ret = repair_io_failure(root->fs_info->btree_inode, start,
+ PAGE_CACHE_SIZE, start, p,
+ start - page_offset(p), mirror_num);
if (ret)
break;
start += PAGE_CACHE_SIZE;
@@ -2097,16 +2081,15 @@ int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb,
* each time an IO finishes, we do a fast check in the IO failure tree
* to see if we need to process or clean up an io_failure_record
*/
-static int clean_io_failure(u64 start, struct page *page)
+int clean_io_failure(struct inode *inode, u64 start, struct page *page,
+ unsigned int pg_offset)
{
u64 private;
u64 private_failure;
struct io_failure_record *failrec;
- struct inode *inode = page->mapping->host;
struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
struct extent_state *state;
int num_copies;
- int did_repair = 0;
int ret;
private = 0;
@@ -2127,7 +2110,6 @@ static int clean_io_failure(u64 start, struct page *page)
/* there was no real error, just free the record */
pr_debug("clean_io_failure: freeing dummy error at %llu\n",
failrec->start);
- did_repair = 1;
goto out;
}
if (fs_info->sb->s_flags & MS_RDONLY)
@@ -2144,55 +2126,70 @@ static int clean_io_failure(u64 start, struct page *page)
num_copies = btrfs_num_copies(fs_info, failrec->logical,
failrec->len);
if (num_copies > 1) {
- ret = repair_io_failure(fs_info, start, failrec->len,
- failrec->logical, page,
- failrec->failed_mirror);
- did_repair = !ret;
+ repair_io_failure(inode, start, failrec->len,
+ failrec->logical, page,
+ pg_offset, failrec->failed_mirror);
}
- ret = 0;
}
out:
- if (!ret)
- ret = free_io_failure(inode, failrec, did_repair);
+ free_io_failure(inode, failrec);
- return ret;
+ return 0;
}
/*
- * this is a generic handler for readpage errors (default
- * readpage_io_failed_hook). if other copies exist, read those and write back
- * good data to the failed position. does not investigate in remapping the
- * failed extent elsewhere, hoping the device will be smart enough to do this as
- * needed
+ * Can be called when
+ * - hold extent lock
+ * - under ordered extent
+ * - the inode is freeing
*/
+void btrfs_free_io_failure_record(struct inode *inode, u64 start, u64 end)
+{
+ struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
+ struct io_failure_record *failrec;
+ struct extent_state *state, *next;
-static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
- struct page *page, u64 start, u64 end,
- int failed_mirror)
+ if (RB_EMPTY_ROOT(&failure_tree->state))
+ return;
+
+ spin_lock(&failure_tree->lock);
+ state = find_first_extent_bit_state(failure_tree, start, EXTENT_DIRTY);
+ while (state) {
+ if (state->start > end)
+ break;
+
+ ASSERT(state->end <= end);
+
+ next = next_state(state);
+
+ failrec = (struct io_failure_record *)state->private;
+ free_extent_state(state);
+ kfree(failrec);
+
+ state = next;
+ }
+ spin_unlock(&failure_tree->lock);
+}
+
+int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end,
+ struct io_failure_record **failrec_ret)
{
- struct io_failure_record *failrec = NULL;
+ struct io_failure_record *failrec;
u64 private;
struct extent_map *em;
- struct inode *inode = page->mapping->host;
struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
- struct bio *bio;
- struct btrfs_io_bio *btrfs_failed_bio;
- struct btrfs_io_bio *btrfs_bio;
- int num_copies;
int ret;
- int read_mode;
u64 logical;
- BUG_ON(failed_bio->bi_rw & REQ_WRITE);
-
ret = get_state_private(failure_tree, start, &private);
if (ret) {
failrec = kzalloc(sizeof(*failrec), GFP_NOFS);
if (!failrec)
return -ENOMEM;
+
failrec->start = start;
failrec->len = end - start + 1;
failrec->this_mirror = 0;
@@ -2212,11 +2209,11 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
em = NULL;
}
read_unlock(&em_tree->lock);
-
if (!em) {
kfree(failrec);
return -EIO;
}
+
logical = start - em->start;
logical = em->block_start + logical;
if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
@@ -2225,8 +2222,10 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
extent_set_compress_type(&failrec->bio_flags,
em->compress_type);
}
- pr_debug("bio_readpage_error: (new) logical=%llu, start=%llu, "
- "len=%llu\n", logical, start, failrec->len);
+
+ pr_debug("Get IO Failure Record: (new) logical=%llu, start=%llu, len=%llu\n",
+ logical, start, failrec->len);
+
failrec->logical = logical;
free_extent_map(em);
@@ -2246,8 +2245,7 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
}
} else {
failrec = (struct io_failure_record *)(unsigned long)private;
- pr_debug("bio_readpage_error: (found) logical=%llu, "
- "start=%llu, len=%llu, validation=%d\n",
+ pr_debug("Get IO Failure Record: (found) logical=%llu, start=%llu, len=%llu, validation=%d\n",
failrec->logical, failrec->start, failrec->len,
failrec->in_validation);
/*
@@ -2256,6 +2254,17 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
* clean_io_failure() clean all those errors at once.
*/
}
+
+ *failrec_ret = failrec;
+
+ return 0;
+}
+
+int btrfs_check_repairable(struct inode *inode, struct bio *failed_bio,
+ struct io_failure_record *failrec, int failed_mirror)
+{
+ int num_copies;
+
num_copies = btrfs_num_copies(BTRFS_I(inode)->root->fs_info,
failrec->logical, failrec->len);
if (num_copies == 1) {
@@ -2264,10 +2273,9 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
* all the retry and error correction code that follows. no
* matter what the error is, it is very likely to persist.
*/
- pr_debug("bio_readpage_error: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d\n",
+ pr_debug("Check Repairable: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d\n",
num_copies, failrec->this_mirror, failed_mirror);
- free_io_failure(inode, failrec, 0);
- return -EIO;
+ return 0;
}
/*
@@ -2287,7 +2295,6 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
BUG_ON(failrec->in_validation);
failrec->in_validation = 1;
failrec->this_mirror = failed_mirror;
- read_mode = READ_SYNC | REQ_FAILFAST_DEV;
} else {
/*
* we're ready to fulfill a) and b) alongside. get a good copy
@@ -2303,25 +2310,36 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
failrec->this_mirror++;
if (failrec->this_mirror == failed_mirror)
failrec->this_mirror++;
- read_mode = READ_SYNC;
}
if (failrec->this_mirror > num_copies) {
- pr_debug("bio_readpage_error: (fail) num_copies=%d, next_mirror %d, failed_mirror %d\n",
+ pr_debug("Check Repairable: (fail) num_copies=%d, next_mirror %d, failed_mirror %d\n",
num_copies, failrec->this_mirror, failed_mirror);
- free_io_failure(inode, failrec, 0);
- return -EIO;
+ return 0;
}
+ return 1;
+}
+
+
+struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio,
+ struct io_failure_record *failrec,
+ struct page *page, int pg_offset, int icsum,
+ bio_end_io_t *endio_func, void *data)
+{
+ struct bio *bio;
+ struct btrfs_io_bio *btrfs_failed_bio;
+ struct btrfs_io_bio *btrfs_bio;
+
bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
- if (!bio) {
- free_io_failure(inode, failrec, 0);
- return -EIO;
- }
- bio->bi_end_io = failed_bio->bi_end_io;
+ if (!bio)
+ return NULL;
+
+ bio->bi_end_io = endio_func;
bio->bi_iter.bi_sector = failrec->logical >> 9;
bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
bio->bi_iter.bi_size = 0;
+ bio->bi_private = data;
btrfs_failed_bio = btrfs_io_bio(failed_bio);
if (btrfs_failed_bio->csum) {
@@ -2330,21 +2348,73 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
btrfs_bio = btrfs_io_bio(bio);
btrfs_bio->csum = btrfs_bio->csum_inline;
- phy_offset >>= inode->i_sb->s_blocksize_bits;
- phy_offset *= csum_size;
- memcpy(btrfs_bio->csum, btrfs_failed_bio->csum + phy_offset,
+ icsum *= csum_size;
+ memcpy(btrfs_bio->csum, btrfs_failed_bio->csum + icsum,
csum_size);
}
- bio_add_page(bio, page, failrec->len, start - page_offset(page));
+ bio_add_page(bio, page, failrec->len, pg_offset);
+
+ return bio;
+}
+
+/*
+ * this is a generic handler for readpage errors (default
+ * readpage_io_failed_hook). if other copies exist, read those and write back
+ * good data to the failed position. does not investigate in remapping the
+ * failed extent elsewhere, hoping the device will be smart enough to do this as
+ * needed
+ */
+
+static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
+ struct page *page, u64 start, u64 end,
+ int failed_mirror)
+{
+ struct io_failure_record *failrec;
+ struct inode *inode = page->mapping->host;
+ struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
+ struct bio *bio;
+ int read_mode;
+ int ret;
+
+ BUG_ON(failed_bio->bi_rw & REQ_WRITE);
+
+ ret = btrfs_get_io_failure_record(inode, start, end, &failrec);
+ if (ret)
+ return ret;
+
+ ret = btrfs_check_repairable(inode, failed_bio, failrec, failed_mirror);
+ if (!ret) {
+ free_io_failure(inode, failrec);
+ return -EIO;
+ }
+
+ if (failed_bio->bi_vcnt > 1)
+ read_mode = READ_SYNC | REQ_FAILFAST_DEV;
+ else
+ read_mode = READ_SYNC;
+
+ phy_offset >>= inode->i_sb->s_blocksize_bits;
+ bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page,
+ start - page_offset(page),
+ (int)phy_offset, failed_bio->bi_end_io,
+ NULL);
+ if (!bio) {
+ free_io_failure(inode, failrec);
+ return -EIO;
+ }
- pr_debug("bio_readpage_error: submitting new read[%#x] to "
- "this_mirror=%d, num_copies=%d, in_validation=%d\n", read_mode,
- failrec->this_mirror, num_copies, failrec->in_validation);
+ pr_debug("Repair Read Error: submitting new read[%#x] to this_mirror=%d, in_validation=%d\n",
+ read_mode, failrec->this_mirror, failrec->in_validation);
ret = tree->ops->submit_bio_hook(inode, read_mode, bio,
failrec->this_mirror,
failrec->bio_flags, 0);
+ if (ret) {
+ free_io_failure(inode, failrec);
+ bio_put(bio);
+ }
+
return ret;
}
@@ -2469,7 +2539,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
struct inode *inode = page->mapping->host;
pr_debug("end_bio_extent_readpage: bi_sector=%llu, err=%d, "
- "mirror=%lu\n", (u64)bio->bi_iter.bi_sector, err,
+ "mirror=%u\n", (u64)bio->bi_iter.bi_sector, err,
io_bio->mirror_num);
tree = &BTRFS_I(inode)->io_tree;
@@ -2503,7 +2573,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
if (ret)
uptodate = 0;
else
- clean_io_failure(start, page);
+ clean_io_failure(inode, start, page, 0);
}
if (likely(uptodate))
@@ -2540,12 +2610,12 @@ readpage_ok:
if (likely(uptodate)) {
loff_t i_size = i_size_read(inode);
pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
- unsigned offset;
+ unsigned off;
/* Zero out the end if this page straddles i_size */
- offset = i_size & (PAGE_CACHE_SIZE-1);
- if (page->index == end_index && offset)
- zero_user_segment(page, offset, PAGE_CACHE_SIZE);
+ off = i_size & (PAGE_CACHE_SIZE-1);
+ if (page->index == end_index && off)
+ zero_user_segment(page, off, PAGE_CACHE_SIZE);
SetPageUptodate(page);
} else {
ClearPageUptodate(page);
@@ -2618,9 +2688,18 @@ btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask)
{
- return bio_clone_bioset(bio, gfp_mask, btrfs_bioset);
-}
+ struct btrfs_io_bio *btrfs_bio;
+ struct bio *new;
+ new = bio_clone_bioset(bio, gfp_mask, btrfs_bioset);
+ if (new) {
+ btrfs_bio = btrfs_io_bio(new);
+ btrfs_bio->csum = NULL;
+ btrfs_bio->csum_allocated = NULL;
+ btrfs_bio->end_io = NULL;
+ }
+ return new;
+}
/* this also allocates from the btrfs_bioset */
struct bio *btrfs_io_bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs)
@@ -3501,7 +3580,7 @@ lock_extent_buffer_for_io(struct extent_buffer *eb,
num_pages = num_extent_pages(eb->start, eb->len);
for (i = 0; i < num_pages; i++) {
- struct page *p = extent_buffer_page(eb, i);
+ struct page *p = eb->pages[i];
if (!trylock_page(p)) {
if (!flush) {
@@ -3522,6 +3601,68 @@ static void end_extent_buffer_writeback(struct extent_buffer *eb)
wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK);
}
+static void set_btree_ioerr(struct page *page)
+{
+ struct extent_buffer *eb = (struct extent_buffer *)page->private;
+ struct btrfs_inode *btree_ino = BTRFS_I(eb->fs_info->btree_inode);
+
+ SetPageError(page);
+ if (test_and_set_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags))
+ return;
+
+ /*
+ * If writeback for a btree extent that doesn't belong to a log tree
+ * failed, increment the counter transaction->eb_write_errors.
+ * We do this because while the transaction is running and before it's
+ * committing (when we call filemap_fdata[write|wait]_range against
+ * the btree inode), we might have
+ * btree_inode->i_mapping->a_ops->writepages() called by the VM - if it
+ * returns an error or an error happens during writeback, when we're
+ * committing the transaction we wouldn't know about it, since the pages
+ * can be no longer dirty nor marked anymore for writeback (if a
+ * subsequent modification to the extent buffer didn't happen before the
+ * transaction commit), which makes filemap_fdata[write|wait]_range not
+ * able to find the pages tagged with SetPageError at transaction
+ * commit time. So if this happens we must abort the transaction,
+ * otherwise we commit a super block with btree roots that point to
+ * btree nodes/leafs whose content on disk is invalid - either garbage
+ * or the content of some node/leaf from a past generation that got
+ * cowed or deleted and is no longer valid.
+ *
+ * Note: setting AS_EIO/AS_ENOSPC in the btree inode's i_mapping would
+ * not be enough - we need to distinguish between log tree extents vs
+ * non-log tree extents, and the next filemap_fdatawait_range() call
+ * will catch and clear such errors in the mapping - and that call might
+ * be from a log sync and not from a transaction commit. Also, checking
+ * for the eb flag EXTENT_BUFFER_WRITE_ERR at transaction commit time is
+ * not done and would not be reliable - the eb might have been released
+ * from memory and reading it back again means that flag would not be
+ * set (since it's a runtime flag, not persisted on disk).
+ *
+ * Using the flags below in the btree inode also makes us achieve the
+ * goal of AS_EIO/AS_ENOSPC when writepages() returns success, started
+ * writeback for all dirty pages and before filemap_fdatawait_range()
+ * is called, the writeback for all dirty pages had already finished
+ * with errors - because we were not using AS_EIO/AS_ENOSPC,
+ * filemap_fdatawait_range() would return success, as it could not know
+ * that writeback errors happened (the pages were no longer tagged for
+ * writeback).
+ */
+ switch (eb->log_index) {
+ case -1:
+ set_bit(BTRFS_INODE_BTREE_ERR, &btree_ino->runtime_flags);
+ break;
+ case 0:
+ set_bit(BTRFS_INODE_BTREE_LOG1_ERR, &btree_ino->runtime_flags);
+ break;
+ case 1:
+ set_bit(BTRFS_INODE_BTREE_LOG2_ERR, &btree_ino->runtime_flags);
+ break;
+ default:
+ BUG(); /* unexpected, logic error */
+ }
+}
+
static void end_bio_extent_buffer_writepage(struct bio *bio, int err)
{
struct bio_vec *bvec;
@@ -3535,10 +3676,9 @@ static void end_bio_extent_buffer_writepage(struct bio *bio, int err)
BUG_ON(!eb);
done = atomic_dec_and_test(&eb->io_pages);
- if (err || test_bit(EXTENT_BUFFER_IOERR, &eb->bflags)) {
- set_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
+ if (err || test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) {
ClearPageUptodate(page);
- SetPageError(page);
+ set_btree_ioerr(page);
}
end_page_writeback(page);
@@ -3565,14 +3705,14 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
int rw = (epd->sync_io ? WRITE_SYNC : WRITE) | REQ_META;
int ret = 0;
- clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
+ clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags);
num_pages = num_extent_pages(eb->start, eb->len);
atomic_set(&eb->io_pages, num_pages);
if (btrfs_header_owner(eb) == BTRFS_TREE_LOG_OBJECTID)
bio_flags = EXTENT_BIO_TREE_LOG;
for (i = 0; i < num_pages; i++) {
- struct page *p = extent_buffer_page(eb, i);
+ struct page *p = eb->pages[i];
clear_page_dirty_for_io(p);
set_page_writeback(p);
@@ -3582,8 +3722,8 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
0, epd->bio_flags, bio_flags);
epd->bio_flags = bio_flags;
if (ret) {
- set_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
- SetPageError(p);
+ set_btree_ioerr(p);
+ end_page_writeback(p);
if (atomic_sub_and_test(num_pages - i, &eb->io_pages))
end_extent_buffer_writeback(eb);
ret = -EIO;
@@ -3596,7 +3736,8 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
if (unlikely(ret)) {
for (; i < num_pages; i++) {
- struct page *p = extent_buffer_page(eb, i);
+ struct page *p = eb->pages[i];
+ clear_page_dirty_for_io(p);
unlock_page(p);
}
}
@@ -4166,19 +4307,6 @@ static struct extent_map *get_extent_skip_holes(struct inode *inode,
return NULL;
}
-static noinline int count_ext_ref(u64 inum, u64 offset, u64 root_id, void *ctx)
-{
- unsigned long cnt = *((unsigned long *)ctx);
-
- cnt++;
- *((unsigned long *)ctx) = cnt;
-
- /* Now we're sure that the extent is shared. */
- if (cnt > 1)
- return 1;
- return 0;
-}
-
int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
__u64 start, __u64 len, get_extent_t *get_extent)
{
@@ -4195,6 +4323,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
struct extent_map *em = NULL;
struct extent_state *cached_state = NULL;
struct btrfs_path *path;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
int end = 0;
u64 em_start = 0;
u64 em_len = 0;
@@ -4215,8 +4344,8 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
* lookup the last file extent. We're not using i_size here
* because there might be preallocation past i_size
*/
- ret = btrfs_lookup_file_extent(NULL, BTRFS_I(inode)->root,
- path, btrfs_ino(inode), -1, 0);
+ ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode), -1,
+ 0);
if (ret < 0) {
btrfs_free_path(path);
return ret;
@@ -4224,7 +4353,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
WARN_ON(!ret);
path->slots[0]--;
btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
- found_type = btrfs_key_type(&found_key);
+ found_type = found_key.type;
/* No extents, but there might be delalloc bits */
if (found_key.objectid != btrfs_ino(inode) ||
@@ -4309,25 +4438,27 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
} else if (em->block_start == EXTENT_MAP_DELALLOC) {
flags |= (FIEMAP_EXTENT_DELALLOC |
FIEMAP_EXTENT_UNKNOWN);
- } else {
- unsigned long ref_cnt = 0;
+ } else if (fieinfo->fi_extents_max) {
+ u64 bytenr = em->block_start -
+ (em->start - em->orig_start);
disko = em->block_start + offset_in_extent;
/*
* As btrfs supports shared space, this information
* can be exported to userspace tools via
- * flag FIEMAP_EXTENT_SHARED.
+ * flag FIEMAP_EXTENT_SHARED. If fi_extents_max == 0
+ * then we're just getting a count and we can skip the
+ * lookup stuff.
*/
- ret = iterate_inodes_from_logical(
- em->block_start,
- BTRFS_I(inode)->root->fs_info,
- path, count_ext_ref, &ref_cnt);
- if (ret < 0 && ret != -ENOENT)
+ ret = btrfs_check_shared(NULL, root->fs_info,
+ root->objectid,
+ btrfs_ino(inode), bytenr);
+ if (ret < 0)
goto out_free;
-
- if (ref_cnt > 1)
+ if (ret)
flags |= FIEMAP_EXTENT_SHARED;
+ ret = 0;
}
if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
flags |= FIEMAP_EXTENT_ENCODED;
@@ -4381,24 +4512,21 @@ int extent_buffer_under_io(struct extent_buffer *eb)
/*
* Helper for releasing extent buffer page.
*/
-static void btrfs_release_extent_buffer_page(struct extent_buffer *eb,
- unsigned long start_idx)
+static void btrfs_release_extent_buffer_page(struct extent_buffer *eb)
{
unsigned long index;
- unsigned long num_pages;
struct page *page;
int mapped = !test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags);
BUG_ON(extent_buffer_under_io(eb));
- num_pages = num_extent_pages(eb->start, eb->len);
- index = start_idx + num_pages;
- if (start_idx >= index)
+ index = num_extent_pages(eb->start, eb->len);
+ if (index == 0)
return;
do {
index--;
- page = extent_buffer_page(eb, index);
+ page = eb->pages[index];
if (page && mapped) {
spin_lock(&page->mapping->private_lock);
/*
@@ -4429,7 +4557,7 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb,
/* One for when we alloced the page */
page_cache_release(page);
}
- } while (index != start_idx);
+ } while (index != 0);
}
/*
@@ -4437,7 +4565,7 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb,
*/
static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
{
- btrfs_release_extent_buffer_page(eb, 0);
+ btrfs_release_extent_buffer_page(eb);
__free_extent_buffer(eb);
}
@@ -4580,7 +4708,8 @@ static void mark_extent_buffer_accessed(struct extent_buffer *eb,
num_pages = num_extent_pages(eb->start, eb->len);
for (i = 0; i < num_pages; i++) {
- struct page *p = extent_buffer_page(eb, i);
+ struct page *p = eb->pages[i];
+
if (p != accessed)
mark_page_accessed(p);
}
@@ -4749,7 +4878,7 @@ again:
*/
SetPageChecked(eb->pages[0]);
for (i = 1; i < num_pages; i++) {
- p = extent_buffer_page(eb, i);
+ p = eb->pages[i];
ClearPageChecked(p);
unlock_page(p);
}
@@ -4794,7 +4923,7 @@ static int release_extent_buffer(struct extent_buffer *eb)
}
/* Should be safe to release our pages at this point */
- btrfs_release_extent_buffer_page(eb, 0);
+ btrfs_release_extent_buffer_page(eb);
call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
return 1;
}
@@ -4860,7 +4989,7 @@ void clear_extent_buffer_dirty(struct extent_buffer *eb)
num_pages = num_extent_pages(eb->start, eb->len);
for (i = 0; i < num_pages; i++) {
- page = extent_buffer_page(eb, i);
+ page = eb->pages[i];
if (!PageDirty(page))
continue;
@@ -4896,7 +5025,7 @@ int set_extent_buffer_dirty(struct extent_buffer *eb)
WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags));
for (i = 0; i < num_pages; i++)
- set_page_dirty(extent_buffer_page(eb, i));
+ set_page_dirty(eb->pages[i]);
return was_dirty;
}
@@ -4909,7 +5038,7 @@ int clear_extent_buffer_uptodate(struct extent_buffer *eb)
clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
num_pages = num_extent_pages(eb->start, eb->len);
for (i = 0; i < num_pages; i++) {
- page = extent_buffer_page(eb, i);
+ page = eb->pages[i];
if (page)
ClearPageUptodate(page);
}
@@ -4925,7 +5054,7 @@ int set_extent_buffer_uptodate(struct extent_buffer *eb)
set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
num_pages = num_extent_pages(eb->start, eb->len);
for (i = 0; i < num_pages; i++) {
- page = extent_buffer_page(eb, i);
+ page = eb->pages[i];
SetPageUptodate(page);
}
return 0;
@@ -4965,7 +5094,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
num_pages = num_extent_pages(eb->start, eb->len);
for (i = start_i; i < num_pages; i++) {
- page = extent_buffer_page(eb, i);
+ page = eb->pages[i];
if (wait == WAIT_NONE) {
if (!trylock_page(page))
goto unlock_exit;
@@ -4984,11 +5113,11 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
goto unlock_exit;
}
- clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
+ clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
eb->read_mirror = 0;
atomic_set(&eb->io_pages, num_reads);
for (i = start_i; i < num_pages; i++) {
- page = extent_buffer_page(eb, i);
+ page = eb->pages[i];
if (!PageUptodate(page)) {
ClearPageError(page);
err = __extent_read_full_page(tree, page,
@@ -5013,7 +5142,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
return ret;
for (i = start_i; i < num_pages; i++) {
- page = extent_buffer_page(eb, i);
+ page = eb->pages[i];
wait_on_page_locked(page);
if (!PageUptodate(page))
ret = -EIO;
@@ -5024,7 +5153,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
unlock_exit:
i = start_i;
while (locked_pages > 0) {
- page = extent_buffer_page(eb, i);
+ page = eb->pages[i];
i++;
unlock_page(page);
locked_pages--;
@@ -5050,7 +5179,7 @@ void read_extent_buffer(struct extent_buffer *eb, void *dstv,
offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1);
while (len > 0) {
- page = extent_buffer_page(eb, i);
+ page = eb->pages[i];
cur = min(len, (PAGE_CACHE_SIZE - offset));
kaddr = page_address(page);
@@ -5082,7 +5211,7 @@ int read_extent_buffer_to_user(struct extent_buffer *eb, void __user *dstv,
offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1);
while (len > 0) {
- page = extent_buffer_page(eb, i);
+ page = eb->pages[i];
cur = min(len, (PAGE_CACHE_SIZE - offset));
kaddr = page_address(page);
@@ -5131,7 +5260,7 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
return -EINVAL;
}
- p = extent_buffer_page(eb, i);
+ p = eb->pages[i];
kaddr = page_address(p);
*map = kaddr + offset;
*map_len = PAGE_CACHE_SIZE - offset;
@@ -5157,7 +5286,7 @@ int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1);
while (len > 0) {
- page = extent_buffer_page(eb, i);
+ page = eb->pages[i];
cur = min(len, (PAGE_CACHE_SIZE - offset));
@@ -5191,7 +5320,7 @@ void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1);
while (len > 0) {
- page = extent_buffer_page(eb, i);
+ page = eb->pages[i];
WARN_ON(!PageUptodate(page));
cur = min(len, PAGE_CACHE_SIZE - offset);
@@ -5221,7 +5350,7 @@ void memset_extent_buffer(struct extent_buffer *eb, char c,
offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1);
while (len > 0) {
- page = extent_buffer_page(eb, i);
+ page = eb->pages[i];
WARN_ON(!PageUptodate(page));
cur = min(len, PAGE_CACHE_SIZE - offset);
@@ -5252,7 +5381,7 @@ void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
(PAGE_CACHE_SIZE - 1);
while (len > 0) {
- page = extent_buffer_page(dst, i);
+ page = dst->pages[i];
WARN_ON(!PageUptodate(page));
cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset));
@@ -5330,8 +5459,7 @@ void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
cur = min_t(unsigned long, cur,
(unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page));
- copy_pages(extent_buffer_page(dst, dst_i),
- extent_buffer_page(dst, src_i),
+ copy_pages(dst->pages[dst_i], dst->pages[src_i],
dst_off_in_page, src_off_in_page, cur);
src_offset += cur;
@@ -5377,8 +5505,7 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
cur = min_t(unsigned long, len, src_off_in_page + 1);
cur = min(cur, dst_off_in_page + 1);
- copy_pages(extent_buffer_page(dst, dst_i),
- extent_buffer_page(dst, src_i),
+ copy_pages(dst->pages[dst_i], dst->pages[src_i],
dst_off_in_page - cur + 1,
src_off_in_page - cur + 1, cur);
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index ccc264e..6d4b938 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -11,8 +11,6 @@
#define EXTENT_NEW (1 << 4)
#define EXTENT_DELALLOC (1 << 5)
#define EXTENT_DEFRAG (1 << 6)
-#define EXTENT_DEFRAG_DONE (1 << 7)
-#define EXTENT_BUFFER_FILLED (1 << 8)
#define EXTENT_BOUNDARY (1 << 9)
#define EXTENT_NODATASUM (1 << 10)
#define EXTENT_DO_ACCOUNTING (1 << 11)
@@ -34,16 +32,16 @@
/* these are bit numbers for test/set bit */
#define EXTENT_BUFFER_UPTODATE 0
-#define EXTENT_BUFFER_BLOCKING 1
#define EXTENT_BUFFER_DIRTY 2
#define EXTENT_BUFFER_CORRUPT 3
#define EXTENT_BUFFER_READAHEAD 4 /* this got triggered by readahead */
#define EXTENT_BUFFER_TREE_REF 5
#define EXTENT_BUFFER_STALE 6
#define EXTENT_BUFFER_WRITEBACK 7
-#define EXTENT_BUFFER_IOERR 8
+#define EXTENT_BUFFER_READ_ERR 8 /* read IO error */
#define EXTENT_BUFFER_DUMMY 9
#define EXTENT_BUFFER_IN_TREE 10
+#define EXTENT_BUFFER_WRITE_ERR 11 /* write IO error */
/* these are flags for extent_clear_unlock_delalloc */
#define PAGE_UNLOCK (1 << 0)
@@ -57,7 +55,6 @@
* map has page->private set to one.
*/
#define EXTENT_PAGE_PRIVATE 1
-#define EXTENT_PAGE_PRIVATE_FIRST_PAGE 3
struct extent_state;
struct btrfs_root;
@@ -108,7 +105,6 @@ struct extent_state {
struct rb_node rb_node;
/* ADD NEW ELEMENTS AFTER THIS */
- struct extent_io_tree *tree;
wait_queue_head_t wq;
atomic_t refs;
unsigned long state;
@@ -126,8 +122,6 @@ struct extent_state {
struct extent_buffer {
u64 start;
unsigned long len;
- unsigned long map_start;
- unsigned long map_len;
unsigned long bflags;
struct btrfs_fs_info *fs_info;
spinlock_t refs_lock;
@@ -144,7 +138,9 @@ struct extent_buffer {
atomic_t blocking_readers;
atomic_t spinning_readers;
atomic_t spinning_writers;
- int lock_nested;
+ short lock_nested;
+ /* >= 0 if eb belongs to a log tree, -1 otherwise */
+ short log_index;
/* protects write locks */
rwlock_t lock;
@@ -286,12 +282,6 @@ static inline unsigned long num_extent_pages(u64 start, u64 len)
(start >> PAGE_CACHE_SHIFT);
}
-static inline struct page *extent_buffer_page(struct extent_buffer *eb,
- unsigned long i)
-{
- return eb->pages[i];
-}
-
static inline void extent_buffer_get(struct extent_buffer *eb)
{
atomic_inc(&eb->refs);
@@ -341,18 +331,50 @@ struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask);
struct btrfs_fs_info;
-int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,
- u64 length, u64 logical, struct page *page,
- int mirror_num);
+int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical,
+ struct page *page, unsigned int pg_offset,
+ int mirror_num);
+int clean_io_failure(struct inode *inode, u64 start, struct page *page,
+ unsigned int pg_offset);
int end_extent_writepage(struct page *page, int err, u64 start, u64 end);
int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb,
int mirror_num);
+
+/*
+ * When IO fails, either with EIO or csum verification fails, we
+ * try other mirrors that might have a good copy of the data. This
+ * io_failure_record is used to record state as we go through all the
+ * mirrors. If another mirror has good data, the page is set up to date
+ * and things continue. If a good mirror can't be found, the original
+ * bio end_io callback is called to indicate things have failed.
+ */
+struct io_failure_record {
+ struct page *page;
+ u64 start;
+ u64 len;
+ u64 logical;
+ unsigned long bio_flags;
+ int this_mirror;
+ int failed_mirror;
+ int in_validation;
+};
+
+void btrfs_free_io_failure_record(struct inode *inode, u64 start, u64 end);
+int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end,
+ struct io_failure_record **failrec_ret);
+int btrfs_check_repairable(struct inode *inode, struct bio *failed_bio,
+ struct io_failure_record *failrec, int fail_mirror);
+struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio,
+ struct io_failure_record *failrec,
+ struct page *page, int pg_offset, int icsum,
+ bio_end_io_t *endio_func, void *data);
+int free_io_failure(struct inode *inode, struct io_failure_record *rec);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
noinline u64 find_lock_delalloc_range(struct inode *inode,
struct extent_io_tree *tree,
struct page *locked_page, u64 *start,
u64 *end, u64 max_bytes);
+#endif
struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
u64 start, unsigned long len);
#endif
-#endif
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 54c84da..783a943 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -55,7 +55,7 @@ int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
return -ENOMEM;
file_key.objectid = objectid;
file_key.offset = pos;
- btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY);
+ file_key.type = BTRFS_EXTENT_DATA_KEY;
path->leave_spinning = 1;
ret = btrfs_insert_empty_item(trans, root, path, &file_key,
@@ -100,7 +100,7 @@ btrfs_lookup_csum(struct btrfs_trans_handle *trans,
file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
file_key.offset = bytenr;
- btrfs_set_key_type(&file_key, BTRFS_EXTENT_CSUM_KEY);
+ file_key.type = BTRFS_EXTENT_CSUM_KEY;
ret = btrfs_search_slot(trans, root, &file_key, path, 0, cow);
if (ret < 0)
goto fail;
@@ -111,7 +111,7 @@ btrfs_lookup_csum(struct btrfs_trans_handle *trans,
goto fail;
path->slots[0]--;
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
- if (btrfs_key_type(&found_key) != BTRFS_EXTENT_CSUM_KEY)
+ if (found_key.type != BTRFS_EXTENT_CSUM_KEY)
goto fail;
csum_offset = (bytenr - found_key.offset) >>
@@ -148,7 +148,7 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
file_key.objectid = objectid;
file_key.offset = offset;
- btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY);
+ file_key.type = BTRFS_EXTENT_DATA_KEY;
ret = btrfs_search_slot(trans, root, &file_key, path, ins_len, cow);
return ret;
}
@@ -299,19 +299,9 @@ int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
}
int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode,
- struct btrfs_dio_private *dip, struct bio *bio,
- u64 offset)
+ struct bio *bio, u64 offset)
{
- int len = (bio->bi_iter.bi_sector << 9) - dip->disk_bytenr;
- u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
- int ret;
-
- len >>= inode->i_sb->s_blocksize_bits;
- len *= csum_size;
-
- ret = __btrfs_lookup_bio_sums(root, inode, bio, offset,
- (u32 *)(dip->csum + len), 1);
- return ret;
+ return __btrfs_lookup_bio_sums(root, inode, bio, offset, NULL, 1);
}
int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
@@ -329,8 +319,8 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
u64 csum_end;
u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
- ASSERT(start == ALIGN(start, root->sectorsize) &&
- (end + 1) == ALIGN(end + 1, root->sectorsize));
+ ASSERT(IS_ALIGNED(start, root->sectorsize) &&
+ IS_ALIGNED(end + 1, root->sectorsize));
path = btrfs_alloc_path();
if (!path)
@@ -720,7 +710,7 @@ again:
bytenr = sums->bytenr + total_bytes;
file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
file_key.offset = bytenr;
- btrfs_set_key_type(&file_key, BTRFS_EXTENT_CSUM_KEY);
+ file_key.type = BTRFS_EXTENT_CSUM_KEY;
item = btrfs_lookup_csum(trans, root, path, bytenr, 1);
if (!IS_ERR(item)) {
@@ -790,7 +780,7 @@ again:
csum_offset = (bytenr - found_key.offset) >>
root->fs_info->sb->s_blocksize_bits;
- if (btrfs_key_type(&found_key) != BTRFS_EXTENT_CSUM_KEY ||
+ if (found_key.type != BTRFS_EXTENT_CSUM_KEY ||
found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
csum_offset >= MAX_CSUM_ITEMS(root, csum_size)) {
goto insert;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index ff1cc03..a18ceab 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -299,7 +299,7 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
/* get the inode */
key.objectid = defrag->root;
- btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
+ key.type = BTRFS_ROOT_ITEM_KEY;
key.offset = (u64)-1;
index = srcu_read_lock(&fs_info->subvol_srcu);
@@ -311,7 +311,7 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
}
key.objectid = defrag->ino;
- btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
+ key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL);
if (IS_ERR(inode)) {
@@ -452,7 +452,7 @@ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
if (unlikely(copied == 0))
break;
- if (unlikely(copied < PAGE_CACHE_SIZE - offset)) {
+ if (copied < PAGE_CACHE_SIZE - offset) {
offset += copied;
} else {
pg++;
@@ -1481,9 +1481,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
bool force_page_uptodate = false;
bool need_unlock;
- nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) /
- PAGE_CACHE_SIZE, PAGE_CACHE_SIZE /
- (sizeof(struct page *)));
+ nrptrs = min(DIV_ROUND_UP(iov_iter_count(i), PAGE_CACHE_SIZE),
+ PAGE_CACHE_SIZE / (sizeof(struct page *)));
nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied);
nrptrs = max(nrptrs, 8);
pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
@@ -1497,8 +1496,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
size_t write_bytes = min(iov_iter_count(i),
nrptrs * (size_t)PAGE_CACHE_SIZE -
offset);
- size_t num_pages = (write_bytes + offset +
- PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ size_t num_pages = DIV_ROUND_UP(write_bytes + offset,
+ PAGE_CACHE_SIZE);
size_t reserve_bytes;
size_t dirty_pages;
size_t copied;
@@ -1526,9 +1525,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
* our prealloc extent may be smaller than
* write_bytes, so scale down.
*/
- num_pages = (write_bytes + offset +
- PAGE_CACHE_SIZE - 1) >>
- PAGE_CACHE_SHIFT;
+ num_pages = DIV_ROUND_UP(write_bytes + offset,
+ PAGE_CACHE_SIZE);
reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
ret = 0;
} else {
@@ -1590,9 +1588,8 @@ again:
dirty_pages = 0;
} else {
force_page_uptodate = false;
- dirty_pages = (copied + offset +
- PAGE_CACHE_SIZE - 1) >>
- PAGE_CACHE_SHIFT;
+ dirty_pages = DIV_ROUND_UP(copied + offset,
+ PAGE_CACHE_SIZE);
}
/*
@@ -1653,7 +1650,7 @@ again:
cond_resched();
balance_dirty_pages_ratelimited(inode->i_mapping);
- if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
+ if (dirty_pages < (root->nodesize >> PAGE_CACHE_SHIFT) + 1)
btrfs_btree_balance_dirty(root);
pos += copied;
@@ -1795,7 +1792,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
if (sync)
atomic_inc(&BTRFS_I(inode)->sync_writers);
- if (unlikely(file->f_flags & O_DIRECT)) {
+ if (file->f_flags & O_DIRECT) {
num_written = __btrfs_direct_write(iocb, from, pos);
} else {
num_written = __btrfs_buffered_write(file, from, pos);
@@ -1852,6 +1849,20 @@ int btrfs_release_file(struct inode *inode, struct file *filp)
return 0;
}
+static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end)
+{
+ int ret;
+
+ atomic_inc(&BTRFS_I(inode)->sync_writers);
+ ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
+ if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+ &BTRFS_I(inode)->runtime_flags))
+ ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
+ atomic_dec(&BTRFS_I(inode)->sync_writers);
+
+ return ret;
+}
+
/*
* fsync call for both files and directories. This logs the inode into
* the tree log instead of forcing full commits whenever possible.
@@ -1881,30 +1892,64 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
* multi-task, and make the performance up. See
* btrfs_wait_ordered_range for an explanation of the ASYNC check.
*/
- atomic_inc(&BTRFS_I(inode)->sync_writers);
- ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
- if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
- &BTRFS_I(inode)->runtime_flags))
- ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
- atomic_dec(&BTRFS_I(inode)->sync_writers);
+ ret = start_ordered_ops(inode, start, end);
if (ret)
return ret;
mutex_lock(&inode->i_mutex);
-
- /*
- * We flush the dirty pages again to avoid some dirty pages in the
- * range being left.
- */
atomic_inc(&root->log_batch);
full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
&BTRFS_I(inode)->runtime_flags);
+ /*
+ * We might have have had more pages made dirty after calling
+ * start_ordered_ops and before acquiring the inode's i_mutex.
+ */
if (full_sync) {
+ /*
+ * For a full sync, we need to make sure any ordered operations
+ * start and finish before we start logging the inode, so that
+ * all extents are persisted and the respective file extent
+ * items are in the fs/subvol btree.
+ */
ret = btrfs_wait_ordered_range(inode, start, end - start + 1);
- if (ret) {
- mutex_unlock(&inode->i_mutex);
- goto out;
- }
+ } else {
+ /*
+ * Start any new ordered operations before starting to log the
+ * inode. We will wait for them to finish in btrfs_sync_log().
+ *
+ * Right before acquiring the inode's mutex, we might have new
+ * writes dirtying pages, which won't immediately start the
+ * respective ordered operations - that is done through the
+ * fill_delalloc callbacks invoked from the writepage and
+ * writepages address space operations. So make sure we start
+ * all ordered operations before starting to log our inode. Not
+ * doing this means that while logging the inode, writeback
+ * could start and invoke writepage/writepages, which would call
+ * the fill_delalloc callbacks (cow_file_range,
+ * submit_compressed_extents). These callbacks add first an
+ * extent map to the modified list of extents and then create
+ * the respective ordered operation, which means in
+ * tree-log.c:btrfs_log_inode() we might capture all existing
+ * ordered operations (with btrfs_get_logged_extents()) before
+ * the fill_delalloc callback adds its ordered operation, and by
+ * the time we visit the modified list of extent maps (with
+ * btrfs_log_changed_extents()), we see and process the extent
+ * map they created. We then use the extent map to construct a
+ * file extent item for logging without waiting for the
+ * respective ordered operation to finish - this file extent
+ * item points to a disk location that might not have yet been
+ * written to, containing random data - so after a crash a log
+ * replay will make our inode have file extent items that point
+ * to disk locations containing invalid data, as we returned
+ * success to userspace without waiting for the respective
+ * ordered operation to finish, because it wasn't captured by
+ * btrfs_get_logged_extents().
+ */
+ ret = start_ordered_ops(inode, start, end);
+ }
+ if (ret) {
+ mutex_unlock(&inode->i_mutex);
+ goto out;
}
atomic_inc(&root->log_batch);
@@ -1984,6 +2029,25 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
*/
mutex_unlock(&inode->i_mutex);
+ /*
+ * If any of the ordered extents had an error, just return it to user
+ * space, so that the application knows some writes didn't succeed and
+ * can take proper action (retry for e.g.). Blindly committing the
+ * transaction in this case, would fool userspace that everything was
+ * successful. And we also want to make sure our log doesn't contain
+ * file extent items pointing to extents that weren't fully written to -
+ * just like in the non fast fsync path, where we check for the ordered
+ * operation's error flag before writing to the log tree and return -EIO
+ * if any of them had this flag set (btrfs_wait_ordered_range) -
+ * therefore we need to check for errors in the ordered operations,
+ * which are indicated by ctx.io_err.
+ */
+ if (ctx.io_err) {
+ btrfs_end_transaction(trans, root);
+ ret = ctx.io_err;
+ goto out;
+ }
+
if (ret != BTRFS_NO_LOG_SYNC) {
if (!ret) {
ret = btrfs_sync_log(trans, root, &ctx);
@@ -2621,23 +2685,28 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int whence)
struct btrfs_root *root = BTRFS_I(inode)->root;
struct extent_map *em = NULL;
struct extent_state *cached_state = NULL;
- u64 lockstart = *offset;
- u64 lockend = i_size_read(inode);
- u64 start = *offset;
- u64 len = i_size_read(inode);
+ u64 lockstart;
+ u64 lockend;
+ u64 start;
+ u64 len;
int ret = 0;
- lockend = max_t(u64, root->sectorsize, lockend);
+ if (inode->i_size == 0)
+ return -ENXIO;
+
+ /*
+ * *offset can be negative, in this case we start finding DATA/HOLE from
+ * the very start of the file.
+ */
+ start = max_t(loff_t, 0, *offset);
+
+ lockstart = round_down(start, root->sectorsize);
+ lockend = round_up(i_size_read(inode), root->sectorsize);
if (lockend <= lockstart)
lockend = lockstart + root->sectorsize;
-
lockend--;
len = lockend - lockstart + 1;
- len = max_t(u64, len, root->sectorsize);
- if (inode->i_size == 0)
- return -ENXIO;
-
lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, 0,
&cached_state);
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 2b0a627..3384819 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -279,8 +279,7 @@ static int io_ctl_init(struct io_ctl *io_ctl, struct inode *inode,
int num_pages;
int check_crcs = 0;
- num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
- PAGE_CACHE_SHIFT;
+ num_pages = DIV_ROUND_UP(i_size_read(inode), PAGE_CACHE_SIZE);
if (btrfs_ino(inode) != BTRFS_FREE_INO_OBJECTID)
check_crcs = 1;
@@ -1998,6 +1997,128 @@ static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl,
return merged;
}
+static bool steal_from_bitmap_to_end(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *info,
+ bool update_stat)
+{
+ struct btrfs_free_space *bitmap;
+ unsigned long i;
+ unsigned long j;
+ const u64 end = info->offset + info->bytes;
+ const u64 bitmap_offset = offset_to_bitmap(ctl, end);
+ u64 bytes;
+
+ bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0);
+ if (!bitmap)
+ return false;
+
+ i = offset_to_bit(bitmap->offset, ctl->unit, end);
+ j = find_next_zero_bit(bitmap->bitmap, BITS_PER_BITMAP, i);
+ if (j == i)
+ return false;
+ bytes = (j - i) * ctl->unit;
+ info->bytes += bytes;
+
+ if (update_stat)
+ bitmap_clear_bits(ctl, bitmap, end, bytes);
+ else
+ __bitmap_clear_bits(ctl, bitmap, end, bytes);
+
+ if (!bitmap->bytes)
+ free_bitmap(ctl, bitmap);
+
+ return true;
+}
+
+static bool steal_from_bitmap_to_front(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *info,
+ bool update_stat)
+{
+ struct btrfs_free_space *bitmap;
+ u64 bitmap_offset;
+ unsigned long i;
+ unsigned long j;
+ unsigned long prev_j;
+ u64 bytes;
+
+ bitmap_offset = offset_to_bitmap(ctl, info->offset);
+ /* If we're on a boundary, try the previous logical bitmap. */
+ if (bitmap_offset == info->offset) {
+ if (info->offset == 0)
+ return false;
+ bitmap_offset = offset_to_bitmap(ctl, info->offset - 1);
+ }
+
+ bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0);
+ if (!bitmap)
+ return false;
+
+ i = offset_to_bit(bitmap->offset, ctl->unit, info->offset) - 1;
+ j = 0;
+ prev_j = (unsigned long)-1;
+ for_each_clear_bit_from(j, bitmap->bitmap, BITS_PER_BITMAP) {
+ if (j > i)
+ break;
+ prev_j = j;
+ }
+ if (prev_j == i)
+ return false;
+
+ if (prev_j == (unsigned long)-1)
+ bytes = (i + 1) * ctl->unit;
+ else
+ bytes = (i - prev_j) * ctl->unit;
+
+ info->offset -= bytes;
+ info->bytes += bytes;
+
+ if (update_stat)
+ bitmap_clear_bits(ctl, bitmap, info->offset, bytes);
+ else
+ __bitmap_clear_bits(ctl, bitmap, info->offset, bytes);
+
+ if (!bitmap->bytes)
+ free_bitmap(ctl, bitmap);
+
+ return true;
+}
+
+/*
+ * We prefer always to allocate from extent entries, both for clustered and
+ * non-clustered allocation requests. So when attempting to add a new extent
+ * entry, try to see if there's adjacent free space in bitmap entries, and if
+ * there is, migrate that space from the bitmaps to the extent.
+ * Like this we get better chances of satisfying space allocation requests
+ * because we attempt to satisfy them based on a single cache entry, and never
+ * on 2 or more entries - even if the entries represent a contiguous free space
+ * region (e.g. 1 extent entry + 1 bitmap entry starting where the extent entry
+ * ends).
+ */
+static void steal_from_bitmap(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *info,
+ bool update_stat)
+{
+ /*
+ * Only work with disconnected entries, as we can change their offset,
+ * and must be extent entries.
+ */
+ ASSERT(!info->bitmap);
+ ASSERT(RB_EMPTY_NODE(&info->offset_index));
+
+ if (ctl->total_bitmaps > 0) {
+ bool stole_end;
+ bool stole_front = false;
+
+ stole_end = steal_from_bitmap_to_end(ctl, info, update_stat);
+ if (ctl->total_bitmaps > 0)
+ stole_front = steal_from_bitmap_to_front(ctl, info,
+ update_stat);
+
+ if (stole_end || stole_front)
+ try_merge_free_space(ctl, info, update_stat);
+ }
+}
+
int __btrfs_add_free_space(struct btrfs_free_space_ctl *ctl,
u64 offset, u64 bytes)
{
@@ -2010,6 +2131,7 @@ int __btrfs_add_free_space(struct btrfs_free_space_ctl *ctl,
info->offset = offset;
info->bytes = bytes;
+ RB_CLEAR_NODE(&info->offset_index);
spin_lock(&ctl->tree_lock);
@@ -2029,6 +2151,14 @@ int __btrfs_add_free_space(struct btrfs_free_space_ctl *ctl,
goto out;
}
link:
+ /*
+ * Only steal free space from adjacent bitmaps if we're sure we're not
+ * going to add the new free space to existing bitmap entries - because
+ * that would mean unnecessary work that would be reverted. Therefore
+ * attempt to steal space from bitmaps if we're adding an extent entry.
+ */
+ steal_from_bitmap(ctl, info, true);
+
ret = link_free_space(ctl, info);
if (ret)
kmem_cache_free(btrfs_free_space_cachep, info);
@@ -2205,10 +2335,13 @@ __btrfs_return_cluster_to_free_space(
entry = rb_entry(node, struct btrfs_free_space, offset_index);
node = rb_next(&entry->offset_index);
rb_erase(&entry->offset_index, &cluster->root);
+ RB_CLEAR_NODE(&entry->offset_index);
bitmap = (entry->bitmap != NULL);
- if (!bitmap)
+ if (!bitmap) {
try_merge_free_space(ctl, entry, false);
+ steal_from_bitmap(ctl, entry, false);
+ }
tree_insert_offset(&ctl->free_space_offset,
entry->offset, &entry->offset_index, bitmap);
}
@@ -3033,10 +3166,10 @@ struct inode *lookup_free_ino_inode(struct btrfs_root *root,
{
struct inode *inode = NULL;
- spin_lock(&root->cache_lock);
- if (root->cache_inode)
- inode = igrab(root->cache_inode);
- spin_unlock(&root->cache_lock);
+ spin_lock(&root->ino_cache_lock);
+ if (root->ino_cache_inode)
+ inode = igrab(root->ino_cache_inode);
+ spin_unlock(&root->ino_cache_lock);
if (inode)
return inode;
@@ -3044,10 +3177,10 @@ struct inode *lookup_free_ino_inode(struct btrfs_root *root,
if (IS_ERR(inode))
return inode;
- spin_lock(&root->cache_lock);
+ spin_lock(&root->ino_cache_lock);
if (!btrfs_fs_closing(root->fs_info))
- root->cache_inode = igrab(inode);
- spin_unlock(&root->cache_lock);
+ root->ino_cache_inode = igrab(inode);
+ spin_unlock(&root->ino_cache_lock);
return inode;
}
@@ -3176,6 +3309,7 @@ again:
map = NULL;
add_new_bitmap(ctl, info, offset);
bitmap_info = info;
+ info = NULL;
}
bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes);
@@ -3186,6 +3320,8 @@ again:
if (bytes)
goto again;
+ if (info)
+ kmem_cache_free(btrfs_free_space_cachep, info);
if (map)
kfree(map);
return 0;
@@ -3260,6 +3396,7 @@ have_info:
goto have_info;
}
+ ret = 0;
goto out;
}
diff --git a/fs/btrfs/hash.c b/fs/btrfs/hash.c
index 85889aa..aae520b 100644
--- a/fs/btrfs/hash.c
+++ b/fs/btrfs/hash.c
@@ -20,10 +20,8 @@ static struct crypto_shash *tfm;
int __init btrfs_hash_init(void)
{
tfm = crypto_alloc_shash("crc32c", 0, 0);
- if (IS_ERR(tfm))
- return PTR_ERR(tfm);
- return 0;
+ return PTR_ERR_OR_ZERO(tfm);
}
void btrfs_hash_exit(void)
@@ -33,18 +31,16 @@ void btrfs_hash_exit(void)
u32 btrfs_crc32c(u32 crc, const void *address, unsigned int length)
{
- struct {
- struct shash_desc shash;
- char ctx[crypto_shash_descsize(tfm)];
- } desc;
+ SHASH_DESC_ON_STACK(shash, tfm);
+ u32 *ctx = (u32 *)shash_desc_ctx(shash);
int err;
- desc.shash.tfm = tfm;
- desc.shash.flags = 0;
- *(u32 *)desc.ctx = crc;
+ shash->tfm = tfm;
+ shash->flags = 0;
+ *ctx = crc;
- err = crypto_shash_update(&desc.shash, address, length);
+ err = crypto_shash_update(shash, address, length);
BUG_ON(err);
- return *(u32 *)desc.ctx;
+ return *ctx;
}
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 2be38df..8ffa478 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -135,7 +135,7 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans,
u32 item_size;
key.objectid = inode_objectid;
- btrfs_set_key_type(&key, BTRFS_INODE_EXTREF_KEY);
+ key.type = BTRFS_INODE_EXTREF_KEY;
key.offset = btrfs_extref_hash(ref_objectid, name, name_len);
path = btrfs_alloc_path();
@@ -209,7 +209,7 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
key.objectid = inode_objectid;
key.offset = ref_objectid;
- btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY);
+ key.type = BTRFS_INODE_REF_KEY;
path = btrfs_alloc_path();
if (!path)
@@ -337,7 +337,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
key.objectid = inode_objectid;
key.offset = ref_objectid;
- btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY);
+ key.type = BTRFS_INODE_REF_KEY;
path = btrfs_alloc_path();
if (!path)
@@ -400,7 +400,7 @@ int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans,
struct btrfs_key key;
int ret;
key.objectid = objectid;
- btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
+ key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
ret = btrfs_insert_empty_item(trans, root, path, &key,
@@ -420,13 +420,13 @@ int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
struct btrfs_key found_key;
ret = btrfs_search_slot(trans, root, location, path, ins_len, cow);
- if (ret > 0 && btrfs_key_type(location) == BTRFS_ROOT_ITEM_KEY &&
+ if (ret > 0 && location->type == BTRFS_ROOT_ITEM_KEY &&
location->offset == (u64)-1 && path->slots[0] != 0) {
slot = path->slots[0] - 1;
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &found_key, slot);
if (found_key.objectid == location->objectid &&
- btrfs_key_type(&found_key) == btrfs_key_type(location)) {
+ found_key.type == location->type) {
path->slots[0]--;
return 0;
}
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index 888fbe1..83d646b 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -87,7 +87,7 @@ again:
*/
btrfs_item_key_to_cpu(leaf, &key, 0);
btrfs_release_path(path);
- root->cache_progress = last;
+ root->ino_cache_progress = last;
up_read(&fs_info->commit_root_sem);
schedule_timeout(1);
goto again;
@@ -106,7 +106,7 @@ again:
if (last != (u64)-1 && last + 1 != key.objectid) {
__btrfs_add_free_space(ctl, last + 1,
key.objectid - last - 1);
- wake_up(&root->cache_wait);
+ wake_up(&root->ino_cache_wait);
}
last = key.objectid;
@@ -119,14 +119,14 @@ next:
root->highest_objectid - last - 1);
}
- spin_lock(&root->cache_lock);
- root->cached = BTRFS_CACHE_FINISHED;
- spin_unlock(&root->cache_lock);
+ spin_lock(&root->ino_cache_lock);
+ root->ino_cache_state = BTRFS_CACHE_FINISHED;
+ spin_unlock(&root->ino_cache_lock);
- root->cache_progress = (u64)-1;
+ root->ino_cache_progress = (u64)-1;
btrfs_unpin_free_ino(root);
out:
- wake_up(&root->cache_wait);
+ wake_up(&root->ino_cache_wait);
up_read(&fs_info->commit_root_sem);
btrfs_free_path(path);
@@ -144,20 +144,20 @@ static void start_caching(struct btrfs_root *root)
if (!btrfs_test_opt(root, INODE_MAP_CACHE))
return;
- spin_lock(&root->cache_lock);
- if (root->cached != BTRFS_CACHE_NO) {
- spin_unlock(&root->cache_lock);
+ spin_lock(&root->ino_cache_lock);
+ if (root->ino_cache_state != BTRFS_CACHE_NO) {
+ spin_unlock(&root->ino_cache_lock);
return;
}
- root->cached = BTRFS_CACHE_STARTED;
- spin_unlock(&root->cache_lock);
+ root->ino_cache_state = BTRFS_CACHE_STARTED;
+ spin_unlock(&root->ino_cache_lock);
ret = load_free_ino_cache(root->fs_info, root);
if (ret == 1) {
- spin_lock(&root->cache_lock);
- root->cached = BTRFS_CACHE_FINISHED;
- spin_unlock(&root->cache_lock);
+ spin_lock(&root->ino_cache_lock);
+ root->ino_cache_state = BTRFS_CACHE_FINISHED;
+ spin_unlock(&root->ino_cache_lock);
return;
}
@@ -196,11 +196,11 @@ again:
start_caching(root);
- wait_event(root->cache_wait,
- root->cached == BTRFS_CACHE_FINISHED ||
+ wait_event(root->ino_cache_wait,
+ root->ino_cache_state == BTRFS_CACHE_FINISHED ||
root->free_ino_ctl->free_space > 0);
- if (root->cached == BTRFS_CACHE_FINISHED &&
+ if (root->ino_cache_state == BTRFS_CACHE_FINISHED &&
root->free_ino_ctl->free_space == 0)
return -ENOSPC;
else
@@ -214,17 +214,17 @@ void btrfs_return_ino(struct btrfs_root *root, u64 objectid)
if (!btrfs_test_opt(root, INODE_MAP_CACHE))
return;
again:
- if (root->cached == BTRFS_CACHE_FINISHED) {
+ if (root->ino_cache_state == BTRFS_CACHE_FINISHED) {
__btrfs_add_free_space(pinned, objectid, 1);
} else {
down_write(&root->fs_info->commit_root_sem);
- spin_lock(&root->cache_lock);
- if (root->cached == BTRFS_CACHE_FINISHED) {
- spin_unlock(&root->cache_lock);
+ spin_lock(&root->ino_cache_lock);
+ if (root->ino_cache_state == BTRFS_CACHE_FINISHED) {
+ spin_unlock(&root->ino_cache_lock);
up_write(&root->fs_info->commit_root_sem);
goto again;
}
- spin_unlock(&root->cache_lock);
+ spin_unlock(&root->ino_cache_lock);
start_caching(root);
@@ -235,10 +235,10 @@ again:
}
/*
- * When a transaction is committed, we'll move those inode numbers which
- * are smaller than root->cache_progress from pinned tree to free_ino tree,
- * and others will just be dropped, because the commit root we were
- * searching has changed.
+ * When a transaction is committed, we'll move those inode numbers which are
+ * smaller than root->ino_cache_progress from pinned tree to free_ino tree, and
+ * others will just be dropped, because the commit root we were searching has
+ * changed.
*
* Must be called with root->fs_info->commit_root_sem held
*/
@@ -261,10 +261,10 @@ void btrfs_unpin_free_ino(struct btrfs_root *root)
info = rb_entry(n, struct btrfs_free_space, offset_index);
BUG_ON(info->bitmap); /* Logic error */
- if (info->offset > root->cache_progress)
+ if (info->offset > root->ino_cache_progress)
goto free;
- else if (info->offset + info->bytes > root->cache_progress)
- count = root->cache_progress - info->offset + 1;
+ else if (info->offset + info->bytes > root->ino_cache_progress)
+ count = root->ino_cache_progress - info->offset + 1;
else
count = info->bytes;
@@ -462,13 +462,13 @@ again:
}
}
- spin_lock(&root->cache_lock);
- if (root->cached != BTRFS_CACHE_FINISHED) {
+ spin_lock(&root->ino_cache_lock);
+ if (root->ino_cache_state != BTRFS_CACHE_FINISHED) {
ret = -1;
- spin_unlock(&root->cache_lock);
+ spin_unlock(&root->ino_cache_lock);
goto out_put;
}
- spin_unlock(&root->cache_lock);
+ spin_unlock(&root->ino_cache_lock);
spin_lock(&ctl->tree_lock);
prealloc = sizeof(struct btrfs_free_space) * ctl->free_extents;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 016c403..fc9c043 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -153,7 +153,7 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
key.objectid = btrfs_ino(inode);
key.offset = start;
- btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
+ key.type = BTRFS_EXTENT_DATA_KEY;
datasize = btrfs_file_extent_calc_inline_size(cur_size);
path->leave_spinning = 1;
@@ -249,8 +249,8 @@ static noinline int cow_file_range_inline(struct btrfs_root *root,
data_len = compressed_size;
if (start > 0 ||
- actual_end >= PAGE_CACHE_SIZE ||
- data_len >= BTRFS_MAX_INLINE_DATA_SIZE(root) ||
+ actual_end > PAGE_CACHE_SIZE ||
+ data_len > BTRFS_MAX_INLINE_DATA_SIZE(root) ||
(!compressed_size &&
(actual_end & (root->sectorsize - 1)) == 0) ||
end + 1 < isize ||
@@ -348,6 +348,23 @@ static noinline int add_async_extent(struct async_cow *cow,
return 0;
}
+static inline int inode_need_compress(struct inode *inode)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+
+ /* force compress */
+ if (btrfs_test_opt(root, FORCE_COMPRESS))
+ return 1;
+ /* bad compression ratios */
+ if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS)
+ return 0;
+ if (btrfs_test_opt(root, COMPRESS) ||
+ BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS ||
+ BTRFS_I(inode)->force_compress)
+ return 1;
+ return 0;
+}
+
/*
* we create compressed extents in two phases. The first
* phase compresses a range of pages that have already been
@@ -444,10 +461,7 @@ again:
* inode has not been flagged as nocompress. This flag can
* change at any time if we discover bad compression ratios.
*/
- if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) &&
- (btrfs_test_opt(root, COMPRESS) ||
- (BTRFS_I(inode)->force_compress) ||
- (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS))) {
+ if (inode_need_compress(inode)) {
WARN_ON(pages);
pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
if (!pages) {
@@ -1094,7 +1108,8 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
async_cow->locked_page = locked_page;
async_cow->start = start;
- if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS)
+ if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS &&
+ !btrfs_test_opt(root, FORCE_COMPRESS))
cur_end = end;
else
cur_end = min(end, start + 512 * 1024 - 1);
@@ -1445,6 +1460,26 @@ error:
return ret;
}
+static inline int need_force_cow(struct inode *inode, u64 start, u64 end)
+{
+
+ if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
+ !(BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC))
+ return 0;
+
+ /*
+ * @defrag_bytes is a hint value, no spinlock held here,
+ * if is not zero, it means the file is defragging.
+ * Force cow if given extent needs to be defragged.
+ */
+ if (BTRFS_I(inode)->defrag_bytes &&
+ test_range_bit(&BTRFS_I(inode)->io_tree, start, end,
+ EXTENT_DEFRAG, 0, NULL))
+ return 1;
+
+ return 0;
+}
+
/*
* extent_io.c call back to do delayed allocation processing
*/
@@ -1453,17 +1488,15 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
unsigned long *nr_written)
{
int ret;
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ int force_cow = need_force_cow(inode, start, end);
- if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) {
+ if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW && !force_cow) {
ret = run_delalloc_nocow(inode, locked_page, start, end,
page_started, 1, nr_written);
- } else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC) {
+ } else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC && !force_cow) {
ret = run_delalloc_nocow(inode, locked_page, start, end,
page_started, 0, nr_written);
- } else if (!btrfs_test_opt(root, COMPRESS) &&
- !(BTRFS_I(inode)->force_compress) &&
- !(BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS)) {
+ } else if (!inode_need_compress(inode)) {
ret = cow_file_range(inode, locked_page, start, end,
page_started, nr_written, 1);
} else {
@@ -1555,6 +1588,8 @@ static void btrfs_set_bit_hook(struct inode *inode,
struct extent_state *state, unsigned long *bits)
{
+ if ((*bits & EXTENT_DEFRAG) && !(*bits & EXTENT_DELALLOC))
+ WARN_ON(1);
/*
* set_bit and clear bit hooks normally require _irqsave/restore
* but in this case, we are only testing for the DELALLOC
@@ -1577,6 +1612,8 @@ static void btrfs_set_bit_hook(struct inode *inode,
root->fs_info->delalloc_batch);
spin_lock(&BTRFS_I(inode)->lock);
BTRFS_I(inode)->delalloc_bytes += len;
+ if (*bits & EXTENT_DEFRAG)
+ BTRFS_I(inode)->defrag_bytes += len;
if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
&BTRFS_I(inode)->runtime_flags))
btrfs_add_delalloc_inodes(root, inode);
@@ -1591,6 +1628,13 @@ static void btrfs_clear_bit_hook(struct inode *inode,
struct extent_state *state,
unsigned long *bits)
{
+ u64 len = state->end + 1 - state->start;
+
+ spin_lock(&BTRFS_I(inode)->lock);
+ if ((state->state & EXTENT_DEFRAG) && (*bits & EXTENT_DEFRAG))
+ BTRFS_I(inode)->defrag_bytes -= len;
+ spin_unlock(&BTRFS_I(inode)->lock);
+
/*
* set_bit and clear bit hooks normally require _irqsave/restore
* but in this case, we are only testing for the DELALLOC
@@ -1598,7 +1642,6 @@ static void btrfs_clear_bit_hook(struct inode *inode,
*/
if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
struct btrfs_root *root = BTRFS_I(inode)->root;
- u64 len = state->end + 1 - state->start;
bool do_list = !btrfs_is_free_space_inode(inode);
if (*bits & EXTENT_FIRST_DELALLOC) {
@@ -2660,6 +2703,10 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
goto out;
}
+ btrfs_free_io_failure_record(inode, ordered_extent->file_offset,
+ ordered_extent->file_offset +
+ ordered_extent->len - 1);
+
if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) {
truncated = true;
logical_len = ordered_extent->truncated_len;
@@ -2856,6 +2903,40 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
return 0;
}
+static int __readpage_endio_check(struct inode *inode,
+ struct btrfs_io_bio *io_bio,
+ int icsum, struct page *page,
+ int pgoff, u64 start, size_t len)
+{
+ char *kaddr;
+ u32 csum_expected;
+ u32 csum = ~(u32)0;
+ static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
+ DEFAULT_RATELIMIT_BURST);
+
+ csum_expected = *(((u32 *)io_bio->csum) + icsum);
+
+ kaddr = kmap_atomic(page);
+ csum = btrfs_csum_data(kaddr + pgoff, csum, len);
+ btrfs_csum_final(csum, (char *)&csum);
+ if (csum != csum_expected)
+ goto zeroit;
+
+ kunmap_atomic(kaddr);
+ return 0;
+zeroit:
+ if (__ratelimit(&_rs))
+ btrfs_info(BTRFS_I(inode)->root->fs_info,
+ "csum failed ino %llu off %llu csum %u expected csum %u",
+ btrfs_ino(inode), start, csum, csum_expected);
+ memset(kaddr + pgoff, 1, len);
+ flush_dcache_page(page);
+ kunmap_atomic(kaddr);
+ if (csum_expected == 0)
+ return 0;
+ return -EIO;
+}
+
/*
* when reads are done, we need to check csums to verify the data is correct
* if there's a match, we allow the bio to finish. If not, the code in
@@ -2868,20 +2949,15 @@ static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
size_t offset = start - page_offset(page);
struct inode *inode = page->mapping->host;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
- char *kaddr;
struct btrfs_root *root = BTRFS_I(inode)->root;
- u32 csum_expected;
- u32 csum = ~(u32)0;
- static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
- DEFAULT_RATELIMIT_BURST);
if (PageChecked(page)) {
ClearPageChecked(page);
- goto good;
+ return 0;
}
if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
- goto good;
+ return 0;
if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) {
@@ -2891,28 +2967,8 @@ static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
}
phy_offset >>= inode->i_sb->s_blocksize_bits;
- csum_expected = *(((u32 *)io_bio->csum) + phy_offset);
-
- kaddr = kmap_atomic(page);
- csum = btrfs_csum_data(kaddr + offset, csum, end - start + 1);
- btrfs_csum_final(csum, (char *)&csum);
- if (csum != csum_expected)
- goto zeroit;
-
- kunmap_atomic(kaddr);
-good:
- return 0;
-
-zeroit:
- if (__ratelimit(&_rs))
- btrfs_info(root->fs_info, "csum failed ino %llu off %llu csum %u expected csum %u",
- btrfs_ino(page->mapping->host), start, csum, csum_expected);
- memset(kaddr + offset, 1, end - start + 1);
- flush_dcache_page(page);
- kunmap_atomic(kaddr);
- if (csum_expected == 0)
- return 0;
- return -EIO;
+ return __readpage_endio_check(inode, io_bio, phy_offset, page, offset,
+ start, (size_t)(end - start + 1));
}
struct delayed_iput {
@@ -3159,7 +3215,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
path->reada = -1;
key.objectid = BTRFS_ORPHAN_OBJECTID;
- btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
+ key.type = BTRFS_ORPHAN_ITEM_KEY;
key.offset = (u64)-1;
while (1) {
@@ -3186,7 +3242,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
/* make sure the item matches what we want */
if (found_key.objectid != BTRFS_ORPHAN_OBJECTID)
break;
- if (btrfs_key_type(&found_key) != BTRFS_ORPHAN_ITEM_KEY)
+ if (found_key.type != BTRFS_ORPHAN_ITEM_KEY)
break;
/* release the path since we're done with it */
@@ -3662,7 +3718,8 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
* without delay
*/
if (!btrfs_is_free_space_inode(inode)
- && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) {
+ && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
+ && !root->fs_info->log_root_recovering) {
btrfs_update_root_times(trans, root);
ret = btrfs_delayed_update_inode(trans, root, inode);
@@ -4085,7 +4142,7 @@ search_again:
fi = NULL;
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
- found_type = btrfs_key_type(&found_key);
+ found_type = found_key.type;
if (found_key.objectid != ino)
break;
@@ -4747,6 +4804,8 @@ void btrfs_evict_inode(struct inode *inode)
/* do we really want it for ->i_nlink > 0 and zero btrfs_root_refs? */
btrfs_wait_ordered_range(inode, 0, (u64)-1);
+ btrfs_free_io_failure_record(inode, 0, (u64)-1);
+
if (root->fs_info->log_root_recovering) {
BUG_ON(test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
&BTRFS_I(inode)->runtime_flags));
@@ -5331,7 +5390,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
btrfs_get_delayed_items(inode, &ins_list, &del_list);
}
- btrfs_set_key_type(&key, key_type);
+ key.type = key_type;
key.offset = ctx->pos;
key.objectid = btrfs_ino(inode);
@@ -5356,7 +5415,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
if (found_key.objectid != key.objectid)
break;
- if (btrfs_key_type(&found_key) != key_type)
+ if (found_key.type != key_type)
break;
if (found_key.offset < ctx->pos)
goto next;
@@ -5568,7 +5627,7 @@ static int btrfs_set_inode_index_count(struct inode *inode)
int ret;
key.objectid = btrfs_ino(inode);
- btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY);
+ key.type = BTRFS_DIR_INDEX_KEY;
key.offset = (u64)-1;
path = btrfs_alloc_path();
@@ -5600,7 +5659,7 @@ static int btrfs_set_inode_index_count(struct inode *inode)
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
if (found_key.objectid != btrfs_ino(inode) ||
- btrfs_key_type(&found_key) != BTRFS_DIR_INDEX_KEY) {
+ found_key.type != BTRFS_DIR_INDEX_KEY) {
BTRFS_I(inode)->index_cnt = 2;
goto out;
}
@@ -5718,7 +5777,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
key[0].objectid = objectid;
- btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY);
+ key[0].type = BTRFS_INODE_ITEM_KEY;
key[0].offset = 0;
sizes[0] = sizeof(struct btrfs_inode_item);
@@ -5731,7 +5790,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
* add more hard links than can fit in the ref item.
*/
key[1].objectid = objectid;
- btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY);
+ key[1].type = BTRFS_INODE_REF_KEY;
key[1].offset = ref_objectid;
sizes[1] = name_len + sizeof(*ref);
@@ -5740,7 +5799,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
location = &BTRFS_I(inode)->location;
location->objectid = objectid;
location->offset = 0;
- btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY);
+ location->type = BTRFS_INODE_ITEM_KEY;
ret = btrfs_insert_inode_locked(inode);
if (ret < 0)
@@ -5832,7 +5891,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
memcpy(&key, &BTRFS_I(inode)->root->root_key, sizeof(key));
} else {
key.objectid = ino;
- btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
+ key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
}
@@ -6191,21 +6250,60 @@ out_fail_inode:
goto out_fail;
}
+/* Find next extent map of a given extent map, caller needs to ensure locks */
+static struct extent_map *next_extent_map(struct extent_map *em)
+{
+ struct rb_node *next;
+
+ next = rb_next(&em->rb_node);
+ if (!next)
+ return NULL;
+ return container_of(next, struct extent_map, rb_node);
+}
+
+static struct extent_map *prev_extent_map(struct extent_map *em)
+{
+ struct rb_node *prev;
+
+ prev = rb_prev(&em->rb_node);
+ if (!prev)
+ return NULL;
+ return container_of(prev, struct extent_map, rb_node);
+}
+
/* helper for btfs_get_extent. Given an existing extent in the tree,
+ * the existing extent is the nearest extent to map_start,
* and an extent that you want to insert, deal with overlap and insert
- * the new extent into the tree.
+ * the best fitted new extent into the tree.
*/
static int merge_extent_mapping(struct extent_map_tree *em_tree,
struct extent_map *existing,
struct extent_map *em,
u64 map_start)
{
+ struct extent_map *prev;
+ struct extent_map *next;
+ u64 start;
+ u64 end;
u64 start_diff;
BUG_ON(map_start < em->start || map_start >= extent_map_end(em));
- start_diff = map_start - em->start;
- em->start = map_start;
- em->len = existing->start - em->start;
+
+ if (existing->start > map_start) {
+ next = existing;
+ prev = prev_extent_map(next);
+ } else {
+ prev = existing;
+ next = next_extent_map(prev);
+ }
+
+ start = prev ? extent_map_end(prev) : em->start;
+ start = max_t(u64, start, em->start);
+ end = next ? next->start : extent_map_end(em);
+ end = min_t(u64, end, extent_map_end(em));
+ start_diff = start - em->start;
+ em->start = start;
+ em->len = end - start;
if (em->block_start < EXTENT_MAP_LAST_BYTE &&
!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
em->block_start += start_diff;
@@ -6333,7 +6431,7 @@ again:
struct btrfs_file_extent_item);
/* are we inside the extent that was found? */
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
- found_type = btrfs_key_type(&found_key);
+ found_type = found_key.type;
if (found_key.objectid != objectid ||
found_type != BTRFS_EXTENT_DATA_KEY) {
/*
@@ -6482,25 +6580,21 @@ insert:
ret = 0;
- existing = lookup_extent_mapping(em_tree, start, len);
- if (existing && (existing->start > start ||
- existing->start + existing->len <= start)) {
+ existing = search_extent_mapping(em_tree, start, len);
+ /*
+ * existing will always be non-NULL, since there must be
+ * extent causing the -EEXIST.
+ */
+ if (start >= extent_map_end(existing) ||
+ start <= existing->start) {
+ /*
+ * The existing extent map is the one nearest to
+ * the [start, start + len) range which overlaps
+ */
+ err = merge_extent_mapping(em_tree, existing,
+ em, start);
free_extent_map(existing);
- existing = NULL;
- }
- if (!existing) {
- existing = lookup_extent_mapping(em_tree, em->start,
- em->len);
- if (existing) {
- err = merge_extent_mapping(em_tree, existing,
- em, start);
- free_extent_map(existing);
- if (err) {
- free_extent_map(em);
- em = NULL;
- }
- } else {
- err = -EIO;
+ if (err) {
free_extent_map(em);
em = NULL;
}
@@ -7112,8 +7206,10 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
block_start, len,
orig_block_len,
ram_bytes, type);
- if (IS_ERR(em))
+ if (IS_ERR(em)) {
+ ret = PTR_ERR(em);
goto unlock_err;
+ }
}
ret = btrfs_add_ordered_extent_dio(inode, start,
@@ -7188,45 +7284,277 @@ unlock_err:
return ret;
}
-static void btrfs_endio_direct_read(struct bio *bio, int err)
+static inline int submit_dio_repair_bio(struct inode *inode, struct bio *bio,
+ int rw, int mirror_num)
{
- struct btrfs_dio_private *dip = bio->bi_private;
- struct bio_vec *bvec;
- struct inode *inode = dip->inode;
struct btrfs_root *root = BTRFS_I(inode)->root;
- struct bio *dio_bio;
- u32 *csums = (u32 *)dip->csum;
+ int ret;
+
+ BUG_ON(rw & REQ_WRITE);
+
+ bio_get(bio);
+
+ ret = btrfs_bio_wq_end_io(root->fs_info, bio,
+ BTRFS_WQ_ENDIO_DIO_REPAIR);
+ if (ret)
+ goto err;
+
+ ret = btrfs_map_bio(root, rw, bio, mirror_num, 0);
+err:
+ bio_put(bio);
+ return ret;
+}
+
+static int btrfs_check_dio_repairable(struct inode *inode,
+ struct bio *failed_bio,
+ struct io_failure_record *failrec,
+ int failed_mirror)
+{
+ int num_copies;
+
+ num_copies = btrfs_num_copies(BTRFS_I(inode)->root->fs_info,
+ failrec->logical, failrec->len);
+ if (num_copies == 1) {
+ /*
+ * we only have a single copy of the data, so don't bother with
+ * all the retry and error correction code that follows. no
+ * matter what the error is, it is very likely to persist.
+ */
+ pr_debug("Check DIO Repairable: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d\n",
+ num_copies, failrec->this_mirror, failed_mirror);
+ return 0;
+ }
+
+ failrec->failed_mirror = failed_mirror;
+ failrec->this_mirror++;
+ if (failrec->this_mirror == failed_mirror)
+ failrec->this_mirror++;
+
+ if (failrec->this_mirror > num_copies) {
+ pr_debug("Check DIO Repairable: (fail) num_copies=%d, next_mirror %d, failed_mirror %d\n",
+ num_copies, failrec->this_mirror, failed_mirror);
+ return 0;
+ }
+
+ return 1;
+}
+
+static int dio_read_error(struct inode *inode, struct bio *failed_bio,
+ struct page *page, u64 start, u64 end,
+ int failed_mirror, bio_end_io_t *repair_endio,
+ void *repair_arg)
+{
+ struct io_failure_record *failrec;
+ struct bio *bio;
+ int isector;
+ int read_mode;
+ int ret;
+
+ BUG_ON(failed_bio->bi_rw & REQ_WRITE);
+
+ ret = btrfs_get_io_failure_record(inode, start, end, &failrec);
+ if (ret)
+ return ret;
+
+ ret = btrfs_check_dio_repairable(inode, failed_bio, failrec,
+ failed_mirror);
+ if (!ret) {
+ free_io_failure(inode, failrec);
+ return -EIO;
+ }
+
+ if (failed_bio->bi_vcnt > 1)
+ read_mode = READ_SYNC | REQ_FAILFAST_DEV;
+ else
+ read_mode = READ_SYNC;
+
+ isector = start - btrfs_io_bio(failed_bio)->logical;
+ isector >>= inode->i_sb->s_blocksize_bits;
+ bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page,
+ 0, isector, repair_endio, repair_arg);
+ if (!bio) {
+ free_io_failure(inode, failrec);
+ return -EIO;
+ }
+
+ btrfs_debug(BTRFS_I(inode)->root->fs_info,
+ "Repair DIO Read Error: submitting new dio read[%#x] to this_mirror=%d, in_validation=%d\n",
+ read_mode, failrec->this_mirror, failrec->in_validation);
+
+ ret = submit_dio_repair_bio(inode, bio, read_mode,
+ failrec->this_mirror);
+ if (ret) {
+ free_io_failure(inode, failrec);
+ bio_put(bio);
+ }
+
+ return ret;
+}
+
+struct btrfs_retry_complete {
+ struct completion done;
+ struct inode *inode;
+ u64 start;
+ int uptodate;
+};
+
+static void btrfs_retry_endio_nocsum(struct bio *bio, int err)
+{
+ struct btrfs_retry_complete *done = bio->bi_private;
+ struct bio_vec *bvec;
+ int i;
+
+ if (err)
+ goto end;
+
+ done->uptodate = 1;
+ bio_for_each_segment_all(bvec, bio, i)
+ clean_io_failure(done->inode, done->start, bvec->bv_page, 0);
+end:
+ complete(&done->done);
+ bio_put(bio);
+}
+
+static int __btrfs_correct_data_nocsum(struct inode *inode,
+ struct btrfs_io_bio *io_bio)
+{
+ struct bio_vec *bvec;
+ struct btrfs_retry_complete done;
u64 start;
int i;
+ int ret;
+
+ start = io_bio->logical;
+ done.inode = inode;
+
+ bio_for_each_segment_all(bvec, &io_bio->bio, i) {
+try_again:
+ done.uptodate = 0;
+ done.start = start;
+ init_completion(&done.done);
+
+ ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page, start,
+ start + bvec->bv_len - 1,
+ io_bio->mirror_num,
+ btrfs_retry_endio_nocsum, &done);
+ if (ret)
+ return ret;
+
+ wait_for_completion(&done.done);
+
+ if (!done.uptodate) {
+ /* We might have another mirror, so try again */
+ goto try_again;
+ }
+
+ start += bvec->bv_len;
+ }
+
+ return 0;
+}
+
+static void btrfs_retry_endio(struct bio *bio, int err)
+{
+ struct btrfs_retry_complete *done = bio->bi_private;
+ struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
+ struct bio_vec *bvec;
+ int uptodate;
+ int ret;
+ int i;
+
+ if (err)
+ goto end;
- start = dip->logical_offset;
+ uptodate = 1;
bio_for_each_segment_all(bvec, bio, i) {
- if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
- struct page *page = bvec->bv_page;
- char *kaddr;
- u32 csum = ~(u32)0;
- unsigned long flags;
-
- local_irq_save(flags);
- kaddr = kmap_atomic(page);
- csum = btrfs_csum_data(kaddr + bvec->bv_offset,
- csum, bvec->bv_len);
- btrfs_csum_final(csum, (char *)&csum);
- kunmap_atomic(kaddr);
- local_irq_restore(flags);
-
- flush_dcache_page(bvec->bv_page);
- if (csum != csums[i]) {
- btrfs_err(root->fs_info, "csum failed ino %llu off %llu csum %u expected csum %u",
- btrfs_ino(inode), start, csum,
- csums[i]);
- err = -EIO;
- }
+ ret = __readpage_endio_check(done->inode, io_bio, i,
+ bvec->bv_page, 0,
+ done->start, bvec->bv_len);
+ if (!ret)
+ clean_io_failure(done->inode, done->start,
+ bvec->bv_page, 0);
+ else
+ uptodate = 0;
+ }
+
+ done->uptodate = uptodate;
+end:
+ complete(&done->done);
+ bio_put(bio);
+}
+
+static int __btrfs_subio_endio_read(struct inode *inode,
+ struct btrfs_io_bio *io_bio, int err)
+{
+ struct bio_vec *bvec;
+ struct btrfs_retry_complete done;
+ u64 start;
+ u64 offset = 0;
+ int i;
+ int ret;
+
+ err = 0;
+ start = io_bio->logical;
+ done.inode = inode;
+
+ bio_for_each_segment_all(bvec, &io_bio->bio, i) {
+ ret = __readpage_endio_check(inode, io_bio, i, bvec->bv_page,
+ 0, start, bvec->bv_len);
+ if (likely(!ret))
+ goto next;
+try_again:
+ done.uptodate = 0;
+ done.start = start;
+ init_completion(&done.done);
+
+ ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page, start,
+ start + bvec->bv_len - 1,
+ io_bio->mirror_num,
+ btrfs_retry_endio, &done);
+ if (ret) {
+ err = ret;
+ goto next;
}
+ wait_for_completion(&done.done);
+
+ if (!done.uptodate) {
+ /* We might have another mirror, so try again */
+ goto try_again;
+ }
+next:
+ offset += bvec->bv_len;
start += bvec->bv_len;
}
+ return err;
+}
+
+static int btrfs_subio_endio_read(struct inode *inode,
+ struct btrfs_io_bio *io_bio, int err)
+{
+ bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
+
+ if (skip_csum) {
+ if (unlikely(err))
+ return __btrfs_correct_data_nocsum(inode, io_bio);
+ else
+ return 0;
+ } else {
+ return __btrfs_subio_endio_read(inode, io_bio, err);
+ }
+}
+
+static void btrfs_endio_direct_read(struct bio *bio, int err)
+{
+ struct btrfs_dio_private *dip = bio->bi_private;
+ struct inode *inode = dip->inode;
+ struct bio *dio_bio;
+ struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
+
+ if (dip->flags & BTRFS_DIO_ORIG_BIO_SUBMITTED)
+ err = btrfs_subio_endio_read(inode, io_bio, err);
+
unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset,
dip->logical_offset + dip->bytes - 1);
dio_bio = dip->dio_bio;
@@ -7237,6 +7565,9 @@ static void btrfs_endio_direct_read(struct bio *bio, int err)
if (err)
clear_bit(BIO_UPTODATE, &dio_bio->bi_flags);
dio_end_io(dio_bio, err);
+
+ if (io_bio->end_io)
+ io_bio->end_io(io_bio, err);
bio_put(bio);
}
@@ -7302,12 +7633,17 @@ static void btrfs_end_dio_bio(struct bio *bio, int err)
{
struct btrfs_dio_private *dip = bio->bi_private;
+ if (err)
+ btrfs_warn(BTRFS_I(dip->inode)->root->fs_info,
+ "direct IO failed ino %llu rw %lu sector %#Lx len %u err no %d",
+ btrfs_ino(dip->inode), bio->bi_rw,
+ (unsigned long long)bio->bi_iter.bi_sector,
+ bio->bi_iter.bi_size, err);
+
+ if (dip->subio_endio)
+ err = dip->subio_endio(dip->inode, btrfs_io_bio(bio), err);
+
if (err) {
- btrfs_err(BTRFS_I(dip->inode)->root->fs_info,
- "direct IO failed ino %llu rw %lu sector %#Lx len %u err no %d",
- btrfs_ino(dip->inode), bio->bi_rw,
- (unsigned long long)bio->bi_iter.bi_sector,
- bio->bi_iter.bi_size, err);
dip->errors = 1;
/*
@@ -7338,6 +7674,38 @@ static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev,
return btrfs_bio_alloc(bdev, first_sector, nr_vecs, gfp_flags);
}
+static inline int btrfs_lookup_and_bind_dio_csum(struct btrfs_root *root,
+ struct inode *inode,
+ struct btrfs_dio_private *dip,
+ struct bio *bio,
+ u64 file_offset)
+{
+ struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
+ struct btrfs_io_bio *orig_io_bio = btrfs_io_bio(dip->orig_bio);
+ int ret;
+
+ /*
+ * We load all the csum data we need when we submit
+ * the first bio to reduce the csum tree search and
+ * contention.
+ */
+ if (dip->logical_offset == file_offset) {
+ ret = btrfs_lookup_bio_sums_dio(root, inode, dip->orig_bio,
+ file_offset);
+ if (ret)
+ return ret;
+ }
+
+ if (bio == dip->orig_bio)
+ return 0;
+
+ file_offset -= dip->logical_offset;
+ file_offset >>= inode->i_sb->s_blocksize_bits;
+ io_bio->csum = (u8 *)(((u32 *)orig_io_bio->csum) + file_offset);
+
+ return 0;
+}
+
static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
int rw, u64 file_offset, int skip_sum,
int async_submit)
@@ -7353,7 +7721,8 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
bio_get(bio);
if (!write) {
- ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
+ ret = btrfs_bio_wq_end_io(root->fs_info, bio,
+ BTRFS_WQ_ENDIO_DATA);
if (ret)
goto err;
}
@@ -7376,13 +7745,12 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
ret = btrfs_csum_one_bio(root, inode, bio, file_offset, 1);
if (ret)
goto err;
- } else if (!skip_sum) {
- ret = btrfs_lookup_bio_sums_dio(root, inode, dip, bio,
- file_offset);
+ } else {
+ ret = btrfs_lookup_and_bind_dio_csum(root, inode, dip, bio,
+ file_offset);
if (ret)
goto err;
}
-
map:
ret = btrfs_map_bio(root, rw, bio, 0, async_submit);
err:
@@ -7403,7 +7771,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
u64 submit_len = 0;
u64 map_length;
int nr_pages = 0;
- int ret = 0;
+ int ret;
int async_submit = 0;
map_length = orig_bio->bi_iter.bi_size;
@@ -7414,6 +7782,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
if (map_length >= orig_bio->bi_iter.bi_size) {
bio = orig_bio;
+ dip->flags |= BTRFS_DIO_ORIG_BIO_SUBMITTED;
goto submit;
}
@@ -7430,12 +7799,13 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
bio->bi_private = dip;
bio->bi_end_io = btrfs_end_dio_bio;
+ btrfs_io_bio(bio)->logical = file_offset;
atomic_inc(&dip->pending_bios);
while (bvec <= (orig_bio->bi_io_vec + orig_bio->bi_vcnt - 1)) {
- if (unlikely(map_length < submit_len + bvec->bv_len ||
+ if (map_length < submit_len + bvec->bv_len ||
bio_add_page(bio, bvec->bv_page, bvec->bv_len,
- bvec->bv_offset) < bvec->bv_len)) {
+ bvec->bv_offset) < bvec->bv_len) {
/*
* inc the count before we submit the bio so
* we know the end IO handler won't happen before
@@ -7464,6 +7834,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
goto out_err;
bio->bi_private = dip;
bio->bi_end_io = btrfs_end_dio_bio;
+ btrfs_io_bio(bio)->logical = file_offset;
map_length = orig_bio->bi_iter.bi_size;
ret = btrfs_map_block(root->fs_info, rw,
@@ -7507,11 +7878,10 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio,
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_dio_private *dip;
struct bio *io_bio;
+ struct btrfs_io_bio *btrfs_bio;
int skip_sum;
- int sum_len;
int write = rw & REQ_WRITE;
int ret = 0;
- u16 csum_size;
skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
@@ -7521,16 +7891,7 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio,
goto free_ordered;
}
- if (!skip_sum && !write) {
- csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
- sum_len = dio_bio->bi_iter.bi_size >>
- inode->i_sb->s_blocksize_bits;
- sum_len *= csum_size;
- } else {
- sum_len = 0;
- }
-
- dip = kmalloc(sizeof(*dip) + sum_len, GFP_NOFS);
+ dip = kzalloc(sizeof(*dip), GFP_NOFS);
if (!dip) {
ret = -ENOMEM;
goto free_io_bio;
@@ -7542,20 +7903,25 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio,
dip->bytes = dio_bio->bi_iter.bi_size;
dip->disk_bytenr = (u64)dio_bio->bi_iter.bi_sector << 9;
io_bio->bi_private = dip;
- dip->errors = 0;
dip->orig_bio = io_bio;
dip->dio_bio = dio_bio;
atomic_set(&dip->pending_bios, 0);
+ btrfs_bio = btrfs_io_bio(io_bio);
+ btrfs_bio->logical = file_offset;
- if (write)
+ if (write) {
io_bio->bi_end_io = btrfs_endio_direct_write;
- else
+ } else {
io_bio->bi_end_io = btrfs_endio_direct_read;
+ dip->subio_endio = btrfs_subio_endio_read;
+ }
ret = btrfs_submit_direct_hook(rw, dip, skip_sum);
if (!ret)
return;
+ if (btrfs_bio->end_io)
+ btrfs_bio->end_io(btrfs_bio, ret);
free_io_bio:
bio_put(io_bio);
@@ -7652,8 +8018,8 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
ret = btrfs_delalloc_reserve_space(inode, count);
if (ret)
goto out;
- } else if (unlikely(test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
- &BTRFS_I(inode)->runtime_flags))) {
+ } else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
+ &BTRFS_I(inode)->runtime_flags)) {
inode_dio_done(inode);
flags = DIO_LOCKING | DIO_SKIP_HOLES;
wakeup = false;
@@ -8173,6 +8539,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
ei->last_sub_trans = 0;
ei->logged_trans = 0;
ei->delalloc_bytes = 0;
+ ei->defrag_bytes = 0;
ei->disk_i_size = 0;
ei->flags = 0;
ei->csum_bytes = 0;
@@ -8231,6 +8598,7 @@ void btrfs_destroy_inode(struct inode *inode)
WARN_ON(BTRFS_I(inode)->reserved_extents);
WARN_ON(BTRFS_I(inode)->delalloc_bytes);
WARN_ON(BTRFS_I(inode)->csum_bytes);
+ WARN_ON(BTRFS_I(inode)->defrag_bytes);
/*
* This can happen where we create an inode, but somebody else also
@@ -8646,7 +9014,7 @@ static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput,
spin_unlock(&root->delalloc_lock);
work = btrfs_alloc_delalloc_work(inode, 0, delay_iput);
- if (unlikely(!work)) {
+ if (!work) {
if (delay_iput)
btrfs_add_delayed_iput(inode);
else
@@ -8832,7 +9200,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
}
key.objectid = btrfs_ino(inode);
key.offset = 0;
- btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
+ key.type = BTRFS_EXTENT_DATA_KEY;
datasize = btrfs_file_extent_calc_inline_size(name_len);
err = btrfs_insert_empty_item(trans, root, path, &key,
datasize);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 8a8e298..0fe1aa0 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -332,6 +332,9 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
goto out_drop;
} else {
+ ret = btrfs_set_prop(inode, "btrfs.compression", NULL, 0, 0);
+ if (ret && ret != -ENODATA)
+ goto out_drop;
ip->flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS);
}
@@ -477,8 +480,7 @@ static noinline int create_subvol(struct inode *dir,
if (ret)
goto fail;
- leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
- 0, objectid, NULL, 0, 0, 0);
+ leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0);
if (IS_ERR(leaf)) {
ret = PTR_ERR(leaf);
goto fail;
@@ -503,7 +505,7 @@ static noinline int create_subvol(struct inode *dir,
btrfs_set_stack_inode_generation(inode_item, 1);
btrfs_set_stack_inode_size(inode_item, 3);
btrfs_set_stack_inode_nlink(inode_item, 1);
- btrfs_set_stack_inode_nbytes(inode_item, root->leafsize);
+ btrfs_set_stack_inode_nbytes(inode_item, root->nodesize);
btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755);
btrfs_set_root_flags(&root_item, 0);
@@ -535,7 +537,7 @@ static noinline int create_subvol(struct inode *dir,
key.objectid = objectid;
key.offset = 0;
- btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
+ key.type = BTRFS_ROOT_ITEM_KEY;
ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
&root_item);
if (ret)
@@ -882,7 +884,7 @@ out_unlock:
* file you want to defrag, we return 0 to let you know to skip this
* part of the file
*/
-static int check_defrag_in_cache(struct inode *inode, u64 offset, int thresh)
+static int check_defrag_in_cache(struct inode *inode, u64 offset, u32 thresh)
{
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct extent_map *em = NULL;
@@ -917,7 +919,7 @@ static int check_defrag_in_cache(struct inode *inode, u64 offset, int thresh)
*/
static int find_new_extents(struct btrfs_root *root,
struct inode *inode, u64 newer_than,
- u64 *off, int thresh)
+ u64 *off, u32 thresh)
{
struct btrfs_path *path;
struct btrfs_key min_key;
@@ -936,12 +938,9 @@ static int find_new_extents(struct btrfs_root *root,
min_key.offset = *off;
while (1) {
- path->keep_locks = 1;
ret = btrfs_search_forward(root, &min_key, path, newer_than);
if (ret != 0)
goto none;
- path->keep_locks = 0;
- btrfs_unlock_up_safe(path, 1);
process_slot:
if (min_key.objectid != ino)
goto none;
@@ -1029,7 +1028,7 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em)
return ret;
}
-static int should_defrag_range(struct inode *inode, u64 start, int thresh,
+static int should_defrag_range(struct inode *inode, u64 start, u32 thresh,
u64 *last_len, u64 *skip, u64 *defrag_end,
int compress)
{
@@ -1259,7 +1258,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
int ret;
int defrag_count = 0;
int compress_type = BTRFS_COMPRESS_ZLIB;
- int extent_thresh = range->extent_thresh;
+ u32 extent_thresh = range->extent_thresh;
unsigned long max_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT;
unsigned long cluster = max_cluster;
u64 new_align = ~((u64)128 * 1024 - 1);
@@ -1335,8 +1334,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
inode->i_mapping->writeback_index = i;
while (i <= last_index && defrag_count < max_to_defrag &&
- (i < (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
- PAGE_CACHE_SHIFT)) {
+ (i < DIV_ROUND_UP(i_size_read(inode), PAGE_CACHE_SIZE))) {
/*
* make sure we stop running if someone unmounts
* the FS
@@ -1359,7 +1357,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
* the should_defrag function tells us how much to skip
* bump our counter by the suggested amount
*/
- next = (skip + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ next = DIV_ROUND_UP(skip, PAGE_CACHE_SIZE);
i = max(i + 1, next);
continue;
}
@@ -1554,7 +1552,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
goto out_free;
}
- old_size = device->total_bytes;
+ old_size = btrfs_device_get_total_bytes(device);
if (mod < 0) {
if (new_size > old_size) {
@@ -2089,8 +2087,6 @@ static noinline int search_ioctl(struct inode *inode,
key.type = sk->min_type;
key.offset = sk->min_offset;
- path->keep_locks = 1;
-
while (1) {
ret = btrfs_search_forward(root, &key, path, sk->min_transid);
if (ret != 0) {
@@ -2423,9 +2419,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
goto out_dput;
}
- err = d_invalidate(dentry);
- if (err)
- goto out_unlock;
+ d_invalidate(dentry);
down_write(&root->fs_info->subvol_sem);
@@ -2510,7 +2504,6 @@ out_release:
btrfs_subvolume_release_metadata(root, &block_rsv, qgroup_reserved);
out_up_write:
up_write(&root->fs_info->subvol_sem);
-out_unlock:
if (err) {
spin_lock(&dest->root_item_lock);
root_flags = btrfs_root_flags(&dest->root_item);
@@ -2526,9 +2519,9 @@ out_unlock:
ASSERT(dest->send_in_progress == 0);
/* the last ref */
- if (dest->cache_inode) {
- iput(dest->cache_inode);
- dest->cache_inode = NULL;
+ if (dest->ino_cache_inode) {
+ iput(dest->ino_cache_inode);
+ dest->ino_cache_inode = NULL;
}
}
out_dput:
@@ -2634,6 +2627,9 @@ static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg)
vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
ret = btrfs_init_new_device(root, vol_args->name);
+ if (!ret)
+ btrfs_info(root->fs_info, "disk added %s",vol_args->name);
+
kfree(vol_args);
out:
mutex_unlock(&root->fs_info->volume_mutex);
@@ -2673,6 +2669,9 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
mutex_unlock(&root->fs_info->volume_mutex);
atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
+ if (!ret)
+ btrfs_info(root->fs_info, "disk deleted %s",vol_args->name);
+
out:
kfree(vol_args);
err_drop:
@@ -2737,8 +2736,8 @@ static long btrfs_ioctl_dev_info(struct btrfs_root *root, void __user *arg)
}
di_args->devid = dev->devid;
- di_args->bytes_used = dev->bytes_used;
- di_args->total_bytes = dev->total_bytes;
+ di_args->bytes_used = btrfs_device_get_bytes_used(dev);
+ di_args->total_bytes = btrfs_device_get_total_bytes(dev);
memcpy(di_args->uuid, dev->uuid, sizeof(di_args->uuid));
if (dev->name) {
struct rcu_string *name;
@@ -3164,7 +3163,7 @@ static void clone_update_extent_map(struct inode *inode,
em->start + em->len - 1, 0);
}
- if (unlikely(ret))
+ if (ret)
set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
&BTRFS_I(inode)->runtime_flags);
}
@@ -3199,7 +3198,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
u64 last_dest_end = destoff;
ret = -ENOMEM;
- buf = vmalloc(btrfs_level_size(root, 0));
+ buf = vmalloc(root->nodesize);
if (!buf)
return ret;
@@ -3252,11 +3251,11 @@ process_slot:
slot = path->slots[0];
btrfs_item_key_to_cpu(leaf, &key, slot);
- if (btrfs_key_type(&key) > BTRFS_EXTENT_DATA_KEY ||
+ if (key.type > BTRFS_EXTENT_DATA_KEY ||
key.objectid != btrfs_ino(src))
break;
- if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) {
+ if (key.type == BTRFS_EXTENT_DATA_KEY) {
struct btrfs_file_extent_item *extent;
int type;
u32 size;
@@ -5283,6 +5282,12 @@ long btrfs_ioctl(struct file *file, unsigned int
if (ret)
return ret;
ret = btrfs_sync_fs(file->f_dentry->d_sb, 1);
+ /*
+ * The transaction thread may want to do more work,
+ * namely it pokes the cleaner ktread that will start
+ * processing uncleaned subvols.
+ */
+ wake_up_process(root->fs_info->transaction_kthread);
return ret;
}
case BTRFS_IOC_START_SYNC:
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index dfad851..78285f3 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -266,8 +266,7 @@ static int lzo_decompress_biovec(struct list_head *ws,
char *data_in;
unsigned long page_in_index = 0;
unsigned long page_out_index = 0;
- unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) /
- PAGE_CACHE_SIZE;
+ unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_CACHE_SIZE);
unsigned long buf_start;
unsigned long buf_offset = 0;
unsigned long bytes;
diff --git a/fs/btrfs/orphan.c b/fs/btrfs/orphan.c
index 65793ed..47767d5 100644
--- a/fs/btrfs/orphan.c
+++ b/fs/btrfs/orphan.c
@@ -27,7 +27,7 @@ int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans,
int ret = 0;
key.objectid = BTRFS_ORPHAN_OBJECTID;
- btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
+ key.type = BTRFS_ORPHAN_ITEM_KEY;
key.offset = offset;
path = btrfs_alloc_path();
@@ -48,7 +48,7 @@ int btrfs_del_orphan_item(struct btrfs_trans_handle *trans,
int ret = 0;
key.objectid = BTRFS_ORPHAN_OBJECTID;
- btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
+ key.type = BTRFS_ORPHAN_ITEM_KEY;
key.offset = offset;
path = btrfs_alloc_path();
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 9626b4a..647ab12 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -195,7 +195,7 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
for (i = 0 ; i < nr ; i++) {
item = btrfs_item_nr(i);
btrfs_item_key_to_cpu(l, &key, i);
- type = btrfs_key_type(&key);
+ type = key.type;
printk(KERN_INFO "\titem %d key (%llu %u %llu) itemoff %d "
"itemsize %d\n",
i, key.objectid, type, key.offset,
@@ -336,7 +336,6 @@ void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c)
for (i = 0; i < nr; i++) {
struct extent_buffer *next = read_tree_block(root,
btrfs_node_blockptr(c, i),
- btrfs_level_size(root, level - 1),
btrfs_node_ptr_generation(c, i));
if (btrfs_is_leaf(next) &&
level != 1)
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index ded5c60..48b60db 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -539,10 +539,9 @@ static int add_qgroup_item(struct btrfs_trans_handle *trans,
struct extent_buffer *leaf;
struct btrfs_key key;
-#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
- if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &quota_root->state)))
+ if (btrfs_test_is_dummy_root(quota_root))
return 0;
-#endif
+
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
@@ -551,9 +550,15 @@ static int add_qgroup_item(struct btrfs_trans_handle *trans,
key.type = BTRFS_QGROUP_INFO_KEY;
key.offset = qgroupid;
+ /*
+ * Avoid a transaction abort by catching -EEXIST here. In that
+ * case, we proceed by re-initializing the existing structure
+ * on disk.
+ */
+
ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
sizeof(*qgroup_info));
- if (ret)
+ if (ret && ret != -EEXIST)
goto out;
leaf = path->nodes[0];
@@ -572,7 +577,7 @@ static int add_qgroup_item(struct btrfs_trans_handle *trans,
key.type = BTRFS_QGROUP_LIMIT_KEY;
ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
sizeof(*qgroup_limit));
- if (ret)
+ if (ret && ret != -EEXIST)
goto out;
leaf = path->nodes[0];
@@ -692,10 +697,9 @@ static int update_qgroup_info_item(struct btrfs_trans_handle *trans,
int ret;
int slot;
-#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
- if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state)))
+ if (btrfs_test_is_dummy_root(root))
return 0;
-#endif
+
key.objectid = 0;
key.type = BTRFS_QGROUP_INFO_KEY;
key.offset = qgroup->qgroupid;
@@ -1335,6 +1339,8 @@ int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans,
INIT_LIST_HEAD(&oper->elem.list);
oper->elem.seq = 0;
+ trace_btrfs_qgroup_record_ref(oper);
+
if (type == BTRFS_QGROUP_OPER_SUB_SUBTREE) {
/*
* If any operation for this bytenr/ref_root combo
@@ -2077,6 +2083,8 @@ static int btrfs_qgroup_account(struct btrfs_trans_handle *trans,
ASSERT(is_fstree(oper->ref_root));
+ trace_btrfs_qgroup_account(oper);
+
switch (oper->type) {
case BTRFS_QGROUP_OPER_ADD_EXCL:
case BTRFS_QGROUP_OPER_SUB_EXCL:
@@ -2237,7 +2245,6 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
if (srcid) {
struct btrfs_root *srcroot;
struct btrfs_key srckey;
- int srcroot_level;
srckey.objectid = srcid;
srckey.type = BTRFS_ROOT_ITEM_KEY;
@@ -2249,8 +2256,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
}
rcu_read_lock();
- srcroot_level = btrfs_header_level(srcroot->node);
- level_size = btrfs_level_size(srcroot, srcroot_level);
+ level_size = srcroot->nodesize;
rcu_read_unlock();
}
@@ -2566,7 +2572,7 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
found.type != BTRFS_METADATA_ITEM_KEY)
continue;
if (found.type == BTRFS_METADATA_ITEM_KEY)
- num_bytes = fs_info->extent_root->leafsize;
+ num_bytes = fs_info->extent_root->nodesize;
else
num_bytes = found.offset;
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 0a6b6e4..6a41631 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -912,7 +912,7 @@ static struct page *page_in_rbio(struct btrfs_raid_bio *rbio,
static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
{
unsigned long nr = stripe_len * nr_stripes;
- return (nr + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ return DIV_ROUND_UP(nr, PAGE_CACHE_SIZE);
}
/*
@@ -1442,7 +1442,7 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
struct btrfs_bio *bbio = rbio->bbio;
struct bio_list bio_list;
int ret;
- int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE);
int pagenr;
int stripe;
struct bio *bio;
@@ -1725,7 +1725,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
int pagenr, stripe;
void **pointers;
int faila = -1, failb = -1;
- int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE);
struct page *page;
int err;
int i;
@@ -1940,7 +1940,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
struct btrfs_bio *bbio = rbio->bbio;
struct bio_list bio_list;
int ret;
- int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE);
int pagenr;
int stripe;
struct bio *bio;
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index 20408c6..b63ae20 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -347,7 +347,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
if (!re)
return NULL;
- blocksize = btrfs_level_size(root, level);
+ blocksize = root->nodesize;
re->logical = logical;
re->blocksize = blocksize;
re->top = *top;
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 65245a0..74257d6 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -736,7 +736,8 @@ again:
err = ret;
goto out;
}
- BUG_ON(!ret || !path1->slots[0]);
+ ASSERT(ret);
+ ASSERT(path1->slots[0]);
path1->slots[0]--;
@@ -746,10 +747,10 @@ again:
* the backref was added previously when processing
* backref of type BTRFS_TREE_BLOCK_REF_KEY
*/
- BUG_ON(!list_is_singular(&cur->upper));
+ ASSERT(list_is_singular(&cur->upper));
edge = list_entry(cur->upper.next, struct backref_edge,
list[LOWER]);
- BUG_ON(!list_empty(&edge->list[UPPER]));
+ ASSERT(list_empty(&edge->list[UPPER]));
exist = edge->node[UPPER];
/*
* add the upper level block to pending list if we need
@@ -831,7 +832,7 @@ again:
cur->cowonly = 1;
}
#else
- BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
+ ASSERT(key.type != BTRFS_EXTENT_REF_V0_KEY);
if (key.type == BTRFS_SHARED_BLOCK_REF_KEY) {
#endif
if (key.objectid == key.offset) {
@@ -840,7 +841,7 @@ again:
* backref of this type.
*/
root = find_reloc_root(rc, cur->bytenr);
- BUG_ON(!root);
+ ASSERT(root);
cur->root = root;
break;
}
@@ -868,7 +869,7 @@ again:
} else {
upper = rb_entry(rb_node, struct backref_node,
rb_node);
- BUG_ON(!upper->checked);
+ ASSERT(upper->checked);
INIT_LIST_HEAD(&edge->list[UPPER]);
}
list_add_tail(&edge->list[LOWER], &cur->upper);
@@ -892,7 +893,7 @@ again:
if (btrfs_root_level(&root->root_item) == cur->level) {
/* tree root */
- BUG_ON(btrfs_root_bytenr(&root->root_item) !=
+ ASSERT(btrfs_root_bytenr(&root->root_item) ==
cur->bytenr);
if (should_ignore_root(root))
list_add(&cur->list, &useless);
@@ -927,7 +928,7 @@ again:
need_check = true;
for (; level < BTRFS_MAX_LEVEL; level++) {
if (!path2->nodes[level]) {
- BUG_ON(btrfs_root_bytenr(&root->root_item) !=
+ ASSERT(btrfs_root_bytenr(&root->root_item) ==
lower->bytenr);
if (should_ignore_root(root))
list_add(&lower->list, &useless);
@@ -977,12 +978,15 @@ again:
need_check = false;
list_add_tail(&edge->list[UPPER],
&list);
- } else
+ } else {
+ if (upper->checked)
+ need_check = true;
INIT_LIST_HEAD(&edge->list[UPPER]);
+ }
} else {
upper = rb_entry(rb_node, struct backref_node,
rb_node);
- BUG_ON(!upper->checked);
+ ASSERT(upper->checked);
INIT_LIST_HEAD(&edge->list[UPPER]);
if (!upper->owner)
upper->owner = btrfs_header_owner(eb);
@@ -1026,7 +1030,7 @@ next:
* everything goes well, connect backref nodes and insert backref nodes
* into the cache.
*/
- BUG_ON(!node->checked);
+ ASSERT(node->checked);
cowonly = node->cowonly;
if (!cowonly) {
rb_node = tree_insert(&cache->rb_root, node->bytenr,
@@ -1062,8 +1066,21 @@ next:
continue;
}
- BUG_ON(!upper->checked);
- BUG_ON(cowonly != upper->cowonly);
+ if (!upper->checked) {
+ /*
+ * Still want to blow up for developers since this is a
+ * logic bug.
+ */
+ ASSERT(0);
+ err = -EINVAL;
+ goto out;
+ }
+ if (cowonly != upper->cowonly) {
+ ASSERT(0);
+ err = -EINVAL;
+ goto out;
+ }
+
if (!cowonly) {
rb_node = tree_insert(&cache->rb_root, upper->bytenr,
&upper->rb_node);
@@ -1086,7 +1103,7 @@ next:
while (!list_empty(&useless)) {
upper = list_entry(useless.next, struct backref_node, list);
list_del_init(&upper->list);
- BUG_ON(!list_empty(&upper->upper));
+ ASSERT(list_empty(&upper->upper));
if (upper == node)
node = NULL;
if (upper->lowest) {
@@ -1119,29 +1136,45 @@ out:
if (err) {
while (!list_empty(&useless)) {
lower = list_entry(useless.next,
- struct backref_node, upper);
- list_del_init(&lower->upper);
+ struct backref_node, list);
+ list_del_init(&lower->list);
}
- upper = node;
- INIT_LIST_HEAD(&list);
- while (upper) {
- if (RB_EMPTY_NODE(&upper->rb_node)) {
- list_splice_tail(&upper->upper, &list);
- free_backref_node(cache, upper);
- }
-
- if (list_empty(&list))
- break;
-
- edge = list_entry(list.next, struct backref_edge,
- list[LOWER]);
+ while (!list_empty(&list)) {
+ edge = list_first_entry(&list, struct backref_edge,
+ list[UPPER]);
+ list_del(&edge->list[UPPER]);
list_del(&edge->list[LOWER]);
+ lower = edge->node[LOWER];
upper = edge->node[UPPER];
free_backref_edge(cache, edge);
+
+ /*
+ * Lower is no longer linked to any upper backref nodes
+ * and isn't in the cache, we can free it ourselves.
+ */
+ if (list_empty(&lower->upper) &&
+ RB_EMPTY_NODE(&lower->rb_node))
+ list_add(&lower->list, &useless);
+
+ if (!RB_EMPTY_NODE(&upper->rb_node))
+ continue;
+
+ /* Add this guy's upper edges to the list to proces */
+ list_for_each_entry(edge, &upper->upper, list[LOWER])
+ list_add_tail(&edge->list[UPPER], &list);
+ if (list_empty(&upper->upper))
+ list_add(&upper->list, &useless);
+ }
+
+ while (!list_empty(&useless)) {
+ lower = list_entry(useless.next,
+ struct backref_node, list);
+ list_del_init(&lower->list);
+ free_backref_node(cache, lower);
}
return ERR_PTR(err);
}
- BUG_ON(node && node->detached);
+ ASSERT(!node || !node->detached);
return node;
}
@@ -1787,7 +1820,7 @@ again:
btrfs_node_key_to_cpu(parent, next_key, slot + 1);
old_bytenr = btrfs_node_blockptr(parent, slot);
- blocksize = btrfs_level_size(dest, level - 1);
+ blocksize = dest->nodesize;
old_ptr_gen = btrfs_node_ptr_generation(parent, slot);
if (level <= max_level) {
@@ -1813,8 +1846,7 @@ again:
break;
}
- eb = read_tree_block(dest, old_bytenr, blocksize,
- old_ptr_gen);
+ eb = read_tree_block(dest, old_bytenr, old_ptr_gen);
if (!eb || !extent_buffer_uptodate(eb)) {
ret = (!eb) ? -ENOMEM : -EIO;
free_extent_buffer(eb);
@@ -1944,7 +1976,6 @@ int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
u64 bytenr;
u64 ptr_gen = 0;
u64 last_snapshot;
- u32 blocksize;
u32 nritems;
last_snapshot = btrfs_root_last_snapshot(&root->root_item);
@@ -1970,8 +2001,7 @@ int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
}
bytenr = btrfs_node_blockptr(eb, path->slots[i]);
- blocksize = btrfs_level_size(root, i - 1);
- eb = read_tree_block(root, bytenr, blocksize, ptr_gen);
+ eb = read_tree_block(root, bytenr, ptr_gen);
if (!eb || !extent_buffer_uptodate(eb)) {
free_extent_buffer(eb);
return -EIO;
@@ -2316,7 +2346,7 @@ void free_reloc_roots(struct list_head *list)
}
static noinline_for_stack
-int merge_reloc_roots(struct reloc_control *rc)
+void merge_reloc_roots(struct reloc_control *rc)
{
struct btrfs_root *root;
struct btrfs_root *reloc_root;
@@ -2397,7 +2427,6 @@ out:
}
BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root));
- return ret;
}
static void free_block_list(struct rb_root *blocks)
@@ -2544,8 +2573,7 @@ u64 calcu_metadata_size(struct reloc_control *rc,
if (next->processed && (reserve || next != node))
break;
- num_bytes += btrfs_level_size(rc->extent_root,
- next->level);
+ num_bytes += rc->extent_root->nodesize;
if (list_empty(&next->upper))
break;
@@ -2679,9 +2707,9 @@ static int do_relocation(struct btrfs_trans_handle *trans,
goto next;
}
- blocksize = btrfs_level_size(root, node->level);
+ blocksize = root->nodesize;
generation = btrfs_node_ptr_generation(upper->eb, slot);
- eb = read_tree_block(root, bytenr, blocksize, generation);
+ eb = read_tree_block(root, bytenr, generation);
if (!eb || !extent_buffer_uptodate(eb)) {
free_extent_buffer(eb);
err = -EIO;
@@ -2789,7 +2817,7 @@ static void __mark_block_processed(struct reloc_control *rc,
u32 blocksize;
if (node->level == 0 ||
in_block_group(node->bytenr, rc->block_group)) {
- blocksize = btrfs_level_size(rc->extent_root, node->level);
+ blocksize = rc->extent_root->nodesize;
mark_block_processed(rc, node->bytenr, blocksize);
}
node->processed = 1;
@@ -2843,7 +2871,7 @@ static int get_tree_block_key(struct reloc_control *rc,
BUG_ON(block->key_ready);
eb = read_tree_block(rc->extent_root, block->bytenr,
- block->key.objectid, block->key.offset);
+ block->key.offset);
if (!eb || !extent_buffer_uptodate(eb)) {
free_extent_buffer(eb);
return -EIO;
@@ -2858,20 +2886,6 @@ static int get_tree_block_key(struct reloc_control *rc,
return 0;
}
-static int reada_tree_block(struct reloc_control *rc,
- struct tree_block *block)
-{
- BUG_ON(block->key_ready);
- if (block->key.type == BTRFS_METADATA_ITEM_KEY)
- readahead_tree_block(rc->extent_root, block->bytenr,
- block->key.objectid,
- rc->extent_root->leafsize);
- else
- readahead_tree_block(rc->extent_root, block->bytenr,
- block->key.objectid, block->key.offset);
- return 0;
-}
-
/*
* helper function to relocate a tree block
*/
@@ -2951,7 +2965,8 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
while (rb_node) {
block = rb_entry(rb_node, struct tree_block, rb_node);
if (!block->key_ready)
- reada_tree_block(rc, block);
+ readahead_tree_block(rc->extent_root, block->bytenr,
+ block->key.objectid);
rb_node = rb_next(rb_node);
}
@@ -3313,7 +3328,7 @@ static int add_tree_block(struct reloc_control *rc,
return -ENOMEM;
block->bytenr = extent_key->objectid;
- block->key.objectid = rc->extent_root->leafsize;
+ block->key.objectid = rc->extent_root->nodesize;
block->key.offset = generation;
block->level = level;
block->key_ready = 0;
@@ -3640,7 +3655,7 @@ int add_data_references(struct reloc_control *rc,
struct btrfs_extent_inline_ref *iref;
unsigned long ptr;
unsigned long end;
- u32 blocksize = btrfs_level_size(rc->extent_root, 0);
+ u32 blocksize = rc->extent_root->nodesize;
int ret = 0;
int err = 0;
@@ -3783,7 +3798,7 @@ next:
}
if (key.type == BTRFS_METADATA_ITEM_KEY &&
- key.objectid + rc->extent_root->leafsize <=
+ key.objectid + rc->extent_root->nodesize <=
rc->search_start) {
path->slots[0]++;
goto next;
@@ -3801,7 +3816,7 @@ next:
rc->search_start = key.objectid + key.offset;
else
rc->search_start = key.objectid +
- rc->extent_root->leafsize;
+ rc->extent_root->nodesize;
memcpy(extent_key, &key, sizeof(key));
return 0;
}
@@ -4096,7 +4111,6 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS |
BTRFS_INODE_PREALLOC);
btrfs_mark_buffer_dirty(leaf);
- btrfs_release_path(path);
out:
btrfs_free_path(path);
return ret;
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index f4a41f3..efa0831 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -137,7 +137,6 @@ struct scrub_ctx {
int pages_per_rd_bio;
u32 sectorsize;
u32 nodesize;
- u32 leafsize;
int is_dev_replace;
struct scrub_wr_ctx wr_ctx;
@@ -178,17 +177,12 @@ struct scrub_copy_nocow_ctx {
struct scrub_warning {
struct btrfs_path *path;
u64 extent_item_size;
- char *scratch_buf;
- char *msg_buf;
const char *errstr;
sector_t sector;
u64 logical;
struct btrfs_device *dev;
- int msg_bufsize;
- int scratch_bufsize;
};
-
static void scrub_pending_bio_inc(struct scrub_ctx *sctx);
static void scrub_pending_bio_dec(struct scrub_ctx *sctx);
static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx);
@@ -438,7 +432,6 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
}
sctx->first_free = 0;
sctx->nodesize = dev->dev_root->nodesize;
- sctx->leafsize = dev->dev_root->leafsize;
sctx->sectorsize = dev->dev_root->sectorsize;
atomic_set(&sctx->bios_in_flight, 0);
atomic_set(&sctx->workers_pending, 0);
@@ -553,7 +546,6 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
u64 ref_root;
u32 item_size;
u8 ref_level;
- const int bufsize = 4096;
int ret;
WARN_ON(sblock->page_count < 1);
@@ -561,18 +553,13 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
fs_info = sblock->sctx->dev_root->fs_info;
path = btrfs_alloc_path();
+ if (!path)
+ return;
- swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS);
- swarn.msg_buf = kmalloc(bufsize, GFP_NOFS);
swarn.sector = (sblock->pagev[0]->physical) >> 9;
swarn.logical = sblock->pagev[0]->logical;
swarn.errstr = errstr;
swarn.dev = NULL;
- swarn.msg_bufsize = bufsize;
- swarn.scratch_bufsize = bufsize;
-
- if (!path || !swarn.scratch_buf || !swarn.msg_buf)
- goto out;
ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
&flags);
@@ -613,8 +600,6 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
out:
btrfs_free_path(path);
- kfree(swarn.scratch_buf);
- kfree(swarn.msg_buf);
}
static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx)
@@ -681,9 +666,9 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx)
ret = -EIO;
goto out;
}
- fs_info = BTRFS_I(inode)->root->fs_info;
- ret = repair_io_failure(fs_info, offset, PAGE_SIZE,
+ ret = repair_io_failure(inode, offset, PAGE_SIZE,
fixup->logical, page,
+ offset - page_offset(page),
fixup->mirror_num);
unlock_page(page);
corrected = !ret;
@@ -1361,6 +1346,16 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
return;
}
+static inline int scrub_check_fsid(u8 fsid[],
+ struct scrub_page *spage)
+{
+ struct btrfs_fs_devices *fs_devices = spage->dev->fs_devices;
+ int ret;
+
+ ret = memcmp(fsid, fs_devices->fsid, BTRFS_UUID_SIZE);
+ return !ret;
+}
+
static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
struct scrub_block *sblock,
int is_metadata, int have_csum,
@@ -1380,7 +1375,7 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
h = (struct btrfs_header *)mapped_buffer;
if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h) ||
- memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE) ||
+ !scrub_check_fsid(h->fsid, sblock->pagev[0]) ||
memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
BTRFS_UUID_SIZE)) {
sblock->header_error = 1;
@@ -1751,14 +1746,13 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
if (sblock->pagev[0]->generation != btrfs_stack_header_generation(h))
++fail;
- if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
+ if (!scrub_check_fsid(h->fsid, sblock->pagev[0]))
++fail;
if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
BTRFS_UUID_SIZE))
++fail;
- WARN_ON(sctx->nodesize != sctx->leafsize);
len = sctx->nodesize - BTRFS_CSUM_SIZE;
mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
@@ -1791,8 +1785,6 @@ static int scrub_checksum_super(struct scrub_block *sblock)
{
struct btrfs_super_block *s;
struct scrub_ctx *sctx = sblock->sctx;
- struct btrfs_root *root = sctx->dev_root;
- struct btrfs_fs_info *fs_info = root->fs_info;
u8 calculated_csum[BTRFS_CSUM_SIZE];
u8 on_disk_csum[BTRFS_CSUM_SIZE];
struct page *page;
@@ -1817,7 +1809,7 @@ static int scrub_checksum_super(struct scrub_block *sblock)
if (sblock->pagev[0]->generation != btrfs_super_generation(s))
++fail_gen;
- if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
+ if (!scrub_check_fsid(s->fsid, sblock->pagev[0]))
++fail_cor;
len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE;
@@ -2196,7 +2188,6 @@ static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len,
sctx->stat.data_bytes_scrubbed += len;
spin_unlock(&sctx->stat_lock);
} else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
- WARN_ON(sctx->nodesize != sctx->leafsize);
blocksize = sctx->nodesize;
spin_lock(&sctx->stat_lock);
sctx->stat.tree_extents_scrubbed++;
@@ -2487,7 +2478,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
btrfs_item_key_to_cpu(l, &key, slot);
if (key.type == BTRFS_METADATA_ITEM_KEY)
- bytes = root->leafsize;
+ bytes = root->nodesize;
else
bytes = key.offset;
@@ -2714,7 +2705,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
if (found_key.objectid != scrub_dev->devid)
break;
- if (btrfs_key_type(&found_key) != BTRFS_DEV_EXTENT_KEY)
+ if (found_key.type != BTRFS_DEV_EXTENT_KEY)
break;
if (found_key.offset >= end)
@@ -2828,11 +2819,16 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
return -EIO;
- gen = root->fs_info->last_trans_committed;
+ /* Seed devices of a new filesystem has their own generation. */
+ if (scrub_dev->fs_devices != root->fs_info->fs_devices)
+ gen = scrub_dev->generation;
+ else
+ gen = root->fs_info->last_trans_committed;
for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
bytenr = btrfs_sb_offset(i);
- if (bytenr + BTRFS_SUPER_INFO_SIZE > scrub_dev->total_bytes)
+ if (bytenr + BTRFS_SUPER_INFO_SIZE >
+ scrub_dev->commit_total_bytes)
break;
ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
@@ -2910,17 +2906,6 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
if (btrfs_fs_closing(fs_info))
return -EINVAL;
- /*
- * check some assumptions
- */
- if (fs_info->chunk_root->nodesize != fs_info->chunk_root->leafsize) {
- btrfs_err(fs_info,
- "scrub: size assumption nodesize == leafsize (%d == %d) fails",
- fs_info->chunk_root->nodesize,
- fs_info->chunk_root->leafsize);
- return -EINVAL;
- }
-
if (fs_info->chunk_root->nodesize > BTRFS_STRIPE_LEN) {
/*
* in this case scrub is unable to calculate the checksum
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 6528aa6..874828d 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -515,7 +515,8 @@ static int write_buf(struct file *filp, const void *buf, u32 len, loff_t *off)
set_fs(KERNEL_DS);
while (pos < len) {
- ret = vfs_write(filp, (char *)buf + pos, len - pos, off);
+ ret = vfs_write(filp, (__force const char __user *)buf + pos,
+ len - pos, off);
/* TODO handle that correctly */
/*if (ret == -ERESTARTSYS) {
continue;
@@ -985,11 +986,13 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
int num;
u8 type;
- if (found_key->type == BTRFS_XATTR_ITEM_KEY)
- buf_len = BTRFS_MAX_XATTR_SIZE(root);
- else
- buf_len = PATH_MAX;
-
+ /*
+ * Start with a small buffer (1 page). If later we end up needing more
+ * space, which can happen for xattrs on a fs with a leaf size greater
+ * then the page size, attempt to increase the buffer. Typically xattr
+ * values are small.
+ */
+ buf_len = PATH_MAX;
buf = kmalloc(buf_len, GFP_NOFS);
if (!buf) {
ret = -ENOMEM;
@@ -1016,7 +1019,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
ret = -ENAMETOOLONG;
goto out;
}
- if (name_len + data_len > buf_len) {
+ if (name_len + data_len > BTRFS_MAX_XATTR_SIZE(root)) {
ret = -E2BIG;
goto out;
}
@@ -1024,12 +1027,34 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
/*
* Path too long
*/
- if (name_len + data_len > buf_len) {
+ if (name_len + data_len > PATH_MAX) {
ret = -ENAMETOOLONG;
goto out;
}
}
+ if (name_len + data_len > buf_len) {
+ buf_len = name_len + data_len;
+ if (is_vmalloc_addr(buf)) {
+ vfree(buf);
+ buf = NULL;
+ } else {
+ char *tmp = krealloc(buf, buf_len,
+ GFP_NOFS | __GFP_NOWARN);
+
+ if (!tmp)
+ kfree(buf);
+ buf = tmp;
+ }
+ if (!buf) {
+ buf = vmalloc(buf_len);
+ if (!buf) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+ }
+
read_extent_buffer(eb, buf, (unsigned long)(di + 1),
name_len + data_len);
@@ -1050,7 +1075,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
}
out:
- kfree(buf);
+ kvfree(buf);
return ret;
}
@@ -3302,7 +3327,7 @@ static int wait_for_parent_move(struct send_ctx *sctx,
if (ret < 0 && ret != -ENOENT) {
goto out;
} else if (ret == -ENOENT) {
- ret = 1;
+ ret = 0;
break;
}
@@ -5703,7 +5728,7 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
NULL);
sort_clone_roots = 1;
- current->journal_info = (void *)BTRFS_SEND_TRANS_STUB;
+ current->journal_info = BTRFS_SEND_TRANS_STUB;
ret = send_subvol(sctx);
current->journal_info = NULL;
if (ret < 0)
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index c4124de..a2b97ef 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -60,6 +60,7 @@
#include "backref.h"
#include "tests/btrfs-tests.h"
+#include "qgroup.h"
#define CREATE_TRACE_POINTS
#include <trace/events/btrfs.h>
@@ -307,13 +308,7 @@ void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
static void btrfs_put_super(struct super_block *sb)
{
- (void)close_ctree(btrfs_sb(sb)->tree_root);
- /* FIXME: need to fix VFS to return error? */
- /* AV: return it _where_? ->put_super() can be triggered by any number
- * of async events, up to and including delivery of SIGKILL to the
- * last process that kept it busy. Or segfault in the aforementioned
- * process... Whom would you report that to?
- */
+ close_ctree(btrfs_sb(sb)->tree_root);
}
enum {
@@ -400,7 +395,6 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
int ret = 0;
char *compress_type;
bool compress_force = false;
- bool compress = false;
cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy);
if (cache_gen)
@@ -478,7 +472,6 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
/* Fallthrough */
case Opt_compress:
case Opt_compress_type:
- compress = true;
if (token == Opt_compress ||
token == Opt_compress_force ||
strcmp(args[0].from, "zlib") == 0) {
@@ -508,11 +501,18 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
btrfs_set_and_info(root, FORCE_COMPRESS,
"force %s compression",
compress_type);
- } else if (compress) {
+ } else {
if (!btrfs_test_opt(root, COMPRESS))
btrfs_info(root->fs_info,
"btrfs: use %s compression",
compress_type);
+ /*
+ * If we remount from compress-force=xxx to
+ * compress=xxx, we need clear FORCE_COMPRESS
+ * flag, otherwise, there is no way for users
+ * to disable forcible compression separately.
+ */
+ btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS);
}
break;
case Opt_ssd:
@@ -1014,7 +1014,7 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
seq_puts(seq, ",nodatacow");
if (btrfs_test_opt(root, NOBARRIER))
seq_puts(seq, ",nobarrier");
- if (info->max_inline != 8192 * 1024)
+ if (info->max_inline != BTRFS_DEFAULT_MAX_INLINE)
seq_printf(seq, ",max_inline=%llu", info->max_inline);
if (info->alloc_start != 0)
seq_printf(seq, ",alloc_start=%llu", info->alloc_start);
@@ -1215,6 +1215,56 @@ static struct dentry *mount_subvol(const char *subvol_name, int flags,
return root;
}
+static int parse_security_options(char *orig_opts,
+ struct security_mnt_opts *sec_opts)
+{
+ char *secdata = NULL;
+ int ret = 0;
+
+ secdata = alloc_secdata();
+ if (!secdata)
+ return -ENOMEM;
+ ret = security_sb_copy_data(orig_opts, secdata);
+ if (ret) {
+ free_secdata(secdata);
+ return ret;
+ }
+ ret = security_sb_parse_opts_str(secdata, sec_opts);
+ free_secdata(secdata);
+ return ret;
+}
+
+static int setup_security_options(struct btrfs_fs_info *fs_info,
+ struct super_block *sb,
+ struct security_mnt_opts *sec_opts)
+{
+ int ret = 0;
+
+ /*
+ * Call security_sb_set_mnt_opts() to check whether new sec_opts
+ * is valid.
+ */
+ ret = security_sb_set_mnt_opts(sb, sec_opts, 0, NULL);
+ if (ret)
+ return ret;
+
+#ifdef CONFIG_SECURITY
+ if (!fs_info->security_opts.num_mnt_opts) {
+ /* first time security setup, copy sec_opts to fs_info */
+ memcpy(&fs_info->security_opts, sec_opts, sizeof(*sec_opts));
+ } else {
+ /*
+ * Since SELinux(the only one supports security_mnt_opts) does
+ * NOT support changing context during remount/mount same sb,
+ * This must be the same or part of the same security options,
+ * just free it.
+ */
+ security_free_mnt_opts(sec_opts);
+ }
+#endif
+ return ret;
+}
+
/*
* Find a superblock for the given device / mount point.
*
@@ -1229,6 +1279,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
struct dentry *root;
struct btrfs_fs_devices *fs_devices = NULL;
struct btrfs_fs_info *fs_info = NULL;
+ struct security_mnt_opts new_sec_opts;
fmode_t mode = FMODE_READ;
char *subvol_name = NULL;
u64 subvol_objectid = 0;
@@ -1251,9 +1302,16 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
return root;
}
+ security_init_mnt_opts(&new_sec_opts);
+ if (data) {
+ error = parse_security_options(data, &new_sec_opts);
+ if (error)
+ return ERR_PTR(error);
+ }
+
error = btrfs_scan_one_device(device_name, mode, fs_type, &fs_devices);
if (error)
- return ERR_PTR(error);
+ goto error_sec_opts;
/*
* Setup a dummy root and fs_info for test/set super. This is because
@@ -1262,13 +1320,16 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
* then open_ctree will properly initialize everything later.
*/
fs_info = kzalloc(sizeof(struct btrfs_fs_info), GFP_NOFS);
- if (!fs_info)
- return ERR_PTR(-ENOMEM);
+ if (!fs_info) {
+ error = -ENOMEM;
+ goto error_sec_opts;
+ }
fs_info->fs_devices = fs_devices;
fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS);
fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS);
+ security_init_mnt_opts(&fs_info->security_opts);
if (!fs_info->super_copy || !fs_info->super_for_commit) {
error = -ENOMEM;
goto error_fs_info;
@@ -1306,8 +1367,19 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
}
root = !error ? get_default_root(s, subvol_objectid) : ERR_PTR(error);
- if (IS_ERR(root))
+ if (IS_ERR(root)) {
+ deactivate_locked_super(s);
+ error = PTR_ERR(root);
+ goto error_sec_opts;
+ }
+
+ fs_info = btrfs_sb(s);
+ error = setup_security_options(fs_info, s, &new_sec_opts);
+ if (error) {
+ dput(root);
deactivate_locked_super(s);
+ goto error_sec_opts;
+ }
return root;
@@ -1315,6 +1387,8 @@ error_close_devices:
btrfs_close_devices(fs_devices);
error_fs_info:
free_fs_info(fs_info);
+error_sec_opts:
+ security_free_mnt_opts(&new_sec_opts);
return ERR_PTR(error);
}
@@ -1396,6 +1470,21 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
sync_filesystem(sb);
btrfs_remount_prepare(fs_info);
+ if (data) {
+ struct security_mnt_opts new_sec_opts;
+
+ security_init_mnt_opts(&new_sec_opts);
+ ret = parse_security_options(data, &new_sec_opts);
+ if (ret)
+ goto restore;
+ ret = setup_security_options(fs_info, sb,
+ &new_sec_opts);
+ if (ret) {
+ security_free_mnt_opts(&new_sec_opts);
+ goto restore;
+ }
+ }
+
ret = btrfs_parse_options(root, data);
if (ret) {
ret = -EINVAL;
@@ -1694,7 +1783,11 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
int ret;
- /* holding chunk_muext to avoid allocating new chunks */
+ /*
+ * holding chunk_muext to avoid allocating new chunks, holding
+ * device_list_mutex to avoid the device being removed
+ */
+ mutex_lock(&fs_info->fs_devices->device_list_mutex);
mutex_lock(&fs_info->chunk_mutex);
rcu_read_lock();
list_for_each_entry_rcu(found, head, list) {
@@ -1735,11 +1828,13 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
ret = btrfs_calc_avail_data_space(fs_info->tree_root, &total_free_data);
if (ret) {
mutex_unlock(&fs_info->chunk_mutex);
+ mutex_unlock(&fs_info->fs_devices->device_list_mutex);
return ret;
}
buf->f_bavail += div_u64(total_free_data, factor);
buf->f_bavail = buf->f_bavail >> bits;
mutex_unlock(&fs_info->chunk_mutex);
+ mutex_unlock(&fs_info->fs_devices->device_list_mutex);
buf->f_type = BTRFS_SUPER_MAGIC;
buf->f_bsize = dentry->d_sb->s_blocksize;
@@ -1769,7 +1864,7 @@ static struct file_system_type btrfs_fs_type = {
.name = "btrfs",
.mount = btrfs_mount,
.kill_sb = btrfs_kill_super,
- .fs_flags = FS_REQUIRES_DEV,
+ .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA,
};
MODULE_ALIAS_FS("btrfs");
@@ -1993,11 +2088,15 @@ static int __init init_btrfs_fs(void)
err = btrfs_prelim_ref_init();
if (err)
+ goto free_delayed_ref;
+
+ err = btrfs_end_io_wq_init();
+ if (err)
goto free_prelim_ref;
err = btrfs_interface_init();
if (err)
- goto free_delayed_ref;
+ goto free_end_io_wq;
btrfs_init_lockdep();
@@ -2015,6 +2114,8 @@ static int __init init_btrfs_fs(void)
unregister_ioctl:
btrfs_interface_exit();
+free_end_io_wq:
+ btrfs_end_io_wq_exit();
free_prelim_ref:
btrfs_prelim_ref_exit();
free_delayed_ref:
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 12e5355..b2e7bb4 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -242,7 +242,7 @@ static ssize_t global_rsv_size_show(struct kobject *kobj,
struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
return btrfs_show_u64(&block_rsv->size, &block_rsv->lock, buf);
}
-BTRFS_ATTR(global_rsv_size, 0444, global_rsv_size_show);
+BTRFS_ATTR(global_rsv_size, global_rsv_size_show);
static ssize_t global_rsv_reserved_show(struct kobject *kobj,
struct kobj_attribute *a, char *buf)
@@ -251,7 +251,7 @@ static ssize_t global_rsv_reserved_show(struct kobject *kobj,
struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
return btrfs_show_u64(&block_rsv->reserved, &block_rsv->lock, buf);
}
-BTRFS_ATTR(global_rsv_reserved, 0444, global_rsv_reserved_show);
+BTRFS_ATTR(global_rsv_reserved, global_rsv_reserved_show);
#define to_space_info(_kobj) container_of(_kobj, struct btrfs_space_info, kobj)
#define to_raid_kobj(_kobj) container_of(_kobj, struct raid_kobject, kobj)
@@ -306,7 +306,7 @@ static ssize_t btrfs_space_info_show_##field(struct kobject *kobj, \
struct btrfs_space_info *sinfo = to_space_info(kobj); \
return btrfs_show_u64(&sinfo->field, &sinfo->lock, buf); \
} \
-BTRFS_ATTR(field, 0444, btrfs_space_info_show_##field)
+BTRFS_ATTR(field, btrfs_space_info_show_##field)
static ssize_t btrfs_space_info_show_total_bytes_pinned(struct kobject *kobj,
struct kobj_attribute *a,
@@ -325,7 +325,7 @@ SPACE_INFO_ATTR(bytes_reserved);
SPACE_INFO_ATTR(bytes_may_use);
SPACE_INFO_ATTR(disk_used);
SPACE_INFO_ATTR(disk_total);
-BTRFS_ATTR(total_bytes_pinned, 0444, btrfs_space_info_show_total_bytes_pinned);
+BTRFS_ATTR(total_bytes_pinned, btrfs_space_info_show_total_bytes_pinned);
static struct attribute *space_info_attrs[] = {
BTRFS_ATTR_PTR(flags),
@@ -363,7 +363,8 @@ static ssize_t btrfs_label_show(struct kobject *kobj,
struct kobj_attribute *a, char *buf)
{
struct btrfs_fs_info *fs_info = to_fs_info(kobj);
- return snprintf(buf, PAGE_SIZE, "%s\n", fs_info->super_copy->label);
+ char *label = fs_info->super_copy->label;
+ return snprintf(buf, PAGE_SIZE, label[0] ? "%s\n" : "%s", label);
}
static ssize_t btrfs_label_store(struct kobject *kobj,
@@ -374,8 +375,18 @@ static ssize_t btrfs_label_store(struct kobject *kobj,
struct btrfs_trans_handle *trans;
struct btrfs_root *root = fs_info->fs_root;
int ret;
+ size_t p_len;
- if (len >= BTRFS_LABEL_SIZE)
+ if (fs_info->sb->s_flags & MS_RDONLY)
+ return -EROFS;
+
+ /*
+ * p_len is the len until the first occurrence of either
+ * '\n' or '\0'
+ */
+ p_len = strcspn(buf, "\n");
+
+ if (p_len >= BTRFS_LABEL_SIZE)
return -EINVAL;
trans = btrfs_start_transaction(root, 0);
@@ -383,7 +394,8 @@ static ssize_t btrfs_label_store(struct kobject *kobj,
return PTR_ERR(trans);
spin_lock(&root->fs_info->super_lock);
- strcpy(fs_info->super_copy->label, buf);
+ memset(fs_info->super_copy->label, 0, BTRFS_LABEL_SIZE);
+ memcpy(fs_info->super_copy->label, buf, p_len);
spin_unlock(&root->fs_info->super_lock);
ret = btrfs_commit_transaction(trans, root);
@@ -392,14 +404,7 @@ static ssize_t btrfs_label_store(struct kobject *kobj,
return ret;
}
-BTRFS_ATTR_RW(label, 0644, btrfs_label_show, btrfs_label_store);
-
-static ssize_t btrfs_no_store(struct kobject *kobj,
- struct kobj_attribute *a,
- const char *buf, size_t len)
-{
- return -EPERM;
-}
+BTRFS_ATTR_RW(label, btrfs_label_show, btrfs_label_store);
static ssize_t btrfs_nodesize_show(struct kobject *kobj,
struct kobj_attribute *a, char *buf)
@@ -409,7 +414,7 @@ static ssize_t btrfs_nodesize_show(struct kobject *kobj,
return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->nodesize);
}
-BTRFS_ATTR_RW(nodesize, 0444, btrfs_nodesize_show, btrfs_no_store);
+BTRFS_ATTR(nodesize, btrfs_nodesize_show);
static ssize_t btrfs_sectorsize_show(struct kobject *kobj,
struct kobj_attribute *a, char *buf)
@@ -419,7 +424,7 @@ static ssize_t btrfs_sectorsize_show(struct kobject *kobj,
return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->sectorsize);
}
-BTRFS_ATTR_RW(sectorsize, 0444, btrfs_sectorsize_show, btrfs_no_store);
+BTRFS_ATTR(sectorsize, btrfs_sectorsize_show);
static ssize_t btrfs_clone_alignment_show(struct kobject *kobj,
struct kobj_attribute *a, char *buf)
@@ -429,7 +434,7 @@ static ssize_t btrfs_clone_alignment_show(struct kobject *kobj,
return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->sectorsize);
}
-BTRFS_ATTR_RW(clone_alignment, 0444, btrfs_clone_alignment_show, btrfs_no_store);
+BTRFS_ATTR(clone_alignment, btrfs_clone_alignment_show);
static struct attribute *btrfs_attrs[] = {
BTRFS_ATTR_PTR(label),
diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h
index ac46df3..f7dd298 100644
--- a/fs/btrfs/sysfs.h
+++ b/fs/btrfs/sysfs.h
@@ -20,16 +20,20 @@ enum btrfs_feature_set {
.store = _store, \
}
-#define BTRFS_ATTR_RW(_name, _mode, _show, _store) \
-static struct kobj_attribute btrfs_attr_##_name = \
- __INIT_KOBJ_ATTR(_name, _mode, _show, _store)
-#define BTRFS_ATTR(_name, _mode, _show) \
- BTRFS_ATTR_RW(_name, _mode, _show, NULL)
+#define BTRFS_ATTR_RW(_name, _show, _store) \
+ static struct kobj_attribute btrfs_attr_##_name = \
+ __INIT_KOBJ_ATTR(_name, 0644, _show, _store)
+
+#define BTRFS_ATTR(_name, _show) \
+ static struct kobj_attribute btrfs_attr_##_name = \
+ __INIT_KOBJ_ATTR(_name, 0444, _show, NULL)
+
#define BTRFS_ATTR_PTR(_name) (&btrfs_attr_##_name.attr)
#define BTRFS_RAID_ATTR(_name, _show) \
-static struct kobj_attribute btrfs_raid_attr_##_name = \
+ static struct kobj_attribute btrfs_raid_attr_##_name = \
__INIT_KOBJ_ATTR(_name, 0444, _show, NULL)
+
#define BTRFS_RAID_ATTR_PTR(_name) (&btrfs_raid_attr_##_name.attr)
diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c
index c8d9ddf..2299bfd 100644
--- a/fs/btrfs/tests/free-space-tests.c
+++ b/fs/btrfs/tests/free-space-tests.c
@@ -40,11 +40,12 @@ static struct btrfs_block_group_cache *init_test_block_group(void)
cache->key.offset = 1024 * 1024 * 1024;
cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
cache->sectorsize = 4096;
+ cache->full_stripe_len = 4096;
spin_lock_init(&cache->lock);
INIT_LIST_HEAD(&cache->list);
INIT_LIST_HEAD(&cache->cluster_list);
- INIT_LIST_HEAD(&cache->new_bg_list);
+ INIT_LIST_HEAD(&cache->bg_list);
btrfs_init_free_space_ctl(cache);
@@ -364,6 +365,517 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache)
return 0;
}
+/* Used by test_steal_space_from_bitmap_to_extent(). */
+static bool test_use_bitmap(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *info)
+{
+ return ctl->free_extents > 0;
+}
+
+/* Used by test_steal_space_from_bitmap_to_extent(). */
+static int
+check_num_extents_and_bitmaps(const struct btrfs_block_group_cache *cache,
+ const int num_extents,
+ const int num_bitmaps)
+{
+ if (cache->free_space_ctl->free_extents != num_extents) {
+ test_msg("Incorrect # of extent entries in the cache: %d, expected %d\n",
+ cache->free_space_ctl->free_extents, num_extents);
+ return -EINVAL;
+ }
+ if (cache->free_space_ctl->total_bitmaps != num_bitmaps) {
+ test_msg("Incorrect # of extent entries in the cache: %d, expected %d\n",
+ cache->free_space_ctl->total_bitmaps, num_bitmaps);
+ return -EINVAL;
+ }
+ return 0;
+}
+
+/* Used by test_steal_space_from_bitmap_to_extent(). */
+static int check_cache_empty(struct btrfs_block_group_cache *cache)
+{
+ u64 offset;
+ u64 max_extent_size;
+
+ /*
+ * Now lets confirm that there's absolutely no free space left to
+ * allocate.
+ */
+ if (cache->free_space_ctl->free_space != 0) {
+ test_msg("Cache free space is not 0\n");
+ return -EINVAL;
+ }
+
+ /* And any allocation request, no matter how small, should fail now. */
+ offset = btrfs_find_space_for_alloc(cache, 0, 4096, 0,
+ &max_extent_size);
+ if (offset != 0) {
+ test_msg("Space allocation did not fail, returned offset: %llu",
+ offset);
+ return -EINVAL;
+ }
+
+ /* And no extent nor bitmap entries in the cache anymore. */
+ return check_num_extents_and_bitmaps(cache, 0, 0);
+}
+
+/*
+ * Before we were able to steal free space from a bitmap entry to an extent
+ * entry, we could end up with 2 entries representing a contiguous free space.
+ * One would be an extent entry and the other a bitmap entry. Since in order
+ * to allocate space to a caller we use only 1 entry, we couldn't return that
+ * whole range to the caller if it was requested. This forced the caller to
+ * either assume ENOSPC or perform several smaller space allocations, which
+ * wasn't optimal as they could be spread all over the block group while under
+ * concurrency (extra overhead and fragmentation).
+ *
+ * This stealing approach is benefical, since we always prefer to allocate from
+ * extent entries, both for clustered and non-clustered allocation requests.
+ */
+static int
+test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
+{
+ int ret;
+ u64 offset;
+ u64 max_extent_size;
+
+ bool (*use_bitmap_op)(struct btrfs_free_space_ctl *,
+ struct btrfs_free_space *);
+
+ test_msg("Running space stealing from bitmap to extent\n");
+
+ /*
+ * For this test, we want to ensure we end up with an extent entry
+ * immediately adjacent to a bitmap entry, where the bitmap starts
+ * at an offset where the extent entry ends. We keep adding and
+ * removing free space to reach into this state, but to get there
+ * we need to reach a point where marking new free space doesn't
+ * result in adding new extent entries or merging the new space
+ * with existing extent entries - the space ends up being marked
+ * in an existing bitmap that covers the new free space range.
+ *
+ * To get there, we need to reach the threshold defined set at
+ * cache->free_space_ctl->extents_thresh, which currently is
+ * 256 extents on a x86_64 system at least, and a few other
+ * conditions (check free_space_cache.c). Instead of making the
+ * test much longer and complicated, use a "use_bitmap" operation
+ * that forces use of bitmaps as soon as we have at least 1
+ * extent entry.
+ */
+ use_bitmap_op = cache->free_space_ctl->op->use_bitmap;
+ cache->free_space_ctl->op->use_bitmap = test_use_bitmap;
+
+ /*
+ * Extent entry covering free space range [128Mb - 256Kb, 128Mb - 128Kb[
+ */
+ ret = test_add_free_space_entry(cache, 128 * 1024 * 1024 - 256 * 1024,
+ 128 * 1024, 0);
+ if (ret) {
+ test_msg("Couldn't add extent entry %d\n", ret);
+ return ret;
+ }
+
+ /* Bitmap entry covering free space range [128Mb + 512Kb, 256Mb[ */
+ ret = test_add_free_space_entry(cache, 128 * 1024 * 1024 + 512 * 1024,
+ 128 * 1024 * 1024 - 512 * 1024, 1);
+ if (ret) {
+ test_msg("Couldn't add bitmap entry %d\n", ret);
+ return ret;
+ }
+
+ ret = check_num_extents_and_bitmaps(cache, 2, 1);
+ if (ret)
+ return ret;
+
+ /*
+ * Now make only the first 256Kb of the bitmap marked as free, so that
+ * we end up with only the following ranges marked as free space:
+ *
+ * [128Mb - 256Kb, 128Mb - 128Kb[
+ * [128Mb + 512Kb, 128Mb + 768Kb[
+ */
+ ret = btrfs_remove_free_space(cache,
+ 128 * 1024 * 1024 + 768 * 1024,
+ 128 * 1024 * 1024 - 768 * 1024);
+ if (ret) {
+ test_msg("Failed to free part of bitmap space %d\n", ret);
+ return ret;
+ }
+
+ /* Confirm that only those 2 ranges are marked as free. */
+ if (!test_check_exists(cache, 128 * 1024 * 1024 - 256 * 1024,
+ 128 * 1024)) {
+ test_msg("Free space range missing\n");
+ return -ENOENT;
+ }
+ if (!test_check_exists(cache, 128 * 1024 * 1024 + 512 * 1024,
+ 256 * 1024)) {
+ test_msg("Free space range missing\n");
+ return -ENOENT;
+ }
+
+ /*
+ * Confirm that the bitmap range [128Mb + 768Kb, 256Mb[ isn't marked
+ * as free anymore.
+ */
+ if (test_check_exists(cache, 128 * 1024 * 1024 + 768 * 1024,
+ 128 * 1024 * 1024 - 768 * 1024)) {
+ test_msg("Bitmap region not removed from space cache\n");
+ return -EINVAL;
+ }
+
+ /*
+ * Confirm that the region [128Mb + 256Kb, 128Mb + 512Kb[, which is
+ * covered by the bitmap, isn't marked as free.
+ */
+ if (test_check_exists(cache, 128 * 1024 * 1024 + 256 * 1024,
+ 256 * 1024)) {
+ test_msg("Invalid bitmap region marked as free\n");
+ return -EINVAL;
+ }
+
+ /*
+ * Confirm that the region [128Mb, 128Mb + 256Kb[, which is covered
+ * by the bitmap too, isn't marked as free either.
+ */
+ if (test_check_exists(cache, 128 * 1024 * 1024,
+ 256 * 1024)) {
+ test_msg("Invalid bitmap region marked as free\n");
+ return -EINVAL;
+ }
+
+ /*
+ * Now lets mark the region [128Mb, 128Mb + 512Kb[ as free too. But,
+ * lets make sure the free space cache marks it as free in the bitmap,
+ * and doesn't insert a new extent entry to represent this region.
+ */
+ ret = btrfs_add_free_space(cache, 128 * 1024 * 1024, 512 * 1024);
+ if (ret) {
+ test_msg("Error adding free space: %d\n", ret);
+ return ret;
+ }
+ /* Confirm the region is marked as free. */
+ if (!test_check_exists(cache, 128 * 1024 * 1024, 512 * 1024)) {
+ test_msg("Bitmap region not marked as free\n");
+ return -ENOENT;
+ }
+
+ /*
+ * Confirm that no new extent entries or bitmap entries were added to
+ * the cache after adding that free space region.
+ */
+ ret = check_num_extents_and_bitmaps(cache, 2, 1);
+ if (ret)
+ return ret;
+
+ /*
+ * Now lets add a small free space region to the right of the previous
+ * one, which is not contiguous with it and is part of the bitmap too.
+ * The goal is to test that the bitmap entry space stealing doesn't
+ * steal this space region.
+ */
+ ret = btrfs_add_free_space(cache, 128 * 1024 * 1024 + 16 * 1024 * 1024,
+ 4096);
+ if (ret) {
+ test_msg("Error adding free space: %d\n", ret);
+ return ret;
+ }
+
+ /*
+ * Confirm that no new extent entries or bitmap entries were added to
+ * the cache after adding that free space region.
+ */
+ ret = check_num_extents_and_bitmaps(cache, 2, 1);
+ if (ret)
+ return ret;
+
+ /*
+ * Now mark the region [128Mb - 128Kb, 128Mb[ as free too. This will
+ * expand the range covered by the existing extent entry that represents
+ * the free space [128Mb - 256Kb, 128Mb - 128Kb[.
+ */
+ ret = btrfs_add_free_space(cache, 128 * 1024 * 1024 - 128 * 1024,
+ 128 * 1024);
+ if (ret) {
+ test_msg("Error adding free space: %d\n", ret);
+ return ret;
+ }
+ /* Confirm the region is marked as free. */
+ if (!test_check_exists(cache, 128 * 1024 * 1024 - 128 * 1024,
+ 128 * 1024)) {
+ test_msg("Extent region not marked as free\n");
+ return -ENOENT;
+ }
+
+ /*
+ * Confirm that our extent entry didn't stole all free space from the
+ * bitmap, because of the small 4Kb free space region.
+ */
+ ret = check_num_extents_and_bitmaps(cache, 2, 1);
+ if (ret)
+ return ret;
+
+ /*
+ * So now we have the range [128Mb - 256Kb, 128Mb + 768Kb[ as free
+ * space. Without stealing bitmap free space into extent entry space,
+ * we would have all this free space represented by 2 entries in the
+ * cache:
+ *
+ * extent entry covering range: [128Mb - 256Kb, 128Mb[
+ * bitmap entry covering range: [128Mb, 128Mb + 768Kb[
+ *
+ * Attempting to allocate the whole free space (1Mb) would fail, because
+ * we can't allocate from multiple entries.
+ * With the bitmap free space stealing, we get a single extent entry
+ * that represents the 1Mb free space, and therefore we're able to
+ * allocate the whole free space at once.
+ */
+ if (!test_check_exists(cache, 128 * 1024 * 1024 - 256 * 1024,
+ 1 * 1024 * 1024)) {
+ test_msg("Expected region not marked as free\n");
+ return -ENOENT;
+ }
+
+ if (cache->free_space_ctl->free_space != (1 * 1024 * 1024 + 4096)) {
+ test_msg("Cache free space is not 1Mb + 4Kb\n");
+ return -EINVAL;
+ }
+
+ offset = btrfs_find_space_for_alloc(cache,
+ 0, 1 * 1024 * 1024, 0,
+ &max_extent_size);
+ if (offset != (128 * 1024 * 1024 - 256 * 1024)) {
+ test_msg("Failed to allocate 1Mb from space cache, returned offset is: %llu\n",
+ offset);
+ return -EINVAL;
+ }
+
+ /* All that remains is a 4Kb free space region in a bitmap. Confirm. */
+ ret = check_num_extents_and_bitmaps(cache, 1, 1);
+ if (ret)
+ return ret;
+
+ if (cache->free_space_ctl->free_space != 4096) {
+ test_msg("Cache free space is not 4Kb\n");
+ return -EINVAL;
+ }
+
+ offset = btrfs_find_space_for_alloc(cache,
+ 0, 4096, 0,
+ &max_extent_size);
+ if (offset != (128 * 1024 * 1024 + 16 * 1024 * 1024)) {
+ test_msg("Failed to allocate 4Kb from space cache, returned offset is: %llu\n",
+ offset);
+ return -EINVAL;
+ }
+
+ ret = check_cache_empty(cache);
+ if (ret)
+ return ret;
+
+ __btrfs_remove_free_space_cache(cache->free_space_ctl);
+
+ /*
+ * Now test a similar scenario, but where our extent entry is located
+ * to the right of the bitmap entry, so that we can check that stealing
+ * space from a bitmap to the front of an extent entry works.
+ */
+
+ /*
+ * Extent entry covering free space range [128Mb + 128Kb, 128Mb + 256Kb[
+ */
+ ret = test_add_free_space_entry(cache, 128 * 1024 * 1024 + 128 * 1024,
+ 128 * 1024, 0);
+ if (ret) {
+ test_msg("Couldn't add extent entry %d\n", ret);
+ return ret;
+ }
+
+ /* Bitmap entry covering free space range [0, 128Mb - 512Kb[ */
+ ret = test_add_free_space_entry(cache, 0,
+ 128 * 1024 * 1024 - 512 * 1024, 1);
+ if (ret) {
+ test_msg("Couldn't add bitmap entry %d\n", ret);
+ return ret;
+ }
+
+ ret = check_num_extents_and_bitmaps(cache, 2, 1);
+ if (ret)
+ return ret;
+
+ /*
+ * Now make only the last 256Kb of the bitmap marked as free, so that
+ * we end up with only the following ranges marked as free space:
+ *
+ * [128Mb + 128b, 128Mb + 256Kb[
+ * [128Mb - 768Kb, 128Mb - 512Kb[
+ */
+ ret = btrfs_remove_free_space(cache,
+ 0,
+ 128 * 1024 * 1024 - 768 * 1024);
+ if (ret) {
+ test_msg("Failed to free part of bitmap space %d\n", ret);
+ return ret;
+ }
+
+ /* Confirm that only those 2 ranges are marked as free. */
+ if (!test_check_exists(cache, 128 * 1024 * 1024 + 128 * 1024,
+ 128 * 1024)) {
+ test_msg("Free space range missing\n");
+ return -ENOENT;
+ }
+ if (!test_check_exists(cache, 128 * 1024 * 1024 - 768 * 1024,
+ 256 * 1024)) {
+ test_msg("Free space range missing\n");
+ return -ENOENT;
+ }
+
+ /*
+ * Confirm that the bitmap range [0, 128Mb - 768Kb[ isn't marked
+ * as free anymore.
+ */
+ if (test_check_exists(cache, 0,
+ 128 * 1024 * 1024 - 768 * 1024)) {
+ test_msg("Bitmap region not removed from space cache\n");
+ return -EINVAL;
+ }
+
+ /*
+ * Confirm that the region [128Mb - 512Kb, 128Mb[, which is
+ * covered by the bitmap, isn't marked as free.
+ */
+ if (test_check_exists(cache, 128 * 1024 * 1024 - 512 * 1024,
+ 512 * 1024)) {
+ test_msg("Invalid bitmap region marked as free\n");
+ return -EINVAL;
+ }
+
+ /*
+ * Now lets mark the region [128Mb - 512Kb, 128Mb[ as free too. But,
+ * lets make sure the free space cache marks it as free in the bitmap,
+ * and doesn't insert a new extent entry to represent this region.
+ */
+ ret = btrfs_add_free_space(cache, 128 * 1024 * 1024 - 512 * 1024,
+ 512 * 1024);
+ if (ret) {
+ test_msg("Error adding free space: %d\n", ret);
+ return ret;
+ }
+ /* Confirm the region is marked as free. */
+ if (!test_check_exists(cache, 128 * 1024 * 1024 - 512 * 1024,
+ 512 * 1024)) {
+ test_msg("Bitmap region not marked as free\n");
+ return -ENOENT;
+ }
+
+ /*
+ * Confirm that no new extent entries or bitmap entries were added to
+ * the cache after adding that free space region.
+ */
+ ret = check_num_extents_and_bitmaps(cache, 2, 1);
+ if (ret)
+ return ret;
+
+ /*
+ * Now lets add a small free space region to the left of the previous
+ * one, which is not contiguous with it and is part of the bitmap too.
+ * The goal is to test that the bitmap entry space stealing doesn't
+ * steal this space region.
+ */
+ ret = btrfs_add_free_space(cache, 32 * 1024 * 1024, 8192);
+ if (ret) {
+ test_msg("Error adding free space: %d\n", ret);
+ return ret;
+ }
+
+ /*
+ * Now mark the region [128Mb, 128Mb + 128Kb[ as free too. This will
+ * expand the range covered by the existing extent entry that represents
+ * the free space [128Mb + 128Kb, 128Mb + 256Kb[.
+ */
+ ret = btrfs_add_free_space(cache, 128 * 1024 * 1024, 128 * 1024);
+ if (ret) {
+ test_msg("Error adding free space: %d\n", ret);
+ return ret;
+ }
+ /* Confirm the region is marked as free. */
+ if (!test_check_exists(cache, 128 * 1024 * 1024, 128 * 1024)) {
+ test_msg("Extent region not marked as free\n");
+ return -ENOENT;
+ }
+
+ /*
+ * Confirm that our extent entry didn't stole all free space from the
+ * bitmap, because of the small 8Kb free space region.
+ */
+ ret = check_num_extents_and_bitmaps(cache, 2, 1);
+ if (ret)
+ return ret;
+
+ /*
+ * So now we have the range [128Mb - 768Kb, 128Mb + 256Kb[ as free
+ * space. Without stealing bitmap free space into extent entry space,
+ * we would have all this free space represented by 2 entries in the
+ * cache:
+ *
+ * extent entry covering range: [128Mb, 128Mb + 256Kb[
+ * bitmap entry covering range: [128Mb - 768Kb, 128Mb[
+ *
+ * Attempting to allocate the whole free space (1Mb) would fail, because
+ * we can't allocate from multiple entries.
+ * With the bitmap free space stealing, we get a single extent entry
+ * that represents the 1Mb free space, and therefore we're able to
+ * allocate the whole free space at once.
+ */
+ if (!test_check_exists(cache, 128 * 1024 * 1024 - 768 * 1024,
+ 1 * 1024 * 1024)) {
+ test_msg("Expected region not marked as free\n");
+ return -ENOENT;
+ }
+
+ if (cache->free_space_ctl->free_space != (1 * 1024 * 1024 + 8192)) {
+ test_msg("Cache free space is not 1Mb + 8Kb\n");
+ return -EINVAL;
+ }
+
+ offset = btrfs_find_space_for_alloc(cache,
+ 0, 1 * 1024 * 1024, 0,
+ &max_extent_size);
+ if (offset != (128 * 1024 * 1024 - 768 * 1024)) {
+ test_msg("Failed to allocate 1Mb from space cache, returned offset is: %llu\n",
+ offset);
+ return -EINVAL;
+ }
+
+ /* All that remains is a 8Kb free space region in a bitmap. Confirm. */
+ ret = check_num_extents_and_bitmaps(cache, 1, 1);
+ if (ret)
+ return ret;
+
+ if (cache->free_space_ctl->free_space != 8192) {
+ test_msg("Cache free space is not 8Kb\n");
+ return -EINVAL;
+ }
+
+ offset = btrfs_find_space_for_alloc(cache,
+ 0, 8192, 0,
+ &max_extent_size);
+ if (offset != (32 * 1024 * 1024)) {
+ test_msg("Failed to allocate 8Kb from space cache, returned offset is: %llu\n",
+ offset);
+ return -EINVAL;
+ }
+
+ ret = check_cache_empty(cache);
+ if (ret)
+ return ret;
+
+ cache->free_space_ctl->op->use_bitmap = use_bitmap_op;
+ __btrfs_remove_free_space_cache(cache->free_space_ctl);
+
+ return 0;
+}
+
int btrfs_test_free_space_cache(void)
{
struct btrfs_block_group_cache *cache;
@@ -386,6 +898,8 @@ int btrfs_test_free_space_cache(void)
ret = test_bitmaps_and_extents(cache);
if (ret)
goto out;
+
+ ret = test_steal_space_from_bitmap_to_extent(cache);
out:
__btrfs_remove_free_space_cache(cache->free_space_ctl);
kfree(cache->free_space_ctl);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index d89c6d3..dcaae36 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -386,7 +386,7 @@ start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type,
int ret;
/* Send isn't supposed to start transactions. */
- ASSERT(current->journal_info != (void *)BTRFS_SEND_TRANS_STUB);
+ ASSERT(current->journal_info != BTRFS_SEND_TRANS_STUB);
if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
return ERR_PTR(-EROFS);
@@ -408,7 +408,7 @@ start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type,
if (num_items > 0 && root != root->fs_info->chunk_root) {
if (root->fs_info->quota_enabled &&
is_fstree(root->root_key.objectid)) {
- qgroup_reserved = num_items * root->leafsize;
+ qgroup_reserved = num_items * root->nodesize;
ret = btrfs_qgroup_reserve(root, qgroup_reserved);
if (ret)
return ERR_PTR(ret);
@@ -418,7 +418,7 @@ start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type,
/*
* Do the reservation for the relocation root creation
*/
- if (unlikely(need_reserve_reloc_root(root))) {
+ if (need_reserve_reloc_root(root)) {
num_bytes += root->nodesize;
reloc_reserved = true;
}
@@ -609,7 +609,6 @@ int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
if (transid <= root->fs_info->last_trans_committed)
goto out;
- ret = -EINVAL;
/* find specified transaction */
spin_lock(&root->fs_info->trans_lock);
list_for_each_entry(t, &root->fs_info->trans_list, list) {
@@ -625,9 +624,16 @@ int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
}
}
spin_unlock(&root->fs_info->trans_lock);
- /* The specified transaction doesn't exist */
- if (!cur_trans)
+
+ /*
+ * The specified transaction doesn't exist, or we
+ * raced with btrfs_commit_transaction
+ */
+ if (!cur_trans) {
+ if (transid > root->fs_info->last_trans_committed)
+ ret = -EINVAL;
goto out;
+ }
} else {
/* find newest transaction that is committing | committed */
spin_lock(&root->fs_info->trans_lock);
@@ -851,6 +857,8 @@ int btrfs_wait_marked_extents(struct btrfs_root *root,
struct extent_state *cached_state = NULL;
u64 start = 0;
u64 end;
+ struct btrfs_inode *btree_ino = BTRFS_I(root->fs_info->btree_inode);
+ bool errors = false;
while (!find_first_extent_bit(dirty_pages, start, &start, &end,
EXTENT_NEED_WAIT, &cached_state)) {
@@ -864,6 +872,26 @@ int btrfs_wait_marked_extents(struct btrfs_root *root,
}
if (err)
werr = err;
+
+ if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
+ if ((mark & EXTENT_DIRTY) &&
+ test_and_clear_bit(BTRFS_INODE_BTREE_LOG1_ERR,
+ &btree_ino->runtime_flags))
+ errors = true;
+
+ if ((mark & EXTENT_NEW) &&
+ test_and_clear_bit(BTRFS_INODE_BTREE_LOG2_ERR,
+ &btree_ino->runtime_flags))
+ errors = true;
+ } else {
+ if (test_and_clear_bit(BTRFS_INODE_BTREE_ERR,
+ &btree_ino->runtime_flags))
+ errors = true;
+ }
+
+ if (errors && !werr)
+ werr = -EIO;
+
return werr;
}
@@ -1629,6 +1657,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
{
struct btrfs_transaction *cur_trans = trans->transaction;
struct btrfs_transaction *prev_trans = NULL;
+ struct btrfs_inode *btree_ino = BTRFS_I(root->fs_info->btree_inode);
int ret;
/* Stop the commit early if ->aborted is set */
@@ -1868,6 +1897,12 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
memcpy(root->fs_info->super_for_commit, root->fs_info->super_copy,
sizeof(*root->fs_info->super_copy));
+ btrfs_update_commit_device_size(root->fs_info);
+ btrfs_update_commit_device_bytes_used(root, cur_trans);
+
+ clear_bit(BTRFS_INODE_BTREE_LOG1_ERR, &btree_ino->runtime_flags);
+ clear_bit(BTRFS_INODE_BTREE_LOG2_ERR, &btree_ino->runtime_flags);
+
spin_lock(&root->fs_info->trans_lock);
cur_trans->state = TRANS_STATE_UNBLOCKED;
root->fs_info->running_transaction = NULL;
@@ -1981,9 +2016,6 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root)
ret = btrfs_drop_snapshot(root, NULL, 0, 0);
else
ret = btrfs_drop_snapshot(root, NULL, 1, 0);
- /*
- * If we encounter a transaction abort during snapshot cleaning, we
- * don't want to crash here
- */
+
return (ret < 0) ? 0 : 1;
}
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 579be51..d8f40e1 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -79,7 +79,7 @@ struct btrfs_transaction {
#define TRANS_EXTWRITERS (__TRANS_USERSPACE | __TRANS_START | \
__TRANS_ATTACH)
-#define BTRFS_SEND_TRANS_STUB 1
+#define BTRFS_SEND_TRANS_STUB ((void *)1)
struct btrfs_trans_handle {
u64 transid;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index d296efe..1475979 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -97,7 +97,8 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct inode *inode,
int inode_only,
const loff_t start,
- const loff_t end);
+ const loff_t end,
+ struct btrfs_log_ctx *ctx);
static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path, u64 objectid);
@@ -1498,7 +1499,7 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
return -EIO;
key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
- btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
+ key.type = BTRFS_ORPHAN_ITEM_KEY;
key.offset = objectid;
ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
@@ -1637,6 +1638,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
found_key.type == log_key.type &&
found_key.offset == log_key.offset &&
btrfs_dir_type(path->nodes[0], dst_di) == log_type) {
+ update_size = false;
goto out;
}
@@ -2157,7 +2159,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
- blocksize = btrfs_level_size(root, *level - 1);
+ blocksize = root->nodesize;
parent = path->nodes[*level];
root_owner = btrfs_header_owner(parent);
@@ -2983,8 +2985,6 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
min_key.type = key_type;
min_key.offset = min_offset;
- path->keep_locks = 1;
-
ret = btrfs_search_forward(root, &min_key, path, trans->transid);
/*
@@ -3364,7 +3364,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
* or deletes of this inode don't have to relog the inode
* again
*/
- if (btrfs_key_type(ins_keys + i) == BTRFS_EXTENT_DATA_KEY &&
+ if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY &&
!skip_csum) {
int found_type;
extent = btrfs_item_ptr(src, start_slot + i,
@@ -3573,107 +3573,33 @@ static int extent_cmp(void *priv, struct list_head *a, struct list_head *b)
return 0;
}
-static int log_one_extent(struct btrfs_trans_handle *trans,
- struct inode *inode, struct btrfs_root *root,
- struct extent_map *em, struct btrfs_path *path,
- struct list_head *logged_list)
+static int wait_ordered_extents(struct btrfs_trans_handle *trans,
+ struct inode *inode,
+ struct btrfs_root *root,
+ const struct extent_map *em,
+ const struct list_head *logged_list,
+ bool *ordered_io_error)
{
- struct btrfs_root *log = root->log_root;
- struct btrfs_file_extent_item *fi;
- struct extent_buffer *leaf;
struct btrfs_ordered_extent *ordered;
- struct list_head ordered_sums;
- struct btrfs_map_token token;
- struct btrfs_key key;
+ struct btrfs_root *log = root->log_root;
u64 mod_start = em->mod_start;
u64 mod_len = em->mod_len;
+ const bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
u64 csum_offset;
u64 csum_len;
- u64 extent_offset = em->start - em->orig_start;
- u64 block_len;
- int ret;
- bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
- int extent_inserted = 0;
-
- INIT_LIST_HEAD(&ordered_sums);
- btrfs_init_map_token(&token);
-
- ret = __btrfs_drop_extents(trans, log, inode, path, em->start,
- em->start + em->len, NULL, 0, 1,
- sizeof(*fi), &extent_inserted);
- if (ret)
- return ret;
-
- if (!extent_inserted) {
- key.objectid = btrfs_ino(inode);
- key.type = BTRFS_EXTENT_DATA_KEY;
- key.offset = em->start;
-
- ret = btrfs_insert_empty_item(trans, log, path, &key,
- sizeof(*fi));
- if (ret)
- return ret;
- }
- leaf = path->nodes[0];
- fi = btrfs_item_ptr(leaf, path->slots[0],
- struct btrfs_file_extent_item);
-
- btrfs_set_token_file_extent_generation(leaf, fi, em->generation,
- &token);
- if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
- skip_csum = true;
- btrfs_set_token_file_extent_type(leaf, fi,
- BTRFS_FILE_EXTENT_PREALLOC,
- &token);
- } else {
- btrfs_set_token_file_extent_type(leaf, fi,
- BTRFS_FILE_EXTENT_REG,
- &token);
- if (em->block_start == EXTENT_MAP_HOLE)
- skip_csum = true;
- }
-
- block_len = max(em->block_len, em->orig_block_len);
- if (em->compress_type != BTRFS_COMPRESS_NONE) {
- btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
- em->block_start,
- &token);
- btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
- &token);
- } else if (em->block_start < EXTENT_MAP_LAST_BYTE) {
- btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
- em->block_start -
- extent_offset, &token);
- btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
- &token);
- } else {
- btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 0, &token);
- btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, 0,
- &token);
- }
-
- btrfs_set_token_file_extent_offset(leaf, fi,
- em->start - em->orig_start,
- &token);
- btrfs_set_token_file_extent_num_bytes(leaf, fi, em->len, &token);
- btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->ram_bytes, &token);
- btrfs_set_token_file_extent_compression(leaf, fi, em->compress_type,
- &token);
- btrfs_set_token_file_extent_encryption(leaf, fi, 0, &token);
- btrfs_set_token_file_extent_other_encoding(leaf, fi, 0, &token);
- btrfs_mark_buffer_dirty(leaf);
+ LIST_HEAD(ordered_sums);
+ int ret = 0;
- btrfs_release_path(path);
- if (ret) {
- return ret;
- }
+ *ordered_io_error = false;
- if (skip_csum)
+ if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
+ em->block_start == EXTENT_MAP_HOLE)
return 0;
/*
- * First check and see if our csums are on our outstanding ordered
- * extents.
+ * Wait far any ordered extent that covers our extent map. If it
+ * finishes without an error, first check and see if our csums are on
+ * our outstanding ordered extents.
*/
list_for_each_entry(ordered, logged_list, log_list) {
struct btrfs_ordered_sum *sum;
@@ -3685,6 +3611,24 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
mod_start + mod_len <= ordered->file_offset)
continue;
+ if (!test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) &&
+ !test_bit(BTRFS_ORDERED_IOERR, &ordered->flags) &&
+ !test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) {
+ const u64 start = ordered->file_offset;
+ const u64 end = ordered->file_offset + ordered->len - 1;
+
+ WARN_ON(ordered->inode != inode);
+ filemap_fdatawrite_range(inode->i_mapping, start, end);
+ }
+
+ wait_event(ordered->wait,
+ (test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) ||
+ test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)));
+
+ if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)) {
+ *ordered_io_error = true;
+ break;
+ }
/*
* We are going to copy all the csums on this ordered extent, so
* go ahead and adjust mod_start and mod_len in case this
@@ -3716,6 +3660,9 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
}
}
+ if (skip_csum)
+ continue;
+
/*
* To keep us from looping for the above case of an ordered
* extent that falls inside of the logged extent.
@@ -3733,18 +3680,16 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
list_for_each_entry(sum, &ordered->list, list) {
ret = btrfs_csum_file_blocks(trans, log, sum);
if (ret)
- goto unlocked;
+ break;
}
-
}
-unlocked:
- if (!mod_len || ret)
+ if (*ordered_io_error || !mod_len || ret || skip_csum)
return ret;
if (em->compress_type) {
csum_offset = 0;
- csum_len = block_len;
+ csum_len = max(em->block_len, em->orig_block_len);
} else {
csum_offset = mod_start - em->start;
csum_len = mod_len;
@@ -3771,11 +3716,106 @@ unlocked:
return ret;
}
+static int log_one_extent(struct btrfs_trans_handle *trans,
+ struct inode *inode, struct btrfs_root *root,
+ const struct extent_map *em,
+ struct btrfs_path *path,
+ const struct list_head *logged_list,
+ struct btrfs_log_ctx *ctx)
+{
+ struct btrfs_root *log = root->log_root;
+ struct btrfs_file_extent_item *fi;
+ struct extent_buffer *leaf;
+ struct btrfs_map_token token;
+ struct btrfs_key key;
+ u64 extent_offset = em->start - em->orig_start;
+ u64 block_len;
+ int ret;
+ int extent_inserted = 0;
+ bool ordered_io_err = false;
+
+ ret = wait_ordered_extents(trans, inode, root, em, logged_list,
+ &ordered_io_err);
+ if (ret)
+ return ret;
+
+ if (ordered_io_err) {
+ ctx->io_err = -EIO;
+ return 0;
+ }
+
+ btrfs_init_map_token(&token);
+
+ ret = __btrfs_drop_extents(trans, log, inode, path, em->start,
+ em->start + em->len, NULL, 0, 1,
+ sizeof(*fi), &extent_inserted);
+ if (ret)
+ return ret;
+
+ if (!extent_inserted) {
+ key.objectid = btrfs_ino(inode);
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = em->start;
+
+ ret = btrfs_insert_empty_item(trans, log, path, &key,
+ sizeof(*fi));
+ if (ret)
+ return ret;
+ }
+ leaf = path->nodes[0];
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+
+ btrfs_set_token_file_extent_generation(leaf, fi, em->generation,
+ &token);
+ if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+ btrfs_set_token_file_extent_type(leaf, fi,
+ BTRFS_FILE_EXTENT_PREALLOC,
+ &token);
+ else
+ btrfs_set_token_file_extent_type(leaf, fi,
+ BTRFS_FILE_EXTENT_REG,
+ &token);
+
+ block_len = max(em->block_len, em->orig_block_len);
+ if (em->compress_type != BTRFS_COMPRESS_NONE) {
+ btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
+ em->block_start,
+ &token);
+ btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
+ &token);
+ } else if (em->block_start < EXTENT_MAP_LAST_BYTE) {
+ btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
+ em->block_start -
+ extent_offset, &token);
+ btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
+ &token);
+ } else {
+ btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 0, &token);
+ btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, 0,
+ &token);
+ }
+
+ btrfs_set_token_file_extent_offset(leaf, fi, extent_offset, &token);
+ btrfs_set_token_file_extent_num_bytes(leaf, fi, em->len, &token);
+ btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->ram_bytes, &token);
+ btrfs_set_token_file_extent_compression(leaf, fi, em->compress_type,
+ &token);
+ btrfs_set_token_file_extent_encryption(leaf, fi, 0, &token);
+ btrfs_set_token_file_extent_other_encoding(leaf, fi, 0, &token);
+ btrfs_mark_buffer_dirty(leaf);
+
+ btrfs_release_path(path);
+
+ return ret;
+}
+
static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct inode *inode,
struct btrfs_path *path,
- struct list_head *logged_list)
+ struct list_head *logged_list,
+ struct btrfs_log_ctx *ctx)
{
struct extent_map *em, *n;
struct list_head extents;
@@ -3833,7 +3873,8 @@ process:
write_unlock(&tree->lock);
- ret = log_one_extent(trans, inode, root, em, path, logged_list);
+ ret = log_one_extent(trans, inode, root, em, path, logged_list,
+ ctx);
write_lock(&tree->lock);
clear_em_logging(tree, em);
free_extent_map(em);
@@ -3863,7 +3904,8 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct inode *inode,
int inode_only,
const loff_t start,
- const loff_t end)
+ const loff_t end,
+ struct btrfs_log_ctx *ctx)
{
struct btrfs_path *path;
struct btrfs_path *dst_path;
@@ -3964,7 +4006,6 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
err = ret;
goto out_unlock;
}
- path->keep_locks = 1;
while (1) {
ins_nr = 0;
@@ -3994,7 +4035,8 @@ again:
if (ret < 0) {
err = ret;
goto out_unlock;
- } if (ret) {
+ }
+ if (ret) {
ins_nr = 0;
btrfs_release_path(path);
continue;
@@ -4048,7 +4090,7 @@ log_extents:
btrfs_release_path(dst_path);
if (fast_search) {
ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
- &logged_list);
+ &logged_list, ctx);
if (ret) {
err = ret;
goto out_unlock;
@@ -4093,18 +4135,8 @@ log_extents:
}
}
- write_lock(&em_tree->lock);
- /*
- * If we're doing a ranged fsync and there are still modified extents
- * in the list, we must run on the next fsync call as it might cover
- * those extents (a full fsync or an fsync for other range).
- */
- if (list_empty(&em_tree->modified_extents)) {
- BTRFS_I(inode)->logged_trans = trans->transid;
- BTRFS_I(inode)->last_log_commit =
- BTRFS_I(inode)->last_sub_trans;
- }
- write_unlock(&em_tree->lock);
+ BTRFS_I(inode)->logged_trans = trans->transid;
+ BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans;
out_unlock:
if (unlikely(err))
btrfs_put_logged_extents(&logged_list);
@@ -4248,7 +4280,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
if (ret)
goto end_no_trans;
- ret = btrfs_log_inode(trans, root, inode, inode_only, start, end);
+ ret = btrfs_log_inode(trans, root, inode, inode_only, start, end, ctx);
if (ret)
goto end_trans;
@@ -4277,7 +4309,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
if (BTRFS_I(inode)->generation >
root->fs_info->last_trans_committed) {
ret = btrfs_log_inode(trans, root, inode, inode_only,
- 0, LLONG_MAX);
+ 0, LLONG_MAX, ctx);
if (ret)
goto end_trans;
}
@@ -4369,7 +4401,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
again:
key.objectid = BTRFS_TREE_LOG_OBJECTID;
key.offset = (u64)-1;
- btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
+ key.type = BTRFS_ROOT_ITEM_KEY;
while (1) {
ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index e2e798a..154990c 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -28,6 +28,7 @@
struct btrfs_log_ctx {
int log_ret;
int log_transid;
+ int io_err;
struct list_head list;
};
@@ -35,6 +36,7 @@ static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx)
{
ctx->log_ret = 0;
ctx->log_transid = 0;
+ ctx->io_err = 0;
INIT_LIST_HEAD(&ctx->list);
}
diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c
index f6a4c03..7782829 100644
--- a/fs/btrfs/uuid-tree.c
+++ b/fs/btrfs/uuid-tree.c
@@ -279,7 +279,6 @@ int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info,
key.offset = 0;
again_search_slot:
- path->keep_locks = 1;
ret = btrfs_search_forward(root, &key, path, 0);
if (ret) {
if (ret > 0)
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 340a92d..d47289c 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -50,7 +50,7 @@ static void __btrfs_reset_dev_stats(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
-static DEFINE_MUTEX(uuid_mutex);
+DEFINE_MUTEX(uuid_mutex);
static LIST_HEAD(fs_uuids);
static void lock_chunks(struct btrfs_root *root)
@@ -74,6 +74,7 @@ static struct btrfs_fs_devices *__alloc_fs_devices(void)
mutex_init(&fs_devs->device_list_mutex);
INIT_LIST_HEAD(&fs_devs->devices);
+ INIT_LIST_HEAD(&fs_devs->resized_devices);
INIT_LIST_HEAD(&fs_devs->alloc_list);
INIT_LIST_HEAD(&fs_devs->list);
@@ -154,11 +155,13 @@ static struct btrfs_device *__alloc_device(void)
INIT_LIST_HEAD(&dev->dev_list);
INIT_LIST_HEAD(&dev->dev_alloc_list);
+ INIT_LIST_HEAD(&dev->resized_list);
spin_lock_init(&dev->io_lock);
spin_lock_init(&dev->reada_lock);
atomic_set(&dev->reada_in_flight, 0);
+ atomic_set(&dev->dev_stats_ccnt, 0);
INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_WAIT);
INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_WAIT);
@@ -474,14 +477,13 @@ static noinline int device_list_add(const char *path,
return PTR_ERR(fs_devices);
list_add(&fs_devices->list, &fs_uuids);
- fs_devices->latest_devid = devid;
- fs_devices->latest_trans = found_transid;
device = NULL;
} else {
device = __find_device(&fs_devices->devices, devid,
disk_super->dev_item.uuid);
}
+
if (!device) {
if (fs_devices->opened)
return -EBUSY;
@@ -529,12 +531,12 @@ static noinline int device_list_add(const char *path,
*/
/*
- * As of now don't allow update to btrfs_fs_device through
- * the btrfs dev scan cli, after FS has been mounted.
+ * For now, we do allow update to btrfs_fs_device through the
+ * btrfs dev scan cli after FS has been mounted. We're still
+ * tracking a problem where systems fail mount by subvolume id
+ * when we reject replacement on a mounted FS.
*/
- if (fs_devices->opened) {
- return -EBUSY;
- } else {
+ if (!fs_devices->opened && found_transid < device->generation) {
/*
* That is if the FS is _not_ mounted and if you
* are here, that means there is more than one
@@ -542,8 +544,7 @@ static noinline int device_list_add(const char *path,
* with larger generation number or the last-in if
* generation are equal.
*/
- if (found_transid < device->generation)
- return -EEXIST;
+ return -EEXIST;
}
name = rcu_string_strdup(path, GFP_NOFS);
@@ -566,10 +567,6 @@ static noinline int device_list_add(const char *path,
if (!fs_devices->opened)
device->generation = found_transid;
- if (found_transid > fs_devices->latest_trans) {
- fs_devices->latest_devid = devid;
- fs_devices->latest_trans = found_transid;
- }
*fs_devices_ret = fs_devices;
return ret;
@@ -585,8 +582,7 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
if (IS_ERR(fs_devices))
return fs_devices;
- fs_devices->latest_devid = orig->latest_devid;
- fs_devices->latest_trans = orig->latest_trans;
+ mutex_lock(&orig->device_list_mutex);
fs_devices->total_devices = orig->total_devices;
/* We have held the volume lock, it is safe to get the devices. */
@@ -615,8 +611,10 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
device->fs_devices = fs_devices;
fs_devices->num_devices++;
}
+ mutex_unlock(&orig->device_list_mutex);
return fs_devices;
error:
+ mutex_unlock(&orig->device_list_mutex);
free_fs_devices(fs_devices);
return ERR_PTR(-ENOMEM);
}
@@ -625,10 +623,7 @@ void btrfs_close_extra_devices(struct btrfs_fs_info *fs_info,
struct btrfs_fs_devices *fs_devices, int step)
{
struct btrfs_device *device, *next;
-
- struct block_device *latest_bdev = NULL;
- u64 latest_devid = 0;
- u64 latest_transid = 0;
+ struct btrfs_device *latest_dev = NULL;
mutex_lock(&uuid_mutex);
again:
@@ -636,11 +631,9 @@ again:
list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
if (device->in_fs_metadata) {
if (!device->is_tgtdev_for_dev_replace &&
- (!latest_transid ||
- device->generation > latest_transid)) {
- latest_devid = device->devid;
- latest_transid = device->generation;
- latest_bdev = device->bdev;
+ (!latest_dev ||
+ device->generation > latest_dev->generation)) {
+ latest_dev = device;
}
continue;
}
@@ -682,9 +675,7 @@ again:
goto again;
}
- fs_devices->latest_bdev = latest_bdev;
- fs_devices->latest_devid = latest_devid;
- fs_devices->latest_trans = latest_transid;
+ fs_devices->latest_bdev = latest_dev->bdev;
mutex_unlock(&uuid_mutex);
}
@@ -733,8 +724,6 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
fs_devices->rw_devices--;
}
- if (device->can_discard)
- fs_devices->num_can_discard--;
if (device->missing)
fs_devices->missing_devices--;
@@ -799,11 +788,9 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
struct block_device *bdev;
struct list_head *head = &fs_devices->devices;
struct btrfs_device *device;
- struct block_device *latest_bdev = NULL;
+ struct btrfs_device *latest_dev = NULL;
struct buffer_head *bh;
struct btrfs_super_block *disk_super;
- u64 latest_devid = 0;
- u64 latest_transid = 0;
u64 devid;
int seeding = 1;
int ret = 0;
@@ -831,11 +818,9 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
goto error_brelse;
device->generation = btrfs_super_generation(disk_super);
- if (!latest_transid || device->generation > latest_transid) {
- latest_devid = devid;
- latest_transid = device->generation;
- latest_bdev = bdev;
- }
+ if (!latest_dev ||
+ device->generation > latest_dev->generation)
+ latest_dev = device;
if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
device->writeable = 0;
@@ -845,10 +830,8 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
}
q = bdev_get_queue(bdev);
- if (blk_queue_discard(q)) {
+ if (blk_queue_discard(q))
device->can_discard = 1;
- fs_devices->num_can_discard++;
- }
device->bdev = bdev;
device->in_fs_metadata = 0;
@@ -878,9 +861,7 @@ error_brelse:
}
fs_devices->seeding = seeding;
fs_devices->opened = 1;
- fs_devices->latest_bdev = latest_bdev;
- fs_devices->latest_devid = latest_devid;
- fs_devices->latest_trans = latest_transid;
+ fs_devices->latest_bdev = latest_dev->bdev;
fs_devices->total_rw_bytes = 0;
out:
return ret;
@@ -1054,7 +1035,7 @@ int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
if (key.objectid > device->devid)
break;
- if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
+ if (key.type != BTRFS_DEV_EXTENT_KEY)
goto next;
dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
@@ -1206,7 +1187,7 @@ again:
if (key.objectid > device->devid)
break;
- if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
+ if (key.type != BTRFS_DEV_EXTENT_KEY)
goto next;
if (key.offset > search_start) {
@@ -1285,7 +1266,7 @@ out:
static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
struct btrfs_device *device,
- u64 start)
+ u64 start, u64 *dev_extent_len)
{
int ret;
struct btrfs_path *path;
@@ -1327,13 +1308,8 @@ again:
goto out;
}
- if (device->bytes_used > 0) {
- u64 len = btrfs_dev_extent_length(leaf, extent);
- device->bytes_used -= len;
- spin_lock(&root->fs_info->free_chunk_lock);
- root->fs_info->free_chunk_space += len;
- spin_unlock(&root->fs_info->free_chunk_lock);
- }
+ *dev_extent_len = btrfs_dev_extent_length(leaf, extent);
+
ret = btrfs_del_item(trans, root, path);
if (ret) {
btrfs_error(root->fs_info, ret,
@@ -1483,8 +1459,10 @@ static int btrfs_add_device(struct btrfs_trans_handle *trans,
btrfs_set_device_io_align(leaf, dev_item, device->io_align);
btrfs_set_device_io_width(leaf, dev_item, device->io_width);
btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
- btrfs_set_device_total_bytes(leaf, dev_item, device->disk_total_bytes);
- btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
+ btrfs_set_device_total_bytes(leaf, dev_item,
+ btrfs_device_get_disk_total_bytes(device));
+ btrfs_set_device_bytes_used(leaf, dev_item,
+ btrfs_device_get_bytes_used(device));
btrfs_set_device_group(leaf, dev_item, 0);
btrfs_set_device_seek_speed(leaf, dev_item, 0);
btrfs_set_device_bandwidth(leaf, dev_item, 0);
@@ -1540,7 +1518,6 @@ static int btrfs_rm_dev_item(struct btrfs_root *root,
key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
key.type = BTRFS_DEV_ITEM_KEY;
key.offset = device->devid;
- lock_chunks(root);
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret < 0)
@@ -1556,7 +1533,6 @@ static int btrfs_rm_dev_item(struct btrfs_root *root,
goto out;
out:
btrfs_free_path(path);
- unlock_chunks(root);
btrfs_commit_transaction(trans, root);
return ret;
}
@@ -1672,8 +1648,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
if (device->writeable) {
lock_chunks(root);
list_del_init(&device->dev_alloc_list);
+ device->fs_devices->rw_devices--;
unlock_chunks(root);
- root->fs_info->fs_devices->rw_devices--;
clear_super = true;
}
@@ -1692,11 +1668,6 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
if (ret)
goto error_undo;
- spin_lock(&root->fs_info->free_chunk_lock);
- root->fs_info->free_chunk_space = device->total_bytes -
- device->bytes_used;
- spin_unlock(&root->fs_info->free_chunk_lock);
-
device->in_fs_metadata = 0;
btrfs_scrub_cancel_dev(root->fs_info, device);
@@ -1750,9 +1721,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
fs_devices = fs_devices->seed;
}
cur_devices->seed = NULL;
- lock_chunks(root);
__btrfs_close_devices(cur_devices);
- unlock_chunks(root);
free_fs_devices(cur_devices);
}
@@ -1825,8 +1794,8 @@ error_undo:
lock_chunks(root);
list_add(&device->dev_alloc_list,
&root->fs_info->fs_devices->alloc_list);
+ device->fs_devices->rw_devices++;
unlock_chunks(root);
- root->fs_info->fs_devices->rw_devices++;
}
goto error_brelse;
}
@@ -1834,29 +1803,57 @@ error_undo:
void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info,
struct btrfs_device *srcdev)
{
+ struct btrfs_fs_devices *fs_devices;
+
WARN_ON(!mutex_is_locked(&fs_info->fs_devices->device_list_mutex));
+ /*
+ * in case of fs with no seed, srcdev->fs_devices will point
+ * to fs_devices of fs_info. However when the dev being replaced is
+ * a seed dev it will point to the seed's local fs_devices. In short
+ * srcdev will have its correct fs_devices in both the cases.
+ */
+ fs_devices = srcdev->fs_devices;
+
list_del_rcu(&srcdev->dev_list);
list_del_rcu(&srcdev->dev_alloc_list);
- fs_info->fs_devices->num_devices--;
- if (srcdev->missing) {
- fs_info->fs_devices->missing_devices--;
- fs_info->fs_devices->rw_devices++;
- }
- if (srcdev->can_discard)
- fs_info->fs_devices->num_can_discard--;
- if (srcdev->bdev) {
- fs_info->fs_devices->open_devices--;
+ fs_devices->num_devices--;
+ if (srcdev->missing)
+ fs_devices->missing_devices--;
- /*
- * zero out the old super if it is not writable
- * (e.g. seed device)
- */
- if (srcdev->writeable)
- btrfs_scratch_superblock(srcdev);
+ if (srcdev->writeable) {
+ fs_devices->rw_devices--;
+ /* zero out the old super if it is writable */
+ btrfs_scratch_superblock(srcdev);
}
+ if (srcdev->bdev)
+ fs_devices->open_devices--;
+
call_rcu(&srcdev->rcu, free_device);
+
+ /*
+ * unless fs_devices is seed fs, num_devices shouldn't go
+ * zero
+ */
+ BUG_ON(!fs_devices->num_devices && !fs_devices->seeding);
+
+ /* if this is no devs we rather delete the fs_devices */
+ if (!fs_devices->num_devices) {
+ struct btrfs_fs_devices *tmp_fs_devices;
+
+ tmp_fs_devices = fs_info->fs_devices;
+ while (tmp_fs_devices) {
+ if (tmp_fs_devices->seed == fs_devices) {
+ tmp_fs_devices->seed = fs_devices->seed;
+ break;
+ }
+ tmp_fs_devices = tmp_fs_devices->seed;
+ }
+ fs_devices->seed = NULL;
+ __btrfs_close_devices(fs_devices);
+ free_fs_devices(fs_devices);
+ }
}
void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
@@ -1864,6 +1861,7 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
{
struct btrfs_device *next_device;
+ mutex_lock(&uuid_mutex);
WARN_ON(!tgtdev);
mutex_lock(&fs_info->fs_devices->device_list_mutex);
if (tgtdev->bdev) {
@@ -1871,8 +1869,6 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
fs_info->fs_devices->open_devices--;
}
fs_info->fs_devices->num_devices--;
- if (tgtdev->can_discard)
- fs_info->fs_devices->num_can_discard++;
next_device = list_entry(fs_info->fs_devices->devices.next,
struct btrfs_device, dev_list);
@@ -1885,6 +1881,7 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
call_rcu(&tgtdev->rcu, free_device);
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+ mutex_unlock(&uuid_mutex);
}
static int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path,
@@ -1983,17 +1980,17 @@ static int btrfs_prepare_sprout(struct btrfs_root *root)
mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
synchronize_rcu);
+ list_for_each_entry(device, &seed_devices->devices, dev_list)
+ device->fs_devices = seed_devices;
+ lock_chunks(root);
list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
- list_for_each_entry(device, &seed_devices->devices, dev_list) {
- device->fs_devices = seed_devices;
- }
+ unlock_chunks(root);
fs_devices->seeding = 0;
fs_devices->num_devices = 0;
fs_devices->open_devices = 0;
fs_devices->missing_devices = 0;
- fs_devices->num_can_discard = 0;
fs_devices->rotating = 0;
fs_devices->seed = seed_devices;
@@ -2093,7 +2090,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
struct list_head *devices;
struct super_block *sb = root->fs_info->sb;
struct rcu_string *name;
- u64 total_bytes;
+ u64 tmp;
int seeding_dev = 0;
int ret = 0;
@@ -2149,8 +2146,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
goto error;
}
- lock_chunks(root);
-
q = bdev_get_queue(bdev);
if (blk_queue_discard(q))
device->can_discard = 1;
@@ -2161,6 +2156,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
device->sector_size = root->sectorsize;
device->total_bytes = i_size_read(bdev->bd_inode);
device->disk_total_bytes = device->total_bytes;
+ device->commit_total_bytes = device->total_bytes;
device->dev_root = root->fs_info->dev_root;
device->bdev = bdev;
device->in_fs_metadata = 1;
@@ -2178,6 +2174,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
device->fs_devices = root->fs_info->fs_devices;
mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
+ lock_chunks(root);
list_add_rcu(&device->dev_list, &root->fs_info->fs_devices->devices);
list_add(&device->dev_alloc_list,
&root->fs_info->fs_devices->alloc_list);
@@ -2185,8 +2182,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
root->fs_info->fs_devices->open_devices++;
root->fs_info->fs_devices->rw_devices++;
root->fs_info->fs_devices->total_devices++;
- if (device->can_discard)
- root->fs_info->fs_devices->num_can_discard++;
root->fs_info->fs_devices->total_rw_bytes += device->total_bytes;
spin_lock(&root->fs_info->free_chunk_lock);
@@ -2196,26 +2191,45 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
if (!blk_queue_nonrot(bdev_get_queue(bdev)))
root->fs_info->fs_devices->rotating = 1;
- total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy);
+ tmp = btrfs_super_total_bytes(root->fs_info->super_copy);
btrfs_set_super_total_bytes(root->fs_info->super_copy,
- total_bytes + device->total_bytes);
+ tmp + device->total_bytes);
- total_bytes = btrfs_super_num_devices(root->fs_info->super_copy);
+ tmp = btrfs_super_num_devices(root->fs_info->super_copy);
btrfs_set_super_num_devices(root->fs_info->super_copy,
- total_bytes + 1);
+ tmp + 1);
/* add sysfs device entry */
btrfs_kobj_add_device(root->fs_info, device);
+ /*
+ * we've got more storage, clear any full flags on the space
+ * infos
+ */
+ btrfs_clear_space_info_full(root->fs_info);
+
+ unlock_chunks(root);
mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
if (seeding_dev) {
- char fsid_buf[BTRFS_UUID_UNPARSED_SIZE];
+ lock_chunks(root);
ret = init_first_rw_device(trans, root, device);
+ unlock_chunks(root);
if (ret) {
btrfs_abort_transaction(trans, root, ret);
goto error_trans;
}
+ }
+
+ ret = btrfs_add_device(trans, root, device);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto error_trans;
+ }
+
+ if (seeding_dev) {
+ char fsid_buf[BTRFS_UUID_UNPARSED_SIZE];
+
ret = btrfs_finish_sprout(trans, root);
if (ret) {
btrfs_abort_transaction(trans, root, ret);
@@ -2229,21 +2243,8 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
root->fs_info->fsid);
if (kobject_rename(&root->fs_info->super_kobj, fsid_buf))
goto error_trans;
- } else {
- ret = btrfs_add_device(trans, root, device);
- if (ret) {
- btrfs_abort_transaction(trans, root, ret);
- goto error_trans;
- }
}
- /*
- * we've got more storage, clear any full flags on the space
- * infos
- */
- btrfs_clear_space_info_full(root->fs_info);
-
- unlock_chunks(root);
root->fs_info->num_tolerated_disk_barrier_failures =
btrfs_calc_num_tolerated_disk_barrier_failures(root->fs_info);
ret = btrfs_commit_transaction(trans, root);
@@ -2275,7 +2276,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
return ret;
error_trans:
- unlock_chunks(root);
btrfs_end_transaction(trans, root);
rcu_string_free(device->name);
btrfs_kobj_rm_device(root->fs_info, device);
@@ -2290,6 +2290,7 @@ error:
}
int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path,
+ struct btrfs_device *srcdev,
struct btrfs_device **device_out)
{
struct request_queue *q;
@@ -2302,24 +2303,38 @@ int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path,
int ret = 0;
*device_out = NULL;
- if (fs_info->fs_devices->seeding)
+ if (fs_info->fs_devices->seeding) {
+ btrfs_err(fs_info, "the filesystem is a seed filesystem!");
return -EINVAL;
+ }
bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
fs_info->bdev_holder);
- if (IS_ERR(bdev))
+ if (IS_ERR(bdev)) {
+ btrfs_err(fs_info, "target device %s is invalid!", device_path);
return PTR_ERR(bdev);
+ }
filemap_write_and_wait(bdev->bd_inode->i_mapping);
devices = &fs_info->fs_devices->devices;
list_for_each_entry(device, devices, dev_list) {
if (device->bdev == bdev) {
+ btrfs_err(fs_info, "target device is in the filesystem!");
ret = -EEXIST;
goto error;
}
}
+
+ if (i_size_read(bdev->bd_inode) <
+ btrfs_device_get_total_bytes(srcdev)) {
+ btrfs_err(fs_info, "target device is smaller than source device!");
+ ret = -EINVAL;
+ goto error;
+ }
+
+
device = btrfs_alloc_device(NULL, &devid, NULL);
if (IS_ERR(device)) {
ret = PTR_ERR(device);
@@ -2343,8 +2358,12 @@ int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path,
device->io_width = root->sectorsize;
device->io_align = root->sectorsize;
device->sector_size = root->sectorsize;
- device->total_bytes = i_size_read(bdev->bd_inode);
- device->disk_total_bytes = device->total_bytes;
+ device->total_bytes = btrfs_device_get_total_bytes(srcdev);
+ device->disk_total_bytes = btrfs_device_get_disk_total_bytes(srcdev);
+ device->bytes_used = btrfs_device_get_bytes_used(srcdev);
+ ASSERT(list_empty(&srcdev->resized_list));
+ device->commit_total_bytes = srcdev->commit_total_bytes;
+ device->commit_bytes_used = device->bytes_used;
device->dev_root = fs_info->dev_root;
device->bdev = bdev;
device->in_fs_metadata = 1;
@@ -2356,8 +2375,6 @@ int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path,
list_add(&device->dev_list, &fs_info->fs_devices->devices);
fs_info->fs_devices->num_devices++;
fs_info->fs_devices->open_devices++;
- if (device->can_discard)
- fs_info->fs_devices->num_can_discard++;
mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
*device_out = device;
@@ -2416,8 +2433,10 @@ static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
btrfs_set_device_io_align(leaf, dev_item, device->io_align);
btrfs_set_device_io_width(leaf, dev_item, device->io_width);
btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
- btrfs_set_device_total_bytes(leaf, dev_item, device->disk_total_bytes);
- btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
+ btrfs_set_device_total_bytes(leaf, dev_item,
+ btrfs_device_get_disk_total_bytes(device));
+ btrfs_set_device_bytes_used(leaf, dev_item,
+ btrfs_device_get_bytes_used(device));
btrfs_mark_buffer_dirty(leaf);
out:
@@ -2425,40 +2444,44 @@ out:
return ret;
}
-static int __btrfs_grow_device(struct btrfs_trans_handle *trans,
+int btrfs_grow_device(struct btrfs_trans_handle *trans,
struct btrfs_device *device, u64 new_size)
{
struct btrfs_super_block *super_copy =
device->dev_root->fs_info->super_copy;
- u64 old_total = btrfs_super_total_bytes(super_copy);
- u64 diff = new_size - device->total_bytes;
+ struct btrfs_fs_devices *fs_devices;
+ u64 old_total;
+ u64 diff;
if (!device->writeable)
return -EACCES;
+
+ lock_chunks(device->dev_root);
+ old_total = btrfs_super_total_bytes(super_copy);
+ diff = new_size - device->total_bytes;
+
if (new_size <= device->total_bytes ||
- device->is_tgtdev_for_dev_replace)
+ device->is_tgtdev_for_dev_replace) {
+ unlock_chunks(device->dev_root);
return -EINVAL;
+ }
+
+ fs_devices = device->dev_root->fs_info->fs_devices;
btrfs_set_super_total_bytes(super_copy, old_total + diff);
device->fs_devices->total_rw_bytes += diff;
- device->total_bytes = new_size;
- device->disk_total_bytes = new_size;
+ btrfs_device_set_total_bytes(device, new_size);
+ btrfs_device_set_disk_total_bytes(device, new_size);
btrfs_clear_space_info_full(device->dev_root->fs_info);
+ if (list_empty(&device->resized_list))
+ list_add_tail(&device->resized_list,
+ &fs_devices->resized_devices);
+ unlock_chunks(device->dev_root);
return btrfs_update_device(trans, device);
}
-int btrfs_grow_device(struct btrfs_trans_handle *trans,
- struct btrfs_device *device, u64 new_size)
-{
- int ret;
- lock_chunks(device->dev_root);
- ret = __btrfs_grow_device(trans, device, new_size);
- unlock_chunks(device->dev_root);
- return ret;
-}
-
static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 chunk_tree, u64 chunk_objectid,
@@ -2510,6 +2533,7 @@ static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64
u32 cur;
struct btrfs_key key;
+ lock_chunks(root);
array_size = btrfs_super_sys_array_size(super_copy);
ptr = super_copy->sys_chunk_array;
@@ -2539,79 +2563,95 @@ static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64
cur += len;
}
}
+ unlock_chunks(root);
return ret;
}
-static int btrfs_relocate_chunk(struct btrfs_root *root,
- u64 chunk_tree, u64 chunk_objectid,
- u64 chunk_offset)
+int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 chunk_offset)
{
struct extent_map_tree *em_tree;
- struct btrfs_root *extent_root;
- struct btrfs_trans_handle *trans;
struct extent_map *em;
+ struct btrfs_root *extent_root = root->fs_info->extent_root;
struct map_lookup *map;
- int ret;
- int i;
+ u64 dev_extent_len = 0;
+ u64 chunk_objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
+ u64 chunk_tree = root->fs_info->chunk_root->objectid;
+ int i, ret = 0;
+ /* Just in case */
root = root->fs_info->chunk_root;
- extent_root = root->fs_info->extent_root;
em_tree = &root->fs_info->mapping_tree.map_tree;
- ret = btrfs_can_relocate(extent_root, chunk_offset);
- if (ret)
- return -ENOSPC;
-
- /* step one, relocate all the extents inside this chunk */
- ret = btrfs_relocate_block_group(extent_root, chunk_offset);
- if (ret)
- return ret;
-
- trans = btrfs_start_transaction(root, 0);
- if (IS_ERR(trans)) {
- ret = PTR_ERR(trans);
- btrfs_std_error(root->fs_info, ret);
- return ret;
- }
-
- lock_chunks(root);
-
- /*
- * step two, delete the device extents and the
- * chunk tree entries
- */
read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, chunk_offset, 1);
read_unlock(&em_tree->lock);
- BUG_ON(!em || em->start > chunk_offset ||
- em->start + em->len < chunk_offset);
+ if (!em || em->start > chunk_offset ||
+ em->start + em->len < chunk_offset) {
+ /*
+ * This is a logic error, but we don't want to just rely on the
+ * user having built with ASSERT enabled, so if ASSERT doens't
+ * do anything we still error out.
+ */
+ ASSERT(0);
+ if (em)
+ free_extent_map(em);
+ return -EINVAL;
+ }
map = (struct map_lookup *)em->bdev;
for (i = 0; i < map->num_stripes; i++) {
- ret = btrfs_free_dev_extent(trans, map->stripes[i].dev,
- map->stripes[i].physical);
- BUG_ON(ret);
+ struct btrfs_device *device = map->stripes[i].dev;
+ ret = btrfs_free_dev_extent(trans, device,
+ map->stripes[i].physical,
+ &dev_extent_len);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out;
+ }
+
+ if (device->bytes_used > 0) {
+ lock_chunks(root);
+ btrfs_device_set_bytes_used(device,
+ device->bytes_used - dev_extent_len);
+ spin_lock(&root->fs_info->free_chunk_lock);
+ root->fs_info->free_chunk_space += dev_extent_len;
+ spin_unlock(&root->fs_info->free_chunk_lock);
+ btrfs_clear_space_info_full(root->fs_info);
+ unlock_chunks(root);
+ }
if (map->stripes[i].dev) {
ret = btrfs_update_device(trans, map->stripes[i].dev);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out;
+ }
}
}
ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid,
chunk_offset);
-
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out;
+ }
trace_btrfs_chunk_free(root, map, chunk_offset, em->len);
if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out;
+ }
}
ret = btrfs_remove_block_group(trans, extent_root, chunk_offset);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, extent_root, ret);
+ goto out;
+ }
write_lock(&em_tree->lock);
remove_extent_mapping(em_tree, em);
@@ -2619,12 +2659,46 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
/* once for the tree */
free_extent_map(em);
+out:
/* once for us */
free_extent_map(em);
+ return ret;
+}
- unlock_chunks(root);
+static int btrfs_relocate_chunk(struct btrfs_root *root,
+ u64 chunk_tree, u64 chunk_objectid,
+ u64 chunk_offset)
+{
+ struct btrfs_root *extent_root;
+ struct btrfs_trans_handle *trans;
+ int ret;
+
+ root = root->fs_info->chunk_root;
+ extent_root = root->fs_info->extent_root;
+
+ ret = btrfs_can_relocate(extent_root, chunk_offset);
+ if (ret)
+ return -ENOSPC;
+
+ /* step one, relocate all the extents inside this chunk */
+ ret = btrfs_relocate_block_group(extent_root, chunk_offset);
+ if (ret)
+ return ret;
+
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ btrfs_std_error(root->fs_info, ret);
+ return ret;
+ }
+
+ /*
+ * step two, delete the device extents and the
+ * chunk tree entries
+ */
+ ret = btrfs_remove_chunk(trans, root, chunk_offset);
btrfs_end_transaction(trans, root);
- return 0;
+ return ret;
}
static int btrfs_relocate_sys_chunks(struct btrfs_root *root)
@@ -2677,8 +2751,8 @@ again:
found_key.offset);
if (ret == -ENOSPC)
failed++;
- else if (ret)
- BUG();
+ else
+ BUG_ON(ret);
}
if (found_key.offset == 0)
@@ -3085,11 +3159,12 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
/* step one make some room on all the devices */
devices = &fs_info->fs_devices->devices;
list_for_each_entry(device, devices, dev_list) {
- old_size = device->total_bytes;
+ old_size = btrfs_device_get_total_bytes(device);
size_to_free = div_factor(old_size, 1);
size_to_free = min(size_to_free, (u64)1 * 1024 * 1024);
if (!device->writeable ||
- device->total_bytes - device->bytes_used > size_to_free ||
+ btrfs_device_get_total_bytes(device) -
+ btrfs_device_get_bytes_used(device) > size_to_free ||
device->is_tgtdev_for_dev_replace)
continue;
@@ -3644,8 +3719,6 @@ static int btrfs_uuid_scan_kthread(void *data)
max_key.type = BTRFS_ROOT_ITEM_KEY;
max_key.offset = (u64)-1;
- path->keep_locks = 1;
-
while (1) {
ret = btrfs_search_forward(root, &key, path, 0);
if (ret) {
@@ -3897,8 +3970,8 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
struct btrfs_key key;
struct btrfs_super_block *super_copy = root->fs_info->super_copy;
u64 old_total = btrfs_super_total_bytes(super_copy);
- u64 old_size = device->total_bytes;
- u64 diff = device->total_bytes - new_size;
+ u64 old_size = btrfs_device_get_total_bytes(device);
+ u64 diff = old_size - new_size;
if (device->is_tgtdev_for_dev_replace)
return -EINVAL;
@@ -3911,7 +3984,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
lock_chunks(root);
- device->total_bytes = new_size;
+ btrfs_device_set_total_bytes(device, new_size);
if (device->writeable) {
device->fs_devices->total_rw_bytes -= diff;
spin_lock(&root->fs_info->free_chunk_lock);
@@ -3977,7 +4050,7 @@ again:
ret = -ENOSPC;
lock_chunks(root);
- device->total_bytes = old_size;
+ btrfs_device_set_total_bytes(device, old_size);
if (device->writeable)
device->fs_devices->total_rw_bytes += diff;
spin_lock(&root->fs_info->free_chunk_lock);
@@ -3995,18 +4068,17 @@ again:
}
lock_chunks(root);
+ btrfs_device_set_disk_total_bytes(device, new_size);
+ if (list_empty(&device->resized_list))
+ list_add_tail(&device->resized_list,
+ &root->fs_info->fs_devices->resized_devices);
- device->disk_total_bytes = new_size;
- /* Now btrfs_update_device() will change the on-disk size. */
- ret = btrfs_update_device(trans, device);
- if (ret) {
- unlock_chunks(root);
- btrfs_end_transaction(trans, root);
- goto done;
- }
WARN_ON(diff > old_total);
btrfs_set_super_total_bytes(super_copy, old_total - diff);
unlock_chunks(root);
+
+ /* Now btrfs_update_device() will change the on-disk size. */
+ ret = btrfs_update_device(trans, device);
btrfs_end_transaction(trans, root);
done:
btrfs_free_path(path);
@@ -4022,10 +4094,13 @@ static int btrfs_add_system_chunk(struct btrfs_root *root,
u32 array_size;
u8 *ptr;
+ lock_chunks(root);
array_size = btrfs_super_sys_array_size(super_copy);
if (array_size + item_size + sizeof(disk_key)
- > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE)
+ > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) {
+ unlock_chunks(root);
return -EFBIG;
+ }
ptr = super_copy->sys_chunk_array + array_size;
btrfs_cpu_key_to_disk(&disk_key, key);
@@ -4034,6 +4109,8 @@ static int btrfs_add_system_chunk(struct btrfs_root *root,
memcpy(ptr, chunk, item_size);
item_size += sizeof(disk_key);
btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
+ unlock_chunks(root);
+
return 0;
}
@@ -4403,6 +4480,16 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
if (ret)
goto error_del_extent;
+ for (i = 0; i < map->num_stripes; i++) {
+ num_bytes = map->stripes[i].dev->bytes_used + stripe_size;
+ btrfs_device_set_bytes_used(map->stripes[i].dev, num_bytes);
+ }
+
+ spin_lock(&extent_root->fs_info->free_chunk_lock);
+ extent_root->fs_info->free_chunk_space -= (stripe_size *
+ map->num_stripes);
+ spin_unlock(&extent_root->fs_info->free_chunk_lock);
+
free_extent_map(em);
check_raid56_incompat_flag(extent_root->fs_info, type);
@@ -4474,7 +4561,6 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
device = map->stripes[i].dev;
dev_offset = map->stripes[i].physical;
- device->bytes_used += stripe_size;
ret = btrfs_update_device(trans, device);
if (ret)
goto out;
@@ -4487,11 +4573,6 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
goto out;
}
- spin_lock(&extent_root->fs_info->free_chunk_lock);
- extent_root->fs_info->free_chunk_space -= (stripe_size *
- map->num_stripes);
- spin_unlock(&extent_root->fs_info->free_chunk_lock);
-
stripe = &chunk->stripe;
for (i = 0; i < map->num_stripes; i++) {
device = map->stripes[i].dev;
@@ -4571,16 +4652,25 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
alloc_profile = btrfs_get_alloc_profile(fs_info->chunk_root, 0);
ret = __btrfs_alloc_chunk(trans, extent_root, sys_chunk_offset,
alloc_profile);
- if (ret) {
- btrfs_abort_transaction(trans, root, ret);
- goto out;
+ return ret;
+}
+
+static inline int btrfs_chunk_max_errors(struct map_lookup *map)
+{
+ int max_errors;
+
+ if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
+ BTRFS_BLOCK_GROUP_RAID10 |
+ BTRFS_BLOCK_GROUP_RAID5 |
+ BTRFS_BLOCK_GROUP_DUP)) {
+ max_errors = 1;
+ } else if (map->type & BTRFS_BLOCK_GROUP_RAID6) {
+ max_errors = 2;
+ } else {
+ max_errors = 0;
}
- ret = btrfs_add_device(trans, fs_info->chunk_root, device);
- if (ret)
- btrfs_abort_transaction(trans, root, ret);
-out:
- return ret;
+ return max_errors;
}
int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset)
@@ -4589,6 +4679,7 @@ int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset)
struct map_lookup *map;
struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
int readonly = 0;
+ int miss_ndevs = 0;
int i;
read_lock(&map_tree->map_tree.lock);
@@ -4597,18 +4688,27 @@ int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset)
if (!em)
return 1;
- if (btrfs_test_opt(root, DEGRADED)) {
- free_extent_map(em);
- return 0;
- }
-
map = (struct map_lookup *)em->bdev;
for (i = 0; i < map->num_stripes; i++) {
+ if (map->stripes[i].dev->missing) {
+ miss_ndevs++;
+ continue;
+ }
+
if (!map->stripes[i].dev->writeable) {
readonly = 1;
- break;
+ goto end;
}
}
+
+ /*
+ * If the number of missing devices is larger than max errors,
+ * we can not write the data into that chunk successfully, so
+ * set it readonly.
+ */
+ if (miss_ndevs > btrfs_chunk_max_errors(map))
+ readonly = 1;
+end:
free_extent_map(em);
return readonly;
}
@@ -5009,6 +5109,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
num_stripes = min_t(u64, map->num_stripes,
stripe_nr_end - stripe_nr_orig);
stripe_index = do_div(stripe_nr, map->num_stripes);
+ if (!(rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)))
+ mirror_num = 1;
} else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
if (rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS))
num_stripes = map->num_stripes;
@@ -5112,6 +5214,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
/* We distribute the parity blocks across stripes */
tmp = stripe_nr + stripe_index;
stripe_index = do_div(tmp, map->num_stripes);
+ if (!(rw & (REQ_WRITE | REQ_DISCARD |
+ REQ_GET_READ_MIRRORS)) && mirror_num <= 1)
+ mirror_num = 1;
}
} else {
/*
@@ -5219,16 +5324,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
}
}
- if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) {
- if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
- BTRFS_BLOCK_GROUP_RAID10 |
- BTRFS_BLOCK_GROUP_RAID5 |
- BTRFS_BLOCK_GROUP_DUP)) {
- max_errors = 1;
- } else if (map->type & BTRFS_BLOCK_GROUP_RAID6) {
- max_errors = 2;
- }
- }
+ if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS))
+ max_errors = btrfs_chunk_max_errors(map);
if (dev_replace_is_ongoing && (rw & (REQ_WRITE | REQ_DISCARD)) &&
dev_replace->tgtdev != NULL) {
@@ -5611,8 +5708,8 @@ static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
name = rcu_dereference(dev->name);
pr_debug("btrfs_map_bio: rw %d, sector=%llu, dev=%lu "
"(%s id %llu), size=%u\n", rw,
- (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev,
- name->str, dev->devid, bio->bi_size);
+ (u64)bio->bi_iter.bi_sector, (u_long)dev->bdev->bd_dev,
+ name->str, dev->devid, bio->bi_iter.bi_size);
rcu_read_unlock();
}
#endif
@@ -5790,10 +5887,10 @@ struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid,
}
static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
+ struct btrfs_fs_devices *fs_devices,
u64 devid, u8 *dev_uuid)
{
struct btrfs_device *device;
- struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
device = btrfs_alloc_device(NULL, &devid, dev_uuid);
if (IS_ERR(device))
@@ -5930,7 +6027,8 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
}
if (!map->stripes[i].dev) {
map->stripes[i].dev =
- add_missing_dev(root, devid, uuid);
+ add_missing_dev(root, root->fs_info->fs_devices,
+ devid, uuid);
if (!map->stripes[i].dev) {
free_extent_map(em);
return -EIO;
@@ -5957,7 +6055,9 @@ static void fill_device_from_item(struct extent_buffer *leaf,
device->devid = btrfs_device_id(leaf, dev_item);
device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item);
device->total_bytes = device->disk_total_bytes;
+ device->commit_total_bytes = device->disk_total_bytes;
device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
+ device->commit_bytes_used = device->bytes_used;
device->type = btrfs_device_type(leaf, dev_item);
device->io_align = btrfs_device_io_align(leaf, dev_item);
device->io_width = btrfs_device_io_width(leaf, dev_item);
@@ -5969,7 +6069,8 @@ static void fill_device_from_item(struct extent_buffer *leaf,
read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
}
-static int open_seed_devices(struct btrfs_root *root, u8 *fsid)
+static struct btrfs_fs_devices *open_seed_devices(struct btrfs_root *root,
+ u8 *fsid)
{
struct btrfs_fs_devices *fs_devices;
int ret;
@@ -5978,49 +6079,56 @@ static int open_seed_devices(struct btrfs_root *root, u8 *fsid)
fs_devices = root->fs_info->fs_devices->seed;
while (fs_devices) {
- if (!memcmp(fs_devices->fsid, fsid, BTRFS_UUID_SIZE)) {
- ret = 0;
- goto out;
- }
+ if (!memcmp(fs_devices->fsid, fsid, BTRFS_UUID_SIZE))
+ return fs_devices;
+
fs_devices = fs_devices->seed;
}
fs_devices = find_fsid(fsid);
if (!fs_devices) {
- ret = -ENOENT;
- goto out;
+ if (!btrfs_test_opt(root, DEGRADED))
+ return ERR_PTR(-ENOENT);
+
+ fs_devices = alloc_fs_devices(fsid);
+ if (IS_ERR(fs_devices))
+ return fs_devices;
+
+ fs_devices->seeding = 1;
+ fs_devices->opened = 1;
+ return fs_devices;
}
fs_devices = clone_fs_devices(fs_devices);
- if (IS_ERR(fs_devices)) {
- ret = PTR_ERR(fs_devices);
- goto out;
- }
+ if (IS_ERR(fs_devices))
+ return fs_devices;
ret = __btrfs_open_devices(fs_devices, FMODE_READ,
root->fs_info->bdev_holder);
if (ret) {
free_fs_devices(fs_devices);
+ fs_devices = ERR_PTR(ret);
goto out;
}
if (!fs_devices->seeding) {
__btrfs_close_devices(fs_devices);
free_fs_devices(fs_devices);
- ret = -EINVAL;
+ fs_devices = ERR_PTR(-EINVAL);
goto out;
}
fs_devices->seed = root->fs_info->fs_devices->seed;
root->fs_info->fs_devices->seed = fs_devices;
out:
- return ret;
+ return fs_devices;
}
static int read_one_dev(struct btrfs_root *root,
struct extent_buffer *leaf,
struct btrfs_dev_item *dev_item)
{
+ struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
struct btrfs_device *device;
u64 devid;
int ret;
@@ -6034,31 +6142,48 @@ static int read_one_dev(struct btrfs_root *root,
BTRFS_UUID_SIZE);
if (memcmp(fs_uuid, root->fs_info->fsid, BTRFS_UUID_SIZE)) {
- ret = open_seed_devices(root, fs_uuid);
- if (ret && !btrfs_test_opt(root, DEGRADED))
- return ret;
+ fs_devices = open_seed_devices(root, fs_uuid);
+ if (IS_ERR(fs_devices))
+ return PTR_ERR(fs_devices);
}
device = btrfs_find_device(root->fs_info, devid, dev_uuid, fs_uuid);
- if (!device || !device->bdev) {
+ if (!device) {
if (!btrfs_test_opt(root, DEGRADED))
return -EIO;
- if (!device) {
- btrfs_warn(root->fs_info, "devid %llu missing", devid);
- device = add_missing_dev(root, devid, dev_uuid);
- if (!device)
- return -ENOMEM;
- } else if (!device->missing) {
+ btrfs_warn(root->fs_info, "devid %llu missing", devid);
+ device = add_missing_dev(root, fs_devices, devid, dev_uuid);
+ if (!device)
+ return -ENOMEM;
+ } else {
+ if (!device->bdev && !btrfs_test_opt(root, DEGRADED))
+ return -EIO;
+
+ if(!device->bdev && !device->missing) {
/*
* this happens when a device that was properly setup
* in the device info lists suddenly goes bad.
* device->bdev is NULL, and so we have to set
* device->missing to one here
*/
- root->fs_info->fs_devices->missing_devices++;
+ device->fs_devices->missing_devices++;
device->missing = 1;
}
+
+ /* Move the device to its own fs_devices */
+ if (device->fs_devices != fs_devices) {
+ ASSERT(device->missing);
+
+ list_move(&device->dev_list, &fs_devices->devices);
+ device->fs_devices->num_devices--;
+ fs_devices->num_devices++;
+
+ device->fs_devices->missing_devices--;
+ fs_devices->missing_devices++;
+
+ device->fs_devices = fs_devices;
+ }
}
if (device->fs_devices != root->fs_info->fs_devices) {
@@ -6374,16 +6499,18 @@ int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
struct btrfs_root *dev_root = fs_info->dev_root;
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
struct btrfs_device *device;
+ int stats_cnt;
int ret = 0;
mutex_lock(&fs_devices->device_list_mutex);
list_for_each_entry(device, &fs_devices->devices, dev_list) {
- if (!device->dev_stats_valid || !device->dev_stats_dirty)
+ if (!device->dev_stats_valid || !btrfs_dev_stats_dirty(device))
continue;
+ stats_cnt = atomic_read(&device->dev_stats_ccnt);
ret = update_dev_stat_item(trans, dev_root, device);
if (!ret)
- device->dev_stats_dirty = 0;
+ atomic_sub(stats_cnt, &device->dev_stats_ccnt);
}
mutex_unlock(&fs_devices->device_list_mutex);
@@ -6482,3 +6609,51 @@ int btrfs_scratch_superblock(struct btrfs_device *device)
return 0;
}
+
+/*
+ * Update the size of all devices, which is used for writing out the
+ * super blocks.
+ */
+void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+ struct btrfs_device *curr, *next;
+
+ if (list_empty(&fs_devices->resized_devices))
+ return;
+
+ mutex_lock(&fs_devices->device_list_mutex);
+ lock_chunks(fs_info->dev_root);
+ list_for_each_entry_safe(curr, next, &fs_devices->resized_devices,
+ resized_list) {
+ list_del_init(&curr->resized_list);
+ curr->commit_total_bytes = curr->disk_total_bytes;
+ }
+ unlock_chunks(fs_info->dev_root);
+ mutex_unlock(&fs_devices->device_list_mutex);
+}
+
+/* Must be invoked during the transaction commit */
+void btrfs_update_commit_device_bytes_used(struct btrfs_root *root,
+ struct btrfs_transaction *transaction)
+{
+ struct extent_map *em;
+ struct map_lookup *map;
+ struct btrfs_device *dev;
+ int i;
+
+ if (list_empty(&transaction->pending_chunks))
+ return;
+
+ /* In order to kick the device replace finish process */
+ lock_chunks(root);
+ list_for_each_entry(em, &transaction->pending_chunks, list) {
+ map = (struct map_lookup *)em->bdev;
+
+ for (i = 0; i < map->num_stripes; i++) {
+ dev = map->stripes[i].dev;
+ dev->commit_bytes_used = dev->bytes_used;
+ }
+ }
+ unlock_chunks(root);
+}
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 2aaa00c..08980fa 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -24,6 +24,8 @@
#include <linux/btrfs.h>
#include "async-thread.h"
+extern struct mutex uuid_mutex;
+
#define BTRFS_STRIPE_LEN (64 * 1024)
struct buffer_head;
@@ -32,41 +34,59 @@ struct btrfs_pending_bios {
struct bio *tail;
};
+/*
+ * Use sequence counter to get consistent device stat data on
+ * 32-bit processors.
+ */
+#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
+#include <linux/seqlock.h>
+#define __BTRFS_NEED_DEVICE_DATA_ORDERED
+#define btrfs_device_data_ordered_init(device) \
+ seqcount_init(&device->data_seqcount)
+#else
+#define btrfs_device_data_ordered_init(device) do { } while (0)
+#endif
+
struct btrfs_device {
struct list_head dev_list;
struct list_head dev_alloc_list;
struct btrfs_fs_devices *fs_devices;
+
struct btrfs_root *dev_root;
+ struct rcu_string *name;
+
+ u64 generation;
+
+ spinlock_t io_lock ____cacheline_aligned;
+ int running_pending;
/* regular prio bios */
struct btrfs_pending_bios pending_bios;
/* WRITE_SYNC bios */
struct btrfs_pending_bios pending_sync_bios;
- u64 generation;
- int running_pending;
+ struct block_device *bdev;
+
+ /* the mode sent to blkdev_get */
+ fmode_t mode;
+
int writeable;
int in_fs_metadata;
int missing;
int can_discard;
int is_tgtdev_for_dev_replace;
- spinlock_t io_lock;
- /* the mode sent to blkdev_get */
- fmode_t mode;
-
- struct block_device *bdev;
-
-
- struct rcu_string *name;
+#ifdef __BTRFS_NEED_DEVICE_DATA_ORDERED
+ seqcount_t data_seqcount;
+#endif
/* the internal btrfs device id */
u64 devid;
- /* size of the device */
+ /* size of the device in memory */
u64 total_bytes;
- /* size of the disk */
+ /* size of the device on disk */
u64 disk_total_bytes;
/* bytes used */
@@ -83,10 +103,26 @@ struct btrfs_device {
/* minimal io size for this device */
u32 sector_size;
-
/* physical drive uuid (or lvm uuid) */
u8 uuid[BTRFS_UUID_SIZE];
+ /*
+ * size of the device on the current transaction
+ *
+ * This variant is update when committing the transaction,
+ * and protected by device_list_mutex
+ */
+ u64 commit_total_bytes;
+
+ /* bytes used on the current transaction */
+ u64 commit_bytes_used;
+ /*
+ * used to manage the device which is resized
+ *
+ * It is protected by chunk_lock.
+ */
+ struct list_head resized_list;
+
/* for sending down flush barriers */
int nobarriers;
struct bio *flush_bio;
@@ -107,26 +143,90 @@ struct btrfs_device {
struct radix_tree_root reada_zones;
struct radix_tree_root reada_extents;
-
/* disk I/O failure stats. For detailed description refer to
* enum btrfs_dev_stat_values in ioctl.h */
int dev_stats_valid;
- int dev_stats_dirty; /* counters need to be written to disk */
+
+ /* Counter to record the change of device stats */
+ atomic_t dev_stats_ccnt;
atomic_t dev_stat_values[BTRFS_DEV_STAT_VALUES_MAX];
};
+/*
+ * If we read those variants at the context of their own lock, we needn't
+ * use the following helpers, reading them directly is safe.
+ */
+#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
+#define BTRFS_DEVICE_GETSET_FUNCS(name) \
+static inline u64 \
+btrfs_device_get_##name(const struct btrfs_device *dev) \
+{ \
+ u64 size; \
+ unsigned int seq; \
+ \
+ do { \
+ seq = read_seqcount_begin(&dev->data_seqcount); \
+ size = dev->name; \
+ } while (read_seqcount_retry(&dev->data_seqcount, seq)); \
+ return size; \
+} \
+ \
+static inline void \
+btrfs_device_set_##name(struct btrfs_device *dev, u64 size) \
+{ \
+ preempt_disable(); \
+ write_seqcount_begin(&dev->data_seqcount); \
+ dev->name = size; \
+ write_seqcount_end(&dev->data_seqcount); \
+ preempt_enable(); \
+}
+#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPT)
+#define BTRFS_DEVICE_GETSET_FUNCS(name) \
+static inline u64 \
+btrfs_device_get_##name(const struct btrfs_device *dev) \
+{ \
+ u64 size; \
+ \
+ preempt_disable(); \
+ size = dev->name; \
+ preempt_enable(); \
+ return size; \
+} \
+ \
+static inline void \
+btrfs_device_set_##name(struct btrfs_device *dev, u64 size) \
+{ \
+ preempt_disable(); \
+ dev->name = size; \
+ preempt_enable(); \
+}
+#else
+#define BTRFS_DEVICE_GETSET_FUNCS(name) \
+static inline u64 \
+btrfs_device_get_##name(const struct btrfs_device *dev) \
+{ \
+ return dev->name; \
+} \
+ \
+static inline void \
+btrfs_device_set_##name(struct btrfs_device *dev, u64 size) \
+{ \
+ dev->name = size; \
+}
+#endif
+
+BTRFS_DEVICE_GETSET_FUNCS(total_bytes);
+BTRFS_DEVICE_GETSET_FUNCS(disk_total_bytes);
+BTRFS_DEVICE_GETSET_FUNCS(bytes_used);
+
struct btrfs_fs_devices {
u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */
- /* the device with this id has the most recent copy of the super */
- u64 latest_devid;
- u64 latest_trans;
u64 num_devices;
u64 open_devices;
u64 rw_devices;
u64 missing_devices;
u64 total_rw_bytes;
- u64 num_can_discard;
u64 total_devices;
struct block_device *latest_bdev;
@@ -139,6 +239,7 @@ struct btrfs_fs_devices {
struct mutex device_list_mutex;
struct list_head devices;
+ struct list_head resized_devices;
/* devices not currently being allocated */
struct list_head alloc_list;
struct list_head list;
@@ -167,8 +268,9 @@ struct btrfs_fs_devices {
*/
typedef void (btrfs_io_bio_end_io_t) (struct btrfs_io_bio *bio, int err);
struct btrfs_io_bio {
- unsigned long mirror_num;
- unsigned long stripe_index;
+ unsigned int mirror_num;
+ unsigned int stripe_index;
+ u64 logical;
u8 *csum;
u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE];
u8 *csum_allocated;
@@ -325,6 +427,7 @@ struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid,
int btrfs_shrink_device(struct btrfs_device *device, u64 new_size);
int btrfs_init_new_device(struct btrfs_root *root, char *path);
int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path,
+ struct btrfs_device *srcdev,
struct btrfs_device **device_out);
int btrfs_balance(struct btrfs_balance_control *bctl,
struct btrfs_ioctl_balance_args *bargs);
@@ -360,11 +463,20 @@ unsigned long btrfs_full_stripe_len(struct btrfs_root *root,
int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
struct btrfs_root *extent_root,
u64 chunk_offset, u64 chunk_size);
+int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 chunk_offset);
+
+static inline int btrfs_dev_stats_dirty(struct btrfs_device *dev)
+{
+ return atomic_read(&dev->dev_stats_ccnt);
+}
+
static inline void btrfs_dev_stat_inc(struct btrfs_device *dev,
int index)
{
atomic_inc(dev->dev_stat_values + index);
- dev->dev_stats_dirty = 1;
+ smp_mb__before_atomic();
+ atomic_inc(&dev->dev_stats_ccnt);
}
static inline int btrfs_dev_stat_read(struct btrfs_device *dev,
@@ -379,7 +491,8 @@ static inline int btrfs_dev_stat_read_and_reset(struct btrfs_device *dev,
int ret;
ret = atomic_xchg(dev->dev_stat_values + index, 0);
- dev->dev_stats_dirty = 1;
+ smp_mb__before_atomic();
+ atomic_inc(&dev->dev_stats_ccnt);
return ret;
}
@@ -387,7 +500,8 @@ static inline void btrfs_dev_stat_set(struct btrfs_device *dev,
int index, unsigned long val)
{
atomic_set(dev->dev_stat_values + index, val);
- dev->dev_stats_dirty = 1;
+ smp_mb__before_atomic();
+ atomic_inc(&dev->dev_stats_ccnt);
}
static inline void btrfs_dev_stat_reset(struct btrfs_device *dev,
@@ -395,4 +509,8 @@ static inline void btrfs_dev_stat_reset(struct btrfs_device *dev,
{
btrfs_dev_stat_set(dev, index, 0);
}
+
+void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info);
+void btrfs_update_commit_device_bytes_used(struct btrfs_root *root,
+ struct btrfs_transaction *transaction);
#endif
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index ad8328d..dcf2013 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -237,7 +237,7 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
* first xattr that we find and walk forward
*/
key.objectid = btrfs_ino(inode);
- btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY);
+ key.type = BTRFS_XATTR_ITEM_KEY;
key.offset = 0;
path = btrfs_alloc_path();
@@ -273,7 +273,7 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
/* check to make sure this item is what we want */
if (found_key.objectid != key.objectid)
break;
- if (btrfs_key_type(&found_key) != BTRFS_XATTR_ITEM_KEY)
+ if (found_key.type != BTRFS_XATTR_ITEM_KEY)
break;
di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index b67d8fc..759fa4e 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -33,8 +33,7 @@
#include "compression.h"
struct workspace {
- z_stream inf_strm;
- z_stream def_strm;
+ z_stream strm;
char *buf;
struct list_head list;
};
@@ -43,8 +42,7 @@ static void zlib_free_workspace(struct list_head *ws)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
- vfree(workspace->def_strm.workspace);
- vfree(workspace->inf_strm.workspace);
+ vfree(workspace->strm.workspace);
kfree(workspace->buf);
kfree(workspace);
}
@@ -52,17 +50,17 @@ static void zlib_free_workspace(struct list_head *ws)
static struct list_head *zlib_alloc_workspace(void)
{
struct workspace *workspace;
+ int workspacesize;
workspace = kzalloc(sizeof(*workspace), GFP_NOFS);
if (!workspace)
return ERR_PTR(-ENOMEM);
- workspace->def_strm.workspace = vmalloc(zlib_deflate_workspacesize(
- MAX_WBITS, MAX_MEM_LEVEL));
- workspace->inf_strm.workspace = vmalloc(zlib_inflate_workspacesize());
+ workspacesize = max(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL),
+ zlib_inflate_workspacesize());
+ workspace->strm.workspace = vmalloc(workspacesize);
workspace->buf = kmalloc(PAGE_CACHE_SIZE, GFP_NOFS);
- if (!workspace->def_strm.workspace ||
- !workspace->inf_strm.workspace || !workspace->buf)
+ if (!workspace->strm.workspace || !workspace->buf)
goto fail;
INIT_LIST_HEAD(&workspace->list);
@@ -96,14 +94,14 @@ static int zlib_compress_pages(struct list_head *ws,
*total_out = 0;
*total_in = 0;
- if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) {
+ if (Z_OK != zlib_deflateInit(&workspace->strm, 3)) {
printk(KERN_WARNING "BTRFS: deflateInit failed\n");
ret = -EIO;
goto out;
}
- workspace->def_strm.total_in = 0;
- workspace->def_strm.total_out = 0;
+ workspace->strm.total_in = 0;
+ workspace->strm.total_out = 0;
in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
data_in = kmap(in_page);
@@ -117,25 +115,25 @@ static int zlib_compress_pages(struct list_head *ws,
pages[0] = out_page;
nr_pages = 1;
- workspace->def_strm.next_in = data_in;
- workspace->def_strm.next_out = cpage_out;
- workspace->def_strm.avail_out = PAGE_CACHE_SIZE;
- workspace->def_strm.avail_in = min(len, PAGE_CACHE_SIZE);
+ workspace->strm.next_in = data_in;
+ workspace->strm.next_out = cpage_out;
+ workspace->strm.avail_out = PAGE_CACHE_SIZE;
+ workspace->strm.avail_in = min(len, PAGE_CACHE_SIZE);
- while (workspace->def_strm.total_in < len) {
- ret = zlib_deflate(&workspace->def_strm, Z_SYNC_FLUSH);
+ while (workspace->strm.total_in < len) {
+ ret = zlib_deflate(&workspace->strm, Z_SYNC_FLUSH);
if (ret != Z_OK) {
printk(KERN_DEBUG "BTRFS: deflate in loop returned %d\n",
ret);
- zlib_deflateEnd(&workspace->def_strm);
+ zlib_deflateEnd(&workspace->strm);
ret = -EIO;
goto out;
}
/* we're making it bigger, give up */
- if (workspace->def_strm.total_in > 8192 &&
- workspace->def_strm.total_in <
- workspace->def_strm.total_out) {
+ if (workspace->strm.total_in > 8192 &&
+ workspace->strm.total_in <
+ workspace->strm.total_out) {
ret = -E2BIG;
goto out;
}
@@ -143,7 +141,7 @@ static int zlib_compress_pages(struct list_head *ws,
* before the total_in so we will pull in a new page for
* the stream end if required
*/
- if (workspace->def_strm.avail_out == 0) {
+ if (workspace->strm.avail_out == 0) {
kunmap(out_page);
if (nr_pages == nr_dest_pages) {
out_page = NULL;
@@ -158,19 +156,19 @@ static int zlib_compress_pages(struct list_head *ws,
cpage_out = kmap(out_page);
pages[nr_pages] = out_page;
nr_pages++;
- workspace->def_strm.avail_out = PAGE_CACHE_SIZE;
- workspace->def_strm.next_out = cpage_out;
+ workspace->strm.avail_out = PAGE_CACHE_SIZE;
+ workspace->strm.next_out = cpage_out;
}
/* we're all done */
- if (workspace->def_strm.total_in >= len)
+ if (workspace->strm.total_in >= len)
break;
/* we've read in a full page, get a new one */
- if (workspace->def_strm.avail_in == 0) {
- if (workspace->def_strm.total_out > max_out)
+ if (workspace->strm.avail_in == 0) {
+ if (workspace->strm.total_out > max_out)
break;
- bytes_left = len - workspace->def_strm.total_in;
+ bytes_left = len - workspace->strm.total_in;
kunmap(in_page);
page_cache_release(in_page);
@@ -178,28 +176,28 @@ static int zlib_compress_pages(struct list_head *ws,
in_page = find_get_page(mapping,
start >> PAGE_CACHE_SHIFT);
data_in = kmap(in_page);
- workspace->def_strm.avail_in = min(bytes_left,
+ workspace->strm.avail_in = min(bytes_left,
PAGE_CACHE_SIZE);
- workspace->def_strm.next_in = data_in;
+ workspace->strm.next_in = data_in;
}
}
- workspace->def_strm.avail_in = 0;
- ret = zlib_deflate(&workspace->def_strm, Z_FINISH);
- zlib_deflateEnd(&workspace->def_strm);
+ workspace->strm.avail_in = 0;
+ ret = zlib_deflate(&workspace->strm, Z_FINISH);
+ zlib_deflateEnd(&workspace->strm);
if (ret != Z_STREAM_END) {
ret = -EIO;
goto out;
}
- if (workspace->def_strm.total_out >= workspace->def_strm.total_in) {
+ if (workspace->strm.total_out >= workspace->strm.total_in) {
ret = -E2BIG;
goto out;
}
ret = 0;
- *total_out = workspace->def_strm.total_out;
- *total_in = workspace->def_strm.total_in;
+ *total_out = workspace->strm.total_out;
+ *total_in = workspace->strm.total_in;
out:
*out_pages = nr_pages;
if (out_page)
@@ -225,19 +223,18 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in,
size_t total_out = 0;
unsigned long page_in_index = 0;
unsigned long page_out_index = 0;
- unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) /
- PAGE_CACHE_SIZE;
+ unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_CACHE_SIZE);
unsigned long buf_start;
unsigned long pg_offset;
data_in = kmap(pages_in[page_in_index]);
- workspace->inf_strm.next_in = data_in;
- workspace->inf_strm.avail_in = min_t(size_t, srclen, PAGE_CACHE_SIZE);
- workspace->inf_strm.total_in = 0;
+ workspace->strm.next_in = data_in;
+ workspace->strm.avail_in = min_t(size_t, srclen, PAGE_CACHE_SIZE);
+ workspace->strm.total_in = 0;
- workspace->inf_strm.total_out = 0;
- workspace->inf_strm.next_out = workspace->buf;
- workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
+ workspace->strm.total_out = 0;
+ workspace->strm.next_out = workspace->buf;
+ workspace->strm.avail_out = PAGE_CACHE_SIZE;
pg_offset = 0;
/* If it's deflate, and it's got no preset dictionary, then
@@ -247,21 +244,21 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in,
!(((data_in[0]<<8) + data_in[1]) % 31)) {
wbits = -((data_in[0] >> 4) + 8);
- workspace->inf_strm.next_in += 2;
- workspace->inf_strm.avail_in -= 2;
+ workspace->strm.next_in += 2;
+ workspace->strm.avail_in -= 2;
}
- if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
+ if (Z_OK != zlib_inflateInit2(&workspace->strm, wbits)) {
printk(KERN_WARNING "BTRFS: inflateInit failed\n");
return -EIO;
}
- while (workspace->inf_strm.total_in < srclen) {
- ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH);
+ while (workspace->strm.total_in < srclen) {
+ ret = zlib_inflate(&workspace->strm, Z_NO_FLUSH);
if (ret != Z_OK && ret != Z_STREAM_END)
break;
buf_start = total_out;
- total_out = workspace->inf_strm.total_out;
+ total_out = workspace->strm.total_out;
/* we didn't make progress in this inflate call, we're done */
if (buf_start == total_out)
@@ -276,10 +273,10 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in,
goto done;
}
- workspace->inf_strm.next_out = workspace->buf;
- workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
+ workspace->strm.next_out = workspace->buf;
+ workspace->strm.avail_out = PAGE_CACHE_SIZE;
- if (workspace->inf_strm.avail_in == 0) {
+ if (workspace->strm.avail_in == 0) {
unsigned long tmp;
kunmap(pages_in[page_in_index]);
page_in_index++;
@@ -288,9 +285,9 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in,
break;
}
data_in = kmap(pages_in[page_in_index]);
- workspace->inf_strm.next_in = data_in;
- tmp = srclen - workspace->inf_strm.total_in;
- workspace->inf_strm.avail_in = min(tmp,
+ workspace->strm.next_in = data_in;
+ tmp = srclen - workspace->strm.total_in;
+ workspace->strm.avail_in = min(tmp,
PAGE_CACHE_SIZE);
}
}
@@ -299,7 +296,7 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in,
else
ret = 0;
done:
- zlib_inflateEnd(&workspace->inf_strm);
+ zlib_inflateEnd(&workspace->strm);
if (data_in)
kunmap(pages_in[page_in_index]);
return ret;
@@ -317,13 +314,13 @@ static int zlib_decompress(struct list_head *ws, unsigned char *data_in,
unsigned long total_out = 0;
char *kaddr;
- workspace->inf_strm.next_in = data_in;
- workspace->inf_strm.avail_in = srclen;
- workspace->inf_strm.total_in = 0;
+ workspace->strm.next_in = data_in;
+ workspace->strm.avail_in = srclen;
+ workspace->strm.total_in = 0;
- workspace->inf_strm.next_out = workspace->buf;
- workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
- workspace->inf_strm.total_out = 0;
+ workspace->strm.next_out = workspace->buf;
+ workspace->strm.avail_out = PAGE_CACHE_SIZE;
+ workspace->strm.total_out = 0;
/* If it's deflate, and it's got no preset dictionary, then
we can tell zlib to skip the adler32 check. */
if (srclen > 2 && !(data_in[1] & PRESET_DICT) &&
@@ -331,11 +328,11 @@ static int zlib_decompress(struct list_head *ws, unsigned char *data_in,
!(((data_in[0]<<8) + data_in[1]) % 31)) {
wbits = -((data_in[0] >> 4) + 8);
- workspace->inf_strm.next_in += 2;
- workspace->inf_strm.avail_in -= 2;
+ workspace->strm.next_in += 2;
+ workspace->strm.avail_in -= 2;
}
- if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
+ if (Z_OK != zlib_inflateInit2(&workspace->strm, wbits)) {
printk(KERN_WARNING "BTRFS: inflateInit failed\n");
return -EIO;
}
@@ -346,12 +343,12 @@ static int zlib_decompress(struct list_head *ws, unsigned char *data_in,
unsigned long bytes;
unsigned long pg_offset = 0;
- ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH);
+ ret = zlib_inflate(&workspace->strm, Z_NO_FLUSH);
if (ret != Z_OK && ret != Z_STREAM_END)
break;
buf_start = total_out;
- total_out = workspace->inf_strm.total_out;
+ total_out = workspace->strm.total_out;
if (total_out == buf_start) {
ret = -EIO;
@@ -377,8 +374,8 @@ static int zlib_decompress(struct list_head *ws, unsigned char *data_in,
pg_offset += bytes;
bytes_left -= bytes;
next:
- workspace->inf_strm.next_out = workspace->buf;
- workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
+ workspace->strm.next_out = workspace->buf;
+ workspace->strm.avail_out = PAGE_CACHE_SIZE;
}
if (ret != Z_STREAM_END && bytes_left != 0)
@@ -386,7 +383,7 @@ next:
else
ret = 0;
- zlib_inflateEnd(&workspace->inf_strm);
+ zlib_inflateEnd(&workspace->strm);
return ret;
}